// Mastermind/src/utils/openai-sdk.js

const { BrowserWindow } = require('electron');
const fs = require('fs');
const path = require('path');
const os = require('os');
const { spawn } = require('child_process');
// OpenAI SDK will be loaded dynamically (the `openai` package is ESM-only,
// so it cannot be require()'d from this CommonJS module).
let OpenAI = null;
// OpenAI SDK-based provider (for BotHub, Azure, and other OpenAI-compatible APIs)
// This uses the standard Chat Completions API with Whisper for transcription
let openaiClient = null; // live OpenAI client instance, or null before initializeOpenAISDK
let currentConfig = null; // config object passed to initializeOpenAISDK (apiKey, baseUrl, model names)
let conversationMessages = []; // chat history in Chat Completions message format
let isProcessing = false; // guards against overlapping chat/vision requests
let audioInputMode = 'auto'; // 'auto' (continuous listening) or 'push-to-talk'
let isPushToTalkActive = false; // true while the PTT key is held down
// macOS audio capture
let systemAudioProc = null; // spawned SystemAudioDump child process (macOS only)
let audioBuffer = Buffer.alloc(0); // mono 16-bit PCM accumulated from SystemAudioDump stdout
let transcriptionTimer = null; // setInterval handle for periodic macOS transcription
const TRANSCRIPTION_INTERVAL_MS = 3000; // Transcribe every 3 seconds
const MIN_AUDIO_DURATION_MS = 500; // Minimum audio duration to transcribe
const SAMPLE_RATE = 24000; // Hz; assumed rate of all incoming PCM — TODO confirm against capture sources
// Deliver an IPC message to the first open window, if any exist.
function sendToRenderer(channel, data) {
  const [firstWindow] = BrowserWindow.getAllWindows();
  if (firstWindow) {
    firstWindow.webContents.send(channel, data);
  }
}
/**
 * Create an OpenAI client from `config` and reset the conversation.
 * @param {{apiKey: string, baseUrl?: string, model?: string}} config
 * @returns {Promise<boolean>} true on success
 * @throws {Error} when no API key is supplied
 */
async function initializeOpenAISDK(config) {
  const { apiKey, baseUrl } = config;
  if (!apiKey) {
    throw new Error('OpenAI API key is required');
  }
  // The openai package is ESM-only; load it lazily via dynamic import.
  if (!OpenAI) {
    OpenAI = (await import('openai')).default;
  }
  const clientConfig = { apiKey };
  // A non-empty baseUrl points the client at an OpenAI-compatible endpoint
  // (BotHub, Azure, ...); otherwise the SDK default is used.
  if (baseUrl && baseUrl.trim() !== '') {
    clientConfig.baseURL = baseUrl;
  }
  openaiClient = new OpenAI(clientConfig);
  currentConfig = config;
  conversationMessages = [];
  console.log('OpenAI SDK initialized with baseURL:', clientConfig.baseURL || 'default');
  sendToRenderer('update-status', 'Ready (OpenAI SDK)');
  return true;
}
// Start a fresh conversation, seeded with a system message when one is given.
function setSystemPrompt(systemPrompt) {
  conversationMessages = systemPrompt
    ? [{ role: 'system', content: systemPrompt }]
    : [];
}
/**
 * Wrap raw PCM samples in a canonical 44-byte RIFF/WAVE header.
 * @param {Buffer} pcmBuffer - raw little-endian PCM data
 * @param {number} [sampleRate=24000]
 * @param {number} [numChannels=1]
 * @param {number} [bitsPerSample=16]
 * @returns {Buffer} complete WAV file contents
 */
function createWavBuffer(pcmBuffer, sampleRate = 24000, numChannels = 1, bitsPerSample = 16) {
  const bytesPerSample = bitsPerSample / 8;
  const byteRate = sampleRate * numChannels * bytesPerSample;
  const blockAlign = numChannels * bytesPerSample;
  const dataSize = pcmBuffer.length;
  const header = Buffer.alloc(44);
  // RIFF chunk descriptor
  header.write('RIFF', 0);
  header.writeUInt32LE(36 + dataSize, 4); // total size minus the 8-byte RIFF preamble
  header.write('WAVE', 8);
  // "fmt " sub-chunk
  header.write('fmt ', 12);
  header.writeUInt32LE(16, 16); // fmt chunk size for PCM
  header.writeUInt16LE(1, 20); // audio format: 1 = uncompressed PCM
  header.writeUInt16LE(numChannels, 22);
  header.writeUInt32LE(sampleRate, 24);
  header.writeUInt32LE(byteRate, 28);
  header.writeUInt16LE(blockAlign, 32);
  header.writeUInt16LE(bitsPerSample, 34);
  // "data" sub-chunk
  header.write('data', 36);
  header.writeUInt32LE(dataSize, 40);
  return Buffer.concat([header, pcmBuffer]);
}
/**
 * Transcribe raw PCM audio via the OpenAI audio transcriptions endpoint.
 * @param {Buffer|string} audioBuffer - raw PCM, or a base64-encoded string of it
 * @param {string} [mimeType='audio/wav'] - currently unused; kept for API compatibility
 * @returns {Promise<string>} transcription text
 * @throws {Error} when the client is not initialized or the API call fails
 */
async function transcribeAudio(audioBuffer, mimeType = 'audio/wav') {
  if (!openaiClient) {
    throw new Error('OpenAI client not initialized');
  }
  // The SDK streams the upload from a file handle, so write a temp WAV first.
  const tempDir = os.tmpdir();
  const tempFile = path.join(tempDir, `audio_${Date.now()}.wav`);
  try {
    // Accept either a Buffer or a base64-encoded string.
    let buffer = audioBuffer;
    if (typeof audioBuffer === 'string') {
      buffer = Buffer.from(audioBuffer, 'base64');
    }
    // Create proper WAV file with header (Whisper rejects headerless PCM).
    const wavBuffer = createWavBuffer(buffer, SAMPLE_RATE, 1, 16);
    fs.writeFileSync(tempFile, wavBuffer);
    const transcription = await openaiClient.audio.transcriptions.create({
      file: fs.createReadStream(tempFile),
      model: currentConfig.whisperModel || 'whisper-1',
      response_format: 'text',
    });
    return transcription;
  } catch (error) {
    console.error('Transcription error:', error);
    throw error;
  } finally {
    // Always remove the temp file — previously it leaked when the API call
    // threw, since cleanup only ran on the success path.
    try {
      fs.unlinkSync(tempFile);
    } catch (e) {
      // Ignore cleanup errors (file may never have been written)
    }
  }
}
/**
 * Send a user text message to the chat model, streaming the reply to the
 * renderer and recording both sides in the conversation history.
 * @param {string} text - user message
 * @returns {Promise<{success: boolean, text?: string, error?: string}>}
 */
async function sendTextMessage(text) {
  if (!openaiClient) {
    return { success: false, error: 'OpenAI client not initialized' };
  }
  if (isProcessing) {
    return { success: false, error: 'Already processing a request' };
  }
  isProcessing = true;
  try {
    // Add user message to conversation
    conversationMessages.push({
      role: 'user',
      content: text,
    });
    sendToRenderer('update-status', 'Thinking...');
    const stream = await openaiClient.chat.completions.create({
      model: currentConfig.model || 'gpt-4o',
      messages: conversationMessages,
      stream: true,
      max_tokens: 4096,
    });
    let fullResponse = '';
    let isFirst = true;
    for await (const chunk of stream) {
      const content = chunk.choices[0]?.delta?.content;
      if (content) {
        fullResponse += content;
        // First delta opens a new response bubble; later deltas update it.
        sendToRenderer(isFirst ? 'new-response' : 'update-response', fullResponse);
        isFirst = false;
      }
    }
    // Add assistant response to conversation
    conversationMessages.push({
      role: 'assistant',
      content: fullResponse,
    });
    sendToRenderer('update-status', 'Ready');
    return { success: true, text: fullResponse };
  } catch (error) {
    // NOTE(review): the pushed user message is intentionally left in history
    // on failure so a retry keeps its context.
    console.error('Chat completion error:', error);
    sendToRenderer('update-status', 'Error: ' + error.message);
    return { success: false, error: error.message };
  } finally {
    // Release the lock on every exit path (previously duplicated per-branch,
    // which risked leaving it stuck if a new exit path were added).
    isProcessing = false;
  }
}
/**
 * Send an image plus a text prompt to the vision model, streaming the reply.
 * Only the text prompt (not the image) is kept in history for follow-ups.
 * @param {string} base64Image - JPEG image as base64 (no data-URL prefix)
 * @param {string} prompt - text question about the image
 * @returns {Promise<{success: boolean, text?: string, model?: string, error?: string}>}
 */
async function sendImageMessage(base64Image, prompt) {
  if (!openaiClient) {
    return { success: false, error: 'OpenAI client not initialized' };
  }
  if (isProcessing) {
    return { success: false, error: 'Already processing a request' };
  }
  isProcessing = true;
  try {
    sendToRenderer('update-status', 'Analyzing image...');
    // One-off message list: history plus the image request.
    const messages = [
      ...conversationMessages,
      {
        role: 'user',
        content: [
          { type: 'text', text: prompt },
          {
            type: 'image_url',
            image_url: {
              url: `data:image/jpeg;base64,${base64Image}`,
            },
          },
        ],
      },
    ];
    const stream = await openaiClient.chat.completions.create({
      model: currentConfig.visionModel || currentConfig.model || 'gpt-4o',
      messages: messages,
      stream: true,
      max_tokens: 4096,
    });
    let fullResponse = '';
    let isFirst = true;
    for await (const chunk of stream) {
      const content = chunk.choices[0]?.delta?.content;
      if (content) {
        fullResponse += content;
        // First delta opens a new response bubble; later deltas update it.
        sendToRenderer(isFirst ? 'new-response' : 'update-response', fullResponse);
        isFirst = false;
      }
    }
    // Add to conversation history (text only for follow-ups, to keep
    // subsequent requests small).
    conversationMessages.push({
      role: 'user',
      content: prompt,
    });
    conversationMessages.push({
      role: 'assistant',
      content: fullResponse,
    });
    sendToRenderer('update-status', 'Ready');
    return { success: true, text: fullResponse, model: currentConfig.visionModel || currentConfig.model };
  } catch (error) {
    console.error('Vision error:', error);
    sendToRenderer('update-status', 'Error: ' + error.message);
    return { success: false, error: error.message };
  } finally {
    // Release the lock on every exit path (previously duplicated per-branch).
    isProcessing = false;
  }
}
// Process audio chunk and get response
// This accumulates audio and transcribes when silence is detected or timer expires
let audioChunks = []; // buffered base64-decoded PCM chunks (renderer/realtime path)
let lastAudioTime = 0; // timestamp (ms) of the most recent chunk
let firstChunkTime = 0; // timestamp (ms) of the first chunk in the current buffer
const SILENCE_THRESHOLD_MS = 1500; // 1.5 seconds of silence
const MAX_BUFFER_DURATION_MS = 5000; // 5 seconds max buffering before forced transcription
let silenceCheckTimer = null; // setTimeout handle for the silence-detection flush
let windowsTranscriptionTimer = null; // setInterval handle for the periodic win32 flush
// Buffer one realtime audio chunk (base64 PCM from the renderer) and arrange
// for it to be transcribed: in PTT mode chunks are only buffered while the key
// is held (flushed on release elsewhere); in auto mode a silence timeout — and,
// on Windows, a periodic max-duration timer — triggers flushAudioAndTranscribe.
// Returns { success, ignored? , buffering? }.
async function processAudioChunk(base64Audio, mimeType) {
  if (!openaiClient) {
    return { success: false, error: 'OpenAI client not initialized' };
  }
  const now = Date.now();
  const buffer = Buffer.from(base64Audio, 'base64');
  if (audioInputMode === 'push-to-talk') {
    // Chunks arriving while the PTT key is up are dropped entirely.
    if (!isPushToTalkActive) {
      return { success: true, ignored: true };
    }
    // In push-to-talk mode we only buffer while active
    audioChunks.push(buffer);
    lastAudioTime = now;
    return { success: true, buffering: true };
  }
  // Track first chunk time for duration-based flushing
  if (audioChunks.length === 0) {
    firstChunkTime = now;
    // Start periodic transcription timer (Windows needs this)
    if (!windowsTranscriptionTimer && process.platform === 'win32') {
      console.log('Starting Windows periodic transcription timer...');
      windowsTranscriptionTimer = setInterval(async () => {
        if (audioChunks.length > 0) {
          const bufferDuration = Date.now() - firstChunkTime;
          if (bufferDuration >= MAX_BUFFER_DURATION_MS) {
            console.log(`Periodic flush: ${bufferDuration}ms of audio buffered`);
            await flushAudioAndTranscribe();
          }
        }
      }, 2000); // Check every 2 seconds
    }
  }
  // Add to audio buffer
  audioChunks.push(buffer);
  lastAudioTime = now;
  // Clear existing timer (each new chunk pushes the silence deadline back)
  if (silenceCheckTimer) {
    clearTimeout(silenceCheckTimer);
  }
  // Set timer to check for silence
  silenceCheckTimer = setTimeout(async () => {
    const silenceDuration = Date.now() - lastAudioTime;
    if (silenceDuration >= SILENCE_THRESHOLD_MS && audioChunks.length > 0) {
      console.log('Silence detected, flushing audio for transcription...');
      await flushAudioAndTranscribe();
    }
  }, SILENCE_THRESHOLD_MS);
  return { success: true, buffering: true };
}
// Concatenate all buffered realtime chunks, transcribe them, and — when
// speech was recognized — forward the text to the chat model.
// Resets the chunk buffer and the win32 periodic timer (processAudioChunk
// restarts that timer when the next chunk arrives on an empty buffer).
// Returns { success, transcription?, response?, text?, error? }.
async function flushAudioAndTranscribe() {
  if (audioChunks.length === 0) {
    return { success: true, text: '' };
  }
  // Clear Windows transcription timer
  if (windowsTranscriptionTimer) {
    clearInterval(windowsTranscriptionTimer);
    windowsTranscriptionTimer = null;
  }
  try {
    // Combine all audio chunks
    const combinedBuffer = Buffer.concat(audioChunks);
    const chunkCount = audioChunks.length;
    audioChunks = [];
    firstChunkTime = 0;
    // Calculate audio duration (16-bit samples, mono, SAMPLE_RATE Hz assumed)
    const bytesPerSample = 2;
    const audioDurationMs = (combinedBuffer.length / bytesPerSample / SAMPLE_RATE) * 1000;
    console.log(`Transcribing ${chunkCount} chunks (${audioDurationMs.toFixed(0)}ms of audio)...`);
    // Transcribe
    const transcription = await transcribeAudio(combinedBuffer);
    if (transcription && transcription.trim()) {
      console.log('Transcription result:', transcription);
      // Send to chat
      const response = await sendTextMessage(transcription);
      return {
        success: true,
        transcription: transcription,
        response: response.text,
      };
    }
    // Empty/whitespace transcription: treat as silence, not an error.
    return { success: true, text: '' };
  } catch (error) {
    console.error('Flush audio error:', error);
    return { success: false, error: error.message };
  }
}
// Tell the renderer the current PTT state so its UI can stay in sync.
function notifyPushToTalkState() {
  const payload = {
    active: isPushToTalkActive,
    inputMode: audioInputMode,
  };
  sendToRenderer('push-to-talk-state', payload);
}
// Drop all buffered realtime audio and cancel both pending flush timers.
function resetRealtimeAudioBuffer() {
  audioChunks = [];
  firstChunkTime = 0;
  lastAudioTime = 0;
  if (silenceCheckTimer !== null) {
    clearTimeout(silenceCheckTimer);
    silenceCheckTimer = null;
  }
  if (windowsTranscriptionTimer !== null) {
    clearInterval(windowsTranscriptionTimer);
    windowsTranscriptionTimer = null;
  }
}
// Reconcile the periodic macOS transcription timer with the input mode:
// PTT transcribes on key release, so the timer must be off; in auto mode
// with an active capture process, make sure the timer is running.
function updateTranscriptionTimerForPushToTalk() {
  if (audioInputMode === 'push-to-talk') {
    stopTranscriptionTimer();
  } else if (systemAudioProc && !transcriptionTimer) {
    startTranscriptionTimer();
  }
}
// Turn push-to-talk recording on or off. Activation clears both audio paths'
// buffers so the recording starts clean; deactivation (while in PTT mode)
// immediately transcribes whatever was buffered on either path.
async function setPushToTalkActive(active) {
  const wasActive = isPushToTalkActive;
  isPushToTalkActive = active;
  if (active) {
    // Starting recording - clear any old buffers
    resetRealtimeAudioBuffer();
    audioBuffer = Buffer.alloc(0);
    console.log('Push-to-Talk: Recording started');
    sendToRenderer('update-status', 'Recording...');
  }
  notifyPushToTalkState();
  // When user stops recording in PTT mode, send audio for transcription
  if (!active && wasActive && audioInputMode === 'push-to-talk') {
    console.log('Push-to-Talk: Recording stopped, transcribing...');
    sendToRenderer('update-status', 'Transcribing...');
    // For browser-based audio (Windows)
    if (audioChunks.length > 0) {
      await flushAudioAndTranscribe();
    }
    // For macOS SystemAudioDump
    if (audioBuffer.length > 0) {
      await transcribeBufferedAudio(true); // Force transcription (bypasses VAD)
    }
    sendToRenderer('update-status', 'Listening...');
  }
}
// Flip the push-to-talk recording state.
async function togglePushToTalk() {
  await setPushToTalkActive(!isPushToTalkActive);
}
// Apply a new audio input mode ('auto' or 'push-to-talk'). Leaving PTT mode
// deactivates any in-progress recording and discards its buffered audio.
function updatePushToTalkSettings(inputMode) {
  if (inputMode) {
    audioInputMode = inputMode;
  }
  if (audioInputMode !== 'push-to-talk') {
    if (isPushToTalkActive) {
      isPushToTalkActive = false;
    }
    resetRealtimeAudioBuffer();
    audioBuffer = Buffer.alloc(0);
  }
  notifyPushToTalkState();
  updateTranscriptionTimerForPushToTalk();
}
// Reset the chat history (keeping only the first system message, if any)
// and discard buffered realtime audio along with its flush timers.
function clearConversation() {
  conversationMessages = conversationMessages
    .filter(message => message.role === 'system')
    .slice(0, 1);
  audioChunks = [];
  if (silenceCheckTimer !== null) {
    clearTimeout(silenceCheckTimer);
    silenceCheckTimer = null;
  }
  if (windowsTranscriptionTimer !== null) {
    clearInterval(windowsTranscriptionTimer);
    windowsTranscriptionTimer = null;
  }
}
// Full teardown: stop audio capture, drop the client and all conversation /
// audio state, cancel timers, and notify the renderer.
function closeOpenAISDK() {
  stopMacOSAudioCapture();
  openaiClient = null;
  currentConfig = null;
  conversationMessages = [];
  audioChunks = [];
  isProcessing = false;
  isPushToTalkActive = false;
  if (silenceCheckTimer !== null) {
    clearTimeout(silenceCheckTimer);
    silenceCheckTimer = null;
  }
  if (windowsTranscriptionTimer !== null) {
    clearInterval(windowsTranscriptionTimer);
    windowsTranscriptionTimer = null;
  }
  notifyPushToTalkState();
  sendToRenderer('update-status', 'Disconnected');
}
// ============ macOS Audio Capture ============
// Best-effort kill of any stray SystemAudioDump processes; resolves ~100ms
// after pkill returns so the audio device has time to be released.
async function killExistingSystemAudioDump() {
  const { exec } = require('child_process');
  return new Promise(resolve => {
    exec('pkill -f SystemAudioDump', () => {
      // pkill errors (no matching process) are intentionally ignored.
      setTimeout(resolve, 100);
    });
  });
}
// Downmix interleaved 16-bit stereo PCM to mono by keeping only the left
// channel of each L/R frame (the right channel is discarded, not averaged).
function convertStereoToMono(stereoBuffer) {
  const frameCount = stereoBuffer.length / 4; // 4 bytes per stereo frame
  const monoBuffer = Buffer.alloc(frameCount * 2);
  for (let frame = 0; frame < frameCount; frame++) {
    monoBuffer.writeInt16LE(stereoBuffer.readInt16LE(frame * 4), frame * 2);
  }
  return monoBuffer;
}
// Calculate the RMS (Root Mean Square) volume of 16-bit LE PCM audio.
// Returns 0 for an empty buffer.
function calculateRMS(buffer) {
  const sampleCount = buffer.length / 2;
  if (sampleCount === 0) return 0;
  let sumSquares = 0;
  for (let offset = 0; offset < buffer.length; offset += 2) {
    const sample = buffer.readInt16LE(offset);
    sumSquares += sample * sample;
  }
  return Math.sqrt(sumSquares / sampleCount);
}
// Crude voice-activity detection: treat the buffer as containing speech when
// its RMS volume exceeds `threshold` (default 500 on the int16 scale).
function hasSpeech(buffer, threshold = 500) {
  return calculateRMS(buffer) > threshold;
}
// Transcribe the macOS `audioBuffer` accumulated from SystemAudioDump and
// feed recognized speech into the chat. In auto mode this runs on the
// periodic transcription timer; in push-to-talk mode it only runs with
// forcePTT=true (PTT key released), which also bypasses the VAD check and
// suppresses the 'Listening...' status updates.
async function transcribeBufferedAudio(forcePTT = false) {
  // Skip while a chat request is in flight to avoid interleaved responses.
  if (audioBuffer.length === 0 || isProcessing) {
    return;
  }
  // In push-to-talk mode, only transcribe when explicitly requested (forcePTT=true)
  if (audioInputMode === 'push-to-talk' && !forcePTT) {
    return;
  }
  // Calculate audio duration (16-bit mono samples at SAMPLE_RATE)
  const bytesPerSample = 2;
  const audioDurationMs = (audioBuffer.length / bytesPerSample / SAMPLE_RATE) * 1000;
  if (audioDurationMs < MIN_AUDIO_DURATION_MS) {
    return; // Not enough audio yet; keep accumulating
  }
  // Check if there's actual speech in the audio (Voice Activity Detection)
  // Skip VAD check in PTT mode - user explicitly wants to transcribe
  if (!forcePTT && !hasSpeech(audioBuffer)) {
    // Clear buffer if it's just silence/noise
    audioBuffer = Buffer.alloc(0);
    return;
  }
  // Take current buffer and reset so new audio accumulates while we transcribe
  const currentBuffer = audioBuffer;
  audioBuffer = Buffer.alloc(0);
  try {
    console.log(`Transcribing ${audioDurationMs.toFixed(0)}ms of audio...`);
    if (!forcePTT) {
      sendToRenderer('update-status', 'Transcribing...');
    }
    const transcription = await transcribeAudio(currentBuffer, 'audio/wav');
    // Ignore very short results (>2 chars required) to filter Whisper noise.
    if (transcription && transcription.trim() && transcription.trim().length > 2) {
      console.log('Transcription:', transcription);
      sendToRenderer('update-status', 'Processing...');
      // Send to chat
      await sendTextMessage(transcription);
    } else if (forcePTT) {
      console.log('Push-to-Talk: No speech detected in recording');
    }
    if (!forcePTT) {
      sendToRenderer('update-status', 'Listening...');
    }
  } catch (error) {
    // Best-effort: log and return to listening; the discarded buffer is lost.
    console.error('Transcription error:', error);
    if (!forcePTT) {
      sendToRenderer('update-status', 'Listening...');
    }
  }
}
// Spawn the bundled SystemAudioDump binary (macOS only) and stream its stdout
// PCM into `audioBuffer` in 100ms chunks, downmixing stereo to mono. Returns
// false on any setup failure (missing binary, bad permissions, spawn failure),
// true once the process is running and the transcription timer is armed.
async function startMacOSAudioCapture() {
  if (process.platform !== 'darwin') return false;
  // Kill any existing SystemAudioDump processes first
  await killExistingSystemAudioDump();
  console.log('=== Starting macOS audio capture (OpenAI SDK) ===');
  sendToRenderer('update-status', 'Starting audio capture...');
  const { app } = require('electron');
  const fs = require('fs'); // NOTE(review): shadows the top-level fs import; redundant but harmless
  let systemAudioPath;
  if (app.isPackaged) {
    systemAudioPath = path.join(process.resourcesPath, 'SystemAudioDump');
  } else {
    systemAudioPath = path.join(__dirname, '../assets', 'SystemAudioDump');
  }
  console.log('SystemAudioDump config:', {
    path: systemAudioPath,
    isPackaged: app.isPackaged,
    resourcesPath: process.resourcesPath,
    exists: fs.existsSync(systemAudioPath),
  });
  // Check if file exists
  if (!fs.existsSync(systemAudioPath)) {
    console.error('FATAL: SystemAudioDump not found at:', systemAudioPath);
    sendToRenderer('update-status', 'Error: Audio binary not found');
    return false;
  }
  // Check and fix executable permissions (packaging can strip the x bit)
  try {
    fs.accessSync(systemAudioPath, fs.constants.X_OK);
    console.log('SystemAudioDump is executable');
  } catch (err) {
    console.warn('SystemAudioDump not executable, fixing permissions...');
    try {
      fs.chmodSync(systemAudioPath, 0o755);
      console.log('Fixed executable permissions');
    } catch (chmodErr) {
      console.error('Failed to fix permissions:', chmodErr);
      sendToRenderer('update-status', 'Error: Cannot execute audio binary');
      return false;
    }
  }
  const spawnOptions = {
    stdio: ['ignore', 'pipe', 'pipe'],
    env: {
      ...process.env,
    },
  };
  console.log('Spawning SystemAudioDump...');
  systemAudioProc = spawn(systemAudioPath, [], spawnOptions);
  if (!systemAudioProc.pid) {
    console.error('FATAL: Failed to start SystemAudioDump - no PID');
    sendToRenderer('update-status', 'Error: Audio capture failed to start');
    return false;
  }
  console.log('SystemAudioDump started with PID:', systemAudioProc.pid);
  const CHUNK_DURATION = 0.1; // seconds of audio per processed chunk
  const BYTES_PER_SAMPLE = 2;
  const CHANNELS = 2; // assumes SystemAudioDump emits interleaved 16-bit stereo — TODO confirm
  const CHUNK_SIZE = SAMPLE_RATE * BYTES_PER_SAMPLE * CHANNELS * CHUNK_DURATION; // 9600 bytes
  let tempBuffer = Buffer.alloc(0);
  let chunkCount = 0;
  let firstDataReceived = false;
  systemAudioProc.stdout.on('data', data => {
    if (!firstDataReceived) {
      firstDataReceived = true;
      console.log('First audio data received! Size:', data.length);
      sendToRenderer('update-status', 'Listening...');
    }
    // Accumulate raw bytes until at least one full chunk is available
    tempBuffer = Buffer.concat([tempBuffer, data]);
    while (tempBuffer.length >= CHUNK_SIZE) {
      const chunk = tempBuffer.slice(0, CHUNK_SIZE);
      tempBuffer = tempBuffer.slice(CHUNK_SIZE);
      // Convert stereo to mono
      const monoChunk = CHANNELS === 2 ? convertStereoToMono(chunk) : chunk;
      // In PTT mode, discard audio while the key is not held
      if (audioInputMode === 'push-to-talk' && !isPushToTalkActive) {
        continue;
      }
      // Add to audio buffer for transcription
      audioBuffer = Buffer.concat([audioBuffer, monoChunk]);
      chunkCount++;
      if (chunkCount % 100 === 0) {
        console.log(`Audio: ${chunkCount} chunks processed, buffer size: ${audioBuffer.length}`);
      }
    }
    // Limit buffer size (max 30 seconds of audio); oldest audio is dropped
    const maxBufferSize = SAMPLE_RATE * BYTES_PER_SAMPLE * 30;
    if (audioBuffer.length > maxBufferSize) {
      audioBuffer = audioBuffer.slice(-maxBufferSize);
    }
  });
  systemAudioProc.stderr.on('data', data => {
    const msg = data.toString();
    console.error('SystemAudioDump stderr:', msg);
    if (msg.toLowerCase().includes('error')) {
      sendToRenderer('update-status', 'Audio error: ' + msg.substring(0, 50));
    }
  });
  systemAudioProc.on('close', (code, signal) => {
    console.log('SystemAudioDump closed:', { code, signal, chunksProcessed: chunkCount, tempBufferSize: tempBuffer.length });
    if (code !== 0 && code !== null) {
      sendToRenderer('update-status', `Audio stopped (exit: ${code}, signal: ${signal})`);
    }
    systemAudioProc = null;
    stopTranscriptionTimer();
  });
  systemAudioProc.on('error', err => {
    console.error('SystemAudioDump spawn error:', err.message, err.stack);
    sendToRenderer('update-status', 'Audio error: ' + err.message);
    systemAudioProc = null;
    stopTranscriptionTimer();
  });
  systemAudioProc.on('exit', (code, signal) => {
    console.log('SystemAudioDump exit event:', { code, signal });
  });
  // Start periodic transcription (no-op if the input mode is push-to-talk)
  updateTranscriptionTimerForPushToTalk();
  sendToRenderer('update-status', 'Listening...');
  return true;
}
// Arm the periodic macOS transcription interval (idempotent: any running
// interval is stopped first so at most one is ever live).
function startTranscriptionTimer() {
  if (audioInputMode === 'push-to-talk') {
    return; // PTT transcribes on key release, never on a schedule
  }
  stopTranscriptionTimer();
  transcriptionTimer = setInterval(transcribeBufferedAudio, TRANSCRIPTION_INTERVAL_MS);
}
// Cancel the periodic macOS transcription interval, if one is running.
function stopTranscriptionTimer() {
  if (!transcriptionTimer) return;
  clearInterval(transcriptionTimer);
  transcriptionTimer = null;
}
// Stop the transcription timer, terminate SystemAudioDump (its 'close'
// handler also nulls the reference), and drop any captured audio.
function stopMacOSAudioCapture() {
  stopTranscriptionTimer();
  const proc = systemAudioProc;
  if (proc) {
    console.log('Stopping SystemAudioDump for OpenAI SDK...');
    proc.kill('SIGTERM');
    systemAudioProc = null;
  }
  audioBuffer = Buffer.alloc(0);
}
// Public surface consumed by the Electron main process / IPC handlers.
// Note: setPushToTalkActive is intentionally internal; callers use
// togglePushToTalk and updatePushToTalkSettings instead.
module.exports = {
  initializeOpenAISDK,
  setSystemPrompt,
  transcribeAudio,
  sendTextMessage,
  sendImageMessage,
  processAudioChunk,
  flushAudioAndTranscribe,
  togglePushToTalk,
  updatePushToTalkSettings,
  clearConversation,
  closeOpenAISDK,
  startMacOSAudioCapture,
  stopMacOSAudioCapture,
};