const { BrowserWindow } = require('electron');
const fs = require('fs');
const path = require('path');
const os = require('os');
const { spawn } = require('child_process');

// OpenAI SDK will be loaded dynamically
let OpenAI = null;

// OpenAI SDK-based provider (for BotHub, Azure, and other OpenAI-compatible APIs)
// This uses the standard Chat Completions API with Whisper for transcription
let openaiClient = null;
let currentConfig = null;
let conversationMessages = [];
let isProcessing = false;
let audioInputMode = 'auto';
let isPushToTalkActive = false;

// macOS audio capture
let systemAudioProc = null;
let audioBuffer = Buffer.alloc(0);
let transcriptionTimer = null;
const TRANSCRIPTION_INTERVAL_MS = 3000; // Transcribe every 3 seconds
const MIN_AUDIO_DURATION_MS = 500; // Minimum audio duration to transcribe
const SAMPLE_RATE = 24000;

function sendToRenderer(channel, data) {
  const windows = BrowserWindow.getAllWindows();
  if (windows.length > 0) {
    windows[0].webContents.send(channel, data);
  }
}

async function initializeOpenAISDK(config) {
  const { apiKey, baseUrl, model } = config;

  if (!apiKey) {
    throw new Error('OpenAI API key is required');
  }

  // Dynamic import for ES module
  if (!OpenAI) {
    const openaiModule = await import('openai');
    OpenAI = openaiModule.default;
  }

  const clientConfig = {
    apiKey: apiKey,
  };

  // Use custom baseURL if provided
  if (baseUrl && baseUrl.trim() !== '') {
    clientConfig.baseURL = baseUrl;
  }

  openaiClient = new OpenAI(clientConfig);
  currentConfig = config;
  conversationMessages = [];

  console.log('OpenAI SDK initialized with baseURL:', clientConfig.baseURL || 'default');
  sendToRenderer('update-status', 'Ready (OpenAI SDK)');
  return true;
}

function setSystemPrompt(systemPrompt) {
  // Clear conversation and set system prompt
  conversationMessages = [];
  if (systemPrompt) {
    conversationMessages.push({
      role: 'system',
      content: systemPrompt,
    });
  }
}

// Create WAV file from raw PCM data
function createWavBuffer(pcmBuffer, sampleRate = 24000, numChannels = 1, bitsPerSample = 16) {
  const byteRate = sampleRate * numChannels * (bitsPerSample / 8);
  const blockAlign = numChannels * (bitsPerSample / 8);
  const dataSize = pcmBuffer.length;
  const headerSize = 44;
  const fileSize = headerSize + dataSize - 8;

  const wavBuffer = Buffer.alloc(headerSize + dataSize);

  // RIFF header
  wavBuffer.write('RIFF', 0);
  wavBuffer.writeUInt32LE(fileSize, 4);
  wavBuffer.write('WAVE', 8);

  // fmt chunk
  wavBuffer.write('fmt ', 12);
  wavBuffer.writeUInt32LE(16, 16); // fmt chunk size
  wavBuffer.writeUInt16LE(1, 20); // audio format (1 = PCM)
  wavBuffer.writeUInt16LE(numChannels, 22);
  wavBuffer.writeUInt32LE(sampleRate, 24);
  wavBuffer.writeUInt32LE(byteRate, 28);
  wavBuffer.writeUInt16LE(blockAlign, 32);
  wavBuffer.writeUInt16LE(bitsPerSample, 34);

  // data chunk
  wavBuffer.write('data', 36);
  wavBuffer.writeUInt32LE(dataSize, 40);

  // Copy PCM data
  pcmBuffer.copy(wavBuffer, 44);

  return wavBuffer;
}
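// Usage sketch for createWavBuffer (illustrative only, not executed at module
// load). At this module's defaults (24 kHz, mono, 16-bit), one second of PCM
// is 24000 samples * 2 bytes = 48,000 data bytes, so the file below would be
// 44 header bytes + 48,000 = 48,044 bytes:
//
//   const oneSecondSilence = Buffer.alloc(SAMPLE_RATE * 2); // 1 s of 16-bit mono PCM
//   const wav = createWavBuffer(oneSecondSilence, SAMPLE_RATE, 1, 16);
//   fs.writeFileSync(path.join(os.tmpdir(), 'probe.wav'), wav);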
async function transcribeAudio(audioBuffer, mimeType = 'audio/wav') {
  if (!openaiClient) {
    throw new Error('OpenAI client not initialized');
  }

  try {
    // Save audio buffer to temp file (OpenAI SDK requires file path)
    const tempDir = os.tmpdir();
    const tempFile = path.join(tempDir, `audio_${Date.now()}.wav`);

    // Convert base64 to buffer if needed
    let buffer = audioBuffer;
    if (typeof audioBuffer === 'string') {
      buffer = Buffer.from(audioBuffer, 'base64');
    }

    // Create proper WAV file with header
    const wavBuffer = createWavBuffer(buffer, SAMPLE_RATE, 1, 16);
    fs.writeFileSync(tempFile, wavBuffer);

    const transcription = await openaiClient.audio.transcriptions.create({
      file: fs.createReadStream(tempFile),
      model: currentConfig.whisperModel || 'whisper-1',
      response_format: 'text',
    });

    // Clean up temp file
    try {
      fs.unlinkSync(tempFile);
    } catch (e) {
      // Ignore cleanup errors
    }

    return transcription;
  } catch (error) {
    console.error('Transcription error:', error);
    throw error;
  }
}

async function sendTextMessage(text) {
  if (!openaiClient) {
    return { success: false, error: 'OpenAI client not initialized' };
  }
  if (isProcessing) {
    return { success: false, error: 'Already processing a request' };
  }

  isProcessing = true;
  try {
    // Add user message to conversation
    conversationMessages.push({
      role: 'user',
      content: text,
    });

    sendToRenderer('update-status', 'Thinking...');

    const stream = await openaiClient.chat.completions.create({
      model: currentConfig.model || 'gpt-4o',
      messages: conversationMessages,
      stream: true,
      max_tokens: 4096,
    });

    let fullResponse = '';
    let isFirst = true;
    for await (const chunk of stream) {
      const content = chunk.choices[0]?.delta?.content;
      if (content) {
        fullResponse += content;
        sendToRenderer(isFirst ? 'new-response' : 'update-response', fullResponse);
        isFirst = false;
      }
    }

    // Add assistant response to conversation
    conversationMessages.push({
      role: 'assistant',
      content: fullResponse,
    });

    sendToRenderer('update-status', 'Ready');
    isProcessing = false;
    return { success: true, text: fullResponse };
  } catch (error) {
    console.error('Chat completion error:', error);
    sendToRenderer('update-status', 'Error: ' + error.message);
    isProcessing = false;
    return { success: false, error: error.message };
  }
}
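// Note: the Chat Completions API is stateless, so conversationMessages is
// resent in full on every request and prompt-token usage grows with history
// length. A minimal trimming sketch (hypothetical helper, not wired in here):
// keep the system prompt plus the most recent turns.
//
//   function trimConversation(messages, maxTurns = 20) {
//     const system = messages.filter(m => m.role === 'system');
//     const rest = messages.filter(m => m.role !== 'system');
//     return [...system, ...rest.slice(-maxTurns * 2)]; // 2 messages per turn
//   }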
async function sendImageMessage(base64Image, prompt) {
  if (!openaiClient) {
    return { success: false, error: 'OpenAI client not initialized' };
  }
  if (isProcessing) {
    return { success: false, error: 'Already processing a request' };
  }

  isProcessing = true;
  try {
    sendToRenderer('update-status', 'Analyzing image...');

    const messages = [
      ...conversationMessages,
      {
        role: 'user',
        content: [
          { type: 'text', text: prompt },
          {
            type: 'image_url',
            image_url: {
              url: `data:image/jpeg;base64,${base64Image}`,
            },
          },
        ],
      },
    ];

    const stream = await openaiClient.chat.completions.create({
      model: currentConfig.visionModel || currentConfig.model || 'gpt-4o',
      messages: messages,
      stream: true,
      max_tokens: 4096,
    });

    let fullResponse = '';
    let isFirst = true;
    for await (const chunk of stream) {
      const content = chunk.choices[0]?.delta?.content;
      if (content) {
        fullResponse += content;
        sendToRenderer(isFirst ? 'new-response' : 'update-response', fullResponse);
        isFirst = false;
      }
    }

    // Add to conversation history (text only for follow-ups)
    conversationMessages.push({
      role: 'user',
      content: prompt,
    });
    conversationMessages.push({
      role: 'assistant',
      content: fullResponse,
    });

    sendToRenderer('update-status', 'Ready');
    isProcessing = false;
    return { success: true, text: fullResponse, model: currentConfig.visionModel || currentConfig.model };
  } catch (error) {
    console.error('Vision error:', error);
    sendToRenderer('update-status', 'Error: ' + error.message);
    isProcessing = false;
    return { success: false, error: error.message };
  }
}

// Process audio chunk and get response
// This accumulates audio and transcribes when silence is detected or timer expires
let audioChunks = [];
let lastAudioTime = 0;
let firstChunkTime = 0;
const SILENCE_THRESHOLD_MS = 1500; // 1.5 seconds of silence
const MAX_BUFFER_DURATION_MS = 5000; // 5 seconds max buffering before forced transcription
let silenceCheckTimer = null;
let windowsTranscriptionTimer = null;

async function processAudioChunk(base64Audio, mimeType) {
  if (!openaiClient) {
    return { success: false, error: 'OpenAI client not initialized' };
  }

  const now = Date.now();
  const buffer = Buffer.from(base64Audio, 'base64');

  if (audioInputMode === 'push-to-talk') {
    if (!isPushToTalkActive) {
      return { success: true, ignored: true };
    }
    // In push-to-talk mode we only buffer while active
    audioChunks.push(buffer);
    lastAudioTime = now;
    return { success: true, buffering: true };
  }

  // Track first chunk time for duration-based flushing
  if (audioChunks.length === 0) {
    firstChunkTime = now;

    // Start periodic transcription timer (Windows needs this)
    if (!windowsTranscriptionTimer && process.platform === 'win32') {
      console.log('Starting Windows periodic transcription timer...');
      windowsTranscriptionTimer = setInterval(async () => {
        if (audioChunks.length > 0) {
          const bufferDuration = Date.now() - firstChunkTime;
          if (bufferDuration >= MAX_BUFFER_DURATION_MS) {
            console.log(`Periodic flush: ${bufferDuration}ms of audio buffered`);
            await flushAudioAndTranscribe();
          }
        }
      }, 2000); // Check every 2 seconds
    }
  }

  // Add to audio buffer
  audioChunks.push(buffer);
  lastAudioTime = now;

  // Clear existing timer
  if (silenceCheckTimer) {
    clearTimeout(silenceCheckTimer);
  }

  // Set timer to check for silence
  silenceCheckTimer = setTimeout(async () => {
    const silenceDuration = Date.now() - lastAudioTime;
    if (silenceDuration >= SILENCE_THRESHOLD_MS && audioChunks.length > 0) {
      console.log('Silence detected, flushing audio for transcription...');
      await flushAudioAndTranscribe();
    }
  }, SILENCE_THRESHOLD_MS);

  return { success: true, buffering: true };
}
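// Buffer-duration arithmetic used throughout this file: audio is 16-bit
// (2 bytes per sample) mono at SAMPLE_RATE = 24000, so
//
//   durationMs = bytes / 2 / 24000 * 1000
//
// e.g. a 48,000-byte buffer is exactly 1000 ms, and MAX_BUFFER_DURATION_MS
// (5000 ms) corresponds to 240,000 buffered bytes.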
async function flushAudioAndTranscribe() {
  if (audioChunks.length === 0) {
    return { success: true, text: '' };
  }

  // Clear Windows transcription timer
  if (windowsTranscriptionTimer) {
    clearInterval(windowsTranscriptionTimer);
    windowsTranscriptionTimer = null;
  }

  try {
    // Combine all audio chunks
    const combinedBuffer = Buffer.concat(audioChunks);
    const chunkCount = audioChunks.length;
    audioChunks = [];
    firstChunkTime = 0;

    // Calculate audio duration
    const bytesPerSample = 2;
    const audioDurationMs = (combinedBuffer.length / bytesPerSample / SAMPLE_RATE) * 1000;
    console.log(`Transcribing ${chunkCount} chunks (${audioDurationMs.toFixed(0)}ms of audio)...`);

    // Transcribe
    const transcription = await transcribeAudio(combinedBuffer);

    if (transcription && transcription.trim()) {
      console.log('Transcription result:', transcription);

      // Send to chat
      const response = await sendTextMessage(transcription);
      return {
        success: true,
        transcription: transcription,
        response: response.text,
      };
    }

    return { success: true, text: '' };
  } catch (error) {
    console.error('Flush audio error:', error);
    return { success: false, error: error.message };
  }
}

function notifyPushToTalkState() {
  sendToRenderer('push-to-talk-state', {
    active: isPushToTalkActive,
    inputMode: audioInputMode,
  });
}

function resetRealtimeAudioBuffer() {
  audioChunks = [];
  firstChunkTime = 0;
  lastAudioTime = 0;
  if (silenceCheckTimer) {
    clearTimeout(silenceCheckTimer);
    silenceCheckTimer = null;
  }
  if (windowsTranscriptionTimer) {
    clearInterval(windowsTranscriptionTimer);
    windowsTranscriptionTimer = null;
  }
}

function updateTranscriptionTimerForPushToTalk() {
  if (audioInputMode === 'push-to-talk') {
    stopTranscriptionTimer();
    return;
  }
  if (systemAudioProc && !transcriptionTimer) {
    startTranscriptionTimer();
  }
}

async function setPushToTalkActive(active) {
  const wasActive = isPushToTalkActive;
  isPushToTalkActive = active;

  if (active) {
    // Starting recording - clear any old buffers
    resetRealtimeAudioBuffer();
    audioBuffer = Buffer.alloc(0);
    console.log('Push-to-Talk: Recording started');
    sendToRenderer('update-status', 'Recording...');
  }

  notifyPushToTalkState();

  // When user stops recording in PTT mode, send audio for transcription
  if (!active && wasActive && audioInputMode === 'push-to-talk') {
    console.log('Push-to-Talk: Recording stopped, transcribing...');
    sendToRenderer('update-status', 'Transcribing...');

    // For browser-based audio (Windows)
    if (audioChunks.length > 0) {
      await flushAudioAndTranscribe();
    }

    // For macOS SystemAudioDump
    if (audioBuffer.length > 0) {
      await transcribeBufferedAudio(true); // Force transcription
    }

    sendToRenderer('update-status', 'Listening...');
  }
}

async function togglePushToTalk() {
  if (isPushToTalkActive) {
    await setPushToTalkActive(false);
  } else {
    await setPushToTalkActive(true);
  }
}

function updatePushToTalkSettings(inputMode) {
  if (inputMode) {
    audioInputMode = inputMode;
  }
  if (audioInputMode !== 'push-to-talk' && isPushToTalkActive) {
    isPushToTalkActive = false;
  }
  if (audioInputMode !== 'push-to-talk') {
    resetRealtimeAudioBuffer();
    audioBuffer = Buffer.alloc(0);
  }
  notifyPushToTalkState();
  updateTranscriptionTimerForPushToTalk();
}
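// Sketch of how the app's main process might drive push-to-talk from a global
// shortcut (assumption: this wiring lives outside this module, and both the
// accelerator and the require path are illustrative):
//
//   const { globalShortcut } = require('electron');
//   const provider = require('./openaiSdkProvider'); // hypothetical path
//   globalShortcut.register('CommandOrControl+Shift+Space', () => {
//     provider.togglePushToTalk();
//   });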
function clearConversation() {
  const systemMessage = conversationMessages.find(m => m.role === 'system');
  conversationMessages = systemMessage ? [systemMessage] : [];
  audioChunks = [];

  // Clear timers
  if (silenceCheckTimer) {
    clearTimeout(silenceCheckTimer);
    silenceCheckTimer = null;
  }
  if (windowsTranscriptionTimer) {
    clearInterval(windowsTranscriptionTimer);
    windowsTranscriptionTimer = null;
  }
}

function closeOpenAISDK() {
  stopMacOSAudioCapture();
  openaiClient = null;
  currentConfig = null;
  conversationMessages = [];
  audioChunks = [];
  isProcessing = false;
  isPushToTalkActive = false;

  // Clear timers
  if (silenceCheckTimer) {
    clearTimeout(silenceCheckTimer);
    silenceCheckTimer = null;
  }
  if (windowsTranscriptionTimer) {
    clearInterval(windowsTranscriptionTimer);
    windowsTranscriptionTimer = null;
  }

  notifyPushToTalkState();
  sendToRenderer('update-status', 'Disconnected');
}

// ============ macOS Audio Capture ============

async function killExistingSystemAudioDump() {
  return new Promise(resolve => {
    const { exec } = require('child_process');
    exec('pkill -f SystemAudioDump', error => {
      // Ignore errors (process might not exist)
      setTimeout(resolve, 100);
    });
  });
}

function convertStereoToMono(stereoBuffer) {
  const samples = stereoBuffer.length / 4;
  const monoBuffer = Buffer.alloc(samples * 2);
  for (let i = 0; i < samples; i++) {
    const leftSample = stereoBuffer.readInt16LE(i * 4);
    monoBuffer.writeInt16LE(leftSample, i * 2);
  }
  return monoBuffer;
}

// Calculate RMS (Root Mean Square) volume level of audio buffer
function calculateRMS(buffer) {
  const samples = buffer.length / 2;
  if (samples === 0) return 0;
  let sumSquares = 0;
  for (let i = 0; i < samples; i++) {
    const sample = buffer.readInt16LE(i * 2);
    sumSquares += sample * sample;
  }
  return Math.sqrt(sumSquares / samples);
}

// Check if audio contains speech (simple VAD based on volume threshold)
function hasSpeech(buffer, threshold = 500) {
  const rms = calculateRMS(buffer);
  return rms > threshold;
}
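// Scale reference for the VAD threshold: a full-scale 16-bit sine has an RMS
// of about 32768 / sqrt(2) ~= 23170, so the default threshold of 500 sits near
// 20 * log10(500 / 32768) ~= -36 dBFS, low enough to pass quiet speech while
// rejecting near-silence. Example values:
//
//   calculateRMS(Buffer.alloc(4800)); // 0 (pure silence)
//   hasSpeech(Buffer.alloc(4800));    // false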
async function transcribeBufferedAudio(forcePTT = false) {
  if (audioBuffer.length === 0 || isProcessing) {
    return;
  }

  // In push-to-talk mode, only transcribe when explicitly requested (forcePTT = true)
  if (audioInputMode === 'push-to-talk' && !forcePTT) {
    return;
  }

  // Calculate audio duration
  const bytesPerSample = 2;
  const audioDurationMs = (audioBuffer.length / bytesPerSample / SAMPLE_RATE) * 1000;
  if (audioDurationMs < MIN_AUDIO_DURATION_MS) {
    return; // Not enough audio
  }

  // Check if there's actual speech in the audio (Voice Activity Detection)
  // Skip VAD check in PTT mode - user explicitly wants to transcribe
  if (!forcePTT && !hasSpeech(audioBuffer)) {
    // Clear buffer if it's just silence/noise
    audioBuffer = Buffer.alloc(0);
    return;
  }

  // Take current buffer and reset
  const currentBuffer = audioBuffer;
  audioBuffer = Buffer.alloc(0);

  try {
    console.log(`Transcribing ${audioDurationMs.toFixed(0)}ms of audio...`);
    if (!forcePTT) {
      sendToRenderer('update-status', 'Transcribing...');
    }

    const transcription = await transcribeAudio(currentBuffer, 'audio/wav');

    if (transcription && transcription.trim() && transcription.trim().length > 2) {
      console.log('Transcription:', transcription);
      sendToRenderer('update-status', 'Processing...');

      // Send to chat
      await sendTextMessage(transcription);
    } else if (forcePTT) {
      console.log('Push-to-Talk: No speech detected in recording');
    }

    if (!forcePTT) {
      sendToRenderer('update-status', 'Listening...');
    }
  } catch (error) {
    console.error('Transcription error:', error);
    if (!forcePTT) {
      sendToRenderer('update-status', 'Listening...');
    }
  }
}

async function startMacOSAudioCapture() {
  if (process.platform !== 'darwin') return false;

  // Kill any existing SystemAudioDump processes first
  await killExistingSystemAudioDump();

  console.log('=== Starting macOS audio capture (OpenAI SDK) ===');
  sendToRenderer('update-status', 'Starting audio capture...');

  const { app } = require('electron');

  let systemAudioPath;
  if (app.isPackaged) {
    systemAudioPath = path.join(process.resourcesPath, 'SystemAudioDump');
  } else {
    systemAudioPath = path.join(__dirname, '../assets', 'SystemAudioDump');
  }

  console.log('SystemAudioDump config:', {
    path: systemAudioPath,
    isPackaged: app.isPackaged,
    resourcesPath: process.resourcesPath,
    exists: fs.existsSync(systemAudioPath),
  });

  // Check if file exists
  if (!fs.existsSync(systemAudioPath)) {
    console.error('FATAL: SystemAudioDump not found at:', systemAudioPath);
    sendToRenderer('update-status', 'Error: Audio binary not found');
    return false;
  }

  // Check and fix executable permissions
  try {
    fs.accessSync(systemAudioPath, fs.constants.X_OK);
    console.log('SystemAudioDump is executable');
  } catch (err) {
    console.warn('SystemAudioDump not executable, fixing permissions...');
    try {
      fs.chmodSync(systemAudioPath, 0o755);
      console.log('Fixed executable permissions');
    } catch (chmodErr) {
      console.error('Failed to fix permissions:', chmodErr);
      sendToRenderer('update-status', 'Error: Cannot execute audio binary');
      return false;
    }
  }

  const spawnOptions = {
    stdio: ['ignore', 'pipe', 'pipe'],
    env: {
      ...process.env,
    },
  };

  console.log('Spawning SystemAudioDump...');
  systemAudioProc = spawn(systemAudioPath, [], spawnOptions);

  if (!systemAudioProc.pid) {
    console.error('FATAL: Failed to start SystemAudioDump - no PID');
    sendToRenderer('update-status', 'Error: Audio capture failed to start');
    return false;
  }

  console.log('SystemAudioDump started with PID:', systemAudioProc.pid);

  const CHUNK_DURATION = 0.1;
  const BYTES_PER_SAMPLE = 2;
  const CHANNELS = 2;
  const CHUNK_SIZE = SAMPLE_RATE * BYTES_PER_SAMPLE * CHANNELS * CHUNK_DURATION;

  let tempBuffer = Buffer.alloc(0);
  let chunkCount = 0;
  let firstDataReceived = false;

  systemAudioProc.stdout.on('data', data => {
    if (!firstDataReceived) {
      firstDataReceived = true;
      console.log('First audio data received! Size:', data.length);
      sendToRenderer('update-status', 'Listening...');
    }

    tempBuffer = Buffer.concat([tempBuffer, data]);

    while (tempBuffer.length >= CHUNK_SIZE) {
      const chunk = tempBuffer.slice(0, CHUNK_SIZE);
      tempBuffer = tempBuffer.slice(CHUNK_SIZE);

      // Convert stereo to mono
      const monoChunk = CHANNELS === 2 ? convertStereoToMono(chunk) : chunk;

      if (audioInputMode === 'push-to-talk' && !isPushToTalkActive) {
        continue;
      }

      // Add to audio buffer for transcription
      audioBuffer = Buffer.concat([audioBuffer, monoChunk]);
      chunkCount++;

      if (chunkCount % 100 === 0) {
        console.log(`Audio: ${chunkCount} chunks processed, buffer size: ${audioBuffer.length}`);
      }
    }

    // Limit buffer size (max 30 seconds of audio)
    const maxBufferSize = SAMPLE_RATE * BYTES_PER_SAMPLE * 30;
    if (audioBuffer.length > maxBufferSize) {
      audioBuffer = audioBuffer.slice(-maxBufferSize);
    }
  });

  systemAudioProc.stderr.on('data', data => {
    const msg = data.toString();
    console.error('SystemAudioDump stderr:', msg);
    if (msg.toLowerCase().includes('error')) {
      sendToRenderer('update-status', 'Audio error: ' + msg.substring(0, 50));
    }
  });

  systemAudioProc.on('close', (code, signal) => {
    console.log('SystemAudioDump closed:', { code, signal, chunksProcessed: chunkCount, tempBufferSize: tempBuffer.length });
    if (code !== 0 && code !== null) {
      sendToRenderer('update-status', `Audio stopped (exit: ${code}, signal: ${signal})`);
    }
    systemAudioProc = null;
    stopTranscriptionTimer();
  });

  systemAudioProc.on('error', err => {
    console.error('SystemAudioDump spawn error:', err.message, err.stack);
    sendToRenderer('update-status', 'Audio error: ' + err.message);
    systemAudioProc = null;
    stopTranscriptionTimer();
  });

  systemAudioProc.on('exit', (code, signal) => {
    console.log('SystemAudioDump exit event:', { code, signal });
  });

  // Start periodic transcription
  updateTranscriptionTimerForPushToTalk();

  sendToRenderer('update-status', 'Listening...');
  return true;
}
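// Capture-format arithmetic (from the constants above, which assume
// SystemAudioDump emits interleaved 16-bit stereo at 24 kHz): one 100 ms chunk
// is 24000 * 2 bytes * 2 channels * 0.1 = 9600 bytes, which convertStereoToMono
// halves to 4800 bytes. The 30-second cap on audioBuffer therefore holds at
// most 24000 * 2 * 30 = 1,440,000 bytes of mono PCM.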
function startTranscriptionTimer() {
  // Don't start auto-transcription timer in push-to-talk mode
  if (audioInputMode === 'push-to-talk') {
    return;
  }
  stopTranscriptionTimer();
  transcriptionTimer = setInterval(transcribeBufferedAudio, TRANSCRIPTION_INTERVAL_MS);
}

function stopTranscriptionTimer() {
  if (transcriptionTimer) {
    clearInterval(transcriptionTimer);
    transcriptionTimer = null;
  }
}

function stopMacOSAudioCapture() {
  stopTranscriptionTimer();
  if (systemAudioProc) {
    console.log('Stopping SystemAudioDump for OpenAI SDK...');
    systemAudioProc.kill('SIGTERM');
    systemAudioProc = null;
  }
  audioBuffer = Buffer.alloc(0);
}

module.exports = {
  initializeOpenAISDK,
  setSystemPrompt,
  transcribeAudio,
  sendTextMessage,
  sendImageMessage,
  processAudioChunk,
  flushAudioAndTranscribe,
  togglePushToTalk,
  updatePushToTalkSettings,
  clearConversation,
  closeOpenAISDK,
  startMacOSAudioCapture,
  stopMacOSAudioCapture,
};
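// Usage sketch for wiring these exports to renderer IPC (assumption: channel
// names and the require path are illustrative; actual wiring lives in the
// app's main process entry point):
//
//   const { ipcMain } = require('electron');
//   const provider = require('./openaiSdkProvider');
//
//   ipcMain.handle('sdk-initialize', (_e, config) => provider.initializeOpenAISDK(config));
//   ipcMain.handle('sdk-send-text', (_e, text) => provider.sendTextMessage(text));
//   ipcMain.handle('sdk-send-image', (_e, img, prompt) => provider.sendImageMessage(img, prompt));
//   ipcMain.handle('sdk-audio-chunk', (_e, b64, mime) => provider.processAudioChunk(b64, mime));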