Implement Push-to-Talk feature and enhance audio input settings in AssistantView and CustomizeView. Update README for API key instructions and improve audio processing logic in OpenAI SDK. Adjust pnpm-lock.yaml for dependency updates.
This commit is contained in:
parent
669c019fd8
commit
656e8f0932
38
README.md
38
README.md
@ -1,10 +1,5 @@
|
||||
<img width="1299" height="424" alt="cd (1)" src="https://github.com/user-attachments/assets/b25fff4d-043d-4f38-9985-f832ae0d0f6e" />
|
||||
|
||||
## Recall.ai - API for desktop recording
|
||||
|
||||
If you’re looking for a hosted desktop recording API, consider checking out [Recall.ai](https://www.recall.ai/product/desktop-recording-sdk/?utm_source=github&utm_medium=sponsorship&utm_campaign=sohzm-cheating-daddy), an API that records Zoom, Google Meet, Microsoft Teams, in-person meetings, and more.
|
||||
|
||||
This project is sponsored by Recall.ai.
|
||||
<!-- <img width="1299" height="424" alt="cd (1)" src="https://github.com/user-attachments/assets/b25fff4d-043d-4f38-9985-f832ae0d0f6e" /> -->
|
||||
# Mastermind
|
||||
|
||||
---
|
||||
|
||||
@ -14,33 +9,36 @@ This project is sponsored by Recall.ai.
|
||||
> [!NOTE]
|
||||
> During testing it won't answer if you ask something; you need to simulate the interviewer asking a question, which it will answer
|
||||
|
||||
A real-time AI assistant that provides contextual help during video calls, interviews, presentations, and meetings using screen capture and audio analysis.
|
||||
A real-time AI assistant that provides contextual help during video calls, interviews, presentations, and meetings using screen capture and audio analysis. It is a fork of the [Cheating Daddy](https://github.com/sohzm/cheating-daddy) project.
|
||||
|
||||
## Features
|
||||
|
||||
- **Live AI Assistance**: Real-time help powered by Google Gemini 2.0 Flash Live
|
||||
- **Live AI Assistance**: Real-time help powered by Gemini API / OpenAI SDK / OpenAI Realtime API, so you can choose which one you want to use
|
||||
- **Screen & Audio Capture**: Analyzes what you see and hear for contextual responses
|
||||
- **Multiple Profiles**: Interview, Sales Call, Business Meeting, Presentation, Negotiation
|
||||
- **Transparent Overlay**: Always-on-top window that can be positioned anywhere
|
||||
- **Transparent Overlay**: Always-on-top window that can be positioned anywhere. If something goes wrong, you can hide it without stopping the session and losing context!
|
||||
- **Click-through Mode**: Make window transparent to clicks when needed
|
||||
- **Cross-platform**: Works on macOS, Windows, and Linux (kinda, don't use, just for testing rn)
|
||||
|
||||
## Setup
|
||||
|
||||
1. **Get a Gemini API Key**: Visit [Google AI Studio](https://aistudio.google.com/apikey)
|
||||
2. **Install Dependencies**: `npm install`
|
||||
3. **Run the App**: `npm start`
|
||||
1. **Get an API Key**: Visit [Google AI Studio](https://aistudio.google.com/apikey) or [OpenAI](https://platform.openai.com/docs/api-reference) or any other OpenAI-compatible API!
|
||||
2. **Install Dependencies**: `pnpm install`
|
||||
3. **Run the App**: `pnpm start`
|
||||
|
||||
## Usage
|
||||
|
||||
1. Enter your Gemini API key in the main window
|
||||
1. Enter your API key in the main window, select provider and model you want to use in preferences
|
||||
2. Choose your profile and language in settings
|
||||
3. Click "Start Session" to begin
|
||||
4. Position the window using keyboard shortcuts
|
||||
5. The AI will provide real-time assistance based on your screen and what interview asks
|
||||
3. Click "Start Session" to begin. If you want to use push-to-talk mode, you can enable it in preferences
|
||||
4. Position the window using keyboard shortcuts, or use your mouse to move it
|
||||
5. The AI will provide real-time assistance based on your screen and system audio/microphone input. You can also send text messages to the AI by pressing Enter
|
||||
|
||||
## Keyboard Shortcuts
|
||||
|
||||
> [!NOTE]
|
||||
> All keyboard shortcuts are customizable in settings. You can check some default shortcuts below.
|
||||
|
||||
- **Window Movement**: `Ctrl/Cmd + Arrow Keys` - Move window
|
||||
- **Click-through**: `Ctrl/Cmd + M` - Toggle mouse events
|
||||
- **Close/Back**: `Ctrl/Cmd + \` - Close window or go back
|
||||
@ -48,13 +46,13 @@ A real-time AI assistant that provides contextual help during video calls, inter
|
||||
|
||||
## Audio Capture
|
||||
|
||||
- **macOS**: [SystemAudioDump](https://github.com/Mohammed-Yasin-Mulla/Sound) for system audio
|
||||
- **Windows**: Loopback audio capture
|
||||
- **macOS**: [SystemAudioDump](https://github.com/Mohammed-Yasin-Mulla/Sound) for system audio capture, you can use microphone input as well
|
||||
- **Windows**: Loopback audio capture, you can use microphone input as well
|
||||
- **Linux**: Microphone input
|
||||
|
||||
## Requirements
|
||||
|
||||
- Electron-compatible OS (macOS, Windows, Linux)
|
||||
- Gemini API key
|
||||
- AI Provider API key
|
||||
- Screen recording permissions
|
||||
- Microphone/audio permissions
|
||||
|
||||
784
pnpm-lock.yaml
generated
784
pnpm-lock.yaml
generated
File diff suppressed because it is too large
Load Diff
@ -366,6 +366,57 @@ export class AssistantView extends LitElement {
|
||||
.region-select-btn span {
|
||||
margin-left: 4px;
|
||||
}
|
||||
|
||||
.ptt-toggle-btn {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
justify-content: center;
|
||||
background: transparent;
|
||||
color: var(--text-secondary);
|
||||
border: 1px solid var(--border-color);
|
||||
padding: 6px 12px;
|
||||
border-radius: 20px;
|
||||
font-size: 12px;
|
||||
cursor: pointer;
|
||||
transition: all 0.15s ease;
|
||||
}
|
||||
|
||||
.ptt-toggle-btn:hover {
|
||||
background: var(--hover-background);
|
||||
color: var(--text-color);
|
||||
border-color: var(--text-color);
|
||||
}
|
||||
|
||||
.ptt-toggle-btn.active {
|
||||
color: var(--error-color);
|
||||
border-color: var(--error-color);
|
||||
}
|
||||
|
||||
.ptt-indicator {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
gap: 8px;
|
||||
font-size: 11px;
|
||||
color: var(--text-secondary);
|
||||
margin-bottom: 6px;
|
||||
}
|
||||
|
||||
.ptt-dot {
|
||||
width: 8px;
|
||||
height: 8px;
|
||||
border-radius: 50%;
|
||||
background: var(--border-color);
|
||||
box-shadow: 0 0 0 1px var(--border-color);
|
||||
}
|
||||
|
||||
.ptt-dot.active {
|
||||
background: var(--error-color);
|
||||
box-shadow: 0 0 0 1px var(--error-color);
|
||||
}
|
||||
|
||||
.ptt-label {
|
||||
font-family: 'SF Mono', Monaco, monospace;
|
||||
}
|
||||
`;
|
||||
|
||||
static properties = {
|
||||
@ -377,6 +428,9 @@ export class AssistantView extends LitElement {
|
||||
flashCount: { type: Number },
|
||||
flashLiteCount: { type: Number },
|
||||
aiProvider: { type: String },
|
||||
pushToTalkActive: { type: Boolean },
|
||||
audioInputMode: { type: String },
|
||||
pushToTalkKeybind: { type: String },
|
||||
};
|
||||
|
||||
constructor() {
|
||||
@ -388,6 +442,9 @@ export class AssistantView extends LitElement {
|
||||
this.flashCount = 0;
|
||||
this.flashLiteCount = 0;
|
||||
this.aiProvider = 'gemini';
|
||||
this.pushToTalkActive = false;
|
||||
this.audioInputMode = 'auto';
|
||||
this.pushToTalkKeybind = '';
|
||||
}
|
||||
|
||||
getProfileNames() {
|
||||
@ -507,6 +564,7 @@ export class AssistantView extends LitElement {
|
||||
|
||||
// Load limits on mount
|
||||
this.loadLimits();
|
||||
this.loadPushToTalkKeybind();
|
||||
|
||||
// Set up IPC listeners for keyboard shortcuts
|
||||
if (window.require) {
|
||||
@ -532,10 +590,17 @@ export class AssistantView extends LitElement {
|
||||
this.scrollResponseDown();
|
||||
};
|
||||
|
||||
this.handlePushToTalkState = (event, state) => {
|
||||
this.pushToTalkActive = state?.active ?? false;
|
||||
this.audioInputMode = state?.inputMode ?? 'auto';
|
||||
this.requestUpdate();
|
||||
};
|
||||
|
||||
ipcRenderer.on('navigate-previous-response', this.handlePreviousResponse);
|
||||
ipcRenderer.on('navigate-next-response', this.handleNextResponse);
|
||||
ipcRenderer.on('scroll-response-up', this.handleScrollUp);
|
||||
ipcRenderer.on('scroll-response-down', this.handleScrollDown);
|
||||
ipcRenderer.on('push-to-talk-state', this.handlePushToTalkState);
|
||||
}
|
||||
}
|
||||
|
||||
@ -557,6 +622,9 @@ export class AssistantView extends LitElement {
|
||||
if (this.handleScrollDown) {
|
||||
ipcRenderer.removeListener('scroll-response-down', this.handleScrollDown);
|
||||
}
|
||||
if (this.handlePushToTalkState) {
|
||||
ipcRenderer.removeListener('push-to-talk-state', this.handlePushToTalkState);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -584,6 +652,15 @@ export class AssistantView extends LitElement {
|
||||
}
|
||||
}
|
||||
|
||||
async loadPushToTalkKeybind() {
|
||||
if (window.cheatingDaddy?.storage?.getKeybinds) {
|
||||
const isMac = window.cheatingDaddy?.isMacOS || navigator.platform.includes('Mac');
|
||||
const defaultKeybind = isMac ? 'Ctrl+Space' : 'Ctrl+Space';
|
||||
const keybinds = await window.cheatingDaddy.storage.getKeybinds();
|
||||
this.pushToTalkKeybind = keybinds?.pushToTalk || defaultKeybind;
|
||||
}
|
||||
}
|
||||
|
||||
getTotalUsed() {
|
||||
return this.flashCount + this.flashLiteCount;
|
||||
}
|
||||
@ -608,6 +685,14 @@ export class AssistantView extends LitElement {
|
||||
}
|
||||
}
|
||||
|
||||
handlePushToTalkToggle() {
|
||||
if (!window.require) {
|
||||
return;
|
||||
}
|
||||
const { ipcRenderer } = window.require('electron');
|
||||
ipcRenderer.send('push-to-talk-toggle');
|
||||
}
|
||||
|
||||
scrollToBottom() {
|
||||
setTimeout(() => {
|
||||
const container = this.shadowRoot.querySelector('.response-container');
|
||||
@ -649,10 +734,26 @@ export class AssistantView extends LitElement {
|
||||
|
||||
render() {
|
||||
const responseCounter = this.getResponseCounter();
|
||||
const showPushToTalk = this.aiProvider === 'openai-sdk' && this.audioInputMode === 'push-to-talk';
|
||||
const keybindLabel = this.pushToTalkKeybind || 'Hotkey';
|
||||
const pushToTalkLabel = this.pushToTalkActive
|
||||
? 'Recording...'
|
||||
: `Press ${keybindLabel} to start/stop`;
|
||||
const pushToTalkButtonLabel = this.pushToTalkActive ? 'Stop' : 'Record';
|
||||
|
||||
return html`
|
||||
<div class="response-container" id="responseContainer"></div>
|
||||
|
||||
${showPushToTalk
|
||||
? html`
|
||||
<div class="ptt-indicator">
|
||||
<span class="ptt-dot ${this.pushToTalkActive ? 'active' : ''}"></span>
|
||||
<span>Push-to-Talk:</span>
|
||||
<span class="ptt-label">${pushToTalkLabel}</span>
|
||||
</div>
|
||||
`
|
||||
: ''}
|
||||
|
||||
<div class="text-input-container">
|
||||
<button class="nav-button" @click=${this.navigateToPreviousResponse} ?disabled=${this.currentResponseIndex <= 0}>
|
||||
<svg width="24px" height="24px" stroke-width="1.7" viewBox="0 0 24 24" fill="none" xmlns="http://www.w3.org/2000/svg">
|
||||
@ -671,6 +772,17 @@ export class AssistantView extends LitElement {
|
||||
<input type="text" id="textInput" placeholder="Type a message to the AI..." @keydown=${this.handleTextKeydown} />
|
||||
|
||||
<div class="capture-buttons">
|
||||
${showPushToTalk
|
||||
? html`
|
||||
<button
|
||||
class="ptt-toggle-btn ${this.pushToTalkActive ? 'active' : ''}"
|
||||
@click=${this.handlePushToTalkToggle}
|
||||
title="Toggle Push-to-Talk recording"
|
||||
>
|
||||
${pushToTalkButtonLabel}
|
||||
</button>
|
||||
`
|
||||
: ''}
|
||||
<button class="region-select-btn" @click=${this.handleRegionSelect} title="Select region to analyze (like Win+Shift+S)">
|
||||
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 20 20" fill="currentColor">
|
||||
<path
|
||||
|
||||
@ -537,6 +537,7 @@ export class CustomizeView extends LitElement {
|
||||
color: var(--error-color);
|
||||
border-left: 2px solid var(--error-color);
|
||||
}
|
||||
|
||||
`;
|
||||
|
||||
static properties = {
|
||||
@ -549,6 +550,7 @@ export class CustomizeView extends LitElement {
|
||||
backgroundTransparency: { type: Number },
|
||||
fontSize: { type: Number },
|
||||
theme: { type: String },
|
||||
audioInputMode: { type: String },
|
||||
onProfileChange: { type: Function },
|
||||
onLanguageChange: { type: Function },
|
||||
onImageQualityChange: { type: Function },
|
||||
@ -587,6 +589,7 @@ export class CustomizeView extends LitElement {
|
||||
|
||||
// Audio mode default
|
||||
this.audioMode = 'speaker_only';
|
||||
this.audioInputMode = 'auto';
|
||||
|
||||
// Custom prompt
|
||||
this.customPrompt = '';
|
||||
@ -795,6 +798,7 @@ export class CustomizeView extends LitElement {
|
||||
this.backgroundTransparency = prefs.backgroundTransparency ?? 0.8;
|
||||
this.fontSize = prefs.fontSize ?? 20;
|
||||
this.audioMode = prefs.audioMode ?? 'speaker_only';
|
||||
this.audioInputMode = prefs.audioInputMode ?? 'auto';
|
||||
this.customPrompt = prefs.customPrompt ?? '';
|
||||
this.theme = prefs.theme ?? 'dark';
|
||||
this.aiProvider = prefs.aiProvider ?? 'gemini';
|
||||
@ -820,6 +824,7 @@ export class CustomizeView extends LitElement {
|
||||
|
||||
this.updateBackgroundTransparency();
|
||||
this.updateFontSize();
|
||||
this.notifyPushToTalkSettings();
|
||||
this.requestUpdate();
|
||||
} catch (error) {
|
||||
console.error('Error loading settings:', error);
|
||||
@ -832,6 +837,10 @@ export class CustomizeView extends LitElement {
|
||||
resizeLayout();
|
||||
}
|
||||
|
||||
disconnectedCallback() {
|
||||
super.disconnectedCallback();
|
||||
}
|
||||
|
||||
getProfiles() {
|
||||
return [
|
||||
{
|
||||
@ -944,6 +953,28 @@ export class CustomizeView extends LitElement {
|
||||
this.requestUpdate();
|
||||
}
|
||||
|
||||
async handleAudioInputModeChange(e) {
|
||||
this.audioInputMode = e.target.value;
|
||||
await cheatingDaddy.storage.updatePreference('audioInputMode', e.target.value);
|
||||
this.notifyPushToTalkSettings();
|
||||
this.requestUpdate();
|
||||
}
|
||||
|
||||
notifyPushToTalkSettings() {
|
||||
if (!window.require) {
|
||||
return;
|
||||
}
|
||||
try {
|
||||
const { ipcRenderer } = window.require('electron');
|
||||
ipcRenderer.send('update-push-to-talk-settings', {
|
||||
inputMode: this.audioInputMode,
|
||||
});
|
||||
ipcRenderer.send('update-keybinds', this.keybinds);
|
||||
} catch (error) {
|
||||
console.error('Failed to notify push-to-talk settings:', error);
|
||||
}
|
||||
}
|
||||
|
||||
async handleThemeChange(e) {
|
||||
this.theme = e.target.value;
|
||||
await cheatingDaddy.theme.save(this.theme);
|
||||
@ -965,6 +996,7 @@ export class CustomizeView extends LitElement {
|
||||
nextResponse: isMac ? 'Cmd+]' : 'Ctrl+]',
|
||||
scrollUp: isMac ? 'Cmd+Shift+Up' : 'Ctrl+Shift+Up',
|
||||
scrollDown: isMac ? 'Cmd+Shift+Down' : 'Ctrl+Shift+Down',
|
||||
pushToTalk: isMac ? 'Ctrl+Space' : 'Ctrl+Space',
|
||||
};
|
||||
}
|
||||
|
||||
@ -1050,6 +1082,11 @@ export class CustomizeView extends LitElement {
|
||||
name: 'Scroll Response Down',
|
||||
description: 'Scroll the AI response content down',
|
||||
},
|
||||
{
|
||||
key: 'pushToTalk',
|
||||
name: 'Push-to-Talk',
|
||||
description: 'Activate audio recording (OpenAI SDK only)',
|
||||
},
|
||||
];
|
||||
}
|
||||
|
||||
@ -1319,6 +1356,9 @@ export class CustomizeView extends LitElement {
|
||||
}
|
||||
|
||||
renderAudioSection() {
|
||||
const isPushToTalkAvailable = this.aiProvider === 'openai-sdk';
|
||||
const pushToTalkDisabled = !isPushToTalkAvailable;
|
||||
|
||||
return html`
|
||||
<div class="content-header">Audio Settings</div>
|
||||
<div class="form-grid">
|
||||
@ -1331,6 +1371,28 @@ export class CustomizeView extends LitElement {
|
||||
</select>
|
||||
<div class="form-description">Choose which audio sources to capture for the AI.</div>
|
||||
</div>
|
||||
<div class="form-group">
|
||||
<label class="form-label">Audio Input Mode</label>
|
||||
<select
|
||||
class="form-control"
|
||||
.value=${this.audioInputMode}
|
||||
@change=${this.handleAudioInputModeChange}
|
||||
?disabled=${pushToTalkDisabled}
|
||||
>
|
||||
<option value="auto">Automatic (Always Listening)</option>
|
||||
<option value="push-to-talk">Push-to-Talk (Hotkey Activated)</option>
|
||||
</select>
|
||||
<div class="form-description">
|
||||
${pushToTalkDisabled
|
||||
? 'Push-to-Talk is available only with the OpenAI SDK provider.'
|
||||
: this.audioInputMode === 'auto'
|
||||
? 'Audio is continuously recorded and transcribed when silence is detected.'
|
||||
: 'Audio recording starts when you press and hold/toggle the hotkey.'}
|
||||
</div>
|
||||
</div>
|
||||
${this.audioInputMode === 'push-to-talk'
|
||||
? html`<div class="form-description">Use the Push-to-Talk hotkey (toggle) to start/stop recording.</div>`
|
||||
: ''}
|
||||
</div>
|
||||
`;
|
||||
}
|
||||
|
||||
@ -295,7 +295,7 @@ export class HelpView extends LitElement {
|
||||
<span>Community & Support</span>
|
||||
</div>
|
||||
<div class="community-links">
|
||||
<div class="community-link" @click=${() => this.handleExternalLinkClick('https://cheatingdaddy.com')}>
|
||||
<!-- <div class="community-link" @click=${() => this.handleExternalLinkClick('https://cheatingdaddy.com')}>
|
||||
<svg
|
||||
viewBox="0 0 24 24"
|
||||
fill="none"
|
||||
@ -312,8 +312,8 @@ export class HelpView extends LitElement {
|
||||
></path>
|
||||
</svg>
|
||||
Website
|
||||
</div>
|
||||
<div class="community-link" @click=${() => this.handleExternalLinkClick('https://github.com/sohzm/cheating-daddy')}>
|
||||
</div> -->
|
||||
<div class="community-link" @click=${() => this.handleExternalLinkClick('https://github.com/ShiftyX1/Mastermind')}>
|
||||
<svg
|
||||
viewBox="0 0 24 24"
|
||||
fill="none"
|
||||
@ -329,7 +329,7 @@ export class HelpView extends LitElement {
|
||||
</svg>
|
||||
GitHub
|
||||
</div>
|
||||
<div class="community-link" @click=${() => this.handleExternalLinkClick('https://discord.gg/GCBdubnXfJ')}>
|
||||
<!-- <div class="community-link" @click=${() => this.handleExternalLinkClick('https://discord.gg/GCBdubnXfJ')}>
|
||||
<svg
|
||||
viewBox="0 0 24 24"
|
||||
fill="none"
|
||||
@ -353,7 +353,7 @@ export class HelpView extends LitElement {
|
||||
></path>
|
||||
</svg>
|
||||
Discord
|
||||
</div>
|
||||
</div> -->
|
||||
</div>
|
||||
</div>
|
||||
|
||||
|
||||
@ -33,6 +33,7 @@ const DEFAULT_PREFERENCES = {
|
||||
selectedImageQuality: 'medium',
|
||||
advancedMode: false,
|
||||
audioMode: 'speaker_only',
|
||||
audioInputMode: 'auto',
|
||||
fontSize: 'medium',
|
||||
backgroundTransparency: 0.8,
|
||||
googleSearchEnabled: false,
|
||||
|
||||
@ -186,6 +186,7 @@ async function initializeAISession(customPrompt = '', profile = 'interview', lan
|
||||
try {
|
||||
await openaiSdkProvider.initializeOpenAISDK(providerConfig);
|
||||
openaiSdkProvider.setSystemPrompt(systemPrompt);
|
||||
openaiSdkProvider.updatePushToTalkSettings(prefs.audioInputMode || 'auto');
|
||||
sendToRenderer('update-status', 'Ready (OpenAI SDK)');
|
||||
return true;
|
||||
} catch (error) {
|
||||
@ -325,6 +326,16 @@ function setupAIProviderIpcHandlers(geminiSessionRef) {
|
||||
saveConversationTurn(transcription, response);
|
||||
});
|
||||
|
||||
// Toggle request for push-to-talk recording. It is only forwarded when the
// OpenAI SDK provider is the current provider; otherwise it is ignored.
ipcMain.on('push-to-talk-toggle', () => {
    if (currentProvider !== 'openai-sdk') {
        return;
    }

    // togglePushToTalk() is async; handle its rejection here so a failure
    // cannot turn into an unhandled promise rejection in the main process.
    Promise.resolve(openaiSdkProvider.togglePushToTalk()).catch(error => {
        console.error('Push-to-talk toggle failed:', error);
    });
});

// Audio input mode update from the renderer; a missing mode falls back to 'auto'.
ipcMain.on('update-push-to-talk-settings', (event, settings) => {
    // `settings ?? {}` also covers an explicit null payload, which a
    // destructuring default (applied only for undefined) would not.
    const { inputMode } = settings ?? {};
    openaiSdkProvider.updatePushToTalkSettings(inputMode || 'auto');
});
|
||||
|
||||
ipcMain.handle('initialize-ai-session', async (event, customPrompt, profile, language) => {
|
||||
return await initializeAISession(customPrompt, profile, language);
|
||||
});
|
||||
|
||||
@ -14,6 +14,8 @@ let openaiClient = null;
|
||||
let currentConfig = null;
|
||||
let conversationMessages = [];
|
||||
let isProcessing = false;
|
||||
let audioInputMode = 'auto';
|
||||
let isPushToTalkActive = false;
|
||||
|
||||
// macOS audio capture
|
||||
let systemAudioProc = null;
|
||||
@ -294,6 +296,18 @@ async function processAudioChunk(base64Audio, mimeType) {
|
||||
const now = Date.now();
|
||||
const buffer = Buffer.from(base64Audio, 'base64');
|
||||
|
||||
if (audioInputMode === 'push-to-talk') {
|
||||
if (!isPushToTalkActive) {
|
||||
return { success: true, ignored: true };
|
||||
}
|
||||
|
||||
// In push-to-talk mode we only buffer while active
|
||||
audioChunks.push(buffer);
|
||||
lastAudioTime = now;
|
||||
|
||||
return { success: true, buffering: true };
|
||||
}
|
||||
|
||||
// Track first chunk time for duration-based flushing
|
||||
if (audioChunks.length === 0) {
|
||||
firstChunkTime = now;
|
||||
@ -380,6 +394,97 @@ async function flushAudioAndTranscribe() {
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Publishes the current push-to-talk state to the renderer on the
 * 'push-to-talk-state' channel: whether recording is active and which audio
 * input mode is selected.
 */
function notifyPushToTalkState() {
    const state = {
        active: isPushToTalkActive,
        inputMode: audioInputMode,
    };
    sendToRenderer('push-to-talk-state', state);
}
|
||||
|
||||
/**
 * Discards the chunk-based audio state: cancels the pending silence check and
 * the Windows transcription interval (when set), then empties the chunk list
 * and zeroes both chunk timestamps.
 */
function resetRealtimeAudioBuffer() {
    if (silenceCheckTimer) {
        clearTimeout(silenceCheckTimer);
        silenceCheckTimer = null;
    }

    if (windowsTranscriptionTimer) {
        clearInterval(windowsTranscriptionTimer);
        windowsTranscriptionTimer = null;
    }

    audioChunks = [];
    firstChunkTime = 0;
    lastAudioTime = 0;
}
|
||||
|
||||
/**
 * Brings the periodic transcription timer in line with the audio input mode.
 *
 * In push-to-talk mode the timer is stopped. In any other mode it is started,
 * but only when a system audio process exists and no timer is running yet.
 */
function updateTranscriptionTimerForPushToTalk() {
    if (audioInputMode === 'push-to-talk') {
        stopTranscriptionTimer();
    } else if (systemAudioProc && !transcriptionTimer) {
        startTranscriptionTimer();
    }
}
|
||||
|
||||
/**
 * Starts or stops a push-to-talk recording.
 *
 * Starting clears previously buffered audio and reports 'Recording...'.
 * Stopping an active recording while in push-to-talk mode transcribes whatever
 * was buffered, then reports 'Listening...'. The renderer is always notified
 * of the new state.
 *
 * @param {boolean} active - true to start recording, false to stop.
 * @returns {Promise<void>} settles once any triggered transcription finished;
 *     rejects with the transcription error if one is thrown.
 */
async function setPushToTalkActive(active) {
    const wasActive = isPushToTalkActive;
    isPushToTalkActive = active;

    if (active) {
        // Starting recording - clear any old buffers
        resetRealtimeAudioBuffer();
        audioBuffer = Buffer.alloc(0);
        console.log('Push-to-Talk: Recording started');
        sendToRenderer('update-status', 'Recording...');
    }

    notifyPushToTalkState();

    // Only a real active -> inactive transition in PTT mode triggers transcription.
    const stoppedRecording = !active && wasActive && audioInputMode === 'push-to-talk';
    if (!stoppedRecording) {
        return;
    }

    console.log('Push-to-Talk: Recording stopped, transcribing...');
    sendToRenderer('update-status', 'Transcribing...');

    try {
        // For browser-based audio (Windows)
        if (audioChunks.length > 0) {
            await flushAudioAndTranscribe();
        }
        // For macOS SystemAudioDump
        if (audioBuffer.length > 0) {
            await transcribeBufferedAudio(true); // Force transcription
        }
    } finally {
        // Restore the status even when transcription throws, so the UI does
        // not stay stuck on 'Transcribing...'. The error still propagates.
        sendToRenderer('update-status', 'Listening...');
    }
}
|
||||
|
||||
/**
 * Flips the push-to-talk recording state: stops the recording when one is
 * active, starts one otherwise.
 *
 * @returns {Promise<void>}
 */
async function togglePushToTalk() {
    const nextActive = !isPushToTalkActive;
    await setPushToTalkActive(nextActive);
}
|
||||
|
||||
/**
 * Applies the audio input mode chosen in the settings.
 *
 * Buffered audio and capture timers are reset only when push-to-talk is
 * actually being left (the mode switches away from it, or a recording is
 * cancelled). Re-sending an unchanged non-PTT mode therefore no longer
 * discards audio that is still being collected.
 *
 * @param {string} [inputMode] - 'auto' or 'push-to-talk'; a falsy value keeps
 *     the current mode.
 */
function updatePushToTalkSettings(inputMode) {
    const previousMode = audioInputMode;
    const wasRecording = isPushToTalkActive;

    if (inputMode) {
        audioInputMode = inputMode;
    }

    const isPushToTalkMode = audioInputMode === 'push-to-talk';

    // A recording cannot stay active outside push-to-talk mode.
    if (!isPushToTalkMode && wasRecording) {
        isPushToTalkActive = false;
    }

    const leavingPushToTalk = !isPushToTalkMode && (previousMode === 'push-to-talk' || wasRecording);
    if (leavingPushToTalk) {
        resetRealtimeAudioBuffer();
        audioBuffer = Buffer.alloc(0);
    }

    notifyPushToTalkState();
    updateTranscriptionTimerForPushToTalk();
}
|
||||
|
||||
function clearConversation() {
|
||||
const systemMessage = conversationMessages.find(m => m.role === 'system');
|
||||
conversationMessages = systemMessage ? [systemMessage] : [];
|
||||
@ -403,6 +508,7 @@ function closeOpenAISDK() {
|
||||
conversationMessages = [];
|
||||
audioChunks = [];
|
||||
isProcessing = false;
|
||||
isPushToTalkActive = false;
|
||||
|
||||
// Clear timers
|
||||
if (silenceCheckTimer) {
|
||||
@ -414,6 +520,7 @@ function closeOpenAISDK() {
|
||||
windowsTranscriptionTimer = null;
|
||||
}
|
||||
|
||||
notifyPushToTalkState();
|
||||
sendToRenderer('update-status', 'Disconnected');
|
||||
}
|
||||
|
||||
@ -461,11 +568,16 @@ function hasSpeech(buffer, threshold = 500) {
|
||||
return rms > threshold;
|
||||
}
|
||||
|
||||
async function transcribeBufferedAudio() {
|
||||
async function transcribeBufferedAudio(forcePTT = false) {
|
||||
if (audioBuffer.length === 0 || isProcessing) {
|
||||
return;
|
||||
}
|
||||
|
||||
// In push-to-talk mode, only transcribe when explicitly requested (forcePTT=true)
|
||||
if (audioInputMode === 'push-to-talk' && !forcePTT) {
|
||||
return;
|
||||
}
|
||||
|
||||
// Calculate audio duration
|
||||
const bytesPerSample = 2;
|
||||
const audioDurationMs = (audioBuffer.length / bytesPerSample / SAMPLE_RATE) * 1000;
|
||||
@ -475,7 +587,8 @@ async function transcribeBufferedAudio() {
|
||||
}
|
||||
|
||||
// Check if there's actual speech in the audio (Voice Activity Detection)
|
||||
if (!hasSpeech(audioBuffer)) {
|
||||
// Skip VAD check in PTT mode - user explicitly wants to transcribe
|
||||
if (!forcePTT && !hasSpeech(audioBuffer)) {
|
||||
// Clear buffer if it's just silence/noise
|
||||
audioBuffer = Buffer.alloc(0);
|
||||
return;
|
||||
@ -487,7 +600,9 @@ async function transcribeBufferedAudio() {
|
||||
|
||||
try {
|
||||
console.log(`Transcribing ${audioDurationMs.toFixed(0)}ms of audio...`);
|
||||
if (!forcePTT) {
|
||||
sendToRenderer('update-status', 'Transcribing...');
|
||||
}
|
||||
|
||||
const transcription = await transcribeAudio(currentBuffer, 'audio/wav');
|
||||
|
||||
@ -497,14 +612,20 @@ async function transcribeBufferedAudio() {
|
||||
|
||||
// Send to chat
|
||||
await sendTextMessage(transcription);
|
||||
} else if (forcePTT) {
|
||||
console.log('Push-to-Talk: No speech detected in recording');
|
||||
}
|
||||
|
||||
if (!forcePTT) {
|
||||
sendToRenderer('update-status', 'Listening...');
|
||||
}
|
||||
} catch (error) {
|
||||
console.error('Transcription error:', error);
|
||||
if (!forcePTT) {
|
||||
sendToRenderer('update-status', 'Listening...');
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
async function startMacOSAudioCapture() {
|
||||
if (process.platform !== 'darwin') return false;
|
||||
@ -598,6 +719,10 @@ async function startMacOSAudioCapture() {
|
||||
// Convert stereo to mono
|
||||
const monoChunk = CHANNELS === 2 ? convertStereoToMono(chunk) : chunk;
|
||||
|
||||
if (audioInputMode === 'push-to-talk' && !isPushToTalkActive) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Add to audio buffer for transcription
|
||||
audioBuffer = Buffer.concat([audioBuffer, monoChunk]);
|
||||
|
||||
@ -643,7 +768,7 @@ async function startMacOSAudioCapture() {
|
||||
});
|
||||
|
||||
// Start periodic transcription
|
||||
startTranscriptionTimer();
|
||||
updateTranscriptionTimerForPushToTalk();
|
||||
|
||||
sendToRenderer('update-status', 'Listening...');
|
||||
|
||||
@ -651,6 +776,10 @@ async function startMacOSAudioCapture() {
|
||||
}
|
||||
|
||||
/**
 * (Re)starts the periodic transcription of buffered audio.
 *
 * Nothing happens in push-to-talk mode. Otherwise any existing timer is
 * stopped first and a new interval is scheduled every
 * TRANSCRIPTION_INTERVAL_MS.
 */
function startTranscriptionTimer() {
    const isPushToTalkMode = audioInputMode === 'push-to-talk';
    if (!isPushToTalkMode) {
        stopTranscriptionTimer();
        transcriptionTimer = setInterval(transcribeBufferedAudio, TRANSCRIPTION_INTERVAL_MS);
    }
}
|
||||
@ -682,6 +811,8 @@ module.exports = {
|
||||
sendImageMessage,
|
||||
processAudioChunk,
|
||||
flushAudioAndTranscribe,
|
||||
togglePushToTalk,
|
||||
updatePushToTalkSettings,
|
||||
clearConversation,
|
||||
closeOpenAISDK,
|
||||
startMacOSAudioCapture,
|
||||
|
||||
@ -186,6 +186,10 @@ ipcRenderer.on('update-status', (event, status) => {
|
||||
cheatingDaddy.setStatus(status);
|
||||
});
|
||||
|
||||
// Relay: when 'push-to-talk-toggle' arrives in this renderer, send a message
// with the same channel name back over IPC.
ipcRenderer.on('push-to-talk-toggle', () => {
    const channel = 'push-to-talk-toggle';
    ipcRenderer.send(channel);
});
|
||||
|
||||
async function startCapture(screenshotIntervalSeconds = 5, imageQuality = 'medium') {
|
||||
// Store the image quality for manual screenshots
|
||||
currentImageQuality = imageQuality;
|
||||
|
||||
@ -9,6 +9,7 @@ let windowResizing = false;
|
||||
let resizeAnimation = null;
|
||||
const RESIZE_ANIMATION_DURATION = 500; // milliseconds
|
||||
|
||||
|
||||
function createWindow(sendToRenderer, geminiSessionRef) {
|
||||
// Get layout preference (default to 'normal')
|
||||
let windowWidth = 1100;
|
||||
@ -155,6 +156,7 @@ function getDefaultKeybinds() {
|
||||
scrollUp: isMac ? 'Cmd+Shift+Up' : 'Ctrl+Shift+Up',
|
||||
scrollDown: isMac ? 'Cmd+Shift+Down' : 'Ctrl+Shift+Down',
|
||||
emergencyErase: isMac ? 'Cmd+Shift+E' : 'Ctrl+Shift+E',
|
||||
pushToTalk: isMac ? 'Ctrl+Space' : 'Ctrl+Space',
|
||||
};
|
||||
}
|
||||
|
||||
@ -164,6 +166,10 @@ function updateGlobalShortcuts(keybinds, mainWindow, sendToRenderer, geminiSessi
|
||||
// Unregister all existing shortcuts
|
||||
globalShortcut.unregisterAll();
|
||||
|
||||
const prefs = storage.getPreferences();
|
||||
const audioInputMode = prefs.audioInputMode || 'auto';
|
||||
const enablePushToTalk = audioInputMode === 'push-to-talk';
|
||||
|
||||
const primaryDisplay = screen.getPrimaryDisplay();
|
||||
const { width, height } = primaryDisplay.workAreaSize;
|
||||
const moveIncrement = Math.floor(Math.min(width, height) * 0.1);
|
||||
@ -343,6 +349,18 @@ function updateGlobalShortcuts(keybinds, mainWindow, sendToRenderer, geminiSessi
|
||||
console.error(`Failed to register emergencyErase (${keybinds.emergencyErase}):`, error);
|
||||
}
|
||||
}
|
||||
|
||||
// Register the push-to-talk shortcut. It is only registered when the saved
// audio input mode is 'push-to-talk' (enablePushToTalk); pressing it sends
// 'push-to-talk-toggle' to the renderer.
if (keybinds.pushToTalk && enablePushToTalk) {
    try {
        // register() returns false when the accelerator could not be claimed
        // instead of throwing, so the result has to be checked explicitly.
        const registered = globalShortcut.register(keybinds.pushToTalk, () => {
            sendToRenderer('push-to-talk-toggle');
        });

        if (registered) {
            console.log(`Registered pushToTalk (toggle): ${keybinds.pushToTalk}`);
        } else {
            console.error(`Failed to register pushToTalk (${keybinds.pushToTalk}): shortcut is unavailable`);
        }
    } catch (error) {
        console.error(`Failed to register pushToTalk (${keybinds.pushToTalk}):`, error);
    }
}
|
||||
}
|
||||
|
||||
function setupWindowIpcHandlers(mainWindow, sendToRenderer, geminiSessionRef) {
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user