diff --git a/apps/desktop/src/main/main.ts b/apps/desktop/src/main/main.ts index 677f068..32c3180 100644 --- a/apps/desktop/src/main/main.ts +++ b/apps/desktop/src/main/main.ts @@ -1,13 +1,27 @@ import dotenv from "dotenv"; dotenv.config(); -import { app } from "electron"; +import { app, ipcMain } from "electron"; +import { logger } from "./logger"; import started from "electron-squirrel-startup"; import { AppManager } from "./core/app-manager"; import { updateElectronApp } from "update-electron-app"; import { isWindows } from "../utils/platform"; +// Setup renderer logging relay (allows renderer to send logs to main process) +ipcMain.handle( + "log-message", + (_event, level: string, scope: string, ...args: unknown[]) => { + const scopedLogger = + logger[scope as keyof typeof logger] || logger.renderer; + const logMethod = scopedLogger[level as keyof typeof scopedLogger]; + if (typeof logMethod === "function") { + logMethod(...args); + } + }, +); + if (started) { app.quit(); } diff --git a/apps/desktop/src/main/managers/recording-manager.ts b/apps/desktop/src/main/managers/recording-manager.ts index 0be8110..0ad13d3 100644 --- a/apps/desktop/src/main/managers/recording-manager.ts +++ b/apps/desktop/src/main/managers/recording-manager.ts @@ -1,29 +1,65 @@ -import { ipcMain, app, dialog } from "electron"; +import { ipcMain, app } from "electron"; import { EventEmitter } from "node:events"; +import { Mutex } from "async-mutex"; import { logger, logPerformance } from "../logger"; import type { ServiceManager } from "@/main/managers/service-manager"; import type { RecordingState } from "../../types/recording"; -import { Mutex } from "async-mutex"; import type { ShortcutManager } from "./shortcut-manager"; import { StreamingWavWriter } from "../../utils/streaming-wav-writer"; import * as fs from "node:fs"; import * as path from "node:path"; export type RecordingMode = "idle" | "ptt" | "hands-free"; +export type TerminationCode = + | "dismissed" + | "quick_release" + | "no_audio" + | "error"; + +// Timing thresholds (ms) +const QUICK_PRESS_THRESHOLD = 500; +const NO_AUDIO_TIMEOUT = 5000; +const STUCK_STATE_TIMEOUT = 10000; /** * Manages recording state and coordinates audio recording across the application * Acts as the single source of truth for recording status + * + * State Machine: + * IDLE -> STARTING -> RECORDING -> STOPPING -> IDLE + * + * Key design decisions: + * - Mutex serializes lifecycle operations (doStart, endRecording) + * - Audio chunks accumulated in memory, file written only at the end + * - Single terminationCode field determines final action in handleFinalChunk */ export class RecordingManager extends EventEmitter { - private currentSessionId: string | null = null; + // Core state private recordingState: RecordingState = "idle"; - private recordingMutex = new Mutex(); private recordingMode: RecordingMode = "idle"; - private currentAudioRecording: { - audioFilePath: string; - wavWriter: StreamingWavWriter; - } | null = null; + + // Lifecycle mutex - serializes doStart and endRecording + private lifecycleMutex = new Mutex(); + + // Timing + private recordingInitiatedAt: number | null = null; + private cancelTimer: NodeJS.Timeout | null = null; + private noAudioTimer: NodeJS.Timeout | null = null; + private stuckStateTimer: NodeJS.Timeout | null = null; + + // Session state + private currentSessionId: string | null = null; + private initPromise: Promise | null = null; + private firstChunkReceived: boolean = false; + + // In-memory audio buffer - written to file only in handleFinalChunk + private audioChunks: Float32Array[] = []; + + // Termination code - set during stopping to determine final action + // null = normal (transcribe + paste), "dismissed" = save file only, others = discard + private terminationCode: TerminationCode | null = null; + + // Performance tracking private recordingStartedAt: number | null = null; private recordingStoppedAt: number | null = null; @@ -43,9 +79,9 @@ export class RecordingManager extends EventEmitter { lastPTTState = isPressed; if (isPressed) { - await this.startPTT(); + await this.onPTTPress(); } else { - await this.stopPTT(); + await this.onPTTRelease(); } } }); @@ -67,7 +103,7 @@ export class RecordingManager extends EventEmitter { }); // Broadcast state change to all windows - this.broadcastStateChange(); + this.emit("state-changed", this.getState()); } private setMode(newMode: RecordingMode): void { @@ -79,32 +115,539 @@ export class RecordingManager extends EventEmitter { }); // Broadcast mode change to all windows - this.broadcastModeChange(); + this.emit("mode-changed", this.getRecordingMode()); } public getState(): RecordingState { return this.recordingState; } - private broadcastStateChange(): void { - // Emit event for internal listeners (tRPC subscription will pick this up) - this.emit("state-changed", this.getState()); + public getRecordingMode(): RecordingMode { + return this.recordingMode; } - private broadcastModeChange(): void { - // Emit event for internal listeners (tRPC subscription will pick this up) - this.emit("mode-changed", this.getRecordingMode()); + // ═══════════════════════════════════════════════════════════════════ + // EVENT HANDLERS + // ═══════════════════════════════════════════════════════════════════ + + // PTT key pressed + public async onPTTPress() { + // Double-tap detection: timer pending means quick release happened + if (this.cancelTimer) { + clearTimeout(this.cancelTimer); + this.cancelTimer = null; + this.setMode("hands-free"); + logger.audio.info("Double-tap PTT detected, switching to hands-free"); + return; + } + + // Not recording? Start PTT recording + if (this.recordingState === "idle") { + this.recordingInitiatedAt = Date.now(); + await this.doStart("ptt"); + return; + } + + // Already recording in hands-free mode - handle based on timing + if ( + this.recordingState === "recording" && + this.recordingMode === "hands-free" + ) { + if (this.isQuickAction()) { + logger.audio.info("Quick PTT in hands-free mode, cancelling"); + await this.endRecording("quick_release"); + } else { + logger.audio.info("PTT in hands-free mode, stopping recording"); + await this.endRecording(); + } + } + } + + // PTT key released + public async onPTTRelease() { + // Hands-free mode ignores PTT release + if (this.recordingMode !== "ptt") return; + if (this.recordingState !== "recording") return; + + if (this.isQuickAction()) { + // Quick release - wait for potential double-tap before cancelling + this.cancelTimer = setTimeout(() => { + this.cancelTimer = null; + logger.audio.info("Quick release timeout, cancelling"); + this.endRecording("quick_release"); + }, QUICK_PRESS_THRESHOLD); + } else { + // Normal release - stop and transcribe + await this.endRecording(); + } + } + + // Toggle shortcut pressed + public async toggleHandsFree() { + // Double-tap detection: timer pending means quick release happened + if (this.cancelTimer) { + clearTimeout(this.cancelTimer); + this.cancelTimer = null; + this.setMode("hands-free"); + logger.audio.info("Double-tap toggle detected, switching to hands-free"); + return; + } + + // Not recording? Start hands-free recording + if (this.recordingState === "idle") { + this.recordingInitiatedAt = Date.now(); + await this.doStart("hands-free"); + return; + } + + // Already recording + if (this.recordingState === "recording") { + // In PTT mode? Switch to hands-free + if (this.recordingMode === "ptt") { + logger.audio.info("Toggle in PTT mode, switching to hands-free"); + this.setMode("hands-free"); + return; + } + + // In hands-free mode - stop or cancel based on timing + if (this.recordingMode === "hands-free") { + if (this.isQuickAction()) { + logger.audio.info("Quick toggle in hands-free mode, cancelling"); + await this.endRecording("quick_release"); + } else { + await this.endRecording(); + } + } + } + } + + // ═══════════════════════════════════════════════════════════════════ + // STATE TRANSITIONS + // ═══════════════════════════════════════════════════════════════════ + + /** + * Start recording with mutex protection + */ + private async doStart(mode: "ptt" | "hands-free") { + await this.lifecycleMutex.runExclusive(async () => { + if (this.recordingState !== "idle") { + logger.audio.warn("Cannot start recording - not idle", { + currentState: this.recordingState, + }); + this.recordingInitiatedAt = null; + return; + } + + const startTime = performance.now(); + logger.audio.info("RecordingManager: doStart called", { mode }); + + // Sync state broadcast + this.setState("starting"); + this.setMode(mode); + this.terminationCode = null; + this.firstChunkReceived = false; + this.recordingStartedAt = performance.now(); + this.recordingStoppedAt = null; + this.audioChunks = []; + + const timestamp = new Date().toISOString().replace(/[:.]/g, "-"); + this.currentSessionId = `session-${timestamp}`; + this.setState("recording"); + + this.startNoAudioTimer(); + + // Async init inside mutex + this.initPromise = this.initializeSession(); + await this.initPromise; + + const totalDuration = performance.now() - startTime; + logger.audio.info("Recording started", { + sessionId: this.currentSessionId, + duration: `${totalDuration.toFixed(2)}ms`, + }); + }); + } + + /** + * Initialize session asynchronously + * No file operations here - chunks accumulate in memory + */ + private async initializeSession(): Promise { + try { + // Reset VAD state for fresh speech detection + const vadService = this.serviceManager.getService("vadService"); + vadService.reset(); + + // Refresh accessibility context + const nativeBridge = this.serviceManager.getService("nativeBridge"); + nativeBridge.refreshAccessibilityContext(); + + // AWAIT mute to ensure it completes before mutex releases + await nativeBridge.call("muteSystemAudio", {}); + } catch (error) { + logger.audio.error("Failed to initialize session", { error }); + } + } + + /** + * End recording - unified method for stop and cancel + * @param code - null for normal stop, or cancellation code + */ + private async endRecording( + code: TerminationCode | null = null, + ): Promise { + await this.lifecycleMutex.runExclusive(async () => { + if (this.recordingState !== "recording") { + logger.audio.warn("Cannot end recording - not recording", { + currentState: this.recordingState, + }); + return; + } + + // Wait for init to complete + if (this.initPromise) { + await this.initPromise; + this.initPromise = null; + } + + const sessionId = this.currentSessionId; + + logger.audio.info("Ending recording", { sessionId, code }); + + // Set termination code and timestamps + this.terminationCode = code; + this.recordingStoppedAt = performance.now(); + + // State transition first - signals worklet to stop and send final chunk + this.setState("stopping"); + this.clearTimers(); + this.recordingInitiatedAt = null; + this.setMode("idle"); + + // Restore audio after state change (can happen while final chunk is in flight) + try { + const nativeBridge = this.serviceManager.getService("nativeBridge"); + await nativeBridge.call("restoreSystemAudio", {}); + } catch (error) { + logger.main.warn("Failed to restore system audio", { error }); + } + + // Cancel streaming for cancel codes (not null, not dismissed) + if (code && code !== "dismissed" && sessionId) { + try { + const transcriptionService = this.serviceManager.getService( + "transcriptionService", + ); + await transcriptionService.cancelStreamingSession(sessionId); + } catch (error) { + logger.audio.warn("Failed to cancel streaming session", { error }); + } + } + + // Safety timeout for stuck state + this.stuckStateTimer = setTimeout(() => { + if (this.recordingState === "stopping") { + logger.audio.warn("No final chunk received, forcing idle"); + this.forceIdle(); + } + }, STUCK_STATE_TIMEOUT); + }); + } + + // ═══════════════════════════════════════════════════════════════════ + // CHUNK PROCESSING + // ═══════════════════════════════════════════════════════════════════ + + private async handleAudioChunk( + chunk: Float32Array, + isFinalChunk: boolean, + ): Promise { + // Only process if recording or stopping + if ( + this.recordingState !== "recording" && + this.recordingState !== "stopping" + ) { + logger.audio.debug("Discarding audio chunk - not in active state", { + state: this.recordingState, + isFinalChunk, + }); + return; + } + + // Wait for async init to complete + if (this.initPromise) { + await this.initPromise; + } + + // Track first chunk for no-audio detection + if (!this.firstChunkReceived && chunk.length > 0) { + this.firstChunkReceived = true; + this.clearNoAudioTimer(); + logger.audio.info("First audio chunk received"); + } + + // Handle final chunk + if (isFinalChunk) { + // Add final chunk to buffer before processing (it may contain audio data) + if (chunk.length > 0) { + this.audioChunks.push(chunk); + + // Also send to transcription if we have a session and not terminated + if (this.currentSessionId && !this.terminationCode) { + try { + const transcriptionService = this.serviceManager.getService( + "transcriptionService", + ); + await transcriptionService.processStreamingChunk({ + sessionId: this.currentSessionId, + audioChunk: chunk, + recordingStartedAt: this.recordingStartedAt || undefined, + }); + } catch (error) { + logger.audio.error("Error processing final chunk:", error); + } + } + } + await this.handleFinalChunk(); + return; + } + + // Only accumulate during recording (not stopping) + if (this.recordingState !== "recording") { + return; + } + + const sessionId = this.currentSessionId; + if (!sessionId || chunk.length === 0) { + return; + } + + // Accumulate in memory + this.audioChunks.push(chunk); + + // Stream to transcription (skip if terminated) + if (!this.terminationCode) { + try { + const transcriptionService = this.serviceManager.getService( + "transcriptionService", + ); + await transcriptionService.processStreamingChunk({ + sessionId, + audioChunk: chunk, + recordingStartedAt: this.recordingStartedAt || undefined, + }); + } catch (error) { + logger.audio.error("Error processing chunk:", error); + } + } + } + + /** + * Handle the final chunk - unified termination logic + */ + private async handleFinalChunk(): Promise { + // Clear stuck state timer + if (this.stuckStateTimer) { + clearTimeout(this.stuckStateTimer); + this.stuckStateTimer = null; + } + + if (this.recordingState !== "stopping") { + logger.audio.debug("Unexpected state in handleFinalChunk", { + state: this.recordingState, + }); + return; + } + + const sessionId = this.currentSessionId || ""; + const chunks = this.audioChunks; + const code = this.terminationCode; + + // CANCELLED (quick_release, no_audio, error) - discard buffer + if (code && code !== "dismissed") { + logger.audio.info("Recording cancelled", { + code, + chunksDiscarded: chunks.length, + }); + + this.emit("recording-cancelled", { sessionId, code }); + this.audioChunks = []; + this.resetSessionState(); + this.setState("idle"); + return; + } + + // Write audio file (for NORMAL and DISMISSED) + let audioFilePath: string | null = null; + + if (chunks.length > 0) { + try { + audioFilePath = await this.createAudioFile(sessionId); + const wavWriter = new StreamingWavWriter(audioFilePath); + + for (const chunk of chunks) { + await wavWriter.appendAudio(chunk); + } + await wavWriter.finalize(); + + logger.audio.info("Audio file written", { + sessionId, + filePath: audioFilePath, + chunks: chunks.length, + }); + } catch (error) { + logger.audio.error("Failed to write audio file", { error }); + audioFilePath = null; + } + } + this.audioChunks = []; + + // DISMISSED - just save file, skip transcription + if (code === "dismissed") { + // Cancel streaming session to prevent memory leak and audio bleed + try { + const transcriptionService = this.serviceManager.getService( + "transcriptionService", + ); + await transcriptionService.cancelStreamingSession(sessionId); + } catch (error) { + logger.audio.warn("Failed to cancel streaming session", { error }); + } + + this.emit("transcription-dismissed", { sessionId, audioFilePath }); + logger.audio.info("Recording dismissed, file saved for potential undo", { + audioFilePath, + }); + this.resetSessionState(); + this.setState("idle"); + return; + } + + // NORMAL - get transcription and paste + let result = ""; + try { + const transcriptionService = this.serviceManager.getService( + "transcriptionService", + ); + result = await transcriptionService.finalizeSession({ + sessionId, + audioFilePath: audioFilePath || undefined, + recordingStartedAt: this.recordingStartedAt || undefined, + recordingStoppedAt: this.recordingStoppedAt || undefined, + }); + } catch (error) { + logger.audio.error("Failed to get final transcription", { error }); + } + + logPerformance("streaming transcription complete", Date.now(), { + sessionId, + resultLength: result?.length || 0, + }); + + if (result) { + await this.pasteTranscription(result); + } + + this.resetSessionState(); + this.setState("idle"); + } + + // ═══════════════════════════════════════════════════════════════════ + // DISMISS SUPPORT + // ═══════════════════════════════════════════════════════════════════ + + /** + * Dismiss the current recording (called during stopping state) + * Saves audio file but skips transcription + */ + public dismiss(): void { + if (this.recordingState === "stopping") { + this.terminationCode = "dismissed"; + logger.audio.info("Recording dismissed"); + } + } + + // ═══════════════════════════════════════════════════════════════════ + // HELPER METHODS + // ═══════════════════════════════════════════════════════════════════ + + private isQuickAction(): boolean { + if (!this.recordingInitiatedAt) return false; + return Date.now() - this.recordingInitiatedAt < QUICK_PRESS_THRESHOLD; + } + + private clearTimers(): void { + if (this.cancelTimer) { + clearTimeout(this.cancelTimer); + this.cancelTimer = null; + } + if (this.noAudioTimer) { + clearTimeout(this.noAudioTimer); + this.noAudioTimer = null; + } + if (this.stuckStateTimer) { + clearTimeout(this.stuckStateTimer); + this.stuckStateTimer = null; + } + } + + private clearNoAudioTimer(): void { + if (this.noAudioTimer) { + clearTimeout(this.noAudioTimer); + this.noAudioTimer = null; + } + } + + private startNoAudioTimer(): void { + this.noAudioTimer = setTimeout(() => { + if (this.recordingState === "recording" && !this.firstChunkReceived) { + logger.audio.warn("No audio detected for 5 seconds"); + this.emit("no-audio-detected"); + this.endRecording("no_audio"); + } + }, NO_AUDIO_TIMEOUT); + } + + private async forceIdle(): Promise { + logger.audio.warn("Forcing idle due to stuck state"); + + // Cancel streaming session if one exists to prevent memory leak and audio bleed + if (this.currentSessionId) { + try { + const transcriptionService = this.serviceManager.getService( + "transcriptionService", + ); + await transcriptionService.cancelStreamingSession( + this.currentSessionId, + ); + } catch (error) { + logger.audio.warn("Failed to cancel streaming session", { error }); + } + } + + this.audioChunks = []; + this.resetSessionState(); + this.setState("idle"); + } + + private resetSessionState(): void { + this.currentSessionId = null; + this.initPromise = null; + this.firstChunkReceived = false; + this.recordingInitiatedAt = null; + this.recordingMode = "idle"; + this.audioChunks = []; + this.terminationCode = null; + this.clearTimers(); } /** * Create audio file for recording session */ private async createAudioFile(sessionId: string): Promise { - // Create audio directory in app temp path const audioDir = path.join(app.getPath("temp"), "amical-audio"); await fs.promises.mkdir(audioDir, { recursive: true }); - // Create file path const timestamp = new Date().toISOString().replace(/[:.]/g, "-"); const filePath = path.join(audioDir, `audio-${sessionId}-${timestamp}.wav`); @@ -116,324 +659,6 @@ export class RecordingManager extends EventEmitter { return filePath; } - private setupIPCHandlers(): void { - // Handle audio data chunks from renderer - ipcMain.handle( - "audio-data-chunk", - async (event, chunk: ArrayBuffer, isFinalChunk: boolean) => { - if (!(chunk instanceof ArrayBuffer)) { - logger.audio.error("Received invalid audio chunk type", { - type: typeof chunk, - }); - throw new Error("Invalid audio chunk type received."); - } - - // Convert ArrayBuffer back to Float32Array - const float32Array = new Float32Array(chunk); - logger.audio.debug("Received audio chunk", { - samples: float32Array.length, - isFinalChunk, - }); - - await this.handleAudioChunk(float32Array, isFinalChunk); - }, - ); - - // Handle log messages from renderer processes - ipcMain.handle( - "log-message", - (event, level: string, scope: string, ...args: any[]) => { - const scopedLogger = - logger[scope as keyof typeof logger] || logger.renderer; - const logMethod = scopedLogger[level as keyof typeof scopedLogger]; - if (typeof logMethod === "function") { - logMethod(...args); - } - }, - ); - } - - public async startRecording(mode: "ptt" | "hands-free") { - await this.recordingMutex.runExclusive(async () => { - const startTime = performance.now(); - logger.audio.info("RecordingManager: startRecording called", { mode }); - - // Check if transcription service is available and has models - const modelCheckStartTime = performance.now(); - const transcriptionService = this.serviceManager.getService( - "transcriptionService", - ); - - const hasModels = await transcriptionService.isModelAvailable(); - const modelCheckDuration = performance.now() - modelCheckStartTime; - logger.audio.info( - `RecordingManager: Model availability check took ${modelCheckDuration.toFixed(2)}ms`, - ); - - if (!hasModels) { - logger.audio.error("No transcription models available"); - // Show error dialog - dialog.showErrorBox( - "No Models Available", - "Please download a transcription model from Speech Models before recording.", - ); - return; - } - // if we were previously in ptt mode, we override - // priority is given to hands-free mode - // we don't need to check the other way around - if (mode === "hands-free") { - this.setMode("hands-free"); - } - - // Check if already recording - if (this.recordingState !== "idle") { - logger.audio.warn("Cannot start recording - already in progress", { - currentState: this.recordingState, - }); - return; - } - - this.setState("starting"); - this.setMode(mode); - - // Reset VAD state for fresh speech detection in this recording - const vadService = this.serviceManager.getService("vadService"); - vadService.reset(); - - this.recordingStartedAt = performance.now(); - this.recordingStoppedAt = null; // Reset stopped time - - // Create session ID - const timestamp = new Date().toISOString().replace(/[:.]/g, "-"); - this.currentSessionId = `session-${timestamp}`; - - // Refresh accessibility context from NativeBridge (async, not awaited) - const nativeBridge = this.serviceManager.getService("nativeBridge"); - nativeBridge?.refreshAccessibilityContext(); - logger.audio.info( - "RecordingManager: Triggered accessibility context refresh (async)", - ); - - // Create audio file and WAV writer - const fileCreationStartTime = performance.now(); - const audioFilePath = await this.createAudioFile(this.currentSessionId); - const fileCreationDuration = performance.now() - fileCreationStartTime; - logger.audio.info( - `RecordingManager: Audio file creation took ${fileCreationDuration.toFixed(2)}ms`, - ); - - this.currentAudioRecording = { - audioFilePath, - wavWriter: new StreamingWavWriter(audioFilePath), - }; - - logger.audio.info("Audio recording initialized", { - sessionId: this.currentSessionId, - audioFilePath, - }); - - // Mute system audio (async, non-blocking) - const muteStartTime = performance.now(); - if (nativeBridge) { - nativeBridge - .call("muteSystemAudio", {}) - .then(() => { - const muteDuration = performance.now() - muteStartTime; - logger.audio.info( - `RecordingManager: System audio mute took ${muteDuration.toFixed(2)}ms`, - ); - }) - .catch((error) => { - logger.main.warn("Failed to mute system audio", { error }); - }); - } - - this.setState("recording"); - const totalDuration = performance.now() - startTime; - logger.audio.info("Recording started successfully", { - sessionId: this.currentSessionId, - totalStartupDuration: `${totalDuration.toFixed(2)}ms`, - }); - - return; - }); - } - - public async stopRecording() { - await this.recordingMutex.runExclusive(async () => { - // Check if recording - if (this.recordingState !== "recording") { - logger.audio.warn("Cannot stop recording - not currently recording", { - currentState: this.recordingState, - }); - return; - } - - this.recordingStoppedAt = performance.now(); - - this.setState("stopping"); - - // Reset recording mode when stopping - this.recordingMode = "idle"; - - // Restore system audio - try { - const nativeBridge = this.serviceManager.getService("nativeBridge"); - if (nativeBridge) { - await nativeBridge.call("restoreSystemAudio", {}); - } - } catch (error) { - logger.main.warn("Native bridge not available for audio restore"); - } - - logger.audio.info("Recording stop initiated", { - sessionId: this.currentSessionId, - }); - - // State will transition to "idle" when final chunk is processed - // Session will be cleared when final chunk is processed - return; - }); - } - - // PTT-specific methods - public async startPTT() { - this.startRecording("ptt"); - } - - public async stopPTT() { - if (this.recordingMode !== "ptt") { - return; - } - this.stopRecording(); - } - - // Hands-free mode toggle - public async toggleHandsFree() { - if (this.recordingState === "idle") { - this.startRecording("hands-free"); - return; - } - if (this.recordingMode === "hands-free") { - this.stopRecording(); - return; - } - this.startRecording("hands-free"); - return; - } - - // Get current mode - public getRecordingMode(): RecordingMode { - return this.recordingMode; - } - - private async handleAudioChunk( - chunk: Float32Array, - isFinalChunk: boolean, - ): Promise { - // Validate we're in a recording state - if ( - this.recordingState !== "recording" && - this.recordingState !== "stopping" - ) { - logger.audio.error("Received audio chunk while not recording", { - state: this.recordingState, - isFinalChunk, - }); - return; - } - - // Session should already exist from startRecording - if (!this.currentSessionId || !this.currentAudioRecording) { - logger.audio.error( - "No session ID or audio recording found while handling audio chunk", - ); - return; - } - - // Skip empty chunks unless it's the final one - if (chunk.length === 0 && !isFinalChunk) { - logger.audio.debug("Skipping empty non-final chunk"); - return; - } - - await this.currentAudioRecording.wavWriter.appendAudio(chunk); - - try { - const transcriptionService = this.serviceManager.getService( - "transcriptionService", - ); - if (!transcriptionService) { - throw new Error("Transcription service not available"); - } - const startTime = Date.now(); - - // Process the chunk - pass isFinal flag, audio file path, and timing - const transcriptionResult = - await transcriptionService.processStreamingChunk({ - sessionId: this.currentSessionId, - audioChunk: chunk, - isFinal: isFinalChunk, - audioFilePath: this.currentAudioRecording.audioFilePath, - recordingStartedAt: this.recordingStartedAt || undefined, - recordingStoppedAt: this.recordingStoppedAt || undefined, - }); - - logger.audio.debug("Processed audio chunk", { - chunkSize: chunk.length, - processingTimeMs: Date.now() - startTime, - resultLength: transcriptionResult.length, - isFinal: isFinalChunk, - }); - - // If this was the final chunk, handle completion - if (isFinalChunk) { - // Finalize the WAV file - await this.currentAudioRecording.wavWriter.finalize(); - logger.audio.info("Finalized WAV file", { - sessionId: this.currentSessionId, - filePath: this.currentAudioRecording.audioFilePath, - dataSize: this.currentAudioRecording.wavWriter.getDataSize(), - }); - - logPerformance("streaming transcription complete", startTime, { - sessionId: this.currentSessionId, - resultLength: transcriptionResult?.length || 0, - }); - - logger.audio.info("Streaming transcription completed", { - sessionId: this.currentSessionId, - resultLength: transcriptionResult?.length || 0, - hasResult: !!transcriptionResult, - }); - - // Paste the final formatted transcription - if (transcriptionResult) { - await this.pasteTranscription(transcriptionResult); - } - - // Clean up session and audio recording - this.currentSessionId = null; - this.currentAudioRecording = null; - - // Ensure state is idle after completion - if (this.recordingState === "stopping") { - this.setState("idle"); - } - } - } catch (error) { - logger.audio.error("Error processing audio chunk:", error); - - if (isFinalChunk) { - // Clean up session and audio recording on error - this.currentSessionId = null; - this.currentAudioRecording = null; - this.setState("idle"); - } - } - } - private async pasteTranscription(transcription: string): Promise { if (!transcription || typeof transcription !== "string") { logger.main.warn("Invalid transcription, not pasting"); @@ -454,25 +679,74 @@ export class RecordingManager extends EventEmitter { } } catch (error) { logger.main.warn( - "Swift bridge not available, cannot paste transcription", + "Native bridge not available, cannot paste transcription", { error: error instanceof Error ? error.message : String(error) }, ); } } + // ═══════════════════════════════════════════════════════════════════ + // IPC HANDLERS + // ═══════════════════════════════════════════════════════════════════ + + private setupIPCHandlers(): void { + // Handle audio data chunks from renderer + ipcMain.handle( + "audio-data-chunk", + async (_event, chunk: ArrayBuffer, isFinalChunk: boolean) => { + if (!(chunk instanceof ArrayBuffer)) { + logger.audio.error("Received invalid audio chunk type", { + type: typeof chunk, + }); + throw new Error("Invalid audio chunk type received."); + } + + // Convert ArrayBuffer back to Float32Array + const float32Array = new Float32Array(chunk); + logger.audio.debug("Received audio chunk", { + samples: float32Array.length, + isFinalChunk, + }); + + await this.handleAudioChunk(float32Array, isFinalChunk); + }, + ); + } + + // ═══════════════════════════════════════════════════════════════════ + // PUBLIC API (for tRPC routers) + // ═══════════════════════════════════════════════════════════════════ + + /** + * Signal to start recording (called from tRPC) + */ + public async signalStart(): Promise { + if (this.recordingState === "idle") { + this.recordingInitiatedAt = Date.now(); + await this.doStart("hands-free"); + } + } + + /** + * Signal to stop recording (called from tRPC) + */ + public async signalStop(): Promise { + if (this.recordingState === "recording") { + await this.endRecording(); + } + } + // Clean up resources async cleanup(): Promise { + this.clearTimers(); + // Stop recording if active - if ( - this.recordingState === "recording" || - this.recordingState === "starting" - ) { - await this.stopRecording(); + if (this.recordingState === "recording") { + await this.endRecording(); } - // Clear any active session and audio recording - this.currentSessionId = null; - this.currentAudioRecording = null; + // Clear any active session + this.resetSessionState(); this.setState("idle"); } } diff --git a/apps/desktop/src/pipeline/core/pipeline-types.ts b/apps/desktop/src/pipeline/core/pipeline-types.ts index ff6f302..30c17d4 100644 --- a/apps/desktop/src/pipeline/core/pipeline-types.ts +++ b/apps/desktop/src/pipeline/core/pipeline-types.ts @@ -7,18 +7,20 @@ import { PipelineContext } from "./context"; import { GetAccessibilityContextResult } from "@amical/types"; export { PipelineContext, SharedPipelineData } from "./context"; +// Context for transcription operations (shared between transcribe and flush) +export interface TranscribeContext { + vocabulary?: Map; + accessibilityContext?: GetAccessibilityContextResult | null; + previousChunk?: string; + aggregatedTranscription?: string; + language?: string; +} + // Transcription input parameters export interface TranscribeParams { audioData: Float32Array; speechProbability?: number; // Speech probability from frontend VAD (0-1) - flush?: boolean; // Whether to flush any buffered audio - context: { - vocabulary?: Map; - accessibilityContext?: GetAccessibilityContextResult | null; - previousChunk?: string; - aggregatedTranscription?: string; - language?: string; - }; + context: TranscribeContext; } // Formatting input parameters @@ -37,6 +39,8 @@ export interface FormatParams { export interface TranscriptionProvider { readonly name: string; transcribe(params: TranscribeParams): Promise; + flush(context: TranscribeContext): Promise; + reset(): void; // Clear internal buffers without transcribing } // Formatting provider interface @@ -71,7 +75,7 @@ export interface StreamingSession { firstChunkReceivedAt?: number; // When first audio chunk arrived at transcription service recordingStartedAt?: number; // When user pressed record button (from RecordingManager) recordingStoppedAt?: number; // When user released record button (from RecordingManager) - finalChunkReceivedAt?: number; // When final chunk arrived at transcription service + finalizationStartedAt?: number; // When finalizeSession() was called } // Simple pipeline configuration diff --git a/apps/desktop/src/pipeline/providers/transcription/amical-cloud-provider.ts b/apps/desktop/src/pipeline/providers/transcription/amical-cloud-provider.ts index 79bbd3b..a9c44d4 100644 --- a/apps/desktop/src/pipeline/providers/transcription/amical-cloud-provider.ts +++ b/apps/desktop/src/pipeline/providers/transcription/amical-cloud-provider.ts @@ -1,6 +1,7 @@ import { TranscriptionProvider, TranscribeParams, + TranscribeContext, } from "../../core/pipeline-types"; import { logger } from "../../../main/logger"; import { AuthService } from "../../../services/auth-service"; @@ -51,21 +52,16 @@ export class AmicalCloudProvider implements TranscriptionProvider { }); } + /** + * Process an audio chunk - buffers and conditionally transcribes + */ async transcribe(params: TranscribeParams): Promise { try { - const { - audioData, - speechProbability = 1, - flush = false, - context, - } = params; + const { audioData, speechProbability = 1, context } = params; - // Store language for use in API call (undefined = auto-detect) + // Store context for API call this.currentLanguage = context.language; - - // Store accessibility context for the API request this.currentAccessibilityContext = context?.accessibilityContext ?? null; - this.currentAggregatedTranscription = context?.aggregatedTranscription; // Check authentication @@ -89,40 +85,46 @@ export class AmicalCloudProvider implements TranscriptionProvider { this.currentSilenceFrameCount++; } - // Calculate durations - const silenceDuration = - ((this.currentSilenceFrameCount * this.FRAME_SIZE) / this.SAMPLE_RATE) * - 1000; - const speechDuration = - ((this.frameBuffer.length * this.FRAME_SIZE) / this.SAMPLE_RATE) * 1000; - - // Determine if we should process - const shouldProcess = - flush || - (speechDuration >= this.MIN_SPEECH_DURATION_MS && - silenceDuration >= this.MAX_SILENCE_DURATION_MS); - - if (!shouldProcess) { + // Only transcribe if speech/silence patterns indicate we should + if (!this.shouldTranscribe()) { return ""; } - // Process accumulated audio (pass flush flag for formatting decision) - const result = await this.processAudio(flush); - - // Clear buffer after processing - this.frameBuffer = []; - this.frameBufferSpeechProbabilities = []; - this.currentSilenceFrameCount = 0; - - return result; + return this.doTranscription(false); } catch (error) { logger.transcription.error("Cloud transcription error:", error); throw error; } } - private async processAudio(isFinal: boolean = false): Promise { - // Combine all frames into a single Float32Array (may be empty) + /** + * Flush any buffered audio and return transcription with formatting + * Called at the end of a recording session + */ + async flush(context: TranscribeContext): Promise { + try { + // Store context for API call + this.currentLanguage = context.language; + this.currentAccessibilityContext = context?.accessibilityContext ?? null; + this.currentAggregatedTranscription = context?.aggregatedTranscription; + + // Check authentication + if (!(await this.authService.isAuthenticated())) { + throw new Error("Authentication required for cloud transcription"); + } + + return this.doTranscription(true); + } catch (error) { + logger.transcription.error("Cloud transcription error:", error); + throw error; + } + } + + /** + * Shared transcription logic - aggregates buffer, calls cloud API, clears state + */ + private async doTranscription(enableFormatting: boolean): Promise { + // Combine all frames into a single Float32Array const totalLength = this.frameBuffer.reduce( (acc, frame) => acc + frame.length, 0, @@ -134,9 +136,43 @@ export class AmicalCloudProvider implements TranscriptionProvider { offset += frame.length; } - // Try transcription with automatic retry on 401 - // Enable formatting only on final chunk - return this.makeTranscriptionRequest(combinedAudio, false, isFinal); + // Clear frame buffers only (context values needed for API call below) + this.frameBuffer = []; + this.frameBufferSpeechProbabilities = []; + this.currentSilenceFrameCount = 0; + + // Make the API request + return this.makeTranscriptionRequest( + combinedAudio, + false, + enableFormatting, + ); + } + + /** + * Clear internal buffers without transcribing + * Called when cancelling a session to prevent audio bleed + */ + reset(): void { + this.frameBuffer = []; + this.frameBufferSpeechProbabilities = []; + this.currentSilenceFrameCount = 0; + this.currentLanguage = undefined; + this.currentAccessibilityContext = null; + this.currentAggregatedTranscription = undefined; + } + + private shouldTranscribe(): boolean { + const silenceDuration = + ((this.currentSilenceFrameCount * this.FRAME_SIZE) / this.SAMPLE_RATE) * + 1000; + const speechDuration = + ((this.frameBuffer.length * this.FRAME_SIZE) / this.SAMPLE_RATE) * 1000; + + return ( + speechDuration >= this.MIN_SPEECH_DURATION_MS && + silenceDuration >= this.MAX_SILENCE_DURATION_MS + ); } private async makeTranscriptionRequest( @@ -144,9 +180,13 @@ export class AmicalCloudProvider implements TranscriptionProvider { isRetry = false, enableFormatting = false, ): Promise { - // Skip API call if no audio and formatting not requested - if (audioData.length === 0 && !enableFormatting) { - return ""; + // Skip API call if there's nothing to process + if (audioData.length === 0) { + const hasTextToFormat = + enableFormatting && this.currentAggregatedTranscription?.trim(); + if (!hasTextToFormat) { + return ""; + } } // Get auth token @@ -166,112 +206,104 @@ export class AmicalCloudProvider implements TranscriptionProvider { formatting: enableFormatting, }); - try { - const response = await fetch(`${this.apiEndpoint}/transcribe`, { - method: "POST", - headers: { - "Content-Type": "application/json", - Authorization: `Bearer ${idToken}`, - "User-Agent": getUserAgent(), + const response = await fetch(`${this.apiEndpoint}/transcribe`, { + method: "POST", + headers: { + "Content-Type": "application/json", + Authorization: `Bearer ${idToken}`, + "User-Agent": getUserAgent(), + }, + body: JSON.stringify({ + audioData: Array.from(audioData), + language: this.currentLanguage, + previousTranscription: this.currentAggregatedTranscription, + formatting: { + enabled: enableFormatting, }, - body: JSON.stringify({ - audioData: Array.from(audioData), - language: this.currentLanguage, - previousTranscription: this.currentAggregatedTranscription, - formatting: { - enabled: enableFormatting, - }, - sharedContext: this.currentAccessibilityContext - ? { - selectedText: - this.currentAccessibilityContext.context?.textSelection - ?.selectedText, - beforeText: - this.currentAccessibilityContext.context?.textSelection - ?.preSelectionText, - afterText: - this.currentAccessibilityContext.context?.textSelection - ?.postSelectionText, - appType: detectApplicationType( - this.currentAccessibilityContext, - ), - appBundleId: - this.currentAccessibilityContext.context?.application - ?.bundleIdentifier, - appName: - this.currentAccessibilityContext.context?.application?.name, - appUrl: - this.currentAccessibilityContext.context?.windowInfo?.url, - surroundingContext: "", // Empty for now, future enhancement - } - : undefined, - }), - }); + sharedContext: this.currentAccessibilityContext + ? { + selectedText: + this.currentAccessibilityContext.context?.textSelection + ?.selectedText, + beforeText: + this.currentAccessibilityContext.context?.textSelection + ?.preSelectionText, + afterText: + this.currentAccessibilityContext.context?.textSelection + ?.postSelectionText, + appType: detectApplicationType(this.currentAccessibilityContext), + appBundleId: + this.currentAccessibilityContext.context?.application + ?.bundleIdentifier, + appName: + this.currentAccessibilityContext.context?.application?.name, + appUrl: this.currentAccessibilityContext.context?.windowInfo?.url, + surroundingContext: "", // Empty for now, future enhancement + } + : undefined, + }), + }); - // Handle 401 with token refresh and retry - if (response.status === 401) { - if (isRetry) { - // Already retried once, give up - throw new Error("Authentication failed - please log in again"); - } + // Handle 401 with token refresh and retry + if (response.status === 401) { + if (isRetry) { + // Already retried once, give up + throw new Error("Authentication failed - please log in again"); + } - logger.transcription.warn( - "Got 401 response, attempting token refresh and retry", + logger.transcription.warn( + "Got 401 response, attempting token refresh and retry", + ); + + try { + // Force token refresh + await this.authService.refreshTokenIfNeeded(); + + // Retry the request once (preserve formatting flag) + return await this.makeTranscriptionRequest( + audioData, + true, + enableFormatting, ); - - try { - // Force token refresh - await this.authService.refreshTokenIfNeeded(); - - // Retry the request once (preserve formatting flag) - return await this.makeTranscriptionRequest( - audioData, - true, - enableFormatting, - ); - } catch (refreshError) { - logger.transcription.error("Token refresh failed:", refreshError); - throw new Error("Authentication failed - please log in again"); - } + } catch (refreshError) { + logger.transcription.error("Token refresh failed:", refreshError); + throw new Error("Authentication failed - please log in again"); } - - if (response.status === 403) { - throw new Error("Subscription required for cloud transcription"); - } - - if (response.status === 429) { - const errorData = await response.json(); - throw new Error( - `Word limit exceeded: ${errorData.currentWords}/${errorData.limit}`, - ); - } - - if (!response.ok) { - const errorText = await response.text(); - logger.transcription.error("Cloud API error:", { - status: response.status, - statusText: response.statusText, - error: errorText, - }); - throw new Error(`Cloud API error: ${response.statusText}`); - } - - const result: CloudTranscriptionResponse = await response.json(); - - if (!result.success) { - throw new Error(result.error || "Cloud transcription failed"); - } - - logger.transcription.info("Cloud transcription successful", { - textLength: result.transcription?.length || 0, - language: result.language, - duration: result.duration, - }); - - return result.transcription || ""; - } catch (error) { - logger.transcription.error("Cloud transcription request failed:", error); - throw error; } + + if (response.status === 403) { + throw new Error("Subscription required for cloud transcription"); + } + + if (response.status === 429) { + const errorData = await response.json(); + throw new Error( + `Word limit exceeded: ${errorData.currentWords}/${errorData.limit}`, + ); + } + + if (!response.ok) { + const errorText = await response.text(); + logger.transcription.error("Cloud API error:", { + status: response.status, + statusText: response.statusText, + error: errorText, + }); + throw new Error(`Cloud API error: ${response.statusText}`); + } + + const result: CloudTranscriptionResponse = await response.json(); + + if (!result.success) { + throw new Error(result.error || "Cloud transcription failed"); + } + + logger.transcription.info("Cloud transcription successful", { + textLength: result.transcription?.length || 0, + language: result.language, + duration: result.duration, + }); + + return result.transcription || ""; } } diff --git a/apps/desktop/src/pipeline/providers/transcription/whisper-provider.ts b/apps/desktop/src/pipeline/providers/transcription/whisper-provider.ts index 4580f25..e2c9ec2 100644 --- a/apps/desktop/src/pipeline/providers/transcription/whisper-provider.ts +++ b/apps/desktop/src/pipeline/providers/transcription/whisper-provider.ts @@ -1,6 +1,7 @@ import { TranscriptionProvider, TranscribeParams, + TranscribeContext, } from "../../core/pipeline-types"; import { logger } from "../../../main/logger"; import { ModelService } from "../../../services/model-service"; @@ -74,74 +75,79 @@ export class WhisperProvider implements TranscriptionProvider { } } + /** + * Process an audio chunk - buffers and conditionally transcribes + */ async transcribe(params: TranscribeParams): Promise { + await this.initializeWhisper(); + + const { audioData, speechProbability = 1, context } = params; + + // Add frame to buffer with speech probability + this.frameBuffer.push(audioData); + this.frameBufferSpeechProbabilities.push(speechProbability); + + // Consider it speech if probability is above threshold + const isSpeech = speechProbability > this.SPEECH_PROBABILITY_THRESHOLD; + + logger.transcription.debug( + `Frame received - SpeechProb: ${speechProbability.toFixed(3)}, Buffer size: ${this.frameBuffer.length}, Silence count: ${this.currentSilenceFrameCount}`, + ); + + // Handle speech/silence logic + if (isSpeech) { + this.currentSilenceFrameCount = 0; + this.lastSpeechTimestamp = Date.now(); + } else { + this.currentSilenceFrameCount++; + } + + // Only transcribe if speech/silence patterns indicate we should + if (!this.shouldTranscribe()) { + return ""; + } + + return this.doTranscription(context); + } + + /** + * Flush any buffered audio and return transcription + * Called at the end of a recording session + */ + async flush(context: TranscribeContext): Promise { + if (this.frameBuffer.length === 0) { + return ""; + } + + await this.initializeWhisper(); + return this.doTranscription(context); + } + + /** + * Shared transcription logic - aggregates buffer, calls whisper, clears state + * Assumes initializeWhisper() was already called by caller + */ + private async doTranscription(context: TranscribeContext): Promise { try { - await this.initializeWhisper(); - - // Extract parameters from the new structure - const { - audioData, - speechProbability = 1, - context, - flush = false, - } = params; const { vocabulary, aggregatedTranscription, language } = context; - // Audio data is already Float32Array - - // Add frame to buffer with speech probability - this.frameBuffer.push(audioData); - this.frameBufferSpeechProbabilities.push(speechProbability); - - // Consider it speech if probability is above threshold - const isSpeech = speechProbability > this.SPEECH_PROBABILITY_THRESHOLD; - - logger.transcription.debug( - `Frame received - SpeechProb: ${speechProbability.toFixed(3)}, Buffer size: ${this.frameBuffer.length}, Silence count: ${this.currentSilenceFrameCount}`, - ); - - // Handle speech/silence logic - if (isSpeech) { - this.currentSilenceFrameCount = 0; - this.lastSpeechTimestamp = Date.now(); - } else { - this.currentSilenceFrameCount++; - } - - // Determine if we should transcribe - const shouldTranscribe = flush || this.shouldTranscribe(); - - if (!shouldTranscribe) { - // Keep buffering - return ""; - } - const isAllSilent = this.isAllSilent(); // Aggregate buffered frames const aggregatedAudio = this.aggregateFrames(); - // Clear buffers immediately after aggregation, before async operations - this.frameBuffer = []; - this.frameBufferSpeechProbabilities = []; - this.currentSilenceFrameCount = 0; + // Clear buffers immediately after aggregation + this.reset(); if (isAllSilent && this.IGNORE_FULLY_SILENT_CHUNKS) { logger.transcription.debug("Skipping transcription - all silent"); return ""; } - // Skip if too short or only silence - /* if (aggregatedAudio.length < this.FRAME_SIZE * 2) { - logger.transcription.debug("Skipping transcription - audio too short"); - return ""; - } */ - logger.transcription.debug( `Starting transcription of ${aggregatedAudio.length} samples (${((aggregatedAudio.length / this.SAMPLE_RATE) * 1000).toFixed(0)}ms)`, ); - // Transcribe using the local Whisper wrapper if (!this.workerWrapper) { throw new Error("Worker wrapper is not initialized"); } @@ -152,7 +158,7 @@ export class WhisperProvider implements TranscriptionProvider { aggregatedTranscription, ); - const text = await this.workerWrapper!.exec("transcribeAudio", [ + const text = await this.workerWrapper.exec("transcribeAudio", [ aggregatedAudio, { language: language || "auto", @@ -174,11 +180,20 @@ export class WhisperProvider implements TranscriptionProvider { } } + /** + * Clear internal buffers without transcribing + * Called when cancelling a session to prevent audio bleed + */ + reset(): void { + this.frameBuffer = []; + this.frameBufferSpeechProbabilities = []; + this.currentSilenceFrameCount = 0; + } + private shouldTranscribe(): boolean { // Transcribe if: // 1. We have significant silence after speech // 2. Buffer is getting too large - // 3. Final chunk was received (handled elsewhere) const bufferDurationMs = ((this.frameBuffer.length * this.FRAME_SIZE) / this.SAMPLE_RATE) * 1000; @@ -186,7 +201,7 @@ export class WhisperProvider implements TranscriptionProvider { ((this.currentSilenceFrameCount * this.FRAME_SIZE) / this.SAMPLE_RATE) * 1000; - // If we have speech (potential cause frameBuffer might just be all silence too, and thats okay) and then significant silence, transcribe + // If we have speech and then significant silence, transcribe if ( this.frameBuffer.length > 0 && silenceDurationMs > this.MAX_SILENCE_DURATION_MS @@ -357,9 +372,6 @@ export class WhisperProvider implements TranscriptionProvider { } } - // Clear buffers - this.frameBuffer = []; - this.frameBufferSpeechProbabilities = []; - this.currentSilenceFrameCount = 0; + this.reset(); } } diff --git a/apps/desktop/src/services/transcription-service.ts b/apps/desktop/src/services/transcription-service.ts index 7b1c29f..77ec690 100644 --- a/apps/desktop/src/services/transcription-service.ts +++ b/apps/desktop/src/services/transcription-service.ts @@ -214,23 +214,14 @@ export class TranscriptionService { /** * Process a single audio chunk in streaming mode + * For finalization, use finalizeSession() instead */ async processStreamingChunk(options: { sessionId: string; audioChunk: Float32Array; - isFinal?: boolean; - audioFilePath?: string; recordingStartedAt?: number; - recordingStoppedAt?: number; }): Promise { - const { - sessionId, - audioChunk, - isFinal = false, - audioFilePath, - recordingStartedAt, - recordingStoppedAt, - } = options; + const { sessionId, audioChunk, recordingStartedAt } = options; // Run VAD on the audio chunk let speechProbability = 0; @@ -281,7 +272,7 @@ export class TranscriptionService { context: streamingContext, transcriptionResults: [], firstChunkReceivedAt: performance.now(), - recordingStartedAt: recordingStartedAt, // From RecordingManager (when user pressed record) + recordingStartedAt: recordingStartedAt, }; this.streamingSessions.set(sessionId, session); @@ -305,11 +296,10 @@ export class TranscriptionService { // Select the appropriate provider const provider = await this.selectProvider(); - // Transcribe with flush parameter for final chunks + // Transcribe chunk (flush is done separately in finalizeSession) const chunkTranscription = await provider.transcribe({ audioData: audioChunk, - speechProbability: speechProbability, // Now from VAD service - flush: isFinal, // Pass flush flag for final chunks + speechProbability: speechProbability, context: { vocabulary: session.context.sharedData.vocabulary, accessibilityContext: session.context.sharedData.accessibilityContext, @@ -334,25 +324,96 @@ export class TranscriptionService { sessionId, frameSize: audioChunk.length, hadTranscription: chunkTranscription.length > 0, - isFinal, }); } finally { // Release transcription mutex - always release even on error this.transcriptionMutex.release(); } - const completeTranscriptionTillNow = session.transcriptionResults - .join(" ") - .trim(); - // this is the final chunk, save the transcription - if (!isFinal) { - return completeTranscriptionTillNow; + return session.transcriptionResults.join(" ").trim(); + } + + /** + * Cancel a streaming session without processing + * Used when recording is cancelled (e.g., quick tap, accidental activation) + */ + async cancelStreamingSession(sessionId: string): Promise { + if (this.streamingSessions.has(sessionId)) { + // Acquire mutex to prevent race with processStreamingChunk + await this.transcriptionMutex.acquire(); + try { + // Clear provider buffers to prevent audio bleed into next session + this.currentProvider?.reset(); + + this.streamingSessions.delete(sessionId); + logger.transcription.info("Streaming session cancelled", { sessionId }); + } finally { + this.transcriptionMutex.release(); + } + } + } + + /** + * Finalize a streaming session - flush provider, format, save to DB + * Call this instead of processStreamingChunk with isFinal=true + */ + async finalizeSession(options: { + sessionId: string; + audioFilePath?: string; + recordingStartedAt?: number; + recordingStoppedAt?: number; + }): Promise { + const { sessionId, audioFilePath, recordingStartedAt, recordingStoppedAt } = + options; + + const session = this.streamingSessions.get(sessionId); + if (!session) { + logger.transcription.warn("No session found to finalize", { sessionId }); + return ""; } - session.finalChunkReceivedAt = performance.now(); + // Update session timestamps + session.finalizationStartedAt = performance.now(); session.recordingStoppedAt = recordingStoppedAt; + if (recordingStartedAt && !session.recordingStartedAt) { + session.recordingStartedAt = recordingStartedAt; + } - let completeTranscription = completeTranscriptionTillNow; + // Flush provider to get any remaining buffered audio + await this.transcriptionMutex.acquire(); + try { + const previousChunk = + session.transcriptionResults.length > 0 + ? session.transcriptionResults[ + session.transcriptionResults.length - 1 + ] + : undefined; + const aggregatedTranscription = session.transcriptionResults + .join(" ") + .trim(); + + const provider = await this.selectProvider(); + const finalTranscription = await provider.flush({ + vocabulary: session.context.sharedData.vocabulary, + accessibilityContext: session.context.sharedData.accessibilityContext, + previousChunk, + aggregatedTranscription: aggregatedTranscription || undefined, + language: session.context.sharedData.userPreferences?.language, + }); + + if (finalTranscription.trim()) { + session.transcriptionResults.push(finalTranscription); + logger.transcription.info("Whisper returned final transcription", { + sessionId, + transcriptionLength: finalTranscription.length, + totalResults: session.transcriptionResults.length, + }); + } + } finally { + this.transcriptionMutex.release(); + } + + let completeTranscription = session.transcriptionResults.join(" ").trim(); let formattingDuration: number | undefined; logger.transcription.info("Finalizing streaming session", { diff --git a/apps/desktop/src/trpc/routers/recording.ts b/apps/desktop/src/trpc/routers/recording.ts index 449bec6..5b077ba 100644 --- a/apps/desktop/src/trpc/routers/recording.ts +++ b/apps/desktop/src/trpc/routers/recording.ts @@ -15,7 +15,7 @@ export const recordingRouter = createRouter({ if (!recordingManager) { throw new Error("Recording manager not available"); } - return await recordingManager.startRecording("hands-free"); + return await recordingManager.signalStart(); }), signalStop: procedure.mutation(async ({ ctx }) => { @@ -23,7 +23,7 @@ export const recordingRouter = createRouter({ if (!recordingManager) { throw new Error("Recording manager not available"); } - return await recordingManager.stopRecording(); + return await recordingManager.signalStop(); }), // Using Observable instead of async generator due to Symbol.asyncDispose conflict diff --git a/apps/desktop/src/types/recording.ts b/apps/desktop/src/types/recording.ts index cbfe379..997a0c2 100644 --- a/apps/desktop/src/types/recording.ts +++ b/apps/desktop/src/types/recording.ts @@ -1,6 +1 @@ -export type RecordingState = - | "idle" - | "starting" - | "recording" - | "stopping" - | "error"; +export type RecordingState = "idle" | "starting" | "recording" | "stopping"; diff --git a/apps/desktop/src/utils/streaming-wav-writer.ts b/apps/desktop/src/utils/streaming-wav-writer.ts index 1b6b791..b7f3b52 100644 --- a/apps/desktop/src/utils/streaming-wav-writer.ts +++ b/apps/desktop/src/utils/streaming-wav-writer.ts @@ -135,6 +135,25 @@ export class StreamingWavWriter { } } + /** + * Abort writing and close the file stream without finalizing + * Used when recording is cancelled + */ + async abort(): Promise { + if (this.isFinalized) return; + + this.isFinalized = true; // Prevent further writes + + // Close the stream + await new Promise((resolve) => { + this.fileStream.end(() => resolve()); + }); + + logger.transcription.info("WAV writer aborted", { + path: this.fileStream.path, + }); + } + /** * Get the current size of audio data written */