From 30d3574f763c9f81368596838fd4a94d00a9cbf0 Mon Sep 17 00:00:00 2001 From: haritabh-z01 Date: Fri, 7 Nov 2025 09:31:07 +0530 Subject: [PATCH] chore: adjust silence frames processing to reduce hallucinations --- .../transcription/simple-fork-wrapper.ts | 2 +- .../transcription/whisper-provider.ts | 54 ++++++++++---- .../providers/transcription/whisper-worker.ts | 70 ------------------- apps/desktop/vite.main.config.mts | 4 -- 4 files changed, 40 insertions(+), 90 deletions(-) delete mode 100644 apps/desktop/src/pipeline/providers/transcription/whisper-worker.ts diff --git a/apps/desktop/src/pipeline/providers/transcription/simple-fork-wrapper.ts b/apps/desktop/src/pipeline/providers/transcription/simple-fork-wrapper.ts index 22e1496..f282196 100644 --- a/apps/desktop/src/pipeline/providers/transcription/simple-fork-wrapper.ts +++ b/apps/desktop/src/pipeline/providers/transcription/simple-fork-wrapper.ts @@ -98,7 +98,7 @@ export class SimpleForkWrapper { this.pendingCalls.clear(); } - async exec(method: string, args: any[]): Promise { + async exec(method: string, args: unknown[]): Promise { if (!this.worker) { await this.initialize(); } diff --git a/apps/desktop/src/pipeline/providers/transcription/whisper-provider.ts b/apps/desktop/src/pipeline/providers/transcription/whisper-provider.ts index 98ecd24..92d883d 100644 --- a/apps/desktop/src/pipeline/providers/transcription/whisper-provider.ts +++ b/apps/desktop/src/pipeline/providers/transcription/whisper-provider.ts @@ -17,7 +17,7 @@ export class WhisperProvider implements TranscriptionProvider { // Frame aggregation state private frameBuffer: Float32Array[] = []; private frameBufferSpeechProbabilities: number[] = []; // Track speech probabilities for each frame - private silenceFrameCount = 0; + private currentSilenceFrameCount = 0; private lastSpeechTimestamp = 0; private getNodeBinaryPath(): string { @@ -40,11 +40,13 @@ export class WhisperProvider implements TranscriptionProvider { } // Configuration + private 
readonly TRIM_TRAILING_AND_LEADING_SILENCE = false; private readonly FRAME_SIZE = 512; // 32ms at 16kHz private readonly MIN_SPEECH_DURATION_MS = 500; // Minimum speech duration to transcribe - private readonly MAX_SILENCE_DURATION_MS = 800; // Max silence before cutting + private readonly MAX_SILENCE_DURATION_MS = 3000; // Max silence before cutting private readonly SAMPLE_RATE = 16000; private readonly SPEECH_PROBABILITY_THRESHOLD = 0.2; // Threshold for speech detection + private readonly IGNORE_FULLY_SILENT_CHUNKS = true; constructor(modelManager: ModelManagerService) { this.modelManager = modelManager; @@ -81,7 +83,7 @@ export class WhisperProvider implements TranscriptionProvider { // Extract parameters from the new structure const { audioData, - speechProbability = 0, + speechProbability = 1, context, flush = false, } = params; @@ -97,15 +99,15 @@ export class WhisperProvider implements TranscriptionProvider { const isSpeech = speechProbability > this.SPEECH_PROBABILITY_THRESHOLD; logger.transcription.debug( - `Frame received - SpeechProb: ${speechProbability.toFixed(3)}, Buffer size: ${this.frameBuffer.length}, Silence count: ${this.silenceFrameCount}`, + `Frame received - SpeechProb: ${speechProbability.toFixed(3)}, Buffer size: ${this.frameBuffer.length}, Silence count: ${this.currentSilenceFrameCount}`, ); // Handle speech/silence logic if (isSpeech) { - this.silenceFrameCount = 0; + this.currentSilenceFrameCount = 0; this.lastSpeechTimestamp = Date.now(); } else { - this.silenceFrameCount++; + this.currentSilenceFrameCount++; } // Determine if we should transcribe @@ -116,13 +118,20 @@ export class WhisperProvider implements TranscriptionProvider { return ""; } + const isAllSilent = this.isAllSilent(); + // Aggregate buffered frames const aggregatedAudio = this.aggregateFrames(); // Clear buffers immediately after aggregation, before async operations this.frameBuffer = []; this.frameBufferSpeechProbabilities = []; - this.silenceFrameCount = 0; + 
this.currentSilenceFrameCount = 0; + + if (isAllSilent && this.IGNORE_FULLY_SILENT_CHUNKS) { + logger.transcription.debug("Skipping transcription - all silent"); + return ""; + } // Skip if too short or only silence /* if (aggregatedAudio.length < this.FRAME_SIZE * 2) { @@ -176,9 +185,10 @@ export class WhisperProvider implements TranscriptionProvider { const bufferDurationMs = ((this.frameBuffer.length * this.FRAME_SIZE) / this.SAMPLE_RATE) * 1000; const silenceDurationMs = - ((this.silenceFrameCount * this.FRAME_SIZE) / this.SAMPLE_RATE) * 1000; + ((this.currentSilenceFrameCount * this.FRAME_SIZE) / this.SAMPLE_RATE) * + 1000; - // If we have speech and then significant silence, transcribe + // If we have buffered frames (possibly all silence, which is okay) followed by significant silence, transcribe if ( this.frameBuffer.length > 0 && silenceDurationMs > this.MAX_SILENCE_DURATION_MS @@ -201,7 +211,7 @@ export class WhisperProvider implements TranscriptionProvider { bufferDurationMs, silenceDurationMs, frameBufferLength: this.frameBuffer.length, - silenceFrameCount: this.silenceFrameCount, + silenceFrameCount: this.currentSilenceFrameCount, }); return false; @@ -213,7 +223,7 @@ export class WhisperProvider implements TranscriptionProvider { (sum, frame) => sum + frame.length, 0, ); - const aggregated = new Float32Array(totalLength); + let aggregated = new Float32Array(totalLength); // Copy all frames into single array let offset = 0; @@ -223,12 +233,26 @@ export class WhisperProvider implements TranscriptionProvider { } // Trim silence from beginning and end - const trimmed = this.trimSilence(aggregated); + aggregated = this.TRIM_TRAILING_AND_LEADING_SILENCE + ? 
this.trimSilence(aggregated) + : aggregated; - return trimmed; + return aggregated; } - private trimSilence(audio: Float32Array): Float32Array { + private isAllSilent = () => { + const bufferDurationMs = + ((this.frameBuffer.length * this.FRAME_SIZE) / this.SAMPLE_RATE) * 1000; + const silenceDurationMs = + ((this.currentSilenceFrameCount * this.FRAME_SIZE) / this.SAMPLE_RATE) * + 1000; + + return bufferDurationMs === silenceDurationMs; + }; + + private trimSilence( + audio: Float32Array, + ): Float32Array { // Find first speech frame (probability > threshold) let startIdx = 0; for (let i = 0; i < this.frameBufferSpeechProbabilities.length; i++) { @@ -338,6 +362,6 @@ export class WhisperProvider implements TranscriptionProvider { // Clear buffers this.frameBuffer = []; this.frameBufferSpeechProbabilities = []; - this.silenceFrameCount = 0; + this.currentSilenceFrameCount = 0; } } diff --git a/apps/desktop/src/pipeline/providers/transcription/whisper-worker.ts b/apps/desktop/src/pipeline/providers/transcription/whisper-worker.ts deleted file mode 100644 index 2b6da3a..0000000 --- a/apps/desktop/src/pipeline/providers/transcription/whisper-worker.ts +++ /dev/null @@ -1,70 +0,0 @@ -// This file contains just the Whisper-specific operations that need to run in a separate process -import { Whisper } from "@amical/whisper-wrapper"; - -// Simple console-based logging for worker process -const logger = { - transcription: { - info: (message: string, ...args: any[]) => - console.log(`[whisper-worker] INFO: ${message}`, ...args), - error: (message: string, ...args: any[]) => - console.error(`[whisper-worker] ERROR: ${message}`, ...args), - debug: (message: string, ...args: any[]) => - console.log(`[whisper-worker] DEBUG: ${message}`, ...args), - }, -}; - -let whisperInstance: Whisper | null = null; -let currentModelPath: string | null = null; - -export async function initializeModel(modelPath: string): Promise { - if (whisperInstance && currentModelPath === modelPath) { - 
return; // Already initialized with same model - } - - // Cleanup existing instance - if (whisperInstance) { - await whisperInstance.free(); - whisperInstance = null; - } - - whisperInstance = new Whisper(modelPath, { gpu: true }); - try { - await whisperInstance.load(); - } catch (e) { - logger.transcription.error("Failed to load Whisper model:", e); - throw e; - } - currentModelPath = modelPath; - logger.transcription.info(`Initialized with model: ${modelPath}`); -} - -export async function transcribeAudio( - aggregatedAudio: Float32Array, - options: { - language: string; - initial_prompt: string; - suppress_blank: boolean; - suppress_non_speech_tokens: boolean; - no_timestamps: boolean; - }, -): Promise { - if (!whisperInstance) { - throw new Error("Whisper instance is not initialized"); - } - - const { result } = await whisperInstance.transcribe(aggregatedAudio, options); - const transcription = await result; - - return transcription - .map((segment: { text: string }) => segment.text) - .join(" ") - .trim(); -} - -export async function dispose(): Promise { - if (whisperInstance) { - await whisperInstance.free(); - whisperInstance = null; - currentModelPath = null; - } -} diff --git a/apps/desktop/vite.main.config.mts b/apps/desktop/vite.main.config.mts index ce51f91..74cb813 100644 --- a/apps/desktop/vite.main.config.mts +++ b/apps/desktop/vite.main.config.mts @@ -16,10 +16,6 @@ export default defineConfig({ rollupOptions: { input: { main: resolve(__dirname, "src/main/main.ts"), - "whisper-worker": resolve( - __dirname, - "src/pipeline/providers/transcription/whisper-worker.ts", - ), "whisper-worker-fork": resolve( __dirname, "src/pipeline/providers/transcription/whisper-worker-fork.ts",