From 30d3574f763c9f81368596838fd4a94d00a9cbf0 Mon Sep 17 00:00:00 2001
From: haritabh-z01 <haritabh.z01+github@gmail.com>
Date: Fri, 7 Nov 2025 09:31:07 +0530
Subject: [PATCH] chore: adjust silence frames processing to reduce
 hallucinations

---
 .../transcription/simple-fork-wrapper.ts      |  2 +-
 .../transcription/whisper-provider.ts         | 54 ++++++++++----
 .../providers/transcription/whisper-worker.ts | 70 -------------------
 apps/desktop/vite.main.config.mts             |  4 --
 4 files changed, 40 insertions(+), 90 deletions(-)
 delete mode 100644 apps/desktop/src/pipeline/providers/transcription/whisper-worker.ts
diff --git a/apps/desktop/src/pipeline/providers/transcription/simple-fork-wrapper.ts b/apps/desktop/src/pipeline/providers/transcription/simple-fork-wrapper.ts
index 22e1496..f282196 100644
--- a/apps/desktop/src/pipeline/providers/transcription/simple-fork-wrapper.ts
+++ b/apps/desktop/src/pipeline/providers/transcription/simple-fork-wrapper.ts
@@ -98,7 +98,7 @@ export class SimpleForkWrapper {
     this.pendingCalls.clear();
   }
 
-  async exec<T>(method: string, args: any[]): Promise<T> {
+  async exec<T>(method: string, args: unknown[]): Promise<T> {
     if (!this.worker) {
       await this.initialize();
     }
diff --git a/apps/desktop/src/pipeline/providers/transcription/whisper-provider.ts b/apps/desktop/src/pipeline/providers/transcription/whisper-provider.ts
index 98ecd24..92d883d 100644
--- a/apps/desktop/src/pipeline/providers/transcription/whisper-provider.ts
+++ b/apps/desktop/src/pipeline/providers/transcription/whisper-provider.ts
@@ -17,7 +17,7 @@ export class WhisperProvider implements TranscriptionProvider {
   // Frame aggregation state
   private frameBuffer: Float32Array[] = [];
   private frameBufferSpeechProbabilities: number[] = []; // Track speech probabilities for each frame
-  private silenceFrameCount = 0;
+  private currentSilenceFrameCount = 0;
   private lastSpeechTimestamp = 0;
 
   private getNodeBinaryPath(): string {
@@ -40,11 +40,13 @@ export class WhisperProvider implements TranscriptionProvider {
   }
 
   // Configuration
+  private readonly TRIM_TRAILING_AND_LEADING_SILENCE = false;
   private readonly FRAME_SIZE = 512; // 32ms at 16kHz
   private readonly MIN_SPEECH_DURATION_MS = 500; // Minimum speech duration to transcribe
-  private readonly MAX_SILENCE_DURATION_MS = 800; // Max silence before cutting
+  private readonly MAX_SILENCE_DURATION_MS = 3000; // Max silence before cutting
   private readonly SAMPLE_RATE = 16000;
   private readonly SPEECH_PROBABILITY_THRESHOLD = 0.2; // Threshold for speech detection
+  private readonly IGNORE_FULLY_SILENT_CHUNKS = true;
 
   constructor(modelManager: ModelManagerService) {
     this.modelManager = modelManager;
@@ -81,7 +83,7 @@ export class WhisperProvider implements TranscriptionProvider {
       // Extract parameters from the new structure
       const {
         audioData,
-        speechProbability = 0,
+        speechProbability = 1,
         context,
         flush = false,
       } = params;
@@ -97,15 +99,15 @@ export class WhisperProvider implements TranscriptionProvider {
       const isSpeech = speechProbability > this.SPEECH_PROBABILITY_THRESHOLD;
 
       logger.transcription.debug(
-        `Frame received - SpeechProb: ${speechProbability.toFixed(3)}, Buffer size: ${this.frameBuffer.length}, Silence count: ${this.silenceFrameCount}`,
+        `Frame received - SpeechProb: ${speechProbability.toFixed(3)}, Buffer size: ${this.frameBuffer.length}, Silence count: ${this.currentSilenceFrameCount}`,
       );
 
       // Handle speech/silence logic
       if (isSpeech) {
-        this.silenceFrameCount = 0;
+        this.currentSilenceFrameCount = 0;
         this.lastSpeechTimestamp = Date.now();
       } else {
-        this.silenceFrameCount++;
+        this.currentSilenceFrameCount++;
       }
 
       // Determine if we should transcribe
@@ -116,13 +118,20 @@ export class WhisperProvider implements TranscriptionProvider {
         return "";
       }
 
+      const isAllSilent = this.isAllSilent();
+
       // Aggregate buffered frames
       const aggregatedAudio = this.aggregateFrames();
 
       // Clear buffers immediately after aggregation, before async operations
       this.frameBuffer = [];
       this.frameBufferSpeechProbabilities = [];
-      this.silenceFrameCount = 0;
+      this.currentSilenceFrameCount = 0;
+
+      if (isAllSilent && this.IGNORE_FULLY_SILENT_CHUNKS) {
+        logger.transcription.debug("Skipping transcription - all silent");
+        return "";
+      }
 
       // Skip if too short or only silence
       /* if (aggregatedAudio.length < this.FRAME_SIZE * 2) {
@@ -176,9 +185,10 @@ export class WhisperProvider implements TranscriptionProvider {
     const bufferDurationMs =
       ((this.frameBuffer.length * this.FRAME_SIZE) / this.SAMPLE_RATE) * 1000;
     const silenceDurationMs =
-      ((this.silenceFrameCount * this.FRAME_SIZE) / this.SAMPLE_RATE) * 1000;
+      ((this.currentSilenceFrameCount * this.FRAME_SIZE) / this.SAMPLE_RATE) *
+      1000;
 
-    // If we have speech and then significant silence, transcribe
+    // If the buffer is non-empty (it may hold only silence, and that's okay) and the current silence run is long enough, transcribe
     if (
       this.frameBuffer.length > 0 &&
       silenceDurationMs > this.MAX_SILENCE_DURATION_MS
@@ -201,7 +211,7 @@ export class WhisperProvider implements TranscriptionProvider {
       bufferDurationMs,
       silenceDurationMs,
       frameBufferLength: this.frameBuffer.length,
-      silenceFrameCount: this.silenceFrameCount,
+      silenceFrameCount: this.currentSilenceFrameCount,
     });
 
     return false;
@@ -213,7 +223,7 @@ export class WhisperProvider implements TranscriptionProvider {
       (sum, frame) => sum + frame.length,
       0,
     );
-    const aggregated = new Float32Array(totalLength);
+    let aggregated = new Float32Array(totalLength);
 
     // Copy all frames into single array
     let offset = 0;
@@ -223,12 +233,26 @@ export class WhisperProvider implements TranscriptionProvider {
     }
 
     // Trim silence from beginning and end
-    const trimmed = this.trimSilence(aggregated);
+    aggregated = this.TRIM_TRAILING_AND_LEADING_SILENCE
+      ? this.trimSilence(aggregated)
+      : aggregated;
 
-    return trimmed;
+    return aggregated;
   }
 
-  private trimSilence(audio: Float32Array): Float32Array {
+  private isAllSilent = () => {
+    const bufferDurationMs =
+      ((this.frameBuffer.length * this.FRAME_SIZE) / this.SAMPLE_RATE) * 1000;
+    const silenceDurationMs =
+      ((this.currentSilenceFrameCount * this.FRAME_SIZE) / this.SAMPLE_RATE) *
+      1000;
+    // Both durations use the same scale factor, so this is true exactly when the buffered frame count equals the running silence count.
+    return bufferDurationMs === silenceDurationMs;
+  };
+
+  private trimSilence(
+    audio: Float32Array<ArrayBuffer>,
+  ): Float32Array<ArrayBuffer> {
     // Find first speech frame (probability > threshold)
     let startIdx = 0;
     for (let i = 0; i < this.frameBufferSpeechProbabilities.length; i++) {
@@ -338,6 +362,6 @@ export class WhisperProvider implements TranscriptionProvider {
     // Clear buffers
     this.frameBuffer = [];
     this.frameBufferSpeechProbabilities = [];
-    this.silenceFrameCount = 0;
+    this.currentSilenceFrameCount = 0;
   }
 }
diff --git a/apps/desktop/src/pipeline/providers/transcription/whisper-worker.ts b/apps/desktop/src/pipeline/providers/transcription/whisper-worker.ts
deleted file mode 100644
index 2b6da3a..0000000
--- a/apps/desktop/src/pipeline/providers/transcription/whisper-worker.ts
+++ /dev/null
@@ -1,70 +0,0 @@
-// This file contains just the Whisper-specific operations that need to run in a separate process
-import { Whisper } from "@amical/whisper-wrapper";
-
-// Simple console-based logging for worker process
-const logger = {
-  transcription: {
-    info: (message: string, ...args: any[]) =>
-      console.log(`[whisper-worker] INFO: ${message}`, ...args),
-    error: (message: string, ...args: any[]) =>
-      console.error(`[whisper-worker] ERROR: ${message}`, ...args),
-    debug: (message: string, ...args: any[]) =>
-      console.log(`[whisper-worker] DEBUG: ${message}`, ...args),
-  },
-};
-
-let whisperInstance: Whisper | null = null;
-let currentModelPath: string | null = null;
-
-export async function initializeModel(modelPath: string): Promise<void> {
-  if (whisperInstance && currentModelPath === modelPath) {
-    return; // Already initialized with same model
-  }
-
-  // Cleanup existing instance
-  if (whisperInstance) {
-    await whisperInstance.free();
-    whisperInstance = null;
-  }
-
-  whisperInstance = new Whisper(modelPath, { gpu: true });
-  try {
-    await whisperInstance.load();
-  } catch (e) {
-    logger.transcription.error("Failed to load Whisper model:", e);
-    throw e;
-  }
-  currentModelPath = modelPath;
-  logger.transcription.info(`Initialized with model: ${modelPath}`);
-}
-
-export async function transcribeAudio(
-  aggregatedAudio: Float32Array,
-  options: {
-    language: string;
-    initial_prompt: string;
-    suppress_blank: boolean;
-    suppress_non_speech_tokens: boolean;
-    no_timestamps: boolean;
-  },
-): Promise<string> {
-  if (!whisperInstance) {
-    throw new Error("Whisper instance is not initialized");
-  }
-
-  const { result } = await whisperInstance.transcribe(aggregatedAudio, options);
-  const transcription = await result;
-
-  return transcription
-    .map((segment: { text: string }) => segment.text)
-    .join(" ")
-    .trim();
-}
-
-export async function dispose(): Promise<void> {
-  if (whisperInstance) {
-    await whisperInstance.free();
-    whisperInstance = null;
-    currentModelPath = null;
-  }
-}
diff --git a/apps/desktop/vite.main.config.mts b/apps/desktop/vite.main.config.mts
index ce51f91..74cb813 100644
--- a/apps/desktop/vite.main.config.mts
+++ b/apps/desktop/vite.main.config.mts
@@ -16,10 +16,6 @@ export default defineConfig({
     rollupOptions: {
       input: {
         main: resolve(__dirname, "src/main/main.ts"),
-        "whisper-worker": resolve(
-          __dirname,
-          "src/pipeline/providers/transcription/whisper-worker.ts",
-        ),
         "whisper-worker-fork": resolve(
           __dirname,
           "src/pipeline/providers/transcription/whisper-worker-fork.ts",