chore: adjust silence frames processing to reduce hallucinations

2025-11-07 09:31:07 +05:30 · 2025-11-07 09:31:07 +05:30 · 30d3574f76
commit 30d3574f76
parent bab8f6b07e
4 changed files with 40 additions and 90 deletions
--- a/apps/desktop/src/pipeline/providers/transcription/simple-fork-wrapper.ts
+++ b/apps/desktop/src/pipeline/providers/transcription/simple-fork-wrapper.ts
@ -98,7 +98,7 @@ export class SimpleForkWrapper {
    this.pendingCalls.clear();
  }

-  async exec<T>(method: string, args: any[]): Promise<T> {
+  async exec<T>(method: string, args: unknown[]): Promise<T> {
    if (!this.worker) {
      await this.initialize();
    }
--- a/apps/desktop/src/pipeline/providers/transcription/whisper-provider.ts
+++ b/apps/desktop/src/pipeline/providers/transcription/whisper-provider.ts
@ -17,7 +17,7 @@ export class WhisperProvider implements TranscriptionProvider {
  // Frame aggregation state
  private frameBuffer: Float32Array[] = [];
  private frameBufferSpeechProbabilities: number[] = []; // Track speech probabilities for each frame
-  private silenceFrameCount = 0;
+  private currentSilenceFrameCount = 0;
  private lastSpeechTimestamp = 0;

  private getNodeBinaryPath(): string {
@ -40,11 +40,13 @@ export class WhisperProvider implements TranscriptionProvider {
  }

  // Configuration
+  private readonly TRIM_TRAILING_AND_LEADING_SILENCE = false;
  private readonly FRAME_SIZE = 512; // 32ms at 16kHz
  private readonly MIN_SPEECH_DURATION_MS = 500; // Minimum speech duration to transcribe
-  private readonly MAX_SILENCE_DURATION_MS = 800; // Max silence before cutting
+  private readonly MAX_SILENCE_DURATION_MS = 3000; // Max silence before cutting
  private readonly SAMPLE_RATE = 16000;
  private readonly SPEECH_PROBABILITY_THRESHOLD = 0.2; // Threshold for speech detection
+  private readonly IGNORE_FULLY_SILENT_CHUNKS = true;

  constructor(modelManager: ModelManagerService) {
    this.modelManager = modelManager;
@ -81,7 +83,7 @@ export class WhisperProvider implements TranscriptionProvider {
      // Extract parameters from the new structure
      const {
        audioData,
-        speechProbability = 0,
+        speechProbability = 1,
        context,
        flush = false,
      } = params;
@ -97,15 +99,15 @@ export class WhisperProvider implements TranscriptionProvider {
      const isSpeech = speechProbability > this.SPEECH_PROBABILITY_THRESHOLD;

      logger.transcription.debug(
-        `Frame received - SpeechProb: ${speechProbability.toFixed(3)}, Buffer size: ${this.frameBuffer.length}, Silence count: ${this.silenceFrameCount}`,
+        `Frame received - SpeechProb: ${speechProbability.toFixed(3)}, Buffer size: ${this.frameBuffer.length}, Silence count: ${this.currentSilenceFrameCount}`,
      );

      // Handle speech/silence logic
      if (isSpeech) {
-        this.silenceFrameCount = 0;
+        this.currentSilenceFrameCount = 0;
        this.lastSpeechTimestamp = Date.now();
      } else {
-        this.silenceFrameCount++;
+        this.currentSilenceFrameCount++;
      }

      // Determine if we should transcribe
@ -116,13 +118,20 @@ export class WhisperProvider implements TranscriptionProvider {
        return "";
      }

+      const isAllSilent = this.isAllSilent();
+
      // Aggregate buffered frames
      const aggregatedAudio = this.aggregateFrames();

      // Clear buffers immediately after aggregation, before async operations
      this.frameBuffer = [];
      this.frameBufferSpeechProbabilities = [];
-      this.silenceFrameCount = 0;
+      this.currentSilenceFrameCount = 0;
+
+      if (isAllSilent && this.IGNORE_FULLY_SILENT_CHUNKS) {
+        logger.transcription.debug("Skipping transcription - all silent");
+        return "";
+      }

      // Skip if too short or only silence
      /* if (aggregatedAudio.length < this.FRAME_SIZE * 2) {
@ -176,9 +185,10 @@ export class WhisperProvider implements TranscriptionProvider {
    const bufferDurationMs =
      ((this.frameBuffer.length * this.FRAME_SIZE) / this.SAMPLE_RATE) * 1000;
    const silenceDurationMs =
-      ((this.silenceFrameCount * this.FRAME_SIZE) / this.SAMPLE_RATE) * 1000;
+      ((this.currentSilenceFrameCount * this.FRAME_SIZE) / this.SAMPLE_RATE) *
+      1000;

-    // If we have speech and then significant silence, transcribe
+    // If we have speech (potential cause frameBuffer might just be all silence too, and thats okay) and then significant silence, transcribe
    if (
      this.frameBuffer.length > 0 &&
      silenceDurationMs > this.MAX_SILENCE_DURATION_MS
@ -201,7 +211,7 @@ export class WhisperProvider implements TranscriptionProvider {
      bufferDurationMs,
      silenceDurationMs,
      frameBufferLength: this.frameBuffer.length,
-      silenceFrameCount: this.silenceFrameCount,
+      silenceFrameCount: this.currentSilenceFrameCount,
    });

    return false;
@ -213,7 +223,7 @@ export class WhisperProvider implements TranscriptionProvider {
      (sum, frame) => sum + frame.length,
      0,
    );
-    const aggregated = new Float32Array(totalLength);
+    let aggregated = new Float32Array(totalLength);

    // Copy all frames into single array
    let offset = 0;
@ -223,12 +233,26 @@ export class WhisperProvider implements TranscriptionProvider {
    }

    // Trim silence from beginning and end
-    const trimmed = this.trimSilence(aggregated);
+    aggregated = this.TRIM_TRAILING_AND_LEADING_SILENCE
+      ? this.trimSilence(aggregated)
+      : aggregated;

-    return trimmed;
+    return aggregated;
  }

-  private trimSilence(audio: Float32Array): Float32Array {
+  private isAllSilent = () => {
+    const bufferDurationMs =
+      ((this.frameBuffer.length * this.FRAME_SIZE) / this.SAMPLE_RATE) * 1000;
+    const silenceDurationMs =
+      ((this.currentSilenceFrameCount * this.FRAME_SIZE) / this.SAMPLE_RATE) *
+      1000;
+
+    return bufferDurationMs === silenceDurationMs;
+  };
+
+  private trimSilence(
+    audio: Float32Array<ArrayBuffer>,
+  ): Float32Array<ArrayBuffer> {
    // Find first speech frame (probability > threshold)
    let startIdx = 0;
    for (let i = 0; i < this.frameBufferSpeechProbabilities.length; i++) {
@ -338,6 +362,6 @@ export class WhisperProvider implements TranscriptionProvider {
    // Clear buffers
    this.frameBuffer = [];
    this.frameBufferSpeechProbabilities = [];
-    this.silenceFrameCount = 0;
+    this.currentSilenceFrameCount = 0;
  }
 }
--- a/apps/desktop/src/pipeline/providers/transcription/whisper-worker.ts
+++ b/apps/desktop/src/pipeline/providers/transcription/whisper-worker.ts
@ -1,70 +0,0 @@
-// This file contains just the Whisper-specific operations that need to run in a separate process
-import { Whisper } from "@amical/whisper-wrapper";
-
-// Simple console-based logging for worker process
-const logger = {
-  transcription: {
-    info: (message: string, ...args: any[]) =>
-      console.log(`[whisper-worker] INFO: ${message}`, ...args),
-    error: (message: string, ...args: any[]) =>
-      console.error(`[whisper-worker] ERROR: ${message}`, ...args),
-    debug: (message: string, ...args: any[]) =>
-      console.log(`[whisper-worker] DEBUG: ${message}`, ...args),
-  },
-};
-
-let whisperInstance: Whisper | null = null;
-let currentModelPath: string | null = null;
-
-export async function initializeModel(modelPath: string): Promise<void> {
-  if (whisperInstance && currentModelPath === modelPath) {
-    return; // Already initialized with same model
-  }
-
-  // Cleanup existing instance
-  if (whisperInstance) {
-    await whisperInstance.free();
-    whisperInstance = null;
-  }
-
-  whisperInstance = new Whisper(modelPath, { gpu: true });
-  try {
-    await whisperInstance.load();
-  } catch (e) {
-    logger.transcription.error("Failed to load Whisper model:", e);
-    throw e;
-  }
-  currentModelPath = modelPath;
-  logger.transcription.info(`Initialized with model: ${modelPath}`);
-}
-
-export async function transcribeAudio(
-  aggregatedAudio: Float32Array,
-  options: {
-    language: string;
-    initial_prompt: string;
-    suppress_blank: boolean;
-    suppress_non_speech_tokens: boolean;
-    no_timestamps: boolean;
-  },
-): Promise<string> {
-  if (!whisperInstance) {
-    throw new Error("Whisper instance is not initialized");
-  }
-
-  const { result } = await whisperInstance.transcribe(aggregatedAudio, options);
-  const transcription = await result;
-
-  return transcription
-    .map((segment: { text: string }) => segment.text)
-    .join(" ")
-    .trim();
-}
-
-export async function dispose(): Promise<void> {
-  if (whisperInstance) {
-    await whisperInstance.free();
-    whisperInstance = null;
-    currentModelPath = null;
-  }
-}
--- a/apps/desktop/vite.main.config.mts
+++ b/apps/desktop/vite.main.config.mts
@ -16,10 +16,6 @@ export default defineConfig({
    rollupOptions: {
      input: {
        main: resolve(__dirname, "src/main/main.ts"),
-        "whisper-worker": resolve(
-          __dirname,
-          "src/pipeline/providers/transcription/whisper-worker.ts",
-        ),
        "whisper-worker-fork": resolve(
          __dirname,
          "src/pipeline/providers/transcription/whisper-worker-fork.ts",