chore: update to use vad v6 and forward vad probs to cloud endpoint

2026-01-08 01:12:31 +05:30 · 2026-01-08 01:12:31 +05:30 · 3b2bc12920
commit 3b2bc12920
parent 73734bfdd9
4 changed files with 30 additions and 9 deletions
--- a/apps/desktop/models/silero_vad_v5.onnx
+++ b/apps/desktop/models/silero_vad_v5.onnx
--- a/apps/desktop/models/silero_vad_v6.onnx
+++ b/apps/desktop/models/silero_vad_v6.onnx
--- a/apps/desktop/src/pipeline/providers/transcription/amical-cloud-provider.ts
+++ b/apps/desktop/src/pipeline/providers/transcription/amical-cloud-provider.ts
@ -136,6 +136,9 @@ export class AmicalCloudProvider implements TranscriptionProvider {
      offset += frame.length;
    }

+    // Save VAD probabilities before clearing
+    const vadProbs = [...this.frameBufferSpeechProbabilities];
+
    // Clear frame buffers only (context values needed for API call below)
    this.frameBuffer = [];
    this.frameBufferSpeechProbabilities = [];
@ -144,6 +147,7 @@ export class AmicalCloudProvider implements TranscriptionProvider {
    // Make the API request
    return this.makeTranscriptionRequest(
      combinedAudio,
+      vadProbs,
      false,
      enableFormatting,
    );
@ -177,6 +181,7 @@ export class AmicalCloudProvider implements TranscriptionProvider {

  private async makeTranscriptionRequest(
    audioData: Float32Array,
+    vadProbs: number[],
    isRetry = false,
    enableFormatting = false,
  ): Promise<string> {
@ -215,6 +220,7 @@ export class AmicalCloudProvider implements TranscriptionProvider {
      },
      body: JSON.stringify({
        audioData: Array.from(audioData),
+        vadProbs,
        language: this.currentLanguage,
        previousTranscription: this.currentAggregatedTranscription,
        formatting: {
@ -262,6 +268,7 @@ export class AmicalCloudProvider implements TranscriptionProvider {
        // Retry the request once (preserve formatting flag)
        return await this.makeTranscriptionRequest(
          audioData,
+          vadProbs,
          true,
          enableFormatting,
        );
--- a/apps/desktop/src/services/vad-service.ts
+++ b/apps/desktop/src/services/vad-service.ts
@ -13,10 +13,13 @@ export class VADService extends EventEmitter {

  // Configuration
  private readonly WINDOW_SIZE_SAMPLES = 512; // 32ms at 16kHz
+  private readonly CTX_SIZE = 64; // Context size for v6
+  private readonly INPUT_SIZE = 576; // CTX_SIZE + WINDOW_SIZE_SAMPLES
  private readonly SPEECH_THRESHOLD = 0.1;
  private readonly REDEMPTION_FRAMES = 8;

  // State
+  private context: Float32Array = new Float32Array(64).fill(0); // v6 context buffer
  private speechFrameCount = 0;
  private silenceFrameCount = 0;
  private isSpeaking = false;
@ -33,13 +36,13 @@ export class VADService extends EventEmitter {
        this.modelPath = path.join(
          process.resourcesPath,
          "models",
-          "silero_vad_v5.onnx",
+          "silero_vad_v6.onnx",
        );
      } else {
        // In development, use the source path
        this.modelPath = path.join(
          __dirname,
-          "../../models/silero_vad_v5.onnx",
+          "../../models/silero_vad_v6.onnx",
        );
      }

@ -90,10 +93,14 @@ export class VADService extends EventEmitter {
    }

    try {
-      // Create input tensor - shape should be [1, audio_length]
-      const inputTensor = new ort.Tensor("float32", audioFrames, [
+      // v6: Create combined input [context | frame] with fixed size 576
+      const input = new Float32Array(this.INPUT_SIZE);
+      input.set(this.context, 0);
+      input.set(audioFrames, this.CTX_SIZE);
+
+      const inputTensor = new ort.Tensor("float32", input, [
        1,
-        audioFrames.length,
+        this.INPUT_SIZE,
      ]);

      const srTensor = new ort.Tensor(
@ -109,12 +116,18 @@ export class VADService extends EventEmitter {
        sr: srTensor,
      });

+      // v6: Use dynamic output name detection for robustness
+      const outName = this.session.outputNames[0];
+      const stateName = this.session.outputNames.find((n) => n !== outName)!;
+
      // Update state for next iteration
-      this.state = results.stateN as ort.Tensor;
+      this.state = results[stateName] as ort.Tensor;

      // Get speech probability
-      const output = results.output as ort.Tensor;
-      const probability = output.data[0] as number;
+      const probability = (results[outName].data as Float32Array)[0];
+
+      // v6: Update context = last CTX_SIZE samples of the input
+      this.context = input.slice(this.INPUT_SIZE - this.CTX_SIZE);

      // Apply smoothing logic
      const isSpeaking = this.applySpeechDetectionLogic(probability);
@ -182,10 +195,11 @@ export class VADService extends EventEmitter {

  /**
   * Reset VAD state for a new recording session.
-   * This clears the LSTM state and speech detection counters.
+   * This clears the LSTM state, context buffer, and speech detection counters.
   */
  reset(): void {
    this.resetStates();
+    this.context = new Float32Array(this.CTX_SIZE).fill(0); // Reset v6 context buffer
    this.speechFrameCount = 0;
    this.silenceFrameCount = 0;
    this.isSpeaking = false;