diff --git a/apps/desktop/models/silero_vad_v5.onnx b/apps/desktop/models/silero_vad_v5.onnx deleted file mode 100644 index b3e3a90..0000000 Binary files a/apps/desktop/models/silero_vad_v5.onnx and /dev/null differ diff --git a/apps/desktop/models/silero_vad_v6.onnx b/apps/desktop/models/silero_vad_v6.onnx new file mode 100644 index 0000000..80c5592 Binary files /dev/null and b/apps/desktop/models/silero_vad_v6.onnx differ diff --git a/apps/desktop/src/pipeline/providers/transcription/amical-cloud-provider.ts b/apps/desktop/src/pipeline/providers/transcription/amical-cloud-provider.ts index a9c44d4..6b5f93f 100644 --- a/apps/desktop/src/pipeline/providers/transcription/amical-cloud-provider.ts +++ b/apps/desktop/src/pipeline/providers/transcription/amical-cloud-provider.ts @@ -136,6 +136,9 @@ export class AmicalCloudProvider implements TranscriptionProvider { offset += frame.length; } + // Save VAD probabilities before clearing + const vadProbs = [...this.frameBufferSpeechProbabilities]; + // Clear frame buffers only (context values needed for API call below) this.frameBuffer = []; this.frameBufferSpeechProbabilities = []; @@ -144,6 +147,7 @@ export class AmicalCloudProvider implements TranscriptionProvider { // Make the API request return this.makeTranscriptionRequest( combinedAudio, + vadProbs, false, enableFormatting, ); @@ -177,6 +181,7 @@ export class AmicalCloudProvider implements TranscriptionProvider { private async makeTranscriptionRequest( audioData: Float32Array, + vadProbs: number[], isRetry = false, enableFormatting = false, ): Promise { @@ -215,6 +220,7 @@ export class AmicalCloudProvider implements TranscriptionProvider { }, body: JSON.stringify({ audioData: Array.from(audioData), + vadProbs, language: this.currentLanguage, previousTranscription: this.currentAggregatedTranscription, formatting: { @@ -262,6 +268,7 @@ export class AmicalCloudProvider implements TranscriptionProvider { // Retry the request once (preserve formatting flag) return await this.makeTranscriptionRequest( audioData, + vadProbs, true, enableFormatting, ); diff --git a/apps/desktop/src/services/vad-service.ts b/apps/desktop/src/services/vad-service.ts index 9cdec94..6d47be3 100644 --- a/apps/desktop/src/services/vad-service.ts +++ b/apps/desktop/src/services/vad-service.ts @@ -13,10 +13,13 @@ export class VADService extends EventEmitter { // Configuration private readonly WINDOW_SIZE_SAMPLES = 512; // 32ms at 16kHz + private readonly CTX_SIZE = 64; // Context size for v6 + private readonly INPUT_SIZE = 576; // CTX_SIZE + WINDOW_SIZE_SAMPLES private readonly SPEECH_THRESHOLD = 0.1; private readonly REDEMPTION_FRAMES = 8; // State + private context: Float32Array = new Float32Array(64).fill(0); // v6 context buffer private speechFrameCount = 0; private silenceFrameCount = 0; private isSpeaking = false; @@ -33,13 +36,13 @@ export class VADService extends EventEmitter { this.modelPath = path.join( process.resourcesPath, "models", - "silero_vad_v5.onnx", + "silero_vad_v6.onnx", ); } else { // In development, use the source path this.modelPath = path.join( __dirname, - "../../models/silero_vad_v5.onnx", + "../../models/silero_vad_v6.onnx", ); } @@ -90,10 +93,14 @@ export class VADService extends EventEmitter { } try { - // Create input tensor - shape should be [1, audio_length] - const inputTensor = new ort.Tensor("float32", audioFrames, [ + // v6: Create combined input [context | frame] with fixed size 576 + const input = new Float32Array(this.INPUT_SIZE); + input.set(this.context, 0); + input.set(audioFrames, this.CTX_SIZE); + + const inputTensor = new ort.Tensor("float32", input, [ 1, - audioFrames.length, + this.INPUT_SIZE, ]); const srTensor = new ort.Tensor( @@ -109,12 +116,18 @@ export class VADService extends EventEmitter { sr: srTensor, }); + // v6: Use dynamic output name detection for robustness + const outName = this.session.outputNames[0]; + const stateName = this.session.outputNames.find((n) => n !== outName)!; + // Update state for next iteration - this.state = results.stateN as ort.Tensor; + this.state = results[stateName] as ort.Tensor; // Get speech probability - const output = results.output as ort.Tensor; - const probability = output.data[0] as number; + const probability = (results[outName].data as Float32Array)[0]; + + // v6: Update context = last CTX_SIZE samples of the input + this.context = input.slice(this.INPUT_SIZE - this.CTX_SIZE); // Apply smoothing logic const isSpeaking = this.applySpeechDetectionLogic(probability); @@ -182,10 +195,11 @@ export class VADService extends EventEmitter { /** * Reset VAD state for a new recording session. - * This clears the LSTM state and speech detection counters. + * This clears the LSTM state, context buffer, and speech detection counters. */ reset(): void { this.resetStates(); + this.context = new Float32Array(this.CTX_SIZE).fill(0); // Reset v6 context buffer this.speechFrameCount = 0; this.silenceFrameCount = 0; this.isSpeaking = false;