Optimise local transcription calls (#33)

* chore: move audio worklet file to assets
* chore: get rid of rickyvad (@ricky0123/vad-web) and use the VAD model directly
* fix: handling of onnxruntime in packaged app
* chore: run CI on macOS
* fix: formatting
2025-07-03 12:18:47 +05:30 · 2025-07-03 12:18:47 +05:30 · 5eb5777001
commit 5eb5777001
parent e4b4e92be4
20 changed files with 775 additions and 521 deletions
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@ -13,7 +13,7 @@ env:

 jobs:
  build:
-    runs-on: ubuntu-latest
+    runs-on: macos-latest

    steps:
      - name: Checkout repository
--- a/apps/desktop/forge.config.ts
+++ b/apps/desktop/forge.config.ts
@ -34,6 +34,7 @@ export const EXTERNAL_DEPENDENCIES = [
  "@libsql/linux-x64-musl",
  "@libsql/win32-x64-msvc",
  "libsql",
+  "onnxruntime-node",
  // Add any other native modules you need here
 ];

@ -195,13 +196,16 @@ const config: ForgeConfig = {
    },
  },
  packagerConfig: {
-    asar: true,
+    asar: {
+      unpack: "{*.node,*.dylib,*.so,*.dll}",
+    },
    name: "Amical",
    executableName: "Amical",
    icon: "./assets/logo.icns", // Path to your icon file
    extraResource: [
      "../../packages/native-helpers/swift-helper/bin",
      "./src/db/migrations",
+      "./src/assets",
    ],
    extendInfo: {
      NSMicrophoneUsageDescription:
--- a/apps/desktop/package.json
+++ b/apps/desktop/package.json
@ -86,7 +86,6 @@
    "@radix-ui/react-toggle": "^1.1.9",
    "@radix-ui/react-toggle-group": "^1.1.10",
    "@radix-ui/react-tooltip": "^1.2.7",
-    "@ricky0123/vad-web": "^0.0.24",
    "@tabler/icons-react": "^3.34.0",
    "@tanstack/react-query": "^5.81.2",
    "@tanstack/react-table": "^8.21.3",
@ -116,6 +115,7 @@
    "libsql": "^0.5.13",
    "lucide-react": "^0.510.0",
    "next-themes": "^0.4.6",
+    "onnxruntime-node": "^1.20.1",
    "openai": "^4.98.0",
    "react": "^19.1.0",
    "react-day-picker": "8.10.1",
--- a/apps/desktop/src/assets/audio-recorder-processor.js
+++ b/apps/desktop/src/assets/audio-recorder-processor.js
@ -0,0 +1,56 @@
+/**
+ * AudioWorklet processor that re-chunks incoming audio into fixed-size frames
+ * and posts each one through its message port.
+ *
+ * Outgoing messages: { type: 'audioFrame', frame: Float32Array, isFinal: boolean }.
+ * Incoming messages: { type: 'flush' } posts whatever is currently buffered
+ * (possibly an empty frame) with isFinal set to true.
+ */
+class AudioRecorderProcessor extends AudioWorkletProcessor {
+  constructor() {
+    super();
+    this.frameSize = 512; // 32ms at 16kHz
+    // Fixed-size accumulator that is reused for every frame, so process() does
+    // not grow and re-slice a plain array on each call.
+    this.buffer = new Float32Array(this.frameSize);
+    this.bufferedSamples = 0;
+
+    // Listen for control messages
+    this.port.onmessage = (event) => {
+      if (event.data.type === 'flush') {
+        this.flushBuffer();
+      }
+    };
+  }
+
+  /** Posts the buffered remainder as the final frame and resets the buffer. */
+  flushBuffer() {
+    // Always send a final frame (even an empty one) to signal end of recording
+    const finalFrame = this.buffer.slice(0, this.bufferedSamples);
+    this.bufferedSamples = 0;
+
+    this.port.postMessage({
+      type: 'audioFrame',
+      frame: finalFrame,
+      isFinal: true
+    });
+  }
+
+  /**
+   * Appends the first channel of the first input to the accumulator and posts
+   * a frame each time frameSize samples have been collected.
+   * Returns true in every case, including when no input data is present.
+   */
+  process(inputs, outputs, parameters) {
+    const input = inputs[0];
+    if (!input || !input[0]) return true;
+
+    const channelData = input[0];
+    let readPos = 0;
+
+    while (readPos < channelData.length) {
+      // Copy as much as fits into the remaining space of the current frame
+      const count = Math.min(
+        this.frameSize - this.bufferedSamples,
+        channelData.length - readPos
+      );
+      this.buffer.set(
+        channelData.subarray(readPos, readPos + count),
+        this.bufferedSamples
+      );
+      this.bufferedSamples += count;
+      readPos += count;
+
+      if (this.bufferedSamples === this.frameSize) {
+        // slice() copies, so the accumulator can be reused immediately
+        this.port.postMessage({
+          type: 'audioFrame',
+          frame: this.buffer.slice(),
+          isFinal: false
+        });
+        this.bufferedSamples = 0;
+      }
+    }
+
+    return true;
+  }
+}
+
+registerProcessor('audio-recorder-processor', AudioRecorderProcessor);
--- a/apps/desktop/src/assets/silero_vad_v5.onnx
+++ b/apps/desktop/src/assets/silero_vad_v5.onnx
--- a/apps/desktop/src/hooks/audio-recorder-worklet.ts
+++ b/apps/desktop/src/hooks/audio-recorder-worklet.ts
@ -1,65 +0,0 @@
-// AudioWorklet processor source code
-export const audioRecorderWorkletSource = `
-// AudioWorklet processor for real-time audio capture
-// This runs in the audio rendering thread for low-latency processing
-/* eslint-env worker */
-/* global AudioWorkletProcessor, registerProcessor */
-
-class AudioRecorderProcessor extends AudioWorkletProcessor {
-  constructor() {
-    super();
-    this.bufferSize = 4096;
-    this.buffer = new Float32Array(this.bufferSize);
-    this.bufferIndex = 0;
-
-    // Listen for messages from main thread
-    this.port.onmessage = (event) => {
-      if (event.data.command === 'stop') {
-        this.sendBufferedAudio(true); // Send final chunk
-      }
-    };
-  }
-
-  process(inputs, _outputs, _parameters) {
-    const input = inputs[0];
-
-    // Check if we have input audio
-    if (input && input.length > 0) {
-      const inputChannel = input[0]; // Get first (mono) channel
-
-      // Buffer the audio data
-      for (let i = 0; i < inputChannel.length; i++) {
-        this.buffer[this.bufferIndex] = inputChannel[i];
-        this.bufferIndex++;
-
-        // When buffer is full, send it to main thread
-        if (this.bufferIndex >= this.bufferSize) {
-          this.sendBufferedAudio(false);
-          this.bufferIndex = 0; // Reset buffer
-        }
-      }
-    }
-
-    // Keep the processor alive
-    return true;
-  }
-
-  sendBufferedAudio(isFinal) {
-    if (this.bufferIndex > 0 || isFinal) {
-      // Create a copy of the current buffer data
-      const audioData = new Float32Array(this.bufferIndex);
-      audioData.set(this.buffer.subarray(0, this.bufferIndex));
-
-      // Send to main thread
-      this.port.postMessage({
-        type: 'audioData',
-        audioData: audioData,
-        isFinal: isFinal,
-      });
-    }
-  }
-}
-
-// Register the processor
-registerProcessor('audio-recorder-processor', AudioRecorderProcessor);
-`;
--- a/apps/desktop/src/hooks/useAudioCapture.ts
+++ b/apps/desktop/src/hooks/useAudioCapture.ts
@ -1,13 +1,17 @@
-import { useState, useRef, useEffect } from "react";
-import { MicVAD } from "@ricky0123/vad-web";
-import { audioRecorderWorkletSource } from "./audio-recorder-worklet";
+import { useRef, useEffect, useState, useCallback } from "react";
+import audioWorkletUrl from "@/assets/audio-recorder-processor.js?url";
+import { api } from "@/trpc/react";
+
+// Audio configuration
+const FRAME_SIZE = 512; // 32ms at 16kHz
+const SAMPLE_RATE = 16000;

 export interface UseAudioCaptureParams {
  onAudioChunk: (
    arrayBuffer: ArrayBuffer,
+    speechProbability: number,
    isFinalChunk: boolean,
  ) => Promise<void> | void;
-  chunkDurationMs?: number;
  enabled: boolean;
 }

@ -15,268 +19,136 @@ export interface UseAudioCaptureOutput {
  voiceDetected: boolean;
 }

-interface AudioCaptureState {
-  stream: MediaStream | null;
-  vad: MicVAD | null;
-  audioContext: AudioContext | null;
-  audioWorkletNode: AudioWorkletNode | null;
-  source: MediaStreamAudioSourceNode | null;
-  chunkTimer: NodeJS.Timeout | null;
-  pendingAudioChunks: Float32Array[];
-  sendAudioChunk: ((isFinal: boolean) => Promise<void>) | null;
-}
-
 export const useAudioCapture = ({
  onAudioChunk,
-  chunkDurationMs = 28000,
  enabled,
 }: UseAudioCaptureParams): UseAudioCaptureOutput => {
  const [voiceDetected, setVoiceDetected] = useState(false);
-  const stateRef = useRef<AudioCaptureState>({
-    stream: null,
-    vad: null,
-    audioContext: null,
-    audioWorkletNode: null,
-    source: null,
-    chunkTimer: null,
-    pendingAudioChunks: [],
-    sendAudioChunk: null,
+  const audioContextRef = useRef<AudioContext | null>(null);
+  const sourceRef = useRef<MediaStreamAudioSourceNode | null>(null);
+  const workletNodeRef = useRef<AudioWorkletNode | null>(null);
+  const streamRef = useRef<MediaStream | null>(null);
+
+  // Subscribe to voice detection updates via tRPC
+  api.recording.voiceDetectionUpdates.useSubscription(undefined, {
+    onData: (detected: boolean) => {
+      setVoiceDetected(detected);
+    },
+    onError: (err) => {
+      console.error("Voice detection subscription error:", err);
+    },
  });

-  // Main effect to handle enabled state changes
-  useEffect(() => {
-    let isCancelled = false;
+  const startCapture = useCallback(async () => {
+    try {
+      console.log("AudioCapture: Starting audio capture");

-    const cleanup = async () => {
-      const state = stateRef.current;
-
-      // Send final chunk if we have pending audio
-      if (state.sendAudioChunk) {
-        try {
-          await state.sendAudioChunk(true);
-        } catch (error) {
-          console.error("AudioCapture: Error sending final chunk:", error);
-        }
-      }
-
-      // Clear chunk timer
-      if (state.chunkTimer) {
-        clearInterval(state.chunkTimer);
-        state.chunkTimer = null;
-      }
-
-      // Cleanup AudioWorklet
-      if (state.audioWorkletNode) {
-        state.audioWorkletNode.port.postMessage({ command: "stop" });
-        state.audioWorkletNode.disconnect();
-        state.audioWorkletNode = null;
-      }
-
-      if (state.source) {
-        state.source.disconnect();
-        state.source = null;
-      }
-
-      if (state.audioContext && state.audioContext.state !== "closed") {
-        await state.audioContext.close();
-        state.audioContext = null;
-      }
-
-      // Cleanup VAD
-      if (state.vad) {
-        try {
-          state.vad.destroy();
-          console.log("AudioCapture: VAD destroyed");
-        } catch (e) {
-          console.error("Error destroying VAD:", e);
-        }
-        state.vad = null;
-      }
-
-      // Stop media stream
-      if (state.stream) {
-        state.stream.getTracks().forEach((track) => {
-          try {
-            track.stop();
-          } catch (e) {
-            console.error("Error stopping stream track:", e);
-          }
-        });
-        state.stream = null;
-      }
-
-      // Reset state
-      state.pendingAudioChunks = [];
-      state.sendAudioChunk = null;
-      setVoiceDetected(false);
-
-      console.log("AudioCapture: Cleaned up");
-    };
-
-    const startCapture = async () => {
-      console.log("AudioCapture: Starting capture...");
-
-      try {
-        // Get microphone access
-        const stream = await navigator.mediaDevices.getUserMedia({
-          audio: true,
-        });
-        if (isCancelled) {
-          stream.getTracks().forEach((track) => track.stop());
-          return;
-        }
-        stateRef.current.stream = stream;
-
-        // Set up Web Audio API with AudioWorklet for raw PCM data
-        const audioContext = new AudioContext({ sampleRate: 16000 });
-        stateRef.current.audioContext = audioContext;
-
-        // Load AudioWorklet module using blob URL
-        const blob = new Blob([audioRecorderWorkletSource], {
-          type: "application/javascript",
-        });
-        const audioWorkletUrl = URL.createObjectURL(blob);
-
-        try {
-          await audioContext.audioWorklet.addModule(audioWorkletUrl);
-        } finally {
-          URL.revokeObjectURL(audioWorkletUrl);
-        }
-
-        if (isCancelled) {
-          await cleanup();
-          return;
-        }
-
-        const source = audioContext.createMediaStreamSource(stream);
-        stateRef.current.source = source;
-
-        // Create AudioWorklet node
-        const audioWorkletNode = new AudioWorkletNode(
-          audioContext,
-          "audio-recorder-processor",
-        );
-        stateRef.current.audioWorkletNode = audioWorkletNode;
-
-        // Create function to send accumulated chunks
-        const sendAudioChunk = async (isFinal = false) => {
-          const pendingChunks = stateRef.current.pendingAudioChunks;
-          if (pendingChunks.length > 0) {
-            // Combine all pending chunks into one array
-            const totalLength = pendingChunks.reduce(
-              (sum, chunk) => sum + chunk.length,
-              0,
-            );
-            const combinedChunk = new Float32Array(totalLength);
-            let offset = 0;
-
-            for (const chunk of pendingChunks) {
-              combinedChunk.set(chunk, offset);
-              offset += chunk.length;
-            }
-
-            // Convert Float32Array to ArrayBuffer for IPC
-            const arrayBuffer = combinedChunk.buffer.slice(
-              combinedChunk.byteOffset,
-              combinedChunk.byteOffset + combinedChunk.byteLength,
-            );
-
-            try {
-              await onAudioChunk(arrayBuffer, isFinal);
-              console.log(
-                `AudioCapture: Sent chunk: ${combinedChunk.length} samples, final: ${isFinal}`,
-              );
-            } catch (error) {
-              console.error("AudioCapture: Error processing chunk:", error);
-            }
-
-            stateRef.current.pendingAudioChunks = []; // Clear chunks after sending
-          }
-        };
-
-        stateRef.current.sendAudioChunk = sendAudioChunk;
-
-        // Handle messages from AudioWorklet
-        audioWorkletNode.port.onmessage = (event) => {
-          if (event.data.type === "audioData") {
-            const audioData = event.data.audioData as Float32Array;
-            const isFinal = event.data.isFinal as boolean;
-
-            // Store the audio chunk
-            stateRef.current.pendingAudioChunks.push(audioData);
-
-            if (isFinal) {
-              // Send final chunk immediately
-              sendAudioChunk(true);
-            }
-          }
-        };
-
-        // Set up periodic chunk sending
-        const chunkTimer = setInterval(() => {
-          sendAudioChunk(false);
-        }, chunkDurationMs);
-        stateRef.current.chunkTimer = chunkTimer;
-
-        // Connect the audio processing chain
-        source.connect(audioWorkletNode);
-        console.log("AudioCapture: Connected AudioWorklet processing chain");
-
-        // Set up VAD
-        const vad = await MicVAD.new({
-          stream,
-          model: "v5",
-          onSpeechStart: () => {
-            // Check if component is still mounted before updating state
-            if (!isCancelled) {
-              console.log("VAD: Speech started");
-              setVoiceDetected(true);
-            }
-          },
-          onSpeechEnd: () => {
-            console.log("VAD: Speech ended");
-            // Check if component is still mounted before updating state
-            if (!isCancelled) {
-              console.log("VAD: Speech ended");
-              setVoiceDetected(false);
-            }
-          },
-        });
-
-        // Store VAD reference immediately to ensure proper cleanup
-        stateRef.current.vad = vad;
-
-        if (isCancelled) {
-          await cleanup();
-          return;
-        }
-
-        vad.start();
-        console.log("AudioCapture: VAD started");
-
-        console.log("AudioCapture: Fully started");
-      } catch (err) {
-        console.error("AudioCapture: Error starting:", err);
-        await cleanup();
-        throw err;
-      }
-    };
-
-    // Handle enabled state
-    if (enabled) {
-      startCapture().catch((err) => {
-        console.error("AudioCapture: Failed to start:", err);
+      // Get microphone stream
+      streamRef.current = await navigator.mediaDevices.getUserMedia({
+        audio: {
+          channelCount: 1,
+          sampleRate: SAMPLE_RATE,
+          echoCancellation: true,
+          noiseSuppression: true,
+          autoGainControl: true,
+        },
      });
+
+      // Create audio context
+      audioContextRef.current = new AudioContext({ sampleRate: SAMPLE_RATE });
+
+      // Load audio worklet
+      await audioContextRef.current.audioWorklet.addModule(audioWorkletUrl);
+
+      // Create nodes
+      sourceRef.current = audioContextRef.current.createMediaStreamSource(
+        streamRef.current,
+      );
+      workletNodeRef.current = new AudioWorkletNode(
+        audioContextRef.current,
+        "audio-recorder-processor",
+      );
+
+      // Handle audio frames from worklet
+      workletNodeRef.current.port.onmessage = async (event) => {
+        if (event.data.type === "audioFrame") {
+          const frame = event.data.frame;
+          const isFinal = event.data.isFinal || false;
+
+          // Convert to ArrayBuffer for IPC
+          const arrayBuffer = frame.buffer.slice(
+            frame.byteOffset,
+            frame.byteOffset + frame.byteLength,
+          );
+
+          // Send to main process for VAD processing
+          // Main process will update voice detection state
+          await onAudioChunk(arrayBuffer, 0, isFinal); // Speech probability will come from main
+
+          console.log(
+            `AudioCapture: Sent frame: ${frame.length} samples, isFinal: ${isFinal}`,
+          );
+        }
+      };
+
+      // Connect audio graph
+      sourceRef.current.connect(workletNodeRef.current);
+
+      console.log("AudioCapture: Audio capture started");
+    } catch (error) {
+      console.error("AudioCapture: Failed to start capture:", error);
+      throw error;
+    }
+  }, [onAudioChunk]);
+
+  const stopCapture = useCallback(() => {
+    console.log("AudioCapture: Stopping audio capture");
+
+    // Send flush command to worklet before disconnecting
+    if (workletNodeRef.current) {
+      workletNodeRef.current.port.postMessage({ type: "flush" });
+      console.log("AudioCapture: Sent flush command to worklet");
    }

-    // Cleanup function
-    return () => {
-      isCancelled = true;
-      cleanup().catch((err) => {
-        console.error("AudioCapture: Cleanup error:", err);
+    // Disconnect nodes
+    if (sourceRef.current && workletNodeRef.current) {
+      sourceRef.current.disconnect(workletNodeRef.current);
+    }
+
+    // Close audio context
+    if (audioContextRef.current && audioContextRef.current.state !== "closed") {
+      audioContextRef.current.close();
+    }
+
+    // Stop media stream
+    if (streamRef.current) {
+      streamRef.current.getTracks().forEach((track) => track.stop());
+    }
+
+    // Clear refs
+    audioContextRef.current = null;
+    sourceRef.current = null;
+    workletNodeRef.current = null;
+    streamRef.current = null;
+
+    setVoiceDetected(false);
+    console.log("AudioCapture: Audio capture stopped");
+  }, []);
+
+  // Start/stop based on enabled state
+  useEffect(() => {
+    if (enabled) {
+      startCapture().catch((error) => {
+        console.error("AudioCapture: Failed to start:", error);
      });
+    } else {
+      stopCapture();
+    }
+
+    return () => {
+      stopCapture();
    };
-  }, [enabled, onAudioChunk, chunkDurationMs]);
+  }, [enabled, startCapture, stopCapture]);

  return {
    voiceDetected,
--- a/apps/desktop/src/hooks/useRecording.ts
+++ b/apps/desktop/src/hooks/useRecording.ts
@ -4,11 +4,11 @@ import { useAudioCapture } from "./useAudioCapture";
 import type { RecordingState } from "@/types/recording";

 export interface UseRecordingParams {
-  onAudioChunk: (
-    arrayBuffer: ArrayBuffer,
-    isFinalChunk: boolean,
+  onAudioFrame: (
+    audioBuffer: ArrayBuffer,
+    speechProbability: number,
+    isFinal: boolean,
  ) => Promise<void> | void;
-  chunkDurationMs?: number;
  onRecordingStartCallback?: () => Promise<void> | void;
  onRecordingStopCallback?: () => Promise<void> | void;
 }
@ -21,8 +21,7 @@ export interface UseRecordingOutput {
 }

 export const useRecording = ({
-  onAudioChunk,
-  chunkDurationMs = 28000,
+  onAudioFrame,
  onRecordingStartCallback,
  onRecordingStopCallback,
 }: UseRecordingParams): UseRecordingOutput => {
@ -33,13 +32,25 @@ export const useRecording = ({
    stopRecording: stopRecordingMutation,
  } = useRecordingState();

+  // Create handler for audio chunks - just pass through
+  const handleAudioChunk = useCallback(
+    async (
+      arrayBuffer: ArrayBuffer,
+      speechProbability: number,
+      isFinalChunk: boolean,
+    ) => {
+      // Direct pass-through - no aggregation needed
+      await onAudioFrame(arrayBuffer, speechProbability, isFinalChunk);
+    },
+    [onAudioFrame],
+  );
+
  // Manage audio capture when recording is active
  const isActive =
    recordingStatus === "recording" || recordingStatus === "starting";

  const { voiceDetected } = useAudioCapture({
-    onAudioChunk,
-    chunkDurationMs,
+    onAudioChunk: handleAudioChunk,
    enabled: isActive,
  });

@ -121,7 +132,12 @@ export const useRecording = ({
    } catch (error) {
      console.error("Hook: Error stopping recording:", error);
    }
-  }, [recordingStatus, stopRecordingMutation, onRecordingStopCallback]);
+  }, [
+    recordingStatus,
+    stopRecordingMutation,
+    onRecordingStopCallback,
+    onAudioFrame,
+  ]);

  return {
    recordingStatus,
--- a/apps/desktop/src/main/managers/recording-manager.ts
+++ b/apps/desktop/src/main/managers/recording-manager.ts
@ -4,6 +4,7 @@ import { logger, logPerformance } from "../logger";
 import { ServiceManager } from "./service-manager";
 import { appContextStore } from "../../stores/app-context";
 import type { RecordingState, RecordingStatus } from "../../types/recording";
+import { WindowManager } from "../core/window-manager";

 /**
 * Manages recording state and coordinates audio recording across the application
@ -13,12 +14,17 @@ export class RecordingManager extends EventEmitter {
  private currentSessionId: string | null = null;
  private recordingState: RecordingState = "idle";
  private lastError: string | undefined;
+  private windowManager: WindowManager | null = null;

  constructor(private serviceManager: ServiceManager) {
    super();
    this.setupIPCHandlers();
  }

+  /**
+   * Stores the given WindowManager on this RecordingManager, replacing the
+   * initial null value of the windowManager field.
+   *
+   * @param windowManager - Window manager instance to keep a reference to.
+   */
+  public setWindowManager(windowManager: WindowManager): void {
+    this.windowManager = windowManager;
+  }
+
  private setState(newState: RecordingState, error?: string): void {
    const oldState = this.recordingState;
    this.recordingState = newState;
--- a/apps/desktop/src/main/managers/service-manager.ts
+++ b/apps/desktop/src/main/managers/service-manager.ts
@ -6,6 +6,7 @@ import { SwiftIOBridge } from "../../services/platform/swift-bridge-service";
 import { AutoUpdaterService } from "../services/auto-updater";
 import { WindowManager } from "../core/window-manager";
 import { RecordingManager } from "./recording-manager";
+import { VADService } from "../../services/vad-service";

 /**
 * Manages service initialization and lifecycle
@ -17,6 +18,7 @@ export class ServiceManager {
  private modelManagerService: ModelManagerService | null = null;
  private transcriptionService: TranscriptionService | null = null;
  private settingsService: SettingsService | null = null;
+  private vadService: VADService | null = null;

  private swiftIOBridge: SwiftIOBridge | null = null;
  private autoUpdaterService: AutoUpdaterService | null = null;
@ -34,8 +36,9 @@ export class ServiceManager {
      this.initializeSettingsService();
      await this.initializeModelServices();
      this.initializePlatformServices();
+      await this.initializeVADService();
      await this.initializeAIServices();
-      this.initializeRecordingManager();
+      this.initializeRecordingManager(windowManager);
      this.initializeAutoUpdater(windowManager);

      this.isInitialized = true;
@ -57,6 +60,17 @@ export class ServiceManager {
    await this.modelManagerService.initialize();
  }

+  /**
+   * Creates the VADService and awaits its initialize() call.
+   *
+   * Failures are logged and swallowed so that service startup continues
+   * without voice activity detection.
+   */
+  private async initializeVADService(): Promise<void> {
+    try {
+      // The field is assigned before initialize() is awaited, so it stays set
+      // even when initialize() rejects.
+      // NOTE(review): confirm that consumers of vadService tolerate an instance
+      // whose initialize() failed, or reset the field to null in the catch.
+      this.vadService = new VADService();
+      await this.vadService.initialize();
+      logger.main.info("VAD service initialized");
+    } catch (error) {
+      logger.main.error("Failed to initialize VAD service:", error);
+      // Don't throw - VAD is not critical for basic functionality
+    }
+  }
+
  private async initializeAIServices(): Promise<void> {
    try {
      if (!this.modelManagerService) {
@ -65,7 +79,9 @@ export class ServiceManager {

      this.transcriptionService = new TranscriptionService(
        this.modelManagerService,
+        this.vadService,
      );
+      await this.transcriptionService.initialize();

      // Load and configure formatter
      try {
@ -109,8 +125,9 @@ export class ServiceManager {
    }
  }

-  private initializeRecordingManager(): void {
+  private initializeRecordingManager(windowManager: WindowManager): void {
    this.recordingManager = new RecordingManager(this);
+    this.recordingManager.setWindowManager(windowManager);
    logger.main.info("Recording manager initialized");
  }

@ -191,6 +208,15 @@ export class ServiceManager {
    return this.recordingManager;
  }

+  /**
+   * Returns the VAD service held by this manager, or null when none is set.
+   *
+   * @throws Error when called before the manager has finished initializing.
+   */
+  getVADService(): VADService | null {
+    if (this.isInitialized) {
+      return this.vadService;
+    }
+
+    throw new Error(
+      "ServiceManager not initialized. Call initialize() first.",
+    );
+  }
+
  async cleanup(): Promise<void> {
    if (this.recordingManager) {
      logger.main.info("Cleaning up recording manager...");
@ -201,6 +227,11 @@ export class ServiceManager {
      this.modelManagerService.cleanup();
    }

+    if (this.vadService) {
+      logger.main.info("Cleaning up VAD service...");
+      await this.vadService.dispose();
+    }
+
    if (this.swiftIOBridge) {
      logger.main.info("Stopping Swift helper...");
      this.swiftIOBridge.stopHelper();
--- a/apps/desktop/src/pipeline/core/pipeline-types.ts
+++ b/apps/desktop/src/pipeline/core/pipeline-types.ts
@ -10,6 +10,7 @@ export { PipelineContext, SharedPipelineData } from "./context";
 // Transcription input parameters
 export interface TranscribeParams {
  audioData: Buffer;
+  speechProbability?: number; // Speech probability from frontend VAD (0-1)
  context: {
    vocabulary?: Map<string, string>;
    accessibilityContext?: GetAccessibilityContextResult | null;
@ -34,6 +35,7 @@ export interface FormatParams {
 export interface TranscriptionProvider {
  readonly name: string;
  transcribe(params: TranscribeParams): Promise<string>;
+  flush?(): Promise<string>; // Optional flush method for providers that buffer
 }

 // Formatting provider interface
--- a/apps/desktop/src/pipeline/providers/transcription/whisper-provider.ts
+++ b/apps/desktop/src/pipeline/providers/transcription/whisper-provider.ts
@ -12,6 +12,19 @@ export class WhisperProvider implements TranscriptionProvider {
  private modelManager: ModelManagerService;
  private whisperInstance: Whisper | null = null;

+  // Frame aggregation state
+  private frameBuffer: Float32Array[] = [];
+  private frameBufferSpeechProbabilities: number[] = []; // Track speech probabilities for each frame
+  private silenceFrameCount = 0;
+  private lastSpeechTimestamp = 0;
+
+  // Configuration
+  private readonly FRAME_SIZE = 512; // 32ms at 16kHz
+  private readonly MIN_SPEECH_DURATION_MS = 500; // Minimum speech duration to transcribe
+  private readonly MAX_SILENCE_DURATION_MS = 2000; // Max silence before cutting
+  private readonly SAMPLE_RATE = 16000;
+  private readonly SPEECH_PROBABILITY_THRESHOLD = 0.2; // Threshold for speech detection
+
  constructor(modelManager: ModelManagerService) {
    this.modelManager = modelManager;
  }
@ -21,20 +34,53 @@ export class WhisperProvider implements TranscriptionProvider {
      await this.initializeWhisper();

      // Extract parameters from the new structure
-      const { audioData, context } = params;
+      const { audioData, speechProbability = 0, context } = params;
      const { vocabulary, previousChunk, aggregatedTranscription } = context;

      // Convert audio buffer to the format expected by smart-whisper
      const audioFloat32Array = await this.convertAudioBuffer(audioData);

+      // Add frame to buffer with speech probability
+      this.frameBuffer.push(audioFloat32Array);
+      this.frameBufferSpeechProbabilities.push(speechProbability);
+
+      // Consider it speech if probability is above threshold
+      const isSpeech = speechProbability > this.SPEECH_PROBABILITY_THRESHOLD;
+
      logger.transcription.debug(
-        `Starting transcription, audio size: ${audioData.length}`,
-        previousChunk
-          ? `Previous chunk: ${previousChunk.substring(0, 50)}...`
-          : "No previous chunk",
-        aggregatedTranscription
-          ? `Aggregated length: ${aggregatedTranscription.length}`
-          : "No aggregated transcription",
+        `Frame received - SpeechProb: ${speechProbability.toFixed(3)}, Buffer size: ${this.frameBuffer.length}, Silence count: ${this.silenceFrameCount}`,
+      );
+
+      // Handle speech/silence logic
+      if (isSpeech) {
+        this.silenceFrameCount = 0;
+        this.lastSpeechTimestamp = Date.now();
+      } else {
+        this.silenceFrameCount++;
+      }
+
+      // Determine if we should transcribe
+      const shouldTranscribe = this.shouldTranscribe();
+
+      if (!shouldTranscribe) {
+        // Keep buffering
+        return "";
+      }
+
+      // Aggregate buffered frames
+      const aggregatedAudio = this.aggregateFrames();
+
+      // Skip if too short or only silence
+      if (aggregatedAudio.length < this.FRAME_SIZE * 2) {
+        logger.transcription.debug("Skipping transcription - audio too short");
+        this.frameBuffer = [];
+        this.frameBufferSpeechProbabilities = [];
+        this.silenceFrameCount = 0;
+        return "";
+      }
+
+      logger.transcription.debug(
+        `Starting transcription of ${aggregatedAudio.length} samples (${((aggregatedAudio.length / this.SAMPLE_RATE) * 1000).toFixed(0)}ms)`,
      );

      // Transcribe using smart-whisper
@ -49,10 +95,13 @@ export class WhisperProvider implements TranscriptionProvider {
      );

      const { result } = await this.whisperInstance.transcribe(
-        audioFloat32Array,
+        aggregatedAudio,
        {
          language: "auto",
          initial_prompt: initialPrompt,
+          suppress_blank: true,
+          suppress_non_speech_tokens: true,
+          no_timestamps: true,
        },
      );

@ -68,6 +117,11 @@ export class WhisperProvider implements TranscriptionProvider {
        `Transcription completed, length: ${text.length}`,
      );

+      // Clear buffer after successful transcription
+      this.frameBuffer = [];
+      this.frameBufferSpeechProbabilities = [];
+      this.silenceFrameCount = 0;
+
      return text;
    } catch (error) {
      logger.transcription.error("Transcription failed:", error);
@ -75,6 +129,112 @@ export class WhisperProvider implements TranscriptionProvider {
    }
  }

+  /**
+   * Decides whether the buffered frames should be transcribed now.
+   *
+   * Returns true when either:
+   *  1. the buffer is non-empty and the run of trailing non-speech frames is
+   *     longer than MAX_SILENCE_DURATION_MS, or
+   *  2. the buffer holds more than 30 seconds of audio.
+   * flush() forces case 1 by raising silenceFrameCount before calling
+   * transcribe().
+   *
+   * Durations are derived from frame counts and therefore assume that every
+   * buffered frame holds FRAME_SIZE samples.
+   * NOTE(review): case 1 does not check that the buffer contains any speech.
+   */
+  private shouldTranscribe(): boolean {
+    // Upper bound on buffered audio before transcription is forced
+    const MAX_BUFFER_DURATION_MS = 30000;
+    const framesToMs = (frameCount: number): number =>
+      ((frameCount * this.FRAME_SIZE) / this.SAMPLE_RATE) * 1000;
+
+    const bufferDurationMs = framesToMs(this.frameBuffer.length);
+    const silenceDurationMs = framesToMs(this.silenceFrameCount);
+
+    // Significant silence at the end of a non-empty buffer: transcribe
+    if (
+      this.frameBuffer.length > 0 &&
+      silenceDurationMs > this.MAX_SILENCE_DURATION_MS
+    ) {
+      logger.transcription.debug(
+        `Transcribing due to ${silenceDurationMs}ms of silence`,
+      );
+      return true;
+    }
+
+    // Buffer is too large: transcribe anyway
+    if (bufferDurationMs > MAX_BUFFER_DURATION_MS) {
+      logger.transcription.debug(
+        `Transcribing due to buffer size: ${bufferDurationMs}ms`,
+      );
+      return true;
+    }
+
+    // This is the normal "keep buffering" outcome and is reached for almost
+    // every incoming frame, so it is logged at debug level rather than error.
+    logger.transcription.debug("Not transcribing", {
+      bufferDurationMs,
+      silenceDurationMs,
+      frameBufferLength: this.frameBuffer.length,
+      silenceFrameCount: this.silenceFrameCount,
+    });
+
+    return false;
+  }
+
+  /**
+   * Concatenates all buffered frames, in arrival order, into one contiguous
+   * Float32Array and returns the result of trimSilence() on it.
+   */
+  private aggregateFrames(): Float32Array {
+    // Size the output from the actual frame lengths
+    let sampleCount = 0;
+    for (const frame of this.frameBuffer) {
+      sampleCount += frame.length;
+    }
+    const combined = new Float32Array(sampleCount);
+
+    // Write each frame directly after the previous one
+    this.frameBuffer.reduce((writePos, frame) => {
+      combined.set(frame, writePos);
+      return writePos + frame.length;
+    }, 0);
+
+    // Drop leading and trailing non-speech audio
+    return this.trimSilence(combined);
+  }
+
+  /**
+   * Cuts the aggregated audio down to the span between the first and the last
+   * frame whose speech probability is above SPEECH_PROBABILITY_THRESHOLD.
+   *
+   * Frame indices are converted to sample offsets with FRAME_SIZE, so the
+   * result is exact only while every frame before the last speech frame holds
+   * FRAME_SIZE samples. The end offset is clamped to the audio length.
+   *
+   * NOTE(review): when no frame is above the threshold, the audio is returned
+   * untrimmed rather than empty - confirm this is the intended fallback.
+   *
+   * @param audio - Concatenation of the currently buffered frames.
+   * @returns A copy of the selected sample range.
+   */
+  private trimSilence(audio: Float32Array): Float32Array {
+    const probabilities = this.frameBufferSpeechProbabilities;
+    const threshold = this.SPEECH_PROBABILITY_THRESHOLD;
+
+    // Index of the first and last speech frame, -1 when there is none
+    const firstSpeechFrame = probabilities.findIndex((p) => p > threshold);
+    const lastSpeechFrame = probabilities.reduce(
+      (last, p, frameIndex) => (p > threshold ? frameIndex : last),
+      -1,
+    );
+
+    const startIdx =
+      firstSpeechFrame === -1 ? 0 : firstSpeechFrame * this.FRAME_SIZE;
+    const endIdx =
+      lastSpeechFrame === -1
+        ? audio.length
+        : (lastSpeechFrame + 1) * this.FRAME_SIZE;
+
+    return audio.slice(startIdx, Math.min(endIdx, audio.length));
+  }
+
+  // Force transcription of any remaining frames
+  /**
+   * Forces transcription of any frames that are still buffered.
+   *
+   * Raises silenceFrameCount so that shouldTranscribe() takes its silence
+   * branch, then calls transcribe() with an empty audio buffer and a speech
+   * probability of 0.
+   *
+   * NOTE(review): the context passed here is empty, so no vocabulary or
+   * previous-chunk information reaches this final transcription - confirm
+   * that is intended.
+   *
+   * @returns The transcribed text, or "" when nothing is buffered.
+   */
+  async flush(): Promise<string> {
+    if (this.frameBuffer.length === 0) {
+      return "";
+    }
+
+    // Flushing is a routine event, so log it at debug level rather than error
+    logger.transcription.debug(`Flushing ${this.frameBuffer.length} frames`);
+
+    // Force transcription by setting a high silence count. The count is derived
+    // from the silence threshold (with the previous value of 999 as a floor) so
+    // it still exceeds MAX_SILENCE_DURATION_MS if the constants are changed.
+    const framesPerMaxSilence =
+      ((this.MAX_SILENCE_DURATION_MS / 1000) * this.SAMPLE_RATE) /
+      this.FRAME_SIZE;
+    this.silenceFrameCount = Math.max(999, Math.ceil(framesPerMaxSilence) + 1);
+
+    return this.transcribe({
+      audioData: Buffer.alloc(0), // Empty buffer, we'll use the buffered frames
+      speechProbability: 0,
+      context: {},
+    });
+  }
+
  private generateInitialPrompt(
    vocabulary?: Map<string, string>,
    aggregatedTranscription?: string,
@ -163,5 +323,10 @@ export class WhisperProvider implements TranscriptionProvider {
        this.whisperInstance = null;
      }
    }
+
+    // Clear buffers
+    this.frameBuffer = [];
+    this.frameBufferSpeechProbabilities = [];
+    this.silenceFrameCount = 0;
  }
 }
--- a/apps/desktop/src/renderer/widget/pages/widget/components/FloatingButton.tsx
+++ b/apps/desktop/src/renderer/widget/pages/widget/components/FloatingButton.tsx
@ -19,24 +19,27 @@ export const FloatingButton: React.FC = () => {
    };
  }, []);

-  const handleAudioChunk = useCallback(
-    async (audioChunk: ArrayBuffer, isFinalChunk: boolean) => {
+  const handleAudioFrame = useCallback(
+    async (
+      audioBuffer: ArrayBuffer,
+      speechProbability: number,
+      isFinal: boolean,
+    ) => {
      try {
-        // Send the audio chunk regardless of whether it's final or not
-        await window.electronAPI.sendAudioChunk(audioChunk, isFinalChunk);
-        console.debug(`Sent audio chunk`, {
-          chunkSize: audioChunk.byteLength,
-          isFinalChunk,
+        // Send frame directly to main process
+        // TODO: We need to update the IPC to include speech detection info
+        await window.electronAPI.sendAudioChunk(audioBuffer, isFinal);
+        console.debug(`Sent audio frame`, {
+          size: audioBuffer.byteLength,
+          speechProbability: speechProbability.toFixed(3),
+          isFinal,
        });

-        if (isFinalChunk) {
-          console.log("Final chunk sent to main process");
-          // You might want to add a specific IPC call here if the main process needs an explicit signal
-          // to finalize transcription, e.g., window.electronAPI.finalizeTranscription();
-          // For now, we assume sendAudioChunk is enough and the main process handles the stream end.
+        if (isFinal) {
+          console.log("Final frame sent to main process");
        }
      } catch (error) {
-        console.error("Error sending audio chunk:", error);
+        console.error("Error sending audio frame:", error);
      }
    },
    [],
@ -44,8 +47,7 @@ export const FloatingButton: React.FC = () => {

  const { recordingStatus, startRecording, stopRecording, voiceDetected } =
    useRecording({
-      onAudioChunk: handleAudioChunk,
-      // Optionally, set chunkDurationMs here if needed, e.g., chunkDurationMs: 250
+      onAudioFrame: handleAudioFrame,
    });
  const isRecording =
    recordingStatus === "recording" || recordingStatus === "starting";
--- a/apps/desktop/src/services/transcription-service.ts
+++ b/apps/desktop/src/services/transcription-service.ts
@ -7,11 +7,11 @@ import { createDefaultContext } from "../pipeline/core/context";
 import { WhisperProvider } from "../pipeline/providers/transcription/whisper-provider";
 import { OpenRouterProvider } from "../pipeline/providers/formatting/openrouter-formatter";
 import { ModelManagerService } from "../services/model-manager";
-import { ServiceManager } from "../main/managers/service-manager";
 import { appContextStore } from "../stores/app-context";
 import { createTranscription } from "../db/transcriptions";
 import { logger } from "../main/logger";
 import { v4 as uuid } from "uuid";
+import { VADService } from "./vad-service";

 /**
 * Service for audio transcription and optional formatting
@ -21,9 +21,23 @@ export class TranscriptionService {
  private openRouterProvider: OpenRouterProvider | null = null;
  private formatterEnabled = false;
  private streamingSessions: Map<string, StreamingSession> = new Map();
+  private vadService: VADService | null = null;

-  constructor(modelManagerService: ModelManagerService) {
+  constructor(
+    modelManagerService: ModelManagerService,
+    vadService: VADService | null = null,
+  ) {
    this.whisperProvider = new WhisperProvider(modelManagerService);
+    this.vadService = vadService;
+  }
+
+  async initialize(): Promise<void> {
+    if (this.vadService) {
+      logger.transcription.info("Using VAD service");
+    } else {
+      logger.transcription.warn("VAD service not available");
+    }
+    logger.transcription.info("Transcription service initialized");
  }

  /**
@ -62,6 +76,26 @@ export class TranscriptionService {
    isFinal?: boolean;
  }): Promise<string> {
    const { sessionId, audioChunk, isFinal = false } = options;
+    console.error("processing streaming chunk", {
+      length: audioChunk.length,
+    });
+
+    // Run VAD on the audio chunk
+    let speechProbability = 0;
+    let isSpeaking = false;
+
+    if (audioChunk.length > 0 && this.vadService) {
+      const vadResult = await this.vadService.processAudioFrame(
+        audioChunk.buffer as ArrayBuffer,
+      );
+      speechProbability = vadResult.probability;
+      isSpeaking = vadResult.isSpeaking;
+
+      logger.transcription.debug("VAD result", {
+        probability: speechProbability.toFixed(3),
+        isSpeaking,
+      });
+    }

    // Auto-create session if it doesn't exist
    let session = this.streamingSessions.get(sessionId);
@ -90,7 +124,7 @@ export class TranscriptionService {

    // Process chunk if it has content
    if (audioChunk.length > 0) {
-      // Direct provider call - no step wrapper
+      // Direct frame to Whisper - it will handle aggregation and VAD internally
      const previousChunk =
        session.transcriptionResults.length > 0
          ? session.transcriptionResults[
@ -103,6 +137,7 @@ export class TranscriptionService {

      const chunkTranscription = await this.whisperProvider.transcribe({
        audioData: audioChunk,
+        speechProbability: speechProbability, // Now from VAD service
        context: {
          vocabulary: session.context.sharedData.vocabulary,
          accessibilityContext: session.context.sharedData.accessibilityContext,
@ -111,22 +146,39 @@ export class TranscriptionService {
        },
      });

-      // Accumulate the result
+      // Accumulate the result only if Whisper returned something
+      // (it returns empty string while buffering)
      if (chunkTranscription.trim()) {
        session.transcriptionResults.push(chunkTranscription);
+        logger.transcription.info("Whisper returned transcription", {
+          sessionId,
+          transcriptionLength: chunkTranscription.length,
+          totalResults: session.transcriptionResults.length,
+        });
      }

-      logger.transcription.debug("Processed chunk", {
+      logger.transcription.error("Processed frame", {
        sessionId,
-        chunkSize: audioChunk.length,
-        transcriptionLength: chunkTranscription.length,
-        totalResults: session.transcriptionResults.length,
+        frameSize: audioChunk.length,
+        hadTranscription: chunkTranscription.length > 0,
        isFinal,
      });
    }

-    // If this is the final chunk, apply formatting and save
+    // If this is the final chunk, flush any remaining audio and apply formatting
    if (isFinal) {
+      // Flush any remaining buffered audio in Whisper
+      if (this.whisperProvider.flush) {
+        const flushResult = await this.whisperProvider.flush();
+        if (flushResult.trim()) {
+          session.transcriptionResults.push(flushResult);
+          logger.transcription.info("Flushed final audio", {
+            sessionId,
+            flushLength: flushResult.length,
+          });
+        }
+      }
+
      // Get complete transcription
      let completeTranscription = session.transcriptionResults.join(" ").trim();

@ -137,7 +189,7 @@ export class TranscriptionService {
      });

      // Format if enabled
-      if (this.formatterEnabled && this.openRouterProvider) {
+      if (this.formatterEnabled && this.openRouterProvider && false) {
        const style =
          session.context.sharedData.userPreferences?.formattingStyle;
        completeTranscription = await this.openRouterProvider.format({
@ -188,19 +240,9 @@ export class TranscriptionService {
    // Create default context
    const context = createDefaultContext(uuid());

-    // Simple context building - no complex loading
-    const serviceManager = ServiceManager.getInstance();
-    if (serviceManager) {
-      try {
-        const settingsService = serviceManager.getSettingsService();
-        const formatterConfig = await settingsService.getFormatterConfig();
-      } catch (error) {
-        logger.transcription.warn("Failed to load formatter config", { error });
-      }
-    }
-
    // TODO: Load actual vocabulary
    // TODO: Load user preferences from settings
+    // TODO: Load formatter config from settings

    return context;
  }
@ -210,6 +252,7 @@ export class TranscriptionService {
   */
  async dispose(): Promise<void> {
    await this.whisperProvider.dispose();
+    // VAD service is managed by ServiceManager
    logger.transcription.info("Transcription service disposed");
  }
 }
--- a/apps/desktop/src/services/vad-service.ts
+++ b/apps/desktop/src/services/vad-service.ts
@ -0,0 +1,192 @@
+import * as ort from "onnxruntime-node";
+import { logger } from "../main/logger";
+import { app } from "electron";
+import * as path from "path";
+import { EventEmitter } from "node:events";
+import { existsSync } from "node:fs";
+
+/**
+ * Voice-activity detector backed by the Silero VAD ONNX model.
+ *
+ * Audio frames are fed in through `processAudioFrame`; every call returns the
+ * model's raw speech probability together with a smoothed "is speaking" flag.
+ * A `"voice-detected"` event carrying `true` or `false` is emitted whenever
+ * the smoothed flag flips.
+ */
+export class VADService extends EventEmitter {
+  private session: ort.InferenceSession | null = null;
+  private modelPath: string | null = null;
+  private state: ort.Tensor | null = null;
+  private readonly sr = 16000;
+
+  // Configuration
+  private readonly WINDOW_SIZE_SAMPLES = 512; // 32ms at 16kHz
+  private readonly SPEECH_THRESHOLD = 0.2;
+  private readonly REDEMPTION_FRAMES = 8;
+  // Speech frames that must accumulate before speech is reported as started
+  private readonly MIN_SPEECH_FRAMES = 3;
+
+  // State
+  private speechFrameCount = 0;
+  private silenceFrameCount = 0;
+  private isSpeaking = false;
+
+  /**
+   * Locate the bundled model file, load it into an ONNX Runtime session and
+   * zero the model state.
+   *
+   * @throws Error when the model file is missing; any failure is logged and
+   *   rethrown.
+   */
+  async initialize(): Promise<void> {
+    try {
+      // Handle both development and production paths
+      const modelPath = app.isPackaged
+        ? // In production, the assets are copied to the resources folder
+          path.join(process.resourcesPath, "assets", "silero_vad_v5.onnx")
+        : // In development, use the source path
+          path.join(__dirname, "../../src/assets/silero_vad_v5.onnx");
+      this.modelPath = modelPath;
+
+      logger.main.info("Loading VAD model from", this.modelPath);
+
+      // Check if the model file exists
+      if (!existsSync(modelPath)) {
+        throw new Error(
+          `VAD model file not found at: ${this.modelPath}. ` +
+            `Make sure the ONNX model is in the assets folder.`,
+        );
+      }
+
+      // Load ONNX model
+      this.session = await ort.InferenceSession.create(modelPath, {
+        executionProviders: ["cpu"], // Use CPU provider for compatibility
+      });
+
+      // Start from an all-zero model state
+      this.resetStates();
+
+      logger.main.info("VAD service initialized successfully");
+    } catch (error) {
+      logger.main.error("Failed to initialize VAD service:", error);
+      throw error;
+    }
+  }
+
+  /** Replace the model state with a fresh all-zero tensor. */
+  private resetStates(): void {
+    // Silero VAD uses a state tensor with shape [2, 1, 128];
+    // a new Float32Array is already zero-filled
+    this.state = new ort.Tensor(
+      "float32",
+      new Float32Array(2 * 1 * 128),
+      [2, 1, 128],
+    );
+  }
+
+  /**
+   * Run one inference pass over the given samples and update the smoothed
+   * speaking state.
+   *
+   * @param audioFrames - Samples passed to the model as a `[1, length]` tensor.
+   * @returns The model's speech probability and the smoothed speaking flag.
+   * @throws Error when the service is not initialized, when the model does
+   *   not return the expected outputs, or when inference itself fails.
+   */
+  async processBatch(
+    audioFrames: Float32Array,
+  ): Promise<{ probability: number; isSpeaking: boolean }> {
+    if (!this.session || !this.state) {
+      throw new Error("VAD service not initialized");
+    }
+
+    try {
+      // Create input tensor - shape should be [1, audio_length]
+      const inputTensor = new ort.Tensor("float32", audioFrames, [
+        1,
+        audioFrames.length,
+      ]);
+
+      const srTensor = new ort.Tensor(
+        "int64",
+        BigInt64Array.from([BigInt(this.sr)]),
+        [],
+      );
+
+      // Run inference with input, state, and sr
+      const results = await this.session.run({
+        input: inputTensor,
+        state: this.state,
+        sr: srTensor,
+      });
+
+      // Validate before touching our own state: storing a missing tensor would
+      // later surface as a misleading "not initialized" error
+      const nextState = results.stateN;
+      const output = results.output;
+      if (!nextState || !output) {
+        throw new Error(
+          "VAD model did not return the expected 'output' and 'stateN' tensors",
+        );
+      }
+
+      // Update state for next iteration
+      this.state = nextState as ort.Tensor;
+
+      // Get speech probability
+      const probability = (output as ort.Tensor).data[0] as number;
+
+      // Apply smoothing logic
+      const isSpeaking = this.applySpeechDetectionLogic(probability);
+
+      return { probability, isSpeaking };
+    } catch (error) {
+      logger.main.error("VAD inference failed:", error);
+      throw error;
+    }
+  }
+
+  /**
+   * Fold one frame's probability into the frame counters and decide whether
+   * speech has started or ended, emitting `"voice-detected"` on a change.
+   *
+   * @param probability - Speech probability of the current frame.
+   * @returns The smoothed speaking flag after this frame.
+   */
+  private applySpeechDetectionLogic(probability: number): boolean {
+    const isSpeechFrame = probability > this.SPEECH_THRESHOLD;
+
+    if (isSpeechFrame) {
+      this.speechFrameCount++;
+      this.silenceFrameCount = 0;
+    } else {
+      this.silenceFrameCount++;
+      if (this.silenceFrameCount > this.REDEMPTION_FRAMES) {
+        this.speechFrameCount = 0;
+      }
+    }
+
+    // Start speaking after enough speech frames
+    if (!this.isSpeaking && this.speechFrameCount >= this.MIN_SPEECH_FRAMES) {
+      this.isSpeaking = true;
+      logger.main.debug("Speech started");
+      this.emit("voice-detected", true);
+    }
+
+    // Stop speaking after enough silence
+    if (this.isSpeaking && this.silenceFrameCount >= this.REDEMPTION_FRAMES) {
+      this.isSpeaking = false;
+      logger.main.debug("Speech ended");
+      this.emit("voice-detected", false);
+    }
+
+    return this.isSpeaking;
+  }
+
+  /**
+   * Interpret a raw buffer as 32-bit float samples, fit it to the model's
+   * window size and run it through the detector.
+   *
+   * Shorter input is zero-padded; longer input is cut to the first
+   * `WINDOW_SIZE_SAMPLES` samples, so anything beyond that is not analysed.
+   *
+   * @param audioBuffer - Raw bytes holding float32 samples.
+   * @returns The model's speech probability and the smoothed speaking flag.
+   */
+  async processAudioFrame(
+    audioBuffer: ArrayBuffer,
+  ): Promise<{ probability: number; isSpeaking: boolean }> {
+    // View only whole samples: a byte length that is not a multiple of four
+    // would otherwise make the Float32Array constructor throw a RangeError
+    const bytesPerSample = Float32Array.BYTES_PER_ELEMENT;
+    const sampleCount = Math.floor(audioBuffer.byteLength / bytesPerSample);
+    const samples = new Float32Array(audioBuffer, 0, sampleCount);
+
+    // Silero VAD requires exactly 512 samples
+    if (samples.length === this.WINDOW_SIZE_SAMPLES) {
+      return this.processBatch(samples);
+    }
+
+    if (samples.length < this.WINDOW_SIZE_SAMPLES) {
+      // Fewer samples (e.g., final buffer flush): pad with zeros
+      const padded = new Float32Array(this.WINDOW_SIZE_SAMPLES);
+      padded.set(samples);
+      return this.processBatch(padded);
+    }
+
+    // More samples: just process the first 512
+    return this.processBatch(samples.slice(0, this.WINDOW_SIZE_SAMPLES));
+  }
+
+  /** @returns The current smoothed speaking flag. */
+  getSpeechState(): boolean {
+    return this.isSpeaking;
+  }
+
+  /**
+   * Release the inference session and clear all detector state.
+   */
+  async dispose(): Promise<void> {
+    if (this.session) {
+      await this.session.release();
+      this.session = null;
+    }
+    this.state = null;
+
+    // Clear the smoothing state too, so a stale "speaking" flag or frame
+    // count does not survive disposal
+    this.speechFrameCount = 0;
+    this.silenceFrameCount = 0;
+    this.isSpeaking = false;
+
+    logger.main.info("VAD service disposed");
+  }
+}
--- a/apps/desktop/src/trpc/routers/recording.ts
+++ b/apps/desktop/src/trpc/routers/recording.ts
@ -3,6 +3,7 @@ import { observable } from "@trpc/server/observable";
 import superjson from "superjson";
 import { ServiceManager } from "../../main/managers/service-manager";
 import type { RecordingStatus } from "../../types/recording";
+import { logger } from "../../main/logger";

 const t = initTRPC.create({
  isServer: true,
@ -61,4 +62,36 @@ export const recordingRouter = t.router({
      };
    });
  }),
+
+  /**
+   * Voice detection subscription.
+   *
+   * Forwards every `"voice-detected"` event of the VAD service to the
+   * subscriber. When no VAD service is available the stream reports `false`
+   * once and then completes.
+   */
+  voiceDetectionUpdates: t.procedure.subscription(() => {
+    return observable<boolean>((emit) => {
+      const serviceManager = ServiceManager.getInstance();
+      if (!serviceManager) {
+        throw new Error("ServiceManager not initialized");
+      }
+
+      const vadService = serviceManager.getVADService();
+      if (!vadService) {
+        logger.main.warn(
+          "VAD service not available for voice detection subscription",
+        );
+        // No listener is attached, so no further values can ever arrive:
+        // emit false and actually complete instead of leaving the stream open
+        emit.next(false);
+        emit.complete();
+        return () => {};
+      }
+
+      // Set up listener for voice detection changes
+      const handleVoiceDetection = (detected: boolean) => {
+        emit.next(detected);
+      };
+
+      vadService.on("voice-detected", handleVoiceDetection);
+
+      // Cleanup function
+      return () => {
+        vadService.off("voice-detected", handleVoiceDetection);
+      };
+    });
+  }),
 });
--- a/apps/desktop/src/types/vite-env.d.ts
+++ b/apps/desktop/src/types/vite-env.d.ts
@ -0,0 +1,13 @@
+/// <reference types="vite/client" />
+
+// Imports ending in `?url` have a string default export
+declare module "*?url" {
+  const assetUrl: string;
+  export default assetUrl;
+}
+
+// Imports ending in `?raw` have a string default export
+declare module "*?raw" {
+  const rawText: string;
+  export default rawText;
+}
--- a/apps/desktop/vite.main.config.mts
+++ b/apps/desktop/vite.main.config.mts
@ -14,6 +14,7 @@ export default defineConfig({
        "@libsql/linux-x64-musl",
        "@libsql/win32-x64-msvc",
        "libsql",
+        "onnxruntime-node",
        /^node:/,
        /^electron$/,
      ],
--- a/package.json
+++ b/package.json
@ -38,7 +38,8 @@
      "drizzle-orm/libsql",
      "@libsql",
      "macos-alias",
-      "fs-xattr"
+      "fs-xattr",
+      "onnxruntime-node"
    ]
  }
 }
--- a/pnpm-lock.yaml
+++ b/pnpm-lock.yaml
@ -128,9 +128,6 @@ importers:
      '@radix-ui/react-tooltip':
        specifier: ^1.2.7
        version: 1.2.7(@types/react-dom@19.1.5(@types/react@19.1.5))(@types/react@19.1.5)(react-dom@19.1.0(react@19.1.0))(react@19.1.0)
-      '@ricky0123/vad-web':
-        specifier: ^0.0.24
-        version: 0.0.24
      '@tabler/icons-react':
        specifier: ^3.34.0
        version: 3.34.0(react@19.1.0)
@ -218,6 +215,9 @@ importers:
      next-themes:
        specifier: ^0.4.6
        version: 0.4.6(react-dom@19.1.0(react@19.1.0))(react@19.1.0)
+      onnxruntime-node:
+        specifier: ^1.20.1
+        version: 1.22.0
      openai:
        specifier: ^4.98.0
        version: 4.103.0(encoding@0.1.13)(ws@8.18.0)(zod@3.25.67)
@ -2042,36 +2042,6 @@ packages:
    resolution: {integrity: sha512-ROFF39F6ZrnzSUEmQQZUar0Jt4xVoP9WnDRdWwF4NNcXs3xBTLgBUDoOwW141y1jP+S8nahIbdxbFC7IShw9Iw==}
    engines: {node: ^12.20.0 || ^14.18.0 || >=16.0.0}

-  '@protobufjs/aspromise@1.1.2':
-    resolution: {integrity: sha512-j+gKExEuLmKwvz3OgROXtrJ2UG2x8Ch2YZUxahh+s1F2HZ+wAceUNLkvy6zKCPVRkU++ZWQrdxsUeQXmcg4uoQ==}
-
-  '@protobufjs/base64@1.1.2':
-    resolution: {integrity: sha512-AZkcAA5vnN/v4PDqKyMR5lx7hZttPDgClv83E//FMNhR2TMcLUhfRUBHCmSl0oi9zMgDDqRUJkSxO3wm85+XLg==}
-
-  '@protobufjs/codegen@2.0.4':
-    resolution: {integrity: sha512-YyFaikqM5sH0ziFZCN3xDC7zeGaB/d0IUb9CATugHWbd1FRFwWwt4ld4OYMPWu5a3Xe01mGAULCdqhMlPl29Jg==}
-
-  '@protobufjs/eventemitter@1.1.0':
-    resolution: {integrity: sha512-j9ednRT81vYJ9OfVuXG6ERSTdEL1xVsNgqpkxMsbIabzSo3goCjDIveeGv5d03om39ML71RdmrGNjG5SReBP/Q==}
-
-  '@protobufjs/fetch@1.1.0':
-    resolution: {integrity: sha512-lljVXpqXebpsijW71PZaCYeIcE5on1w5DlQy5WH6GLbFryLUrBD4932W/E2BSpfRJWseIL4v/KPgBFxDOIdKpQ==}
-
-  '@protobufjs/float@1.0.2':
-    resolution: {integrity: sha512-Ddb+kVXlXst9d+R9PfTIxh1EdNkgoRe5tOX6t01f1lYWOvJnSPDBlG241QLzcyPdoNTsblLUdujGSE4RzrTZGQ==}
-
-  '@protobufjs/inquire@1.1.0':
-    resolution: {integrity: sha512-kdSefcPdruJiFMVSbn801t4vFK7KB/5gd2fYvrxhuJYg8ILrmn9SKSX2tZdV6V+ksulWqS7aXjBcRXl3wHoD9Q==}
-
-  '@protobufjs/path@1.1.2':
-    resolution: {integrity: sha512-6JOcJ5Tm08dOHAbdR3GrvP+yUUfkjG5ePsHYczMFLq3ZmMkAD98cDgcT2iA1lJ9NVwFd4tH/iSSoe44YWkltEA==}
-
-  '@protobufjs/pool@1.1.0':
-    resolution: {integrity: sha512-0kELaGSIDBKvcgS4zkjz1PeddatrjYcmMWOlAuAPwAeccUrPHdUqo/J6LiymHHEiJT5NrF1UVwxY14f+fy4WQw==}
-
-  '@protobufjs/utf8@1.1.0':
-    resolution: {integrity: sha512-Vvn3zZrhQZkkBE8LSuW3em98c0FwgO4nxzv6OdSxPKJIEKY2bGbHn+mhGIPerzI4twdxaP8/0+06HBpwf345Lw==}
-
  '@radix-ui/number@1.1.1':
    resolution: {integrity: sha512-MkKCwxlXTgz6CFoJx3pCwn07GKp36+aZyu/u2Ln2VrA5DcdyCZkASEDBTd8x5whTQQL5CiYf4prXKLcgQdv29g==}

@ -2684,9 +2654,6 @@ packages:
  '@radix-ui/rect@1.1.1':
    resolution: {integrity: sha512-HPwpGIzkl28mWyZqG52jiqDJ12waP11Pa1lGoiyUkIEuMLBP0oeK/C89esbXrxsky5we7dfd8U58nm0SgAWpVw==}

-  '@ricky0123/vad-web@0.0.24':
-    resolution: {integrity: sha512-uv6GWW/kq8BkVErMQzXp3uwSyYMT3w/3QJiUerVaaKp7EwhOTIRY+96EoyFdG2WOFU5RkLk/2CVGbI7nDlxhEg==}
-
  '@rollup/plugin-commonjs@28.0.6':
    resolution: {integrity: sha512-XSQB1K7FUU5QP+3lOQmVCE3I0FcbbNvmNT4VJSj93iUjayaARrTQeoRdiYQoftAJBLrR9t2agwAd3ekaTgHNlw==}
    engines: {node: '>=16.0.0 || 14 >= 14.17'}
@ -3317,9 +3284,6 @@ packages:
  '@types/keyv@3.1.4':
    resolution: {integrity: sha512-BQ5aZNSCpj7D6K2ksrRCTmKRLEpnPvWDiLPfoGyhZ++8YtiK9d/3DBKPJgry359X/P1PfruyYwvnvwFjuEiEIg==}

-  '@types/long@4.0.2':
-    resolution: {integrity: sha512-MqTGEo5bj5t157U6fA/BiDynNkn0YknVdh48CMPkTSpFTVmvao5UQmm7uEF6xBEo7qIMAlY/JSleYaE6VOdpaA==}
-
  '@types/mdast@4.0.4':
    resolution: {integrity: sha512-kGaNbPh1k7AFzgpud/gMdvIm5xuECykRR+JnWKQno9TAXVa6WIVCGTPvYGekIDL4uwCZQSYbUxNBSb1aUo79oA==}

@ -3486,6 +3450,10 @@ packages:
    engines: {node: '>=0.4.0'}
    hasBin: true

+  adm-zip@0.5.16:
+    resolution: {integrity: sha512-TGw5yVi4saajsSEgz25grObGHEUaDrniwvA2qwSC060KfqGPdglhvPMA2lPIoxs3PQIItj2iag35fONcQqgUaQ==}
+    engines: {node: '>=12.0'}
+
  agent-base@6.0.2:
    resolution: {integrity: sha512-RZNwNclF7+MS/8bDg70amg32dyeZGZxiDuQmZxKLAlQjr3jGyLx+4Kkk58UO7D2QdgFIQCovuSuZESne6RG6XQ==}
    engines: {node: '>= 6.0.0'}
@ -4953,9 +4921,6 @@ packages:
    resolution: {integrity: sha512-f7ccFPK3SXFHpx15UIGyRJ/FJQctuKZ0zVuN3frBo4HnK3cay9VEW0R6yPYFHC0AgqhukPzKjq22t5DmAyqGyw==}
    engines: {node: '>=16'}

-  flatbuffers@1.12.0:
-    resolution: {integrity: sha512-c7CZADjRcl6j0PlvFy0ZqXQ67qSEZfrVPynmnL+2zPc+NtMvrF8Y0QceMo7QqnSPc7+uWjUIAbvCQ5WIKlMVdQ==}
-
  flatted@3.3.3:
    resolution: {integrity: sha512-GX+ysw4PBCz0PzosHDepZGANEuFCMLrnRTiEy9McGjmkCQYwRq4A/X786G/fjM/+OjsWSU1ZrY5qyARZmO/uwg==}

@ -5266,9 +5231,6 @@ packages:
    resolution: {integrity: sha512-5v6yZd4JK3eMI3FqqCouswVqwugaA9r4dNZB1wwcmrD02QkV5H0y7XBQW8QwQqEaZY1pM9aqORSORhJRdNK44Q==}
    engines: {node: '>=6.0'}

-  guid-typescript@1.0.9:
-    resolution: {integrity: sha512-Y8T4vYhEfwJOTbouREvG+3XDsjr8E3kIr7uf+JZ0BYloFsttiHU0WfvANVsR7TxNUJa/WpCnw/Ino/p+DeBhBQ==}
-
  handlebars@4.7.7:
    resolution: {integrity: sha512-aAcXm5OAfE/8IXkcZvCepKU3VzW1/39Fb5ZuqMtgI/hT8X2YgoMvBY5dLhq/cpOvw7Lk1nK/UF71aLG/ZnVYRA==}
    engines: {node: '>=0.4.7'}
@ -5918,9 +5880,6 @@ packages:
    resolution: {integrity: sha512-5UtUDQ/6edw4ofyljDNcOVJQ4c7OjDro4h3y8e1GQL5iYElYclVHJ3zeWchylvMaKnDbDilC8irOVyexnA/Slw==}
    engines: {node: ^12.20.0 || ^14.13.1 || >=16.0.0}

-  long@4.0.0:
-    resolution: {integrity: sha512-XsP+KhQif4bjX1kbuSiySJFNAehNxgLb6hPRGJ9QsUr8ajHkuXGdrHmFUTUUXhDwVX2R5bY4JNZEwbUiMhV+MA==}
-
  longest-streak@3.1.0:
    resolution: {integrity: sha512-9Ri+o0JYgehTaVBBDoMqIl8GXtbWg711O3srftcHhZ0dqnETqLaoIK0x17fUw9rFSlK/0NlsKe0Ahhyl5pXE2g==}

@ -6506,14 +6465,12 @@ packages:
  oniguruma-to-es@4.3.3:
    resolution: {integrity: sha512-rPiZhzC3wXwE59YQMRDodUwwT9FZ9nNBwQQfsd1wfdtlKEyCdRV0avrTcSZ5xlIvGRVPd/cx6ZN45ECmS39xvg==}

-  onnx-proto@4.0.4:
-    resolution: {integrity: sha512-aldMOB3HRoo6q/phyB6QRQxSt895HNNw82BNyZ2CMh4bjeKv7g/c+VpAFtJuEMVfYLMbRx61hbuqnKceLeDcDA==}
+  onnxruntime-common@1.22.0:
+    resolution: {integrity: sha512-vcuaNWgtF2dGQu/EP5P8UI5rEPEYqXG2sPPe5j9lg2TY/biJF8eWklTMwlDO08iuXq48xJo0awqIpK5mPG+IxA==}

-  onnxruntime-common@1.14.0:
-    resolution: {integrity: sha512-3LJpegM2iMNRX2wUmtYfeX/ytfOzNwAWKSq1HbRrKc9+uqG/FsEA0bbKZl1btQeZaXhC26l44NWpNUeXPII7Ew==}
-
-  onnxruntime-web@1.14.0:
-    resolution: {integrity: sha512-Kcqf43UMfW8mCydVGcX9OMXI2VN17c0p6XvR7IPSZzBf/6lteBzXHvcEVWDPmCKuGombl997HgLqj91F11DzXw==}
+  onnxruntime-node@1.22.0:
+    resolution: {integrity: sha512-QaAqr7PFekrmEsmu1rpw7OxJYyG+iACjNHoNtQIVt9Oh7st8WDPIIUe6KhF9l35HVJTJd9CV1rePoPmKhSV26g==}
+    os: [win32, darwin, linux]

  openai@4.103.0:
    resolution: {integrity: sha512-eWcz9kdurkGOFDtd5ySS5y251H2uBgq9+1a2lTBnjMMzlexJ40Am5t6Mu76SSE87VvitPa0dkIAp75F+dZVC0g==}
@ -6715,9 +6672,6 @@ packages:
    resolution: {integrity: sha512-udgsAY+fTnvv7kI7aaxbqwWNb0AHiB0qBO89PZKPkoTmGOgdbrHDKD+0B2X4uTfJ/FT1R09r9gTsjUjNJotuog==}
    engines: {node: '>=0.10.0'}

-  platform@1.3.6:
-    resolution: {integrity: sha512-fnWVljUchTro6RiCFvCXBbNhJc2NijN7oIQxbwsyL0buWJPG85v81ehlHI9fXrJsMNgTofEoWIQeClKpgxFLrg==}
-
  plist@3.1.0:
    resolution: {integrity: sha512-uysumyrvkUX0rX/dEVqt8gC3sTBzd4zoWfLeS29nb53imdaXVvLINYXTI2GNqzaMuvacNx4uJQ8+b3zXR0pkgQ==}
    engines: {node: '>=10.4.0'}
@ -6804,10 +6758,6 @@ packages:
  property-information@7.1.0:
    resolution: {integrity: sha512-TwEZ+X+yCJmYfL7TPUOcvBZ4QfoT5YenQiJuX//0th53DE6w0xxLEtfK3iyryQFddXuvkIk51EEgrJQ0WJkOmQ==}

-  protobufjs@6.11.4:
-    resolution: {integrity: sha512-5kQWPaJHi1WoCpjTGszzQ32PG2F4+wRY6BmAT4Vfw56Q2FZ4YZzK20xUYQH4YkfehY1e6QSICrJquM6xXZNcrw==}
-    hasBin: true
-
  proxy-addr@2.0.7:
    resolution: {integrity: sha512-llQsMLSUDUPT44jdrU/O37qlnifitDP+ZwrmmZcoSKyLKvtZxpyV0n2/bD/N4tBAAZ/gJEdZU7KMraoK1+XYAg==}
    engines: {node: '>= 0.10'}
@ -10147,29 +10097,6 @@ snapshots:

  '@pkgr/core@0.2.4': {}

-  '@protobufjs/aspromise@1.1.2': {}
-
-  '@protobufjs/base64@1.1.2': {}
-
-  '@protobufjs/codegen@2.0.4': {}
-
-  '@protobufjs/eventemitter@1.1.0': {}
-
-  '@protobufjs/fetch@1.1.0':
-    dependencies:
-      '@protobufjs/aspromise': 1.1.2
-      '@protobufjs/inquire': 1.1.0
-
-  '@protobufjs/float@1.0.2': {}
-
-  '@protobufjs/inquire@1.1.0': {}
-
-  '@protobufjs/path@1.1.2': {}
-
-  '@protobufjs/pool@1.1.0': {}
-
-  '@protobufjs/utf8@1.1.0': {}
-
  '@radix-ui/number@1.1.1': {}

  '@radix-ui/primitive@1.1.2': {}
@ -10823,10 +10750,6 @@ snapshots:

  '@radix-ui/rect@1.1.1': {}

-  '@ricky0123/vad-web@0.0.24':
-    dependencies:
-      onnxruntime-web: 1.14.0
-
  '@rollup/plugin-commonjs@28.0.6(rollup@4.41.0)':
    dependencies:
      '@rollup/pluginutils': 5.2.0(rollup@4.41.0)
@ -11560,8 +11483,6 @@ snapshots:
    dependencies:
      '@types/node': 22.15.12

-  '@types/long@4.0.2': {}
-
  '@types/mdast@4.0.4':
    dependencies:
      '@types/unist': 3.0.3
@ -11744,6 +11665,8 @@ snapshots:

  acorn@8.14.1: {}

+  adm-zip@0.5.16: {}
+
  agent-base@6.0.2:
    dependencies:
      debug: 4.4.1
@ -12006,8 +11929,7 @@ snapshots:
    transitivePeerDependencies:
      - supports-color

-  boolean@3.2.0:
-    optional: true
+  boolean@3.2.0: {}

  bottleneck@2.19.5: {}

@ -12583,8 +12505,7 @@ snapshots:

  detect-node-es@1.1.0: {}

-  detect-node@2.1.0:
-    optional: true
+  detect-node@2.1.0: {}

  devlop@1.1.0:
    dependencies:
@ -12929,8 +12850,7 @@ snapshots:
      is-date-object: 1.1.0
      is-symbol: 1.1.1

-  es6-error@4.1.1:
-    optional: true
+  es6-error@4.1.1: {}

  esast-util-from-estree@2.0.0:
    dependencies:
@ -13474,8 +13394,6 @@ snapshots:
      flatted: 3.3.3
      keyv: 4.5.4

-  flatbuffers@1.12.0: {}
-
  flatted@3.3.3: {}

  flora-colossus@2.0.0:
@ -13833,7 +13751,6 @@ snapshots:
      roarr: 2.15.4
      semver: 7.7.2
      serialize-error: 7.0.1
-    optional: true

  global-dirs@3.0.1:
    dependencies:
@ -13904,8 +13821,6 @@ snapshots:
      section-matter: 1.0.0
      strip-bom-string: 1.0.0

-  guid-typescript@1.0.9: {}
-
  handlebars@4.7.7:
    dependencies:
      minimist: 1.2.8
@ -14443,8 +14358,7 @@ snapshots:

  json-stable-stringify-without-jsonify@1.0.1: {}

-  json-stringify-safe@5.0.1:
-    optional: true
+  json-stringify-safe@5.0.1: {}

  json5@1.0.2:
    dependencies:
@ -14623,8 +14537,6 @@ snapshots:
      strip-ansi: 7.1.0
      wrap-ansi: 8.1.0

-  long@4.0.0: {}
-
  longest-streak@3.1.0: {}

  loose-envify@1.4.0:
@ -14695,7 +14607,6 @@ snapshots:
  matcher@3.0.0:
    dependencies:
      escape-string-regexp: 4.0.0
-    optional: true

  math-intrinsics@1.1.0: {}

@ -15488,20 +15399,13 @@ snapshots:
      regex: 6.0.1
      regex-recursion: 6.0.2

-  onnx-proto@4.0.4:
-    dependencies:
-      protobufjs: 6.11.4
+  onnxruntime-common@1.22.0: {}

-  onnxruntime-common@1.14.0: {}
-
-  onnxruntime-web@1.14.0:
+  onnxruntime-node@1.22.0:
    dependencies:
-      flatbuffers: 1.12.0
-      guid-typescript: 1.0.9
-      long: 4.0.0
-      onnx-proto: 4.0.4
-      onnxruntime-common: 1.14.0
-      platform: 1.3.6
+      adm-zip: 0.5.16
+      global-agent: 3.0.0
+      onnxruntime-common: 1.22.0

  openai@4.103.0(encoding@0.1.13)(ws@8.18.0)(zod@3.25.67):
    dependencies:
@ -15705,8 +15609,6 @@ snapshots:

  pify@2.3.0: {}

-  platform@1.3.6: {}
-
  plist@3.1.0:
    dependencies:
      '@xmldom/xmldom': 0.8.10
@ -15790,22 +15692,6 @@ snapshots:

  property-information@7.1.0: {}

-  protobufjs@6.11.4:
-    dependencies:
-      '@protobufjs/aspromise': 1.1.2
-      '@protobufjs/base64': 1.1.2
-      '@protobufjs/codegen': 2.0.4
-      '@protobufjs/eventemitter': 1.1.0
-      '@protobufjs/fetch': 1.1.0
-      '@protobufjs/float': 1.0.2
-      '@protobufjs/inquire': 1.1.0
-      '@protobufjs/path': 1.1.2
-      '@protobufjs/pool': 1.1.0
-      '@protobufjs/utf8': 1.1.0
-      '@types/long': 4.0.2
-      '@types/node': 22.15.12
-      long: 4.0.0
-
  proxy-addr@2.0.7:
    dependencies:
      forwarded: 0.2.0
@ -16341,7 +16227,6 @@ snapshots:
      json-stringify-safe: 5.0.1
      semver-compare: 1.0.0
      sprintf-js: 1.1.3
-    optional: true

  rollup@4.41.0:
    dependencies:
@ -16425,8 +16310,7 @@ snapshots:

  secure-json-parse@2.7.0: {}

-  semver-compare@1.0.0:
-    optional: true
+  semver-compare@1.0.0: {}

  semver@5.7.2: {}

@ -16462,7 +16346,6 @@ snapshots:
  serialize-error@7.0.1:
    dependencies:
      type-fest: 0.13.1
-    optional: true

  serve-favicon@2.5.0:
    dependencies:
@ -17180,8 +17063,7 @@ snapshots:
    dependencies:
      prelude-ls: 1.2.1

-  type-fest@0.13.1:
-    optional: true
+  type-fest@0.13.1: {}

  type-fest@0.21.3: {}