diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 4c55e70..b2d3a4f 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -13,7 +13,7 @@ env: jobs: build: - runs-on: ubuntu-latest + runs-on: macos-latest steps: - name: Checkout repository diff --git a/apps/desktop/forge.config.ts b/apps/desktop/forge.config.ts index 6fb495b..2d4f04a 100644 --- a/apps/desktop/forge.config.ts +++ b/apps/desktop/forge.config.ts @@ -34,6 +34,7 @@ export const EXTERNAL_DEPENDENCIES = [ "@libsql/linux-x64-musl", "@libsql/win32-x64-msvc", "libsql", + "onnxruntime-node", // Add any other native modules you need here ]; @@ -195,13 +196,16 @@ const config: ForgeConfig = { }, }, packagerConfig: { - asar: true, + asar: { + unpack: "{*.node,*.dylib,*.so,*.dll}", + }, name: "Amical", executableName: "Amical", icon: "./assets/logo.icns", // Path to your icon file extraResource: [ "../../packages/native-helpers/swift-helper/bin", "./src/db/migrations", + "./src/assets", ], extendInfo: { NSMicrophoneUsageDescription: diff --git a/apps/desktop/package.json b/apps/desktop/package.json index bc4bb79..782e6f9 100644 --- a/apps/desktop/package.json +++ b/apps/desktop/package.json @@ -86,7 +86,6 @@ "@radix-ui/react-toggle": "^1.1.9", "@radix-ui/react-toggle-group": "^1.1.10", "@radix-ui/react-tooltip": "^1.2.7", - "@ricky0123/vad-web": "^0.0.24", "@tabler/icons-react": "^3.34.0", "@tanstack/react-query": "^5.81.2", "@tanstack/react-table": "^8.21.3", @@ -116,6 +115,7 @@ "libsql": "^0.5.13", "lucide-react": "^0.510.0", "next-themes": "^0.4.6", + "onnxruntime-node": "^1.20.1", "openai": "^4.98.0", "react": "^19.1.0", "react-day-picker": "8.10.1", diff --git a/apps/desktop/src/assets/audio-recorder-processor.js b/apps/desktop/src/assets/audio-recorder-processor.js new file mode 100644 index 0000000..ea3e333 --- /dev/null +++ b/apps/desktop/src/assets/audio-recorder-processor.js @@ -0,0 +1,56 @@ +class AudioRecorderProcessor extends AudioWorkletProcessor { + constructor() { + super(); + this.frameSize = 512; // 32ms at 16kHz + this.sampleRate = 16000; + this.buffer = []; + + // Listen for control messages + this.port.onmessage = (event) => { + if (event.data.type === 'flush') { + this.flushBuffer(); + } + }; + } + + flushBuffer() { + // Always send a final frame to signal end of recording + const finalFrame = new Float32Array(this.buffer); + this.buffer = []; + + this.port.postMessage({ + type: 'audioFrame', + frame: finalFrame, + isFinal: true + }); + } + + process(inputs, outputs, parameters) { + const input = inputs[0]; + if (!input || !input[0]) return true; + + const channelData = input[0]; + + // Add samples to buffer + for (let i = 0; i < channelData.length; i++) { + this.buffer.push(channelData[i]); + } + + // When we have enough samples, send a frame + while (this.buffer.length >= this.frameSize) { + const frame = this.buffer.slice(0, this.frameSize); + this.buffer = this.buffer.slice(this.frameSize); + + // Send frame to main thread + this.port.postMessage({ + type: 'audioFrame', + frame: new Float32Array(frame), + isFinal: false + }); + } + + return true; + } +} + +registerProcessor('audio-recorder-processor', AudioRecorderProcessor); \ No newline at end of file diff --git a/apps/desktop/src/assets/silero_vad_v5.onnx b/apps/desktop/src/assets/silero_vad_v5.onnx new file mode 100644 index 0000000..b3e3a90 Binary files /dev/null and b/apps/desktop/src/assets/silero_vad_v5.onnx differ diff --git a/apps/desktop/src/hooks/audio-recorder-worklet.ts b/apps/desktop/src/hooks/audio-recorder-worklet.ts deleted file mode 100644 index 4202452..0000000 --- a/apps/desktop/src/hooks/audio-recorder-worklet.ts +++ /dev/null @@ -1,65 +0,0 @@ -// AudioWorklet processor source code -export const audioRecorderWorkletSource = ` -// AudioWorklet processor for real-time audio capture -// This runs in the audio rendering thread for low-latency processing -/* eslint-env worker */ -/* global AudioWorkletProcessor, registerProcessor */ - -class AudioRecorderProcessor extends AudioWorkletProcessor { - constructor() { - super(); - this.bufferSize = 4096; - this.buffer = new Float32Array(this.bufferSize); - this.bufferIndex = 0; - - // Listen for messages from main thread - this.port.onmessage = (event) => { - if (event.data.command === 'stop') { - this.sendBufferedAudio(true); // Send final chunk - } - }; - } - - process(inputs, _outputs, _parameters) { - const input = inputs[0]; - - // Check if we have input audio - if (input && input.length > 0) { - const inputChannel = input[0]; // Get first (mono) channel - - // Buffer the audio data - for (let i = 0; i < inputChannel.length; i++) { - this.buffer[this.bufferIndex] = inputChannel[i]; - this.bufferIndex++; - - // When buffer is full, send it to main thread - if (this.bufferIndex >= this.bufferSize) { - this.sendBufferedAudio(false); - this.bufferIndex = 0; // Reset buffer - } - } - } - - // Keep the processor alive - return true; - } - - sendBufferedAudio(isFinal) { - if (this.bufferIndex > 0 || isFinal) { - // Create a copy of the current buffer data - const audioData = new Float32Array(this.bufferIndex); - audioData.set(this.buffer.subarray(0, this.bufferIndex)); - - // Send to main thread - this.port.postMessage({ - type: 'audioData', - audioData: audioData, - isFinal: isFinal, - }); - } - } -} - -// Register the processor -registerProcessor('audio-recorder-processor', AudioRecorderProcessor); -`; diff --git a/apps/desktop/src/hooks/useAudioCapture.ts b/apps/desktop/src/hooks/useAudioCapture.ts index 143b761..88227ae 100644 --- a/apps/desktop/src/hooks/useAudioCapture.ts +++ b/apps/desktop/src/hooks/useAudioCapture.ts @@ -1,13 +1,17 @@ -import { useState, useRef, useEffect } from "react"; -import { MicVAD } from "@ricky0123/vad-web"; -import { audioRecorderWorkletSource } from "./audio-recorder-worklet"; +import { useRef, useEffect, useState, useCallback } from "react"; +import audioWorkletUrl from "@/assets/audio-recorder-processor.js?url"; +import { api } from "@/trpc/react"; + +// Audio configuration +const FRAME_SIZE = 512; // 32ms at 16kHz +const SAMPLE_RATE = 16000; export interface UseAudioCaptureParams { onAudioChunk: ( arrayBuffer: ArrayBuffer, + speechProbability: number, isFinalChunk: boolean, ) => Promise | void; - chunkDurationMs?: number; enabled: boolean; } @@ -15,268 +19,136 @@ export interface UseAudioCaptureOutput { voiceDetected: boolean; } -interface AudioCaptureState { - stream: MediaStream | null; - vad: MicVAD | null; - audioContext: AudioContext | null; - audioWorkletNode: AudioWorkletNode | null; - source: MediaStreamAudioSourceNode | null; - chunkTimer: NodeJS.Timeout | null; - pendingAudioChunks: Float32Array[]; - sendAudioChunk: ((isFinal: boolean) => Promise) | null; -} - export const useAudioCapture = ({ onAudioChunk, - chunkDurationMs = 28000, enabled, }: UseAudioCaptureParams): UseAudioCaptureOutput => { const [voiceDetected, setVoiceDetected] = useState(false); - const stateRef = useRef({ - stream: null, - vad: null, - audioContext: null, - audioWorkletNode: null, - source: null, - chunkTimer: null, - pendingAudioChunks: [], - sendAudioChunk: null, + const audioContextRef = useRef(null); + const sourceRef = useRef(null); + const workletNodeRef = useRef(null); + const streamRef = useRef(null); + + // Subscribe to voice detection updates via tRPC + api.recording.voiceDetectionUpdates.useSubscription(undefined, { + onData: (detected: boolean) => { + setVoiceDetected(detected); + }, + onError: (err) => { + console.error("Voice detection subscription error:", err); + }, }); - // Main effect to handle enabled state changes - useEffect(() => { - let isCancelled = false; + const startCapture = useCallback(async () => { + try { + console.log("AudioCapture: Starting audio capture"); - const cleanup = async () => { - const state = stateRef.current; - - // Send final chunk if we have pending audio - if (state.sendAudioChunk) { - try { - await state.sendAudioChunk(true); - } catch (error) { - console.error("AudioCapture: Error sending final chunk:", error); - } - } - - // Clear chunk timer - if (state.chunkTimer) { - clearInterval(state.chunkTimer); - state.chunkTimer = null; - } - - // Cleanup AudioWorklet - if (state.audioWorkletNode) { - state.audioWorkletNode.port.postMessage({ command: "stop" }); - state.audioWorkletNode.disconnect(); - state.audioWorkletNode = null; - } - - if (state.source) { - state.source.disconnect(); - state.source = null; - } - - if (state.audioContext && state.audioContext.state !== "closed") { - await state.audioContext.close(); - state.audioContext = null; - } - - // Cleanup VAD - if (state.vad) { - try { - state.vad.destroy(); - console.log("AudioCapture: VAD destroyed"); - } catch (e) { - console.error("Error destroying VAD:", e); - } - state.vad = null; - } - - // Stop media stream - if (state.stream) { - state.stream.getTracks().forEach((track) => { - try { - track.stop(); - } catch (e) { - console.error("Error stopping stream track:", e); - } - }); - state.stream = null; - } - - // Reset state - state.pendingAudioChunks = []; - state.sendAudioChunk = null; - setVoiceDetected(false); - - console.log("AudioCapture: Cleaned up"); - }; - - const startCapture = async () => { - console.log("AudioCapture: Starting capture..."); - - try { - // Get microphone access - const stream = await navigator.mediaDevices.getUserMedia({ - audio: true, - }); - if (isCancelled) { - stream.getTracks().forEach((track) => track.stop()); - return; - } - stateRef.current.stream = stream; - - // Set up Web Audio API with AudioWorklet for raw PCM data - const audioContext = new AudioContext({ sampleRate: 16000 }); - stateRef.current.audioContext = audioContext; - - // Load AudioWorklet module using blob URL - const blob = new Blob([audioRecorderWorkletSource], { - type: "application/javascript", - }); - const audioWorkletUrl = URL.createObjectURL(blob); - - try { - await audioContext.audioWorklet.addModule(audioWorkletUrl); - } finally { - URL.revokeObjectURL(audioWorkletUrl); - } - - if (isCancelled) { - await cleanup(); - return; - } - - const source = audioContext.createMediaStreamSource(stream); - stateRef.current.source = source; - - // Create AudioWorklet node - const audioWorkletNode = new AudioWorkletNode( - audioContext, - "audio-recorder-processor", - ); - stateRef.current.audioWorkletNode = audioWorkletNode; - - // Create function to send accumulated chunks - const sendAudioChunk = async (isFinal = false) => { - const pendingChunks = stateRef.current.pendingAudioChunks; - if (pendingChunks.length > 0) { - // Combine all pending chunks into one array - const totalLength = pendingChunks.reduce( - (sum, chunk) => sum + chunk.length, - 0, - ); - const combinedChunk = new Float32Array(totalLength); - let offset = 0; - - for (const chunk of pendingChunks) { - combinedChunk.set(chunk, offset); - offset += chunk.length; - } - - // Convert Float32Array to ArrayBuffer for IPC - const arrayBuffer = combinedChunk.buffer.slice( - combinedChunk.byteOffset, - combinedChunk.byteOffset + combinedChunk.byteLength, - ); - - try { - await onAudioChunk(arrayBuffer, isFinal); - console.log( - `AudioCapture: Sent chunk: ${combinedChunk.length} samples, final: ${isFinal}`, - ); - } catch (error) { - console.error("AudioCapture: Error processing chunk:", error); - } - - stateRef.current.pendingAudioChunks = []; // Clear chunks after sending - } - }; - - stateRef.current.sendAudioChunk = sendAudioChunk; - - // Handle messages from AudioWorklet - audioWorkletNode.port.onmessage = (event) => { - if (event.data.type === "audioData") { - const audioData = event.data.audioData as Float32Array; - const isFinal = event.data.isFinal as boolean; - - // Store the audio chunk - stateRef.current.pendingAudioChunks.push(audioData); - - if (isFinal) { - // Send final chunk immediately - sendAudioChunk(true); - } - } - }; - - // Set up periodic chunk sending - const chunkTimer = setInterval(() => { - sendAudioChunk(false); - }, chunkDurationMs); - stateRef.current.chunkTimer = chunkTimer; - - // Connect the audio processing chain - source.connect(audioWorkletNode); - console.log("AudioCapture: Connected AudioWorklet processing chain"); - - // Set up VAD - const vad = await MicVAD.new({ - stream, - model: "v5", - onSpeechStart: () => { - // Check if component is still mounted before updating state - if (!isCancelled) { - console.log("VAD: Speech started"); - setVoiceDetected(true); - } - }, - onSpeechEnd: () => { - console.log("VAD: Speech ended"); - // Check if component is still mounted before updating state - if (!isCancelled) { - console.log("VAD: Speech ended"); - setVoiceDetected(false); - } - }, - }); - - // Store VAD reference immediately to ensure proper cleanup - stateRef.current.vad = vad; - - if (isCancelled) { - await cleanup(); - return; - } - - vad.start(); - console.log("AudioCapture: VAD started"); - - console.log("AudioCapture: Fully started"); - } catch (err) { - console.error("AudioCapture: Error starting:", err); - await cleanup(); - throw err; - } - }; - - // Handle enabled state - if (enabled) { - startCapture().catch((err) => { - console.error("AudioCapture: Failed to start:", err); + // Get microphone stream + streamRef.current = await navigator.mediaDevices.getUserMedia({ + audio: { + channelCount: 1, + sampleRate: SAMPLE_RATE, + echoCancellation: true, + noiseSuppression: true, + autoGainControl: true, + }, }); + + // Create audio context + audioContextRef.current = new AudioContext({ sampleRate: SAMPLE_RATE }); + + // Load audio worklet + await audioContextRef.current.audioWorklet.addModule(audioWorkletUrl); + + // Create nodes + sourceRef.current = audioContextRef.current.createMediaStreamSource( + streamRef.current, + ); + workletNodeRef.current = new AudioWorkletNode( + audioContextRef.current, + "audio-recorder-processor", + ); + + // Handle audio frames from worklet + workletNodeRef.current.port.onmessage = async (event) => { + if (event.data.type === "audioFrame") { + const frame = event.data.frame; + const isFinal = event.data.isFinal || false; + + // Convert to ArrayBuffer for IPC + const arrayBuffer = frame.buffer.slice( + frame.byteOffset, + frame.byteOffset + frame.byteLength, + ); + + // Send to main process for VAD processing + // Main process will update voice detection state + await onAudioChunk(arrayBuffer, 0, isFinal); // Speech probability will come from main + + console.log( + `AudioCapture: Sent frame: ${frame.length} samples, isFinal: ${isFinal}`, + ); + } + }; + + // Connect audio graph + sourceRef.current.connect(workletNodeRef.current); + + console.log("AudioCapture: Audio capture started"); + } catch (error) { + console.error("AudioCapture: Failed to start capture:", error); + throw error; + } + }, [onAudioChunk]); + + const stopCapture = useCallback(() => { + console.log("AudioCapture: Stopping audio capture"); + + // Send flush command to worklet before disconnecting + if (workletNodeRef.current) { + workletNodeRef.current.port.postMessage({ type: "flush" }); + console.log("AudioCapture: Sent flush command to worklet"); } - // Cleanup function - return () => { - isCancelled = true; - cleanup().catch((err) => { - console.error("AudioCapture: Cleanup error:", err); + // Disconnect nodes + if (sourceRef.current && workletNodeRef.current) { + sourceRef.current.disconnect(workletNodeRef.current); + } + + // Close audio context + if (audioContextRef.current && audioContextRef.current.state !== "closed") { + audioContextRef.current.close(); + } + + // Stop media stream + if (streamRef.current) { + streamRef.current.getTracks().forEach((track) => track.stop()); + } + + // Clear refs + audioContextRef.current = null; + sourceRef.current = null; + workletNodeRef.current = null; + streamRef.current = null; + + setVoiceDetected(false); + console.log("AudioCapture: Audio capture stopped"); + }, []); + + // Start/stop based on enabled state + useEffect(() => { + if (enabled) { + startCapture().catch((error) => { + console.error("AudioCapture: Failed to start:", error); }); + } else { + stopCapture(); + } + + return () => { + stopCapture(); }; - }, [enabled, onAudioChunk, chunkDurationMs]); + }, [enabled, startCapture, stopCapture]); return { voiceDetected, diff --git a/apps/desktop/src/hooks/useRecording.ts b/apps/desktop/src/hooks/useRecording.ts index 87396d8..4a91f14 100644 --- a/apps/desktop/src/hooks/useRecording.ts +++ b/apps/desktop/src/hooks/useRecording.ts @@ -4,11 +4,11 @@ import { useAudioCapture } from "./useAudioCapture"; import type { RecordingState } from "@/types/recording"; export interface UseRecordingParams { - onAudioChunk: ( - arrayBuffer: ArrayBuffer, - isFinalChunk: boolean, + onAudioFrame: ( + audioBuffer: ArrayBuffer, + speechProbability: number, + isFinal: boolean, ) => Promise | void; - chunkDurationMs?: number; onRecordingStartCallback?: () => Promise | void; onRecordingStopCallback?: () => Promise | void; } @@ -21,8 +21,7 @@ export interface UseRecordingOutput { } export const useRecording = ({ - onAudioChunk, - chunkDurationMs = 28000, + onAudioFrame, onRecordingStartCallback, onRecordingStopCallback, }: UseRecordingParams): UseRecordingOutput => { @@ -33,13 +32,25 @@ export const useRecording = ({ stopRecording: stopRecordingMutation, } = useRecordingState(); + // Create handler for audio chunks - just pass through + const handleAudioChunk = useCallback( + async ( + arrayBuffer: ArrayBuffer, + speechProbability: number, + isFinalChunk: boolean, + ) => { + // Direct pass-through - no aggregation needed + await onAudioFrame(arrayBuffer, speechProbability, isFinalChunk); + }, + [onAudioFrame], + ); + // Manage audio capture when recording is active const isActive = recordingStatus === "recording" || recordingStatus === "starting"; const { voiceDetected } = useAudioCapture({ - onAudioChunk, - chunkDurationMs, + onAudioChunk: handleAudioChunk, enabled: isActive, }); @@ -121,7 +132,12 @@ export const useRecording = ({ } catch (error) { console.error("Hook: Error stopping recording:", error); } - }, [recordingStatus, stopRecordingMutation, onRecordingStopCallback]); + }, [ + recordingStatus, + stopRecordingMutation, + onRecordingStopCallback, + onAudioFrame, + ]); return { recordingStatus, diff --git a/apps/desktop/src/main/managers/recording-manager.ts b/apps/desktop/src/main/managers/recording-manager.ts index b402112..7722d6b 100644 --- a/apps/desktop/src/main/managers/recording-manager.ts +++ b/apps/desktop/src/main/managers/recording-manager.ts @@ -4,6 +4,7 @@ import { logger, logPerformance } from "../logger"; import { ServiceManager } from "./service-manager"; import { appContextStore } from "../../stores/app-context"; import type { RecordingState, RecordingStatus } from "../../types/recording"; +import { WindowManager } from "../core/window-manager"; /** * Manages recording state and coordinates audio recording across the application @@ -13,12 +14,17 @@ export class RecordingManager extends EventEmitter { private currentSessionId: string | null = null; private recordingState: RecordingState = "idle"; private lastError: string | undefined; + private windowManager: WindowManager | null = null; constructor(private serviceManager: ServiceManager) { super(); this.setupIPCHandlers(); } + public setWindowManager(windowManager: WindowManager): void { + this.windowManager = windowManager; + } + private setState(newState: RecordingState, error?: string): void { const oldState = this.recordingState; this.recordingState = newState; diff --git a/apps/desktop/src/main/managers/service-manager.ts b/apps/desktop/src/main/managers/service-manager.ts index 3c30fda..b2022de 100644 --- a/apps/desktop/src/main/managers/service-manager.ts +++ b/apps/desktop/src/main/managers/service-manager.ts @@ -6,6 +6,7 @@ import { SwiftIOBridge } from "../../services/platform/swift-bridge-service"; import { AutoUpdaterService } from "../services/auto-updater"; import { WindowManager } from "../core/window-manager"; import { RecordingManager } from "./recording-manager"; +import { VADService } from "../../services/vad-service"; /** * Manages service initialization and lifecycle @@ -17,6 +18,7 @@ export class ServiceManager { private modelManagerService: ModelManagerService | null = null; private transcriptionService: TranscriptionService | null = null; private settingsService: SettingsService | null = null; + private vadService: VADService | null = null; private swiftIOBridge: SwiftIOBridge | null = null; private autoUpdaterService: AutoUpdaterService | null = null; @@ -34,8 +36,9 @@ export class ServiceManager { this.initializeSettingsService(); await this.initializeModelServices(); this.initializePlatformServices(); + await this.initializeVADService(); await this.initializeAIServices(); - this.initializeRecordingManager(); + this.initializeRecordingManager(windowManager); this.initializeAutoUpdater(windowManager); this.isInitialized = true; @@ -57,6 +60,17 @@ export class ServiceManager { await this.modelManagerService.initialize(); } + private async initializeVADService(): Promise { + try { + this.vadService = new VADService(); + await this.vadService.initialize(); + logger.main.info("VAD service initialized"); + } catch (error) { + logger.main.error("Failed to initialize VAD service:", error); + // Don't throw - VAD is not critical for basic functionality + } + } + private async initializeAIServices(): Promise { try { if (!this.modelManagerService) { @@ -65,7 +79,9 @@ export class ServiceManager { this.transcriptionService = new TranscriptionService( this.modelManagerService, + this.vadService, ); + await this.transcriptionService.initialize(); // Load and configure formatter try { @@ -109,8 +125,9 @@ export class ServiceManager { } } - private initializeRecordingManager(): void { + private initializeRecordingManager(windowManager: WindowManager): void { this.recordingManager = new RecordingManager(this); + this.recordingManager.setWindowManager(windowManager); logger.main.info("Recording manager initialized"); } @@ -191,6 +208,15 @@ export class ServiceManager { return this.recordingManager; } + getVADService(): VADService | null { + if (!this.isInitialized) { + throw new Error( + "ServiceManager not initialized. Call initialize() first.", + ); + } + return this.vadService; + } + async cleanup(): Promise { if (this.recordingManager) { logger.main.info("Cleaning up recording manager..."); @@ -201,6 +227,11 @@ export class ServiceManager { this.modelManagerService.cleanup(); } + if (this.vadService) { + logger.main.info("Cleaning up VAD service..."); + await this.vadService.dispose(); + } + if (this.swiftIOBridge) { logger.main.info("Stopping Swift helper..."); this.swiftIOBridge.stopHelper(); diff --git a/apps/desktop/src/pipeline/core/pipeline-types.ts b/apps/desktop/src/pipeline/core/pipeline-types.ts index 27cd9a8..ac6a383 100644 --- a/apps/desktop/src/pipeline/core/pipeline-types.ts +++ b/apps/desktop/src/pipeline/core/pipeline-types.ts @@ -10,6 +10,7 @@ export { PipelineContext, SharedPipelineData } from "./context"; // Transcription input parameters export interface TranscribeParams { audioData: Buffer; + speechProbability?: number; // Speech probability from frontend VAD (0-1) context: { vocabulary?: Map; accessibilityContext?: GetAccessibilityContextResult | null; @@ -34,6 +35,7 @@ export interface FormatParams { export interface TranscriptionProvider { readonly name: string; transcribe(params: TranscribeParams): Promise; + flush?(): Promise; // Optional flush method for providers that buffer } // Formatting provider interface diff --git a/apps/desktop/src/pipeline/providers/transcription/whisper-provider.ts b/apps/desktop/src/pipeline/providers/transcription/whisper-provider.ts index e025e90..c8b81fb 100644 --- a/apps/desktop/src/pipeline/providers/transcription/whisper-provider.ts +++ b/apps/desktop/src/pipeline/providers/transcription/whisper-provider.ts @@ -12,6 +12,19 @@ export class WhisperProvider implements TranscriptionProvider { private modelManager: ModelManagerService; private whisperInstance: Whisper | null = null; + // Frame aggregation state + private frameBuffer: Float32Array[] = []; + private frameBufferSpeechProbabilities: number[] = []; // Track speech probabilities for each frame + private silenceFrameCount = 0; + private lastSpeechTimestamp = 0; + + // Configuration + private readonly FRAME_SIZE = 512; // 32ms at 16kHz + private readonly MIN_SPEECH_DURATION_MS = 500; // Minimum speech duration to transcribe + private readonly MAX_SILENCE_DURATION_MS = 2000; // Max silence before cutting + private readonly SAMPLE_RATE = 16000; + private readonly SPEECH_PROBABILITY_THRESHOLD = 0.2; // Threshold for speech detection + constructor(modelManager: ModelManagerService) { this.modelManager = modelManager; } @@ -21,20 +34,53 @@ export class WhisperProvider implements TranscriptionProvider { await this.initializeWhisper(); // Extract parameters from the new structure - const { audioData, context } = params; + const { audioData, speechProbability = 0, context } = params; const { vocabulary, previousChunk, aggregatedTranscription } = context; // Convert audio buffer to the format expected by smart-whisper const audioFloat32Array = await this.convertAudioBuffer(audioData); + // Add frame to buffer with speech probability + this.frameBuffer.push(audioFloat32Array); + this.frameBufferSpeechProbabilities.push(speechProbability); + + // Consider it speech if probability is above threshold + const isSpeech = speechProbability > this.SPEECH_PROBABILITY_THRESHOLD; + logger.transcription.debug( - `Starting transcription, audio size: ${audioData.length}`, - previousChunk - ? `Previous chunk: ${previousChunk.substring(0, 50)}...` - : "No previous chunk", - aggregatedTranscription - ? `Aggregated length: ${aggregatedTranscription.length}` - : "No aggregated transcription", + `Frame received - SpeechProb: ${speechProbability.toFixed(3)}, Buffer size: ${this.frameBuffer.length}, Silence count: ${this.silenceFrameCount}`, + ); + + // Handle speech/silence logic + if (isSpeech) { + this.silenceFrameCount = 0; + this.lastSpeechTimestamp = Date.now(); + } else { + this.silenceFrameCount++; + } + + // Determine if we should transcribe + const shouldTranscribe = this.shouldTranscribe(); + + if (!shouldTranscribe) { + // Keep buffering + return ""; + } + + // Aggregate buffered frames + const aggregatedAudio = this.aggregateFrames(); + + // Skip if too short or only silence + if (aggregatedAudio.length < this.FRAME_SIZE * 2) { + logger.transcription.debug("Skipping transcription - audio too short"); + this.frameBuffer = []; + this.frameBufferSpeechProbabilities = []; + this.silenceFrameCount = 0; + return ""; + } + + logger.transcription.debug( + `Starting transcription of ${aggregatedAudio.length} samples (${((aggregatedAudio.length / this.SAMPLE_RATE) * 1000).toFixed(0)}ms)`, ); // Transcribe using smart-whisper @@ -49,10 +95,13 @@ export class WhisperProvider implements TranscriptionProvider { ); const { result } = await this.whisperInstance.transcribe( - audioFloat32Array, + aggregatedAudio, { language: "auto", initial_prompt: initialPrompt, + suppress_blank: true, + suppress_non_speech_tokens: true, + no_timestamps: true, }, ); @@ -68,6 +117,11 @@ export class WhisperProvider implements TranscriptionProvider { `Transcription completed, length: ${text.length}`, ); + // Clear buffer after successful transcription + this.frameBuffer = []; + this.frameBufferSpeechProbabilities = []; + this.silenceFrameCount = 0; + return text; } catch (error) { logger.transcription.error("Transcription failed:", error); @@ -75,6 +129,112 @@ export class WhisperProvider implements TranscriptionProvider { } } + private shouldTranscribe(): boolean { + // Transcribe if: + // 1. We have significant silence after speech + // 2. Buffer is getting too large + // 3. Final chunk was received (handled elsewhere) + + const bufferDurationMs = + ((this.frameBuffer.length * this.FRAME_SIZE) / this.SAMPLE_RATE) * 1000; + const silenceDurationMs = + ((this.silenceFrameCount * this.FRAME_SIZE) / this.SAMPLE_RATE) * 1000; + + // If we have speech and then significant silence, transcribe + if ( + this.frameBuffer.length > 0 && + silenceDurationMs > this.MAX_SILENCE_DURATION_MS + ) { + logger.transcription.debug( + `Transcribing due to ${silenceDurationMs}ms of silence`, + ); + return true; + } + + // If buffer is too large (e.g., 30 seconds), transcribe anyway + if (bufferDurationMs > 30000) { + logger.transcription.debug( + `Transcribing due to buffer size: ${bufferDurationMs}ms`, + ); + return true; + } + + logger.transcription.error("Not transcribing", { + bufferDurationMs, + silenceDurationMs, + frameBufferLength: this.frameBuffer.length, + silenceFrameCount: this.silenceFrameCount, + }); + + return false; + } + + private aggregateFrames(): Float32Array { + // Calculate total size + const totalLength = this.frameBuffer.reduce( + (sum, frame) => sum + frame.length, + 0, + ); + const aggregated = new Float32Array(totalLength); + + // Copy all frames into single array + let offset = 0; + for (const frame of this.frameBuffer) { + aggregated.set(frame, offset); + offset += frame.length; + } + + // Trim silence from beginning and end + const trimmed = this.trimSilence(aggregated); + + return trimmed; + } + + private trimSilence(audio: Float32Array): Float32Array { + // Find first speech frame (probability > threshold) + let startIdx = 0; + for (let i = 0; i < this.frameBufferSpeechProbabilities.length; i++) { + if ( + this.frameBufferSpeechProbabilities[i] > + this.SPEECH_PROBABILITY_THRESHOLD + ) { + startIdx = i * this.FRAME_SIZE; + break; + } + } + + // Find last speech frame (probability > threshold) + let endIdx = audio.length; + for (let i = this.frameBufferSpeechProbabilities.length - 1; i >= 0; i--) { + if ( + this.frameBufferSpeechProbabilities[i] > + this.SPEECH_PROBABILITY_THRESHOLD + ) { + endIdx = (i + 1) * this.FRAME_SIZE; + break; + } + } + + return audio.slice(startIdx, Math.min(endIdx, audio.length)); + } + + // Force transcription of any remaining frames + async flush(): Promise { + if (this.frameBuffer.length === 0) { + return ""; + } + + logger.transcription.error(`Flushing ${this.frameBuffer.length} frames`); + + // Force transcription by setting high silence count + this.silenceFrameCount = 999; + return this.transcribe({ + audioData: Buffer.alloc(0), // Empty buffer, we'll use the buffered frames + speechProbability: 0, + context: {}, + }); + } + private generateInitialPrompt( vocabulary?: Map, aggregatedTranscription?: string, @@ -163,5 +323,10 @@ export class WhisperProvider implements TranscriptionProvider { this.whisperInstance = null; } } + + // Clear buffers + this.frameBuffer = []; + this.frameBufferSpeechProbabilities = []; + this.silenceFrameCount = 0; } } diff --git a/apps/desktop/src/renderer/widget/pages/widget/components/FloatingButton.tsx b/apps/desktop/src/renderer/widget/pages/widget/components/FloatingButton.tsx index e3920cc..f3d01e0 100644 --- a/apps/desktop/src/renderer/widget/pages/widget/components/FloatingButton.tsx +++ b/apps/desktop/src/renderer/widget/pages/widget/components/FloatingButton.tsx @@ -19,24 +19,27 @@ export const FloatingButton: React.FC = () => { }; }, []); - const handleAudioChunk = useCallback( - async (audioChunk: ArrayBuffer, isFinalChunk: boolean) => { + const handleAudioFrame = useCallback( + async ( + audioBuffer: ArrayBuffer, + speechProbability: number, + isFinal: boolean, + ) => { try { - // Send the audio chunk regardless of whether it's final or not - await window.electronAPI.sendAudioChunk(audioChunk, isFinalChunk); - console.debug(`Sent audio chunk`, { - chunkSize: audioChunk.byteLength, - isFinalChunk, + // Send frame directly to main process + // TODO: We need to update the IPC to include speech detection info + await window.electronAPI.sendAudioChunk(audioBuffer, isFinal); + console.debug(`Sent audio frame`, { + size: audioBuffer.byteLength, + speechProbability: speechProbability.toFixed(3), + isFinal, }); - if (isFinalChunk) { - console.log("Final chunk sent to main process"); - // You might want to add a specific IPC call here if the main process needs an explicit signal - // to finalize transcription, e.g., window.electronAPI.finalizeTranscription(); - // For now, we assume sendAudioChunk is enough and the main process handles the stream end. + if (isFinal) { + console.log("Final frame sent to main process"); } } catch (error) { - console.error("Error sending audio chunk:", error); + console.error("Error sending audio frame:", error); } }, [], @@ -44,8 +47,7 @@ export const FloatingButton: React.FC = () => { const { recordingStatus, startRecording, stopRecording, voiceDetected } = useRecording({ - onAudioChunk: handleAudioChunk, - // Optionally, set chunkDurationMs here if needed, e.g., chunkDurationMs: 250 + onAudioFrame: handleAudioFrame, }); const isRecording = recordingStatus === "recording" || recordingStatus === "starting"; diff --git a/apps/desktop/src/services/transcription-service.ts b/apps/desktop/src/services/transcription-service.ts index 10f5893..1d1624b 100644 --- a/apps/desktop/src/services/transcription-service.ts +++ b/apps/desktop/src/services/transcription-service.ts @@ -7,11 +7,11 @@ import { createDefaultContext } from "../pipeline/core/context"; import { WhisperProvider } from "../pipeline/providers/transcription/whisper-provider"; import { OpenRouterProvider } from "../pipeline/providers/formatting/openrouter-formatter"; import { ModelManagerService } from "../services/model-manager"; -import { ServiceManager } from "../main/managers/service-manager"; import { appContextStore } from "../stores/app-context"; import { createTranscription } from "../db/transcriptions"; import { logger } from "../main/logger"; import { v4 as uuid } from "uuid"; +import { VADService } from "./vad-service"; /** * Service for audio transcription and optional formatting @@ -21,9 +21,23 @@ export class TranscriptionService { private openRouterProvider: OpenRouterProvider | null = null; private formatterEnabled = false; private streamingSessions: Map = new Map(); + private vadService: VADService | null = null; - constructor(modelManagerService: ModelManagerService) { + constructor( + modelManagerService: ModelManagerService, + vadService: VADService | null = null, + ) { this.whisperProvider = new WhisperProvider(modelManagerService); + this.vadService = vadService; + } + + async initialize(): Promise { + if (this.vadService) { + logger.transcription.info("Using VAD service"); + } else { + logger.transcription.warn("VAD service not available"); + } + logger.transcription.info("Transcription service initialized"); } /** @@ -62,6 +76,26 @@ export class TranscriptionService { isFinal?: boolean; }): Promise { const { sessionId, audioChunk, isFinal = false } = options; + console.error("processing streaming chunk", { + length: audioChunk.length, + }); + + // Run VAD on the audio chunk + let speechProbability = 0; + let isSpeaking = false; + + if (audioChunk.length > 0 && this.vadService) { + const vadResult = await this.vadService.processAudioFrame( + audioChunk.buffer as ArrayBuffer, + ); + speechProbability = vadResult.probability; + isSpeaking = vadResult.isSpeaking; + + logger.transcription.debug("VAD result", { + probability: speechProbability.toFixed(3), + isSpeaking, + }); + } // Auto-create session if it doesn't exist let session = this.streamingSessions.get(sessionId); @@ -90,7 +124,7 @@ export class TranscriptionService { // Process chunk if it has content if (audioChunk.length > 0) { - // Direct provider call - no step wrapper + // Direct frame to Whisper - it will handle aggregation and VAD internally const previousChunk = session.transcriptionResults.length > 0 ? session.transcriptionResults[ @@ -103,6 +137,7 @@ export class TranscriptionService { const chunkTranscription = await this.whisperProvider.transcribe({ audioData: audioChunk, + speechProbability: speechProbability, // Now from VAD service context: { vocabulary: session.context.sharedData.vocabulary, accessibilityContext: session.context.sharedData.accessibilityContext, @@ -111,22 +146,39 @@ export class TranscriptionService { }, }); - // Accumulate the result + // Accumulate the result only if Whisper returned something + // (it returns empty string while buffering) if (chunkTranscription.trim()) { session.transcriptionResults.push(chunkTranscription); + logger.transcription.info("Whisper returned transcription", { + sessionId, + transcriptionLength: chunkTranscription.length, + totalResults: session.transcriptionResults.length, + }); } - logger.transcription.debug("Processed chunk", { + logger.transcription.error("Processed frame", { sessionId, - chunkSize: audioChunk.length, - transcriptionLength: chunkTranscription.length, - totalResults: session.transcriptionResults.length, + frameSize: audioChunk.length, + hadTranscription: chunkTranscription.length > 0, isFinal, }); } - // If this is the final chunk, apply formatting and save + // If this is the final chunk, flush any remaining audio and apply formatting if (isFinal) { + // Flush any remaining buffered audio in Whisper + if (this.whisperProvider.flush) { + const flushResult = await this.whisperProvider.flush(); + if (flushResult.trim()) { + session.transcriptionResults.push(flushResult); + logger.transcription.info("Flushed final audio", { + sessionId, + flushLength: flushResult.length, + }); + } + } + // Get complete transcription let completeTranscription = session.transcriptionResults.join(" ").trim(); @@ -137,7 +189,7 @@ export class TranscriptionService { }); // Format if enabled - if (this.formatterEnabled && this.openRouterProvider) { + if (this.formatterEnabled && this.openRouterProvider && false) { const style = session.context.sharedData.userPreferences?.formattingStyle; completeTranscription = await this.openRouterProvider.format({ @@ -188,19 +240,9 @@ export class TranscriptionService { // Create default context const context = createDefaultContext(uuid()); - // Simple context building - no complex loading - const serviceManager = ServiceManager.getInstance(); - if (serviceManager) { - try { - const settingsService = serviceManager.getSettingsService(); - const formatterConfig = await settingsService.getFormatterConfig(); - } catch (error) { - logger.transcription.warn("Failed to load formatter config", { error }); - } - } - // TODO: Load actual vocabulary // TODO: Load user preferences from settings + // TODO: Load formatter config from settings return context; } @@ -210,6 +252,7 @@ export class TranscriptionService { */ async dispose(): Promise { await this.whisperProvider.dispose(); + // VAD service is managed by ServiceManager logger.transcription.info("Transcription service disposed"); } } diff --git a/apps/desktop/src/services/vad-service.ts b/apps/desktop/src/services/vad-service.ts new file mode 100644 index 0000000..59c7d70 --- /dev/null +++ b/apps/desktop/src/services/vad-service.ts @@ -0,0 +1,192 @@ +import * as ort from "onnxruntime-node"; +import { logger } from "../main/logger"; +import { app } from "electron"; +import * as path from "path"; +import { EventEmitter } from "node:events"; +import { existsSync } from "node:fs"; + +export class VADService extends EventEmitter { + private session: ort.InferenceSession | null = null; + private modelPath: string | null = null; + private state: ort.Tensor | null = null; + private sr: number = 16000; + + // Configuration + private readonly WINDOW_SIZE_SAMPLES = 512; // 32ms at 16kHz + private readonly SPEECH_THRESHOLD = 0.2; + private readonly REDEMPTION_FRAMES = 8; + + // State + private speechFrameCount = 0; + private silenceFrameCount = 0; + private isSpeaking = false; + + constructor() { + super(); + } + + async initialize(): Promise { + try { + // Handle both development and production paths + if (app.isPackaged) { + // In production, the assets are copied to the resources folder + this.modelPath = path.join( + process.resourcesPath, + "assets", + "silero_vad_v5.onnx", + ); + } else { + // In development, use the source path + this.modelPath = path.join( + __dirname, + "../../src/assets/silero_vad_v5.onnx", + ); + } + + logger.main.info("Loading VAD model from", this.modelPath); + + // Check if the model file exists + if (!existsSync(this.modelPath)) { + throw new Error( + `VAD model file not found at: ${this.modelPath}. ` + + `Make sure the ONNX model is in the assets folder.`, + ); + } + + // Load ONNX model + this.session = await ort.InferenceSession.create(this.modelPath, { + executionProviders: ["cpu"], // Use CPU provider for compatibility + }); + + // Initialize hidden states (h and c) + this.resetStates(); + + logger.main.info("VAD service initialized successfully"); + } catch (error) { + logger.main.error("Failed to initialize VAD service:", error); + throw error; + } + } + + private resetStates(): void { + // Silero VAD uses a state tensor with shape [2, 1, 128] + const stateSize = 2 * 1 * 128; + this.state = new ort.Tensor( + "float32", + new Float32Array(stateSize).fill(0), + [2, 1, 128], + ); + } + + async processBatch( + audioFrames: Float32Array, + ): Promise<{ probability: number; isSpeaking: boolean }> { + if (!this.session || !this.state) { + throw new Error("VAD service not initialized"); + } + + try { + // Create input tensor - shape should be [1, audio_length] + const inputTensor = new ort.Tensor("float32", audioFrames, [ + 1, + audioFrames.length, + ]); + + const srTensor = new ort.Tensor( + "int64", + BigInt64Array.from([BigInt(this.sr)]), + [], + ); + + // Run inference with input, state, and sr + const results = await this.session.run({ + input: inputTensor, + state: this.state, + sr: srTensor, + }); + + // Update state for next iteration + this.state = results.stateN as ort.Tensor; + + // Get speech probability + const output = results.output as ort.Tensor; + const probability = output.data[0] as number; + + // Apply smoothing logic + const isSpeaking = this.applySpeechDetectionLogic(probability); + + return { probability, isSpeaking }; + } catch (error) { + logger.main.error("VAD inference failed:", error); + throw error; + } + } + + private applySpeechDetectionLogic(probability: number): boolean { + const isSpeechFrame = probability > this.SPEECH_THRESHOLD; + + if (isSpeechFrame) { + this.speechFrameCount++; + this.silenceFrameCount = 0; + } else { + this.silenceFrameCount++; + if (this.silenceFrameCount > this.REDEMPTION_FRAMES) { + this.speechFrameCount = 0; + } + } + + // Start speaking after enough speech frames + if (!this.isSpeaking && this.speechFrameCount >= 3) { + this.isSpeaking = true; + logger.main.debug("Speech started"); + this.emit("voice-detected", true); + } + + // Stop speaking after enough silence + if (this.isSpeaking && this.silenceFrameCount >= this.REDEMPTION_FRAMES) { + this.isSpeaking = false; + logger.main.debug("Speech ended"); + this.emit("voice-detected", false); + } + + return this.isSpeaking; + } + + async processAudioFrame( + audioBuffer: ArrayBuffer, + ): Promise<{ probability: number; isSpeaking: boolean }> { + // Convert ArrayBuffer to Float32Array + const float32Array = new Float32Array(audioBuffer); + + // Silero VAD requires exactly 512 samples + if (float32Array.length !== this.WINDOW_SIZE_SAMPLES) { + // If we have fewer samples (e.g., final buffer flush), pad with zeros + if (float32Array.length < this.WINDOW_SIZE_SAMPLES) { + const paddedArray = new Float32Array(this.WINDOW_SIZE_SAMPLES); + paddedArray.set(float32Array); + // Rest is already zeros + return this.processBatch(paddedArray); + } else { + // If we have more samples, just process the first 512 + const truncatedArray = float32Array.slice(0, this.WINDOW_SIZE_SAMPLES); + return this.processBatch(truncatedArray); + } + } + + // Process through VAD + return this.processBatch(float32Array); + } + + getSpeechState(): boolean { + return this.isSpeaking; + } + + async dispose(): Promise { + if (this.session) { + await this.session.release(); + this.session = null; + } + this.state = null; + logger.main.info("VAD service disposed"); + } +} diff --git a/apps/desktop/src/trpc/routers/recording.ts b/apps/desktop/src/trpc/routers/recording.ts index 6ce2167..28fec1d 100644 --- a/apps/desktop/src/trpc/routers/recording.ts +++ b/apps/desktop/src/trpc/routers/recording.ts @@ -3,6 +3,7 @@ import { observable } from "@trpc/server/observable"; import superjson from "superjson"; import { ServiceManager } from "../../main/managers/service-manager"; import type { RecordingStatus } from "../../types/recording"; +import { logger } from "../../main/logger"; const t = initTRPC.create({ isServer: true, @@ -61,4 +62,36 @@ export const recordingRouter = t.router({ }; }); }), + + // Voice detection subscription + voiceDetectionUpdates: t.procedure.subscription(() => { + return observable((emit) => { + const serviceManager = ServiceManager.getInstance(); + if (!serviceManager) { + throw new Error("ServiceManager not initialized"); + } + + const vadService = serviceManager.getVADService(); + if (!vadService) { + logger.main.warn( + "VAD service not available for voice detection subscription", + ); + // Emit false and complete immediately if VAD is not available + emit.next(false); + return () => {}; + } + + // Set up listener for voice detection changes + const handleVoiceDetection = (detected: boolean) => { + emit.next(detected); + }; + + vadService.on("voice-detected", handleVoiceDetection); + + // Cleanup function + return () => { + vadService.off("voice-detected", handleVoiceDetection); + }; + }); + }), }); diff --git a/apps/desktop/src/types/vite-env.d.ts b/apps/desktop/src/types/vite-env.d.ts new file mode 100644 index 0000000..ae09d95 --- /dev/null +++ b/apps/desktop/src/types/vite-env.d.ts @@ -0,0 +1,13 @@ +/// + +// Declare module for URL imports +declare module "*?url" { + const url: string; + export default url; +} + +// Declare module for raw imports +declare module "*?raw" { + const content: string; + export default content; +} diff --git a/apps/desktop/vite.main.config.mts b/apps/desktop/vite.main.config.mts index 6c44978..e511152 100644 --- a/apps/desktop/vite.main.config.mts +++ b/apps/desktop/vite.main.config.mts @@ -14,6 +14,7 @@ export default defineConfig({ "@libsql/linux-x64-musl", "@libsql/win32-x64-msvc", "libsql", + "onnxruntime-node", /^node:/, /^electron$/, ], diff --git a/package.json b/package.json index 9f2307a..5fdceec 100644 --- a/package.json +++ b/package.json @@ -38,7 +38,8 @@ "drizzle-orm/libsql", "@libsql", "macos-alias", - "fs-xattr" + "fs-xattr", + "onnxruntime-node" ] } } diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 58d450b..e676521 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -128,9 +128,6 @@ importers: '@radix-ui/react-tooltip': specifier: ^1.2.7 version: 1.2.7(@types/react-dom@19.1.5(@types/react@19.1.5))(@types/react@19.1.5)(react-dom@19.1.0(react@19.1.0))(react@19.1.0) - '@ricky0123/vad-web': - specifier: ^0.0.24 - version: 0.0.24 '@tabler/icons-react': specifier: ^3.34.0 version: 3.34.0(react@19.1.0) @@ -218,6 +215,9 @@ importers: next-themes: specifier: ^0.4.6 version: 0.4.6(react-dom@19.1.0(react@19.1.0))(react@19.1.0) + onnxruntime-node: + specifier: ^1.20.1 + version: 1.22.0 openai: specifier: ^4.98.0 version: 4.103.0(encoding@0.1.13)(ws@8.18.0)(zod@3.25.67) @@ -2042,36 +2042,6 @@ packages: resolution: {integrity: sha512-ROFF39F6ZrnzSUEmQQZUar0Jt4xVoP9WnDRdWwF4NNcXs3xBTLgBUDoOwW141y1jP+S8nahIbdxbFC7IShw9Iw==} engines: {node: ^12.20.0 || ^14.18.0 || >=16.0.0} - '@protobufjs/aspromise@1.1.2': - resolution: {integrity: sha512-j+gKExEuLmKwvz3OgROXtrJ2UG2x8Ch2YZUxahh+s1F2HZ+wAceUNLkvy6zKCPVRkU++ZWQrdxsUeQXmcg4uoQ==} - - '@protobufjs/base64@1.1.2': - resolution: {integrity: sha512-AZkcAA5vnN/v4PDqKyMR5lx7hZttPDgClv83E//FMNhR2TMcLUhfRUBHCmSl0oi9zMgDDqRUJkSxO3wm85+XLg==} - - '@protobufjs/codegen@2.0.4': - resolution: {integrity: sha512-YyFaikqM5sH0ziFZCN3xDC7zeGaB/d0IUb9CATugHWbd1FRFwWwt4ld4OYMPWu5a3Xe01mGAULCdqhMlPl29Jg==} - - '@protobufjs/eventemitter@1.1.0': - resolution: {integrity: sha512-j9ednRT81vYJ9OfVuXG6ERSTdEL1xVsNgqpkxMsbIabzSo3goCjDIveeGv5d03om39ML71RdmrGNjG5SReBP/Q==} - - '@protobufjs/fetch@1.1.0': - resolution: {integrity: sha512-lljVXpqXebpsijW71PZaCYeIcE5on1w5DlQy5WH6GLbFryLUrBD4932W/E2BSpfRJWseIL4v/KPgBFxDOIdKpQ==} - - '@protobufjs/float@1.0.2': - resolution: {integrity: sha512-Ddb+kVXlXst9d+R9PfTIxh1EdNkgoRe5tOX6t01f1lYWOvJnSPDBlG241QLzcyPdoNTsblLUdujGSE4RzrTZGQ==} - - '@protobufjs/inquire@1.1.0': - resolution: {integrity: sha512-kdSefcPdruJiFMVSbn801t4vFK7KB/5gd2fYvrxhuJYg8ILrmn9SKSX2tZdV6V+ksulWqS7aXjBcRXl3wHoD9Q==} - - '@protobufjs/path@1.1.2': - resolution: {integrity: sha512-6JOcJ5Tm08dOHAbdR3GrvP+yUUfkjG5ePsHYczMFLq3ZmMkAD98cDgcT2iA1lJ9NVwFd4tH/iSSoe44YWkltEA==} - - '@protobufjs/pool@1.1.0': - resolution: {integrity: sha512-0kELaGSIDBKvcgS4zkjz1PeddatrjYcmMWOlAuAPwAeccUrPHdUqo/J6LiymHHEiJT5NrF1UVwxY14f+fy4WQw==} - - '@protobufjs/utf8@1.1.0': - resolution: {integrity: sha512-Vvn3zZrhQZkkBE8LSuW3em98c0FwgO4nxzv6OdSxPKJIEKY2bGbHn+mhGIPerzI4twdxaP8/0+06HBpwf345Lw==} - '@radix-ui/number@1.1.1': resolution: {integrity: sha512-MkKCwxlXTgz6CFoJx3pCwn07GKp36+aZyu/u2Ln2VrA5DcdyCZkASEDBTd8x5whTQQL5CiYf4prXKLcgQdv29g==} @@ -2684,9 +2654,6 @@ packages: '@radix-ui/rect@1.1.1': resolution: {integrity: sha512-HPwpGIzkl28mWyZqG52jiqDJ12waP11Pa1lGoiyUkIEuMLBP0oeK/C89esbXrxsky5we7dfd8U58nm0SgAWpVw==} - '@ricky0123/vad-web@0.0.24': - resolution: {integrity: sha512-uv6GWW/kq8BkVErMQzXp3uwSyYMT3w/3QJiUerVaaKp7EwhOTIRY+96EoyFdG2WOFU5RkLk/2CVGbI7nDlxhEg==} - '@rollup/plugin-commonjs@28.0.6': resolution: {integrity: sha512-XSQB1K7FUU5QP+3lOQmVCE3I0FcbbNvmNT4VJSj93iUjayaARrTQeoRdiYQoftAJBLrR9t2agwAd3ekaTgHNlw==} engines: {node: '>=16.0.0 || 14 >= 14.17'} @@ -3317,9 +3284,6 @@ packages: '@types/keyv@3.1.4': resolution: {integrity: sha512-BQ5aZNSCpj7D6K2ksrRCTmKRLEpnPvWDiLPfoGyhZ++8YtiK9d/3DBKPJgry359X/P1PfruyYwvnvwFjuEiEIg==} - '@types/long@4.0.2': - resolution: {integrity: sha512-MqTGEo5bj5t157U6fA/BiDynNkn0YknVdh48CMPkTSpFTVmvao5UQmm7uEF6xBEo7qIMAlY/JSleYaE6VOdpaA==} - '@types/mdast@4.0.4': resolution: {integrity: sha512-kGaNbPh1k7AFzgpud/gMdvIm5xuECykRR+JnWKQno9TAXVa6WIVCGTPvYGekIDL4uwCZQSYbUxNBSb1aUo79oA==} @@ -3486,6 +3450,10 @@ packages: engines: {node: '>=0.4.0'} hasBin: true + adm-zip@0.5.16: + resolution: {integrity: sha512-TGw5yVi4saajsSEgz25grObGHEUaDrniwvA2qwSC060KfqGPdglhvPMA2lPIoxs3PQIItj2iag35fONcQqgUaQ==} + engines: {node: '>=12.0'} + agent-base@6.0.2: resolution: {integrity: sha512-RZNwNclF7+MS/8bDg70amg32dyeZGZxiDuQmZxKLAlQjr3jGyLx+4Kkk58UO7D2QdgFIQCovuSuZESne6RG6XQ==} engines: {node: '>= 6.0.0'} @@ -4953,9 +4921,6 @@ packages: resolution: {integrity: sha512-f7ccFPK3SXFHpx15UIGyRJ/FJQctuKZ0zVuN3frBo4HnK3cay9VEW0R6yPYFHC0AgqhukPzKjq22t5DmAyqGyw==} engines: {node: '>=16'} - flatbuffers@1.12.0: - resolution: {integrity: sha512-c7CZADjRcl6j0PlvFy0ZqXQ67qSEZfrVPynmnL+2zPc+NtMvrF8Y0QceMo7QqnSPc7+uWjUIAbvCQ5WIKlMVdQ==} - flatted@3.3.3: resolution: {integrity: sha512-GX+ysw4PBCz0PzosHDepZGANEuFCMLrnRTiEy9McGjmkCQYwRq4A/X786G/fjM/+OjsWSU1ZrY5qyARZmO/uwg==} @@ -5266,9 +5231,6 @@ packages: resolution: {integrity: sha512-5v6yZd4JK3eMI3FqqCouswVqwugaA9r4dNZB1wwcmrD02QkV5H0y7XBQW8QwQqEaZY1pM9aqORSORhJRdNK44Q==} engines: {node: '>=6.0'} - guid-typescript@1.0.9: - resolution: {integrity: sha512-Y8T4vYhEfwJOTbouREvG+3XDsjr8E3kIr7uf+JZ0BYloFsttiHU0WfvANVsR7TxNUJa/WpCnw/Ino/p+DeBhBQ==} - handlebars@4.7.7: resolution: {integrity: sha512-aAcXm5OAfE/8IXkcZvCepKU3VzW1/39Fb5ZuqMtgI/hT8X2YgoMvBY5dLhq/cpOvw7Lk1nK/UF71aLG/ZnVYRA==} engines: {node: '>=0.4.7'} @@ -5918,9 +5880,6 @@ packages: resolution: {integrity: sha512-5UtUDQ/6edw4ofyljDNcOVJQ4c7OjDro4h3y8e1GQL5iYElYclVHJ3zeWchylvMaKnDbDilC8irOVyexnA/Slw==} engines: {node: ^12.20.0 || ^14.13.1 || >=16.0.0} - long@4.0.0: - resolution: {integrity: sha512-XsP+KhQif4bjX1kbuSiySJFNAehNxgLb6hPRGJ9QsUr8ajHkuXGdrHmFUTUUXhDwVX2R5bY4JNZEwbUiMhV+MA==} - longest-streak@3.1.0: resolution: {integrity: sha512-9Ri+o0JYgehTaVBBDoMqIl8GXtbWg711O3srftcHhZ0dqnETqLaoIK0x17fUw9rFSlK/0NlsKe0Ahhyl5pXE2g==} @@ -6506,14 +6465,12 @@ packages: oniguruma-to-es@4.3.3: resolution: {integrity: sha512-rPiZhzC3wXwE59YQMRDodUwwT9FZ9nNBwQQfsd1wfdtlKEyCdRV0avrTcSZ5xlIvGRVPd/cx6ZN45ECmS39xvg==} - onnx-proto@4.0.4: - resolution: {integrity: sha512-aldMOB3HRoo6q/phyB6QRQxSt895HNNw82BNyZ2CMh4bjeKv7g/c+VpAFtJuEMVfYLMbRx61hbuqnKceLeDcDA==} + onnxruntime-common@1.22.0: + resolution: {integrity: sha512-vcuaNWgtF2dGQu/EP5P8UI5rEPEYqXG2sPPe5j9lg2TY/biJF8eWklTMwlDO08iuXq48xJo0awqIpK5mPG+IxA==} - onnxruntime-common@1.14.0: - resolution: {integrity: sha512-3LJpegM2iMNRX2wUmtYfeX/ytfOzNwAWKSq1HbRrKc9+uqG/FsEA0bbKZl1btQeZaXhC26l44NWpNUeXPII7Ew==} - - onnxruntime-web@1.14.0: - resolution: {integrity: sha512-Kcqf43UMfW8mCydVGcX9OMXI2VN17c0p6XvR7IPSZzBf/6lteBzXHvcEVWDPmCKuGombl997HgLqj91F11DzXw==} + onnxruntime-node@1.22.0: + resolution: {integrity: sha512-QaAqr7PFekrmEsmu1rpw7OxJYyG+iACjNHoNtQIVt9Oh7st8WDPIIUe6KhF9l35HVJTJd9CV1rePoPmKhSV26g==} + os: [win32, darwin, linux] openai@4.103.0: resolution: {integrity: sha512-eWcz9kdurkGOFDtd5ySS5y251H2uBgq9+1a2lTBnjMMzlexJ40Am5t6Mu76SSE87VvitPa0dkIAp75F+dZVC0g==} @@ -6715,9 +6672,6 @@ packages: resolution: {integrity: sha512-udgsAY+fTnvv7kI7aaxbqwWNb0AHiB0qBO89PZKPkoTmGOgdbrHDKD+0B2X4uTfJ/FT1R09r9gTsjUjNJotuog==} engines: {node: '>=0.10.0'} - platform@1.3.6: - resolution: {integrity: sha512-fnWVljUchTro6RiCFvCXBbNhJc2NijN7oIQxbwsyL0buWJPG85v81ehlHI9fXrJsMNgTofEoWIQeClKpgxFLrg==} - plist@3.1.0: resolution: {integrity: sha512-uysumyrvkUX0rX/dEVqt8gC3sTBzd4zoWfLeS29nb53imdaXVvLINYXTI2GNqzaMuvacNx4uJQ8+b3zXR0pkgQ==} engines: {node: '>=10.4.0'} @@ -6804,10 +6758,6 @@ packages: property-information@7.1.0: resolution: {integrity: sha512-TwEZ+X+yCJmYfL7TPUOcvBZ4QfoT5YenQiJuX//0th53DE6w0xxLEtfK3iyryQFddXuvkIk51EEgrJQ0WJkOmQ==} - protobufjs@6.11.4: - resolution: {integrity: sha512-5kQWPaJHi1WoCpjTGszzQ32PG2F4+wRY6BmAT4Vfw56Q2FZ4YZzK20xUYQH4YkfehY1e6QSICrJquM6xXZNcrw==} - hasBin: true - proxy-addr@2.0.7: resolution: {integrity: sha512-llQsMLSUDUPT44jdrU/O37qlnifitDP+ZwrmmZcoSKyLKvtZxpyV0n2/bD/N4tBAAZ/gJEdZU7KMraoK1+XYAg==} engines: {node: '>= 0.10'} @@ -10147,29 +10097,6 @@ snapshots: '@pkgr/core@0.2.4': {} - '@protobufjs/aspromise@1.1.2': {} - - '@protobufjs/base64@1.1.2': {} - - '@protobufjs/codegen@2.0.4': {} - - '@protobufjs/eventemitter@1.1.0': {} - - '@protobufjs/fetch@1.1.0': - dependencies: - '@protobufjs/aspromise': 1.1.2 - '@protobufjs/inquire': 1.1.0 - - '@protobufjs/float@1.0.2': {} - - '@protobufjs/inquire@1.1.0': {} - - '@protobufjs/path@1.1.2': {} - - '@protobufjs/pool@1.1.0': {} - - '@protobufjs/utf8@1.1.0': {} - '@radix-ui/number@1.1.1': {} '@radix-ui/primitive@1.1.2': {} @@ -10823,10 +10750,6 @@ snapshots: '@radix-ui/rect@1.1.1': {} - '@ricky0123/vad-web@0.0.24': - dependencies: - onnxruntime-web: 1.14.0 - '@rollup/plugin-commonjs@28.0.6(rollup@4.41.0)': dependencies: '@rollup/pluginutils': 5.2.0(rollup@4.41.0) @@ -11560,8 +11483,6 @@ snapshots: dependencies: '@types/node': 22.15.12 - '@types/long@4.0.2': {} - '@types/mdast@4.0.4': dependencies: '@types/unist': 3.0.3 @@ -11744,6 +11665,8 @@ snapshots: acorn@8.14.1: {} + adm-zip@0.5.16: {} + agent-base@6.0.2: dependencies: debug: 4.4.1 @@ -12006,8 +11929,7 @@ snapshots: transitivePeerDependencies: - supports-color - boolean@3.2.0: - optional: true + boolean@3.2.0: {} bottleneck@2.19.5: {} @@ -12583,8 +12505,7 @@ snapshots: detect-node-es@1.1.0: {} - detect-node@2.1.0: - optional: true + detect-node@2.1.0: {} devlop@1.1.0: dependencies: @@ -12929,8 +12850,7 @@ snapshots: is-date-object: 1.1.0 is-symbol: 1.1.1 - es6-error@4.1.1: - optional: true + es6-error@4.1.1: {} esast-util-from-estree@2.0.0: dependencies: @@ -13474,8 +13394,6 @@ snapshots: flatted: 3.3.3 keyv: 4.5.4 - flatbuffers@1.12.0: {} - flatted@3.3.3: {} flora-colossus@2.0.0: @@ -13833,7 +13751,6 @@ snapshots: roarr: 2.15.4 semver: 7.7.2 serialize-error: 7.0.1 - optional: true global-dirs@3.0.1: dependencies: @@ -13904,8 +13821,6 @@ snapshots: section-matter: 1.0.0 strip-bom-string: 1.0.0 - guid-typescript@1.0.9: {} - handlebars@4.7.7: dependencies: minimist: 1.2.8 @@ -14443,8 +14358,7 @@ snapshots: json-stable-stringify-without-jsonify@1.0.1: {} - json-stringify-safe@5.0.1: - optional: true + json-stringify-safe@5.0.1: {} json5@1.0.2: dependencies: @@ -14623,8 +14537,6 @@ snapshots: strip-ansi: 7.1.0 wrap-ansi: 8.1.0 - long@4.0.0: {} - longest-streak@3.1.0: {} loose-envify@1.4.0: @@ -14695,7 +14607,6 @@ snapshots: matcher@3.0.0: dependencies: escape-string-regexp: 4.0.0 - optional: true math-intrinsics@1.1.0: {} @@ -15488,20 +15399,13 @@ snapshots: regex: 6.0.1 regex-recursion: 6.0.2 - onnx-proto@4.0.4: - dependencies: - protobufjs: 6.11.4 + onnxruntime-common@1.22.0: {} - onnxruntime-common@1.14.0: {} - - onnxruntime-web@1.14.0: + onnxruntime-node@1.22.0: dependencies: - flatbuffers: 1.12.0 - guid-typescript: 1.0.9 - long: 4.0.0 - onnx-proto: 4.0.4 - onnxruntime-common: 1.14.0 - platform: 1.3.6 + adm-zip: 0.5.16 + global-agent: 3.0.0 + onnxruntime-common: 1.22.0 openai@4.103.0(encoding@0.1.13)(ws@8.18.0)(zod@3.25.67): dependencies: @@ -15705,8 +15609,6 @@ snapshots: pify@2.3.0: {} - platform@1.3.6: {} - plist@3.1.0: dependencies: '@xmldom/xmldom': 0.8.10 @@ -15790,22 +15692,6 @@ snapshots: property-information@7.1.0: {} - protobufjs@6.11.4: - dependencies: - '@protobufjs/aspromise': 1.1.2 - '@protobufjs/base64': 1.1.2 - '@protobufjs/codegen': 2.0.4 - '@protobufjs/eventemitter': 1.1.0 - '@protobufjs/fetch': 1.1.0 - '@protobufjs/float': 1.0.2 - '@protobufjs/inquire': 1.1.0 - '@protobufjs/path': 1.1.2 - '@protobufjs/pool': 1.1.0 - '@protobufjs/utf8': 1.1.0 - '@types/long': 4.0.2 - '@types/node': 22.15.12 - long: 4.0.0 - proxy-addr@2.0.7: dependencies: forwarded: 0.2.0 @@ -16341,7 +16227,6 @@ snapshots: json-stringify-safe: 5.0.1 semver-compare: 1.0.0 sprintf-js: 1.1.3 - optional: true rollup@4.41.0: dependencies: @@ -16425,8 +16310,7 @@ snapshots: secure-json-parse@2.7.0: {} - semver-compare@1.0.0: - optional: true + semver-compare@1.0.0: {} semver@5.7.2: {} @@ -16462,7 +16346,6 @@ snapshots: serialize-error@7.0.1: dependencies: type-fest: 0.13.1 - optional: true serve-favicon@2.5.0: dependencies: @@ -17180,8 +17063,7 @@ snapshots: dependencies: prelude-ls: 1.2.1 - type-fest@0.13.1: - optional: true + type-fest@0.13.1: {} type-fest@0.21.3: {}