Optimise local transcription calls (#33)

* chore: move audio worklet file to assets

* chore: get rid of rickyvad and use vad model directly

* fix: handling of onnxruntime in packaged app

* chore: run ci on macos

* fix: formatting
This commit is contained in:
Haritabh 2025-07-03 12:18:47 +05:30 committed by GitHub
parent e4b4e92be4
commit 5eb5777001
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
20 changed files with 775 additions and 521 deletions

View file

@ -13,7 +13,7 @@ env:
jobs:
build:
runs-on: ubuntu-latest
runs-on: macos-latest
steps:
- name: Checkout repository

View file

@ -34,6 +34,7 @@ export const EXTERNAL_DEPENDENCIES = [
"@libsql/linux-x64-musl",
"@libsql/win32-x64-msvc",
"libsql",
"onnxruntime-node",
// Add any other native modules you need here
];
@ -195,13 +196,16 @@ const config: ForgeConfig = {
},
},
packagerConfig: {
asar: true,
asar: {
unpack: "{*.node,*.dylib,*.so,*.dll}",
},
name: "Amical",
executableName: "Amical",
icon: "./assets/logo.icns", // Path to your icon file
extraResource: [
"../../packages/native-helpers/swift-helper/bin",
"./src/db/migrations",
"./src/assets",
],
extendInfo: {
NSMicrophoneUsageDescription:

View file

@ -86,7 +86,6 @@
"@radix-ui/react-toggle": "^1.1.9",
"@radix-ui/react-toggle-group": "^1.1.10",
"@radix-ui/react-tooltip": "^1.2.7",
"@ricky0123/vad-web": "^0.0.24",
"@tabler/icons-react": "^3.34.0",
"@tanstack/react-query": "^5.81.2",
"@tanstack/react-table": "^8.21.3",
@ -116,6 +115,7 @@
"libsql": "^0.5.13",
"lucide-react": "^0.510.0",
"next-themes": "^0.4.6",
"onnxruntime-node": "^1.20.1",
"openai": "^4.98.0",
"react": "^19.1.0",
"react-day-picker": "8.10.1",

View file

@ -0,0 +1,56 @@
/**
 * AudioWorklet processor that regroups the incoming mono channel into
 * fixed-size frames and posts each one to the main thread.
 *
 * Outgoing messages on `port`:
 *   { type: 'audioFrame', frame: Float32Array, isFinal: boolean }
 * Incoming messages on `port`:
 *   { type: 'flush' } - post whatever is buffered as the final frame.
 *
 * Samples are staged in one preallocated Float32Array so that the audio
 * rendering thread does not grow and re-slice a plain JS array per frame.
 */
class AudioRecorderProcessor extends AudioWorkletProcessor {
  constructor() {
    super();
    this.frameSize = 512; // 32ms at 16kHz
    // Staging area for the frame currently being filled
    this.buffer = new Float32Array(this.frameSize);
    // Number of valid samples at the start of `buffer`
    this.bufferedSamples = 0;
    // Listen for control messages
    this.port.onmessage = (event) => {
      if (event.data.type === 'flush') {
        this.flushBuffer();
      }
    };
  }

  /**
   * Posts the partially filled frame (possibly empty) with `isFinal: true`.
   * A final frame is always sent, even when nothing is buffered, because it
   * doubles as the end-of-recording signal.
   */
  flushBuffer() {
    // slice() copies, so the staging buffer can be reused immediately
    const finalFrame = this.buffer.slice(0, this.bufferedSamples);
    this.bufferedSamples = 0;
    this.port.postMessage({
      type: 'audioFrame',
      frame: finalFrame,
      isFinal: true
    });
  }

  /**
   * Called by the audio engine for every render quantum.
   *
   * @param inputs Input list; only channel 0 of input 0 is read.
   * @returns Always `true`, so the processor is kept alive.
   */
  process(inputs) {
    const input = inputs[0];
    if (!input || !input[0]) return true;
    const channelData = input[0];

    // Copy the quantum in pieces so it works for any quantum/frame size ratio
    let readOffset = 0;
    while (readOffset < channelData.length) {
      const copyCount = Math.min(
        this.frameSize - this.bufferedSamples,
        channelData.length - readOffset
      );
      this.buffer.set(
        channelData.subarray(readOffset, readOffset + copyCount),
        this.bufferedSamples
      );
      this.bufferedSamples += copyCount;
      readOffset += copyCount;

      // A full frame is ready: send a copy to the main thread and start over
      if (this.bufferedSamples === this.frameSize) {
        this.port.postMessage({
          type: 'audioFrame',
          frame: this.buffer.slice(),
          isFinal: false
        });
        this.bufferedSamples = 0;
      }
    }
    return true;
  }
}
registerProcessor('audio-recorder-processor', AudioRecorderProcessor);

Binary file not shown.

View file

@ -1,65 +0,0 @@
// AudioWorklet processor source code
export const audioRecorderWorkletSource = `
// AudioWorklet processor for real-time audio capture
// This runs in the audio rendering thread for low-latency processing
/* eslint-env worker */
/* global AudioWorkletProcessor, registerProcessor */
class AudioRecorderProcessor extends AudioWorkletProcessor {
constructor() {
super();
this.bufferSize = 4096;
this.buffer = new Float32Array(this.bufferSize);
this.bufferIndex = 0;
// Listen for messages from main thread
this.port.onmessage = (event) => {
if (event.data.command === 'stop') {
this.sendBufferedAudio(true); // Send final chunk
}
};
}
process(inputs, _outputs, _parameters) {
const input = inputs[0];
// Check if we have input audio
if (input && input.length > 0) {
const inputChannel = input[0]; // Get first (mono) channel
// Buffer the audio data
for (let i = 0; i < inputChannel.length; i++) {
this.buffer[this.bufferIndex] = inputChannel[i];
this.bufferIndex++;
// When buffer is full, send it to main thread
if (this.bufferIndex >= this.bufferSize) {
this.sendBufferedAudio(false);
this.bufferIndex = 0; // Reset buffer
}
}
}
// Keep the processor alive
return true;
}
sendBufferedAudio(isFinal) {
if (this.bufferIndex > 0 || isFinal) {
// Create a copy of the current buffer data
const audioData = new Float32Array(this.bufferIndex);
audioData.set(this.buffer.subarray(0, this.bufferIndex));
// Send to main thread
this.port.postMessage({
type: 'audioData',
audioData: audioData,
isFinal: isFinal,
});
}
}
}
// Register the processor
registerProcessor('audio-recorder-processor', AudioRecorderProcessor);
`;

View file

@ -1,13 +1,17 @@
import { useState, useRef, useEffect } from "react";
import { MicVAD } from "@ricky0123/vad-web";
import { audioRecorderWorkletSource } from "./audio-recorder-worklet";
import { useRef, useEffect, useState, useCallback } from "react";
import audioWorkletUrl from "@/assets/audio-recorder-processor.js?url";
import { api } from "@/trpc/react";
// Audio configuration
const FRAME_SIZE = 512; // 32ms at 16kHz
const SAMPLE_RATE = 16000;
export interface UseAudioCaptureParams {
onAudioChunk: (
arrayBuffer: ArrayBuffer,
speechProbability: number,
isFinalChunk: boolean,
) => Promise<void> | void;
chunkDurationMs?: number;
enabled: boolean;
}
@ -15,268 +19,136 @@ export interface UseAudioCaptureOutput {
voiceDetected: boolean;
}
interface AudioCaptureState {
stream: MediaStream | null;
vad: MicVAD | null;
audioContext: AudioContext | null;
audioWorkletNode: AudioWorkletNode | null;
source: MediaStreamAudioSourceNode | null;
chunkTimer: NodeJS.Timeout | null;
pendingAudioChunks: Float32Array[];
sendAudioChunk: ((isFinal: boolean) => Promise<void>) | null;
}
export const useAudioCapture = ({
onAudioChunk,
chunkDurationMs = 28000,
enabled,
}: UseAudioCaptureParams): UseAudioCaptureOutput => {
const [voiceDetected, setVoiceDetected] = useState(false);
const stateRef = useRef<AudioCaptureState>({
stream: null,
vad: null,
audioContext: null,
audioWorkletNode: null,
source: null,
chunkTimer: null,
pendingAudioChunks: [],
sendAudioChunk: null,
const audioContextRef = useRef<AudioContext | null>(null);
const sourceRef = useRef<MediaStreamAudioSourceNode | null>(null);
const workletNodeRef = useRef<AudioWorkletNode | null>(null);
const streamRef = useRef<MediaStream | null>(null);
// Subscribe to voice detection updates via tRPC
api.recording.voiceDetectionUpdates.useSubscription(undefined, {
onData: (detected: boolean) => {
setVoiceDetected(detected);
},
onError: (err) => {
console.error("Voice detection subscription error:", err);
},
});
// Main effect to handle enabled state changes
useEffect(() => {
let isCancelled = false;
const startCapture = useCallback(async () => {
try {
console.log("AudioCapture: Starting audio capture");
const cleanup = async () => {
const state = stateRef.current;
// Send final chunk if we have pending audio
if (state.sendAudioChunk) {
try {
await state.sendAudioChunk(true);
} catch (error) {
console.error("AudioCapture: Error sending final chunk:", error);
}
}
// Clear chunk timer
if (state.chunkTimer) {
clearInterval(state.chunkTimer);
state.chunkTimer = null;
}
// Cleanup AudioWorklet
if (state.audioWorkletNode) {
state.audioWorkletNode.port.postMessage({ command: "stop" });
state.audioWorkletNode.disconnect();
state.audioWorkletNode = null;
}
if (state.source) {
state.source.disconnect();
state.source = null;
}
if (state.audioContext && state.audioContext.state !== "closed") {
await state.audioContext.close();
state.audioContext = null;
}
// Cleanup VAD
if (state.vad) {
try {
state.vad.destroy();
console.log("AudioCapture: VAD destroyed");
} catch (e) {
console.error("Error destroying VAD:", e);
}
state.vad = null;
}
// Stop media stream
if (state.stream) {
state.stream.getTracks().forEach((track) => {
try {
track.stop();
} catch (e) {
console.error("Error stopping stream track:", e);
}
});
state.stream = null;
}
// Reset state
state.pendingAudioChunks = [];
state.sendAudioChunk = null;
setVoiceDetected(false);
console.log("AudioCapture: Cleaned up");
};
const startCapture = async () => {
console.log("AudioCapture: Starting capture...");
try {
// Get microphone access
const stream = await navigator.mediaDevices.getUserMedia({
audio: true,
});
if (isCancelled) {
stream.getTracks().forEach((track) => track.stop());
return;
}
stateRef.current.stream = stream;
// Set up Web Audio API with AudioWorklet for raw PCM data
const audioContext = new AudioContext({ sampleRate: 16000 });
stateRef.current.audioContext = audioContext;
// Load AudioWorklet module using blob URL
const blob = new Blob([audioRecorderWorkletSource], {
type: "application/javascript",
});
const audioWorkletUrl = URL.createObjectURL(blob);
try {
await audioContext.audioWorklet.addModule(audioWorkletUrl);
} finally {
URL.revokeObjectURL(audioWorkletUrl);
}
if (isCancelled) {
await cleanup();
return;
}
const source = audioContext.createMediaStreamSource(stream);
stateRef.current.source = source;
// Create AudioWorklet node
const audioWorkletNode = new AudioWorkletNode(
audioContext,
"audio-recorder-processor",
);
stateRef.current.audioWorkletNode = audioWorkletNode;
// Create function to send accumulated chunks
const sendAudioChunk = async (isFinal = false) => {
const pendingChunks = stateRef.current.pendingAudioChunks;
if (pendingChunks.length > 0) {
// Combine all pending chunks into one array
const totalLength = pendingChunks.reduce(
(sum, chunk) => sum + chunk.length,
0,
);
const combinedChunk = new Float32Array(totalLength);
let offset = 0;
for (const chunk of pendingChunks) {
combinedChunk.set(chunk, offset);
offset += chunk.length;
}
// Convert Float32Array to ArrayBuffer for IPC
const arrayBuffer = combinedChunk.buffer.slice(
combinedChunk.byteOffset,
combinedChunk.byteOffset + combinedChunk.byteLength,
);
try {
await onAudioChunk(arrayBuffer, isFinal);
console.log(
`AudioCapture: Sent chunk: ${combinedChunk.length} samples, final: ${isFinal}`,
);
} catch (error) {
console.error("AudioCapture: Error processing chunk:", error);
}
stateRef.current.pendingAudioChunks = []; // Clear chunks after sending
}
};
stateRef.current.sendAudioChunk = sendAudioChunk;
// Handle messages from AudioWorklet
audioWorkletNode.port.onmessage = (event) => {
if (event.data.type === "audioData") {
const audioData = event.data.audioData as Float32Array;
const isFinal = event.data.isFinal as boolean;
// Store the audio chunk
stateRef.current.pendingAudioChunks.push(audioData);
if (isFinal) {
// Send final chunk immediately
sendAudioChunk(true);
}
}
};
// Set up periodic chunk sending
const chunkTimer = setInterval(() => {
sendAudioChunk(false);
}, chunkDurationMs);
stateRef.current.chunkTimer = chunkTimer;
// Connect the audio processing chain
source.connect(audioWorkletNode);
console.log("AudioCapture: Connected AudioWorklet processing chain");
// Set up VAD
const vad = await MicVAD.new({
stream,
model: "v5",
onSpeechStart: () => {
// Check if component is still mounted before updating state
if (!isCancelled) {
console.log("VAD: Speech started");
setVoiceDetected(true);
}
},
onSpeechEnd: () => {
console.log("VAD: Speech ended");
// Check if component is still mounted before updating state
if (!isCancelled) {
console.log("VAD: Speech ended");
setVoiceDetected(false);
}
},
});
// Store VAD reference immediately to ensure proper cleanup
stateRef.current.vad = vad;
if (isCancelled) {
await cleanup();
return;
}
vad.start();
console.log("AudioCapture: VAD started");
console.log("AudioCapture: Fully started");
} catch (err) {
console.error("AudioCapture: Error starting:", err);
await cleanup();
throw err;
}
};
// Handle enabled state
if (enabled) {
startCapture().catch((err) => {
console.error("AudioCapture: Failed to start:", err);
// Get microphone stream
streamRef.current = await navigator.mediaDevices.getUserMedia({
audio: {
channelCount: 1,
sampleRate: SAMPLE_RATE,
echoCancellation: true,
noiseSuppression: true,
autoGainControl: true,
},
});
// Create audio context
audioContextRef.current = new AudioContext({ sampleRate: SAMPLE_RATE });
// Load audio worklet
await audioContextRef.current.audioWorklet.addModule(audioWorkletUrl);
// Create nodes
sourceRef.current = audioContextRef.current.createMediaStreamSource(
streamRef.current,
);
workletNodeRef.current = new AudioWorkletNode(
audioContextRef.current,
"audio-recorder-processor",
);
// Handle audio frames from worklet
workletNodeRef.current.port.onmessage = async (event) => {
if (event.data.type === "audioFrame") {
const frame = event.data.frame;
const isFinal = event.data.isFinal || false;
// Convert to ArrayBuffer for IPC
const arrayBuffer = frame.buffer.slice(
frame.byteOffset,
frame.byteOffset + frame.byteLength,
);
// Send to main process for VAD processing
// Main process will update voice detection state
await onAudioChunk(arrayBuffer, 0, isFinal); // Speech probability will come from main
console.log(
`AudioCapture: Sent frame: ${frame.length} samples, isFinal: ${isFinal}`,
);
}
};
// Connect audio graph
sourceRef.current.connect(workletNodeRef.current);
console.log("AudioCapture: Audio capture started");
} catch (error) {
console.error("AudioCapture: Failed to start capture:", error);
throw error;
}
}, [onAudioChunk]);
const stopCapture = useCallback(() => {
console.log("AudioCapture: Stopping audio capture");
// Send flush command to worklet before disconnecting
if (workletNodeRef.current) {
workletNodeRef.current.port.postMessage({ type: "flush" });
console.log("AudioCapture: Sent flush command to worklet");
}
// Cleanup function
return () => {
isCancelled = true;
cleanup().catch((err) => {
console.error("AudioCapture: Cleanup error:", err);
// Disconnect nodes
if (sourceRef.current && workletNodeRef.current) {
sourceRef.current.disconnect(workletNodeRef.current);
}
// Close audio context
if (audioContextRef.current && audioContextRef.current.state !== "closed") {
audioContextRef.current.close();
}
// Stop media stream
if (streamRef.current) {
streamRef.current.getTracks().forEach((track) => track.stop());
}
// Clear refs
audioContextRef.current = null;
sourceRef.current = null;
workletNodeRef.current = null;
streamRef.current = null;
setVoiceDetected(false);
console.log("AudioCapture: Audio capture stopped");
}, []);
// Start/stop based on enabled state
useEffect(() => {
if (enabled) {
startCapture().catch((error) => {
console.error("AudioCapture: Failed to start:", error);
});
} else {
stopCapture();
}
return () => {
stopCapture();
};
}, [enabled, onAudioChunk, chunkDurationMs]);
}, [enabled, startCapture, stopCapture]);
return {
voiceDetected,

View file

@ -4,11 +4,11 @@ import { useAudioCapture } from "./useAudioCapture";
import type { RecordingState } from "@/types/recording";
export interface UseRecordingParams {
onAudioChunk: (
arrayBuffer: ArrayBuffer,
isFinalChunk: boolean,
onAudioFrame: (
audioBuffer: ArrayBuffer,
speechProbability: number,
isFinal: boolean,
) => Promise<void> | void;
chunkDurationMs?: number;
onRecordingStartCallback?: () => Promise<void> | void;
onRecordingStopCallback?: () => Promise<void> | void;
}
@ -21,8 +21,7 @@ export interface UseRecordingOutput {
}
export const useRecording = ({
onAudioChunk,
chunkDurationMs = 28000,
onAudioFrame,
onRecordingStartCallback,
onRecordingStopCallback,
}: UseRecordingParams): UseRecordingOutput => {
@ -33,13 +32,25 @@ export const useRecording = ({
stopRecording: stopRecordingMutation,
} = useRecordingState();
// Relay each captured frame to the caller-supplied onAudioFrame callback unchanged
const handleAudioChunk = useCallback(
  async (
    frameBuffer: ArrayBuffer,
    probability: number,
    isLastFrame: boolean,
  ) => {
    // Nothing is aggregated in this hook; the arguments are forwarded as-is
    await onAudioFrame(frameBuffer, probability, isLastFrame);
  },
  [onAudioFrame],
);
// Manage audio capture when recording is active
const isActive =
recordingStatus === "recording" || recordingStatus === "starting";
const { voiceDetected } = useAudioCapture({
onAudioChunk,
chunkDurationMs,
onAudioChunk: handleAudioChunk,
enabled: isActive,
});
@ -121,7 +132,12 @@ export const useRecording = ({
} catch (error) {
console.error("Hook: Error stopping recording:", error);
}
}, [recordingStatus, stopRecordingMutation, onRecordingStopCallback]);
}, [
recordingStatus,
stopRecordingMutation,
onRecordingStopCallback,
onAudioFrame,
]);
return {
recordingStatus,

View file

@ -4,6 +4,7 @@ import { logger, logPerformance } from "../logger";
import { ServiceManager } from "./service-manager";
import { appContextStore } from "../../stores/app-context";
import type { RecordingState, RecordingStatus } from "../../types/recording";
import { WindowManager } from "../core/window-manager";
/**
* Manages recording state and coordinates audio recording across the application
@ -13,12 +14,17 @@ export class RecordingManager extends EventEmitter {
private currentSessionId: string | null = null;
private recordingState: RecordingState = "idle";
private lastError: string | undefined;
private windowManager: WindowManager | null = null;
constructor(private serviceManager: ServiceManager) {
super();
this.setupIPCHandlers();
}
/**
 * Stores the window manager reference on this recording manager.
 *
 * @param windowManager Instance to keep in `this.windowManager`.
 */
public setWindowManager(windowManager: WindowManager): void {
this.windowManager = windowManager;
}
private setState(newState: RecordingState, error?: string): void {
const oldState = this.recordingState;
this.recordingState = newState;

View file

@ -6,6 +6,7 @@ import { SwiftIOBridge } from "../../services/platform/swift-bridge-service";
import { AutoUpdaterService } from "../services/auto-updater";
import { WindowManager } from "../core/window-manager";
import { RecordingManager } from "./recording-manager";
import { VADService } from "../../services/vad-service";
/**
* Manages service initialization and lifecycle
@ -17,6 +18,7 @@ export class ServiceManager {
private modelManagerService: ModelManagerService | null = null;
private transcriptionService: TranscriptionService | null = null;
private settingsService: SettingsService | null = null;
private vadService: VADService | null = null;
private swiftIOBridge: SwiftIOBridge | null = null;
private autoUpdaterService: AutoUpdaterService | null = null;
@ -34,8 +36,9 @@ export class ServiceManager {
this.initializeSettingsService();
await this.initializeModelServices();
this.initializePlatformServices();
await this.initializeVADService();
await this.initializeAIServices();
this.initializeRecordingManager();
this.initializeRecordingManager(windowManager);
this.initializeAutoUpdater(windowManager);
this.isInitialized = true;
@ -57,6 +60,17 @@ export class ServiceManager {
await this.modelManagerService.initialize();
}
/**
 * Creates and initializes the VAD service.
 *
 * Failures are logged and swallowed because VAD is not critical for basic
 * functionality. `this.vadService` is only set once initialization has
 * succeeded, so on failure it stays `null` and consumers see "no VAD"
 * instead of receiving a service that never finished initializing.
 */
private async initializeVADService(): Promise<void> {
  try {
    const vadService = new VADService();
    await vadService.initialize();
    // Publish the instance only after it is known to be usable
    this.vadService = vadService;
    logger.main.info("VAD service initialized");
  } catch (error) {
    this.vadService = null;
    logger.main.error("Failed to initialize VAD service:", error);
    // Don't throw - VAD is not critical for basic functionality
  }
}
private async initializeAIServices(): Promise<void> {
try {
if (!this.modelManagerService) {
@ -65,7 +79,9 @@ export class ServiceManager {
this.transcriptionService = new TranscriptionService(
this.modelManagerService,
this.vadService,
);
await this.transcriptionService.initialize();
// Load and configure formatter
try {
@ -109,8 +125,9 @@ export class ServiceManager {
}
}
private initializeRecordingManager(): void {
private initializeRecordingManager(windowManager: WindowManager): void {
this.recordingManager = new RecordingManager(this);
this.recordingManager.setWindowManager(windowManager);
logger.main.info("Recording manager initialized");
}
@ -191,6 +208,15 @@ export class ServiceManager {
return this.recordingManager;
}
/**
 * Returns the VAD service held by this manager, or `null` when none is set.
 *
 * @throws Error if the manager's `isInitialized` flag is not set.
 */
getVADService(): VADService | null {
  if (this.isInitialized) {
    return this.vadService;
  }
  throw new Error("ServiceManager not initialized. Call initialize() first.");
}
async cleanup(): Promise<void> {
if (this.recordingManager) {
logger.main.info("Cleaning up recording manager...");
@ -201,6 +227,11 @@ export class ServiceManager {
this.modelManagerService.cleanup();
}
if (this.vadService) {
logger.main.info("Cleaning up VAD service...");
await this.vadService.dispose();
}
if (this.swiftIOBridge) {
logger.main.info("Stopping Swift helper...");
this.swiftIOBridge.stopHelper();

View file

@ -10,6 +10,7 @@ export { PipelineContext, SharedPipelineData } from "./context";
// Transcription input parameters
export interface TranscribeParams {
audioData: Buffer;
speechProbability?: number; // Speech probability from frontend VAD (0-1)
context: {
vocabulary?: Map<string, string>;
accessibilityContext?: GetAccessibilityContextResult | null;
@ -34,6 +35,7 @@ export interface FormatParams {
export interface TranscriptionProvider {
readonly name: string;
transcribe(params: TranscribeParams): Promise<string>;
flush?(): Promise<string>; // Optional flush method for providers that buffer
}
// Formatting provider interface

View file

@ -12,6 +12,19 @@ export class WhisperProvider implements TranscriptionProvider {
private modelManager: ModelManagerService;
private whisperInstance: Whisper | null = null;
// Frame aggregation state
private frameBuffer: Float32Array[] = [];
private frameBufferSpeechProbabilities: number[] = []; // Track speech probabilities for each frame
private silenceFrameCount = 0;
private lastSpeechTimestamp = 0;
// Configuration
private readonly FRAME_SIZE = 512; // 32ms at 16kHz
private readonly MIN_SPEECH_DURATION_MS = 500; // Minimum speech duration to transcribe
private readonly MAX_SILENCE_DURATION_MS = 2000; // Max silence before cutting
private readonly SAMPLE_RATE = 16000;
private readonly SPEECH_PROBABILITY_THRESHOLD = 0.2; // Threshold for speech detection
constructor(modelManager: ModelManagerService) {
this.modelManager = modelManager;
}
@ -21,20 +34,53 @@ export class WhisperProvider implements TranscriptionProvider {
await this.initializeWhisper();
// Extract parameters from the new structure
const { audioData, context } = params;
const { audioData, speechProbability = 0, context } = params;
const { vocabulary, previousChunk, aggregatedTranscription } = context;
// Convert audio buffer to the format expected by smart-whisper
const audioFloat32Array = await this.convertAudioBuffer(audioData);
// Add frame to buffer with speech probability
this.frameBuffer.push(audioFloat32Array);
this.frameBufferSpeechProbabilities.push(speechProbability);
// Consider it speech if probability is above threshold
const isSpeech = speechProbability > this.SPEECH_PROBABILITY_THRESHOLD;
logger.transcription.debug(
`Starting transcription, audio size: ${audioData.length}`,
previousChunk
? `Previous chunk: ${previousChunk.substring(0, 50)}...`
: "No previous chunk",
aggregatedTranscription
? `Aggregated length: ${aggregatedTranscription.length}`
: "No aggregated transcription",
`Frame received - SpeechProb: ${speechProbability.toFixed(3)}, Buffer size: ${this.frameBuffer.length}, Silence count: ${this.silenceFrameCount}`,
);
// Handle speech/silence logic
if (isSpeech) {
this.silenceFrameCount = 0;
this.lastSpeechTimestamp = Date.now();
} else {
this.silenceFrameCount++;
}
// Determine if we should transcribe
const shouldTranscribe = this.shouldTranscribe();
if (!shouldTranscribe) {
// Keep buffering
return "";
}
// Aggregate buffered frames
const aggregatedAudio = this.aggregateFrames();
// Skip if too short or only silence
if (aggregatedAudio.length < this.FRAME_SIZE * 2) {
logger.transcription.debug("Skipping transcription - audio too short");
this.frameBuffer = [];
this.frameBufferSpeechProbabilities = [];
this.silenceFrameCount = 0;
return "";
}
logger.transcription.debug(
`Starting transcription of ${aggregatedAudio.length} samples (${((aggregatedAudio.length / this.SAMPLE_RATE) * 1000).toFixed(0)}ms)`,
);
// Transcribe using smart-whisper
@ -49,10 +95,13 @@ export class WhisperProvider implements TranscriptionProvider {
);
const { result } = await this.whisperInstance.transcribe(
audioFloat32Array,
aggregatedAudio,
{
language: "auto",
initial_prompt: initialPrompt,
suppress_blank: true,
suppress_non_speech_tokens: true,
no_timestamps: true,
},
);
@ -68,6 +117,11 @@ export class WhisperProvider implements TranscriptionProvider {
`Transcription completed, length: ${text.length}`,
);
// Clear buffer after successful transcription
this.frameBuffer = [];
this.frameBufferSpeechProbabilities = [];
this.silenceFrameCount = 0;
return text;
} catch (error) {
logger.transcription.error("Transcription failed:", error);
@ -75,6 +129,112 @@ export class WhisperProvider implements TranscriptionProvider {
}
}
/**
 * Decides whether the buffered frames should be transcribed now.
 *
 * Returns `true` when either:
 *  1. the buffer is non-empty and the current run of non-speech frames is
 *     longer than MAX_SILENCE_DURATION_MS, or
 *  2. the buffer holds more than 30 seconds of audio.
 *
 * Durations are derived from frame counts, i.e. every buffered frame is
 * assumed to be FRAME_SIZE samples long. Note that condition 1 does not
 * check that any speech frame was actually buffered.
 */
private shouldTranscribe(): boolean {
  const maxBufferDurationMs = 30000;

  const bufferDurationMs =
    ((this.frameBuffer.length * this.FRAME_SIZE) / this.SAMPLE_RATE) * 1000;
  const silenceDurationMs =
    ((this.silenceFrameCount * this.FRAME_SIZE) / this.SAMPLE_RATE) * 1000;

  // Enough trailing silence: cut here and transcribe what we have
  if (
    this.frameBuffer.length > 0 &&
    silenceDurationMs > this.MAX_SILENCE_DURATION_MS
  ) {
    logger.transcription.debug(
      `Transcribing due to ${silenceDurationMs}ms of silence`,
    );
    return true;
  }

  // Buffer is too large: transcribe anyway
  if (bufferDurationMs > maxBufferDurationMs) {
    logger.transcription.debug(
      `Transcribing due to buffer size: ${bufferDurationMs}ms`,
    );
    return true;
  }

  // Normal "keep buffering" outcome, hit on almost every frame, so this is
  // debug-level rather than an error.
  logger.transcription.debug("Not transcribing", {
    bufferDurationMs,
    silenceDurationMs,
    frameBufferLength: this.frameBuffer.length,
    silenceFrameCount: this.silenceFrameCount,
  });
  return false;
}
/**
 * Joins every buffered frame into one contiguous Float32Array, in buffer
 * order, and returns the result of passing it through `trimSilence`.
 */
private aggregateFrames(): Float32Array {
  // First pass: how many samples are buffered in total
  let sampleCount = 0;
  for (const chunk of this.frameBuffer) {
    sampleCount += chunk.length;
  }

  // Second pass: copy each frame behind the previous one
  const joined = new Float32Array(sampleCount);
  let writePos = 0;
  this.frameBuffer.forEach((chunk) => {
    joined.set(chunk, writePos);
    writePos += chunk.length;
  });

  return this.trimSilence(joined);
}
/**
 * Cuts leading and trailing non-speech frames off the aggregated audio.
 *
 * A frame counts as speech when its recorded probability is above
 * SPEECH_PROBABILITY_THRESHOLD. Sample offsets are accumulated from the
 * real length of each buffered frame, so frames that are not exactly
 * FRAME_SIZE samples long do not shift the cut points. If a probability has
 * no matching buffered frame, FRAME_SIZE is used for that entry.
 *
 * @param audio Concatenation of the frames currently in `frameBuffer`.
 * @returns The span from the first to the last speech frame, or the whole
 *          input when no frame is above the threshold.
 */
private trimSilence(audio: Float32Array): Float32Array {
  let startIdx = 0;
  let endIdx = audio.length;
  let speechSeen = false;
  let offset = 0;

  for (let i = 0; i < this.frameBufferSpeechProbabilities.length; i++) {
    const frameLength = this.frameBuffer[i]?.length ?? this.FRAME_SIZE;
    if (
      this.frameBufferSpeechProbabilities[i] >
      this.SPEECH_PROBABILITY_THRESHOLD
    ) {
      if (!speechSeen) {
        // First speech frame marks where the kept span begins
        startIdx = offset;
        speechSeen = true;
      }
      // Every later speech frame pushes the end of the kept span forward
      endIdx = offset + frameLength;
    }
    offset += frameLength;
  }

  return audio.slice(startIdx, Math.min(endIdx, audio.length));
}
// Force transcription of any remaining frames
/**
 * Forces transcription of whatever frames are still buffered.
 *
 * @returns The text returned by `transcribe`, or "" when nothing is
 *          buffered.
 */
async flush(): Promise<string> {
  if (this.frameBuffer.length === 0) {
    return "";
  }

  // Routine end-of-stream step, so this is logged at debug level, not error
  logger.transcription.debug(`Flushing ${this.frameBuffer.length} frames`);

  // Force transcription by setting high silence count: 999 frames is far
  // above MAX_SILENCE_DURATION_MS, so shouldTranscribe() takes its silence
  // branch on the call below.
  this.silenceFrameCount = 999;

  return this.transcribe({
    audioData: Buffer.alloc(0), // Empty buffer, we'll use the buffered frames
    speechProbability: 0,
    context: {},
  });
}
private generateInitialPrompt(
vocabulary?: Map<string, string>,
aggregatedTranscription?: string,
@ -163,5 +323,10 @@ export class WhisperProvider implements TranscriptionProvider {
this.whisperInstance = null;
}
}
// Clear buffers
this.frameBuffer = [];
this.frameBufferSpeechProbabilities = [];
this.silenceFrameCount = 0;
}
}

View file

@ -19,24 +19,27 @@ export const FloatingButton: React.FC = () => {
};
}, []);
const handleAudioChunk = useCallback(
async (audioChunk: ArrayBuffer, isFinalChunk: boolean) => {
const handleAudioFrame = useCallback(
async (
audioBuffer: ArrayBuffer,
speechProbability: number,
isFinal: boolean,
) => {
try {
// Send the audio chunk regardless of whether it's final or not
await window.electronAPI.sendAudioChunk(audioChunk, isFinalChunk);
console.debug(`Sent audio chunk`, {
chunkSize: audioChunk.byteLength,
isFinalChunk,
// Send frame directly to main process
// TODO: We need to update the IPC to include speech detection info
await window.electronAPI.sendAudioChunk(audioBuffer, isFinal);
console.debug(`Sent audio frame`, {
size: audioBuffer.byteLength,
speechProbability: speechProbability.toFixed(3),
isFinal,
});
if (isFinalChunk) {
console.log("Final chunk sent to main process");
// You might want to add a specific IPC call here if the main process needs an explicit signal
// to finalize transcription, e.g., window.electronAPI.finalizeTranscription();
// For now, we assume sendAudioChunk is enough and the main process handles the stream end.
if (isFinal) {
console.log("Final frame sent to main process");
}
} catch (error) {
console.error("Error sending audio chunk:", error);
console.error("Error sending audio frame:", error);
}
},
[],
@ -44,8 +47,7 @@ export const FloatingButton: React.FC = () => {
const { recordingStatus, startRecording, stopRecording, voiceDetected } =
useRecording({
onAudioChunk: handleAudioChunk,
// Optionally, set chunkDurationMs here if needed, e.g., chunkDurationMs: 250
onAudioFrame: handleAudioFrame,
});
const isRecording =
recordingStatus === "recording" || recordingStatus === "starting";

View file

@ -7,11 +7,11 @@ import { createDefaultContext } from "../pipeline/core/context";
import { WhisperProvider } from "../pipeline/providers/transcription/whisper-provider";
import { OpenRouterProvider } from "../pipeline/providers/formatting/openrouter-formatter";
import { ModelManagerService } from "../services/model-manager";
import { ServiceManager } from "../main/managers/service-manager";
import { appContextStore } from "../stores/app-context";
import { createTranscription } from "../db/transcriptions";
import { logger } from "../main/logger";
import { v4 as uuid } from "uuid";
import { VADService } from "./vad-service";
/**
* Service for audio transcription and optional formatting
@ -21,9 +21,23 @@ export class TranscriptionService {
private openRouterProvider: OpenRouterProvider | null = null;
private formatterEnabled = false;
private streamingSessions: Map<string, StreamingSession> = new Map();
private vadService: VADService | null = null;
constructor(modelManagerService: ModelManagerService) {
constructor(
modelManagerService: ModelManagerService,
vadService: VADService | null = null,
) {
this.whisperProvider = new WhisperProvider(modelManagerService);
this.vadService = vadService;
}
/**
 * Logs whether a VAD service was supplied to this transcription service,
 * then logs that initialization is complete. Performs no other work.
 */
async initialize(): Promise<void> {
  if (!this.vadService) {
    logger.transcription.warn("VAD service not available");
  } else {
    logger.transcription.info("Using VAD service");
  }

  logger.transcription.info("Transcription service initialized");
}
/**
@ -62,6 +76,26 @@ export class TranscriptionService {
isFinal?: boolean;
}): Promise<string> {
const { sessionId, audioChunk, isFinal = false } = options;
console.error("processing streaming chunk", {
length: audioChunk.length,
});
// Run VAD on the audio chunk
let speechProbability = 0;
let isSpeaking = false;
if (audioChunk.length > 0 && this.vadService) {
const vadResult = await this.vadService.processAudioFrame(
audioChunk.buffer as ArrayBuffer,
);
speechProbability = vadResult.probability;
isSpeaking = vadResult.isSpeaking;
logger.transcription.debug("VAD result", {
probability: speechProbability.toFixed(3),
isSpeaking,
});
}
// Auto-create session if it doesn't exist
let session = this.streamingSessions.get(sessionId);
@ -90,7 +124,7 @@ export class TranscriptionService {
// Process chunk if it has content
if (audioChunk.length > 0) {
// Direct provider call - no step wrapper
// Direct frame to Whisper - it will handle aggregation and VAD internally
const previousChunk =
session.transcriptionResults.length > 0
? session.transcriptionResults[
@ -103,6 +137,7 @@ export class TranscriptionService {
const chunkTranscription = await this.whisperProvider.transcribe({
audioData: audioChunk,
speechProbability: speechProbability, // Now from VAD service
context: {
vocabulary: session.context.sharedData.vocabulary,
accessibilityContext: session.context.sharedData.accessibilityContext,
@ -111,22 +146,39 @@ export class TranscriptionService {
},
});
// Accumulate the result
// Accumulate the result only if Whisper returned something
// (it returns empty string while buffering)
if (chunkTranscription.trim()) {
session.transcriptionResults.push(chunkTranscription);
logger.transcription.info("Whisper returned transcription", {
sessionId,
transcriptionLength: chunkTranscription.length,
totalResults: session.transcriptionResults.length,
});
}
logger.transcription.debug("Processed chunk", {
logger.transcription.error("Processed frame", {
sessionId,
chunkSize: audioChunk.length,
transcriptionLength: chunkTranscription.length,
totalResults: session.transcriptionResults.length,
frameSize: audioChunk.length,
hadTranscription: chunkTranscription.length > 0,
isFinal,
});
}
// If this is the final chunk, apply formatting and save
// If this is the final chunk, flush any remaining audio and apply formatting
if (isFinal) {
// Flush any remaining buffered audio in Whisper
if (this.whisperProvider.flush) {
const flushResult = await this.whisperProvider.flush();
if (flushResult.trim()) {
session.transcriptionResults.push(flushResult);
logger.transcription.info("Flushed final audio", {
sessionId,
flushLength: flushResult.length,
});
}
}
// Get complete transcription
let completeTranscription = session.transcriptionResults.join(" ").trim();
@ -137,7 +189,7 @@ export class TranscriptionService {
});
// Format if enabled
if (this.formatterEnabled && this.openRouterProvider) {
if (this.formatterEnabled && this.openRouterProvider && false) {
const style =
session.context.sharedData.userPreferences?.formattingStyle;
completeTranscription = await this.openRouterProvider.format({
@ -188,19 +240,9 @@ export class TranscriptionService {
// Create default context
const context = createDefaultContext(uuid());
// Simple context building - no complex loading
const serviceManager = ServiceManager.getInstance();
if (serviceManager) {
try {
const settingsService = serviceManager.getSettingsService();
const formatterConfig = await settingsService.getFormatterConfig();
} catch (error) {
logger.transcription.warn("Failed to load formatter config", { error });
}
}
// TODO: Load actual vocabulary
// TODO: Load user preferences from settings
// TODO: Load formatter config from settings
return context;
}
@ -210,6 +252,7 @@ export class TranscriptionService {
*/
async dispose(): Promise<void> {
  // Only the Whisper provider is released here.
  const provider = this.whisperProvider;
  await provider.dispose();

  // The VAD service is intentionally left alone: it is managed by ServiceManager.
  logger.transcription.info("Transcription service disposed");
}
}

View file

@ -0,0 +1,192 @@
import * as ort from "onnxruntime-node";
import { logger } from "../main/logger";
import { app } from "electron";
import * as path from "path";
import { EventEmitter } from "node:events";
import { existsSync } from "node:fs";
/**
 * Voice-activity detector backed by the Silero VAD v5 ONNX model, run through
 * `onnxruntime-node` on the CPU execution provider.
 *
 * Audio is fed one frame at a time (512 samples at 16 kHz). Each frame yields a
 * raw speech probability, which is then smoothed into a boolean "speaking"
 * state. Transitions of that state are published as a `"voice-detected"` event
 * carrying `true` (speech started) or `false` (speech ended).
 */
export class VADService extends EventEmitter {
  private session: ort.InferenceSession | null = null;
  private modelPath: string | null = null;
  private state: ort.Tensor | null = null;
  private sr: number = 16000;

  // Configuration
  private readonly WINDOW_SIZE_SAMPLES = 512; // 32ms at 16kHz
  private readonly SPEECH_THRESHOLD = 0.2;
  // Consecutive non-speech frames tolerated before speech is considered over.
  private readonly REDEMPTION_FRAMES = 8;
  // Speech frames required before the detector reports that speech started.
  private readonly MIN_SPEECH_FRAMES = 3;

  // State
  private speechFrameCount = 0;
  private silenceFrameCount = 0;
  private isSpeaking = false;

  constructor() {
    super();
  }

  /**
   * Locates the model file, loads it into an ONNX inference session and
   * zeroes the recurrent state.
   *
   * @throws Error when the model file does not exist at the resolved path;
   *   any error raised while creating the session is logged and rethrown.
   */
  async initialize(): Promise<void> {
    try {
      // Handle both development and production paths
      if (app.isPackaged) {
        // In production, the assets are copied to the resources folder
        this.modelPath = path.join(
          process.resourcesPath,
          "assets",
          "silero_vad_v5.onnx",
        );
      } else {
        // In development, use the source path
        this.modelPath = path.join(
          __dirname,
          "../../src/assets/silero_vad_v5.onnx",
        );
      }

      logger.main.info("Loading VAD model from", this.modelPath);

      // Fail early with an actionable message instead of an opaque ONNX error.
      if (!existsSync(this.modelPath)) {
        throw new Error(
          `VAD model file not found at: ${this.modelPath}. ` +
            `Make sure the ONNX model is in the assets folder.`,
        );
      }

      // Load ONNX model
      this.session = await ort.InferenceSession.create(this.modelPath, {
        executionProviders: ["cpu"], // Use CPU provider for compatibility
      });

      this.resetStates();
      logger.main.info("VAD service initialized successfully");
    } catch (error) {
      logger.main.error("Failed to initialize VAD service:", error);
      throw error;
    }
  }

  /** Replaces the recurrent model state with an all-zero tensor. */
  private resetStates(): void {
    // Silero VAD uses a state tensor with shape [2, 1, 128].
    // A freshly allocated Float32Array is already zero-filled.
    this.state = new ort.Tensor(
      "float32",
      new Float32Array(2 * 1 * 128),
      [2, 1, 128],
    );
  }

  /**
   * Runs one inference step on the given samples and updates both the model's
   * recurrent state and the smoothed speaking state.
   *
   * @param audioFrames Samples passed to the model as a `[1, length]` tensor.
   * @returns The raw speech probability and the smoothed speaking flag.
   * @throws Error when the service has not been initialized, or when the model
   *   does not return the `output` / `stateN` tensors; inference errors are
   *   logged and rethrown.
   */
  async processBatch(
    audioFrames: Float32Array,
  ): Promise<{ probability: number; isSpeaking: boolean }> {
    if (!this.session || !this.state) {
      throw new Error("VAD service not initialized");
    }

    try {
      // Create input tensor - shape should be [1, audio_length]
      const inputTensor = new ort.Tensor("float32", audioFrames, [
        1,
        audioFrames.length,
      ]);
      // Sample rate is passed as a scalar (rank-0) int64 tensor.
      const srTensor = new ort.Tensor(
        "int64",
        BigInt64Array.from([BigInt(this.sr)]),
        [],
      );

      // Run inference with input, state, and sr
      const results = await this.session.run({
        input: inputTensor,
        state: this.state,
        sr: srTensor,
      });

      const nextState = results.stateN as ort.Tensor | undefined;
      const output = results.output as ort.Tensor | undefined;
      // Validate before touching this.state: overwriting it with `undefined`
      // would make every later call fail with a misleading
      // "VAD service not initialized" error.
      if (!nextState || !output) {
        throw new Error(
          "VAD model did not return the expected 'output' and 'stateN' tensors",
        );
      }

      // Carry the recurrent state over to the next frame.
      this.state = nextState;

      const probability = output.data[0] as number;
      const isSpeaking = this.applySpeechDetectionLogic(probability);

      return { probability, isSpeaking };
    } catch (error) {
      logger.main.error("VAD inference failed:", error);
      throw error;
    }
  }

  /**
   * Smooths per-frame probabilities into a stable speaking / not-speaking
   * state and emits `"voice-detected"` on every transition.
   *
   * Speech starts once `MIN_SPEECH_FRAMES` speech frames have accumulated and
   * ends after `REDEMPTION_FRAMES` consecutive non-speech frames.
   *
   * @param probability Speech probability of the current frame.
   * @returns The speaking state after this frame has been taken into account.
   */
  private applySpeechDetectionLogic(probability: number): boolean {
    if (probability > this.SPEECH_THRESHOLD) {
      this.speechFrameCount++;
      this.silenceFrameCount = 0;
    } else {
      this.silenceFrameCount++;
      // Clear the accumulated speech evidence on the same frame that ends
      // speech (>=, not >). Otherwise a single speech frame arriving right
      // after the end would immediately re-trigger "speaking" and bypass the
      // onset debounce.
      if (this.silenceFrameCount >= this.REDEMPTION_FRAMES) {
        this.speechFrameCount = 0;
      }
    }

    // Start speaking after enough speech frames
    if (!this.isSpeaking && this.speechFrameCount >= this.MIN_SPEECH_FRAMES) {
      this.isSpeaking = true;
      logger.main.debug("Speech started");
      this.emit("voice-detected", true);
    }

    // Stop speaking after enough silence
    if (this.isSpeaking && this.silenceFrameCount >= this.REDEMPTION_FRAMES) {
      this.isSpeaking = false;
      logger.main.debug("Speech ended");
      this.emit("voice-detected", false);
    }

    return this.isSpeaking;
  }

  /**
   * Converts a raw buffer into a single 512-sample frame and runs it through
   * the detector.
   *
   * The whole buffer is interpreted as 32-bit float samples. Shorter input
   * (e.g. a final buffer flush) is zero-padded to 512 samples; longer input is
   * truncated to its first 512 samples.
   *
   * @param audioBuffer Buffer holding Float32 samples.
   * @returns The raw speech probability and the smoothed speaking flag.
   */
  async processAudioFrame(
    audioBuffer: ArrayBuffer,
  ): Promise<{ probability: number; isSpeaking: boolean }> {
    const samples = new Float32Array(audioBuffer);

    // Silero VAD requires exactly 512 samples
    if (samples.length === this.WINDOW_SIZE_SAMPLES) {
      return this.processBatch(samples);
    }

    // A new Float32Array is zero-filled, so copying at most one window of
    // samples into it covers both the padding and the truncation case.
    const frame = new Float32Array(this.WINDOW_SIZE_SAMPLES);
    frame.set(samples.subarray(0, this.WINDOW_SIZE_SAMPLES));
    return this.processBatch(frame);
  }

  /** Returns the current smoothed speaking state. */
  getSpeechState(): boolean {
    return this.isSpeaking;
  }

  /** Releases the ONNX session (if any) and drops the recurrent state. */
  async dispose(): Promise<void> {
    if (this.session) {
      await this.session.release();
      this.session = null;
    }
    this.state = null;
    logger.main.info("VAD service disposed");
  }
}

View file

@ -3,6 +3,7 @@ import { observable } from "@trpc/server/observable";
import superjson from "superjson";
import { ServiceManager } from "../../main/managers/service-manager";
import type { RecordingStatus } from "../../types/recording";
import { logger } from "../../main/logger";
const t = initTRPC.create({
isServer: true,
@ -61,4 +62,36 @@ export const recordingRouter = t.router({
};
});
}),
// Voice detection subscription: streams `true` when the VAD service reports
// that speech started and `false` when it reports that speech ended.
voiceDetectionUpdates: t.procedure.subscription(() => {
  return observable<boolean>((emit) => {
    const serviceManager = ServiceManager.getInstance();
    if (!serviceManager) {
      throw new Error("ServiceManager not initialized");
    }

    const vadService = serviceManager.getVADService();
    if (!vadService) {
      logger.main.warn(
        "VAD service not available for voice detection subscription",
      );
      // Without a VAD service no further events can ever arrive, so report
      // "no voice" once and complete the stream instead of leaving the
      // subscription open forever.
      emit.next(false);
      emit.complete();
      return () => {};
    }

    // Forward every speaking-state transition to the subscriber.
    const handleVoiceDetection = (detected: boolean) => {
      emit.next(detected);
    };
    vadService.on("voice-detected", handleVoiceDetection);

    // Teardown: detach the listener when the client unsubscribes.
    return () => {
      vadService.off("voice-detected", handleVoiceDetection);
    };
  });
}),
});

13
apps/desktop/src/types/vite-env.d.ts vendored Normal file
View file

@ -0,0 +1,13 @@
/// <reference types="vite/client" />
// Ambient declaration so TypeScript accepts imports whose specifier ends in
// `?url`; the default export of such an import is typed as a string.
declare module "*?url" {
const url: string;
export default url;
}
// Ambient declaration so TypeScript accepts imports whose specifier ends in
// `?raw`; the default export of such an import is typed as a string.
declare module "*?raw" {
const content: string;
export default content;
}

View file

@ -14,6 +14,7 @@ export default defineConfig({
"@libsql/linux-x64-musl",
"@libsql/win32-x64-msvc",
"libsql",
"onnxruntime-node",
/^node:/,
/^electron$/,
],

View file

@ -38,7 +38,8 @@
"drizzle-orm/libsql",
"@libsql",
"macos-alias",
"fs-xattr"
"fs-xattr",
"onnxruntime-node"
]
}
}

168
pnpm-lock.yaml generated
View file

@ -128,9 +128,6 @@ importers:
'@radix-ui/react-tooltip':
specifier: ^1.2.7
version: 1.2.7(@types/react-dom@19.1.5(@types/react@19.1.5))(@types/react@19.1.5)(react-dom@19.1.0(react@19.1.0))(react@19.1.0)
'@ricky0123/vad-web':
specifier: ^0.0.24
version: 0.0.24
'@tabler/icons-react':
specifier: ^3.34.0
version: 3.34.0(react@19.1.0)
@ -218,6 +215,9 @@ importers:
next-themes:
specifier: ^0.4.6
version: 0.4.6(react-dom@19.1.0(react@19.1.0))(react@19.1.0)
onnxruntime-node:
specifier: ^1.20.1
version: 1.22.0
openai:
specifier: ^4.98.0
version: 4.103.0(encoding@0.1.13)(ws@8.18.0)(zod@3.25.67)
@ -2042,36 +2042,6 @@ packages:
resolution: {integrity: sha512-ROFF39F6ZrnzSUEmQQZUar0Jt4xVoP9WnDRdWwF4NNcXs3xBTLgBUDoOwW141y1jP+S8nahIbdxbFC7IShw9Iw==}
engines: {node: ^12.20.0 || ^14.18.0 || >=16.0.0}
'@protobufjs/aspromise@1.1.2':
resolution: {integrity: sha512-j+gKExEuLmKwvz3OgROXtrJ2UG2x8Ch2YZUxahh+s1F2HZ+wAceUNLkvy6zKCPVRkU++ZWQrdxsUeQXmcg4uoQ==}
'@protobufjs/base64@1.1.2':
resolution: {integrity: sha512-AZkcAA5vnN/v4PDqKyMR5lx7hZttPDgClv83E//FMNhR2TMcLUhfRUBHCmSl0oi9zMgDDqRUJkSxO3wm85+XLg==}
'@protobufjs/codegen@2.0.4':
resolution: {integrity: sha512-YyFaikqM5sH0ziFZCN3xDC7zeGaB/d0IUb9CATugHWbd1FRFwWwt4ld4OYMPWu5a3Xe01mGAULCdqhMlPl29Jg==}
'@protobufjs/eventemitter@1.1.0':
resolution: {integrity: sha512-j9ednRT81vYJ9OfVuXG6ERSTdEL1xVsNgqpkxMsbIabzSo3goCjDIveeGv5d03om39ML71RdmrGNjG5SReBP/Q==}
'@protobufjs/fetch@1.1.0':
resolution: {integrity: sha512-lljVXpqXebpsijW71PZaCYeIcE5on1w5DlQy5WH6GLbFryLUrBD4932W/E2BSpfRJWseIL4v/KPgBFxDOIdKpQ==}
'@protobufjs/float@1.0.2':
resolution: {integrity: sha512-Ddb+kVXlXst9d+R9PfTIxh1EdNkgoRe5tOX6t01f1lYWOvJnSPDBlG241QLzcyPdoNTsblLUdujGSE4RzrTZGQ==}
'@protobufjs/inquire@1.1.0':
resolution: {integrity: sha512-kdSefcPdruJiFMVSbn801t4vFK7KB/5gd2fYvrxhuJYg8ILrmn9SKSX2tZdV6V+ksulWqS7aXjBcRXl3wHoD9Q==}
'@protobufjs/path@1.1.2':
resolution: {integrity: sha512-6JOcJ5Tm08dOHAbdR3GrvP+yUUfkjG5ePsHYczMFLq3ZmMkAD98cDgcT2iA1lJ9NVwFd4tH/iSSoe44YWkltEA==}
'@protobufjs/pool@1.1.0':
resolution: {integrity: sha512-0kELaGSIDBKvcgS4zkjz1PeddatrjYcmMWOlAuAPwAeccUrPHdUqo/J6LiymHHEiJT5NrF1UVwxY14f+fy4WQw==}
'@protobufjs/utf8@1.1.0':
resolution: {integrity: sha512-Vvn3zZrhQZkkBE8LSuW3em98c0FwgO4nxzv6OdSxPKJIEKY2bGbHn+mhGIPerzI4twdxaP8/0+06HBpwf345Lw==}
'@radix-ui/number@1.1.1':
resolution: {integrity: sha512-MkKCwxlXTgz6CFoJx3pCwn07GKp36+aZyu/u2Ln2VrA5DcdyCZkASEDBTd8x5whTQQL5CiYf4prXKLcgQdv29g==}
@ -2684,9 +2654,6 @@ packages:
'@radix-ui/rect@1.1.1':
resolution: {integrity: sha512-HPwpGIzkl28mWyZqG52jiqDJ12waP11Pa1lGoiyUkIEuMLBP0oeK/C89esbXrxsky5we7dfd8U58nm0SgAWpVw==}
'@ricky0123/vad-web@0.0.24':
resolution: {integrity: sha512-uv6GWW/kq8BkVErMQzXp3uwSyYMT3w/3QJiUerVaaKp7EwhOTIRY+96EoyFdG2WOFU5RkLk/2CVGbI7nDlxhEg==}
'@rollup/plugin-commonjs@28.0.6':
resolution: {integrity: sha512-XSQB1K7FUU5QP+3lOQmVCE3I0FcbbNvmNT4VJSj93iUjayaARrTQeoRdiYQoftAJBLrR9t2agwAd3ekaTgHNlw==}
engines: {node: '>=16.0.0 || 14 >= 14.17'}
@ -3317,9 +3284,6 @@ packages:
'@types/keyv@3.1.4':
resolution: {integrity: sha512-BQ5aZNSCpj7D6K2ksrRCTmKRLEpnPvWDiLPfoGyhZ++8YtiK9d/3DBKPJgry359X/P1PfruyYwvnvwFjuEiEIg==}
'@types/long@4.0.2':
resolution: {integrity: sha512-MqTGEo5bj5t157U6fA/BiDynNkn0YknVdh48CMPkTSpFTVmvao5UQmm7uEF6xBEo7qIMAlY/JSleYaE6VOdpaA==}
'@types/mdast@4.0.4':
resolution: {integrity: sha512-kGaNbPh1k7AFzgpud/gMdvIm5xuECykRR+JnWKQno9TAXVa6WIVCGTPvYGekIDL4uwCZQSYbUxNBSb1aUo79oA==}
@ -3486,6 +3450,10 @@ packages:
engines: {node: '>=0.4.0'}
hasBin: true
adm-zip@0.5.16:
resolution: {integrity: sha512-TGw5yVi4saajsSEgz25grObGHEUaDrniwvA2qwSC060KfqGPdglhvPMA2lPIoxs3PQIItj2iag35fONcQqgUaQ==}
engines: {node: '>=12.0'}
agent-base@6.0.2:
resolution: {integrity: sha512-RZNwNclF7+MS/8bDg70amg32dyeZGZxiDuQmZxKLAlQjr3jGyLx+4Kkk58UO7D2QdgFIQCovuSuZESne6RG6XQ==}
engines: {node: '>= 6.0.0'}
@ -4953,9 +4921,6 @@ packages:
resolution: {integrity: sha512-f7ccFPK3SXFHpx15UIGyRJ/FJQctuKZ0zVuN3frBo4HnK3cay9VEW0R6yPYFHC0AgqhukPzKjq22t5DmAyqGyw==}
engines: {node: '>=16'}
flatbuffers@1.12.0:
resolution: {integrity: sha512-c7CZADjRcl6j0PlvFy0ZqXQ67qSEZfrVPynmnL+2zPc+NtMvrF8Y0QceMo7QqnSPc7+uWjUIAbvCQ5WIKlMVdQ==}
flatted@3.3.3:
resolution: {integrity: sha512-GX+ysw4PBCz0PzosHDepZGANEuFCMLrnRTiEy9McGjmkCQYwRq4A/X786G/fjM/+OjsWSU1ZrY5qyARZmO/uwg==}
@ -5266,9 +5231,6 @@ packages:
resolution: {integrity: sha512-5v6yZd4JK3eMI3FqqCouswVqwugaA9r4dNZB1wwcmrD02QkV5H0y7XBQW8QwQqEaZY1pM9aqORSORhJRdNK44Q==}
engines: {node: '>=6.0'}
guid-typescript@1.0.9:
resolution: {integrity: sha512-Y8T4vYhEfwJOTbouREvG+3XDsjr8E3kIr7uf+JZ0BYloFsttiHU0WfvANVsR7TxNUJa/WpCnw/Ino/p+DeBhBQ==}
handlebars@4.7.7:
resolution: {integrity: sha512-aAcXm5OAfE/8IXkcZvCepKU3VzW1/39Fb5ZuqMtgI/hT8X2YgoMvBY5dLhq/cpOvw7Lk1nK/UF71aLG/ZnVYRA==}
engines: {node: '>=0.4.7'}
@ -5918,9 +5880,6 @@ packages:
resolution: {integrity: sha512-5UtUDQ/6edw4ofyljDNcOVJQ4c7OjDro4h3y8e1GQL5iYElYclVHJ3zeWchylvMaKnDbDilC8irOVyexnA/Slw==}
engines: {node: ^12.20.0 || ^14.13.1 || >=16.0.0}
long@4.0.0:
resolution: {integrity: sha512-XsP+KhQif4bjX1kbuSiySJFNAehNxgLb6hPRGJ9QsUr8ajHkuXGdrHmFUTUUXhDwVX2R5bY4JNZEwbUiMhV+MA==}
longest-streak@3.1.0:
resolution: {integrity: sha512-9Ri+o0JYgehTaVBBDoMqIl8GXtbWg711O3srftcHhZ0dqnETqLaoIK0x17fUw9rFSlK/0NlsKe0Ahhyl5pXE2g==}
@ -6506,14 +6465,12 @@ packages:
oniguruma-to-es@4.3.3:
resolution: {integrity: sha512-rPiZhzC3wXwE59YQMRDodUwwT9FZ9nNBwQQfsd1wfdtlKEyCdRV0avrTcSZ5xlIvGRVPd/cx6ZN45ECmS39xvg==}
onnx-proto@4.0.4:
resolution: {integrity: sha512-aldMOB3HRoo6q/phyB6QRQxSt895HNNw82BNyZ2CMh4bjeKv7g/c+VpAFtJuEMVfYLMbRx61hbuqnKceLeDcDA==}
onnxruntime-common@1.22.0:
resolution: {integrity: sha512-vcuaNWgtF2dGQu/EP5P8UI5rEPEYqXG2sPPe5j9lg2TY/biJF8eWklTMwlDO08iuXq48xJo0awqIpK5mPG+IxA==}
onnxruntime-common@1.14.0:
resolution: {integrity: sha512-3LJpegM2iMNRX2wUmtYfeX/ytfOzNwAWKSq1HbRrKc9+uqG/FsEA0bbKZl1btQeZaXhC26l44NWpNUeXPII7Ew==}
onnxruntime-web@1.14.0:
resolution: {integrity: sha512-Kcqf43UMfW8mCydVGcX9OMXI2VN17c0p6XvR7IPSZzBf/6lteBzXHvcEVWDPmCKuGombl997HgLqj91F11DzXw==}
onnxruntime-node@1.22.0:
resolution: {integrity: sha512-QaAqr7PFekrmEsmu1rpw7OxJYyG+iACjNHoNtQIVt9Oh7st8WDPIIUe6KhF9l35HVJTJd9CV1rePoPmKhSV26g==}
os: [win32, darwin, linux]
openai@4.103.0:
resolution: {integrity: sha512-eWcz9kdurkGOFDtd5ySS5y251H2uBgq9+1a2lTBnjMMzlexJ40Am5t6Mu76SSE87VvitPa0dkIAp75F+dZVC0g==}
@ -6715,9 +6672,6 @@ packages:
resolution: {integrity: sha512-udgsAY+fTnvv7kI7aaxbqwWNb0AHiB0qBO89PZKPkoTmGOgdbrHDKD+0B2X4uTfJ/FT1R09r9gTsjUjNJotuog==}
engines: {node: '>=0.10.0'}
platform@1.3.6:
resolution: {integrity: sha512-fnWVljUchTro6RiCFvCXBbNhJc2NijN7oIQxbwsyL0buWJPG85v81ehlHI9fXrJsMNgTofEoWIQeClKpgxFLrg==}
plist@3.1.0:
resolution: {integrity: sha512-uysumyrvkUX0rX/dEVqt8gC3sTBzd4zoWfLeS29nb53imdaXVvLINYXTI2GNqzaMuvacNx4uJQ8+b3zXR0pkgQ==}
engines: {node: '>=10.4.0'}
@ -6804,10 +6758,6 @@ packages:
property-information@7.1.0:
resolution: {integrity: sha512-TwEZ+X+yCJmYfL7TPUOcvBZ4QfoT5YenQiJuX//0th53DE6w0xxLEtfK3iyryQFddXuvkIk51EEgrJQ0WJkOmQ==}
protobufjs@6.11.4:
resolution: {integrity: sha512-5kQWPaJHi1WoCpjTGszzQ32PG2F4+wRY6BmAT4Vfw56Q2FZ4YZzK20xUYQH4YkfehY1e6QSICrJquM6xXZNcrw==}
hasBin: true
proxy-addr@2.0.7:
resolution: {integrity: sha512-llQsMLSUDUPT44jdrU/O37qlnifitDP+ZwrmmZcoSKyLKvtZxpyV0n2/bD/N4tBAAZ/gJEdZU7KMraoK1+XYAg==}
engines: {node: '>= 0.10'}
@ -10147,29 +10097,6 @@ snapshots:
'@pkgr/core@0.2.4': {}
'@protobufjs/aspromise@1.1.2': {}
'@protobufjs/base64@1.1.2': {}
'@protobufjs/codegen@2.0.4': {}
'@protobufjs/eventemitter@1.1.0': {}
'@protobufjs/fetch@1.1.0':
dependencies:
'@protobufjs/aspromise': 1.1.2
'@protobufjs/inquire': 1.1.0
'@protobufjs/float@1.0.2': {}
'@protobufjs/inquire@1.1.0': {}
'@protobufjs/path@1.1.2': {}
'@protobufjs/pool@1.1.0': {}
'@protobufjs/utf8@1.1.0': {}
'@radix-ui/number@1.1.1': {}
'@radix-ui/primitive@1.1.2': {}
@ -10823,10 +10750,6 @@ snapshots:
'@radix-ui/rect@1.1.1': {}
'@ricky0123/vad-web@0.0.24':
dependencies:
onnxruntime-web: 1.14.0
'@rollup/plugin-commonjs@28.0.6(rollup@4.41.0)':
dependencies:
'@rollup/pluginutils': 5.2.0(rollup@4.41.0)
@ -11560,8 +11483,6 @@ snapshots:
dependencies:
'@types/node': 22.15.12
'@types/long@4.0.2': {}
'@types/mdast@4.0.4':
dependencies:
'@types/unist': 3.0.3
@ -11744,6 +11665,8 @@ snapshots:
acorn@8.14.1: {}
adm-zip@0.5.16: {}
agent-base@6.0.2:
dependencies:
debug: 4.4.1
@ -12006,8 +11929,7 @@ snapshots:
transitivePeerDependencies:
- supports-color
boolean@3.2.0:
optional: true
boolean@3.2.0: {}
bottleneck@2.19.5: {}
@ -12583,8 +12505,7 @@ snapshots:
detect-node-es@1.1.0: {}
detect-node@2.1.0:
optional: true
detect-node@2.1.0: {}
devlop@1.1.0:
dependencies:
@ -12929,8 +12850,7 @@ snapshots:
is-date-object: 1.1.0
is-symbol: 1.1.1
es6-error@4.1.1:
optional: true
es6-error@4.1.1: {}
esast-util-from-estree@2.0.0:
dependencies:
@ -13474,8 +13394,6 @@ snapshots:
flatted: 3.3.3
keyv: 4.5.4
flatbuffers@1.12.0: {}
flatted@3.3.3: {}
flora-colossus@2.0.0:
@ -13833,7 +13751,6 @@ snapshots:
roarr: 2.15.4
semver: 7.7.2
serialize-error: 7.0.1
optional: true
global-dirs@3.0.1:
dependencies:
@ -13904,8 +13821,6 @@ snapshots:
section-matter: 1.0.0
strip-bom-string: 1.0.0
guid-typescript@1.0.9: {}
handlebars@4.7.7:
dependencies:
minimist: 1.2.8
@ -14443,8 +14358,7 @@ snapshots:
json-stable-stringify-without-jsonify@1.0.1: {}
json-stringify-safe@5.0.1:
optional: true
json-stringify-safe@5.0.1: {}
json5@1.0.2:
dependencies:
@ -14623,8 +14537,6 @@ snapshots:
strip-ansi: 7.1.0
wrap-ansi: 8.1.0
long@4.0.0: {}
longest-streak@3.1.0: {}
loose-envify@1.4.0:
@ -14695,7 +14607,6 @@ snapshots:
matcher@3.0.0:
dependencies:
escape-string-regexp: 4.0.0
optional: true
math-intrinsics@1.1.0: {}
@ -15488,20 +15399,13 @@ snapshots:
regex: 6.0.1
regex-recursion: 6.0.2
onnx-proto@4.0.4:
dependencies:
protobufjs: 6.11.4
onnxruntime-common@1.22.0: {}
onnxruntime-common@1.14.0: {}
onnxruntime-web@1.14.0:
onnxruntime-node@1.22.0:
dependencies:
flatbuffers: 1.12.0
guid-typescript: 1.0.9
long: 4.0.0
onnx-proto: 4.0.4
onnxruntime-common: 1.14.0
platform: 1.3.6
adm-zip: 0.5.16
global-agent: 3.0.0
onnxruntime-common: 1.22.0
openai@4.103.0(encoding@0.1.13)(ws@8.18.0)(zod@3.25.67):
dependencies:
@ -15705,8 +15609,6 @@ snapshots:
pify@2.3.0: {}
platform@1.3.6: {}
plist@3.1.0:
dependencies:
'@xmldom/xmldom': 0.8.10
@ -15790,22 +15692,6 @@ snapshots:
property-information@7.1.0: {}
protobufjs@6.11.4:
dependencies:
'@protobufjs/aspromise': 1.1.2
'@protobufjs/base64': 1.1.2
'@protobufjs/codegen': 2.0.4
'@protobufjs/eventemitter': 1.1.0
'@protobufjs/fetch': 1.1.0
'@protobufjs/float': 1.0.2
'@protobufjs/inquire': 1.1.0
'@protobufjs/path': 1.1.2
'@protobufjs/pool': 1.1.0
'@protobufjs/utf8': 1.1.0
'@types/long': 4.0.2
'@types/node': 22.15.12
long: 4.0.0
proxy-addr@2.0.7:
dependencies:
forwarded: 0.2.0
@ -16341,7 +16227,6 @@ snapshots:
json-stringify-safe: 5.0.1
semver-compare: 1.0.0
sprintf-js: 1.1.3
optional: true
rollup@4.41.0:
dependencies:
@ -16425,8 +16310,7 @@ snapshots:
secure-json-parse@2.7.0: {}
semver-compare@1.0.0:
optional: true
semver-compare@1.0.0: {}
semver@5.7.2: {}
@ -16462,7 +16346,6 @@ snapshots:
serialize-error@7.0.1:
dependencies:
type-fest: 0.13.1
optional: true
serve-favicon@2.5.0:
dependencies:
@ -17180,8 +17063,7 @@ snapshots:
dependencies:
prelude-ls: 1.2.1
type-fest@0.13.1:
optional: true
type-fest@0.13.1: {}
type-fest@0.21.3: {}