chore: implement auto-dismiss of unintentional dictation (under 500 ms)
This commit is contained in:
parent
1d0c1a68df
commit
73734bfdd9
9 changed files with 998 additions and 587 deletions
|
|
@ -1,13 +1,27 @@
|
|||
import dotenv from "dotenv";
|
||||
dotenv.config();
|
||||
|
||||
import { app } from "electron";
|
||||
import { app, ipcMain } from "electron";
|
||||
import { logger } from "./logger";
|
||||
|
||||
import started from "electron-squirrel-startup";
|
||||
import { AppManager } from "./core/app-manager";
|
||||
import { updateElectronApp } from "update-electron-app";
|
||||
import { isWindows } from "../utils/platform";
|
||||
|
||||
/**
 * Renderer logging relay: lets the renderer forward log calls to the main
 * process over the "log-message" IPC channel.
 *
 * `scope` selects a logger on the shared `logger` object (falling back to
 * `logger.renderer` when the lookup is falsy) and `level` selects the method
 * to invoke on it. Anything that does not resolve to a function is ignored.
 */
ipcMain.handle(
  "log-message",
  (_event, level: string, scope: string, ...args: unknown[]) => {
    const target = logger[scope as keyof typeof logger] || logger.renderer;
    const emit = target[level as keyof typeof target];

    // Unknown level for this logger: nothing to call.
    if (typeof emit !== "function") {
      return;
    }

    emit(...args);
  },
);
|
||||
|
||||
if (started) {
|
||||
app.quit();
|
||||
}
|
||||
|
|
|
|||
File diff suppressed because it is too large
Load diff
|
|
@ -7,18 +7,20 @@ import { PipelineContext } from "./context";
|
|||
import { GetAccessibilityContextResult } from "@amical/types";
|
||||
export { PipelineContext, SharedPipelineData } from "./context";
|
||||
|
||||
/**
 * Context for transcription operations.
 * Shared between `transcribe()` (per chunk) and `flush()` (end of session).
 */
export interface TranscribeContext {
  /** Requested language; the providers in this codebase treat a missing value as auto-detect. */
  language?: string;

  /** Vocabulary map (NOTE(review): key/value meaning is not visible here; confirm with the producer). */
  vocabulary?: Map<string, string>;

  /** Accessibility snapshot of the target application, when one was captured. */
  accessibilityContext?: GetAccessibilityContextResult | null;

  /** Most recent transcription result of the session, if any. */
  previousChunk?: string;

  /** All transcription results of the session so far, joined into one string. */
  aggregatedTranscription?: string;
}
|
||||
|
||||
// Transcription input parameters
|
||||
export interface TranscribeParams {
|
||||
audioData: Float32Array;
|
||||
speechProbability?: number; // Speech probability from frontend VAD (0-1)
|
||||
flush?: boolean; // Whether to flush any buffered audio
|
||||
context: {
|
||||
vocabulary?: Map<string, string>;
|
||||
accessibilityContext?: GetAccessibilityContextResult | null;
|
||||
previousChunk?: string;
|
||||
aggregatedTranscription?: string;
|
||||
language?: string;
|
||||
};
|
||||
context: TranscribeContext;
|
||||
}
|
||||
|
||||
// Formatting input parameters
|
||||
|
|
@ -37,6 +39,8 @@ export interface FormatParams {
|
|||
/**
 * Contract implemented by transcription providers
 * (e.g. the cloud provider and the local Whisper provider).
 */
export interface TranscriptionProvider {
  /** Provider name. */
  readonly name: string;

  /**
   * Process a single audio chunk. Resolves to the text produced for it;
   * the visible providers resolve to "" while they are still buffering.
   */
  transcribe(params: TranscribeParams): Promise<string>;

  /** Clear internal buffers without transcribing. */
  reset(): void;

  /** Transcribe whatever is still buffered; used when a session is finalized. */
  flush(context: TranscribeContext): Promise<string>;
}
|
||||
|
||||
// Formatting provider interface
|
||||
|
|
@ -71,7 +75,7 @@ export interface StreamingSession {
|
|||
firstChunkReceivedAt?: number; // When first audio chunk arrived at transcription service
|
||||
recordingStartedAt?: number; // When user pressed record button (from RecordingManager)
|
||||
recordingStoppedAt?: number; // When user released record button (from RecordingManager)
|
||||
finalChunkReceivedAt?: number; // When final chunk arrived at transcription service
|
||||
finalizationStartedAt?: number; // When finalizeSession() was called
|
||||
}
|
||||
|
||||
// Simple pipeline configuration
|
||||
|
|
|
|||
|
|
@ -1,6 +1,7 @@
|
|||
import {
|
||||
TranscriptionProvider,
|
||||
TranscribeParams,
|
||||
TranscribeContext,
|
||||
} from "../../core/pipeline-types";
|
||||
import { logger } from "../../../main/logger";
|
||||
import { AuthService } from "../../../services/auth-service";
|
||||
|
|
@ -51,21 +52,16 @@ export class AmicalCloudProvider implements TranscriptionProvider {
|
|||
});
|
||||
}
|
||||
|
||||
/**
|
||||
* Process an audio chunk - buffers and conditionally transcribes
|
||||
*/
|
||||
async transcribe(params: TranscribeParams): Promise<string> {
|
||||
try {
|
||||
const {
|
||||
audioData,
|
||||
speechProbability = 1,
|
||||
flush = false,
|
||||
context,
|
||||
} = params;
|
||||
const { audioData, speechProbability = 1, context } = params;
|
||||
|
||||
// Store language for use in API call (undefined = auto-detect)
|
||||
// Store context for API call
|
||||
this.currentLanguage = context.language;
|
||||
|
||||
// Store accessibility context for the API request
|
||||
this.currentAccessibilityContext = context?.accessibilityContext ?? null;
|
||||
|
||||
this.currentAggregatedTranscription = context?.aggregatedTranscription;
|
||||
|
||||
// Check authentication
|
||||
|
|
@ -89,40 +85,46 @@ export class AmicalCloudProvider implements TranscriptionProvider {
|
|||
this.currentSilenceFrameCount++;
|
||||
}
|
||||
|
||||
// Calculate durations
|
||||
const silenceDuration =
|
||||
((this.currentSilenceFrameCount * this.FRAME_SIZE) / this.SAMPLE_RATE) *
|
||||
1000;
|
||||
const speechDuration =
|
||||
((this.frameBuffer.length * this.FRAME_SIZE) / this.SAMPLE_RATE) * 1000;
|
||||
|
||||
// Determine if we should process
|
||||
const shouldProcess =
|
||||
flush ||
|
||||
(speechDuration >= this.MIN_SPEECH_DURATION_MS &&
|
||||
silenceDuration >= this.MAX_SILENCE_DURATION_MS);
|
||||
|
||||
if (!shouldProcess) {
|
||||
// Only transcribe if speech/silence patterns indicate we should
|
||||
if (!this.shouldTranscribe()) {
|
||||
return "";
|
||||
}
|
||||
|
||||
// Process accumulated audio (pass flush flag for formatting decision)
|
||||
const result = await this.processAudio(flush);
|
||||
|
||||
// Clear buffer after processing
|
||||
this.frameBuffer = [];
|
||||
this.frameBufferSpeechProbabilities = [];
|
||||
this.currentSilenceFrameCount = 0;
|
||||
|
||||
return result;
|
||||
return this.doTranscription(false);
|
||||
} catch (error) {
|
||||
logger.transcription.error("Cloud transcription error:", error);
|
||||
throw error;
|
||||
}
|
||||
}
|
||||
|
||||
private async processAudio(isFinal: boolean = false): Promise<string> {
|
||||
// Combine all frames into a single Float32Array (may be empty)
|
||||
/**
 * Flush any buffered audio and return the transcription with formatting enabled.
 * Called at the end of a recording session.
 *
 * @param context - Session values (language, accessibility context, aggregated
 *   transcription) stored on the provider so the API request can read them.
 * @returns The text returned by the final request; may be an empty string.
 * @throws Error if the user is not authenticated, or whatever the underlying
 *   request throws. Every failure is logged before being rethrown.
 */
async flush(context: TranscribeContext): Promise<string> {
  try {
    // Store context for API call.
    // `context` is dereferenced unconditionally, so optional chaining adds nothing.
    this.currentLanguage = context.language;
    this.currentAccessibilityContext = context.accessibilityContext ?? null;
    this.currentAggregatedTranscription = context.aggregatedTranscription;

    // Check authentication
    if (!(await this.authService.isAuthenticated())) {
      throw new Error("Authentication required for cloud transcription");
    }

    // Await inside the try block: returning the bare promise would let a
    // rejection skip the catch below, so request failures were never logged.
    return await this.doTranscription(true);
  } catch (error) {
    logger.transcription.error("Cloud transcription error:", error);
    throw error;
  }
}
|
||||
|
||||
/**
|
||||
* Shared transcription logic - aggregates buffer, calls cloud API, clears state
|
||||
*/
|
||||
private async doTranscription(enableFormatting: boolean): Promise<string> {
|
||||
// Combine all frames into a single Float32Array
|
||||
const totalLength = this.frameBuffer.reduce(
|
||||
(acc, frame) => acc + frame.length,
|
||||
0,
|
||||
|
|
@ -134,9 +136,43 @@ export class AmicalCloudProvider implements TranscriptionProvider {
|
|||
offset += frame.length;
|
||||
}
|
||||
|
||||
// Try transcription with automatic retry on 401
|
||||
// Enable formatting only on final chunk
|
||||
return this.makeTranscriptionRequest(combinedAudio, false, isFinal);
|
||||
// Clear frame buffers only (context values needed for API call below)
|
||||
this.frameBuffer = [];
|
||||
this.frameBufferSpeechProbabilities = [];
|
||||
this.currentSilenceFrameCount = 0;
|
||||
|
||||
// Make the API request
|
||||
return this.makeTranscriptionRequest(
|
||||
combinedAudio,
|
||||
false,
|
||||
enableFormatting,
|
||||
);
|
||||
}
|
||||
|
||||
/**
 * Discard all per-session state without sending anything for transcription.
 * Used when a session is cancelled so buffered audio cannot bleed into the
 * next one.
 */
reset(): void {
  // Request context captured from the last transcribe()/flush() call
  this.currentAggregatedTranscription = undefined;
  this.currentAccessibilityContext = null;
  this.currentLanguage = undefined;

  // Buffered frames and silence tracking
  this.currentSilenceFrameCount = 0;
  this.frameBufferSpeechProbabilities = [];
  this.frameBuffer = [];
}
|
||||
|
||||
/**
 * Decide whether the buffered audio should be sent for transcription now.
 *
 * @returns true only when the buffer spans at least MIN_SPEECH_DURATION_MS
 *   and the trailing silence spans at least MAX_SILENCE_DURATION_MS.
 */
private shouldTranscribe(): boolean {
  // Convert a number of frames into milliseconds of audio.
  const framesToMs = (frameCount: number): number =>
    ((frameCount * this.FRAME_SIZE) / this.SAMPLE_RATE) * 1000;

  const bufferedMs = framesToMs(this.frameBuffer.length);
  // Written as a negated >= so a NaN duration still yields false.
  if (!(bufferedMs >= this.MIN_SPEECH_DURATION_MS)) {
    return false;
  }

  const trailingSilenceMs = framesToMs(this.currentSilenceFrameCount);
  return trailingSilenceMs >= this.MAX_SILENCE_DURATION_MS;
}
|
||||
|
||||
private async makeTranscriptionRequest(
|
||||
|
|
@ -144,9 +180,13 @@ export class AmicalCloudProvider implements TranscriptionProvider {
|
|||
isRetry = false,
|
||||
enableFormatting = false,
|
||||
): Promise<string> {
|
||||
// Skip API call if no audio and formatting not requested
|
||||
if (audioData.length === 0 && !enableFormatting) {
|
||||
return "";
|
||||
// Skip API call if there's nothing to process
|
||||
if (audioData.length === 0) {
|
||||
const hasTextToFormat =
|
||||
enableFormatting && this.currentAggregatedTranscription?.trim();
|
||||
if (!hasTextToFormat) {
|
||||
return "";
|
||||
}
|
||||
}
|
||||
|
||||
// Get auth token
|
||||
|
|
@ -166,112 +206,104 @@ export class AmicalCloudProvider implements TranscriptionProvider {
|
|||
formatting: enableFormatting,
|
||||
});
|
||||
|
||||
try {
|
||||
const response = await fetch(`${this.apiEndpoint}/transcribe`, {
|
||||
method: "POST",
|
||||
headers: {
|
||||
"Content-Type": "application/json",
|
||||
Authorization: `Bearer ${idToken}`,
|
||||
"User-Agent": getUserAgent(),
|
||||
const response = await fetch(`${this.apiEndpoint}/transcribe`, {
|
||||
method: "POST",
|
||||
headers: {
|
||||
"Content-Type": "application/json",
|
||||
Authorization: `Bearer ${idToken}`,
|
||||
"User-Agent": getUserAgent(),
|
||||
},
|
||||
body: JSON.stringify({
|
||||
audioData: Array.from(audioData),
|
||||
language: this.currentLanguage,
|
||||
previousTranscription: this.currentAggregatedTranscription,
|
||||
formatting: {
|
||||
enabled: enableFormatting,
|
||||
},
|
||||
body: JSON.stringify({
|
||||
audioData: Array.from(audioData),
|
||||
language: this.currentLanguage,
|
||||
previousTranscription: this.currentAggregatedTranscription,
|
||||
formatting: {
|
||||
enabled: enableFormatting,
|
||||
},
|
||||
sharedContext: this.currentAccessibilityContext
|
||||
? {
|
||||
selectedText:
|
||||
this.currentAccessibilityContext.context?.textSelection
|
||||
?.selectedText,
|
||||
beforeText:
|
||||
this.currentAccessibilityContext.context?.textSelection
|
||||
?.preSelectionText,
|
||||
afterText:
|
||||
this.currentAccessibilityContext.context?.textSelection
|
||||
?.postSelectionText,
|
||||
appType: detectApplicationType(
|
||||
this.currentAccessibilityContext,
|
||||
),
|
||||
appBundleId:
|
||||
this.currentAccessibilityContext.context?.application
|
||||
?.bundleIdentifier,
|
||||
appName:
|
||||
this.currentAccessibilityContext.context?.application?.name,
|
||||
appUrl:
|
||||
this.currentAccessibilityContext.context?.windowInfo?.url,
|
||||
surroundingContext: "", // Empty for now, future enhancement
|
||||
}
|
||||
: undefined,
|
||||
}),
|
||||
});
|
||||
sharedContext: this.currentAccessibilityContext
|
||||
? {
|
||||
selectedText:
|
||||
this.currentAccessibilityContext.context?.textSelection
|
||||
?.selectedText,
|
||||
beforeText:
|
||||
this.currentAccessibilityContext.context?.textSelection
|
||||
?.preSelectionText,
|
||||
afterText:
|
||||
this.currentAccessibilityContext.context?.textSelection
|
||||
?.postSelectionText,
|
||||
appType: detectApplicationType(this.currentAccessibilityContext),
|
||||
appBundleId:
|
||||
this.currentAccessibilityContext.context?.application
|
||||
?.bundleIdentifier,
|
||||
appName:
|
||||
this.currentAccessibilityContext.context?.application?.name,
|
||||
appUrl: this.currentAccessibilityContext.context?.windowInfo?.url,
|
||||
surroundingContext: "", // Empty for now, future enhancement
|
||||
}
|
||||
: undefined,
|
||||
}),
|
||||
});
|
||||
|
||||
// Handle 401 with token refresh and retry
|
||||
if (response.status === 401) {
|
||||
if (isRetry) {
|
||||
// Already retried once, give up
|
||||
throw new Error("Authentication failed - please log in again");
|
||||
}
|
||||
// Handle 401 with token refresh and retry
|
||||
if (response.status === 401) {
|
||||
if (isRetry) {
|
||||
// Already retried once, give up
|
||||
throw new Error("Authentication failed - please log in again");
|
||||
}
|
||||
|
||||
logger.transcription.warn(
|
||||
"Got 401 response, attempting token refresh and retry",
|
||||
logger.transcription.warn(
|
||||
"Got 401 response, attempting token refresh and retry",
|
||||
);
|
||||
|
||||
try {
|
||||
// Force token refresh
|
||||
await this.authService.refreshTokenIfNeeded();
|
||||
|
||||
// Retry the request once (preserve formatting flag)
|
||||
return await this.makeTranscriptionRequest(
|
||||
audioData,
|
||||
true,
|
||||
enableFormatting,
|
||||
);
|
||||
|
||||
try {
|
||||
// Force token refresh
|
||||
await this.authService.refreshTokenIfNeeded();
|
||||
|
||||
// Retry the request once (preserve formatting flag)
|
||||
return await this.makeTranscriptionRequest(
|
||||
audioData,
|
||||
true,
|
||||
enableFormatting,
|
||||
);
|
||||
} catch (refreshError) {
|
||||
logger.transcription.error("Token refresh failed:", refreshError);
|
||||
throw new Error("Authentication failed - please log in again");
|
||||
}
|
||||
} catch (refreshError) {
|
||||
logger.transcription.error("Token refresh failed:", refreshError);
|
||||
throw new Error("Authentication failed - please log in again");
|
||||
}
|
||||
|
||||
if (response.status === 403) {
|
||||
throw new Error("Subscription required for cloud transcription");
|
||||
}
|
||||
|
||||
if (response.status === 429) {
|
||||
const errorData = await response.json();
|
||||
throw new Error(
|
||||
`Word limit exceeded: ${errorData.currentWords}/${errorData.limit}`,
|
||||
);
|
||||
}
|
||||
|
||||
if (!response.ok) {
|
||||
const errorText = await response.text();
|
||||
logger.transcription.error("Cloud API error:", {
|
||||
status: response.status,
|
||||
statusText: response.statusText,
|
||||
error: errorText,
|
||||
});
|
||||
throw new Error(`Cloud API error: ${response.statusText}`);
|
||||
}
|
||||
|
||||
const result: CloudTranscriptionResponse = await response.json();
|
||||
|
||||
if (!result.success) {
|
||||
throw new Error(result.error || "Cloud transcription failed");
|
||||
}
|
||||
|
||||
logger.transcription.info("Cloud transcription successful", {
|
||||
textLength: result.transcription?.length || 0,
|
||||
language: result.language,
|
||||
duration: result.duration,
|
||||
});
|
||||
|
||||
return result.transcription || "";
|
||||
} catch (error) {
|
||||
logger.transcription.error("Cloud transcription request failed:", error);
|
||||
throw error;
|
||||
}
|
||||
|
||||
if (response.status === 403) {
|
||||
throw new Error("Subscription required for cloud transcription");
|
||||
}
|
||||
|
||||
if (response.status === 429) {
|
||||
const errorData = await response.json();
|
||||
throw new Error(
|
||||
`Word limit exceeded: ${errorData.currentWords}/${errorData.limit}`,
|
||||
);
|
||||
}
|
||||
|
||||
if (!response.ok) {
|
||||
const errorText = await response.text();
|
||||
logger.transcription.error("Cloud API error:", {
|
||||
status: response.status,
|
||||
statusText: response.statusText,
|
||||
error: errorText,
|
||||
});
|
||||
throw new Error(`Cloud API error: ${response.statusText}`);
|
||||
}
|
||||
|
||||
const result: CloudTranscriptionResponse = await response.json();
|
||||
|
||||
if (!result.success) {
|
||||
throw new Error(result.error || "Cloud transcription failed");
|
||||
}
|
||||
|
||||
logger.transcription.info("Cloud transcription successful", {
|
||||
textLength: result.transcription?.length || 0,
|
||||
language: result.language,
|
||||
duration: result.duration,
|
||||
});
|
||||
|
||||
return result.transcription || "";
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,6 +1,7 @@
|
|||
import {
|
||||
TranscriptionProvider,
|
||||
TranscribeParams,
|
||||
TranscribeContext,
|
||||
} from "../../core/pipeline-types";
|
||||
import { logger } from "../../../main/logger";
|
||||
import { ModelService } from "../../../services/model-service";
|
||||
|
|
@ -74,74 +75,79 @@ export class WhisperProvider implements TranscriptionProvider {
|
|||
}
|
||||
}
|
||||
|
||||
/**
 * Buffer one audio chunk and transcribe the buffer only when the
 * speech/silence pattern says it is time.
 *
 * @param params - The chunk, its speech probability (defaults to 1 when
 *   omitted) and the transcription context.
 * @returns The transcription of the buffered audio, or "" while buffering.
 */
async transcribe(params: TranscribeParams): Promise<string> {
  await this.initializeWhisper();

  const { audioData: frame, speechProbability: prob = 1, context } = params;

  // Keep the frame and its probability side by side in the two buffers.
  this.frameBuffer.push(frame);
  this.frameBufferSpeechProbabilities.push(prob);

  const heardSpeech = prob > this.SPEECH_PROBABILITY_THRESHOLD;

  // Logged before the silence counter is updated for this frame.
  logger.transcription.debug(
    `Frame received - SpeechProb: ${prob.toFixed(3)}, Buffer size: ${this.frameBuffer.length}, Silence count: ${this.currentSilenceFrameCount}`,
  );

  if (!heardSpeech) {
    this.currentSilenceFrameCount++;
  } else {
    // Speech restarts the silence run and records when it was heard.
    this.currentSilenceFrameCount = 0;
    this.lastSpeechTimestamp = Date.now();
  }

  return this.shouldTranscribe() ? this.doTranscription(context) : "";
}
|
||||
|
||||
/**
 * Transcribe whatever audio is still buffered.
 * Called at the end of a recording session.
 *
 * @param context - Transcription context forwarded to the shared transcription step.
 * @returns The transcription of the remaining audio, or "" if nothing was buffered.
 */
async flush(context: TranscribeContext): Promise<string> {
  if (this.frameBuffer.length > 0) {
    await this.initializeWhisper();
    return this.doTranscription(context);
  }

  // Nothing buffered: skip initialization entirely.
  return "";
}
|
||||
|
||||
/**
|
||||
* Shared transcription logic - aggregates buffer, calls whisper, clears state
|
||||
* Assumes initializeWhisper() was already called by caller
|
||||
*/
|
||||
private async doTranscription(context: TranscribeContext): Promise<string> {
|
||||
try {
|
||||
await this.initializeWhisper();
|
||||
|
||||
// Extract parameters from the new structure
|
||||
const {
|
||||
audioData,
|
||||
speechProbability = 1,
|
||||
context,
|
||||
flush = false,
|
||||
} = params;
|
||||
const { vocabulary, aggregatedTranscription, language } = context;
|
||||
|
||||
// Audio data is already Float32Array
|
||||
|
||||
// Add frame to buffer with speech probability
|
||||
this.frameBuffer.push(audioData);
|
||||
this.frameBufferSpeechProbabilities.push(speechProbability);
|
||||
|
||||
// Consider it speech if probability is above threshold
|
||||
const isSpeech = speechProbability > this.SPEECH_PROBABILITY_THRESHOLD;
|
||||
|
||||
logger.transcription.debug(
|
||||
`Frame received - SpeechProb: ${speechProbability.toFixed(3)}, Buffer size: ${this.frameBuffer.length}, Silence count: ${this.currentSilenceFrameCount}`,
|
||||
);
|
||||
|
||||
// Handle speech/silence logic
|
||||
if (isSpeech) {
|
||||
this.currentSilenceFrameCount = 0;
|
||||
this.lastSpeechTimestamp = Date.now();
|
||||
} else {
|
||||
this.currentSilenceFrameCount++;
|
||||
}
|
||||
|
||||
// Determine if we should transcribe
|
||||
const shouldTranscribe = flush || this.shouldTranscribe();
|
||||
|
||||
if (!shouldTranscribe) {
|
||||
// Keep buffering
|
||||
return "";
|
||||
}
|
||||
|
||||
const isAllSilent = this.isAllSilent();
|
||||
|
||||
// Aggregate buffered frames
|
||||
const aggregatedAudio = this.aggregateFrames();
|
||||
|
||||
// Clear buffers immediately after aggregation, before async operations
|
||||
this.frameBuffer = [];
|
||||
this.frameBufferSpeechProbabilities = [];
|
||||
this.currentSilenceFrameCount = 0;
|
||||
// Clear buffers immediately after aggregation
|
||||
this.reset();
|
||||
|
||||
if (isAllSilent && this.IGNORE_FULLY_SILENT_CHUNKS) {
|
||||
logger.transcription.debug("Skipping transcription - all silent");
|
||||
return "";
|
||||
}
|
||||
|
||||
// Skip if too short or only silence
|
||||
/* if (aggregatedAudio.length < this.FRAME_SIZE * 2) {
|
||||
logger.transcription.debug("Skipping transcription - audio too short");
|
||||
return "";
|
||||
} */
|
||||
|
||||
logger.transcription.debug(
|
||||
`Starting transcription of ${aggregatedAudio.length} samples (${((aggregatedAudio.length / this.SAMPLE_RATE) * 1000).toFixed(0)}ms)`,
|
||||
);
|
||||
|
||||
// Transcribe using the local Whisper wrapper
|
||||
if (!this.workerWrapper) {
|
||||
throw new Error("Worker wrapper is not initialized");
|
||||
}
|
||||
|
|
@ -152,7 +158,7 @@ export class WhisperProvider implements TranscriptionProvider {
|
|||
aggregatedTranscription,
|
||||
);
|
||||
|
||||
const text = await this.workerWrapper!.exec<string>("transcribeAudio", [
|
||||
const text = await this.workerWrapper.exec<string>("transcribeAudio", [
|
||||
aggregatedAudio,
|
||||
{
|
||||
language: language || "auto",
|
||||
|
|
@ -174,11 +180,20 @@ export class WhisperProvider implements TranscriptionProvider {
|
|||
}
|
||||
}
|
||||
|
||||
/**
 * Drop all buffered frames and the silence counter without transcribing.
 * Used when a session is cancelled so its audio cannot bleed into the next one.
 */
reset(): void {
  this.currentSilenceFrameCount = 0;
  this.frameBufferSpeechProbabilities = [];
  this.frameBuffer = [];
}
|
||||
|
||||
private shouldTranscribe(): boolean {
|
||||
// Transcribe if:
|
||||
// 1. We have significant silence after speech
|
||||
// 2. Buffer is getting too large
|
||||
// 3. Final chunk was received (handled elsewhere)
|
||||
|
||||
const bufferDurationMs =
|
||||
((this.frameBuffer.length * this.FRAME_SIZE) / this.SAMPLE_RATE) * 1000;
|
||||
|
|
@ -186,7 +201,7 @@ export class WhisperProvider implements TranscriptionProvider {
|
|||
((this.currentSilenceFrameCount * this.FRAME_SIZE) / this.SAMPLE_RATE) *
|
||||
1000;
|
||||
|
||||
// If we have speech (potential cause frameBuffer might just be all silence too, and thats okay) and then significant silence, transcribe
|
||||
// If we have speech and then significant silence, transcribe
|
||||
if (
|
||||
this.frameBuffer.length > 0 &&
|
||||
silenceDurationMs > this.MAX_SILENCE_DURATION_MS
|
||||
|
|
@ -357,9 +372,6 @@ export class WhisperProvider implements TranscriptionProvider {
|
|||
}
|
||||
}
|
||||
|
||||
// Clear buffers
|
||||
this.frameBuffer = [];
|
||||
this.frameBufferSpeechProbabilities = [];
|
||||
this.currentSilenceFrameCount = 0;
|
||||
this.reset();
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -214,23 +214,14 @@ export class TranscriptionService {
|
|||
|
||||
/**
|
||||
* Process a single audio chunk in streaming mode
|
||||
* For finalization, use finalizeSession() instead
|
||||
*/
|
||||
async processStreamingChunk(options: {
|
||||
sessionId: string;
|
||||
audioChunk: Float32Array;
|
||||
isFinal?: boolean;
|
||||
audioFilePath?: string;
|
||||
recordingStartedAt?: number;
|
||||
recordingStoppedAt?: number;
|
||||
}): Promise<string> {
|
||||
const {
|
||||
sessionId,
|
||||
audioChunk,
|
||||
isFinal = false,
|
||||
audioFilePath,
|
||||
recordingStartedAt,
|
||||
recordingStoppedAt,
|
||||
} = options;
|
||||
const { sessionId, audioChunk, recordingStartedAt } = options;
|
||||
|
||||
// Run VAD on the audio chunk
|
||||
let speechProbability = 0;
|
||||
|
|
@ -281,7 +272,7 @@ export class TranscriptionService {
|
|||
context: streamingContext,
|
||||
transcriptionResults: [],
|
||||
firstChunkReceivedAt: performance.now(),
|
||||
recordingStartedAt: recordingStartedAt, // From RecordingManager (when user pressed record)
|
||||
recordingStartedAt: recordingStartedAt,
|
||||
};
|
||||
|
||||
this.streamingSessions.set(sessionId, session);
|
||||
|
|
@ -305,11 +296,10 @@ export class TranscriptionService {
|
|||
// Select the appropriate provider
|
||||
const provider = await this.selectProvider();
|
||||
|
||||
// Transcribe with flush parameter for final chunks
|
||||
// Transcribe chunk (flush is done separately in finalizeSession)
|
||||
const chunkTranscription = await provider.transcribe({
|
||||
audioData: audioChunk,
|
||||
speechProbability: speechProbability, // Now from VAD service
|
||||
flush: isFinal, // Pass flush flag for final chunks
|
||||
speechProbability: speechProbability,
|
||||
context: {
|
||||
vocabulary: session.context.sharedData.vocabulary,
|
||||
accessibilityContext: session.context.sharedData.accessibilityContext,
|
||||
|
|
@ -334,25 +324,96 @@ export class TranscriptionService {
|
|||
sessionId,
|
||||
frameSize: audioChunk.length,
|
||||
hadTranscription: chunkTranscription.length > 0,
|
||||
isFinal,
|
||||
});
|
||||
} finally {
|
||||
// Release transcription mutex - always release even on error
|
||||
this.transcriptionMutex.release();
|
||||
}
|
||||
const completeTranscriptionTillNow = session.transcriptionResults
|
||||
.join(" ")
|
||||
.trim();
|
||||
|
||||
// this is the final chunk, save the transcription
|
||||
if (!isFinal) {
|
||||
return completeTranscriptionTillNow;
|
||||
return session.transcriptionResults.join(" ").trim();
|
||||
}
|
||||
|
||||
/**
 * Cancel a streaming session without processing its audio.
 * Used when a recording is cancelled (e.g. quick tap, accidental activation).
 * Does nothing when the session id is unknown.
 *
 * @param sessionId - Id of the streaming session to discard.
 */
async cancelStreamingSession(sessionId: string): Promise<void> {
  if (!this.streamingSessions.has(sessionId)) {
    return;
  }

  // Hold the transcription mutex so this cannot race with processStreamingChunk.
  await this.transcriptionMutex.acquire();
  try {
    // Clear provider buffers so audio does not bleed into the next session.
    this.currentProvider?.reset();

    this.streamingSessions.delete(sessionId);
    logger.transcription.info("Streaming session cancelled", { sessionId });
  } finally {
    this.transcriptionMutex.release();
  }
}
|
||||
|
||||
/**
|
||||
* Finalize a streaming session - flush provider, format, save to DB
|
||||
* Call this instead of processStreamingChunk with isFinal=true
|
||||
*/
|
||||
async finalizeSession(options: {
|
||||
sessionId: string;
|
||||
audioFilePath?: string;
|
||||
recordingStartedAt?: number;
|
||||
recordingStoppedAt?: number;
|
||||
}): Promise<string> {
|
||||
const { sessionId, audioFilePath, recordingStartedAt, recordingStoppedAt } =
|
||||
options;
|
||||
|
||||
const session = this.streamingSessions.get(sessionId);
|
||||
if (!session) {
|
||||
logger.transcription.warn("No session found to finalize", { sessionId });
|
||||
return "";
|
||||
}
|
||||
|
||||
session.finalChunkReceivedAt = performance.now();
|
||||
// Update session timestamps
|
||||
session.finalizationStartedAt = performance.now();
|
||||
session.recordingStoppedAt = recordingStoppedAt;
|
||||
if (recordingStartedAt && !session.recordingStartedAt) {
|
||||
session.recordingStartedAt = recordingStartedAt;
|
||||
}
|
||||
|
||||
let completeTranscription = completeTranscriptionTillNow;
|
||||
// Flush provider to get any remaining buffered audio
|
||||
await this.transcriptionMutex.acquire();
|
||||
try {
|
||||
const previousChunk =
|
||||
session.transcriptionResults.length > 0
|
||||
? session.transcriptionResults[
|
||||
session.transcriptionResults.length - 1
|
||||
]
|
||||
: undefined;
|
||||
const aggregatedTranscription = session.transcriptionResults
|
||||
.join(" ")
|
||||
.trim();
|
||||
|
||||
const provider = await this.selectProvider();
|
||||
const finalTranscription = await provider.flush({
|
||||
vocabulary: session.context.sharedData.vocabulary,
|
||||
accessibilityContext: session.context.sharedData.accessibilityContext,
|
||||
previousChunk,
|
||||
aggregatedTranscription: aggregatedTranscription || undefined,
|
||||
language: session.context.sharedData.userPreferences?.language,
|
||||
});
|
||||
|
||||
if (finalTranscription.trim()) {
|
||||
session.transcriptionResults.push(finalTranscription);
|
||||
logger.transcription.info("Whisper returned final transcription", {
|
||||
sessionId,
|
||||
transcriptionLength: finalTranscription.length,
|
||||
totalResults: session.transcriptionResults.length,
|
||||
});
|
||||
}
|
||||
} finally {
|
||||
this.transcriptionMutex.release();
|
||||
}
|
||||
|
||||
let completeTranscription = session.transcriptionResults.join(" ").trim();
|
||||
let formattingDuration: number | undefined;
|
||||
|
||||
logger.transcription.info("Finalizing streaming session", {
|
||||
|
|
|
|||
|
|
@ -15,7 +15,7 @@ export const recordingRouter = createRouter({
|
|||
if (!recordingManager) {
|
||||
throw new Error("Recording manager not available");
|
||||
}
|
||||
return await recordingManager.startRecording("hands-free");
|
||||
return await recordingManager.signalStart();
|
||||
}),
|
||||
|
||||
signalStop: procedure.mutation(async ({ ctx }) => {
|
||||
|
|
@ -23,7 +23,7 @@ export const recordingRouter = createRouter({
|
|||
if (!recordingManager) {
|
||||
throw new Error("Recording manager not available");
|
||||
}
|
||||
return await recordingManager.stopRecording();
|
||||
return await recordingManager.signalStop();
|
||||
}),
|
||||
|
||||
// Using Observable instead of async generator due to Symbol.asyncDispose conflict
|
||||
|
|
|
|||
|
|
@ -1,6 +1 @@
|
|||
export type RecordingState =
|
||||
| "idle"
|
||||
| "starting"
|
||||
| "recording"
|
||||
| "stopping"
|
||||
| "error";
|
||||
export type RecordingState = "idle" | "starting" | "recording" | "stopping";
|
||||
|
|
|
|||
|
|
@ -135,6 +135,25 @@ export class StreamingWavWriter {
|
|||
}
|
||||
}
|
||||
|
||||
/**
 * Abort writing: mark the writer as finalized and end the file stream
 * without running the finalize step. Used when a recording is cancelled.
 * Calling it on an already-finalized writer is a no-op.
 */
async abort(): Promise<void> {
  if (!this.isFinalized) {
    // Flag first so no further writes are accepted while the stream closes.
    this.isFinalized = true;

    // Resolves once the stream's end() callback has fired.
    const streamEnded = new Promise<void>((resolve) => {
      this.fileStream.end(() => resolve());
    });
    await streamEnded;

    logger.transcription.info("WAV writer aborted", {
      path: this.fileStream.path,
    });
  }
}
|
||||
|
||||
/**
|
||||
* Get the current size of audio data written
|
||||
*/
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue