chore: implement auto-dismissal of unintentional dictation (sub-500ms)

This commit is contained in:
haritabh-z01 2026-01-07 12:59:23 +05:30
parent 1d0c1a68df
commit 73734bfdd9
9 changed files with 998 additions and 587 deletions

View file

@ -1,13 +1,27 @@
import dotenv from "dotenv";
dotenv.config();
import { app } from "electron";
import { app, ipcMain } from "electron";
import { logger } from "./logger";
import started from "electron-squirrel-startup";
import { AppManager } from "./core/app-manager";
import { updateElectronApp } from "update-electron-app";
import { isWindows } from "../utils/platform";
// Renderer logging relay: lets the renderer send log calls to the main process
// over the "log-message" IPC channel.
//   level - name of the log method to look up on the resolved scoped logger
//   scope - key of the scoped logger on `logger`; logger.renderer is used when
//           that lookup yields a falsy value
//   args  - remaining values, passed through unchanged to the log method
// Nothing is logged when the looked-up member is not a function.
ipcMain.handle(
  "log-message",
  (_event, level: string, scope: string, ...args: unknown[]) => {
    const scopedLogger =
      logger[scope as keyof typeof logger] || logger.renderer;
    const logMethod = scopedLogger[level as keyof typeof scopedLogger];
    if (typeof logMethod === "function") {
      // Call with the scoped logger as receiver: a bare `logMethod(...args)`
      // detaches the method and would run it with `this === undefined`.
      Reflect.apply(logMethod, scopedLogger, args);
    }
  },
);
if (started) {
app.quit();
}

File diff suppressed because it is too large Load diff

View file

@ -7,18 +7,20 @@ import { PipelineContext } from "./context";
import { GetAccessibilityContextResult } from "@amical/types";
export { PipelineContext, SharedPipelineData } from "./context";
/**
 * Context for transcription operations (shared between transcribe and flush).
 * It is the type of TranscribeParams.context and the sole argument of
 * TranscriptionProvider.flush. Every field is optional.
 */
export interface TranscribeContext {
// String-to-string vocabulary map.
// NOTE(review): what the keys and values represent is not visible here — confirm with the producer.
vocabulary?: Map<string, string>;
// Accessibility context result; may be null or omitted.
accessibilityContext?: GetAccessibilityContextResult | null;
// TranscriptionService.finalizeSession passes the last entry of the session's
// transcription results here (undefined when there are none yet).
previousChunk?: string;
// TranscriptionService.finalizeSession passes the session's transcription results
// joined with spaces and trimmed (undefined when that string is empty).
aggregatedTranscription?: string;
// Transcription language. WhisperProvider falls back to "auto" when this is empty or omitted.
language?: string;
}
// Transcription input parameters
export interface TranscribeParams {
audioData: Float32Array;
speechProbability?: number; // Speech probability from frontend VAD (0-1)
flush?: boolean; // Whether to flush any buffered audio
context: {
vocabulary?: Map<string, string>;
accessibilityContext?: GetAccessibilityContextResult | null;
previousChunk?: string;
aggregatedTranscription?: string;
language?: string;
};
context: TranscribeContext;
}
// Formatting input parameters
@ -37,6 +39,8 @@ export interface FormatParams {
/**
 * Contract for a transcription backend. AmicalCloudProvider and WhisperProvider
 * are declared as implementing it.
 */
export interface TranscriptionProvider {
// Provider name (read-only).
readonly name: string;
// Process one audio chunk described by params and resolve to text.
// The visible implementations buffer the chunk and resolve to "" until their
// speech/silence check decides a transcription should run.
transcribe(params: TranscribeParams): Promise<string>;
// Transcribe whatever is still buffered, using the given context, and resolve to text.
// TranscriptionService.finalizeSession calls this when finalizing a session.
flush(context: TranscribeContext): Promise<string>;
reset(): void; // Clear internal buffers without transcribing
}
// Formatting provider interface
@ -71,7 +75,7 @@ export interface StreamingSession {
firstChunkReceivedAt?: number; // When first audio chunk arrived at transcription service
recordingStartedAt?: number; // When user pressed record button (from RecordingManager)
recordingStoppedAt?: number; // When user released record button (from RecordingManager)
finalChunkReceivedAt?: number; // When final chunk arrived at transcription service
finalizationStartedAt?: number; // When finalizeSession() was called
}
// Simple pipeline configuration

View file

@ -1,6 +1,7 @@
import {
TranscriptionProvider,
TranscribeParams,
TranscribeContext,
} from "../../core/pipeline-types";
import { logger } from "../../../main/logger";
import { AuthService } from "../../../services/auth-service";
@ -51,21 +52,16 @@ export class AmicalCloudProvider implements TranscriptionProvider {
});
}
/**
* Process an audio chunk - buffers and conditionally transcribes
*/
async transcribe(params: TranscribeParams): Promise<string> {
try {
const {
audioData,
speechProbability = 1,
flush = false,
context,
} = params;
const { audioData, speechProbability = 1, context } = params;
// Store language for use in API call (undefined = auto-detect)
// Store context for API call
this.currentLanguage = context.language;
// Store accessibility context for the API request
this.currentAccessibilityContext = context?.accessibilityContext ?? null;
this.currentAggregatedTranscription = context?.aggregatedTranscription;
// Check authentication
@ -89,40 +85,46 @@ export class AmicalCloudProvider implements TranscriptionProvider {
this.currentSilenceFrameCount++;
}
// Calculate durations
const silenceDuration =
((this.currentSilenceFrameCount * this.FRAME_SIZE) / this.SAMPLE_RATE) *
1000;
const speechDuration =
((this.frameBuffer.length * this.FRAME_SIZE) / this.SAMPLE_RATE) * 1000;
// Determine if we should process
const shouldProcess =
flush ||
(speechDuration >= this.MIN_SPEECH_DURATION_MS &&
silenceDuration >= this.MAX_SILENCE_DURATION_MS);
if (!shouldProcess) {
// Only transcribe if speech/silence patterns indicate we should
if (!this.shouldTranscribe()) {
return "";
}
// Process accumulated audio (pass flush flag for formatting decision)
const result = await this.processAudio(flush);
// Clear buffer after processing
this.frameBuffer = [];
this.frameBufferSpeechProbabilities = [];
this.currentSilenceFrameCount = 0;
return result;
return this.doTranscription(false);
} catch (error) {
logger.transcription.error("Cloud transcription error:", error);
throw error;
}
}
private async processAudio(isFinal: boolean = false): Promise<string> {
// Combine all frames into a single Float32Array (may be empty)
/**
 * Flush any buffered audio and return the transcription with formatting enabled.
 * Called at the end of a recording session.
 *
 * @param context - language, accessibility context and aggregated transcription
 *   to store on the provider for the API call
 * @returns the text resolved by doTranscription(true)
 * @throws Error "Authentication required for cloud transcription" when the auth
 *   service reports the user is not authenticated; errors raised while
 *   transcribing are logged here and rethrown unchanged
 */
async flush(context: TranscribeContext): Promise<string> {
  try {
    // Store context for API call. `context` is dereferenced directly on the
    // first line, so optional chaining on the following lines adds nothing.
    this.currentLanguage = context.language;
    this.currentAccessibilityContext = context.accessibilityContext ?? null;
    this.currentAggregatedTranscription = context.aggregatedTranscription;

    // Check authentication
    if (!(await this.authService.isAuthenticated())) {
      throw new Error("Authentication required for cloud transcription");
    }

    // Await inside the try block: returning the bare promise would let a
    // rejection skip the catch below, so the failure would never be logged.
    return await this.doTranscription(true);
  } catch (error) {
    logger.transcription.error("Cloud transcription error:", error);
    throw error;
  }
}
/**
* Shared transcription logic - aggregates buffer, calls cloud API, clears state
*/
private async doTranscription(enableFormatting: boolean): Promise<string> {
// Combine all frames into a single Float32Array
const totalLength = this.frameBuffer.reduce(
(acc, frame) => acc + frame.length,
0,
@ -134,9 +136,43 @@ export class AmicalCloudProvider implements TranscriptionProvider {
offset += frame.length;
}
// Try transcription with automatic retry on 401
// Enable formatting only on final chunk
return this.makeTranscriptionRequest(combinedAudio, false, isFinal);
// Clear frame buffers only (context values needed for API call below)
this.frameBuffer = [];
this.frameBufferSpeechProbabilities = [];
this.currentSilenceFrameCount = 0;
// Make the API request
return this.makeTranscriptionRequest(
combinedAudio,
false,
enableFormatting,
);
}
/**
 * Drop all per-session state without sending anything to the API.
 * Called when cancelling a session to prevent audio bleed.
 */
reset(): void {
  // Request context stored by transcribe()/flush()
  this.currentAggregatedTranscription = undefined;
  this.currentAccessibilityContext = null;
  this.currentLanguage = undefined;

  // Buffered frames and the silence counter
  this.currentSilenceFrameCount = 0;
  this.frameBufferSpeechProbabilities = [];
  this.frameBuffer = [];
}
/**
 * Decide whether the buffered audio should be transcribed now.
 *
 * @returns true only when the buffered duration has reached
 *   MIN_SPEECH_DURATION_MS and the trailing silence has reached
 *   MAX_SILENCE_DURATION_MS
 */
private shouldTranscribe(): boolean {
  // Frame count -> milliseconds, same arithmetic for both measurements
  const framesToMs = (frameCount: number): number =>
    ((frameCount * this.FRAME_SIZE) / this.SAMPLE_RATE) * 1000;

  const bufferedMs = framesToMs(this.frameBuffer.length);
  if (!(bufferedMs >= this.MIN_SPEECH_DURATION_MS)) {
    return false;
  }

  const trailingSilenceMs = framesToMs(this.currentSilenceFrameCount);
  return trailingSilenceMs >= this.MAX_SILENCE_DURATION_MS;
}
private async makeTranscriptionRequest(
@ -144,9 +180,13 @@ export class AmicalCloudProvider implements TranscriptionProvider {
isRetry = false,
enableFormatting = false,
): Promise<string> {
// Skip API call if no audio and formatting not requested
if (audioData.length === 0 && !enableFormatting) {
return "";
// Skip API call if there's nothing to process
if (audioData.length === 0) {
const hasTextToFormat =
enableFormatting && this.currentAggregatedTranscription?.trim();
if (!hasTextToFormat) {
return "";
}
}
// Get auth token
@ -166,112 +206,104 @@ export class AmicalCloudProvider implements TranscriptionProvider {
formatting: enableFormatting,
});
try {
const response = await fetch(`${this.apiEndpoint}/transcribe`, {
method: "POST",
headers: {
"Content-Type": "application/json",
Authorization: `Bearer ${idToken}`,
"User-Agent": getUserAgent(),
const response = await fetch(`${this.apiEndpoint}/transcribe`, {
method: "POST",
headers: {
"Content-Type": "application/json",
Authorization: `Bearer ${idToken}`,
"User-Agent": getUserAgent(),
},
body: JSON.stringify({
audioData: Array.from(audioData),
language: this.currentLanguage,
previousTranscription: this.currentAggregatedTranscription,
formatting: {
enabled: enableFormatting,
},
body: JSON.stringify({
audioData: Array.from(audioData),
language: this.currentLanguage,
previousTranscription: this.currentAggregatedTranscription,
formatting: {
enabled: enableFormatting,
},
sharedContext: this.currentAccessibilityContext
? {
selectedText:
this.currentAccessibilityContext.context?.textSelection
?.selectedText,
beforeText:
this.currentAccessibilityContext.context?.textSelection
?.preSelectionText,
afterText:
this.currentAccessibilityContext.context?.textSelection
?.postSelectionText,
appType: detectApplicationType(
this.currentAccessibilityContext,
),
appBundleId:
this.currentAccessibilityContext.context?.application
?.bundleIdentifier,
appName:
this.currentAccessibilityContext.context?.application?.name,
appUrl:
this.currentAccessibilityContext.context?.windowInfo?.url,
surroundingContext: "", // Empty for now, future enhancement
}
: undefined,
}),
});
sharedContext: this.currentAccessibilityContext
? {
selectedText:
this.currentAccessibilityContext.context?.textSelection
?.selectedText,
beforeText:
this.currentAccessibilityContext.context?.textSelection
?.preSelectionText,
afterText:
this.currentAccessibilityContext.context?.textSelection
?.postSelectionText,
appType: detectApplicationType(this.currentAccessibilityContext),
appBundleId:
this.currentAccessibilityContext.context?.application
?.bundleIdentifier,
appName:
this.currentAccessibilityContext.context?.application?.name,
appUrl: this.currentAccessibilityContext.context?.windowInfo?.url,
surroundingContext: "", // Empty for now, future enhancement
}
: undefined,
}),
});
// Handle 401 with token refresh and retry
if (response.status === 401) {
if (isRetry) {
// Already retried once, give up
throw new Error("Authentication failed - please log in again");
}
// Handle 401 with token refresh and retry
if (response.status === 401) {
if (isRetry) {
// Already retried once, give up
throw new Error("Authentication failed - please log in again");
}
logger.transcription.warn(
"Got 401 response, attempting token refresh and retry",
logger.transcription.warn(
"Got 401 response, attempting token refresh and retry",
);
try {
// Force token refresh
await this.authService.refreshTokenIfNeeded();
// Retry the request once (preserve formatting flag)
return await this.makeTranscriptionRequest(
audioData,
true,
enableFormatting,
);
try {
// Force token refresh
await this.authService.refreshTokenIfNeeded();
// Retry the request once (preserve formatting flag)
return await this.makeTranscriptionRequest(
audioData,
true,
enableFormatting,
);
} catch (refreshError) {
logger.transcription.error("Token refresh failed:", refreshError);
throw new Error("Authentication failed - please log in again");
}
} catch (refreshError) {
logger.transcription.error("Token refresh failed:", refreshError);
throw new Error("Authentication failed - please log in again");
}
if (response.status === 403) {
throw new Error("Subscription required for cloud transcription");
}
if (response.status === 429) {
const errorData = await response.json();
throw new Error(
`Word limit exceeded: ${errorData.currentWords}/${errorData.limit}`,
);
}
if (!response.ok) {
const errorText = await response.text();
logger.transcription.error("Cloud API error:", {
status: response.status,
statusText: response.statusText,
error: errorText,
});
throw new Error(`Cloud API error: ${response.statusText}`);
}
const result: CloudTranscriptionResponse = await response.json();
if (!result.success) {
throw new Error(result.error || "Cloud transcription failed");
}
logger.transcription.info("Cloud transcription successful", {
textLength: result.transcription?.length || 0,
language: result.language,
duration: result.duration,
});
return result.transcription || "";
} catch (error) {
logger.transcription.error("Cloud transcription request failed:", error);
throw error;
}
if (response.status === 403) {
throw new Error("Subscription required for cloud transcription");
}
if (response.status === 429) {
const errorData = await response.json();
throw new Error(
`Word limit exceeded: ${errorData.currentWords}/${errorData.limit}`,
);
}
if (!response.ok) {
const errorText = await response.text();
logger.transcription.error("Cloud API error:", {
status: response.status,
statusText: response.statusText,
error: errorText,
});
throw new Error(`Cloud API error: ${response.statusText}`);
}
const result: CloudTranscriptionResponse = await response.json();
if (!result.success) {
throw new Error(result.error || "Cloud transcription failed");
}
logger.transcription.info("Cloud transcription successful", {
textLength: result.transcription?.length || 0,
language: result.language,
duration: result.duration,
});
return result.transcription || "";
}
}

View file

@ -1,6 +1,7 @@
import {
TranscriptionProvider,
TranscribeParams,
TranscribeContext,
} from "../../core/pipeline-types";
import { logger } from "../../../main/logger";
import { ModelService } from "../../../services/model-service";
@ -74,74 +75,79 @@ export class WhisperProvider implements TranscriptionProvider {
}
}
/**
 * Buffer one incoming audio frame and transcribe once the speech/silence
 * check says the buffer is ready.
 *
 * @param params - the frame (audioData), its speech probability (treated as 1
 *   when omitted) and the transcription context
 * @returns "" while still buffering, otherwise the result of doTranscription(context)
 */
async transcribe(params: TranscribeParams): Promise<string> {
  await this.initializeWhisper();

  const {
    audioData: frame,
    speechProbability: probability = 1,
    context,
  } = params;

  // Keep the frame and its probability side by side in the two buffers
  this.frameBuffer.push(frame);
  this.frameBufferSpeechProbabilities.push(probability);

  // Logged before the silence counter is updated for this frame
  logger.transcription.debug(
    `Frame received - SpeechProb: ${probability.toFixed(3)}, Buffer size: ${this.frameBuffer.length}, Silence count: ${this.currentSilenceFrameCount}`,
  );

  // A frame counts as speech when its probability exceeds the threshold
  if (probability > this.SPEECH_PROBABILITY_THRESHOLD) {
    this.currentSilenceFrameCount = 0;
    this.lastSpeechTimestamp = Date.now();
  } else {
    this.currentSilenceFrameCount += 1;
  }

  // Keep buffering until the speech/silence pattern asks for a transcription
  return this.shouldTranscribe() ? this.doTranscription(context) : "";
}
/**
 * Transcribe whatever is still buffered.
 * Called at the end of a recording session.
 *
 * @param context - transcription context handed on to doTranscription
 * @returns "" when no frames are buffered, otherwise the result of doTranscription(context)
 */
async flush(context: TranscribeContext): Promise<string> {
  const hasBufferedFrames = this.frameBuffer.length !== 0;
  if (hasBufferedFrames) {
    // Whisper is only initialized when there is something to transcribe
    await this.initializeWhisper();
    return this.doTranscription(context);
  }
  return "";
}
/**
* Shared transcription logic - aggregates buffer, calls whisper, clears state
* Assumes initializeWhisper() was already called by caller
*/
private async doTranscription(context: TranscribeContext): Promise<string> {
try {
await this.initializeWhisper();
// Extract parameters from the new structure
const {
audioData,
speechProbability = 1,
context,
flush = false,
} = params;
const { vocabulary, aggregatedTranscription, language } = context;
// Audio data is already Float32Array
// Add frame to buffer with speech probability
this.frameBuffer.push(audioData);
this.frameBufferSpeechProbabilities.push(speechProbability);
// Consider it speech if probability is above threshold
const isSpeech = speechProbability > this.SPEECH_PROBABILITY_THRESHOLD;
logger.transcription.debug(
`Frame received - SpeechProb: ${speechProbability.toFixed(3)}, Buffer size: ${this.frameBuffer.length}, Silence count: ${this.currentSilenceFrameCount}`,
);
// Handle speech/silence logic
if (isSpeech) {
this.currentSilenceFrameCount = 0;
this.lastSpeechTimestamp = Date.now();
} else {
this.currentSilenceFrameCount++;
}
// Determine if we should transcribe
const shouldTranscribe = flush || this.shouldTranscribe();
if (!shouldTranscribe) {
// Keep buffering
return "";
}
const isAllSilent = this.isAllSilent();
// Aggregate buffered frames
const aggregatedAudio = this.aggregateFrames();
// Clear buffers immediately after aggregation, before async operations
this.frameBuffer = [];
this.frameBufferSpeechProbabilities = [];
this.currentSilenceFrameCount = 0;
// Clear buffers immediately after aggregation
this.reset();
if (isAllSilent && this.IGNORE_FULLY_SILENT_CHUNKS) {
logger.transcription.debug("Skipping transcription - all silent");
return "";
}
// Skip if too short or only silence
/* if (aggregatedAudio.length < this.FRAME_SIZE * 2) {
logger.transcription.debug("Skipping transcription - audio too short");
return "";
} */
logger.transcription.debug(
`Starting transcription of ${aggregatedAudio.length} samples (${((aggregatedAudio.length / this.SAMPLE_RATE) * 1000).toFixed(0)}ms)`,
);
// Transcribe using the local Whisper wrapper
if (!this.workerWrapper) {
throw new Error("Worker wrapper is not initialized");
}
@ -152,7 +158,7 @@ export class WhisperProvider implements TranscriptionProvider {
aggregatedTranscription,
);
const text = await this.workerWrapper!.exec<string>("transcribeAudio", [
const text = await this.workerWrapper.exec<string>("transcribeAudio", [
aggregatedAudio,
{
language: language || "auto",
@ -174,11 +180,20 @@ export class WhisperProvider implements TranscriptionProvider {
}
}
/**
 * Empty the frame buffers and zero the silence counter without transcribing.
 * Called when cancelling a session to prevent audio bleed.
 */
reset(): void {
  this.currentSilenceFrameCount = 0;
  this.frameBufferSpeechProbabilities = [];
  this.frameBuffer = [];
}
private shouldTranscribe(): boolean {
// Transcribe if:
// 1. We have significant silence after speech
// 2. Buffer is getting too large
// 3. Final chunk was received (handled elsewhere)
const bufferDurationMs =
((this.frameBuffer.length * this.FRAME_SIZE) / this.SAMPLE_RATE) * 1000;
@ -186,7 +201,7 @@ export class WhisperProvider implements TranscriptionProvider {
((this.currentSilenceFrameCount * this.FRAME_SIZE) / this.SAMPLE_RATE) *
1000;
// If we have speech (potential cause frameBuffer might just be all silence too, and thats okay) and then significant silence, transcribe
// If we have speech and then significant silence, transcribe
if (
this.frameBuffer.length > 0 &&
silenceDurationMs > this.MAX_SILENCE_DURATION_MS
@ -357,9 +372,6 @@ export class WhisperProvider implements TranscriptionProvider {
}
}
// Clear buffers
this.frameBuffer = [];
this.frameBufferSpeechProbabilities = [];
this.currentSilenceFrameCount = 0;
this.reset();
}
}

View file

@ -214,23 +214,14 @@ export class TranscriptionService {
/**
* Process a single audio chunk in streaming mode
* For finalization, use finalizeSession() instead
*/
async processStreamingChunk(options: {
sessionId: string;
audioChunk: Float32Array;
isFinal?: boolean;
audioFilePath?: string;
recordingStartedAt?: number;
recordingStoppedAt?: number;
}): Promise<string> {
const {
sessionId,
audioChunk,
isFinal = false,
audioFilePath,
recordingStartedAt,
recordingStoppedAt,
} = options;
const { sessionId, audioChunk, recordingStartedAt } = options;
// Run VAD on the audio chunk
let speechProbability = 0;
@ -281,7 +272,7 @@ export class TranscriptionService {
context: streamingContext,
transcriptionResults: [],
firstChunkReceivedAt: performance.now(),
recordingStartedAt: recordingStartedAt, // From RecordingManager (when user pressed record)
recordingStartedAt: recordingStartedAt,
};
this.streamingSessions.set(sessionId, session);
@ -305,11 +296,10 @@ export class TranscriptionService {
// Select the appropriate provider
const provider = await this.selectProvider();
// Transcribe with flush parameter for final chunks
// Transcribe chunk (flush is done separately in finalizeSession)
const chunkTranscription = await provider.transcribe({
audioData: audioChunk,
speechProbability: speechProbability, // Now from VAD service
flush: isFinal, // Pass flush flag for final chunks
speechProbability: speechProbability,
context: {
vocabulary: session.context.sharedData.vocabulary,
accessibilityContext: session.context.sharedData.accessibilityContext,
@ -334,25 +324,96 @@ export class TranscriptionService {
sessionId,
frameSize: audioChunk.length,
hadTranscription: chunkTranscription.length > 0,
isFinal,
});
} finally {
// Release transcription mutex - always release even on error
this.transcriptionMutex.release();
}
const completeTranscriptionTillNow = session.transcriptionResults
.join(" ")
.trim();
// this is the final chunk, save the transcription
if (!isFinal) {
return completeTranscriptionTillNow;
return session.transcriptionResults.join(" ").trim();
}
/**
 * Cancel a streaming session without processing.
 * Used when recording is cancelled (e.g., quick tap, accidental activation).
 *
 * Does nothing when no session with this id is registered.
 *
 * @param sessionId - id of the streaming session to discard
 */
async cancelStreamingSession(sessionId: string): Promise<void> {
  // Cheap early exit so an unknown id never waits on the mutex
  if (!this.streamingSessions.has(sessionId)) {
    return;
  }

  // Acquire mutex to prevent race with processStreamingChunk
  await this.transcriptionMutex.acquire();
  try {
    // Re-check under the mutex: the session may have been removed while this
    // call was waiting, and resetting the provider then would wipe buffers
    // that no longer belong to the cancelled session.
    if (!this.streamingSessions.has(sessionId)) {
      return;
    }

    // Clear provider buffers to prevent audio bleed into next session
    this.currentProvider?.reset();
    this.streamingSessions.delete(sessionId);
    logger.transcription.info("Streaming session cancelled", { sessionId });
  } finally {
    this.transcriptionMutex.release();
  }
}
/**
* Finalize a streaming session - flush provider, format, save to DB
* Call this instead of processStreamingChunk with isFinal=true
*/
async finalizeSession(options: {
sessionId: string;
audioFilePath?: string;
recordingStartedAt?: number;
recordingStoppedAt?: number;
}): Promise<string> {
const { sessionId, audioFilePath, recordingStartedAt, recordingStoppedAt } =
options;
const session = this.streamingSessions.get(sessionId);
if (!session) {
logger.transcription.warn("No session found to finalize", { sessionId });
return "";
}
session.finalChunkReceivedAt = performance.now();
// Update session timestamps
session.finalizationStartedAt = performance.now();
session.recordingStoppedAt = recordingStoppedAt;
if (recordingStartedAt && !session.recordingStartedAt) {
session.recordingStartedAt = recordingStartedAt;
}
let completeTranscription = completeTranscriptionTillNow;
// Flush provider to get any remaining buffered audio
await this.transcriptionMutex.acquire();
try {
const previousChunk =
session.transcriptionResults.length > 0
? session.transcriptionResults[
session.transcriptionResults.length - 1
]
: undefined;
const aggregatedTranscription = session.transcriptionResults
.join(" ")
.trim();
const provider = await this.selectProvider();
const finalTranscription = await provider.flush({
vocabulary: session.context.sharedData.vocabulary,
accessibilityContext: session.context.sharedData.accessibilityContext,
previousChunk,
aggregatedTranscription: aggregatedTranscription || undefined,
language: session.context.sharedData.userPreferences?.language,
});
if (finalTranscription.trim()) {
session.transcriptionResults.push(finalTranscription);
logger.transcription.info("Whisper returned final transcription", {
sessionId,
transcriptionLength: finalTranscription.length,
totalResults: session.transcriptionResults.length,
});
}
} finally {
this.transcriptionMutex.release();
}
let completeTranscription = session.transcriptionResults.join(" ").trim();
let formattingDuration: number | undefined;
logger.transcription.info("Finalizing streaming session", {

View file

@ -15,7 +15,7 @@ export const recordingRouter = createRouter({
if (!recordingManager) {
throw new Error("Recording manager not available");
}
return await recordingManager.startRecording("hands-free");
return await recordingManager.signalStart();
}),
signalStop: procedure.mutation(async ({ ctx }) => {
@ -23,7 +23,7 @@ export const recordingRouter = createRouter({
if (!recordingManager) {
throw new Error("Recording manager not available");
}
return await recordingManager.stopRecording();
return await recordingManager.signalStop();
}),
// Using Observable instead of async generator due to Symbol.asyncDispose conflict

View file

@ -1,6 +1 @@
export type RecordingState =
| "idle"
| "starting"
| "recording"
| "stopping"
| "error";
export type RecordingState = "idle" | "starting" | "recording" | "stopping";

View file

@ -135,6 +135,25 @@ export class StreamingWavWriter {
}
}
/**
 * Stop writing and end the file stream without finalizing.
 * Used when recording is cancelled. Does nothing if the writer is already
 * marked finalized.
 */
async abort(): Promise<void> {
  if (!this.isFinalized) {
    // Mark as finalized first so no further writes are accepted
    this.isFinalized = true;

    // Wait for the stream's end() callback before reporting
    const streamEnded = new Promise<void>((done) => {
      this.fileStream.end(() => done());
    });
    await streamEnded;

    logger.transcription.info("WAV writer aborted", {
      path: this.fileStream.path,
    });
  }
}
/**
* Get the current size of audio data written
*/