chore: implement auto-dismiss of unintentional dictation (sub 500ms)

2026-01-07 12:59:23 +05:30 · 2026-01-07 12:59:23 +05:30 · 73734bfdd9
commit 73734bfdd9
parent 1d0c1a68df
9 changed files with 998 additions and 587 deletions
--- a/apps/desktop/src/main/main.ts
+++ b/apps/desktop/src/main/main.ts
@ -1,13 +1,27 @@
 import dotenv from "dotenv";
 dotenv.config();

-import { app } from "electron";
+import { app, ipcMain } from "electron";
+import { logger } from "./logger";

 import started from "electron-squirrel-startup";
 import { AppManager } from "./core/app-manager";
 import { updateElectronApp } from "update-electron-app";
 import { isWindows } from "../utils/platform";

+// Set up renderer logging relay (allows renderer to send logs to main process)
+ipcMain.handle(
+  "log-message",
+  (_event, level: string, scope: string, ...args: unknown[]) => {
+    const scopedLogger =
+      logger[scope as keyof typeof logger] || logger.renderer;
+    const logMethod = scopedLogger[level as keyof typeof scopedLogger];
+    if (typeof logMethod === "function") {
+      logMethod(...args);
+    }
+  },
+);
+
 if (started) {
  app.quit();
 }
--- a/apps/desktop/src/main/managers/recording-manager.ts
+++ b/apps/desktop/src/main/managers/recording-manager.ts
--- a/apps/desktop/src/pipeline/core/pipeline-types.ts
+++ b/apps/desktop/src/pipeline/core/pipeline-types.ts
@ -7,18 +7,20 @@ import { PipelineContext } from "./context";
 import { GetAccessibilityContextResult } from "@amical/types";
 export { PipelineContext, SharedPipelineData } from "./context";

+// Context for transcription operations (shared between transcribe and flush)
+export interface TranscribeContext {
+  vocabulary?: Map<string, string>;
+  accessibilityContext?: GetAccessibilityContextResult | null;
+  previousChunk?: string;
+  aggregatedTranscription?: string;
+  language?: string;
+}
+
 // Transcription input parameters
 export interface TranscribeParams {
  audioData: Float32Array;
  speechProbability?: number; // Speech probability from frontend VAD (0-1)
-  flush?: boolean; // Whether to flush any buffered audio
-  context: {
-    vocabulary?: Map<string, string>;
-    accessibilityContext?: GetAccessibilityContextResult | null;
-    previousChunk?: string;
-    aggregatedTranscription?: string;
-    language?: string;
-  };
+  context: TranscribeContext;
 }

 // Formatting input parameters
@ -37,6 +39,8 @@ export interface FormatParams {
 export interface TranscriptionProvider {
  readonly name: string;
  transcribe(params: TranscribeParams): Promise<string>;
+  flush(context: TranscribeContext): Promise<string>;
+  reset(): void; // Clear internal buffers without transcribing
 }

 // Formatting provider interface
@ -71,7 +75,7 @@ export interface StreamingSession {
  firstChunkReceivedAt?: number; // When first audio chunk arrived at transcription service
  recordingStartedAt?: number; // When user pressed record button (from RecordingManager)
  recordingStoppedAt?: number; // When user released record button (from RecordingManager)
-  finalChunkReceivedAt?: number; // When final chunk arrived at transcription service
+  finalizationStartedAt?: number; // When finalizeSession() was called
 }

 // Simple pipeline configuration
--- a/apps/desktop/src/pipeline/providers/transcription/amical-cloud-provider.ts
+++ b/apps/desktop/src/pipeline/providers/transcription/amical-cloud-provider.ts
@ -1,6 +1,7 @@
 import {
  TranscriptionProvider,
  TranscribeParams,
+  TranscribeContext,
 } from "../../core/pipeline-types";
 import { logger } from "../../../main/logger";
 import { AuthService } from "../../../services/auth-service";
@ -51,21 +52,16 @@ export class AmicalCloudProvider implements TranscriptionProvider {
    });
  }

+  /**
+   * Process an audio chunk - buffers and conditionally transcribes
+   */
  async transcribe(params: TranscribeParams): Promise<string> {
    try {
-      const {
-        audioData,
-        speechProbability = 1,
-        flush = false,
-        context,
-      } = params;
+      const { audioData, speechProbability = 1, context } = params;

-      // Store language for use in API call (undefined = auto-detect)
+      // Store context for API call
      this.currentLanguage = context.language;
-
-      // Store accessibility context for the API request
      this.currentAccessibilityContext = context?.accessibilityContext ?? null;
-
      this.currentAggregatedTranscription = context?.aggregatedTranscription;

      // Check authentication
@ -89,40 +85,46 @@ export class AmicalCloudProvider implements TranscriptionProvider {
        this.currentSilenceFrameCount++;
      }

-      // Calculate durations
-      const silenceDuration =
-        ((this.currentSilenceFrameCount * this.FRAME_SIZE) / this.SAMPLE_RATE) *
-        1000;
-      const speechDuration =
-        ((this.frameBuffer.length * this.FRAME_SIZE) / this.SAMPLE_RATE) * 1000;
-
-      // Determine if we should process
-      const shouldProcess =
-        flush ||
-        (speechDuration >= this.MIN_SPEECH_DURATION_MS &&
-          silenceDuration >= this.MAX_SILENCE_DURATION_MS);
-
-      if (!shouldProcess) {
+      // Only transcribe if speech/silence patterns indicate we should
+      if (!this.shouldTranscribe()) {
        return "";
      }

-      // Process accumulated audio (pass flush flag for formatting decision)
-      const result = await this.processAudio(flush);
-
-      // Clear buffer after processing
-      this.frameBuffer = [];
-      this.frameBufferSpeechProbabilities = [];
-      this.currentSilenceFrameCount = 0;
-
-      return result;
+      return this.doTranscription(false);
    } catch (error) {
      logger.transcription.error("Cloud transcription error:", error);
      throw error;
    }
  }

-  private async processAudio(isFinal: boolean = false): Promise<string> {
-    // Combine all frames into a single Float32Array (may be empty)
+  /**
+   * Flush any buffered audio and return transcription with formatting
+   * Called at the end of a recording session
+   */
+  async flush(context: TranscribeContext): Promise<string> {
+    try {
+      // Store context for API call
+      this.currentLanguage = context.language;
+      this.currentAccessibilityContext = context?.accessibilityContext ?? null;
+      this.currentAggregatedTranscription = context?.aggregatedTranscription;
+
+      // Check authentication
+      if (!(await this.authService.isAuthenticated())) {
+        throw new Error("Authentication required for cloud transcription");
+      }
+
+      return this.doTranscription(true);
+    } catch (error) {
+      logger.transcription.error("Cloud transcription error:", error);
+      throw error;
+    }
+  }
+
+  /**
+   * Shared transcription logic - aggregates buffer, calls cloud API, clears state
+   */
+  private async doTranscription(enableFormatting: boolean): Promise<string> {
+    // Combine all frames into a single Float32Array
    const totalLength = this.frameBuffer.reduce(
      (acc, frame) => acc + frame.length,
      0,
@ -134,9 +136,43 @@ export class AmicalCloudProvider implements TranscriptionProvider {
      offset += frame.length;
    }

-    // Try transcription with automatic retry on 401
-    // Enable formatting only on final chunk
-    return this.makeTranscriptionRequest(combinedAudio, false, isFinal);
+    // Clear frame buffers only (context values needed for API call below)
+    this.frameBuffer = [];
+    this.frameBufferSpeechProbabilities = [];
+    this.currentSilenceFrameCount = 0;
+
+    // Make the API request
+    return this.makeTranscriptionRequest(
+      combinedAudio,
+      false,
+      enableFormatting,
+    );
+  }
+
+  /**
+   * Clear internal buffers without transcribing
+   * Called when cancelling a session to prevent audio bleed
+   */
+  reset(): void {
+    this.frameBuffer = [];
+    this.frameBufferSpeechProbabilities = [];
+    this.currentSilenceFrameCount = 0;
+    this.currentLanguage = undefined;
+    this.currentAccessibilityContext = null;
+    this.currentAggregatedTranscription = undefined;
+  }
+
+  private shouldTranscribe(): boolean {
+    const silenceDuration =
+      ((this.currentSilenceFrameCount * this.FRAME_SIZE) / this.SAMPLE_RATE) *
+      1000;
+    const speechDuration =
+      ((this.frameBuffer.length * this.FRAME_SIZE) / this.SAMPLE_RATE) * 1000;
+
+    return (
+      speechDuration >= this.MIN_SPEECH_DURATION_MS &&
+      silenceDuration >= this.MAX_SILENCE_DURATION_MS
+    );
  }

  private async makeTranscriptionRequest(
@ -144,9 +180,13 @@ export class AmicalCloudProvider implements TranscriptionProvider {
    isRetry = false,
    enableFormatting = false,
  ): Promise<string> {
-    // Skip API call if no audio and formatting not requested
-    if (audioData.length === 0 && !enableFormatting) {
-      return "";
+    // Skip API call if there's nothing to process
+    if (audioData.length === 0) {
+      const hasTextToFormat =
+        enableFormatting && this.currentAggregatedTranscription?.trim();
+      if (!hasTextToFormat) {
+        return "";
+      }
    }

    // Get auth token
@ -166,112 +206,104 @@ export class AmicalCloudProvider implements TranscriptionProvider {
      formatting: enableFormatting,
    });

-    try {
-      const response = await fetch(`${this.apiEndpoint}/transcribe`, {
-        method: "POST",
-        headers: {
-          "Content-Type": "application/json",
-          Authorization: `Bearer ${idToken}`,
-          "User-Agent": getUserAgent(),
+    const response = await fetch(`${this.apiEndpoint}/transcribe`, {
+      method: "POST",
+      headers: {
+        "Content-Type": "application/json",
+        Authorization: `Bearer ${idToken}`,
+        "User-Agent": getUserAgent(),
+      },
+      body: JSON.stringify({
+        audioData: Array.from(audioData),
+        language: this.currentLanguage,
+        previousTranscription: this.currentAggregatedTranscription,
+        formatting: {
+          enabled: enableFormatting,
        },
-        body: JSON.stringify({
-          audioData: Array.from(audioData),
-          language: this.currentLanguage,
-          previousTranscription: this.currentAggregatedTranscription,
-          formatting: {
-            enabled: enableFormatting,
-          },
-          sharedContext: this.currentAccessibilityContext
-            ? {
-                selectedText:
-                  this.currentAccessibilityContext.context?.textSelection
-                    ?.selectedText,
-                beforeText:
-                  this.currentAccessibilityContext.context?.textSelection
-                    ?.preSelectionText,
-                afterText:
-                  this.currentAccessibilityContext.context?.textSelection
-                    ?.postSelectionText,
-                appType: detectApplicationType(
-                  this.currentAccessibilityContext,
-                ),
-                appBundleId:
-                  this.currentAccessibilityContext.context?.application
-                    ?.bundleIdentifier,
-                appName:
-                  this.currentAccessibilityContext.context?.application?.name,
-                appUrl:
-                  this.currentAccessibilityContext.context?.windowInfo?.url,
-                surroundingContext: "", // Empty for now, future enhancement
-              }
-            : undefined,
-        }),
-      });
+        sharedContext: this.currentAccessibilityContext
+          ? {
+              selectedText:
+                this.currentAccessibilityContext.context?.textSelection
+                  ?.selectedText,
+              beforeText:
+                this.currentAccessibilityContext.context?.textSelection
+                  ?.preSelectionText,
+              afterText:
+                this.currentAccessibilityContext.context?.textSelection
+                  ?.postSelectionText,
+              appType: detectApplicationType(this.currentAccessibilityContext),
+              appBundleId:
+                this.currentAccessibilityContext.context?.application
+                  ?.bundleIdentifier,
+              appName:
+                this.currentAccessibilityContext.context?.application?.name,
+              appUrl: this.currentAccessibilityContext.context?.windowInfo?.url,
+              surroundingContext: "", // Empty for now, future enhancement
+            }
+          : undefined,
+      }),
+    });

-      // Handle 401 with token refresh and retry
-      if (response.status === 401) {
-        if (isRetry) {
-          // Already retried once, give up
-          throw new Error("Authentication failed - please log in again");
-        }
+    // Handle 401 with token refresh and retry
+    if (response.status === 401) {
+      if (isRetry) {
+        // Already retried once, give up
+        throw new Error("Authentication failed - please log in again");
+      }

-        logger.transcription.warn(
-          "Got 401 response, attempting token refresh and retry",
+      logger.transcription.warn(
+        "Got 401 response, attempting token refresh and retry",
+      );
+
+      try {
+        // Force token refresh
+        await this.authService.refreshTokenIfNeeded();
+
+        // Retry the request once (preserve formatting flag)
+        return await this.makeTranscriptionRequest(
+          audioData,
+          true,
+          enableFormatting,
        );
-
-        try {
-          // Force token refresh
-          await this.authService.refreshTokenIfNeeded();
-
-          // Retry the request once (preserve formatting flag)
-          return await this.makeTranscriptionRequest(
-            audioData,
-            true,
-            enableFormatting,
-          );
-        } catch (refreshError) {
-          logger.transcription.error("Token refresh failed:", refreshError);
-          throw new Error("Authentication failed - please log in again");
-        }
+      } catch (refreshError) {
+        logger.transcription.error("Token refresh failed:", refreshError);
+        throw new Error("Authentication failed - please log in again");
      }
-
-      if (response.status === 403) {
-        throw new Error("Subscription required for cloud transcription");
-      }
-
-      if (response.status === 429) {
-        const errorData = await response.json();
-        throw new Error(
-          `Word limit exceeded: ${errorData.currentWords}/${errorData.limit}`,
-        );
-      }
-
-      if (!response.ok) {
-        const errorText = await response.text();
-        logger.transcription.error("Cloud API error:", {
-          status: response.status,
-          statusText: response.statusText,
-          error: errorText,
-        });
-        throw new Error(`Cloud API error: ${response.statusText}`);
-      }
-
-      const result: CloudTranscriptionResponse = await response.json();
-
-      if (!result.success) {
-        throw new Error(result.error || "Cloud transcription failed");
-      }
-
-      logger.transcription.info("Cloud transcription successful", {
-        textLength: result.transcription?.length || 0,
-        language: result.language,
-        duration: result.duration,
-      });
-
-      return result.transcription || "";
-    } catch (error) {
-      logger.transcription.error("Cloud transcription request failed:", error);
-      throw error;
    }
+
+    if (response.status === 403) {
+      throw new Error("Subscription required for cloud transcription");
+    }
+
+    if (response.status === 429) {
+      const errorData = await response.json();
+      throw new Error(
+        `Word limit exceeded: ${errorData.currentWords}/${errorData.limit}`,
+      );
+    }
+
+    if (!response.ok) {
+      const errorText = await response.text();
+      logger.transcription.error("Cloud API error:", {
+        status: response.status,
+        statusText: response.statusText,
+        error: errorText,
+      });
+      throw new Error(`Cloud API error: ${response.statusText}`);
+    }
+
+    const result: CloudTranscriptionResponse = await response.json();
+
+    if (!result.success) {
+      throw new Error(result.error || "Cloud transcription failed");
+    }
+
+    logger.transcription.info("Cloud transcription successful", {
+      textLength: result.transcription?.length || 0,
+      language: result.language,
+      duration: result.duration,
+    });
+
+    return result.transcription || "";
  }
 }
--- a/apps/desktop/src/pipeline/providers/transcription/whisper-provider.ts
+++ b/apps/desktop/src/pipeline/providers/transcription/whisper-provider.ts
@ -1,6 +1,7 @@
 import {
  TranscriptionProvider,
  TranscribeParams,
+  TranscribeContext,
 } from "../../core/pipeline-types";
 import { logger } from "../../../main/logger";
 import { ModelService } from "../../../services/model-service";
@ -74,74 +75,79 @@ export class WhisperProvider implements TranscriptionProvider {
    }
  }

+  /**
+   * Process an audio chunk - buffers and conditionally transcribes
+   */
  async transcribe(params: TranscribeParams): Promise<string> {
+    await this.initializeWhisper();
+
+    const { audioData, speechProbability = 1, context } = params;
+
+    // Add frame to buffer with speech probability
+    this.frameBuffer.push(audioData);
+    this.frameBufferSpeechProbabilities.push(speechProbability);
+
+    // Consider it speech if probability is above threshold
+    const isSpeech = speechProbability > this.SPEECH_PROBABILITY_THRESHOLD;
+
+    logger.transcription.debug(
+      `Frame received - SpeechProb: ${speechProbability.toFixed(3)}, Buffer size: ${this.frameBuffer.length}, Silence count: ${this.currentSilenceFrameCount}`,
+    );
+
+    // Handle speech/silence logic
+    if (isSpeech) {
+      this.currentSilenceFrameCount = 0;
+      this.lastSpeechTimestamp = Date.now();
+    } else {
+      this.currentSilenceFrameCount++;
+    }
+
+    // Only transcribe if speech/silence patterns indicate we should
+    if (!this.shouldTranscribe()) {
+      return "";
+    }
+
+    return this.doTranscription(context);
+  }
+
+  /**
+   * Flush any buffered audio and return transcription
+   * Called at the end of a recording session
+   */
+  async flush(context: TranscribeContext): Promise<string> {
+    if (this.frameBuffer.length === 0) {
+      return "";
+    }
+
+    await this.initializeWhisper();
+    return this.doTranscription(context);
+  }
+
+  /**
+   * Shared transcription logic - aggregates buffer, calls whisper, clears state
+   * Assumes initializeWhisper() was already called by caller
+   */
+  private async doTranscription(context: TranscribeContext): Promise<string> {
    try {
-      await this.initializeWhisper();
-
-      // Extract parameters from the new structure
-      const {
-        audioData,
-        speechProbability = 1,
-        context,
-        flush = false,
-      } = params;
      const { vocabulary, aggregatedTranscription, language } = context;

-      // Audio data is already Float32Array
-
-      // Add frame to buffer with speech probability
-      this.frameBuffer.push(audioData);
-      this.frameBufferSpeechProbabilities.push(speechProbability);
-
-      // Consider it speech if probability is above threshold
-      const isSpeech = speechProbability > this.SPEECH_PROBABILITY_THRESHOLD;
-
-      logger.transcription.debug(
-        `Frame received - SpeechProb: ${speechProbability.toFixed(3)}, Buffer size: ${this.frameBuffer.length}, Silence count: ${this.currentSilenceFrameCount}`,
-      );
-
-      // Handle speech/silence logic
-      if (isSpeech) {
-        this.currentSilenceFrameCount = 0;
-        this.lastSpeechTimestamp = Date.now();
-      } else {
-        this.currentSilenceFrameCount++;
-      }
-
-      // Determine if we should transcribe
-      const shouldTranscribe = flush || this.shouldTranscribe();
-
-      if (!shouldTranscribe) {
-        // Keep buffering
-        return "";
-      }
-
      const isAllSilent = this.isAllSilent();

      // Aggregate buffered frames
      const aggregatedAudio = this.aggregateFrames();

-      // Clear buffers immediately after aggregation, before async operations
-      this.frameBuffer = [];
-      this.frameBufferSpeechProbabilities = [];
-      this.currentSilenceFrameCount = 0;
+      // Clear buffers immediately after aggregation
+      this.reset();

      if (isAllSilent && this.IGNORE_FULLY_SILENT_CHUNKS) {
        logger.transcription.debug("Skipping transcription - all silent");
        return "";
      }

-      // Skip if too short or only silence
-      /* if (aggregatedAudio.length < this.FRAME_SIZE * 2) {
-        logger.transcription.debug("Skipping transcription - audio too short");
-        return "";
-      } */
-
      logger.transcription.debug(
        `Starting transcription of ${aggregatedAudio.length} samples (${((aggregatedAudio.length / this.SAMPLE_RATE) * 1000).toFixed(0)}ms)`,
      );

-      // Transcribe using the local Whisper wrapper
      if (!this.workerWrapper) {
        throw new Error("Worker wrapper is not initialized");
      }
@ -152,7 +158,7 @@ export class WhisperProvider implements TranscriptionProvider {
        aggregatedTranscription,
      );

-      const text = await this.workerWrapper!.exec<string>("transcribeAudio", [
+      const text = await this.workerWrapper.exec<string>("transcribeAudio", [
        aggregatedAudio,
        {
          language: language || "auto",
@ -174,11 +180,20 @@ export class WhisperProvider implements TranscriptionProvider {
    }
  }

+  /**
+   * Clear internal buffers without transcribing
+   * Called when cancelling a session to prevent audio bleed
+   */
+  reset(): void {
+    this.frameBuffer = [];
+    this.frameBufferSpeechProbabilities = [];
+    this.currentSilenceFrameCount = 0;
+  }
+
  private shouldTranscribe(): boolean {
    // Transcribe if:
    // 1. We have significant silence after speech
    // 2. Buffer is getting too large
-    // 3. Final chunk was received (handled elsewhere)

    const bufferDurationMs =
      ((this.frameBuffer.length * this.FRAME_SIZE) / this.SAMPLE_RATE) * 1000;
@ -186,7 +201,7 @@ export class WhisperProvider implements TranscriptionProvider {
      ((this.currentSilenceFrameCount * this.FRAME_SIZE) / this.SAMPLE_RATE) *
      1000;

-    // If we have speech (potential cause frameBuffer might just be all silence too, and thats okay) and then significant silence, transcribe
+    // If we have speech and then significant silence, transcribe
    if (
      this.frameBuffer.length > 0 &&
      silenceDurationMs > this.MAX_SILENCE_DURATION_MS
@ -357,9 +372,6 @@ export class WhisperProvider implements TranscriptionProvider {
      }
    }

-    // Clear buffers
-    this.frameBuffer = [];
-    this.frameBufferSpeechProbabilities = [];
-    this.currentSilenceFrameCount = 0;
+    this.reset();
  }
 }
--- a/apps/desktop/src/services/transcription-service.ts
+++ b/apps/desktop/src/services/transcription-service.ts
@ -214,23 +214,14 @@ export class TranscriptionService {

  /**
   * Process a single audio chunk in streaming mode
+   * For finalization, use finalizeSession() instead
   */
  async processStreamingChunk(options: {
    sessionId: string;
    audioChunk: Float32Array;
-    isFinal?: boolean;
-    audioFilePath?: string;
    recordingStartedAt?: number;
-    recordingStoppedAt?: number;
  }): Promise<string> {
-    const {
-      sessionId,
-      audioChunk,
-      isFinal = false,
-      audioFilePath,
-      recordingStartedAt,
-      recordingStoppedAt,
-    } = options;
+    const { sessionId, audioChunk, recordingStartedAt } = options;

    // Run VAD on the audio chunk
    let speechProbability = 0;
@ -281,7 +272,7 @@ export class TranscriptionService {
          context: streamingContext,
          transcriptionResults: [],
          firstChunkReceivedAt: performance.now(),
-          recordingStartedAt: recordingStartedAt, // From RecordingManager (when user pressed record)
+          recordingStartedAt: recordingStartedAt,
        };

        this.streamingSessions.set(sessionId, session);
@ -305,11 +296,10 @@ export class TranscriptionService {
      // Select the appropriate provider
      const provider = await this.selectProvider();

-      // Transcribe with flush parameter for final chunks
+      // Transcribe chunk (flush is done separately in finalizeSession)
      const chunkTranscription = await provider.transcribe({
        audioData: audioChunk,
-        speechProbability: speechProbability, // Now from VAD service
-        flush: isFinal, // Pass flush flag for final chunks
+        speechProbability: speechProbability,
        context: {
          vocabulary: session.context.sharedData.vocabulary,
          accessibilityContext: session.context.sharedData.accessibilityContext,
@ -334,25 +324,96 @@ export class TranscriptionService {
        sessionId,
        frameSize: audioChunk.length,
        hadTranscription: chunkTranscription.length > 0,
-        isFinal,
      });
    } finally {
      // Release transcription mutex - always release even on error
      this.transcriptionMutex.release();
    }
-    const completeTranscriptionTillNow = session.transcriptionResults
-      .join(" ")
-      .trim();

-    // this is the final chunk, save the transcription
-    if (!isFinal) {
-      return completeTranscriptionTillNow;
+    return session.transcriptionResults.join(" ").trim();
+  }
+
+  /**
+   * Cancel a streaming session without processing
+   * Used when recording is cancelled (e.g., quick tap, accidental activation)
+   */
+  async cancelStreamingSession(sessionId: string): Promise<void> {
+    if (this.streamingSessions.has(sessionId)) {
+      // Acquire mutex to prevent race with processStreamingChunk
+      await this.transcriptionMutex.acquire();
+      try {
+        // Clear provider buffers to prevent audio bleed into next session
+        this.currentProvider?.reset();
+
+        this.streamingSessions.delete(sessionId);
+        logger.transcription.info("Streaming session cancelled", { sessionId });
+      } finally {
+        this.transcriptionMutex.release();
+      }
+    }
+  }
+
+  /**
+   * Finalize a streaming session - flush provider, format, save to DB
+   * Replaces the removed isFinal=true path of processStreamingChunk
+   */
+  async finalizeSession(options: {
+    sessionId: string;
+    audioFilePath?: string;
+    recordingStartedAt?: number;
+    recordingStoppedAt?: number;
+  }): Promise<string> {
+    const { sessionId, audioFilePath, recordingStartedAt, recordingStoppedAt } =
+      options;
+
+    const session = this.streamingSessions.get(sessionId);
+    if (!session) {
+      logger.transcription.warn("No session found to finalize", { sessionId });
+      return "";
    }

-    session.finalChunkReceivedAt = performance.now();
+    // Update session timestamps
+    session.finalizationStartedAt = performance.now();
    session.recordingStoppedAt = recordingStoppedAt;
+    if (recordingStartedAt && !session.recordingStartedAt) {
+      session.recordingStartedAt = recordingStartedAt;
+    }

-    let completeTranscription = completeTranscriptionTillNow;
+    // Flush provider to get any remaining buffered audio
+    await this.transcriptionMutex.acquire();
+    try {
+      const previousChunk =
+        session.transcriptionResults.length > 0
+          ? session.transcriptionResults[
+              session.transcriptionResults.length - 1
+            ]
+          : undefined;
+      const aggregatedTranscription = session.transcriptionResults
+        .join(" ")
+        .trim();
+
+      const provider = await this.selectProvider();
+      const finalTranscription = await provider.flush({
+        vocabulary: session.context.sharedData.vocabulary,
+        accessibilityContext: session.context.sharedData.accessibilityContext,
+        previousChunk,
+        aggregatedTranscription: aggregatedTranscription || undefined,
+        language: session.context.sharedData.userPreferences?.language,
+      });
+
+      if (finalTranscription.trim()) {
+        session.transcriptionResults.push(finalTranscription);
+        logger.transcription.info("Whisper returned final transcription", {
+          sessionId,
+          transcriptionLength: finalTranscription.length,
+          totalResults: session.transcriptionResults.length,
+        });
+      }
+    } finally {
+      this.transcriptionMutex.release();
+    }
+
+    let completeTranscription = session.transcriptionResults.join(" ").trim();
    let formattingDuration: number | undefined;

    logger.transcription.info("Finalizing streaming session", {
--- a/apps/desktop/src/trpc/routers/recording.ts
+++ b/apps/desktop/src/trpc/routers/recording.ts
@ -15,7 +15,7 @@ export const recordingRouter = createRouter({
    if (!recordingManager) {
      throw new Error("Recording manager not available");
    }
-    return await recordingManager.startRecording("hands-free");
+    return await recordingManager.signalStart();
  }),

  signalStop: procedure.mutation(async ({ ctx }) => {
@ -23,7 +23,7 @@ export const recordingRouter = createRouter({
    if (!recordingManager) {
      throw new Error("Recording manager not available");
    }
-    return await recordingManager.stopRecording();
+    return await recordingManager.signalStop();
  }),

  // Using Observable instead of async generator due to Symbol.asyncDispose conflict
--- a/apps/desktop/src/types/recording.ts
+++ b/apps/desktop/src/types/recording.ts
@ -1,6 +1 @@
-export type RecordingState =
-  | "idle"
-  | "starting"
-  | "recording"
-  | "stopping"
-  | "error";
+export type RecordingState = "idle" | "starting" | "recording" | "stopping";
--- a/apps/desktop/src/utils/streaming-wav-writer.ts
+++ b/apps/desktop/src/utils/streaming-wav-writer.ts
@ -135,6 +135,25 @@ export class StreamingWavWriter {
    }
  }

+  /**
+   * Abort writing and close the file stream without finalizing
+   * Used when recording is cancelled
+   */
+  async abort(): Promise<void> {
+    if (this.isFinalized) return;
+
+    this.isFinalized = true; // Prevent further writes
+
+    // Close the stream
+    await new Promise<void>((resolve) => {
+      this.fileStream.end(() => resolve());
+    });
+
+    logger.transcription.info("WAV writer aborted", {
+      path: this.fileStream.path,
+    });
+  }
+
  /**
   * Get the current size of audio data written
   */