From cc03a88fc4059f71c93f63bd088f7acd4d547b36 Mon Sep 17 00:00:00 2001
From: haritabh-z01 <haritabh.z01+github@gmail.com>
Date: Fri, 16 Jan 2026 17:37:02 +0530
Subject: [PATCH] fix: improve transcription continuity and add vocabulary
 replacements

   - Preserve native segment spacing by joining without extra spaces
   - Use accessibility context (pre-selection text) as Whisper prompt fallback
   - Add pre-formatter to handle Whisper leading space artifact based on context
   - Add Unicode-aware vocabulary replacement as final post-processing step
---
 .../transcription/amical-cloud-provider.ts    |   1 +
 .../transcription/whisper-provider.ts         |  18 ++--
 .../transcription/whisper-worker-fork.ts      |   3 +-
 .../src/services/transcription-service.ts     | 100 ++++++++++++++++--
 4 files changed, 104 insertions(+), 18 deletions(-)
diff --git a/apps/desktop/src/pipeline/providers/transcription/amical-cloud-provider.ts b/apps/desktop/src/pipeline/providers/transcription/amical-cloud-provider.ts
index 3f926aa..4e1087f 100644
--- a/apps/desktop/src/pipeline/providers/transcription/amical-cloud-provider.ts
+++ b/apps/desktop/src/pipeline/providers/transcription/amical-cloud-provider.ts
@@ -314,6 +314,7 @@ export class AmicalCloudProvider implements TranscriptionProvider {
       textLength: result.transcription?.length || 0,
       language: result.language,
       duration: result.duration,
+      transcription: result.transcription,
     });
 
     return result.transcription || "";
diff --git a/apps/desktop/src/pipeline/providers/transcription/whisper-provider.ts b/apps/desktop/src/pipeline/providers/transcription/whisper-provider.ts
index 9bbbe47..28bf10c 100644
--- a/apps/desktop/src/pipeline/providers/transcription/whisper-provider.ts
+++ b/apps/desktop/src/pipeline/providers/transcription/whisper-provider.ts
@@ -156,6 +156,7 @@ export class WhisperProvider implements TranscriptionProvider {
       const initialPrompt = this.generateInitialPrompt(
         vocabulary,
         aggregatedTranscription,
+        context.accessibilityContext,
       );
 
       const text = await this.workerWrapper.exec<string>("transcribeAudio", [
@@ -296,6 +297,7 @@ export class WhisperProvider implements TranscriptionProvider {
   private generateInitialPrompt(
     vocabulary?: string[],
     aggregatedTranscription?: string,
+    accessibilityContext?: TranscribeContext["accessibilityContext"],
   ): string {
     const promptParts: string[] = [];
 
@@ -304,17 +306,19 @@ export class WhisperProvider implements TranscriptionProvider {
       promptParts.push(vocabulary.join(", "));
     }
 
-    // Add last 8 words from aggregated transcription if available
-    if (aggregatedTranscription && aggregatedTranscription.trim().length > 0) {
-      const words = aggregatedTranscription.trim().split(/\s+/);
-      const lastWords = words.slice(-8).join(" ");
-      if (lastWords.length > 0) {
-        promptParts.push(lastWords);
+    if (aggregatedTranscription) {
+      // Pass full transcription - whisper.cpp auto-truncates to last ~224 tokens
+      promptParts.push(aggregatedTranscription);
+    } else {
+      const beforeText =
+        accessibilityContext?.context?.textSelection?.preSelectionText;
+      if (beforeText && beforeText.trim().length > 0) {
+        promptParts.push(beforeText);
       }
     }
 
     // Combine parts with a separator, or return empty string if no context
-    const prompt = promptParts.join(". ");
+    const prompt = promptParts.join(" ");
 
     logger.transcription.debug(`Generated initial prompt: "${prompt}"`);
 
diff --git a/apps/desktop/src/pipeline/providers/transcription/whisper-worker-fork.ts b/apps/desktop/src/pipeline/providers/transcription/whisper-worker-fork.ts
index e1ece3a..e74fd43 100644
--- a/apps/desktop/src/pipeline/providers/transcription/whisper-worker-fork.ts
+++ b/apps/desktop/src/pipeline/providers/transcription/whisper-worker-fork.ts
@@ -95,8 +95,7 @@ const methods = {
 
     return transcription
       .map((segment: { text: string }) => segment.text)
-      .join(" ")
-      .trim();
+      .join("");
   },
 
   async dispose(): Promise<void> {
diff --git a/apps/desktop/src/services/transcription-service.ts b/apps/desktop/src/services/transcription-service.ts
index ba4affc..55e20ac 100644
--- a/apps/desktop/src/services/transcription-service.ts
+++ b/apps/desktop/src/services/transcription-service.ts
@@ -290,9 +290,7 @@ export class TranscriptionService {
               session.transcriptionResults.length - 1
             ]
           : undefined;
-      const aggregatedTranscription = session.transcriptionResults
-        .join(" ")
-        .trim();
+      const aggregatedTranscription = session.transcriptionResults.join("");
 
       // Select the appropriate provider
       const provider = await this.selectProvider();
@@ -331,7 +329,7 @@ export class TranscriptionService {
       this.transcriptionMutex.release();
     }
 
-    return session.transcriptionResults.join(" ").trim();
+    return session.transcriptionResults.join("");
   }
 
   /**
@@ -393,10 +391,7 @@ export class TranscriptionService {
           ? session.transcriptionResults[
               session.transcriptionResults.length - 1
             ]
-          : undefined;
-      const aggregatedTranscription = session.transcriptionResults
-        .join(" ")
-        .trim();
+          : undefined;      const aggregatedTranscription = session.transcriptionResults.join("");
 
       const provider = await this.selectProvider();
       usedCloudProvider = provider.name === "amical-cloud";
@@ -421,7 +416,19 @@ export class TranscriptionService {
       this.transcriptionMutex.release();
     }
 
-    let completeTranscription = session.transcriptionResults.join(" ");
+    let completeTranscription = session.transcriptionResults.join("");
+
+    // Apply simple pre-formatting for local models (handles Whisper leading space artifact)
+    if (!usedCloudProvider) {
+      const preSelectionText =
+        session.context.sharedData.accessibilityContext?.context?.textSelection
+          ?.preSelectionText;
+      completeTranscription = this.preFormatLocalTranscription(
+        completeTranscription,
+        preSelectionText,
+      );
+    }
+
     let formattingDuration: number | undefined;
 
     logger.transcription.info("Finalizing streaming session", {
@@ -527,6 +534,24 @@ export class TranscriptionService {
       }
     }
 
+    // Apply vocabulary replacements (final post-processing step)
+    const replacements = session.context.sharedData.replacements;
+    if (replacements.size > 0) {
+      const beforeReplacements = completeTranscription;
+      completeTranscription = this.applyReplacements(
+        completeTranscription,
+        replacements,
+      );
+      if (beforeReplacements !== completeTranscription) {
+        logger.transcription.info("Applied vocabulary replacements", {
+          sessionId,
+          replacementCount: replacements.size,
+          originalLength: beforeReplacements.length,
+          newLength: completeTranscription.length,
+        });
+      }
+    }
+
     // Save directly to database
     logger.transcription.info("Saving transcription with audio file", {
       sessionId,
@@ -644,6 +669,63 @@ export class TranscriptionService {
     return context;
   }
 
+  /**
+   * Lightweight clean-up for text produced by local transcription models.
+   * Removes leading whitespace when there is no text before the insertion point or when
+   * that text already ends in whitespace; otherwise the transcription is returned as-is.
+   */
+  private preFormatLocalTranscription(
+    transcription: string,
+    preSelectionText: string | null | undefined,
+  ): string {
+    if (!transcription.startsWith(" ")) {
+      // Nothing to normalise: the text does not begin with a space.
+      return transcription;
+    }
+
+    // No preceding text (null, undefined or ""): drop the leading whitespace.
+    if (!preSelectionText) {
+      return transcription.trimStart();
+    }
+
+    // Keep the leading space unless the preceding text already ends with whitespace.
+    const precededByWhitespace = /\s$/.test(preSelectionText);
+    return precededByWhitespace ? transcription.trimStart() : transcription;
+  }
+
+  /**
+   * Apply vocabulary replacements to transcription text (final post-processing step).
+   * Matching is case-insensitive and uses Unicode-aware boundaries: a term only matches
+   * when it is not directly preceded or followed by a letter (\p{L}) or number (\p{N}).
+   * Replacement values are inserted literally; `$&`, `$$` and similar are not expanded.
+   */
+  private applyReplacements(
+    text: string,
+    replacements: Map<string, string>,
+  ): string {
+    if (replacements.size === 0 || !text) {
+      return text;
+    }
+
+    let result = text;
+    for (const [word, replacement] of replacements) {
+      // An empty term would match at every non-word position, so skip it.
+      if (word.length === 0) {
+        continue;
+      }
+      // Escape regex metacharacters so the term is matched literally.
+      const escapedWord = word.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
+      const regex = new RegExp(
+        `(?<![\\p{L}\\p{N}])${escapedWord}(?![\\p{L}\\p{N}])`,
+        "giu",
+      );
+      // Function replacer: a string argument would interpret `$` patterns.
+      result = result.replace(regex, () => replacement);
+    }
+
+    return result;
+  }
+
   private async formatWithProvider(
     provider: FormattingProvider,
     sessionId: string,