fix: improve transcription continuity and add vocabulary replacements

- Preserve native segment spacing by joining without extra spaces
- Use accessibility context (pre-selection text) as Whisper prompt fallback
- Add pre-formatter to handle Whisper leading space artifact based on context
- Add Unicode-aware vocabulary replacement as final post-processing step
2026-01-16 17:37:02 +05:30 · 2026-01-16 17:37:02 +05:30 · cc03a88fc4
commit cc03a88fc4
parent a47c1f56b3
4 changed files with 104 additions and 18 deletions
--- a/apps/desktop/src/pipeline/providers/transcription/amical-cloud-provider.ts
+++ b/apps/desktop/src/pipeline/providers/transcription/amical-cloud-provider.ts
@ -314,6 +314,7 @@ export class AmicalCloudProvider implements TranscriptionProvider {
      textLength: result.transcription?.length || 0,
      language: result.language,
      duration: result.duration,
+      transcription: result.transcription,
    });

    return result.transcription || "";
--- a/apps/desktop/src/pipeline/providers/transcription/whisper-provider.ts
+++ b/apps/desktop/src/pipeline/providers/transcription/whisper-provider.ts
@ -156,6 +156,7 @@ export class WhisperProvider implements TranscriptionProvider {
      const initialPrompt = this.generateInitialPrompt(
        vocabulary,
        aggregatedTranscription,
+        context.accessibilityContext,
      );

      const text = await this.workerWrapper.exec<string>("transcribeAudio", [
@ -296,6 +297,7 @@ export class WhisperProvider implements TranscriptionProvider {
  private generateInitialPrompt(
    vocabulary?: string[],
    aggregatedTranscription?: string,
+    accessibilityContext?: TranscribeContext["accessibilityContext"],
  ): string {
    const promptParts: string[] = [];

@ -304,17 +306,19 @@ export class WhisperProvider implements TranscriptionProvider {
      promptParts.push(vocabulary.join(", "));
    }

-    // Add last 8 words from aggregated transcription if available
-    if (aggregatedTranscription && aggregatedTranscription.trim().length > 0) {
-      const words = aggregatedTranscription.trim().split(/\s+/);
-      const lastWords = words.slice(-8).join(" ");
-      if (lastWords.length > 0) {
-        promptParts.push(lastWords);
+    if (aggregatedTranscription) {
+      // Pass full transcription - whisper.cpp auto-truncates to last ~224 tokens
+      promptParts.push(aggregatedTranscription);
+    } else {
+      const beforeText =
+        accessibilityContext?.context?.textSelection?.preSelectionText;
+      if (beforeText && beforeText.trim().length > 0) {
+        promptParts.push(beforeText);
      }
    }

    // Combine parts with a separator, or return empty string if no context
-    const prompt = promptParts.join(". ");
+    const prompt = promptParts.join(" ");

    logger.transcription.debug(`Generated initial prompt: "${prompt}"`);

--- a/apps/desktop/src/pipeline/providers/transcription/whisper-worker-fork.ts
+++ b/apps/desktop/src/pipeline/providers/transcription/whisper-worker-fork.ts
@ -95,8 +95,7 @@ const methods = {

    return transcription
      .map((segment: { text: string }) => segment.text)
-      .join(" ")
-      .trim();
+      .join("");
  },

  async dispose(): Promise<void> {
--- a/apps/desktop/src/services/transcription-service.ts
+++ b/apps/desktop/src/services/transcription-service.ts
@ -290,9 +290,7 @@ export class TranscriptionService {
              session.transcriptionResults.length - 1
            ]
          : undefined;
-      const aggregatedTranscription = session.transcriptionResults
-        .join(" ")
-        .trim();
+      const aggregatedTranscription = session.transcriptionResults.join("");

      // Select the appropriate provider
      const provider = await this.selectProvider();
@ -331,7 +329,7 @@ export class TranscriptionService {
      this.transcriptionMutex.release();
    }

-    return session.transcriptionResults.join(" ").trim();
+    return session.transcriptionResults.join("");
  }

  /**
@ -393,10 +391,7 @@ export class TranscriptionService {
          ? session.transcriptionResults[
              session.transcriptionResults.length - 1
            ]
-          : undefined;
-      const aggregatedTranscription = session.transcriptionResults
-        .join(" ")
-        .trim();
+          : undefined;      const aggregatedTranscription = session.transcriptionResults.join("");

      const provider = await this.selectProvider();
      usedCloudProvider = provider.name === "amical-cloud";
@ -422,6 +417,18 @@ export class TranscriptionService {
    }

    let completeTranscription = session.transcriptionResults.join("");
+
+    // Apply simple pre-formatting for local models (handles Whisper leading space artifact)
+    if (!usedCloudProvider) {
+      const preSelectionText =
+        session.context.sharedData.accessibilityContext?.context?.textSelection
+          ?.preSelectionText;
+      completeTranscription = this.preFormatLocalTranscription(
+        completeTranscription,
+        preSelectionText,
+      );
+    }
+
    let formattingDuration: number | undefined;

    logger.transcription.info("Finalizing streaming session", {
@ -527,6 +534,24 @@ export class TranscriptionService {
      }
    }

+    // Apply vocabulary replacements (final post-processing step)
+    const replacements = session.context.sharedData.replacements;
+    if (replacements.size > 0) {
+      const beforeReplacements = completeTranscription;
+      completeTranscription = this.applyReplacements(
+        completeTranscription,
+        replacements,
+      );
+      if (beforeReplacements !== completeTranscription) {
+        logger.transcription.info("Applied vocabulary replacements", {
+          sessionId,
+          replacementCount: replacements.size,
+          originalLength: beforeReplacements.length,
+          newLength: completeTranscription.length,
+        });
+      }
+    }
+
    // Save directly to database
    logger.transcription.info("Saving transcription with audio file", {
      sessionId,
@ -644,6 +669,63 @@ export class TranscriptionService {
    return context;
  }

+  /**
+   * Lightweight clean-up applied to the output of local transcription models.
+   * When the text begins with a space, its leading whitespace is removed if
+   * nothing precedes the insertion point or the preceding text already ends
+   * in whitespace; in every other case the text is returned untouched.
+   *
+   * @param transcription - Joined transcription text.
+   * @param preSelectionText - Text before the insertion point, if known.
+   * @returns The transcription, possibly without its leading whitespace.
+   */
+  private preFormatLocalTranscription(
+    transcription: string,
+    preSelectionText: string | null | undefined,
+  ): string {
+    const hasLeadingSpace = transcription.startsWith(" ");
+    // Keep the space only when it separates us from preceding non-space text
+    const followsText = !!preSelectionText && !/\s$/.test(preSelectionText);
+    if (!hasLeadingSpace || followsText) {
+      return transcription;
+    }
+
+    return transcription.trimStart();
+  }
+
+  /**
+   * Apply vocabulary replacements to transcription text.
+   * Each key is matched case-insensitively and only where it is not directly
+   * preceded or followed by a Unicode letter or number, so in scripts written
+   * without word separators (e.g. CJK) a term inside running text won't match.
+   * Replacement values are inserted literally; empty keys are skipped.
+   * Rules are applied in map order, each on the output of the previous one.
+   */
+  private applyReplacements(
+    text: string,
+    replacements: Map<string, string>,
+  ): string {
+    if (replacements.size === 0 || !text) {
+      return text;
+    }
+    let result = text;
+    for (const [word, replacement] of replacements) {
+      // An empty key would match at every gap between non-alphanumerics
+      if (word.length === 0) continue;
+      // Escape special regex characters in the word
+      const escapedWord = word.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
+      // \p{L} is any Unicode letter, \p{N} any Unicode number; the negative
+      // lookbehind/lookahead ensure the word is not part of a larger word
+      const regex = new RegExp(
+        `(?<![\\p{L}\\p{N}])${escapedWord}(?![\\p{L}\\p{N}])`,
+        "giu",
+      );
+      // Replacer function so "$&" or "$$" in the replacement stay literal
+      result = result.replace(regex, () => replacement);
+    }
+    return result;
+  }
+
  private async formatWithProvider(
    provider: FormattingProvider,
    sessionId: string,