From cc03a88fc4059f71c93f63bd088f7acd4d547b36 Mon Sep 17 00:00:00 2001 From: haritabh-z01 Date: Fri, 16 Jan 2026 17:37:02 +0530 Subject: [PATCH] fix: improve transcription continuity and add vocabulary replacements - Preserve native segment spacing by joining without extra spaces - Use accessibility context (pre-selection text) as Whisper prompt fallback - Add pre-formatter to handle Whisper leading space artifact based on context - Add Unicode-aware vocabulary replacement as final post-processing step --- .../transcription/amical-cloud-provider.ts | 1 + .../transcription/whisper-provider.ts | 18 ++-- .../transcription/whisper-worker-fork.ts | 3 +- .../src/services/transcription-service.ts | 100 ++++++++++++++++-- 4 files changed, 104 insertions(+), 18 deletions(-) diff --git a/apps/desktop/src/pipeline/providers/transcription/amical-cloud-provider.ts b/apps/desktop/src/pipeline/providers/transcription/amical-cloud-provider.ts index 3f926aa..4e1087f 100644 --- a/apps/desktop/src/pipeline/providers/transcription/amical-cloud-provider.ts +++ b/apps/desktop/src/pipeline/providers/transcription/amical-cloud-provider.ts @@ -314,6 +314,7 @@ export class AmicalCloudProvider implements TranscriptionProvider { textLength: result.transcription?.length || 0, language: result.language, duration: result.duration, + transcription: result.transcription, }); return result.transcription || ""; diff --git a/apps/desktop/src/pipeline/providers/transcription/whisper-provider.ts b/apps/desktop/src/pipeline/providers/transcription/whisper-provider.ts index 9bbbe47..28bf10c 100644 --- a/apps/desktop/src/pipeline/providers/transcription/whisper-provider.ts +++ b/apps/desktop/src/pipeline/providers/transcription/whisper-provider.ts @@ -156,6 +156,7 @@ export class WhisperProvider implements TranscriptionProvider { const initialPrompt = this.generateInitialPrompt( vocabulary, aggregatedTranscription, + context.accessibilityContext, ); const text = await 
this.workerWrapper.exec("transcribeAudio", [ @@ -296,6 +297,7 @@ export class WhisperProvider implements TranscriptionProvider { private generateInitialPrompt( vocabulary?: string[], aggregatedTranscription?: string, + accessibilityContext?: TranscribeContext["accessibilityContext"], ): string { const promptParts: string[] = []; @@ -304,17 +306,19 @@ export class WhisperProvider implements TranscriptionProvider { promptParts.push(vocabulary.join(", ")); } - // Add last 8 words from aggregated transcription if available - if (aggregatedTranscription && aggregatedTranscription.trim().length > 0) { - const words = aggregatedTranscription.trim().split(/\s+/); - const lastWords = words.slice(-8).join(" "); - if (lastWords.length > 0) { - promptParts.push(lastWords); + if (aggregatedTranscription) { + // Pass full transcription - whisper.cpp auto-truncates to last ~224 tokens + promptParts.push(aggregatedTranscription); + } else { + const beforeText = + accessibilityContext?.context?.textSelection?.preSelectionText; + if (beforeText && beforeText.trim().length > 0) { + promptParts.push(beforeText); } } // Combine parts with a separator, or return empty string if no context - const prompt = promptParts.join(". 
"); + const prompt = promptParts.join(" "); logger.transcription.debug(`Generated initial prompt: "${prompt}"`); diff --git a/apps/desktop/src/pipeline/providers/transcription/whisper-worker-fork.ts b/apps/desktop/src/pipeline/providers/transcription/whisper-worker-fork.ts index e1ece3a..e74fd43 100644 --- a/apps/desktop/src/pipeline/providers/transcription/whisper-worker-fork.ts +++ b/apps/desktop/src/pipeline/providers/transcription/whisper-worker-fork.ts @@ -95,8 +95,7 @@ const methods = { return transcription .map((segment: { text: string }) => segment.text) - .join(" ") - .trim(); + .join(""); }, async dispose(): Promise { diff --git a/apps/desktop/src/services/transcription-service.ts b/apps/desktop/src/services/transcription-service.ts index ba4affc..55e20ac 100644 --- a/apps/desktop/src/services/transcription-service.ts +++ b/apps/desktop/src/services/transcription-service.ts @@ -290,9 +290,7 @@ export class TranscriptionService { session.transcriptionResults.length - 1 ] : undefined; - const aggregatedTranscription = session.transcriptionResults - .join(" ") - .trim(); + const aggregatedTranscription = session.transcriptionResults.join(""); // Select the appropriate provider const provider = await this.selectProvider(); @@ -331,7 +329,7 @@ export class TranscriptionService { this.transcriptionMutex.release(); } - return session.transcriptionResults.join(" ").trim(); + return session.transcriptionResults.join(""); } /** @@ -393,10 +391,7 @@ export class TranscriptionService { ? 
session.transcriptionResults[ session.transcriptionResults.length - 1 ] - : undefined; - const aggregatedTranscription = session.transcriptionResults - .join(" ") - .trim(); + : undefined; const aggregatedTranscription = session.transcriptionResults.join(""); const provider = await this.selectProvider(); usedCloudProvider = provider.name === "amical-cloud"; @@ -421,7 +416,19 @@ export class TranscriptionService { this.transcriptionMutex.release(); } - let completeTranscription = session.transcriptionResults.join(" "); + let completeTranscription = session.transcriptionResults.join(""); + + // Apply simple pre-formatting for local models (handles Whisper leading space artifact) + if (!usedCloudProvider) { + const preSelectionText = + session.context.sharedData.accessibilityContext?.context?.textSelection + ?.preSelectionText; + completeTranscription = this.preFormatLocalTranscription( + completeTranscription, + preSelectionText, + ); + } + let formattingDuration: number | undefined; logger.transcription.info("Finalizing streaming session", { @@ -527,6 +534,24 @@ export class TranscriptionService { } } + // Apply vocabulary replacements (final post-processing step) + const replacements = session.context.sharedData.replacements; + if (replacements.size > 0) { + const beforeReplacements = completeTranscription; + completeTranscription = this.applyReplacements( + completeTranscription, + replacements, + ); + if (beforeReplacements !== completeTranscription) { + logger.transcription.info("Applied vocabulary replacements", { + sessionId, + replacementCount: replacements.size, + originalLength: beforeReplacements.length, + newLength: completeTranscription.length, + }); + } + } + // Save directly to database logger.transcription.info("Saving transcription with audio file", { sessionId, @@ -644,6 +669,63 @@ export class TranscriptionService { return context; } + /** + * Simple pre-formatter for local Transcription models. 
+ * Handles leading space based on insertion context to avoid double spaces or unwanted leading whitespace. + * Runs before the LLM formatter (if configured) to ensure clean input. + */ + private preFormatLocalTranscription( + transcription: string, + preSelectionText: string | null | undefined, + ): string { + if (!transcription.startsWith(" ")) { + return transcription; + } + + // Strip leading space if: + // 1. No previous text (start of document/field) + // 2. Previous text ends with whitespace (avoid double space) + const shouldStripLeadingSpace = + !preSelectionText || + preSelectionText.length === 0 || + /\s$/.test(preSelectionText); + + return shouldStripLeadingSpace ? transcription.trimStart() : transcription; + } + + /** + * Apply vocabulary replacements to transcription text. + * Uses case-insensitive Unicode-aware word boundary matching to replace terms. + * Intended to work across languages and scripts (Latin, Cyrillic, CJK, Arabic, etc.); NOTE(review): in scripts written without spaces between words (e.g. CJK) a term adjacent to other letters may not match - confirm. + * Runs after LLM formatting as the final post-processing step. + */ + private applyReplacements( + text: string, + replacements: Map, + ): string { + if (replacements.size === 0 || !text) { + return text; + } + + let result = text; + + for (const [word, replacement] of replacements) { + // Escape special regex characters in the word + const escapedWord = word.replace(/[.*+?^${}()|[\]\\]/g, "\\$&"); + // Use Unicode-aware word boundaries: + // - \p{L} matches any Unicode letter (Latin, Cyrillic, CJK, Arabic, etc.) + // - \p{N} matches any Unicode number + // - Negative lookbehind/lookahead ensures word is not part of a larger word + const regex = new RegExp( + `(?