fix: improve transcription continuity and add vocabulary replacements

- Preserve native segment spacing by joining without extra spaces
- Use accessibility context (pre-selection text) as Whisper prompt fallback
- Add pre-formatter to handle Whisper's leading-space artifact based on context
- Add Unicode-aware vocabulary replacement as final post-processing step
This commit is contained in:
haritabh-z01 2026-01-16 17:37:02 +05:30
parent a47c1f56b3
commit cc03a88fc4
4 changed files with 104 additions and 18 deletions

View file

@ -314,6 +314,7 @@ export class AmicalCloudProvider implements TranscriptionProvider {
textLength: result.transcription?.length || 0,
language: result.language,
duration: result.duration,
transcription: result.transcription,
});
return result.transcription || "";

View file

@ -156,6 +156,7 @@ export class WhisperProvider implements TranscriptionProvider {
const initialPrompt = this.generateInitialPrompt(
vocabulary,
aggregatedTranscription,
context.accessibilityContext,
);
const text = await this.workerWrapper.exec<string>("transcribeAudio", [
@ -296,6 +297,7 @@ export class WhisperProvider implements TranscriptionProvider {
private generateInitialPrompt(
vocabulary?: string[],
aggregatedTranscription?: string,
accessibilityContext?: TranscribeContext["accessibilityContext"],
): string {
const promptParts: string[] = [];
@ -304,17 +306,19 @@ export class WhisperProvider implements TranscriptionProvider {
promptParts.push(vocabulary.join(", "));
}
// Add last 8 words from aggregated transcription if available
if (aggregatedTranscription && aggregatedTranscription.trim().length > 0) {
const words = aggregatedTranscription.trim().split(/\s+/);
const lastWords = words.slice(-8).join(" ");
if (lastWords.length > 0) {
promptParts.push(lastWords);
if (aggregatedTranscription) {
// Pass full transcription - whisper.cpp auto-truncates to last ~224 tokens
promptParts.push(aggregatedTranscription);
} else {
const beforeText =
accessibilityContext?.context?.textSelection?.preSelectionText;
if (beforeText && beforeText.trim().length > 0) {
promptParts.push(beforeText);
}
}
// Combine parts with a separator, or return empty string if no context
const prompt = promptParts.join(". ");
const prompt = promptParts.join(" ");
logger.transcription.debug(`Generated initial prompt: "${prompt}"`);

View file

@ -95,8 +95,7 @@ const methods = {
return transcription
.map((segment: { text: string }) => segment.text)
.join(" ")
.trim();
.join("");
},
async dispose(): Promise<void> {

View file

@ -290,9 +290,7 @@ export class TranscriptionService {
session.transcriptionResults.length - 1
]
: undefined;
const aggregatedTranscription = session.transcriptionResults
.join(" ")
.trim();
const aggregatedTranscription = session.transcriptionResults.join("");
// Select the appropriate provider
const provider = await this.selectProvider();
@ -331,7 +329,7 @@ export class TranscriptionService {
this.transcriptionMutex.release();
}
return session.transcriptionResults.join(" ").trim();
return session.transcriptionResults.join("");
}
/**
@ -393,10 +391,7 @@ export class TranscriptionService {
? session.transcriptionResults[
session.transcriptionResults.length - 1
]
: undefined;
const aggregatedTranscription = session.transcriptionResults
.join(" ")
.trim();
: undefined; const aggregatedTranscription = session.transcriptionResults.join("");
const provider = await this.selectProvider();
usedCloudProvider = provider.name === "amical-cloud";
@ -422,6 +417,18 @@ export class TranscriptionService {
}
let completeTranscription = session.transcriptionResults.join("");
// Apply simple pre-formatting for local models (handles Whisper leading space artifact)
if (!usedCloudProvider) {
const preSelectionText =
session.context.sharedData.accessibilityContext?.context?.textSelection
?.preSelectionText;
completeTranscription = this.preFormatLocalTranscription(
completeTranscription,
preSelectionText,
);
}
let formattingDuration: number | undefined;
logger.transcription.info("Finalizing streaming session", {
@ -527,6 +534,24 @@ export class TranscriptionService {
}
}
// Apply vocabulary replacements (final post-processing step)
const replacements = session.context.sharedData.replacements;
if (replacements.size > 0) {
const beforeReplacements = completeTranscription;
completeTranscription = this.applyReplacements(
completeTranscription,
replacements,
);
if (beforeReplacements !== completeTranscription) {
logger.transcription.info("Applied vocabulary replacements", {
sessionId,
replacementCount: replacements.size,
originalLength: beforeReplacements.length,
newLength: completeTranscription.length,
});
}
}
// Save directly to database
logger.transcription.info("Saving transcription with audio file", {
sessionId,
@ -644,6 +669,63 @@ export class TranscriptionService {
return context;
}
/**
 * Lightweight clean-up for text produced by local transcription models.
 *
 * Such output can begin with a space. Whether that space should be kept
 * depends on what already sits in front of the insertion point:
 * - nothing there (null, undefined or empty)  -> drop the leading whitespace
 * - existing text already ends in whitespace  -> drop it (no double space)
 * - existing text ends in a non-space char    -> keep it as the separator
 *
 * @param transcription - Raw transcription text.
 * @param preSelectionText - Text located before the insertion point, if known.
 * @returns The transcription, with leading whitespace removed when it is not
 *   needed as a separator.
 */
private preFormatLocalTranscription(
  transcription: string,
  preSelectionText: string | null | undefined,
): string {
  // Preceding text that does not end in whitespace still needs whatever
  // separator the model produced, so the transcription is left untouched.
  if (preSelectionText && !/\s$/.test(preSelectionText)) {
    return transcription;
  }

  // Only output that actually starts with a space is rewritten.
  return transcription.startsWith(" ")
    ? transcription.trimStart()
    : transcription;
}
/**
 * Apply vocabulary replacements to transcription text.
 *
 * Every entry of `replacements` maps a term to the literal text that should
 * be substituted for it. A term is matched case-insensitively and only when
 * it is not embedded in a longer word: the characters directly before and
 * after the match must not be a Unicode letter, number or combining mark.
 * Entries are applied one after another in the map's iteration order, so a
 * later entry also sees text produced by earlier ones.
 *
 * NOTE(review): in scripts written without separators between words
 * (e.g. CJK) a term surrounded by other letters fails the boundary check
 * and is left untouched — confirm this is the intended behavior.
 *
 * @param text - Transcription to post-process; returned as-is when empty.
 * @param replacements - Term -> replacement text.
 * @returns The text with every matching term replaced.
 */
private applyReplacements(
  text: string,
  replacements: Map<string, string>,
): string {
  if (replacements.size === 0 || !text) {
    return text;
  }

  let result = text;
  for (const [word, replacement] of replacements) {
    // An empty term would compile to a zero-width pattern that matches at
    // many positions and would inject the replacement all over the text.
    if (word.length === 0) {
      continue;
    }

    // Escape special regex characters so the term is matched literally.
    const escapedWord = word.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");

    // Unicode-aware word boundaries:
    // - \p{L} any letter, \p{N} any number
    // - \p{M} combining marks (vowel signs, decomposed accents), which
    //   belong to the preceding letter and therefore continue a word
    // Negative lookbehind/lookahead reject matches inside a larger word.
    const regex = new RegExp(
      `(?<![\\p{L}\\p{N}\\p{M}])${escapedWord}(?![\\p{L}\\p{N}\\p{M}])`,
      "giu",
    );

    // A replacer function inserts the replacement verbatim; passing the
    // string directly would expand "$&", "$1", "$$", ... inside it.
    result = result.replace(regex, () => replacement);
  }
  return result;
}
private async formatWithProvider(
provider: FormattingProvider,
sessionId: string,