fix: improve transcription continuity and add vocabulary replacements
- Preserve native segment spacing by joining without extra spaces - Use accessibility context (pre-selection text) as Whisper prompt fallback - Add pre-formatter to handle Whisper leading space artifact based on context - Add Unicode-aware vocabulary replacement as final post-processing step
This commit is contained in:
parent
a47c1f56b3
commit
cc03a88fc4
4 changed files with 104 additions and 18 deletions
|
|
@ -314,6 +314,7 @@ export class AmicalCloudProvider implements TranscriptionProvider {
|
|||
textLength: result.transcription?.length || 0,
|
||||
language: result.language,
|
||||
duration: result.duration,
|
||||
transcription: result.transcription,
|
||||
});
|
||||
|
||||
return result.transcription || "";
|
||||
|
|
|
|||
|
|
@ -156,6 +156,7 @@ export class WhisperProvider implements TranscriptionProvider {
|
|||
const initialPrompt = this.generateInitialPrompt(
|
||||
vocabulary,
|
||||
aggregatedTranscription,
|
||||
context.accessibilityContext,
|
||||
);
|
||||
|
||||
const text = await this.workerWrapper.exec<string>("transcribeAudio", [
|
||||
|
|
@ -296,6 +297,7 @@ export class WhisperProvider implements TranscriptionProvider {
|
|||
private generateInitialPrompt(
|
||||
vocabulary?: string[],
|
||||
aggregatedTranscription?: string,
|
||||
accessibilityContext?: TranscribeContext["accessibilityContext"],
|
||||
): string {
|
||||
const promptParts: string[] = [];
|
||||
|
||||
|
|
@ -304,17 +306,19 @@ export class WhisperProvider implements TranscriptionProvider {
|
|||
promptParts.push(vocabulary.join(", "));
|
||||
}
|
||||
|
||||
// Add last 8 words from aggregated transcription if available
|
||||
if (aggregatedTranscription && aggregatedTranscription.trim().length > 0) {
|
||||
const words = aggregatedTranscription.trim().split(/\s+/);
|
||||
const lastWords = words.slice(-8).join(" ");
|
||||
if (lastWords.length > 0) {
|
||||
promptParts.push(lastWords);
|
||||
if (aggregatedTranscription) {
|
||||
// Pass full transcription - whisper.cpp auto-truncates to last ~224 tokens
|
||||
promptParts.push(aggregatedTranscription);
|
||||
} else {
|
||||
const beforeText =
|
||||
accessibilityContext?.context?.textSelection?.preSelectionText;
|
||||
if (beforeText && beforeText.trim().length > 0) {
|
||||
promptParts.push(beforeText);
|
||||
}
|
||||
}
|
||||
|
||||
// Combine parts with a separator, or return empty string if no context
|
||||
const prompt = promptParts.join(". ");
|
||||
const prompt = promptParts.join(" ");
|
||||
|
||||
logger.transcription.debug(`Generated initial prompt: "${prompt}"`);
|
||||
|
||||
|
|
|
|||
|
|
@ -95,8 +95,7 @@ const methods = {
|
|||
|
||||
return transcription
|
||||
.map((segment: { text: string }) => segment.text)
|
||||
.join(" ")
|
||||
.trim();
|
||||
.join("");
|
||||
},
|
||||
|
||||
async dispose(): Promise<void> {
|
||||
|
|
|
|||
|
|
@ -290,9 +290,7 @@ export class TranscriptionService {
|
|||
session.transcriptionResults.length - 1
|
||||
]
|
||||
: undefined;
|
||||
const aggregatedTranscription = session.transcriptionResults
|
||||
.join(" ")
|
||||
.trim();
|
||||
const aggregatedTranscription = session.transcriptionResults.join("");
|
||||
|
||||
// Select the appropriate provider
|
||||
const provider = await this.selectProvider();
|
||||
|
|
@ -331,7 +329,7 @@ export class TranscriptionService {
|
|||
this.transcriptionMutex.release();
|
||||
}
|
||||
|
||||
return session.transcriptionResults.join(" ").trim();
|
||||
return session.transcriptionResults.join("");
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -393,10 +391,7 @@ export class TranscriptionService {
|
|||
? session.transcriptionResults[
|
||||
session.transcriptionResults.length - 1
|
||||
]
|
||||
: undefined;
|
||||
const aggregatedTranscription = session.transcriptionResults
|
||||
.join(" ")
|
||||
.trim();
|
||||
: undefined; const aggregatedTranscription = session.transcriptionResults.join("");
|
||||
|
||||
const provider = await this.selectProvider();
|
||||
usedCloudProvider = provider.name === "amical-cloud";
|
||||
|
|
@ -422,6 +417,18 @@ export class TranscriptionService {
|
|||
}
|
||||
|
||||
let completeTranscription = session.transcriptionResults.join("");
|
||||
|
||||
// Apply simple pre-formatting for local models (handles Whisper leading space artifact)
|
||||
if (!usedCloudProvider) {
|
||||
const preSelectionText =
|
||||
session.context.sharedData.accessibilityContext?.context?.textSelection
|
||||
?.preSelectionText;
|
||||
completeTranscription = this.preFormatLocalTranscription(
|
||||
completeTranscription,
|
||||
preSelectionText,
|
||||
);
|
||||
}
|
||||
|
||||
let formattingDuration: number | undefined;
|
||||
|
||||
logger.transcription.info("Finalizing streaming session", {
|
||||
|
|
@ -527,6 +534,24 @@ export class TranscriptionService {
|
|||
}
|
||||
}
|
||||
|
||||
// Apply vocabulary replacements (final post-processing step)
|
||||
const replacements = session.context.sharedData.replacements;
|
||||
if (replacements.size > 0) {
|
||||
const beforeReplacements = completeTranscription;
|
||||
completeTranscription = this.applyReplacements(
|
||||
completeTranscription,
|
||||
replacements,
|
||||
);
|
||||
if (beforeReplacements !== completeTranscription) {
|
||||
logger.transcription.info("Applied vocabulary replacements", {
|
||||
sessionId,
|
||||
replacementCount: replacements.size,
|
||||
originalLength: beforeReplacements.length,
|
||||
newLength: completeTranscription.length,
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
// Save directly to database
|
||||
logger.transcription.info("Saving transcription with audio file", {
|
||||
sessionId,
|
||||
|
|
@ -644,6 +669,63 @@ export class TranscriptionService {
|
|||
return context;
|
||||
}
|
||||
|
||||
/**
 * Lightweight clean-up pass for text produced by local transcription models.
 *
 * Only a transcription that begins with a space character is ever modified.
 * In that case the leading whitespace is removed when the text preceding the
 * insertion point is missing/empty, or already ends in whitespace, so the
 * inserted text neither starts a field with a blank nor creates a double space.
 * Intended to run ahead of the optional LLM formatter.
 *
 * @param transcription - Joined transcription text to inspect.
 * @param preSelectionText - Text located before the insertion point, if known.
 * @returns The transcription, with leading whitespace trimmed when appropriate.
 */
private preFormatLocalTranscription(
  transcription: string,
  preSelectionText: string | null | undefined,
): string {
  const hasLeadingSpace = transcription.startsWith(" ");
  if (!hasLeadingSpace) {
    return transcription;
  }

  // Preceding text exists: keep the leading space as a word separator unless
  // that text already finishes with whitespace.
  if (preSelectionText) {
    const precededByWhitespace = /\s$/.test(preSelectionText);
    if (!precededByWhitespace) {
      return transcription;
    }
  }

  // Nothing before the insertion point, or it already ends in whitespace.
  return transcription.trimStart();
}
|
||||
|
||||
/**
 * Apply vocabulary replacements to transcription text.
 *
 * Each map entry is `term -> replacement`. A term is matched
 * case-insensitively and only where it is not directly preceded or followed
 * by a Unicode letter (`\p{L}`) or number (`\p{N}`), so it is never replaced
 * inside a larger word. Note that in text written without separators between
 * words (e.g. CJK), a term surrounded by other letters will therefore not
 * match.
 *
 * Entries are applied one after another in map iteration order, so a later
 * entry can match text produced by an earlier one.
 * Runs after LLM formatting as the final post-processing step.
 *
 * @param text - Transcription text to post-process.
 * @param replacements - Map of term to the text that should replace it.
 * @returns The text with all matching terms replaced; `text` itself when it
 *   is empty or there are no replacements.
 */
private applyReplacements(
  text: string,
  replacements: Map<string, string>,
): string {
  if (replacements.size === 0 || !text) {
    return text;
  }

  let result = text;

  for (const [word, replacement] of replacements) {
    // An empty term would compile to a pattern that matches the empty string
    // at every position not adjacent to a letter/digit, injecting the
    // replacement throughout the text. Skip it.
    if (word.length === 0) {
      continue;
    }

    // Escape special regex characters in the term so it is matched literally.
    const escapedWord = word.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");

    // Unicode-aware "word boundaries": negative lookbehind/lookahead ensure
    // the term is not part of a larger run of letters or numbers.
    const regex = new RegExp(
      `(?<![\\p{L}\\p{N}])${escapedWord}(?![\\p{L}\\p{N}])`,
      "giu",
    );

    // Use a replacer function so the replacement is inserted verbatim; a
    // plain string would have `$&`, `$1`, `$$`, etc. expanded by replace().
    result = result.replace(regex, () => replacement);
  }

  return result;
}
|
||||
|
||||
private async formatWithProvider(
|
||||
provider: FormattingProvider,
|
||||
sessionId: string,
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue