diff --git a/apps/desktop/src/constants/models.ts b/apps/desktop/src/constants/models.ts index e8e27ff..6c04e0e 100644 --- a/apps/desktop/src/constants/models.ts +++ b/apps/desktop/src/constants/models.ts @@ -91,4 +91,17 @@ export const AVAILABLE_MODELS: Model[] = [ filename: "ggml-large-v3.bin", checksum: "ad82bf6a9043ceed055076d0fd39f5f186ff8062", }, + { + id: "whisper-large-v3-turbo", + name: "Whisper Large v3 Turbo", + type: "whisper", + size: 1.5 * 1024 * 1024 * 1024, // ~1.5 GB + sizeFormatted: "~1.5 GB", + description: + "Optimized Large v3 variant with only 4 decoder layers, offering significantly faster transcription with accuracy comparable to Large v2/v3.", + downloadUrl: + "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-large-v3-turbo.bin", + filename: "ggml-large-v3-turbo.bin", + checksum: "4af2b29d7ec73d781377bfd1758ca957a807e941", + }, ]; diff --git a/apps/desktop/src/main/managers/recording-manager.ts b/apps/desktop/src/main/managers/recording-manager.ts index 69a3900..bc5a603 100644 --- a/apps/desktop/src/main/managers/recording-manager.ts +++ b/apps/desktop/src/main/managers/recording-manager.ts @@ -8,6 +8,7 @@ import type { ShortcutManager } from "../services/shortcut-manager"; import { StreamingWavWriter } from "../../utils/streaming-wav-writer"; import * as fs from "node:fs"; import * as path from "node:path"; +import { appContextStore } from "@/stores/app-context"; export type RecordingMode = "idle" | "ptt" | "hands-free"; @@ -175,6 +176,9 @@ export class RecordingManager extends EventEmitter { const timestamp = new Date().toISOString().replace(/[:.]/g, "-"); this.currentSessionId = `session-${timestamp}`; + // Get accessibility context from global store + appContextStore.refreshAccessibilityData(); + // Create audio file and WAV writer const audioFilePath = await this.createAudioFile(this.currentSessionId); this.currentAudioRecording = { diff --git a/apps/desktop/src/pipeline/providers/formatting/formatter-prompt.ts b/apps/desktop/src/pipeline/providers/formatting/formatter-prompt.ts index e3198ac..d3d722d 100644 --- a/apps/desktop/src/pipeline/providers/formatting/formatter-prompt.ts +++ b/apps/desktop/src/pipeline/providers/formatting/formatter-prompt.ts @@ -1,98 +1,187 @@ import { FormatParams } from "../../core/pipeline-types"; -import { GetAccessibilityContextResult, ApplicationInfo } from "@amical/types"; +import { GetAccessibilityContextResult } from "@amical/types"; + +// Base system prompt +const SYSTEM_PROMPT = `You are a professional text formatter. Your task is to format transcribed text to be clear, readable, and properly structured.`; + +// Base instructions that apply to all formatting +const BASE_INSTRUCTIONS = [ + "Fix any transcription errors based on context and custom vocabulary", + "Add proper punctuation and capitalization", + "Format paragraphs appropriately with sufficient line breaks", + "Maintain the original meaning and tone", + "Use the custom vocabulary to correct domain-specific terms", + "Remove unnecessary filler words (um, uh, etc.) but keep natural speech patterns", + "If the text is empty, return ", + "Return ONLY the formatted text enclosed in tags", + "Do not include any commentary, explanations, or text outside the XML tags", +]; + +// Application type specific rules +const APPLICATION_TYPE_RULES: Record = { + email: [ + "Format with proper email structure (greeting, body paragraphs, closing)", + "Preserve email metadata if present (From, To, Subject, Date)", + "Ensure proper paragraph breaks between different topics", + "Maintain professional tone and formatting", + "Format any quoted or forwarded content clearly", + "Preserve email signatures and contact information", + ], + chat: [ + "Preserve conversational tone and informal language", + "Keep messages concise and separate", + "Maintain emoji and emoticons if present", + "Format timestamps and usernames clearly if included", + "Preserve thread context and replies", + ], + notes: [ + "Organize content with clear headings and sections", + "Use bullet points or numbered lists where appropriate", + "Maintain hierarchical structure of ideas", + "Format action items and tasks clearly", + "Preserve any existing formatting hints", + ], + general: [ + "Apply standard formatting for general text", + "Create logical paragraph breaks based on content flow", + "Maintain consistent formatting throughout", + "Preserve the original tone and style", + ], +}; + +// Map bundle identifiers to application types +const BUNDLE_TO_TYPE: Record = { + "com.apple.mail": "email", + "com.microsoft.Outlook": "email", + "com.readdle.smartemail": "email", + "com.google.Gmail": "email", + "com.tinyspeck.slackmacgap": "chat", + "com.microsoft.teams": "chat", + "com.facebook.archon": "chat", // Messenger + "com.discord.Discord": "chat", + "com.telegram.desktop": "chat", + "com.apple.Notes": "notes", + "com.microsoft.onenote.mac": "notes", + "com.evernote.Evernote": "notes", + "notion.id": "notes", + "com.agiletortoise.Drafts-OSX": "notes", +}; + +// Browser bundle identifiers +const BROWSER_BUNDLE_IDS = [ + "com.apple.Safari", + "com.google.Chrome", + "com.google.Chrome.canary", + "com.microsoft.edgemac", + "org.mozilla.firefox", + "com.brave.Browser", + "com.operasoftware.Opera", + "com.vivaldi.Vivaldi", +]; + +// URL patterns for web applications +const URL_PATTERNS: Record = { + email: [ + /mail\.google\.com/, + /outlook\.live\.com/, + /outlook\.office\.com/, + /mail\.yahoo\.com/, + /mail\.proton\.me/, + /webmail\./, + /roundcube/, + /fastmail\.com/, + ], + chat: [ + /web\.whatsapp\.com/, + /discord\.com\/channels/, + /teams\.microsoft\.com/, + /slack\.com/, + /web\.telegram\.org/, + /messenger\.com/, + /chat\.openai\.com/, + /claude\.ai/, + ], + notes: [ + /notion\.so/, + /docs\.google\.com/, + /onenote\.com/, + /evernote\.com/, + /roamresearch\.com/, + /obsidian\.md/, + /workflowy\.com/, + /coda\.io/, + ], +}; export function constructFormatterPrompt(context: FormatParams["context"]): { systemPrompt: string; } { - const { accessibilityContext } = context; - - // Build enhanced system prompt with context information - let systemPrompt = `You are a professional text formatter. Your task is to clean up and improve the formatting of transcribed text while preserving the original meaning and content. - -Please: -1. Fix obvious transcription errors and typos -2. Add proper punctuation where missing -3. Organize the text into proper paragraphs, with sufficient line breaks, etc. -4. Capitalize proper nouns and sentence beginnings -5. Remove unnecessary filler words (um, uh, etc.) but keep natural speech patterns -6. Maintain the speaker's original tone and style -7. If the text is empty, return an empty string -8. For formatting of emails make sure to use the correct email format`; - - // Build context information - const contextXml = buildContextXml(accessibilityContext); - - if (contextXml) { - systemPrompt += `\n\n${contextXml}`; - systemPrompt += `\n\nUse this context to better understand the environment where the text will be used and adjust formatting accordingly.`; + const { accessibilityContext, vocabulary } = context; + + // Detect application type + const applicationType = detectApplicationType(accessibilityContext); + + // Build instructions array + const instructions = [ + ...BASE_INSTRUCTIONS, + ...(APPLICATION_TYPE_RULES[applicationType] || []) + ]; + + // Build prompt parts + const parts = [SYSTEM_PROMPT]; + + // Add vocabulary context if available + if (vocabulary && vocabulary.size > 0) { + const vocabTerms = Array.from(vocabulary.keys()).join(", "); + parts.push(`\nCustom vocabulary to use for corrections: ${vocabTerms}`); } - - systemPrompt += `\n\nReturn only the formatted text without any explanations or additional commentary.`; - - return { systemPrompt }; + + // Add numbered instructions + parts.push("\nInstructions:"); + instructions.forEach((instruction, index) => { + parts.push(`${index + 1}. ${instruction}`); + }); + + return { systemPrompt: parts.join("\n") }; } -function buildContextXml( +function detectApplicationType( accessibilityContext: GetAccessibilityContextResult | null | undefined, -): string | null { - if (!accessibilityContext?.context) return null; +): string { + if (!accessibilityContext?.context?.application?.bundleIdentifier) { + return "general"; + } - const contextParts: string[] = [""]; - - // Add application info - const appXml = buildApplicationXml(accessibilityContext.context.application); - if (appXml) contextParts.push(appXml); - - // Add URL info - const urlXml = buildUrlXml( - accessibilityContext.context.windowInfo?.url || undefined, + const bundleId = accessibilityContext.context.application.bundleIdentifier; + + // Check if it's a browser + const isBrowser = BROWSER_BUNDLE_IDS.some(browserId => + bundleId.includes(browserId) || browserId.includes(bundleId) ); - if (urlXml) contextParts.push(urlXml); - - contextParts.push(""); - - // Only return context if we have actual content - return contextParts.length > 2 ? contextParts.join("\n") : null; -} - -function buildApplicationXml(application: ApplicationInfo): string | null { - if (!application?.name) return null; - - const appParts = [" ", ` ${application.name}`]; - - if (application.bundleIdentifier) { - appParts.push(` ${application.bundleIdentifier}`); - } - - appParts.push(" "); - return appParts.join("\n"); -} - -function buildUrlXml(url: string | undefined): string | null { - if (!url) return null; - - const domain = extractDomain(url); - if (!domain) return null; - - return [" ", ` ${domain}`, " "].join("\n"); -} - -function extractDomain(url: string): string | null { - try { - // Try standard URL parsing first - const parsedUrl = new URL(url); - return parsedUrl.hostname; - } catch { - // Handle URLs without protocol or malformed URLs - // Remove any leading slashes - const cleanUrl = url.replace(/^\/+/, ""); - - // Extract domain from patterns like "domain.com/path" or just "domain.com" - const match = cleanUrl.match(/^([^\/\s?#]+)/); - if (match && match[1].includes(".")) { - return match[1]; + + if (isBrowser && accessibilityContext.context?.windowInfo?.url) { + // Try to detect type from URL + const url = accessibilityContext.context.windowInfo.url.toLowerCase(); + + for (const [type, patterns] of Object.entries(URL_PATTERNS)) { + if (patterns.some(pattern => pattern.test(url))) { + return type; + } } - - return null; } + + // Check for exact match in native apps + if (BUNDLE_TO_TYPE[bundleId]) { + return BUNDLE_TO_TYPE[bundleId]; + } + + // Check for partial matches + for (const [key, type] of Object.entries(BUNDLE_TO_TYPE)) { + if (bundleId.includes(key) || key.includes(bundleId)) { + return type; + } + } + + // Default to general + return "general"; } diff --git a/apps/desktop/src/pipeline/providers/formatting/openrouter-formatter.ts b/apps/desktop/src/pipeline/providers/formatting/openrouter-formatter.ts index e5da141..010f030 100644 --- a/apps/desktop/src/pipeline/providers/formatting/openrouter-formatter.ts +++ b/apps/desktop/src/pipeline/providers/formatting/openrouter-formatter.ts @@ -31,7 +31,7 @@ export class OpenRouterProvider implements FormattingProvider { // Build user prompt with context const userPrompt = text; - const { text: formattedText } = await generateText({ + const { text: aiResponse } = await generateText({ model: this.provider(this.model), messages: [ { @@ -47,9 +47,14 @@ export class OpenRouterProvider implements FormattingProvider { maxTokens: 2000, }); + // Extract formatted text from XML tags + const match = aiResponse.match(/([\s\S]*?)<\/formatted_text>/); + const formattedText = match ? match[1].trim() : aiResponse.trim(); + logger.pipeline.debug("Formatting completed", { original: text, formatted: formattedText, + hadXmlTags: !!match, }); return formattedText; diff --git a/apps/desktop/src/services/model-manager.ts b/apps/desktop/src/services/model-manager.ts index f531962..1e6091e 100644 --- a/apps/desktop/src/services/model-manager.ts +++ b/apps/desktop/src/services/model-manager.ts @@ -423,6 +423,7 @@ class ModelManagerService extends EventEmitter { // Otherwise, find the best available model (prioritize by quality) const preferredOrder = [ + "whisper-large-v3-turbo", "whisper-large-v1", "whisper-medium", "whisper-small", diff --git a/apps/desktop/src/services/transcription-service.ts b/apps/desktop/src/services/transcription-service.ts index b1d8284..11f059a 100644 --- a/apps/desktop/src/services/transcription-service.ts +++ b/apps/desktop/src/services/transcription-service.ts @@ -261,7 +261,7 @@ export class TranscriptionService { chunkCount: session.transcriptionResults.length, }); - if (this.formatterEnabled && this.openRouterProvider) { + if (this.formatterEnabled && this.openRouterProvider && completeTranscription.trim().length) { try { const style = session.context.sharedData.userPreferences?.formattingStyle; @@ -284,6 +284,8 @@ export class TranscriptionService { logger.transcription.info("Text formatted successfully", { sessionId, + originalTranscription: completeTranscription, + formattedTranscription: formattedText, originalLength: completeTranscription.length, formattedLength: formattedText.length, }); diff --git a/packages/native-helpers/swift-helper/Sources/SwiftHelper/AccessibilityContextService.swift b/packages/native-helpers/swift-helper/Sources/SwiftHelper/AccessibilityContextService.swift index 7faddc6..62e5711 100644 --- a/packages/native-helpers/swift-helper/Sources/SwiftHelper/AccessibilityContextService.swift +++ b/packages/native-helpers/swift-helper/Sources/SwiftHelper/AccessibilityContextService.swift @@ -84,7 +84,7 @@ class AccessibilityContextService { // Enable manual accessibility for specific apps if let bundleId: String = getBundleIdentifier(pid: pid), appsManuallyEnableAx.contains(bundleId) { - FileHandle.standardError.write("🔧 Enabling manual accessibility for \(bundleId)\n".data(using: .utf8)!) + // FileHandle.standardError.write("🔧 Enabling manual accessibility for \(bundleId)\n".data(using: .utf8)!) AXUIElementSetAttributeValue(application, "AXManualAccessibility" as CFString, kCFBooleanTrue) AXUIElementSetAttributeValue(application, "AXEnhancedUserInterface" as CFString, kCFBooleanTrue) } @@ -94,12 +94,12 @@ class AccessibilityContextService { // Fallback to focused window if focused element fails if error != .success { - FileHandle.standardError.write("⚠️ Failed to get focused element, trying focused window...\n".data(using: .utf8)!) + // FileHandle.standardError.write("⚠️ Failed to get focused element, trying focused window...\n".data(using: .utf8)!) error = AXUIElementCopyAttributeValue(application, kAXFocusedWindowAttribute as CFString, &focusedElement) } guard error == .success, let element = focusedElement else { - FileHandle.standardError.write("❌ Failed to get focused element or window. Error: \(error.rawValue)\n".data(using: .utf8)!) + // FileHandle.standardError.write("❌ Failed to get focused element or window. Error: \(error.rawValue)\n".data(using: .utf8)!) return nil } @@ -238,13 +238,13 @@ class AccessibilityContextService { var urlSource = "none" // Debug: Print all window attributes - FileHandle.standardError.write("🔍 Window attributes:\n".data(using: .utf8)!) + // FileHandle.standardError.write("🔍 Window attributes:\n".data(using: .utf8)!) let attributes = getAttributeNames(element: windowElement) for attribute in attributes { if let value = getAttributeValue(element: windowElement, attribute: attribute) { - FileHandle.standardError.write(" \(attribute): \(value)\n".data(using: .utf8)!) + // FileHandle.standardError.write(" \(attribute): \(value)\n".data(using: .utf8)!) } else { - FileHandle.standardError.write(" \(attribute): \n".data(using: .utf8)!) + // FileHandle.standardError.write(" \(attribute): \n".data(using: .utf8)!) } } @@ -258,15 +258,15 @@ class AccessibilityContextService { let isFirefox = bundleId == "org.mozilla.firefox" - FileHandle.standardError.write("🔍 Browser type - Chromium: \(isChromiumBrowser), Firefox: \(isFirefox), Bundle: \(bundleId ?? "unknown")\n".data(using: .utf8)!) + // FileHandle.standardError.write("🔍 Browser type - Chromium: \(isChromiumBrowser), Firefox: \(isFirefox), Bundle: \(bundleId ?? "unknown")\n".data(using: .utf8)!) // For Chromium browsers and Firefox: Prioritize AXWebArea (live URL) if isChromiumBrowser || isFirefox { - FileHandle.standardError.write("🔍 Using AXWebArea priority for Chromium/Firefox browser\n".data(using: .utf8)!) + // FileHandle.standardError.write("🔍 Using AXWebArea priority for Chromium/Firefox browser\n".data(using: .utf8)!) foundURL = findURLInChildren(element: windowElement, depth: 0, maxDepth: 30) if foundURL != nil { urlSource = "tree_walking_priority" - FileHandle.standardError.write("🔍 Found URL from AXWebArea (priority): \(foundURL!)\n".data(using: .utf8)!) + // FileHandle.standardError.write("🔍 Found URL from AXWebArea (priority): \(foundURL!)\n".data(using: .utf8)!) return foundURL } } @@ -279,7 +279,7 @@ class AccessibilityContextService { if docErr == .success, let urlString = urlRef as? String, !urlString.isEmpty { foundURL = urlString urlSource = "window_document" - FileHandle.standardError.write("🔍 Found URL from window document: \(urlString)\n".data(using: .utf8)!) + // FileHandle.standardError.write("🔍 Found URL from window document: \(urlString)\n".data(using: .utf8)!) // For Safari and other WebKit browsers, this is reliable, return immediately if !isChromiumBrowser && !isFirefox { @@ -295,7 +295,7 @@ class AccessibilityContextService { if foundURL == nil { foundURL = urlString urlSource = "window_url" - FileHandle.standardError.write("🔍 Found URL from window URL attribute: \(urlString)\n".data(using: .utf8)!) + // FileHandle.standardError.write("🔍 Found URL from window URL attribute: \(urlString)\n".data(using: .utf8)!) // For Safari and other WebKit browsers, this is reliable, return immediately if !isChromiumBrowser && !isFirefox { @@ -309,17 +309,17 @@ class AccessibilityContextService { foundURL = findURLInChildren(element: windowElement, depth: 0, maxDepth: 3) if foundURL != nil { urlSource = "tree_walking_fallback" - FileHandle.standardError.write("🔍 Found URL from tree walking (fallback): \(foundURL!)\n".data(using: .utf8)!) + // FileHandle.standardError.write("🔍 Found URL from tree walking (fallback): \(foundURL!)\n".data(using: .utf8)!) return foundURL } } if foundURL != nil { - FileHandle.standardError.write("🔍 Returning URL (\(urlSource)): \(foundURL!)\n".data(using: .utf8)!) + // FileHandle.standardError.write("🔍 Returning URL (\(urlSource)): \(foundURL!)\n".data(using: .utf8)!) return foundURL } - FileHandle.standardError.write("🔍 No URL found from any method\n".data(using: .utf8)!) + // FileHandle.standardError.write("🔍 No URL found from any method\n".data(using: .utf8)!) return nil } @@ -355,11 +355,11 @@ class AccessibilityContextService { } // log role - FileHandle.standardError.write("🔍 Found element with role: \(role) at depth \(currentDepth + 1)\n".data(using: .utf8)!) + // FileHandle.standardError.write("🔍 Found element with role: \(role) at depth \(currentDepth + 1)\n".data(using: .utf8)!) // log all attribute names - FileHandle.standardError.write("🔍 Element attributes: \(getAttributeNames(element: child))\n".data(using: .utf8)!) + // FileHandle.standardError.write("🔍 Element attributes: \(getAttributeNames(element: child))\n".data(using: .utf8)!) // log kAXURLAttribute - FileHandle.standardError.write("🔍 kAXURLAttribute: \(getAttributeValue(element: child, attribute: kAXURLAttribute) ?? "none")\n".data(using: .utf8)!) + // FileHandle.standardError.write("🔍 kAXURLAttribute: \(getAttributeValue(element: child, attribute: kAXURLAttribute) ?? "none")\n".data(using: .utf8)!) // Priority 1: Address/search fields (most current) if role == "AXTextField" || role == "AXComboBox" || role == "AXSafariAddressAndSearchField" { @@ -370,7 +370,7 @@ class AccessibilityContextService { let value = valueRef as? String, !value.isEmpty, (value.hasPrefix("http://") || value.hasPrefix("https://") || value.contains(".")) { - FileHandle.standardError.write("🔍 Found URL in address field (\(role)): \(value)\n".data(using: .utf8)!) + // FileHandle.standardError.write("🔍 Found URL in address field (\(role)): \(value)\n".data(using: .utf8)!) return value } } @@ -389,7 +389,7 @@ class AccessibilityContextService { kAXURLAttribute as CFString, &urlRef) == .success, let urlString = urlRef as? String, !urlString.isEmpty { - FileHandle.standardError.write("🔍 Found URL in web area: \(urlString)\n".data(using: .utf8)!) + // FileHandle.standardError.write("🔍 Found URL in web area: \(urlString)\n".data(using: .utf8)!) return urlString } @@ -397,7 +397,7 @@ class AccessibilityContextService { kAXDocumentAttribute as CFString, &urlRef) == .success, let urlString = urlRef as? String, !urlString.isEmpty { - FileHandle.standardError.write("🔍 Found URL in web area document: \(urlString)\n".data(using: .utf8)!) + // FileHandle.standardError.write("🔍 Found URL in web area document: \(urlString)\n".data(using: .utf8)!) return urlString } }