diff --git a/apps/desktop/src/main/managers/recording-manager.ts b/apps/desktop/src/main/managers/recording-manager.ts index 04bff82..1334551 100644 --- a/apps/desktop/src/main/managers/recording-manager.ts +++ b/apps/desktop/src/main/managers/recording-manager.ts @@ -278,7 +278,7 @@ export class RecordingManager extends EventEmitter { const vadService = this.serviceManager.getService("vadService"); vadService.reset(); - // Refresh accessibility context + // Refresh accessibility context (TextMarker API for Electron support) const nativeBridge = this.serviceManager.getService("nativeBridge"); nativeBridge.refreshAccessibilityContext(); diff --git a/apps/desktop/src/services/platform/native-bridge-service.ts b/apps/desktop/src/services/platform/native-bridge-service.ts index c11abc3..b4b7d9f 100644 --- a/apps/desktop/src/services/platform/native-bridge-service.ts +++ b/apps/desktop/src/services/platform/native-bridge-service.ts @@ -20,6 +20,10 @@ import { GetAccessibilityTreeDetailsResult, GetAccessibilityContextParams, GetAccessibilityContextResult, + GetAccessibilityStatusParams, + GetAccessibilityStatusResult, + RequestAccessibilityPermissionParams, + RequestAccessibilityPermissionResult, PasteTextParams, PasteTextResult, MuteSystemAudioParams, @@ -28,6 +32,7 @@ import { RestoreSystemAudioResult, SetShortcutsParams, SetShortcutsResult, + AppContext, } from "@amical/types"; // Define the interface for RPC methods @@ -40,6 +45,14 @@ interface RPCMethods { params: GetAccessibilityContextParams; result: GetAccessibilityContextResult; }; + getAccessibilityStatus: { + params: GetAccessibilityStatusParams; + result: GetAccessibilityStatusResult; + }; + requestAccessibilityPermission: { + params: RequestAccessibilityPermissionParams; + result: RequestAccessibilityPermissionResult; + }; pasteText: { params: PasteTextParams; result: PasteTextResult; @@ -74,7 +87,7 @@ export class NativeBridge extends EventEmitter { >(); private helperPath: string; private logger = 
createScopedLogger("native-bridge"); - private accessibilityContext: GetAccessibilityContextResult | null = null; + private accessibilityContext: AppContext | null = null; // Auto-restart configuration private static readonly MAX_RESTARTS = 3; @@ -435,15 +448,16 @@ export class NativeBridge extends EventEmitter { */ async refreshAccessibilityContext(): Promise { try { - const context = await this.call("getAccessibilityContext", { + const result = await this.call("getAccessibilityContext", { editableOnly: false, }); - this.accessibilityContext = context; + this.accessibilityContext = result.context; this.logger.debug("Accessibility context refreshed", { - hasApplication: !!context.context?.application?.name, - hasFocusedElement: !!context.context?.focusedElement?.role, - hasTextSelection: !!context.context?.textSelection?.selectedText, - hasWindow: !!context.context?.windowInfo?.title, + hasApplication: !!result.context?.application?.name, + hasFocusedElement: !!result.context?.focusedElement?.role, + hasTextSelection: !!result.context?.textSelection?.selectedText, + extractionMethod: result.context?.textSelection?.extractionMethod, + metricsMs: result.context?.metrics?.totalTimeMs, }); } catch (error) { this.logger.error("Failed to refresh accessibility context", { @@ -454,9 +468,13 @@ export class NativeBridge extends EventEmitter { /** * Get the cached accessibility context. + * Returns in the result wrapper format for API consistency. */ getAccessibilityContext(): GetAccessibilityContextResult | null { - return this.accessibilityContext; + if (this.accessibilityContext === null) { + return null; + } + return { context: this.accessibilityContext }; } /** @@ -481,6 +499,20 @@ export class NativeBridge extends EventEmitter { } } + /** + * Get accessibility permission status. + */ + async getAccessibilityStatus(): Promise { + return this.call("getAccessibilityStatus", {}); + } + + /** + * Request accessibility permission. 
+ */ + async requestAccessibilityPermission(): Promise { + return this.call("requestAccessibilityPermission", {}); + } + // Typed event emitter methods on( event: E, diff --git a/packages/native-helpers/swift-helper/Sources/SwiftHelper/AccessibilityContextService.swift b/packages/native-helpers/swift-helper/Sources/SwiftHelper/AccessibilityContextService.swift deleted file mode 100644 index 00aafed..0000000 --- a/packages/native-helpers/swift-helper/Sources/SwiftHelper/AccessibilityContextService.swift +++ /dev/null @@ -1,526 +0,0 @@ -import Foundation -import ApplicationServices -import AppKit - -// Apps that need manual accessibility enabling -let appsManuallyEnableAx: Set = ["com.google.Chrome", "org.mozilla.firefox", "com.microsoft.edgemac", "com.apple.Safari"] - -struct ProcessInfo { - let pid: pid_t - let name: String? - let bundleIdentifier: String? - let version: String? -} - -struct Selection { - let text: String - let process: ProcessInfo - let preSelection: String? - let postSelection: String? - let fullContent: String? - let selectionRange: NSRange? - let isEditable: Bool - let elementType: String? -} - -class AccessibilityContextService { - - static func checkAccessibilityPermissions(prompt: Bool = false) -> Bool { - let options: [String: Any] = [kAXTrustedCheckOptionPrompt.takeUnretainedValue() as String: prompt] - return AXIsProcessTrustedWithOptions(options as CFDictionary) - } - - static func getFrontProcessID() -> pid_t { - guard let frontmostApp = NSWorkspace.shared.frontmostApplication else { - FileHandle.standardError.write("โŒ No frontmost application found\n".data(using: .utf8)!) - return 0 - } - return frontmostApp.processIdentifier - } - - static func getProcessName(pid: pid_t) -> String? { - guard let application = NSRunningApplication(processIdentifier: pid), - let url = application.executableURL else { - return nil - } - return url.lastPathComponent - } - - static func getBundleIdentifier(pid: pid_t) -> String? 
{ - guard let application = NSRunningApplication(processIdentifier: pid) else { - return nil - } - return application.bundleIdentifier - } - - static func getApplicationVersion(pid: pid_t) -> String? { - guard let application = NSRunningApplication(processIdentifier: pid), - let bundle = Bundle(url: application.bundleURL ?? URL(fileURLWithPath: "")) else { - return nil - } - return bundle.infoDictionary?["CFBundleShortVersionString"] as? String - } - - static func touchDescendantElements(_ element: AXUIElement, maxDepth: Int) { - guard maxDepth > 0 else { return } - - var children: CFTypeRef? - let error = AXUIElementCopyAttributeValue(element, kAXChildrenAttribute as CFString, &children) - - guard error == .success, let childrenArray = children as? [AXUIElement] else { - return - } - - // Limit to 8 children to avoid performance issues - let limitedChildren = Array(childrenArray.prefix(8)) - for child in limitedChildren { - touchDescendantElements(child, maxDepth: maxDepth - 1) - } - } - - static func _getFocusedElement(pid: pid_t) -> AXUIElement? { - let application = AXUIElementCreateApplication(pid) - - // Enable manual accessibility for specific apps - if let bundleId: String = getBundleIdentifier(pid: pid), - appsManuallyEnableAx.contains(bundleId) { - // FileHandle.standardError.write("๐Ÿ”ง Enabling manual accessibility for \(bundleId)\n".data(using: .utf8)!) - AXUIElementSetAttributeValue(application, "AXManualAccessibility" as CFString, kCFBooleanTrue) - AXUIElementSetAttributeValue(application, "AXEnhancedUserInterface" as CFString, kCFBooleanTrue) - } - - var focusedElement: CFTypeRef? - var error = AXUIElementCopyAttributeValue(application, kAXFocusedUIElementAttribute as CFString, &focusedElement) - - // Fallback to focused window if focused element fails - if error != .success { - // FileHandle.standardError.write("โš ๏ธ Failed to get focused element, trying focused window...\n".data(using: .utf8)!) 
- error = AXUIElementCopyAttributeValue(application, kAXFocusedWindowAttribute as CFString, &focusedElement) - } - - guard error == .success, let element = focusedElement else { - // FileHandle.standardError.write("โŒ Failed to get focused element or window. Error: \(error.rawValue)\n".data(using: .utf8)!) - return nil - } - - return (element as! AXUIElement) - } - - static func getAttributeValue(element: AXUIElement, attribute: String) -> String? { - var value: CFTypeRef? - let error = AXUIElementCopyAttributeValue(element, attribute as CFString, &value) - - if error == .success { - if let stringValue = value as? String { - return stringValue - } else if let numberValue = value as? NSNumber { - return numberValue.stringValue - } else if let boolValue = value as? Bool { - return boolValue ? "true" : "false" - } - } - return nil - } - - static func getAttributeNames(element: AXUIElement) -> [String] { - var attributeNames: CFArray? - let error = AXUIElementCopyAttributeNames(element, &attributeNames) - - if error == .success, let names = attributeNames as? 
[String] { - return names - } - return [] - } - - static func isElementEditable(element: AXUIElement) -> Bool { - let role = getAttributeValue(element: element, attribute: kAXRoleAttribute) - let subrole = getAttributeValue(element: element, attribute: kAXSubroleAttribute) - - // Check for editable roles - let editableRoles = ["AXTextField", "AXTextArea", "AXComboBox"] - if let role = role, editableRoles.contains(role) { - return true - } - - // Check for editable subroles - let editableSubroles = ["AXSecureTextField", "AXSearchField"] - if let subrole = subrole, editableSubroles.contains(subrole) { - return true - } - - // Check if element has AXValue attribute (often indicates editability) - let attributes = getAttributeNames(element: element) - return attributes.contains(kAXValueAttribute) - } - - static func getParentChain(element: AXUIElement, maxDepth: Int = 10) -> [String] { - var chain: [String] = [] - var currentElement = element - - for _ in 0.. TextSelection? { - // Get full content first - we need this to provide context - let fullContent = getAttributeValue(element: element, attribute: kAXValueAttribute) - - // Get selection/cursor range - var selectionRange: SelectionRange? = nil - var rangeValue: CFTypeRef? - let rangeError = AXUIElementCopyAttributeValue(element, kAXSelectedTextRangeAttribute as CFString, &rangeValue) - - if rangeError == .success, let axValue = rangeValue { - var range = CFRange() - if AXValueGetValue(axValue as! 
AXValue, .cfRange, &range) { - selectionRange = SelectionRange(length: Int(range.length), location: Int(range.location)) - } - } - - // If we have no cursor/selection position and no content, return nil - guard selectionRange != nil || fullContent != nil else { - return nil - } - - // Get selected text (may be empty if just cursor position) - let selectedText = getAttributeValue(element: element, attribute: kAXSelectedTextAttribute) - - // Calculate pre and post selection/cursor text - // Return "" instead of nil when cursor is at start/end of document - var preSelectionText: String? = nil - var postSelectionText: String? = nil - - if let fullContent = fullContent, let range = selectionRange { - let nsString = fullContent as NSString - - // Pre-selection text: last MAX_CONTEXT_LENGTH chars before cursor/selection - // Returns "" if cursor is at start of document (position 0) - if range.location > 0 { - let preLength = min(range.location, MAX_CONTEXT_LENGTH) - let preStart = range.location - preLength - let preRange = NSRange(location: preStart, length: preLength) - preSelectionText = nsString.substring(with: preRange) - } else { - preSelectionText = "" - } - - // Post-selection text: first MAX_CONTEXT_LENGTH chars after cursor/selection - // Returns "" if cursor is at end of document - let postStart = range.location + range.length - if postStart < nsString.length { - let postLength = min(nsString.length - postStart, MAX_CONTEXT_LENGTH) - let postRange = NSRange(location: postStart, length: postLength) - postSelectionText = nsString.substring(with: postRange) - } else { - postSelectionText = "" - } - } - - let isEditable = isElementEditable(element: element) - - return TextSelection( - fullContent: fullContent, - isEditable: isEditable, - postSelectionText: postSelectionText, - preSelectionText: preSelectionText, - selectedText: selectedText, - selectionRange: selectionRange - ) - } - - static func getBrowserURL(windowElement: AXUIElement, bundleId: String?) 
-> String? { - var foundURL: String? = nil - var urlSource = "none" - - // Debug: Print all window attributes - // FileHandle.standardError.write("๐Ÿ” Window attributes:\n".data(using: .utf8)!) - let attributes = getAttributeNames(element: windowElement) - for attribute in attributes { - if let value = getAttributeValue(element: windowElement, attribute: attribute) { - // FileHandle.standardError.write(" \(attribute): \(value)\n".data(using: .utf8)!) - } else { - // FileHandle.standardError.write(" \(attribute): \n".data(using: .utf8)!) - } - } - - // Determine browser type for conditional logic - let isChromiumBrowser = bundleId?.lowercased().contains("chrome") == true || - bundleId?.lowercased().contains("chromium") == true || - bundleId == "com.microsoft.edgemac" || - bundleId == "com.brave.Browser" || - bundleId == "com.operasoftware.Opera" || - bundleId == "com.vivaldi.Vivaldi" - - let isFirefox = bundleId == "org.mozilla.firefox" - - // FileHandle.standardError.write("๐Ÿ” Browser type - Chromium: \(isChromiumBrowser), Firefox: \(isFirefox), Bundle: \(bundleId ?? "unknown")\n".data(using: .utf8)!) - - // For Chromium browsers and Firefox: Prioritize AXWebArea (live URL) - if isChromiumBrowser || isFirefox { - // FileHandle.standardError.write("๐Ÿ” Using AXWebArea priority for Chromium/Firefox browser\n".data(using: .utf8)!) - foundURL = findURLInChildren(element: windowElement, depth: 0, maxDepth: 30) - if foundURL != nil { - urlSource = "tree_walking_priority" - // FileHandle.standardError.write("๐Ÿ” Found URL from AXWebArea (priority): \(foundURL!)\n".data(using: .utf8)!) - return foundURL - } - } - - // Try window-level attributes (reliable for Safari, fallback for others) - var urlRef: CFTypeRef? - let docErr = AXUIElementCopyAttributeValue(windowElement, - kAXDocumentAttribute as CFString, - &urlRef) - if docErr == .success, let urlString = urlRef as? 
String, !urlString.isEmpty { - foundURL = urlString - urlSource = "window_document" - // FileHandle.standardError.write("๐Ÿ” Found URL from window document: \(urlString)\n".data(using: .utf8)!) - - // For Safari and other WebKit browsers, this is reliable, return immediately - if !isChromiumBrowser && !isFirefox { - return foundURL - } - // For Chromium/Firefox, keep this as fallback but continue looking - } - - if AXUIElementCopyAttributeValue(windowElement, - kAXURLAttribute as CFString, - &urlRef) == .success, - let urlString = urlRef as? String, !urlString.isEmpty { - if foundURL == nil { - foundURL = urlString - urlSource = "window_url" - // FileHandle.standardError.write("๐Ÿ” Found URL from window URL attribute: \(urlString)\n".data(using: .utf8)!) - - // For Safari and other WebKit browsers, this is reliable, return immediately - if !isChromiumBrowser && !isFirefox { - return foundURL - } - } - } - - // For non-Chromium browsers that didn't find window URLs, try tree walking - if !isChromiumBrowser && !isFirefox && foundURL == nil { - foundURL = findURLInChildren(element: windowElement, depth: 0, maxDepth: 3) - if foundURL != nil { - urlSource = "tree_walking_fallback" - // FileHandle.standardError.write("๐Ÿ” Found URL from tree walking (fallback): \(foundURL!)\n".data(using: .utf8)!) - return foundURL - } - } - - if foundURL != nil { - // FileHandle.standardError.write("๐Ÿ” Returning URL (\(urlSource)): \(foundURL!)\n".data(using: .utf8)!) - return foundURL - } - - // FileHandle.standardError.write("๐Ÿ” No URL found from any method\n".data(using: .utf8)!) - return nil - } - - static func findURLInChildren(element: AXUIElement, depth: Int, maxDepth: Int) -> String? 
{ - guard depth < maxDepth else { return nil } - - // BFS implementation using a queue - var queue: [(element: AXUIElement, depth: Int)] = [(element, depth)] - - while !queue.isEmpty { - let (currentElement, currentDepth) = queue.removeFirst() - - // Skip if we've exceeded max depth - guard currentDepth < maxDepth else { continue } - - var childrenRef: CFTypeRef? - guard AXUIElementCopyAttributeValue(currentElement, - kAXChildrenAttribute as CFString, - &childrenRef) == .success, - let children = childrenRef as? [AXUIElement] else { - continue - } - - // Process all children at current level first (BFS) - for child in children { - // Check role first - var roleRef: CFTypeRef? - guard AXUIElementCopyAttributeValue(child, - kAXRoleAttribute as CFString, - &roleRef) == .success, - let role = roleRef as? String else { - continue - } - - // log role - // FileHandle.standardError.write("๐Ÿ” Found element with role: \(role) at depth \(currentDepth + 1)\n".data(using: .utf8)!) - // log all attribute names - // FileHandle.standardError.write("๐Ÿ” Element attributes: \(getAttributeNames(element: child))\n".data(using: .utf8)!) - // log kAXURLAttribute - // FileHandle.standardError.write("๐Ÿ” kAXURLAttribute: \(getAttributeValue(element: child, attribute: kAXURLAttribute) ?? "none")\n".data(using: .utf8)!) - - // Priority 1: Address/search fields (most current) - if role == "AXTextField" || role == "AXComboBox" || role == "AXSafariAddressAndSearchField" { - var valueRef: CFTypeRef? - if AXUIElementCopyAttributeValue(child, - kAXValueAttribute as CFString, - &valueRef) == .success, - let value = valueRef as? String, - !value.isEmpty, - (value.hasPrefix("http://") || value.hasPrefix("https://") || value.contains(".")) { - // FileHandle.standardError.write("๐Ÿ” Found URL in address field (\(role)): \(value)\n".data(using: .utf8)!) 
- return value - } - } - - // Priority 2: Web areas - if role == "AXWebArea" { - FileHandle.standardError.write("๐Ÿ” Found AXWebArea element at depth \(currentDepth + 1)\n".data(using: .utf8)!) - // list all attributes for this element - FileHandle.standardError.write("๐Ÿ” AXWebArea attributes: \(getAttributeNames(element: child))\n".data(using: .utf8)!) - // iterate and list value for all attributes - for attribute in getAttributeNames(element: child) { - FileHandle.standardError.write("๐Ÿ” \(attribute): \(getAttributeValue(element: child, attribute: attribute) ?? "none")\n".data(using: .utf8)!) - } - var urlRef: CFTypeRef? - if AXUIElementCopyAttributeValue(child, - kAXURLAttribute as CFString, - &urlRef) == .success, - let urlString = urlRef as? String, !urlString.isEmpty { - // FileHandle.standardError.write("๐Ÿ” Found URL in web area: \(urlString)\n".data(using: .utf8)!) - return urlString - } - - if AXUIElementCopyAttributeValue(child, - kAXDocumentAttribute as CFString, - &urlRef) == .success, - let urlString = urlRef as? String, !urlString.isEmpty { - // FileHandle.standardError.write("๐Ÿ” Found URL in web area document: \(urlString)\n".data(using: .utf8)!) - return urlString - } - } - - // Add child to queue for next level processing - queue.append((child, currentDepth + 1)) - } - } - - return nil - } - - static func getWindowInfo(pid: pid_t) -> WindowInfo? { - let application = AXUIElementCreateApplication(pid) - - // Get main window - var mainWindow: CFTypeRef? - let error = AXUIElementCopyAttributeValue(application, kAXMainWindowAttribute as CFString, &mainWindow) - - guard error == .success, let windowRef = mainWindow else { - return nil - } - - // Check if the window is actually an AXUIElement - guard CFGetTypeID(windowRef) == AXUIElementGetTypeID() else { - return nil - } - - let window = windowRef as! 
AXUIElement - let title = getAttributeValue(element: window, attribute: kAXTitleAttribute) - - // Get URL if this is a browser - let url = getBrowserURL(windowElement: window, bundleId: getBundleIdentifier(pid: pid)) - - return WindowInfo( - title: title, - url: url - ) - } - - static func getAccessibilityContext(editableOnly: Bool = false) -> Context? { - // Check accessibility permissions - guard checkAccessibilityPermissions() else { - FileHandle.standardError.write("โŒ Accessibility permissions not granted\n".data(using: .utf8)!) - return nil - } - - // Get frontmost application - let pid = getFrontProcessID() - guard pid > 0 else { - FileHandle.standardError.write("โŒ Could not get frontmost application PID\n".data(using: .utf8)!) - return nil - } - - let processName = getProcessName(pid: pid) - let bundleId = getBundleIdentifier(pid: pid) - let version = getApplicationVersion(pid: pid) - - // Create application info - let applicationInfo = Application( - bundleIdentifier: bundleId, - name: processName, - version: version - ) - - // Get focused element - var focusedElementInfo: FocusedElement? = nil - var textSelectionInfo: TextSelection? 
= nil - - if let focusedElement = _getFocusedElement(pid: pid) { - // Touch descendant elements to ensure they're accessible - touchDescendantElements(focusedElement, maxDepth: 3) - - let role = getAttributeValue(element: focusedElement, attribute: kAXRoleAttribute) - let title = getAttributeValue(element: focusedElement, attribute: kAXTitleAttribute) - let description = getAttributeValue(element: focusedElement, attribute: kAXDescriptionAttribute) - let value = getAttributeValue(element: focusedElement, attribute: kAXValueAttribute) - let isEditable = isElementEditable(element: focusedElement) - - focusedElementInfo = FocusedElement( - description: description, - isEditable: isEditable, - role: role, - title: title, - value: value - ) - - // Get text selection if available and not filtered by editableOnly - if let textSelection = getTextSelection(element: focusedElement) { - if !editableOnly || textSelection.isEditable { - textSelectionInfo = textSelection - } - } - } - - // Get window info - let windowInfo = getWindowInfo(pid: pid) - - // Create context - let context = Context( - application: applicationInfo, - focusedElement: focusedElementInfo, - textSelection: textSelectionInfo, - timestamp: Date().timeIntervalSince1970, - windowInfo: windowInfo - ) - - return context - } -} \ No newline at end of file diff --git a/packages/native-helpers/swift-helper/Sources/SwiftHelper/AccessibilityService.swift b/packages/native-helpers/swift-helper/Sources/SwiftHelper/AccessibilityService.swift index 11ef7ff..f4d1917 100644 --- a/packages/native-helpers/swift-helper/Sources/SwiftHelper/AccessibilityService.swift +++ b/packages/native-helpers/swift-helper/Sources/SwiftHelper/AccessibilityService.swift @@ -48,7 +48,7 @@ struct AccessibilityElementNode: Codable { class AccessibilityService { - private let maxDepth = 10 // To prevent excessively deep recursion and large payloads + private let maxDepth = ACCESSIBILITY_TREE_MAX_DEPTH // To prevent excessively deep recursion and 
large payloads private let dateFormatter: DateFormatter // Properties to store original audio states @@ -478,23 +478,19 @@ class AccessibilityService { return false } - // Simulate Cmd+V - // Using deprecated kVK_Command might still work but kCGEventFlagMaskCommand is preferred. - // Virtual key code for 'v' is 9. - let vKeyCode: CGKeyCode = 9 - + // Simulate Cmd+V using virtual key codes from Constants.swift let source = CGEventSource(stateID: .hidSystemState) - let cmdDown = CGEvent(keyboardEventSource: source, virtualKey: CGKeyCode(55), keyDown: true) // 55 is kVK_Command + let cmdDown = CGEvent(keyboardEventSource: source, virtualKey: VK_COMMAND, keyDown: true) cmdDown?.flags = .maskCommand - let vDown = CGEvent(keyboardEventSource: source, virtualKey: vKeyCode, keyDown: true) + let vDown = CGEvent(keyboardEventSource: source, virtualKey: VK_V, keyDown: true) vDown?.flags = .maskCommand // Keep command flag for the V press as well - let vUp = CGEvent(keyboardEventSource: source, virtualKey: vKeyCode, keyDown: false) + let vUp = CGEvent(keyboardEventSource: source, virtualKey: VK_V, keyDown: false) vUp?.flags = .maskCommand - let cmdUp = CGEvent(keyboardEventSource: source, virtualKey: CGKeyCode(55), keyDown: false) + let cmdUp = CGEvent(keyboardEventSource: source, virtualKey: VK_COMMAND, keyDown: false) // No flags needed for key up typically, or just .maskCommand if it was held if cmdDown == nil || vDown == nil || vUp == nil || cmdUp == nil { @@ -516,7 +512,7 @@ class AccessibilityService { // Restore the original pasteboard content after a short delay // to allow the paste action to complete. 
- DispatchQueue.main.asyncAfter(deadline: .now() + 0.2) { // 200ms delay + DispatchQueue.main.asyncAfter(deadline: .now() + PASTE_RESTORE_DELAY_SECONDS) { self.restorePasteboard( pasteboard: pasteboard, items: originalPasteboardItems, originalChangeCount: originalChangeCount) @@ -547,10 +543,6 @@ class AccessibilityService { } } - // Define kVK_Function if not available from a system framework directly in this context. - // 0x3F is the virtual key code for the Fn key on Apple keyboards. - private let kVK_Function: CGKeyCode = 0x3F - // Determines whether a keyboard event should be forwarded to the Electron application. // This method should be called from the CGEventTap callback in main.swift or RpcHandler.swift. public func shouldForwardKeyboardEvent(event: CGEvent) -> Bool { @@ -570,7 +562,7 @@ class AccessibilityService { if type == .keyDown || type == .keyUp { // For keyDown and keyUp events, only forward if the event is FOR THE Fn KEY ITSELF. - if keyCode == kVK_Function { + if keyCode == VK_FUNCTION { // logToStderr("[AccessibilityService] Forwarding \(type == .keyDown ? "keyDown" : "keyUp") event because it IS the Fn key (keyCode: \(keyCode)).") return true } else { diff --git a/packages/native-helpers/swift-helper/Sources/SwiftHelper/RpcHandler.swift b/packages/native-helpers/swift-helper/Sources/SwiftHelper/RpcHandler.swift index a198a5d..b658863 100644 --- a/packages/native-helpers/swift-helper/Sources/SwiftHelper/RpcHandler.swift +++ b/packages/native-helpers/swift-helper/Sources/SwiftHelper/RpcHandler.swift @@ -1,12 +1,19 @@ import Foundation import ObjCExceptionCatcher +/// Flexible RPC request that can parse any method string +struct FlexibleRPCRequest: Codable { + let id: String + let method: String + let params: JSONAny? 
+} + class IOBridge: NSObject { - private let jsonEncoder: JSONEncoder - private let jsonDecoder: JSONDecoder + let jsonEncoder: JSONEncoder + let jsonDecoder: JSONDecoder private let accessibilityService: AccessibilityService private let audioService: AudioService - private let dateFormatter: DateFormatter + let dateFormatter: DateFormatter init(jsonEncoder: JSONEncoder, jsonDecoder: JSONDecoder) { self.jsonEncoder = jsonEncoder @@ -38,13 +45,21 @@ class IOBridge: NSObject { return case .getAccessibilityContext: - // Process accessibility context requests on dedicated thread + // Process accessibility context requests on dedicated thread (uses v2 service) AccessibilityQueue.shared.async { [weak self] in guard let self = self else { return } - self.handleAccessibilityContext(request) + self.handleGetAccessibilityContext(id: request.id, params: request.params) } return + case .getAccessibilityStatus: + handleGetAccessibilityStatus(id: request.id) + return + + case .requestAccessibilityPermission: + handleRequestAccessibilityPermission(id: request.id) + return + case .pasteText: logToStderr("[IOBridge] Handling pasteText for ID: \(request.id)") guard let paramsAnyCodable = request.params else { @@ -308,71 +323,70 @@ class IOBridge: NSObject { } } - private func handleAccessibilityContext(_ request: RPCRequestSchema) { - var contextParams: GetAccessibilityContextParamsSchema? = nil - logToStderr("[IOBridge] Handling getAccessibilityContext for ID: \(request.id)") + // MARK: - Accessibility Handlers (using consolidated service) - if let paramsAnyCodable = request.params { + private func handleGetAccessibilityContext(id: String, params: JSONAny?) 
{ + logToStderr("[IOBridge] Handling getAccessibilityContext for ID: \(id)") + + // Parse params (default editableOnly = false per spec) + var editableOnly = false + if let paramsAnyCodable = params { do { let paramsData = try jsonEncoder.encode(paramsAnyCodable) - contextParams = try jsonDecoder.decode( - GetAccessibilityContextParamsSchema.self, from: paramsData) - logToStderr( - "[IOBridge] Decoded contextParams.editableOnly: \(contextParams?.editableOnly ?? false) for ID: \(request.id)" - ) + let contextParams = try jsonDecoder.decode(GetAccessibilityContextParams.self, from: paramsData) + editableOnly = contextParams.editableOnly ?? false } catch { - logToStderr( - "[IOBridge] Error decoding getAccessibilityContext params: \(error.localizedDescription)" - ) - let errPayload = Error( - code: -32602, data: request.params, - message: "Invalid params: \(error.localizedDescription)") - let rpcResponse = RPCResponseSchema(error: errPayload, id: request.id, result: nil) - sendRpcResponse(rpcResponse) - return + logToStderr("[IOBridge] Error decoding params: \(error.localizedDescription)") } } - let editableOnly = contextParams?.editableOnly ?? 
false - + // Call service with exception handling switch ExceptionCatcher.try({ AccessibilityContextService.getAccessibilityContext(editableOnly: editableOnly) }) { case .success(let context): - logToStderr("[IOBridge] Retrieved context for ID: \(request.id)") - let resultPayload = GetAccessibilityContextResultSchema(context: context) - do { - let resultData = try jsonEncoder.encode(resultPayload) - let resultAsJsonAny = try jsonDecoder.decode(JSONAny.self, from: resultData) - let rpcResponse = RPCResponseSchema(error: nil, id: request.id, result: resultAsJsonAny) - sendRpcResponse(rpcResponse) - } catch { - logToStderr("[IOBridge] Error encoding result: \(error.localizedDescription) for ID: \(request.id)") - let errPayload = Error(code: -32603, data: nil, message: "Error encoding result: \(error.localizedDescription)") - let rpcResponse = RPCResponseSchema(error: errPayload, id: request.id, result: nil) - sendRpcResponse(rpcResponse) - } + logToStderr("[IOBridge] Retrieved context for ID: \(id)") + let result = GetAccessibilityContextResult(context: context) + sendResult(id: id, result: result) case .exception(let exception): logToStderr("[IOBridge] NSException in getAccessibilityContext: \(exception.name) - \(exception.reason)") - let exceptionData: [String: Any] = [ - "name": exception.name, - "reason": exception.reason, - "callStack": exception.callStack.prefix(10).joined(separator: "\n") - ] - var exceptionJsonAny: JSONAny? = nil - if let jsonData = try? JSONSerialization.data(withJSONObject: exceptionData), - let decoded = try? 
jsonDecoder.decode(JSONAny.self, from: jsonData) { - exceptionJsonAny = decoded - } - let errPayload = Error( - code: -32603, - data: exceptionJsonAny, - message: "\(exception.name): \(exception.reason)" - ) - let rpcResponse = RPCResponseSchema(error: errPayload, id: request.id, result: nil) - sendRpcResponse(rpcResponse) + sendError(id: id, code: -32603, message: "\(exception.name): \(exception.reason)") } } + private func handleGetAccessibilityStatus(id: String) { + logToStderr("[IOBridge] Handling getAccessibilityStatus for ID: \(id)") + + let result = AccessibilityContextService.getAccessibilityStatus() + sendResult(id: id, result: result) + } + + private func handleRequestAccessibilityPermission(id: String) { + logToStderr("[IOBridge] Handling requestAccessibilityPermission for ID: \(id)") + + let result = AccessibilityContextService.requestAccessibilityPermission() + sendResult(id: id, result: result) + } + + // MARK: - Response Helpers + + private func sendResult(id: String, result: T) { + do { + let resultData = try jsonEncoder.encode(result) + let resultAsJsonAny = try jsonDecoder.decode(JSONAny.self, from: resultData) + let rpcResponse = RPCResponseSchema(error: nil, id: id, result: resultAsJsonAny) + sendRpcResponse(rpcResponse) + } catch { + logToStderr("[IOBridge] Error encoding result: \(error.localizedDescription)") + sendError(id: id, code: -32603, message: "Error encoding result: \(error.localizedDescription)") + } + } + + private func sendError(id: String, code: Int, message: String) { + let errPayload = Error(code: code, data: nil, message: message) + let rpcResponse = RPCResponseSchema(error: errPayload, id: id, result: nil) + sendRpcResponse(rpcResponse) + } + } diff --git a/packages/native-helpers/swift-helper/Sources/SwiftHelper/models/AccessibilityBuilders.swift b/packages/native-helpers/swift-helper/Sources/SwiftHelper/models/AccessibilityBuilders.swift new file mode 100644 index 0000000..f22728e --- /dev/null +++ 
import Foundation

// =============================================================================
// Accessibility Builders
// =============================================================================
// Builder pattern helpers for creating accessibility response types.
// These use the auto-generated types from models/generated/models.swift.
// =============================================================================

// MARK: - Type Aliases for Backward Compatibility

/// Maps to the generated `Context` type (AppContext in TypeScript)
typealias AppContext = Context

/// Maps to the generated `Application` type (ApplicationInfo in TypeScript)
typealias ApplicationInfo = Application

/// Maps to the generated `FocusedElement` type (AXElementInfo in TypeScript)
typealias AXElementInfo = FocusedElement

/// Maps to the generated `Metrics` type (ExtractionMetrics in TypeScript)
typealias ExtractionMetrics = Metrics

/// Maps to the generated `The0` enum (ExtractionMethod in TypeScript)
typealias ExtractionMethod = The0

/// Maps to the generated `SelectionRange` type (same name)
typealias AccessibilitySelectionRange = SelectionRange

/// Maps to the generated `TextSelection` type (same name)
typealias AccessibilityTextSelection = TextSelection

/// Maps to the generated `WindowInfo` type (same name)
typealias AccessibilityWindowInfo = WindowInfo

/// Maps to the generated params type
typealias GetAccessibilityContextParams = GetAccessibilityContextParamsSchema

/// Maps to the generated result type
typealias GetAccessibilityContextResult = GetAccessibilityContextResultSchema

// MARK: - Result Types for Other Methods

/// Response result for getAccessibilityStatus
struct GetAccessibilityStatusResult: Codable {
    /// Does the app have accessibility permission?
    let hasPermission: Bool
    /// Is accessibility enabled system-wide?
    let isEnabled: Bool
}

/// Response result for requestAccessibilityPermission
struct RequestAccessibilityPermissionResult: Codable {
    /// Was permission granted?
    let granted: Bool
}

// MARK: - Builder for TextSelection

/// Mutable accumulator for the fields of a `TextSelection`.
///
/// Every field starts at an "empty" default (nil / false / `.none`), so callers only
/// assign what they actually extracted and then call `build()`.
class TextSelectionBuilder {
    var selectedText: String? = nil
    var fullContent: String? = nil
    var preSelectionText: String? = nil
    var postSelectionText: String? = nil
    var selectionRange: SelectionRange? = nil
    var isEditable: Bool = false
    var extractionMethod: ExtractionMethod = .none
    var hasMultipleRanges: Bool = false
    var isPlaceholder: Bool = false
    var isSecure: Bool = false
    var fullContentTruncated: Bool = false

    /// Creates a `TextSelection` from the builder's current field values.
    func build() -> TextSelection {
        // Argument order follows the generated initializer (alphabetical).
        return TextSelection(
            extractionMethod: extractionMethod,
            fullContent: fullContent,
            fullContentTruncated: fullContentTruncated,
            hasMultipleRanges: hasMultipleRanges,
            isEditable: isEditable,
            isPlaceholder: isPlaceholder,
            isSecure: isSecure,
            postSelectionText: postSelectionText,
            preSelectionText: preSelectionText,
            selectedText: selectedText,
            selectionRange: selectionRange
        )
    }

    /// Creates the result for a secure field: `isSecure` is true and every content
    /// field is nil.
    /// - Parameter isEditable: Editability to report for the secure field.
    static func secureField(isEditable: Bool) -> TextSelection {
        return TextSelection(
            extractionMethod: .none,
            fullContent: nil,
            fullContentTruncated: false,
            hasMultipleRanges: false,
            isEditable: isEditable,
            isPlaceholder: false,
            isSecure: true,
            postSelectionText: nil,
            preSelectionText: nil,
            selectedText: nil,
            selectionRange: nil  // Suppressed to prevent password length leakage
        )
    }
}

// MARK: - Builder for Metrics

/// Collects timing and diagnostic flags for one extraction pass and turns them into
/// a `Metrics` value. The clock starts when the builder is created.
class ExtractionMetricsBuilder {
    /// Captured in `init()`; `build()` measures elapsed time from this instant.
    private let startTime: CFAbsoluteTime
    var textMarkerAttempted: Bool = false
    var textMarkerSucceeded: Bool = false
    var fallbacksUsed: [ExtractionMethod] = []
    var errors: [String] = []
    /// May be set by a caller that detects a timeout itself. `build()` reports a
    /// timeout if this is true OR the elapsed time exceeds `EXTRACTION_TIMEOUT_MS`.
    var timedOut: Bool = false

    // WebArea retry path metrics
    var webAreaRetryAttempted: Bool = false
    var webAreaFound: Bool = false
    var webAreaRetrySucceeded: Bool = false

    init() {
        self.startTime = CFAbsoluteTimeGetCurrent()
    }

    /// Appends `method` to the list of fallback paths that were used.
    func recordFallback(_ method: ExtractionMethod) {
        fallbacksUsed.append(method)
    }

    /// Appends `message` to the error list unchanged.
    /// No filtering happens here, so callers must not pass PII in `message`.
    func recordError(_ message: String) {
        errors.append(message)
    }

    /// Creates a `Metrics` snapshot with the total elapsed time in milliseconds.
    func build() -> Metrics {
        let totalTimeMs = (CFAbsoluteTimeGetCurrent() - startTime) * 1000

        // Best-effort timeout: exceeded when the whole pass took longer than the budget.
        let exceededBudget = totalTimeMs > EXTRACTION_TIMEOUT_MS

        return Metrics(
            errors: errors,
            fallbacksUsed: fallbacksUsed,
            textMarkerAttempted: textMarkerAttempted,
            textMarkerSucceeded: textMarkerSucceeded,
            // Honor a timeout flagged by the caller instead of silently dropping it.
            timedOut: timedOut || exceededBudget,
            totalTimeMS: totalTimeMs,
            webAreaFound: webAreaFound,
            webAreaRetryAttempted: webAreaRetryAttempted,
            webAreaRetrySucceeded: webAreaRetrySucceeded
        )
    }
}
// =============================================================================

/// Main service for accessibility context extraction
class AccessibilityContextService {

    // MARK: - Main API

    /// Builds the accessibility context for the frontmost application.
    ///
    /// Steps, in order: permission check, frontmost PID lookup, application info,
    /// focused element and text selection, window info, metrics.
    /// - Parameter editableOnly: When true, only editable elements are searched for
    ///   and a non-editable text selection is discarded. Defaults to false.
    /// - Returns: The assembled `AppContext`, or nil when accessibility permission is
    ///   missing or no frontmost process ID could be obtained.
    static func getAccessibilityContext(editableOnly: Bool = false) -> AppContext? {
        // Created first so the reported total time covers the whole call.
        let metricsBuilder = ExtractionMetricsBuilder()

        guard PermissionsService.checkPermissions() else {
            logError("Accessibility permissions not granted")
            return nil
        }

        let pid = AXHelpers.getFrontProcessID()
        guard pid > 0 else {
            logError("Could not get frontmost application PID")
            return nil
        }

        // Arguments in alphabetical order per generated types
        let applicationInfo = ApplicationInfo(
            bundleIdentifier: AXHelpers.getBundleIdentifier(pid: pid),
            name: AXHelpers.getProcessName(pid: pid),
            pid: Int(pid),
            version: AXHelpers.getApplicationVersion(pid: pid)
        )

        var focusedElementInfo: AXElementInfo? = nil
        var textSelectionInfo: AccessibilityTextSelection? = nil

        if let focusedElement = FocusService.getFocusedElement(pid: pid) {
            // Touch descendants to ensure they're accessible (triggers lazy loading)
            AXHelpers.touchDescendants(focusedElement, maxDepth: TOUCH_DESCENDANTS_MAX_DEPTH)

            if let focusResult = FocusService.findTextCapableElement(from: focusedElement, editableOnly: editableOnly) {
                focusedElementInfo = FocusService.getElementInfo(element: focusResult.element)
                textSelectionInfo = SelectionExtractor.extract(from: focusResult.element, metricsBuilder: metricsBuilder)

                // With editableOnly, a selection from a non-editable element is dropped.
                if editableOnly, let selection = textSelectionInfo, !selection.isEditable {
                    textSelectionInfo = nil
                }
            } else {
                // No text-capable element found, but still report basic element info
                focusedElementInfo = FocusService.getElementInfo(element: focusedElement)
            }
        }

        let windowInfo = FocusService.getWindowInfo(pid: pid)
        let metrics = metricsBuilder.build()

        // Arguments in alphabetical order per generated types
        return AppContext(
            application: applicationInfo,
            focusedElement: focusedElementInfo,
            metrics: metrics,
            schemaVersion: .the20,
            textSelection: textSelectionInfo,
            timestamp: Date().timeIntervalSince1970,
            windowInfo: windowInfo
        )
    }

    // MARK: - Permission APIs

    /// Returns the permission status reported by `PermissionsService.getStatus()`.
    static func getAccessibilityStatus() -> GetAccessibilityStatusResult {
        return PermissionsService.getStatus()
    }

    /// Returns the result of `PermissionsService.requestPermission()`.
    static func requestAccessibilityPermission() -> RequestAccessibilityPermissionResult {
        return PermissionsService.requestPermission()
    }

    // MARK: - Logging

    /// Writes `message` to stderr, prefixed with a cross mark.
    private static func logError(_ message: String) {
        // Data(_:) over the UTF-8 view cannot fail, so no force-unwrap is needed.
        FileHandle.standardError.write(Data("❌ \(message)\n".utf8))
    }

    /// Writes `message` to stderr in DEBUG builds only; a no-op otherwise.
    private static func logDebug(_ message: String) {
        #if DEBUG
        FileHandle.standardError.write(Data("🔍 \(message)\n".utf8))
        #endif
    }
}
+ var error = AXUIElementCopyAttributeValue( + application, + kAXFocusedUIElementAttribute as CFString, + &focusedElement + ) + + // Fallback to focused window if focused element fails + if error != .success { + error = AXUIElementCopyAttributeValue( + application, + kAXFocusedWindowAttribute as CFString, + &focusedElement + ) + } + + guard error == .success, let element = focusedElement else { + return nil + } + + return (element as! AXUIElement) + } + + // MARK: - Find Text-Capable Element + + /// Find a text-capable element starting from the focused element + /// Searches descendants first, then ancestors + /// - Parameters: + /// - element: Starting element + /// - editableOnly: If true, only return editable elements + /// - Returns: FocusResult with the found element, or nil + static func findTextCapableElement(from element: AXUIElement, editableOnly: Bool) -> FocusResult? { + let role = AXHelpers.getStringAttribute(element, kAXRoleAttribute) + + // Check if current element is text-capable + if AXHelpers.isTextCapable(element) { + if !editableOnly || AXHelpers.isElementEditable(element) { + return FocusResult(element: element, role: role, wasSearched: false) + } + } + + // Search descendants for text-capable element + if let descendant = searchDescendantsForTextCapable(element: element, editableOnly: editableOnly) { + let descendantRole = AXHelpers.getStringAttribute(descendant, kAXRoleAttribute) + return FocusResult(element: descendant, role: descendantRole, wasSearched: true) + } + + // Search ancestors for text-capable element + if let ancestor = searchAncestorsForTextCapable(element: element, editableOnly: editableOnly) { + let ancestorRole = AXHelpers.getStringAttribute(ancestor, kAXRoleAttribute) + return FocusResult(element: ancestor, role: ancestorRole, wasSearched: true) + } + + // If editableOnly is false, return the original element if it has any text attributes + if !editableOnly && AXHelpers.hasAttribute(element, kAXValueAttribute) { + return 
FocusResult(element: element, role: role, wasSearched: false) + } + + return nil + } + + // MARK: - Descendant Search + + /// Search descendants for a text-capable element using BFS + private static func searchDescendantsForTextCapable( + element: AXUIElement, + editableOnly: Bool, + maxDepth: Int = TREE_WALK_MAX_DEPTH, + maxElements: Int = TREE_WALK_MAX_ELEMENTS + ) -> AXUIElement? { + var queue: [(element: AXUIElement, depth: Int)] = [(element, 0)] + var elementsSearched = 0 + + while !queue.isEmpty && elementsSearched < maxElements { + let (current, currentDepth) = queue.removeFirst() + elementsSearched += 1 + + // Skip if we've exceeded max depth + guard currentDepth < maxDepth else { continue } + + let children = AXHelpers.getChildren(current) + + for child in children { + // Check if child is text-capable + if AXHelpers.isTextCapable(child) { + if !editableOnly || AXHelpers.isElementEditable(child) { + return child + } + } + + // Add to queue for further search + queue.append((child, currentDepth + 1)) + } + } + + return nil + } + + // MARK: - Ancestor Search + + /// Search ancestors for a text-capable element + private static func searchAncestorsForTextCapable( + element: AXUIElement, + editableOnly: Bool, + maxDepth: Int = TREE_WALK_MAX_DEPTH + ) -> AXUIElement? { + var currentElement = element + + for _ in 0.. AXElementInfo { + let role = AXHelpers.getStringAttribute(element, kAXRoleAttribute) + let subrole = AXHelpers.getStringAttribute(element, kAXSubroleAttribute) + let title = AXHelpers.getStringAttribute(element, kAXTitleAttribute) + let description = AXHelpers.getStringAttribute(element, kAXDescriptionAttribute) + let isEditable = AXHelpers.isElementEditable(element) + let isSecure = AXHelpers.isSecureField(element) + + // Suppress value for secure fields + let value: String? = isSecure ? 
nil : AXHelpers.getStringAttribute(element, kAXValueAttribute) + + // Check placeholder + let isPlaceholder = AXHelpers.isPlaceholderShowing(element, selectionLength: nil) + + // Check focus (AXFocused attribute) + let isFocused = AXHelpers.getBoolAttribute(element, kAXFocusedAttribute) ?? true + + // Arguments in alphabetical order per generated types + return AXElementInfo( + description: description, + isEditable: isEditable, + isFocused: isFocused, + isPlaceholder: isPlaceholder, + isSecure: isSecure, + role: role, + subrole: subrole, + title: title, + value: value + ) + } + + // MARK: - Window Info Extraction + + /// Get window info for an application + static func getWindowInfo(pid: pid_t) -> AccessibilityWindowInfo? { + let application = AXHelpers.createApplicationElement(pid: pid) + + // Get main window + var mainWindow: CFTypeRef? + let error = AXUIElementCopyAttributeValue( + application, + kAXMainWindowAttribute as CFString, + &mainWindow + ) + + guard error == .success, let windowRef = mainWindow else { + return nil + } + + // Verify it's an AXUIElement + guard CFGetTypeID(windowRef) == AXUIElementGetTypeID() else { + return nil + } + + let window = windowRef as! AXUIElement + let title = AXHelpers.getStringAttribute(window, kAXTitleAttribute) + + // Get URL if this is a browser + let bundleId = AXHelpers.getBundleIdentifier(pid: pid) + let url = getBrowserURL(windowElement: window, bundleId: bundleId) + + return AccessibilityWindowInfo(title: title, url: url) + } + + // MARK: - Browser URL Extraction + + /// Get browser URL from window element + private static func getBrowserURL(windowElement: AXUIElement, bundleId: String?) -> String? 
{ + // Determine browser type + let isChromiumBrowser = bundleId?.lowercased().contains("chrome") == true || + bundleId?.lowercased().contains("chromium") == true || + bundleId == "com.microsoft.edgemac" || + bundleId == "com.brave.Browser" || + bundleId == "com.operasoftware.Opera" || + bundleId == "com.vivaldi.Vivaldi" + + let isFirefox = bundleId == "org.mozilla.firefox" + + // For Chromium browsers and Firefox: Prioritize AXWebArea tree walk + if isChromiumBrowser || isFirefox { + if let url = findURLInChildren(element: windowElement, maxDepth: CHROMIUM_URL_SEARCH_DEPTH) { + return url + } + // Fallback to window-level attributes if tree walk fails + if let url = AXHelpers.getStringAttribute(windowElement, kAXDocumentAttribute), !url.isEmpty { + return url + } + if let url = AXHelpers.getStringAttribute(windowElement, kAXURLAttribute), !url.isEmpty { + return url + } + return nil + } + + // For non-Chromium browsers: Try window-level attributes first (more reliable) + if let url = AXHelpers.getStringAttribute(windowElement, kAXDocumentAttribute), !url.isEmpty { + return url + } + + if let url = AXHelpers.getStringAttribute(windowElement, kAXURLAttribute), !url.isEmpty { + return url + } + + // Shallow tree walk as fallback for non-Chromium browsers + return findURLInChildren(element: windowElement, maxDepth: NON_CHROMIUM_URL_SEARCH_DEPTH) + } + + /// Find URL in children using BFS + private static func findURLInChildren(element: AXUIElement, maxDepth: Int) -> String? 
{ + var queue: [(element: AXUIElement, depth: Int)] = [(element, 0)] + + while !queue.isEmpty { + let (currentElement, currentDepth) = queue.removeFirst() + + guard currentDepth < maxDepth else { continue } + + let children = AXHelpers.getChildren(currentElement) + + for child in children { + let role = AXHelpers.getStringAttribute(child, kAXRoleAttribute) + + // Check address fields + if role == "AXTextField" || role == "AXComboBox" || role == "AXSafariAddressAndSearchField" { + if let value = AXHelpers.getStringAttribute(child, kAXValueAttribute), + !value.isEmpty, + (value.hasPrefix("http://") || value.hasPrefix("https://") || value.contains(".")) { + return value + } + } + + // Check web areas + if role == "AXWebArea" { + if let url = AXHelpers.getStringAttribute(child, kAXURLAttribute), !url.isEmpty { + return url + } + if let url = AXHelpers.getStringAttribute(child, kAXDocumentAttribute), !url.isEmpty { + return url + } + } + + queue.append((child, currentDepth + 1)) + } + } + + return nil + } +} diff --git a/packages/native-helpers/swift-helper/Sources/SwiftHelper/services/PermissionsService.swift b/packages/native-helpers/swift-helper/Sources/SwiftHelper/services/PermissionsService.swift new file mode 100644 index 0000000..8761645 --- /dev/null +++ b/packages/native-helpers/swift-helper/Sources/SwiftHelper/services/PermissionsService.swift @@ -0,0 +1,49 @@ +import Foundation +import ApplicationServices + +// ============================================================================= +// PermissionsService - Accessibility Permission Management +// ============================================================================= +// Handles checking and requesting accessibility permissions. 
// =============================================================================

/// Entry point for querying and requesting accessibility permission.
/// Every call is forwarded to `AXHelpers.checkAccessibilityPermissions(prompt:)`.
class PermissionsService {

    // MARK: - Permission Check

    /// Reports whether accessibility permission is currently granted.
    /// - Parameter prompt: Forwarded to `AXHelpers`; when true the system prompt is
    ///   requested. Defaults to false.
    /// - Returns: The value returned by `AXHelpers.checkAccessibilityPermissions`.
    static func checkPermissions(prompt: Bool = false) -> Bool {
        let isGranted = AXHelpers.checkAccessibilityPermissions(prompt: prompt)
        return isGranted
    }

    // MARK: - Permission Status

    /// Builds the permission status without prompting the user.
    /// - Returns: A result whose `hasPermission` reflects the current check and whose
    ///   `isEnabled` is always true.
    static func getStatus() -> GetAccessibilityStatusResult {
        // On macOS accessibility is treated as always enabled system-wide; the only
        // variable is whether this app has been granted permission.
        return GetAccessibilityStatusResult(
            hasPermission: checkPermissions(prompt: false),
            isEnabled: true
        )
    }

    // MARK: - Request Permission

    /// Runs the permission check with the system prompt enabled.
    /// - Returns: A result whose `granted` is the value the check returned.
    /// NOTE(review): presumably the check returns before the user has answered the
    /// prompt, so `granted` may still be false on the first request; confirm against
    /// `AXHelpers`.
    static func requestPermission() -> RequestAccessibilityPermissionResult {
        let outcome = RequestAccessibilityPermissionResult(
            granted: checkPermissions(prompt: true)
        )
        return outcome
    }
}
============================================================================= +// Implements the Phase 1 extraction algorithm with TextMarker as primary path. +// This enables text selection extraction in Electron/Chromium apps where +// AXSelectedTextRange fails. +// ============================================================================= + +/// Result from TextMarker extraction attempt +struct TextMarkerResult { + let selectedText: String? + let selectionRange: SelectionRange? + let hasMultipleRanges: Bool +} + +/// Service for extracting text selection from focused elements +class SelectionExtractor { + + // MARK: - Main Extraction Entry Point + + /// Extract text selection from an element using multi-path algorithm + /// - Parameters: + /// - element: The AXUIElement to extract from (focused element) + /// - metricsBuilder: Builder to record extraction metrics + /// - Returns: AccessibilityTextSelection or nil if no text selection available + static func extract(from element: AXUIElement, metricsBuilder: ExtractionMetricsBuilder) -> AccessibilityTextSelection? { + let builder = TextSelectionBuilder() + + // Track both original focused element and the element we extract from + let focusedElement = element + var extractionElement = element + + // Step 2: Check if element is editable (check original focused element) + let focusedIsEditable = AXHelpers.isElementEditable(focusedElement) + + // Step 2.1: SECURE FIELD CHECK - suppress all content if secure + if AXHelpers.isSecureField(focusedElement) { + return TextSelectionBuilder.secureField(isEditable: focusedIsEditable) + } + + // Variables to track extraction state + var selectionRange: AccessibilitySelectionRange? = nil + var selectedText: String? = nil + var fullContent: String? 
= nil + var hasMultipleRanges = false + var extractionMethod: ExtractionMethod = .none + + // Step 4: EXTRACTION (Priority Order) + + // Path A: TextMarker (PRIMARY - works in Electron) + metricsBuilder.textMarkerAttempted = true + if let textMarkerResult = extractViaTextMarker(element: focusedElement, metricsBuilder: metricsBuilder) { + metricsBuilder.textMarkerSucceeded = true + selectedText = textMarkerResult.selectedText + selectionRange = textMarkerResult.selectionRange + hasMultipleRanges = textMarkerResult.hasMultipleRanges + extractionMethod = .textMarkerRange + } + + // WebArea Retry Path: When TextMarker fails on focused element + if extractionMethod == .none { + // TextMarker failed - search for a better WebArea + metricsBuilder.webAreaRetryAttempted = true + + if let webArea = findWebArea(from: focusedElement) { + metricsBuilder.webAreaFound = true + + // Try TextMarker on WebArea + if let webAreaTextMarkerResult = extractViaTextMarker(element: webArea, metricsBuilder: metricsBuilder) { + // TextMarker SUCCEEDED on WebArea - now switch extraction element + metricsBuilder.textMarkerSucceeded = true // Mark overall TextMarker as succeeded + metricsBuilder.webAreaRetrySucceeded = true + extractionElement = webArea + selectedText = webAreaTextMarkerResult.selectedText + selectionRange = webAreaTextMarkerResult.selectionRange + hasMultipleRanges = webAreaTextMarkerResult.hasMultipleRanges + extractionMethod = .textMarkerRange + } + // If TextMarker fails on WebArea, DON'T switch extractionElement + // Keep using focusedElement for fallbacks (it has the content, even if noisy) + } + } + + // Descendant Text Element Path: When both TextMarker attempts fail + // Try to find the actual text element inside the container (e.g., in Notion) + if extractionMethod == .none { + if let deepTextElement = AXHelpers.findDeepestTextElement(from: focusedElement) { + // Found a deeper text element - try extraction on it + if let textMarkerResult = 
extractViaTextMarker(element: deepTextElement, metricsBuilder: metricsBuilder) { + metricsBuilder.textMarkerSucceeded = true // Mark TextMarker as succeeded + extractionElement = deepTextElement + selectedText = textMarkerResult.selectedText + selectionRange = textMarkerResult.selectionRange + hasMultipleRanges = textMarkerResult.hasMultipleRanges + extractionMethod = .textMarkerRange + } else if let rangeResult = extractViaSelectedTextRange(element: deepTextElement) { + // TextMarker failed but SelectedTextRange works - use this element + // This should give us cleaner content without UI labels + extractionElement = deepTextElement + selectedText = rangeResult.selectedText + selectionRange = rangeResult.selectionRange + extractionMethod = .selectedTextRange + } + } + } + + // Path B: SelectedTextRange (Fallback 1) - use extractionElement + if extractionMethod == .none { + metricsBuilder.recordFallback(.selectedTextRange) + if let result = extractViaSelectedTextRange(element: extractionElement) { + selectedText = result.selectedText + selectionRange = result.selectionRange + extractionMethod = .selectedTextRange + } + } + + // Path C: SelectedTextRanges (Fallback 2 - Multi-select) - use extractionElement + if extractionMethod == .none { + metricsBuilder.recordFallback(.selectedTextRanges) + if let result = extractViaSelectedTextRanges(element: extractionElement) { + selectedText = result.selectedText + selectionRange = result.selectionRange + hasMultipleRanges = result.hasMultipleRanges + extractionMethod = .selectedTextRanges + } + } + + // Path D: Value Attribute (Fallback 3) - use extractionElement + if extractionMethod == .none { + metricsBuilder.recordFallback(.valueAttribute) + if let value = AXHelpers.getStringAttribute(extractionElement, kAXValueAttribute) { + fullContent = value + extractionMethod = .valueAttribute + // Note: No selectionRange available from this path + } + } + + // Path E: StringForRange (Fallback 4) - use extractionElement + if 
extractionMethod == .none { + metricsBuilder.recordFallback(.stringForRange) + if let charCount = AXHelpers.getNumberOfCharacters(extractionElement) { + if charCount == 0 { + fullContent = "" + extractionMethod = .stringForRange + } else if charCount > 0 { + let range = CFRange(location: 0, length: charCount) + if let content = AXHelpers.getStringForRange(extractionElement, range: range) { + fullContent = content + extractionMethod = .stringForRange + } + } + } + } + + // If no extraction succeeded at all, return nil + if extractionMethod == .none { + return nil + } + + // Step 5: FULL CONTENT RETRIEVAL (if not already obtained) - use extractionElement + if fullContent == nil && selectionRange != nil { + // Try AXValue first + fullContent = AXHelpers.getStringAttribute(extractionElement, kAXValueAttribute) + + // If fails, try AXStringForRange + if fullContent == nil, let charCount = AXHelpers.getNumberOfCharacters(extractionElement), charCount > 0 { + let range = CFRange(location: 0, length: charCount) + fullContent = AXHelpers.getStringForRange(extractionElement, range: range) + } + } + + // Step 3: PLACEHOLDER CHECK (non-blocking) + // Use TextMarker-derived length if available, fall back to AXSelectedTextRange + var selectionLength: Int? 
= selectionRange?.length + if selectionLength == nil { + if let cfRange = AXHelpers.getSelectedTextRange(extractionElement) { + selectionLength = cfRange.length + } + } + // OR logic: check placeholder on BOTH elements + let focusedIsPlaceholder = AXHelpers.isPlaceholderShowing(focusedElement, selectionLength: nil) + let extractionIsPlaceholder = AXHelpers.isPlaceholderShowing(extractionElement, selectionLength: selectionLength) + builder.isPlaceholder = focusedIsPlaceholder || extractionIsPlaceholder + + // OR logic for isEditable: editable if EITHER element is editable + let extractionIsEditable = AXHelpers.isElementEditable(extractionElement) + builder.isEditable = focusedIsEditable || extractionIsEditable + + // Step 5.1: SELECTION RANGE VALIDATION + if var range = selectionRange, let content = fullContent { + let contentLength = content.utf16.count + let originalLocation = range.location + let originalLength = range.length + + // Clamp to valid bounds + let clampedLocation = AXHelpers.clamp(originalLocation, min: 0, max: contentLength) + let maxLength = contentLength - clampedLocation + let clampedLength = AXHelpers.clamp(originalLength, min: 0, max: maxLength) + + // Log if clamping occurred (no PII) + if originalLocation != clampedLocation || originalLength != clampedLength { + metricsBuilder.recordError("SelectionRange clamped: original exceeded content bounds") + } + + selectionRange = SelectionRange(length: clampedLength, location: clampedLocation) + + // Step 5.2: RE-DERIVE selectedText when no windowing needed + if contentLength <= MAX_FULL_CONTENT_LENGTH { + if clampedLength == 0 { + selectedText = "" + } else { + selectedText = AXHelpers.substringUTF16(content, start: clampedLocation, length: clampedLength) + } + } + } + + // Step 6: CONTENT WINDOWING + var fullContentTruncated = false + if var content = fullContent, content.utf16.count > MAX_FULL_CONTENT_LENGTH { + let result = windowContent( + content: content, + selectionRange: selectionRange, + 
metricsBuilder: metricsBuilder + ) + fullContent = result.windowedContent + selectionRange = result.adjustedRange + selectedText = result.selectedText + fullContentTruncated = true + } + + // Step 7: CONTEXT COMPUTATION + var preSelectionText: String? = nil + var postSelectionText: String? = nil + + if let range = selectionRange, let content = fullContent { + let location = range.location + let length = range.length + let contentLength = content.utf16.count + + // Pre-selection text + if location == 0 { + preSelectionText = "" + } else { + let preStart = max(0, location - MAX_CONTEXT_LENGTH) + let preLength = location - preStart + preSelectionText = AXHelpers.substringUTF16(content, start: preStart, length: preLength) + } + + // Post-selection text + let postStart = location + length + if postStart >= contentLength { + postSelectionText = "" + } else { + let postLength = min(MAX_CONTEXT_LENGTH, contentLength - postStart) + postSelectionText = AXHelpers.substringUTF16(content, start: postStart, length: postLength) + } + } else if let range = selectionRange, fullContent == nil { + // Per spec: when selectionRange exists but fullContent is nil, + // compute pre/post via AXStringForRange + let location = range.location + let length = range.length + + // Pre-selection text via AXStringForRange + if location == 0 { + preSelectionText = "" + } else { + let preStart = max(0, location - MAX_CONTEXT_LENGTH) + let preLength = location - preStart + let preRange = CFRange(location: preStart, length: preLength) + preSelectionText = AXHelpers.getStringForRange(extractionElement, range: preRange) + } + + // Post-selection text via AXStringForRange + let postStart = location + length + // We don't know total length, so just try to get MAX_CONTEXT_LENGTH + let postRange = CFRange(location: postStart, length: MAX_CONTEXT_LENGTH) + postSelectionText = AXHelpers.getStringForRange(extractionElement, range: postRange) + } + + // Build final result + builder.selectedText = selectedText + 
/// Extract the current selection through the TextMarker accessibility APIs
/// (the path that works for Electron/Chromium content).
///
/// The single-range attribute (`AXSelectedTextMarkerRange`) is attempted first;
/// the multi-range attribute (`AXSelectedTextMarkerRanges`) is consulted only
/// when the single-range attempt yields nothing.
///
/// - Parameters:
///   - element: Element to query.
///   - metricsBuilder: Passed through to both extraction attempts.
/// - Returns: The first non-nil extraction result, or `nil` if both attempts fail.
private static func extractViaTextMarker(element: AXUIElement, metricsBuilder: ExtractionMetricsBuilder) -> TextMarkerResult? {
    // `??` evaluates its right-hand side lazily, so the multi-range query
    // runs only after the single-range query has returned nil.
    return extractViaSingleTextMarkerRange(element: element, metricsBuilder: metricsBuilder)
        ?? extractViaMultiTextMarkerRanges(element: element, metricsBuilder: metricsBuilder)
}
/// Resolve a TextMarker range into UTF-16-style indices and the selected text.
///
/// Steps, in order: start marker, end marker, start index, end index, index
/// validation, then the text for the range. Any failure up to and including
/// index validation records an error on `metricsBuilder` and returns `nil`.
/// A failure to read the text does NOT abort: the result is still returned,
/// with `selectedText` left `nil` (or `""` when the range is a bare cursor).
///
/// - Parameters:
///   - markerRange: The opaque marker-range value obtained from the element.
///   - element: Element the marker range belongs to.
///   - metricsBuilder: Receives a message for every failed step.
///   - hasMultipleRanges: Forwarded unchanged into the result.
/// - Returns: The extracted selection, or `nil` if markers/indices could not be resolved.
private static func extractFromMarkerRange(_ markerRange: CFTypeRef, element: AXUIElement, metricsBuilder: ExtractionMetricsBuilder, hasMultipleRanges: Bool) -> TextMarkerResult? {
    // Thin wrapper that keeps the AXError so each step can log it.
    func query(_ attribute: String, _ parameter: CFTypeRef) -> (status: AXError, value: CFTypeRef?) {
        var out: CFTypeRef?
        let status = AXUIElementCopyParameterizedAttributeValue(element, attribute as CFString, parameter, &out)
        return (status, out)
    }

    // 2. Start marker of the range
    let startLookup = query("AXStartTextMarkerForTextMarkerRange", markerRange)
    guard startLookup.status == .success, let startMarker = startLookup.value else {
        metricsBuilder.recordError("TextMarker: AXStartTextMarkerForTextMarkerRange failed, AXError=\(startLookup.status.rawValue)")
        return nil
    }

    // 3. End marker of the range
    let endLookup = query("AXEndTextMarkerForTextMarkerRange", markerRange)
    guard endLookup.status == .success, let endMarker = endLookup.value else {
        metricsBuilder.recordError("TextMarker: AXEndTextMarkerForTextMarkerRange failed, AXError=\(endLookup.status.rawValue)")
        return nil
    }

    // 4. Markers -> integer indices
    let startIndexLookup = query("AXIndexForTextMarker", startMarker)
    guard startIndexLookup.status == .success, let startNumber = startIndexLookup.value as? NSNumber else {
        metricsBuilder.recordError("TextMarker: AXIndexForTextMarker (start) failed, AXError=\(startIndexLookup.status.rawValue)")
        return nil
    }

    let endIndexLookup = query("AXIndexForTextMarker", endMarker)
    guard endIndexLookup.status == .success, let endNumber = endIndexLookup.value as? NSNumber else {
        metricsBuilder.recordError("TextMarker: AXIndexForTextMarker (end) failed, AXError=\(endIndexLookup.status.rawValue)")
        return nil
    }

    let startIndex = startNumber.intValue
    let endIndex = endNumber.intValue

    // Reject negative indices and inverted ranges.
    guard startIndex >= 0, endIndex >= 0 else {
        metricsBuilder.recordError("TextMarker: Invalid indices - negative values (start=\(startIndex), end=\(endIndex))")
        return nil
    }
    guard endIndex >= startIndex else {
        metricsBuilder.recordError("TextMarker: Invalid indices - end < start (start=\(startIndex), end=\(endIndex))")
        return nil
    }

    let length = endIndex - startIndex

    // 5. Text covered by the marker range; the attribute may come back as an
    //    attributed string or as a plain string.
    let textLookup = query("AXAttributedStringForTextMarkerRange", markerRange)
    let selectedText: String?
    if textLookup.status == .success, let attributed = textLookup.value as? NSAttributedString {
        selectedText = attributed.string
    } else if textLookup.status == .success, let plain = textLookup.value as? String {
        selectedText = plain
    } else if length == 0 {
        // Bare cursor: nothing selected, so an empty string is the right answer.
        selectedText = ""
    } else {
        metricsBuilder.recordError("TextMarker: AXAttributedStringForTextMarkerRange failed, AXError=\(textLookup.status.rawValue)")
        selectedText = nil
    }

    return TextMarkerResult(
        selectedText: selectedText,
        selectionRange: SelectionRange(length: length, location: startIndex),
        hasMultipleRanges: hasMultipleRanges
    )
}
/// Extract the selection from `AXSelectedTextRanges` (multi-selection).
///
/// All ranges that decode as `CFRange` are ordered by ascending location and
/// the first one becomes the primary selection. Text for the primary range is
/// read with `AXHelpers.getStringForRange`, except for a zero-length range,
/// which yields `""` without a query.
///
/// - Parameter element: Element to query.
/// - Returns: The primary selection, or `nil` when the attribute is missing,
///   empty, or none of its values decode as a `CFRange`.
private static func extractViaSelectedTextRanges(element: AXUIElement) -> TextMarkerResult? {
    var rawRanges: CFTypeRef?
    let status = AXUIElementCopyAttributeValue(
        element,
        "AXSelectedTextRanges" as CFString,
        &rawRanges
    )

    guard status == .success, let axValues = rawRanges as? [AXValue], !axValues.isEmpty else {
        return nil
    }

    // Keep only the values that really carry a CFRange.
    let decoded: [CFRange] = axValues.compactMap { axValue in
        var decodedRange = CFRange()
        return AXValueGetValue(axValue, .cfRange, &decodedRange) ? decodedRange : nil
    }

    // Lowest location wins as the primary range.
    let ordered = decoded.sorted { $0.location < $1.location }
    guard let primary = ordered.first else { return nil }

    let selectedText: String? = primary.length == 0
        ? ""
        : AXHelpers.getStringForRange(element, range: primary)

    return TextMarkerResult(
        selectedText: selectedText,
        selectionRange: SelectionRange(length: primary.length, location: primary.location),
        hasMultipleRanges: ordered.count > 1
    )
}
/// Choose the WebArea to extract from.
///
/// Candidates are ranked in tiers; the first non-empty tier decides:
/// 1. has a marker range AND is related to the focused element
/// 2. has a marker range, not related to focus
/// 3. related to focus, no marker range
/// 4. anything left
///
/// Within a tier, the non-ancestor candidate with the greatest `depth` is
/// preferred; if the tier holds only ancestors, the ancestor with the greatest
/// `depth` value is used.
///
/// "Related to focus" means the app-level focused element is a descendant of
/// (or equal to) the candidate, or the candidate is a descendant of (or equal
/// to) the app-level focused element. When no app-level focused element can be
/// obtained, no candidate counts as related.
///
/// - Parameters:
///   - candidates: WebAreas collected from ancestors and descendants.
///   - focusedElement: Used only to look up the owning process.
/// - Returns: The chosen WebArea, or `nil` when `candidates` is empty.
private static func selectBestWebArea(
    from candidates: [WebAreaCandidate],
    focusedElement: AXUIElement
) -> AXUIElement? {
    guard !candidates.isEmpty else { return nil }

    // App-level focus is what containment is validated against.
    let appFocusedElement = AXHelpers.getAppFocusedElement(forPid: AXHelpers.getPid(focusedElement))

    struct Ranked {
        let candidate: WebAreaCandidate
        let hasMarkerRange: Bool
        let containsFocus: Bool
    }

    let ranked: [Ranked] = candidates.map { entry in
        var relatedToFocus = false
        if let focused = appFocusedElement {
            relatedToFocus = AXHelpers.isDescendantOrEqual(focused, of: entry.element) ||
                AXHelpers.isDescendantOrEqual(entry.element, of: focused)
        }
        return Ranked(
            candidate: entry,
            hasMarkerRange: AXHelpers.hasTextMarkerRange(entry.element),
            containsFocus: relatedToFocus
        )
    }

    // Shared tie-break: greatest-depth non-ancestor, else greatest-depth ancestor.
    func preferred(in pool: [WebAreaCandidate]) -> AXUIElement? {
        let shallowerFirst: (WebAreaCandidate, WebAreaCandidate) -> Bool = { $0.depth < $1.depth }
        if let deepest = pool.filter({ !$0.isAncestor }).max(by: shallowerFirst) {
            return deepest.element
        }
        return pool.filter({ $0.isAncestor }).max(by: shallowerFirst)?.element
    }

    let tiers: [(Ranked) -> Bool] = [
        { $0.hasMarkerRange && $0.containsFocus },   // 1. marker + focus
        { $0.hasMarkerRange && !$0.containsFocus },  // 2. marker only
        { $0.containsFocus && !$0.hasMarkerRange }   // 3. focus only
    ]

    for tier in tiers {
        let pool = ranked.filter(tier).map { $0.candidate }
        if let winner = preferred(in: pool) {
            return winner
        }
    }

    // 4. Fallback over every candidate.
    return preferred(in: candidates)
}
"" + + return WindowResult( + windowedContent: headContent + delimiter + tailContent, + adjustedRange: nil, + selectedText: nil + ) + } + + let location = range.location + let length = range.length + + // CASE B: Selection exceeds max - clamp to selection start + if length > MAX_FULL_CONTENT_LENGTH { + var windowStart = location + var windowEnd = min(location + MAX_FULL_CONTENT_LENGTH, totalLength) + + // Adjust for surrogate pairs FIRST + windowStart = AXHelpers.adjustForSurrogatePairs(content, offset: windowStart, direction: .forward) + windowEnd = AXHelpers.adjustForSurrogatePairs(content, offset: windowEnd, direction: .backward) + + let windowedContent = AXHelpers.substringUTF16(content, start: windowStart, length: windowEnd - windowStart) ?? "" + let windowLength = windowedContent.utf16.count + + // Compute adjusted range (clamp location FIRST) + let rawLocation = location - windowStart + let adjustedLocation = AXHelpers.clamp(rawLocation, min: 0, max: windowLength) + let maxPossibleLength = windowLength - adjustedLocation + let adjustedLength = AXHelpers.clamp(length, min: 0, max: maxPossibleLength) + + let selectedText = AXHelpers.substringUTF16(windowedContent, start: adjustedLocation, length: adjustedLength) + + return WindowResult( + windowedContent: windowedContent, + adjustedRange: SelectionRange(length: adjustedLength, location: adjustedLocation), + selectedText: selectedText + ) + } + + // CASE C: Selection fits - window around selection + var windowStart = max(0, location - WINDOW_PADDING) + var windowEnd = min(totalLength, location + length + WINDOW_PADDING) + + // Shrink symmetrically if needed + if windowEnd - windowStart > MAX_FULL_CONTENT_LENGTH { + let selectionCenter = location + length / 2 + windowStart = max(0, selectionCenter - MAX_FULL_CONTENT_LENGTH / 2) + windowEnd = min(totalLength, windowStart + MAX_FULL_CONTENT_LENGTH) + windowStart = max(0, windowEnd - MAX_FULL_CONTENT_LENGTH) + } + + // Adjust for surrogate pairs FIRST + windowStart 
= AXHelpers.adjustForSurrogatePairs(content, offset: windowStart, direction: .forward) + windowEnd = AXHelpers.adjustForSurrogatePairs(content, offset: windowEnd, direction: .backward) + + let windowedContent = AXHelpers.substringUTF16(content, start: windowStart, length: windowEnd - windowStart) ?? "" + let windowLength = windowedContent.utf16.count + + // Compute adjusted range (clamp location FIRST) + let rawLocation = location - windowStart + let adjustedLocation = AXHelpers.clamp(rawLocation, min: 0, max: windowLength) + let maxPossibleLength = windowLength - adjustedLocation + let adjustedLength = AXHelpers.clamp(length, min: 0, max: maxPossibleLength) + + let selectedText = AXHelpers.substringUTF16(windowedContent, start: adjustedLocation, length: adjustedLength) + + return WindowResult( + windowedContent: windowedContent, + adjustedRange: SelectionRange(length: adjustedLength, location: adjustedLocation), + selectedText: selectedText + ) + } +} diff --git a/packages/native-helpers/swift-helper/Sources/SwiftHelper/utils/AXHelpers.swift b/packages/native-helpers/swift-helper/Sources/SwiftHelper/utils/AXHelpers.swift new file mode 100644 index 0000000..3519544 --- /dev/null +++ b/packages/native-helpers/swift-helper/Sources/SwiftHelper/utils/AXHelpers.swift @@ -0,0 +1,656 @@ +import Foundation +import ApplicationServices +import AppKit + +// ============================================================================= +// AXHelpers - Common Accessibility API Utilities +// ============================================================================= +// Shared utilities for working with macOS Accessibility APIs. +// Extracted from AccessibilityContextService for reuse in v2 implementation. 
/// Read an attribute and render it as a string.
///
/// - String values are returned as-is.
/// - Boolean values (CFBoolean) are rendered as `"true"` / `"false"`.
/// - Other numeric values are rendered with `NSNumber.stringValue`.
///
/// - Parameters:
///   - element: Element to query.
///   - attribute: Accessibility attribute name.
/// - Returns: The string form of the value, or `nil` when the query fails or
///   the value is of any other type.
static func getStringAttribute(_ element: AXUIElement, _ attribute: String) -> String? {
    var rawValue: CFTypeRef?
    let error = AXUIElementCopyAttributeValue(element, attribute as CFString, &rawValue)

    guard error == .success, let value = rawValue else { return nil }

    if let stringValue = value as? String {
        return stringValue
    }

    // CFBoolean bridges to NSNumber, so it must be recognised BEFORE the
    // NSNumber cast; otherwise booleans are rendered as "1"/"0" and the
    // boolean branch can never run.
    if CFGetTypeID(value) == CFBooleanGetTypeID(), let boolValue = value as? Bool {
        return boolValue ? "true" : "false"
    }

    if let numberValue = value as? NSNumber {
        return numberValue.stringValue
    }

    return nil
}
/// List the attribute names an element reports.
///
/// - Parameter element: Element to query.
/// - Returns: The reported names, or an empty array when the query fails or
///   the result cannot be read as an array of strings.
static func getAttributeNames(_ element: AXUIElement) -> [String] {
    var reported: CFArray?
    guard AXUIElementCopyAttributeNames(element, &reported) == .success else {
        return []
    }
    return (reported as? [String]) ?? []
}
/// Report whether an element is a secure (password-style) field.
///
/// An element counts as secure when its subrole is exactly
/// `"AXSecureTextField"`, or when its role string contains `"Secure"`.
/// The role is only queried if the subrole check does not already match.
///
/// - Parameter element: Element to inspect.
/// - Returns: `true` if either check matches, otherwise `false`.
static func isSecureField(_ element: AXUIElement) -> Bool {
    // Subrole is the common case, so it is checked first.
    if getStringAttribute(element, kAXSubroleAttribute) == "AXSecureTextField" {
        return true
    }

    // A missing role means "not secure".
    return getStringAttribute(element, kAXRoleAttribute)?.contains("Secure") ?? false
}
/// Report whether an element can carry a text selection.
///
/// An element is text-capable when any of the following holds:
/// - it exposes `AXSelectedTextMarkerRange`
/// - it exposes the standard selected-text-range attribute
/// - it exposes a value attribute AND its role is one of `editableRoles`
/// - its role is `"AXWebArea"`
///
/// - Parameter element: Element to inspect.
/// - Returns: `true` if any rule above matches, otherwise `false`.
static func isTextCapable(_ element: AXUIElement) -> Bool {
    // Fetch the attribute list once: every lookup through `hasAttribute`
    // re-queries the element, and this check needs up to three lookups.
    let attributes = Set(getAttributeNames(element))

    if attributes.contains("AXSelectedTextMarkerRange") ||
        attributes.contains(kAXSelectedTextRangeAttribute) {
        return true
    }

    // Both remaining rules depend on the role; without one, nothing can match.
    guard let role = getStringAttribute(element, kAXRoleAttribute) else {
        return false
    }

    if attributes.contains(kAXValueAttribute) && editableRoles.contains(role) {
        return true
    }

    return role == "AXWebArea"
}
/// Read the selected-text-range attribute as a `CFRange`.
///
/// - Parameter element: Element to query.
/// - Returns: The range, or `nil` when the query fails, the value is not an
///   `AXValue`, or the `AXValue` does not hold a `CFRange`.
static func getSelectedTextRange(_ element: AXUIElement) -> CFRange? {
    var rawValue: CFTypeRef?
    let error = AXUIElementCopyAttributeValue(element, kAXSelectedTextRangeAttribute as CFString, &rawValue)

    // Casts to Core Foundation types are unchecked, so confirm the runtime
    // type before handing the value to AXValueGetValue. This mirrors the
    // type-ID check `getParent` performs for AXUIElement.
    guard error == .success,
          let value = rawValue,
          CFGetTypeID(value) == AXValueGetTypeID() else {
        return nil
    }

    let axValue = value as! AXValue  // safe: type ID verified above

    var range = CFRange()
    return AXValueGetValue(axValue, .cfRange, &range) ? range : nil
}
/// Nudge a UTF-16 offset so that it does not fall inside a surrogate pair.
///
/// The offset is moved by one code unit when either
/// - the code unit AT `offset` is a trail (low) surrogate, or
/// - the code unit just BEFORE `offset` is a lead (high) surrogate.
///
/// `.forward` moves the offset up by one, `.backward` moves it down by one.
/// Offsets at or outside the ends of the string (`<= 0` or `>= count`) are
/// returned unchanged.
///
/// - Parameters:
///   - content: String whose UTF-16 view is inspected.
///   - offset: UTF-16 offset to adjust.
///   - direction: Which way to move when an adjustment is needed.
/// - Returns: The adjusted offset.
static func adjustForSurrogatePairs(_ content: String, offset: Int, direction: SurrogatePairDirection) -> Int {
    let units = content.utf16
    guard offset > 0 && offset < units.count else { return offset }

    let here = units.index(units.startIndex, offsetBy: offset)

    // The guard above ensures a preceding code unit exists.
    let needsShift = UTF16.isTrailSurrogate(units[here]) ||
        UTF16.isLeadSurrogate(units[units.index(before: here)])
    guard needsShift else { return offset }

    switch direction {
    case .forward:
        return offset + 1
    case .backward:
        return offset - 1
    }
}
/// Look up the short version string of the application that owns a process.
///
/// - Parameter pid: Process identifier.
/// - Returns: The `CFBundleShortVersionString` entry from the application
///   bundle's info dictionary, or `nil` when the application, its bundle URL,
///   the bundle, or the entry is unavailable.
static func getApplicationVersion(pid: pid_t) -> String? {
    guard let bundleURL = getRunningApplication(pid: pid)?.bundleURL else {
        return nil
    }
    return Bundle(url: bundleURL)?.infoDictionary?["CFBundleShortVersionString"] as? String
}
/// Get the focused UI element of the application that owns a process.
///
/// - Parameter pid: Process identifier; `nil` and non-positive values are rejected.
/// - Returns: The application's focused element, or `nil` when the pid is
///   unusable, the query fails, or the returned value is not an `AXUIElement`.
static func getAppFocusedElement(forPid pid: pid_t?) -> AXUIElement? {
    guard let pid = pid, pid > 0 else { return nil }

    let application = AXUIElementCreateApplication(pid)
    var focusedRef: CFTypeRef?
    let error = AXUIElementCopyAttributeValue(
        application,
        kAXFocusedUIElementAttribute as CFString,
        &focusedRef
    )

    // Casts to Core Foundation types are unchecked, so verify the runtime
    // type first -- the same check `getParent` applies to its result.
    guard error == .success,
          let focused = focusedRef,
          CFGetTypeID(focused) == AXUIElementGetTypeID() else {
        return nil
    }

    return (focused as! AXUIElement)  // safe: type ID verified above
}
= elementA + var depth = 0 + let maxDepth = DESCENDANT_CHECK_MAX_DEPTH // Prevent infinite loops + + while let element = current, depth < maxDepth { + if let parent = getParent(element) { + if CFEqual(parent, elementB) { + return true + } + current = parent + depth += 1 + } else { + break + } + } + + return false + } + + /// Check if an element has a text marker range attribute (single or multi-range) + /// Returns true if: + /// - AXSelectedTextMarkerRange is present (not nil, length=0 is valid cursor), OR + /// - AXSelectedTextMarkerRanges array has at least one range + static func hasTextMarkerRange(_ element: AXUIElement) -> Bool { + // Check single range (AXSelectedTextMarkerRange) + var singleRangeRef: CFTypeRef? + let singleError = AXUIElementCopyAttributeValue( + element, + "AXSelectedTextMarkerRange" as CFString, + &singleRangeRef + ) + if singleError == .success && singleRangeRef != nil { + return true + } + + // Check multi-range (AXSelectedTextMarkerRanges) + var multiRangeRef: CFTypeRef? + let multiError = AXUIElementCopyAttributeValue( + element, + "AXSelectedTextMarkerRanges" as CFString, + &multiRangeRef + ) + if multiError == .success, let ranges = multiRangeRef as? [Any], !ranges.isEmpty { + return true + } + + return false + } + + /// Find the descendant text element that actually has focus/cursor + /// Priority: AXFocused text element > element with non-zero selection > element with most content + /// - Parameters: + /// - element: Starting element (container) + /// - maxDepth: Maximum depth to search + /// - maxElements: Maximum elements to visit + /// - Returns: The focused text element, or nil if none found + static func findDeepestTextElement( + from element: AXUIElement, + maxDepth: Int = FIND_TEXT_ELEMENT_MAX_DEPTH, + maxElements: Int = FIND_TEXT_ELEMENT_MAX_ELEMENTS + ) -> AXUIElement? { + var focusedCandidate: AXUIElement? = nil // Element with AXFocused=true AND has value + var selectionCandidate: AXUIElement? 
= nil // Element with non-zero selection range + var fallbackCandidate: AXUIElement? = nil // Element with most content (fallback) + var fallbackContentLength: Int = 0 + var elementsVisited = 0 + + // BFS queue: (element, depth) + var queue: [(AXUIElement, Int)] = [(element, 0)] + + while !queue.isEmpty && elementsVisited < maxElements { + let (currentElement, currentDepth) = queue.removeFirst() + elementsVisited += 1 + + guard currentDepth < maxDepth else { continue } + + let children = getChildren(currentElement) + + for child in children { + // Check if this is a text element (has AXValue) + let value = getStringAttribute(child, kAXValueAttribute) + let hasValue = value != nil && !value!.isEmpty + + // Check if element has AXSelectedTextRange + let range = getSelectedTextRange(child) + let hasRange = range != nil + + // Priority 1: Check if this element has AXFocused=true AND has content + var focusedRef: CFTypeRef? + let focusedError = AXUIElementCopyAttributeValue(child, kAXFocusedAttribute as CFString, &focusedRef) + if focusedError == .success, let focused = focusedRef as? 
Bool, focused { + if hasValue && hasRange { + focusedCandidate = child + } + } + + // Priority 2: Check if selection range indicates cursor is here (non-zero location or has selection) + // IMPORTANT: Require non-empty content to be a valid candidate + if let r = range, hasValue { + if selectionCandidate == nil && (r.location > 0 || r.length > 0) { + // Verify the content can accommodate the selection + if let v = value, v.utf16.count >= r.location { + selectionCandidate = child + } + } + } + + // Priority 3: Fallback to element with most content that has a selection range + // IMPORTANT: Require non-empty content to be a valid candidate + if hasRange && hasValue, let v = value { + let contentLength = v.utf16.count + if contentLength > fallbackContentLength { + fallbackContentLength = contentLength + fallbackCandidate = child + } + } + + queue.append((child, currentDepth + 1)) + } + } + + // Return in priority order: focused > selection-based > most content + return focusedCandidate ?? selectionCandidate ?? 
fallbackCandidate + } + + /// BFS search for AXWebArea elements in descendants + /// - Parameters: + /// - element: Starting element for search + /// - excludeElement: Element to exclude from results (typically the focused element) + /// - maxDepth: Maximum depth to search (default 10) + /// - maxElements: Maximum elements to visit (default 200) + /// - Returns: Array of (WebArea, depth) tuples + static func findWebAreasInDescendants( + element: AXUIElement, + excludeElement: AXUIElement, + maxDepth: Int = FIND_WEB_AREAS_MAX_DEPTH, + maxElements: Int = FIND_WEB_AREAS_MAX_ELEMENTS + ) -> [(AXUIElement, Int)] { + var results: [(AXUIElement, Int)] = [] + var elementsVisited = 0 + + // BFS queue: (element, depth) + var queue: [(AXUIElement, Int)] = [(element, 0)] + + while !queue.isEmpty && elementsVisited < maxElements { + let (currentElement, currentDepth) = queue.removeFirst() + elementsVisited += 1 + + // Skip if we've exceeded max depth for children + guard currentDepth < maxDepth else { continue } + + let children = getChildren(currentElement) + + for child in children { + // Check if this child is an AXWebArea + if let role = getRole(child), role == "AXWebArea" { + // Exclude the original focused element + if !CFEqual(child, excludeElement) { + results.append((child, currentDepth + 1)) + } + } + + // Add child to queue for further exploration + queue.append((child, currentDepth + 1)) + } + } + + return results + } + + /// Walk up parent chain looking for AXWebArea elements + /// - Parameters: + /// - element: Starting element for search + /// - excludeElement: Element to exclude from results + /// - maxLevels: Maximum levels to traverse up (default 3) + /// - Returns: Array of (WebArea, depth) tuples where depth is negative (-1 = parent, -2 = grandparent) + static func findWebAreasInAncestors( + element: AXUIElement, + excludeElement: AXUIElement, + maxLevels: Int = 3 + ) -> [(AXUIElement, Int)] { + var results: [(AXUIElement, Int)] = [] + var current: 
AXUIElement? = element + var level = 0 + + while let currentElement = current, level < maxLevels { + guard let parent = getParent(currentElement) else { break } + level += 1 + + // Check if parent is AXWebArea + if let role = getRole(parent), role == "AXWebArea" { + // Exclude the original focused element + if !CFEqual(parent, excludeElement) { + results.append((parent, -level)) // Negative depth for ancestors + } + } + + current = parent + } + + return results + } +} diff --git a/packages/native-helpers/swift-helper/Sources/SwiftHelper/utils/Constants.swift b/packages/native-helpers/swift-helper/Sources/SwiftHelper/utils/Constants.swift new file mode 100644 index 0000000..0f6b9f5 --- /dev/null +++ b/packages/native-helpers/swift-helper/Sources/SwiftHelper/utils/Constants.swift @@ -0,0 +1,100 @@ +import Foundation +import CoreGraphics + +// ============================================================================= +// Constants - Centralized Configuration for Accessibility Extraction +// ============================================================================= +// All magic numbers, timeouts, depths, and configuration values in one place. +// This makes it easier to tune, document, and understand system behavior. 
+// ============================================================================= + +// MARK: - Content Limits + +/// Maximum UTF-16 code units for pre/post selection context +let MAX_CONTEXT_LENGTH = 500 + +/// Maximum UTF-16 code units for full content before truncation +let MAX_FULL_CONTENT_LENGTH = 50_000 + +/// Padding around selection when windowing content (UTF-16 code units) +let WINDOW_PADDING = 25_000 + +// MARK: - Tree Traversal Limits + +/// Default maximum depth for generic tree walks (BFS) +let TREE_WALK_MAX_DEPTH = 8 + +/// Maximum elements to visit during tree searches +let TREE_WALK_MAX_ELEMENTS = 100 + +/// Depth for touching descendants to trigger lazy loading +let TOUCH_DESCENDANTS_MAX_DEPTH = 3 + +/// Maximum children to touch per level during lazy loading +let TOUCH_DESCENDANTS_PREFIX_LIMIT = 8 + +/// Default depth for parent chain traversal +let PARENT_CHAIN_MAX_DEPTH = 10 + +/// Depth limit for descendant-or-equal check (infinite loop guard) +let DESCENDANT_CHECK_MAX_DEPTH = 20 + +/// Default depth for finding deepest text element +let FIND_TEXT_ELEMENT_MAX_DEPTH = 10 + +/// Maximum elements to visit when finding text element +let FIND_TEXT_ELEMENT_MAX_ELEMENTS = 200 + +/// Default depth for finding WebAreas in descendants +let FIND_WEB_AREAS_MAX_DEPTH = 10 + +/// Maximum elements to visit when finding WebAreas +let FIND_WEB_AREAS_MAX_ELEMENTS = 200 + +// MARK: - Browser-Specific Depths + +/// Depth for Chromium browser URL search (deeper due to complex DOM) +let CHROMIUM_URL_SEARCH_DEPTH = 30 + +/// Depth for non-Chromium browser URL search +let NON_CHROMIUM_URL_SEARCH_DEPTH = 3 + +/// Depth for WebArea ancestor search (increased for deeply nested Electron apps like Notion) +let WEB_AREA_ANCESTOR_SEARCH_DEPTH = 15 + +// MARK: - Timeouts + +/// Best-effort timeout for extraction (milliseconds) +let EXTRACTION_TIMEOUT_MS: Double = 600.0 + +/// Delay before restoring pasteboard after paste (seconds) +let PASTE_RESTORE_DELAY_SECONDS: Double = 
0.2 + +// MARK: - Virtual Key Codes (macOS) + +/// Virtual key code for 'V' key +let VK_V: CGKeyCode = 9 + +/// Virtual key code for Command key +let VK_COMMAND: CGKeyCode = 55 + +/// Virtual key code for Function (Fn) key +let VK_FUNCTION: CGKeyCode = 0x3F + +// MARK: - Accessibility Tree Building + +/// Maximum recursion depth for building accessibility tree +let ACCESSIBILITY_TREE_MAX_DEPTH = 10 + +// MARK: - App Lists + +/// Apps that need manual accessibility enabling (browsers) +let appsRequiringManualAX: Set = [ + "com.google.Chrome", + "org.mozilla.firefox", + "com.microsoft.edgemac", + "com.apple.Safari", + "com.brave.Browser", + "com.operasoftware.Opera", + "com.vivaldi.Vivaldi" +] diff --git a/packages/native-helpers/windows-helper/src/Models/Generated/Models.cs b/packages/native-helpers/windows-helper/src/Models/Generated/Models.cs index 8322b6d..eea4fdf 100644 --- a/packages/native-helpers/windows-helper/src/Models/Generated/Models.cs +++ b/packages/native-helpers/windows-helper/src/Models/Generated/Models.cs @@ -111,6 +111,12 @@ namespace WindowsHelper.Models [JsonPropertyName("focusedElement")] public FocusedElement FocusedElement { get; set; } + [JsonPropertyName("metrics")] + public Metrics Metrics { get; set; } + + [JsonPropertyName("schemaVersion")] + public SchemaVersion SchemaVersion { get; set; } + [JsonPropertyName("textSelection")] public TextSelection TextSelection { get; set; } @@ -129,6 +135,9 @@ namespace WindowsHelper.Models [JsonPropertyName("name")] public string Name { get; set; } + [JsonPropertyName("pid")] + public long Pid { get; set; } + [JsonPropertyName("version")] public string Version { get; set; } } @@ -141,9 +150,21 @@ namespace WindowsHelper.Models [JsonPropertyName("isEditable")] public bool IsEditable { get; set; } + [JsonPropertyName("isFocused")] + public bool IsFocused { get; set; } + + [JsonPropertyName("isPlaceholder")] + public bool IsPlaceholder { get; set; } + + [JsonPropertyName("isSecure")] + public bool 
IsSecure { get; set; } + [JsonPropertyName("role")] public string Role { get; set; } + [JsonPropertyName("subrole")] + public string Subrole { get; set; } + [JsonPropertyName("title")] public string Title { get; set; } @@ -151,14 +172,60 @@ namespace WindowsHelper.Models public string Value { get; set; } } + public partial class Metrics + { + [JsonPropertyName("errors")] + public List Errors { get; set; } + + [JsonPropertyName("fallbacksUsed")] + public List FallbacksUsed { get; set; } + + [JsonPropertyName("textMarkerAttempted")] + public bool TextMarkerAttempted { get; set; } + + [JsonPropertyName("textMarkerSucceeded")] + public bool TextMarkerSucceeded { get; set; } + + [JsonPropertyName("timedOut")] + public bool TimedOut { get; set; } + + [JsonPropertyName("totalTimeMs")] + [JsonConverter(typeof(MinMaxValueCheckConverter))] + public double TotalTimeMs { get; set; } + + [JsonPropertyName("webAreaFound")] + public bool WebAreaFound { get; set; } + + [JsonPropertyName("webAreaRetryAttempted")] + public bool WebAreaRetryAttempted { get; set; } + + [JsonPropertyName("webAreaRetrySucceeded")] + public bool WebAreaRetrySucceeded { get; set; } + } + public partial class TextSelection { + [JsonPropertyName("extractionMethod")] + public The0 ExtractionMethod { get; set; } + [JsonPropertyName("fullContent")] public string FullContent { get; set; } + [JsonPropertyName("fullContentTruncated")] + public bool FullContentTruncated { get; set; } + + [JsonPropertyName("hasMultipleRanges")] + public bool HasMultipleRanges { get; set; } + [JsonPropertyName("isEditable")] public bool IsEditable { get; set; } + [JsonPropertyName("isPlaceholder")] + public bool IsPlaceholder { get; set; } + + [JsonPropertyName("isSecure")] + public bool IsSecure { get; set; } + [JsonPropertyName("postSelectionText")] public string PostSelectionText { get; set; } @@ -457,7 +524,11 @@ namespace WindowsHelper.Models public bool? 
ShiftKey { get; set; } } - public enum Method { GetAccessibilityContext, GetAccessibilityTreeDetails, MuteSystemAudio, PasteText, RestoreSystemAudio, SetShortcuts }; + public enum Method { GetAccessibilityContext, GetAccessibilityStatus, GetAccessibilityTreeDetails, MuteSystemAudio, PasteText, RequestAccessibilityPermission, RestoreSystemAudio, SetShortcuts }; + + public enum The0 { ClipboardCopy, None, SelectedTextRange, SelectedTextRanges, StringForRange, TextMarkerRange, ValueAttribute }; + + public enum SchemaVersion { The20 }; public enum KeyDownEventType { KeyDown }; @@ -585,6 +656,8 @@ namespace WindowsHelper.Models Converters = { MethodConverter.Singleton, + The0Converter.Singleton, + SchemaVersionConverter.Singleton, KeyDownEventTypeConverter.Singleton, KeyUpEventTypeConverter.Singleton, FlagsChangedEventTypeConverter.Singleton, @@ -607,12 +680,16 @@ namespace WindowsHelper.Models { case "getAccessibilityContext": return Method.GetAccessibilityContext; + case "getAccessibilityStatus": + return Method.GetAccessibilityStatus; case "getAccessibilityTreeDetails": return Method.GetAccessibilityTreeDetails; case "muteSystemAudio": return Method.MuteSystemAudio; case "pasteText": return Method.PasteText; + case "requestAccessibilityPermission": + return Method.RequestAccessibilityPermission; case "restoreSystemAudio": return Method.RestoreSystemAudio; case "setShortcuts": @@ -628,6 +705,9 @@ namespace WindowsHelper.Models case Method.GetAccessibilityContext: JsonSerializer.Serialize(writer, "getAccessibilityContext", options); return; + case Method.GetAccessibilityStatus: + JsonSerializer.Serialize(writer, "getAccessibilityStatus", options); + return; case Method.GetAccessibilityTreeDetails: JsonSerializer.Serialize(writer, "getAccessibilityTreeDetails", options); return; @@ -637,6 +717,9 @@ namespace WindowsHelper.Models case Method.PasteText: JsonSerializer.Serialize(writer, "pasteText", options); return; + case Method.RequestAccessibilityPermission: + 
JsonSerializer.Serialize(writer, "requestAccessibilityPermission", options); + return; case Method.RestoreSystemAudio: JsonSerializer.Serialize(writer, "restoreSystemAudio", options); return; @@ -650,6 +733,119 @@ namespace WindowsHelper.Models public static readonly MethodConverter Singleton = new MethodConverter(); } + internal class The0Converter : JsonConverter + { + public override bool CanConvert(Type t) => t == typeof(The0); + + public override The0 Read(ref Utf8JsonReader reader, Type typeToConvert, JsonSerializerOptions options) + { + var value = reader.GetString(); + switch (value) + { + case "clipboardCopy": + return The0.ClipboardCopy; + case "none": + return The0.None; + case "selectedTextRange": + return The0.SelectedTextRange; + case "selectedTextRanges": + return The0.SelectedTextRanges; + case "stringForRange": + return The0.StringForRange; + case "textMarkerRange": + return The0.TextMarkerRange; + case "valueAttribute": + return The0.ValueAttribute; + } + throw new Exception("Cannot unmarshal type The0"); + } + + public override void Write(Utf8JsonWriter writer, The0 value, JsonSerializerOptions options) + { + switch (value) + { + case The0.ClipboardCopy: + JsonSerializer.Serialize(writer, "clipboardCopy", options); + return; + case The0.None: + JsonSerializer.Serialize(writer, "none", options); + return; + case The0.SelectedTextRange: + JsonSerializer.Serialize(writer, "selectedTextRange", options); + return; + case The0.SelectedTextRanges: + JsonSerializer.Serialize(writer, "selectedTextRanges", options); + return; + case The0.StringForRange: + JsonSerializer.Serialize(writer, "stringForRange", options); + return; + case The0.TextMarkerRange: + JsonSerializer.Serialize(writer, "textMarkerRange", options); + return; + case The0.ValueAttribute: + JsonSerializer.Serialize(writer, "valueAttribute", options); + return; + } + throw new Exception("Cannot marshal type The0"); + } + + public static readonly The0Converter Singleton = new The0Converter(); 
+ } + + internal class MinMaxValueCheckConverter : JsonConverter + { + public override bool CanConvert(Type t) => t == typeof(double); + + public override double Read(ref Utf8JsonReader reader, Type typeToConvert, JsonSerializerOptions options) + { + var value = reader.GetDouble(); + if (value >= 0) + { + return value; + } + throw new Exception("Cannot unmarshal type double"); + } + + public override void Write(Utf8JsonWriter writer, double value, JsonSerializerOptions options) + { + if (value >= 0) + { + JsonSerializer.Serialize(writer, value, options); + return; + } + throw new Exception("Cannot marshal type double"); + } + + public static readonly MinMaxValueCheckConverter Singleton = new MinMaxValueCheckConverter(); + } + + internal class SchemaVersionConverter : JsonConverter + { + public override bool CanConvert(Type t) => t == typeof(SchemaVersion); + + public override SchemaVersion Read(ref Utf8JsonReader reader, Type typeToConvert, JsonSerializerOptions options) + { + var value = reader.GetString(); + if (value == "2.0") + { + return SchemaVersion.The20; + } + throw new Exception("Cannot unmarshal type SchemaVersion"); + } + + public override void Write(Utf8JsonWriter writer, SchemaVersion value, JsonSerializerOptions options) + { + if (value == SchemaVersion.The20) + { + JsonSerializer.Serialize(writer, "2.0", options); + return; + } + throw new Exception("Cannot marshal type SchemaVersion"); + } + + public static readonly SchemaVersionConverter Singleton = new SchemaVersionConverter(); + } + internal class KeyDownEventTypeConverter : JsonConverter { public override bool CanConvert(Type t) => t == typeof(KeyDownEventType); diff --git a/packages/types/src/schemas/methods/get-accessibility-context.ts b/packages/types/src/schemas/methods/get-accessibility-context.ts index 1239647..2ce7267 100644 --- a/packages/types/src/schemas/methods/get-accessibility-context.ts +++ b/packages/types/src/schemas/methods/get-accessibility-context.ts @@ -1,67 +1,316 @@ import 
{ z } from "zod"; -// Request params +// ============================================================================= +// Accessibility Context Schema +// ============================================================================= +// Schema for the Swift helper accessibility layer. +// Key features: +// - TextMarker API support for Electron/Chromium apps +// - Extraction method tracking for debugging +// - Performance metrics +// - Secure field and placeholder detection +// - UTF-16 code unit semantics (documented) +// ============================================================================= + +// ----------------------------------------------------------------------------- +// Enums +// ----------------------------------------------------------------------------- + +/** + * How the text selection was extracted. + * Priority order: textMarkerRange > selectedTextRange > selectedTextRanges > valueAttribute > stringForRange + */ +export const ExtractionMethodSchema = z.enum([ + "textMarkerRange", // Primary - AXSelectedTextMarkerRange (works in Electron) + "selectedTextRange", // Fallback 1 - AXSelectedTextRange + "selectedTextRanges", // Fallback 2 - AXSelectedTextRanges (multi-select) + "valueAttribute", // Fallback 3 - AXValue + "stringForRange", // Fallback 4 - AXStringForRange + "clipboardCopy", // Fallback 5 - Clipboard (Phase 2) + "none", // No extraction possible (secure field, etc.) +]); +export type ExtractionMethod = z.infer; + +// ----------------------------------------------------------------------------- +// Core Data Structures +// ----------------------------------------------------------------------------- + +/** + * Character range for text selection. + * + * IMPORTANT: UTF-16 Code Unit Semantics + * All `location` and `length` values are UTF-16 code unit offsets (equivalent to NSString indices), + * NOT Unicode scalar or grapheme cluster counts. 
+ * + * This matches macOS Accessibility API semantics where CFRange and NSRange use UTF-16 code units. + * Characters outside the Basic Multilingual Plane (e.g., emoji like 😀) each occupy 2 code units (a surrogate pair). + * + * Examples: + * - "a" (U+0061) = 1 code unit + * - "😀" (U+1F600) = 2 code units + * - "👨‍👩‍👧‍👦" = 11 code units (4 emoji of 2 units each + 3 ZWJ) + * + * Implications: + * - Swift: Use String.utf16 view for slicing + * - TypeScript/JS: string.length counts code units, so indices align correctly + */ +export const SelectionRangeSchema = z.object({ + /** UTF-16 code unit offset from start (NOT grapheme count) */ + location: z.number().int().nonnegative(), + /** UTF-16 code unit count (0 = cursor only, no selection) */ + length: z.number().int().nonnegative(), +}); +export type SelectionRange = z.infer; + +/** + * Text selection information. + * + * Null vs Empty String Semantics: + * - null = unavailable/unknown (API failed, attribute doesn't exist, or suppressed for security) + * - "" = available and empty (API succeeded, value exists, but is legitimately empty) + * + * Examples: + * - Cursor-only: selectedText = "" (not null), selectionRange.length = 0 + * - Empty text field: fullContent = "" (not null) + * - Secure field: all text fields are null (suppressed) + */ +export const TextSelectionSchema = z.object({ + // Core data + /** Selected text ("" for cursor-only, null if unavailable/suppressed) */ + selectedText: z.string().nullable(), + /** Full textbox content (window around selection if large, null if unavailable) */ + fullContent: z.string().nullable(), + /** Up to 500 UTF-16 units before selection (null if unavailable) */ + preSelectionText: z.string().nullable(), + /** Up to 500 UTF-16 units after selection (null if unavailable) */ + postSelectionText: z.string().nullable(), + /** UTF-16 code unit range (null for secure fields or if unavailable) */ + selectionRange: SelectionRangeSchema.nullable(), + + //
Metadata + /** Can user type in this element? */ + isEditable: z.boolean(), + /** How was selection obtained? */ + extractionMethod: ExtractionMethodSchema, + /** Multi-cursor/selection detected? */ + hasMultipleRanges: z.boolean(), + + // Safety flags + /** Is this showing placeholder text only (no user input)? */ + isPlaceholder: z.boolean(), + /** Is this a password/secure field? (all content fields will be null) */ + isSecure: z.boolean(), + + // Truncation info + /** Was fullContent truncated/windowed due to size limits? */ + fullContentTruncated: z.boolean(), +}); +export type TextSelection = z.infer; + +/** + * Focused element information. + */ +export const AXElementInfoSchema = z.object({ + /** AXRole (AXTextField, AXWebArea, etc.) */ + role: z.string().nullable(), + /** AXSubrole if present */ + subrole: z.string().nullable(), + /** AXTitle */ + title: z.string().nullable(), + /** AXDescription */ + description: z.string().nullable(), + /** AXValue (null for secure fields - suppressed for security) */ + value: z.string().nullable(), + /** Can user type in this element? */ + isEditable: z.boolean(), + /** Is this element focused? */ + isFocused: z.boolean(), + /** Is this a secure/password field? */ + isSecure: z.boolean(), + /** Is this showing placeholder text? */ + isPlaceholder: z.boolean(), +}); +export type AXElementInfo = z.infer; + +/** + * Application information. + */ +export const ApplicationInfoSchema = z.object({ + /** Application name */ + name: z.string().nullable(), + /** Bundle identifier (e.g., com.apple.Safari) */ + bundleIdentifier: z.string().nullable(), + /** Application version */ + version: z.string().nullable(), + /** Process ID */ + pid: z.number().int(), +}); +export type ApplicationInfo = z.infer; + +/** + * Window information. 
+ */ +export const WindowInfoSchema = z.object({ + /** Window title */ + title: z.string().nullable(), + /** Browser URL if detected */ + url: z.string().nullable(), +}); +export type WindowInfo = z.infer; + +/** + * Extraction performance metrics. + * + * Note: Error strings must contain only technical error messages, never PII or content values. + * Allowed: "TextMarker: AXError -25204", "Timeout exceeded" + * Forbidden: "Failed to parse text: Hello World", "Value was: password123" + */ +export const ExtractionMetricsSchema = z.object({ + /** Total extraction time in milliseconds */ + totalTimeMs: z.number().nonnegative(), + /** Did we attempt TextMarker extraction? */ + textMarkerAttempted: z.boolean(), + /** Did TextMarker extraction succeed? */ + textMarkerSucceeded: z.boolean(), + /** Which fallback methods were tried (in order) */ + fallbacksUsed: z.array(ExtractionMethodSchema), + /** Technical error messages only - NO PII/content */ + errors: z.array(z.string()), + /** Did extraction exceed best-effort time budget? */ + timedOut: z.boolean(), + + // WebArea retry path metrics + /** Did we search for WebArea candidates? (true when TextMarker fails on focused element) */ + webAreaRetryAttempted: z.boolean(), + /** Did we find a different WebArea to switch to? */ + webAreaFound: z.boolean(), + /** Did TextMarker work on the switched WebArea? */ + webAreaRetrySucceeded: z.boolean(), +}); +export type ExtractionMetrics = z.infer; + +// ----------------------------------------------------------------------------- +// Main Response Schema +// ----------------------------------------------------------------------------- + +/** + * Complete accessibility context response. 
+ */ +export const AppContextSchema = z.object({ + /** Schema version for future evolution */ + schemaVersion: z.literal("2.0"), + + // Application context + /** Information about the frontmost application */ + application: ApplicationInfoSchema, + /** Window information (may be null) */ + windowInfo: WindowInfoSchema.nullable(), + + // Focus and selection + /** Currently focused element (may be null if no focus) */ + focusedElement: AXElementInfoSchema.nullable(), + /** Text selection information (may be null if no text field focused) */ + textSelection: TextSelectionSchema.nullable(), + + // Timing + /** Unix timestamp in seconds when context was captured */ + timestamp: z.number(), + + // Debugging + /** Performance metrics for this extraction */ + metrics: ExtractionMetricsSchema, +}); +export type AppContext = z.infer; + +// ----------------------------------------------------------------------------- +// RPC Method Schemas +// ----------------------------------------------------------------------------- + +/** + * Request params for getAccessibilityContext + */ export const GetAccessibilityContextParamsSchema = z.object({ - editableOnly: z.boolean().optional().default(true), // Only return text selection if element is editable + /** + * Only return text selection if element is editable. + * When true: searches for nearest editable element if current focus is not editable. + * When false: returns whatever element is focused, editable or not. 
+ * Default: false + */ + editableOnly: z.boolean().optional().default(false), }); export type GetAccessibilityContextParams = z.infer< typeof GetAccessibilityContextParamsSchema >; -// Data structures for the result -const SelectionRangeSchema = z.object({ - location: z.number().int(), - length: z.number().int(), -}); - -const ApplicationInfoSchema = z.object({ - name: z.string().nullable(), - bundleIdentifier: z.string().nullable(), - version: z.string().nullable(), -}); - -const FocusedElementInfoSchema = z.object({ - role: z.string().nullable(), // Main accessibility role (e.g., "AXTextField", "AXButton") - isEditable: z.boolean(), - title: z.string().nullable(), - description: z.string().nullable(), - value: z.string().nullable(), -}); - -const TextSelectionInfoSchema = z.object({ - selectedText: z.string().nullable(), // Nullable when only cursor position is available (no selection) - fullContent: z.string().nullable(), - preSelectionText: z.string().nullable(), // Last 500 chars before cursor/selection (closest to cursor) - postSelectionText: z.string().nullable(), // First 500 chars after cursor/selection (closest to cursor) - selectionRange: SelectionRangeSchema.nullable(), - isEditable: z.boolean(), -}); - -const WindowInfoSchema = z.object({ - title: z.string().nullable(), - url: z.string().nullable(), // Browser URL if available -}); - -const AccessibilityContextSchema = z.object({ - application: ApplicationInfoSchema, - focusedElement: FocusedElementInfoSchema.nullable(), - textSelection: TextSelectionInfoSchema.nullable(), - windowInfo: WindowInfoSchema.nullable(), - timestamp: z.number(), -}); - -// Response result +/** + * Response result for getAccessibilityContext + */ export const GetAccessibilityContextResultSchema = z.object({ - context: AccessibilityContextSchema.nullable(), + context: AppContextSchema.nullable(), }); export type GetAccessibilityContextResult = z.infer< typeof GetAccessibilityContextResultSchema >; -// Export individual 
schemas for potential reuse -export type ApplicationInfo = z.infer; -export type FocusedElementInfo = z.infer; -export type TextSelectionInfo = z.infer; -export type WindowInfo = z.infer; -export type AccessibilityContext = z.infer; -export type SelectionRange = z.infer; +/** + * Request params for getAccessibilityStatus + */ +export const GetAccessibilityStatusParamsSchema = z.object({}); +export type GetAccessibilityStatusParams = z.infer< + typeof GetAccessibilityStatusParamsSchema +>; + +/** + * Response result for getAccessibilityStatus + */ +export const GetAccessibilityStatusResultSchema = z.object({ + /** Does the app have accessibility permission? */ + hasPermission: z.boolean(), + /** Is accessibility enabled system-wide? */ + isEnabled: z.boolean(), +}); +export type GetAccessibilityStatusResult = z.infer< + typeof GetAccessibilityStatusResultSchema +>; + +/** + * Request params for requestAccessibilityPermission + */ +export const RequestAccessibilityPermissionParamsSchema = z.object({}); +export type RequestAccessibilityPermissionParams = z.infer< + typeof RequestAccessibilityPermissionParamsSchema +>; + +/** + * Response result for requestAccessibilityPermission + */ +export const RequestAccessibilityPermissionResultSchema = z.object({ + /** Was permission granted? */ + granted: z.boolean(), +}); +export type RequestAccessibilityPermissionResult = z.infer< + typeof RequestAccessibilityPermissionResultSchema +>; + +// ----------------------------------------------------------------------------- +// Constants (for reference - actual values defined in Swift) +// ----------------------------------------------------------------------------- + +/** + * Context extraction limits (UTF-16 code units). + * These are documented here for reference; actual enforcement is in Swift. 
+ */ +export const ACCESSIBILITY_CONSTANTS = { + /** Max UTF-16 units for pre/post selection context */ + MAX_CONTEXT_LENGTH: 500, + /** Max UTF-16 units for fullContent window */ + MAX_FULL_CONTENT_LENGTH: 50000, + /** UTF-16 units of padding around selection for windowing */ + WINDOW_PADDING: 25000, + /** Best-effort timeout target in milliseconds */ + BEST_EFFORT_TIMEOUT_MS: 600, + /** Max depth for element tree search */ + TREE_WALK_MAX_DEPTH: 8, + /** Max elements to search in tree walk */ + TREE_WALK_MAX_ELEMENTS: 100, +} as const; diff --git a/packages/types/src/schemas/rpc/request.ts b/packages/types/src/schemas/rpc/request.ts index c08f76f..75897d1 100644 --- a/packages/types/src/schemas/rpc/request.ts +++ b/packages/types/src/schemas/rpc/request.ts @@ -7,6 +7,8 @@ import { PasteTextParamsSchema } from "../methods/paste-text.js"; const RPCMethodNameSchema = z.union([ z.literal("getAccessibilityTreeDetails"), z.literal("getAccessibilityContext"), + z.literal("getAccessibilityStatus"), + z.literal("requestAccessibilityPermission"), z.literal("pasteText"), z.literal("muteSystemAudio"), z.literal("restoreSystemAudio"),