From e419fd91641fc06378df31b728adcb492239ddb4 Mon Sep 17 00:00:00 2001 From: Austin Wang Date: Sun, 29 Mar 2026 18:07:39 -0700 Subject: [PATCH] Fix remote proxy notification spam with cooldown, backoff, and SSH keepalive (#2325) (#2330) - Add 5-minute per-host cooldown for remote error notifications - Add exponential backoff (capped at 60s) to proxy broker and session controller retries - Add default SSH ConnectTimeout/ServerAliveInterval/ServerAliveCountMax to detect dead connections faster - Fix error status clearing to only reset on actual .connected state Co-authored-by: Claude Opus 4.6 --- CLI/cmux.swift | 9 +++ Sources/TerminalNotificationStore.swift | 30 ++++++++- Sources/Workspace.swift | 85 ++++++++++++++++++------- 3 files changed, 100 insertions(+), 24 deletions(-) diff --git a/CLI/cmux.swift b/CLI/cmux.swift index 6688dba1..ef7d6aed 100644 --- a/CLI/cmux.swift +++ b/CLI/cmux.swift @@ -4163,6 +4163,15 @@ struct CMUXCLI { remoteRelayPort: options.remoteRelayPort ) var parts: [String] = ["ssh"] + if !hasSSHOptionKey(effectiveSSHOptions, key: "ConnectTimeout") { + parts += ["-o", "ConnectTimeout=6"] + } + if !hasSSHOptionKey(effectiveSSHOptions, key: "ServerAliveInterval") { + parts += ["-o", "ServerAliveInterval=20"] + } + if !hasSSHOptionKey(effectiveSSHOptions, key: "ServerAliveCountMax") { + parts += ["-o", "ServerAliveCountMax=2"] + } if !hasSSHOptionKey(effectiveSSHOptions, key: "SetEnv") { parts += ["-o", "SetEnv COLORTERM=truecolor"] } diff --git a/Sources/TerminalNotificationStore.swift b/Sources/TerminalNotificationStore.swift index 19a695bc..9f3de14a 100644 --- a/Sources/TerminalNotificationStore.swift +++ b/Sources/TerminalNotificationStore.swift @@ -733,6 +733,7 @@ final class TerminalNotificationStore: ObservableObject { notification in store.playSuppressedNotificationFeedback(for: notification) } + private var lastNotificationDateByCooldownKey: [String: Date] = [:] private var indexes = NotificationIndexes() private init() { @@ -890,7 +891,29 @@ final class TerminalNotificationStore: ObservableObject { focusedReadIndicatorByTabId[tabId] } - func addNotification(tabId: UUID, surfaceId: UUID?, title: String, subtitle: String, body: String) { + func addNotification( + tabId: UUID, + surfaceId: UUID?, + title: String, + subtitle: String, + body: String, + cooldownKey: String? = nil, + cooldownInterval: TimeInterval? = nil + ) { + let now = Date() + let resolvedCooldownInterval: TimeInterval? + if let cooldownInterval, cooldownInterval.isFinite, cooldownInterval > 0 { + resolvedCooldownInterval = cooldownInterval + } else { + resolvedCooldownInterval = nil + } + if let cooldownKey, + let resolvedCooldownInterval, + let lastNotificationDate = lastNotificationDateByCooldownKey[cooldownKey], + now.timeIntervalSince(lastNotificationDate) < resolvedCooldownInterval { + return + } + var updated = notifications var idsToClear: [String] = [] updated.removeAll { existing in @@ -925,11 +948,14 @@ final class TerminalNotificationStore: ObservableObject { title: title, subtitle: subtitle, body: body, - createdAt: Date(), + createdAt: now, isRead: false ) updated.insert(notification, at: 0) notifications = updated + if let cooldownKey, resolvedCooldownInterval != nil { + lastNotificationDateByCooldownKey[cooldownKey] = now + } if !idsToClear.isEmpty { center.removeDeliveredNotificationsOffMain(withIdentifiers: idsToClear) center.removePendingNotificationRequestsOffMain(withIdentifiers: idsToClear) diff --git a/Sources/Workspace.swift b/Sources/Workspace.swift index 7bb6d9aa..ce60e26d 100644 --- a/Sources/Workspace.swift +++ b/Sources/Workspace.swift @@ -2489,6 +2489,7 @@ private final class WorkspaceRemoteProxyBroker { var tunnel: WorkspaceRemoteDaemonProxyTunnel? var endpoint: BrowserProxyEndpoint? var restartWorkItem: DispatchWorkItem? + var restartRetryCount = 0 var subscribers: [UUID: (Update) -> Void] = [:] init(configuration: WorkspaceRemoteConfiguration, remotePath: String) { @@ -2515,6 +2516,7 @@ private final class WorkspaceRemoteProxyBroker { entry = existing if existing.remotePath != remotePath { existing.remotePath = remotePath + existing.restartRetryCount = 0 if existing.tunnel != nil { stopEntryRuntimeLocked(existing) notifyLocked(existing, update: .connecting) @@ -2558,12 +2560,13 @@ private final class WorkspaceRemoteProxyBroker { // Internal deterministic test hook used by docker regressions to force bind conflicts. localPort = forcedLocalPort } else { + let retryDelay = Self.retryDelay(baseDelay: 3.0, retry: entry.restartRetryCount + 1) guard let allocatedPort = Self.allocateLoopbackPort() else { notifyLocked( entry, - update: .error("Failed to allocate local proxy port\(Self.retrySuffix(delay: 3.0))") + update: .error("Failed to allocate local proxy port\(Self.retrySuffix(delay: retryDelay))") ) - scheduleRestartLocked(key: key, entry: entry, delay: 3.0) + scheduleRestartLocked(key: key, entry: entry, baseDelay: 3.0) return } localPort = allocatedPort @@ -2583,28 +2586,33 @@ private final class WorkspaceRemoteProxyBroker { entry.tunnel = tunnel let endpoint = BrowserProxyEndpoint(host: "127.0.0.1", port: localPort) entry.endpoint = endpoint + entry.restartRetryCount = 0 notifyLocked(entry, update: .ready(endpoint)) } catch { stopEntryRuntimeLocked(entry) let detail = "Failed to start local daemon proxy: \(error.localizedDescription)" - notifyLocked(entry, update: .error("\(detail)\(Self.retrySuffix(delay: 3.0))")) - scheduleRestartLocked(key: key, entry: entry, delay: 3.0) + let retryDelay = Self.retryDelay(baseDelay: 3.0, retry: entry.restartRetryCount + 1) + notifyLocked(entry, update: .error("\(detail)\(Self.retrySuffix(delay: retryDelay))")) + scheduleRestartLocked(key: key, entry: entry, baseDelay: 3.0) } } private func handleTunnelFailureLocked(key: String, detail: String) { guard let entry = entries[key], entry.tunnel != nil else { return } stopEntryRuntimeLocked(entry) - notifyLocked(entry, update: .error("\(detail)\(Self.retrySuffix(delay: 3.0))")) - scheduleRestartLocked(key: key, entry: entry, delay: 3.0) + let retryDelay = Self.retryDelay(baseDelay: 3.0, retry: entry.restartRetryCount + 1) + notifyLocked(entry, update: .error("\(detail)\(Self.retrySuffix(delay: retryDelay))")) + scheduleRestartLocked(key: key, entry: entry, baseDelay: 3.0) } - private func scheduleRestartLocked(key: String, entry: Entry, delay: TimeInterval) { + private func scheduleRestartLocked(key: String, entry: Entry, baseDelay: TimeInterval) { guard !entry.subscribers.isEmpty else { teardownEntryLocked(key: key, entry: entry) return } guard entry.restartWorkItem == nil else { return } + entry.restartRetryCount += 1 + let retryDelay = Self.retryDelay(baseDelay: baseDelay, retry: entry.restartRetryCount) let workItem = DispatchWorkItem { [weak self] in guard let self, let currentEntry = self.entries[key] else { return } @@ -2618,7 +2626,7 @@ private final class WorkspaceRemoteProxyBroker { } entry.restartWorkItem = workItem - queue.asyncAfter(deadline: .now() + delay, execute: workItem) + queue.asyncAfter(deadline: .now() + retryDelay, execute: workItem) } private func teardownEntryLocked(key: String, entry: Entry) { @@ -2687,6 +2695,11 @@ private final class WorkspaceRemoteProxyBroker { let seconds = max(1, Int(delay.rounded())) return " (retry in \(seconds)s)" } + + private static func retryDelay(baseDelay: TimeInterval, retry: Int) -> TimeInterval { + let exponent = Double(max(0, retry - 1)) + return min(baseDelay * pow(2.0, exponent), 60.0) + } } private final class WorkspaceRemoteCLIRelayServer { @@ -3170,6 +3183,11 @@ private final class WorkspaceRemoteCLIRelayServer { } final class WorkspaceRemoteSessionController { + private struct RetrySchedule { + let retry: Int + let delay: TimeInterval + } + private struct CommandResult { let status: Int32 let stdout: String @@ -3357,8 +3375,8 @@ final class WorkspaceRemoteSessionController { daemonReady = false daemonBootstrapVersion = nil daemonRemotePath = nil - let nextRetry = scheduleReconnectLocked(delay: 4.0) - let retrySuffix = Self.retrySuffix(retry: nextRetry, delay: 4.0) + let retrySchedule = scheduleReconnectLocked(baseDelay: 4.0) + let retrySuffix = Self.retrySuffix(retry: retrySchedule.retry, delay: retrySchedule.delay) let detail = "Remote daemon bootstrap failed: \(error.localizedDescription)\(retrySuffix)" publishDaemonStatus(.error, detail: detail) publishState(.error, detail: detail) @@ -3371,8 +3389,8 @@ final class WorkspaceRemoteSessionController { guard proxyLease == nil else { return } guard let remotePath = daemonRemotePath, !remotePath.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty else { - let nextRetry = scheduleReconnectLocked(delay: 4.0) - let retrySuffix = Self.retrySuffix(retry: nextRetry, delay: 4.0) + let retrySchedule = scheduleReconnectLocked(baseDelay: 4.0) + let retrySuffix = Self.retrySuffix(retry: retrySchedule.retry, delay: retrySchedule.delay) let detail = "Remote daemon did not provide a valid remote path\(retrySuffix)" publishDaemonStatus(.error, detail: detail) publishState(.error, detail: detail) @@ -3588,8 +3606,8 @@ final class WorkspaceRemoteSessionController { daemonBootstrapVersion = nil daemonRemotePath = nil - let nextRetry = scheduleReconnectLocked(delay: 2.0) - let retrySuffix = Self.retrySuffix(retry: nextRetry, delay: 2.0) + let retrySchedule = scheduleReconnectLocked(baseDelay: 2.0) + let retrySuffix = Self.retrySuffix(retry: retrySchedule.retry, delay: retrySchedule.delay) publishDaemonStatus( .error, detail: "Remote daemon transport needs re-bootstrap after proxy failure\(retrySuffix)" @@ -3598,11 +3616,12 @@ final class WorkspaceRemoteSessionController { } @discardableResult - private func scheduleReconnectLocked(delay: TimeInterval) -> Int { - guard !isStopping else { return reconnectRetryCount } + private func scheduleReconnectLocked(baseDelay: TimeInterval) -> RetrySchedule { + let retryNumber = reconnectRetryCount + 1 + let retryDelay = Self.retryDelay(baseDelay: baseDelay, retry: retryNumber) + guard !isStopping else { return RetrySchedule(retry: retryNumber, delay: retryDelay) } reconnectWorkItem?.cancel() - reconnectRetryCount += 1 - let retryNumber = reconnectRetryCount + reconnectRetryCount = retryNumber let workItem = DispatchWorkItem { [weak self] in guard let self else { return } self.reconnectWorkItem = nil @@ -3611,8 +3630,8 @@ final class WorkspaceRemoteSessionController { self.beginConnectionAttemptLocked() } reconnectWorkItem = workItem - queue.asyncAfter(deadline: .now() + delay, execute: workItem) - return retryNumber + queue.asyncAfter(deadline: .now() + retryDelay, execute: workItem) + return RetrySchedule(retry: retryNumber, delay: retryDelay) } private func publishState(_ state: WorkspaceRemoteConnectionState, detail: String?) { @@ -4853,6 +4872,11 @@ final class WorkspaceRemoteSessionController { return " (retry \(retry) in \(seconds)s)" } + private static func retryDelay(baseDelay: TimeInterval, retry: Int) -> TimeInterval { + let exponent = Double(max(0, retry - 1)) + return min(baseDelay * pow(2.0, exponent), 60.0) + } + private static func shouldEscalateProxyErrorToBootstrap(_ detail: String) -> Bool { let lowered = detail.lowercased() return lowered.contains("remote daemon transport failed") @@ -5542,6 +5566,7 @@ final class Workspace: Identifiable, ObservableObject { private static let remoteErrorStatusKey = "remote.error" private static let remotePortConflictStatusKey = "remote.port_conflicts" + private static let remoteNotificationCooldown: TimeInterval = 5 * 60 private static let sshControlMasterCleanupQueue = DispatchQueue( label: "com.cmux.remote-ssh.control-master-cleanup", qos: .utility @@ -5577,6 +5602,20 @@ final class Workspace: Identifiable, ObservableObject { return entry.lowercased().contains("remote proxy unavailable") } + private func remoteNotificationCooldownKey(target: String) -> String? { + let rawTarget = (remoteConfiguration?.destination ?? target) + .trimmingCharacters(in: .whitespacesAndNewlines) + guard !rawTarget.isEmpty else { return nil } + let normalizedHost = rawTarget + .split(separator: "@", maxSplits: 1, omittingEmptySubsequences: false) + .last + .map(String.init)? + .trimmingCharacters(in: .whitespacesAndNewlines) + .lowercased() + guard let normalizedHost, !normalizedHost.isEmpty else { return nil } + return "remote-host:\(normalizedHost)" + } + var focusedSurfaceId: UUID? { focusedPanelId } var surfaceDirectories: [UUID: String] { get { panelDirectories } @@ -7165,13 +7204,15 @@ final class Workspace: Identifiable, ObservableObject { surfaceId: nil, title: notificationTitle, subtitle: target, - body: trimmedDetail + body: trimmedDetail, + cooldownKey: remoteNotificationCooldownKey(target: target), + cooldownInterval: Self.remoteNotificationCooldown ) } return } - if !preserveConnectedStateForRetry && state != .error { + if state == .connected { statusEntries.removeValue(forKey: Self.remoteErrorStatusKey) remoteLastErrorFingerprint = nil }