Fix remote proxy notification spam with cooldown, backoff, and SSH keepalive (#2325) (#2330)

- Add 5-minute per-host cooldown for remote error notifications
- Add exponential backoff (capped at 60s) to proxy broker and session controller retries
- Add default SSH ConnectTimeout/ServerAliveInterval/ServerAliveCountMax to detect dead connections faster
- Fix error status clearing to only reset on actual .connected state

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Austin Wang 2026-03-29 18:07:39 -07:00 committed by GitHub
parent 9e75355525
commit e419fd9164
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 100 additions and 24 deletions

View file

@ -4163,6 +4163,15 @@ struct CMUXCLI {
remoteRelayPort: options.remoteRelayPort
)
var parts: [String] = ["ssh"]
if !hasSSHOptionKey(effectiveSSHOptions, key: "ConnectTimeout") {
parts += ["-o", "ConnectTimeout=6"]
}
if !hasSSHOptionKey(effectiveSSHOptions, key: "ServerAliveInterval") {
parts += ["-o", "ServerAliveInterval=20"]
}
if !hasSSHOptionKey(effectiveSSHOptions, key: "ServerAliveCountMax") {
parts += ["-o", "ServerAliveCountMax=2"]
}
if !hasSSHOptionKey(effectiveSSHOptions, key: "SetEnv") {
parts += ["-o", "SetEnv COLORTERM=truecolor"]
}

View file

@ -733,6 +733,7 @@ final class TerminalNotificationStore: ObservableObject {
notification in
store.playSuppressedNotificationFeedback(for: notification)
}
private var lastNotificationDateByCooldownKey: [String: Date] = [:]
private var indexes = NotificationIndexes()
private init() {
@ -890,7 +891,29 @@ final class TerminalNotificationStore: ObservableObject {
focusedReadIndicatorByTabId[tabId]
}
func addNotification(tabId: UUID, surfaceId: UUID?, title: String, subtitle: String, body: String) {
func addNotification(
tabId: UUID,
surfaceId: UUID?,
title: String,
subtitle: String,
body: String,
cooldownKey: String? = nil,
cooldownInterval: TimeInterval? = nil
) {
let now = Date()
let resolvedCooldownInterval: TimeInterval?
if let cooldownInterval, cooldownInterval.isFinite, cooldownInterval > 0 {
resolvedCooldownInterval = cooldownInterval
} else {
resolvedCooldownInterval = nil
}
if let cooldownKey,
let resolvedCooldownInterval,
let lastNotificationDate = lastNotificationDateByCooldownKey[cooldownKey],
now.timeIntervalSince(lastNotificationDate) < resolvedCooldownInterval {
return
}
var updated = notifications
var idsToClear: [String] = []
updated.removeAll { existing in
@ -925,11 +948,14 @@ final class TerminalNotificationStore: ObservableObject {
title: title,
subtitle: subtitle,
body: body,
createdAt: Date(),
createdAt: now,
isRead: false
)
updated.insert(notification, at: 0)
notifications = updated
if let cooldownKey, resolvedCooldownInterval != nil {
lastNotificationDateByCooldownKey[cooldownKey] = now
}
if !idsToClear.isEmpty {
center.removeDeliveredNotificationsOffMain(withIdentifiers: idsToClear)
center.removePendingNotificationRequestsOffMain(withIdentifiers: idsToClear)

View file

@ -2489,6 +2489,7 @@ private final class WorkspaceRemoteProxyBroker {
var tunnel: WorkspaceRemoteDaemonProxyTunnel?
var endpoint: BrowserProxyEndpoint?
var restartWorkItem: DispatchWorkItem?
var restartRetryCount = 0
var subscribers: [UUID: (Update) -> Void] = [:]
init(configuration: WorkspaceRemoteConfiguration, remotePath: String) {
@ -2515,6 +2516,7 @@ private final class WorkspaceRemoteProxyBroker {
entry = existing
if existing.remotePath != remotePath {
existing.remotePath = remotePath
existing.restartRetryCount = 0
if existing.tunnel != nil {
stopEntryRuntimeLocked(existing)
notifyLocked(existing, update: .connecting)
@ -2558,12 +2560,13 @@ private final class WorkspaceRemoteProxyBroker {
// Internal deterministic test hook used by docker regressions to force bind conflicts.
localPort = forcedLocalPort
} else {
let retryDelay = Self.retryDelay(baseDelay: 3.0, retry: entry.restartRetryCount + 1)
guard let allocatedPort = Self.allocateLoopbackPort() else {
notifyLocked(
entry,
update: .error("Failed to allocate local proxy port\(Self.retrySuffix(delay: 3.0))")
update: .error("Failed to allocate local proxy port\(Self.retrySuffix(delay: retryDelay))")
)
scheduleRestartLocked(key: key, entry: entry, delay: 3.0)
scheduleRestartLocked(key: key, entry: entry, baseDelay: 3.0)
return
}
localPort = allocatedPort
@ -2583,28 +2586,33 @@ private final class WorkspaceRemoteProxyBroker {
entry.tunnel = tunnel
let endpoint = BrowserProxyEndpoint(host: "127.0.0.1", port: localPort)
entry.endpoint = endpoint
entry.restartRetryCount = 0
notifyLocked(entry, update: .ready(endpoint))
} catch {
stopEntryRuntimeLocked(entry)
let detail = "Failed to start local daemon proxy: \(error.localizedDescription)"
notifyLocked(entry, update: .error("\(detail)\(Self.retrySuffix(delay: 3.0))"))
scheduleRestartLocked(key: key, entry: entry, delay: 3.0)
let retryDelay = Self.retryDelay(baseDelay: 3.0, retry: entry.restartRetryCount + 1)
notifyLocked(entry, update: .error("\(detail)\(Self.retrySuffix(delay: retryDelay))"))
scheduleRestartLocked(key: key, entry: entry, baseDelay: 3.0)
}
}
private func handleTunnelFailureLocked(key: String, detail: String) {
guard let entry = entries[key], entry.tunnel != nil else { return }
stopEntryRuntimeLocked(entry)
notifyLocked(entry, update: .error("\(detail)\(Self.retrySuffix(delay: 3.0))"))
scheduleRestartLocked(key: key, entry: entry, delay: 3.0)
let retryDelay = Self.retryDelay(baseDelay: 3.0, retry: entry.restartRetryCount + 1)
notifyLocked(entry, update: .error("\(detail)\(Self.retrySuffix(delay: retryDelay))"))
scheduleRestartLocked(key: key, entry: entry, baseDelay: 3.0)
}
private func scheduleRestartLocked(key: String, entry: Entry, delay: TimeInterval) {
private func scheduleRestartLocked(key: String, entry: Entry, baseDelay: TimeInterval) {
guard !entry.subscribers.isEmpty else {
teardownEntryLocked(key: key, entry: entry)
return
}
guard entry.restartWorkItem == nil else { return }
entry.restartRetryCount += 1
let retryDelay = Self.retryDelay(baseDelay: baseDelay, retry: entry.restartRetryCount)
let workItem = DispatchWorkItem { [weak self] in
guard let self, let currentEntry = self.entries[key] else { return }
@ -2618,7 +2626,7 @@ private final class WorkspaceRemoteProxyBroker {
}
entry.restartWorkItem = workItem
queue.asyncAfter(deadline: .now() + delay, execute: workItem)
queue.asyncAfter(deadline: .now() + retryDelay, execute: workItem)
}
private func teardownEntryLocked(key: String, entry: Entry) {
@ -2687,6 +2695,11 @@ private final class WorkspaceRemoteProxyBroker {
let seconds = max(1, Int(delay.rounded()))
return " (retry in \(seconds)s)"
}
private static func retryDelay(baseDelay: TimeInterval, retry: Int) -> TimeInterval {
let exponent = Double(max(0, retry - 1))
return min(baseDelay * pow(2.0, exponent), 60.0)
}
}
private final class WorkspaceRemoteCLIRelayServer {
@ -3170,6 +3183,11 @@ private final class WorkspaceRemoteCLIRelayServer {
}
final class WorkspaceRemoteSessionController {
private struct RetrySchedule {
let retry: Int
let delay: TimeInterval
}
private struct CommandResult {
let status: Int32
let stdout: String
@ -3357,8 +3375,8 @@ final class WorkspaceRemoteSessionController {
daemonReady = false
daemonBootstrapVersion = nil
daemonRemotePath = nil
let nextRetry = scheduleReconnectLocked(delay: 4.0)
let retrySuffix = Self.retrySuffix(retry: nextRetry, delay: 4.0)
let retrySchedule = scheduleReconnectLocked(baseDelay: 4.0)
let retrySuffix = Self.retrySuffix(retry: retrySchedule.retry, delay: retrySchedule.delay)
let detail = "Remote daemon bootstrap failed: \(error.localizedDescription)\(retrySuffix)"
publishDaemonStatus(.error, detail: detail)
publishState(.error, detail: detail)
@ -3371,8 +3389,8 @@ final class WorkspaceRemoteSessionController {
guard proxyLease == nil else { return }
guard let remotePath = daemonRemotePath,
!remotePath.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty else {
let nextRetry = scheduleReconnectLocked(delay: 4.0)
let retrySuffix = Self.retrySuffix(retry: nextRetry, delay: 4.0)
let retrySchedule = scheduleReconnectLocked(baseDelay: 4.0)
let retrySuffix = Self.retrySuffix(retry: retrySchedule.retry, delay: retrySchedule.delay)
let detail = "Remote daemon did not provide a valid remote path\(retrySuffix)"
publishDaemonStatus(.error, detail: detail)
publishState(.error, detail: detail)
@ -3588,8 +3606,8 @@ final class WorkspaceRemoteSessionController {
daemonBootstrapVersion = nil
daemonRemotePath = nil
let nextRetry = scheduleReconnectLocked(delay: 2.0)
let retrySuffix = Self.retrySuffix(retry: nextRetry, delay: 2.0)
let retrySchedule = scheduleReconnectLocked(baseDelay: 2.0)
let retrySuffix = Self.retrySuffix(retry: retrySchedule.retry, delay: retrySchedule.delay)
publishDaemonStatus(
.error,
detail: "Remote daemon transport needs re-bootstrap after proxy failure\(retrySuffix)"
@ -3598,11 +3616,12 @@ final class WorkspaceRemoteSessionController {
}
@discardableResult
private func scheduleReconnectLocked(delay: TimeInterval) -> Int {
guard !isStopping else { return reconnectRetryCount }
private func scheduleReconnectLocked(baseDelay: TimeInterval) -> RetrySchedule {
let retryNumber = reconnectRetryCount + 1
let retryDelay = Self.retryDelay(baseDelay: baseDelay, retry: retryNumber)
guard !isStopping else { return RetrySchedule(retry: retryNumber, delay: retryDelay) }
reconnectWorkItem?.cancel()
reconnectRetryCount += 1
let retryNumber = reconnectRetryCount
reconnectRetryCount = retryNumber
let workItem = DispatchWorkItem { [weak self] in
guard let self else { return }
self.reconnectWorkItem = nil
@ -3611,8 +3630,8 @@ final class WorkspaceRemoteSessionController {
self.beginConnectionAttemptLocked()
}
reconnectWorkItem = workItem
queue.asyncAfter(deadline: .now() + delay, execute: workItem)
return retryNumber
queue.asyncAfter(deadline: .now() + retryDelay, execute: workItem)
return RetrySchedule(retry: retryNumber, delay: retryDelay)
}
private func publishState(_ state: WorkspaceRemoteConnectionState, detail: String?) {
@ -4853,6 +4872,11 @@ final class WorkspaceRemoteSessionController {
return " (retry \(retry) in \(seconds)s)"
}
private static func retryDelay(baseDelay: TimeInterval, retry: Int) -> TimeInterval {
let exponent = Double(max(0, retry - 1))
return min(baseDelay * pow(2.0, exponent), 60.0)
}
private static func shouldEscalateProxyErrorToBootstrap(_ detail: String) -> Bool {
let lowered = detail.lowercased()
return lowered.contains("remote daemon transport failed")
@ -5542,6 +5566,7 @@ final class Workspace: Identifiable, ObservableObject {
private static let remoteErrorStatusKey = "remote.error"
private static let remotePortConflictStatusKey = "remote.port_conflicts"
private static let remoteNotificationCooldown: TimeInterval = 5 * 60
private static let sshControlMasterCleanupQueue = DispatchQueue(
label: "com.cmux.remote-ssh.control-master-cleanup",
qos: .utility
@ -5577,6 +5602,20 @@ final class Workspace: Identifiable, ObservableObject {
return entry.lowercased().contains("remote proxy unavailable")
}
private func remoteNotificationCooldownKey(target: String) -> String? {
let rawTarget = (remoteConfiguration?.destination ?? target)
.trimmingCharacters(in: .whitespacesAndNewlines)
guard !rawTarget.isEmpty else { return nil }
let normalizedHost = rawTarget
.split(separator: "@", maxSplits: 1, omittingEmptySubsequences: false)
.last
.map(String.init)?
.trimmingCharacters(in: .whitespacesAndNewlines)
.lowercased()
guard let normalizedHost, !normalizedHost.isEmpty else { return nil }
return "remote-host:\(normalizedHost)"
}
var focusedSurfaceId: UUID? { focusedPanelId }
var surfaceDirectories: [UUID: String] {
get { panelDirectories }
@ -7165,13 +7204,15 @@ final class Workspace: Identifiable, ObservableObject {
surfaceId: nil,
title: notificationTitle,
subtitle: target,
body: trimmedDetail
body: trimmedDetail,
cooldownKey: remoteNotificationCooldownKey(target: target),
cooldownInterval: Self.remoteNotificationCooldown
)
}
return
}
if !preserveConnectedStateForRetry && state != .error {
if state == .connected {
statusEntries.removeValue(forKey: Self.remoteErrorStatusKey)
remoteLastErrorFingerprint = nil
}