Surface remote retry counts and specify browser proxy model

This commit is contained in:
Lawrence Chen 2026-02-21 02:20:58 -08:00
parent 5e14bfe087
commit bab7440e49
3 changed files with 73 additions and 25 deletions

View file

@ -54,6 +54,8 @@ private final class WorkspaceRemoteSessionController {
private var daemonReady = false private var daemonReady = false
private var daemonBootstrapVersion: String? private var daemonBootstrapVersion: String?
private var daemonRemotePath: String? private var daemonRemotePath: String?
private var reconnectRetryCount = 0
private var reconnectWorkItem: DispatchWorkItem?
init(workspace: Workspace, configuration: WorkspaceRemoteConfiguration) { init(workspace: Workspace, configuration: WorkspaceRemoteConfiguration) {
self.workspace = workspace self.workspace = workspace
@ -76,6 +78,9 @@ private final class WorkspaceRemoteSessionController {
private func stopAllLocked() { private func stopAllLocked() {
isStopping = true isStopping = true
reconnectWorkItem?.cancel()
reconnectWorkItem = nil
reconnectRetryCount = 0
if let probeProcess { if let probeProcess {
probeStdoutPipe?.fileHandleForReading.readabilityHandler = nil probeStdoutPipe?.fileHandleForReading.readabilityHandler = nil
@ -107,8 +112,18 @@ private final class WorkspaceRemoteSessionController {
private func beginConnectionAttemptLocked() { private func beginConnectionAttemptLocked() {
guard !isStopping else { return } guard !isStopping else { return }
publishState(.connecting, detail: "Connecting to \(configuration.displayTarget)") reconnectWorkItem = nil
publishDaemonStatus(.bootstrapping, detail: "Bootstrapping remote daemon on \(configuration.displayTarget)") let connectDetail: String
let bootstrapDetail: String
if reconnectRetryCount > 0 {
connectDetail = "Reconnecting to \(configuration.displayTarget) (retry \(reconnectRetryCount))"
bootstrapDetail = "Bootstrapping remote daemon on \(configuration.displayTarget) (retry \(reconnectRetryCount))"
} else {
connectDetail = "Connecting to \(configuration.displayTarget)"
bootstrapDetail = "Bootstrapping remote daemon on \(configuration.displayTarget)"
}
publishState(.connecting, detail: connectDetail)
publishDaemonStatus(.bootstrapping, detail: bootstrapDetail)
do { do {
let hello = try bootstrapDaemonLocked() let hello = try bootstrapDaemonLocked()
daemonReady = true daemonReady = true
@ -127,10 +142,11 @@ private final class WorkspaceRemoteSessionController {
daemonReady = false daemonReady = false
daemonBootstrapVersion = nil daemonBootstrapVersion = nil
daemonRemotePath = nil daemonRemotePath = nil
let detail = "Remote daemon bootstrap failed: \(error.localizedDescription)" let nextRetry = scheduleProbeRestartLocked(delay: 4.0)
let retrySuffix = Self.retrySuffix(retry: nextRetry, delay: 4.0)
let detail = "Remote daemon bootstrap failed: \(error.localizedDescription)\(retrySuffix)"
publishDaemonStatus(.error, detail: detail) publishDaemonStatus(.error, detail: detail)
publishState(.error, detail: detail) publishState(.error, detail: detail)
scheduleProbeRestartLocked(delay: 4.0)
} }
} }
@ -183,8 +199,9 @@ private final class WorkspaceRemoteSessionController {
probeStdoutPipe = stdoutPipe probeStdoutPipe = stdoutPipe
probeStderrPipe = stderrPipe probeStderrPipe = stderrPipe
} catch { } catch {
publishState(.error, detail: "Failed to start SSH probe: \(error.localizedDescription)") let nextRetry = scheduleProbeRestartLocked(delay: 3.0)
scheduleProbeRestartLocked(delay: 3.0) let retrySuffix = Self.retrySuffix(retry: nextRetry, delay: 3.0)
publishState(.error, detail: "Failed to start SSH probe: \(error.localizedDescription)\(retrySuffix)")
} }
} }
@ -209,18 +226,27 @@ private final class WorkspaceRemoteSessionController {
let statusCode = process.terminationStatus let statusCode = process.terminationStatus
let rawDetail = Self.bestErrorLine(stderr: probeStderrBuffer, stdout: probeStdoutBuffer) let rawDetail = Self.bestErrorLine(stderr: probeStderrBuffer, stdout: probeStdoutBuffer)
let detail = rawDetail ?? "SSH probe exited with status \(statusCode)" let detail = rawDetail ?? "SSH probe exited with status \(statusCode)"
publishState(.error, detail: "SSH probe to \(configuration.displayTarget) failed: \(detail)") let nextRetry = scheduleProbeRestartLocked(delay: 3.0)
scheduleProbeRestartLocked(delay: 3.0) let retrySuffix = Self.retrySuffix(retry: nextRetry, delay: 3.0)
publishState(.error, detail: "SSH probe to \(configuration.displayTarget) failed: \(detail)\(retrySuffix)")
} }
private func scheduleProbeRestartLocked(delay: TimeInterval) { @discardableResult
guard !isStopping else { return } private func scheduleProbeRestartLocked(delay: TimeInterval) -> Int {
queue.asyncAfter(deadline: .now() + delay) { [weak self] in guard !isStopping else { return reconnectRetryCount }
reconnectWorkItem?.cancel()
reconnectRetryCount += 1
let retryNumber = reconnectRetryCount
let workItem = DispatchWorkItem { [weak self] in
guard let self else { return } guard let self else { return }
self.reconnectWorkItem = nil
guard !self.isStopping else { return } guard !self.isStopping else { return }
guard self.probeProcess == nil else { return } guard self.probeProcess == nil else { return }
self.beginConnectionAttemptLocked() self.beginConnectionAttemptLocked()
} }
reconnectWorkItem = workItem
queue.asyncAfter(deadline: .now() + delay, execute: workItem)
return retryNumber
} }
private func consumeProbeStdoutData(_ data: Data) { private func consumeProbeStdoutData(_ data: Data) {
@ -248,6 +274,9 @@ private final class WorkspaceRemoteSessionController {
let ports = Self.parseRemotePorts(line: line) let ports = Self.parseRemotePorts(line: line)
desiredRemotePorts = Set(ports) desiredRemotePorts = Set(ports)
portConflicts = portConflicts.intersection(desiredRemotePorts) portConflicts = portConflicts.intersection(desiredRemotePorts)
reconnectWorkItem?.cancel()
reconnectWorkItem = nil
reconnectRetryCount = 0
publishState(.connected, detail: "Connected to \(configuration.displayTarget)") publishState(.connected, detail: "Connected to \(configuration.displayTarget)")
reconcileForwardsLocked() reconcileForwardsLocked()
} }
@ -885,6 +914,11 @@ private final class WorkspaceRemoteSessionController {
return false return false
} }
/// Builds the suffix appended to an error detail to announce the next scheduled retry.
///
/// - Parameters:
///   - retry: Retry number to display.
///   - delay: Delay before the retry fires; rounded to the nearest whole second
///     and never shown as less than 1.
/// - Returns: A string of the form `" (retry N in Ss)"`, including the leading space.
private static func retrySuffix(retry: Int, delay: TimeInterval) -> String {
    // `Int(_:)` traps on NaN, infinite, or out-of-range doubles. `Int(exactly:)`
    // returns nil for those inputs, so fall back to the 1-second floor instead of crashing.
    let wholeSeconds = Int(exactly: delay.rounded()) ?? 1
    let seconds = max(1, wholeSeconds)
    return " (retry \(retry) in \(seconds)s)"
}
private static func isLoopbackPortAvailable(port: Int) -> Bool { private static func isLoopbackPortAvailable(port: Int) -> Bool {
guard port > 0 && port <= 65535 else { return false } guard port > 0 && port <= 65535 else { return false }

View file

@ -64,16 +64,29 @@ Minimum RPC surface:
Protocol requirement: Protocol requirement:
1. multiplexed framed streams (control + PTY + proxy data) 1. multiplexed framed streams (control + PTY + proxy data)
## 6. Proxying ## 6. Web Proxying (Browser-First)
Proxy endpoints (loopback only by default): Goal: remote workspaces browse from the remote host network, without per-service local port forwards.
1. HTTP CONNECT
2. SOCKS5
Behavior: Model:
1. requests tunnel to daemon, daemon dials destinations 1. `cmux ssh` creates/uses one **proxy endpoint per SSH transport** (not per workspace, not per destination port).
2. websocket must work in both proxy modes 2. Browser panels opened in remote workspaces are auto-wired to that endpoint.
3. local bind conflicts return structured errors (+ optional next-port fallback) 3. Terminal/service port forwarding is **not** the browser path; keep it opt-in for explicit localhost workflows only.
Implementation:
1. local `cmuxd` runs a transport-scoped proxy broker (`127.0.0.1:<ephemeral>`), supporting:
- HTTP CONNECT
- SOCKS5
2. broker opens multiplexed proxy streams to `cmuxd-remote`; remote daemon performs outbound dials.
3. browser wiring uses workspace-scoped `WKWebsiteDataStore.proxyConfigurations`:
- primary: SOCKS5 (`ProxyConfiguration(socksv5Proxy:)`)
- fallback: HTTP CONNECT (`ProxyConfiguration(httpCONNECTProxy:)`)
4. browser panels in non-remote workspaces use no forced proxy config.
Failure + reconnect:
1. if proxy endpoint bind fails, return structured `proxy_unavailable` with actionable detail.
2. if the transport drops, browser requests fail fast and the workspace status shows the reconnect state and retry count.
3. after reconnect, proxy broker and WKWebView proxy config are revalidated automatically.
## 7. Reconnect Semantics ## 7. Reconnect Semantics
@ -111,13 +124,13 @@ All cases require deterministic `MUST` assertions.
| ID | Scenario | MUST Assertions | | ID | Scenario | MUST Assertions |
|---|---|---| |---|---|---|
| W-001 | HTTP CONNECT | fixture response matches expected body | | W-001 | browser auto wiring | remote workspace browser gets daemon-backed proxy automatically |
| W-002 | SOCKS5 | response parity with direct remote | | W-002 | remote egress proof | remote workspace browser egress IP matches remote host, not local host |
| W-003 | websocket via CONNECT | echo integrity, no unexpected close | | W-003 | websocket via CONNECT | echo integrity, no unexpected close |
| W-004 | websocket via SOCKS5 | echo integrity | | W-004 | websocket via SOCKS5 | echo integrity |
| W-005 | port conflict | structured conflict error + fallback behavior | | W-005 | proxy listener conflict | structured `proxy_unavailable` + fallback bind behavior |
| W-006 | concurrent PTY + proxy load | no PTY stall; proxy latency/error budget met | | W-006 | concurrent PTY + proxy load | no PTY stall; proxy latency/error budget met |
| W-007 | browser auto wiring | browser workflow uses daemon-backed proxy automatically when remote session is active | | W-007 | reconnect continuity | after transport reconnect, browser traffic resumes without manual proxy reconfiguration |
### 8.3 Reconnect ### 8.3 Reconnect
@ -147,5 +160,5 @@ All cases require deterministic `MUST` assertions.
## 10. Open Decisions ## 10. Open Decisions
1. proxy endpoint scope: per daemon transport vs per workspace 1. reconnect retry budget and backoff profile
2. reconnect retry budget and backoff profile 2. proxy auth policy (none vs optional credentials for local broker)

View file

@ -171,6 +171,7 @@ def main() -> int:
last_daemon = last_remote.get("daemon") or {} last_daemon = last_remote.get("daemon") or {}
detail = str(last_daemon.get("detail") or "") detail = str(last_daemon.get("detail") or "")
_must("bootstrap failed" in detail.lower(), f"daemon error should mention bootstrap failure: {last_status}") _must("bootstrap failed" in detail.lower(), f"daemon error should mention bootstrap failure: {last_status}")
_must(re.search(r"retry\s+\d+", detail.lower()) is not None, f"daemon error should include retry count: {last_status}")
# Lifecycle regression: disconnect with clear should reset remote/daemon metadata. # Lifecycle regression: disconnect with clear should reset remote/daemon metadata.
disconnected = client._call( disconnected = client._call(