Fix claude-hook stop teardown races (#1954)

* Add regression test for claude-hook stop teardown errors

* Handle claude-hook stop teardown races
This commit is contained in:
Austin Wang 2026-03-22 17:34:33 -07:00 committed by GitHub
parent c9616c59e9
commit e8cf83ca57
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 352 additions and 77 deletions

View file

@ -10173,15 +10173,18 @@ struct CMUXCLI {
"has_surface_flag": optionValue(hookArgs, name: "--surface") != nil
]
)
let fallbackWorkspaceId = try resolveWorkspaceIdForClaudeHook(workspaceArg, client: client)
let fallbackSurfaceId = try? resolveSurfaceId(surfaceArg, workspaceId: fallbackWorkspaceId, client: client)
switch subcommand {
case "session-start", "active":
telemetry.breadcrumb("claude-hook.session-start")
let workspaceId = fallbackWorkspaceId
let surfaceId = try resolveSurfaceIdForClaudeHook(
surfaceArg,
let workspaceId = try resolvePreferredWorkspaceIdForClaudeHook(
preferred: nil,
fallback: workspaceArg,
client: client
)
let surfaceId = try resolvePreferredSurfaceIdForClaudeHook(
preferred: nil,
fallback: surfaceArg,
workspaceId: workspaceId,
client: client
)
@ -10217,27 +10220,32 @@ struct CMUXCLI {
case "stop", "idle":
telemetry.breadcrumb("claude-hook.stop")
do {
// Turn ended. Don't consume session or clear PID Claude is still alive.
// Notification hook handles user-facing notifications; SessionEnd handles cleanup.
var workspaceId = fallbackWorkspaceId
var surfaceId = surfaceArg
if let sessionId = parsedInput.sessionId,
let mapped = try? sessionStore.lookup(sessionId: sessionId),
let mappedWorkspace = try? resolveWorkspaceIdForClaudeHook(mapped.workspaceId, client: client) {
workspaceId = mappedWorkspace
surfaceId = mapped.surfaceId
}
let mappedSession = parsedInput.sessionId.flatMap { try? sessionStore.lookup(sessionId: $0) }
let workspaceId = try resolvePreferredWorkspaceIdForClaudeHook(
preferred: mappedSession?.workspaceId,
fallback: workspaceArg,
client: client
)
let surfaceId = try resolvePreferredSurfaceIdForClaudeHook(
preferred: mappedSession?.surfaceId,
fallback: surfaceArg,
workspaceId: workspaceId,
client: client
)
// Update session with transcript summary and send completion notification.
let completion = summarizeClaudeHookStop(
parsedInput: parsedInput,
sessionRecord: (try? sessionStore.lookup(sessionId: parsedInput.sessionId ?? ""))
sessionRecord: mappedSession
)
if let sessionId = parsedInput.sessionId, let completion {
try? sessionStore.upsert(
sessionId: sessionId,
workspaceId: workspaceId,
surfaceId: surfaceId ?? "",
surfaceId: surfaceId,
cwd: parsedInput.cwd,
lastSubtitle: completion.subtitle,
lastBody: completion.body
@ -10245,19 +10253,14 @@ struct CMUXCLI {
}
if let completion {
let resolvedSurface = try resolveSurfaceIdForClaudeHook(
surfaceId,
workspaceId: workspaceId,
client: client
)
let title = "Claude Code"
let subtitle = sanitizeNotificationField(completion.subtitle)
let body = sanitizeNotificationField(completion.body)
let payload = "\(title)|\(subtitle)|\(body)"
_ = try? sendV1Command("notify_target \(workspaceId) \(resolvedSurface) \(payload)", client: client)
_ = try? sendV1Command("notify_target \(workspaceId) \(surfaceId) \(payload)", client: client)
}
try setClaudeStatus(
try? setClaudeStatus(
client: client,
workspaceId: workspaceId,
value: "Idle",
@ -10265,15 +10268,23 @@ struct CMUXCLI {
color: "#8E8E93"
)
print("OK")
} catch {
if shouldIgnoreClaudeHookTeardownError(error) {
telemetry.breadcrumb("claude-hook.stop.ignored", data: ["error": String(describing: error)])
print("OK")
return
}
throw error
}
case "prompt-submit":
telemetry.breadcrumb("claude-hook.prompt-submit")
var workspaceId = fallbackWorkspaceId
if let sessionId = parsedInput.sessionId,
let mapped = try? sessionStore.lookup(sessionId: sessionId),
let mappedWorkspace = try? resolveWorkspaceIdForClaudeHook(mapped.workspaceId, client: client) {
workspaceId = mappedWorkspace
}
let mappedSession = parsedInput.sessionId.flatMap { try? sessionStore.lookup(sessionId: $0) }
let workspaceId = try resolvePreferredWorkspaceIdForClaudeHook(
preferred: mappedSession?.workspaceId,
fallback: workspaceArg,
client: client
)
_ = try sendV1Command("clear_notifications --tab=\(workspaceId)", client: client)
try setClaudeStatus(
client: client,
@ -10288,23 +10299,21 @@ struct CMUXCLI {
telemetry.breadcrumb("claude-hook.notification")
var summary = summarizeClaudeHookNotification(rawInput: rawInput)
var workspaceId = fallbackWorkspaceId
var preferredSurface = surfaceArg
if let sessionId = parsedInput.sessionId,
let mapped = try? sessionStore.lookup(sessionId: sessionId),
let mappedWorkspace = try? resolveWorkspaceIdForClaudeHook(mapped.workspaceId, client: client) {
workspaceId = mappedWorkspace
preferredSurface = mapped.surfaceId
// If PreToolUse saved a richer message (e.g. from AskUserQuestion),
// use it instead of the generic notification text.
if let savedBody = mapped.lastBody, !savedBody.isEmpty,
let mappedSession = parsedInput.sessionId.flatMap { try? sessionStore.lookup(sessionId: $0) }
let workspaceId = try resolvePreferredWorkspaceIdForClaudeHook(
preferred: mappedSession?.workspaceId,
fallback: workspaceArg,
client: client
)
if let mappedSession,
let savedBody = mappedSession.lastBody, !savedBody.isEmpty,
summary.body.contains("needs your attention") || summary.body.contains("needs your input") {
summary = (subtitle: mapped.lastSubtitle ?? summary.subtitle, body: savedBody)
}
summary = (subtitle: mappedSession.lastSubtitle ?? summary.subtitle, body: savedBody)
}
let surfaceId = try resolveSurfaceIdForClaudeHook(
preferredSurface,
let surfaceId = try resolvePreferredSurfaceIdForClaudeHook(
preferred: mappedSession?.surfaceId,
fallback: surfaceArg,
workspaceId: workspaceId,
client: client
)
@ -10341,6 +10350,21 @@ struct CMUXCLI {
// Only clear when we are the primary cleanup path (Stop didn't fire first).
// If Stop already consumed the session, consumedSession is nil and we skip
// to avoid wiping the completion notification that Stop just delivered.
let mappedSession = parsedInput.sessionId.flatMap { try? sessionStore.lookup(sessionId: $0) }
let fallbackWorkspaceId = try? resolvePreferredWorkspaceIdForClaudeHook(
preferred: mappedSession?.workspaceId,
fallback: workspaceArg,
client: client
)
let fallbackSurfaceId: String? = {
guard let fallbackWorkspaceId else { return nil }
return try? resolvePreferredSurfaceIdForClaudeHook(
preferred: mappedSession?.surfaceId,
fallback: surfaceArg,
workspaceId: fallbackWorkspaceId,
client: client
)
}()
let consumedSession = try? sessionStore.consume(
sessionId: parsedInput.sessionId,
workspaceId: fallbackWorkspaceId,
@ -10358,14 +10382,13 @@ struct CMUXCLI {
telemetry.breadcrumb("claude-hook.pre-tool-use")
// Clears "Needs input" status and notification when Claude resumes work
// (e.g. after permission grant). Runs async so it doesn't block tool execution.
var workspaceId = fallbackWorkspaceId
var claudePid: Int? = nil
if let sessionId = parsedInput.sessionId,
let mapped = try? sessionStore.lookup(sessionId: sessionId),
let mappedWorkspace = try? resolveWorkspaceIdForClaudeHook(mapped.workspaceId, client: client) {
workspaceId = mappedWorkspace
claudePid = mapped.pid
}
let mappedSession = parsedInput.sessionId.flatMap { try? sessionStore.lookup(sessionId: $0) }
let workspaceId = try resolvePreferredWorkspaceIdForClaudeHook(
preferred: mappedSession?.workspaceId,
fallback: workspaceArg,
client: client
)
let claudePid = mappedSession?.pid
// AskUserQuestion means Claude is about to ask the user something.
// Save question text in session so the Notification handler can use it
@ -10442,6 +10465,70 @@ struct CMUXCLI {
_ = try client.send(command: "clear_status claude_code --tab=\(workspaceId)")
}
private func resolvePreferredWorkspaceIdForClaudeHook(
preferred: String?,
fallback: String?,
client: SocketClient
) throws -> String {
if let preferred = nonEmptyClaudeHookIdentifier(preferred) {
if isUUID(preferred) {
return preferred
}
return try resolveWorkspaceIdForClaudeHook(preferred, client: client)
}
if let fallback = nonEmptyClaudeHookIdentifier(fallback), isUUID(fallback) {
return fallback
}
return try resolveWorkspaceIdForClaudeHook(fallback, client: client)
}
private func resolvePreferredSurfaceIdForClaudeHook(
preferred: String?,
fallback: String?,
workspaceId: String,
client: SocketClient
) throws -> String {
if let preferred = nonEmptyClaudeHookIdentifier(preferred) {
if isUUID(preferred) {
return preferred
}
return try resolveSurfaceIdForClaudeHook(preferred, workspaceId: workspaceId, client: client)
}
if let fallback = nonEmptyClaudeHookIdentifier(fallback), isUUID(fallback) {
return fallback
}
return try resolveSurfaceIdForClaudeHook(fallback, workspaceId: workspaceId, client: client)
}
private func nonEmptyClaudeHookIdentifier(_ value: String?) -> String? {
guard let trimmed = value?.trimmingCharacters(in: .whitespacesAndNewlines),
!trimmed.isEmpty else {
return nil
}
return trimmed
}
private func shouldIgnoreClaudeHookTeardownError(_ error: Error) -> Bool {
let message = String(describing: error).lowercased()
let benignFragments = [
"tabmanager not available",
"no workspace selected",
"workspace not found",
"workspace ref not found",
"workspace index not found",
"surface not found",
"surface ref not found",
"surface index not found",
"unable to resolve surface id",
"panel not found",
"tab not found",
"failed to write to socket",
"socket read error",
"not connected"
]
return benignFragments.contains { message.contains($0) }
}
private func describeAskUserQuestion(_ object: [String: Any]?) -> String? {
guard let object,
let input = object["tool_input"] as? [String: Any],

View file

@ -0,0 +1,188 @@
#!/usr/bin/env python3
"""
Regression test: claude-hook stop should not fail when workspace teardown makes
TabManager unavailable before the final status update runs.
"""
from __future__ import annotations
import glob
import json
import os
import shutil
import socket
import subprocess
import tempfile
import threading
import time
import uuid
from pathlib import Path
def resolve_cmux_cli() -> str:
explicit = os.environ.get("CMUX_CLI_BIN") or os.environ.get("CMUX_CLI")
if explicit and os.path.exists(explicit) and os.access(explicit, os.X_OK):
return explicit
candidates: list[str] = []
candidates.extend(glob.glob(os.path.expanduser("~/Library/Developer/Xcode/DerivedData/*/Build/Products/Debug/cmux")))
candidates.extend(glob.glob("/tmp/cmux-*/Build/Products/Debug/cmux"))
candidates = [p for p in candidates if os.path.exists(p) and os.access(p, os.X_OK)]
if candidates:
candidates.sort(key=os.path.getmtime, reverse=True)
return candidates[0]
in_path = shutil.which("cmux")
if in_path:
return in_path
raise RuntimeError("Unable to find cmux CLI binary. Set CMUX_CLI_BIN.")
class TeardownUnavailableServer:
def __init__(self, socket_path: str):
self.socket_path = socket_path
self.commands: list[str] = []
self.ready = threading.Event()
self.error: Exception | None = None
self._thread = threading.Thread(target=self._run, daemon=True)
def start(self) -> None:
self._thread.start()
def wait_ready(self, timeout: float) -> bool:
return self.ready.wait(timeout)
def join(self, timeout: float) -> None:
self._thread.join(timeout=timeout)
def _response_for(self, line: str) -> str:
stripped = line.strip()
if stripped.startswith("{"):
request = json.loads(stripped)
return json.dumps(
{
"id": request.get("id"),
"ok": False,
"error": {
"code": "unavailable",
"message": "TabManager not available",
},
}
)
return "ERROR: TabManager not available"
def _run(self) -> None:
server = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
try:
if os.path.exists(self.socket_path):
os.remove(self.socket_path)
server.bind(self.socket_path)
server.listen(1)
server.settimeout(6.0)
self.ready.set()
conn, _ = server.accept()
with conn:
conn.settimeout(0.5)
buffer = b""
idle_deadline = time.time() + 6.0
while time.time() < idle_deadline:
try:
chunk = conn.recv(4096)
except socket.timeout:
continue
if not chunk:
break
buffer += chunk
while b"\n" in buffer:
raw_line, buffer = buffer.split(b"\n", 1)
line = raw_line.decode("utf-8")
if not line:
continue
self.commands.append(line)
response = self._response_for(line)
conn.sendall((response + "\n").encode("utf-8"))
if not self.commands:
raise RuntimeError("cmux CLI never sent a command to the teardown test socket")
except Exception as exc: # pragma: no cover - explicit failure surfacing
self.error = exc
self.ready.set()
finally:
server.close()
def main() -> int:
try:
cli_path = resolve_cmux_cli()
except Exception as exc:
print(f"FAIL: {exc}")
return 1
temp_dir = tempfile.TemporaryDirectory(prefix="cmux-claude-hook-stop-")
try:
root = Path(temp_dir.name)
socket_path = str(root / "cmux.sock")
state_path = root / "claude-hook-state.json"
server = TeardownUnavailableServer(socket_path)
server.start()
if not server.wait_ready(2.0):
print("FAIL: teardown socket server did not become ready")
return 1
if server.error is not None:
print(f"FAIL: teardown socket server failed to start: {server.error}")
return 1
env = os.environ.copy()
env["CMUX_SOCKET_PATH"] = socket_path
env["CMUX_WORKSPACE_ID"] = str(uuid.uuid4())
env["CMUX_SURFACE_ID"] = str(uuid.uuid4())
env["CMUX_CLAUDE_HOOK_STATE_PATH"] = str(state_path)
env["CMUX_CLI_SENTRY_DISABLED"] = "1"
env["CMUX_CLAUDE_HOOK_SENTRY_DISABLED"] = "1"
proc = subprocess.run(
[cli_path, "--socket", socket_path, "claude-hook", "stop"],
input=json.dumps({"session_id": f"sess-{uuid.uuid4().hex}"}),
text=True,
capture_output=True,
env=env,
timeout=8,
check=False,
)
server.join(timeout=2.0)
if server.error is not None:
print(f"FAIL: teardown socket server error: {server.error}")
return 1
if proc.returncode != 0:
print("FAIL: expected claude-hook stop to ignore teardown-time unavailable errors")
print(f"stdout={proc.stdout!r}")
print(f"stderr={proc.stderr!r}")
print(f"commands={server.commands!r}")
return 1
if proc.stdout.strip() != "OK":
print("FAIL: expected claude-hook stop to print OK")
print(f"stdout={proc.stdout!r}")
print(f"stderr={proc.stderr!r}")
print(f"commands={server.commands!r}")
return 1
if not server.commands:
print("FAIL: expected the CLI to send at least one command to the teardown socket")
return 1
print("PASS: claude-hook stop ignores teardown-time TabManager unavailable responses")
return 0
finally:
temp_dir.cleanup()
if __name__ == "__main__":
raise SystemExit(main())