fix(runtime): add server-side sweeper to detect stale runtimes
The only path to marking a runtime offline was the daemon's deregister call on graceful shutdown. If the daemon crashed, was killed, or lost network, the status stayed "online" forever. Add a background goroutine that sweeps every 30s and marks runtimes offline after 45s without a heartbeat (3 missed intervals).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
586c3bf470
commit
b3bbf92a1d
4 changed files with 110 additions and 0 deletions
|
|
@ -105,6 +105,39 @@ func (q *Queries) ListAgentRuntimes(ctx context.Context, workspaceID pgtype.UUID
|
|||
return items, nil
|
||||
}
|
||||
|
||||
// markStaleRuntimesOffline is the SQL text behind Queries.MarkStaleRuntimesOffline.
//
// It updates every agent_runtime row whose status is 'online' and whose
// last_seen_at is older than now() minus $1 seconds, setting status to
// 'offline' and updated_at to now(). The id and workspace_id of each updated
// row are returned.
const markStaleRuntimesOffline = `-- name: MarkStaleRuntimesOffline :many
UPDATE agent_runtime
SET status = 'offline', updated_at = now()
WHERE status = 'online'
AND last_seen_at < now() - make_interval(secs => $1::double precision)
RETURNING id, workspace_id
`
|
||||
|
||||
type MarkStaleRuntimesOfflineRow struct {
|
||||
ID pgtype.UUID `json:"id"`
|
||||
WorkspaceID pgtype.UUID `json:"workspace_id"`
|
||||
}
|
||||
|
||||
func (q *Queries) MarkStaleRuntimesOffline(ctx context.Context, staleSeconds float64) ([]MarkStaleRuntimesOfflineRow, error) {
|
||||
rows, err := q.db.Query(ctx, markStaleRuntimesOffline, staleSeconds)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer rows.Close()
|
||||
items := []MarkStaleRuntimesOfflineRow{}
|
||||
for rows.Next() {
|
||||
var i MarkStaleRuntimesOfflineRow
|
||||
if err := rows.Scan(&i.ID, &i.WorkspaceID); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
items = append(items, i)
|
||||
}
|
||||
if err := rows.Err(); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return items, nil
|
||||
}
|
||||
|
||||
const setAgentRuntimeOffline = `-- name: SetAgentRuntimeOffline :exec
|
||||
UPDATE agent_runtime
|
||||
SET status = 'offline', updated_at = now()
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue