feat(tasks): add coalescing queue and task lifecycle guards

- Coalescing queue: use HasPendingTaskForIssue (queued/dispatched only) instead of HasActiveTaskForIssue so comments during a running task enqueue exactly one follow-up task that picks up all new comments.
- Stale task cleanup: runtime sweeper now fails orphaned tasks when their runtime goes offline (daemon crash/network partition).
- Cancel-aware daemon: handleTask checks task status after execution and discards results if the task was cancelled mid-run (e.g. reassign).
- Terminal issue guard: ClaimTaskForRuntime auto-cancels pending tasks for done/cancelled issues instead of executing them.
- Race condition safety net: unique partial index ensures at most one pending task per issue at the DB level.
2026-03-29 17:52:35 +08:00 · 2026-03-29 17:52:35 +08:00 · b112d1f1ae
commit b112d1f1ae
parent 32f795e1ef
13 changed files with 148 additions and 3 deletions
--- a/server/pkg/db/generated/agent.sql.go
+++ b/server/pkg/db/generated/agent.sql.go
@ -358,6 +358,22 @@ func (q *Queries) HasActiveTaskForIssue(ctx context.Context, issueID pgtype.UUID
 	return has_active, err
 }

+const hasPendingTaskForIssue = `-- name: HasPendingTaskForIssue :one
+SELECT count(*) > 0 AS has_pending FROM agent_task_queue
+WHERE issue_id = $1 AND status IN ('queued', 'dispatched')
+`
+
+// HasPendingTaskForIssue reports whether agent_task_queue holds at least one
+// task for issueID whose status is 'queued' or 'dispatched'; tasks in any other
+// status (e.g. 'running') are not counted. Intended for the coalescing queue:
+// enqueue a follow-up while a task runs, but skip if one is already pending.
+func (q *Queries) HasPendingTaskForIssue(ctx context.Context, issueID pgtype.UUID) (bool, error) {
+	row := q.db.QueryRow(ctx, hasPendingTaskForIssue, issueID)
+	var has_pending bool
+	err := row.Scan(&has_pending)
+	return has_pending, err
+}
+
 const listAgentTasks = `-- name: ListAgentTasks :many
 SELECT id, agent_id, issue_id, status, priority, dispatched_at, started_at, completed_at, result, error, created_at, context, runtime_id, session_id, work_dir FROM agent_task_queue
 WHERE agent_id = $1
--- a/server/pkg/db/generated/runtime.sql.go
+++ b/server/pkg/db/generated/runtime.sql.go
@ -11,6 +11,44 @@ import (
 	"github.com/jackc/pgx/v5/pgtype"
 )

+const failTasksForOfflineRuntimes = `-- name: FailTasksForOfflineRuntimes :many
+UPDATE agent_task_queue
+SET status = 'failed', completed_at = now(), error = 'runtime went offline'
+WHERE status IN ('dispatched', 'running')
+  AND runtime_id IN (
+    SELECT id FROM agent_runtime WHERE status = 'offline'
+  )
+RETURNING id, agent_id, issue_id
+`
+
+type FailTasksForOfflineRuntimesRow struct {
+	ID      pgtype.UUID `json:"id"`
+	AgentID pgtype.UUID `json:"agent_id"`
+	IssueID pgtype.UUID `json:"issue_id"`
+}
+
+// FailTasksForOfflineRuntimes marks every 'dispatched' or 'running' task whose runtime_id refers to an agent_runtime row with status 'offline' as 'failed' (completed_at = now(), error = 'runtime went offline'), cleaning up tasks orphaned by a daemon crash or network partition.
+// It returns the id, agent_id and issue_id of each updated task; the slice is empty but non-nil when nothing matched, and nil alongside the error if the query, a row scan or row iteration fails.
+func (q *Queries) FailTasksForOfflineRuntimes(ctx context.Context) ([]FailTasksForOfflineRuntimesRow, error) {
+	rows, err := q.db.Query(ctx, failTasksForOfflineRuntimes)
+	if err != nil {
+		return nil, err
+	}
+	defer rows.Close()
+	items := []FailTasksForOfflineRuntimesRow{}
+	for rows.Next() {
+		var i FailTasksForOfflineRuntimesRow
+		if err := rows.Scan(&i.ID, &i.AgentID, &i.IssueID); err != nil {
+			return nil, err
+		}
+		items = append(items, i)
+	}
+	if err := rows.Err(); err != nil {
+		return nil, err
+	}
+	return items, nil
+}
+
 const getAgentRuntime = `-- name: GetAgentRuntime :one
 SELECT id, workspace_id, daemon_id, name, runtime_mode, provider, status, device_info, metadata, last_seen_at, created_at, updated_at FROM agent_runtime
 WHERE id = $1