fix(daemon): prevent stuck tasks from blocking queue and add concurrent execution
- Expand FailAgentTask SQL to accept dispatched OR running status - Add FailStaleTasks server-side sweeper (dispatched >5min, running >2.5h) - Fix daemon handleTask to fail tasks on all error paths (StartTask, CompleteTask) - Make daemon poll loop concurrent with semaphore (default 20 parallel tasks) - Raise default agent max_concurrent_tasks from 1 to 6 (migration 023) - Add --max-concurrent-tasks CLI flag and MULTICA_DAEMON_MAX_CONCURRENT_TASKS env
This commit is contained in:
parent
00c54232ad
commit
67f1f49b09
11 changed files with 261 additions and 88 deletions
|
|
@ -229,7 +229,7 @@ func (q *Queries) DeleteAgent(ctx context.Context, id pgtype.UUID) error {
|
|||
const failAgentTask = `-- name: FailAgentTask :one
|
||||
UPDATE agent_task_queue
|
||||
SET status = 'failed', completed_at = now(), error = $2
|
||||
WHERE id = $1 AND status = 'running'
|
||||
WHERE id = $1 AND status IN ('dispatched', 'running')
|
||||
RETURNING id, agent_id, issue_id, status, priority, dispatched_at, started_at, completed_at, result, error, created_at, context, runtime_id, session_id, work_dir
|
||||
`
|
||||
|
||||
|
|
@ -261,6 +261,48 @@ func (q *Queries) FailAgentTask(ctx context.Context, arg FailAgentTaskParams) (A
|
|||
return i, err
|
||||
}
|
||||
|
||||
const failStaleTasks = `-- name: FailStaleTasks :many
|
||||
UPDATE agent_task_queue
|
||||
SET status = 'failed', completed_at = now(), error = 'task timed out'
|
||||
WHERE (status = 'dispatched' AND dispatched_at < now() - make_interval(secs => $1::double precision))
|
||||
OR (status = 'running' AND started_at < now() - make_interval(secs => $2::double precision))
|
||||
RETURNING id, agent_id, issue_id
|
||||
`
|
||||
|
||||
type FailStaleTasksParams struct {
|
||||
DispatchTimeoutSecs float64 `json:"dispatch_timeout_secs"`
|
||||
RunningTimeoutSecs float64 `json:"running_timeout_secs"`
|
||||
}
|
||||
|
||||
type FailStaleTasksRow struct {
|
||||
ID pgtype.UUID `json:"id"`
|
||||
AgentID pgtype.UUID `json:"agent_id"`
|
||||
IssueID pgtype.UUID `json:"issue_id"`
|
||||
}
|
||||
|
||||
// Fails tasks stuck in dispatched/running beyond the given thresholds.
|
||||
// Handles cases where the daemon is alive but the task is orphaned
|
||||
// (e.g. agent process hung, daemon failed to report completion).
|
||||
func (q *Queries) FailStaleTasks(ctx context.Context, arg FailStaleTasksParams) ([]FailStaleTasksRow, error) {
|
||||
rows, err := q.db.Query(ctx, failStaleTasks, arg.DispatchTimeoutSecs, arg.RunningTimeoutSecs)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer rows.Close()
|
||||
items := []FailStaleTasksRow{}
|
||||
for rows.Next() {
|
||||
var i FailStaleTasksRow
|
||||
if err := rows.Scan(&i.ID, &i.AgentID, &i.IssueID); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
items = append(items, i)
|
||||
}
|
||||
if err := rows.Err(); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return items, nil
|
||||
}
|
||||
|
||||
const getAgent = `-- name: GetAgent :one
|
||||
SELECT id, workspace_id, name, avatar_url, runtime_mode, runtime_config, visibility, status, max_concurrent_tasks, owner_id, created_at, updated_at, description, tools, triggers, runtime_id, instructions FROM agent
|
||||
WHERE id = $1
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue