fix(daemon): prevent stuck tasks from blocking queue and add concurrent execution
- Expand FailAgentTask SQL to accept dispatched OR running status
- Add FailStaleTasks server-side sweeper (dispatched >5min, running >2.5h)
- Fix daemon handleTask to fail tasks on all error paths (StartTask, CompleteTask)
- Make daemon poll loop concurrent with semaphore (default 20 parallel tasks)
- Raise default agent max_concurrent_tasks from 1 to 6 (migration 023)
- Add --max-concurrent-tasks CLI flag and MULTICA_DAEMON_MAX_CONCURRENT_TASKS env
This commit is contained in:
parent
00c54232ad
commit
67f1f49b09
11 changed files with 261 additions and 88 deletions
|
|
@ -61,6 +61,7 @@ func init() {
|
||||||
f.Duration("poll-interval", 0, "Task poll interval (env: MULTICA_DAEMON_POLL_INTERVAL)")
|
f.Duration("poll-interval", 0, "Task poll interval (env: MULTICA_DAEMON_POLL_INTERVAL)")
|
||||||
f.Duration("heartbeat-interval", 0, "Heartbeat interval (env: MULTICA_DAEMON_HEARTBEAT_INTERVAL)")
|
f.Duration("heartbeat-interval", 0, "Heartbeat interval (env: MULTICA_DAEMON_HEARTBEAT_INTERVAL)")
|
||||||
f.Duration("agent-timeout", 0, "Per-task timeout (env: MULTICA_AGENT_TIMEOUT)")
|
f.Duration("agent-timeout", 0, "Per-task timeout (env: MULTICA_AGENT_TIMEOUT)")
|
||||||
|
f.Int("max-concurrent-tasks", 0, "Max tasks running in parallel (env: MULTICA_DAEMON_MAX_CONCURRENT_TASKS)")
|
||||||
|
|
||||||
daemonLogsCmd.Flags().BoolP("follow", "f", false, "Follow log output")
|
daemonLogsCmd.Flags().BoolP("follow", "f", false, "Follow log output")
|
||||||
daemonLogsCmd.Flags().IntP("lines", "n", 50, "Number of lines to show")
|
daemonLogsCmd.Flags().IntP("lines", "n", 50, "Number of lines to show")
|
||||||
|
|
@ -187,6 +188,9 @@ func buildDaemonStartArgs(cmd *cobra.Command) []string {
|
||||||
if d, _ := cmd.Flags().GetDuration("agent-timeout"); d > 0 {
|
if d, _ := cmd.Flags().GetDuration("agent-timeout"); d > 0 {
|
||||||
args = append(args, "--agent-timeout", d.String())
|
args = append(args, "--agent-timeout", d.String())
|
||||||
}
|
}
|
||||||
|
if n, _ := cmd.Flags().GetInt("max-concurrent-tasks"); n > 0 {
|
||||||
|
args = append(args, "--max-concurrent-tasks", strconv.Itoa(n))
|
||||||
|
}
|
||||||
|
|
||||||
// Forward global persistent flags.
|
// Forward global persistent flags.
|
||||||
if v, _ := cmd.Flags().GetString("server-url"); v != "" {
|
if v, _ := cmd.Flags().GetString("server-url"); v != "" {
|
||||||
|
|
@ -212,6 +216,9 @@ func runDaemonForeground(cmd *cobra.Command) error {
|
||||||
if d, _ := cmd.Flags().GetDuration("agent-timeout"); d > 0 {
|
if d, _ := cmd.Flags().GetDuration("agent-timeout"); d > 0 {
|
||||||
overrides.AgentTimeout = d
|
overrides.AgentTimeout = d
|
||||||
}
|
}
|
||||||
|
if n, _ := cmd.Flags().GetInt("max-concurrent-tasks"); n > 0 {
|
||||||
|
overrides.MaxConcurrentTasks = n
|
||||||
|
}
|
||||||
|
|
||||||
cfg, err := daemon.LoadConfig(overrides)
|
cfg, err := daemon.LoadConfig(overrides)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
|
|
||||||
|
|
@ -12,16 +12,24 @@ import (
|
||||||
)
|
)
|
||||||
|
|
||||||
const (
|
const (
|
||||||
// sweepInterval is how often we check for stale runtimes.
|
// sweepInterval is how often we check for stale runtimes and tasks.
|
||||||
sweepInterval = 30 * time.Second
|
sweepInterval = 30 * time.Second
|
||||||
// staleThresholdSeconds marks runtimes offline if no heartbeat for this long.
|
// staleThresholdSeconds marks runtimes offline if no heartbeat for this long.
|
||||||
// The daemon heartbeat interval is 15s, so 45s = 3 missed heartbeats.
|
// The daemon heartbeat interval is 15s, so 45s = 3 missed heartbeats.
|
||||||
staleThresholdSeconds = 45.0
|
staleThresholdSeconds = 45.0
|
||||||
|
// dispatchTimeoutSeconds fails tasks stuck in 'dispatched' beyond this.
|
||||||
|
// The dispatched→running transition should be near-instant, so 5 minutes
|
||||||
|
// means something went wrong (e.g. StartTask API call failed silently).
|
||||||
|
dispatchTimeoutSeconds = 300.0
|
||||||
|
// runningTimeoutSeconds fails tasks stuck in 'running' beyond this.
|
||||||
|
// The default agent timeout is 2h, so 2.5h gives a generous buffer.
|
||||||
|
runningTimeoutSeconds = 9000.0
|
||||||
)
|
)
|
||||||
|
|
||||||
// runRuntimeSweeper periodically marks runtimes as offline if their
|
// runRuntimeSweeper periodically marks runtimes as offline if their
|
||||||
// last_seen_at exceeds the stale threshold. This handles cases where the
|
// last_seen_at exceeds the stale threshold, and fails orphaned tasks.
|
||||||
// daemon crashes or is killed without calling the deregister endpoint.
|
// This handles cases where the daemon crashes, is killed without calling
|
||||||
|
// the deregister endpoint, or leaves tasks in a non-terminal state.
|
||||||
func runRuntimeSweeper(ctx context.Context, queries *db.Queries, bus *events.Bus) {
|
func runRuntimeSweeper(ctx context.Context, queries *db.Queries, bus *events.Bus) {
|
||||||
ticker := time.NewTicker(sweepInterval)
|
ticker := time.NewTicker(sweepInterval)
|
||||||
defer ticker.Stop()
|
defer ticker.Stop()
|
||||||
|
|
@ -31,55 +39,95 @@ func runRuntimeSweeper(ctx context.Context, queries *db.Queries, bus *events.Bus
|
||||||
case <-ctx.Done():
|
case <-ctx.Done():
|
||||||
return
|
return
|
||||||
case <-ticker.C:
|
case <-ticker.C:
|
||||||
staleRows, err := queries.MarkStaleRuntimesOffline(ctx, staleThresholdSeconds)
|
sweepStaleRuntimes(ctx, queries, bus)
|
||||||
if err != nil {
|
sweepStaleTasks(ctx, queries, bus)
|
||||||
slog.Warn("runtime sweeper: failed to mark stale runtimes offline", "error", err)
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
if len(staleRows) == 0 {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
|
|
||||||
// Collect unique workspace IDs to notify.
|
|
||||||
workspaces := make(map[string]bool)
|
|
||||||
for _, row := range staleRows {
|
|
||||||
wsID := util.UUIDToString(row.WorkspaceID)
|
|
||||||
workspaces[wsID] = true
|
|
||||||
}
|
|
||||||
|
|
||||||
slog.Info("runtime sweeper: marked stale runtimes offline", "count", len(staleRows), "workspaces", len(workspaces))
|
|
||||||
|
|
||||||
// Fail orphaned tasks (dispatched/running) whose runtimes just went offline.
|
|
||||||
failedTasks, err := queries.FailTasksForOfflineRuntimes(ctx)
|
|
||||||
if err != nil {
|
|
||||||
slog.Warn("runtime sweeper: failed to clean up stale tasks", "error", err)
|
|
||||||
} else if len(failedTasks) > 0 {
|
|
||||||
slog.Info("runtime sweeper: failed orphaned tasks", "count", len(failedTasks))
|
|
||||||
for _, ft := range failedTasks {
|
|
||||||
bus.Publish(events.Event{
|
|
||||||
Type: protocol.EventTaskFailed,
|
|
||||||
ActorType: "system",
|
|
||||||
Payload: map[string]any{
|
|
||||||
"task_id": util.UUIDToString(ft.ID),
|
|
||||||
"agent_id": util.UUIDToString(ft.AgentID),
|
|
||||||
"issue_id": util.UUIDToString(ft.IssueID),
|
|
||||||
"status": "failed",
|
|
||||||
},
|
|
||||||
})
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Notify frontend clients so they re-fetch runtime list.
|
|
||||||
for wsID := range workspaces {
|
|
||||||
bus.Publish(events.Event{
|
|
||||||
Type: protocol.EventDaemonRegister,
|
|
||||||
WorkspaceID: wsID,
|
|
||||||
ActorType: "system",
|
|
||||||
Payload: map[string]any{
|
|
||||||
"action": "stale_sweep",
|
|
||||||
},
|
|
||||||
})
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// sweepStaleRuntimes marks runtimes offline if they haven't heartbeated,
|
||||||
|
// then fails any tasks belonging to those offline runtimes.
|
||||||
|
func sweepStaleRuntimes(ctx context.Context, queries *db.Queries, bus *events.Bus) {
|
||||||
|
staleRows, err := queries.MarkStaleRuntimesOffline(ctx, staleThresholdSeconds)
|
||||||
|
if err != nil {
|
||||||
|
slog.Warn("runtime sweeper: failed to mark stale runtimes offline", "error", err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if len(staleRows) == 0 {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
// Collect unique workspace IDs to notify.
|
||||||
|
workspaces := make(map[string]bool)
|
||||||
|
for _, row := range staleRows {
|
||||||
|
wsID := util.UUIDToString(row.WorkspaceID)
|
||||||
|
workspaces[wsID] = true
|
||||||
|
}
|
||||||
|
|
||||||
|
slog.Info("runtime sweeper: marked stale runtimes offline", "count", len(staleRows), "workspaces", len(workspaces))
|
||||||
|
|
||||||
|
// Fail orphaned tasks (dispatched/running) whose runtimes just went offline.
|
||||||
|
failedTasks, err := queries.FailTasksForOfflineRuntimes(ctx)
|
||||||
|
if err != nil {
|
||||||
|
slog.Warn("runtime sweeper: failed to clean up stale tasks", "error", err)
|
||||||
|
} else if len(failedTasks) > 0 {
|
||||||
|
slog.Info("runtime sweeper: failed orphaned tasks", "count", len(failedTasks))
|
||||||
|
for _, ft := range failedTasks {
|
||||||
|
bus.Publish(events.Event{
|
||||||
|
Type: protocol.EventTaskFailed,
|
||||||
|
ActorType: "system",
|
||||||
|
Payload: map[string]any{
|
||||||
|
"task_id": util.UUIDToString(ft.ID),
|
||||||
|
"agent_id": util.UUIDToString(ft.AgentID),
|
||||||
|
"issue_id": util.UUIDToString(ft.IssueID),
|
||||||
|
"status": "failed",
|
||||||
|
},
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Notify frontend clients so they re-fetch runtime list.
|
||||||
|
for wsID := range workspaces {
|
||||||
|
bus.Publish(events.Event{
|
||||||
|
Type: protocol.EventDaemonRegister,
|
||||||
|
WorkspaceID: wsID,
|
||||||
|
ActorType: "system",
|
||||||
|
Payload: map[string]any{
|
||||||
|
"action": "stale_sweep",
|
||||||
|
},
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// sweepStaleTasks fails tasks stuck in dispatched/running for too long,
|
||||||
|
// even when the runtime is still online. This handles cases where:
|
||||||
|
// - The agent process hangs and the daemon is still heartbeating
|
||||||
|
// - The daemon failed to report task completion/failure
|
||||||
|
// - A server restart left tasks in a non-terminal state
|
||||||
|
func sweepStaleTasks(ctx context.Context, queries *db.Queries, bus *events.Bus) {
|
||||||
|
failedTasks, err := queries.FailStaleTasks(ctx, db.FailStaleTasksParams{
|
||||||
|
DispatchTimeoutSecs: dispatchTimeoutSeconds,
|
||||||
|
RunningTimeoutSecs: runningTimeoutSeconds,
|
||||||
|
})
|
||||||
|
if err != nil {
|
||||||
|
slog.Warn("task sweeper: failed to clean up stale tasks", "error", err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if len(failedTasks) == 0 {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
slog.Info("task sweeper: failed stale tasks", "count", len(failedTasks))
|
||||||
|
for _, ft := range failedTasks {
|
||||||
|
bus.Publish(events.Event{
|
||||||
|
Type: protocol.EventTaskFailed,
|
||||||
|
ActorType: "system",
|
||||||
|
Payload: map[string]any{
|
||||||
|
"task_id": util.UUIDToString(ft.ID),
|
||||||
|
"agent_id": util.UUIDToString(ft.AgentID),
|
||||||
|
"issue_id": util.UUIDToString(ft.IssueID),
|
||||||
|
"status": "failed",
|
||||||
|
},
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
|
||||||
|
|
@ -18,34 +18,37 @@ const (
|
||||||
DefaultRuntimeName = "Local Agent"
|
DefaultRuntimeName = "Local Agent"
|
||||||
DefaultConfigReloadInterval = 5 * time.Second
|
DefaultConfigReloadInterval = 5 * time.Second
|
||||||
DefaultHealthPort = 19514
|
DefaultHealthPort = 19514
|
||||||
|
DefaultMaxConcurrentTasks = 20
|
||||||
)
|
)
|
||||||
|
|
||||||
// Config holds all daemon configuration.
|
// Config holds all daemon configuration.
|
||||||
type Config struct {
|
type Config struct {
|
||||||
ServerBaseURL string
|
ServerBaseURL string
|
||||||
DaemonID string
|
DaemonID string
|
||||||
DeviceName string
|
DeviceName string
|
||||||
RuntimeName string
|
RuntimeName string
|
||||||
Agents map[string]AgentEntry // "claude" -> entry, "codex" -> entry
|
Agents map[string]AgentEntry // "claude" -> entry, "codex" -> entry
|
||||||
WorkspacesRoot string // base path for execution envs (default: ~/multica_workspaces)
|
WorkspacesRoot string // base path for execution envs (default: ~/multica_workspaces)
|
||||||
KeepEnvAfterTask bool // preserve env after task for debugging
|
KeepEnvAfterTask bool // preserve env after task for debugging
|
||||||
HealthPort int // local HTTP port for health checks (default: 19514)
|
HealthPort int // local HTTP port for health checks (default: 19514)
|
||||||
PollInterval time.Duration
|
MaxConcurrentTasks int // max tasks running in parallel (default: 20)
|
||||||
HeartbeatInterval time.Duration
|
PollInterval time.Duration
|
||||||
AgentTimeout time.Duration
|
HeartbeatInterval time.Duration
|
||||||
|
AgentTimeout time.Duration
|
||||||
}
|
}
|
||||||
|
|
||||||
// Overrides allows CLI flags to override environment variables and defaults.
|
// Overrides allows CLI flags to override environment variables and defaults.
|
||||||
// Zero values are ignored and the env/default value is used instead.
|
// Zero values are ignored and the env/default value is used instead.
|
||||||
type Overrides struct {
|
type Overrides struct {
|
||||||
ServerURL string
|
ServerURL string
|
||||||
WorkspacesRoot string
|
WorkspacesRoot string
|
||||||
PollInterval time.Duration
|
PollInterval time.Duration
|
||||||
HeartbeatInterval time.Duration
|
HeartbeatInterval time.Duration
|
||||||
AgentTimeout time.Duration
|
AgentTimeout time.Duration
|
||||||
DaemonID string
|
MaxConcurrentTasks int
|
||||||
DeviceName string
|
DaemonID string
|
||||||
RuntimeName string
|
DeviceName string
|
||||||
|
RuntimeName string
|
||||||
}
|
}
|
||||||
|
|
||||||
// LoadConfig builds the daemon configuration from environment variables
|
// LoadConfig builds the daemon configuration from environment variables
|
||||||
|
|
@ -112,6 +115,14 @@ func LoadConfig(overrides Overrides) (Config, error) {
|
||||||
agentTimeout = overrides.AgentTimeout
|
agentTimeout = overrides.AgentTimeout
|
||||||
}
|
}
|
||||||
|
|
||||||
|
maxConcurrentTasks, err := intFromEnv("MULTICA_DAEMON_MAX_CONCURRENT_TASKS", DefaultMaxConcurrentTasks)
|
||||||
|
if err != nil {
|
||||||
|
return Config{}, err
|
||||||
|
}
|
||||||
|
if overrides.MaxConcurrentTasks > 0 {
|
||||||
|
maxConcurrentTasks = overrides.MaxConcurrentTasks
|
||||||
|
}
|
||||||
|
|
||||||
// String overrides
|
// String overrides
|
||||||
daemonID := envOrDefault("MULTICA_DAEMON_ID", host)
|
daemonID := envOrDefault("MULTICA_DAEMON_ID", host)
|
||||||
if overrides.DaemonID != "" {
|
if overrides.DaemonID != "" {
|
||||||
|
|
@ -149,17 +160,18 @@ func LoadConfig(overrides Overrides) (Config, error) {
|
||||||
keepEnv := os.Getenv("MULTICA_KEEP_ENV_AFTER_TASK") == "true" || os.Getenv("MULTICA_KEEP_ENV_AFTER_TASK") == "1"
|
keepEnv := os.Getenv("MULTICA_KEEP_ENV_AFTER_TASK") == "true" || os.Getenv("MULTICA_KEEP_ENV_AFTER_TASK") == "1"
|
||||||
|
|
||||||
return Config{
|
return Config{
|
||||||
ServerBaseURL: serverBaseURL,
|
ServerBaseURL: serverBaseURL,
|
||||||
DaemonID: daemonID,
|
DaemonID: daemonID,
|
||||||
DeviceName: deviceName,
|
DeviceName: deviceName,
|
||||||
RuntimeName: runtimeName,
|
RuntimeName: runtimeName,
|
||||||
Agents: agents,
|
Agents: agents,
|
||||||
WorkspacesRoot: workspacesRoot,
|
WorkspacesRoot: workspacesRoot,
|
||||||
KeepEnvAfterTask: keepEnv,
|
KeepEnvAfterTask: keepEnv,
|
||||||
HealthPort: DefaultHealthPort,
|
HealthPort: DefaultHealthPort,
|
||||||
PollInterval: pollInterval,
|
MaxConcurrentTasks: maxConcurrentTasks,
|
||||||
HeartbeatInterval: heartbeatInterval,
|
PollInterval: pollInterval,
|
||||||
AgentTimeout: agentTimeout,
|
HeartbeatInterval: heartbeatInterval,
|
||||||
|
AgentTimeout: agentTimeout,
|
||||||
}, nil
|
}, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -503,11 +503,22 @@ func (d *Daemon) usageScanLoop(ctx context.Context) {
|
||||||
}
|
}
|
||||||
|
|
||||||
func (d *Daemon) pollLoop(ctx context.Context) error {
|
func (d *Daemon) pollLoop(ctx context.Context) error {
|
||||||
|
sem := make(chan struct{}, d.cfg.MaxConcurrentTasks)
|
||||||
|
var wg sync.WaitGroup
|
||||||
|
|
||||||
pollOffset := 0
|
pollOffset := 0
|
||||||
pollCount := 0
|
pollCount := 0
|
||||||
for {
|
for {
|
||||||
select {
|
select {
|
||||||
case <-ctx.Done():
|
case <-ctx.Done():
|
||||||
|
d.logger.Info("poll loop stopping, waiting for in-flight tasks", "max_wait", "30s")
|
||||||
|
waitDone := make(chan struct{})
|
||||||
|
go func() { wg.Wait(); close(waitDone) }()
|
||||||
|
select {
|
||||||
|
case <-waitDone:
|
||||||
|
case <-time.After(30 * time.Second):
|
||||||
|
d.logger.Warn("timed out waiting for in-flight tasks")
|
||||||
|
}
|
||||||
return ctx.Err()
|
return ctx.Err()
|
||||||
default:
|
default:
|
||||||
}
|
}
|
||||||
|
|
@ -515,6 +526,7 @@ func (d *Daemon) pollLoop(ctx context.Context) error {
|
||||||
runtimeIDs := d.allRuntimeIDs()
|
runtimeIDs := d.allRuntimeIDs()
|
||||||
if len(runtimeIDs) == 0 {
|
if len(runtimeIDs) == 0 {
|
||||||
if err := sleepWithContext(ctx, d.cfg.PollInterval); err != nil {
|
if err := sleepWithContext(ctx, d.cfg.PollInterval); err != nil {
|
||||||
|
wg.Wait()
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
continue
|
continue
|
||||||
|
|
@ -523,21 +535,40 @@ func (d *Daemon) pollLoop(ctx context.Context) error {
|
||||||
claimed := false
|
claimed := false
|
||||||
n := len(runtimeIDs)
|
n := len(runtimeIDs)
|
||||||
for i := 0; i < n; i++ {
|
for i := 0; i < n; i++ {
|
||||||
|
// Check if we have capacity before claiming.
|
||||||
|
select {
|
||||||
|
case sem <- struct{}{}:
|
||||||
|
// Acquired a slot.
|
||||||
|
default:
|
||||||
|
// All slots occupied, stop trying to claim.
|
||||||
|
d.logger.Debug("poll: at capacity", "running", d.cfg.MaxConcurrentTasks)
|
||||||
|
goto sleep
|
||||||
|
}
|
||||||
|
|
||||||
rid := runtimeIDs[(pollOffset+i)%n]
|
rid := runtimeIDs[(pollOffset+i)%n]
|
||||||
task, err := d.client.ClaimTask(ctx, rid)
|
task, err := d.client.ClaimTask(ctx, rid)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
<-sem // Release the slot.
|
||||||
d.logger.Warn("claim task failed", "runtime_id", rid, "error", err)
|
d.logger.Warn("claim task failed", "runtime_id", rid, "error", err)
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
if task != nil {
|
if task != nil {
|
||||||
d.logger.Info("task received", "task_id", task.ID, "issue_id", task.IssueID)
|
d.logger.Info("task received", "task_id", task.ID, "issue_id", task.IssueID)
|
||||||
d.handleTask(ctx, *task)
|
wg.Add(1)
|
||||||
|
go func(t Task) {
|
||||||
|
defer wg.Done()
|
||||||
|
defer func() { <-sem }()
|
||||||
|
d.handleTask(ctx, t)
|
||||||
|
}(*task)
|
||||||
claimed = true
|
claimed = true
|
||||||
pollOffset = (pollOffset + i + 1) % n
|
pollOffset = (pollOffset + i + 1) % n
|
||||||
break
|
break
|
||||||
}
|
}
|
||||||
|
// No task for this runtime, release the slot and try next.
|
||||||
|
<-sem
|
||||||
}
|
}
|
||||||
|
|
||||||
|
sleep:
|
||||||
if !claimed {
|
if !claimed {
|
||||||
pollCount++
|
pollCount++
|
||||||
if pollCount%20 == 1 {
|
if pollCount%20 == 1 {
|
||||||
|
|
@ -545,6 +576,7 @@ func (d *Daemon) pollLoop(ctx context.Context) error {
|
||||||
}
|
}
|
||||||
pollOffset = (pollOffset + 1) % n
|
pollOffset = (pollOffset + 1) % n
|
||||||
if err := sleepWithContext(ctx, d.cfg.PollInterval); err != nil {
|
if err := sleepWithContext(ctx, d.cfg.PollInterval); err != nil {
|
||||||
|
wg.Wait()
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
|
|
@ -562,6 +594,9 @@ func (d *Daemon) handleTask(ctx context.Context, task Task) {
|
||||||
|
|
||||||
if err := d.client.StartTask(ctx, task.ID); err != nil {
|
if err := d.client.StartTask(ctx, task.ID); err != nil {
|
||||||
d.logger.Error("start task failed", "task_id", task.ID, "error", err)
|
d.logger.Error("start task failed", "task_id", task.ID, "error", err)
|
||||||
|
if failErr := d.client.FailTask(ctx, task.ID, fmt.Sprintf("start task failed: %s", err.Error())); failErr != nil {
|
||||||
|
d.logger.Error("fail task after start error", "task_id", task.ID, "error", failErr)
|
||||||
|
}
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -594,7 +629,10 @@ func (d *Daemon) handleTask(ctx context.Context, task Task) {
|
||||||
default:
|
default:
|
||||||
d.logger.Info("task completed", "task_id", task.ID, "status", result.Status)
|
d.logger.Info("task completed", "task_id", task.ID, "status", result.Status)
|
||||||
if err := d.client.CompleteTask(ctx, task.ID, result.Comment, result.BranchName, result.SessionID, result.WorkDir); err != nil {
|
if err := d.client.CompleteTask(ctx, task.ID, result.Comment, result.BranchName, result.SessionID, result.WorkDir); err != nil {
|
||||||
d.logger.Error("complete task failed", "task_id", task.ID, "error", err)
|
d.logger.Error("complete task failed, falling back to fail", "task_id", task.ID, "error", err)
|
||||||
|
if failErr := d.client.FailTask(ctx, task.ID, fmt.Sprintf("complete task failed: %s", err.Error())); failErr != nil {
|
||||||
|
d.logger.Error("fail task fallback also failed", "task_id", task.ID, "error", failErr)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -4,6 +4,7 @@ import (
|
||||||
"context"
|
"context"
|
||||||
"fmt"
|
"fmt"
|
||||||
"os"
|
"os"
|
||||||
|
"strconv"
|
||||||
"strings"
|
"strings"
|
||||||
"time"
|
"time"
|
||||||
)
|
)
|
||||||
|
|
@ -28,6 +29,18 @@ func durationFromEnv(key string, fallback time.Duration) (time.Duration, error)
|
||||||
return d, nil
|
return d, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// intFromEnv reads the environment variable named key and parses it as a
// decimal integer.
//
// Leading and trailing whitespace in the value is ignored. When the variable
// is unset or blank, fallback is returned with a nil error. When the value
// cannot be parsed, the result is 0 and an error that names the variable,
// quotes the offending value, and wraps the underlying strconv error.
//
// No range check is performed: zero and negative values are returned as-is,
// so a caller that needs a strictly positive number (for example a channel
// capacity) must validate the result itself.
func intFromEnv(key string, fallback int) (int, error) {
	raw := strings.TrimSpace(os.Getenv(key))
	if len(raw) == 0 {
		return fallback, nil
	}

	parsed, parseErr := strconv.Atoi(raw)
	if parseErr == nil {
		return parsed, nil
	}
	return 0, fmt.Errorf("%s: invalid integer %q: %w", key, raw, parseErr)
}
|
||||||
|
|
||||||
func sleepWithContext(ctx context.Context, d time.Duration) error {
|
func sleepWithContext(ctx context.Context, d time.Duration) error {
|
||||||
timer := time.NewTimer(d)
|
timer := time.NewTimer(d)
|
||||||
defer timer.Stop()
|
defer timer.Stop()
|
||||||
|
|
|
||||||
|
|
@ -251,7 +251,7 @@ func (h *Handler) CreateAgent(w http.ResponseWriter, r *http.Request) {
|
||||||
req.Visibility = "workspace"
|
req.Visibility = "workspace"
|
||||||
}
|
}
|
||||||
if req.MaxConcurrentTasks == 0 {
|
if req.MaxConcurrentTasks == 0 {
|
||||||
req.MaxConcurrentTasks = 1
|
req.MaxConcurrentTasks = 6
|
||||||
}
|
}
|
||||||
|
|
||||||
runtime, err := h.Queries.GetAgentRuntimeForWorkspace(r.Context(), db.GetAgentRuntimeForWorkspaceParams{
|
runtime, err := h.Queries.GetAgentRuntimeForWorkspace(r.Context(), db.GetAgentRuntimeForWorkspaceParams{
|
||||||
|
|
|
||||||
|
|
@ -209,7 +209,7 @@ func (s *TaskService) FailTask(ctx context.Context, taskID pgtype.UUID, errMsg s
|
||||||
})
|
})
|
||||||
if err != nil {
|
if err != nil {
|
||||||
if existing, lookupErr := s.Queries.GetAgentTask(ctx, taskID); lookupErr == nil {
|
if existing, lookupErr := s.Queries.GetAgentTask(ctx, taskID); lookupErr == nil {
|
||||||
slog.Warn("fail task failed: task not in running state",
|
slog.Warn("fail task failed: task not in dispatched/running state",
|
||||||
"task_id", util.UUIDToString(taskID),
|
"task_id", util.UUIDToString(taskID),
|
||||||
"current_status", existing.Status,
|
"current_status", existing.Status,
|
||||||
"issue_id", util.UUIDToString(existing.IssueID),
|
"issue_id", util.UUIDToString(existing.IssueID),
|
||||||
|
|
|
||||||
1
server/migrations/023_agent_concurrency_default.down.sql
Normal file
1
server/migrations/023_agent_concurrency_default.down.sql
Normal file
|
|
@ -0,0 +1 @@
|
||||||
|
ALTER TABLE agent ALTER COLUMN max_concurrent_tasks SET DEFAULT 1;
|
||||||
2
server/migrations/023_agent_concurrency_default.up.sql
Normal file
2
server/migrations/023_agent_concurrency_default.up.sql
Normal file
|
|
@ -0,0 +1,2 @@
|
||||||
|
ALTER TABLE agent ALTER COLUMN max_concurrent_tasks SET DEFAULT 6;
|
||||||
|
UPDATE agent SET max_concurrent_tasks = 6 WHERE max_concurrent_tasks = 1;
|
||||||
|
|
@ -229,7 +229,7 @@ func (q *Queries) DeleteAgent(ctx context.Context, id pgtype.UUID) error {
|
||||||
const failAgentTask = `-- name: FailAgentTask :one
|
const failAgentTask = `-- name: FailAgentTask :one
|
||||||
UPDATE agent_task_queue
|
UPDATE agent_task_queue
|
||||||
SET status = 'failed', completed_at = now(), error = $2
|
SET status = 'failed', completed_at = now(), error = $2
|
||||||
WHERE id = $1 AND status = 'running'
|
WHERE id = $1 AND status IN ('dispatched', 'running')
|
||||||
RETURNING id, agent_id, issue_id, status, priority, dispatched_at, started_at, completed_at, result, error, created_at, context, runtime_id, session_id, work_dir
|
RETURNING id, agent_id, issue_id, status, priority, dispatched_at, started_at, completed_at, result, error, created_at, context, runtime_id, session_id, work_dir
|
||||||
`
|
`
|
||||||
|
|
||||||
|
|
@ -261,6 +261,48 @@ func (q *Queries) FailAgentTask(ctx context.Context, arg FailAgentTaskParams) (A
|
||||||
return i, err
|
return i, err
|
||||||
}
|
}
|
||||||
|
|
||||||
|
const failStaleTasks = `-- name: FailStaleTasks :many
|
||||||
|
UPDATE agent_task_queue
|
||||||
|
SET status = 'failed', completed_at = now(), error = 'task timed out'
|
||||||
|
WHERE (status = 'dispatched' AND dispatched_at < now() - make_interval(secs => $1::double precision))
|
||||||
|
OR (status = 'running' AND started_at < now() - make_interval(secs => $2::double precision))
|
||||||
|
RETURNING id, agent_id, issue_id
|
||||||
|
`
|
||||||
|
|
||||||
|
type FailStaleTasksParams struct {
|
||||||
|
DispatchTimeoutSecs float64 `json:"dispatch_timeout_secs"`
|
||||||
|
RunningTimeoutSecs float64 `json:"running_timeout_secs"`
|
||||||
|
}
|
||||||
|
|
||||||
|
type FailStaleTasksRow struct {
|
||||||
|
ID pgtype.UUID `json:"id"`
|
||||||
|
AgentID pgtype.UUID `json:"agent_id"`
|
||||||
|
IssueID pgtype.UUID `json:"issue_id"`
|
||||||
|
}
|
||||||
|
|
||||||
|
// Fails tasks stuck in dispatched/running beyond the given thresholds.
|
||||||
|
// Handles cases where the daemon is alive but the task is orphaned
|
||||||
|
// (e.g. agent process hung, daemon failed to report completion).
|
||||||
|
func (q *Queries) FailStaleTasks(ctx context.Context, arg FailStaleTasksParams) ([]FailStaleTasksRow, error) {
|
||||||
|
rows, err := q.db.Query(ctx, failStaleTasks, arg.DispatchTimeoutSecs, arg.RunningTimeoutSecs)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
defer rows.Close()
|
||||||
|
items := []FailStaleTasksRow{}
|
||||||
|
for rows.Next() {
|
||||||
|
var i FailStaleTasksRow
|
||||||
|
if err := rows.Scan(&i.ID, &i.AgentID, &i.IssueID); err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
items = append(items, i)
|
||||||
|
}
|
||||||
|
if err := rows.Err(); err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
return items, nil
|
||||||
|
}
|
||||||
|
|
||||||
const getAgent = `-- name: GetAgent :one
|
const getAgent = `-- name: GetAgent :one
|
||||||
SELECT id, workspace_id, name, avatar_url, runtime_mode, runtime_config, visibility, status, max_concurrent_tasks, owner_id, created_at, updated_at, description, tools, triggers, runtime_id, instructions FROM agent
|
SELECT id, workspace_id, name, avatar_url, runtime_mode, runtime_config, visibility, status, max_concurrent_tasks, owner_id, created_at, updated_at, description, tools, triggers, runtime_id, instructions FROM agent
|
||||||
WHERE id = $1
|
WHERE id = $1
|
||||||
|
|
|
||||||
|
|
@ -90,9 +90,19 @@ LIMIT 1;
|
||||||
-- name: FailAgentTask :one
|
-- name: FailAgentTask :one
|
||||||
UPDATE agent_task_queue
|
UPDATE agent_task_queue
|
||||||
SET status = 'failed', completed_at = now(), error = $2
|
SET status = 'failed', completed_at = now(), error = $2
|
||||||
WHERE id = $1 AND status = 'running'
|
WHERE id = $1 AND status IN ('dispatched', 'running')
|
||||||
RETURNING *;
|
RETURNING *;
|
||||||
|
|
||||||
|
-- name: FailStaleTasks :many
|
||||||
|
-- Fails tasks stuck in dispatched/running beyond the given thresholds.
|
||||||
|
-- Handles cases where the daemon is alive but the task is orphaned
|
||||||
|
-- (e.g. agent process hung, daemon failed to report completion).
|
||||||
|
UPDATE agent_task_queue
|
||||||
|
SET status = 'failed', completed_at = now(), error = 'task timed out'
|
||||||
|
WHERE (status = 'dispatched' AND dispatched_at < now() - make_interval(secs => @dispatch_timeout_secs::double precision))
|
||||||
|
OR (status = 'running' AND started_at < now() - make_interval(secs => @running_timeout_secs::double precision))
|
||||||
|
RETURNING id, agent_id, issue_id;
|
||||||
|
|
||||||
-- name: CountRunningTasks :one
|
-- name: CountRunningTasks :one
|
||||||
SELECT count(*) FROM agent_task_queue
|
SELECT count(*) FROM agent_task_queue
|
||||||
WHERE agent_id = $1 AND status IN ('dispatched', 'running');
|
WHERE agent_id = $1 AND status IN ('dispatched', 'running');
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue