fix(server): broadcast sweeper task:failed events to correct workspace

The runtime sweeper was publishing task:failed events without a
WorkspaceID, causing them to be silently dropped by the WS listener.
This meant frontends never received notification when stale/orphaned
tasks were failed by the sweeper — the live log card kept showing
"Agent is working" and the agent status remained "working" indefinitely.

- Look up workspace_id from issue table for each swept task
- Set WorkspaceID on published events so they reach the correct WS room
- Reconcile agent status after sweeping so agents return to "idle"
This commit is contained in:
Jiayuan 2026-04-02 03:20:59 +08:00
parent 85d00fde57
commit eb35bc5dc9

View file

@ -5,6 +5,7 @@ import (
"log/slog" "log/slog"
"time" "time"
"github.com/jackc/pgx/v5/pgtype"
"github.com/multica-ai/multica/server/internal/events" "github.com/multica-ai/multica/server/internal/events"
"github.com/multica-ai/multica/server/internal/util" "github.com/multica-ai/multica/server/internal/util"
db "github.com/multica-ai/multica/server/pkg/db/generated" db "github.com/multica-ai/multica/server/pkg/db/generated"
@ -72,18 +73,7 @@ func sweepStaleRuntimes(ctx context.Context, queries *db.Queries, bus *events.Bu
slog.Warn("runtime sweeper: failed to clean up stale tasks", "error", err) slog.Warn("runtime sweeper: failed to clean up stale tasks", "error", err)
} else if len(failedTasks) > 0 { } else if len(failedTasks) > 0 {
slog.Info("runtime sweeper: failed orphaned tasks", "count", len(failedTasks)) slog.Info("runtime sweeper: failed orphaned tasks", "count", len(failedTasks))
for _, ft := range failedTasks { broadcastFailedTasks(ctx, queries, bus, failedTasks)
bus.Publish(events.Event{
Type: protocol.EventTaskFailed,
ActorType: "system",
Payload: map[string]any{
"task_id": util.UUIDToString(ft.ID),
"agent_id": util.UUIDToString(ft.AgentID),
"issue_id": util.UUIDToString(ft.IssueID),
"status": "failed",
},
})
}
} }
// Notify frontend clients so they re-fetch runtime list. // Notify frontend clients so they re-fetch runtime list.
@ -118,10 +108,44 @@ func sweepStaleTasks(ctx context.Context, queries *db.Queries, bus *events.Bus)
} }
slog.Info("task sweeper: failed stale tasks", "count", len(failedTasks)) slog.Info("task sweeper: failed stale tasks", "count", len(failedTasks))
for _, ft := range failedTasks { broadcastFailedTasks(ctx, queries, bus, failedTasks)
}
// failedTask is the common shape that both sweeper result row types
// (db.FailStaleTasksRow and db.FailTasksForOfflineRuntimesRow) are copied
// into by broadcastFailedTasks. It is a plain struct, not an interface.
type failedTask struct {
// ID is published as "task_id" in the task-failed event payload.
ID pgtype.UUID
// AgentID is published as "agent_id" and is also used to collect the set
// of agents whose status is reconciled after broadcasting.
AgentID pgtype.UUID
// IssueID is passed to queries.GetIssue to find the workspace the event
// is published to.
IssueID pgtype.UUID
}
// broadcastFailedTasks publishes task:failed events with the correct WorkspaceID
// and reconciles agent status for all affected agents.
func broadcastFailedTasks(ctx context.Context, queries *db.Queries, bus *events.Bus, tasks any) {
var items []failedTask
switch ts := tasks.(type) {
case []db.FailStaleTasksRow:
for _, t := range ts {
items = append(items, failedTask{ID: t.ID, AgentID: t.AgentID, IssueID: t.IssueID})
}
case []db.FailTasksForOfflineRuntimesRow:
for _, t := range ts {
items = append(items, failedTask{ID: t.ID, AgentID: t.AgentID, IssueID: t.IssueID})
}
}
affectedAgents := make(map[string]pgtype.UUID)
for _, ft := range items {
// Look up workspace ID from the issue so the event reaches the right WS room.
workspaceID := ""
if issue, err := queries.GetIssue(ctx, ft.IssueID); err == nil {
workspaceID = util.UUIDToString(issue.WorkspaceID)
}
bus.Publish(events.Event{ bus.Publish(events.Event{
Type: protocol.EventTaskFailed, Type: protocol.EventTaskFailed,
ActorType: "system", WorkspaceID: workspaceID,
ActorType: "system",
Payload: map[string]any{ Payload: map[string]any{
"task_id": util.UUIDToString(ft.ID), "task_id": util.UUIDToString(ft.ID),
"agent_id": util.UUIDToString(ft.AgentID), "agent_id": util.UUIDToString(ft.AgentID),
@ -129,5 +153,38 @@ func sweepStaleTasks(ctx context.Context, queries *db.Queries, bus *events.Bus)
"status": "failed", "status": "failed",
}, },
}) })
agentKey := util.UUIDToString(ft.AgentID)
affectedAgents[agentKey] = ft.AgentID
}
// Reconcile status for each affected agent.
for _, agentID := range affectedAgents {
reconcileAgentStatus(ctx, queries, bus, agentID)
} }
} }
// reconcileAgentStatus recomputes an agent's status from its running task
// count, stores it, and publishes the result.
//
// The status is "working" when CountRunningTasks reports at least one running
// task and "idle" otherwise. After UpdateAgentStatus succeeds, an agent status
// event is published with the WorkspaceID, ID and Status of the returned agent
// row.
//
// Reconciliation is best-effort: if either query fails, the failure is logged
// and the function returns without publishing an event.
func reconcileAgentStatus(ctx context.Context, queries *db.Queries, bus *events.Bus, agentID pgtype.UUID) {
	running, err := queries.CountRunningTasks(ctx, agentID)
	if err != nil {
		// Previously swallowed silently; log it so a stuck "working" agent can
		// be traced back to the failed reconciliation.
		slog.Warn("sweeper: failed to count running tasks for agent",
			"agent_id", util.UUIDToString(agentID), "error", err)
		return
	}

	newStatus := "idle"
	if running > 0 {
		newStatus = "working"
	}

	agent, err := queries.UpdateAgentStatus(ctx, db.UpdateAgentStatusParams{
		ID:     agentID,
		Status: newStatus,
	})
	if err != nil {
		slog.Warn("sweeper: failed to update agent status",
			"agent_id", util.UUIDToString(agentID), "status", newStatus, "error", err)
		return
	}

	// Publish from the stored row so the event carries the workspace and
	// status returned by the update rather than the locally computed values.
	bus.Publish(events.Event{
		Type:        protocol.EventAgentStatus,
		WorkspaceID: util.UUIDToString(agent.WorkspaceID),
		ActorType:   "system",
		Payload:     map[string]any{"agent_id": util.UUIDToString(agent.ID), "status": agent.Status},
	})
}