Merge remote-tracking branch 'origin/main' into feat/dashboard

2026-02-12 17:38:49 +08:00 · 2026-02-12 17:38:49 +08:00 · fd098c04eb
commit fd098c04eb
parent 901f5ba804 143d779376
22 changed files with 1329 additions and 130 deletions
--- a/packages/core/src/agent/async-agent.ts
+++ b/packages/core/src/agent/async-agent.ts
@ -10,6 +10,23 @@ import { isHeartbeatAckEvent } from "../hub/heartbeat-filter.js";

 const devNull = { write: () => true } as unknown as NodeJS.WritableStream;

+const WRITEINTERNAL_RETRY_DELAY_MS = 5000; // Delay in ms awaited before the single retry of a transient internal-run failure.
+
+/**
+ * Check if a runInternal error string indicates a transient failure worth retrying.
+ *
+ * Classification is purely textual: the message is lower-cased and tested against a
+ * fixed list of substrings (connection resets, timeouts, "overloaded", ...) plus the
+ * HTTP status codes 429, 502, 503 and 504.
+ *
+ * @param errorMsg - Error text to classify.
+ * @returns `true` if any transient pattern matches, otherwise `false`.
+ */
+function isTransientRunError(errorMsg: string): boolean {
+  const lower = errorMsg.toLowerCase(); // all checks below are case-insensitive
+  if (lower.includes("terminated")) return true; // NOTE(review): may also match deliberate termination — confirm a retry is wanted
+  if (lower.includes("aborted")) return true; // NOTE(review): may also match user-initiated aborts — confirm a retry is wanted
+  if (lower.includes("econnreset")) return true;
+  if (lower.includes("etimedout")) return true;
+  if (lower.includes("socket hang up")) return true;
+  if (lower.includes("fetch failed")) return true;
+  if (lower.includes("timeout") || lower.includes("timed out")) return true;
+  if (/\b(429|502|503|504)\b/.test(lower)) return true; // word boundaries: digits embedded in a longer number or identifier do not match
+  if (lower.includes("overloaded")) return true;
+  return false;
+}
+
 /** Discriminated union of legacy Message, raw AgentEvent, and MulticaEvent */
 export type ChannelItem = Message | AgentEvent | MulticaEvent;

@ -122,30 +139,54 @@ export class AsyncAgent {
      .then(async () => {
        if (this._closed) return;
        const prevForward = this.forwardInternalAssistant;
-        this.forwardInternalAssistant = forwardAssistant;
-        try {
-          const result = await this.agent.runInternal(content);
-          await this.agent.flushSession();
-          if (result.error) {
-            // Internal run errors are for diagnostics only; do not leak to user stream.
-            console.error(`[AsyncAgent] Internal run error: ${result.error}`);
-          }
-          // Stop forwarding BEFORE persist to avoid double-emitting the same
-          // assistant message (once from runInternal streaming, once from appendMessage).
-          this.forwardInternalAssistant = prevForward;
-          // Persist the LLM summary so it remains in parent context for future turns
-          if (persistResponse && result.text?.trim() && !isSilentReplyText(result.text)) {
-            this.agent.persistAssistantSummary(result.text.trim());
+
+        for (let attempt = 1; attempt <= 2; attempt++) {
+          this.forwardInternalAssistant = forwardAssistant;
+          try {
+            const result = await this.agent.runInternal(content);
            await this.agent.flushSession();
+
+            if (result.error) {
+              if (attempt === 1 && isTransientRunError(result.error)) {
+                console.warn(
+                  `[AsyncAgent] Internal run transient error: ${result.error}. Retrying in ${WRITEINTERNAL_RETRY_DELAY_MS}ms...`,
+                );
+                this.forwardInternalAssistant = prevForward;
+                await new Promise((r) => setTimeout(r, WRITEINTERNAL_RETRY_DELAY_MS));
+                continue;
+              }
+              // Final attempt or non-transient: log and give up
+              console.error(`[AsyncAgent] Internal run error: ${result.error}`);
+              this.forwardInternalAssistant = prevForward;
+              return;
+            }
+
+            // Success — stop forwarding BEFORE persist to avoid double-emitting
+            this.forwardInternalAssistant = prevForward;
+            if (persistResponse && result.text?.trim() && !isSilentReplyText(result.text)) {
+              this.agent.persistAssistantSummary(result.text.trim());
+              await this.agent.flushSession();
+            }
+            return;
+          } catch (err) {
+            const message = err instanceof Error ? err.message : String(err);
+            if (attempt === 1 && isTransientRunError(message)) {
+              console.warn(
+                `[AsyncAgent] Internal run exception: ${message}. Retrying in ${WRITEINTERNAL_RETRY_DELAY_MS}ms...`,
+              );
+              this.forwardInternalAssistant = prevForward;
+              await new Promise((r) => setTimeout(r, WRITEINTERNAL_RETRY_DELAY_MS));
+              continue;
+            }
+            console.error(`[AsyncAgent] Internal run failed: ${message}`);
+            this.forwardInternalAssistant = prevForward;
+            return;
          }
-        } finally {
-          this.forwardInternalAssistant = prevForward;
        }
      })
      .catch((err) => {
        const message = err instanceof Error ? err.message : String(err);
-        // Internal run exceptions are for diagnostics only; do not leak to user stream.
-        console.error(`[AsyncAgent] Internal run failed: ${message}`);
+        console.error(`[AsyncAgent] Internal run failed (outer): ${message}`);
      });
  }

--- a/packages/core/src/agent/context-window/token-estimation.test.ts
+++ b/packages/core/src/agent/context-window/token-estimation.test.ts
@ -37,7 +37,7 @@ vi.mock("@mariozechner/pi-coding-agent", () => ({
 describe("token-estimation", () => {
  describe("constants", () => {
    it("should have correct safety margin", () => {
-      expect(ESTIMATION_SAFETY_MARGIN).toBe(1.2);
+      expect(ESTIMATION_SAFETY_MARGIN).toBe(1.5);
    });

    it("should have correct compaction trigger ratio", () => {
@ -63,20 +63,20 @@ describe("token-estimation", () => {
    });

    it("should estimate tokens based on character count", () => {
-      // ~3 chars per token
-      expect(estimateSystemPromptTokens("abc")).toBe(1);
-      expect(estimateSystemPromptTokens("abcdef")).toBe(2);
-      expect(estimateSystemPromptTokens("abcdefghi")).toBe(3);
+      // ~2 chars per token (conservative for CJK/mixed content)
+      expect(estimateSystemPromptTokens("ab")).toBe(1);
+      expect(estimateSystemPromptTokens("abcd")).toBe(2);
+      expect(estimateSystemPromptTokens("abcdef")).toBe(3);
    });

    it("should ceil the result", () => {
-      // 4 chars / 3 = 1.33, should ceil to 2
-      expect(estimateSystemPromptTokens("abcd")).toBe(2);
+      // 3 chars / 2 = 1.5, should ceil to 2
+      expect(estimateSystemPromptTokens("abc")).toBe(2);
    });

    it("should handle long prompts", () => {
      const longPrompt = "a".repeat(3000);
-      expect(estimateSystemPromptTokens(longPrompt)).toBe(1000);
+      expect(estimateSystemPromptTokens(longPrompt)).toBe(1500);
    });
  });

@ -140,7 +140,7 @@ describe("token-estimation", () => {
        reserveTokens: 0,
      });

-      // Utilization = (tokens * 1.2) / available
+      // Utilization = (tokens * 1.5) / available
      expect(result.utilizationRatio).toBeGreaterThan(0);
    });
  });
@ -292,26 +292,26 @@ describe("token-estimation", () => {
        content: "x".repeat(400), // ~100 tokens
      } as AgentMessage;

-      // With safety margin 1.2, 100 * 1.2 = 120 tokens
-      // 120 > 1000 * 0.1 = 100, so oversized
+      // With safety margin 1.5, 100 * 1.5 = 150 tokens
+      // 150 > 1000 * 0.1 = 100, so oversized
      expect(isMessageOversized(message, 1000, 0.1)).toBe(true);

-      // 120 < 1000 * 0.2 = 200, so not oversized
+      // 150 < 1000 * 0.2 = 200, so not oversized
      expect(isMessageOversized(message, 1000, 0.2)).toBe(false);
    });

    it("should apply safety margin to token count", () => {
      const message = {
        role: "user",
-        content: "x".repeat(400), // ~100 tokens, with margin ~120
+        content: "x".repeat(400), // ~100 tokens, with margin ~150
      } as AgentMessage;

      // Without margin: 100 < 250 (50% of 500)
-      // With margin: 120 < 250, still ok
+      // With margin: 150 < 250, still ok
      expect(isMessageOversized(message, 500, 0.5)).toBe(false);

      // Without margin: 100 < 100 would be false
-      // With margin: 120 > 100, should be true
+      // With margin: 150 > 100, should be true
      expect(isMessageOversized(message, 200, 0.5)).toBe(true);
    });
  });
--- a/packages/core/src/agent/context-window/token-estimation.ts
+++ b/packages/core/src/agent/context-window/token-estimation.ts
@ -9,7 +9,7 @@ import { estimateTokens } from "@mariozechner/pi-coding-agent";
 import type { TokenEstimation, TokenAwareCompactionResult } from "./types.js";

 /** Safety margin coefficient to compensate for estimation inaccuracy */
-export const ESTIMATION_SAFETY_MARGIN = 1.2; // 20% buffer
+export const ESTIMATION_SAFETY_MARGIN = 1.5; // 50% buffer (covers CJK and mixed content)

 /** Utilization threshold for triggering compaction */
 export const COMPACTION_TRIGGER_RATIO = 0.8; // 80%
@ -32,10 +32,10 @@ export function estimateMessagesTokens(messages: AgentMessage[]): number {
 */
 export function estimateSystemPromptTokens(systemPrompt: string | undefined): number {
  if (!systemPrompt) return 0;
-  // Simple estimation: ~4 chars = 1 token (for English/code mixed text)
-  // Chinese ~2 chars = 1 token
-  // Average value of 3
-  return Math.ceil(systemPrompt.length / 3);
+  // Conservative estimation: ~2 chars = 1 token, rounded up.
+  // English/code averages ~4 chars/token but CJK averages ~1-2 chars/token.
+  // Using /2 as a conservative default to reduce (not eliminate) underestimation on mixed content.
+  return Math.ceil(systemPrompt.length / 2); // length is measured in UTF-16 code units
 }

 /**
--- a/packages/core/src/agent/errors.ts
+++ b/packages/core/src/agent/errors.ts
@ -0,0 +1,21 @@
+/**
+ * Error classification utilities for agent error handling.
+ */
+
+/**
+ * Check if an error is a context overflow / "prompt too long" error from any LLM provider.
+ *
+ * These errors indicate the request exceeded the model's context window and should
+ * trigger auto-compaction rather than auth profile rotation.
+ *
+ * Detection is a case-insensitive substring match on the error text: `Error`
+ * instances contribute their `message`, any other value is converted with `String()`.
+ *
+ * @param error - The thrown value to classify (any type).
+ * @returns `true` if the text matches one of the known overflow phrases, otherwise `false`.
+ */
+export function isContextOverflowError(error: unknown): boolean {
+  const msg = (error instanceof Error ? error.message : String(error)).toLowerCase();
+  return (
+    msg.includes("prompt is too long") ||
+    msg.includes("context length exceeded") ||
+    msg.includes("maximum context length") ||
+    msg.includes("request_too_large") ||
+    msg.includes("request size exceeds") ||
+    (msg.includes("413") && msg.includes("too large")) // "413" only counts when "too large" is also present
+  );
+}
--- a/packages/core/src/agent/runner.ts
+++ b/packages/core/src/agent/runner.ts
@ -22,7 +22,14 @@ import {
  checkContextWindow,
  DEFAULT_CONTEXT_TOKENS,
  type ContextWindowGuardResult,
+  estimateTokenUsage,
+  COMPACTION_TRIGGER_RATIO,
+  compactMessagesTokenAware,
+  MIN_KEEP_MESSAGES,
 } from "./context-window/index.js";
+import {
+  pruneToolResults,
+} from "./context-window/tool-result-pruning.js";
 import { mergeToolsConfig, type ToolsConfig } from "./tools/policy.js";
 import {
  loadAuthProfileStore,
@ -42,6 +49,7 @@ import {
  sanitizeToolCallInputs,
  sanitizeToolUseResultPairing,
 } from "./session/session-transcript-repair.js";
+import { isContextOverflowError } from "./errors.js";

 // ============================================================
 // Error classification for auth profile rotation
@ -89,11 +97,15 @@ export class Agent {
  private readonly stderr: NodeJS.WritableStream;
  private initialized = false;

+  // Context window settings (for pre-flight compaction)
+  private readonly reserveTokens: number;
+
  // Internal run state
  private _internalRun = false;
  private _isRunning = false;
  private _aborted = false;
  private _runMutex: Promise<void> = Promise.resolve();
+  private _compactionPromise: Promise<void> = Promise.resolve();
  private currentUserDisplayPrompt: string | undefined;

  // MulticaEvent subscribers (parallel to PiAgentCore's subscriber list)
@ -188,8 +200,10 @@ export class Agent {
        return this.currentApiKey;
      },
      transformContext: async (messages) => {
-        const sanitizedInputs = sanitizeToolCallInputs(messages);
-        return sanitizeToolUseResultPairing(sanitizedInputs);
+        let result = sanitizeToolCallInputs(messages);
+        result = sanitizeToolUseResultPairing(result);
+        result = this.preflightCompact(result);
+        return result;
      },
    });

@ -260,6 +274,9 @@ export class Agent {
      ? resolveApiKey(this.resolvedProvider, options.apiKey)
      : undefined;

+    // Store reserveTokens for pre-flight compaction
+    this.reserveTokens = options.reserveTokens ?? 1024;
+
    // 创建 SessionManager（带 context window 配置）
    this.session = new SessionManager({
      sessionId: this.sessionId,
@ -425,6 +442,8 @@ export class Agent {
    prompt: string,
    options?: { displayPrompt?: string },
  ): Promise<AgentRunResult> {
+    // Wait for any in-flight compaction from the previous run
+    await this._compactionPromise;
    await this.ensureInitialized();
    this.refreshAuthState();
    this.output.state.lastAssistantText = "";
@ -444,6 +463,9 @@ export class Agent {
      const canRotate = !this.pinnedProfile && this.profileCandidates.length > 1;
      let lastError: unknown;

+      const MAX_OVERFLOW_COMPACTION_ATTEMPTS = 2;
+      let overflowAttempts = 0;
+
      // Loop to exhaust all candidate profiles on rotatable errors
      while (true) {
        try {
@ -452,6 +474,34 @@ export class Agent {
        } catch (error) {
          lastError = error;

+          // Context overflow recovery: auto-compact and retry before trying auth rotation
+          if (isContextOverflowError(error) && overflowAttempts < MAX_OVERFLOW_COMPACTION_ATTEMPTS) {
+            overflowAttempts++;
+            this.stderr.write(
+              `[context-overflow] Overflow detected (attempt ${overflowAttempts}/${MAX_OVERFLOW_COMPACTION_ATTEMPTS}), compacting...\n`,
+            );
+            const messages = this.agent.state.messages.slice();
+            const result = await this.session.maybeCompact(messages);
+            if (result?.kept) {
+              this.agent.replaceMessages(result.kept);
+              this.output.state.lastAssistantText = "";
+              continue; // retry with compacted messages
+            }
+            // Forced fallback: estimation may diverge from reality (the LLM
+            // already told us the context is too large), so drop the oldest
+            // half of messages even when maybeCompact thinks no compaction is needed.
+            if (messages.length > MIN_KEEP_MESSAGES) {
+              const keepCount = Math.max(MIN_KEEP_MESSAGES, Math.floor(messages.length / 2));
+              const forcedKept = messages.slice(-keepCount);
+              this.stderr.write(
+                `[context-overflow] Forced compaction: ${messages.length} → ${forcedKept.length} messages\n`,
+              );
+              this.agent.replaceMessages(forcedKept);
+              this.output.state.lastAssistantText = "";
+              continue;
+            }
+          }
+
          const reason = classifyError(error);
          if (this.currentProfileId && isRotatableError(reason)) {
            markAuthProfileFailure(this.currentProfileId, reason);
@ -615,35 +665,88 @@ export class Agent {
      // Skip compaction during internal runs — internal messages will be
      // rolled back from memory afterwards, so compacting now would be incorrect.
      if (message.role === "assistant" && !this._internalRun) {
-        void this.maybeCompact();
+        this._compactionPromise = this.maybeCompact().catch((err) => {
+          console.error("[Agent] Compaction failed:", err);
+        });
      }
    }
  }

+  /**
+   * Pre-flight context compaction — runs inside transformContext before every LLM call.
+   * Pure in-memory, no disk writes. Prunes tool results and drops oldest messages
+   * when the estimated token utilization reaches or exceeds the compaction trigger threshold.
+   *
+   * @param messages - Message list about to be sent to the LLM.
+   * @returns The same `messages` array when estimated utilization is below
+   *   COMPACTION_TRIGGER_RATIO; otherwise the list after tool-result pruning and,
+   *   if utilization is still at or above the threshold, token-aware truncation.
+   */
+  private preflightCompact(messages: AgentMessage[]): AgentMessage[] {
+    const estimation = estimateTokenUsage({
+      messages,
+      systemPrompt: this.agent.state.systemPrompt,
+      contextWindowTokens: this.contextWindowGuard.tokens,
+      reserveTokens: this.reserveTokens,
+    });
+
+    if (estimation.utilizationRatio < COMPACTION_TRIGGER_RATIO) {
+      return messages; // fast path: below threshold, input returned untouched
+    }
+
+    const originalCount = messages.length;
+    let result = messages;
+
+    // Phase 1: Prune tool results (soft trim + hard clear)
+    const pruneResult = pruneToolResults({
+      messages: result,
+      contextWindowTokens: this.contextWindowGuard.tokens,
+    });
+    if (pruneResult.changed) {
+      result = pruneResult.messages;
+    }
+
+    // Re-estimate after pruning
+    const afterPrune = estimateTokenUsage({
+      messages: result,
+      systemPrompt: this.agent.state.systemPrompt,
+      contextWindowTokens: this.contextWindowGuard.tokens,
+      reserveTokens: this.reserveTokens,
+    });
+
+    // Phase 2: Drop oldest messages if still over threshold
+    if (afterPrune.utilizationRatio >= COMPACTION_TRIGGER_RATIO) {
+      const compacted = compactMessagesTokenAware(result, afterPrune.availableTokens);
+      if (compacted) { // a falsy result leaves the Phase 1 output in place
+        result = compacted.kept;
+      }
+    }
+
+    if (result.length < originalCount) { // logs only when the message count shrank; content-only pruning is not reported here
+      const saved = originalCount - result.length;
+      this.stderr.write(
+        `[pre-flight compaction] pruned ${saved} messages (${originalCount} → ${result.length})\n`,
+      );
+    }
+
+    return result;
+  }
+
  private async maybeCompact() {
    const messages = this.agent.state.messages.slice();
    if (!this.session.needsCompaction(messages)) return;

-    try {
-      const result = await this.session.maybeCompact(messages);
-      if (!result) return;
+    const result = await this.session.maybeCompact(messages); // rejections propagate to the caller; there is no local try/catch
+    if (!result) return;

-      this.emitMulticaEvent({ type: "compaction_start" });
-      if (result?.kept) {
-        this.agent.replaceMessages(result.kept);
-      }
-      const endEvent: CompactionEndEvent = {
-        type: "compaction_end",
-        removed: result?.removedCount ?? 0,
-        kept: result?.kept.length ?? messages.length,
-        tokensRemoved: result?.tokensRemoved,
-        tokensKept: result?.tokensKept,
-        reason: result?.reason ?? "tokens",
-      };
-      this.emitMulticaEvent(endEvent);
-    } catch (err) {
-      throw err;
+    this.emitMulticaEvent({ type: "compaction_start" }); // emitted only after session.maybeCompact() has already returned a result
+    if (result.kept) {
+      this.agent.replaceMessages(result.kept);
    }
+    const endEvent: CompactionEndEvent = {
+      type: "compaction_end",
+      removed: result.removedCount ?? 0,
+      kept: result.kept.length ?? messages.length, // NOTE(review): `.length` is never nullish, so this fallback cannot fire; if `result.kept` can be absent (as the guard above suggests) this access throws instead — confirm
+      tokensRemoved: result.tokensRemoved,
+      tokensKept: result.tokensKept,
+      reason: result.reason ?? "tokens",
+    };
+    this.emitMulticaEvent(endEvent);
  }

  /**
--- a/packages/core/src/agent/session/compaction.test.ts
+++ b/packages/core/src/agent/session/compaction.test.ts
@ -44,7 +44,7 @@ vi.mock("../context-window/index.js", async () => {
      const systemPromptTokens = params.systemPrompt ? 100 : 0;
      const reserve = params.reserveTokens ?? 1024;
      const availableTokens = Math.max(0, params.contextWindowTokens - systemPromptTokens - reserve);
-      const utilizationRatio = availableTokens > 0 ? (messageTokens * 1.2) / availableTokens : 1;
+      const utilizationRatio = availableTokens > 0 ? (messageTokens * 1.5) / availableTokens : 1;

      return {
        messageTokens,
@ -234,7 +234,7 @@ describe("compaction", () => {
        // 100 * 10 = 1000 message tokens
        // System: 100 tokens, Reserve: 1024
        // Available: 2000 - 100 - 1024 = 876
-        // Utilization: (1000 * 1.2) / 876 = 1.37 > 0.8
+        // Utilization: (1000 * 1.5) / 876 = 1.71 > 0.8
        const result = compactMessages(messages, {
          mode: "tokens",
          contextWindowTokens: 2000,
@ -249,7 +249,7 @@ describe("compaction", () => {
        const messages = createMessages(5);
        // 5 * 10 = 50 message tokens
        // Available: 10000 - 100 - 1024 = 8876
-        // Utilization: (50 * 1.2) / 8876 = 0.007 < 0.8
+        // Utilization: (50 * 1.5) / 8876 = 0.008 < 0.8
        const result = compactMessages(messages, {
          mode: "tokens",
          contextWindowTokens: 10000,
--- a/packages/core/src/agent/subagent/announce.test.ts
+++ b/packages/core/src/agent/subagent/announce.test.ts
@ -188,7 +188,7 @@ describe("formatCoalescedAnnouncementMessage", () => {

    const msg = formatCoalescedAnnouncementMessage(records);

-    expect(msg).toContain("All 2 background tasks have completed");
+    expect(msg).toContain("All 2 background task(s) have completed");
    expect(msg).toContain('Task 1: "Task A"');
    expect(msg).toContain("Found issue A");
    expect(msg).toContain('Task 2: "Task B"');
@ -251,4 +251,44 @@ describe("formatCoalescedAnnouncementMessage", () => {
    expect(msg).toContain("上海：多云，9°C");
    expect(msg).toContain("MUST include findings from every task item above");
  });
+
+  it("includes continuation prompt when next is provided", () => {
+    const records = [
+      makeRecord({ runId: "run-1", label: "AAPL data", findings: "AAPL revenue: $100B" }),
+      makeRecord({ runId: "run-2", label: "MSFT data", findings: "MSFT revenue: $200B" }),
+    ];
+
+    const msg = formatCoalescedAnnouncementMessage(records, "Summarize all data and write a PDF investment report");
+
+    expect(msg).toContain("CONTINUATION TASK");
+    expect(msg).toContain("Summarize all data and write a PDF investment report");
+    expect(msg).toContain("AAPL revenue: $100B");
+    expect(msg).toContain("MSFT revenue: $200B");
+    // Should NOT contain the default summarize instruction
+    expect(msg).not.toContain("Summarize these results naturally for the user");
+  });
+
+  it("uses continuation prompt even for single record when next is provided", () => {
+    const records = [
+      makeRecord({ runId: "run-1", label: "Data collection", findings: "All data collected" }),
+    ];
+
+    const msg = formatCoalescedAnnouncementMessage(records, "Generate the final report");
+
+    expect(msg).toContain("CONTINUATION TASK");
+    expect(msg).toContain("Generate the final report");
+    expect(msg).toContain("All data collected");
+  });
+
+  it("uses default summarize instruction when next is not provided", () => {
+    const records = [
+      makeRecord({ runId: "run-1" }),
+      makeRecord({ runId: "run-2" }),
+    ];
+
+    const msg = formatCoalescedAnnouncementMessage(records);
+
+    expect(msg).not.toContain("CONTINUATION TASK");
+    expect(msg).toContain("Summarize these results naturally for the user");
+  });
 });
--- a/packages/core/src/agent/subagent/announce.ts
+++ b/packages/core/src/agent/subagent/announce.ts
@ -193,12 +193,17 @@ export function formatAnnouncementMessage(params: FormatAnnouncementParams): str
 /**
 * Format a coalesced announcement message from multiple completed subagent runs.
 * When only one record is provided, delegates to formatAnnouncementMessage.
+ *
+ * @param next — Optional continuation prompt from a SubagentGroup. When present,
+ *   the parent agent is instructed to execute the continuation using the combined
+ *   findings, rather than just summarizing.
 */
 export function formatCoalescedAnnouncementMessage(
  records: SubagentRunRecord[],
+  next?: string,
 ): string {
-  // Single record: delegate to existing format for backward-compatible behavior
-  if (records.length === 1) {
+  // Single record without continuation: delegate to existing format
+  if (records.length === 1 && !next) {
    const r = records[0]!;
    return formatAnnouncementMessage({
      runId: r.runId,
@ -214,10 +219,9 @@ export function formatCoalescedAnnouncementMessage(
    });
  }

-  // Multiple records: build combined message.
-  // Include a strict raw-findings section so parent can reliably cover every task result.
+  // Multiple records (or single with continuation): build combined message.
  const parts: string[] = [
-    `All ${records.length} background tasks have completed. Here are the combined results:`,
+    `All ${records.length} background task(s) have completed. Here are the combined results:`,
    "",
  ];

@ -262,14 +266,30 @@ export function formatCoalescedAnnouncementMessage(
    );
  }

-  parts.push(
-    "",
-    "Summarize these results naturally for the user.",
-    "You MUST include findings from every task item above, without omission.",
-    "Keep it concise, but preserve concrete findings from each task.",
-    "Do not mention technical details like session IDs or that these were background tasks.",
-    "You can respond with NO_REPLY if no announcement is needed.",
-  );
+  // Continuation vs. summarization
+  if (next) {
+    parts.push(
+      "",
+      "---",
+      "",
+      "CONTINUATION TASK: The user's original request requires further work using the findings above.",
+      "Execute the following task now, using ALL the collected data:",
+      "",
+      next,
+      "",
+      "Use the raw findings above as your data source. Call tools as needed to complete this task.",
+      "Do not mention technical details like session IDs or that these were background tasks.",
+    );
+  } else {
+    parts.push(
+      "",
+      "Summarize these results naturally for the user.",
+      "You MUST include findings from every task item above, without omission.",
+      "Keep it concise, but preserve concrete findings from each task.",
+      "Do not mention technical details like session IDs or that these were background tasks.",
+      "You can respond with NO_REPLY if no announcement is needed.",
+    );
+  }

  return parts.join("\n");
 }
@ -289,8 +309,9 @@ export function formatCoalescedAnnouncementMessage(
 export function runCoalescedAnnounceFlow(
  requesterSessionId: string,
  records: SubagentRunRecord[],
+  next?: string,
 ): boolean {
-  const message = formatCoalescedAnnouncementMessage(records);
+  const message = formatCoalescedAnnouncementMessage(records, next);

  try {
    const hub = getHub();
--- a/packages/core/src/agent/subagent/registry-recovery.test.ts
+++ b/packages/core/src/agent/subagent/registry-recovery.test.ts
@ -12,6 +12,7 @@ const rmSyncMock = vi.fn();

 vi.mock("./registry-store.js", () => ({
  loadSubagentRuns: loadSubagentRunsMock,
+  loadSubagentGroups: vi.fn(() => new Map()),
  saveSubagentRuns: saveSubagentRunsMock,
 }));

--- a/packages/core/src/agent/subagent/registry-store.ts
+++ b/packages/core/src/agent/subagent/registry-store.ts
@ -7,7 +7,7 @@
 import { existsSync, mkdirSync, readFileSync, writeFileSync } from "node:fs";
 import { join } from "node:path";
 import { DATA_DIR } from "@multica/utils";
-import type { SubagentRunRecord } from "./types.js";
+import type { SubagentRunRecord, SubagentGroup } from "./types.js";

 const SUBAGENTS_DIR = join(DATA_DIR, "subagents");
 const RUNS_FILE = join(SUBAGENTS_DIR, "runs.json");
@ -15,6 +15,7 @@ const RUNS_FILE = join(SUBAGENTS_DIR, "runs.json");
 interface SubagentRunsStore {
  version: 1;
  runs: Record<string, SubagentRunRecord>;
+  groups?: Record<string, SubagentGroup> | undefined;
 }

 function ensureDir(): void {
@ -48,13 +49,31 @@ export function loadSubagentRuns(): Map<string, SubagentRunRecord> {
  }
 }

-/** Save all subagent runs to disk */
-export function saveSubagentRuns(runs: Map<string, SubagentRunRecord>): void {
+/**
+ * Load all persisted subagent groups from RUNS_FILE.
+ *
+ * Never throws: an empty Map is returned when the file does not exist, cannot be
+ * read or parsed, has a `version` other than 1, or contains no `groups` entry.
+ *
+ * @returns A Map built from the entries of the stored `groups` object.
+ */
+export function loadSubagentGroups(): Map<string, SubagentGroup> {
+  if (!existsSync(RUNS_FILE)) return new Map();
+
+  try {
+    const content = readFileSync(RUNS_FILE, "utf-8");
+    const store = JSON.parse(content) as SubagentRunsStore; // NOTE(review): cast only — the parsed JSON shape is not validated
+    if (store.version !== 1 || !store.groups) return new Map();
+    return new Map(Object.entries(store.groups));
+  } catch {
+    return new Map(); // read/parse failures are swallowed and treated as "no groups"
+  }
+}
+
+/** Save all subagent runs and groups to disk */
+export function saveSubagentRuns(
+  runs: Map<string, SubagentRunRecord>,
+  groups?: Map<string, SubagentGroup>,
+): void {
  ensureDir();

  const store: SubagentRunsStore = {
    version: 1,
    runs: Object.fromEntries(runs),
+    groups: groups && groups.size > 0 ? Object.fromEntries(groups) : undefined,
  };

  writeFileSync(RUNS_FILE, JSON.stringify(store, null, 2), "utf-8");
--- a/packages/core/src/agent/subagent/registry.ts
+++ b/packages/core/src/agent/subagent/registry.ts
@ -6,11 +6,12 @@
 */

 import { getHub, isHubInitialized } from "../../hub/hub-singleton.js";
-import { loadSubagentRuns, saveSubagentRuns } from "./registry-store.js";
+import { loadSubagentRuns, saveSubagentRuns, loadSubagentGroups } from "./registry-store.js";
 import { readLatestAssistantReply, runCoalescedAnnounceFlow } from "./announce.js";
 import type {
  RegisterSubagentRunParams,
  SubagentRunRecord,
+  SubagentGroup,
 } from "./types.js";
 import { resolveSessionDir } from "../session/storage.js";
 import { rmSync } from "node:fs";
@ -28,6 +29,7 @@ const SWEEP_INTERVAL_MS = 60 * 1000;
 // ============================================================================

 const subagentRuns = new Map<string, SubagentRunRecord>();
+const subagentGroups = new Map<string, SubagentGroup>();
 let sweepTimer: ReturnType<typeof setInterval> | undefined;
 const resumedRequesters = new Set<string>();

@ -50,6 +52,12 @@ export function initSubagentRegistry(): void {
    }
  }

+  // Restore groups
+  const persistedGroups = loadSubagentGroups();
+  for (const [groupId, group] of persistedGroups) {
+    subagentGroups.set(groupId, group);
+  }
+
  // Process incomplete runs
  const affectedRequesters = new Set<string>();

@ -91,6 +99,45 @@ export function initSubagentRegistry(): void {
  }
 }

+// ============================================================================
+// Group management
+// ============================================================================
+
+/**
+ * Create a new subagent group. Returns the group record.
+ *
+ * The record is stamped with `createdAt = Date.now()`, stored in the in-memory
+ * `subagentGroups` map under `params.groupId` (replacing any existing entry with
+ * the same ID), and `persist()` is then called.
+ *
+ * @param params - Group fields: `groupId` and `requesterSessionId` are required;
+ *   `label` and `next` are optional and copied through as given.
+ * @returns The newly created group record.
+ */
+export function createSubagentGroup(params: {
+  groupId: string;
+  requesterSessionId: string;
+  label?: string;
+  next?: string;
+}): SubagentGroup {
+  const group: SubagentGroup = {
+    groupId: params.groupId,
+    requesterSessionId: params.requesterSessionId,
+    label: params.label,
+    next: params.next,
+    createdAt: Date.now(),
+  };
+  subagentGroups.set(params.groupId, group);
+  persist(); // defined outside this view — presumably writes the registry to disk; TODO confirm
+  return group;
+}
+
+/**
+ * Get a group by ID.
+ *
+ * @param groupId - ID of the group to look up in the in-memory `subagentGroups` map.
+ * @returns The stored group record, or `undefined` if no group has that ID.
+ */
+export function getSubagentGroup(groupId: string): SubagentGroup | undefined {
+  return subagentGroups.get(groupId);
+}
+
+/**
+ * List all runs belonging to a group.
+ *
+ * Performs a linear scan over every record in the in-memory `subagentRuns` map.
+ *
+ * @param groupId - Group ID to match against each record's `groupId`.
+ * @returns A new array of matching run records; empty if none match.
+ */
+export function listGroupRuns(groupId: string): SubagentRunRecord[] {
+  const result: SubagentRunRecord[] = [];
+  for (const record of subagentRuns.values()) {
+    if (record.groupId === groupId) {
+      result.push(record);
+    }
+  }
+  return result;
+}
+
 /** Register a new subagent run and start tracking its lifecycle. */
 export function registerSubagentRun(params: RegisterSubagentRunParams): SubagentRunRecord {
  const {
@ -102,6 +149,7 @@ export function registerSubagentRun(params: RegisterSubagentRunParams): Subagent
    cleanup = "delete",
    timeoutSeconds,
    announce,
+    groupId,
    start,
  } = params;

@ -113,6 +161,7 @@ export function registerSubagentRun(params: RegisterSubagentRunParams): Subagent
    label,
    cleanup,
    announce,
+    groupId,
    createdAt: Date.now(),
  };

@ -190,6 +239,7 @@ export function shutdownSubagentRegistry(): void {
 /** Reset all state (for testing). */
 export function resetSubagentRegistryForTests(): void {
  subagentRuns.clear();
+  subagentGroups.clear();
  resumedRequesters.clear();
  stopSweeper();
 }
@ -300,37 +350,59 @@ function captureFindings(record: SubagentRunRecord): void {
 /**
 * Phase 2: Announce completed-but-unannounced runs.
 *
- * Runs with announce="silent" are held back until ALL silent runs from the
- * same requester have completed. All other runs (immediate / undefined) are
- * announced per-completion as before.
+ * Three announcement paths:
+ * 1. Grouped runs — wait for all runs in the group to complete, then announce
+ *    together with the group's `next` continuation prompt (if any).
+ * 2. Ungrouped silent runs — legacy behavior: wait for ALL silent runs from
+ *    the same requester to complete, then announce together.
+ * 3. Ungrouped immediate runs — announce per-completion (default).
 */
 function checkAndAnnounce(requesterSessionId: string): void {
   const allRuns = listSubagentRuns(requesterSessionId);

-  // ── Immediate runs: announce per-completion (default behavior) ──
-  const immediateReady = allRuns.filter(
+  // ── 1. Grouped runs: announce by group when all members complete ──
+  // Only groups that still have at least one unannounced run need checking.
+  const groupIds = new Set<string>();
+  for (const r of allRuns) {
+    if (r.groupId && !r.announced) groupIds.add(r.groupId);
+  }
+
+  for (const groupId of groupIds) {
+    const groupRuns = allRuns.filter(r => r.groupId === groupId);
+    const unannounced = groupRuns.filter(r => !r.announced);
+    const ready = unannounced.filter(r => r.endedAt !== undefined && r.findingsCaptured);
+
+    // The group is ready once every still-unannounced member has ended and had its findings captured.
+    // NOTE(review): completeness is judged only against runs registered so far; a run that joins
+    // the group after an earlier announcement would be announced again with the same `next` —
+    // confirm this is intended.
+    if (ready.length > 0 && ready.length === unannounced.length) {
+      // A missing group record leaves `next` undefined: the runs are still announced, just without a continuation.
+      const group = subagentGroups.get(groupId);
+      announceRuns(requesterSessionId, ready, group?.next);
+    }
+  }
+
+  // ── 2. Ungrouped runs: original immediate/silent logic ──
+  const ungrouped = allRuns.filter(r => !r.groupId);
+
+  // Immediate: announce per-completion
+  const immediateReady = ungrouped.filter(
     r => !r.announced && r.endedAt !== undefined && r.findingsCaptured && r.announce !== "silent",
   );
   if (immediateReady.length > 0) {
-    announceGroup(requesterSessionId, immediateReady);
+    announceRuns(requesterSessionId, immediateReady);
   }

-  // ── Silent runs: announce only when ALL silent runs are done ──
-  const silentRuns = allRuns.filter(r => r.announce === "silent");
+  // Silent: announce only when ALL ungrouped silent runs are done
+  const silentRuns = ungrouped.filter(r => r.announce === "silent");
   const unannouncedSilent = silentRuns.filter(r => !r.announced);
   const silentReady = unannouncedSilent.filter(
     r => r.endedAt !== undefined && r.findingsCaptured,
   );

-  // All unannounced silent runs must be ready (ended + findings captured)
+  // All unannounced silent runs must be ready (ended + findings captured)
   if (silentReady.length > 0 && silentReady.length === unannouncedSilent.length) {
-    announceGroup(requesterSessionId, silentReady);
+    announceRuns(requesterSessionId, silentReady);
   }
 }

-/** Announce a group of runs and mark them as announced. */
-function announceGroup(requesterSessionId: string, runs: SubagentRunRecord[]): void {
-  const announced = runCoalescedAnnounceFlow(requesterSessionId, runs);
+/** Announce a batch of completed runs and mark them as announced. */
+function announceRuns(requesterSessionId: string, runs: SubagentRunRecord[], next?: string): void {
+  const announced = runCoalescedAnnounceFlow(requesterSessionId, runs, next);

  if (announced) {
    for (const r of runs) {
@ -415,9 +487,18 @@ function sweep(): void {
    }
  }

+  // Clean up groups whose runs have all been archived
+  for (const [groupId] of subagentGroups) {
+    const hasActiveRuns = [...subagentRuns.values()].some(r => r.groupId === groupId);
+    if (!hasActiveRuns) {
+      subagentGroups.delete(groupId);
+      removed++;
+    }
+  }
+
  if (removed > 0) {
    persist();
-    console.log(`[SubagentRegistry] Archived ${removed} completed run(s)`);
+    console.log(`[SubagentRegistry] Archived ${removed} completed run(s)/group(s)`);
  }

  if (subagentRuns.size === 0) {
@ -431,7 +512,7 @@ function sweep(): void {

 function persist(): void {
  try {
-    saveSubagentRuns(subagentRuns);
+    saveSubagentRuns(subagentRuns, subagentGroups);
  } catch (err) {
    console.error(`[SubagentRegistry] Failed to persist runs:`, err);
  }
--- a/packages/core/src/agent/subagent/types.ts
+++ b/packages/core/src/agent/subagent/types.ts
@ -11,6 +11,26 @@ export type SubagentRunOutcome = {
  error?: string | undefined;
 };

+/**
+ * A logical group of subagent runs that are tracked together.
+ * Groups enable "collect all, then act" workflows:
+ * all runs in a group must complete before the combined results
+ * (plus an optional `next` continuation) are announced to the parent.
+ * Runs reference their group through their own `groupId` field.
+ */
+export type SubagentGroup = {
+  /** Unique group identifier (UUIDv7) */
+  groupId: string;
+  /** Session ID of the parent (requester) agent */
+  requesterSessionId: string;
+  /** Optional human-readable label for the group */
+  label?: string | undefined;
+  /** Continuation prompt delivered after all runs in the group complete.
+   *  Injected into the announcement so the parent agent acts on the combined findings.
+   *  When absent, the group's findings are announced without a continuation. */
+  next?: string | undefined;
+  /** Timestamp when the group was created (epoch milliseconds, from `Date.now()`) */
+  createdAt: number;
+};
+
 /** Persistent record tracking a single subagent run */
 export type SubagentRunRecord = {
  /** Unique run identifier (UUIDv7) */
@ -48,6 +68,9 @@ export type SubagentRunRecord = {
  /** Announcement mode: "immediate" (default) announces per-completion,
   *  "silent" defers until all silent runs from the same requester complete. */
  announce?: "immediate" | "silent" | undefined;
+  /** Group ID this run belongs to (if any). Runs in a group are announced
+   *  together when all complete, regardless of the `announce` field. */
+  groupId?: string | undefined;
 };

 /** Parameters for registering a new subagent run */
@ -63,6 +86,12 @@ export type RegisterSubagentRunParams = {
  start?: (() => void) | undefined;
  /** Announcement mode: "immediate" (default) or "silent" (defer until all silent runs complete). */
  announce?: "immediate" | "silent" | undefined;
+  /** Group ID to join. Runs in a group are announced together when all complete. */
+  groupId?: string | undefined;
+  /** Continuation prompt for the group. Only used on group creation (first spawn).
+   *  After all runs in the group complete, this prompt is included in the announcement
+   *  so the parent agent can act on the combined findings (e.g. summarize, write PDF). */
+  next?: string | undefined;
 };

 /** Parameters for the announce flow */
--- a/packages/core/src/agent/system-prompt/sections.ts
+++ b/packages/core/src/agent/system-prompt/sections.ts
@ -262,23 +262,47 @@ export function buildConditionalToolSections(
    lines.push(
      "## Sub-Agents",
      "If a task is complex or long-running, spawn a sub-agent. It will do the work and report back when done.",
-      "IMPORTANT: After spawning sub-agents, do NOT immediately check on them with sessions_list. " +
-        "Results are delivered directly into your context automatically when the sub-agent finishes. " +
-        "Continue with other tasks or finish your turn and wait for the results to arrive.",
-      "You may use sessions_list to check on sub-agents only if a long time has passed or the user explicitly asks about their status.",
-      "Sub-agents cannot spawn nested sub-agents.",
+      "",
+      "### Critical Rules",
+      "- **NEVER fabricate, guess, or make up data that a sub-agent has not yet returned.** " +
+        "This includes completion status — do NOT claim tasks are done until you receive actual results.",
+      "- After spawning, do NOT proceed with work that depends on the sub-agent results. " +
+        "You can still chat with the user, do unrelated tasks, or explain what the sub-agents are working on.",
+      "- Sub-agents cannot spawn nested sub-agents.",
+      "- You can use `sessions_list` to check sub-agent status if needed.",
+      "",
+      "### Groups and Continuation (`next`) — ALWAYS use for multi-agent tasks",
+      "When spawning multiple sub-agents, **always** use `next` to define the follow-up work. " +
+        "This is the standard pattern — do NOT use bare `announce: \"silent\"` for multi-agent collect-then-act workflows.",
+      "",
+      "```",
+      "// First spawn — creates a group automatically, returns groupId",
+      'sessions_spawn({ task: "Get AAPL financials", next: "Summarize all data and write a PDF report", label: "AAPL" })',
+      "// → { groupId: \"grp-abc\", runId: \"...\" }",
+      "",
+      "// Subsequent spawns — join the same group",
+      'sessions_spawn({ task: "Get MSFT financials", groupId: "grp-abc", label: "MSFT" })',
+      'sessions_spawn({ task: "Get GOOG financials", groupId: "grp-abc", label: "GOOG" })',
+      "```",
+      "",
+      "The system waits for ALL runs in the group to complete, then delivers the combined findings " +
+        "plus the `next` continuation prompt back to you. You can then use tools (write files, call APIs, etc.) " +
+        "to complete the follow-up work. The user is NOT blocked during this process — they can keep chatting.",
+      "",
+      "Use `next` whenever the user's request involves: collect data → then act on it (summarize, analyze, generate files).",
+      "Without `next`, findings are summarized but no further action is taken.",
+      "",
+      "### Announce Modes (when not using groups)",
+      "- `announce: \"immediate\"` (default): findings delivered per sub-agent as each completes.",
+      "- `announce: \"silent\"`: all findings held until every silent sub-agent finishes, then delivered together.",
+      "Groups always use silent collection internally — you don't need to set announce when using groupId.",
      "",
      "### Timeout Guidelines",
      "Set timeoutSeconds generously — a sub-agent that times out loses all its work.",
      "- Simple tasks (search, read, summarize): 600 (10 min, the default)",
      "- Moderate tasks (multi-step research, file downloads + analysis): 900–1200 (15–20 min)",
      "- Complex tasks (code generation, PDF creation, multi-file operations): 1200–1800 (20–30 min)",
-      "When in doubt, use a longer timeout. It is always better to wait longer than to lose completed work.",
-      "",
-      "### Announce Modes",
-      "- `announce: \"immediate\"` (default): Each sub-agent's findings are delivered to you as soon as it completes.",
-      "- `announce: \"silent\"`: All findings are held back until every silent sub-agent finishes, then delivered as ONE combined report.",
-      "Use \"silent\" when you want to collect data from multiple sub-agents first, then summarize everything at once.",
+      "When in doubt, use a longer timeout.",
      "",
    );
  }
--- a/packages/core/src/agent/tools/sessions-list.ts
+++ b/packages/core/src/agent/tools/sessions-list.ts
@ -7,7 +7,7 @@

 import { Type } from "@sinclair/typebox";
 import type { AgentTool } from "@mariozechner/pi-agent-core";
-import { listSubagentRuns, getSubagentRun } from "../subagent/registry.js";
+import { listSubagentRuns, getSubagentRun, getSubagentGroup } from "../subagent/registry.js";
 import type { SubagentRunRecord } from "../subagent/types.js";

 const SessionsListSchema = Type.Object({
@ -79,6 +79,11 @@ function formatRunDetail(record: SubagentRunRecord, now: number): string {
  ];

  if (record.label) lines.push(`Label: ${record.label}`);
+  if (record.groupId) {
+    const group = getSubagentGroup(record.groupId);
+    lines.push(`Group: ${record.groupId}${group?.label ? ` (${group.label})` : ""}`);
+    if (group?.next) lines.push(`Continuation: ${group.next.slice(0, 120)}${group.next.length > 120 ? "…" : ""}`);
+  }
  lines.push(`Task: ${record.task}`);
  lines.push(`Status: ${status}${record.outcome?.error ? ` — ${record.outcome.error}` : ""}`);
  lines.push(`Child Session: ${record.childSessionId}`);
@ -128,8 +133,7 @@ export function createSessionsListTool(
    description:
      "List all subagent runs spawned by this session and their current status. " +
      "Optionally pass a runId to get detailed information about a specific run. " +
-      "NOTE: Do NOT call this immediately after spawning subagents — results arrive automatically in your context when subagents complete. " +
-      "Only use this if a long time has passed or the user explicitly asks about subagent status.",
+      "Use this to check subagent progress or when the user asks about status.",
    parameters: SessionsListSchema,
    execute: async (_toolCallId, args) => {
      const { runId } = args as SessionsListArgs;
@ -177,21 +181,59 @@ export function createSessionsListTool(

      const someRunning = runs.some((r) => !r.endedAt);

-      // Build status lines for each run
+      // Build status lines, grouping runs by groupId
      const statusLines: string[] = [];
-      for (let i = 0; i < runs.length; i++) {
-        const r = runs[i]!;
+      const groupedRuns = new Map<string, SubagentRunRecord[]>();
+      const ungroupedRuns: SubagentRunRecord[] = [];
+
+      for (const r of runs) {
+        if (r.groupId) {
+          const list = groupedRuns.get(r.groupId) ?? [];
+          list.push(r);
+          groupedRuns.set(r.groupId, list);
+        } else {
+          ungroupedRuns.push(r);
+        }
+      }
+
+      let idx = 0;
+
+      // Grouped runs
+      for (const [gId, gRuns] of groupedRuns) {
+        const group = getSubagentGroup(gId);
+        const groupLabel = group?.label || `Group ${gId.slice(0, 8)}…`;
+        const done = gRuns.filter(r => r.endedAt).length;
+        const nextSnippet = group?.next ? ` → next: "${group.next.slice(0, 60)}${group.next.length > 60 ? "…" : ""}"` : "";
+        statusLines.push(`\n  📦 ${groupLabel} (${done}/${gRuns.length} done${nextSnippet})`);
+
+        for (const r of gRuns) {
+          idx++;
+          const displayName = r.label || r.task.slice(0, 60);
+          const status = resolveStatus(r);
+          if (status === "running") {
+            const elapsed = r.startedAt ? formatElapsed(now - r.startedAt) : "just spawned";
+            statusLines.push(`     ${idx}. [RUNNING] "${displayName}" (${elapsed})`);
+          } else {
+            const elapsed = r.startedAt && r.endedAt ? formatElapsed(r.endedAt - r.startedAt) : "";
+            statusLines.push(`     ${idx}. [${status.toUpperCase()}] "${displayName}" (${elapsed})`);
+          }
+        }
+      }
+
+      // Ungrouped runs
+      for (const r of ungroupedRuns) {
+        idx++;
        const displayName = r.label || r.task.slice(0, 60);
        const status = resolveStatus(r);
        if (status === "running") {
          const elapsed = r.startedAt ? formatElapsed(now - r.startedAt) : "just spawned";
-          statusLines.push(`  ${i + 1}. [RUNNING] "${displayName}" (${elapsed})`);
+          statusLines.push(`  ${idx}. [RUNNING] "${displayName}" (${elapsed})`);
        } else {
          const elapsed = r.startedAt && r.endedAt ? formatElapsed(r.endedAt - r.startedAt) : "";
          const findings = r.findingsCaptured
            ? (r.findings ? r.findings.slice(0, 200) + (r.findings.length > 200 ? "…" : "") : "(no output)")
            : "(findings not yet captured)";
-          statusLines.push(`  ${i + 1}. [${status.toUpperCase()}] "${displayName}" (${elapsed})\n      Findings: ${findings}`);
+          statusLines.push(`  ${idx}. [${status.toUpperCase()}] "${displayName}" (${elapsed})\n      Findings: ${findings}`);
        }
      }

--- a/packages/core/src/agent/tools/sessions-spawn.ts
+++ b/packages/core/src/agent/tools/sessions-spawn.ts
@ -10,7 +10,7 @@ import { Type } from "@sinclair/typebox";
 import type { AgentTool } from "@mariozechner/pi-agent-core";
 import { getHub } from "../../hub/hub-singleton.js";
 import { buildSubagentSystemPrompt } from "../subagent/announce.js";
-import { registerSubagentRun } from "../subagent/registry.js";
+import { registerSubagentRun, createSubagentGroup, getSubagentGroup } from "../subagent/registry.js";
 import { resolveTools } from "../tools.js";

 const SessionsSpawnSchema = Type.Object({
@ -41,7 +41,26 @@ const SessionsSpawnSchema = Type.Object({
        "Announcement mode. 'immediate' (default): findings delivered as each subagent completes. " +
        "'silent': defer all announcements until every silent subagent from this session finishes, " +
        "then deliver one combined report. Use 'silent' when spawning multiple subagents to collect " +
-        "data in parallel and you want to summarize everything at once.",
+        "data in parallel and you want to summarize everything at once. " +
+        "Ignored when groupId is provided (groups always collect all results before announcing).",
+    }),
+  ),
+  groupId: Type.Optional(
+    Type.String({
+      description:
+        "Join an existing group. Pass the groupId returned by a previous sessions_spawn call " +
+        "to add this subagent to the same group. All runs in a group are announced together " +
+        "when the last one completes. If omitted AND 'next' is provided, a new group is created automatically.",
+    }),
+  ),
+  next: Type.Optional(
+    Type.String({
+      description:
+        "Continuation task to execute after ALL subagents in the group complete. " +
+        "Only used when creating a new group (first spawn without groupId). " +
+        "When set, the combined findings from all subagents plus this 'next' prompt " +
+        "are delivered to you so you can perform follow-up work (e.g. summarize, generate reports, write files). " +
+        "Setting 'next' automatically creates a group and implies silent collection.",
    }),
  ),
 });
@ -53,12 +72,15 @@ type SessionsSpawnArgs = {
  cleanup?: "delete" | "keep";
  timeoutSeconds?: number;
  announce?: "immediate" | "silent";
+  groupId?: string;
+  next?: string;
 };

 export type SessionsSpawnResult = {
  status: "accepted" | "error";
  childSessionId?: string;
  runId?: string;
+  groupId?: string;
  error?: string;
 };

@ -79,13 +101,15 @@ export function createSessionsSpawnTool(
    label: "Spawn Subagent",
    description:
      "Spawn a background subagent to handle a specific task. The subagent runs in an isolated session with its own tool set. " +
-      "When it completes, its findings are delivered directly into your context automatically — you do NOT need to poll or check. " +
-      "IMPORTANT: After spawning subagents, continue with any other immediate tasks you have, or simply finish your turn and wait. " +
-      "Do NOT call sessions_list to check on subagents you just spawned — results take time and will arrive on their own. " +
+      "When it completes, its findings are delivered directly into your context automatically. " +
+      "After spawning, do NOT proceed with work that depends on the results — but you can still chat or do unrelated tasks. " +
+      "When spawning multiple subagents for a collect-then-act workflow, ALWAYS use the `next` parameter " +
+      "on the first spawn to define follow-up work, then pass the returned groupId to subsequent spawns. " +
      "Use this for parallelizable work, long-running analysis, or tasks that benefit from isolation.",
    parameters: SessionsSpawnSchema,
    execute: async (_toolCallId, args) => {
-      const { task, label, model, cleanup = "delete", timeoutSeconds, announce } = args as SessionsSpawnArgs;
+      const { task, label, model, cleanup = "delete", timeoutSeconds, announce, next } = args as SessionsSpawnArgs;
+      let { groupId } = args as SessionsSpawnArgs;

      // Guard: subagents cannot spawn subagents
      if (options.isSubagent) {
@ -102,6 +126,28 @@ export function createSessionsSpawnTool(
      const runId = uuidv7();
      const childSessionId = uuidv7();

+      // Validate groupId if provided
+      if (groupId) {
+        const existingGroup = getSubagentGroup(groupId);
+        if (!existingGroup) {
+          return {
+            content: [{ type: "text", text: `Error: group not found: ${groupId}. Use the groupId returned by a previous sessions_spawn call.` }],
+            details: { status: "error", error: `group not found: ${groupId}` },
+          };
+        }
+      }
+
+      // Auto-create group when `next` is provided without an existing groupId
+      if (!groupId && next) {
+        groupId = uuidv7();
+        createSubagentGroup({
+          groupId,
+          requesterSessionId,
+          label: label ? `Group: ${label}` : undefined,
+          next,
+        });
+      }
+
      // Resolve tools for the subagent (with isSubagent=true for policy filtering)
      const subagentTools = resolveTools({ isSubagent: true });
      const toolNames = subagentTools.map((t) => t.name);
@ -135,21 +181,27 @@ export function createSessionsSpawnTool(
          label,
          cleanup,
          timeoutSeconds,
-          announce,
+          announce: groupId ? "silent" : announce,
+          groupId,
          start: () => childAgent.write(task),
        });

+        // Build response text
+        const groupInfo = groupId ? `\nGroup: ${groupId}` : "";
+        const nextInfo = next ? `\nContinuation: "${next.slice(0, 100)}${next.length > 100 ? "…" : ""}"` : "";
+        const responseText =
+          `Subagent spawned: ${label || task.slice(0, 80)}\n` +
+          `Run: ${runId}${groupInfo}${nextInfo}\n\n` +
+          `⏳ WAITING FOR RESULTS — do NOT proceed with work that depends on these results.\n` +
+          `Do NOT fabricate data or completion status. Results will arrive in your context automatically.`;
+
        return {
-          content: [
-            {
-              type: "text",
-              text: `Subagent spawned successfully.\n\nRun ID: ${runId}\nSession: ${childSessionId}\nTask: ${label || task.slice(0, 80)}\n\nThe subagent is now working in the background. Its findings will be delivered directly into your context when it completes — do NOT poll or call sessions_list for it. Continue with other tasks or finish your turn.`,
-            },
-          ],
+          content: [{ type: "text", text: responseText }],
          details: {
            status: "accepted",
            childSessionId,
            runId,
+            groupId,
          },
        };
      } catch (err) {
--- a/packages/hooks/src/use-chat.ts
+++ b/packages/hooks/src/use-chat.ts
@ -9,13 +9,22 @@ import {
  type AgentMessageItem,
  type ExecApprovalRequestPayload,
  type ApprovalDecision,
+  type CompactionEndEvent,
 } from "@multica/sdk";

 export type ToolStatus = "running" | "success" | "error" | "interrupted";

+/** Statistics for one context compaction, copied field-for-field from a `compaction_end` event. */
+export interface CompactionInfo {
+  /** Number of messages removed by the compaction. */
+  removed: number;
+  /** Presumably the number of messages kept after compaction — confirm against the SDK event type. */
+  kept: number;
+  /** Tokens freed by the compaction, when the event reports it. */
+  tokensRemoved?: number;
+  /** Presumably the tokens remaining after compaction, when the event reports it — confirm. */
+  tokensKept?: number;
+  /** Reason string reported by the event (e.g. "summary"). */
+  reason: string;
+}
+
 export interface Message {
  id: string;
-  role: "user" | "assistant" | "toolResult";
+  role: "user" | "assistant" | "toolResult" | "system";
  content: ContentBlock[];
  agentId: string;
  stopReason?: string;
@ -24,6 +33,8 @@ export interface Message {
  toolArgs?: Record<string, unknown>;
  toolStatus?: ToolStatus;
  isError?: boolean;
+  systemType?: "compaction";
+  compaction?: CompactionInfo;
 }

 export interface ChatError {
@ -215,6 +226,27 @@ export function useChat() {
      }
      case "tool_execution_update":
        break;
+      case "compaction_end": {
+        const ce = event as CompactionEndEvent;
+        setMessages((prev) => [
+          ...prev,
+          {
+            id: uuidv7(),
+            role: "system",
+            content: [],
+            agentId: payload.agentId,
+            systemType: "compaction",
+            compaction: {
+              removed: ce.removed,
+              kept: ce.kept,
+              tokensRemoved: ce.tokensRemoved,
+              tokensKept: ce.tokensKept,
+              reason: ce.reason,
+            },
+          },
+        ]);
+        break;
+      }
    }
  }, []);

--- a/packages/store/src/types.ts
+++ b/packages/store/src/types.ts
@ -2,9 +2,17 @@ import type { ContentBlock } from "@multica/sdk"

 export type ToolStatus = "running" | "success" | "error" | "interrupted"

+/** Statistics for one context compaction, attached to a `system` message with `systemType: "compaction"`. */
+export interface CompactionInfo {
+  /** Number of messages removed by the compaction. */
+  removed: number
+  /** Presumably the number of messages kept after compaction — confirm against the producer. */
+  kept: number
+  /** Tokens freed by the compaction, when known. */
+  tokensRemoved?: number
+  /** Presumably the tokens remaining after compaction, when known — confirm. */
+  tokensKept?: number
+  /** Reason string for the compaction (e.g. "summary"). */
+  reason: string
+}
+
 export interface Message {
  id: string
-  role: "user" | "assistant" | "toolResult"
+  role: "user" | "assistant" | "toolResult" | "system"
  content: ContentBlock[]
  agentId: string
  stopReason?: string
@ -13,4 +21,6 @@ export interface Message {
  toolArgs?: Record<string, unknown>
  toolStatus?: ToolStatus
  isError?: boolean
+  systemType?: "compaction"
+  compaction?: CompactionInfo
 }
--- a/packages/ui/src/components/compaction-item.tsx
+++ b/packages/ui/src/components/compaction-item.tsx
@ -0,0 +1,45 @@
+"use client"
+
+import { memo } from "react"
+import { Scissors } from "lucide-react"
+import type { Message } from "@multica/store"
+
+/**
+ * Render a token count compactly: counts of 1000 or more become an
+ * approximate "~N.Nk" figure with one decimal, smaller counts are printed as-is.
+ */
+function formatTokens(n: number): string {
+  const isThousands = n >= 1000
+  return isThousands ? `~${(n / 1000).toFixed(1)}k` : `${n}`
+}
+
+/** Props for `CompactionItem`. */
+interface CompactionItemProps {
+  /** Message whose `compaction` field holds the stats to display. */
+  message: Message
+}
+
+/**
+ * Inline chat row announcing that the conversation context was compacted.
+ *
+ * Renders nothing when the message carries no `compaction` stats. Otherwise it
+ * shows a label chosen from `reason` ("summary" → "Context summarized", any
+ * other value → "Context compacted"), the removed-message count, and — when
+ * reported — the number of tokens freed.
+ */
+export const CompactionItem = memo(function CompactionItem({ message }: CompactionItemProps) {
+  const info = message.compaction
+  if (!info) return null
+
+  const label = info.reason === "summary" ? "Context summarized" : "Context compacted"
+  // Pluralize so a single removed message reads "1 message removed", not "1 messages removed".
+  const removed = `${info.removed} ${info.removed === 1 ? "message" : "messages"} removed`
+  // `!= null` skips both null and undefined while still showing a reported 0.
+  const tokens = info.tokensRemoved != null
+    ? `, ${formatTokens(info.tokensRemoved)} tokens freed`
+    : ""
+
+  return (
+    <div className="py-0.5 px-2.5 text-sm text-muted-foreground">
+      <div className="flex items-center gap-1.5 px-2.5 py-1">
+        {/* Status dot */}
+        <span className="size-1.5 rounded-full shrink-0 bg-muted-foreground/40" />
+
+        {/* Icon */}
+        <Scissors className="size-3.5 shrink-0" />
+
+        {/* Label */}
+        <span className="font-medium shrink-0">{label}</span>
+
+        {/* Stats */}
+        <span className="ml-auto text-xs text-muted-foreground/60 shrink-0">
+          {removed}{tokens}
+        </span>
+      </div>
+    </div>
+  )
+})
--- a/packages/ui/src/components/message-list.tsx
+++ b/packages/ui/src/components/message-list.tsx
@ -5,6 +5,7 @@ import { MemoizedMarkdown } from "@multica/ui/components/markdown";
 import { StreamingMarkdown } from "@multica/ui/components/markdown/StreamingMarkdown";
 import { ToolCallItem } from "@multica/ui/components/tool-call-item";
 import { ThinkingItem } from "@multica/ui/components/thinking-item";
+import { CompactionItem } from "@multica/ui/components/compaction-item";
 import { cn, getTextContent } from "@multica/ui/lib/utils";
 import type { Message } from "@multica/store";
 import type { ContentBlock, ToolCall, ThinkingContent } from "@multica/sdk";
@ -78,6 +79,11 @@ export const MessageList = memo(function MessageList({ messages, streamingIds }:
  return (
    <div className="relative p-6 px-4 sm:px-10 max-w-4xl mx-auto">
      {messages.map((msg) => {
+        // System messages (e.g. compaction notifications)
+        if (msg.role === "system") {
+          return <CompactionItem key={msg.id} message={msg} />
+        }
+
        // ToolResult messages → render as tool execution item
        if (msg.role === "toolResult") {
          return <ToolCallItem key={msg.id} message={msg} />
--- a/skills/earnings-analysis/SKILL.md
+++ b/skills/earnings-analysis/SKILL.md
@ -0,0 +1,463 @@
+---
+name: Earnings Analysis
+description: >-
+  Analyze a company's financial statements (income statement, balance sheet,
+  cash flow statement) to assess financial health, earnings quality, and
+  competitive advantage. Use when the user asks to read/analyze financial
+  statements, check earnings quality, assess financial health, evaluate
+  profitability trends, or screen for competitive moats.
+version: 1.0.0
+metadata:
+  emoji: "\U0001F4D1"
+  requires:
+    env:
+      - FINANCIAL_DATASETS_API_KEY
+  tags:
+    - finance
+    - earnings
+    - analysis
+    - statements
+    - buffett
+userInvocable: true
+disableModelInvocation: false
+---
+
+## Instructions
+
+You are performing a structured financial statement analysis. Follow all steps in order and show your work. Output language must match the user's input language.
+
+**IMPORTANT: This analysis requires BOTH structured data AND external context.** You MUST use `web_search` to gather earnings call insights, industry context, and explanations for data anomalies. An analysis based only on API data without any web research is incomplete. Expect to make 3-8 web searches throughout the analysis.
+
+### Progress Checklist
+
+```
+Earnings Analysis Progress:
+- [ ] Step 1: Gather financial data
+- [ ] Step 2: Income statement analysis
+- [ ] Step 3: Balance sheet analysis
+- [ ] Step 4: Cash flow statement analysis
+- [ ] Step 5: Buffett competitive advantage scoring
+- [ ] Step 6: Quality of earnings assessment
+- [ ] Step 7: SEC filing qualitative analysis
+- [ ] Step 8: Peer comparison (if requested)
+- [ ] Step 9: Present findings
+```
+
+### Step 1: Gather Financial Data
+
+Use `data` tool with `domain="finance"` for all structured data calls.
+
+#### 1a. Structured Data
+
+1. **Annual financial statements** (5 years):
+   ```
+   action: "get_all_financial_statements"
+   params: { ticker: "[TICKER]", period: "annual", limit: 5 }
+   ```
+   This returns income statements, balance sheets, and cash flow statements together.
+
+2. **Quarterly financial statements** (last 4 quarters):
+   ```
+   action: "get_all_financial_statements"
+   params: { ticker: "[TICKER]", period: "quarterly", limit: 4 }
+   ```
+
+3. **Current financial metrics**:
+   ```
+   action: "get_financial_metrics_snapshot"
+   params: { ticker: "[TICKER]" }
+   ```
+
+4. **Company facts**:
+   ```
+   action: "get_company_facts"
+   params: { ticker: "[TICKER]" }
+   ```
+   Extract: `sector`, `industry` — needed for benchmark comparisons in later steps.
+
+5. **Current stock price**:
+   ```
+   action: "get_price_snapshot"
+   params: { ticker: "[TICKER]" }
+   ```
+
+6. **Recent news**:
+   ```
+   action: "get_news"
+   params: { ticker: "[TICKER]", limit: 10 }
+   ```
+   Scan headlines for material events (earnings surprises, guidance changes, M&A, restructuring).
+
+#### 1b. External Context (Web Search) — MANDATORY
+
+You MUST run searches 1 and 2 below after gathering structured data; these are not optional. Search 3 is conditional and is only required when the news headlines or data show a material event.
+
+1. **Latest earnings call highlights** (REQUIRED):
+   ```
+   web_search("[COMPANY] latest earnings call highlights key takeaways [CURRENT_YEAR]")
+   ```
+   Extract: management guidance, segment commentary, strategic priorities, forward outlook.
+   This provides the "why" behind the numbers that structured data cannot explain.
+
+2. **Industry/macro backdrop** (REQUIRED):
+   ```
+   web_search("[INDUSTRY] industry outlook trends [CURRENT_YEAR]")
+   ```
+   Extract: industry growth rate, tailwinds/headwinds, regulatory changes, competitive dynamics.
+   This is needed to assess whether the company's performance is company-specific or industry-wide.
+
+3. **Company-specific events** (conditional — run if news headlines or data show a material event):
+   ```
+   web_search("[COMPANY] [EVENT_KEYWORD] impact analysis")
+   ```
+   Examples: acquisition, restructuring, product launch, lawsuit, management change.
+
+**Checkpoint:** Before proceeding to Step 2, verify that you have completed at least 2 web searches above. If you have not, go back and run them now.
+
+### Step 2: Income Statement Analysis
+
+Analyze the income statement across all 5 annual periods. Calculate and present:
+
+1. **Revenue trend**:
+   - Year-over-year growth rate for each year
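+   - 5-year CAGR: `(Revenue_latest / Revenue_earliest)^(1/years) - 1`, where `years` is the number of years elapsed between the earliest and latest periods (e.g., 4 when the 5 annual data points are consecutive fiscal years)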
+   - Flag any years with revenue decline
+
+2. **Margin analysis** (calculate for each year, show the trend):
+   - Gross Margin = Gross Profit / Revenue
+   - Operating Margin = Operating Income / Revenue
+   - Net Margin = Net Income / Revenue
+
+3. **Margin benchmarks** (from [financial-ratios-benchmarks.md](references/financial-ratios-benchmarks.md)):
+   - Compare each margin to sector benchmarks
+   - Flag margins that are significantly above or below sector range
+
+4. **EPS analysis**:
+   - EPS trend over 5 years
+   - EPS growth consistency (note any years of decline)
+
+5. **Expense structure**:
+   - Cost of revenue as % of revenue (trend)
+   - SG&A as % of revenue (trend)
+   - R&D as % of revenue (trend, if applicable)
+   - Flag any expense category growing faster than revenue
+
+6. **Contextual explanation** (REQUIRED — use web search results from Step 1b):
+   - For each significant trend or inflection point in the data above, provide a **why** explanation using the earnings call and industry context gathered in Step 1b.
+   - If revenue growth changed direction significantly (acceleration or deceleration > 10pp), run an additional search:
+     `web_search("[COMPANY] revenue [growth/decline] reason [YEAR]")`
+   - If margins shifted by more than 5pp year-over-year, run an additional search:
+     `web_search("[COMPANY] margin [expansion/compression] [YEAR]")`
+   - **Do not present a data table without narrative.** Every major trend must have a "why" attached, citing the source (earnings call, industry report, or company announcement).
+
+Present as a table:
+
+| Metric | Year 1 | Year 2 | Year 3 | Year 4 | Year 5 | 5Y CAGR |
+|--------|--------|--------|--------|--------|--------|---------|
+
+### Step 3: Balance Sheet Analysis
+
+Analyze the balance sheet across all 5 annual periods:
+
+1. **Liquidity**:
+   - Current Ratio = Current Assets / Current Liabilities
+   - Quick Ratio = (Current Assets - Inventory) / Current Liabilities
+   - Cash and equivalents trend
+
+2. **Leverage**:
+   - Cash vs. Total Debt (short-term + long-term debt)
+   - Debt-to-Equity = Total Liabilities / Total Shareholders' Equity
+   - Interest Coverage = Operating Income / Interest Expense
+   - Debt payoff capacity = Total Debt / Net Income (in years)
+
+3. **Asset quality**:
+   - Receivables Turnover = Revenue / Accounts Receivable
+   - Inventory Turnover = Cost of Revenue / Inventory (if applicable)
+   - Goodwill as % of Total Assets (flag if > 30%)
+
+4. **Equity structure**:
+   - Retained earnings: year-over-year changes (growing?)
+   - Preferred stock: present or absent?
+   - Treasury stock: present? growing? (indicates buybacks)
+
+5. **Working capital trend**:
+   - Net Working Capital = Current Assets - Current Liabilities
+   - Direction of change over 5 years
+
+6. **Contextual explanation** (use web search results from Step 1b + additional searches as needed):
+   - Explain major balance sheet changes using earnings call context from Step 1b.
+   - If total debt changed significantly (> 30% YoY), you MUST search for the reason:
+     `web_search("[COMPANY] debt [issuance/repayment] [YEAR]")`
+   - If goodwill jumped, you MUST search for acquisition context:
+     `web_search("[COMPANY] acquisition [YEAR]")`
+   - Large treasury stock changes → confirm buyback program details:
+     `web_search("[COMPANY] share buyback program")`
+
+Compare key ratios to sector benchmarks from [financial-ratios-benchmarks.md](references/financial-ratios-benchmarks.md).
+
+### Step 4: Cash Flow Statement Analysis
+
+Analyze cash flow statements across all 5 annual periods:
+
+1. **Operating cash flow quality**:
+   - OCF vs. Net Income ratio for each year
+   - Target: OCF/NI > 1.0 (cash earnings exceed accrual earnings)
+   - Trend direction
+
+2. **Free cash flow**:
+   - FCF = Operating Cash Flow - Capital Expenditure
+   - FCF Margin = FCF / Revenue
+   - 5-year FCF trend and CAGR
+
+3. **Capital intensity**:
+   - CapEx / Revenue ratio
+   - CapEx / Net Income ratio (Buffett benchmark: < 25% excellent, < 50% acceptable)
+   - Is CapEx growing faster than revenue? (potential red flag)
+
+4. **Cash flow composition**:
+   - Net cash from operating activities (should be consistently positive)
+   - Net cash from investing activities (negative = investing in growth)
+   - Net cash from financing activities (pattern: debt vs. equity funded?)
+
+5. **Shareholder returns**:
+   - Dividends paid (from financing activities)
+   - Share buybacks / treasury stock repurchase
+   - Total payout ratio = (Dividends + Buybacks) / Net Income
+   - Is the company returning cash while maintaining growth?
+
+6. **Contextual explanation** (use web search results from Step 1b + additional searches as needed):
+   - Explain cash flow patterns using earnings call context from Step 1b.
+   - If CapEx spiked significantly in a particular year, you MUST search for what was built:
+     `web_search("[COMPANY] capital expenditure investment [YEAR]")`
+   - If FCF diverged sharply from net income, search for restructuring or working capital events.
+
+Present a summary table:
+
+| Metric | Year 1 | Year 2 | Year 3 | Year 4 | Year 5 |
+|--------|--------|--------|--------|--------|--------|
+
+### Step 5: Buffett Competitive Advantage Scoring
+
+Apply the scoring framework from [buffett-checklist.md](references/buffett-checklist.md).
+
+For each of the 13 criteria across 4 categories:
+1. Calculate the metric value from the data gathered in Steps 1-4
+2. Determine the score based on the threshold table
+3. Note the sector-specific caveats (Financials, Utilities, REITs, Growth-stage)
+
+Present the full scorecard table and the overall rating (Excellent / Good / Average / Weak).
+
+### Step 6: Quality of Earnings Assessment
+
+Assess whether reported earnings are backed by real cash and sustainable operations:
+
+1. **Accrual ratio**:
+   - Formula: (Net Income - Operating Cash Flow) / Total Assets
+   - Interpretation: Lower is better. High positive values suggest earnings are driven by accruals rather than cash.
+   - Red flag threshold: > 10%
+
+2. **Revenue recognition quality**:
+   - Compare Accounts Receivable growth rate vs. Revenue growth rate
+   - If AR grows significantly faster than revenue → potential aggressive revenue recognition
+   - Red flag threshold: AR growth > Revenue growth + 5 percentage points
+
+3. **Inventory quality** (if applicable):
+   - Compare Inventory growth rate vs. Cost of Revenue growth rate
+   - Rising inventory vs. flat/declining COGS → potential obsolescence risk
+   - Red flag threshold: Inventory growth > COGS growth + 10 percentage points
+
+4. **One-time items**:
+   - Identify significant non-recurring charges or gains in the income statement
+   - Calculate adjusted net income excluding one-time items
+   - Compare adjusted vs. reported margins
+
+5. **Deferred revenue trend** (if applicable):
+   - Growing deferred revenue is a positive signal (future revenue already contracted)
+   - Declining deferred revenue may signal weakening demand pipeline
+
+6. **External validation** (web search):
+   - If any red flags were triggered above, search for corroborating or mitigating context:
+     `web_search("[COMPANY] accounting concerns OR restatement OR SEC inquiry")`
+   - Check for auditor changes (can signal accounting issues):
+     `web_search("[COMPANY] auditor change OR audit opinion")`
+   - Only run these searches if quantitative red flags exist. Do not search proactively for every company.
+
+Summarize quality of earnings as: **High** / **Moderate** / **Low** with supporting evidence.
+
+### Step 7: SEC Filing Qualitative Analysis
+
+Pull and analyze the most recent annual or quarterly filing:
+
+1. **Get filing list**:
+   ```
+   action: "get_filings"
+   params: { ticker: "[TICKER]", filing_type: "10-K", limit: 1 }
+   ```
+   If 10-K is not recent enough, also pull 10-Q:
+   ```
+   action: "get_filings"
+   params: { ticker: "[TICKER]", filing_type: "10-Q", limit: 1 }
+   ```
+
+2. **Read MD&A section** (Management's Discussion and Analysis):
+   ```
+   action: "get_filing_items"
+   params: { ticker: "[TICKER]", filing_type: "10-K", item: "7" }
+   ```
+   For 10-Q, MD&A is item "2":
+   ```
+   action: "get_filing_items"
+   params: { ticker: "[TICKER]", filing_type: "10-Q", item: "2" }
+   ```
+
+3. **Read Risk Factors**:
+   ```
+   action: "get_filing_items"
+   params: { ticker: "[TICKER]", filing_type: "10-K", item: "1A" }
+   ```
+
+4. **Extract and analyze**:
+   - Management's explanation of revenue and margin trends
+   - Forward-looking statements and guidance
+   - Key risk factors that could impact financial health
+   - Any disclosures about accounting policy changes
+   - Cross-validate: Does management narrative align with the quantitative data from Steps 2-4?
+   - Flag contradictions between management tone and actual numbers
+
+5. **Supplement with earnings call transcript** (REQUIRED — web search/fetch):
+   You MUST search for and incorporate the most recent earnings call. This is critical for understanding management's forward-looking view.
+   - Search for the transcript:
+     `web_search("[COMPANY] [QUARTER] [YEAR] earnings call transcript")`
+   - If a transcript URL is found, use `web_fetch` to read key sections (CEO/CFO prepared remarks, Q&A highlights).
+   - Extract: forward guidance, segment-level commentary, management tone on competitive position, key analyst concerns.
+   - Cross-reference earnings call statements with MD&A disclosures — flag any inconsistencies.
+
+6. **Summarize key insights**:
+   - What management says about the business trajectory
+   - Material risks not visible in the numbers alone
+   - Any changes in risk factors vs. prior filings (if noticeable)
+   - Key analyst questions and management responses from earnings call (if available)
+
+### Step 8: Peer Comparison (Conditional)
+
+**Execute this step only when the user explicitly requests peer comparison or industry benchmarking.**
+
+1. **Identify peers**:
+   - Use the `sector` and `industry` from `get_company_facts`
+   - Select 2-3 publicly traded competitors in the same industry
+   - If the user specifies peers, use those instead
+
+2. **Pull peer data** (for each peer):
+   ```
+   action: "get_financial_metrics_snapshot"
+   params: { ticker: "[PEER_TICKER]" }
+   ```
+   ```
+   action: "get_income_statements"
+   params: { ticker: "[PEER_TICKER]", period: "annual", limit: 1 }
+   ```
+   ```
+   action: "get_balance_sheets"
+   params: { ticker: "[PEER_TICKER]", period: "annual", limit: 1 }
+   ```
+
+3. **Comparative table**:
+
+   | Metric | [TARGET] | [PEER 1] | [PEER 2] | [PEER 3] | Sector Avg |
+   |--------|----------|----------|----------|----------|------------|
+   | Revenue Growth (YoY) | | | | | |
+   | Gross Margin | | | | | |
+   | Net Margin | | | | | |
+   | ROE | | | | | |
+   | D/E Ratio | | | | | |
+   | FCF Margin | | | | | |
+   | P/E Ratio | | | | | |
+
+4. **Competitive position assessment**:
+   - Where does the target company rank among peers on each metric?
+   - Identify clear advantages and disadvantages relative to peers
+   - Note if the target trades at a premium or discount to peers and whether it's justified
+
+### Step 9: Present Findings
+
+Compile the full analysis into a structured report. Follow this exact structure:
+
+#### 1. Executive Summary
+- Company name, ticker, sector, current price
+- One-paragraph thesis: Is this a financially healthy company with a durable competitive advantage?
+- Financial health rating from Buffett scorecard (Excellent / Good / Average / Weak)
+- Earnings quality assessment (High / Moderate / Low)
+
+#### 2. Financial Health Scorecard
+- Full Buffett checklist scorecard table from Step 5
+- Total score and rating
+
+#### 3. Trend Dashboard
+- 5-year key metrics trend table from Steps 2-4:
+
+| Metric | Y1 | Y2 | Y3 | Y4 | Y5 | Trend |
+|--------|----|----|----|----|----|----|
+| Revenue | | | | | | arrow |
+| Gross Margin | | | | | | arrow |
+| Net Margin | | | | | | arrow |
+| ROE | | | | | | arrow |
+| D/E Ratio | | | | | | arrow |
+| FCF | | | | | | arrow |
+| OCF/NI | | | | | | arrow |
+| CapEx/NI | | | | | | arrow |
+
+Use directional indicators in the Trend column.
+
+#### 4. Quality of Earnings
+- Summary from Step 6 with key metrics and assessment
+
+#### 5. Key Strengths & Red Flags
+- **Strengths**: List 3-5 financial strengths with supporting data
+- **Red Flags**: List any warning signs discovered during analysis. If none, state "No material red flags identified."
+
+Common red flags to watch for:
+- Revenue growth but declining margins
+- Net income growing but OCF declining
+- AR growing faster than revenue
+- Inventory building up vs. flat COGS
+- Rising debt with declining interest coverage
+- Retained earnings declining
+- Large goodwill relative to total assets
+- CapEx consistently > 50% of net income
+- Management tone in MD&A contradicts financial data
+
+#### 6. SEC Filing Insights
+- Key findings from Step 7
+- Management's outlook and material risks
+
+#### 7. Peer Comparison (if Step 8 was executed)
+- Comparative table and competitive position assessment
+
+### Guardrails
+
+- Always state the date range of financial data used.
+- If any data is missing or unavailable, explicitly note it and adjust the analysis scope.
+- Do not present calculated ratios as precise — round to one decimal place.
+- Clearly distinguish between facts (from data) and interpretive conclusions.
+- The Buffett scorecard is a screening framework, not a buy/sell recommendation. State this in the output.
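+- For non-US companies or companies not filing with the SEC, skip the SEC filing retrieval and review in Step 7 (sub-steps 1-4) and note the limitation. The earnings call transcript search (Step 7, sub-step 5) is still required.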
+- Output language must match the user's input language (Chinese input → Chinese output, English input → English output).
+
+### Web Search Requirements
+
+**Minimum mandatory searches (you MUST perform these):**
+1. Earnings call highlights (Step 1b) — for management's own explanation of results
+2. Industry outlook (Step 1b) — for macro/sector context
+3. Earnings call transcript (Step 7) — for forward guidance and analyst Q&A
+
+**Additional searches (trigger when data shows anomalies):**
+- Revenue or margin inflection points (Step 2)
+- Major debt changes or acquisitions (Step 3)
+- CapEx spikes (Step 4)
+- Quality-of-earnings red flags (Step 6)
+
+**Search principles:**
+- **Source quality**: Prefer primary sources (SEC filings, company press releases, earnings call transcripts) over secondary sources (analyst blogs, news aggregators).
+- **Cite with dates**: Always include source name and date when referencing external information.
+- **Separate fact from opinion**: Label analyst or media commentary as external opinion, not fact.
+- **Total budget**: Expect 3-8 web searches per analysis. Fewer than 3 means you are likely missing critical context.
--- a/skills/earnings-analysis/references/buffett-checklist.md
+++ b/skills/earnings-analysis/references/buffett-checklist.md
@ -0,0 +1,99 @@
+# Buffett Competitive Advantage Checklist
+
+Score each criterion and calculate a total. Use this to assess whether a company has a durable competitive advantage (economic moat).
+
+## Scoring System
+
+Total: 100 points across 4 categories (25 points each).
+
+### Category 1: Profitability (25 points)
+
+| # | Criterion | Excellent | Good | Weak |
+|---|-----------|-----------|------|------|
+| 1 | **Gross Margin** | > 40% → **10 pts** | 30-40% → **6 pts** | < 30% → **2 pts** |
+| 2 | **Net Margin** | > 20% → **10 pts** | 10-20% → **6 pts** | < 10% → **2 pts** |
+| 3 | **Return on Equity (ROE)** | > 15% → **5 pts** | 10-15% → **3 pts** | < 10% → **1 pt** |
+
+How to calculate:
+- Gross Margin = Gross Profit / Revenue
+- Net Margin = Net Income / Revenue
+- ROE = Net Income / Total Shareholders' Equity
+- Use the most recent annual figures; cross-check with 5-year average
+
+### Category 2: Balance Sheet Health (25 points)
+
+| # | Criterion | Pass | Partial | Fail |
+|---|-----------|------|---------|------|
+| 4 | **Cash > Total Debt** | Yes → **8 pts** | Cash > 50% of Debt → **4 pts** | Cash < 50% of Debt → **1 pt** |
+| 5 | **Debt-to-Equity Ratio** | < 0.8 → **7 pts** | 0.8-1.5 → **4 pts** | > 1.5 → **1 pt** |
+| 6 | **No Preferred Stock** | None → **5 pts** | — | Has Preferred → **0 pts** |
+| 7 | **Retained Earnings Growth** | Growing 5 consecutive years → **5 pts** | Growing 3-4 years → **3 pts** | Declining or flat → **1 pt** |
+
+How to calculate:
+- Cash = Cash and Cash Equivalents + Short-term Investments
+- Total Debt = Short-term Debt + Long-term Debt
+- D/E = Total Liabilities / Total Shareholders' Equity
+- Retained Earnings: Compare year-over-year from balance sheets
+
+Special note on D/E:
+- Exclude operating lease liabilities from "debt" for this assessment (they are contractual obligations, not financial debt)
+- If treasury stock is large, it reduces equity and inflates D/E — note this in analysis
+
+### Category 3: Cash Flow Quality (25 points)
+
+| # | Criterion | Excellent | Good | Weak |
+|---|-----------|-----------|------|------|
+| 8 | **CapEx / Net Income** | < 25% → **10 pts** | 25-50% → **6 pts** | > 50% → **2 pts** |
+| 9 | **Operating CF > Net Income** | OCF/NI > 1.0 → **8 pts** | OCF/NI = 0.8-1.0 → **4 pts** | OCF/NI < 0.8 → **1 pt** |
+| 10 | **Shareholder Returns** | Buybacks + Dividends → **7 pts** | Dividends only or buybacks only → **4 pts** | Neither → **1 pt** |
+
+How to calculate:
+- CapEx: Capital Expenditure from cash flow statement (use absolute value)
+- Operating CF: Net Cash from Operating Activities
+- Buybacks: Check if Treasury Stock increased year-over-year, or look at "repurchase of common stock" in financing activities
+- Dividends: Look at "dividends paid" in financing activities
+
+Note on CapEx:
+- One-time large CapEx (e.g., new factory, data center buildout) should be noted but not penalized if the 5-year average CapEx/NI is still within range
+- Asset-light businesses (software, services) naturally score well here
+
+### Category 4: Consistency (25 points)
+
+| # | Criterion | Excellent | Good | Weak |
+|---|-----------|-----------|------|------|
+| 11 | **Revenue Growth Streak** | 5+ consecutive years growing → **10 pts** | 3-4 years → **6 pts** | < 3 years → **2 pts** |
+| 12 | **Net Income Growth Streak** | 5+ consecutive years growing → **10 pts** | 3-4 years → **6 pts** | < 3 years → **2 pts** |
+| 13 | **Recession Resilience** | Profitable through last recession → **5 pts** | Revenue dip < 10% → **3 pts** | Significant losses → **1 pt** |
+
+How to assess:
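+- Revenue/NI growth: Check year-over-year changes for the last 5 years. Note that 5 annual periods yield only 4 year-over-year comparisons; if a sixth (earlier) year is not available, treat growth in all 4 comparisons as the top tier and state this assumption in the output.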
+- Recession resilience: Check 2020 (COVID) and 2022 (rate hikes) performance. For older data, check 2008-2009 if available.
+- A single flat year in an otherwise consistent growth streak can be scored as "Good"
+
+## Score Interpretation
+
+| Total Score | Rating | Interpretation |
+|-------------|--------|----------------|
+| 80-100 | **Excellent** | Strong durable competitive advantage. Consistent profitability, fortress balance sheet, capital-light operations. Classic Buffett-style investment candidate. |
+| 60-79 | **Good** | Solid business with some competitive advantages. May have minor weaknesses in one category. Worth deeper investigation. |
+| 40-59 | **Average** | Mediocre competitive position. Multiple areas of concern. Higher risk of margin erosion or competitive disruption. |
+| < 40 | **Weak** | No clear competitive advantage. High debt, inconsistent earnings, or capital-intensive operations. Not a typical Buffett investment. |
+
+## Sector-Specific Caveats
+
+- **Financials**: Skip gross margin (criterion 1). Use net interest margin > 3% as substitute for 10 pts. D/E ratio thresholds don't apply — use Tier 1 Capital Ratio > 10% for 7 pts instead.
+- **Utilities**: Naturally capital-intensive (CapEx criterion will score low). Offset by checking regulated return stability. If regulated ROE is consistently 9-11%, award 6 pts for criterion 8.
+- **REITs**: Required to pay out 90%+ as dividends, so retained earnings won't grow. Skip criterion 7; award 5 pts if FFO per share grows consistently instead.
+- **Growth-stage Tech**: May not yet have 5 years of profitability. Score consistency based on revenue growth and gross margin expansion trajectory. Note that the overall score may be artificially low.
+
+## Output Format
+
+Present the scorecard as a table:
+
+| # | Criterion | Value | Score | Max |
+|---|-----------|-------|-------|-----|
+| 1 | Gross Margin | 43.2% | 10 | 10 |
+| 2 | Net Margin | 25.1% | 10 | 10 |
+| ... | ... | ... | ... | ... |
+| | **Total** | | **XX** | **100** |
+| | **Rating** | | **Excellent/Good/Average/Weak** | |
--- a/skills/earnings-analysis/references/financial-ratios-benchmarks.md
+++ b/skills/earnings-analysis/references/financial-ratios-benchmarks.md
@ -0,0 +1,70 @@
+# Financial Ratios Benchmarks by Sector
+
+Use the company's `sector` from `get_company_facts` to look up benchmark ranges below. Compare the company's ratios against these benchmarks and note deviations.
+
+## Profitability Benchmarks
+
+| Sector | Gross Margin | Operating Margin | Net Margin | ROE | ROA |
+|--------|-------------|-----------------|------------|-----|-----|
+| Communication Services | 50-60% | 15-25% | 10-18% | 12-20% | 5-10% |
+| Consumer Discretionary | 35-50% | 8-15% | 5-10% | 15-25% | 5-10% |
+| Consumer Staples | 35-45% | 12-18% | 8-12% | 20-30% | 8-12% |
+| Energy | 30-50% | 10-20% | 5-15% | 10-20% | 5-10% |
+| Financials | N/A | 25-35% | 15-25% | 10-15% | 1-2% |
+| Health Care | 55-70% | 15-25% | 10-20% | 15-25% | 8-12% |
+| Industrials | 25-35% | 10-15% | 6-10% | 15-20% | 5-8% |
+| Information Technology | 55-70% | 20-30% | 15-25% | 20-35% | 10-15% |
+| Materials | 25-35% | 10-18% | 5-12% | 10-18% | 5-8% |
+| Real Estate | 55-70% | 25-40% | 15-30% | 5-10% | 2-5% |
+| Utilities | 35-50% | 15-25% | 8-15% | 8-12% | 3-5% |
+
+## Balance Sheet Benchmarks
+
+| Sector | Current Ratio | Quick Ratio | D/E Ratio | Interest Coverage |
+|--------|--------------|-------------|-----------|-------------------|
+| Communication Services | 1.0-1.5 | 0.8-1.2 | 0.8-1.5 | 4-8x |
+| Consumer Discretionary | 1.2-2.0 | 0.8-1.5 | 0.5-1.2 | 5-10x |
+| Consumer Staples | 1.0-1.5 | 0.6-1.0 | 0.5-1.0 | 8-15x |
+| Energy | 1.0-1.5 | 0.8-1.2 | 0.3-0.8 | 5-10x |
+| Financials | N/A | N/A | 2.0-8.0 | N/A |
+| Health Care | 1.5-2.5 | 1.2-2.0 | 0.3-0.8 | 8-15x |
+| Industrials | 1.2-2.0 | 0.8-1.5 | 0.5-1.0 | 6-12x |
+| Information Technology | 2.0-3.5 | 1.5-3.0 | 0.2-0.6 | 15-30x |
+| Materials | 1.5-2.5 | 1.0-1.5 | 0.4-0.8 | 6-12x |
+| Real Estate | 1.0-1.5 | 0.5-1.0 | 0.8-1.5 | 3-5x |
+| Utilities | 0.8-1.2 | 0.5-0.8 | 1.0-2.0 | 3-5x |
+
+## Cash Flow Benchmarks
+
+| Sector | FCF Margin | CapEx/Revenue | Op. CF / Net Income |
+|--------|-----------|---------------|---------------------|
+| Communication Services | 10-20% | 10-20% | 1.2-1.8x |
+| Consumer Discretionary | 5-12% | 3-8% | 1.1-1.5x |
+| Consumer Staples | 8-15% | 3-6% | 1.2-1.5x |
+| Energy | 5-15% | 15-30% | 1.5-2.5x |
+| Financials | N/A | 1-3% | N/A |
+| Health Care | 15-25% | 3-8% | 1.2-1.8x |
+| Industrials | 5-12% | 3-8% | 1.2-1.6x |
+| Information Technology | 20-35% | 3-10% | 1.2-1.8x |
+| Materials | 5-12% | 5-12% | 1.3-2.0x |
+| Real Estate | 15-30% | 5-15% | 1.5-3.0x |
+| Utilities | 5-10% | 15-25% | 2.0-3.5x |
+
+## Usage Notes
+
+- **Financials sector**: Gross margin and current/quick ratios are not meaningful for banks and insurers. Use net interest margin and capital adequacy ratios instead.
+- **Real Estate**: High depreciation makes net margin less useful. Focus on Funds From Operations (FFO).
+- **Growth-stage companies**: May have negative margins. Compare against growth-stage peers rather than mature sector benchmarks.
+- **Cyclical sectors** (Energy, Materials, Industrials): Use cycle-average margins (5-7 years) rather than single-year comparisons.
+- **Post-M&A**: Goodwill and amortization may distort margins for 1-2 years after acquisitions. Note any large acquisitions.
+
+## Buffett's Rules of Thumb (Quick Reference)
+
+| Metric | Excellent | Good | Weak |
+|--------|-----------|------|------|
+| Gross Margin | > 40% | 30-40% | < 30% |
+| Net Margin | > 20% | 10-20% | < 10% |
+| ROE | > 15% | 10-15% | < 10% |
+| D/E Ratio | < 0.8 | 0.8-1.5 | > 1.5 |
+| CapEx / Net Income | < 25% | 25-50% | > 50% |
+| Debt Payoff (years) | < 2 | 2-4 | > 4 |