Merge pull request #136 from multica-ai/fix/agent-compaction

fix(agent): prevent context window overflow with 3-layer compaction defense
This commit is contained in:
LinYushen 2026-02-12 17:23:12 +08:00 committed by GitHub
commit 8bc36a9cc9
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
9 changed files with 263 additions and 46 deletions

View file

@ -37,7 +37,7 @@ vi.mock("@mariozechner/pi-coding-agent", () => ({
describe("token-estimation", () => {
describe("constants", () => {
it("should have correct safety margin", () => {
expect(ESTIMATION_SAFETY_MARGIN).toBe(1.2);
expect(ESTIMATION_SAFETY_MARGIN).toBe(1.5);
});
it("should have correct compaction trigger ratio", () => {
@ -63,20 +63,20 @@ describe("token-estimation", () => {
});
it("should estimate tokens based on character count", () => {
// ~3 chars per token
expect(estimateSystemPromptTokens("abc")).toBe(1);
expect(estimateSystemPromptTokens("abcdef")).toBe(2);
expect(estimateSystemPromptTokens("abcdefghi")).toBe(3);
// ~2 chars per token (conservative for CJK/mixed content)
expect(estimateSystemPromptTokens("ab")).toBe(1);
expect(estimateSystemPromptTokens("abcd")).toBe(2);
expect(estimateSystemPromptTokens("abcdef")).toBe(3);
});
it("should ceil the result", () => {
// 4 chars / 3 = 1.33, should ceil to 2
expect(estimateSystemPromptTokens("abcd")).toBe(2);
// 3 chars / 2 = 1.5, should ceil to 2
expect(estimateSystemPromptTokens("abc")).toBe(2);
});
it("should handle long prompts", () => {
const longPrompt = "a".repeat(3000);
expect(estimateSystemPromptTokens(longPrompt)).toBe(1000);
expect(estimateSystemPromptTokens(longPrompt)).toBe(1500);
});
});
@ -140,7 +140,7 @@ describe("token-estimation", () => {
reserveTokens: 0,
});
// Utilization = (tokens * 1.2) / available
// Utilization = (tokens * 1.5) / available
expect(result.utilizationRatio).toBeGreaterThan(0);
});
});
@ -292,26 +292,26 @@ describe("token-estimation", () => {
content: "x".repeat(400), // ~100 tokens
} as AgentMessage;
// With safety margin 1.2, 100 * 1.2 = 120 tokens
// 120 > 1000 * 0.1 = 100, so oversized
// With safety margin 1.5, 100 * 1.5 = 150 tokens
// 150 > 1000 * 0.1 = 100, so oversized
expect(isMessageOversized(message, 1000, 0.1)).toBe(true);
// 120 < 1000 * 0.2 = 200, so not oversized
// 150 < 1000 * 0.2 = 200, so not oversized
expect(isMessageOversized(message, 1000, 0.2)).toBe(false);
});
it("should apply safety margin to token count", () => {
const message = {
role: "user",
content: "x".repeat(400), // ~100 tokens, with margin ~120
content: "x".repeat(400), // ~100 tokens, with margin ~150
} as AgentMessage;
// Without margin: 100 < 250 (50% of 500)
// With margin: 120 < 250, still ok
// With margin: 150 < 250, still ok
expect(isMessageOversized(message, 500, 0.5)).toBe(false);
// Without margin: 100 < 100 would be false
// With margin: 120 > 100, should be true
// With margin: 150 > 100, should be true
expect(isMessageOversized(message, 200, 0.5)).toBe(true);
});
});

View file

@ -9,7 +9,7 @@ import { estimateTokens } from "@mariozechner/pi-coding-agent";
import type { TokenEstimation, TokenAwareCompactionResult } from "./types.js";
/** Safety margin coefficient to compensate for estimation inaccuracy */
export const ESTIMATION_SAFETY_MARGIN = 1.2; // 20% buffer
export const ESTIMATION_SAFETY_MARGIN = 1.5; // 50% buffer (covers CJK and mixed content)
/** Utilization threshold for triggering compaction */
export const COMPACTION_TRIGGER_RATIO = 0.8; // 80%
@ -32,10 +32,10 @@ export function estimateMessagesTokens(messages: AgentMessage[]): number {
*/
export function estimateSystemPromptTokens(systemPrompt: string | undefined): number {
  // A missing or empty prompt contributes no tokens.
  if (!systemPrompt) return 0;
  // Conservative estimation: ~2 chars = 1 token.
  // English/code averages ~4 chars/token but CJK averages ~1-2 chars/token,
  // so dividing by 2 is a safe default that avoids underestimating mixed content.
  // (The earlier "/ 3" average must not run first, or this estimate is unreachable.)
  return Math.ceil(systemPrompt.length / 2);
}
/**

View file

@ -0,0 +1,21 @@
/**
* Error classification utilities for agent error handling.
*/
/**
 * Pull a human-readable message out of an arbitrary thrown value.
 *
 * `Error` instances yield their `message`; plain objects that carry a string
 * `message` property yield that string (stringifying such an object would
 * only produce "[object Object]"); anything else is stringified.
 */
function extractErrorMessage(error: unknown): string {
  if (error instanceof Error) return error.message;
  if (typeof error === "object" && error !== null && "message" in error) {
    const candidate = (error as { message?: unknown }).message;
    if (typeof candidate === "string") return candidate;
  }
  return String(error);
}

/**
 * Check if an error is a context overflow / "prompt too long" error from any LLM provider.
 *
 * These errors indicate the request exceeded the model's context window and should
 * trigger auto-compaction rather than auth profile rotation.
 *
 * @param error - Any thrown value: an `Error`, a plain object with a `message`, a string, etc.
 * @returns `true` when the (case-insensitive) message matches one of the known overflow phrases.
 */
export function isContextOverflowError(error: unknown): boolean {
  const msg = extractErrorMessage(error).toLowerCase();
  return (
    msg.includes("prompt is too long") ||
    msg.includes("context length exceeded") ||
    msg.includes("maximum context length") ||
    msg.includes("request_too_large") ||
    msg.includes("request size exceeds") ||
    // HTTP 413 is only treated as overflow when the text also says "too large".
    (msg.includes("413") && msg.includes("too large"))
  );
}

View file

@ -22,7 +22,14 @@ import {
checkContextWindow,
DEFAULT_CONTEXT_TOKENS,
type ContextWindowGuardResult,
estimateTokenUsage,
COMPACTION_TRIGGER_RATIO,
compactMessagesTokenAware,
MIN_KEEP_MESSAGES,
} from "./context-window/index.js";
import {
pruneToolResults,
} from "./context-window/tool-result-pruning.js";
import { mergeToolsConfig, type ToolsConfig } from "./tools/policy.js";
import {
loadAuthProfileStore,
@ -42,6 +49,7 @@ import {
sanitizeToolCallInputs,
sanitizeToolUseResultPairing,
} from "./session/session-transcript-repair.js";
import { isContextOverflowError } from "./errors.js";
// ============================================================
// Error classification for auth profile rotation
@ -89,11 +97,15 @@ export class Agent {
private readonly stderr: NodeJS.WritableStream;
private initialized = false;
// Context window settings (for pre-flight compaction)
private readonly reserveTokens: number;
// Internal run state
private _internalRun = false;
private _isRunning = false;
private _aborted = false;
private _runMutex: Promise<void> = Promise.resolve();
private _compactionPromise: Promise<void> = Promise.resolve();
private currentUserDisplayPrompt: string | undefined;
// MulticaEvent subscribers (parallel to PiAgentCore's subscriber list)
@ -188,8 +200,10 @@ export class Agent {
return this.currentApiKey;
},
transformContext: async (messages) => {
const sanitizedInputs = sanitizeToolCallInputs(messages);
return sanitizeToolUseResultPairing(sanitizedInputs);
let result = sanitizeToolCallInputs(messages);
result = sanitizeToolUseResultPairing(result);
result = this.preflightCompact(result);
return result;
},
});
@ -260,6 +274,9 @@ export class Agent {
? resolveApiKey(this.resolvedProvider, options.apiKey)
: undefined;
// Store reserveTokens for pre-flight compaction
this.reserveTokens = options.reserveTokens ?? 1024;
// Create the SessionManager (with context window configuration)
this.session = new SessionManager({
sessionId: this.sessionId,
@ -425,6 +442,8 @@ export class Agent {
prompt: string,
options?: { displayPrompt?: string },
): Promise<AgentRunResult> {
// Wait for any in-flight compaction from the previous run
await this._compactionPromise;
await this.ensureInitialized();
this.refreshAuthState();
this.output.state.lastAssistantText = "";
@ -444,6 +463,9 @@ export class Agent {
const canRotate = !this.pinnedProfile && this.profileCandidates.length > 1;
let lastError: unknown;
const MAX_OVERFLOW_COMPACTION_ATTEMPTS = 2;
let overflowAttempts = 0;
// Loop to exhaust all candidate profiles on rotatable errors
while (true) {
try {
@ -452,6 +474,34 @@ export class Agent {
} catch (error) {
lastError = error;
// Context overflow recovery: auto-compact and retry before trying auth rotation
if (isContextOverflowError(error) && overflowAttempts < MAX_OVERFLOW_COMPACTION_ATTEMPTS) {
overflowAttempts++;
this.stderr.write(
`[context-overflow] Overflow detected (attempt ${overflowAttempts}/${MAX_OVERFLOW_COMPACTION_ATTEMPTS}), compacting...\n`,
);
const messages = this.agent.state.messages.slice();
const result = await this.session.maybeCompact(messages);
if (result?.kept) {
this.agent.replaceMessages(result.kept);
this.output.state.lastAssistantText = "";
continue; // retry with compacted messages
}
// Forced fallback: estimation may diverge from reality (the LLM
// already told us the context is too large), so drop the oldest
// half of messages even when maybeCompact thinks no compaction is needed.
if (messages.length > MIN_KEEP_MESSAGES) {
const keepCount = Math.max(MIN_KEEP_MESSAGES, Math.floor(messages.length / 2));
const forcedKept = messages.slice(-keepCount);
this.stderr.write(
`[context-overflow] Forced compaction: ${messages.length}${forcedKept.length} messages\n`,
);
this.agent.replaceMessages(forcedKept);
this.output.state.lastAssistantText = "";
continue;
}
}
const reason = classifyError(error);
if (this.currentProfileId && isRotatableError(reason)) {
markAuthProfileFailure(this.currentProfileId, reason);
@ -615,35 +665,88 @@ export class Agent {
// Skip compaction during internal runs — internal messages will be
// rolled back from memory afterwards, so compacting now would be incorrect.
if (message.role === "assistant" && !this._internalRun) {
void this.maybeCompact();
this._compactionPromise = this.maybeCompact().catch((err) => {
console.error("[Agent] Compaction failed:", err);
});
}
}
}
/**
 * Pre-flight context compaction — runs inside transformContext before every LLM call.
 *
 * Operates only on the message array it is given and returns the array to send;
 * this method itself does not persist anything. When the estimated token
 * utilization reaches COMPACTION_TRIGGER_RATIO it first prunes tool results and,
 * if that is not enough, drops the oldest messages.
 *
 * @param messages - Messages about to be sent to the model.
 * @returns The original array when under the threshold, otherwise the pruned/compacted array.
 */
private preflightCompact(messages: AgentMessage[]): AgentMessage[] {
  // Single place for the estimation parameters so both estimates stay in sync.
  const estimate = (candidate: AgentMessage[]) =>
    estimateTokenUsage({
      messages: candidate,
      systemPrompt: this.agent.state.systemPrompt,
      contextWindowTokens: this.contextWindowGuard.tokens,
      reserveTokens: this.reserveTokens,
    });

  if (estimate(messages).utilizationRatio < COMPACTION_TRIGGER_RATIO) {
    return messages; // fast path
  }

  const originalCount = messages.length;
  let result = messages;

  // Phase 1: Prune tool results (soft trim + hard clear)
  const pruneResult = pruneToolResults({
    messages: result,
    contextWindowTokens: this.contextWindowGuard.tokens,
  });
  if (pruneResult.changed) {
    result = pruneResult.messages;
  }

  // Re-estimate after pruning
  const afterPrune = estimate(result);

  // Phase 2: Drop oldest messages if still over threshold
  if (afterPrune.utilizationRatio >= COMPACTION_TRIGGER_RATIO) {
    const compacted = compactMessagesTokenAware(result, afterPrune.availableTokens);
    if (compacted) {
      result = compacted.kept;
    }
  }

  // Only a change in message count is reported; in-place tool-result trimming is silent.
  if (result.length < originalCount) {
    const saved = originalCount - result.length;
    // Separator between the counts is required: without it "12" and "6" print as "126".
    this.stderr.write(
      `[pre-flight compaction] pruned ${saved} messages (${originalCount} -> ${result.length})\n`,
    );
  }

  return result;
}
private async maybeCompact() {
const messages = this.agent.state.messages.slice();
if (!this.session.needsCompaction(messages)) return;
try {
const result = await this.session.maybeCompact(messages);
if (!result) return;
const result = await this.session.maybeCompact(messages);
if (!result) return;
this.emitMulticaEvent({ type: "compaction_start" });
if (result?.kept) {
this.agent.replaceMessages(result.kept);
}
const endEvent: CompactionEndEvent = {
type: "compaction_end",
removed: result?.removedCount ?? 0,
kept: result?.kept.length ?? messages.length,
tokensRemoved: result?.tokensRemoved,
tokensKept: result?.tokensKept,
reason: result?.reason ?? "tokens",
};
this.emitMulticaEvent(endEvent);
} catch (err) {
throw err;
this.emitMulticaEvent({ type: "compaction_start" });
if (result.kept) {
this.agent.replaceMessages(result.kept);
}
const endEvent: CompactionEndEvent = {
type: "compaction_end",
removed: result.removedCount ?? 0,
kept: result.kept.length ?? messages.length,
tokensRemoved: result.tokensRemoved,
tokensKept: result.tokensKept,
reason: result.reason ?? "tokens",
};
this.emitMulticaEvent(endEvent);
}
/**

View file

@ -44,7 +44,7 @@ vi.mock("../context-window/index.js", async () => {
const systemPromptTokens = params.systemPrompt ? 100 : 0;
const reserve = params.reserveTokens ?? 1024;
const availableTokens = Math.max(0, params.contextWindowTokens - systemPromptTokens - reserve);
const utilizationRatio = availableTokens > 0 ? (messageTokens * 1.2) / availableTokens : 1;
const utilizationRatio = availableTokens > 0 ? (messageTokens * 1.5) / availableTokens : 1;
return {
messageTokens,
@ -234,7 +234,7 @@ describe("compaction", () => {
// 100 * 10 = 1000 message tokens
// System: 100 tokens, Reserve: 1024
// Available: 2000 - 100 - 1024 = 876
// Utilization: (1000 * 1.2) / 876 = 1.37 > 0.8
// Utilization: (1000 * 1.5) / 876 = 1.71 > 0.8
const result = compactMessages(messages, {
mode: "tokens",
contextWindowTokens: 2000,
@ -249,7 +249,7 @@ describe("compaction", () => {
const messages = createMessages(5);
// 5 * 10 = 50 message tokens
// Available: 10000 - 100 - 1024 = 8876
// Utilization: (50 * 1.2) / 8876 = 0.007 < 0.8
// Utilization: (50 * 1.5) / 8876 = 0.008 < 0.8
const result = compactMessages(messages, {
mode: "tokens",
contextWindowTokens: 10000,

View file

@ -9,13 +9,22 @@ import {
type AgentMessageItem,
type ExecApprovalRequestPayload,
type ApprovalDecision,
type CompactionEndEvent,
} from "@multica/sdk";
export type ToolStatus = "running" | "success" | "error" | "interrupted";
/**
 * Summary of one context compaction, attached to a chat message.
 * The fields mirror the values copied from a `compaction_end` event.
 */
export interface CompactionInfo {
/** Count reported by the event as removed. */
removed: number;
/** Count reported by the event as kept. */
kept: number;
/** Tokens removed, when the event provides it. */
tokensRemoved?: number;
/** Tokens kept, when the event provides it. */
tokensKept?: number;
/** Reason string reported by the event for the compaction. */
reason: string;
}
export interface Message {
id: string;
role: "user" | "assistant" | "toolResult";
role: "user" | "assistant" | "toolResult" | "system";
content: ContentBlock[];
agentId: string;
stopReason?: string;
@ -24,6 +33,8 @@ export interface Message {
toolArgs?: Record<string, unknown>;
toolStatus?: ToolStatus;
isError?: boolean;
systemType?: "compaction";
compaction?: CompactionInfo;
}
export interface ChatError {
@ -215,6 +226,27 @@ export function useChat() {
}
case "tool_execution_update":
break;
case "compaction_end": {
const ce = event as CompactionEndEvent;
setMessages((prev) => [
...prev,
{
id: uuidv7(),
role: "system",
content: [],
agentId: payload.agentId,
systemType: "compaction",
compaction: {
removed: ce.removed,
kept: ce.kept,
tokensRemoved: ce.tokensRemoved,
tokensKept: ce.tokensKept,
reason: ce.reason,
},
},
]);
break;
}
}
}, []);

View file

@ -2,9 +2,17 @@ import type { ContentBlock } from "@multica/sdk"
export type ToolStatus = "running" | "success" | "error" | "interrupted"
/**
 * Details of a context compaction carried on a message's `compaction` field.
 */
export interface CompactionInfo {
/** How many messages the compaction removed. */
removed: number
/** How many were kept (presumably messages, matching `removed` — confirm with the producer). */
kept: number
/** Tokens freed by the compaction; optional. */
tokensRemoved?: number
/** Tokens remaining after the compaction; optional (presumed meaning — confirm with the producer). */
tokensKept?: number
/** Why the compaction happened; free-form string. */
reason: string
}
export interface Message {
id: string
role: "user" | "assistant" | "toolResult"
role: "user" | "assistant" | "toolResult" | "system"
content: ContentBlock[]
agentId: string
stopReason?: string
@ -13,4 +21,6 @@ export interface Message {
toolArgs?: Record<string, unknown>
toolStatus?: ToolStatus
isError?: boolean
systemType?: "compaction"
compaction?: CompactionInfo
}

View file

@ -0,0 +1,45 @@
"use client"
import { memo } from "react"
import { Scissors } from "lucide-react"
import type { Message } from "@multica/store"
/**
 * Render a token count for display: values of 1000 or more become an
 * approximate thousands figure with one decimal (e.g. "~1.5k"); smaller
 * values are printed as-is.
 */
function formatTokens(n: number): string {
  const isLarge = n >= 1000
  return isLarge ? `~${(n / 1000).toFixed(1)}k` : `${n}`
}
/** Props for {@link CompactionItem}. */
interface CompactionItemProps {
  message: Message
}

/**
 * Single-line notice shown in the message list when the context was compacted.
 * Renders nothing if the message carries no compaction details.
 */
export const CompactionItem = memo(function CompactionItem(props: CompactionItemProps) {
  const details = props.message.compaction
  if (!details) {
    return null
  }

  // "summary" gets its own heading; every other reason uses the generic one.
  let heading = "Context compacted"
  if (details.reason === "summary") {
    heading = "Context summarized"
  }

  const removedText = `${details.removed} messages removed`
  // Token figure is appended only when it was reported.
  let freedText = ""
  if (details.tokensRemoved != null) {
    freedText = `, ${formatTokens(details.tokensRemoved)} tokens freed`
  }

  return (
    <div className="py-0.5 px-2.5 text-sm text-muted-foreground">
      <div className="flex items-center gap-1.5 px-2.5 py-1">
        {/* Status dot */}
        <span className="size-1.5 rounded-full shrink-0 bg-muted-foreground/40" />
        {/* Icon */}
        <Scissors className="size-3.5 shrink-0" />
        {/* Label */}
        <span className="font-medium shrink-0">{heading}</span>
        {/* Stats */}
        <span className="ml-auto text-xs text-muted-foreground/60 shrink-0">
          {removedText}{freedText}
        </span>
      </div>
    </div>
  )
})

View file

@ -5,6 +5,7 @@ import { MemoizedMarkdown } from "@multica/ui/components/markdown";
import { StreamingMarkdown } from "@multica/ui/components/markdown/StreamingMarkdown";
import { ToolCallItem } from "@multica/ui/components/tool-call-item";
import { ThinkingItem } from "@multica/ui/components/thinking-item";
import { CompactionItem } from "@multica/ui/components/compaction-item";
import { cn, getTextContent } from "@multica/ui/lib/utils";
import type { Message } from "@multica/store";
import type { ContentBlock, ToolCall, ThinkingContent } from "@multica/sdk";
@ -78,6 +79,11 @@ export const MessageList = memo(function MessageList({ messages, streamingIds }:
return (
<div className="relative p-6 px-4 sm:px-10 max-w-4xl mx-auto">
{messages.map((msg) => {
// System messages (e.g. compaction notifications)
if (msg.role === "system") {
return <CompactionItem key={msg.id} message={msg} />
}
// ToolResult messages → render as tool execution item
if (msg.role === "toolResult") {
return <ToolCallItem key={msg.id} message={msg} />