Merge pull request #201 from multica-ai/forrestchang/debug-agent-logs
fix(agent): report accurate compaction metrics and add run-log observability
This commit is contained in:
commit
47f8e621c8
12 changed files with 537 additions and 16 deletions
65
CLAUDE.md
65
CLAUDE.md
|
|
@ -170,19 +170,72 @@ Fonts are loaded via `@fontsource` packages (not Google Fonts) for cross-platfor
|
|||
The agent engine supports structured run logging for debugging. When enabled, it writes all key execution events to `~/.super-multica/sessions/{sessionId}/run-log.jsonl` alongside the session data.
|
||||
|
||||
```bash
|
||||
# Enable via environment variable
|
||||
MULTICA_RUN_LOG=1 pnpm multica run "your prompt"
|
||||
# Enable via CLI flag
|
||||
pnpm multica run --run-log "your prompt"
|
||||
|
||||
# Enable during tests
|
||||
MULTICA_RUN_LOG=1 pnpm --filter @multica/core test
|
||||
# Or via environment variable
|
||||
MULTICA_RUN_LOG=1 pnpm multica run "your prompt"
|
||||
|
||||
# Or programmatically
|
||||
const agent = new Agent({ enableRunLog: true });
|
||||
```
|
||||
|
||||
Logged events: `run_start`, `run_end`, `llm_call`, `llm_result`, `tool_start`, `tool_end`, `context_overflow`, `auth_rotate`, `error_classify`, `preflight_compact_start/end`, `compaction`.
|
||||
When `--run-log` is enabled, the CLI prints the session directory path to stderr:
|
||||
```
|
||||
[session: 019c584a-...]
|
||||
[session-dir: ~/.super-multica/sessions/019c584a-...]
|
||||
```
|
||||
|
||||
Each line is a JSON object with `ts` (timestamp) and `event` (type), suitable for AI-assisted log analysis. Implementation: `packages/core/src/agent/run-log.ts`.
|
||||
Logged events: `run_start`, `run_end`, `llm_call`, `llm_result`, `tool_start`, `tool_end`, `context_overflow`, `auth_rotate`, `error_classify`, `preflight_compact_start/end`, `tool_result_pruning`, `compaction`, `compaction_detail`.
|
||||
|
||||
Each line is a JSON object with `ts` (timestamp) and `event` (type), suitable for AI-assisted log analysis. Full event reference: `packages/core/src/agent/run-log.ts`.
|
||||
|
||||
## E2E Testing (Agent-Driven)
|
||||
|
||||
E2E tests are executed and analyzed by the Coding Agent (Claude Code), not by vitest. The Coding Agent runs the Multica agent via CLI, reads the structured run-log, and intelligently analyzes intermediate behavior and results.
|
||||
|
||||
### How to Run
|
||||
|
||||
E2E tests use an isolated data directory (`~/.super-multica-e2e`) to avoid polluting dev or production session data.
|
||||
|
||||
```bash
|
||||
# Basic E2E test (web_search/data tools require MULTICA_API_URL)
|
||||
SMC_DATA_DIR=~/.super-multica-e2e MULTICA_API_URL=https://api-dev.copilothub.ai pnpm multica run --run-log "your test prompt"
|
||||
|
||||
# With specific provider
|
||||
SMC_DATA_DIR=~/.super-multica-e2e MULTICA_API_URL=https://api-dev.copilothub.ai pnpm multica run --run-log --provider kimi-coding "your test prompt"
|
||||
|
||||
# Multi-turn test (reuse session)
|
||||
SMC_DATA_DIR=~/.super-multica-e2e MULTICA_API_URL=https://api-dev.copilothub.ai pnpm multica run --run-log --session <session-id> "follow-up prompt"
|
||||
|
||||
# Clean up all E2E test data
|
||||
rm -rf ~/.super-multica-e2e
|
||||
```
|
||||
|
||||
### Analysis Workflow
|
||||
|
||||
After running, the Coding Agent should:
|
||||
1. Read `{session-dir}/run-log.jsonl` — structured execution events
|
||||
2. Read `{session-dir}/session.jsonl` — full conversation transcript (if needed)
|
||||
3. Analyze event sequence, tool calls, errors, and timing
|
||||
4. Report findings with verdict (pass/fail + details)
|
||||
|
||||
### What to Check
|
||||
|
||||
- **Event completeness**: `run_start` → ... → `run_end` (no orphaned starts)
|
||||
- **Tool pairing**: every `tool_start` has a matching `tool_end`
|
||||
- **Error handling**: `is_error`, `error_classify`, `auth_rotate` events
|
||||
- **Compaction health**: `tokens_removed > 0` when compaction fires
|
||||
- **Performance**: `llm_result.duration_ms`, tool execution times
|
||||
|
||||
### Important
|
||||
|
||||
- **`SMC_DATA_DIR=~/.super-multica-e2e`** isolates E2E test sessions from dev (`~/.super-multica-dev`) and production (`~/.super-multica`) data. Always set this.
|
||||
- **`MULTICA_API_URL=https://api-dev.copilothub.ai`** is required for `web_search` and `data` tools. Without it, these tools fail with `MULTICA_API_URL is required`.
|
||||
- **Auth for `web_search`/`data`**: These tools need dev backend auth. The auth store auto-falls back to `~/.super-multica-dev/auth.json`. If missing, run `pnpm dev:local` first and log in through the Desktop app.
|
||||
- Default provider is `kimi-coding`. Override with `--provider`.
|
||||
- Run-log and session data are at `~/.super-multica-e2e/sessions/{sessionId}/`
|
||||
- Detailed guide with feature-specific test playbooks: `docs/e2e-testing-guide.md`
|
||||
|
||||
## Credentials Setup
|
||||
|
||||
|
|
|
|||
|
|
@ -6,9 +6,11 @@
|
|||
* echo "prompt" | multica run
|
||||
*/
|
||||
|
||||
import { join } from "node:path";
|
||||
import { Agent } from "@multica/core";
|
||||
import type { AgentOptions } from "@multica/core";
|
||||
import type { ToolsConfig } from "@multica/core";
|
||||
import { DATA_DIR } from "@multica/utils";
|
||||
import { cyan, yellow, dim } from "../colors.js";
|
||||
|
||||
type RunOptions = {
|
||||
|
|
@ -23,6 +25,7 @@ type RunOptions = {
|
|||
cwd?: string | undefined;
|
||||
session?: string | undefined;
|
||||
debug?: boolean;
|
||||
runLog?: boolean;
|
||||
toolsAllow?: string[];
|
||||
toolsDeny?: string[];
|
||||
help?: boolean;
|
||||
|
|
@ -45,6 +48,7 @@ ${cyan("Options:")}
|
|||
${yellow("--cwd")} DIR Working directory
|
||||
${yellow("--session")} ID Session ID for persistence
|
||||
${yellow("--debug")} Enable debug logging
|
||||
${yellow("--run-log")} Enable structured run logging (run-log.jsonl)
|
||||
${yellow("--help")}, -h Show this help
|
||||
|
||||
${cyan("Tools Configuration:")}
|
||||
|
|
@ -123,6 +127,10 @@ function parseArgs(argv: string[]): { opts: RunOptions; prompt: string } {
|
|||
opts.debug = true;
|
||||
continue;
|
||||
}
|
||||
if (arg === "--run-log") {
|
||||
opts.runLog = true;
|
||||
continue;
|
||||
}
|
||||
if (arg === "--tools-allow") {
|
||||
const value = args.shift();
|
||||
opts.toolsAllow = value?.split(",").map((s) => s.trim()) ?? [];
|
||||
|
|
@ -182,6 +190,8 @@ export async function runCommand(args: string[]): Promise<void> {
|
|||
}
|
||||
}
|
||||
|
||||
const enableRunLog = opts.runLog || !!process.env.MULTICA_RUN_LOG;
|
||||
|
||||
const agent = new Agent({
|
||||
profileId: opts.profile,
|
||||
provider: opts.provider,
|
||||
|
|
@ -194,13 +204,19 @@ export async function runCommand(args: string[]): Promise<void> {
|
|||
cwd: opts.cwd,
|
||||
sessionId: opts.session,
|
||||
debug: opts.debug,
|
||||
enableRunLog,
|
||||
tools: toolsConfig,
|
||||
});
|
||||
|
||||
const sessionDir = join(DATA_DIR, "sessions", agent.sessionId);
|
||||
|
||||
// If it's a newly created session, notify user of sessionId
|
||||
if (!opts.session) {
|
||||
console.error(`[session: ${agent.sessionId}]`);
|
||||
}
|
||||
if (enableRunLog) {
|
||||
console.error(`[session-dir: ${sessionDir}]`);
|
||||
}
|
||||
|
||||
const result = await agent.run(finalPrompt);
|
||||
if (result.error) {
|
||||
|
|
|
|||
295
docs/e2e-testing-guide.md
Normal file
295
docs/e2e-testing-guide.md
Normal file
|
|
@ -0,0 +1,295 @@
|
|||
# Agent-Driven E2E Testing Guide
|
||||
|
||||
This guide teaches Coding Agents (Claude Code, etc.) how to perform automated end-to-end testing of Super Multica features. Unlike traditional test frameworks, **the Coding Agent itself is the test runner and oracle** — it executes the agent, reads structured logs, and intelligently analyzes the results.
|
||||
|
||||
## Overview
|
||||
|
||||
The testing flow:
|
||||
|
||||
1. Coding Agent runs `pnpm multica run --run-log "test prompt"`
|
||||
2. The agent engine executes the prompt with full structured logging
|
||||
3. Coding Agent reads the `run-log.jsonl` and `session.jsonl` files
|
||||
4. Coding Agent analyzes events, tool calls, and behavior for correctness
|
||||
|
||||
This approach is superior to static assertions because:
|
||||
- The AI can understand **intent** — did the agent do what the prompt asked?
|
||||
- It can reason about **intermediate process** — were the right tools called in the right order?
|
||||
- It can detect **subtle issues** — token counts that don't make sense, unnecessary retries, missing events
|
||||
|
||||
## Prerequisites
|
||||
|
||||
1. **Credentials configured**: Run `pnpm multica credentials init` or ensure `~/.super-multica/credentials.json5` has valid provider credentials
|
||||
2. **Available providers**: Check with `pnpm multica profile list` or inspect credentials file
|
||||
3. **Default provider**: `kimi-coding` (Kimi Code, free tier available). Can override with `--provider`
|
||||
4. **`MULTICA_API_URL`**: Required for `web_search` and `data` tools. Set to `https://api-dev.copilothub.ai` for dev environment. Without this, web search and financial data tools will fail with `MULTICA_API_URL is required`
|
||||
5. **`SMC_DATA_DIR`**: Set to `~/.super-multica-e2e` to isolate E2E test sessions from dev (`~/.super-multica-dev`) and production (`~/.super-multica`) data. Without this, test sessions pollute the production sessions directory
|
||||
6. **Dev auth for `web_search`/`data` tools**: These tools authenticate via `auth.json` (session ID + device ID). The auth store automatically falls back to `~/.super-multica-dev/auth.json` when the E2E data dir has no auth. If `~/.super-multica-dev/auth.json` doesn't exist, run `pnpm dev:local` first and log in through the Desktop app to create it
|
||||
|
||||
## Running a Test
|
||||
|
||||
### Environment variables
|
||||
|
||||
All E2E test commands should include these env vars:
|
||||
|
||||
```bash
|
||||
# SMC_DATA_DIR — isolates test sessions from dev/production
|
||||
# MULTICA_API_URL — enables web_search and data tools
|
||||
export SMC_DATA_DIR=~/.super-multica-e2e
|
||||
export MULTICA_API_URL=https://api-dev.copilothub.ai
|
||||
```
|
||||
|
||||
### Basic command
|
||||
|
||||
```bash
|
||||
# For prompts that only need exec/read/write tools:
|
||||
SMC_DATA_DIR=~/.super-multica-e2e pnpm multica run --run-log "your test prompt here"
|
||||
|
||||
# For prompts that need web_search or data tools:
|
||||
SMC_DATA_DIR=~/.super-multica-e2e MULTICA_API_URL=https://api-dev.copilothub.ai pnpm multica run --run-log "your test prompt here"
|
||||
```
|
||||
|
||||
### With provider override
|
||||
|
||||
```bash
|
||||
SMC_DATA_DIR=~/.super-multica-e2e MULTICA_API_URL=https://api-dev.copilothub.ai pnpm multica run --run-log --provider claude-code "your test prompt"
|
||||
SMC_DATA_DIR=~/.super-multica-e2e MULTICA_API_URL=https://api-dev.copilothub.ai pnpm multica run --run-log --provider kimi-coding "your test prompt"
|
||||
```
|
||||
|
||||
### Resume a session (multi-turn testing)
|
||||
|
||||
```bash
|
||||
# First turn
|
||||
SMC_DATA_DIR=~/.super-multica-e2e MULTICA_API_URL=https://api-dev.copilothub.ai pnpm multica run --run-log "Create a file called test.txt with content 'hello'"
|
||||
# Note the session ID from stderr output: [session: 019c584a-...]
|
||||
|
||||
# Second turn (same session)
|
||||
SMC_DATA_DIR=~/.super-multica-e2e MULTICA_API_URL=https://api-dev.copilothub.ai pnpm multica run --run-log --session 019c584a-... "Read the file test.txt and tell me its content"
|
||||
```
|
||||
|
||||
### Cleanup
|
||||
|
||||
```bash
|
||||
# Remove all E2E test sessions
|
||||
rm -rf ~/.super-multica-e2e
|
||||
```
|
||||
|
||||
### Output
|
||||
|
||||
The CLI prints metadata to stderr:
|
||||
```
|
||||
[session: 019c584a-7753-762d-9fb9-9eb0a8187df5]
|
||||
[session-dir: /Users/you/.super-multica-e2e/sessions/019c584a-7753-762d-9fb9-9eb0a8187df5]
|
||||
```
|
||||
|
||||
Agent text output goes to stdout.
|
||||
|
||||
## Reading Results
|
||||
|
||||
After a run, two files contain the data needed for analysis:
|
||||
|
||||
### run-log.jsonl
|
||||
|
||||
Location: `{session-dir}/run-log.jsonl`
|
||||
|
||||
Each line is a JSON object with structured event data. Read this file to understand **what happened during execution**.
|
||||
|
||||
```jsonl
|
||||
{"ts":1739000001000,"event":"run_start","prompt":"What is 2+2?","provider":"kimi-coding","model":"kimi-k2-thinking","messages":0}
|
||||
{"ts":1739000002000,"event":"llm_call","provider":"kimi-coding","model":"kimi-k2-thinking","messages":2}
|
||||
{"ts":1739000005000,"event":"llm_result","duration_ms":3000}
|
||||
{"ts":1739000005000,"event":"run_end","duration_ms":4000,"error":null,"text":"4"}
|
||||
```
|
||||
|
||||
### session.jsonl
|
||||
|
||||
Location: `{session-dir}/session.jsonl`
|
||||
|
||||
Contains the full conversation transcript (user messages, assistant replies, tool calls and results). Read this for **message content analysis**.
|
||||
|
||||
## Run-Log Event Reference
|
||||
|
||||
> Source of truth: `packages/core/src/agent/run-log.ts` (JSDoc at top of file)
|
||||
|
||||
### Lifecycle Events
|
||||
|
||||
| Event | Fields | Description |
|
||||
|-------|--------|-------------|
|
||||
| `run_start` | prompt, internal, provider, model, messages | Agent run begins |
|
||||
| `run_end` | duration_ms, error, text, aborted? | Agent run completes |
|
||||
|
||||
### LLM Interaction
|
||||
|
||||
| Event | Fields | Description |
|
||||
|-------|--------|-------------|
|
||||
| `llm_call` | provider, model, profile, messages | LLM API request sent |
|
||||
| `llm_result` | duration_ms | LLM API response received |
|
||||
|
||||
### Tool Execution
|
||||
|
||||
| Event | Fields | Description |
|
||||
|-------|--------|-------------|
|
||||
| `tool_start` | tool, args | Tool execution begins |
|
||||
| `tool_end` | tool, duration_ms, is_error | Tool execution completes |
|
||||
|
||||
### Context Management
|
||||
|
||||
| Event | Fields | Description |
|
||||
|-------|--------|-------------|
|
||||
| `preflight_compact_start` | utilization, trigger, messages, est_tokens | Preflight compaction triggered |
|
||||
| `preflight_compact_end` | messages_before, messages_after, pruned | Preflight compaction done |
|
||||
| `tool_result_pruning` | soft_trimmed, hard_cleared, chars_saved, phase, tokens_before?, tokens_after? | Tool result pruning (Phase 1) |
|
||||
| `compaction` | removed, kept, tokens_removed, tokens_kept, reason, pruning_stats? | Summary compaction (Phase 2) |
|
||||
| `compaction_detail` | pre_pruning_tokens, post_compaction_tokens, messages_removed, reason, pruning_applied | Detailed compaction breakdown |
|
||||
|
||||
### Error Recovery
|
||||
|
||||
| Event | Fields | Description |
|
||||
|-------|--------|-------------|
|
||||
| `context_overflow` | attempt, messages_before | Context window overflow detected |
|
||||
| `context_overflow_compacted` | messages_after, tokens_removed | Recovered via compaction |
|
||||
| `context_overflow_forced` | messages_before, messages_after | Recovered via forced drop |
|
||||
| `error_classify` | error, reason, rotatable | Error classified for rotation |
|
||||
| `auth_rotate` | from, to, reason | Auth profile rotated |
|
||||
|
||||
## Feature Test Playbooks
|
||||
|
||||
### 1. Basic Prompt Completion
|
||||
|
||||
**Goal**: Verify the agent can complete a simple prompt end-to-end.
|
||||
|
||||
```bash
|
||||
pnpm multica run --run-log "What is the capital of France? Reply in one word."
|
||||
```
|
||||
|
||||
**What to check in run-log**:
|
||||
- `run_start` event exists with correct provider
|
||||
- `llm_call` → `llm_result` pair exists (at least one)
|
||||
- `run_end` event has `error: null`
|
||||
- `run_end.duration_ms` is reasonable (< 30s for simple prompt)
|
||||
|
||||
**What to check in output**:
|
||||
- Text contains "Paris"
|
||||
|
||||
### 2. Tool Usage
|
||||
|
||||
**Goal**: Verify tools are called correctly when the prompt requires them.
|
||||
|
||||
```bash
|
||||
pnpm multica run --run-log --cwd /tmp "List the files in the current directory"
|
||||
```
|
||||
|
||||
**What to check in run-log**:
|
||||
- `tool_start` event with `tool: "exec"` or similar filesystem tool
|
||||
- Matching `tool_end` with `is_error: false`
|
||||
- Tool called before final `run_end`
|
||||
|
||||
**What to check in output**:
|
||||
- Output contains actual file names from /tmp
|
||||
|
||||
### 3. Context Compaction
|
||||
|
||||
**Goal**: Verify compaction works correctly on long sessions.
|
||||
|
||||
```bash
|
||||
# Build up a long session to trigger compaction
|
||||
pnpm multica run --run-log "Write a detailed 2000-word essay about climate change"
|
||||
# Note session ID, then continue:
|
||||
pnpm multica run --run-log --session {id} "Now write another 2000-word essay about renewable energy"
|
||||
pnpm multica run --run-log --session {id} "Summarize both essays in 3 bullet points"
|
||||
```
|
||||
|
||||
**What to check in run-log**:
|
||||
- `preflight_compact_start` appears when utilization exceeds trigger ratio
|
||||
- `tool_result_pruning` shows `soft_trimmed > 0` or `hard_cleared > 0` if tool results were pruned
|
||||
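- `compaction` event has `tokens_removed > 0` (a near-zero value when messages or tool results were actually trimmed means the combined Phase 1 + Phase 2 savings are being under-reported)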
|
||||
- `compaction_detail` shows `pre_pruning_tokens` > `post_compaction_tokens`
|
||||
|
||||
### 4. Multi-Provider Comparison
|
||||
|
||||
**Goal**: Verify the same prompt works across different providers.
|
||||
|
||||
```bash
|
||||
pnpm multica run --run-log --provider kimi-coding "Explain recursion in 2 sentences"
|
||||
pnpm multica run --run-log --provider claude-code "Explain recursion in 2 sentences"
|
||||
```
|
||||
|
||||
**What to check**:
|
||||
- Both runs complete without errors
|
||||
- Both `run_end` events have `error: null`
|
||||
- Compare `llm_result.duration_ms` across providers
|
||||
- Both outputs are meaningful explanations of recursion
|
||||
|
||||
### 5. Error Handling & Auth Rotation
|
||||
|
||||
**Goal**: Verify error recovery when credentials are invalid.
|
||||
|
||||
```bash
|
||||
pnpm multica run --run-log --provider anthropic --api-key "sk-invalid-key" "Hello"
|
||||
```
|
||||
|
||||
**What to check in run-log**:
|
||||
- `error_classify` event with `reason: "auth"`
|
||||
- `auth_rotate` event if multiple profiles are configured
|
||||
- `run_end` with appropriate error message if no valid profiles exist
|
||||
|
||||
## Analysis Patterns
|
||||
|
||||
When analyzing run-logs, look for these patterns:
|
||||
|
||||
### Healthy Run
|
||||
```
|
||||
run_start → llm_call → llm_result → run_end (error: null)
|
||||
```
|
||||
|
||||
### Run with Tool Usage
|
||||
```
|
||||
run_start → llm_call → llm_result → tool_start → tool_end → llm_call → llm_result → run_end
|
||||
```
|
||||
|
||||
### Run with Compaction
|
||||
```
|
||||
run_start → preflight_compact_start → tool_result_pruning → preflight_compact_end → llm_call → ...
|
||||
```
|
||||
|
||||
### Red Flags
|
||||
- `run_end` without preceding `run_start` (log corruption)
|
||||
- `tool_start` without matching `tool_end` (tool hang/crash)
|
||||
- `compaction` with `tokens_removed` near zero (compaction ineffective)
|
||||
- Multiple `error_classify` events (repeated failures)
|
||||
- `context_overflow_forced` (emergency fallback — should be rare)
|
||||
|
||||
## Creating a New Test Playbook
|
||||
|
||||
When a new feature is implemented, create a test playbook following this template:
|
||||
|
||||
```markdown
|
||||
### N. Feature Name
|
||||
|
||||
**Goal**: One sentence describing what to verify.
|
||||
|
||||
**Command**:
|
||||
\`\`\`bash
|
||||
pnpm multica run --run-log [options] "prompt that exercises the feature"
|
||||
\`\`\`
|
||||
|
||||
**What to check in run-log**:
|
||||
- List specific events and field values to verify
|
||||
- Include both positive checks (event exists) and negative checks (no errors)
|
||||
|
||||
**What to check in output**:
|
||||
- What the text output should contain or look like
|
||||
|
||||
**What to check in session.jsonl** (if applicable):
|
||||
- Specific message patterns to verify
|
||||
```
|
||||
|
||||
## Tips for Coding Agents
|
||||
|
||||
1. **Always use `--run-log`** — without it, there's no structured data to analyze
|
||||
2. **Use `--cwd`** to control the working directory for file-related tests
|
||||
3. **Read run-log line by line** — each line is independent JSON, parse individually
|
||||
4. **Check event ordering** — events are chronologically ordered by `ts`
|
||||
5. **Token counts are estimates** — don't expect exact values, check for reasonable ranges
|
||||
6. **Clean up test sessions** — after testing, remove session dirs from `~/.super-multica-e2e/sessions/` (or delete the whole `~/.super-multica-e2e` directory) to avoid clutter
|
||||
7. **Use `--provider`** to test specific providers — when omitted, the default provider (`kimi-coding`) is used
|
||||
8. **For multi-turn tests**, always capture and reuse the session ID from the first run
|
||||
|
|
@ -26,6 +26,8 @@ export type CompactionEndEvent = {
|
|||
reason: "count" | "tokens" | "summary" | "pruning";
|
||||
/** Generated summary text (only present when reason is "summary") */
|
||||
summary?: string | undefined;
|
||||
/** Tool result pruning statistics (when Phase 1 pruning was applied) */
|
||||
pruningStats?: { softTrimmed: number; hardCleared: number; charsSaved: number } | undefined;
|
||||
};
|
||||
|
||||
/** Emitted when an agent encounters an error during execution */
|
||||
|
|
|
|||
|
|
@ -1,3 +1,60 @@
|
|||
/**
|
||||
* Structured Run Log
|
||||
*
|
||||
* Records agent execution events to `{sessionDir}/run-log.jsonl`.
|
||||
* Each line is a JSON object with `ts` (epoch ms) and `event` (type string).
|
||||
*
|
||||
* Enable via `MULTICA_RUN_LOG=1` env var or `enableRunLog: true` in AgentOptions.
|
||||
* CLI: `pnpm multica run --run-log "prompt"`
|
||||
*
|
||||
* ## Event Reference
|
||||
*
|
||||
* ### Lifecycle
|
||||
* - `run_start` — Agent run begins.
|
||||
* Fields: prompt (first 200 chars), internal, provider, model, messages (count)
|
||||
* - `run_end` — Agent run completes.
|
||||
* Fields: duration_ms, error (string|null), text (first 200 chars), aborted?
|
||||
*
|
||||
* ### LLM Interaction
|
||||
* - `llm_call` — LLM API request sent.
|
||||
* Fields: provider, model, profile, messages (count)
|
||||
* - `llm_result` — LLM API response received.
|
||||
* Fields: duration_ms
|
||||
*
|
||||
* ### Tool Execution
|
||||
* - `tool_start` — Tool execution begins.
|
||||
* Fields: tool (name), args (first 500 chars of JSON)
|
||||
* - `tool_end` — Tool execution completes.
|
||||
* Fields: tool (name), duration_ms, is_error
|
||||
*
|
||||
* ### Context Management — Preflight (before LLM call)
|
||||
* - `preflight_compact_start` — Preflight compaction triggered.
|
||||
* Fields: utilization, trigger, messages (count), est_tokens
|
||||
* - `preflight_compact_end` — Preflight compaction completed.
|
||||
* Fields: messages_before, messages_after, pruned (count removed)
|
||||
* - `tool_result_pruning` — Tool result pruning applied (Phase 1).
|
||||
* Fields: soft_trimmed, hard_cleared, chars_saved, phase ("preflight"|"compaction"),
|
||||
* tokens_before?, tokens_after? (present when phase="compaction")
|
||||
*
|
||||
* ### Context Management — Compaction (during session)
|
||||
* - `compaction` — Summary compaction completed (Phase 2).
|
||||
* Fields: removed, kept, tokens_removed, tokens_kept, reason, pruning_stats?
|
||||
* - `compaction_detail` — Detailed compaction breakdown.
|
||||
* Fields: pre_pruning_tokens, post_compaction_tokens, messages_removed, reason, pruning_applied
|
||||
*
|
||||
* ### Error Recovery
|
||||
* - `context_overflow` — Context window overflow detected.
|
||||
* Fields: attempt, messages_before
|
||||
* - `context_overflow_compacted` — Overflow recovered via compaction.
|
||||
* Fields: messages_after, tokens_removed
|
||||
* - `context_overflow_forced` — Overflow recovered via forced message drop.
|
||||
* Fields: messages_before, messages_after
|
||||
* - `error_classify` — Error classified for auth rotation.
|
||||
* Fields: error (first 200 chars), reason, rotatable
|
||||
* - `auth_rotate` — Auth profile rotated after error.
|
||||
* Fields: from, to, reason
|
||||
*/
|
||||
|
||||
import { join } from "path";
|
||||
import { mkdirSync } from "fs";
|
||||
import { appendFile } from "fs/promises";
|
||||
|
|
|
|||
|
|
@ -306,6 +306,8 @@ export class Agent {
|
|||
model: compactionMode === "summary" ? model : undefined,
|
||||
apiKey: summaryApiKey,
|
||||
customInstructions: options.summaryInstructions,
|
||||
// Observability
|
||||
runLog: this.runLog,
|
||||
});
|
||||
|
||||
if (!options.thinkingLevel && storedMeta?.thinkingLevel) {
|
||||
|
|
@ -810,6 +812,14 @@ export class Agent {
|
|||
});
|
||||
if (pruneResult.changed) {
|
||||
result = pruneResult.messages;
|
||||
if (pruneResult.softTrimmed > 0 || pruneResult.hardCleared > 0) {
|
||||
this.runLog.log("tool_result_pruning", {
|
||||
soft_trimmed: pruneResult.softTrimmed,
|
||||
hard_cleared: pruneResult.hardCleared,
|
||||
chars_saved: pruneResult.charsSaved,
|
||||
phase: "preflight",
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
// Re-estimate after pruning
|
||||
|
|
@ -862,6 +872,7 @@ export class Agent {
|
|||
tokensKept: result.tokensKept,
|
||||
reason: result.reason ?? "tokens",
|
||||
summary: result.summary,
|
||||
pruningStats: result.pruningStats,
|
||||
};
|
||||
this.emitMulticaEvent(endEvent);
|
||||
this.runLog.log("compaction", {
|
||||
|
|
@ -870,6 +881,7 @@ export class Agent {
|
|||
tokens_removed: endEvent.tokensRemoved,
|
||||
tokens_kept: endEvent.tokensKept,
|
||||
reason: endEvent.reason,
|
||||
pruning_stats: endEvent.pruningStats,
|
||||
});
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -11,6 +11,13 @@ import {
|
|||
MIN_KEEP_MESSAGES,
|
||||
} from "../context-window/index.js";
|
||||
|
||||
/** Tool result pruning statistics */
|
||||
export type PruningStats = {
|
||||
softTrimmed: number;
|
||||
hardCleared: number;
|
||||
charsSaved: number;
|
||||
};
|
||||
|
||||
export type CompactionResult = {
|
||||
kept: AgentMessage[];
|
||||
removedCount: number;
|
||||
|
|
@ -25,6 +32,8 @@ export type CompactionResult = {
|
|||
toolFailures?: Array<{ toolName: string; summary: string }> | undefined;
|
||||
/** Reason for compaction: count, tokens, summary, or pruning (tool result trimming only) */
|
||||
reason: "count" | "tokens" | "summary" | "pruning";
|
||||
/** Tool result pruning statistics (when Phase 1 pruning was applied) */
|
||||
pruningStats?: PruningStats | undefined;
|
||||
};
|
||||
|
||||
/**
|
||||
|
|
|
|||
|
|
@ -3,7 +3,7 @@ import { getModel, type Model, type UserMessage } from "@mariozechner/pi-ai";
|
|||
import type { SessionEntry, SessionMeta } from "./types.js";
|
||||
import { appendEntry, readEntries, resolveSessionPath, writeEntries } from "./storage.js";
|
||||
import { compactMessages, compactMessagesAsync, type CompactionResult } from "./compaction.js";
|
||||
import { estimateTokenUsage, shouldCompact as shouldCompactTokens } from "../context-window/index.js";
|
||||
import { estimateTokenUsage, estimateMessagesTokens, shouldCompact as shouldCompactTokens } from "../context-window/index.js";
|
||||
import { credentialManager } from "../credentials.js";
|
||||
import { repairSessionFileIfNeeded, type RepairReport } from "./session-file-repair.js";
|
||||
import { sanitizeToolCallInputs, sanitizeToolUseResultPairing } from "./session-transcript-repair.js";
|
||||
|
|
@ -11,6 +11,7 @@ import {
|
|||
pruneToolResults,
|
||||
type ToolResultPruningSettings,
|
||||
} from "../context-window/tool-result-pruning.js";
|
||||
import type { RunLog } from "../run-log.js";
|
||||
|
||||
/** Get Kimi model for summarization (use a cheaper model than k2-thinking) */
|
||||
function getSummaryModel(): Model<any> {
|
||||
|
|
@ -64,6 +65,10 @@ export type SessionManagerOptions = {
|
|||
enableToolResultPruning?: boolean | undefined;
|
||||
/** Tool result pruning settings */
|
||||
toolResultPruning?: Partial<ToolResultPruningSettings> | undefined;
|
||||
|
||||
// Observability
|
||||
/** RunLog instance for structured logging */
|
||||
runLog?: RunLog | undefined;
|
||||
};
|
||||
|
||||
export class SessionManager {
|
||||
|
|
@ -87,6 +92,8 @@ export class SessionManager {
|
|||
// Tool result pruning
|
||||
private readonly enableToolResultPruning: boolean;
|
||||
private readonly toolResultPruning: Partial<ToolResultPruningSettings> | undefined;
|
||||
// Observability
|
||||
private readonly runLog: RunLog;
|
||||
|
||||
private queue: Promise<void> = Promise.resolve();
|
||||
private meta: SessionMeta | undefined;
|
||||
|
|
@ -120,6 +127,9 @@ export class SessionManager {
|
|||
(this.compactionMode === "tokens" || this.compactionMode === "summary");
|
||||
this.toolResultPruning = options.toolResultPruning;
|
||||
|
||||
// Observability
|
||||
this.runLog = options.runLog ?? { log() {}, async flush() {} };
|
||||
|
||||
this.meta = this.loadMeta();
|
||||
}
|
||||
|
||||
|
|
@ -270,6 +280,10 @@ export class SessionManager {
|
|||
async maybeCompact(messages: AgentMessage[]): Promise<CompactionResult | null> {
|
||||
let workingMessages = messages;
|
||||
let toolResultPruningApplied = false;
|
||||
let pruningStats: { softTrimmed: number; hardCleared: number; charsSaved: number } | undefined;
|
||||
|
||||
// Capture pre-pruning token count for accurate combined metrics
|
||||
const preCompactionTokens = estimateMessagesTokens(messages);
|
||||
|
||||
// Phase 1: Tool result pruning (soft trim / hard clear)
|
||||
// This reduces token usage without removing messages
|
||||
|
|
@ -283,6 +297,14 @@ export class SessionManager {
|
|||
if (pruneResult.changed) {
|
||||
workingMessages = pruneResult.messages;
|
||||
toolResultPruningApplied = true;
|
||||
pruningStats = {
|
||||
softTrimmed: pruneResult.softTrimmed,
|
||||
hardCleared: pruneResult.hardCleared,
|
||||
charsSaved: pruneResult.charsSaved,
|
||||
};
|
||||
|
||||
const postPruningTokens = estimateMessagesTokens(workingMessages);
|
||||
|
||||
// Log pruning stats
|
||||
if (pruneResult.softTrimmed > 0 || pruneResult.hardCleared > 0) {
|
||||
console.error(
|
||||
|
|
@ -290,11 +312,19 @@ export class SessionManager {
|
|||
`${pruneResult.hardCleared} hard-cleared, ~${Math.round(pruneResult.charsSaved / 1000)}k chars saved`,
|
||||
);
|
||||
}
|
||||
this.runLog.log("tool_result_pruning", {
|
||||
soft_trimmed: pruneResult.softTrimmed,
|
||||
hard_cleared: pruneResult.hardCleared,
|
||||
chars_saved: pruneResult.charsSaved,
|
||||
tokens_before: preCompactionTokens,
|
||||
tokens_after: postPruningTokens,
|
||||
phase: "compaction",
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
// Phase 2: Message compaction (remove old messages if still needed)
|
||||
let result;
|
||||
let result: CompactionResult | null = null;
|
||||
|
||||
if (this.compactionMode === "summary") {
|
||||
// Use provided model/apiKey or fall back to Kimi
|
||||
|
|
@ -364,11 +394,33 @@ export class SessionManager {
|
|||
// still return the pruned messages
|
||||
if (!result) {
|
||||
if (toolResultPruningApplied) {
|
||||
return { kept: workingMessages, removedCount: 0, reason: "pruning" as const };
|
||||
const postPruningTokens = estimateMessagesTokens(workingMessages);
|
||||
return {
|
||||
kept: workingMessages,
|
||||
removedCount: 0,
|
||||
tokensRemoved: preCompactionTokens - postPruningTokens,
|
||||
tokensKept: postPruningTokens,
|
||||
reason: "pruning" as const,
|
||||
pruningStats,
|
||||
};
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
// Override metrics with accurate combined savings (Phase 1 + Phase 2)
|
||||
const postCompactionTokens = estimateMessagesTokens(result.kept);
|
||||
result.tokensRemoved = preCompactionTokens - postCompactionTokens;
|
||||
result.tokensKept = postCompactionTokens;
|
||||
result.pruningStats = pruningStats;
|
||||
|
||||
this.runLog.log("compaction_detail", {
|
||||
pre_pruning_tokens: preCompactionTokens,
|
||||
post_compaction_tokens: postCompactionTokens,
|
||||
messages_removed: result.removedCount,
|
||||
reason: result.reason,
|
||||
pruning_applied: toolResultPruningApplied,
|
||||
});
|
||||
|
||||
const entries: SessionEntry[] = [];
|
||||
if (this.meta) {
|
||||
entries.push({ type: "meta", meta: this.meta, timestamp: Date.now() });
|
||||
|
|
|
|||
|
|
@ -40,6 +40,8 @@ export type CompactionEndEvent = {
|
|||
tokensRemoved?: number;
|
||||
tokensKept?: number;
|
||||
reason: string;
|
||||
/** Tool result pruning statistics (when Phase 1 pruning was applied) */
|
||||
pruningStats?: { softTrimmed: number; hardCleared: number; charsSaved: number };
|
||||
};
|
||||
|
||||
/** Union of all compaction events */
|
||||
|
|
|
|||
|
|
@ -19,7 +19,7 @@ export function getAuthHeaders(context?: string): Record<string, string> {
|
|||
if (!auth) {
|
||||
const suffix = context ? ` ${context}` : "";
|
||||
throw new Error(
|
||||
`Not logged in. Please sign in via the Desktop app${suffix}.`,
|
||||
`Not logged in${suffix}. Sign in via the Desktop app, or run pnpm dev:local and log in there.`,
|
||||
);
|
||||
}
|
||||
return {
|
||||
|
|
|
|||
|
|
@ -1,18 +1,16 @@
|
|||
import { readFileSync } from "node:fs";
|
||||
import { homedir } from "node:os";
|
||||
import { join } from "node:path";
|
||||
import { DATA_DIR } from "@multica/utils";
|
||||
|
||||
const AUTH_FILE_PATH = join(DATA_DIR, "auth.json");
|
||||
const DEV_AUTH_FILE_PATH = join(homedir(), ".super-multica-dev", "auth.json");
|
||||
|
||||
export type LocalAuthData = { sid: string; deviceId: string };
|
||||
|
||||
/**
|
||||
* Read sid and deviceId from ~/.super-multica/auth.json.
|
||||
* Returns null if the file is missing, unreadable, or incomplete.
|
||||
*/
|
||||
export function getLocalAuth(): LocalAuthData | null {
|
||||
function tryReadAuth(filePath: string): LocalAuthData | null {
|
||||
try {
|
||||
const raw = readFileSync(AUTH_FILE_PATH, "utf8").trim();
|
||||
const raw = readFileSync(filePath, "utf8").trim();
|
||||
if (!raw) return null;
|
||||
|
||||
const data = JSON.parse(raw);
|
||||
|
|
@ -32,3 +30,26 @@ export function getLocalAuth(): LocalAuthData | null {
|
|||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Read sid and deviceId from auth.json.
|
||||
*
|
||||
* Lookup order:
|
||||
* 1. {DATA_DIR}/auth.json (current data dir, respects SMC_DATA_DIR)
|
||||
* 2. ~/.super-multica-dev/auth.json (dev environment fallback —
|
||||
* allows E2E tests and other custom SMC_DATA_DIR setups to
|
||||
* share the dev auth created by `pnpm dev:local`)
|
||||
*
|
||||
* Returns null if no valid auth is found.
|
||||
*/
|
||||
export function getLocalAuth(): LocalAuthData | null {
|
||||
const primary = tryReadAuth(AUTH_FILE_PATH);
|
||||
if (primary) return primary;
|
||||
|
||||
// Fall back to the dev auth file whenever the primary auth path is not already that file (e.g. E2E tests with a custom data dir). NOTE(review): the check is plain path inequality, so it presumably also fires for the default data dir — confirm that is intended.
|
||||
if (AUTH_FILE_PATH !== DEV_AUTH_FILE_PATH) {
|
||||
return tryReadAuth(DEV_AUTH_FILE_PATH);
|
||||
}
|
||||
|
||||
return null;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -40,6 +40,8 @@ export type CompactionEndEvent = {
|
|||
tokensRemoved?: number;
|
||||
tokensKept?: number;
|
||||
reason: string;
|
||||
/** Tool result pruning statistics (when Phase 1 pruning was applied) */
|
||||
pruningStats?: { softTrimmed: number; hardCleared: number; charsSaved: number };
|
||||
};
|
||||
|
||||
/** Union of all compaction events */
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue