diff --git a/CLAUDE.md b/CLAUDE.md index e08524ea..a9112e6e 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -170,19 +170,72 @@ Fonts are loaded via `@fontsource` packages (not Google Fonts) for cross-platfor The agent engine supports structured run logging for debugging. When enabled, it writes all key execution events to `~/.super-multica/sessions/{sessionId}/run-log.jsonl` alongside the session data. ```bash -# Enable via environment variable -MULTICA_RUN_LOG=1 pnpm multica run "your prompt" +# Enable via CLI flag +pnpm multica run --run-log "your prompt" -# Enable during tests -MULTICA_RUN_LOG=1 pnpm --filter @multica/core test +# Or via environment variable +MULTICA_RUN_LOG=1 pnpm multica run "your prompt" # Or programmatically const agent = new Agent({ enableRunLog: true }); ``` -Logged events: `run_start`, `run_end`, `llm_call`, `llm_result`, `tool_start`, `tool_end`, `context_overflow`, `auth_rotate`, `error_classify`, `preflight_compact_start/end`, `compaction`. +When `--run-log` is enabled, the CLI prints the session directory path to stderr: +``` +[session: 019c584a-...] +[session-dir: ~/.super-multica/sessions/019c584a-...] +``` -Each line is a JSON object with `ts` (timestamp) and `event` (type), suitable for AI-assisted log analysis. Implementation: `packages/core/src/agent/run-log.ts`. +Logged events: `run_start`, `run_end`, `llm_call`, `llm_result`, `tool_start`, `tool_end`, `context_overflow`, `auth_rotate`, `error_classify`, `preflight_compact_start/end`, `tool_result_pruning`, `compaction`, `compaction_detail`. + +Each line is a JSON object with `ts` (timestamp) and `event` (type), suitable for AI-assisted log analysis. Full event reference: `packages/core/src/agent/run-log.ts`. + +## E2E Testing (Agent-Driven) + +E2E tests are executed and analyzed by the Coding Agent (Claude Code), not by vitest. The Coding Agent runs the Multica agent via CLI, reads the structured run-log, and intelligently analyzes intermediate behavior and results. 
+ +### How to Run + +E2E tests use an isolated data directory (`~/.super-multica-e2e`) to avoid polluting dev or production session data. + +```bash +# Basic E2E test (web_search/data tools require MULTICA_API_URL) +SMC_DATA_DIR=~/.super-multica-e2e MULTICA_API_URL=https://api-dev.copilothub.ai pnpm multica run --run-log "your test prompt" + +# With specific provider +SMC_DATA_DIR=~/.super-multica-e2e MULTICA_API_URL=https://api-dev.copilothub.ai pnpm multica run --run-log --provider kimi-coding "your test prompt" + +# Multi-turn test (reuse the session ID printed by the first run) +SMC_DATA_DIR=~/.super-multica-e2e MULTICA_API_URL=https://api-dev.copilothub.ai pnpm multica run --run-log --session {id} "follow-up prompt" + +# Clean up all E2E test data +rm -rf ~/.super-multica-e2e +``` + +### Analysis Workflow + +After running, the Coding Agent should: +1. Read `{session-dir}/run-log.jsonl` — structured execution events +2. Read `{session-dir}/session.jsonl` — full conversation transcript (if needed) +3. Analyze event sequence, tool calls, errors, and timing +4. Report findings with verdict (pass/fail + details) + +### What to Check + +- **Event completeness**: `run_start` → ... → `run_end` (no orphaned starts) +- **Tool pairing**: every `tool_start` has a matching `tool_end` +- **Error handling**: `is_error`, `error_classify`, `auth_rotate` events +- **Compaction health**: `tokens_removed > 0` when compaction fires +- **Performance**: `llm_result.duration_ms`, tool execution times + +### Important + +- **`SMC_DATA_DIR=~/.super-multica-e2e`** isolates E2E test sessions from dev (`~/.super-multica-dev`) and production (`~/.super-multica`) data. Always set this. +- **`MULTICA_API_URL=https://api-dev.copilothub.ai`** is required for `web_search` and `data` tools. Without it, these tools fail with `MULTICA_API_URL is required`. +- **Auth for `web_search`/`data`**: These tools need dev backend auth. The auth store auto-falls back to `~/.super-multica-dev/auth.json`. 
If missing, run `pnpm dev:local` first and log in through the Desktop app. +- Default provider is `kimi-coding`. Override with `--provider`. +- Run-log and session data are at `~/.super-multica-e2e/sessions/{sessionId}/` +- Detailed guide with feature-specific test playbooks: `docs/e2e-testing-guide.md` ## Credentials Setup diff --git a/apps/cli/src/commands/run.ts b/apps/cli/src/commands/run.ts index 1f5656be..1b915d0d 100644 --- a/apps/cli/src/commands/run.ts +++ b/apps/cli/src/commands/run.ts @@ -6,9 +6,11 @@ * echo "prompt" | multica run */ +import { join } from "node:path"; import { Agent } from "@multica/core"; import type { AgentOptions } from "@multica/core"; import type { ToolsConfig } from "@multica/core"; +import { DATA_DIR } from "@multica/utils"; import { cyan, yellow, dim } from "../colors.js"; type RunOptions = { @@ -23,6 +25,7 @@ type RunOptions = { cwd?: string | undefined; session?: string | undefined; debug?: boolean; + runLog?: boolean; toolsAllow?: string[]; toolsDeny?: string[]; help?: boolean; @@ -45,6 +48,7 @@ ${cyan("Options:")} ${yellow("--cwd")} DIR Working directory ${yellow("--session")} ID Session ID for persistence ${yellow("--debug")} Enable debug logging + ${yellow("--run-log")} Enable structured run logging (run-log.jsonl) ${yellow("--help")}, -h Show this help ${cyan("Tools Configuration:")} @@ -123,6 +127,10 @@ function parseArgs(argv: string[]): { opts: RunOptions; prompt: string } { opts.debug = true; continue; } + if (arg === "--run-log") { + opts.runLog = true; + continue; + } if (arg === "--tools-allow") { const value = args.shift(); opts.toolsAllow = value?.split(",").map((s) => s.trim()) ?? 
[]; @@ -182,6 +190,8 @@ export async function runCommand(args: string[]): Promise { } } + const enableRunLog = opts.runLog || !!process.env.MULTICA_RUN_LOG; + const agent = new Agent({ profileId: opts.profile, provider: opts.provider, @@ -194,13 +204,19 @@ export async function runCommand(args: string[]): Promise { cwd: opts.cwd, sessionId: opts.session, debug: opts.debug, + enableRunLog, tools: toolsConfig, }); + const sessionDir = join(DATA_DIR, "sessions", agent.sessionId); + // If it's a newly created session, notify user of sessionId if (!opts.session) { console.error(`[session: ${agent.sessionId}]`); } + if (enableRunLog) { + console.error(`[session-dir: ${sessionDir}]`); + } const result = await agent.run(finalPrompt); if (result.error) { diff --git a/docs/e2e-testing-guide.md b/docs/e2e-testing-guide.md new file mode 100644 index 00000000..3892f5b7 --- /dev/null +++ b/docs/e2e-testing-guide.md @@ -0,0 +1,295 @@ +# Agent-Driven E2E Testing Guide + +This guide teaches Coding Agents (Claude Code, etc.) how to perform automated end-to-end testing of Super Multica features. Unlike traditional test frameworks, **the Coding Agent itself is the test runner and oracle** — it executes the agent, reads structured logs, and intelligently analyzes the results. + +## Overview + +The testing flow: + +1. Coding Agent runs `pnpm multica run --run-log "test prompt"` +2. The agent engine executes the prompt with full structured logging +3. Coding Agent reads the `run-log.jsonl` and `session.jsonl` files +4. Coding Agent analyzes events, tool calls, and behavior for correctness + +This approach is superior to static assertions because: +- The AI can understand **intent** — did the agent do what the prompt asked? +- It can reason about **intermediate process** — were the right tools called in the right order? +- It can detect **subtle issues** — token counts that don't make sense, unnecessary retries, missing events + +## Prerequisites + +1. 
**Credentials configured**: Run `pnpm multica credentials init` or ensure `~/.super-multica/credentials.json5` has valid provider credentials +2. **Available providers**: Check with `pnpm multica profile list` or inspect credentials file +3. **Default provider**: `kimi-coding` (Kimi Code, free tier available). Can override with `--provider` +4. **`MULTICA_API_URL`**: Required for `web_search` and `data` tools. Set to `https://api-dev.copilothub.ai` for dev environment. Without this, web search and financial data tools will fail with `MULTICA_API_URL is required` +5. **`SMC_DATA_DIR`**: Set to `~/.super-multica-e2e` to isolate E2E test sessions from dev (`~/.super-multica-dev`) and production (`~/.super-multica`) data. Without this, test sessions pollute the production sessions directory +6. **Dev auth for `web_search`/`data` tools**: These tools authenticate via `auth.json` (session ID + device ID). The auth store automatically falls back to `~/.super-multica-dev/auth.json` when the E2E data dir has no auth. 
If `~/.super-multica-dev/auth.json` doesn't exist, run `pnpm dev:local` first and log in through the Desktop app to create it + +## Running a Test + +### Environment variables + +All E2E test commands should include these env vars: + +```bash +# SMC_DATA_DIR — isolates test sessions from dev/production +# MULTICA_API_URL — enables web_search and data tools +export SMC_DATA_DIR=~/.super-multica-e2e +export MULTICA_API_URL=https://api-dev.copilothub.ai +``` + +### Basic command + +```bash +# For prompts that only need exec/read/write tools: +SMC_DATA_DIR=~/.super-multica-e2e pnpm multica run --run-log "your test prompt here" + +# For prompts that need web_search or data tools: +SMC_DATA_DIR=~/.super-multica-e2e MULTICA_API_URL=https://api-dev.copilothub.ai pnpm multica run --run-log "your test prompt here" +``` + +### With provider override + +```bash +SMC_DATA_DIR=~/.super-multica-e2e MULTICA_API_URL=https://api-dev.copilothub.ai pnpm multica run --run-log --provider claude-code "your test prompt" +SMC_DATA_DIR=~/.super-multica-e2e MULTICA_API_URL=https://api-dev.copilothub.ai pnpm multica run --run-log --provider kimi-coding "your test prompt" +``` + +### Resume a session (multi-turn testing) + +```bash +# First turn +SMC_DATA_DIR=~/.super-multica-e2e MULTICA_API_URL=https://api-dev.copilothub.ai pnpm multica run --run-log "Create a file called test.txt with content 'hello'" +# Note the session ID from stderr output: [session: 019c584a-...] + +# Second turn (same session) +SMC_DATA_DIR=~/.super-multica-e2e MULTICA_API_URL=https://api-dev.copilothub.ai pnpm multica run --run-log --session 019c584a-... 
"Read the file test.txt and tell me its content" +``` + +### Cleanup + +```bash +# Remove all E2E test sessions +rm -rf ~/.super-multica-e2e +``` + +### Output + +The CLI prints metadata to stderr: +``` +[session: 019c584a-7753-762d-9fb9-9eb0a8187df5] +[session-dir: /Users/you/.super-multica-e2e/sessions/019c584a-7753-762d-9fb9-9eb0a8187df5] +``` + +Agent text output goes to stdout. + +## Reading Results + +After a run, two files contain the data needed for analysis: + +### run-log.jsonl + +Location: `{session-dir}/run-log.jsonl` + +Each line is a JSON object with structured event data. Read this file to understand **what happened during execution**. + +```jsonl +{"ts":1739000001000,"event":"run_start","prompt":"What is 2+2?","provider":"kimi-coding","model":"kimi-k2-thinking","messages":0} +{"ts":1739000002000,"event":"llm_call","provider":"kimi-coding","model":"kimi-k2-thinking","messages":2} +{"ts":1739000005000,"event":"llm_result","duration_ms":3000} +{"ts":1739000005000,"event":"run_end","duration_ms":4000,"error":null,"text":"4"} +``` + +### session.jsonl + +Location: `{session-dir}/session.jsonl` + +Contains the full conversation transcript (user messages, assistant replies, tool calls and results). Read this for **message content analysis**. + +## Run-Log Event Reference + +> Source of truth: `packages/core/src/agent/run-log.ts` (JSDoc at top of file) + +### Lifecycle Events + +| Event | Fields | Description | +|-------|--------|-------------| +| `run_start` | prompt, internal, provider, model, messages | Agent run begins | +| `run_end` | duration_ms, error, text, aborted? 
| Agent run completes | + +### LLM Interaction + +| Event | Fields | Description | +|-------|--------|-------------| +| `llm_call` | provider, model, profile, messages | LLM API request sent | +| `llm_result` | duration_ms | LLM API response received | + +### Tool Execution + +| Event | Fields | Description | +|-------|--------|-------------| +| `tool_start` | tool, args | Tool execution begins | +| `tool_end` | tool, duration_ms, is_error | Tool execution completes | + +### Context Management + +| Event | Fields | Description | +|-------|--------|-------------| +| `preflight_compact_start` | utilization, trigger, messages, est_tokens | Preflight compaction triggered | +| `preflight_compact_end` | messages_before, messages_after, pruned | Preflight compaction done | +| `tool_result_pruning` | soft_trimmed, hard_cleared, chars_saved, phase, tokens_before?, tokens_after? | Tool result pruning (Phase 1) | +| `compaction` | removed, kept, tokens_removed, tokens_kept, reason, pruning_stats? | Summary compaction (Phase 2) | +| `compaction_detail` | pre_pruning_tokens, post_compaction_tokens, messages_removed, reason, pruning_applied | Detailed compaction breakdown | + +### Error Recovery + +| Event | Fields | Description | +|-------|--------|-------------| +| `context_overflow` | attempt, messages_before | Context window overflow detected | +| `context_overflow_compacted` | messages_after, tokens_removed | Recovered via compaction | +| `context_overflow_forced` | messages_before, messages_after | Recovered via forced drop | +| `error_classify` | error, reason, rotatable | Error classified for rotation | +| `auth_rotate` | from, to, reason | Auth profile rotated | + +## Feature Test Playbooks + +### 1. Basic Prompt Completion + +**Goal**: Verify the agent can complete a simple prompt end-to-end. + +```bash +pnpm multica run --run-log "What is the capital of France? Reply in one word." 
+``` + +**What to check in run-log**: +- `run_start` event exists with correct provider +- `llm_call` → `llm_result` pair exists (at least one) +- `run_end` event has `error: null` +- `run_end.duration_ms` is reasonable (< 30s for simple prompt) + +**What to check in output**: +- Text contains "Paris" + +### 2. Tool Usage + +**Goal**: Verify tools are called correctly when the prompt requires them. + +```bash +pnpm multica run --run-log --cwd /tmp "List the files in the current directory" +``` + +**What to check in run-log**: +- `tool_start` event with `tool: "exec"` or similar filesystem tool +- Matching `tool_end` with `is_error: false` +- Tool called before final `run_end` + +**What to check in output**: +- Output contains actual file names from /tmp + +### 3. Context Compaction + +**Goal**: Verify compaction works correctly on long sessions. + +```bash +# Build up a long session to trigger compaction +pnpm multica run --run-log "Write a detailed 2000-word essay about climate change" +# Note session ID, then continue: +pnpm multica run --run-log --session {id} "Now write another 2000-word essay about renewable energy" +pnpm multica run --run-log --session {id} "Summarize both essays in 3 bullet points" +``` + +**What to check in run-log**: +- `preflight_compact_start` appears when utilization exceeds trigger ratio +- `tool_result_pruning` shows `soft_trimmed > 0` or `hard_cleared > 0` if tool results were pruned +- `compaction` event has `tokens_removed > 0` (not near-zero like the bug we fixed) +- `compaction_detail` shows `pre_pruning_tokens` > `post_compaction_tokens` + +### 4. Multi-Provider Comparison + +**Goal**: Verify the same prompt works across different providers. 
+ +```bash +pnpm multica run --run-log --provider kimi-coding "Explain recursion in 2 sentences" +pnpm multica run --run-log --provider claude-code "Explain recursion in 2 sentences" +``` + +**What to check**: +- Both runs complete without errors +- Both `run_end` events have `error: null` +- Compare `llm_result.duration_ms` across providers +- Both outputs are meaningful explanations of recursion + +### 5. Error Handling & Auth Rotation + +**Goal**: Verify error recovery when credentials are invalid. + +```bash +pnpm multica run --run-log --provider anthropic --api-key "sk-invalid-key" "Hello" +``` + +**What to check in run-log**: +- `error_classify` event with `reason: "auth"` +- `auth_rotate` event if multiple profiles are configured +- `run_end` with appropriate error message if no valid profiles exist + +## Analysis Patterns + +When analyzing run-logs, look for these patterns: + +### Healthy Run +``` +run_start → llm_call → llm_result → run_end (error: null) +``` + +### Run with Tool Usage +``` +run_start → llm_call → llm_result → tool_start → tool_end → llm_call → llm_result → run_end +``` + +### Run with Compaction +``` +run_start → preflight_compact_start → tool_result_pruning → preflight_compact_end → llm_call → ... +``` + +### Red Flags +- `run_end` without preceding `run_start` (log corruption) +- `tool_start` without matching `tool_end` (tool hang/crash) +- `compaction` with `tokens_removed` near zero (compaction ineffective) +- Multiple `error_classify` events (repeated failures) +- `context_overflow_forced` (emergency fallback — should be rare) + +## Creating a New Test Playbook + +When a new feature is implemented, create a test playbook following this template: + +```markdown +### N. Feature Name + +**Goal**: One sentence describing what to verify. 
+ +**Command**: +\`\`\`bash +pnpm multica run --run-log [options] "prompt that exercises the feature" +\`\`\` + +**What to check in run-log**: +- List specific events and field values to verify +- Include both positive checks (event exists) and negative checks (no errors) + +**What to check in output**: +- What the text output should contain or look like + +**What to check in session.jsonl** (if applicable): +- Specific message patterns to verify +``` + +## Tips for Coding Agents + +1. **Always use `--run-log`** — without it, there's no structured data to analyze +2. **Use `--cwd`** to control the working directory for file-related tests +3. **Read run-log line by line** — each line is independent JSON, parse individually +4. **Check event ordering** — events are chronologically ordered by `ts` +5. **Token counts are estimates** — don't expect exact values, check for reasonable ranges +6. **Clean up test sessions** — after testing, remove session dirs from `~/.super-multica-e2e/sessions/` (or run `rm -rf ~/.super-multica-e2e`) to avoid clutter; never delete from the production `~/.super-multica/sessions/` directory +7. **Use `--provider`** to test specific providers — defaults to whatever is configured in credentials +8. 
**For multi-turn tests**, always capture and reuse the session ID from the first run diff --git a/packages/core/src/agent/events.ts b/packages/core/src/agent/events.ts index 139048da..f568a0d0 100644 --- a/packages/core/src/agent/events.ts +++ b/packages/core/src/agent/events.ts @@ -26,6 +26,8 @@ export type CompactionEndEvent = { reason: "count" | "tokens" | "summary" | "pruning"; /** Generated summary text (only present when reason is "summary") */ summary?: string | undefined; + /** Tool result pruning statistics (when Phase 1 pruning was applied) */ + pruningStats?: { softTrimmed: number; hardCleared: number; charsSaved: number } | undefined; }; /** Emitted when an agent encounters an error during execution */ diff --git a/packages/core/src/agent/run-log.ts b/packages/core/src/agent/run-log.ts index e9b9e371..e8e3b446 100644 --- a/packages/core/src/agent/run-log.ts +++ b/packages/core/src/agent/run-log.ts @@ -1,3 +1,60 @@ +/** + * Structured Run Log + * + * Records agent execution events to `{sessionDir}/run-log.jsonl`. + * Each line is a JSON object with `ts` (epoch ms) and `event` (type string). + * + * Enable via `MULTICA_RUN_LOG=1` env var or `enableRunLog: true` in AgentOptions. + * CLI: `pnpm multica run --run-log "prompt"` + * + * ## Event Reference + * + * ### Lifecycle + * - `run_start` — Agent run begins. + * Fields: prompt (first 200 chars), internal, provider, model, messages (count) + * - `run_end` — Agent run completes. + * Fields: duration_ms, error (string|null), text (first 200 chars), aborted? + * + * ### LLM Interaction + * - `llm_call` — LLM API request sent. + * Fields: provider, model, profile, messages (count) + * - `llm_result` — LLM API response received. + * Fields: duration_ms + * + * ### Tool Execution + * - `tool_start` — Tool execution begins. + * Fields: tool (name), args (first 500 chars of JSON) + * - `tool_end` — Tool execution completes. 
+ * Fields: tool (name), duration_ms, is_error + * + * ### Context Management — Preflight (before LLM call) + * - `preflight_compact_start` — Preflight compaction triggered. + * Fields: utilization, trigger, messages (count), est_tokens + * - `preflight_compact_end` — Preflight compaction completed. + * Fields: messages_before, messages_after, pruned (count removed) + * - `tool_result_pruning` — Tool result pruning applied (Phase 1). + * Fields: soft_trimmed, hard_cleared, chars_saved, phase ("preflight"|"compaction"), + * tokens_before?, tokens_after? (present when phase="compaction") + * + * ### Context Management — Compaction (during session) + * - `compaction` — Summary compaction completed (Phase 2). + * Fields: removed, kept, tokens_removed, tokens_kept, reason, pruning_stats? + * - `compaction_detail` — Detailed compaction breakdown. + * Fields: pre_pruning_tokens, post_compaction_tokens, messages_removed, reason, pruning_applied + * + * ### Error Recovery + * - `context_overflow` — Context window overflow detected. + * Fields: attempt, messages_before + * - `context_overflow_compacted` — Overflow recovered via compaction. + * Fields: messages_after, tokens_removed + * - `context_overflow_forced` — Overflow recovered via forced message drop. + * Fields: messages_before, messages_after + * - `error_classify` — Error classified for auth rotation. + * Fields: error (first 200 chars), reason, rotatable + * - `auth_rotate` — Auth profile rotated after error. + * Fields: from, to, reason + */ + import { join } from "path"; import { mkdirSync } from "fs"; import { appendFile } from "fs/promises"; diff --git a/packages/core/src/agent/runner.ts b/packages/core/src/agent/runner.ts index 8d9f0de8..e561213d 100644 --- a/packages/core/src/agent/runner.ts +++ b/packages/core/src/agent/runner.ts @@ -306,6 +306,8 @@ export class Agent { model: compactionMode === "summary" ? 
model : undefined, apiKey: summaryApiKey, customInstructions: options.summaryInstructions, + // Observability + runLog: this.runLog, }); if (!options.thinkingLevel && storedMeta?.thinkingLevel) { @@ -810,6 +812,14 @@ export class Agent { }); if (pruneResult.changed) { result = pruneResult.messages; + if (pruneResult.softTrimmed > 0 || pruneResult.hardCleared > 0) { + this.runLog.log("tool_result_pruning", { + soft_trimmed: pruneResult.softTrimmed, + hard_cleared: pruneResult.hardCleared, + chars_saved: pruneResult.charsSaved, + phase: "preflight", + }); + } } // Re-estimate after pruning @@ -862,6 +872,7 @@ export class Agent { tokensKept: result.tokensKept, reason: result.reason ?? "tokens", summary: result.summary, + pruningStats: result.pruningStats, }; this.emitMulticaEvent(endEvent); this.runLog.log("compaction", { @@ -870,6 +881,7 @@ export class Agent { tokens_removed: endEvent.tokensRemoved, tokens_kept: endEvent.tokensKept, reason: endEvent.reason, + pruning_stats: endEvent.pruningStats, }); } diff --git a/packages/core/src/agent/session/compaction.ts b/packages/core/src/agent/session/compaction.ts index 9dfaa40c..3651f972 100644 --- a/packages/core/src/agent/session/compaction.ts +++ b/packages/core/src/agent/session/compaction.ts @@ -11,6 +11,13 @@ import { MIN_KEEP_MESSAGES, } from "../context-window/index.js"; +/** Tool result pruning statistics */ +export type PruningStats = { + softTrimmed: number; + hardCleared: number; + charsSaved: number; +}; + export type CompactionResult = { kept: AgentMessage[]; removedCount: number; @@ -25,6 +32,8 @@ export type CompactionResult = { toolFailures?: Array<{ toolName: string; summary: string }> | undefined; /** Reason for compaction: count, tokens, summary, or pruning (tool result trimming only) */ reason: "count" | "tokens" | "summary" | "pruning"; + /** Tool result pruning statistics (when Phase 1 pruning was applied) */ + pruningStats?: PruningStats | undefined; }; /** diff --git 
a/packages/core/src/agent/session/session-manager.ts b/packages/core/src/agent/session/session-manager.ts index 05f229e3..d11773c6 100644 --- a/packages/core/src/agent/session/session-manager.ts +++ b/packages/core/src/agent/session/session-manager.ts @@ -3,7 +3,7 @@ import { getModel, type Model, type UserMessage } from "@mariozechner/pi-ai"; import type { SessionEntry, SessionMeta } from "./types.js"; import { appendEntry, readEntries, resolveSessionPath, writeEntries } from "./storage.js"; import { compactMessages, compactMessagesAsync, type CompactionResult } from "./compaction.js"; -import { estimateTokenUsage, shouldCompact as shouldCompactTokens } from "../context-window/index.js"; +import { estimateTokenUsage, estimateMessagesTokens, shouldCompact as shouldCompactTokens } from "../context-window/index.js"; import { credentialManager } from "../credentials.js"; import { repairSessionFileIfNeeded, type RepairReport } from "./session-file-repair.js"; import { sanitizeToolCallInputs, sanitizeToolUseResultPairing } from "./session-transcript-repair.js"; @@ -11,6 +11,7 @@ import { pruneToolResults, type ToolResultPruningSettings, } from "../context-window/tool-result-pruning.js"; +import type { RunLog } from "../run-log.js"; /** Get Kimi model for summarization (use a cheaper model than k2-thinking) */ function getSummaryModel(): Model { @@ -64,6 +65,10 @@ export type SessionManagerOptions = { enableToolResultPruning?: boolean | undefined; /** Tool result pruning settings */ toolResultPruning?: Partial | undefined; + + // Observability + /** RunLog instance for structured logging */ + runLog?: RunLog | undefined; }; export class SessionManager { @@ -87,6 +92,8 @@ export class SessionManager { // Tool result pruning private readonly enableToolResultPruning: boolean; private readonly toolResultPruning: Partial | undefined; + // Observability + private readonly runLog: RunLog; private queue: Promise = Promise.resolve(); private meta: SessionMeta | undefined; @@ 
-120,6 +127,9 @@ export class SessionManager { (this.compactionMode === "tokens" || this.compactionMode === "summary"); this.toolResultPruning = options.toolResultPruning; + // Observability + this.runLog = options.runLog ?? { log() {}, async flush() {} }; + this.meta = this.loadMeta(); } @@ -270,6 +280,10 @@ export class SessionManager { async maybeCompact(messages: AgentMessage[]): Promise { let workingMessages = messages; let toolResultPruningApplied = false; + let pruningStats: { softTrimmed: number; hardCleared: number; charsSaved: number } | undefined; + + // Capture pre-pruning token count for accurate combined metrics + const preCompactionTokens = estimateMessagesTokens(messages); // Phase 1: Tool result pruning (soft trim / hard clear) // This reduces token usage without removing messages @@ -283,6 +297,14 @@ export class SessionManager { if (pruneResult.changed) { workingMessages = pruneResult.messages; toolResultPruningApplied = true; + pruningStats = { + softTrimmed: pruneResult.softTrimmed, + hardCleared: pruneResult.hardCleared, + charsSaved: pruneResult.charsSaved, + }; + + const postPruningTokens = estimateMessagesTokens(workingMessages); + // Log pruning stats if (pruneResult.softTrimmed > 0 || pruneResult.hardCleared > 0) { console.error( @@ -290,11 +312,19 @@ export class SessionManager { `${pruneResult.hardCleared} hard-cleared, ~${Math.round(pruneResult.charsSaved / 1000)}k chars saved`, ); } + this.runLog.log("tool_result_pruning", { + soft_trimmed: pruneResult.softTrimmed, + hard_cleared: pruneResult.hardCleared, + chars_saved: pruneResult.charsSaved, + tokens_before: preCompactionTokens, + tokens_after: postPruningTokens, + phase: "compaction", + }); } } // Phase 2: Message compaction (remove old messages if still needed) - let result; + let result: CompactionResult | null = null; if (this.compactionMode === "summary") { // Use provided model/apiKey or fall back to Kimi @@ -364,11 +394,33 @@ export class SessionManager { // still return the 
pruned messages if (!result) { if (toolResultPruningApplied) { - return { kept: workingMessages, removedCount: 0, reason: "pruning" as const }; + const postPruningTokens = estimateMessagesTokens(workingMessages); + return { + kept: workingMessages, + removedCount: 0, + tokensRemoved: preCompactionTokens - postPruningTokens, + tokensKept: postPruningTokens, + reason: "pruning" as const, + pruningStats, + }; } return null; } + // Override metrics with accurate combined savings (Phase 1 + Phase 2) + const postCompactionTokens = estimateMessagesTokens(result.kept); + result.tokensRemoved = preCompactionTokens - postCompactionTokens; + result.tokensKept = postCompactionTokens; + result.pruningStats = pruningStats; + + this.runLog.log("compaction_detail", { + pre_pruning_tokens: preCompactionTokens, + post_compaction_tokens: postCompactionTokens, + messages_removed: result.removedCount, + reason: result.reason, + pruning_applied: toolResultPruningApplied, + }); + const entries: SessionEntry[] = []; if (this.meta) { entries.push({ type: "meta", meta: this.meta, timestamp: Date.now() }); diff --git a/packages/core/src/client/actions/stream.ts b/packages/core/src/client/actions/stream.ts index dfaf06fa..cf1349d3 100644 --- a/packages/core/src/client/actions/stream.ts +++ b/packages/core/src/client/actions/stream.ts @@ -40,6 +40,8 @@ export type CompactionEndEvent = { tokensRemoved?: number; tokensKept?: number; reason: string; + /** Tool result pruning statistics (when Phase 1 pruning was applied) */ + pruningStats?: { softTrimmed: number; hardCleared: number; charsSaved: number }; }; /** Union of all compaction events */ diff --git a/packages/core/src/hub/api-client.ts b/packages/core/src/hub/api-client.ts index c3254463..d739cc9d 100644 --- a/packages/core/src/hub/api-client.ts +++ b/packages/core/src/hub/api-client.ts @@ -19,7 +19,7 @@ export function getAuthHeaders(context?: string): Record { if (!auth) { const suffix = context ? 
` ${context}` : ""; throw new Error( - `Not logged in. Please sign in via the Desktop app${suffix}.`, + `Not logged in${suffix}. Sign in via the Desktop app, or run pnpm dev:local and log in there.`, ); } return { diff --git a/packages/core/src/hub/auth-store.ts b/packages/core/src/hub/auth-store.ts index ff2342ba..23c88c41 100644 --- a/packages/core/src/hub/auth-store.ts +++ b/packages/core/src/hub/auth-store.ts @@ -1,18 +1,16 @@ import { readFileSync } from "node:fs"; +import { homedir } from "node:os"; import { join } from "node:path"; import { DATA_DIR } from "@multica/utils"; const AUTH_FILE_PATH = join(DATA_DIR, "auth.json"); +const DEV_AUTH_FILE_PATH = join(homedir(), ".super-multica-dev", "auth.json"); export type LocalAuthData = { sid: string; deviceId: string }; -/** - * Read sid and deviceId from ~/.super-multica/auth.json. - * Returns null if the file is missing, unreadable, or incomplete. - */ -export function getLocalAuth(): LocalAuthData | null { +function tryReadAuth(filePath: string): LocalAuthData | null { try { - const raw = readFileSync(AUTH_FILE_PATH, "utf8").trim(); + const raw = readFileSync(filePath, "utf8").trim(); if (!raw) return null; const data = JSON.parse(raw); @@ -32,3 +30,26 @@ export function getLocalAuth(): LocalAuthData | null { return null; } } + +/** + * Read sid and deviceId from auth.json. + * + * Lookup order: + * 1. {DATA_DIR}/auth.json (current data dir, respects SMC_DATA_DIR) + * 2. ~/.super-multica-dev/auth.json (dev environment fallback — + * allows E2E tests and other custom SMC_DATA_DIR setups to + * share the dev auth created by `pnpm dev:local`) + * + * Returns null if no valid auth is found. + */ +export function getLocalAuth(): LocalAuthData | null { + const primary = tryReadAuth(AUTH_FILE_PATH); + if (primary) return primary; + + // Fallback to dev auth when using a custom data dir (e.g. 
E2E tests) + if (AUTH_FILE_PATH !== DEV_AUTH_FILE_PATH) { + return tryReadAuth(DEV_AUTH_FILE_PATH); + } + + return null; +} diff --git a/packages/sdk/src/actions/stream.ts b/packages/sdk/src/actions/stream.ts index dfaf06fa..cf1349d3 100644 --- a/packages/sdk/src/actions/stream.ts +++ b/packages/sdk/src/actions/stream.ts @@ -40,6 +40,8 @@ export type CompactionEndEvent = { tokensRemoved?: number; tokensKept?: number; reason: string; + /** Tool result pruning statistics (when Phase 1 pruning was applied) */ + pruningStats?: { softTrimmed: number; hardCleared: number; charsSaved: number }; }; /** Union of all compaction events */