From 7a16df7c564d200e410a352bfc9db86be81b1cfb Mon Sep 17 00:00:00 2001 From: Jiayuan Zhang Date: Sun, 15 Feb 2026 21:37:36 +0800 Subject: [PATCH] chore(compaction): add E2E compaction benchmark script Multi-turn test script for compaction behavior with low context window. Runs 4 turns of file reading to push context pressure and outputs run-log analysis. Co-Authored-By: Claude Opus 4.6 --- scripts/compaction-benchmark/run.sh | 137 ++++++++++++++++++++++++++++ 1 file changed, 137 insertions(+) create mode 100755 scripts/compaction-benchmark/run.sh diff --git a/scripts/compaction-benchmark/run.sh b/scripts/compaction-benchmark/run.sh new file mode 100755 index 00000000..9433f840 --- /dev/null +++ b/scripts/compaction-benchmark/run.sh @@ -0,0 +1,137 @@ +#!/usr/bin/env bash +# Compaction Benchmark - Multi-turn test with low context window +# +# This script runs a series of prompts against the Multica agent with a very +# low context window (20k tokens) to force compaction to trigger quickly. +# The run-log output is then available for analysis. +# +# Usage: +# bash scripts/compaction-benchmark/run.sh [provider] +# +# Default provider: kimi-coding + +set -euo pipefail + +PROVIDER="${1:-kimi-coding}" +CONTEXT_WINDOW="${2:-20000}" +SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" +ROOT_DIR="$(cd "$SCRIPT_DIR/../.." && pwd)" + +export SMC_DATA_DIR=~/.super-multica-e2e + +echo "=== Compaction Benchmark ===" +echo "Provider: $PROVIDER" +echo "Context Window: $CONTEXT_WINDOW tokens" +echo "Data Dir: $SMC_DATA_DIR" +echo "" + +# Clean previous E2E data +rm -rf "$SMC_DATA_DIR" + +cd "$ROOT_DIR" + +# Turn 1: Start a session with a substantial prompt that generates tool usage +echo "--- Turn 1: Initial prompt (read multiple files) ---" +TURN1_OUTPUT=$(SMC_DATA_DIR="$SMC_DATA_DIR" pnpm multica run \ + --run-log \ + --provider "$PROVIDER" \ + --context-window "$CONTEXT_WINDOW" \ + "Read the following files and give me a brief summary of each: packages/core/src/agent/runner.ts, packages/core/src/agent/session/session-manager.ts, packages/core/src/agent/context-window/token-estimation.ts. List the main exports and key functions in each file." \ + 2>&1) + +# Extract session ID from stderr output +SESSION_ID=$(echo "$TURN1_OUTPUT" | grep -o '\[session: [^]]*\]' | head -1 | sed 's/\[session: //;s/\]//') +SESSION_DIR=$(echo "$TURN1_OUTPUT" | grep -o '\[session-dir: [^]]*\]' | head -1 | sed 's/\[session-dir: //;s/\]//') + +if [ -z "$SESSION_ID" ]; then + echo "ERROR: Could not extract session ID from output" + echo "$TURN1_OUTPUT" + exit 1 +fi + +echo "Session ID: $SESSION_ID" +echo "Session Dir: $SESSION_DIR" +echo "" + +# Turn 2: Continue the session with more file reads to push context higher +echo "--- Turn 2: More file reads (push context higher) ---" +TURN2_OUTPUT=$(SMC_DATA_DIR="$SMC_DATA_DIR" pnpm multica run \ + --run-log \ + --provider "$PROVIDER" \ + --context-window "$CONTEXT_WINDOW" \ + --session "$SESSION_ID" \ + "Now also read packages/core/src/agent/context-window/summarization.ts and packages/core/src/agent/context-window/tool-result-pruning.ts. Describe the key algorithms in each." \ + 2>&1) + +echo "$TURN2_OUTPUT" | head -5 +echo "" + +# Turn 3: More context-heavy work +echo "--- Turn 3: Additional analysis (should trigger compaction) ---" +TURN3_OUTPUT=$(SMC_DATA_DIR="$SMC_DATA_DIR" pnpm multica run \ + --run-log \ + --provider "$PROVIDER" \ + --context-window "$CONTEXT_WINDOW" \ + --session "$SESSION_ID" \ + "Read packages/core/src/agent/session/compaction.ts and explain the three compaction modes. Also read packages/core/src/agent/context-window/guard.ts and explain the guard thresholds." \ + 2>&1) + +echo "$TURN3_OUTPUT" | head -5 +echo "" + +# Turn 4: More tool usage +echo "--- Turn 4: Write and test (more context pressure) ---" +TURN4_OUTPUT=$(SMC_DATA_DIR="$SMC_DATA_DIR" pnpm multica run \ + --run-log \ + --provider "$PROVIDER" \ + --context-window "$CONTEXT_WINDOW" \ + --session "$SESSION_ID" \ + "Based on everything you've read so far, list all the constants and thresholds used in the compaction system. Provide exact values and which file each is defined in." \ + 2>&1) + +echo "$TURN4_OUTPUT" | head -5 +echo "" + +# Output analysis summary +echo "=== Benchmark Complete ===" +echo "Session Dir: $SESSION_DIR" +echo "" + +# Show run-log stats +if [ -f "$SESSION_DIR/run-log.jsonl" ]; then + echo "--- Run Log Event Summary ---" + echo "Total events: $(wc -l < "$SESSION_DIR/run-log.jsonl")" + echo "" + echo "Events by type:" + cat "$SESSION_DIR/run-log.jsonl" | python3 -c " +import sys, json +from collections import Counter +events = Counter() +for line in sys.stdin: + try: + obj = json.loads(line.strip()) + events[obj.get('event', 'unknown')] += 1 + except: + pass +for event, count in sorted(events.items()): + print(f' {event}: {count}') +" 2>/dev/null || echo " (python3 not available for analysis)" + echo "" + + echo "--- Compaction Events ---" + cat "$SESSION_DIR/run-log.jsonl" | python3 -c " +import sys, json +for line in sys.stdin: + try: + obj = json.loads(line.strip()) + event = obj.get('event', '') + if 'compact' in event or 'overflow' in event or 'pruning' in event: + print(json.dumps(obj, indent=2)) + except: + pass +" 2>/dev/null || echo " (python3 not available for analysis)" +fi + +echo "" +echo "=== Full run-log path: $SESSION_DIR/run-log.jsonl ===" +echo "=== Session file path: $SESSION_DIR/session.jsonl ==="