chore(compaction): add E2E compaction benchmark script
Multi-turn test script for compaction behavior with low context window. Runs 4 turns of file reading to push context pressure and outputs run-log analysis. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
b412ca902b
commit
7a16df7c56
1 changed files with 137 additions and 0 deletions
137
scripts/compaction-benchmark/run.sh
Executable file
137
scripts/compaction-benchmark/run.sh
Executable file
|
|
@ -0,0 +1,137 @@
|
|||
#!/usr/bin/env bash
|
||||
# Compaction Benchmark - Multi-turn test with low context window
|
||||
#
|
||||
# This script runs a series of prompts against the Multica agent with a very
|
||||
# low context window (20k tokens) to force compaction to trigger quickly.
|
||||
# The run-log output is then available for analysis.
|
||||
#
|
||||
# Usage:
|
||||
# bash scripts/compaction-benchmark/run.sh [provider]
|
||||
#
|
||||
# Default provider: kimi-coding
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
PROVIDER="${1:-kimi-coding}"
|
||||
CONTEXT_WINDOW="${2:-20000}"
|
||||
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
|
||||
ROOT_DIR="$(cd "$SCRIPT_DIR/../.." && pwd)"
|
||||
|
||||
export SMC_DATA_DIR=~/.super-multica-e2e
|
||||
|
||||
echo "=== Compaction Benchmark ==="
|
||||
echo "Provider: $PROVIDER"
|
||||
echo "Context Window: $CONTEXT_WINDOW tokens"
|
||||
echo "Data Dir: $SMC_DATA_DIR"
|
||||
echo ""
|
||||
|
||||
# Clean previous E2E data
|
||||
rm -rf "$SMC_DATA_DIR"
|
||||
|
||||
cd "$ROOT_DIR"
|
||||
|
||||
# Turn 1: Start a session with a substantial prompt that generates tool usage
|
||||
echo "--- Turn 1: Initial prompt (read multiple files) ---"
|
||||
TURN1_OUTPUT=$(SMC_DATA_DIR="$SMC_DATA_DIR" pnpm multica run \
|
||||
--run-log \
|
||||
--provider "$PROVIDER" \
|
||||
--context-window "$CONTEXT_WINDOW" \
|
||||
"Read the following files and give me a brief summary of each: packages/core/src/agent/runner.ts, packages/core/src/agent/session/session-manager.ts, packages/core/src/agent/context-window/token-estimation.ts. List the main exports and key functions in each file." \
|
||||
2>&1)
|
||||
|
||||
# Extract session ID from stderr output
|
||||
SESSION_ID=$(echo "$TURN1_OUTPUT" | grep -o '\[session: [^]]*\]' | head -1 | sed 's/\[session: //;s/\]//')
|
||||
SESSION_DIR=$(echo "$TURN1_OUTPUT" | grep -o '\[session-dir: [^]]*\]' | head -1 | sed 's/\[session-dir: //;s/\]//')
|
||||
|
||||
if [ -z "$SESSION_ID" ]; then
|
||||
echo "ERROR: Could not extract session ID from output"
|
||||
echo "$TURN1_OUTPUT"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
echo "Session ID: $SESSION_ID"
|
||||
echo "Session Dir: $SESSION_DIR"
|
||||
echo ""
|
||||
|
||||
# Turn 2: Continue the session with more file reads to push context higher
|
||||
echo "--- Turn 2: More file reads (push context higher) ---"
|
||||
TURN2_OUTPUT=$(SMC_DATA_DIR="$SMC_DATA_DIR" pnpm multica run \
|
||||
--run-log \
|
||||
--provider "$PROVIDER" \
|
||||
--context-window "$CONTEXT_WINDOW" \
|
||||
--session "$SESSION_ID" \
|
||||
"Now also read packages/core/src/agent/context-window/summarization.ts and packages/core/src/agent/context-window/tool-result-pruning.ts. Describe the key algorithms in each." \
|
||||
2>&1)
|
||||
|
||||
echo "$TURN2_OUTPUT" | head -5
|
||||
echo ""
|
||||
|
||||
# Turn 3: More context-heavy work
|
||||
echo "--- Turn 3: Additional analysis (should trigger compaction) ---"
|
||||
TURN3_OUTPUT=$(SMC_DATA_DIR="$SMC_DATA_DIR" pnpm multica run \
|
||||
--run-log \
|
||||
--provider "$PROVIDER" \
|
||||
--context-window "$CONTEXT_WINDOW" \
|
||||
--session "$SESSION_ID" \
|
||||
"Read packages/core/src/agent/session/compaction.ts and explain the three compaction modes. Also read packages/core/src/agent/context-window/guard.ts and explain the guard thresholds." \
|
||||
2>&1)
|
||||
|
||||
echo "$TURN3_OUTPUT" | head -5
|
||||
echo ""
|
||||
|
||||
# Turn 4: More tool usage
|
||||
echo "--- Turn 4: Write and test (more context pressure) ---"
|
||||
TURN4_OUTPUT=$(SMC_DATA_DIR="$SMC_DATA_DIR" pnpm multica run \
|
||||
--run-log \
|
||||
--provider "$PROVIDER" \
|
||||
--context-window "$CONTEXT_WINDOW" \
|
||||
--session "$SESSION_ID" \
|
||||
"Based on everything you've read so far, list all the constants and thresholds used in the compaction system. Provide exact values and which file each is defined in." \
|
||||
2>&1)
|
||||
|
||||
echo "$TURN4_OUTPUT" | head -5
|
||||
echo ""
|
||||
|
||||
# Output analysis summary
|
||||
echo "=== Benchmark Complete ==="
|
||||
echo "Session Dir: $SESSION_DIR"
|
||||
echo ""
|
||||
|
||||
# Show run-log stats
|
||||
if [ -f "$SESSION_DIR/run-log.jsonl" ]; then
|
||||
echo "--- Run Log Event Summary ---"
|
||||
echo "Total events: $(wc -l < "$SESSION_DIR/run-log.jsonl")"
|
||||
echo ""
|
||||
echo "Events by type:"
|
||||
cat "$SESSION_DIR/run-log.jsonl" | python3 -c "
|
||||
import sys, json
|
||||
from collections import Counter
|
||||
events = Counter()
|
||||
for line in sys.stdin:
|
||||
try:
|
||||
obj = json.loads(line.strip())
|
||||
events[obj.get('event', 'unknown')] += 1
|
||||
except:
|
||||
pass
|
||||
for event, count in sorted(events.items()):
|
||||
print(f' {event}: {count}')
|
||||
" 2>/dev/null || echo " (python3 not available for analysis)"
|
||||
echo ""
|
||||
|
||||
echo "--- Compaction Events ---"
|
||||
cat "$SESSION_DIR/run-log.jsonl" | python3 -c "
|
||||
import sys, json
|
||||
for line in sys.stdin:
|
||||
try:
|
||||
obj = json.loads(line.strip())
|
||||
event = obj.get('event', '')
|
||||
if 'compact' in event or 'overflow' in event or 'pruning' in event:
|
||||
print(json.dumps(obj, indent=2))
|
||||
except:
|
||||
pass
|
||||
" 2>/dev/null || echo " (python3 not available for analysis)"
|
||||
fi
|
||||
|
||||
echo ""
|
||||
echo "=== Full run-log path: $SESSION_DIR/run-log.jsonl ==="
|
||||
echo "=== Session file path: $SESSION_DIR/session.jsonl ==="
|
||||
Loading…
Add table
Add a link
Reference in a new issue