feat(scripts): add SWE-bench runner for Multica agent evaluation

- download-dataset.py: fetches SWE-bench Lite/Verified/Full from HuggingFace
- run.ts: core runner that clones repos, runs Agent, collects git diff patches
- evaluate.sh: wrapper for official SWE-bench Docker evaluation harness
- analyze.ts: summarizes run results with per-repo and timing breakdowns

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Jiayuan Zhang 2026-02-15 18:05:17 +08:00
parent 47f8e621c8
commit 90d374ffd5
5 changed files with 681 additions and 0 deletions

5
scripts/swe-bench/.gitignore vendored Normal file
View file

@ -0,0 +1,5 @@
# Downloaded datasets
*.jsonl
# Keep this .gitignore itself tracked
!.gitignore

View file

@ -0,0 +1,116 @@
#!/usr/bin/env tsx
/**
* Analyze SWE-bench run results.
*
* Reads the .results.jsonl file produced by run.ts and prints a summary.
*
* Usage:
* tsx scripts/swe-bench/analyze.ts [results.jsonl]
*/
import { readFileSync, existsSync } from "node:fs";
import { resolve, join } from "node:path";
/** One record (one JSON line) of the `.results.jsonl` file produced by run.ts. */
interface RunResult {
// SWE-bench task identifier; the per-repository breakdown is derived from it.
instance_id: string;
// Counted as "Patched" when true and "No patch" when false.
success: boolean;
// Patch text; its length is used for the average patch size.
patch: string;
// Error message, when one was recorded for the task.
error?: string;
// Duration of the task in milliseconds.
duration_ms: number;
// Identifier of the session associated with the task.
session_id: string;
}
/**
 * Print a summary of a SWE-bench run.
 *
 * Reads the results JSONL file given as the first CLI argument (default:
 * `scripts/swe-bench/predictions.results.jsonl`) and prints totals, timing
 * statistics, an error breakdown, a per-repository breakdown and the five
 * slowest tasks. Exits with status 1 when the results file does not exist.
 */
function main() {
  const resultsPath = resolve(
    process.argv[2] || "scripts/swe-bench/predictions.results.jsonl",
  );

  if (!existsSync(resultsPath)) {
    console.error(`Results file not found: ${resultsPath}`);
    process.exit(1);
  }

  // Accept both LF and CRLF files and skip whitespace-only lines, which would
  // otherwise make JSON.parse throw.
  const results: RunResult[] = readFileSync(resultsPath, "utf-8")
    .split(/\r?\n/)
    .filter((line) => line.trim() !== "")
    .map((line) => JSON.parse(line) as RunResult);

  const total = results.length;

  console.log("=== SWE-bench Run Analysis ===\n");
  console.log(`Total tasks: ${total}`);

  // Every statistic below divides by the number of results; bail out early
  // instead of printing NaN / Infinity for an empty file.
  if (total === 0) {
    console.log("No results to analyze.");
    return;
  }

  const patched = results.filter((r) => r.success).length;
  const failed = total - patched;
  const errors = results.filter((r) => r.error).length;

  // Single pass instead of Math.max(...durations): spreading a very large
  // array into a call can exceed the engine's argument limit.
  let durationSum = 0;
  let minDuration = Infinity;
  let maxDuration = -Infinity;
  for (const r of results) {
    durationSum += r.duration_ms;
    if (r.duration_ms < minDuration) minDuration = r.duration_ms;
    if (r.duration_ms > maxDuration) maxDuration = r.duration_ms;
  }
  const avgDuration = durationSum / total;

  const patchSizes = results
    .filter((r) => r.success)
    .map((r) => r.patch.length);
  const avgPatchSize =
    patchSizes.length > 0
      ? patchSizes.reduce((a, b) => a + b, 0) / patchSizes.length
      : 0;

  console.log(`Patched: ${patched} (${((patched / total) * 100).toFixed(1)}%)`);
  console.log(`No patch: ${failed}`);
  console.log(`Errors: ${errors}`);
  console.log();
  console.log(`Avg duration: ${(avgDuration / 1000).toFixed(1)}s`);
  console.log(`Min duration: ${(minDuration / 1000).toFixed(1)}s`);
  console.log(`Max duration: ${(maxDuration / 1000).toFixed(1)}s`);
  console.log(`Avg patch size: ${(avgPatchSize / 1024).toFixed(1)}KB`);

  // Error breakdown: group by the (truncated) message, most frequent first.
  if (errors > 0) {
    console.log("\n--- Errors ---");
    const errorCounts = new Map<string, number>();
    for (const r of results) {
      if (r.error) {
        const key = r.error.length > 60 ? r.error.slice(0, 60) + "..." : r.error;
        errorCounts.set(key, (errorCounts.get(key) || 0) + 1);
      }
    }
    for (const [err, count] of [...errorCounts.entries()].sort(
      (a, b) => b[1] - a[1],
    )) {
      console.log(` ${count}x ${err}`);
    }
  }

  // Per-repo breakdown
  console.log("\n--- By Repository ---");
  const repoStats = new Map<string, { total: number; patched: number }>();
  for (const r of results) {
    // Instance IDs look like "django__django-16379": drop the trailing
    // "-<number>" and turn the first "__" into "/" to get "django/django".
    const repo =
      r.instance_id.replace(/-\d+$/, "").replace("__", "/") || "unknown";
    const stats = repoStats.get(repo) || { total: 0, patched: 0 };
    stats.total++;
    if (r.success) stats.patched++;
    repoStats.set(repo, stats);
  }
  for (const [repo, stats] of [...repoStats.entries()].sort(
    (a, b) => b[1].total - a[1].total,
  )) {
    const pct = ((stats.patched / stats.total) * 100).toFixed(0);
    console.log(
      ` ${repo.padEnd(30)} ${stats.patched}/${stats.total} (${pct}%)`,
    );
  }

  // Slowest tasks (sort a copy so `results` keeps its file order)
  console.log("\n--- Slowest Tasks ---");
  const sorted = [...results].sort((a, b) => b.duration_ms - a.duration_ms);
  for (const r of sorted.slice(0, 5)) {
    console.log(
      ` ${(r.duration_ms / 1000).toFixed(1)}s ${r.instance_id} ${r.success ? "PATCHED" : "NO_PATCH"}`,
    );
  }

  // Where to find the per-session run logs for further analysis
  const dataDir = process.env.SMC_DATA_DIR || join(process.env.HOME || "~", ".swe-bench-eval");
  console.log(`\n--- Run Logs ---`);
  console.log(`Session data: ${dataDir}/sessions/`);
  console.log(`View a session's run log:`);
  console.log(` cat ${dataDir}/sessions/<session-id>/run-log.jsonl | head -20`);
}
// Script entry point: run the analysis when this file is executed.
main();

View file

@ -0,0 +1,100 @@
#!/usr/bin/env python3
"""
Download SWE-bench dataset from HuggingFace and export to JSONL for the Node.js runner.
Usage:
pip install datasets
python scripts/swe-bench/download-dataset.py [--dataset verified|lite|full] [--limit N] [--output PATH]
Output format (one JSON object per line):
{
"instance_id": "django__django-16379",
"repo": "django/django",
"base_commit": "abc123...",
"problem_statement": "...",
"hints_text": "...",
"patch": "...", # gold patch (for reference, not shown to agent)
"test_patch": "...", # test patch applied during evaluation
"version": "4.2",
"environment_setup_commit": "..."
}
"""
import argparse
import json
import sys
# Short dataset names accepted by --dataset, mapped to HuggingFace dataset IDs.
DATASET_MAP = {
    "verified": "princeton-nlp/SWE-bench_Verified",
    "lite": "princeton-nlp/SWE-bench_Lite",
    "full": "princeton-nlp/SWE-bench",
}


def main():
    """Download a SWE-bench dataset and export it as JSONL.

    Parses the command line (--dataset, --limit, --output, --split), loads the
    selected dataset split with ``datasets.load_dataset`` and writes one JSON
    object per instance to the output path, keeping only the fields the
    Node.js runner needs.

    Exits with status 1 when the ``datasets`` package is not installed, and
    with an argparse usage error when --limit is negative.
    """
    # Local import: only needed here, to create the output directory.
    import os

    parser = argparse.ArgumentParser(description="Download SWE-bench dataset to JSONL")
    parser.add_argument(
        "--dataset",
        # Derived from DATASET_MAP so the two can never drift apart.
        choices=list(DATASET_MAP),
        default="lite",
        help="Dataset variant (default: lite)",
    )
    parser.add_argument(
        "--limit", type=int, default=0, help="Limit number of instances (0 = all)"
    )
    parser.add_argument(
        "--output",
        type=str,
        default=None,
        help="Output JSONL path (default: scripts/swe-bench/<dataset>.jsonl)",
    )
    parser.add_argument(
        "--split",
        type=str,
        default="test",
        help="Dataset split (default: test)",
    )
    args = parser.parse_args()

    # A negative limit previously wrote exactly one record, which is never
    # what the caller meant; reject it up front.
    if args.limit < 0:
        parser.error("--limit must be >= 0 (0 = all)")

    try:
        from datasets import load_dataset
    except ImportError:
        print("Error: 'datasets' package not installed. Run: pip install datasets", file=sys.stderr)
        sys.exit(1)

    dataset_name = DATASET_MAP[args.dataset]
    output_path = args.output or f"scripts/swe-bench/{args.dataset}.jsonl"

    print(f"Downloading {dataset_name} (split={args.split})...", file=sys.stderr)
    ds = load_dataset(dataset_name, split=args.split)

    # Fields to keep
    keep_fields = [
        "instance_id",
        "repo",
        "base_commit",
        "problem_statement",
        "hints_text",
        "patch",
        "test_patch",
        "version",
        "environment_setup_commit",
    ]

    # Make sure the destination directory exists before opening the file.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    count = 0
    # Records are written with ensure_ascii=False, so the file must be opened
    # as UTF-8 explicitly; the platform default encoding may not be able to
    # represent every character.
    with open(output_path, "w", encoding="utf-8") as f:
        for item in ds:
            record = {field: item[field] for field in keep_fields if field in item}
            f.write(json.dumps(record, ensure_ascii=False) + "\n")
            count += 1
            if args.limit and count >= args.limit:
                break

    print(f"Wrote {count} instances to {output_path}", file=sys.stderr)


if __name__ == "__main__":
    main()

68
scripts/swe-bench/evaluate.sh Executable file
View file

@ -0,0 +1,68 @@
#!/usr/bin/env bash
#
# Evaluate Multica predictions against SWE-bench using the official Docker harness.
#
# Prerequisites:
#   pip install swebench
#   Docker running with at least 120GB storage, 16GB RAM, 8 CPU cores
#
# Usage:
#   bash scripts/swe-bench/evaluate.sh [predictions.jsonl] [dataset] [run_id]
#
# Environment:
#   PYTHON       Python interpreter to use (default: `python`, else `python3`)
#   MAX_WORKERS  Value passed to the harness as --max_workers (default: 4)
#
# Examples:
#   bash scripts/swe-bench/evaluate.sh
#   bash scripts/swe-bench/evaluate.sh scripts/swe-bench/predictions.jsonl lite multica-v1
#   MAX_WORKERS=8 bash scripts/swe-bench/evaluate.sh

set -euo pipefail

PREDICTIONS="${1:-scripts/swe-bench/predictions.jsonl}"
DATASET="${2:-lite}"
RUN_ID="${3:-multica}"
MAX_WORKERS="${MAX_WORKERS:-4}"

# Map short names to HuggingFace dataset names; anything else is passed through
# unchanged so a full dataset name can be given directly.
case "$DATASET" in
  lite) DATASET_NAME="princeton-nlp/SWE-bench_Lite" ;;
  verified) DATASET_NAME="princeton-nlp/SWE-bench_Verified" ;;
  full) DATASET_NAME="princeton-nlp/SWE-bench" ;;
  *) DATASET_NAME="$DATASET" ;;
esac

echo "=== SWE-bench Evaluation ==="
echo "Predictions: $PREDICTIONS"
echo "Dataset: $DATASET_NAME"
echo "Run ID: $RUN_ID"
echo ""

if [ ! -f "$PREDICTIONS" ]; then
  echo "Error: Predictions file not found: $PREDICTIONS" >&2
  exit 1
fi

# Reject a non-numeric worker count here rather than deep inside the harness.
case "$MAX_WORKERS" in
  ''|*[!0-9]*)
    echo "Error: MAX_WORKERS must be a positive integer, got: $MAX_WORKERS" >&2
    exit 1
    ;;
esac

TASK_COUNT=$(wc -l < "$PREDICTIONS" | tr -d ' ')
echo "Tasks to evaluate: $TASK_COUNT"
echo ""

# Pick the interpreter: an explicit PYTHON wins, then `python` (the original
# behaviour), then `python3` for systems that ship no unversioned binary.
PYTHON_BIN="${PYTHON:-}"
if [ -z "$PYTHON_BIN" ]; then
  if command -v python >/dev/null 2>&1; then
    PYTHON_BIN="python"
  elif command -v python3 >/dev/null 2>&1; then
    PYTHON_BIN="python3"
  else
    echo "Error: no python interpreter found on PATH (set PYTHON to override)" >&2
    exit 1
  fi
fi

# Check if swebench is installed
if ! "$PYTHON_BIN" -c "import swebench" 2>/dev/null; then
  echo "Error: swebench not installed. Run: pip install swebench" >&2
  exit 1
fi

# Check if Docker is running
if ! docker info >/dev/null 2>&1; then
  echo "Error: Docker is not running" >&2
  exit 1
fi

echo "Starting evaluation (this may take a while)..."
echo ""

"$PYTHON_BIN" -m swebench.harness.run_evaluation \
  --dataset_name "$DATASET_NAME" \
  --predictions_path "$PREDICTIONS" \
  --max_workers "$MAX_WORKERS" \
  --run_id "$RUN_ID"

echo ""
echo "=== Evaluation Complete ==="
echo "Check logs/ and evaluation_results/ for detailed results."

392
scripts/swe-bench/run.ts Normal file
View file

@ -0,0 +1,392 @@
#!/usr/bin/env tsx
/**
* SWE-bench Runner for Multica
*
* Runs the Multica agent against SWE-bench task instances and collects patches.
*
* Usage:
* tsx scripts/swe-bench/run.ts [options]
*
* Options:
* --dataset PATH Path to JSONL dataset (default: scripts/swe-bench/lite.jsonl)
* --provider NAME LLM provider (default: kimi-coding)
* --model NAME Model name
* --limit N Max tasks to run (default: all)
* --offset N Skip first N tasks (default: 0)
* --output PATH Output predictions JSONL (default: scripts/swe-bench/predictions.jsonl)
* --workdir PATH Working directory for repos (default: /tmp/swe-bench)
* --timeout MS Timeout per task in ms (default: 300000 = 5min)
* --instance ID Run a single instance by ID
* --debug Enable debug logging
*/
import { readFileSync, writeFileSync, appendFileSync, existsSync, mkdirSync } from "node:fs";
import { join, resolve } from "node:path";
import { execSync, spawn } from "node:child_process";
import { Agent } from "@multica/core";
import type { AgentOptions } from "@multica/core";
// ============================================================
// Types
// ============================================================
/** One task instance, parsed from a line of the dataset JSONL file. */
interface SWEBenchTask {
// Task identifier; also used to name the per-task checkout directory.
instance_id: string;
// GitHub "owner/name" slug; the clone URL is built from it.
repo: string;
// Commit the repository is checked out at before the agent runs.
base_commit: string;
// Issue text shown to the agent.
problem_statement: string;
// Optional hints, appended to the prompt when present.
hints_text?: string;
// Remaining fields are carried over from the dataset and not read by this runner.
patch?: string;
test_patch?: string;
version?: string;
environment_setup_commit?: string;
}
/** One line of the predictions JSONL output, in the format the SWE-bench harness consumes. */
interface Prediction {
// Identifier of the task the patch belongs to.
instance_id: string;
// Patch collected from the task's working tree.
model_patch: string;
// Label built from the provider and (optional) model name.
model_name_or_path: string;
}
/** Detailed outcome of one task, written as one line of the `.results.jsonl` file. */
interface RunResult {
// Identifier of the task that was run.
instance_id: string;
// True when the run finished and produced a non-empty patch.
success: boolean;
// Patch text collected from the repository (may be empty).
patch: string;
// Error reported by the agent run, or the message of a thrown error (e.g. "timeout").
error?: string;
// Wall-clock time spent on the task, in milliseconds.
duration_ms: number;
// Session ID of the agent that handled the task.
session_id: string;
}
// ============================================================
// CLI argument parsing
// ============================================================
/** Command-line options of the runner, as produced by parseArgs(). */
interface RunOptions {
// --dataset: path to the JSONL dataset.
dataset: string;
// --provider: LLM provider name.
provider: string;
// --model: model name (optional).
model?: string;
// --limit: maximum number of tasks to run; 0 means all.
limit: number;
// --offset: number of tasks to skip from the start.
offset: number;
// --output: path of the predictions JSONL file.
output: string;
// --workdir: directory under which repositories are checked out.
workdir: string;
// --timeout: per-task timeout in milliseconds.
timeout: number;
// --instance: run only the task with this instance ID.
instance?: string;
// --debug: enable debug logging.
debug: boolean;
}
/**
 * Parse the runner's command-line flags from `process.argv`.
 *
 * Starts from the documented defaults and overrides them with any flags
 * given. Prints a message and exits with status 1 on an unknown flag, a flag
 * whose value is missing, a non-numeric --limit/--offset/--timeout, or a
 * --timeout that is not positive.
 *
 * @returns The fully populated run options.
 */
function parseArgs(): RunOptions {
  const args = process.argv.slice(2);
  const opts: RunOptions = {
    dataset: "scripts/swe-bench/lite.jsonl",
    provider: "kimi-coding",
    limit: 0,
    offset: 0,
    output: "scripts/swe-bench/predictions.jsonl",
    workdir: "/tmp/swe-bench",
    timeout: 300_000, // 5 minutes
    debug: false,
  };

  const fail = (message: string): never => {
    console.error(message);
    return process.exit(1);
  };

  let index = 0;

  // Consume and return the value that follows the flag at `index`.
  const takeValue = (flag: string): string => {
    index += 1;
    const value = args[index];
    if (value === undefined) return fail(`Missing value for ${flag}`);
    return value;
  };

  // Like takeValue, but the value must parse as an integer. A NaN limit would
  // silently run every task, and a NaN timeout would fire immediately.
  const takeInt = (flag: string): number => {
    const raw = takeValue(flag);
    const parsed = parseInt(raw, 10);
    if (Number.isNaN(parsed)) {
      return fail(`Invalid value for ${flag}: ${raw} (expected an integer)`);
    }
    return parsed;
  };

  for (; index < args.length; index += 1) {
    const arg = args[index];
    if (arg === undefined) break;
    switch (arg) {
      case "--dataset":
        opts.dataset = takeValue(arg);
        break;
      case "--provider":
        opts.provider = takeValue(arg);
        break;
      case "--model":
        opts.model = takeValue(arg);
        break;
      case "--limit":
        opts.limit = takeInt(arg);
        break;
      case "--offset":
        opts.offset = takeInt(arg);
        break;
      case "--output":
        opts.output = takeValue(arg);
        break;
      case "--workdir":
        opts.workdir = takeValue(arg);
        break;
      case "--timeout":
        opts.timeout = takeInt(arg);
        if (opts.timeout <= 0) {
          fail(`Invalid value for ${arg}: ${opts.timeout} (expected a positive number of milliseconds)`);
        }
        break;
      case "--instance":
        opts.instance = takeValue(arg);
        break;
      case "--debug":
        opts.debug = true;
        break;
      default:
        fail(`Unknown argument: ${arg}`);
    }
  }
  return opts;
}
// ============================================================
// Dataset loading
// ============================================================
/**
 * Load a SWE-bench dataset from a JSONL file (one task object per line).
 *
 * Blank and whitespace-only lines are skipped, and both LF and CRLF line
 * endings are accepted. Prints a hint and exits with status 1 when the file
 * does not exist.
 *
 * @param path Path to the JSONL dataset file.
 * @returns The parsed tasks, in file order.
 * @throws Error naming the file and 1-based line number when a line is not valid JSON.
 */
function loadDataset(path: string): SWEBenchTask[] {
  if (!existsSync(path)) {
    console.error(`Dataset not found: ${path}`);
    console.error("Run: python scripts/swe-bench/download-dataset.py");
    process.exit(1);
  }

  const tasks: SWEBenchTask[] = [];
  const lines = readFileSync(path, "utf-8").split(/\r?\n/);
  lines.forEach((line, index) => {
    if (line.trim() === "") return;
    try {
      tasks.push(JSON.parse(line) as SWEBenchTask);
    } catch (err) {
      const reason = err instanceof Error ? err.message : String(err);
      throw new Error(`Invalid JSON in ${path} at line ${index + 1}: ${reason}`);
    }
  });
  return tasks;
}
// ============================================================
// Repository setup
// ============================================================
/**
 * Make sure a checkout of the task's repository exists at its base commit.
 *
 * An existing checkout is force-reset to the base commit and cleaned of
 * untracked and ignored files; otherwise the repository is cloned from
 * GitHub and the base commit is checked out.
 *
 * @param task Task whose `repo` and `base_commit` select what to check out.
 * @param workdir Directory that holds one checkout per task instance.
 * @returns Path of the prepared repository directory.
 */
function setupRepo(task: SWEBenchTask, workdir: string): string {
  const checkoutName = task.instance_id.replace(/\//g, "__");
  const repoDir = join(workdir, checkoutName);
  const baseCommit = task.base_commit;

  // Fast path: reuse the checkout left behind by an earlier run.
  if (existsSync(repoDir)) {
    log(` Resetting existing repo to ${baseCommit.slice(0, 8)}...`);
    const resetCommand = `git checkout -f ${baseCommit} && git clean -fdx`;
    execSync(resetCommand, { cwd: repoDir, stdio: "pipe", timeout: 60_000 });
    return repoDir;
  }

  // Fresh clone from GitHub, then pin the working tree to the base commit.
  const repoUrl = `https://github.com/${task.repo}.git`;
  log(` Cloning ${task.repo}...`);
  mkdirSync(workdir, { recursive: true });

  const cloneCommand = `git clone --quiet ${repoUrl} "${repoDir}"`;
  execSync(cloneCommand, { stdio: "pipe", timeout: 120_000 });

  const checkoutCommand = `git checkout -f ${baseCommit}`;
  execSync(checkoutCommand, { cwd: repoDir, stdio: "pipe", timeout: 30_000 });

  return repoDir;
}
// ============================================================
// System prompt
// ============================================================
/**
 * Build the system prompt for a task: fixed bug-fixing instructions and
 * rules, followed by the repository slug and the first 12 characters of the
 * base commit.
 *
 * @param task Task supplying `repo` and `base_commit`.
 * @returns The prompt text, one instruction per line.
 */
function buildSystemPrompt(task: SWEBenchTask): string {
  const shortCommit = task.base_commit.slice(0, 12);
  const promptLines = [
    "You are an expert software engineer tasked with fixing a bug in an open-source repository.",
    "## Instructions",
    "1. Read the issue description carefully and understand the problem.",
    "2. Explore the repository to find the relevant source code.",
    "3. Identify the root cause of the issue.",
    "4. Make the minimal set of changes to fix the issue. Do NOT add tests.",
    "5. After making changes, verify your fix makes sense.",
    "## Important Rules",
    "- Make ONLY the changes necessary to fix the described issue.",
    "- Do NOT modify or add any test files.",
    "- Do NOT add comments explaining the fix unless the code is non-obvious.",
    "- Do NOT refactor unrelated code.",
    "- Keep changes minimal and focused.",
    "## Repository",
    "This is the `" + task.repo + "` repository checked out at commit `" + shortCommit + "`.",
  ];
  return promptLines.join("\n");
}
/**
 * Build the user prompt for a task: the issue text, an optional hints
 * section (only when `hints_text` is non-empty), and a closing reminder.
 *
 * @param task Task supplying `problem_statement` and optional `hints_text`.
 * @returns The prompt sections separated by blank lines.
 */
function buildPrompt(task: SWEBenchTask): string {
  const sections: string[] = [`## Issue\n\n${task.problem_statement}`];
  if (task.hints_text) {
    sections.push(`## Hints\n\n${task.hints_text}`);
  }
  sections.push("Please fix this issue. Remember: make minimal changes, do not modify tests.");
  return sections.join("\n\n");
}
// ============================================================
// Run a single task
// ============================================================
/**
 * Collect everything that changed in `repoDir` relative to `baseCommit`.
 *
 * Stages the whole working tree first so files the agent created are part of
 * the diff, then diffs the index against the base commit so staged and
 * committed changes are included too. Falls back to a plain `git diff` if
 * that fails, and to an empty patch if git cannot be run at all.
 */
function collectPatch(repoDir: string, baseCommit: string): string {
  const diffOptions = {
    cwd: repoDir,
    encoding: "utf-8" as const,
    maxBuffer: 10 * 1024 * 1024, // 10MB
    timeout: 10_000,
  };
  try {
    execSync("git add -A", { cwd: repoDir, stdio: "pipe", timeout: 60_000 });
    return execSync(`git diff --cached ${baseCommit}`, diffOptions);
  } catch {
    try {
      return execSync("git diff", diffOptions);
    } catch {
      return "";
    }
  }
}

/**
 * Run the agent on a single task and collect the resulting patch.
 *
 * Prepares the repository, runs the agent on the task prompt with a per-task
 * timeout, then gathers the changes made in the working tree. Failures are
 * reported through the returned result rather than thrown, so one bad task
 * does not abort the whole run.
 *
 * @param task Task to run.
 * @param opts Run options (provider, model, workdir, timeout, debug).
 * @returns The outcome; `success` is true only when the run completed and the
 *   patch is non-empty. `session_id` is empty when repository setup failed
 *   before an agent was created.
 */
async function runTask(
  task: SWEBenchTask,
  opts: RunOptions,
): Promise<RunResult> {
  const start = Date.now();

  // Setup repo. A clone/checkout failure is recorded against this task only.
  let repoDir: string;
  try {
    repoDir = setupRepo(task, opts.workdir);
  } catch (err) {
    const reason = err instanceof Error ? err.message : String(err);
    return {
      instance_id: task.instance_id,
      success: false,
      patch: "",
      error: `repo setup failed: ${reason}`,
      duration_ms: Date.now() - start,
      session_id: "",
    };
  }

  // Create agent
  const agentOptions: AgentOptions = {
    provider: opts.provider,
    model: opts.model,
    cwd: repoDir,
    enableRunLog: true,
    debug: opts.debug,
    systemPrompt: buildSystemPrompt(task),
    enableSkills: false,
    tools: {
      // Only allow coding tools — no web, no cron, no sessions
      deny: ["web_fetch", "web_search", "cron", "data", "sessions_spawn", "sessions_list", "memory_search", "send_file"],
    },
  };
  const agent = new Agent(agentOptions);
  log(` Session: ${agent.sessionId}`);

  // Keep a handle on the timer so it can be cleared once the race settles;
  // otherwise every task leaves a timer pending for up to `opts.timeout` ms.
  let timer: ReturnType<typeof setTimeout> | undefined;
  const timeoutPromise = new Promise<never>((_, reject) => {
    timer = setTimeout(() => reject(new Error("timeout")), opts.timeout);
  });

  try {
    // Run agent with timeout.
    // NOTE(review): on timeout the agent run is abandoned, not cancelled;
    // confirm whether Agent exposes an abort/stop API that should be called.
    const result = await Promise.race([
      agent.run(buildPrompt(task)),
      timeoutPromise,
    ]);

    const patch = collectPatch(repoDir, task.base_commit);
    return {
      instance_id: task.instance_id,
      success: patch.length > 0,
      patch,
      error: result.error,
      duration_ms: Date.now() - start,
      session_id: agent.sessionId,
    };
  } catch (err) {
    // Collect any partial patch the agent produced before failing.
    const patch = collectPatch(repoDir, task.base_commit);
    return {
      instance_id: task.instance_id,
      success: false,
      patch,
      error: err instanceof Error ? err.message : String(err),
      duration_ms: Date.now() - start,
      session_id: agent.sessionId,
    };
  } finally {
    clearTimeout(timer);
  }
}
// ============================================================
// Logging
// ============================================================
/**
 * Write a progress message to stderr, prefixed with the current UTC time as
 * `[HH:MM:SS]`.
 *
 * @param msg Message text to print after the timestamp.
 */
function log(msg: string) {
  const isoNow = new Date().toISOString();
  // Characters 11..18 of an ISO timestamp are the HH:MM:SS part.
  const clock = isoNow.substring(11, 19);
  console.error("[" + clock + "] " + msg);
}
// ============================================================
// Main
// ============================================================
/**
 * Entry point of the runner.
 *
 * Parses the CLI options, loads and filters the dataset, runs every selected
 * task sequentially and appends one line per task to both the predictions
 * file and the detailed results file, then logs a summary.
 *
 * Exits with status 1 when --instance names a task that is not in the dataset.
 */
async function main() {
  const opts = parseArgs();
  log("SWE-bench Runner for Multica");
  log(`Provider: ${opts.provider}${opts.model ? ` (${opts.model})` : ""}`);
  log(`Dataset: ${opts.dataset}`);
  log(`Work dir: ${opts.workdir}`);
  log(`Timeout: ${opts.timeout / 1000}s per task`);

  // Set SMC_DATA_DIR for isolation
  if (!process.env.SMC_DATA_DIR) {
    process.env.SMC_DATA_DIR = join(process.env.HOME || "~", ".swe-bench-eval");
    log(`SMC_DATA_DIR: ${process.env.SMC_DATA_DIR}`);
  }

  // Load dataset
  let tasks = loadDataset(resolve(opts.dataset));
  log(`Loaded ${tasks.length} tasks`);

  // Filter by instance ID if specified
  if (opts.instance) {
    tasks = tasks.filter((t) => t.instance_id === opts.instance);
    if (tasks.length === 0) {
      console.error(`Instance not found: ${opts.instance}`);
      process.exit(1);
    }
  }

  // Apply offset and limit
  if (opts.offset > 0) {
    tasks = tasks.slice(opts.offset);
  }
  if (opts.limit > 0) {
    tasks = tasks.slice(0, opts.limit);
  }
  log(`Running ${tasks.length} tasks`);

  // Prepare output. Derive the results path from the file suffix only: a
  // plain replace() would rewrite the first ".jsonl" anywhere in the path and,
  // when the name has none, make both files the same path.
  const outputPath = resolve(opts.output);
  const suffix = ".jsonl";
  const resultsPath = outputPath.endsWith(suffix)
    ? `${outputPath.slice(0, -suffix.length)}.results${suffix}`
    : `${outputPath}.results${suffix}`;
  // Both files live in the same directory; make sure it exists before the
  // first append.
  mkdirSync(resolve(outputPath, ".."), { recursive: true });

  // Run tasks sequentially
  const modelName = `multica-${opts.provider}${opts.model ? `-${opts.model}` : ""}`;
  let completed = 0;
  let succeeded = 0;
  for (const task of tasks) {
    completed++;
    log(`\n[${completed}/${tasks.length}] ${task.instance_id}`);
    const result = await runTask(task, opts);
    if (result.success) succeeded++;

    // Write prediction in SWE-bench format
    const prediction: Prediction = {
      instance_id: result.instance_id,
      model_patch: result.patch,
      model_name_or_path: modelName,
    };
    appendFileSync(outputPath, JSON.stringify(prediction) + "\n");

    // Write detailed result
    appendFileSync(resultsPath, JSON.stringify(result) + "\n");

    const status = result.success ? "PATCHED" : "NO_PATCH";
    const errorInfo = result.error ? ` (${result.error})` : "";
    log(
      ` ${status} | ${(result.duration_ms / 1000).toFixed(1)}s | patch=${result.patch.length} bytes${errorInfo}`,
    );
  }

  log(`\n========================================`);
  log(`Results: ${succeeded}/${completed} tasks produced patches`);
  log(`Predictions: ${outputPath}`);
  log(`Details: ${resultsPath}`);
  log(`\nTo evaluate with SWE-bench harness:`);
  log(
    ` python -m swebench.harness.run_evaluation --dataset_name princeton-nlp/SWE-bench_Lite --predictions_path ${outputPath} --max_workers 4 --run_id multica`,
  );
}
// Script entry point: run main() and turn any rejection into a logged
// fatal error with a non-zero exit status.
main().catch((err) => {
console.error("Fatal error:", err);
process.exit(1);
});