multica/scripts/swe-bench/analyze.ts
Jiayuan Zhang 90d374ffd5 feat(scripts): add SWE-bench runner for Multica agent evaluation
- download-dataset.py: fetches SWE-bench Lite/Verified/Full from HuggingFace
- run.ts: core runner that clones repos, runs Agent, collects git diff patches
- evaluate.sh: wrapper for official SWE-bench Docker evaluation harness
- analyze.ts: summarizes run results with per-repo and timing breakdowns

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-15 18:05:17 +08:00

116 lines
3.8 KiB
TypeScript

#!/usr/bin/env tsx
/**
* Analyze SWE-bench run results.
*
* Reads the .results.jsonl file produced by run.ts and prints a summary.
*
* Usage:
* tsx scripts/swe-bench/analyze.ts [results.jsonl]
*/
import { readFileSync, existsSync } from "node:fs";
import { resolve, join } from "node:path";
/**
 * One record of the results JSONL file: the outcome of a single task run.
 */
interface RunResult {
  /** Task identifier; the per-repository breakdown is derived from it. */
  instance_id: string;
  /** Session identifier (presumably the directory name under the session data dir; confirm against run.ts). */
  session_id: string;
  /** Wall-clock time of the run, in milliseconds. */
  duration_ms: number;
  /** True when the run is counted as "Patched" in the summary. */
  success: boolean;
  /** Patch text; its length feeds the average patch size. */
  patch: string;
  /** Error message, present only when the run reported an error. */
  error?: string;
}
/**
 * Parse the text of a results JSONL file into run records.
 *
 * Blank and whitespace-only lines (including stray "\r" from CRLF files) are
 * skipped. Only JSON syntax is checked; each record is trusted to have the
 * RunResult shape.
 *
 * @param content Full text of the JSONL file.
 * @param sourcePath Path used in error messages.
 * @returns The parsed records, in file order.
 * @throws Error naming the file and 1-based line number of the first line
 *   that is not valid JSON.
 */
function parseResults(content: string, sourcePath: string): RunResult[] {
  const records: RunResult[] = [];
  content.split("\n").forEach((rawLine, index) => {
    const line = rawLine.trim();
    if (line === "") return;
    try {
      records.push(JSON.parse(line) as RunResult);
    } catch (e: unknown) {
      const reason = e instanceof Error ? e.message : String(e);
      throw new Error(`Invalid JSON at ${sourcePath}:${index + 1}: ${reason}`);
    }
  });
  return records;
}

/**
 * Derive an "owner/repo" label from an instance id.
 *
 * Assumes the SWE-bench id form "<owner>__<repo>-<number>"
 * (e.g. "django__django-11099" -> "django/django"): the trailing "-<number>"
 * is dropped and "__" becomes "/". Ids without that suffix are only
 * "__"-translated; an empty result falls back to "unknown".
 */
function repoFromInstanceId(instanceId: string): string {
  return instanceId.replace(/-\d+$/, "").replace(/__/g, "/") || "unknown";
}

/**
 * Read a results JSONL file and print a summary to stdout: totals, duration
 * and patch-size statistics, an error breakdown, a per-repository breakdown,
 * the five slowest tasks, and a hint on where session logs live.
 *
 * The file path is taken from the first CLI argument, defaulting to
 * "scripts/swe-bench/predictions.results.jsonl". Exits with status 1 when the
 * file does not exist. A file with no records prints a short notice instead
 * of statistics.
 *
 * @throws Error when a non-blank line of the file is not valid JSON.
 */
function main() {
  const resultsPath = resolve(
    process.argv[2] || "scripts/swe-bench/predictions.results.jsonl",
  );
  if (!existsSync(resultsPath)) {
    console.error(`Results file not found: ${resultsPath}`);
    process.exit(1);
  }

  const results = parseResults(readFileSync(resultsPath, "utf-8"), resultsPath);
  const total = results.length;

  // With zero records every ratio below would be NaN and min/max would be
  // +/-Infinity, so report the empty run explicitly and stop.
  if (total === 0) {
    console.log("=== SWE-bench Run Analysis ===\n");
    console.log("Total tasks: 0");
    console.log(`No results found in ${resultsPath}`);
    return;
  }

  const patched = results.filter((r) => r.success).length;
  const failed = total - patched;
  const errors = results.filter((r) => r.error).length;

  // Single pass instead of Math.max(...durations): spreading a very large
  // array into call arguments can exceed the engine's argument limit.
  let durationSum = 0;
  let minDuration = Infinity;
  let maxDuration = -Infinity;
  for (const r of results) {
    durationSum += r.duration_ms;
    minDuration = Math.min(minDuration, r.duration_ms);
    maxDuration = Math.max(maxDuration, r.duration_ms);
  }
  const avgDuration = durationSum / total;

  // Patch size is averaged over successful runs only.
  const patchSizes = results
    .filter((r) => r.success)
    .map((r) => r.patch.length);
  const avgPatchSize =
    patchSizes.length > 0
      ? patchSizes.reduce((a, b) => a + b, 0) / patchSizes.length
      : 0;

  console.log("=== SWE-bench Run Analysis ===\n");
  console.log(`Total tasks: ${total}`);
  console.log(`Patched: ${patched} (${((patched / total) * 100).toFixed(1)}%)`);
  console.log(`No patch: ${failed}`);
  console.log(`Errors: ${errors}`);
  console.log();
  console.log(`Avg duration: ${(avgDuration / 1000).toFixed(1)}s`);
  console.log(`Min duration: ${(minDuration / 1000).toFixed(1)}s`);
  console.log(`Max duration: ${(maxDuration / 1000).toFixed(1)}s`);
  console.log(`Avg patch size: ${(avgPatchSize / 1024).toFixed(1)}KB`);

  // Error breakdown: group by message (truncated to 60 chars), most frequent first.
  if (errors > 0) {
    console.log("\n--- Errors ---");
    const errorCounts = new Map<string, number>();
    for (const r of results) {
      if (r.error) {
        const key = r.error.length > 60 ? r.error.slice(0, 60) + "..." : r.error;
        errorCounts.set(key, (errorCounts.get(key) || 0) + 1);
      }
    }
    for (const [err, count] of [...errorCounts.entries()].sort(
      (a, b) => b[1] - a[1],
    )) {
      console.log(` ${count}x ${err}`);
    }
  }

  // Per-repo breakdown, largest repositories first.
  console.log("\n--- By Repository ---");
  const repoStats = new Map<string, { total: number; patched: number }>();
  for (const r of results) {
    const repo = repoFromInstanceId(r.instance_id);
    const stats = repoStats.get(repo) || { total: 0, patched: 0 };
    stats.total++;
    if (r.success) stats.patched++;
    repoStats.set(repo, stats);
  }
  for (const [repo, stats] of [...repoStats.entries()].sort(
    (a, b) => b[1].total - a[1].total,
  )) {
    const pct = ((stats.patched / stats.total) * 100).toFixed(0);
    console.log(
      ` ${repo.padEnd(30)} ${stats.patched}/${stats.total} (${pct}%)`,
    );
  }

  // Slowest tasks (top 5 by duration); sort a copy so `results` keeps file order.
  console.log("\n--- Slowest Tasks ---");
  const sorted = [...results].sort((a, b) => b.duration_ms - a.duration_ms);
  for (const r of sorted.slice(0, 5)) {
    console.log(
      ` ${(r.duration_ms / 1000).toFixed(1)}s ${r.instance_id} ${r.success ? "PATCHED" : "NO_PATCH"}`,
    );
  }

  // Session IDs for further analysis
  const dataDir = process.env.SMC_DATA_DIR || join(process.env.HOME || "~", ".swe-bench-eval");
  console.log(`\n--- Run Logs ---`);
  console.log(`Session data: ${dataDir}/sessions/`);
  console.log(`View a session's run log:`);
  console.log(` cat ${dataDir}/sessions/<session-id>/run-log.jsonl | head -20`);
}
// Script entry point: run the analysis as soon as the module is loaded.
main();