#!/usr/bin/env tsx
/**
 * Analyze SWE-bench run results.
 *
 * Reads the .results.jsonl file produced by run.ts and prints a summary.
 *
 * Usage:
 *   tsx scripts/swe-bench/analyze.ts [results.jsonl]
 */
import { readFileSync, existsSync } from "node:fs";
import { resolve, join } from "node:path";

/** One record (one line) of the results JSONL file. */
interface RunResult {
  instance_id: string;
  success: boolean;
  patch: string;
  error?: string;
  duration_ms: number;
  session_id: string;
}

/** Per-repository tally used by the "By Repository" section. */
interface RepoStats {
  total: number;
  patched: number;
}

/** Longest error prefix used as a grouping key in the error breakdown. */
const ERROR_KEY_MAX_LENGTH = 60;

/** Number of entries shown in the "Slowest Tasks" section. */
const SLOWEST_TASK_COUNT = 5;

/**
 * Checks that a parsed JSON value has the fields this script reads
 * unconditionally (plus `patch` when the record is marked successful).
 */
function isRunResult(value: unknown): value is RunResult {
  if (typeof value !== "object" || value === null) return false;
  const rec = value as Record<string, unknown>;
  return (
    typeof rec.instance_id === "string" &&
    typeof rec.success === "boolean" &&
    typeof rec.duration_ms === "number" &&
    (!rec.success || typeof rec.patch === "string")
  );
}

/**
 * Parses JSONL text into run results.
 *
 * Blank and whitespace-only lines (e.g. a trailing "\r" from CRLF files) are
 * skipped.
 *
 * @throws Error naming the 1-based line number when a line is not valid JSON
 *   or lacks the required fields.
 */
function parseResults(text: string): RunResult[] {
  const results: RunResult[] = [];
  text.split("\n").forEach((rawLine, index) => {
    const line = rawLine.trim();
    if (line === "") return;

    let parsed: unknown;
    try {
      parsed = JSON.parse(line);
    } catch (e: unknown) {
      const reason = e instanceof Error ? e.message : String(e);
      throw new Error(`Invalid JSON on line ${index + 1}: ${reason}`);
    }
    if (!isRunResult(parsed)) {
      throw new Error(
        `Line ${index + 1} is not a run result (expected instance_id, success, duration_ms)`,
      );
    }
    results.push(parsed);
  });
  return results;
}

/**
 * Derives the "org/repo" label from an instance id of the form
 * "<org>__<repo>-<number>" (e.g. "django__django-12345" -> "django/django").
 * Falls back to "unknown" when nothing is left after stripping.
 */
function repoOf(instanceId: string): string {
  const repo = instanceId.replace(/-\d+$/, "").replace(/__/g, "/");
  return repo || "unknown";
}

/** Formats a millisecond duration as seconds with one decimal, e.g. "12.3s". */
function formatSeconds(ms: number): string {
  return `${(ms / 1000).toFixed(1)}s`;
}

/** Prints the error breakdown, most frequent first, grouping by message prefix. */
function printErrorBreakdown(results: readonly RunResult[]): void {
  console.log("\n--- Errors ---");
  const errorCounts = new Map<string, number>();
  for (const r of results) {
    if (!r.error) continue;
    // Truncate so that long messages sharing a prefix collapse into one row.
    const key =
      r.error.length > ERROR_KEY_MAX_LENGTH
        ? r.error.slice(0, ERROR_KEY_MAX_LENGTH) + "..."
        : r.error;
    errorCounts.set(key, (errorCounts.get(key) ?? 0) + 1);
  }
  const ranked = [...errorCounts.entries()].sort((a, b) => b[1] - a[1]);
  for (const [err, count] of ranked) {
    console.log(` ${count}x ${err}`);
  }
}

/** Prints patched/total per repository, largest repositories first. */
function printRepoBreakdown(results: readonly RunResult[]): void {
  console.log("\n--- By Repository ---");
  const repoStats = new Map<string, RepoStats>();
  for (const r of results) {
    const repo = repoOf(r.instance_id);
    const stats = repoStats.get(repo) ?? { total: 0, patched: 0 };
    stats.total++;
    if (r.success) stats.patched++;
    repoStats.set(repo, stats);
  }
  const ranked = [...repoStats.entries()].sort(
    (a, b) => b[1].total - a[1].total,
  );
  for (const [repo, stats] of ranked) {
    const pct = ((stats.patched / stats.total) * 100).toFixed(0);
    console.log(
      ` ${repo.padEnd(30)} ${stats.patched}/${stats.total} (${pct}%)`,
    );
  }
}

/** Prints the tasks with the longest duration. */
function printSlowestTasks(results: readonly RunResult[]): void {
  console.log("\n--- Slowest Tasks ---");
  const sorted = [...results].sort((a, b) => b.duration_ms - a.duration_ms);
  for (const r of sorted.slice(0, SLOWEST_TASK_COUNT)) {
    console.log(
      ` ${formatSeconds(r.duration_ms)} ${r.instance_id} ${r.success ? "PATCHED" : "NO_PATCH"}`,
    );
  }
}

/**
 * Entry point: loads the results file named by the first CLI argument (or the
 * default predictions path) and prints the summary to stdout.
 *
 * Exits with status 1 when the file is missing or cannot be parsed.
 */
function main() {
  const resultsPath = resolve(
    process.argv[2] || "scripts/swe-bench/predictions.results.jsonl",
  );

  if (!existsSync(resultsPath)) {
    console.error(`Results file not found: ${resultsPath}`);
    process.exit(1);
  }

  let results: RunResult[];
  try {
    results = parseResults(readFileSync(resultsPath, "utf-8"));
  } catch (e: unknown) {
    const reason = e instanceof Error ? e.message : String(e);
    console.error(`Failed to read ${resultsPath}: ${reason}`);
    process.exit(1);
  }

  const total = results.length;

  console.log("=== SWE-bench Run Analysis ===\n");
  console.log(`Total tasks: ${total}`);

  // With no records every average/min/max below would be NaN or +/-Infinity,
  // so stop here instead of printing meaningless numbers.
  if (total === 0) {
    console.log("No results to analyze.");
    return;
  }

  const patched = results.filter((r) => r.success).length;
  const failed = total - patched;
  const errors = results.filter((r) => r.error).length;

  // Single pass instead of Math.max(...durations): spreading a large array
  // into call arguments can exceed the engine's argument limit.
  let durationSum = 0;
  let minDuration = Infinity;
  let maxDuration = -Infinity;
  for (const r of results) {
    durationSum += r.duration_ms;
    if (r.duration_ms < minDuration) minDuration = r.duration_ms;
    if (r.duration_ms > maxDuration) maxDuration = r.duration_ms;
  }
  const avgDuration = durationSum / total;

  const patchSizes = results
    .filter((r) => r.success)
    .map((r) => r.patch.length);
  const avgPatchSize =
    patchSizes.length > 0
      ? patchSizes.reduce((a, b) => a + b, 0) / patchSizes.length
      : 0;

  console.log(`Patched: ${patched} (${((patched / total) * 100).toFixed(1)}%)`);
  console.log(`No patch: ${failed}`);
  console.log(`Errors: ${errors}`);
  console.log();
  console.log(`Avg duration: ${formatSeconds(avgDuration)}`);
  console.log(`Min duration: ${formatSeconds(minDuration)}`);
  console.log(`Max duration: ${formatSeconds(maxDuration)}`);
  console.log(`Avg patch size: ${(avgPatchSize / 1024).toFixed(1)}KB`);

  // Error breakdown
  if (errors > 0) {
    printErrorBreakdown(results);
  }

  // Per-repo breakdown
  printRepoBreakdown(results);

  // Slowest tasks
  printSlowestTasks(results);

  // Session IDs for further analysis
  const dataDir =
    process.env.SMC_DATA_DIR ||
    join(process.env.HOME || "~", ".swe-bench-eval");
  console.log(`\n--- Run Logs ---`);
  console.log(`Session data: ${dataDir}/sessions/`);
  console.log(`View a session's run log:`);
  console.log(` cat ${dataDir}/sessions/<session_id>/run-log.jsonl | head -20`);
}

main();