From 90d374ffd57a4bd3b75f6a1db86c2e3b2b852441 Mon Sep 17 00:00:00 2001
From: Jiayuan Zhang
Date: Sun, 15 Feb 2026 18:05:17 +0800
Subject: [PATCH] feat(scripts): add SWE-bench runner for Multica agent evaluation

- download-dataset.py: fetches SWE-bench Lite/Verified/Full from HuggingFace
- run.ts: core runner that clones repos, runs Agent, collects git diff patches
- evaluate.sh: wrapper for official SWE-bench Docker evaluation harness
- analyze.ts: summarizes run results with per-repo and timing breakdowns

Co-Authored-By: Claude Opus 4.6
---
Review note: the blob ids on the "index" lines below were computed for the
original file contents and no longer match after the review edits.

 scripts/swe-bench/.gitignore          |   5 +
 scripts/swe-bench/analyze.ts          | 158 +++++++++
 scripts/swe-bench/download-dataset.py | 110 ++++++
 scripts/swe-bench/evaluate.sh         |  82 +++++
 scripts/swe-bench/run.ts              | 479 ++++++++++++++++++++++++++
 5 files changed, 834 insertions(+)
 create mode 100644 scripts/swe-bench/.gitignore
 create mode 100644 scripts/swe-bench/analyze.ts
 create mode 100755 scripts/swe-bench/download-dataset.py
 create mode 100755 scripts/swe-bench/evaluate.sh
 create mode 100644 scripts/swe-bench/run.ts

diff --git a/scripts/swe-bench/.gitignore b/scripts/swe-bench/.gitignore
new file mode 100644
index 00000000..eef1d884
--- /dev/null
+++ b/scripts/swe-bench/.gitignore
@@ -0,0 +1,5 @@
+# Downloaded datasets
+*.jsonl
+
+# Don't ignore the scripts themselves
+!.gitignore
diff --git a/scripts/swe-bench/analyze.ts b/scripts/swe-bench/analyze.ts
new file mode 100644
index 00000000..e83d8d2e
--- /dev/null
+++ b/scripts/swe-bench/analyze.ts
@@ -0,0 +1,158 @@
+#!/usr/bin/env tsx
+/**
+ * Analyze SWE-bench run results.
+ *
+ * Reads the .results.jsonl file produced by run.ts and prints a summary.
+ *
+ * Usage:
+ *   tsx scripts/swe-bench/analyze.ts [results.jsonl]
+ */
+
+import { readFileSync, existsSync } from "node:fs";
+import { homedir } from "node:os";
+import { resolve, join } from "node:path";
+
+/** One record of the .results.jsonl file written by run.ts. */
+interface RunResult {
+  instance_id: string;
+  success: boolean;
+  patch: string;
+  error?: string;
+  duration_ms: number;
+  session_id: string;
+}
+
+/** Per-repository counters for the "By Repository" table. */
+interface RepoStats {
+  total: number;
+  patched: number;
+}
+
+/**
+ * Derive "owner/repo" from an instance id such as "django__django-16379":
+ * drop the trailing "-<number>" and turn the first "__" into "/".
+ */
+function repoOf(instanceId: string): string {
+  return instanceId.replace(/-\d+$/, "").replace("__", "/") || "unknown";
+}
+
+/**
+ * Parse JSONL text into results. Blank lines are skipped; a malformed line
+ * aborts with its line number instead of a bare JSON.parse stack trace.
+ */
+function parseResults(text: string, source: string): RunResult[] {
+  const results: RunResult[] = [];
+  text.split("\n").forEach((raw, index) => {
+    const line = raw.trim();
+    if (!line) return;
+    try {
+      results.push(JSON.parse(line) as RunResult);
+    } catch (err) {
+      const reason = err instanceof Error ? err.message : String(err);
+      console.error(`Invalid JSON at ${source}:${index + 1}: ${reason}`);
+      process.exit(1);
+    }
+  });
+  return results;
+}
+
+/** Print the run summary for the results file given as argv[2]. */
+function main() {
+  const resultsPath = resolve(
+    process.argv[2] || "scripts/swe-bench/predictions.results.jsonl",
+  );
+
+  if (!existsSync(resultsPath)) {
+    console.error(`Results file not found: ${resultsPath}`);
+    process.exit(1);
+  }
+
+  const results = parseResults(readFileSync(resultsPath, "utf-8"), resultsPath);
+
+  const total = results.length;
+  if (total === 0) {
+    // Every statistic below divides by `total` or takes min/max of the list.
+    console.error(`No results found in ${resultsPath}`);
+    process.exit(1);
+  }
+
+  const patched = results.filter((r) => r.success).length;
+  const failed = results.filter((r) => !r.success).length;
+  const errors = results.filter((r) => r.error).length;
+  const durations = results.map((r) => r.duration_ms);
+  const avgDuration = durations.reduce((a, b) => a + b, 0) / total;
+  const maxDuration = Math.max(...durations);
+  const minDuration = Math.min(...durations);
+  const patchSizes = results
+    .filter((r) => r.success)
+    .map((r) => r.patch.length);
+  const avgPatchSize =
+    patchSizes.length > 0
+      ? patchSizes.reduce((a, b) => a + b, 0) / patchSizes.length
+      : 0;
+
+  console.log("=== SWE-bench Run Analysis ===\n");
+  console.log(`Total tasks: ${total}`);
+  console.log(`Patched: ${patched} (${((patched / total) * 100).toFixed(1)}%)`);
+  console.log(`No patch: ${failed}`);
+  console.log(`Errors: ${errors}`);
+  console.log();
+  console.log(`Avg duration: ${(avgDuration / 1000).toFixed(1)}s`);
+  console.log(`Min duration: ${(minDuration / 1000).toFixed(1)}s`);
+  console.log(`Max duration: ${(maxDuration / 1000).toFixed(1)}s`);
+  console.log(`Avg patch size: ${(avgPatchSize / 1024).toFixed(1)}KB`);
+
+  // Error breakdown
+  if (errors > 0) {
+    console.log("\n--- Errors ---");
+    const errorCounts = new Map<string, number>();
+    for (const r of results) {
+      if (r.error) {
+        const key = r.error.length > 60 ? r.error.slice(0, 60) + "..." : r.error;
+        errorCounts.set(key, (errorCounts.get(key) || 0) + 1);
+      }
+    }
+    for (const [err, count] of [...errorCounts.entries()].sort(
+      (a, b) => b[1] - a[1],
+    )) {
+      console.log(` ${count}x ${err}`);
+    }
+  }
+
+  // Per-repo breakdown
+  console.log("\n--- By Repository ---");
+  const repoStats = new Map<string, RepoStats>();
+  for (const r of results) {
+    const repo = repoOf(r.instance_id);
+    const stats = repoStats.get(repo) || { total: 0, patched: 0 };
+    stats.total++;
+    if (r.success) stats.patched++;
+    repoStats.set(repo, stats);
+  }
+  for (const [repo, stats] of [...repoStats.entries()].sort(
+    (a, b) => b[1].total - a[1].total,
+  )) {
+    const pct = ((stats.patched / stats.total) * 100).toFixed(0);
+    console.log(
+      ` ${repo.padEnd(30)} ${stats.patched}/${stats.total} (${pct}%)`,
+    );
+  }
+
+  // Slowest tasks
+  console.log("\n--- Slowest Tasks ---");
+  const sorted = [...results].sort((a, b) => b.duration_ms - a.duration_ms);
+  for (const r of sorted.slice(0, 5)) {
+    console.log(
+      ` ${(r.duration_ms / 1000).toFixed(1)}s ${r.instance_id} ${r.success ? "PATCHED" : "NO_PATCH"}`,
+    );
+  }
+
+  // Session IDs for further analysis
+  const dataDir = process.env.SMC_DATA_DIR || join(homedir(), ".swe-bench-eval");
+  console.log(`\n--- Run Logs ---`);
+  console.log(`Session data: ${dataDir}/sessions/`);
+  console.log(`View a session's run log:`);
+  console.log(` cat ${dataDir}/sessions/<session-id>/run-log.jsonl | head -20`);
+}
+
+main();
diff --git a/scripts/swe-bench/download-dataset.py b/scripts/swe-bench/download-dataset.py
new file mode 100755
index 00000000..517bfa71
--- /dev/null
+++ b/scripts/swe-bench/download-dataset.py
@@ -0,0 +1,110 @@
+#!/usr/bin/env python3
+"""
+Download SWE-bench dataset from HuggingFace and export to JSONL for the Node.js runner.
+
+Usage:
+    pip install datasets
+    python scripts/swe-bench/download-dataset.py [--dataset verified|lite|full] [--limit N] [--output PATH]
+
+Output format (one JSON object per line):
+    {
+        "instance_id": "django__django-16379",
+        "repo": "django/django",
+        "base_commit": "abc123...",
+        "problem_statement": "...",
+        "hints_text": "...",
+        "patch": "...",        # gold patch (for reference, not shown to agent)
+        "test_patch": "...",   # test patch applied during evaluation
+        "version": "4.2",
+        "environment_setup_commit": "..."
+    }
+"""
+
+import argparse
+import json
+import os
+import sys
+
+DATASET_MAP = {
+    "verified": "princeton-nlp/SWE-bench_Verified",
+    "lite": "princeton-nlp/SWE-bench_Lite",
+    "full": "princeton-nlp/SWE-bench",
+}
+
+
+def main():
+    """Parse CLI options, download the selected split, and write it as JSONL."""
+    parser = argparse.ArgumentParser(description="Download SWE-bench dataset to JSONL")
+    parser.add_argument(
+        "--dataset",
+        choices=["verified", "lite", "full"],
+        default="lite",
+        help="Dataset variant (default: lite)",
+    )
+    parser.add_argument(
+        "--limit", type=int, default=0, help="Limit number of instances (0 = all)"
+    )
+    parser.add_argument(
+        "--output",
+        type=str,
+        default=None,
+        help="Output JSONL path (default: scripts/swe-bench/<dataset>.jsonl)",
+    )
+    parser.add_argument(
+        "--split",
+        type=str,
+        default="test",
+        help="Dataset split (default: test)",
+    )
+    args = parser.parse_args()
+
+    if args.limit < 0:
+        parser.error("--limit must be >= 0")
+
+    try:
+        from datasets import load_dataset
+    except ImportError:
+        print("Error: 'datasets' package not installed. Run: pip install datasets", file=sys.stderr)
+        sys.exit(1)
+
+    dataset_name = DATASET_MAP[args.dataset]
+    output_path = args.output or f"scripts/swe-bench/{args.dataset}.jsonl"
+
+    print(f"Downloading {dataset_name} (split={args.split})...", file=sys.stderr)
+    ds = load_dataset(dataset_name, split=args.split)
+
+    # Fields to keep
+    keep_fields = [
+        "instance_id",
+        "repo",
+        "base_commit",
+        "problem_statement",
+        "hints_text",
+        "patch",
+        "test_patch",
+        "version",
+        "environment_setup_commit",
+    ]
+
+    # Create the target directory so a custom --output path does not fail.
+    os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True)
+
+    count = 0
+    # Explicit UTF-8: records are written with ensure_ascii=False, so the
+    # platform default encoding could fail on non-ASCII issue text.
+    with open(output_path, "w", encoding="utf-8") as f:
+        for item in ds:
+            record = {}
+            for field in keep_fields:
+                if field in item:
+                    record[field] = item[field]
+            f.write(json.dumps(record, ensure_ascii=False) + "\n")
+            count += 1
+            if args.limit and count >= args.limit:
+                break
+
+    print(f"Wrote {count} instances to {output_path}", file=sys.stderr)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/scripts/swe-bench/evaluate.sh b/scripts/swe-bench/evaluate.sh
new file mode 100755
index 00000000..8679622a
--- /dev/null
+++ b/scripts/swe-bench/evaluate.sh
@@ -0,0 +1,82 @@
+#!/usr/bin/env bash
+#
+# Evaluate Multica predictions against SWE-bench using the official Docker harness.
+#
+# Prerequisites:
+#   pip install swebench
+#   Docker running with at least 120GB storage, 16GB RAM, 8 CPU cores
+#
+# Usage:
+#   bash scripts/swe-bench/evaluate.sh [predictions.jsonl] [dataset] [run_id]
+#
+# Environment:
+#   PYTHON       Python interpreter to run the harness with (default: python)
+#   MAX_WORKERS  Value passed to --max_workers (default: 4)
+#
+# Examples:
+#   bash scripts/swe-bench/evaluate.sh
+#   bash scripts/swe-bench/evaluate.sh scripts/swe-bench/predictions.jsonl lite multica-v1
+
+set -euo pipefail
+
+PREDICTIONS="${1:-scripts/swe-bench/predictions.jsonl}"
+DATASET="${2:-lite}"
+RUN_ID="${3:-multica}"
+PYTHON="${PYTHON:-python}"
+MAX_WORKERS="${MAX_WORKERS:-4}"
+
+# Map short names to HuggingFace dataset names
+case "$DATASET" in
+  lite) DATASET_NAME="princeton-nlp/SWE-bench_Lite" ;;
+  verified) DATASET_NAME="princeton-nlp/SWE-bench_Verified" ;;
+  full) DATASET_NAME="princeton-nlp/SWE-bench" ;;
+  *) DATASET_NAME="$DATASET" ;;
+esac
+
+echo "=== SWE-bench Evaluation ==="
+echo "Predictions: $PREDICTIONS"
+echo "Dataset: $DATASET_NAME"
+echo "Run ID: $RUN_ID"
+echo ""
+
+if [ ! -f "$PREDICTIONS" ]; then
+  echo "Error: Predictions file not found: $PREDICTIONS" >&2
+  exit 1
+fi
+
+# Count non-blank lines. `wc -l` counts newline characters, so it misses a last
+# record that has no trailing newline. grep -c exits 1 when nothing matches,
+# hence `|| true` to keep `set -e` from aborting before the check below.
+TASK_COUNT=$(grep -c '[^[:space:]]' "$PREDICTIONS" || true)
+echo "Tasks to evaluate: ${TASK_COUNT:-0}"
+echo ""
+
+if [ "${TASK_COUNT:-0}" -eq 0 ]; then
+  echo "Error: No predictions found in $PREDICTIONS" >&2
+  exit 1
+fi
+
+# Check if swebench is installed
+if ! "$PYTHON" -c "import swebench" 2>/dev/null; then
+  echo "Error: swebench not installed. Run: pip install swebench" >&2
+  exit 1
+fi
+
+# Check if Docker is running
+if ! docker info >/dev/null 2>&1; then
+  echo "Error: Docker is not running" >&2
+  exit 1
+fi
+
+echo "Starting evaluation (this may take a while)..."
+echo ""
+
+"$PYTHON" -m swebench.harness.run_evaluation \
+  --dataset_name "$DATASET_NAME" \
+  --predictions_path "$PREDICTIONS" \
+  --max_workers "$MAX_WORKERS" \
+  --run_id "$RUN_ID"
+
+echo ""
+echo "=== Evaluation Complete ==="
+echo "Check logs/ and evaluation_results/ for detailed results."
diff --git a/scripts/swe-bench/run.ts b/scripts/swe-bench/run.ts
new file mode 100644
index 00000000..5e6c4b40
--- /dev/null
+++ b/scripts/swe-bench/run.ts
@@ -0,0 +1,479 @@
+#!/usr/bin/env tsx
+/**
+ * SWE-bench Runner for Multica
+ *
+ * Runs the Multica agent against SWE-bench task instances and collects patches.
+ *
+ * Usage:
+ *   tsx scripts/swe-bench/run.ts [options]
+ *
+ * Options:
+ *   --dataset PATH    Path to JSONL dataset (default: scripts/swe-bench/lite.jsonl)
+ *   --provider NAME   LLM provider (default: kimi-coding)
+ *   --model NAME      Model name
+ *   --limit N         Max tasks to run (default: all)
+ *   --offset N        Skip first N tasks (default: 0)
+ *   --output PATH     Output predictions JSONL (default: scripts/swe-bench/predictions.jsonl)
+ *   --workdir PATH    Working directory for repos (default: /tmp/swe-bench)
+ *   --timeout MS      Timeout per task in ms (default: 300000 = 5min)
+ *   --instance ID     Run a single instance by ID
+ *   --debug           Enable debug logging
+ */
+
+import { readFileSync, writeFileSync, appendFileSync, existsSync, mkdirSync, rmSync } from "node:fs";
+import { homedir } from "node:os";
+import { dirname, join, resolve } from "node:path";
+import { execFileSync, execSync, spawn } from "node:child_process";
+import { Agent } from "@multica/core";
+import type { AgentOptions } from "@multica/core";
+
+// ============================================================
+// Types
+// ============================================================
+
+/** One SWE-bench instance as exported by download-dataset.py. */
+interface SWEBenchTask {
+  instance_id: string;
+  repo: string;
+  base_commit: string;
+  problem_statement: string;
+  hints_text?: string;
+  patch?: string;
+  test_patch?: string;
+  version?: string;
+  environment_setup_commit?: string;
+}
+
+/** One line of the predictions file, in SWE-bench format. */
+interface Prediction {
+  instance_id: string;
+  model_patch: string;
+  model_name_or_path: string;
+}
+
+/** One line of the detailed .results.jsonl file (read back by analyze.ts). */
+interface RunResult {
+  instance_id: string;
+  success: boolean;
+  patch: string;
+  error?: string;
+  duration_ms: number;
+  session_id: string;
+}
+
+// ============================================================
+// CLI argument parsing
+// ============================================================
+
+interface RunOptions {
+  dataset: string;
+  provider: string;
+  model?: string;
+  limit: number;
+  offset: number;
+  output: string;
+  workdir: string;
+  timeout: number;
+  instance?: string;
+  debug: boolean;
+}
+
+/** Print a usage error and exit; typed `never` so callers can `return` it. */
+function usageError(message: string): never {
+  console.error(message);
+  process.exit(1);
+}
+
+/**
+ * Parse process.argv into RunOptions.
+ *
+ * Exits with status 1 on an unknown flag, a flag that is missing its value,
+ * or a numeric option that is not a non-negative integer.
+ */
+function parseArgs(): RunOptions {
+  const args = process.argv.slice(2);
+  const opts: RunOptions = {
+    dataset: "scripts/swe-bench/lite.jsonl",
+    provider: "kimi-coding",
+    limit: 0,
+    offset: 0,
+    output: "scripts/swe-bench/predictions.jsonl",
+    workdir: "/tmp/swe-bench",
+    timeout: 300_000, // 5 minutes
+    debug: false,
+  };
+
+  for (let i = 0; i < args.length; i++) {
+    const arg = args[i]!;
+
+    // Consume the value that follows the current flag.
+    const value = (): string => {
+      const next = args[++i];
+      if (next === undefined) return usageError(`Missing value for ${arg}`);
+      return next;
+    };
+    // Consume the next value as a non-negative integer. parseInt alone would
+    // turn a missing or malformed value into NaN, and a NaN timeout makes
+    // setTimeout fire immediately.
+    const intValue = (): number => {
+      const raw = value();
+      if (!/^\d+$/.test(raw)) {
+        return usageError(`Invalid value for ${arg}: ${raw} (expected a non-negative integer)`);
+      }
+      return parseInt(raw, 10);
+    };
+
+    if (arg === "--dataset") opts.dataset = value();
+    else if (arg === "--provider") opts.provider = value();
+    else if (arg === "--model") opts.model = value();
+    else if (arg === "--limit") opts.limit = intValue();
+    else if (arg === "--offset") opts.offset = intValue();
+    else if (arg === "--output") opts.output = value();
+    else if (arg === "--workdir") opts.workdir = value();
+    else if (arg === "--timeout") opts.timeout = intValue();
+    else if (arg === "--instance") opts.instance = value();
+    else if (arg === "--debug") opts.debug = true;
+    else usageError(`Unknown argument: ${arg}`);
+  }
+
+  if (opts.timeout === 0) usageError("--timeout must be greater than 0");
+
+  return opts;
+}
+
+// ============================================================
+// Dataset loading
+// ============================================================
+
+/** Load a JSONL dataset; exits with a hint when the file does not exist. */
+function loadDataset(path: string): SWEBenchTask[] {
+  if (!existsSync(path)) {
+    console.error(`Dataset not found: ${path}`);
+    console.error("Run: python scripts/swe-bench/download-dataset.py");
+    process.exit(1);
+  }
+  const lines = readFileSync(path, "utf-8")
+    .split("\n")
+    .map((line) => line.trim())
+    .filter(Boolean);
+  return lines.map((line) => JSON.parse(line) as SWEBenchTask);
+}
+
+// ============================================================
+// Repository setup
+// ============================================================
+
+/**
+ * Run git with an argument array and return its stdout.
+ *
+ * execFileSync does not spawn a shell, so dataset-supplied values (repo name,
+ * commit hash, instance id) cannot be interpreted as shell syntax.
+ */
+function git(args: string[], cwd: string | undefined, timeout: number): string {
+  return execFileSync("git", args, {
+    cwd,
+    encoding: "utf-8",
+    stdio: ["ignore", "pipe", "pipe"],
+    maxBuffer: 10 * 1024 * 1024, // 10MB
+    timeout,
+  });
+}
+
+/**
+ * Prepare a checkout of task.repo at task.base_commit under workdir.
+ *
+ * An existing checkout is force-reset and cleaned; otherwise the repo is
+ * cloned from GitHub. Throws when a git command fails or times out.
+ *
+ * @returns Path of the prepared repository directory.
+ */
+function setupRepo(task: SWEBenchTask, workdir: string): string {
+  const repoDir = join(workdir, task.instance_id.replace(/\//g, "__"));
+
+  if (existsSync(repoDir)) {
+    // Reset existing repo to base commit
+    log(` Resetting existing repo to ${task.base_commit.slice(0, 8)}...`);
+    git(["checkout", "-f", task.base_commit], repoDir, 60_000);
+    git(["clean", "-fdx"], repoDir, 60_000);
+  } else {
+    // Clone from GitHub
+    const repoUrl = `https://github.com/${task.repo}.git`;
+    log(` Cloning ${task.repo}...`);
+    mkdirSync(workdir, { recursive: true });
+    try {
+      git(["clone", "--quiet", repoUrl, repoDir], undefined, 120_000);
+      git(["checkout", "-f", task.base_commit], repoDir, 30_000);
+    } catch (err) {
+      // Remove a partial clone: a leftover directory would be mistaken for a
+      // usable checkout on the next run.
+      rmSync(repoDir, { recursive: true, force: true });
+      throw err;
+    }
+  }
+
+  return repoDir;
+}
+
+// ============================================================
+// System prompt
+// ============================================================
+
+/** Build the system prompt: fixed instructions plus the repo and commit. */
+function buildSystemPrompt(task: SWEBenchTask): string {
+  return `You are an expert software engineer tasked with fixing a bug in an open-source repository.
+
+## Instructions
+
+1. Read the issue description carefully and understand the problem.
+2. Explore the repository to find the relevant source code.
+3. Identify the root cause of the issue.
+4. Make the minimal set of changes to fix the issue. Do NOT add tests.
+5. After making changes, verify your fix makes sense.
+
+## Important Rules
+
+- Make ONLY the changes necessary to fix the described issue.
+- Do NOT modify or add any test files.
+- Do NOT add comments explaining the fix unless the code is non-obvious.
+- Do NOT refactor unrelated code.
+- Keep changes minimal and focused.
+
+## Repository
+
+This is the \`${task.repo}\` repository checked out at commit \`${task.base_commit.slice(0, 12)}\`.`;
+}
+
+/** Build the user prompt: the issue text, optional hints, and a closing reminder. */
+function buildPrompt(task: SWEBenchTask): string {
+  let prompt = `## Issue\n\n${task.problem_statement}`;
+  if (task.hints_text) {
+    prompt += `\n\n## Hints\n\n${task.hints_text}`;
+  }
+  prompt += `\n\nPlease fix this issue. Remember: make minimal changes, do not modify tests.`;
+  return prompt;
+}
+
+// ============================================================
+// Run a single task
+// ============================================================
+
+/** Message of a thrown value, whether or not it is an Error. */
+function errorMessage(err: unknown): string {
+  return err instanceof Error ? err.message : String(err);
+}
+
+/**
+ * Return everything the agent changed relative to baseCommit as one diff.
+ *
+ * Stages the whole working tree first so that new (untracked) files and
+ * already-staged edits are included; a plain `git diff` reports neither.
+ * This also stages build artifacts that are not gitignored.
+ * Returns "" when git fails, which the caller records as "no patch".
+ */
+function collectPatch(repoDir: string, baseCommit: string): string {
+  try {
+    git(["add", "-A"], repoDir, 60_000);
+    return git(["diff", "--cached", baseCommit], repoDir, 10_000);
+  } catch (err) {
+    log(` Failed to collect patch: ${errorMessage(err)}`);
+    return "";
+  }
+}
+
+/**
+ * Run the agent on one task and collect the resulting patch.
+ *
+ * Repo setup failures, agent errors and timeouts are reported through
+ * RunResult.error instead of being thrown, so one bad task does not abort the run.
+ */
+async function runTask(
+  task: SWEBenchTask,
+  opts: RunOptions,
+): Promise<RunResult> {
+  const start = Date.now();
+
+  // Setup repo
+  let repoDir: string;
+  try {
+    repoDir = setupRepo(task, opts.workdir);
+  } catch (err) {
+    return {
+      instance_id: task.instance_id,
+      success: false,
+      patch: "",
+      error: `repo setup failed: ${errorMessage(err)}`,
+      duration_ms: Date.now() - start,
+      session_id: "",
+    };
+  }
+
+  // Create agent
+  const agentOptions: AgentOptions = {
+    provider: opts.provider,
+    model: opts.model,
+    cwd: repoDir,
+    enableRunLog: true,
+    debug: opts.debug,
+    systemPrompt: buildSystemPrompt(task),
+    enableSkills: false,
+    tools: {
+      // Only allow coding tools — no web, no cron, no sessions
+      deny: ["web_fetch", "web_search", "cron", "data", "sessions_spawn", "sessions_list", "memory_search", "send_file"],
+    },
+  };
+
+  const agent = new Agent(agentOptions);
+
+  log(` Session: ${agent.sessionId}`);
+
+  let timer: ReturnType<typeof setTimeout> | undefined;
+  try {
+    // Run agent with timeout
+    // NOTE(review): the race only stops waiting; the agent itself is not
+    // cancelled and may keep editing repoDir after a timeout. Cancel it here
+    // if Agent exposes a way to do so — TODO confirm against @multica/core.
+    const result = await Promise.race([
+      agent.run(buildPrompt(task)),
+      new Promise<never>((_, reject) => {
+        timer = setTimeout(() => reject(new Error("timeout")), opts.timeout);
+      }),
+    ]);
+
+    const patch = collectPatch(repoDir, task.base_commit);
+    return {
+      instance_id: task.instance_id,
+      success: patch.length > 0,
+      patch,
+      error: result.error,
+      duration_ms: Date.now() - start,
+      session_id: agent.sessionId,
+    };
+  } catch (err) {
+    // Collect any partial patch
+    const patch = collectPatch(repoDir, task.base_commit);
+    return {
+      instance_id: task.instance_id,
+      success: false,
+      patch,
+      error: errorMessage(err),
+      duration_ms: Date.now() - start,
+      session_id: agent.sessionId,
+    };
+  } finally {
+    // A pending timer would keep the process alive for up to opts.timeout
+    // after the last task has finished.
+    clearTimeout(timer);
+  }
+}
+
+// ============================================================
+// Logging
+// ============================================================
+
+/** Write a timestamped (HH:MM:SS, UTC) progress line to stderr. */
+function log(msg: string) {
+  const ts = new Date().toISOString().slice(11, 19);
+  console.error(`[${ts}] ${msg}`);
+}
+
+// ============================================================
+// Main
+// ============================================================
+
+/** Parse options, select tasks, run them one at a time, and append results. */
+async function main() {
+  const opts = parseArgs();
+
+  log("SWE-bench Runner for Multica");
+  log(`Provider: ${opts.provider}${opts.model ? ` (${opts.model})` : ""}`);
+  log(`Dataset: ${opts.dataset}`);
+  log(`Work dir: ${opts.workdir}`);
+  log(`Timeout: ${opts.timeout / 1000}s per task`);
+
+  // Set SMC_DATA_DIR for isolation
+  // NOTE(review): this runs after @multica/core has been imported; it only
+  // takes effect if the library reads the variable lazily — TODO confirm.
+  if (!process.env.SMC_DATA_DIR) {
+    process.env.SMC_DATA_DIR = join(homedir(), ".swe-bench-eval");
+    log(`SMC_DATA_DIR: ${process.env.SMC_DATA_DIR}`);
+  }
+
+  // Load dataset
+  let tasks = loadDataset(resolve(opts.dataset));
+  log(`Loaded ${tasks.length} tasks`);
+
+  // Filter by instance ID if specified
+  if (opts.instance) {
+    tasks = tasks.filter((t) => t.instance_id === opts.instance);
+    if (tasks.length === 0) {
+      console.error(`Instance not found: ${opts.instance}`);
+      process.exit(1);
+    }
+  }
+
+  // Apply offset and limit
+  if (opts.offset > 0) {
+    tasks = tasks.slice(opts.offset);
+  }
+  if (opts.limit > 0) {
+    tasks = tasks.slice(0, opts.limit);
+  }
+
+  log(`Running ${tasks.length} tasks`);
+
+  // Prepare output
+  const outputPath = resolve(opts.output);
+  // Swap only a trailing ".jsonl". String.replace would rewrite the first
+  // ".jsonl" anywhere in the path, and would leave resultsPath equal to
+  // outputPath when the name has no such suffix.
+  const resultsPath = outputPath.endsWith(".jsonl")
+    ? `${outputPath.slice(0, -".jsonl".length)}.results.jsonl`
+    : `${outputPath}.results.jsonl`;
+  mkdirSync(dirname(outputPath), { recursive: true });
+
+  // Run tasks sequentially
+  // Results are appended, so an earlier run's lines in the same files are kept.
+  const modelName = `multica-${opts.provider}${opts.model ? `-${opts.model}` : ""}`;
+  let completed = 0;
+  let succeeded = 0;
+
+  for (const task of tasks) {
+    completed++;
+    log(`\n[${completed}/${tasks.length}] ${task.instance_id}`);
+
+    const result = await runTask(task, opts);
+
+    if (result.success) succeeded++;
+
+    // Write prediction in SWE-bench format
+    const prediction: Prediction = {
+      instance_id: result.instance_id,
+      model_patch: result.patch,
+      model_name_or_path: modelName,
+    };
+    appendFileSync(outputPath, JSON.stringify(prediction) + "\n");
+
+    // Write detailed result
+    appendFileSync(resultsPath, JSON.stringify(result) + "\n");
+
+    const status = result.success ? "PATCHED" : "NO_PATCH";
+    const errorInfo = result.error ? ` (${result.error})` : "";
+    log(
+      ` ${status} | ${(result.duration_ms / 1000).toFixed(1)}s | patch=${result.patch.length} bytes${errorInfo}`,
+    );
+  }
+
+  log(`\n========================================`);
+  log(`Results: ${succeeded}/${completed} tasks produced patches`);
+  log(`Predictions: ${outputPath}`);
+  log(`Details: ${resultsPath}`);
+  log(`\nTo evaluate with SWE-bench harness:`);
+  log(
+    ` bash scripts/swe-bench/evaluate.sh ${outputPath} <lite|verified|full> multica`,
+  );
+}
+
+main().catch((err) => {
+  console.error("Fatal error:", err);
+  process.exit(1);
+});