feat(scripts): add SWE-bench runner for Multica agent evaluation

- download-dataset.py: fetches SWE-bench Lite/Verified/Full from HuggingFace
- run.ts: core runner that clones repos, runs Agent, collects git diff patches
- evaluate.sh: wrapper for official SWE-bench Docker evaluation harness
- analyze.ts: summarizes run results with per-repo and timing breakdowns

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-15 18:05:17 +08:00 · 2026-02-15 18:05:17 +08:00 · 90d374ffd5
commit 90d374ffd5
parent 47f8e621c8
5 changed files with 681 additions and 0 deletions
--- a/scripts/swe-bench/.gitignore
+++ b/scripts/swe-bench/.gitignore
@ -0,0 +1,5 @@
+# Downloaded datasets and generated predictions/results (all JSONL files in this directory)
+*.jsonl
+
+# Keep this .gitignore file itself tracked
+!.gitignore
--- a/scripts/swe-bench/analyze.ts
+++ b/scripts/swe-bench/analyze.ts
@ -0,0 +1,116 @@
+#!/usr/bin/env tsx
+/**
+ * Analyze SWE-bench run results.
+ *
+ * Reads the .results.jsonl file produced by run.ts and prints a summary.
+ *
+ * Usage:
+ *   tsx scripts/swe-bench/analyze.ts [results.jsonl]
+ */
+
+import { readFileSync, existsSync } from "node:fs";
+import { resolve, join } from "node:path";
+
+/** One per-task record from the `.results.jsonl` file produced by run.ts. */
+interface RunResult {
+  /** SWE-bench instance identifier, e.g. "django__django-16379". */
+  instance_id: string;
+  /** True when the agent run completed and left a non-empty patch. */
+  success: boolean;
+  /** Patch text collected from the repository after the run; may be empty. */
+  patch: string;
+  /** Error reported by the agent or thrown during the run (e.g. "timeout"), if any. */
+  error?: string;
+  /** Wall-clock time spent on the task, in milliseconds. */
+  duration_ms: number;
+  /** Agent session ID; names the directory under `<data dir>/sessions/` holding the run log. */
+  session_id: string;
+}
+
+/**
+ * Derive the "owner/name" repository slug from a SWE-bench instance ID.
+ *
+ * Instance IDs look like "scikit-learn__scikit-learn-25500": the repo slug with
+ * "/" replaced by "__", followed by "-<number>". Returns "unknown" for an empty result.
+ */
+function repoFromInstanceId(instanceId: string): string {
+  const repo = instanceId.replace(/-\d+$/, "").replace(/__/g, "/");
+  return repo || "unknown";
+}
+
+/**
+ * Read a `.results.jsonl` file and print a summary to stdout.
+ *
+ * The path is taken from the first CLI argument, defaulting to
+ * `scripts/swe-bench/predictions.results.jsonl`. Prints overall counts, duration
+ * and patch-size statistics, an error breakdown, a per-repository breakdown, the
+ * five slowest tasks, and where session run logs are stored.
+ *
+ * Exits with code 1 when the file does not exist. An empty file prints a short
+ * notice instead of NaN/Infinity statistics.
+ */
+function main() {
+  const resultsPath = resolve(
+    process.argv[2] || "scripts/swe-bench/predictions.results.jsonl",
+  );
+
+  if (!existsSync(resultsPath)) {
+    console.error(`Results file not found: ${resultsPath}`);
+    process.exit(1);
+  }
+
+  const lines = readFileSync(resultsPath, "utf-8").split("\n").filter(Boolean);
+  const results: RunResult[] = lines.map((l) => JSON.parse(l));
+
+  const total = results.length;
+  if (total === 0) {
+    // Every statistic below divides by `total` or takes min/max of the list.
+    console.log(`No results to analyze in ${resultsPath}`);
+    return;
+  }
+
+  const patched = results.filter((r) => r.success).length;
+  const failed = total - patched;
+  const errors = results.filter((r) => r.error).length;
+  const durations = results.map((r) => r.duration_ms);
+  const avgDuration = durations.reduce((a, b) => a + b, 0) / total;
+  const maxDuration = Math.max(...durations);
+  const minDuration = Math.min(...durations);
+  const patchSizes = results
+    .filter((r) => r.success)
+    .map((r) => r.patch.length);
+  const avgPatchSize =
+    patchSizes.length > 0
+      ? patchSizes.reduce((a, b) => a + b, 0) / patchSizes.length
+      : 0;
+
+  console.log("=== SWE-bench Run Analysis ===\n");
+  console.log(`Total tasks:     ${total}`);
+  console.log(`Patched:         ${patched} (${((patched / total) * 100).toFixed(1)}%)`);
+  console.log(`No patch:        ${failed}`);
+  console.log(`Errors:          ${errors}`);
+  console.log();
+  console.log(`Avg duration:    ${(avgDuration / 1000).toFixed(1)}s`);
+  console.log(`Min duration:    ${(minDuration / 1000).toFixed(1)}s`);
+  console.log(`Max duration:    ${(maxDuration / 1000).toFixed(1)}s`);
+  console.log(`Avg patch size:  ${(avgPatchSize / 1024).toFixed(1)}KB`);
+
+  // Error breakdown, grouped by the first 60 characters of the message
+  if (errors > 0) {
+    console.log("\n--- Errors ---");
+    const errorCounts = new Map<string, number>();
+    for (const r of results) {
+      if (r.error) {
+        const key = r.error.length > 60 ? r.error.slice(0, 60) + "..." : r.error;
+        errorCounts.set(key, (errorCounts.get(key) ?? 0) + 1);
+      }
+    }
+    for (const [err, count] of [...errorCounts.entries()].sort(
+      (a, b) => b[1] - a[1],
+    )) {
+      console.log(`  ${count}x  ${err}`);
+    }
+  }
+
+  // Per-repo breakdown, largest repositories first
+  console.log("\n--- By Repository ---");
+  const repoStats = new Map<string, { total: number; patched: number }>();
+  for (const r of results) {
+    const repo = repoFromInstanceId(r.instance_id);
+    const stats = repoStats.get(repo) ?? { total: 0, patched: 0 };
+    stats.total++;
+    if (r.success) stats.patched++;
+    repoStats.set(repo, stats);
+  }
+  for (const [repo, stats] of [...repoStats.entries()].sort(
+    (a, b) => b[1].total - a[1].total,
+  )) {
+    const pct = ((stats.patched / stats.total) * 100).toFixed(0);
+    console.log(
+      `  ${repo.padEnd(30)} ${stats.patched}/${stats.total} (${pct}%)`,
+    );
+  }
+
+  // Slowest tasks
+  console.log("\n--- Slowest Tasks ---");
+  const sorted = [...results].sort((a, b) => b.duration_ms - a.duration_ms);
+  for (const r of sorted.slice(0, 5)) {
+    console.log(
+      `  ${(r.duration_ms / 1000).toFixed(1)}s  ${r.instance_id}  ${r.success ? "PATCHED" : "NO_PATCH"}`,
+    );
+  }
+
+  // Session IDs for further analysis (same default data dir as run.ts)
+  const dataDir = process.env.SMC_DATA_DIR || join(process.env.HOME || "~", ".swe-bench-eval");
+  console.log(`\n--- Run Logs ---`);
+  console.log(`Session data: ${dataDir}/sessions/`);
+  console.log(`View a session's run log:`);
+  console.log(`  cat ${dataDir}/sessions/<session-id>/run-log.jsonl | head -20`);
+}
+
+// Script entry point: run the analysis as soon as the file is executed.
+main();
--- a/scripts/swe-bench/download-dataset.py
+++ b/scripts/swe-bench/download-dataset.py
@ -0,0 +1,100 @@
+#!/usr/bin/env python3
+"""
+Download SWE-bench dataset from HuggingFace and export to JSONL for the Node.js runner.
+
+Usage:
+  pip install datasets
+  python scripts/swe-bench/download-dataset.py [--dataset verified|lite|full] [--limit N] [--output PATH]
+
+Output format (one JSON object per line):
+  {
+    "instance_id": "django__django-16379",
+    "repo": "django/django",
+    "base_commit": "abc123...",
+    "problem_statement": "...",
+    "hints_text": "...",
+    "patch": "...",           # gold patch (for reference, not shown to agent)
+    "test_patch": "...",      # test patch applied during evaluation
+    "version": "4.2",
+    "environment_setup_commit": "..."
+  }
+"""
+
+import argparse
+import json
+import sys
+
+# Short dataset names accepted by --dataset, mapped to their HuggingFace dataset IDs.
+DATASET_MAP = {
+    "verified": "princeton-nlp/SWE-bench_Verified",
+    "lite": "princeton-nlp/SWE-bench_Lite",
+    "full": "princeton-nlp/SWE-bench",
+}
+
+
+def main():
+    """Download a SWE-bench dataset split and write it as JSONL.
+
+    Parses --dataset, --limit, --output and --split from the command line, loads
+    the dataset with ``datasets.load_dataset``, keeps only the fields the runner
+    uses, and writes one JSON object per line to the output path. Progress
+    messages go to stderr.
+
+    Exits with status 1 if the ``datasets`` package is missing, and with an
+    argparse usage error if --limit is negative.
+    """
+    parser = argparse.ArgumentParser(description="Download SWE-bench dataset to JSONL")
+    parser.add_argument(
+        "--dataset",
+        # Derived from DATASET_MAP so the CLI choices cannot drift from the map.
+        choices=list(DATASET_MAP),
+        default="lite",
+        help="Dataset variant (default: lite)",
+    )
+    parser.add_argument(
+        "--limit", type=int, default=0, help="Limit number of instances (0 = all)"
+    )
+    parser.add_argument(
+        "--output",
+        type=str,
+        default=None,
+        help="Output JSONL path (default: scripts/swe-bench/<dataset>.jsonl)",
+    )
+    parser.add_argument(
+        "--split",
+        type=str,
+        default="test",
+        help="Dataset split (default: test)",
+    )
+    args = parser.parse_args()
+
+    # A negative limit would otherwise stop after the first record without warning.
+    if args.limit < 0:
+        parser.error("--limit must be >= 0")
+
+    try:
+        from datasets import load_dataset
+    except ImportError:
+        print("Error: 'datasets' package not installed. Run: pip install datasets", file=sys.stderr)
+        sys.exit(1)
+
+    dataset_name = DATASET_MAP[args.dataset]
+    output_path = args.output or f"scripts/swe-bench/{args.dataset}.jsonl"
+
+    print(f"Downloading {dataset_name} (split={args.split})...", file=sys.stderr)
+    ds = load_dataset(dataset_name, split=args.split)
+
+    # Fields to keep
+    keep_fields = [
+        "instance_id",
+        "repo",
+        "base_commit",
+        "problem_statement",
+        "hints_text",
+        "patch",
+        "test_patch",
+        "version",
+        "environment_setup_commit",
+    ]
+
+    count = 0
+    # ensure_ascii=False emits raw non-ASCII text, so the file must be opened as
+    # UTF-8 explicitly rather than in the platform's default encoding. newline="\n"
+    # keeps one-record-per-LF output on every platform.
+    with open(output_path, "w", encoding="utf-8", newline="\n") as f:
+        for item in ds:
+            record = {field: item[field] for field in keep_fields if field in item}
+            f.write(json.dumps(record, ensure_ascii=False) + "\n")
+            count += 1
+            if args.limit and count >= args.limit:
+                break
+
+    print(f"Wrote {count} instances to {output_path}", file=sys.stderr)
+
+
+if __name__ == "__main__":
+    main()
--- a/scripts/swe-bench/evaluate.sh
+++ b/scripts/swe-bench/evaluate.sh
@ -0,0 +1,68 @@
+#!/usr/bin/env bash
+#
+# Evaluate Multica predictions against SWE-bench using the official Docker harness.
+#
+# Prerequisites:
+#   pip install swebench
+#   Docker running with at least 120GB storage, 16GB RAM, 8 CPU cores
+#
+# Usage:
+#   bash scripts/swe-bench/evaluate.sh [predictions.jsonl] [dataset] [run_id]
+#
+# Environment:
+#   MAX_WORKERS   Parallel workers passed to the harness (default: 4)
+#   PYTHON        Python interpreter to use (default: python, then python3)
+#
+# Examples:
+#   bash scripts/swe-bench/evaluate.sh
+#   bash scripts/swe-bench/evaluate.sh scripts/swe-bench/predictions.jsonl lite multica-v1
+#   MAX_WORKERS=8 bash scripts/swe-bench/evaluate.sh
+
+set -euo pipefail
+
+# Print an error to stderr and abort with a non-zero status.
+die() {
+  echo "Error: $*" >&2
+  exit 1
+}
+
+PREDICTIONS="${1:-scripts/swe-bench/predictions.jsonl}"
+DATASET="${2:-lite}"
+RUN_ID="${3:-multica}"
+MAX_WORKERS="${MAX_WORKERS:-4}"
+
+if ! [[ "$MAX_WORKERS" =~ ^[1-9][0-9]*$ ]]; then
+  die "MAX_WORKERS must be a positive integer (got: $MAX_WORKERS)"
+fi
+
+# Map short names to HuggingFace dataset names; anything else is passed through as-is
+case "$DATASET" in
+  lite)     DATASET_NAME="princeton-nlp/SWE-bench_Lite" ;;
+  verified) DATASET_NAME="princeton-nlp/SWE-bench_Verified" ;;
+  full)     DATASET_NAME="princeton-nlp/SWE-bench" ;;
+  *)        DATASET_NAME="$DATASET" ;;
+esac
+
+echo "=== SWE-bench Evaluation ==="
+echo "Predictions: $PREDICTIONS"
+echo "Dataset:     $DATASET_NAME"
+echo "Run ID:      $RUN_ID"
+echo "Workers:     $MAX_WORKERS"
+echo ""
+
+if [ ! -f "$PREDICTIONS" ]; then
+  die "Predictions file not found: $PREDICTIONS"
+fi
+
+TASK_COUNT=$(wc -l < "$PREDICTIONS" | tr -d ' ')
+echo "Tasks to evaluate: $TASK_COUNT"
+echo ""
+
+# Pick an interpreter: honour $PYTHON, otherwise prefer `python` and fall back to `python3`
+PYTHON_BIN="${PYTHON:-}"
+if [ -z "$PYTHON_BIN" ]; then
+  if command -v python >/dev/null 2>&1; then
+    PYTHON_BIN="python"
+  elif command -v python3 >/dev/null 2>&1; then
+    PYTHON_BIN="python3"
+  else
+    die "No python interpreter found on PATH"
+  fi
+fi
+
+# Check if swebench is installed
+if ! "$PYTHON_BIN" -c "import swebench" 2>/dev/null; then
+  die "swebench not installed. Run: pip install swebench"
+fi
+
+# Check if Docker is running
+if ! docker info >/dev/null 2>&1; then
+  die "Docker is not running"
+fi
+
+echo "Starting evaluation (this may take a while)..."
+echo ""
+
+"$PYTHON_BIN" -m swebench.harness.run_evaluation \
+  --dataset_name "$DATASET_NAME" \
+  --predictions_path "$PREDICTIONS" \
+  --max_workers "$MAX_WORKERS" \
+  --run_id "$RUN_ID"
+
+echo ""
+echo "=== Evaluation Complete ==="
+echo "Check logs/ and evaluation_results/ for detailed results."
--- a/scripts/swe-bench/run.ts
+++ b/scripts/swe-bench/run.ts
@ -0,0 +1,392 @@
+#!/usr/bin/env tsx
+/**
+ * SWE-bench Runner for Multica
+ *
+ * Runs the Multica agent against SWE-bench task instances and collects patches.
+ *
+ * Usage:
+ *   tsx scripts/swe-bench/run.ts [options]
+ *
+ * Options:
+ *   --dataset PATH      Path to JSONL dataset (default: scripts/swe-bench/lite.jsonl)
+ *   --provider NAME     LLM provider (default: kimi-coding)
+ *   --model NAME        Model name
+ *   --limit N           Max tasks to run (default: all)
+ *   --offset N          Skip first N tasks (default: 0)
+ *   --output PATH       Output predictions JSONL (default: scripts/swe-bench/predictions.jsonl)
+ *   --workdir PATH      Working directory for repos (default: /tmp/swe-bench)
+ *   --timeout MS        Timeout per task in ms (default: 300000 = 5min)
+ *   --instance ID       Run a single instance by ID
+ *   --debug             Enable debug logging
+ */
+
+import { readFileSync, writeFileSync, appendFileSync, existsSync, mkdirSync } from "node:fs";
+import { join, resolve } from "node:path";
+import { execSync, spawn } from "node:child_process";
+import { Agent } from "@multica/core";
+import type { AgentOptions } from "@multica/core";
+
+// ============================================================
+// Types
+// ============================================================
+
+/** One SWE-bench task instance, as exported to JSONL by download-dataset.py. */
+interface SWEBenchTask {
+  /** Unique instance identifier, e.g. "django__django-16379". */
+  instance_id: string;
+  /** GitHub repository slug ("owner/name"); cloned from https://github.com/<repo>.git. */
+  repo: string;
+  /** Commit the repository is checked out at before the agent runs. */
+  base_commit: string;
+  /** Issue text; becomes the "## Issue" section of the agent prompt. */
+  problem_statement: string;
+  /** Optional hints; appended to the prompt as a "## Hints" section when non-empty. */
+  hints_text?: string;
+  /** Gold patch (for reference, not shown to the agent). */
+  patch?: string;
+  /** Test patch applied during evaluation. */
+  test_patch?: string;
+  /** Version field copied from the dataset (e.g. "4.2"); not read by this runner. */
+  version?: string;
+  /** Dataset field carried through unchanged; not read by this runner. */
+  environment_setup_commit?: string;
+}
+
+/** One line of the predictions JSONL, written in SWE-bench prediction format. */
+interface Prediction {
+  /** Instance the patch was produced for. */
+  instance_id: string;
+  /** Patch text collected from the repository after the agent ran; may be empty. */
+  model_patch: string;
+  /** Label for this run, built as `multica-<provider>[-<model>]`. */
+  model_name_or_path: string;
+}
+
+/** Detailed per-task outcome, appended to the `.results.jsonl` file. */
+interface RunResult {
+  /** Instance this result belongs to. */
+  instance_id: string;
+  /** True when the agent run completed and left a non-empty patch. */
+  success: boolean;
+  /** Patch text collected after the run; may hold a partial patch even when `success` is false. */
+  patch: string;
+  /** Error reported by the agent result or thrown during the run (e.g. "timeout"), if any. */
+  error?: string;
+  /** Wall-clock time from the start of the task until the result was built, in milliseconds. */
+  duration_ms: number;
+  /** Session ID of the agent that handled the task. */
+  session_id: string;
+}
+
+// ============================================================
+// CLI argument parsing
+// ============================================================
+
+/** Runner configuration assembled from the command line (see the usage header). */
+interface RunOptions {
+  /** Path to the JSONL dataset (`--dataset`). */
+  dataset: string;
+  /** LLM provider name (`--provider`). */
+  provider: string;
+  /** Model name (`--model`); left unset unless given. */
+  model?: string;
+  /** Maximum number of tasks to run (`--limit`); 0 means all. */
+  limit: number;
+  /** Number of tasks to skip from the start of the dataset (`--offset`). */
+  offset: number;
+  /** Path of the predictions JSONL to append to (`--output`). */
+  output: string;
+  /** Directory under which repositories are cloned (`--workdir`). */
+  workdir: string;
+  /** Per-task timeout in milliseconds (`--timeout`). */
+  timeout: number;
+  /** When set, run only the instance with this ID (`--instance`). */
+  instance?: string;
+  /** Enable debug logging in the agent (`--debug`). */
+  debug: boolean;
+}
+
+/**
+ * Parse `process.argv` into a {@link RunOptions} object, starting from the defaults.
+ *
+ * Prints a message to stderr and exits with code 1 when an argument is unknown,
+ * when a flag is missing its value, or when a numeric flag is not a plain
+ * non-negative integer (`--timeout` must additionally be at least 1). Without
+ * these checks a bad value became NaN: a NaN timeout fires immediately and a
+ * NaN limit silently disables the limit.
+ *
+ * @returns The effective run options.
+ */
+function parseArgs(): RunOptions {
+  const args = process.argv.slice(2);
+  const opts: RunOptions = {
+    dataset: "scripts/swe-bench/lite.jsonl",
+    provider: "kimi-coding",
+    limit: 0,
+    offset: 0,
+    output: "scripts/swe-bench/predictions.jsonl",
+    workdir: "/tmp/swe-bench",
+    timeout: 300_000, // 5 minutes
+    debug: false,
+  };
+
+  // Returns the value at `index`, or exits when the flag was the last argument.
+  const valueAt = (flag: string, index: number): string => {
+    const value = args[index];
+    if (value === undefined) {
+      console.error(`Missing value for ${flag}`);
+      process.exit(1);
+    }
+    return value;
+  };
+
+  // Returns the value at `index` as an integer >= min, or exits on anything else.
+  const intAt = (flag: string, index: number, min: number): number => {
+    const raw = valueAt(flag, index);
+    const parsed = /^\d+$/.test(raw) ? Number.parseInt(raw, 10) : Number.NaN;
+    if (!Number.isSafeInteger(parsed) || parsed < min) {
+      console.error(`Invalid value for ${flag}: ${raw} (expected an integer >= ${min})`);
+      process.exit(1);
+    }
+    return parsed;
+  };
+
+  for (let i = 0; i < args.length; i++) {
+    const arg = args[i]!;
+    switch (arg) {
+      case "--dataset":
+        opts.dataset = valueAt(arg, ++i);
+        break;
+      case "--provider":
+        opts.provider = valueAt(arg, ++i);
+        break;
+      case "--model":
+        opts.model = valueAt(arg, ++i);
+        break;
+      case "--limit":
+        opts.limit = intAt(arg, ++i, 0);
+        break;
+      case "--offset":
+        opts.offset = intAt(arg, ++i, 0);
+        break;
+      case "--output":
+        opts.output = valueAt(arg, ++i);
+        break;
+      case "--workdir":
+        opts.workdir = valueAt(arg, ++i);
+        break;
+      case "--timeout":
+        opts.timeout = intAt(arg, ++i, 1);
+        break;
+      case "--instance":
+        opts.instance = valueAt(arg, ++i);
+        break;
+      case "--debug":
+        opts.debug = true;
+        break;
+      default:
+        console.error(`Unknown argument: ${arg}`);
+        process.exit(1);
+    }
+  }
+
+  return opts;
+}
+
+// ============================================================
+// Dataset loading
+// ============================================================
+
+/**
+ * Read a JSONL dataset file into an array of tasks.
+ *
+ * Each non-empty line is parsed as one JSON object. If the file does not exist,
+ * prints a hint about the download script to stderr and exits with code 1.
+ *
+ * @param path Path to the JSONL file.
+ * @returns The parsed tasks, in file order.
+ */
+function loadDataset(path: string): SWEBenchTask[] {
+  const found = existsSync(path);
+  if (!found) {
+    console.error(`Dataset not found: ${path}`);
+    console.error("Run: python scripts/swe-bench/download-dataset.py");
+    process.exit(1);
+  }
+
+  const tasks: SWEBenchTask[] = [];
+  for (const row of readFileSync(path, "utf-8").split("\n")) {
+    // Skip empty lines, such as the one produced by a trailing newline.
+    if (row.length === 0) continue;
+    tasks.push(JSON.parse(row) as SWEBenchTask);
+  }
+  return tasks;
+}
+
+// ============================================================
+// Repository setup
+// ============================================================
+
+/**
+ * Prepare a clean checkout of the task's repository at its base commit.
+ *
+ * Uses `<workdir>/<instance_id>` as the checkout directory. If that directory
+ * already holds a git repository it is force-checked-out at the base commit and
+ * cleaned of untracked files; otherwise the repository is cloned from GitHub
+ * and checked out.
+ *
+ * @param task Task providing `repo`, `base_commit` and `instance_id`.
+ * @param workdir Directory under which checkouts are created.
+ * @returns Path of the prepared checkout.
+ * @throws Error when a task field is not safe to place in a shell command, or
+ *   when a git command fails or times out.
+ */
+function setupRepo(task: SWEBenchTask, workdir: string): string {
+  // These dataset fields are interpolated into shell commands below, so accept
+  // only a plain "owner/name" slug and a hex commit hash.
+  if (!/^[\w.-]+\/[\w.-]+$/.test(task.repo)) {
+    throw new Error(`Invalid repo for ${task.instance_id}: ${task.repo}`);
+  }
+  if (!/^[0-9a-f]{7,64}$/i.test(task.base_commit)) {
+    throw new Error(`Invalid base_commit for ${task.instance_id}: ${task.base_commit}`);
+  }
+  const dirName = task.instance_id.replace(/\//g, "__");
+  if (!/^[\w.-]+$/.test(dirName) || dirName === "." || dirName === "..") {
+    throw new Error(`Invalid instance_id: ${task.instance_id}`);
+  }
+
+  const repoDir = join(workdir, dirName);
+
+  // Require a .git entry, not just the directory: in a leftover non-repo
+  // directory git would act on whatever enclosing repository it finds, and
+  // `git clean -fdx` there would delete unrelated files.
+  if (existsSync(join(repoDir, ".git"))) {
+    // Reset existing repo to base commit
+    log(`  Resetting existing repo to ${task.base_commit.slice(0, 8)}...`);
+    execSync(`git checkout -f ${task.base_commit} && git clean -fdx`, {
+      cwd: repoDir,
+      stdio: "pipe",
+      timeout: 60_000,
+    });
+  } else {
+    // Clone from GitHub
+    const repoUrl = `https://github.com/${task.repo}.git`;
+    log(`  Cloning ${task.repo}...`);
+    mkdirSync(workdir, { recursive: true });
+    execSync(`git clone --quiet "${repoUrl}" "${repoDir}"`, {
+      stdio: "pipe",
+      timeout: 120_000,
+    });
+    execSync(`git checkout -f ${task.base_commit}`, {
+      cwd: repoDir,
+      stdio: "pipe",
+      timeout: 30_000,
+    });
+  }
+
+  return repoDir;
+}
+
+// ============================================================
+// System prompt
+// ============================================================
+
+/**
+ * Build the system prompt for a task: fixed bug-fixing instructions and rules,
+ * followed by a line naming the repository and the first 12 characters of the
+ * base commit.
+ */
+function buildSystemPrompt(task: SWEBenchTask): string {
+  const shortCommit = task.base_commit.slice(0, 12);
+  const promptLines = [
+    "You are an expert software engineer tasked with fixing a bug in an open-source repository.",
+    "",
+    "## Instructions",
+    "",
+    "1. Read the issue description carefully and understand the problem.",
+    "2. Explore the repository to find the relevant source code.",
+    "3. Identify the root cause of the issue.",
+    "4. Make the minimal set of changes to fix the issue. Do NOT add tests.",
+    "5. After making changes, verify your fix makes sense.",
+    "",
+    "## Important Rules",
+    "",
+    "- Make ONLY the changes necessary to fix the described issue.",
+    "- Do NOT modify or add any test files.",
+    "- Do NOT add comments explaining the fix unless the code is non-obvious.",
+    "- Do NOT refactor unrelated code.",
+    "- Keep changes minimal and focused.",
+    "",
+    "## Repository",
+    "",
+    `This is the \`${task.repo}\` repository checked out at commit \`${shortCommit}\`.`,
+  ];
+  return promptLines.join("\n");
+}
+
+/**
+ * Build the user prompt for a task: the issue text, an optional hints section,
+ * and a closing instruction, separated by blank lines.
+ */
+function buildPrompt(task: SWEBenchTask): string {
+  const sections = [`## Issue\n\n${task.problem_statement}`];
+  // Hints are optional; an empty string is treated the same as no hints.
+  if (task.hints_text) {
+    sections.push(`## Hints\n\n${task.hints_text}`);
+  }
+  sections.push("Please fix this issue. Remember: make minimal changes, do not modify tests.");
+  return sections.join("\n\n");
+}
+
+// ============================================================
+// Run a single task
+// ============================================================
+
+/**
+ * Collect everything changed in `repoDir` as a diff against the base commit.
+ *
+ * Stages all changes first so that newly created files are included; a plain
+ * `git diff` only reports edits to tracked files. Falls back to `git diff` if
+ * staging or the cached diff fails, and returns "" if that fails as well.
+ */
+function collectPatch(repoDir: string, baseCommit: string): string {
+  const gitOpts = {
+    cwd: repoDir,
+    encoding: "utf-8" as const,
+    maxBuffer: 10 * 1024 * 1024, // 10MB
+    timeout: 30_000,
+  };
+  // Only place the commit in the shell command when it is plainly a hash.
+  const base = /^[0-9a-f]{7,64}$/i.test(baseCommit) ? baseCommit : "HEAD";
+  try {
+    execSync("git add -A", gitOpts);
+    return execSync(`git diff --cached ${base}`, gitOpts);
+  } catch (err) {
+    const reason = (err instanceof Error ? err.message : String(err)).split("\n")[0];
+    log(`  Could not collect staged diff (${reason}); falling back to git diff`);
+    try {
+      return execSync("git diff", gitOpts);
+    } catch {
+      return "";
+    }
+  }
+}
+
+/**
+ * Run the agent on one task and collect the resulting patch.
+ *
+ * Prepares the repository, runs the agent with the task prompt under the
+ * per-task timeout, then diffs the repository against the base commit. Never
+ * rejects for repository-setup, agent-run, or timeout failures: those are
+ * reported through `success: false` and `error` so that one bad task does not
+ * abort the whole batch. A repository-setup failure yields an empty
+ * `session_id`, because no agent was created.
+ *
+ * @param task Task instance to solve.
+ * @param opts Runner options (provider, model, workdir, timeout, debug).
+ * @returns The outcome for this task.
+ */
+async function runTask(
+  task: SWEBenchTask,
+  opts: RunOptions,
+): Promise<RunResult> {
+  const start = Date.now();
+  const messageOf = (err: unknown): string =>
+    err instanceof Error ? err.message : String(err);
+
+  // Setup repo
+  let repoDir: string;
+  try {
+    repoDir = setupRepo(task, opts.workdir);
+  } catch (err) {
+    return {
+      instance_id: task.instance_id,
+      success: false,
+      patch: "",
+      error: `repo setup failed: ${messageOf(err)}`,
+      duration_ms: Date.now() - start,
+      session_id: "",
+    };
+  }
+
+  // Create agent
+  const agentOptions: AgentOptions = {
+    provider: opts.provider,
+    model: opts.model,
+    cwd: repoDir,
+    enableRunLog: true,
+    debug: opts.debug,
+    systemPrompt: buildSystemPrompt(task),
+    enableSkills: false,
+    tools: {
+      // Only allow coding tools — no web, no cron, no sessions
+      deny: ["web_fetch", "web_search", "cron", "data", "sessions_spawn", "sessions_list", "memory_search", "send_file"],
+    },
+  };
+
+  const agent = new Agent(agentOptions);
+
+  log(`  Session: ${agent.sessionId}`);
+
+  let timer: ReturnType<typeof setTimeout> | undefined;
+  try {
+    // Run agent with timeout.
+    // NOTE(review): on timeout the agent run is not cancelled here and may keep
+    // editing the checkout; if Agent exposes an abort/stop API, call it — confirm.
+    const result = await Promise.race([
+      agent.run(buildPrompt(task)),
+      new Promise<never>((_, reject) => {
+        timer = setTimeout(() => reject(new Error("timeout")), opts.timeout);
+      }),
+    ]);
+
+    const patch = collectPatch(repoDir, task.base_commit);
+
+    return {
+      instance_id: task.instance_id,
+      success: patch.length > 0,
+      patch,
+      error: result.error,
+      duration_ms: Date.now() - start,
+      session_id: agent.sessionId,
+    };
+  } catch (err) {
+    // Keep any partial patch for inspection, but do not count it as a success.
+    const patch = collectPatch(repoDir, task.base_commit);
+
+    return {
+      instance_id: task.instance_id,
+      success: false,
+      patch,
+      error: messageOf(err),
+      duration_ms: Date.now() - start,
+      session_id: agent.sessionId,
+    };
+  } finally {
+    // A pending timer would keep the process alive after the last task finishes.
+    if (timer !== undefined) clearTimeout(timer);
+  }
+}
+
+// ============================================================
+// Logging
+// ============================================================
+
+/** Write a message to stderr, prefixed with the current UTC time as `[HH:MM:SS]`. */
+function log(msg: string) {
+  const isoNow = new Date().toISOString();
+  // Characters 11-18 of an ISO-8601 timestamp are the HH:MM:SS portion.
+  const clock = isoNow.substring(11, 19);
+  console.error("[" + clock + "] " + msg);
+}
+
+// ============================================================
+// Main
+// ============================================================
+
+/**
+ * Runner entry point.
+ *
+ * Parses CLI options, loads the dataset, applies the --instance, --offset and
+ * --limit filters, then runs the tasks one at a time. After each task it
+ * appends a prediction line to the output file and a detailed result line to
+ * the sibling `.results.jsonl` file, so partial runs are preserved.
+ *
+ * Exits with code 1 when --instance matches no task.
+ */
+async function main() {
+  const opts = parseArgs();
+
+  log("SWE-bench Runner for Multica");
+  log(`Provider: ${opts.provider}${opts.model ? ` (${opts.model})` : ""}`);
+  log(`Dataset: ${opts.dataset}`);
+  log(`Work dir: ${opts.workdir}`);
+  log(`Timeout: ${opts.timeout / 1000}s per task`);
+
+  // Set SMC_DATA_DIR for isolation.
+  // Note: if HOME is unset the fallback "~" is a literal directory name, not the home directory.
+  if (!process.env.SMC_DATA_DIR) {
+    process.env.SMC_DATA_DIR = join(process.env.HOME || "~", ".swe-bench-eval");
+    log(`SMC_DATA_DIR: ${process.env.SMC_DATA_DIR}`);
+  }
+
+  // Load dataset
+  let tasks = loadDataset(resolve(opts.dataset));
+  log(`Loaded ${tasks.length} tasks`);
+
+  // Filter by instance ID if specified
+  if (opts.instance) {
+    tasks = tasks.filter((t) => t.instance_id === opts.instance);
+    if (tasks.length === 0) {
+      console.error(`Instance not found: ${opts.instance}`);
+      process.exit(1);
+    }
+  }
+
+  // Apply offset and limit
+  if (opts.offset > 0) {
+    tasks = tasks.slice(opts.offset);
+  }
+  if (opts.limit > 0) {
+    tasks = tasks.slice(0, opts.limit);
+  }
+
+  log(`Running ${tasks.length} tasks`);
+
+  // Prepare output. Swap only a trailing ".jsonl" for ".results.jsonl"; for any
+  // other name append the suffix, so the two files can never be the same path.
+  const outputPath = resolve(opts.output);
+  const jsonlExt = ".jsonl";
+  const resultsPath = outputPath.endsWith(jsonlExt)
+    ? `${outputPath.slice(0, -jsonlExt.length)}.results.jsonl`
+    : `${outputPath}.results.jsonl`;
+
+  // Run tasks sequentially
+  const modelName = `multica-${opts.provider}${opts.model ? `-${opts.model}` : ""}`;
+  let completed = 0;
+  let succeeded = 0;
+
+  for (const task of tasks) {
+    completed++;
+    log(`\n[${completed}/${tasks.length}] ${task.instance_id}`);
+
+    const result = await runTask(task, opts);
+
+    if (result.success) succeeded++;
+
+    // Write prediction in SWE-bench format
+    const prediction: Prediction = {
+      instance_id: result.instance_id,
+      model_patch: result.patch,
+      model_name_or_path: modelName,
+    };
+    appendFileSync(outputPath, JSON.stringify(prediction) + "\n");
+
+    // Write detailed result
+    appendFileSync(resultsPath, JSON.stringify(result) + "\n");
+
+    const status = result.success ? "PATCHED" : "NO_PATCH";
+    const errorInfo = result.error ? ` (${result.error})` : "";
+    log(
+      `  ${status} | ${(result.duration_ms / 1000).toFixed(1)}s | patch=${result.patch.length} bytes${errorInfo}`,
+    );
+  }
+
+  log(`\n========================================`);
+  log(`Results: ${succeeded}/${completed} tasks produced patches`);
+  log(`Predictions: ${outputPath}`);
+  log(`Details: ${resultsPath}`);
+  log(`\nTo evaluate with SWE-bench harness:`);
+  log(
+    `  python -m swebench.harness.run_evaluation --dataset_name princeton-nlp/SWE-bench_Lite --predictions_path ${outputPath} --max_workers 4 --run_id multica`,
+  );
+}
+
+// Script entry point: report any unhandled failure and exit non-zero.
+main().catch((err) => {
+  console.error("Fatal error:", err);
+  process.exit(1);
+});