feat(scripts): add SWE-bench runner for Multica agent evaluation
- download-dataset.py: fetches SWE-bench Lite/Verified/Full from HuggingFace
- run.ts: core runner that clones repos, runs the Agent, and collects git diff patches
- evaluate.sh: wrapper for the official SWE-bench Docker evaluation harness
- analyze.ts: summarizes run results with per-repo and timing breakdowns

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
47f8e621c8
commit
90d374ffd5
5 changed files with 681 additions and 0 deletions
5
scripts/swe-bench/.gitignore
vendored
Normal file
5
scripts/swe-bench/.gitignore
vendored
Normal file
|
|
@ -0,0 +1,5 @@
|
|||
# Downloaded datasets
|
||||
*.jsonl
|
||||
|
||||
# Don't ignore the scripts themselves
|
||||
!.gitignore
|
||||
116
scripts/swe-bench/analyze.ts
Normal file
116
scripts/swe-bench/analyze.ts
Normal file
|
|
@ -0,0 +1,116 @@
|
|||
#!/usr/bin/env tsx
|
||||
/**
|
||||
* Analyze SWE-bench run results.
|
||||
*
|
||||
* Reads the .results.jsonl file produced by run.ts and prints a summary.
|
||||
*
|
||||
* Usage:
|
||||
* tsx scripts/swe-bench/analyze.ts [results.jsonl]
|
||||
*/
|
||||
|
||||
import { readFileSync, existsSync } from "node:fs";
|
||||
import { resolve, join } from "node:path";
|
||||
|
||||
/** One record of the `.results.jsonl` file produced by run.ts. */
interface RunResult {
  instance_id: string;
  success: boolean;
  patch: string;
  error?: string;
  duration_ms: number;
  session_id: string;
}

/**
 * Read a results JSONL file and print a run summary to stdout.
 *
 * The path is taken from the first CLI argument and defaults to
 * `scripts/swe-bench/predictions.results.jsonl`. The summary covers totals,
 * duration statistics, an error breakdown, a per-repository breakdown and the
 * five slowest tasks. Exits with status 1 when the file does not exist.
 */
function main() {
  const resultsPath = resolve(
    process.argv[2] || "scripts/swe-bench/predictions.results.jsonl",
  );

  if (!existsSync(resultsPath)) {
    console.error(`Results file not found: ${resultsPath}`);
    process.exit(1);
  }

  // Trim before filtering so whitespace-only lines (e.g. a stray "\r") are
  // skipped instead of crashing JSON.parse.
  const lines = readFileSync(resultsPath, "utf-8")
    .split("\n")
    .map((l) => l.trim())
    .filter(Boolean);
  const results: RunResult[] = lines.map((l) => JSON.parse(l));

  const total = results.length;

  console.log("=== SWE-bench Run Analysis ===\n");
  console.log(`Total tasks: ${total}`);

  // Without this guard every average below is NaN and min/max are +/-Infinity.
  if (total === 0) {
    console.log(`No results found in ${resultsPath}`);
    return;
  }

  const patched = results.filter((r) => r.success).length;
  const failed = total - patched;
  const errors = results.filter((r) => r.error).length;

  // Single pass instead of Math.max(...durations): spreading a very large
  // array into call arguments can overflow the stack.
  let durationSum = 0;
  let maxDuration = -Infinity;
  let minDuration = Infinity;
  for (const r of results) {
    durationSum += r.duration_ms;
    if (r.duration_ms > maxDuration) maxDuration = r.duration_ms;
    if (r.duration_ms < minDuration) minDuration = r.duration_ms;
  }
  const avgDuration = durationSum / total;

  const patchSizes = results
    .filter((r) => r.success)
    .map((r) => (r.patch ?? "").length);
  const avgPatchSize =
    patchSizes.length > 0
      ? patchSizes.reduce((a, b) => a + b, 0) / patchSizes.length
      : 0;

  console.log(`Patched: ${patched} (${((patched / total) * 100).toFixed(1)}%)`);
  console.log(`No patch: ${failed}`);
  console.log(`Errors: ${errors}`);
  console.log();
  console.log(`Avg duration: ${(avgDuration / 1000).toFixed(1)}s`);
  console.log(`Min duration: ${(minDuration / 1000).toFixed(1)}s`);
  console.log(`Max duration: ${(maxDuration / 1000).toFixed(1)}s`);
  console.log(`Avg patch size: ${(avgPatchSize / 1024).toFixed(1)}KB`);

  // Error breakdown: group by message, truncated to 60 characters.
  if (errors > 0) {
    console.log("\n--- Errors ---");
    const errorCounts = new Map<string, number>();
    for (const r of results) {
      if (r.error) {
        const key = r.error.length > 60 ? r.error.slice(0, 60) + "..." : r.error;
        errorCounts.set(key, (errorCounts.get(key) || 0) + 1);
      }
    }
    for (const [err, count] of [...errorCounts.entries()].sort(
      (a, b) => b[1] - a[1],
    )) {
      console.log(` ${count}x ${err}`);
    }
  }

  // Per-repo breakdown
  console.log("\n--- By Repository ---");
  const repoStats = new Map<string, { total: number; patched: number }>();
  for (const r of results) {
    // Instance IDs look like "django__django-16379": drop the trailing
    // "-<number>" and turn the first "__" into "/" to get "django/django".
    const id = typeof r.instance_id === "string" ? r.instance_id : "";
    const repo = id.replace(/-\d+$/, "").replace("__", "/") || "unknown";
    const stats = repoStats.get(repo) || { total: 0, patched: 0 };
    stats.total++;
    if (r.success) stats.patched++;
    repoStats.set(repo, stats);
  }
  for (const [repo, stats] of [...repoStats.entries()].sort(
    (a, b) => b[1].total - a[1].total,
  )) {
    const pct = ((stats.patched / stats.total) * 100).toFixed(0);
    console.log(
      ` ${repo.padEnd(30)} ${stats.patched}/${stats.total} (${pct}%)`,
    );
  }

  // Slowest tasks
  console.log("\n--- Slowest Tasks ---");
  const sorted = [...results].sort((a, b) => b.duration_ms - a.duration_ms);
  for (const r of sorted.slice(0, 5)) {
    console.log(
      ` ${(r.duration_ms / 1000).toFixed(1)}s ${r.instance_id} ${r.success ? "PATCHED" : "NO_PATCH"}`,
    );
  }

  // Session IDs for further analysis
  const dataDir = process.env.SMC_DATA_DIR || join(process.env.HOME || "~", ".swe-bench-eval");
  console.log(`\n--- Run Logs ---`);
  console.log(`Session data: ${dataDir}/sessions/`);
  console.log(`View a session's run log:`);
  console.log(` cat ${dataDir}/sessions/<session-id>/run-log.jsonl | head -20`);
}
|
||||
|
||||
main();
|
||||
100
scripts/swe-bench/download-dataset.py
Executable file
100
scripts/swe-bench/download-dataset.py
Executable file
|
|
@ -0,0 +1,100 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
Download SWE-bench dataset from HuggingFace and export to JSONL for the Node.js runner.
|
||||
|
||||
Usage:
|
||||
pip install datasets
|
||||
python scripts/swe-bench/download-dataset.py [--dataset verified|lite|full] [--limit N] [--output PATH]
|
||||
|
||||
Output format (one JSON object per line):
|
||||
{
|
||||
"instance_id": "django__django-16379",
|
||||
"repo": "django/django",
|
||||
"base_commit": "abc123...",
|
||||
"problem_statement": "...",
|
||||
"hints_text": "...",
|
||||
"patch": "...", # gold patch (for reference, not shown to agent)
|
||||
"test_patch": "...", # test patch applied during evaluation
|
||||
"version": "4.2",
|
||||
"environment_setup_commit": "..."
|
||||
}
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import sys
|
||||
|
||||
DATASET_MAP = {
|
||||
"verified": "princeton-nlp/SWE-bench_Verified",
|
||||
"lite": "princeton-nlp/SWE-bench_Lite",
|
||||
"full": "princeton-nlp/SWE-bench",
|
||||
}
|
||||
|
||||
|
||||
def main():
    """Download a SWE-bench variant and export selected fields as JSONL.

    Parses --dataset/--limit/--output/--split, loads the dataset with the
    HuggingFace ``datasets`` package, and writes one JSON object per line
    containing only the fields listed in ``keep_fields``. Progress messages go
    to stderr. Exits with status 1 when ``datasets`` is not installed.
    """
    import os  # local import: only needed to create the output directory

    parser = argparse.ArgumentParser(description="Download SWE-bench dataset to JSONL")
    parser.add_argument(
        "--dataset",
        choices=["verified", "lite", "full"],
        default="lite",
        help="Dataset variant (default: lite)",
    )
    parser.add_argument(
        "--limit", type=int, default=0, help="Limit number of instances (0 = all)"
    )
    parser.add_argument(
        "--output",
        type=str,
        default=None,
        help="Output JSONL path (default: scripts/swe-bench/<dataset>.jsonl)",
    )
    parser.add_argument(
        "--split",
        type=str,
        default="test",
        help="Dataset split (default: test)",
    )
    args = parser.parse_args()

    # A negative limit would otherwise stop after the first record.
    if args.limit < 0:
        parser.error("--limit must be >= 0")

    try:
        from datasets import load_dataset
    except ImportError:
        print("Error: 'datasets' package not installed. Run: pip install datasets", file=sys.stderr)
        sys.exit(1)

    dataset_name = DATASET_MAP[args.dataset]
    output_path = args.output or f"scripts/swe-bench/{args.dataset}.jsonl"

    print(f"Downloading {dataset_name} (split={args.split})...", file=sys.stderr)
    ds = load_dataset(dataset_name, split=args.split)

    # Fields to keep
    keep_fields = [
        "instance_id",
        "repo",
        "base_commit",
        "problem_statement",
        "hints_text",
        "patch",
        "test_patch",
        "version",
        "environment_setup_commit",
    ]

    # Create the target directory so a custom --output path does not fail.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    count = 0
    # ensure_ascii=False emits raw non-ASCII characters, so the encoding must
    # be pinned to UTF-8 rather than left to the platform default.
    with open(output_path, "w", encoding="utf-8") as f:
        for item in ds:
            record = {field: item[field] for field in keep_fields if field in item}
            f.write(json.dumps(record, ensure_ascii=False) + "\n")
            count += 1
            if args.limit and count >= args.limit:
                break

    print(f"Wrote {count} instances to {output_path}", file=sys.stderr)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
68
scripts/swe-bench/evaluate.sh
Executable file
68
scripts/swe-bench/evaluate.sh
Executable file
|
|
@ -0,0 +1,68 @@
|
|||
#!/usr/bin/env bash
|
||||
#
|
||||
# Evaluate Multica predictions against SWE-bench using the official Docker harness.
|
||||
#
|
||||
# Prerequisites:
|
||||
# pip install swebench
|
||||
# Docker running with at least 120GB storage, 16GB RAM, 8 CPU cores
|
||||
#
|
||||
# Usage:
|
||||
# bash scripts/swe-bench/evaluate.sh [predictions.jsonl] [dataset] [run_id]
|
||||
#
|
||||
# Examples:
|
||||
# bash scripts/swe-bench/evaluate.sh
|
||||
# bash scripts/swe-bench/evaluate.sh scripts/swe-bench/predictions.jsonl lite multica-v1
|
||||
|
||||
set -euo pipefail

PREDICTIONS="${1:-scripts/swe-bench/predictions.jsonl}"
DATASET="${2:-lite}"
RUN_ID="${3:-multica}"
# Worker count passed to the harness; override with MAX_WORKERS=<n>.
MAX_WORKERS="${MAX_WORKERS:-4}"

# Map short names to HuggingFace dataset names
case "$DATASET" in
  lite) DATASET_NAME="princeton-nlp/SWE-bench_Lite" ;;
  verified) DATASET_NAME="princeton-nlp/SWE-bench_Verified" ;;
  full) DATASET_NAME="princeton-nlp/SWE-bench" ;;
  *) DATASET_NAME="$DATASET" ;;
esac

echo "=== SWE-bench Evaluation ==="
echo "Predictions: $PREDICTIONS"
echo "Dataset: $DATASET_NAME"
echo "Run ID: $RUN_ID"
echo ""

if [ ! -f "$PREDICTIONS" ]; then
  echo "Error: Predictions file not found: $PREDICTIONS" >&2
  exit 1
fi

# Count non-blank lines. `wc -l` counts newline characters, so it under-counts
# when the last record has no trailing newline. `|| true` keeps `set -e` from
# aborting when grep finds no matches (it still prints 0).
TASK_COUNT=$(grep -c '[^[:space:]]' "$PREDICTIONS" || true)
echo "Tasks to evaluate: $TASK_COUNT"
echo ""

# Pick the interpreter: honor $PYTHON, else `python`, else `python3`.
if [ -z "${PYTHON:-}" ]; then
  if command -v python >/dev/null 2>&1; then
    PYTHON=python
  elif command -v python3 >/dev/null 2>&1; then
    PYTHON=python3
  else
    echo "Error: no python interpreter found on PATH" >&2
    exit 1
  fi
fi

# Check if swebench is installed
if ! "$PYTHON" -c "import swebench" 2>/dev/null; then
  echo "Error: swebench not installed. Run: pip install swebench" >&2
  exit 1
fi

# Check if Docker is running
if ! docker info >/dev/null 2>&1; then
  echo "Error: Docker is not running" >&2
  exit 1
fi

echo "Starting evaluation (this may take a while)..."
echo ""

"$PYTHON" -m swebench.harness.run_evaluation \
  --dataset_name "$DATASET_NAME" \
  --predictions_path "$PREDICTIONS" \
  --max_workers "$MAX_WORKERS" \
  --run_id "$RUN_ID"

echo ""
echo "=== Evaluation Complete ==="
echo "Check logs/ and evaluation_results/ for detailed results."
|
||||
392
scripts/swe-bench/run.ts
Normal file
392
scripts/swe-bench/run.ts
Normal file
|
|
@ -0,0 +1,392 @@
|
|||
#!/usr/bin/env tsx
|
||||
/**
|
||||
* SWE-bench Runner for Multica
|
||||
*
|
||||
* Runs the Multica agent against SWE-bench task instances and collects patches.
|
||||
*
|
||||
* Usage:
|
||||
* tsx scripts/swe-bench/run.ts [options]
|
||||
*
|
||||
* Options:
|
||||
* --dataset PATH Path to JSONL dataset (default: scripts/swe-bench/lite.jsonl)
|
||||
* --provider NAME LLM provider (default: kimi-coding)
|
||||
* --model NAME Model name
|
||||
* --limit N Max tasks to run (default: all)
|
||||
* --offset N Skip first N tasks (default: 0)
|
||||
* --output PATH Output predictions JSONL (default: scripts/swe-bench/predictions.jsonl)
|
||||
* --workdir PATH Working directory for repos (default: /tmp/swe-bench)
|
||||
* --timeout MS Timeout per task in ms (default: 300000 = 5min)
|
||||
* --instance ID Run a single instance by ID
|
||||
* --debug Enable debug logging
|
||||
*/
|
||||
|
||||
import { readFileSync, writeFileSync, appendFileSync, existsSync, mkdirSync } from "node:fs";
|
||||
import { join, resolve } from "node:path";
|
||||
import { execSync, spawn } from "node:child_process";
|
||||
import { Agent } from "@multica/core";
|
||||
import type { AgentOptions } from "@multica/core";
|
||||
|
||||
// ============================================================
|
||||
// Types
|
||||
// ============================================================
|
||||
|
||||
/**
 * One task instance as exported by download-dataset.py (one JSON object per
 * line of the dataset file).
 */
interface SWEBenchTask {
  /** Unique task identifier, e.g. "django__django-16379". */
  instance_id: string;
  /** GitHub "owner/name" slug, e.g. "django/django". */
  repo: string;
  /** Commit the repository is checked out at before the agent runs. */
  base_commit: string;
  /** Issue text given to the agent. */
  problem_statement: string;
  /** Optional hints appended to the prompt when present. */
  hints_text?: string;
  /** Gold patch (for reference, not shown to the agent). */
  patch?: string;
  /** Test patch applied during evaluation. */
  test_patch?: string;
  version?: string;
  environment_setup_commit?: string;
}
|
||||
|
||||
/** One line of the predictions JSONL file handed to the SWE-bench harness. */
interface Prediction {
  /** Task this prediction belongs to. */
  instance_id: string;
  /** Patch text collected from the repository after the agent ran. */
  model_patch: string;
  /** Label identifying the provider/model that produced the patch. */
  model_name_or_path: string;
}
|
||||
|
||||
/** Detailed per-task outcome written to the `.results.jsonl` file. */
interface RunResult {
  /** Task that was run. */
  instance_id: string;
  /** Whether the run is counted as having produced a patch. */
  success: boolean;
  /** Collected patch text; may be empty. */
  patch: string;
  /** Error message reported by the agent or raised during the run, if any. */
  error?: string;
  /** Wall-clock duration of the task in milliseconds. */
  duration_ms: number;
  /** Session ID of the agent that handled the task. */
  session_id: string;
}
|
||||
|
||||
// ============================================================
|
||||
// CLI argument parsing
|
||||
// ============================================================
|
||||
|
||||
/** Options controlling a benchmark run, populated from the command line. */
interface RunOptions {
  dataset: string;
  provider: string;
  model?: string;
  /** Maximum number of tasks to run; 0 means no limit. */
  limit: number;
  /** Number of tasks to skip from the start of the dataset. */
  offset: number;
  output: string;
  workdir: string;
  /** Per-task timeout in milliseconds. */
  timeout: number;
  instance?: string;
  debug: boolean;
}

/**
 * Parse `process.argv` into {@link RunOptions}.
 *
 * Prints a message to stderr and exits with status 1 when an argument is
 * unknown, when a flag that needs a value is the last argument, or when a
 * numeric flag is not a plain decimal integer in range.
 *
 * @returns The defaults overridden by whatever flags were supplied.
 */
function parseArgs(): RunOptions {
  // Node timers are limited to a 32-bit signed delay; larger values fire
  // almost immediately, which would make every task "time out" at once.
  const MAX_TIMER_MS = 2_147_483_647;

  const args = process.argv.slice(2);
  const opts: RunOptions = {
    dataset: "scripts/swe-bench/lite.jsonl",
    provider: "kimi-coding",
    limit: 0,
    offset: 0,
    output: "scripts/swe-bench/predictions.jsonl",
    workdir: "/tmp/swe-bench",
    timeout: 300_000, // 5 minutes
    debug: false,
  };

  /** Report a usage error and terminate. */
  function fail(message: string): never {
    console.error(message);
    process.exit(1);
  }

  /** Return the value following a flag, or fail when it is missing. */
  function valueAt(flag: string, index: number): string {
    const value = args[index];
    if (value === undefined) fail(`Missing value for ${flag}`);
    return value;
  }

  /** Return the integer following a flag, or fail when it is invalid. */
  function intAt(flag: string, index: number, min: number, max: number): number {
    const raw = valueAt(flag, index);
    // Require digits only: parseInt alone would accept "10abc" and return
    // NaN for "abc", both of which silently corrupt the run.
    const parsed = /^\d+$/.test(raw) ? parseInt(raw, 10) : NaN;
    if (!Number.isSafeInteger(parsed) || parsed < min || parsed > max) {
      fail(`Invalid value for ${flag}: ${raw} (expected an integer between ${min} and ${max})`);
    }
    return parsed;
  }

  for (let i = 0; i < args.length; i++) {
    const arg = String(args[i]);
    switch (arg) {
      case "--dataset":
        opts.dataset = valueAt(arg, ++i);
        break;
      case "--provider":
        opts.provider = valueAt(arg, ++i);
        break;
      case "--model":
        opts.model = valueAt(arg, ++i);
        break;
      case "--limit":
        opts.limit = intAt(arg, ++i, 0, Number.MAX_SAFE_INTEGER);
        break;
      case "--offset":
        opts.offset = intAt(arg, ++i, 0, Number.MAX_SAFE_INTEGER);
        break;
      case "--output":
        opts.output = valueAt(arg, ++i);
        break;
      case "--workdir":
        opts.workdir = valueAt(arg, ++i);
        break;
      case "--timeout":
        opts.timeout = intAt(arg, ++i, 1, MAX_TIMER_MS);
        break;
      case "--instance":
        opts.instance = valueAt(arg, ++i);
        break;
      case "--debug":
        opts.debug = true;
        break;
      default:
        fail(`Unknown argument: ${arg}`);
    }
  }

  return opts;
}
|
||||
|
||||
// ============================================================
|
||||
// Dataset loading
|
||||
// ============================================================
|
||||
|
||||
/**
 * Load SWE-bench tasks from a JSONL file (one JSON object per line).
 *
 * Blank and whitespace-only lines are skipped. Exits with status 1 when the
 * file does not exist.
 *
 * @param path Path of the dataset file.
 * @returns The parsed tasks in file order.
 * @throws Error naming the file and line number when a line is not valid
 *   JSON or lacks one of the string fields the runner requires.
 */
function loadDataset(path: string): SWEBenchTask[] {
  if (!existsSync(path)) {
    console.error(`Dataset not found: ${path}`);
    console.error("Run: python scripts/swe-bench/download-dataset.py");
    process.exit(1);
  }

  const requiredFields = ["instance_id", "repo", "base_commit", "problem_statement"];
  const tasks: SWEBenchTask[] = [];
  const rawLines = readFileSync(path, "utf-8").split("\n");

  rawLines.forEach((rawLine, index) => {
    const text = rawLine.trim();
    // JSON.parse throws on whitespace-only input, so skip such lines.
    if (text.length === 0) return;

    const where = `${path}:${index + 1}`;
    let parsed: unknown;
    try {
      parsed = JSON.parse(text);
    } catch (err) {
      const reason = err instanceof Error ? err.message : String(err);
      throw new Error(`${where}: invalid JSON (${reason})`);
    }

    if (typeof parsed !== "object" || parsed === null || Array.isArray(parsed)) {
      throw new Error(`${where}: expected a JSON object`);
    }
    const record = parsed as Record<string, unknown>;
    for (const field of requiredFields) {
      if (typeof record[field] !== "string") {
        throw new Error(`${where}: missing or non-string field "${field}"`);
      }
    }

    // The required fields were checked above; optional fields are passed through.
    tasks.push(parsed as SWEBenchTask);
  });

  return tasks;
}
|
||||
|
||||
// ============================================================
|
||||
// Repository setup
|
||||
// ============================================================
|
||||
|
||||
/**
 * Prepare a checkout of the task's repository at its base commit.
 *
 * Reuses `<workdir>/<instance_id>` when it already holds a git checkout
 * (force-checkout plus `git clean -fdx`), otherwise clones the repository
 * from GitHub and checks out the base commit.
 *
 * @param task Task whose `repo`, `base_commit` and `instance_id` are used.
 * @param workdir Directory under which per-instance checkouts live.
 * @returns Path of the prepared repository directory.
 * @throws Error when a task field is not safe to pass to the shell, or when
 *   a git command fails or times out.
 */
function setupRepo(task: SWEBenchTask, workdir: string): string {
  // These values come from the dataset file and are interpolated into shell
  // commands below, so reject anything that is not a plain identifier.
  const dirName = task.instance_id.replace(/\//g, "__");
  if (!/^[\w.-]+$/.test(dirName) || dirName === "." || dirName === "..") {
    throw new Error(`Unsafe instance_id: ${task.instance_id}`);
  }
  if (!/^[0-9a-fA-F]{7,64}$/.test(task.base_commit)) {
    throw new Error(`Invalid base_commit for ${task.instance_id}: ${task.base_commit}`);
  }
  if (!/^[\w.-]+\/[\w.-]+$/.test(task.repo)) {
    throw new Error(`Invalid repo for ${task.instance_id}: ${task.repo}`);
  }

  const repoDir = join(workdir, dirName);

  // Require the directory's own .git entry. A bare directory nested inside
  // another repository would otherwise make the destructive checkout/clean
  // below operate on the enclosing repository.
  if (existsSync(join(repoDir, ".git"))) {
    // Reset existing repo to base commit
    log(` Resetting existing repo to ${task.base_commit.slice(0, 8)}...`);
    execSync(`git checkout -f ${task.base_commit} && git clean -fdx`, {
      cwd: repoDir,
      stdio: "pipe",
      timeout: 60_000,
    });
  } else {
    // Clone from GitHub
    const repoUrl = `https://github.com/${task.repo}.git`;
    log(` Cloning ${task.repo}...`);
    mkdirSync(workdir, { recursive: true });
    execSync(`git clone --quiet "${repoUrl}" "${repoDir}"`, {
      stdio: "pipe",
      timeout: 120_000,
    });
    execSync(`git checkout -f ${task.base_commit}`, {
      cwd: repoDir,
      stdio: "pipe",
      timeout: 30_000,
    });
  }

  return repoDir;
}
|
||||
|
||||
// ============================================================
|
||||
// System prompt
|
||||
// ============================================================
|
||||
|
||||
/**
 * Build the system prompt for a task: fixed instructions and rules followed
 * by a line naming the repository and the first 12 characters of the base
 * commit.
 */
function buildSystemPrompt(task: SWEBenchTask): string {
  const shortCommit = task.base_commit.slice(0, 12);

  const promptLines = [
    "You are an expert software engineer tasked with fixing a bug in an open-source repository.",
    "",
    "## Instructions",
    "",
    "1. Read the issue description carefully and understand the problem.",
    "2. Explore the repository to find the relevant source code.",
    "3. Identify the root cause of the issue.",
    "4. Make the minimal set of changes to fix the issue. Do NOT add tests.",
    "5. After making changes, verify your fix makes sense.",
    "",
    "## Important Rules",
    "",
    "- Make ONLY the changes necessary to fix the described issue.",
    "- Do NOT modify or add any test files.",
    "- Do NOT add comments explaining the fix unless the code is non-obvious.",
    "- Do NOT refactor unrelated code.",
    "- Keep changes minimal and focused.",
    "",
    "## Repository",
    "",
    "This is the `" + task.repo + "` repository checked out at commit `" + shortCommit + "`.",
  ];

  return promptLines.join("\n");
}
|
||||
|
||||
/**
 * Build the user prompt: the issue text, an optional hints section, and a
 * closing reminder, separated by blank lines.
 */
function buildPrompt(task: SWEBenchTask): string {
  const sections = [`## Issue\n\n${task.problem_statement}`];

  // The hints section is omitted when hints are absent or empty.
  if (task.hints_text) {
    sections.push(`## Hints\n\n${task.hints_text}`);
  }

  sections.push("Please fix this issue. Remember: make minimal changes, do not modify tests.");
  return sections.join("\n\n");
}
|
||||
|
||||
// ============================================================
|
||||
// Run a single task
|
||||
// ============================================================
|
||||
|
||||
/**
 * Collect the working-tree changes of `repoDir` relative to the base commit.
 *
 * Diffing against the commit (instead of a bare `git diff`) also picks up
 * changes that were staged or committed. Files that were never added to git
 * are not included. Returns an empty string when git fails.
 */
function collectPatch(repoDir: string, baseCommit: string): string {
  // Only interpolate a plain hex hash into the command; otherwise use HEAD.
  const base = /^[0-9a-fA-F]{7,64}$/.test(baseCommit) ? baseCommit : "HEAD";
  try {
    return execSync(`git diff ${base} --`, {
      cwd: repoDir,
      encoding: "utf-8",
      maxBuffer: 10 * 1024 * 1024, // 10MB
      timeout: 10_000,
    });
  } catch (err) {
    log(` Failed to collect patch: ${err instanceof Error ? err.message : String(err)}`);
    return "";
  }
}

/**
 * Run the agent on a single task and collect the resulting patch.
 *
 * Never rejects: repository setup failures, agent errors and timeouts are
 * all reported through the returned result's `error` field so one bad task
 * cannot abort the whole run.
 *
 * @param task Task to solve.
 * @param opts Run options (provider, model, workdir, timeout, debug).
 * @returns Outcome record; `success` is true only when the agent finished
 *   without throwing and a non-empty patch was collected.
 */
async function runTask(
  task: SWEBenchTask,
  opts: RunOptions,
): Promise<RunResult> {
  const start = Date.now();
  let repoDir: string | undefined;
  let sessionId = "";
  let timer: ReturnType<typeof setTimeout> | undefined;

  try {
    // Setup repo
    repoDir = setupRepo(task, opts.workdir);

    // Create agent
    const agentOptions: AgentOptions = {
      provider: opts.provider,
      model: opts.model,
      cwd: repoDir,
      enableRunLog: true,
      debug: opts.debug,
      systemPrompt: buildSystemPrompt(task),
      enableSkills: false,
      tools: {
        // Only allow coding tools — no web, no cron, no sessions
        deny: ["web_fetch", "web_search", "cron", "data", "sessions_spawn", "sessions_list", "memory_search", "send_file"],
      },
    };

    const agent = new Agent(agentOptions);
    sessionId = agent.sessionId;

    log(` Session: ${sessionId}`);

    // Run agent with timeout.
    // NOTE(review): on timeout the agent run is abandoned, not cancelled —
    // confirm whether Agent exposes an abort/stop API to call here.
    const timeout = new Promise<never>((_, reject) => {
      timer = setTimeout(() => reject(new Error("timeout")), opts.timeout);
    });
    const result = await Promise.race([agent.run(buildPrompt(task)), timeout]);

    // Collect the git diff (the patch)
    const patch = collectPatch(repoDir, task.base_commit);

    return {
      instance_id: task.instance_id,
      success: patch.length > 0,
      patch,
      error: result.error,
      duration_ms: Date.now() - start,
      session_id: sessionId,
    };
  } catch (err) {
    // Collect any partial patch (nothing to collect if setup itself failed)
    const patch = repoDir === undefined ? "" : collectPatch(repoDir, task.base_commit);

    return {
      instance_id: task.instance_id,
      success: false,
      patch,
      error: err instanceof Error ? err.message : String(err),
      duration_ms: Date.now() - start,
      session_id: sessionId,
    };
  } finally {
    // A pending timer keeps the Node process alive until it fires, so clear
    // it as soon as the race has settled.
    if (timer !== undefined) clearTimeout(timer);
  }
}
|
||||
|
||||
// ============================================================
|
||||
// Logging
|
||||
// ============================================================
|
||||
|
||||
/** Write a message to stderr prefixed with the current UTC time as `[HH:MM:SS]`. */
function log(msg: string) {
  // Characters 11..18 of an ISO-8601 timestamp are the HH:MM:SS part.
  const clock = new Date().toISOString().substring(11, 19);
  console.error("[" + clock + "] " + msg);
}
|
||||
|
||||
// ============================================================
|
||||
// Main
|
||||
// ============================================================
|
||||
|
||||
/**
 * Entry point: parse options, load and filter the dataset, run every task
 * sequentially, and append one prediction line and one detailed result line
 * per task.
 *
 * Both output files are appended to, so content from earlier runs is kept.
 * Exits with status 1 when `--instance` names an ID that is not in the
 * dataset.
 */
async function main() {
  const opts = parseArgs();

  log("SWE-bench Runner for Multica");
  log(`Provider: ${opts.provider}${opts.model ? ` (${opts.model})` : ""}`);
  log(`Dataset: ${opts.dataset}`);
  log(`Work dir: ${opts.workdir}`);
  log(`Timeout: ${opts.timeout / 1000}s per task`);

  // Set SMC_DATA_DIR for isolation
  if (!process.env.SMC_DATA_DIR) {
    process.env.SMC_DATA_DIR = join(process.env.HOME || "~", ".swe-bench-eval");
    log(`SMC_DATA_DIR: ${process.env.SMC_DATA_DIR}`);
  }

  // Load dataset
  let tasks = loadDataset(resolve(opts.dataset));
  log(`Loaded ${tasks.length} tasks`);

  // Filter by instance ID if specified
  if (opts.instance) {
    tasks = tasks.filter((t) => t.instance_id === opts.instance);
    if (tasks.length === 0) {
      console.error(`Instance not found: ${opts.instance}`);
      process.exit(1);
    }
  }

  // Apply offset and limit
  if (opts.offset > 0) {
    tasks = tasks.slice(opts.offset);
  }
  if (opts.limit > 0) {
    tasks = tasks.slice(0, opts.limit);
  }

  log(`Running ${tasks.length} tasks`);

  // Prepare output
  const outputPath = resolve(opts.output);
  // Derive the results path from the file extension only. A plain
  // String.replace would rewrite the first ".jsonl" anywhere in the path and,
  // for an output without that suffix, leave both paths identical so that
  // predictions and results end up interleaved in one file.
  const suffix = ".jsonl";
  const resultsPath = outputPath.endsWith(suffix)
    ? `${outputPath.slice(0, -suffix.length)}.results.jsonl`
    : `${outputPath}.results.jsonl`;
  // Make sure the output directory exists before the first append.
  mkdirSync(resolve(outputPath, ".."), { recursive: true });

  // Run tasks sequentially
  const modelName = `multica-${opts.provider}${opts.model ? `-${opts.model}` : ""}`;
  let completed = 0;
  let succeeded = 0;

  for (const task of tasks) {
    completed++;
    log(`\n[${completed}/${tasks.length}] ${task.instance_id}`);

    const result = await runTask(task, opts);

    if (result.success) succeeded++;

    // Write prediction in SWE-bench format
    const prediction: Prediction = {
      instance_id: result.instance_id,
      model_patch: result.patch,
      model_name_or_path: modelName,
    };
    appendFileSync(outputPath, JSON.stringify(prediction) + "\n");

    // Write detailed result
    appendFileSync(resultsPath, JSON.stringify(result) + "\n");

    const status = result.success ? "PATCHED" : "NO_PATCH";
    const errorInfo = result.error ? ` (${result.error})` : "";
    log(
      ` ${status} | ${(result.duration_ms / 1000).toFixed(1)}s | patch=${result.patch.length} bytes${errorInfo}`,
    );
  }

  log(`\n========================================`);
  log(`Results: ${succeeded}/${completed} tasks produced patches`);
  log(`Predictions: ${outputPath}`);
  log(`Details: ${resultsPath}`);
  log(`\nTo evaluate with SWE-bench harness:`);
  log(
    ` python -m swebench.harness.run_evaluation --dataset_name princeton-nlp/SWE-bench_Lite --predictions_path ${outputPath} --max_workers 4 --run_id multica`,
  );
}
|
||||
|
||||
main().catch((err) => {
|
||||
console.error("Fatal error:", err);
|
||||
process.exit(1);
|
||||
});
|
||||
Loading…
Add table
Add a link
Reference in a new issue