Delete sessions-spawn.ts, sessions-list.ts and their tests. Update CLI to remove waitForSubagents polling workaround (delegate is synchronous). Update UI, desktop IPC, SWE-bench, and system prompt tests to use the new delegate tool name. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
392 lines
11 KiB
TypeScript
392 lines
11 KiB
TypeScript
#!/usr/bin/env tsx
|
|
/**
 * SWE-bench Runner for Multica
 *
 * Runs the Multica agent against SWE-bench task instances and collects patches.
 *
 * Usage:
 *   tsx scripts/swe-bench/run.ts [options]
 *
 * Options:
 *   --dataset PATH    Path to JSONL dataset (default: scripts/swe-bench/lite.jsonl)
 *   --provider NAME   LLM provider (default: kimi-coding)
 *   --model NAME      Model name
 *   --limit N         Max tasks to run (default: all)
 *   --offset N        Skip first N tasks (default: 0)
 *   --output PATH     Output predictions JSONL (default: scripts/swe-bench/predictions.jsonl)
 *   --workdir PATH    Working directory for repos (default: /tmp/swe-bench)
 *   --timeout MS      Timeout per task in ms (default: 300000 = 5min)
 *   --instance ID     Run a single instance by ID
 *   --debug           Enable debug logging
 */
|
|
|
|
import { readFileSync, writeFileSync, appendFileSync, existsSync, mkdirSync } from "node:fs";
import { homedir } from "node:os";
import { dirname, join, resolve } from "node:path";
import { execFileSync, execSync, spawn } from "node:child_process";
import { Agent } from "@multica/core";
import type { AgentOptions } from "@multica/core";
|
|
|
|
// ============================================================
|
|
// Types
|
|
// ============================================================
|
|
|
|
/**
 * One task record as parsed from a line of the JSONL dataset.
 */
interface SWEBenchTask {
  /** Unique task identifier; also used (with `/` replaced) as the checkout directory name. */
  instance_id: string;
  /** GitHub `owner/name` slug the repository is cloned from. */
  repo: string;
  /** Commit the repository is checked out at before the agent runs. */
  base_commit: string;
  /** Issue text placed under the "## Issue" heading of the agent prompt. */
  problem_statement: string;
  /** Optional hints; appended under "## Hints" when non-empty. */
  hints_text?: string;
  /** Not read by this script — presumably the reference fix from the dataset (TODO confirm). */
  patch?: string;
  /** Not read by this script — presumably the dataset's test changes (TODO confirm). */
  test_patch?: string;
  /** Not read by this script. */
  version?: string;
  /** Not read by this script. */
  environment_setup_commit?: string;
}
|
|
|
|
/**
 * One line of the predictions JSONL file written by `main`.
 */
interface Prediction {
  /** Identifier of the task this prediction belongs to. */
  instance_id: string;
  /** Diff text collected from the task's working tree (may be empty). */
  model_patch: string;
  /** Label built in `main` from the provider and optional model name. */
  model_name_or_path: string;
}
|
|
|
|
/**
 * Detailed outcome of running the agent on a single task; one of these is
 * appended per task to the `.results.jsonl` file.
 */
interface RunResult {
  /** Identifier of the task that was run. */
  instance_id: string;
  /** True when the run finished without throwing and produced a non-empty patch. */
  success: boolean;
  /** Diff collected from the working tree; may be a partial patch after a failure. */
  patch: string;
  /** Error reported by the agent result, or the message of a thrown error (e.g. "timeout"). */
  error?: string;
  /** Wall-clock time spent on the task, in milliseconds. */
  duration_ms: number;
  /** Session id of the agent instance that handled the task. */
  session_id: string;
}
|
|
|
|
// ============================================================
|
|
// CLI argument parsing
|
|
// ============================================================
|
|
|
|
/**
 * Settings for a benchmark run, filled in by `parseArgs` from the command line.
 */
interface RunOptions {
  /** Path to the JSONL dataset (`--dataset`). */
  dataset: string;
  /** LLM provider name handed to the agent (`--provider`). */
  provider: string;
  /** Optional model name handed to the agent (`--model`). */
  model?: string;
  /** Maximum number of tasks to run; values <= 0 mean no limit (`--limit`). */
  limit: number;
  /** Number of tasks to skip from the start of the dataset (`--offset`). */
  offset: number;
  /** Path of the predictions JSONL file (`--output`). */
  output: string;
  /** Directory under which task repositories are checked out (`--workdir`). */
  workdir: string;
  /** Per-task time budget in milliseconds (`--timeout`). */
  timeout: number;
  /** When set, only the task with this instance id is run (`--instance`). */
  instance?: string;
  /** Passed through to the agent's `debug` option (`--debug`). */
  debug: boolean;
}
|
|
|
|
/**
 * Parses the command-line flags in `process.argv` into a `RunOptions` object.
 *
 * Flags that are not supplied keep the defaults initialised below. The process
 * exits with status 1 (after printing a message to stderr) when:
 *   - an unknown flag is given,
 *   - a flag that takes a value is the last argument,
 *   - `--limit` / `--offset` / `--timeout` is not an integer, or
 *   - `--timeout` is not positive.
 *
 * @returns The fully populated run options.
 */
function parseArgs(): RunOptions {
  const args = process.argv.slice(2);
  const opts: RunOptions = {
    dataset: "scripts/swe-bench/lite.jsonl",
    provider: "kimi-coding",
    limit: 0, // 0 = no limit
    offset: 0,
    output: "scripts/swe-bench/predictions.jsonl",
    workdir: "/tmp/swe-bench",
    timeout: 300_000, // 5 minutes
    debug: false,
  };

  // Print a usage error and terminate; typed `never` so callers narrow correctly.
  const fail = (message: string): never => {
    console.error(message);
    process.exit(1);
  };

  let i = 0;

  // Consume the argument following the current flag, rejecting a missing value
  // instead of silently storing `undefined`.
  const takeValue = (flag: string): string => {
    const value = args[++i];
    if (value === undefined) return fail(`Missing value for ${flag}`);
    return value;
  };

  // Consume an integer value. NaN is rejected because it would otherwise
  // disable --limit/--offset silently and make --timeout fire immediately.
  const takeInt = (flag: string): number => {
    const raw = takeValue(flag);
    const value = parseInt(raw, 10);
    if (Number.isNaN(value)) return fail(`Invalid integer for ${flag}: ${raw}`);
    return value;
  };

  for (; i < args.length; i++) {
    const arg = args[i];
    if (arg === undefined) break;

    if (arg === "--dataset") opts.dataset = takeValue(arg);
    else if (arg === "--provider") opts.provider = takeValue(arg);
    else if (arg === "--model") opts.model = takeValue(arg);
    else if (arg === "--limit") opts.limit = takeInt(arg);
    else if (arg === "--offset") opts.offset = takeInt(arg);
    else if (arg === "--output") opts.output = takeValue(arg);
    else if (arg === "--workdir") opts.workdir = takeValue(arg);
    else if (arg === "--timeout") {
      opts.timeout = takeInt(arg);
      if (opts.timeout <= 0) fail(`--timeout must be a positive number of milliseconds: ${opts.timeout}`);
    } else if (arg === "--instance") opts.instance = takeValue(arg);
    else if (arg === "--debug") opts.debug = true;
    else {
      console.error(`Unknown argument: ${arg}`);
      process.exit(1);
    }
  }

  return opts;
}
|
|
|
|
// ============================================================
|
|
// Dataset loading
|
|
// ============================================================
|
|
|
|
/**
 * Reads a JSONL dataset file and returns one task per non-blank line.
 *
 * Lines are trimmed before parsing, so CRLF line endings, whitespace-only
 * lines and a leading byte-order mark do not break parsing.
 *
 * @param path Path to the JSONL file.
 * @returns The parsed tasks, in file order. Records are cast to `SWEBenchTask`
 *   without schema validation.
 * @throws SyntaxError naming the file and 1-based line number when a line is
 *   not valid JSON.
 *
 * Exits the process with status 1 when the file does not exist.
 */
function loadDataset(path: string): SWEBenchTask[] {
  if (!existsSync(path)) {
    console.error(`Dataset not found: ${path}`);
    console.error("Run: python scripts/swe-bench/download-dataset.py");
    process.exit(1);
  }

  const tasks: SWEBenchTask[] = [];
  const rawLines = readFileSync(path, "utf-8").split("\n");

  rawLines.forEach((rawLine, index) => {
    const line = rawLine.trim();
    if (line === "") return; // skip blank / whitespace-only lines

    try {
      tasks.push(JSON.parse(line) as SWEBenchTask);
    } catch (err) {
      // Re-throw with file and line context so a bad record is easy to locate.
      const reason = err instanceof Error ? err.message : String(err);
      throw new SyntaxError(`${path}:${index + 1}: invalid JSON (${reason})`);
    }
  });

  return tasks;
}
|
|
|
|
// ============================================================
|
|
// Repository setup
|
|
// ============================================================
|
|
|
|
/**
 * Prepares a working copy of the task's repository at its base commit.
 *
 * If the per-task directory already exists it is force-checked-out to the base
 * commit and cleaned of untracked/ignored files; otherwise the repository is
 * cloned from GitHub and then checked out.
 *
 * Git is invoked through `execFileSync` with an argument array (no shell), so
 * dataset-provided values such as `base_commit` and `repo`, and paths that
 * contain spaces or quotes, cannot be interpreted as shell syntax.
 *
 * @param task    Task whose `repo`, `base_commit` and `instance_id` are used.
 * @param workdir Parent directory for per-task checkouts; created if missing.
 * @returns Path of the prepared repository directory.
 * @throws If any git command fails or exceeds its timeout.
 */
function setupRepo(task: SWEBenchTask, workdir: string): string {
  // Instance ids may contain "/", which must not create nested directories.
  const repoDir = join(workdir, task.instance_id.replace(/\//g, "__"));

  // Run one git command silently; throws on non-zero exit or timeout.
  const git = (gitArgs: string[], timeout: number, cwd?: string): void => {
    execFileSync("git", gitArgs, { cwd, stdio: "pipe", timeout });
  };

  if (existsSync(repoDir)) {
    // Reset existing repo to base commit
    log(` Resetting existing repo to ${task.base_commit.slice(0, 8)}...`);
    git(["checkout", "-f", task.base_commit], 60_000, repoDir);
    git(["clean", "-fdx"], 60_000, repoDir);
  } else {
    // Clone from GitHub
    const repoUrl = `https://github.com/${task.repo}.git`;
    log(` Cloning ${task.repo}...`);
    mkdirSync(workdir, { recursive: true });
    git(["clone", "--quiet", repoUrl, repoDir], 120_000);
    git(["checkout", "-f", task.base_commit], 30_000, repoDir);
  }

  return repoDir;
}
|
|
|
|
// ============================================================
|
|
// System prompt
|
|
// ============================================================
|
|
|
|
/**
 * Builds the system prompt given to the agent for a task.
 *
 * The text is fixed apart from the final line, which names the repository and
 * the first 12 characters of the base commit.
 *
 * @param task Task supplying `repo` and `base_commit`.
 * @returns The system prompt as a single newline-joined string.
 */
function buildSystemPrompt(task: SWEBenchTask): string {
  const shortCommit = task.base_commit.slice(0, 12);

  const promptLines = [
    "You are an expert software engineer tasked with fixing a bug in an open-source repository.",
    "",
    "## Instructions",
    "",
    "1. Read the issue description carefully and understand the problem.",
    "2. Explore the repository to find the relevant source code.",
    "3. Identify the root cause of the issue.",
    "4. Make the minimal set of changes to fix the issue. Do NOT add tests.",
    "5. After making changes, verify your fix makes sense.",
    "",
    "## Important Rules",
    "",
    "- Make ONLY the changes necessary to fix the described issue.",
    "- Do NOT modify or add any test files.",
    "- Do NOT add comments explaining the fix unless the code is non-obvious.",
    "- Do NOT refactor unrelated code.",
    "- Keep changes minimal and focused.",
    "",
    "## Repository",
    "",
    "This is the `" + task.repo + "` repository checked out at commit `" + shortCommit + "`.",
  ];

  return promptLines.join("\n");
}
|
|
|
|
/**
 * Builds the user prompt for a task: the issue text, an optional hints
 * section (only when `hints_text` is non-empty), and a closing instruction.
 *
 * @param task Task supplying `problem_statement` and optional `hints_text`.
 * @returns The sections joined by blank lines.
 */
function buildPrompt(task: SWEBenchTask): string {
  const sections: string[] = [`## Issue\n\n${task.problem_statement}`];

  if (task.hints_text) {
    sections.push(`## Hints\n\n${task.hints_text}`);
  }

  sections.push("Please fix this issue. Remember: make minimal changes, do not modify tests.");

  return sections.join("\n\n");
}
|
|
|
|
// ============================================================
|
|
// Run a single task
|
|
// ============================================================
|
|
|
|
/**
 * Collects the changes made in `repoDir` as a diff against `baseCommit`.
 *
 * Diffing against the base commit (rather than a bare `git diff`) also picks up
 * changes that were staged or committed during the run. Untracked files are
 * not part of the diff.
 *
 * @returns The diff text, or "" when git fails (the failure is logged).
 */
function collectPatch(repoDir: string, baseCommit: string): string {
  try {
    return execFileSync("git", ["diff", baseCommit], {
      cwd: repoDir,
      encoding: "utf-8",
      maxBuffer: 10 * 1024 * 1024, // 10MB
      timeout: 10_000,
    });
  } catch (err) {
    // Best-effort: a failed diff is reported as "no patch" rather than aborting the run.
    log(` Failed to collect patch: ${err instanceof Error ? err.message : String(err)}`);
    return "";
  }
}

/**
 * Runs the agent on a single task and collects the resulting patch.
 *
 * Steps: prepare the repository, create an agent restricted to coding tools,
 * run it on the task prompt with a time limit of `opts.timeout` ms, then diff
 * the working tree against the task's base commit.
 *
 * @param task Task to run.
 * @param opts Run options (provider, model, workdir, timeout, debug).
 * @returns A `RunResult`. When the agent run throws or times out, `success` is
 *   false, `error` holds the message, and `patch` holds whatever partial diff
 *   exists.
 * @throws If repository setup or agent construction fails (these happen before
 *   the guarded section).
 */
async function runTask(
  task: SWEBenchTask,
  opts: RunOptions,
): Promise<RunResult> {
  const start = Date.now();

  // Setup repo
  const repoDir = setupRepo(task, opts.workdir);

  // Create agent
  const agentOptions: AgentOptions = {
    provider: opts.provider,
    model: opts.model,
    cwd: repoDir,
    enableRunLog: true,
    debug: opts.debug,
    systemPrompt: buildSystemPrompt(task),
    enableSkills: false,
    tools: {
      // Only allow coding tools — no web, no cron, no sessions
      deny: ["web_fetch", "web_search", "cron", "data", "delegate", "memory_search", "send_file"],
    },
  };

  const agent = new Agent(agentOptions);

  log(` Session: ${agent.sessionId}`);

  // The timer handle is kept so it can be cleared once the race settles;
  // otherwise a pending timer keeps the process alive for up to `opts.timeout`
  // ms after the task (and the whole run) has finished.
  let timer: ReturnType<typeof setTimeout> | undefined;
  const deadline = new Promise<never>((_, reject) => {
    timer = setTimeout(() => reject(new Error("timeout")), opts.timeout);
  });

  try {
    // Run agent with timeout.
    // NOTE(review): on timeout the agent run is abandoned, not cancelled — no
    // cancellation API is visible from this file.
    const result = await Promise.race([agent.run(buildPrompt(task)), deadline]);

    // Collect the git diff (the patch)
    const patch = collectPatch(repoDir, task.base_commit);

    return {
      instance_id: task.instance_id,
      success: patch.length > 0,
      patch,
      error: result.error,
      duration_ms: Date.now() - start,
      session_id: agent.sessionId,
    };
  } catch (err) {
    // Collect any partial patch
    const patch = collectPatch(repoDir, task.base_commit);

    return {
      instance_id: task.instance_id,
      success: false,
      patch,
      error: err instanceof Error ? err.message : String(err),
      duration_ms: Date.now() - start,
      session_id: agent.sessionId,
    };
  } finally {
    clearTimeout(timer);
  }
}
|
|
|
|
// ============================================================
|
|
// Logging
|
|
// ============================================================
|
|
|
|
/**
 * Writes a progress message to stderr, prefixed with the current UTC time as
 * `[HH:MM:SS]`.
 *
 * @param msg Text to print after the timestamp.
 */
function log(msg: string) {
  // Characters 11..18 of an ISO-8601 timestamp are the "HH:MM:SS" portion.
  const clock = new Date().toISOString().substring(11, 19);
  console.error("[" + clock + "] " + msg);
}
|
|
|
|
// ============================================================
|
|
// Main
|
|
// ============================================================
|
|
|
|
/**
 * Derives the detailed-results path from the predictions path.
 *
 * A trailing `.jsonl` is replaced by `.results.jsonl`; any other path gets
 * `.results.jsonl` appended, so the two files can never be the same file.
 */
function deriveResultsPath(outputPath: string): string {
  const suffix = ".jsonl";
  const stem = outputPath.endsWith(suffix) ? outputPath.slice(0, -suffix.length) : outputPath;
  return `${stem}.results.jsonl`;
}

/**
 * Script entry point: parses options, loads and filters the dataset, runs each
 * task sequentially, and appends one prediction line and one detailed result
 * line per task to the output files.
 *
 * Exits the process with status 1 when `--instance` matches no task.
 */
async function main(): Promise<void> {
  const opts = parseArgs();

  log("SWE-bench Runner for Multica");
  log(`Provider: ${opts.provider}${opts.model ? ` (${opts.model})` : ""}`);
  log(`Dataset: ${opts.dataset}`);
  log(`Work dir: ${opts.workdir}`);
  log(`Timeout: ${opts.timeout / 1000}s per task`);

  // Set SMC_DATA_DIR for isolation. Fall back to the OS-reported home
  // directory when HOME is unset, rather than a literal "~" directory.
  if (!process.env.SMC_DATA_DIR) {
    process.env.SMC_DATA_DIR = join(process.env.HOME || homedir(), ".swe-bench-eval");
    log(`SMC_DATA_DIR: ${process.env.SMC_DATA_DIR}`);
  }

  // Load dataset
  let tasks = loadDataset(resolve(opts.dataset));
  log(`Loaded ${tasks.length} tasks`);

  // Filter by instance ID if specified
  if (opts.instance) {
    tasks = tasks.filter((t) => t.instance_id === opts.instance);
    if (tasks.length === 0) {
      console.error(`Instance not found: ${opts.instance}`);
      process.exit(1);
    }
  }

  // Apply offset and limit
  if (opts.offset > 0) {
    tasks = tasks.slice(opts.offset);
  }
  if (opts.limit > 0) {
    tasks = tasks.slice(0, opts.limit);
  }

  log(`Running ${tasks.length} tasks`);

  // Prepare output. The directory is created up front so a missing directory
  // is not discovered only after the first task has already run.
  const outputPath = resolve(opts.output);
  const resultsPath = deriveResultsPath(outputPath);
  mkdirSync(dirname(outputPath), { recursive: true });

  // Run tasks sequentially
  const modelName = `multica-${opts.provider}${opts.model ? `-${opts.model}` : ""}`;
  let completed = 0;
  let succeeded = 0;

  for (const task of tasks) {
    completed++;
    log(`\n[${completed}/${tasks.length}] ${task.instance_id}`);

    const result = await runTask(task, opts);

    if (result.success) succeeded++;

    // Write prediction in SWE-bench format
    const prediction: Prediction = {
      instance_id: result.instance_id,
      model_patch: result.patch,
      model_name_or_path: modelName,
    };
    appendFileSync(outputPath, JSON.stringify(prediction) + "\n");

    // Write detailed result
    appendFileSync(resultsPath, JSON.stringify(result) + "\n");

    const status = result.success ? "PATCHED" : "NO_PATCH";
    const errorInfo = result.error ? ` (${result.error})` : "";
    log(
      ` ${status} | ${(result.duration_ms / 1000).toFixed(1)}s | patch=${result.patch.length} bytes${errorInfo}`,
    );
  }

  log(`\n========================================`);
  log(`Results: ${succeeded}/${completed} tasks produced patches`);
  log(`Predictions: ${outputPath}`);
  log(`Details: ${resultsPath}`);
  log(`\nTo evaluate with SWE-bench harness:`);
  log(
    ` python -m swebench.harness.run_evaluation --dataset_name princeton-nlp/SWE-bench_Lite --predictions_path ${outputPath} --max_workers 4 --run_id multica`,
  );
}
|
|
|
|
// Entry point: run the benchmark; any rejection from main() is reported on
// stderr and turned into exit status 1.
main().catch((fatal: unknown) => {
  console.error("Fatal error:", fatal);
  process.exit(1);
});
|