- download-dataset.py: fetches SWE-bench Lite/Verified/Full from HuggingFace
- run.ts: core runner that clones repos, runs Agent, collects git diff patches
- evaluate.sh: wrapper for official SWE-bench Docker evaluation harness
- analyze.ts: summarizes run results with per-repo and timing breakdowns

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
100 lines
2.7 KiB
Python
Executable file
100 lines
2.7 KiB
Python
Executable file
#!/usr/bin/env python3
|
|
"""
Download SWE-bench dataset from HuggingFace and export to JSONL for the Node.js runner.

Usage:
    pip install datasets
    python scripts/swe-bench/download-dataset.py [--dataset verified|lite|full] [--limit N] [--split NAME] [--output PATH]

Output format (one JSON object per line; pretty-printed here for readability):
    {
      "instance_id": "django__django-16379",
      "repo": "django/django",
      "base_commit": "abc123...",
      "problem_statement": "...",
      "hints_text": "...",
      "patch": "...",           # gold patch (for reference, not shown to agent)
      "test_patch": "...",      # test patch applied during evaluation
      "version": "4.2",
      "environment_setup_commit": "..."
    }
"""
|
|
|
|
import argparse
import json
import sys
from pathlib import Path
|
|
|
|
# Maps each --dataset CLI choice to the HuggingFace dataset name that is
# passed to load_dataset().
DATASET_MAP = {
    "verified": "princeton-nlp/SWE-bench_Verified",
    "lite": "princeton-nlp/SWE-bench_Lite",
    "full": "princeton-nlp/SWE-bench",
}
|
|
|
|
|
|
def _write_jsonl(rows, output_path, fields, limit=0):
    """Write the selected fields of each row to ``output_path`` as JSONL.

    Args:
        rows: Iterable of mapping-like records.
        output_path: Destination file path; missing parent directories are
            created.
        fields: Names of the keys to copy from each row. Keys absent from a
            row are skipped rather than raising.
        limit: Maximum number of records to write; 0 means no limit.

    Returns:
        The number of records written.
    """
    out = Path(output_path)
    # Create the target directory up front so a (potentially long) download
    # is not wasted on a FileNotFoundError at write time.
    out.parent.mkdir(parents=True, exist_ok=True)

    count = 0
    # ensure_ascii=False emits raw non-ASCII characters, so the encoding must
    # be pinned to UTF-8 instead of relying on the platform default.
    # newline="\n" keeps the line terminator identical on every platform.
    with out.open("w", encoding="utf-8", newline="\n") as f:
        for item in rows:
            record = {field: item[field] for field in fields if field in item}
            f.write(json.dumps(record, ensure_ascii=False) + "\n")
            count += 1
            if limit and count >= limit:
                break
    return count


def main():
    """Command-line entry point: download a SWE-bench variant and write JSONL.

    Parses ``--dataset``, ``--limit``, ``--output`` and ``--split``, loads the
    chosen dataset via ``datasets.load_dataset``, and writes one JSON object
    per instance containing only the fields listed in ``keep_fields``.
    Progress messages go to stderr.

    Exits with status 1 if the ``datasets`` package is not installed, and
    with an argparse usage error if ``--limit`` is negative.
    """
    parser = argparse.ArgumentParser(description="Download SWE-bench dataset to JSONL")
    parser.add_argument(
        "--dataset",
        # Derived from DATASET_MAP so the CLI choices cannot drift from it.
        choices=list(DATASET_MAP),
        default="lite",
        help="Dataset variant (default: lite)",
    )
    parser.add_argument(
        "--limit", type=int, default=0, help="Limit number of instances (0 = all)"
    )
    parser.add_argument(
        "--output",
        type=str,
        default=None,
        help="Output JSONL path (default: scripts/swe-bench/<dataset>.jsonl)",
    )
    parser.add_argument(
        "--split",
        type=str,
        default="test",
        help="Dataset split (default: test)",
    )
    args = parser.parse_args()

    # A negative limit previously fell through to the ">= limit" check and
    # silently produced a one-record file; reject it explicitly instead.
    if args.limit < 0:
        parser.error("--limit must be >= 0 (0 = all)")

    # Imported lazily so a missing optional dependency yields a friendly
    # message instead of a traceback at module import time.
    try:
        from datasets import load_dataset
    except ImportError:
        print("Error: 'datasets' package not installed. Run: pip install datasets", file=sys.stderr)
        sys.exit(1)

    dataset_name = DATASET_MAP[args.dataset]
    output_path = args.output or f"scripts/swe-bench/{args.dataset}.jsonl"

    print(f"Downloading {dataset_name} (split={args.split})...", file=sys.stderr)
    ds = load_dataset(dataset_name, split=args.split)

    # Fields to keep
    keep_fields = (
        "instance_id",
        "repo",
        "base_commit",
        "problem_statement",
        "hints_text",
        "patch",
        "test_patch",
        "version",
        "environment_setup_commit",
    )

    count = _write_jsonl(ds, output_path, keep_fields, args.limit)

    print(f"Wrote {count} instances to {output_path}", file=sys.stderr)
|
|
|
|
|
|
# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|