multica/scripts/swe-bench/evaluate.sh
Jiayuan Zhang 90d374ffd5 feat(scripts): add SWE-bench runner for Multica agent evaluation
- download-dataset.py: fetches SWE-bench Lite/Verified/Full from HuggingFace
- run.ts: core runner that clones repos, runs Agent, collects git diff patches
- evaluate.sh: wrapper for official SWE-bench Docker evaluation harness
- analyze.ts: summarizes run results with per-repo and timing breakdowns

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-15 18:05:17 +08:00

68 lines
1.8 KiB
Bash
Executable file

#!/usr/bin/env bash
#
# Evaluate Multica predictions against SWE-bench using the official Docker harness.
#
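# Prerequisites:
#   pip install swebench
#   Docker running with at least 120GB storage, 16GB RAM, 8 CPU cores
#
# Usage:
#   bash scripts/swe-bench/evaluate.sh [predictions.jsonl] [dataset] [run_id]
#
#   predictions.jsonl  path to the predictions file
#                      (default: scripts/swe-bench/predictions.jsonl)
#   dataset            lite | verified | full, or any other value, which is
#                      passed through as the dataset name (default: lite)
#   run_id             identifier passed to the harness (default: multica)
#
# Examples:
#   bash scripts/swe-bench/evaluate.sh
#   bash scripts/swe-bench/evaluate.sh scripts/swe-bench/predictions.jsonl lite multica-v1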
# Strict mode: stop on any failing command, on use of an unset variable, and
# when any stage of a pipeline fails.
set -o errexit -o nounset -o pipefail

# Positional arguments; each falls back to its default when missing or empty.
PREDICTIONS=${1:-scripts/swe-bench/predictions.jsonl}
DATASET=${2:-lite}
RUN_ID=${3:-multica}

# Expand the short aliases to their HuggingFace dataset names. Any value that
# is not a known alias is used verbatim as the dataset name.
if [[ $DATASET == "lite" ]]; then
  DATASET_NAME="princeton-nlp/SWE-bench_Lite"
elif [[ $DATASET == "verified" ]]; then
  DATASET_NAME="princeton-nlp/SWE-bench_Verified"
elif [[ $DATASET == "full" ]]; then
  DATASET_NAME="princeton-nlp/SWE-bench"
else
  DATASET_NAME=$DATASET
fi
# Print a summary of the run, then make sure the predictions file exists and
# report how many prediction records it holds.
echo "=== SWE-bench Evaluation ==="
echo "Predictions: $PREDICTIONS"
echo "Dataset: $DATASET_NAME"
echo "Run ID: $RUN_ID"
echo ""

if [[ ! -f "$PREDICTIONS" ]]; then
  # Diagnostics belong on stderr so they stay visible when stdout is
  # redirected or captured.
  echo "Error: Predictions file not found: $PREDICTIONS" >&2
  exit 1
fi

# Count non-blank lines rather than newline characters: `wc -l` misses a final
# record that lacks a trailing newline and counts empty lines as tasks.
# The file is fed through stdin so that a path starting with '-' or containing
# '=' is never interpreted by awk as an option or a variable assignment.
TASK_COUNT=$(awk 'NF { n++ } END { print n + 0 }' < "$PREDICTIONS")
echo "Tasks to evaluate: $TASK_COUNT"
echo ""
# Preflight checks, then run the SWE-bench evaluation harness.
#
# Optional environment overrides:
#   PYTHON_BIN   Python interpreter to use (default: python, or python3 when
#                no `python` executable is on PATH)
#   MAX_WORKERS  value passed to --max_workers (default: 4)

# Choose the interpreter. `python` stays the first choice so existing setups
# behave as before; python3 is only a fallback for hosts without `python`.
PYTHON_BIN="${PYTHON_BIN:-}"
if [[ -z "$PYTHON_BIN" ]]; then
  if command -v python >/dev/null 2>&1; then
    PYTHON_BIN=python
  else
    PYTHON_BIN=python3
  fi
fi
if ! command -v "$PYTHON_BIN" >/dev/null 2>&1; then
  echo "Error: Python interpreter not found: $PYTHON_BIN" >&2
  exit 1
fi

# Reject a malformed worker count here instead of letting the harness fail
# later with a less obvious message.
MAX_WORKERS="${MAX_WORKERS:-4}"
if [[ ! "$MAX_WORKERS" =~ ^[1-9][0-9]*$ ]]; then
  echo "Error: MAX_WORKERS must be a positive integer, got: $MAX_WORKERS" >&2
  exit 1
fi

# Check if swebench is installed (the import error itself is intentionally
# silenced; the message below tells the user what to do).
if ! "$PYTHON_BIN" -c "import swebench" 2>/dev/null; then
  echo "Error: swebench not installed. Run: pip install swebench" >&2
  exit 1
fi

# Check if Docker is running
if ! docker info >/dev/null 2>&1; then
  echo "Error: Docker is not running" >&2
  exit 1
fi

echo "Starting evaluation (this may take a while)..."
echo ""

"$PYTHON_BIN" -m swebench.harness.run_evaluation \
  --dataset_name "$DATASET_NAME" \
  --predictions_path "$PREDICTIONS" \
  --max_workers "$MAX_WORKERS" \
  --run_id "$RUN_ID"

echo ""
echo "=== Evaluation Complete ==="
echo "Check logs/ and evaluation_results/ for detailed results."