- download-dataset.py: fetches SWE-bench Lite/Verified/Full from HuggingFace - run.ts: core runner that clones repos, runs Agent, collects git diff patches - evaluate.sh: wrapper for official SWE-bench Docker evaluation harness - analyze.ts: summarizes run results with per-repo and timing breakdowns Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
68 lines
1.8 KiB
Bash
Executable file
68 lines
1.8 KiB
Bash
Executable file
#!/usr/bin/env bash
|
|
#
|
|
# Evaluate Multica predictions against SWE-bench using the official Docker harness.
|
|
#
|
|
# Prerequisites:
|
|
# pip install swebench
|
|
# Docker running with at least 120GB storage, 16GB RAM, 8 CPU cores
|
|
#
|
|
# Usage:
|
|
# bash scripts/swe-bench/evaluate.sh [predictions.jsonl] [dataset] [run_id]
|
|
#
|
|
# Examples:
|
|
# bash scripts/swe-bench/evaluate.sh
|
|
# bash scripts/swe-bench/evaluate.sh scripts/swe-bench/predictions.jsonl lite multica-v1
|
|
|
|
set -euo pipefail
|
|
|
|
PREDICTIONS="${1:-scripts/swe-bench/predictions.jsonl}"
|
|
DATASET="${2:-lite}"
|
|
RUN_ID="${3:-multica}"
|
|
|
|
# Map short names to HuggingFace dataset names
|
|
case "$DATASET" in
|
|
lite) DATASET_NAME="princeton-nlp/SWE-bench_Lite" ;;
|
|
verified) DATASET_NAME="princeton-nlp/SWE-bench_Verified" ;;
|
|
full) DATASET_NAME="princeton-nlp/SWE-bench" ;;
|
|
*) DATASET_NAME="$DATASET" ;;
|
|
esac
|
|
|
|
echo "=== SWE-bench Evaluation ==="
|
|
echo "Predictions: $PREDICTIONS"
|
|
echo "Dataset: $DATASET_NAME"
|
|
echo "Run ID: $RUN_ID"
|
|
echo ""
|
|
|
|
if [ ! -f "$PREDICTIONS" ]; then
|
|
echo "Error: Predictions file not found: $PREDICTIONS"
|
|
exit 1
|
|
fi
|
|
|
|
TASK_COUNT=$(wc -l < "$PREDICTIONS" | tr -d ' ')
|
|
echo "Tasks to evaluate: $TASK_COUNT"
|
|
echo ""
|
|
|
|
# Check if swebench is installed
|
|
if ! python -c "import swebench" 2>/dev/null; then
|
|
echo "Error: swebench not installed. Run: pip install swebench"
|
|
exit 1
|
|
fi
|
|
|
|
# Check if Docker is running
|
|
if ! docker info >/dev/null 2>&1; then
|
|
echo "Error: Docker is not running"
|
|
exit 1
|
|
fi
|
|
|
|
echo "Starting evaluation (this may take a while)..."
|
|
echo ""
|
|
|
|
python -m swebench.harness.run_evaluation \
|
|
--dataset_name "$DATASET_NAME" \
|
|
--predictions_path "$PREDICTIONS" \
|
|
--max_workers 4 \
|
|
--run_id "$RUN_ID"
|
|
|
|
echo ""
|
|
echo "=== Evaluation Complete ==="
|
|
echo "Check logs/ and evaluation_results/ for detailed results."
|