multica/scripts/e2e-skills-benchmark/run.sh
2026-02-17 02:37:29 +08:00

170 lines
5.3 KiB
Bash
Executable file

#!/usr/bin/env bash
#
# End-to-end skills benchmark runner.
#
# Runs every case file found in ./cases against each configured provider via
# `pnpm multica run`, writes one log and one TSV result row per
# (provider, case) pair, merges the rows into a manifest and passes that
# manifest to analyze.mjs.
#
# Environment (all optional):
#   TIMESTAMP         Run identifier; defaults to the current local time.
#   OUT_DIR           Output directory; defaults to
#                     <root>/.context/skills-e2e-runs/<TIMESTAMP>.
#   SMC_DATA_DIR      Exported to the agent; defaults to ~/.super-multica-e2e.
#   MULTICA_API_URL   Exported to the agent.
#   PROVIDERS         Whitespace-separated provider names (default: kimi-coding).
#   CASE_GLOB         find(1) -name pattern for case files (default: case-*.txt).
#   CASE_TIMEOUT_SEC  Per-case timeout in seconds; any value <= 0 disables it.
#   MAX_PARALLEL      Number of concurrent workers; must be a positive integer.
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
ROOT_DIR="$(cd "${SCRIPT_DIR}/../.." && pwd)"
CASES_DIR="${SCRIPT_DIR}/cases"
TIMESTAMP="${TIMESTAMP:-$(date +%Y%m%d-%H%M%S)}"
OUT_DIR="${OUT_DIR:-${ROOT_DIR}/.context/skills-e2e-runs/${TIMESTAMP}}"
RESULTS_DIR="${OUT_DIR}/results"
MANIFEST="${OUT_DIR}/manifest.tsv"

# Required environment for agent-driven E2E.
SMC_DATA_DIR="${SMC_DATA_DIR:-$HOME/.super-multica-e2e}"
MULTICA_API_URL="${MULTICA_API_URL:-https://api-dev.copilothub.ai}"
PROVIDERS_RAW="${PROVIDERS:-kimi-coding}"
CASE_GLOB="${CASE_GLOB:-case-*.txt}"
CASE_TIMEOUT_SEC="${CASE_TIMEOUT_SEC:-1200}"
MAX_PARALLEL="${MAX_PARALLEL:-1}"

# Reject anything that is not an integer up front. Otherwise a typo would only
# surface later inside the worker's arithmetic comparison, once per case.
if ! [[ "${CASE_TIMEOUT_SEC}" =~ ^-?[0-9]+$ ]]; then
  echo "CASE_TIMEOUT_SEC must be an integer (<= 0 disables the timeout), got: ${CASE_TIMEOUT_SEC}" >&2
  exit 1
fi

TIMEOUT_ENABLED="true"
if [[ "${CASE_TIMEOUT_SEC}" == -* ]]; then
  # Negative values disable the timeout, just like 0. Without this branch a
  # negative limit would make every case "time out" on the first poll.
  TIMEOUT_ENABLED="false"
else
  # Force base 10 so zero-padded input such as "090" is not parsed as octal.
  CASE_TIMEOUT_SEC="$((10#${CASE_TIMEOUT_SEC}))"
  if (( CASE_TIMEOUT_SEC == 0 )); then
    TIMEOUT_ENABLED="false"
  fi
fi

# The arithmetic test only runs when the regex matched, so 10# is always
# applied to a plain digit string.
if ! [[ "${MAX_PARALLEL}" =~ ^[0-9]+$ ]] || (( 10#${MAX_PARALLEL} <= 0 )); then
  echo "MAX_PARALLEL must be a positive integer, got: ${MAX_PARALLEL}" >&2
  exit 1
fi
MAX_PARALLEL="$((10#${MAX_PARALLEL}))"
#######################################
# Print the value of the last "[<key>: <value>]" marker found in a log file.
# Implemented with POSIX sed so that parsing does not depend on ripgrep being
# installed on the machine running the benchmark.
# Arguments: $1 - marker key (e.g. "session" or "session-dir")
#            $2 - path of the log file to scan
# Outputs:   the marker value on stdout; nothing when no marker is present
# Returns:   always 0
#######################################
extract_log_marker() {
  local key="$1" log="$2"
  # The greedy leading ".*" selects the right-most marker on each line and
  # "tail -n 1" keeps the last matching line. LC_ALL=C makes sed treat the log
  # as raw bytes so unusual byte sequences cannot abort the scan.
  LC_ALL=C sed -n -E "s/.*\[${key}: ([^]]+)\].*/\1/p" "${log}" 2>/dev/null \
    | tail -n 1 || true
}

#######################################
# Classify the outcome of one case run.
# Arguments: $1 - "true" when the runner killed the case for exceeding the
#                 timeout
#            $2 - exit code of the agent command
#            $3 - path of the log file the command wrote
# Outputs:   "timeout", "failed" or "success" on stdout
#######################################
classify_case_status() {
  local timed_out="$1" exit_code="$2" log="$3"
  if [[ "${timed_out}" == "true" ]]; then
    printf 'timeout\n'
  elif (( exit_code != 0 )) \
    || [[ ! -s "${log}" ]] \
    || ! grep -qF -- '[session: ' "${log}"; then
    # A run counts as successful only if it exited 0, produced output and
    # printed a "[session: ...]" marker.
    printf 'failed\n'
  else
    printf 'success\n'
  fi
}

# Worker mode: `run.sh --worker <provider> <case-file>`.
# Runs one case against one provider, enforces the optional timeout and writes
# a single TSV row describing the outcome to ${RESULTS_DIR}.
if [[ "${1:-}" == "--worker" ]]; then
  provider="${2:?missing provider}"
  case_file="${3:?missing case file}"
  case_base="$(basename -- "${case_file}")"
  case_id="${case_base%.txt}"
  log_file="${OUT_DIR}/${provider}-${case_id}.log"
  result_file="${RESULTS_DIR}/${provider}-${case_id}.tsv"

  # Make sure the output tree exists even when the worker is started by hand
  # rather than by the parent run.
  mkdir -p "${OUT_DIR}" "${RESULTS_DIR}"

  prompt="$(cat "${case_file}")"
  timed_out="false"
  started_epoch="$(date +%s)"
  started_at="$(date -u +%Y-%m-%dT%H:%M:%SZ)"

  # Run the agent in the background so the loop below can enforce the timeout.
  SMC_DATA_DIR="${SMC_DATA_DIR}" \
  MULTICA_API_URL="${MULTICA_API_URL}" \
    pnpm multica run --run-log --provider "${provider}" "${prompt}" > "${log_file}" 2>&1 &
  cmd_pid=$!

  # Poll every 2 seconds until the command exits or the deadline passes.
  while kill -0 "${cmd_pid}" 2>/dev/null; do
    if [[ "${TIMEOUT_ENABLED}" == "true" ]]; then
      now="$(date +%s)"
      elapsed="$((now - started_epoch))"
      if (( elapsed >= CASE_TIMEOUT_SEC )); then
        timed_out="true"
        # Ask politely first, then force-kill after a one second grace period.
        # NOTE(review): only the pnpm process itself is signalled; any
        # processes it spawned are not. Confirm that this is sufficient.
        kill "${cmd_pid}" 2>/dev/null || true
        sleep 1
        kill -9 "${cmd_pid}" 2>/dev/null || true
        break
      fi
    fi
    sleep 2
  done

  # Reap the command and keep its exit status without tripping `set -e`.
  exit_code=0
  wait "${cmd_pid}" 2>/dev/null || exit_code=$?

  ended_epoch="$(date +%s)"
  ended_at="$(date -u +%Y-%m-%dT%H:%M:%SZ)"
  duration_sec="$((ended_epoch - started_epoch))"

  status="$(classify_case_status "${timed_out}" "${exit_code}" "${log_file}")"
  if [[ "${status}" == "timeout" ]]; then
    printf "\n[runner] timed out after %ss\n" "${CASE_TIMEOUT_SEC}" >> "${log_file}"
  fi

  session_id="$(extract_log_marker "session" "${log_file}")"
  session_dir="$(extract_log_marker "session-dir" "${log_file}")"

  # Column order must match the manifest header written by the parent run.
  printf "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" \
    "${TIMESTAMP}" \
    "${provider}" \
    "${case_id}" \
    "${status}" \
    "${session_id}" \
    "${session_dir}" \
    "${log_file}" \
    "${started_at}" \
    "${ended_at}" \
    "${duration_sec}" \
    "${exit_code}" > "${result_file}"

  printf "[worker] provider=%s case=%s status=%s duration=%ss session=%s\n" \
    "${provider}" \
    "${case_id}" \
    "${status}" \
    "${duration_sec}" \
    "${session_id:-N/A}"

  # The outcome is reported through the result row, so the worker itself exits
  # 0 whatever the case status was.
  exit 0
fi
# Parent mode, stage 1: prepare the output tree, resolve providers and case
# files, print the run configuration and build the task list.
mkdir -p "${OUT_DIR}" "${RESULTS_DIR}"

# Manifest header; worker result rows are appended below it later.
printf "timestamp\tprovider\tcase_id\tstatus\tsession_id\tsession_dir\tlog_file\tstarted_at\tended_at\tduration_sec\texit_code\n" > "${MANIFEST}"

# PROVIDERS is a whitespace-separated list. A value made only of whitespace
# yields an empty array, which would otherwise lead to a run with zero tasks.
read -r -a PROVIDERS <<< "${PROVIDERS_RAW}"
if [[ ${#PROVIDERS[@]} -eq 0 ]]; then
  echo "No providers configured; set PROVIDERS to a whitespace-separated list of provider names" >&2
  exit 1
fi

# Collect case files in sorted order.
CASE_FILES=()
while IFS= read -r line; do
  CASE_FILES+=("${line}")
done < <(find "${CASES_DIR}" -maxdepth 1 -type f -name "${CASE_GLOB}" | sort)

if [[ ${#CASE_FILES[@]} -eq 0 ]]; then
  echo "No case files matched ${CASE_GLOB} in ${CASES_DIR}" >&2
  exit 1
fi

echo "Output directory: ${OUT_DIR}"
echo "Using SMC_DATA_DIR=${SMC_DATA_DIR}"
echo "Using MULTICA_API_URL=${MULTICA_API_URL}"
echo "Providers: ${PROVIDERS[*]}"
echo "Cases: ${#CASE_FILES[@]}"
echo "Max parallel: ${MAX_PARALLEL}"
if [[ "${TIMEOUT_ENABLED}" == "true" ]]; then
  echo "Case timeout: ${CASE_TIMEOUT_SEC}s"
else
  echo "Case timeout: disabled"
fi

# TASKS is a flat list of (provider, case file) pairs; it is consumed two
# elements at a time when the workers are launched.
TASKS=()
for provider in "${PROVIDERS[@]}"; do
  for case_file in "${CASE_FILES[@]}"; do
    TASKS+=("${provider}" "${case_file}")
  done
done
echo "Total tasks: $(( ${#TASKS[@]} / 2 ))"
# Parent mode, stage 2: fan the tasks out to workers, merge their result rows
# into the manifest, print a summary and run the structured analysis.

# Workers are separate bash processes, so the settings they read are exported.
export TIMESTAMP OUT_DIR RESULTS_DIR SMC_DATA_DIR MULTICA_API_URL CASE_TIMEOUT_SEC TIMEOUT_ENABLED

# With no tasks, printf would still emit one empty argument and xargs would
# start a worker with an empty provider.
if [[ ${#TASKS[@]} -eq 0 ]]; then
  echo "No tasks to run (empty provider or case list)" >&2
  exit 1
fi

# Each worker receives one (provider, case file) pair. The xargs status is
# captured instead of letting `set -e` abort the script here. By the time
# xargs reports a failure the other workers have already run, and their
# results should still be aggregated.
worker_status=0
printf '%s\0' "${TASKS[@]}" \
  | xargs -0 -n 2 -P "${MAX_PARALLEL}" bash "${BASH_SOURCE[0]}" --worker \
  || worker_status=$?
if (( worker_status != 0 )); then
  echo "Warning: worker fan-out exited with status ${worker_status}; aggregating the results that were produced" >&2
fi

RESULT_FILES=()
while IFS= read -r line; do
  RESULT_FILES+=("${line}")
done < <(find "${RESULTS_DIR}" -maxdepth 1 -type f -name "*.tsv" | sort)

if [[ ${#RESULT_FILES[@]} -eq 0 ]]; then
  echo "No result files produced in ${RESULTS_DIR}" >&2
  exit 1
fi

# Append every per-case row below the manifest header.
cat "${RESULT_FILES[@]}" >> "${MANIFEST}"

# Count outcomes by the status column (field 4) in a single pass; NR>1 skips
# the header row.
read -r success_count failed_count timeout_count < <(
  awk -F '\t' '
    NR > 1 { seen[$4]++ }
    END { printf "%d %d %d\n", seen["success"], seen["failed"], seen["timeout"] }
  ' "${MANIFEST}"
)

echo
echo "Completed run stage. Manifest: ${MANIFEST}"
echo "Run summary: success=${success_count} failed=${failed_count} timeout=${timeout_count}"
echo
echo "Running structured analysis..."
node "${SCRIPT_DIR}/analyze.mjs" "${MANIFEST}" | tee "${OUT_DIR}/analysis.txt"

# Propagate a worker fan-out failure now that everything salvageable has been
# written out.
if (( worker_status != 0 )); then
  exit "${worker_status}"
fi