diff --git a/docs/e2e-finance-benchmark.md b/docs/e2e-finance-benchmark.md index 23f538f9..80d8ac1a 100644 --- a/docs/e2e-finance-benchmark.md +++ b/docs/e2e-finance-benchmark.md @@ -32,6 +32,8 @@ scripts/e2e-finance-benchmark/run.sh The script defaults: - Providers: `kimi-coding claude-code` - Case glob: `case-*.txt` +- Max parallel workers: `2` +- Per-case timeout: `900s` (set `CASE_TIMEOUT_SEC=0` to disable) - Output directory: `.context/finance-e2e-runs//` Generated artifact: @@ -51,6 +53,12 @@ Run only specific cases by glob: CASE_GLOB="case-0[1-3]*.txt" scripts/e2e-finance-benchmark/run.sh ``` +Run with higher parallelism for long-horizon tasks: + +```bash +MAX_PARALLEL=4 CASE_TIMEOUT_SEC=2700 scripts/e2e-finance-benchmark/run.sh +``` + ## Case List 1. `case-01-top10-financial-reports.txt` diff --git a/scripts/e2e-finance-benchmark/run.sh b/scripts/e2e-finance-benchmark/run.sh index 439a7a67..5d600906 100755 --- a/scripts/e2e-finance-benchmark/run.sh +++ b/scripts/e2e-finance-benchmark/run.sh @@ -4,8 +4,10 @@ set -euo pipefail SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" ROOT_DIR="$(cd "${SCRIPT_DIR}/../.." && pwd)" CASES_DIR="${SCRIPT_DIR}/cases" -TIMESTAMP="$(date +%Y%m%d-%H%M%S)" -OUT_DIR="${ROOT_DIR}/.context/finance-e2e-runs/${TIMESTAMP}" +TIMESTAMP="${TIMESTAMP:-$(date +%Y%m%d-%H%M%S)}" +OUT_DIR="${OUT_DIR:-${ROOT_DIR}/.context/finance-e2e-runs/${TIMESTAMP}}" +RESULTS_DIR="${OUT_DIR}/results" +MANIFEST="${OUT_DIR}/manifest.tsv" # Required environment for agent-driven E2E with web_search/data tools. SMC_DATA_DIR="${SMC_DATA_DIR:-$HOME/.super-multica-e2e}" @@ -13,16 +15,99 @@ MULTICA_API_URL="${MULTICA_API_URL:-https://api-dev.copilothub.ai}" PROVIDERS_RAW="${PROVIDERS:-kimi-coding claude-code}" CASE_GLOB="${CASE_GLOB:-case-*.txt}" CASE_TIMEOUT_SEC="${CASE_TIMEOUT_SEC:-900}" +MAX_PARALLEL="${MAX_PARALLEL:-2}" TIMEOUT_ENABLED="true" if [[ "${CASE_TIMEOUT_SEC}" =~ ^[0-9]+$ ]] && (( CASE_TIMEOUT_SEC <= 0 )); then TIMEOUT_ENABLED="false" fi -read -r -a PROVIDERS <<< "${PROVIDERS_RAW}" +if ! [[ "${MAX_PARALLEL}" =~ ^[0-9]+$ ]] || (( MAX_PARALLEL <= 0 )); then + echo "MAX_PARALLEL must be a positive integer, got: ${MAX_PARALLEL}" >&2 + exit 1 +fi + +if [[ "${1:-}" == "--worker" ]]; then + provider="${2:?missing provider}" + case_file="${3:?missing case file}" + case_base="$(basename "${case_file}")" + case_id="${case_base%.txt}" + log_file="${OUT_DIR}/${provider}-${case_id}.log" + result_file="${RESULTS_DIR}/${provider}-${case_id}.tsv" + + prompt="$(cat "${case_file}")" + + status="success" + timed_out="false" + started_epoch="$(date +%s)" + started_at="$(date -u +%Y-%m-%dT%H:%M:%SZ)" + + SMC_DATA_DIR="${SMC_DATA_DIR}" \ + MULTICA_API_URL="${MULTICA_API_URL}" \ + pnpm multica run --run-log --provider "${provider}" "${prompt}" > "${log_file}" 2>&1 & + cmd_pid=$! + + while kill -0 "${cmd_pid}" 2>/dev/null; do + if [[ "${TIMEOUT_ENABLED}" == "true" ]]; then + now="$(date +%s)" + elapsed="$((now - started_epoch))" + if (( elapsed >= CASE_TIMEOUT_SEC )); then + timed_out="true" + kill "${cmd_pid}" 2>/dev/null || true + sleep 1 + kill -9 "${cmd_pid}" 2>/dev/null || true + break + fi + fi + sleep 2 + done + + exit_code=0 + wait "${cmd_pid}" 2>/dev/null || exit_code=$? + ended_epoch="$(date +%s)" + ended_at="$(date -u +%Y-%m-%dT%H:%M:%SZ)" + duration_sec="$((ended_epoch - started_epoch))" + + if [[ "${timed_out}" == "true" ]]; then + status="timeout" + printf "\n[runner] timed out after %ss\n" "${CASE_TIMEOUT_SEC}" >> "${log_file}" + elif (( exit_code != 0 )); then + status="failed" + elif [[ ! -s "${log_file}" ]]; then + status="failed" + elif ! rg -q "\[session: " "${log_file}"; then + status="failed" + fi + + session_id="$(rg -o "\[session: [^]]+\]" "${log_file}" | tail -n 1 | sed -E 's/\[session: ([^]]+)\]/\1/' || true)" + session_dir="$(rg -o "\[session-dir: [^]]+\]" "${log_file}" | tail -n 1 | sed -E 's/\[session-dir: ([^]]+)\]/\1/' || true)" + + printf "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" \ + "${TIMESTAMP}" \ + "${provider}" \ + "${case_id}" \ + "${status}" \ + "${session_id}" \ + "${session_dir}" \ + "${log_file}" \ + "${started_at}" \ + "${ended_at}" \ + "${duration_sec}" \ + "${exit_code}" > "${result_file}" + + printf "[worker] provider=%s case=%s status=%s duration=%ss session=%s\n" \ + "${provider}" \ + "${case_id}" \ + "${status}" \ + "${duration_sec}" \ + "${session_id:-N/A}" + exit 0 +fi mkdir -p "${OUT_DIR}" -MANIFEST="${OUT_DIR}/manifest.tsv" -printf "timestamp\tprovider\tcase_id\tstatus\tsession_id\tsession_dir\tlog_file\n" > "${MANIFEST}" +mkdir -p "${RESULTS_DIR}" +printf "timestamp\tprovider\tcase_id\tstatus\tsession_id\tsession_dir\tlog_file\tstarted_at\tended_at\tduration_sec\texit_code\n" > "${MANIFEST}" + +read -r -a PROVIDERS <<< "${PROVIDERS_RAW}" CASE_FILES=() while IFS= read -r line; do @@ -39,79 +124,43 @@ echo "Using SMC_DATA_DIR=${SMC_DATA_DIR}" echo "Using MULTICA_API_URL=${MULTICA_API_URL}" echo "Providers: ${PROVIDERS[*]}" echo "Cases: ${#CASE_FILES[@]}" +echo "Max parallel: ${MAX_PARALLEL}" if [[ "${TIMEOUT_ENABLED}" == "true" ]]; then echo "Case timeout: ${CASE_TIMEOUT_SEC}s" else echo "Case timeout: disabled" fi -total=0 +TASKS=() for provider in "${PROVIDERS[@]}"; do for case_file in "${CASE_FILES[@]}"; do - total=$((total + 1)) - case_base="$(basename "${case_file}")" - case_id="${case_base%.txt}" - log_file="${OUT_DIR}/${provider}-${case_id}.log" - - prompt="$(cat "${case_file}")" - - echo - echo "[${total}] Running ${case_id} with provider=${provider}" - - status="success" - timed_out="false" - started_at="$(date +%s)" - - ( - SMC_DATA_DIR="${SMC_DATA_DIR}" \ - MULTICA_API_URL="${MULTICA_API_URL}" \ - pnpm multica run --run-log --provider "${provider}" "${prompt}" > "${log_file}" 2>&1 - ) & - cmd_pid=$! - - while kill -0 "${cmd_pid}" 2>/dev/null; do - if [[ "${TIMEOUT_ENABLED}" == "true" ]]; then - now="$(date +%s)" - elapsed="$((now - started_at))" - if (( elapsed >= CASE_TIMEOUT_SEC )); then - timed_out="true" - kill "${cmd_pid}" 2>/dev/null || true - sleep 1 - kill -9 "${cmd_pid}" 2>/dev/null || true - break - fi - fi - sleep 2 - done - - exit_code=0 - wait "${cmd_pid}" || exit_code=$? - if [[ "${timed_out}" == "true" ]]; then - status="timeout" - printf "\n[runner] timed out after %ss\n" "${CASE_TIMEOUT_SEC}" >> "${log_file}" - elif (( exit_code != 0 )); then - status="failed" - elif [[ ! -s "${log_file}" ]]; then - status="failed" - elif ! rg -q "\[session: " "${log_file}"; then - status="failed" - fi - - session_id="$(rg -o "\[session: [^]]+\]" "${log_file}" | tail -n 1 | sed -E 's/\[session: ([^]]+)\]/\1/' || true)" - session_dir="$(rg -o "\[session-dir: [^]]+\]" "${log_file}" | tail -n 1 | sed -E 's/\[session-dir: ([^]]+)\]/\1/' || true)" - - printf "%s\t%s\t%s\t%s\t%s\t%s\t%s\n" \ - "${TIMESTAMP}" \ - "${provider}" \ - "${case_id}" \ - "${status}" \ - "${session_id}" \ - "${session_dir}" \ - "${log_file}" >> "${MANIFEST}" - - echo "status=${status} session=${session_id:-N/A}" + TASKS+=("${provider}" "${case_file}") done done +echo "Total tasks: $(( ${#TASKS[@]} / 2 ))" + +export TIMESTAMP OUT_DIR RESULTS_DIR SMC_DATA_DIR MULTICA_API_URL CASE_TIMEOUT_SEC TIMEOUT_ENABLED +printf '%s\0' "${TASKS[@]}" | xargs -0 -n 2 -P "${MAX_PARALLEL}" bash "${BASH_SOURCE[0]}" --worker + +RESULT_FILES=() +while IFS= read -r line; do + RESULT_FILES+=("${line}") +done < <(find "${RESULTS_DIR}" -maxdepth 1 -type f -name "*.tsv" | sort) + +if [[ ${#RESULT_FILES[@]} -eq 0 ]]; then + echo "No result files produced in ${RESULTS_DIR}" >&2 + exit 1 +fi + +for result_file in "${RESULT_FILES[@]}"; do + cat "${result_file}" >> "${MANIFEST}" +done + +success_count="$(awk -F '\t' 'NR>1 && $4=="success" {c++} END{print c+0}' "${MANIFEST}")" +failed_count="$(awk -F '\t' 'NR>1 && $4=="failed" {c++} END{print c+0}' "${MANIFEST}")" +timeout_count="$(awk -F '\t' 'NR>1 && $4=="timeout" {c++} END{print c+0}' "${MANIFEST}")" + echo echo "Completed. Manifest: ${MANIFEST}" +echo "Summary: success=${success_count} failed=${failed_count} timeout=${timeout_count}"