From 2074aac49e8b4b9b1e19b6cd2024d18141d89f51 Mon Sep 17 00:00:00 2001
From: Jiayuan Zhang
Date: Tue, 17 Feb 2026 00:50:01 +0800
Subject: [PATCH] feat(e2e): add clawhub skills benchmark suite

---
 docs/e2e-skills-benchmark.md                  |  94 ++++++
 package.json                                  |   3 +-
 scripts/e2e-skills-benchmark/analyze.mjs      | 297 ++++++++++++++++++
 .../cases/case-01-install-caldav-calendar.txt |  15 +
 .../case-02-gap-discovery-homeassistant.txt   |  16 +
 .../case-03-install-update-codexmonitor.txt   |  16 +
 scripts/e2e-skills-benchmark/run.sh           | 170 ++++++++++
 7 files changed, 610 insertions(+), 1 deletion(-)
 create mode 100644 docs/e2e-skills-benchmark.md
 create mode 100755 scripts/e2e-skills-benchmark/analyze.mjs
 create mode 100644 scripts/e2e-skills-benchmark/cases/case-01-install-caldav-calendar.txt
 create mode 100644 scripts/e2e-skills-benchmark/cases/case-02-gap-discovery-homeassistant.txt
 create mode 100644 scripts/e2e-skills-benchmark/cases/case-03-install-update-codexmonitor.txt
 create mode 100755 scripts/e2e-skills-benchmark/run.sh

diff --git a/docs/e2e-skills-benchmark.md b/docs/e2e-skills-benchmark.md
new file mode 100644
index 00000000..e9859624
--- /dev/null
+++ b/docs/e2e-skills-benchmark.md
@@ -0,0 +1,94 @@
+# Skills Agent-Driven E2E Benchmark
+
+This benchmark validates the meta skill workflow for capability-gap discovery, ClawHub installation, and security-gated rollout.
+
+## Scope
+
+- Domain: skill discovery + installation + update
+- Focus: `skills/meta-skill-installer`
+- Providers: default `kimi-coding` (override with `PROVIDERS`)
+- Cases: 3
+
+Case prompts are stored in:
+- `scripts/e2e-skills-benchmark/cases/`
+
+## Real ClawHub Examples Used
+
+The case set references real public pages from ClawHub:
+
+- [CalDAV Calendar](https://clawhub.ai/skills/caldav-calendar)
+- [Home Assistant](https://clawhub.ai/skills/homeassistant)
+- [CodexMonitor](https://clawhub.ai/odrobnik/codexmonitor)
+
+## Prerequisites
+
+1. Credentials configured (`pnpm multica credentials init` if needed)
+2. Dependencies installed in repo (`pnpm install`)
+3. `clawhub` CLI available, or allow runtime fallback to `npx -y clawhub`
+4. Required env:
+
+```bash
+export SMC_DATA_DIR=~/.super-multica-e2e
+export MULTICA_API_URL=https://api-dev.copilothub.ai
+```
+
+## Run Benchmark
+
+```bash
+scripts/e2e-skills-benchmark/run.sh
+```
+
+Defaults:
+
+- Providers: `kimi-coding`
+- Case glob: `case-*.txt`
+- Max parallel workers: `1`
+- Per-case timeout: `1200s` (`CASE_TIMEOUT_SEC=0` to disable)
+- Output directory: `.context/skills-e2e-runs/<timestamp>/`
+
+Generated artifacts:
+
+- `manifest.tsv`: provider/case/status/session/log metadata
+- `analysis.txt`: human-readable pass/fail report
+- `analysis.json`: structured detailed check output
+
+## Run Subset
+
+Only one case:
+
+```bash
+CASE_GLOB="case-01-*.txt" scripts/e2e-skills-benchmark/run.sh
+```
+
+Multiple providers:
+
+```bash
+PROVIDERS="kimi-coding claude-code" scripts/e2e-skills-benchmark/run.sh
+```
+
+Faster throughput:
+
+```bash
+MAX_PARALLEL=2 CASE_TIMEOUT_SEC=1800 scripts/e2e-skills-benchmark/run.sh
+```
+
+## Analyzer Checks
+
+For each run:
+
+1. `run_start` and `run_end` both present
+2. `run_end.error` is empty/null
+3. `tool_start` and `tool_end` are paired
+4. no `tool_end.is_error=true`
+5. at least one `exec` tool call exists
+6. case-specific command evidence in `tool_start.args`:
+   - `clawhub search`
+   - `clawhub install`
+   - `review-skill-security.mjs`
+   - for case 03 also `clawhub update`
+
+## Notes
+
+- These are agent-driven tests; prompt intent plus run-log evidence are both evaluated.
+- `SMC_DATA_DIR=~/.super-multica-e2e` avoids polluting normal user skill/session data.
+- If a case fails, open `manifest.tsv` and inspect the matching `session_dir/run-log.jsonl`.
diff --git a/package.json b/package.json
index 80b868f4..47661ee2 100644
--- a/package.json
+++ b/package.json
@@ -30,7 +30,8 @@
     "typecheck": "turbo typecheck",
     "test": "vitest run",
     "test:watch": "vitest",
-    "test:coverage": "vitest run --coverage"
+    "test:coverage": "vitest run --coverage",
+    "e2e:skills": "bash scripts/e2e-skills-benchmark/run.sh"
   },
   "keywords": [],
   "author": "",
diff --git a/scripts/e2e-skills-benchmark/analyze.mjs b/scripts/e2e-skills-benchmark/analyze.mjs
new file mode 100755
index 00000000..ac090783
--- /dev/null
+++ b/scripts/e2e-skills-benchmark/analyze.mjs
@@ -0,0 +1,358 @@
+#!/usr/bin/env node
+
+/**
+ * Structured analyzer for the skills E2E benchmark.
+ *
+ * Reads the manifest.tsv written by run.sh, inspects each case's
+ * run-log.jsonl, writes analysis.json next to the manifest, prints a
+ * per-check report, and exits with status 1 when any case fails.
+ */
+
+import { existsSync, readFileSync, writeFileSync } from "node:fs";
+import { dirname, join, resolve } from "node:path";
+
+/**
+ * @typedef {{
+ *   id: string;
+ *   check: string;
+ *   passed: boolean;
+ *   detail?: string;
+ * }} CheckResult
+ */
+
+/**
+ * @typedef {{
+ *   provider: string;
+ *   caseId: string;
+ *   status: string;
+ *   sessionId: string;
+ *   sessionDir: string;
+ *   logFile: string;
+ *   checks: CheckResult[];
+ *   pass: boolean;
+ * }} CaseAnalysis
+ */
+
+/** Number of tab-separated columns run.sh writes per manifest row. */
+const MANIFEST_COLUMN_COUNT = 11;
+
+const USAGE = "Usage: node scripts/e2e-skills-benchmark/analyze.mjs <manifest.tsv>";
+
+const manifestArg = process.argv[2];
+if (manifestArg === "--help" || manifestArg === "-h") {
+  console.log(USAGE);
+  process.exit(0);
+}
+if (!manifestArg) {
+  // A missing argument is a usage error: exiting 0 here would let a caller
+  // that forgot the manifest path report success.
+  console.error(USAGE);
+  process.exit(1);
+}
+
+const manifestPath = resolve(manifestArg);
+if (!existsSync(manifestPath)) {
+  console.error(`Manifest not found: ${manifestPath}`);
+  process.exit(1);
+}
+
+/**
+ * Command evidence required per case. Each inner array is one requirement:
+ * at least one exec command must contain every token in it (case-insensitive).
+ */
+const CASE_RULES = {
+  "case-01-install-caldav-calendar": {
+    requiredCommandTokens: [
+      ["clawhub", "search"],
+      ["caldav"],
+      ["clawhub", "install"],
+      ["review-skill-security.mjs"],
+    ],
+  },
+  "case-02-gap-discovery-homeassistant": {
+    requiredCommandTokens: [
+      ["clawhub", "search"],
+      ["home", "assistant"],
+      ["clawhub", "install"],
+      ["review-skill-security.mjs"],
+    ],
+  },
+  "case-03-install-update-codexmonitor": {
+    requiredCommandTokens: [
+      ["clawhub", "search"],
+      ["codexmonitor"],
+      ["clawhub", "install"],
+      ["clawhub", "update"],
+      ["review-skill-security.mjs"],
+    ],
+  },
+};
+
+/**
+ * Split text into lines, dropping empty ones.
+ *
+ * @param {string} text
+ * @returns {string[]}
+ */
+function splitLines(text) {
+  return text.split(/\r?\n/).filter(Boolean);
+}
+
+/**
+ * Case-insensitive test that `command` contains every entry of `tokens`.
+ *
+ * @param {string} command
+ * @param {string[]} tokens
+ * @returns {boolean}
+ */
+function commandHasTokens(command, tokens) {
+  const lower = command.toLowerCase();
+  return tokens.every((token) => lower.includes(token.toLowerCase()));
+}
+
+/**
+ * Extract the command text from a tool_start `args` value.
+ *
+ * Accepts the JSON-string form as well as an already-parsed object. When a
+ * string cannot be parsed, the raw text is returned so token matching can
+ * still find evidence in it.
+ *
+ * @param {unknown} rawArgs
+ * @returns {string}
+ */
+function extractCommand(rawArgs) {
+  if (!rawArgs) return "";
+  if (typeof rawArgs === "object") {
+    return typeof rawArgs.command === "string" ? rawArgs.command : JSON.stringify(rawArgs);
+  }
+  if (typeof rawArgs !== "string") return "";
+  try {
+    const parsed = JSON.parse(rawArgs);
+    if (parsed && typeof parsed.command === "string") {
+      return parsed.command;
+    }
+  } catch {
+    // Fall through: args may be truncated JSON in run-log.
+  }
+  return rawArgs;
+}
+
+/**
+ * Parse a JSONL run log into event objects.
+ *
+ * Lines that are not valid JSON, or that parse to a non-object value, are
+ * skipped so one bad line cannot abort the analysis.
+ *
+ * @param {string} runLogPath
+ * @returns {object[]}
+ */
+function parseRunLog(runLogPath) {
+  const lines = splitLines(readFileSync(runLogPath, "utf-8"));
+  const events = [];
+  for (const line of lines) {
+    try {
+      const parsed = JSON.parse(line);
+      // `null` and primitives are valid JSON but would break `e.event` reads.
+      if (parsed !== null && typeof parsed === "object") {
+        events.push(parsed);
+      }
+    } catch {
+      // Ignore malformed lines but keep analysis alive.
+    }
+  }
+  return events;
+}
+
+/**
+ * Append one check result to an analysis record.
+ *
+ * @param {CaseAnalysis} analysis
+ * @param {string} id
+ * @param {string} check
+ * @param {boolean} passed
+ * @param {string} [detail]
+ */
+function addCheck(analysis, id, check, passed, detail) {
+  analysis.checks.push({ id, check, passed, detail });
+}
+
+const rows = splitLines(readFileSync(manifestPath, "utf-8"));
+if (rows.length <= 1) {
+  console.error(`Manifest has no data rows: ${manifestPath}`);
+  process.exit(1);
+}
+
+/** @type {CaseAnalysis[]} */
+const analyses = [];
+
+// rows[0] is the header line; data rows start at index 1.
+for (let i = 1; i < rows.length; i++) {
+  const cols = rows[i].split("\t");
+
+  const provider = cols[1] ?? "";
+  const caseId = cols[2] ?? "";
+  const status = cols[3] ?? "";
+  const sessionId = cols[4] ?? "";
+  const sessionDir = cols[5] ?? "";
+  const logFile = cols[6] ?? "";
+
+  /** @type {CaseAnalysis} */
+  const analysis = {
+    provider,
+    caseId,
+    status,
+    sessionId,
+    sessionDir,
+    logFile,
+    checks: [],
+    pass: false,
+  };
+
+  if (cols.length < MANIFEST_COLUMN_COUNT) {
+    // Report short rows as failures; skipping them silently would hide the
+    // case from the summary and from the exit code.
+    addCheck(
+      analysis,
+      "manifest-row",
+      `manifest row has ${MANIFEST_COLUMN_COUNT} columns`,
+      false,
+      `data_row=${i} columns=${cols.length}`,
+    );
+    analyses.push(analysis);
+    continue;
+  }
+
+  addCheck(
+    analysis,
+    "run-status",
+    "runner status is success",
+    status === "success",
+    `status=${status}`,
+  );
+
+  if (!sessionDir) {
+    addCheck(analysis, "session-dir", "session_dir exists in manifest", false, "missing session_dir");
+    analyses.push(analysis);
+    continue;
+  }
+
+  const runLogPath = join(sessionDir, "run-log.jsonl");
+  const hasRunLog = existsSync(runLogPath);
+  addCheck(
+    analysis,
+    "run-log-file",
+    "run-log.jsonl exists",
+    hasRunLog,
+    runLogPath,
+  );
+
+  if (!hasRunLog) {
+    analyses.push(analysis);
+    continue;
+  }
+
+  const events = parseRunLog(runLogPath);
+  const runStarts = events.filter((e) => e.event === "run_start");
+  const runEnds = events.filter((e) => e.event === "run_end");
+  const toolStarts = events.filter((e) => e.event === "tool_start");
+  const toolEnds = events.filter((e) => e.event === "tool_end");
+  const errorToolEnds = toolEnds.filter((e) => e.is_error === true);
+
+  addCheck(analysis, "event-run-start", "has run_start", runStarts.length > 0, `count=${runStarts.length}`);
+  addCheck(analysis, "event-run-end", "has run_end", runEnds.length > 0, `count=${runEnds.length}`);
+  addCheck(
+    analysis,
+    "tool-pairing",
+    "tool_start count matches tool_end count",
+    toolStarts.length === toolEnds.length,
+    `start=${toolStarts.length} end=${toolEnds.length}`,
+  );
+
+  const finalRunEnd = runEnds.at(-1);
+  const runEndError = finalRunEnd?.error;
+  addCheck(
+    analysis,
+    "run-end-error",
+    "final run_end.error is null/empty",
+    runEndError === null || runEndError === undefined || runEndError === "",
+    `error=${String(runEndError)}`,
+  );
+
+  addCheck(
+    analysis,
+    "tool-errors",
+    "no tool_end has is_error=true",
+    errorToolEnds.length === 0,
+    `error_tool_calls=${errorToolEnds.length}`,
+  );
+
+  const execCommands = toolStarts
+    .filter((e) => e.tool === "exec")
+    .map((e) => extractCommand(e.args))
+    .filter(Boolean);
+
+  addCheck(
+    analysis,
+    "exec-usage",
+    "at least one exec command was used",
+    execCommands.length > 0,
+    `exec_calls=${execCommands.length}`,
+  );
+
+  const rules = CASE_RULES[caseId];
+  if (rules) {
+    for (let r = 0; r < rules.requiredCommandTokens.length; r++) {
+      const tokenList = rules.requiredCommandTokens[r];
+      const passed = execCommands.some((cmd) => commandHasTokens(cmd, tokenList));
+      addCheck(
+        analysis,
+        `cmd-${r + 1}`,
+        `exec command contains tokens: ${tokenList.join(" + ")}`,
+        passed,
+      );
+    }
+  } else {
+    addCheck(
+      analysis,
+      "case-rules",
+      "case has rule set",
+      false,
+      `No rules defined for case_id=${caseId}`,
+    );
+  }
+
+  analysis.pass = analysis.checks.every((c) => c.passed);
+  analyses.push(analysis);
+}
+
+const passedCases = analyses.filter((a) => a.pass).length;
+const failedCases = analyses.length - passedCases;
+
+const output = {
+  manifestPath,
+  totalCases: analyses.length,
+  passedCases,
+  failedCases,
+  results: analyses,
+};
+
+const outputPath = join(dirname(manifestPath), "analysis.json");
+writeFileSync(outputPath, JSON.stringify(output, null, 2) + "\n", "utf-8");
+
+for (const item of analyses) {
+  const status = item.pass ? "PASS" : "FAIL";
+  console.log(`[${status}] provider=${item.provider} case=${item.caseId} session=${item.sessionId || "N/A"}`);
+  for (const check of item.checks) {
+    const marker = check.passed ? " [ok] " : " [bad] ";
+    const detail = check.detail ? ` (${check.detail})` : "";
+    console.log(`${marker}${check.check}${detail}`);
+  }
+}
+
+console.log("");
+console.log(`Analysis file: ${outputPath}`);
+console.log(`Summary: pass=${passedCases} fail=${failedCases}`);
+
+if (failedCases > 0) {
+  process.exit(1);
+}
diff --git a/scripts/e2e-skills-benchmark/cases/case-01-install-caldav-calendar.txt b/scripts/e2e-skills-benchmark/cases/case-01-install-caldav-calendar.txt
new file mode 100644
index 00000000..2b1f6571
--- /dev/null
+++ b/scripts/e2e-skills-benchmark/cases/case-01-install-caldav-calendar.txt
@@ -0,0 +1,15 @@
+Run an end-to-end test for the Meta Skill Installer.
+
+Goal: install a real ClawHub skill for CalDAV calendar capability.
+Reference page: https://clawhub.ai/skills/caldav-calendar
+
+Follow this exact workflow:
+1. State the missing capability in one sentence.
+2. Search ClawHub for CalDAV-related skills and choose the best candidate.
+3. Stage-install to a temporary directory first (never install directly to active skills path).
+4. Run security review on the staged skill:
+   DATA_DIR="${SMC_DATA_DIR:-$HOME/.super-multica}"
+   node "$DATA_DIR/skills/meta-skill-installer/scripts/review-skill-security.mjs" "<staged-skill-dir>"
+5. If riskLevel is safe, install to "$DATA_DIR/skills".
+6. Verify final install by checking "$DATA_DIR/skills/<slug>/SKILL.md" exists.
+7. Return a short report: selected slug, riskLevel, final install path.
diff --git a/scripts/e2e-skills-benchmark/cases/case-02-gap-discovery-homeassistant.txt b/scripts/e2e-skills-benchmark/cases/case-02-gap-discovery-homeassistant.txt
new file mode 100644
index 00000000..a72d65f6
--- /dev/null
+++ b/scripts/e2e-skills-benchmark/cases/case-02-gap-discovery-homeassistant.txt
@@ -0,0 +1,16 @@
+Run an end-to-end capability-gap discovery test for Meta Skill Installer.
+
+User intent: "I need to control Home Assistant lights and switches from the agent."
+Reference page: https://clawhub.ai/skills/homeassistant
+
+Requirements:
+1. Treat this as a missing capability and explicitly define the gap.
+2. Search ClawHub for relevant skills and list the top 3 candidates.
+3. Pick one candidate with rationale (scope match + lower security risk).
+4. Stage-install to a temporary directory.
+5. Run security review:
+   DATA_DIR="${SMC_DATA_DIR:-$HOME/.super-multica}"
+   node "$DATA_DIR/skills/meta-skill-installer/scripts/review-skill-security.mjs" "<staged-skill-dir>"
+6. If riskLevel is safe, install to "$DATA_DIR/skills".
+7. Verify "$DATA_DIR/skills/<slug>/SKILL.md" exists.
+8. Return: candidate list, chosen slug, riskLevel, and final path.
diff --git a/scripts/e2e-skills-benchmark/cases/case-03-install-update-codexmonitor.txt b/scripts/e2e-skills-benchmark/cases/case-03-install-update-codexmonitor.txt
new file mode 100644
index 00000000..9c828b62
--- /dev/null
+++ b/scripts/e2e-skills-benchmark/cases/case-03-install-update-codexmonitor.txt
@@ -0,0 +1,16 @@
+Run an end-to-end install+update regression test for Meta Skill Installer.
+
+Goal: use a real ClawHub skill and verify install, review, and update flow.
+Reference page: https://clawhub.ai/odrobnik/codexmonitor
+
+Requirements:
+1. Search ClawHub for CodexMonitor and select the matching skill slug.
+2. Stage-install to a temporary directory and run security review:
+   DATA_DIR="${SMC_DATA_DIR:-$HOME/.super-multica}"
+   node "$DATA_DIR/skills/meta-skill-installer/scripts/review-skill-security.mjs" "<staged-skill-dir>"
+3. If riskLevel is safe, install to "$DATA_DIR/skills".
+4. Verify "$DATA_DIR/skills/<slug>/SKILL.md" exists.
+5. Run an update for the same slug in managed dir:
+   clawhub update "<slug>" --workdir "$DATA_DIR" --dir skills --force
+6. Run security review again on the final installed path.
+7. Return: slug, initial riskLevel, update executed (yes/no), final path.
diff --git a/scripts/e2e-skills-benchmark/run.sh b/scripts/e2e-skills-benchmark/run.sh
new file mode 100755
index 00000000..01c873cf
--- /dev/null
+++ b/scripts/e2e-skills-benchmark/run.sh
@@ -0,0 +1,207 @@
+#!/usr/bin/env bash
+#
+# Skills E2E benchmark runner.
+#
+# Orchestrator mode (no arguments): runs every provider x case pair through
+# `pnpm multica run`, merges the per-task result rows into manifest.tsv, and
+# then runs analyze.mjs on that manifest.
+# Worker mode (`--worker <provider> <case-file>`): runs one case and writes a
+# single result row; the orchestrator re-invokes this script via xargs.
+#
+# Environment overrides: TIMESTAMP, OUT_DIR, SMC_DATA_DIR, MULTICA_API_URL,
+# PROVIDERS, CASE_GLOB, CASE_TIMEOUT_SEC (0 disables), MAX_PARALLEL.
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+ROOT_DIR="$(cd "${SCRIPT_DIR}/../.." && pwd)"
+CASES_DIR="${SCRIPT_DIR}/cases"
+TIMESTAMP="${TIMESTAMP:-$(date +%Y%m%d-%H%M%S)}"
+OUT_DIR="${OUT_DIR:-${ROOT_DIR}/.context/skills-e2e-runs/${TIMESTAMP}}"
+RESULTS_DIR="${OUT_DIR}/results"
+MANIFEST="${OUT_DIR}/manifest.tsv"
+
+# Required environment for agent-driven E2E.
+SMC_DATA_DIR="${SMC_DATA_DIR:-$HOME/.super-multica-e2e}"
+MULTICA_API_URL="${MULTICA_API_URL:-https://api-dev.copilothub.ai}"
+PROVIDERS_RAW="${PROVIDERS:-kimi-coding}"
+CASE_GLOB="${CASE_GLOB:-case-*.txt}"
+CASE_TIMEOUT_SEC="${CASE_TIMEOUT_SEC:-1200}"
+MAX_PARALLEL="${MAX_PARALLEL:-1}"
+
+# Validate the timeout here so a bad value is reported once, instead of
+# breaking the arithmetic comparison inside every worker.
+if ! [[ "${CASE_TIMEOUT_SEC}" =~ ^[0-9]+$ ]]; then
+  echo "CASE_TIMEOUT_SEC must be a non-negative integer, got: ${CASE_TIMEOUT_SEC}" >&2
+  exit 1
+fi
+# Force base 10 so values with leading zeros are not parsed as octal.
+CASE_TIMEOUT_SEC="$((10#${CASE_TIMEOUT_SEC}))"
+
+TIMEOUT_ENABLED="true"
+if (( CASE_TIMEOUT_SEC == 0 )); then
+  TIMEOUT_ENABLED="false"
+fi
+
+if ! [[ "${MAX_PARALLEL}" =~ ^[0-9]+$ ]] || (( MAX_PARALLEL <= 0 )); then
+  echo "MAX_PARALLEL must be a positive integer, got: ${MAX_PARALLEL}" >&2
+  exit 1
+fi
+
+if [[ "${1:-}" == "--worker" ]]; then
+  provider="${2:?missing provider}"
+  case_file="${3:?missing case file}"
+  case_base="$(basename "${case_file}")"
+  case_id="${case_base%.txt}"
+  log_file="${OUT_DIR}/${provider}-${case_id}.log"
+  result_file="${RESULTS_DIR}/${provider}-${case_id}.tsv"
+
+  # Normally created by the orchestrator; repeated here so a worker that is
+  # started by hand can still write its log and result row.
+  mkdir -p "${OUT_DIR}" "${RESULTS_DIR}"
+
+  prompt="$(cat "${case_file}")"
+
+  status="success"
+  timed_out="false"
+  started_epoch="$(date +%s)"
+  started_at="$(date -u +%Y-%m-%dT%H:%M:%SZ)"
+
+  SMC_DATA_DIR="${SMC_DATA_DIR}" \
+  MULTICA_API_URL="${MULTICA_API_URL}" \
+  pnpm multica run --run-log --provider "${provider}" "${prompt}" > "${log_file}" 2>&1 &
+  cmd_pid=$!
+
+  # Poll the child so the per-case timeout is enforced by this script itself.
+  while kill -0 "${cmd_pid}" 2>/dev/null; do
+    if [[ "${TIMEOUT_ENABLED}" == "true" ]]; then
+      now="$(date +%s)"
+      elapsed="$((now - started_epoch))"
+      if (( elapsed >= CASE_TIMEOUT_SEC )); then
+        timed_out="true"
+        # NOTE(review): only the pnpm process itself is signalled; processes it
+        # spawned may outlive the timeout. Confirm whether that matters here.
+        kill "${cmd_pid}" 2>/dev/null || true
+        sleep 1
+        kill -9 "${cmd_pid}" 2>/dev/null || true
+        break
+      fi
+    fi
+    sleep 2
+  done
+
+  exit_code=0
+  wait "${cmd_pid}" 2>/dev/null || exit_code=$?
+  ended_epoch="$(date +%s)"
+  ended_at="$(date -u +%Y-%m-%dT%H:%M:%SZ)"
+  duration_sec="$((ended_epoch - started_epoch))"
+
+  # Use grep rather than ripgrep for log scanning: without `rg` on PATH the
+  # session check below would mark every case as failed.
+  if [[ "${timed_out}" == "true" ]]; then
+    status="timeout"
+    printf "\n[runner] timed out after %ss\n" "${CASE_TIMEOUT_SEC}" >> "${log_file}"
+  elif (( exit_code != 0 )); then
+    status="failed"
+  elif [[ ! -s "${log_file}" ]]; then
+    status="failed"
+  elif ! grep -q "\[session: " "${log_file}"; then
+    status="failed"
+  fi
+
+  session_id="$(grep -oE '\[session: [^]]+]' "${log_file}" | tail -n 1 | sed -E 's/\[session: ([^]]+)\]/\1/' || true)"
+  session_dir="$(grep -oE '\[session-dir: [^]]+]' "${log_file}" | tail -n 1 | sed -E 's/\[session-dir: ([^]]+)\]/\1/' || true)"
+
+  printf "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" \
+    "${TIMESTAMP}" \
+    "${provider}" \
+    "${case_id}" \
+    "${status}" \
+    "${session_id}" \
+    "${session_dir}" \
+    "${log_file}" \
+    "${started_at}" \
+    "${ended_at}" \
+    "${duration_sec}" \
+    "${exit_code}" > "${result_file}"
+
+  printf "[worker] provider=%s case=%s status=%s duration=%ss session=%s\n" \
+    "${provider}" \
+    "${case_id}" \
+    "${status}" \
+    "${duration_sec}" \
+    "${session_id:-N/A}"
+  exit 0
+fi
+
+mkdir -p "${OUT_DIR}"
+mkdir -p "${RESULTS_DIR}"
+# Remove result rows left behind by an earlier run that reused this OUT_DIR so
+# they are not merged into the new manifest.
+rm -f "${RESULTS_DIR}"/*.tsv
+printf "timestamp\tprovider\tcase_id\tstatus\tsession_id\tsession_dir\tlog_file\tstarted_at\tended_at\tduration_sec\texit_code\n" > "${MANIFEST}"
+
+read -r -a PROVIDERS <<< "${PROVIDERS_RAW}"
+if [[ ${#PROVIDERS[@]} -eq 0 ]]; then
+  echo "PROVIDERS must list at least one provider" >&2
+  exit 1
+fi
+
+CASE_FILES=()
+while IFS= read -r line; do
+  CASE_FILES+=("${line}")
+done < <(find "${CASES_DIR}" -maxdepth 1 -type f -name "${CASE_GLOB}" | sort)
+
+if [[ ${#CASE_FILES[@]} -eq 0 ]]; then
+  echo "No case files matched ${CASE_GLOB} in ${CASES_DIR}" >&2
+  exit 1
+fi
+
+echo "Output directory: ${OUT_DIR}"
+echo "Using SMC_DATA_DIR=${SMC_DATA_DIR}"
+echo "Using MULTICA_API_URL=${MULTICA_API_URL}"
+echo "Providers: ${PROVIDERS[*]}"
+echo "Cases: ${#CASE_FILES[@]}"
+echo "Max parallel: ${MAX_PARALLEL}"
+if [[ "${TIMEOUT_ENABLED}" == "true" ]]; then
+  echo "Case timeout: ${CASE_TIMEOUT_SEC}s"
+else
+  echo "Case timeout: disabled"
+fi
+
+TASKS=()
+for provider in "${PROVIDERS[@]}"; do
+  for case_file in "${CASE_FILES[@]}"; do
+    TASKS+=("${provider}" "${case_file}")
+  done
+done
+
+echo "Total tasks: $(( ${#TASKS[@]} / 2 ))"
+
+export TIMESTAMP OUT_DIR RESULTS_DIR SMC_DATA_DIR MULTICA_API_URL CASE_TIMEOUT_SEC TIMEOUT_ENABLED
+printf '%s\0' "${TASKS[@]}" | xargs -0 -n 2 -P "${MAX_PARALLEL}" bash "${BASH_SOURCE[0]}" --worker
+
+RESULT_FILES=()
+while IFS= read -r line; do
+  RESULT_FILES+=("${line}")
+done < <(find "${RESULTS_DIR}" -maxdepth 1 -type f -name "*.tsv" | sort)
+
+if [[ ${#RESULT_FILES[@]} -eq 0 ]]; then
+  echo "No result files produced in ${RESULTS_DIR}" >&2
+  exit 1
+fi
+
+for result_file in "${RESULT_FILES[@]}"; do
+  cat "${result_file}" >> "${MANIFEST}"
+done
+
+success_count="$(awk -F '\t' 'NR>1 && $4=="success" {c++} END{print c+0}' "${MANIFEST}")"
+failed_count="$(awk -F '\t' 'NR>1 && $4=="failed" {c++} END{print c+0}' "${MANIFEST}")"
+timeout_count="$(awk -F '\t' 'NR>1 && $4=="timeout" {c++} END{print c+0}' "${MANIFEST}")"
+
+echo
+echo "Completed run stage. Manifest: ${MANIFEST}"
+echo "Run summary: success=${success_count} failed=${failed_count} timeout=${timeout_count}"
+
+echo
+echo "Running structured analysis..."
+node "${SCRIPT_DIR}/analyze.mjs" "${MANIFEST}" | tee "${OUT_DIR}/analysis.txt"