feat(e2e): add clawhub skills benchmark suite

This commit is contained in:
Jiayuan Zhang 2026-02-17 00:50:01 +08:00
parent 0c1856b54b
commit 2074aac49e
7 changed files with 610 additions and 1 deletions

View file

@ -0,0 +1,94 @@
# Skills Agent-Driven E2E Benchmark
This benchmark validates the meta skill workflow for capability-gap discovery, ClawHub installation, and security-gated rollout.
## Scope
- Domain: skill discovery + installation + update
- Focus: `skills/meta-skill-installer`
- Providers: default `kimi-coding` (override with `PROVIDERS`)
- Cases: 3
Case prompts are stored in:
- `scripts/e2e-skills-benchmark/cases/`
## Real ClawHub Examples Used
The case set references real public pages from ClawHub:
- [CalDAV Calendar](https://clawhub.ai/skills/caldav-calendar)
- [Home Assistant](https://clawhub.ai/skills/homeassistant)
- [CodexMonitor](https://clawhub.ai/odrobnik/codexmonitor)
## Prerequisites
1. Credentials configured (`pnpm multica credentials init` if needed)
2. Dependencies installed in repo (`pnpm install`)
3. `clawhub` CLI available, or allow runtime fallback to `npx -y clawhub`
4. Required env:
```bash
export SMC_DATA_DIR=~/.super-multica-e2e
export MULTICA_API_URL=https://api-dev.copilothub.ai
```
## Run Benchmark
```bash
scripts/e2e-skills-benchmark/run.sh
```
Defaults:
- Providers: `kimi-coding`
- Case glob: `case-*.txt`
- Max parallel workers: `1`
- Per-case timeout: `1200s` (`CASE_TIMEOUT_SEC=0` to disable)
- Output directory: `.context/skills-e2e-runs/<timestamp>/`
Generated artifacts:
- `manifest.tsv`: provider/case/status/session/log metadata
- `analysis.txt`: human-readable pass/fail report
- `analysis.json`: structured detailed check output
## Run Subset
Only one case:
```bash
CASE_GLOB="case-01-*.txt" scripts/e2e-skills-benchmark/run.sh
```
Multiple providers:
```bash
PROVIDERS="kimi-coding claude-code" scripts/e2e-skills-benchmark/run.sh
```
Faster throughput (two parallel workers, with a longer per-case timeout):
```bash
MAX_PARALLEL=2 CASE_TIMEOUT_SEC=1800 scripts/e2e-skills-benchmark/run.sh
```
## Analyzer Checks
For each run:
1. `run_start` and `run_end` both present
2. `run_end.error` is empty/null
3. `tool_start` and `tool_end` event counts match
4. no `tool_end.is_error=true`
5. at least one `exec` tool call exists
6. case-specific command evidence in `tool_start.args`:
- `clawhub search`
- `clawhub install`
- `review-skill-security.mjs`
- for case 03 also `clawhub update`
## Notes
- These are agent-driven tests; prompt intent plus run-log evidence are both evaluated.
- `SMC_DATA_DIR=~/.super-multica-e2e` avoids polluting normal user skill/session data.
- If a case fails, open `manifest.tsv` and inspect the matching `session_dir/run-log.jsonl`.

View file

@ -30,7 +30,8 @@
"typecheck": "turbo typecheck",
"test": "vitest run",
"test:watch": "vitest",
"test:coverage": "vitest run --coverage"
"test:coverage": "vitest run --coverage",
"e2e:skills": "bash scripts/e2e-skills-benchmark/run.sh"
},
"keywords": [],
"author": "",

View file

@ -0,0 +1,297 @@
#!/usr/bin/env node
import { existsSync, readFileSync, writeFileSync } from "node:fs";
import { dirname, join, resolve } from "node:path";
/**
* @typedef {{
* id: string;
* check: string;
* passed: boolean;
* detail?: string;
* }} CheckResult
*/
/**
* @typedef {{
* provider: string;
* caseId: string;
* status: string;
* sessionId: string;
* sessionDir: string;
* logFile: string;
* checks: CheckResult[];
* pass: boolean;
* }} CaseAnalysis
*/
// CLI entry point: the single positional argument is the path to manifest.tsv.
const USAGE = "Usage: node scripts/e2e-skills-benchmark/analyze.mjs <manifest.tsv>";
const manifestArg = process.argv[2];

// An explicit help request is a successful invocation.
if (manifestArg === "--help" || manifestArg === "-h") {
  console.log(USAGE);
  process.exit(0);
}

// A missing argument is a usage error: report it on stderr and exit non-zero
// so a mis-invoked analyzer is never mistaken for a passing analysis.
if (!manifestArg) {
  console.error(USAGE);
  process.exit(1);
}

// Resolve against the current working directory and fail fast when absent.
const manifestPath = resolve(manifestArg);
if (!existsSync(manifestPath)) {
  console.error(`Manifest not found: ${manifestPath}`);
  process.exit(1);
}
// Token groups shared by several cases. A group is satisfied when one exec
// command contains every token of the group (case-insensitive substring match).
const CLAWHUB_SEARCH_TOKENS = ["clawhub", "search"];
const CLAWHUB_INSTALL_TOKENS = ["clawhub", "install"];
const CLAWHUB_UPDATE_TOKENS = ["clawhub", "update"];
const SECURITY_REVIEW_TOKENS = ["review-skill-security.mjs"];

// Per-case evidence rules, keyed by case id (the case file name without ".txt").
// The order of the groups determines the numbering of the emitted cmd-N checks.
const CASE_RULES = {
  "case-01-install-caldav-calendar": {
    requiredCommandTokens: [
      CLAWHUB_SEARCH_TOKENS,
      ["caldav"],
      CLAWHUB_INSTALL_TOKENS,
      SECURITY_REVIEW_TOKENS,
    ],
  },
  "case-02-gap-discovery-homeassistant": {
    requiredCommandTokens: [
      CLAWHUB_SEARCH_TOKENS,
      ["home", "assistant"],
      CLAWHUB_INSTALL_TOKENS,
      SECURITY_REVIEW_TOKENS,
    ],
  },
  "case-03-install-update-codexmonitor": {
    requiredCommandTokens: [
      CLAWHUB_SEARCH_TOKENS,
      ["codexmonitor"],
      CLAWHUB_INSTALL_TOKENS,
      CLAWHUB_UPDATE_TOKENS,
      SECURITY_REVIEW_TOKENS,
    ],
  },
};
/**
 * Break a text blob into its non-empty lines, accepting LF and CRLF endings.
 *
 * @param {string} text
 * @returns {string[]} Lines in original order; empty lines are dropped.
 */
function splitLines(text) {
  const kept = [];
  for (const line of text.split(/\r?\n/)) {
    // Only truly empty strings are discarded; whitespace-only lines survive.
    if (line) {
      kept.push(line);
    }
  }
  return kept;
}
/**
 * Report whether a command line mentions every given token.
 * Matching is a case-insensitive substring test; token order is irrelevant.
 *
 * @param {string} command
 * @param {string[]} tokens
 * @returns {boolean} True when all tokens occur (vacuously true for no tokens).
 */
function commandHasTokens(command, tokens) {
  const haystack = command.toLowerCase();
  for (const token of tokens) {
    if (!haystack.includes(token.toLowerCase())) {
      return false;
    }
  }
  return true;
}
/**
 * Pull the shell command out of a tool call's serialized arguments.
 *
 * @param {string} rawArgs Serialized tool arguments as recorded in the run log.
 * @returns {string} The `command` field when rawArgs is JSON carrying a string
 *   `command`; otherwise rawArgs unchanged ("" for empty input).
 */
function extractCommand(rawArgs) {
  if (!rawArgs) {
    return "";
  }

  let decoded;
  try {
    decoded = JSON.parse(rawArgs);
  } catch {
    // Args may be truncated JSON in the run log; use the raw text as-is.
    return rawArgs;
  }

  const command = decoded?.command;
  return typeof command === "string" ? command : rawArgs;
}
/**
 * Read a JSONL run log and return its successfully parsed entries.
 *
 * @param {string} runLogPath Path to the run-log.jsonl file (read as UTF-8).
 * @returns {unknown[]} One entry per parseable line, in file order.
 */
function parseRunLog(runLogPath) {
  const raw = readFileSync(runLogPath, "utf-8");
  return splitLines(raw).flatMap((line) => {
    try {
      // Wrap in an array so flatMap keeps array-valued entries intact.
      return [JSON.parse(line)];
    } catch {
      // Ignore malformed lines but keep analysis alive.
      return [];
    }
  });
}
/**
 * Append one check outcome to an analysis record (mutates `analysis.checks`).
 *
 * @param {CaseAnalysis} analysis Record that receives the check.
 * @param {string} id Short machine-friendly identifier of the check.
 * @param {string} check Human-readable description of what was checked.
 * @param {boolean} passed Whether the check succeeded.
 * @param {string} [detail] Optional supporting detail shown in the report.
 */
function addCheck(analysis, id, check, passed, detail) {
  /** @type {CheckResult} */
  const entry = { id, check, passed, detail };
  analysis.checks.push(entry);
}
// Load the manifest. Row 0 is the TSV header; every later row describes one
// provider/case run.
const rows = splitLines(readFileSync(manifestPath, "utf-8"));
if (rows.length <= 1) {
  console.error(`Manifest has no data rows: ${manifestPath}`);
  process.exit(1);
}

// Minimum number of tab-separated columns a data row must carry.
const MANIFEST_MIN_COLUMNS = 11;

/** @type {CaseAnalysis[]} */
const analyses = [];
for (let i = 1; i < rows.length; i++) {
  const cols = rows[i].split("\t");

  // Column 0 is the run timestamp; it is not part of the analysis record.
  /** @type {CaseAnalysis} */
  const analysis = {
    provider: cols[1] ?? "",
    caseId: cols[2] ?? "",
    status: cols[3] ?? "",
    sessionId: cols[4] ?? "",
    sessionDir: cols[5] ?? "",
    logFile: cols[6] ?? "",
    checks: [],
    pass: false,
  };
  const { caseId, status, sessionDir } = analysis;

  // A short row is reported as a failed case instead of being dropped, so a
  // corrupted manifest can never shrink the report into a false pass.
  if (cols.length < MANIFEST_MIN_COLUMNS) {
    addCheck(
      analysis,
      "manifest-row",
      "manifest row has all columns",
      false,
      `data_row=${i} columns=${cols.length} expected>=${MANIFEST_MIN_COLUMNS}`,
    );
    analyses.push(analysis);
    continue;
  }

  addCheck(
    analysis,
    "run-status",
    "runner status is success",
    status === "success",
    `status=${status}`,
  );

  // Without a session directory there is no run log to inspect.
  if (!sessionDir) {
    addCheck(analysis, "session-dir", "session_dir exists in manifest", false, "missing session_dir");
    analyses.push(analysis);
    continue;
  }

  const runLogPath = join(sessionDir, "run-log.jsonl");
  // Evaluate existence once so the recorded check and the branch always agree.
  const hasRunLog = existsSync(runLogPath);
  addCheck(analysis, "run-log-file", "run-log.jsonl exists", hasRunLog, runLogPath);
  if (!hasRunLog) {
    analyses.push(analysis);
    continue;
  }

  // Keep only object entries: a JSONL line such as `null` parses fine but
  // would throw on property access below.
  const events = parseRunLog(runLogPath).filter((e) => e !== null && typeof e === "object");
  const runStarts = events.filter((e) => e.event === "run_start");
  const runEnds = events.filter((e) => e.event === "run_end");
  const toolStarts = events.filter((e) => e.event === "tool_start");
  const toolEnds = events.filter((e) => e.event === "tool_end");
  const errorToolEnds = toolEnds.filter((e) => e.is_error === true);

  addCheck(analysis, "event-run-start", "has run_start", runStarts.length > 0, `count=${runStarts.length}`);
  addCheck(analysis, "event-run-end", "has run_end", runEnds.length > 0, `count=${runEnds.length}`);
  // Pairing is approximated by comparing counts, not by matching individual calls.
  addCheck(
    analysis,
    "tool-pairing",
    "tool_start count matches tool_end count",
    toolStarts.length === toolEnds.length,
    `start=${toolStarts.length} end=${toolEnds.length}`,
  );

  // Only the last run_end is inspected for an error.
  const runEndError = runEnds.at(-1)?.error;
  addCheck(
    analysis,
    "run-end-error",
    "final run_end.error is null/empty",
    runEndError === null || runEndError === undefined || runEndError === "",
    `error=${String(runEndError)}`,
  );
  addCheck(
    analysis,
    "tool-errors",
    "no tool_end has is_error=true",
    errorToolEnds.length === 0,
    `error_tool_calls=${errorToolEnds.length}`,
  );

  // Collect the command text of every exec tool call that recorded string args.
  const execCommands = toolStarts
    .filter((e) => e.tool === "exec")
    .map((e) => extractCommand(typeof e.args === "string" ? e.args : ""))
    .filter(Boolean);
  addCheck(
    analysis,
    "exec-usage",
    "at least one exec command was used",
    execCommands.length > 0,
    `exec_calls=${execCommands.length}`,
  );

  // Own-property lookup only: a case id such as "constructor" must not
  // resolve to something inherited from Object.prototype.
  const rules = Object.hasOwn(CASE_RULES, caseId) ? CASE_RULES[caseId] : undefined;
  if (rules) {
    rules.requiredCommandTokens.forEach((tokenList, index) => {
      const passed = execCommands.some((cmd) => commandHasTokens(cmd, tokenList));
      addCheck(
        analysis,
        `cmd-${index + 1}`,
        `exec command contains tokens: ${tokenList.join(" + ")}`,
        passed,
      );
    });
  } else {
    addCheck(
      analysis,
      "case-rules",
      "case has rule set",
      false,
      `No rules defined for case_id=${caseId}`,
    );
  }

  // A case passes only when every recorded check passed.
  analysis.pass = analysis.checks.every((c) => c.passed);
  analyses.push(analysis);
}
// Tally the outcome of all analysed cases.
const passedCases = analyses.reduce((count, item) => (item.pass ? count + 1 : count), 0);
const failedCases = analyses.length - passedCases;

// Persist the structured report next to the manifest.
const outputPath = join(dirname(manifestPath), "analysis.json");
const report = {
  manifestPath,
  totalCases: analyses.length,
  passedCases,
  failedCases,
  results: analyses,
};
writeFileSync(outputPath, `${JSON.stringify(report, null, 2)}\n`, "utf-8");

// Human-readable report: one header line per case, then one line per check.
for (const { pass, provider, caseId, sessionId, checks } of analyses) {
  const verdict = pass ? "PASS" : "FAIL";
  console.log(`[${verdict}] provider=${provider} case=${caseId} session=${sessionId || "N/A"}`);
  for (const { passed, check, detail } of checks) {
    const prefix = passed ? " [ok] " : " [bad] ";
    const suffix = detail ? ` (${detail})` : "";
    console.log(`${prefix}${check}${suffix}`);
  }
}

console.log("");
console.log(`Analysis file: ${outputPath}`);
console.log(`Summary: pass=${passedCases} fail=${failedCases}`);

// Signal failure to the caller when any case failed.
if (failedCases > 0) {
  process.exit(1);
}

View file

@ -0,0 +1,15 @@
Run an end-to-end test for the Meta Skill Installer.
Goal: install a real ClawHub skill for CalDAV calendar capability.
Reference page: https://clawhub.ai/skills/caldav-calendar
Follow this exact workflow:
1. State the missing capability in one sentence.
2. Search ClawHub for CalDAV-related skills and choose the best candidate.
3. Stage-install to a temporary directory first (never install directly to active skills path).
4. Run security review on the staged skill:
DATA_DIR="${SMC_DATA_DIR:-$HOME/.super-multica}"
node "$DATA_DIR/skills/meta-skill-installer/scripts/review-skill-security.mjs" "<staged skill path>"
5. If riskLevel is safe, install to "$DATA_DIR/skills".
6. Verify final install by checking "$DATA_DIR/skills/<slug>/SKILL.md" exists.
7. Return a short report: selected slug, riskLevel, final install path.

View file

@ -0,0 +1,16 @@
Run an end-to-end capability-gap discovery test for Meta Skill Installer.
User intent: "I need to control Home Assistant lights and switches from the agent."
Reference page: https://clawhub.ai/skills/homeassistant
Requirements:
1. Treat this as a missing capability and explicitly define the gap.
2. Search ClawHub for relevant skills and list the top 3 candidates.
3. Pick one candidate with rationale (scope match + lower security risk).
4. Stage-install to a temporary directory.
5. Run security review:
DATA_DIR="${SMC_DATA_DIR:-$HOME/.super-multica}"
node "$DATA_DIR/skills/meta-skill-installer/scripts/review-skill-security.mjs" "<staged skill path>"
6. If riskLevel is safe, install to "$DATA_DIR/skills".
7. Verify "$DATA_DIR/skills/<slug>/SKILL.md" exists.
8. Return: candidate list, chosen slug, riskLevel, and final path.

View file

@ -0,0 +1,16 @@
Run an end-to-end install+update regression test for Meta Skill Installer.
Goal: use a real ClawHub skill and verify install, review, and update flow.
Reference page: https://clawhub.ai/odrobnik/codexmonitor
Requirements:
1. Search ClawHub for CodexMonitor and select the matching skill slug.
2. Stage-install to a temporary directory and run security review:
DATA_DIR="${SMC_DATA_DIR:-$HOME/.super-multica}"
node "$DATA_DIR/skills/meta-skill-installer/scripts/review-skill-security.mjs" "<staged skill path>"
3. If riskLevel is safe, install to "$DATA_DIR/skills".
4. Verify "$DATA_DIR/skills/<slug>/SKILL.md" exists.
5. Run an update for the same slug in managed dir:
clawhub update "<slug>" --workdir "$DATA_DIR" --dir skills --force
6. Run security review again on the final installed path.
7. Return: slug, initial riskLevel, update executed (yes/no), final path.

View file

@ -0,0 +1,170 @@
#!/usr/bin/env bash
# Skills E2E benchmark runner: shared configuration and input validation.
# This section runs both in orchestrator mode and in --worker mode.
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
ROOT_DIR="$(cd "${SCRIPT_DIR}/../.." && pwd)"
CASES_DIR="${SCRIPT_DIR}/cases"

# TIMESTAMP and OUT_DIR may be preset by the caller; workers inherit them
# through the environment so every task writes into the same run directory.
TIMESTAMP="${TIMESTAMP:-$(date +%Y%m%d-%H%M%S)}"
OUT_DIR="${OUT_DIR:-${ROOT_DIR}/.context/skills-e2e-runs/${TIMESTAMP}}"
RESULTS_DIR="${OUT_DIR}/results"
MANIFEST="${OUT_DIR}/manifest.tsv"

# Required environment for agent-driven E2E.
SMC_DATA_DIR="${SMC_DATA_DIR:-$HOME/.super-multica-e2e}"
MULTICA_API_URL="${MULTICA_API_URL:-https://api-dev.copilothub.ai}"

PROVIDERS_RAW="${PROVIDERS:-kimi-coding}"
CASE_GLOB="${CASE_GLOB:-case-*.txt}"
CASE_TIMEOUT_SEC="${CASE_TIMEOUT_SEC:-1200}"
MAX_PARALLEL="${MAX_PARALLEL:-1}"

# CASE_TIMEOUT_SEC must be a non-negative integer; 0 disables the timeout.
# Reject anything else up front: a non-numeric value would otherwise reach the
# arithmetic comparison in the worker's timeout loop and misbehave there.
if ! [[ "${CASE_TIMEOUT_SEC}" =~ ^[0-9]+$ ]]; then
  echo "CASE_TIMEOUT_SEC must be a non-negative integer, got: ${CASE_TIMEOUT_SEC}" >&2
  exit 1
fi
# Force base 10 so zero-padded values (e.g. "0900") are not parsed as octal.
CASE_TIMEOUT_SEC="$((10#${CASE_TIMEOUT_SEC}))"

TIMEOUT_ENABLED="true"
if (( CASE_TIMEOUT_SEC == 0 )); then
  TIMEOUT_ENABLED="false"
fi

if ! [[ "${MAX_PARALLEL}" =~ ^[0-9]+$ ]] || (( 10#${MAX_PARALLEL} <= 0 )); then
  echo "MAX_PARALLEL must be a positive integer, got: ${MAX_PARALLEL}" >&2
  exit 1
fi
MAX_PARALLEL="$((10#${MAX_PARALLEL}))"
# Worker mode: run one (provider, case file) pair and write a single manifest
# row to ${RESULTS_DIR}. Invoked as: run.sh --worker <provider> <case_file>
if [[ "${1:-}" == "--worker" ]]; then
  provider="${2:?missing provider}"
  case_file="${3:?missing case file}"
  case_base="$(basename "${case_file}")"
  case_id="${case_base%.txt}"
  log_file="${OUT_DIR}/${provider}-${case_id}.log"
  result_file="${RESULTS_DIR}/${provider}-${case_id}.tsv"
  prompt="$(cat "${case_file}")"

  status="success"
  timed_out="false"
  started_epoch="$(date +%s)"
  started_at="$(date -u +%Y-%m-%dT%H:%M:%SZ)"

  # Run the agent in the background so the loop below can enforce the timeout.
  SMC_DATA_DIR="${SMC_DATA_DIR}" \
  MULTICA_API_URL="${MULTICA_API_URL}" \
  pnpm multica run --run-log --provider "${provider}" "${prompt}" > "${log_file}" 2>&1 &
  cmd_pid=$!

  # Poll until the command exits or the per-case timeout elapses.
  while kill -0 "${cmd_pid}" 2>/dev/null; do
    if [[ "${TIMEOUT_ENABLED}" == "true" ]]; then
      now="$(date +%s)"
      elapsed="$((now - started_epoch))"
      if (( elapsed >= CASE_TIMEOUT_SEC )); then
        timed_out="true"
        # Ask politely first, then force-kill after a short grace period.
        # NOTE(review): only the pnpm process itself is signalled; confirm
        # that its child processes exit with it.
        kill "${cmd_pid}" 2>/dev/null || true
        sleep 1
        kill -9 "${cmd_pid}" 2>/dev/null || true
        break
      fi
    fi
    sleep 2
  done

  # Reap the child and capture its exit code without tripping `set -e`.
  exit_code=0
  wait "${cmd_pid}" 2>/dev/null || exit_code=$?

  ended_epoch="$(date +%s)"
  ended_at="$(date -u +%Y-%m-%dT%H:%M:%SZ)"
  duration_sec="$((ended_epoch - started_epoch))"

  # Classify the run. A log without a "[session: ...]" marker counts as failed.
  # grep/sed are used instead of ripgrep, which is not a listed prerequisite;
  # LC_ALL=C keeps matching byte-wise even if the log holds invalid UTF-8.
  if [[ "${timed_out}" == "true" ]]; then
    status="timeout"
    printf "\n[runner] timed out after %ss\n" "${CASE_TIMEOUT_SEC}" >> "${log_file}"
  elif (( exit_code != 0 )); then
    status="failed"
  elif [[ ! -s "${log_file}" ]]; then
    status="failed"
  elif ! LC_ALL=C grep -qF "[session: " "${log_file}"; then
    status="failed"
  fi

  # Extract the last reported session id / session dir. The greedy ".*" prefix
  # selects the last marker on a line and `tail -n 1` the last matching line.
  session_id="$(LC_ALL=C sed -nE 's/.*\[session: ([^]]+)\].*/\1/p' "${log_file}" | tail -n 1 || true)"
  session_dir="$(LC_ALL=C sed -nE 's/.*\[session-dir: ([^]]+)\].*/\1/p' "${log_file}" | tail -n 1 || true)"

  # One TSV row; the column order must match the manifest header.
  printf "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" \
    "${TIMESTAMP}" \
    "${provider}" \
    "${case_id}" \
    "${status}" \
    "${session_id}" \
    "${session_dir}" \
    "${log_file}" \
    "${started_at}" \
    "${ended_at}" \
    "${duration_sec}" \
    "${exit_code}" > "${result_file}"

  printf "[worker] provider=%s case=%s status=%s duration=%ss session=%s\n" \
    "${provider}" \
    "${case_id}" \
    "${status}" \
    "${duration_sec}" \
    "${session_id:-N/A}"

  # Always exit 0: the outcome is carried by the status column, not the exit code.
  exit 0
fi
# Orchestrator mode: fan tasks out to workers, merge their rows into the
# manifest, then run the structured analyzer.
mkdir -p "${OUT_DIR}"
mkdir -p "${RESULTS_DIR}"

# OUT_DIR can be reused via the environment; drop result rows left over from a
# previous run so they are not merged into this run's manifest.
rm -f "${RESULTS_DIR}"/*.tsv

printf "timestamp\tprovider\tcase_id\tstatus\tsession_id\tsession_dir\tlog_file\tstarted_at\tended_at\tduration_sec\texit_code\n" > "${MANIFEST}"

# PROVIDERS is a whitespace-separated list.
read -r -a PROVIDERS <<< "${PROVIDERS_RAW}"

# Collect matching case files in sorted order.
CASE_FILES=()
while IFS= read -r line; do
  CASE_FILES+=("${line}")
done < <(find "${CASES_DIR}" -maxdepth 1 -type f -name "${CASE_GLOB}" | sort)

if [[ ${#CASE_FILES[@]} -eq 0 ]]; then
  echo "No case files matched ${CASE_GLOB} in ${CASES_DIR}" >&2
  exit 1
fi

echo "Output directory: ${OUT_DIR}"
echo "Using SMC_DATA_DIR=${SMC_DATA_DIR}"
echo "Using MULTICA_API_URL=${MULTICA_API_URL}"
echo "Providers: ${PROVIDERS[*]}"
echo "Cases: ${#CASE_FILES[@]}"
echo "Max parallel: ${MAX_PARALLEL}"
if [[ "${TIMEOUT_ENABLED}" == "true" ]]; then
  echo "Case timeout: ${CASE_TIMEOUT_SEC}s"
else
  echo "Case timeout: disabled"
fi

# Flat list of (provider, case_file) pairs; xargs consumes two items per task.
TASKS=()
for provider in "${PROVIDERS[@]}"; do
  for case_file in "${CASE_FILES[@]}"; do
    TASKS+=("${provider}" "${case_file}")
  done
done

echo "Total tasks: $(( ${#TASKS[@]} / 2 ))"

# Workers are fresh bash processes, so they receive settings via the environment.
export TIMESTAMP OUT_DIR RESULTS_DIR SMC_DATA_DIR MULTICA_API_URL CASE_TIMEOUT_SEC TIMEOUT_ENABLED

# NUL-delimited so arguments containing spaces survive intact.
printf '%s\0' "${TASKS[@]}" | xargs -0 -n 2 -P "${MAX_PARALLEL}" bash "${BASH_SOURCE[0]}" --worker

# Merge per-task result rows into the manifest in a stable (sorted) order.
RESULT_FILES=()
while IFS= read -r line; do
  RESULT_FILES+=("${line}")
done < <(find "${RESULTS_DIR}" -maxdepth 1 -type f -name "*.tsv" | sort)

if [[ ${#RESULT_FILES[@]} -eq 0 ]]; then
  echo "No result files produced in ${RESULTS_DIR}" >&2
  exit 1
fi

for result_file in "${RESULT_FILES[@]}"; do
  cat "${result_file}" >> "${MANIFEST}"
done

# Column 4 of the manifest is the status; NR>1 skips the header row.
success_count="$(awk -F '\t' 'NR>1 && $4=="success" {c++} END{print c+0}' "${MANIFEST}")"
failed_count="$(awk -F '\t' 'NR>1 && $4=="failed" {c++} END{print c+0}' "${MANIFEST}")"
timeout_count="$(awk -F '\t' 'NR>1 && $4=="timeout" {c++} END{print c+0}' "${MANIFEST}")"

echo
echo "Completed run stage. Manifest: ${MANIFEST}"
echo "Run summary: success=${success_count} failed=${failed_count} timeout=${timeout_count}"
echo
echo "Running structured analysis..."
# With pipefail set, a failing analyzer makes this pipeline (and the script) fail.
node "${SCRIPT_DIR}/analyze.mjs" "${MANIFEST}" | tee "${OUT_DIR}/analysis.txt"