feat(e2e): add clawhub skills benchmark suite

2026-02-17 00:50:01 +08:00 · 2026-02-17 00:50:01 +08:00 · 2074aac49e
commit 2074aac49e
parent 0c1856b54b
7 changed files with 610 additions and 1 deletions
--- a/docs/e2e-skills-benchmark.md
+++ b/docs/e2e-skills-benchmark.md
@ -0,0 +1,94 @@
+# Skills Agent-Driven E2E Benchmark
+
+This benchmark validates the meta skill workflow for capability-gap discovery, ClawHub installation, and security-gated rollout.
+
+## Scope
+
+- Domain: skill discovery + installation + update
+- Focus: `skills/meta-skill-installer`
+- Providers: default `kimi-coding` (override with `PROVIDERS`)
+- Cases: 3
+
+Case prompts are stored in:
+- `scripts/e2e-skills-benchmark/cases/`
+
+## Real ClawHub Examples Used
+
+The case set references real public pages from ClawHub:
+
+- [CalDAV Calendar](https://clawhub.ai/skills/caldav-calendar)
+- [Home Assistant](https://clawhub.ai/skills/homeassistant)
+- [CodexMonitor](https://clawhub.ai/odrobnik/codexmonitor)
+
+## Prerequisites
+
+1. Credentials configured (`pnpm multica credentials init` if needed)
+2. Dependencies installed in repo (`pnpm install`)
+3. `clawhub` CLI available, or allow runtime fallback to `npx -y clawhub`
+4. Required env:
+
+```bash
+export SMC_DATA_DIR=~/.super-multica-e2e
+export MULTICA_API_URL=https://api-dev.copilothub.ai
+```
+
+## Run Benchmark
+
+```bash
+scripts/e2e-skills-benchmark/run.sh
+```
+
+Defaults:
+
+- Providers: `kimi-coding`
+- Case glob: `case-*.txt`
+- Max parallel workers: `1`
+- Per-case timeout: `1200s` (`CASE_TIMEOUT_SEC=0` to disable)
+- Output directory: `.context/skills-e2e-runs/<timestamp>/`
+
+Generated artifacts:
+
+- `manifest.tsv`: provider/case/status/session/log metadata
+- `analysis.txt`: human-readable pass/fail report
+- `analysis.json`: structured detailed check output
+
+## Run Subset
+
+Only one case:
+
+```bash
+CASE_GLOB="case-01-*.txt" scripts/e2e-skills-benchmark/run.sh
+```
+
+Multiple providers:
+
+```bash
+PROVIDERS="kimi-coding claude-code" scripts/e2e-skills-benchmark/run.sh
+```
+
+Faster throughput:
+
+```bash
+MAX_PARALLEL=2 CASE_TIMEOUT_SEC=1800 scripts/e2e-skills-benchmark/run.sh
+```
+
+## Analyzer Checks
+
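+For each run, after confirming the runner status is `success` and that `run-log.jsonl` exists in the session directory: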
+
+1. `run_start` and `run_end` both present
+2. `run_end.error` is empty/null
+3. `tool_start` and `tool_end` are paired
+4. no `tool_end.is_error=true`
+5. at least one `exec` tool call exists
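+6. case-specific command evidence in the `tool_start.args` of `exec` calls (case-insensitive substring match):
+   - `clawhub search`
+   - the skill keyword for the case (`caldav`, `home` + `assistant`, or `codexmonitor`)
+   - `clawhub install`
+   - `review-skill-security.mjs`
+   - for case 03 also `clawhub update`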
+
+## Notes
+
+- These are agent-driven tests; prompt intent plus run-log evidence are both evaluated.
+- `SMC_DATA_DIR=~/.super-multica-e2e` avoids polluting normal user skill/session data.
+- If a case fails, open `manifest.tsv` and inspect the matching `session_dir/run-log.jsonl`.
--- a/package.json
+++ b/package.json
@ -30,7 +30,8 @@
    "typecheck": "turbo typecheck",
    "test": "vitest run",
    "test:watch": "vitest",
-    "test:coverage": "vitest run --coverage"
+    "test:coverage": "vitest run --coverage",
+    "e2e:skills": "bash scripts/e2e-skills-benchmark/run.sh"
  },
  "keywords": [],
  "author": "",
--- a/scripts/e2e-skills-benchmark/analyze.mjs
+++ b/scripts/e2e-skills-benchmark/analyze.mjs
@ -0,0 +1,297 @@
+#!/usr/bin/env node
+
+import { existsSync, readFileSync, writeFileSync } from "node:fs";
+import { dirname, join, resolve } from "node:path";
+
+/**
+ * @typedef {{
+ *   id: string;
+ *   check: string;
+ *   passed: boolean;
+ *   detail?: string;
+ * }} CheckResult
+ */
+
+/**
+ * @typedef {{
+ *   provider: string;
+ *   caseId: string;
+ *   status: string;
+ *   sessionId: string;
+ *   sessionDir: string;
+ *   logFile: string;
+ *   checks: CheckResult[];
+ *   pass: boolean;
+ * }} CaseAnalysis
+ */
+
+const manifestArg = process.argv[2];
+if (!manifestArg || manifestArg === "--help" || manifestArg === "-h") {
+  console.log("Usage: node scripts/e2e-skills-benchmark/analyze.mjs <manifest.tsv>");
+  process.exit(0);
+}
+
+const manifestPath = resolve(manifestArg);
+if (!existsSync(manifestPath)) {
+  console.error(`Manifest not found: ${manifestPath}`);
+  process.exit(1);
+}
+
+const CASE_RULES = { // keyed by case id; looked up with the manifest's case_id column
+  "case-01-install-caldav-calendar": {
+    requiredCommandTokens: [ // each inner list must be matched in full by a single exec command
+      ["clawhub", "search"],
+      ["caldav"],
+      ["clawhub", "install"],
+      ["review-skill-security.mjs"],
+    ],
+  },
+  "case-02-gap-discovery-homeassistant": {
+    requiredCommandTokens: [
+      ["clawhub", "search"],
+      ["home", "assistant"], // substring match, so this also accepts "homeassistant"
+      ["clawhub", "install"],
+      ["review-skill-security.mjs"],
+    ],
+  },
+  "case-03-install-update-codexmonitor": {
+    requiredCommandTokens: [
+      ["clawhub", "search"],
+      ["codexmonitor"],
+      ["clawhub", "install"],
+      ["clawhub", "update"],
+      ["review-skill-security.mjs"],
+    ],
+  },
+};
+
+/**
+ * Split text into lines on LF or CRLF and drop empty lines.
+ *
+ * Only empty strings are removed; a line that contains nothing but
+ * whitespace is kept because it is still truthy for the `Boolean` filter.
+ *
+ * @param {string} text - Raw text to split.
+ * @returns {string[]} The non-empty lines, in their original order.
+ */
+function splitLines(text) {
+  return text.split(/\r?\n/).filter(Boolean);
+}
+
+/**
+ * Report whether a command contains every token as a case-insensitive substring.
+ *
+ * Matching is plain substring search: token order and word boundaries are not
+ * checked. An empty token list always yields `true`.
+ *
+ * @param {string} command - Command text to search.
+ * @param {string[]} tokens - Substrings that must all be present.
+ * @returns {boolean} `true` when every token occurs somewhere in `command`.
+ */
+function commandHasTokens(command, tokens) {
+  const lower = command.toLowerCase();
+  return tokens.every((token) => lower.includes(token.toLowerCase()));
+}
+
+/**
+ * Pull the command text out of a tool call's raw `args` string.
+ *
+ * When `rawArgs` parses as JSON and the result has a string `command`
+ * property, that property is returned. In every other case (invalid JSON, or
+ * JSON without a string `command`) the raw string is returned unchanged so
+ * callers can still substring-match against it.
+ *
+ * @param {string} rawArgs - Serialized tool arguments.
+ * @returns {string} The extracted command, the raw string, or `""` for falsy input.
+ */
+function extractCommand(rawArgs) {
+  if (!rawArgs) return "";
+  try {
+    const parsed = JSON.parse(rawArgs);
+    if (parsed && typeof parsed.command === "string") {
+      return parsed.command;
+    }
+  } catch {
+    // Fall through: args may be truncated JSON in run-log.
+  }
+  return rawArgs;
+}
+
+/**
+ * Read a JSONL run log and return its parsed entries.
+ *
+ * Each non-empty line is parsed with `JSON.parse`; lines that fail to parse
+ * are skipped. Any valid JSON value is kept, so an entry is not guaranteed to
+ * be an object (a line containing `null` yields a `null` entry).
+ *
+ * Errors from `readFileSync` are not caught and propagate to the caller.
+ *
+ * @param {string} runLogPath - Path of the run log file to read.
+ * @returns {any[]} Parsed values in file order.
+ */
+function parseRunLog(runLogPath) {
+  const lines = splitLines(readFileSync(runLogPath, "utf-8"));
+  const events = [];
+  for (const line of lines) {
+    try {
+      events.push(JSON.parse(line));
+    } catch {
+      // Ignore malformed lines but keep analysis alive.
+    }
+  }
+  return events;
+}
+
+/**
+ * Append one check result to `analysis.checks` (mutates `analysis`).
+ *
+ * @param {CaseAnalysis} analysis - Case record that receives the result.
+ * @param {string} id - Short identifier for the check.
+ * @param {string} check - Human-readable description of what was checked.
+ * @param {boolean} passed - Whether the check succeeded.
+ * @param {string} [detail] - Optional supporting detail; stored as `undefined` when omitted.
+ */
+function addCheck(analysis, id, check, passed, detail) {
+  analysis.checks.push({ id, check, passed, detail });
+}
+
+const rows = splitLines(readFileSync(manifestPath, "utf-8"));
+if (rows.length <= 1) {
+  console.error(`Manifest has no data rows: ${manifestPath}`);
+  process.exit(1);
+}
+
+/** @type {CaseAnalysis[]} */
+const analyses = [];
+
+for (let i = 1; i < rows.length; i++) {
+  const row = rows[i];
+  if (!row) continue;
+
+  const cols = row.split("\t");
+  if (cols.length < 11) continue;
+
+  const provider = cols[1] ?? "";
+  const caseId = cols[2] ?? "";
+  const status = cols[3] ?? "";
+  const sessionId = cols[4] ?? "";
+  const sessionDir = cols[5] ?? "";
+  const logFile = cols[6] ?? "";
+
+  /** @type {CaseAnalysis} */
+  const analysis = {
+    provider,
+    caseId,
+    status,
+    sessionId,
+    sessionDir,
+    logFile,
+    checks: [],
+    pass: false,
+  };
+
+  addCheck(
+    analysis,
+    "run-status",
+    "runner status is success",
+    status === "success",
+    `status=${status}`,
+  );
+
+  if (!sessionDir) {
+    addCheck(analysis, "session-dir", "session_dir exists in manifest", false, "missing session_dir");
+    analyses.push(analysis);
+    continue;
+  }
+
+  const runLogPath = join(sessionDir, "run-log.jsonl");
+  addCheck(
+    analysis,
+    "run-log-file",
+    "run-log.jsonl exists",
+    existsSync(runLogPath),
+    runLogPath,
+  );
+
+  if (!existsSync(runLogPath)) {
+    analyses.push(analysis);
+    continue;
+  }
+
+  const events = parseRunLog(runLogPath);
+  const runStarts = events.filter((e) => e.event === "run_start");
+  const runEnds = events.filter((e) => e.event === "run_end");
+  const toolStarts = events.filter((e) => e.event === "tool_start");
+  const toolEnds = events.filter((e) => e.event === "tool_end");
+  const errorToolEnds = toolEnds.filter((e) => e.is_error === true);
+
+  addCheck(analysis, "event-run-start", "has run_start", runStarts.length > 0, `count=${runStarts.length}`);
+  addCheck(analysis, "event-run-end", "has run_end", runEnds.length > 0, `count=${runEnds.length}`);
+  addCheck(
+    analysis,
+    "tool-pairing",
+    "tool_start count matches tool_end count",
+    toolStarts.length === toolEnds.length,
+    `start=${toolStarts.length} end=${toolEnds.length}`,
+  );
+
+  const finalRunEnd = runEnds.at(-1);
+  const runEndError = finalRunEnd?.error;
+  addCheck(
+    analysis,
+    "run-end-error",
+    "final run_end.error is null/empty",
+    runEndError === null || runEndError === undefined || runEndError === "",
+    `error=${String(runEndError)}`,
+  );
+
+  addCheck(
+    analysis,
+    "tool-errors",
+    "no tool_end has is_error=true",
+    errorToolEnds.length === 0,
+    `error_tool_calls=${errorToolEnds.length}`,
+  );
+
+  const execCommands = toolStarts
+    .filter((e) => e.tool === "exec")
+    .map((e) => extractCommand(typeof e.args === "string" ? e.args : ""))
+    .filter(Boolean);
+
+  addCheck(
+    analysis,
+    "exec-usage",
+    "at least one exec command was used",
+    execCommands.length > 0,
+    `exec_calls=${execCommands.length}`,
+  );
+
+  const rules = CASE_RULES[caseId];
+  if (rules) {
+    for (let r = 0; r < rules.requiredCommandTokens.length; r++) {
+      const tokenList = rules.requiredCommandTokens[r];
+      const passed = execCommands.some((cmd) => commandHasTokens(cmd, tokenList));
+      addCheck(
+        analysis,
+        `cmd-${r + 1}`,
+        `exec command contains tokens: ${tokenList.join(" + ")}`,
+        passed,
+      );
+    }
+  } else {
+    addCheck(
+      analysis,
+      "case-rules",
+      "case has rule set",
+      false,
+      `No rules defined for case_id=${caseId}`,
+    );
+  }
+
+  analysis.pass = analysis.checks.every((c) => c.passed);
+  analyses.push(analysis);
+}
+
+const passedCases = analyses.filter((a) => a.pass).length;
+const failedCases = analyses.length - passedCases;
+
+const output = {
+  manifestPath,
+  totalCases: analyses.length,
+  passedCases,
+  failedCases,
+  results: analyses,
+};
+
+const outputPath = join(dirname(manifestPath), "analysis.json");
+writeFileSync(outputPath, JSON.stringify(output, null, 2) + "\n", "utf-8");
+
+for (const item of analyses) {
+  const status = item.pass ? "PASS" : "FAIL";
+  console.log(`[${status}] provider=${item.provider} case=${item.caseId} session=${item.sessionId || "N/A"}`);
+  for (const check of item.checks) {
+    const marker = check.passed ? "  [ok]  " : "  [bad] ";
+    const detail = check.detail ? ` (${check.detail})` : "";
+    console.log(`${marker}${check.check}${detail}`);
+  }
+}
+
+console.log("");
+console.log(`Analysis file: ${outputPath}`);
+console.log(`Summary: pass=${passedCases} fail=${failedCases}`);
+
+if (failedCases > 0) {
+  process.exit(1);
+}
--- a/scripts/e2e-skills-benchmark/cases/case-01-install-caldav-calendar.txt
+++ b/scripts/e2e-skills-benchmark/cases/case-01-install-caldav-calendar.txt
@ -0,0 +1,15 @@
+Run an end-to-end test for the Meta Skill Installer.
+
+Goal: install a real ClawHub skill for CalDAV calendar capability.
+Reference page: https://clawhub.ai/skills/caldav-calendar
+
+Follow this exact workflow:
+1. State the missing capability in one sentence.
+2. Search ClawHub for CalDAV-related skills and choose the best candidate.
+3. Stage-install to a temporary directory first (never install directly to active skills path).
+4. Run security review on the staged skill:
+   DATA_DIR="${SMC_DATA_DIR:-$HOME/.super-multica}"
+   node "$DATA_DIR/skills/meta-skill-installer/scripts/review-skill-security.mjs" "<staged skill path>"
+5. If riskLevel is safe, install to "$DATA_DIR/skills".
+6. Verify final install by checking "$DATA_DIR/skills/<slug>/SKILL.md" exists.
+7. Return a short report: selected slug, riskLevel, final install path.
--- a/scripts/e2e-skills-benchmark/cases/case-02-gap-discovery-homeassistant.txt
+++ b/scripts/e2e-skills-benchmark/cases/case-02-gap-discovery-homeassistant.txt
@ -0,0 +1,16 @@
+Run an end-to-end capability-gap discovery test for Meta Skill Installer.
+
+User intent: "I need to control Home Assistant lights and switches from the agent."
+Reference page: https://clawhub.ai/skills/homeassistant
+
+Requirements:
+1. Treat this as a missing capability and explicitly define the gap.
+2. Search ClawHub for relevant skills and list the top 3 candidates.
+3. Pick one candidate with rationale (scope match + lower security risk).
+4. Stage-install to a temporary directory.
+5. Run security review:
+   DATA_DIR="${SMC_DATA_DIR:-$HOME/.super-multica}"
+   node "$DATA_DIR/skills/meta-skill-installer/scripts/review-skill-security.mjs" "<staged skill path>"
+6. If riskLevel is safe, install to "$DATA_DIR/skills".
+7. Verify "$DATA_DIR/skills/<slug>/SKILL.md" exists.
+8. Return: candidate list, chosen slug, riskLevel, and final path.
--- a/scripts/e2e-skills-benchmark/cases/case-03-install-update-codexmonitor.txt
+++ b/scripts/e2e-skills-benchmark/cases/case-03-install-update-codexmonitor.txt
@ -0,0 +1,16 @@
+Run an end-to-end install+update regression test for Meta Skill Installer.
+
+Goal: use a real ClawHub skill and verify install, review, and update flow.
+Reference page: https://clawhub.ai/odrobnik/codexmonitor
+
+Requirements:
+1. Search ClawHub for CodexMonitor and select the matching skill slug.
+2. Stage-install to a temporary directory and run security review:
+   DATA_DIR="${SMC_DATA_DIR:-$HOME/.super-multica}"
+   node "$DATA_DIR/skills/meta-skill-installer/scripts/review-skill-security.mjs" "<staged skill path>"
+3. If riskLevel is safe, install to "$DATA_DIR/skills".
+4. Verify "$DATA_DIR/skills/<slug>/SKILL.md" exists.
+5. Run an update for the same slug in managed dir:
+   clawhub update "<slug>" --workdir "$DATA_DIR" --dir skills --force
+6. Run security review again on the final installed path.
+7. Return: slug, initial riskLevel, update executed (yes/no), final path.
--- a/scripts/e2e-skills-benchmark/run.sh
+++ b/scripts/e2e-skills-benchmark/run.sh
@ -0,0 +1,170 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+ROOT_DIR="$(cd "${SCRIPT_DIR}/../.." && pwd)"
+CASES_DIR="${SCRIPT_DIR}/cases"
+TIMESTAMP="${TIMESTAMP:-$(date +%Y%m%d-%H%M%S)}"
+OUT_DIR="${OUT_DIR:-${ROOT_DIR}/.context/skills-e2e-runs/${TIMESTAMP}}"
+RESULTS_DIR="${OUT_DIR}/results"
+MANIFEST="${OUT_DIR}/manifest.tsv"
+
+# Required environment for agent-driven E2E.
+SMC_DATA_DIR="${SMC_DATA_DIR:-$HOME/.super-multica-e2e}"
+MULTICA_API_URL="${MULTICA_API_URL:-https://api-dev.copilothub.ai}"
+PROVIDERS_RAW="${PROVIDERS:-kimi-coding}"
+CASE_GLOB="${CASE_GLOB:-case-*.txt}"
+CASE_TIMEOUT_SEC="${CASE_TIMEOUT_SEC:-1200}"
+MAX_PARALLEL="${MAX_PARALLEL:-1}"
+TIMEOUT_ENABLED="true"
+if [[ "${CASE_TIMEOUT_SEC}" =~ ^[0-9]+$ ]] && (( CASE_TIMEOUT_SEC <= 0 )); then
+  TIMEOUT_ENABLED="false"
+fi
+
+if ! [[ "${MAX_PARALLEL}" =~ ^[0-9]+$ ]] || (( MAX_PARALLEL <= 0 )); then
+  echo "MAX_PARALLEL must be a positive integer, got: ${MAX_PARALLEL}" >&2
+  exit 1
+fi
+
+if [[ "${1:-}" == "--worker" ]]; then
+  provider="${2:?missing provider}"
+  case_file="${3:?missing case file}"
+  case_base="$(basename "${case_file}")"
+  case_id="${case_base%.txt}"
+  log_file="${OUT_DIR}/${provider}-${case_id}.log"
+  result_file="${RESULTS_DIR}/${provider}-${case_id}.tsv"
+
+  prompt="$(cat "${case_file}")"
+
+  status="success"
+  timed_out="false"
+  started_epoch="$(date +%s)"
+  started_at="$(date -u +%Y-%m-%dT%H:%M:%SZ)"
+
+  SMC_DATA_DIR="${SMC_DATA_DIR}" \
+    MULTICA_API_URL="${MULTICA_API_URL}" \
+    pnpm multica run --run-log --provider "${provider}" "${prompt}" > "${log_file}" 2>&1 &
+  cmd_pid=$!
+
+  while kill -0 "${cmd_pid}" 2>/dev/null; do
+    if [[ "${TIMEOUT_ENABLED}" == "true" ]]; then
+      now="$(date +%s)"
+      elapsed="$((now - started_epoch))"
+      if (( elapsed >= CASE_TIMEOUT_SEC )); then
+        timed_out="true"
+        kill "${cmd_pid}" 2>/dev/null || true
+        sleep 1
+        kill -9 "${cmd_pid}" 2>/dev/null || true
+        break
+      fi
+    fi
+    sleep 2
+  done
+
+  exit_code=0
+  wait "${cmd_pid}" 2>/dev/null || exit_code=$?
+  ended_epoch="$(date +%s)"
+  ended_at="$(date -u +%Y-%m-%dT%H:%M:%SZ)"
+  duration_sec="$((ended_epoch - started_epoch))"
+
+  if [[ "${timed_out}" == "true" ]]; then
+    status="timeout"
+    printf "\n[runner] timed out after %ss\n" "${CASE_TIMEOUT_SEC}" >> "${log_file}"
+  elif (( exit_code != 0 )); then
+    status="failed"
+  elif [[ ! -s "${log_file}" ]]; then
+    status="failed"
+  elif ! rg -q "\[session: " "${log_file}"; then
+    status="failed"
+  fi
+
+  session_id="$(rg -o "\[session: [^]]+\]" "${log_file}" | tail -n 1 | sed -E 's/\[session: ([^]]+)\]/\1/' || true)"
+  session_dir="$(rg -o "\[session-dir: [^]]+\]" "${log_file}" | tail -n 1 | sed -E 's/\[session-dir: ([^]]+)\]/\1/' || true)"
+
+  printf "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" \
+    "${TIMESTAMP}" \
+    "${provider}" \
+    "${case_id}" \
+    "${status}" \
+    "${session_id}" \
+    "${session_dir}" \
+    "${log_file}" \
+    "${started_at}" \
+    "${ended_at}" \
+    "${duration_sec}" \
+    "${exit_code}" > "${result_file}"
+
+  printf "[worker] provider=%s case=%s status=%s duration=%ss session=%s\n" \
+    "${provider}" \
+    "${case_id}" \
+    "${status}" \
+    "${duration_sec}" \
+    "${session_id:-N/A}"
+  exit 0
+fi
+
+# Orchestrator mode: prepare the output directory, fan out workers, merge their
+# result rows into the manifest, then run the analyzer.
+mkdir -p "${OUT_DIR}"
+mkdir -p "${RESULTS_DIR}"
+# OUT_DIR can be supplied by the caller and reused between runs; remove result
+# rows left by an earlier run so they are not merged into this manifest.
+find "${RESULTS_DIR}" -maxdepth 1 -type f -name "*.tsv" -delete
+printf "timestamp\tprovider\tcase_id\tstatus\tsession_id\tsession_dir\tlog_file\tstarted_at\tended_at\tduration_sec\texit_code\n" > "${MANIFEST}"
+
+read -r -a PROVIDERS <<< "${PROVIDERS_RAW}"
+
+CASE_FILES=()
+while IFS= read -r line; do
+  CASE_FILES+=("${line}")
+done < <(find "${CASES_DIR}" -maxdepth 1 -type f -name "${CASE_GLOB}" | sort)
+
+if [[ ${#CASE_FILES[@]} -eq 0 ]]; then
+  echo "No case files matched ${CASE_GLOB} in ${CASES_DIR}" >&2
+  exit 1
+fi
+
+echo "Output directory: ${OUT_DIR}"
+echo "Using SMC_DATA_DIR=${SMC_DATA_DIR}"
+echo "Using MULTICA_API_URL=${MULTICA_API_URL}"
+echo "Providers: ${PROVIDERS[*]}"
+echo "Cases: ${#CASE_FILES[@]}"
+echo "Max parallel: ${MAX_PARALLEL}"
+if [[ "${TIMEOUT_ENABLED}" == "true" ]]; then
+  echo "Case timeout: ${CASE_TIMEOUT_SEC}s"
+else
+  echo "Case timeout: disabled"
+fi
+
+# Flat list of (provider, case file) pairs; xargs consumes two items per worker.
+TASKS=()
+for provider in "${PROVIDERS[@]}"; do
+  for case_file in "${CASE_FILES[@]}"; do
+    TASKS+=("${provider}" "${case_file}")
+  done
+done
+
+expected_tasks="$(( ${#TASKS[@]} / 2 ))"
+echo "Total tasks: ${expected_tasks}"
+
+export TIMESTAMP OUT_DIR RESULTS_DIR SMC_DATA_DIR MULTICA_API_URL CASE_TIMEOUT_SEC TIMEOUT_ENABLED
+# Do not let `set -e` abort here when a worker crashes: the rows that were
+# written are still merged and analyzed, and the failure is reported at the end.
+worker_rc=0
+printf '%s\0' "${TASKS[@]}" | xargs -0 -n 2 -P "${MAX_PARALLEL}" bash "${BASH_SOURCE[0]}" --worker || worker_rc=$?
+if (( worker_rc != 0 )); then
+  echo "Warning: worker stage exited with status ${worker_rc}; continuing with the results that were written" >&2
+fi
+
+RESULT_FILES=()
+while IFS= read -r line; do
+  RESULT_FILES+=("${line}")
+done < <(find "${RESULTS_DIR}" -maxdepth 1 -type f -name "*.tsv" | sort)
+
+if [[ ${#RESULT_FILES[@]} -eq 0 ]]; then
+  echo "No result files produced in ${RESULTS_DIR}" >&2
+  exit 1
+fi
+
+# Fewer result rows than tasks means at least one case was never recorded.
+missing_results="$(( expected_tasks - ${#RESULT_FILES[@]} ))"
+if (( missing_results > 0 )); then
+  echo "Warning: ${missing_results} of ${expected_tasks} tasks produced no result row" >&2
+fi
+
+for result_file in "${RESULT_FILES[@]}"; do
+  cat "${result_file}" >> "${MANIFEST}"
+done
+
+success_count="$(awk -F '\t' 'NR>1 && $4=="success" {c++} END{print c+0}' "${MANIFEST}")"
+failed_count="$(awk -F '\t' 'NR>1 && $4=="failed" {c++} END{print c+0}' "${MANIFEST}")"
+timeout_count="$(awk -F '\t' 'NR>1 && $4=="timeout" {c++} END{print c+0}' "${MANIFEST}")"
+
+echo
+echo "Completed run stage. Manifest: ${MANIFEST}"
+echo "Run summary: success=${success_count} failed=${failed_count} timeout=${timeout_count}"
+
+echo
+echo "Running structured analysis..."
+# pipefail makes the pipeline report the analyzer's exit status, not tee's.
+analysis_rc=0
+node "${SCRIPT_DIR}/analyze.mjs" "${MANIFEST}" | tee "${OUT_DIR}/analysis.txt" || analysis_rc=$?
+
+if (( analysis_rc != 0 )); then
+  exit "${analysis_rc}"
+fi
+# The analyzer only sees rows that exist, so an incomplete worker stage must
+# fail the run even when every recorded case passed.
+if (( worker_rc != 0 || missing_results > 0 )); then
+  echo "Benchmark incomplete: worker_status=${worker_rc} missing_results=${missing_results}" >&2
+  exit 1
+fi