feat(e2e): add clawhub skills benchmark suite
This commit is contained in:
parent
0c1856b54b
commit
2074aac49e
7 changed files with 610 additions and 1 deletions
94
docs/e2e-skills-benchmark.md
Normal file
94
docs/e2e-skills-benchmark.md
Normal file
|
|
@ -0,0 +1,94 @@
|
|||
# Skills Agent-Driven E2E Benchmark
|
||||
|
||||
This benchmark validates the meta skill workflow for capability-gap discovery, ClawHub installation, and security-gated rollout.
|
||||
|
||||
## Scope
|
||||
|
||||
- Domain: skill discovery + installation + update
|
||||
- Focus: `skills/meta-skill-installer`
|
||||
- Providers: default `kimi-coding` (override with `PROVIDERS`)
|
||||
- Cases: 3
|
||||
|
||||
Case prompts are stored in:
|
||||
- `scripts/e2e-skills-benchmark/cases/`
|
||||
|
||||
## Real ClawHub Examples Used
|
||||
|
||||
The case set references real public pages from ClawHub:
|
||||
|
||||
- [CalDAV Calendar](https://clawhub.ai/skills/caldav-calendar)
|
||||
- [Home Assistant](https://clawhub.ai/skills/homeassistant)
|
||||
- [CodexMonitor](https://clawhub.ai/odrobnik/codexmonitor)
|
||||
|
||||
## Prerequisites
|
||||
|
||||
1. Credentials configured (`pnpm multica credentials init` if needed)
|
||||
2. Dependencies installed in repo (`pnpm install`)
|
||||
3. `clawhub` CLI available, or allow runtime fallback to `npx -y clawhub`
|
||||
4. Environment variables (the values below are the defaults `run.sh` applies when they are unset):
|
||||
|
||||
```bash
|
||||
export SMC_DATA_DIR=~/.super-multica-e2e
|
||||
export MULTICA_API_URL=https://api-dev.copilothub.ai
|
||||
```
|
||||
|
||||
## Run Benchmark
|
||||
|
||||
```bash
|
||||
scripts/e2e-skills-benchmark/run.sh
|
||||
```
|
||||
|
||||
Defaults:
|
||||
|
||||
- Providers: `kimi-coding`
|
||||
- Case glob: `case-*.txt`
|
||||
- Max parallel workers: `1`
|
||||
- Per-case timeout: `1200s` (`CASE_TIMEOUT_SEC=0` to disable)
|
||||
- Output directory: `.context/skills-e2e-runs/<timestamp>/`
|
||||
|
||||
Generated artifacts:
|
||||
|
||||
- `manifest.tsv`: provider/case/status/session/log metadata
|
||||
- `analysis.txt`: human-readable pass/fail report
|
||||
- `analysis.json`: structured detailed check output
|
||||
|
||||
## Run Subset
|
||||
|
||||
Only one case:
|
||||
|
||||
```bash
|
||||
CASE_GLOB="case-01-*.txt" scripts/e2e-skills-benchmark/run.sh
|
||||
```
|
||||
|
||||
Multiple providers:
|
||||
|
||||
```bash
|
||||
PROVIDERS="kimi-coding claude-code" scripts/e2e-skills-benchmark/run.sh
|
||||
```
|
||||
|
||||
Faster throughput:
|
||||
|
||||
```bash
|
||||
MAX_PARALLEL=2 CASE_TIMEOUT_SEC=1800 scripts/e2e-skills-benchmark/run.sh
|
||||
```
|
||||
|
||||
## Analyzer Checks
|
||||
|
||||
For each run:
|
||||
|
||||
1. `run_start` and `run_end` both present
|
||||
2. `run_end.error` is empty/null
|
||||
3. `tool_start` and `tool_end` event counts match
|
||||
4. no `tool_end.is_error=true`
|
||||
5. at least one `exec` tool call exists
|
||||
6. case-specific command evidence in the `args` of `exec` `tool_start` events (each case also requires its skill keyword, e.g. `caldav`):
|
||||
- `clawhub search`
|
||||
- `clawhub install`
|
||||
- `review-skill-security.mjs`
|
||||
- for case 03 also `clawhub update`
|
||||
|
||||
## Notes
|
||||
|
||||
- These are agent-driven tests; prompt intent plus run-log evidence are both evaluated.
|
||||
- `SMC_DATA_DIR=~/.super-multica-e2e` avoids polluting normal user skill/session data.
|
||||
- If a case fails, open `manifest.tsv` and inspect the matching `session_dir/run-log.jsonl`.
|
||||
|
|
@ -30,7 +30,8 @@
|
|||
"typecheck": "turbo typecheck",
|
||||
"test": "vitest run",
|
||||
"test:watch": "vitest",
|
||||
"test:coverage": "vitest run --coverage"
|
||||
"test:coverage": "vitest run --coverage",
|
||||
"e2e:skills": "bash scripts/e2e-skills-benchmark/run.sh"
|
||||
},
|
||||
"keywords": [],
|
||||
"author": "",
|
||||
|
|
|
|||
297
scripts/e2e-skills-benchmark/analyze.mjs
Executable file
297
scripts/e2e-skills-benchmark/analyze.mjs
Executable file
|
|
@ -0,0 +1,297 @@
|
|||
#!/usr/bin/env node
|
||||
|
||||
import { existsSync, readFileSync, writeFileSync } from "node:fs";
|
||||
import { dirname, join, resolve } from "node:path";
|
||||
|
||||
/**
|
||||
* @typedef {{
|
||||
* id: string;
|
||||
* check: string;
|
||||
* passed: boolean;
|
||||
* detail?: string;
|
||||
* }} CheckResult
|
||||
*/
|
||||
|
||||
/**
|
||||
* @typedef {{
|
||||
* provider: string;
|
||||
* caseId: string;
|
||||
* status: string;
|
||||
* sessionId: string;
|
||||
* sessionDir: string;
|
||||
* logFile: string;
|
||||
* checks: CheckResult[];
|
||||
* pass: boolean;
|
||||
* }} CaseAnalysis
|
||||
*/
|
||||
|
||||
// CLI entry: the first positional argument is the path to the manifest TSV.
const USAGE = "Usage: node scripts/e2e-skills-benchmark/analyze.mjs <manifest.tsv>";

const manifestArg = process.argv[2];

// An explicit help request is a successful invocation.
if (manifestArg === "--help" || manifestArg === "-h") {
  console.log(USAGE);
  process.exit(0);
}

// A missing (or empty) argument is a usage error: exit non-zero so that a
// mis-wired caller, e.g. an unset shell variable, is not reported as success.
if (!manifestArg) {
  console.error(USAGE);
  process.exit(1);
}

const manifestPath = resolve(manifestArg);
if (!existsSync(manifestPath)) {
  console.error(`Manifest not found: ${manifestPath}`);
  process.exit(1);
}
|
||||
|
||||
// Per-case evidence rules, keyed by case id.
// Each inner array is one requirement: a token group that must all appear
// (case-insensitively) within a single exec command of the run.
const CASE_RULES = {
  "case-01-install-caldav-calendar": {
    requiredCommandTokens: [["clawhub", "search"], ["caldav"], ["clawhub", "install"], ["review-skill-security.mjs"]],
  },
  "case-02-gap-discovery-homeassistant": {
    requiredCommandTokens: [
      ["clawhub", "search"],
      ["home", "assistant"],
      ["clawhub", "install"],
      ["review-skill-security.mjs"],
    ],
  },
  "case-03-install-update-codexmonitor": {
    requiredCommandTokens: [
      ["clawhub", "search"],
      ["codexmonitor"],
      ["clawhub", "install"],
      ["clawhub", "update"],
      ["review-skill-security.mjs"],
    ],
  },
};
|
||||
|
||||
/**
 * Break text into lines on LF or CRLF boundaries, discarding empty lines.
 *
 * @param {string} text
 * @returns {string[]} The non-empty lines, in their original order.
 */
function splitLines(text) {
  const nonEmpty = [];
  for (const segment of text.split(/\r?\n/)) {
    if (segment) {
      nonEmpty.push(segment);
    }
  }
  return nonEmpty;
}
|
||||
|
||||
/**
 * Report whether every token occurs somewhere in the command, ignoring case.
 * Tokens are matched as plain substrings, in any order.
 *
 * @param {string} command
 * @param {string[]} tokens
 * @returns {boolean} True when all tokens are present (also true for an empty token list).
 */
function commandHasTokens(command, tokens) {
  const haystack = command.toLowerCase();
  for (const token of tokens) {
    if (!haystack.includes(token.toLowerCase())) {
      return false;
    }
  }
  return true;
}
|
||||
|
||||
/**
 * Pull the shell command out of a tool call's raw argument string.
 *
 * When the string is valid JSON holding a string `command` property, that
 * property is returned. In every other case (invalid or truncated JSON, or no
 * string `command`) the raw text itself is returned so it can still be
 * searched for tokens.
 *
 * @param {string} rawArgs
 * @returns {string} The extracted command, the raw text, or "" for empty input.
 */
function extractCommand(rawArgs) {
  if (!rawArgs) return "";

  let decoded;
  try {
    decoded = JSON.parse(rawArgs);
  } catch {
    // Not parseable (args may be truncated JSON in the run log): use the raw text.
    return rawArgs;
  }

  const hasCommand = Boolean(decoded) && typeof decoded.command === "string";
  return hasCommand ? decoded.command : rawArgs;
}
|
||||
|
||||
/**
 * Read a JSONL run log and return its parsed event objects.
 *
 * Lines that are empty, not valid JSON, or valid JSON but not an object
 * (for example `null`, a number, or a string) are skipped. Keeping only
 * objects lets callers read properties such as `event` on every returned
 * entry without a null check.
 *
 * @param {string} runLogPath Path to the run-log.jsonl file.
 * @returns {object[]} Parsed events, in file order.
 */
function parseRunLog(runLogPath) {
  const events = [];
  for (const line of readFileSync(runLogPath, "utf-8").split(/\r?\n/)) {
    if (!line) continue;

    let parsed;
    try {
      parsed = JSON.parse(line);
    } catch {
      // Ignore malformed lines but keep analysis alive.
      continue;
    }

    // A bare `null` (or other primitive) parses fine but is not an event.
    if (parsed !== null && typeof parsed === "object") {
      events.push(parsed);
    }
  }
  return events;
}
|
||||
|
||||
/**
 * Record the outcome of one check on a case analysis (mutates `analysis.checks`).
 *
 * @param {CaseAnalysis} analysis Analysis record to append to.
 * @param {string} id Short machine-readable check identifier.
 * @param {string} check Human-readable description of the check.
 * @param {boolean} passed Whether the check succeeded.
 * @param {string} [detail] Optional supporting detail.
 */
function addCheck(analysis, id, check, passed, detail) {
  /** @type {CheckResult} */
  const entry = { id, check, passed, detail };
  analysis.checks.push(entry);
}
|
||||
|
||||
// Load the manifest. Row 0 is the TSV header; every following non-empty row
// describes one provider/case run.
const rows = splitLines(readFileSync(manifestPath, "utf-8"));
if (rows.length <= 1) {
  console.error(`Manifest has no data rows: ${manifestPath}`);
  process.exit(1);
}

// Minimum number of tab-separated columns a data row must have to be analyzed.
const MANIFEST_COLUMN_COUNT = 11;

/** @type {CaseAnalysis[]} */
const analyses = [];

for (let i = 1; i < rows.length; i++) {
  const row = rows[i];
  if (!row) continue;

  // Column order: timestamp, provider, case_id, status, session_id,
  // session_dir, log_file, then timing/exit-code columns that are not read here.
  const cols = row.split("\t");

  /** @type {CaseAnalysis} */
  const analysis = {
    provider: cols[1] ?? "",
    caseId: cols[2] ?? "",
    status: cols[3] ?? "",
    sessionId: cols[4] ?? "",
    sessionDir: cols[5] ?? "",
    logFile: cols[6] ?? "",
    checks: [],
    pass: false,
  };
  const { caseId, status, sessionDir } = analysis;

  // A short row is reported as a failed case instead of being dropped, so a
  // damaged manifest cannot shrink the case count and produce a green summary.
  if (cols.length < MANIFEST_COLUMN_COUNT) {
    addCheck(
      analysis,
      "manifest-row",
      "manifest row has all columns",
      false,
      `row=${i} columns=${cols.length} expected=${MANIFEST_COLUMN_COUNT}`,
    );
    analyses.push(analysis);
    continue;
  }

  addCheck(
    analysis,
    "run-status",
    "runner status is success",
    status === "success",
    `status=${status}`,
  );

  if (!sessionDir) {
    addCheck(analysis, "session-dir", "session_dir exists in manifest", false, "missing session_dir");
    analyses.push(analysis);
    continue;
  }

  const runLogPath = join(sessionDir, "run-log.jsonl");
  const hasRunLog = existsSync(runLogPath);
  addCheck(
    analysis,
    "run-log-file",
    "run-log.jsonl exists",
    hasRunLog,
    runLogPath,
  );

  if (!hasRunLog) {
    analyses.push(analysis);
    continue;
  }

  // Keep only object entries so the property reads below cannot throw on a
  // log line that parsed to `null` or another primitive.
  const events = parseRunLog(runLogPath).filter((e) => e !== null && typeof e === "object");
  const runStarts = events.filter((e) => e.event === "run_start");
  const runEnds = events.filter((e) => e.event === "run_end");
  const toolStarts = events.filter((e) => e.event === "tool_start");
  const toolEnds = events.filter((e) => e.event === "tool_end");
  const errorToolEnds = toolEnds.filter((e) => e.is_error === true);

  addCheck(analysis, "event-run-start", "has run_start", runStarts.length > 0, `count=${runStarts.length}`);
  addCheck(analysis, "event-run-end", "has run_end", runEnds.length > 0, `count=${runEnds.length}`);
  // Count comparison only: start/end events are not matched to each other individually.
  addCheck(
    analysis,
    "tool-pairing",
    "tool_start count matches tool_end count",
    toolStarts.length === toolEnds.length,
    `start=${toolStarts.length} end=${toolEnds.length}`,
  );

  // Only the last run_end is inspected for an error.
  const finalRunEnd = runEnds.at(-1);
  const runEndError = finalRunEnd?.error;
  addCheck(
    analysis,
    "run-end-error",
    "final run_end.error is null/empty",
    runEndError === null || runEndError === undefined || runEndError === "",
    `error=${String(runEndError)}`,
  );

  addCheck(
    analysis,
    "tool-errors",
    "no tool_end has is_error=true",
    errorToolEnds.length === 0,
    `error_tool_calls=${errorToolEnds.length}`,
  );

  const execCommands = toolStarts
    .filter((e) => e.tool === "exec")
    .map((e) => {
      if (typeof e.args === "string") return extractCommand(e.args);
      // Also accept args stored as an already-parsed object, so such exec
      // calls are inspected instead of being silently discarded.
      if (e.args !== null && typeof e.args === "object") return extractCommand(JSON.stringify(e.args));
      return "";
    })
    .filter(Boolean);

  addCheck(
    analysis,
    "exec-usage",
    "at least one exec command was used",
    execCommands.length > 0,
    `exec_calls=${execCommands.length}`,
  );

  // Own-property lookup: a case id such as "constructor" must not resolve to
  // an inherited Object.prototype member.
  const rules = Object.prototype.hasOwnProperty.call(CASE_RULES, caseId) ? CASE_RULES[caseId] : undefined;
  if (rules) {
    for (let r = 0; r < rules.requiredCommandTokens.length; r++) {
      const tokenList = rules.requiredCommandTokens[r];
      // All tokens of one group must appear together in a single exec command.
      const passed = execCommands.some((cmd) => commandHasTokens(cmd, tokenList));
      addCheck(
        analysis,
        `cmd-${r + 1}`,
        `exec command contains tokens: ${tokenList.join(" + ")}`,
        passed,
      );
    }
  } else {
    addCheck(
      analysis,
      "case-rules",
      "case has rule set",
      false,
      `No rules defined for case_id=${caseId}`,
    );
  }

  analysis.pass = analysis.checks.every((c) => c.passed);
  analyses.push(analysis);
}
|
||||
|
||||
// Summarize results, persist them next to the manifest, and print a report.
const passedCases = analyses.filter((a) => a.pass).length;
const failedCases = analyses.length - passedCases;

const output = {
  manifestPath,
  totalCases: analyses.length,
  passedCases,
  failedCases,
  results: analyses,
};

const outputPath = join(dirname(manifestPath), "analysis.json");
writeFileSync(outputPath, `${JSON.stringify(output, null, 2)}\n`, "utf-8");

for (const item of analyses) {
  const status = item.pass ? "PASS" : "FAIL";
  console.log(`[${status}] provider=${item.provider} case=${item.caseId} session=${item.sessionId || "N/A"}`);
  for (const check of item.checks) {
    const marker = check.passed ? "  [ok] " : "  [bad] ";
    const detail = check.detail ? ` (${check.detail})` : "";
    console.log(`${marker}${check.check}${detail}`);
  }
}

console.log("");
console.log(`Analysis file: ${outputPath}`);
console.log(`Summary: pass=${passedCases} fail=${failedCases}`);

// Fail when any case failed, and also when no case was analyzed at all: an
// empty result set must not be reported as a successful benchmark.
// process.exitCode (rather than process.exit) lets pending stdout writes
// finish when the output is piped.
if (failedCases > 0 || analyses.length === 0) {
  process.exitCode = 1;
}
|
||||
|
|
@ -0,0 +1,15 @@
|
|||
Run an end-to-end test for the Meta Skill Installer.
|
||||
|
||||
Goal: install a real ClawHub skill for CalDAV calendar capability.
|
||||
Reference page: https://clawhub.ai/skills/caldav-calendar
|
||||
|
||||
Follow this exact workflow:
|
||||
1. State the missing capability in one sentence.
|
||||
2. Search ClawHub for CalDAV-related skills and choose the best candidate.
|
||||
3. Stage-install to a temporary directory first (never install directly to active skills path).
|
||||
4. Run security review on the staged skill:
|
||||
DATA_DIR="${SMC_DATA_DIR:-$HOME/.super-multica}"
|
||||
node "$DATA_DIR/skills/meta-skill-installer/scripts/review-skill-security.mjs" "<staged skill path>"
|
||||
5. If riskLevel is safe, install to "$DATA_DIR/skills".
|
||||
6. Verify final install by checking "$DATA_DIR/skills/<slug>/SKILL.md" exists.
|
||||
7. Return a short report: selected slug, riskLevel, final install path.
|
||||
|
|
@ -0,0 +1,16 @@
|
|||
Run an end-to-end capability-gap discovery test for Meta Skill Installer.
|
||||
|
||||
User intent: "I need to control Home Assistant lights and switches from the agent."
|
||||
Reference page: https://clawhub.ai/skills/homeassistant
|
||||
|
||||
Requirements:
|
||||
1. Treat this as a missing capability and explicitly define the gap.
|
||||
2. Search ClawHub for relevant skills and list the top 3 candidates.
|
||||
3. Pick one candidate with rationale (scope match + lower security risk).
|
||||
4. Stage-install to a temporary directory.
|
||||
5. Run security review:
|
||||
DATA_DIR="${SMC_DATA_DIR:-$HOME/.super-multica}"
|
||||
node "$DATA_DIR/skills/meta-skill-installer/scripts/review-skill-security.mjs" "<staged skill path>"
|
||||
6. If riskLevel is safe, install to "$DATA_DIR/skills".
|
||||
7. Verify "$DATA_DIR/skills/<slug>/SKILL.md" exists.
|
||||
8. Return: candidate list, chosen slug, riskLevel, and final path.
|
||||
|
|
@ -0,0 +1,16 @@
|
|||
Run an end-to-end install+update regression test for Meta Skill Installer.
|
||||
|
||||
Goal: use a real ClawHub skill and verify install, review, and update flow.
|
||||
Reference page: https://clawhub.ai/odrobnik/codexmonitor
|
||||
|
||||
Requirements:
|
||||
1. Search ClawHub for CodexMonitor and select the matching skill slug.
|
||||
2. Stage-install to a temporary directory and run security review:
|
||||
DATA_DIR="${SMC_DATA_DIR:-$HOME/.super-multica}"
|
||||
node "$DATA_DIR/skills/meta-skill-installer/scripts/review-skill-security.mjs" "<staged skill path>"
|
||||
3. If riskLevel is safe, install to "$DATA_DIR/skills".
|
||||
4. Verify "$DATA_DIR/skills/<slug>/SKILL.md" exists.
|
||||
5. Run an update for the same slug in managed dir:
|
||||
clawhub update "<slug>" --workdir "$DATA_DIR" --dir skills --force
|
||||
6. Run security review again on the final installed path.
|
||||
7. Return: slug, initial riskLevel, update executed (yes/no), final path.
|
||||
170
scripts/e2e-skills-benchmark/run.sh
Executable file
170
scripts/e2e-skills-benchmark/run.sh
Executable file
|
|
@ -0,0 +1,170 @@
|
|||
#!/usr/bin/env bash
# Skills E2E benchmark runner: resolves paths, applies defaults for the
# environment-configurable settings, and validates the numeric ones.
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
ROOT_DIR="$(cd "${SCRIPT_DIR}/../.." && pwd)"
CASES_DIR="${SCRIPT_DIR}/cases"
TIMESTAMP="${TIMESTAMP:-$(date +%Y%m%d-%H%M%S)}"
OUT_DIR="${OUT_DIR:-${ROOT_DIR}/.context/skills-e2e-runs/${TIMESTAMP}}"
RESULTS_DIR="${OUT_DIR}/results"
MANIFEST="${OUT_DIR}/manifest.tsv"

# Required environment for agent-driven E2E.
SMC_DATA_DIR="${SMC_DATA_DIR:-$HOME/.super-multica-e2e}"
MULTICA_API_URL="${MULTICA_API_URL:-https://api-dev.copilothub.ai}"
PROVIDERS_RAW="${PROVIDERS:-kimi-coding}"
CASE_GLOB="${CASE_GLOB:-case-*.txt}"
CASE_TIMEOUT_SEC="${CASE_TIMEOUT_SEC:-1200}"
MAX_PARALLEL="${MAX_PARALLEL:-1}"

# Reject non-numeric timeouts up front: the worker compares this value
# arithmetically, where anything that is not a plain integer misbehaves.
if ! [[ "${CASE_TIMEOUT_SEC}" =~ ^[0-9]+$ ]]; then
  echo "CASE_TIMEOUT_SEC must be a non-negative integer (0 disables the timeout), got: ${CASE_TIMEOUT_SEC}" >&2
  exit 1
fi
# Force base 10 so a value with a leading zero (e.g. "0600") is not read as octal.
CASE_TIMEOUT_SEC="$((10#${CASE_TIMEOUT_SEC}))"

# A timeout of 0 disables the per-case timeout.
TIMEOUT_ENABLED="true"
if (( CASE_TIMEOUT_SEC <= 0 )); then
  TIMEOUT_ENABLED="false"
fi

if ! [[ "${MAX_PARALLEL}" =~ ^[0-9]+$ ]] || (( MAX_PARALLEL <= 0 )); then
  echo "MAX_PARALLEL must be a positive integer, got: ${MAX_PARALLEL}" >&2
  exit 1
fi
|
||||
|
||||
# Worker mode: run a single provider/case pair and write one result row.
# Usage: run.sh --worker <provider> <case_file>
if [[ "${1:-}" == "--worker" ]]; then
  provider="${2:?missing provider}"
  case_file="${3:?missing case file}"
  case_base="$(basename "${case_file}")"
  case_id="${case_base%.txt}"
  log_file="${OUT_DIR}/${provider}-${case_id}.log"
  result_file="${RESULTS_DIR}/${provider}-${case_id}.tsv"

  prompt="$(cat "${case_file}")"

  status="success"
  timed_out="false"
  started_epoch="$(date +%s)"
  started_at="$(date -u +%Y-%m-%dT%H:%M:%SZ)"

  # Run the agent in the background so the loop below can enforce the timeout.
  SMC_DATA_DIR="${SMC_DATA_DIR}" \
  MULTICA_API_URL="${MULTICA_API_URL}" \
  pnpm multica run --run-log --provider "${provider}" "${prompt}" > "${log_file}" 2>&1 &
  cmd_pid=$!

  # Poll every 2 seconds until the command exits or the timeout elapses.
  while kill -0 "${cmd_pid}" 2>/dev/null; do
    if [[ "${TIMEOUT_ENABLED}" == "true" ]]; then
      now="$(date +%s)"
      elapsed="$((now - started_epoch))"
      if (( elapsed >= CASE_TIMEOUT_SEC )); then
        timed_out="true"
        # Ask the process to stop first, then force-kill after a short grace period.
        kill "${cmd_pid}" 2>/dev/null || true
        sleep 1
        kill -9 "${cmd_pid}" 2>/dev/null || true
        break
      fi
    fi
    sleep 2
  done

  exit_code=0
  wait "${cmd_pid}" 2>/dev/null || exit_code=$?
  ended_epoch="$(date +%s)"
  ended_at="$(date -u +%Y-%m-%dT%H:%M:%SZ)"
  duration_sec="$((ended_epoch - started_epoch))"

  # Classify the run. grep is used instead of ripgrep so the runner needs no
  # tool beyond the standard utilities; a missing `rg` would otherwise mark
  # every case as failed.
  if [[ "${timed_out}" == "true" ]]; then
    status="timeout"
    printf "\n[runner] timed out after %ss\n" "${CASE_TIMEOUT_SEC}" >> "${log_file}"
  elif (( exit_code != 0 )); then
    status="failed"
  elif [[ ! -s "${log_file}" ]]; then
    status="failed"
  elif ! grep -q "\[session: " "${log_file}"; then
    status="failed"
  fi

  # Take the last "[session: ...]" / "[session-dir: ...]" markers from the log;
  # either value is left empty when its marker is absent.
  session_id="$(grep -oE "\[session: [^]]+\]" "${log_file}" | tail -n 1 | sed -E 's/\[session: ([^]]+)\]/\1/' || true)"
  session_dir="$(grep -oE "\[session-dir: [^]]+\]" "${log_file}" | tail -n 1 | sed -E 's/\[session-dir: ([^]]+)\]/\1/' || true)"

  # One TSV row, in the same column order as the manifest header.
  printf "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" \
    "${TIMESTAMP}" \
    "${provider}" \
    "${case_id}" \
    "${status}" \
    "${session_id}" \
    "${session_dir}" \
    "${log_file}" \
    "${started_at}" \
    "${ended_at}" \
    "${duration_sec}" \
    "${exit_code}" > "${result_file}"

  printf "[worker] provider=%s case=%s status=%s duration=%ss session=%s\n" \
    "${provider}" \
    "${case_id}" \
    "${status}" \
    "${duration_sec}" \
    "${session_id:-N/A}"
  # Always exit 0: the outcome is carried in the result row, not the exit status.
  exit 0
fi
|
||||
|
||||
# Orchestrator mode: prepare output, fan out one worker per provider/case pair,
# assemble the manifest from the per-worker result files, then run the analyzer.
mkdir -p "${OUT_DIR}" "${RESULTS_DIR}"
printf "timestamp\tprovider\tcase_id\tstatus\tsession_id\tsession_dir\tlog_file\tstarted_at\tended_at\tduration_sec\texit_code\n" > "${MANIFEST}"

read -r -a PROVIDERS <<< "${PROVIDERS_RAW}"

# Collect matching case files in sorted order.
CASE_FILES=()
while IFS= read -r case_path; do
  CASE_FILES+=("${case_path}")
done < <(find "${CASES_DIR}" -maxdepth 1 -type f -name "${CASE_GLOB}" | sort)

if (( ${#CASE_FILES[@]} == 0 )); then
  echo "No case files matched ${CASE_GLOB} in ${CASES_DIR}" >&2
  exit 1
fi

if [[ "${TIMEOUT_ENABLED}" == "true" ]]; then
  timeout_label="${CASE_TIMEOUT_SEC}s"
else
  timeout_label="disabled"
fi

cat <<EOF
Output directory: ${OUT_DIR}
Using SMC_DATA_DIR=${SMC_DATA_DIR}
Using MULTICA_API_URL=${MULTICA_API_URL}
Providers: ${PROVIDERS[*]}
Cases: ${#CASE_FILES[@]}
Max parallel: ${MAX_PARALLEL}
Case timeout: ${timeout_label}
EOF

# Flat list of (provider, case file) pairs; xargs consumes two entries per worker.
TASKS=()
for provider_name in "${PROVIDERS[@]}"; do
  for case_path in "${CASE_FILES[@]}"; do
    TASKS+=("${provider_name}" "${case_path}")
  done
done

echo "Total tasks: $(( ${#TASKS[@]} / 2 ))"

export TIMESTAMP OUT_DIR RESULTS_DIR SMC_DATA_DIR MULTICA_API_URL CASE_TIMEOUT_SEC TIMEOUT_ENABLED
printf '%s\0' "${TASKS[@]}" | xargs -0 -n 2 -P "${MAX_PARALLEL}" bash "${BASH_SOURCE[0]}" --worker

# Gather the per-worker result rows in sorted order.
RESULT_FILES=()
while IFS= read -r result_path; do
  RESULT_FILES+=("${result_path}")
done < <(find "${RESULTS_DIR}" -maxdepth 1 -type f -name "*.tsv" | sort)

if (( ${#RESULT_FILES[@]} == 0 )); then
  echo "No result files produced in ${RESULTS_DIR}" >&2
  exit 1
fi

cat "${RESULT_FILES[@]}" >> "${MANIFEST}"

# Print how many manifest data rows carry the given status (column 4).
count_status() {
  awk -F '\t' -v want="$1" 'NR>1 && $4==want {c++} END{print c+0}' "${MANIFEST}"
}

success_count="$(count_status success)"
failed_count="$(count_status failed)"
timeout_count="$(count_status timeout)"

printf '\n'
echo "Completed run stage. Manifest: ${MANIFEST}"
echo "Run summary: success=${success_count} failed=${failed_count} timeout=${timeout_count}"

printf '\n'
echo "Running structured analysis..."
node "${SCRIPT_DIR}/analyze.mjs" "${MANIFEST}" | tee "${OUT_DIR}/analysis.txt"
|
||||
Loading…
Add table
Add a link
Reference in a new issue