#!/usr/bin/env node import { existsSync, readFileSync, writeFileSync } from "node:fs"; import { dirname, join, resolve } from "node:path"; /** * @typedef {{ * id: string; * check: string; * passed: boolean; * detail?: string; * }} CheckResult */ /** * @typedef {{ * provider: string; * caseId: string; * status: string; * sessionId: string; * sessionDir: string; * logFile: string; * checks: CheckResult[]; * pass: boolean; * }} CaseAnalysis */ const manifestArg = process.argv[2]; if (!manifestArg || manifestArg === "--help" || manifestArg === "-h") { console.log("Usage: node scripts/e2e-skills-benchmark/analyze.mjs "); process.exit(0); } const manifestPath = resolve(manifestArg); if (!existsSync(manifestPath)) { console.error(`Manifest not found: ${manifestPath}`); process.exit(1); } const CASE_RULES = { "case-01-install-caldav-calendar": { requiredCommandTokens: [ ["clawhub", "search"], ["caldav"], ["clawhub", "install"], ["review-skill-security.mjs"], ], }, "case-02-gap-discovery-homeassistant": { requiredCommandTokens: [ ["clawhub", "search"], ["home", "assistant"], ["clawhub", "install"], ["review-skill-security.mjs"], ], }, "case-03-install-update-codexmonitor": { requiredCommandTokens: [ ["clawhub", "search"], ["codexmonitor"], ["clawhub", "install"], ["clawhub", "update"], ["review-skill-security.mjs"], ], }, "case-04-gap-discovery-spotify-ux": { requireExecUsage: false, requiredResponseRegex: [ "缺少|没有.*(技能|能力|集成)|capability gap", "clawhub|cloud\\s*hub|cloudhub", "安装|install", "是否|要不要|would you like|do you want", "安全|审查|security|review", ], forbiddenCommandTokens: [ ["clawhub", "install"], ["clawhub", "update"], ["osascript"], ["spogo"], ["spotify_player"], ["ha.sh"], ["/api/states"], ], }, "case-05-gap-discovery-notion-ux": { requireExecUsage: false, requiredCommandTokens: [ ["clawhub", "search"], ["notion"], ], requiredEventTokens: [ ["install_guard", "blocked"], ], requiredResponseRegex: [ "notion", "安装|install", "是否|要不要|would you like|do you want|同意", "token|授权|integration", ], forbiddenCommandTokens: [ ["osascript"], ["spogo"], ["spotify_player"], ["ha.sh"], ["/api/states"], ], }, }; /** * @param {string} text * @returns {string[]} */ function splitLines(text) { return text.split(/\r?\n/).filter(Boolean); } /** * @param {string} command * @param {string[]} tokens * @returns {boolean} */ function commandHasTokens(command, tokens) { const lower = command.toLowerCase(); return tokens.every((token) => lower.includes(token.toLowerCase())); } /** * @param {string} rawArgs * @returns {string} */ function extractCommand(rawArgs) { if (!rawArgs) return ""; try { const parsed = JSON.parse(rawArgs); if (parsed && typeof parsed.command === "string") { return parsed.command; } } catch { // Fall through: args may be truncated JSON in run-log. } return rawArgs; } /** * @param {string} text * @param {string} pattern * @returns {boolean} */ function textMatchesPattern(text, pattern) { try { return new RegExp(pattern, "i").test(text); } catch { return false; } } /** * @param {string} runLogPath */ function parseRunLog(runLogPath) { const lines = splitLines(readFileSync(runLogPath, "utf-8")); const events = []; for (const line of lines) { try { events.push(JSON.parse(line)); } catch { // Ignore malformed lines but keep analysis alive. } } return events; } /** * @param {string} sessionPath * @returns {string} */ function parseFinalAssistantText(sessionPath) { if (!existsSync(sessionPath)) return ""; const lines = splitLines(readFileSync(sessionPath, "utf-8")); let latest = ""; for (const line of lines) { try { const entry = JSON.parse(line); if (entry?.type !== "message") continue; const msg = entry.message; if (!msg || msg.role !== "assistant") continue; if (typeof msg.content === "string") { latest = msg.content; continue; } if (Array.isArray(msg.content)) { const text = msg.content .filter((part) => part && part.type === "text" && typeof part.text === "string") .map((part) => part.text) .join("\n") .trim(); if (text) latest = text; } } catch { // Ignore malformed lines. } } return latest; } /** * @param {CaseAnalysis} analysis * @param {string} id * @param {string} check * @param {boolean} passed * @param {string} [detail] */ function addCheck(analysis, id, check, passed, detail) { analysis.checks.push({ id, check, passed, detail }); } const rows = splitLines(readFileSync(manifestPath, "utf-8")); if (rows.length <= 1) { console.error(`Manifest has no data rows: ${manifestPath}`); process.exit(1); } /** @type {CaseAnalysis[]} */ const analyses = []; for (let i = 1; i < rows.length; i++) { const row = rows[i]; if (!row) continue; const cols = row.split("\t"); if (cols.length < 11) continue; const provider = cols[1] ?? ""; const caseId = cols[2] ?? ""; const rules = CASE_RULES[caseId]; const status = cols[3] ?? ""; const sessionId = cols[4] ?? ""; const sessionDir = cols[5] ?? ""; const logFile = cols[6] ?? ""; /** @type {CaseAnalysis} */ const analysis = { provider, caseId, status, sessionId, sessionDir, logFile, checks: [], pass: false, }; addCheck( analysis, "run-status", "runner status is success", status === "success", `status=${status}`, ); if (!sessionDir) { addCheck(analysis, "session-dir", "session_dir exists in manifest", false, "missing session_dir"); analyses.push(analysis); continue; } const runLogPath = join(sessionDir, "run-log.jsonl"); addCheck( analysis, "run-log-file", "run-log.jsonl exists", existsSync(runLogPath), runLogPath, ); if (!existsSync(runLogPath)) { analyses.push(analysis); continue; } const events = parseRunLog(runLogPath); const sessionPath = join(sessionDir, "session.jsonl"); const finalAssistantText = parseFinalAssistantText(sessionPath); const runStarts = events.filter((e) => e.event === "run_start"); const runEnds = events.filter((e) => e.event === "run_end"); const toolStarts = events.filter((e) => e.event === "tool_start"); const toolEnds = events.filter((e) => e.event === "tool_end"); const errorToolEnds = toolEnds.filter((e) => e.is_error === true); addCheck(analysis, "event-run-start", "has run_start", runStarts.length > 0, `count=${runStarts.length}`); addCheck(analysis, "event-run-end", "has run_end", runEnds.length > 0, `count=${runEnds.length}`); addCheck( analysis, "tool-pairing", "tool_start count matches tool_end count", toolStarts.length === toolEnds.length, `start=${toolStarts.length} end=${toolEnds.length}`, ); const finalRunEnd = runEnds.at(-1); const runEndError = finalRunEnd?.error; const finalRunText = typeof finalRunEnd?.text === "string" ? finalRunEnd.text : ""; const finalResponseText = finalAssistantText || finalRunText; addCheck( analysis, "run-end-error", "final run_end.error is null/empty", runEndError === null || runEndError === undefined || runEndError === "", `error=${String(runEndError)}`, ); addCheck( analysis, "tool-errors", "no tool_end has is_error=true", errorToolEnds.length === 0, `error_tool_calls=${errorToolEnds.length}`, ); const execCommands = toolStarts .filter((e) => e.tool === "exec") .map((e) => extractCommand(typeof e.args === "string" ? e.args : "")) .filter(Boolean); const requireExecUsage = rules?.requireExecUsage !== false; addCheck( analysis, "exec-usage", requireExecUsage ? "at least one exec command was used" : "exec usage is optional for this case", requireExecUsage ? execCommands.length > 0 : true, requireExecUsage ? `exec_calls=${execCommands.length}` : `exec_calls=${execCommands.length} (optional)`, ); if (rules) { if (Array.isArray(rules.requiredCommandTokens)) { for (let r = 0; r < rules.requiredCommandTokens.length; r++) { const tokenList = rules.requiredCommandTokens[r]; const passed = execCommands.some((cmd) => commandHasTokens(cmd, tokenList)); addCheck( analysis, `cmd-${r + 1}`, `exec command contains tokens: ${tokenList.join(" + ")}`, passed, ); } } if (Array.isArray(rules.requiredEventTokens)) { const eventLines = events.map((event) => JSON.stringify(event).toLowerCase()); for (let r = 0; r < rules.requiredEventTokens.length; r++) { const tokenList = rules.requiredEventTokens[r]; const passed = eventLines.some((line) => tokenList.every((token) => line.includes(token.toLowerCase())), ); addCheck( analysis, `event-${r + 1}`, `event log contains tokens: ${tokenList.join(" + ")}`, passed, ); } } if (Array.isArray(rules.forbiddenCommandTokens)) { for (let r = 0; r < rules.forbiddenCommandTokens.length; r++) { const tokenList = rules.forbiddenCommandTokens[r]; const passed = !execCommands.some((cmd) => commandHasTokens(cmd, tokenList)); addCheck( analysis, `forbid-cmd-${r + 1}`, `exec command does not contain tokens: ${tokenList.join(" + ")}`, passed, ); } } if (Array.isArray(rules.requiredResponseRegex)) { for (let r = 0; r < rules.requiredResponseRegex.length; r++) { const pattern = rules.requiredResponseRegex[r]; const passed = textMatchesPattern(finalResponseText, pattern); addCheck( analysis, `resp-${r + 1}`, `final response matches regex: /${pattern}/i`, passed, ); } } } else { addCheck( analysis, "case-rules", "case has rule set", false, `No rules defined for case_id=${caseId}`, ); } analysis.pass = analysis.checks.every((c) => c.passed); analyses.push(analysis); } const passedCases = analyses.filter((a) => a.pass).length; const failedCases = analyses.length - passedCases; const output = { manifestPath, totalCases: analyses.length, passedCases, failedCases, results: analyses, }; const outputPath = join(dirname(manifestPath), "analysis.json"); writeFileSync(outputPath, JSON.stringify(output, null, 2) + "\n", "utf-8"); for (const item of analyses) { const status = item.pass ? "PASS" : "FAIL"; console.log(`[${status}] provider=${item.provider} case=${item.caseId} session=${item.sessionId || "N/A"}`); for (const check of item.checks) { const marker = check.passed ? " [ok] " : " [bad] "; const detail = check.detail ? ` (${check.detail})` : ""; console.log(`${marker}${check.check}${detail}`); } } console.log(""); console.log(`Analysis file: ${outputPath}`); console.log(`Summary: pass=${passedCases} fail=${failedCases}`); if (failedCases > 0) { process.exit(1); }