#!/usr/bin/env node

import { existsSync, readFileSync, writeFileSync } from "node:fs";
import { dirname, join, resolve } from "node:path";

/**
 * @typedef {{
 *   id: string;
 *   check: string;
 *   passed: boolean;
 *   detail?: string;
 * }} CheckResult
 */

/**
 * @typedef {{
 *   provider: string;
 *   caseId: string;
 *   status: string;
 *   sessionId: string;
 *   sessionDir: string;
 *   logFile: string;
 *   checks: CheckResult[];
 *   pass: boolean;
 * }} CaseAnalysis
 */

// Command-line handling: the single positional argument is the manifest TSV.
const usageText = "Usage: node scripts/e2e-skills-benchmark/analyze.mjs <manifest.tsv>";
const manifestArg = process.argv[2];

if (manifestArg === "--help" || manifestArg === "-h") {
  // Explicit help request: usage on stdout, successful exit.
  console.log(usageText);
  process.exit(0);
}

if (!manifestArg) {
  // A missing manifest is a usage error. Exiting non-zero keeps a caller that
  // forgot the argument from treating "nothing analysed" as success.
  console.error(usageText);
  process.exit(1);
}

// Absolute path of the manifest; resolved against the current working directory.
const manifestPath = resolve(manifestArg);
if (!existsSync(manifestPath)) {
  console.error(`Manifest not found: ${manifestPath}`);
  process.exit(1);
}

/**
 * Per-case expectations, keyed by the case id found in the manifest.
 *
 * Fields (all optional):
 * - `requireExecUsage`: set to `false` to make "at least one exec command"
 *   optional for the case; any other value keeps it mandatory.
 * - `requiredCommandTokens`: for each inner list, at least one exec command
 *   must contain every token (case-insensitive substring match).
 * - `forbiddenCommandTokens`: for each inner list, no single exec command may
 *   contain every token.
 * - `requiredEventTokens`: for each inner list, the JSON text of at least one
 *   run-log event must contain every token.
 * - `requiredResponseRegex`: each pattern is compiled case-insensitively and
 *   must match the final response text.
 *
 * The table is frozen so it cannot be reassigned or extended at runtime.
 */
const CASE_RULES = Object.freeze({
  "case-01-install-caldav-calendar": {
    requiredCommandTokens: [
      ["clawhub", "search"],
      ["caldav"],
      ["clawhub", "install"],
      ["review-skill-security.mjs"],
    ],
  },
  "case-02-gap-discovery-homeassistant": {
    requiredCommandTokens: [
      ["clawhub", "search"],
      ["home", "assistant"],
      ["clawhub", "install"],
      ["review-skill-security.mjs"],
    ],
  },
  "case-03-install-update-codexmonitor": {
    requiredCommandTokens: [
      ["clawhub", "search"],
      ["codexmonitor"],
      ["clawhub", "install"],
      ["clawhub", "update"],
      ["review-skill-security.mjs"],
    ],
  },
  "case-04-gap-discovery-spotify-ux": {
    requireExecUsage: false,
    requiredResponseRegex: [
      "缺少|没有.*(技能|能力|集成)|capability gap",
      "clawhub|cloud\\s*hub|cloudhub",
      "安装|install",
      "是否|要不要|would you like|do you want",
      "安全|审查|security|review",
    ],
    forbiddenCommandTokens: [
      ["clawhub", "install"],
      ["clawhub", "update"],
      ["osascript"],
      ["spogo"],
      ["spotify_player"],
      ["ha.sh"],
      ["/api/states"],
    ],
  },
  "case-05-gap-discovery-notion-ux": {
    requireExecUsage: false,
    requiredCommandTokens: [
      ["clawhub", "search"],
      ["notion"],
    ],
    requiredEventTokens: [
      ["install_guard", "blocked"],
    ],
    requiredResponseRegex: [
      "notion",
      "安装|install",
      "是否|要不要|would you like|do you want|同意",
      "token|授权|integration",
    ],
    forbiddenCommandTokens: [
      ["osascript"],
      ["spogo"],
      ["spotify_player"],
      ["ha.sh"],
      ["/api/states"],
    ],
  },
});

/**
 * Split text into its non-empty lines.
 *
 * Both LF and CRLF line endings are accepted. Lines that are the empty string
 * are dropped; whitespace-only lines are kept. A leading UTF-8 byte-order mark
 * is removed first, because `JSON.parse` rejects a line that starts with one
 * and the first JSONL record of such a file would otherwise be lost.
 *
 * @param {string} text
 * @returns {string[]}
 */
function splitLines(text) {
  const withoutBom = text.replace(/^\uFEFF/, "");
  return withoutBom.split(/\r?\n/).filter((line) => line.length > 0);
}

/**
 * Report whether a command contains every given token.
 *
 * Matching is a case-insensitive substring test, so tokens may appear in any
 * order and anywhere in the command. An empty token list matches any command.
 *
 * @param {string} command
 * @param {string[]} tokens
 * @returns {boolean}
 */
function commandHasTokens(command, tokens) {
  const haystack = command.toLowerCase();
  return tokens.every((token) => haystack.includes(token.toLowerCase()));
}

/**
 * Extract the command from the raw argument text of a tool call.
 *
 * When `rawArgs` is valid JSON whose parsed value has a string `command`
 * property, that command is returned. In every other case (invalid or
 * truncated JSON, or no string `command`) the raw text is returned unchanged
 * so token matching can still run against it.
 *
 * @param {string} rawArgs
 * @returns {string} The command, the raw text, or "" for empty input.
 */
function extractCommand(rawArgs) {
  if (!rawArgs) return "";

  let parsed;
  try {
    parsed = JSON.parse(rawArgs);
  } catch {
    // Args may be truncated JSON in the run log; fall back to the raw text.
    return rawArgs;
  }

  return typeof parsed?.command === "string" ? parsed.command : rawArgs;
}

/**
 * Test text against a regular-expression source string, ignoring case.
 *
 * A pattern that is not a valid regular expression counts as "no match"
 * rather than raising.
 *
 * @param {string} text
 * @param {string} pattern Regular-expression source (no delimiters or flags).
 * @returns {boolean}
 */
function textMatchesPattern(text, pattern) {
  let matcher;
  try {
    matcher = new RegExp(pattern, "i");
  } catch {
    // Invalid pattern source: report it as a failed match.
    return false;
  }
  return matcher.test(text);
}

/**
 * Read a JSON-lines run log and return its events.
 *
 * Each non-empty line is parsed as JSON. Lines that fail to parse are skipped
 * so one bad line cannot abort the analysis. Lines that parse to something
 * other than a plain object (`null`, numbers, strings, arrays) are skipped as
 * well: callers read properties such as `event` from every entry, and a
 * `null` entry would make that access throw.
 *
 * @param {string} runLogPath
 * @returns {Array<Record<string, unknown>>} Parsed event objects in file order.
 */
function parseRunLog(runLogPath) {
  const events = [];

  for (const line of splitLines(readFileSync(runLogPath, "utf-8"))) {
    let parsed;
    try {
      parsed = JSON.parse(line);
    } catch {
      // Ignore malformed lines but keep analysis alive.
      continue;
    }

    const isPlainObject = parsed !== null && typeof parsed === "object" && !Array.isArray(parsed);
    if (isPlainObject) events.push(parsed);
  }

  return events;
}

/**
 * Return the text of the last assistant message in a JSON-lines session file
 * that actually carries text.
 *
 * Only entries with `type === "message"` and `message.role === "assistant"`
 * are considered. `message.content` may be a string, or an array of parts in
 * which the `text` of every `{ type: "text" }` part is joined with newlines.
 * The text is trimmed in both forms, and a message with no text left after
 * trimming does not replace text found earlier. Malformed lines are ignored.
 *
 * @param {string} sessionPath
 * @returns {string} The text, or "" when the file is missing or has none.
 */
function parseFinalAssistantText(sessionPath) {
  if (!existsSync(sessionPath)) return "";

  let latest = "";

  for (const line of splitLines(readFileSync(sessionPath, "utf-8"))) {
    let entry;
    try {
      entry = JSON.parse(line);
    } catch {
      // Ignore malformed lines.
      continue;
    }

    if (entry?.type !== "message") continue;
    const msg = entry.message;
    if (!msg || msg.role !== "assistant") continue;

    let text = "";
    if (typeof msg.content === "string") {
      text = msg.content.trim();
    } else if (Array.isArray(msg.content)) {
      text = msg.content
        .filter((part) => part && part.type === "text" && typeof part.text === "string")
        .map((part) => part.text)
        .join("\n")
        .trim();
    }

    // Treat both content forms alike: an empty message never erases earlier text.
    if (text) latest = text;
  }

  return latest;
}

/**
 * Append one check result to an analysis (mutates `analysis.checks`).
 *
 * @param {CaseAnalysis} analysis Analysis record that receives the check.
 * @param {string} id Short identifier of the check.
 * @param {string} check Description of what was checked.
 * @param {boolean} passed Whether the check succeeded.
 * @param {string} [detail] Optional extra context for the report.
 */
function addCheck(analysis, id, check, passed, detail) {
  /** @type {CheckResult} */
  const result = { id, check, passed, detail };
  analysis.checks.push(result);
}

// Non-empty manifest lines. rows[0] is treated as the TSV header, so at least
// two lines are needed for there to be any case to analyse.
const rows = splitLines(readFileSync(manifestPath, "utf-8"));
if (rows.length <= 1) {
  console.error(`Manifest has no data rows: ${manifestPath}`);
  process.exit(1);
}

// One entry per analysed manifest row, in manifest order.
/** @type {CaseAnalysis[]} */
const analyses = [];

// Analyse every data row of the manifest (rows[0] is the header). Each row
// yields one CaseAnalysis whose `pass` is true only when every check passed.
for (const row of rows.slice(1)) {
  const cols = row.split("\t");
  if (cols.length < 11) {
    // Report dropped rows so a damaged manifest cannot shrink the run unnoticed.
    console.warn(`Skipping manifest row with ${cols.length} columns (expected at least 11): ${row}`);
    continue;
  }

  // Column 0 and columns 7+ are not used by the analysis.
  const [, provider, caseId, status, sessionId, sessionDir, logFile] = cols;

  // Own-property lookup: a case id such as "constructor" must not resolve to
  // a member inherited from Object.prototype.
  const rules = Object.hasOwn(CASE_RULES, caseId) ? CASE_RULES[caseId] : undefined;

  /** @type {CaseAnalysis} */
  const analysis = {
    provider,
    caseId,
    status,
    sessionId,
    sessionDir,
    logFile,
    checks: [],
    pass: false,
  };

  addCheck(analysis, "run-status", "runner status is success", status === "success", `status=${status}`);

  // Without a session directory nothing else can be inspected; the analysis
  // is recorded with its default `pass: false`.
  if (!sessionDir) {
    addCheck(analysis, "session-dir", "session_dir exists in manifest", false, "missing session_dir");
    analyses.push(analysis);
    continue;
  }

  const runLogPath = join(sessionDir, "run-log.jsonl");
  const hasRunLog = existsSync(runLogPath);
  addCheck(analysis, "run-log-file", "run-log.jsonl exists", hasRunLog, runLogPath);
  if (!hasRunLog) {
    analyses.push(analysis);
    continue;
  }

  const events = parseRunLog(runLogPath);
  const finalAssistantText = parseFinalAssistantText(join(sessionDir, "session.jsonl"));

  // `?.` keeps a stray null entry in the event list from aborting the run.
  const eventsNamed = (name) => events.filter((e) => e?.event === name);
  const runStarts = eventsNamed("run_start");
  const runEnds = eventsNamed("run_end");
  const toolStarts = eventsNamed("tool_start");
  const toolEnds = eventsNamed("tool_end");
  const errorToolEnds = toolEnds.filter((e) => e.is_error === true);

  addCheck(analysis, "event-run-start", "has run_start", runStarts.length > 0, `count=${runStarts.length}`);
  addCheck(analysis, "event-run-end", "has run_end", runEnds.length > 0, `count=${runEnds.length}`);
  addCheck(
    analysis,
    "tool-pairing",
    "tool_start count matches tool_end count",
    toolStarts.length === toolEnds.length,
    `start=${toolStarts.length} end=${toolEnds.length}`,
  );

  // The last run_end carries the run's error and a fallback response text.
  const finalRunEnd = runEnds.at(-1);
  const runEndError = finalRunEnd?.error;
  const finalRunText = typeof finalRunEnd?.text === "string" ? finalRunEnd.text : "";
  // Prefer the session transcript; use run_end.text only when it has no text.
  const finalResponseText = finalAssistantText || finalRunText;

  addCheck(
    analysis,
    "run-end-error",
    "final run_end.error is null/empty",
    runEndError === null || runEndError === undefined || runEndError === "",
    `error=${String(runEndError)}`,
  );

  addCheck(
    analysis,
    "tool-errors",
    "no tool_end has is_error=true",
    errorToolEnds.length === 0,
    `error_tool_calls=${errorToolEnds.length}`,
  );

  // Commands of all `exec` tool calls; non-string args and empty commands are dropped.
  const execCommands = toolStarts
    .filter((e) => e.tool === "exec")
    .map((e) => extractCommand(typeof e.args === "string" ? e.args : ""))
    .filter(Boolean);

  // Exec usage is mandatory unless the case's rules set requireExecUsage to false.
  const requireExecUsage = rules?.requireExecUsage !== false;
  const execCallsDetail = `exec_calls=${execCommands.length}`;
  addCheck(
    analysis,
    "exec-usage",
    requireExecUsage ? "at least one exec command was used" : "exec usage is optional for this case",
    !requireExecUsage || execCommands.length > 0,
    requireExecUsage ? execCallsDetail : `${execCallsDetail} (optional)`,
  );

  if (rules) {
    if (Array.isArray(rules.requiredCommandTokens)) {
      rules.requiredCommandTokens.forEach((tokenList, index) => {
        addCheck(
          analysis,
          `cmd-${index + 1}`,
          `exec command contains tokens: ${tokenList.join(" + ")}`,
          execCommands.some((cmd) => commandHasTokens(cmd, tokenList)),
        );
      });
    }

    if (Array.isArray(rules.requiredEventTokens)) {
      // Tokens are matched against the lower-cased JSON text of each event.
      const eventLines = events.map((event) => JSON.stringify(event).toLowerCase());
      rules.requiredEventTokens.forEach((tokenList, index) => {
        addCheck(
          analysis,
          `event-${index + 1}`,
          `event log contains tokens: ${tokenList.join(" + ")}`,
          eventLines.some((line) => tokenList.every((token) => line.includes(token.toLowerCase()))),
        );
      });
    }

    if (Array.isArray(rules.forbiddenCommandTokens)) {
      rules.forbiddenCommandTokens.forEach((tokenList, index) => {
        addCheck(
          analysis,
          `forbid-cmd-${index + 1}`,
          `exec command does not contain tokens: ${tokenList.join(" + ")}`,
          !execCommands.some((cmd) => commandHasTokens(cmd, tokenList)),
        );
      });
    }

    if (Array.isArray(rules.requiredResponseRegex)) {
      rules.requiredResponseRegex.forEach((pattern, index) => {
        addCheck(
          analysis,
          `resp-${index + 1}`,
          `final response matches regex: /${pattern}/i`,
          textMatchesPattern(finalResponseText, pattern),
        );
      });
    }
  } else {
    // An unknown case id is a failure: there is nothing to verify it against.
    addCheck(analysis, "case-rules", "case has rule set", false, `No rules defined for case_id=${caseId}`);
  }

  analysis.pass = analysis.checks.every((c) => c.passed);
  analyses.push(analysis);
}

// Totals across all analysed cases.
const passedCases = analyses.filter((a) => a.pass).length;
const failedCases = analyses.length - passedCases;

const output = {
  manifestPath,
  totalCases: analyses.length,
  passedCases,
  failedCases,
  results: analyses,
};

// The machine-readable report is written next to the manifest.
const outputPath = join(dirname(manifestPath), "analysis.json");
writeFileSync(outputPath, `${JSON.stringify(output, null, 2)}\n`, "utf-8");

// Human-readable report: one header line per case, then one line per check.
for (const item of analyses) {
  const verdict = item.pass ? "PASS" : "FAIL";
  console.log(`[${verdict}] provider=${item.provider} case=${item.caseId} session=${item.sessionId || "N/A"}`);
  for (const check of item.checks) {
    const marker = check.passed ? " [ok] " : " [bad] ";
    const detail = check.detail ? ` (${check.detail})` : "";
    console.log(`${marker}${check.check}${detail}`);
  }
}

console.log("");
console.log(`Analysis file: ${outputPath}`);
console.log(`Summary: pass=${passedCases} fail=${failedCases}`);

if (analyses.length === 0) {
  // Every data row was rejected: an empty result must not look like success.
  console.error("No manifest rows could be analyzed.");
}

if (failedCases > 0 || analyses.length === 0) {
  // Set the exit code instead of calling process.exit() so pending stdout
  // writes are flushed before the process ends.
  process.exitCode = 1;
}