/**
 * Test `text` against a regular expression supplied as a pattern source
 * string. Matching is case-insensitive.
 *
 * Never throws: a pattern that fails to compile, or a `text` that is not a
 * string, yields `false` so that the corresponding check is reported as failed
 * instead of aborting the analysis.
 *
 * @param {string} text
 * @param {string} pattern
 * @returns {boolean}
 */
function textMatchesPattern(text, pattern) {
  // RegExp#test coerces its argument to a string, so without this guard
  // `undefined` would "match" the pattern "undefined".
  if (typeof text !== "string") return false;

  let regex;
  try {
    regex = new RegExp(pattern, "i");
  } catch {
    // Invalid pattern source: treat as "no match".
    return false;
  }
  return regex.test(text);
}

/**
 * Flatten the `content` of a session message into plain text.
 *
 * Handles the two shapes read by `parseFinalAssistantText`: a plain string, or
 * an array of parts of which only `{ type: "text", text: string }` entries
 * contribute (joined with newlines). The result is trimmed; any other shape
 * yields an empty string.
 *
 * @param {unknown} content
 * @returns {string}
 */
function assistantContentToText(content) {
  if (typeof content === "string") return content.trim();
  if (!Array.isArray(content)) return "";

  return content
    .filter((part) => part?.type === "text" && typeof part.text === "string")
    .map((part) => part.text)
    .join("\n")
    .trim();
}

/**
 * Return the text of the last assistant message in a session JSONL file that
 * actually carries text.
 *
 * Each line is parsed as JSON; only entries with `type === "message"` whose
 * `message.role` is `"assistant"` are considered. Messages whose text is empty
 * or whitespace-only (in either the string or the array content shape) do not
 * replace text captured from an earlier message, so the caller's
 * `finalAssistantText || finalRunText` fallback is not defeated by a blank
 * trailing message.
 *
 * @param {string} sessionPath
 * @returns {string} Trimmed text, or `""` when the file is missing or contains
 *   no assistant text.
 */
function parseFinalAssistantText(sessionPath) {
  if (!existsSync(sessionPath)) return "";

  let latest = "";

  for (const line of splitLines(readFileSync(sessionPath, "utf-8"))) {
    let entry;
    try {
      entry = JSON.parse(line);
    } catch {
      // Ignore malformed lines.
      continue;
    }

    if (entry?.type !== "message") continue;
    const msg = entry.message;
    if (msg?.role !== "assistant") continue;

    const text = assistantContentToText(msg.content);
    if (text) latest = text;
  }

  return latest;
}
finalRunEnd.text : ""; + const finalResponseText = finalAssistantText || finalRunText; addCheck( analysis, "run-end-error", @@ -230,25 +299,55 @@ for (let i = 1; i < rows.length; i++) { .map((e) => extractCommand(typeof e.args === "string" ? e.args : "")) .filter(Boolean); + const requireExecUsage = rules?.requireExecUsage !== false; addCheck( analysis, "exec-usage", - "at least one exec command was used", - execCommands.length > 0, - `exec_calls=${execCommands.length}`, + requireExecUsage + ? "at least one exec command was used" + : "exec usage is optional for this case", + requireExecUsage ? execCommands.length > 0 : true, + requireExecUsage ? `exec_calls=${execCommands.length}` : `exec_calls=${execCommands.length} (optional)`, ); - const rules = CASE_RULES[caseId]; if (rules) { - for (let r = 0; r < rules.requiredCommandTokens.length; r++) { - const tokenList = rules.requiredCommandTokens[r]; - const passed = execCommands.some((cmd) => commandHasTokens(cmd, tokenList)); - addCheck( - analysis, - `cmd-${r + 1}`, - `exec command contains tokens: ${tokenList.join(" + ")}`, - passed, - ); + if (Array.isArray(rules.requiredCommandTokens)) { + for (let r = 0; r < rules.requiredCommandTokens.length; r++) { + const tokenList = rules.requiredCommandTokens[r]; + const passed = execCommands.some((cmd) => commandHasTokens(cmd, tokenList)); + addCheck( + analysis, + `cmd-${r + 1}`, + `exec command contains tokens: ${tokenList.join(" + ")}`, + passed, + ); + } + } + + if (Array.isArray(rules.forbiddenCommandTokens)) { + for (let r = 0; r < rules.forbiddenCommandTokens.length; r++) { + const tokenList = rules.forbiddenCommandTokens[r]; + const passed = !execCommands.some((cmd) => commandHasTokens(cmd, tokenList)); + addCheck( + analysis, + `forbid-cmd-${r + 1}`, + `exec command does not contain tokens: ${tokenList.join(" + ")}`, + passed, + ); + } + } + + if (Array.isArray(rules.requiredResponseRegex)) { + for (let r = 0; r < rules.requiredResponseRegex.length; r++) { + const 
pattern = rules.requiredResponseRegex[r]; + const passed = textMatchesPattern(finalResponseText, pattern); + addCheck( + analysis, + `resp-${r + 1}`, + `final response matches regex: /${pattern}/i`, + passed, + ); + } } } else { addCheck( diff --git a/scripts/e2e-skills-benchmark/cases/case-04-gap-discovery-spotify-ux.txt b/scripts/e2e-skills-benchmark/cases/case-04-gap-discovery-spotify-ux.txt new file mode 100644 index 00000000..5c8a7c16 --- /dev/null +++ b/scripts/e2e-skills-benchmark/cases/case-04-gap-discovery-spotify-ux.txt @@ -0,0 +1,10 @@ +请模拟真实用户的首轮请求: + +用户说:「随机播放 spotify 中的音乐」。 + +要求: +1. 先明确你当前缺少直接控制 Spotify 的能力(能力缺口识别)。 +2. 不要直接拒绝结束,要给出恢复路径:引导用户可通过 ClawHub 搜索并安装相关 skill。 +3. 明确说明会先做安全审查,再安装。 +4. 在同一轮里必须询问用户是否要继续安装(显式确认),在用户确认前不要执行 `clawhub install` 或 `clawhub update`。 +5. 输出用中文,且要包含关键词:ClawHub、安全审查、安装、是否要继续。