diff --git a/docs/e2e-skills-benchmark.md b/docs/e2e-skills-benchmark.md index 674a3b4b..c82ca61b 100644 --- a/docs/e2e-skills-benchmark.md +++ b/docs/e2e-skills-benchmark.md @@ -7,7 +7,7 @@ This benchmark validates the meta skill workflow for capability-gap discovery, C - Domain: skill discovery + installation + update - Focus: `skills/meta-skill-installer` - Providers: default `kimi-coding` (override with `PROVIDERS`) -- Cases: 4 +- Cases: 5 Case prompts are stored in: - `scripts/e2e-skills-benchmark/cases/` @@ -20,6 +20,7 @@ The case set references real public pages from ClawHub: - [Home Assistant](https://clawhub.ai/skills/homeassistant) - [CodexMonitor](https://clawhub.ai/odrobnik/codexmonitor) - [Spotify (gap-discovery UX flow)](https://clawhub.ai/search?q=spotify) +- [Notion (gap-discovery UX flow)](https://clawhub.ai/search?q=notion) ## Prerequisites @@ -87,7 +88,8 @@ For each run: - `clawhub install` - `review-skill-security.mjs` - for case 03 also `clawhub update` - - for case 04, final response must include ClawHub + install confirmation language, and must not run `clawhub install/update` before confirmation + - for case 04, prompt is a natural user request only; agent must self-discover capability gap, propose ClawHub + security review + install confirmation, and must not run workaround commands (`osascript`, `ha.sh`, `spogo`, `spotify_player`) before user confirmation + - for case 05, prompt is a natural Notion request; agent must discover missing capability, search skill candidates, trigger `install_guard` (blocked until confirmation), and ask for explicit install consent plus token/auth prerequisites ## Notes diff --git a/scripts/e2e-skills-benchmark/analyze.mjs b/scripts/e2e-skills-benchmark/analyze.mjs index 0eaee0ed..3f621328 100755 --- a/scripts/e2e-skills-benchmark/analyze.mjs +++ b/scripts/e2e-skills-benchmark/analyze.mjs @@ -66,6 +66,7 @@ const CASE_RULES = { "case-04-gap-discovery-spotify-ux": { requireExecUsage: false, requiredResponseRegex: [ + "缺少|没有.*(技能|能力|集成)|capability gap", "clawhub|cloud\\s*hub|cloudhub", "安装|install", "是否|要不要|would you like|do you want", @@ -74,6 +75,34 @@ const CASE_RULES = { forbiddenCommandTokens: [ ["clawhub", "install"], ["clawhub", "update"], + ["osascript"], + ["spogo"], + ["spotify_player"], + ["ha.sh"], + ["/api/states"], + ], + }, + "case-05-gap-discovery-notion-ux": { + requireExecUsage: false, + requiredCommandTokens: [ + ["clawhub", "search"], + ["notion"], + ], + requiredEventTokens: [ + ["install_guard", "blocked"], + ], + requiredResponseRegex: [ + "notion", + "安装|install", + "是否|要不要|would you like|do you want|同意", + "token|授权|integration", + ], + forbiddenCommandTokens: [ + ["osascript"], + ["spogo"], + ["spotify_player"], + ["ha.sh"], + ["/api/states"], ], }, }; @@ -324,6 +353,22 @@ for (let i = 1; i < rows.length; i++) { } } + if (Array.isArray(rules.requiredEventTokens)) { + const eventLines = events.map((event) => JSON.stringify(event).toLowerCase()); + for (let r = 0; r < rules.requiredEventTokens.length; r++) { + const tokenList = rules.requiredEventTokens[r]; + const passed = eventLines.some((line) => + tokenList.every((token) => line.includes(token.toLowerCase())), + ); + addCheck( + analysis, + `event-${r + 1}`, + `event log contains tokens: ${tokenList.join(" + ")}`, + passed, + ); + } + } + if (Array.isArray(rules.forbiddenCommandTokens)) { for (let r = 0; r < rules.forbiddenCommandTokens.length; r++) { const tokenList = rules.forbiddenCommandTokens[r]; diff --git a/scripts/e2e-skills-benchmark/cases/case-04-gap-discovery-spotify-ux.txt b/scripts/e2e-skills-benchmark/cases/case-04-gap-discovery-spotify-ux.txt index 5c8a7c16..bf2b917a 100644 --- a/scripts/e2e-skills-benchmark/cases/case-04-gap-discovery-spotify-ux.txt +++ b/scripts/e2e-skills-benchmark/cases/case-04-gap-discovery-spotify-ux.txt @@ -1,10 +1 @@ -请模拟真实用户的首轮请求: - -用户说:「随机播放 spotify 中的音乐」。 - -要求: -1. 先明确你当前缺少直接控制 Spotify 的能力(能力缺口识别)。 -2. 不要直接拒绝结束,要给出恢复路径:引导用户可通过 ClawHub 搜索并安装相关 skill。 -3. 明确说明会先做安全审查,再安装。 -4. 在同一轮里必须询问用户是否要继续安装(显式确认),在用户确认前不要执行 `clawhub install` 或 `clawhub update`。 -5. 输出用中文,且要包含关键词:ClawHub、安全审查、安装、是否要继续。 +随机播放 spotify 中的音乐 diff --git a/scripts/e2e-skills-benchmark/cases/case-05-gap-discovery-notion-ux.txt b/scripts/e2e-skills-benchmark/cases/case-05-gap-discovery-notion-ux.txt new file mode 100644 index 00000000..aff66735 --- /dev/null +++ b/scripts/e2e-skills-benchmark/cases/case-05-gap-discovery-notion-ux.txt @@ -0,0 +1 @@ +帮我在 Notion 新建一个页面,标题是今天待办,并写入三条任务:修复登录 bug、写周报、安排评审