test(e2e): add natural Notion gap-discovery benchmark case
This commit is contained in:
parent
4b7f0afb50
commit
8a2b3e10f3
4 changed files with 51 additions and 12 deletions
|
|
@ -66,6 +66,7 @@ const CASE_RULES = {
|
|||
"case-04-gap-discovery-spotify-ux": {
|
||||
requireExecUsage: false,
|
||||
requiredResponseRegex: [
|
||||
"缺少|没有.*(技能|能力|集成)|capability gap",
|
||||
"clawhub|cloud\\s*hub|cloudhub",
|
||||
"安装|install",
|
||||
"是否|要不要|would you like|do you want",
|
||||
|
|
@ -74,6 +75,34 @@ const CASE_RULES = {
|
|||
forbiddenCommandTokens: [
|
||||
["clawhub", "install"],
|
||||
["clawhub", "update"],
|
||||
["osascript"],
|
||||
["spogo"],
|
||||
["spotify_player"],
|
||||
["ha.sh"],
|
||||
["/api/states"],
|
||||
],
|
||||
},
|
||||
"case-05-gap-discovery-notion-ux": {
|
||||
requireExecUsage: false,
|
||||
requiredCommandTokens: [
|
||||
["clawhub", "search"],
|
||||
["notion"],
|
||||
],
|
||||
requiredEventTokens: [
|
||||
["install_guard", "blocked"],
|
||||
],
|
||||
requiredResponseRegex: [
|
||||
"notion",
|
||||
"安装|install",
|
||||
"是否|要不要|would you like|do you want|同意",
|
||||
"token|授权|integration",
|
||||
],
|
||||
forbiddenCommandTokens: [
|
||||
["osascript"],
|
||||
["spogo"],
|
||||
["spotify_player"],
|
||||
["ha.sh"],
|
||||
["/api/states"],
|
||||
],
|
||||
},
|
||||
};
|
||||
|
|
@ -324,6 +353,22 @@ for (let i = 1; i < rows.length; i++) {
|
|||
}
|
||||
}
|
||||
|
||||
// Optional rule: runs only when the case defines requiredEventTokens as an array.
// Each entry is a list of tokens; one check is recorded per entry.
if (Array.isArray(rules.requiredEventTokens)) {
|
||||
// Serialize every event to lowercase JSON once so the substring matching below is case-insensitive.
const eventLines = events.map((event) => JSON.stringify(event).toLowerCase());
|
||||
for (let r = 0; r < rules.requiredEventTokens.length; r++) {
|
||||
const tokenList = rules.requiredEventTokens[r];
|
||||
// Passes only when a single serialized event contains ALL tokens of this list.
// This is a plain substring match on the JSON text, so it is not aware of field names.
const passed = eventLines.some((line) =>
|
||||
tokenList.every((token) => line.includes(token.toLowerCase())),
|
||||
);
|
||||
// Record the outcome under a 1-based id (event-1, event-2, ...) with a readable description.
addCheck(
|
||||
analysis,
|
||||
`event-${r + 1}`,
|
||||
`event log contains tokens: ${tokenList.join(" + ")}`,
|
||||
passed,
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
if (Array.isArray(rules.forbiddenCommandTokens)) {
|
||||
for (let r = 0; r < rules.forbiddenCommandTokens.length; r++) {
|
||||
const tokenList = rules.forbiddenCommandTokens[r];
|
||||
|
|
|
|||
|
|
@ -1,10 +1 @@
|
|||
请模拟真实用户的首轮请求:
|
||||
|
||||
用户说:「随机播放 spotify 中的音乐」。
|
||||
|
||||
要求:
|
||||
1. 先明确你当前缺少直接控制 Spotify 的能力(能力缺口识别)。
|
||||
2. 不要直接拒绝结束,要给出恢复路径:引导用户可通过 ClawHub 搜索并安装相关 skill。
|
||||
3. 明确说明会先做安全审查,再安装。
|
||||
4. 在同一轮里必须询问用户是否要继续安装(显式确认),在用户确认前不要执行 `clawhub install` 或 `clawhub update`。
|
||||
5. 输出用中文,且要包含关键词:ClawHub、安全审查、安装、是否要继续。
|
||||
随机播放 spotify 中的音乐
|
||||
|
|
|
|||
|
|
@ -0,0 +1 @@
|
|||
帮我在 Notion 新建一个页面,标题是今天待办,并写入三条任务:修复登录 bug、写周报、安排评审
|
||||
Loading…
Add table
Add a link
Reference in a new issue