Merge pull request #216 from multica-ai/codex/meta-skill-installer-e2e-skills-benchmark
feat(skills): add ClawHub meta installer and agent-driven E2E benchmark
This commit is contained in:
commit
e28ecb9a91
17 changed files with 1781 additions and 8 deletions
98
docs/e2e-skills-benchmark.md
Normal file
98
docs/e2e-skills-benchmark.md
Normal file
|
|
@ -0,0 +1,98 @@
|
|||
# Skills Agent-Driven E2E Benchmark
|
||||
|
||||
This benchmark validates the meta skill workflow for capability-gap discovery, ClawHub installation, and security-gated rollout.
|
||||
|
||||
## Scope
|
||||
|
||||
- Domain: skill discovery + installation + update
|
||||
- Focus: `skills/meta-skill-installer`
|
||||
- Providers: default `kimi-coding` (override with `PROVIDERS`)
|
||||
- Cases: 5
|
||||
|
||||
Case prompts are stored in:
|
||||
- `scripts/e2e-skills-benchmark/cases/`
|
||||
|
||||
## Real ClawHub Examples Used
|
||||
|
||||
The case set references real public pages from ClawHub:
|
||||
|
||||
- [CalDAV Calendar](https://clawhub.ai/skills/caldav-calendar)
|
||||
- [Home Assistant](https://clawhub.ai/skills/homeassistant)
|
||||
- [CodexMonitor](https://clawhub.ai/odrobnik/codexmonitor)
|
||||
- [Spotify (gap-discovery UX flow)](https://clawhub.ai/search?q=spotify)
|
||||
- [Notion (gap-discovery UX flow)](https://clawhub.ai/search?q=notion)
|
||||
|
||||
## Prerequisites
|
||||
|
||||
1. Credentials configured (`pnpm multica credentials init` if needed)
|
||||
2. Dependencies installed in repo (`pnpm install`)
|
||||
3. `clawhub` CLI available, or allow runtime fallback to `npx -y clawhub`
|
||||
4. Required env:
|
||||
|
||||
```bash
|
||||
export SMC_DATA_DIR=~/.super-multica-e2e
|
||||
export MULTICA_API_URL=https://api-dev.copilothub.ai
|
||||
```
|
||||
|
||||
## Run Benchmark
|
||||
|
||||
```bash
|
||||
scripts/e2e-skills-benchmark/run.sh
|
||||
```
|
||||
|
||||
Defaults:
|
||||
|
||||
- Providers: `kimi-coding`
|
||||
- Case glob: `case-*.txt`
|
||||
- Max parallel workers: `1`
|
||||
- Per-case timeout: `1200s` (`CASE_TIMEOUT_SEC=0` to disable)
|
||||
- Output directory: `.context/skills-e2e-runs/<timestamp>/`
|
||||
|
||||
Generated artifacts:
|
||||
|
||||
- `manifest.tsv`: provider/case/status/session/log metadata
|
||||
- `analysis.txt`: human-readable pass/fail report
|
||||
- `analysis.json`: structured detailed check output
|
||||
|
||||
## Run Subset
|
||||
|
||||
Only one case:
|
||||
|
||||
```bash
|
||||
CASE_GLOB="case-01-*.txt" scripts/e2e-skills-benchmark/run.sh
|
||||
```
|
||||
|
||||
Multiple providers:
|
||||
|
||||
```bash
|
||||
PROVIDERS="kimi-coding claude-code" scripts/e2e-skills-benchmark/run.sh
|
||||
```
|
||||
|
||||
Faster throughput:
|
||||
|
||||
```bash
|
||||
MAX_PARALLEL=2 CASE_TIMEOUT_SEC=1800 scripts/e2e-skills-benchmark/run.sh
|
||||
```
|
||||
|
||||
## Analyzer Checks
|
||||
|
||||
For each run:
|
||||
|
||||
1. `run_start` and `run_end` both present
|
||||
2. `run_end.error` is empty/null
|
||||
3. `tool_start` and `tool_end` are paired
|
||||
4. no `tool_end.is_error=true`
|
||||
5. at least one `exec` tool call exists
|
||||
6. case-specific command evidence in `tool_start.args`:
|
||||
- `clawhub search`
|
||||
- `clawhub install`
|
||||
- `review-skill-security.mjs`
|
||||
- for case 03 also `clawhub update`
|
||||
- for case 04, prompt is a natural user request only; agent must self-discover capability gap, propose ClawHub + security review + install confirmation, and must not run workaround commands (`osascript`, `ha.sh`, `spogo`, `spotify_player`) before user confirmation
|
||||
- for case 05, prompt is a natural Notion request; agent must discover missing capability, search skill candidates, trigger `install_guard` (blocked until confirmation), and ask for explicit install consent plus token/auth prerequisites
|
||||
|
||||
## Notes
|
||||
|
||||
- These are agent-driven tests; prompt intent plus run-log evidence are both evaluated.
|
||||
- `SMC_DATA_DIR=~/.super-multica-e2e` avoids polluting normal user skill/session data.
|
||||
- If a case fails, open `manifest.tsv` and inspect the matching `session_dir/run-log.jsonl`.
|
||||
|
|
@ -30,7 +30,8 @@
|
|||
"typecheck": "turbo typecheck",
|
||||
"test": "vitest run",
|
||||
"test:watch": "vitest",
|
||||
"test:coverage": "vitest run --coverage"
|
||||
"test:coverage": "vitest run --coverage",
|
||||
"e2e:skills": "bash scripts/e2e-skills-benchmark/run.sh"
|
||||
},
|
||||
"keywords": [],
|
||||
"author": "",
|
||||
|
|
|
|||
171
packages/core/src/agent/runner.skill-install-consent.test.ts
Normal file
171
packages/core/src/agent/runner.skill-install-consent.test.ts
Normal file
|
|
@ -0,0 +1,171 @@
|
|||
import { describe, expect, it } from "vitest";
|
||||
import {
|
||||
evaluateCustomSkillAuthoringConsent,
|
||||
evaluateWorkaroundConsent,
|
||||
evaluateSkillInstallConsent,
|
||||
isEnvironmentInstallCommand,
|
||||
isLocalSkillMutationCommand,
|
||||
isMutatingClawhubCommand,
|
||||
isThirdPartyWorkaroundCommand,
|
||||
} from "./runner.js";
|
||||
|
||||
describe("isMutatingClawhubCommand", () => {
|
||||
it("detects clawhub install command", () => {
|
||||
expect(
|
||||
isMutatingClawhubCommand("npx -y clawhub install spotify --workdir /tmp --dir skills"),
|
||||
).toBe(true);
|
||||
});
|
||||
|
||||
it("detects clawhub update command", () => {
|
||||
expect(isMutatingClawhubCommand("clawhub update spotify --force")).toBe(true);
|
||||
});
|
||||
|
||||
it("does not match non-mutating clawhub commands", () => {
|
||||
expect(isMutatingClawhubCommand("clawhub search spotify --limit 10")).toBe(false);
|
||||
expect(isMutatingClawhubCommand("clawhub inspect spotify")).toBe(false);
|
||||
});
|
||||
|
||||
it("detects wrapped bash flow that expands CLAWHUB_CMD and runs install", () => {
|
||||
const command = [
|
||||
"cd /tmp/meta-skill-installer && bash -c '",
|
||||
"if command -v clawhub >/dev/null 2>&1; then",
|
||||
" CLAWHUB_CMD=(clawhub)",
|
||||
"else",
|
||||
" CLAWHUB_CMD=(npx -y clawhub)",
|
||||
"fi",
|
||||
"\"${CLAWHUB_CMD[@]}\" install \"spotify\" --workdir \"$DATA_DIR\" --dir skills --force",
|
||||
"'",
|
||||
].join("\n");
|
||||
expect(isMutatingClawhubCommand(command)).toBe(true);
|
||||
});
|
||||
});
|
||||
|
||||
describe("evaluateSkillInstallConsent", () => {
|
||||
it("does not grant consent for generic capability requests", () => {
|
||||
const result = evaluateSkillInstallConsent("随机播放 spotify 中的音乐", false);
|
||||
expect(result).toEqual({ allowInstall: false, declined: false });
|
||||
});
|
||||
|
||||
it("grants consent for explicit install requests", () => {
|
||||
const result = evaluateSkillInstallConsent("请帮我安装 spotify skill", false);
|
||||
expect(result).toEqual({ allowInstall: true, declined: false });
|
||||
});
|
||||
|
||||
it("grants consent for short affirmative replies when awaiting confirmation", () => {
|
||||
const result = evaluateSkillInstallConsent("继续", true);
|
||||
expect(result).toEqual({ allowInstall: true, declined: false });
|
||||
});
|
||||
|
||||
it("treats standalone Chinese affirmative as consent when awaiting confirmation", () => {
|
||||
const result = evaluateSkillInstallConsent("行", true);
|
||||
expect(result).toEqual({ allowInstall: true, declined: false });
|
||||
});
|
||||
|
||||
it("marks declines explicitly", () => {
|
||||
const result = evaluateSkillInstallConsent("不要安装,先别动", true);
|
||||
expect(result).toEqual({ allowInstall: false, declined: true });
|
||||
});
|
||||
});
|
||||
|
||||
describe("isEnvironmentInstallCommand", () => {
|
||||
it("detects package manager install commands", () => {
|
||||
expect(isEnvironmentInstallCommand("brew install spogo")).toBe(true);
|
||||
expect(isEnvironmentInstallCommand("pnpm add lodash")).toBe(true);
|
||||
expect(isEnvironmentInstallCommand("npm install -g clawhub")).toBe(true);
|
||||
expect(isEnvironmentInstallCommand("pip install requests")).toBe(true);
|
||||
});
|
||||
|
||||
it("does not match read-only package manager commands", () => {
|
||||
expect(isEnvironmentInstallCommand("brew list")).toBe(false);
|
||||
expect(isEnvironmentInstallCommand("pnpm list --depth 0")).toBe(false);
|
||||
expect(isEnvironmentInstallCommand("npm view clawhub")).toBe(false);
|
||||
});
|
||||
});
|
||||
|
||||
describe("isThirdPartyWorkaroundCommand", () => {
|
||||
it("detects local workaround commands", () => {
|
||||
expect(isThirdPartyWorkaroundCommand("spotify_player playback shuffle")).toBe(true);
|
||||
expect(isThirdPartyWorkaroundCommand("spogo status")).toBe(true);
|
||||
expect(isThirdPartyWorkaroundCommand("osascript -e 'tell app \"Spotify\" to play'")).toBe(true);
|
||||
expect(isThirdPartyWorkaroundCommand("curl http://localhost:8123/api/states")).toBe(true);
|
||||
});
|
||||
|
||||
it("does not match unrelated commands", () => {
|
||||
expect(isThirdPartyWorkaroundCommand("ls -la")).toBe(false);
|
||||
expect(isThirdPartyWorkaroundCommand("pnpm test")).toBe(false);
|
||||
});
|
||||
});
|
||||
|
||||
describe("evaluateWorkaroundConsent", () => {
|
||||
it("does not grant workaround mode for generic capability requests", () => {
|
||||
const result = evaluateWorkaroundConsent("随机播放 spotify 中的音乐", false);
|
||||
expect(result).toEqual({ allowWorkaround: false, declined: false });
|
||||
});
|
||||
|
||||
it("grants workaround mode for explicit local-command intent", () => {
|
||||
const result = evaluateWorkaroundConsent("不要安装 skill,直接用本地命令试试", false);
|
||||
expect(result).toEqual({ allowWorkaround: true, declined: false });
|
||||
});
|
||||
|
||||
it("grants workaround mode for short affirmative replies when awaiting confirmation", () => {
|
||||
const result = evaluateWorkaroundConsent("继续", true);
|
||||
expect(result).toEqual({ allowWorkaround: true, declined: false });
|
||||
});
|
||||
|
||||
it("treats standalone Chinese affirmative as workaround consent when awaiting confirmation", () => {
|
||||
const result = evaluateWorkaroundConsent("行", true);
|
||||
expect(result).toEqual({ allowWorkaround: true, declined: false });
|
||||
});
|
||||
|
||||
it("marks declines when no workaround intent is present", () => {
|
||||
const result = evaluateWorkaroundConsent("不要,先别执行", true);
|
||||
expect(result).toEqual({ allowWorkaround: false, declined: true });
|
||||
});
|
||||
});
|
||||
|
||||
describe("isLocalSkillMutationCommand", () => {
|
||||
it("detects direct local skill mutation commands", () => {
|
||||
expect(
|
||||
isLocalSkillMutationCommand(
|
||||
"mkdir -p ~/.super-multica/skills/notion-integration && touch ~/.super-multica/skills/notion-integration/SKILL.md",
|
||||
),
|
||||
).toBe(true);
|
||||
|
||||
expect(
|
||||
isLocalSkillMutationCommand(
|
||||
"cat > ~/.super-multica/skills/notion-integration/SKILL.md << 'EOF'\n# skill\nEOF",
|
||||
),
|
||||
).toBe(true);
|
||||
});
|
||||
|
||||
it("does not match read-only commands or clawhub install flow", () => {
|
||||
expect(isLocalSkillMutationCommand("cat ~/.super-multica/skills/notion/SKILL.md")).toBe(false);
|
||||
expect(
|
||||
isLocalSkillMutationCommand(
|
||||
"npx -y clawhub install notion --workdir ~/.super-multica --dir skills --force",
|
||||
),
|
||||
).toBe(false);
|
||||
});
|
||||
});
|
||||
|
||||
describe("evaluateCustomSkillAuthoringConsent", () => {
|
||||
it("does not grant consent for generic third-party requests", () => {
|
||||
const result = evaluateCustomSkillAuthoringConsent("帮我在 Notion 新建一个页面", false);
|
||||
expect(result).toEqual({ allowAuthoring: false, declined: false });
|
||||
});
|
||||
|
||||
it("grants consent when user explicitly asks to create a custom skill", () => {
|
||||
const result = evaluateCustomSkillAuthoringConsent("请帮我创建一个 Notion skill", false);
|
||||
expect(result).toEqual({ allowAuthoring: true, declined: false });
|
||||
});
|
||||
|
||||
it("grants consent for short affirmatives when awaiting confirmation", () => {
|
||||
const result = evaluateCustomSkillAuthoringConsent("继续", true);
|
||||
expect(result).toEqual({ allowAuthoring: true, declined: false });
|
||||
});
|
||||
|
||||
it("marks declines explicitly", () => {
|
||||
const result = evaluateCustomSkillAuthoringConsent("先别创建技能", true);
|
||||
expect(result).toEqual({ allowAuthoring: false, declined: true });
|
||||
});
|
||||
});
|
||||
|
|
@ -57,6 +57,7 @@ import {
|
|||
import { isContextOverflowError } from "./errors.js";
|
||||
import { resolveWorkspaceDir, ensureWorkspaceDir } from "./workspace.js";
|
||||
import { createRunLog, type RunLog } from "./run-log.js";
|
||||
import type { ExecApprovalCallback } from "./tools/exec-approval-types.js";
|
||||
|
||||
// ============================================================
|
||||
// Error classification for auth profile rotation
|
||||
|
|
@ -90,6 +91,153 @@ export function isRotatableError(reason: AuthProfileFailureReason): boolean {
|
|||
return reason === "auth" || reason === "rate_limit" || reason === "billing" || reason === "timeout";
|
||||
}
|
||||
|
||||
// ── Skill install consent guard ─────────────────────────────────────────────
|
||||
|
||||
const CLAWHUB_MUTATION_RE = /\bclawhub\b[\s\S]*\b(?:install|update)\b/i;
|
||||
const ENV_INSTALL_RE = /\b(?:brew|apt-get|apt|yum|dnf|pacman|zypper)\s+(?:install|upgrade|tap)\b|\b(?:npm|pnpm|yarn|bun)\s+(?:install|add)\b|\bpip(?:3)?\s+install\b|\buv\s+(?:tool\s+install|pip\s+install)\b|\bcargo\s+install\b|\bgo\s+install\b/i;
|
||||
const THIRD_PARTY_WORKAROUND_RE = /\b(?:osascript|spogo|spotify_player|ha\.sh|homeassistant|hass)\b|\/api\/states\b/i;
|
||||
const LOCAL_SKILL_PATH_RE = /(?:~\/\.super-multica(?:-[\w-]+)?\/skills\/|\/\.super-multica(?:-[\w-]+)?\/skills\/|\/skills\/)/i;
|
||||
const LOCAL_SKILL_MUTATION_VERB_RE = /\b(?:mkdir|cp|mv|rm|touch|install|clone)\b/i;
|
||||
const INSTALL_ACTION_RE = /\b(?:install|update|add)\b|安装|更新|添加|启用|配置/i;
|
||||
const SKILL_CONTEXT_RE = /\b(?:clawhub|skill|skills)\b|技能|插件|扩展/i;
|
||||
const WORKAROUND_ACTION_RE = /\b(?:workaround|fallback|local\s+command|local\s+script|shell\s+script|osascript|apple\s*script|spogo|spotify_player|homeassistant|ha\.sh)\b|绕过|临时方案|本地命令|本机命令|脚本方式|直接执行|不用技能|不用skill|不装skill|不安装skill/i;
|
||||
const CUSTOM_SKILL_AUTHORING_RE = /\b(?:create|author|build)\b[\s\S]*\bskills?\b|创建[\s\S]{0,30}(?:技能|skill)|自定义[\s\S]{0,20}(?:技能|skill)|手写[\s\S]{0,20}(?:技能|skill)|custom\s+skill/i;
|
||||
const AFFIRMATIVE_RE = /\b(?:yes|y|ok|okay|sure|confirm|confirmed|continue|go ahead|please do|do it)\b|继续|确认|同意|可以|好的|继续安装/i;
|
||||
const STANDALONE_AFFIRMATIVE_RE = /^\s*(?:行|行吧|行的)\s*[。!!]?$/i;
|
||||
const DECLINE_RE = /\b(?:no|cancel|stop|don't|do not|not now|skip)\b|不要|不需要|取消|先别|暂时不用/i;
|
||||
|
||||
function hasAffirmativeConsent(text: string): boolean {
|
||||
return AFFIRMATIVE_RE.test(text) || STANDALONE_AFFIRMATIVE_RE.test(text);
|
||||
}
|
||||
|
||||
/**
|
||||
* Detect mutating ClawHub commands that require explicit user confirmation.
|
||||
*/
|
||||
export function isMutatingClawhubCommand(command: string): boolean {
|
||||
return CLAWHUB_MUTATION_RE.test(command);
|
||||
}
|
||||
|
||||
/**
|
||||
* Detect package/environment installation commands.
|
||||
* These mutate the runtime environment and should require explicit user confirmation.
|
||||
*/
|
||||
export function isEnvironmentInstallCommand(command: string): boolean {
|
||||
return ENV_INSTALL_RE.test(command);
|
||||
}
|
||||
|
||||
/**
|
||||
* Detect local workaround commands for third-party integrations.
|
||||
* These should require explicit user opt-in before execution.
|
||||
*/
|
||||
export function isThirdPartyWorkaroundCommand(command: string): boolean {
|
||||
return THIRD_PARTY_WORKAROUND_RE.test(command);
|
||||
}
|
||||
|
||||
/**
|
||||
* Detect direct local skill mutations outside ClawHub install/update flow.
|
||||
*/
|
||||
export function isLocalSkillMutationCommand(command: string): boolean {
|
||||
if (!LOCAL_SKILL_PATH_RE.test(command)) return false;
|
||||
if (/\bclawhub\b/i.test(command)) return false;
|
||||
|
||||
if (LOCAL_SKILL_MUTATION_VERB_RE.test(command)) return true;
|
||||
|
||||
const hasCatOrEchoWrite = /\b(?:cat|tee|echo)\b/i.test(command) && />>?|<<\s*['"]?EOF/i.test(command);
|
||||
return hasCatOrEchoWrite;
|
||||
}
|
||||
|
||||
/**
|
||||
* Determine whether the current user prompt grants permission to install/update skills.
|
||||
*
|
||||
* If `awaitingConfirmation` is true, short affirmative replies (e.g. "继续", "yes")
|
||||
* are treated as confirmation.
|
||||
*/
|
||||
export function evaluateSkillInstallConsent(
|
||||
prompt: string,
|
||||
awaitingConfirmation: boolean,
|
||||
): { allowInstall: boolean; declined: boolean } {
|
||||
const text = prompt.trim();
|
||||
if (!text) return { allowInstall: false, declined: false };
|
||||
|
||||
if (DECLINE_RE.test(text)) {
|
||||
return { allowInstall: false, declined: true };
|
||||
}
|
||||
|
||||
const hasInstallAction = INSTALL_ACTION_RE.test(text);
|
||||
const hasSkillContext = SKILL_CONTEXT_RE.test(text);
|
||||
const hasAffirmative = hasAffirmativeConsent(text);
|
||||
|
||||
if (hasInstallAction) {
|
||||
return { allowInstall: true, declined: false };
|
||||
}
|
||||
|
||||
if (hasSkillContext && hasAffirmative) {
|
||||
return { allowInstall: true, declined: false };
|
||||
}
|
||||
|
||||
if (awaitingConfirmation && hasAffirmative) {
|
||||
return { allowInstall: true, declined: false };
|
||||
}
|
||||
|
||||
return { allowInstall: false, declined: false };
|
||||
}
|
||||
|
||||
/**
|
||||
* Determine whether the current user prompt explicitly opts into local workaround mode.
|
||||
*/
|
||||
export function evaluateWorkaroundConsent(
|
||||
prompt: string,
|
||||
awaitingConfirmation: boolean,
|
||||
): { allowWorkaround: boolean; declined: boolean } {
|
||||
const text = prompt.trim();
|
||||
if (!text) return { allowWorkaround: false, declined: false };
|
||||
|
||||
const hasWorkaroundAction = WORKAROUND_ACTION_RE.test(text);
|
||||
const hasAffirmative = hasAffirmativeConsent(text);
|
||||
|
||||
if (hasWorkaroundAction) {
|
||||
return { allowWorkaround: true, declined: false };
|
||||
}
|
||||
|
||||
if (awaitingConfirmation && hasAffirmative) {
|
||||
return { allowWorkaround: true, declined: false };
|
||||
}
|
||||
|
||||
if (DECLINE_RE.test(text)) {
|
||||
return { allowWorkaround: false, declined: true };
|
||||
}
|
||||
|
||||
return { allowWorkaround: false, declined: false };
|
||||
}
|
||||
|
||||
/**
|
||||
* Determine whether the current prompt explicitly opts into custom skill authoring.
|
||||
*/
|
||||
export function evaluateCustomSkillAuthoringConsent(
|
||||
prompt: string,
|
||||
awaitingConfirmation: boolean,
|
||||
): { allowAuthoring: boolean; declined: boolean } {
|
||||
const text = prompt.trim();
|
||||
if (!text) return { allowAuthoring: false, declined: false };
|
||||
|
||||
if (DECLINE_RE.test(text)) {
|
||||
return { allowAuthoring: false, declined: true };
|
||||
}
|
||||
|
||||
const hasAuthoringIntent = CUSTOM_SKILL_AUTHORING_RE.test(text);
|
||||
const hasAffirmative = hasAffirmativeConsent(text);
|
||||
|
||||
if (hasAuthoringIntent) {
|
||||
return { allowAuthoring: true, declined: false };
|
||||
}
|
||||
|
||||
if (awaitingConfirmation && hasAffirmative) {
|
||||
return { allowAuthoring: true, declined: false };
|
||||
}
|
||||
|
||||
return { allowAuthoring: false, declined: false };
|
||||
}
|
||||
|
||||
// ── Run-log result extraction helpers ──────────────────────────────────────
|
||||
// Lightweight extractors for tool_end metadata. These mirror the patterns in
|
||||
// cli/output.ts but are kept separate to avoid CLI-specific dependencies.
|
||||
|
|
@ -201,6 +349,13 @@ export class Agent {
|
|||
private readonly toolStartTimes = new Map<string, number>();
|
||||
private currentRunToolExecutions: ToolExecutionRecord[] = [];
|
||||
private initialized = false;
|
||||
private allowSkillInstallForCurrentRun = false;
|
||||
private awaitingSkillInstallConfirmation = false;
|
||||
private allowWorkaroundForCurrentRun = false;
|
||||
private awaitingWorkaroundConfirmation = false;
|
||||
private allowCustomSkillAuthoringForCurrentRun = false;
|
||||
private awaitingCustomSkillAuthoringConfirmation = false;
|
||||
private readonly guardedExecApproval: ExecApprovalCallback;
|
||||
|
||||
// Context window settings (for pre-flight compaction)
|
||||
private readonly reserveTokens: number;
|
||||
|
|
@ -244,6 +399,7 @@ export class Agent {
|
|||
|
||||
// Load session metadata early so stored provider/model can inform defaults
|
||||
this.sessionId = options.sessionId ?? uuidv7();
|
||||
this.guardedExecApproval = this.createGuardedExecApprovalCallback(options.onExecApprovalNeeded);
|
||||
this.runLog = createRunLog(
|
||||
options.enableRunLog ?? !!process.env.MULTICA_RUN_LOG,
|
||||
this.sessionId,
|
||||
|
|
@ -454,8 +610,25 @@ export class Agent {
|
|||
// Use this.sessionId (which may be auto-generated) instead of options.sessionId
|
||||
// (which may be undefined). Without this, delegate tool has no session context.
|
||||
this.toolsOptions = mergedToolsConfig
|
||||
? { ...options, sessionId: this.sessionId, cwd: effectiveCwd, tools: mergedToolsConfig, profileDir, provider: this.resolvedProvider, runLog: this.runLog }
|
||||
: { ...options, sessionId: this.sessionId, cwd: effectiveCwd, profileDir, provider: this.resolvedProvider, runLog: this.runLog };
|
||||
? {
|
||||
...options,
|
||||
sessionId: this.sessionId,
|
||||
cwd: effectiveCwd,
|
||||
tools: mergedToolsConfig,
|
||||
profileDir,
|
||||
provider: this.resolvedProvider,
|
||||
runLog: this.runLog,
|
||||
onExecApprovalNeeded: this.guardedExecApproval,
|
||||
}
|
||||
: {
|
||||
...options,
|
||||
sessionId: this.sessionId,
|
||||
cwd: effectiveCwd,
|
||||
profileDir,
|
||||
provider: this.resolvedProvider,
|
||||
runLog: this.runLog,
|
||||
onExecApprovalNeeded: this.guardedExecApproval,
|
||||
};
|
||||
|
||||
const tools = resolveTools(this.toolsOptions);
|
||||
if (this.debug) {
|
||||
|
|
@ -585,6 +758,42 @@ export class Agent {
|
|||
this._aborted = false;
|
||||
this.currentRunToolExecutions = [];
|
||||
|
||||
if (this._internalRun) {
|
||||
this.allowSkillInstallForCurrentRun = false;
|
||||
this.allowWorkaroundForCurrentRun = false;
|
||||
this.allowCustomSkillAuthoringForCurrentRun = false;
|
||||
} else {
|
||||
const consent = evaluateSkillInstallConsent(prompt, this.awaitingSkillInstallConfirmation);
|
||||
if (consent.declined) {
|
||||
this.awaitingSkillInstallConfirmation = false;
|
||||
}
|
||||
this.allowSkillInstallForCurrentRun = consent.allowInstall;
|
||||
if (consent.allowInstall) {
|
||||
this.awaitingSkillInstallConfirmation = false;
|
||||
}
|
||||
|
||||
const workaroundConsent = evaluateWorkaroundConsent(prompt, this.awaitingWorkaroundConfirmation);
|
||||
if (workaroundConsent.declined) {
|
||||
this.awaitingWorkaroundConfirmation = false;
|
||||
}
|
||||
this.allowWorkaroundForCurrentRun = workaroundConsent.allowWorkaround;
|
||||
if (workaroundConsent.allowWorkaround) {
|
||||
this.awaitingWorkaroundConfirmation = false;
|
||||
}
|
||||
|
||||
const customSkillConsent = evaluateCustomSkillAuthoringConsent(
|
||||
prompt,
|
||||
this.awaitingCustomSkillAuthoringConfirmation,
|
||||
);
|
||||
if (customSkillConsent.declined) {
|
||||
this.awaitingCustomSkillAuthoringConfirmation = false;
|
||||
}
|
||||
this.allowCustomSkillAuthoringForCurrentRun = customSkillConsent.allowAuthoring;
|
||||
if (customSkillConsent.allowAuthoring) {
|
||||
this.awaitingCustomSkillAuthoringConfirmation = false;
|
||||
}
|
||||
}
|
||||
|
||||
const runStart = Date.now();
|
||||
this.runLog.log("run_start", {
|
||||
prompt: prompt.slice(0, 200),
|
||||
|
|
@ -758,6 +967,9 @@ export class Agent {
|
|||
}
|
||||
this._isRunning = false;
|
||||
this._aborted = false;
|
||||
this.allowSkillInstallForCurrentRun = false;
|
||||
this.allowWorkaroundForCurrentRun = false;
|
||||
this.allowCustomSkillAuthoringForCurrentRun = false;
|
||||
this._lastEventSavedAssistant = undefined;
|
||||
this.currentUserDisplayPrompt = undefined;
|
||||
this.currentUserSource = undefined;
|
||||
|
|
@ -766,6 +978,91 @@ export class Agent {
|
|||
}
|
||||
}
|
||||
|
||||
private createGuardedExecApprovalCallback(
|
||||
base?: ExecApprovalCallback,
|
||||
): ExecApprovalCallback {
|
||||
return async (command, cwd) => {
|
||||
const needsInstallConsent =
|
||||
isMutatingClawhubCommand(command) || isEnvironmentInstallCommand(command);
|
||||
const needsWorkaroundConsent = isThirdPartyWorkaroundCommand(command);
|
||||
const needsCustomSkillAuthoringConsent = isLocalSkillMutationCommand(command);
|
||||
if (needsInstallConsent && !this.allowSkillInstallForCurrentRun) {
|
||||
this.awaitingSkillInstallConfirmation = true;
|
||||
this.runLog.log("install_guard", {
|
||||
action: "blocked",
|
||||
reason: "explicit_user_confirmation_required",
|
||||
command: command.slice(0, 200),
|
||||
});
|
||||
return {
|
||||
approved: false,
|
||||
decision: "deny",
|
||||
message:
|
||||
"Install command blocked: explicit user confirmation is required first. Ask the user whether to continue installation.",
|
||||
};
|
||||
}
|
||||
|
||||
if (needsInstallConsent) {
|
||||
this.runLog.log("install_guard", {
|
||||
action: "allowed",
|
||||
reason: "user_confirmed",
|
||||
command: command.slice(0, 200),
|
||||
});
|
||||
}
|
||||
|
||||
if (needsCustomSkillAuthoringConsent && !this.allowCustomSkillAuthoringForCurrentRun) {
|
||||
this.awaitingCustomSkillAuthoringConfirmation = true;
|
||||
this.runLog.log("custom_skill_guard", {
|
||||
action: "blocked",
|
||||
reason: "explicit_custom_skill_authoring_confirmation_required",
|
||||
command: command.slice(0, 200),
|
||||
});
|
||||
return {
|
||||
approved: false,
|
||||
decision: "deny",
|
||||
message:
|
||||
"Manual local skill creation command blocked by policy. Use ClawHub discovery/install flow first, or ask the user to explicitly confirm custom skill authoring.",
|
||||
};
|
||||
}
|
||||
|
||||
if (needsCustomSkillAuthoringConsent) {
|
||||
this.runLog.log("custom_skill_guard", {
|
||||
action: "allowed",
|
||||
reason: "user_confirmed_custom_skill_authoring",
|
||||
command: command.slice(0, 200),
|
||||
});
|
||||
}
|
||||
|
||||
if (needsWorkaroundConsent && !this.allowWorkaroundForCurrentRun) {
|
||||
this.awaitingWorkaroundConfirmation = true;
|
||||
this.runLog.log("workaround_guard", {
|
||||
action: "blocked",
|
||||
reason: "explicit_workaround_opt_in_required",
|
||||
command: command.slice(0, 200),
|
||||
});
|
||||
return {
|
||||
approved: false,
|
||||
decision: "deny",
|
||||
message:
|
||||
"Local workaround command blocked by policy. First explain the capability gap and ask whether to search/install a Cloud Hub skill, or get explicit user opt-in for workaround mode.",
|
||||
};
|
||||
}
|
||||
|
||||
if (needsWorkaroundConsent) {
|
||||
this.runLog.log("workaround_guard", {
|
||||
action: "allowed",
|
||||
reason: "user_opted_in_workaround_mode",
|
||||
command: command.slice(0, 200),
|
||||
});
|
||||
}
|
||||
|
||||
if (base) {
|
||||
return base(command, cwd);
|
||||
}
|
||||
|
||||
return { approved: true, decision: "allow-once" };
|
||||
};
|
||||
}
|
||||
|
||||
/**
|
||||
* Advance to the next non-cooldown auth profile.
|
||||
* Returns true if a new profile was activated, false if exhausted.
|
||||
|
|
|
|||
|
|
@ -218,6 +218,34 @@ describe("buildSkillsSection", () => {
|
|||
expect(text).toContain("suggest activating it");
|
||||
});
|
||||
|
||||
it("includes capability-gap recovery guidance", () => {
|
||||
const result = buildSkillsSection("## commit\nDo commits.", "full");
|
||||
const text = result.join("\n");
|
||||
expect(text).toContain("capability gap");
|
||||
expect(text).toContain("explicit user confirmation");
|
||||
expect(text).toContain("clawhub install");
|
||||
expect(text).toContain("third-party service requests");
|
||||
expect(text).toContain("local workaround commands");
|
||||
expect(text).toContain("spotify_player");
|
||||
});
|
||||
|
||||
it("surfaces installed skill IDs and prioritizes meta skill guidance when present", () => {
|
||||
const prompt = [
|
||||
"## 🔧 Meta Skill Installer (meta-skill-installer)",
|
||||
"Detect missing capabilities.",
|
||||
"",
|
||||
"## 📄 PDF (pdf)",
|
||||
"Handle PDFs.",
|
||||
].join("\n");
|
||||
const result = buildSkillsSection(prompt, "full");
|
||||
const text = result.join("\n");
|
||||
expect(text).toContain("Installed skill IDs:");
|
||||
expect(text).toContain("`meta-skill-installer`");
|
||||
expect(text).toContain("is installed");
|
||||
expect(text).toContain("ClawHub search");
|
||||
expect(text).toContain("run ClawHub discovery first");
|
||||
});
|
||||
|
||||
it("returns empty in minimal mode", () => {
|
||||
expect(buildSkillsSection("skills", "minimal")).toEqual([]);
|
||||
});
|
||||
|
|
|
|||
|
|
@ -389,19 +389,72 @@ export function buildSkillsSection(
|
|||
const trimmed = skillsPrompt?.trim();
|
||||
if (!trimmed) return [];
|
||||
|
||||
const skillIds = extractSkillIdsFromSkillsPrompt(trimmed);
|
||||
const hasMetaSkillInstaller = skillIds.includes("meta-skill-installer");
|
||||
const { text: budgeted } = truncateWithBudget(trimmed, DEFAULT_SKILLS_MAX_CHARS);
|
||||
|
||||
return [
|
||||
const lines: string[] = [
|
||||
"## Skills (mandatory)",
|
||||
"Before replying: scan the available skills below.",
|
||||
];
|
||||
|
||||
if (skillIds.length > 0) {
|
||||
lines.push(
|
||||
`Installed skill IDs: ${skillIds.map((id) => `\`${id}\``).join(", ")}`,
|
||||
);
|
||||
}
|
||||
|
||||
lines.push(
|
||||
"- If exactly one skill clearly applies: follow its instructions.",
|
||||
"- If multiple could apply: choose the most specific one.",
|
||||
"- If none clearly apply but an **inactive skill** matches the user's intent: suggest activating it.",
|
||||
"- If no skill matches at all: skip skill invocation.",
|
||||
"- If the request needs a capability you currently lack: do not stop at refusal. Treat it as a capability gap and propose a recovery path.",
|
||||
"- For third-party service requests (Spotify, Notion, Slack, Jira, etc.), do not jump to ad-hoc shell/app hacks as the default path.",
|
||||
"- Treat local CLIs/scripts (for example `spogo`, `spotify_player`, `osascript`, `ha.sh`) as workaround mode: only use them after explicit user opt-in.",
|
||||
);
|
||||
|
||||
if (hasMetaSkillInstaller) {
|
||||
lines.push(
|
||||
"- `meta-skill-installer` is installed: for capability gaps with no matching installed skill, proactively offer ClawHub search + security review + explicit install confirmation.",
|
||||
"- With `meta-skill-installer` installed, run ClawHub discovery first (`clawhub search`) before proposing to hand-build a new custom skill.",
|
||||
);
|
||||
} else {
|
||||
lines.push(
|
||||
"- If `meta-skill-installer` is available and no installed skill matches: proactively offer to search ClawHub for candidates and run security review before install.",
|
||||
"- Prefer ClawHub discovery over creating a brand-new custom skill from scratch unless the user explicitly asks for custom skill authoring.",
|
||||
);
|
||||
}
|
||||
|
||||
lines.push(
|
||||
"- Ask for explicit user confirmation before final `clawhub install` / `clawhub update` unless the user already clearly asked you to install in this turn.",
|
||||
"- Only use local workaround commands (for example `osascript` or custom shell scripts) if the user explicitly asks for workaround mode or declines skill installation.",
|
||||
"- After install/update, verify the skill path and retry the original user task.",
|
||||
"",
|
||||
budgeted,
|
||||
"",
|
||||
];
|
||||
);
|
||||
|
||||
return lines;
|
||||
}
|
||||
|
||||
/**
|
||||
* Extract skill IDs from SkillManager prompt headings.
|
||||
* Expected heading format: `## <emoji> <name> (<id>)`
|
||||
*/
|
||||
function extractSkillIdsFromSkillsPrompt(skillsPrompt: string): string[] {
|
||||
const ids: string[] = [];
|
||||
const seen = new Set<string>();
|
||||
const headingRegex = /^##\s+.*\(([^()\n]+)\)\s*$/gm;
|
||||
|
||||
let match: RegExpExecArray | null;
|
||||
while ((match = headingRegex.exec(skillsPrompt)) !== null) {
|
||||
const id = match[1]?.trim();
|
||||
if (!id || seen.has(id)) continue;
|
||||
seen.add(id);
|
||||
ids.push(id);
|
||||
}
|
||||
|
||||
return ids;
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
|||
|
|
@ -40,6 +40,8 @@ export interface ExecApprovalRequest {
|
|||
export interface ApprovalResult {
|
||||
approved: boolean;
|
||||
decision: ApprovalDecision;
|
||||
/** Optional denial/approval message for the exec tool response */
|
||||
message?: string | undefined;
|
||||
}
|
||||
|
||||
// ============ Configuration ============
|
||||
|
|
|
|||
|
|
@ -59,10 +59,11 @@ export function createExecTool(
|
|||
if (onApprovalNeeded) {
|
||||
const approvalResult = await onApprovalNeeded(command, effectiveCwd);
|
||||
if (!approvalResult.approved) {
|
||||
const denialText = approvalResult.message?.trim() || "Command execution denied by user.";
|
||||
return {
|
||||
content: [{ type: "text", text: "Command execution denied by user." }],
|
||||
content: [{ type: "text", text: denialText }],
|
||||
details: {
|
||||
output: "Command execution denied by user.",
|
||||
output: denialText,
|
||||
exitCode: 1,
|
||||
truncated: false,
|
||||
},
|
||||
|
|
|
|||
441
scripts/e2e-skills-benchmark/analyze.mjs
Executable file
441
scripts/e2e-skills-benchmark/analyze.mjs
Executable file
|
|
@ -0,0 +1,441 @@
|
|||
#!/usr/bin/env node
|
||||
|
||||
import { existsSync, readFileSync, writeFileSync } from "node:fs";
|
||||
import { dirname, join, resolve } from "node:path";
|
||||
|
||||
/**
|
||||
* @typedef {{
|
||||
* id: string;
|
||||
* check: string;
|
||||
* passed: boolean;
|
||||
* detail?: string;
|
||||
* }} CheckResult
|
||||
*/
|
||||
|
||||
/**
|
||||
* @typedef {{
|
||||
* provider: string;
|
||||
* caseId: string;
|
||||
* status: string;
|
||||
* sessionId: string;
|
||||
* sessionDir: string;
|
||||
* logFile: string;
|
||||
* checks: CheckResult[];
|
||||
* pass: boolean;
|
||||
* }} CaseAnalysis
|
||||
*/
|
||||
|
||||
const manifestArg = process.argv[2];
|
||||
if (!manifestArg || manifestArg === "--help" || manifestArg === "-h") {
|
||||
console.log("Usage: node scripts/e2e-skills-benchmark/analyze.mjs <manifest.tsv>");
|
||||
process.exit(0);
|
||||
}
|
||||
|
||||
const manifestPath = resolve(manifestArg);
|
||||
if (!existsSync(manifestPath)) {
|
||||
console.error(`Manifest not found: ${manifestPath}`);
|
||||
process.exit(1);
|
||||
}
|
||||
|
||||
const CASE_RULES = {
|
||||
"case-01-install-caldav-calendar": {
|
||||
requiredCommandTokens: [
|
||||
["clawhub", "search"],
|
||||
["caldav"],
|
||||
["clawhub", "install"],
|
||||
["review-skill-security.mjs"],
|
||||
],
|
||||
},
|
||||
"case-02-gap-discovery-homeassistant": {
|
||||
requiredCommandTokens: [
|
||||
["clawhub", "search"],
|
||||
["home", "assistant"],
|
||||
["clawhub", "install"],
|
||||
["review-skill-security.mjs"],
|
||||
],
|
||||
},
|
||||
"case-03-install-update-codexmonitor": {
|
||||
requiredCommandTokens: [
|
||||
["clawhub", "search"],
|
||||
["codexmonitor"],
|
||||
["clawhub", "install"],
|
||||
["clawhub", "update"],
|
||||
["review-skill-security.mjs"],
|
||||
],
|
||||
},
|
||||
"case-04-gap-discovery-spotify-ux": {
|
||||
requireExecUsage: false,
|
||||
requiredResponseRegex: [
|
||||
"缺少|没有.*(技能|能力|集成)|capability gap",
|
||||
"clawhub|cloud\\s*hub|cloudhub",
|
||||
"安装|install",
|
||||
"是否|要不要|would you like|do you want",
|
||||
"安全|审查|security|review",
|
||||
],
|
||||
forbiddenCommandTokens: [
|
||||
["clawhub", "install"],
|
||||
["clawhub", "update"],
|
||||
["osascript"],
|
||||
["spogo"],
|
||||
["spotify_player"],
|
||||
["ha.sh"],
|
||||
["/api/states"],
|
||||
],
|
||||
},
|
||||
"case-05-gap-discovery-notion-ux": {
|
||||
requireExecUsage: false,
|
||||
requiredCommandTokens: [
|
||||
["clawhub", "search"],
|
||||
["notion"],
|
||||
],
|
||||
requiredEventTokens: [
|
||||
["install_guard", "blocked"],
|
||||
],
|
||||
requiredResponseRegex: [
|
||||
"notion",
|
||||
"安装|install",
|
||||
"是否|要不要|would you like|do you want|同意",
|
||||
"token|授权|integration",
|
||||
],
|
||||
forbiddenCommandTokens: [
|
||||
["osascript"],
|
||||
["spogo"],
|
||||
["spotify_player"],
|
||||
["ha.sh"],
|
||||
["/api/states"],
|
||||
],
|
||||
},
|
||||
};
|
||||
|
||||
/**
|
||||
* @param {string} text
|
||||
* @returns {string[]}
|
||||
*/
|
||||
function splitLines(text) {
|
||||
return text.split(/\r?\n/).filter(Boolean);
|
||||
}
|
||||
|
||||
/**
|
||||
* @param {string} command
|
||||
* @param {string[]} tokens
|
||||
* @returns {boolean}
|
||||
*/
|
||||
function commandHasTokens(command, tokens) {
|
||||
const lower = command.toLowerCase();
|
||||
return tokens.every((token) => lower.includes(token.toLowerCase()));
|
||||
}
|
||||
|
||||
/**
|
||||
* @param {string} rawArgs
|
||||
* @returns {string}
|
||||
*/
|
||||
function extractCommand(rawArgs) {
|
||||
if (!rawArgs) return "";
|
||||
try {
|
||||
const parsed = JSON.parse(rawArgs);
|
||||
if (parsed && typeof parsed.command === "string") {
|
||||
return parsed.command;
|
||||
}
|
||||
} catch {
|
||||
// Fall through: args may be truncated JSON in run-log.
|
||||
}
|
||||
return rawArgs;
|
||||
}
|
||||
|
||||
/**
|
||||
* @param {string} text
|
||||
* @param {string} pattern
|
||||
* @returns {boolean}
|
||||
*/
|
||||
function textMatchesPattern(text, pattern) {
|
||||
try {
|
||||
return new RegExp(pattern, "i").test(text);
|
||||
} catch {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @param {string} runLogPath
|
||||
*/
|
||||
function parseRunLog(runLogPath) {
|
||||
const lines = splitLines(readFileSync(runLogPath, "utf-8"));
|
||||
const events = [];
|
||||
for (const line of lines) {
|
||||
try {
|
||||
events.push(JSON.parse(line));
|
||||
} catch {
|
||||
// Ignore malformed lines but keep analysis alive.
|
||||
}
|
||||
}
|
||||
return events;
|
||||
}
|
||||
|
||||
/**
|
||||
* @param {string} sessionPath
|
||||
* @returns {string}
|
||||
*/
|
||||
function parseFinalAssistantText(sessionPath) {
|
||||
if (!existsSync(sessionPath)) return "";
|
||||
|
||||
const lines = splitLines(readFileSync(sessionPath, "utf-8"));
|
||||
let latest = "";
|
||||
|
||||
for (const line of lines) {
|
||||
try {
|
||||
const entry = JSON.parse(line);
|
||||
if (entry?.type !== "message") continue;
|
||||
const msg = entry.message;
|
||||
if (!msg || msg.role !== "assistant") continue;
|
||||
|
||||
if (typeof msg.content === "string") {
|
||||
latest = msg.content;
|
||||
continue;
|
||||
}
|
||||
|
||||
if (Array.isArray(msg.content)) {
|
||||
const text = msg.content
|
||||
.filter((part) => part && part.type === "text" && typeof part.text === "string")
|
||||
.map((part) => part.text)
|
||||
.join("\n")
|
||||
.trim();
|
||||
if (text) latest = text;
|
||||
}
|
||||
} catch {
|
||||
// Ignore malformed lines.
|
||||
}
|
||||
}
|
||||
|
||||
return latest;
|
||||
}
|
||||
|
||||
/**
|
||||
* @param {CaseAnalysis} analysis
|
||||
* @param {string} id
|
||||
* @param {string} check
|
||||
* @param {boolean} passed
|
||||
* @param {string} [detail]
|
||||
*/
|
||||
function addCheck(analysis, id, check, passed, detail) {
|
||||
analysis.checks.push({ id, check, passed, detail });
|
||||
}
|
||||
|
||||
const rows = splitLines(readFileSync(manifestPath, "utf-8"));
|
||||
if (rows.length <= 1) {
|
||||
console.error(`Manifest has no data rows: ${manifestPath}`);
|
||||
process.exit(1);
|
||||
}
|
||||
|
||||
/** @type {CaseAnalysis[]} */
|
||||
const analyses = [];
|
||||
|
||||
for (let i = 1; i < rows.length; i++) {
|
||||
const row = rows[i];
|
||||
if (!row) continue;
|
||||
|
||||
const cols = row.split("\t");
|
||||
if (cols.length < 11) continue;
|
||||
|
||||
const provider = cols[1] ?? "";
|
||||
const caseId = cols[2] ?? "";
|
||||
const rules = CASE_RULES[caseId];
|
||||
const status = cols[3] ?? "";
|
||||
const sessionId = cols[4] ?? "";
|
||||
const sessionDir = cols[5] ?? "";
|
||||
const logFile = cols[6] ?? "";
|
||||
|
||||
/** @type {CaseAnalysis} */
|
||||
const analysis = {
|
||||
provider,
|
||||
caseId,
|
||||
status,
|
||||
sessionId,
|
||||
sessionDir,
|
||||
logFile,
|
||||
checks: [],
|
||||
pass: false,
|
||||
};
|
||||
|
||||
addCheck(
|
||||
analysis,
|
||||
"run-status",
|
||||
"runner status is success",
|
||||
status === "success",
|
||||
`status=${status}`,
|
||||
);
|
||||
|
||||
if (!sessionDir) {
|
||||
addCheck(analysis, "session-dir", "session_dir exists in manifest", false, "missing session_dir");
|
||||
analyses.push(analysis);
|
||||
continue;
|
||||
}
|
||||
|
||||
const runLogPath = join(sessionDir, "run-log.jsonl");
|
||||
addCheck(
|
||||
analysis,
|
||||
"run-log-file",
|
||||
"run-log.jsonl exists",
|
||||
existsSync(runLogPath),
|
||||
runLogPath,
|
||||
);
|
||||
|
||||
if (!existsSync(runLogPath)) {
|
||||
analyses.push(analysis);
|
||||
continue;
|
||||
}
|
||||
|
||||
const events = parseRunLog(runLogPath);
|
||||
const sessionPath = join(sessionDir, "session.jsonl");
|
||||
const finalAssistantText = parseFinalAssistantText(sessionPath);
|
||||
const runStarts = events.filter((e) => e.event === "run_start");
|
||||
const runEnds = events.filter((e) => e.event === "run_end");
|
||||
const toolStarts = events.filter((e) => e.event === "tool_start");
|
||||
const toolEnds = events.filter((e) => e.event === "tool_end");
|
||||
const errorToolEnds = toolEnds.filter((e) => e.is_error === true);
|
||||
|
||||
addCheck(analysis, "event-run-start", "has run_start", runStarts.length > 0, `count=${runStarts.length}`);
|
||||
addCheck(analysis, "event-run-end", "has run_end", runEnds.length > 0, `count=${runEnds.length}`);
|
||||
addCheck(
|
||||
analysis,
|
||||
"tool-pairing",
|
||||
"tool_start count matches tool_end count",
|
||||
toolStarts.length === toolEnds.length,
|
||||
`start=${toolStarts.length} end=${toolEnds.length}`,
|
||||
);
|
||||
|
||||
const finalRunEnd = runEnds.at(-1);
|
||||
const runEndError = finalRunEnd?.error;
|
||||
const finalRunText = typeof finalRunEnd?.text === "string" ? finalRunEnd.text : "";
|
||||
const finalResponseText = finalAssistantText || finalRunText;
|
||||
addCheck(
|
||||
analysis,
|
||||
"run-end-error",
|
||||
"final run_end.error is null/empty",
|
||||
runEndError === null || runEndError === undefined || runEndError === "",
|
||||
`error=${String(runEndError)}`,
|
||||
);
|
||||
|
||||
addCheck(
|
||||
analysis,
|
||||
"tool-errors",
|
||||
"no tool_end has is_error=true",
|
||||
errorToolEnds.length === 0,
|
||||
`error_tool_calls=${errorToolEnds.length}`,
|
||||
);
|
||||
|
||||
const execCommands = toolStarts
|
||||
.filter((e) => e.tool === "exec")
|
||||
.map((e) => extractCommand(typeof e.args === "string" ? e.args : ""))
|
||||
.filter(Boolean);
|
||||
|
||||
const requireExecUsage = rules?.requireExecUsage !== false;
|
||||
addCheck(
|
||||
analysis,
|
||||
"exec-usage",
|
||||
requireExecUsage
|
||||
? "at least one exec command was used"
|
||||
: "exec usage is optional for this case",
|
||||
requireExecUsage ? execCommands.length > 0 : true,
|
||||
requireExecUsage ? `exec_calls=${execCommands.length}` : `exec_calls=${execCommands.length} (optional)`,
|
||||
);
|
||||
|
||||
if (rules) {
|
||||
if (Array.isArray(rules.requiredCommandTokens)) {
|
||||
for (let r = 0; r < rules.requiredCommandTokens.length; r++) {
|
||||
const tokenList = rules.requiredCommandTokens[r];
|
||||
const passed = execCommands.some((cmd) => commandHasTokens(cmd, tokenList));
|
||||
addCheck(
|
||||
analysis,
|
||||
`cmd-${r + 1}`,
|
||||
`exec command contains tokens: ${tokenList.join(" + ")}`,
|
||||
passed,
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
if (Array.isArray(rules.requiredEventTokens)) {
|
||||
const eventLines = events.map((event) => JSON.stringify(event).toLowerCase());
|
||||
for (let r = 0; r < rules.requiredEventTokens.length; r++) {
|
||||
const tokenList = rules.requiredEventTokens[r];
|
||||
const passed = eventLines.some((line) =>
|
||||
tokenList.every((token) => line.includes(token.toLowerCase())),
|
||||
);
|
||||
addCheck(
|
||||
analysis,
|
||||
`event-${r + 1}`,
|
||||
`event log contains tokens: ${tokenList.join(" + ")}`,
|
||||
passed,
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
if (Array.isArray(rules.forbiddenCommandTokens)) {
|
||||
for (let r = 0; r < rules.forbiddenCommandTokens.length; r++) {
|
||||
const tokenList = rules.forbiddenCommandTokens[r];
|
||||
const passed = !execCommands.some((cmd) => commandHasTokens(cmd, tokenList));
|
||||
addCheck(
|
||||
analysis,
|
||||
`forbid-cmd-${r + 1}`,
|
||||
`exec command does not contain tokens: ${tokenList.join(" + ")}`,
|
||||
passed,
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
if (Array.isArray(rules.requiredResponseRegex)) {
|
||||
for (let r = 0; r < rules.requiredResponseRegex.length; r++) {
|
||||
const pattern = rules.requiredResponseRegex[r];
|
||||
const passed = textMatchesPattern(finalResponseText, pattern);
|
||||
addCheck(
|
||||
analysis,
|
||||
`resp-${r + 1}`,
|
||||
`final response matches regex: /${pattern}/i`,
|
||||
passed,
|
||||
);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
addCheck(
|
||||
analysis,
|
||||
"case-rules",
|
||||
"case has rule set",
|
||||
false,
|
||||
`No rules defined for case_id=${caseId}`,
|
||||
);
|
||||
}
|
||||
|
||||
analysis.pass = analysis.checks.every((c) => c.passed);
|
||||
analyses.push(analysis);
|
||||
}
|
||||
|
||||
const passedCases = analyses.filter((a) => a.pass).length;
|
||||
const failedCases = analyses.length - passedCases;
|
||||
|
||||
const output = {
|
||||
manifestPath,
|
||||
totalCases: analyses.length,
|
||||
passedCases,
|
||||
failedCases,
|
||||
results: analyses,
|
||||
};
|
||||
|
||||
const outputPath = join(dirname(manifestPath), "analysis.json");
|
||||
writeFileSync(outputPath, JSON.stringify(output, null, 2) + "\n", "utf-8");
|
||||
|
||||
for (const item of analyses) {
|
||||
const status = item.pass ? "PASS" : "FAIL";
|
||||
console.log(`[${status}] provider=${item.provider} case=${item.caseId} session=${item.sessionId || "N/A"}`);
|
||||
for (const check of item.checks) {
|
||||
const marker = check.passed ? " [ok] " : " [bad] ";
|
||||
const detail = check.detail ? ` (${check.detail})` : "";
|
||||
console.log(`${marker}${check.check}${detail}`);
|
||||
}
|
||||
}
|
||||
|
||||
console.log("");
|
||||
console.log(`Analysis file: ${outputPath}`);
|
||||
console.log(`Summary: pass=${passedCases} fail=${failedCases}`);
|
||||
|
||||
if (failedCases > 0) {
|
||||
process.exit(1);
|
||||
}
|
||||
|
|
@ -0,0 +1,15 @@
|
|||
Run an end-to-end test for the Meta Skill Installer.
|
||||
|
||||
Goal: install a real ClawHub skill for CalDAV calendar capability.
|
||||
Reference page: https://clawhub.ai/skills/caldav-calendar
|
||||
|
||||
Follow this exact workflow:
|
||||
1. State the missing capability in one sentence.
|
||||
2. Search ClawHub for CalDAV-related skills and choose the best candidate.
|
||||
3. Stage-install to a temporary directory first (never install directly to active skills path).
|
||||
4. Run security review on the staged skill:
|
||||
DATA_DIR="${SMC_DATA_DIR:-$HOME/.super-multica}"
|
||||
node "$DATA_DIR/skills/meta-skill-installer/scripts/review-skill-security.mjs" "<staged skill path>"
|
||||
5. If riskLevel is safe, install to "$DATA_DIR/skills".
|
||||
6. Verify final install by checking "$DATA_DIR/skills/<slug>/SKILL.md" exists.
|
||||
7. Return a short report: selected slug, riskLevel, final install path.
|
||||
|
|
@ -0,0 +1,16 @@
|
|||
Run an end-to-end capability-gap discovery test for Meta Skill Installer.
|
||||
|
||||
User intent: "I need to control Home Assistant lights and switches from the agent."
|
||||
Reference page: https://clawhub.ai/skills/homeassistant
|
||||
|
||||
Requirements:
|
||||
1. Treat this as a missing capability and explicitly define the gap.
|
||||
2. Search ClawHub for relevant skills and list the top 3 candidates.
|
||||
3. Pick one candidate with rationale (scope match + lower security risk).
|
||||
4. Stage-install to a temporary directory.
|
||||
5. Run security review:
|
||||
DATA_DIR="${SMC_DATA_DIR:-$HOME/.super-multica}"
|
||||
node "$DATA_DIR/skills/meta-skill-installer/scripts/review-skill-security.mjs" "<staged skill path>"
|
||||
6. If riskLevel is safe, install to "$DATA_DIR/skills".
|
||||
7. Verify "$DATA_DIR/skills/<slug>/SKILL.md" exists.
|
||||
8. Return: candidate list, chosen slug, riskLevel, and final path.
|
||||
|
|
@ -0,0 +1,16 @@
|
|||
Run an end-to-end install+update regression test for Meta Skill Installer.
|
||||
|
||||
Goal: use a real ClawHub skill and verify install, review, and update flow.
|
||||
Reference page: https://clawhub.ai/odrobnik/codexmonitor
|
||||
|
||||
Requirements:
|
||||
1. Search ClawHub for CodexMonitor and select the matching skill slug.
|
||||
2. Stage-install to a temporary directory and run security review:
|
||||
DATA_DIR="${SMC_DATA_DIR:-$HOME/.super-multica}"
|
||||
node "$DATA_DIR/skills/meta-skill-installer/scripts/review-skill-security.mjs" "<staged skill path>"
|
||||
3. If riskLevel is safe, install to "$DATA_DIR/skills".
|
||||
4. Verify "$DATA_DIR/skills/<slug>/SKILL.md" exists.
|
||||
5. Run an update for the same slug in managed dir:
|
||||
clawhub update "<slug>" --workdir "$DATA_DIR" --dir skills --force
|
||||
6. Run security review again on the final installed path.
|
||||
7. Return: slug, initial riskLevel, update executed (yes/no), final path.
|
||||
|
|
@ -0,0 +1 @@
|
|||
随机播放 spotify 中的音乐
|
||||
|
|
@ -0,0 +1 @@
|
|||
帮我在 Notion 新建一个页面,标题是今天待办,并写入三条任务:修复登录 bug、写周报、安排评审
|
||||
170
scripts/e2e-skills-benchmark/run.sh
Executable file
170
scripts/e2e-skills-benchmark/run.sh
Executable file
|
|
@ -0,0 +1,170 @@
|
|||
#!/usr/bin/env bash
|
||||
set -euo pipefail
|
||||
|
||||
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||
ROOT_DIR="$(cd "${SCRIPT_DIR}/../.." && pwd)"
|
||||
CASES_DIR="${SCRIPT_DIR}/cases"
|
||||
TIMESTAMP="${TIMESTAMP:-$(date +%Y%m%d-%H%M%S)}"
|
||||
OUT_DIR="${OUT_DIR:-${ROOT_DIR}/.context/skills-e2e-runs/${TIMESTAMP}}"
|
||||
RESULTS_DIR="${OUT_DIR}/results"
|
||||
MANIFEST="${OUT_DIR}/manifest.tsv"
|
||||
|
||||
# Required environment for agent-driven E2E.
|
||||
SMC_DATA_DIR="${SMC_DATA_DIR:-$HOME/.super-multica-e2e}"
|
||||
MULTICA_API_URL="${MULTICA_API_URL:-https://api-dev.copilothub.ai}"
|
||||
PROVIDERS_RAW="${PROVIDERS:-kimi-coding}"
|
||||
CASE_GLOB="${CASE_GLOB:-case-*.txt}"
|
||||
CASE_TIMEOUT_SEC="${CASE_TIMEOUT_SEC:-1200}"
|
||||
MAX_PARALLEL="${MAX_PARALLEL:-1}"
|
||||
TIMEOUT_ENABLED="true"
|
||||
if [[ "${CASE_TIMEOUT_SEC}" =~ ^[0-9]+$ ]] && (( CASE_TIMEOUT_SEC <= 0 )); then
|
||||
TIMEOUT_ENABLED="false"
|
||||
fi
|
||||
|
||||
if ! [[ "${MAX_PARALLEL}" =~ ^[0-9]+$ ]] || (( MAX_PARALLEL <= 0 )); then
|
||||
echo "MAX_PARALLEL must be a positive integer, got: ${MAX_PARALLEL}" >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
if [[ "${1:-}" == "--worker" ]]; then
|
||||
provider="${2:?missing provider}"
|
||||
case_file="${3:?missing case file}"
|
||||
case_base="$(basename "${case_file}")"
|
||||
case_id="${case_base%.txt}"
|
||||
log_file="${OUT_DIR}/${provider}-${case_id}.log"
|
||||
result_file="${RESULTS_DIR}/${provider}-${case_id}.tsv"
|
||||
|
||||
prompt="$(cat "${case_file}")"
|
||||
|
||||
status="success"
|
||||
timed_out="false"
|
||||
started_epoch="$(date +%s)"
|
||||
started_at="$(date -u +%Y-%m-%dT%H:%M:%SZ)"
|
||||
|
||||
SMC_DATA_DIR="${SMC_DATA_DIR}" \
|
||||
MULTICA_API_URL="${MULTICA_API_URL}" \
|
||||
pnpm multica run --run-log --provider "${provider}" "${prompt}" > "${log_file}" 2>&1 &
|
||||
cmd_pid=$!
|
||||
|
||||
while kill -0 "${cmd_pid}" 2>/dev/null; do
|
||||
if [[ "${TIMEOUT_ENABLED}" == "true" ]]; then
|
||||
now="$(date +%s)"
|
||||
elapsed="$((now - started_epoch))"
|
||||
if (( elapsed >= CASE_TIMEOUT_SEC )); then
|
||||
timed_out="true"
|
||||
kill "${cmd_pid}" 2>/dev/null || true
|
||||
sleep 1
|
||||
kill -9 "${cmd_pid}" 2>/dev/null || true
|
||||
break
|
||||
fi
|
||||
fi
|
||||
sleep 2
|
||||
done
|
||||
|
||||
exit_code=0
|
||||
wait "${cmd_pid}" 2>/dev/null || exit_code=$?
|
||||
ended_epoch="$(date +%s)"
|
||||
ended_at="$(date -u +%Y-%m-%dT%H:%M:%SZ)"
|
||||
duration_sec="$((ended_epoch - started_epoch))"
|
||||
|
||||
if [[ "${timed_out}" == "true" ]]; then
|
||||
status="timeout"
|
||||
printf "\n[runner] timed out after %ss\n" "${CASE_TIMEOUT_SEC}" >> "${log_file}"
|
||||
elif (( exit_code != 0 )); then
|
||||
status="failed"
|
||||
elif [[ ! -s "${log_file}" ]]; then
|
||||
status="failed"
|
||||
elif ! rg -q "\[session: " "${log_file}"; then
|
||||
status="failed"
|
||||
fi
|
||||
|
||||
session_id="$(rg -o "\[session: [^]]+\]" "${log_file}" | tail -n 1 | sed -E 's/\[session: ([^]]+)\]/\1/' || true)"
|
||||
session_dir="$(rg -o "\[session-dir: [^]]+\]" "${log_file}" | tail -n 1 | sed -E 's/\[session-dir: ([^]]+)\]/\1/' || true)"
|
||||
|
||||
printf "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" \
|
||||
"${TIMESTAMP}" \
|
||||
"${provider}" \
|
||||
"${case_id}" \
|
||||
"${status}" \
|
||||
"${session_id}" \
|
||||
"${session_dir}" \
|
||||
"${log_file}" \
|
||||
"${started_at}" \
|
||||
"${ended_at}" \
|
||||
"${duration_sec}" \
|
||||
"${exit_code}" > "${result_file}"
|
||||
|
||||
printf "[worker] provider=%s case=%s status=%s duration=%ss session=%s\n" \
|
||||
"${provider}" \
|
||||
"${case_id}" \
|
||||
"${status}" \
|
||||
"${duration_sec}" \
|
||||
"${session_id:-N/A}"
|
||||
exit 0
|
||||
fi
|
||||
|
||||
mkdir -p "${OUT_DIR}"
|
||||
mkdir -p "${RESULTS_DIR}"
|
||||
printf "timestamp\tprovider\tcase_id\tstatus\tsession_id\tsession_dir\tlog_file\tstarted_at\tended_at\tduration_sec\texit_code\n" > "${MANIFEST}"
|
||||
|
||||
read -r -a PROVIDERS <<< "${PROVIDERS_RAW}"
|
||||
|
||||
CASE_FILES=()
|
||||
while IFS= read -r line; do
|
||||
CASE_FILES+=("${line}")
|
||||
done < <(find "${CASES_DIR}" -maxdepth 1 -type f -name "${CASE_GLOB}" | sort)
|
||||
|
||||
if [[ ${#CASE_FILES[@]} -eq 0 ]]; then
|
||||
echo "No case files matched ${CASE_GLOB} in ${CASES_DIR}" >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
echo "Output directory: ${OUT_DIR}"
|
||||
echo "Using SMC_DATA_DIR=${SMC_DATA_DIR}"
|
||||
echo "Using MULTICA_API_URL=${MULTICA_API_URL}"
|
||||
echo "Providers: ${PROVIDERS[*]}"
|
||||
echo "Cases: ${#CASE_FILES[@]}"
|
||||
echo "Max parallel: ${MAX_PARALLEL}"
|
||||
if [[ "${TIMEOUT_ENABLED}" == "true" ]]; then
|
||||
echo "Case timeout: ${CASE_TIMEOUT_SEC}s"
|
||||
else
|
||||
echo "Case timeout: disabled"
|
||||
fi
|
||||
|
||||
TASKS=()
|
||||
for provider in "${PROVIDERS[@]}"; do
|
||||
for case_file in "${CASE_FILES[@]}"; do
|
||||
TASKS+=("${provider}" "${case_file}")
|
||||
done
|
||||
done
|
||||
|
||||
echo "Total tasks: $(( ${#TASKS[@]} / 2 ))"
|
||||
|
||||
export TIMESTAMP OUT_DIR RESULTS_DIR SMC_DATA_DIR MULTICA_API_URL CASE_TIMEOUT_SEC TIMEOUT_ENABLED
|
||||
printf '%s\0' "${TASKS[@]}" | xargs -0 -n 2 -P "${MAX_PARALLEL}" bash "${BASH_SOURCE[0]}" --worker
|
||||
|
||||
RESULT_FILES=()
|
||||
while IFS= read -r line; do
|
||||
RESULT_FILES+=("${line}")
|
||||
done < <(find "${RESULTS_DIR}" -maxdepth 1 -type f -name "*.tsv" | sort)
|
||||
|
||||
if [[ ${#RESULT_FILES[@]} -eq 0 ]]; then
|
||||
echo "No result files produced in ${RESULTS_DIR}" >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
for result_file in "${RESULT_FILES[@]}"; do
|
||||
cat "${result_file}" >> "${MANIFEST}"
|
||||
done
|
||||
|
||||
success_count="$(awk -F '\t' 'NR>1 && $4=="success" {c++} END{print c+0}' "${MANIFEST}")"
|
||||
failed_count="$(awk -F '\t' 'NR>1 && $4=="failed" {c++} END{print c+0}' "${MANIFEST}")"
|
||||
timeout_count="$(awk -F '\t' 'NR>1 && $4=="timeout" {c++} END{print c+0}' "${MANIFEST}")"
|
||||
|
||||
echo
|
||||
echo "Completed run stage. Manifest: ${MANIFEST}"
|
||||
echo "Run summary: success=${success_count} failed=${failed_count} timeout=${timeout_count}"
|
||||
|
||||
echo
|
||||
echo "Running structured analysis..."
|
||||
node "${SCRIPT_DIR}/analyze.mjs" "${MANIFEST}" | tee "${OUT_DIR}/analysis.txt"
|
||||
134
skills/meta-skill-installer/SKILL.md
Normal file
134
skills/meta-skill-installer/SKILL.md
Normal file
|
|
@ -0,0 +1,134 @@
|
|||
---
|
||||
name: Meta Skill Installer
|
||||
description: Detect missing capabilities, search clawhub.ai for matching skills, run security review on candidate skills, and install safe skills into Multica. Use when a task cannot be completed with current skills/tools or when the user asks to discover/install/update skills from ClawHub.
|
||||
version: 1.0.0
|
||||
metadata:
|
||||
tags:
|
||||
- meta
|
||||
- skills
|
||||
- clawhub
|
||||
- security
|
||||
install:
|
||||
- id: node-clawhub
|
||||
kind: node
|
||||
package: clawhub
|
||||
bins: [clawhub]
|
||||
label: "Install ClawHub CLI"
|
||||
userInvocable: true
|
||||
disableModelInvocation: false
|
||||
---
|
||||
|
||||
# Meta Skill Installer
|
||||
|
||||
Use this skill to close capability gaps by discovering and installing skills from ClawHub with a mandatory security gate.
|
||||
|
||||
## Safety Defaults
|
||||
|
||||
- Always run in this order: identify gap -> search -> stage install -> security review -> install to managed dir -> validate.
|
||||
- Never install directly into the active skills directory before review.
|
||||
- If risk is `dangerous`, stop and explain why.
|
||||
- If risk is `needs-review`, ask for explicit user confirmation before final install.
|
||||
|
||||
## Resolve Paths and Commands
|
||||
|
||||
Use Multica managed skills path, not the current workspace:
|
||||
|
||||
```bash
|
||||
DATA_DIR="${SMC_DATA_DIR:-$HOME/.super-multica}"
|
||||
SKILLS_DIR="$DATA_DIR/skills"
|
||||
META_SKILL_DIR="$SKILLS_DIR/meta-skill-installer"
|
||||
|
||||
if command -v clawhub >/dev/null 2>&1; then
|
||||
CLAWHUB_CMD=(clawhub)
|
||||
else
|
||||
CLAWHUB_CMD=(npx -y clawhub)
|
||||
fi
|
||||
```
|
||||
|
||||
If neither command path works, install the CLI first (`npm i -g clawhub`) and retry.
|
||||
|
||||
## Workflow
|
||||
|
||||
### 1) Detect the Capability Gap
|
||||
|
||||
When the current task cannot be completed with existing skills/tools:
|
||||
|
||||
- Summarize the missing capability in one sentence.
|
||||
- Convert it to a focused search query (tool + domain + action).
|
||||
- Keep the original user intent and success criteria.
|
||||
|
||||
### 2) Search ClawHub
|
||||
|
||||
Run one or more searches and collect top candidates:
|
||||
|
||||
```bash
|
||||
"${CLAWHUB_CMD[@]}" search "<query>" --limit 10
|
||||
```
|
||||
|
||||
Candidate ranking rules:
|
||||
|
||||
- Primary: semantic relevance to the missing capability.
|
||||
- Secondary: clearer SKILL description and narrower scope.
|
||||
- Tertiary: lower operational risk (fewer privileged or remote-exec patterns).
|
||||
|
||||
### 3) Stage Install in Quarantine Directory
|
||||
|
||||
Install candidate skill into a temporary workdir first:
|
||||
|
||||
```bash
|
||||
STAGING_DIR="$(mktemp -d "${TMPDIR:-/tmp}/multica-skill-review.XXXXXX")"
|
||||
"${CLAWHUB_CMD[@]}" install "<slug>" --workdir "$STAGING_DIR" --dir skills --version "<optional-version>" --force
|
||||
```
|
||||
|
||||
Expected staged path:
|
||||
|
||||
```bash
|
||||
"$STAGING_DIR/skills/<slug>"
|
||||
```
|
||||
|
||||
### 4) Run Security Review
|
||||
|
||||
Use this skill's scanner script against the staged skill:
|
||||
|
||||
```bash
|
||||
node "$META_SKILL_DIR/scripts/review-skill-security.mjs" "$STAGING_DIR/skills/<slug>"
|
||||
```
|
||||
|
||||
Interpret scanner output:
|
||||
|
||||
- `riskLevel: safe` -> continue to install.
|
||||
- `riskLevel: needs-review` -> present findings, ask user for explicit confirmation.
|
||||
- `riskLevel: dangerous` -> block install by default.
|
||||
|
||||
### 5) Install to Multica Managed Skills Directory
|
||||
|
||||
Only after passing the review gate, install to the directory Multica actually loads:
|
||||
|
||||
```bash
|
||||
mkdir -p "$SKILLS_DIR"
|
||||
"${CLAWHUB_CMD[@]}" install "<slug>" --workdir "$DATA_DIR" --dir skills --version "<optional-version>" --force
|
||||
```
|
||||
|
||||
If skill already exists, use update:
|
||||
|
||||
```bash
|
||||
"${CLAWHUB_CMD[@]}" update "<slug>" --workdir "$DATA_DIR" --dir skills --version "<optional-version>" --force
|
||||
```
|
||||
|
||||
### 6) Post-Install Validation
|
||||
|
||||
Validate presence and scan once more in the final location:
|
||||
|
||||
```bash
|
||||
test -f "$SKILLS_DIR/<slug>/SKILL.md"
|
||||
node "$META_SKILL_DIR/scripts/review-skill-security.mjs" "$SKILLS_DIR/<slug>"
|
||||
```
|
||||
|
||||
Then retry the original user task with the new skill.
|
||||
|
||||
## Guardrails
|
||||
|
||||
- Never claim installation success without path-level verification.
|
||||
- Never hide security findings; summarize concrete files and reasons.
|
||||
- Prefer pinned versions when available, and report the installed version to the user.
|
||||
- If the chosen skill requires secrets/API keys, pause after install and ask user to configure required env vars before using it.
|
||||
328
skills/meta-skill-installer/scripts/review-skill-security.mjs
Normal file
328
skills/meta-skill-installer/scripts/review-skill-security.mjs
Normal file
|
|
@ -0,0 +1,328 @@
|
|||
#!/usr/bin/env node
|
||||
|
||||
import { existsSync, lstatSync, readdirSync, readFileSync } from "node:fs";
|
||||
import { basename, extname, join, relative, resolve } from "node:path";
|
||||
|
||||
const args = process.argv.slice(2);
|
||||
if (args.length !== 1 || args[0] === "--help" || args[0] === "-h") {
|
||||
console.error("Usage: node review-skill-security.mjs <skill-directory>");
|
||||
process.exit(1);
|
||||
}
|
||||
|
||||
const targetDir = resolve(args[0]);
|
||||
if (!existsSync(targetDir)) {
|
||||
console.error(JSON.stringify({
|
||||
targetDir,
|
||||
riskLevel: "dangerous",
|
||||
error: "Target directory does not exist",
|
||||
}, null, 2));
|
||||
process.exit(1);
|
||||
}
|
||||
|
||||
/** Maximum file size to inspect as text (2 MB). */
|
||||
const MAX_TEXT_FILE_BYTES = 2_000_000;
|
||||
/** Maximum findings returned to avoid huge output. */
|
||||
const MAX_FINDINGS = 200;
|
||||
|
||||
const SKIP_DIRS = new Set([
|
||||
".git",
|
||||
".hg",
|
||||
".svn",
|
||||
"node_modules",
|
||||
"dist",
|
||||
"build",
|
||||
".next",
|
||||
".turbo",
|
||||
".cache",
|
||||
]);
|
||||
|
||||
const TEXT_EXTENSIONS = new Set([
|
||||
".md",
|
||||
".txt",
|
||||
".json",
|
||||
".yaml",
|
||||
".yml",
|
||||
".toml",
|
||||
".ini",
|
||||
".cfg",
|
||||
".conf",
|
||||
".env",
|
||||
".sh",
|
||||
".bash",
|
||||
".zsh",
|
||||
".fish",
|
||||
".ps1",
|
||||
".js",
|
||||
".mjs",
|
||||
".cjs",
|
||||
".ts",
|
||||
".tsx",
|
||||
".jsx",
|
||||
".py",
|
||||
".rb",
|
||||
".go",
|
||||
".rs",
|
||||
".java",
|
||||
".kt",
|
||||
".swift",
|
||||
".php",
|
||||
".lua",
|
||||
".sql",
|
||||
".xml",
|
||||
".html",
|
||||
".css",
|
||||
]);
|
||||
|
||||
/**
|
||||
* @typedef {"safe" | "needs-review" | "dangerous"} RiskLevel
|
||||
*/
|
||||
|
||||
/**
|
||||
* @typedef {{
|
||||
* severity: Exclude<RiskLevel, "safe">;
|
||||
* type: string;
|
||||
* file: string;
|
||||
* line?: number;
|
||||
* message: string;
|
||||
* snippet?: string;
|
||||
* }} Finding
|
||||
*/
|
||||
|
||||
const LINE_PATTERNS = [
|
||||
{
|
||||
type: "network-pipe-shell",
|
||||
severity: "dangerous",
|
||||
regex: /\b(?:curl|wget)\b[^\n|]*\|\s*(?:ba|z)?sh\b/i,
|
||||
message: "Network content piped directly into shell.",
|
||||
},
|
||||
{
|
||||
type: "powershell-iex-download",
|
||||
severity: "dangerous",
|
||||
regex: /\b(?:invoke-webrequest|iwr)\b[^\n|]*\|\s*iex\b/i,
|
||||
message: "Downloaded content executed via PowerShell IEX.",
|
||||
},
|
||||
{
|
||||
type: "destructive-rm-root",
|
||||
severity: "dangerous",
|
||||
regex: /(?:^|[\s;])(?:sudo\s+)?rm\s+-rf\s+(?:\/(?:\s|$)|~(?:\/|\s|$))/i,
|
||||
message: "Potentially destructive recursive delete at root/home scope.",
|
||||
},
|
||||
{
|
||||
type: "device-overwrite",
|
||||
severity: "dangerous",
|
||||
regex: /\bdd\s+if=.*\s+of=\/dev\/(?:sd[a-z]\d*|nvme\d+n\d+(?:p\d+)?|disk\d+)/i,
|
||||
message: "Possible block-device overwrite command.",
|
||||
},
|
||||
{
|
||||
type: "reverse-shell",
|
||||
severity: "dangerous",
|
||||
regex: /\/dev\/tcp\/|nc\s+-e\s+|bash\s+-i\b.*\/dev\/tcp\//i,
|
||||
message: "Potential reverse-shell behavior.",
|
||||
},
|
||||
{
|
||||
type: "sudo-usage",
|
||||
severity: "needs-review",
|
||||
regex: /(^|[\s;])sudo\s+/i,
|
||||
message: "Uses privileged command execution (sudo).",
|
||||
},
|
||||
{
|
||||
type: "remote-download",
|
||||
severity: "needs-review",
|
||||
regex: /\b(?:curl|wget|invoke-webrequest|iwr)\b.*https?:\/\//i,
|
||||
message: "Downloads remote content. Verify source integrity and intent.",
|
||||
},
|
||||
{
|
||||
type: "dynamic-exec-js",
|
||||
severity: "needs-review",
|
||||
regex: /\bchild_process\.(?:exec|spawn|execSync|spawnSync)\b|\beval\s*\(/i,
|
||||
message: "Dynamic execution primitive found in JavaScript/TypeScript.",
|
||||
},
|
||||
{
|
||||
type: "python-shell-exec",
|
||||
severity: "needs-review",
|
||||
regex: /\bos\.system\s*\(|\bsubprocess\.(?:run|Popen|call)\s*\(.*shell\s*=\s*True/i,
|
||||
message: "Shell execution primitive found in Python.",
|
||||
},
|
||||
{
|
||||
type: "secret-env-access",
|
||||
severity: "needs-review",
|
||||
regex: /process\.env\.[A-Z0-9_]*(?:KEY|TOKEN|SECRET|PASSWORD)|\$\{?[A-Z0-9_]*(?:KEY|TOKEN|SECRET|PASSWORD)\}?/i,
|
||||
message: "Reads variables that may contain credentials/secrets.",
|
||||
},
|
||||
];
|
||||
|
||||
/**
|
||||
* @param {string} value
|
||||
* @returns {string}
|
||||
*/
|
||||
function compactSnippet(value) {
|
||||
return value.replace(/\s+/g, " ").trim().slice(0, 200);
|
||||
}
|
||||
|
||||
/**
|
||||
* @param {string} filePath
|
||||
* @returns {boolean}
|
||||
*/
|
||||
function shouldReadAsText(filePath) {
|
||||
const base = basename(filePath).toLowerCase();
|
||||
if (base === "skill.md") return true;
|
||||
return TEXT_EXTENSIONS.has(extname(filePath).toLowerCase());
|
||||
}
|
||||
|
||||
/**
|
||||
* @param {string} filePath
|
||||
* @returns {string | null}
|
||||
*/
|
||||
function readTextFile(filePath) {
|
||||
const buf = readFileSync(filePath);
|
||||
if (buf.includes(0)) return null;
|
||||
return buf.toString("utf-8");
|
||||
}
|
||||
|
||||
/** @type {Finding[]} */
|
||||
const findings = [];
|
||||
let scannedFiles = 0;
|
||||
let skippedLargeFiles = 0;
|
||||
let skippedBinaryFiles = 0;
|
||||
let symlinkCount = 0;
|
||||
|
||||
/**
|
||||
* @param {Finding} finding
|
||||
*/
|
||||
function addFinding(finding) {
|
||||
if (findings.length >= MAX_FINDINGS) return;
|
||||
findings.push(finding);
|
||||
}
|
||||
|
||||
/**
|
||||
* @param {string} currentDir
|
||||
*/
|
||||
function walk(currentDir) {
|
||||
const entries = readdirSync(currentDir, { withFileTypes: true });
|
||||
|
||||
for (const entry of entries) {
|
||||
const fullPath = join(currentDir, entry.name);
|
||||
const relPath = relative(targetDir, fullPath) || ".";
|
||||
|
||||
let stat;
|
||||
try {
|
||||
stat = lstatSync(fullPath);
|
||||
} catch {
|
||||
addFinding({
|
||||
severity: "needs-review",
|
||||
type: "stat-error",
|
||||
file: relPath,
|
||||
message: "Could not stat path. Manual inspection recommended.",
|
||||
});
|
||||
continue;
|
||||
}
|
||||
|
||||
if (stat.isSymbolicLink()) {
|
||||
symlinkCount++;
|
||||
addFinding({
|
||||
severity: "dangerous",
|
||||
type: "symlink",
|
||||
file: relPath,
|
||||
message: "Symbolic links can hide path traversal or redirection behavior.",
|
||||
});
|
||||
continue;
|
||||
}
|
||||
|
||||
if (stat.isDirectory()) {
|
||||
if (SKIP_DIRS.has(entry.name)) continue;
|
||||
walk(fullPath);
|
||||
continue;
|
||||
}
|
||||
|
||||
if (!stat.isFile()) continue;
|
||||
scannedFiles++;
|
||||
|
||||
if (stat.size > MAX_TEXT_FILE_BYTES) {
|
||||
skippedLargeFiles++;
|
||||
addFinding({
|
||||
severity: "needs-review",
|
||||
type: "large-file",
|
||||
file: relPath,
|
||||
message: `Large file (${stat.size} bytes) was not fully scanned.`,
|
||||
});
|
||||
continue;
|
||||
}
|
||||
|
||||
if (!shouldReadAsText(fullPath)) continue;
|
||||
|
||||
let content;
|
||||
try {
|
||||
content = readTextFile(fullPath);
|
||||
} catch {
|
||||
addFinding({
|
||||
severity: "needs-review",
|
||||
type: "read-error",
|
||||
file: relPath,
|
||||
message: "Failed to read file during scan.",
|
||||
});
|
||||
continue;
|
||||
}
|
||||
|
||||
if (content === null) {
|
||||
skippedBinaryFiles++;
|
||||
continue;
|
||||
}
|
||||
|
||||
const lines = content.split(/\r?\n/);
|
||||
for (let i = 0; i < lines.length; i++) {
|
||||
const line = lines[i] ?? "";
|
||||
if (!line) continue;
|
||||
for (const pattern of LINE_PATTERNS) {
|
||||
if (!pattern.regex.test(line)) continue;
|
||||
addFinding({
|
||||
severity: pattern.severity,
|
||||
type: pattern.type,
|
||||
file: relPath,
|
||||
line: i + 1,
|
||||
message: pattern.message,
|
||||
snippet: compactSnippet(line),
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
walk(targetDir);
|
||||
|
||||
if (!existsSync(join(targetDir, "SKILL.md"))) {
|
||||
addFinding({
|
||||
severity: "dangerous",
|
||||
type: "missing-skill-md",
|
||||
file: ".",
|
||||
message: "SKILL.md not found at skill root.",
|
||||
});
|
||||
}
|
||||
|
||||
const dangerousCount = findings.filter((f) => f.severity === "dangerous").length;
|
||||
const reviewCount = findings.filter((f) => f.severity === "needs-review").length;
|
||||
|
||||
/** @type {RiskLevel} */
|
||||
let riskLevel = "safe";
|
||||
if (dangerousCount > 0) {
|
||||
riskLevel = "dangerous";
|
||||
} else if (reviewCount > 0) {
|
||||
riskLevel = "needs-review";
|
||||
}
|
||||
|
||||
const output = {
|
||||
targetDir,
|
||||
riskLevel,
|
||||
summary: {
|
||||
scannedFiles,
|
||||
symlinkCount,
|
||||
skippedLargeFiles,
|
||||
skippedBinaryFiles,
|
||||
dangerousFindings: dangerousCount,
|
||||
reviewFindings: reviewCount,
|
||||
totalFindings: findings.length,
|
||||
findingsTruncated: findings.length >= MAX_FINDINGS,
|
||||
},
|
||||
findings,
|
||||
};
|
||||
|
||||
console.log(JSON.stringify(output, null, 2));
|
||||
Loading…
Add table
Add a link
Reference in a new issue