Merge pull request #216 from multica-ai/codex/meta-skill-installer-e2e-skills-benchmark

feat(skills): add ClawHub meta installer and agent-driven E2E benchmark
This commit is contained in:
Jiayuan Zhang 2026-02-17 02:45:45 +08:00 committed by GitHub
commit e28ecb9a91
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
17 changed files with 1781 additions and 8 deletions

View file

@ -0,0 +1,98 @@
# Skills Agent-Driven E2E Benchmark
This benchmark validates the meta skill workflow for capability-gap discovery, ClawHub installation, and security-gated rollout.
## Scope
- Domain: skill discovery + installation + update
- Focus: `skills/meta-skill-installer`
- Providers: default `kimi-coding` (override with `PROVIDERS`)
- Cases: 5
Case prompts are stored in:
- `scripts/e2e-skills-benchmark/cases/`
## Real ClawHub Examples Used
The case set references real public pages from ClawHub:
- [CalDAV Calendar](https://clawhub.ai/skills/caldav-calendar)
- [Home Assistant](https://clawhub.ai/skills/homeassistant)
- [CodexMonitor](https://clawhub.ai/odrobnik/codexmonitor)
- [Spotify (gap-discovery UX flow)](https://clawhub.ai/search?q=spotify)
- [Notion (gap-discovery UX flow)](https://clawhub.ai/search?q=notion)
## Prerequisites
1. Credentials configured (`pnpm multica credentials init` if needed)
2. Dependencies installed in repo (`pnpm install`)
3. `clawhub` CLI available, or allow runtime fallback to `npx -y clawhub`
4. Required env:
```bash
export SMC_DATA_DIR=~/.super-multica-e2e
export MULTICA_API_URL=https://api-dev.copilothub.ai
```
## Run Benchmark
```bash
scripts/e2e-skills-benchmark/run.sh
```
Defaults:
- Providers: `kimi-coding`
- Case glob: `case-*.txt`
- Max parallel workers: `1`
- Per-case timeout: `1200s` (`CASE_TIMEOUT_SEC=0` to disable)
- Output directory: `.context/skills-e2e-runs/<timestamp>/`
Generated artifacts:
- `manifest.tsv`: provider/case/status/session/log metadata
- `analysis.txt`: human-readable pass/fail report
- `analysis.json`: structured detailed check output
## Run Subset
Only one case:
```bash
CASE_GLOB="case-01-*.txt" scripts/e2e-skills-benchmark/run.sh
```
Multiple providers:
```bash
PROVIDERS="kimi-coding claude-code" scripts/e2e-skills-benchmark/run.sh
```
Higher parallelism (two workers) with a longer per-case timeout:
```bash
MAX_PARALLEL=2 CASE_TIMEOUT_SEC=1800 scripts/e2e-skills-benchmark/run.sh
```
## Analyzer Checks
For each run:
1. `run_start` and `run_end` both present
2. `run_end.error` is empty/null
3. `tool_start` and `tool_end` are paired
4. no `tool_end.is_error=true`
5. at least one `exec` tool call exists
6. case-specific command evidence in `tool_start.args`:
- `clawhub search`
- `clawhub install`
- `review-skill-security.mjs`
- for case 03 also `clawhub update`
- for case 04, prompt is a natural user request only; agent must self-discover capability gap, propose ClawHub + security review + install confirmation, and must not run workaround commands (`osascript`, `ha.sh`, `spogo`, `spotify_player`) before user confirmation
- for case 05, prompt is a natural Notion request; agent must discover missing capability, search skill candidates, trigger `install_guard` (blocked until confirmation), and ask for explicit install consent plus token/auth prerequisites
## Notes
- These are agent-driven tests; prompt intent plus run-log evidence are both evaluated.
- `SMC_DATA_DIR=~/.super-multica-e2e` avoids polluting normal user skill/session data.
- If a case fails, open `manifest.tsv` and inspect the matching `session_dir/run-log.jsonl`.

View file

@ -30,7 +30,8 @@
"typecheck": "turbo typecheck",
"test": "vitest run",
"test:watch": "vitest",
"test:coverage": "vitest run --coverage"
"test:coverage": "vitest run --coverage",
"e2e:skills": "bash scripts/e2e-skills-benchmark/run.sh"
},
"keywords": [],
"author": "",

View file

@ -0,0 +1,171 @@
import { describe, expect, it } from "vitest";
import {
evaluateCustomSkillAuthoringConsent,
evaluateWorkaroundConsent,
evaluateSkillInstallConsent,
isEnvironmentInstallCommand,
isLocalSkillMutationCommand,
isMutatingClawhubCommand,
isThirdPartyWorkaroundCommand,
} from "./runner.js";
// Detection of ClawHub invocations that install or update a skill.
describe("isMutatingClawhubCommand", () => {
  it("detects clawhub install command", () => {
    const installCommand = "npx -y clawhub install spotify --workdir /tmp --dir skills";
    expect(isMutatingClawhubCommand(installCommand)).toBe(true);
  });

  it("detects clawhub update command", () => {
    const updateCommand = "clawhub update spotify --force";
    expect(isMutatingClawhubCommand(updateCommand)).toBe(true);
  });

  it("does not match non-mutating clawhub commands", () => {
    const readOnlyCommands = ["clawhub search spotify --limit 10", "clawhub inspect spotify"];
    for (const readOnly of readOnlyCommands) {
      expect(isMutatingClawhubCommand(readOnly)).toBe(false);
    }
  });

  it("detects wrapped bash flow that expands CLAWHUB_CMD and runs install", () => {
    // The binary name and the `install` verb sit on different lines of the script.
    const scriptLines = [
      "cd /tmp/meta-skill-installer && bash -c '",
      "if command -v clawhub >/dev/null 2>&1; then",
      " CLAWHUB_CMD=(clawhub)",
      "else",
      " CLAWHUB_CMD=(npx -y clawhub)",
      "fi",
      "\"${CLAWHUB_CMD[@]}\" install \"spotify\" --workdir \"$DATA_DIR\" --dir skills --force",
      "'",
    ];
    const wrappedCommand = scriptLines.join("\n");
    expect(isMutatingClawhubCommand(wrappedCommand)).toBe(true);
  });
});
// Consent evaluation for skill install/update, with and without a pending confirmation.
describe("evaluateSkillInstallConsent", () => {
  const granted = { allowInstall: true, declined: false };

  it("does not grant consent for generic capability requests", () => {
    expect(evaluateSkillInstallConsent("随机播放 spotify 中的音乐", false)).toEqual({
      allowInstall: false,
      declined: false,
    });
  });

  it("grants consent for explicit install requests", () => {
    expect(evaluateSkillInstallConsent("请帮我安装 spotify skill", false)).toEqual(granted);
  });

  it("grants consent for short affirmative replies when awaiting confirmation", () => {
    expect(evaluateSkillInstallConsent("继续", true)).toEqual(granted);
  });

  it("treats standalone Chinese affirmative as consent when awaiting confirmation", () => {
    expect(evaluateSkillInstallConsent("行", true)).toEqual(granted);
  });

  it("marks declines explicitly", () => {
    expect(evaluateSkillInstallConsent("不要安装,先别动", true)).toEqual({
      allowInstall: false,
      declined: true,
    });
  });
});
// Detection of package-manager commands that change the runtime environment.
describe("isEnvironmentInstallCommand", () => {
  it("detects package manager install commands", () => {
    const installCommands = [
      "brew install spogo",
      "pnpm add lodash",
      "npm install -g clawhub",
      "pip install requests",
    ];
    for (const installCommand of installCommands) {
      expect(isEnvironmentInstallCommand(installCommand)).toBe(true);
    }
  });

  it("does not match read-only package manager commands", () => {
    const readOnlyCommands = ["brew list", "pnpm list --depth 0", "npm view clawhub"];
    for (const readOnly of readOnlyCommands) {
      expect(isEnvironmentInstallCommand(readOnly)).toBe(false);
    }
  });
});
// Detection of local CLI/script workarounds for third-party services.
describe("isThirdPartyWorkaroundCommand", () => {
  it("detects local workaround commands", () => {
    const workaroundCommands = [
      "spotify_player playback shuffle",
      "spogo status",
      "osascript -e 'tell app \"Spotify\" to play'",
      "curl http://localhost:8123/api/states",
    ];
    for (const workaround of workaroundCommands) {
      expect(isThirdPartyWorkaroundCommand(workaround)).toBe(true);
    }
  });

  it("does not match unrelated commands", () => {
    for (const unrelated of ["ls -la", "pnpm test"]) {
      expect(isThirdPartyWorkaroundCommand(unrelated)).toBe(false);
    }
  });
});
// Consent evaluation for running local workaround commands.
describe("evaluateWorkaroundConsent", () => {
  const optedIn = { allowWorkaround: true, declined: false };

  it("does not grant workaround mode for generic capability requests", () => {
    expect(evaluateWorkaroundConsent("随机播放 spotify 中的音乐", false)).toEqual({
      allowWorkaround: false,
      declined: false,
    });
  });

  it("grants workaround mode for explicit local-command intent", () => {
    expect(evaluateWorkaroundConsent("不要安装 skill直接用本地命令试试", false)).toEqual(optedIn);
  });

  it("grants workaround mode for short affirmative replies when awaiting confirmation", () => {
    expect(evaluateWorkaroundConsent("继续", true)).toEqual(optedIn);
  });

  it("treats standalone Chinese affirmative as workaround consent when awaiting confirmation", () => {
    expect(evaluateWorkaroundConsent("行", true)).toEqual(optedIn);
  });

  it("marks declines when no workaround intent is present", () => {
    expect(evaluateWorkaroundConsent("不要,先别执行", true)).toEqual({
      allowWorkaround: false,
      declined: true,
    });
  });
});
// Detection of hand-written changes to the local skills directory.
describe("isLocalSkillMutationCommand", () => {
  it("detects direct local skill mutation commands", () => {
    const scaffoldCommand =
      "mkdir -p ~/.super-multica/skills/notion-integration && touch ~/.super-multica/skills/notion-integration/SKILL.md";
    const heredocWrite =
      "cat > ~/.super-multica/skills/notion-integration/SKILL.md << 'EOF'\n# skill\nEOF";

    expect(isLocalSkillMutationCommand(scaffoldCommand)).toBe(true);
    expect(isLocalSkillMutationCommand(heredocWrite)).toBe(true);
  });

  it("does not match read-only commands or clawhub install flow", () => {
    const readSkillFile = "cat ~/.super-multica/skills/notion/SKILL.md";
    const clawhubInstall =
      "npx -y clawhub install notion --workdir ~/.super-multica --dir skills --force";

    expect(isLocalSkillMutationCommand(readSkillFile)).toBe(false);
    expect(isLocalSkillMutationCommand(clawhubInstall)).toBe(false);
  });
});
// Consent evaluation for hand-authoring a custom skill.
describe("evaluateCustomSkillAuthoringConsent", () => {
  const granted = { allowAuthoring: true, declined: false };

  it("does not grant consent for generic third-party requests", () => {
    expect(evaluateCustomSkillAuthoringConsent("帮我在 Notion 新建一个页面", false)).toEqual({
      allowAuthoring: false,
      declined: false,
    });
  });

  it("grants consent when user explicitly asks to create a custom skill", () => {
    expect(evaluateCustomSkillAuthoringConsent("请帮我创建一个 Notion skill", false)).toEqual(granted);
  });

  it("grants consent for short affirmatives when awaiting confirmation", () => {
    expect(evaluateCustomSkillAuthoringConsent("继续", true)).toEqual(granted);
  });

  it("marks declines explicitly", () => {
    expect(evaluateCustomSkillAuthoringConsent("先别创建技能", true)).toEqual({
      allowAuthoring: false,
      declined: true,
    });
  });
});

View file

@ -57,6 +57,7 @@ import {
import { isContextOverflowError } from "./errors.js";
import { resolveWorkspaceDir, ensureWorkspaceDir } from "./workspace.js";
import { createRunLog, type RunLog } from "./run-log.js";
import type { ExecApprovalCallback } from "./tools/exec-approval-types.js";
// ============================================================
// Error classification for auth profile rotation
@ -90,6 +91,153 @@ export function isRotatableError(reason: AuthProfileFailureReason): boolean {
return reason === "auth" || reason === "rate_limit" || reason === "billing" || reason === "timeout";
}
// ── Skill install consent guard ─────────────────────────────────────────────
// All patterns below are case-insensitive and applied with `.test()` to either a
// shell command (command guards) or the user's prompt text (consent heuristics).

// Command guard: the word `clawhub` followed anywhere later (newlines included)
// by the word `install` or `update`. Deliberately loose so wrapped scripts that
// place the verb on a different line still match.
const CLAWHUB_MUTATION_RE = /\bclawhub\b[\s\S]*\b(?:install|update)\b/i;
// Command guard: install-style subcommands of system package managers
// (brew/apt/yum/dnf/pacman/zypper), JS package managers, pip, uv, cargo and go.
const ENV_INSTALL_RE = /\b(?:brew|apt-get|apt|yum|dnf|pacman|zypper)\s+(?:install|upgrade|tap)\b|\b(?:npm|pnpm|yarn|bun)\s+(?:install|add)\b|\bpip(?:3)?\s+install\b|\buv\s+(?:tool\s+install|pip\s+install)\b|\bcargo\s+install\b|\bgo\s+install\b/i;
// Command guard: specific local tool names, or a `/api/states` URL path.
const THIRD_PARTY_WORKAROUND_RE = /\b(?:osascript|spogo|spotify_player|ha\.sh|homeassistant|hass)\b|\/api\/states\b/i;
// Command guard: a path under `.super-multica[-suffix]/skills/`.
// NOTE(review): the final `/skills/` alternative already matches any path with a
// `/skills/` segment, which subsumes the two more specific alternatives.
const LOCAL_SKILL_PATH_RE = /(?:~\/\.super-multica(?:-[\w-]+)?\/skills\/|\/\.super-multica(?:-[\w-]+)?\/skills\/|\/skills\/)/i;
// Command guard: verbs treated as file-mutating when combined with a skills path.
const LOCAL_SKILL_MUTATION_VERB_RE = /\b(?:mkdir|cp|mv|rm|touch|install|clone)\b/i;
// Prompt heuristic: install/update/add verbs in English or Chinese.
const INSTALL_ACTION_RE = /\b(?:install|update|add)\b|安装|更新|添加|启用|配置/i;
// Prompt heuristic: the prompt talks about skills/plugins/extensions or ClawHub.
const SKILL_CONTEXT_RE = /\b(?:clawhub|skill|skills)\b|技能|插件|扩展/i;
// Prompt heuristic: explicit request for a local workaround (named tools,
// "local command", "without the skill", ...).
const WORKAROUND_ACTION_RE = /\b(?:workaround|fallback|local\s+command|local\s+script|shell\s+script|osascript|apple\s*script|spogo|spotify_player|homeassistant|ha\.sh)\b|绕过|临时方案|本地命令|本机命令|脚本方式|直接执行|不用技能|不用skill|不装skill|不安装skill/i;
// Prompt heuristic: explicit request to create/author/build a (custom) skill.
const CUSTOM_SKILL_AUTHORING_RE = /\b(?:create|author|build)\b[\s\S]*\bskills?\b|创建[\s\S]{0,30}(?:技能|skill)|自定义[\s\S]{0,20}(?:技能|skill)|手写[\s\S]{0,20}(?:技能|skill)|custom\s+skill/i;
// Prompt heuristic: affirmative words anywhere in the text. The Chinese
// alternatives are plain substrings, so they also match inside longer words.
const AFFIRMATIVE_RE = /\b(?:yes|y|ok|okay|sure|confirm|confirmed|continue|go ahead|please do|do it)\b|继续|确认|同意|可以|好的|继续安装/i;
// Prompt heuristic: "行" variants count only when they are the entire reply.
const STANDALONE_AFFIRMATIVE_RE = /^\s*(?:行|行吧|行的)\s*[。!!]?$/i;
// Prompt heuristic: refusal / "not now" phrases in English or Chinese.
const DECLINE_RE = /\b(?:no|cancel|stop|don't|do not|not now|skip)\b|不要|不需要|取消|先别|暂时不用/i;
// Affirmative tokens that are directly negated ("not sure", "don't continue",
// "不可以", "不同意"). They contain an affirmative word but mean the opposite.
const NEGATED_AFFIRMATIVE_RE = /\b(?:not|never|don't|dont)\s+(?:ok|okay|sure|confirm|confirmed|continue|go ahead|do it)\b|(?:不要|不|别)\s*(?:继续|确认|同意|可以)/i;

/**
 * Report whether the text contains an affirmative reply.
 *
 * A directly negated affirmative is rejected first: the affirmative patterns
 * match substrings, so "不可以" or "not sure" would otherwise be read as
 * consent. The guard fails closed, so an ambiguous reply never unlocks a
 * gated command.
 *
 * @param text - Trimmed user prompt.
 * @returns `true` when an un-negated affirmative is present.
 */
function hasAffirmativeConsent(text: string): boolean {
  if (NEGATED_AFFIRMATIVE_RE.test(text)) return false;
  return AFFIRMATIVE_RE.test(text) || STANDALONE_AFFIRMATIVE_RE.test(text);
}
/**
 * Report whether a shell command is a ClawHub install/update invocation,
 * i.e. one that must not run without explicit user confirmation.
 *
 * @param command - Raw shell command string.
 */
export function isMutatingClawhubCommand(command: string): boolean {
  const firstMatchIndex = command.search(CLAWHUB_MUTATION_RE);
  return firstMatchIndex !== -1;
}
/**
 * Report whether a shell command installs packages through a package manager.
 * Such commands change the runtime environment, so they are gated behind
 * explicit user confirmation.
 *
 * @param command - Raw shell command string.
 */
export function isEnvironmentInstallCommand(command: string): boolean {
  const firstMatchIndex = command.search(ENV_INSTALL_RE);
  return firstMatchIndex !== -1;
}
/**
 * Report whether a shell command is a local workaround for a third-party
 * integration. These commands run only after the user explicitly opts in.
 *
 * @param command - Raw shell command string.
 */
export function isThirdPartyWorkaroundCommand(command: string): boolean {
  const firstMatchIndex = command.search(THIRD_PARTY_WORKAROUND_RE);
  return firstMatchIndex !== -1;
}
/**
 * Detect direct local skill mutations outside the ClawHub install/update flow.
 *
 * A command is flagged when it references a skills path, does not mention
 * `clawhub`, and does one of the following:
 * - uses a file-mutating verb (`mkdir`, `cp`, `rm`, ...),
 * - pipes into `tee`, which writes its file operands without any redirection, or
 * - combines `cat`/`echo`/`printf` with an output redirection or heredoc.
 *
 * @param command - Raw shell command string.
 * @returns `true` when the command looks like a manual skill mutation.
 */
export function isLocalSkillMutationCommand(command: string): boolean {
  if (!LOCAL_SKILL_PATH_RE.test(command)) return false;
  // Anything mentioning clawhub is excluded here; mutating ClawHub commands are
  // detected separately by isMutatingClawhubCommand.
  if (/\bclawhub\b/i.test(command)) return false;
  if (LOCAL_SKILL_MUTATION_VERB_RE.test(command)) return true;
  // `echo ... | tee <skill path>` writes a file with no `>` in the command, so
  // requiring a redirection operator for tee would let it slip through.
  if (/\btee\b/i.test(command)) return true;
  const usesOutputWriter = /\b(?:cat|echo|printf)\b/i.test(command);
  const hasRedirectOrHeredoc = />>?|<<\s*['"]?EOF/i.test(command);
  return usesOutputWriter && hasRedirectOrHeredoc;
}
/**
 * Decide whether the user's prompt authorizes installing or updating skills
 * for the current run.
 *
 * Evaluation order:
 * 1. Empty prompt: nothing granted, nothing declined.
 * 2. Any decline phrase: declined (takes precedence over everything else).
 * 3. Granted when the prompt contains an install-style verb, or when it is
 *    affirmative and either mentions skills or answers a pending confirmation
 *    (`awaitingConfirmation`, e.g. a bare "继续" / "yes").
 *
 * NOTE(review): an install-style verb alone grants consent even without any
 * skill context (e.g. a prompt that merely contains "add").
 *
 * @param prompt - Raw user prompt for this turn.
 * @param awaitingConfirmation - Whether a previous turn was blocked pending confirmation.
 */
export function evaluateSkillInstallConsent(
  prompt: string,
  awaitingConfirmation: boolean,
): { allowInstall: boolean; declined: boolean } {
  const text = prompt.trim();
  if (text.length === 0) {
    return { allowInstall: false, declined: false };
  }
  if (DECLINE_RE.test(text)) {
    return { allowInstall: false, declined: true };
  }

  const explicitInstallRequest = INSTALL_ACTION_RE.test(text);
  const affirmativeInContext =
    hasAffirmativeConsent(text) && (SKILL_CONTEXT_RE.test(text) || awaitingConfirmation);

  return { allowInstall: explicitInstallRequest || affirmativeInContext, declined: false };
}
/**
 * Determine whether the current user prompt explicitly opts into local workaround mode.
 *
 * Evaluation order:
 * 1. Explicit workaround intent wins, even when a decline phrase is present.
 *    A prompt such as "不要安装 skill直接用本地命令试试" declines the *install*
 *    while opting into the workaround.
 * 2. A decline phrase is checked before any affirmative. Otherwise a reply like
 *    "不要继续" / "no, don't continue" would be read as consent just because it
 *    contains an affirmative token.
 * 3. While a confirmation is pending, a short affirmative grants consent.
 *
 * @param prompt - Raw user prompt for this turn.
 * @param awaitingConfirmation - Whether a previous turn was blocked pending workaround opt-in.
 * @returns Whether workaround commands may run, and whether the user declined.
 */
export function evaluateWorkaroundConsent(
  prompt: string,
  awaitingConfirmation: boolean,
): { allowWorkaround: boolean; declined: boolean } {
  const text = prompt.trim();
  if (!text) return { allowWorkaround: false, declined: false };

  if (WORKAROUND_ACTION_RE.test(text)) {
    return { allowWorkaround: true, declined: false };
  }
  if (DECLINE_RE.test(text)) {
    return { allowWorkaround: false, declined: true };
  }
  if (awaitingConfirmation && hasAffirmativeConsent(text)) {
    return { allowWorkaround: true, declined: false };
  }
  return { allowWorkaround: false, declined: false };
}
/**
 * Decide whether the user's prompt authorizes hand-authoring a custom skill.
 *
 * An empty prompt grants nothing. Any decline phrase marks the request as
 * declined. Otherwise consent is granted when the prompt explicitly asks to
 * create a skill, or when it is an affirmative reply to a pending
 * confirmation.
 *
 * @param prompt - Raw user prompt for this turn.
 * @param awaitingConfirmation - Whether a previous turn was blocked pending confirmation.
 */
export function evaluateCustomSkillAuthoringConsent(
  prompt: string,
  awaitingConfirmation: boolean,
): { allowAuthoring: boolean; declined: boolean } {
  const text = prompt.trim();
  if (text.length === 0) {
    return { allowAuthoring: false, declined: false };
  }
  if (DECLINE_RE.test(text)) {
    return { allowAuthoring: false, declined: true };
  }

  const asksForAuthoring = CUSTOM_SKILL_AUTHORING_RE.test(text);
  const confirmsPendingRequest = hasAffirmativeConsent(text) && awaitingConfirmation;

  return { allowAuthoring: asksForAuthoring || confirmsPendingRequest, declined: false };
}
// ── Run-log result extraction helpers ──────────────────────────────────────
// Lightweight extractors for tool_end metadata. These mirror the patterns in
// cli/output.ts but are kept separate to avoid CLI-specific dependencies.
@ -201,6 +349,13 @@ export class Agent {
private readonly toolStartTimes = new Map<string, number>();
private currentRunToolExecutions: ToolExecutionRecord[] = [];
private initialized = false;
private allowSkillInstallForCurrentRun = false;
private awaitingSkillInstallConfirmation = false;
private allowWorkaroundForCurrentRun = false;
private awaitingWorkaroundConfirmation = false;
private allowCustomSkillAuthoringForCurrentRun = false;
private awaitingCustomSkillAuthoringConfirmation = false;
private readonly guardedExecApproval: ExecApprovalCallback;
// Context window settings (for pre-flight compaction)
private readonly reserveTokens: number;
@ -244,6 +399,7 @@ export class Agent {
// Load session metadata early so stored provider/model can inform defaults
this.sessionId = options.sessionId ?? uuidv7();
this.guardedExecApproval = this.createGuardedExecApprovalCallback(options.onExecApprovalNeeded);
this.runLog = createRunLog(
options.enableRunLog ?? !!process.env.MULTICA_RUN_LOG,
this.sessionId,
@ -454,8 +610,25 @@ export class Agent {
// Use this.sessionId (which may be auto-generated) instead of options.sessionId
// (which may be undefined). Without this, delegate tool has no session context.
this.toolsOptions = mergedToolsConfig
? { ...options, sessionId: this.sessionId, cwd: effectiveCwd, tools: mergedToolsConfig, profileDir, provider: this.resolvedProvider, runLog: this.runLog }
: { ...options, sessionId: this.sessionId, cwd: effectiveCwd, profileDir, provider: this.resolvedProvider, runLog: this.runLog };
? {
...options,
sessionId: this.sessionId,
cwd: effectiveCwd,
tools: mergedToolsConfig,
profileDir,
provider: this.resolvedProvider,
runLog: this.runLog,
onExecApprovalNeeded: this.guardedExecApproval,
}
: {
...options,
sessionId: this.sessionId,
cwd: effectiveCwd,
profileDir,
provider: this.resolvedProvider,
runLog: this.runLog,
onExecApprovalNeeded: this.guardedExecApproval,
};
const tools = resolveTools(this.toolsOptions);
if (this.debug) {
@ -585,6 +758,42 @@ export class Agent {
this._aborted = false;
this.currentRunToolExecutions = [];
if (this._internalRun) {
this.allowSkillInstallForCurrentRun = false;
this.allowWorkaroundForCurrentRun = false;
this.allowCustomSkillAuthoringForCurrentRun = false;
} else {
const consent = evaluateSkillInstallConsent(prompt, this.awaitingSkillInstallConfirmation);
if (consent.declined) {
this.awaitingSkillInstallConfirmation = false;
}
this.allowSkillInstallForCurrentRun = consent.allowInstall;
if (consent.allowInstall) {
this.awaitingSkillInstallConfirmation = false;
}
const workaroundConsent = evaluateWorkaroundConsent(prompt, this.awaitingWorkaroundConfirmation);
if (workaroundConsent.declined) {
this.awaitingWorkaroundConfirmation = false;
}
this.allowWorkaroundForCurrentRun = workaroundConsent.allowWorkaround;
if (workaroundConsent.allowWorkaround) {
this.awaitingWorkaroundConfirmation = false;
}
const customSkillConsent = evaluateCustomSkillAuthoringConsent(
prompt,
this.awaitingCustomSkillAuthoringConfirmation,
);
if (customSkillConsent.declined) {
this.awaitingCustomSkillAuthoringConfirmation = false;
}
this.allowCustomSkillAuthoringForCurrentRun = customSkillConsent.allowAuthoring;
if (customSkillConsent.allowAuthoring) {
this.awaitingCustomSkillAuthoringConfirmation = false;
}
}
const runStart = Date.now();
this.runLog.log("run_start", {
prompt: prompt.slice(0, 200),
@ -758,6 +967,9 @@ export class Agent {
}
this._isRunning = false;
this._aborted = false;
this.allowSkillInstallForCurrentRun = false;
this.allowWorkaroundForCurrentRun = false;
this.allowCustomSkillAuthoringForCurrentRun = false;
this._lastEventSavedAssistant = undefined;
this.currentUserDisplayPrompt = undefined;
this.currentUserSource = undefined;
@ -766,6 +978,91 @@ export class Agent {
}
}
/**
 * Wrap an optional exec-approval callback with the skill policy guards.
 *
 * The returned callback classifies each command and applies three guards in
 * order: install (ClawHub install/update or package-manager install), custom
 * skill authoring (direct writes under a skills directory), and third-party
 * workaround commands. When the matching per-run consent flag is not set,
 * the guard marks the agent as awaiting that confirmation, writes a
 * `blocked` run-log entry and denies the command with a message for the
 * model. When consent is present it writes an `allowed` entry and continues.
 *
 * Commands that pass every guard are delegated to `base` when provided,
 * otherwise approved once.
 *
 * @param base - Caller-supplied approval callback to delegate to, if any.
 */
private createGuardedExecApprovalCallback(
  base?: ExecApprovalCallback,
): ExecApprovalCallback {
  return async (command, cwd) => {
    // Run-log entries only keep the first 200 characters of the command.
    const loggedCommand = command.slice(0, 200);
    const needsInstallConsent =
      isMutatingClawhubCommand(command) || isEnvironmentInstallCommand(command);
    const needsWorkaroundConsent = isThirdPartyWorkaroundCommand(command);
    const needsCustomSkillAuthoringConsent = isLocalSkillMutationCommand(command);

    if (needsInstallConsent && !this.allowSkillInstallForCurrentRun) {
      // Remember the block so a short "yes" on the next turn counts as consent.
      this.awaitingSkillInstallConfirmation = true;
      this.runLog.log("install_guard", {
        action: "blocked",
        reason: "explicit_user_confirmation_required",
        command: loggedCommand,
      });
      return {
        approved: false,
        decision: "deny",
        message:
          "Install command blocked: explicit user confirmation is required first. Ask the user whether to continue installation.",
      };
    }
    if (needsInstallConsent) {
      this.runLog.log("install_guard", {
        action: "allowed",
        reason: "user_confirmed",
        command: loggedCommand,
      });
    }

    if (needsCustomSkillAuthoringConsent && !this.allowCustomSkillAuthoringForCurrentRun) {
      this.awaitingCustomSkillAuthoringConfirmation = true;
      this.runLog.log("custom_skill_guard", {
        action: "blocked",
        reason: "explicit_custom_skill_authoring_confirmation_required",
        command: loggedCommand,
      });
      return {
        approved: false,
        decision: "deny",
        message:
          "Manual local skill creation command blocked by policy. Use ClawHub discovery/install flow first, or ask the user to explicitly confirm custom skill authoring.",
      };
    }
    if (needsCustomSkillAuthoringConsent) {
      this.runLog.log("custom_skill_guard", {
        action: "allowed",
        reason: "user_confirmed_custom_skill_authoring",
        command: loggedCommand,
      });
    }

    if (needsWorkaroundConsent && !this.allowWorkaroundForCurrentRun) {
      this.awaitingWorkaroundConfirmation = true;
      this.runLog.log("workaround_guard", {
        action: "blocked",
        reason: "explicit_workaround_opt_in_required",
        command: loggedCommand,
      });
      return {
        approved: false,
        decision: "deny",
        // The product name is "ClawHub"; the previous "Cloud Hub" spelling fed
        // the wrong name back to the model.
        message:
          "Local workaround command blocked by policy. First explain the capability gap and ask whether to search/install a ClawHub skill, or get explicit user opt-in for workaround mode.",
      };
    }
    if (needsWorkaroundConsent) {
      this.runLog.log("workaround_guard", {
        action: "allowed",
        reason: "user_opted_in_workaround_mode",
        command: loggedCommand,
      });
    }

    if (base) {
      return base(command, cwd);
    }
    return { approved: true, decision: "allow-once" };
  };
}
/**
* Advance to the next non-cooldown auth profile.
* Returns true if a new profile was activated, false if exhausted.

View file

@ -218,6 +218,34 @@ describe("buildSkillsSection", () => {
expect(text).toContain("suggest activating it");
});
it("includes capability-gap recovery guidance", () => {
  const text = buildSkillsSection("## commit\nDo commits.", "full").join("\n");
  // Phrases that must always be present in the full-mode guidance.
  const expectedPhrases = [
    "capability gap",
    "explicit user confirmation",
    "clawhub install",
    "third-party service requests",
    "local workaround commands",
    "spotify_player",
  ];
  for (const phrase of expectedPhrases) {
    expect(text).toContain(phrase);
  }
});
it("surfaces installed skill IDs and prioritizes meta skill guidance when present", () => {
  // Two skill headings in the `## <emoji> <name> (<id>)` format.
  const promptLines = [
    "## 🔧 Meta Skill Installer (meta-skill-installer)",
    "Detect missing capabilities.",
    "",
    "## 📄 PDF (pdf)",
    "Handle PDFs.",
  ];
  const text = buildSkillsSection(promptLines.join("\n"), "full").join("\n");
  const expectedPhrases = [
    "Installed skill IDs:",
    "`meta-skill-installer`",
    "is installed",
    "ClawHub search",
    "run ClawHub discovery first",
  ];
  for (const phrase of expectedPhrases) {
    expect(text).toContain(phrase);
  }
});
it("returns empty in minimal mode", () => {
  // Minimal mode omits the skills section entirely.
  const section = buildSkillsSection("skills", "minimal");
  expect(section).toEqual([]);
});

View file

@ -389,19 +389,72 @@ export function buildSkillsSection(
const trimmed = skillsPrompt?.trim();
if (!trimmed) return [];
const skillIds = extractSkillIdsFromSkillsPrompt(trimmed);
const hasMetaSkillInstaller = skillIds.includes("meta-skill-installer");
const { text: budgeted } = truncateWithBudget(trimmed, DEFAULT_SKILLS_MAX_CHARS);
return [
const lines: string[] = [
"## Skills (mandatory)",
"Before replying: scan the available skills below.",
];
if (skillIds.length > 0) {
lines.push(
`Installed skill IDs: ${skillIds.map((id) => `\`${id}\``).join(", ")}`,
);
}
lines.push(
"- If exactly one skill clearly applies: follow its instructions.",
"- If multiple could apply: choose the most specific one.",
"- If none clearly apply but an **inactive skill** matches the user's intent: suggest activating it.",
"- If no skill matches at all: skip skill invocation.",
"- If the request needs a capability you currently lack: do not stop at refusal. Treat it as a capability gap and propose a recovery path.",
"- For third-party service requests (Spotify, Notion, Slack, Jira, etc.), do not jump to ad-hoc shell/app hacks as the default path.",
"- Treat local CLIs/scripts (for example `spogo`, `spotify_player`, `osascript`, `ha.sh`) as workaround mode: only use them after explicit user opt-in.",
);
if (hasMetaSkillInstaller) {
lines.push(
"- `meta-skill-installer` is installed: for capability gaps with no matching installed skill, proactively offer ClawHub search + security review + explicit install confirmation.",
"- With `meta-skill-installer` installed, run ClawHub discovery first (`clawhub search`) before proposing to hand-build a new custom skill.",
);
} else {
lines.push(
"- If `meta-skill-installer` is available and no installed skill matches: proactively offer to search ClawHub for candidates and run security review before install.",
"- Prefer ClawHub discovery over creating a brand-new custom skill from scratch unless the user explicitly asks for custom skill authoring.",
);
}
lines.push(
"- Ask for explicit user confirmation before final `clawhub install` / `clawhub update` unless the user already clearly asked you to install in this turn.",
"- Only use local workaround commands (for example `osascript` or custom shell scripts) if the user explicitly asks for workaround mode or declines skill installation.",
"- After install/update, verify the skill path and retry the original user task.",
"",
budgeted,
"",
];
);
return lines;
}
/**
 * Collect the skill IDs named in the prompt's level-2 headings.
 *
 * A heading is expected to look like `## <emoji> <name> (<id>)`; the ID is the
 * text inside the trailing parentheses. IDs are trimmed, de-duplicated and
 * returned in order of first appearance.
 */
function extractSkillIdsFromSkillsPrompt(skillsPrompt: string): string[] {
  const uniqueIds = new Set<string>();
  for (const heading of skillsPrompt.matchAll(/^##\s+.*\(([^()\n]+)\)\s*$/gm)) {
    const candidate = heading[1]?.trim();
    // A Set keeps insertion order and ignores repeats, so first occurrence wins.
    if (candidate) uniqueIds.add(candidate);
  }
  return Array.from(uniqueIds);
}
/**

View file

@ -40,6 +40,8 @@ export interface ExecApprovalRequest {
/**
 * Outcome returned by an exec approval callback for a single command.
 */
export interface ApprovalResult {
/** Whether the command is allowed to run. */
approved: boolean;
/** The decision that was taken (one of the `ApprovalDecision` values). */
decision: ApprovalDecision;
/** Optional denial/approval message for the exec tool response */
message?: string | undefined;
}
// ============ Configuration ============

View file

@ -59,10 +59,11 @@ export function createExecTool(
if (onApprovalNeeded) {
const approvalResult = await onApprovalNeeded(command, effectiveCwd);
if (!approvalResult.approved) {
const denialText = approvalResult.message?.trim() || "Command execution denied by user.";
return {
content: [{ type: "text", text: "Command execution denied by user." }],
content: [{ type: "text", text: denialText }],
details: {
output: "Command execution denied by user.",
output: denialText,
exitCode: 1,
truncated: false,
},

View file

@ -0,0 +1,441 @@
#!/usr/bin/env node
import { existsSync, readFileSync, writeFileSync } from "node:fs";
import { dirname, join, resolve } from "node:path";
/**
* @typedef {{
* id: string;
* check: string;
* passed: boolean;
* detail?: string;
* }} CheckResult
*/
/**
* @typedef {{
* provider: string;
* caseId: string;
* status: string;
* sessionId: string;
* sessionDir: string;
* logFile: string;
* checks: CheckResult[];
* pass: boolean;
* }} CaseAnalysis
*/
// ── CLI entry: resolve and validate the manifest path ───────────────────────
const ANALYZER_USAGE = "Usage: node scripts/e2e-skills-benchmark/analyze.mjs <manifest.tsv>";
const manifestArg = process.argv[2];
if (manifestArg === "--help" || manifestArg === "-h") {
  console.log(ANALYZER_USAGE);
  process.exit(0);
}
if (!manifestArg) {
  // A missing required argument is a usage error. Exit non-zero so callers
  // (shell scripts, CI) do not mistake it for a successful analysis.
  console.error(ANALYZER_USAGE);
  process.exit(1);
}
const manifestPath = resolve(manifestArg);
if (!existsSync(manifestPath)) {
  console.error(`Manifest not found: ${manifestPath}`);
  process.exit(1);
}
// Per-case expectations, keyed by case ID (looked up with the manifest's case
// column). Field meanings, as far as this table shows:
// - requiredCommandTokens / forbiddenCommandTokens: lists of token groups.
//   Presumably every token of a group must appear in one command (see
//   commandHasTokens) — confirm against the checks that consume these rules.
// - requiredEventTokens: token groups expected in run-log events (assumed).
// - requiredResponseRegex: regex sources; textMatchesPattern compiles such
//   patterns case-insensitively. Presumably matched against the final
//   assistant text — verify in the consumer.
// - requireExecUsage: set to false for the UX-only cases (04, 05).
const CASE_RULES = {
"case-01-install-caldav-calendar": {
requiredCommandTokens: [
["clawhub", "search"],
["caldav"],
["clawhub", "install"],
["review-skill-security.mjs"],
],
},
"case-02-gap-discovery-homeassistant": {
requiredCommandTokens: [
["clawhub", "search"],
["home", "assistant"],
["clawhub", "install"],
["review-skill-security.mjs"],
],
},
"case-03-install-update-codexmonitor": {
requiredCommandTokens: [
["clawhub", "search"],
["codexmonitor"],
["clawhub", "install"],
["clawhub", "update"],
["review-skill-security.mjs"],
],
},
// Case 04: natural-language Spotify request; lists response patterns and
// commands that must not appear.
"case-04-gap-discovery-spotify-ux": {
requireExecUsage: false,
requiredResponseRegex: [
"缺少|没有.*(技能|能力|集成)|capability gap",
"clawhub|cloud\\s*hub|cloudhub",
"安装|install",
"是否|要不要|would you like|do you want",
"安全|审查|security|review",
],
forbiddenCommandTokens: [
["clawhub", "install"],
["clawhub", "update"],
["osascript"],
["spogo"],
["spotify_player"],
["ha.sh"],
["/api/states"],
],
},
// Case 05: natural-language Notion request; expects a search, a blocked
// install_guard event, and a response asking for consent and auth details.
"case-05-gap-discovery-notion-ux": {
requireExecUsage: false,
requiredCommandTokens: [
["clawhub", "search"],
["notion"],
],
requiredEventTokens: [
["install_guard", "blocked"],
],
requiredResponseRegex: [
"notion",
"安装|install",
"是否|要不要|would you like|do you want|同意",
"token|授权|integration",
],
forbiddenCommandTokens: [
["osascript"],
["spogo"],
["spotify_player"],
["ha.sh"],
["/api/states"],
],
},
};
/**
 * Break text into its non-empty lines, accepting both LF and CRLF endings.
 *
 * @param {string} text
 * @returns {string[]} Lines in original order, with empty lines dropped.
 */
function splitLines(text) {
  const nonEmptyLines = [];
  for (const segment of text.split(/\r?\n/)) {
    if (segment) nonEmptyLines.push(segment);
  }
  return nonEmptyLines;
}
/**
 * Check that a command contains every given token, ignoring case.
 * Tokens are matched as plain substrings, in any order.
 *
 * @param {string} command
 * @param {string[]} tokens
 * @returns {boolean} `true` when all tokens occur (vacuously true for no tokens).
 */
function commandHasTokens(command, tokens) {
  const haystack = command.toLowerCase();
  for (const token of tokens) {
    if (!haystack.includes(token.toLowerCase())) return false;
  }
  return true;
}
/**
 * Recover the shell command from a tool call's serialized arguments.
 *
 * When the arguments parse as JSON with a string `command` field, that field
 * is returned. In every other case (unparseable or truncated JSON, no such
 * field) the raw argument text is returned unchanged. Empty input yields "".
 *
 * @param {string} rawArgs
 * @returns {string}
 */
function extractCommand(rawArgs) {
  if (!rawArgs) return "";
  let decoded;
  try {
    decoded = JSON.parse(rawArgs);
  } catch {
    // Args may be truncated JSON in the run log; fall back to the raw text.
    return rawArgs;
  }
  return typeof decoded?.command === "string" ? decoded.command : rawArgs;
}
/**
 * Test text against a regex source string, case-insensitively.
 * A pattern that fails to compile counts as "no match" rather than an error.
 *
 * @param {string} text
 * @param {string} pattern
 * @returns {boolean}
 */
function textMatchesPattern(text, pattern) {
  let matched = false;
  try {
    const matcher = new RegExp(pattern, "i");
    matched = matcher.exec(text) !== null;
  } catch {
    matched = false;
  }
  return matched;
}
/**
 * Read a JSONL run log and return its parsed events in file order.
 * Lines that are not valid JSON are skipped so that one corrupt entry does
 * not abort the analysis.
 *
 * @param {string} runLogPath
 */
function parseRunLog(runLogPath) {
  const rawLog = readFileSync(runLogPath, "utf-8");
  return splitLines(rawLog).flatMap((line) => {
    try {
      // Wrap in an array so flatMap keeps the value as a single element.
      return [JSON.parse(line)];
    } catch {
      // Ignore malformed lines but keep analysis alive.
      return [];
    }
  });
}
/**
 * Return the text of the last assistant message recorded in a session JSONL file.
 *
 * Only entries with `type === "message"` and `message.role === "assistant"`
 * are considered. String content replaces the current result as-is. Array
 * content contributes the newline-joined, trimmed text of its `text` parts,
 * and only when that text is non-empty. A missing file yields "".
 *
 * @param {string} sessionPath
 * @returns {string}
 */
function parseFinalAssistantText(sessionPath) {
  if (!existsSync(sessionPath)) return "";

  let finalText = "";
  for (const rawLine of splitLines(readFileSync(sessionPath, "utf-8"))) {
    try {
      const record = JSON.parse(rawLine);
      const message = record?.type === "message" ? record.message : undefined;
      if (message?.role !== "assistant") continue;

      const { content } = message;
      if (typeof content === "string") {
        finalText = content;
      } else if (Array.isArray(content)) {
        const textParts = [];
        for (const part of content) {
          if (part && part.type === "text" && typeof part.text === "string") {
            textParts.push(part.text);
          }
        }
        const joined = textParts.join("\n").trim();
        if (joined) finalText = joined;
      }
    } catch {
      // Ignore malformed lines.
    }
  }
  return finalText;
}
/**
 * Append one check result to a case analysis (mutates `analysis.checks`).
 *
 * @param {CaseAnalysis} analysis
 * @param {string} id
 * @param {string} check
 * @param {boolean} passed
 * @param {string} [detail]
 */
function addCheck(analysis, id, check, passed, detail) {
  /** @type {CheckResult} */
  const result = { id, check, passed, detail };
  analysis.checks.push(result);
}
// Load the manifest; the first row is the header, so at least two rows are
// needed before there is anything to analyse.
const rows = splitLines(readFileSync(manifestPath, "utf-8"));
if (rows.length < 2) {
  console.error(`Manifest has no data rows: ${manifestPath}`);
  process.exit(1);
}
/** @type {CaseAnalysis[]} */
const analyses = [];
// rows[0] is the header line, so data rows start at index 1.
for (let i = 1; i < rows.length; i++) {
  const row = rows[i];
  if (!row) continue;

  const cols = row.split("\t");
  const provider = cols[1] ?? "";
  const caseId = cols[2] ?? "";
  const rules = CASE_RULES[caseId];
  const status = cols[3] ?? "";
  const sessionId = cols[4] ?? "";
  const sessionDir = cols[5] ?? "";
  const logFile = cols[6] ?? "";

  /** @type {CaseAnalysis} */
  const analysis = {
    provider,
    caseId,
    status,
    sessionId,
    sessionDir,
    logFile,
    checks: [],
    pass: false,
  };

  // A short row means the manifest is damaged. Record it as a failed case
  // instead of dropping it, so a broken manifest cannot yield a clean pass.
  if (cols.length < 11) {
    addCheck(
      analysis,
      "manifest-row",
      "manifest row has all 11 columns",
      false,
      `columns=${cols.length}`,
    );
    analyses.push(analysis);
    continue;
  }

  addCheck(
    analysis,
    "run-status",
    "runner status is success",
    status === "success",
    `status=${status}`,
  );

  // Without a session directory there are no artifacts to inspect;
  // `pass` stays false for this case.
  if (!sessionDir) {
    addCheck(analysis, "session-dir", "session_dir exists in manifest", false, "missing session_dir");
    analyses.push(analysis);
    continue;
  }

  const runLogPath = join(sessionDir, "run-log.jsonl");
  const runLogExists = existsSync(runLogPath);
  addCheck(
    analysis,
    "run-log-file",
    "run-log.jsonl exists",
    runLogExists,
    runLogPath,
  );
  if (!runLogExists) {
    analyses.push(analysis);
    continue;
  }

  const events = parseRunLog(runLogPath);
  const sessionPath = join(sessionDir, "session.jsonl");
  const finalAssistantText = parseFinalAssistantText(sessionPath);

  const runStarts = events.filter((e) => e.event === "run_start");
  const runEnds = events.filter((e) => e.event === "run_end");
  const toolStarts = events.filter((e) => e.event === "tool_start");
  const toolEnds = events.filter((e) => e.event === "tool_end");
  const errorToolEnds = toolEnds.filter((e) => e.is_error === true);

  addCheck(analysis, "event-run-start", "has run_start", runStarts.length > 0, `count=${runStarts.length}`);
  addCheck(analysis, "event-run-end", "has run_end", runEnds.length > 0, `count=${runEnds.length}`);
  addCheck(
    analysis,
    "tool-pairing",
    "tool_start count matches tool_end count",
    toolStarts.length === toolEnds.length,
    `start=${toolStarts.length} end=${toolEnds.length}`,
  );

  const finalRunEnd = runEnds.at(-1);
  const runEndError = finalRunEnd?.error;
  const finalRunText = typeof finalRunEnd?.text === "string" ? finalRunEnd.text : "";
  // Prefer the session transcript; fall back to the text on the last run_end.
  const finalResponseText = finalAssistantText || finalRunText;

  addCheck(
    analysis,
    "run-end-error",
    "final run_end.error is null/empty",
    runEndError === null || runEndError === undefined || runEndError === "",
    `error=${String(runEndError)}`,
  );
  addCheck(
    analysis,
    "tool-errors",
    "no tool_end has is_error=true",
    errorToolEnds.length === 0,
    `error_tool_calls=${errorToolEnds.length}`,
  );

  const execCommands = toolStarts
    .filter((e) => e.tool === "exec")
    .map((e) => extractCommand(typeof e.args === "string" ? e.args : ""))
    .filter(Boolean);

  // Exec usage is required unless the case rules explicitly set it to false.
  const requireExecUsage = rules?.requireExecUsage !== false;
  addCheck(
    analysis,
    "exec-usage",
    requireExecUsage
      ? "at least one exec command was used"
      : "exec usage is optional for this case",
    requireExecUsage ? execCommands.length > 0 : true,
    requireExecUsage ? `exec_calls=${execCommands.length}` : `exec_calls=${execCommands.length} (optional)`,
  );

  if (rules) {
    if (Array.isArray(rules.requiredCommandTokens)) {
      rules.requiredCommandTokens.forEach((tokenList, index) => {
        const passed = execCommands.some((cmd) => commandHasTokens(cmd, tokenList));
        addCheck(
          analysis,
          `cmd-${index + 1}`,
          `exec command contains tokens: ${tokenList.join(" + ")}`,
          passed,
        );
      });
    }

    if (Array.isArray(rules.requiredEventTokens)) {
      // Match tokens against each event's serialized, lower-cased JSON.
      const eventLines = events.map((event) => JSON.stringify(event).toLowerCase());
      rules.requiredEventTokens.forEach((tokenList, index) => {
        const passed = eventLines.some((line) =>
          tokenList.every((token) => line.includes(token.toLowerCase())),
        );
        addCheck(
          analysis,
          `event-${index + 1}`,
          `event log contains tokens: ${tokenList.join(" + ")}`,
          passed,
        );
      });
    }

    if (Array.isArray(rules.forbiddenCommandTokens)) {
      rules.forbiddenCommandTokens.forEach((tokenList, index) => {
        const passed = !execCommands.some((cmd) => commandHasTokens(cmd, tokenList));
        addCheck(
          analysis,
          `forbid-cmd-${index + 1}`,
          `exec command does not contain tokens: ${tokenList.join(" + ")}`,
          passed,
        );
      });
    }

    if (Array.isArray(rules.requiredResponseRegex)) {
      rules.requiredResponseRegex.forEach((pattern, index) => {
        const passed = textMatchesPattern(finalResponseText, pattern);
        addCheck(
          analysis,
          `resp-${index + 1}`,
          `final response matches regex: /${pattern}/i`,
          passed,
        );
      });
    }
  } else {
    // A case without rules cannot be validated, so it is reported as failed.
    addCheck(
      analysis,
      "case-rules",
      "case has rule set",
      false,
      `No rules defined for case_id=${caseId}`,
    );
  }

  analysis.pass = analysis.checks.every((c) => c.passed);
  analyses.push(analysis);
}
// Tally results, persist the structured report next to the manifest, then
// print a human-readable summary. Exit non-zero when any case failed.
const passedCases = analyses.reduce((count, item) => (item.pass ? count + 1 : count), 0);
const failedCases = analyses.length - passedCases;
const output = {
  manifestPath,
  totalCases: analyses.length,
  passedCases,
  failedCases,
  results: analyses,
};
const outputPath = join(dirname(manifestPath), "analysis.json");
writeFileSync(outputPath, `${JSON.stringify(output, null, 2)}\n`, "utf-8");

analyses.forEach((item) => {
  const status = item.pass ? "PASS" : "FAIL";
  console.log(`[${status}] provider=${item.provider} case=${item.caseId} session=${item.sessionId || "N/A"}`);
  item.checks.forEach((check) => {
    const marker = check.passed ? " [ok] " : " [bad] ";
    const detail = check.detail ? ` (${check.detail})` : "";
    console.log(`${marker}${check.check}${detail}`);
  });
});

console.log("");
console.log(`Analysis file: ${outputPath}`);
console.log(`Summary: pass=${passedCases} fail=${failedCases}`);
if (failedCases > 0) {
  process.exit(1);
}

View file

@ -0,0 +1,15 @@
Run an end-to-end test for the Meta Skill Installer.
Goal: install a real ClawHub skill for CalDAV calendar capability.
Reference page: https://clawhub.ai/skills/caldav-calendar
Follow this exact workflow:
1. State the missing capability in one sentence.
2. Search ClawHub for CalDAV-related skills and choose the best candidate.
3. Stage-install to a temporary directory first (never install directly to active skills path).
4. Run security review on the staged skill:
DATA_DIR="${SMC_DATA_DIR:-$HOME/.super-multica}"
node "$DATA_DIR/skills/meta-skill-installer/scripts/review-skill-security.mjs" "<staged skill path>"
5. If riskLevel is safe, install to "$DATA_DIR/skills".
6. Verify final install by checking "$DATA_DIR/skills/<slug>/SKILL.md" exists.
7. Return a short report: selected slug, riskLevel, final install path.

View file

@ -0,0 +1,16 @@
Run an end-to-end capability-gap discovery test for Meta Skill Installer.
User intent: "I need to control Home Assistant lights and switches from the agent."
Reference page: https://clawhub.ai/skills/homeassistant
Requirements:
1. Treat this as a missing capability and explicitly define the gap.
2. Search ClawHub for relevant skills and list the top 3 candidates.
3. Pick one candidate with rationale (scope match + lower security risk).
4. Stage-install to a temporary directory.
5. Run security review:
DATA_DIR="${SMC_DATA_DIR:-$HOME/.super-multica}"
node "$DATA_DIR/skills/meta-skill-installer/scripts/review-skill-security.mjs" "<staged skill path>"
6. If riskLevel is safe, install to "$DATA_DIR/skills".
7. Verify "$DATA_DIR/skills/<slug>/SKILL.md" exists.
8. Return: candidate list, chosen slug, riskLevel, and final path.

View file

@ -0,0 +1,16 @@
Run an end-to-end install+update regression test for Meta Skill Installer.
Goal: use a real ClawHub skill and verify install, review, and update flow.
Reference page: https://clawhub.ai/odrobnik/codexmonitor
Requirements:
1. Search ClawHub for CodexMonitor and select the matching skill slug.
2. Stage-install to a temporary directory and run security review:
DATA_DIR="${SMC_DATA_DIR:-$HOME/.super-multica}"
node "$DATA_DIR/skills/meta-skill-installer/scripts/review-skill-security.mjs" "<staged skill path>"
3. If riskLevel is safe, install to "$DATA_DIR/skills".
4. Verify "$DATA_DIR/skills/<slug>/SKILL.md" exists.
5. Run an update for the same slug in managed dir:
clawhub update "<slug>" --workdir "$DATA_DIR" --dir skills --force
6. Run security review again on the final installed path.
7. Return: slug, initial riskLevel, update executed (yes/no), final path.

View file

@ -0,0 +1 @@
随机播放 spotify 中的音乐

View file

@ -0,0 +1 @@
帮我在 Notion 新建一个页面,标题是今天待办,并写入三条任务:修复登录 bug、写周报、安排评审

View file

@ -0,0 +1,170 @@
#!/usr/bin/env bash
set -euo pipefail

# Locations derived from this script's own path.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
ROOT_DIR="$(cd "${SCRIPT_DIR}/../.." && pwd)"
CASES_DIR="${SCRIPT_DIR}/cases"

# Every setting below can be overridden through the environment.
TIMESTAMP="${TIMESTAMP:-$(date +%Y%m%d-%H%M%S)}"
OUT_DIR="${OUT_DIR:-${ROOT_DIR}/.context/skills-e2e-runs/${TIMESTAMP}}"
RESULTS_DIR="${OUT_DIR}/results"
MANIFEST="${OUT_DIR}/manifest.tsv"

# Required environment for agent-driven E2E.
SMC_DATA_DIR="${SMC_DATA_DIR:-$HOME/.super-multica-e2e}"
MULTICA_API_URL="${MULTICA_API_URL:-https://api-dev.copilothub.ai}"
PROVIDERS_RAW="${PROVIDERS:-kimi-coding}"
CASE_GLOB="${CASE_GLOB:-case-*.txt}"
CASE_TIMEOUT_SEC="${CASE_TIMEOUT_SEC:-1200}"
MAX_PARALLEL="${MAX_PARALLEL:-1}"

# Reject non-numeric timeouts up front: the worker compares this value
# arithmetically, where a stray word would be read as a variable name.
if ! [[ "${CASE_TIMEOUT_SEC}" =~ ^[0-9]+$ ]]; then
  echo "CASE_TIMEOUT_SEC must be a non-negative integer, got: ${CASE_TIMEOUT_SEC}" >&2
  exit 1
fi

# CASE_TIMEOUT_SEC=0 disables the per-case timeout.
TIMEOUT_ENABLED="true"
if (( CASE_TIMEOUT_SEC <= 0 )); then
  TIMEOUT_ENABLED="false"
fi

if ! [[ "${MAX_PARALLEL}" =~ ^[0-9]+$ ]] || (( MAX_PARALLEL <= 0 )); then
  echo "MAX_PARALLEL must be a positive integer, got: ${MAX_PARALLEL}" >&2
  exit 1
fi
# Worker mode: run one provider/case pair and write a single TSV result row.
# Invoked as: run.sh --worker <provider> <case-file>
if [[ "${1:-}" == "--worker" ]]; then
  provider="${2:?missing provider}"
  case_file="${3:?missing case file}"
  case_base="$(basename "${case_file}")"
  case_id="${case_base%.txt}"
  log_file="${OUT_DIR}/${provider}-${case_id}.log"
  result_file="${RESULTS_DIR}/${provider}-${case_id}.tsv"
  prompt="$(cat "${case_file}")"
  status="success"
  timed_out="false"
  started_epoch="$(date +%s)"
  started_at="$(date -u +%Y-%m-%dT%H:%M:%SZ)"

  # Run the agent in the background so the loop below can enforce the timeout.
  SMC_DATA_DIR="${SMC_DATA_DIR}" \
  MULTICA_API_URL="${MULTICA_API_URL}" \
    pnpm multica run --run-log --provider "${provider}" "${prompt}" > "${log_file}" 2>&1 &
  cmd_pid=$!

  # Poll every 2s while the process is alive; on timeout send TERM, then KILL.
  while kill -0 "${cmd_pid}" 2>/dev/null; do
    if [[ "${TIMEOUT_ENABLED}" == "true" ]]; then
      now="$(date +%s)"
      elapsed="$((now - started_epoch))"
      if (( elapsed >= CASE_TIMEOUT_SEC )); then
        timed_out="true"
        kill "${cmd_pid}" 2>/dev/null || true
        sleep 1
        kill -9 "${cmd_pid}" 2>/dev/null || true
        break
      fi
    fi
    sleep 2
  done

  # Collect the exit status without tripping `set -e` on a non-zero result.
  exit_code=0
  wait "${cmd_pid}" 2>/dev/null || exit_code=$?
  ended_epoch="$(date +%s)"
  ended_at="$(date -u +%Y-%m-%dT%H:%M:%SZ)"
  duration_sec="$((ended_epoch - started_epoch))"

  # Use plain grep rather than ripgrep here: a missing `rg` binary would make
  # the marker check fail and mark every case as failed. `-a` forces text
  # mode in case the log contains NUL bytes.
  if [[ "${timed_out}" == "true" ]]; then
    status="timeout"
    printf "\n[runner] timed out after %ss\n" "${CASE_TIMEOUT_SEC}" >> "${log_file}"
  elif (( exit_code != 0 )); then
    status="failed"
  elif [[ ! -s "${log_file}" ]]; then
    status="failed"
  elif ! grep -a -q '\[session: ' "${log_file}"; then
    status="failed"
  fi

  # Take the last "[session: …]" / "[session-dir: …]" markers from the log;
  # both stay empty when the marker is absent.
  session_id="$(grep -a -oE '\[session: [^]]+]' "${log_file}" | tail -n 1 | sed -E 's/\[session: ([^]]+)\]/\1/' || true)"
  session_dir="$(grep -a -oE '\[session-dir: [^]]+]' "${log_file}" | tail -n 1 | sed -E 's/\[session-dir: ([^]]+)\]/\1/' || true)"

  # Column order must match the manifest header written by the parent process.
  printf "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" \
    "${TIMESTAMP}" \
    "${provider}" \
    "${case_id}" \
    "${status}" \
    "${session_id}" \
    "${session_dir}" \
    "${log_file}" \
    "${started_at}" \
    "${ended_at}" \
    "${duration_sec}" \
    "${exit_code}" > "${result_file}"

  printf "[worker] provider=%s case=%s status=%s duration=%ss session=%s\n" \
    "${provider}" \
    "${case_id}" \
    "${status}" \
    "${duration_sec}" \
    "${session_id:-N/A}"

  # The outcome is carried by the result row, so the worker itself exits 0.
  exit 0
fi
# Parent mode: prepare the output tree and start the manifest with its header row.
mkdir -p "${OUT_DIR}"
mkdir -p "${RESULTS_DIR}"
printf "timestamp\tprovider\tcase_id\tstatus\tsession_id\tsession_dir\tlog_file\tstarted_at\tended_at\tduration_sec\texit_code\n" > "${MANIFEST}"
# PROVIDERS is a whitespace-separated list.
read -r -a PROVIDERS <<< "${PROVIDERS_RAW}"
# Collect the case files matching CASE_GLOB, in sorted order.
CASE_FILES=()
while IFS= read -r line; do
CASE_FILES+=("${line}")
done < <(find "${CASES_DIR}" -maxdepth 1 -type f -name "${CASE_GLOB}" | sort)
if [[ ${#CASE_FILES[@]} -eq 0 ]]; then
echo "No case files matched ${CASE_GLOB} in ${CASES_DIR}" >&2
exit 1
fi
echo "Output directory: ${OUT_DIR}"
echo "Using SMC_DATA_DIR=${SMC_DATA_DIR}"
echo "Using MULTICA_API_URL=${MULTICA_API_URL}"
echo "Providers: ${PROVIDERS[*]}"
echo "Cases: ${#CASE_FILES[@]}"
echo "Max parallel: ${MAX_PARALLEL}"
if [[ "${TIMEOUT_ENABLED}" == "true" ]]; then
echo "Case timeout: ${CASE_TIMEOUT_SEC}s"
else
echo "Case timeout: disabled"
fi
# TASKS is a flat list of (provider, case_file) pairs, one pair per worker run.
TASKS=()
for provider in "${PROVIDERS[@]}"; do
for case_file in "${CASE_FILES[@]}"; do
TASKS+=("${provider}" "${case_file}")
done
done
echo "Total tasks: $(( ${#TASKS[@]} / 2 ))"
# Workers are fresh bash processes of this same script, so they read their settings from the environment.
export TIMESTAMP OUT_DIR RESULTS_DIR SMC_DATA_DIR MULTICA_API_URL CASE_TIMEOUT_SEC TIMEOUT_ENABLED
# NUL-delimited pairs: xargs passes two arguments per worker and runs up to MAX_PARALLEL at once.
printf '%s\0' "${TASKS[@]}" | xargs -0 -n 2 -P "${MAX_PARALLEL}" bash "${BASH_SOURCE[0]}" --worker
# Merge the per-worker result rows into the manifest in sorted filename order.
RESULT_FILES=()
while IFS= read -r line; do
RESULT_FILES+=("${line}")
done < <(find "${RESULTS_DIR}" -maxdepth 1 -type f -name "*.tsv" | sort)
if [[ ${#RESULT_FILES[@]} -eq 0 ]]; then
echo "No result files produced in ${RESULTS_DIR}" >&2
exit 1
fi
for result_file in "${RESULT_FILES[@]}"; do
cat "${result_file}" >> "${MANIFEST}"
done
# Count statuses from column 4, skipping the header row; `c+0` prints 0 when nothing matched.
success_count="$(awk -F '\t' 'NR>1 && $4=="success" {c++} END{print c+0}' "${MANIFEST}")"
failed_count="$(awk -F '\t' 'NR>1 && $4=="failed" {c++} END{print c+0}' "${MANIFEST}")"
timeout_count="$(awk -F '\t' 'NR>1 && $4=="timeout" {c++} END{print c+0}' "${MANIFEST}")"
echo
echo "Completed run stage. Manifest: ${MANIFEST}"
echo "Run summary: success=${success_count} failed=${failed_count} timeout=${timeout_count}"
echo
echo "Running structured analysis..."
# With `pipefail` set, a failing analyzer still fails this script even though its output goes through tee.
node "${SCRIPT_DIR}/analyze.mjs" "${MANIFEST}" | tee "${OUT_DIR}/analysis.txt"

View file

@ -0,0 +1,134 @@
---
name: Meta Skill Installer
description: Detect missing capabilities, search clawhub.ai for matching skills, run security review on candidate skills, and install safe skills into Multica. Use when a task cannot be completed with current skills/tools or when the user asks to discover/install/update skills from ClawHub.
version: 1.0.0
metadata:
tags:
- meta
- skills
- clawhub
- security
install:
- id: node-clawhub
kind: node
package: clawhub
bins: [clawhub]
label: "Install ClawHub CLI"
userInvocable: true
disableModelInvocation: false
---
# Meta Skill Installer
Use this skill to close capability gaps by discovering and installing skills from ClawHub with a mandatory security gate.
## Safety Defaults
- Always run in this order: identify gap -> search -> stage install -> security review -> install to managed dir -> validate.
- Never install directly into the active skills directory before review.
- If risk is `dangerous`, stop and explain why.
- If risk is `needs-review`, ask for explicit user confirmation before final install.
## Resolve Paths and Commands
Use Multica managed skills path, not the current workspace:
```bash
DATA_DIR="${SMC_DATA_DIR:-$HOME/.super-multica}"
SKILLS_DIR="$DATA_DIR/skills"
META_SKILL_DIR="$SKILLS_DIR/meta-skill-installer"
if command -v clawhub >/dev/null 2>&1; then
CLAWHUB_CMD=(clawhub)
else
CLAWHUB_CMD=(npx -y clawhub)
fi
```
If neither command path works, install the CLI first (`npm i -g clawhub`) and retry.
## Workflow
### 1) Detect the Capability Gap
When the current task cannot be completed with existing skills/tools:
- Summarize the missing capability in one sentence.
- Convert it to a focused search query (tool + domain + action).
- Keep the original user intent and success criteria.
### 2) Search ClawHub
Run one or more searches and collect top candidates:
```bash
"${CLAWHUB_CMD[@]}" search "<query>" --limit 10
```
Candidate ranking rules:
- Primary: semantic relevance to the missing capability.
- Secondary: clearer SKILL description and narrower scope.
- Tertiary: lower operational risk (fewer privileged or remote-exec patterns).
### 3) Stage Install in Quarantine Directory
Install candidate skill into a temporary workdir first:
```bash
STAGING_DIR="$(mktemp -d "${TMPDIR:-/tmp}/multica-skill-review.XXXXXX")"
"${CLAWHUB_CMD[@]}" install "<slug>" --workdir "$STAGING_DIR" --dir skills --version "<optional-version>" --force
```
Expected staged path:
```bash
"$STAGING_DIR/skills/<slug>"
```
### 4) Run Security Review
Use this skill's scanner script against the staged skill:
```bash
node "$META_SKILL_DIR/scripts/review-skill-security.mjs" "$STAGING_DIR/skills/<slug>"
```
Interpret scanner output:
- `riskLevel: safe` -> continue to install.
- `riskLevel: needs-review` -> present findings, ask user for explicit confirmation.
- `riskLevel: dangerous` -> block install by default.
### 5) Install to Multica Managed Skills Directory
Only after passing the review gate, install to the directory Multica actually loads:
```bash
mkdir -p "$SKILLS_DIR"
"${CLAWHUB_CMD[@]}" install "<slug>" --workdir "$DATA_DIR" --dir skills --version "<optional-version>" --force
```
If the skill is already installed in the managed directory, use `update` instead:
```bash
"${CLAWHUB_CMD[@]}" update "<slug>" --workdir "$DATA_DIR" --dir skills --version "<optional-version>" --force
```
### 6) Post-Install Validation
Validate presence and scan once more in the final location:
```bash
test -f "$SKILLS_DIR/<slug>/SKILL.md"
node "$META_SKILL_DIR/scripts/review-skill-security.mjs" "$SKILLS_DIR/<slug>"
```
Then retry the original user task with the new skill.
## Guardrails
- Never claim installation success without path-level verification.
- Never hide security findings; summarize concrete files and reasons.
- Prefer pinned versions when available, and report the installed version to the user.
- If the chosen skill requires secrets/API keys, pause after install and ask user to configure required env vars before using it.

View file

@ -0,0 +1,328 @@
#!/usr/bin/env node
import { existsSync, lstatSync, readdirSync, readFileSync } from "node:fs";
import { basename, extname, join, relative, resolve } from "node:path";
// Exactly one positional argument is expected: the skill directory to scan.
// Anything else, including a help flag, prints usage and exits with status 1.
const args = process.argv.slice(2);
const [firstArg] = args;
if (args.length !== 1 || ["--help", "-h"].includes(firstArg)) {
  console.error("Usage: node review-skill-security.mjs <skill-directory>");
  process.exit(1);
}

const targetDir = resolve(firstArg);
if (!existsSync(targetDir)) {
  // A missing target is reported in the same JSON shape as a scan result,
  // with the most severe risk level.
  const failure = {
    targetDir,
    riskLevel: "dangerous",
    error: "Target directory does not exist",
  };
  console.error(JSON.stringify(failure, null, 2));
  process.exit(1);
}
/** Size ceiling (2 MB) above which a file is flagged instead of being read. */
const MAX_TEXT_FILE_BYTES = 2_000_000;

/** Cap on the number of findings reported, to keep the output bounded. */
const MAX_FINDINGS = 200;

/** Directory names that the scan does not descend into. */
const SKIP_DIRS = new Set([
  // Version-control metadata
  ".git", ".hg", ".svn",
  // Dependencies and build output
  "node_modules", "dist", "build",
  // Tool caches
  ".next", ".turbo", ".cache",
]);

/** File extensions whose contents are inspected line by line. */
const TEXT_EXTENSIONS = new Set([
  // Documents and configuration
  ".md", ".txt", ".json", ".yaml", ".yml",
  ".toml", ".ini", ".cfg", ".conf", ".env",
  // Shell scripts
  ".sh", ".bash", ".zsh", ".fish", ".ps1",
  // JavaScript / TypeScript
  ".js", ".mjs", ".cjs", ".ts", ".tsx", ".jsx",
  // Other programming languages
  ".py", ".rb", ".go", ".rs", ".java",
  ".kt", ".swift", ".php", ".lua",
  // Data and markup
  ".sql", ".xml", ".html", ".css",
]);
/**
* @typedef {"safe" | "needs-review" | "dangerous"} RiskLevel
*/
/**
* @typedef {{
* severity: Exclude<RiskLevel, "safe">;
* type: string;
* file: string;
* line?: number;
* message: string;
* snippet?: string;
* }} Finding
*/
/**
 * Per-line heuristics applied to every scanned text file. Each entry has a
 * stable `type`, a `severity`, the `regex` tested against a single line, and
 * the `message` reported when it matches.
 *
 * None of the regexes use the `g` or `y` flags, so `.test()` keeps no state
 * between lines.
 */
const LINE_PATTERNS = [
  {
    type: "network-pipe-shell",
    severity: "dangerous",
    // Also matches when the shell runs through sudo (optionally with flags),
    // e.g. `curl … | sudo bash` or `curl … | sudo -E sh`.
    regex: /\b(?:curl|wget)\b[^\n|]*\|\s*(?:sudo\s+(?:-\S+\s+)*)?(?:ba|z)?sh\b/i,
    message: "Network content piped directly into shell.",
  },
  {
    type: "powershell-iex-download",
    severity: "dangerous",
    regex: /\b(?:invoke-webrequest|iwr)\b[^\n|]*\|\s*iex\b/i,
    message: "Downloaded content executed via PowerShell IEX.",
  },
  {
    type: "destructive-rm-root",
    severity: "dangerous",
    regex: /(?:^|[\s;])(?:sudo\s+)?rm\s+-rf\s+(?:\/(?:\s|$)|~(?:\/|\s|$))/i,
    message: "Potentially destructive recursive delete at root/home scope.",
  },
  {
    type: "device-overwrite",
    severity: "dangerous",
    regex: /\bdd\s+if=.*\s+of=\/dev\/(?:sd[a-z]\d*|nvme\d+n\d+(?:p\d+)?|disk\d+)/i,
    message: "Possible block-device overwrite command.",
  },
  {
    type: "reverse-shell",
    severity: "dangerous",
    regex: /\/dev\/tcp\/|nc\s+-e\s+|bash\s+-i\b.*\/dev\/tcp\//i,
    message: "Potential reverse-shell behavior.",
  },
  {
    type: "sudo-usage",
    severity: "needs-review",
    regex: /(^|[\s;])sudo\s+/i,
    message: "Uses privileged command execution (sudo).",
  },
  {
    type: "remote-download",
    severity: "needs-review",
    regex: /\b(?:curl|wget|invoke-webrequest|iwr)\b.*https?:\/\//i,
    message: "Downloads remote content. Verify source integrity and intent.",
  },
  {
    type: "dynamic-exec-js",
    severity: "needs-review",
    regex: /\bchild_process\.(?:exec|spawn|execSync|spawnSync)\b|\beval\s*\(/i,
    message: "Dynamic execution primitive found in JavaScript/TypeScript.",
  },
  {
    type: "python-shell-exec",
    severity: "needs-review",
    regex: /\bos\.system\s*\(|\bsubprocess\.(?:run|Popen|call)\s*\(.*shell\s*=\s*True/i,
    message: "Shell execution primitive found in Python.",
  },
  {
    type: "secret-env-access",
    severity: "needs-review",
    regex: /process\.env\.[A-Z0-9_]*(?:KEY|TOKEN|SECRET|PASSWORD)|\$\{?[A-Z0-9_]*(?:KEY|TOKEN|SECRET|PASSWORD)\}?/i,
    message: "Reads variables that may contain credentials/secrets.",
  },
];
/**
 * Collapses each whitespace run to a single space, trims the ends, and
 * truncates the result to 200 characters for display in a finding.
 *
 * @param {string} value
 * @returns {string}
 */
function compactSnippet(value) {
  const collapsed = value.split(/\s+/).join(" ");
  return collapsed.trim().substring(0, 200);
}
/**
 * Decides whether a file's contents should be inspected line by line.
 *
 * Files named `SKILL.md` (any letter case) and files with an extension in
 * TEXT_EXTENSIONS are inspected. So are names for which `extname` yields
 * no extension: dotfiles such as `.env` or `.bashrc`, and bare names such as
 * `install` or `Makefile`. Without this, a script with no extension would be
 * skipped entirely. Actual binaries are still filtered out later, because
 * the caller skips any file whose bytes contain a NUL.
 *
 * @param {string} filePath
 * @returns {boolean}
 */
function shouldReadAsText(filePath) {
  const base = basename(filePath).toLowerCase();
  if (base === "skill.md") return true;

  // extname() returns "" for dotfiles and for names without a dot.
  const ext = extname(base);
  if (ext === "") return true;

  return TEXT_EXTENSIONS.has(ext);
}
/**
 * Reads a file and decodes it as UTF-8, unless it contains a NUL byte, in
 * which case it is treated as binary.
 *
 * @param {string} filePath
 * @returns {string | null} decoded text, or null for a file containing a NUL byte
 */
function readTextFile(filePath) {
  const raw = readFileSync(filePath);
  const hasNulByte = raw.indexOf(0) !== -1;
  return hasNulByte ? null : raw.toString("utf-8");
}
// Mutable scan state, filled in while the directory tree is walked and
// reported in the final JSON output.
/** @type {Finding[]} */
const findings = [];
// Regular files encountered, counted before the size and extension checks.
let scannedFiles = 0;
// Files larger than MAX_TEXT_FILE_BYTES, which are flagged rather than read.
let skippedLargeFiles = 0;
// Files whose contents were read but contained a NUL byte.
let skippedBinaryFiles = 0;
// Symbolic links encountered; each one is also recorded as a finding.
let symlinkCount = 0;
/**
 * Records a finding, keeping the list bounded by MAX_FINDINGS.
 *
 * The overall risk level is derived from the stored findings, so a
 * "dangerous" finding must not be lost once the cap is reached. When the
 * list is full and a dangerous finding arrives, the most recently stored
 * non-dangerous finding is evicted to make room. Non-dangerous findings
 * that arrive after the cap are dropped, as are dangerous ones when every
 * stored finding is already dangerous.
 *
 * @param {Finding} finding
 */
function addFinding(finding) {
  if (findings.length < MAX_FINDINGS) {
    findings.push(finding);
    return;
  }
  if (finding.severity !== "dangerous") return;

  for (let i = findings.length - 1; i >= 0; i--) {
    if (findings[i].severity !== "dangerous") {
      // Remove the evicted entry and append, so discovery order is kept.
      findings.splice(i, 1);
      findings.push(finding);
      return;
    }
  }
}
/**
 * Recursively scans `currentDir`, recording findings and updating the scan
 * counters.
 *
 * - A directory that cannot be listed is recorded as a needs-review finding
 *   and skipped, so one unreadable directory does not abort the scan.
 * - Symbolic links are flagged as dangerous and never followed.
 * - Directories named in SKIP_DIRS are not descended into.
 * - Files above MAX_TEXT_FILE_BYTES are flagged and not read.
 * - Remaining files accepted by shouldReadAsText are tested line by line
 *   against every entry in LINE_PATTERNS.
 *
 * @param {string} currentDir
 */
function walk(currentDir) {
  let entries;
  try {
    entries = readdirSync(currentDir, { withFileTypes: true });
  } catch {
    addFinding({
      severity: "needs-review",
      type: "readdir-error",
      file: relative(targetDir, currentDir) || ".",
      message: "Could not list directory. Manual inspection recommended.",
    });
    return;
  }

  for (const entry of entries) {
    const fullPath = join(currentDir, entry.name);
    const relPath = relative(targetDir, fullPath) || ".";

    // lstat, not stat: a symlink must be reported as a link, not as its target.
    let stat;
    try {
      stat = lstatSync(fullPath);
    } catch {
      addFinding({
        severity: "needs-review",
        type: "stat-error",
        file: relPath,
        message: "Could not stat path. Manual inspection recommended.",
      });
      continue;
    }

    if (stat.isSymbolicLink()) {
      symlinkCount++;
      addFinding({
        severity: "dangerous",
        type: "symlink",
        file: relPath,
        message: "Symbolic links can hide path traversal or redirection behavior.",
      });
      continue;
    }

    if (stat.isDirectory()) {
      if (SKIP_DIRS.has(entry.name)) continue;
      walk(fullPath);
      continue;
    }

    // Anything that is neither a directory nor a regular file is ignored.
    if (!stat.isFile()) continue;

    // Counted before the size and extension checks, so this total also
    // includes files that end up not being read.
    scannedFiles++;

    if (stat.size > MAX_TEXT_FILE_BYTES) {
      skippedLargeFiles++;
      addFinding({
        severity: "needs-review",
        type: "large-file",
        file: relPath,
        message: `Large file (${stat.size} bytes) was not fully scanned.`,
      });
      continue;
    }

    if (!shouldReadAsText(fullPath)) continue;

    let content;
    try {
      content = readTextFile(fullPath);
    } catch {
      addFinding({
        severity: "needs-review",
        type: "read-error",
        file: relPath,
        message: "Failed to read file during scan.",
      });
      continue;
    }

    // readTextFile returns null for content containing a NUL byte.
    if (content === null) {
      skippedBinaryFiles++;
      continue;
    }

    const lines = content.split(/\r?\n/);
    for (let i = 0; i < lines.length; i++) {
      const line = lines[i] ?? "";
      if (!line) continue;
      // A single line may match several patterns; each match is recorded.
      for (const pattern of LINE_PATTERNS) {
        if (!pattern.regex.test(line)) continue;
        addFinding({
          severity: pattern.severity,
          type: pattern.type,
          file: relPath,
          line: i + 1,
          message: pattern.message,
          snippet: compactSnippet(line),
        });
      }
    }
  }
}
// Run the scan, then require a SKILL.md at the skill root.
walk(targetDir);

const skillManifestPath = join(targetDir, "SKILL.md");
if (!existsSync(skillManifestPath)) {
  addFinding({
    severity: "dangerous",
    type: "missing-skill-md",
    file: ".",
    message: "SKILL.md not found at skill root.",
  });
}

/** Counts the recorded findings that carry the given severity. */
const countSeverity = (level) =>
  findings.reduce((total, finding) => (finding.severity === level ? total + 1 : total), 0);

const dangerousCount = countSeverity("dangerous");
const reviewCount = countSeverity("needs-review");

// Any dangerous finding dominates; otherwise any needs-review finding does.
/** @type {RiskLevel} */
const riskLevel = dangerousCount > 0
  ? "dangerous"
  : reviewCount > 0
    ? "needs-review"
    : "safe";

const output = {
  targetDir,
  riskLevel,
  summary: {
    scannedFiles,
    symlinkCount,
    skippedLargeFiles,
    skippedBinaryFiles,
    dangerousFindings: dangerousCount,
    reviewFindings: reviewCount,
    totalFindings: findings.length,
    findingsTruncated: findings.length >= MAX_FINDINGS,
  },
  findings,
};

// The JSON report on stdout is the script's result.
console.log(JSON.stringify(output, null, 2));