From 0c1856b54b78d521fe90ad7e8d03c12b9024c290 Mon Sep 17 00:00:00 2001
From: Jiayuan Zhang <forrestchang7@gmail.com>
Date: Tue, 17 Feb 2026 00:36:45 +0800
Subject: [PATCH 1/7] feat(skills): add clawhub meta skill with security gate

---
 skills/meta-skill-installer/SKILL.md          | 134 +++++++
 .../scripts/review-skill-security.mjs         | 328 ++++++++++++++++++
 2 files changed, 462 insertions(+)
 create mode 100644 skills/meta-skill-installer/SKILL.md
 create mode 100644 skills/meta-skill-installer/scripts/review-skill-security.mjs
diff --git a/skills/meta-skill-installer/SKILL.md b/skills/meta-skill-installer/SKILL.md
new file mode 100644
index 00000000..0bc1e7a7
--- /dev/null
+++ b/skills/meta-skill-installer/SKILL.md
@@ -0,0 +1,134 @@
+---
+name: Meta Skill Installer
+description: Detect missing capabilities, search clawhub.ai for matching skills, run security review on candidate skills, and install safe skills into Multica. Use when a task cannot be completed with current skills/tools or when the user asks to discover/install/update skills from ClawHub.
+version: 1.0.0
+metadata:
+  tags:
+    - meta
+    - skills
+    - clawhub
+    - security
+  install:
+    - id: node-clawhub
+      kind: node
+      package: clawhub
+      bins: [clawhub]
+      label: "Install ClawHub CLI"
+userInvocable: true
+disableModelInvocation: false
+---
+
+# Meta Skill Installer
+
+Use this skill to close capability gaps by discovering and installing skills from ClawHub with a mandatory security gate.
+
+## Safety Defaults
+
+- Always run in this order: identify gap -> search -> stage install -> security review -> install to managed dir -> validate.
+- Never install directly into the active skills directory before review.
+- If risk is `dangerous`, stop and explain why.
+- If risk is `needs-review`, ask for explicit user confirmation before final install.
+
+## Resolve Paths and Commands
+
+Use Multica managed skills path, not the current workspace:
+
+```bash
+DATA_DIR="${SMC_DATA_DIR:-$HOME/.super-multica}"
+SKILLS_DIR="$DATA_DIR/skills"
+META_SKILL_DIR="$SKILLS_DIR/meta-skill-installer"
+
+if command -v clawhub >/dev/null 2>&1; then
+  CLAWHUB_CMD=(clawhub)
+else
+  CLAWHUB_CMD=(npx -y clawhub)
+fi
+```
+
+If neither command path works, install the CLI first (`npm i -g clawhub`) and retry.
+
+## Workflow
+
+### 1) Detect the Capability Gap
+
+When the current task cannot be completed with existing skills/tools:
+
+- Summarize the missing capability in one sentence.
+- Convert it to a focused search query (tool + domain + action).
+- Keep the original user intent and success criteria.
+
+### 2) Search ClawHub
+
+Run one or more searches and collect top candidates:
+
+```bash
+"${CLAWHUB_CMD[@]}" search "<query>" --limit 10
+```
+
+Candidate ranking rules:
+
+- Primary: semantic relevance to the missing capability.
+- Secondary: clearer SKILL description and narrower scope.
+- Tertiary: lower operational risk (fewer privileged or remote-exec patterns).
+
+### 3) Stage Install in Quarantine Directory
+
+Install candidate skill into a temporary workdir first:
+
+```bash
+STAGING_DIR="$(mktemp -d "${TMPDIR:-/tmp}/multica-skill-review.XXXXXX")"
+"${CLAWHUB_CMD[@]}" install "<slug>" --workdir "$STAGING_DIR" --dir skills --version "<optional-version>" --force
+```
+
+Expected staged path:
+
+```bash
+"$STAGING_DIR/skills/<slug>"
+```
+
+### 4) Run Security Review
+
+Use this skill's scanner script against the staged skill:
+
+```bash
+node "$META_SKILL_DIR/scripts/review-skill-security.mjs" "$STAGING_DIR/skills/<slug>"
+```
+
+Interpret scanner output:
+
+- `riskLevel: safe` -> continue to install.
+- `riskLevel: needs-review` -> present findings, ask user for explicit confirmation.
+- `riskLevel: dangerous` -> block install by default.
+
+### 5) Install to Multica Managed Skills Directory
+
+Only after passing the review gate, install to the directory Multica actually loads:
+
+```bash
+mkdir -p "$SKILLS_DIR"
+"${CLAWHUB_CMD[@]}" install "<slug>" --workdir "$DATA_DIR" --dir skills --version "<optional-version>" --force
+```
+
+If skill already exists, use update:
+
+```bash
+"${CLAWHUB_CMD[@]}" update "<slug>" --workdir "$DATA_DIR" --dir skills --version "<optional-version>" --force
+```
+
+### 6) Post-Install Validation
+
+Validate presence and scan once more in the final location:
+
+```bash
+test -f "$SKILLS_DIR/<slug>/SKILL.md"
+node "$META_SKILL_DIR/scripts/review-skill-security.mjs" "$SKILLS_DIR/<slug>"
+```
+
+Then retry the original user task with the new skill.
+
+## Guardrails
+
+- Never claim installation success without path-level verification.
+- Never hide security findings; summarize concrete files and reasons.
+- Prefer pinned versions when available, and report the installed version to the user.
+- If the chosen skill requires secrets/API keys, pause after install and ask user to configure required env vars before using it.
diff --git a/skills/meta-skill-installer/scripts/review-skill-security.mjs b/skills/meta-skill-installer/scripts/review-skill-security.mjs
new file mode 100644
index 00000000..f68aa5a3
--- /dev/null
+++ b/skills/meta-skill-installer/scripts/review-skill-security.mjs
@@ -0,0 +1,328 @@
+#!/usr/bin/env node
+
+import { existsSync, lstatSync, readdirSync, readFileSync } from "node:fs";
+import { basename, extname, join, relative, resolve } from "node:path";
+
+const args = process.argv.slice(2);
+if (args.length !== 1 || args[0] === "--help" || args[0] === "-h") {
+  console.error("Usage: node review-skill-security.mjs <skill-directory>");
+  process.exit(1);
+}
+
+const targetDir = resolve(args[0]);
+if (!existsSync(targetDir)) {
+  console.error(JSON.stringify({
+    targetDir,
+    riskLevel: "dangerous",
+    error: "Target directory does not exist",
+  }, null, 2));
+  process.exit(1);
+}
+
+/** Maximum file size to inspect as text (2 MB). */
+const MAX_TEXT_FILE_BYTES = 2_000_000;
+/** Maximum findings returned to avoid huge output. */
+const MAX_FINDINGS = 200;
+
+const SKIP_DIRS = new Set([
+  ".git",
+  ".hg",
+  ".svn",
+  "node_modules",
+  "dist",
+  "build",
+  ".next",
+  ".turbo",
+  ".cache",
+]);
+
+const TEXT_EXTENSIONS = new Set([
+  ".md",
+  ".txt",
+  ".json",
+  ".yaml",
+  ".yml",
+  ".toml",
+  ".ini",
+  ".cfg",
+  ".conf",
+  ".env",
+  ".sh",
+  ".bash",
+  ".zsh",
+  ".fish",
+  ".ps1",
+  ".js",
+  ".mjs",
+  ".cjs",
+  ".ts",
+  ".tsx",
+  ".jsx",
+  ".py",
+  ".rb",
+  ".go",
+  ".rs",
+  ".java",
+  ".kt",
+  ".swift",
+  ".php",
+  ".lua",
+  ".sql",
+  ".xml",
+  ".html",
+  ".css",
+]);
+
+/**
+ * @typedef {"safe" | "needs-review" | "dangerous"} RiskLevel
+ */
+
+/**
+ * @typedef {{
+ *   severity: Exclude<RiskLevel, "safe">;
+ *   type: string;
+ *   file: string;
+ *   line?: number;
+ *   message: string;
+ *   snippet?: string;
+ * }} Finding
+ */
+
+const LINE_PATTERNS = [
+  {
+    type: "network-pipe-shell",
+    severity: "dangerous",
+    regex: /\b(?:curl|wget)\b[^\n|]*\|\s*(?:ba|z)?sh\b/i,
+    message: "Network content piped directly into shell.",
+  },
+  {
+    type: "powershell-iex-download",
+    severity: "dangerous",
+    regex: /\b(?:invoke-webrequest|iwr)\b[^\n|]*\|\s*iex\b/i,
+    message: "Downloaded content executed via PowerShell IEX.",
+  },
+  {
+    type: "destructive-rm-root",
+    severity: "dangerous",
+    regex: /(?:^|[\s;])(?:sudo\s+)?rm\s+-rf\s+(?:\/(?:\s|$)|~(?:\/|\s|$))/i,
+    message: "Potentially destructive recursive delete at root/home scope.",
+  },
+  {
+    type: "device-overwrite",
+    severity: "dangerous",
+    regex: /\bdd\s+if=.*\s+of=\/dev\/(?:sd[a-z]\d*|nvme\d+n\d+(?:p\d+)?|disk\d+)/i,
+    message: "Possible block-device overwrite command.",
+  },
+  {
+    type: "reverse-shell",
+    severity: "dangerous",
+    regex: /\/dev\/tcp\/|nc\s+-e\s+|bash\s+-i\b.*\/dev\/tcp\//i,
+    message: "Potential reverse-shell behavior.",
+  },
+  {
+    type: "sudo-usage",
+    severity: "needs-review",
+    regex: /(^|[\s;])sudo\s+/i,
+    message: "Uses privileged command execution (sudo).",
+  },
+  {
+    type: "remote-download",
+    severity: "needs-review",
+    regex: /\b(?:curl|wget|invoke-webrequest|iwr)\b.*https?:\/\//i,
+    message: "Downloads remote content. Verify source integrity and intent.",
+  },
+  {
+    type: "dynamic-exec-js",
+    severity: "needs-review",
+    regex: /\bchild_process\.(?:exec|spawn|execSync|spawnSync)\b|\beval\s*\(/i,
+    message: "Dynamic execution primitive found in JavaScript/TypeScript.",
+  },
+  {
+    type: "python-shell-exec",
+    severity: "needs-review",
+    regex: /\bos\.system\s*\(|\bsubprocess\.(?:run|Popen|call)\s*\(.*shell\s*=\s*True/i,
+    message: "Shell execution primitive found in Python.",
+  },
+  {
+    type: "secret-env-access",
+    severity: "needs-review",
+    regex: /process\.env\.[A-Z0-9_]*(?:KEY|TOKEN|SECRET|PASSWORD)|\$\{?[A-Z0-9_]*(?:KEY|TOKEN|SECRET|PASSWORD)\}?/i,
+    message: "Reads variables that may contain credentials/secrets.",
+  },
+];
+
+/**
+ * @param {string} value
+ * @returns {string}
+ */
+function compactSnippet(value) {
+  return value.replace(/\s+/g, " ").trim().slice(0, 200);
+}
+
+/**
+ * @param {string} filePath
+ * @returns {boolean}
+ */
+function shouldReadAsText(filePath) {
+  const base = basename(filePath).toLowerCase();
+  if (base === "skill.md") return true;
+  return TEXT_EXTENSIONS.has(extname(filePath).toLowerCase());
+}
+
+/**
+ * @param {string} filePath
+ * @returns {string | null}
+ */
+function readTextFile(filePath) {
+  const buf = readFileSync(filePath);
+  if (buf.includes(0)) return null;
+  return buf.toString("utf-8");
+}
+
+/** @type {Finding[]} */
+const findings = [];
+let scannedFiles = 0;
+let skippedLargeFiles = 0;
+let skippedBinaryFiles = 0;
+let symlinkCount = 0;
+
+/**
+ * @param {Finding} finding
+ */
+function addFinding(finding) {
+  if (findings.length >= MAX_FINDINGS) return;
+  findings.push(finding);
+}
+
+/**
+ * @param {string} currentDir
+ */
+function walk(currentDir) {
+  const entries = readdirSync(currentDir, { withFileTypes: true });
+
+  for (const entry of entries) {
+    const fullPath = join(currentDir, entry.name);
+    const relPath = relative(targetDir, fullPath) || ".";
+
+    let stat;
+    try {
+      stat = lstatSync(fullPath);
+    } catch {
+      addFinding({
+        severity: "needs-review",
+        type: "stat-error",
+        file: relPath,
+        message: "Could not stat path. Manual inspection recommended.",
+      });
+      continue;
+    }
+
+    if (stat.isSymbolicLink()) {
+      symlinkCount++;
+      addFinding({
+        severity: "dangerous",
+        type: "symlink",
+        file: relPath,
+        message: "Symbolic links can hide path traversal or redirection behavior.",
+      });
+      continue;
+    }
+
+    if (stat.isDirectory()) {
+      if (SKIP_DIRS.has(entry.name)) continue;
+      walk(fullPath);
+      continue;
+    }
+
+    if (!stat.isFile()) continue;
+    scannedFiles++;
+
+    if (stat.size > MAX_TEXT_FILE_BYTES) {
+      skippedLargeFiles++;
+      addFinding({
+        severity: "needs-review",
+        type: "large-file",
+        file: relPath,
+        message: `Large file (${stat.size} bytes) was not fully scanned.`,
+      });
+      continue;
+    }
+
+    if (!shouldReadAsText(fullPath)) continue;
+
+    let content;
+    try {
+      content = readTextFile(fullPath);
+    } catch {
+      addFinding({
+        severity: "needs-review",
+        type: "read-error",
+        file: relPath,
+        message: "Failed to read file during scan.",
+      });
+      continue;
+    }
+
+    if (content === null) {
+      skippedBinaryFiles++;
+      continue;
+    }
+
+    const lines = content.split(/\r?\n/);
+    for (let i = 0; i < lines.length; i++) {
+      const line = lines[i] ?? "";
+      if (!line) continue;
+      for (const pattern of LINE_PATTERNS) {
+        if (!pattern.regex.test(line)) continue;
+        addFinding({
+          severity: pattern.severity,
+          type: pattern.type,
+          file: relPath,
+          line: i + 1,
+          message: pattern.message,
+          snippet: compactSnippet(line),
+        });
+      }
+    }
+  }
+}
+
+walk(targetDir);
+
+if (!existsSync(join(targetDir, "SKILL.md"))) {
+  addFinding({
+    severity: "dangerous",
+    type: "missing-skill-md",
+    file: ".",
+    message: "SKILL.md not found at skill root.",
+  });
+}
+
+const dangerousCount = findings.filter((f) => f.severity === "dangerous").length;
+const reviewCount = findings.filter((f) => f.severity === "needs-review").length;
+
+/** @type {RiskLevel} */
+let riskLevel = "safe";
+if (dangerousCount > 0) {
+  riskLevel = "dangerous";
+} else if (reviewCount > 0) {
+  riskLevel = "needs-review";
+}
+
+const output = {
+  targetDir,
+  riskLevel,
+  summary: {
+    scannedFiles,
+    symlinkCount,
+    skippedLargeFiles,
+    skippedBinaryFiles,
+    dangerousFindings: dangerousCount,
+    reviewFindings: reviewCount,
+    totalFindings: findings.length,
+    findingsTruncated: findings.length >= MAX_FINDINGS,
+  },
+  findings,
+};
+
+console.log(JSON.stringify(output, null, 2));

From 2074aac49e8b4b9b1e19b6cd2024d18141d89f51 Mon Sep 17 00:00:00 2001
From: Jiayuan Zhang <forrestchang7@gmail.com>
Date: Tue, 17 Feb 2026 00:50:01 +0800
Subject: [PATCH 2/7] feat(e2e): add clawhub skills benchmark suite

---
 docs/e2e-skills-benchmark.md                  |  94 ++++++
 package.json                                  |   3 +-
 scripts/e2e-skills-benchmark/analyze.mjs      | 297 ++++++++++++++++++
 .../cases/case-01-install-caldav-calendar.txt |  15 +
 .../case-02-gap-discovery-homeassistant.txt   |  16 +
 .../case-03-install-update-codexmonitor.txt   |  16 +
 scripts/e2e-skills-benchmark/run.sh           | 170 ++++++++++
 7 files changed, 610 insertions(+), 1 deletion(-)
 create mode 100644 docs/e2e-skills-benchmark.md
 create mode 100755 scripts/e2e-skills-benchmark/analyze.mjs
 create mode 100644 scripts/e2e-skills-benchmark/cases/case-01-install-caldav-calendar.txt
 create mode 100644 scripts/e2e-skills-benchmark/cases/case-02-gap-discovery-homeassistant.txt
 create mode 100644 scripts/e2e-skills-benchmark/cases/case-03-install-update-codexmonitor.txt
 create mode 100755 scripts/e2e-skills-benchmark/run.sh

diff --git a/docs/e2e-skills-benchmark.md b/docs/e2e-skills-benchmark.md
new file mode 100644
index 00000000..e9859624
--- /dev/null
+++ b/docs/e2e-skills-benchmark.md
@@ -0,0 +1,94 @@
+# Skills Agent-Driven E2E Benchmark
+
+This benchmark validates the meta skill workflow for capability-gap discovery, ClawHub installation, and security-gated rollout.
+
+## Scope
+
+- Domain: skill discovery + installation + update
+- Focus: `skills/meta-skill-installer`
+- Providers: default `kimi-coding` (override with `PROVIDERS`)
+- Cases: 3
+
+Case prompts are stored in:
+- `scripts/e2e-skills-benchmark/cases/`
+
+## Real ClawHub Examples Used
+
+The case set references real public pages from ClawHub:
+
+- [CalDAV Calendar](https://clawhub.ai/skills/caldav-calendar)
+- [Home Assistant](https://clawhub.ai/skills/homeassistant)
+- [CodexMonitor](https://clawhub.ai/odrobnik/codexmonitor)
+
+## Prerequisites
+
+1. Credentials configured (`pnpm multica credentials init` if needed)
+2. Dependencies installed in repo (`pnpm install`)
+3. `clawhub` CLI available, or allow runtime fallback to `npx -y clawhub`
+4. Required env:
+
+```bash
+export SMC_DATA_DIR=~/.super-multica-e2e
+export MULTICA_API_URL=https://api-dev.copilothub.ai
+```
+
+## Run Benchmark
+
+```bash
+scripts/e2e-skills-benchmark/run.sh
+```
+
+Defaults:
+
+- Providers: `kimi-coding`
+- Case glob: `case-*.txt`
+- Max parallel workers: `1`
+- Per-case timeout: `1200s` (`CASE_TIMEOUT_SEC=0` to disable)
+- Output directory: `.context/skills-e2e-runs/<timestamp>/`
+
+Generated artifacts:
+
+- `manifest.tsv`: provider/case/status/session/log metadata
+- `analysis.txt`: human-readable pass/fail report
+- `analysis.json`: structured detailed check output
+
+## Run Subset
+
+Only one case:
+
+```bash
+CASE_GLOB="case-01-*.txt" scripts/e2e-skills-benchmark/run.sh
+```
+
+Multiple providers:
+
+```bash
+PROVIDERS="kimi-coding claude-code" scripts/e2e-skills-benchmark/run.sh
+```
+
+Faster throughput:
+
+```bash
+MAX_PARALLEL=2 CASE_TIMEOUT_SEC=1800 scripts/e2e-skills-benchmark/run.sh
+```
+
+## Analyzer Checks
+
+For each run:
+
+1. `run_start` and `run_end` both present
+2. `run_end.error` is empty/null
+3. `tool_start` and `tool_end` are paired
+4. no `tool_end.is_error=true`
+5. at least one `exec` tool call exists
+6. case-specific command evidence in `tool_start.args`:
+   - `clawhub search`
+   - `clawhub install`
+   - `review-skill-security.mjs`
+   - for case 03 also `clawhub update`
+
+## Notes
+
+- These are agent-driven tests; prompt intent plus run-log evidence are both evaluated.
+- `SMC_DATA_DIR=~/.super-multica-e2e` avoids polluting normal user skill/session data.
+- If a case fails, open `manifest.tsv` and inspect the matching `session_dir/run-log.jsonl`.
diff --git a/package.json b/package.json
index 80b868f4..47661ee2 100644
--- a/package.json
+++ b/package.json
@@ -30,7 +30,8 @@
     "typecheck": "turbo typecheck",
     "test": "vitest run",
     "test:watch": "vitest",
-    "test:coverage": "vitest run --coverage"
+    "test:coverage": "vitest run --coverage",
+    "e2e:skills": "bash scripts/e2e-skills-benchmark/run.sh"
   },
   "keywords": [],
   "author": "",
diff --git a/scripts/e2e-skills-benchmark/analyze.mjs b/scripts/e2e-skills-benchmark/analyze.mjs
new file mode 100755
index 00000000..ac090783
--- /dev/null
+++ b/scripts/e2e-skills-benchmark/analyze.mjs
@@ -0,0 +1,297 @@
+#!/usr/bin/env node
+
+import { existsSync, readFileSync, writeFileSync } from "node:fs";
+import { dirname, join, resolve } from "node:path";
+
+/**
+ * @typedef {{
+ *   id: string;
+ *   check: string;
+ *   passed: boolean;
+ *   detail?: string;
+ * }} CheckResult
+ */
+
+/**
+ * @typedef {{
+ *   provider: string;
+ *   caseId: string;
+ *   status: string;
+ *   sessionId: string;
+ *   sessionDir: string;
+ *   logFile: string;
+ *   checks: CheckResult[];
+ *   pass: boolean;
+ * }} CaseAnalysis
+ */
+
+const manifestArg = process.argv[2];
+if (!manifestArg || manifestArg === "--help" || manifestArg === "-h") {
+  console.log("Usage: node scripts/e2e-skills-benchmark/analyze.mjs <manifest.tsv>");
+  process.exit(0);
+}
+
+const manifestPath = resolve(manifestArg);
+if (!existsSync(manifestPath)) {
+  console.error(`Manifest not found: ${manifestPath}`);
+  process.exit(1);
+}
+
+const CASE_RULES = {
+  "case-01-install-caldav-calendar": {
+    requiredCommandTokens: [
+      ["clawhub", "search"],
+      ["caldav"],
+      ["clawhub", "install"],
+      ["review-skill-security.mjs"],
+    ],
+  },
+  "case-02-gap-discovery-homeassistant": {
+    requiredCommandTokens: [
+      ["clawhub", "search"],
+      ["home", "assistant"],
+      ["clawhub", "install"],
+      ["review-skill-security.mjs"],
+    ],
+  },
+  "case-03-install-update-codexmonitor": {
+    requiredCommandTokens: [
+      ["clawhub", "search"],
+      ["codexmonitor"],
+      ["clawhub", "install"],
+      ["clawhub", "update"],
+      ["review-skill-security.mjs"],
+    ],
+  },
+};
+
+/**
+ * @param {string} text
+ * @returns {string[]}
+ */
+function splitLines(text) {
+  return text.split(/\r?\n/).filter(Boolean);
+}
+
+/**
+ * @param {string} command
+ * @param {string[]} tokens
+ * @returns {boolean}
+ */
+function commandHasTokens(command, tokens) {
+  const lower = command.toLowerCase();
+  return tokens.every((token) => lower.includes(token.toLowerCase()));
+}
+
+/**
+ * @param {string} rawArgs
+ * @returns {string}
+ */
+function extractCommand(rawArgs) {
+  if (!rawArgs) return "";
+  try {
+    const parsed = JSON.parse(rawArgs);
+    if (parsed && typeof parsed.command === "string") {
+      return parsed.command;
+    }
+  } catch {
+    // Fall through: args may be truncated JSON in run-log.
+  }
+  return rawArgs;
+}
+
+/**
+ * @param {string} runLogPath
+ */
+function parseRunLog(runLogPath) {
+  const lines = splitLines(readFileSync(runLogPath, "utf-8"));
+  const events = [];
+  for (const line of lines) {
+    try {
+      events.push(JSON.parse(line));
+    } catch {
+      // Ignore malformed lines but keep analysis alive.
+    }
+  }
+  return events;
+}
+
+/**
+ * @param {CaseAnalysis} analysis
+ * @param {string} id
+ * @param {string} check
+ * @param {boolean} passed
+ * @param {string} [detail]
+ */
+function addCheck(analysis, id, check, passed, detail) {
+  analysis.checks.push({ id, check, passed, detail });
+}
+
+const rows = splitLines(readFileSync(manifestPath, "utf-8"));
+if (rows.length <= 1) {
+  console.error(`Manifest has no data rows: ${manifestPath}`);
+  process.exit(1);
+}
+
+/** @type {CaseAnalysis[]} */
+const analyses = [];
+
+for (let i = 1; i < rows.length; i++) {
+  const row = rows[i];
+  if (!row) continue;
+
+  const cols = row.split("\t");
+  if (cols.length < 11) continue;
+
+  const provider = cols[1] ?? "";
+  const caseId = cols[2] ?? "";
+  const status = cols[3] ?? "";
+  const sessionId = cols[4] ?? "";
+  const sessionDir = cols[5] ?? "";
+  const logFile = cols[6] ?? "";
+
+  /** @type {CaseAnalysis} */
+  const analysis = {
+    provider,
+    caseId,
+    status,
+    sessionId,
+    sessionDir,
+    logFile,
+    checks: [],
+    pass: false,
+  };
+
+  addCheck(
+    analysis,
+    "run-status",
+    "runner status is success",
+    status === "success",
+    `status=${status}`,
+  );
+
+  if (!sessionDir) {
+    addCheck(analysis, "session-dir", "session_dir exists in manifest", false, "missing session_dir");
+    analyses.push(analysis);
+    continue;
+  }
+
+  const runLogPath = join(sessionDir, "run-log.jsonl");
+  addCheck(
+    analysis,
+    "run-log-file",
+    "run-log.jsonl exists",
+    existsSync(runLogPath),
+    runLogPath,
+  );
+
+  if (!existsSync(runLogPath)) {
+    analyses.push(analysis);
+    continue;
+  }
+
+  const events = parseRunLog(runLogPath);
+  const runStarts = events.filter((e) => e.event === "run_start");
+  const runEnds = events.filter((e) => e.event === "run_end");
+  const toolStarts = events.filter((e) => e.event === "tool_start");
+  const toolEnds = events.filter((e) => e.event === "tool_end");
+  const errorToolEnds = toolEnds.filter((e) => e.is_error === true);
+
+  addCheck(analysis, "event-run-start", "has run_start", runStarts.length > 0, `count=${runStarts.length}`);
+  addCheck(analysis, "event-run-end", "has run_end", runEnds.length > 0, `count=${runEnds.length}`);
+  addCheck(
+    analysis,
+    "tool-pairing",
+    "tool_start count matches tool_end count",
+    toolStarts.length === toolEnds.length,
+    `start=${toolStarts.length} end=${toolEnds.length}`,
+  );
+
+  const finalRunEnd = runEnds.at(-1);
+  const runEndError = finalRunEnd?.error;
+  addCheck(
+    analysis,
+    "run-end-error",
+    "final run_end.error is null/empty",
+    runEndError === null || runEndError === undefined || runEndError === "",
+    `error=${String(runEndError)}`,
+  );
+
+  addCheck(
+    analysis,
+    "tool-errors",
+    "no tool_end has is_error=true",
+    errorToolEnds.length === 0,
+    `error_tool_calls=${errorToolEnds.length}`,
+  );
+
+  const execCommands = toolStarts
+    .filter((e) => e.tool === "exec")
+    .map((e) => extractCommand(typeof e.args === "string" ? e.args : ""))
+    .filter(Boolean);
+
+  addCheck(
+    analysis,
+    "exec-usage",
+    "at least one exec command was used",
+    execCommands.length > 0,
+    `exec_calls=${execCommands.length}`,
+  );
+
+  const rules = CASE_RULES[caseId];
+  if (rules) {
+    for (let r = 0; r < rules.requiredCommandTokens.length; r++) {
+      const tokenList = rules.requiredCommandTokens[r];
+      const passed = execCommands.some((cmd) => commandHasTokens(cmd, tokenList));
+      addCheck(
+        analysis,
+        `cmd-${r + 1}`,
+        `exec command contains tokens: ${tokenList.join(" + ")}`,
+        passed,
+      );
+    }
+  } else {
+    addCheck(
+      analysis,
+      "case-rules",
+      "case has rule set",
+      false,
+      `No rules defined for case_id=${caseId}`,
+    );
+  }
+
+  analysis.pass = analysis.checks.every((c) => c.passed);
+  analyses.push(analysis);
+}
+
+const passedCases = analyses.filter((a) => a.pass).length;
+const failedCases = analyses.length - passedCases;
+
+const output = {
+  manifestPath,
+  totalCases: analyses.length,
+  passedCases,
+  failedCases,
+  results: analyses,
+};
+
+const outputPath = join(dirname(manifestPath), "analysis.json");
+writeFileSync(outputPath, JSON.stringify(output, null, 2) + "\n", "utf-8");
+
+for (const item of analyses) {
+  const status = item.pass ? "PASS" : "FAIL";
+  console.log(`[${status}] provider=${item.provider} case=${item.caseId} session=${item.sessionId || "N/A"}`);
+  for (const check of item.checks) {
+    const marker = check.passed ? "  [ok]  " : "  [bad] ";
+    const detail = check.detail ? ` (${check.detail})` : "";
+    console.log(`${marker}${check.check}${detail}`);
+  }
+}
+
+console.log("");
+console.log(`Analysis file: ${outputPath}`);
+console.log(`Summary: pass=${passedCases} fail=${failedCases}`);
+
+if (failedCases > 0) {
+  process.exit(1);
+}
diff --git a/scripts/e2e-skills-benchmark/cases/case-01-install-caldav-calendar.txt b/scripts/e2e-skills-benchmark/cases/case-01-install-caldav-calendar.txt
new file mode 100644
index 00000000..2b1f6571
--- /dev/null
+++ b/scripts/e2e-skills-benchmark/cases/case-01-install-caldav-calendar.txt
@@ -0,0 +1,15 @@
+Run an end-to-end test for the Meta Skill Installer.
+
+Goal: install a real ClawHub skill for CalDAV calendar capability.
+Reference page: https://clawhub.ai/skills/caldav-calendar
+
+Follow this exact workflow:
+1. State the missing capability in one sentence.
+2. Search ClawHub for CalDAV-related skills and choose the best candidate.
+3. Stage-install to a temporary directory first (never install directly to active skills path).
+4. Run security review on the staged skill:
+   DATA_DIR="${SMC_DATA_DIR:-$HOME/.super-multica}"
+   node "$DATA_DIR/skills/meta-skill-installer/scripts/review-skill-security.mjs" "<staged skill path>"
+5. If riskLevel is safe, install to "$DATA_DIR/skills".
+6. Verify final install by checking "$DATA_DIR/skills/<slug>/SKILL.md" exists.
+7. Return a short report: selected slug, riskLevel, final install path.
diff --git a/scripts/e2e-skills-benchmark/cases/case-02-gap-discovery-homeassistant.txt b/scripts/e2e-skills-benchmark/cases/case-02-gap-discovery-homeassistant.txt
new file mode 100644
index 00000000..a72d65f6
--- /dev/null
+++ b/scripts/e2e-skills-benchmark/cases/case-02-gap-discovery-homeassistant.txt
@@ -0,0 +1,16 @@
+Run an end-to-end capability-gap discovery test for Meta Skill Installer.
+
+User intent: "I need to control Home Assistant lights and switches from the agent."
+Reference page: https://clawhub.ai/skills/homeassistant
+
+Requirements:
+1. Treat this as a missing capability and explicitly define the gap.
+2. Search ClawHub for relevant skills and list the top 3 candidates.
+3. Pick one candidate with rationale (scope match + lower security risk).
+4. Stage-install to a temporary directory.
+5. Run security review:
+   DATA_DIR="${SMC_DATA_DIR:-$HOME/.super-multica}"
+   node "$DATA_DIR/skills/meta-skill-installer/scripts/review-skill-security.mjs" "<staged skill path>"
+6. If riskLevel is safe, install to "$DATA_DIR/skills".
+7. Verify "$DATA_DIR/skills/<slug>/SKILL.md" exists.
+8. Return: candidate list, chosen slug, riskLevel, and final path.
diff --git a/scripts/e2e-skills-benchmark/cases/case-03-install-update-codexmonitor.txt b/scripts/e2e-skills-benchmark/cases/case-03-install-update-codexmonitor.txt
new file mode 100644
index 00000000..9c828b62
--- /dev/null
+++ b/scripts/e2e-skills-benchmark/cases/case-03-install-update-codexmonitor.txt
@@ -0,0 +1,16 @@
+Run an end-to-end install+update regression test for Meta Skill Installer.
+
+Goal: use a real ClawHub skill and verify install, review, and update flow.
+Reference page: https://clawhub.ai/odrobnik/codexmonitor
+
+Requirements:
+1. Search ClawHub for CodexMonitor and select the matching skill slug.
+2. Stage-install to a temporary directory and run security review:
+   DATA_DIR="${SMC_DATA_DIR:-$HOME/.super-multica}"
+   node "$DATA_DIR/skills/meta-skill-installer/scripts/review-skill-security.mjs" "<staged skill path>"
+3. If riskLevel is safe, install to "$DATA_DIR/skills".
+4. Verify "$DATA_DIR/skills/<slug>/SKILL.md" exists.
+5. Run an update for the same slug in managed dir:
+   clawhub update "<slug>" --workdir "$DATA_DIR" --dir skills --force
+6. Run security review again on the final installed path.
+7. Return: slug, initial riskLevel, update executed (yes/no), final path.
diff --git a/scripts/e2e-skills-benchmark/run.sh b/scripts/e2e-skills-benchmark/run.sh
new file mode 100755
index 00000000..01c873cf
--- /dev/null
+++ b/scripts/e2e-skills-benchmark/run.sh
@@ -0,0 +1,170 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+ROOT_DIR="$(cd "${SCRIPT_DIR}/../.." && pwd)"
+CASES_DIR="${SCRIPT_DIR}/cases"
+TIMESTAMP="${TIMESTAMP:-$(date +%Y%m%d-%H%M%S)}"
+OUT_DIR="${OUT_DIR:-${ROOT_DIR}/.context/skills-e2e-runs/${TIMESTAMP}}"
+RESULTS_DIR="${OUT_DIR}/results"
+MANIFEST="${OUT_DIR}/manifest.tsv"
+
+# Required environment for agent-driven E2E.
+SMC_DATA_DIR="${SMC_DATA_DIR:-$HOME/.super-multica-e2e}"
+MULTICA_API_URL="${MULTICA_API_URL:-https://api-dev.copilothub.ai}"
+PROVIDERS_RAW="${PROVIDERS:-kimi-coding}"
+CASE_GLOB="${CASE_GLOB:-case-*.txt}"
+CASE_TIMEOUT_SEC="${CASE_TIMEOUT_SEC:-1200}"
+MAX_PARALLEL="${MAX_PARALLEL:-1}"
+TIMEOUT_ENABLED="true"
+if [[ "${CASE_TIMEOUT_SEC}" =~ ^[0-9]+$ ]] && (( CASE_TIMEOUT_SEC <= 0 )); then
+  TIMEOUT_ENABLED="false"
+fi
+
+if ! [[ "${MAX_PARALLEL}" =~ ^[0-9]+$ ]] || (( MAX_PARALLEL <= 0 )); then
+  echo "MAX_PARALLEL must be a positive integer, got: ${MAX_PARALLEL}" >&2
+  exit 1
+fi
+
+if [[ "${1:-}" == "--worker" ]]; then
+  provider="${2:?missing provider}"
+  case_file="${3:?missing case file}"
+  case_base="$(basename "${case_file}")"
+  case_id="${case_base%.txt}"
+  log_file="${OUT_DIR}/${provider}-${case_id}.log"
+  result_file="${RESULTS_DIR}/${provider}-${case_id}.tsv"
+
+  prompt="$(cat "${case_file}")"
+
+  status="success"
+  timed_out="false"
+  started_epoch="$(date +%s)"
+  started_at="$(date -u +%Y-%m-%dT%H:%M:%SZ)"
+
+  SMC_DATA_DIR="${SMC_DATA_DIR}" \
+    MULTICA_API_URL="${MULTICA_API_URL}" \
+    pnpm multica run --run-log --provider "${provider}" "${prompt}" > "${log_file}" 2>&1 &
+  cmd_pid=$!
+
+  while kill -0 "${cmd_pid}" 2>/dev/null; do
+    if [[ "${TIMEOUT_ENABLED}" == "true" ]]; then
+      now="$(date +%s)"
+      elapsed="$((now - started_epoch))"
+      if (( elapsed >= CASE_TIMEOUT_SEC )); then
+        timed_out="true"
+        kill "${cmd_pid}" 2>/dev/null || true
+        sleep 1
+        kill -9 "${cmd_pid}" 2>/dev/null || true
+        break
+      fi
+    fi
+    sleep 2
+  done
+
+  exit_code=0
+  wait "${cmd_pid}" 2>/dev/null || exit_code=$?
+  ended_epoch="$(date +%s)"
+  ended_at="$(date -u +%Y-%m-%dT%H:%M:%SZ)"
+  duration_sec="$((ended_epoch - started_epoch))"
+
+  if [[ "${timed_out}" == "true" ]]; then
+    status="timeout"
+    printf "\n[runner] timed out after %ss\n" "${CASE_TIMEOUT_SEC}" >> "${log_file}"
+  elif (( exit_code != 0 )); then
+    status="failed"
+  elif [[ ! -s "${log_file}" ]]; then
+    status="failed"
+  elif ! rg -q "\[session: " "${log_file}"; then
+    status="failed"
+  fi
+
+  session_id="$(rg -o "\[session: [^]]+\]" "${log_file}" | tail -n 1 | sed -E 's/\[session: ([^]]+)\]/\1/' || true)"
+  session_dir="$(rg -o "\[session-dir: [^]]+\]" "${log_file}" | tail -n 1 | sed -E 's/\[session-dir: ([^]]+)\]/\1/' || true)"
+
+  printf "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" \
+    "${TIMESTAMP}" \
+    "${provider}" \
+    "${case_id}" \
+    "${status}" \
+    "${session_id}" \
+    "${session_dir}" \
+    "${log_file}" \
+    "${started_at}" \
+    "${ended_at}" \
+    "${duration_sec}" \
+    "${exit_code}" > "${result_file}"
+
+  printf "[worker] provider=%s case=%s status=%s duration=%ss session=%s\n" \
+    "${provider}" \
+    "${case_id}" \
+    "${status}" \
+    "${duration_sec}" \
+    "${session_id:-N/A}"
+  exit 0
+fi
+
+mkdir -p "${OUT_DIR}"
+mkdir -p "${RESULTS_DIR}"
+printf "timestamp\tprovider\tcase_id\tstatus\tsession_id\tsession_dir\tlog_file\tstarted_at\tended_at\tduration_sec\texit_code\n" > "${MANIFEST}"
+
+read -r -a PROVIDERS <<< "${PROVIDERS_RAW}"
+
+CASE_FILES=()
+while IFS= read -r line; do
+  CASE_FILES+=("${line}")
+done < <(find "${CASES_DIR}" -maxdepth 1 -type f -name "${CASE_GLOB}" | sort)
+
+if [[ ${#CASE_FILES[@]} -eq 0 ]]; then
+  echo "No case files matched ${CASE_GLOB} in ${CASES_DIR}" >&2
+  exit 1
+fi
+
+echo "Output directory: ${OUT_DIR}"
+echo "Using SMC_DATA_DIR=${SMC_DATA_DIR}"
+echo "Using MULTICA_API_URL=${MULTICA_API_URL}"
+echo "Providers: ${PROVIDERS[*]}"
+echo "Cases: ${#CASE_FILES[@]}"
+echo "Max parallel: ${MAX_PARALLEL}"
+if [[ "${TIMEOUT_ENABLED}" == "true" ]]; then
+  echo "Case timeout: ${CASE_TIMEOUT_SEC}s"
+else
+  echo "Case timeout: disabled"
+fi
+
+TASKS=()
+for provider in "${PROVIDERS[@]}"; do
+  for case_file in "${CASE_FILES[@]}"; do
+    TASKS+=("${provider}" "${case_file}")
+  done
+done
+
+echo "Total tasks: $(( ${#TASKS[@]} / 2 ))"
+
+export TIMESTAMP OUT_DIR RESULTS_DIR SMC_DATA_DIR MULTICA_API_URL CASE_TIMEOUT_SEC TIMEOUT_ENABLED
+printf '%s\0' "${TASKS[@]}" | xargs -0 -n 2 -P "${MAX_PARALLEL}" bash "${BASH_SOURCE[0]}" --worker
+
+RESULT_FILES=()
+while IFS= read -r line; do
+  RESULT_FILES+=("${line}")
+done < <(find "${RESULTS_DIR}" -maxdepth 1 -type f -name "*.tsv" | sort)
+
+if [[ ${#RESULT_FILES[@]} -eq 0 ]]; then
+  echo "No result files produced in ${RESULTS_DIR}" >&2
+  exit 1
+fi
+
+for result_file in "${RESULT_FILES[@]}"; do
+  cat "${result_file}" >> "${MANIFEST}"
+done
+
+success_count="$(awk -F '\t' 'NR>1 && $4=="success" {c++} END{print c+0}' "${MANIFEST}")"
+failed_count="$(awk -F '\t' 'NR>1 && $4=="failed" {c++} END{print c+0}' "${MANIFEST}")"
+timeout_count="$(awk -F '\t' 'NR>1 && $4=="timeout" {c++} END{print c+0}' "${MANIFEST}")"
+
+echo
+echo "Completed run stage. Manifest: ${MANIFEST}"
+echo "Run summary: success=${success_count} failed=${failed_count} timeout=${timeout_count}"
+
+echo
+echo "Running structured analysis..."
+node "${SCRIPT_DIR}/analyze.mjs" "${MANIFEST}" | tee "${OUT_DIR}/analysis.txt"

From 7eb18f47fcb19236adcd12979871c1ac30d7dc6e Mon Sep 17 00:00:00 2001
From: Jiayuan Zhang <forrestchang7@gmail.com>
Date: Tue, 17 Feb 2026 01:18:00 +0800
Subject: [PATCH 3/7] fix(agent): enforce capability-gap skill recovery
 guidance

---
 packages/core/src/agent/system-prompt/sections.test.ts | 9 +++++++++
 packages/core/src/agent/system-prompt/sections.ts      | 5 ++++-
 2 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/packages/core/src/agent/system-prompt/sections.test.ts b/packages/core/src/agent/system-prompt/sections.test.ts
index a1d2dd5a..f31585fa 100644
--- a/packages/core/src/agent/system-prompt/sections.test.ts
+++ b/packages/core/src/agent/system-prompt/sections.test.ts
@@ -218,6 +218,15 @@ describe("buildSkillsSection", () => {
     expect(text).toContain("suggest activating it");
   });
 
+  it("includes capability-gap recovery guidance", () => {
+    const result = buildSkillsSection("## commit\nDo commits.", "full");
+    const text = result.join("\n");
+    expect(text).toContain("capability gap");
+    expect(text).toContain("meta-skill-installer");
+    expect(text).toContain("explicit user confirmation");
+    expect(text).toContain("clawhub install");
+  });
+
   it("returns empty in minimal mode", () => {
     expect(buildSkillsSection("skills", "minimal")).toEqual([]);
   });
diff --git a/packages/core/src/agent/system-prompt/sections.ts b/packages/core/src/agent/system-prompt/sections.ts
index bf935fce..52d7057a 100644
--- a/packages/core/src/agent/system-prompt/sections.ts
+++ b/packages/core/src/agent/system-prompt/sections.ts
@@ -399,7 +399,10 @@ export function buildSkillsSection(
     "- If exactly one skill clearly applies: follow its instructions.",
     "- If multiple could apply: choose the most specific one.",
     "- If none clearly apply but an **inactive skill** matches the user's intent: suggest activating it.",
-    "- If no skill matches at all: skip skill invocation.",
+    "- If the request needs a capability you currently lack: do not stop at refusal. Treat it as a capability gap and propose a recovery path.",
+    "- If `meta-skill-installer` is available and no installed skill matches: proactively offer to search ClawHub for candidates and run security review before install.",
+    "- Ask for explicit user confirmation before final `clawhub install` / `clawhub update` unless the user already clearly asked you to install in this turn.",
+    "- After install/update, verify the skill path and retry the original user task.",
     "",
     budgeted,
     "",

From 50407918b93886a6e939d7ed69a04530545d388e Mon Sep 17 00:00:00 2001
From: Jiayuan Zhang <forrestchang7@gmail.com>
Date: Tue, 17 Feb 2026 01:18:06 +0800
Subject: [PATCH 4/7] test(e2e): add spotify capability-gap ux benchmark case

---
 docs/e2e-skills-benchmark.md                  |   4 +-
 scripts/e2e-skills-benchmark/analyze.mjs      | 125 ++++++++++++++++--
 .../case-04-gap-discovery-spotify-ux.txt      |  10 ++
 3 files changed, 125 insertions(+), 14 deletions(-)
 create mode 100644 scripts/e2e-skills-benchmark/cases/case-04-gap-discovery-spotify-ux.txt

diff --git a/docs/e2e-skills-benchmark.md b/docs/e2e-skills-benchmark.md
index e9859624..674a3b4b 100644
--- a/docs/e2e-skills-benchmark.md
+++ b/docs/e2e-skills-benchmark.md
@@ -7,7 +7,7 @@ This benchmark validates the meta skill workflow for capability-gap discovery, C
 - Domain: skill discovery + installation + update
 - Focus: `skills/meta-skill-installer`
 - Providers: default `kimi-coding` (override with `PROVIDERS`)
-- Cases: 3
+- Cases: 4
 
 Case prompts are stored in:
 - `scripts/e2e-skills-benchmark/cases/`
@@ -19,6 +19,7 @@ The case set references real public pages from ClawHub:
 - [CalDAV Calendar](https://clawhub.ai/skills/caldav-calendar)
 - [Home Assistant](https://clawhub.ai/skills/homeassistant)
 - [CodexMonitor](https://clawhub.ai/odrobnik/codexmonitor)
+- [Spotify (gap-discovery UX flow)](https://clawhub.ai/search?q=spotify)
 
 ## Prerequisites
 
@@ -86,6 +87,7 @@ For each run:
    - `clawhub install`
    - `review-skill-security.mjs`
    - for case 03 also `clawhub update`
+   - for case 04, final response must include ClawHub + install confirmation language, and must not run `clawhub install/update` before confirmation
 
 ## Notes
 
diff --git a/scripts/e2e-skills-benchmark/analyze.mjs b/scripts/e2e-skills-benchmark/analyze.mjs
index ac090783..0eaee0ed 100755
--- a/scripts/e2e-skills-benchmark/analyze.mjs
+++ b/scripts/e2e-skills-benchmark/analyze.mjs
@@ -63,6 +63,19 @@ const CASE_RULES = {
       ["review-skill-security.mjs"],
     ],
   },
+  "case-04-gap-discovery-spotify-ux": {
+    requireExecUsage: false,
+    requiredResponseRegex: [
+      "clawhub|cloud\\s*hub|cloudhub",
+      "安装|install",
+      "是否|要不要|would you like|do you want",
+      "安全|审查|security|review",
+    ],
+    forbiddenCommandTokens: [
+      ["clawhub", "install"],
+      ["clawhub", "update"],
+    ],
+  },
 };
 
 /**
@@ -100,6 +113,19 @@ function extractCommand(rawArgs) {
   return rawArgs;
 }
 
+/**
+ * @param {string} text
+ * @param {string} pattern
+ * @returns {boolean}
+ */
+function textMatchesPattern(text, pattern) {
+  try {
+    return new RegExp(pattern, "i").test(text);
+  } catch {
+    return false;
+  }
+}
+
 /**
  * @param {string} runLogPath
  */
@@ -116,6 +142,44 @@ function parseRunLog(runLogPath) {
   return events;
 }
 
+/**
+ * @param {string} sessionPath
+ * @returns {string}
+ */
+function parseFinalAssistantText(sessionPath) {
+  if (!existsSync(sessionPath)) return "";
+
+  const lines = splitLines(readFileSync(sessionPath, "utf-8"));
+  let latest = "";
+
+  for (const line of lines) {
+    try {
+      const entry = JSON.parse(line);
+      if (entry?.type !== "message") continue;
+      const msg = entry.message;
+      if (!msg || msg.role !== "assistant") continue;
+
+      if (typeof msg.content === "string") {
+        latest = msg.content;
+        continue;
+      }
+
+      if (Array.isArray(msg.content)) {
+        const text = msg.content
+          .filter((part) => part && part.type === "text" && typeof part.text === "string")
+          .map((part) => part.text)
+          .join("\n")
+          .trim();
+        if (text) latest = text;
+      }
+    } catch {
+      // Ignore malformed lines.
+    }
+  }
+
+  return latest;
+}
+
 /**
  * @param {CaseAnalysis} analysis
  * @param {string} id
@@ -145,6 +209,7 @@ for (let i = 1; i < rows.length; i++) {
 
   const provider = cols[1] ?? "";
   const caseId = cols[2] ?? "";
+  const rules = CASE_RULES[caseId];
   const status = cols[3] ?? "";
   const sessionId = cols[4] ?? "";
   const sessionDir = cols[5] ?? "";
@@ -191,6 +256,8 @@ for (let i = 1; i < rows.length; i++) {
   }
 
   const events = parseRunLog(runLogPath);
+  const sessionPath = join(sessionDir, "session.jsonl");
+  const finalAssistantText = parseFinalAssistantText(sessionPath);
   const runStarts = events.filter((e) => e.event === "run_start");
   const runEnds = events.filter((e) => e.event === "run_end");
   const toolStarts = events.filter((e) => e.event === "tool_start");
@@ -209,6 +276,8 @@ for (let i = 1; i < rows.length; i++) {
 
   const finalRunEnd = runEnds.at(-1);
   const runEndError = finalRunEnd?.error;
+  const finalRunText = typeof finalRunEnd?.text === "string" ? finalRunEnd.text : "";
+  const finalResponseText = finalAssistantText || finalRunText;
   addCheck(
     analysis,
     "run-end-error",
@@ -230,25 +299,55 @@ for (let i = 1; i < rows.length; i++) {
     .map((e) => extractCommand(typeof e.args === "string" ? e.args : ""))
     .filter(Boolean);
 
+  const requireExecUsage = rules?.requireExecUsage !== false;
   addCheck(
     analysis,
     "exec-usage",
-    "at least one exec command was used",
-    execCommands.length > 0,
-    `exec_calls=${execCommands.length}`,
+    requireExecUsage
+      ? "at least one exec command was used"
+      : "exec usage is optional for this case",
+    requireExecUsage ? execCommands.length > 0 : true,
+    requireExecUsage ? `exec_calls=${execCommands.length}` : `exec_calls=${execCommands.length} (optional)`,
   );
 
-  const rules = CASE_RULES[caseId];
   if (rules) {
-    for (let r = 0; r < rules.requiredCommandTokens.length; r++) {
-      const tokenList = rules.requiredCommandTokens[r];
-      const passed = execCommands.some((cmd) => commandHasTokens(cmd, tokenList));
-      addCheck(
-        analysis,
-        `cmd-${r + 1}`,
-        `exec command contains tokens: ${tokenList.join(" + ")}`,
-        passed,
-      );
+    if (Array.isArray(rules.requiredCommandTokens)) {
+      for (let r = 0; r < rules.requiredCommandTokens.length; r++) {
+        const tokenList = rules.requiredCommandTokens[r];
+        const passed = execCommands.some((cmd) => commandHasTokens(cmd, tokenList));
+        addCheck(
+          analysis,
+          `cmd-${r + 1}`,
+          `exec command contains tokens: ${tokenList.join(" + ")}`,
+          passed,
+        );
+      }
+    }
+
+    if (Array.isArray(rules.forbiddenCommandTokens)) {
+      for (let r = 0; r < rules.forbiddenCommandTokens.length; r++) {
+        const tokenList = rules.forbiddenCommandTokens[r];
+        const passed = !execCommands.some((cmd) => commandHasTokens(cmd, tokenList));
+        addCheck(
+          analysis,
+          `forbid-cmd-${r + 1}`,
+          `exec command does not contain tokens: ${tokenList.join(" + ")}`,
+          passed,
+        );
+      }
+    }
+
+    if (Array.isArray(rules.requiredResponseRegex)) {
+      for (let r = 0; r < rules.requiredResponseRegex.length; r++) {
+        const pattern = rules.requiredResponseRegex[r];
+        const passed = textMatchesPattern(finalResponseText, pattern);
+        addCheck(
+          analysis,
+          `resp-${r + 1}`,
+          `final response matches regex: /${pattern}/i`,
+          passed,
+        );
+      }
     }
   } else {
     addCheck(
diff --git a/scripts/e2e-skills-benchmark/cases/case-04-gap-discovery-spotify-ux.txt b/scripts/e2e-skills-benchmark/cases/case-04-gap-discovery-spotify-ux.txt
new file mode 100644
index 00000000..5c8a7c16
--- /dev/null
+++ b/scripts/e2e-skills-benchmark/cases/case-04-gap-discovery-spotify-ux.txt
@@ -0,0 +1,10 @@
+请模拟真实用户的首轮请求：
+
+用户说：「随机播放 spotify 中的音乐」。
+
+要求：
+1. 先明确你当前缺少直接控制 Spotify 的能力（能力缺口识别）。
+2. 不要直接拒绝结束，要给出恢复路径：引导用户可通过 ClawHub 搜索并安装相关 skill。
+3. 明确说明会先做安全审查，再安装。
+4. 在同一轮里必须询问用户是否要继续安装（显式确认），在用户确认前不要执行 `clawhub install` 或 `clawhub update`。
+5. 输出用中文，且要包含关键词：ClawHub、安全审查、安装、是否要继续。

From 6fd4819280ff5f5eb6b7af7ebfe80e148886b8eb Mon Sep 17 00:00:00 2001
From: Jiayuan Zhang <forrestchang7@gmail.com>
Date: Tue, 17 Feb 2026 01:20:28 +0800
Subject: [PATCH 5/7] fix(agent): surface installed skill ids in prompt

---
 .../src/agent/system-prompt/sections.test.ts  | 17 ++++++-
 .../core/src/agent/system-prompt/sections.ts  | 51 +++++++++++++++++--
 2 files changed, 64 insertions(+), 4 deletions(-)

diff --git a/packages/core/src/agent/system-prompt/sections.test.ts b/packages/core/src/agent/system-prompt/sections.test.ts
index f31585fa..da5b4f08 100644
--- a/packages/core/src/agent/system-prompt/sections.test.ts
+++ b/packages/core/src/agent/system-prompt/sections.test.ts
@@ -222,11 +222,26 @@ describe("buildSkillsSection", () => {
     const result = buildSkillsSection("## commit\nDo commits.", "full");
     const text = result.join("\n");
     expect(text).toContain("capability gap");
-    expect(text).toContain("meta-skill-installer");
     expect(text).toContain("explicit user confirmation");
     expect(text).toContain("clawhub install");
   });
 
+  it("surfaces installed skill IDs and prioritizes meta skill guidance when present", () => {
+    const prompt = [
+      "## 🔧 Meta Skill Installer (meta-skill-installer)",
+      "Detect missing capabilities.",
+      "",
+      "## 📄 PDF (pdf)",
+      "Handle PDFs.",
+    ].join("\n");
+    const result = buildSkillsSection(prompt, "full");
+    const text = result.join("\n");
+    expect(text).toContain("Installed skill IDs:");
+    expect(text).toContain("`meta-skill-installer`");
+    expect(text).toContain("is installed");
+    expect(text).toContain("ClawHub search");
+  });
+
   it("returns empty in minimal mode", () => {
     expect(buildSkillsSection("skills", "minimal")).toEqual([]);
   });
diff --git a/packages/core/src/agent/system-prompt/sections.ts b/packages/core/src/agent/system-prompt/sections.ts
index 52d7057a..32072dd4 100644
--- a/packages/core/src/agent/system-prompt/sections.ts
+++ b/packages/core/src/agent/system-prompt/sections.ts
@@ -391,22 +391,67 @@ export function buildSkillsSection(
   const trimmed = skillsPrompt?.trim();
   if (!trimmed) return [];
 
+  const skillIds = extractSkillIdsFromSkillsPrompt(trimmed);
+  const hasMetaSkillInstaller = skillIds.includes("meta-skill-installer");
   const { text: budgeted } = truncateWithBudget(trimmed, DEFAULT_SKILLS_MAX_CHARS);
 
-  return [
+  const lines: string[] = [
     "## Skills (mandatory)",
     "Before replying: scan the available skills below.",
+  ];
+
+  if (skillIds.length > 0) {
+    lines.push(
+      `Installed skill IDs: ${skillIds.map((id) => `\`${id}\``).join(", ")}`,
+    );
+  }
+
+  lines.push(
     "- If exactly one skill clearly applies: follow its instructions.",
     "- If multiple could apply: choose the most specific one.",
     "- If none clearly apply but an **inactive skill** matches the user's intent: suggest activating it.",
     "- If the request needs a capability you currently lack: do not stop at refusal. Treat it as a capability gap and propose a recovery path.",
-    "- If `meta-skill-installer` is available and no installed skill matches: proactively offer to search ClawHub for candidates and run security review before install.",
+  );
+
+  if (hasMetaSkillInstaller) {
+    lines.push(
+      "- `meta-skill-installer` is installed: for capability gaps with no matching installed skill, proactively offer ClawHub search + security review + explicit install confirmation.",
+    );
+  } else {
+    lines.push(
+      "- If `meta-skill-installer` is available and no installed skill matches: proactively offer to search ClawHub for candidates and run security review before install.",
+    );
+  }
+
+  lines.push(
     "- Ask for explicit user confirmation before final `clawhub install` / `clawhub update` unless the user already clearly asked you to install in this turn.",
     "- After install/update, verify the skill path and retry the original user task.",
     "",
     budgeted,
     "",
-  ];
+  );
+
+  return lines;
+}
+
+/**
+ * Extract skill IDs from SkillManager prompt headings.
+ * Expected heading format: `## <emoji> <name> (<id>)`
+ */
+function extractSkillIdsFromSkillsPrompt(skillsPrompt: string): string[] {
+  const ids: string[] = [];
+  const seen = new Set<string>();
+  const headingRegex = /^##\s+.*\(([^()\n]+)\)\s*$/gm;
+
+  let match: RegExpExecArray | null;
+  while ((match = headingRegex.exec(skillsPrompt)) !== null) {
+    const id = match[1]?.trim();
+    if (!id || seen.has(id)) continue;
+    seen.add(id);
+    ids.push(id);
+  }
+
+  return ids;
 }
 
 /**

From 4b7f0afb508f48fde3df6a2c8629e041b84a6b3a Mon Sep 17 00:00:00 2001
From: Jiayuan Zhang <forrestchang7@gmail.com>
Date: Tue, 17 Feb 2026 02:23:11 +0800
Subject: [PATCH 6/7] fix(agent): guard workaround and local skill mutation
 commands

---
 .../runner.skill-install-consent.test.ts      | 171 ++++++++++
 packages/core/src/agent/runner.ts             | 301 +++++++++++++++++-
 .../src/agent/system-prompt/sections.test.ts  |   4 +
 .../core/src/agent/system-prompt/sections.ts  |   5 +
 .../src/agent/tools/exec-approval-types.ts    |   2 +
 packages/core/src/agent/tools/exec.ts         |   5 +-
 6 files changed, 484 insertions(+), 4 deletions(-)
 create mode 100644 packages/core/src/agent/runner.skill-install-consent.test.ts

diff --git a/packages/core/src/agent/runner.skill-install-consent.test.ts b/packages/core/src/agent/runner.skill-install-consent.test.ts
new file mode 100644
index 00000000..94aea9a7
--- /dev/null
+++ b/packages/core/src/agent/runner.skill-install-consent.test.ts
@@ -0,0 +1,171 @@
+import { describe, expect, it } from "vitest";
+import {
+  evaluateCustomSkillAuthoringConsent,
+  evaluateWorkaroundConsent,
+  evaluateSkillInstallConsent,
+  isEnvironmentInstallCommand,
+  isLocalSkillMutationCommand,
+  isMutatingClawhubCommand,
+  isThirdPartyWorkaroundCommand,
+} from "./runner.js";
+
+describe("isMutatingClawhubCommand", () => {
+  it("detects clawhub install command", () => {
+    expect(
+      isMutatingClawhubCommand("npx -y clawhub install spotify --workdir /tmp --dir skills"),
+    ).toBe(true);
+  });
+
+  it("detects clawhub update command", () => {
+    expect(isMutatingClawhubCommand("clawhub update spotify --force")).toBe(true);
+  });
+
+  it("does not match non-mutating clawhub commands", () => {
+    expect(isMutatingClawhubCommand("clawhub search spotify --limit 10")).toBe(false);
+    expect(isMutatingClawhubCommand("clawhub inspect spotify")).toBe(false);
+  });
+
+  it("detects wrapped bash flow that expands CLAWHUB_CMD and runs install", () => {
+    const command = [
+      "cd /tmp/meta-skill-installer && bash -c '",
+      "if command -v clawhub >/dev/null 2>&1; then",
+      "  CLAWHUB_CMD=(clawhub)",
+      "else",
+      "  CLAWHUB_CMD=(npx -y clawhub)",
+      "fi",
+      "\"${CLAWHUB_CMD[@]}\" install \"spotify\" --workdir \"$DATA_DIR\" --dir skills --force",
+      "'",
+    ].join("\n");
+    expect(isMutatingClawhubCommand(command)).toBe(true);
+  });
+});
+
+describe("evaluateSkillInstallConsent", () => {
+  it("does not grant consent for generic capability requests", () => {
+    const result = evaluateSkillInstallConsent("随机播放 spotify 中的音乐", false);
+    expect(result).toEqual({ allowInstall: false, declined: false });
+  });
+
+  it("grants consent for explicit install requests", () => {
+    const result = evaluateSkillInstallConsent("请帮我安装 spotify skill", false);
+    expect(result).toEqual({ allowInstall: true, declined: false });
+  });
+
+  it("grants consent for short affirmative replies when awaiting confirmation", () => {
+    const result = evaluateSkillInstallConsent("继续", true);
+    expect(result).toEqual({ allowInstall: true, declined: false });
+  });
+
+  it("treats standalone Chinese affirmative as consent when awaiting confirmation", () => {
+    const result = evaluateSkillInstallConsent("行", true);
+    expect(result).toEqual({ allowInstall: true, declined: false });
+  });
+
+  it("marks declines explicitly", () => {
+    const result = evaluateSkillInstallConsent("不要安装，先别动", true);
+    expect(result).toEqual({ allowInstall: false, declined: true });
+  });
+});
+
+describe("isEnvironmentInstallCommand", () => {
+  it("detects package manager install commands", () => {
+    expect(isEnvironmentInstallCommand("brew install spogo")).toBe(true);
+    expect(isEnvironmentInstallCommand("pnpm add lodash")).toBe(true);
+    expect(isEnvironmentInstallCommand("npm install -g clawhub")).toBe(true);
+    expect(isEnvironmentInstallCommand("pip install requests")).toBe(true);
+  });
+
+  it("does not match read-only package manager commands", () => {
+    expect(isEnvironmentInstallCommand("brew list")).toBe(false);
+    expect(isEnvironmentInstallCommand("pnpm list --depth 0")).toBe(false);
+    expect(isEnvironmentInstallCommand("npm view clawhub")).toBe(false);
+  });
+});
+
+describe("isThirdPartyWorkaroundCommand", () => {
+  it("detects local workaround commands", () => {
+    expect(isThirdPartyWorkaroundCommand("spotify_player playback shuffle")).toBe(true);
+    expect(isThirdPartyWorkaroundCommand("spogo status")).toBe(true);
+    expect(isThirdPartyWorkaroundCommand("osascript -e 'tell app \"Spotify\" to play'")).toBe(true);
+    expect(isThirdPartyWorkaroundCommand("curl http://localhost:8123/api/states")).toBe(true);
+  });
+
+  it("does not match unrelated commands", () => {
+    expect(isThirdPartyWorkaroundCommand("ls -la")).toBe(false);
+    expect(isThirdPartyWorkaroundCommand("pnpm test")).toBe(false);
+  });
+});
+
+describe("evaluateWorkaroundConsent", () => {
+  it("does not grant workaround mode for generic capability requests", () => {
+    const result = evaluateWorkaroundConsent("随机播放 spotify 中的音乐", false);
+    expect(result).toEqual({ allowWorkaround: false, declined: false });
+  });
+
+  it("grants workaround mode for explicit local-command intent", () => {
+    const result = evaluateWorkaroundConsent("不要安装 skill，直接用本地命令试试", false);
+    expect(result).toEqual({ allowWorkaround: true, declined: false });
+  });
+
+  it("grants workaround mode for short affirmative replies when awaiting confirmation", () => {
+    const result = evaluateWorkaroundConsent("继续", true);
+    expect(result).toEqual({ allowWorkaround: true, declined: false });
+  });
+
+  it("treats standalone Chinese affirmative as workaround consent when awaiting confirmation", () => {
+    const result = evaluateWorkaroundConsent("行", true);
+    expect(result).toEqual({ allowWorkaround: true, declined: false });
+  });
+
+  it("marks declines when no workaround intent is present", () => {
+    const result = evaluateWorkaroundConsent("不要，先别执行", true);
+    expect(result).toEqual({ allowWorkaround: false, declined: true });
+  });
+});
+
+describe("isLocalSkillMutationCommand", () => {
+  it("detects direct local skill mutation commands", () => {
+    expect(
+      isLocalSkillMutationCommand(
+        "mkdir -p ~/.super-multica/skills/notion-integration && touch ~/.super-multica/skills/notion-integration/SKILL.md",
+      ),
+    ).toBe(true);
+
+    expect(
+      isLocalSkillMutationCommand(
+        "cat > ~/.super-multica/skills/notion-integration/SKILL.md << 'EOF'\n# skill\nEOF",
+      ),
+    ).toBe(true);
+  });
+
+  it("does not match read-only commands or clawhub install flow", () => {
+    expect(isLocalSkillMutationCommand("cat ~/.super-multica/skills/notion/SKILL.md")).toBe(false);
+    expect(
+      isLocalSkillMutationCommand(
+        "npx -y clawhub install notion --workdir ~/.super-multica --dir skills --force",
+      ),
+    ).toBe(false);
+  });
+});
+
+describe("evaluateCustomSkillAuthoringConsent", () => {
+  it("does not grant consent for generic third-party requests", () => {
+    const result = evaluateCustomSkillAuthoringConsent("帮我在 Notion 新建一个页面", false);
+    expect(result).toEqual({ allowAuthoring: false, declined: false });
+  });
+
+  it("grants consent when user explicitly asks to create a custom skill", () => {
+    const result = evaluateCustomSkillAuthoringConsent("请帮我创建一个 Notion skill", false);
+    expect(result).toEqual({ allowAuthoring: true, declined: false });
+  });
+
+  it("grants consent for short affirmatives when awaiting confirmation", () => {
+    const result = evaluateCustomSkillAuthoringConsent("继续", true);
+    expect(result).toEqual({ allowAuthoring: true, declined: false });
+  });
+
+  it("marks declines explicitly", () => {
+    const result = evaluateCustomSkillAuthoringConsent("先别创建技能", true);
+    expect(result).toEqual({ allowAuthoring: false, declined: true });
+  });
+});
diff --git a/packages/core/src/agent/runner.ts b/packages/core/src/agent/runner.ts
index f36fe596..0b51292a 100644
--- a/packages/core/src/agent/runner.ts
+++ b/packages/core/src/agent/runner.ts
@@ -50,6 +50,7 @@ import {
 import { isContextOverflowError } from "./errors.js";
 import { resolveWorkspaceDir, ensureWorkspaceDir } from "./workspace.js";
 import { createRunLog, type RunLog } from "./run-log.js";
+import type { ExecApprovalCallback } from "./tools/exec-approval-types.js";
 
 // ============================================================
 // Error classification for auth profile rotation
@@ -83,6 +84,153 @@ export function isRotatableError(reason: AuthProfileFailureReason): boolean {
   return reason === "auth" || reason === "rate_limit" || reason === "billing" || reason === "timeout";
 }
 
+// ── Skill install consent guard ─────────────────────────────────────────────
+
+const CLAWHUB_MUTATION_RE = /\bclawhub\b[\s\S]*\b(?:install|update)\b/i;
+const ENV_INSTALL_RE = /\b(?:brew|apt-get|apt|yum|dnf|pacman|zypper)\s+(?:install|upgrade|tap)\b|\b(?:npm|pnpm|yarn|bun)\s+(?:install|add)\b|\bpip(?:3)?\s+install\b|\buv\s+(?:tool\s+install|pip\s+install)\b|\bcargo\s+install\b|\bgo\s+install\b/i;
+const THIRD_PARTY_WORKAROUND_RE = /\b(?:osascript|spogo|spotify_player|ha\.sh|homeassistant|hass)\b|\/api\/states\b/i;
+const LOCAL_SKILL_PATH_RE = /(?:~\/\.super-multica(?:-[\w-]+)?\/skills\/|\/\.super-multica(?:-[\w-]+)?\/skills\/|\/skills\/)/i;
+const LOCAL_SKILL_MUTATION_VERB_RE = /\b(?:mkdir|cp|mv|rm|touch|install|clone)\b/i;
+const INSTALL_ACTION_RE = /\b(?:install|update|add)\b|安装|更新|添加|启用|配置/i;
+const SKILL_CONTEXT_RE = /\b(?:clawhub|skill|skills)\b|技能|插件|扩展/i;
+const WORKAROUND_ACTION_RE = /\b(?:workaround|fallback|local\s+command|local\s+script|shell\s+script|osascript|apple\s*script|spogo|spotify_player|homeassistant|ha\.sh)\b|绕过|临时方案|本地命令|本机命令|脚本方式|直接执行|不用技能|不用skill|不装skill|不安装skill/i;
+const CUSTOM_SKILL_AUTHORING_RE = /\b(?:create|author|build)\b[\s\S]*\bskills?\b|创建[\s\S]{0,30}(?:技能|skill)|自定义[\s\S]{0,20}(?:技能|skill)|手写[\s\S]{0,20}(?:技能|skill)|custom\s+skill/i;
+const AFFIRMATIVE_RE = /\b(?:yes|y|ok|okay|sure|confirm|confirmed|continue|go ahead|please do|do it)\b|继续|确认|同意|可以|好的|继续安装/i;
+const STANDALONE_AFFIRMATIVE_RE = /^\s*(?:行|行吧|行的)\s*[。！!]?$/i;
+const DECLINE_RE = /\b(?:no|cancel|stop|don't|do not|not now|skip)\b|不要|不需要|取消|先别|暂时不用/i;
+
+function hasAffirmativeConsent(text: string): boolean {
+  return AFFIRMATIVE_RE.test(text) || STANDALONE_AFFIRMATIVE_RE.test(text);
+}
+
+/**
+ * Detect mutating ClawHub commands that require explicit user confirmation.
+ */
+export function isMutatingClawhubCommand(command: string): boolean {
+  return CLAWHUB_MUTATION_RE.test(command);
+}
+
+/**
+ * Detect package/environment installation commands.
+ * These mutate the runtime environment and should require explicit user confirmation.
+ */
+export function isEnvironmentInstallCommand(command: string): boolean {
+  return ENV_INSTALL_RE.test(command);
+}
+
+/**
+ * Detect local workaround commands for third-party integrations.
+ * These should require explicit user opt-in before execution.
+ */
+export function isThirdPartyWorkaroundCommand(command: string): boolean {
+  return THIRD_PARTY_WORKAROUND_RE.test(command);
+}
+
+/**
+ * Detect direct local skill mutations outside ClawHub install/update flow.
+ */
+export function isLocalSkillMutationCommand(command: string): boolean {
+  if (!LOCAL_SKILL_PATH_RE.test(command)) return false;
+  if (/\bclawhub\b/i.test(command)) return false;
+
+  if (LOCAL_SKILL_MUTATION_VERB_RE.test(command)) return true;
+
+  const hasCatOrEchoWrite = /\b(?:cat|tee|echo)\b/i.test(command) && />>?|<<\s*['"]?EOF/i.test(command);
+  return hasCatOrEchoWrite;
+}
+
+/**
+ * Determine whether the current user prompt grants permission to install/update skills.
+ *
+ * If `awaitingConfirmation` is true, short affirmative replies (e.g. "继续", "yes")
+ * are treated as confirmation.
+ */
+export function evaluateSkillInstallConsent(
+  prompt: string,
+  awaitingConfirmation: boolean,
+): { allowInstall: boolean; declined: boolean } {
+  const text = prompt.trim();
+  if (!text) return { allowInstall: false, declined: false };
+
+  if (DECLINE_RE.test(text)) {
+    return { allowInstall: false, declined: true };
+  }
+
+  const hasInstallAction = INSTALL_ACTION_RE.test(text);
+  const hasSkillContext = SKILL_CONTEXT_RE.test(text);
+  const hasAffirmative = hasAffirmativeConsent(text);
+
+  if (hasInstallAction) {
+    return { allowInstall: true, declined: false };
+  }
+
+  if (hasSkillContext && hasAffirmative) {
+    return { allowInstall: true, declined: false };
+  }
+
+  if (awaitingConfirmation && hasAffirmative) {
+    return { allowInstall: true, declined: false };
+  }
+
+  return { allowInstall: false, declined: false };
+}
+
+/**
+ * Determine whether the current user prompt explicitly opts into local workaround mode.
+ */
+export function evaluateWorkaroundConsent(
+  prompt: string,
+  awaitingConfirmation: boolean,
+): { allowWorkaround: boolean; declined: boolean } {
+  const text = prompt.trim();
+  if (!text) return { allowWorkaround: false, declined: false };
+
+  const hasWorkaroundAction = WORKAROUND_ACTION_RE.test(text);
+  const hasAffirmative = hasAffirmativeConsent(text);
+
+  if (hasWorkaroundAction) {
+    return { allowWorkaround: true, declined: false };
+  }
+
+  if (awaitingConfirmation && hasAffirmative) {
+    return { allowWorkaround: true, declined: false };
+  }
+
+  if (DECLINE_RE.test(text)) {
+    return { allowWorkaround: false, declined: true };
+  }
+
+  return { allowWorkaround: false, declined: false };
+}
+
+/**
+ * Determine whether the current prompt explicitly opts into custom skill authoring.
+ */
+export function evaluateCustomSkillAuthoringConsent(
+  prompt: string,
+  awaitingConfirmation: boolean,
+): { allowAuthoring: boolean; declined: boolean } {
+  const text = prompt.trim();
+  if (!text) return { allowAuthoring: false, declined: false };
+
+  if (DECLINE_RE.test(text)) {
+    return { allowAuthoring: false, declined: true };
+  }
+
+  const hasAuthoringIntent = CUSTOM_SKILL_AUTHORING_RE.test(text);
+  const hasAffirmative = hasAffirmativeConsent(text);
+
+  if (hasAuthoringIntent) {
+    return { allowAuthoring: true, declined: false };
+  }
+
+  if (awaitingConfirmation && hasAffirmative) {
+    return { allowAuthoring: true, declined: false };
+  }
+
+  return { allowAuthoring: false, declined: false };
+}
+
 // ── Run-log result extraction helpers ──────────────────────────────────────
 // Lightweight extractors for tool_end metadata. These mirror the patterns in
 // cli/output.ts but are kept separate to avoid CLI-specific dependencies.
@@ -143,6 +291,13 @@ export class Agent {
   private readonly runLog: RunLog;
   private readonly toolStartTimes = new Map<string, number>();
   private initialized = false;
+  private allowSkillInstallForCurrentRun = false;
+  private awaitingSkillInstallConfirmation = false;
+  private allowWorkaroundForCurrentRun = false;
+  private awaitingWorkaroundConfirmation = false;
+  private allowCustomSkillAuthoringForCurrentRun = false;
+  private awaitingCustomSkillAuthoringConfirmation = false;
+  private readonly guardedExecApproval: ExecApprovalCallback;
 
   // Context window settings (for pre-flight compaction)
   private readonly reserveTokens: number;
@@ -186,6 +341,7 @@ export class Agent {
 
     // Load session metadata early so stored provider/model can inform defaults
     this.sessionId = options.sessionId ?? uuidv7();
+    this.guardedExecApproval = this.createGuardedExecApprovalCallback(options.onExecApprovalNeeded);
     this.runLog = createRunLog(
       options.enableRunLog ?? !!process.env.MULTICA_RUN_LOG,
       this.sessionId,
@@ -396,8 +552,25 @@ export class Agent {
     // Use this.sessionId (which may be auto-generated) instead of options.sessionId
     // (which may be undefined). Without this, delegate tool has no session context.
     this.toolsOptions = mergedToolsConfig
-      ? { ...options, sessionId: this.sessionId, cwd: effectiveCwd, tools: mergedToolsConfig, profileDir, provider: this.resolvedProvider, runLog: this.runLog }
-      : { ...options, sessionId: this.sessionId, cwd: effectiveCwd, profileDir, provider: this.resolvedProvider, runLog: this.runLog };
+      ? {
+        ...options,
+        sessionId: this.sessionId,
+        cwd: effectiveCwd,
+        tools: mergedToolsConfig,
+        profileDir,
+        provider: this.resolvedProvider,
+        runLog: this.runLog,
+        onExecApprovalNeeded: this.guardedExecApproval,
+      }
+      : {
+        ...options,
+        sessionId: this.sessionId,
+        cwd: effectiveCwd,
+        profileDir,
+        provider: this.resolvedProvider,
+        runLog: this.runLog,
+        onExecApprovalNeeded: this.guardedExecApproval,
+      };
 
     const tools = resolveTools(this.toolsOptions);
     if (this.debug) {
@@ -526,6 +699,42 @@ export class Agent {
     this._isRunning = true;
     this._aborted = false;
 
+    if (this._internalRun) {
+      this.allowSkillInstallForCurrentRun = false;
+      this.allowWorkaroundForCurrentRun = false;
+      this.allowCustomSkillAuthoringForCurrentRun = false;
+    } else {
+      const consent = evaluateSkillInstallConsent(prompt, this.awaitingSkillInstallConfirmation);
+      if (consent.declined) {
+        this.awaitingSkillInstallConfirmation = false;
+      }
+      this.allowSkillInstallForCurrentRun = consent.allowInstall;
+      if (consent.allowInstall) {
+        this.awaitingSkillInstallConfirmation = false;
+      }
+
+      const workaroundConsent = evaluateWorkaroundConsent(prompt, this.awaitingWorkaroundConfirmation);
+      if (workaroundConsent.declined) {
+        this.awaitingWorkaroundConfirmation = false;
+      }
+      this.allowWorkaroundForCurrentRun = workaroundConsent.allowWorkaround;
+      if (workaroundConsent.allowWorkaround) {
+        this.awaitingWorkaroundConfirmation = false;
+      }
+
+      const customSkillConsent = evaluateCustomSkillAuthoringConsent(
+        prompt,
+        this.awaitingCustomSkillAuthoringConfirmation,
+      );
+      if (customSkillConsent.declined) {
+        this.awaitingCustomSkillAuthoringConfirmation = false;
+      }
+      this.allowCustomSkillAuthoringForCurrentRun = customSkillConsent.allowAuthoring;
+      if (customSkillConsent.allowAuthoring) {
+        this.awaitingCustomSkillAuthoringConfirmation = false;
+      }
+    }
+
     const runStart = Date.now();
     this.runLog.log("run_start", {
       prompt: prompt.slice(0, 200),
@@ -690,6 +899,9 @@ export class Agent {
       }
       this._isRunning = false;
       this._aborted = false;
+      this.allowSkillInstallForCurrentRun = false;
+      this.allowWorkaroundForCurrentRun = false;
+      this.allowCustomSkillAuthoringForCurrentRun = false;
       this._lastEventSavedAssistant = undefined;
       this.currentUserDisplayPrompt = undefined;
       this.currentUserSource = undefined;
@@ -697,6 +909,91 @@ export class Agent {
     }
   }
 
+  private createGuardedExecApprovalCallback(
+    base?: ExecApprovalCallback,
+  ): ExecApprovalCallback {
+    return async (command, cwd) => {
+      const needsInstallConsent =
+        isMutatingClawhubCommand(command) || isEnvironmentInstallCommand(command);
+      const needsWorkaroundConsent = isThirdPartyWorkaroundCommand(command);
+      const needsCustomSkillAuthoringConsent = isLocalSkillMutationCommand(command);
+      if (needsInstallConsent && !this.allowSkillInstallForCurrentRun) {
+        this.awaitingSkillInstallConfirmation = true;
+        this.runLog.log("install_guard", {
+          action: "blocked",
+          reason: "explicit_user_confirmation_required",
+          command: command.slice(0, 200),
+        });
+        return {
+          approved: false,
+          decision: "deny",
+          message:
+            "Install command blocked: explicit user confirmation is required first. Ask the user whether to continue installation.",
+        };
+      }
+
+      if (needsInstallConsent) {
+        this.runLog.log("install_guard", {
+          action: "allowed",
+          reason: "user_confirmed",
+          command: command.slice(0, 200),
+        });
+      }
+
+      if (needsCustomSkillAuthoringConsent && !this.allowCustomSkillAuthoringForCurrentRun) {
+        this.awaitingCustomSkillAuthoringConfirmation = true;
+        this.runLog.log("custom_skill_guard", {
+          action: "blocked",
+          reason: "explicit_custom_skill_authoring_confirmation_required",
+          command: command.slice(0, 200),
+        });
+        return {
+          approved: false,
+          decision: "deny",
+          message:
+            "Manual local skill creation command blocked by policy. Use ClawHub discovery/install flow first, or ask the user to explicitly confirm custom skill authoring.",
+        };
+      }
+
+      if (needsCustomSkillAuthoringConsent) {
+        this.runLog.log("custom_skill_guard", {
+          action: "allowed",
+          reason: "user_confirmed_custom_skill_authoring",
+          command: command.slice(0, 200),
+        });
+      }
+
+      if (needsWorkaroundConsent && !this.allowWorkaroundForCurrentRun) {
+        this.awaitingWorkaroundConfirmation = true;
+        this.runLog.log("workaround_guard", {
+          action: "blocked",
+          reason: "explicit_workaround_opt_in_required",
+          command: command.slice(0, 200),
+        });
+        return {
+          approved: false,
+          decision: "deny",
+          message:
+            "Local workaround command blocked by policy. First explain the capability gap and ask whether to search/install a Cloud Hub skill, or get explicit user opt-in for workaround mode.",
+        };
+      }
+
+      if (needsWorkaroundConsent) {
+        this.runLog.log("workaround_guard", {
+          action: "allowed",
+          reason: "user_opted_in_workaround_mode",
+          command: command.slice(0, 200),
+        });
+      }
+
+      if (base) {
+        return base(command, cwd);
+      }
+
+      return { approved: true, decision: "allow-once" };
+    };
+  }
+
   /**
    * Advance to the next non-cooldown auth profile.
    * Returns true if a new profile was activated, false if exhausted.
diff --git a/packages/core/src/agent/system-prompt/sections.test.ts b/packages/core/src/agent/system-prompt/sections.test.ts
index da5b4f08..fb82b409 100644
--- a/packages/core/src/agent/system-prompt/sections.test.ts
+++ b/packages/core/src/agent/system-prompt/sections.test.ts
@@ -224,6 +224,9 @@ describe("buildSkillsSection", () => {
     expect(text).toContain("capability gap");
     expect(text).toContain("explicit user confirmation");
     expect(text).toContain("clawhub install");
+    expect(text).toContain("third-party service requests");
+    expect(text).toContain("local workaround commands");
+    expect(text).toContain("spotify_player");
   });
 
   it("surfaces installed skill IDs and prioritizes meta skill guidance when present", () => {
@@ -240,6 +243,7 @@ describe("buildSkillsSection", () => {
     expect(text).toContain("`meta-skill-installer`");
     expect(text).toContain("is installed");
     expect(text).toContain("ClawHub search");
+    expect(text).toContain("run ClawHub discovery first");
   });
 
   it("returns empty in minimal mode", () => {
diff --git a/packages/core/src/agent/system-prompt/sections.ts b/packages/core/src/agent/system-prompt/sections.ts
index 32072dd4..554b3903 100644
--- a/packages/core/src/agent/system-prompt/sections.ts
+++ b/packages/core/src/agent/system-prompt/sections.ts
@@ -411,20 +411,25 @@ export function buildSkillsSection(
     "- If multiple could apply: choose the most specific one.",
     "- If none clearly apply but an **inactive skill** matches the user's intent: suggest activating it.",
     "- If the request needs a capability you currently lack: do not stop at refusal. Treat it as a capability gap and propose a recovery path.",
+    "- For third-party service requests (Spotify, Notion, Slack, Jira, etc.), do not jump to ad-hoc shell/app hacks as the default path.",
+    "- Treat local CLIs/scripts (for example `spogo`, `spotify_player`, `osascript`, `ha.sh`) as workaround mode: only use them after explicit user opt-in.",
   );
 
   if (hasMetaSkillInstaller) {
     lines.push(
       "- `meta-skill-installer` is installed: for capability gaps with no matching installed skill, proactively offer ClawHub search + security review + explicit install confirmation.",
+      "- With `meta-skill-installer` installed, run ClawHub discovery first (`clawhub search`) before proposing to hand-build a new custom skill.",
     );
   } else {
     lines.push(
       "- If `meta-skill-installer` is available and no installed skill matches: proactively offer to search ClawHub for candidates and run security review before install.",
+      "- Prefer ClawHub discovery over creating a brand-new custom skill from scratch unless the user explicitly asks for custom skill authoring.",
     );
   }
 
   lines.push(
     "- Ask for explicit user confirmation before final `clawhub install` / `clawhub update` unless the user already clearly asked you to install in this turn.",
+    "- Only use local workaround commands (for example `osascript` or custom shell scripts) if the user explicitly asks for workaround mode or declines skill installation.",
     "- After install/update, verify the skill path and retry the original user task.",
     "",
     budgeted,
diff --git a/packages/core/src/agent/tools/exec-approval-types.ts b/packages/core/src/agent/tools/exec-approval-types.ts
index 9c32b3da..9b1ab449 100644
--- a/packages/core/src/agent/tools/exec-approval-types.ts
+++ b/packages/core/src/agent/tools/exec-approval-types.ts
@@ -40,6 +40,8 @@ export interface ExecApprovalRequest {
 export interface ApprovalResult {
   approved: boolean;
   decision: ApprovalDecision;
+  /** Optional denial/approval message for the exec tool response */
+  message?: string | undefined;
 }
 
 // ============ Configuration ============
diff --git a/packages/core/src/agent/tools/exec.ts b/packages/core/src/agent/tools/exec.ts
index 41b51550..07686706 100644
--- a/packages/core/src/agent/tools/exec.ts
+++ b/packages/core/src/agent/tools/exec.ts
@@ -59,10 +59,11 @@ export function createExecTool(
       if (onApprovalNeeded) {
         const approvalResult = await onApprovalNeeded(command, effectiveCwd);
         if (!approvalResult.approved) {
+          const denialText = approvalResult.message?.trim() || "Command execution denied by user.";
           return {
-            content: [{ type: "text", text: "Command execution denied by user." }],
+            content: [{ type: "text", text: denialText }],
             details: {
-              output: "Command execution denied by user.",
+              output: denialText,
               exitCode: 1,
               truncated: false,
             },

From 8a2b3e10f3d6a97a9edef926cf5059ea6f5d3a81 Mon Sep 17 00:00:00 2001
From: Jiayuan Zhang <forrestchang7@gmail.com>
Date: Tue, 17 Feb 2026 02:23:23 +0800
Subject: [PATCH 7/7] test(e2e): add natural Notion gap-discovery benchmark
 case

---
 docs/e2e-skills-benchmark.md                  |  6 ++-
 scripts/e2e-skills-benchmark/analyze.mjs      | 45 +++++++++++++++++++
 .../case-04-gap-discovery-spotify-ux.txt      | 11 +----
 .../cases/case-05-gap-discovery-notion-ux.txt |  1 +
 4 files changed, 51 insertions(+), 12 deletions(-)
 create mode 100644 scripts/e2e-skills-benchmark/cases/case-05-gap-discovery-notion-ux.txt

diff --git a/docs/e2e-skills-benchmark.md b/docs/e2e-skills-benchmark.md
index 674a3b4b..c82ca61b 100644
--- a/docs/e2e-skills-benchmark.md
+++ b/docs/e2e-skills-benchmark.md
@@ -7,7 +7,7 @@ This benchmark validates the meta skill workflow for capability-gap discovery, C
 - Domain: skill discovery + installation + update
 - Focus: `skills/meta-skill-installer`
 - Providers: default `kimi-coding` (override with `PROVIDERS`)
-- Cases: 4
+- Cases: 5
 
 Case prompts are stored in:
 - `scripts/e2e-skills-benchmark/cases/`
@@ -20,6 +20,7 @@ The case set references real public pages from ClawHub:
 - [Home Assistant](https://clawhub.ai/skills/homeassistant)
 - [CodexMonitor](https://clawhub.ai/odrobnik/codexmonitor)
 - [Spotify (gap-discovery UX flow)](https://clawhub.ai/search?q=spotify)
+- [Notion (gap-discovery UX flow)](https://clawhub.ai/search?q=notion)
 
 ## Prerequisites
 
@@ -87,7 +88,8 @@ For each run:
    - `clawhub install`
    - `review-skill-security.mjs`
    - for case 03 also `clawhub update`
-   - for case 04, final response must include ClawHub + install confirmation language, and must not run `clawhub install/update` before confirmation
+   - for case 04, prompt is a natural user request only; agent must self-discover capability gap, propose ClawHub + security review + install confirmation, and must not run workaround commands (`osascript`, `ha.sh`, `spogo`, `spotify_player`) before user confirmation
+   - for case 05, prompt is a natural Notion request; agent must discover missing capability, search skill candidates, trigger `install_guard` (blocked until confirmation), and ask for explicit install consent plus token/auth prerequisites
 
 ## Notes
 
diff --git a/scripts/e2e-skills-benchmark/analyze.mjs b/scripts/e2e-skills-benchmark/analyze.mjs
index 0eaee0ed..3f621328 100755
--- a/scripts/e2e-skills-benchmark/analyze.mjs
+++ b/scripts/e2e-skills-benchmark/analyze.mjs
@@ -66,6 +66,7 @@ const CASE_RULES = {
   "case-04-gap-discovery-spotify-ux": {
     requireExecUsage: false,
     requiredResponseRegex: [
+      "缺少|没有.*(技能|能力|集成)|capability gap",
       "clawhub|cloud\\s*hub|cloudhub",
       "安装|install",
       "是否|要不要|would you like|do you want",
@@ -74,6 +75,34 @@ const CASE_RULES = {
     forbiddenCommandTokens: [
       ["clawhub", "install"],
       ["clawhub", "update"],
+      ["osascript"],
+      ["spogo"],
+      ["spotify_player"],
+      ["ha.sh"],
+      ["/api/states"],
+    ],
+  },
+  "case-05-gap-discovery-notion-ux": {
+    requireExecUsage: false,
+    requiredCommandTokens: [
+      ["clawhub", "search"],
+      ["notion"],
+    ],
+    requiredEventTokens: [
+      ["install_guard", "blocked"],
+    ],
+    requiredResponseRegex: [
+      "notion",
+      "安装|install",
+      "是否|要不要|would you like|do you want|同意",
+      "token|授权|integration",
+    ],
+    forbiddenCommandTokens: [
+      ["osascript"],
+      ["spogo"],
+      ["spotify_player"],
+      ["ha.sh"],
+      ["/api/states"],
     ],
   },
 };
@@ -324,6 +353,22 @@ for (let i = 1; i < rows.length; i++) {
       }
     }
 
+    if (Array.isArray(rules.requiredEventTokens)) {
+      const eventLines = events.map((event) => JSON.stringify(event).toLowerCase());
+      for (let r = 0; r < rules.requiredEventTokens.length; r++) {
+        const tokenList = rules.requiredEventTokens[r];
+        const passed = eventLines.some((line) =>
+          tokenList.every((token) => line.includes(token.toLowerCase())),
+        );
+        addCheck(
+          analysis,
+          `event-${r + 1}`,
+          `event log contains tokens: ${tokenList.join(" + ")}`,
+          passed,
+        );
+      }
+    }
+
     if (Array.isArray(rules.forbiddenCommandTokens)) {
       for (let r = 0; r < rules.forbiddenCommandTokens.length; r++) {
         const tokenList = rules.forbiddenCommandTokens[r];
diff --git a/scripts/e2e-skills-benchmark/cases/case-04-gap-discovery-spotify-ux.txt b/scripts/e2e-skills-benchmark/cases/case-04-gap-discovery-spotify-ux.txt
index 5c8a7c16..bf2b917a 100644
--- a/scripts/e2e-skills-benchmark/cases/case-04-gap-discovery-spotify-ux.txt
+++ b/scripts/e2e-skills-benchmark/cases/case-04-gap-discovery-spotify-ux.txt
@@ -1,10 +1 @@
-请模拟真实用户的首轮请求：
-
-用户说：「随机播放 spotify 中的音乐」。
-
-要求：
-1. 先明确你当前缺少直接控制 Spotify 的能力（能力缺口识别）。
-2. 不要直接拒绝结束，要给出恢复路径：引导用户可通过 ClawHub 搜索并安装相关 skill。
-3. 明确说明会先做安全审查，再安装。
-4. 在同一轮里必须询问用户是否要继续安装（显式确认），在用户确认前不要执行 `clawhub install` 或 `clawhub update`。
-5. 输出用中文，且要包含关键词：ClawHub、安全审查、安装、是否要继续。
+随机播放 spotify 中的音乐
diff --git a/scripts/e2e-skills-benchmark/cases/case-05-gap-discovery-notion-ux.txt b/scripts/e2e-skills-benchmark/cases/case-05-gap-discovery-notion-ux.txt
new file mode 100644
index 00000000..aff66735
--- /dev/null
+++ b/scripts/e2e-skills-benchmark/cases/case-05-gap-discovery-notion-ux.txt
@@ -0,0 +1 @@
+帮我在 Notion 新建一个页面，标题是今天待办，并写入三条任务：修复登录 bug、写周报、安排评审