feat: pivot to AI-native task management platform (#232)

Replace the agent framework codebase with a new monorepo structure for an AI-native Linear-like product where agents are first-class citizens.

New architecture:
- server/ — Go backend (Chi + gorilla/websocket + sqlc)
  - API server with REST routes for issues, agents, inbox, workspaces
  - WebSocket hub for real-time updates
  - Local daemon entry point for agent runtime connection
  - PostgreSQL migration with 13 tables (issue, agent, inbox, etc.)
  - WebSocket protocol types for server<->daemon communication
- apps/web/ — Next.js 16 frontend
  - Dashboard layout with sidebar navigation
  - Route skeleton: inbox, issues, agents, board, settings
- packages/ui/ — Preserved shadcn/ui design system (26+ components)
- packages/types/ — Full API contract types (Issue, Agent, Workspace, Inbox, Events)
- packages/sdk/ — REST ApiClient + WebSocket WSClient
- packages/store/ — Zustand stores (issue, agent, inbox, auth)
- packages/hooks/ — React hooks (useIssues, useAgents, useInbox, useRealtime)
- packages/utils/ — Shared utilities

Removed: apps/cli, apps/desktop, apps/mobile, apps/gateway, packages/core, skills/, and all agent-framework code.

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-20 17:55:49 +08:00 · 2026-03-20 17:55:49 +08:00 · d4f5c5b16f
commit d4f5c5b16f
parent 3f589d8326
677 changed files with 2779 additions and 122531 deletions
--- a/scripts/archive-dev-data.sh
+++ b/scripts/archive-dev-data.sh
@@ -1,42 +0,0 @@
-#!/usr/bin/env bash
-#
-# Archive and clean the dev environment data.
-#
-# Moves ~/.super-multica-dev and ~/Documents/Multica-dev into a
-# timestamped archive directory for later debugging / analysis.
-#
-# Usage:
-#   pnpm dev:local:archive
-#
-# Archives are stored in: ~/.super-multica-dev-archives/<timestamp>/
-
-set -euo pipefail
-
-TIMESTAMP=$(date +"%Y%m%d-%H%M%S")
-ARCHIVE_BASE="$HOME/.super-multica-dev-archives"
-ARCHIVE_DIR="$ARCHIVE_BASE/$TIMESTAMP"
-
-DEV_DATA="$HOME/.super-multica-dev"
-DEV_WORKSPACE="$HOME/Documents/Multica-dev"
-
-# Check if there's anything to archive
-if [ ! -d "$DEV_DATA" ] && [ ! -d "$DEV_WORKSPACE" ]; then
-  echo "Nothing to archive — neither $DEV_DATA nor $DEV_WORKSPACE exists."
-  exit 0
-fi
-
-mkdir -p "$ARCHIVE_DIR"
-
-if [ -d "$DEV_DATA" ]; then
-  mv "$DEV_DATA" "$ARCHIVE_DIR/data"
-  echo "  Archived $DEV_DATA -> $ARCHIVE_DIR/data"
-fi
-
-if [ -d "$DEV_WORKSPACE" ]; then
-  mv "$DEV_WORKSPACE" "$ARCHIVE_DIR/workspace"
-  echo "  Archived $DEV_WORKSPACE -> $ARCHIVE_DIR/workspace"
-fi
-
-echo ""
-echo "Archived to: $ARCHIVE_DIR"
-echo "Dev environment is now clean. Run 'pnpm dev:local' to start fresh."
--- a/scripts/build-cli.js
+++ b/scripts/build-cli.js
@@ -1,79 +0,0 @@
-#!/usr/bin/env node
-import * as esbuild from "esbuild";
-import { fileURLToPath } from "url";
-import { dirname, resolve } from "path";
-import { readFileSync, chmodSync } from "fs";
-
-const __dirname = dirname(fileURLToPath(import.meta.url));
-const rootDir = resolve(__dirname, "..");
-
-// Read package.json to get all dependencies
-const pkg = JSON.parse(readFileSync(resolve(rootDir, "package.json"), "utf8"));
-const allDeps = [
-  ...Object.keys(pkg.dependencies || {}),
-  ...Object.keys(pkg.devDependencies || {}),
-];
-
-// Plugin to strip shebangs from source files (they get bundled otherwise)
-const stripShebangPlugin = {
-  name: "strip-shebang",
-  setup(build) {
-    build.onLoad({ filter: /\.ts$/ }, async (args) => {
-      const source = readFileSync(args.path, "utf8");
-      // Remove shebang if present
-      const contents = source.replace(/^#!.*\n/, "");
-      return { contents, loader: "ts" };
-    });
-  },
-};
-
-async function build() {
-  // Unified CLI entry point
-  const entryPoint = {
-    entry: "src/agent/cli/index.ts",
-    outfile: "bin/multica.mjs",
-  };
-
-  console.log(`Building ${entryPoint.entry} -> ${entryPoint.outfile}...`);
-
-  await esbuild.build({
-    entryPoints: [resolve(rootDir, entryPoint.entry)],
-    outfile: resolve(rootDir, entryPoint.outfile),
-    bundle: true,
-    platform: "node",
-    target: "node20",
-    format: "esm",
-    banner: {
-      js: "#!/usr/bin/env node",
-    },
-    plugins: [stripShebangPlugin],
-    sourcemap: true,
-    minify: false,
-    // Externalize all dependencies - they will be loaded from node_modules at runtime
-    external: allDeps,
-  });
-
-  // Make executable
-  chmodSync(resolve(rootDir, entryPoint.outfile), 0o755);
-  console.log(`  ✓ ${entryPoint.outfile}`);
-
-  console.log("\nBuild complete! Binary is in ./bin/");
-  console.log("\nUsage:");
-  console.log("  multica                    # Interactive mode (default)");
-  console.log("  multica run <prompt>       # Run a single prompt");
-  console.log("  multica chat               # Interactive mode");
-  console.log("  multica session list       # List sessions");
-  console.log("  multica profile list       # List profiles");
-  console.log("  multica skills list        # List skills");
-  console.log("  multica tools list         # List tools");
-  console.log("  multica credentials init   # Initialize credentials");
-  console.log("  multica dev                # Start dev servers");
-  console.log("  multica help               # Show help");
-  console.log("\nNote: The built binary requires node_modules to be present.");
-  console.log("Run 'pnpm install --prod' to install only production dependencies.");
-}
-
-build().catch((err) => {
-  console.error(err);
-  process.exit(1);
-});
--- a/scripts/compaction-benchmark/run.sh
+++ b/scripts/compaction-benchmark/run.sh
@@ -1,137 +0,0 @@
-#!/usr/bin/env bash
-# Compaction Benchmark - Multi-turn test with low context window
-#
-# This script runs a series of prompts against the Multica agent with a very
-# low context window (20k tokens) to force compaction to trigger quickly.
-# The run-log output is then available for analysis.
-#
-# Usage:
-#   bash scripts/compaction-benchmark/run.sh [provider]
-#
-# Default provider: kimi-coding
-
-set -euo pipefail
-
-PROVIDER="${1:-kimi-coding}"
-CONTEXT_WINDOW="${2:-20000}"
-SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
-ROOT_DIR="$(cd "$SCRIPT_DIR/../.." && pwd)"
-
-export SMC_DATA_DIR=~/.super-multica-e2e
-
-echo "=== Compaction Benchmark ==="
-echo "Provider: $PROVIDER"
-echo "Context Window: $CONTEXT_WINDOW tokens"
-echo "Data Dir: $SMC_DATA_DIR"
-echo ""
-
-# Clean previous E2E data
-rm -rf "$SMC_DATA_DIR"
-
-cd "$ROOT_DIR"
-
-# Turn 1: Start a session with a substantial prompt that generates tool usage
-echo "--- Turn 1: Initial prompt (read multiple files) ---"
-TURN1_OUTPUT=$(SMC_DATA_DIR="$SMC_DATA_DIR" pnpm multica run \
-  --run-log \
-  --provider "$PROVIDER" \
-  --context-window "$CONTEXT_WINDOW" \
-  "Read the following files and give me a brief summary of each: packages/core/src/agent/runner.ts, packages/core/src/agent/session/session-manager.ts, packages/core/src/agent/context-window/token-estimation.ts. List the main exports and key functions in each file." \
-  2>&1)
-
-# Extract session ID from stderr output
-SESSION_ID=$(echo "$TURN1_OUTPUT" | grep -o '\[session: [^]]*\]' | head -1 | sed 's/\[session: //;s/\]//')
-SESSION_DIR=$(echo "$TURN1_OUTPUT" | grep -o '\[session-dir: [^]]*\]' | head -1 | sed 's/\[session-dir: //;s/\]//')
-
-if [ -z "$SESSION_ID" ]; then
-  echo "ERROR: Could not extract session ID from output"
-  echo "$TURN1_OUTPUT"
-  exit 1
-fi
-
-echo "Session ID: $SESSION_ID"
-echo "Session Dir: $SESSION_DIR"
-echo ""
-
-# Turn 2: Continue the session with more file reads to push context higher
-echo "--- Turn 2: More file reads (push context higher) ---"
-TURN2_OUTPUT=$(SMC_DATA_DIR="$SMC_DATA_DIR" pnpm multica run \
-  --run-log \
-  --provider "$PROVIDER" \
-  --context-window "$CONTEXT_WINDOW" \
-  --session "$SESSION_ID" \
-  "Now also read packages/core/src/agent/context-window/summarization.ts and packages/core/src/agent/context-window/tool-result-pruning.ts. Describe the key algorithms in each." \
-  2>&1)
-
-echo "$TURN2_OUTPUT" | head -5
-echo ""
-
-# Turn 3: More context-heavy work
-echo "--- Turn 3: Additional analysis (should trigger compaction) ---"
-TURN3_OUTPUT=$(SMC_DATA_DIR="$SMC_DATA_DIR" pnpm multica run \
-  --run-log \
-  --provider "$PROVIDER" \
-  --context-window "$CONTEXT_WINDOW" \
-  --session "$SESSION_ID" \
-  "Read packages/core/src/agent/session/compaction.ts and explain the three compaction modes. Also read packages/core/src/agent/context-window/guard.ts and explain the guard thresholds." \
-  2>&1)
-
-echo "$TURN3_OUTPUT" | head -5
-echo ""
-
-# Turn 4: More tool usage
-echo "--- Turn 4: Write and test (more context pressure) ---"
-TURN4_OUTPUT=$(SMC_DATA_DIR="$SMC_DATA_DIR" pnpm multica run \
-  --run-log \
-  --provider "$PROVIDER" \
-  --context-window "$CONTEXT_WINDOW" \
-  --session "$SESSION_ID" \
-  "Based on everything you've read so far, list all the constants and thresholds used in the compaction system. Provide exact values and which file each is defined in." \
-  2>&1)
-
-echo "$TURN4_OUTPUT" | head -5
-echo ""
-
-# Output analysis summary
-echo "=== Benchmark Complete ==="
-echo "Session Dir: $SESSION_DIR"
-echo ""
-
-# Show run-log stats
-if [ -f "$SESSION_DIR/run-log.jsonl" ]; then
-  echo "--- Run Log Event Summary ---"
-  echo "Total events: $(wc -l < "$SESSION_DIR/run-log.jsonl")"
-  echo ""
-  echo "Events by type:"
-  cat "$SESSION_DIR/run-log.jsonl" | python3 -c "
-import sys, json
-from collections import Counter
-events = Counter()
-for line in sys.stdin:
-    try:
-        obj = json.loads(line.strip())
-        events[obj.get('event', 'unknown')] += 1
-    except:
-        pass
-for event, count in sorted(events.items()):
-    print(f'  {event}: {count}')
-" 2>/dev/null || echo "  (python3 not available for analysis)"
-  echo ""
-
-  echo "--- Compaction Events ---"
-  cat "$SESSION_DIR/run-log.jsonl" | python3 -c "
-import sys, json
-for line in sys.stdin:
-    try:
-        obj = json.loads(line.strip())
-        event = obj.get('event', '')
-        if 'compact' in event or 'overflow' in event or 'pruning' in event:
-            print(json.dumps(obj, indent=2))
-    except:
-        pass
-" 2>/dev/null || echo "  (python3 not available for analysis)"
-fi
-
-echo ""
-echo "=== Full run-log path: $SESSION_DIR/run-log.jsonl ==="
-echo "=== Session file path: $SESSION_DIR/session.jsonl ==="
--- a/scripts/dev-local.sh
+++ b/scripts/dev-local.sh
@@ -1,56 +0,0 @@
-#!/usr/bin/env bash
-#
-# Local development: Gateway (with Telegram bot) + Desktop + Web (for login)
-#
-# Usage:
-#   pnpm dev:local
-#
-# Reads TELEGRAM_BOT_TOKEN from .env at the repo root.
-# Gateway runs on port 4000 in long-polling mode (no TELEGRAM_WEBHOOK_URL needed).
-# Web app runs on port 3000 (default) for OAuth login flow.
-# Desktop connects to the local Gateway and uses local Web for login.
-
-set -euo pipefail
-
-SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
-ROOT_DIR="$SCRIPT_DIR/.."
-ENV_FILE="$ROOT_DIR/.env"
-
-# Load .env
-if [ ! -f "$ENV_FILE" ]; then
-  echo "Error: .env file not found at $ENV_FILE"
-  echo "Copy .env.example to .env and fill in TELEGRAM_BOT_TOKEN"
-  exit 1
-fi
-
-set -a
-source "$ENV_FILE"
-set +a
-
-if [ -z "${TELEGRAM_BOT_TOKEN:-}" ]; then
-  echo "Error: TELEGRAM_BOT_TOKEN not set in .env"
-  exit 1
-fi
-
-echo "Starting local dev environment..."
-echo "  Gateway:  http://localhost:4000 (Telegram long-polling mode)"
-echo "  Web:      http://localhost:3000 (OAuth login)"
-echo "  Desktop:  connecting to local Gateway + Web"
-echo "  Data dir: ~/.super-multica-dev (isolated from production)"
-echo "  Workspace: ~/Documents/Multica-dev (isolated from production)"
-echo ""
-
-# Build shared packages first
-pnpm turbo build --filter=@multica/types --filter=@multica/utils --filter=@multica/core
-
-# Start everything
-# Gateway uses PORT=4000 to avoid conflict with Web app on port 3000
-exec pnpm concurrently \
-  -n types,utils,core,gateway,web,desktop \
-  -c blue,green,yellow,magenta,red,cyan \
-  "pnpm --filter @multica/types dev" \
-  "pnpm --filter @multica/utils dev" \
-  "pnpm --filter @multica/core dev" \
-  "PORT=4000 SMC_DATA_DIR=~/.super-multica-dev MULTICA_WORKSPACE_DIR=~/Documents/Multica-dev MULTICA_RUN_LOG=1 pnpm --filter @multica/gateway dev" \
-  "MULTICA_API_URL=https://api-dev.copilothub.ai pnpm --filter @multica/web dev" \
-  "GATEWAY_URL=http://localhost:4000 MAIN_VITE_WEB_URL=http://localhost:3000 SMC_DATA_DIR=~/.super-multica-dev MULTICA_WORKSPACE_DIR=~/Documents/Multica-dev MULTICA_RUN_LOG=1 pnpm --filter @multica/desktop dev"
--- a/scripts/e2e-finance-benchmark/cases/case-01-top10-financial-reports.txt
+++ b/scripts/e2e-finance-benchmark/cases/case-01-top10-financial-reports.txt
@@ -1,13 +0,0 @@
-Complete a high-complexity investment research task:
-
-Objective: Analyze the top 10 US stocks by market capitalization across their most recent three complete fiscal years and provide investment recommendations for 2026 (2026-01-01 to 2026-12-31).
-
-Requirements:
-1. Use "top 10 US stocks by market cap as of 2026-02-01" as the sample; if data windows are incomplete for certain companies, note this and substitute with the most recent available complete fiscal year.
-2. Generate 1 detailed analysis per company, covering at minimum: revenue and profit structure, gross/operating margin trends, cash flow quality, capex and buybacks/dividends, valuation range, and key risks.
-3. Generate an Excel file (.xlsx) with at least 4 sheets: `raw_data`, `company_scorecard`, `valuation`, `risk_matrix`.
-4. Generate a comprehensive report with cross-company comparison and tiering (core holding / watchlist / avoid), along with 2026 portfolio recommendations (including position ranges and trigger conditions).
-5. Output a separate `sources.md` listing key data source links and retrieval timestamps.
-6. If unable to generate xlsx directly, explain why and provide structurally equivalent CSV files.
-
-Execution requirements: First present an 8-12 step execution plan, then execute. Conclude with a self-check checklist confirming all files are complete.
--- a/scripts/e2e-finance-benchmark/cases/case-02-ai-value-chain-scorecard.txt
+++ b/scripts/e2e-finance-benchmark/cases/case-02-ai-value-chain-scorecard.txt
@@ -1,14 +0,0 @@
-Build an "AI Value Chain Fundamentals & Valuation Scorecard" project.
-
-Stock universe: NVDA, AMD, AVGO, MSFT, GOOGL, AMZN, META, TSM, ASML, ANET.
-Time range: 2023-01-01 to 2025-12-31 (fill gaps with most recent available data and flag accordingly).
-
-Requirements:
-1. Construct a 100-point scoring model with at least 6 dimensions: growth, profitability, capital efficiency, R&D intensity, cash flow quality, and valuation margin of safety.
-2. Provide weights and scoring logic for each dimension; must be reproducible.
-3. Generate an Excel file (.xlsx) with sheets: `input_data`, `factor_scores`, `weighted_rank`, `scenario_2026`.
-4. In `scenario_2026`, provide target ranges and trigger signals under three scenarios (optimistic / base / conservative).
-5. Produce `investment_memo.md` (including entry logic for the top 3 and avoidance logic for the bottom 3).
-6. Produce `sources.md` (source links + dates).
-
-Execution requirements: Plan before executing; conclude with a "reproducibility check" (can someone else reproduce your results following your steps).
--- a/scripts/e2e-finance-benchmark/cases/case-03-us-bank-stress-test.txt
+++ b/scripts/e2e-finance-benchmark/cases/case-03-us-bank-stress-test.txt
@@ -1,15 +0,0 @@
-Perform a "US Major Bank 2026 Stress Test" task.
-
-Sample: JPM, BAC, C, WFC, GS, MS.
-
-Requirements:
-1. Compile key metrics from the most recent three complete fiscal years (preferably 2023-2025): net interest margin (NIM), CET1, loan loss provisions, commercial real estate (CRE) exposure, deposit cost changes, unrealized losses, etc.
-2. Construct two stress scenarios:
-   - Mild Recession: unemployment +150bp, federal funds rate -100bp
-   - Severe Recession: unemployment +300bp, federal funds rate -200bp, CRE default rate significantly higher
-3. Estimate directional changes in profit and capital adequacy for each bank under both scenarios, and rank vulnerability.
-4. Generate an Excel file (.xlsx) with sheets: `bank_raw`, `stress_assumptions`, `impact_estimate`, `ranking`.
-5. Generate `risk_brief.md` containing "top 5 risk signals to watch."
-6. Generate `sources.md`.
-
-Execution requirements: Present methodology first, then results; conclude by listing the 3 assumptions you are least confident about.
--- a/scripts/e2e-finance-benchmark/cases/case-04-consumer-sector-macro-linkage.txt
+++ b/scripts/e2e-finance-benchmark/cases/case-04-consumer-sector-macro-linkage.txt
@@ -1,14 +0,0 @@
-Perform a "US Consumer Sector & Macro Variable Linkage Analysis."
-
-Sample companies: WMT, COST, TGT, HD, LOW, MCD, SBUX, NKE, DIS, AMZN.
-Time range: 2023-01-01 to 2025-12-31.
-
-Requirements:
-1. Split companies into "consumer staples" and "consumer discretionary" groups; compare revenue growth, margins, inventory changes, same-store sales (if available), and cash flow quality.
-2. Analyze each group's earnings elasticity relative to macro variables (CPI, real wages, unemployment, interest rates).
-3. Build a "2026 three-scenario" earnings elasticity matrix: soft landing / reflation / recession.
-4. Generate an Excel file (.xlsx) with sheets: `company_metrics`, `macro_series`, `elasticity_matrix`, `portfolio_actions`.
-5. Generate `strategy_note.md` with 2026 sector allocation recommendations and rebalancing trigger conditions.
-6. Generate `sources.md`.
-
-Execution requirements: Each allocation recommendation must explicitly state the verifiable metrics behind it.
--- a/scripts/e2e-finance-benchmark/cases/case-05-energy-transport-sensitivity.txt
+++ b/scripts/e2e-finance-benchmark/cases/case-05-energy-transport-sensitivity.txt
@@ -1,17 +0,0 @@
-Complete an "Energy Price Shock Sensitivity Analysis for Energy & Transport Sectors."
-
-Sample: XOM, CVX, COP, SLB, DAL, UAL, FDX, UPS.
-Time range: 2023-01-01 to 2025-12-31.
-
-Requirements:
-1. Summarize each company's sensitivity direction to oil/fuel costs and sources of operating leverage.
-2. Construct three oil price paths for 2026:
-   - Scenario A: WTI average $60
-   - Scenario B: WTI average $80
-   - Scenario C: WTI average $100
-3. Estimate directional changes in earnings and valuation for each company under different scenarios (ranges are acceptable over point estimates, but rationale must be provided).
-4. Generate an Excel file (.xlsx) with sheets: `raw_financials`, `oil_scenarios`, `sensitivity_map`, `trade_ideas`.
-5. Generate `hedge_plan.md` proposing at least 2 hedging or paired trade strategies, including conditions under which they would fail.
-6. Generate `sources.md`.
-
-Execution requirements: Conclusions must include "base position + hedge position + trigger thresholds."
--- a/scripts/e2e-finance-benchmark/cases/case-06-cross-asset-allocation.txt
+++ b/scripts/e2e-finance-benchmark/cases/case-06-cross-asset-allocation.txt
@@ -1,16 +0,0 @@
-Build a "Cross-Asset Tactical Allocation (2026)" project.
-
-Asset universe: SPY, QQQ, IWM, TLT, IEF, HYG, GLD, DBC, BTC-USD.
-Historical period: 2021-01-01 to 2025-12-31 (monthly frequency is sufficient).
-
-Requirements:
-1. Calculate and compare key metrics: annualized return, volatility, maximum drawdown, Sharpe ratio, and correlation matrix.
-2. Design two portfolios:
-   - Defensive (target: minimize maximum drawdown)
-   - Offensive (target: higher risk-adjusted returns)
-3. Stress test both portfolios under three 2026 scenarios (growth slowdown / inflation resurgence / liquidity easing), and provide rebalancing rules.
-4. Generate an Excel file (.xlsx) with sheets: `price_returns`, `risk_metrics`, `corr_matrix`, `portfolio_defensive`, `portfolio_offensive`, `scenario_test`.
-5. Generate `allocation_memo.md` explaining why these two portfolios are actionable in 2026.
-6. Generate `sources.md`.
-
-Execution requirements: Explicitly state rebalancing frequency, stop-loss rules, and re-entry conditions for each portfolio.
--- a/scripts/e2e-finance-benchmark/cases/case-07-reit-rate-risk.txt
+++ b/scripts/e2e-finance-benchmark/cases/case-07-reit-rate-risk.txt
@@ -1,13 +0,0 @@
-Perform a "REIT Investment Screening in a High-Rate Environment" task.
-
-Sample: VNQ, PLD, AMT, EQIX, O, SPG, PSA, DLR.
-
-Requirements:
-1. Compile key metrics from the most recent three complete fiscal years: FFO/AFFO growth, leverage, interest coverage, debt maturity profile, and dividend coverage.
-2. Design three 2026 interest rate scenarios (10Y Treasury yield at 3.5% / 4.5% / 5.5%) and analyze valuation pressure and dividend sustainability.
-3. Classify each as "hold / watchlist / avoid" and explain the 2-3 most critical driving factors.
-4. Generate an Excel file (.xlsx) with sheets: `reit_raw`, `debt_profile`, `rate_scenarios`, `selection_result`.
-5. Generate `reit_investment_note.md`.
-6. Generate `sources.md`.
-
-Execution requirements: If data is missing, it must be explicitly marked as NA in the tables; silent omission is not allowed.
--- a/scripts/e2e-finance-benchmark/cases/case-08-earnings-quality-forensics.txt
+++ b/scripts/e2e-finance-benchmark/cases/case-08-earnings-quality-forensics.txt
@@ -1,13 +0,0 @@
-Perform an "Earnings Quality Forensic Analysis."
-
-Sample: AAPL, MSFT, GOOGL, AMZN, META, NVDA, TSLA, BRK.B, UNH, JPM.
-Time range: 2023-01-01 to 2025-12-31.
-
-Requirements:
-1. Establish an earnings quality inspection framework covering at minimum: accruals quality, operating cash flow to net income matching, stock-based compensation dilution, buyback-to-debt relationship, and one-time item impact.
-2. Assign each company a Red / Yellow / Green rating with traceable supporting evidence.
-3. Generate an Excel file (.xlsx) with sheets: `quality_raw`, `forensic_flags`, `rating_summary`, `watchlist_2026`.
-4. Generate `forensic_report.md` summarizing the 5 most concerning red flags.
-5. Generate `sources.md`.
-
-Execution requirements: The report must clearly distinguish "which conclusions are factual vs. which are inferred."
--- a/scripts/e2e-finance-benchmark/cases/case-09-post-earnings-drift-study.txt
+++ b/scripts/e2e-finance-benchmark/cases/case-09-post-earnings-drift-study.txt
@@ -1,14 +0,0 @@
-Perform a "Post-Earnings Announcement Drift (PEAD) Strategy Feasibility Study."
-
-Research period: 2023-01-01 to 2025-12-31.
-Sample: Select at least 30 US large/mid-cap stocks (provide selection criteria).
-
-Requirements:
-1. Define an executable PEAD signal (e.g., post-earnings 1-3 day information, earnings surprise proxy, or post-announcement momentum proxy) and explain its limitations.
-2. Group the sample (high signal / low signal) and analyze performance differences at 1-month and 3-month horizons.
-3. Add basic risk controls (position limits, stop-loss, sector exposure limits) and evaluate whether the strategy warrants a small-scale pilot in 2026.
-4. Generate an Excel file (.xlsx) with sheets: `universe`, `signal_definition`, `group_performance`, `risk_controls`, `pilot_plan_2026`.
-5. Generate `pead_study.md` (covering methodology, results, sources of bias, and implementation recommendations).
-6. Generate `sources.md`.
-
-Execution requirements: Must provide "failure scenarios" and objective conditions for "stopping the pilot."
--- a/scripts/e2e-finance-benchmark/cases/case-10-investment-committee-pack.txt
+++ b/scripts/e2e-finance-benchmark/cases/case-10-investment-committee-pack.txt
@@ -1,12 +0,0 @@
-Produce a "Q2 2026 Investment Committee Materials Pack."
-
-Objective: Create meeting-ready investment committee documents for a USD multi-asset portfolio.
-
-Requirements:
-1. Output a summary document `committee_pack.md` with at least the following sections: macro outlook, equities, rates, credit, commodities, portfolio risk, and action list.
-2. Output an Excel workbook (.xlsx) with at least these sheets: `macro_dashboard`, `equity_watchlist`, `rates_credit`, `commodity_view`, `portfolio_risk`, `action_tracker`.
-3. In `action_tracker`, provide actionable items for Q2 2026, each with: trigger condition, target position change, risk control threshold, and review date.
-4. Additionally output `devil_advocate.md`, specifically rebutting your own core investment views with at least 5 counter-arguments.
-5. Additionally output `sources.md` listing key data sources and dates.
-
-Execution requirements: Plan first, then execute; conclude with a "10-minute oral briefing outline for the investment committee."
--- a/scripts/e2e-finance-benchmark/run.sh
+++ b/scripts/e2e-finance-benchmark/run.sh
@@ -1,166 +0,0 @@
-#!/usr/bin/env bash
-set -euo pipefail
-
-SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
-ROOT_DIR="$(cd "${SCRIPT_DIR}/../.." && pwd)"
-CASES_DIR="${SCRIPT_DIR}/cases"
-TIMESTAMP="${TIMESTAMP:-$(date +%Y%m%d-%H%M%S)}"
-OUT_DIR="${OUT_DIR:-${ROOT_DIR}/.context/finance-e2e-runs/${TIMESTAMP}}"
-RESULTS_DIR="${OUT_DIR}/results"
-MANIFEST="${OUT_DIR}/manifest.tsv"
-
-# Required environment for agent-driven E2E with web_search/data tools.
-SMC_DATA_DIR="${SMC_DATA_DIR:-$HOME/.super-multica-e2e}"
-MULTICA_API_URL="${MULTICA_API_URL:-https://api-dev.copilothub.ai}"
-PROVIDERS_RAW="${PROVIDERS:-kimi-coding claude-code}"
-CASE_GLOB="${CASE_GLOB:-case-*.txt}"
-CASE_TIMEOUT_SEC="${CASE_TIMEOUT_SEC:-900}"
-MAX_PARALLEL="${MAX_PARALLEL:-2}"
-TIMEOUT_ENABLED="true"
-if [[ "${CASE_TIMEOUT_SEC}" =~ ^[0-9]+$ ]] && (( CASE_TIMEOUT_SEC <= 0 )); then
-  TIMEOUT_ENABLED="false"
-fi
-
-if ! [[ "${MAX_PARALLEL}" =~ ^[0-9]+$ ]] || (( MAX_PARALLEL <= 0 )); then
-  echo "MAX_PARALLEL must be a positive integer, got: ${MAX_PARALLEL}" >&2
-  exit 1
-fi
-
-if [[ "${1:-}" == "--worker" ]]; then
-  provider="${2:?missing provider}"
-  case_file="${3:?missing case file}"
-  case_base="$(basename "${case_file}")"
-  case_id="${case_base%.txt}"
-  log_file="${OUT_DIR}/${provider}-${case_id}.log"
-  result_file="${RESULTS_DIR}/${provider}-${case_id}.tsv"
-
-  prompt="$(cat "${case_file}")"
-
-  status="success"
-  timed_out="false"
-  started_epoch="$(date +%s)"
-  started_at="$(date -u +%Y-%m-%dT%H:%M:%SZ)"
-
-  SMC_DATA_DIR="${SMC_DATA_DIR}" \
-    MULTICA_API_URL="${MULTICA_API_URL}" \
-    pnpm multica run --run-log --provider "${provider}" "${prompt}" > "${log_file}" 2>&1 &
-  cmd_pid=$!
-
-  while kill -0 "${cmd_pid}" 2>/dev/null; do
-    if [[ "${TIMEOUT_ENABLED}" == "true" ]]; then
-      now="$(date +%s)"
-      elapsed="$((now - started_epoch))"
-      if (( elapsed >= CASE_TIMEOUT_SEC )); then
-        timed_out="true"
-        kill "${cmd_pid}" 2>/dev/null || true
-        sleep 1
-        kill -9 "${cmd_pid}" 2>/dev/null || true
-        break
-      fi
-    fi
-    sleep 2
-  done
-
-  exit_code=0
-  wait "${cmd_pid}" 2>/dev/null || exit_code=$?
-  ended_epoch="$(date +%s)"
-  ended_at="$(date -u +%Y-%m-%dT%H:%M:%SZ)"
-  duration_sec="$((ended_epoch - started_epoch))"
-
-  if [[ "${timed_out}" == "true" ]]; then
-    status="timeout"
-    printf "\n[runner] timed out after %ss\n" "${CASE_TIMEOUT_SEC}" >> "${log_file}"
-  elif (( exit_code != 0 )); then
-    status="failed"
-  elif [[ ! -s "${log_file}" ]]; then
-    status="failed"
-  elif ! rg -q "\[session: " "${log_file}"; then
-    status="failed"
-  fi
-
-  session_id="$(rg -o "\[session: [^]]+\]" "${log_file}" | tail -n 1 | sed -E 's/\[session: ([^]]+)\]/\1/' || true)"
-  session_dir="$(rg -o "\[session-dir: [^]]+\]" "${log_file}" | tail -n 1 | sed -E 's/\[session-dir: ([^]]+)\]/\1/' || true)"
-
-  printf "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" \
-    "${TIMESTAMP}" \
-    "${provider}" \
-    "${case_id}" \
-    "${status}" \
-    "${session_id}" \
-    "${session_dir}" \
-    "${log_file}" \
-    "${started_at}" \
-    "${ended_at}" \
-    "${duration_sec}" \
-    "${exit_code}" > "${result_file}"
-
-  printf "[worker] provider=%s case=%s status=%s duration=%ss session=%s\n" \
-    "${provider}" \
-    "${case_id}" \
-    "${status}" \
-    "${duration_sec}" \
-    "${session_id:-N/A}"
-  exit 0
-fi
-
-mkdir -p "${OUT_DIR}"
-mkdir -p "${RESULTS_DIR}"
-printf "timestamp\tprovider\tcase_id\tstatus\tsession_id\tsession_dir\tlog_file\tstarted_at\tended_at\tduration_sec\texit_code\n" > "${MANIFEST}"
-
-read -r -a PROVIDERS <<< "${PROVIDERS_RAW}"
-
-CASE_FILES=()
-while IFS= read -r line; do
-  CASE_FILES+=("${line}")
-done < <(find "${CASES_DIR}" -maxdepth 1 -type f -name "${CASE_GLOB}" | sort)
-
-if [[ ${#CASE_FILES[@]} -eq 0 ]]; then
-  echo "No case files matched ${CASE_GLOB} in ${CASES_DIR}" >&2
-  exit 1
-fi
-
-echo "Output directory: ${OUT_DIR}"
-echo "Using SMC_DATA_DIR=${SMC_DATA_DIR}"
-echo "Using MULTICA_API_URL=${MULTICA_API_URL}"
-echo "Providers: ${PROVIDERS[*]}"
-echo "Cases: ${#CASE_FILES[@]}"
-echo "Max parallel: ${MAX_PARALLEL}"
-if [[ "${TIMEOUT_ENABLED}" == "true" ]]; then
-  echo "Case timeout: ${CASE_TIMEOUT_SEC}s"
-else
-  echo "Case timeout: disabled"
-fi
-
-TASKS=()
-for provider in "${PROVIDERS[@]}"; do
-  for case_file in "${CASE_FILES[@]}"; do
-    TASKS+=("${provider}" "${case_file}")
-  done
-done
-
-echo "Total tasks: $(( ${#TASKS[@]} / 2 ))"
-
-export TIMESTAMP OUT_DIR RESULTS_DIR SMC_DATA_DIR MULTICA_API_URL CASE_TIMEOUT_SEC TIMEOUT_ENABLED
-printf '%s\0' "${TASKS[@]}" | xargs -0 -n 2 -P "${MAX_PARALLEL}" bash "${BASH_SOURCE[0]}" --worker
-
-RESULT_FILES=()
-while IFS= read -r line; do
-  RESULT_FILES+=("${line}")
-done < <(find "${RESULTS_DIR}" -maxdepth 1 -type f -name "*.tsv" | sort)
-
-if [[ ${#RESULT_FILES[@]} -eq 0 ]]; then
-  echo "No result files produced in ${RESULTS_DIR}" >&2
-  exit 1
-fi
-
-for result_file in "${RESULT_FILES[@]}"; do
-  cat "${result_file}" >> "${MANIFEST}"
-done
-
-success_count="$(awk -F '\t' 'NR>1 && $4=="success" {c++} END{print c+0}' "${MANIFEST}")"
-failed_count="$(awk -F '\t' 'NR>1 && $4=="failed" {c++} END{print c+0}' "${MANIFEST}")"
-timeout_count="$(awk -F '\t' 'NR>1 && $4=="timeout" {c++} END{print c+0}' "${MANIFEST}")"
-
-echo
-echo "Completed. Manifest: ${MANIFEST}"
-echo "Summary: success=${success_count} failed=${failed_count} timeout=${timeout_count}"
--- a/scripts/e2e-skills-benchmark/analyze.mjs
+++ b/scripts/e2e-skills-benchmark/analyze.mjs
@ -1,441 +0,0 @@
-#!/usr/bin/env node
-
-import { existsSync, readFileSync, writeFileSync } from "node:fs";
-import { dirname, join, resolve } from "node:path";
-
-/**
- * @typedef {{
- *   id: string;
- *   check: string;
- *   passed: boolean;
- *   detail?: string;
- * }} CheckResult
- */
-
-/**
- * @typedef {{
- *   provider: string;
- *   caseId: string;
- *   status: string;
- *   sessionId: string;
- *   sessionDir: string;
- *   logFile: string;
- *   checks: CheckResult[];
- *   pass: boolean;
- * }} CaseAnalysis
- */
-
-// The manifest path is the only CLI argument. Both a missing argument and an
-// explicit --help/-h print the usage line and exit with status 0.
-const manifestArg = process.argv[2];
-if (!manifestArg || manifestArg === "--help" || manifestArg === "-h") {
-  console.log("Usage: node scripts/e2e-skills-benchmark/analyze.mjs <manifest.tsv>");
-  process.exit(0);
-}
-
-// Resolve to an absolute path and fail fast if the manifest does not exist.
-const manifestPath = resolve(manifestArg);
-if (!existsSync(manifestPath)) {
-  console.error(`Manifest not found: ${manifestPath}`);
-  process.exit(1);
-}
-
-/**
- * Per-case expectations, keyed by the manifest's case_id column.
- *
- * Supported fields (all optional):
- * - requireExecUsage: set to false to make the "at least one exec command"
- *   check optional; it is required for any other value, including when unset.
- * - requiredCommandTokens: for each inner list, at least one exec command must
- *   contain every token (case-insensitive substring match).
- * - requiredEventTokens: for each inner list, the JSON serialization of at
- *   least one run-log event must contain every token (case-insensitive).
- * - forbiddenCommandTokens: for each inner list, no single exec command may
- *   contain every token.
- * - requiredResponseRegex: each pattern is compiled with the "i" flag and must
- *   match the final response text.
- *
- * A case_id without an entry here fails its "case-rules" check.
- */
-const CASE_RULES = {
-  "case-01-install-caldav-calendar": {
-    requiredCommandTokens: [
-      ["clawhub", "search"],
-      ["caldav"],
-      ["clawhub", "install"],
-      ["review-skill-security.mjs"],
-    ],
-  },
-  "case-02-gap-discovery-homeassistant": {
-    requiredCommandTokens: [
-      ["clawhub", "search"],
-      ["home", "assistant"],
-      ["clawhub", "install"],
-      ["review-skill-security.mjs"],
-    ],
-  },
-  "case-03-install-update-codexmonitor": {
-    requiredCommandTokens: [
-      ["clawhub", "search"],
-      ["codexmonitor"],
-      ["clawhub", "install"],
-      ["clawhub", "update"],
-      ["review-skill-security.mjs"],
-    ],
-  },
-  "case-04-gap-discovery-spotify-ux": {
-    requireExecUsage: false,
-    requiredResponseRegex: [
-      "缺少|没有.*(技能|能力|集成)|capability gap",
-      "clawhub|cloud\\s*hub|cloudhub",
-      "安装|install",
-      "是否|要不要|would you like|do you want",
-      "安全|审查|security|review",
-    ],
-    forbiddenCommandTokens: [
-      ["clawhub", "install"],
-      ["clawhub", "update"],
-      ["osascript"],
-      ["spogo"],
-      ["spotify_player"],
-      ["ha.sh"],
-      ["/api/states"],
-    ],
-  },
-  "case-05-gap-discovery-notion-ux": {
-    requireExecUsage: false,
-    requiredCommandTokens: [
-      ["clawhub", "search"],
-      ["notion"],
-    ],
-    requiredEventTokens: [
-      ["install_guard", "blocked"],
-    ],
-    requiredResponseRegex: [
-      "notion",
-      "安装|install",
-      "是否|要不要|would you like|do you want|同意",
-      "token|授权|integration",
-    ],
-    forbiddenCommandTokens: [
-      ["osascript"],
-      ["spogo"],
-      ["spotify_player"],
-      ["ha.sh"],
-      ["/api/states"],
-    ],
-  },
-};
-
-/**
- * Split text into lines on LF or CRLF and drop empty lines.
- *
- * Only zero-length strings are removed by the `filter(Boolean)` step, so
- * whitespace-only lines are kept.
- *
- * @param {string} text
- * @returns {string[]}
- */
-function splitLines(text) {
-  return text.split(/\r?\n/).filter(Boolean);
-}
-
-/**
- * Case-insensitively test whether a command string contains every token.
- *
- * Tokens are matched as plain substrings, in any order. An empty token list
- * matches every command.
- *
- * @param {string} command
- * @param {string[]} tokens
- * @returns {boolean}
- */
-function commandHasTokens(command, tokens) {
-  const lower = command.toLowerCase();
-  return tokens.every((token) => lower.includes(token.toLowerCase()));
-}
-
-/**
- * Pull the shell command out of a tool call's serialized arguments.
- *
- * When `rawArgs` parses as JSON and has a string `command` property, that
- * property is returned. In every other non-empty case (unparseable JSON, or
- * JSON without a string `command`) the raw string is returned unchanged so
- * callers can still match tokens against it.
- *
- * @param {string} rawArgs
- * @returns {string} The extracted command, or "" when `rawArgs` is empty.
- */
-function extractCommand(rawArgs) {
-  if (!rawArgs) return "";
-  try {
-    const parsed = JSON.parse(rawArgs);
-    if (parsed && typeof parsed.command === "string") {
-      return parsed.command;
-    }
-  } catch {
-    // Fall through: args may be truncated JSON in run-log.
-  }
-  return rawArgs;
-}
-
-/**
- * Test text against a regular-expression source string, case-insensitively.
- *
- * A pattern that fails to compile is reported as a non-match instead of
- * throwing.
- *
- * @param {string} text
- * @param {string} pattern
- * @returns {boolean}
- */
-function textMatchesPattern(text, pattern) {
-  try {
-    return new RegExp(pattern, "i").test(text);
-  } catch {
-    return false;
-  }
-}
-
-/**
- * Read a JSONL run log and parse each non-empty line as JSON.
- *
- * The file is read synchronously as UTF-8. Lines that are not valid JSON are
- * skipped, so the result may contain fewer entries than the file has lines.
- *
- * @param {string} runLogPath
- * @returns {any[]} Parsed entries in file order.
- */
-function parseRunLog(runLogPath) {
-  const lines = splitLines(readFileSync(runLogPath, "utf-8"));
-  const events = [];
-  for (const line of lines) {
-    try {
-      events.push(JSON.parse(line));
-    } catch {
-      // Ignore malformed lines but keep analysis alive.
-    }
-  }
-  return events;
-}
-
-/**
- * Return the text of the last assistant message recorded in a session JSONL
- * file.
- *
- * Only entries with `type === "message"` and `message.role === "assistant"`
- * are considered. String content always replaces the current result, even
- * when it is empty. Array content contributes the newline-joined, trimmed
- * text of its `type: "text"` parts and is ignored when that text is empty.
- * Lines that are not valid JSON are skipped.
- *
- * @param {string} sessionPath
- * @returns {string} The latest assistant text, or "" when the file is missing
- *   or no assistant text was found.
- */
-function parseFinalAssistantText(sessionPath) {
-  if (!existsSync(sessionPath)) return "";
-
-  const lines = splitLines(readFileSync(sessionPath, "utf-8"));
-  let latest = "";
-
-  for (const line of lines) {
-    try {
-      const entry = JSON.parse(line);
-      if (entry?.type !== "message") continue;
-      const msg = entry.message;
-      if (!msg || msg.role !== "assistant") continue;
-
-      if (typeof msg.content === "string") {
-        latest = msg.content;
-        continue;
-      }
-
-      if (Array.isArray(msg.content)) {
-        const text = msg.content
-          .filter((part) => part && part.type === "text" && typeof part.text === "string")
-          .map((part) => part.text)
-          .join("\n")
-          .trim();
-        if (text) latest = text;
-      }
-    } catch {
-      // Ignore malformed lines.
-    }
-  }
-
-  return latest;
-}
-
-/**
- * Append one check result to `analysis.checks`.
- *
- * The analysis object is mutated in place; nothing is returned.
- *
- * @param {CaseAnalysis} analysis - Case record that receives the check.
- * @param {string} id - Short machine-friendly identifier of the check.
- * @param {string} check - Human-readable description of what was checked.
- * @param {boolean} passed - Whether the check succeeded.
- * @param {string} [detail] - Optional extra context stored with the result.
- */
-function addCheck(analysis, id, check, passed, detail) {
-  analysis.checks.push({ id, check, passed, detail });
-}
-
-// Load the manifest. Row 0 is treated as the header, so a file with one
-// non-empty line or fewer has nothing to analyze.
-const rows = splitLines(readFileSync(manifestPath, "utf-8"));
-if (rows.length <= 1) {
-  console.error(`Manifest has no data rows: ${manifestPath}`);
-  process.exit(1);
-}
-
-/** @type {CaseAnalysis[]} */
-const analyses = [];
-
-// Analyze every data row (index 0 is the header). Each row yields one
-// CaseAnalysis whose `pass` flag is true only if all of its checks passed.
-for (let i = 1; i < rows.length; i++) {
-  const row = rows[i];
-  if (!row) continue;
-
-  // Rows with fewer than 11 tab-separated columns are skipped without a record.
-  const cols = row.split("\t");
-  if (cols.length < 11) continue;
-
-  // Columns used here: 1=provider, 2=case_id, 3=status, 4=session_id,
-  // 5=session_dir, 6=log_file.
-  const provider = cols[1] ?? "";
-  const caseId = cols[2] ?? "";
-  const rules = CASE_RULES[caseId];
-  const status = cols[3] ?? "";
-  const sessionId = cols[4] ?? "";
-  const sessionDir = cols[5] ?? "";
-  const logFile = cols[6] ?? "";
-
-  /** @type {CaseAnalysis} */
-  const analysis = {
-    provider,
-    caseId,
-    status,
-    sessionId,
-    sessionDir,
-    logFile,
-    checks: [],
-    pass: false,
-  };
-
-  addCheck(
-    analysis,
-    "run-status",
-    "runner status is success",
-    status === "success",
-    `status=${status}`,
-  );
-
-  // Without a session directory there is nothing further to inspect; the
-  // case is recorded with pass=false.
-  if (!sessionDir) {
-    addCheck(analysis, "session-dir", "session_dir exists in manifest", false, "missing session_dir");
-    analyses.push(analysis);
-    continue;
-  }
-
-  const runLogPath = join(sessionDir, "run-log.jsonl");
-  addCheck(
-    analysis,
-    "run-log-file",
-    "run-log.jsonl exists",
-    existsSync(runLogPath),
-    runLogPath,
-  );
-
-  // A missing run log also ends the analysis early with pass=false.
-  if (!existsSync(runLogPath)) {
-    analyses.push(analysis);
-    continue;
-  }
-
-  const events = parseRunLog(runLogPath);
-  const sessionPath = join(sessionDir, "session.jsonl");
-  const finalAssistantText = parseFinalAssistantText(sessionPath);
-  const runStarts = events.filter((e) => e.event === "run_start");
-  const runEnds = events.filter((e) => e.event === "run_end");
-  const toolStarts = events.filter((e) => e.event === "tool_start");
-  const toolEnds = events.filter((e) => e.event === "tool_end");
-  const errorToolEnds = toolEnds.filter((e) => e.is_error === true);
-
-  addCheck(analysis, "event-run-start", "has run_start", runStarts.length > 0, `count=${runStarts.length}`);
-  addCheck(analysis, "event-run-end", "has run_end", runEnds.length > 0, `count=${runEnds.length}`);
-  addCheck(
-    analysis,
-    "tool-pairing",
-    "tool_start count matches tool_end count",
-    toolStarts.length === toolEnds.length,
-    `start=${toolStarts.length} end=${toolEnds.length}`,
-  );
-
-  // Prefer the assistant text from session.jsonl; fall back to the text on
-  // the last run_end event when the session file yields nothing.
-  const finalRunEnd = runEnds.at(-1);
-  const runEndError = finalRunEnd?.error;
-  const finalRunText = typeof finalRunEnd?.text === "string" ? finalRunEnd.text : "";
-  const finalResponseText = finalAssistantText || finalRunText;
-  addCheck(
-    analysis,
-    "run-end-error",
-    "final run_end.error is null/empty",
-    runEndError === null || runEndError === undefined || runEndError === "",
-    `error=${String(runEndError)}`,
-  );
-
-  addCheck(
-    analysis,
-    "tool-errors",
-    "no tool_end has is_error=true",
-    errorToolEnds.length === 0,
-    `error_tool_calls=${errorToolEnds.length}`,
-  );
-
-  // Only exec tool_start events whose args are a string contribute a command;
-  // non-string args become "" and are dropped by filter(Boolean).
-  const execCommands = toolStarts
-    .filter((e) => e.tool === "exec")
-    .map((e) => extractCommand(typeof e.args === "string" ? e.args : ""))
-    .filter(Boolean);
-
-  // Exec usage is required unless the rules explicitly set it to false. This
-  // also makes it required for cases that have no rule entry.
-  const requireExecUsage = rules?.requireExecUsage !== false;
-  addCheck(
-    analysis,
-    "exec-usage",
-    requireExecUsage
-      ? "at least one exec command was used"
-      : "exec usage is optional for this case",
-    requireExecUsage ? execCommands.length > 0 : true,
-    requireExecUsage ? `exec_calls=${execCommands.length}` : `exec_calls=${execCommands.length} (optional)`,
-  );
-
-  if (rules) {
-    // Each token list must be satisfied by at least one exec command.
-    if (Array.isArray(rules.requiredCommandTokens)) {
-      for (let r = 0; r < rules.requiredCommandTokens.length; r++) {
-        const tokenList = rules.requiredCommandTokens[r];
-        const passed = execCommands.some((cmd) => commandHasTokens(cmd, tokenList));
-        addCheck(
-          analysis,
-          `cmd-${r + 1}`,
-          `exec command contains tokens: ${tokenList.join(" + ")}`,
-          passed,
-        );
-      }
-    }
-
-    // Events are matched against their lower-cased JSON serialization.
-    if (Array.isArray(rules.requiredEventTokens)) {
-      const eventLines = events.map((event) => JSON.stringify(event).toLowerCase());
-      for (let r = 0; r < rules.requiredEventTokens.length; r++) {
-        const tokenList = rules.requiredEventTokens[r];
-        const passed = eventLines.some((line) =>
-          tokenList.every((token) => line.includes(token.toLowerCase())),
-        );
-        addCheck(
-          analysis,
-          `event-${r + 1}`,
-          `event log contains tokens: ${tokenList.join(" + ")}`,
-          passed,
-        );
-      }
-    }
-
-    // A forbidden list fails when any single exec command contains all of its tokens.
-    if (Array.isArray(rules.forbiddenCommandTokens)) {
-      for (let r = 0; r < rules.forbiddenCommandTokens.length; r++) {
-        const tokenList = rules.forbiddenCommandTokens[r];
-        const passed = !execCommands.some((cmd) => commandHasTokens(cmd, tokenList));
-        addCheck(
-          analysis,
-          `forbid-cmd-${r + 1}`,
-          `exec command does not contain tokens: ${tokenList.join(" + ")}`,
-          passed,
-        );
-      }
-    }
-
-    // Every response pattern must match the final response text.
-    if (Array.isArray(rules.requiredResponseRegex)) {
-      for (let r = 0; r < rules.requiredResponseRegex.length; r++) {
-        const pattern = rules.requiredResponseRegex[r];
-        const passed = textMatchesPattern(finalResponseText, pattern);
-        addCheck(
-          analysis,
-          `resp-${r + 1}`,
-          `final response matches regex: /${pattern}/i`,
-          passed,
-        );
-      }
-    }
-  } else {
-    // An unknown case_id is reported as a failed check instead of being skipped.
-    addCheck(
-      analysis,
-      "case-rules",
-      "case has rule set",
-      false,
-      `No rules defined for case_id=${caseId}`,
-    );
-  }
-
-  analysis.pass = analysis.checks.every((c) => c.passed);
-  analyses.push(analysis);
-}
-
-// Aggregate results, write analysis.json next to the manifest, and print a
-// per-case report followed by a summary.
-const passedCases = analyses.filter((a) => a.pass).length;
-const failedCases = analyses.length - passedCases;
-
-const output = {
-  manifestPath,
-  totalCases: analyses.length,
-  passedCases,
-  failedCases,
-  results: analyses,
-};
-
-const outputPath = join(dirname(manifestPath), "analysis.json");
-writeFileSync(outputPath, JSON.stringify(output, null, 2) + "\n", "utf-8");
-
-for (const item of analyses) {
-  const status = item.pass ? "PASS" : "FAIL";
-  console.log(`[${status}] provider=${item.provider} case=${item.caseId} session=${item.sessionId || "N/A"}`);
-  for (const check of item.checks) {
-    const marker = check.passed ? "  [ok]  " : "  [bad] ";
-    const detail = check.detail ? ` (${check.detail})` : "";
-    console.log(`${marker}${check.check}${detail}`);
-  }
-}
-
-console.log("");
-console.log(`Analysis file: ${outputPath}`);
-console.log(`Summary: pass=${passedCases} fail=${failedCases}`);
-
-// Exit non-zero when any case failed so callers can detect the failure.
-if (failedCases > 0) {
-  process.exit(1);
-}
--- a/scripts/e2e-skills-benchmark/cases/case-01-install-caldav-calendar.txt
+++ b/scripts/e2e-skills-benchmark/cases/case-01-install-caldav-calendar.txt
@ -1,15 +0,0 @@
-Run an end-to-end test for the Meta Skill Installer.
-
-Goal: install a real ClawHub skill for CalDAV calendar capability.
-Reference page: https://clawhub.ai/skills/caldav-calendar
-
-Follow this exact workflow:
-1. State the missing capability in one sentence.
-2. Search ClawHub for CalDAV-related skills and choose the best candidate.
-3. Stage-install to a temporary directory first (never install directly to active skills path).
-4. Run security review on the staged skill:
-   DATA_DIR="${SMC_DATA_DIR:-$HOME/.super-multica}"
-   node "$DATA_DIR/skills/meta-skill-installer/scripts/review-skill-security.mjs" "<staged skill path>"
-5. If riskLevel is safe, install to "$DATA_DIR/skills".
-6. Verify final install by checking "$DATA_DIR/skills/<slug>/SKILL.md" exists.
-7. Return a short report: selected slug, riskLevel, final install path.
--- a/scripts/e2e-skills-benchmark/cases/case-02-gap-discovery-homeassistant.txt
+++ b/scripts/e2e-skills-benchmark/cases/case-02-gap-discovery-homeassistant.txt
@ -1,16 +0,0 @@
-Run an end-to-end capability-gap discovery test for Meta Skill Installer.
-
-User intent: "I need to control Home Assistant lights and switches from the agent."
-Reference page: https://clawhub.ai/skills/homeassistant
-
-Requirements:
-1. Treat this as a missing capability and explicitly define the gap.
-2. Search ClawHub for relevant skills and list the top 3 candidates.
-3. Pick one candidate with rationale (scope match + lower security risk).
-4. Stage-install to a temporary directory.
-5. Run security review:
-   DATA_DIR="${SMC_DATA_DIR:-$HOME/.super-multica}"
-   node "$DATA_DIR/skills/meta-skill-installer/scripts/review-skill-security.mjs" "<staged skill path>"
-6. If riskLevel is safe, install to "$DATA_DIR/skills".
-7. Verify "$DATA_DIR/skills/<slug>/SKILL.md" exists.
-8. Return: candidate list, chosen slug, riskLevel, and final path.
--- a/scripts/e2e-skills-benchmark/cases/case-03-install-update-codexmonitor.txt
+++ b/scripts/e2e-skills-benchmark/cases/case-03-install-update-codexmonitor.txt
@ -1,16 +0,0 @@
-Run an end-to-end install+update regression test for Meta Skill Installer.
-
-Goal: use a real ClawHub skill and verify install, review, and update flow.
-Reference page: https://clawhub.ai/odrobnik/codexmonitor
-
-Requirements:
-1. Search ClawHub for CodexMonitor and select the matching skill slug.
-2. Stage-install to a temporary directory and run security review:
-   DATA_DIR="${SMC_DATA_DIR:-$HOME/.super-multica}"
-   node "$DATA_DIR/skills/meta-skill-installer/scripts/review-skill-security.mjs" "<staged skill path>"
-3. If riskLevel is safe, install to "$DATA_DIR/skills".
-4. Verify "$DATA_DIR/skills/<slug>/SKILL.md" exists.
-5. Run an update for the same slug in managed dir:
-   clawhub update "<slug>" --workdir "$DATA_DIR" --dir skills --force
-6. Run security review again on the final installed path.
-7. Return: slug, initial riskLevel, update executed (yes/no), final path.
--- a/scripts/e2e-skills-benchmark/cases/case-04-gap-discovery-spotify-ux.txt
+++ b/scripts/e2e-skills-benchmark/cases/case-04-gap-discovery-spotify-ux.txt
@ -1 +0,0 @@
-随机播放 spotify 中的音乐
--- a/scripts/e2e-skills-benchmark/cases/case-05-gap-discovery-notion-ux.txt
+++ b/scripts/e2e-skills-benchmark/cases/case-05-gap-discovery-notion-ux.txt
@ -1 +0,0 @@
-帮我在 Notion 新建一个页面，标题是今天待办，并写入三条任务：修复登录 bug、写周报、安排评审
--- a/scripts/e2e-skills-benchmark/run.sh
+++ b/scripts/e2e-skills-benchmark/run.sh
@ -1,170 +0,0 @@
-#!/usr/bin/env bash
-set -euo pipefail
-
-# Paths are derived from this script's location. TIMESTAMP and OUT_DIR may be
-# overridden through the environment; otherwise each run gets a fresh
-# timestamped directory under .context/skills-e2e-runs.
-SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
-ROOT_DIR="$(cd "${SCRIPT_DIR}/../.." && pwd)"
-CASES_DIR="${SCRIPT_DIR}/cases"
-TIMESTAMP="${TIMESTAMP:-$(date +%Y%m%d-%H%M%S)}"
-OUT_DIR="${OUT_DIR:-${ROOT_DIR}/.context/skills-e2e-runs/${TIMESTAMP}}"
-RESULTS_DIR="${OUT_DIR}/results"
-MANIFEST="${OUT_DIR}/manifest.tsv"
-
-# Required environment for agent-driven E2E.
-# Every value below can be overridden through the environment. PROVIDERS is a
-# whitespace-separated list and CASE_GLOB selects case files inside CASES_DIR.
-SMC_DATA_DIR="${SMC_DATA_DIR:-$HOME/.super-multica-e2e}"
-MULTICA_API_URL="${MULTICA_API_URL:-https://api-dev.copilothub.ai}"
-PROVIDERS_RAW="${PROVIDERS:-kimi-coding}"
-CASE_GLOB="${CASE_GLOB:-case-*.txt}"
-CASE_TIMEOUT_SEC="${CASE_TIMEOUT_SEC:-1200}"
-MAX_PARALLEL="${MAX_PARALLEL:-1}"
-# The timeout is disabled only when CASE_TIMEOUT_SEC is an all-digit value equal to 0.
-# NOTE(review): non-numeric or negative values leave the timeout enabled and are not validated here.
-TIMEOUT_ENABLED="true"
-if [[ "${CASE_TIMEOUT_SEC}" =~ ^[0-9]+$ ]] && (( CASE_TIMEOUT_SEC <= 0 )); then
-  TIMEOUT_ENABLED="false"
-fi
-
-# MAX_PARALLEL is passed to xargs -P and must be a positive integer.
-if ! [[ "${MAX_PARALLEL}" =~ ^[0-9]+$ ]] || (( MAX_PARALLEL <= 0 )); then
-  echo "MAX_PARALLEL must be a positive integer, got: ${MAX_PARALLEL}" >&2
-  exit 1
-fi
-
-# Worker mode: run one provider/case pair and write a single-row TSV result.
-# The orchestrator part of this script re-invokes it through xargs as:
-#   run.sh --worker <provider> <case_file>
-if [[ "${1:-}" == "--worker" ]]; then
-  provider="${2:?missing provider}"
-  case_file="${3:?missing case file}"
-  case_base="$(basename "${case_file}")"
-  case_id="${case_base%.txt}"
-  log_file="${OUT_DIR}/${provider}-${case_id}.log"
-  result_file="${RESULTS_DIR}/${provider}-${case_id}.tsv"
-
-  # The whole case file is passed to the agent as the prompt.
-  prompt="$(cat "${case_file}")"
-
-  status="success"
-  timed_out="false"
-  started_epoch="$(date +%s)"
-  started_at="$(date -u +%Y-%m-%dT%H:%M:%SZ)"
-
-  # Run the agent in the background so the loop below can enforce the timeout.
-  SMC_DATA_DIR="${SMC_DATA_DIR}" \
-    MULTICA_API_URL="${MULTICA_API_URL}" \
-    pnpm multica run --run-log --provider "${provider}" "${prompt}" > "${log_file}" 2>&1 &
-  cmd_pid=$!
-
-  # Poll every 2 seconds while the process is alive. On timeout send the
-  # default kill signal, wait 1 second, then force-kill with -9.
-  while kill -0 "${cmd_pid}" 2>/dev/null; do
-    if [[ "${TIMEOUT_ENABLED}" == "true" ]]; then
-      now="$(date +%s)"
-      elapsed="$((now - started_epoch))"
-      if (( elapsed >= CASE_TIMEOUT_SEC )); then
-        timed_out="true"
-        kill "${cmd_pid}" 2>/dev/null || true
-        sleep 1
-        kill -9 "${cmd_pid}" 2>/dev/null || true
-        break
-      fi
-    fi
-    sleep 2
-  done
-
-  # Collect the exit code; the `||` keeps a non-zero status from tripping `set -e`.
-  exit_code=0
-  wait "${cmd_pid}" 2>/dev/null || exit_code=$?
-  ended_epoch="$(date +%s)"
-  ended_at="$(date -u +%Y-%m-%dT%H:%M:%SZ)"
-  duration_sec="$((ended_epoch - started_epoch))"
-
-  # Status precedence: timeout, non-zero exit, empty log, missing "[session: " marker.
-  if [[ "${timed_out}" == "true" ]]; then
-    status="timeout"
-    printf "\n[runner] timed out after %ss\n" "${CASE_TIMEOUT_SEC}" >> "${log_file}"
-  elif (( exit_code != 0 )); then
-    status="failed"
-  elif [[ ! -s "${log_file}" ]]; then
-    status="failed"
-  elif ! rg -q "\[session: " "${log_file}"; then
-    status="failed"
-  fi
-
-  # Use the last [session: ...] and [session-dir: ...] markers in the log;
-  # both stay empty when no marker is present.
-  session_id="$(rg -o "\[session: [^]]+\]" "${log_file}" | tail -n 1 | sed -E 's/\[session: ([^]]+)\]/\1/' || true)"
-  session_dir="$(rg -o "\[session-dir: [^]]+\]" "${log_file}" | tail -n 1 | sed -E 's/\[session-dir: ([^]]+)\]/\1/' || true)"
-
-  # Column order must match the manifest header written by the orchestrator.
-  printf "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" \
-    "${TIMESTAMP}" \
-    "${provider}" \
-    "${case_id}" \
-    "${status}" \
-    "${session_id}" \
-    "${session_dir}" \
-    "${log_file}" \
-    "${started_at}" \
-    "${ended_at}" \
-    "${duration_sec}" \
-    "${exit_code}" > "${result_file}"
-
-  printf "[worker] provider=%s case=%s status=%s duration=%ss session=%s\n" \
-    "${provider}" \
-    "${case_id}" \
-    "${status}" \
-    "${duration_sec}" \
-    "${session_id:-N/A}"
-  # The worker exits 0 even for failed or timed-out cases; presumably so a
-  # single failure does not make xargs abort the whole run (TODO confirm).
-  exit 0
-fi
-
-# Orchestrator mode: create the output directories and start the manifest
-# with its header row.
-mkdir -p "${OUT_DIR}"
-mkdir -p "${RESULTS_DIR}"
-printf "timestamp\tprovider\tcase_id\tstatus\tsession_id\tsession_dir\tlog_file\tstarted_at\tended_at\tduration_sec\texit_code\n" > "${MANIFEST}"
-
-# Split the whitespace-separated provider list into an array.
-read -r -a PROVIDERS <<< "${PROVIDERS_RAW}"
-
-# Collect matching case files (non-recursive) in sorted order.
-CASE_FILES=()
-while IFS= read -r line; do
-  CASE_FILES+=("${line}")
-done < <(find "${CASES_DIR}" -maxdepth 1 -type f -name "${CASE_GLOB}" | sort)
-
-if [[ ${#CASE_FILES[@]} -eq 0 ]]; then
-  echo "No case files matched ${CASE_GLOB} in ${CASES_DIR}" >&2
-  exit 1
-fi
-
-echo "Output directory: ${OUT_DIR}"
-echo "Using SMC_DATA_DIR=${SMC_DATA_DIR}"
-echo "Using MULTICA_API_URL=${MULTICA_API_URL}"
-echo "Providers: ${PROVIDERS[*]}"
-echo "Cases: ${#CASE_FILES[@]}"
-echo "Max parallel: ${MAX_PARALLEL}"
-if [[ "${TIMEOUT_ENABLED}" == "true" ]]; then
-  echo "Case timeout: ${CASE_TIMEOUT_SEC}s"
-else
-  echo "Case timeout: disabled"
-fi
-
-# Build a flat list of provider/case pairs; xargs -n 2 regroups them below.
-TASKS=()
-for provider in "${PROVIDERS[@]}"; do
-  for case_file in "${CASE_FILES[@]}"; do
-    TASKS+=("${provider}" "${case_file}")
-  done
-done
-
-echo "Total tasks: $(( ${#TASKS[@]} / 2 ))"
-
-# Export the settings so the worker processes inherit them, then run this
-# script in --worker mode once per pair with up to MAX_PARALLEL in parallel.
-export TIMESTAMP OUT_DIR RESULTS_DIR SMC_DATA_DIR MULTICA_API_URL CASE_TIMEOUT_SEC TIMEOUT_ENABLED
-printf '%s\0' "${TASKS[@]}" | xargs -0 -n 2 -P "${MAX_PARALLEL}" bash "${BASH_SOURCE[0]}" --worker
-
-# Gather the per-task result rows in sorted filename order.
-RESULT_FILES=()
-while IFS= read -r line; do
-  RESULT_FILES+=("${line}")
-done < <(find "${RESULTS_DIR}" -maxdepth 1 -type f -name "*.tsv" | sort)
-
-if [[ ${#RESULT_FILES[@]} -eq 0 ]]; then
-  echo "No result files produced in ${RESULTS_DIR}" >&2
-  exit 1
-fi
-
-for result_file in "${RESULT_FILES[@]}"; do
-  cat "${result_file}" >> "${MANIFEST}"
-done
-
-# Count rows per status, skipping the header (NR>1); column 4 is the status.
-success_count="$(awk -F '\t' 'NR>1 && $4=="success" {c++} END{print c+0}' "${MANIFEST}")"
-failed_count="$(awk -F '\t' 'NR>1 && $4=="failed" {c++} END{print c+0}' "${MANIFEST}")"
-timeout_count="$(awk -F '\t' 'NR>1 && $4=="timeout" {c++} END{print c+0}' "${MANIFEST}")"
-
-echo
-echo "Completed run stage. Manifest: ${MANIFEST}"
-echo "Run summary: success=${success_count} failed=${failed_count} timeout=${timeout_count}"
-
-echo
-echo "Running structured analysis..."
-# With pipefail set at the top of the script, a non-zero exit from analyze.mjs
-# makes this pipeline, and therefore the script, fail.
-node "${SCRIPT_DIR}/analyze.mjs" "${MANIFEST}" | tee "${OUT_DIR}/analysis.txt"
--- a/scripts/generate-code-stats-report.sh
+++ b/scripts/generate-code-stats-report.sh
@ -1,499 +0,0 @@
-#!/usr/bin/env bash
-
-set -euo pipefail
-
-SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
-ROOT_DIR="$(cd "$SCRIPT_DIR/.." && pwd)"
-OUT_FILE="${1:-$ROOT_DIR/docs/code-stats-report.html}"
-TMP_DIR="$(mktemp -d)"
-
-# Remove the temporary working directory. Registered with the EXIT trap below
-# so it runs whenever the script exits.
-cleanup() {
-  rm -rf "$TMP_DIR"
-}
-trap cleanup EXIT
-
-cd "$ROOT_DIR"
-
-if ! git rev-parse --is-inside-work-tree >/dev/null 2>&1; then
-  echo "Error: this script must run inside a git repository."
-  exit 1
-fi
-
-# 1) Snapshot LOC from tracked files.
-while IFS= read -r -d '' file; do
-  if [ -f "$file" ]; then
-    wc -l "$file"
-  fi
-done < <(git ls-files -z) > "$TMP_DIR/wc_all.txt"
-
-awk -v out_by_ext="$TMP_DIR/loc_by_ext.tsv" -v out_totals="$TMP_DIR/loc_totals.tsv" '
-{
-  lines = $1
-  $1 = ""
-  sub(/^ +/, "")
-  file = $0
-
-  n = split(file, parts, "/")
-  base = parts[n]
-  ext = base
-
-  if (index(base, ".") > 0) {
-    sub(/.*\./, "", ext)
-  } else {
-    ext = "[noext]"
-  }
-
-  ext_lines[ext] += lines
-  ext_files[ext] += 1
-  files += 1
-  lines_all += lines
-}
-END {
-  for (e in ext_lines) {
-    printf "%s\t%d\t%d\n", e, ext_files[e], ext_lines[e] > out_by_ext
-  }
-
-  source_lines = 0
-  source_files = 0
-  doc_lines = 0
-  doc_files = 0
-  cfg_lines = 0
-  cfg_files = 0
-
-  for (e in ext_lines) {
-    if (e ~ /^(ts|tsx|js|jsx|mjs|cjs|py|css|scss|html|sh)$/) {
-      source_lines += ext_lines[e]
-      source_files += ext_files[e]
-    }
-    if (e == "md") {
-      doc_lines += ext_lines[e]
-      doc_files += ext_files[e]
-    }
-    if (e ~ /^(json|json5|yaml|yml|xsd)$/) {
-      cfg_lines += ext_lines[e]
-      cfg_files += ext_files[e]
-    }
-  }
-
-  printf "files\t%d\nlines\t%d\nsource_files\t%d\nsource_lines\t%d\ndoc_files\t%d\ndoc_lines\t%d\nconfig_files\t%d\nconfig_lines\t%d\n", files, lines_all, source_files, source_lines, doc_files, doc_lines, cfg_files, cfg_lines > out_totals
-}
-' "$TMP_DIR/wc_all.txt"
-
-# 2) Contribution by author (email-normalized).
-git log --all --no-merges --numstat --format='@@@%aN|%aE' | awk -v out="$TMP_DIR/author_by_email.tsv" '
-BEGIN { FS = "\t" }
-/^@@@/ {
-  split(substr($0, 4), h, /\|/)
-  name = h[1]
-  email = h[2]
-  id = email
-
-  if (!(id in display)) {
-    display[id] = name " <" email ">"
-  }
-
-  commits[id] += 1
-  next
-}
-NF == 3 && $1 ~ /^[0-9]+$/ && $2 ~ /^[0-9]+$/ {
-  adds[id] += $1
-  dels[id] += $2
-}
-END {
-  for (k in commits) {
-    printf "%s\t%d\t%d\t%d\t%d\n", display[k], commits[k], adds[k] + 0, dels[k] + 0, (adds[k] - dels[k]) + 0 > out
-  }
-}
-'
-
-sort -t $'\t' -k3,3nr "$TMP_DIR/author_by_email.tsv" > "$TMP_DIR/author_by_email.sorted.tsv"
-
-awk -F '\t' -v out="$TMP_DIR/author_human_share.tsv" '
-$1 !~ /checkpointer@noreply|dependabot\[bot\]/ {
-  total_commits += $2
-  total_adds += $3
-  rows[++n] = $0
-}
-END {
-  for (i = 1; i <= n; i++) {
-    split(rows[i], f, "\t")
-    add_pct = (total_adds > 0) ? (f[3] / total_adds * 100) : 0
-    commit_pct = (total_commits > 0) ? (f[2] / total_commits * 100) : 0
-    printf "%s\t%d\t%d\t%d\t%d\t%.2f%%\t%.2f%%\n", f[1], f[2], f[3], f[4], f[5], add_pct, commit_pct > out
-  }
-}
-' "$TMP_DIR/author_by_email.sorted.tsv"
-
-# 3) Contribution by author/day/hour.
-git log --all --no-merges --numstat --date=format:'%Y-%m-%d|%H' --format='@@@%aE|%ad' | awk -v out="$TMP_DIR/author_day_hour_summary.tsv" '
-BEGIN { FS = "\t" }
-/^@@@/ {
-  split(substr($0, 4), h, /\|/)
-  email = h[1]
-  day = h[2]
-  hour = h[3]
-
-  key = email "\t" day "\t" hour
-  commits[key] += 1
-  next
-}
-NF == 3 && $1 ~ /^[0-9]+$/ && $2 ~ /^[0-9]+$/ {
-  adds[key] += $1
-  dels[key] += $2
-}
-END {
-  for (k in commits) {
-    split(k, f, "\t")
-    a = adds[k] + 0
-    d = dels[k] + 0
-    printf "%s\t%s\t%s\t%d\t%d\t%d\t%d\n", f[1], f[2], f[3], commits[k], a, d, (a - d) > out
-  }
-}
-'
-
-awk -F '\t' -v out="$TMP_DIR/day_summary_human.tsv" '
-$1 !~ /checkpointer@noreply|dependabot\[bot\]/ {
-  day = $2
-  commits[day] += $4
-  adds[day] += $5
-  dels[day] += $6
-
-  if (!(day in min_hour) || $3 < min_hour[day]) {
-    min_hour[day] = $3
-  }
-
-  if (!(day in max_hour) || $3 > max_hour[day]) {
-    max_hour[day] = $3
-  }
-}
-END {
-  for (d in commits) {
-    printf "%s\t%d\t%d\t%d\t%d\t%s\t%s\n", d, commits[d], adds[d], dels[d], adds[d] - dels[d], min_hour[d], max_hour[d] > out
-  }
-}
-' "$TMP_DIR/author_day_hour_summary.tsv"
-
-sort -t $'\t' -k1,1 "$TMP_DIR/day_summary_human.tsv" -o "$TMP_DIR/day_summary_human.tsv"
-
-awk -F '\t' -v out="$TMP_DIR/hour_summary_human.tsv" '
-$1 !~ /checkpointer@noreply|dependabot\[bot\]/ {
-  hour = $3
-  commits[hour] += $4
-  adds[hour] += $5
-  dels[hour] += $6
-}
-END {
-  for (i = 0; i < 24; i++) {
-    h = sprintf("%02d", i)
-    a = adds[h] + 0
-    d = dels[h] + 0
-    printf "%s\t%d\t%d\t%d\t%d\n", h, commits[h] + 0, a, d, a - d > out
-  }
-}
-' "$TMP_DIR/author_day_hour_summary.tsv"
-
-sort -t $'\t' -k1,1 "$TMP_DIR/hour_summary_human.tsv" -o "$TMP_DIR/hour_summary_human.tsv"
-
-awk -F '\t' -v out="$TMP_DIR/day_peak_hour_human.tsv" '
-$1 !~ /checkpointer@noreply|dependabot\[bot\]/ {
-  key = $2 "\t" $3
-  commits[key] += $4
-  adds[key] += $5
-  dels[key] += $6
-}
-END {
-  for (k in adds) {
-    split(k, parts, "\t")
-    day = parts[1]
-    hour = parts[2]
-
-    if (!(day in max_adds) || adds[k] > max_adds[day]) {
-      max_adds[day] = adds[k]
-      best_hour[day] = hour
-      best_commits[day] = commits[k]
-      best_dels[day] = dels[k]
-    }
-  }
-
-  for (d in max_adds) {
-    printf "%s\t%s\t%d\t%d\t%d\n", d, best_hour[d], best_commits[d], max_adds[d], best_dels[d] > out
-  }
-}
-' "$TMP_DIR/author_day_hour_summary.tsv"
-
-sort -t $'\t' -k1,1 "$TMP_DIR/day_peak_hour_human.tsv" -o "$TMP_DIR/day_peak_hour_human.tsv"
-
-mkdir -p "$(dirname "$OUT_FILE")"
-
-# 4) Render standalone HTML.
-{
-cat <<'HTML_HEAD'
-<!doctype html>
-<html lang="zh-CN">
-<head>
-  <meta charset="UTF-8" />
-  <meta name="viewport" content="width=device-width, initial-scale=1.0" />
-  <title>Super Multica 代码贡献统计</title>
-  <style>
-    :root {
-      --bg: #0b0d10;
-      --panel: #14181d;
-      --panel-2: #1a2027;
-      --line: #2a3440;
-      --text: #e8edf3;
-      --muted: #98a7b7;
-      --ok: #2fbf71;
-      --danger: #ef4444;
-    }
-    * { box-sizing: border-box; }
-    body {
-      margin: 0;
-      font-family: ui-sans-serif, -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, "Helvetica Neue", Arial;
-      background: radial-gradient(circle at 20% -10%, #1a2430 0%, #0b0d10 45%) fixed;
-      color: var(--text);
-      line-height: 1.4;
-    }
-    .wrap { max-width: 1200px; margin: 0 auto; padding: 24px; }
-    h1 { margin: 0 0 8px; font-size: 28px; }
-    .sub { color: var(--muted); margin-bottom: 20px; }
-    .grid {
-      display: grid;
-      grid-template-columns: repeat(auto-fit, minmax(190px, 1fr));
-      gap: 12px;
-      margin-bottom: 18px;
-    }
-    .card {
-      background: linear-gradient(180deg, var(--panel) 0%, var(--panel-2) 100%);
-      border: 1px solid var(--line);
-      border-radius: 10px;
-      padding: 12px;
-    }
-    .k { color: var(--muted); font-size: 12px; margin-bottom: 8px; }
-    .v { font-size: 24px; font-weight: 700; letter-spacing: 0.3px; }
-    .section { margin-top: 14px; }
-    .section h2 { margin: 0 0 10px; font-size: 16px; color: #d4dde7; }
-    .panel {
-      background: var(--panel);
-      border: 1px solid var(--line);
-      border-radius: 10px;
-      overflow: hidden;
-    }
-    table { width: 100%; border-collapse: collapse; }
-    th, td { padding: 9px 10px; border-bottom: 1px solid var(--line); font-size: 13px; }
-    th { background: #11161c; text-align: left; color: #c5d0db; position: sticky; top: 0; }
-    tr:last-child td { border-bottom: 0; }
-    .num { text-align: right; font-variant-numeric: tabular-nums; }
-    .mono { font-family: ui-monospace, SFMono-Regular, Menlo, Consolas, monospace; }
-    .bar-wrap { background: #0f1318; border-radius: 999px; height: 8px; width: 180px; border: 1px solid #273241; }
-    .bar { height: 100%; border-radius: 999px; background: linear-gradient(90deg, #3f7ef7, #58a6ff); }
-    .ok { color: var(--ok); }
-    .danger { color: var(--danger); }
-    .foot { margin-top: 16px; color: var(--muted); font-size: 12px; }
-    .scroll { max-height: 420px; overflow: auto; }
-  </style>
-</head>
-<body>
-  <div class="wrap">
-    <h1>Super Multica 代码贡献统计</h1>
-    <div class="sub" id="subtitle"></div>
-
-    <div class="grid" id="summary"></div>
-
-    <div class="section">
-      <h2>代码量分布（按扩展名）</h2>
-      <div class="panel scroll"><table id="extTable"></table></div>
-    </div>
-
-    <div class="section">
-      <h2>人员贡献（人工口径）</h2>
-      <div class="panel scroll"><table id="authorTable"></table></div>
-    </div>
-
-    <div class="section">
-      <h2>每日贡献（人工口径）</h2>
-      <div class="panel scroll"><table id="dayTable"></table></div>
-    </div>
-
-    <div class="section">
-      <h2>小时段贡献（人工口径）</h2>
-      <div class="panel scroll"><table id="hourTable"></table></div>
-    </div>
-
-    <div class="foot">数据来源：git log --numstat 与当前工作树文件统计。人工口径排除 checkpointer / dependabot。</div>
-  </div>
-
-  <script>
-    const RAW = {
-      locTotals: String.raw`
-HTML_HEAD
-cat "$TMP_DIR/loc_totals.tsv"
-cat <<'MID1'
-`,
-      locByExt: String.raw`
-MID1
-cat "$TMP_DIR/loc_by_ext.tsv"
-cat <<'MID2'
-`,
-      authorHuman: String.raw`
-MID2
-cat "$TMP_DIR/author_human_share.tsv"
-cat <<'MID3'
-`,
-      dayHuman: String.raw`
-MID3
-cat "$TMP_DIR/day_summary_human.tsv"
-cat <<'MID4'
-`,
-      hourHuman: String.raw`
-MID4
-cat "$TMP_DIR/hour_summary_human.tsv"
-cat <<'MID5'
-`,
-      dayPeak: String.raw`
-MID5
-cat "$TMP_DIR/day_peak_hour_human.tsv"
-cat <<'HTML_TAIL'
-`
-    };
-
-    const fmt = (n) => Number(n).toLocaleString("en-US");
-    const tsv = (txt) => txt.trim().split(/\n+/).map((line) => line.split("\t"));
-    const toNum = (v) => Number(v || 0);
-
-    const locTotalsRows = tsv(RAW.locTotals);
-    const locTotals = Object.fromEntries(locTotalsRows.map(([k, v]) => [k, toNum(v)]));
-
-    const extRows = tsv(RAW.locByExt).map(([ext, files, lines]) => ({
-      ext,
-      files: toNum(files),
-      lines: toNum(lines),
-    })).sort((a, b) => b.lines - a.lines);
-
-    const authors = tsv(RAW.authorHuman).map(([name, commits, add, del, net, addPct, commitPct]) => ({
-      name,
-      commits: toNum(commits),
-      add: toNum(add),
-      del: toNum(del),
-      net: toNum(net),
-      addPct,
-      commitPct,
-    })).sort((a, b) => b.add - a.add);
-
-    const dayPeaks = Object.fromEntries(tsv(RAW.dayPeak).map(([d, h, c, a, del]) => [d, {
-      hour: h,
-      commits: toNum(c),
-      add: toNum(a),
-      del: toNum(del),
-    }]));
-
-    const days = tsv(RAW.dayHuman).map(([date, commits, add, del, net, startHour, endHour]) => ({
-      date,
-      commits: toNum(commits),
-      add: toNum(add),
-      del: toNum(del),
-      net: toNum(net),
-      startHour,
-      endHour,
-      peak: dayPeaks[date] || null,
-    })).sort((a, b) => a.date.localeCompare(b.date));
-
-    const hours = tsv(RAW.hourHuman).map(([hour, commits, add, del, net]) => ({
-      hour,
-      commits: toNum(commits),
-      add: toNum(add),
-      del: toNum(del),
-      net: toNum(net),
-    })).sort((a, b) => a.hour.localeCompare(b.hour));
-
-    const totalHumanCommits = authors.reduce((sum, x) => sum + x.commits, 0);
-    const totalHumanAdd = authors.reduce((sum, x) => sum + x.add, 0);
-    const totalHumanDel = authors.reduce((sum, x) => sum + x.del, 0);
-    const topHour = [...hours].sort((a, b) => b.add - a.add)[0] || { hour: "--", add: 0 };
-    const startDate = days[0]?.date || "--";
-    const endDate = days[days.length - 1]?.date || "--";
-
-    document.getElementById("subtitle").textContent = `${startDate} ~ ${endDate}`;
-
-    const summaryItems = [
-      ["总文件数", fmt(locTotals.files || 0)],
-      ["总行数", fmt(locTotals.lines || 0)],
-      ["源码行数", fmt(locTotals.source_lines || 0)],
-      ["贡献人数", fmt(authors.length)],
-      ["人工提交数", fmt(totalHumanCommits)],
-      ["人工新增", fmt(totalHumanAdd)],
-      ["人工删除", fmt(totalHumanDel)],
-      ["最高产小时", `${topHour.hour}:00 (${fmt(topHour.add)})`],
-    ];
-
-    document.getElementById("summary").innerHTML = summaryItems.map(([k, v]) => (
-      `<div class="card"><div class="k">${k}</div><div class="v">${v}</div></div>`
-    )).join("");
-
-    const maxExtLines = Math.max(...extRows.map((x) => x.lines), 1);
-    document.getElementById("extTable").innerHTML = `
-      <thead><tr><th>扩展名</th><th class="num">文件数</th><th class="num">行数</th><th>占比</th><th>可视化</th></tr></thead>
-      <tbody>
-        ${extRows.map((r) => {
-          const pct = ((r.lines / (locTotals.lines || 1)) * 100).toFixed(2);
-          const w = ((r.lines / maxExtLines) * 100).toFixed(1);
-          return `<tr>
-            <td class="mono">${r.ext}</td>
-            <td class="num">${fmt(r.files)}</td>
-            <td class="num">${fmt(r.lines)}</td>
-            <td class="num">${pct}%</td>
-            <td><div class="bar-wrap"><div class="bar" style="width:${w}%"></div></div></td>
-          </tr>`;
-        }).join("")}
-      </tbody>`;
-
-    document.getElementById("authorTable").innerHTML = `
-      <thead><tr><th>作者</th><th class="num">提交</th><th class="num">新增</th><th class="num">删除</th><th class="num">净新增</th><th class="num">新增占比</th><th class="num">提交占比</th></tr></thead>
-      <tbody>
-        ${authors.map((a) => `<tr>
-          <td>${a.name}</td>
-          <td class="num">${fmt(a.commits)}</td>
-          <td class="num">${fmt(a.add)}</td>
-          <td class="num">${fmt(a.del)}</td>
-          <td class="num ${a.net >= 0 ? "ok" : "danger"}">${fmt(a.net)}</td>
-          <td class="num">${a.addPct}</td>
-          <td class="num">${a.commitPct}</td>
-        </tr>`).join("")}
-      </tbody>`;
-
-    document.getElementById("dayTable").innerHTML = `
-      <thead><tr><th>日期</th><th class="num">提交</th><th class="num">新增</th><th class="num">删除</th><th class="num">净新增</th><th>活跃时段</th><th>峰值小时</th></tr></thead>
-      <tbody>
-        ${days.map((d) => `<tr>
-          <td class="mono">${d.date}</td>
-          <td class="num">${fmt(d.commits)}</td>
-          <td class="num">${fmt(d.add)}</td>
-          <td class="num">${fmt(d.del)}</td>
-          <td class="num ${d.net >= 0 ? "ok" : "danger"}">${fmt(d.net)}</td>
-          <td class="mono">${d.startHour}:00 - ${d.endHour}:59</td>
-          <td class="mono">${d.peak ? `${d.peak.hour}:00 (${fmt(d.peak.add)})` : "--"}</td>
-        </tr>`).join("")}
-      </tbody>`;
-
-    const maxHourAdd = Math.max(...hours.map((h) => h.add), 1);
-    document.getElementById("hourTable").innerHTML = `
-      <thead><tr><th>小时</th><th class="num">提交</th><th class="num">新增</th><th class="num">删除</th><th class="num">净新增</th><th>可视化</th></tr></thead>
-      <tbody>
-        ${hours.map((h) => {
-          const w = ((h.add / maxHourAdd) * 100).toFixed(1);
-          return `<tr>
-            <td class="mono">${h.hour}:00</td>
-            <td class="num">${fmt(h.commits)}</td>
-            <td class="num">${fmt(h.add)}</td>
-            <td class="num">${fmt(h.del)}</td>
-            <td class="num ${h.net >= 0 ? "ok" : "danger"}">${fmt(h.net)}</td>
-            <td><div class="bar-wrap"><div class="bar" style="width:${w}%"></div></div></td>
-          </tr>`;
-        }).join("")}
-      </tbody>`;
-  </script>
-</body>
-</html>
-HTML_TAIL
-} > "$OUT_FILE"
-
-echo "Report generated: $OUT_FILE"
--- a/scripts/reset-user-data.sh
+++ b/scripts/reset-user-data.sh
@ -1,53 +0,0 @@
-#!/bin/bash
-# Reset all user data for super-multica desktop app
-# Use this to simulate a fresh install for testing
-
-set -e
-
-echo "🧹 Resetting Super Multica user data..."
-
-# Main data directory
-MULTICA_DATA_DIR="$HOME/.super-multica"
-if [ -d "$MULTICA_DATA_DIR" ]; then
-  echo "  Removing $MULTICA_DATA_DIR"
-  rm -rf "$MULTICA_DATA_DIR"
-else
-  echo "  $MULTICA_DATA_DIR does not exist, skipping"
-fi
-
-# Dev data directory (used by pnpm dev:local)
-MULTICA_DEV_DIR="$HOME/.super-multica-dev"
-if [ -d "$MULTICA_DEV_DIR" ]; then
-  echo "  Removing $MULTICA_DEV_DIR"
-  rm -rf "$MULTICA_DEV_DIR"
-else
-  echo "  $MULTICA_DEV_DIR does not exist, skipping"
-fi
-
-# Electron app data (macOS)
-if [[ "$OSTYPE" == "darwin"* ]]; then
-  ELECTRON_APP_DATA="$HOME/Library/Application Support/super-multica"
-  if [ -d "$ELECTRON_APP_DATA" ]; then
-    echo "  Removing $ELECTRON_APP_DATA"
-    rm -rf "$ELECTRON_APP_DATA"
-  else
-    echo "  $ELECTRON_APP_DATA does not exist, skipping"
-  fi
-fi
-
-# Electron app data (Linux)
-if [[ "$OSTYPE" == "linux-gnu"* ]]; then
-  ELECTRON_APP_DATA="$HOME/.config/super-multica"
-  if [ -d "$ELECTRON_APP_DATA" ]; then
-    echo "  Removing $ELECTRON_APP_DATA"
-    rm -rf "$ELECTRON_APP_DATA"
-  else
-    echo "  $ELECTRON_APP_DATA does not exist, skipping"
-  fi
-fi
-
-echo "✅ User data reset complete!"
-echo ""
-echo "Next steps:"
-echo "  pnpm dev              # Start app (will show onboarding)"
-echo "  pnpm dev:reset        # Reset and start in one command"
--- a/scripts/set-telegram-webhook.sh
+++ b/scripts/set-telegram-webhook.sh
@ -1,60 +0,0 @@
-#!/usr/bin/env bash
-#
-# Set Telegram Bot Webhook
-#
-# Usage:
-#   ./scripts/set-telegram-webhook.sh <webhook_url>
-#
-# Example:
-#   ./scripts/set-telegram-webhook.sh https://your-domain.ngrok-free.dev
-#
-# Reads TELEGRAM_BOT_TOKEN and TELEGRAM_WEBHOOK_SECRET_TOKEN from .env
-
-set -euo pipefail
-
-SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
-ENV_FILE="$SCRIPT_DIR/../.env"
-
-if [ ! -f "$ENV_FILE" ]; then
-  echo "Error: .env file not found at $ENV_FILE"
-  exit 1
-fi
-
-source "$ENV_FILE"
-
-if [ -z "${TELEGRAM_BOT_TOKEN:-}" ]; then
-  echo "Error: TELEGRAM_BOT_TOKEN not set in .env"
-  exit 1
-fi
-
-WEBHOOK_BASE_URL="${1:-}"
-
-if [ -z "$WEBHOOK_BASE_URL" ]; then
-  echo "Usage: $0 <webhook_base_url>"
-  echo ""
-  echo "Example:"
-  echo "  $0 https://your-domain.ngrok-free.dev"
-  exit 1
-fi
-
-# Remove trailing slash
-WEBHOOK_BASE_URL="${WEBHOOK_BASE_URL%/}"
-WEBHOOK_URL="${WEBHOOK_BASE_URL}/telegram/webhook"
-
-echo "Bot Token:    ${TELEGRAM_BOT_TOKEN:0:10}..."
-echo "Secret Token: ${TELEGRAM_WEBHOOK_SECRET_TOKEN:0:8}..."
-echo "Webhook URL:  $WEBHOOK_URL"
-echo ""
-
-# Set webhook
-echo "=> Setting webhook..."
-RESPONSE=$(curl -s "https://api.telegram.org/bot${TELEGRAM_BOT_TOKEN}/setWebhook" \
-  -d "url=${WEBHOOK_URL}" \
-  -d "secret_token=${TELEGRAM_WEBHOOK_SECRET_TOKEN:-}")
-
-echo "$RESPONSE" | python3 -m json.tool 2>/dev/null || echo "$RESPONSE"
-
-echo ""
-echo "=> Verifying webhook info..."
-INFO=$(curl -s "https://api.telegram.org/bot${TELEGRAM_BOT_TOKEN}/getWebhookInfo")
-echo "$INFO" | python3 -m json.tool 2>/dev/null || echo "$INFO"
--- a/scripts/swe-bench/.gitignore
+++ b/scripts/swe-bench/.gitignore
@ -1,5 +0,0 @@
-# Downloaded datasets
-*.jsonl
-
-# Don't ignore the scripts themselves
-!.gitignore
--- a/scripts/swe-bench/analyze.ts
+++ b/scripts/swe-bench/analyze.ts
@ -1,116 +0,0 @@
-#!/usr/bin/env tsx
-/**
- * Analyze SWE-bench run results.
- *
- * Reads the .results.jsonl file produced by run.ts and prints a summary.
- *
- * Usage:
- *   tsx scripts/swe-bench/analyze.ts [results.jsonl]
- */
-
-import { readFileSync, existsSync } from "node:fs";
-import { resolve, join } from "node:path";
-
-interface RunResult {
-  instance_id: string;
-  success: boolean;
-  patch: string;
-  error?: string;
-  duration_ms: number;
-  session_id: string;
-}
-
-function main() {
-  const resultsPath = resolve(
-    process.argv[2] || "scripts/swe-bench/predictions.results.jsonl",
-  );
-
-  if (!existsSync(resultsPath)) {
-    console.error(`Results file not found: ${resultsPath}`);
-    process.exit(1);
-  }
-
-  const lines = readFileSync(resultsPath, "utf-8").split("\n").filter(Boolean);
-  const results: RunResult[] = lines.map((l) => JSON.parse(l));
-
-  const total = results.length;
-  const patched = results.filter((r) => r.success).length;
-  const failed = results.filter((r) => !r.success).length;
-  const errors = results.filter((r) => r.error).length;
-  const durations = results.map((r) => r.duration_ms);
-  const avgDuration = durations.reduce((a, b) => a + b, 0) / total;
-  const maxDuration = Math.max(...durations);
-  const minDuration = Math.min(...durations);
-  const patchSizes = results
-    .filter((r) => r.success)
-    .map((r) => r.patch.length);
-  const avgPatchSize =
-    patchSizes.length > 0
-      ? patchSizes.reduce((a, b) => a + b, 0) / patchSizes.length
-      : 0;
-
-  console.log("=== SWE-bench Run Analysis ===\n");
-  console.log(`Total tasks:     ${total}`);
-  console.log(`Patched:         ${patched} (${((patched / total) * 100).toFixed(1)}%)`);
-  console.log(`No patch:        ${failed}`);
-  console.log(`Errors:          ${errors}`);
-  console.log();
-  console.log(`Avg duration:    ${(avgDuration / 1000).toFixed(1)}s`);
-  console.log(`Min duration:    ${(minDuration / 1000).toFixed(1)}s`);
-  console.log(`Max duration:    ${(maxDuration / 1000).toFixed(1)}s`);
-  console.log(`Avg patch size:  ${(avgPatchSize / 1024).toFixed(1)}KB`);
-
-  // Error breakdown
-  if (errors > 0) {
-    console.log("\n--- Errors ---");
-    const errorCounts = new Map<string, number>();
-    for (const r of results) {
-      if (r.error) {
-        const key = r.error.length > 60 ? r.error.slice(0, 60) + "..." : r.error;
-        errorCounts.set(key, (errorCounts.get(key) || 0) + 1);
-      }
-    }
-    for (const [err, count] of [...errorCounts.entries()].sort(
-      (a, b) => b[1] - a[1],
-    )) {
-      console.log(`  ${count}x  ${err}`);
-    }
-  }
-
-  // Per-repo breakdown
-  console.log("\n--- By Repository ---");
-  const repoStats = new Map<string, { total: number; patched: number }>();
-  for (const r of results) {
-    const repo = r.instance_id.split("__")[0]?.replace(/__/g, "/") || "unknown";
-    const stats = repoStats.get(repo) || { total: 0, patched: 0 };
-    stats.total++;
-    if (r.success) stats.patched++;
-    repoStats.set(repo, stats);
-  }
-  for (const [repo, stats] of [...repoStats.entries()].sort(
-    (a, b) => b[1].total - a[1].total,
-  )) {
-    const pct = ((stats.patched / stats.total) * 100).toFixed(0);
-    console.log(
-      `  ${repo.padEnd(30)} ${stats.patched}/${stats.total} (${pct}%)`,
-    );
-  }
-
-  // Slowest tasks
-  console.log("\n--- Slowest Tasks ---");
-  const sorted = [...results].sort((a, b) => b.duration_ms - a.duration_ms);
-  for (const r of sorted.slice(0, 5)) {
-    console.log(
-      `  ${(r.duration_ms / 1000).toFixed(1)}s  ${r.instance_id}  ${r.success ? "PATCHED" : "NO_PATCH"}`,
-    );
-  }
-
-  // Session IDs for further analysis
-  const dataDir = process.env.SMC_DATA_DIR || join(process.env.HOME || "~", ".swe-bench-eval");
-  console.log(`\n--- Run Logs ---`);
-  console.log(`Session data: ${dataDir}/sessions/`);
-  console.log(`View a session's run log:`);
-  console.log(`  cat ${dataDir}/sessions/<session-id>/run-log.jsonl | head -20`);
-}
-
-main();
--- a/scripts/swe-bench/download-dataset.py
+++ b/scripts/swe-bench/download-dataset.py
@ -1,100 +0,0 @@
-#!/usr/bin/env python3
-"""
-Download SWE-bench dataset from HuggingFace and export to JSONL for the Node.js runner.
-
-Usage:
-  pip install datasets
-  python scripts/swe-bench/download-dataset.py [--dataset verified|lite|full] [--limit N] [--output PATH]
-
-Output format (one JSON object per line):
-  {
-    "instance_id": "django__django-16379",
-    "repo": "django/django",
-    "base_commit": "abc123...",
-    "problem_statement": "...",
-    "hints_text": "...",
-    "patch": "...",           # gold patch (for reference, not shown to agent)
-    "test_patch": "...",      # test patch applied during evaluation
-    "version": "4.2",
-    "environment_setup_commit": "..."
-  }
-"""
-
-import argparse
-import json
-import sys
-
-DATASET_MAP = {
-    "verified": "princeton-nlp/SWE-bench_Verified",
-    "lite": "princeton-nlp/SWE-bench_Lite",
-    "full": "princeton-nlp/SWE-bench",
-}
-
-
-def main():
-    parser = argparse.ArgumentParser(description="Download SWE-bench dataset to JSONL")
-    parser.add_argument(
-        "--dataset",
-        choices=["verified", "lite", "full"],
-        default="lite",
-        help="Dataset variant (default: lite)",
-    )
-    parser.add_argument(
-        "--limit", type=int, default=0, help="Limit number of instances (0 = all)"
-    )
-    parser.add_argument(
-        "--output",
-        type=str,
-        default=None,
-        help="Output JSONL path (default: scripts/swe-bench/<dataset>.jsonl)",
-    )
-    parser.add_argument(
-        "--split",
-        type=str,
-        default="test",
-        help="Dataset split (default: test)",
-    )
-    args = parser.parse_args()
-
-    try:
-        from datasets import load_dataset
-    except ImportError:
-        print("Error: 'datasets' package not installed. Run: pip install datasets", file=sys.stderr)
-        sys.exit(1)
-
-    dataset_name = DATASET_MAP[args.dataset]
-    output_path = args.output or f"scripts/swe-bench/{args.dataset}.jsonl"
-
-    print(f"Downloading {dataset_name} (split={args.split})...", file=sys.stderr)
-    ds = load_dataset(dataset_name, split=args.split)
-
-    # Fields to keep
-    keep_fields = [
-        "instance_id",
-        "repo",
-        "base_commit",
-        "problem_statement",
-        "hints_text",
-        "patch",
-        "test_patch",
-        "version",
-        "environment_setup_commit",
-    ]
-
-    count = 0
-    with open(output_path, "w") as f:
-        for item in ds:
-            record = {}
-            for field in keep_fields:
-                if field in item:
-                    record[field] = item[field]
-            f.write(json.dumps(record, ensure_ascii=False) + "\n")
-            count += 1
-            if args.limit and count >= args.limit:
-                break
-
-    print(f"Wrote {count} instances to {output_path}", file=sys.stderr)
-
-
-if __name__ == "__main__":
-    main()
--- a/scripts/swe-bench/evaluate.sh
+++ b/scripts/swe-bench/evaluate.sh
@ -1,68 +0,0 @@
-#!/usr/bin/env bash
-#
-# Evaluate Multica predictions against SWE-bench using the official Docker harness.
-#
-# Prerequisites:
-#   pip install swebench
-#   Docker running with at least 120GB storage, 16GB RAM, 8 CPU cores
-#
-# Usage:
-#   bash scripts/swe-bench/evaluate.sh [predictions.jsonl] [dataset] [run_id]
-#
-# Examples:
-#   bash scripts/swe-bench/evaluate.sh
-#   bash scripts/swe-bench/evaluate.sh scripts/swe-bench/predictions.jsonl lite multica-v1
-
-set -euo pipefail
-
-PREDICTIONS="${1:-scripts/swe-bench/predictions.jsonl}"
-DATASET="${2:-lite}"
-RUN_ID="${3:-multica}"
-
-# Map short names to HuggingFace dataset names
-case "$DATASET" in
-  lite)     DATASET_NAME="princeton-nlp/SWE-bench_Lite" ;;
-  verified) DATASET_NAME="princeton-nlp/SWE-bench_Verified" ;;
-  full)     DATASET_NAME="princeton-nlp/SWE-bench" ;;
-  *)        DATASET_NAME="$DATASET" ;;
-esac
-
-echo "=== SWE-bench Evaluation ==="
-echo "Predictions: $PREDICTIONS"
-echo "Dataset:     $DATASET_NAME"
-echo "Run ID:      $RUN_ID"
-echo ""
-
-if [ ! -f "$PREDICTIONS" ]; then
-  echo "Error: Predictions file not found: $PREDICTIONS"
-  exit 1
-fi
-
-TASK_COUNT=$(wc -l < "$PREDICTIONS" | tr -d ' ')
-echo "Tasks to evaluate: $TASK_COUNT"
-echo ""
-
-# Check if swebench is installed
-if ! python -c "import swebench" 2>/dev/null; then
-  echo "Error: swebench not installed. Run: pip install swebench"
-  exit 1
-fi
-
-# Check if Docker is running
-if ! docker info >/dev/null 2>&1; then
-  echo "Error: Docker is not running"
-  exit 1
-fi
-
-echo "Starting evaluation (this may take a while)..."
-echo ""
-
-python -m swebench.harness.run_evaluation \
-  --dataset_name "$DATASET_NAME" \
-  --predictions_path "$PREDICTIONS" \
-  --max_workers 4 \
-  --run_id "$RUN_ID"
-
-echo ""
-echo "=== Evaluation Complete ==="
-echo "Check logs/ and evaluation_results/ for detailed results."
--- a/scripts/swe-bench/run.ts
+++ b/scripts/swe-bench/run.ts
@ -1,392 +0,0 @@
-#!/usr/bin/env tsx
-/**
- * SWE-bench Runner for Multica
- *
- * Runs the Multica agent against SWE-bench task instances and collects patches.
- *
- * Usage:
- *   tsx scripts/swe-bench/run.ts [options]
- *
- * Options:
- *   --dataset PATH      Path to JSONL dataset (default: scripts/swe-bench/lite.jsonl)
- *   --provider NAME     LLM provider (default: kimi-coding)
- *   --model NAME        Model name
- *   --limit N           Max tasks to run (default: all)
- *   --offset N          Skip first N tasks (default: 0)
- *   --output PATH       Output predictions JSONL (default: scripts/swe-bench/predictions.jsonl)
- *   --workdir PATH      Working directory for repos (default: /tmp/swe-bench)
- *   --timeout MS        Timeout per task in ms (default: 300000 = 5min)
- *   --instance ID       Run a single instance by ID
- *   --debug             Enable debug logging
- */
-
-import { readFileSync, writeFileSync, appendFileSync, existsSync, mkdirSync } from "node:fs";
-import { join, resolve } from "node:path";
-import { execSync, spawn } from "node:child_process";
-import { Agent } from "@multica/core";
-import type { AgentOptions } from "@multica/core";
-
-// ============================================================
-// Types
-// ============================================================
-
-interface SWEBenchTask {
-  instance_id: string;
-  repo: string;
-  base_commit: string;
-  problem_statement: string;
-  hints_text?: string;
-  patch?: string;
-  test_patch?: string;
-  version?: string;
-  environment_setup_commit?: string;
-}
-
-interface Prediction {
-  instance_id: string;
-  model_patch: string;
-  model_name_or_path: string;
-}
-
-interface RunResult {
-  instance_id: string;
-  success: boolean;
-  patch: string;
-  error?: string;
-  duration_ms: number;
-  session_id: string;
-}
-
-// ============================================================
-// CLI argument parsing
-// ============================================================
-
-interface RunOptions {
-  dataset: string;
-  provider: string;
-  model?: string;
-  limit: number;
-  offset: number;
-  output: string;
-  workdir: string;
-  timeout: number;
-  instance?: string;
-  debug: boolean;
-}
-
-function parseArgs(): RunOptions {
-  const args = process.argv.slice(2);
-  const opts: RunOptions = {
-    dataset: "scripts/swe-bench/lite.jsonl",
-    provider: "kimi-coding",
-    limit: 0,
-    offset: 0,
-    output: "scripts/swe-bench/predictions.jsonl",
-    workdir: "/tmp/swe-bench",
-    timeout: 300_000, // 5 minutes
-    debug: false,
-  };
-
-  for (let i = 0; i < args.length; i++) {
-    const arg = args[i]!;
-    if (arg === "--dataset") opts.dataset = args[++i]!;
-    else if (arg === "--provider") opts.provider = args[++i]!;
-    else if (arg === "--model") opts.model = args[++i]!;
-    else if (arg === "--limit") opts.limit = parseInt(args[++i]!, 10);
-    else if (arg === "--offset") opts.offset = parseInt(args[++i]!, 10);
-    else if (arg === "--output") opts.output = args[++i]!;
-    else if (arg === "--workdir") opts.workdir = args[++i]!;
-    else if (arg === "--timeout") opts.timeout = parseInt(args[++i]!, 10);
-    else if (arg === "--instance") opts.instance = args[++i]!;
-    else if (arg === "--debug") opts.debug = true;
-    else {
-      console.error(`Unknown argument: ${arg}`);
-      process.exit(1);
-    }
-  }
-
-  return opts;
-}
-
-// ============================================================
-// Dataset loading
-// ============================================================
-
-function loadDataset(path: string): SWEBenchTask[] {
-  if (!existsSync(path)) {
-    console.error(`Dataset not found: ${path}`);
-    console.error("Run: python scripts/swe-bench/download-dataset.py");
-    process.exit(1);
-  }
-  const lines = readFileSync(path, "utf-8").split("\n").filter(Boolean);
-  return lines.map((line) => JSON.parse(line) as SWEBenchTask);
-}
-
-// ============================================================
-// Repository setup
-// ============================================================
-
-function setupRepo(task: SWEBenchTask, workdir: string): string {
-  const repoDir = join(workdir, task.instance_id.replace(/\//g, "__"));
-
-  if (existsSync(repoDir)) {
-    // Reset existing repo to base commit
-    log(`  Resetting existing repo to ${task.base_commit.slice(0, 8)}...`);
-    execSync(`git checkout -f ${task.base_commit} && git clean -fdx`, {
-      cwd: repoDir,
-      stdio: "pipe",
-      timeout: 60_000,
-    });
-  } else {
-    // Clone from GitHub
-    const repoUrl = `https://github.com/${task.repo}.git`;
-    log(`  Cloning ${task.repo}...`);
-    mkdirSync(workdir, { recursive: true });
-    execSync(`git clone --quiet ${repoUrl} "${repoDir}"`, {
-      stdio: "pipe",
-      timeout: 120_000,
-    });
-    execSync(`git checkout -f ${task.base_commit}`, {
-      cwd: repoDir,
-      stdio: "pipe",
-      timeout: 30_000,
-    });
-  }
-
-  return repoDir;
-}
-
-// ============================================================
-// System prompt
-// ============================================================
-
-function buildSystemPrompt(task: SWEBenchTask): string {
-  return `You are an expert software engineer tasked with fixing a bug in an open-source repository.
-
-## Instructions
-
-1. Read the issue description carefully and understand the problem.
-2. Explore the repository to find the relevant source code.
-3. Identify the root cause of the issue.
-4. Make the minimal set of changes to fix the issue. Do NOT add tests.
-5. After making changes, verify your fix makes sense.
-
-## Important Rules
-
- Make ONLY the changes necessary to fix the described issue.
- Do NOT modify or add any test files.
- Do NOT add comments explaining the fix unless the code is non-obvious.
- Do NOT refactor unrelated code.
- Keep changes minimal and focused.
-
-## Repository
-
-This is the \`${task.repo}\` repository checked out at commit \`${task.base_commit.slice(0, 12)}\`.`;
-}
-
-function buildPrompt(task: SWEBenchTask): string {
-  let prompt = `## Issue\n\n${task.problem_statement}`;
-  if (task.hints_text) {
-    prompt += `\n\n## Hints\n\n${task.hints_text}`;
-  }
-  prompt += `\n\nPlease fix this issue. Remember: make minimal changes, do not modify tests.`;
-  return prompt;
-}
-
-// ============================================================
-// Run a single task
-// ============================================================
-
-async function runTask(
-  task: SWEBenchTask,
-  opts: RunOptions,
-): Promise<RunResult> {
-  const start = Date.now();
-
-  // Setup repo
-  const repoDir = setupRepo(task, opts.workdir);
-
-  // Create agent
-  const agentOptions: AgentOptions = {
-    provider: opts.provider,
-    model: opts.model,
-    cwd: repoDir,
-    enableRunLog: true,
-    debug: opts.debug,
-    systemPrompt: buildSystemPrompt(task),
-    enableSkills: false,
-    tools: {
-      // Only allow coding tools — no web, no cron, no sessions
-      deny: ["web_fetch", "web_search", "cron", "data", "delegate", "send_file"],
-    },
-  };
-
-  const agent = new Agent(agentOptions);
-
-  log(`  Session: ${agent.sessionId}`);
-
-  try {
-    // Run agent with timeout
-    const result = await Promise.race([
-      agent.run(buildPrompt(task)),
-      new Promise<never>((_, reject) =>
-        setTimeout(() => reject(new Error("timeout")), opts.timeout),
-      ),
-    ]);
-
-    // Collect the git diff (the patch)
-    let patch = "";
-    try {
-      patch = execSync("git diff", {
-        cwd: repoDir,
-        encoding: "utf-8",
-        maxBuffer: 10 * 1024 * 1024, // 10MB
-        timeout: 10_000,
-      });
-    } catch {
-      // Also check for staged changes
-      try {
-        patch = execSync("git diff HEAD", {
-          cwd: repoDir,
-          encoding: "utf-8",
-          maxBuffer: 10 * 1024 * 1024,
-          timeout: 10_000,
-        });
-      } catch {
-        patch = "";
-      }
-    }
-
-    return {
-      instance_id: task.instance_id,
-      success: patch.length > 0,
-      patch,
-      error: result.error,
-      duration_ms: Date.now() - start,
-      session_id: agent.sessionId,
-    };
-  } catch (err) {
-    // Collect any partial patch
-    let patch = "";
-    try {
-      patch = execSync("git diff", {
-        cwd: repoDir,
-        encoding: "utf-8",
-        maxBuffer: 10 * 1024 * 1024,
-        timeout: 10_000,
-      });
-    } catch {
-      // ignore
-    }
-
-    return {
-      instance_id: task.instance_id,
-      success: false,
-      patch,
-      error: err instanceof Error ? err.message : String(err),
-      duration_ms: Date.now() - start,
-      session_id: agent.sessionId,
-    };
-  }
-}
-
-// ============================================================
-// Logging
-// ============================================================
-
-function log(msg: string) {
-  const ts = new Date().toISOString().slice(11, 19);
-  console.error(`[${ts}] ${msg}`);
-}
-
-// ============================================================
-// Main
-// ============================================================
-
-async function main() {
-  const opts = parseArgs();
-
-  log("SWE-bench Runner for Multica");
-  log(`Provider: ${opts.provider}${opts.model ? ` (${opts.model})` : ""}`);
-  log(`Dataset: ${opts.dataset}`);
-  log(`Work dir: ${opts.workdir}`);
-  log(`Timeout: ${opts.timeout / 1000}s per task`);
-
-  // Set SMC_DATA_DIR for isolation
-  if (!process.env.SMC_DATA_DIR) {
-    process.env.SMC_DATA_DIR = join(process.env.HOME || "~", ".swe-bench-eval");
-    log(`SMC_DATA_DIR: ${process.env.SMC_DATA_DIR}`);
-  }
-
-  // Load dataset
-  let tasks = loadDataset(resolve(opts.dataset));
-  log(`Loaded ${tasks.length} tasks`);
-
-  // Filter by instance ID if specified
-  if (opts.instance) {
-    tasks = tasks.filter((t) => t.instance_id === opts.instance);
-    if (tasks.length === 0) {
-      console.error(`Instance not found: ${opts.instance}`);
-      process.exit(1);
-    }
-  }
-
-  // Apply offset and limit
-  if (opts.offset > 0) {
-    tasks = tasks.slice(opts.offset);
-  }
-  if (opts.limit > 0) {
-    tasks = tasks.slice(0, opts.limit);
-  }
-
-  log(`Running ${tasks.length} tasks`);
-
-  // Prepare output
-  const outputPath = resolve(opts.output);
-  const resultsPath = outputPath.replace(".jsonl", ".results.jsonl");
-
-  // Run tasks sequentially
-  const modelName = `multica-${opts.provider}${opts.model ? `-${opts.model}` : ""}`;
-  let completed = 0;
-  let succeeded = 0;
-
-  for (const task of tasks) {
-    completed++;
-    log(`\n[${completed}/${tasks.length}] ${task.instance_id}`);
-
-    const result = await runTask(task, opts);
-
-    if (result.success) succeeded++;
-
-    // Write prediction in SWE-bench format
-    const prediction: Prediction = {
-      instance_id: result.instance_id,
-      model_patch: result.patch,
-      model_name_or_path: modelName,
-    };
-    appendFileSync(outputPath, JSON.stringify(prediction) + "\n");
-
-    // Write detailed result
-    appendFileSync(resultsPath, JSON.stringify(result) + "\n");
-
-    const status = result.success ? "PATCHED" : "NO_PATCH";
-    const errorInfo = result.error ? ` (${result.error})` : "";
-    log(
-      `  ${status} | ${(result.duration_ms / 1000).toFixed(1)}s | patch=${result.patch.length} bytes${errorInfo}`,
-    );
-  }
-
-  log(`\n========================================`);
-  log(`Results: ${succeeded}/${completed} tasks produced patches`);
-  log(`Predictions: ${outputPath}`);
-  log(`Details: ${resultsPath}`);
-  log(`\nTo evaluate with SWE-bench harness:`);
-  log(
-    `  python -m swebench.harness.run_evaluation --dataset_name princeton-nlp/SWE-bench_Lite --predictions_path ${outputPath} --max_workers 4 --run_id multica`,
-  );
-}
-
-main().catch((err) => {
-  console.error("Fatal error:", err);
-  process.exit(1);
-});
				`@ -1 +0,0 @@`
				`帮我在 Notion 新建一个页面，标题是今天待办，并写入三条任务：修复登录 bug、写周报、安排评审`