feat: pivot to AI-native task management platform (#232)

Replace the agent framework codebase with a new monorepo structure
for an AI-native Linear-like product where agents are first-class citizens.

New architecture:
- server/ — Go backend (Chi + gorilla/websocket + sqlc)
  - API server with REST routes for issues, agents, inbox, workspaces
  - WebSocket hub for real-time updates
  - Local daemon entry point for agent runtime connection
  - PostgreSQL migration with 13 tables (issue, agent, inbox, etc.)
  - WebSocket protocol types for server<->daemon communication
- apps/web/ — Next.js 16 frontend
  - Dashboard layout with sidebar navigation
  - Route skeleton: inbox, issues, agents, board, settings
- packages/ui/ — Preserved shadcn/ui design system (26+ components)
- packages/types/ — Full API contract types (Issue, Agent, Workspace, Inbox, Events)
- packages/sdk/ — REST ApiClient + WebSocket WSClient
- packages/store/ — Zustand stores (issue, agent, inbox, auth)
- packages/hooks/ — React hooks (useIssues, useAgents, useInbox, useRealtime)
- packages/utils/ — Shared utilities

Removed: apps/cli, apps/desktop, apps/mobile, apps/gateway,
packages/core, skills/, and all agent-framework code.

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Jiayuan Zhang 2026-03-20 17:55:49 +08:00 committed by GitHub
parent 3f589d8326
commit d4f5c5b16f
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
677 changed files with 2779 additions and 122531 deletions

View file

@ -1,42 +0,0 @@
#!/usr/bin/env bash
#
# Archive and clean the dev environment data.
#
# Moves ~/.super-multica-dev and ~/Documents/Multica-dev (whichever exist)
# into a fresh timestamped directory so they can be inspected later.
#
# Usage:
#   pnpm dev:local:archive
#
# Archives are stored in: ~/.super-multica-dev-archives/<timestamp>/
set -euo pipefail

DEV_DATA="$HOME/.super-multica-dev"
DEV_WORKSPACE="$HOME/Documents/Multica-dev"
ARCHIVE_BASE="$HOME/.super-multica-dev-archives"
TIMESTAMP=$(date +"%Y%m%d-%H%M%S")
ARCHIVE_DIR="$ARCHIVE_BASE/$TIMESTAMP"

# archive_into SOURCE LABEL
# Moves SOURCE to $ARCHIVE_DIR/LABEL and reports it; does nothing when SOURCE
# is not an existing directory.
archive_into() {
  local source_path="$1"
  local label="$2"
  [[ -d "$source_path" ]] || return 0
  mv "$source_path" "$ARCHIVE_DIR/$label"
  echo " Archived $source_path -> $ARCHIVE_DIR/$label"
}

# Bail out early (successfully) when neither directory is present.
if ! [[ -d "$DEV_DATA" || -d "$DEV_WORKSPACE" ]]; then
  echo "Nothing to archive — neither $DEV_DATA nor $DEV_WORKSPACE exists."
  exit 0
fi

mkdir -p "$ARCHIVE_DIR"
archive_into "$DEV_DATA" "data"
archive_into "$DEV_WORKSPACE" "workspace"

echo ""
echo "Archived to: $ARCHIVE_DIR"
echo "Dev environment is now clean. Run 'pnpm dev:local' to start fresh."

View file

@ -1,79 +0,0 @@
#!/usr/bin/env node
import * as esbuild from "esbuild";
import { fileURLToPath } from "url";
import { dirname, resolve } from "path";
import { readFileSync, chmodSync } from "fs";
// Resolve the package root (one level above this script's directory).
const __dirname = dirname(fileURLToPath(import.meta.url));
const rootDir = resolve(__dirname, "..");
// Collect every dependency name declared in package.json (runtime and dev).
const pkg = JSON.parse(readFileSync(resolve(rootDir, "package.json"), "utf8"));
const allDeps = ["dependencies", "devDependencies"].flatMap((field) =>
  Object.keys(pkg[field] || {}),
);
/**
 * esbuild plugin that blanks out a leading shebang (`#!...`) in every `.ts`
 * file it loads, so shebangs from source files do not end up in the bundle.
 *
 * Only the shebang text is removed; its line terminator is kept. This has two
 * effects compared with deleting the whole line:
 * - files with CRLF line endings are handled (`.` never matches `\r`, so a
 *   pattern that requires `\n` right after `.*` cannot match `#!...\r\n`);
 * - the line count of the returned contents matches the file on disk.
 */
const stripShebangPlugin = {
  name: "strip-shebang",
  setup(build) {
    build.onLoad({ filter: /\.ts$/ }, async (args) => {
      const source = readFileSync(args.path, "utf8");
      // No `m` flag: `^` anchors to the very start of the file only.
      const contents = source.replace(/^#!.*/, "");
      return { contents, loader: "ts" };
    });
  },
};
/**
 * Bundle the unified CLI entry point into a single executable ESM file.
 *
 * Every package listed in package.json is marked external, so the output
 * still needs node_modules at runtime. After bundling, the output file is
 * made executable and a short usage summary is printed.
 */
async function build() {
  // Unified CLI entry point
  const entry = "src/agent/cli/index.ts";
  const outfile = "bin/multica.mjs";
  const outputPath = resolve(rootDir, outfile);

  console.log(`Building ${entry} -> ${outfile}...`);

  const options = {
    entryPoints: [resolve(rootDir, entry)],
    outfile: outputPath,
    bundle: true,
    platform: "node",
    target: "node20",
    format: "esm",
    banner: { js: "#!/usr/bin/env node" },
    plugins: [stripShebangPlugin],
    sourcemap: true,
    minify: false,
    // Dependencies are not bundled; they are loaded from node_modules at runtime.
    external: allDeps,
  };
  await esbuild.build(options);

  // Make executable
  chmodSync(outputPath, 0o755);

  const report = [
    `${outfile}`,
    "\nBuild complete! Binary is in ./bin/",
    "\nUsage:",
    " multica # Interactive mode (default)",
    " multica run <prompt> # Run a single prompt",
    " multica chat # Interactive mode",
    " multica session list # List sessions",
    " multica profile list # List profiles",
    " multica skills list # List skills",
    " multica tools list # List tools",
    " multica credentials init # Initialize credentials",
    " multica dev # Start dev servers",
    " multica help # Show help",
    "\nNote: The built binary requires node_modules to be present.",
    "Run 'pnpm install --prod' to install only production dependencies.",
  ];
  for (const line of report) {
    console.log(line);
  }
}
// Run the build; on any failure print the error and exit with status 1.
build().then(undefined, (failure) => {
  console.error(failure);
  process.exit(1);
});

View file

@ -1,137 +0,0 @@
#!/usr/bin/env bash
# Compaction Benchmark - Multi-turn test with low context window
#
# Runs four prompts in a single Multica agent session with a very low context
# window (default 20k tokens) to force compaction to trigger quickly, then
# prints a summary of the session's run-log. The run-log stays on disk for
# further analysis.
#
# Usage:
#   bash scripts/compaction-benchmark/run.sh [provider] [context_window]
#
# Defaults: provider=kimi-coding, context_window=20000
#
# NOTE: ~/.super-multica-e2e is deleted at startup.
set -euo pipefail

PROVIDER="${1:-kimi-coding}"
CONTEXT_WINDOW="${2:-20000}"
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
ROOT_DIR="$(cd "$SCRIPT_DIR/../.." && pwd)"
export SMC_DATA_DIR=~/.super-multica-e2e

echo "=== Compaction Benchmark ==="
echo "Provider: $PROVIDER"
echo "Context Window: $CONTEXT_WINDOW tokens"
echo "Data Dir: $SMC_DATA_DIR"
echo ""

# Clean previous E2E data
rm -rf "$SMC_DATA_DIR"
cd "$ROOT_DIR"

# Combined stdout+stderr of the most recent turn (set by run_turn).
TURN_OUTPUT=""

# run_turn PROMPT [EXTRA_ARGS...]
# Runs one `pnpm multica run` turn and stores its combined output in
# TURN_OUTPUT. EXTRA_ARGS (e.g. --session ID) are placed before the prompt.
# On failure the captured output is printed to stderr before exiting with the
# command's status; without this, `set -e` would abort and discard it.
run_turn() {
  local prompt="$1"
  shift
  local status=0
  TURN_OUTPUT=$(SMC_DATA_DIR="$SMC_DATA_DIR" pnpm multica run \
    --run-log \
    --provider "$PROVIDER" \
    --context-window "$CONTEXT_WINDOW" \
    "$@" \
    "$prompt" \
    2>&1) || status=$?
  if (( status != 0 )); then
    echo "ERROR: 'pnpm multica run' exited with status $status" >&2
    printf '%s\n' "$TURN_OUTPUT" >&2
    exit "$status"
  fi
}

# show_turn_preview
# Prints the first 5 lines of TURN_OUTPUT. A here-string is used instead of
# `echo ... | head`, because with pipefail a large output can make the writer
# die of SIGPIPE once head exits, which would abort the script.
show_turn_preview() {
  head -5 <<< "$TURN_OUTPUT"
  echo ""
}

# Turn 1: Start a session with a substantial prompt that generates tool usage
echo "--- Turn 1: Initial prompt (read multiple files) ---"
run_turn "Read the following files and give me a brief summary of each: packages/core/src/agent/runner.ts, packages/core/src/agent/session/session-manager.ts, packages/core/src/agent/context-window/token-estimation.ts. List the main exports and key functions in each file."

# Extract the session ID and directory from the "[session: ...]" and
# "[session-dir: ...]" markers in the captured output. `|| true` keeps a
# non-matching grep from aborting the script (pipefail + errexit) before the
# explicit error report below can run.
SESSION_ID=$(grep -o '\[session: [^]]*\]' <<< "$TURN_OUTPUT" | head -1 | sed 's/\[session: //;s/\]//' || true)
SESSION_DIR=$(grep -o '\[session-dir: [^]]*\]' <<< "$TURN_OUTPUT" | head -1 | sed 's/\[session-dir: //;s/\]//' || true)
if [ -z "$SESSION_ID" ]; then
  echo "ERROR: Could not extract session ID from output"
  echo "$TURN_OUTPUT"
  exit 1
fi
echo "Session ID: $SESSION_ID"
echo "Session Dir: $SESSION_DIR"
echo ""

# Turn 2: Continue the session with more file reads to push context higher
echo "--- Turn 2: More file reads (push context higher) ---"
run_turn "Now also read packages/core/src/agent/context-window/summarization.ts and packages/core/src/agent/context-window/tool-result-pruning.ts. Describe the key algorithms in each." \
  --session "$SESSION_ID"
show_turn_preview

# Turn 3: More context-heavy work
echo "--- Turn 3: Additional analysis (should trigger compaction) ---"
run_turn "Read packages/core/src/agent/session/compaction.ts and explain the three compaction modes. Also read packages/core/src/agent/context-window/guard.ts and explain the guard thresholds." \
  --session "$SESSION_ID"
show_turn_preview

# Turn 4: More tool usage
echo "--- Turn 4: Write and test (more context pressure) ---"
run_turn "Based on everything you've read so far, list all the constants and thresholds used in the compaction system. Provide exact values and which file each is defined in." \
  --session "$SESSION_ID"
show_turn_preview

# Output analysis summary
echo "=== Benchmark Complete ==="
echo "Session Dir: $SESSION_DIR"
echo ""

# Show run-log stats
RUN_LOG="$SESSION_DIR/run-log.jsonl"
if [ -f "$RUN_LOG" ]; then
  echo "--- Run Log Event Summary ---"
  echo "Total events: $(wc -l < "$RUN_LOG")"
  echo ""
  echo "Events by type:"
  # Count events per 'event' field; lines that are not usable JSON objects
  # are skipped. Any python3 failure falls back to the notice after `||`.
  python3 -c "
import sys, json
from collections import Counter
events = Counter()
for line in sys.stdin:
    try:
        obj = json.loads(line.strip())
        events[obj.get('event', 'unknown')] += 1
    except Exception:
        pass
for event, count in sorted(events.items()):
    print(f' {event}: {count}')
" < "$RUN_LOG" 2>/dev/null || echo " (python3 not available for analysis)"
  echo ""
  echo "--- Compaction Events ---"
  # Pretty-print every event whose name mentions compaction, overflow or pruning.
  python3 -c "
import sys, json
for line in sys.stdin:
    try:
        obj = json.loads(line.strip())
        event = obj.get('event', '')
        if 'compact' in event or 'overflow' in event or 'pruning' in event:
            print(json.dumps(obj, indent=2))
    except Exception:
        pass
" < "$RUN_LOG" 2>/dev/null || echo " (python3 not available for analysis)"
fi
echo ""
echo "=== Full run-log path: $SESSION_DIR/run-log.jsonl ==="
echo "=== Session file path: $SESSION_DIR/session.jsonl ==="

View file

@ -1,56 +0,0 @@
#!/usr/bin/env bash
#
# Local development launcher: Gateway (with Telegram bot) + Desktop + Web (for login)
#
# Usage:
#   pnpm dev:local
#
# TELEGRAM_BOT_TOKEN is read from the .env file at the repo root.
# The Gateway is started with PORT=4000 so it does not collide with the Web
# app on port 3000; the Desktop app is pointed at both local services.
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
ROOT_DIR="$SCRIPT_DIR/.."
ENV_FILE="$ROOT_DIR/.env"

# fail LINE...
# Prints each argument on its own line and exits with status 1.
fail() {
  local line
  for line in "$@"; do
    echo "$line"
  done
  exit 1
}

# Load .env with auto-export, so every variable it defines reaches the child
# processes started below.
[[ -f "$ENV_FILE" ]] || fail \
  "Error: .env file not found at $ENV_FILE" \
  "Copy .env.example to .env and fill in TELEGRAM_BOT_TOKEN"
set -o allexport
. "$ENV_FILE"
set +o allexport

[[ -n "${TELEGRAM_BOT_TOKEN:-}" ]] || fail "Error: TELEGRAM_BOT_TOKEN not set in .env"

cat <<'BANNER'
Starting local dev environment...
 Gateway: http://localhost:4000 (Telegram long-polling mode)
 Web: http://localhost:3000 (OAuth login)
 Desktop: connecting to local Gateway + Web
 Data dir: ~/.super-multica-dev (isolated from production)
 Workspace: ~/Documents/Multica-dev (isolated from production)

BANNER

# Build shared packages first
pnpm turbo build --filter=@multica/types --filter=@multica/utils --filter=@multica/core

# Environment shared by the Gateway and Desktop processes: isolated data and
# workspace directories, with run logging switched on.
DEV_ENV="SMC_DATA_DIR=~/.super-multica-dev MULTICA_WORKSPACE_DIR=~/Documents/Multica-dev MULTICA_RUN_LOG=1"

# One command per process, in the same order as the -n / -c lists below.
DEV_COMMANDS=(
  "pnpm --filter @multica/types dev"
  "pnpm --filter @multica/utils dev"
  "pnpm --filter @multica/core dev"
  "PORT=4000 ${DEV_ENV} pnpm --filter @multica/gateway dev"
  "MULTICA_API_URL=https://api-dev.copilothub.ai pnpm --filter @multica/web dev"
  "GATEWAY_URL=http://localhost:4000 MAIN_VITE_WEB_URL=http://localhost:3000 ${DEV_ENV} pnpm --filter @multica/desktop dev"
)

# Start everything
exec pnpm concurrently \
  -n types,utils,core,gateway,web,desktop \
  -c blue,green,yellow,magenta,red,cyan \
  "${DEV_COMMANDS[@]}"

View file

@ -1,13 +0,0 @@
Complete a high-complexity investment research task:
Objective: Analyze the top 10 US stocks by market capitalization across their most recent three complete fiscal years and provide investment recommendations for 2026 (2026-01-01 to 2026-12-31).
Requirements:
1. Use "top 10 US stocks by market cap as of 2026-02-01" as the sample; if data windows are incomplete for certain companies, note this and substitute with the most recent available complete fiscal year.
2. Generate 1 detailed analysis per company, covering at minimum: revenue and profit structure, gross/operating margin trends, cash flow quality, capex and buybacks/dividends, valuation range, and key risks.
3. Generate an Excel file (.xlsx) with at least 4 sheets: `raw_data`, `company_scorecard`, `valuation`, `risk_matrix`.
4. Generate a comprehensive report with cross-company comparison and tiering (core holding / watchlist / avoid), along with 2026 portfolio recommendations (including position ranges and trigger conditions).
5. Output a separate `sources.md` listing key data source links and retrieval timestamps.
6. If unable to generate xlsx directly, explain why and provide structurally equivalent CSV files.
Execution requirements: First present an 8-12 step execution plan, then execute. Conclude with a self-check checklist confirming all files are complete.

View file

@ -1,14 +0,0 @@
Build an "AI Value Chain Fundamentals & Valuation Scorecard" project.
Stock universe: NVDA, AMD, AVGO, MSFT, GOOGL, AMZN, META, TSM, ASML, ANET.
Time range: 2023-01-01 to 2025-12-31 (fill gaps with most recent available data and flag accordingly).
Requirements:
1. Construct a 100-point scoring model with at least 6 dimensions: growth, profitability, capital efficiency, R&D intensity, cash flow quality, and valuation margin of safety.
2. Provide weights and scoring logic for each dimension; must be reproducible.
3. Generate an Excel file (.xlsx) with sheets: `input_data`, `factor_scores`, `weighted_rank`, `scenario_2026`.
4. In `scenario_2026`, provide target ranges and trigger signals under three scenarios (optimistic / base / conservative).
5. Produce `investment_memo.md` (including entry logic for the top 3 and avoidance logic for the bottom 3).
6. Produce `sources.md` (source links + dates).
Execution requirements: Plan before executing; conclude with a "reproducibility check" (i.e., could someone else reproduce your results by following your steps?).

View file

@ -1,15 +0,0 @@
Perform a "US Major Bank 2026 Stress Test" task.
Sample: JPM, BAC, C, WFC, GS, MS.
Requirements:
1. Compile key metrics from the most recent three complete fiscal years (preferably 2023-2025): net interest margin (NIM), CET1, loan loss provisions, commercial real estate (CRE) exposure, deposit cost changes, unrealized losses, etc.
2. Construct two stress scenarios:
- Mild Recession: unemployment +150bp, federal funds rate -100bp
- Severe Recession: unemployment +300bp, federal funds rate -200bp, CRE default rate significantly higher
3. Estimate directional changes in profit and capital adequacy for each bank under both scenarios, and rank vulnerability.
4. Generate an Excel file (.xlsx) with sheets: `bank_raw`, `stress_assumptions`, `impact_estimate`, `ranking`.
5. Generate `risk_brief.md` containing "top 5 risk signals to watch."
6. Generate `sources.md`.
Execution requirements: Present methodology first, then results; conclude by listing the 3 assumptions you are least confident about.

View file

@ -1,14 +0,0 @@
Perform a "US Consumer Sector & Macro Variable Linkage Analysis."
Sample companies: WMT, COST, TGT, HD, LOW, MCD, SBUX, NKE, DIS, AMZN.
Time range: 2023-01-01 to 2025-12-31.
Requirements:
1. Split companies into "consumer staples" and "consumer discretionary" groups; compare revenue growth, margins, inventory changes, same-store sales (if available), and cash flow quality.
2. Analyze each group's earnings elasticity relative to macro variables (CPI, real wages, unemployment, interest rates).
3. Build a "2026 three-scenario" earnings elasticity matrix: soft landing / reflation / recession.
4. Generate an Excel file (.xlsx) with sheets: `company_metrics`, `macro_series`, `elasticity_matrix`, `portfolio_actions`.
5. Generate `strategy_note.md` with 2026 sector allocation recommendations and rebalancing trigger conditions.
6. Generate `sources.md`.
Execution requirements: Each allocation recommendation must explicitly state the verifiable metrics behind it.

View file

@ -1,17 +0,0 @@
Complete an "Energy Price Shock Sensitivity Analysis for Energy & Transport Sectors."
Sample: XOM, CVX, COP, SLB, DAL, UAL, FDX, UPS.
Time range: 2023-01-01 to 2025-12-31.
Requirements:
1. Summarize each company's sensitivity direction to oil/fuel costs and sources of operating leverage.
2. Construct three oil price paths for 2026:
- Scenario A: WTI average $60
- Scenario B: WTI average $80
- Scenario C: WTI average $100
3. Estimate directional changes in earnings and valuation for each company under different scenarios (ranges are acceptable in place of point estimates, but the rationale must be provided).
4. Generate an Excel file (.xlsx) with sheets: `raw_financials`, `oil_scenarios`, `sensitivity_map`, `trade_ideas`.
5. Generate `hedge_plan.md` proposing at least 2 hedging or paired trade strategies, including conditions under which they would fail.
6. Generate `sources.md`.
Execution requirements: Conclusions must include "base position + hedge position + trigger thresholds."

View file

@ -1,16 +0,0 @@
Build a "Cross-Asset Tactical Allocation (2026)" project.
Asset universe: SPY, QQQ, IWM, TLT, IEF, HYG, GLD, DBC, BTC-USD.
Historical period: 2021-01-01 to 2025-12-31 (monthly frequency is sufficient).
Requirements:
1. Calculate and compare key metrics: annualized return, volatility, maximum drawdown, Sharpe ratio, and correlation matrix.
2. Design two portfolios:
- Defensive (target: minimize maximum drawdown)
- Offensive (target: higher risk-adjusted returns)
3. Stress test both portfolios under three 2026 scenarios (growth slowdown / inflation resurgence / liquidity easing), and provide rebalancing rules.
4. Generate an Excel file (.xlsx) with sheets: `price_returns`, `risk_metrics`, `corr_matrix`, `portfolio_defensive`, `portfolio_offensive`, `scenario_test`.
5. Generate `allocation_memo.md` explaining why these two portfolios are actionable in 2026.
6. Generate `sources.md`.
Execution requirements: Explicitly state rebalancing frequency, stop-loss rules, and re-entry conditions for each portfolio.

View file

@ -1,13 +0,0 @@
Perform a "REIT Investment Screening in a High-Rate Environment" task.
Sample: VNQ, PLD, AMT, EQIX, O, SPG, PSA, DLR.
Requirements:
1. Compile key metrics from the most recent three complete fiscal years: FFO/AFFO growth, leverage, interest coverage, debt maturity profile, and dividend coverage.
2. Design three 2026 interest rate scenarios (10Y Treasury yield at 3.5% / 4.5% / 5.5%) and analyze valuation pressure and dividend sustainability.
3. Classify each as "hold / watchlist / avoid" and explain the 2-3 most critical driving factors.
4. Generate an Excel file (.xlsx) with sheets: `reit_raw`, `debt_profile`, `rate_scenarios`, `selection_result`.
5. Generate `reit_investment_note.md`.
6. Generate `sources.md`.
Execution requirements: If data is missing, it must be explicitly marked as NA in the tables; silent omission is not allowed.

View file

@ -1,13 +0,0 @@
Perform an "Earnings Quality Forensic Analysis."
Sample: AAPL, MSFT, GOOGL, AMZN, META, NVDA, TSLA, BRK.B, UNH, JPM.
Time range: 2023-01-01 to 2025-12-31.
Requirements:
1. Establish an earnings quality inspection framework covering at minimum: accruals quality, operating cash flow to net income matching, stock-based compensation dilution, buyback-to-debt relationship, and one-time item impact.
2. Assign each company a Red / Yellow / Green rating with traceable supporting evidence.
3. Generate an Excel file (.xlsx) with sheets: `quality_raw`, `forensic_flags`, `rating_summary`, `watchlist_2026`.
4. Generate `forensic_report.md` summarizing the 5 most concerning red flags.
5. Generate `sources.md`.
Execution requirements: The report must clearly distinguish "which conclusions are factual vs. which are inferred."

View file

@ -1,14 +0,0 @@
Perform a "Post-Earnings Announcement Drift (PEAD) Strategy Feasibility Study."
Research period: 2023-01-01 to 2025-12-31.
Sample: Select at least 30 US large/mid-cap stocks (provide selection criteria).
Requirements:
1. Define an executable PEAD signal (e.g., information from the first 1-3 days after the earnings release, an earnings surprise proxy, or a post-announcement momentum proxy) and explain its limitations.
2. Group the sample (high signal / low signal) and analyze performance differences at 1-month and 3-month horizons.
3. Add basic risk controls (position limits, stop-loss, sector exposure limits) and evaluate whether the strategy warrants a small-scale pilot in 2026.
4. Generate an Excel file (.xlsx) with sheets: `universe`, `signal_definition`, `group_performance`, `risk_controls`, `pilot_plan_2026`.
5. Generate `pead_study.md` (covering methodology, results, sources of bias, and implementation recommendations).
6. Generate `sources.md`.
Execution requirements: Must provide "failure scenarios" and objective conditions for "stopping the pilot."

View file

@ -1,12 +0,0 @@
Produce a "Q2 2026 Investment Committee Materials Pack."
Objective: Create meeting-ready investment committee documents for a USD multi-asset portfolio.
Requirements:
1. Output a summary document `committee_pack.md` with at least the following sections: macro outlook, equities, rates, credit, commodities, portfolio risk, and action list.
2. Output an Excel workbook (.xlsx) with at least these sheets: `macro_dashboard`, `equity_watchlist`, `rates_credit`, `commodity_view`, `portfolio_risk`, `action_tracker`.
3. In `action_tracker`, provide actionable items for Q2 2026, each with: trigger condition, target position change, risk control threshold, and review date.
4. Additionally output `devil_advocate.md`, specifically rebutting your own core investment views with at least 5 counter-arguments.
5. Additionally output `sources.md` listing key data sources and dates.
Execution requirements: Plan first, then execute; conclude with a "10-minute oral briefing outline for the investment committee."

View file

@ -1,166 +0,0 @@
#!/usr/bin/env bash
# E2E case runner: configuration and input validation.
#
# Every setting below can be overridden through an environment variable of
# the same name (the provider list is read from PROVIDERS). The script
# re-invokes itself with --worker for each provider/case pair, so this section
# runs in both the orchestrator and every worker.
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
ROOT_DIR="$(cd "${SCRIPT_DIR}/../.." && pwd)"
CASES_DIR="${SCRIPT_DIR}/cases"
TIMESTAMP="${TIMESTAMP:-$(date +%Y%m%d-%H%M%S)}"
OUT_DIR="${OUT_DIR:-${ROOT_DIR}/.context/finance-e2e-runs/${TIMESTAMP}}"
RESULTS_DIR="${OUT_DIR}/results"
MANIFEST="${OUT_DIR}/manifest.tsv"
# Required environment for agent-driven E2E with web_search/data tools.
SMC_DATA_DIR="${SMC_DATA_DIR:-$HOME/.super-multica-e2e}"
MULTICA_API_URL="${MULTICA_API_URL:-https://api-dev.copilothub.ai}"
PROVIDERS_RAW="${PROVIDERS:-kimi-coding claude-code}"
CASE_GLOB="${CASE_GLOB:-case-*.txt}"
CASE_TIMEOUT_SEC="${CASE_TIMEOUT_SEC:-900}"
MAX_PARALLEL="${MAX_PARALLEL:-2}"

# CASE_TIMEOUT_SEC must be a plain non-negative integer; 0 disables the
# timeout. Anything else is rejected up front, because a non-numeric value
# would be misread by the arithmetic comparison in the worker's timeout loop.
if ! [[ "${CASE_TIMEOUT_SEC}" =~ ^[0-9]+$ ]]; then
  echo "CASE_TIMEOUT_SEC must be a non-negative integer (0 disables the timeout), got: ${CASE_TIMEOUT_SEC}" >&2
  exit 1
fi
# Force base 10 so values with leading zeros (e.g. "0900") are not parsed as octal.
CASE_TIMEOUT_SEC="$(( 10#${CASE_TIMEOUT_SEC} ))"
TIMEOUT_ENABLED="true"
if (( CASE_TIMEOUT_SEC == 0 )); then
  TIMEOUT_ENABLED="false"
fi

if ! [[ "${MAX_PARALLEL}" =~ ^[0-9]+$ ]] || (( 10#${MAX_PARALLEL} <= 0 )); then
  echo "MAX_PARALLEL must be a positive integer, got: ${MAX_PARALLEL}" >&2
  exit 1
fi
MAX_PARALLEL="$(( 10#${MAX_PARALLEL} ))"
# Worker mode: `<script> --worker <provider> <case_file>`.
# Runs one case against one provider, enforces the optional timeout, and
# writes a single TSV result row to ${RESULTS_DIR}/<provider>-<case_id>.tsv.
# The worker always exits 0 once that row is written; the outcome is carried
# in the row's status column (success / failed / timeout).
if [[ "${1:-}" == "--worker" ]]; then
  provider="${2:?missing provider}"
  case_file="${3:?missing case file}"
  case_base="$(basename "${case_file}")"
  case_id="${case_base%.txt}"
  log_file="${OUT_DIR}/${provider}-${case_id}.log"
  result_file="${RESULTS_DIR}/${provider}-${case_id}.tsv"
  prompt="$(cat "${case_file}")"
  status="success"
  timed_out="false"
  started_epoch="$(date +%s)"
  started_at="$(date -u +%Y-%m-%dT%H:%M:%SZ)"

  # Run the agent in the background so the loop below can enforce the timeout.
  SMC_DATA_DIR="${SMC_DATA_DIR}" \
    MULTICA_API_URL="${MULTICA_API_URL}" \
    pnpm multica run --run-log --provider "${provider}" "${prompt}" > "${log_file}" 2>&1 &
  cmd_pid=$!

  # Poll every 2s; past the deadline send SIGTERM, then SIGKILL one second later.
  # NOTE(review): only cmd_pid itself is signalled — confirm that no child
  # processes are left running after a timeout.
  while kill -0 "${cmd_pid}" 2>/dev/null; do
    if [[ "${TIMEOUT_ENABLED}" == "true" ]]; then
      now="$(date +%s)"
      elapsed="$((now - started_epoch))"
      if (( elapsed >= CASE_TIMEOUT_SEC )); then
        timed_out="true"
        kill "${cmd_pid}" 2>/dev/null || true
        sleep 1
        kill -9 "${cmd_pid}" 2>/dev/null || true
        break
      fi
    fi
    sleep 2
  done

  exit_code=0
  wait "${cmd_pid}" 2>/dev/null || exit_code=$?
  ended_epoch="$(date +%s)"
  ended_at="$(date -u +%Y-%m-%dT%H:%M:%SZ)"
  duration_sec="$((ended_epoch - started_epoch))"

  # A run counts as success only if it was not timed out, exited 0, and its
  # non-empty log contains a "[session: ...]" marker. Plain grep is used for
  # the marker checks so the worker needs no tool beyond the standard ones; a
  # missing search tool would otherwise look like "marker not found".
  if [[ "${timed_out}" == "true" ]]; then
    status="timeout"
    printf "\n[runner] timed out after %ss\n" "${CASE_TIMEOUT_SEC}" >> "${log_file}"
  elif (( exit_code != 0 )); then
    status="failed"
  elif [[ ! -s "${log_file}" ]]; then
    status="failed"
  elif ! grep -q '\[session: ' "${log_file}"; then
    status="failed"
  fi

  # Take the last marker of each kind; `|| true` leaves the value empty when absent.
  session_id="$(grep -oE '\[session: [^]]+\]' "${log_file}" | tail -n 1 | sed -E 's/\[session: ([^]]+)\]/\1/' || true)"
  session_dir="$(grep -oE '\[session-dir: [^]]+\]' "${log_file}" | tail -n 1 | sed -E 's/\[session-dir: ([^]]+)\]/\1/' || true)"

  # Column order must match the manifest header written by the orchestrator.
  printf "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" \
    "${TIMESTAMP}" \
    "${provider}" \
    "${case_id}" \
    "${status}" \
    "${session_id}" \
    "${session_dir}" \
    "${log_file}" \
    "${started_at}" \
    "${ended_at}" \
    "${duration_sec}" \
    "${exit_code}" > "${result_file}"
  printf "[worker] provider=%s case=%s status=%s duration=%ss session=%s\n" \
    "${provider}" \
    "${case_id}" \
    "${status}" \
    "${duration_sec}" \
    "${session_id:-N/A}"
  exit 0
fi
# Orchestrator mode: fan every provider x case pair out to workers, then merge
# the per-task result rows into the manifest and print a summary.
mkdir -p "${OUT_DIR}"
mkdir -p "${RESULTS_DIR}"
printf "timestamp\tprovider\tcase_id\tstatus\tsession_id\tsession_dir\tlog_file\tstarted_at\tended_at\tduration_sec\texit_code\n" > "${MANIFEST}"

read -r -a PROVIDERS <<< "${PROVIDERS_RAW}"

# Collect the case files in sorted order.
CASE_FILES=()
while IFS= read -r line; do
  CASE_FILES+=("${line}")
done < <(find "${CASES_DIR}" -maxdepth 1 -type f -name "${CASE_GLOB}" | sort)
if [[ ${#CASE_FILES[@]} -eq 0 ]]; then
  echo "No case files matched ${CASE_GLOB} in ${CASES_DIR}" >&2
  exit 1
fi

echo "Output directory: ${OUT_DIR}"
echo "Using SMC_DATA_DIR=${SMC_DATA_DIR}"
echo "Using MULTICA_API_URL=${MULTICA_API_URL}"
echo "Providers: ${PROVIDERS[*]}"
echo "Cases: ${#CASE_FILES[@]}"
echo "Max parallel: ${MAX_PARALLEL}"
if [[ "${TIMEOUT_ENABLED}" == "true" ]]; then
  echo "Case timeout: ${CASE_TIMEOUT_SEC}s"
else
  echo "Case timeout: disabled"
fi

# TASKS is a flat list of (provider, case_file) pairs; xargs -n 2 consumes one
# pair per worker invocation.
TASKS=()
for provider in "${PROVIDERS[@]}"; do
  for case_file in "${CASE_FILES[@]}"; do
    TASKS+=("${provider}" "${case_file}")
  done
done
echo "Total tasks: $(( ${#TASKS[@]} / 2 ))"

export TIMESTAMP OUT_DIR RESULTS_DIR SMC_DATA_DIR MULTICA_API_URL CASE_TIMEOUT_SEC TIMEOUT_ENABLED

# Do not let one abnormal worker exit abort the whole run: record the xargs
# status, still merge whatever result rows were written, and exit with that
# status at the end.
xargs_status=0
printf '%s\0' "${TASKS[@]}" | xargs -0 -n 2 -P "${MAX_PARALLEL}" bash "${BASH_SOURCE[0]}" --worker || xargs_status=$?
if (( xargs_status != 0 )); then
  echo "Warning: one or more workers exited abnormally (xargs status ${xargs_status}); aggregating the results that were written." >&2
fi

RESULT_FILES=()
while IFS= read -r line; do
  RESULT_FILES+=("${line}")
done < <(find "${RESULTS_DIR}" -maxdepth 1 -type f -name "*.tsv" | sort)
if [[ ${#RESULT_FILES[@]} -eq 0 ]]; then
  echo "No result files produced in ${RESULTS_DIR}" >&2
  exit 1
fi
for result_file in "${RESULT_FILES[@]}"; do
  cat "${result_file}" >> "${MANIFEST}"
done

# Column 4 of the manifest is the status; NR>1 skips the header row.
success_count="$(awk -F '\t' 'NR>1 && $4=="success" {c++} END{print c+0}' "${MANIFEST}")"
failed_count="$(awk -F '\t' 'NR>1 && $4=="failed" {c++} END{print c+0}' "${MANIFEST}")"
timeout_count="$(awk -F '\t' 'NR>1 && $4=="timeout" {c++} END{print c+0}' "${MANIFEST}")"
echo
echo "Completed. Manifest: ${MANIFEST}"
echo "Summary: success=${success_count} failed=${failed_count} timeout=${timeout_count}"
exit "${xargs_status}"

View file

@ -1,441 +0,0 @@
#!/usr/bin/env node
import { existsSync, readFileSync, writeFileSync } from "node:fs";
import { dirname, join, resolve } from "node:path";
/**
* @typedef {{
* id: string;
* check: string;
* passed: boolean;
* detail?: string;
* }} CheckResult
*/
/**
* @typedef {{
* provider: string;
* caseId: string;
* status: string;
* sessionId: string;
* sessionDir: string;
* logFile: string;
* checks: CheckResult[];
* pass: boolean;
* }} CaseAnalysis
*/
// First CLI argument: path to the manifest TSV. With no argument, or with a
// help flag, print the usage line and exit successfully.
const manifestArg = process.argv[2];
const wantsUsage = !manifestArg || ["--help", "-h"].includes(manifestArg);
if (wantsUsage) {
  console.log("Usage: node scripts/e2e-skills-benchmark/analyze.mjs <manifest.tsv>");
  process.exit(0);
}

// Resolve to an absolute path and stop early if the file is missing.
const manifestPath = resolve(manifestArg);
const manifestPresent = existsSync(manifestPath);
if (!manifestPresent) {
  console.error(`Manifest not found: ${manifestPath}`);
  process.exit(1);
}
/**
 * Per-case expectations, keyed by case id. The manifest row loop looks an
 * entry up with the row's case_id column (`CASE_RULES[caseId]`).
 *
 * Fields used in this table:
 * - requiredCommandTokens / forbiddenCommandTokens: lists of token groups.
 * - requiredEventTokens: list of token groups.
 * - requiredResponseRegex: list of regular-expression source strings.
 * - requireExecUsage: boolean flag.
 *
 * NOTE(review): the code that consumes these fields is outside this view.
 * Presumably each token group is matched with commandHasTokens (all tokens
 * present, case-insensitive) and each regex with textMatchesPattern against
 * the final response text — confirm against the consumer.
 */
const CASE_RULES = {
"case-01-install-caldav-calendar": {
requiredCommandTokens: [
["clawhub", "search"],
["caldav"],
["clawhub", "install"],
["review-skill-security.mjs"],
],
},
"case-02-gap-discovery-homeassistant": {
requiredCommandTokens: [
["clawhub", "search"],
["home", "assistant"],
["clawhub", "install"],
["review-skill-security.mjs"],
],
},
"case-03-install-update-codexmonitor": {
requiredCommandTokens: [
["clawhub", "search"],
["codexmonitor"],
["clawhub", "install"],
["clawhub", "update"],
["review-skill-security.mjs"],
],
},
"case-04-gap-discovery-spotify-ux": {
requireExecUsage: false,
// The patterns accept Chinese or English wording:
// 缺少 = "missing", 没有 = "does not have", 技能 = "skill", 能力 = "capability",
// 集成 = "integration", 安装 = "install", 是否 / 要不要 = "whether / do you want to",
// 安全 = "security", 审查 = "review".
requiredResponseRegex: [
"缺少|没有.*(技能|能力|集成)|capability gap",
"clawhub|cloud\\s*hub|cloudhub",
"安装|install",
"是否|要不要|would you like|do you want",
"安全|审查|security|review",
],
forbiddenCommandTokens: [
["clawhub", "install"],
["clawhub", "update"],
["osascript"],
["spogo"],
["spotify_player"],
["ha.sh"],
["/api/states"],
],
},
"case-05-gap-discovery-notion-ux": {
requireExecUsage: false,
requiredCommandTokens: [
["clawhub", "search"],
["notion"],
],
requiredEventTokens: [
["install_guard", "blocked"],
],
// 安装 = "install", 是否 / 要不要 = "whether / do you want to", 同意 = "agree",
// 授权 = "authorization".
requiredResponseRegex: [
"notion",
"安装|install",
"是否|要不要|would you like|do you want|同意",
"token|授权|integration",
],
forbiddenCommandTokens: [
["osascript"],
["spogo"],
["spotify_player"],
["ha.sh"],
["/api/states"],
],
},
};
/**
 * Break a text into lines (LF or CRLF terminated) and drop the empty ones.
 *
 * @param {string} text
 * @returns {string[]} the non-empty lines, in their original order
 */
function splitLines(text) {
  const kept = [];
  for (const piece of text.split(/\r?\n/)) {
    if (piece) {
      kept.push(piece);
    }
  }
  return kept;
}
/**
 * Case-insensitive check that a command contains every one of the tokens as
 * a substring. An empty token list always matches.
 *
 * @param {string} command
 * @param {string[]} tokens
 * @returns {boolean}
 */
function commandHasTokens(command, tokens) {
  const haystack = command.toLowerCase();
  for (const needle of tokens) {
    if (!haystack.includes(needle.toLowerCase())) {
      return false;
    }
  }
  return true;
}
/**
 * Pull the shell command out of a tool call's raw argument string.
 *
 * When rawArgs is JSON with a string `command` property, that property is
 * returned. In every other case (not JSON, truncated JSON, no string
 * `command`) the raw string is returned unchanged. Empty input gives "".
 *
 * @param {string} rawArgs
 * @returns {string}
 */
function extractCommand(rawArgs) {
  if (!rawArgs) return "";
  let decoded;
  try {
    decoded = JSON.parse(rawArgs);
  } catch {
    // Args may be truncated JSON in the run-log; use the raw text as-is.
    return rawArgs;
  }
  const hasCommand = Boolean(decoded) && typeof decoded.command === "string";
  return hasCommand ? decoded.command : rawArgs;
}
/**
 * Test text against a regular-expression source string, ignoring case.
 * A pattern that fails to compile counts as "no match" instead of throwing.
 *
 * @param {string} text
 * @param {string} pattern
 * @returns {boolean}
 */
function textMatchesPattern(text, pattern) {
  let matched = false;
  try {
    const matcher = new RegExp(pattern, "i");
    matched = matcher.test(text);
  } catch {
    matched = false;
  }
  return matched;
}
/**
 * Read a run-log JSONL file and return its events.
 *
 * Each non-empty line is parsed as JSON. Lines that are not valid JSON are
 * skipped, and so are lines whose JSON value is not an object (for example
 * `null` or a bare number): callers read properties such as `event` on every
 * returned element, which would throw on `null`.
 *
 * @param {string} runLogPath
 * @returns {object[]} parsed events in file order
 */
function parseRunLog(runLogPath) {
  const lines = splitLines(readFileSync(runLogPath, "utf-8"));
  const events = [];
  for (const line of lines) {
    let parsed;
    try {
      parsed = JSON.parse(line);
    } catch {
      // Ignore malformed lines but keep analysis alive.
      continue;
    }
    if (parsed !== null && typeof parsed === "object") {
      events.push(parsed);
    }
  }
  return events;
}
/**
 * Return the text of the last assistant message in a session JSONL file.
 *
 * Only entries with `type === "message"` and `message.role === "assistant"`
 * are considered. String content is taken as-is. Array content is reduced to
 * its text parts joined with newlines and trimmed, and is used only when the
 * result is non-empty. A missing file, or a file with no assistant message,
 * gives "".
 *
 * @param {string} sessionPath
 * @returns {string}
 */
function parseFinalAssistantText(sessionPath) {
  if (!existsSync(sessionPath)) return "";
  let finalText = "";
  const records = splitLines(readFileSync(sessionPath, "utf-8"));
  records.forEach((record) => {
    try {
      const entry = JSON.parse(record);
      const message = entry?.type === "message" ? entry.message : undefined;
      if (!message || message.role !== "assistant") return;

      const { content } = message;
      if (typeof content === "string") {
        finalText = content;
        return;
      }
      if (!Array.isArray(content)) return;

      const pieces = [];
      for (const part of content) {
        if (part && part.type === "text" && typeof part.text === "string") {
          pieces.push(part.text);
        }
      }
      const joined = pieces.join("\n").trim();
      if (joined) finalText = joined;
    } catch {
      // Ignore malformed lines.
    }
  });
  return finalText;
}
/**
 * Append one check result to an analysis (mutates `analysis.checks`).
 *
 * @param {CaseAnalysis} analysis
 * @param {string} id
 * @param {string} check
 * @param {boolean} passed
 * @param {string} [detail]
 */
function addCheck(analysis, id, check, passed, detail) {
  const result = { id, check, passed, detail };
  analysis.checks.push(result);
}
// Load the manifest: the first row is the header, so at least two non-empty
// rows are needed for there to be any data.
const rows = splitLines(readFileSync(manifestPath, "utf-8"));
if (rows.length < 2) {
  console.error(`Manifest has no data rows: ${manifestPath}`);
  process.exit(1);
}
// One analysis record is collected per manifest data row.
/** @type {CaseAnalysis[]} */
const analyses = [];
for (let i = 1; i < rows.length; i++) {
const row = rows[i];
if (!row) continue;
const cols = row.split("\t");
if (cols.length < 11) continue;
const provider = cols[1] ?? "";
const caseId = cols[2] ?? "";
const rules = CASE_RULES[caseId];
const status = cols[3] ?? "";
const sessionId = cols[4] ?? "";
const sessionDir = cols[5] ?? "";
const logFile = cols[6] ?? "";
/** @type {CaseAnalysis} */
const analysis = {
provider,
caseId,
status,
sessionId,
sessionDir,
logFile,
checks: [],
pass: false,
};
addCheck(
analysis,
"run-status",
"runner status is success",
status === "success",
`status=${status}`,
);
if (!sessionDir) {
addCheck(analysis, "session-dir", "session_dir exists in manifest", false, "missing session_dir");
analyses.push(analysis);
continue;
}
const runLogPath = join(sessionDir, "run-log.jsonl");
addCheck(
analysis,
"run-log-file",
"run-log.jsonl exists",
existsSync(runLogPath),
runLogPath,
);
if (!existsSync(runLogPath)) {
analyses.push(analysis);
continue;
}
const events = parseRunLog(runLogPath);
const sessionPath = join(sessionDir, "session.jsonl");
const finalAssistantText = parseFinalAssistantText(sessionPath);
const runStarts = events.filter((e) => e.event === "run_start");
const runEnds = events.filter((e) => e.event === "run_end");
const toolStarts = events.filter((e) => e.event === "tool_start");
const toolEnds = events.filter((e) => e.event === "tool_end");
const errorToolEnds = toolEnds.filter((e) => e.is_error === true);
addCheck(analysis, "event-run-start", "has run_start", runStarts.length > 0, `count=${runStarts.length}`);
addCheck(analysis, "event-run-end", "has run_end", runEnds.length > 0, `count=${runEnds.length}`);
addCheck(
analysis,
"tool-pairing",
"tool_start count matches tool_end count",
toolStarts.length === toolEnds.length,
`start=${toolStarts.length} end=${toolEnds.length}`,
);
const finalRunEnd = runEnds.at(-1);
const runEndError = finalRunEnd?.error;
const finalRunText = typeof finalRunEnd?.text === "string" ? finalRunEnd.text : "";
const finalResponseText = finalAssistantText || finalRunText;
addCheck(
analysis,
"run-end-error",
"final run_end.error is null/empty",
runEndError === null || runEndError === undefined || runEndError === "",
`error=${String(runEndError)}`,
);
addCheck(
analysis,
"tool-errors",
"no tool_end has is_error=true",
errorToolEnds.length === 0,
`error_tool_calls=${errorToolEnds.length}`,
);
const execCommands = toolStarts
.filter((e) => e.tool === "exec")
.map((e) => extractCommand(typeof e.args === "string" ? e.args : ""))
.filter(Boolean);
const requireExecUsage = rules?.requireExecUsage !== false;
addCheck(
analysis,
"exec-usage",
requireExecUsage
? "at least one exec command was used"
: "exec usage is optional for this case",
requireExecUsage ? execCommands.length > 0 : true,
requireExecUsage ? `exec_calls=${execCommands.length}` : `exec_calls=${execCommands.length} (optional)`,
);
if (rules) {
if (Array.isArray(rules.requiredCommandTokens)) {
for (let r = 0; r < rules.requiredCommandTokens.length; r++) {
const tokenList = rules.requiredCommandTokens[r];
const passed = execCommands.some((cmd) => commandHasTokens(cmd, tokenList));
addCheck(
analysis,
`cmd-${r + 1}`,
`exec command contains tokens: ${tokenList.join(" + ")}`,
passed,
);
}
}
if (Array.isArray(rules.requiredEventTokens)) {
const eventLines = events.map((event) => JSON.stringify(event).toLowerCase());
for (let r = 0; r < rules.requiredEventTokens.length; r++) {
const tokenList = rules.requiredEventTokens[r];
const passed = eventLines.some((line) =>
tokenList.every((token) => line.includes(token.toLowerCase())),
);
addCheck(
analysis,
`event-${r + 1}`,
`event log contains tokens: ${tokenList.join(" + ")}`,
passed,
);
}
}
if (Array.isArray(rules.forbiddenCommandTokens)) {
for (let r = 0; r < rules.forbiddenCommandTokens.length; r++) {
const tokenList = rules.forbiddenCommandTokens[r];
const passed = !execCommands.some((cmd) => commandHasTokens(cmd, tokenList));
addCheck(
analysis,
`forbid-cmd-${r + 1}`,
`exec command does not contain tokens: ${tokenList.join(" + ")}`,
passed,
);
}
}
if (Array.isArray(rules.requiredResponseRegex)) {
for (let r = 0; r < rules.requiredResponseRegex.length; r++) {
const pattern = rules.requiredResponseRegex[r];
const passed = textMatchesPattern(finalResponseText, pattern);
addCheck(
analysis,
`resp-${r + 1}`,
`final response matches regex: /${pattern}/i`,
passed,
);
}
}
} else {
addCheck(
analysis,
"case-rules",
"case has rule set",
false,
`No rules defined for case_id=${caseId}`,
);
}
analysis.pass = analysis.checks.every((c) => c.passed);
analyses.push(analysis);
}
// Tally results, persist them as analysis.json beside the manifest, then print a
// per-case report and exit non-zero when any case failed.
const passedCases = analyses.reduce((count, a) => (a.pass ? count + 1 : count), 0);
const failedCases = analyses.length - passedCases;
const outputPath = join(dirname(manifestPath), "analysis.json");
const output = {
  manifestPath,
  totalCases: analyses.length,
  passedCases,
  failedCases,
  results: analyses,
};
writeFileSync(outputPath, JSON.stringify(output, null, 2) + "\n", "utf-8");
analyses.forEach((item) => {
  const verdict = item.pass ? "PASS" : "FAIL";
  console.log(`[${verdict}] provider=${item.provider} case=${item.caseId} session=${item.sessionId || "N/A"}`);
  item.checks.forEach((check) => {
    const suffix = check.detail ? ` (${check.detail})` : "";
    const prefix = check.passed ? " [ok] " : " [bad] ";
    console.log(`${prefix}${check.check}${suffix}`);
  });
});
console.log("");
console.log(`Analysis file: ${outputPath}`);
console.log(`Summary: pass=${passedCases} fail=${failedCases}`);
if (failedCases > 0) {
  process.exit(1);
}

View file

@ -1,15 +0,0 @@
Run an end-to-end test for the Meta Skill Installer.
Goal: install a real ClawHub skill for CalDAV calendar capability.
Reference page: https://clawhub.ai/skills/caldav-calendar
Follow this exact workflow:
1. State the missing capability in one sentence.
2. Search ClawHub for CalDAV-related skills and choose the best candidate.
3. Stage-install to a temporary directory first (never install directly to active skills path).
4. Run security review on the staged skill:
DATA_DIR="${SMC_DATA_DIR:-$HOME/.super-multica}"
node "$DATA_DIR/skills/meta-skill-installer/scripts/review-skill-security.mjs" "<staged skill path>"
5. If riskLevel is safe, install to "$DATA_DIR/skills".
6. Verify final install by checking "$DATA_DIR/skills/<slug>/SKILL.md" exists.
7. Return a short report: selected slug, riskLevel, final install path.

View file

@ -1,16 +0,0 @@
Run an end-to-end capability-gap discovery test for Meta Skill Installer.
User intent: "I need to control Home Assistant lights and switches from the agent."
Reference page: https://clawhub.ai/skills/homeassistant
Requirements:
1. Treat this as a missing capability and explicitly define the gap.
2. Search ClawHub for relevant skills and list the top 3 candidates.
3. Pick one candidate with rationale (scope match + lower security risk).
4. Stage-install to a temporary directory.
5. Run security review:
DATA_DIR="${SMC_DATA_DIR:-$HOME/.super-multica}"
node "$DATA_DIR/skills/meta-skill-installer/scripts/review-skill-security.mjs" "<staged skill path>"
6. If riskLevel is safe, install to "$DATA_DIR/skills".
7. Verify "$DATA_DIR/skills/<slug>/SKILL.md" exists.
8. Return: candidate list, chosen slug, riskLevel, and final path.

View file

@ -1,16 +0,0 @@
Run an end-to-end install+update regression test for Meta Skill Installer.
Goal: use a real ClawHub skill and verify install, review, and update flow.
Reference page: https://clawhub.ai/odrobnik/codexmonitor
Requirements:
1. Search ClawHub for CodexMonitor and select the matching skill slug.
2. Stage-install to a temporary directory and run security review:
DATA_DIR="${SMC_DATA_DIR:-$HOME/.super-multica}"
node "$DATA_DIR/skills/meta-skill-installer/scripts/review-skill-security.mjs" "<staged skill path>"
3. If riskLevel is safe, install to "$DATA_DIR/skills".
4. Verify "$DATA_DIR/skills/<slug>/SKILL.md" exists.
5. Run an update for the same slug in managed dir:
clawhub update "<slug>" --workdir "$DATA_DIR" --dir skills --force
6. Run security review again on the final installed path.
7. Return: slug, initial riskLevel, update executed (yes/no), final path.

View file

@ -1 +0,0 @@
随机播放 spotify 中的音乐

View file

@ -1 +0,0 @@
帮我在 Notion 新建一个页面,标题是今天待办,并写入三条任务:修复登录 bug、写周报、安排评审

View file

@ -1,170 +0,0 @@
#!/usr/bin/env bash
# Skills E2E runner: shared configuration.
#
# This section runs in both orchestrator and --worker mode, so every setting
# must resolve to the same value from the exported environment.
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
ROOT_DIR="$(cd "${SCRIPT_DIR}/../.." && pwd)"
CASES_DIR="${SCRIPT_DIR}/cases"
TIMESTAMP="${TIMESTAMP:-$(date +%Y%m%d-%H%M%S)}"
OUT_DIR="${OUT_DIR:-${ROOT_DIR}/.context/skills-e2e-runs/${TIMESTAMP}}"
RESULTS_DIR="${OUT_DIR}/results"
MANIFEST="${OUT_DIR}/manifest.tsv"

# Required environment for agent-driven E2E.
SMC_DATA_DIR="${SMC_DATA_DIR:-$HOME/.super-multica-e2e}"
MULTICA_API_URL="${MULTICA_API_URL:-https://api-dev.copilothub.ai}"
PROVIDERS_RAW="${PROVIDERS:-kimi-coding}"
CASE_GLOB="${CASE_GLOB:-case-*.txt}"
CASE_TIMEOUT_SEC="${CASE_TIMEOUT_SEC:-1200}"
MAX_PARALLEL="${MAX_PARALLEL:-1}"

# CASE_TIMEOUT_SEC must be a plain non-negative integer; 0 disables the timeout.
# Rejecting anything else here keeps a typo (e.g. "20m" or "-1") from reaching
# the worker's arithmetic comparison, where it would make every case time out
# immediately.
if ! [[ "${CASE_TIMEOUT_SEC}" =~ ^[0-9]+$ ]]; then
  echo "CASE_TIMEOUT_SEC must be a non-negative integer (0 disables the timeout), got: ${CASE_TIMEOUT_SEC}" >&2
  exit 1
fi
# Force base 10 so zero-padded values such as "0600" are not read as octal.
CASE_TIMEOUT_SEC="$((10#${CASE_TIMEOUT_SEC}))"
TIMEOUT_ENABLED="true"
if (( CASE_TIMEOUT_SEC == 0 )); then
  TIMEOUT_ENABLED="false"
fi

# The arithmetic test only runs after the regex has accepted the value.
if ! [[ "${MAX_PARALLEL}" =~ ^[0-9]+$ ]] || (( 10#${MAX_PARALLEL} <= 0 )); then
  echo "MAX_PARALLEL must be a positive integer, got: ${MAX_PARALLEL}" >&2
  exit 1
fi
# Worker mode: `<script> --worker <provider> <case_file>`.
# Runs one case against one provider, enforces the optional timeout, and writes
# a single-row TSV result that the orchestrator later appends to the manifest.
if [[ "${1:-}" == "--worker" ]]; then
provider="${2:?missing provider}"
case_file="${3:?missing case file}"
case_base="$(basename "${case_file}")"
# The case id is the file name without its .txt suffix.
case_id="${case_base%.txt}"
log_file="${OUT_DIR}/${provider}-${case_id}.log"
result_file="${RESULTS_DIR}/${provider}-${case_id}.tsv"
# The whole case file is passed to the agent as the prompt.
prompt="$(cat "${case_file}")"
status="success"
timed_out="false"
started_epoch="$(date +%s)"
started_at="$(date -u +%Y-%m-%dT%H:%M:%SZ)"
# Start the agent in the background so the loop below can watch the clock;
# stdout and stderr both go to the per-case log file.
SMC_DATA_DIR="${SMC_DATA_DIR}" \
MULTICA_API_URL="${MULTICA_API_URL}" \
pnpm multica run --run-log --provider "${provider}" "${prompt}" > "${log_file}" 2>&1 &
cmd_pid=$!
# Poll every 2 seconds while the process is alive. On timeout send SIGTERM, wait
# one second, then SIGKILL.
# NOTE(review): only the pnpm PID is signalled; child processes it spawned may
# survive the kill -- confirm whether that matters for this runner.
while kill -0 "${cmd_pid}" 2>/dev/null; do
if [[ "${TIMEOUT_ENABLED}" == "true" ]]; then
now="$(date +%s)"
elapsed="$((now - started_epoch))"
if (( elapsed >= CASE_TIMEOUT_SEC )); then
timed_out="true"
kill "${cmd_pid}" 2>/dev/null || true
sleep 1
kill -9 "${cmd_pid}" 2>/dev/null || true
break
fi
fi
sleep 2
done
# Collect the exit status without letting `set -e` abort the worker on failure.
exit_code=0
wait "${cmd_pid}" 2>/dev/null || exit_code=$?
ended_epoch="$(date +%s)"
ended_at="$(date -u +%Y-%m-%dT%H:%M:%SZ)"
duration_sec="$((ended_epoch - started_epoch))"
# Classify the run: timeout wins; otherwise a non-zero exit, an empty log, or a
# log without a "[session: ...]" marker all count as failed.
if [[ "${timed_out}" == "true" ]]; then
status="timeout"
printf "\n[runner] timed out after %ss\n" "${CASE_TIMEOUT_SEC}" >> "${log_file}"
elif (( exit_code != 0 )); then
status="failed"
elif [[ ! -s "${log_file}" ]]; then
status="failed"
elif ! rg -q "\[session: " "${log_file}"; then
status="failed"
fi
# Take the last "[session: ...]" / "[session-dir: ...]" markers from the log;
# either value is left empty when the marker is absent.
session_id="$(rg -o "\[session: [^]]+\]" "${log_file}" | tail -n 1 | sed -E 's/\[session: ([^]]+)\]/\1/' || true)"
session_dir="$(rg -o "\[session-dir: [^]]+\]" "${log_file}" | tail -n 1 | sed -E 's/\[session-dir: ([^]]+)\]/\1/' || true)"
# One TSV row with 11 columns: timestamp, provider, case_id, status, session_id,
# session_dir, log_file, started_at, ended_at, duration_sec, exit_code.
printf "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" \
"${TIMESTAMP}" \
"${provider}" \
"${case_id}" \
"${status}" \
"${session_id}" \
"${session_dir}" \
"${log_file}" \
"${started_at}" \
"${ended_at}" \
"${duration_sec}" \
"${exit_code}" > "${result_file}"
printf "[worker] provider=%s case=%s status=%s duration=%ss session=%s\n" \
"${provider}" \
"${case_id}" \
"${status}" \
"${duration_sec}" \
"${session_id:-N/A}"
# The worker always exits 0: the outcome travels in the result file, not in the
# exit status.
exit 0
fi
# Orchestrator mode: prepare output directories and start the manifest with its
# header row.
mkdir -p "${OUT_DIR}"
mkdir -p "${RESULTS_DIR}"
printf "timestamp\tprovider\tcase_id\tstatus\tsession_id\tsession_dir\tlog_file\tstarted_at\tended_at\tduration_sec\texit_code\n" > "${MANIFEST}"
# PROVIDERS is a whitespace-separated list.
read -r -a PROVIDERS <<< "${PROVIDERS_RAW}"
# Collect the case files matching CASE_GLOB directly under CASES_DIR, sorted.
CASE_FILES=()
while IFS= read -r line; do
CASE_FILES+=("${line}")
done < <(find "${CASES_DIR}" -maxdepth 1 -type f -name "${CASE_GLOB}" | sort)
if [[ ${#CASE_FILES[@]} -eq 0 ]]; then
echo "No case files matched ${CASE_GLOB} in ${CASES_DIR}" >&2
exit 1
fi
echo "Output directory: ${OUT_DIR}"
echo "Using SMC_DATA_DIR=${SMC_DATA_DIR}"
echo "Using MULTICA_API_URL=${MULTICA_API_URL}"
echo "Providers: ${PROVIDERS[*]}"
echo "Cases: ${#CASE_FILES[@]}"
echo "Max parallel: ${MAX_PARALLEL}"
if [[ "${TIMEOUT_ENABLED}" == "true" ]]; then
echo "Case timeout: ${CASE_TIMEOUT_SEC}s"
else
echo "Case timeout: disabled"
fi
# TASKS is a flat list of (provider, case_file) pairs covering every combination.
TASKS=()
for provider in "${PROVIDERS[@]}"; do
for case_file in "${CASE_FILES[@]}"; do
TASKS+=("${provider}" "${case_file}")
done
done
echo "Total tasks: $(( ${#TASKS[@]} / 2 ))"
# Workers are separate bash processes, so the shared settings are exported.
export TIMESTAMP OUT_DIR RESULTS_DIR SMC_DATA_DIR MULTICA_API_URL CASE_TIMEOUT_SEC TIMEOUT_ENABLED
# Feed the pairs NUL-delimited to xargs, which re-invokes this script in
# --worker mode with two arguments per call, up to MAX_PARALLEL at a time.
printf '%s\0' "${TASKS[@]}" | xargs -0 -n 2 -P "${MAX_PARALLEL}" bash "${BASH_SOURCE[0]}" --worker
# Merge the per-task result rows into the manifest in sorted file-name order.
RESULT_FILES=()
while IFS= read -r line; do
RESULT_FILES+=("${line}")
done < <(find "${RESULTS_DIR}" -maxdepth 1 -type f -name "*.tsv" | sort)
if [[ ${#RESULT_FILES[@]} -eq 0 ]]; then
echo "No result files produced in ${RESULTS_DIR}" >&2
exit 1
fi
for result_file in "${RESULT_FILES[@]}"; do
cat "${result_file}" >> "${MANIFEST}"
done
# Count outcomes from the status column (4), skipping the header row.
success_count="$(awk -F '\t' 'NR>1 && $4=="success" {c++} END{print c+0}' "${MANIFEST}")"
failed_count="$(awk -F '\t' 'NR>1 && $4=="failed" {c++} END{print c+0}' "${MANIFEST}")"
timeout_count="$(awk -F '\t' 'NR>1 && $4=="timeout" {c++} END{print c+0}' "${MANIFEST}")"
echo
echo "Completed run stage. Manifest: ${MANIFEST}"
echo "Run summary: success=${success_count} failed=${failed_count} timeout=${timeout_count}"
echo
echo "Running structured analysis..."
# The analyzer's output is shown and also saved to analysis.txt. pipefail is on,
# so a non-zero analyzer exit still fails this script despite the tee.
node "${SCRIPT_DIR}/analyze.mjs" "${MANIFEST}" | tee "${OUT_DIR}/analysis.txt"

View file

@ -1,499 +0,0 @@
#!/usr/bin/env bash
# Generate a standalone HTML code-statistics report for this repository.
#
# Usage: <script> [output.html]
# The output path defaults to docs/code-stats-report.html under the repo root.
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
ROOT_DIR="$(cd "$SCRIPT_DIR/.." && pwd)"
OUT_FILE="${1:-$ROOT_DIR/docs/code-stats-report.html}"

# Intermediate TSV files live in a temp dir that is removed on any exit.
TMP_DIR="$(mktemp -d)"
cleanup() {
  rm -rf "$TMP_DIR"
}
trap cleanup EXIT

cd "$ROOT_DIR"
if ! git rev-parse --is-inside-work-tree >/dev/null 2>&1; then
  # Diagnostics go to stderr so they never mix with captured stdout.
  echo "Error: this script must run inside a git repository." >&2
  exit 1
fi
# 1) Snapshot LOC from tracked files.
# `wc -l` is run on every tracked path that exists as a regular file; each output
# line is "<line count> <path>".
while IFS= read -r -d '' file; do
if [ -f "$file" ]; then
wc -l "$file"
fi
done < <(git ls-files -z) > "$TMP_DIR/wc_all.txt"
# Aggregate by extension (text after the last "." of the base name, or "[noext]"
# when the name has no dot) into loc_by_ext.tsv: ext, file count, line count.
# loc_totals.tsv holds key/value totals overall and for three extension groups:
# source (ts|tsx|js|jsx|mjs|cjs|py|css|scss|html|sh), docs (md) and
# config (json|json5|yaml|yml|xsd).
awk -v out_by_ext="$TMP_DIR/loc_by_ext.tsv" -v out_totals="$TMP_DIR/loc_totals.tsv" '
{
lines = $1
$1 = ""
sub(/^ +/, "")
file = $0
n = split(file, parts, "/")
base = parts[n]
ext = base
if (index(base, ".") > 0) {
sub(/.*\./, "", ext)
} else {
ext = "[noext]"
}
ext_lines[ext] += lines
ext_files[ext] += 1
files += 1
lines_all += lines
}
END {
for (e in ext_lines) {
printf "%s\t%d\t%d\n", e, ext_files[e], ext_lines[e] > out_by_ext
}
source_lines = 0
source_files = 0
doc_lines = 0
doc_files = 0
cfg_lines = 0
cfg_files = 0
for (e in ext_lines) {
if (e ~ /^(ts|tsx|js|jsx|mjs|cjs|py|css|scss|html|sh)$/) {
source_lines += ext_lines[e]
source_files += ext_files[e]
}
if (e == "md") {
doc_lines += ext_lines[e]
doc_files += ext_files[e]
}
if (e ~ /^(json|json5|yaml|yml|xsd)$/) {
cfg_lines += ext_lines[e]
cfg_files += ext_files[e]
}
}
printf "files\t%d\nlines\t%d\nsource_files\t%d\nsource_lines\t%d\ndoc_files\t%d\ndoc_lines\t%d\nconfig_files\t%d\nconfig_lines\t%d\n", files, lines_all, source_files, source_lines, doc_files, doc_lines, cfg_files, cfg_lines > out_totals
}
' "$TMP_DIR/wc_all.txt"
# 2) Contribution by author (email-normalized).
# Each commit contributes a "@@@name|email" header line followed by its numstat
# rows. Authors are keyed by email, and the first name seen for an email becomes
# its display label "name <email>". Numstat rows whose counts are not plain
# integers are skipped (presumably the "-" markers git prints for binary files).
# Output columns: label, commits, additions, deletions, net.
git log --all --no-merges --numstat --format='@@@%aN|%aE' | awk -v out="$TMP_DIR/author_by_email.tsv" '
BEGIN { FS = "\t" }
/^@@@/ {
split(substr($0, 4), h, /\|/)
name = h[1]
email = h[2]
id = email
if (!(id in display)) {
display[id] = name " <" email ">"
}
commits[id] += 1
next
}
NF == 3 && $1 ~ /^[0-9]+$/ && $2 ~ /^[0-9]+$/ {
adds[id] += $1
dels[id] += $2
}
END {
for (k in commits) {
printf "%s\t%d\t%d\t%d\t%d\n", display[k], commits[k], adds[k] + 0, dels[k] + 0, (adds[k] - dels[k]) + 0 > out
}
}
'
# Order authors by additions, largest first.
sort -t $'\t' -k3,3nr "$TMP_DIR/author_by_email.tsv" > "$TMP_DIR/author_by_email.sorted.tsv"
# "Human" view: drop rows whose label matches checkpointer@noreply or
# dependabot[bot], then append each remaining author's share of total additions
# and total commits as percentages.
awk -F '\t' -v out="$TMP_DIR/author_human_share.tsv" '
$1 !~ /checkpointer@noreply|dependabot\[bot\]/ {
total_commits += $2
total_adds += $3
rows[++n] = $0
}
END {
for (i = 1; i <= n; i++) {
split(rows[i], f, "\t")
add_pct = (total_adds > 0) ? (f[3] / total_adds * 100) : 0
commit_pct = (total_commits > 0) ? (f[2] / total_commits * 100) : 0
printf "%s\t%d\t%d\t%d\t%d\t%.2f%%\t%.2f%%\n", f[1], f[2], f[3], f[4], f[5], add_pct, commit_pct > out
}
}
' "$TMP_DIR/author_by_email.sorted.tsv"
# 3) Contribution by author/day/hour.
# The commit header is "@@@email|YYYY-MM-DD|HH" (author date, formatted by
# --date=format). Commits, additions and deletions are summed per
# (email, day, hour). Output columns: email, day, hour, commits, adds, dels, net.
git log --all --no-merges --numstat --date=format:'%Y-%m-%d|%H' --format='@@@%aE|%ad' | awk -v out="$TMP_DIR/author_day_hour_summary.tsv" '
BEGIN { FS = "\t" }
/^@@@/ {
split(substr($0, 4), h, /\|/)
email = h[1]
day = h[2]
hour = h[3]
key = email "\t" day "\t" hour
commits[key] += 1
next
}
NF == 3 && $1 ~ /^[0-9]+$/ && $2 ~ /^[0-9]+$/ {
adds[key] += $1
dels[key] += $2
}
END {
for (k in commits) {
split(k, f, "\t")
a = adds[k] + 0
d = dels[k] + 0
printf "%s\t%s\t%s\t%d\t%d\t%d\t%d\n", f[1], f[2], f[3], commits[k], a, d, (a - d) > out
}
}
'
# Per-day totals for human authors, plus the earliest and latest hour that saw
# a commit that day. Columns: day, commits, adds, dels, net, min hour, max hour.
awk -F '\t' -v out="$TMP_DIR/day_summary_human.tsv" '
$1 !~ /checkpointer@noreply|dependabot\[bot\]/ {
day = $2
commits[day] += $4
adds[day] += $5
dels[day] += $6
if (!(day in min_hour) || $3 < min_hour[day]) {
min_hour[day] = $3
}
if (!(day in max_hour) || $3 > max_hour[day]) {
max_hour[day] = $3
}
}
END {
for (d in commits) {
printf "%s\t%d\t%d\t%d\t%d\t%s\t%s\n", d, commits[d], adds[d], dels[d], adds[d] - dels[d], min_hour[d], max_hour[d] > out
}
}
' "$TMP_DIR/author_day_hour_summary.tsv"
sort -t $'\t' -k1,1 "$TMP_DIR/day_summary_human.tsv" -o "$TMP_DIR/day_summary_human.tsv"
# Per-hour totals for human authors across all days. All 24 hours are emitted,
# including hours with no activity. Columns: hour, commits, adds, dels, net.
awk -F '\t' -v out="$TMP_DIR/hour_summary_human.tsv" '
$1 !~ /checkpointer@noreply|dependabot\[bot\]/ {
hour = $3
commits[hour] += $4
adds[hour] += $5
dels[hour] += $6
}
END {
for (i = 0; i < 24; i++) {
h = sprintf("%02d", i)
a = adds[h] + 0
d = dels[h] + 0
printf "%s\t%d\t%d\t%d\t%d\n", h, commits[h] + 0, a, d, a - d > out
}
}
' "$TMP_DIR/author_day_hour_summary.tsv"
sort -t $'\t' -k1,1 "$TMP_DIR/hour_summary_human.tsv" -o "$TMP_DIR/hour_summary_human.tsv"
# For each day, the hour with the most additions by human authors.
# Columns: day, peak hour, commits in that hour, additions, deletions.
awk -F '\t' -v out="$TMP_DIR/day_peak_hour_human.tsv" '
$1 !~ /checkpointer@noreply|dependabot\[bot\]/ {
key = $2 "\t" $3
commits[key] += $4
adds[key] += $5
dels[key] += $6
}
END {
for (k in adds) {
split(k, parts, "\t")
day = parts[1]
hour = parts[2]
if (!(day in max_adds) || adds[k] > max_adds[day]) {
max_adds[day] = adds[k]
best_hour[day] = hour
best_commits[day] = commits[k]
best_dels[day] = dels[k]
}
}
for (d in max_adds) {
printf "%s\t%s\t%d\t%d\t%d\n", d, best_hour[d], best_commits[d], max_adds[d], best_dels[d] > out
}
}
' "$TMP_DIR/author_day_hour_summary.tsv"
sort -t $'\t' -k1,1 "$TMP_DIR/day_peak_hour_human.tsv" -o "$TMP_DIR/day_peak_hour_human.tsv"
mkdir -p "$(dirname "$OUT_FILE")"
# 4) Render standalone HTML.
# The page is built by concatenating quoted heredocs with the TSV files, so each
# TSV ends up inside a JavaScript String.raw template literal.
# NOTE(review): a backtick or "${" in the data (author name, file extension)
# would end or alter that template literal -- confirm the inputs cannot contain
# them, or encode the data before embedding it.
{
cat <<'HTML_HEAD'
<!doctype html>
<html lang="zh-CN">
<head>
<meta charset="UTF-8" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<title>Super Multica 代码贡献统计</title>
<style>
:root {
--bg: #0b0d10;
--panel: #14181d;
--panel-2: #1a2027;
--line: #2a3440;
--text: #e8edf3;
--muted: #98a7b7;
--ok: #2fbf71;
--danger: #ef4444;
}
* { box-sizing: border-box; }
body {
margin: 0;
font-family: ui-sans-serif, -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, "Helvetica Neue", Arial;
background: radial-gradient(circle at 20% -10%, #1a2430 0%, #0b0d10 45%) fixed;
color: var(--text);
line-height: 1.4;
}
.wrap { max-width: 1200px; margin: 0 auto; padding: 24px; }
h1 { margin: 0 0 8px; font-size: 28px; }
.sub { color: var(--muted); margin-bottom: 20px; }
.grid {
display: grid;
grid-template-columns: repeat(auto-fit, minmax(190px, 1fr));
gap: 12px;
margin-bottom: 18px;
}
.card {
background: linear-gradient(180deg, var(--panel) 0%, var(--panel-2) 100%);
border: 1px solid var(--line);
border-radius: 10px;
padding: 12px;
}
.k { color: var(--muted); font-size: 12px; margin-bottom: 8px; }
.v { font-size: 24px; font-weight: 700; letter-spacing: 0.3px; }
.section { margin-top: 14px; }
.section h2 { margin: 0 0 10px; font-size: 16px; color: #d4dde7; }
.panel {
background: var(--panel);
border: 1px solid var(--line);
border-radius: 10px;
overflow: hidden;
}
table { width: 100%; border-collapse: collapse; }
th, td { padding: 9px 10px; border-bottom: 1px solid var(--line); font-size: 13px; }
th { background: #11161c; text-align: left; color: #c5d0db; position: sticky; top: 0; }
tr:last-child td { border-bottom: 0; }
.num { text-align: right; font-variant-numeric: tabular-nums; }
.mono { font-family: ui-monospace, SFMono-Regular, Menlo, Consolas, monospace; }
.bar-wrap { background: #0f1318; border-radius: 999px; height: 8px; width: 180px; border: 1px solid #273241; }
.bar { height: 100%; border-radius: 999px; background: linear-gradient(90deg, #3f7ef7, #58a6ff); }
.ok { color: var(--ok); }
.danger { color: var(--danger); }
.foot { margin-top: 16px; color: var(--muted); font-size: 12px; }
.scroll { max-height: 420px; overflow: auto; }
</style>
</head>
<body>
<div class="wrap">
<h1>Super Multica 代码贡献统计</h1>
<div class="sub" id="subtitle"></div>
<div class="grid" id="summary"></div>
<div class="section">
<h2>代码量分布(按扩展名)</h2>
<div class="panel scroll"><table id="extTable"></table></div>
</div>
<div class="section">
<h2>人员贡献(人工口径)</h2>
<div class="panel scroll"><table id="authorTable"></table></div>
</div>
<div class="section">
<h2>每日贡献(人工口径)</h2>
<div class="panel scroll"><table id="dayTable"></table></div>
</div>
<div class="section">
<h2>小时段贡献(人工口径)</h2>
<div class="panel scroll"><table id="hourTable"></table></div>
</div>
<div class="foot">数据来源git log --numstat 与当前工作树文件统计。人工口径排除 checkpointer / dependabot。</div>
</div>
<script>
const RAW = {
locTotals: String.raw`
HTML_HEAD
cat "$TMP_DIR/loc_totals.tsv"
cat <<'MID1'
`,
locByExt: String.raw`
MID1
cat "$TMP_DIR/loc_by_ext.tsv"
cat <<'MID2'
`,
authorHuman: String.raw`
MID2
cat "$TMP_DIR/author_human_share.tsv"
cat <<'MID3'
`,
dayHuman: String.raw`
MID3
cat "$TMP_DIR/day_summary_human.tsv"
cat <<'MID4'
`,
hourHuman: String.raw`
MID4
cat "$TMP_DIR/hour_summary_human.tsv"
cat <<'MID5'
`,
dayPeak: String.raw`
MID5
cat "$TMP_DIR/day_peak_hour_human.tsv"
cat <<'HTML_TAIL'
`
};
// ---- Report rendering ------------------------------------------------------
// Parses the TSV blobs in RAW and fills the summary cards and the four tables.

const fmt = (n) => Number(n).toLocaleString("en-US");

// All values below come from git metadata or file names and are written via
// innerHTML, so they must be escaped. Author labels in particular look like
// "Name <email>", which a browser would otherwise parse as an unknown tag and
// silently drop.
const HTML_ESCAPES = { "&": "&amp;", "<": "&lt;", ">": "&gt;", '"': "&quot;", "'": "&#39;" };
const esc = (v) => String(v).replace(/[&<>"']/g, (ch) => HTML_ESCAPES[ch]);

// Split a TSV blob into rows of cells. Empty lines are dropped so that an empty
// blob yields no rows instead of one phantom row.
const tsv = (txt) => txt.trim().split(/\n+/).filter(Boolean).map((line) => line.split("\t"));
const toNum = (v) => Number(v || 0);

const locTotalsRows = tsv(RAW.locTotals);
const locTotals = Object.fromEntries(locTotalsRows.map(([k, v]) => [k, toNum(v)]));

// Extensions, largest line count first.
const extRows = tsv(RAW.locByExt).map(([ext, files, lines]) => ({
  ext,
  files: toNum(files),
  lines: toNum(lines),
})).sort((a, b) => b.lines - a.lines);

// Authors, most additions first. The percentage columns arrive pre-formatted.
const authors = tsv(RAW.authorHuman).map(([name, commits, add, del, net, addPct, commitPct]) => ({
  name,
  commits: toNum(commits),
  add: toNum(add),
  del: toNum(del),
  net: toNum(net),
  addPct,
  commitPct,
})).sort((a, b) => b.add - a.add);

// Peak hour per day, keyed by date, merged into the day rows below.
const dayPeaks = Object.fromEntries(tsv(RAW.dayPeak).map(([d, h, c, a, del]) => [d, {
  hour: h,
  commits: toNum(c),
  add: toNum(a),
  del: toNum(del),
}]));

const days = tsv(RAW.dayHuman).map(([date, commits, add, del, net, startHour, endHour]) => ({
  date,
  commits: toNum(commits),
  add: toNum(add),
  del: toNum(del),
  net: toNum(net),
  startHour,
  endHour,
  peak: dayPeaks[date] || null,
})).sort((a, b) => a.date.localeCompare(b.date));

const hours = tsv(RAW.hourHuman).map(([hour, commits, add, del, net]) => ({
  hour,
  commits: toNum(commits),
  add: toNum(add),
  del: toNum(del),
  net: toNum(net),
})).sort((a, b) => a.hour.localeCompare(b.hour));

const totalHumanCommits = authors.reduce((sum, x) => sum + x.commits, 0);
const totalHumanAdd = authors.reduce((sum, x) => sum + x.add, 0);
const totalHumanDel = authors.reduce((sum, x) => sum + x.del, 0);
const topHour = [...hours].sort((a, b) => b.add - a.add)[0] || { hour: "--", add: 0 };
const startDate = days[0]?.date || "--";
const endDate = days[days.length - 1]?.date || "--";

// textContent needs no escaping.
document.getElementById("subtitle").textContent = `${startDate} ~ ${endDate}`;

const summaryItems = [
  ["总文件数", fmt(locTotals.files || 0)],
  ["总行数", fmt(locTotals.lines || 0)],
  ["源码行数", fmt(locTotals.source_lines || 0)],
  ["贡献人数", fmt(authors.length)],
  ["人工提交数", fmt(totalHumanCommits)],
  ["人工新增", fmt(totalHumanAdd)],
  ["人工删除", fmt(totalHumanDel)],
  ["最高产小时", `${topHour.hour}:00 (${fmt(topHour.add)})`],
];
document.getElementById("summary").innerHTML = summaryItems.map(([k, v]) => (
  `<div class="card"><div class="k">${esc(k)}</div><div class="v">${esc(v)}</div></div>`
)).join("");

// Bar widths are relative to the largest row; the trailing 1 avoids dividing by zero.
const maxExtLines = Math.max(...extRows.map((x) => x.lines), 1);
document.getElementById("extTable").innerHTML = `
<thead><tr><th>扩展名</th><th class="num">文件数</th><th class="num">行数</th><th>占比</th><th>可视化</th></tr></thead>
<tbody>
${extRows.map((r) => {
  const pct = ((r.lines / (locTotals.lines || 1)) * 100).toFixed(2);
  const w = ((r.lines / maxExtLines) * 100).toFixed(1);
  return `<tr>
<td class="mono">${esc(r.ext)}</td>
<td class="num">${fmt(r.files)}</td>
<td class="num">${fmt(r.lines)}</td>
<td class="num">${pct}%</td>
<td><div class="bar-wrap"><div class="bar" style="width:${w}%"></div></div></td>
</tr>`;
}).join("")}
</tbody>`;

document.getElementById("authorTable").innerHTML = `
<thead><tr><th>作者</th><th class="num">提交</th><th class="num">新增</th><th class="num">删除</th><th class="num">净新增</th><th class="num">新增占比</th><th class="num">提交占比</th></tr></thead>
<tbody>
${authors.map((a) => `<tr>
<td>${esc(a.name)}</td>
<td class="num">${fmt(a.commits)}</td>
<td class="num">${fmt(a.add)}</td>
<td class="num">${fmt(a.del)}</td>
<td class="num ${a.net >= 0 ? "ok" : "danger"}">${fmt(a.net)}</td>
<td class="num">${esc(a.addPct)}</td>
<td class="num">${esc(a.commitPct)}</td>
</tr>`).join("")}
</tbody>`;

document.getElementById("dayTable").innerHTML = `
<thead><tr><th>日期</th><th class="num">提交</th><th class="num">新增</th><th class="num">删除</th><th class="num">净新增</th><th>活跃时段</th><th>峰值小时</th></tr></thead>
<tbody>
${days.map((d) => `<tr>
<td class="mono">${esc(d.date)}</td>
<td class="num">${fmt(d.commits)}</td>
<td class="num">${fmt(d.add)}</td>
<td class="num">${fmt(d.del)}</td>
<td class="num ${d.net >= 0 ? "ok" : "danger"}">${fmt(d.net)}</td>
<td class="mono">${esc(d.startHour)}:00 - ${esc(d.endHour)}:59</td>
<td class="mono">${d.peak ? `${esc(d.peak.hour)}:00 (${fmt(d.peak.add)})` : "--"}</td>
</tr>`).join("")}
</tbody>`;

const maxHourAdd = Math.max(...hours.map((h) => h.add), 1);
document.getElementById("hourTable").innerHTML = `
<thead><tr><th>小时</th><th class="num">提交</th><th class="num">新增</th><th class="num">删除</th><th class="num">净新增</th><th>可视化</th></tr></thead>
<tbody>
${hours.map((h) => {
  const w = ((h.add / maxHourAdd) * 100).toFixed(1);
  return `<tr>
<td class="mono">${esc(h.hour)}:00</td>
<td class="num">${fmt(h.commits)}</td>
<td class="num">${fmt(h.add)}</td>
<td class="num">${fmt(h.del)}</td>
<td class="num ${h.net >= 0 ? "ok" : "danger"}">${fmt(h.net)}</td>
<td><div class="bar-wrap"><div class="bar" style="width:${w}%"></div></div></td>
</tr>`;
}).join("")}
</tbody>`;
</script>
</body>
</html>
HTML_TAIL
} > "$OUT_FILE"
echo "Report generated: $OUT_FILE"

View file

@ -1,53 +0,0 @@
#!/bin/bash
# Wipe every Super Multica user-data location so the desktop app behaves like a
# fresh install on next launch (useful for testing onboarding).
set -e

# Remove one data directory if it exists, reporting what happened either way.
remove_dir() {
  local target="$1"
  if [ ! -d "$target" ]; then
    echo " $target does not exist, skipping"
    return 0
  fi
  echo " Removing $target"
  rm -rf "$target"
}

echo "🧹 Resetting Super Multica user data..."

# Main data directory, then the dev data directory used by pnpm dev:local.
MULTICA_DATA_DIR="$HOME/.super-multica"
MULTICA_DEV_DIR="$HOME/.super-multica-dev"
remove_dir "$MULTICA_DATA_DIR"
remove_dir "$MULTICA_DEV_DIR"

# Electron app data lives in a platform-specific location.
case "$OSTYPE" in
  darwin*)
    ELECTRON_APP_DATA="$HOME/Library/Application Support/super-multica"
    remove_dir "$ELECTRON_APP_DATA"
    ;;
  linux-gnu*)
    ELECTRON_APP_DATA="$HOME/.config/super-multica"
    remove_dir "$ELECTRON_APP_DATA"
    ;;
esac

echo "✅ User data reset complete!"
echo ""
echo "Next steps:"
echo " pnpm dev # Start app (will show onboarding)"
echo " pnpm dev:reset # Reset and start in one command"

View file

@ -1,60 +0,0 @@
#!/usr/bin/env bash
#
# Set Telegram Bot Webhook
#
# Usage:
# ./scripts/set-telegram-webhook.sh <webhook_url>
#
# Example:
# ./scripts/set-telegram-webhook.sh https://your-domain.ngrok-free.dev
#
# Reads TELEGRAM_BOT_TOKEN (required) and TELEGRAM_WEBHOOK_SECRET_TOKEN
# (optional) from .env
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
ENV_FILE="$SCRIPT_DIR/../.env"
if [ ! -f "$ENV_FILE" ]; then
  echo "Error: .env file not found at $ENV_FILE"
  exit 1
fi
source "$ENV_FILE"

if [ -z "${TELEGRAM_BOT_TOKEN:-}" ]; then
  echo "Error: TELEGRAM_BOT_TOKEN not set in .env"
  exit 1
fi

# The secret is optional. Resolve it once with a default so later expansions do
# not trip `set -u` when .env leaves it undefined.
SECRET_TOKEN="${TELEGRAM_WEBHOOK_SECRET_TOKEN:-}"

WEBHOOK_BASE_URL="${1:-}"
if [ -z "$WEBHOOK_BASE_URL" ]; then
  echo "Usage: $0 <webhook_base_url>"
  echo ""
  echo "Example:"
  echo " $0 https://your-domain.ngrok-free.dev"
  exit 1
fi

# Remove trailing slash
WEBHOOK_BASE_URL="${WEBHOOK_BASE_URL%/}"
WEBHOOK_URL="${WEBHOOK_BASE_URL}/telegram/webhook"

# Only a short prefix of each credential is printed.
echo "Bot Token: ${TELEGRAM_BOT_TOKEN:0:10}..."
if [ -n "$SECRET_TOKEN" ]; then
  echo "Secret Token: ${SECRET_TOKEN:0:8}..."
else
  echo "Secret Token: (not set)"
fi
echo "Webhook URL: $WEBHOOK_URL"
echo ""

# Set webhook
# --data-urlencode keeps characters such as "&", "+" or "%" in the URL or secret
# from corrupting the form body.
echo "=> Setting webhook..."
RESPONSE=$(curl -s "https://api.telegram.org/bot${TELEGRAM_BOT_TOKEN}/setWebhook" \
  --data-urlencode "url=${WEBHOOK_URL}" \
  --data-urlencode "secret_token=${SECRET_TOKEN}")
# Pretty-print when python3 can parse the JSON; otherwise show the raw response.
echo "$RESPONSE" | python3 -m json.tool 2>/dev/null || echo "$RESPONSE"
echo ""
echo "=> Verifying webhook info..."
INFO=$(curl -s "https://api.telegram.org/bot${TELEGRAM_BOT_TOKEN}/getWebhookInfo")
echo "$INFO" | python3 -m json.tool 2>/dev/null || echo "$INFO"

View file

@ -1,5 +0,0 @@
# Downloaded datasets
*.jsonl
# Keep this .gitignore file itself tracked
!.gitignore

View file

@ -1,116 +0,0 @@
#!/usr/bin/env tsx
/**
* Analyze SWE-bench run results.
*
* Reads the .results.jsonl file produced by run.ts and prints a summary.
*
* Usage:
* tsx scripts/swe-bench/analyze.ts [results.jsonl]
*/
import { readFileSync, existsSync } from "node:fs";
import { resolve, join } from "node:path";
/** One line of the `.results.jsonl` file produced by run.ts. */
interface RunResult {
/** SWE-bench instance id, e.g. `django__django-16379`. */
instance_id: string;
/** Reported as "Patched" / "PATCHED" in the summary; false is "No patch". */
success: boolean;
/** Patch text; its length is what the average patch size is computed from. */
patch: string;
/** Error message, present only when the run recorded one. */
error?: string;
/** Run duration in milliseconds. */
duration_ms: number;
/** Session id; presumably names the directory under `<data dir>/sessions/` -- confirm against run.ts. */
session_id: string;
}
/**
 * Derive the `owner/repo` name from a SWE-bench instance id.
 *
 * Instance ids look like `django__django-16379`: owner and repo joined by `__`,
 * followed by `-<number>`. Only the first `__` is treated as the separator, so
 * underscores inside the repo name are preserved.
 */
function repoFromInstanceId(instanceId: string): string {
  const withoutNumber = instanceId.replace(/-\d+$/, "");
  return withoutNumber.replace("__", "/") || "unknown";
}

/**
 * Print a summary of a SWE-bench run.
 *
 * Reads the results JSONL named by the first CLI argument (default
 * `scripts/swe-bench/predictions.results.jsonl`) and prints overall counts,
 * duration and patch-size statistics, an error breakdown, per-repository
 * success rates, and the five slowest tasks. Exits with status 1 when the
 * results file does not exist.
 */
function main(): void {
  const resultsPath = resolve(
    process.argv[2] || "scripts/swe-bench/predictions.results.jsonl",
  );
  if (!existsSync(resultsPath)) {
    console.error(`Results file not found: ${resultsPath}`);
    process.exit(1);
  }

  // Whitespace-only lines (trailing newline, CRLF leftovers) are not records.
  const lines = readFileSync(resultsPath, "utf-8")
    .split("\n")
    .filter((l) => l.trim() !== "");
  const results: RunResult[] = lines.map((l) => JSON.parse(l));
  const total = results.length;

  console.log("=== SWE-bench Run Analysis ===\n");

  // With no results every average below would be NaN and min/max would be
  // +/-Infinity, so report the empty run and stop.
  if (total === 0) {
    console.log(`Total tasks: ${total}`);
    console.log(`No results recorded in ${resultsPath}`);
    return;
  }

  const patched = results.filter((r) => r.success).length;
  const failed = results.filter((r) => !r.success).length;
  const errors = results.filter((r) => r.error).length;
  const durations = results.map((r) => r.duration_ms);
  const avgDuration = durations.reduce((a, b) => a + b, 0) / total;
  const maxDuration = Math.max(...durations);
  const minDuration = Math.min(...durations);
  const patchSizes = results
    .filter((r) => r.success)
    .map((r) => r.patch.length);
  const avgPatchSize =
    patchSizes.length > 0
      ? patchSizes.reduce((a, b) => a + b, 0) / patchSizes.length
      : 0;

  console.log(`Total tasks: ${total}`);
  console.log(`Patched: ${patched} (${((patched / total) * 100).toFixed(1)}%)`);
  console.log(`No patch: ${failed}`);
  console.log(`Errors: ${errors}`);
  console.log();
  console.log(`Avg duration: ${(avgDuration / 1000).toFixed(1)}s`);
  console.log(`Min duration: ${(minDuration / 1000).toFixed(1)}s`);
  console.log(`Max duration: ${(maxDuration / 1000).toFixed(1)}s`);
  console.log(`Avg patch size: ${(avgPatchSize / 1024).toFixed(1)}KB`);

  // Error breakdown: group by message (truncated to 60 chars), most frequent first.
  if (errors > 0) {
    console.log("\n--- Errors ---");
    const errorCounts = new Map<string, number>();
    for (const r of results) {
      if (r.error) {
        const key = r.error.length > 60 ? r.error.slice(0, 60) + "..." : r.error;
        errorCounts.set(key, (errorCounts.get(key) || 0) + 1);
      }
    }
    for (const [err, count] of [...errorCounts.entries()].sort(
      (a, b) => b[1] - a[1],
    )) {
      console.log(` ${count}x ${err}`);
    }
  }

  // Per-repo breakdown, repositories with the most tasks first.
  console.log("\n--- By Repository ---");
  const repoStats = new Map<string, { total: number; patched: number }>();
  for (const r of results) {
    const repo = repoFromInstanceId(r.instance_id);
    const stats = repoStats.get(repo) || { total: 0, patched: 0 };
    stats.total++;
    if (r.success) stats.patched++;
    repoStats.set(repo, stats);
  }
  for (const [repo, stats] of [...repoStats.entries()].sort(
    (a, b) => b[1].total - a[1].total,
  )) {
    const pct = ((stats.patched / stats.total) * 100).toFixed(0);
    console.log(
      ` ${repo.padEnd(30)} ${stats.patched}/${stats.total} (${pct}%)`,
    );
  }

  // Slowest tasks
  console.log("\n--- Slowest Tasks ---");
  const sorted = [...results].sort((a, b) => b.duration_ms - a.duration_ms);
  for (const r of sorted.slice(0, 5)) {
    console.log(
      ` ${(r.duration_ms / 1000).toFixed(1)}s ${r.instance_id} ${r.success ? "PATCHED" : "NO_PATCH"}`,
    );
  }

  // Session IDs for further analysis
  const dataDir = process.env.SMC_DATA_DIR || join(process.env.HOME || "~", ".swe-bench-eval");
  console.log(`\n--- Run Logs ---`);
  console.log(`Session data: ${dataDir}/sessions/`);
  console.log(`View a session's run log:`);
  console.log(` cat ${dataDir}/sessions/<session-id>/run-log.jsonl | head -20`);
}
main();

View file

@ -1,100 +0,0 @@
#!/usr/bin/env python3
"""
Download SWE-bench dataset from HuggingFace and export to JSONL for the Node.js runner.
Usage:
pip install datasets
python scripts/swe-bench/download-dataset.py [--dataset verified|lite|full] [--limit N] [--output PATH]
Output format (one JSON object per line):
{
"instance_id": "django__django-16379",
"repo": "django/django",
"base_commit": "abc123...",
"problem_statement": "...",
"hints_text": "...",
"patch": "...", # gold patch (for reference, not shown to agent)
"test_patch": "...", # test patch applied during evaluation
"version": "4.2",
"environment_setup_commit": "..."
}
"""
import argparse
import json
import sys
# Short names accepted by --dataset, mapped to the HuggingFace dataset
# identifiers passed to load_dataset().
DATASET_MAP = {
    "verified": "princeton-nlp/SWE-bench_Verified",
    "lite": "princeton-nlp/SWE-bench_Lite",
    "full": "princeton-nlp/SWE-bench",
}
def main():
    """Download a SWE-bench dataset split and export selected fields as JSONL.

    Command-line flags (read from sys.argv):
        --dataset  verified | lite | full (default: lite)
        --limit    maximum number of instances to write; 0 means all
        --output   output path (default: scripts/swe-bench/<dataset>.jsonl)
        --split    dataset split passed to load_dataset (default: test)

    Exits with status 1 if the 'datasets' package cannot be imported, and with
    argparse's usage error if --limit is negative.
    """
    parser = argparse.ArgumentParser(description="Download SWE-bench dataset to JSONL")
    parser.add_argument(
        "--dataset",
        choices=["verified", "lite", "full"],
        default="lite",
        help="Dataset variant (default: lite)",
    )
    parser.add_argument(
        "--limit", type=int, default=0, help="Limit number of instances (0 = all)"
    )
    parser.add_argument(
        "--output",
        type=str,
        default=None,
        help="Output JSONL path (default: scripts/swe-bench/<dataset>.jsonl)",
    )
    parser.add_argument(
        "--split",
        type=str,
        default="test",
        help="Dataset split (default: test)",
    )
    args = parser.parse_args()

    # A negative limit used to be truthy and immediately satisfied
    # `count >= limit`, silently producing a one-record file.
    if args.limit < 0:
        parser.error("--limit must be >= 0")

    # Imported lazily so a missing dependency yields a friendly message
    # instead of a traceback at module import time.
    try:
        from datasets import load_dataset
    except ImportError:
        print("Error: 'datasets' package not installed. Run: pip install datasets", file=sys.stderr)
        sys.exit(1)

    dataset_name = DATASET_MAP[args.dataset]
    output_path = args.output or f"scripts/swe-bench/{args.dataset}.jsonl"
    print(f"Downloading {dataset_name} (split={args.split})...", file=sys.stderr)
    ds = load_dataset(dataset_name, split=args.split)

    # Fields to keep
    keep_fields = [
        "instance_id",
        "repo",
        "base_commit",
        "problem_statement",
        "hints_text",
        "patch",
        "test_patch",
        "version",
        "environment_setup_commit",
    ]

    count = 0
    # Records are dumped with ensure_ascii=False, so the file encoding must be
    # pinned to UTF-8 rather than left to the platform default.
    with open(output_path, "w", encoding="utf-8") as f:
        for item in ds:
            record = {field: item[field] for field in keep_fields if field in item}
            f.write(json.dumps(record, ensure_ascii=False) + "\n")
            count += 1
            if args.limit and count >= args.limit:
                break

    print(f"Wrote {count} instances to {output_path}", file=sys.stderr)
# Run the download only when executed as a script, not when imported.
if __name__ == "__main__":
    main()

View file

@ -1,68 +0,0 @@
#!/usr/bin/env bash
#
# Evaluate Multica predictions against SWE-bench using the official Docker harness.
#
# Prerequisites:
# pip install swebench
# Docker running with at least 120GB storage, 16GB RAM, 8 CPU cores
#
# Usage:
# bash scripts/swe-bench/evaluate.sh [predictions.jsonl] [dataset] [run_id]
#
# Examples:
# bash scripts/swe-bench/evaluate.sh
# bash scripts/swe-bench/evaluate.sh scripts/swe-bench/predictions.jsonl lite multica-v1
set -euo pipefail

# Positional arguments (all optional):
#   $1  predictions JSONL file
#   $2  dataset short name (lite | verified | full); any other value is passed
#       through unchanged as the dataset name
#   $3  run id handed to the harness
PREDICTIONS="${1:-scripts/swe-bench/predictions.jsonl}"
DATASET="${2:-lite}"
RUN_ID="${3:-multica}"

# Optional environment overrides:
#   MAX_WORKERS  value for the harness --max_workers flag (default: 4)
#   PYTHON       interpreter to use (default: first of python, python3 on PATH)
MAX_WORKERS="${MAX_WORKERS:-4}"

# Map short names to HuggingFace dataset names
case "$DATASET" in
  lite) DATASET_NAME="princeton-nlp/SWE-bench_Lite" ;;
  verified) DATASET_NAME="princeton-nlp/SWE-bench_Verified" ;;
  full) DATASET_NAME="princeton-nlp/SWE-bench" ;;
  *) DATASET_NAME="$DATASET" ;;
esac

echo "=== SWE-bench Evaluation ==="
echo "Predictions: $PREDICTIONS"
echo "Dataset: $DATASET_NAME"
echo "Run ID: $RUN_ID"
echo ""

if [ ! -f "$PREDICTIONS" ]; then
  echo "Error: Predictions file not found: $PREDICTIONS" >&2
  exit 1
fi

TASK_COUNT=$(wc -l < "$PREDICTIONS" | tr -d ' ')
echo "Tasks to evaluate: $TASK_COUNT"
echo ""

# Resolve the interpreter first, so a system without a bare `python` command
# is not misreported as "swebench not installed".
PYTHON_BIN="${PYTHON:-}"
if [ -z "$PYTHON_BIN" ]; then
  for candidate in python python3; do
    if command -v "$candidate" >/dev/null 2>&1; then
      PYTHON_BIN="$candidate"
      break
    fi
  done
fi
if [ -z "$PYTHON_BIN" ]; then
  echo "Error: no Python interpreter found (tried: python, python3). Set PYTHON to override." >&2
  exit 1
fi

# Check if swebench is installed
if ! "$PYTHON_BIN" -c "import swebench" 2>/dev/null; then
  echo "Error: swebench not installed for $PYTHON_BIN. Run: pip install swebench" >&2
  exit 1
fi

# Check if Docker is running
if ! docker info >/dev/null 2>&1; then
  echo "Error: Docker is not running" >&2
  exit 1
fi

echo "Starting evaluation (this may take a while)..."
echo ""

"$PYTHON_BIN" -m swebench.harness.run_evaluation \
  --dataset_name "$DATASET_NAME" \
  --predictions_path "$PREDICTIONS" \
  --max_workers "$MAX_WORKERS" \
  --run_id "$RUN_ID"

echo ""
echo "=== Evaluation Complete ==="
echo "Check logs/ and evaluation_results/ for detailed results."

View file

@ -1,392 +0,0 @@
#!/usr/bin/env tsx
/**
* SWE-bench Runner for Multica
*
* Runs the Multica agent against SWE-bench task instances and collects patches.
*
* Usage:
* tsx scripts/swe-bench/run.ts [options]
*
* Options:
* --dataset PATH Path to JSONL dataset (default: scripts/swe-bench/lite.jsonl)
* --provider NAME LLM provider (default: kimi-coding)
* --model NAME Model name
* --limit N Max tasks to run (default: all)
* --offset N Skip first N tasks (default: 0)
* --output PATH Output predictions JSONL (default: scripts/swe-bench/predictions.jsonl)
* --workdir PATH Working directory for repos (default: /tmp/swe-bench)
* --timeout MS Timeout per task in ms (default: 300000 = 5min)
* --instance ID Run a single instance by ID
* --debug Enable debug logging
*/
import { readFileSync, writeFileSync, appendFileSync, existsSync, mkdirSync } from "node:fs";
import { join, resolve } from "node:path";
import { execFileSync, execSync, spawn } from "node:child_process";
import { Agent } from "@multica/core";
import type { AgentOptions } from "@multica/core";
// ============================================================
// Types
// ============================================================
/**
 * One SWE-bench task instance, as read from a line of the JSONL dataset.
 */
interface SWEBenchTask {
  /** Task identifier, e.g. "django__django-16379". */
  instance_id: string;
  /** GitHub repository in "owner/name" form; used to build the clone URL. */
  repo: string;
  /** Commit the repository is checked out at before the agent runs. */
  base_commit: string;
  /** Issue text presented to the agent. */
  problem_statement: string;
  /** Optional hints appended to the prompt when non-empty. */
  hints_text?: string;
  /** Gold patch; not read by this runner. */
  patch?: string;
  /** Test patch; not read by this runner. */
  test_patch?: string;
  /** Dataset-supplied version string, e.g. "4.2"; not read by this runner. */
  version?: string;
  /** Dataset-supplied commit hash; not read by this runner. */
  environment_setup_commit?: string;
}
/**
 * One line of the predictions JSONL file handed to the SWE-bench harness.
 */
interface Prediction {
  /** Identifier of the task this prediction answers. */
  instance_id: string;
  /** Diff text produced for the task (may be empty). */
  model_patch: string;
  /** Label identifying the provider/model that produced the patch. */
  model_name_or_path: string;
}
/**
 * Detailed outcome of running the agent on a single task; one of these is
 * appended per task to the ".results.jsonl" file.
 */
interface RunResult {
  /** Identifier of the task that was run. */
  instance_id: string;
  /** Whether the run counts as having produced a patch. */
  success: boolean;
  /** Diff collected from the task's working tree ("" when none). */
  patch: string;
  /** Error reported by the agent, or the message of a thrown error. */
  error?: string;
  /** Wall-clock time spent on the task, in milliseconds. */
  duration_ms: number;
  /** Session id of the agent that handled the task. */
  session_id: string;
}
// ============================================================
// CLI argument parsing
// ============================================================
/**
 * Runner settings assembled from command-line flags.
 */
interface RunOptions {
  /** --dataset: path to the JSONL dataset. */
  dataset: string;
  /** --provider: LLM provider name. */
  provider: string;
  /** --model: model name; left unset when the flag is omitted. */
  model?: string;
  /** --limit: maximum number of tasks to run; 0 means no limit. */
  limit: number;
  /** --offset: number of leading tasks to skip. */
  offset: number;
  /** --output: path of the predictions JSONL file. */
  output: string;
  /** --workdir: directory under which repositories are checked out. */
  workdir: string;
  /** --timeout: per-task timeout in milliseconds. */
  timeout: number;
  /** --instance: run only the task with this instance id. */
  instance?: string;
  /** --debug: enable debug logging. */
  debug: boolean;
}
/**
 * Parse the command-line flags in `process.argv` into a RunOptions object.
 *
 * Starts from the built-in defaults and overrides them flag by flag. Prints a
 * message to stderr and exits with status 1 when a flag is unknown, when a
 * flag that needs a value is the last argument, or when a numeric flag
 * (--limit, --offset, --timeout) is not a non-negative integer.
 *
 * @returns The effective options for this run.
 */
function parseArgs(): RunOptions {
  const args = process.argv.slice(2);
  const opts: RunOptions = {
    dataset: "scripts/swe-bench/lite.jsonl",
    provider: "kimi-coding",
    limit: 0,
    offset: 0,
    output: "scripts/swe-bench/predictions.jsonl",
    workdir: "/tmp/swe-bench",
    timeout: 300_000, // 5 minutes
    debug: false,
  };

  const fail = (message: string): never => {
    console.error(message);
    process.exit(1);
  };

  let cursor = 0;

  // Consume the argument following a flag; a trailing flag has no value.
  const nextValue = (flag: string): string => {
    const value = args[cursor++];
    if (value === undefined) return fail(`Missing value for ${flag}`);
    return value;
  };

  // Numeric flags must be plain digits: a NaN timeout would fire immediately
  // and a NaN limit would silently mean "no limit".
  const nextInt = (flag: string): number => {
    const raw = nextValue(flag);
    if (!/^\d+$/.test(raw)) {
      return fail(`Invalid value for ${flag}: expected a non-negative integer, got "${raw}"`);
    }
    return Number.parseInt(raw, 10);
  };

  while (cursor < args.length) {
    const arg = args[cursor++]!;
    switch (arg) {
      case "--dataset":
        opts.dataset = nextValue(arg);
        break;
      case "--provider":
        opts.provider = nextValue(arg);
        break;
      case "--model":
        opts.model = nextValue(arg);
        break;
      case "--limit":
        opts.limit = nextInt(arg);
        break;
      case "--offset":
        opts.offset = nextInt(arg);
        break;
      case "--output":
        opts.output = nextValue(arg);
        break;
      case "--workdir":
        opts.workdir = nextValue(arg);
        break;
      case "--timeout":
        opts.timeout = nextInt(arg);
        break;
      case "--instance":
        opts.instance = nextValue(arg);
        break;
      case "--debug":
        opts.debug = true;
        break;
      default:
        fail(`Unknown argument: ${arg}`);
    }
  }
  return opts;
}
// ============================================================
// Dataset loading
// ============================================================
/**
 * Read a JSONL dataset file into memory, one task per non-blank line.
 *
 * Blank and whitespace-only lines are skipped, so files with CRLF line
 * endings or trailing spaces load cleanly. If the file does not exist, prints
 * a hint to stderr and exits with status 1.
 *
 * @param path - Path to the JSONL file.
 * @returns The parsed tasks, in file order. Records are not validated against
 *   the SWEBenchTask shape.
 * @throws Error naming the 1-based line number when a line is not valid JSON.
 */
function loadDataset(path: string): SWEBenchTask[] {
  if (!existsSync(path)) {
    console.error(`Dataset not found: ${path}`);
    console.error("Run: python scripts/swe-bench/download-dataset.py");
    process.exit(1);
  }
  const tasks: SWEBenchTask[] = [];
  const lines = readFileSync(path, "utf-8").split("\n");
  for (const [index, line] of lines.entries()) {
    const text = line.trim();
    if (text === "") continue;
    try {
      tasks.push(JSON.parse(text) as SWEBenchTask);
    } catch (err) {
      const detail = err instanceof Error ? err.message : String(err);
      throw new Error(`Invalid JSON on line ${index + 1} of ${path}: ${detail}`);
    }
  }
  return tasks;
}
// ============================================================
// Repository setup
// ============================================================
/**
 * Make sure a checkout of the task's repository exists at the task's base
 * commit and return its directory.
 *
 * The checkout lives at `<workdir>/<instance_id>` (with "/" replaced by "__").
 * An existing directory is force-checked-out to the base commit and cleaned
 * of untracked and ignored files; otherwise the repository is cloned from
 * GitHub first.
 *
 * Git is invoked with an argument array (no shell), and the dataset-supplied
 * `base_commit` and `repo` are validated first, so dataset content cannot
 * inject shell syntax or git options.
 *
 * @param task - Task whose repository should be prepared.
 * @param workdir - Parent directory for all checkouts; created if missing.
 * @returns Path of the prepared repository directory.
 * @throws Error if `base_commit` or `repo` is malformed, or if a git command
 *   fails or exceeds its timeout.
 */
function setupRepo(task: SWEBenchTask, workdir: string): string {
  if (!/^[0-9a-f]{7,64}$/i.test(task.base_commit)) {
    throw new Error(`Invalid base_commit for ${task.instance_id}: ${task.base_commit}`);
  }
  if (!/^[\w.-]+\/[\w.-]+$/.test(task.repo)) {
    throw new Error(`Invalid repo for ${task.instance_id}: ${task.repo}`);
  }

  const repoDir = join(workdir, task.instance_id.replace(/\//g, "__"));
  const git = (args: string[], timeout: number, cwd?: string): void => {
    execFileSync("git", args, { cwd, stdio: "pipe", timeout });
  };

  if (existsSync(repoDir)) {
    // Reset existing repo to base commit
    log(` Resetting existing repo to ${task.base_commit.slice(0, 8)}...`);
    git(["checkout", "-f", task.base_commit], 60_000, repoDir);
    git(["clean", "-fdx"], 60_000, repoDir);
  } else {
    // Clone from GitHub
    const repoUrl = `https://github.com/${task.repo}.git`;
    log(` Cloning ${task.repo}...`);
    mkdirSync(workdir, { recursive: true });
    git(["clone", "--quiet", repoUrl, repoDir], 120_000);
    git(["checkout", "-f", task.base_commit], 30_000, repoDir);
  }
  return repoDir;
}
// ============================================================
// System prompt
// ============================================================
/**
 * Compose the system prompt for a task: fixed instructions and rules followed
 * by a line naming the repository and the first 12 characters of its base
 * commit.
 *
 * @param task - Task supplying `repo` and `base_commit`.
 * @returns The prompt text, lines joined with "\n" and no trailing newline.
 */
function buildSystemPrompt(task: SWEBenchTask): string {
  const shortCommit = task.base_commit.slice(0, 12);
  const promptLines = [
    "You are an expert software engineer tasked with fixing a bug in an open-source repository.",
    "## Instructions",
    "1. Read the issue description carefully and understand the problem.",
    "2. Explore the repository to find the relevant source code.",
    "3. Identify the root cause of the issue.",
    "4. Make the minimal set of changes to fix the issue. Do NOT add tests.",
    "5. After making changes, verify your fix makes sense.",
    "## Important Rules",
    "- Make ONLY the changes necessary to fix the described issue.",
    "- Do NOT modify or add any test files.",
    "- Do NOT add comments explaining the fix unless the code is non-obvious.",
    "- Do NOT refactor unrelated code.",
    "- Keep changes minimal and focused.",
    "## Repository",
    "This is the `" + task.repo + "` repository checked out at commit `" + shortCommit + "`.",
  ];
  return promptLines.join("\n");
}
/**
 * Compose the user prompt for a task: the issue text, an optional hints
 * section (only when `hints_text` is non-empty), and a closing instruction.
 *
 * @param task - Task supplying `problem_statement` and optionally `hints_text`.
 * @returns The sections separated by blank lines.
 */
function buildPrompt(task: SWEBenchTask): string {
  const sections = [`## Issue\n\n${task.problem_statement}`];
  if (task.hints_text) {
    sections.push(`## Hints\n\n${task.hints_text}`);
  }
  sections.push("Please fix this issue. Remember: make minimal changes, do not modify tests.");
  return sections.join("\n\n");
}
// ============================================================
// Run a single task
// ============================================================
/**
 * Collect the repository's changes relative to HEAD as diff text.
 *
 * `git diff HEAD` covers both staged and unstaged edits to tracked files.
 * Untracked files are not included. Returns "" if git fails.
 */
function collectPatch(repoDir: string): string {
  try {
    return execSync("git diff HEAD", {
      cwd: repoDir,
      encoding: "utf-8",
      maxBuffer: 10 * 1024 * 1024, // 10MB
      timeout: 10_000,
    });
  } catch {
    return "";
  }
}

/**
 * Run the agent on one task and report what it produced.
 *
 * Prepares the repository, runs the agent against the task prompt with a
 * per-task timeout, then collects the resulting diff. This function does not
 * reject for per-task failures; they are reported in the returned result:
 * - repository setup failure: `success` is false, `patch` is "" and
 *   `session_id` is "" (no agent was created);
 * - agent error or timeout: `success` is false, `patch` holds whatever diff
 *   exists at that point, and `error` holds the message ("timeout" for a
 *   timeout).
 *
 * @param task - The task to run.
 * @param opts - Run options; provider, model, workdir, timeout and debug are used.
 * @returns The outcome for this task.
 */
async function runTask(
  task: SWEBenchTask,
  opts: RunOptions,
): Promise<RunResult> {
  const start = Date.now();

  // Setup repo. A failure here (e.g. clone error) is recorded for this task
  // rather than aborting the remaining tasks.
  let repoDir: string;
  try {
    repoDir = setupRepo(task, opts.workdir);
  } catch (err) {
    const detail = err instanceof Error ? err.message : String(err);
    return {
      instance_id: task.instance_id,
      success: false,
      patch: "",
      error: `repo setup failed: ${detail}`,
      duration_ms: Date.now() - start,
      session_id: "",
    };
  }

  // Create agent
  const agentOptions: AgentOptions = {
    provider: opts.provider,
    model: opts.model,
    cwd: repoDir,
    enableRunLog: true,
    debug: opts.debug,
    systemPrompt: buildSystemPrompt(task),
    enableSkills: false,
    tools: {
      // Only allow coding tools — no web, no cron, no sessions
      deny: ["web_fetch", "web_search", "cron", "data", "delegate", "send_file"],
    },
  };
  const agent = new Agent(agentOptions);
  log(` Session: ${agent.sessionId}`);

  let timer: ReturnType<typeof setTimeout> | undefined;
  try {
    // Run agent with timeout.
    // NOTE(review): on timeout the agent run is abandoned, not cancelled; no
    // cancellation API for Agent is visible here — confirm whether one exists.
    const result = await Promise.race([
      agent.run(buildPrompt(task)),
      new Promise<never>((_, reject) => {
        timer = setTimeout(() => reject(new Error("timeout")), opts.timeout);
      }),
    ]);

    // Collect the git diff (the patch)
    const patch = collectPatch(repoDir);
    return {
      instance_id: task.instance_id,
      success: patch.length > 0,
      patch,
      error: result.error,
      duration_ms: Date.now() - start,
      session_id: agent.sessionId,
    };
  } catch (err) {
    // Collect any partial patch
    return {
      instance_id: task.instance_id,
      success: false,
      patch: collectPatch(repoDir),
      error: err instanceof Error ? err.message : String(err),
      duration_ms: Date.now() - start,
      session_id: agent.sessionId,
    };
  } finally {
    // Without this, the pending timer keeps the process alive for up to
    // opts.timeout after the final task has finished.
    clearTimeout(timer);
  }
}
// ============================================================
// Logging
// ============================================================
/**
 * Write a progress message to stderr, prefixed with the current UTC time as
 * `[HH:MM:SS]`.
 *
 * @param msg - Text to print after the timestamp.
 */
function log(msg: string) {
  const now = new Date();
  // ISO format is YYYY-MM-DDTHH:MM:SS.sssZ; characters 11..18 are HH:MM:SS.
  const stamp = now.toISOString().substring(11, 19);
  console.error("[" + stamp + "] " + msg);
}
// ============================================================
// Main
// ============================================================
/**
 * Derive the detailed-results path from the predictions path.
 *
 * Only a trailing ".jsonl" is treated as the extension, so a ".jsonl" earlier
 * in the path is left alone, and an output path without that suffix still
 * gets a distinct results file instead of sharing one with the predictions.
 */
function resultsPathFor(outputPath: string): string {
  const ext = ".jsonl";
  const stem = outputPath.endsWith(ext) ? outputPath.slice(0, -ext.length) : outputPath;
  return `${stem}.results${ext}`;
}

/**
 * Entry point: run the selected tasks one after another and append their
 * outputs to disk.
 *
 * Steps: parse flags, default SMC_DATA_DIR when unset, load the dataset,
 * narrow it by --instance / --offset / --limit, then for each task append one
 * line to the predictions file and one line to the detailed results file.
 * Both files are appended to, never truncated. Exits with status 1 when
 * --instance matches no task.
 */
async function main() {
  const opts = parseArgs();
  log("SWE-bench Runner for Multica");
  log(`Provider: ${opts.provider}${opts.model ? ` (${opts.model})` : ""}`);
  log(`Dataset: ${opts.dataset}`);
  log(`Work dir: ${opts.workdir}`);
  log(`Timeout: ${opts.timeout / 1000}s per task`);

  // Set SMC_DATA_DIR for isolation
  if (!process.env.SMC_DATA_DIR) {
    process.env.SMC_DATA_DIR = join(process.env.HOME || "~", ".swe-bench-eval");
    log(`SMC_DATA_DIR: ${process.env.SMC_DATA_DIR}`);
  }

  // Load dataset
  let tasks = loadDataset(resolve(opts.dataset));
  log(`Loaded ${tasks.length} tasks`);

  // Filter by instance ID if specified
  if (opts.instance) {
    tasks = tasks.filter((t) => t.instance_id === opts.instance);
    if (tasks.length === 0) {
      console.error(`Instance not found: ${opts.instance}`);
      process.exit(1);
    }
  }

  // Apply offset and limit
  if (opts.offset > 0) {
    tasks = tasks.slice(opts.offset);
  }
  if (opts.limit > 0) {
    tasks = tasks.slice(0, opts.limit);
  }
  log(`Running ${tasks.length} tasks`);

  // Prepare output
  const outputPath = resolve(opts.output);
  const resultsPath = resultsPathFor(outputPath);

  // Run tasks sequentially
  const modelName = `multica-${opts.provider}${opts.model ? `-${opts.model}` : ""}`;
  let completed = 0;
  let succeeded = 0;
  for (const task of tasks) {
    completed++;
    log(`\n[${completed}/${tasks.length}] ${task.instance_id}`);
    const result = await runTask(task, opts);
    if (result.success) succeeded++;

    // Write prediction in SWE-bench format
    const prediction: Prediction = {
      instance_id: result.instance_id,
      model_patch: result.patch,
      model_name_or_path: modelName,
    };
    appendFileSync(outputPath, JSON.stringify(prediction) + "\n");

    // Write detailed result
    appendFileSync(resultsPath, JSON.stringify(result) + "\n");

    const status = result.success ? "PATCHED" : "NO_PATCH";
    const errorInfo = result.error ? ` (${result.error})` : "";
    log(
      ` ${status} | ${(result.duration_ms / 1000).toFixed(1)}s | patch=${result.patch.length} bytes${errorInfo}`,
    );
  }

  log(`\n========================================`);
  log(`Results: ${succeeded}/${completed} tasks produced patches`);
  log(`Predictions: ${outputPath}`);
  log(`Details: ${resultsPath}`);
  log(`\nTo evaluate with SWE-bench harness:`);
  log(
    ` python -m swebench.harness.run_evaluation --dataset_name princeton-nlp/SWE-bench_Lite --predictions_path ${outputPath} --max_workers 4 --run_id multica`,
  );
}
// Start the runner; any rejection that escapes main() is printed and turned
// into a non-zero exit status.
const reportFatal = (err: unknown): never => {
  console.error("Fatal error:", err);
  process.exit(1);
};
main().catch(reportFatal);