feat: pivot to AI-native task management platform (#232)
Replace the agent framework codebase with a new monorepo structure for an AI-native Linear-like product where agents are first-class citizens. New architecture: - server/ — Go backend (Chi + gorilla/websocket + sqlc) - API server with REST routes for issues, agents, inbox, workspaces - WebSocket hub for real-time updates - Local daemon entry point for agent runtime connection - PostgreSQL migration with 13 tables (issue, agent, inbox, etc.) - WebSocket protocol types for server<->daemon communication - apps/web/ — Next.js 16 frontend - Dashboard layout with sidebar navigation - Route skeleton: inbox, issues, agents, board, settings - packages/ui/ — Preserved shadcn/ui design system (26+ components) - packages/types/ — Full API contract types (Issue, Agent, Workspace, Inbox, Events) - packages/sdk/ — REST ApiClient + WebSocket WSClient - packages/store/ — Zustand stores (issue, agent, inbox, auth) - packages/hooks/ — React hooks (useIssues, useAgents, useInbox, useRealtime) - packages/utils/ — Shared utilities Removed: apps/cli, apps/desktop, apps/mobile, apps/gateway, packages/core, skills/, and all agent-framework code. Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
3f589d8326
commit
d4f5c5b16f
677 changed files with 2779 additions and 122531 deletions
|
|
@ -1,42 +0,0 @@
|
|||
#!/usr/bin/env bash
#
# Archive and clean the dev environment data.
#
# Moves ~/.super-multica-dev and ~/Documents/Multica-dev into a
# timestamped archive directory so they can be inspected later.
#
# Usage:
#   pnpm dev:local:archive
#
# Archives are stored in: ~/.super-multica-dev-archives/<timestamp>/

set -euo pipefail

dev_data_dir="$HOME/.super-multica-dev"
dev_workspace_dir="$HOME/Documents/Multica-dev"
archive_dir="$HOME/.super-multica-dev-archives/$(date +"%Y%m%d-%H%M%S")"

# Move one directory into the archive under the given name.
# Does nothing when the source directory does not exist.
archive_if_present() {
  local source_dir="$1"
  local target_name="$2"

  [ -d "$source_dir" ] || return 0

  mv "$source_dir" "$archive_dir/$target_name"
  echo " Archived $source_dir -> $archive_dir/$target_name"
}

# Bail out early (successfully) when neither directory exists.
if ! { [ -d "$dev_data_dir" ] || [ -d "$dev_workspace_dir" ]; }; then
  echo "Nothing to archive — neither $dev_data_dir nor $dev_workspace_dir exists."
  exit 0
fi

mkdir -p "$archive_dir"

archive_if_present "$dev_data_dir" "data"
archive_if_present "$dev_workspace_dir" "workspace"

echo ""
echo "Archived to: $archive_dir"
echo "Dev environment is now clean. Run 'pnpm dev:local' to start fresh."
|
||||
|
|
@ -1,79 +0,0 @@
|
|||
#!/usr/bin/env node
|
||||
import * as esbuild from "esbuild";
|
||||
import { fileURLToPath } from "url";
|
||||
import { dirname, resolve } from "path";
|
||||
import { readFileSync, chmodSync } from "fs";
|
||||
|
||||
// Directory containing this script, and its parent directory (used as the root
// for every path resolved below).
const __dirname = dirname(fileURLToPath(import.meta.url));
const rootDir = resolve(__dirname, "..");

// Collect the names of every package listed under `dependencies` and
// `devDependencies` in the root package.json (a missing section counts as empty).
const pkg = JSON.parse(readFileSync(resolve(rootDir, "package.json"), "utf8"));
const allDeps = ["dependencies", "devDependencies"].flatMap((section) =>
  Object.keys(pkg[section] || {}),
);
|
||||
|
||||
// esbuild plugin that strips a leading shebang line from `.ts` source files
// (they get bundled otherwise).
//
// Only a shebang at the very start of the file is removed. The line terminator
// may be LF or CRLF, and a file that consists solely of a shebang with no
// trailing newline is handled too. Everything after the first line is returned
// untouched and handed to esbuild with the "ts" loader.
const stripShebangPlugin = {
  name: "strip-shebang",
  setup(build) {
    build.onLoad({ filter: /\.ts$/ }, async (args) => {
      const source = readFileSync(args.path, "utf8");
      // `[^\r\n]*` instead of `.*\n`: `.` never matches `\r`, so the old
      // pattern silently failed on CRLF files and on a shebang at EOF.
      const contents = source.replace(/^#![^\r\n]*(?:\r?\n|$)/, "");
      return { contents, loader: "ts" };
    });
  },
};
|
||||
|
||||
/**
 * Bundle the unified CLI entry point into a single executable ESM file.
 *
 * Runs esbuild on `src/agent/cli/index.ts`, writes `bin/multica.mjs` (both
 * resolved against `rootDir`), marks the output executable, and prints a short
 * usage summary. Every package in `allDeps` is left external, so the output
 * still needs `node_modules` at runtime.
 */
async function build() {
  // Unified CLI entry point
  const entry = "src/agent/cli/index.ts";
  const outfile = "bin/multica.mjs";
  const outputPath = resolve(rootDir, outfile);

  console.log(`Building ${entry} -> ${outfile}...`);

  await esbuild.build({
    entryPoints: [resolve(rootDir, entry)],
    outfile: outputPath,
    bundle: true,
    platform: "node",
    target: "node20",
    format: "esm",
    banner: { js: "#!/usr/bin/env node" },
    plugins: [stripShebangPlugin],
    sourcemap: true,
    minify: false,
    // Externalize all dependencies - they will be loaded from node_modules at runtime
    external: allDeps,
  });

  // Make executable
  chmodSync(outputPath, 0o755);
  console.log(` ✓ ${outfile}`);

  const summaryLines = [
    "\nBuild complete! Binary is in ./bin/",
    "\nUsage:",
    " multica # Interactive mode (default)",
    " multica run <prompt> # Run a single prompt",
    " multica chat # Interactive mode",
    " multica session list # List sessions",
    " multica profile list # List profiles",
    " multica skills list # List skills",
    " multica tools list # List tools",
    " multica credentials init # Initialize credentials",
    " multica dev # Start dev servers",
    " multica help # Show help",
    "\nNote: The built binary requires node_modules to be present.",
    "Run 'pnpm install --prod' to install only production dependencies.",
  ];
  for (const line of summaryLines) {
    console.log(line);
  }
}

// Run the build; on any failure print the error and exit with status 1.
build().catch((error) => {
  console.error(error);
  process.exit(1);
});
|
||||
|
|
@ -1,137 +0,0 @@
|
|||
#!/usr/bin/env bash
# Compaction Benchmark - Multi-turn test with low context window
#
# This script runs a series of prompts against the Multica agent with a very
# low context window (20k tokens by default) to force compaction to trigger
# quickly. The run-log output is then summarized and left on disk for analysis.
#
# Usage:
#   bash scripts/compaction-benchmark/run.sh [provider] [context-window]
#
# Defaults: provider=kimi-coding, context-window=20000
#
# NOTE: the data directory (~/.super-multica-e2e) is deleted at startup.

set -euo pipefail

PROVIDER="${1:-kimi-coding}"
CONTEXT_WINDOW="${2:-20000}"
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
ROOT_DIR="$(cd "$SCRIPT_DIR/../.." && pwd)"

export SMC_DATA_DIR=~/.super-multica-e2e

echo "=== Compaction Benchmark ==="
echo "Provider: $PROVIDER"
echo "Context Window: $CONTEXT_WINDOW tokens"
echo "Data Dir: $SMC_DATA_DIR"
echo ""

# Clean previous E2E data
rm -rf "$SMC_DATA_DIR"

cd "$ROOT_DIR"

# Combined stdout/stderr of the most recent turn (set by run_turn).
TURN_OUTPUT=""
# Session shared by all turns; empty until turn 1 has reported it.
SESSION_ID=""
SESSION_DIR=""

# Run one agent turn with the given prompt and store its combined
# stdout/stderr in TURN_OUTPUT. Once SESSION_ID is known, the turn continues
# that session. If the agent exits non-zero, the captured output is printed
# before exiting with the same status; without this, `set -e` would abort the
# script and discard the only diagnostics available.
run_turn() {
  local prompt="$1"
  local status=0

  TURN_OUTPUT=$(SMC_DATA_DIR="$SMC_DATA_DIR" pnpm multica run \
    --run-log \
    --provider "$PROVIDER" \
    --context-window "$CONTEXT_WINDOW" \
    ${SESSION_ID:+--session "$SESSION_ID"} \
    "$prompt" \
    2>&1) || status=$?

  if [ "$status" -ne 0 ]; then
    echo "ERROR: agent run failed with exit code $status" >&2
    echo "$TURN_OUTPUT" >&2
    exit "$status"
  fi
}

# Print the value of the first "[<tag>: value]" marker found in TURN_OUTPUT,
# or nothing when the marker is absent.
extract_marker() {
  local tag="$1"
  # `|| true`: with pipefail, a grep that finds no match (or a SIGPIPE caused by
  # head closing early) would otherwise abort the script before the caller can
  # report the missing marker.
  grep -o "\[$tag: [^]]*\]" <<<"$TURN_OUTPUT" | head -1 | sed "s/\[$tag: //;s/\]//" || true
}

# Print the first five lines of the latest turn. A here-string is used instead
# of `echo | head` so large outputs cannot kill the script via SIGPIPE.
show_turn_head() {
  head -5 <<<"$TURN_OUTPUT"
  echo ""
}

# Turn 1: Start a session with a substantial prompt that generates tool usage
echo "--- Turn 1: Initial prompt (read multiple files) ---"
run_turn "Read the following files and give me a brief summary of each: packages/core/src/agent/runner.ts, packages/core/src/agent/session/session-manager.ts, packages/core/src/agent/context-window/token-estimation.ts. List the main exports and key functions in each file."

# Extract the session ID and directory from the captured output
SESSION_ID=$(extract_marker "session")
SESSION_DIR=$(extract_marker "session-dir")

if [ -z "$SESSION_ID" ]; then
  echo "ERROR: Could not extract session ID from output"
  echo "$TURN_OUTPUT"
  exit 1
fi

if [ -z "$SESSION_DIR" ]; then
  echo "WARNING: Could not extract session directory from output; run-log analysis will be skipped" >&2
fi

echo "Session ID: $SESSION_ID"
echo "Session Dir: $SESSION_DIR"
echo ""

# Turn 2: Continue the session with more file reads to push context higher
echo "--- Turn 2: More file reads (push context higher) ---"
run_turn "Now also read packages/core/src/agent/context-window/summarization.ts and packages/core/src/agent/context-window/tool-result-pruning.ts. Describe the key algorithms in each."
show_turn_head

# Turn 3: More context-heavy work
echo "--- Turn 3: Additional analysis (should trigger compaction) ---"
run_turn "Read packages/core/src/agent/session/compaction.ts and explain the three compaction modes. Also read packages/core/src/agent/context-window/guard.ts and explain the guard thresholds."
show_turn_head

# Turn 4: More tool usage
echo "--- Turn 4: Write and test (more context pressure) ---"
run_turn "Based on everything you've read so far, list all the constants and thresholds used in the compaction system. Provide exact values and which file each is defined in."
show_turn_head

# Output analysis summary
echo "=== Benchmark Complete ==="
echo "Session Dir: $SESSION_DIR"
echo ""

# Show run-log stats
RUN_LOG="$SESSION_DIR/run-log.jsonl"
if [ -f "$RUN_LOG" ]; then
  echo "--- Run Log Event Summary ---"
  echo "Total events: $(wc -l < "$RUN_LOG")"
  echo ""
  echo "Events by type:"
  # Lines that are not valid JSON objects are skipped rather than failing the run.
  python3 -c "
import sys, json
from collections import Counter

events = Counter()
for line in sys.stdin:
    try:
        obj = json.loads(line.strip())
        events[obj.get('event', 'unknown')] += 1
    except Exception:
        pass
for event, count in sorted(events.items()):
    print(f' {event}: {count}')
" < "$RUN_LOG" 2>/dev/null || echo " (python3 not available for analysis)"
  echo ""

  echo "--- Compaction Events ---"
  python3 -c "
import sys, json

for line in sys.stdin:
    try:
        obj = json.loads(line.strip())
        event = obj.get('event', '')
        if 'compact' in event or 'overflow' in event or 'pruning' in event:
            print(json.dumps(obj, indent=2))
    except Exception:
        pass
" < "$RUN_LOG" 2>/dev/null || echo " (python3 not available for analysis)"
fi

echo ""
echo "=== Full run-log path: $SESSION_DIR/run-log.jsonl ==="
echo "=== Session file path: $SESSION_DIR/session.jsonl ==="
|
||||
|
|
@ -1,56 +0,0 @@
|
|||
#!/usr/bin/env bash
#
# Local development: Gateway (with Telegram bot) + Desktop + Web (for login)
#
# Usage:
#   pnpm dev:local
#
# Loads every variable from the repo-root .env and requires TELEGRAM_BOT_TOKEN.
# Gateway runs on port 4000 in long-polling mode (no TELEGRAM_WEBHOOK_URL needed).
# Web app runs on port 3000 (default) for the OAuth login flow.
# Desktop connects to the local Gateway and uses the local Web app for login.

set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
ROOT_DIR="$SCRIPT_DIR/.."
ENV_FILE="$ROOT_DIR/.env"

# The .env file is mandatory.
[ -f "$ENV_FILE" ] || {
  echo "Error: .env file not found at $ENV_FILE"
  echo "Copy .env.example to .env and fill in TELEGRAM_BOT_TOKEN"
  exit 1
}

# Export everything defined in .env to the child processes started below.
set -a
source "$ENV_FILE"
set +a

[ -n "${TELEGRAM_BOT_TOKEN:-}" ] || {
  echo "Error: TELEGRAM_BOT_TOKEN not set in .env"
  exit 1
}

cat <<'BANNER'
Starting local dev environment...
 Gateway: http://localhost:4000 (Telegram long-polling mode)
 Web: http://localhost:3000 (OAuth login)
 Desktop: connecting to local Gateway + Web
 Data dir: ~/.super-multica-dev (isolated from production)
 Workspace: ~/Documents/Multica-dev (isolated from production)

BANNER

# Build shared packages first
pnpm turbo build --filter=@multica/types --filter=@multica/utils --filter=@multica/core

# One command per concurrently process, in the same order as the -n / -c lists.
# Gateway uses PORT=4000 to avoid conflict with Web app on port 3000.
service_commands=(
  "pnpm --filter @multica/types dev"
  "pnpm --filter @multica/utils dev"
  "pnpm --filter @multica/core dev"
  "PORT=4000 SMC_DATA_DIR=~/.super-multica-dev MULTICA_WORKSPACE_DIR=~/Documents/Multica-dev MULTICA_RUN_LOG=1 pnpm --filter @multica/gateway dev"
  "MULTICA_API_URL=https://api-dev.copilothub.ai pnpm --filter @multica/web dev"
  "GATEWAY_URL=http://localhost:4000 MAIN_VITE_WEB_URL=http://localhost:3000 SMC_DATA_DIR=~/.super-multica-dev MULTICA_WORKSPACE_DIR=~/Documents/Multica-dev MULTICA_RUN_LOG=1 pnpm --filter @multica/desktop dev"
)

# Start everything; exec replaces this shell with the concurrently process.
exec pnpm concurrently \
  -n types,utils,core,gateway,web,desktop \
  -c blue,green,yellow,magenta,red,cyan \
  "${service_commands[@]}"
|
||||
|
|
@ -1,13 +0,0 @@
|
|||
Complete a high-complexity investment research task:
|
||||
|
||||
Objective: Analyze the top 10 US stocks by market capitalization across their most recent three complete fiscal years and provide investment recommendations for 2026 (2026-01-01 to 2026-12-31).
|
||||
|
||||
Requirements:
|
||||
1. Use "top 10 US stocks by market cap as of 2026-02-01" as the sample; if data windows are incomplete for certain companies, note this and substitute with the most recent available complete fiscal year.
|
||||
2. Generate 1 detailed analysis per company, covering at minimum: revenue and profit structure, gross/operating margin trends, cash flow quality, capex and buybacks/dividends, valuation range, and key risks.
|
||||
3. Generate an Excel file (.xlsx) with at least 4 sheets: `raw_data`, `company_scorecard`, `valuation`, `risk_matrix`.
|
||||
4. Generate a comprehensive report with cross-company comparison and tiering (core holding / watchlist / avoid), along with 2026 portfolio recommendations (including position ranges and trigger conditions).
|
||||
5. Output a separate `sources.md` listing key data source links and retrieval timestamps.
|
||||
6. If unable to generate xlsx directly, explain why and provide structurally equivalent CSV files.
|
||||
|
||||
Execution requirements: First present an 8-12 step execution plan, then execute. Conclude with a self-check checklist confirming all files are complete.
|
||||
|
|
@ -1,14 +0,0 @@
|
|||
Build an "AI Value Chain Fundamentals & Valuation Scorecard" project.
|
||||
|
||||
Stock universe: NVDA, AMD, AVGO, MSFT, GOOGL, AMZN, META, TSM, ASML, ANET.
|
||||
Time range: 2023-01-01 to 2025-12-31 (fill gaps with most recent available data and flag accordingly).
|
||||
|
||||
Requirements:
|
||||
1. Construct a 100-point scoring model with at least 6 dimensions: growth, profitability, capital efficiency, R&D intensity, cash flow quality, and valuation margin of safety.
|
||||
2. Provide weights and scoring logic for each dimension; must be reproducible.
|
||||
3. Generate an Excel file (.xlsx) with sheets: `input_data`, `factor_scores`, `weighted_rank`, `scenario_2026`.
|
||||
4. In `scenario_2026`, provide target ranges and trigger signals under three scenarios (optimistic / base / conservative).
|
||||
5. Produce `investment_memo.md` (including entry logic for the top 3 and avoidance logic for the bottom 3).
|
||||
6. Produce `sources.md` (source links + dates).
|
||||
|
||||
Execution requirements: Plan before executing; conclude with a "reproducibility check" (i.e., could someone else reproduce your results by following your steps?).
|
||||
|
|
@ -1,15 +0,0 @@
|
|||
Perform a "US Major Bank 2026 Stress Test" task.
|
||||
|
||||
Sample: JPM, BAC, C, WFC, GS, MS.
|
||||
|
||||
Requirements:
|
||||
1. Compile key metrics from the most recent three complete fiscal years (preferably 2023-2025): net interest margin (NIM), CET1, loan loss provisions, commercial real estate (CRE) exposure, deposit cost changes, unrealized losses, etc.
|
||||
2. Construct two stress scenarios:
|
||||
- Mild Recession: unemployment +150bp, federal funds rate -100bp
|
||||
- Severe Recession: unemployment +300bp, federal funds rate -200bp, CRE default rate significantly higher
|
||||
3. Estimate directional changes in profit and capital adequacy for each bank under both scenarios, and rank vulnerability.
|
||||
4. Generate an Excel file (.xlsx) with sheets: `bank_raw`, `stress_assumptions`, `impact_estimate`, `ranking`.
|
||||
5. Generate `risk_brief.md` containing "top 5 risk signals to watch."
|
||||
6. Generate `sources.md`.
|
||||
|
||||
Execution requirements: Present methodology first, then results; conclude by listing the 3 assumptions you are least confident about.
|
||||
|
|
@ -1,14 +0,0 @@
|
|||
Perform a "US Consumer Sector & Macro Variable Linkage Analysis."
|
||||
|
||||
Sample companies: WMT, COST, TGT, HD, LOW, MCD, SBUX, NKE, DIS, AMZN.
|
||||
Time range: 2023-01-01 to 2025-12-31.
|
||||
|
||||
Requirements:
|
||||
1. Split companies into "consumer staples" and "consumer discretionary" groups; compare revenue growth, margins, inventory changes, same-store sales (if available), and cash flow quality.
|
||||
2. Analyze each group's earnings elasticity relative to macro variables (CPI, real wages, unemployment, interest rates).
|
||||
3. Build a "2026 three-scenario" earnings elasticity matrix: soft landing / reflation / recession.
|
||||
4. Generate an Excel file (.xlsx) with sheets: `company_metrics`, `macro_series`, `elasticity_matrix`, `portfolio_actions`.
|
||||
5. Generate `strategy_note.md` with 2026 sector allocation recommendations and rebalancing trigger conditions.
|
||||
6. Generate `sources.md`.
|
||||
|
||||
Execution requirements: Each allocation recommendation must explicitly state the verifiable metrics behind it.
|
||||
|
|
@ -1,17 +0,0 @@
|
|||
Complete an "Energy Price Shock Sensitivity Analysis for Energy & Transport Sectors."
|
||||
|
||||
Sample: XOM, CVX, COP, SLB, DAL, UAL, FDX, UPS.
|
||||
Time range: 2023-01-01 to 2025-12-31.
|
||||
|
||||
Requirements:
|
||||
1. Summarize each company's sensitivity direction to oil/fuel costs and sources of operating leverage.
|
||||
2. Construct three oil price paths for 2026:
|
||||
- Scenario A: WTI average $60
|
||||
- Scenario B: WTI average $80
|
||||
- Scenario C: WTI average $100
|
||||
3. Estimate directional changes in earnings and valuation for each company under each scenario (ranges are acceptable in place of point estimates, but the rationale must be provided).
|
||||
4. Generate an Excel file (.xlsx) with sheets: `raw_financials`, `oil_scenarios`, `sensitivity_map`, `trade_ideas`.
|
||||
5. Generate `hedge_plan.md` proposing at least 2 hedging or paired trade strategies, including conditions under which they would fail.
|
||||
6. Generate `sources.md`.
|
||||
|
||||
Execution requirements: Conclusions must include "base position + hedge position + trigger thresholds."
|
||||
|
|
@ -1,16 +0,0 @@
|
|||
Build a "Cross-Asset Tactical Allocation (2026)" project.
|
||||
|
||||
Asset universe: SPY, QQQ, IWM, TLT, IEF, HYG, GLD, DBC, BTC-USD.
|
||||
Historical period: 2021-01-01 to 2025-12-31 (monthly frequency is sufficient).
|
||||
|
||||
Requirements:
|
||||
1. Calculate and compare key metrics: annualized return, volatility, maximum drawdown, Sharpe ratio, and correlation matrix.
|
||||
2. Design two portfolios:
|
||||
- Defensive (target: minimize maximum drawdown)
|
||||
- Offensive (target: higher risk-adjusted returns)
|
||||
3. Stress test both portfolios under three 2026 scenarios (growth slowdown / inflation resurgence / liquidity easing), and provide rebalancing rules.
|
||||
4. Generate an Excel file (.xlsx) with sheets: `price_returns`, `risk_metrics`, `corr_matrix`, `portfolio_defensive`, `portfolio_offensive`, `scenario_test`.
|
||||
5. Generate `allocation_memo.md` explaining why these two portfolios are actionable in 2026.
|
||||
6. Generate `sources.md`.
|
||||
|
||||
Execution requirements: Explicitly state rebalancing frequency, stop-loss rules, and re-entry conditions for each portfolio.
|
||||
|
|
@ -1,13 +0,0 @@
|
|||
Perform a "REIT Investment Screening in a High-Rate Environment" task.
|
||||
|
||||
Sample: VNQ, PLD, AMT, EQIX, O, SPG, PSA, DLR.
|
||||
|
||||
Requirements:
|
||||
1. Compile key metrics from the most recent three complete fiscal years: FFO/AFFO growth, leverage, interest coverage, debt maturity profile, and dividend coverage.
|
||||
2. Design three 2026 interest rate scenarios (10Y Treasury yield at 3.5% / 4.5% / 5.5%) and analyze valuation pressure and dividend sustainability.
|
||||
3. Classify each as "hold / watchlist / avoid" and explain the 2-3 most critical driving factors.
|
||||
4. Generate an Excel file (.xlsx) with sheets: `reit_raw`, `debt_profile`, `rate_scenarios`, `selection_result`.
|
||||
5. Generate `reit_investment_note.md`.
|
||||
6. Generate `sources.md`.
|
||||
|
||||
Execution requirements: If data is missing, it must be explicitly marked as NA in the tables; silent omission is not allowed.
|
||||
|
|
@ -1,13 +0,0 @@
|
|||
Perform an "Earnings Quality Forensic Analysis."
|
||||
|
||||
Sample: AAPL, MSFT, GOOGL, AMZN, META, NVDA, TSLA, BRK.B, UNH, JPM.
|
||||
Time range: 2023-01-01 to 2025-12-31.
|
||||
|
||||
Requirements:
|
||||
1. Establish an earnings quality inspection framework covering at minimum: accruals quality, operating cash flow to net income matching, stock-based compensation dilution, buyback-to-debt relationship, and one-time item impact.
|
||||
2. Assign each company a Red / Yellow / Green rating with traceable supporting evidence.
|
||||
3. Generate an Excel file (.xlsx) with sheets: `quality_raw`, `forensic_flags`, `rating_summary`, `watchlist_2026`.
|
||||
4. Generate `forensic_report.md` summarizing the 5 most concerning red flags.
|
||||
5. Generate `sources.md`.
|
||||
|
||||
Execution requirements: The report must clearly distinguish "which conclusions are factual vs. which are inferred."
|
||||
|
|
@ -1,14 +0,0 @@
|
|||
Perform a "Post-Earnings Announcement Drift (PEAD) Strategy Feasibility Study."
|
||||
|
||||
Research period: 2023-01-01 to 2025-12-31.
|
||||
Sample: Select at least 30 US large/mid-cap stocks (provide selection criteria).
|
||||
|
||||
Requirements:
|
||||
1. Define an executable PEAD signal (e.g., information from the 1-3 days after the earnings release, an earnings surprise proxy, or a post-announcement momentum proxy) and explain its limitations.
|
||||
2. Group the sample (high signal / low signal) and analyze performance differences at 1-month and 3-month horizons.
|
||||
3. Add basic risk controls (position limits, stop-loss, sector exposure limits) and evaluate whether the strategy warrants a small-scale pilot in 2026.
|
||||
4. Generate an Excel file (.xlsx) with sheets: `universe`, `signal_definition`, `group_performance`, `risk_controls`, `pilot_plan_2026`.
|
||||
5. Generate `pead_study.md` (covering methodology, results, sources of bias, and implementation recommendations).
|
||||
6. Generate `sources.md`.
|
||||
|
||||
Execution requirements: Must provide "failure scenarios" and objective conditions for "stopping the pilot."
|
||||
|
|
@ -1,12 +0,0 @@
|
|||
Produce a "Q2 2026 Investment Committee Materials Pack."
|
||||
|
||||
Objective: Create meeting-ready investment committee documents for a USD multi-asset portfolio.
|
||||
|
||||
Requirements:
|
||||
1. Output a summary document `committee_pack.md` with at least the following sections: macro outlook, equities, rates, credit, commodities, portfolio risk, and action list.
|
||||
2. Output an Excel workbook (.xlsx) with at least these sheets: `macro_dashboard`, `equity_watchlist`, `rates_credit`, `commodity_view`, `portfolio_risk`, `action_tracker`.
|
||||
3. In `action_tracker`, provide actionable items for Q2 2026, each with: trigger condition, target position change, risk control threshold, and review date.
|
||||
4. Additionally output `devil_advocate.md`, specifically rebutting your own core investment views with at least 5 counter-arguments.
|
||||
5. Additionally output `sources.md` listing key data sources and dates.
|
||||
|
||||
Execution requirements: Plan first, then execute; conclude with a "10-minute oral briefing outline for the investment committee."
|
||||
|
|
@ -1,166 +0,0 @@
|
|||
#!/usr/bin/env bash
#
# Runs every case prompt in ./cases against each configured provider, in
# parallel, and collects one result row per (provider, case) into a manifest.
#
# Environment overrides (defaults in parentheses):
#   TIMESTAMP         run identifier (current time, YYYYmmdd-HHMMSS)
#   OUT_DIR           output directory (<repo>/.context/finance-e2e-runs/<TIMESTAMP>)
#   SMC_DATA_DIR      agent data directory ($HOME/.super-multica-e2e)
#   MULTICA_API_URL   API endpoint (https://api-dev.copilothub.ai)
#   PROVIDERS         space-separated provider list ("kimi-coding claude-code")
#   CASE_GLOB         file-name pattern for case files (case-*.txt)
#   CASE_TIMEOUT_SEC  per-case timeout in seconds; <= 0 disables it (900)
#   MAX_PARALLEL      number of concurrent workers, must be > 0 (2)
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
ROOT_DIR="$(cd "${SCRIPT_DIR}/../.." && pwd)"
CASES_DIR="${SCRIPT_DIR}/cases"
TIMESTAMP="${TIMESTAMP:-$(date +%Y%m%d-%H%M%S)}"
OUT_DIR="${OUT_DIR:-${ROOT_DIR}/.context/finance-e2e-runs/${TIMESTAMP}}"
RESULTS_DIR="${OUT_DIR}/results"
MANIFEST="${OUT_DIR}/manifest.tsv"

# Required environment for agent-driven E2E with web_search/data tools.
SMC_DATA_DIR="${SMC_DATA_DIR:-$HOME/.super-multica-e2e}"
MULTICA_API_URL="${MULTICA_API_URL:-https://api-dev.copilothub.ai}"
PROVIDERS_RAW="${PROVIDERS:-kimi-coding claude-code}"
CASE_GLOB="${CASE_GLOB:-case-*.txt}"
CASE_TIMEOUT_SEC="${CASE_TIMEOUT_SEC:-900}"
MAX_PARALLEL="${MAX_PARALLEL:-2}"

# Reject anything that is not an integer up front. Previously a non-numeric
# value left the timeout "enabled" and then broke the arithmetic comparison in
# the worker, and a negative value caused an immediate timeout instead of
# disabling the timeout as the `<= 0` check intends.
if ! [[ "${CASE_TIMEOUT_SEC}" =~ ^-?[0-9]+$ ]]; then
  echo "CASE_TIMEOUT_SEC must be an integer number of seconds (<= 0 disables the timeout), got: ${CASE_TIMEOUT_SEC}" >&2
  exit 1
fi

TIMEOUT_ENABLED="true"
if (( CASE_TIMEOUT_SEC <= 0 )); then
  TIMEOUT_ENABLED="false"
fi

if ! [[ "${MAX_PARALLEL}" =~ ^[0-9]+$ ]] || (( MAX_PARALLEL <= 0 )); then
  echo "MAX_PARALLEL must be a positive integer, got: ${MAX_PARALLEL}" >&2
  exit 1
fi
|
||||
|
||||
# Worker mode: `--worker <provider> <case-file>`.
#
# Runs a single case against a single provider, enforces the optional timeout,
# and writes exactly one TSV row to ${RESULTS_DIR}/<provider>-<case>.tsv.
# The worker always exits 0; the outcome is carried in the row's status column
# (success | failed | timeout).
if [[ "${1:-}" == "--worker" ]]; then
  provider="${2:?missing provider}"
  case_file="${3:?missing case file}"
  case_base="$(basename "${case_file}")"
  case_id="${case_base%.txt}"
  log_file="${OUT_DIR}/${provider}-${case_id}.log"
  result_file="${RESULTS_DIR}/${provider}-${case_id}.tsv"

  prompt="$(cat "${case_file}")"

  status="success"
  timed_out="false"
  started_epoch="$(date +%s)"
  started_at="$(date -u +%Y-%m-%dT%H:%M:%SZ)"

  # Run the agent in the background so the loop below can enforce the timeout.
  SMC_DATA_DIR="${SMC_DATA_DIR}" \
  MULTICA_API_URL="${MULTICA_API_URL}" \
  pnpm multica run --run-log --provider "${provider}" "${prompt}" > "${log_file}" 2>&1 &
  cmd_pid=$!

  # Poll every 2 seconds until the command exits or the timeout elapses.
  while kill -0 "${cmd_pid}" 2>/dev/null; do
    if [[ "${TIMEOUT_ENABLED}" == "true" ]]; then
      now="$(date +%s)"
      elapsed="$((now - started_epoch))"
      if (( elapsed >= CASE_TIMEOUT_SEC )); then
        timed_out="true"
        # Ask politely first, then force-kill after a one-second grace period.
        kill "${cmd_pid}" 2>/dev/null || true
        sleep 1
        kill -9 "${cmd_pid}" 2>/dev/null || true
        break
      fi
    fi
    sleep 2
  done

  exit_code=0
  wait "${cmd_pid}" 2>/dev/null || exit_code=$?
  ended_epoch="$(date +%s)"
  ended_at="$(date -u +%Y-%m-%dT%H:%M:%SZ)"
  duration_sec="$((ended_epoch - started_epoch))"

  # A run only counts as successful if it exited 0, produced output, and
  # reported a "[session: ...]" marker.
  # grep is used instead of ripgrep: when `rg` is not installed, `! rg -q`
  # evaluates true and every case would be misreported as failed. `-a` keeps
  # grep in text mode even if the log contains binary bytes.
  if [[ "${timed_out}" == "true" ]]; then
    status="timeout"
    printf "\n[runner] timed out after %ss\n" "${CASE_TIMEOUT_SEC}" >> "${log_file}"
  elif (( exit_code != 0 )); then
    status="failed"
  elif [[ ! -s "${log_file}" ]]; then
    status="failed"
  elif ! grep -aq '\[session: ' "${log_file}"; then
    status="failed"
  fi

  # Take the last reported marker of each kind; `|| true` keeps pipefail from
  # aborting the worker when a marker is absent.
  session_id="$(grep -aoE '\[session: [^]]+\]' "${log_file}" | tail -n 1 | sed -E 's/\[session: ([^]]+)\]/\1/' || true)"
  session_dir="$(grep -aoE '\[session-dir: [^]]+\]' "${log_file}" | tail -n 1 | sed -E 's/\[session-dir: ([^]]+)\]/\1/' || true)"

  # Column order must match the manifest header written by the orchestrator.
  printf "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" \
    "${TIMESTAMP}" \
    "${provider}" \
    "${case_id}" \
    "${status}" \
    "${session_id}" \
    "${session_dir}" \
    "${log_file}" \
    "${started_at}" \
    "${ended_at}" \
    "${duration_sec}" \
    "${exit_code}" > "${result_file}"

  printf "[worker] provider=%s case=%s status=%s duration=%ss session=%s\n" \
    "${provider}" \
    "${case_id}" \
    "${status}" \
    "${duration_sec}" \
    "${session_id:-N/A}"
  exit 0
fi
|
||||
|
||||
# Orchestrator mode: fan every (provider, case) pair out to parallel workers,
# then merge the per-task result rows into the manifest and print a summary.

# Print the regular files directly inside directory $1 whose names match
# pattern $2, one per line, sorted.
list_sorted_files() {
  find "$1" -maxdepth 1 -type f -name "$2" | sort
}

# Print how many manifest data rows (header excluded) have status $1.
count_status() {
  awk -F '\t' -v wanted="$1" 'NR>1 && $4==wanted {c++} END{print c+0}' "${MANIFEST}"
}

mkdir -p "${OUT_DIR}" "${RESULTS_DIR}"
printf "timestamp\tprovider\tcase_id\tstatus\tsession_id\tsession_dir\tlog_file\tstarted_at\tended_at\tduration_sec\texit_code\n" > "${MANIFEST}"

# Split the space-separated provider list into an array.
read -r -a PROVIDERS <<< "${PROVIDERS_RAW}"

CASE_FILES=()
while IFS= read -r case_path; do
  CASE_FILES+=("${case_path}")
done < <(list_sorted_files "${CASES_DIR}" "${CASE_GLOB}")

if (( ${#CASE_FILES[@]} == 0 )); then
  echo "No case files matched ${CASE_GLOB} in ${CASES_DIR}" >&2
  exit 1
fi

echo "Output directory: ${OUT_DIR}"
echo "Using SMC_DATA_DIR=${SMC_DATA_DIR}"
echo "Using MULTICA_API_URL=${MULTICA_API_URL}"
echo "Providers: ${PROVIDERS[*]}"
echo "Cases: ${#CASE_FILES[@]}"
echo "Max parallel: ${MAX_PARALLEL}"
case "${TIMEOUT_ENABLED}" in
  true) echo "Case timeout: ${CASE_TIMEOUT_SEC}s" ;;
  *) echo "Case timeout: disabled" ;;
esac

# Flat list of alternating provider / case-file entries; xargs consumes them
# two at a time as the worker's arguments.
TASKS=()
for provider in "${PROVIDERS[@]}"; do
  for case_file in "${CASE_FILES[@]}"; do
    TASKS+=("${provider}" "${case_file}")
  done
done

echo "Total tasks: $(( ${#TASKS[@]} / 2 ))"

# Workers are fresh bash processes, so they only see exported settings.
export TIMESTAMP OUT_DIR RESULTS_DIR SMC_DATA_DIR MULTICA_API_URL CASE_TIMEOUT_SEC TIMEOUT_ENABLED
printf '%s\0' "${TASKS[@]}" | xargs -0 -n 2 -P "${MAX_PARALLEL}" bash "${BASH_SOURCE[0]}" --worker

RESULT_FILES=()
while IFS= read -r result_path; do
  RESULT_FILES+=("${result_path}")
done < <(list_sorted_files "${RESULTS_DIR}" "*.tsv")

if (( ${#RESULT_FILES[@]} == 0 )); then
  echo "No result files produced in ${RESULTS_DIR}" >&2
  exit 1
fi

# Append every per-task row to the manifest, in sorted file order.
cat "${RESULT_FILES[@]}" >> "${MANIFEST}"

success_count="$(count_status success)"
failed_count="$(count_status failed)"
timeout_count="$(count_status timeout)"

echo
echo "Completed. Manifest: ${MANIFEST}"
echo "Summary: success=${success_count} failed=${failed_count} timeout=${timeout_count}"
|
||||
|
|
@ -1,441 +0,0 @@
|
|||
#!/usr/bin/env node
|
||||
|
||||
import { existsSync, readFileSync, writeFileSync } from "node:fs";
|
||||
import { dirname, join, resolve } from "node:path";
|
||||
|
||||
/**
|
||||
* @typedef {{
|
||||
* id: string;
|
||||
* check: string;
|
||||
* passed: boolean;
|
||||
* detail?: string;
|
||||
* }} CheckResult
|
||||
*/
|
||||
|
||||
/**
|
||||
* @typedef {{
|
||||
* provider: string;
|
||||
* caseId: string;
|
||||
* status: string;
|
||||
* sessionId: string;
|
||||
* sessionDir: string;
|
||||
* logFile: string;
|
||||
* checks: CheckResult[];
|
||||
* pass: boolean;
|
||||
* }} CaseAnalysis
|
||||
*/
|
||||
|
||||
// Command-line handling: the only positional argument is the path to the
// manifest.tsv produced by the benchmark runner.
const USAGE = "Usage: node scripts/e2e-skills-benchmark/analyze.mjs <manifest.tsv>";

const manifestArg = process.argv[2];
if (manifestArg === "--help" || manifestArg === "-h") {
  // Explicit help request: print usage and succeed.
  console.log(USAGE);
  process.exit(0);
}
if (!manifestArg) {
  // A missing argument is a usage error; exit non-zero so callers (CI, the
  // runner script) do not mistake it for a successful analysis.
  console.error(USAGE);
  process.exit(1);
}

const manifestPath = resolve(manifestArg);
if (!existsSync(manifestPath)) {
  console.error(`Manifest not found: ${manifestPath}`);
  process.exit(1);
}
|
||||
|
||||
/**
 * Expectations per benchmark case, looked up by the manifest's case_id column.
 *
 * Supported keys (all optional):
 * - requireExecUsage: set to false when the case may finish without any exec call.
 * - requiredCommandTokens: each inner list must be fully contained
 *   (case-insensitively) in at least one exec command.
 * - forbiddenCommandTokens: no exec command may contain all tokens of an inner list.
 * - requiredEventTokens: each inner list must be fully contained in the JSON
 *   text of at least one run-log event.
 * - requiredResponseRegex: each pattern (compiled with the "i" flag) must
 *   match the final response text.
 */
const CASE_RULES = {
  // Install flow: search, install, and security review of a CalDAV skill.
  "case-01-install-caldav-calendar": {
    requiredCommandTokens: [["clawhub", "search"], ["caldav"], ["clawhub", "install"], ["review-skill-security.mjs"]],
  },

  // Gap discovery followed by install for Home Assistant.
  "case-02-gap-discovery-homeassistant": {
    requiredCommandTokens: [["clawhub", "search"], ["home", "assistant"], ["clawhub", "install"], ["review-skill-security.mjs"]],
  },

  // Install plus update regression for CodexMonitor.
  "case-03-install-update-codexmonitor": {
    requiredCommandTokens: [
      ["clawhub", "search"],
      ["codexmonitor"],
      ["clawhub", "install"],
      ["clawhub", "update"],
      ["review-skill-security.mjs"],
    ],
  },

  // Spotify UX case: checked through the final response; install/update and
  // direct-control commands are forbidden.
  "case-04-gap-discovery-spotify-ux": {
    requireExecUsage: false,
    requiredResponseRegex: [
      "缺少|没有.*(技能|能力|集成)|capability gap",
      "clawhub|cloud\\s*hub|cloudhub",
      "安装|install",
      "是否|要不要|would you like|do you want",
      "安全|审查|security|review",
    ],
    forbiddenCommandTokens: [
      ["clawhub", "install"],
      ["clawhub", "update"],
      ["osascript"],
      ["spogo"],
      ["spotify_player"],
      ["ha.sh"],
      ["/api/states"],
    ],
  },

  // Notion UX case: requires a search, an install_guard "blocked" event, and
  // a final response that mentions install, consent, and authorization.
  "case-05-gap-discovery-notion-ux": {
    requireExecUsage: false,
    requiredCommandTokens: [["clawhub", "search"], ["notion"]],
    requiredEventTokens: [["install_guard", "blocked"]],
    requiredResponseRegex: [
      "notion",
      "安装|install",
      "是否|要不要|would you like|do you want|同意",
      "token|授权|integration",
    ],
    forbiddenCommandTokens: [["osascript"], ["spogo"], ["spotify_player"], ["ha.sh"], ["/api/states"]],
  },
};
|
||||
|
||||
/**
 * Break text into lines on LF or CRLF and discard empty lines.
 *
 * @param {string} text
 * @returns {string[]} the non-empty lines, in their original order
 */
function splitLines(text) {
  const kept = [];
  for (const piece of text.split(/\r?\n/)) {
    if (piece) kept.push(piece);
  }
  return kept;
}
|
||||
|
||||
/**
 * Report whether a command contains every given token as a substring,
 * ignoring letter case.
 *
 * @param {string} command
 * @param {string[]} tokens
 * @returns {boolean} true when no token is missing (also for an empty list)
 */
function commandHasTokens(command, tokens) {
  const haystack = command.toLowerCase();
  const isMissing = (token) => !haystack.includes(token.toLowerCase());
  return !tokens.some(isMissing);
}
|
||||
|
||||
/**
 * Recover the shell command from a tool call's raw argument string.
 *
 * When the string is valid JSON with a string `command` property, that
 * property is returned. Otherwise the raw string is returned unchanged, for
 * example when the run log truncated the JSON.
 *
 * @param {string} rawArgs
 * @returns {string} the command text, or "" for empty input
 */
function extractCommand(rawArgs) {
  if (!rawArgs) return "";

  let decoded;
  try {
    decoded = JSON.parse(rawArgs);
  } catch {
    // Not parseable (possibly truncated): use the raw text as the command.
    return rawArgs;
  }

  const hasCommand = Boolean(decoded) && typeof decoded.command === "string";
  return hasCommand ? decoded.command : rawArgs;
}
|
||||
|
||||
/**
 * Test text against a regular-expression source string, case-insensitively.
 *
 * @param {string} text
 * @param {string} pattern regular-expression source (compiled with the "i" flag)
 * @returns {boolean} true on a match; false on no match or when the pattern
 *   cannot be compiled
 */
function textMatchesPattern(text, pattern) {
  let matched = false;
  try {
    matched = new RegExp(pattern, "i").test(text);
  } catch {
    // Invalid pattern: treated as "no match".
  }
  return matched;
}
|
||||
|
||||
/**
 * Read a JSON-lines run log and return its event objects.
 *
 * Empty lines and lines that are not valid JSON are skipped. Lines that parse
 * to `null` or to a primitive (number, string, boolean) are skipped too:
 * callers read properties such as `event` on every entry, and doing so on
 * `null` would throw.
 *
 * @param {string} runLogPath path to run-log.jsonl
 * @returns {object[]} parsed event objects, in file order
 */
function parseRunLog(runLogPath) {
  const events = [];
  for (const line of readFileSync(runLogPath, "utf-8").split(/\r?\n/)) {
    if (!line) continue;

    let parsed;
    try {
      parsed = JSON.parse(line);
    } catch {
      // Ignore malformed lines but keep analysis alive.
      continue;
    }

    if (parsed !== null && typeof parsed === "object") {
      events.push(parsed);
    }
  }
  return events;
}
|
||||
|
||||
/**
 * Return the text of the last assistant message in a JSON-lines session file.
 *
 * Only entries with `type === "message"` and `message.role === "assistant"`
 * are considered. String content is taken as-is. Array content contributes its
 * `text` parts joined with newlines and trimmed; an array without any text
 * leaves the previously found text in place.
 *
 * @param {string} sessionPath path to session.jsonl
 * @returns {string} the latest assistant text, or "" when the file is missing
 *   or holds no assistant text
 */
function parseFinalAssistantText(sessionPath) {
  if (!existsSync(sessionPath)) return "";

  let finalText = "";
  for (const raw of readFileSync(sessionPath, "utf-8").split(/\r?\n/)) {
    if (!raw) continue;

    let record;
    try {
      record = JSON.parse(raw);
    } catch {
      // Ignore malformed lines.
      continue;
    }

    if (record?.type !== "message") continue;
    const message = record.message;
    if (!message || message.role !== "assistant") continue;

    const content = message.content;
    if (typeof content === "string") {
      finalText = content;
    } else if (Array.isArray(content)) {
      const pieces = [];
      for (const part of content) {
        if (part && part.type === "text" && typeof part.text === "string") {
          pieces.push(part.text);
        }
      }
      const joined = pieces.join("\n").trim();
      if (joined) finalText = joined;
    }
  }

  return finalText;
}
|
||||
|
||||
/**
|
||||
* @param {CaseAnalysis} analysis
|
||||
* @param {string} id
|
||||
* @param {string} check
|
||||
* @param {boolean} passed
|
||||
* @param {string} [detail]
|
||||
*/
|
||||
function addCheck(analysis, id, check, passed, detail) {
|
||||
analysis.checks.push({ id, check, passed, detail });
|
||||
}
|
||||
|
||||
// Read the manifest: the first non-empty line is the header, every further
// line describes one provider/case run.
const rows = splitLines(readFileSync(manifestPath, "utf-8"));
if (rows.length <= 1) {
  console.error(`Manifest has no data rows: ${manifestPath}`);
  process.exit(1);
}

/** @type {CaseAnalysis[]} */
const analyses = [];

for (let i = 1; i < rows.length; i++) {
  const row = rows[i];
  if (!row) continue;

  const cols = row.split("\t");
  if (cols.length < 11) {
    // Still skipped, but no longer silently: a truncated row means a case is
    // missing from the analysis and from the pass/fail totals.
    console.error(
      `Skipping malformed manifest row (non-empty line ${i + 1}): expected 11 columns, got ${cols.length}`,
    );
    continue;
  }

  const provider = cols[1] ?? "";
  const caseId = cols[2] ?? "";
  // Own-property lookup so a case id such as "constructor" cannot resolve to
  // something inherited from Object.prototype.
  const rules = Object.prototype.hasOwnProperty.call(CASE_RULES, caseId) ? CASE_RULES[caseId] : undefined;
  const status = cols[3] ?? "";
  const sessionId = cols[4] ?? "";
  const sessionDir = cols[5] ?? "";
  const logFile = cols[6] ?? "";

  /** @type {CaseAnalysis} */
  const analysis = {
    provider,
    caseId,
    status,
    sessionId,
    sessionDir,
    logFile,
    checks: [],
    pass: false,
  };

  addCheck(
    analysis,
    "run-status",
    "runner status is success",
    status === "success",
    `status=${status}`,
  );

  // Without a session directory there is nothing further to inspect; the
  // analysis is recorded with pass=false.
  if (!sessionDir) {
    addCheck(analysis, "session-dir", "session_dir exists in manifest", false, "missing session_dir");
    analyses.push(analysis);
    continue;
  }

  const runLogPath = join(sessionDir, "run-log.jsonl");
  const hasRunLog = existsSync(runLogPath);
  addCheck(
    analysis,
    "run-log-file",
    "run-log.jsonl exists",
    hasRunLog,
    runLogPath,
  );

  if (!hasRunLog) {
    analyses.push(analysis);
    continue;
  }

  const events = parseRunLog(runLogPath);
  const sessionPath = join(sessionDir, "session.jsonl");
  const finalAssistantText = parseFinalAssistantText(sessionPath);
  // Optional chaining: a run-log line may parse to null, which has no properties.
  const runStarts = events.filter((e) => e?.event === "run_start");
  const runEnds = events.filter((e) => e?.event === "run_end");
  const toolStarts = events.filter((e) => e?.event === "tool_start");
  const toolEnds = events.filter((e) => e?.event === "tool_end");
  const errorToolEnds = toolEnds.filter((e) => e.is_error === true);

  addCheck(analysis, "event-run-start", "has run_start", runStarts.length > 0, `count=${runStarts.length}`);
  addCheck(analysis, "event-run-end", "has run_end", runEnds.length > 0, `count=${runEnds.length}`);
  addCheck(
    analysis,
    "tool-pairing",
    "tool_start count matches tool_end count",
    toolStarts.length === toolEnds.length,
    `start=${toolStarts.length} end=${toolEnds.length}`,
  );

  const finalRunEnd = runEnds.at(-1);
  const runEndError = finalRunEnd?.error;
  const finalRunText = typeof finalRunEnd?.text === "string" ? finalRunEnd.text : "";
  // Prefer the session transcript; fall back to the text on the last run_end.
  const finalResponseText = finalAssistantText || finalRunText;
  addCheck(
    analysis,
    "run-end-error",
    "final run_end.error is null/empty",
    runEndError === null || runEndError === undefined || runEndError === "",
    `error=${String(runEndError)}`,
  );

  addCheck(
    analysis,
    "tool-errors",
    "no tool_end has is_error=true",
    errorToolEnds.length === 0,
    `error_tool_calls=${errorToolEnds.length}`,
  );

  // Commands of all "exec" tool calls; only string-typed args are considered.
  const execCommands = toolStarts
    .filter((e) => e.tool === "exec")
    .map((e) => extractCommand(typeof e.args === "string" ? e.args : ""))
    .filter(Boolean);

  // Exec usage is required unless the case rules set requireExecUsage to false.
  const requireExecUsage = rules?.requireExecUsage !== false;
  addCheck(
    analysis,
    "exec-usage",
    requireExecUsage
      ? "at least one exec command was used"
      : "exec usage is optional for this case",
    requireExecUsage ? execCommands.length > 0 : true,
    requireExecUsage ? `exec_calls=${execCommands.length}` : `exec_calls=${execCommands.length} (optional)`,
  );

  if (rules) {
    if (Array.isArray(rules.requiredCommandTokens)) {
      for (let r = 0; r < rules.requiredCommandTokens.length; r++) {
        const tokenList = rules.requiredCommandTokens[r];
        const passed = execCommands.some((cmd) => commandHasTokens(cmd, tokenList));
        addCheck(
          analysis,
          `cmd-${r + 1}`,
          `exec command contains tokens: ${tokenList.join(" + ")}`,
          passed,
        );
      }
    }

    if (Array.isArray(rules.requiredEventTokens)) {
      // Tokens are matched against the lower-cased JSON text of each event.
      const eventLines = events.map((event) => JSON.stringify(event).toLowerCase());
      for (let r = 0; r < rules.requiredEventTokens.length; r++) {
        const tokenList = rules.requiredEventTokens[r];
        const passed = eventLines.some((line) =>
          tokenList.every((token) => line.includes(token.toLowerCase())),
        );
        addCheck(
          analysis,
          `event-${r + 1}`,
          `event log contains tokens: ${tokenList.join(" + ")}`,
          passed,
        );
      }
    }

    if (Array.isArray(rules.forbiddenCommandTokens)) {
      for (let r = 0; r < rules.forbiddenCommandTokens.length; r++) {
        const tokenList = rules.forbiddenCommandTokens[r];
        const passed = !execCommands.some((cmd) => commandHasTokens(cmd, tokenList));
        addCheck(
          analysis,
          `forbid-cmd-${r + 1}`,
          `exec command does not contain tokens: ${tokenList.join(" + ")}`,
          passed,
        );
      }
    }

    if (Array.isArray(rules.requiredResponseRegex)) {
      for (let r = 0; r < rules.requiredResponseRegex.length; r++) {
        const pattern = rules.requiredResponseRegex[r];
        const passed = textMatchesPattern(finalResponseText, pattern);
        addCheck(
          analysis,
          `resp-${r + 1}`,
          `final response matches regex: /${pattern}/i`,
          passed,
        );
      }
    }
  } else {
    // A case without a rule set counts as a failure rather than passing on
    // the generic checks alone.
    addCheck(
      analysis,
      "case-rules",
      "case has rule set",
      false,
      `No rules defined for case_id=${caseId}`,
    );
  }

  analysis.pass = analysis.checks.every((c) => c.passed);
  analyses.push(analysis);
}
|
||||
|
||||
// Totals over all analysed cases.
const passedCases = analyses.reduce((count, item) => (item.pass ? count + 1 : count), 0);
const failedCases = analyses.length - passedCases;

// Machine-readable report, written next to the manifest as analysis.json.
const output = {
  manifestPath,
  totalCases: analyses.length,
  passedCases,
  failedCases,
  results: analyses,
};
const outputPath = join(dirname(manifestPath), "analysis.json");
writeFileSync(outputPath, JSON.stringify(output, null, 2) + "\n", "utf-8");

// Human-readable report: one header line per case, then one line per check.
analyses.forEach((item) => {
  const status = item.pass ? "PASS" : "FAIL";
  console.log(`[${status}] provider=${item.provider} case=${item.caseId} session=${item.sessionId || "N/A"}`);
  item.checks.forEach((check) => {
    const marker = check.passed ? " [ok] " : " [bad] ";
    const detail = check.detail ? ` (${check.detail})` : "";
    console.log(`${marker}${check.check}${detail}`);
  });
});

console.log("");
console.log(`Analysis file: ${outputPath}`);
console.log(`Summary: pass=${passedCases} fail=${failedCases}`);

// A non-zero exit status signals that at least one case failed.
if (failedCases > 0) {
  process.exit(1);
}
|
||||
|
|
@ -1,15 +0,0 @@
|
|||
Run an end-to-end test for the Meta Skill Installer.
|
||||
|
||||
Goal: install a real ClawHub skill for CalDAV calendar capability.
|
||||
Reference page: https://clawhub.ai/skills/caldav-calendar
|
||||
|
||||
Follow this exact workflow:
|
||||
1. State the missing capability in one sentence.
|
||||
2. Search ClawHub for CalDAV-related skills and choose the best candidate.
|
||||
3. Stage-install to a temporary directory first (never install directly to active skills path).
|
||||
4. Run security review on the staged skill:
|
||||
DATA_DIR="${SMC_DATA_DIR:-$HOME/.super-multica}"
|
||||
node "$DATA_DIR/skills/meta-skill-installer/scripts/review-skill-security.mjs" "<staged skill path>"
|
||||
5. If riskLevel is safe, install to "$DATA_DIR/skills".
|
||||
6. Verify final install by checking "$DATA_DIR/skills/<slug>/SKILL.md" exists.
|
||||
7. Return a short report: selected slug, riskLevel, final install path.
|
||||
|
|
@ -1,16 +0,0 @@
|
|||
Run an end-to-end capability-gap discovery test for Meta Skill Installer.
|
||||
|
||||
User intent: "I need to control Home Assistant lights and switches from the agent."
|
||||
Reference page: https://clawhub.ai/skills/homeassistant
|
||||
|
||||
Requirements:
|
||||
1. Treat this as a missing capability and explicitly define the gap.
|
||||
2. Search ClawHub for relevant skills and list the top 3 candidates.
|
||||
3. Pick one candidate with rationale (scope match + lower security risk).
|
||||
4. Stage-install to a temporary directory.
|
||||
5. Run security review:
|
||||
DATA_DIR="${SMC_DATA_DIR:-$HOME/.super-multica}"
|
||||
node "$DATA_DIR/skills/meta-skill-installer/scripts/review-skill-security.mjs" "<staged skill path>"
|
||||
6. If riskLevel is safe, install to "$DATA_DIR/skills".
|
||||
7. Verify "$DATA_DIR/skills/<slug>/SKILL.md" exists.
|
||||
8. Return: candidate list, chosen slug, riskLevel, and final path.
|
||||
|
|
@ -1,16 +0,0 @@
|
|||
Run an end-to-end install+update regression test for Meta Skill Installer.
|
||||
|
||||
Goal: use a real ClawHub skill and verify install, review, and update flow.
|
||||
Reference page: https://clawhub.ai/odrobnik/codexmonitor
|
||||
|
||||
Requirements:
|
||||
1. Search ClawHub for CodexMonitor and select the matching skill slug.
|
||||
2. Stage-install to a temporary directory and run security review:
|
||||
DATA_DIR="${SMC_DATA_DIR:-$HOME/.super-multica}"
|
||||
node "$DATA_DIR/skills/meta-skill-installer/scripts/review-skill-security.mjs" "<staged skill path>"
|
||||
3. If riskLevel is safe, install to "$DATA_DIR/skills".
|
||||
4. Verify "$DATA_DIR/skills/<slug>/SKILL.md" exists.
|
||||
5. Run an update for the same slug in managed dir:
|
||||
clawhub update "<slug>" --workdir "$DATA_DIR" --dir skills --force
|
||||
6. Run security review again on the final installed path.
|
||||
7. Return: slug, initial riskLevel, update executed (yes/no), final path.
|
||||
|
|
@ -1 +0,0 @@
|
|||
随机播放 spotify 中的音乐
|
||||
|
|
@ -1 +0,0 @@
|
|||
帮我在 Notion 新建一个页面,标题是今天待办,并写入三条任务:修复登录 bug、写周报、安排评审
|
||||
|
|
@ -1,170 +0,0 @@
|
|||
#!/usr/bin/env bash
set -euo pipefail

# Locations derived from this script's own path.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
ROOT_DIR="$(cd "${SCRIPT_DIR}/../.." && pwd)"
CASES_DIR="${SCRIPT_DIR}/cases"

# Per-run output layout. TIMESTAMP and OUT_DIR may be preset by the caller;
# the parent run also exports them so every worker writes into the same place.
TIMESTAMP="${TIMESTAMP:-$(date +%Y%m%d-%H%M%S)}"
OUT_DIR="${OUT_DIR:-${ROOT_DIR}/.context/skills-e2e-runs/${TIMESTAMP}}"
RESULTS_DIR="${OUT_DIR}/results"
MANIFEST="${OUT_DIR}/manifest.tsv"

# Required environment for agent-driven E2E.
SMC_DATA_DIR="${SMC_DATA_DIR:-$HOME/.super-multica-e2e}"
MULTICA_API_URL="${MULTICA_API_URL:-https://api-dev.copilothub.ai}"
PROVIDERS_RAW="${PROVIDERS:-kimi-coding}"
CASE_GLOB="${CASE_GLOB:-case-*.txt}"
CASE_TIMEOUT_SEC="${CASE_TIMEOUT_SEC:-1200}"
MAX_PARALLEL="${MAX_PARALLEL:-1}"

# CASE_TIMEOUT_SEC is later used in arithmetic by the worker, so reject
# anything that is not an integer up front. Zero or a negative value disables
# the timeout.
if ! [[ "${CASE_TIMEOUT_SEC}" =~ ^-?[0-9]+$ ]]; then
  echo "CASE_TIMEOUT_SEC must be an integer number of seconds (<= 0 disables the timeout), got: ${CASE_TIMEOUT_SEC}" >&2
  exit 1
fi

TIMEOUT_ENABLED="true"
if (( CASE_TIMEOUT_SEC <= 0 )); then
  TIMEOUT_ENABLED="false"
fi

if ! [[ "${MAX_PARALLEL}" =~ ^[0-9]+$ ]] || (( MAX_PARALLEL <= 0 )); then
  echo "MAX_PARALLEL must be a positive integer, got: ${MAX_PARALLEL}" >&2
  exit 1
fi
|
||||
|
||||
# Worker mode: "<script> --worker <provider> <case file>".
# Runs one case against one provider, enforces the optional timeout, and
# writes a single manifest row to ${RESULTS_DIR}/<provider>-<case_id>.tsv.
if [[ "${1:-}" == "--worker" ]]; then
  provider="${2:?missing provider}"
  case_file="${3:?missing case file}"
  case_base="$(basename "${case_file}")"
  case_id="${case_base%.txt}"
  log_file="${OUT_DIR}/${provider}-${case_id}.log"
  result_file="${RESULTS_DIR}/${provider}-${case_id}.tsv"

  # The whole case file is passed to the agent as the prompt.
  prompt="$(cat "${case_file}")"

  status="success"
  timed_out="false"
  started_epoch="$(date +%s)"
  started_at="$(date -u +%Y-%m-%dT%H:%M:%SZ)"

  # Run in the background so the loop below can poll for the timeout.
  SMC_DATA_DIR="${SMC_DATA_DIR}" \
  MULTICA_API_URL="${MULTICA_API_URL}" \
  pnpm multica run --run-log --provider "${provider}" "${prompt}" > "${log_file}" 2>&1 &
  cmd_pid=$!

  # Poll every 2 seconds while the command is alive; once the timeout is
  # reached send TERM, then KILL one second later.
  while kill -0 "${cmd_pid}" 2>/dev/null; do
    if [[ "${TIMEOUT_ENABLED}" == "true" ]]; then
      now="$(date +%s)"
      elapsed="$((now - started_epoch))"
      if (( elapsed >= CASE_TIMEOUT_SEC )); then
        timed_out="true"
        kill "${cmd_pid}" 2>/dev/null || true
        sleep 1
        kill -9 "${cmd_pid}" 2>/dev/null || true
        break
      fi
    fi
    sleep 2
  done

  # Collect the exit status without letting `set -e` abort the worker.
  exit_code=0
  wait "${cmd_pid}" 2>/dev/null || exit_code=$?
  ended_epoch="$(date +%s)"
  ended_at="$(date -u +%Y-%m-%dT%H:%M:%SZ)"
  duration_sec="$((ended_epoch - started_epoch))"

  # Status precedence: timeout > non-zero exit > empty log > no session marker.
  # grep is used instead of ripgrep so the runner has no extra dependency; -a
  # forces text mode in case the log contains stray binary bytes.
  if [[ "${timed_out}" == "true" ]]; then
    status="timeout"
    printf "\n[runner] timed out after %ss\n" "${CASE_TIMEOUT_SEC}" >> "${log_file}"
  elif (( exit_code != 0 )); then
    status="failed"
  elif [[ ! -s "${log_file}" ]]; then
    status="failed"
  elif ! grep -aqF '[session: ' "${log_file}"; then
    status="failed"
  fi

  # Take the last "[session: …]" and "[session-dir: …]" markers from the log;
  # `|| true` keeps an absent marker from aborting the worker under pipefail.
  session_id="$(grep -aoE '\[session: [^]]+\]' "${log_file}" | tail -n 1 | sed -E 's/\[session: ([^]]+)\]/\1/' || true)"
  session_dir="$(grep -aoE '\[session-dir: [^]]+\]' "${log_file}" | tail -n 1 | sed -E 's/\[session-dir: ([^]]+)\]/\1/' || true)"

  # One row with 11 tab-separated columns, in the same order as the manifest header.
  printf "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" \
    "${TIMESTAMP}" \
    "${provider}" \
    "${case_id}" \
    "${status}" \
    "${session_id}" \
    "${session_dir}" \
    "${log_file}" \
    "${started_at}" \
    "${ended_at}" \
    "${duration_sec}" \
    "${exit_code}" > "${result_file}"

  printf "[worker] provider=%s case=%s status=%s duration=%ss session=%s\n" \
    "${provider}" \
    "${case_id}" \
    "${status}" \
    "${duration_sec}" \
    "${session_id:-N/A}"
  # Always exit 0: the outcome is reported through the result row, so one
  # failed case does not stop xargs from running the remaining ones.
  exit 0
fi
|
||||
|
||||
# Orchestrator mode: prepare the output directory, fan the provider/case
# matrix out to workers, merge their result rows into the manifest, and run
# the structured analysis.
mkdir -p "${OUT_DIR}"
mkdir -p "${RESULTS_DIR}"
printf "timestamp\tprovider\tcase_id\tstatus\tsession_id\tsession_dir\tlog_file\tstarted_at\tended_at\tduration_sec\texit_code\n" > "${MANIFEST}"

# PROVIDERS is a whitespace-separated list.
read -r -a PROVIDERS <<< "${PROVIDERS_RAW}"

# A whitespace-only PROVIDERS value yields an empty list; fail early with a
# clear message instead of handing xargs an empty task list.
if [[ ${#PROVIDERS[@]} -eq 0 ]]; then
  echo "No providers specified (PROVIDERS='${PROVIDERS_RAW}')" >&2
  exit 1
fi

# Collect the case files in sorted order.
CASE_FILES=()
while IFS= read -r line; do
  CASE_FILES+=("${line}")
done < <(find "${CASES_DIR}" -maxdepth 1 -type f -name "${CASE_GLOB}" | sort)

if [[ ${#CASE_FILES[@]} -eq 0 ]]; then
  echo "No case files matched ${CASE_GLOB} in ${CASES_DIR}" >&2
  exit 1
fi

echo "Output directory: ${OUT_DIR}"
echo "Using SMC_DATA_DIR=${SMC_DATA_DIR}"
echo "Using MULTICA_API_URL=${MULTICA_API_URL}"
echo "Providers: ${PROVIDERS[*]}"
echo "Cases: ${#CASE_FILES[@]}"
echo "Max parallel: ${MAX_PARALLEL}"
if [[ "${TIMEOUT_ENABLED}" == "true" ]]; then
  echo "Case timeout: ${CASE_TIMEOUT_SEC}s"
else
  echo "Case timeout: disabled"
fi

# Flat list of (provider, case file) pairs; xargs consumes two items per worker.
TASKS=()
for provider in "${PROVIDERS[@]}"; do
  for case_file in "${CASE_FILES[@]}"; do
    TASKS+=("${provider}" "${case_file}")
  done
done

echo "Total tasks: $(( ${#TASKS[@]} / 2 ))"

# Workers are fresh bash processes of this same script and read their settings
# from the environment. NUL-delimited arguments keep paths with spaces intact.
export TIMESTAMP OUT_DIR RESULTS_DIR SMC_DATA_DIR MULTICA_API_URL CASE_TIMEOUT_SEC TIMEOUT_ENABLED
printf '%s\0' "${TASKS[@]}" | xargs -0 -n 2 -P "${MAX_PARALLEL}" bash "${BASH_SOURCE[0]}" --worker

# Merge the per-worker rows into the manifest in sorted file-name order.
RESULT_FILES=()
while IFS= read -r line; do
  RESULT_FILES+=("${line}")
done < <(find "${RESULTS_DIR}" -maxdepth 1 -type f -name "*.tsv" | sort)

if [[ ${#RESULT_FILES[@]} -eq 0 ]]; then
  echo "No result files produced in ${RESULTS_DIR}" >&2
  exit 1
fi

for result_file in "${RESULT_FILES[@]}"; do
  cat "${result_file}" >> "${MANIFEST}"
done

# Column 4 of the manifest is the status; NR>1 skips the header row.
success_count="$(awk -F '\t' 'NR>1 && $4=="success" {c++} END{print c+0}' "${MANIFEST}")"
failed_count="$(awk -F '\t' 'NR>1 && $4=="failed" {c++} END{print c+0}' "${MANIFEST}")"
timeout_count="$(awk -F '\t' 'NR>1 && $4=="timeout" {c++} END{print c+0}' "${MANIFEST}")"

echo
echo "Completed run stage. Manifest: ${MANIFEST}"
echo "Run summary: success=${success_count} failed=${failed_count} timeout=${timeout_count}"

echo
echo "Running structured analysis..."
# With pipefail, a failing analysis makes this pipeline (and the script) fail.
node "${SCRIPT_DIR}/analyze.mjs" "${MANIFEST}" | tee "${OUT_DIR}/analysis.txt"
|
||||
|
|
@ -1,499 +0,0 @@
|
|||
#!/usr/bin/env bash
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
|
||||
ROOT_DIR="$(cd "$SCRIPT_DIR/.." && pwd)"
|
||||
OUT_FILE="${1:-$ROOT_DIR/docs/code-stats-report.html}"
|
||||
TMP_DIR="$(mktemp -d)"
|
||||
|
||||
# Delete the temporary working directory and everything inside it.
cleanup() {
  rm -rf "${TMP_DIR}"
}
|
||||
trap cleanup EXIT
|
||||
|
||||
cd "$ROOT_DIR"
|
||||
|
||||
if ! git rev-parse --is-inside-work-tree >/dev/null 2>&1; then
|
||||
echo "Error: this script must run inside a git repository."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# 1) Snapshot LOC from tracked files.
|
||||
while IFS= read -r -d '' file; do
|
||||
if [ -f "$file" ]; then
|
||||
wc -l "$file"
|
||||
fi
|
||||
done < <(git ls-files -z) > "$TMP_DIR/wc_all.txt"
|
||||
|
||||
awk -v out_by_ext="$TMP_DIR/loc_by_ext.tsv" -v out_totals="$TMP_DIR/loc_totals.tsv" '
|
||||
{
|
||||
lines = $1
|
||||
$1 = ""
|
||||
sub(/^ +/, "")
|
||||
file = $0
|
||||
|
||||
n = split(file, parts, "/")
|
||||
base = parts[n]
|
||||
ext = base
|
||||
|
||||
if (index(base, ".") > 0) {
|
||||
sub(/.*\./, "", ext)
|
||||
} else {
|
||||
ext = "[noext]"
|
||||
}
|
||||
|
||||
ext_lines[ext] += lines
|
||||
ext_files[ext] += 1
|
||||
files += 1
|
||||
lines_all += lines
|
||||
}
|
||||
END {
|
||||
for (e in ext_lines) {
|
||||
printf "%s\t%d\t%d\n", e, ext_files[e], ext_lines[e] > out_by_ext
|
||||
}
|
||||
|
||||
source_lines = 0
|
||||
source_files = 0
|
||||
doc_lines = 0
|
||||
doc_files = 0
|
||||
cfg_lines = 0
|
||||
cfg_files = 0
|
||||
|
||||
for (e in ext_lines) {
|
||||
if (e ~ /^(ts|tsx|js|jsx|mjs|cjs|py|css|scss|html|sh)$/) {
|
||||
source_lines += ext_lines[e]
|
||||
source_files += ext_files[e]
|
||||
}
|
||||
if (e == "md") {
|
||||
doc_lines += ext_lines[e]
|
||||
doc_files += ext_files[e]
|
||||
}
|
||||
if (e ~ /^(json|json5|yaml|yml|xsd)$/) {
|
||||
cfg_lines += ext_lines[e]
|
||||
cfg_files += ext_files[e]
|
||||
}
|
||||
}
|
||||
|
||||
printf "files\t%d\nlines\t%d\nsource_files\t%d\nsource_lines\t%d\ndoc_files\t%d\ndoc_lines\t%d\nconfig_files\t%d\nconfig_lines\t%d\n", files, lines_all, source_files, source_lines, doc_files, doc_lines, cfg_files, cfg_lines > out_totals
|
||||
}
|
||||
' "$TMP_DIR/wc_all.txt"
|
||||
|
||||
# 2) Contribution by author (email-normalized).
|
||||
git log --all --no-merges --numstat --format='@@@%aN|%aE' | awk -v out="$TMP_DIR/author_by_email.tsv" '
|
||||
BEGIN { FS = "\t" }
|
||||
/^@@@/ {
|
||||
split(substr($0, 4), h, /\|/)
|
||||
name = h[1]
|
||||
email = h[2]
|
||||
id = email
|
||||
|
||||
if (!(id in display)) {
|
||||
display[id] = name " <" email ">"
|
||||
}
|
||||
|
||||
commits[id] += 1
|
||||
next
|
||||
}
|
||||
NF == 3 && $1 ~ /^[0-9]+$/ && $2 ~ /^[0-9]+$/ {
|
||||
adds[id] += $1
|
||||
dels[id] += $2
|
||||
}
|
||||
END {
|
||||
for (k in commits) {
|
||||
printf "%s\t%d\t%d\t%d\t%d\n", display[k], commits[k], adds[k] + 0, dels[k] + 0, (adds[k] - dels[k]) + 0 > out
|
||||
}
|
||||
}
|
||||
'
|
||||
|
||||
sort -t $'\t' -k3,3nr "$TMP_DIR/author_by_email.tsv" > "$TMP_DIR/author_by_email.sorted.tsv"
|
||||
|
||||
awk -F '\t' -v out="$TMP_DIR/author_human_share.tsv" '
|
||||
$1 !~ /checkpointer@noreply|dependabot\[bot\]/ {
|
||||
total_commits += $2
|
||||
total_adds += $3
|
||||
rows[++n] = $0
|
||||
}
|
||||
END {
|
||||
for (i = 1; i <= n; i++) {
|
||||
split(rows[i], f, "\t")
|
||||
add_pct = (total_adds > 0) ? (f[3] / total_adds * 100) : 0
|
||||
commit_pct = (total_commits > 0) ? (f[2] / total_commits * 100) : 0
|
||||
printf "%s\t%d\t%d\t%d\t%d\t%.2f%%\t%.2f%%\n", f[1], f[2], f[3], f[4], f[5], add_pct, commit_pct > out
|
||||
}
|
||||
}
|
||||
' "$TMP_DIR/author_by_email.sorted.tsv"
|
||||
|
||||
# 3) Contribution by author/day/hour.
|
||||
git log --all --no-merges --numstat --date=format:'%Y-%m-%d|%H' --format='@@@%aE|%ad' | awk -v out="$TMP_DIR/author_day_hour_summary.tsv" '
|
||||
BEGIN { FS = "\t" }
|
||||
/^@@@/ {
|
||||
split(substr($0, 4), h, /\|/)
|
||||
email = h[1]
|
||||
day = h[2]
|
||||
hour = h[3]
|
||||
|
||||
key = email "\t" day "\t" hour
|
||||
commits[key] += 1
|
||||
next
|
||||
}
|
||||
NF == 3 && $1 ~ /^[0-9]+$/ && $2 ~ /^[0-9]+$/ {
|
||||
adds[key] += $1
|
||||
dels[key] += $2
|
||||
}
|
||||
END {
|
||||
for (k in commits) {
|
||||
split(k, f, "\t")
|
||||
a = adds[k] + 0
|
||||
d = dels[k] + 0
|
||||
printf "%s\t%s\t%s\t%d\t%d\t%d\t%d\n", f[1], f[2], f[3], commits[k], a, d, (a - d) > out
|
||||
}
|
||||
}
|
||||
'
|
||||
|
||||
awk -F '\t' -v out="$TMP_DIR/day_summary_human.tsv" '
|
||||
$1 !~ /checkpointer@noreply|dependabot\[bot\]/ {
|
||||
day = $2
|
||||
commits[day] += $4
|
||||
adds[day] += $5
|
||||
dels[day] += $6
|
||||
|
||||
if (!(day in min_hour) || $3 < min_hour[day]) {
|
||||
min_hour[day] = $3
|
||||
}
|
||||
|
||||
if (!(day in max_hour) || $3 > max_hour[day]) {
|
||||
max_hour[day] = $3
|
||||
}
|
||||
}
|
||||
END {
|
||||
for (d in commits) {
|
||||
printf "%s\t%d\t%d\t%d\t%d\t%s\t%s\n", d, commits[d], adds[d], dels[d], adds[d] - dels[d], min_hour[d], max_hour[d] > out
|
||||
}
|
||||
}
|
||||
' "$TMP_DIR/author_day_hour_summary.tsv"
|
||||
|
||||
sort -t $'\t' -k1,1 "$TMP_DIR/day_summary_human.tsv" -o "$TMP_DIR/day_summary_human.tsv"
|
||||
|
||||
awk -F '\t' -v out="$TMP_DIR/hour_summary_human.tsv" '
|
||||
$1 !~ /checkpointer@noreply|dependabot\[bot\]/ {
|
||||
hour = $3
|
||||
commits[hour] += $4
|
||||
adds[hour] += $5
|
||||
dels[hour] += $6
|
||||
}
|
||||
END {
|
||||
for (i = 0; i < 24; i++) {
|
||||
h = sprintf("%02d", i)
|
||||
a = adds[h] + 0
|
||||
d = dels[h] + 0
|
||||
printf "%s\t%d\t%d\t%d\t%d\n", h, commits[h] + 0, a, d, a - d > out
|
||||
}
|
||||
}
|
||||
' "$TMP_DIR/author_day_hour_summary.tsv"
|
||||
|
||||
sort -t $'\t' -k1,1 "$TMP_DIR/hour_summary_human.tsv" -o "$TMP_DIR/hour_summary_human.tsv"
|
||||
|
||||
awk -F '\t' -v out="$TMP_DIR/day_peak_hour_human.tsv" '
|
||||
$1 !~ /checkpointer@noreply|dependabot\[bot\]/ {
|
||||
key = $2 "\t" $3
|
||||
commits[key] += $4
|
||||
adds[key] += $5
|
||||
dels[key] += $6
|
||||
}
|
||||
END {
|
||||
for (k in adds) {
|
||||
split(k, parts, "\t")
|
||||
day = parts[1]
|
||||
hour = parts[2]
|
||||
|
||||
if (!(day in max_adds) || adds[k] > max_adds[day]) {
|
||||
max_adds[day] = adds[k]
|
||||
best_hour[day] = hour
|
||||
best_commits[day] = commits[k]
|
||||
best_dels[day] = dels[k]
|
||||
}
|
||||
}
|
||||
|
||||
for (d in max_adds) {
|
||||
printf "%s\t%s\t%d\t%d\t%d\n", d, best_hour[d], best_commits[d], max_adds[d], best_dels[d] > out
|
||||
}
|
||||
}
|
||||
' "$TMP_DIR/author_day_hour_summary.tsv"
|
||||
|
||||
sort -t $'\t' -k1,1 "$TMP_DIR/day_peak_hour_human.tsv" -o "$TMP_DIR/day_peak_hour_human.tsv"
|
||||
|
||||
mkdir -p "$(dirname "$OUT_FILE")"
|
||||
|
||||
# 4) Render standalone HTML.
|
||||
{
|
||||
cat <<'HTML_HEAD'
|
||||
<!doctype html>
|
||||
<html lang="zh-CN">
|
||||
<head>
|
||||
<meta charset="UTF-8" />
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
|
||||
<title>Super Multica 代码贡献统计</title>
|
||||
<style>
|
||||
:root {
|
||||
--bg: #0b0d10;
|
||||
--panel: #14181d;
|
||||
--panel-2: #1a2027;
|
||||
--line: #2a3440;
|
||||
--text: #e8edf3;
|
||||
--muted: #98a7b7;
|
||||
--ok: #2fbf71;
|
||||
--danger: #ef4444;
|
||||
}
|
||||
* { box-sizing: border-box; }
|
||||
body {
|
||||
margin: 0;
|
||||
font-family: ui-sans-serif, -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, "Helvetica Neue", Arial;
|
||||
background: radial-gradient(circle at 20% -10%, #1a2430 0%, #0b0d10 45%) fixed;
|
||||
color: var(--text);
|
||||
line-height: 1.4;
|
||||
}
|
||||
.wrap { max-width: 1200px; margin: 0 auto; padding: 24px; }
|
||||
h1 { margin: 0 0 8px; font-size: 28px; }
|
||||
.sub { color: var(--muted); margin-bottom: 20px; }
|
||||
.grid {
|
||||
display: grid;
|
||||
grid-template-columns: repeat(auto-fit, minmax(190px, 1fr));
|
||||
gap: 12px;
|
||||
margin-bottom: 18px;
|
||||
}
|
||||
.card {
|
||||
background: linear-gradient(180deg, var(--panel) 0%, var(--panel-2) 100%);
|
||||
border: 1px solid var(--line);
|
||||
border-radius: 10px;
|
||||
padding: 12px;
|
||||
}
|
||||
.k { color: var(--muted); font-size: 12px; margin-bottom: 8px; }
|
||||
.v { font-size: 24px; font-weight: 700; letter-spacing: 0.3px; }
|
||||
.section { margin-top: 14px; }
|
||||
.section h2 { margin: 0 0 10px; font-size: 16px; color: #d4dde7; }
|
||||
.panel {
|
||||
background: var(--panel);
|
||||
border: 1px solid var(--line);
|
||||
border-radius: 10px;
|
||||
overflow: hidden;
|
||||
}
|
||||
table { width: 100%; border-collapse: collapse; }
|
||||
th, td { padding: 9px 10px; border-bottom: 1px solid var(--line); font-size: 13px; }
|
||||
th { background: #11161c; text-align: left; color: #c5d0db; position: sticky; top: 0; }
|
||||
tr:last-child td { border-bottom: 0; }
|
||||
.num { text-align: right; font-variant-numeric: tabular-nums; }
|
||||
.mono { font-family: ui-monospace, SFMono-Regular, Menlo, Consolas, monospace; }
|
||||
.bar-wrap { background: #0f1318; border-radius: 999px; height: 8px; width: 180px; border: 1px solid #273241; }
|
||||
.bar { height: 100%; border-radius: 999px; background: linear-gradient(90deg, #3f7ef7, #58a6ff); }
|
||||
.ok { color: var(--ok); }
|
||||
.danger { color: var(--danger); }
|
||||
.foot { margin-top: 16px; color: var(--muted); font-size: 12px; }
|
||||
.scroll { max-height: 420px; overflow: auto; }
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
<div class="wrap">
|
||||
<h1>Super Multica 代码贡献统计</h1>
|
||||
<div class="sub" id="subtitle"></div>
|
||||
|
||||
<div class="grid" id="summary"></div>
|
||||
|
||||
<div class="section">
|
||||
<h2>代码量分布(按扩展名)</h2>
|
||||
<div class="panel scroll"><table id="extTable"></table></div>
|
||||
</div>
|
||||
|
||||
<div class="section">
|
||||
<h2>人员贡献(人工口径)</h2>
|
||||
<div class="panel scroll"><table id="authorTable"></table></div>
|
||||
</div>
|
||||
|
||||
<div class="section">
|
||||
<h2>每日贡献(人工口径)</h2>
|
||||
<div class="panel scroll"><table id="dayTable"></table></div>
|
||||
</div>
|
||||
|
||||
<div class="section">
|
||||
<h2>小时段贡献(人工口径)</h2>
|
||||
<div class="panel scroll"><table id="hourTable"></table></div>
|
||||
</div>
|
||||
|
||||
<div class="foot">数据来源:git log --numstat 与当前工作树文件统计。人工口径排除 checkpointer / dependabot。</div>
|
||||
</div>
|
||||
|
||||
<script>
|
||||
const RAW = {
|
||||
locTotals: String.raw`
|
||||
HTML_HEAD
|
||||
cat "$TMP_DIR/loc_totals.tsv"
|
||||
cat <<'MID1'
|
||||
`,
|
||||
locByExt: String.raw`
|
||||
MID1
|
||||
cat "$TMP_DIR/loc_by_ext.tsv"
|
||||
cat <<'MID2'
|
||||
`,
|
||||
authorHuman: String.raw`
|
||||
MID2
|
||||
cat "$TMP_DIR/author_human_share.tsv"
|
||||
cat <<'MID3'
|
||||
`,
|
||||
dayHuman: String.raw`
|
||||
MID3
|
||||
cat "$TMP_DIR/day_summary_human.tsv"
|
||||
cat <<'MID4'
|
||||
`,
|
||||
hourHuman: String.raw`
|
||||
MID4
|
||||
cat "$TMP_DIR/hour_summary_human.tsv"
|
||||
cat <<'MID5'
|
||||
`,
|
||||
dayPeak: String.raw`
|
||||
MID5
|
||||
cat "$TMP_DIR/day_peak_hour_human.tsv"
|
||||
cat <<'HTML_TAIL'
|
||||
`
|
||||
};
|
||||
|
||||
const fmt = (n) => Number(n).toLocaleString("en-US");
|
||||
const tsv = (txt) => txt.trim().split(/\n+/).map((line) => line.split("\t"));
|
||||
const toNum = (v) => Number(v || 0);
|
||||
|
||||
const locTotalsRows = tsv(RAW.locTotals);
|
||||
const locTotals = Object.fromEntries(locTotalsRows.map(([k, v]) => [k, toNum(v)]));
|
||||
|
||||
const extRows = tsv(RAW.locByExt).map(([ext, files, lines]) => ({
|
||||
ext,
|
||||
files: toNum(files),
|
||||
lines: toNum(lines),
|
||||
})).sort((a, b) => b.lines - a.lines);
|
||||
|
||||
const authors = tsv(RAW.authorHuman).map(([name, commits, add, del, net, addPct, commitPct]) => ({
|
||||
name,
|
||||
commits: toNum(commits),
|
||||
add: toNum(add),
|
||||
del: toNum(del),
|
||||
net: toNum(net),
|
||||
addPct,
|
||||
commitPct,
|
||||
})).sort((a, b) => b.add - a.add);
|
||||
|
||||
const dayPeaks = Object.fromEntries(tsv(RAW.dayPeak).map(([d, h, c, a, del]) => [d, {
|
||||
hour: h,
|
||||
commits: toNum(c),
|
||||
add: toNum(a),
|
||||
del: toNum(del),
|
||||
}]));
|
||||
|
||||
const days = tsv(RAW.dayHuman).map(([date, commits, add, del, net, startHour, endHour]) => ({
|
||||
date,
|
||||
commits: toNum(commits),
|
||||
add: toNum(add),
|
||||
del: toNum(del),
|
||||
net: toNum(net),
|
||||
startHour,
|
||||
endHour,
|
||||
peak: dayPeaks[date] || null,
|
||||
})).sort((a, b) => a.date.localeCompare(b.date));
|
||||
|
||||
const hours = tsv(RAW.hourHuman).map(([hour, commits, add, del, net]) => ({
|
||||
hour,
|
||||
commits: toNum(commits),
|
||||
add: toNum(add),
|
||||
del: toNum(del),
|
||||
net: toNum(net),
|
||||
})).sort((a, b) => a.hour.localeCompare(b.hour));
|
||||
|
||||
const totalHumanCommits = authors.reduce((sum, x) => sum + x.commits, 0);
|
||||
const totalHumanAdd = authors.reduce((sum, x) => sum + x.add, 0);
|
||||
const totalHumanDel = authors.reduce((sum, x) => sum + x.del, 0);
|
||||
const topHour = [...hours].sort((a, b) => b.add - a.add)[0] || { hour: "--", add: 0 };
|
||||
const startDate = days[0]?.date || "--";
|
||||
const endDate = days[days.length - 1]?.date || "--";
|
||||
|
||||
document.getElementById("subtitle").textContent = `${startDate} ~ ${endDate}`;
|
||||
|
||||
const summaryItems = [
|
||||
["总文件数", fmt(locTotals.files || 0)],
|
||||
["总行数", fmt(locTotals.lines || 0)],
|
||||
["源码行数", fmt(locTotals.source_lines || 0)],
|
||||
["贡献人数", fmt(authors.length)],
|
||||
["人工提交数", fmt(totalHumanCommits)],
|
||||
["人工新增", fmt(totalHumanAdd)],
|
||||
["人工删除", fmt(totalHumanDel)],
|
||||
["最高产小时", `${topHour.hour}:00 (${fmt(topHour.add)})`],
|
||||
];
|
||||
|
||||
document.getElementById("summary").innerHTML = summaryItems.map(([k, v]) => (
|
||||
`<div class="card"><div class="k">${k}</div><div class="v">${v}</div></div>`
|
||||
)).join("");
|
||||
|
||||
const maxExtLines = Math.max(...extRows.map((x) => x.lines), 1);
|
||||
document.getElementById("extTable").innerHTML = `
|
||||
<thead><tr><th>扩展名</th><th class="num">文件数</th><th class="num">行数</th><th>占比</th><th>可视化</th></tr></thead>
|
||||
<tbody>
|
||||
${extRows.map((r) => {
|
||||
const pct = ((r.lines / (locTotals.lines || 1)) * 100).toFixed(2);
|
||||
const w = ((r.lines / maxExtLines) * 100).toFixed(1);
|
||||
return `<tr>
|
||||
<td class="mono">${r.ext}</td>
|
||||
<td class="num">${fmt(r.files)}</td>
|
||||
<td class="num">${fmt(r.lines)}</td>
|
||||
<td class="num">${pct}%</td>
|
||||
<td><div class="bar-wrap"><div class="bar" style="width:${w}%"></div></div></td>
|
||||
</tr>`;
|
||||
}).join("")}
|
||||
</tbody>`;
|
||||
|
||||
document.getElementById("authorTable").innerHTML = `
|
||||
<thead><tr><th>作者</th><th class="num">提交</th><th class="num">新增</th><th class="num">删除</th><th class="num">净新增</th><th class="num">新增占比</th><th class="num">提交占比</th></tr></thead>
|
||||
<tbody>
|
||||
${authors.map((a) => `<tr>
|
||||
<td>${a.name}</td>
|
||||
<td class="num">${fmt(a.commits)}</td>
|
||||
<td class="num">${fmt(a.add)}</td>
|
||||
<td class="num">${fmt(a.del)}</td>
|
||||
<td class="num ${a.net >= 0 ? "ok" : "danger"}">${fmt(a.net)}</td>
|
||||
<td class="num">${a.addPct}</td>
|
||||
<td class="num">${a.commitPct}</td>
|
||||
</tr>`).join("")}
|
||||
</tbody>`;
|
||||
|
||||
document.getElementById("dayTable").innerHTML = `
|
||||
<thead><tr><th>日期</th><th class="num">提交</th><th class="num">新增</th><th class="num">删除</th><th class="num">净新增</th><th>活跃时段</th><th>峰值小时</th></tr></thead>
|
||||
<tbody>
|
||||
${days.map((d) => `<tr>
|
||||
<td class="mono">${d.date}</td>
|
||||
<td class="num">${fmt(d.commits)}</td>
|
||||
<td class="num">${fmt(d.add)}</td>
|
||||
<td class="num">${fmt(d.del)}</td>
|
||||
<td class="num ${d.net >= 0 ? "ok" : "danger"}">${fmt(d.net)}</td>
|
||||
<td class="mono">${d.startHour}:00 - ${d.endHour}:59</td>
|
||||
<td class="mono">${d.peak ? `${d.peak.hour}:00 (${fmt(d.peak.add)})` : "--"}</td>
|
||||
</tr>`).join("")}
|
||||
</tbody>`;
|
||||
|
||||
const maxHourAdd = Math.max(...hours.map((h) => h.add), 1);
|
||||
document.getElementById("hourTable").innerHTML = `
|
||||
<thead><tr><th>小时</th><th class="num">提交</th><th class="num">新增</th><th class="num">删除</th><th class="num">净新增</th><th>可视化</th></tr></thead>
|
||||
<tbody>
|
||||
${hours.map((h) => {
|
||||
const w = ((h.add / maxHourAdd) * 100).toFixed(1);
|
||||
return `<tr>
|
||||
<td class="mono">${h.hour}:00</td>
|
||||
<td class="num">${fmt(h.commits)}</td>
|
||||
<td class="num">${fmt(h.add)}</td>
|
||||
<td class="num">${fmt(h.del)}</td>
|
||||
<td class="num ${h.net >= 0 ? "ok" : "danger"}">${fmt(h.net)}</td>
|
||||
<td><div class="bar-wrap"><div class="bar" style="width:${w}%"></div></div></td>
|
||||
</tr>`;
|
||||
}).join("")}
|
||||
</tbody>`;
|
||||
</script>
|
||||
</body>
|
||||
</html>
|
||||
HTML_TAIL
|
||||
} > "$OUT_FILE"
|
||||
|
||||
echo "Report generated: $OUT_FILE"
|
||||
|
|
@ -1,53 +0,0 @@
|
|||
#!/bin/bash
# Reset all user data for super-multica desktop app
# Use this to simulate a fresh install for testing
#
# Directories removed (when present):
#   $HOME/.super-multica                              main data directory
#   $HOME/.super-multica-dev                          dev data directory (used by pnpm dev:local)
#   $HOME/Library/Application Support/super-multica   Electron app data (macOS)
#   $HOME/.config/super-multica                       Electron app data (Linux)

set -euo pipefail

# Refuse to run without HOME: otherwise every rm -rf target below would
# resolve to a path directly under "/".
: "${HOME:?HOME must be set}"

# Remove directory $1 if it exists; otherwise report that it was skipped.
remove_dir() {
  local dir="$1"
  if [ -d "$dir" ]; then
    echo " Removing $dir"
    rm -rf -- "$dir"
  else
    echo " $dir does not exist, skipping"
  fi
}

echo "🧹 Resetting Super Multica user data..."

# Main data directory
remove_dir "$HOME/.super-multica"

# Dev data directory (used by pnpm dev:local)
remove_dir "$HOME/.super-multica-dev"

# Electron app data lives in a platform-specific location.
case "${OSTYPE:-}" in
  darwin*) remove_dir "$HOME/Library/Application Support/super-multica" ;;
  linux-gnu*) remove_dir "$HOME/.config/super-multica" ;;
esac

echo "✅ User data reset complete!"
echo ""
echo "Next steps:"
echo " pnpm dev # Start app (will show onboarding)"
echo " pnpm dev:reset # Reset and start in one command"
|
||||
|
|
@ -1,60 +0,0 @@
|
|||
#!/usr/bin/env bash
#
# Set Telegram Bot Webhook
#
# Usage:
#   ./scripts/set-telegram-webhook.sh <webhook_url>
#
# Example:
#   ./scripts/set-telegram-webhook.sh https://your-domain.ngrok-free.dev
#
# Reads TELEGRAM_BOT_TOKEN and TELEGRAM_WEBHOOK_SECRET_TOKEN from .env
# (the secret token is optional; an empty value is sent when it is unset).

set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
ENV_FILE="$SCRIPT_DIR/../.env"

if [ ! -f "$ENV_FILE" ]; then
  echo "Error: .env file not found at $ENV_FILE"
  exit 1
fi

# shellcheck disable=SC1090
source "$ENV_FILE"

if [ -z "${TELEGRAM_BOT_TOKEN:-}" ]; then
  echo "Error: TELEGRAM_BOT_TOKEN not set in .env"
  exit 1
fi

# Default the optional secret once so every later expansion is safe under `set -u`.
SECRET_TOKEN="${TELEGRAM_WEBHOOK_SECRET_TOKEN:-}"

WEBHOOK_BASE_URL="${1:-}"

if [ -z "$WEBHOOK_BASE_URL" ]; then
  echo "Usage: $0 <webhook_base_url>"
  echo ""
  echo "Example:"
  echo " $0 https://your-domain.ngrok-free.dev"
  exit 1
fi

# Remove trailing slash
WEBHOOK_BASE_URL="${WEBHOOK_BASE_URL%/}"
WEBHOOK_URL="${WEBHOOK_BASE_URL}/telegram/webhook"

# Pretty-print $1 as JSON when python3 can parse it; otherwise print it verbatim.
print_json() {
  echo "$1" | python3 -m json.tool 2>/dev/null || echo "$1"
}

# Only short prefixes of the credentials are echoed.
echo "Bot Token: ${TELEGRAM_BOT_TOKEN:0:10}..."
echo "Secret Token: ${SECRET_TOKEN:0:8}..."
echo "Webhook URL: $WEBHOOK_URL"
echo ""

# Set webhook
echo "=> Setting webhook..."
# --data-urlencode keeps characters such as '&', '+' or '=' in the URL or
# secret from corrupting the form-encoded request body.
RESPONSE=$(curl -s "https://api.telegram.org/bot${TELEGRAM_BOT_TOKEN}/setWebhook" \
  --data-urlencode "url=${WEBHOOK_URL}" \
  --data-urlencode "secret_token=${SECRET_TOKEN}")

print_json "$RESPONSE"

echo ""
echo "=> Verifying webhook info..."
INFO=$(curl -s "https://api.telegram.org/bot${TELEGRAM_BOT_TOKEN}/getWebhookInfo")
print_json "$INFO"
|
||||
5
scripts/swe-bench/.gitignore
vendored
5
scripts/swe-bench/.gitignore
vendored
|
|
@ -1,5 +0,0 @@
|
|||
# Downloaded datasets
|
||||
*.jsonl
|
||||
|
||||
# Don't ignore the scripts themselves
|
||||
!.gitignore
|
||||
|
|
@ -1,116 +0,0 @@
|
|||
#!/usr/bin/env tsx
|
||||
/**
|
||||
* Analyze SWE-bench run results.
|
||||
*
|
||||
* Reads the .results.jsonl file produced by run.ts and prints a summary.
|
||||
*
|
||||
* Usage:
|
||||
* tsx scripts/swe-bench/analyze.ts [results.jsonl]
|
||||
*/
|
||||
|
||||
import { readFileSync, existsSync } from "node:fs";
|
||||
import { resolve, join } from "node:path";
|
||||
|
||||
/** One line of the `.results.jsonl` file that this script reads. */
interface RunResult {
  /** Task identifier, e.g. `django__django-16379`. */
  instance_id: string;
  /** Session identifier recorded for the run. */
  session_id: string;
  /** Counted as "Patched" in the summary when true. */
  success: boolean;
  /** Patch text; its length is used for the patch-size statistic. */
  patch: string;
  /** Error message, when one was recorded for the task. */
  error?: string;
  /** Task duration in milliseconds. */
  duration_ms: number;
}
|
||||
|
||||
/**
 * Print a summary of a SWE-bench run to stdout.
 *
 * Reads the results JSONL named by the first CLI argument (default:
 * `scripts/swe-bench/predictions.results.jsonl`) and reports totals, duration
 * and patch-size statistics, an error breakdown, a per-repository breakdown
 * and the five slowest tasks.
 *
 * Exits with status 1 when the results file does not exist. Malformed lines
 * are reported on stderr and skipped, and an empty file produces a short
 * "no results" report instead of NaN/Infinity statistics.
 */
function main() {
  const resultsPath = resolve(
    process.argv[2] || "scripts/swe-bench/predictions.results.jsonl",
  );

  if (!existsSync(resultsPath)) {
    console.error(`Results file not found: ${resultsPath}`);
    process.exit(1);
  }

  // Parse line by line so one truncated record does not discard the whole analysis.
  const results: RunResult[] = [];
  readFileSync(resultsPath, "utf-8")
    .split("\n")
    .forEach((raw, index) => {
      const line = raw.trim();
      if (!line) return;
      try {
        results.push(JSON.parse(line) as RunResult);
      } catch (err) {
        const reason = err instanceof Error ? err.message : String(err);
        console.error(`Skipping malformed JSON on line ${index + 1}: ${reason}`);
      }
    });

  const total = results.length;

  console.log("=== SWE-bench Run Analysis ===\n");
  console.log(`Total tasks: ${total}`);

  // Every statistic below divides by `total` or reduces over the records.
  if (total === 0) {
    console.log("No results to analyze.");
    return;
  }

  const patched = results.filter((r) => r.success).length;
  const failed = total - patched;
  const errors = results.filter((r) => r.error).length;
  const durations = results.map((r) => r.duration_ms);
  const avgDuration = durations.reduce((a, b) => a + b, 0) / total;
  // reduce instead of Math.max(...xs) so very large result sets cannot hit the argument limit.
  const maxDuration = durations.reduce((a, b) => Math.max(a, b), -Infinity);
  const minDuration = durations.reduce((a, b) => Math.min(a, b), Infinity);
  const patchSizes = results
    .filter((r) => r.success)
    .map((r) => r.patch.length);
  const avgPatchSize =
    patchSizes.length > 0
      ? patchSizes.reduce((a, b) => a + b, 0) / patchSizes.length
      : 0;

  console.log(`Patched: ${patched} (${((patched / total) * 100).toFixed(1)}%)`);
  console.log(`No patch: ${failed}`);
  console.log(`Errors: ${errors}`);
  console.log();
  console.log(`Avg duration: ${(avgDuration / 1000).toFixed(1)}s`);
  console.log(`Min duration: ${(minDuration / 1000).toFixed(1)}s`);
  console.log(`Max duration: ${(maxDuration / 1000).toFixed(1)}s`);
  console.log(`Avg patch size: ${(avgPatchSize / 1024).toFixed(1)}KB`);

  // Error breakdown
  if (errors > 0) {
    console.log("\n--- Errors ---");
    const errorCounts = new Map<string, number>();
    for (const r of results) {
      if (r.error) {
        // Truncate long messages so similar errors group under one key.
        const key = r.error.length > 60 ? r.error.slice(0, 60) + "..." : r.error;
        errorCounts.set(key, (errorCounts.get(key) || 0) + 1);
      }
    }
    for (const [err, count] of [...errorCounts.entries()].sort(
      (a, b) => b[1] - a[1],
    )) {
      console.log(` ${count}x ${err}`);
    }
  }

  // Per-repo breakdown
  console.log("\n--- By Repository ---");
  const repoStats = new Map<string, { total: number; patched: number }>();
  for (const r of results) {
    // "django__django-16379" -> "django/django": drop the trailing issue
    // number, then turn the "__" separator back into "/".
    const repo = r.instance_id.replace(/-\d+$/, "").replace(/__/g, "/") || "unknown";
    const stats = repoStats.get(repo) || { total: 0, patched: 0 };
    stats.total++;
    if (r.success) stats.patched++;
    repoStats.set(repo, stats);
  }
  for (const [repo, stats] of [...repoStats.entries()].sort(
    (a, b) => b[1].total - a[1].total,
  )) {
    const pct = ((stats.patched / stats.total) * 100).toFixed(0);
    console.log(
      ` ${repo.padEnd(30)} ${stats.patched}/${stats.total} (${pct}%)`,
    );
  }

  // Slowest tasks
  console.log("\n--- Slowest Tasks ---");
  const sorted = [...results].sort((a, b) => b.duration_ms - a.duration_ms);
  for (const r of sorted.slice(0, 5)) {
    console.log(
      ` ${(r.duration_ms / 1000).toFixed(1)}s ${r.instance_id} ${r.success ? "PATCHED" : "NO_PATCH"}`,
    );
  }

  // Session IDs for further analysis
  const dataDir = process.env.SMC_DATA_DIR || join(process.env.HOME || "~", ".swe-bench-eval");
  console.log(`\n--- Run Logs ---`);
  console.log(`Session data: ${dataDir}/sessions/`);
  console.log(`View a session's run log:`);
  console.log(` cat ${dataDir}/sessions/<session-id>/run-log.jsonl | head -20`);
}
|
||||
|
||||
main();
|
||||
|
|
@ -1,100 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
Download SWE-bench dataset from HuggingFace and export to JSONL for the Node.js runner.
|
||||
|
||||
Usage:
|
||||
pip install datasets
|
||||
python scripts/swe-bench/download-dataset.py [--dataset verified|lite|full] [--limit N] [--output PATH]
|
||||
|
||||
Output format (one JSON object per line):
|
||||
{
|
||||
"instance_id": "django__django-16379",
|
||||
"repo": "django/django",
|
||||
"base_commit": "abc123...",
|
||||
"problem_statement": "...",
|
||||
"hints_text": "...",
|
||||
"patch": "...", # gold patch (for reference, not shown to agent)
|
||||
"test_patch": "...", # test patch applied during evaluation
|
||||
"version": "4.2",
|
||||
"environment_setup_commit": "..."
|
||||
}
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import sys
|
||||
|
||||
# Short --dataset names mapped to their HuggingFace dataset identifiers.
DATASET_MAP = dict(
    verified="princeton-nlp/SWE-bench_Verified",
    lite="princeton-nlp/SWE-bench_Lite",
    full="princeton-nlp/SWE-bench",
)
|
||||
|
||||
|
||||
def main():
    """Download a SWE-bench dataset split and write it as JSONL.

    Command-line options:
        --dataset  verified | lite | full (default: lite)
        --limit    maximum number of instances to write; 0 or less means all
        --output   output path (default: scripts/swe-bench/<dataset>.jsonl)
        --split    dataset split to load (default: test)

    Each output line is a JSON object restricted to the fields in
    ``keep_fields``. Progress messages go to stderr. Exits with status 1 when
    the ``datasets`` package is not installed.
    """
    parser = argparse.ArgumentParser(description="Download SWE-bench dataset to JSONL")
    parser.add_argument(
        "--dataset",
        choices=["verified", "lite", "full"],
        default="lite",
        help="Dataset variant (default: lite)",
    )
    parser.add_argument(
        "--limit", type=int, default=0, help="Limit number of instances (0 = all)"
    )
    parser.add_argument(
        "--output",
        type=str,
        default=None,
        help="Output JSONL path (default: scripts/swe-bench/<dataset>.jsonl)",
    )
    parser.add_argument(
        "--split",
        type=str,
        default="test",
        help="Dataset split (default: test)",
    )
    args = parser.parse_args()

    # Imported lazily so a missing dependency yields a friendly message
    # instead of a traceback at module import time.
    try:
        from datasets import load_dataset
    except ImportError:
        print("Error: 'datasets' package not installed. Run: pip install datasets", file=sys.stderr)
        sys.exit(1)

    import os

    dataset_name = DATASET_MAP[args.dataset]
    output_path = args.output or f"scripts/swe-bench/{args.dataset}.jsonl"

    print(f"Downloading {dataset_name} (split={args.split})...", file=sys.stderr)
    ds = load_dataset(dataset_name, split=args.split)

    # Fields to keep
    keep_fields = [
        "instance_id",
        "repo",
        "base_commit",
        "problem_statement",
        "hints_text",
        "patch",
        "test_patch",
        "version",
        "environment_setup_commit",
    ]

    # Create the destination directory so a custom --output path does not fail.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    count = 0
    # Records are written with ensure_ascii=False, so the file encoding must be
    # pinned to UTF-8 rather than left to the platform's locale default.
    with open(output_path, "w", encoding="utf-8") as f:
        for item in ds:
            record = {field: item[field] for field in keep_fields if field in item}
            f.write(json.dumps(record, ensure_ascii=False) + "\n")
            count += 1
            # Only a positive limit truncates; 0 (and negative values) mean "all".
            if args.limit > 0 and count >= args.limit:
                break

    print(f"Wrote {count} instances to {output_path}", file=sys.stderr)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
|
@ -1,68 +0,0 @@
|
|||
#!/usr/bin/env bash
#
# Evaluate Multica predictions against SWE-bench using the official Docker harness.
#
# Prerequisites:
#   pip install swebench
#   Docker running with at least 120GB storage, 16GB RAM, 8 CPU cores
#
# Usage:
#   bash scripts/swe-bench/evaluate.sh [predictions.jsonl] [dataset] [run_id]
#
# Examples:
#   bash scripts/swe-bench/evaluate.sh
#   bash scripts/swe-bench/evaluate.sh scripts/swe-bench/predictions.jsonl lite multica-v1
#
# Environment:
#   MAX_WORKERS   parallel evaluation workers passed to the harness (default: 4)
#   PYTHON        Python interpreter to use (default: python, falling back to python3)

set -euo pipefail

PREDICTIONS="${1:-scripts/swe-bench/predictions.jsonl}"
DATASET="${2:-lite}"
RUN_ID="${3:-multica}"
MAX_WORKERS="${MAX_WORKERS:-4}"

# Map short names to HuggingFace dataset names; anything else is passed through as-is.
case "$DATASET" in
  lite) DATASET_NAME="princeton-nlp/SWE-bench_Lite" ;;
  verified) DATASET_NAME="princeton-nlp/SWE-bench_Verified" ;;
  full) DATASET_NAME="princeton-nlp/SWE-bench" ;;
  *) DATASET_NAME="$DATASET" ;;
esac

echo "=== SWE-bench Evaluation ==="
echo "Predictions: $PREDICTIONS"
echo "Dataset: $DATASET_NAME"
echo "Run ID: $RUN_ID"
echo ""

if [ ! -f "$PREDICTIONS" ]; then
  echo "Error: Predictions file not found: $PREDICTIONS" >&2
  exit 1
fi

# Count non-empty lines so a file without a trailing newline is not undercounted.
TASK_COUNT=$(grep -c . "$PREDICTIONS" || true)
echo "Tasks to evaluate: $TASK_COUNT"
echo ""

# Pick the interpreter: honour $PYTHON, else prefer `python`, else `python3`.
PYTHON_BIN="${PYTHON:-}"
if [ -z "$PYTHON_BIN" ]; then
  if command -v python >/dev/null 2>&1; then
    PYTHON_BIN="python"
  elif command -v python3 >/dev/null 2>&1; then
    PYTHON_BIN="python3"
  else
    echo "Error: no Python interpreter found (tried python, python3)" >&2
    exit 1
  fi
fi

# Check if swebench is installed
if ! "$PYTHON_BIN" -c "import swebench" 2>/dev/null; then
  echo "Error: swebench not installed. Run: pip install swebench" >&2
  exit 1
fi

# Check if Docker is running
if ! docker info >/dev/null 2>&1; then
  echo "Error: Docker is not running" >&2
  exit 1
fi

echo "Starting evaluation (this may take a while)..."
echo ""

"$PYTHON_BIN" -m swebench.harness.run_evaluation \
  --dataset_name "$DATASET_NAME" \
  --predictions_path "$PREDICTIONS" \
  --max_workers "$MAX_WORKERS" \
  --run_id "$RUN_ID"

echo ""
echo "=== Evaluation Complete ==="
echo "Check logs/ and evaluation_results/ for detailed results."
|
||||
|
|
@ -1,392 +0,0 @@
|
|||
#!/usr/bin/env tsx
|
||||
/**
|
||||
* SWE-bench Runner for Multica
|
||||
*
|
||||
* Runs the Multica agent against SWE-bench task instances and collects patches.
|
||||
*
|
||||
* Usage:
|
||||
* tsx scripts/swe-bench/run.ts [options]
|
||||
*
|
||||
* Options:
|
||||
* --dataset PATH Path to JSONL dataset (default: scripts/swe-bench/lite.jsonl)
|
||||
* --provider NAME LLM provider (default: kimi-coding)
|
||||
* --model NAME Model name
|
||||
* --limit N Max tasks to run (default: all)
|
||||
* --offset N Skip first N tasks (default: 0)
|
||||
* --output PATH Output predictions JSONL (default: scripts/swe-bench/predictions.jsonl)
|
||||
* --workdir PATH Working directory for repos (default: /tmp/swe-bench)
|
||||
* --timeout MS Timeout per task in ms (default: 300000 = 5min)
|
||||
* --instance ID Run a single instance by ID
|
||||
* --debug Enable debug logging
|
||||
*/
|
||||
|
||||
import { readFileSync, writeFileSync, appendFileSync, existsSync, mkdirSync } from "node:fs";
import { join, resolve } from "node:path";
import { execFileSync, execSync, spawn } from "node:child_process";
import { Agent } from "@multica/core";
import type { AgentOptions } from "@multica/core";
|
||||
|
||||
// ============================================================
|
||||
// Types
|
||||
// ============================================================
|
||||
|
||||
/** One SWE-bench task instance, as read from a line of the dataset JSONL. */
interface SWEBenchTask {
  /** Unique task identifier, e.g. `django__django-16379`. */
  instance_id: string;
  /** GitHub repository in `owner/name` form. */
  repo: string;
  /** Commit the repository is checked out at before the agent runs. */
  base_commit: string;
  /** Issue text given to the agent. */
  problem_statement: string;
  /** Optional hints appended to the prompt when present. */
  hints_text?: string;
  /** Gold patch (for reference, not shown to the agent). */
  patch?: string;
  /** Test patch applied during evaluation. */
  test_patch?: string;
  /** Version string carried over from the dataset. */
  version?: string;
  /** Environment setup commit carried over from the dataset. */
  environment_setup_commit?: string;
}
|
||||
|
||||
/** One line of the predictions JSONL, in the format the SWE-bench harness reads. */
interface Prediction {
  /** Identifier of the task this patch answers. */
  instance_id: string;
  /** Name recorded for the model/provider combination that produced the patch. */
  model_name_or_path: string;
  /** Patch text produced for the task. */
  model_patch: string;
}
|
||||
|
||||
/** Detailed per-task outcome, written to the `.results.jsonl` file. */
interface RunResult {
  /** Identifier of the task that was run. */
  instance_id: string;
  /** Session identifier of the agent that handled the task. */
  session_id: string;
  /** Whether a patch was produced for the task. */
  success: boolean;
  /** Patch text collected from the repository after the run. */
  patch: string;
  /** Error reported by the agent, or the message of a thrown error. */
  error?: string;
  /** Elapsed time for the task in milliseconds. */
  duration_ms: number;
}
|
||||
|
||||
// ============================================================
|
||||
// CLI argument parsing
|
||||
// ============================================================
|
||||
|
||||
/** Options controlling a runner invocation, populated from the command line. */
interface RunOptions {
  /** Path to the JSONL dataset (`--dataset`). */
  dataset: string;
  /** Output predictions JSONL path (`--output`). */
  output: string;
  /** Working directory for repository checkouts (`--workdir`). */
  workdir: string;
  /** LLM provider name (`--provider`). */
  provider: string;
  /** Model name (`--model`). */
  model?: string;
  /** Run a single instance by ID (`--instance`). */
  instance?: string;
  /** Max tasks to run (`--limit`). */
  limit: number;
  /** Number of leading tasks to skip (`--offset`). */
  offset: number;
  /** Timeout per task in milliseconds (`--timeout`). */
  timeout: number;
  /** Enable debug logging (`--debug`). */
  debug: boolean;
}
|
||||
|
||||
/**
 * Parse the runner's command-line flags from `process.argv`.
 *
 * Unspecified options keep the defaults listed below. The process exits with
 * status 1 (after printing a message to stderr) on an unknown flag, on a flag
 * whose value is missing, on a non-integer value for `--limit`, `--offset`
 * or `--timeout`, and on a non-positive `--timeout`.
 *
 * @returns The fully populated run options.
 */
function parseArgs(): RunOptions {
  const args = process.argv.slice(2);
  const opts: RunOptions = {
    dataset: "scripts/swe-bench/lite.jsonl",
    provider: "kimi-coding",
    limit: 0,
    offset: 0,
    output: "scripts/swe-bench/predictions.jsonl",
    workdir: "/tmp/swe-bench",
    timeout: 300_000, // 5 minutes
    debug: false,
  };

  // Report a usage error and terminate.
  const fail = (message: string): never => {
    console.error(message);
    return process.exit(1);
  };

  let i = 0;

  // Consume and return the value that follows the flag at position `i`.
  const value = (flag: string): string => {
    i += 1;
    const next = args[i];
    return next === undefined ? fail(`Missing value for ${flag}`) : next;
  };

  // Like `value`, but additionally requires the value to parse as an integer.
  const intValue = (flag: string): number => {
    const raw = value(flag);
    const parsed = parseInt(raw, 10);
    return Number.isNaN(parsed) ? fail(`Invalid integer for ${flag}: ${raw}`) : parsed;
  };

  for (; i < args.length; i++) {
    const arg = args[i]!;
    switch (arg) {
      case "--dataset":
        opts.dataset = value(arg);
        break;
      case "--provider":
        opts.provider = value(arg);
        break;
      case "--model":
        opts.model = value(arg);
        break;
      case "--limit":
        opts.limit = intValue(arg);
        break;
      case "--offset":
        opts.offset = intValue(arg);
        break;
      case "--output":
        opts.output = value(arg);
        break;
      case "--workdir":
        opts.workdir = value(arg);
        break;
      case "--timeout": {
        const ms = intValue(arg);
        // A zero or negative timeout would make every task time out at once.
        opts.timeout = ms > 0 ? ms : fail(`Invalid value for --timeout: ${ms} (must be > 0)`);
        break;
      }
      case "--instance":
        opts.instance = value(arg);
        break;
      case "--debug":
        opts.debug = true;
        break;
      default:
        fail(`Unknown argument: ${arg}`);
    }
  }

  return opts;
}
|
||||
|
||||
// ============================================================
|
||||
// Dataset loading
|
||||
// ============================================================
|
||||
|
||||
/**
 * Load SWE-bench tasks from a JSONL file (one JSON object per line).
 *
 * Blank and whitespace-only lines are skipped, and both LF and CRLF line
 * endings are accepted. If the file does not exist the process prints a hint
 * and exits with status 1.
 *
 * @param path - Path to the dataset JSONL file.
 * @returns The parsed tasks in file order.
 * @throws SyntaxError naming the file and 1-based line number of a malformed line.
 */
function loadDataset(path: string): SWEBenchTask[] {
  if (!existsSync(path)) {
    console.error(`Dataset not found: ${path}`);
    console.error("Run: python scripts/swe-bench/download-dataset.py");
    process.exit(1);
  }

  const tasks: SWEBenchTask[] = [];
  readFileSync(path, "utf-8")
    .split(/\r?\n/)
    .forEach((row, index) => {
      if (row.trim() === "") return;
      try {
        // The shape of each record is not validated here.
        tasks.push(JSON.parse(row) as SWEBenchTask);
      } catch (err) {
        const reason = err instanceof Error ? err.message : String(err);
        throw new SyntaxError(`Invalid JSON in ${path} at line ${index + 1}: ${reason}`);
      }
    });
  return tasks;
}
|
||||
|
||||
// ============================================================
|
||||
// Repository setup
|
||||
// ============================================================
|
||||
|
||||
/**
 * Prepare a checkout of the task's repository at its base commit.
 *
 * The checkout lives in `<workdir>/<instance_id>` (with any `/` in the id
 * replaced by `__`). An existing checkout is force-reset to the base commit
 * and cleaned of untracked and ignored files; otherwise the repository is
 * cloned from GitHub and the base commit checked out.
 *
 * Git is invoked without a shell, so values taken from the dataset (repo
 * name, commit, instance id) are passed as plain arguments and cannot inject
 * shell syntax.
 *
 * @param task - Task whose `repo`, `base_commit` and `instance_id` are used.
 * @param workdir - Directory that holds all task checkouts.
 * @returns Absolute or relative path (as given by `workdir`) of the checkout.
 * @throws When a git command fails or exceeds its timeout.
 */
function setupRepo(task: SWEBenchTask, workdir: string): string {
  const repoDir = join(workdir, task.instance_id.replace(/\//g, "__"));

  // Run one git command with captured output and a per-command timeout.
  const git = (gitArgs: string[], options: { cwd?: string; timeout: number }): void => {
    execFileSync("git", gitArgs, { ...options, stdio: "pipe" });
  };

  if (existsSync(repoDir)) {
    // Reset existing repo to base commit
    log(` Resetting existing repo to ${task.base_commit.slice(0, 8)}...`);
    git(["checkout", "-f", task.base_commit], { cwd: repoDir, timeout: 60_000 });
    git(["clean", "-fdx"], { cwd: repoDir, timeout: 60_000 });
  } else {
    // Clone from GitHub
    const repoUrl = `https://github.com/${task.repo}.git`;
    log(` Cloning ${task.repo}...`);
    mkdirSync(workdir, { recursive: true });
    git(["clone", "--quiet", repoUrl, repoDir], { timeout: 120_000 });
    git(["checkout", "-f", task.base_commit], { cwd: repoDir, timeout: 30_000 });
  }

  return repoDir;
}
|
||||
|
||||
// ============================================================
|
||||
// System prompt
|
||||
// ============================================================
|
||||
|
||||
/**
 * Compose the system prompt for a task: fixed instructions and rules followed
 * by a line naming the repository and the first 12 characters of its base commit.
 *
 * @param task - Task supplying `repo` and `base_commit`.
 * @returns The prompt text, lines separated by `\n`.
 */
function buildSystemPrompt(task: SWEBenchTask): string {
  const shortCommit = task.base_commit.slice(0, 12);
  const promptLines = [
    "You are an expert software engineer tasked with fixing a bug in an open-source repository.",
    "",
    "## Instructions",
    "",
    "1. Read the issue description carefully and understand the problem.",
    "2. Explore the repository to find the relevant source code.",
    "3. Identify the root cause of the issue.",
    "4. Make the minimal set of changes to fix the issue. Do NOT add tests.",
    "5. After making changes, verify your fix makes sense.",
    "",
    "## Important Rules",
    "",
    "- Make ONLY the changes necessary to fix the described issue.",
    "- Do NOT modify or add any test files.",
    "- Do NOT add comments explaining the fix unless the code is non-obvious.",
    "- Do NOT refactor unrelated code.",
    "- Keep changes minimal and focused.",
    "",
    "## Repository",
    "",
    "This is the `" + task.repo + "` repository checked out at commit `" + shortCommit + "`.",
  ];
  return promptLines.join("\n");
}
|
||||
|
||||
/**
 * Compose the user prompt for a task: the issue text, an optional hints
 * section (only when `hints_text` is non-empty), and a closing instruction,
 * separated by blank lines.
 *
 * @param task - Task supplying `problem_statement` and optional `hints_text`.
 * @returns The prompt text.
 */
function buildPrompt(task: SWEBenchTask): string {
  const sections = [`## Issue\n\n${task.problem_statement}`];
  if (task.hints_text) {
    sections.push(`## Hints\n\n${task.hints_text}`);
  }
  sections.push("Please fix this issue. Remember: make minimal changes, do not modify tests.");
  return sections.join("\n\n");
}
|
||||
|
||||
// ============================================================
|
||||
// Run a single task
|
||||
// ============================================================
|
||||
|
||||
/**
 * Collect the changes made in `repoDir` as a patch.
 *
 * Tries `git diff HEAD` first so that staged as well as unstaged edits are
 * included, then falls back to plain `git diff`. Returns an empty string when
 * neither command succeeds.
 */
function collectPatch(repoDir: string): string {
  const diffOptions = {
    cwd: repoDir,
    encoding: "utf-8",
    maxBuffer: 10 * 1024 * 1024, // 10MB
    timeout: 10_000,
  } as const;

  for (const command of ["git diff HEAD", "git diff"]) {
    try {
      return execSync(command, diffOptions);
    } catch {
      // Fall through to the next command.
    }
  }
  return "";
}

/**
 * Run the agent on a single task and collect the resulting patch.
 *
 * Sets up the repository checkout, runs the agent with the task prompt under
 * a timeout of `opts.timeout` milliseconds, and then captures the repository
 * diff. If the agent run throws or times out, whatever partial diff exists is
 * still captured and the result is marked unsuccessful with the error message.
 *
 * @param task - The task to run.
 * @param opts - Run options (provider, model, workdir, timeout, debug).
 * @returns The outcome for this task; `success` is true only when the agent
 *   run completed and the patch is non-empty.
 * @throws When repository setup fails (it happens before the agent is created).
 */
async function runTask(
  task: SWEBenchTask,
  opts: RunOptions,
): Promise<RunResult> {
  const start = Date.now();

  // Setup repo
  const repoDir = setupRepo(task, opts.workdir);

  // Create agent
  const agentOptions: AgentOptions = {
    provider: opts.provider,
    model: opts.model,
    cwd: repoDir,
    enableRunLog: true,
    debug: opts.debug,
    systemPrompt: buildSystemPrompt(task),
    enableSkills: false,
    tools: {
      // Only allow coding tools — no web, no cron, no sessions
      deny: ["web_fetch", "web_search", "cron", "data", "delegate", "send_file"],
    },
  };

  const agent = new Agent(agentOptions);

  log(` Session: ${agent.sessionId}`);

  let timer: ReturnType<typeof setTimeout> | undefined;
  try {
    // Run agent with timeout.
    // NOTE(review): on timeout the agent run is not cancelled here; it only
    // loses the race. Confirm whether Agent offers a way to abort.
    const result = await Promise.race([
      agent.run(buildPrompt(task)),
      new Promise<never>((_, reject) => {
        timer = setTimeout(() => reject(new Error("timeout")), opts.timeout);
      }),
    ]);

    // Collect the git diff (the patch)
    const patch = collectPatch(repoDir);

    return {
      instance_id: task.instance_id,
      success: patch.length > 0,
      patch,
      error: result.error,
      duration_ms: Date.now() - start,
      session_id: agent.sessionId,
    };
  } catch (err) {
    // Collect any partial patch
    const patch = collectPatch(repoDir);

    return {
      instance_id: task.instance_id,
      success: false,
      patch,
      error: err instanceof Error ? err.message : String(err),
      duration_ms: Date.now() - start,
      session_id: agent.sessionId,
    };
  } finally {
    // Without this, a finished task leaves its timer pending for up to
    // opts.timeout, keeping the event loop alive after the last task.
    if (timer !== undefined) clearTimeout(timer);
  }
}
|
||||
|
||||
// ============================================================
|
||||
// Logging
|
||||
// ============================================================
|
||||
|
||||
/**
 * Write a progress message to stderr, prefixed with the current UTC time
 * as `[HH:MM:SS]`.
 *
 * @param msg - Message text to print after the timestamp.
 */
function log(msg: string) {
  // Characters 11..18 of an ISO-8601 timestamp are the HH:MM:SS part.
  const clock = new Date().toISOString().substring(11, 19);
  console.error("[" + clock + "] " + msg);
}
|
||||
|
||||
// ============================================================
|
||||
// Main
|
||||
// ============================================================
|
||||
|
||||
/**
 * Entry point: run the selected SWE-bench tasks sequentially and append one
 * prediction line and one detailed result line per task.
 *
 * Tasks are loaded from `opts.dataset`, optionally narrowed to a single
 * instance, then sliced by offset and limit. Predictions are appended to the
 * output path; detailed results go to a sibling file whose name ends in
 * `.results.jsonl`. A task that throws (for example when repository setup
 * fails) is recorded as a failed result and the run continues with the next
 * task.
 */
async function main() {
  const opts = parseArgs();

  log("SWE-bench Runner for Multica");
  log(`Provider: ${opts.provider}${opts.model ? ` (${opts.model})` : ""}`);
  log(`Dataset: ${opts.dataset}`);
  log(`Work dir: ${opts.workdir}`);
  log(`Timeout: ${opts.timeout / 1000}s per task`);

  // Set SMC_DATA_DIR for isolation
  if (!process.env.SMC_DATA_DIR) {
    process.env.SMC_DATA_DIR = join(process.env.HOME || "~", ".swe-bench-eval");
    log(`SMC_DATA_DIR: ${process.env.SMC_DATA_DIR}`);
  }

  // Load dataset
  let tasks = loadDataset(resolve(opts.dataset));
  log(`Loaded ${tasks.length} tasks`);

  // Filter by instance ID if specified
  if (opts.instance) {
    tasks = tasks.filter((t) => t.instance_id === opts.instance);
    if (tasks.length === 0) {
      console.error(`Instance not found: ${opts.instance}`);
      process.exit(1);
    }
  }

  // Apply offset and limit
  if (opts.offset > 0) {
    tasks = tasks.slice(opts.offset);
  }
  if (opts.limit > 0) {
    tasks = tasks.slice(0, opts.limit);
  }

  log(`Running ${tasks.length} tasks`);

  // Prepare output
  const outputPath = resolve(opts.output);
  // Swap only a trailing ".jsonl"; when the output has another extension,
  // append the suffix so the two files can never be the same path.
  const jsonlSuffix = ".jsonl";
  const resultsPath = outputPath.endsWith(jsonlSuffix)
    ? `${outputPath.slice(0, -jsonlSuffix.length)}.results.jsonl`
    : `${outputPath}.results.jsonl`;
  // Make sure the destination directory exists before the first append.
  mkdirSync(resolve(outputPath, ".."), { recursive: true });

  // Run tasks sequentially
  const modelName = `multica-${opts.provider}${opts.model ? `-${opts.model}` : ""}`;
  let completed = 0;
  let succeeded = 0;

  for (const task of tasks) {
    completed++;
    log(`\n[${completed}/${tasks.length}] ${task.instance_id}`);

    const taskStart = Date.now();
    let result: RunResult;
    try {
      result = await runTask(task, opts);
    } catch (err) {
      // Keep the batch going: record the failure instead of aborting the run.
      result = {
        instance_id: task.instance_id,
        success: false,
        patch: "",
        error: err instanceof Error ? err.message : String(err),
        duration_ms: Date.now() - taskStart,
        session_id: "",
      };
    }

    if (result.success) succeeded++;

    // Write prediction in SWE-bench format
    const prediction: Prediction = {
      instance_id: result.instance_id,
      model_patch: result.patch,
      model_name_or_path: modelName,
    };
    appendFileSync(outputPath, JSON.stringify(prediction) + "\n");

    // Write detailed result
    appendFileSync(resultsPath, JSON.stringify(result) + "\n");

    const status = result.success ? "PATCHED" : "NO_PATCH";
    const errorInfo = result.error ? ` (${result.error})` : "";
    log(
      ` ${status} | ${(result.duration_ms / 1000).toFixed(1)}s | patch=${result.patch.length} bytes${errorInfo}`,
    );
  }

  log(`\n========================================`);
  log(`Results: ${succeeded}/${completed} tasks produced patches`);
  log(`Predictions: ${outputPath}`);
  log(`Details: ${resultsPath}`);
  log(`\nTo evaluate with SWE-bench harness:`);
  log(
    ` python -m swebench.harness.run_evaluation --dataset_name princeton-nlp/SWE-bench_Lite --predictions_path ${outputPath} --max_workers 4 --run_id multica`,
  );
}
|
||||
|
||||
// Script entry: any rejection from main() is reported on stderr and the
// process exits with status 1.
main().then(undefined, (reason) => {
  console.error("Fatal error:", reason);
  process.exit(1);
});
|
||||
Loading…
Add table
Add a link
Reference in a new issue