ai-marketing-skills/growth-engine/experiment-engine.py
Alfred Claw a96d0d8889 Initial commit: 6 AI marketing skill categories
- growth-engine: Autonomous experiment engine (Karpathy autoresearch for marketing)
- sales-pipeline: RB2B router, deal resurrector, trigger prospector, ICP learner
- content-ops: Expert panel, quality gate, editorial brain, quote miner
- outbound-engine: Cold outbound optimizer, lead pipeline, competitive monitor
- seo-ops: Content attack briefs, GSC optimizer, trend scout
- finance-ops: CFO briefing, cost estimate, scenario modeler

79 files, all sanitized - zero hardcoded credentials or internal references.
2026-03-27 20:14:52 -07:00

493 lines
22 KiB
Python

#!/usr/bin/env python3
"""
Experiment Engine — Autonomous growth experimentation for AI agents.
Inspired by Karpathy's autoresearch pattern: create experiments with hypotheses,
log data points, run statistical analysis (bootstrap CI + Mann-Whitney U),
auto-promote winners to a living playbook, and suggest next experiments.
Supports batch mode (up to 10 variants simultaneously).
Usage:
# Create a new experiment
python3 experiment-engine.py create --agent content --hypothesis "Thread posts get 2x impressions vs single posts" \
--variable "format" --variants '["thread", "single"]' --metric "impressions" --cycle-hours 8
# Log a data point for a running experiment
python3 experiment-engine.py log --agent content --experiment-id EXP-001 --variant "thread" \
--metrics '{"impressions": 4500, "clicks": 120, "replies": 8}'
# Score an experiment (auto-promotes winner if criteria met)
python3 experiment-engine.py score --agent content --experiment-id EXP-001
# List active experiments for an agent
python3 experiment-engine.py list --agent content
# Get current best practices (promoted winners)
python3 experiment-engine.py playbook --agent content
# Suggest next experiment based on gaps
python3 experiment-engine.py suggest --agent content
"""
import argparse, json, os, sys
from datetime import datetime, timezone
from pathlib import Path
import numpy as np
from scipy import stats
# ── Configuration ──────────────────────────────────────────────────────────────
# Base directory for experiment data. Override with GROWTH_ENGINE_DATA_DIR env var.
BASE_DIR = Path(os.environ.get("GROWTH_ENGINE_DATA_DIR", "./data/experiments"))
# Define your agent/channel taxonomy. High-volume channels need fewer samples
# per variant because data arrives faster. Adjust to match your setup.
HIGH_VOLUME_AGENTS = set(os.environ.get("HIGH_VOLUME_AGENTS", "content,email").split(","))
LOW_VOLUME_AGENTS = set(os.environ.get("LOW_VOLUME_AGENTS", "seo,linkedin,blog").split(","))
# Batch mode: allow up to this many variants simultaneously (vs simple A/B)
BATCH_MODE_MAX_VARIANTS = int(os.environ.get("BATCH_MODE_MAX_VARIANTS", "10"))
# Map agent names to their marketing channels. Customize for your org.
AGENT_CHANNEL = {
"content": "social",
"email": "email",
"linkedin": "linkedin",
"seo": "seo",
"blog": "blog",
}
# Statistical parameters
BOOTSTRAP_ITERATIONS = int(os.environ.get("BOOTSTRAP_ITERATIONS", "1000"))
P_WINNER = float(os.environ.get("P_WINNER", "0.05")) # p-value threshold for declaring a winner
P_TREND = float(os.environ.get("P_TREND", "0.10")) # p-value threshold for "trending" status
LIFT_WIN = float(os.environ.get("LIFT_WIN", "15.0")) # minimum % lift required for "keep" decision
def get_min_samples(agent: str, override: int | None = None) -> int:
"""Return minimum samples per variant before scoring.
High-volume channels (email, social) need fewer samples (10).
Low-volume channels (SEO, blog) need more (30) for reliable signal.
Explicit override wins if > 3.
"""
if override is not None and override > 3:
return override
return 10 if agent in HIGH_VOLUME_AGENTS else 30
def bootstrap_lift_ci(a_vals, b_vals, n_iter=BOOTSTRAP_ITERATIONS, ci=95):
"""Bootstrap confidence interval for lift = (mean(b) - mean(a)) / mean(a) * 100.
Returns (lower_bound, upper_bound) as percentages, or (None, None) if baseline is zero.
"""
a = np.array(a_vals, dtype=float)
b = np.array(b_vals, dtype=float)
lifts = []
rng = np.random.default_rng(42)
for _ in range(n_iter):
sa = rng.choice(a, size=len(a), replace=True)
sb = rng.choice(b, size=len(b), replace=True)
baseline_mean = sa.mean()
if baseline_mean == 0:
continue
lifts.append((sb.mean() - baseline_mean) / baseline_mean * 100)
if not lifts:
return None, None
lo = float(np.percentile(lifts, (100 - ci) / 2))
hi = float(np.percentile(lifts, 100 - (100 - ci) / 2))
return round(lo, 1), round(hi, 1)
def get_agent_dir(agent):
d = BASE_DIR / agent
d.mkdir(parents=True, exist_ok=True)
return d
def load_json(path, default=None):
if path.exists():
return json.loads(path.read_text())
return default if default is not None else {}
def save_json(path, data):
path.write_text(json.dumps(data, indent=2, default=str))
def next_id(agent):
d = get_agent_dir(agent)
experiments = load_json(d / "experiments.json", [])
return f"EXP-{agent.upper()}-{len(experiments)+1:03d}"
def cmd_create(args):
d = get_agent_dir(args.agent)
experiments = load_json(d / "experiments.json", [])
exp_id = next_id(args.agent)
min_s = get_min_samples(args.agent, args.min_samples if args.min_samples != 3 else None)
variants = json.loads(args.variants)
batch_mode = getattr(args, "batch_mode", False)
if batch_mode and len(variants) > BATCH_MODE_MAX_VARIANTS:
print(f"⚠️ Batch mode capped at {BATCH_MODE_MAX_VARIANTS} variants (got {len(variants)})")
variants = variants[:BATCH_MODE_MAX_VARIANTS]
experiment = {
"id": exp_id,
"agent": args.agent,
"channel": AGENT_CHANNEL.get(args.agent, "unknown"),
"hypothesis": args.hypothesis,
"variable": args.variable,
"variants": variants,
"primary_metric": args.metric,
"cycle_hours": args.cycle_hours,
"min_samples": min_s,
"batch_mode": batch_mode,
"max_variants": BATCH_MODE_MAX_VARIANTS if batch_mode else 2,
"status": "running",
"created_at": datetime.now(timezone.utc).isoformat(),
"data_points": [],
"baseline_variant": variants[0],
"result": None,
"winner": None
}
experiments.append(experiment)
save_json(d / "experiments.json", experiments)
# Update active experiments index
active = load_json(d / "active.json", [])
active.append({"id": exp_id, "variable": args.variable, "variants": experiment["variants"],
"current_variant_idx": 0})
save_json(d / "active.json", active)
mode_str = f"BATCH ({len(variants)} variants)" if batch_mode else "A/B"
print(f"✅ Created {exp_id}: {args.hypothesis}")
print(f" Channel: {experiment['channel']} | Variable: {args.variable} | Mode: {mode_str}")
print(f" Variants: {experiment['variants']}")
print(f" Metric: {args.metric} | Cycle: {args.cycle_hours}h | Min samples/variant: {min_s}")
return exp_id
def cmd_log(args):
d = get_agent_dir(args.agent)
experiments = load_json(d / "experiments.json", [])
for exp in experiments:
if exp["id"] == args.experiment_id:
dp = {
"variant": args.variant,
"metrics": json.loads(args.metrics),
"logged_at": datetime.now(timezone.utc).isoformat(),
"notes": args.notes or ""
}
exp["data_points"].append(dp)
save_json(d / "experiments.json", experiments)
print(f"✅ Logged data point for {args.experiment_id} variant '{args.variant}': {dp['metrics']}")
return
print(f"❌ Experiment {args.experiment_id} not found")
sys.exit(1)
def cmd_score(args):
d = get_agent_dir(args.agent)
experiments = load_json(d / "experiments.json", [])
for exp in experiments:
if exp["id"] == args.experiment_id and exp["status"] in ("running", "active", "trending"):
# Group data points by variant
variant_data = {}
for dp in exp["data_points"]:
v = dp["variant"]
if v not in variant_data:
variant_data[v] = []
variant_data[v].append(dp["metrics"].get(exp["primary_metric"], 0))
baseline_v = exp["baseline_variant"]
min_samples = exp.get("min_samples",
get_min_samples(exp["agent"]) if "agent" in exp else 15)
# Enforce per-variant sample floor
insufficient = []
for v, data in variant_data.items():
if len(data) < min_samples:
insufficient.append((v, len(data)))
if insufficient:
for v, n in insufficient:
print(f"{exp['id']}: Variant '{v}' has {n}/{min_samples} samples. Need more data.")
# Check for trending signal even with fewer samples (need at least 15)
all_counts = {v: len(data) for v, data in variant_data.items()}
min_count = min(all_counts.values()) if all_counts else 0
if min_count >= 15 and baseline_v in variant_data:
baseline_vals = variant_data[baseline_v]
best_trend_v, best_trend_p = None, 1.0
for v, vals in variant_data.items():
if v == baseline_v or len(vals) < 15:
continue
_, p = stats.mannwhitneyu(baseline_vals, vals, alternative="less")
if p < P_TREND and p < best_trend_p:
best_trend_p = p
best_trend_v = v
if best_trend_v:
exp["status"] = "trending"
save_json(d / "experiments.json", experiments)
lift = (np.mean(variant_data[best_trend_v]) - np.mean(baseline_vals)) / np.mean(baseline_vals) * 100 if np.mean(baseline_vals) else 0
print(f"📈 {exp['id']}: TRENDING — '{best_trend_v}' p={best_trend_p:.3f}, lift={lift:.1f}% (needs more samples to confirm)")
return
if not variant_data:
print(f"{exp['id']}: No data points yet.")
return
baseline_vals = np.array(variant_data.get(baseline_v, []), dtype=float)
if len(baseline_vals) < min_samples:
print(f"{exp['id']}: Baseline variant '{baseline_v}' has {len(baseline_vals)}/{min_samples} samples.")
return
# Evaluate all non-baseline variants
results = []
for v, vals in variant_data.items():
if v == baseline_v:
continue
arr = np.array(vals, dtype=float)
baseline_mean = baseline_vals.mean()
variant_mean = arr.mean()
lift = ((variant_mean - baseline_mean) / baseline_mean * 100) if baseline_mean != 0 else 0
# Mann-Whitney U test (non-parametric, no normality assumption)
_, p_two = stats.mannwhitneyu(baseline_vals, arr, alternative="two-sided")
_, p_less = stats.mannwhitneyu(baseline_vals, arr, alternative="less")
ci_lo, ci_hi = bootstrap_lift_ci(baseline_vals.tolist(), arr.tolist())
if p_less < P_WINNER and lift >= LIFT_WIN:
status = "keep"
elif p_two < P_WINNER and lift < 0:
status = "crash" if lift <= -LIFT_WIN else "discard"
elif p_less < P_TREND and len(vals) >= 15:
status = "trending"
else:
status = "running"
results.append({
"variant": v,
"mean": round(float(variant_mean), 2),
"lift_pct": round(lift, 1),
"p_value": round(float(p_less), 4),
"ci_95": [ci_lo, ci_hi],
"n": len(vals),
"status": status
})
baseline_mean = float(baseline_vals.mean())
overall_result = {
"baseline": baseline_v,
"baseline_mean": round(baseline_mean, 2),
"baseline_n": len(baseline_vals),
"variants": results,
"scored_at": datetime.now(timezone.utc).isoformat(),
"min_samples": min_samples,
"thresholds": {"p_winner": P_WINNER, "p_trend": P_TREND, "lift_pct_required": LIFT_WIN}
}
winners = [r for r in results if r["status"] == "keep"]
crashes = [r for r in results if r["status"] in ("crash", "discard")]
trending = [r for r in results if r["status"] == "trending"]
if winners:
best = max(winners, key=lambda r: r["lift_pct"])
exp["status"] = "keep"
exp["winner"] = best["variant"]
exp["result"] = overall_result
save_json(d / "experiments.json", experiments)
# Auto-promote to playbook
playbook = load_json(d / "playbook.json", {})
playbook[exp["variable"]] = {
"best": best["variant"],
"metric": exp["primary_metric"],
"avg": best["mean"],
"improvement": best["lift_pct"],
"p_value": best["p_value"],
"ci_95": best["ci_95"],
"experiment_id": exp["id"],
"promoted_at": datetime.now(timezone.utc).isoformat()
}
save_json(d / "playbook.json", playbook)
# Remove from active index
active = load_json(d / "active.json", [])
active = [a for a in active if a["id"] != exp["id"]]
save_json(d / "active.json", active)
print(f"🏆 {exp['id']}: KEEP — '{best['variant']}' +{best['lift_pct']}% lift "
f"(p={best['p_value']}, 95% CI [{best['ci_95'][0]}, {best['ci_95'][1]}]%)")
print(f" 📖 Playbook updated: {exp['variable']}'{best['variant']}'")
elif all(r["status"] in ("crash", "discard") for r in results) and results:
worst = min(results, key=lambda r: r["lift_pct"])
exp["status"] = "discard"
exp["result"] = overall_result
save_json(d / "experiments.json", experiments)
active = load_json(d / "active.json", [])
active = [a for a in active if a["id"] != exp["id"]]
save_json(d / "active.json", active)
print(f"💀 {exp['id']}: DISCARD — baseline wins. Best variant: '{worst['variant']}' "
f"at {worst['lift_pct']}% (p={worst['p_value']})")
elif trending:
exp["status"] = "trending"
exp["result"] = overall_result
save_json(d / "experiments.json", experiments)
best_t = max(trending, key=lambda r: r["lift_pct"])
print(f"📈 {exp['id']}: TRENDING — '{best_t['variant']}' +{best_t['lift_pct']}% "
f"(p={best_t['p_value']}, n={best_t['n']}). Keep collecting data.")
else:
exp["status"] = "running"
exp["result"] = overall_result
save_json(d / "experiments.json", experiments)
for r in results:
print(f"{exp['id']}: '{r['variant']}' {r['lift_pct']:+.1f}% lift, p={r['p_value']} — running")
return
print(f"❌ Active experiment {args.experiment_id} not found")
def cmd_list(args):
d = get_agent_dir(args.agent)
experiments = load_json(d / "experiments.json", [])
status_filter = args.status or "all"
icons = {
"running": "🔬", "active": "🔬",
"trending": "📈",
"keep": "🏆", "promoted": "🏆",
"discard": "💀", "killed": "💀",
"crash": "🔴",
"inconclusive": "🤷"
}
for exp in experiments:
s = exp["status"]
if status_filter != "all" and s != status_filter:
aliases = {"active": "running", "promoted": "keep", "killed": "discard"}
if aliases.get(s) != status_filter and s != status_filter:
continue
dp_count = len(exp.get("data_points", []))
icon = icons.get(s, "")
ch = exp.get("channel", AGENT_CHANNEL.get(exp["agent"], "?"))
print(f"{icon} {exp['id']}: {exp['hypothesis']}")
print(f" Variable: {exp['variable']} | Channel: {ch} | Status: {s} | Data points: {dp_count}")
if exp.get("winner"):
result = exp.get("result", {})
lift = ""
if isinstance(result, dict):
for vr in result.get("variants", []):
if vr["variant"] == exp["winner"]:
lift = f" ({vr['lift_pct']:+.1f}% lift, p={vr['p_value']})"
break
print(f" Winner: {exp['winner']}{lift}")
print()
def cmd_playbook(args):
d = get_agent_dir(args.agent)
playbook = load_json(d / "playbook.json", {})
if not playbook:
print(f"📖 No playbook entries for {args.agent} yet. Run some experiments!")
return
print(f"📖 {args.agent.upper()} PLAYBOOK — Empirically Proven Best Practices\n")
for variable, entry in playbook.items():
p_str = f", p={entry['p_value']}" if "p_value" in entry else ""
ci_str = f", 95% CI {entry['ci_95']}" if "ci_95" in entry else ""
print(f" {variable}: '{entry['best']}' (+{entry['improvement']}% on {entry['metric']}{p_str}{ci_str})")
print(f" Source: {entry['experiment_id']} | Promoted: {entry['promoted_at'][:10]}")
print()
def cmd_suggest(args):
d = get_agent_dir(args.agent)
experiments = load_json(d / "experiments.json", [])
playbook = load_json(d / "playbook.json", {})
# Define testable categories per channel. Customize these for your business.
categories = {
"content": ["hook_style", "post_format", "cta_type", "post_time", "thread_length",
"emoji_usage", "data_vs_narrative", "question_vs_statement"],
"email": ["subject_line_style", "opener_type", "email_length", "personalization_depth",
"cta_style", "send_time", "follow_up_timing", "social_proof_type"],
"linkedin": ["inmail_opener", "role_framing", "company_pitch", "personalization_level",
"subject_line", "follow_up_cadence"],
"blog": ["headline_style", "content_format", "platform_priority", "visual_style",
"posting_time", "content_length"],
"seo": ["title_tag_format", "meta_description_style", "content_structure",
"internal_linking", "heading_format"]
}
tested = set(playbook.keys())
tested.update(e["variable"] for e in experiments if e["status"] in ("running", "active", "trending"))
agent_cats = categories.get(args.agent, [])
untested = [c for c in agent_cats if c not in tested]
min_s = get_min_samples(args.agent)
ch = AGENT_CHANNEL.get(args.agent, "?")
if untested:
print(f"💡 Suggested next experiments for {args.agent} ({ch}, min {min_s} samples/variant):")
for cat in untested[:3]:
print(f"{cat}")
else:
print(f"{args.agent} has tested all standard categories. Time for advanced experiments!")
def main():
parser = argparse.ArgumentParser(description="Experiment Engine — Autonomous growth experimentation")
sub = parser.add_subparsers(dest="command")
p_create = sub.add_parser("create", help="Create a new experiment")
p_create.add_argument("--agent", required=True, help="Agent/channel name (e.g., content, email, seo)")
p_create.add_argument("--hypothesis", required=True, help="What you're testing and expected outcome")
p_create.add_argument("--variable", required=True, help="The variable being tested (e.g., hook_style)")
p_create.add_argument("--variants", required=True, help="JSON array of variant names")
p_create.add_argument("--metric", required=True, help="Primary metric to optimize (e.g., impressions)")
p_create.add_argument("--cycle-hours", type=int, default=24, help="Hours per experiment cycle (default: 24)")
p_create.add_argument("--min-samples", type=int, default=3,
help="Override min samples/variant (default: auto based on channel volume)")
p_create.add_argument("--batch-mode", action="store_true",
help="Enable batch mode: up to 10 variants simultaneously")
p_log = sub.add_parser("log", help="Log a data point for a running experiment")
p_log.add_argument("--agent", required=True)
p_log.add_argument("--experiment-id", required=True)
p_log.add_argument("--variant", required=True)
p_log.add_argument("--metrics", required=True, help="JSON object of metric values")
p_log.add_argument("--notes", default="")
p_score = sub.add_parser("score", help="Score an experiment (auto-promotes winners)")
p_score.add_argument("--agent", required=True)
p_score.add_argument("--experiment-id", required=True)
p_list = sub.add_parser("list", help="List experiments for an agent")
p_list.add_argument("--agent", required=True)
p_list.add_argument("--status", default="all", help="Filter by status (running/trending/keep/discard/all)")
p_play = sub.add_parser("playbook", help="Show empirically proven best practices")
p_play.add_argument("--agent", required=True)
p_sug = sub.add_parser("suggest", help="Suggest next experiments based on gaps")
p_sug.add_argument("--agent", required=True)
args = parser.parse_args()
if not args.command:
parser.print_help()
return
{"create": cmd_create, "log": cmd_log, "score": cmd_score,
"list": cmd_list, "playbook": cmd_playbook, "suggest": cmd_suggest}[args.command](args)
if __name__ == "__main__":
main()