ai-marketing-skills/outbound-engine/scripts/lead-pipeline.py
Alfred Claw a96d0d8889 Initial commit: 6 AI marketing skill categories
- growth-engine: Autonomous experiment engine (Karpathy autoresearch for marketing)
- sales-pipeline: RB2B router, deal resurrector, trigger prospector, ICP learner
- content-ops: Expert panel, quality gate, editorial brain, quote miner
- outbound-engine: Cold outbound optimizer, lead pipeline, competitive monitor
- seo-ops: Content attack briefs, GSC optimizer, trend scout
- finance-ops: CFO briefing, cost estimate, scenario modeler

79 files, all sanitized - zero hardcoded credentials or internal references.
2026-03-27 20:14:52 -07:00

607 lines
21 KiB
Python

#!/usr/bin/env python3
"""
Lead Pipeline: Apollo → LeadMagic → Dedupe → Instantly
End-to-end lead sourcing, verification, deduplication, and upload pipeline.
Usage:
python3 lead-pipeline.py \\
--titles "VP Marketing,CMO" --industries "SaaS" \\
--company-size "11,50" --locations "United States" \\
--campaign-id YOUR_CAMPAIGN_UUID --volume 500
# Dry run (no upload)
python3 lead-pipeline.py \\
--titles "CTO,VP Engineering" --company-size "51,200" \\
--campaign-id YOUR_CAMPAIGN_UUID --volume 100 --dry-run
API keys are read from environment variables:
APOLLO_API_KEY, LEADMAGIC_API_KEY, INSTANTLY_API_KEY
Or pass them via --apollo-key, --leadmagic-key, --instantly-key flags.
"""
import argparse
import json
import os
import sys
import time
from datetime import datetime
from pathlib import Path
try:
import requests
except ImportError:
print("ERROR: 'requests' package required. Run: pip3 install requests", file=sys.stderr)
sys.exit(1)
# ---------------------------------------------------------------------------
# Retry / backoff helper
# ---------------------------------------------------------------------------
def request_with_retry(method, url, max_retries=5, **kwargs):
"""HTTP request with exponential backoff on 429 / 5xx."""
backoff = 1
for attempt in range(max_retries + 1):
try:
resp = requests.request(method, url, timeout=30, **kwargs)
if resp.status_code == 429:
wait = int(resp.headers.get("Retry-After", backoff))
print(f" ⏳ Rate limited (429). Waiting {wait}s …")
time.sleep(wait)
backoff = min(backoff * 2, 60)
continue
if resp.status_code >= 500:
print(f" ⚠️ Server error {resp.status_code}. Retry in {backoff}s …")
time.sleep(backoff)
backoff = min(backoff * 2, 60)
continue
return resp
except requests.exceptions.RequestException as e:
if attempt == max_retries:
raise
print(f" ⚠️ Request error: {e}. Retry in {backoff}s …")
time.sleep(backoff)
backoff = min(backoff * 2, 60)
return resp # type: ignore
# ---------------------------------------------------------------------------
# Step 1: Apollo People Search
# ---------------------------------------------------------------------------
def source_from_apollo(api_key, titles, industries, company_size, locations, keywords, volume):
"""Pull leads from Apollo People Search API."""
print(f"\n{'='*50}")
print(f"STEP 1: Sourcing from Apollo (target: {volume})")
print(f"{'='*50}")
url = "https://api.apollo.io/api/v1/mixed_people/search"
leads = []
page = 1
# Parse company size into Apollo format
size_ranges = []
if company_size:
parts = [s.strip() for s in company_size.split(",")]
if len(parts) == 2:
size_ranges = [f"{parts[0]},{parts[1]}"]
else:
size_ranges = parts
while len(leads) < volume:
body = {
"api_key": api_key,
"per_page": 100,
"page": page,
}
if titles:
body["person_titles"] = [t.strip() for t in titles.split(",")]
if industries:
body["q_organization_keyword_tags"] = [i.strip() for i in industries.split(",")]
if size_ranges:
body["organization_num_employees_ranges"] = size_ranges
if locations:
body["person_locations"] = [l.strip() for l in locations.split(",")]
if keywords:
body["q_keywords"] = keywords
print(f" 📡 Apollo page {page}", end=" ", flush=True)
resp = request_with_retry("POST", url, json=body)
if resp.status_code != 200:
print(f"ERROR {resp.status_code}: {resp.text[:200]}")
break
data = resp.json()
people = data.get("people", [])
if not people:
print("no more results.")
break
page_leads = 0
for person in people:
email = person.get("email")
if not email:
continue
leads.append({
"email": email.lower().strip(),
"first_name": person.get("first_name", ""),
"last_name": person.get("last_name", ""),
"title": person.get("title", ""),
"company_name": (person.get("organization") or {}).get("name", ""),
"domain": (person.get("organization") or {}).get("primary_domain", ""),
})
page_leads += 1
if len(leads) >= volume:
break
print(f"{page_leads} with email ({len(leads)} total)")
total_pages = data.get("pagination", {}).get("total_pages", page)
if page >= total_pages:
print(" Reached last Apollo page.")
break
page += 1
time.sleep(0.5)
# Dedupe by email within sourced set
seen = set()
unique_leads = []
for lead in leads:
if lead["email"] not in seen:
seen.add(lead["email"])
unique_leads.append(lead)
print(f"\n ✅ Sourced {len(unique_leads)} unique leads with emails")
return unique_leads
# ---------------------------------------------------------------------------
# Step 2: LeadMagic Email Verification
# ---------------------------------------------------------------------------
def verify_with_leadmagic(api_key, leads):
"""Verify emails via LeadMagic. Returns only valid leads."""
print(f"\n{'='*50}")
print(f"STEP 2: Verifying {len(leads)} emails via LeadMagic")
print(f"{'='*50}")
url = "https://api.leadmagic.io/v1/people/email-validation"
headers = {
"X-API-Key": api_key,
"Content-Type": "application/json",
}
valid_leads = []
invalid_count = 0
unknown_count = 0
error_count = 0
rejection_reasons = {}
for i, lead in enumerate(leads):
if (i + 1) % 50 == 0 or i == 0:
print(f" 🔍 Verifying {i+1}/{len(leads)}")
try:
resp = request_with_retry("POST", url, headers=headers, json={"email": lead["email"]})
if resp.status_code != 200:
error_count += 1
continue
data = resp.json()
status = data.get("email_status", "unknown")
if status == "valid":
lead["is_free_email"] = data.get("is_free_email", False)
lead["is_role_based"] = data.get("is_role_based", False)
valid_leads.append(lead)
elif status == "invalid":
invalid_count += 1
rejection_reasons["invalid"] = rejection_reasons.get("invalid", 0) + 1
else:
unknown_count += 1
rejection_reasons["unknown"] = rejection_reasons.get("unknown", 0) + 1
except Exception as e:
error_count += 1
print(f" ⚠️ Error verifying {lead['email']}: {e}")
if (i + 1) % 20 == 0:
time.sleep(0.5)
print(f"\n ✅ Verified: {len(valid_leads)} valid")
print(f" ❌ Invalid: {invalid_count}")
print(f" ❓ Unknown: {unknown_count}")
print(f" ⚠️ Errors: {error_count}")
if rejection_reasons:
print(f" 📊 Rejection breakdown: {rejection_reasons}")
return valid_leads, {
"total": len(leads),
"valid": len(valid_leads),
"invalid": invalid_count,
"unknown": unknown_count,
"errors": error_count,
"rejection_reasons": rejection_reasons,
}
# ---------------------------------------------------------------------------
# Step 3: Deduplicate against Instantly + exclusion list
# ---------------------------------------------------------------------------
def get_instantly_existing_emails(api_key):
"""Pull ALL existing leads from Instantly workspace for dedup."""
print(f"\n 📥 Fetching existing Instantly leads for dedup …")
url = "https://api.instantly.ai/api/v2/leads/list"
headers = {
"Authorization": f"Bearer {api_key}",
"Content-Type": "application/json",
}
existing_emails = set()
cursor = None
page = 0
while True:
body = {"limit": 100}
if cursor:
body["starting_after"] = cursor
resp = request_with_retry("POST", url, headers=headers, json=body)
if resp.status_code != 200:
print(f" ⚠️ Instantly list error {resp.status_code}: {resp.text[:200]}")
break
data = resp.json()
items = data.get("items", [])
if not items:
break
for item in items:
email = item.get("email", "").lower().strip()
if email:
existing_emails.add(email)
cursor = data.get("next_starting_after")
if not cursor:
break
page += 1
if page % 10 == 0:
print(f"{len(existing_emails)} existing leads so far")
time.sleep(1)
print(f" 📊 Found {len(existing_emails)} existing leads in Instantly")
return existing_emails
def load_exclusion_list(filepath):
"""Load burned emails from a CSV file (one email per line or first column)."""
excluded = set()
if not filepath or not os.path.exists(filepath):
return excluded
with open(filepath, "r") as f:
for line in f:
line = line.strip()
if not line or line.startswith("#"):
continue
email = line.split(",")[0].strip().strip('"').lower()
if "@" in email:
excluded.add(email)
print(f" 📋 Loaded {len(excluded)} emails from exclusion list")
return excluded
def deduplicate(leads, api_key, exclude_file=None):
"""Remove leads already in Instantly or on exclusion list."""
print(f"\n{'='*50}")
print(f"STEP 3: Deduplicating {len(leads)} leads")
print(f"{'='*50}")
existing = get_instantly_existing_emails(api_key)
excluded = load_exclusion_list(exclude_file)
deduped = []
instantly_dupes = 0
burned_dupes = 0
for lead in leads:
email = lead["email"]
if email in existing:
instantly_dupes += 1
elif email in excluded:
burned_dupes += 1
else:
deduped.append(lead)
print(f"\n ✅ Net new leads: {len(deduped)}")
print(f" 🔄 Already in Instantly: {instantly_dupes}")
print(f" 🚫 On exclusion list: {burned_dupes}")
return deduped, {
"instantly_dupes": instantly_dupes,
"burned_dupes": burned_dupes,
"net_new": len(deduped),
}
# ---------------------------------------------------------------------------
# Step 4: Upload to Instantly
# ---------------------------------------------------------------------------
def generate_personalization(lead):
"""Generate a simple 1-line personalization based on available data."""
name = lead.get("first_name", "")
company = lead.get("company_name", "")
title = lead.get("title", "")
if company and title:
return f"Noticed you're {title} at {company} — curious how you're thinking about growth this quarter."
elif company:
return f"Been following {company}'s trajectory — impressive momentum."
elif title:
return f"As a {title}, you're probably juggling growth and efficiency right now."
return "Your background caught my eye — wanted to reach out."
def upload_to_instantly(api_key, leads, campaign_id, dry_run=False):
"""Upload leads to Instantly campaign in batches."""
print(f"\n{'='*50}")
print(f"STEP 4: Uploading {len(leads)} leads to Instantly")
print(f"{'='*50}")
if dry_run:
print(" 🏃 DRY RUN — skipping actual upload")
return {"uploaded": 0, "failed": 0, "dry_run": True}
url = "https://api.instantly.ai/api/v2/leads"
headers = {
"Authorization": f"Bearer {api_key}",
"Content-Type": "application/json",
}
uploaded = 0
failed = 0
batch_size = 25
for i in range(0, len(leads), batch_size):
batch = leads[i:i + batch_size]
batch_num = (i // batch_size) + 1
total_batches = (len(leads) + batch_size - 1) // batch_size
print(f" 📤 Batch {batch_num}/{total_batches} ({len(batch)} leads) …", end=" ", flush=True)
batch_success = 0
batch_fail = 0
for lead in batch:
body = {
"email": lead["email"],
"first_name": lead.get("first_name", ""),
"last_name": lead.get("last_name", ""),
"company_name": lead.get("company_name", ""),
"campaign": campaign_id,
"custom_variables": {
"title": lead.get("title", ""),
"company_name": lead.get("company_name", ""),
"personalization": generate_personalization(lead),
},
}
try:
resp = request_with_retry("POST", url, headers=headers, json=body)
if resp.status_code in (200, 201):
batch_success += 1
else:
batch_fail += 1
if batch_fail <= 3:
print(f"\n ⚠️ Failed {lead['email']}: {resp.status_code} {resp.text[:100]}")
except Exception as e:
batch_fail += 1
print(f"\n ⚠️ Error uploading {lead['email']}: {e}")
uploaded += batch_success
failed += batch_fail
print(f"{batch_success} ok, {batch_fail} failed")
if i + batch_size < len(leads):
time.sleep(1)
print(f"\n ✅ Uploaded: {uploaded}")
if failed:
print(f" ❌ Failed: {failed}")
return {"uploaded": uploaded, "failed": failed, "dry_run": False}
# ---------------------------------------------------------------------------
# Reporting
# ---------------------------------------------------------------------------
def save_report(output_dir, sourced, verified_stats, dedup_stats, upload_stats, leads_uploaded, args):
"""Save run log as JSON."""
timestamp = datetime.now().strftime("%Y-%m-%d-%H-%M")
report_path = os.path.join(output_dir, f"{timestamp}.json")
report = {
"timestamp": datetime.now().isoformat(),
"parameters": {
"titles": args.titles,
"industries": args.industries,
"company_size": args.company_size,
"locations": args.locations,
"keywords": args.keywords,
"campaign_id": args.campaign_id,
"volume": args.volume,
"exclude_file": args.exclude_file,
"dry_run": args.dry_run,
},
"results": {
"sourced_from_apollo": sourced,
"verification": verified_stats,
"deduplication": dedup_stats,
"upload": upload_stats,
},
"leads_uploaded": [
{k: v for k, v in lead.items() if k not in ("is_free_email", "is_role_based")}
for lead in leads_uploaded
],
}
os.makedirs(output_dir, exist_ok=True)
with open(report_path, "w") as f:
json.dump(report, f, indent=2, default=str)
print(f"\n 💾 Run log saved: {report_path}")
return report_path
def print_summary(sourced_count, verified_stats, dedup_stats, upload_stats):
"""Print final summary."""
print(f"\n{'='*50}")
print(f" LEAD PIPELINE SUMMARY")
print(f"{'='*50}")
print(f" Sourced from Apollo: {sourced_count:>6}")
print(f" Verified (LeadMagic): {verified_stats['valid']:>6} ({verified_stats['valid']/max(sourced_count,1)*100:.1f}%)")
print(f" Already in Instantly: {dedup_stats['instantly_dupes']:>6}")
print(f" Excluded (burned list): {dedup_stats['burned_dupes']:>6}")
print(f" Net new uploaded: {upload_stats['uploaded']:>6}")
if upload_stats.get('failed'):
print(f" Failed uploads: {upload_stats['failed']:>6}")
if upload_stats.get('dry_run'):
print(f" ⚠️ DRY RUN — nothing was uploaded")
print(f"{'='*50}\n")
# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------
def main():
parser = argparse.ArgumentParser(
description="Lead Pipeline: Apollo → LeadMagic → Dedupe → Instantly",
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog="""
Examples:
# Full pipeline run
python3 lead-pipeline.py \\
--titles "VP Marketing,CMO" --industries "SaaS" \\
--company-size "11,50" --locations "United States" \\
--campaign-id abc-123 --volume 200
# Dry run (no upload)
python3 lead-pipeline.py \\
--titles "CTO,VP Engineering" --company-size "51,200" \\
--campaign-id abc-123 --volume 100 --dry-run
""",
)
parser.add_argument("--apollo-key", default=os.environ.get("APOLLO_API_KEY"),
help="Apollo API key (or set APOLLO_API_KEY env var)")
parser.add_argument("--leadmagic-key", default=os.environ.get("LEADMAGIC_API_KEY"),
help="LeadMagic API key (or set LEADMAGIC_API_KEY env var)")
parser.add_argument("--instantly-key", default=os.environ.get("INSTANTLY_API_KEY"),
help="Instantly API key (or set INSTANTLY_API_KEY env var)")
parser.add_argument("--titles", required=True, help="Comma-separated job titles")
parser.add_argument("--industries", default="", help="Comma-separated industries/keywords")
parser.add_argument("--company-size", default="", help="Employee range, e.g. '11,50'")
parser.add_argument("--locations", default="", help="Comma-separated locations")
parser.add_argument("--keywords", default="", help="Additional search keywords")
parser.add_argument("--campaign-id", required=True, help="Instantly campaign UUID")
parser.add_argument("--volume", type=int, default=500, help="Target number of leads (default: 500)")
parser.add_argument("--exclude-file", default=None, help="Path to CSV of burned/excluded emails")
parser.add_argument("--output-dir", default="./data/lead-pipeline-runs/",
help="Directory for run logs (default: ./data/lead-pipeline-runs/)")
parser.add_argument("--dry-run", action="store_true", help="Run pipeline but skip Instantly upload")
args = parser.parse_args()
# Validate required keys
if not args.apollo_key:
print("ERROR: Apollo API key required. Set APOLLO_API_KEY env var or pass --apollo-key.")
sys.exit(1)
if not args.leadmagic_key:
print("ERROR: LeadMagic API key required. Set LEADMAGIC_API_KEY env var or pass --leadmagic-key.")
sys.exit(1)
if not args.instantly_key:
print("ERROR: Instantly API key required. Set INSTANTLY_API_KEY env var or pass --instantly-key.")
sys.exit(1)
start_time = time.time()
os.makedirs(args.output_dir, exist_ok=True)
print(f"\n🚀 Lead Pipeline Started — {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
print(f" Target: {args.volume} leads → Campaign {args.campaign_id}")
if args.dry_run:
print(f" ⚠️ DRY RUN MODE — will not upload to Instantly")
# Step 1: Source from Apollo
sourced_leads = source_from_apollo(
api_key=args.apollo_key,
titles=args.titles,
industries=args.industries,
company_size=args.company_size,
locations=args.locations,
keywords=args.keywords,
volume=args.volume,
)
if not sourced_leads:
print("\n❌ No leads sourced from Apollo. Exiting.")
sys.exit(1)
# Save intermediate state
intermediate_path = os.path.join(args.output_dir, "last-sourced.json")
with open(intermediate_path, "w") as f:
json.dump(sourced_leads, f, indent=2)
# Step 2: Verify via LeadMagic
verified_leads, verified_stats = verify_with_leadmagic(args.leadmagic_key, sourced_leads)
if not verified_leads:
print("\n❌ No leads passed verification. Exiting.")
sys.exit(1)
intermediate_path = os.path.join(args.output_dir, "last-verified.json")
with open(intermediate_path, "w") as f:
json.dump(verified_leads, f, indent=2)
# Step 3: Deduplicate
deduped_leads, dedup_stats = deduplicate(verified_leads, args.instantly_key, args.exclude_file)
if not deduped_leads:
print("\n⚠️ All leads already exist in Instantly. Nothing to upload.")
upload_stats = {"uploaded": 0, "failed": 0, "dry_run": args.dry_run}
else:
# Step 4: Upload to Instantly
upload_stats = upload_to_instantly(args.instantly_key, deduped_leads, args.campaign_id, args.dry_run)
# Step 5: Report
print_summary(len(sourced_leads), verified_stats, dedup_stats, upload_stats)
save_report(
args.output_dir,
sourced=len(sourced_leads),
verified_stats=verified_stats,
dedup_stats=dedup_stats,
upload_stats=upload_stats,
leads_uploaded=deduped_leads,
args=args,
)
elapsed = time.time() - start_time
print(f"⏱️ Completed in {elapsed/60:.1f} minutes")
if __name__ == "__main__":
main()