Agent-Reach/agent_eyes/readers/twitter.py
Panniantong d891af5b7d fix: import paths (fetchers→readers), schema field mismatch, RSS detection
Bugs found during fresh pip install testing:
- readers/*.py still referenced agent_eyes.fetchers → fixed to agent_eyes.readers
- reader.py passed 'author'/'metadata' to UnifiedContent which doesn't have those → use 'extra' field
- RSS URL detection missed domains containing 'rss' (e.g. hnrss.org)
2026-02-24 05:13:52 +01:00

217 lines
7.1 KiB
Python

# -*- coding: utf-8 -*-
"""
X/Twitter fetcher — three-tier fallback:
1. X oEmbed API (fast, reliable for individual tweets, no login needed)
2. Jina Reader (handles non-tweet X pages like profiles)
3. Playwright + saved session (handles login-required content)
Install browser tier: pip install "x-reader[browser]" && playwright install chromium
Save X session: x-reader login twitter
"""
import re
import requests
from loguru import logger
from typing import Dict, Any
from agent_eyes.readers.jina import fetch_via_jina
# Endpoint of the public oEmbed service queried by _fetch_via_oembed().
OEMBED_URL = "https://publish.twitter.com/oembed"
def _extract_author(url: str) -> str:
    """Extract the ``@username`` of the poster from a tweet/status URL.

    Both ``x.com`` and ``twitter.com`` hosts are recognised, with or without
    a subdomain such as ``www.`` or ``mobile.``.

    Args:
        url: Candidate tweet URL.

    Returns:
        The handle prefixed with ``@``, or an empty string when the URL is
        not a status URL or uses the account-less ``/i/status/<id>`` form.
    """
    # The lookbehind stops look-alike hosts (e.g. "fax.com") from matching.
    match = re.search(r'(?<![\w-])(?:x|twitter)\.com/(\w+)/status', url)
    if match is None:
        return ""
    handle = match.group(1)
    # "/i/status/<id>" is an account-agnostic permalink; "i" is not a user.
    if handle.lower() == "i":
        return ""
    return f"@{handle}"
def _is_tweet_url(url: str) -> bool:
    """Check if this is a direct tweet/status URL (vs profile or other X page).

    Both ``x.com`` and ``twitter.com`` hosts are accepted, with or without a
    subdomain. The path must have the shape ``/<name>/status/<digits>``.

    Args:
        url: Candidate URL.

    Returns:
        True when the URL points at a single status, False otherwise.
    """
    # The lookbehind stops look-alike hosts (e.g. "fax.com") from matching.
    pattern = r'(?<![\w-])(?:x|twitter)\.com/\w+/status/\d+'
    return re.search(pattern, url) is not None
def _fetch_via_oembed(url: str) -> Dict[str, Any]:
    """
    Fetch tweet text via X's oEmbed API.

    Free, reliable, no auth needed. Works for public tweets.
    Note: oEmbed requires twitter.com URLs (not x.com).

    Args:
        url: Tweet URL; an ``x.com`` host is rewritten to ``twitter.com``
            before the request is made.

    Returns:
        Dict with: text (tags stripped, entities decoded, whitespace
        collapsed), author, author_url, title (first 100 chars of text).

    Raises:
        requests.HTTPError: If the endpoint answers with an error status.
        requests.RequestException: On connection problems or timeout.
    """
    # Function-scope import so the module's import block stays untouched.
    from html import unescape

    # oEmbed API requires twitter.com format. Rewrite only the host (first
    # match, bounded on the left) so paths/queries containing "x.com" survive.
    oembed_query_url = re.sub(
        r'(?<![\w-])x\.com(?=/|$)', 'twitter.com', url, count=1
    )
    resp = requests.get(
        OEMBED_URL,
        params={"url": oembed_query_url, "omit_script": "true"},
        timeout=10,
    )
    resp.raise_for_status()
    data = resp.json()

    # "html" may be absent or null; normalise to an empty string.
    markup = data.get("html") or ""
    # Strip tags first, then decode entities, so an escaped "&lt;...&gt;" in
    # the tweet text is never mistaken for markup.
    text = re.sub(r'<[^>]+>', ' ', markup)
    text = unescape(text)
    text = re.sub(r'\s+', ' ', text).strip()

    return {
        "text": text,
        "author": data.get("author_name", ""),
        "author_url": data.get("author_url", ""),
        "title": text[:100] if text else "",
    }
async def _fetch_via_playwright(url: str) -> Dict[str, Any]:
    """
    Fetch tweet via Playwright with X-specific DOM selectors.

    Uses the saved login session if the file returned by
    ``get_session_path("twitter")`` exists.

    Args:
        url: Page to open in headless Chromium.

    Returns:
        Dict with: text (stripped page/tweet text, possibly empty) and
        title (page title, stripped and capped at 200 chars).

    Raises:
        RuntimeError: If Playwright is not installed.
        Exception: Navigation/evaluation errors from Playwright propagate.
    """
    try:
        from playwright.async_api import async_playwright
    except ImportError as err:
        # Chain the ImportError so the real cause stays visible in tracebacks.
        raise RuntimeError(
            "Playwright not installed. Run:\n"
            ' pip install "x-reader[browser]"\n'
            " playwright install chromium"
        ) from err
    from agent_eyes.readers.browser import get_session_path
    from pathlib import Path

    session_path = get_session_path("twitter")
    context_kwargs = {}
    if Path(session_path).exists():
        logger.info(f"Using saved X session: {session_path}")
        context_kwargs["storage_state"] = session_path

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        # Nested try/finally: the browser is closed even if creating the
        # context/page fails or if closing the context itself raises.
        try:
            context = await browser.new_context(
                user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
                "AppleWebKit/537.36 (KHTML, like Gecko) "
                "Chrome/120.0.0.0 Safari/537.36",
                **context_kwargs,
            )
            try:
                page = await context.new_page()
                await page.goto(
                    url, wait_until="domcontentloaded", timeout=30_000
                )
                # Wait for tweet text to render (X is a SPA, needs JS execution)
                try:
                    await page.wait_for_selector(
                        '[data-testid="tweetText"]', timeout=10_000
                    )
                except Exception as err:
                    # Best effort: may not appear if login is required. The
                    # fallback selectors below still get a chance.
                    logger.debug(
                        f"[Twitter] tweetText selector did not appear ({err})"
                    )
                # Extract tweet content with X-specific selectors
                tweet_text = await page.evaluate("""() => {
                    // Priority 1: tweet text element
                    const tweetEl = document.querySelector('[data-testid="tweetText"]');
                    if (tweetEl) return tweetEl.innerText;
                    // Priority 2: article element (thread view)
                    const article = document.querySelector('article');
                    if (article) return article.innerText;
                    // Priority 3: main content area
                    const main = document.querySelector('main');
                    if (main) return main.innerText;
                    return '';
                }""")
                title = await page.title()
                return {
                    "text": (tweet_text or "").strip(),
                    "title": (title or "").strip()[:200],
                }
            finally:
                await context.close()
        finally:
            await browser.close()
async def fetch_twitter(url: str) -> Dict[str, Any]:
    """
    Fetch a tweet or X post with three-tier fallback.

    Tiers are tried in order and the first one yielding usable content wins:
    oEmbed (status URLs only), Jina Reader, then Playwright.

    Args:
        url: Tweet URL (x.com or twitter.com)

    Returns:
        Dict with: text, author, url, title, platform

    Raises:
        RuntimeError: If every tier fails or returns unusable content, or if
            the Playwright tier raises a RuntimeError itself (e.g. Playwright
            is not installed). The most recent underlying error, if any, is
            attached as ``__cause__``.
    """
    url = url.replace("twitter.com", "x.com")
    author = _extract_author(url)
    last_error = None  # most recent tier failure, chained onto the final error

    # NOTE(review): Tier 1 and Tier 2 are synchronous calls and block the
    # event loop while they run; consider offloading them to an executor.

    # Tier 1: oEmbed API (best for individual tweets)
    if _is_tweet_url(url):
        try:
            logger.info(f"[Twitter] Tier 1 — oEmbed: {url}")
            data = _fetch_via_oembed(url)
            text = data.get("text") or ""
            if len(text.strip()) > 20:
                return _twitter_result(
                    text,
                    author or data.get("author", ""),
                    url,
                    data.get("title", ""),
                )
            logger.warning("[Twitter] oEmbed returned thin content")
        except Exception as e:
            logger.warning(f"[Twitter] oEmbed failed ({e})")
            last_error = e

    # Tier 2: Jina Reader (handles profiles, threads, non-tweet pages)
    try:
        logger.info(f"[Twitter] Tier 2 — Jina: {url}")
        data = fetch_via_jina(url)
        # "or ''" guards against keys that are present but set to None, which
        # would otherwise raise on .strip()/.lower() and discard the tier.
        content = data.get("content") or ""
        title = data.get("title") or ""
        jina_ok = (
            len(content.strip()) > 100
            and "not yet fully loaded" not in content.lower()
            # A bare "X" title means Jina only saw the app shell.
            and title.lower() not in ("x", "title: x", "")
        )
        if jina_ok:
            return _twitter_result(content, author, url, title)
        logger.warning("[Twitter] Jina returned unusable content")
    except Exception as e:
        logger.warning(f"[Twitter] Jina failed ({e})")
        last_error = e

    # Tier 3: Playwright + session with X-specific extraction
    try:
        logger.info(f"[Twitter] Tier 3 — Playwright: {url}")
        data = await _fetch_via_playwright(url)
        content = data.get("text") or ""
        if len(content.strip()) > 20:
            return _twitter_result(content, author, url, data.get("title", ""))
        logger.warning("[Twitter] Playwright returned empty content")
    except RuntimeError:
        # Already an actionable message (e.g. install instructions).
        raise
    except Exception as e:
        logger.error(f"[Twitter] All methods failed: {e}")
        last_error = e

    raise RuntimeError(
        f"❌ All Twitter fetch methods failed for: {url}\n"
        f" Try: x-reader login twitter (to save session for browser fallback)\n"
        f" Then retry: x-reader {url}"
    ) from last_error


def _twitter_result(text: str, author: str, url: str, title: str) -> Dict[str, Any]:
    """Build the result dict shared by every fetch tier of fetch_twitter."""
    return {
        "text": text,
        "author": author,
        "url": url,
        "title": title,
        "platform": "twitter",
    }