Initial: forked from runesleo/x-reader (MIT License) - thank you @runes_leo!
This commit is contained in:
commit
ee2ad83b12
25 changed files with 2512 additions and 0 deletions
3
x_reader/__init__.py
Normal file
3
x_reader/__init__.py
Normal file
|
|
@ -0,0 +1,3 @@
|
|||
"""x-reader: Universal content reader for 7+ platforms."""
|
||||
|
||||
__version__ = "0.1.0"
|
||||
139
x_reader/cli.py
Normal file
139
x_reader/cli.py
Normal file
|
|
@ -0,0 +1,139 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
x-reader CLI — fetch content from any platform.
|
||||
|
||||
Usage:
|
||||
x-reader <url> # Fetch a single URL
|
||||
x-reader <url1> <url2> ... # Fetch multiple URLs
|
||||
x-reader list # Show inbox contents
|
||||
x-reader clear # Clear inbox
|
||||
"""
|
||||
|
||||
import sys
|
||||
import asyncio
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
from dotenv import load_dotenv
|
||||
load_dotenv()
|
||||
|
||||
from x_reader.reader import UniversalReader
|
||||
from x_reader.schema import UnifiedInbox, SourceType
|
||||
|
||||
|
||||
def get_inbox_path() -> str:
|
||||
import os
|
||||
return os.getenv("INBOX_FILE", "unified_inbox.json")
|
||||
|
||||
|
||||
def cmd_fetch(urls: list[str]):
|
||||
"""Fetch one or more URLs."""
|
||||
inbox = UnifiedInbox(get_inbox_path())
|
||||
reader = UniversalReader(inbox=inbox)
|
||||
|
||||
async def run():
|
||||
if len(urls) == 1:
|
||||
item = await reader.read(urls[0])
|
||||
print(f"✅ [{item.source_type.value}] {item.title[:60]}")
|
||||
print(f" {item.url}")
|
||||
print(f" {item.content[:200]}...")
|
||||
else:
|
||||
items = await reader.read_batch(urls)
|
||||
for item in items:
|
||||
print(f"✅ [{item.source_type.value}] {item.title[:60]}")
|
||||
print(f"\n📦 Fetched {len(items)}/{len(urls)} URLs")
|
||||
|
||||
try:
|
||||
asyncio.run(run())
|
||||
except KeyboardInterrupt:
|
||||
print("\n⏹ Cancelled")
|
||||
except Exception as e:
|
||||
print(f"❌ {e}")
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
def cmd_list():
|
||||
"""Show inbox contents."""
|
||||
inbox = UnifiedInbox(get_inbox_path())
|
||||
if not inbox.items:
|
||||
print("📦 Inbox is empty")
|
||||
return
|
||||
|
||||
print(f"📦 Inbox: {len(inbox.items)} items\n")
|
||||
|
||||
emoji_map = {
|
||||
SourceType.TELEGRAM: "📢", SourceType.RSS: "📰",
|
||||
SourceType.BILIBILI: "🎬", SourceType.XIAOHONGSHU: "📕",
|
||||
SourceType.TWITTER: "🐦", SourceType.WECHAT: "💬",
|
||||
SourceType.YOUTUBE: "▶️", SourceType.MANUAL: "✏️",
|
||||
}
|
||||
|
||||
for i, item in enumerate(inbox.items[-20:], 1):
|
||||
emoji = emoji_map.get(item.source_type, "📄")
|
||||
print(f" {i:2d}. {emoji} [{item.source_type.value:8s}] {item.title[:50]}")
|
||||
|
||||
|
||||
def cmd_clear():
|
||||
"""Clear inbox."""
|
||||
path = Path(get_inbox_path())
|
||||
if path.exists():
|
||||
confirm = input("Clear inbox? (y/N) ")
|
||||
if confirm.lower() == 'y':
|
||||
path.write_text("[]")
|
||||
print("✅ Inbox cleared")
|
||||
else:
|
||||
print("📦 Inbox is already empty")
|
||||
|
||||
|
||||
def cmd_login(platform: str):
|
||||
"""Open browser for manual login to a platform."""
|
||||
from x_reader.login import login
|
||||
login(platform)
|
||||
|
||||
|
||||
def main():
|
||||
if len(sys.argv) < 2:
|
||||
print("""
|
||||
📖 x-reader — Universal content reader
|
||||
|
||||
Usage:
|
||||
x-reader <url> Fetch content from any URL
|
||||
x-reader <url1> <url2> Fetch multiple URLs
|
||||
x-reader login <platform> Login to a platform (saves session for browser fallback)
|
||||
x-reader list Show inbox contents
|
||||
x-reader clear Clear inbox
|
||||
|
||||
Supported platforms:
|
||||
WeChat, Telegram, X/Twitter, YouTube,
|
||||
Bilibili, Xiaohongshu, RSS, and any web page
|
||||
|
||||
Examples:
|
||||
x-reader https://mp.weixin.qq.com/s/abc123
|
||||
x-reader https://x.com/elonmusk/status/123456
|
||||
x-reader https://www.xiaohongshu.com/explore/abc123
|
||||
x-reader login xhs
|
||||
""")
|
||||
return
|
||||
|
||||
cmd = sys.argv[1].lower()
|
||||
|
||||
if cmd == "login":
|
||||
if len(sys.argv) < 3:
|
||||
print("❌ Usage: x-reader login <platform>")
|
||||
print(" Supported: xhs, wechat")
|
||||
sys.exit(1)
|
||||
cmd_login(sys.argv[2])
|
||||
elif cmd == "list":
|
||||
cmd_list()
|
||||
elif cmd == "clear":
|
||||
cmd_clear()
|
||||
elif cmd.startswith("http") or cmd.startswith("www.") or "." in cmd:
|
||||
urls = [arg for arg in sys.argv[1:] if arg.startswith(("http", "www.")) or "." in arg]
|
||||
cmd_fetch(urls)
|
||||
else:
|
||||
print(f"❌ Unknown command: {cmd}")
|
||||
print(" Run 'x-reader' with no args for help")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
0
x_reader/fetchers/__init__.py
Normal file
0
x_reader/fetchers/__init__.py
Normal file
46
x_reader/fetchers/bilibili.py
Normal file
46
x_reader/fetchers/bilibili.py
Normal file
|
|
@ -0,0 +1,46 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
"""Bilibili video fetcher — uses official web API."""
|
||||
|
||||
import re
|
||||
import requests
|
||||
from loguru import logger
|
||||
from typing import Dict, Any
|
||||
|
||||
|
||||
API_URL = "https://api.bilibili.com/x/web-interface/view"
|
||||
HEADERS = {
|
||||
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36"
|
||||
}
|
||||
|
||||
|
||||
async def fetch_bilibili(url_or_bv: str) -> Dict[str, Any]:
|
||||
"""Fetch Bilibili video metadata via official API."""
|
||||
logger.info(f"Fetching Bilibili: {url_or_bv}")
|
||||
|
||||
bv_id = url_or_bv
|
||||
if "bilibili.com" in url_or_bv or "b23.tv" in url_or_bv:
|
||||
match = re.search(r'BV\w+', url_or_bv)
|
||||
if match:
|
||||
bv_id = match.group()
|
||||
else:
|
||||
raise ValueError(f"Cannot extract BV ID from: {url_or_bv}")
|
||||
|
||||
resp = requests.get(API_URL, params={"bvid": bv_id}, headers=HEADERS, timeout=10)
|
||||
resp.raise_for_status()
|
||||
data = resp.json()
|
||||
|
||||
if data.get("code") != 0:
|
||||
raise ValueError(f"Bilibili API error: {data.get('message')}")
|
||||
|
||||
video = data["data"]
|
||||
return {
|
||||
"title": video.get("title", ""),
|
||||
"description": video.get("desc", ""),
|
||||
"author": video.get("owner", {}).get("name", ""),
|
||||
"url": f"https://www.bilibili.com/video/{bv_id}",
|
||||
"cover": video.get("pic", ""),
|
||||
"bvid": bv_id,
|
||||
"duration": video.get("duration", 0),
|
||||
"view_count": video.get("stat", {}).get("view", 0),
|
||||
"platform": "bilibili",
|
||||
}
|
||||
88
x_reader/fetchers/browser.py
Normal file
88
x_reader/fetchers/browser.py
Normal file
|
|
@ -0,0 +1,88 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Playwright browser fetcher — headless Chromium fallback for anti-scraping sites.
|
||||
|
||||
Used when Jina Reader fails (403/451/timeout). Supports persistent login
|
||||
sessions via Playwright's storage_state for platforms requiring authentication.
|
||||
|
||||
Install: pip install "x-reader[browser]" && playwright install chromium
|
||||
"""
|
||||
|
||||
from loguru import logger
|
||||
from pathlib import Path
|
||||
|
||||
SESSION_DIR = Path.home() / ".x-reader" / "sessions"
|
||||
TIMEOUT_MS = 30_000
|
||||
|
||||
|
||||
async def fetch_via_browser(url: str, storage_state: str = None) -> dict:
|
||||
"""
|
||||
Fetch a URL using headless Chromium via Playwright.
|
||||
|
||||
Args:
|
||||
url: Target URL to fetch.
|
||||
storage_state: Path to a Playwright storage state JSON file (cookies/localStorage).
|
||||
If provided, the browser context will load this session.
|
||||
|
||||
Returns:
|
||||
dict with keys: title, content, url, author
|
||||
"""
|
||||
try:
|
||||
from playwright.async_api import async_playwright
|
||||
except ImportError:
|
||||
raise RuntimeError(
|
||||
"Playwright is not installed. Run:\n"
|
||||
' pip install "x-reader[browser]"\n'
|
||||
" playwright install chromium"
|
||||
)
|
||||
|
||||
logger.info(f"Browser fetch: {url}")
|
||||
|
||||
async with async_playwright() as p:
|
||||
browser = await p.chromium.launch(headless=True)
|
||||
|
||||
context_kwargs = {}
|
||||
if storage_state and Path(storage_state).exists():
|
||||
context_kwargs["storage_state"] = storage_state
|
||||
logger.info(f"Using session: {storage_state}")
|
||||
|
||||
context = await browser.new_context(
|
||||
user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
|
||||
"AppleWebKit/537.36 (KHTML, like Gecko) "
|
||||
"Chrome/120.0.0.0 Safari/537.36",
|
||||
**context_kwargs,
|
||||
)
|
||||
page = await context.new_page()
|
||||
|
||||
try:
|
||||
await page.goto(url, wait_until="domcontentloaded", timeout=TIMEOUT_MS)
|
||||
# Extra wait for JS-heavy pages
|
||||
await page.wait_for_timeout(2000)
|
||||
|
||||
title = await page.title()
|
||||
# Extract main text content, stripping scripts/styles
|
||||
content = await page.evaluate("""() => {
|
||||
const el = document.querySelector('article')
|
||||
|| document.querySelector('main')
|
||||
|| document.querySelector('.content')
|
||||
|| document.body;
|
||||
return el ? el.innerText : '';
|
||||
}""")
|
||||
|
||||
result = {
|
||||
"title": (title or "").strip()[:200],
|
||||
"content": (content or "").strip(),
|
||||
"url": url,
|
||||
"author": "",
|
||||
}
|
||||
logger.info(f"Browser fetch OK: {title[:60]}")
|
||||
return result
|
||||
|
||||
finally:
|
||||
await context.close()
|
||||
await browser.close()
|
||||
|
||||
|
||||
def get_session_path(platform: str) -> str:
|
||||
"""Get the session file path for a platform."""
|
||||
return str(SESSION_DIR / f"{platform}.json")
|
||||
63
x_reader/fetchers/jina.py
Normal file
63
x_reader/fetchers/jina.py
Normal file
|
|
@ -0,0 +1,63 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Jina Reader — universal fallback for content extraction.
|
||||
|
||||
Uses https://r.jina.ai/{url} to extract markdown from any web page.
|
||||
Free, no API key required, handles JS rendering and anti-scraping.
|
||||
"""
|
||||
|
||||
import requests
|
||||
from loguru import logger
|
||||
|
||||
|
||||
JINA_BASE = "https://r.jina.ai"
|
||||
TIMEOUT = 30
|
||||
|
||||
HEADERS = {
|
||||
"Accept": "text/markdown",
|
||||
"User-Agent": "x-reader/0.1",
|
||||
}
|
||||
|
||||
|
||||
def fetch_via_jina(url: str) -> dict:
|
||||
"""
|
||||
Fetch any URL via Jina Reader and return structured data.
|
||||
|
||||
Returns:
|
||||
dict with keys: title, content, url, author (best-effort)
|
||||
"""
|
||||
jina_url = f"{JINA_BASE}/{url}"
|
||||
logger.info(f"Jina fetch: {url}")
|
||||
|
||||
try:
|
||||
resp = requests.get(jina_url, headers=HEADERS, timeout=TIMEOUT)
|
||||
resp.raise_for_status()
|
||||
text = resp.text
|
||||
|
||||
# Jina returns markdown; first line is usually the title
|
||||
lines = text.strip().split("\n")
|
||||
title = ""
|
||||
content_lines = []
|
||||
|
||||
for line in lines:
|
||||
if not title and line.strip():
|
||||
# First non-empty line as title, strip markdown heading
|
||||
title = line.lstrip("#").strip()
|
||||
else:
|
||||
content_lines.append(line)
|
||||
|
||||
content = "\n".join(content_lines).strip()
|
||||
|
||||
return {
|
||||
"title": title[:200],
|
||||
"content": content,
|
||||
"url": url,
|
||||
"author": "",
|
||||
}
|
||||
|
||||
except requests.Timeout:
|
||||
logger.error(f"Jina timeout: {url}")
|
||||
raise
|
||||
except requests.RequestException as e:
|
||||
logger.error(f"Jina fetch failed: {url} — {e}")
|
||||
raise
|
||||
47
x_reader/fetchers/rss.py
Normal file
47
x_reader/fetchers/rss.py
Normal file
|
|
@ -0,0 +1,47 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
"""RSS feed fetcher — uses feedparser."""
|
||||
|
||||
import feedparser
|
||||
from loguru import logger
|
||||
from typing import Dict, Any, List
|
||||
|
||||
|
||||
async def fetch_rss(url: str, limit: int = 20) -> List[Dict[str, Any]]:
|
||||
"""
|
||||
Fetch and parse an RSS/Atom feed.
|
||||
|
||||
Args:
|
||||
url: RSS feed URL
|
||||
limit: Max number of entries to return
|
||||
|
||||
Returns:
|
||||
List of article dicts with: title, summary, url, source, published
|
||||
"""
|
||||
logger.info(f"Fetching RSS: {url}")
|
||||
|
||||
feed = feedparser.parse(url)
|
||||
|
||||
if feed.bozo and not feed.entries:
|
||||
raise ValueError(f"Failed to parse RSS feed: {feed.bozo_exception}")
|
||||
|
||||
source_name = feed.feed.get("title", url)
|
||||
articles = []
|
||||
|
||||
for entry in feed.entries[:limit]:
|
||||
summary = ""
|
||||
if hasattr(entry, "summary"):
|
||||
summary = entry.summary
|
||||
elif hasattr(entry, "content"):
|
||||
summary = entry.content[0].get("value", "")
|
||||
|
||||
articles.append({
|
||||
"title": entry.get("title", ""),
|
||||
"summary": summary,
|
||||
"url": entry.get("link", ""),
|
||||
"source": source_name,
|
||||
"published": entry.get("published", ""),
|
||||
"platform": "rss",
|
||||
})
|
||||
|
||||
logger.info(f"RSS: {len(articles)} articles from {source_name}")
|
||||
return articles
|
||||
71
x_reader/fetchers/telegram.py
Normal file
71
x_reader/fetchers/telegram.py
Normal file
|
|
@ -0,0 +1,71 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Telegram channel fetcher — uses Telethon.
|
||||
|
||||
Requires: pip install x-reader[telegram]
|
||||
Requires: TG_API_ID + TG_API_HASH in .env
|
||||
"""
|
||||
|
||||
import os
|
||||
from datetime import datetime, timedelta, timezone
|
||||
from loguru import logger
|
||||
from typing import Dict, Any, List
|
||||
|
||||
|
||||
async def fetch_telegram(
|
||||
channel: str,
|
||||
limit: int = 20,
|
||||
hours: int = 24,
|
||||
session_path: str = None,
|
||||
) -> List[Dict[str, Any]]:
|
||||
"""
|
||||
Fetch recent messages from a Telegram channel.
|
||||
|
||||
Args:
|
||||
channel: Channel username (e.g. 'predictionmkt')
|
||||
limit: Max messages per channel
|
||||
hours: Only fetch messages from the last N hours
|
||||
session_path: Path to Telethon session file
|
||||
|
||||
Returns:
|
||||
List of message dicts
|
||||
"""
|
||||
try:
|
||||
from telethon import TelegramClient
|
||||
from telethon.tl.types import Message
|
||||
except ImportError:
|
||||
raise ImportError(
|
||||
"Telethon is required for Telegram fetching. "
|
||||
"Install with: pip install x-reader[telegram]"
|
||||
)
|
||||
|
||||
api_id = os.getenv("TG_API_ID", "")
|
||||
api_hash = os.getenv("TG_API_HASH", "")
|
||||
|
||||
if not api_id or not api_hash:
|
||||
raise ValueError("TG_API_ID and TG_API_HASH must be set in .env")
|
||||
|
||||
session = session_path or os.getenv("TG_SESSION_PATH", "./tg_session")
|
||||
cutoff = datetime.now(timezone.utc) - timedelta(hours=hours)
|
||||
|
||||
messages = []
|
||||
async with TelegramClient(session, int(api_id), api_hash) as client:
|
||||
logger.info(f"Fetching TG channel: {channel}")
|
||||
entity = await client.get_entity(channel)
|
||||
|
||||
async for msg in client.iter_messages(entity, limit=limit):
|
||||
if not isinstance(msg, Message) or not msg.text:
|
||||
continue
|
||||
if msg.date < cutoff:
|
||||
break
|
||||
|
||||
messages.append({
|
||||
"text": msg.text,
|
||||
"views": msg.views or 0,
|
||||
"date": msg.date.isoformat(),
|
||||
"url": f"https://t.me/{channel}/{msg.id}",
|
||||
"platform": "telegram",
|
||||
})
|
||||
|
||||
logger.info(f"TG {channel}: {len(messages)} messages")
|
||||
return messages
|
||||
217
x_reader/fetchers/twitter.py
Normal file
217
x_reader/fetchers/twitter.py
Normal file
|
|
@ -0,0 +1,217 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
X/Twitter fetcher — three-tier fallback:
|
||||
|
||||
1. X oEmbed API (fast, reliable for individual tweets, no login needed)
|
||||
2. Jina Reader (handles non-tweet X pages like profiles)
|
||||
3. Playwright + saved session (handles login-required content)
|
||||
|
||||
Install browser tier: pip install "x-reader[browser]" && playwright install chromium
|
||||
Save X session: x-reader login twitter
|
||||
"""
|
||||
|
||||
import re
|
||||
import requests
|
||||
from loguru import logger
|
||||
from typing import Dict, Any
|
||||
|
||||
from x_reader.fetchers.jina import fetch_via_jina
|
||||
|
||||
|
||||
OEMBED_URL = "https://publish.twitter.com/oembed"
|
||||
|
||||
|
||||
def _extract_author(url: str) -> str:
|
||||
"""Extract @username from tweet URL."""
|
||||
match = re.search(r'x\.com/(\w+)/status', url)
|
||||
return f"@{match.group(1)}" if match else ""
|
||||
|
||||
|
||||
def _is_tweet_url(url: str) -> bool:
|
||||
"""Check if this is a direct tweet/status URL (vs profile or other X page)."""
|
||||
return bool(re.search(r'x\.com/\w+/status/\d+', url))
|
||||
|
||||
|
||||
def _fetch_via_oembed(url: str) -> Dict[str, Any]:
|
||||
"""
|
||||
Fetch tweet text via X's oEmbed API.
|
||||
Free, reliable, no auth needed. Works for public tweets.
|
||||
Note: oEmbed requires twitter.com URLs (not x.com).
|
||||
"""
|
||||
# oEmbed API requires twitter.com format
|
||||
oembed_query_url = url.replace("x.com", "twitter.com")
|
||||
resp = requests.get(
|
||||
OEMBED_URL,
|
||||
params={"url": oembed_query_url, "omit_script": "true"},
|
||||
timeout=10,
|
||||
)
|
||||
resp.raise_for_status()
|
||||
data = resp.json()
|
||||
|
||||
# Strip HTML tags from the embedded HTML to get clean text
|
||||
html = data.get("html", "")
|
||||
text = re.sub(r'<[^>]+>', ' ', html)
|
||||
text = re.sub(r'\s+', ' ', text).strip()
|
||||
|
||||
return {
|
||||
"text": text,
|
||||
"author": data.get("author_name", ""),
|
||||
"author_url": data.get("author_url", ""),
|
||||
"title": text[:100] if text else "",
|
||||
}
|
||||
|
||||
|
||||
async def _fetch_via_playwright(url: str) -> Dict[str, Any]:
|
||||
"""
|
||||
Fetch tweet via Playwright with X-specific DOM selectors.
|
||||
Uses saved login session if available (~/.x-reader/sessions/twitter.json).
|
||||
"""
|
||||
try:
|
||||
from playwright.async_api import async_playwright
|
||||
except ImportError:
|
||||
raise RuntimeError(
|
||||
"Playwright not installed. Run:\n"
|
||||
' pip install "x-reader[browser]"\n'
|
||||
" playwright install chromium"
|
||||
)
|
||||
|
||||
from x_reader.fetchers.browser import get_session_path
|
||||
from pathlib import Path
|
||||
|
||||
session_path = get_session_path("twitter")
|
||||
has_session = Path(session_path).exists()
|
||||
if has_session:
|
||||
logger.info(f"Using saved X session: {session_path}")
|
||||
|
||||
async with async_playwright() as p:
|
||||
browser = await p.chromium.launch(headless=True)
|
||||
|
||||
context_kwargs = {}
|
||||
if has_session:
|
||||
context_kwargs["storage_state"] = session_path
|
||||
|
||||
context = await browser.new_context(
|
||||
user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
|
||||
"AppleWebKit/537.36 (KHTML, like Gecko) "
|
||||
"Chrome/120.0.0.0 Safari/537.36",
|
||||
**context_kwargs,
|
||||
)
|
||||
page = await context.new_page()
|
||||
|
||||
try:
|
||||
await page.goto(url, wait_until="domcontentloaded", timeout=30_000)
|
||||
|
||||
# Wait for tweet text to render (X is a SPA, needs JS execution)
|
||||
try:
|
||||
await page.wait_for_selector(
|
||||
'[data-testid="tweetText"]', timeout=10_000
|
||||
)
|
||||
except Exception:
|
||||
pass # May not appear if login required
|
||||
|
||||
# Extract tweet content with X-specific selectors
|
||||
tweet_text = await page.evaluate("""() => {
|
||||
// Priority 1: tweet text element
|
||||
const tweetEl = document.querySelector('[data-testid="tweetText"]');
|
||||
if (tweetEl) return tweetEl.innerText;
|
||||
|
||||
// Priority 2: article element (thread view)
|
||||
const article = document.querySelector('article');
|
||||
if (article) return article.innerText;
|
||||
|
||||
// Priority 3: main content area
|
||||
const main = document.querySelector('main');
|
||||
if (main) return main.innerText;
|
||||
|
||||
return '';
|
||||
}""")
|
||||
|
||||
title = await page.title()
|
||||
|
||||
return {
|
||||
"text": (tweet_text or "").strip(),
|
||||
"title": (title or "").strip()[:200],
|
||||
}
|
||||
finally:
|
||||
await context.close()
|
||||
await browser.close()
|
||||
|
||||
|
||||
async def fetch_twitter(url: str) -> Dict[str, Any]:
|
||||
"""
|
||||
Fetch a tweet or X post with three-tier fallback.
|
||||
|
||||
Args:
|
||||
url: Tweet URL (x.com or twitter.com)
|
||||
|
||||
Returns:
|
||||
Dict with: text, author, url, title, platform
|
||||
"""
|
||||
url = url.replace("twitter.com", "x.com")
|
||||
author = _extract_author(url)
|
||||
|
||||
# Tier 1: oEmbed API (best for individual tweets)
|
||||
if _is_tweet_url(url):
|
||||
try:
|
||||
logger.info(f"[Twitter] Tier 1 — oEmbed: {url}")
|
||||
data = _fetch_via_oembed(url)
|
||||
if data.get("text") and len(data["text"].strip()) > 20:
|
||||
return {
|
||||
"text": data["text"],
|
||||
"author": author or data.get("author", ""),
|
||||
"url": url,
|
||||
"title": data.get("title", ""),
|
||||
"platform": "twitter",
|
||||
}
|
||||
logger.warning("[Twitter] oEmbed returned thin content")
|
||||
except Exception as e:
|
||||
logger.warning(f"[Twitter] oEmbed failed ({e})")
|
||||
|
||||
# Tier 2: Jina Reader (handles profiles, threads, non-tweet pages)
|
||||
try:
|
||||
logger.info(f"[Twitter] Tier 2 — Jina: {url}")
|
||||
data = fetch_via_jina(url)
|
||||
content = data.get("content", "")
|
||||
title = data.get("title", "")
|
||||
jina_ok = (
|
||||
content
|
||||
and len(content.strip()) > 100
|
||||
and "not yet fully loaded" not in content.lower()
|
||||
and title.lower() not in ("x", "title: x", "")
|
||||
)
|
||||
if jina_ok:
|
||||
return {
|
||||
"text": content,
|
||||
"author": author,
|
||||
"url": url,
|
||||
"title": title,
|
||||
"platform": "twitter",
|
||||
}
|
||||
logger.warning("[Twitter] Jina returned unusable content")
|
||||
except Exception as e:
|
||||
logger.warning(f"[Twitter] Jina failed ({e})")
|
||||
|
||||
# Tier 3: Playwright + session with X-specific extraction
|
||||
try:
|
||||
logger.info(f"[Twitter] Tier 3 — Playwright: {url}")
|
||||
data = await _fetch_via_playwright(url)
|
||||
content = data.get("text", "")
|
||||
if content and len(content.strip()) > 20:
|
||||
return {
|
||||
"text": content,
|
||||
"author": author,
|
||||
"url": url,
|
||||
"title": data.get("title", ""),
|
||||
"platform": "twitter",
|
||||
}
|
||||
logger.warning("[Twitter] Playwright returned empty content")
|
||||
except RuntimeError:
|
||||
raise
|
||||
except Exception as e:
|
||||
logger.error(f"[Twitter] All methods failed: {e}")
|
||||
|
||||
raise RuntimeError(
|
||||
f"❌ All Twitter fetch methods failed for: {url}\n"
|
||||
f" Try: x-reader login twitter (to save session for browser fallback)\n"
|
||||
f" Then retry: x-reader {url}"
|
||||
)
|
||||
62
x_reader/fetchers/wechat.py
Normal file
62
x_reader/fetchers/wechat.py
Normal file
|
|
@ -0,0 +1,62 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
WeChat article fetcher — two-tier fallback:
|
||||
|
||||
1. Jina Reader (fast, no deps)
|
||||
2. Playwright headless (no login needed for public articles)
|
||||
"""
|
||||
|
||||
from loguru import logger
|
||||
from typing import Dict, Any
|
||||
|
||||
|
||||
async def fetch_wechat(url: str) -> Dict[str, Any]:
|
||||
"""
|
||||
Fetch a WeChat public account article with fallback.
|
||||
|
||||
Args:
|
||||
url: mp.weixin.qq.com article URL
|
||||
|
||||
Returns:
|
||||
Dict with: title, content, author, url, platform
|
||||
"""
|
||||
# Tier 1: Jina Reader
|
||||
try:
|
||||
logger.info(f"[WeChat] Tier 1 — Jina: {url}")
|
||||
from x_reader.fetchers.jina import fetch_via_jina
|
||||
|
||||
data = fetch_via_jina(url)
|
||||
if data.get("content"):
|
||||
return {
|
||||
"title": data["title"],
|
||||
"content": data["content"],
|
||||
"author": data.get("author", ""),
|
||||
"url": url,
|
||||
"platform": "wechat",
|
||||
}
|
||||
logger.warning("[WeChat] Jina returned empty content, falling back to browser")
|
||||
except Exception as e:
|
||||
logger.warning(f"[WeChat] Jina failed ({e}), falling back to browser")
|
||||
|
||||
# Tier 2: Playwright headless (no session needed)
|
||||
try:
|
||||
logger.info(f"[WeChat] Tier 2 — Playwright headless: {url}")
|
||||
from x_reader.fetchers.browser import fetch_via_browser
|
||||
|
||||
data = await fetch_via_browser(url)
|
||||
return {
|
||||
"title": data["title"],
|
||||
"content": data["content"],
|
||||
"author": data.get("author", ""),
|
||||
"url": url,
|
||||
"platform": "wechat",
|
||||
}
|
||||
except RuntimeError:
|
||||
# Playwright not installed — re-raise with original Jina error context
|
||||
raise
|
||||
except Exception as e:
|
||||
logger.error(f"[WeChat] Browser fetch also failed: {e}")
|
||||
raise RuntimeError(
|
||||
f"❌ All WeChat fetch methods failed.\n"
|
||||
f" Last error: {e}"
|
||||
)
|
||||
78
x_reader/fetchers/xhs.py
Normal file
78
x_reader/fetchers/xhs.py
Normal file
|
|
@ -0,0 +1,78 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Xiaohongshu (RED) note fetcher — three-tier fallback:
|
||||
|
||||
1. Jina Reader (fast, no deps)
|
||||
2. Playwright + saved session (handles 451/403)
|
||||
3. Error with login instructions
|
||||
|
||||
Install browser tier: pip install "x-reader[browser]" && playwright install chromium
|
||||
"""
|
||||
|
||||
from loguru import logger
|
||||
from typing import Dict, Any
|
||||
from pathlib import Path
|
||||
|
||||
from x_reader.fetchers.jina import fetch_via_jina
|
||||
|
||||
|
||||
async def fetch_xhs(url: str) -> Dict[str, Any]:
|
||||
"""
|
||||
Fetch a Xiaohongshu note with three-tier fallback.
|
||||
|
||||
Args:
|
||||
url: xiaohongshu.com or xhslink.com URL
|
||||
|
||||
Returns:
|
||||
Dict with: title, content, author, url, platform
|
||||
"""
|
||||
# Tier 1: Jina Reader
|
||||
try:
|
||||
logger.info(f"[XHS] Tier 1 — Jina: {url}")
|
||||
data = fetch_via_jina(url)
|
||||
if data.get("content"):
|
||||
return {
|
||||
"title": data["title"],
|
||||
"content": data["content"],
|
||||
"author": data.get("author", ""),
|
||||
"url": url,
|
||||
"platform": "xhs",
|
||||
}
|
||||
logger.warning("[XHS] Jina returned empty content, falling back to browser")
|
||||
except Exception as e:
|
||||
logger.warning(f"[XHS] Jina failed ({e}), falling back to browser")
|
||||
|
||||
# Tier 2: Playwright with session
|
||||
from x_reader.fetchers.browser import get_session_path, SESSION_DIR
|
||||
|
||||
session_path = get_session_path("xhs")
|
||||
if not Path(session_path).exists():
|
||||
# Tier 3: No session — guide user
|
||||
raise RuntimeError(
|
||||
f"❌ XHS blocked Jina and no saved session found.\n"
|
||||
f" Run: x-reader login xhs\n"
|
||||
f" Then retry this URL."
|
||||
)
|
||||
|
||||
try:
|
||||
logger.info(f"[XHS] Tier 2 — Playwright with session: {url}")
|
||||
from x_reader.fetchers.browser import fetch_via_browser
|
||||
|
||||
data = await fetch_via_browser(url, storage_state=session_path)
|
||||
return {
|
||||
"title": data["title"],
|
||||
"content": data["content"],
|
||||
"author": data.get("author", ""),
|
||||
"url": url,
|
||||
"platform": "xhs",
|
||||
}
|
||||
except RuntimeError:
|
||||
# Playwright not installed
|
||||
raise
|
||||
except Exception as e:
|
||||
logger.error(f"[XHS] Browser fetch also failed: {e}")
|
||||
raise RuntimeError(
|
||||
f"❌ All XHS fetch methods failed.\n"
|
||||
f" Last error: {e}\n"
|
||||
f" Try: x-reader login xhs (to refresh session)"
|
||||
)
|
||||
211
x_reader/fetchers/youtube.py
Normal file
211
x_reader/fetchers/youtube.py
Normal file
|
|
@ -0,0 +1,211 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
YouTube video fetcher — three-tier content extraction:
|
||||
|
||||
1. yt-dlp auto-subtitles (fastest, best quality for subtitled videos)
|
||||
2. yt-dlp audio download → Groq Whisper API transcription (for non-subtitled videos)
|
||||
3. Jina Reader fallback (page description only)
|
||||
|
||||
Requires: yt-dlp installed (brew install yt-dlp / pip install yt-dlp)
|
||||
Optional: GROQ_API_KEY env var for Whisper transcription
|
||||
"""
|
||||
|
||||
import re
|
||||
import os
|
||||
import subprocess
|
||||
import tempfile
|
||||
from loguru import logger
|
||||
from typing import Dict, Any
|
||||
|
||||
from x_reader.fetchers.jina import fetch_via_jina
|
||||
|
||||
|
||||
def _extract_video_id(url: str) -> str:
|
||||
"""Extract video ID from YouTube URL."""
|
||||
match = re.search(r'(?:v=|youtu\.be/)([a-zA-Z0-9_-]{11})', url)
|
||||
return match.group(1) if match else ""
|
||||
|
||||
|
||||
def _get_subtitles_via_ytdlp(url: str, lang: str = "en") -> str:
|
||||
"""
|
||||
Download auto-generated subtitles using yt-dlp.
|
||||
Returns subtitle text, or empty string if unavailable.
|
||||
"""
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
output_path = os.path.join(tmpdir, "sub")
|
||||
|
||||
cmd = [
|
||||
"yt-dlp",
|
||||
"--write-auto-sub",
|
||||
"--write-sub",
|
||||
"--sub-lang", lang,
|
||||
"--sub-format", "srt",
|
||||
"--skip-download",
|
||||
"-o", output_path,
|
||||
url,
|
||||
]
|
||||
|
||||
try:
|
||||
subprocess.run(cmd, capture_output=True, text=True, timeout=60)
|
||||
except FileNotFoundError:
|
||||
logger.warning("yt-dlp not found. Install with: brew install yt-dlp")
|
||||
return ""
|
||||
except subprocess.TimeoutExpired:
|
||||
logger.warning("yt-dlp subtitle download timed out")
|
||||
return ""
|
||||
|
||||
for ext in [f".{lang}.srt", f".{lang}.vtt"]:
|
||||
sub_file = output_path + ext
|
||||
if os.path.exists(sub_file):
|
||||
return _parse_srt(sub_file)
|
||||
|
||||
return ""
|
||||
|
||||
|
||||
def _parse_srt(filepath: str) -> str:
|
||||
"""Parse SRT file into clean text (strip timestamps and sequence numbers)."""
|
||||
with open(filepath, 'r', encoding='utf-8') as f:
|
||||
lines = f.readlines()
|
||||
|
||||
text_lines = []
|
||||
seen = set()
|
||||
|
||||
for line in lines:
|
||||
line = line.strip()
|
||||
if not line or line.isdigit() or '-->' in line:
|
||||
continue
|
||||
if line.startswith('[') and line.endswith(']'):
|
||||
continue
|
||||
if line not in seen:
|
||||
seen.add(line)
|
||||
text_lines.append(line)
|
||||
|
||||
return " ".join(text_lines)
|
||||
|
||||
|
||||
def _transcribe_via_whisper(url: str) -> str:
|
||||
"""
|
||||
Download audio with yt-dlp and transcribe via Groq Whisper API.
|
||||
|
||||
Requires: GROQ_API_KEY env var + yt-dlp + ffmpeg installed.
|
||||
Groq Whisper limit: 25MB audio file.
|
||||
Returns transcript text, or empty string if unavailable.
|
||||
"""
|
||||
api_key = os.getenv("GROQ_API_KEY")
|
||||
if not api_key:
|
||||
logger.info("GROQ_API_KEY not set, skipping Whisper transcription")
|
||||
return ""
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
output_template = os.path.join(tmpdir, "audio.%(ext)s")
|
||||
|
||||
cmd = [
|
||||
"yt-dlp",
|
||||
"-x",
|
||||
"--audio-format", "m4a",
|
||||
"--audio-quality", "5",
|
||||
"-o", output_template,
|
||||
"--no-playlist",
|
||||
url,
|
||||
]
|
||||
|
||||
try:
|
||||
subprocess.run(cmd, capture_output=True, text=True, timeout=180)
|
||||
except FileNotFoundError:
|
||||
logger.warning("yt-dlp not found for audio download")
|
||||
return ""
|
||||
except subprocess.TimeoutExpired:
|
||||
logger.warning("yt-dlp audio download timed out")
|
||||
return ""
|
||||
|
||||
# Find the downloaded audio file
|
||||
audio_path = os.path.join(tmpdir, "audio.m4a")
|
||||
if not os.path.exists(audio_path):
|
||||
for f in os.listdir(tmpdir):
|
||||
if f.startswith("audio."):
|
||||
audio_path = os.path.join(tmpdir, f)
|
||||
break
|
||||
else:
|
||||
logger.warning("No audio file downloaded")
|
||||
return ""
|
||||
|
||||
file_size = os.path.getsize(audio_path)
|
||||
if file_size > 25 * 1024 * 1024:
|
||||
logger.warning(f"Audio file too large ({file_size // 1024 // 1024}MB > 25MB limit)")
|
||||
return ""
|
||||
|
||||
logger.info(f"Transcribing {file_size // 1024}KB audio via Groq Whisper...")
|
||||
|
||||
import requests
|
||||
try:
|
||||
with open(audio_path, "rb") as f:
|
||||
response = requests.post(
|
||||
"https://api.groq.com/openai/v1/audio/transcriptions",
|
||||
headers={"Authorization": f"Bearer {api_key}"},
|
||||
files={"file": (os.path.basename(audio_path), f, "audio/mp4")},
|
||||
data={"model": "whisper-large-v3", "response_format": "text"},
|
||||
timeout=120,
|
||||
)
|
||||
|
||||
if response.status_code == 200:
|
||||
transcript = response.text.strip()
|
||||
logger.info(f"Whisper transcript: {len(transcript)} chars")
|
||||
return transcript
|
||||
else:
|
||||
logger.warning(f"Groq Whisper API error: {response.status_code} {response.text[:200]}")
|
||||
return ""
|
||||
except Exception as e:
|
||||
logger.warning(f"Whisper transcription failed: {e}")
|
||||
return ""
|
||||
|
||||
|
||||
async def fetch_youtube(url: str, sub_lang: str = "en") -> Dict[str, Any]:
|
||||
"""
|
||||
Fetch YouTube video content with three-tier extraction.
|
||||
|
||||
Strategy:
|
||||
1. yt-dlp auto-subtitles (full transcript, fastest)
|
||||
2. yt-dlp audio + Groq Whisper API (for non-subtitled videos)
|
||||
3. Jina Reader fallback (page description only)
|
||||
|
||||
Args:
|
||||
url: YouTube video URL
|
||||
sub_lang: Subtitle language code (default: "en")
|
||||
|
||||
Returns:
|
||||
Dict with: title, description, author, url, video_id, has_transcript, platform
|
||||
"""
|
||||
logger.info(f"Fetching YouTube: {url}")
|
||||
video_id = _extract_video_id(url)
|
||||
|
||||
# Step 1: Get metadata via Jina (fast, always works)
|
||||
jina_data = fetch_via_jina(url)
|
||||
title = jina_data["title"]
|
||||
|
||||
# Step 2: Try yt-dlp auto-subtitles
|
||||
logger.info(f"Extracting subtitles ({sub_lang})...")
|
||||
transcript = _get_subtitles_via_ytdlp(url, lang=sub_lang)
|
||||
|
||||
# Step 3: No subtitles? Try Whisper transcription
|
||||
if not transcript:
|
||||
logger.info("No subtitles available, trying Whisper transcription...")
|
||||
transcript = _transcribe_via_whisper(url)
|
||||
|
||||
if transcript:
|
||||
logger.info(f"Got transcript: {len(transcript)} chars")
|
||||
content = transcript
|
||||
has_transcript = True
|
||||
else:
|
||||
logger.info("No transcript available, using page description")
|
||||
content = jina_data["content"]
|
||||
has_transcript = False
|
||||
|
||||
return {
|
||||
"title": title,
|
||||
"description": content,
|
||||
"author": jina_data.get("author", ""),
|
||||
"url": url,
|
||||
"video_id": video_id,
|
||||
"has_transcript": has_transcript,
|
||||
"platform": "youtube",
|
||||
}
|
||||
91
x_reader/login.py
Normal file
91
x_reader/login.py
Normal file
|
|
@ -0,0 +1,91 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Login manager — opens a visible browser for manual login, saves session.
|
||||
|
||||
Usage:
|
||||
x-reader login xhs # Login to Xiaohongshu
|
||||
x-reader login wechat # Login to WeChat (if needed)
|
||||
|
||||
Sessions are saved as Playwright storage_state JSON files.
|
||||
"""
|
||||
|
||||
from pathlib import Path
|
||||
from loguru import logger
|
||||
|
||||
SESSION_DIR = Path.home() / ".x-reader" / "sessions"
|
||||
|
||||
PLATFORM_URLS = {
|
||||
"xhs": "https://www.xiaohongshu.com/explore",
|
||||
"xiaohongshu": "https://www.xiaohongshu.com/explore",
|
||||
"wechat": "https://mp.weixin.qq.com",
|
||||
"twitter": "https://x.com/login",
|
||||
"x": "https://x.com/login",
|
||||
}
|
||||
|
||||
|
||||
def login(platform: str) -> None:
|
||||
"""
|
||||
Open a visible browser for the user to log in manually.
|
||||
After login, saves cookies/localStorage to a session file.
|
||||
|
||||
Args:
|
||||
platform: Platform key (e.g. 'xhs', 'wechat')
|
||||
"""
|
||||
try:
|
||||
from playwright.sync_api import sync_playwright
|
||||
except ImportError:
|
||||
print(
|
||||
"❌ Playwright is not installed. Run:\n"
|
||||
' pip install "x-reader[browser]"\n'
|
||||
" playwright install chromium"
|
||||
)
|
||||
return
|
||||
|
||||
platform = platform.lower()
|
||||
login_url = PLATFORM_URLS.get(platform)
|
||||
if not login_url:
|
||||
supported = ", ".join(sorted(PLATFORM_URLS.keys()))
|
||||
print(f"❌ Unknown platform: {platform}")
|
||||
print(f" Supported: {supported}")
|
||||
return
|
||||
|
||||
SESSION_DIR.mkdir(parents=True, exist_ok=True)
|
||||
session_path = SESSION_DIR / f"{platform}.json"
|
||||
# Normalize alias to canonical name
|
||||
if platform in ("xhs", "xiaohongshu"):
|
||||
canonical = "xhs"
|
||||
elif platform in ("twitter", "x"):
|
||||
canonical = "twitter"
|
||||
else:
|
||||
canonical = platform
|
||||
session_path = SESSION_DIR / f"{canonical}.json"
|
||||
|
||||
print(f"🌐 Opening {platform} login page: {login_url}")
|
||||
print(" Please log in manually in the browser window.")
|
||||
print(" When done, close the browser or press Ctrl+C.\n")
|
||||
|
||||
with sync_playwright() as p:
|
||||
browser = p.chromium.launch(headless=False)
|
||||
context = browser.new_context(
|
||||
user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
|
||||
"AppleWebKit/537.36 (KHTML, like Gecko) "
|
||||
"Chrome/120.0.0.0 Safari/537.36",
|
||||
)
|
||||
page = context.new_page()
|
||||
page.goto(login_url)
|
||||
|
||||
try:
|
||||
# Wait for user to log in — blocks until browser is closed
|
||||
page.wait_for_event("close", timeout=300_000) # 5 min max
|
||||
except KeyboardInterrupt:
|
||||
pass
|
||||
except Exception:
|
||||
pass # Browser closed by user
|
||||
|
||||
# Save session regardless of how we got here
|
||||
context.storage_state(path=str(session_path))
|
||||
logger.info(f"Session saved: {session_path}")
|
||||
print(f"\n✅ Session saved to {session_path}")
|
||||
|
||||
context.close()
|
||||
browser.close()
|
||||
154
x_reader/reader.py
Normal file
154
x_reader/reader.py
Normal file
|
|
@ -0,0 +1,154 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Universal Reader — routes any URL to the right fetcher.
|
||||
|
||||
The core dispatcher: give it a URL, get back structured content.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
from urllib.parse import urlparse
|
||||
from loguru import logger
|
||||
from typing import Dict, Any, Optional
|
||||
|
||||
from x_reader.schema import (
|
||||
UnifiedContent, UnifiedInbox, SourceType,
|
||||
from_bilibili, from_twitter, from_wechat,
|
||||
from_xiaohongshu, from_youtube, from_rss, from_telegram,
|
||||
)
|
||||
from x_reader.fetchers.jina import fetch_via_jina
|
||||
|
||||
|
||||
class UniversalReader:
|
||||
"""
|
||||
Routes URLs to platform-specific fetchers.
|
||||
Falls back to Jina Reader for unknown platforms.
|
||||
"""
|
||||
|
||||
def __init__(self, inbox: Optional[UnifiedInbox] = None):
|
||||
self.inbox = inbox
|
||||
|
||||
def _detect_platform(self, url: str) -> str:
|
||||
"""Detect platform from URL."""
|
||||
domain = urlparse(url).netloc.lower()
|
||||
|
||||
if "mp.weixin.qq.com" in domain:
|
||||
return "wechat"
|
||||
if "x.com" in domain or "twitter.com" in domain:
|
||||
return "twitter"
|
||||
if "youtube.com" in domain or "youtu.be" in domain:
|
||||
return "youtube"
|
||||
if "xiaohongshu.com" in domain or "xhslink.com" in domain:
|
||||
return "xhs"
|
||||
if "bilibili.com" in domain or "b23.tv" in domain:
|
||||
return "bilibili"
|
||||
if "xiaoyuzhoufm.com" in domain:
|
||||
return "podcast"
|
||||
if "podcasts.apple.com" in domain:
|
||||
return "podcast"
|
||||
if "t.me" in domain or "telegram.org" in domain:
|
||||
return "telegram"
|
||||
if url.endswith(".xml") or "/rss" in url or "/feed" in url or "/atom" in url:
|
||||
return "rss"
|
||||
return "generic"
|
||||
|
||||
async def read(self, url: str) -> UnifiedContent:
|
||||
"""
|
||||
Fetch content from any URL and return as UnifiedContent.
|
||||
|
||||
The main entry point — give it a URL, get back structured content.
|
||||
"""
|
||||
# Ensure URL has scheme
|
||||
if not url.startswith(("http://", "https://")):
|
||||
url = f"https://{url}"
|
||||
|
||||
platform = self._detect_platform(url)
|
||||
logger.info(f"[{platform}] {url[:60]}...")
|
||||
|
||||
try:
|
||||
content = await self._fetch(platform, url)
|
||||
|
||||
# Save to inbox if configured
|
||||
if self.inbox:
|
||||
if self.inbox.add(content):
|
||||
self.inbox.save()
|
||||
logger.info(f"Saved to inbox: {content.title[:50]}")
|
||||
|
||||
# Save to markdown output if configured
|
||||
from x_reader.utils.storage import save_to_markdown
|
||||
save_to_markdown(content)
|
||||
|
||||
return content
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"[{platform}] Failed: {e}")
|
||||
raise
|
||||
|
||||
async def _fetch(self, platform: str, url: str) -> UnifiedContent:
|
||||
"""Dispatch to platform-specific fetcher."""
|
||||
|
||||
if platform == "bilibili":
|
||||
from x_reader.fetchers.bilibili import fetch_bilibili
|
||||
data = await fetch_bilibili(url)
|
||||
return from_bilibili(data)
|
||||
|
||||
if platform == "twitter":
|
||||
from x_reader.fetchers.twitter import fetch_twitter
|
||||
data = await fetch_twitter(url)
|
||||
return from_twitter(data)
|
||||
|
||||
if platform == "wechat":
|
||||
from x_reader.fetchers.wechat import fetch_wechat
|
||||
data = await fetch_wechat(url)
|
||||
return from_wechat(data)
|
||||
|
||||
if platform == "xhs":
|
||||
from x_reader.fetchers.xhs import fetch_xhs
|
||||
data = await fetch_xhs(url)
|
||||
return from_xiaohongshu(data)
|
||||
|
||||
if platform == "youtube":
|
||||
from x_reader.fetchers.youtube import fetch_youtube
|
||||
data = await fetch_youtube(url)
|
||||
return from_youtube(data)
|
||||
|
||||
if platform == "rss":
|
||||
from x_reader.fetchers.rss import fetch_rss
|
||||
articles = await fetch_rss(url, limit=1)
|
||||
if articles:
|
||||
return from_rss(articles[0])
|
||||
raise ValueError(f"No articles found in RSS feed: {url}")
|
||||
|
||||
if platform == "telegram":
|
||||
from x_reader.fetchers.telegram import fetch_telegram
|
||||
# Extract channel username from t.me URL
|
||||
path = urlparse(url).path.strip("/").split("/")[0]
|
||||
channel = path if path else url
|
||||
messages = await fetch_telegram(channel, limit=1)
|
||||
if messages:
|
||||
return from_telegram(messages[0], channel, channel)
|
||||
raise ValueError(f"No messages from Telegram channel: {url}")
|
||||
|
||||
# Fallback: Jina Reader for any unknown URL
|
||||
logger.info(f"Using Jina fallback for: {url}")
|
||||
data = fetch_via_jina(url)
|
||||
return UnifiedContent(
|
||||
source_type=SourceType.MANUAL,
|
||||
source_name=urlparse(url).netloc,
|
||||
title=data["title"],
|
||||
content=data["content"],
|
||||
url=url,
|
||||
)
|
||||
|
||||
async def read_batch(self, urls: list[str]) -> list[UnifiedContent]:
|
||||
"""Fetch multiple URLs concurrently."""
|
||||
tasks = [self.read(url) for url in urls]
|
||||
results = await asyncio.gather(*tasks, return_exceptions=True)
|
||||
|
||||
contents = []
|
||||
for url, result in zip(urls, results):
|
||||
if isinstance(result, Exception):
|
||||
logger.error(f"Batch failed for {url}: {result}")
|
||||
else:
|
||||
contents.append(result)
|
||||
|
||||
return contents
|
||||
277
x_reader/schema.py
Normal file
277
x_reader/schema.py
Normal file
|
|
@ -0,0 +1,277 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Unified content schema for x-reader.
|
||||
|
||||
Defines the standard data format for all content sources:
|
||||
- Telegram channels
|
||||
- RSS feeds
|
||||
- Bilibili videos
|
||||
- Xiaohongshu (RED) notes
|
||||
- WeChat articles
|
||||
- X/Twitter posts
|
||||
- YouTube videos
|
||||
- Manual input
|
||||
"""
|
||||
|
||||
from dataclasses import dataclass, field, asdict
|
||||
from datetime import datetime, timedelta
|
||||
from typing import Optional, List
|
||||
from enum import Enum
|
||||
import hashlib
|
||||
import json
|
||||
|
||||
|
||||
class SourceType(str, Enum):
|
||||
"""Content source types."""
|
||||
TELEGRAM = "telegram"
|
||||
RSS = "rss"
|
||||
BILIBILI = "bilibili"
|
||||
XIAOHONGSHU = "xhs"
|
||||
TWITTER = "twitter"
|
||||
WECHAT = "wechat"
|
||||
YOUTUBE = "youtube"
|
||||
MANUAL = "manual"
|
||||
|
||||
|
||||
class MediaType(str, Enum):
|
||||
"""Media types."""
|
||||
TEXT = "text"
|
||||
VIDEO = "video"
|
||||
AUDIO = "audio"
|
||||
IMAGE = "image"
|
||||
|
||||
|
||||
class Priority(str, Enum):
|
||||
"""Content priority levels."""
|
||||
HOT = "hot"
|
||||
QUALITY = "quality"
|
||||
DEEP = "deep"
|
||||
NORMAL = "normal"
|
||||
LOW = "low"
|
||||
|
||||
|
||||
@dataclass
|
||||
class UnifiedContent:
|
||||
"""Unified content format across all platforms."""
|
||||
|
||||
# === Required ===
|
||||
source_type: SourceType
|
||||
source_name: str
|
||||
title: str
|
||||
content: str
|
||||
url: str
|
||||
|
||||
# === Auto-generated ===
|
||||
id: str = ""
|
||||
fetched_at: str = ""
|
||||
|
||||
# === Media ===
|
||||
media_type: MediaType = MediaType.TEXT
|
||||
media_url: Optional[str] = None
|
||||
|
||||
# === Scoring ===
|
||||
score: int = 0
|
||||
priority: Priority = Priority.NORMAL
|
||||
category: str = ""
|
||||
tags: List[str] = field(default_factory=list)
|
||||
|
||||
# === Processing state ===
|
||||
processed: bool = False
|
||||
digest_date: Optional[str] = None
|
||||
|
||||
# === Translation ===
|
||||
title_cn: Optional[str] = None
|
||||
content_cn: Optional[str] = None
|
||||
|
||||
# === Metadata ===
|
||||
extra: dict = field(default_factory=dict)
|
||||
|
||||
def __post_init__(self):
|
||||
if not self.id:
|
||||
self.id = hashlib.md5(self.url.encode()).hexdigest()[:12]
|
||||
if not self.fetched_at:
|
||||
self.fetched_at = datetime.now().isoformat()
|
||||
|
||||
def to_dict(self) -> dict:
|
||||
d = asdict(self)
|
||||
d['source_type'] = self.source_type.value
|
||||
d['media_type'] = self.media_type.value
|
||||
d['priority'] = self.priority.value
|
||||
return d
|
||||
|
||||
@classmethod
|
||||
def from_dict(cls, data: dict) -> 'UnifiedContent':
|
||||
if isinstance(data.get('source_type'), str):
|
||||
data['source_type'] = SourceType(data['source_type'])
|
||||
if isinstance(data.get('media_type'), str):
|
||||
data['media_type'] = MediaType(data['media_type'])
|
||||
if isinstance(data.get('priority'), str):
|
||||
data['priority'] = Priority(data['priority'])
|
||||
known = {f.name for f in cls.__dataclass_fields__.values()}
|
||||
data = {k: v for k, v in data.items() if k in known}
|
||||
return cls(**data)
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Converters: platform-specific dict → UnifiedContent
|
||||
# =============================================================================
|
||||
|
||||
def from_telegram(msg: dict, channel_name: str, channel_username: str) -> UnifiedContent:
|
||||
return UnifiedContent(
|
||||
source_type=SourceType.TELEGRAM,
|
||||
source_name=channel_name,
|
||||
title=msg.get('text', '')[:100],
|
||||
content=msg.get('text', ''),
|
||||
url=msg.get('url', f"https://t.me/{channel_username}"),
|
||||
extra={"views": msg.get('views', 0), "channel_username": channel_username},
|
||||
)
|
||||
|
||||
|
||||
def from_rss(article: dict) -> UnifiedContent:
|
||||
return UnifiedContent(
|
||||
source_type=SourceType.RSS,
|
||||
source_name=article.get('source', ''),
|
||||
title=article.get('title', ''),
|
||||
content=article.get('summary', ''),
|
||||
url=article.get('url', article.get('link', '')),
|
||||
score=article.get('score', 0),
|
||||
category=article.get('category', ''),
|
||||
title_cn=article.get('title_cn'),
|
||||
content_cn=article.get('summary_cn'),
|
||||
)
|
||||
|
||||
|
||||
def from_bilibili(video: dict) -> UnifiedContent:
|
||||
return UnifiedContent(
|
||||
source_type=SourceType.BILIBILI,
|
||||
source_name=video.get('author', ''),
|
||||
title=video.get('title', ''),
|
||||
content=video.get('description', ''),
|
||||
url=video.get('url', ''),
|
||||
media_type=MediaType.VIDEO,
|
||||
media_url=video.get('cover', ''),
|
||||
extra={
|
||||
"bvid": video.get('bvid', ''),
|
||||
"duration": video.get('duration', 0),
|
||||
"view_count": video.get('view_count', 0),
|
||||
},
|
||||
)
|
||||
|
||||
|
||||
def from_twitter(data: dict) -> UnifiedContent:
|
||||
return UnifiedContent(
|
||||
source_type=SourceType.TWITTER,
|
||||
source_name=data.get('author', ''),
|
||||
title=data.get('text', '')[:100],
|
||||
content=data.get('text', ''),
|
||||
url=data.get('url', ''),
|
||||
extra={
|
||||
"likes": data.get('likes', 0),
|
||||
"retweets": data.get('retweets', 0),
|
||||
},
|
||||
)
|
||||
|
||||
|
||||
def from_wechat(article: dict) -> UnifiedContent:
|
||||
return UnifiedContent(
|
||||
source_type=SourceType.WECHAT,
|
||||
source_name=article.get('author', ''),
|
||||
title=article.get('title', ''),
|
||||
content=article.get('content', ''),
|
||||
url=article.get('url', ''),
|
||||
)
|
||||
|
||||
|
||||
def from_xiaohongshu(note: dict) -> UnifiedContent:
|
||||
return UnifiedContent(
|
||||
source_type=SourceType.XIAOHONGSHU,
|
||||
source_name=note.get('author', ''),
|
||||
title=note.get('title', ''),
|
||||
content=note.get('content', ''),
|
||||
url=note.get('url', ''),
|
||||
media_type=MediaType.IMAGE if note.get('images') else MediaType.TEXT,
|
||||
extra={
|
||||
"likes": note.get('likes', 0),
|
||||
"collects": note.get('collects', 0),
|
||||
},
|
||||
)
|
||||
|
||||
|
||||
def from_youtube(video: dict) -> UnifiedContent:
|
||||
return UnifiedContent(
|
||||
source_type=SourceType.YOUTUBE,
|
||||
source_name=video.get('author', ''),
|
||||
title=video.get('title', ''),
|
||||
content=video.get('description', ''),
|
||||
url=video.get('url', ''),
|
||||
media_type=MediaType.VIDEO,
|
||||
extra={
|
||||
"duration": video.get('duration', ''),
|
||||
"view_count": video.get('view_count', 0),
|
||||
},
|
||||
)
|
||||
|
||||
|
||||
def from_manual(title: str, content: str, url: str = "") -> UnifiedContent:
|
||||
return UnifiedContent(
|
||||
source_type=SourceType.MANUAL,
|
||||
source_name="manual",
|
||||
title=title,
|
||||
content=content,
|
||||
url=url or f"manual://{hashlib.md5(title.encode()).hexdigest()[:8]}",
|
||||
)
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Unified Inbox
|
||||
# =============================================================================
|
||||
|
||||
class UnifiedInbox:
|
||||
"""JSON-based content inbox with dedup."""
|
||||
|
||||
def __init__(self, filepath: str = "unified_inbox.json"):
|
||||
self.filepath = filepath
|
||||
self.items: List[UnifiedContent] = []
|
||||
self.load()
|
||||
|
||||
def load(self):
|
||||
import os
|
||||
if os.path.exists(self.filepath):
|
||||
try:
|
||||
with open(self.filepath, 'r', encoding='utf-8') as f:
|
||||
data = json.load(f)
|
||||
self.items = [UnifiedContent.from_dict(d) for d in data]
|
||||
except (json.JSONDecodeError, IOError):
|
||||
self.items = []
|
||||
|
||||
def save(self):
|
||||
with open(self.filepath, 'w', encoding='utf-8') as f:
|
||||
json.dump([item.to_dict() for item in self.items], f,
|
||||
ensure_ascii=False, indent=2)
|
||||
|
||||
def add(self, item: UnifiedContent) -> bool:
|
||||
if any(i.id == item.id for i in self.items):
|
||||
return False
|
||||
self.items.append(item)
|
||||
return True
|
||||
|
||||
def add_batch(self, items: List[UnifiedContent]) -> int:
|
||||
return sum(1 for item in items if self.add(item))
|
||||
|
||||
def get_unprocessed(self) -> List[UnifiedContent]:
|
||||
return [i for i in self.items if not i.processed]
|
||||
|
||||
def get_by_source(self, source_type: SourceType) -> List[UnifiedContent]:
|
||||
return [i for i in self.items if i.source_type == source_type]
|
||||
|
||||
def mark_processed(self, item_id: str, digest_date: str = None):
|
||||
for item in self.items:
|
||||
if item.id == item_id:
|
||||
item.processed = True
|
||||
if digest_date:
|
||||
item.digest_date = digest_date
|
||||
break
|
||||
|
||||
def clear_old(self, days: int = 7):
|
||||
cutoff = (datetime.now() - timedelta(days=days)).isoformat()
|
||||
self.items = [i for i in self.items if i.fetched_at > cutoff]
|
||||
0
x_reader/utils/__init__.py
Normal file
0
x_reader/utils/__init__.py
Normal file
88
x_reader/utils/storage.py
Normal file
88
x_reader/utils/storage.py
Normal file
|
|
@ -0,0 +1,88 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Storage utilities — save content to JSON inbox and optional Markdown file.
|
||||
|
||||
Implements the "atomic archiving" from the tweet:
|
||||
- unified_inbox.json (for AI/programmatic use)
|
||||
- markdown file (for human reading, e.g. Obsidian)
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from loguru import logger
|
||||
|
||||
from x_reader.schema import UnifiedContent
|
||||
|
||||
|
||||
def save_to_json(item: UnifiedContent, filepath: str = "unified_inbox.json"):
|
||||
"""Append content to JSON inbox file."""
|
||||
path = Path(filepath)
|
||||
data = []
|
||||
|
||||
if path.exists():
|
||||
try:
|
||||
with open(path, 'r', encoding='utf-8') as f:
|
||||
data = json.load(f)
|
||||
except (json.JSONDecodeError, IOError):
|
||||
data = []
|
||||
|
||||
data.append(item.to_dict())
|
||||
|
||||
# Keep last 500 entries to prevent unbounded growth
|
||||
data = data[-500:]
|
||||
|
||||
with open(path, 'w', encoding='utf-8') as f:
|
||||
json.dump(data, f, ensure_ascii=False, indent=2)
|
||||
|
||||
logger.info(f"Saved to JSON: {path}")
|
||||
|
||||
|
||||
def save_to_markdown(item: UnifiedContent, filepath: str = None):
|
||||
"""
|
||||
Append content to a Markdown file (e.g. Obsidian vault).
|
||||
|
||||
Supports two output modes:
|
||||
- OUTPUT_DIR: Write to {OUTPUT_DIR}/content_hub.md
|
||||
- OBSIDIAN_VAULT: Write to {OBSIDIAN_VAULT}/01-收集箱/x-reader-inbox.md
|
||||
|
||||
If neither is set, skips markdown output.
|
||||
"""
|
||||
if not filepath:
|
||||
# Priority 1: Obsidian vault
|
||||
vault_path = os.getenv("OBSIDIAN_VAULT", "")
|
||||
if vault_path:
|
||||
filepath = os.path.join(vault_path, "01-收集箱", "x-reader-inbox.md")
|
||||
else:
|
||||
# Priority 2: generic output dir
|
||||
output_dir = os.getenv("OUTPUT_DIR", "")
|
||||
if not output_dir:
|
||||
return
|
||||
filepath = os.path.join(output_dir, "content_hub.md")
|
||||
|
||||
path = Path(filepath)
|
||||
path.parent.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
emoji = {
|
||||
"telegram": "📢", "rss": "📰", "bilibili": "🎬",
|
||||
"xhs": "📕", "twitter": "🐦", "wechat": "💬",
|
||||
"youtube": "▶️", "manual": "✏️",
|
||||
}.get(item.source_type.value, "📄")
|
||||
|
||||
with open(path, 'a', encoding='utf-8') as f:
|
||||
f.write(f"\n## {emoji} {item.title}\n")
|
||||
f.write(f"- Source: {item.source_name} ({item.source_type.value})\n")
|
||||
f.write(f"- URL: {item.url}\n")
|
||||
f.write(f"- Fetched: {item.fetched_at[:16]}\n\n")
|
||||
f.write(f"{item.content[:2000]}\n")
|
||||
f.write("\n---\n")
|
||||
|
||||
logger.info(f"Saved to Markdown: {path}")
|
||||
|
||||
|
||||
def save_content(item: UnifiedContent, json_path: str = None, md_path: str = None):
|
||||
"""Save content to both JSON and Markdown."""
|
||||
inbox_file = json_path or os.getenv("INBOX_FILE", "unified_inbox.json")
|
||||
save_to_json(item, inbox_file)
|
||||
save_to_markdown(item, md_path)
|
||||
Loading…
Add table
Add a link
Reference in a new issue