Initial: forked from runesleo/x-reader (MIT License) - thank you @runes_leo!

This commit is contained in:
Panniantong 2026-02-24 03:00:05 +01:00
commit ee2ad83b12
25 changed files with 2512 additions and 0 deletions

3
x_reader/__init__.py Normal file
View file

@ -0,0 +1,3 @@
"""x-reader: Universal content reader for 7+ platforms."""
__version__ = "0.1.0"

139
x_reader/cli.py Normal file
View file

@ -0,0 +1,139 @@
# -*- coding: utf-8 -*-
"""
x-reader CLI fetch content from any platform.
Usage:
x-reader <url> # Fetch a single URL
x-reader <url1> <url2> ... # Fetch multiple URLs
x-reader list # Show inbox contents
x-reader clear # Clear inbox
"""
import sys
import asyncio
import json
from pathlib import Path
from dotenv import load_dotenv
load_dotenv()
from x_reader.reader import UniversalReader
from x_reader.schema import UnifiedInbox, SourceType
def get_inbox_path() -> str:
import os
return os.getenv("INBOX_FILE", "unified_inbox.json")
def cmd_fetch(urls: list[str]):
"""Fetch one or more URLs."""
inbox = UnifiedInbox(get_inbox_path())
reader = UniversalReader(inbox=inbox)
async def run():
if len(urls) == 1:
item = await reader.read(urls[0])
print(f"✅ [{item.source_type.value}] {item.title[:60]}")
print(f" {item.url}")
print(f" {item.content[:200]}...")
else:
items = await reader.read_batch(urls)
for item in items:
print(f"✅ [{item.source_type.value}] {item.title[:60]}")
print(f"\n📦 Fetched {len(items)}/{len(urls)} URLs")
try:
asyncio.run(run())
except KeyboardInterrupt:
print("\n⏹ Cancelled")
except Exception as e:
print(f"{e}")
sys.exit(1)
def cmd_list():
"""Show inbox contents."""
inbox = UnifiedInbox(get_inbox_path())
if not inbox.items:
print("📦 Inbox is empty")
return
print(f"📦 Inbox: {len(inbox.items)} items\n")
emoji_map = {
SourceType.TELEGRAM: "📢", SourceType.RSS: "📰",
SourceType.BILIBILI: "🎬", SourceType.XIAOHONGSHU: "📕",
SourceType.TWITTER: "🐦", SourceType.WECHAT: "💬",
SourceType.YOUTUBE: "▶️", SourceType.MANUAL: "✏️",
}
for i, item in enumerate(inbox.items[-20:], 1):
emoji = emoji_map.get(item.source_type, "📄")
print(f" {i:2d}. {emoji} [{item.source_type.value:8s}] {item.title[:50]}")
def cmd_clear():
"""Clear inbox."""
path = Path(get_inbox_path())
if path.exists():
confirm = input("Clear inbox? (y/N) ")
if confirm.lower() == 'y':
path.write_text("[]")
print("✅ Inbox cleared")
else:
print("📦 Inbox is already empty")
def cmd_login(platform: str):
"""Open browser for manual login to a platform."""
from x_reader.login import login
login(platform)
def main():
if len(sys.argv) < 2:
print("""
📖 x-reader Universal content reader
Usage:
x-reader <url> Fetch content from any URL
x-reader <url1> <url2> Fetch multiple URLs
x-reader login <platform> Login to a platform (saves session for browser fallback)
x-reader list Show inbox contents
x-reader clear Clear inbox
Supported platforms:
WeChat, Telegram, X/Twitter, YouTube,
Bilibili, Xiaohongshu, RSS, and any web page
Examples:
x-reader https://mp.weixin.qq.com/s/abc123
x-reader https://x.com/elonmusk/status/123456
x-reader https://www.xiaohongshu.com/explore/abc123
x-reader login xhs
""")
return
cmd = sys.argv[1].lower()
if cmd == "login":
if len(sys.argv) < 3:
print("❌ Usage: x-reader login <platform>")
print(" Supported: xhs, wechat")
sys.exit(1)
cmd_login(sys.argv[2])
elif cmd == "list":
cmd_list()
elif cmd == "clear":
cmd_clear()
elif cmd.startswith("http") or cmd.startswith("www.") or "." in cmd:
urls = [arg for arg in sys.argv[1:] if arg.startswith(("http", "www.")) or "." in arg]
cmd_fetch(urls)
else:
print(f"❌ Unknown command: {cmd}")
print(" Run 'x-reader' with no args for help")
if __name__ == "__main__":
main()

View file

View file

@ -0,0 +1,46 @@
# -*- coding: utf-8 -*-
"""Bilibili video fetcher — uses official web API."""
import re
import requests
from loguru import logger
from typing import Dict, Any
API_URL = "https://api.bilibili.com/x/web-interface/view"
HEADERS = {
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36"
}
async def fetch_bilibili(url_or_bv: str) -> Dict[str, Any]:
"""Fetch Bilibili video metadata via official API."""
logger.info(f"Fetching Bilibili: {url_or_bv}")
bv_id = url_or_bv
if "bilibili.com" in url_or_bv or "b23.tv" in url_or_bv:
match = re.search(r'BV\w+', url_or_bv)
if match:
bv_id = match.group()
else:
raise ValueError(f"Cannot extract BV ID from: {url_or_bv}")
resp = requests.get(API_URL, params={"bvid": bv_id}, headers=HEADERS, timeout=10)
resp.raise_for_status()
data = resp.json()
if data.get("code") != 0:
raise ValueError(f"Bilibili API error: {data.get('message')}")
video = data["data"]
return {
"title": video.get("title", ""),
"description": video.get("desc", ""),
"author": video.get("owner", {}).get("name", ""),
"url": f"https://www.bilibili.com/video/{bv_id}",
"cover": video.get("pic", ""),
"bvid": bv_id,
"duration": video.get("duration", 0),
"view_count": video.get("stat", {}).get("view", 0),
"platform": "bilibili",
}

View file

@ -0,0 +1,88 @@
# -*- coding: utf-8 -*-
"""
Playwright browser fetcher headless Chromium fallback for anti-scraping sites.
Used when Jina Reader fails (403/451/timeout). Supports persistent login
sessions via Playwright's storage_state for platforms requiring authentication.
Install: pip install "x-reader[browser]" && playwright install chromium
"""
from loguru import logger
from pathlib import Path
SESSION_DIR = Path.home() / ".x-reader" / "sessions"
TIMEOUT_MS = 30_000
async def fetch_via_browser(url: str, storage_state: str = None) -> dict:
"""
Fetch a URL using headless Chromium via Playwright.
Args:
url: Target URL to fetch.
storage_state: Path to a Playwright storage state JSON file (cookies/localStorage).
If provided, the browser context will load this session.
Returns:
dict with keys: title, content, url, author
"""
try:
from playwright.async_api import async_playwright
except ImportError:
raise RuntimeError(
"Playwright is not installed. Run:\n"
' pip install "x-reader[browser]"\n'
" playwright install chromium"
)
logger.info(f"Browser fetch: {url}")
async with async_playwright() as p:
browser = await p.chromium.launch(headless=True)
context_kwargs = {}
if storage_state and Path(storage_state).exists():
context_kwargs["storage_state"] = storage_state
logger.info(f"Using session: {storage_state}")
context = await browser.new_context(
user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
"AppleWebKit/537.36 (KHTML, like Gecko) "
"Chrome/120.0.0.0 Safari/537.36",
**context_kwargs,
)
page = await context.new_page()
try:
await page.goto(url, wait_until="domcontentloaded", timeout=TIMEOUT_MS)
# Extra wait for JS-heavy pages
await page.wait_for_timeout(2000)
title = await page.title()
# Extract main text content, stripping scripts/styles
content = await page.evaluate("""() => {
const el = document.querySelector('article')
|| document.querySelector('main')
|| document.querySelector('.content')
|| document.body;
return el ? el.innerText : '';
}""")
result = {
"title": (title or "").strip()[:200],
"content": (content or "").strip(),
"url": url,
"author": "",
}
logger.info(f"Browser fetch OK: {title[:60]}")
return result
finally:
await context.close()
await browser.close()
def get_session_path(platform: str) -> str:
"""Get the session file path for a platform."""
return str(SESSION_DIR / f"{platform}.json")

63
x_reader/fetchers/jina.py Normal file
View file

@ -0,0 +1,63 @@
# -*- coding: utf-8 -*-
"""
Jina Reader universal fallback for content extraction.
Uses https://r.jina.ai/{url} to extract markdown from any web page.
Free, no API key required, handles JS rendering and anti-scraping.
"""
import requests
from loguru import logger
JINA_BASE = "https://r.jina.ai"
TIMEOUT = 30
HEADERS = {
"Accept": "text/markdown",
"User-Agent": "x-reader/0.1",
}
def fetch_via_jina(url: str) -> dict:
"""
Fetch any URL via Jina Reader and return structured data.
Returns:
dict with keys: title, content, url, author (best-effort)
"""
jina_url = f"{JINA_BASE}/{url}"
logger.info(f"Jina fetch: {url}")
try:
resp = requests.get(jina_url, headers=HEADERS, timeout=TIMEOUT)
resp.raise_for_status()
text = resp.text
# Jina returns markdown; first line is usually the title
lines = text.strip().split("\n")
title = ""
content_lines = []
for line in lines:
if not title and line.strip():
# First non-empty line as title, strip markdown heading
title = line.lstrip("#").strip()
else:
content_lines.append(line)
content = "\n".join(content_lines).strip()
return {
"title": title[:200],
"content": content,
"url": url,
"author": "",
}
except requests.Timeout:
logger.error(f"Jina timeout: {url}")
raise
except requests.RequestException as e:
logger.error(f"Jina fetch failed: {url}{e}")
raise

47
x_reader/fetchers/rss.py Normal file
View file

@ -0,0 +1,47 @@
# -*- coding: utf-8 -*-
"""RSS feed fetcher — uses feedparser."""
import feedparser
from loguru import logger
from typing import Dict, Any, List
async def fetch_rss(url: str, limit: int = 20) -> List[Dict[str, Any]]:
"""
Fetch and parse an RSS/Atom feed.
Args:
url: RSS feed URL
limit: Max number of entries to return
Returns:
List of article dicts with: title, summary, url, source, published
"""
logger.info(f"Fetching RSS: {url}")
feed = feedparser.parse(url)
if feed.bozo and not feed.entries:
raise ValueError(f"Failed to parse RSS feed: {feed.bozo_exception}")
source_name = feed.feed.get("title", url)
articles = []
for entry in feed.entries[:limit]:
summary = ""
if hasattr(entry, "summary"):
summary = entry.summary
elif hasattr(entry, "content"):
summary = entry.content[0].get("value", "")
articles.append({
"title": entry.get("title", ""),
"summary": summary,
"url": entry.get("link", ""),
"source": source_name,
"published": entry.get("published", ""),
"platform": "rss",
})
logger.info(f"RSS: {len(articles)} articles from {source_name}")
return articles

View file

@ -0,0 +1,71 @@
# -*- coding: utf-8 -*-
"""
Telegram channel fetcher uses Telethon.
Requires: pip install x-reader[telegram]
Requires: TG_API_ID + TG_API_HASH in .env
"""
import os
from datetime import datetime, timedelta, timezone
from loguru import logger
from typing import Dict, Any, List
async def fetch_telegram(
channel: str,
limit: int = 20,
hours: int = 24,
session_path: str = None,
) -> List[Dict[str, Any]]:
"""
Fetch recent messages from a Telegram channel.
Args:
channel: Channel username (e.g. 'predictionmkt')
limit: Max messages per channel
hours: Only fetch messages from the last N hours
session_path: Path to Telethon session file
Returns:
List of message dicts
"""
try:
from telethon import TelegramClient
from telethon.tl.types import Message
except ImportError:
raise ImportError(
"Telethon is required for Telegram fetching. "
"Install with: pip install x-reader[telegram]"
)
api_id = os.getenv("TG_API_ID", "")
api_hash = os.getenv("TG_API_HASH", "")
if not api_id or not api_hash:
raise ValueError("TG_API_ID and TG_API_HASH must be set in .env")
session = session_path or os.getenv("TG_SESSION_PATH", "./tg_session")
cutoff = datetime.now(timezone.utc) - timedelta(hours=hours)
messages = []
async with TelegramClient(session, int(api_id), api_hash) as client:
logger.info(f"Fetching TG channel: {channel}")
entity = await client.get_entity(channel)
async for msg in client.iter_messages(entity, limit=limit):
if not isinstance(msg, Message) or not msg.text:
continue
if msg.date < cutoff:
break
messages.append({
"text": msg.text,
"views": msg.views or 0,
"date": msg.date.isoformat(),
"url": f"https://t.me/{channel}/{msg.id}",
"platform": "telegram",
})
logger.info(f"TG {channel}: {len(messages)} messages")
return messages

View file

@ -0,0 +1,217 @@
# -*- coding: utf-8 -*-
"""
X/Twitter fetcher three-tier fallback:
1. X oEmbed API (fast, reliable for individual tweets, no login needed)
2. Jina Reader (handles non-tweet X pages like profiles)
3. Playwright + saved session (handles login-required content)
Install browser tier: pip install "x-reader[browser]" && playwright install chromium
Save X session: x-reader login twitter
"""
import re
import requests
from loguru import logger
from typing import Dict, Any
from x_reader.fetchers.jina import fetch_via_jina
OEMBED_URL = "https://publish.twitter.com/oembed"
def _extract_author(url: str) -> str:
"""Extract @username from tweet URL."""
match = re.search(r'x\.com/(\w+)/status', url)
return f"@{match.group(1)}" if match else ""
def _is_tweet_url(url: str) -> bool:
"""Check if this is a direct tweet/status URL (vs profile or other X page)."""
return bool(re.search(r'x\.com/\w+/status/\d+', url))
def _fetch_via_oembed(url: str) -> Dict[str, Any]:
"""
Fetch tweet text via X's oEmbed API.
Free, reliable, no auth needed. Works for public tweets.
Note: oEmbed requires twitter.com URLs (not x.com).
"""
# oEmbed API requires twitter.com format
oembed_query_url = url.replace("x.com", "twitter.com")
resp = requests.get(
OEMBED_URL,
params={"url": oembed_query_url, "omit_script": "true"},
timeout=10,
)
resp.raise_for_status()
data = resp.json()
# Strip HTML tags from the embedded HTML to get clean text
html = data.get("html", "")
text = re.sub(r'<[^>]+>', ' ', html)
text = re.sub(r'\s+', ' ', text).strip()
return {
"text": text,
"author": data.get("author_name", ""),
"author_url": data.get("author_url", ""),
"title": text[:100] if text else "",
}
async def _fetch_via_playwright(url: str) -> Dict[str, Any]:
"""
Fetch tweet via Playwright with X-specific DOM selectors.
Uses saved login session if available (~/.x-reader/sessions/twitter.json).
"""
try:
from playwright.async_api import async_playwright
except ImportError:
raise RuntimeError(
"Playwright not installed. Run:\n"
' pip install "x-reader[browser]"\n'
" playwright install chromium"
)
from x_reader.fetchers.browser import get_session_path
from pathlib import Path
session_path = get_session_path("twitter")
has_session = Path(session_path).exists()
if has_session:
logger.info(f"Using saved X session: {session_path}")
async with async_playwright() as p:
browser = await p.chromium.launch(headless=True)
context_kwargs = {}
if has_session:
context_kwargs["storage_state"] = session_path
context = await browser.new_context(
user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
"AppleWebKit/537.36 (KHTML, like Gecko) "
"Chrome/120.0.0.0 Safari/537.36",
**context_kwargs,
)
page = await context.new_page()
try:
await page.goto(url, wait_until="domcontentloaded", timeout=30_000)
# Wait for tweet text to render (X is a SPA, needs JS execution)
try:
await page.wait_for_selector(
'[data-testid="tweetText"]', timeout=10_000
)
except Exception:
pass # May not appear if login required
# Extract tweet content with X-specific selectors
tweet_text = await page.evaluate("""() => {
// Priority 1: tweet text element
const tweetEl = document.querySelector('[data-testid="tweetText"]');
if (tweetEl) return tweetEl.innerText;
// Priority 2: article element (thread view)
const article = document.querySelector('article');
if (article) return article.innerText;
// Priority 3: main content area
const main = document.querySelector('main');
if (main) return main.innerText;
return '';
}""")
title = await page.title()
return {
"text": (tweet_text or "").strip(),
"title": (title or "").strip()[:200],
}
finally:
await context.close()
await browser.close()
async def fetch_twitter(url: str) -> Dict[str, Any]:
"""
Fetch a tweet or X post with three-tier fallback.
Args:
url: Tweet URL (x.com or twitter.com)
Returns:
Dict with: text, author, url, title, platform
"""
url = url.replace("twitter.com", "x.com")
author = _extract_author(url)
# Tier 1: oEmbed API (best for individual tweets)
if _is_tweet_url(url):
try:
logger.info(f"[Twitter] Tier 1 — oEmbed: {url}")
data = _fetch_via_oembed(url)
if data.get("text") and len(data["text"].strip()) > 20:
return {
"text": data["text"],
"author": author or data.get("author", ""),
"url": url,
"title": data.get("title", ""),
"platform": "twitter",
}
logger.warning("[Twitter] oEmbed returned thin content")
except Exception as e:
logger.warning(f"[Twitter] oEmbed failed ({e})")
# Tier 2: Jina Reader (handles profiles, threads, non-tweet pages)
try:
logger.info(f"[Twitter] Tier 2 — Jina: {url}")
data = fetch_via_jina(url)
content = data.get("content", "")
title = data.get("title", "")
jina_ok = (
content
and len(content.strip()) > 100
and "not yet fully loaded" not in content.lower()
and title.lower() not in ("x", "title: x", "")
)
if jina_ok:
return {
"text": content,
"author": author,
"url": url,
"title": title,
"platform": "twitter",
}
logger.warning("[Twitter] Jina returned unusable content")
except Exception as e:
logger.warning(f"[Twitter] Jina failed ({e})")
# Tier 3: Playwright + session with X-specific extraction
try:
logger.info(f"[Twitter] Tier 3 — Playwright: {url}")
data = await _fetch_via_playwright(url)
content = data.get("text", "")
if content and len(content.strip()) > 20:
return {
"text": content,
"author": author,
"url": url,
"title": data.get("title", ""),
"platform": "twitter",
}
logger.warning("[Twitter] Playwright returned empty content")
except RuntimeError:
raise
except Exception as e:
logger.error(f"[Twitter] All methods failed: {e}")
raise RuntimeError(
f"❌ All Twitter fetch methods failed for: {url}\n"
f" Try: x-reader login twitter (to save session for browser fallback)\n"
f" Then retry: x-reader {url}"
)

View file

@ -0,0 +1,62 @@
# -*- coding: utf-8 -*-
"""
WeChat article fetcher two-tier fallback:
1. Jina Reader (fast, no deps)
2. Playwright headless (no login needed for public articles)
"""
from loguru import logger
from typing import Dict, Any
async def fetch_wechat(url: str) -> Dict[str, Any]:
"""
Fetch a WeChat public account article with fallback.
Args:
url: mp.weixin.qq.com article URL
Returns:
Dict with: title, content, author, url, platform
"""
# Tier 1: Jina Reader
try:
logger.info(f"[WeChat] Tier 1 — Jina: {url}")
from x_reader.fetchers.jina import fetch_via_jina
data = fetch_via_jina(url)
if data.get("content"):
return {
"title": data["title"],
"content": data["content"],
"author": data.get("author", ""),
"url": url,
"platform": "wechat",
}
logger.warning("[WeChat] Jina returned empty content, falling back to browser")
except Exception as e:
logger.warning(f"[WeChat] Jina failed ({e}), falling back to browser")
# Tier 2: Playwright headless (no session needed)
try:
logger.info(f"[WeChat] Tier 2 — Playwright headless: {url}")
from x_reader.fetchers.browser import fetch_via_browser
data = await fetch_via_browser(url)
return {
"title": data["title"],
"content": data["content"],
"author": data.get("author", ""),
"url": url,
"platform": "wechat",
}
except RuntimeError:
# Playwright not installed — re-raise with original Jina error context
raise
except Exception as e:
logger.error(f"[WeChat] Browser fetch also failed: {e}")
raise RuntimeError(
f"❌ All WeChat fetch methods failed.\n"
f" Last error: {e}"
)

78
x_reader/fetchers/xhs.py Normal file
View file

@ -0,0 +1,78 @@
# -*- coding: utf-8 -*-
"""
Xiaohongshu (RED) note fetcher three-tier fallback:
1. Jina Reader (fast, no deps)
2. Playwright + saved session (handles 451/403)
3. Error with login instructions
Install browser tier: pip install "x-reader[browser]" && playwright install chromium
"""
from loguru import logger
from typing import Dict, Any
from pathlib import Path
from x_reader.fetchers.jina import fetch_via_jina
async def fetch_xhs(url: str) -> Dict[str, Any]:
"""
Fetch a Xiaohongshu note with three-tier fallback.
Args:
url: xiaohongshu.com or xhslink.com URL
Returns:
Dict with: title, content, author, url, platform
"""
# Tier 1: Jina Reader
try:
logger.info(f"[XHS] Tier 1 — Jina: {url}")
data = fetch_via_jina(url)
if data.get("content"):
return {
"title": data["title"],
"content": data["content"],
"author": data.get("author", ""),
"url": url,
"platform": "xhs",
}
logger.warning("[XHS] Jina returned empty content, falling back to browser")
except Exception as e:
logger.warning(f"[XHS] Jina failed ({e}), falling back to browser")
# Tier 2: Playwright with session
from x_reader.fetchers.browser import get_session_path, SESSION_DIR
session_path = get_session_path("xhs")
if not Path(session_path).exists():
# Tier 3: No session — guide user
raise RuntimeError(
f"❌ XHS blocked Jina and no saved session found.\n"
f" Run: x-reader login xhs\n"
f" Then retry this URL."
)
try:
logger.info(f"[XHS] Tier 2 — Playwright with session: {url}")
from x_reader.fetchers.browser import fetch_via_browser
data = await fetch_via_browser(url, storage_state=session_path)
return {
"title": data["title"],
"content": data["content"],
"author": data.get("author", ""),
"url": url,
"platform": "xhs",
}
except RuntimeError:
# Playwright not installed
raise
except Exception as e:
logger.error(f"[XHS] Browser fetch also failed: {e}")
raise RuntimeError(
f"❌ All XHS fetch methods failed.\n"
f" Last error: {e}\n"
f" Try: x-reader login xhs (to refresh session)"
)

View file

@ -0,0 +1,211 @@
# -*- coding: utf-8 -*-
"""
YouTube video fetcher three-tier content extraction:
1. yt-dlp auto-subtitles (fastest, best quality for subtitled videos)
2. yt-dlp audio download Groq Whisper API transcription (for non-subtitled videos)
3. Jina Reader fallback (page description only)
Requires: yt-dlp installed (brew install yt-dlp / pip install yt-dlp)
Optional: GROQ_API_KEY env var for Whisper transcription
"""
import re
import os
import subprocess
import tempfile
from loguru import logger
from typing import Dict, Any
from x_reader.fetchers.jina import fetch_via_jina
def _extract_video_id(url: str) -> str:
"""Extract video ID from YouTube URL."""
match = re.search(r'(?:v=|youtu\.be/)([a-zA-Z0-9_-]{11})', url)
return match.group(1) if match else ""
def _get_subtitles_via_ytdlp(url: str, lang: str = "en") -> str:
"""
Download auto-generated subtitles using yt-dlp.
Returns subtitle text, or empty string if unavailable.
"""
with tempfile.TemporaryDirectory() as tmpdir:
output_path = os.path.join(tmpdir, "sub")
cmd = [
"yt-dlp",
"--write-auto-sub",
"--write-sub",
"--sub-lang", lang,
"--sub-format", "srt",
"--skip-download",
"-o", output_path,
url,
]
try:
subprocess.run(cmd, capture_output=True, text=True, timeout=60)
except FileNotFoundError:
logger.warning("yt-dlp not found. Install with: brew install yt-dlp")
return ""
except subprocess.TimeoutExpired:
logger.warning("yt-dlp subtitle download timed out")
return ""
for ext in [f".{lang}.srt", f".{lang}.vtt"]:
sub_file = output_path + ext
if os.path.exists(sub_file):
return _parse_srt(sub_file)
return ""
def _parse_srt(filepath: str) -> str:
"""Parse SRT file into clean text (strip timestamps and sequence numbers)."""
with open(filepath, 'r', encoding='utf-8') as f:
lines = f.readlines()
text_lines = []
seen = set()
for line in lines:
line = line.strip()
if not line or line.isdigit() or '-->' in line:
continue
if line.startswith('[') and line.endswith(']'):
continue
if line not in seen:
seen.add(line)
text_lines.append(line)
return " ".join(text_lines)
def _transcribe_via_whisper(url: str) -> str:
"""
Download audio with yt-dlp and transcribe via Groq Whisper API.
Requires: GROQ_API_KEY env var + yt-dlp + ffmpeg installed.
Groq Whisper limit: 25MB audio file.
Returns transcript text, or empty string if unavailable.
"""
api_key = os.getenv("GROQ_API_KEY")
if not api_key:
logger.info("GROQ_API_KEY not set, skipping Whisper transcription")
return ""
with tempfile.TemporaryDirectory() as tmpdir:
output_template = os.path.join(tmpdir, "audio.%(ext)s")
cmd = [
"yt-dlp",
"-x",
"--audio-format", "m4a",
"--audio-quality", "5",
"-o", output_template,
"--no-playlist",
url,
]
try:
subprocess.run(cmd, capture_output=True, text=True, timeout=180)
except FileNotFoundError:
logger.warning("yt-dlp not found for audio download")
return ""
except subprocess.TimeoutExpired:
logger.warning("yt-dlp audio download timed out")
return ""
# Find the downloaded audio file
audio_path = os.path.join(tmpdir, "audio.m4a")
if not os.path.exists(audio_path):
for f in os.listdir(tmpdir):
if f.startswith("audio."):
audio_path = os.path.join(tmpdir, f)
break
else:
logger.warning("No audio file downloaded")
return ""
file_size = os.path.getsize(audio_path)
if file_size > 25 * 1024 * 1024:
logger.warning(f"Audio file too large ({file_size // 1024 // 1024}MB > 25MB limit)")
return ""
logger.info(f"Transcribing {file_size // 1024}KB audio via Groq Whisper...")
import requests
try:
with open(audio_path, "rb") as f:
response = requests.post(
"https://api.groq.com/openai/v1/audio/transcriptions",
headers={"Authorization": f"Bearer {api_key}"},
files={"file": (os.path.basename(audio_path), f, "audio/mp4")},
data={"model": "whisper-large-v3", "response_format": "text"},
timeout=120,
)
if response.status_code == 200:
transcript = response.text.strip()
logger.info(f"Whisper transcript: {len(transcript)} chars")
return transcript
else:
logger.warning(f"Groq Whisper API error: {response.status_code} {response.text[:200]}")
return ""
except Exception as e:
logger.warning(f"Whisper transcription failed: {e}")
return ""
async def fetch_youtube(url: str, sub_lang: str = "en") -> Dict[str, Any]:
"""
Fetch YouTube video content with three-tier extraction.
Strategy:
1. yt-dlp auto-subtitles (full transcript, fastest)
2. yt-dlp audio + Groq Whisper API (for non-subtitled videos)
3. Jina Reader fallback (page description only)
Args:
url: YouTube video URL
sub_lang: Subtitle language code (default: "en")
Returns:
Dict with: title, description, author, url, video_id, has_transcript, platform
"""
logger.info(f"Fetching YouTube: {url}")
video_id = _extract_video_id(url)
# Step 1: Get metadata via Jina (fast, always works)
jina_data = fetch_via_jina(url)
title = jina_data["title"]
# Step 2: Try yt-dlp auto-subtitles
logger.info(f"Extracting subtitles ({sub_lang})...")
transcript = _get_subtitles_via_ytdlp(url, lang=sub_lang)
# Step 3: No subtitles? Try Whisper transcription
if not transcript:
logger.info("No subtitles available, trying Whisper transcription...")
transcript = _transcribe_via_whisper(url)
if transcript:
logger.info(f"Got transcript: {len(transcript)} chars")
content = transcript
has_transcript = True
else:
logger.info("No transcript available, using page description")
content = jina_data["content"]
has_transcript = False
return {
"title": title,
"description": content,
"author": jina_data.get("author", ""),
"url": url,
"video_id": video_id,
"has_transcript": has_transcript,
"platform": "youtube",
}

91
x_reader/login.py Normal file
View file

@ -0,0 +1,91 @@
# -*- coding: utf-8 -*-
"""
Login manager opens a visible browser for manual login, saves session.
Usage:
x-reader login xhs # Login to Xiaohongshu
x-reader login wechat # Login to WeChat (if needed)
Sessions are saved as Playwright storage_state JSON files.
"""
from pathlib import Path
from loguru import logger
SESSION_DIR = Path.home() / ".x-reader" / "sessions"
PLATFORM_URLS = {
"xhs": "https://www.xiaohongshu.com/explore",
"xiaohongshu": "https://www.xiaohongshu.com/explore",
"wechat": "https://mp.weixin.qq.com",
"twitter": "https://x.com/login",
"x": "https://x.com/login",
}
def login(platform: str) -> None:
"""
Open a visible browser for the user to log in manually.
After login, saves cookies/localStorage to a session file.
Args:
platform: Platform key (e.g. 'xhs', 'wechat')
"""
try:
from playwright.sync_api import sync_playwright
except ImportError:
print(
"❌ Playwright is not installed. Run:\n"
' pip install "x-reader[browser]"\n'
" playwright install chromium"
)
return
platform = platform.lower()
login_url = PLATFORM_URLS.get(platform)
if not login_url:
supported = ", ".join(sorted(PLATFORM_URLS.keys()))
print(f"❌ Unknown platform: {platform}")
print(f" Supported: {supported}")
return
SESSION_DIR.mkdir(parents=True, exist_ok=True)
session_path = SESSION_DIR / f"{platform}.json"
# Normalize alias to canonical name
if platform in ("xhs", "xiaohongshu"):
canonical = "xhs"
elif platform in ("twitter", "x"):
canonical = "twitter"
else:
canonical = platform
session_path = SESSION_DIR / f"{canonical}.json"
print(f"🌐 Opening {platform} login page: {login_url}")
print(" Please log in manually in the browser window.")
print(" When done, close the browser or press Ctrl+C.\n")
with sync_playwright() as p:
browser = p.chromium.launch(headless=False)
context = browser.new_context(
user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
"AppleWebKit/537.36 (KHTML, like Gecko) "
"Chrome/120.0.0.0 Safari/537.36",
)
page = context.new_page()
page.goto(login_url)
try:
# Wait for user to log in — blocks until browser is closed
page.wait_for_event("close", timeout=300_000) # 5 min max
except KeyboardInterrupt:
pass
except Exception:
pass # Browser closed by user
# Save session regardless of how we got here
context.storage_state(path=str(session_path))
logger.info(f"Session saved: {session_path}")
print(f"\n✅ Session saved to {session_path}")
context.close()
browser.close()

154
x_reader/reader.py Normal file
View file

@ -0,0 +1,154 @@
# -*- coding: utf-8 -*-
"""
Universal Reader routes any URL to the right fetcher.
The core dispatcher: give it a URL, get back structured content.
"""
import asyncio
from urllib.parse import urlparse
from loguru import logger
from typing import Dict, Any, Optional
from x_reader.schema import (
UnifiedContent, UnifiedInbox, SourceType,
from_bilibili, from_twitter, from_wechat,
from_xiaohongshu, from_youtube, from_rss, from_telegram,
)
from x_reader.fetchers.jina import fetch_via_jina
class UniversalReader:
"""
Routes URLs to platform-specific fetchers.
Falls back to Jina Reader for unknown platforms.
"""
def __init__(self, inbox: Optional[UnifiedInbox] = None):
self.inbox = inbox
def _detect_platform(self, url: str) -> str:
"""Detect platform from URL."""
domain = urlparse(url).netloc.lower()
if "mp.weixin.qq.com" in domain:
return "wechat"
if "x.com" in domain or "twitter.com" in domain:
return "twitter"
if "youtube.com" in domain or "youtu.be" in domain:
return "youtube"
if "xiaohongshu.com" in domain or "xhslink.com" in domain:
return "xhs"
if "bilibili.com" in domain or "b23.tv" in domain:
return "bilibili"
if "xiaoyuzhoufm.com" in domain:
return "podcast"
if "podcasts.apple.com" in domain:
return "podcast"
if "t.me" in domain or "telegram.org" in domain:
return "telegram"
if url.endswith(".xml") or "/rss" in url or "/feed" in url or "/atom" in url:
return "rss"
return "generic"
async def read(self, url: str) -> UnifiedContent:
"""
Fetch content from any URL and return as UnifiedContent.
The main entry point give it a URL, get back structured content.
"""
# Ensure URL has scheme
if not url.startswith(("http://", "https://")):
url = f"https://{url}"
platform = self._detect_platform(url)
logger.info(f"[{platform}] {url[:60]}...")
try:
content = await self._fetch(platform, url)
# Save to inbox if configured
if self.inbox:
if self.inbox.add(content):
self.inbox.save()
logger.info(f"Saved to inbox: {content.title[:50]}")
# Save to markdown output if configured
from x_reader.utils.storage import save_to_markdown
save_to_markdown(content)
return content
except Exception as e:
logger.error(f"[{platform}] Failed: {e}")
raise
async def _fetch(self, platform: str, url: str) -> UnifiedContent:
"""Dispatch to platform-specific fetcher."""
if platform == "bilibili":
from x_reader.fetchers.bilibili import fetch_bilibili
data = await fetch_bilibili(url)
return from_bilibili(data)
if platform == "twitter":
from x_reader.fetchers.twitter import fetch_twitter
data = await fetch_twitter(url)
return from_twitter(data)
if platform == "wechat":
from x_reader.fetchers.wechat import fetch_wechat
data = await fetch_wechat(url)
return from_wechat(data)
if platform == "xhs":
from x_reader.fetchers.xhs import fetch_xhs
data = await fetch_xhs(url)
return from_xiaohongshu(data)
if platform == "youtube":
from x_reader.fetchers.youtube import fetch_youtube
data = await fetch_youtube(url)
return from_youtube(data)
if platform == "rss":
from x_reader.fetchers.rss import fetch_rss
articles = await fetch_rss(url, limit=1)
if articles:
return from_rss(articles[0])
raise ValueError(f"No articles found in RSS feed: {url}")
if platform == "telegram":
from x_reader.fetchers.telegram import fetch_telegram
# Extract channel username from t.me URL
path = urlparse(url).path.strip("/").split("/")[0]
channel = path if path else url
messages = await fetch_telegram(channel, limit=1)
if messages:
return from_telegram(messages[0], channel, channel)
raise ValueError(f"No messages from Telegram channel: {url}")
# Fallback: Jina Reader for any unknown URL
logger.info(f"Using Jina fallback for: {url}")
data = fetch_via_jina(url)
return UnifiedContent(
source_type=SourceType.MANUAL,
source_name=urlparse(url).netloc,
title=data["title"],
content=data["content"],
url=url,
)
async def read_batch(self, urls: list[str]) -> list[UnifiedContent]:
"""Fetch multiple URLs concurrently."""
tasks = [self.read(url) for url in urls]
results = await asyncio.gather(*tasks, return_exceptions=True)
contents = []
for url, result in zip(urls, results):
if isinstance(result, Exception):
logger.error(f"Batch failed for {url}: {result}")
else:
contents.append(result)
return contents

277
x_reader/schema.py Normal file
View file

@ -0,0 +1,277 @@
# -*- coding: utf-8 -*-
"""
Unified content schema for x-reader.
Defines the standard data format for all content sources:
- Telegram channels
- RSS feeds
- Bilibili videos
- Xiaohongshu (RED) notes
- WeChat articles
- X/Twitter posts
- YouTube videos
- Manual input
"""
from dataclasses import dataclass, field, asdict
from datetime import datetime, timedelta
from typing import Optional, List
from enum import Enum
import hashlib
import json
class SourceType(str, Enum):
"""Content source types."""
TELEGRAM = "telegram"
RSS = "rss"
BILIBILI = "bilibili"
XIAOHONGSHU = "xhs"
TWITTER = "twitter"
WECHAT = "wechat"
YOUTUBE = "youtube"
MANUAL = "manual"
class MediaType(str, Enum):
"""Media types."""
TEXT = "text"
VIDEO = "video"
AUDIO = "audio"
IMAGE = "image"
class Priority(str, Enum):
"""Content priority levels."""
HOT = "hot"
QUALITY = "quality"
DEEP = "deep"
NORMAL = "normal"
LOW = "low"
@dataclass
class UnifiedContent:
"""Unified content format across all platforms."""
# === Required ===
source_type: SourceType
source_name: str
title: str
content: str
url: str
# === Auto-generated ===
id: str = ""
fetched_at: str = ""
# === Media ===
media_type: MediaType = MediaType.TEXT
media_url: Optional[str] = None
# === Scoring ===
score: int = 0
priority: Priority = Priority.NORMAL
category: str = ""
tags: List[str] = field(default_factory=list)
# === Processing state ===
processed: bool = False
digest_date: Optional[str] = None
# === Translation ===
title_cn: Optional[str] = None
content_cn: Optional[str] = None
# === Metadata ===
extra: dict = field(default_factory=dict)
def __post_init__(self):
if not self.id:
self.id = hashlib.md5(self.url.encode()).hexdigest()[:12]
if not self.fetched_at:
self.fetched_at = datetime.now().isoformat()
def to_dict(self) -> dict:
d = asdict(self)
d['source_type'] = self.source_type.value
d['media_type'] = self.media_type.value
d['priority'] = self.priority.value
return d
@classmethod
def from_dict(cls, data: dict) -> 'UnifiedContent':
if isinstance(data.get('source_type'), str):
data['source_type'] = SourceType(data['source_type'])
if isinstance(data.get('media_type'), str):
data['media_type'] = MediaType(data['media_type'])
if isinstance(data.get('priority'), str):
data['priority'] = Priority(data['priority'])
known = {f.name for f in cls.__dataclass_fields__.values()}
data = {k: v for k, v in data.items() if k in known}
return cls(**data)
# =============================================================================
# Converters: platform-specific dict → UnifiedContent
# =============================================================================
def from_telegram(msg: dict, channel_name: str, channel_username: str) -> UnifiedContent:
return UnifiedContent(
source_type=SourceType.TELEGRAM,
source_name=channel_name,
title=msg.get('text', '')[:100],
content=msg.get('text', ''),
url=msg.get('url', f"https://t.me/{channel_username}"),
extra={"views": msg.get('views', 0), "channel_username": channel_username},
)
def from_rss(article: dict) -> UnifiedContent:
return UnifiedContent(
source_type=SourceType.RSS,
source_name=article.get('source', ''),
title=article.get('title', ''),
content=article.get('summary', ''),
url=article.get('url', article.get('link', '')),
score=article.get('score', 0),
category=article.get('category', ''),
title_cn=article.get('title_cn'),
content_cn=article.get('summary_cn'),
)
def from_bilibili(video: dict) -> UnifiedContent:
return UnifiedContent(
source_type=SourceType.BILIBILI,
source_name=video.get('author', ''),
title=video.get('title', ''),
content=video.get('description', ''),
url=video.get('url', ''),
media_type=MediaType.VIDEO,
media_url=video.get('cover', ''),
extra={
"bvid": video.get('bvid', ''),
"duration": video.get('duration', 0),
"view_count": video.get('view_count', 0),
},
)
def from_twitter(data: dict) -> UnifiedContent:
return UnifiedContent(
source_type=SourceType.TWITTER,
source_name=data.get('author', ''),
title=data.get('text', '')[:100],
content=data.get('text', ''),
url=data.get('url', ''),
extra={
"likes": data.get('likes', 0),
"retweets": data.get('retweets', 0),
},
)
def from_wechat(article: dict) -> UnifiedContent:
return UnifiedContent(
source_type=SourceType.WECHAT,
source_name=article.get('author', ''),
title=article.get('title', ''),
content=article.get('content', ''),
url=article.get('url', ''),
)
def from_xiaohongshu(note: dict) -> UnifiedContent:
return UnifiedContent(
source_type=SourceType.XIAOHONGSHU,
source_name=note.get('author', ''),
title=note.get('title', ''),
content=note.get('content', ''),
url=note.get('url', ''),
media_type=MediaType.IMAGE if note.get('images') else MediaType.TEXT,
extra={
"likes": note.get('likes', 0),
"collects": note.get('collects', 0),
},
)
def from_youtube(video: dict) -> UnifiedContent:
return UnifiedContent(
source_type=SourceType.YOUTUBE,
source_name=video.get('author', ''),
title=video.get('title', ''),
content=video.get('description', ''),
url=video.get('url', ''),
media_type=MediaType.VIDEO,
extra={
"duration": video.get('duration', ''),
"view_count": video.get('view_count', 0),
},
)
def from_manual(title: str, content: str, url: str = "") -> UnifiedContent:
return UnifiedContent(
source_type=SourceType.MANUAL,
source_name="manual",
title=title,
content=content,
url=url or f"manual://{hashlib.md5(title.encode()).hexdigest()[:8]}",
)
# =============================================================================
# Unified Inbox
# =============================================================================
class UnifiedInbox:
"""JSON-based content inbox with dedup."""
def __init__(self, filepath: str = "unified_inbox.json"):
self.filepath = filepath
self.items: List[UnifiedContent] = []
self.load()
def load(self):
import os
if os.path.exists(self.filepath):
try:
with open(self.filepath, 'r', encoding='utf-8') as f:
data = json.load(f)
self.items = [UnifiedContent.from_dict(d) for d in data]
except (json.JSONDecodeError, IOError):
self.items = []
def save(self):
with open(self.filepath, 'w', encoding='utf-8') as f:
json.dump([item.to_dict() for item in self.items], f,
ensure_ascii=False, indent=2)
def add(self, item: UnifiedContent) -> bool:
if any(i.id == item.id for i in self.items):
return False
self.items.append(item)
return True
def add_batch(self, items: List[UnifiedContent]) -> int:
return sum(1 for item in items if self.add(item))
def get_unprocessed(self) -> List[UnifiedContent]:
return [i for i in self.items if not i.processed]
def get_by_source(self, source_type: SourceType) -> List[UnifiedContent]:
return [i for i in self.items if i.source_type == source_type]
def mark_processed(self, item_id: str, digest_date: str = None):
for item in self.items:
if item.id == item_id:
item.processed = True
if digest_date:
item.digest_date = digest_date
break
def clear_old(self, days: int = 7):
cutoff = (datetime.now() - timedelta(days=days)).isoformat()
self.items = [i for i in self.items if i.fetched_at > cutoff]

View file

88
x_reader/utils/storage.py Normal file
View file

@ -0,0 +1,88 @@
# -*- coding: utf-8 -*-
"""
Storage utilities save content to JSON inbox and optional Markdown file.
Implements the "atomic archiving" from the tweet:
- unified_inbox.json (for AI/programmatic use)
- markdown file (for human reading, e.g. Obsidian)
"""
import json
import os
from datetime import datetime
from pathlib import Path
from loguru import logger
from x_reader.schema import UnifiedContent
def save_to_json(item: UnifiedContent, filepath: str = "unified_inbox.json"):
"""Append content to JSON inbox file."""
path = Path(filepath)
data = []
if path.exists():
try:
with open(path, 'r', encoding='utf-8') as f:
data = json.load(f)
except (json.JSONDecodeError, IOError):
data = []
data.append(item.to_dict())
# Keep last 500 entries to prevent unbounded growth
data = data[-500:]
with open(path, 'w', encoding='utf-8') as f:
json.dump(data, f, ensure_ascii=False, indent=2)
logger.info(f"Saved to JSON: {path}")
def save_to_markdown(item: UnifiedContent, filepath: str = None):
"""
Append content to a Markdown file (e.g. Obsidian vault).
Supports two output modes:
- OUTPUT_DIR: Write to {OUTPUT_DIR}/content_hub.md
- OBSIDIAN_VAULT: Write to {OBSIDIAN_VAULT}/01-收集箱/x-reader-inbox.md
If neither is set, skips markdown output.
"""
if not filepath:
# Priority 1: Obsidian vault
vault_path = os.getenv("OBSIDIAN_VAULT", "")
if vault_path:
filepath = os.path.join(vault_path, "01-收集箱", "x-reader-inbox.md")
else:
# Priority 2: generic output dir
output_dir = os.getenv("OUTPUT_DIR", "")
if not output_dir:
return
filepath = os.path.join(output_dir, "content_hub.md")
path = Path(filepath)
path.parent.mkdir(parents=True, exist_ok=True)
emoji = {
"telegram": "📢", "rss": "📰", "bilibili": "🎬",
"xhs": "📕", "twitter": "🐦", "wechat": "💬",
"youtube": "▶️", "manual": "✏️",
}.get(item.source_type.value, "📄")
with open(path, 'a', encoding='utf-8') as f:
f.write(f"\n## {emoji} {item.title}\n")
f.write(f"- Source: {item.source_name} ({item.source_type.value})\n")
f.write(f"- URL: {item.url}\n")
f.write(f"- Fetched: {item.fetched_at[:16]}\n\n")
f.write(f"{item.content[:2000]}\n")
f.write("\n---\n")
logger.info(f"Saved to Markdown: {path}")
def save_content(item: UnifiedContent, json_path: str = None, md_path: str = None):
"""Save content to both JSON and Markdown."""
inbox_file = json_path or os.getenv("INBOX_FILE", "unified_inbox.json")
save_to_json(item, inbox_file)
save_to_markdown(item, md_path)