Initial: forked from runesleo/x-reader (MIT License) - thank you @runes_leo!

2026-02-24 03:00:05 +01:00 · 2026-02-24 03:00:05 +01:00 · ee2ad83b12
commit ee2ad83b12
25 changed files with 2512 additions and 0 deletions
--- a/x_reader/init.py
+++ b/x_reader/init.py
@ -0,0 +1,3 @@
+"""x-reader: Universal content reader for 7+ platforms."""
+
+__version__ = "0.1.0"
--- a/x_reader/cli.py
+++ b/x_reader/cli.py
@ -0,0 +1,139 @@
+# -*- coding: utf-8 -*-
+"""
+x-reader CLI — fetch content from any platform.
+
+Usage:
+    x-reader <url>                     # Fetch a single URL
+    x-reader <url1> <url2> ...         # Fetch multiple URLs
+    x-reader list                      # Show inbox contents
+    x-reader clear                     # Clear inbox
+"""
+
+import sys
+import asyncio
+import json
+from pathlib import Path
+
+from dotenv import load_dotenv
+load_dotenv()
+
+from x_reader.reader import UniversalReader
+from x_reader.schema import UnifiedInbox, SourceType
+
+
+def get_inbox_path() -> str:
+    import os
+    return os.getenv("INBOX_FILE", "unified_inbox.json")
+
+
+def cmd_fetch(urls: list[str]):
+    """Fetch one or more URLs."""
+    inbox = UnifiedInbox(get_inbox_path())
+    reader = UniversalReader(inbox=inbox)
+
+    async def run():
+        if len(urls) == 1:
+            item = await reader.read(urls[0])
+            print(f"✅ [{item.source_type.value}] {item.title[:60]}")
+            print(f"   {item.url}")
+            print(f"   {item.content[:200]}...")
+        else:
+            items = await reader.read_batch(urls)
+            for item in items:
+                print(f"✅ [{item.source_type.value}] {item.title[:60]}")
+            print(f"\n📦 Fetched {len(items)}/{len(urls)} URLs")
+
+    try:
+        asyncio.run(run())
+    except KeyboardInterrupt:
+        print("\n⏹ Cancelled")
+    except Exception as e:
+        print(f"❌ {e}")
+        sys.exit(1)
+
+
+def cmd_list():
+    """Show inbox contents."""
+    inbox = UnifiedInbox(get_inbox_path())
+    if not inbox.items:
+        print("📦 Inbox is empty")
+        return
+
+    print(f"📦 Inbox: {len(inbox.items)} items\n")
+
+    emoji_map = {
+        SourceType.TELEGRAM: "📢", SourceType.RSS: "📰",
+        SourceType.BILIBILI: "🎬", SourceType.XIAOHONGSHU: "📕",
+        SourceType.TWITTER: "🐦", SourceType.WECHAT: "💬",
+        SourceType.YOUTUBE: "▶️", SourceType.MANUAL: "✏️",
+    }
+
+    for i, item in enumerate(inbox.items[-20:], 1):
+        emoji = emoji_map.get(item.source_type, "📄")
+        print(f"  {i:2d}. {emoji} [{item.source_type.value:8s}] {item.title[:50]}")
+
+
+def cmd_clear():
+    """Clear inbox."""
+    path = Path(get_inbox_path())
+    if path.exists():
+        confirm = input("Clear inbox? (y/N) ")
+        if confirm.lower() == 'y':
+            path.write_text("[]")
+            print("✅ Inbox cleared")
+    else:
+        print("📦 Inbox is already empty")
+
+
+def cmd_login(platform: str):
+    """Open browser for manual login to a platform."""
+    from x_reader.login import login
+    login(platform)
+
+
+def main():
+    if len(sys.argv) < 2:
+        print("""
+📖 x-reader — Universal content reader
+
+Usage:
+    x-reader <url>              Fetch content from any URL
+    x-reader <url1> <url2>      Fetch multiple URLs
+    x-reader login <platform>   Login to a platform (saves session for browser fallback)
+    x-reader list               Show inbox contents
+    x-reader clear              Clear inbox
+
+Supported platforms:
+    WeChat, Telegram, X/Twitter, YouTube,
+    Bilibili, Xiaohongshu, RSS, and any web page
+
+Examples:
+    x-reader https://mp.weixin.qq.com/s/abc123
+    x-reader https://x.com/elonmusk/status/123456
+    x-reader https://www.xiaohongshu.com/explore/abc123
+    x-reader login xhs
+""")
+        return
+
+    cmd = sys.argv[1].lower()
+
+    if cmd == "login":
+        if len(sys.argv) < 3:
+            print("❌ Usage: x-reader login <platform>")
+            print("   Supported: xhs, wechat")
+            sys.exit(1)
+        cmd_login(sys.argv[2])
+    elif cmd == "list":
+        cmd_list()
+    elif cmd == "clear":
+        cmd_clear()
+    elif cmd.startswith("http") or cmd.startswith("www.") or "." in cmd:
+        urls = [arg for arg in sys.argv[1:] if arg.startswith(("http", "www.")) or "." in arg]
+        cmd_fetch(urls)
+    else:
+        print(f"❌ Unknown command: {cmd}")
+        print("   Run 'x-reader' with no args for help")
+
+
+if __name__ == "__main__":
+    main()
--- a/x_reader/fetchers/init.py
+++ b/x_reader/fetchers/init.py
--- a/x_reader/fetchers/bilibili.py
+++ b/x_reader/fetchers/bilibili.py
@ -0,0 +1,46 @@
+# -*- coding: utf-8 -*-
+"""Bilibili video fetcher — uses official web API."""
+
+import re
+import requests
+from loguru import logger
+from typing import Dict, Any
+
+
+API_URL = "https://api.bilibili.com/x/web-interface/view"
+HEADERS = {
+    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36"
+}
+
+
+async def fetch_bilibili(url_or_bv: str) -> Dict[str, Any]:
+    """Fetch Bilibili video metadata via official API."""
+    logger.info(f"Fetching Bilibili: {url_or_bv}")
+
+    bv_id = url_or_bv
+    if "bilibili.com" in url_or_bv or "b23.tv" in url_or_bv:
+        match = re.search(r'BV\w+', url_or_bv)
+        if match:
+            bv_id = match.group()
+        else:
+            raise ValueError(f"Cannot extract BV ID from: {url_or_bv}")
+
+    resp = requests.get(API_URL, params={"bvid": bv_id}, headers=HEADERS, timeout=10)
+    resp.raise_for_status()
+    data = resp.json()
+
+    if data.get("code") != 0:
+        raise ValueError(f"Bilibili API error: {data.get('message')}")
+
+    video = data["data"]
+    return {
+        "title": video.get("title", ""),
+        "description": video.get("desc", ""),
+        "author": video.get("owner", {}).get("name", ""),
+        "url": f"https://www.bilibili.com/video/{bv_id}",
+        "cover": video.get("pic", ""),
+        "bvid": bv_id,
+        "duration": video.get("duration", 0),
+        "view_count": video.get("stat", {}).get("view", 0),
+        "platform": "bilibili",
+    }
--- a/x_reader/fetchers/browser.py
+++ b/x_reader/fetchers/browser.py
@ -0,0 +1,88 @@
+# -*- coding: utf-8 -*-
+"""
+Playwright browser fetcher — headless Chromium fallback for anti-scraping sites.
+
+Used when Jina Reader fails (403/451/timeout). Supports persistent login
+sessions via Playwright's storage_state for platforms requiring authentication.
+
+Install: pip install "x-reader[browser]" && playwright install chromium
+"""
+
+from loguru import logger
+from pathlib import Path
+
+SESSION_DIR = Path.home() / ".x-reader" / "sessions"
+TIMEOUT_MS = 30_000
+
+
+async def fetch_via_browser(url: str, storage_state: str = None) -> dict:
+    """
+    Fetch a URL using headless Chromium via Playwright.
+
+    Args:
+        url: Target URL to fetch.
+        storage_state: Path to a Playwright storage state JSON file (cookies/localStorage).
+                       If provided, the browser context will load this session.
+
+    Returns:
+        dict with keys: title, content, url, author
+    """
+    try:
+        from playwright.async_api import async_playwright
+    except ImportError:
+        raise RuntimeError(
+            "Playwright is not installed. Run:\n"
+            '  pip install "x-reader[browser]"\n'
+            "  playwright install chromium"
+        )
+
+    logger.info(f"Browser fetch: {url}")
+
+    async with async_playwright() as p:
+        browser = await p.chromium.launch(headless=True)
+
+        context_kwargs = {}
+        if storage_state and Path(storage_state).exists():
+            context_kwargs["storage_state"] = storage_state
+            logger.info(f"Using session: {storage_state}")
+
+        context = await browser.new_context(
+            user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
+                       "AppleWebKit/537.36 (KHTML, like Gecko) "
+                       "Chrome/120.0.0.0 Safari/537.36",
+            **context_kwargs,
+        )
+        page = await context.new_page()
+
+        try:
+            await page.goto(url, wait_until="domcontentloaded", timeout=TIMEOUT_MS)
+            # Extra wait for JS-heavy pages
+            await page.wait_for_timeout(2000)
+
+            title = await page.title()
+            # Extract main text content, stripping scripts/styles
+            content = await page.evaluate("""() => {
+                const el = document.querySelector('article')
+                    || document.querySelector('main')
+                    || document.querySelector('.content')
+                    || document.body;
+                return el ? el.innerText : '';
+            }""")
+
+            result = {
+                "title": (title or "").strip()[:200],
+                "content": (content or "").strip(),
+                "url": url,
+                "author": "",
+            }
+            logger.info(f"Browser fetch OK: {title[:60]}")
+            return result
+
+        finally:
+            await context.close()
+            await browser.close()
+
+
+def get_session_path(platform: str) -> str:
+    """Get the session file path for a platform."""
+    return str(SESSION_DIR / f"{platform}.json")
--- a/x_reader/fetchers/jina.py
+++ b/x_reader/fetchers/jina.py
@ -0,0 +1,63 @@
+# -*- coding: utf-8 -*-
+"""
+Jina Reader — universal fallback for content extraction.
+
+Uses https://r.jina.ai/{url} to extract markdown from any web page.
+Free, no API key required, handles JS rendering and anti-scraping.
+"""
+
+import requests
+from loguru import logger
+
+
+JINA_BASE = "https://r.jina.ai"
+TIMEOUT = 30
+
+HEADERS = {
+    "Accept": "text/markdown",
+    "User-Agent": "x-reader/0.1",
+}
+
+
+def fetch_via_jina(url: str) -> dict:
+    """
+    Fetch any URL via Jina Reader and return structured data.
+
+    Returns:
+        dict with keys: title, content, url, author (best-effort)
+    """
+    jina_url = f"{JINA_BASE}/{url}"
+    logger.info(f"Jina fetch: {url}")
+
+    try:
+        resp = requests.get(jina_url, headers=HEADERS, timeout=TIMEOUT)
+        resp.raise_for_status()
+        text = resp.text
+
+        # Jina returns markdown; first line is usually the title
+        lines = text.strip().split("\n")
+        title = ""
+        content_lines = []
+
+        for line in lines:
+            if not title and line.strip():
+                # First non-empty line as title, strip markdown heading
+                title = line.lstrip("#").strip()
+            else:
+                content_lines.append(line)
+
+        content = "\n".join(content_lines).strip()
+
+        return {
+            "title": title[:200],
+            "content": content,
+            "url": url,
+            "author": "",
+        }
+
+    except requests.Timeout:
+        logger.error(f"Jina timeout: {url}")
+        raise
+    except requests.RequestException as e:
+        logger.error(f"Jina fetch failed: {url} — {e}")
+        raise
--- a/x_reader/fetchers/rss.py
+++ b/x_reader/fetchers/rss.py
@ -0,0 +1,47 @@
+# -*- coding: utf-8 -*-
+"""RSS feed fetcher — uses feedparser."""
+
+import feedparser
+from loguru import logger
+from typing import Dict, Any, List
+
+
+async def fetch_rss(url: str, limit: int = 20) -> List[Dict[str, Any]]:
+    """
+    Fetch and parse an RSS/Atom feed.
+
+    Args:
+        url: RSS feed URL
+        limit: Max number of entries to return
+
+    Returns:
+        List of article dicts with: title, summary, url, source, published
+    """
+    logger.info(f"Fetching RSS: {url}")
+
+    feed = feedparser.parse(url)
+
+    if feed.bozo and not feed.entries:
+        raise ValueError(f"Failed to parse RSS feed: {feed.bozo_exception}")
+
+    source_name = feed.feed.get("title", url)
+    articles = []
+
+    for entry in feed.entries[:limit]:
+        summary = ""
+        if hasattr(entry, "summary"):
+            summary = entry.summary
+        elif hasattr(entry, "content"):
+            summary = entry.content[0].get("value", "")
+
+        articles.append({
+            "title": entry.get("title", ""),
+            "summary": summary,
+            "url": entry.get("link", ""),
+            "source": source_name,
+            "published": entry.get("published", ""),
+            "platform": "rss",
+        })
+
+    logger.info(f"RSS: {len(articles)} articles from {source_name}")
+    return articles
--- a/x_reader/fetchers/telegram.py
+++ b/x_reader/fetchers/telegram.py
@ -0,0 +1,71 @@
+# -*- coding: utf-8 -*-
+"""
+Telegram channel fetcher — uses Telethon.
+
+Requires: pip install x-reader[telegram]
+Requires: TG_API_ID + TG_API_HASH in .env
+"""
+
+import os
+from datetime import datetime, timedelta, timezone
+from loguru import logger
+from typing import Dict, Any, List
+
+
+async def fetch_telegram(
+    channel: str,
+    limit: int = 20,
+    hours: int = 24,
+    session_path: str = None,
+) -> List[Dict[str, Any]]:
+    """
+    Fetch recent messages from a Telegram channel.
+
+    Args:
+        channel: Channel username (e.g. 'predictionmkt')
+        limit: Max messages per channel
+        hours: Only fetch messages from the last N hours
+        session_path: Path to Telethon session file
+
+    Returns:
+        List of message dicts
+    """
+    try:
+        from telethon import TelegramClient
+        from telethon.tl.types import Message
+    except ImportError:
+        raise ImportError(
+            "Telethon is required for Telegram fetching. "
+            "Install with: pip install x-reader[telegram]"
+        )
+
+    api_id = os.getenv("TG_API_ID", "")
+    api_hash = os.getenv("TG_API_HASH", "")
+
+    if not api_id or not api_hash:
+        raise ValueError("TG_API_ID and TG_API_HASH must be set in .env")
+
+    session = session_path or os.getenv("TG_SESSION_PATH", "./tg_session")
+    cutoff = datetime.now(timezone.utc) - timedelta(hours=hours)
+
+    messages = []
+    async with TelegramClient(session, int(api_id), api_hash) as client:
+        logger.info(f"Fetching TG channel: {channel}")
+        entity = await client.get_entity(channel)
+
+        async for msg in client.iter_messages(entity, limit=limit):
+            if not isinstance(msg, Message) or not msg.text:
+                continue
+            if msg.date < cutoff:
+                break
+
+            messages.append({
+                "text": msg.text,
+                "views": msg.views or 0,
+                "date": msg.date.isoformat(),
+                "url": f"https://t.me/{channel}/{msg.id}",
+                "platform": "telegram",
+            })
+
+    logger.info(f"TG {channel}: {len(messages)} messages")
+    return messages
--- a/x_reader/fetchers/twitter.py
+++ b/x_reader/fetchers/twitter.py
@ -0,0 +1,217 @@
+# -*- coding: utf-8 -*-
+"""
+X/Twitter fetcher — three-tier fallback:
+
+1. X oEmbed API (fast, reliable for individual tweets, no login needed)
+2. Jina Reader (handles non-tweet X pages like profiles)
+3. Playwright + saved session (handles login-required content)
+
+Install browser tier: pip install "x-reader[browser]" && playwright install chromium
+Save X session:       x-reader login twitter
+"""
+
+import re
+import requests
+from loguru import logger
+from typing import Dict, Any
+
+from x_reader.fetchers.jina import fetch_via_jina
+
+
+OEMBED_URL = "https://publish.twitter.com/oembed"
+
+
+def _extract_author(url: str) -> str:
+    """Extract @username from tweet URL."""
+    match = re.search(r'x\.com/(\w+)/status', url)
+    return f"@{match.group(1)}" if match else ""
+
+
+def _is_tweet_url(url: str) -> bool:
+    """Check if this is a direct tweet/status URL (vs profile or other X page)."""
+    return bool(re.search(r'x\.com/\w+/status/\d+', url))
+
+
+def _fetch_via_oembed(url: str) -> Dict[str, Any]:
+    """
+    Fetch tweet text via X's oEmbed API.
+    Free, reliable, no auth needed. Works for public tweets.
+    Note: oEmbed requires twitter.com URLs (not x.com).
+    """
+    # oEmbed API requires twitter.com format
+    oembed_query_url = url.replace("x.com", "twitter.com")
+    resp = requests.get(
+        OEMBED_URL,
+        params={"url": oembed_query_url, "omit_script": "true"},
+        timeout=10,
+    )
+    resp.raise_for_status()
+    data = resp.json()
+
+    # Strip HTML tags from the embedded HTML to get clean text
+    html = data.get("html", "")
+    text = re.sub(r'<[^>]+>', ' ', html)
+    text = re.sub(r'\s+', ' ', text).strip()
+
+    return {
+        "text": text,
+        "author": data.get("author_name", ""),
+        "author_url": data.get("author_url", ""),
+        "title": text[:100] if text else "",
+    }
+
+
+async def _fetch_via_playwright(url: str) -> Dict[str, Any]:
+    """
+    Fetch tweet via Playwright with X-specific DOM selectors.
+    Uses saved login session if available (~/.x-reader/sessions/twitter.json).
+    """
+    try:
+        from playwright.async_api import async_playwright
+    except ImportError:
+        raise RuntimeError(
+            "Playwright not installed. Run:\n"
+            '  pip install "x-reader[browser]"\n'
+            "  playwright install chromium"
+        )
+
+    from x_reader.fetchers.browser import get_session_path
+    from pathlib import Path
+
+    session_path = get_session_path("twitter")
+    has_session = Path(session_path).exists()
+    if has_session:
+        logger.info(f"Using saved X session: {session_path}")
+
+    async with async_playwright() as p:
+        browser = await p.chromium.launch(headless=True)
+
+        context_kwargs = {}
+        if has_session:
+            context_kwargs["storage_state"] = session_path
+
+        context = await browser.new_context(
+            user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
+                       "AppleWebKit/537.36 (KHTML, like Gecko) "
+                       "Chrome/120.0.0.0 Safari/537.36",
+            **context_kwargs,
+        )
+        page = await context.new_page()
+
+        try:
+            await page.goto(url, wait_until="domcontentloaded", timeout=30_000)
+
+            # Wait for tweet text to render (X is a SPA, needs JS execution)
+            try:
+                await page.wait_for_selector(
+                    '[data-testid="tweetText"]', timeout=10_000
+                )
+            except Exception:
+                pass  # May not appear if login required
+
+            # Extract tweet content with X-specific selectors
+            tweet_text = await page.evaluate("""() => {
+                // Priority 1: tweet text element
+                const tweetEl = document.querySelector('[data-testid="tweetText"]');
+                if (tweetEl) return tweetEl.innerText;
+
+                // Priority 2: article element (thread view)
+                const article = document.querySelector('article');
+                if (article) return article.innerText;
+
+                // Priority 3: main content area
+                const main = document.querySelector('main');
+                if (main) return main.innerText;
+
+                return '';
+            }""")
+
+            title = await page.title()
+
+            return {
+                "text": (tweet_text or "").strip(),
+                "title": (title or "").strip()[:200],
+            }
+        finally:
+            await context.close()
+            await browser.close()
+
+
+async def fetch_twitter(url: str) -> Dict[str, Any]:
+    """
+    Fetch a tweet or X post with three-tier fallback.
+
+    Args:
+        url: Tweet URL (x.com or twitter.com)
+
+    Returns:
+        Dict with: text, author, url, title, platform
+    """
+    url = url.replace("twitter.com", "x.com")
+    author = _extract_author(url)
+
+    # Tier 1: oEmbed API (best for individual tweets)
+    if _is_tweet_url(url):
+        try:
+            logger.info(f"[Twitter] Tier 1 — oEmbed: {url}")
+            data = _fetch_via_oembed(url)
+            if data.get("text") and len(data["text"].strip()) > 20:
+                return {
+                    "text": data["text"],
+                    "author": author or data.get("author", ""),
+                    "url": url,
+                    "title": data.get("title", ""),
+                    "platform": "twitter",
+                }
+            logger.warning("[Twitter] oEmbed returned thin content")
+        except Exception as e:
+            logger.warning(f"[Twitter] oEmbed failed ({e})")
+
+    # Tier 2: Jina Reader (handles profiles, threads, non-tweet pages)
+    try:
+        logger.info(f"[Twitter] Tier 2 — Jina: {url}")
+        data = fetch_via_jina(url)
+        content = data.get("content", "")
+        title = data.get("title", "")
+        jina_ok = (
+            content
+            and len(content.strip()) > 100
+            and "not yet fully loaded" not in content.lower()
+            and title.lower() not in ("x", "title: x", "")
+        )
+        if jina_ok:
+            return {
+                "text": content,
+                "author": author,
+                "url": url,
+                "title": title,
+                "platform": "twitter",
+            }
+        logger.warning("[Twitter] Jina returned unusable content")
+    except Exception as e:
+        logger.warning(f"[Twitter] Jina failed ({e})")
+
+    # Tier 3: Playwright + session with X-specific extraction
+    try:
+        logger.info(f"[Twitter] Tier 3 — Playwright: {url}")
+        data = await _fetch_via_playwright(url)
+        content = data.get("text", "")
+        if content and len(content.strip()) > 20:
+            return {
+                "text": content,
+                "author": author,
+                "url": url,
+                "title": data.get("title", ""),
+                "platform": "twitter",
+            }
+        logger.warning("[Twitter] Playwright returned empty content")
+    except RuntimeError:
+        raise
+    except Exception as e:
+        logger.error(f"[Twitter] All methods failed: {e}")
+
+    raise RuntimeError(
+        f"❌ All Twitter fetch methods failed for: {url}\n"
+        f"   Try: x-reader login twitter (to save session for browser fallback)\n"
+        f"   Then retry: x-reader {url}"
+    )
--- a/x_reader/fetchers/wechat.py
+++ b/x_reader/fetchers/wechat.py
@ -0,0 +1,62 @@
+# -*- coding: utf-8 -*-
+"""
+WeChat article fetcher — two-tier fallback:
+
+1. Jina Reader (fast, no deps)
+2. Playwright headless (no login needed for public articles)
+"""
+
+from loguru import logger
+from typing import Dict, Any
+
+
+async def fetch_wechat(url: str) -> Dict[str, Any]:
+    """
+    Fetch a WeChat public account article with fallback.
+
+    Args:
+        url: mp.weixin.qq.com article URL
+
+    Returns:
+        Dict with: title, content, author, url, platform
+    """
+    # Tier 1: Jina Reader
+    try:
+        logger.info(f"[WeChat] Tier 1 — Jina: {url}")
+        from x_reader.fetchers.jina import fetch_via_jina
+
+        data = fetch_via_jina(url)
+        if data.get("content"):
+            return {
+                "title": data["title"],
+                "content": data["content"],
+                "author": data.get("author", ""),
+                "url": url,
+                "platform": "wechat",
+            }
+        logger.warning("[WeChat] Jina returned empty content, falling back to browser")
+    except Exception as e:
+        logger.warning(f"[WeChat] Jina failed ({e}), falling back to browser")
+
+    # Tier 2: Playwright headless (no session needed)
+    try:
+        logger.info(f"[WeChat] Tier 2 — Playwright headless: {url}")
+        from x_reader.fetchers.browser import fetch_via_browser
+
+        data = await fetch_via_browser(url)
+        return {
+            "title": data["title"],
+            "content": data["content"],
+            "author": data.get("author", ""),
+            "url": url,
+            "platform": "wechat",
+        }
+    except RuntimeError:
+        # Playwright not installed — re-raise with original Jina error context
+        raise
+    except Exception as e:
+        logger.error(f"[WeChat] Browser fetch also failed: {e}")
+        raise RuntimeError(
+            f"❌ All WeChat fetch methods failed.\n"
+            f"   Last error: {e}"
+        )
--- a/x_reader/fetchers/xhs.py
+++ b/x_reader/fetchers/xhs.py
@ -0,0 +1,78 @@
+# -*- coding: utf-8 -*-
+"""
+Xiaohongshu (RED) note fetcher — three-tier fallback:
+
+1. Jina Reader (fast, no deps)
+2. Playwright + saved session (handles 451/403)
+3. Error with login instructions
+
+Install browser tier: pip install "x-reader[browser]" && playwright install chromium
+"""
+
+from loguru import logger
+from typing import Dict, Any
+from pathlib import Path
+
+from x_reader.fetchers.jina import fetch_via_jina
+
+
+async def fetch_xhs(url: str) -> Dict[str, Any]:
+    """
+    Fetch a Xiaohongshu note with three-tier fallback.
+
+    Args:
+        url: xiaohongshu.com or xhslink.com URL
+
+    Returns:
+        Dict with: title, content, author, url, platform
+    """
+    # Tier 1: Jina Reader
+    try:
+        logger.info(f"[XHS] Tier 1 — Jina: {url}")
+        data = fetch_via_jina(url)
+        if data.get("content"):
+            return {
+                "title": data["title"],
+                "content": data["content"],
+                "author": data.get("author", ""),
+                "url": url,
+                "platform": "xhs",
+            }
+        logger.warning("[XHS] Jina returned empty content, falling back to browser")
+    except Exception as e:
+        logger.warning(f"[XHS] Jina failed ({e}), falling back to browser")
+
+    # Tier 2: Playwright with session
+    from x_reader.fetchers.browser import get_session_path, SESSION_DIR
+
+    session_path = get_session_path("xhs")
+    if not Path(session_path).exists():
+        # Tier 3: No session — guide user
+        raise RuntimeError(
+            f"❌ XHS blocked Jina and no saved session found.\n"
+            f"   Run: x-reader login xhs\n"
+            f"   Then retry this URL."
+        )
+
+    try:
+        logger.info(f"[XHS] Tier 2 — Playwright with session: {url}")
+        from x_reader.fetchers.browser import fetch_via_browser
+
+        data = await fetch_via_browser(url, storage_state=session_path)
+        return {
+            "title": data["title"],
+            "content": data["content"],
+            "author": data.get("author", ""),
+            "url": url,
+            "platform": "xhs",
+        }
+    except RuntimeError:
+        # Playwright not installed
+        raise
+    except Exception as e:
+        logger.error(f"[XHS] Browser fetch also failed: {e}")
+        raise RuntimeError(
+            f"❌ All XHS fetch methods failed.\n"
+            f"   Last error: {e}\n"
+            f"   Try: x-reader login xhs (to refresh session)"
+        )
--- a/x_reader/fetchers/youtube.py
+++ b/x_reader/fetchers/youtube.py
@ -0,0 +1,211 @@
+# -*- coding: utf-8 -*-
+"""
+YouTube video fetcher — three-tier content extraction:
+
+1. yt-dlp auto-subtitles (fastest, best quality for subtitled videos)
+2. yt-dlp audio download → Groq Whisper API transcription (for non-subtitled videos)
+3. Jina Reader fallback (page description only)
+
+Requires: yt-dlp installed (brew install yt-dlp / pip install yt-dlp)
+Optional: GROQ_API_KEY env var for Whisper transcription
+"""
+
+import re
+import os
+import subprocess
+import tempfile
+from loguru import logger
+from typing import Dict, Any
+
+from x_reader.fetchers.jina import fetch_via_jina
+
+
+def _extract_video_id(url: str) -> str:
+    """Extract video ID from YouTube URL."""
+    match = re.search(r'(?:v=|youtu\.be/)([a-zA-Z0-9_-]{11})', url)
+    return match.group(1) if match else ""
+
+
+def _get_subtitles_via_ytdlp(url: str, lang: str = "en") -> str:
+    """
+    Download auto-generated subtitles using yt-dlp.
+    Returns subtitle text, or empty string if unavailable.
+    """
+    with tempfile.TemporaryDirectory() as tmpdir:
+        output_path = os.path.join(tmpdir, "sub")
+
+        cmd = [
+            "yt-dlp",
+            "--write-auto-sub",
+            "--write-sub",
+            "--sub-lang", lang,
+            "--sub-format", "srt",
+            "--skip-download",
+            "-o", output_path,
+            url,
+        ]
+
+        try:
+            subprocess.run(cmd, capture_output=True, text=True, timeout=60)
+        except FileNotFoundError:
+            logger.warning("yt-dlp not found. Install with: brew install yt-dlp")
+            return ""
+        except subprocess.TimeoutExpired:
+            logger.warning("yt-dlp subtitle download timed out")
+            return ""
+
+        for ext in [f".{lang}.srt", f".{lang}.vtt"]:
+            sub_file = output_path + ext
+            if os.path.exists(sub_file):
+                return _parse_srt(sub_file)
+
+    return ""
+
+
+def _parse_srt(filepath: str) -> str:
+    """Parse SRT file into clean text (strip timestamps and sequence numbers)."""
+    with open(filepath, 'r', encoding='utf-8') as f:
+        lines = f.readlines()
+
+    text_lines = []
+    seen = set()
+
+    for line in lines:
+        line = line.strip()
+        if not line or line.isdigit() or '-->' in line:
+            continue
+        if line.startswith('[') and line.endswith(']'):
+            continue
+        if line not in seen:
+            seen.add(line)
+            text_lines.append(line)
+
+    return " ".join(text_lines)
+
+
+def _transcribe_via_whisper(url: str) -> str:
+    """
+    Download audio with yt-dlp and transcribe via Groq Whisper API.
+
+    Requires: GROQ_API_KEY env var + yt-dlp + ffmpeg installed.
+    Groq Whisper limit: 25MB audio file.
+    Returns transcript text, or empty string if unavailable.
+    """
+    api_key = os.getenv("GROQ_API_KEY")
+    if not api_key:
+        logger.info("GROQ_API_KEY not set, skipping Whisper transcription")
+        return ""
+
+    with tempfile.TemporaryDirectory() as tmpdir:
+        output_template = os.path.join(tmpdir, "audio.%(ext)s")
+
+        cmd = [
+            "yt-dlp",
+            "-x",
+            "--audio-format", "m4a",
+            "--audio-quality", "5",
+            "-o", output_template,
+            "--no-playlist",
+            url,
+        ]
+
+        try:
+            subprocess.run(cmd, capture_output=True, text=True, timeout=180)
+        except FileNotFoundError:
+            logger.warning("yt-dlp not found for audio download")
+            return ""
+        except subprocess.TimeoutExpired:
+            logger.warning("yt-dlp audio download timed out")
+            return ""
+
+        # Find the downloaded audio file
+        audio_path = os.path.join(tmpdir, "audio.m4a")
+        if not os.path.exists(audio_path):
+            for f in os.listdir(tmpdir):
+                if f.startswith("audio."):
+                    audio_path = os.path.join(tmpdir, f)
+                    break
+            else:
+                logger.warning("No audio file downloaded")
+                return ""
+
+        file_size = os.path.getsize(audio_path)
+        if file_size > 25 * 1024 * 1024:
+            logger.warning(f"Audio file too large ({file_size // 1024 // 1024}MB > 25MB limit)")
+            return ""
+
+        logger.info(f"Transcribing {file_size // 1024}KB audio via Groq Whisper...")
+
+        import requests
+        try:
+            with open(audio_path, "rb") as f:
+                response = requests.post(
+                    "https://api.groq.com/openai/v1/audio/transcriptions",
+                    headers={"Authorization": f"Bearer {api_key}"},
+                    files={"file": (os.path.basename(audio_path), f, "audio/mp4")},
+                    data={"model": "whisper-large-v3", "response_format": "text"},
+                    timeout=120,
+                )
+
+            if response.status_code == 200:
+                transcript = response.text.strip()
+                logger.info(f"Whisper transcript: {len(transcript)} chars")
+                return transcript
+            else:
+                logger.warning(f"Groq Whisper API error: {response.status_code} {response.text[:200]}")
+                return ""
+        except Exception as e:
+            logger.warning(f"Whisper transcription failed: {e}")
+            return ""
+
+
+async def fetch_youtube(url: str, sub_lang: str = "en") -> Dict[str, Any]:
+    """
+    Fetch YouTube video content with three-tier extraction.
+
+    Strategy:
+    1. yt-dlp auto-subtitles (full transcript, fastest)
+    2. yt-dlp audio + Groq Whisper API (for non-subtitled videos)
+    3. Jina Reader fallback (page description only)
+
+    Args:
+        url: YouTube video URL
+        sub_lang: Subtitle language code (default: "en")
+
+    Returns:
+        Dict with: title, description, author, url, video_id, has_transcript, platform
+    """
+    logger.info(f"Fetching YouTube: {url}")
+    video_id = _extract_video_id(url)
+
+    # Step 1: Get metadata via Jina (fast, always works)
+    jina_data = fetch_via_jina(url)
+    title = jina_data["title"]
+
+    # Step 2: Try yt-dlp auto-subtitles
+    logger.info(f"Extracting subtitles ({sub_lang})...")
+    transcript = _get_subtitles_via_ytdlp(url, lang=sub_lang)
+
+    # Step 3: No subtitles? Try Whisper transcription
+    if not transcript:
+        logger.info("No subtitles available, trying Whisper transcription...")
+        transcript = _transcribe_via_whisper(url)
+
+    if transcript:
+        logger.info(f"Got transcript: {len(transcript)} chars")
+        content = transcript
+        has_transcript = True
+    else:
+        logger.info("No transcript available, using page description")
+        content = jina_data["content"]
+        has_transcript = False
+
+    return {
+        "title": title,
+        "description": content,
+        "author": jina_data.get("author", ""),
+        "url": url,
+        "video_id": video_id,
+        "has_transcript": has_transcript,
+        "platform": "youtube",
+    }
--- a/x_reader/login.py
+++ b/x_reader/login.py
@ -0,0 +1,91 @@
+# -*- coding: utf-8 -*-
+"""
+Login manager — opens a visible browser for manual login, saves session.
+
+Usage:
+    x-reader login xhs       # Login to Xiaohongshu
+    x-reader login wechat     # Login to WeChat (if needed)
+
+Sessions are saved as Playwright storage_state JSON files.
+"""
+
+from pathlib import Path
+from loguru import logger
+
+SESSION_DIR = Path.home() / ".x-reader" / "sessions"
+
+PLATFORM_URLS = {
+    "xhs": "https://www.xiaohongshu.com/explore",
+    "xiaohongshu": "https://www.xiaohongshu.com/explore",
+    "wechat": "https://mp.weixin.qq.com",
+    "twitter": "https://x.com/login",
+    "x": "https://x.com/login",
+}
+
+
+def login(platform: str) -> None:
+    """
+    Open a visible browser for the user to log in manually.
+    After login, saves cookies/localStorage to a session file.
+
+    Args:
+        platform: Platform key (e.g. 'xhs', 'wechat')
+    """
+    try:
+        from playwright.sync_api import sync_playwright
+    except ImportError:
+        print(
+            "❌ Playwright is not installed. Run:\n"
+            '   pip install "x-reader[browser]"\n'
+            "   playwright install chromium"
+        )
+        return
+
+    platform = platform.lower()
+    login_url = PLATFORM_URLS.get(platform)
+    if not login_url:
+        supported = ", ".join(sorted(PLATFORM_URLS.keys()))
+        print(f"❌ Unknown platform: {platform}")
+        print(f"   Supported: {supported}")
+        return
+
+    SESSION_DIR.mkdir(parents=True, exist_ok=True)
+    session_path = SESSION_DIR / f"{platform}.json"
+    # Normalize alias to canonical name
+    if platform in ("xhs", "xiaohongshu"):
+        canonical = "xhs"
+    elif platform in ("twitter", "x"):
+        canonical = "twitter"
+    else:
+        canonical = platform
+    session_path = SESSION_DIR / f"{canonical}.json"
+
+    print(f"🌐 Opening {platform} login page: {login_url}")
+    print("   Please log in manually in the browser window.")
+    print("   When done, close the browser or press Ctrl+C.\n")
+
+    with sync_playwright() as p:
+        browser = p.chromium.launch(headless=False)
+        context = browser.new_context(
+            user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
+                       "AppleWebKit/537.36 (KHTML, like Gecko) "
+                       "Chrome/120.0.0.0 Safari/537.36",
+        )
+        page = context.new_page()
+        page.goto(login_url)
+
+        try:
+            # Wait for user to log in — blocks until browser is closed
+            page.wait_for_event("close", timeout=300_000)  # 5 min max
+        except KeyboardInterrupt:
+            pass
+        except Exception:
+            pass  # Browser closed by user
+
+        # Save session regardless of how we got here
+        context.storage_state(path=str(session_path))
+        logger.info(f"Session saved: {session_path}")
+        print(f"\n✅ Session saved to {session_path}")
+
+        context.close()
+        browser.close()
--- a/x_reader/reader.py
+++ b/x_reader/reader.py
@ -0,0 +1,154 @@
+# -*- coding: utf-8 -*-
+"""
+Universal Reader — routes any URL to the right fetcher.
+
+The core dispatcher: give it a URL, get back structured content.
+"""
+
+import asyncio
+from urllib.parse import urlparse
+from loguru import logger
+from typing import Dict, Any, Optional
+
+from x_reader.schema import (
+    UnifiedContent, UnifiedInbox, SourceType,
+    from_bilibili, from_twitter, from_wechat,
+    from_xiaohongshu, from_youtube, from_rss, from_telegram,
+)
+from x_reader.fetchers.jina import fetch_via_jina
+
+
+class UniversalReader:
+    """
+    Routes URLs to platform-specific fetchers.
+    Falls back to Jina Reader for unknown platforms.
+    """
+
+    def __init__(self, inbox: Optional[UnifiedInbox] = None):
+        self.inbox = inbox
+
+    def _detect_platform(self, url: str) -> str:
+        """Detect platform from URL."""
+        domain = urlparse(url).netloc.lower()
+
+        if "mp.weixin.qq.com" in domain:
+            return "wechat"
+        if "x.com" in domain or "twitter.com" in domain:
+            return "twitter"
+        if "youtube.com" in domain or "youtu.be" in domain:
+            return "youtube"
+        if "xiaohongshu.com" in domain or "xhslink.com" in domain:
+            return "xhs"
+        if "bilibili.com" in domain or "b23.tv" in domain:
+            return "bilibili"
+        if "xiaoyuzhoufm.com" in domain:
+            return "podcast"
+        if "podcasts.apple.com" in domain:
+            return "podcast"
+        if "t.me" in domain or "telegram.org" in domain:
+            return "telegram"
+        if url.endswith(".xml") or "/rss" in url or "/feed" in url or "/atom" in url:
+            return "rss"
+        return "generic"
+
+    async def read(self, url: str) -> UnifiedContent:
+        """
+        Fetch content from any URL and return as UnifiedContent.
+
+        The main entry point — give it a URL, get back structured content.
+        """
+        # Ensure URL has scheme
+        if not url.startswith(("http://", "https://")):
+            url = f"https://{url}"
+
+        platform = self._detect_platform(url)
+        logger.info(f"[{platform}] {url[:60]}...")
+
+        try:
+            content = await self._fetch(platform, url)
+
+            # Save to inbox if configured
+            if self.inbox:
+                if self.inbox.add(content):
+                    self.inbox.save()
+                    logger.info(f"Saved to inbox: {content.title[:50]}")
+
+            # Save to markdown output if configured
+            from x_reader.utils.storage import save_to_markdown
+            save_to_markdown(content)
+
+            return content
+
+        except Exception as e:
+            logger.error(f"[{platform}] Failed: {e}")
+            raise
+
+    async def _fetch(self, platform: str, url: str) -> UnifiedContent:
+        """Dispatch to platform-specific fetcher."""
+
+        if platform == "bilibili":
+            from x_reader.fetchers.bilibili import fetch_bilibili
+            data = await fetch_bilibili(url)
+            return from_bilibili(data)
+
+        if platform == "twitter":
+            from x_reader.fetchers.twitter import fetch_twitter
+            data = await fetch_twitter(url)
+            return from_twitter(data)
+
+        if platform == "wechat":
+            from x_reader.fetchers.wechat import fetch_wechat
+            data = await fetch_wechat(url)
+            return from_wechat(data)
+
+        if platform == "xhs":
+            from x_reader.fetchers.xhs import fetch_xhs
+            data = await fetch_xhs(url)
+            return from_xiaohongshu(data)
+
+        if platform == "youtube":
+            from x_reader.fetchers.youtube import fetch_youtube
+            data = await fetch_youtube(url)
+            return from_youtube(data)
+
+        if platform == "rss":
+            from x_reader.fetchers.rss import fetch_rss
+            articles = await fetch_rss(url, limit=1)
+            if articles:
+                return from_rss(articles[0])
+            raise ValueError(f"No articles found in RSS feed: {url}")
+
+        if platform == "telegram":
+            from x_reader.fetchers.telegram import fetch_telegram
+            # Extract channel username from t.me URL
+            path = urlparse(url).path.strip("/").split("/")[0]
+            channel = path if path else url
+            messages = await fetch_telegram(channel, limit=1)
+            if messages:
+                return from_telegram(messages[0], channel, channel)
+            raise ValueError(f"No messages from Telegram channel: {url}")
+
+        # Fallback: Jina Reader for any unknown URL
+        logger.info(f"Using Jina fallback for: {url}")
+        data = fetch_via_jina(url)
+        return UnifiedContent(
+            source_type=SourceType.MANUAL,
+            source_name=urlparse(url).netloc,
+            title=data["title"],
+            content=data["content"],
+            url=url,
+        )
+
+    async def read_batch(self, urls: list[str]) -> list[UnifiedContent]:
+        """Fetch multiple URLs concurrently."""
+        tasks = [self.read(url) for url in urls]
+        results = await asyncio.gather(*tasks, return_exceptions=True)
+
+        contents = []
+        for url, result in zip(urls, results):
+            if isinstance(result, Exception):
+                logger.error(f"Batch failed for {url}: {result}")
+            else:
+                contents.append(result)
+
+        return contents
--- a/x_reader/schema.py
+++ b/x_reader/schema.py
@ -0,0 +1,277 @@
+# -*- coding: utf-8 -*-
+"""
+Unified content schema for x-reader.
+
+Defines the standard data format for all content sources:
+- Telegram channels
+- RSS feeds
+- Bilibili videos
+- Xiaohongshu (RED) notes
+- WeChat articles
+- X/Twitter posts
+- YouTube videos
+- Manual input
+"""
+
+from dataclasses import dataclass, field, asdict
+from datetime import datetime, timedelta
+from typing import Optional, List
+from enum import Enum
+import hashlib
+import json
+
+
+class SourceType(str, Enum):
+    """Content source types."""
+    TELEGRAM = "telegram"
+    RSS = "rss"
+    BILIBILI = "bilibili"
+    XIAOHONGSHU = "xhs"
+    TWITTER = "twitter"
+    WECHAT = "wechat"
+    YOUTUBE = "youtube"
+    MANUAL = "manual"
+
+
+class MediaType(str, Enum):
+    """Media types."""
+    TEXT = "text"
+    VIDEO = "video"
+    AUDIO = "audio"
+    IMAGE = "image"
+
+
+class Priority(str, Enum):
+    """Content priority levels."""
+    HOT = "hot"
+    QUALITY = "quality"
+    DEEP = "deep"
+    NORMAL = "normal"
+    LOW = "low"
+
+
+@dataclass
+class UnifiedContent:
+    """Unified content format across all platforms."""
+
+    # === Required ===
+    source_type: SourceType
+    source_name: str
+    title: str
+    content: str
+    url: str
+
+    # === Auto-generated ===
+    id: str = ""
+    fetched_at: str = ""
+
+    # === Media ===
+    media_type: MediaType = MediaType.TEXT
+    media_url: Optional[str] = None
+
+    # === Scoring ===
+    score: int = 0
+    priority: Priority = Priority.NORMAL
+    category: str = ""
+    tags: List[str] = field(default_factory=list)
+
+    # === Processing state ===
+    processed: bool = False
+    digest_date: Optional[str] = None
+
+    # === Translation ===
+    title_cn: Optional[str] = None
+    content_cn: Optional[str] = None
+
+    # === Metadata ===
+    extra: dict = field(default_factory=dict)
+
+    def __post_init__(self):
+        if not self.id:
+            self.id = hashlib.md5(self.url.encode()).hexdigest()[:12]
+        if not self.fetched_at:
+            self.fetched_at = datetime.now().isoformat()
+
+    def to_dict(self) -> dict:
+        d = asdict(self)
+        d['source_type'] = self.source_type.value
+        d['media_type'] = self.media_type.value
+        d['priority'] = self.priority.value
+        return d
+
+    @classmethod
+    def from_dict(cls, data: dict) -> 'UnifiedContent':
+        if isinstance(data.get('source_type'), str):
+            data['source_type'] = SourceType(data['source_type'])
+        if isinstance(data.get('media_type'), str):
+            data['media_type'] = MediaType(data['media_type'])
+        if isinstance(data.get('priority'), str):
+            data['priority'] = Priority(data['priority'])
+        known = {f.name for f in cls.__dataclass_fields__.values()}
+        data = {k: v for k, v in data.items() if k in known}
+        return cls(**data)
+
+
+# =============================================================================
+# Converters: platform-specific dict → UnifiedContent
+# =============================================================================
+
+def from_telegram(msg: dict, channel_name: str, channel_username: str) -> UnifiedContent:
+    return UnifiedContent(
+        source_type=SourceType.TELEGRAM,
+        source_name=channel_name,
+        title=msg.get('text', '')[:100],
+        content=msg.get('text', ''),
+        url=msg.get('url', f"https://t.me/{channel_username}"),
+        extra={"views": msg.get('views', 0), "channel_username": channel_username},
+    )
+
+
+def from_rss(article: dict) -> UnifiedContent:
+    return UnifiedContent(
+        source_type=SourceType.RSS,
+        source_name=article.get('source', ''),
+        title=article.get('title', ''),
+        content=article.get('summary', ''),
+        url=article.get('url', article.get('link', '')),
+        score=article.get('score', 0),
+        category=article.get('category', ''),
+        title_cn=article.get('title_cn'),
+        content_cn=article.get('summary_cn'),
+    )
+
+
+def from_bilibili(video: dict) -> UnifiedContent:
+    return UnifiedContent(
+        source_type=SourceType.BILIBILI,
+        source_name=video.get('author', ''),
+        title=video.get('title', ''),
+        content=video.get('description', ''),
+        url=video.get('url', ''),
+        media_type=MediaType.VIDEO,
+        media_url=video.get('cover', ''),
+        extra={
+            "bvid": video.get('bvid', ''),
+            "duration": video.get('duration', 0),
+            "view_count": video.get('view_count', 0),
+        },
+    )
+
+
+def from_twitter(data: dict) -> UnifiedContent:
+    return UnifiedContent(
+        source_type=SourceType.TWITTER,
+        source_name=data.get('author', ''),
+        title=data.get('text', '')[:100],
+        content=data.get('text', ''),
+        url=data.get('url', ''),
+        extra={
+            "likes": data.get('likes', 0),
+            "retweets": data.get('retweets', 0),
+        },
+    )
+
+
+def from_wechat(article: dict) -> UnifiedContent:
+    return UnifiedContent(
+        source_type=SourceType.WECHAT,
+        source_name=article.get('author', ''),
+        title=article.get('title', ''),
+        content=article.get('content', ''),
+        url=article.get('url', ''),
+    )
+
+
+def from_xiaohongshu(note: dict) -> UnifiedContent:
+    return UnifiedContent(
+        source_type=SourceType.XIAOHONGSHU,
+        source_name=note.get('author', ''),
+        title=note.get('title', ''),
+        content=note.get('content', ''),
+        url=note.get('url', ''),
+        media_type=MediaType.IMAGE if note.get('images') else MediaType.TEXT,
+        extra={
+            "likes": note.get('likes', 0),
+            "collects": note.get('collects', 0),
+        },
+    )
+
+
+def from_youtube(video: dict) -> UnifiedContent:
+    return UnifiedContent(
+        source_type=SourceType.YOUTUBE,
+        source_name=video.get('author', ''),
+        title=video.get('title', ''),
+        content=video.get('description', ''),
+        url=video.get('url', ''),
+        media_type=MediaType.VIDEO,
+        extra={
+            "duration": video.get('duration', ''),
+            "view_count": video.get('view_count', 0),
+        },
+    )
+
+
+def from_manual(title: str, content: str, url: str = "") -> UnifiedContent:
+    return UnifiedContent(
+        source_type=SourceType.MANUAL,
+        source_name="manual",
+        title=title,
+        content=content,
+        url=url or f"manual://{hashlib.md5(title.encode()).hexdigest()[:8]}",
+    )
+
+
+# =============================================================================
+# Unified Inbox
+# =============================================================================
+
+class UnifiedInbox:
+    """JSON-based content inbox with dedup."""
+
+    def __init__(self, filepath: str = "unified_inbox.json"):
+        self.filepath = filepath
+        self.items: List[UnifiedContent] = []
+        self.load()
+
+    def load(self):
+        import os
+        if os.path.exists(self.filepath):
+            try:
+                with open(self.filepath, 'r', encoding='utf-8') as f:
+                    data = json.load(f)
+                    self.items = [UnifiedContent.from_dict(d) for d in data]
+            except (json.JSONDecodeError, IOError):
+                self.items = []
+
+    def save(self):
+        with open(self.filepath, 'w', encoding='utf-8') as f:
+            json.dump([item.to_dict() for item in self.items], f,
+                      ensure_ascii=False, indent=2)
+
+    def add(self, item: UnifiedContent) -> bool:
+        if any(i.id == item.id for i in self.items):
+            return False
+        self.items.append(item)
+        return True
+
+    def add_batch(self, items: List[UnifiedContent]) -> int:
+        return sum(1 for item in items if self.add(item))
+
+    def get_unprocessed(self) -> List[UnifiedContent]:
+        return [i for i in self.items if not i.processed]
+
+    def get_by_source(self, source_type: SourceType) -> List[UnifiedContent]:
+        return [i for i in self.items if i.source_type == source_type]
+
+    def mark_processed(self, item_id: str, digest_date: str = None):
+        for item in self.items:
+            if item.id == item_id:
+                item.processed = True
+                if digest_date:
+                    item.digest_date = digest_date
+                break
+
+    def clear_old(self, days: int = 7):
+        cutoff = (datetime.now() - timedelta(days=days)).isoformat()
+        self.items = [i for i in self.items if i.fetched_at > cutoff]
--- a/x_reader/utils/init.py
+++ b/x_reader/utils/init.py
--- a/x_reader/utils/storage.py
+++ b/x_reader/utils/storage.py
@ -0,0 +1,88 @@
+# -*- coding: utf-8 -*-
+"""
+Storage utilities — save content to JSON inbox and optional Markdown file.
+
+Implements the "atomic archiving" from the tweet:
+- unified_inbox.json (for AI/programmatic use)
+- markdown file (for human reading, e.g. Obsidian)
+"""
+
+import json
+import os
+from datetime import datetime
+from pathlib import Path
+from loguru import logger
+
+from x_reader.schema import UnifiedContent
+
+
+def save_to_json(item: UnifiedContent, filepath: str = "unified_inbox.json"):
+    """Append content to JSON inbox file."""
+    path = Path(filepath)
+    data = []
+
+    if path.exists():
+        try:
+            with open(path, 'r', encoding='utf-8') as f:
+                data = json.load(f)
+        except (json.JSONDecodeError, IOError):
+            data = []
+
+    data.append(item.to_dict())
+
+    # Keep last 500 entries to prevent unbounded growth
+    data = data[-500:]
+
+    with open(path, 'w', encoding='utf-8') as f:
+        json.dump(data, f, ensure_ascii=False, indent=2)
+
+    logger.info(f"Saved to JSON: {path}")
+
+
+def save_to_markdown(item: UnifiedContent, filepath: str = None):
+    """
+    Append content to a Markdown file (e.g. Obsidian vault).
+
+    Supports two output modes:
+    - OUTPUT_DIR: Write to {OUTPUT_DIR}/content_hub.md
+    - OBSIDIAN_VAULT: Write to {OBSIDIAN_VAULT}/01-收集箱/x-reader-inbox.md
+
+    If neither is set, skips markdown output.
+    """
+    if not filepath:
+        # Priority 1: Obsidian vault
+        vault_path = os.getenv("OBSIDIAN_VAULT", "")
+        if vault_path:
+            filepath = os.path.join(vault_path, "01-收集箱", "x-reader-inbox.md")
+        else:
+            # Priority 2: generic output dir
+            output_dir = os.getenv("OUTPUT_DIR", "")
+            if not output_dir:
+                return
+            filepath = os.path.join(output_dir, "content_hub.md")
+
+    path = Path(filepath)
+    path.parent.mkdir(parents=True, exist_ok=True)
+
+    emoji = {
+        "telegram": "📢", "rss": "📰", "bilibili": "🎬",
+        "xhs": "📕", "twitter": "🐦", "wechat": "💬",
+        "youtube": "▶️", "manual": "✏️",
+    }.get(item.source_type.value, "📄")
+
+    with open(path, 'a', encoding='utf-8') as f:
+        f.write(f"\n## {emoji} {item.title}\n")
+        f.write(f"- Source: {item.source_name} ({item.source_type.value})\n")
+        f.write(f"- URL: {item.url}\n")
+        f.write(f"- Fetched: {item.fetched_at[:16]}\n\n")
+        f.write(f"{item.content[:2000]}\n")
+        f.write("\n---\n")
+
+    logger.info(f"Saved to Markdown: {path}")
+
+
+def save_content(item: UnifiedContent, json_path: str = None, md_path: str = None):
+    """Save content to both JSON and Markdown."""
+    inbox_file = json_path or os.getenv("INBOX_FILE", "unified_inbox.json")
+    save_to_json(item, inbox_file)
+    save_to_markdown(item, md_path)