Initial: forked from runesleo/x-reader (MIT License) - thank you @runes_leo!

2026-02-24 03:00:05 +01:00 · 2026-02-24 03:00:05 +01:00 · ee2ad83b12
commit ee2ad83b12
25 changed files with 2512 additions and 0 deletions
--- a/x_reader/reader.py
+++ b/x_reader/reader.py
@ -0,0 +1,154 @@
+# -*- coding: utf-8 -*-
+"""
+Universal Reader — routes any URL to the right fetcher.
+
+The core dispatcher: give it a URL, get back structured content.
+"""
+
+import asyncio
+from urllib.parse import urlparse
+from loguru import logger
+from typing import Dict, Any, Optional
+
+from x_reader.schema import (
+    UnifiedContent, UnifiedInbox, SourceType,
+    from_bilibili, from_twitter, from_wechat,
+    from_xiaohongshu, from_youtube, from_rss, from_telegram,
+)
+from x_reader.fetchers.jina import fetch_via_jina
+
+
+class UniversalReader:
+    """
+    Routes URLs to platform-specific fetchers.
+    Falls back to Jina Reader for unknown platforms.
+    """
+
+    def __init__(self, inbox: Optional[UnifiedInbox] = None):
+        self.inbox = inbox
+
+    def _detect_platform(self, url: str) -> str:
+        """Detect platform from URL."""
+        domain = urlparse(url).netloc.lower()
+
+        if "mp.weixin.qq.com" in domain:
+            return "wechat"
+        if "x.com" in domain or "twitter.com" in domain:
+            return "twitter"
+        if "youtube.com" in domain or "youtu.be" in domain:
+            return "youtube"
+        if "xiaohongshu.com" in domain or "xhslink.com" in domain:
+            return "xhs"
+        if "bilibili.com" in domain or "b23.tv" in domain:
+            return "bilibili"
+        if "xiaoyuzhoufm.com" in domain:
+            return "podcast"
+        if "podcasts.apple.com" in domain:
+            return "podcast"
+        if "t.me" in domain or "telegram.org" in domain:
+            return "telegram"
+        if url.endswith(".xml") or "/rss" in url or "/feed" in url or "/atom" in url:
+            return "rss"
+        return "generic"
+
+    async def read(self, url: str) -> UnifiedContent:
+        """
+        Fetch content from any URL and return as UnifiedContent.
+
+        The main entry point — give it a URL, get back structured content.
+        """
+        # Ensure URL has scheme
+        if not url.startswith(("http://", "https://")):
+            url = f"https://{url}"
+
+        platform = self._detect_platform(url)
+        logger.info(f"[{platform}] {url[:60]}...")
+
+        try:
+            content = await self._fetch(platform, url)
+
+            # Save to inbox if configured
+            if self.inbox:
+                if self.inbox.add(content):
+                    self.inbox.save()
+                    logger.info(f"Saved to inbox: {content.title[:50]}")
+
+            # Save to markdown output if configured
+            from x_reader.utils.storage import save_to_markdown
+            save_to_markdown(content)
+
+            return content
+
+        except Exception as e:
+            logger.error(f"[{platform}] Failed: {e}")
+            raise
+
+    async def _fetch(self, platform: str, url: str) -> UnifiedContent:
+        """Dispatch to platform-specific fetcher."""
+
+        if platform == "bilibili":
+            from x_reader.fetchers.bilibili import fetch_bilibili
+            data = await fetch_bilibili(url)
+            return from_bilibili(data)
+
+        if platform == "twitter":
+            from x_reader.fetchers.twitter import fetch_twitter
+            data = await fetch_twitter(url)
+            return from_twitter(data)
+
+        if platform == "wechat":
+            from x_reader.fetchers.wechat import fetch_wechat
+            data = await fetch_wechat(url)
+            return from_wechat(data)
+
+        if platform == "xhs":
+            from x_reader.fetchers.xhs import fetch_xhs
+            data = await fetch_xhs(url)
+            return from_xiaohongshu(data)
+
+        if platform == "youtube":
+            from x_reader.fetchers.youtube import fetch_youtube
+            data = await fetch_youtube(url)
+            return from_youtube(data)
+
+        if platform == "rss":
+            from x_reader.fetchers.rss import fetch_rss
+            articles = await fetch_rss(url, limit=1)
+            if articles:
+                return from_rss(articles[0])
+            raise ValueError(f"No articles found in RSS feed: {url}")
+
+        if platform == "telegram":
+            from x_reader.fetchers.telegram import fetch_telegram
+            # Extract channel username from t.me URL
+            path = urlparse(url).path.strip("/").split("/")[0]
+            channel = path if path else url
+            messages = await fetch_telegram(channel, limit=1)
+            if messages:
+                return from_telegram(messages[0], channel, channel)
+            raise ValueError(f"No messages from Telegram channel: {url}")
+
+        # Fallback: Jina Reader for any unknown URL
+        logger.info(f"Using Jina fallback for: {url}")
+        data = fetch_via_jina(url)
+        return UnifiedContent(
+            source_type=SourceType.MANUAL,
+            source_name=urlparse(url).netloc,
+            title=data["title"],
+            content=data["content"],
+            url=url,
+        )
+
+    async def read_batch(self, urls: list[str]) -> list[UnifiedContent]:
+        """Fetch multiple URLs concurrently."""
+        tasks = [self.read(url) for url in urls]
+        results = await asyncio.gather(*tasks, return_exceptions=True)
+
+        contents = []
+        for url, result in zip(urls, results):
+            if isinstance(result, Exception):
+                logger.error(f"Batch failed for {url}: {result}")
+            else:
+                contents.append(result)
+
+        return contents