154 lines
5.2 KiB
Python
154 lines
5.2 KiB
Python
# -*- coding: utf-8 -*-
|
|
"""
|
|
Universal Reader — routes any URL to the right fetcher.
|
|
|
|
The core dispatcher: give it a URL, get back structured content.
|
|
"""
|
|
|
|
import asyncio
|
|
from urllib.parse import urlparse
|
|
from loguru import logger
|
|
from typing import Dict, Any, Optional
|
|
|
|
from x_reader.schema import (
|
|
UnifiedContent, UnifiedInbox, SourceType,
|
|
from_bilibili, from_twitter, from_wechat,
|
|
from_xiaohongshu, from_youtube, from_rss, from_telegram,
|
|
)
|
|
from x_reader.fetchers.jina import fetch_via_jina
|
|
|
|
|
|
class UniversalReader:
|
|
"""
|
|
Routes URLs to platform-specific fetchers.
|
|
Falls back to Jina Reader for unknown platforms.
|
|
"""
|
|
|
|
def __init__(self, inbox: Optional[UnifiedInbox] = None):
|
|
self.inbox = inbox
|
|
|
|
def _detect_platform(self, url: str) -> str:
|
|
"""Detect platform from URL."""
|
|
domain = urlparse(url).netloc.lower()
|
|
|
|
if "mp.weixin.qq.com" in domain:
|
|
return "wechat"
|
|
if "x.com" in domain or "twitter.com" in domain:
|
|
return "twitter"
|
|
if "youtube.com" in domain or "youtu.be" in domain:
|
|
return "youtube"
|
|
if "xiaohongshu.com" in domain or "xhslink.com" in domain:
|
|
return "xhs"
|
|
if "bilibili.com" in domain or "b23.tv" in domain:
|
|
return "bilibili"
|
|
if "xiaoyuzhoufm.com" in domain:
|
|
return "podcast"
|
|
if "podcasts.apple.com" in domain:
|
|
return "podcast"
|
|
if "t.me" in domain or "telegram.org" in domain:
|
|
return "telegram"
|
|
if url.endswith(".xml") or "/rss" in url or "/feed" in url or "/atom" in url:
|
|
return "rss"
|
|
return "generic"
|
|
|
|
async def read(self, url: str) -> UnifiedContent:
|
|
"""
|
|
Fetch content from any URL and return as UnifiedContent.
|
|
|
|
The main entry point — give it a URL, get back structured content.
|
|
"""
|
|
# Ensure URL has scheme
|
|
if not url.startswith(("http://", "https://")):
|
|
url = f"https://{url}"
|
|
|
|
platform = self._detect_platform(url)
|
|
logger.info(f"[{platform}] {url[:60]}...")
|
|
|
|
try:
|
|
content = await self._fetch(platform, url)
|
|
|
|
# Save to inbox if configured
|
|
if self.inbox:
|
|
if self.inbox.add(content):
|
|
self.inbox.save()
|
|
logger.info(f"Saved to inbox: {content.title[:50]}")
|
|
|
|
# Save to markdown output if configured
|
|
from x_reader.utils.storage import save_to_markdown
|
|
save_to_markdown(content)
|
|
|
|
return content
|
|
|
|
except Exception as e:
|
|
logger.error(f"[{platform}] Failed: {e}")
|
|
raise
|
|
|
|
async def _fetch(self, platform: str, url: str) -> UnifiedContent:
|
|
"""Dispatch to platform-specific fetcher."""
|
|
|
|
if platform == "bilibili":
|
|
from x_reader.fetchers.bilibili import fetch_bilibili
|
|
data = await fetch_bilibili(url)
|
|
return from_bilibili(data)
|
|
|
|
if platform == "twitter":
|
|
from x_reader.fetchers.twitter import fetch_twitter
|
|
data = await fetch_twitter(url)
|
|
return from_twitter(data)
|
|
|
|
if platform == "wechat":
|
|
from x_reader.fetchers.wechat import fetch_wechat
|
|
data = await fetch_wechat(url)
|
|
return from_wechat(data)
|
|
|
|
if platform == "xhs":
|
|
from x_reader.fetchers.xhs import fetch_xhs
|
|
data = await fetch_xhs(url)
|
|
return from_xiaohongshu(data)
|
|
|
|
if platform == "youtube":
|
|
from x_reader.fetchers.youtube import fetch_youtube
|
|
data = await fetch_youtube(url)
|
|
return from_youtube(data)
|
|
|
|
if platform == "rss":
|
|
from x_reader.fetchers.rss import fetch_rss
|
|
articles = await fetch_rss(url, limit=1)
|
|
if articles:
|
|
return from_rss(articles[0])
|
|
raise ValueError(f"No articles found in RSS feed: {url}")
|
|
|
|
if platform == "telegram":
|
|
from x_reader.fetchers.telegram import fetch_telegram
|
|
# Extract channel username from t.me URL
|
|
path = urlparse(url).path.strip("/").split("/")[0]
|
|
channel = path if path else url
|
|
messages = await fetch_telegram(channel, limit=1)
|
|
if messages:
|
|
return from_telegram(messages[0], channel, channel)
|
|
raise ValueError(f"No messages from Telegram channel: {url}")
|
|
|
|
# Fallback: Jina Reader for any unknown URL
|
|
logger.info(f"Using Jina fallback for: {url}")
|
|
data = fetch_via_jina(url)
|
|
return UnifiedContent(
|
|
source_type=SourceType.MANUAL,
|
|
source_name=urlparse(url).netloc,
|
|
title=data["title"],
|
|
content=data["content"],
|
|
url=url,
|
|
)
|
|
|
|
async def read_batch(self, urls: list[str]) -> list[UnifiedContent]:
|
|
"""Fetch multiple URLs concurrently."""
|
|
tasks = [self.read(url) for url in urls]
|
|
results = await asyncio.gather(*tasks, return_exceptions=True)
|
|
|
|
contents = []
|
|
for url, result in zip(urls, results):
|
|
if isinstance(result, Exception):
|
|
logger.error(f"Batch failed for {url}: {result}")
|
|
else:
|
|
contents.append(result)
|
|
|
|
return contents
|