Initial: forked from runesleo/x-reader (MIT License) - thank you @runes_leo!
This commit is contained in:
commit
ee2ad83b12
25 changed files with 2512 additions and 0 deletions
154
x_reader/reader.py
Normal file
154
x_reader/reader.py
Normal file
|
|
@ -0,0 +1,154 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Universal Reader — routes any URL to the right fetcher.
|
||||
|
||||
The core dispatcher: give it a URL, get back structured content.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
from urllib.parse import urlparse
|
||||
from loguru import logger
|
||||
from typing import Dict, Any, Optional
|
||||
|
||||
from x_reader.schema import (
|
||||
UnifiedContent, UnifiedInbox, SourceType,
|
||||
from_bilibili, from_twitter, from_wechat,
|
||||
from_xiaohongshu, from_youtube, from_rss, from_telegram,
|
||||
)
|
||||
from x_reader.fetchers.jina import fetch_via_jina
|
||||
|
||||
|
||||
class UniversalReader:
|
||||
"""
|
||||
Routes URLs to platform-specific fetchers.
|
||||
Falls back to Jina Reader for unknown platforms.
|
||||
"""
|
||||
|
||||
def __init__(self, inbox: Optional[UnifiedInbox] = None):
|
||||
self.inbox = inbox
|
||||
|
||||
def _detect_platform(self, url: str) -> str:
|
||||
"""Detect platform from URL."""
|
||||
domain = urlparse(url).netloc.lower()
|
||||
|
||||
if "mp.weixin.qq.com" in domain:
|
||||
return "wechat"
|
||||
if "x.com" in domain or "twitter.com" in domain:
|
||||
return "twitter"
|
||||
if "youtube.com" in domain or "youtu.be" in domain:
|
||||
return "youtube"
|
||||
if "xiaohongshu.com" in domain or "xhslink.com" in domain:
|
||||
return "xhs"
|
||||
if "bilibili.com" in domain or "b23.tv" in domain:
|
||||
return "bilibili"
|
||||
if "xiaoyuzhoufm.com" in domain:
|
||||
return "podcast"
|
||||
if "podcasts.apple.com" in domain:
|
||||
return "podcast"
|
||||
if "t.me" in domain or "telegram.org" in domain:
|
||||
return "telegram"
|
||||
if url.endswith(".xml") or "/rss" in url or "/feed" in url or "/atom" in url:
|
||||
return "rss"
|
||||
return "generic"
|
||||
|
||||
async def read(self, url: str) -> UnifiedContent:
|
||||
"""
|
||||
Fetch content from any URL and return as UnifiedContent.
|
||||
|
||||
The main entry point — give it a URL, get back structured content.
|
||||
"""
|
||||
# Ensure URL has scheme
|
||||
if not url.startswith(("http://", "https://")):
|
||||
url = f"https://{url}"
|
||||
|
||||
platform = self._detect_platform(url)
|
||||
logger.info(f"[{platform}] {url[:60]}...")
|
||||
|
||||
try:
|
||||
content = await self._fetch(platform, url)
|
||||
|
||||
# Save to inbox if configured
|
||||
if self.inbox:
|
||||
if self.inbox.add(content):
|
||||
self.inbox.save()
|
||||
logger.info(f"Saved to inbox: {content.title[:50]}")
|
||||
|
||||
# Save to markdown output if configured
|
||||
from x_reader.utils.storage import save_to_markdown
|
||||
save_to_markdown(content)
|
||||
|
||||
return content
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"[{platform}] Failed: {e}")
|
||||
raise
|
||||
|
||||
async def _fetch(self, platform: str, url: str) -> UnifiedContent:
|
||||
"""Dispatch to platform-specific fetcher."""
|
||||
|
||||
if platform == "bilibili":
|
||||
from x_reader.fetchers.bilibili import fetch_bilibili
|
||||
data = await fetch_bilibili(url)
|
||||
return from_bilibili(data)
|
||||
|
||||
if platform == "twitter":
|
||||
from x_reader.fetchers.twitter import fetch_twitter
|
||||
data = await fetch_twitter(url)
|
||||
return from_twitter(data)
|
||||
|
||||
if platform == "wechat":
|
||||
from x_reader.fetchers.wechat import fetch_wechat
|
||||
data = await fetch_wechat(url)
|
||||
return from_wechat(data)
|
||||
|
||||
if platform == "xhs":
|
||||
from x_reader.fetchers.xhs import fetch_xhs
|
||||
data = await fetch_xhs(url)
|
||||
return from_xiaohongshu(data)
|
||||
|
||||
if platform == "youtube":
|
||||
from x_reader.fetchers.youtube import fetch_youtube
|
||||
data = await fetch_youtube(url)
|
||||
return from_youtube(data)
|
||||
|
||||
if platform == "rss":
|
||||
from x_reader.fetchers.rss import fetch_rss
|
||||
articles = await fetch_rss(url, limit=1)
|
||||
if articles:
|
||||
return from_rss(articles[0])
|
||||
raise ValueError(f"No articles found in RSS feed: {url}")
|
||||
|
||||
if platform == "telegram":
|
||||
from x_reader.fetchers.telegram import fetch_telegram
|
||||
# Extract channel username from t.me URL
|
||||
path = urlparse(url).path.strip("/").split("/")[0]
|
||||
channel = path if path else url
|
||||
messages = await fetch_telegram(channel, limit=1)
|
||||
if messages:
|
||||
return from_telegram(messages[0], channel, channel)
|
||||
raise ValueError(f"No messages from Telegram channel: {url}")
|
||||
|
||||
# Fallback: Jina Reader for any unknown URL
|
||||
logger.info(f"Using Jina fallback for: {url}")
|
||||
data = fetch_via_jina(url)
|
||||
return UnifiedContent(
|
||||
source_type=SourceType.MANUAL,
|
||||
source_name=urlparse(url).netloc,
|
||||
title=data["title"],
|
||||
content=data["content"],
|
||||
url=url,
|
||||
)
|
||||
|
||||
async def read_batch(self, urls: list[str]) -> list[UnifiedContent]:
|
||||
"""Fetch multiple URLs concurrently."""
|
||||
tasks = [self.read(url) for url in urls]
|
||||
results = await asyncio.gather(*tasks, return_exceptions=True)
|
||||
|
||||
contents = []
|
||||
for url, result in zip(urls, results):
|
||||
if isinstance(result, Exception):
|
||||
logger.error(f"Batch failed for {url}: {result}")
|
||||
else:
|
||||
contents.append(result)
|
||||
|
||||
return contents
|
||||
Loading…
Add table
Add a link
Reference in a new issue