Bugs found during fresh pip install testing:
- readers/*.py still referenced agent_eyes.fetchers → fixed to agent_eyes.readers
- reader.py passed 'author'/'metadata' to UnifiedContent, which doesn't have those fields → use the 'extra' field
- RSS URL detection missed domains containing 'rss' (e.g. hnrss.org)
78 lines
2.4 KiB
Python
78 lines
2.4 KiB
Python
# -*- coding: utf-8 -*-
|
|
"""
|
|
Xiaohongshu (RED) note fetcher — three-tier fallback:
|
|
|
|
1. Jina Reader (fast, no deps)
|
|
2. Playwright + saved session (handles 451/403)
|
|
3. Error with login instructions
|
|
|
|
Install browser tier: pip install "x-reader[browser]" && playwright install chromium
|
|
"""
|
|
|
|
from loguru import logger
|
|
from typing import Dict, Any
|
|
from pathlib import Path
|
|
|
|
from agent_eyes.readers.jina import fetch_via_jina
|
|
|
|
|
|
def _xhs_result(data: Dict[str, Any], url: str) -> Dict[str, Any]:
    """Build the dict returned by fetch_xhs from a raw reader payload."""
    return {
        # Title and author are optional: a payload that has content but no
        # title must not be treated as a failed fetch.
        "title": data.get("title", ""),
        # Content stays mandatory; a missing key raises KeyError for the caller.
        "content": data["content"],
        "author": data.get("author", ""),
        "url": url,
        "platform": "xhs",
    }


async def fetch_xhs(url: str) -> Dict[str, Any]:
    """
    Fetch a Xiaohongshu note with three-tier fallback.

    Tier 1 asks Jina Reader for the page. If that raises or returns empty
    content, Tier 2 loads the page through the browser reader using the
    saved "xhs" session. If no saved session file exists, the function
    raises with login instructions instead (Tier 3).

    Args:
        url: xiaohongshu.com or xhslink.com URL

    Returns:
        Dict with: title, content, author, url, platform

    Raises:
        RuntimeError: if Jina did not yield content and no saved session
            exists, or if the browser fetch fails. A RuntimeError raised by
            the browser reader itself is propagated unchanged.
    """
    # Tier 1: Jina Reader
    # NOTE(review): fetch_via_jina is called without await, so it appears to
    # be synchronous and would block the event loop while it runs; confirm
    # and consider offloading it to an executor.
    try:
        logger.info(f"[XHS] Tier 1 — Jina: {url}")
        data = fetch_via_jina(url)
        if data.get("content"):
            return _xhs_result(data, url)
        logger.warning("[XHS] Jina returned empty content, falling back to browser")
    except Exception as e:
        # Deliberate best-effort: any Jina failure falls through to Tier 2.
        logger.warning(f"[XHS] Jina failed ({e}), falling back to browser")

    # Tier 2: Playwright with session
    # Imported lazily so the browser reader is only loaded when Tier 1 fails.
    from agent_eyes.readers.browser import get_session_path

    session_path = get_session_path("xhs")
    if not Path(session_path).exists():
        # Tier 3: No session — guide user
        raise RuntimeError(
            "❌ XHS blocked Jina and no saved session found.\n"
            " Run: x-reader login xhs\n"
            " Then retry this URL."
        )

    try:
        logger.info(f"[XHS] Tier 2 — Playwright with session: {url}")
        from agent_eyes.readers.browser import fetch_via_browser

        data = await fetch_via_browser(url, storage_state=session_path)
        return _xhs_result(data, url)
    except RuntimeError:
        # Playwright not installed (per the original comment) — pass the
        # reader's own message through untouched.
        raise
    except Exception as e:
        logger.error(f"[XHS] Browser fetch also failed: {e}")
        # Chain the cause so the original traceback is preserved.
        raise RuntimeError(
            "❌ All XHS fetch methods failed.\n"
            f" Last error: {e}\n"
            " Try: x-reader login xhs (to refresh session)"
        ) from e
|