Agent-Reach/agent_eyes/readers/reddit.py
Panniantong 8eab038cb9 v1.0.0 — Agent Eyes: search + read the entire internet
Major restructure from x-reader fork to independent project:

Architecture:
- readers/ — content extraction from 10+ platforms (based on x-reader, MIT)
- search/ — semantic search via Exa, GitHub API, birdx (NEW)
- config.py — configuration management (~/.agent-eyes/config.yaml) (NEW)
- doctor.py — environment health checker (NEW)
- core.py — AgentEyes unified entry point (NEW)
- cli.py — full CLI: read, search, setup, doctor (NEW)
- integrations/mcp_server.py — 8 MCP tools (NEW)
- guides/ — 6 Agent-readable setup guides (NEW)
- integrations/skill/ — OpenClaw Skill package (NEW)

Platforms (zero config):
- Web pages, GitHub, Bilibili, YouTube, RSS, single tweets

Platforms (one free API key):
- Web search, Reddit search, Twitter search (via Exa)

Platforms (optional setup):
- Reddit full reader, Twitter advanced, WeChat, XiaoHongShu

Tests: 34/34 passing

Credits: Built on x-reader by @runes_leo (MIT License)
2026-02-24 04:00:47 +01:00

132 lines
4 KiB
Python

# -*- coding: utf-8 -*-
"""Reddit fetcher — extracts posts and comments via JSON API.
Supports optional proxy via REDDIT_PROXY env var (many IPs are blocked by Reddit).
Example: REDDIT_PROXY=http://user:pass@host:port
"""
import os
import re
import requests
from loguru import logger
from typing import Dict, Any, List, Optional
HEADERS = {
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
"AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36"
}
def _get_proxies() -> Optional[Dict[str, str]]:
"""Get proxy config from env."""
proxy = os.environ.get("REDDIT_PROXY")
if proxy:
return {"http": proxy, "https": proxy}
return None
def _extract_post(post_data: Dict) -> Dict[str, Any]:
"""Extract post info from Reddit JSON."""
data = post_data.get("data", {})
return {
"title": data.get("title", ""),
"author": data.get("author", "[deleted]"),
"selftext": data.get("selftext", ""),
"score": data.get("score", 0),
"num_comments": data.get("num_comments", 0),
"url": f"https://www.reddit.com{data.get('permalink', '')}",
"created_utc": data.get("created_utc", 0),
"subreddit": data.get("subreddit", ""),
}
def _extract_comments(comments_data: Dict, limit: int = 20) -> List[Dict[str, str]]:
"""Extract top-level comments."""
comments = []
children = comments_data.get("data", {}).get("children", [])
for child in children[:limit]:
if child.get("kind") != "t1":
continue
data = child.get("data", {})
comments.append({
"author": data.get("author", "[deleted]"),
"body": data.get("body", ""),
"score": data.get("score", 0),
})
return comments
async def fetch_reddit(url: str) -> Dict[str, Any]:
"""Fetch Reddit post + comments via JSON API."""
logger.info(f"Fetching Reddit: {url}")
# Normalize URL and append .json
clean_url = re.sub(r'\?.*$', '', url.rstrip('/'))
json_url = f"{clean_url}.json"
resp = requests.get(
json_url,
headers=HEADERS,
proxies=_get_proxies(),
timeout=15,
)
resp.raise_for_status()
data = resp.json()
# Reddit returns [post_listing, comments_listing]
if not isinstance(data, list) or len(data) < 2:
raise ValueError(f"Unexpected Reddit response format")
post_listing = data[0].get("data", {}).get("children", [])
if not post_listing:
raise ValueError("No post found")
post = _extract_post(post_listing[0])
comments = _extract_comments(data[1])
# Build readable content
content_parts = [post["selftext"]] if post["selftext"] else []
if comments:
content_parts.append("\n---\n## Top Comments\n")
for c in comments:
content_parts.append(f"**u/{c['author']}** ({c['score']} pts):\n{c['body']}\n")
return {
"title": post["title"],
"content": "\n".join(content_parts),
"author": f"u/{post['author']}",
"url": post["url"],
"subreddit": post["subreddit"],
"score": post["score"],
"num_comments": post["num_comments"],
"platform": "reddit",
}
async def search_reddit(query: str, subreddit: str = None, limit: int = 10) -> List[Dict[str, Any]]:
"""Search Reddit posts."""
logger.info(f"Searching Reddit: {query} (sub={subreddit})")
if subreddit:
search_url = f"https://www.reddit.com/r/{subreddit}/search.json"
params = {"q": query, "restrict_sr": "on", "limit": limit, "sort": "relevance"}
else:
search_url = "https://www.reddit.com/search.json"
params = {"q": query, "limit": limit, "sort": "relevance"}
resp = requests.get(
search_url,
headers=HEADERS,
params=params,
proxies=_get_proxies(),
timeout=15,
)
resp.raise_for_status()
data = resp.json()
results = []
for child in data.get("data", {}).get("children", []):
results.append(_extract_post(child))
return results