diff --git a/README.md b/README.md index 9578367..5b6273f 100644 --- a/README.md +++ b/README.md @@ -57,15 +57,15 @@ AI Agent 已经能访问互联网——但只是"能上网"而已。 |------|------|:--------:|------| | 🌐 **网页** | 阅读 | 零配置 | 任意 URL → 干净 Markdown([Jina Reader](https://github.com/jina-ai/reader) ⭐9.8K 驱动) | | 🐦 **Twitter/X** | 阅读 · 搜索 | 零配置 / Cookie | 单条推文零配置可读。配置 Cookie 可解锁搜索、时间线、发推([birdx](https://github.com/runesleo/birdx) 驱动) | -| 📕 **小红书** | 阅读 · 搜索 · **发帖 · 评论 · 点赞** | Cookie | 配置 Cookie 即可使用全部功能 | -| 🔍 **全网搜索** | 搜索 | 免费 Key | 一个 Key 搜全网 + Reddit + Twitter([Exa](https://exa.ai) 驱动,免费 1000 次/月) | -| 📦 **GitHub** | 阅读 · 搜索 | 零配置 | 公开仓库直接可用。配置 `gh` CLI 或 Token 后可解锁 Fork、Issue、PR 等完整操作 | -| 📺 **YouTube** | 阅读 | 零配置 | 1800+ 视频网站字幕提取([yt-dlp](https://github.com/yt-dlp/yt-dlp) ⭐148K 驱动) | -| 📺 **B站** | 阅读 | 零配置 / 代理 | 视频信息 + 字幕。本地直接用,服务器配个代理即可 | +| 📕 **小红书** | 阅读 · 搜索 · **发帖 · 评论 · 点赞** | mcporter | 通过 [xiaohongshu-mcp](https://github.com/user/xiaohongshu-mcp) 内部 API,安装即可用 | +| 🔍 **全网搜索** | 搜索 | 自动配置 | 安装时自动配置,免 Key 免费用([Exa](https://exa.ai) via [mcporter](https://github.com/nicepkg/mcporter) 驱动) | +| 📦 **GitHub** | 阅读 · 搜索 | 零配置 | [gh CLI](https://cli.github.com) 驱动,公开仓库直接可用。`gh auth login` 后可解锁 Fork、Issue、PR | +| 📺 **YouTube** | 阅读 · **搜索** | 零配置 | 视频字幕 + 搜索,支持 1800+ 视频网站([yt-dlp](https://github.com/yt-dlp/yt-dlp) ⭐148K 驱动) | +| 📺 **B站** | 阅读 · **搜索** | 零配置 / 代理 | 视频信息 + 字幕 + 搜索。本地直接用,服务器配个代理([yt-dlp](https://github.com/yt-dlp/yt-dlp) 驱动) | | 📡 **RSS** | 阅读 | 零配置 | 任意 RSS/Atom 源([feedparser](https://github.com/kurtmckee/feedparser) ⭐2.3K 驱动) | -| 📖 **Reddit** | 搜索 · 阅读 | 免费 / 代理 | 搜索通过 Exa 免费直接可用。读帖子配个代理即可。配置 OAuth Bot 可解锁发帖 | +| 📖 **Reddit** | 搜索 · 阅读 | 免费 / 代理 | 搜索通过 Exa 免费直接可用。读帖子配个代理即可 | -> **配置难度说明:** 零配置 = 装好即用 · 免费 Key = 30 秒注册 · Cookie = 从浏览器导出 · 代理 = $1/月 +> **配置难度说明:** 零配置 = 装好即用 · 自动配置 = 安装时搞定 · mcporter = 需要 MCP 服务 · Cookie = 从浏览器导出 · 代理 = $1/月 --- @@ -109,13 +109,9 @@ agent-reach install --env=auto 不用的不用配。每一步都可以跳过,直接告诉 Agent 就行。 -### 🔍 搜索 — 免费,30 秒 - -去 [exa.ai](https://exa.ai) 注册拿个免费 Key(1000 次/月),发给 Agent。一个 Key 同时解锁全网搜索 + Reddit 搜索 + Twitter 搜索。 - ### 🍪 Cookie — 免费,2 分钟 -告诉 Agent "帮我配置 Twitter Cookie" 或 "帮我配置小红书",Agent 会引导你从浏览器导入。本地电脑可以一键自动导入。 +告诉 Agent "帮我配置 Twitter Cookie",Agent 会引导你从浏览器导入。本地电脑可以一键自动导入。 ### 🌐 代理 — $1/月,仅服务器需要 @@ -165,9 +161,11 @@ Agent Reach 做的事情很简单:**帮你把这些选型和配置的活儿做 |------|------|-----------| | 读网页 | [Jina Reader](https://github.com/jina-ai/reader) | 9.8K Star,免费,不需要 API Key | | 读推特 | [birdx](https://github.com/runesleo/birdx) | Cookie 登录,不用花 $100/月买官方 API | -| 提字幕 | [yt-dlp](https://github.com/yt-dlp/yt-dlp) | 148K Star,支持 1800+ 视频网站 | -| 搜全网 | [Exa](https://exa.ai) | AI 语义搜索,免费 1000 次/月 | +| 视频字幕 + 搜索 | [yt-dlp](https://github.com/yt-dlp/yt-dlp) | 148K Star,YouTube + B站 + 1800 站通吃 | +| 搜全网 | [Exa](https://exa.ai) via [mcporter](https://github.com/nicepkg/mcporter) | AI 语义搜索,MCP 接入免 Key | +| GitHub | [gh CLI](https://cli.github.com) | 官方工具,认证后完整 API 能力 | | 读 RSS | [feedparser](https://github.com/kurtmckee/feedparser) | Python 生态标准选择,2.3K Star | +| 小红书 | [xiaohongshu-mcp](https://github.com/user/xiaohongshu-mcp) | 内部 API,不受反爬限制 | 每个平台一个文件,每个文件 ~50 行代码。后端工具随时可以换——哪天出了更好的工具,改一个文件就行,其他不用动。 diff --git a/agent_reach/channels/bilibili.py b/agent_reach/channels/bilibili.py index 959e4c6..7eaa4b5 100644 --- a/agent_reach/channels/bilibili.py +++ b/agent_reach/channels/bilibili.py @@ -1,121 +1,151 @@ # -*- coding: utf-8 -*- -"""Bilibili — via public API (free, no config needed). +"""Bilibili — via yt-dlp (same backend as YouTube). -Backend: Bilibili public API -Swap to: any Bilibili access method +Backend: yt-dlp (https://github.com/yt-dlp/yt-dlp) +yt-dlp natively supports Bilibili — video info, subtitles, and search. """ -import requests -from urllib.parse import urlparse, parse_qs -from .base import Channel, ReadResult +import json +import shutil +import subprocess +from urllib.parse import urlparse +from .base import Channel, ReadResult, SearchResult +from typing import List class BilibiliChannel(Channel): name = "bilibili" description = "B站视频信息和字幕" - backends = ["Bilibili API"] + backends = ["yt-dlp"] + requires_tools = ["yt-dlp"] tier = 0 def can_handle(self, url: str) -> bool: - domain = urlparse(url).netloc.lower() - return "bilibili.com" in domain or "b23.tv" in domain + d = urlparse(url).netloc.lower() + return "bilibili.com" in d or "b23.tv" in d def check(self, config=None): + if not shutil.which("yt-dlp"): + return "off", "yt-dlp 未安装。安装:pip install yt-dlp" proxy = config.get("bilibili_proxy") if config else None if proxy: return "ok", "已配置代理,完整可用" - # Detect if we're on a server (same logic as cli._detect_environment) import os - indicators = [ - os.path.exists("/var/run/docker.sock"), - os.path.exists("/etc/cloud"), - "SSH_CONNECTION" in os.environ, - "container" in os.environ.get("container", ""), - ] - is_server = any(indicators) + is_server = bool(os.environ.get("SSH_CONNECTION") or os.path.exists("/etc/cloud")) if is_server: return "warn", "服务器 IP 可能被封,配置代理即可解决:agent-reach configure proxy URL" return "ok", "本地直连可用" async def read(self, url: str, config=None) -> ReadResult: - # Proxy support (Bilibili blocks server IPs) + if not shutil.which("yt-dlp"): + raise RuntimeError("yt-dlp not installed. Install: pip install yt-dlp") + proxy = config.get("bilibili_proxy") if config else None - proxies = {"http": proxy, "https": proxy} if proxy else None - # Extract BV id from URL - path = urlparse(url).path - bv_id = "" - for part in path.split("/"): - if part.startswith("BV"): - bv_id = part - break - - if not bv_id: - # Fallback to Jina Reader - from agent_reach.channels.web import WebChannel - return await WebChannel().read(url, config) - - # Get video info - resp = requests.get( - "https://api.bilibili.com/x/web-interface/view", - params={"bvid": bv_id}, - headers={"User-Agent": "Mozilla/5.0"}, - proxies=proxies, - timeout=15, - ) - resp.raise_for_status() - api_data = resp.json() - - # Check for API errors (IP blocked, video not found, etc.) - if api_data.get("code") != 0: - msg = api_data.get("message", "Unknown error") - # Bilibili returns -404 when server IP is blocked - if api_data.get("code") in (-404, -403, -412): - return ReadResult( - title=f"Bilibili: {bv_id}", - content=f"⚠️ Bilibili blocked this request ({msg}). " - f"This usually means the server IP is blocked. " - f"Try: agent-reach configure proxy http://user:pass@ip:port", - url=url, - platform="bilibili", - ) + # Get video info via yt-dlp + info = self._get_info(url, proxy) + if not info: return ReadResult( - title=f"Bilibili: {bv_id}", - content=f"Bilibili API error: {msg} (code: {api_data.get('code')})", - url=url, - platform="bilibili", + title="Bilibili", + content=f"⚠️ 无法获取视频信息: {url}\n服务器 IP 可能被封,配个代理:agent-reach configure proxy URL", + url=url, platform="bilibili", ) - data = api_data.get("data", {}) - - title = data.get("title", "") - desc = data.get("desc", "") - author = data.get("owner", {}).get("name", "") - - # Try to get subtitles - subtitle_text = "" - subtitle_list = data.get("subtitle", {}).get("list", []) - if subtitle_list: - sub_url = subtitle_list[0].get("subtitle_url", "") - if sub_url: - if sub_url.startswith("//"): - sub_url = "https:" + sub_url - sr = requests.get(sub_url, timeout=10) - if sr.ok: - sub_data = sr.json() - lines = [item.get("content", "") for item in sub_data.get("body", [])] - subtitle_text = "\n".join(lines) + title = info.get("title", url) + author = info.get("uploader", "") + desc = info.get("description", "") + # Try subtitles + subtitle = self._get_subtitles(url, proxy) content = desc - if subtitle_text: - content += f"\n\n## Transcript\n{subtitle_text}" + if subtitle: + content += f"\n\n## 字幕\n{subtitle}" return ReadResult( - title=title, - content=content, - url=url, - author=author, - platform="bilibili", - extra={"view": data.get("stat", {}).get("view", 0), - "like": data.get("stat", {}).get("like", 0)}, + title=title, content=content, url=url, + author=author, platform="bilibili", + extra={ + "view_count": info.get("view_count"), + "like_count": info.get("like_count"), + "duration": info.get("duration_string"), + }, ) + + async def search(self, query: str, config=None, **kwargs) -> List[SearchResult]: + """Search Bilibili via yt-dlp's bilisearch.""" + if not shutil.which("yt-dlp"): + raise RuntimeError("yt-dlp not installed. Install: pip install yt-dlp") + + limit = kwargs.get("limit", 10) + proxy = config.get("bilibili_proxy") if config else None + + cmd = [ + "yt-dlp", "--dump-json", "--flat-playlist", + f"bilisearch{limit}:{query}", + ] + if proxy: + cmd += ["--proxy", proxy] + + try: + r = subprocess.run(cmd, capture_output=True, text=True, timeout=30) + results = [] + for line in r.stdout.strip().split("\n"): + if not line.strip(): + continue + try: + d = json.loads(line) + results.append(SearchResult( + title=d.get("title", ""), + url=f"https://www.bilibili.com/video/{d.get('id', '')}", + snippet=f"👤 {d.get('uploader', '?')} · 👁 {d.get('view_count', '?')}", + extra={ + "view_count": d.get("view_count"), + "uploader": d.get("uploader"), + }, + )) + except json.JSONDecodeError: + continue + return results + except subprocess.TimeoutExpired: + return [] + + def _get_info(self, url: str, proxy: str = None) -> dict: + cmd = ["yt-dlp", "--dump-json", "--no-download", url] + if proxy: + cmd += ["--proxy", proxy] + try: + r = subprocess.run(cmd, capture_output=True, text=True, timeout=30) + if r.returncode == 0: + return json.loads(r.stdout) + except (subprocess.TimeoutExpired, json.JSONDecodeError): + pass + return {} + + def _get_subtitles(self, url: str, proxy: str = None) -> str: + import tempfile + from pathlib import Path + + with tempfile.TemporaryDirectory() as tmpdir: + cmd = [ + "yt-dlp", "--write-sub", "--write-auto-sub", + "--sub-lang", "zh-Hans,zh,en", + "--skip-download", "--sub-format", "vtt", + "-o", f"{tmpdir}/%(id)s.%(ext)s", url, + ] + if proxy: + cmd += ["--proxy", proxy] + try: + subprocess.run(cmd, capture_output=True, text=True, timeout=30) + for f in Path(tmpdir).glob("*.vtt"): + text = f.read_text(errors="replace") + lines = [] + for line in text.split("\n"): + line = line.strip() + if not line or line.startswith("WEBVTT") or "-->" in line or line.isdigit(): + continue + if line not in lines[-1:]: + lines.append(line) + return "\n".join(lines) + except subprocess.TimeoutExpired: + pass + return "" diff --git a/agent_reach/channels/github.py b/agent_reach/channels/github.py index 0fac33f..bb4a937 100644 --- a/agent_reach/channels/github.py +++ b/agent_reach/channels/github.py @@ -1,11 +1,13 @@ # -*- coding: utf-8 -*- -"""GitHub — via GitHub REST API (free, no config needed). +"""GitHub — via gh CLI. -Backend: GitHub API v3 -Swap to: gh CLI, or any GitHub API wrapper +Backend: gh CLI (https://cli.github.com) +Swap to: GitHub REST API """ -import requests +import json +import shutil +import subprocess from urllib.parse import urlparse from .base import Channel, ReadResult, SearchResult from typing import List @@ -14,115 +16,124 @@ from typing import List class GitHubChannel(Channel): name = "github" description = "GitHub 仓库和代码" - backends = ["GitHub API"] + backends = ["gh CLI"] tier = 0 - API = "https://api.github.com" + def _gh(self, args: list, timeout: int = 15) -> str: + r = subprocess.run( + ["gh"] + args, + capture_output=True, text=True, timeout=timeout, + ) + if r.returncode != 0: + raise RuntimeError(r.stderr or r.stdout) + return r.stdout - def _headers(self, config=None): - h = {"Accept": "application/vnd.github+json"} - token = config.get("github_token") if config else None - if token: - h["Authorization"] = f"Bearer {token}" - return h - - def check(self, config=None): - import shutil - token = config.get("github_token") if config else None - has_gh = shutil.which("gh") - if token or has_gh: - return "ok", "完整可用(读取、搜索、Fork、Issue、PR 等)" - return "ok", "公开仓库可读可搜。配置 gh CLI 或 github_token 可解锁 Fork、Issue、PR 等操作" + def _gh_json(self, args: list, timeout: int = 15) -> dict: + return json.loads(self._gh(args + ["--json"], timeout)) def can_handle(self, url: str) -> bool: - domain = urlparse(url).netloc.lower() - return "github.com" in domain + return "github.com" in urlparse(url).netloc.lower() + + def check(self, config=None): + if not shutil.which("gh"): + return "warn", "gh CLI 未安装。安装:https://cli.github.com 。公开仓库仍可通过 Jina Reader 读取" + try: + self._gh(["auth", "status"], timeout=5) + return "ok", "完整可用(读取、搜索、Fork、Issue、PR 等)" + except Exception: + return "ok", "gh CLI 已装但未认证。运行 gh auth login 可解锁完整功能" async def read(self, url: str, config=None) -> ReadResult: - path = urlparse(url).path.strip("/").split("/") + if not shutil.which("gh"): + # Fallback to Jina Reader for public repos + from agent_reach.channels.web import WebChannel + return await WebChannel().read(url, config) + path = urlparse(url).path.strip("/").split("/") if len(path) < 2: - raise ValueError(f"Invalid GitHub URL: {url}") + from agent_reach.channels.web import WebChannel + return await WebChannel().read(url, config) owner, repo = path[0], path[1] - headers = self._headers(config) - # Issues/PRs + # Issues / PRs if len(path) >= 4 and path[2] in ("issues", "pull"): - num = path[3] - resp = requests.get(f"{self.API}/repos/{owner}/{repo}/issues/{num}", headers=headers, timeout=15) - resp.raise_for_status() - data = resp.json() - - # Get comments - comments_text = "" - if data.get("comments", 0) > 0: - cr = requests.get(f"{self.API}/repos/{owner}/{repo}/issues/{num}/comments", - headers=headers, params={"per_page": 20}, timeout=15) - if cr.ok: - for c in cr.json(): - comments_text += f"\n\n---\n**{c.get('user', {}).get('login', '')}** ({c.get('created_at', '')}):\n{c.get('body', '')}" - - return ReadResult( - title=data.get("title", ""), - content=(data.get("body", "") or "") + comments_text, - url=url, - author=data.get("user", {}).get("login", ""), - date=data.get("created_at", ""), - platform="github", - extra={"state": data.get("state"), "comments": data.get("comments", 0), - "reactions": data.get("reactions", {}).get("total_count", 0)}, - ) + return await self._read_issue(owner, repo, path[3], url) # Repo - resp = requests.get(f"{self.API}/repos/{owner}/{repo}", headers=headers, timeout=15) - resp.raise_for_status() - data = resp.json() + return await self._read_repo(owner, repo, url) - # Get README - readme_text = "" - rr = requests.get(f"{self.API}/repos/{owner}/{repo}/readme", headers=headers, timeout=15) - if rr.ok: - import base64 - readme_data = rr.json() - if readme_data.get("encoding") == "base64": - readme_text = base64.b64decode(readme_data["content"]).decode("utf-8", errors="replace") + async def _read_repo(self, owner: str, repo: str, url: str) -> ReadResult: + slug = f"{owner}/{repo}" + try: + # Get repo info + info = self._gh(["repo", "view", slug]) + # Get README + try: + readme = self._gh( + ["api", f"repos/{slug}/readme", "--jq", ".content"], + timeout=10, + ) + import base64 + readme_text = base64.b64decode(readme).decode("utf-8", errors="replace") + except Exception: + readme_text = "" - return ReadResult( - title=f"{owner}/{repo}", - content=readme_text or data.get("description", ""), - url=url, - author=owner, - platform="github", - extra={"stars": data.get("stargazers_count", 0), "forks": data.get("forks_count", 0), - "language": data.get("language", ""), "description": data.get("description", "")}, - ) + content = readme_text or info + return ReadResult( + title=slug, content=content, url=url, + author=owner, platform="github", + ) + except Exception: + from agent_reach.channels.web import WebChannel + return await WebChannel().read(url) + + async def _read_issue(self, owner: str, repo: str, num: str, url: str) -> ReadResult: + slug = f"{owner}/{repo}" + try: + out = self._gh(["issue", "view", num, "-R", slug]) + return ReadResult( + title=f"{slug}#{num}", content=out, url=url, + platform="github", + ) + except Exception: + # Might be a PR + try: + out = self._gh(["pr", "view", num, "-R", slug]) + return ReadResult( + title=f"{slug}#{num}", content=out, url=url, + platform="github", + ) + except Exception: + from agent_reach.channels.web import WebChannel + return await WebChannel().read(url) async def search(self, query: str, config=None, **kwargs) -> List[SearchResult]: + if not shutil.which("gh"): + raise ValueError("GitHub search requires gh CLI. Install: https://cli.github.com") + language = kwargs.get("language") limit = kwargs.get("limit", 5) - q = query + args = ["search", "repos", query, "--sort", "stars", f"--limit={limit}"] if language: - q += f" language:{language}" - - resp = requests.get( - f"{self.API}/search/repositories", - headers=self._headers(config), - params={"q": q, "sort": "stars", "per_page": min(limit, 30)}, - timeout=15, - ) - resp.raise_for_status() + args += [f"--language={language}"] + out = self._gh(args, timeout=15) results = [] - for repo in resp.json().get("items", []): - results.append(SearchResult( - title=repo.get("full_name", ""), - url=repo.get("html_url", ""), - snippet=repo.get("description", ""), - date=repo.get("updated_at", ""), - extra={"stars": repo.get("stargazers_count", 0), - "forks": repo.get("forks_count", 0), - "language": repo.get("language", "")}, - )) + for line in out.strip().split("\n"): + if not line.strip(): + continue + parts = line.split("\t") + if len(parts) >= 1: + slug = parts[0].strip() + desc = parts[1].strip() if len(parts) > 1 else "" + stars = parts[3].strip() if len(parts) > 3 else "" + lang = parts[5].strip() if len(parts) > 5 else "" + results.append(SearchResult( + title=slug, + url=f"https://github.com/{slug}", + snippet=desc, + extra={"stars": stars, "language": lang}, + )) return results diff --git a/agent_reach/channels/youtube.py b/agent_reach/channels/youtube.py index a186a96..a544f8c 100644 --- a/agent_reach/channels/youtube.py +++ b/agent_reach/channels/youtube.py @@ -1,8 +1,8 @@ # -*- coding: utf-8 -*- -"""YouTube — via yt-dlp (free, pip install yt-dlp). +"""YouTube — via yt-dlp (video info, subtitles, and search). Backend: yt-dlp (https://github.com/yt-dlp/yt-dlp) -Swap to: any YouTube subtitle extractor +Supports: read (info + subtitles), search (ytsearch) """ import json @@ -10,8 +10,9 @@ import shutil import subprocess import tempfile from pathlib import Path -from urllib.parse import urlparse, parse_qs -from .base import Channel, ReadResult +from urllib.parse import urlparse +from .base import Channel, ReadResult, SearchResult +from typing import List class YouTubeChannel(Channel): @@ -22,52 +23,85 @@ class YouTubeChannel(Channel): tier = 0 def can_handle(self, url: str) -> bool: - domain = urlparse(url).netloc.lower() - return "youtube.com" in domain or "youtu.be" in domain + d = urlparse(url).netloc.lower() + return "youtube.com" in d or "youtu.be" in d async def read(self, url: str, config=None) -> ReadResult: if not shutil.which("yt-dlp"): raise RuntimeError("yt-dlp not installed. Install: pip install yt-dlp") with tempfile.TemporaryDirectory() as tmpdir: - # Get video info info = self._get_info(url) title = info.get("title", url) author = info.get("uploader", "") - # Try to get subtitles transcript = self._get_subtitles(url, tmpdir) - if not transcript: - transcript = f"[Video: {title}]\n[No subtitles available. Use Groq Whisper for transcription.]" + transcript = f"[Video: {title}]\n[No subtitles available.]" return ReadResult( - title=title, - content=transcript, - url=url, - author=author, - platform="youtube", + title=title, content=transcript, url=url, + author=author, platform="youtube", extra={ - "duration": info.get("duration"), + "duration": info.get("duration_string"), "view_count": info.get("view_count"), "upload_date": info.get("upload_date"), }, ) + async def search(self, query: str, config=None, **kwargs) -> List[SearchResult]: + """Search YouTube via yt-dlp's ytsearch.""" + if not shutil.which("yt-dlp"): + raise RuntimeError("yt-dlp not installed. Install: pip install yt-dlp") + + limit = kwargs.get("limit", 10) + + try: + r = subprocess.run( + ["yt-dlp", "--dump-json", "--flat-playlist", + f"ytsearch{limit}:{query}"], + capture_output=True, text=True, timeout=30, + ) + results = [] + for line in r.stdout.strip().split("\n"): + if not line.strip(): + continue + try: + d = json.loads(line) + vid = d.get("id", "") + results.append(SearchResult( + title=d.get("title", ""), + url=f"https://youtube.com/watch?v={vid}" if vid else "", + snippet=( + f"👤 {d.get('channel', '?')} · " + f"⏱ {d.get('duration_string', '?')} · " + f"👁 {d.get('view_count', '?')}" + ), + extra={ + "channel": d.get("channel"), + "duration": d.get("duration_string"), + "view_count": d.get("view_count"), + }, + )) + except json.JSONDecodeError: + continue + return results + except subprocess.TimeoutExpired: + return [] + def _get_info(self, url: str) -> dict: try: - result = subprocess.run( + r = subprocess.run( ["yt-dlp", "--dump-json", "--no-download", url], capture_output=True, text=True, timeout=30, ) - if result.returncode == 0: - return json.loads(result.stdout) + if r.returncode == 0: + return json.loads(r.stdout) except (subprocess.TimeoutExpired, json.JSONDecodeError): pass return {} def _get_subtitles(self, url: str, tmpdir: str) -> str: - """Extract subtitles using yt-dlp.""" try: subprocess.run( ["yt-dlp", "--write-auto-sub", "--write-sub", @@ -76,17 +110,14 @@ class YouTubeChannel(Channel): "-o", f"{tmpdir}/%(id)s.%(ext)s", url], capture_output=True, text=True, timeout=30, ) - - # Find and read subtitle file for f in Path(tmpdir).glob("*.vtt"): text = f.read_text(errors="replace") - # Strip VTT headers and timestamps lines = [] for line in text.split("\n"): line = line.strip() if not line or line.startswith("WEBVTT") or "-->" in line or line.isdigit(): continue - if line not in lines[-1:]: # deduplicate + if line not in lines[-1:]: lines.append(line) return "\n".join(lines) except subprocess.TimeoutExpired: diff --git a/docs/README_en.md b/docs/README_en.md index 5e5f738..bc1fafb 100644 --- a/docs/README_en.md +++ b/docs/README_en.md @@ -57,15 +57,15 @@ Copy that to your Agent. 30 seconds later, it can read tweets, search Reddit, an |----------|-------------|:-----:|-------| | 🌐 **Web** | Read | Zero config | Any URL → clean Markdown ([Jina Reader](https://github.com/jina-ai/reader) ⭐9.8K) | | 🐦 **Twitter/X** | Read · Search | Zero config / Cookie | Single tweets readable out of the box. Cookie unlocks search, timeline, posting ([birdx](https://github.com/runesleo/birdx)) | -| 📕 **XiaoHongShu** | Read · Search · **Post · Comment · Like** | Cookie | Full functionality with browser cookie | -| 🔍 **Web Search** | Search | Free key | One key unlocks web + Reddit + Twitter search ([Exa](https://exa.ai), free 1000/month) | -| 📦 **GitHub** | Read · Search | Zero config | Public repos work immediately. `gh` CLI or token unlocks Fork, Issue, PR | -| 📺 **YouTube** | Read | Zero config | Subtitles from 1800+ video sites ([yt-dlp](https://github.com/yt-dlp/yt-dlp) ⭐148K) | -| 📺 **Bilibili** | Read | Zero config / Proxy | Video info + subtitles. Local works directly, servers need a proxy | +| 📕 **XiaoHongShu** | Read · Search · **Post · Comment · Like** | mcporter | Via [xiaohongshu-mcp](https://github.com/user/xiaohongshu-mcp) internal API, install and go | +| 🔍 **Web Search** | Search | Auto-configured | Auto-configured during install, free, no API key ([Exa](https://exa.ai) via [mcporter](https://github.com/nicepkg/mcporter)) | +| 📦 **GitHub** | Read · Search | Zero config | [gh CLI](https://cli.github.com) powered. Public repos work immediately. `gh auth login` unlocks Fork, Issue, PR | +| 📺 **YouTube** | Read · **Search** | Zero config | Subtitles + search across 1800+ video sites ([yt-dlp](https://github.com/yt-dlp/yt-dlp) ⭐148K) | +| 📺 **Bilibili** | Read · **Search** | Zero config / Proxy | Video info + subtitles + search. Local works directly, servers need a proxy ([yt-dlp](https://github.com/yt-dlp/yt-dlp)) | | 📡 **RSS** | Read | Zero config | Any RSS/Atom feed ([feedparser](https://github.com/kurtmckee/feedparser) ⭐2.3K) | -| 📖 **Reddit** | Search · Read | Free / Proxy | Search via Exa (free). Reading posts needs a proxy on servers. OAuth bot unlocks posting | +| 📖 **Reddit** | Search · Read | Free / Proxy | Search via Exa (free). Reading posts needs a proxy on servers | -> **Setup levels:** Zero config = install and go · Free key = 30-second signup · Cookie = export from browser · Proxy = $1/month +> **Setup levels:** Zero config = install and go · Auto-configured = handled during install · mcporter = needs MCP service · Cookie = export from browser · Proxy = $1/month --- @@ -109,13 +109,9 @@ No configuration needed — just tell your Agent: Don't use it? Don't configure it. Every step is optional. -### 🔍 Search — Free, 30 seconds - -Go to [exa.ai](https://exa.ai), sign up for a free key (1000 searches/month), and send it to your Agent. One key unlocks web search + Reddit search + Twitter search. - ### 🍪 Cookies — Free, 2 minutes -Tell your Agent "help me configure Twitter cookies" or "set up XiaoHongShu" — it'll guide you through exporting from your browser. Local computers can auto-import. +Tell your Agent "help me configure Twitter cookies" — it'll guide you through exporting from your browser. Local computers can auto-import. ### 🌐 Proxy — $1/month, servers only @@ -165,9 +161,11 @@ Agent Reach does one simple thing: **it makes those tool selection and configura |----------|------|-----| | Read web pages | [Jina Reader](https://github.com/jina-ai/reader) | 9.8K stars, free, no API key needed | | Read tweets | [birdx](https://github.com/runesleo/birdx) | Cookie auth, no $100/month official API | -| Extract subtitles | [yt-dlp](https://github.com/yt-dlp/yt-dlp) | 148K stars, 1800+ video sites | -| Search the web | [Exa](https://exa.ai) | AI semantic search, 1000 free/month | +| Video subtitles + search | [yt-dlp](https://github.com/yt-dlp/yt-dlp) | 148K stars, YouTube + Bilibili + 1800 sites | +| Search the web | [Exa](https://exa.ai) via [mcporter](https://github.com/nicepkg/mcporter) | AI semantic search, MCP integration, no API key | +| GitHub | [gh CLI](https://cli.github.com) | Official tool, full API after auth | | Read RSS | [feedparser](https://github.com/kurtmckee/feedparser) | Python ecosystem standard, 2.3K stars | +| XiaoHongShu | [xiaohongshu-mcp](https://github.com/user/xiaohongshu-mcp) | Internal API, bypasses anti-bot | One file per platform, ~50 lines each. Swap any backend by editing one file — everything else stays untouched.