multica/src/agent/tools/web/web-fetch.ts
Jiayuan 9b3ffd1e90
feat(agent): add web_fetch and web_search tools (#20)
* chore(deps): add web tools dependencies

Add undici, @mozilla/readability, linkedom, and turndown for
web fetching and HTML content extraction capabilities.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

* feat(agent): add web_fetch and web_search tools

Port network tools from moltbot with the following features:

web_fetch:
- Fetch and extract readable content from URLs
- Dual extraction mode: readability (smart) or turndown (full page)
- SSRF protection (blocks private IPs, localhost)
- Response caching with TTL
- Redirect handling

web_search:
- Brave Search API for traditional results
- Perplexity API for AI-synthesized answers
- Region/language settings
- Freshness filtering (Brave only)
- Auto-detects provider from available API keys

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

* feat(agent): register web_fetch and web_search tools

Add web tools to the agent's tool registry.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-30 05:09:16 +08:00

335 lines
10 KiB
TypeScript

import { Type } from "@sinclair/typebox";
import type { AgentTool } from "@mariozechner/pi-agent-core";
import type { Dispatcher } from "undici";
import {
closeDispatcher,
createPinnedDispatcher,
resolvePinnedHostname,
SsrfBlockedError,
} from "./ssrf.js";
import {
DEFAULT_CACHE_TTL_MINUTES,
DEFAULT_TIMEOUT_SECONDS,
normalizeCacheKey,
readCache,
readResponseText,
resolveCacheTtlMs,
resolveTimeoutSeconds,
withTimeout,
writeCache,
} from "./cache.js";
import type { CacheEntry } from "./cache.js";
import { extractContent, markdownToText, truncateText, type ExtractMode, type ExtractorType } from "./html-utils.js";
import { jsonResult, readNumberParam, readStringParam } from "./param-helpers.js";
// Output formats named in the `extractMode` parameter description.
// Not referenced elsewhere in the visible part of this file.
const EXTRACT_MODES = ["markdown", "text"] as const;
// Extraction strategies named in the `extractor` parameter description.
// Not referenced elsewhere in the visible part of this file.
const EXTRACTOR_TYPES = ["readability", "turndown"] as const;
// Character limit applied to the returned text when the caller gives no usable `maxChars`.
const DEFAULT_FETCH_MAX_CHARS = 50_000;
// Number of redirect hops followed before the fetch is abandoned.
const DEFAULT_FETCH_MAX_REDIRECTS = 3;
// Character limit for the response body excerpt embedded in non-OK error messages.
const DEFAULT_ERROR_MAX_CHARS = 4_000;
// User-Agent header sent with every request (a desktop Chrome identity).
const DEFAULT_FETCH_USER_AGENT =
"Mozilla/5.0 (Macintosh; Intel Mac OS X 14_7_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36";
// Module-level store of successful fetch payloads, keyed by the cache key built in runWebFetch.
// Being module-level, it is shared by every tool object created from this module.
// NOTE(review): entry expiry is presumably enforced by readCache/writeCache — confirm in cache.ts.
const FETCH_CACHE = new Map<string, CacheEntry<Record<string, unknown>>>();
/**
 * TypeBox schema for the arguments of the `web_fetch` tool.
 *
 * Only `url` is required. `extractMode` and `extractor` are declared as plain
 * strings rather than literal unions; createWebFetchTool treats any value other
 * than "text" / "turndown" as the respective default. `maxChars` carries a
 * schema-level minimum of 100, matching the floor applied by resolveMaxChars.
 */
const WebFetchSchema = Type.Object({
url: Type.String({ description: "HTTP or HTTPS URL to fetch." }),
extractMode: Type.Optional(
Type.String({
description: 'Output format: "markdown" (default) or "text" (plain text).',
}),
),
extractor: Type.Optional(
Type.String({
description:
'Extraction method: "readability" (default, smart extraction of main content) or "turndown" (convert entire page).',
}),
),
maxChars: Type.Optional(
Type.Number({
description: "Maximum characters to return (truncates when exceeded). Default: 50000.",
minimum: 100,
}),
),
});
/**
 * Hand-written shape of the tool arguments, mirroring the fields of
 * WebFetchSchema. The tool's `execute` handler casts its incoming `args` to
 * this type before reading individual parameters.
 */
type WebFetchArgs = {
url: string;
extractMode?: string;
extractor?: string;
maxChars?: number;
};
/**
 * Payload produced by a successful fetch. The same object is written to the
 * module cache and returned to the tool caller.
 */
export type WebFetchResult = {
/** URL exactly as requested by the caller. */
url: string;
/** URL that produced the response after any redirects were followed. */
finalUrl: string;
/** HTTP status code of the final response. */
status: number;
/** Content-Type header of the final response; "application/octet-stream" when the header is absent. */
contentType: string;
/** Title reported by HTML extraction; left unset when extraction yields no title or the body is not HTML. */
title?: string;
/** Output format that was requested for HTML extraction. */
extractMode: ExtractMode;
/** How `text` was produced: an HTML extractor, "json" for re-indented JSON, or "raw" for the body as received. */
extractor: ExtractorType | "raw" | "json";
/** Whether `text` was cut to fit the character limit. */
truncated: boolean;
/** Length of `text` after truncation. */
length: number;
/** ISO-8601 timestamp taken when the payload was built. */
fetchedAt: string;
/** Milliseconds elapsed between starting the network fetch and building the payload. */
tookMs: number;
/** Extracted (and possibly truncated) content. */
text: string;
/** Set to true only when the payload was served from the cache. */
cached?: boolean;
};
/**
 * Picks the character limit to apply to fetched content.
 *
 * @param value - Requested limit; honoured only when it is a finite number.
 * @param fallback - Limit used when `value` is absent or not a finite number.
 * @returns The selected limit rounded down to a whole number and raised to at
 *   least 100.
 */
function resolveMaxChars(value: unknown, fallback: number): number {
  let limit = fallback;
  if (typeof value === "number" && Number.isFinite(value)) {
    limit = value;
  }
  const wholeLimit = Math.floor(limit);
  return Math.max(100, wholeLimit);
}
/**
 * Picks the redirect budget for a fetch.
 *
 * @param value - Requested hop count; honoured only when it is a finite number.
 * @param fallback - Hop count used when `value` is absent or not a finite number.
 * @returns The selected count rounded down to a whole number, with negative
 *   results raised to 0.
 */
function resolveMaxRedirects(value: unknown, fallback: number): number {
  if (typeof value !== "number" || !Number.isFinite(value)) {
    return Math.max(0, Math.floor(fallback));
  }
  return Math.max(0, Math.floor(value));
}
/**
 * Cheap sniff for an HTML document.
 *
 * Leading whitespace is skipped and only the first 256 characters of the
 * remainder are inspected, case-insensitively, for an opening
 * `<!doctype html` or `<html`.
 *
 * @param value - Text to inspect.
 * @returns true when the text begins with one of the HTML markers; false
 *   otherwise, including for empty or whitespace-only input.
 */
function looksLikeHtml(value: string): boolean {
  const prefix = value.trimStart().slice(0, 256).toLowerCase();
  const markers = ["<!doctype html", "<html"];
  return markers.some((marker) => prefix.startsWith(marker));
}
/**
 * Tells whether an HTTP status code is one of the redirects this module
 * follows: 301, 302, 303, 307 or 308.
 *
 * @param status - HTTP status code of a response.
 * @returns true for the five codes above, false for everything else.
 */
function isRedirectStatus(status: number): boolean {
  switch (status) {
    case 301:
    case 302:
    case 303:
    case 307:
    case 308:
      return true;
    default:
      return false;
  }
}
/**
 * Performs a GET request, following redirects by hand so that every hop is
 * re-validated and re-pinned before it is contacted.
 *
 * For each hop the URL is parsed, restricted to http/https, its hostname is
 * resolved through `resolvePinnedHostname`, and the request is sent through a
 * dispatcher built from that resolution. Redirect responses (see
 * isRedirectStatus) are followed up to `maxRedirects` times.
 *
 * Ownership: on success the dispatcher used for the final hop is returned to
 * the caller, who must close it once the response body has been consumed.
 * Dispatchers for intermediate hops, and for any hop that fails, are closed
 * here.
 *
 * @param params.url - Starting URL.
 * @param params.maxRedirects - Maximum number of redirect hops to follow.
 * @param params.timeoutSeconds - Time budget passed to `withTimeout`; the
 *   resulting signal is created once and shared by every hop.
 * @param params.userAgent - Value sent in the User-Agent header.
 * @returns The final (non-redirect) response, the URL that produced it, and
 *   the still-open dispatcher it was fetched through.
 * @throws Error when a URL is malformed or not http/https, when a redirect has
 *   no or an unparseable Location header, when the redirect limit is exceeded,
 *   or when a redirect target repeats. Errors raised by hostname resolution or
 *   by `fetch` itself propagate unchanged.
 */
async function fetchWithRedirects(params: {
  url: string;
  maxRedirects: number;
  timeoutSeconds: number;
  userAgent: string;
}): Promise<{ response: Response; finalUrl: string; dispatcher: Dispatcher }> {
  const signal = withTimeout(undefined, params.timeoutSeconds * 1000);
  // Redirect targets already scheduled; a repeat means the chain is cycling.
  const visited = new Set<string>();
  let currentUrl = params.url;
  let redirectCount = 0;

  while (true) {
    let parsedUrl: URL;
    try {
      parsedUrl = new URL(currentUrl);
    } catch {
      throw new Error("Invalid URL: must be http or https");
    }
    if (!["http:", "https:"].includes(parsedUrl.protocol)) {
      throw new Error("Invalid URL: must be http or https");
    }

    const pinned = await resolvePinnedHostname(parsedUrl.hostname);
    const dispatcher = createPinnedDispatcher(pinned);
    // Flipped only when the dispatcher is returned to the caller; every other
    // way out of this iteration closes it in the `finally` below.
    let handedOff = false;
    try {
      // Use undici's dispatcher for SSRF protection
      const res = await fetch(parsedUrl.toString(), {
        method: "GET",
        headers: {
          Accept: "*/*",
          "User-Agent": params.userAgent,
          "Accept-Language": "en-US,en;q=0.9",
        },
        signal,
        // Redirects are handled by this loop so each target is checked first.
        redirect: "manual",
        dispatcher,
      } as unknown as RequestInit);

      if (!isRedirectStatus(res.status)) {
        handedOff = true;
        return { response: res, finalUrl: currentUrl, dispatcher };
      }

      const location = res.headers.get("location");
      if (!location) {
        throw new Error(`Redirect missing location header (${res.status})`);
      }
      redirectCount += 1;
      if (redirectCount > params.maxRedirects) {
        throw new Error(`Too many redirects (limit: ${params.maxRedirects})`);
      }

      // The URL constructor throws on a malformed Location value; keep that
      // inside this try block so the dispatcher is still released.
      let nextUrl: string;
      try {
        nextUrl = new URL(location, parsedUrl).toString();
      } catch {
        throw new Error(`Invalid redirect location (${res.status}): ${location}`);
      }
      if (visited.has(nextUrl)) {
        throw new Error("Redirect loop detected");
      }
      visited.add(nextUrl);

      // Best-effort discard of the redirect body; a failure to cancel must not
      // surface as an unhandled rejection.
      void res.body?.cancel().catch(() => undefined);
      currentUrl = nextUrl;
    } finally {
      if (!handedOff) {
        await closeDispatcher(dispatcher);
      }
    }
  }
}
/**
 * Turns the body of a failed response into a short excerpt suitable for an
 * error message.
 *
 * When the Content-Type mentions `text/html` (case-insensitively) or the body
 * itself looks like an HTML document, the body is first passed through
 * `markdownToText`. The result is trimmed and cut to `maxChars` via
 * `truncateText`.
 *
 * @param params.detail - Raw response body.
 * @param params.contentType - Content-Type header of the response, if any.
 * @param params.maxChars - Maximum length of the returned excerpt.
 * @returns The excerpt, or an empty string when `detail` is empty.
 */
function formatWebFetchErrorDetail(params: {
  detail: string;
  contentType?: string | null;
  maxChars: number;
}): string {
  if (!params.detail) {
    return "";
  }

  const declaredAsHtml = params.contentType?.toLowerCase().includes("text/html");
  // The body sniff runs only when the header did not already say HTML.
  const treatAsHtml = declaredAsHtml || looksLikeHtml(params.detail);
  const readable = treatAsHtml ? markdownToText(params.detail) : params.detail;

  return truncateText(readable.trim(), params.maxChars).text;
}
/**
 * Fetches a URL and converts the response into a WebFetchResult, using the
 * module cache to avoid repeating identical requests.
 *
 * The cache key combines the URL, extract mode, extractor and character limit.
 * A cache hit is returned immediately with `cached: true`, before any URL
 * validation or network access. On a miss the URL is validated, fetched via
 * fetchWithRedirects, and the body is processed by content type:
 * - `text/html` is run through `extractContent`;
 * - `application/json` is re-serialized with two-space indentation, falling
 *   back to the raw body when it does not parse;
 * - anything else is returned as received.
 * The text is then truncated to `maxChars` and the payload is cached.
 *
 * @param params.url - URL to fetch; must be http or https.
 * @param params.extractMode - Output format requested for HTML extraction.
 * @param params.extractor - Extraction strategy requested for HTML.
 * @param params.maxChars - Character limit applied to the returned text.
 * @param params.maxRedirects - Redirect budget passed to fetchWithRedirects.
 * @param params.timeoutSeconds - Time budget passed to fetchWithRedirects.
 * @param params.cacheTtlMs - Time-to-live passed to `writeCache`.
 * @param params.userAgent - User-Agent header value.
 * @returns The fetch payload (freshly built or cached).
 * @throws Error when the URL is malformed or not http/https, or when the final
 *   response is not OK (the message carries the status and a body excerpt).
 *   Errors from fetchWithRedirects propagate unchanged.
 */
async function runWebFetch(params: {
  url: string;
  extractMode: ExtractMode;
  extractor: ExtractorType;
  maxChars: number;
  maxRedirects: number;
  timeoutSeconds: number;
  cacheTtlMs: number;
  userAgent: string;
}): Promise<WebFetchResult> {
  const cacheKey = normalizeCacheKey(
    `fetch:${params.url}:${params.extractMode}:${params.extractor}:${params.maxChars}`,
  );
  const cached = readCache(FETCH_CACHE, cacheKey);
  if (cached) return { ...cached.value, cached: true } as WebFetchResult;

  let parsedUrl: URL;
  try {
    parsedUrl = new URL(params.url);
  } catch {
    throw new Error("Invalid URL: must be http or https");
  }
  if (!["http:", "https:"].includes(parsedUrl.protocol)) {
    throw new Error("Invalid URL: must be http or https");
  }

  const start = Date.now();
  const { response: res, finalUrl, dispatcher } = await fetchWithRedirects({
    url: params.url,
    maxRedirects: params.maxRedirects,
    timeoutSeconds: params.timeoutSeconds,
    userAgent: params.userAgent,
  });

  // The dispatcher returned above stays open until the body has been read.
  try {
    if (!res.ok) {
      const rawDetail = await readResponseText(res);
      const detail = formatWebFetchErrorDetail({
        detail: rawDetail,
        contentType: res.headers.get("content-type"),
        maxChars: DEFAULT_ERROR_MAX_CHARS,
      });
      throw new Error(`Web fetch failed (${res.status}): ${detail || res.statusText}`);
    }

    const contentType = res.headers.get("content-type") ?? "application/octet-stream";
    // Media types are case-insensitive, so match against a lowercased copy
    // while reporting the header exactly as the server sent it.
    const mediaType = contentType.toLowerCase();
    const body = await readResponseText(res);

    let title: string | undefined;
    let extractor: ExtractorType | "raw" | "json" = "raw";
    let text = body;
    if (mediaType.includes("text/html")) {
      const extracted = await extractContent({
        html: body,
        url: finalUrl,
        extractMode: params.extractMode,
        extractor: params.extractor,
      });
      text = extracted.text;
      title = extracted.title;
      extractor = extracted.extractor;
    } else if (mediaType.includes("application/json")) {
      try {
        text = JSON.stringify(JSON.parse(body), null, 2);
        extractor = "json";
      } catch {
        // Not valid JSON despite the header: keep the raw body and "raw" marker.
      }
    }

    const truncated = truncateText(text, params.maxChars);
    const payload: WebFetchResult = {
      url: params.url,
      finalUrl,
      status: res.status,
      contentType,
      extractMode: params.extractMode,
      extractor,
      truncated: truncated.truncated,
      length: truncated.text.length,
      fetchedAt: new Date().toISOString(),
      tookMs: Date.now() - start,
      text: truncated.text,
    };
    if (title) {
      payload.title = title;
    }
    writeCache(FETCH_CACHE, cacheKey, payload, params.cacheTtlMs);
    return payload;
  } finally {
    await closeDispatcher(dispatcher);
  }
}
/**
 * Builds the `web_fetch` agent tool.
 *
 * The tool's `execute` handler reads `url`, `extractMode`, `extractor` and
 * `maxChars` from the call arguments, substitutes defaults for anything
 * missing or unrecognized, and delegates to runWebFetch using the module's
 * default redirect limit, timeout, cache TTL and User-Agent.
 *
 * Failures raised by runWebFetch are reported as a JSON result rather than
 * thrown: `{ error: "ssrf_blocked", message }` for an SsrfBlockedError and
 * `{ error: "fetch_failed", message }` for anything else. Parameter reading
 * happens before that error handling, so whatever the param helpers throw
 * propagates to the caller.
 *
 * @returns The tool definition, ready to be registered with the agent.
 */
export function createWebFetchTool(): AgentTool<typeof WebFetchSchema, unknown> {
  return {
    name: "web_fetch",
    label: "Web Fetch",
    description:
      'Fetch and extract readable content from a URL. Converts HTML to markdown or plain text. Use extractor="readability" for smart article extraction, or "turndown" for full page conversion.',
    parameters: WebFetchSchema,
    execute: async (_toolCallId, args) => {
      const typedArgs = args as WebFetchArgs;
      const rawArgs = typedArgs as Record<string, unknown>;

      const url = readStringParam(rawArgs, "url", { required: true });

      // Anything other than the one recognized alternative selects the default.
      let extractMode: ExtractMode = "markdown";
      if (readStringParam(rawArgs, "extractMode") === "text") {
        extractMode = "text";
      }
      let extractor: ExtractorType = "readability";
      if (readStringParam(rawArgs, "extractor") === "turndown") {
        extractor = "turndown";
      }

      const requestedMaxChars = readNumberParam(rawArgs, "maxChars", { integer: true });

      try {
        const request = {
          url,
          extractMode,
          extractor,
          maxChars: resolveMaxChars(requestedMaxChars, DEFAULT_FETCH_MAX_CHARS),
          maxRedirects: DEFAULT_FETCH_MAX_REDIRECTS,
          timeoutSeconds: DEFAULT_TIMEOUT_SECONDS,
          cacheTtlMs: resolveCacheTtlMs(DEFAULT_CACHE_TTL_MINUTES, DEFAULT_CACHE_TTL_MINUTES),
          userAgent: DEFAULT_FETCH_USER_AGENT,
        };
        const fetched = await runWebFetch(request);
        return jsonResult(fetched);
      } catch (error) {
        if (!(error instanceof SsrfBlockedError)) {
          const message = error instanceof Error ? error.message : String(error);
          return jsonResult({ error: "fetch_failed", message });
        }
        return jsonResult({ error: "ssrf_blocked", message: error.message });
      }
    },
  };
}