import { Type } from "@sinclair/typebox";
import type { AgentTool } from "@mariozechner/pi-agent-core";
import type { Dispatcher } from "undici";
import {
  closeDispatcher,
  createPinnedDispatcher,
  resolvePinnedHostname,
  SsrfBlockedError,
} from "./ssrf.js";
import {
  DEFAULT_CACHE_TTL_MINUTES,
  DEFAULT_TIMEOUT_SECONDS,
  normalizeCacheKey,
  readCache,
  readResponseText,
  resolveCacheTtlMs,
  resolveTimeoutSeconds,
  withTimeout,
  writeCache,
} from "./cache.js";
import type { CacheEntry } from "./cache.js";
import {
  extractContent,
  markdownToText,
  truncateText,
  type ExtractMode,
  type ExtractorType,
} from "./html-utils.js";
import { jsonResult, readNumberParam, readStringParam } from "./param-helpers.js";

const EXTRACT_MODES = ["markdown", "text"] as const;
const EXTRACTOR_TYPES = ["readability", "turndown"] as const;

const DEFAULT_FETCH_MAX_CHARS = 50_000;
const DEFAULT_FETCH_MAX_REDIRECTS = 3;
const DEFAULT_ERROR_MAX_CHARS = 4_000;
const DEFAULT_FETCH_USER_AGENT =
  "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_7_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36";

// Module-level cache of successful fetch payloads, keyed by the normalized
// (url, extractMode, extractor, maxChars) tuple built in runWebFetch.
// NOTE(review): the type arguments were lost in the reviewed text and are
// reconstructed from how readCache/writeCache are used below — confirm
// against the declarations in cache.ts.
const FETCH_CACHE = new Map<string, CacheEntry<Record<string, unknown>>>();

/** Parameter schema advertised to the agent for the `web_fetch` tool. */
const WebFetchSchema = Type.Object({
  url: Type.String({ description: "HTTP or HTTPS URL to fetch." }),
  extractMode: Type.Optional(
    Type.String({
      description: 'Output format: "markdown" (default) or "text" (plain text).',
    }),
  ),
  extractor: Type.Optional(
    Type.String({
      description:
        'Extraction method: "readability" (default, smart extraction of main content) or "turndown" (convert entire page).',
    }),
  ),
  maxChars: Type.Optional(
    Type.Number({
      description: "Maximum characters to return (truncates when exceeded). Default: 50000.",
      minimum: 100,
    }),
  ),
});

type WebFetchArgs = {
  url: string;
  extractMode?: string;
  extractor?: string;
  maxChars?: number;
};

/** Payload returned by the `web_fetch` tool on success. */
export type WebFetchResult = {
  url: string;
  finalUrl: string;
  status: number;
  contentType: string;
  title?: string;
  extractMode: ExtractMode;
  extractor: ExtractorType | "raw" | "json";
  truncated: boolean;
  length: number;
  fetchedAt: string;
  tookMs: number;
  text: string;
  cached?: boolean;
};

/**
 * Coerces a caller-supplied character limit to an integer of at least 100.
 * Non-numeric or non-finite input falls back to `fallback`.
 */
function resolveMaxChars(value: unknown, fallback: number): number {
  const parsed = typeof value === "number" && Number.isFinite(value) ? value : fallback;
  return Math.max(100, Math.floor(parsed));
}

/**
 * Coerces a redirect limit to a non-negative integer.
 * Non-numeric or non-finite input falls back to `fallback`.
 */
function resolveMaxRedirects(value: unknown, fallback: number): number {
  const parsed = typeof value === "number" && Number.isFinite(value) ? value : fallback;
  return Math.max(0, Math.floor(parsed));
}

/**
 * Heuristic: does the first 256 characters of `value` (after leading
 * whitespace) start like an HTML document?
 */
function looksLikeHtml(value: string): boolean {
  const trimmed = value.trimStart();
  if (!trimmed) return false;
  const head = trimmed.slice(0, 256).toLowerCase();
  // NOTE(review): the prefixes tested here were lost in the reviewed text;
  // these are the conventional document openers — confirm.
  return head.startsWith("<!doctype html") || head.startsWith("<html");
}

// NOTE(review): isRedirectStatus is called by fetchWithRedirects but its
// definition was not recoverable from the reviewed text. This covers the
// HTTP redirect codes that carry a Location header — confirm.
const REDIRECT_STATUS_CODES: ReadonlySet<number> = new Set([301, 302, 303, 307, 308]);

/** True when `status` is an HTTP redirect that should be followed manually. */
function isRedirectStatus(status: number): boolean {
  return REDIRECT_STATUS_CODES.has(status);
}

/**
 * Parses `raw` and ensures it uses the http or https scheme.
 *
 * @throws Error("Invalid URL: must be http or https") when `raw` cannot be
 *   parsed or uses any other scheme.
 */
function parseHttpUrl(raw: string): URL {
  let parsed: URL;
  try {
    parsed = new URL(raw);
  } catch {
    throw new Error("Invalid URL: must be http or https");
  }
  if (!["http:", "https:"].includes(parsed.protocol)) {
    throw new Error("Invalid URL: must be http or https");
  }
  return parsed;
}

/**
 * Issues a GET and follows redirects by hand, so that every hop is
 * re-validated and fetched through a dispatcher pinned to the address
 * returned by `resolvePinnedHostname` for that hop's hostname.
 *
 * A single signal from `withTimeout` is shared by all hops.
 *
 * @returns The first non-redirect response, the URL it was fetched from, and
 *   the dispatcher that served it. The caller owns that dispatcher and must
 *   pass it to `closeDispatcher` once the body has been consumed.
 * @throws Error on an invalid URL, a redirect without a usable Location
 *   header, more than `maxRedirects` redirects, or a redirect loop. Errors
 *   from hostname pinning and from `fetch` itself propagate unchanged.
 */
// NOTE(review): the parameter and return types were lost in the reviewed text
// and are reconstructed from the call in runWebFetch and the return statement.
async function fetchWithRedirects(params: {
  url: string;
  maxRedirects: number;
  timeoutSeconds: number;
  userAgent: string;
}): Promise<{ response: Response; finalUrl: string; dispatcher: Dispatcher }> {
  const signal = withTimeout(undefined, params.timeoutSeconds * 1000);
  const visited = new Set<string>();
  let currentUrl = params.url;
  let redirectCount = 0;

  while (true) {
    const parsedUrl = parseHttpUrl(currentUrl);
    // Record the URL we are about to request so a redirect back to it
    // (including back to the starting URL) is caught on the first hop.
    visited.add(parsedUrl.toString());

    const pinned = await resolvePinnedHostname(parsedUrl.hostname);
    const dispatcher = createPinnedDispatcher(pinned);
    // Set only when the dispatcher is returned to the caller; every other
    // exit from this iteration (redirect or throw) closes it in `finally`.
    let handedOff = false;
    try {
      // Use undici's dispatcher for SSRF protection. `dispatcher` is not part
      // of the DOM RequestInit type, hence the cast.
      const res = await fetch(parsedUrl.toString(), {
        method: "GET",
        headers: {
          Accept: "*/*",
          "User-Agent": params.userAgent,
          "Accept-Language": "en-US,en;q=0.9",
        },
        signal,
        redirect: "manual",
        dispatcher,
      } as unknown as RequestInit);

      if (!isRedirectStatus(res.status)) {
        handedOff = true;
        return { response: res, finalUrl: currentUrl, dispatcher };
      }

      const location = res.headers.get("location");
      if (!location) {
        throw new Error(`Redirect missing location header (${res.status})`);
      }
      redirectCount += 1;
      if (redirectCount > params.maxRedirects) {
        throw new Error(`Too many redirects (limit: ${params.maxRedirects})`);
      }

      let nextUrl: string;
      try {
        // Location may be relative; resolve it against the current URL.
        nextUrl = new URL(location, parsedUrl).toString();
      } catch {
        throw new Error(`Redirect has invalid location header (${res.status})`);
      }
      if (visited.has(nextUrl)) {
        throw new Error("Redirect loop detected");
      }

      // The redirect body is never read; discard it on a best-effort basis
      // and ignore a failed cancel rather than leave an unhandled rejection.
      void res.body?.cancel().catch(() => undefined);
      currentUrl = nextUrl;
    } finally {
      if (!handedOff) {
        await closeDispatcher(dispatcher);
      }
    }
  }
}

/**
 * Turns an error response body into a short message: HTML bodies (by
 * content type or by sniffing) are passed through `markdownToText`, then
 * the result is trimmed and truncated to `maxChars`.
 *
 * @returns The cleaned detail, or "" when `detail` is empty.
 */
function formatWebFetchErrorDetail(params: {
  detail: string;
  contentType?: string | null;
  maxChars: number;
}): string {
  const { detail, contentType, maxChars } = params;
  if (!detail) return "";
  let text = detail;
  const contentTypeLower = contentType?.toLowerCase();
  if (contentTypeLower?.includes("text/html") || looksLikeHtml(detail)) {
    text = markdownToText(detail);
  }
  const truncated = truncateText(text.trim(), maxChars);
  return truncated.text;
}

/**
 * Fetches `params.url`, extracts readable text and caches the result.
 *
 * A cache hit is returned immediately with `cached: true`. Otherwise the URL
 * is validated, fetched via `fetchWithRedirects`, and converted by content
 * type: HTML goes through `extractContent`, JSON is pretty-printed, anything
 * else is returned raw. The text is truncated to `params.maxChars`.
 *
 * @throws Error on an invalid URL, on any failure from `fetchWithRedirects`,
 *   or with `Web fetch failed (<status>): <detail>` for a non-2xx response.
 */
async function runWebFetch(params: {
  url: string;
  extractMode: ExtractMode;
  extractor: ExtractorType;
  maxChars: number;
  maxRedirects: number;
  timeoutSeconds: number;
  cacheTtlMs: number;
  userAgent: string;
}): Promise<WebFetchResult> {
  const cacheKey = normalizeCacheKey(
    `fetch:${params.url}:${params.extractMode}:${params.extractor}:${params.maxChars}`,
  );
  const cached = readCache(FETCH_CACHE, cacheKey);
  if (cached) return { ...cached.value, cached: true } as WebFetchResult;

  // Reject bad URLs before starting the timer or touching the network.
  parseHttpUrl(params.url);

  const start = Date.now();
  const { response: res, finalUrl, dispatcher } = await fetchWithRedirects({
    url: params.url,
    maxRedirects: params.maxRedirects,
    timeoutSeconds: params.timeoutSeconds,
    userAgent: params.userAgent,
  });

  try {
    if (!res.ok) {
      const rawDetail = await readResponseText(res);
      const detail = formatWebFetchErrorDetail({
        detail: rawDetail,
        contentType: res.headers.get("content-type"),
        maxChars: DEFAULT_ERROR_MAX_CHARS,
      });
      throw new Error(`Web fetch failed (${res.status}): ${detail || res.statusText}`);
    }

    const contentType = res.headers.get("content-type") ?? "application/octet-stream";
    // Media types are case-insensitive; compare on a lowered copy (as the
    // error path does) but report the header value as received.
    const contentTypeLower = contentType.toLowerCase();
    const body = await readResponseText(res);

    let title: string | undefined;
    let extractor: ExtractorType | "raw" | "json" = "raw";
    let text = body;
    if (contentTypeLower.includes("text/html")) {
      const extracted = await extractContent({
        html: body,
        url: finalUrl,
        extractMode: params.extractMode,
        extractor: params.extractor,
      });
      text = extracted.text;
      title = extracted.title;
      extractor = extracted.extractor;
    } else if (contentTypeLower.includes("application/json")) {
      try {
        text = JSON.stringify(JSON.parse(body), null, 2);
        extractor = "json";
      } catch {
        // Body claimed to be JSON but did not parse: return it untouched.
        text = body;
        extractor = "raw";
      }
    }

    const truncated = truncateText(text, params.maxChars);
    const payload: WebFetchResult = {
      url: params.url,
      finalUrl,
      status: res.status,
      contentType,
      extractMode: params.extractMode,
      extractor,
      truncated: truncated.truncated,
      length: truncated.text.length,
      fetchedAt: new Date().toISOString(),
      tookMs: Date.now() - start,
      text: truncated.text,
    };
    if (title) {
      payload.title = title;
    }
    writeCache(FETCH_CACHE, cacheKey, payload, params.cacheTtlMs);
    return payload;
  } finally {
    // The dispatcher must outlive the body read above, so it is closed here
    // rather than inside fetchWithRedirects.
    await closeDispatcher(dispatcher);
  }
}

/**
 * Builds the `web_fetch` agent tool.
 *
 * The tool never rejects for fetch-related failures: they are reported as a
 * JSON result with `error: "ssrf_blocked"` (for `SsrfBlockedError`) or
 * `error: "fetch_failed"`. Parameter reading happens before that guard, so
 * anything thrown by the `read*Param` helpers propagates to the caller.
 */
export function createWebFetchTool(): AgentTool {
  return {
    name: "web_fetch",
    label: "Web Fetch",
    description:
      'Fetch and extract readable content from a URL. Converts HTML to markdown or plain text. Use extractor="readability" for smart article extraction, or "turndown" for full page conversion.',
    parameters: WebFetchSchema,
    execute: async (_toolCallId, args) => {
      const params = args as WebFetchArgs;
      const record: Record<string, unknown> = params;

      const url = readStringParam(record, "url", { required: true });
      // Unrecognized values silently fall back to the documented defaults.
      const extractModeRaw = readStringParam(record, "extractMode");
      const extractMode: ExtractMode = extractModeRaw === "text" ? "text" : "markdown";
      const extractorRaw = readStringParam(record, "extractor");
      const extractor: ExtractorType = extractorRaw === "turndown" ? "turndown" : "readability";
      const maxChars = readNumberParam(record, "maxChars", { integer: true });

      try {
        const result = await runWebFetch({
          url,
          extractMode,
          extractor,
          maxChars: resolveMaxChars(maxChars, DEFAULT_FETCH_MAX_CHARS),
          maxRedirects: DEFAULT_FETCH_MAX_REDIRECTS,
          timeoutSeconds: DEFAULT_TIMEOUT_SECONDS,
          cacheTtlMs: resolveCacheTtlMs(DEFAULT_CACHE_TTL_MINUTES, DEFAULT_CACHE_TTL_MINUTES),
          userAgent: DEFAULT_FETCH_USER_AGENT,
        });
        return jsonResult(result);
      } catch (error) {
        if (error instanceof SsrfBlockedError) {
          return jsonResult({
            error: "ssrf_blocked",
            message: error.message,
          });
        }
        return jsonResult({
          error: "fetch_failed",
          message: error instanceof Error ? error.message : String(error),
        });
      }
    },
  };
}