diff --git a/package.json b/package.json index 43572309..1e3856ff 100644 --- a/package.json +++ b/package.json @@ -23,6 +23,7 @@ "packageManager": "pnpm@10.16.1", "devDependencies": { "@types/node": "^25.0.10", + "@types/turndown": "^5.0.6", "@types/uuid": "^11.0.0", "tsx": "^4.21.0", "turbo": "^2.3.4", @@ -32,6 +33,7 @@ "@mariozechner/pi-agent-core": "^0.50.3", "@mariozechner/pi-ai": "^0.50.3", "@mariozechner/pi-coding-agent": "^0.50.3", + "@mozilla/readability": "^0.6.0", "@nestjs/common": "^11.1.12", "@nestjs/core": "^11.1.12", "@nestjs/platform-express": "^11.1.12", @@ -40,6 +42,7 @@ "@nestjs/websockets": "^11.1.12", "@sinclair/typebox": "^0.34.41", "fast-glob": "^3.3.3", + "linkedom": "^0.18.12", "nestjs-pino": "^4.5.0", "pino": "^10.3.0", "pino-http": "^11.0.0", @@ -48,6 +51,8 @@ "rxjs": "^7.8.2", "socket.io": "^4.8.3", "socket.io-client": "^4.8.3", + "turndown": "^7.2.2", + "undici": "^7.19.2", "uuid": "^13.0.0" } } diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 81c3cb44..02d9e9ec 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -17,6 +17,9 @@ importers: '@mariozechner/pi-coding-agent': specifier: ^0.50.3 version: 0.50.3(@modelcontextprotocol/sdk@1.25.3(hono@4.11.7)(zod@4.3.6))(ws@8.18.3)(zod@4.3.6) + '@mozilla/readability': + specifier: ^0.6.0 + version: 0.6.0 '@nestjs/common': specifier: ^11.1.12 version: 11.1.12(reflect-metadata@0.2.2)(rxjs@7.8.2) @@ -41,6 +44,9 @@ importers: fast-glob: specifier: ^3.3.3 version: 3.3.3 + linkedom: + specifier: ^0.18.12 + version: 0.18.12 nestjs-pino: specifier: ^4.5.0 version: 4.5.0(@nestjs/common@11.1.12(reflect-metadata@0.2.2)(rxjs@7.8.2))(pino-http@11.0.0)(pino@10.3.0)(rxjs@7.8.2) @@ -65,6 +71,12 @@ importers: socket.io-client: specifier: ^4.8.3 version: 4.8.3 + turndown: + specifier: ^7.2.2 + version: 7.2.2 + undici: + specifier: ^7.19.2 + version: 7.19.2 uuid: specifier: ^13.0.0 version: 13.0.0 @@ -72,6 +84,9 @@ importers: '@types/node': specifier: ^25.0.10 version: 25.0.10 + '@types/turndown': + specifier: ^5.0.6 + version: 5.0.6 '@types/uuid': specifier: ^11.0.0 version: 11.0.0 @@ -1043,6 +1058,9 @@ packages: '@mistralai/mistralai@1.10.0': resolution: {integrity: sha512-tdIgWs4Le8vpvPiUEWne6tK0qbVc+jMenujnvTqOjogrJUsCSQhus0tHTU1avDDh5//Rq2dFgP9mWRAdIEoBqg==} + '@mixmark-io/domino@2.2.0': + resolution: {integrity: sha512-Y28PR25bHXUg88kCV7nivXrP2Nj2RueZ3/l/jdx6J9f8J4nsEGcgX0Qe6lt7Pa+J79+kPiJU3LguR6O/6zrLOw==} + '@modelcontextprotocol/sdk@1.25.3': resolution: {integrity: sha512-vsAMBMERybvYgKbg/l4L1rhS7VXV1c0CtyJg72vwxONVX0l4ZfKVAnZEWTQixJGTzKnELjQ59e4NbdFDALRiAQ==} engines: {node: '>=18'} @@ -1053,6 +1071,10 @@ packages: '@cfworker/json-schema': optional: true + '@mozilla/readability@0.6.0': + resolution: {integrity: sha512-juG5VWh4qAivzTAeMzvY9xs9HY5rAcr2E4I7tiSSCokRFi7XIZCAu92ZkSTsIj1OPceCifL3cpfteP3pDT9/QQ==} + engines: {node: '>=14.0.0'} + '@mswjs/interceptors@0.40.0': resolution: {integrity: sha512-EFd6cVbHsgLa6wa4RljGj6Wk75qoHxUSyc5asLyyPSyuhIcdS2Q3Phw6ImS1q+CkALthJRShiYfKANcQMuMqsQ==} engines: {node: '>=18'} @@ -1585,6 +1607,9 @@ packages: '@types/statuses@2.0.6': resolution: {integrity: sha512-xMAgYwceFhRA2zY+XbEA7mxYbA093wdiW8Vu6gZPGWy9cmOyU9XesH1tNcEWsKFd5Vzrqx5T3D38PWx1FIIXkA==} + '@types/turndown@5.0.6': + resolution: {integrity: sha512-ru00MoyeeouE5BX4gRL+6m/BsDfbRayOskWqUvh7CLGW+UXxHQItqALa38kKnOiZPqJrtzJUgAC2+F0rL1S4Pg==} + '@types/uuid@11.0.0': resolution: {integrity: sha512-HVyk8nj2m+jcFRNazzqyVKiZezyhDKrGUA3jlEcg/nZ6Ms+qHwocba1Y/AaVaznJTAM9xpdFSh+ptbNrhOGvZA==} deprecated: This is a stub types definition. uuid provides its own type definitions, so you do not need this installed. @@ -1903,6 +1928,9 @@ packages: resolution: {integrity: sha512-oP5VkATKlNwcgvxi0vM0p/D3n2C3EReYVX+DNYs5TjZFn/oQt2j+4sVJtSMr18pdRr8wjTcBl6LoV+FUwzPmNA==} engines: {node: '>=18'} + boolbase@1.0.0: + resolution: {integrity: sha512-JZOSA7Mo9sNGB8+UjSgzdLtokWAky1zbztM3WRLCbZ70/3cTANmQmOdR7y2g+J0e2WXywy1yS468tY+IruqEww==} + bowser@2.13.1: resolution: {integrity: sha512-OHawaAbjwx6rqICCKgSG0SAnT05bzd7ppyKLVUITZpANBaaMFBAsaNkto3LoQ31tyFP5kNujE8Cdx85G9VzOkw==} @@ -2076,11 +2104,21 @@ packages: resolution: {integrity: sha512-uV2QOWP2nWzsy2aMp8aRibhi9dlzF5Hgh5SHaB9OiTGEyDTiJJyx0uy51QXdyWbtAHNua4XJzUKca3OzKUd3vA==} engines: {node: '>= 8'} + css-select@5.2.2: + resolution: {integrity: sha512-TizTzUddG/xYLA3NXodFM0fSbNizXjOKhqiQQwvhlspadZokn1KDy0NZFS0wuEubIYAV5/c1/lAr0TaaFXEXzw==} + + css-what@6.2.2: + resolution: {integrity: sha512-u/O3vwbptzhMs3L1fQE82ZSLHQQfto5gyZzwteVIEyeaY5Fc7R4dapF/BvRoSYFeqfBk4m0V1Vafq5Pjv25wvA==} + engines: {node: '>= 6'} + cssesc@3.0.0: resolution: {integrity: sha512-/Tb/JcjK111nNScGob5MNtsntNM1aCNUDipB/TkwZFhyDrrE47SOx/18wF2bbjgc3ZzCSKW1T5nt5EbFoAz/Vg==} engines: {node: '>=4'} hasBin: true + cssom@0.5.0: + resolution: {integrity: sha512-iKuQcq+NdHqlAcwUY0o/HL69XQrUaQdMjmStJ8JFmUaiiQErlhrmuigkg/CU4E2J0IyUKUrMAgl36TvN67MqTw==} + csstype@3.2.3: resolution: {integrity: sha512-z1HGKcYy2xA8AGQfwrn0PAy+PB7X/GSj3UVJW9qKyn43xWa+gl5nXmU4qqLMRzWVLFC8KusUX8T/0kCiOYpAIQ==} @@ -2182,6 +2220,19 @@ packages: resolution: {integrity: sha512-35mSku4ZXK0vfCuHEDAwt55dg2jNajHZ1odvF+8SSr82EsZY4QmXfuWso8oEd8zRhVObSN18aM0CjSdoBX7zIw==} engines: {node: '>=0.10.0'} + dom-serializer@2.0.0: + resolution: {integrity: sha512-wIkAryiqt/nV5EQKqQpo3SToSOV9J0DnbJqwK7Wv/Trc92zIAYZ4FlMu+JPFW1DfGFt81ZTCGgDEabffXeLyJg==} + + domelementtype@2.3.0: + resolution: {integrity: sha512-OLETBj6w0OsagBwdXnPdN0cnMfF9opN69co+7ZrbfPGrdpPVNBUj02spi6B1N7wChLQiPn4CSH/zJvXw56gmHw==} + + domhandler@5.0.3: + resolution: {integrity: sha512-cgwlv/1iFQiFnU96XXgROh8xTeetsnJiDsTc7TYCLFd9+/WNkIqPTxiM/8pSd8VIrhXGTf1Ny1q1hquVqDJB5w==} + engines: {node: '>= 4'} + + domutils@3.2.2: + resolution: {integrity: sha512-6kZKyUajlDuqlHKVX1w7gyslj9MPIXzIFiz/rGu35uC1wMi+kMhQwGhl4lt9unC9Vb9INnY9Z3/ZA3+FhASLaw==} + dotenv@17.2.3: resolution: {integrity: sha512-JVUnt+DUIzu87TABbhPmNfVdBDt18BLOWjMUFJMSi/Qqg7NTYtabbvSNJGOJ7afbRuv9D/lngizHtP7QyLQ+9w==} engines: {node: '>=12'} @@ -2237,6 +2288,14 @@ packages: resolution: {integrity: sha512-LgQMM4WXU3QI+SYgEc2liRgznaD5ojbmY3sb8LxyguVkIg5FxdpTkvk72te2R38/TGKxH634oLxXRGY6d7AP+Q==} engines: {node: '>=10.13.0'} + entities@4.5.0: + resolution: {integrity: sha512-V0hjH4dGPh9Ao5p0MoRY6BVqtwCjhz6vI5LT8AJ55H+4g9/4vbHx1I54fS0XuclLhDHArPQCiMjDxjaL8fPxhw==} + engines: {node: '>=0.12'} + + entities@7.0.1: + resolution: {integrity: sha512-TWrgLOFUQTH994YUyl1yT4uyavY5nNB5muff+RtWaqNVCAK408b5ZnnbNAUEWLTCpum9w6arT70i1XdQ4UeOPA==} + engines: {node: '>=0.12'} + env-paths@2.2.1: resolution: {integrity: sha512-+h1lkLKhZMTYjog1VEpJNG7NZJWcuc2DDk/qsqSTRRCOXiLjeQ1d1/udrUGhqMxUgAlwKNZ0cf2uqan5GLuS2A==} engines: {node: '>=6'} @@ -2726,6 +2785,12 @@ packages: resolution: {integrity: sha512-l7qMiNee7t82bH3SeyUCt9UF15EVmaBvsppY2zQtrbIhl/yzBTny+YUxsVjSjQ6gaqaeVtZmGocom8TzBlA4Yw==} engines: {node: '>=16.9.0'} + html-escaper@3.0.3: + resolution: {integrity: sha512-RuMffC89BOWQoY0WKGpIhn5gX3iI54O6nRA0yC124NYVtzjmFWBIiFd8M0x+ZdX0P9R4lADg1mgP8C7PxGOWuQ==} + + htmlparser2@10.1.0: + resolution: {integrity: sha512-VTZkM9GWRAtEpveh7MSF6SjjrpNVNNVJfFup7xTY3UpFtm67foy9HDVXneLtFVt4pMz5kZtgNcvCniNFb1hlEQ==} + http-errors@2.0.1: resolution: {integrity: sha512-4FbRdAX+bSdmo4AUFuS0WNiPz8NgFt+r8ThgNWmlrjQjt1Q7ZR9+zTlce2859x4KSXrwIsaeTqDoKQmtP8pLmQ==} engines: {node: '>= 0.8'} @@ -3140,6 +3205,15 @@ packages: lines-and-columns@1.2.4: resolution: {integrity: sha512-7ylylesZQ/PV29jhEDl3Ufjo6ZX7gCqJr5F7PKrqc93v7fzSymt1BpwEU8nAUXs8qzzvqhbjhK5QZg6Mt/HkBg==} + linkedom@0.18.12: + resolution: {integrity: sha512-jalJsOwIKuQJSeTvsgzPe9iJzyfVaEJiEXl+25EkKevsULHvMJzpNqwvj1jOESWdmgKDiXObyjOYwlUqG7wo1Q==} + engines: {node: '>=16'} + peerDependencies: + canvas: '>= 2' + peerDependenciesMeta: + canvas: + optional: true + load-esm@1.0.3: resolution: {integrity: sha512-v5xlu8eHD1+6r8EHTg6hfmO97LN8ugKtiXcy5e6oN72iD2r6u0RPfLl6fxM+7Wnh2ZRq15o0russMst44WauPA==} engines: {node: '>=13.2.0'} @@ -3353,6 +3427,9 @@ packages: resolution: {integrity: sha512-9qny7Z9DsQU8Ou39ERsPU4OZQlSTP47ShQzuKZ6PRXpYLtIFgl/DEBYEXKlvcEa+9tHVcK8CF81Y2V72qaZhWA==} engines: {node: '>=18'} + nth-check@2.1.1: + resolution: {integrity: sha512-lqjrjmaOoAnWfMmBPL+XNnynZh2+swxiX3WUE0s4yEHI6m+AwrK2UZOimIRl3X/4QctVqS8AiZjFqyOGrMXb/w==} + object-assign@4.1.1: resolution: {integrity: sha512-rJgTQnkUnH1sFw8yT6VSU3zD3sWmu6sZhIseY8VX+GRu3P6F7Fu+JNDoXfklElbLJSnc3FUQHVe4cU5hj+BcUg==} engines: {node: '>=0.10.0'} @@ -4119,6 +4196,9 @@ packages: resolution: {integrity: sha512-hYbxnLEdvJF+DLALS+Ia+PbfNtn0sDP0hH2u9AFoskSUDmcVHSrtwHpzdX94MrRJKo9D9tYxY3MyP20gnlrWyA==} hasBin: true + turndown@7.2.2: + resolution: {integrity: sha512-1F7db8BiExOKxjSMU2b7if62D/XOyQyZbPKq/nUwopfgnHlqXHqQ0lvfUTeUIr1lZJzOPFn43dODyMSIfvWRKQ==} + tw-animate-css@1.4.0: resolution: {integrity: sha512-7bziOlRqH0hJx80h/3mbicLW7o8qLsH5+RaLR2t+OHM3D0JlWGODQKQ4cxbK7WlvmUxpcj6Kgu6EKqjrGFe3QQ==} @@ -4169,6 +4249,9 @@ packages: engines: {node: '>=14.17'} hasBin: true + uhyphen@0.2.0: + resolution: {integrity: sha512-qz3o9CHXmJJPGBdqzab7qAYuW8kQGKNEuoHFYrBwV6hWIMcpAmxDLXojcHfFr9US1Pe6zUswEIJIbLI610fuqA==} + uid@2.0.2: resolution: {integrity: sha512-u3xV3X7uzvi5b1MncmZo3i2Aw222Zk1keqLA1YkHldREkAhAqi65wuPfe7lHx8H/Wzy+8CE7S7uS3jekIM5s8g==} engines: {node: '>=8'} @@ -5561,6 +5644,8 @@ snapshots: zod: 3.25.76 zod-to-json-schema: 3.25.1(zod@3.25.76) + '@mixmark-io/domino@2.2.0': {} + '@modelcontextprotocol/sdk@1.25.3(hono@4.11.7)(zod@3.25.76)': dependencies: '@hono/node-server': 1.19.9(hono@4.11.7) @@ -5606,6 +5691,8 @@ snapshots: - supports-color optional: true + '@mozilla/readability@0.6.0': {} + '@mswjs/interceptors@0.40.0': dependencies: '@open-draft/deferred-promise': 2.2.0 @@ -6204,6 +6291,8 @@ snapshots: '@types/statuses@2.0.6': {} + '@types/turndown@5.0.6': {} + '@types/uuid@11.0.0': dependencies: uuid: 13.0.0 @@ -6531,6 +6620,8 @@ snapshots: transitivePeerDependencies: - supports-color + boolbase@1.0.0: {} + bowser@2.13.1: {} brace-expansion@1.1.12: @@ -6695,8 +6786,20 @@ snapshots: shebang-command: 2.0.0 which: 2.0.2 + css-select@5.2.2: + dependencies: + boolbase: 1.0.0 + css-what: 6.2.2 + domhandler: 5.0.3 + domutils: 3.2.2 + nth-check: 2.1.1 + + css-what@6.2.2: {} + cssesc@3.0.0: {} + cssom@0.5.0: {} + csstype@3.2.3: {} damerau-levenshtein@1.0.8: {} @@ -6776,6 +6879,24 @@ snapshots: dependencies: esutils: 2.0.3 + dom-serializer@2.0.0: + dependencies: + domelementtype: 2.3.0 + domhandler: 5.0.3 + entities: 4.5.0 + + domelementtype@2.3.0: {} + + domhandler@5.0.3: + dependencies: + domelementtype: 2.3.0 + + domutils@3.2.2: + dependencies: + dom-serializer: 2.0.0 + domelementtype: 2.3.0 + domhandler: 5.0.3 + dotenv@17.2.3: {} dunder-proto@1.0.1: @@ -6848,6 +6969,10 @@ snapshots: graceful-fs: 4.2.11 tapable: 2.3.0 + entities@4.5.0: {} + + entities@7.0.1: {} + env-paths@2.2.1: {} error-ex@1.3.4: @@ -7577,6 +7702,15 @@ snapshots: hono@4.11.7: {} + html-escaper@3.0.3: {} + + htmlparser2@10.1.0: + dependencies: + domelementtype: 2.3.0 + domhandler: 5.0.3 + domutils: 3.2.2 + entities: 7.0.1 + http-errors@2.0.1: dependencies: depd: 2.0.0 @@ -7942,6 +8076,14 @@ snapshots: lines-and-columns@1.2.4: {} + linkedom@0.18.12: + dependencies: + css-select: 5.2.2 + cssom: 0.5.0 + html-escaper: 3.0.3 + htmlparser2: 10.1.0 + uhyphen: 0.2.0 + load-esm@1.0.3: {} locate-path@6.0.0: @@ -8135,6 +8277,10 @@ snapshots: path-key: 4.0.0 unicorn-magic: 0.3.0 + nth-check@2.1.1: + dependencies: + boolbase: 1.0.0 + object-assign@4.1.1: {} object-hash@3.0.0: {} @@ -9075,6 +9221,10 @@ snapshots: turbo-windows-64: 2.8.0 turbo-windows-arm64: 2.8.0 + turndown@7.2.2: + dependencies: + '@mixmark-io/domino': 2.2.0 + tw-animate-css@1.4.0: {} type-check@0.4.0: @@ -9144,6 +9294,8 @@ snapshots: typescript@5.9.3: {} + uhyphen@0.2.0: {} + uid@2.0.2: dependencies: '@lukeed/csprng': 1.1.0 diff --git a/src/agent/tools.ts b/src/agent/tools.ts index c83478c1..594f1cc1 100644 --- a/src/agent/tools.ts +++ b/src/agent/tools.ts @@ -5,6 +5,7 @@ import type { AgentTool } from "@mariozechner/pi-agent-core"; import { createExecTool } from "./tools/exec.js"; import { createProcessTool } from "./tools/process.js"; import { createGlobTool } from "./tools/glob.js"; +import { createWebFetchTool, createWebSearchTool } from "./tools/web/index.js"; export function resolveModel(options: AgentOptions) { if (options.provider && options.model) { @@ -23,5 +24,14 @@ export function resolveTools(options: AgentOptions): AgentTool[] { const execTool = createExecTool(cwd); const processTool = createProcessTool(cwd); const globTool = createGlobTool(cwd); - return [...baseTools, execTool as AgentTool, processTool as AgentTool, globTool as AgentTool]; + const webFetchTool = createWebFetchTool(); + const webSearchTool = createWebSearchTool(); + return [ + ...baseTools, + execTool as AgentTool, + processTool as AgentTool, + globTool as AgentTool, + webFetchTool as AgentTool, + webSearchTool as AgentTool, + ]; } diff --git a/src/agent/tools/web/cache.ts b/src/agent/tools/web/cache.ts new file mode 100644 index 00000000..d5680006 --- /dev/null +++ b/src/agent/tools/web/cache.ts @@ -0,0 +1,87 @@ +export type CacheEntry = { + value: T; + expiresAt: number; + insertedAt: number; +}; + +export const DEFAULT_TIMEOUT_SECONDS = 30; +export const DEFAULT_CACHE_TTL_MINUTES = 15; +const DEFAULT_CACHE_MAX_ENTRIES = 100; + +export function resolveTimeoutSeconds(value: unknown, fallback: number): number { + const parsed = typeof value === "number" && Number.isFinite(value) ? value : fallback; + return Math.max(1, Math.floor(parsed)); +} + +export function resolveCacheTtlMs(value: unknown, fallbackMinutes: number): number { + const minutes = + typeof value === "number" && Number.isFinite(value) ? Math.max(0, value) : fallbackMinutes; + return Math.round(minutes * 60_000); +} + +export function normalizeCacheKey(value: string): string { + return value.trim().toLowerCase(); +} + +export function readCache( + cache: Map>, + key: string, +): { value: T; cached: boolean } | null { + const entry = cache.get(key); + if (!entry) return null; + if (Date.now() > entry.expiresAt) { + cache.delete(key); + return null; + } + return { value: entry.value, cached: true }; +} + +export function writeCache( + cache: Map>, + key: string, + value: T, + ttlMs: number, +) { + if (ttlMs <= 0) return; + if (cache.size >= DEFAULT_CACHE_MAX_ENTRIES) { + const oldest = cache.keys().next(); + if (!oldest.done) cache.delete(oldest.value); + } + cache.set(key, { + value, + expiresAt: Date.now() + ttlMs, + insertedAt: Date.now(), + }); +} + +export function withTimeout(signal: AbortSignal | undefined, timeoutMs: number): AbortSignal { + if (timeoutMs <= 0) return signal ?? new AbortController().signal; + const controller = new AbortController(); + const timer = setTimeout(() => controller.abort(), timeoutMs); + if (signal) { + signal.addEventListener( + "abort", + () => { + clearTimeout(timer); + controller.abort(); + }, + { once: true }, + ); + } + controller.signal.addEventListener( + "abort", + () => { + clearTimeout(timer); + }, + { once: true }, + ); + return controller.signal; +} + +export async function readResponseText(res: Response): Promise { + try { + return await res.text(); + } catch { + return ""; + } +} diff --git a/src/agent/tools/web/html-utils.ts b/src/agent/tools/web/html-utils.ts new file mode 100644 index 00000000..349de6c9 --- /dev/null +++ b/src/agent/tools/web/html-utils.ts @@ -0,0 +1,208 @@ +import TurndownService from "turndown"; + +export type ExtractMode = "markdown" | "text"; +export type ExtractorType = "readability" | "turndown"; + +export type ExtractResult = { + text: string; + title?: string; +}; + +export type ExtractResultWithExtractor = ExtractResult & { + extractor: ExtractorType; +}; + +function decodeEntities(value: string): string { + return value + .replace(/ /gi, " ") + .replace(/&/gi, "&") + .replace(/"/gi, '"') + .replace(/'/gi, "'") + .replace(/</gi, "<") + .replace(/>/gi, ">") + .replace(/&#x([0-9a-f]+);/gi, (_, hex) => String.fromCharCode(Number.parseInt(hex, 16))) + .replace(/&#(\d+);/gi, (_, dec) => String.fromCharCode(Number.parseInt(dec, 10))); +} + +function stripTags(value: string): string { + return decodeEntities(value.replace(/<[^>]+>/g, "")); +} + +function normalizeWhitespace(value: string): string { + return value + .replace(/\r/g, "") + .replace(/[ \t]+\n/g, "\n") + .replace(/\n{3,}/g, "\n\n") + .replace(/[ \t]{2,}/g, " ") + .trim(); +} + +function extractTitle(html: string): string | undefined { + const titleMatch = html.match(/]*>([\s\S]*?)<\/title>/i); + if (!titleMatch || !titleMatch[1]) return undefined; + const title = normalizeWhitespace(stripTags(titleMatch[1])); + return title || undefined; +} + +function buildResult(text: string, title: string | undefined): ExtractResult { + if (title) { + return { text, title }; + } + return { text }; +} + +function buildResultWithExtractor( + text: string, + title: string | undefined, + extractor: ExtractorType, +): ExtractResultWithExtractor { + if (title) { + return { text, title, extractor }; + } + return { text, extractor }; +} + +export function htmlToMarkdownSimple(html: string): ExtractResult { + const title = extractTitle(html); + let text = html + .replace(//gi, "") + .replace(//gi, "") + .replace(//gi, ""); + text = text.replace(/]*href=["']([^"']+)["'][^>]*>([\s\S]*?)<\/a>/gi, (_, href, body) => { + const label = normalizeWhitespace(stripTags(body)); + if (!label) return href; + return `[${label}](${href})`; + }); + text = text.replace(/]*>([\s\S]*?)<\/h\1>/gi, (_, level, body) => { + const prefix = "#".repeat(Math.max(1, Math.min(6, Number.parseInt(level, 10)))); + const label = normalizeWhitespace(stripTags(body)); + return `\n${prefix} ${label}\n`; + }); + text = text.replace(/]*>([\s\S]*?)<\/li>/gi, (_, body) => { + const label = normalizeWhitespace(stripTags(body)); + return label ? `\n- ${label}` : ""; + }); + text = text + .replace(/<(br|hr)\s*\/?>/gi, "\n") + .replace(/<\/(p|div|section|article|header|footer|table|tr|ul|ol)>/gi, "\n"); + text = stripTags(text); + text = normalizeWhitespace(text); + return buildResult(text, title); +} + +export function markdownToText(markdown: string): string { + let text = markdown; + text = text.replace(/!\[[^\]]*]\([^)]+\)/g, ""); + text = text.replace(/\[([^\]]+)]\([^)]+\)/g, "$1"); + text = text.replace(/```[\s\S]*?```/g, (block) => + block.replace(/```[^\n]*\n?/g, "").replace(/```/g, ""), + ); + text = text.replace(/`([^`]+)`/g, "$1"); + text = text.replace(/^#{1,6}\s+/gm, ""); + text = text.replace(/^\s*[-*+]\s+/gm, ""); + text = text.replace(/^\s*\d+\.\s+/gm, ""); + return normalizeWhitespace(text); +} + +export function truncateText( + value: string, + maxChars: number, +): { text: string; truncated: boolean } { + if (value.length <= maxChars) return { text: value, truncated: false }; + return { text: value.slice(0, maxChars), truncated: true }; +} + +/** + * Convert HTML to markdown using TurndownService (simpler, converts whole page) + */ +export function convertWithTurndown(html: string): ExtractResult { + const title = extractTitle(html); + + const turndownService = new TurndownService({ + headingStyle: "atx", + hr: "---", + bulletListMarker: "-", + codeBlockStyle: "fenced", + emDelimiter: "*", + }); + turndownService.remove(["script", "style", "meta", "link", "noscript"]); + + const text = normalizeWhitespace(turndownService.turndown(html)); + return buildResult(text, title); +} + +/** + * Extract readable content using Mozilla Readability (smarter, extracts main content) + */ +export async function extractWithReadability(params: { + html: string; + url: string; + extractMode: ExtractMode; +}): Promise { + const fallback = (): ExtractResult => { + const rendered = htmlToMarkdownSimple(params.html); + if (params.extractMode === "text") { + const text = markdownToText(rendered.text) || normalizeWhitespace(stripTags(params.html)); + return buildResult(text, rendered.title); + } + return rendered; + }; + + try { + const [{ Readability }, { parseHTML }] = await Promise.all([ + import("@mozilla/readability"), + import("linkedom"), + ]); + const { document } = parseHTML(params.html); + try { + (document as { baseURI?: string }).baseURI = params.url; + } catch { + // Best-effort base URI for relative links. + } + const reader = new Readability(document, { charThreshold: 0 }); + const parsed = reader.parse(); + if (!parsed?.content) return fallback(); + const title = parsed.title || undefined; + if (params.extractMode === "text") { + const text = normalizeWhitespace(parsed.textContent ?? ""); + if (!text) return fallback(); + return buildResult(text, title); + } + const rendered = htmlToMarkdownSimple(parsed.content); + return buildResult(rendered.text, title ?? rendered.title); + } catch { + return fallback(); + } +} + +/** + * Extract content from HTML using the specified extractor + */ +export async function extractContent(params: { + html: string; + url: string; + extractMode: ExtractMode; + extractor: ExtractorType; +}): Promise { + if (params.extractor === "turndown") { + const result = convertWithTurndown(params.html); + const text = params.extractMode === "text" ? markdownToText(result.text) : result.text; + return buildResultWithExtractor(text, result.title, "turndown"); + } + + // Default: readability + const result = await extractWithReadability({ + html: params.html, + url: params.url, + extractMode: params.extractMode, + }); + + if (result) { + return buildResultWithExtractor(result.text, result.title, "readability"); + } + + // Fallback to turndown if readability fails + const fallback = convertWithTurndown(params.html); + const text = params.extractMode === "text" ? markdownToText(fallback.text) : fallback.text; + return buildResultWithExtractor(text, fallback.title, "turndown"); +} diff --git a/src/agent/tools/web/index.ts b/src/agent/tools/web/index.ts new file mode 100644 index 00000000..e6213cf1 --- /dev/null +++ b/src/agent/tools/web/index.ts @@ -0,0 +1,2 @@ +export { createWebFetchTool, type WebFetchResult } from "./web-fetch.js"; +export { createWebSearchTool, type WebSearchResult } from "./web-search.js"; diff --git a/src/agent/tools/web/param-helpers.ts b/src/agent/tools/web/param-helpers.ts new file mode 100644 index 00000000..32991edb --- /dev/null +++ b/src/agent/tools/web/param-helpers.ts @@ -0,0 +1,73 @@ +import type { AgentToolResult } from "@mariozechner/pi-agent-core"; + +export type StringParamOptions = { + required?: boolean; + trim?: boolean; + label?: string; + allowEmpty?: boolean; +}; + +export function readStringParam( + params: Record, + key: string, + options: StringParamOptions & { required: true }, +): string; +export function readStringParam( + params: Record, + key: string, + options?: StringParamOptions, +): string | undefined; +export function readStringParam( + params: Record, + key: string, + options: StringParamOptions = {}, +) { + const { required = false, trim = true, label = key, allowEmpty = false } = options; + const raw = params[key]; + if (typeof raw !== "string") { + if (required) throw new Error(`${label} required`); + return undefined; + } + const value = trim ? raw.trim() : raw; + if (!value && !allowEmpty) { + if (required) throw new Error(`${label} required`); + return undefined; + } + return value; +} + +export function readNumberParam( + params: Record, + key: string, + options: { required?: boolean; label?: string; integer?: boolean } = {}, +): number | undefined { + const { required = false, label = key, integer = false } = options; + const raw = params[key]; + let value: number | undefined; + if (typeof raw === "number" && Number.isFinite(raw)) { + value = raw; + } else if (typeof raw === "string") { + const trimmed = raw.trim(); + if (trimmed) { + const parsed = Number.parseFloat(trimmed); + if (Number.isFinite(parsed)) value = parsed; + } + } + if (value === undefined) { + if (required) throw new Error(`${label} required`); + return undefined; + } + return integer ? Math.trunc(value) : value; +} + +export function jsonResult(payload: unknown): AgentToolResult { + return { + content: [ + { + type: "text", + text: JSON.stringify(payload, null, 2), + }, + ], + details: payload, + }; +} diff --git a/src/agent/tools/web/ssrf.ts b/src/agent/tools/web/ssrf.ts new file mode 100644 index 00000000..9962ab58 --- /dev/null +++ b/src/agent/tools/web/ssrf.ts @@ -0,0 +1,244 @@ +import { lookup as dnsLookup } from "node:dns/promises"; +import { lookup as dnsLookupCb, type LookupAddress } from "node:dns"; +import { Agent, type Dispatcher } from "undici"; + +type LookupCallback = ( + err: NodeJS.ErrnoException | null, + address: string | LookupAddress[], + family?: number, +) => void; + +export class SsrfBlockedError extends Error { + constructor(message: string) { + super(message); + this.name = "SsrfBlockedError"; + } +} + +type LookupFn = typeof dnsLookup; + +const PRIVATE_IPV6_PREFIXES = ["fe80:", "fec0:", "fc", "fd"]; +const BLOCKED_HOSTNAMES = new Set(["localhost", "metadata.google.internal"]); + +function normalizeHostname(hostname: string): string { + const normalized = hostname.trim().toLowerCase().replace(/\.$/, ""); + if (normalized.startsWith("[") && normalized.endsWith("]")) { + return normalized.slice(1, -1); + } + return normalized; +} + +function parseIpv4(address: string): number[] | null { + const parts = address.split("."); + if (parts.length !== 4) return null; + const numbers = parts.map((part) => Number.parseInt(part, 10)); + if (numbers.some((value) => Number.isNaN(value) || value < 0 || value > 255)) return null; + return numbers; +} + +function parseIpv4FromMappedIpv6(mapped: string): number[] | null { + if (mapped.includes(".")) { + return parseIpv4(mapped); + } + const parts = mapped.split(":").filter(Boolean); + if (parts.length === 1) { + const part0 = parts[0]; + if (!part0) return null; + const value = Number.parseInt(part0, 16); + if (Number.isNaN(value) || value < 0 || value > 0xffff_ffff) return null; + return [(value >>> 24) & 0xff, (value >>> 16) & 0xff, (value >>> 8) & 0xff, value & 0xff]; + } + if (parts.length !== 2) return null; + const part0 = parts[0]; + const part1 = parts[1]; + if (!part0 || !part1) return null; + const high = Number.parseInt(part0, 16); + const low = Number.parseInt(part1, 16); + if ( + Number.isNaN(high) || + Number.isNaN(low) || + high < 0 || + low < 0 || + high > 0xffff || + low > 0xffff + ) { + return null; + } + const value = (high << 16) + low; + return [(value >>> 24) & 0xff, (value >>> 16) & 0xff, (value >>> 8) & 0xff, value & 0xff]; +} + +function isPrivateIpv4(parts: number[]): boolean { + const octet1 = parts[0]; + const octet2 = parts[1]; + if (octet1 === undefined || octet2 === undefined) return false; + if (octet1 === 0) return true; + if (octet1 === 10) return true; + if (octet1 === 127) return true; + if (octet1 === 169 && octet2 === 254) return true; + if (octet1 === 172 && octet2 >= 16 && octet2 <= 31) return true; + if (octet1 === 192 && octet2 === 168) return true; + if (octet1 === 100 && octet2 >= 64 && octet2 <= 127) return true; + return false; +} + +export function isPrivateIpAddress(address: string): boolean { + let normalized = address.trim().toLowerCase(); + if (normalized.startsWith("[") && normalized.endsWith("]")) { + normalized = normalized.slice(1, -1); + } + if (!normalized) return false; + + if (normalized.startsWith("::ffff:")) { + const mapped = normalized.slice("::ffff:".length); + const ipv4 = parseIpv4FromMappedIpv6(mapped); + if (ipv4) return isPrivateIpv4(ipv4); + } + + if (normalized.includes(":")) { + if (normalized === "::" || normalized === "::1") return true; + return PRIVATE_IPV6_PREFIXES.some((prefix) => normalized.startsWith(prefix)); + } + + const ipv4 = parseIpv4(normalized); + if (!ipv4) return false; + return isPrivateIpv4(ipv4); +} + +export function isBlockedHostname(hostname: string): boolean { + const normalized = normalizeHostname(hostname); + if (!normalized) return false; + if (BLOCKED_HOSTNAMES.has(normalized)) return true; + return ( + normalized.endsWith(".localhost") || + normalized.endsWith(".local") || + normalized.endsWith(".internal") + ); +} + +export function createPinnedLookup(params: { + hostname: string; + addresses: string[]; + fallback?: typeof dnsLookupCb; +}): typeof dnsLookupCb { + const normalizedHost = normalizeHostname(params.hostname); + const fallback = params.fallback ?? dnsLookupCb; + const fallbackLookup = fallback as unknown as ( + hostname: string, + callback: LookupCallback, + ) => void; + const fallbackWithOptions = fallback as unknown as ( + hostname: string, + options: unknown, + callback: LookupCallback, + ) => void; + const records = params.addresses.map((address) => ({ + address, + family: address.includes(":") ? 6 : 4, + })); + let index = 0; + + return ((host: string, options?: unknown, callback?: unknown) => { + const cb: LookupCallback = + typeof options === "function" ? (options as LookupCallback) : (callback as LookupCallback); + if (!cb) return; + const normalized = normalizeHostname(host); + if (!normalized || normalized !== normalizedHost) { + if (typeof options === "function" || options === undefined) { + return fallbackLookup(host, cb); + } + return fallbackWithOptions(host, options, cb); + } + + const opts = + typeof options === "object" && options !== null + ? (options as { all?: boolean; family?: number }) + : {}; + const requestedFamily = + typeof options === "number" ? options : typeof opts.family === "number" ? opts.family : 0; + const candidates = + requestedFamily === 4 || requestedFamily === 6 + ? records.filter((entry) => entry.family === requestedFamily) + : records; + const usable = candidates.length > 0 ? candidates : records; + if (opts.all) { + cb(null, usable as LookupAddress[]); + return; + } + const chosen = usable[index % usable.length]; + index += 1; + if (chosen) { + cb(null, chosen.address, chosen.family); + } + }) as typeof dnsLookupCb; +} + +export type PinnedHostname = { + hostname: string; + addresses: string[]; + lookup: typeof dnsLookupCb; +}; + +export async function resolvePinnedHostname( + hostname: string, + lookupFn: LookupFn = dnsLookup, +): Promise { + const normalized = normalizeHostname(hostname); + if (!normalized) { + throw new Error("Invalid hostname"); + } + + if (isBlockedHostname(normalized)) { + throw new SsrfBlockedError(`Blocked hostname: ${hostname}`); + } + + if (isPrivateIpAddress(normalized)) { + throw new SsrfBlockedError("Blocked: private/internal IP address"); + } + + const results = await lookupFn(normalized, { all: true }); + if (results.length === 0) { + throw new Error(`Unable to resolve hostname: ${hostname}`); + } + + for (const entry of results) { + if (isPrivateIpAddress(entry.address)) { + throw new SsrfBlockedError("Blocked: resolves to private/internal IP address"); + } + } + + const addresses = Array.from(new Set(results.map((entry) => entry.address))); + if (addresses.length === 0) { + throw new Error(`Unable to resolve hostname: ${hostname}`); + } + + return { + hostname: normalized, + addresses, + lookup: createPinnedLookup({ hostname: normalized, addresses }), + }; +} + +export function createPinnedDispatcher(pinned: PinnedHostname): Dispatcher { + return new Agent({ + connect: { + lookup: pinned.lookup, + }, + }); +} + +export async function closeDispatcher(dispatcher?: Dispatcher | null): Promise { + if (!dispatcher) return; + const candidate = dispatcher as { close?: () => Promise | void; destroy?: () => void }; + try { + if (typeof candidate.close === "function") { + await candidate.close(); + return; + } + if (typeof candidate.destroy === "function") { + candidate.destroy(); + } + } catch { + // ignore dispatcher cleanup errors + } +} diff --git a/src/agent/tools/web/web-fetch.ts b/src/agent/tools/web/web-fetch.ts new file mode 100644 index 00000000..9fb4d253 --- /dev/null +++ b/src/agent/tools/web/web-fetch.ts @@ -0,0 +1,335 @@ +import { Type } from "@sinclair/typebox"; +import type { AgentTool } from "@mariozechner/pi-agent-core"; +import type { Dispatcher } from "undici"; + +import { + closeDispatcher, + createPinnedDispatcher, + resolvePinnedHostname, + SsrfBlockedError, +} from "./ssrf.js"; +import { + DEFAULT_CACHE_TTL_MINUTES, + DEFAULT_TIMEOUT_SECONDS, + normalizeCacheKey, + readCache, + readResponseText, + resolveCacheTtlMs, + resolveTimeoutSeconds, + withTimeout, + writeCache, +} from "./cache.js"; +import type { CacheEntry } from "./cache.js"; +import { extractContent, markdownToText, truncateText, type ExtractMode, type ExtractorType } from "./html-utils.js"; +import { jsonResult, readNumberParam, readStringParam } from "./param-helpers.js"; + +const EXTRACT_MODES = ["markdown", "text"] as const; +const EXTRACTOR_TYPES = ["readability", "turndown"] as const; + +const DEFAULT_FETCH_MAX_CHARS = 50_000; +const DEFAULT_FETCH_MAX_REDIRECTS = 3; +const DEFAULT_ERROR_MAX_CHARS = 4_000; +const DEFAULT_FETCH_USER_AGENT = + "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_7_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"; + +const FETCH_CACHE = new Map>>(); + +const WebFetchSchema = Type.Object({ + url: Type.String({ description: "HTTP or HTTPS URL to fetch." }), + extractMode: Type.Optional( + Type.String({ + description: 'Output format: "markdown" (default) or "text" (plain text).', + }), + ), + extractor: Type.Optional( + Type.String({ + description: + 'Extraction method: "readability" (default, smart extraction of main content) or "turndown" (convert entire page).', + }), + ), + maxChars: Type.Optional( + Type.Number({ + description: "Maximum characters to return (truncates when exceeded). Default: 50000.", + minimum: 100, + }), + ), +}); + +type WebFetchArgs = { + url: string; + extractMode?: string; + extractor?: string; + maxChars?: number; +}; + +export type WebFetchResult = { + url: string; + finalUrl: string; + status: number; + contentType: string; + title?: string; + extractMode: ExtractMode; + extractor: ExtractorType | "raw" | "json"; + truncated: boolean; + length: number; + fetchedAt: string; + tookMs: number; + text: string; + cached?: boolean; +}; + +function resolveMaxChars(value: unknown, fallback: number): number { + const parsed = typeof value === "number" && Number.isFinite(value) ? value : fallback; + return Math.max(100, Math.floor(parsed)); +} + +function resolveMaxRedirects(value: unknown, fallback: number): number { + const parsed = typeof value === "number" && Number.isFinite(value) ? value : fallback; + return Math.max(0, Math.floor(parsed)); +} + +function looksLikeHtml(value: string): boolean { + const trimmed = value.trimStart(); + if (!trimmed) return false; + const head = trimmed.slice(0, 256).toLowerCase(); + return head.startsWith(" { + const signal = withTimeout(undefined, params.timeoutSeconds * 1000); + const visited = new Set(); + let currentUrl = params.url; + let redirectCount = 0; + + while (true) { + let parsedUrl: URL; + try { + parsedUrl = new URL(currentUrl); + } catch { + throw new Error("Invalid URL: must be http or https"); + } + if (!["http:", "https:"].includes(parsedUrl.protocol)) { + throw new Error("Invalid URL: must be http or https"); + } + + const pinned = await resolvePinnedHostname(parsedUrl.hostname); + const dispatcher = createPinnedDispatcher(pinned); + let res: Response; + try { + // Use undici's dispatcher for SSRF protection + res = await fetch(parsedUrl.toString(), { + method: "GET", + headers: { + Accept: "*/*", + "User-Agent": params.userAgent, + "Accept-Language": "en-US,en;q=0.9", + }, + signal, + redirect: "manual", + dispatcher, + } as unknown as RequestInit); + } catch (err) { + await closeDispatcher(dispatcher); + throw err; + } + + if (isRedirectStatus(res.status)) { + const location = res.headers.get("location"); + if (!location) { + await closeDispatcher(dispatcher); + throw new Error(`Redirect missing location header (${res.status})`); + } + redirectCount += 1; + if (redirectCount > params.maxRedirects) { + await closeDispatcher(dispatcher); + throw new Error(`Too many redirects (limit: ${params.maxRedirects})`); + } + const nextUrl = new URL(location, parsedUrl).toString(); + if (visited.has(nextUrl)) { + await closeDispatcher(dispatcher); + throw new Error("Redirect loop detected"); + } + visited.add(nextUrl); + void res.body?.cancel(); + await closeDispatcher(dispatcher); + currentUrl = nextUrl; + continue; + } + + return { response: res, finalUrl: currentUrl, dispatcher }; + } +} + +function formatWebFetchErrorDetail(params: { + detail: string; + contentType?: string | null; + maxChars: number; +}): string { + const { detail, contentType, maxChars } = params; + if (!detail) return ""; + let text = detail; + const contentTypeLower = contentType?.toLowerCase(); + if (contentTypeLower?.includes("text/html") || looksLikeHtml(detail)) { + text = markdownToText(detail); + } + const truncated = truncateText(text.trim(), maxChars); + return truncated.text; +} + +async function runWebFetch(params: { + url: string; + extractMode: ExtractMode; + extractor: ExtractorType; + maxChars: number; + maxRedirects: number; + timeoutSeconds: number; + cacheTtlMs: number; + userAgent: string; +}): Promise { + const cacheKey = normalizeCacheKey( + `fetch:${params.url}:${params.extractMode}:${params.extractor}:${params.maxChars}`, + ); + const cached = readCache(FETCH_CACHE, cacheKey); + if (cached) return { ...cached.value, cached: true } as WebFetchResult; + + let parsedUrl: URL; + try { + parsedUrl = new URL(params.url); + } catch { + throw new Error("Invalid URL: must be http or https"); + } + if (!["http:", "https:"].includes(parsedUrl.protocol)) { + throw new Error("Invalid URL: must be http or https"); + } + + const start = Date.now(); + let res: Response; + let dispatcher: Dispatcher | null = null; + let finalUrl = params.url; + + const result = await fetchWithRedirects({ + url: params.url, + maxRedirects: params.maxRedirects, + timeoutSeconds: params.timeoutSeconds, + userAgent: params.userAgent, + }); + res = result.response; + finalUrl = result.finalUrl; + dispatcher = result.dispatcher; + + try { + if (!res.ok) { + const rawDetail = await readResponseText(res); + const detail = formatWebFetchErrorDetail({ + detail: rawDetail, + contentType: res.headers.get("content-type"), + maxChars: DEFAULT_ERROR_MAX_CHARS, + }); + throw new Error(`Web fetch failed (${res.status}): ${detail || res.statusText}`); + } + + const contentType = res.headers.get("content-type") ?? "application/octet-stream"; + const body = await readResponseText(res); + + let title: string | undefined; + let extractor: ExtractorType | "raw" | "json" = "raw"; + let text = body; + + if (contentType.includes("text/html")) { + const extracted = await extractContent({ + html: body, + url: finalUrl, + extractMode: params.extractMode, + extractor: params.extractor, + }); + text = extracted.text; + title = extracted.title; + extractor = extracted.extractor; + } else if (contentType.includes("application/json")) { + try { + text = JSON.stringify(JSON.parse(body), null, 2); + extractor = "json"; + } catch { + text = body; + extractor = "raw"; + } + } + + const truncated = truncateText(text, params.maxChars); + const payload: WebFetchResult = { + url: params.url, + finalUrl, + status: res.status, + contentType, + extractMode: params.extractMode, + extractor, + truncated: truncated.truncated, + length: truncated.text.length, + fetchedAt: new Date().toISOString(), + tookMs: Date.now() - start, + text: truncated.text, + }; + if (title) { + payload.title = title; + } + writeCache(FETCH_CACHE, cacheKey, payload, params.cacheTtlMs); + return payload; + } finally { + await closeDispatcher(dispatcher); + } +} + +export function createWebFetchTool(): AgentTool { + return { + name: "web_fetch", + label: "Web Fetch", + description: + 'Fetch and extract readable content from a URL. Converts HTML to markdown or plain text. Use extractor="readability" for smart article extraction, or "turndown" for full page conversion.', + parameters: WebFetchSchema, + execute: async (_toolCallId, args) => { + const params = args as WebFetchArgs; + const url = readStringParam(params as Record, "url", { required: true }); + const extractModeRaw = readStringParam(params as Record, "extractMode"); + const extractMode: ExtractMode = + extractModeRaw === "text" ? "text" : "markdown"; + const extractorRaw = readStringParam(params as Record, "extractor"); + const extractor: ExtractorType = + extractorRaw === "turndown" ? "turndown" : "readability"; + const maxChars = readNumberParam(params as Record, "maxChars", { integer: true }); + + try { + const result = await runWebFetch({ + url, + extractMode, + extractor, + maxChars: resolveMaxChars(maxChars, DEFAULT_FETCH_MAX_CHARS), + maxRedirects: DEFAULT_FETCH_MAX_REDIRECTS, + timeoutSeconds: DEFAULT_TIMEOUT_SECONDS, + cacheTtlMs: resolveCacheTtlMs(DEFAULT_CACHE_TTL_MINUTES, DEFAULT_CACHE_TTL_MINUTES), + userAgent: DEFAULT_FETCH_USER_AGENT, + }); + return jsonResult(result); + } catch (error) { + if (error instanceof SsrfBlockedError) { + return jsonResult({ + error: "ssrf_blocked", + message: error.message, + }); + } + return jsonResult({ + error: "fetch_failed", + message: error instanceof Error ? error.message : String(error), + }); + } + }, + }; +} diff --git a/src/agent/tools/web/web-search.ts b/src/agent/tools/web/web-search.ts new file mode 100644 index 00000000..5b69a850 --- /dev/null +++ b/src/agent/tools/web/web-search.ts @@ -0,0 +1,451 @@ +import { Type } from "@sinclair/typebox"; +import type { AgentTool } from "@mariozechner/pi-agent-core"; + +import { + DEFAULT_CACHE_TTL_MINUTES, + DEFAULT_TIMEOUT_SECONDS, + normalizeCacheKey, + readCache, + readResponseText, + resolveCacheTtlMs, + resolveTimeoutSeconds, + withTimeout, + writeCache, +} from "./cache.js"; +import type { CacheEntry } from "./cache.js"; +import { jsonResult, readNumberParam, readStringParam } from "./param-helpers.js"; + +const SEARCH_PROVIDERS = ["brave", "perplexity"] as const; +type SearchProvider = (typeof SEARCH_PROVIDERS)[number]; + +const DEFAULT_SEARCH_COUNT = 5; +const MAX_SEARCH_COUNT = 10; + +const BRAVE_SEARCH_ENDPOINT = "https://api.search.brave.com/res/v1/web/search"; +const DEFAULT_PERPLEXITY_BASE_URL = "https://openrouter.ai/api/v1"; +const PERPLEXITY_DIRECT_BASE_URL = "https://api.perplexity.ai"; +const DEFAULT_PERPLEXITY_MODEL = "perplexity/sonar-pro"; +const PERPLEXITY_KEY_PREFIXES = ["pplx-"]; +const OPENROUTER_KEY_PREFIXES = ["sk-or-"]; + +const SEARCH_CACHE = new Map>>(); +const BRAVE_FRESHNESS_SHORTCUTS = new Set(["pd", "pw", "pm", "py"]); +const BRAVE_FRESHNESS_RANGE = /^(\d{4}-\d{2}-\d{2})to(\d{4}-\d{2}-\d{2})$/; + +const WebSearchSchema = Type.Object({ + query: Type.String({ description: "Search query string." }), + provider: Type.Optional( + Type.String({ + description: + 'Search provider: "brave" (default, traditional search results) or "perplexity" (AI-synthesized answers).', + }), + ), + count: Type.Optional( + Type.Number({ + description: "Number of results to return (1-10). Default: 5. Brave only.", + minimum: 1, + maximum: MAX_SEARCH_COUNT, + }), + ), + country: Type.Optional( + Type.String({ + description: + "2-letter country code for region-specific results (e.g., 'DE', 'US'). Default: 'US'.", + }), + ), + freshness: Type.Optional( + Type.String({ + description: + "Filter results by time (Brave only): 'pd' (past day), 'pw' (past week), 'pm' (past month), 'py' (past year), or 'YYYY-MM-DDtoYYYY-MM-DD'.", + }), + ), +}); + +type WebSearchArgs = { + query: string; + provider?: string; + count?: number; + country?: string; + freshness?: string; +}; + +type BraveSearchResult = { + title?: string; + url?: string; + description?: string; + age?: string; +}; + +type BraveSearchResponse = { + web?: { + results?: BraveSearchResult[]; + }; +}; + +type PerplexitySearchResponse = { + choices?: Array<{ + message?: { + content?: string; + }; + }>; + citations?: string[]; +}; + +export type WebSearchResult = { + query: string; + provider: SearchProvider; + tookMs: number; + cached?: boolean; +} & ( + | { + // Brave result + count: number; + results: Array<{ + title: string; + url: string; + description: string; + published?: string; + siteName?: string; + }>; + } + | { + // Perplexity result + model: string; + content: string; + citations: string[]; + } +); + +function resolveSearchCount(value: unknown, fallback: number): number { + const parsed = typeof value === "number" && Number.isFinite(value) ? value : fallback; + const clamped = Math.max(1, Math.min(MAX_SEARCH_COUNT, Math.floor(parsed))); + return clamped; +} + +function normalizeFreshness(value: string | undefined): string | undefined { + if (!value) return undefined; + const trimmed = value.trim(); + if (!trimmed) return undefined; + + const lower = trimmed.toLowerCase(); + if (BRAVE_FRESHNESS_SHORTCUTS.has(lower)) return lower; + + const match = trimmed.match(BRAVE_FRESHNESS_RANGE); + if (!match) return undefined; + + const start = match[1]; + const end = match[2]; + if (!start || !end) return undefined; + if (!isValidIsoDate(start) || !isValidIsoDate(end)) return undefined; + if (start > end) return undefined; + + return `${start}to${end}`; +} + +function isValidIsoDate(value: string): boolean { + if (!/^\d{4}-\d{2}-\d{2}$/.test(value)) return false; + const parts = value.split("-").map((part) => Number.parseInt(part, 10)); + const year = parts[0]; + const month = parts[1]; + const day = parts[2]; + if (year === undefined || month === undefined || day === undefined) return false; + if (!Number.isFinite(year) || !Number.isFinite(month) || !Number.isFinite(day)) return false; + + const date = new Date(Date.UTC(year, month - 1, day)); + return ( + date.getUTCFullYear() === year && date.getUTCMonth() === month - 1 && date.getUTCDate() === day + ); +} + +function resolveSiteName(url: string | undefined): string | undefined { + if (!url) return undefined; + try { + return new URL(url).hostname; + } catch { + return undefined; + } +} + +function inferPerplexityBaseUrl(apiKey: string): string { + const normalized = apiKey.toLowerCase(); + if (PERPLEXITY_KEY_PREFIXES.some((prefix) => normalized.startsWith(prefix))) { + return PERPLEXITY_DIRECT_BASE_URL; + } + if (OPENROUTER_KEY_PREFIXES.some((prefix) => normalized.startsWith(prefix))) { + return DEFAULT_PERPLEXITY_BASE_URL; + } + return DEFAULT_PERPLEXITY_BASE_URL; +} + +function resolvePerplexityApiKey(): { apiKey: string; source: string } | { apiKey: null; source: "none" } { + const perplexityKey = (process.env.PERPLEXITY_API_KEY ?? "").trim(); + if (perplexityKey) { + return { apiKey: perplexityKey, source: "PERPLEXITY_API_KEY" }; + } + + const openrouterKey = (process.env.OPENROUTER_API_KEY ?? "").trim(); + if (openrouterKey) { + return { apiKey: openrouterKey, source: "OPENROUTER_API_KEY" }; + } + + return { apiKey: null, source: "none" }; +} + +function resolveBraveApiKey(): string | undefined { + return (process.env.BRAVE_API_KEY ?? "").trim() || undefined; +} + +function resolveProvider(requested?: string): SearchProvider { + if (requested === "perplexity") return "perplexity"; + if (requested === "brave") return "brave"; + + // Auto-detect based on available API keys + const braveKey = resolveBraveApiKey(); + if (braveKey) return "brave"; + + const perplexityResult = resolvePerplexityApiKey(); + if (perplexityResult.apiKey) return "perplexity"; + + // Default to brave + return "brave"; +} + +async function runPerplexitySearch(params: { + query: string; + apiKey: string; + baseUrl: string; + model: string; + timeoutSeconds: number; +}): Promise<{ content: string; citations: string[] }> { + const endpoint = `${params.baseUrl.replace(/\/$/, "")}/chat/completions`; + + const res = await fetch(endpoint, { + method: "POST", + headers: { + "Content-Type": "application/json", + Authorization: `Bearer ${params.apiKey}`, + "HTTP-Referer": "https://multica.ai", + "X-Title": "Multica Web Search", + }, + body: JSON.stringify({ + model: params.model, + messages: [ + { + role: "user", + content: params.query, + }, + ], + }), + signal: withTimeout(undefined, params.timeoutSeconds * 1000), + }); + + if (!res.ok) { + const detail = await readResponseText(res); + throw new Error(`Perplexity API error (${res.status}): ${detail || res.statusText}`); + } + + const data = (await res.json()) as PerplexitySearchResponse; + const content = data.choices?.[0]?.message?.content ?? "No response"; + const citations = data.citations ?? []; + + return { content, citations }; +} + +async function runBraveSearch(params: { + query: string; + count: number; + apiKey: string; + timeoutSeconds: number; + country: string | undefined; + freshness: string | undefined; +}): Promise<{ + results: Array<{ + title: string; + url: string; + description: string; + published?: string; + siteName?: string; + }>; +}> { + const url = new URL(BRAVE_SEARCH_ENDPOINT); + url.searchParams.set("q", params.query); + url.searchParams.set("count", String(params.count)); + if (params.country) { + url.searchParams.set("country", params.country); + } + if (params.freshness) { + url.searchParams.set("freshness", params.freshness); + } + + const res = await fetch(url.toString(), { + method: "GET", + headers: { + Accept: "application/json", + "X-Subscription-Token": params.apiKey, + }, + signal: withTimeout(undefined, params.timeoutSeconds * 1000), + }); + + if (!res.ok) { + const detail = await readResponseText(res); + throw new Error(`Brave Search API error (${res.status}): ${detail || res.statusText}`); + } + + const data = (await res.json()) as BraveSearchResponse; + const rawResults = Array.isArray(data.web?.results) ? (data.web?.results ?? []) : []; + const results = rawResults.map((entry) => { + const result: { + title: string; + url: string; + description: string; + published?: string; + siteName?: string; + } = { + title: entry.title ?? "", + url: entry.url ?? "", + description: entry.description ?? "", + }; + if (entry.age) { + result.published = entry.age; + } + const siteName = resolveSiteName(entry.url); + if (siteName) { + result.siteName = siteName; + } + return result; + }); + return { results }; +} + +async function runWebSearch(params: { + query: string; + provider: SearchProvider; + count: number; + timeoutSeconds: number; + cacheTtlMs: number; + country: string | undefined; + freshness: string | undefined; +}): Promise> { + const cacheKey = normalizeCacheKey( + `${params.provider}:${params.query}:${params.count}:${params.country || "default"}:${params.freshness || "default"}`, + ); + const cached = readCache(SEARCH_CACHE, cacheKey); + if (cached) return { ...cached.value, cached: true }; + + const start = Date.now(); + + if (params.provider === "perplexity") { + const perplexityResult = resolvePerplexityApiKey(); + if (!perplexityResult.apiKey) { + return { + error: "missing_api_key", + message: + "Perplexity search requires PERPLEXITY_API_KEY or OPENROUTER_API_KEY environment variable.", + }; + } + + const apiKey = perplexityResult.apiKey; + const baseUrl = inferPerplexityBaseUrl(apiKey); + const { content, citations } = await runPerplexitySearch({ + query: params.query, + apiKey, + baseUrl, + model: DEFAULT_PERPLEXITY_MODEL, + timeoutSeconds: params.timeoutSeconds, + }); + + const payload = { + query: params.query, + provider: params.provider, + model: DEFAULT_PERPLEXITY_MODEL, + tookMs: Date.now() - start, + content, + citations, + }; + writeCache(SEARCH_CACHE, cacheKey, payload, params.cacheTtlMs); + return payload; + } + + // Brave search + const apiKey = resolveBraveApiKey(); + if (!apiKey) { + return { + error: "missing_api_key", + message: "Brave search requires BRAVE_API_KEY environment variable.", + }; + } + + const { results } = await runBraveSearch({ + query: params.query, + count: params.count, + apiKey, + timeoutSeconds: params.timeoutSeconds, + country: params.country, + freshness: params.freshness, + }); + + const payload = { + query: params.query, + provider: params.provider, + count: results.length, + tookMs: Date.now() - start, + results, + }; + writeCache(SEARCH_CACHE, cacheKey, payload, params.cacheTtlMs); + return payload; +} + +export function createWebSearchTool(): AgentTool { + return { + name: "web_search", + label: "Web Search", + description: + 'Search the web. Supports "brave" (traditional results with titles/URLs/snippets) and "perplexity" (AI-synthesized answers with citations). Provider auto-detected from available API keys if not specified.', + parameters: WebSearchSchema, + execute: async (_toolCallId, args) => { + const params = args as WebSearchArgs; + const query = readStringParam(params as Record, "query", { required: true }); + const providerRaw = readStringParam(params as Record, "provider"); + const provider = resolveProvider(providerRaw); + const count = + readNumberParam(params as Record, "count", { integer: true }) ?? + DEFAULT_SEARCH_COUNT; + const country = readStringParam(params as Record, "country"); + const rawFreshness = readStringParam(params as Record, "freshness"); + + if (rawFreshness && provider !== "brave") { + return jsonResult({ + error: "unsupported_parameter", + message: "freshness parameter is only supported by the Brave search provider.", + }); + } + + const freshness = rawFreshness ? normalizeFreshness(rawFreshness) : undefined; + if (rawFreshness && !freshness) { + return jsonResult({ + error: "invalid_freshness", + message: + "freshness must be one of: pd (past day), pw (past week), pm (past month), py (past year), or YYYY-MM-DDtoYYYY-MM-DD.", + }); + } + + try { + const result = await runWebSearch({ + query, + provider, + count: resolveSearchCount(count, DEFAULT_SEARCH_COUNT), + timeoutSeconds: resolveTimeoutSeconds(DEFAULT_TIMEOUT_SECONDS, DEFAULT_TIMEOUT_SECONDS), + cacheTtlMs: resolveCacheTtlMs(DEFAULT_CACHE_TTL_MINUTES, DEFAULT_CACHE_TTL_MINUTES), + country, + freshness, + }); + return jsonResult(result); + } catch (error) { + return jsonResult({ + error: "search_failed", + message: error instanceof Error ? error.message : String(error), + }); + } + }, + }; +}