describe("extractMarkdownTitle", () => {
  // Table of scenarios: test name, markdown input, and the title the helper
  // is expected to return (`undefined` means "no title could be found").
  const scenarios: ReadonlyArray<{
    name: string;
    markdown: string;
    expected: string | undefined;
  }> = [
    {
      name: "should extract title from YAML frontmatter",
      markdown: "---\ntitle: My Page Title\ndescription: Some desc\n---\n\n# Heading\n\nContent",
      expected: "My Page Title",
    },
    {
      name: "should fall back to first # heading when no frontmatter",
      markdown: "# My Heading\n\nSome content here",
      expected: "My Heading",
    },
    {
      name: "should prefer frontmatter title over heading",
      markdown: "---\ntitle: Frontmatter Title\n---\n\n# Heading Title",
      expected: "Frontmatter Title",
    },
    {
      name: "should return undefined when no title found",
      markdown: "Just some text without a title or heading",
      expected: undefined,
    },
    {
      name: "should handle empty string",
      markdown: "",
      expected: undefined,
    },
  ];

  for (const { name, markdown, expected } of scenarios) {
    it(name, () => {
      const actual = extractMarkdownTitle(markdown);
      if (expected === undefined) {
        expect(actual).toBeUndefined();
      } else {
        expect(actual).toBe(expected);
      }
    });
  }
});
World
/**
 * Extract a title from a native markdown response.
 *
 * Resolution order:
 * 1. A top-level `title:` key inside a leading YAML frontmatter block
 *    (`---` … `---`). Matching surrounding single or double quotes are removed.
 * 2. The first level-1 ATX heading (`# Heading`) in the document body, i.e.
 *    after the frontmatter block and outside fenced code blocks.
 *
 * @param markdown - Raw markdown text; may be empty.
 * @returns The trimmed title, or `undefined` when neither source yields a
 *   non-empty title.
 */
export function extractMarkdownTitle(markdown: string): string | undefined {
  let body = markdown;

  // Frontmatter must start on the very first line and be closed by a line
  // consisting solely of `---`. Both LF and CRLF line endings are accepted.
  const frontmatter = /^---[ \t]*\r?\n([\s\S]*?)\r?\n---[ \t]*(?:\r?\n|$)/.exec(markdown);
  if (frontmatter) {
    // `[ \t]*` (not `\s*`) keeps the match on one line, so an empty `title:`
    // key cannot swallow the following key as its value.
    const rawTitle = /^title:[ \t]*(.*)$/m.exec(frontmatter[1] ?? "")?.[1]?.trim() ?? "";
    const title = stripMatchingQuotes(rawTitle);
    if (title) return title;

    // Exclude the frontmatter from the heading search: YAML comments start
    // with `#` and would otherwise be mistaken for a markdown heading.
    body = markdown.slice(frontmatter[0].length);
  }

  // Fall back to the first `# heading`, skipping fenced code blocks where a
  // leading `#` is typically a shell or script comment.
  let openFence: string | undefined;
  for (const line of body.split(/\r\n|[\n\r\u2028\u2029]/)) {
    const marker = /^ {0,3}(`{3,}|~{3,})/.exec(line)?.[1];
    if (marker !== undefined) {
      if (openFence === undefined) {
        openFence = marker;
      } else if (
        marker.charAt(0) === openFence.charAt(0) &&
        marker.length >= openFence.length &&
        line.trim() === marker
      ) {
        // A closing fence uses the same character, is at least as long as
        // the opening fence, and carries no info string.
        openFence = undefined;
      }
      continue;
    }
    if (openFence !== undefined) continue;

    // Require at least one space/tab after a single `#`; a bare `#` line is
    // an empty heading and must not capture text from later lines.
    const heading = /^#[ \t]+(.*)$/.exec(line)?.[1]?.trim();
    if (heading) return heading;
  }
  return undefined;
}

/** Remove one pair of matching surrounding quotes (`"…"` or `'…'`), if present. */
function stripMatchingQuotes(value: string): string {
  if (value.length >= 2) {
    const first = value.charAt(0);
    if ((first === '"' || first === "'") && value.charAt(value.length - 1) === first) {
      return value.slice(1, -1).trim();
    }
  }
  return value;
}
contentType: string; title?: string; extractMode: ExtractMode; - extractor: ExtractorType | "raw" | "json"; + extractor: ExtractorType | "raw" | "json" | "markdown-native"; truncated: boolean; length: number; fetchedAt: string; tookMs: number; text: string; cached?: boolean; + markdownTokens?: number; }; function resolveMaxChars(value: unknown, fallback: number): number { @@ -129,7 +130,7 @@ async function fetchWithRedirects(params: { res = await fetch(parsedUrl.toString(), { method: "GET", headers: { - Accept: "*/*", + Accept: "text/markdown, text/html;q=0.9, */*;q=0.8", "User-Agent": params.userAgent, "Accept-Language": "en-US,en;q=0.9", }, @@ -241,10 +242,28 @@ async function runWebFetch(params: { const body = await readResponseText(res); let title: string | undefined; - let extractor: ExtractorType | "raw" | "json" = "raw"; + let extractor: ExtractorType | "raw" | "json" | "markdown-native" = "raw"; let text = body; + let markdownTokens: number | undefined; - if (contentType.includes("text/html")) { + // Capture x-markdown-tokens header when present (Cloudflare Markdown for Agents) + const markdownTokensHeader = res.headers.get("x-markdown-tokens"); + if (markdownTokensHeader) { + const parsed = Number.parseInt(markdownTokensHeader, 10); + if (Number.isFinite(parsed) && parsed > 0) { + markdownTokens = parsed; + } + } + + if (contentType.includes("text/markdown")) { + // Server returned markdown directly (e.g. 
Cloudflare Markdown for Agents) — skip HTML parsing + text = body; + extractor = "markdown-native"; + title = extractMarkdownTitle(body); + if (params.extractMode === "text") { + text = markdownToText(body); + } + } else if (contentType.includes("text/html")) { const extracted = await extractContent({ html: body, url: finalUrl, @@ -281,6 +300,9 @@ async function runWebFetch(params: { if (title) { payload.title = title; } + if (markdownTokens !== undefined) { + payload.markdownTokens = markdownTokens; + } writeCache(FETCH_CACHE, cacheKey, payload, params.cacheTtlMs); return payload; } finally {