Merge pull request #171 from multica-ai/forrestchang/markdown-accept

feat(web): add Accept: text/markdown header for Cloudflare Markdown for Agents
2026-02-13 21:00:08 +08:00 · 2026-02-13 21:00:08 +08:00 · 13e5492993
commit 13e5492993
parent 8881ae8f4b 27c3ba5682
3 changed files with 77 additions and 5 deletions
--- a/packages/core/src/agent/tools/web/html-utils.test.ts
+++ b/packages/core/src/agent/tools/web/html-utils.test.ts
@ -4,6 +4,7 @@ import {
  markdownToText,
  truncateText,
  convertWithTurndown,
+  extractMarkdownTitle,
 } from "./html-utils.js";

 describe("html-utils", () => {
@ -190,6 +191,32 @@ describe("html-utils", () => {
    });
  });

+  // Covers both title sources of extractMarkdownTitle (frontmatter key, first
+  // `# heading`), their precedence, and the "nothing found" outcomes.
+  describe("extractMarkdownTitle", () => {
+    it("should extract title from YAML frontmatter", () => {
+      const input = "---\ntitle: My Page Title\ndescription: Some desc\n---\n\n# Heading\n\nContent";
+      const actual = extractMarkdownTitle(input);
+      expect(actual).toBe("My Page Title");
+    });
+
+    it("should fall back to first # heading when no frontmatter", () => {
+      const input = "# My Heading\n\nSome content here";
+      const actual = extractMarkdownTitle(input);
+      expect(actual).toBe("My Heading");
+    });
+
+    it("should prefer frontmatter title over heading", () => {
+      // Both sources are present; the frontmatter value must win.
+      const input = "---\ntitle: Frontmatter Title\n---\n\n# Heading Title";
+      const actual = extractMarkdownTitle(input);
+      expect(actual).toBe("Frontmatter Title");
+    });
+
+    it("should return undefined when no title found", () => {
+      const input = "Just some text without a title or heading";
+      const actual = extractMarkdownTitle(input);
+      expect(actual).toBeUndefined();
+    });
+
+    it("should handle empty string", () => {
+      const actual = extractMarkdownTitle("");
+      expect(actual).toBeUndefined();
+    });
+  });
+
  describe("convertWithTurndown", () => {
    it("should convert HTML to markdown", () => {
      const html = "<html><head><title>Page</title></head><body><h1>Hello</h1><p>World</p></body></html>";
--- a/packages/core/src/agent/tools/web/html-utils.ts
+++ b/packages/core/src/agent/tools/web/html-utils.ts
@ -112,6 +112,29 @@ export function truncateText(
  return { text: value.slice(0, maxChars), truncated: true };
 }

+/**
+ * Extract a title from a native markdown response.
+ *
+ * Looks for a `title:` key in a leading YAML frontmatter block first, then
+ * falls back to the first `# heading` found in the text that follows the
+ * frontmatter (or in the whole input when there is no frontmatter).
+ *
+ * @param markdown - Raw markdown text, optionally starting with a `---` frontmatter block.
+ * @returns The trimmed title, or `undefined` when neither source yields a non-empty value.
+ */
+export function extractMarkdownTitle(markdown: string): string | undefined {
+  let body = markdown;
+
+  // Frontmatter has to open on the very first line and be closed by a line that
+  // consists only of `---`, so `----` or `---text` do not end the block.
+  const frontmatterMatch = markdown.match(/^---[ \t]*\r?\n([\s\S]*?)\r?\n---[ \t]*(?=\r?\n|$)/);
+  if (frontmatterMatch) {
+    // Search for headings only after the frontmatter, so that a YAML `# comment`
+    // inside it is never mistaken for the page heading.
+    body = markdown.slice(frontmatterMatch[0].length);
+
+    // `[ \t]*` (not `\s*`) keeps the match on the `title:` line itself; an empty
+    // `title:` must not capture the next frontmatter line as its value.
+    const titleMatch = frontmatterMatch[1]?.match(/^title:[ \t]*(\S.*)$/m);
+    const rawTitle = titleMatch?.[1]?.trim();
+    if (rawTitle) {
+      // Drop one pair of matching surrounding quotes (`"…"` or `'…'`).
+      // YAML escape sequences inside the quotes are left untouched.
+      const quoted = rawTitle.match(/^(["'])(.*)\1$/);
+      const title = (quoted?.[2] ?? rawTitle).trim();
+      if (title) return title;
+    }
+  }
+
+  // Fall back to the first `# heading` that has text on the same line.
+  const headingMatch = body.match(/^#[ \t]+(\S.*)$/m);
+  const heading = headingMatch?.[1]?.trim();
+  return heading ? heading : undefined;
+}
+
 /**
 * Convert HTML to markdown using TurndownService (simpler, converts whole page)
 */
--- a/packages/core/src/agent/tools/web/web-fetch.ts
+++ b/packages/core/src/agent/tools/web/web-fetch.ts
@ -20,7 +20,7 @@ import {
  writeCache,
 } from "./cache.js";
 import type { CacheEntry } from "./cache.js";
-import { extractContent, markdownToText, truncateText, type ExtractMode, type ExtractorType } from "./html-utils.js";
+import { extractContent, extractMarkdownTitle, markdownToText, truncateText, type ExtractMode, type ExtractorType } from "./html-utils.js";
 import { jsonResult, readNumberParam, readStringParam } from "./param-helpers.js";

 const EXTRACT_MODES = ["markdown", "text"] as const;
@ -69,13 +69,14 @@ export type WebFetchResult = {
  contentType: string;
  title?: string;
  extractMode: ExtractMode;
-  extractor: ExtractorType | "raw" | "json";
+  extractor: ExtractorType | "raw" | "json" | "markdown-native";
  truncated: boolean;
  length: number;
  fetchedAt: string;
  tookMs: number;
  text: string;
  cached?: boolean;
+  markdownTokens?: number;
 };

 function resolveMaxChars(value: unknown, fallback: number): number {
@ -129,7 +130,7 @@ async function fetchWithRedirects(params: {
      res = await fetch(parsedUrl.toString(), {
        method: "GET",
        headers: {
-          Accept: "*/*",
+          Accept: "text/markdown, text/html;q=0.9, */*;q=0.8",
          "User-Agent": params.userAgent,
          "Accept-Language": "en-US,en;q=0.9",
        },
@ -241,10 +242,28 @@ async function runWebFetch(params: {
    const body = await readResponseText(res);

    let title: string | undefined;
-    let extractor: ExtractorType | "raw" | "json" = "raw";
+    let extractor: ExtractorType | "raw" | "json" | "markdown-native" = "raw";
    let text = body;
+    let markdownTokens: number | undefined;

-    if (contentType.includes("text/html")) {
+    // Capture x-markdown-tokens header when present (Cloudflare Markdown for Agents)
+    const markdownTokensHeader = res.headers.get("x-markdown-tokens");
+    if (markdownTokensHeader) {
+      const parsed = Number.parseInt(markdownTokensHeader, 10);
+      if (Number.isFinite(parsed) && parsed > 0) {
+        markdownTokens = parsed;
+      }
+    }
+
+    if (contentType.includes("text/markdown")) {
+      // Server returned markdown directly (e.g. Cloudflare Markdown for Agents) — skip HTML parsing
+      text = body;
+      extractor = "markdown-native";
+      title = extractMarkdownTitle(body);
+      if (params.extractMode === "text") {
+        text = markdownToText(body);
+      }
+    } else if (contentType.includes("text/html")) {
      const extracted = await extractContent({
        html: body,
        url: finalUrl,
@ -281,6 +300,9 @@ async function runWebFetch(params: {
    if (title) {
      payload.title = title;
    }
+    if (markdownTokens !== undefined) {
+      payload.markdownTokens = markdownTokens;
+    }
    writeCache(FETCH_CACHE, cacheKey, payload, params.cacheTtlMs);
    return payload;
  } finally {