Merge pull request #171 from multica-ai/forrestchang/markdown-accept

feat(web): add Accept: text/markdown header for Cloudflare Markdown for Agents
This commit is contained in:
Jiayuan Zhang 2026-02-13 21:00:08 +08:00 committed by GitHub
commit 13e5492993
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 77 additions and 5 deletions

View file

@ -4,6 +4,7 @@ import {
markdownToText,
truncateText,
convertWithTurndown,
extractMarkdownTitle,
} from "./html-utils.js";
describe("html-utils", () => {
@ -190,6 +191,32 @@ describe("html-utils", () => {
});
});
describe("extractMarkdownTitle", () => {
  // A `title:` key inside the leading frontmatter block is the primary source.
  it("should extract title from YAML frontmatter", () => {
    const withFrontmatter = "---\ntitle: My Page Title\ndescription: Some desc\n---\n\n# Heading\n\nContent";
    const title = extractMarkdownTitle(withFrontmatter);
    expect(title).toBe("My Page Title");
  });

  // Without frontmatter the first `# heading` line is used instead.
  it("should fall back to first # heading when no frontmatter", () => {
    const headingOnly = "# My Heading\n\nSome content here";
    const title = extractMarkdownTitle(headingOnly);
    expect(title).toBe("My Heading");
  });

  // When both sources exist, the frontmatter value wins.
  it("should prefer frontmatter title over heading", () => {
    const bothSources = "---\ntitle: Frontmatter Title\n---\n\n# Heading Title";
    const title = extractMarkdownTitle(bothSources);
    expect(title).toBe("Frontmatter Title");
  });

  // Plain prose offers neither a frontmatter title nor a heading.
  it("should return undefined when no title found", () => {
    const plainText = "Just some text without a title or heading";
    const title = extractMarkdownTitle(plainText);
    expect(title).toBeUndefined();
  });

  // Degenerate input must not throw and yields no title.
  it("should handle empty string", () => {
    const title = extractMarkdownTitle("");
    expect(title).toBeUndefined();
  });
});
describe("convertWithTurndown", () => {
it("should convert HTML to markdown", () => {
const html = "<html><head><title>Page</title></head><body><h1>Hello</h1><p>World</p></body></html>";

View file

@ -112,6 +112,29 @@ export function truncateText(
return { text: value.slice(0, maxChars), truncated: true };
}
/**
 * Extract a title from a native markdown response.
 *
 * Checks the YAML frontmatter `title:` key first, then falls back to the first
 * `# heading` found in the document body.
 *
 * Details:
 * - A frontmatter title wrapped in matching single or double quotes is returned
 *   without the quotes.
 * - An empty `title:` key is ignored; it never picks up the following line.
 * - When the leading `---` block contains at least one `key:` line it is treated
 *   as YAML and excluded from the heading search, so `# comment` lines inside it
 *   are not mistaken for headings.
 * - Lines inside fenced code blocks (``` or ~~~) are skipped by the heading search.
 *
 * @param markdown - Raw markdown text, optionally starting with a `---` frontmatter block.
 * @returns The trimmed title, or `undefined` when no non-empty title is found.
 */
export function extractMarkdownTitle(markdown: string): string | undefined {
  let body = markdown;

  // Frontmatter only counts when it opens on the very first line of the document.
  const frontmatterMatch = markdown.match(/^---\s*\n([\s\S]*?)\n---/);
  const frontmatter = frontmatterMatch?.[1];
  if (frontmatterMatch && frontmatter) {
    // `[ \t]*` rather than `\s*`: `\s` also matches newlines, which would let an
    // empty `title:` capture the next frontmatter line as its value.
    const rawTitle = frontmatter.match(/^title:[ \t]*(.+)$/m)?.[1];
    if (rawTitle) {
      const title = stripYamlQuotes(rawTitle.trim());
      if (title) return title;
    }
    // Only drop the block from the heading search when it really looks like YAML;
    // a document that merely starts with a `---` rule keeps its old behavior.
    if (/^[\w-]+[ \t]*:/m.test(frontmatter)) {
      body = markdown.slice(frontmatterMatch[0].length);
    }
  }

  // Fall back to first # heading
  return findFirstTopLevelHeading(body);
}

/** Remove one pair of matching surrounding quotes (`"…"` or `'…'`) from a YAML scalar. */
function stripYamlQuotes(value: string): string {
  const quote = value.charAt(0);
  const isQuoted =
    value.length >= 2 && (quote === "\"" || quote === "'") && value.endsWith(quote);
  return isQuoted ? value.slice(1, -1).trim() : value;
}

/** Return the text of the first `# heading` line that is not inside a fenced code block. */
function findFirstTopLevelHeading(body: string): string | undefined {
  // Marker (e.g. "```") of the fence currently open, if any.
  let openFence: string | undefined;

  // Split on every line terminator that `^`/`$` honor in multiline regex mode.
  for (const line of body.split(/\r\n|[\n\r\u2028\u2029]/)) {
    const fence = line.match(/^ {0,3}(`{3,}|~{3,})(.*)$/);
    const marker = fence?.[1];
    if (marker) {
      if (openFence === undefined) {
        openFence = marker;
      } else if (
        marker.charAt(0) === openFence.charAt(0) &&
        marker.length >= openFence.length &&
        (fence?.[2] ?? "").trim() === ""
      ) {
        // A closing fence uses the same character, is at least as long as the
        // opener, and carries no info string.
        openFence = undefined;
      }
      continue;
    }
    if (openFence !== undefined) continue;

    // Matching per line keeps a bare `#` from borrowing text from a later line.
    const heading = line.match(/^#[ \t]+(.+)$/)?.[1]?.trim();
    if (heading) return heading;
  }
  return undefined;
}
/**
* Convert HTML to markdown using TurndownService (simpler, converts whole page)
*/

View file

@ -20,7 +20,7 @@ import {
writeCache,
} from "./cache.js";
import type { CacheEntry } from "./cache.js";
import { extractContent, markdownToText, truncateText, type ExtractMode, type ExtractorType } from "./html-utils.js";
import { extractContent, extractMarkdownTitle, markdownToText, truncateText, type ExtractMode, type ExtractorType } from "./html-utils.js";
import { jsonResult, readNumberParam, readStringParam } from "./param-helpers.js";
const EXTRACT_MODES = ["markdown", "text"] as const;
@ -69,13 +69,14 @@ export type WebFetchResult = {
contentType: string;
title?: string;
extractMode: ExtractMode;
extractor: ExtractorType | "raw" | "json";
extractor: ExtractorType | "raw" | "json" | "markdown-native";
truncated: boolean;
length: number;
fetchedAt: string;
tookMs: number;
text: string;
cached?: boolean;
markdownTokens?: number;
};
function resolveMaxChars(value: unknown, fallback: number): number {
@ -129,7 +130,7 @@ async function fetchWithRedirects(params: {
res = await fetch(parsedUrl.toString(), {
method: "GET",
headers: {
Accept: "*/*",
Accept: "text/markdown, text/html;q=0.9, */*;q=0.8",
"User-Agent": params.userAgent,
"Accept-Language": "en-US,en;q=0.9",
},
@ -241,10 +242,28 @@ async function runWebFetch(params: {
const body = await readResponseText(res);
let title: string | undefined;
let extractor: ExtractorType | "raw" | "json" = "raw";
let extractor: ExtractorType | "raw" | "json" | "markdown-native" = "raw";
let text = body;
let markdownTokens: number | undefined;
if (contentType.includes("text/html")) {
// Capture x-markdown-tokens header when present (Cloudflare Markdown for Agents)
const markdownTokensHeader = res.headers.get("x-markdown-tokens");
if (markdownTokensHeader) {
const parsed = Number.parseInt(markdownTokensHeader, 10);
if (Number.isFinite(parsed) && parsed > 0) {
markdownTokens = parsed;
}
}
if (contentType.includes("text/markdown")) {
// Server returned markdown directly (e.g. Cloudflare Markdown for Agents) — skip HTML parsing
text = body;
extractor = "markdown-native";
title = extractMarkdownTitle(body);
if (params.extractMode === "text") {
text = markdownToText(body);
}
} else if (contentType.includes("text/html")) {
const extracted = await extractContent({
html: body,
url: finalUrl,
@ -281,6 +300,9 @@ async function runWebFetch(params: {
if (title) {
payload.title = title;
}
if (markdownTokens !== undefined) {
payload.markdownTokens = markdownTokens;
}
writeCache(FETCH_CACHE, cacheKey, payload, params.cacheTtlMs);
return payload;
} finally {