Merge pull request #171 from multica-ai/forrestchang/markdown-accept
feat(web): add Accept: text/markdown header for Cloudflare Markdown for Agents
This commit is contained in:
commit
13e5492993
3 changed files with 77 additions and 5 deletions
|
|
@ -4,6 +4,7 @@ import {
|
|||
markdownToText,
|
||||
truncateText,
|
||||
convertWithTurndown,
|
||||
extractMarkdownTitle,
|
||||
} from "./html-utils.js";
|
||||
|
||||
describe("html-utils", () => {
|
||||
|
|
@ -190,6 +191,32 @@ describe("html-utils", () => {
|
|||
});
|
||||
});
|
||||
|
||||
describe("extractMarkdownTitle", () => {
  // The frontmatter `title:` key is read even when a heading follows.
  it("should extract title from YAML frontmatter", () => {
    const withFrontmatter = "---\ntitle: My Page Title\ndescription: Some desc\n---\n\n# Heading\n\nContent";
    const found = extractMarkdownTitle(withFrontmatter);
    expect(found).toBe("My Page Title");
  });

  // Without frontmatter the first `# ` heading supplies the title.
  it("should fall back to first # heading when no frontmatter", () => {
    const headingOnly = "# My Heading\n\nSome content here";
    const found = extractMarkdownTitle(headingOnly);
    expect(found).toBe("My Heading");
  });

  // When both sources exist, the frontmatter value wins.
  it("should prefer frontmatter title over heading", () => {
    const bothSources = "---\ntitle: Frontmatter Title\n---\n\n# Heading Title";
    const found = extractMarkdownTitle(bothSources);
    expect(found).toBe("Frontmatter Title");
  });

  // Plain prose has neither frontmatter nor a heading.
  it("should return undefined when no title found", () => {
    const plainText = "Just some text without a title or heading";
    const found = extractMarkdownTitle(plainText);
    expect(found).toBeUndefined();
  });

  // Degenerate input: nothing to inspect at all.
  it("should handle empty string", () => {
    const found = extractMarkdownTitle("");
    expect(found).toBeUndefined();
  });
});
|
||||
|
||||
describe("convertWithTurndown", () => {
|
||||
it("should convert HTML to markdown", () => {
|
||||
const html = "<html><head><title>Page</title></head><body><h1>Hello</h1><p>World</p></body></html>";
|
||||
|
|
|
|||
|
|
@ -112,6 +112,29 @@ export function truncateText(
|
|||
return { text: value.slice(0, maxChars), truncated: true };
|
||||
}
|
||||
|
||||
/**
 * Extract a title from a native markdown response.
 *
 * Lookup order:
 * 1. The `title:` key inside a leading YAML frontmatter block (`---` ... `---`).
 * 2. The first level-one ATX heading (`# Heading`) anywhere in the document.
 *
 * Only the text on the same line as `title:` / `#` is considered, so an empty
 * `title:` key or a bare `#` line never picks up the following line by accident.
 * A frontmatter value wrapped in matching single or double quotes is returned
 * without the quotes (escape sequences inside the quotes are not decoded).
 *
 * @param markdown - Raw markdown text, optionally starting with YAML frontmatter.
 * @returns The trimmed title, or `undefined` when no non-empty title is found.
 */
export function extractMarkdownTitle(markdown: string): string | undefined {
  // Check YAML frontmatter (must start at the very beginning of the document).
  const frontmatterMatch = markdown.match(/^---\s*\n([\s\S]*?)\n---/);
  if (frontmatterMatch?.[1]) {
    // `[ \t]*` instead of `\s*`: `\s` also matches newlines, which would let an
    // empty `title:` swallow the next frontmatter line as its value.
    const titleMatch = frontmatterMatch[1].match(/^title:[ \t]*(.+)$/m);
    if (titleMatch?.[1]) {
      const title = unquoteYamlScalar(titleMatch[1].trim());
      if (title) return title;
    }
  }

  // Fall back to the first `# heading`; the separator must be horizontal
  // whitespace so that a lone `#` line does not capture the next paragraph.
  const headingMatch = markdown.match(/^#[ \t]+(.+)$/m);
  if (headingMatch?.[1]) {
    const title = headingMatch[1].trim();
    if (title) return title;
  }

  return undefined;
}

/** Strip one pair of matching surrounding quotes (`"…"` or `'…'`) from a YAML scalar. */
function unquoteYamlScalar(value: string): string {
  if (value.length >= 2) {
    const first = value[0];
    const last = value[value.length - 1];
    if ((first === '"' || first === "'") && first === last) {
      return value.slice(1, -1).trim();
    }
  }
  return value;
}
|
||||
|
||||
/**
|
||||
* Convert HTML to markdown using TurndownService (simpler, converts whole page)
|
||||
*/
|
||||
|
|
|
|||
|
|
@ -20,7 +20,7 @@ import {
|
|||
writeCache,
|
||||
} from "./cache.js";
|
||||
import type { CacheEntry } from "./cache.js";
|
||||
import { extractContent, markdownToText, truncateText, type ExtractMode, type ExtractorType } from "./html-utils.js";
|
||||
import { extractContent, extractMarkdownTitle, markdownToText, truncateText, type ExtractMode, type ExtractorType } from "./html-utils.js";
|
||||
import { jsonResult, readNumberParam, readStringParam } from "./param-helpers.js";
|
||||
|
||||
// Extract-mode names as a readonly literal tuple; `as const` keeps the element
// types as the literals "markdown" | "text" rather than widening to string.
// NOTE(review): presumably the set of values accepted for `extractMode` — confirm against the consumer of this constant.
const EXTRACT_MODES = ["markdown", "text"] as const;
|
||||
|
|
@ -69,13 +69,14 @@ export type WebFetchResult = {
|
|||
contentType: string;
|
||||
title?: string;
|
||||
extractMode: ExtractMode;
|
||||
extractor: ExtractorType | "raw" | "json";
|
||||
extractor: ExtractorType | "raw" | "json" | "markdown-native";
|
||||
truncated: boolean;
|
||||
length: number;
|
||||
fetchedAt: string;
|
||||
tookMs: number;
|
||||
text: string;
|
||||
cached?: boolean;
|
||||
markdownTokens?: number;
|
||||
};
|
||||
|
||||
function resolveMaxChars(value: unknown, fallback: number): number {
|
||||
|
|
@ -129,7 +130,7 @@ async function fetchWithRedirects(params: {
|
|||
res = await fetch(parsedUrl.toString(), {
|
||||
method: "GET",
|
||||
headers: {
|
||||
Accept: "*/*",
|
||||
Accept: "text/markdown, text/html;q=0.9, */*;q=0.8",
|
||||
"User-Agent": params.userAgent,
|
||||
"Accept-Language": "en-US,en;q=0.9",
|
||||
},
|
||||
|
|
@ -241,10 +242,28 @@ async function runWebFetch(params: {
|
|||
const body = await readResponseText(res);
|
||||
|
||||
let title: string | undefined;
|
||||
let extractor: ExtractorType | "raw" | "json" = "raw";
|
||||
let extractor: ExtractorType | "raw" | "json" | "markdown-native" = "raw";
|
||||
let text = body;
|
||||
let markdownTokens: number | undefined;
|
||||
|
||||
if (contentType.includes("text/html")) {
|
||||
// Capture x-markdown-tokens header when present (Cloudflare Markdown for Agents)
|
||||
const markdownTokensHeader = res.headers.get("x-markdown-tokens");
|
||||
if (markdownTokensHeader) {
|
||||
const parsed = Number.parseInt(markdownTokensHeader, 10);
|
||||
if (Number.isFinite(parsed) && parsed > 0) {
|
||||
markdownTokens = parsed;
|
||||
}
|
||||
}
|
||||
|
||||
if (contentType.includes("text/markdown")) {
|
||||
// Server returned markdown directly (e.g. Cloudflare Markdown for Agents) — skip HTML parsing
|
||||
text = body;
|
||||
extractor = "markdown-native";
|
||||
title = extractMarkdownTitle(body);
|
||||
if (params.extractMode === "text") {
|
||||
text = markdownToText(body);
|
||||
}
|
||||
} else if (contentType.includes("text/html")) {
|
||||
const extracted = await extractContent({
|
||||
html: body,
|
||||
url: finalUrl,
|
||||
|
|
@ -281,6 +300,9 @@ async function runWebFetch(params: {
|
|||
if (title) {
|
||||
payload.title = title;
|
||||
}
|
||||
if (markdownTokens !== undefined) {
|
||||
payload.markdownTokens = markdownTokens;
|
||||
}
|
||||
writeCache(FETCH_CACHE, cacheKey, payload, params.cacheTtlMs);
|
||||
return payload;
|
||||
} finally {
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue