multica/src/hub/block-chunker.ts
yushen efb1495326 fix(hub): fence close must reject info strings per CommonMark spec
The closing fence regex was not checking for an empty info string,
allowing e.g. ```python to incorrectly close an open fence. Also
adds missing test for tool_execution_update passthrough.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-02-05 22:23:55 +08:00

240 lines
7.4 KiB
TypeScript

/**
* Block chunker for splitting streaming text at natural boundaries.
*
* Used by MessageAggregator to produce appropriately-sized text blocks
* for messaging platforms that cannot consume raw streaming deltas.
*/
/**
 * Tuning knobs for {@link BlockChunker}.
 */
export interface BlockChunkerConfig {
  /**
   * Smallest buffer length at which a break is attempted; also the lower
   * bound of the break search window (default: 200).
   */
  minChars: number;
  /**
   * Upper bound of the break search window; a buffer that reaches this
   * length without a usable break is force-cut here (default: 2000).
   */
  maxChars: number;
  /**
   * Highest-priority kind of boundary to break on. Lower-priority kinds
   * are still tried as fallbacks.
   */
  breakPreference: "paragraph" | "newline" | "sentence";
}

/**
 * Defaults: 200-character minimum, 2000-character maximum, paragraph breaks
 * preferred.
 */
export const DEFAULT_CHUNKER_CONFIG: BlockChunkerConfig = {
  minChars: 200,
  maxChars: 2000,
  breakPreference: "paragraph",
};
/**
 * Outcome of splitting a buffer into an emitted block and leftover text.
 */
export interface ChunkResult {
  /** The portion of the buffer that should be emitted as a block. */
  chunk: string;
  /** The portion of the buffer that stays buffered for the next attempt. */
  remainder: string;
}
/**
 * Description of a fenced code block that is currently open.
 */
interface FenceInfo {
  /** The fence characters that opened the block, e.g. "```" or "~~~~" (indentation excluded). */
  marker: string;
  /** First word of the opening fence's info string, e.g. "typescript"; "" when there is none. */
  lang: string;
}

/**
 * Detect whether `text` has an unclosed fenced code block at offset `upTo`.
 *
 * Scans `text.slice(0, upTo)` line by line, tracking fence open/close pairs
 * according to the CommonMark fenced-code-block rules:
 * - a fence line is 0-3 spaces of indentation followed by 3+ backticks or tildes;
 * - an opening fence may carry an info string of any number of words, but the
 *   info string of a backtick fence may not itself contain a backtick
 *   (such a line is an inline code span, not a fence);
 * - a closing fence uses the same character as the opener, is at least as
 *   long, and is followed by nothing but whitespace.
 *
 * @param text - Text to scan.
 * @param upTo - Exclusive end offset of the scanned region.
 * @returns The fence still open at `upTo`, or `null` when no fence is open.
 */
function detectFenceAt(text: string, upTo: number): FenceInfo | null {
  // [\s\S] instead of "." so a trailing "\r" from CRLF input is captured in the tail
  // (and later trimmed) rather than preventing the match.
  const fenceLine = /^ {0,3}(`{3,}|~{3,})([\s\S]*)$/;
  let openFence: FenceInfo | null = null;

  for (const line of text.slice(0, upTo).split("\n")) {
    const match = fenceLine.exec(line);
    if (match === null) continue;

    const marker = match[1] ?? "";
    const info = (match[2] ?? "").trim();
    const fenceChar = marker.charAt(0);

    if (openFence === null) {
      // A backtick run followed by text containing another backtick is inline code.
      if (fenceChar === "`" && info.includes("`")) continue;
      // Only the first word of the info string identifies the language.
      openFence = { marker, lang: info.split(/\s+/)[0] ?? "" };
    } else if (
      fenceChar === openFence.marker.charAt(0) &&
      marker.length >= openFence.marker.length &&
      info === ""
    ) {
      // Same character, at least as long, no info string: this closes the fence.
      openFence = null;
    }
    // Otherwise the line is ordinary content of the open code block.
  }

  return openFence;
}
/**
 * Report whether `position` lies inside a fenced code block that has been
 * opened but not yet closed in the text preceding it.
 */
function isInsideFence(text: string, position: number): boolean {
  const openFence = detectFenceAt(text, position);
  return openFence !== null;
}
/**
 * Keep both halves of a split well-formed when the split lands inside a
 * fenced code block.
 *
 * When `chunk` ends with a fence still open, a closing fence line is appended
 * to the chunk and a matching opening fence line (marker plus language, if
 * any) is prepended to the remainder. When no fence is open, both strings are
 * returned unchanged.
 */
function applyFenceSafety(chunk: string, remainder: string): { chunk: string; remainder: string } {
  const openFence = detectFenceAt(chunk, chunk.length);
  if (openFence === null) {
    return { chunk, remainder };
  }

  const { marker, lang } = openFence;
  // The closing fence must start on its own line.
  const lineBreak = chunk.endsWith("\n") ? "" : "\n";
  const reopenLine = lang ? marker.concat(lang) : marker;

  return {
    chunk: `${chunk}${lineBreak}${marker}\n`,
    remainder: `${reopenLine}\n${remainder}`,
  };
}
/**
 * Splits a growing text buffer into blocks at natural boundaries
 * (paragraph, newline, sentence, word), avoiding breaks inside fenced
 * code blocks where possible.
 */
export class BlockChunker {
  private readonly minChars: number;
  private readonly maxChars: number;
  private readonly breakPreference: "paragraph" | "newline" | "sentence";

  constructor(config: BlockChunkerConfig) {
    this.minChars = config.minChars;
    this.maxChars = config.maxChars;
    this.breakPreference = config.breakPreference;
  }

  /**
   * Attempt to extract a chunk from the buffer.
   *
   * @param buffer - All text accumulated so far.
   * @returns The chunk and the text to keep buffering, or `null` when the
   * buffer is shorter than `minChars`, or when it is shorter than `maxChars`
   * and contains no usable break. Once the buffer reaches `maxChars` without
   * a usable break, a hard cut is forced. A hard cut that lands inside an open
   * code fence has a closing fence appended to the chunk (in addition to the
   * cut text) and the fence reopened in the remainder.
   */
  tryChunk(buffer: string): ChunkResult | null {
    if (buffer.length < this.minChars) return null;

    // Search window: minChars..min(buffer.length, maxChars)
    const searchEnd = Math.min(buffer.length, this.maxChars);
    const breakIndex = this.findBreakIndex(buffer, this.minChars, searchEnd);
    if (breakIndex !== -1) {
      return {
        chunk: buffer.slice(0, breakIndex),
        remainder: buffer.slice(breakIndex),
      };
    }

    if (buffer.length >= this.maxChars) {
      // No natural break in the window: hard cut, keeping code fences balanced.
      const cut = this.hardCutIndex(buffer);
      return applyFenceSafety(buffer.slice(0, cut), buffer.slice(cut));
    }

    // Buffer is between minChars and maxChars with no break — wait for more text
    return null;
  }

  /**
   * Force-flush: return the entire buffer as a chunk.
   *
   * @returns `null` for an empty buffer, otherwise the whole buffer as the
   * chunk with an empty remainder.
   */
  flush(buffer: string): ChunkResult | null {
    if (buffer.length === 0) return null;
    return { chunk: buffer, remainder: "" };
  }

  /**
   * Position of the forced cut: `maxChars`, moved back by one code unit when
   * cutting there would separate the two halves of a UTF-16 surrogate pair
   * (which would leave an unpaired surrogate in both the chunk and the
   * remainder). The cut is never moved back to 0, so a hard cut with a
   * positive `maxChars` always emits at least one code unit.
   */
  private hardCutIndex(buffer: string): number {
    const cut = this.maxChars;
    if (cut > 1 && cut < buffer.length) {
      const before = buffer.charCodeAt(cut - 1);
      const after = buffer.charCodeAt(cut);
      const isHighSurrogate = before >= 0xd800 && before <= 0xdbff;
      const isLowSurrogate = after >= 0xdc00 && after <= 0xdfff;
      if (isHighSurrogate && isLowSurrogate) return cut - 1;
    }
    return cut;
  }

  /**
   * Find the best break index in the buffer for the window
   * [searchStart, searchEnd).
   *
   * Break kinds are tried in priority order. Each kind proposes a single
   * candidate (its latest one in the window); a candidate that falls inside
   * an open code fence is rejected and the next kind is tried.
   *
   * @returns The index at which the remainder starts, or -1 if no suitable
   * break was found.
   */
  private findBreakIndex(buffer: string, searchStart: number, searchEnd: number): number {
    const bufLen = buffer.length;
    for (const breaker of this.getBreakers()) {
      const index = breaker(buffer, searchStart, searchEnd, bufLen);
      if (index !== -1 && !isInsideFence(buffer, index)) {
        return index;
      }
    }
    return -1;
  }

  /**
   * Get break functions in priority order based on breakPreference.
   * Every preference falls back to coarser-to-finer kinds, ending with
   * word breaks.
   */
  private getBreakers(): Array<(buffer: string, start: number, end: number, bufLen: number) => number> {
    switch (this.breakPreference) {
      case "paragraph":
        return [findParagraphBreak, findNewlineBreak, findSentenceBreak, findWordBreak];
      case "newline":
        return [findNewlineBreak, findSentenceBreak, findWordBreak];
      case "sentence":
        return [findSentenceBreak, findWordBreak];
    }
  }
}
/**
 * Locate the last paragraph break (two consecutive newlines) whose second
 * newline lies at an index in [start + 1, end).
 *
 * @returns The index just past the second newline (where the next paragraph
 * begins), or -1 when there is no such break or the only ones found would
 * leave nothing after them (index not below `bufLen`).
 */
function findParagraphBreak(buffer: string, start: number, end: number, bufLen: number): number {
  let pos = end - 1;
  while (pos >= start + 1) {
    const isBlankLine = buffer.charAt(pos) === "\n" && buffer.charAt(pos - 1) === "\n";
    const nextStart = pos + 1;
    if (isBlankLine && nextStart < bufLen) {
      return nextStart;
    }
    pos -= 1;
  }
  return -1;
}
/**
 * Locate the last newline at an index in [start, end).
 *
 * @returns The index just past that newline, or -1 when the window holds no
 * newline or the only ones found would leave nothing after them (index not
 * below `bufLen`).
 */
function findNewlineBreak(buffer: string, start: number, end: number, bufLen: number): number {
  let pos = end;
  while (--pos >= start) {
    if (buffer.charAt(pos) !== "\n") continue;
    const nextStart = pos + 1;
    if (nextStart < bufLen) return nextStart;
  }
  return -1;
}
/**
 * Find a sentence break: one of `.`, `!`, `?` at an index in [start, end)
 * that is immediately followed by a whitespace character. Scans backwards so
 * the latest qualifying break wins.
 *
 * The returned index (the position after that whitespace) never exceeds
 * `end`, so a chunk cut there stays within the search window. Punctuation
 * that is the last character of the buffer is not treated as a break, since
 * nothing would be left to split off.
 *
 * @returns The index after the whitespace following the punctuation, or -1
 * when no qualifying break exists.
 */
function findSentenceBreak(buffer: string, start: number, end: number, bufLen: number): number {
  const whitespace = /\s/;
  for (let i = end - 1; i >= start; i--) {
    const ch = buffer[i];
    if (ch !== "." && ch !== "!" && ch !== "?") continue;

    const afterPunct = i + 1;
    if (afterPunct >= bufLen || !whitespace.test(buffer.charAt(afterPunct))) continue;

    // Break after the whitespace. When that whitespace sits exactly at `end`
    // the break would be at end + 1, one character past the window — skip it
    // and keep looking for an earlier sentence end.
    const idx = afterPunct + 1;
    if (idx <= end && idx < bufLen) return idx;
  }
  return -1;
}
/**
 * Locate the last whitespace character at an index in [start, end).
 *
 * @returns The index just past that whitespace character, or -1 when the
 * window holds no whitespace or the only ones found would leave nothing
 * after them (index not below `bufLen`).
 */
function findWordBreak(buffer: string, start: number, end: number, bufLen: number): number {
  const whitespace = /\s/;
  let pos = end;
  while (--pos >= start) {
    if (!whitespace.test(buffer.charAt(pos))) continue;
    const nextStart = pos + 1;
    if (nextStart < bufLen) return nextStart;
  }
  return -1;
}