fix(agent): enforce sufficient search-fetch evidence

This commit is contained in:
Jiayuan Zhang 2026-02-17 02:08:15 +08:00
parent b5b65c6bae
commit 850d55336a
3 changed files with 245 additions and 18 deletions

View file

@ -44,6 +44,7 @@ import {
import type { AuthProfileFailureReason } from "./auth-profiles/index.js";
import {
analyzeCrossTurnWebFetchNeed,
resolveWebFetchRequirementFromPrompt,
shouldEnforceWebFetchAfterSearch,
summarizeWebToolUsage,
type ToolExecutionRecord,
@ -133,15 +134,45 @@ function formatRunLogToolSummary(tool: string, details: Record<string, unknown>
}
}
/**
 * Fixed follow-up prompt (lines joined with "\n") telling the model that it
 * used web_search without a successful web_fetch in the turn, that search
 * snippets alone are not sufficient evidence, and that it must fetch 1-3 of
 * the result URLs and revise its answer before finalizing.
 */
const WEB_SEARCH_FETCH_ENFORCEMENT_PROMPT = [
"You used web_search but did not complete a successful web_fetch in this turn.",
"Search snippets are incomplete previews and are not sufficient evidence for detailed claims.",
"Before finalizing your answer, you MUST:",
"1) Pick the 1-3 most relevant URLs from the web_search results.",
"2) Call web_fetch on those URLs.",
"3) Revise your answer based on fetched content.",
// Escape hatch so the model does not loop forever when every fetch fails.
"If all fetch attempts fail, explicitly say so and avoid relying on snippets for specific claims.",
].join("\n");
/**
 * Builds the follow-up prompt sent when a turn used web_search but its
 * web_fetch coverage is still insufficient.
 *
 * @param params.requiredMinFetchSuccess Number of successful web_fetch calls the turn is expected to reach.
 * @param params.fetchSuccess Number of successful web_fetch calls already made in the turn.
 * @param params.needsFollowupForLatestSearch Whether the latest successful search has no later successful fetch.
 * @returns The newline-joined prompt text and the number of additional
 *   successful fetches requested in it (never below 1 for finite inputs).
 */
function buildWebSearchFetchEnforcementPrompt(params: {
  requiredMinFetchSuccess: number;
  fetchSuccess: number;
  needsFollowupForLatestSearch: boolean;
}): { prompt: string; additionalFetchNeeded: number } {
  const { requiredMinFetchSuccess, fetchSuccess, needsFollowupForLatestSearch } = params;

  // A pending follow-up contributes at most one fetch, and one fetch is
  // always requested anyway, so only the remaining shortfall can raise the count.
  const shortfall = requiredMinFetchSuccess - fetchSuccess;
  const additionalFetchNeeded = Math.max(1, shortfall);

  const minimumCoverageNote =
    requiredMinFetchSuccess > 1
      ? [
          `This task currently requires at least ${requiredMinFetchSuccess} successful web_fetch calls.`,
        ]
      : [];

  const latestSearchNote = needsFollowupForLatestSearch
    ? [
        "You performed another successful web_search after your last successful web_fetch. " +
          "You must fetch URLs from the latest search results before finalizing.",
      ]
    : [];

  const prompt = [
    "You used web_search, but web evidence coverage for this turn is still incomplete.",
    "Search snippets are incomplete previews and are not sufficient evidence for detailed claims.",
    ...minimumCoverageNote,
    ...latestSearchNote,
    "Before finalizing your answer, you MUST:",
    "1) Pick the 1-3 most relevant URLs from the latest successful web_search results.",
    `2) Complete at least ${additionalFetchNeeded} additional successful web_fetch call(s).`,
    "3) Revise your answer based on fetched page content.",
    "If all additional fetch attempts fail, explicitly say so and avoid relying on snippets for specific claims.",
  ].join("\n");

  return { prompt, additionalFetchNeeded };
}
const CROSS_TURN_WEB_FETCH_ENFORCEMENT_PROMPT = [
"You are about to finalize a web-dependent answer, but no successful web_fetch happened in this turn.",
@ -590,7 +621,10 @@ export class Agent {
messages: this.agent.state.messages.length,
});
await this.agent.prompt(prompt);
await this.enforceWebFetchAfterSearchIfNeeded(toolExecutionStartIndex);
await this.enforceWebFetchAfterSearchIfNeeded({
toolExecutionStartIndex,
userPrompt: prompt,
});
await this.enforceCrossTurnWebFetchIfNeeded({
toolExecutionStartIndex,
userPrompt: prompt,
@ -816,9 +850,10 @@ export class Agent {
this.session.setApiKey(this.currentApiKey);
}
private async enforceWebFetchAfterSearchIfNeeded(
toolExecutionStartIndex: number,
): Promise<void> {
private async enforceWebFetchAfterSearchIfNeeded(params: {
toolExecutionStartIndex: number;
userPrompt: string;
}): Promise<void> {
if (this._internalRun) return;
const activeTools = new Set(
@ -828,32 +863,48 @@ export class Agent {
const webFetchAvailable = activeTools.has("web_fetch");
const currentTurnExecutions = this.currentRunToolExecutions.slice(
toolExecutionStartIndex,
params.toolExecutionStartIndex,
);
const usage = summarizeWebToolUsage(currentTurnExecutions);
const requirement = resolveWebFetchRequirementFromPrompt(params.userPrompt);
if (
!shouldEnforceWebFetchAfterSearch({
usage,
webSearchAvailable,
webFetchAvailable,
requiredMinFetchSuccess: requirement.requiredMinFetchSuccess,
})
) {
return;
}
const { prompt, additionalFetchNeeded } = buildWebSearchFetchEnforcementPrompt({
requiredMinFetchSuccess: requirement.requiredMinFetchSuccess,
fetchSuccess: usage.fetchSuccess,
needsFollowupForLatestSearch: usage.searchNeedsFollowupFetch,
});
this.runLog.log("web_search_fetch_guard", {
search_calls: usage.searchCalls,
search_success: usage.searchSuccess,
search_with_results: usage.searchSuccessWithResults,
search_needs_followup_fetch: usage.searchNeedsFollowupFetch,
fetch_calls: usage.fetchCalls,
fetch_success: usage.fetchSuccess,
required_min_fetch_success: requirement.requiredMinFetchSuccess,
prompt_suggests_research_depth: requirement.promptSuggestsResearchDepth,
prompt_multi_source_cue: requirement.multiSourceCue,
prompt_explicit_min_fetch: requirement.explicitMinFetchFromPrompt,
});
try {
await this.agent.prompt(WEB_SEARCH_FETCH_ENFORCEMENT_PROMPT);
await this.agent.prompt(prompt);
this.runLog.log("web_search_fetch_guard_applied", {
search_with_results: usage.searchSuccessWithResults,
search_needs_followup_fetch: usage.searchNeedsFollowupFetch,
required_min_fetch_success: requirement.requiredMinFetchSuccess,
additional_fetch_needed: additionalFetchNeeded,
});
} catch (error) {
const message = error instanceof Error ? error.message : String(error);

View file

@ -1,6 +1,7 @@
import { describe, expect, it } from "vitest";
import {
analyzeCrossTurnWebFetchNeed,
resolveWebFetchRequirementFromPrompt,
shouldEnforceWebFetchAfterSearch,
summarizeWebToolUsage,
type ToolExecutionRecord,
@ -31,6 +32,7 @@ describe("web-tools-policy", () => {
expect(usage.searchCalls).toBe(1);
expect(usage.searchSuccess).toBe(1);
expect(usage.searchSuccessWithResults).toBe(1);
expect(usage.searchNeedsFollowupFetch).toBe(true);
expect(usage.fetchCalls).toBe(0);
expect(usage.fetchSuccess).toBe(0);
});
@ -46,6 +48,22 @@ describe("web-tools-policy", () => {
expect(usage.searchCalls).toBe(1);
expect(usage.searchSuccess).toBe(0);
expect(usage.searchSuccessWithResults).toBe(0);
expect(usage.searchNeedsFollowupFetch).toBe(false);
});
// Sequence: search with results -> fetch. The fetch record is expected to
// count as successful, which clears the pending follow-up flag.
it("marks latest search as covered when successful fetch follows", () => {
const usage = summarizeWebToolUsage([
buildRecord({
toolName: "web_search",
details: { count: 1, results: [{}] },
}),
buildRecord({
toolName: "web_fetch",
details: { status: 200, length: 1024 },
}),
]);
expect(usage.searchNeedsFollowupFetch).toBe(false);
});
});
@ -88,6 +106,53 @@ describe("web-tools-policy", () => {
).toBe(false);
});
// Sequence: search -> fetch -> search. A fetch exists in the turn, but the
// final search with results has no fetch after it, so enforcement is expected.
it("enforces when the latest successful search has no follow-up fetch", () => {
const usage = summarizeWebToolUsage([
buildRecord({
toolName: "web_search",
details: { count: 2, results: [{}, {}] },
}),
buildRecord({
toolName: "web_fetch",
details: { status: 200, length: 1200 },
}),
buildRecord({
toolName: "web_search",
details: { count: 3, results: [{}, {}, {}] },
}),
]);
// requiredMinFetchSuccess is omitted on purpose; the function defaults it to 1.
expect(
shouldEnforceWebFetchAfterSearch({
usage,
webSearchAvailable: true,
webFetchAvailable: true,
}),
).toBe(true);
});
// Sequence: search -> fetch. The latest search is covered, but only one fetch
// is recorded while two are required, so enforcement is still expected.
it("enforces when prompt requires deeper evidence coverage", () => {
const usage = summarizeWebToolUsage([
buildRecord({
toolName: "web_search",
details: { count: 6, results: [{}, {}, {}] },
}),
buildRecord({
toolName: "web_fetch",
details: { status: 200, length: 2200 },
}),
]);
expect(
shouldEnforceWebFetchAfterSearch({
usage,
webSearchAvailable: true,
webFetchAvailable: true,
requiredMinFetchSuccess: 2,
}),
).toBe(true);
});
it("does not enforce when search returns no results", () => {
const usage = summarizeWebToolUsage([
buildRecord({
@ -230,4 +295,33 @@ describe("web-tools-policy", () => {
expect(analysis.webCue).toBe(false);
});
});
// Covers the three ways a prompt determines the minimum fetch count:
// research-depth keywords, an explicit "at least N sources" phrase, and the default.
describe("resolveWebFetchRequirementFromPrompt", () => {
it("requires deeper fetch coverage for research-style prompts", () => {
// Chinese prompt, roughly: "Help me research APPLE's recent product
// information and analyze it." It contains research-depth keywords.
const result = resolveWebFetchRequirementFromPrompt(
"帮我调研一下 APPLE 最近的产品信息,并做分析。",
);
expect(result.requiredMinFetchSuccess).toBe(2);
expect(result.promptSuggestsResearchDepth).toBe(true);
});
it("uses explicit minimum source count when present", () => {
// "at least 3 sources" is parsed as an explicit minimum of 3.
const result = resolveWebFetchRequirementFromPrompt(
"Please use at least 3 sources and summarize the latest updates.",
);
expect(result.requiredMinFetchSuccess).toBe(3);
expect(result.explicitMinFetchFromPrompt).toBe(3);
});
it("falls back to 1 for simple prompts", () => {
// No research, multi-source, or explicit-count cue: the default of 1 applies.
const result = resolveWebFetchRequirementFromPrompt(
"What is OpenAI's CEO?",
);
expect(result.requiredMinFetchSuccess).toBe(1);
expect(result.promptSuggestsResearchDepth).toBe(false);
});
});
});

View file

@ -8,10 +8,19 @@ export type WebToolUsage = {
searchCalls: number;
searchSuccess: number;
searchSuccessWithResults: number;
/** True when the latest successful search (with results) has no later successful fetch. */
searchNeedsFollowupFetch: boolean;
fetchCalls: number;
fetchSuccess: number;
};
/** Minimum web_fetch coverage derived from the wording of a user prompt. */
export type WebFetchRequirement = {
/** Number of successful web_fetch calls required; resolved to an integer between 1 and 4. */
requiredMinFetchSuccess: number;
/** True when the prompt matches one of USER_RESEARCH_DEPTH_PATTERNS. */
promptSuggestsResearchDepth: boolean;
/** True when the prompt matches one of USER_MULTI_SOURCE_PATTERNS. */
multiSourceCue: boolean;
/** Explicit "at least N sources"-style count parsed from the prompt (clamped to 1-4), or null when none is found. */
explicitMinFetchFromPrompt: number | null;
};
export type CrossTurnWebFetchGuardAnalysis = {
shouldEnforce: boolean;
explicitFetchRequest: boolean;
@ -45,6 +54,17 @@ const USER_WEB_CONTEXT_PATTERNS: RegExp[] = [
/(?:\u7f51\u9875|\u7f51\u7ad9|\u7f51\u7edc|\u4e92\u8054\u7f51|\u94fe\u63a5|\u6765\u6e90|\u65b0\u95fb|\u62a5\u9053|\u6587\u7ae0)/,
];
/**
 * Prompt cues that suggest the user wants research-style depth. A match on
 * any pattern raises the minimum number of successful fetches to 2.
 */
const USER_RESEARCH_DEPTH_PATTERNS: RegExp[] = [
// English keywords, matched as whole words and case-insensitively.
/\b(research|investigate|analysis|analyze|compare|comparison|deep[-\s]?dive|survey|report|review)\b/i,
// Chinese keywords (escaped): research/survey, study, analysis, in-depth,
// comparison, contrast, summary, roundup, report, assessment, evaluation.
/(?:\u8c03\u7814|\u7814\u7a76|\u5206\u6790|\u6df1\u5ea6|\u5bf9\u6bd4|\u5bf9\u7167|\u6c47\u603b|\u76d8\u70b9|\u62a5\u544a|\u8bc4\u4f30|\u8bc4\u6d4b)/,
];
/**
 * Prompt cues that suggest the user wants more than one source. A match on
 * any pattern raises the minimum number of successful fetches to at least 2.
 */
const USER_MULTI_SOURCE_PATTERNS: RegExp[] = [
// English phrases, case-insensitive.
/\b(multiple|multi-source|across sources|different sources)\b/i,
// Chinese phrases (escaped): multi-source, multiple sources, different sources, multiple websites.
/(?:\u591a\u6765\u6e90|\u591a\u4e2a\u6765\u6e90|\u4e0d\u540c\u6765\u6e90|\u591a\u7f51\u7ad9)/,
// Chinese "at least / no fewer than / minimum N (sources|links|URLs|pages|articles)".
// NOTE(review): this matches for any N, including 1 — confirm that is intended.
/(?:\u81f3\u5c11|\u4e0d\u5c11\u4e8e|\u6700\u5c11)\s*\d+\s*(?:\u4e2a|\u6761)?(?:\u6765\u6e90|\u94fe\u63a5|\u7f51\u5740|\u7f51\u9875|\u6587\u7ae0)/,
];
const USER_WEB_BLOCK_PATTERNS: RegExp[] = [
/\b(do not|don't|no|without)\s+(browse|web|internet|web_search|web_fetch|fetch)\b/i,
/\bonly\b.*\b(snippet|snippets)\b/i,
@ -63,6 +83,28 @@ function hasAnyPattern(text: string, patterns: RegExp[]): boolean {
return patterns.some((pattern) => pattern.test(text));
}
/**
 * Coerces a requested minimum fetch count into a whole number in [1, 4].
 *
 * @param raw Requested minimum; may be fractional, out of range, or non-finite.
 * @returns 1 for NaN/Infinity; otherwise the floored value clamped to 1..4.
 */
function normalizeMinFetchSuccess(raw: number): number {
  if (Number.isFinite(raw)) {
    const whole = Math.floor(raw);
    if (whole < 1) return 1;
    return whole > 4 ? 4 : whole;
  }
  return 1;
}
/**
 * Looks for an explicit "at least N sources"-style request in the prompt.
 *
 * The English pattern is tried first, then the Chinese one; the first pattern
 * whose captured number is finite wins.
 *
 * @param prompt User prompt text to scan.
 * @returns The captured count normalized by `normalizeMinFetchSuccess`, or
 *   null when no pattern yields a usable number.
 */
function extractExplicitMinFetchFromPrompt(prompt: string): number | null {
  // "at least / minimum of / no less than N sources|links|urls|articles|pages"
  const englishMinimum =
    /\b(?:at least|minimum of|no less than)\s*(\d+)\s*(?:sources?|links?|urls?|articles?|pages?)\b/i;
  // Chinese equivalent: "at least / no fewer than / minimum N sources|links|URLs|pages|articles"
  const chineseMinimum =
    /(?:\u81f3\u5c11|\u4e0d\u5c11\u4e8e|\u6700\u5c11)\s*(\d+)\s*(?:\u4e2a|\u6761)?(?:\u6765\u6e90|\u94fe\u63a5|\u7f51\u5740|\u7f51\u9875|\u6587\u7ae0)/;

  for (const matcher of [englishMinimum, chineseMinimum]) {
    const hit = prompt.match(matcher);
    if (hit === null) continue;

    const count = Number(hit[1]);
    // An absurdly long digit run parses to Infinity; fall through to the next pattern.
    if (Number.isFinite(count)) {
      return normalizeMinFetchSuccess(count);
    }
  }
  return null;
}
/**
 * Reports whether a tool's result details carry an explicit error flag.
 *
 * @param details Tool result details, or null when there are none.
 * @returns True only when `details.error` is strictly the boolean `true`.
 */
function hasToolError(details: Record<string, unknown> | null): boolean {
  if (details == null) {
    return false;
  }
  return details.error === true;
}
@ -93,9 +135,11 @@ export function summarizeWebToolUsage(records: ToolExecutionRecord[]): WebToolUs
searchCalls: 0,
searchSuccess: 0,
searchSuccessWithResults: 0,
searchNeedsFollowupFetch: false,
fetchCalls: 0,
fetchSuccess: 0,
};
let pendingSearchWithResults = false;
for (const record of records) {
const toolName = record.toolName.trim().toLowerCase();
@ -106,6 +150,7 @@ export function summarizeWebToolUsage(records: ToolExecutionRecord[]): WebToolUs
usage.searchSuccess += 1;
if (getSearchResultCount(record.details) > 0) {
usage.searchSuccessWithResults += 1;
pendingSearchWithResults = true;
}
}
continue;
@ -115,10 +160,12 @@ export function summarizeWebToolUsage(records: ToolExecutionRecord[]): WebToolUs
usage.fetchCalls += 1;
if (isSuccessfulExecution(record)) {
usage.fetchSuccess += 1;
pendingSearchWithResults = false;
}
}
}
usage.searchNeedsFollowupFetch = pendingSearchWithResults;
return usage;
}
@ -126,14 +173,49 @@ export function shouldEnforceWebFetchAfterSearch(params: {
usage: WebToolUsage;
webSearchAvailable: boolean;
webFetchAvailable: boolean;
requiredMinFetchSuccess?: number;
}): boolean {
const { usage, webSearchAvailable, webFetchAvailable } = params;
const {
usage,
webSearchAvailable,
webFetchAvailable,
requiredMinFetchSuccess = 1,
} = params;
if (!webSearchAvailable || !webFetchAvailable) return false;
if (usage.searchSuccessWithResults <= 0) return false;
if (usage.fetchSuccess > 0) return false;
if (usage.fetchSuccess <= 0) return true;
if (usage.searchNeedsFollowupFetch) return true;
if (usage.fetchSuccess < normalizeMinFetchSuccess(requiredMinFetchSuccess)) return true;
return true;
return false;
}
/**
 * Derives how many successful web_fetch calls a turn should reach from the
 * wording of the user's prompt.
 *
 * The baseline is 1. A research-depth or multi-source cue raises it to 2, and
 * an explicit count in the prompt raises it further when larger. The result
 * is passed through `normalizeMinFetchSuccess`.
 *
 * @param prompt User prompt text; a nullish value is treated as empty.
 * @returns The resolved minimum together with the individual signals that produced it.
 */
export function resolveWebFetchRequirementFromPrompt(prompt: string): WebFetchRequirement {
  const text = prompt ?? "";

  const promptSuggestsResearchDepth = hasAnyPattern(text, USER_RESEARCH_DEPTH_PATTERNS);
  const multiSourceCue = hasAnyPattern(text, USER_MULTI_SOURCE_PATTERNS);
  const explicitMinFetchFromPrompt = extractExplicitMinFetchFromPrompt(text);

  // Either cue on its own is enough to ask for a second source.
  const cueFloor = promptSuggestsResearchDepth || multiSourceCue ? 2 : 1;
  // An explicit count can only raise the requirement, never lower it.
  const requested =
    explicitMinFetchFromPrompt === null
      ? cueFloor
      : Math.max(cueFloor, explicitMinFetchFromPrompt);

  return {
    requiredMinFetchSuccess: normalizeMinFetchSuccess(requested),
    promptSuggestsResearchDepth,
    multiSourceCue,
    explicitMinFetchFromPrompt,
  };
}
export function analyzeCrossTurnWebFetchNeed(params: {