fix(agent): enforce sufficient search-fetch evidence
This commit is contained in:
parent
b5b65c6bae
commit
850d55336a
3 changed files with 245 additions and 18 deletions
|
|
@ -44,6 +44,7 @@ import {
|
|||
import type { AuthProfileFailureReason } from "./auth-profiles/index.js";
|
||||
import {
|
||||
analyzeCrossTurnWebFetchNeed,
|
||||
resolveWebFetchRequirementFromPrompt,
|
||||
shouldEnforceWebFetchAfterSearch,
|
||||
summarizeWebToolUsage,
|
||||
type ToolExecutionRecord,
|
||||
|
|
@ -133,15 +134,45 @@ function formatRunLogToolSummary(tool: string, details: Record<string, unknown>
|
|||
}
|
||||
}
|
||||
|
||||
/**
 * Fixed follow-up prompt: one instruction per array entry, joined with
 * newlines. It tells the model that search snippets alone are not sufficient
 * evidence and that it must web_fetch 1-3 of the search result URLs before
 * finalizing its answer.
 *
 * NOTE(review): this text comes from a rendered diff; this constant appears to
 * be the pre-change prompt that the parameterised prompt builder replaces.
 * Confirm against the real file before relying on it.
 */
const WEB_SEARCH_FETCH_ENFORCEMENT_PROMPT = [
|
||||
"You used web_search but did not complete a successful web_fetch in this turn.",
|
||||
"Search snippets are incomplete previews and are not sufficient evidence for detailed claims.",
|
||||
"Before finalizing your answer, you MUST:",
|
||||
"1) Pick the 1-3 most relevant URLs from the web_search results.",
|
||||
"2) Call web_fetch on those URLs.",
|
||||
"3) Revise your answer based on fetched content.",
|
||||
"If all fetch attempts fail, explicitly say so and avoid relying on snippets for specific claims.",
|
||||
].join("\n");
|
||||
/**
 * Builds the follow-up prompt sent to the model when a turn used web_search
 * but its web_fetch coverage is still insufficient.
 *
 * @param params.requiredMinFetchSuccess Minimum number of successful
 *   web_fetch calls this turn is expected to reach.
 * @param params.fetchSuccess Number of successful web_fetch calls so far.
 * @param params.needsFollowupForLatestSearch True when the most recent
 *   successful search has not been followed by a successful fetch; adds an
 *   explicit instruction to fetch from the latest results.
 * @returns The newline-joined prompt text and the number of additional
 *   successful fetches it asks for (always a whole number >= 1).
 */
function buildWebSearchFetchEnforcementPrompt(params: {
  requiredMinFetchSuccess: number;
  fetchSuccess: number;
  needsFollowupForLatestSearch: boolean;
}): { prompt: string; additionalFetchNeeded: number } {
  const { requiredMinFetchSuccess, fetchSuccess, needsFollowupForLatestSearch } = params;

  // The prompt always asks for at least one more fetch, which already covers
  // the "latest search needs a follow-up" case, so no separate term is needed.
  // Non-finite or fractional inputs must never leak into the prompt text
  // (e.g. "at least NaN additional ..."), so fall back to 1 / round up.
  const shortfall = Math.ceil(requiredMinFetchSuccess - fetchSuccess);
  const additionalFetchNeeded = Number.isFinite(shortfall) ? Math.max(1, shortfall) : 1;

  const lines: string[] = [
    "You used web_search, but web evidence coverage for this turn is still incomplete.",
    "Search snippets are incomplete previews and are not sufficient evidence for detailed claims.",
  ];

  // Only mention the numeric requirement when it is a real number above the
  // default of one fetch.
  if (Number.isFinite(requiredMinFetchSuccess) && requiredMinFetchSuccess > 1) {
    lines.push(
      `This task currently requires at least ${requiredMinFetchSuccess} successful web_fetch calls.`,
    );
  }

  if (needsFollowupForLatestSearch) {
    lines.push(
      "You performed another successful web_search after your last successful web_fetch. " +
        "You must fetch URLs from the latest search results before finalizing.",
    );
  }

  lines.push(
    "Before finalizing your answer, you MUST:",
    "1) Pick the 1-3 most relevant URLs from the latest successful web_search results.",
    `2) Complete at least ${additionalFetchNeeded} additional successful web_fetch call(s).`,
    "3) Revise your answer based on fetched page content.",
    "If all additional fetch attempts fail, explicitly say so and avoid relying on snippets for specific claims.",
  );

  return { prompt: lines.join("\n"), additionalFetchNeeded };
}
|
||||
|
||||
const CROSS_TURN_WEB_FETCH_ENFORCEMENT_PROMPT = [
|
||||
"You are about to finalize a web-dependent answer, but no successful web_fetch happened in this turn.",
|
||||
|
|
@ -590,7 +621,10 @@ export class Agent {
|
|||
messages: this.agent.state.messages.length,
|
||||
});
|
||||
await this.agent.prompt(prompt);
|
||||
await this.enforceWebFetchAfterSearchIfNeeded(toolExecutionStartIndex);
|
||||
await this.enforceWebFetchAfterSearchIfNeeded({
|
||||
toolExecutionStartIndex,
|
||||
userPrompt: prompt,
|
||||
});
|
||||
await this.enforceCrossTurnWebFetchIfNeeded({
|
||||
toolExecutionStartIndex,
|
||||
userPrompt: prompt,
|
||||
|
|
@ -816,9 +850,10 @@ export class Agent {
|
|||
this.session.setApiKey(this.currentApiKey);
|
||||
}
|
||||
|
||||
private async enforceWebFetchAfterSearchIfNeeded(
|
||||
toolExecutionStartIndex: number,
|
||||
): Promise<void> {
|
||||
private async enforceWebFetchAfterSearchIfNeeded(params: {
|
||||
toolExecutionStartIndex: number;
|
||||
userPrompt: string;
|
||||
}): Promise<void> {
|
||||
if (this._internalRun) return;
|
||||
|
||||
const activeTools = new Set(
|
||||
|
|
@ -828,32 +863,48 @@ export class Agent {
|
|||
const webFetchAvailable = activeTools.has("web_fetch");
|
||||
|
||||
const currentTurnExecutions = this.currentRunToolExecutions.slice(
|
||||
toolExecutionStartIndex,
|
||||
params.toolExecutionStartIndex,
|
||||
);
|
||||
const usage = summarizeWebToolUsage(currentTurnExecutions);
|
||||
const requirement = resolveWebFetchRequirementFromPrompt(params.userPrompt);
|
||||
|
||||
if (
|
||||
!shouldEnforceWebFetchAfterSearch({
|
||||
usage,
|
||||
webSearchAvailable,
|
||||
webFetchAvailable,
|
||||
requiredMinFetchSuccess: requirement.requiredMinFetchSuccess,
|
||||
})
|
||||
) {
|
||||
return;
|
||||
}
|
||||
|
||||
const { prompt, additionalFetchNeeded } = buildWebSearchFetchEnforcementPrompt({
|
||||
requiredMinFetchSuccess: requirement.requiredMinFetchSuccess,
|
||||
fetchSuccess: usage.fetchSuccess,
|
||||
needsFollowupForLatestSearch: usage.searchNeedsFollowupFetch,
|
||||
});
|
||||
|
||||
this.runLog.log("web_search_fetch_guard", {
|
||||
search_calls: usage.searchCalls,
|
||||
search_success: usage.searchSuccess,
|
||||
search_with_results: usage.searchSuccessWithResults,
|
||||
search_needs_followup_fetch: usage.searchNeedsFollowupFetch,
|
||||
fetch_calls: usage.fetchCalls,
|
||||
fetch_success: usage.fetchSuccess,
|
||||
required_min_fetch_success: requirement.requiredMinFetchSuccess,
|
||||
prompt_suggests_research_depth: requirement.promptSuggestsResearchDepth,
|
||||
prompt_multi_source_cue: requirement.multiSourceCue,
|
||||
prompt_explicit_min_fetch: requirement.explicitMinFetchFromPrompt,
|
||||
});
|
||||
|
||||
try {
|
||||
await this.agent.prompt(WEB_SEARCH_FETCH_ENFORCEMENT_PROMPT);
|
||||
await this.agent.prompt(prompt);
|
||||
this.runLog.log("web_search_fetch_guard_applied", {
|
||||
search_with_results: usage.searchSuccessWithResults,
|
||||
search_needs_followup_fetch: usage.searchNeedsFollowupFetch,
|
||||
required_min_fetch_success: requirement.requiredMinFetchSuccess,
|
||||
additional_fetch_needed: additionalFetchNeeded,
|
||||
});
|
||||
} catch (error) {
|
||||
const message = error instanceof Error ? error.message : String(error);
|
||||
|
|
|
|||
|
|
@ -1,6 +1,7 @@
|
|||
import { describe, expect, it } from "vitest";
|
||||
import {
|
||||
analyzeCrossTurnWebFetchNeed,
|
||||
resolveWebFetchRequirementFromPrompt,
|
||||
shouldEnforceWebFetchAfterSearch,
|
||||
summarizeWebToolUsage,
|
||||
type ToolExecutionRecord,
|
||||
|
|
@ -31,6 +32,7 @@ describe("web-tools-policy", () => {
|
|||
expect(usage.searchCalls).toBe(1);
|
||||
expect(usage.searchSuccess).toBe(1);
|
||||
expect(usage.searchSuccessWithResults).toBe(1);
|
||||
expect(usage.searchNeedsFollowupFetch).toBe(true);
|
||||
expect(usage.fetchCalls).toBe(0);
|
||||
expect(usage.fetchSuccess).toBe(0);
|
||||
});
|
||||
|
|
@ -46,6 +48,22 @@ describe("web-tools-policy", () => {
|
|||
expect(usage.searchCalls).toBe(1);
|
||||
expect(usage.searchSuccess).toBe(0);
|
||||
expect(usage.searchSuccessWithResults).toBe(0);
|
||||
expect(usage.searchNeedsFollowupFetch).toBe(false);
|
||||
});
|
||||
|
||||
// A search with results followed by a successful fetch must not leave a
// pending follow-up requirement.
it("marks latest search as covered when successful fetch follows", () => {
  const searchThenFetch = [
    buildRecord({
      toolName: "web_search",
      details: { count: 1, results: [{}] },
    }),
    buildRecord({
      toolName: "web_fetch",
      details: { status: 200, length: 1024 },
    }),
  ];

  const { searchNeedsFollowupFetch } = summarizeWebToolUsage(searchThenFetch);

  expect(searchNeedsFollowupFetch).toBe(false);
});
|
||||
});
|
||||
|
||||
|
|
@ -88,6 +106,53 @@ describe("web-tools-policy", () => {
|
|||
).toBe(false);
|
||||
});
|
||||
|
||||
// search -> fetch -> search: the trailing search has results but nothing was
// fetched afterwards, so the guard must fire even though one fetch succeeded.
it("enforces when the latest successful search has no follow-up fetch", () => {
  const firstSearch = buildRecord({
    toolName: "web_search",
    details: { count: 2, results: [{}, {}] },
  });
  const fetchAfterFirstSearch = buildRecord({
    toolName: "web_fetch",
    details: { status: 200, length: 1200 },
  });
  const trailingSearch = buildRecord({
    toolName: "web_search",
    details: { count: 3, results: [{}, {}, {}] },
  });

  const usage = summarizeWebToolUsage([firstSearch, fetchAfterFirstSearch, trailingSearch]);

  const mustEnforce = shouldEnforceWebFetchAfterSearch({
    usage,
    webSearchAvailable: true,
    webFetchAvailable: true,
  });

  expect(mustEnforce).toBe(true);
});
|
||||
|
||||
// One successful fetch is not enough when the caller demands two.
it("enforces when prompt requires deeper evidence coverage", () => {
  const searchRecord = buildRecord({
    toolName: "web_search",
    details: { count: 6, results: [{}, {}, {}] },
  });
  const singleFetch = buildRecord({
    toolName: "web_fetch",
    details: { status: 200, length: 2200 },
  });

  const usage = summarizeWebToolUsage([searchRecord, singleFetch]);

  const mustEnforce = shouldEnforceWebFetchAfterSearch({
    usage,
    webSearchAvailable: true,
    webFetchAvailable: true,
    requiredMinFetchSuccess: 2,
  });

  expect(mustEnforce).toBe(true);
});
|
||||
|
||||
it("does not enforce when search returns no results", () => {
|
||||
const usage = summarizeWebToolUsage([
|
||||
buildRecord({
|
||||
|
|
@ -230,4 +295,33 @@ describe("web-tools-policy", () => {
|
|||
expect(analysis.webCue).toBe(false);
|
||||
});
|
||||
});
|
||||
|
||||
// Covers the three ways a prompt can set the minimum fetch requirement:
// research-style wording, an explicit source count, and the default.
describe("resolveWebFetchRequirementFromPrompt", () => {
  it("requires deeper fetch coverage for research-style prompts", () => {
    const { requiredMinFetchSuccess, promptSuggestsResearchDepth } =
      resolveWebFetchRequirementFromPrompt("帮我调研一下 APPLE 最近的产品信息,并做分析。");

    expect(requiredMinFetchSuccess).toBe(2);
    expect(promptSuggestsResearchDepth).toBe(true);
  });

  it("uses explicit minimum source count when present", () => {
    const { requiredMinFetchSuccess, explicitMinFetchFromPrompt } =
      resolveWebFetchRequirementFromPrompt(
        "Please use at least 3 sources and summarize the latest updates.",
      );

    expect(requiredMinFetchSuccess).toBe(3);
    expect(explicitMinFetchFromPrompt).toBe(3);
  });

  it("falls back to 1 for simple prompts", () => {
    const { requiredMinFetchSuccess, promptSuggestsResearchDepth } =
      resolveWebFetchRequirementFromPrompt("What is OpenAI's CEO?");

    expect(requiredMinFetchSuccess).toBe(1);
    expect(promptSuggestsResearchDepth).toBe(false);
  });
});
|
||||
});
|
||||
|
|
|
|||
|
|
@ -8,10 +8,19 @@ export type WebToolUsage = {
|
|||
searchCalls: number;
|
||||
searchSuccess: number;
|
||||
searchSuccessWithResults: number;
|
||||
/** True when the latest successful search (with results) has no later successful fetch. */
|
||||
searchNeedsFollowupFetch: boolean;
|
||||
fetchCalls: number;
|
||||
fetchSuccess: number;
|
||||
};
|
||||
|
||||
/**
 * How much fetched web evidence a user prompt calls for, together with the
 * individual prompt signals that produced that number (kept so callers can
 * log them).
 */
export type WebFetchRequirement = {
|
||||
/** Minimum number of successful web_fetch calls expected for the turn. */
requiredMinFetchSuccess: number;
|
||||
/** True when the prompt matched one of the research-depth patterns. */
promptSuggestsResearchDepth: boolean;
|
||||
/** True when the prompt matched one of the multi-source patterns. */
multiSourceCue: boolean;
|
||||
/** Source count stated explicitly in the prompt, or null when none was found. */
explicitMinFetchFromPrompt: number | null;
|
||||
};
|
||||
|
||||
export type CrossTurnWebFetchGuardAnalysis = {
|
||||
shouldEnforce: boolean;
|
||||
explicitFetchRequest: boolean;
|
||||
|
|
@ -45,6 +54,17 @@ const USER_WEB_CONTEXT_PATTERNS: RegExp[] = [
|
|||
/(?:\u7f51\u9875|\u7f51\u7ad9|\u7f51\u7edc|\u4e92\u8054\u7f51|\u94fe\u63a5|\u6765\u6e90|\u65b0\u95fb|\u62a5\u9053|\u6587\u7ae0)/,
|
||||
];
|
||||
|
||||
/**
 * Patterns signalling that the user wants research-grade depth rather than a
 * quick lookup.
 */
const USER_RESEARCH_DEPTH_PATTERNS: RegExp[] = [
|
||||
// English cue words, matched case-insensitively on word boundaries.
/\b(research|investigate|analysis|analyze|compare|comparison|deep[-\s]?dive|survey|report|review)\b/i,
|
||||
// Chinese cue words (escaped): survey/research, study, analysis, in-depth,
// comparison, cross-reference, summary, roundup, report, assessment, evaluation.
/(?:\u8c03\u7814|\u7814\u7a76|\u5206\u6790|\u6df1\u5ea6|\u5bf9\u6bd4|\u5bf9\u7167|\u6c47\u603b|\u76d8\u70b9|\u62a5\u544a|\u8bc4\u4f30|\u8bc4\u6d4b)/,
|
||||
];
|
||||
|
||||
/**
 * Patterns signalling that the user expects evidence from more than one
 * source.
 */
const USER_MULTI_SOURCE_PATTERNS: RegExp[] = [
|
||||
// English phrasing, matched case-insensitively on word boundaries.
/\b(multiple|multi-source|across sources|different sources)\b/i,
|
||||
// Chinese phrasing (escaped): multi-source, multiple sources,
// different sources, multiple websites.
/(?:\u591a\u6765\u6e90|\u591a\u4e2a\u6765\u6e90|\u4e0d\u540c\u6765\u6e90|\u591a\u7f51\u7ad9)/,
|
||||
// Chinese "at least / no fewer than / minimum" + digits + optional measure
// word + one of: sources, links, URLs, web pages, articles.
/(?:\u81f3\u5c11|\u4e0d\u5c11\u4e8e|\u6700\u5c11)\s*\d+\s*(?:\u4e2a|\u6761)?(?:\u6765\u6e90|\u94fe\u63a5|\u7f51\u5740|\u7f51\u9875|\u6587\u7ae0)/,
|
||||
];
|
||||
|
||||
const USER_WEB_BLOCK_PATTERNS: RegExp[] = [
|
||||
/\b(do not|don't|no|without)\s+(browse|web|internet|web_search|web_fetch|fetch)\b/i,
|
||||
/\bonly\b.*\b(snippet|snippets)\b/i,
|
||||
|
|
@ -63,6 +83,28 @@ function hasAnyPattern(text: string, patterns: RegExp[]): boolean {
|
|||
return patterns.some((pattern) => pattern.test(text));
|
||||
}
|
||||
|
||||
/**
 * Coerces a requested minimum fetch count into a whole number in [1, 4].
 *
 * @param raw Requested count; may be fractional, out of range or non-finite.
 * @returns 1 for NaN / +-Infinity, otherwise `raw` rounded down and clamped
 *   to the inclusive range 1..4.
 */
function normalizeMinFetchSuccess(raw: number): number {
  const lowerBound = 1;
  const upperBound = 4;

  if (Number.isFinite(raw) === false) {
    return lowerBound;
  }

  const wholeCount = Math.floor(raw);
  if (wholeCount < lowerBound) {
    return lowerBound;
  }
  if (wholeCount > upperBound) {
    return upperBound;
  }
  return wholeCount;
}
|
||||
|
||||
/**
 * Looks for an explicit "at least N sources/links/..." request in the prompt.
 *
 * The English pattern is tried first, then the Chinese one; the first pattern
 * that yields a finite number wins.
 *
 * @param prompt User prompt text to scan.
 * @returns The requested count after normalization by
 *   `normalizeMinFetchSuccess`, or null when no pattern yields a number.
 */
function extractExplicitMinFetchFromPrompt(prompt: string): number | null {
  const countMatchers: RegExp[] = [
    /\b(?:at least|minimum of|no less than)\s*(\d+)\s*(?:sources?|links?|urls?|articles?|pages?)\b/i,
    /(?:\u81f3\u5c11|\u4e0d\u5c11\u4e8e|\u6700\u5c11)\s*(\d+)\s*(?:\u4e2a|\u6761)?(?:\u6765\u6e90|\u94fe\u63a5|\u7f51\u5740|\u7f51\u9875|\u6587\u7ae0)/,
  ];

  for (const matcher of countMatchers) {
    const found = prompt.match(matcher);
    if (found) {
      const requested = Number(found[1]);
      // A digit run too long to fit a finite number falls through to the
      // next pattern.
      if (Number.isFinite(requested)) {
        return normalizeMinFetchSuccess(requested);
      }
    }
  }

  return null;
}
|
||||
|
||||
/**
 * Reports whether a tool result's details carry an explicit error flag.
 *
 * @param details Tool execution details, or null when none were recorded.
 * @returns True only when `details.error` is strictly the boolean `true`.
 */
function hasToolError(details: Record<string, unknown> | null): boolean {
  if (details == null) {
    return false;
  }
  return details.error === true;
}
|
||||
|
|
@ -93,9 +135,11 @@ export function summarizeWebToolUsage(records: ToolExecutionRecord[]): WebToolUs
|
|||
searchCalls: 0,
|
||||
searchSuccess: 0,
|
||||
searchSuccessWithResults: 0,
|
||||
searchNeedsFollowupFetch: false,
|
||||
fetchCalls: 0,
|
||||
fetchSuccess: 0,
|
||||
};
|
||||
let pendingSearchWithResults = false;
|
||||
|
||||
for (const record of records) {
|
||||
const toolName = record.toolName.trim().toLowerCase();
|
||||
|
|
@ -106,6 +150,7 @@ export function summarizeWebToolUsage(records: ToolExecutionRecord[]): WebToolUs
|
|||
usage.searchSuccess += 1;
|
||||
if (getSearchResultCount(record.details) > 0) {
|
||||
usage.searchSuccessWithResults += 1;
|
||||
pendingSearchWithResults = true;
|
||||
}
|
||||
}
|
||||
continue;
|
||||
|
|
@ -115,10 +160,12 @@ export function summarizeWebToolUsage(records: ToolExecutionRecord[]): WebToolUs
|
|||
usage.fetchCalls += 1;
|
||||
if (isSuccessfulExecution(record)) {
|
||||
usage.fetchSuccess += 1;
|
||||
pendingSearchWithResults = false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
usage.searchNeedsFollowupFetch = pendingSearchWithResults;
|
||||
return usage;
|
||||
}
|
||||
|
||||
|
|
@ -126,14 +173,49 @@ export function shouldEnforceWebFetchAfterSearch(params: {
|
|||
usage: WebToolUsage;
|
||||
webSearchAvailable: boolean;
|
||||
webFetchAvailable: boolean;
|
||||
requiredMinFetchSuccess?: number;
|
||||
}): boolean {
|
||||
const { usage, webSearchAvailable, webFetchAvailable } = params;
|
||||
const {
|
||||
usage,
|
||||
webSearchAvailable,
|
||||
webFetchAvailable,
|
||||
requiredMinFetchSuccess = 1,
|
||||
} = params;
|
||||
|
||||
if (!webSearchAvailable || !webFetchAvailable) return false;
|
||||
if (usage.searchSuccessWithResults <= 0) return false;
|
||||
if (usage.fetchSuccess > 0) return false;
|
||||
if (usage.fetchSuccess <= 0) return true;
|
||||
if (usage.searchNeedsFollowupFetch) return true;
|
||||
if (usage.fetchSuccess < normalizeMinFetchSuccess(requiredMinFetchSuccess)) return true;
|
||||
|
||||
return true;
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
 * Derives how many successful web_fetch calls a user prompt calls for.
 *
 * The baseline is 1. Research-depth wording or a multi-source cue raises it
 * to 2. An explicit source count in the prompt raises it further when larger.
 * The result is passed through `normalizeMinFetchSuccess` before returning.
 *
 * @param prompt Raw user prompt; a nullish value is treated as empty text.
 * @returns The resolved minimum plus the individual signals behind it.
 */
export function resolveWebFetchRequirementFromPrompt(prompt: string): WebFetchRequirement {
  const text = prompt ?? "";

  const promptSuggestsResearchDepth = hasAnyPattern(text, USER_RESEARCH_DEPTH_PATTERNS);
  const multiSourceCue = hasAnyPattern(text, USER_MULTI_SOURCE_PATTERNS);
  const explicitMinFetchFromPrompt = extractExplicitMinFetchFromPrompt(text);

  // Either wording cue lifts the baseline from 1 to 2.
  const cueBaseline = promptSuggestsResearchDepth || multiSourceCue ? 2 : 1;

  // An explicit count can only raise the requirement, never lower it.
  const unclamped =
    explicitMinFetchFromPrompt === null
      ? cueBaseline
      : Math.max(cueBaseline, explicitMinFetchFromPrompt);

  return {
    requiredMinFetchSuccess: normalizeMinFetchSuccess(unclamped),
    promptSuggestsResearchDepth,
    multiSourceCue,
    explicitMinFetchFromPrompt,
  };
}
|
||||
|
||||
export function analyzeCrossTurnWebFetchNeed(params: {
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue