Merge pull request #218 from multica-ai/codex/web-fetch-evidence-coverage
fix(agent): enforce web search fetch evidence coverage
This commit is contained in:
commit
39fde8e4b0
5 changed files with 790 additions and 8 deletions
|
|
@ -43,6 +43,13 @@ import {
|
|||
type SystemPromptMode,
|
||||
} from "./system-prompt/index.js";
|
||||
import type { AuthProfileFailureReason } from "./auth-profiles/index.js";
|
||||
import {
|
||||
analyzeCrossTurnWebFetchNeed,
|
||||
resolveWebFetchRequirementFromPrompt,
|
||||
shouldEnforceWebFetchAfterSearch,
|
||||
summarizeWebToolUsage,
|
||||
type ToolExecutionRecord,
|
||||
} from "./web-tools-policy.js";
|
||||
import {
|
||||
sanitizeToolCallInputs,
|
||||
sanitizeToolUseResultPairing,
|
||||
|
|
@ -128,6 +135,56 @@ function formatRunLogToolSummary(tool: string, details: Record<string, unknown>
|
|||
}
|
||||
}
|
||||
|
||||
/**
 * Builds the follow-up prompt that is sent when a turn used web_search but its
 * web_fetch coverage is still incomplete.
 *
 * @param params.requiredMinFetchSuccess Minimum number of successful web_fetch
 *   calls required for the turn.
 * @param params.fetchSuccess Number of web_fetch calls that already succeeded.
 * @param params.needsFollowupForLatestSearch Whether the latest successful
 *   search has not been followed by a successful fetch.
 * @returns The prompt text (lines joined with "\n") together with the number
 *   of additional successful fetches it asks for. That number is always a
 *   finite integer >= 1.
 */
function buildWebSearchFetchEnforcementPrompt(params: {
  requiredMinFetchSuccess: number;
  fetchSuccess: number;
  needsFollowupForLatestSearch: boolean;
}): { prompt: string; additionalFetchNeeded: number } {
  const { requiredMinFetchSuccess, fetchSuccess, needsFollowupForLatestSearch } = params;

  // Math.max propagates NaN and keeps fractions, which would surface in the
  // prompt as "at least NaN/1.5 additional ... call(s)". Round the shortfall up
  // and fall back to the floor of 1 when it is not a finite number. The floor
  // of 1 also covers the "latest search needs a follow-up fetch" case, so no
  // separate term is needed for it.
  const shortfall = Math.ceil(requiredMinFetchSuccess - fetchSuccess);
  const additionalFetchNeeded = Number.isFinite(shortfall) ? Math.max(1, shortfall) : 1;

  const lines = [
    "You used web_search, but web evidence coverage for this turn is still incomplete.",
    "Search snippets are incomplete previews and are not sufficient evidence for detailed claims.",
  ];

  // Only mention the explicit minimum when it is stricter than the default of one fetch.
  if (requiredMinFetchSuccess > 1) {
    lines.push(
      `This task currently requires at least ${requiredMinFetchSuccess} successful web_fetch calls.`,
    );
  }

  if (needsFollowupForLatestSearch) {
    lines.push(
      "You performed another successful web_search after your last successful web_fetch. " +
        "You must fetch URLs from the latest search results before finalizing.",
    );
  }

  lines.push(
    "Before finalizing your answer, you MUST:",
    "1) Pick the 1-3 most relevant URLs from the latest successful web_search results.",
    `2) Complete at least ${additionalFetchNeeded} additional successful web_fetch call(s).`,
    "3) Revise your answer based on fetched page content.",
    "If all additional fetch attempts fail, explicitly say so and avoid relying on snippets for specific claims.",
  );

  return { prompt: lines.join("\n"), additionalFetchNeeded };
}
|
||||
|
||||
/**
 * Fixed follow-up prompt sent by the cross-turn guard: it tells the model to
 * run web_fetch (searching first if no URLs are available) before finalizing,
 * and to report the limitation if every fetch fails.
 * The lines are joined with "\n" into a single prompt string.
 */
const CROSS_TURN_WEB_FETCH_ENFORCEMENT_PROMPT = [
|
||||
"You are about to finalize a web-dependent answer, but no successful web_fetch happened in this turn.",
|
||||
"Do not rely only on snippets or prior-turn memory for fresh factual claims.",
|
||||
"Before finalizing your answer, you MUST:",
|
||||
"1) If relevant URLs are already available in this conversation, call web_fetch on 1-3 of them.",
|
||||
"2) If no URLs are available, call web_search to find candidates, then web_fetch on 1-3 relevant URLs.",
|
||||
"3) Revise your answer using fetched page content as primary evidence.",
|
||||
"If all fetch attempts fail, explicitly report that limitation and avoid specific claims not backed by fetched content.",
|
||||
].join("\n");
|
||||
|
||||
export class Agent {
|
||||
private readonly agent: PiAgentCore;
|
||||
private output;
|
||||
|
|
@ -142,6 +199,7 @@ export class Agent {
|
|||
private readonly stderr: NodeJS.WritableStream;
|
||||
private readonly runLog: RunLog;
|
||||
private readonly toolStartTimes = new Map<string, number>();
|
||||
private currentRunToolExecutions: ToolExecutionRecord[] = [];
|
||||
private initialized = false;
|
||||
|
||||
// Context window settings (for pre-flight compaction)
|
||||
|
|
@ -525,6 +583,7 @@ export class Agent {
|
|||
this.currentUserSource = options?.source;
|
||||
this._isRunning = true;
|
||||
this._aborted = false;
|
||||
this.currentRunToolExecutions = [];
|
||||
|
||||
const runStart = Date.now();
|
||||
this.runLog.log("run_start", {
|
||||
|
|
@ -553,6 +612,7 @@ export class Agent {
|
|||
|
||||
// Loop to exhaust all candidate profiles on rotatable errors
|
||||
while (true) {
|
||||
const toolExecutionStartIndex = this.currentRunToolExecutions.length;
|
||||
try {
|
||||
const llmStart = Date.now();
|
||||
this.runLog.log("llm_call", {
|
||||
|
|
@ -562,6 +622,14 @@ export class Agent {
|
|||
messages: this.agent.state.messages.length,
|
||||
});
|
||||
await this.agent.prompt(prompt);
|
||||
await this.enforceWebFetchAfterSearchIfNeeded({
|
||||
toolExecutionStartIndex,
|
||||
userPrompt: prompt,
|
||||
});
|
||||
await this.enforceCrossTurnWebFetchIfNeeded({
|
||||
toolExecutionStartIndex,
|
||||
userPrompt: prompt,
|
||||
});
|
||||
this.runLog.log("llm_result", {
|
||||
duration_ms: Date.now() - llmStart,
|
||||
});
|
||||
|
|
@ -693,6 +761,7 @@ export class Agent {
|
|||
this._lastEventSavedAssistant = undefined;
|
||||
this.currentUserDisplayPrompt = undefined;
|
||||
this.currentUserSource = undefined;
|
||||
this.currentRunToolExecutions = [];
|
||||
this.runLog.flush().catch(() => {});
|
||||
}
|
||||
}
|
||||
|
|
@ -782,6 +851,125 @@ export class Agent {
|
|||
this.session.setApiKey(this.currentApiKey);
|
||||
}
|
||||
|
||||
/**
 * Search-then-fetch guard for the current LLM attempt.
 *
 * Summarizes the web tool executions recorded since
 * `params.toolExecutionStartIndex`, derives the fetch requirement from the
 * user prompt, and, when the policy says coverage is insufficient, sends one
 * follow-up prompt asking the model to fetch before finalizing.
 *
 * Does nothing for internal runs. A failure of the follow-up prompt is written
 * to the run log (and to stderr in debug mode) and is not rethrown.
 *
 * NOTE(review): the guard does not look at the abort flag before prompting
 * again — confirm whether an aborted run can reach this point.
 *
 * @param params.toolExecutionStartIndex Index into the recorded tool
 *   executions where this attempt started.
 * @param params.userPrompt The prompt text used to resolve the fetch requirement.
 */
private async enforceWebFetchAfterSearchIfNeeded(params: {
  toolExecutionStartIndex: number;
  userPrompt: string;
}): Promise<void> {
  if (this._internalRun) {
    return;
  }

  // Tool names are compared case-insensitively.
  const toolNames = new Set<string>();
  for (const tool of this.agent.state.tools ?? []) {
    toolNames.add(tool.name.toLowerCase());
  }

  // Only executions recorded since this attempt started count toward coverage.
  const usage = summarizeWebToolUsage(
    this.currentRunToolExecutions.slice(params.toolExecutionStartIndex),
  );
  const requirement = resolveWebFetchRequirementFromPrompt(params.userPrompt);
  const requiredMinFetchSuccess = requirement.requiredMinFetchSuccess;

  const guardNeeded = shouldEnforceWebFetchAfterSearch({
    usage,
    webSearchAvailable: toolNames.has("web_search"),
    webFetchAvailable: toolNames.has("web_fetch"),
    requiredMinFetchSuccess,
  });
  if (!guardNeeded) {
    return;
  }

  const enforcement = buildWebSearchFetchEnforcementPrompt({
    requiredMinFetchSuccess,
    fetchSuccess: usage.fetchSuccess,
    needsFollowupForLatestSearch: usage.searchNeedsFollowupFetch,
  });

  this.runLog.log("web_search_fetch_guard", {
    search_calls: usage.searchCalls,
    search_success: usage.searchSuccess,
    search_with_results: usage.searchSuccessWithResults,
    search_needs_followup_fetch: usage.searchNeedsFollowupFetch,
    fetch_calls: usage.fetchCalls,
    fetch_success: usage.fetchSuccess,
    required_min_fetch_success: requiredMinFetchSuccess,
    prompt_suggests_research_depth: requirement.promptSuggestsResearchDepth,
    prompt_multi_source_cue: requirement.multiSourceCue,
    prompt_explicit_min_fetch: requirement.explicitMinFetchFromPrompt,
  });

  try {
    await this.agent.prompt(enforcement.prompt);
    this.runLog.log("web_search_fetch_guard_applied", {
      search_with_results: usage.searchSuccessWithResults,
      search_needs_followup_fetch: usage.searchNeedsFollowupFetch,
      required_min_fetch_success: requiredMinFetchSuccess,
      additional_fetch_needed: enforcement.additionalFetchNeeded,
    });
  } catch (error) {
    // Best effort: record the failure instead of failing the whole run.
    let message: string;
    if (error instanceof Error) {
      message = error.message;
    } else {
      message = String(error);
    }
    this.runLog.log("web_search_fetch_guard_failed", {
      error: message.slice(0, 200),
    });
    if (this.debug) {
      this.stderr.write(`[web-guard] Failed to enforce search->fetch: ${message}\n`);
    }
  }
}
|
||||
|
||||
/**
 * Cross-turn fetch guard for the current LLM attempt.
 *
 * Runs the cross-turn analysis over the web tool executions recorded since
 * `params.toolExecutionStartIndex`, the user prompt and the last assistant
 * text. When the analysis asks for enforcement, sends the fixed cross-turn
 * enforcement prompt once.
 *
 * Does nothing for internal runs. A failure of the follow-up prompt is written
 * to the run log (and to stderr in debug mode) and is not rethrown.
 *
 * @param params.toolExecutionStartIndex Index into the recorded tool
 *   executions where this attempt started.
 * @param params.userPrompt The prompt text handed to the analysis.
 */
private async enforceCrossTurnWebFetchIfNeeded(params: {
  toolExecutionStartIndex: number;
  userPrompt: string;
}): Promise<void> {
  if (this._internalRun) {
    return;
  }

  // Tool names are compared case-insensitively.
  const toolNames = new Set<string>();
  for (const tool of this.agent.state.tools ?? []) {
    toolNames.add(tool.name.toLowerCase());
  }

  const usage = summarizeWebToolUsage(
    this.currentRunToolExecutions.slice(params.toolExecutionStartIndex),
  );
  const analysis = analyzeCrossTurnWebFetchNeed({
    usage,
    webFetchAvailable: toolNames.has("web_fetch"),
    userPrompt: params.userPrompt,
    assistantText: this.output.state.lastAssistantText ?? "",
  });

  if (!analysis.shouldEnforce) {
    return;
  }

  this.runLog.log("web_cross_turn_fetch_guard", {
    fetch_calls: usage.fetchCalls,
    fetch_success: usage.fetchSuccess,
    explicit_fetch_request: analysis.explicitFetchRequest,
    user_provides_url: analysis.userProvidesUrl,
    freshness_cue: analysis.freshnessCue,
    web_cue: analysis.webCue,
    user_needs_fresh_web_evidence: analysis.userNeedsFreshWebEvidence,
    user_blocks_web_fetch: analysis.userBlocksWebFetch,
    assistant_web_claim_signal: analysis.assistantHasWebClaimSignal,
  });

  try {
    await this.agent.prompt(CROSS_TURN_WEB_FETCH_ENFORCEMENT_PROMPT);
    this.runLog.log("web_cross_turn_fetch_guard_applied", {
      explicit_fetch_request: analysis.explicitFetchRequest,
      user_provides_url: analysis.userProvidesUrl,
    });
  } catch (error) {
    // Best effort: record the failure instead of failing the whole run.
    let message: string;
    if (error instanceof Error) {
      message = error.message;
    } else {
      message = String(error);
    }
    this.runLog.log("web_cross_turn_fetch_guard_failed", {
      error: message.slice(0, 200),
    });
    if (this.debug) {
      this.stderr.write(`[web-cross-turn-guard] Failed to enforce fetch: ${message}\n`);
    }
  }
}
|
||||
|
||||
private handleRunLogEvent(event: AgentEvent) {
|
||||
if (event.type === "tool_execution_start") {
|
||||
const toolName = (event as any).toolName ?? "unknown";
|
||||
|
|
@ -801,11 +989,18 @@ export class Agent {
|
|||
const resultText = extractRunLogResultText(result);
|
||||
const resultChars = resultText?.length ?? 0;
|
||||
const details = extractRunLogResultDetails(result);
|
||||
const isError = Boolean((event as any).isError ?? false);
|
||||
|
||||
this.currentRunToolExecutions.push({
|
||||
toolName,
|
||||
isError,
|
||||
details,
|
||||
});
|
||||
|
||||
const toolEndData: Record<string, unknown> = {
|
||||
tool: toolName,
|
||||
duration_ms,
|
||||
is_error: (event as any).isError ?? false,
|
||||
is_error: isError,
|
||||
result_chars: resultChars,
|
||||
result_summary: formatRunLogToolSummary(toolName, details),
|
||||
};
|
||||
|
|
|
|||
|
|
@ -181,7 +181,7 @@ describe("buildConditionalToolSections", () => {
|
|||
const result = buildConditionalToolSections(["web_search"], "full");
|
||||
const text = result.join("\n");
|
||||
expect(text).toContain("## Web Access");
|
||||
expect(text).toContain("Web usage is conditional, not mandatory");
|
||||
expect(text).toContain("you MUST call web_fetch");
|
||||
});
|
||||
|
||||
it("adds dynamic evidence decision guidance when data tool is present", () => {
|
||||
|
|
|
|||
|
|
@ -364,14 +364,12 @@ export function buildConditionalToolSections(
|
|||
"## Web Access",
|
||||
"You have web access. Use it when the user asks about current events, needs up-to-date information, or requests content from URLs.",
|
||||
"Prefer web_search for discovery and web_fetch for specific URLs.",
|
||||
"Web usage is conditional, not mandatory: call web tools when they materially improve evidence quality.",
|
||||
"When web_search is used, treat snippets as incomplete previews rather than final evidence.",
|
||||
"",
|
||||
"### Search-then-Fetch",
|
||||
"After web_search, evaluate whether the snippets contain enough detail to answer accurately.",
|
||||
"If not, use web_fetch on the 1-3 most relevant URLs to get full content before answering.",
|
||||
"Always fetch when the user asks for detailed explanations, comparisons, or analysis;",
|
||||
"when snippets are vague or contradictory; or when the question requires specific data points.",
|
||||
"Skip fetch when the answer is a simple fact clearly stated in the snippet or the user only wants a quick overview.",
|
||||
"After every successful web_search, you MUST call web_fetch on 1-3 relevant URLs before detailed reasoning or factual claims.",
|
||||
"Use fetched page content (not snippets) as the primary evidence for analysis and synthesis.",
|
||||
"If all fetch attempts fail, explicitly report that limitation and avoid specific claims derived only from snippets.",
|
||||
"",
|
||||
);
|
||||
}
|
||||
|
|
|
|||
327
packages/core/src/agent/web-tools-policy.test.ts
Normal file
327
packages/core/src/agent/web-tools-policy.test.ts
Normal file
|
|
@ -0,0 +1,327 @@
|
|||
import { describe, expect, it } from "vitest";
|
||||
import {
|
||||
analyzeCrossTurnWebFetchNeed,
|
||||
resolveWebFetchRequirementFromPrompt,
|
||||
shouldEnforceWebFetchAfterSearch,
|
||||
summarizeWebToolUsage,
|
||||
type ToolExecutionRecord,
|
||||
} from "./web-tools-policy.js";
|
||||
|
||||
/**
 * Test factory for a tool execution record.
 *
 * @param params.toolName Name of the tool that ran.
 * @param params.isError Optional error flag; a missing value becomes `false`.
 * @param params.details Optional result details; a missing value becomes `null`.
 * @returns A fully populated record.
 */
function buildRecord(params: {
  toolName: string;
  isError?: boolean;
  details?: Record<string, unknown> | null;
}): ToolExecutionRecord {
  const record: ToolExecutionRecord = {
    toolName: params.toolName,
    isError: params.isError == null ? false : params.isError,
    details: params.details == null ? null : params.details,
  };
  return record;
}
|
||||
|
||||
// Unit tests for the web tools policy helpers. Fixtures are built through the
// small factories below so each case only spells out what it varies.
describe("web-tools-policy", () => {
  /** Successful web_search record reporting `count` hits and listing `listed` result objects. */
  const searchHit = (count: number, listed: number = count): ToolExecutionRecord =>
    buildRecord({
      toolName: "web_search",
      details: { count, results: Array.from({ length: listed }, () => ({})) },
    });

  /** Successful web_fetch record with HTTP 200 and the given body length. */
  const fetchOk = (length: number): ToolExecutionRecord =>
    buildRecord({ toolName: "web_fetch", details: { status: 200, length } });

  /** Record whose details carry a tool-level error payload. */
  const failed = (toolName: string, code: string): ToolExecutionRecord =>
    buildRecord({ toolName, details: { error: true, code } });

  /** Runs the search->fetch policy with both tools available unless overridden. */
  const decide = (
    records: ToolExecutionRecord[],
    overrides: { webFetchAvailable?: boolean; requiredMinFetchSuccess?: number } = {},
  ): boolean =>
    shouldEnforceWebFetchAfterSearch({
      usage: summarizeWebToolUsage(records),
      webSearchAvailable: true,
      webFetchAvailable: true,
      ...overrides,
    });

  /** Runs the cross-turn analysis with web_fetch available. */
  const analyze = (
    userPrompt: string,
    assistantText: string,
    records: ToolExecutionRecord[] = [],
  ) =>
    analyzeCrossTurnWebFetchNeed({
      usage: summarizeWebToolUsage(records),
      webFetchAvailable: true,
      userPrompt,
      assistantText,
    });

  describe("summarizeWebToolUsage", () => {
    it("counts successful web_search calls with results", () => {
      const usage = summarizeWebToolUsage([searchHit(3)]);

      expect(usage.searchCalls).toBe(1);
      expect(usage.searchSuccess).toBe(1);
      expect(usage.searchSuccessWithResults).toBe(1);
      expect(usage.searchNeedsFollowupFetch).toBe(true);
      expect(usage.fetchCalls).toBe(0);
      expect(usage.fetchSuccess).toBe(0);
    });

    it("does not count tool-level error payload as success", () => {
      const usage = summarizeWebToolUsage([failed("web_search", "search_failed")]);

      expect(usage.searchCalls).toBe(1);
      expect(usage.searchSuccess).toBe(0);
      expect(usage.searchSuccessWithResults).toBe(0);
      expect(usage.searchNeedsFollowupFetch).toBe(false);
    });

    it("marks latest search as covered when successful fetch follows", () => {
      const usage = summarizeWebToolUsage([searchHit(1), fetchOk(1024)]);

      expect(usage.searchNeedsFollowupFetch).toBe(false);
    });
  });

  describe("shouldEnforceWebFetchAfterSearch", () => {
    it("enforces when search has results but fetch never succeeded", () => {
      expect(decide([searchHit(2)])).toBe(true);
    });

    it("does not enforce after a successful web_fetch", () => {
      expect(decide([searchHit(2), fetchOk(1024)])).toBe(false);
    });

    it("enforces when the latest successful search has no follow-up fetch", () => {
      expect(decide([searchHit(2), fetchOk(1200), searchHit(3)])).toBe(true);
    });

    it("enforces when prompt requires deeper evidence coverage", () => {
      // The search reports 6 hits but lists only 3 result objects.
      const records = [searchHit(6, 3), fetchOk(2200)];

      expect(decide(records, { requiredMinFetchSuccess: 2 })).toBe(true);
    });

    it("does not enforce when search returns no results", () => {
      expect(decide([searchHit(0)])).toBe(false);
    });

    it("does not enforce when web_fetch is unavailable", () => {
      expect(decide([searchHit(1)], { webFetchAvailable: false })).toBe(false);
    });

    it("enforces when fetch was attempted but failed", () => {
      expect(decide([searchHit(1), failed("web_fetch", "fetch_failed")])).toBe(true);
    });
  });

  describe("analyzeCrossTurnWebFetchNeed", () => {
    it("enforces when user explicitly asks to refetch page content", () => {
      const analysis = analyze(
        "Please refetch the page body this turn and verify with sources.",
        "Here is a quick summary.",
      );

      expect(analysis.shouldEnforce).toBe(true);
      expect(analysis.explicitFetchRequest).toBe(true);
    });

    it("enforces for freshness requests when assistant makes web-style claims", () => {
      const analysis = analyze(
        "Give me the latest web news about OpenAI with sources.",
        "According to Reuters, OpenAI announced a new release.",
      );

      expect(analysis.shouldEnforce).toBe(true);
      expect(analysis.freshnessCue).toBe(true);
      expect(analysis.webCue).toBe(true);
      expect(analysis.assistantHasWebClaimSignal).toBe(true);
    });

    it("does not enforce when a fetch was already attempted in this turn", () => {
      const analysis = analyze(
        "Please verify with the latest web sources.",
        "According to Reuters, ...",
        [failed("web_fetch", "fetch_failed")],
      );

      expect(analysis.shouldEnforce).toBe(false);
    });

    it("does not enforce when user explicitly blocks web fetch", () => {
      const analysis = analyze(
        "Do not browse the web, only use snippets.",
        "According to Reuters, ...",
      );

      expect(analysis.shouldEnforce).toBe(false);
      expect(analysis.userBlocksWebFetch).toBe(true);
    });

    it("enforces when user provides a direct URL but no fetch happened", () => {
      const analysis = analyze(
        "Summarize https://example.com/article and include key takeaways.",
        "I can summarize it for you.",
      );

      expect(analysis.shouldEnforce).toBe(true);
      expect(analysis.userProvidesUrl).toBe(true);
    });

    it("does not enforce for non-web freshness requests", () => {
      const analysis = analyze(
        "What is the latest version in this repository?",
        "The latest version is 1.2.3.",
      );

      expect(analysis.shouldEnforce).toBe(false);
      expect(analysis.freshnessCue).toBe(true);
      expect(analysis.webCue).toBe(false);
    });
  });

  describe("resolveWebFetchRequirementFromPrompt", () => {
    it("requires deeper fetch coverage for research-style prompts", () => {
      const requirement = resolveWebFetchRequirementFromPrompt(
        "帮我调研一下 APPLE 最近的产品信息,并做分析。",
      );

      expect(requirement.requiredMinFetchSuccess).toBe(2);
      expect(requirement.promptSuggestsResearchDepth).toBe(true);
    });

    it("uses explicit minimum source count when present", () => {
      const requirement = resolveWebFetchRequirementFromPrompt(
        "Please use at least 3 sources and summarize the latest updates.",
      );

      expect(requirement.requiredMinFetchSuccess).toBe(3);
      expect(requirement.explicitMinFetchFromPrompt).toBe(3);
    });

    it("falls back to 1 for simple prompts", () => {
      const requirement = resolveWebFetchRequirementFromPrompt("What is OpenAI's CEO?");

      expect(requirement.requiredMinFetchSuccess).toBe(1);
      expect(requirement.promptSuggestsResearchDepth).toBe(false);
    });
  });
});
|
||||
262
packages/core/src/agent/web-tools-policy.ts
Normal file
262
packages/core/src/agent/web-tools-policy.ts
Normal file
|
|
@ -0,0 +1,262 @@
|
|||
/** One finished tool execution, as consumed by the web tool usage summary. */
export type ToolExecutionRecord = {
|
||||
// Tool name; the usage summary trims and lower-cases it before comparing
// against "web_search" / "web_fetch".
toolName: string;
|
||||
// When true the execution is never counted as successful.
isError: boolean;
|
||||
// Result details, or null. The policy reads `error` (=== true marks a
// failure) and, for searches, `count` / `results` to determine the hit count.
details: Record<string, unknown> | null;
|
||||
};
|
||||
|
||||
/** Aggregated web_search / web_fetch counters for a list of tool executions. */
export type WebToolUsage = {
|
||||
// Number of web_search executions, successful or not.
searchCalls: number;
|
||||
// web_search executions without an error flag or error payload.
searchSuccess: number;
|
||||
// Successful web_search executions whose result count is greater than zero.
searchSuccessWithResults: number;
|
||||
/** True when the latest successful search (with results) has no later successful fetch. */
|
||||
searchNeedsFollowupFetch: boolean;
|
||||
// Number of web_fetch executions, successful or not.
fetchCalls: number;
|
||||
// web_fetch executions without an error flag or error payload.
fetchSuccess: number;
|
||||
};
|
||||
|
||||
/** Fetch coverage requirement derived from the wording of a user prompt. */
export type WebFetchRequirement = {
|
||||
// Minimum number of successful web_fetch calls; the resolver clamps it to 1..4.
requiredMinFetchSuccess: number;
|
||||
// True when the prompt matched one of the research-depth patterns.
promptSuggestsResearchDepth: boolean;
|
||||
// True when the prompt matched one of the multi-source patterns.
multiSourceCue: boolean;
|
||||
// Source count explicitly requested in the prompt (already clamped to 1..4),
// or null when the prompt states none.
explicitMinFetchFromPrompt: number | null;
|
||||
};
|
||||
|
||||
/** Result of the cross-turn fetch analysis: the decision plus the signals behind it. */
export type CrossTurnWebFetchGuardAnalysis = {
|
||||
// Final decision: whether a fetch should be enforced before finalizing.
shouldEnforce: boolean;
|
||||
// The user prompt matched an explicit fetch / cite-sources pattern.
explicitFetchRequest: boolean;
|
||||
// The user prompt contains an http(s) URL.
userProvidesUrl: boolean;
|
||||
// The user prompt matched a freshness pattern (e.g. "latest", "news").
freshnessCue: boolean;
|
||||
// The user prompt contains a URL or matched a web-context pattern.
webCue: boolean;
|
||||
// explicitFetchRequest, userProvidesUrl, or (freshnessCue and webCue).
userNeedsFreshWebEvidence: boolean;
|
||||
// The user prompt matched a pattern that forbids web access / fetching.
userBlocksWebFetch: boolean;
|
||||
// The assistant text contains a URL or matched a web-claim pattern.
assistantHasWebClaimSignal: boolean;
|
||||
};
|
||||
|
||||
// Heuristic pattern tables used by the policy functions in this file.
// Most tables pair an English regex with CJK equivalents written as \u escapes.
// None of the patterns uses the global or sticky flag.

// Matches an http(s) URL up to the next whitespace or ")".
const URL_PATTERN = /https?:\/\/[^\s)]+/i;
|
||||
|
||||
// User wording that explicitly asks to (re)fetch pages, verify, or cite sources/links.
const USER_EXPLICIT_FETCH_PATTERNS: RegExp[] = [
|
||||
/\b(re[-\s]?fetch|fetch (again|fresh)|verify with sources?|cite sources?|provide (sources?|links?))\b/i,
|
||||
/\b(revisit|revalidate|double-check)\b.*\b(source|link|url|web|website)\b/i,
|
||||
/(?:\u672c\u8f6e|\u8fd9\u4e00\u8f6e).*(?:\u91cd\u65b0|\u518d\u6b21).*(?:\u6293\u53d6|\u83b7\u53d6|\u62c9\u53d6)/,
|
||||
/(?:\u91cd\u65b0|\u518d\u6b21).*(?:\u6293\u53d6|\u83b7\u53d6).*(?:\u7f51\u9875|\u6b63\u6587|\u539f\u6587|\u94fe\u63a5)/,
|
||||
/(?:\u7ed9\u51fa|\u63d0\u4f9b).*(?:\u6765\u6e90|\u94fe\u63a5|\u5f15\u7528)/,
|
||||
/(?:\u6838\u5b9e|\u67e5\u8bc1|\u9a8c\u8bc1).*(?:\u6765\u6e90|\u7f51\u9875)/,
|
||||
];
|
||||
|
||||
// User wording that signals a need for recent information ("latest", "today", "news", ...).
const USER_FRESHNESS_PATTERNS: RegExp[] = [
|
||||
/\b(latest|most recent|recent|today|current|up-to-date|newest|breaking)\b/i,
|
||||
/\b(news|update|updates)\b/i,
|
||||
/(?:\u6700\u65b0|\u6700\u8fd1|\u4eca\u5929|\u5f53\u524d|\u8fd1\u671f|\u52a8\u6001|\u65b0\u95fb|\u8d44\u8baf)/,
|
||||
];
|
||||
|
||||
// User wording that places the request in a web context ("web", "link", "article", "source", ...).
const USER_WEB_CONTEXT_PATTERNS: RegExp[] = [
|
||||
/\b(web|internet|online|url|urls|link|links|website|article|source|sources|news)\b/i,
|
||||
/(?:\u7f51\u9875|\u7f51\u7ad9|\u7f51\u7edc|\u4e92\u8054\u7f51|\u94fe\u63a5|\u6765\u6e90|\u65b0\u95fb|\u62a5\u9053|\u6587\u7ae0)/,
|
||||
];
|
||||
|
||||
// User wording that suggests research-style depth ("research", "analyze", "compare", "report", ...).
const USER_RESEARCH_DEPTH_PATTERNS: RegExp[] = [
|
||||
/\b(research|investigate|analysis|analyze|compare|comparison|deep[-\s]?dive|survey|report|review)\b/i,
|
||||
/(?:\u8c03\u7814|\u7814\u7a76|\u5206\u6790|\u6df1\u5ea6|\u5bf9\u6bd4|\u5bf9\u7167|\u6c47\u603b|\u76d8\u70b9|\u62a5\u544a|\u8bc4\u4f30|\u8bc4\u6d4b)/,
|
||||
];
|
||||
|
||||
// User wording that asks for several sources; the last pattern matches a
// CJK "at least N sources/links/pages" phrasing.
const USER_MULTI_SOURCE_PATTERNS: RegExp[] = [
|
||||
/\b(multiple|multi-source|across sources|different sources)\b/i,
|
||||
/(?:\u591a\u6765\u6e90|\u591a\u4e2a\u6765\u6e90|\u4e0d\u540c\u6765\u6e90|\u591a\u7f51\u7ad9)/,
|
||||
/(?:\u81f3\u5c11|\u4e0d\u5c11\u4e8e|\u6700\u5c11)\s*\d+\s*(?:\u4e2a|\u6761)?(?:\u6765\u6e90|\u94fe\u63a5|\u7f51\u5740|\u7f51\u9875|\u6587\u7ae0)/,
|
||||
];
|
||||
|
||||
// User wording that forbids web access or restricts the answer to snippets.
const USER_WEB_BLOCK_PATTERNS: RegExp[] = [
|
||||
/\b(do not|don't|no|without)\s+(browse|web|internet|web_search|web_fetch|fetch)\b/i,
|
||||
/\bonly\b.*\b(snippet|snippets)\b/i,
|
||||
/(?:\u4e0d\u8981|\u4e0d\u9700)\s*(?:\u8054\u7f51|\u6293\u53d6|\u641c\u7d22|\u83b7\u53d6\u7f51\u9875|web_fetch|web_search)/,
|
||||
/(?:\u4ec5|\u53ea).*(?:snippet|\u6458\u8981)/i,
|
||||
];
|
||||
|
||||
// Assistant wording that reads like a claim attributed to web sources
// ("according to", named news outlets, "press release", ...).
const ASSISTANT_WEB_CLAIM_PATTERNS: RegExp[] = [
|
||||
/\b(according to|reported by|as reported|source|sources|citation|cited|press release)\b/i,
|
||||
/\b(reuters|bloomberg|associated press|ap news|financial times|wall street journal)\b/i,
|
||||
/(?:\u636e[^。\n]{0,24}(?:\u62a5\u9053|\u663e\u793a|\u79f0)|\u6765\u6e90|\u62a5\u9053\u79f0|\u516c\u544a|\u53d1\u5e03|\u5ba3\u5e03)/,
|
||||
];
|
||||
|
||||
/**
 * Reports whether `text` matches at least one of `patterns`.
 *
 * @param text Text to inspect; empty or whitespace-only text never matches.
 * @param patterns Regular expressions to try, in order.
 * @returns True as soon as one pattern matches, otherwise false.
 */
function hasAnyPattern(text: string, patterns: RegExp[]): boolean {
  if (!text.trim()) return false;
  return patterns.some((pattern) => {
    // RegExp#test resumes from lastIndex for global/sticky regexes, so a
    // shared pattern with the g or y flag would give alternating answers
    // across calls. Resetting makes every check start at the beginning of
    // the text; it is a no-op for patterns without those flags.
    pattern.lastIndex = 0;
    return pattern.test(text);
  });
}
|
||||
|
||||
/**
 * Clamps a requested minimum fetch count to a whole number between 1 and 4.
 *
 * @param raw Requested minimum; fractions are rounded down.
 * @returns The clamped integer, or 1 when `raw` is NaN or infinite.
 */
function normalizeMinFetchSuccess(raw: number): number {
  if (Number.isFinite(raw)) {
    const whole = Math.floor(raw);
    if (whole < 1) {
      return 1;
    }
    return whole > 4 ? 4 : whole;
  }
  return 1;
}
|
||||
|
||||
// Phrasings of "at least N sources/links/urls/articles/pages": one English
// pattern and one CJK pattern (written as \u escapes). The first capture
// group is the number.
const EXPLICIT_MIN_FETCH_PATTERNS: readonly RegExp[] = [
  /\b(?:at least|minimum of|no less than)\s*(\d+)\s*(?:sources?|links?|urls?|articles?|pages?)\b/i,
  /(?:\u81f3\u5c11|\u4e0d\u5c11\u4e8e|\u6700\u5c11)\s*(\d+)\s*(?:\u4e2a|\u6761)?(?:\u6765\u6e90|\u94fe\u63a5|\u7f51\u5740|\u7f51\u9875|\u6587\u7ae0)/,
];

/**
 * Extracts an explicitly requested minimum number of sources from a prompt.
 *
 * @param prompt User prompt text.
 * @returns The requested count, normalized by `normalizeMinFetchSuccess`, for
 *   the first pattern that matches with a finite number; `null` otherwise.
 */
function extractExplicitMinFetchFromPrompt(prompt: string): number | null {
  for (const matcher of EXPLICIT_MIN_FETCH_PATTERNS) {
    const hit = matcher.exec(prompt);
    if (hit === null) continue;

    const requested = Number(hit[1]);
    if (Number.isFinite(requested)) {
      return normalizeMinFetchSuccess(requested);
    }
  }

  return null;
}
|
||||
|
||||
/**
 * Reports whether tool result details carry an error marker.
 *
 * @param details Result details, or null.
 * @returns True only when `details.error` is exactly `true`.
 */
function hasToolError(details: Record<string, unknown> | null): boolean {
  if (details == null) {
    return false;
  }
  return details.error === true;
}
|
||||
|
||||
/**
 * Determines how many results a search execution reported.
 *
 * A finite numeric `count` wins (rounded down, never negative); otherwise the
 * length of a `results` array is used.
 *
 * @param details Search result details, or null.
 * @returns The result count, or 0 when neither field is usable.
 */
function getSearchResultCount(details: Record<string, unknown> | null): number {
  if (details) {
    const declared = details.count;
    if (typeof declared === "number" && Number.isFinite(declared)) {
      return Math.max(0, Math.floor(declared));
    }

    const listed = details.results;
    return Array.isArray(listed) ? listed.length : 0;
  }

  return 0;
}
|
||||
|
||||
/**
 * Reports whether a tool execution counts as successful.
 *
 * @param record The execution to inspect.
 * @returns False when the record is flagged as an error or its details carry
 *   an error payload; true otherwise.
 */
function isSuccessfulExecution(record: ToolExecutionRecord): boolean {
  const failed = record.isError || hasToolError(record.details);
  return !failed;
}
|
||||
|
||||
/**
 * Aggregates web_search / web_fetch activity from an ordered list of tool
 * executions.
 *
 * Tool names are trimmed and compared case-insensitively; other tools are
 * ignored. `searchNeedsFollowupFetch` ends up true when a successful search
 * with results is not followed by any successful fetch.
 *
 * @param records Tool executions in the order they happened.
 * @returns The usage counters for the given records.
 */
export function summarizeWebToolUsage(records: ToolExecutionRecord[]): WebToolUsage {
  let searchCalls = 0;
  let searchSuccess = 0;
  let searchSuccessWithResults = 0;
  let fetchCalls = 0;
  let fetchSuccess = 0;
  // Set by a successful search with results, cleared by any later successful fetch.
  let awaitingFetch = false;

  for (const entry of records) {
    switch (entry.toolName.trim().toLowerCase()) {
      case "web_search": {
        searchCalls += 1;
        if (!isSuccessfulExecution(entry)) break;
        searchSuccess += 1;
        if (getSearchResultCount(entry.details) > 0) {
          searchSuccessWithResults += 1;
          awaitingFetch = true;
        }
        break;
      }
      case "web_fetch": {
        fetchCalls += 1;
        if (isSuccessfulExecution(entry)) {
          fetchSuccess += 1;
          awaitingFetch = false;
        }
        break;
      }
      default:
        break;
    }
  }

  return {
    searchCalls,
    searchSuccess,
    searchSuccessWithResults,
    searchNeedsFollowupFetch: awaitingFetch,
    fetchCalls,
    fetchSuccess,
  };
}
|
||||
|
||||
/**
 * Decides whether a follow-up web_fetch must be enforced after web_search.
 *
 * @param params.usage Web tool usage for the turn.
 * @param params.webSearchAvailable Whether web_search is an active tool.
 * @param params.webFetchAvailable Whether web_fetch is an active tool.
 * @param params.requiredMinFetchSuccess Minimum successful fetches required;
 *   defaults to 1 and is normalized before comparison.
 * @returns True when both tools are available, at least one search returned
 *   results, and either no fetch succeeded, the latest search still needs a
 *   follow-up fetch, or fewer fetches succeeded than required.
 */
export function shouldEnforceWebFetchAfterSearch(params: {
  usage: WebToolUsage;
  webSearchAvailable: boolean;
  webFetchAvailable: boolean;
  requiredMinFetchSuccess?: number;
}): boolean {
  const { usage } = params;
  const minimum =
    params.requiredMinFetchSuccess === undefined ? 1 : params.requiredMinFetchSuccess;

  const toolsReady = params.webSearchAvailable && params.webFetchAvailable;
  if (!toolsReady || usage.searchSuccessWithResults <= 0) {
    return false;
  }

  return (
    usage.fetchSuccess <= 0 ||
    usage.searchNeedsFollowupFetch ||
    usage.fetchSuccess < normalizeMinFetchSuccess(minimum)
  );
}
|
||||
|
||||
/**
 * Derives the fetch coverage requirement from the wording of a user prompt.
 *
 * The baseline is 1 successful fetch, raised to 2 when the prompt suggests
 * research depth or multiple sources, and raised further to an explicitly
 * requested source count when one is present. The final value is normalized.
 *
 * @param prompt User prompt text; a nullish value is treated as empty.
 * @returns The required minimum together with the cues that produced it.
 */
export function resolveWebFetchRequirementFromPrompt(prompt: string): WebFetchRequirement {
  const text = prompt ?? "";

  const promptSuggestsResearchDepth = hasAnyPattern(text, USER_RESEARCH_DEPTH_PATTERNS);
  const multiSourceCue = hasAnyPattern(text, USER_MULTI_SOURCE_PATTERNS);
  const explicitMinFetchFromPrompt = extractExplicitMinFetchFromPrompt(text);

  const baseline = promptSuggestsResearchDepth || multiSourceCue ? 2 : 1;
  // An explicit count can only raise the requirement, never lower it.
  const requested = Math.max(baseline, explicitMinFetchFromPrompt ?? baseline);

  return {
    requiredMinFetchSuccess: normalizeMinFetchSuccess(requested),
    promptSuggestsResearchDepth,
    multiSourceCue,
    explicitMinFetchFromPrompt,
  };
}
|
||||
|
||||
/**
 * Analyzes whether a turn that made no web_fetch call should be forced to
 * fetch before the answer is finalized.
 *
 * Enforcement requires all of: web_fetch is available, no fetch was attempted
 * in the turn, the user did not block web access, the user needs fresh web
 * evidence (explicit fetch request, a URL, or freshness plus web wording), and
 * a trigger is present (explicit fetch request, a URL in the prompt, or
 * web-style claims in the assistant text).
 *
 * @param params.usage Web tool usage for the turn.
 * @param params.webFetchAvailable Whether web_fetch is an active tool.
 * @param params.userPrompt User prompt text; nullish is treated as empty.
 * @param params.assistantText Latest assistant text; nullish is treated as empty.
 * @returns The decision plus every intermediate signal.
 */
export function analyzeCrossTurnWebFetchNeed(params: {
  usage: WebToolUsage;
  webFetchAvailable: boolean;
  userPrompt: string;
  assistantText: string;
}): CrossTurnWebFetchGuardAnalysis {
  const promptText = params.userPrompt ?? "";
  const answerText = params.assistantText ?? "";

  // Signals taken from the user prompt.
  const explicitFetchRequest = hasAnyPattern(promptText, USER_EXPLICIT_FETCH_PATTERNS);
  const userProvidesUrl = URL_PATTERN.test(promptText);
  const freshnessCue = hasAnyPattern(promptText, USER_FRESHNESS_PATTERNS);
  const webCue = userProvidesUrl || hasAnyPattern(promptText, USER_WEB_CONTEXT_PATTERNS);
  const userNeedsFreshWebEvidence =
    explicitFetchRequest || userProvidesUrl || (freshnessCue && webCue);
  const userBlocksWebFetch = hasAnyPattern(promptText, USER_WEB_BLOCK_PATTERNS);

  // Signal taken from the assistant's answer.
  const assistantHasWebClaimSignal =
    URL_PATTERN.test(answerText) || hasAnyPattern(answerText, ASSISTANT_WEB_CLAIM_PATTERNS);

  const noFetchAttempted = params.usage.fetchCalls === 0 && params.usage.fetchSuccess === 0;
  const hasTrigger = explicitFetchRequest || userProvidesUrl || assistantHasWebClaimSignal;

  const shouldEnforce =
    params.webFetchAvailable &&
    noFetchAttempted &&
    !userBlocksWebFetch &&
    userNeedsFreshWebEvidence &&
    hasTrigger;

  return {
    shouldEnforce,
    explicitFetchRequest,
    userProvidesUrl,
    freshnessCue,
    webCue,
    userNeedsFreshWebEvidence,
    userBlocksWebFetch,
    assistantHasWebClaimSignal,
  };
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue