feat(docs): add LLM Handbook + Google Whitepaper integration v3.3.0
Advanced Guardrails: - prompt-injection-detector.sh (PreToolUse) - output-validator.sh (PostToolUse heuristics) - claudemd-scanner.sh (SessionStart injection detection) - output-secrets-scanner.sh (PostToolUse secrets leak prevention) Observability & Monitoring: - session-logger.sh (JSONL activity logging) - session-stats.sh (cost tracking & analysis) - guide/observability.md (full documentation) LLM-as-a-Judge Evaluation: - output-evaluator.md agent (Haiku) - /validate-changes command - pre-commit-evaluator.sh (opt-in git hook) Google Agent Whitepaper Integration: - Context Triage Guide (Section 2.2.4) - CLAUDE.md Injection Warning (Section 3.1.3) - Agent Validation Checklist (Section 4.2.4) - MCP Security: Tool Shadowing & Confused Deputy (Section 8.6) - Session vs Memory patterns (Section 3.3.3) Stats: 10 new files, 8 modified, 5 new guide sections Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
parent
19110eba22
commit
8a4d116e2e
17 changed files with 2188 additions and 3 deletions
182
examples/hooks/bash/prompt-injection-detector.sh
Executable file
182
examples/hooks/bash/prompt-injection-detector.sh
Executable file
|
|
@ -0,0 +1,182 @@
|
|||
#!/bin/bash
# Hook: PreToolUse - Detect prompt injection attempts
# Exit 0 = allow, Exit 2 = block (stderr message shown to Claude)
#
# This hook detects common prompt injection patterns that attempt to
# manipulate Claude's behavior through malicious instructions.
#
# Place in: .claude/hooks/prompt-injection-detector.sh
# Register in: .claude/settings.json under PreToolUse event
#
# Requires: jq (used to parse the JSON payload read from stdin).

# Abort on any unhandled command failure. Note that a jq failure (e.g. on
# malformed JSON) therefore ends the script with jq's own exit status,
# which is neither the "allow" (0) nor the "block" (2) status above.
set -e

# Read the whole JSON payload from stdin.
INPUT=$(cat)

# printf '%s' is used instead of echo so that values which happen to look
# like echo options (for example "-n" or "-e") are passed through verbatim.
TOOL_NAME=$(printf '%s' "$INPUT" | jq -r '.tool_name // empty')
TOOL_INPUT=$(printf '%s' "$INPUT" | jq -r '.tool_input // empty')

# Pick the field to analyze for each supported tool; every other tool is
# allowed through without inspection.
CONTENT=""
case "$TOOL_NAME" in
  Bash)
    CONTENT=$(printf '%s' "$TOOL_INPUT" | jq -r '.command // empty')
    ;;
  Write|Edit)
    CONTENT=$(printf '%s' "$TOOL_INPUT" | jq -r '.content // .new_string // empty')
    ;;
  WebFetch)
    CONTENT=$(printf '%s' "$TOOL_INPUT" | jq -r '.url // empty')
    ;;
  *)
    exit 0
    ;;
esac

# Skip if there is no content to analyze.
if [[ -z "$CONTENT" ]]; then
  exit 0
fi

# Lowercase copy for the case-insensitive pattern lists.
CONTENT_LOWER=$(printf '%s' "$CONTENT" | tr '[:upper:]' '[:lower:]')

# === BLOCKING PATTERN CHECKS ===

#######################################
# Block the tool call when the given text contains any listed substring.
# Arguments:
#   $1   - category label inserted into the stderr message
#   $2   - text to scan
#   $3.. - literal substrings to look for, tried in the order given
# Outputs:
#   On the first hit, writes the BLOCKED message to stderr.
# Returns:
#   0 when nothing matches. On a hit the whole script exits with status 2.
#######################################
block_on_match() {
  local category=$1 haystack=$2 needle
  shift 2
  for needle in "$@"; do
    # The needle is quoted, so [[ == ]] compares it as a literal substring;
    # entries such as "[INST]" are not read as glob bracket expressions.
    if [[ "$haystack" == *"$needle"* ]]; then
      echo "BLOCKED: Prompt injection detected - ${category}: '${needle}'" >&2
      exit 2
    fi
  done
  return 0
}

# === ROLE OVERRIDE PATTERNS ===
# Attempts to override Claude's instructions or identity
ROLE_OVERRIDE_PATTERNS=(
  "ignore previous instructions"
  "ignore all previous"
  "ignore your instructions"
  "disregard previous"
  "disregard your instructions"
  "forget your instructions"
  "forget everything"
  "you are now"
  "act as if"
  "pretend you are"
  "pretend to be"
  "from now on you"
  "new instructions:"
  "override:"
  "system prompt:"
)

# === JAILBREAK PATTERNS ===
# Known jailbreak techniques and bypass attempts
JAILBREAK_PATTERNS=(
  "dan mode"
  "developer mode"
  "jailbreak"
  "do anything now"
  "bypass"
  "no restrictions"
  "without restrictions"
  "unrestricted mode"
  "god mode"
  "sudo mode"
  "admin mode"
  "debug mode"
  "maintenance mode"
)

# === DELIMITER INJECTION ===
# Attempts to inject fake system messages or close prompt sections.
# These entries are single-quoted on purpose: inside double quotes the
# backticks of '```system' would start a command substitution.
DELIMITER_PATTERNS=(
  '</system>'
  '<|endoftext|>'
  '<|im_end|>'
  '[/INST]'
  '[INST]'
  '<<SYS>>'
  '<</SYS>>'
  '### System:'
  '### Human:'
  '### Assistant:'
  '```system'
  'SYSTEM:'
)

# === AUTHORITY IMPERSONATION ===
# Claims of special authority or permissions
AUTHORITY_PATTERNS=(
  "anthropic employee"
  "anthropic staff"
  "i am your creator"
  "i am your developer"
  "i have admin access"
  "authorized to bypass"
  "emergency override"
  "security exception"
  "this is a test"
  "testing mode"
)

# Checks run in this order; the first hit decides the message shown.
# Delimiter patterns are case-sensitive, so they scan the raw content;
# the other lists are lowercase and scan the lowercased copy.
block_on_match "role override attempt" "$CONTENT_LOWER" "${ROLE_OVERRIDE_PATTERNS[@]}"
block_on_match "jailbreak attempt" "$CONTENT_LOWER" "${JAILBREAK_PATTERNS[@]}"
block_on_match "delimiter injection" "$CONTENT" "${DELIMITER_PATTERNS[@]}"
block_on_match "authority impersonation" "$CONTENT_LOWER" "${AUTHORITY_PATTERNS[@]}"

# === BASE64 ENCODED INSTRUCTIONS ===
# Detect potential base64-encoded payloads (heuristic)

#######################################
# Decode every long base64-looking run in the text and block the tool call
# when a decoded run contains an injection keyword.
# Arguments:
#   $1 - text to scan
# Outputs:
#   On a hit, writes the BLOCKED message to stderr.
# Returns:
#   0 when nothing suspicious is found. On a hit the whole script exits
#   with status 2.
#######################################
check_encoded_payloads() {
  local text=$1 candidate decoded keyword

  # grep -o prints each run of 50+ base64 characters on its own line.
  # Every run is inspected, not just the first, so a harmless blob placed
  # in front cannot shield a malicious one behind it.
  while IFS= read -r candidate; do
    [[ -n "$candidate" ]] || continue

    # Best effort: runs that are not valid base64 simply decode to little
    # or nothing. NUL bytes are dropped because a shell variable cannot
    # hold them; LC_ALL=C keeps tr byte-oriented on arbitrary binary data.
    decoded=$(printf '%s' "$candidate" \
      | base64 -d 2>/dev/null \
      | LC_ALL=C tr -d '\000' \
      | LC_ALL=C tr '[:upper:]' '[:lower:]') || true

    for keyword in "ignore" "override" "system" "jailbreak" "dan mode"; do
      if [[ "$decoded" == *"$keyword"* ]]; then
        echo "BLOCKED: Prompt injection detected - encoded payload containing: '$keyword'" >&2
        exit 2
      fi
    done
  done < <(printf '%s\n' "$text" | grep -oE '[A-Za-z0-9+/]{50,}={0,2}')

  return 0
}

check_encoded_payloads "$CONTENT"

# === CONTEXT MANIPULATION ===
# Attempts to manipulate the conversation context
CONTEXT_PATTERNS=(
  "in the previous message"
  "as i mentioned earlier"
  "you agreed to"
  "you already said"
  "you promised"
  "remember when you"
  "our agreement was"
)

#######################################
# Emit a non-blocking warning when the text contains a context-manipulation
# phrase from CONTEXT_PATTERNS.
# Globals:
#   CONTEXT_PATTERNS (read)
# Arguments:
#   $1 - lowercased text to scan
# Outputs:
#   At most one JSON object with a "systemMessage" key on stdout.
# Returns:
#   0 always; these phrases can be legitimate, so nothing is blocked.
#######################################
warn_on_context_manipulation() {
  local text=$1 phrase
  for phrase in "${CONTEXT_PATTERNS[@]}"; do
    if [[ "$text" == *"$phrase"* ]]; then
      # Warning only - these could be legitimate.
      # Stop after the first hit so stdout carries a single JSON document
      # even when several phrases match.
      echo '{"systemMessage": "Warning: Detected potential context manipulation pattern. Verify legitimacy."}'
      return 0
    fi
  done
  return 0
}

warn_on_context_manipulation "$CONTENT_LOWER"

# Allow by default: no blocking check matched, so let the tool call proceed.
exit 0
Loading…
Add table
Add a link
Reference in a new issue