feat(docs): add LLM Handbook + Google Whitepaper integration v3.3.0

Advanced Guardrails: - prompt-injection-detector.sh (PreToolUse) - output-validator.sh (PostToolUse heuristics) - claudemd-scanner.sh (SessionStart injection detection) - output-secrets-scanner.sh (PostToolUse secrets leak prevention) Observability & Monitoring: - session-logger.sh (JSONL activity logging) - session-stats.sh (cost tracking & analysis) - guide/observability.md (full documentation) LLM-as-a-Judge Evaluation: - output-evaluator.md agent (Haiku) - /validate-changes command - pre-commit-evaluator.sh (opt-in git hook) Google Agent Whitepaper Integration: - Context Triage Guide (Section 2.2.4) - CLAUDE.md Injection Warning (Section 3.1.3) - Agent Validation Checklist (Section 4.2.4) - MCP Security: Tool Shadowing & Confused Deputy (Section 8.6) - Session vs Memory patterns (Section 3.3.3) Stats: 10 new files, 8 modified, 5 new guide sections Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-14 21:00:49 +01:00 · 2026-01-14 21:00:49 +01:00 · 8a4d116e2e
commit 8a4d116e2e
parent 19110eba22
17 changed files with 2188 additions and 3 deletions
--- a/examples/hooks/bash/prompt-injection-detector.sh
+++ b/examples/hooks/bash/prompt-injection-detector.sh
@ -0,0 +1,182 @@
+#!/bin/bash
+# Hook: PreToolUse - Detect prompt injection attempts
+# Exit 0 = allow, Exit 2 = block (stderr message shown to Claude)
+#
+# This hook detects common prompt injection patterns that attempt to
+# manipulate Claude's behavior through malicious instructions.
+#
+# Place in: .claude/hooks/prompt-injection-detector.sh
+# Register in: .claude/settings.json under PreToolUse event
+
+set -e
+
+# Read JSON from stdin
+INPUT=$(cat)
+
+TOOL_NAME=$(echo "$INPUT" | jq -r '.tool_name // empty')
+TOOL_INPUT=$(echo "$INPUT" | jq -r '.tool_input // empty')
+
+# Only check tools that handle user-provided text content
+case "$TOOL_NAME" in
+    Bash|Write|Edit|WebFetch)
+        ;;
+    *)
+        exit 0
+        ;;
+esac
+
+# Extract content to analyze based on tool type
+CONTENT=""
+case "$TOOL_NAME" in
+    Bash)
+        CONTENT=$(echo "$TOOL_INPUT" | jq -r '.command // empty')
+        ;;
+    Write|Edit)
+        CONTENT=$(echo "$TOOL_INPUT" | jq -r '.content // .new_string // empty')
+        ;;
+    WebFetch)
+        CONTENT=$(echo "$TOOL_INPUT" | jq -r '.url // empty')
+        ;;
+esac
+
+# Skip if no content to analyze
+[[ -z "$CONTENT" ]] && exit 0
+
+# Convert to lowercase for case-insensitive matching
+CONTENT_LOWER=$(echo "$CONTENT" | tr '[:upper:]' '[:lower:]')
+
+# === ROLE OVERRIDE PATTERNS ===
+# Attempts to override Claude's instructions or identity
+ROLE_OVERRIDE_PATTERNS=(
+    "ignore previous instructions"
+    "ignore all previous"
+    "ignore your instructions"
+    "disregard previous"
+    "disregard your instructions"
+    "forget your instructions"
+    "forget everything"
+    "you are now"
+    "act as if"
+    "pretend you are"
+    "pretend to be"
+    "from now on you"
+    "new instructions:"
+    "override:"
+    "system prompt:"
+)
+
+for pattern in "${ROLE_OVERRIDE_PATTERNS[@]}"; do
+    if [[ "$CONTENT_LOWER" == *"$pattern"* ]]; then
+        echo "BLOCKED: Prompt injection detected - role override attempt: '$pattern'" >&2
+        exit 2
+    fi
+done
+
+# === JAILBREAK PATTERNS ===
+# Known jailbreak techniques and bypass attempts
+JAILBREAK_PATTERNS=(
+    "dan mode"
+    "developer mode"
+    "jailbreak"
+    "do anything now"
+    "bypass"
+    "no restrictions"
+    "without restrictions"
+    "unrestricted mode"
+    "god mode"
+    "sudo mode"
+    "admin mode"
+    "debug mode"
+    "maintenance mode"
+)
+
+for pattern in "${JAILBREAK_PATTERNS[@]}"; do
+    if [[ "$CONTENT_LOWER" == *"$pattern"* ]]; then
+        echo "BLOCKED: Prompt injection detected - jailbreak attempt: '$pattern'" >&2
+        exit 2
+    fi
+done
+
+# === DELIMITER INJECTION ===
+# Attempts to inject fake system messages or close prompt sections
+DELIMITER_PATTERNS=(
+    "</system>"
+    "<|endoftext|>"
+    "<|im_end|>"
+    "[/INST]"
+    "[INST]"
+    "<<SYS>>"
+    "<</SYS>>"
+    "### System:"
+    "### Human:"
+    "### Assistant:"
+    "```system"
+    "SYSTEM:"
+)
+
+for pattern in "${DELIMITER_PATTERNS[@]}"; do
+    if [[ "$CONTENT" == *"$pattern"* ]]; then
+        echo "BLOCKED: Prompt injection detected - delimiter injection: '$pattern'" >&2
+        exit 2
+    fi
+done
+
+# === AUTHORITY IMPERSONATION ===
+# Claims of special authority or permissions
+AUTHORITY_PATTERNS=(
+    "anthropic employee"
+    "anthropic staff"
+    "i am your creator"
+    "i am your developer"
+    "i have admin access"
+    "authorized to bypass"
+    "emergency override"
+    "security exception"
+    "this is a test"
+    "testing mode"
+)
+
+for pattern in "${AUTHORITY_PATTERNS[@]}"; do
+    if [[ "$CONTENT_LOWER" == *"$pattern"* ]]; then
+        echo "BLOCKED: Prompt injection detected - authority impersonation: '$pattern'" >&2
+        exit 2
+    fi
+done
+
+# === BASE64 ENCODED INSTRUCTIONS ===
+# Detect potential base64-encoded payloads (heuristic)
+# Look for long base64-like strings that might contain instructions
+if echo "$CONTENT" | grep -qE '[A-Za-z0-9+/]{50,}={0,2}'; then
+    # Try to decode and check for injection patterns
+    DECODED=$(echo "$CONTENT" | grep -oE '[A-Za-z0-9+/]{50,}={0,2}' | head -1 | base64 -d 2>/dev/null || true)
+    DECODED_LOWER=$(echo "$DECODED" | tr '[:upper:]' '[:lower:]')
+
+    for pattern in "ignore" "override" "system" "jailbreak" "dan mode"; do
+        if [[ "$DECODED_LOWER" == *"$pattern"* ]]; then
+            echo "BLOCKED: Prompt injection detected - encoded payload containing: '$pattern'" >&2
+            exit 2
+        fi
+    done
+fi
+
+# === CONTEXT MANIPULATION ===
+# Attempts to manipulate the conversation context
+CONTEXT_PATTERNS=(
+    "in the previous message"
+    "as i mentioned earlier"
+    "you agreed to"
+    "you already said"
+    "you promised"
+    "remember when you"
+    "our agreement was"
+)
+
+for pattern in "${CONTEXT_PATTERNS[@]}"; do
+    if [[ "$CONTENT_LOWER" == *"$pattern"* ]]; then
+        # Warning only - these could be legitimate
+        echo '{"systemMessage": "Warning: Detected potential context manipulation pattern. Verify legitimacy."}'
+    fi
+done
+
+# Allow by default
+exit 0