claude-code-ultimate-guide/examples/hooks/bash/prompt-injection-detector.sh

#!/bin/bash
# Hook: PreToolUse - Detect prompt injection attempts
# Exit 0 = allow, Exit 2 = block (stderr message shown to Claude)
#
# This hook detects common prompt injection patterns that attempt to
# manipulate Claude's behavior through malicious instructions.
#
# Place in: .claude/hooks/prompt-injection-detector.sh
# Register in: .claude/settings.json under PreToolUse event

set -e

# Read JSON from stdin
INPUT=$(cat)

TOOL_NAME=$(echo "$INPUT" | jq -r '.tool_name // empty')
TOOL_INPUT=$(echo "$INPUT" | jq -r '.tool_input // empty')

# Only check tools that handle user-provided text content
case "$TOOL_NAME" in
    Bash|Write|Edit|WebFetch)
        ;;
    *)
        exit 0
        ;;
esac

# Extract content to analyze based on tool type
CONTENT=""
case "$TOOL_NAME" in
    Bash)
        CONTENT=$(echo "$TOOL_INPUT" | jq -r '.command // empty')
        ;;
    Write|Edit)
        CONTENT=$(echo "$TOOL_INPUT" | jq -r '.content // .new_string // empty')
        ;;
    WebFetch)
        CONTENT=$(echo "$TOOL_INPUT" | jq -r '.url // empty')
        ;;
esac

# Skip if no content to analyze
[[ -z "$CONTENT" ]] && exit 0

# Convert to lowercase for case-insensitive matching
CONTENT_LOWER=$(echo "$CONTENT" | tr '[:upper:]' '[:lower:]')

# === ROLE OVERRIDE PATTERNS ===
# Attempts to override Claude's instructions or identity
ROLE_OVERRIDE_PATTERNS=(
    "ignore previous instructions"
    "ignore all previous"
    "ignore your instructions"
    "disregard previous"
    "disregard your instructions"
    "forget your instructions"
    "forget everything"
    "you are now"
    "act as if"
    "pretend you are"
    "pretend to be"
    "from now on you"
    "new instructions:"
    "override:"
    "system prompt:"
)

for pattern in "${ROLE_OVERRIDE_PATTERNS[@]}"; do
    if [[ "$CONTENT_LOWER" == *"$pattern"* ]]; then
        echo "BLOCKED: Prompt injection detected - role override attempt: '$pattern'" >&2
        exit 2
    fi
done

# === JAILBREAK PATTERNS ===
# Known jailbreak techniques and bypass attempts
JAILBREAK_PATTERNS=(
    "dan mode"
    "developer mode"
    "jailbreak"
    "do anything now"
    "bypass"
    "no restrictions"
    "without restrictions"
    "unrestricted mode"
    "god mode"
    "sudo mode"
    "admin mode"
    "debug mode"
    "maintenance mode"
)

for pattern in "${JAILBREAK_PATTERNS[@]}"; do
    if [[ "$CONTENT_LOWER" == *"$pattern"* ]]; then
        echo "BLOCKED: Prompt injection detected - jailbreak attempt: '$pattern'" >&2
        exit 2
    fi
done

# === DELIMITER INJECTION ===
# Attempts to inject fake system messages or close prompt sections
DELIMITER_PATTERNS=(
    "</system>"
    "<|endoftext|>"
    "<|im_end|>"
    "[/INST]"
    "[INST]"
    "<<SYS>>"
    "<</SYS>>"
    "### System:"
    "### Human:"
    "### Assistant:"
    "```system"
    "SYSTEM:"
)

for pattern in "${DELIMITER_PATTERNS[@]}"; do
    if [[ "$CONTENT" == *"$pattern"* ]]; then
        echo "BLOCKED: Prompt injection detected - delimiter injection: '$pattern'" >&2
        exit 2
    fi
done

# === AUTHORITY IMPERSONATION ===
# Claims of special authority or permissions
AUTHORITY_PATTERNS=(
    "anthropic employee"
    "anthropic staff"
    "i am your creator"
    "i am your developer"
    "i have admin access"
    "authorized to bypass"
    "emergency override"
    "security exception"
    "this is a test"
    "testing mode"
)

for pattern in "${AUTHORITY_PATTERNS[@]}"; do
    if [[ "$CONTENT_LOWER" == *"$pattern"* ]]; then
        echo "BLOCKED: Prompt injection detected - authority impersonation: '$pattern'" >&2
        exit 2
    fi
done

# === BASE64 ENCODED INSTRUCTIONS ===
# Detect potential base64-encoded payloads (heuristic)
# Look for long base64-like strings that might contain instructions
if echo "$CONTENT" | grep -qE '[A-Za-z0-9+/]{50,}={0,2}'; then
    # Try to decode and check for injection patterns
    DECODED=$(echo "$CONTENT" | grep -oE '[A-Za-z0-9+/]{50,}={0,2}' | head -1 | base64 -d 2>/dev/null || true)
    DECODED_LOWER=$(echo "$DECODED" | tr '[:upper:]' '[:lower:]')

    for pattern in "ignore" "override" "system" "jailbreak" "dan mode"; do
        if [[ "$DECODED_LOWER" == *"$pattern"* ]]; then
            echo "BLOCKED: Prompt injection detected - encoded payload containing: '$pattern'" >&2
            exit 2
        fi
    done
fi

# === CONTEXT MANIPULATION ===
# Attempts to manipulate the conversation context
CONTEXT_PATTERNS=(
    "in the previous message"
    "as i mentioned earlier"
    "you agreed to"
    "you already said"
    "you promised"
    "remember when you"
    "our agreement was"
)

for pattern in "${CONTEXT_PATTERNS[@]}"; do
    if [[ "$CONTENT_LOWER" == *"$pattern"* ]]; then
        # Warning only - these could be legitimate
        echo '{"systemMessage": "Warning: Detected potential context manipulation pattern. Verify legitimacy."}'
    fi
done

# Allow by default
exit 0