claude-code-ultimate-guide/examples/hooks/bash/unicode-injection-scanner.sh

#!/bin/bash
# =============================================================================
# Unicode Injection Scanner Hook
# =============================================================================
# Event: PreToolUse (runs before Edit/Write operations)
# Purpose: Detect invisible Unicode characters used for prompt injection
#
# This hook detects evasion techniques that embed invisible instructions:
#   - Zero-width characters (U+200B-U+200D, U+FEFF)
#   - RTL/LTR override (U+202A-U+202E, U+2066-U+2069)
#   - ANSI escape sequences (terminal injection)
#   - Null bytes (truncation attacks)
#   - Tag characters (U+E0000-U+E007F)
#
# Installation:
#   Add to .claude/settings.json:
#   {
#     "hooks": {
#       "PreToolUse": [{
#         "matcher": "Edit|Write",
#         "hooks": ["bash examples/hooks/bash/unicode-injection-scanner.sh"]
#       }]
#     }
#   }
#
# Exit codes:
#   0 = allow (no injection detected)
#   2 = block (injection detected, stderr message shown to Claude)
#
# References:
#   - CVE-2025-53109/53110: Unicode-based sandbox escape
#   - Arxiv 2509.22040: Prompt Injection on Coding Assistants
# =============================================================================

set -euo pipefail

# Read the hook input from stdin
INPUT=$(cat)

TOOL_NAME=$(echo "$INPUT" | jq -r '.tool_name // empty')
TOOL_INPUT=$(echo "$INPUT" | jq -r '.tool_input // empty')

# Only check Edit and Write tools
case "$TOOL_NAME" in
    Edit|Write)
        ;;
    *)
        exit 0
        ;;
esac

# Extract content to analyze
CONTENT=""
case "$TOOL_NAME" in
    Write)
        CONTENT=$(echo "$TOOL_INPUT" | jq -r '.content // empty')
        ;;
    Edit)
        CONTENT=$(echo "$TOOL_INPUT" | jq -r '.new_string // empty')
        ;;
esac

# Skip if no content
[[ -z "$CONTENT" ]] && exit 0

# === ZERO-WIDTH CHARACTERS ===
# U+200B Zero Width Space
# U+200C Zero Width Non-Joiner
# U+200D Zero Width Joiner
# U+FEFF Byte Order Mark (when not at start)
if echo "$CONTENT" | grep -qP '[\x{200B}-\x{200D}\x{FEFF}]'; then
    echo "BLOCKED: Zero-width characters detected (U+200B-U+200D or BOM). These can hide malicious instructions." >&2
    exit 2
fi

# === BIDIRECTIONAL TEXT OVERRIDE ===
# U+202A Left-to-Right Embedding
# U+202B Right-to-Left Embedding
# U+202C Pop Directional Formatting
# U+202D Left-to-Right Override
# U+202E Right-to-Left Override (most dangerous - reverses text display)
# U+2066-U+2069 Isolate controls
if echo "$CONTENT" | grep -qP '[\x{202A}-\x{202E}\x{2066}-\x{2069}]'; then
    echo "BLOCKED: Bidirectional text override detected (U+202A-U+202E). These can disguise malicious commands." >&2
    exit 2
fi

# === ANSI ESCAPE SEQUENCES ===
# \x1b[ CSI (Control Sequence Introducer) - terminal control
# \x1b] OSC (Operating System Command)
# \x1b( Character set selection
# These can manipulate terminal display or execute commands
if echo "$CONTENT" | grep -qE $'\x1b\[|\x1b\]|\x1b\('; then
    echo "BLOCKED: ANSI escape sequence detected. These can manipulate terminal display." >&2
    exit 2
fi

# === NULL BYTES ===
# \x00 can truncate strings and bypass security checks
if echo "$CONTENT" | grep -qP '\x00'; then
    echo "BLOCKED: Null byte detected. These can cause string truncation attacks." >&2
    exit 2
fi

# === TAG CHARACTERS ===
# U+E0000-U+E007F are invisible "tag" characters
# Sometimes used to embed hidden data
if echo "$CONTENT" | grep -qP '[\x{E0000}-\x{E007F}]'; then
    echo "BLOCKED: Unicode tag characters detected (U+E0000-E007F). These can embed invisible data." >&2
    exit 2
fi

# === OVERLONG UTF-8 SEQUENCES ===
# Detect potential overlong encodings (e.g., encoding '/' as C0 AF instead of 2F)
# These can bypass path filters
# Check for C0 or C1 bytes followed by 80-BF (overlong 2-byte sequences)
if echo "$CONTENT" | grep -qP '[\xC0-\xC1][\x80-\xBF]'; then
    echo "BLOCKED: Overlong UTF-8 sequence detected. These can bypass security filters." >&2
    exit 2
fi

# === HOMOGLYPHS WARNING ===
# Detect Cyrillic characters that look like Latin (confusables)
# Common in typosquatting and filter bypass
# а (U+0430) vs a, е (U+0435) vs e, о (U+043E) vs o, etc.
HOMOGLYPHS_FOUND=false
if echo "$CONTENT" | grep -qP '[\x{0430}\x{0435}\x{043E}\x{0440}\x{0441}\x{0445}]'; then
    HOMOGLYPHS_FOUND=true
fi
if echo "$CONTENT" | grep -qP '[\x{0391}-\x{03C9}]' && echo "$CONTENT" | grep -qP '[a-zA-Z]'; then
    # Greek mixed with Latin
    HOMOGLYPHS_FOUND=true
fi

if [[ "$HOMOGLYPHS_FOUND" == "true" ]]; then
    # Warning only - could be legitimate multilingual content
    echo '{"systemMessage": "Warning: Potential homoglyph characters detected (Cyrillic/Greek mixed with Latin). Verify this is not an attempt to bypass filters."}'
fi

# All checks passed
exit 0