claude-code-ultimate-guide/scripts/extract-audit-context.py

#!/usr/bin/env python3
"""
Extract audit context for quiz questions.

Reads 256 quiz questions from landing repo, resolves their doc_reference anchors
to sections in the guide, and extracts relevant context (max 150 lines per question).

Output: claudedocs/audit-context.json

Strategies for resolving doc_reference.anchor (in order):
  A. Anchor matching: Convert anchor to markdown heading and search
  B. Section name matching: Fuzzy match on doc_reference.section
  C. reference.yaml fallback: Use line numbers from index (not yet implemented)
  D. UNRESOLVED: Flag if no match found

Usage:
    python3 scripts/extract-audit-context.py

Requirements:
    - pyyaml (pip install pyyaml)
    - thefuzz (pip install thefuzz)
"""

import json
import re
import sys
from pathlib import Path
from typing import Dict, List, Tuple, Optional
import yaml

try:
    from thefuzz import fuzz
except ImportError:
    print("Error: thefuzz not installed. Run: pip install thefuzz", file=sys.stderr)
    sys.exit(1)


# ═══════════════════════════════════════════════════════════════
# Configuration
# ═══════════════════════════════════════════════════════════════

# Two directory levels above this script file.
BASE_DIR = Path(__file__).parent.parent
# The landing repo is looked up as a sibling directory of BASE_DIR.
LANDING_DIR = BASE_DIR.parent / "claude-code-ultimate-guide-landing"
QUESTIONS_DIR = LANDING_DIR / "questions"
REFERENCE_YAML = BASE_DIR / "machine-readable" / "reference.yaml"
OUTPUT_JSON = BASE_DIR / "claudedocs" / "audit-context.json"

# Upper bound on the number of guide lines extracted per question.
CONTEXT_LINES = 150

# Guide files already read from disk, keyed by their BASE_DIR-relative path.
_GUIDE_CACHE: Dict[str, List[str]] = {}


# ═══════════════════════════════════════════════════════════════
# Parsing Utilities (reuse from build-questions.py)
# ═══════════════════════════════════════════════════════════════

def parse_frontmatter(content: str) -> Tuple[Dict, str]:
    """
    Parse YAML frontmatter and body from Markdown content.

    The content must open with a ``---`` line. The frontmatter runs up to the
    next ``---`` line; everything after that line is returned as the body.

    Args:
        content: Full text of a Markdown file

    Returns:
        (frontmatter mapping, body text)

    Raises:
        ValueError: If the opening or closing ``---`` is missing, the YAML
            does not parse, or the frontmatter is not a mapping
    """
    lines = content.split('\n')

    if lines[0].strip() != '---':
        raise ValueError("File must start with YAML frontmatter (---)")

    # Index of the first '---' after the opening one, or None if absent.
    closing_idx = next(
        (idx for idx, line in enumerate(lines[1:], 1) if line.strip() == '---'),
        None,
    )
    if closing_idx is None:
        raise ValueError("Invalid frontmatter structure (missing closing ---)")

    yaml_text = '\n'.join(lines[1:closing_idx])
    body_text = '\n'.join(lines[closing_idx + 1:])

    try:
        frontmatter = yaml.safe_load(yaml_text)
    except yaml.YAMLError as e:
        # Chain the parser error so the original traceback is preserved.
        raise ValueError(f"Invalid YAML frontmatter: {e}") from e

    # safe_load returns None for empty input and may return a scalar or list;
    # callers index the result by key, so reject anything that is not a mapping.
    if not isinstance(frontmatter, dict):
        raise ValueError("YAML frontmatter must be a mapping of keys to values")

    return frontmatter, body_text


def split_body(body: str) -> Tuple[str, str]:
    """
    Split a question body into (question, explanation).

    The split point is the first line that is exactly ``---`` (ignoring
    surrounding whitespace) and is not inside a ``` fenced code block.
    Both halves are returned with leading/trailing whitespace stripped.

    Raises:
        ValueError: If no such separator line exists
    """
    rows = body.split('\n')
    fenced = False
    cut = -1

    for pos, row in enumerate(rows):
        text = row.strip()
        if text.startswith('```'):
            # A fence line opens or closes a code block; it is never a separator.
            fenced = not fenced
        elif text == '---' and not fenced:
            cut = pos
            break

    if cut < 0:
        raise ValueError("Body must contain --- separator between question and explanation")

    head = '\n'.join(rows[:cut])
    tail = '\n'.join(rows[cut + 1:])
    return head.strip(), tail.strip()


# ═══════════════════════════════════════════════════════════════
# Guide Context Resolution
# ═══════════════════════════════════════════════════════════════

def load_guide(guide_file: str = "guide/ultimate-guide.md") -> List[str]:
    """
    Load guide lines from specified file, caching the result per path.

    Args:
        guide_file: Relative path from BASE_DIR (e.g., "guide/ultimate-guide.md")

    Returns:
        List of lines (the file content split on '\\n')

    Raises:
        FileNotFoundError: If the path does not point to an existing file
    """
    if guide_file in _GUIDE_CACHE:
        return _GUIDE_CACHE[guide_file]

    guide_path = BASE_DIR / guide_file
    # is_file() rather than exists(): a directory (e.g. an empty guide_file)
    # must be reported as "not found" instead of failing later in read_text().
    if not guide_path.is_file():
        raise FileNotFoundError(f"Guide file not found: {guide_path}")

    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    lines = guide_path.read_text(encoding='utf-8').split('\n')
    _GUIDE_CACHE[guide_file] = lines
    return lines


def load_reference_yaml() -> Dict:
    """
    Load reference.yaml for fallback line numbers.

    Returns:
        Parsed content of REFERENCE_YAML, or an empty dict when the file is
        missing or empty
    """
    if not REFERENCE_YAML.exists():
        return {}
    data = yaml.safe_load(REFERENCE_YAML.read_text(encoding='utf-8'))
    # safe_load yields None for an empty document; keep the "always a dict"
    # contract that the missing-file branch already follows.
    return {} if data is None else data


def anchor_to_heading(anchor: str) -> str:
    """
    Turn a doc anchor into lowercase heading text suitable for matching.

    Steps: drop leading '#' characters, lowercase, remove one leading
    '<digits>-' prefix, turn the remaining hyphens into spaces, and trim.

    Examples:
    - '#11-installation' → 'installation'
    - '#core-concepts' → 'core concepts'
    - '#32-common-tasks' → 'common tasks'
    """
    slug = anchor.lstrip('#').lower()
    without_number = re.sub(r'^\d+-', '', slug)
    return without_number.replace('-', ' ').strip()


def find_heading_in_guide(guide_lines: List[str], target_heading: str) -> Optional[int]:
    """
    Find line number of heading in guide (fuzzy match, threshold 70).

    Matching is case-insensitive and ignores the leading '#' markers and a
    leading section number such as '1.1'. Headings are checked in file order:

    - The first heading where the target is a substring of the heading text
      (or the heading text a substring of the target) is returned immediately.
    - Otherwise the heading with the highest fuzzy ratio is returned, provided
      its score is at least 70.

    Lines inside ``` fenced code blocks are skipped so that '#' comments in
    code samples are not mistaken for headings. Headings with no text left
    after cleanup, and an empty target, never match: the empty string is a
    substring of everything and would otherwise win immediately.

    Args:
        guide_lines: Guide content, one entry per line
        target_heading: Heading text to search for

    Returns:
        Line number (0-indexed) or None if not found
    """
    target_lower = target_heading.lower().strip()
    if not target_lower:
        return None

    best_score = 0
    best_line = None
    in_code_block = False

    for idx, line in enumerate(guide_lines):
        if line.strip().startswith('```'):
            in_code_block = not in_code_block
            continue
        if in_code_block or not line.startswith('#'):
            continue

        # Extract heading text (remove #, ##, etc.)
        heading_text = re.sub(r'^#+\s*', '', line).lower()
        # Remove leading numbers like '1.1', '3.2', etc.
        heading_text = re.sub(r'^\d+\.?\d*\s*', '', heading_text).strip()
        if not heading_text:
            continue

        # Strategy 1: Substring match (exact)
        if target_lower in heading_text or heading_text in target_lower:
            return idx

        # Strategy 2: Fuzzy match
        score = fuzz.ratio(target_lower, heading_text)
        if score > best_score:
            best_score = score
            best_line = idx

    # Threshold of 70 to catch more variations
    if best_score >= 70:
        return best_line
    return None


def extract_section_context(guide_lines: List[str], start_line: int, max_lines: int = CONTEXT_LINES) -> str:
    """
    Extract context from guide starting at start_line.
    Stops at next heading of same/higher level or after max_lines.

    Lines inside ``` fenced code blocks are never treated as headings, so a
    '#' comment in a code sample does not cut the section short. Fence
    tracking starts at start_line, which is assumed to be outside a fence.

    Args:
        guide_lines: Full guide lines
        start_line: Starting line number (0-indexed)
        max_lines: Maximum lines to extract

    Returns:
        Context text, or "" when start_line is outside the guide
    """
    # A negative index would silently wrap around to the end of the list.
    if start_line < 0 or start_line >= len(guide_lines):
        return ""

    # Heading level of the start line (0 when it is not a heading, in which
    # case no later heading can end the section and only max_lines applies).
    start_match = re.match(r'^#+', guide_lines[start_line])
    start_level = len(start_match.group()) if start_match else 0

    context_lines = []
    in_code_block = False
    for offset in range(max_lines):
        line_idx = start_line + offset
        if line_idx >= len(guide_lines):
            break

        line = guide_lines[line_idx]

        if line.strip().startswith('```'):
            in_code_block = not in_code_block
        elif offset > 0 and not in_code_block and line.startswith('#'):
            # Stop at next heading of same/higher level (but not the start heading itself)
            heading_level = len(re.match(r'^#+', line).group())
            if heading_level <= start_level:
                break

        context_lines.append(line)

    return '\n'.join(context_lines)


def resolve_doc_reference(doc_ref: Dict, reference_yaml: Dict) -> Dict:
    """
    Locate the guide section a doc_reference points to and extract its text.

    The guide file named by doc_ref['file'] (default
    'guide/ultimate-guide.md') is loaded, then these lookups are tried in
    order; the first hit wins:
      A. anchor:  doc_ref['anchor'] converted to heading text (confidence 95)
      B. section: doc_ref['section'] lowercased (confidence 80)
    A reference.yaml fallback is not implemented; reference_yaml is accepted
    but not used.

    Returns:
        {
            'strategy': 'anchor|section|unresolved|file_not_found',
            'context': 'extracted guide text or empty',
            'line_number': int (0-indexed) or None,
            'confidence': int (0-100),
            'source_file': str (guide file that was searched)
        }
    """
    guide_file = doc_ref.get('file', 'guide/ultimate-guide.md')
    outcome = {
        'strategy': 'unresolved',
        'context': '',
        'line_number': None,
        'confidence': 0,
        'source_file': guide_file
    }

    try:
        guide_lines = load_guide(guide_file)
    except FileNotFoundError:
        outcome['strategy'] = 'file_not_found'
        return outcome

    # (strategy / doc_ref key, raw value -> search text, confidence), in priority order
    attempts = (
        ('anchor', anchor_to_heading, 95),
        ('section', lambda text: text.lower(), 80),
    )
    for key, to_target, confidence in attempts:
        raw_value = doc_ref.get(key)
        if not raw_value:
            continue

        line_num = find_heading_in_guide(guide_lines, to_target(raw_value))
        if line_num is None:
            continue

        outcome['strategy'] = key
        outcome['line_number'] = line_num
        outcome['context'] = extract_section_context(guide_lines, line_num)
        outcome['confidence'] = confidence
        return outcome

    # Nothing matched: report as unresolved.
    return outcome


# ═══════════════════════════════════════════════════════════════
# Main Processing
# ═══════════════════════════════════════════════════════════════

def process_questions() -> List[Dict]:
    """
    Process all quiz questions and extract audit context.

    Reads every ``*/*.md`` file under QUESTIONS_DIR, parses its frontmatter
    and body, resolves ``doc_reference`` (when present and non-empty) against
    the guide, and prints resolution statistics to stdout.

    A file that fails to parse or lacks a required frontmatter key is reported
    on stderr and skipped; it still counts towards the printed total.

    Exits the process with status 1 when QUESTIONS_DIR is missing or contains
    no question files.

    Returns:
        One dict per successfully processed question
    """
    if not QUESTIONS_DIR.exists():
        print(f"Error: Questions directory not found: {QUESTIONS_DIR}", file=sys.stderr)
        sys.exit(1)

    # Load reference
    print("Loading reference.yaml...")
    reference_yaml = load_reference_yaml()

    # Find all question files
    md_files = sorted(QUESTIONS_DIR.glob('*/*.md'))
    if not md_files:
        print(f"Error: No .md files found in {QUESTIONS_DIR}", file=sys.stderr)
        sys.exit(1)

    print(f"Found {len(md_files)} question files")
    print()

    # Process each question
    results = []
    stats = {
        'total': len(md_files),
        'anchor': 0,
        'section': 0,
        'reference_yaml': 0,
        'unresolved': 0,
        'no_reference': 0,
        'file_not_found': 0
    }

    for idx, filepath in enumerate(md_files, 1):
        try:
            # Decode explicitly as UTF-8 so parsing does not depend on the
            # platform's default locale encoding.
            content = filepath.read_text(encoding='utf-8')
            frontmatter, body = parse_frontmatter(content)
            question_text, explanation_text = split_body(body)

            # Build question object
            question_obj = {
                'id': frontmatter['id'],
                'category_id': frontmatter['category_id'],
                'difficulty': frontmatter['difficulty'],
                'profiles': frontmatter['profiles'],
                'question': question_text,
                'options': frontmatter['options'],
                'correct': frontmatter['correct'],
                'explanation': explanation_text,
                'source_file': str(filepath.relative_to(QUESTIONS_DIR.parent))
            }

            # Resolve doc_reference if present. A key with an empty value
            # (e.g. "doc_reference:" with nothing after it) is treated as
            # "no reference" rather than failing and dropping the question.
            doc_ref = frontmatter.get('doc_reference')
            if doc_ref:
                resolution = resolve_doc_reference(doc_ref, reference_yaml)

                question_obj['doc_reference'] = doc_ref
                question_obj['guide_context'] = resolution['context']
                question_obj['resolution_strategy'] = resolution['strategy']
                question_obj['resolution_confidence'] = resolution['confidence']
                question_obj['guide_line_number'] = resolution['line_number']
                question_obj['guide_source_file'] = resolution['source_file']

                stats[resolution['strategy']] += 1
            else:
                question_obj['guide_context'] = ''
                question_obj['resolution_strategy'] = 'no_reference'
                stats['no_reference'] += 1

            results.append(question_obj)

            # Progress indicator
            if idx % 25 == 0:
                print(f"Processed {idx}/{len(md_files)} questions...")

        except Exception as e:
            # Per-file boundary: one malformed question must not abort the run.
            print(f"Error processing {filepath.name}: {e}", file=sys.stderr)
            continue

    total = stats['total']  # non-zero: the empty case exited above

    print()
    print("═══════════════════════════════════════════════════════════════")
    print("Resolution Statistics")
    print("═══════════════════════════════════════════════════════════════")
    print(f"{'Total questions:':<24}{total}")
    # (label, stats key) in display order; labels are padded to a 24-char column.
    stat_rows = (
        ("Anchor strategy:", 'anchor'),
        ("Section strategy:", 'section'),
        ("reference.yaml:", 'reference_yaml'),
        ("No doc_reference:", 'no_reference'),
        ("File not found:", 'file_not_found'),
        ("UNRESOLVED:", 'unresolved'),
    )
    for label, key in stat_rows:
        print(f"{label:<24}{stats[key]} ({stats[key]/total*100:.1f}%)")
    print()

    resolved_count = stats['anchor'] + stats['section'] + stats['reference_yaml']
    with_reference = total - stats['no_reference']
    resolution_rate = resolved_count / with_reference * 100 if with_reference > 0 else 0
    print(f"Resolution rate (excl. no_reference): {resolution_rate:.1f}%")

    if resolution_rate < 95:
        print()
        print("⚠️  WARNING: Resolution rate < 95%. Consider improving strategies.")

    return results


def main():
    """
    Main entry point.

    Runs process_questions() and writes the result as UTF-8 JSON to
    OUTPUT_JSON, creating the output directory if needed. The payload's
    'generated_at' field is the local date of the run (YYYY-MM-DD).
    """
    # Local import: only needed here, for the output timestamp.
    from datetime import date

    print("═══════════════════════════════════════════════════════════════")
    print("Quiz Question Audit Context Extraction")
    print("═══════════════════════════════════════════════════════════════")
    print()

    # Process questions
    questions_with_context = process_questions()

    # Ensure output directory exists
    OUTPUT_JSON.parent.mkdir(parents=True, exist_ok=True)

    # Write output
    output_data = {
        'version': '1.0',
        # Stamp the actual run date instead of a fixed literal.
        'generated_at': date.today().isoformat(),
        'total_questions': len(questions_with_context),
        'questions': questions_with_context
    }

    # ensure_ascii=False emits raw non-ASCII characters, so the file encoding
    # must be pinned to UTF-8 rather than left to the platform default.
    OUTPUT_JSON.write_text(
        json.dumps(output_data, indent=2, ensure_ascii=False) + '\n',
        encoding='utf-8',
    )

    print()
    print("═══════════════════════════════════════════════════════════════")
    print(f"✓ Output written to: {OUTPUT_JSON}")
    print(f"  Total questions: {len(questions_with_context)}")
    print("═══════════════════════════════════════════════════════════════")


# Run the extraction only when this file is executed as a script, not when imported.
if __name__ == '__main__':
    main()