claude-code-ultimate-guide/scripts/extract-audit-context.py
Florian BRUNIAUX a55ff38143 feat(quiz): add complete audit system for 256 questions
Implements automated pipeline for quiz question quality control:

**Phase 1: Context Extraction**
- Script: extract-audit-context.py
- Resolves doc_reference anchors to guide sections (97.3% success)
- Multi-file support (ultimate-guide.md, learning-with-ai.md, etc.)
- Fuzzy matching + substring fallback
- Output: audit-context.json (256 questions + context)

**Phase 2: Batch Generation**
- Script: generate-audit-batches.py
- 16 prioritized review batches by category
- Advanced Patterns split into 2 batches (29 questions)
- Embedded review instructions in each batch
- Output: audit-batches/*.md (16,559 lines)

**Phase 3: Report Compilation**
- Script: generate-audit-report.py
- Parses agent review outputs (PASS/ISSUE format)
- Aggregates by severity (critical/warning/info)
- Output: audit-report.md

**Validation:**
- Q01-001 error found immediately (curl vs npm contradiction)
- System working as designed 

**Documentation:**
- AUDIT-WORKFLOW.md (complete 5-phase guide)
- AUDIT-SYSTEM-SUMMARY.md (architecture + metrics)
- IMPLEMENTATION-COMPLETE.md (status + validation)
- DEMO-REVIEW-OUTPUT.txt (example review)

**Next Steps:** Manual agent reviews (16 batches, ~2-3 hours)

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
2026-02-04 16:45:47 +01:00

443 lines
16 KiB
Python
Executable file

#!/usr/bin/env python3
"""
Extract audit context for quiz questions.
Reads 256 quiz questions from landing repo, resolves their doc_reference anchors
to sections in the guide, and extracts relevant context (max 150 lines per question).
Output: claudedocs/audit-context.json
Strategies for resolving doc_reference.anchor (in order):
A. Anchor matching: Convert anchor to markdown heading and search
B. Section name matching: Fuzzy match on doc_reference.section
C. reference.yaml fallback: Use line numbers from index (planned; not yet implemented)
D. UNRESOLVED: Flag if no match found
Usage:
python3 scripts/extract-audit-context.py
Requirements:
- pyyaml (pip install pyyaml)
- thefuzz (pip install thefuzz)
"""
import json
import re
import sys
from pathlib import Path
from typing import Dict, List, Tuple, Optional
import yaml
try:
from thefuzz import fuzz
except ImportError:
print("Error: thefuzz not installed. Run: pip install thefuzz", file=sys.stderr)
sys.exit(1)
# ═══════════════════════════════════════════════════════════════
# Configuration
# ═══════════════════════════════════════════════════════════════
# Repository root: two directory levels above this script file.
BASE_DIR = Path(__file__).parent.parent
# Landing-site checkout, looked up as a sibling directory of the repository root.
LANDING_DIR = Path(__file__).parent.parent.parent / "claude-code-ultimate-guide-landing"
# Directory scanned for question files (globbed as '*/*.md' by process_questions).
QUESTIONS_DIR = LANDING_DIR / "questions"
# Index file read by load_reference_yaml().
REFERENCE_YAML = BASE_DIR / "machine-readable" / "reference.yaml"
# Destination of the generated audit context written by main().
OUTPUT_JSON = BASE_DIR / "claudedocs" / "audit-context.json"
CONTEXT_LINES = 150  # Max lines of guide context per question
# Cache for loaded guide files: relative guide path -> list of file lines
# (filled by load_guide so each guide is read from disk at most once).
_GUIDE_CACHE: Dict[str, List[str]] = {}
# ═══════════════════════════════════════════════════════════════
# Parsing Utilities (reuse from build-questions.py)
# ═══════════════════════════════════════════════════════════════
def parse_frontmatter(content: str) -> Tuple[Dict, str]:
    """Parse YAML frontmatter and body from Markdown content.

    The content must begin with a ``---`` line. The frontmatter runs up to
    the next ``---`` line; everything after that line is the body.

    Args:
        content: Full text of a Markdown file.

    Returns:
        ``(frontmatter, body)``. ``frontmatter`` is always a dict; it is
        empty when the frontmatter block contains no keys.

    Raises:
        ValueError: If the opening or closing ``---`` is missing, the YAML
            cannot be parsed, or the frontmatter is not a mapping.
    """
    # A UTF-8 byte-order mark would otherwise hide the opening '---'.
    lines = content.lstrip('\ufeff').split('\n')
    if lines[0].strip() != '---':
        raise ValueError("File must start with YAML frontmatter (---)")

    closing_idx = next(
        (idx for idx in range(1, len(lines)) if lines[idx].strip() == '---'),
        None,
    )
    if closing_idx is None:
        raise ValueError("Invalid frontmatter structure (missing closing ---)")

    yaml_text = '\n'.join(lines[1:closing_idx])
    body_text = '\n'.join(lines[closing_idx + 1:])

    try:
        frontmatter = yaml.safe_load(yaml_text)
    except yaml.YAMLError as e:
        raise ValueError(f"Invalid YAML frontmatter: {e}") from e

    # safe_load returns None for an empty document and may return a scalar
    # or list for non-mapping YAML; callers index the result by key, so
    # normalise the former and reject the latter with a clear message.
    if frontmatter is None:
        frontmatter = {}
    elif not isinstance(frontmatter, dict):
        raise ValueError(
            f"Invalid YAML frontmatter: expected a mapping, got {type(frontmatter).__name__}"
        )

    return frontmatter, body_text
def split_body(body: str) -> Tuple[str, str]:
    """Separate a question body into its question and explanation parts.

    The split happens at the first line consisting solely of ``---`` that is
    not inside a fenced code block (fences are lines starting with three
    backticks).

    Args:
        body: Markdown text following the frontmatter.

    Returns:
        ``(question, explanation)``, each stripped of surrounding whitespace.

    Raises:
        ValueError: If no separator line exists outside code fences.
    """
    rows = body.split('\n')
    fenced = False

    for position, row in enumerate(rows):
        stripped = row.strip()
        if stripped.startswith('```'):
            # A fence line only flips the state; it is never the separator.
            fenced = not fenced
        elif stripped == '---' and not fenced:
            before = '\n'.join(rows[:position]).strip()
            after = '\n'.join(rows[position + 1:]).strip()
            return before, after

    raise ValueError("Body must contain --- separator between question and explanation")
# ═══════════════════════════════════════════════════════════════
# Guide Context Resolution
# ═══════════════════════════════════════════════════════════════
def load_guide(guide_file: str = "guide/ultimate-guide.md") -> List[str]:
    """
    Load guide lines from specified file, caching the result per path.

    Args:
        guide_file: Relative path from BASE_DIR (e.g., "guide/ultimate-guide.md")

    Returns:
        List of lines (split on '\\n', without line terminators)

    Raises:
        FileNotFoundError: If the path does not point to an existing file.
    """
    if guide_file in _GUIDE_CACHE:
        return _GUIDE_CACHE[guide_file]

    guide_path = BASE_DIR / guide_file
    # is_file() (rather than exists()) so that a directory path is reported
    # as "not found" instead of failing later with a different error type.
    if not guide_path.is_file():
        raise FileNotFoundError(f"Guide file not found: {guide_path}")

    # Explicit UTF-8: the guide is Markdown with non-ASCII characters and the
    # platform default encoding is not guaranteed to be UTF-8.
    lines = guide_path.read_text(encoding='utf-8').split('\n')
    _GUIDE_CACHE[guide_file] = lines
    return lines
def load_reference_yaml() -> Dict:
    """Load reference.yaml for fallback line numbers.

    Returns:
        The parsed YAML content, or an empty dict when the file is missing
        or empty.
    """
    if not REFERENCE_YAML.exists():
        return {}
    # Explicit UTF-8 so parsing does not depend on the platform locale.
    data = yaml.safe_load(REFERENCE_YAML.read_text(encoding='utf-8'))
    # safe_load yields None for an empty file; keep the declared dict contract.
    return data or {}
def anchor_to_heading(anchor: str) -> str:
    """
    Normalize a doc_reference anchor into lowercase heading text for matching.

    Leading '#' characters and a leading "<digits>-" section-number prefix are
    dropped, remaining hyphens become spaces, and the result is stripped.

    Examples:
        - '#11-installation' -> 'installation'
        - '#core-concepts'   -> 'core concepts'
        - '#32-common-tasks' -> 'common tasks'
    """
    slug = anchor.lstrip('#').lower()

    # Drop a numeric section prefix such as "11-" when present.
    number_prefix = re.match(r'\d+-', slug)
    if number_prefix:
        slug = slug[number_prefix.end():]

    return ' '.join(slug.split('-')).strip()
def find_heading_in_guide(guide_lines: List[str], target_heading: str) -> Optional[int]:
    """
    Find line number of heading in guide (substring match, then fuzzy match).

    Only lines starting with '#' that are outside fenced code blocks are
    considered, so shell/Python comments in code samples are not mistaken for
    headings. For each heading, the '#' markers and a leading section number
    ('1.', '1.1', '1.2.3', ...) are removed before comparing:
    - The first heading where either text contains the other is returned.
    - Otherwise the heading with the highest fuzz.ratio is returned if its
      score is at least 70.

    Args:
        guide_lines: Lines of the guide file
        target_heading: Heading text to look for (case-insensitive)

    Returns:
        Line number (0-indexed) or None if not found
    """
    target_lower = target_heading.lower().strip()
    if not target_lower:
        # An empty string is a substring of every heading; refuse to guess.
        return None

    best_score = 0
    best_line = None
    in_code_block = False

    for idx, line in enumerate(guide_lines):
        # Track ``` fences the same way split_body does.
        if line.strip().startswith('```'):
            in_code_block = not in_code_block
            continue
        if in_code_block or not line.startswith('#'):
            continue

        # Extract heading text (remove #, ##, etc.)
        heading_text = re.sub(r'^#+\s*', '', line).lower()
        # Remove leading section numbers like '1.', '1.1', '1.2.3'
        heading_text = re.sub(r'^\d+(?:\.\d+)*\.?\s*', '', heading_text).strip()
        if not heading_text:
            # A bare '#' or number-only heading would match any target below.
            continue

        # Strategy 1: Substring match (either direction)
        if target_lower in heading_text or heading_text in target_lower:
            return idx

        # Strategy 2: Fuzzy match, remember the best candidate
        score = fuzz.ratio(target_lower, heading_text)
        if score > best_score:
            best_score = score
            best_line = idx

    # Threshold 70 to tolerate wording variations between anchor and heading
    if best_score >= 70:
        return best_line
    return None
def extract_section_context(guide_lines: List[str], start_line: int, max_lines: int = CONTEXT_LINES) -> str:
    """
    Extract context from guide starting at start_line.

    Stops at next heading of same/higher level or after max_lines. Lines
    inside fenced code blocks are never treated as headings, so a
    '# comment' in a code sample does not cut the section short.

    Args:
        guide_lines: Full guide lines
        start_line: Starting line number (0-indexed)
        max_lines: Maximum lines to extract

    Returns:
        Context text (empty string if start_line is out of range)
    """
    # Reject negative indices too: they would silently wrap around.
    if not 0 <= start_line < len(guide_lines):
        return ""

    # Determine heading level of start line (0 when it is not a heading,
    # in which case no later heading can terminate the extraction).
    start_match = re.match(r'^#+', guide_lines[start_line])
    start_level = len(start_match.group()) if start_match else 0

    context_lines = []
    in_code_block = False
    window = guide_lines[start_line:start_line + max(max_lines, 0)]

    for offset, line in enumerate(window):
        if line.strip().startswith('```'):
            in_code_block = not in_code_block
        elif offset > 0 and not in_code_block and line.startswith('#'):
            # Stop at next heading of same/higher level (never the start line itself)
            heading_level = len(re.match(r'^#+', line).group())
            if heading_level <= start_level:
                break
        context_lines.append(line)

    return '\n'.join(context_lines)
def resolve_doc_reference(doc_ref: Dict, reference_yaml: Dict) -> Dict:
    """
    Resolve doc_reference to guide context.

    Strategies (in order):
    A. Anchor matching: Convert anchor to heading and search
    B. Section name matching: Fuzzy match on section field
    C. reference.yaml fallback: Not implemented yet (reference_yaml is unused)
    D. UNRESOLVED: No match found

    Args:
        doc_ref: The question's doc_reference mapping; the keys 'file',
            'anchor' and 'section' are read. A missing or empty 'file'
            falls back to 'guide/ultimate-guide.md'. A non-dict value is
            treated as an empty reference.
        reference_yaml: Parsed reference.yaml (reserved for strategy C).

    Returns:
        {
            'strategy': 'anchor|section|reference_yaml|unresolved|file_not_found',
            'context': 'extracted guide text or empty',
            'line_number': int (0-indexed) or None,
            'confidence': int (0-100),
            'source_file': str (actual file searched)
        }
    """
    if not isinstance(doc_ref, dict):
        doc_ref = {}

    # `or` (not a .get default) so an explicit `file: null` / '' in the
    # frontmatter also falls back to the main guide instead of crashing.
    guide_file = doc_ref.get('file') or 'guide/ultimate-guide.md'

    result = {
        'strategy': 'unresolved',
        'context': '',
        'line_number': None,
        'confidence': 0,
        'source_file': guide_file
    }

    # Load the correct guide file
    try:
        guide_lines = load_guide(guide_file)
    except FileNotFoundError:
        result['strategy'] = 'file_not_found'
        return result

    # (strategy name / doc_ref key, normaliser, confidence), tried in order.
    strategies = (
        ('anchor', anchor_to_heading, 95),   # Strategy A
        ('section', str.lower, 80),          # Strategy B
    )
    for key, normalise, confidence in strategies:
        raw_value = doc_ref.get(key)
        if not raw_value:
            continue
        # str(): YAML may parse values such as `1.1` as numbers.
        line_num = find_heading_in_guide(guide_lines, normalise(str(raw_value)))
        if line_num is not None:
            result['strategy'] = key
            result['line_number'] = line_num
            result['context'] = extract_section_context(guide_lines, line_num)
            result['confidence'] = confidence
            return result

    # Strategy C: reference.yaml fallback
    # TODO: Implement if anchor/section strategies fail too often
    # For now, skip since reference.yaml has complex structure

    # Strategy D: UNRESOLVED
    return result
# ═══════════════════════════════════════════════════════════════
# Main Processing
# ═══════════════════════════════════════════════════════════════
def process_questions() -> List[Dict]:
    """Process all quiz questions and extract audit context.

    Reads every '*/*.md' file under QUESTIONS_DIR, parses its frontmatter and
    body, resolves its doc_reference (when present) to guide context, and
    prints resolution statistics. Files that fail to parse are reported on
    stderr, skipped, and counted as errors; they are excluded from the
    resolution-rate denominator.

    Returns:
        One dict per successfully processed question.

    Raises:
        SystemExit: If QUESTIONS_DIR is missing or contains no question files.
    """
    if not QUESTIONS_DIR.exists():
        print(f"Error: Questions directory not found: {QUESTIONS_DIR}", file=sys.stderr)
        sys.exit(1)

    # Load reference
    print("Loading reference.yaml...")
    reference_yaml = load_reference_yaml()

    # Find all question files
    md_files = sorted(QUESTIONS_DIR.glob('*/*.md'))
    if not md_files:
        print(f"Error: No .md files found in {QUESTIONS_DIR}", file=sys.stderr)
        sys.exit(1)

    print(f"Found {len(md_files)} question files")
    print()

    # Process each question
    results = []
    stats = {
        'total': len(md_files),
        'anchor': 0,
        'section': 0,
        'reference_yaml': 0,
        'unresolved': 0,
        'no_reference': 0,
        'file_not_found': 0,
        'errors': 0
    }

    for idx, filepath in enumerate(md_files, 1):
        try:
            # Explicit UTF-8: question files contain non-ASCII text and the
            # platform default encoding is not guaranteed to be UTF-8.
            content = filepath.read_text(encoding='utf-8')
            frontmatter, body = parse_frontmatter(content)
            question_text, explanation_text = split_body(body)

            # Build question object
            question_obj = {
                'id': frontmatter['id'],
                'category_id': frontmatter['category_id'],
                'difficulty': frontmatter['difficulty'],
                'profiles': frontmatter['profiles'],
                'question': question_text,
                'options': frontmatter['options'],
                'correct': frontmatter['correct'],
                'explanation': explanation_text,
                'source_file': str(filepath.relative_to(QUESTIONS_DIR.parent))
            }

            # Resolve doc_reference if present
            if 'doc_reference' in frontmatter:
                doc_ref = frontmatter['doc_reference']
                resolution = resolve_doc_reference(doc_ref, reference_yaml)
                question_obj['doc_reference'] = doc_ref
                question_obj['guide_context'] = resolution['context']
                question_obj['resolution_strategy'] = resolution['strategy']
                question_obj['resolution_confidence'] = resolution['confidence']
                question_obj['guide_line_number'] = resolution['line_number']
                question_obj['guide_source_file'] = resolution['source_file']
                stats[resolution['strategy']] += 1
            else:
                question_obj['guide_context'] = ''
                question_obj['resolution_strategy'] = 'no_reference'
                stats['no_reference'] += 1

            results.append(question_obj)

            # Progress indicator
            if idx % 25 == 0:
                print(f"Processed {idx}/{len(md_files)} questions...")
        except Exception as e:
            # Per-file boundary: one malformed question must not abort the
            # whole run, but it is counted so the summary stays honest.
            stats['errors'] += 1
            print(f"Error processing {filepath.name}: {e}", file=sys.stderr)
            continue

    total = stats['total']
    print()
    print("═══════════════════════════════════════════════════════════════")
    print("Resolution Statistics")
    print("═══════════════════════════════════════════════════════════════")
    print(f"Total questions: {total}")
    summary_rows = (
        ("Anchor strategy", 'anchor'),
        ("Section strategy", 'section'),
        ("reference.yaml", 'reference_yaml'),
        ("No doc_reference", 'no_reference'),
        ("File not found", 'file_not_found'),
        ("UNRESOLVED", 'unresolved'),
        ("Errors (skipped)", 'errors'),
    )
    for label, key in summary_rows:
        print(f"{label}: {stats[key]} ({stats[key] / total * 100:.1f}%)")
    print()

    resolved_count = stats['anchor'] + stats['section'] + stats['reference_yaml']
    # Only questions for which a resolution was actually attempted count
    # towards the rate: exclude those without a reference and failed files.
    attempted = total - stats['no_reference'] - stats['errors']
    resolution_rate = resolved_count / attempted * 100 if attempted > 0 else 0
    print(f"Resolution rate (excl. no_reference): {resolution_rate:.1f}%")

    if resolution_rate < 95:
        print()
        print("⚠️ WARNING: Resolution rate < 95%. Consider improving strategies.")

    return results
def main():
    """Main entry point.

    Runs process_questions(), then writes the collected questions to
    OUTPUT_JSON (creating its parent directory if needed) as UTF-8 JSON with
    a version, the generation date, and the question count.
    """
    # Function-scope import keeps this fix self-contained in the block.
    from datetime import date

    print("═══════════════════════════════════════════════════════════════")
    print("Quiz Question Audit Context Extraction")
    print("═══════════════════════════════════════════════════════════════")
    print()

    # Process questions
    questions_with_context = process_questions()

    # Ensure output directory exists
    OUTPUT_JSON.parent.mkdir(parents=True, exist_ok=True)

    # Write output
    output_data = {
        'version': '1.0',
        # Actual run date (ISO 8601) instead of a hard-coded literal, so the
        # file does not claim a stale generation date on later runs.
        'generated_at': date.today().isoformat(),
        'total_questions': len(questions_with_context),
        'questions': questions_with_context
    }
    # ensure_ascii=False emits raw non-ASCII characters, so the file encoding
    # must be pinned to UTF-8 rather than left to the platform default.
    OUTPUT_JSON.write_text(
        json.dumps(output_data, indent=2, ensure_ascii=False) + '\n',
        encoding='utf-8',
    )

    print()
    print("═══════════════════════════════════════════════════════════════")
    print(f"✓ Output written to: {OUTPUT_JSON}")
    print(f" Total questions: {len(questions_with_context)}")
    print("═══════════════════════════════════════════════════════════════")


if __name__ == '__main__':
    main()