docs: factual audit + reference sync — 260 findings corrected

Parallel 6-agent audit against official Anthropic docs (llms-full.txt). Key corrections applied across permissions, hooks, MCP, security, privacy, reference.yaml. Highlights: - Fix MCP config path (~/.claude.json), mcpServers key, variable substitution syntax - Fix permission modes (5 not 3), :* syntax (×6), Stop event description - Fix hook JSON field names (hook_event_name, tool_name, tool_input, session_id) - Fix filesystem restriction docs (permission rules, not settings.json keys) - Fix data-privacy: 4-tier retention, /bug 5yr warning, ZDR conditions, 5 telemetry opt-out vars - Add official llms.txt/llms-full.txt references to CLAUDE.md + machine-readable/llms.txt - Reference.yaml: 375 entries re-synced (92% had wrong line numbers — guide grew 15K→21K lines) - New script: scripts/resync-reference-yaml.py for automated line number sync - Quiz: corrected answers for hooks (07), memory settings (03), MCP servers (08) Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-02-26 12:10:14 +01:00 · 2026-02-26 12:10:14 +01:00 · 8e63d84b47
commit 8e63d84b47
parent ad735dfff4
26 changed files with 1125 additions and 566 deletions
--- a/scripts/resync-reference-yaml.py
+++ b/scripts/resync-reference-yaml.py
@ -0,0 +1,406 @@
+#!/usr/bin/env python3
+"""
+Re-sync line numbers in machine-readable/reference.yaml
+
+Strategy:
+  1. Build header index for each guide file (all lines starting with #)
+  2. For each reference.yaml entry with a line number, read what's at that line
+  3. If content doesn't look right, search by key-name keywords in headers
+  4. Output a patch file with proposed fixes + confidence scores
+
+Usage:
+  python3 scripts/resync-reference-yaml.py [--apply]
+
+  Without --apply: prints report to stdout + saves claudedocs/resync-report.md
+  With --apply:    applies HIGH CONFIDENCE fixes directly to reference.yaml
+"""
+
+import re
+import sys
+import os
+from pathlib import Path
+
+REPO_ROOT = Path(__file__).parent.parent
+YAML_FILE = REPO_ROOT / "machine-readable" / "reference.yaml"
+REPORT_FILE = REPO_ROOT / "claudedocs" / "resync-report.md"
+
+# Files that reference.yaml points to (with bare integers → ultimate-guide.md)
+GUIDE_FILES = {
+    "guide/ultimate-guide.md": None,
+    "guide/architecture.md": None,
+    "guide/workflows/iterative-refinement.md": None,
+    "guide/observability.md": None,
+    "guide/learning-with-ai.md": None,
+    "guide/ai-ecosystem.md": None,
+    "guide/ai-traceability.md": None,
+    "guide/sandbox-isolation.md": None,
+    "guide/sandbox-native.md": None,
+    "guide/known-issues.md": None,
+    "guide/third-party-tools.md": None,
+    "guide/adoption-approaches.md": None,
+    "examples/commands/review-pr.md": None,
+    "examples/agents/code-reviewer.md": None,
+}
+
+
+def build_header_index(filepath: Path) -> list[tuple[int, str]]:
+    """Return list of (line_num, header_text) for all # headers."""
+    headers = []
+    try:
+        with open(filepath, encoding="utf-8") as f:
+            for i, line in enumerate(f, 1):
+                stripped = line.rstrip()
+                if stripped.startswith("#"):
+                    headers.append((i, stripped))
+    except FileNotFoundError:
+        pass
+    return headers
+
+
+def get_line_content(filepath: Path, line_num: int, context: int = 2) -> str:
+    """Return content around a given line number."""
+    try:
+        with open(filepath, encoding="utf-8") as f:
+            lines = f.readlines()
+        start = max(0, line_num - 1 - context)
+        end = min(len(lines), line_num + context)
+        result = []
+        for i in range(start, end):
+            marker = ">>>" if i == line_num - 1 else "   "
+            result.append(f"{marker} {i+1}: {lines[i].rstrip()}")
+        return "\n".join(result)
+    except FileNotFoundError:
+        return "(file not found)"
+
+
+def key_to_keywords(key: str) -> list[str]:
+    """Convert snake_case key to search keywords, filtering noise words."""
+    noise = {"the", "a", "an", "and", "or", "of", "in", "to", "for",
+             "is", "at", "by", "on", "it", "its", "from", "with",
+             "guide", "section", "line", "ref", "reference", "api",
+             "mode", "modes", "type", "types", "table", "example",
+             "examples", "list", "advanced", "basic", "overview", "vs"}
+    words = re.split(r"[_\-]", key.lower())
+    return [w for w in words if w not in noise and len(w) > 2]
+
+
+def score_header(keywords: list[str], header_text: str) -> int:
+    """Score a header based on keyword matches (higher = better)."""
+    header_lower = header_text.lower()
+    score = 0
+    for kw in keywords:
+        if kw in header_lower:
+            score += 1
+        # Partial match bonus
+        elif len(kw) > 4 and any(kw in w for w in header_lower.split()):
+            score += 0.5
+    return score
+
+
+def find_best_header(key: str, headers: list[tuple[int, str]],
+                     old_line: int) -> tuple[int | None, float, str]:
+    """Find best matching header. Returns (new_line, confidence, header_text)."""
+    keywords = key_to_keywords(key)
+    if not keywords:
+        return None, 0.0, ""
+
+    best_score = 0
+    best_line = None
+    best_text = ""
+    second_best = 0
+
+    for line_num, header_text in headers:
+        score = score_header(keywords, header_text)
+        if score > best_score:
+            second_best = best_score
+            best_score = score
+            best_line = line_num
+            best_text = header_text
+        elif score > second_best:
+            second_best = score
+
+    if best_score == 0:
+        return None, 0.0, ""
+
+    # Confidence: high if best is clearly better than second best
+    if best_score >= 2 and (second_best == 0 or best_score / max(second_best, 0.1) >= 2):
+        confidence = min(1.0, best_score / max(len(keywords), 1))
+    else:
+        confidence = 0.4 * (best_score / max(len(keywords), 1))
+
+    return best_line, confidence, best_text
+
+
+def parse_yaml_line_refs(yaml_content: str) -> list[dict]:
+    """
+    Parse reference.yaml and extract all line number references.
+    Returns list of {key, file, old_line, yaml_line, raw_value}
+    """
+    results = []
+    in_main_guide_section = False
+    lines = yaml_content.split("\n")
+
+    for i, line in enumerate(lines, 1):
+        stripped = line.strip()
+        if not stripped or stripped.startswith("#"):
+            # Track if we're in the main guide section (bare integers = ultimate-guide.md)
+            if "# Main guide (guide/ultimate-guide.md)" in stripped:
+                in_main_guide_section = True
+            elif stripped.startswith("#") and in_main_guide_section:
+                # New major section — stop treating bare ints as ultimate-guide.md
+                # But keep it true since the pattern continues after comments
+                pass
+            continue
+
+        # Pattern 1: key: "filepath:NNNN"  or  key: "filepath:NNNN"  # comment
+        m = re.match(r'^(\s*)(\S+):\s*"([^"]+):(\d+)"', line)
+        if m:
+            key = m.group(2).rstrip(":")
+            filepath = m.group(3)
+            line_num = int(m.group(4))
+            results.append({
+                "key": key,
+                "file": filepath,
+                "old_line": line_num,
+                "yaml_line": i,
+                "raw_value": f'"{filepath}:{line_num}"',
+                "type": "string_ref",
+            })
+            continue
+
+        # Pattern 2: key: NNNN  (bare integer, implies ultimate-guide.md if in that section)
+        m = re.match(r'^(\s*)(\S+):\s*(\d{3,5})\s*(?:#.*)?$', line)
+        if m:
+            key = m.group(2).rstrip(":")
+            line_num = int(m.group(3))
+            # Heuristic: if line_num > 100 and key looks like a content reference
+            # (not a count, score, year etc.)
+            if line_num > 100 and not any(x in key for x in
+                    ["count", "score", "stars", "year", "limit", "budget",
+                     "savings", "total", "ratio", "budget", "sizing"]):
+                results.append({
+                    "key": key,
+                    "file": "guide/ultimate-guide.md",
+                    "old_line": line_num,
+                    "yaml_line": i,
+                    "raw_value": str(line_num),
+                    "type": "bare_int",
+                })
+
+    return results
+
+
+def validate_line(filepath: Path, line_num: int) -> str:
+    """Return the content at a given line (or error)."""
+    try:
+        with open(filepath, encoding="utf-8") as f:
+            lines = f.readlines()
+        if 1 <= line_num <= len(lines):
+            return lines[line_num - 1].rstrip()
+        return f"(line {line_num} out of range, file has {len(lines)} lines)"
+    except FileNotFoundError:
+        return "(file not found)"
+
+
+def is_sensible_content(key: str, content: str) -> bool:
+    """Quick check if content at old line is plausibly related to the key."""
+    if not content or content.startswith("("):
+        return False
+    keywords = key_to_keywords(key)
+    if not keywords:
+        return True
+    content_lower = content.lower()
+    matches = sum(1 for kw in keywords if kw in content_lower)
+    return matches >= max(1, len(keywords) // 2)
+
+
+def main():
+    apply_fixes = "--apply" in sys.argv
+
+    print("Reading reference.yaml...")
+    with open(YAML_FILE, encoding="utf-8") as f:
+        yaml_content = f.read()
+
+    print("Building header indexes...")
+    header_indexes = {}
+    for rel_path in GUIDE_FILES:
+        abs_path = REPO_ROOT / rel_path
+        idx = build_header_index(abs_path)
+        header_indexes[rel_path] = idx
+        if idx:
+            print(f"  {rel_path}: {len(idx)} headers")
+        else:
+            print(f"  {rel_path}: NOT FOUND or no headers")
+
+    print("\nParsing YAML references...")
+    refs = parse_yaml_line_refs(yaml_content)
+    print(f"Found {len(refs)} line number references")
+
+    report_lines = [
+        "# Re-sync Report: machine-readable/reference.yaml",
+        f"Generated: 2026-02-25",
+        f"Total references scanned: {len(refs)}",
+        "",
+    ]
+
+    corrections = []  # (yaml_line, old_value, new_value, key)
+    stats = {"ok": 0, "high": 0, "medium": 0, "low": 0, "unknown": 0, "file_missing": 0}
+
+    ok_entries = []
+    needs_fix = []
+
+    for ref in refs:
+        key = ref["key"]
+        rel_path = ref["file"]
+        old_line = ref["old_line"]
+        abs_path = REPO_ROOT / rel_path
+
+        current_content = validate_line(abs_path, old_line)
+        headers = header_indexes.get(rel_path, [])
+
+        if not headers and not abs_path.exists():
+            stats["file_missing"] += 1
+            needs_fix.append({**ref, "status": "FILE_MISSING", "new_line": None,
+                              "confidence": 0, "current_content": current_content,
+                              "suggested_header": ""})
+            continue
+
+        sensible = is_sensible_content(key, current_content)
+
+        if sensible:
+            stats["ok"] += 1
+            ok_entries.append({**ref, "current_content": current_content})
+        else:
+            new_line, confidence, header_text = find_best_header(key, headers, old_line)
+
+            if confidence >= 0.7:
+                level = "HIGH"
+                stats["high"] += 1
+            elif confidence >= 0.4:
+                level = "MEDIUM"
+                stats["medium"] += 1
+            elif new_line:
+                level = "LOW"
+                stats["low"] += 1
+            else:
+                level = "UNKNOWN"
+                stats["unknown"] += 1
+
+            needs_fix.append({
+                **ref,
+                "status": level,
+                "new_line": new_line,
+                "confidence": confidence,
+                "current_content": current_content,
+                "suggested_header": header_text,
+            })
+
+            if new_line and level in ("HIGH", "MEDIUM"):
+                if ref["type"] == "string_ref":
+                    old_val = f'"{rel_path}:{old_line}"'
+                    new_val = f'"{rel_path}:{new_line}"'
+                else:
+                    old_val = str(old_line)
+                    new_val = str(new_line)
+                corrections.append((ref["yaml_line"], old_val, new_val, key))
+
+    # Build report
+    report_lines += [
+        "## Summary",
+        "",
+        f"| Status | Count |",
+        f"|--------|-------|",
+        f"| OK (content matches) | {stats['ok']} |",
+        f"| HIGH confidence fix | {stats['high']} |",
+        f"| MEDIUM confidence fix | {stats['medium']} |",
+        f"| LOW confidence (manual review) | {stats['low']} |",
+        f"| UNKNOWN (no match found) | {stats['unknown']} |",
+        f"| FILE MISSING | {stats['file_missing']} |",
+        f"| **Total** | **{len(refs)}** |",
+        "",
+        f"Auto-fixable (HIGH + MEDIUM): **{stats['high'] + stats['medium']}**",
+        "",
+    ]
+
+    if needs_fix:
+        report_lines += ["## Entries Needing Correction", ""]
+        for entry in sorted(needs_fix, key=lambda x: (-x["confidence"], x["key"])):
+            status = entry["status"]
+            emoji = {"HIGH": "✅", "MEDIUM": "⚠️", "LOW": "🔶", "UNKNOWN": "❓", "FILE_MISSING": "🚫"}.get(status, "")
+            report_lines.append(f"### {emoji} {entry['key']} ({status}, conf={entry['confidence']:.2f})")
+            report_lines.append(f"- **File**: `{entry['file']}`")
+            report_lines.append(f"- **Old line**: {entry['old_line']}")
+            report_lines.append(f"- **Content at old line**: `{entry['current_content'][:100]}`")
+            if entry["new_line"]:
+                report_lines.append(f"- **Suggested line**: {entry['new_line']}")
+                report_lines.append(f"- **Header found**: `{entry['suggested_header']}`")
+            else:
+                report_lines.append(f"- **Suggested line**: (not found — manual search needed)")
+            report_lines.append("")
+
+    report_lines += ["## OK Entries (sample)", ""]
+    for entry in ok_entries[:20]:
+        report_lines.append(f"- `{entry['key']}`: line {entry['old_line']} → `{entry['current_content'][:80]}`")
+
+    report_content = "\n".join(report_lines)
+
+    # Write report
+    REPORT_FILE.parent.mkdir(exist_ok=True)
+    with open(REPORT_FILE, "w", encoding="utf-8") as f:
+        f.write(report_content)
+    print(f"\nReport saved to {REPORT_FILE}")
+
+    # Print summary
+    print(f"\n{'='*60}")
+    print(f"OK:            {stats['ok']}")
+    print(f"HIGH fix:      {stats['high']}")
+    print(f"MEDIUM fix:    {stats['medium']}")
+    print(f"LOW fix:       {stats['low']}")
+    print(f"UNKNOWN:       {stats['unknown']}")
+    print(f"FILE MISSING:  {stats['file_missing']}")
+    print(f"Auto-fixable:  {stats['high'] + stats['medium']}")
+    print(f"{'='*60}")
+
+    if corrections:
+        print(f"\n{'='*60}")
+        print("PROPOSED CORRECTIONS (HIGH + MEDIUM confidence):")
+        print(f"{'='*60}")
+        for yaml_line, old_val, new_val, key in corrections[:30]:
+            print(f"  Line {yaml_line:4d} | {key}")
+            print(f"           {old_val} → {new_val}")
+        if len(corrections) > 30:
+            print(f"  ... and {len(corrections) - 30} more (see report)")
+
+    if apply_fixes and corrections:
+        print(f"\nApplying {len(corrections)} fixes to reference.yaml...")
+        content = yaml_content
+        applied = 0
+        for yaml_line_num, old_val, new_val, key in corrections:
+            # Find the exact occurrence of old_val near the yaml_line_num
+            # Use line-by-line replacement to be precise
+            lines = content.split("\n")
+            target_idx = yaml_line_num - 1
+            if target_idx < len(lines) and old_val in lines[target_idx]:
+                lines[target_idx] = lines[target_idx].replace(old_val, new_val, 1)
+                applied += 1
+            else:
+                # Try ±2 lines
+                for delta in [-1, 1, -2, 2]:
+                    idx = target_idx + delta
+                    if 0 <= idx < len(lines) and old_val in lines[idx]:
+                        lines[idx] = lines[idx].replace(old_val, new_val, 1)
+                        applied += 1
+                        break
+            content = "\n".join(lines)
+
+        with open(YAML_FILE, "w", encoding="utf-8") as f:
+            f.write(content)
+        print(f"Applied {applied}/{len(corrections)} fixes.")
+        print(f"Run without --apply to verify remaining issues.")
+    elif not apply_fixes and corrections:
+        print(f"\nRun with --apply to apply HIGH+MEDIUM confidence fixes automatically.")
+        print(f"LOW and UNKNOWN confidence fixes require manual review (see report).")
+
+
+if __name__ == "__main__":
+    main()