claude-code-ultimate-guide/scripts/resync-reference-yaml.py

#!/usr/bin/env python3
"""
Re-sync line numbers in machine-readable/reference.yaml

Strategy:
  1. Build header index for each guide file (all lines starting with #)
  2. For each reference.yaml entry with a line number, read what's at that line
  3. If content doesn't look right, search by key-name keywords in headers
  4. Output a Markdown report with proposed fixes + confidence scores

Usage:
  python3 scripts/resync-reference-yaml.py [--apply]

  Without --apply: prints report to stdout + saves claudedocs/resync-report.md
  With --apply:    also applies HIGH and MEDIUM confidence fixes directly to reference.yaml
"""

import re
import sys
import os
from pathlib import Path

# The script lives in <repo>/scripts/, so the repository root is two levels up.
REPO_ROOT = Path(__file__).parent.parent
YAML_FILE = REPO_ROOT.joinpath("machine-readable", "reference.yaml")
REPORT_FILE = REPO_ROOT.joinpath("claudedocs", "resync-report.md")

# Repo-relative files whose headers get indexed. Only the keys are used (the
# values are None placeholders). Bare-integer references in reference.yaml are
# attributed to guide/ultimate-guide.md.
GUIDE_FILES = dict.fromkeys((
    "guide/ultimate-guide.md",
    "guide/architecture.md",
    "guide/workflows/iterative-refinement.md",
    "guide/observability.md",
    "guide/learning-with-ai.md",
    "guide/ai-ecosystem.md",
    "guide/ai-traceability.md",
    "guide/sandbox-isolation.md",
    "guide/sandbox-native.md",
    "guide/known-issues.md",
    "guide/third-party-tools.md",
    "guide/adoption-approaches.md",
    "examples/commands/review-pr.md",
    "examples/agents/code-reviewer.md",
))


def build_header_index(filepath: Path) -> list[tuple[int, str]]:
    """Return (line_num, header_text) pairs for the Markdown headers in a file.

    A header is a line whose first character is ``#``. Lines inside fenced
    code blocks (opened by three or more backticks or tildes) are skipped, so
    ``# comment`` lines in shell/Python samples are not mistaken for section
    headers.

    Args:
        filepath: File to scan; read as UTF-8.

    Returns:
        1-based line numbers paired with the right-stripped header line, in
        file order. A missing file yields an empty list.
    """
    headers = []
    open_fence = None  # (fence character, run length) while inside a code fence
    try:
        with open(filepath, encoding="utf-8") as f:
            for i, line in enumerate(f, 1):
                stripped = line.rstrip()
                candidate = stripped.lstrip(" ")
                indent = len(stripped) - len(candidate)
                # A fence marker may be indented by at most three spaces.
                if indent <= 3 and candidate[:3] in ("```", "~~~"):
                    fence_char = candidate[0]
                    run = len(candidate) - len(candidate.lstrip(fence_char))
                    if open_fence is None:
                        open_fence = (fence_char, run)
                    elif (fence_char == open_fence[0]
                          and run >= open_fence[1]
                          and not candidate[run:].strip()):
                        # Closing fence: same character, at least as long,
                        # and nothing but whitespace after it.
                        open_fence = None
                    continue
                if open_fence is None and stripped.startswith("#"):
                    headers.append((i, stripped))
    except FileNotFoundError:
        # Best effort: callers treat an empty index as "not found or no headers".
        pass
    return headers


def get_line_content(filepath: Path, line_num: int, context: int = 2) -> str:
    """Render the lines surrounding ``line_num`` as a small annotated excerpt.

    Each output row is ``<marker> <1-based line number>: <text>``, where the
    marker is ``>>>`` for the requested line and three spaces otherwise. Up to
    ``context`` lines are shown on either side. Returns the literal string
    ``(file not found)`` when the file does not exist.
    """
    try:
        with open(filepath, encoding="utf-8") as handle:
            all_lines = handle.readlines()
    except FileNotFoundError:
        return "(file not found)"

    target = line_num - 1  # zero-based index of the requested line
    first = max(0, target - context)
    last = min(len(all_lines), line_num + context)
    return "\n".join(
        f"{'>>>' if idx == target else '   '} {idx + 1}: {all_lines[idx].rstrip()}"
        for idx in range(first, last)
    )


def key_to_keywords(key: str) -> list[str]:
    """Split a snake_case / kebab-case key into lowercase search keywords.

    Tokens of two characters or fewer and generic filler words are dropped;
    the remaining tokens keep their original order.
    """
    stop_words = {
        "the", "a", "an", "and", "or", "of", "in", "to", "for",
        "is", "at", "by", "on", "it", "its", "from", "with",
        "guide", "section", "line", "ref", "reference", "api",
        "mode", "modes", "type", "types", "table", "example",
        "examples", "list", "advanced", "basic", "overview", "vs",
    }
    keywords = []
    for token in re.split(r"[_\-]", key.lower()):
        if len(token) <= 2 or token in stop_words:
            continue
        keywords.append(token)
    return keywords


def score_header(keywords: list[str], header_text: str) -> float:
    """Score a header by how many keywords it mentions (higher = better).

    Matching is case-insensitive. Each keyword contributes:
      * 1   if it occurs anywhere in the header as a substring;
      * 0.5 if it is longer than 4 characters and starts with a header word
            of at least 4 characters (e.g. keyword "workflows" against the
            header word "workflow"), i.e. the header uses a shorter stem;
      * 0   otherwise.

    Args:
        keywords: Lowercase keywords, typically from ``key_to_keywords``.
        header_text: Raw header line.

    Returns:
        The summed score; 0 when nothing matches.
    """
    header_lower = header_text.lower()
    # Alphanumeric words only, so punctuation such as ':' or '`' does not
    # prevent a stem match.
    header_words = re.findall(r"[^\W_]+", header_lower)
    score = 0.0
    for kw in keywords:
        if kw in header_lower:
            score += 1
        # Partial match bonus. Checking whether the keyword is contained in a
        # header word can never fire here (the substring test above would
        # already have matched), so test the opposite direction: a header
        # word that is a prefix of the keyword.
        elif len(kw) > 4 and any(len(w) >= 4 and kw.startswith(w)
                                 for w in header_words):
            score += 0.5
    return score


def find_best_header(key: str, headers: list[tuple[int, str]],
                     old_line: int) -> tuple[int | None, float, str]:
    """Pick the header that best matches ``key``.

    Returns ``(new_line, confidence, header_text)``, or ``(None, 0.0, "")``
    when the key yields no keywords or no header scores above zero. Among
    equally scored headers the earliest one wins. ``old_line`` is accepted
    for interface compatibility but is not used in the ranking.
    """
    no_match = (None, 0.0, "")

    keywords = key_to_keywords(key)
    if not keywords:
        return no_match

    scored = [(score_header(keywords, text), num, text) for num, text in headers]
    if not scored:
        return no_match

    # max() keeps the first of several equal maxima, i.e. the earliest header.
    top_score, top_line, top_text = max(scored, key=lambda item: item[0])
    if top_score == 0:
        return no_match

    ranked = sorted((item[0] for item in scored), reverse=True)
    runner_up = ranked[1] if len(ranked) > 1 else 0

    # Fraction of the keywords that the winning header covers.
    coverage = top_score / max(len(keywords), 1)

    # The winner is "clear" when nothing else scored, or it scored at least
    # twice as much as the runner-up.
    clear_winner = runner_up == 0 or top_score / max(runner_up, 0.1) >= 2
    if top_score >= 2 and clear_winner:
        confidence = min(1.0, coverage)
    else:
        confidence = 0.4 * coverage

    return top_line, confidence, top_text


def parse_yaml_line_refs(yaml_content: str) -> list[dict]:
    """Extract every line-number reference from the text of reference.yaml.

    Two line shapes are recognised (blank lines and ``#`` comment lines are
    skipped):

      1. ``key: "filepath:NNNN"`` (optionally followed by a comment), which
         yields an entry of type ``"string_ref"`` pointing at ``filepath``.
      2. ``key: NNNN`` with a bare 3-5 digit integer greater than 100, which
         yields an entry of type ``"bare_int"``. Keys containing a word that
         marks the value as a quantity (count, score, year, ...) are ignored.

    NOTE: bare integers are always attributed to ``guide/ultimate-guide.md``,
    regardless of which section of the YAML file they appear in.

    Returns:
        A list of dicts with the keys ``key``, ``file``, ``old_line``,
        ``yaml_line`` (1-based line in the YAML text), ``raw_value`` and
        ``type``, in file order.
    """
    # Compiled once instead of on every line.
    string_ref_re = re.compile(r'^(\s*)(\S+):\s*"([^"]+):(\d+)"')
    bare_int_re = re.compile(r'^(\s*)(\S+):\s*(\d{3,5})\s*(?:#.*)?$')
    # Key fragments indicating the integer is a quantity, not a line number.
    quantity_markers = ("count", "score", "stars", "year", "limit", "budget",
                        "savings", "total", "ratio", "sizing")

    results = []
    for i, line in enumerate(yaml_content.split("\n"), 1):
        stripped = line.strip()
        if not stripped or stripped.startswith("#"):
            continue

        # Pattern 1: key: "filepath:NNNN"  or  key: "filepath:NNNN"  # comment
        m = string_ref_re.match(line)
        if m:
            filepath = m.group(3)
            line_num = int(m.group(4))
            results.append({
                "key": m.group(2).rstrip(":"),
                "file": filepath,
                "old_line": line_num,
                "yaml_line": i,
                "raw_value": f'"{filepath}:{line_num}"',
                "type": "string_ref",
            })
            continue

        # Pattern 2: key: NNNN  (bare integer, taken to mean ultimate-guide.md)
        m = bare_int_re.match(line)
        if m:
            key = m.group(2).rstrip(":")
            line_num = int(m.group(3))
            if line_num > 100 and not any(marker in key for marker in quantity_markers):
                results.append({
                    "key": key,
                    "file": "guide/ultimate-guide.md",
                    "old_line": line_num,
                    "yaml_line": i,
                    "raw_value": str(line_num),
                    "type": "bare_int",
                })

    return results


def validate_line(filepath: Path, line_num: int) -> str:
    """Fetch the right-stripped text of the 1-based line ``line_num``.

    Instead of raising, problems are reported as parenthesised placeholder
    strings: ``(file not found)`` for a missing file, or an out-of-range
    message that includes the file's line count.
    """
    try:
        with open(filepath, encoding="utf-8") as handle:
            contents = handle.readlines()
    except FileNotFoundError:
        return "(file not found)"

    if line_num < 1 or line_num > len(contents):
        return f"(line {line_num} out of range, file has {len(contents)} lines)"
    return contents[line_num - 1].rstrip()


def is_sensible_content(key: str, content: str) -> bool:
    """Heuristically decide whether ``content`` plausibly belongs to ``key``.

    Empty content, or content starting with "(" (the placeholder format used
    for lookup errors), is never sensible. A key that produces no keywords is
    always accepted. Otherwise at least half of the keywords (rounded down,
    minimum one) must occur in the content, case-insensitively.
    """
    if not content or content.startswith("("):
        return False

    keywords = key_to_keywords(key)
    if not keywords:
        return True

    haystack = content.lower()
    required = max(1, len(keywords) // 2)
    hits = [kw for kw in keywords if kw in haystack]
    return len(hits) >= required


def _replace_yaml_value(line: str, old_val: str, new_val: str) -> str | None:
    """Swap ``old_val`` for ``new_val`` in the value position of a YAML line.

    The match must directly follow a ``:`` (plus optional whitespace) and must
    not be followed by another digit, so the same digits inside the key name
    or inside a longer number are left alone. Returns the rewritten line, or
    None when the line does not carry ``old_val`` as a value.
    """
    pattern = r"(:\s*)" + re.escape(old_val) + r"(?!\d)"
    # A callable replacement keeps new_val literal (no backslash escapes).
    updated, count = re.subn(pattern, lambda m: m.group(1) + new_val, line, count=1)
    return updated if count else None


def _apply_corrections(content: str, corrections: list[tuple]) -> tuple[str, int]:
    """Apply ``(yaml_line, old_val, new_val, key)`` corrections to YAML text.

    Each correction is tried on its recorded line first, then on the lines
    within two of it. Returns the rewritten text and the number of corrections
    that were actually applied.
    """
    lines = content.split("\n")  # split once; replacements never change line count
    applied = 0
    for yaml_line_num, old_val, new_val, _key in corrections:
        target_idx = yaml_line_num - 1
        for delta in (0, -1, 1, -2, 2):
            idx = target_idx + delta
            if not 0 <= idx < len(lines):
                continue
            updated = _replace_yaml_value(lines[idx], old_val, new_val)
            if updated is not None:
                lines[idx] = updated
                applied += 1
                break
    return "\n".join(lines), applied


def main():
    """Scan reference.yaml, report stale line references, optionally fix them.

    Reads the ``--apply`` flag from ``sys.argv``. Every run writes a Markdown
    report to REPORT_FILE and prints a summary to stdout. With ``--apply`` the
    HIGH and MEDIUM confidence corrections are also written back to YAML_FILE
    in place; LOW and UNKNOWN entries are only reported.
    """
    from datetime import date  # local import: only the report header needs it

    apply_fixes = "--apply" in sys.argv

    print("Reading reference.yaml...")
    with open(YAML_FILE, encoding="utf-8") as f:
        yaml_content = f.read()

    print("Building header indexes...")
    header_indexes = {}
    for rel_path in GUIDE_FILES:
        abs_path = REPO_ROOT / rel_path
        idx = build_header_index(abs_path)
        header_indexes[rel_path] = idx
        if idx:
            print(f"  {rel_path}: {len(idx)} headers")
        else:
            print(f"  {rel_path}: NOT FOUND or no headers")

    print("\nParsing YAML references...")
    refs = parse_yaml_line_refs(yaml_content)
    print(f"Found {len(refs)} line number references")

    report_lines = [
        "# Re-sync Report: machine-readable/reference.yaml",
        # Stamp the report with the actual run date.
        f"Generated: {date.today().isoformat()}",
        f"Total references scanned: {len(refs)}",
        "",
    ]

    corrections = []  # (yaml_line, old_value, new_value, key)
    stats = {"ok": 0, "high": 0, "medium": 0, "low": 0, "unknown": 0, "file_missing": 0}

    ok_entries = []
    needs_fix = []

    for ref in refs:
        key = ref["key"]
        rel_path = ref["file"]
        old_line = ref["old_line"]
        abs_path = REPO_ROOT / rel_path

        current_content = validate_line(abs_path, old_line)
        # Files outside GUIDE_FILES have no index and fall back to [].
        headers = header_indexes.get(rel_path, [])

        if not headers and not abs_path.exists():
            stats["file_missing"] += 1
            needs_fix.append({**ref, "status": "FILE_MISSING", "new_line": None,
                              "confidence": 0, "current_content": current_content,
                              "suggested_header": ""})
            continue

        if is_sensible_content(key, current_content):
            stats["ok"] += 1
            ok_entries.append({**ref, "current_content": current_content})
            continue

        new_line, confidence, header_text = find_best_header(key, headers, old_line)

        if confidence >= 0.7:
            level = "HIGH"
            stats["high"] += 1
        elif confidence >= 0.4:
            level = "MEDIUM"
            stats["medium"] += 1
        elif new_line:
            level = "LOW"
            stats["low"] += 1
        else:
            level = "UNKNOWN"
            stats["unknown"] += 1

        needs_fix.append({
            **ref,
            "status": level,
            "new_line": new_line,
            "confidence": confidence,
            "current_content": current_content,
            "suggested_header": header_text,
        })

        # Only HIGH and MEDIUM matches become auto-applicable corrections.
        if new_line and level in ("HIGH", "MEDIUM"):
            if ref["type"] == "string_ref":
                old_val = f'"{rel_path}:{old_line}"'
                new_val = f'"{rel_path}:{new_line}"'
            else:
                old_val = str(old_line)
                new_val = str(new_line)
            corrections.append((ref["yaml_line"], old_val, new_val, key))

    # Build report
    report_lines += [
        "## Summary",
        "",
        "| Status | Count |",
        "|--------|-------|",
        f"| OK (content matches) | {stats['ok']} |",
        f"| HIGH confidence fix | {stats['high']} |",
        f"| MEDIUM confidence fix | {stats['medium']} |",
        f"| LOW confidence (manual review) | {stats['low']} |",
        f"| UNKNOWN (no match found) | {stats['unknown']} |",
        f"| FILE MISSING | {stats['file_missing']} |",
        f"| **Total** | **{len(refs)}** |",
        "",
        f"Auto-fixable (HIGH + MEDIUM): **{stats['high'] + stats['medium']}**",
        "",
    ]

    if needs_fix:
        report_lines += ["## Entries Needing Correction", ""]
        status_emoji = {"HIGH": "✅", "MEDIUM": "⚠️", "LOW": "🔶",
                        "UNKNOWN": "❓", "FILE_MISSING": "🚫"}
        # Most confident suggestions first, then alphabetical by key.
        for entry in sorted(needs_fix, key=lambda x: (-x["confidence"], x["key"])):
            status = entry["status"]
            emoji = status_emoji.get(status, "")
            report_lines.append(f"### {emoji} {entry['key']} ({status}, conf={entry['confidence']:.2f})")
            report_lines.append(f"- **File**: `{entry['file']}`")
            report_lines.append(f"- **Old line**: {entry['old_line']}")
            report_lines.append(f"- **Content at old line**: `{entry['current_content'][:100]}`")
            if entry["new_line"]:
                report_lines.append(f"- **Suggested line**: {entry['new_line']}")
                report_lines.append(f"- **Header found**: `{entry['suggested_header']}`")
            else:
                report_lines.append("- **Suggested line**: (not found — manual search needed)")
            report_lines.append("")

    report_lines += ["## OK Entries (sample)", ""]
    for entry in ok_entries[:20]:
        report_lines.append(f"- `{entry['key']}`: line {entry['old_line']} → `{entry['current_content'][:80]}`")

    report_content = "\n".join(report_lines)

    # Write report
    REPORT_FILE.parent.mkdir(parents=True, exist_ok=True)
    with open(REPORT_FILE, "w", encoding="utf-8") as f:
        f.write(report_content)
    print(f"\nReport saved to {REPORT_FILE}")

    # Print summary
    print(f"\n{'='*60}")
    print(f"OK:            {stats['ok']}")
    print(f"HIGH fix:      {stats['high']}")
    print(f"MEDIUM fix:    {stats['medium']}")
    print(f"LOW fix:       {stats['low']}")
    print(f"UNKNOWN:       {stats['unknown']}")
    print(f"FILE MISSING:  {stats['file_missing']}")
    print(f"Auto-fixable:  {stats['high'] + stats['medium']}")
    print(f"{'='*60}")

    if corrections:
        print(f"\n{'='*60}")
        print("PROPOSED CORRECTIONS (HIGH + MEDIUM confidence):")
        print(f"{'='*60}")
        for yaml_line, old_val, new_val, key in corrections[:30]:
            print(f"  Line {yaml_line:4d} | {key}")
            print(f"           {old_val} → {new_val}")
        if len(corrections) > 30:
            print(f"  ... and {len(corrections) - 30} more (see report)")

    if apply_fixes and corrections:
        print(f"\nApplying {len(corrections)} fixes to reference.yaml...")
        content, applied = _apply_corrections(yaml_content, corrections)

        with open(YAML_FILE, "w", encoding="utf-8") as f:
            f.write(content)
        print(f"Applied {applied}/{len(corrections)} fixes.")
        print("Run without --apply to verify remaining issues.")
    elif not apply_fixes and corrections:
        print("\nRun with --apply to apply HIGH+MEDIUM confidence fixes automatically.")
        print("LOW and UNKNOWN confidence fixes require manual review (see report).")

if __name__ == "__main__":
    main()