Implements automated pipeline for quiz question quality control:
**Phase 1: Context Extraction**
- Script: extract-audit-context.py
- Resolves doc_reference anchors to guide sections (97.3% success)
- Multi-file support (ultimate-guide.md, learning-with-ai.md, etc.)
- Fuzzy matching + substring fallback
- Output: audit-context.json (256 questions + context)
**Phase 2: Batch Generation**
- Script: generate-audit-batches.py
- 16 prioritized review batches by category
- Advanced Patterns split into 2 batches (29 questions)
- Embedded review instructions in each batch
- Output: audit-batches/*.md (16,559 lines)
**Phase 3: Report Compilation**
- Script: generate-audit-report.py
- Parses agent review outputs (PASS/ISSUE format)
- Aggregates by severity (critical/warning/info)
- Output: audit-report.md
**Validation:**
- Q01-001 error found immediately (curl vs npm contradiction)
- System working as designed ✅
**Documentation:**
- AUDIT-WORKFLOW.md (complete 5-phase guide)
- AUDIT-SYSTEM-SUMMARY.md (architecture + metrics)
- IMPLEMENTATION-COMPLETE.md (status + validation)
- DEMO-REVIEW-OUTPUT.txt (example review)
**Next Steps:** Manual agent reviews (16 batches, ~2-3 hours)
Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
250 lines
8.6 KiB
Python
Executable file
250 lines
8.6 KiB
Python
Executable file
#!/usr/bin/env python3
|
|
"""
|
|
Generate audit report from agent reviews.
|
|
|
|
Collects agent review outputs and compiles them into a comprehensive report.
|
|
|
|
Input: claudedocs/audit-reviews/*.txt (agent outputs)
|
|
Output: claudedocs/audit-report.md
|
|
|
|
Usage:
|
|
python3 scripts/generate-audit-report.py
|
|
"""
|
|
|
|
import re
|
|
from collections import defaultdict
|
|
from pathlib import Path
|
|
from typing import Dict, List, Tuple
|
|
|
|
|
|
# Directory two levels above this file (the parent of the script's own folder).
BASE_DIR = Path(__file__).parent.parent

# Folder scanned for agent review outputs (*.txt files).
REVIEWS_DIR = BASE_DIR.joinpath("claudedocs", "audit-reviews")

# Destination of the compiled Markdown report.
OUTPUT_REPORT = BASE_DIR.joinpath("claudedocs", "audit-report.md")
|
|
|
|
|
|
# Matches: ISSUE: Q01-002 - [critical] CORRECT_ANSWER - Description
_ISSUE_LINE_RE = re.compile(r'ISSUE:\s+(Q\d+-\d+)\s+-\s+\[(\w+)\]\s+(\w+)\s+-\s+(.+)')


def parse_review_file(filepath: Path) -> Dict:
    """
    Parse agent review output.

    Expected format:
        PASS: Q01-001
        ISSUE: Q01-002 - [critical] CORRECT_ANSWER - Description
        ISSUE: Q01-003 - [warning] AMBIGUITY - Description

    Args:
        filepath: Review file to read (decoded as UTF-8).

    Returns:
        Dict with two keys:
            'pass':   list of question IDs taken from PASS lines
            'issues': list of dicts with 'q_id', 'severity' (lower-cased),
                      'type' and 'description' taken from ISSUE lines

    Lines that start with neither PASS: nor ISSUE: are ignored. ISSUE lines
    that do not follow the expected format are skipped with a warning on
    stderr so that findings are not lost silently.
    """
    import sys  # local import: only needed for the malformed-line warning

    # Explicit encoding: the platform default may not be UTF-8.
    content = filepath.read_text(encoding='utf-8')
    results = {
        'pass': [],
        'issues': []
    }

    for line_no, raw_line in enumerate(content.splitlines(), start=1):
        line = raw_line.strip()
        if not line:
            continue

        if line.startswith('PASS:'):
            # Strip only the leading marker, not every occurrence of "PASS:".
            q_id = line[len('PASS:'):].strip()
            if q_id:  # a bare "PASS:" carries no question ID
                results['pass'].append(q_id)

        elif line.startswith('ISSUE:'):
            match = _ISSUE_LINE_RE.match(line)
            if match is None:
                print(
                    f"Warning: {filepath.name}:{line_no}: skipping malformed ISSUE line: {line}",
                    file=sys.stderr,
                )
                continue

            q_id, severity, issue_type, description = match.groups()
            results['issues'].append({
                'q_id': q_id,
                # Report sections compare against lower-case severity names,
                # so "[Critical]" must not fall through the cracks.
                'severity': severity.lower(),
                'type': issue_type,
                'description': description
            })

    return results
|
|
|
|
|
|
def _issues_with_severity(all_reviews: List[Dict], severity: str) -> List[Dict]:
    """Collect every issue, across all reviews, whose severity equals *severity*."""
    return [
        issue
        for review in all_reviews
        for issue in review['issues']
        if issue['severity'] == severity
    ]


def _percent(part: int, whole: int) -> float:
    """Return *part* as a percentage of *whole*, or 0.0 when *whole* is zero."""
    return part / whole * 100 if whole else 0.0


def generate_report(all_reviews: List[Dict], generated_on: "str | None" = None) -> str:
    """
    Generate comprehensive audit report.

    Args:
        all_reviews: Parsed reviews. Each entry is a dict with a 'pass' list of
            question IDs and an 'issues' list of dicts carrying 'q_id',
            'severity', 'type' and 'description'.
        generated_on: Text shown after "Generated" in the header. Defaults to
            today's date in ISO format.

    Returns:
        The Markdown report as a single newline-joined string.

    Only the severities 'critical', 'warning' and 'info' get their own
    section; issues with any other severity still count towards the totals
    and the per-category table. Percentages are reported as 0.0% when nothing
    was reviewed instead of raising ZeroDivisionError.
    """
    if generated_on is None:
        from datetime import date  # local import keeps the block self-contained
        generated_on = date.today().isoformat()

    critical_issues = _issues_with_severity(all_reviews, 'critical')
    warning_issues = _issues_with_severity(all_reviews, 'warning')
    info_issues = _issues_with_severity(all_reviews, 'info')

    lines = []

    # Header
    lines.append("# Quiz Question Audit Report")
    lines.append("")
    lines.append(f"**Generated**: {generated_on}")
    lines.append("")
    lines.append("---")
    lines.append("")

    # Aggregate statistics
    total_pass = sum(len(r['pass']) for r in all_reviews)
    total_issues = sum(len(r['issues']) for r in all_reviews)
    total_questions = total_pass + total_issues

    lines.append("## Executive Summary")
    lines.append("")
    lines.append(f"**Total Questions Reviewed**: {total_questions}")
    lines.append(f"**Pass**: {total_pass} ({_percent(total_pass, total_questions):.1f}%)")
    lines.append(f"**Issues Found**: {total_issues} ({_percent(total_issues, total_questions):.1f}%)")
    lines.append("")
    lines.append("### Issue Breakdown")
    lines.append("")
    lines.append(f"- **Critical**: {len(critical_issues)} (wrong answer, major factual error)")
    lines.append(f"- **Warning**: {len(warning_issues)} (ambiguous, outdated, misleading)")
    lines.append(f"- **Info**: {len(info_issues)} (minor wording, trivial)")
    lines.append("")
    lines.append("---")
    lines.append("")

    # Critical issues: one sub-section per issue
    lines.append("## Critical Issues (Immediate Fix Required)")
    lines.append("")

    if critical_issues:
        for issue in sorted(critical_issues, key=lambda x: x['q_id']):
            lines.append(f"### {issue['q_id']}")
            lines.append("")
            lines.append(f"**Type**: {issue['type']}")
            lines.append(f"**Issue**: {issue['description']}")
            lines.append("")
    else:
        lines.append("*No critical issues found.*")
        lines.append("")

    lines.append("---")
    lines.append("")

    # Warnings: grouped by issue type
    lines.append("## Warnings (Review & Consider Fixing)")
    lines.append("")

    if warning_issues:
        by_type = defaultdict(list)
        for issue in warning_issues:
            by_type[issue['type']].append(issue)

        for issue_type, issues in sorted(by_type.items()):
            lines.append(f"### {issue_type} ({len(issues)} questions)")
            lines.append("")
            for issue in sorted(issues, key=lambda x: x['q_id']):
                lines.append(f"- **{issue['q_id']}**: {issue['description']}")
            lines.append("")
    else:
        lines.append("*No warnings found.*")
        lines.append("")

    lines.append("---")
    lines.append("")

    # Info: flat bullet list
    lines.append("## Info (Minor Issues)")
    lines.append("")

    if info_issues:
        for issue in sorted(info_issues, key=lambda x: x['q_id']):
            lines.append(f"- **{issue['q_id']}** ({issue['type']}): {issue['description']}")
        lines.append("")
    else:
        lines.append("*No info issues found.*")
        lines.append("")

    lines.append("---")
    lines.append("")

    # Health by category
    lines.append("## Health by Category")
    lines.append("")
    lines.append("| Category | Pass | Issues | Pass Rate |")
    lines.append("|----------|------|--------|-----------|")

    # The category is the part of the ID before the first '-' (Q01-001 -> Q01)
    by_category = defaultdict(lambda: {'pass': 0, 'issues': 0})
    for review in all_reviews:
        for q_id in review['pass']:
            by_category[q_id.split('-')[0]]['pass'] += 1
        for issue in review['issues']:
            by_category[issue['q_id'].split('-')[0]]['issues'] += 1

    for cat in sorted(by_category):
        stats = by_category[cat]
        pass_rate = _percent(stats['pass'], stats['pass'] + stats['issues'])
        lines.append(f"| Category {cat} | {stats['pass']} | {stats['issues']} | {pass_rate:.1f}% |")

    lines.append("")
    lines.append("---")
    lines.append("")

    # Next steps
    lines.append("## Recommended Actions")
    lines.append("")
    lines.append("1. **Fix Critical Issues** (Priority 1)")
    lines.append(" - Review each critical issue")
    lines.append(" - Fix question/answer or update explanation")
    lines.append(" - Rebuild: `python3 scripts/build-questions.py`")
    lines.append("")
    lines.append("2. **Review Warnings** (Priority 2)")
    lines.append(" - Evaluate ambiguities and outdated info")
    lines.append(" - Decide: fix, clarify, or accept")
    lines.append("")
    lines.append("3. **Consider Info Issues** (Priority 3)")
    lines.append(" - Minor improvements for quality")
    lines.append("")

    return '\n'.join(lines)
|
|
|
|
|
|
def main() -> int:
    """
    Main entry point.

    Parses every *.txt file in REVIEWS_DIR, compiles the results with
    generate_report() and writes the report to OUTPUT_REPORT.

    Returns:
        0 on success, 1 when the reviews directory is missing or contains
        no review files.
    """
    # Local import so the error paths also work when main() is called from an
    # importing module; at file level sys is only imported under the
    # __main__ guard.
    import sys

    banner = "═══════════════════════════════════════════════════════════════"

    print(banner)
    print("Quiz Question Audit Report Generation")
    print(banner)
    print()

    if not REVIEWS_DIR.is_dir():
        print(f"Error: Reviews directory not found: {REVIEWS_DIR}", file=sys.stderr)
        print("Place agent review outputs in claudedocs/audit-reviews/*.txt", file=sys.stderr)
        return 1

    # Find review files (sorted for a deterministic processing order)
    review_files = sorted(REVIEWS_DIR.glob('*.txt'))
    if not review_files:
        print(f"Error: No review files found in {REVIEWS_DIR}", file=sys.stderr)
        return 1

    print(f"Found {len(review_files)} review files")
    print()

    # Parse all reviews
    all_reviews = []
    for filepath in review_files:
        print(f"Parsing {filepath.name}...")
        review = parse_review_file(filepath)
        all_reviews.append(review)
        print(f" Pass: {len(review['pass'])}, Issues: {len(review['issues'])}")

    print()

    # Generate report
    report = generate_report(all_reviews)

    # Write output; explicit UTF-8 because issue descriptions may contain
    # non-ASCII text that the platform default encoding cannot represent.
    OUTPUT_REPORT.write_text(report, encoding='utf-8')

    print(banner)
    print(f"✓ Report generated: {OUTPUT_REPORT}")
    print(banner)

    return 0
|
|
|
|
|
|
# Script entry point: run main() and hand its return value to the shell
# as the process exit status.
if __name__ == '__main__':
    import sys

    exit_status = main()
    sys.exit(exit_status)
|