typing_audit.py
python
| 1 | #!/usr/bin/env python3 |
| 2 | """Typing audit — find and count all banned type patterns in the codebase. |
| 3 | |
| 4 | Checks every rule from .cursorrules and AGENTS.md: |
| 5 | - Any as a type (param, return, collection value) |
| 6 | - object as a type (same severity as Any) |
| 7 | - cast() usage (all usage banned — fix the callee) |
| 8 | - # type: ignore (blanket and specific) |
| 9 | - Bare collections (list, dict, set, tuple without type parameters) |
| 10 | - Optional[X] (use X | None instead) |
| 11 | - Legacy typing imports (List, Dict, Set, Tuple — use lowercase builtins) |
| 12 | |
| 13 | Outputs JSON (machine-readable) + a human summary to stdout. |
| 14 | |
| 15 | Usage: |
| 16 | python tools/typing_audit.py # muse/ + tests/ |
| 17 | python tools/typing_audit.py --json artifacts/typing_audit.json |
| 18 | python tools/typing_audit.py --dirs muse/ tests/ |
| 19 | python tools/typing_audit.py --dirs muse/ --max-any 0 |
| 20 | """ |
| 21 | |
| 22 | import argparse |
| 23 | import ast |
| 24 | import json |
| 25 | import re |
| 26 | import sys |
| 27 | from collections import defaultdict |
| 28 | from pathlib import Path |
| 29 | from typing import Any |
| 30 | |
| 31 | |
# ── Pattern matchers ──────────────────────────────────────────────────────────
# Grouped by category. Every key contributes to the total violation count.
# NOTE(review): these are line-oriented regex heuristics, not a type checker;
# they can match inside string literals/docstrings, and categories overlap by
# design (e.g. "List[Any]" is counted by both list_any and legacy_List).

_PATTERNS: dict[str, re.Pattern[str]] = {
    # ── Any-as-type ────────────────────────────────────────────────────
    # Both the builtin-generic and typing.* spellings are listed explicitly;
    # IGNORECASE additionally tolerates unusual capitalization.
    "dict_str_any": re.compile(
        r"\bdict\[str,\s*Any\]|\bDict\[str,\s*Any\]", re.IGNORECASE
    ),
    "list_any": re.compile(r"\blist\[Any\]|\bList\[Any\]", re.IGNORECASE),
    # Return annotation: "-> Any".
    "return_any": re.compile(r"->\s*Any\b"),
    # Any after a colon: parameter or variable annotation, e.g. "x: Any".
    "param_any": re.compile(r":\s*Any\b"),
    "mapping_any": re.compile(
        r"\bMapping\[str,\s*Any\]", re.IGNORECASE
    ),
    "optional_any": re.compile(r"\bOptional\[Any\]", re.IGNORECASE),
    "sequence_any": re.compile(
        r"\bSequence\[Any\]|\bIterable\[Any\]", re.IGNORECASE
    ),
    # Any anywhere inside a tuple parameterization (greedy within the line).
    "tuple_any": re.compile(r"\btuple\[.*Any.*\]|\bTuple\[.*Any.*\]"),

    # ── object-as-type (same severity as Any) ──────────────────────────
    "param_object": re.compile(r":\s*object\b"),
    "return_object": re.compile(r"->\s*object\b"),
    # object appearing anywhere inside a collection's type parameters.
    "collection_object": re.compile(
        r"\b(?:dict|list|set|tuple|Sequence|Mapping)\[[^]]*\bobject\b"
    ),

    # ── cast() — all usage banned ──────────────────────────────────────
    "cast_usage": re.compile(r"\bcast\("),

    # ── type: ignore — suppresses real errors ──────────────────────────
    # Matches both blanket and code-specific forms; variants are split out
    # separately by _classify_type_ignores.
    "type_ignore": re.compile(r"#\s*type:\s*ignore"),

    # ── Bare collections (no type parameters) ──────────────────────────
    # Only matched in annotation position (after ":" or "->").
    # Negative lookaheads: exclude parameterized [, constructor calls (,
    # and prose patterns (": list of items" in docstrings).
    "bare_list": re.compile(r"(?::\s*|->\s*)list\b(?!\[|\(|\s+[a-z])"),
    "bare_dict": re.compile(r"(?::\s*|->\s*)dict\b(?!\[|\(|\s+[a-z])"),
    "bare_set": re.compile(r"(?::\s*|->\s*)set\b(?!\[|\(|\s+[a-z])"),
    "bare_tuple": re.compile(r"(?::\s*|->\s*)tuple\b(?!\[|\(|\s+[a-z])"),

    # ── Optional[X] — use X | None instead ─────────────────────────────
    # Excludes Optional[Any] which is already caught by optional_any.
    "optional_usage": re.compile(r"\bOptional\[(?!Any\b)"),

    # ── Legacy typing imports (use lowercase builtins) ─────────────────
    "legacy_List": re.compile(r"\bList\["),
    "legacy_Dict": re.compile(r"\bDict\["),
    "legacy_Set": re.compile(r"\bSet\["),
    "legacy_Tuple": re.compile(r"\bTuple\["),
}
| 83 | |
| 84 | |
| 85 | def _count_pattern_in_line(line: str, pattern: re.Pattern[str]) -> int: |
| 86 | return len(pattern.findall(line)) |
| 87 | |
| 88 | |
| 89 | def _imports_any(source: str) -> bool: |
| 90 | """Check if file imports Any from typing.""" |
| 91 | return bool(re.search(r"from\s+typing\s+import\s+.*\bAny\b", source)) |
| 92 | |
| 93 | |
| 94 | def _classify_type_ignores(line: str) -> str: |
| 95 | """Return the ignore variant (blanket vs specific).""" |
| 96 | m = re.search(r"#\s*type:\s*ignore\[([^\]]+)\]", line) |
| 97 | if m: |
| 98 | return f"type_ignore[{m.group(1)}]" |
| 99 | return "type_ignore[blanket]" |
| 100 | |
| 101 | |
| 102 | # ── AST-based detection ────────────────────────────────────────────────────── |
| 103 | |
| 104 | |
| 105 | def _find_untyped_defs(source: str, filepath: str) -> list[dict[str, Any]]: |
| 106 | """Find function defs missing return type or param annotations.""" |
| 107 | results: list[dict[str, Any]] = [] |
| 108 | try: |
| 109 | tree = ast.parse(source) |
| 110 | except SyntaxError: |
| 111 | return results |
| 112 | |
| 113 | for node in ast.walk(tree): |
| 114 | if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)): |
| 115 | if node.returns is None: |
| 116 | results.append({ |
| 117 | "file": filepath, |
| 118 | "line": node.lineno, |
| 119 | "name": node.name, |
| 120 | "issue": "missing_return_type", |
| 121 | }) |
| 122 | for arg in node.args.args + node.args.kwonlyargs: |
| 123 | if arg.annotation is None and arg.arg != "self" and arg.arg != "cls": |
| 124 | results.append({ |
| 125 | "file": filepath, |
| 126 | "line": node.lineno, |
| 127 | "name": f"{node.name}.{arg.arg}", |
| 128 | "issue": "missing_param_type", |
| 129 | }) |
| 130 | return results |
| 131 | |
| 132 | |
| 133 | # ── File scanner ────────────────────────────────────────────────────────────── |
| 134 | |
| 135 | |
def scan_file(filepath: Path) -> dict[str, Any]:
    """Scan a single Python file for typing violations.

    Returns a result dict with per-pattern counts, the line numbers where
    each pattern matched, type-ignore variant counts, and untyped defs —
    or an empty dict if the file cannot be read or decoded.
    """
    try:
        source = filepath.read_text(encoding="utf-8")
    except (OSError, UnicodeDecodeError):
        return {}

    lines = source.splitlines()
    result: dict[str, Any] = {
        "file": str(filepath),
        "imports_any": _imports_any(source),
        "patterns": defaultdict(int),
        "pattern_lines": defaultdict(list),
        "type_ignore_variants": defaultdict(int),
        "untyped_defs": [],
    }

    for lineno, line in enumerate(lines, 1):
        stripped = line.strip()
        if not stripped or stripped.startswith("#"):
            # Comment-only lines still matter for one rule: a standalone
            # "# type: ignore" (at the top of a file it suppresses checking
            # for the whole module). The previous blanket skip dropped
            # these entirely.
            if stripped and _PATTERNS["type_ignore"].search(stripped):
                result["patterns"]["type_ignore"] += 1
                result["pattern_lines"]["type_ignore"].append(lineno)
                result["type_ignore_variants"][_classify_type_ignores(line)] += 1
            continue

        for name, pattern in _PATTERNS.items():
            count = _count_pattern_in_line(line, pattern)
            if count > 0:
                result["patterns"][name] += count
                result["pattern_lines"][name].append(lineno)

                if name == "type_ignore":
                    variant = _classify_type_ignores(line)
                    result["type_ignore_variants"][variant] += 1

    result["untyped_defs"] = _find_untyped_defs(source, str(filepath))
    return result
| 170 | |
| 171 | |
def scan_directory(directory: Path) -> list[dict[str, Any]]:
    """Scan all Python files in a directory tree.

    Skips virtualenvs, bytecode caches, and git internals. Files that
    cannot be read are omitted (scan_file returns an empty dict for them).
    """
    skip_parts = {"venv", "__pycache__", ".git"}
    scanned: list[dict[str, Any]] = []
    for py_file in sorted(directory.rglob("*.py")):
        if skip_parts.intersection(py_file.parts):
            continue
        outcome = scan_file(py_file)
        if outcome:
            scanned.append(outcome)
    return scanned
| 184 | |
| 185 | |
# ── Report generation ─────────────────────────────────────────────────────────

# Display order: group patterns into logical categories for the report.
# Names must match _PATTERNS keys; a pattern omitted here is still counted in
# pattern_totals but will not appear in the categorized human summary.
_CATEGORY_ORDER: list[tuple[str, list[str]]] = [
    ("Any-as-type", [
        "dict_str_any", "list_any", "return_any", "param_any",
        "mapping_any", "optional_any", "sequence_any", "tuple_any",
    ]),
    ("object-as-type", [
        "param_object", "return_object", "collection_object",
    ]),
    ("cast() usage", ["cast_usage"]),
    ("type: ignore", ["type_ignore"]),
    ("Bare collections", [
        "bare_list", "bare_dict", "bare_set", "bare_tuple",
    ]),
    ("Optional (use X | None)", ["optional_usage"]),
    ("Legacy typing imports", [
        "legacy_List", "legacy_Dict", "legacy_Set", "legacy_Tuple",
    ]),
]
| 207 | |
| 208 | |
def generate_report(results: list[dict[str, Any]]) -> dict[str, Any]:
    """Generate aggregate report from scan results.

    Aggregates per-pattern totals, type-ignore variants, and untyped-def
    findings across all scanned files, and ranks files by total violation
    count (top 30 kept as offenders, first 50 untyped defs kept).
    """
    totals: dict[str, int] = defaultdict(int)
    variant_totals: dict[str, int] = defaultdict(int)
    per_file: dict[str, dict[str, int]] = {}
    offenders: list[dict[str, Any]] = []
    untyped: list[dict[str, Any]] = []
    any_importers = 0

    for entry in results:
        path = entry["file"]
        if entry.get("imports_any"):
            any_importers += 1

        counts: dict[str, int] = dict(entry.get("patterns", {}))
        for pattern_name, n in counts.items():
            totals[pattern_name] += n
        file_total = sum(counts.values())

        # Files with zero hits stay out of per_file and the offender list.
        if file_total > 0:
            per_file[path] = counts
            offenders.append(
                {"file": path, "total": file_total, "patterns": counts}
            )

        for variant, n in entry.get("type_ignore_variants", {}).items():
            variant_totals[variant] += n

        untyped.extend(entry.get("untyped_defs", []))

    offenders.sort(key=lambda item: item["total"], reverse=True)

    return {
        "summary": {
            "total_files_scanned": len(results),
            "files_importing_any": any_importers,
            "total_any_patterns": sum(totals.values()),
            "untyped_defs": len(untyped),
        },
        "pattern_totals": dict(totals),
        "type_ignore_variants": dict(variant_totals),
        "top_offenders": offenders[:30],
        "per_file": per_file,
        "untyped_defs": untyped[:50],
    }
| 254 | |
| 255 | |
def print_human_summary(report: dict[str, Any]) -> None:
    """Print a human-readable summary."""
    summary = report["summary"]
    totals = report["pattern_totals"]
    bar = "=" * 70

    # Header with headline counts.
    print("\n" + bar)
    print(" TYPING AUDIT — Violation Report")
    print(bar)
    print(f" Files scanned: {summary['total_files_scanned']}")
    print(f" Files importing Any: {summary['files_importing_any']}")
    print(f" Total violations: {summary['total_any_patterns']}")
    print(f" Untyped defs: {summary['untyped_defs']}")
    print()

    # Per-category breakdown, skipping empty categories and zero patterns.
    for category, members in _CATEGORY_ORDER:
        if sum(totals.get(name, 0) for name in members) == 0:
            continue
        print(f" {category}:")
        for name in members:
            hits = totals.get(name, 0)
            if hits > 0:
                print(f" {name:30s} {hits:5d}")
        print()

    nothing_found = not any(
        totals.get(name, 0) for _, members in _CATEGORY_ORDER for name in members
    )
    if nothing_found:
        print(" Pattern breakdown: (none)")
        print()

    variants = report["type_ignore_variants"]
    if variants:
        print(" # type: ignore variants:")
        for variant, hits in sorted(variants.items(), key=lambda kv: -kv[1]):
            print(f" {variant:40s} {hits:5d}")
        print()
    print(" Top 15 offenders:")
    for offender in report["top_offenders"][:15]:
        print(f" {offender['total']:4d} {offender['file']}")
    print(bar + "\n")
| 293 | |
| 294 | |
| 295 | # ── CLI ─────────────────────────────────────────────────────────────────────── |
| 296 | |
| 297 | |
def main() -> None:
    """CLI entry point: scan directories, report, optionally write JSON
    and enforce the --max-any ratchet (exit 1 on breach)."""
    parser = argparse.ArgumentParser(
        description="Audit typing violations: Any, object, cast, bare collections, "
        "Optional, legacy imports, type: ignore, untyped defs",
    )
    parser.add_argument(
        "--dirs",
        nargs="+",
        default=["muse/", "tests/"],
        help="Directories to scan",
    )
    parser.add_argument("--json", type=str, help="Write JSON report to file")
    parser.add_argument(
        "--max-any",
        type=int,
        default=None,
        help="Fail (exit 1) if total violations exceed this threshold",
    )
    args = parser.parse_args()

    all_results: list[dict[str, Any]] = []
    for directory in args.dirs:
        root = Path(directory)
        if not root.exists():
            # Missing directories are skipped with a warning, not fatal.
            print(f"WARNING: {directory} does not exist, skipping", file=sys.stderr)
            continue
        all_results.extend(scan_directory(root))

    report = generate_report(all_results)
    print_human_summary(report)

    if args.json:
        out_path = Path(args.json)
        out_path.parent.mkdir(parents=True, exist_ok=True)
        out_path.write_text(
            json.dumps(report, indent=2, default=str),
            encoding="utf-8",
        )
        print(f" JSON report written to {args.json}")

    if args.max_any is not None:
        total = report["summary"]["total_any_patterns"]
        if total > args.max_any:
            print(
                f"\n❌ RATCHET FAILED: {total} violations exceed "
                f"threshold of {args.max_any}",
                file=sys.stderr,
            )
            sys.exit(1)
        else:
            print(
                f"\n✅ RATCHET OK: {total} violations within "
                f"threshold of {args.max_any}",
            )


if __name__ == "__main__":
    main()