cgcardona / muse public
clones.py python
262 lines 9.1 KB
e6786943 feat: upgrade to Python 3.14, drop from __future__ import annotations Gabriel Cardona <cgcardona@gmail.com> 1d ago
1 """muse clones — find duplicate and near-duplicate symbols.
2
3 Detects two tiers of code duplication from committed snapshot data:
4
5 **Exact clones**
6 Symbols with the same ``body_hash`` at different addresses. The body is
7 character-for-character identical (after normalisation) even if the name or
8 surrounding context differs. These are true copy-paste duplicates.
9
10 **Near-clones**
11 Symbols with the same ``signature_id`` but different ``body_hash``. Same
12 function signature, different implementation — strong candidates for
13 consolidation behind a shared abstraction.
14
15 Git has no concept of these. Git stores file diffs; Muse stores symbol
16 identity hashes. Clone detection is a single pass over the snapshot index.
17
18 Usage::
19
20 muse clones
21 muse clones --tier exact
22 muse clones --tier near
23 muse clones --kind function
24 muse clones --commit HEAD~10
25 muse clones --min-cluster 3
26 muse clones --json
27
28 Output::
29
30 Clone analysis — commit a1b2c3d4
31 ──────────────────────────────────────────────────────────────
32
33 Exact clones (2 clusters):
34 body_hash a1b2c3d4:
35 src/billing.py::compute_hash function
36 src/utils.py::compute_hash function
37 src/legacy.py::_hash function
38
39 Near-clones — same signature (3 clusters):
40 signature_id e5f6a7b8:
41 src/billing.py::validate function
42 src/auth.py::validate function
43
44 Flags:
45
46 ``--tier {exact|near|both}``
47 Which tier to report (default: both).
48
49 ``--kind KIND``
50 Restrict to symbols of this kind.
51
52 ``--min-cluster N``
53 Only show clusters with at least N members (default: 2).
54
55 ``--commit, -c REF``
56 Analyse a historical snapshot instead of HEAD.
57
58 ``--json``
59 Emit results as JSON.
60 """
61
62 import json
63 import logging
64 import pathlib
65 from typing import Literal
66
67 import typer
68
69 from muse.core.errors import ExitCode
70 from muse.core.repo import require_repo
71 from muse.core.store import get_commit_snapshot_manifest, resolve_commit_ref
72 from muse.plugins.code._query import language_of, symbols_for_snapshot
73 from muse.plugins.code.ast_parser import SymbolRecord
74
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Typer application object; the ``clones`` callback in this module is
# registered on it.
app = typer.Typer()

# The three tier names used throughout this module.
CloneTier = Literal["exact", "near", "both"]
80
81
def _read_repo_id(root: pathlib.Path) -> str:
    """Return the ``repo_id`` recorded in ``<root>/.muse/repo.json``.

    The file is decoded explicitly as UTF-8 (the interchange encoding for
    JSON) so the result does not depend on the platform's locale encoding.

    Args:
        root: Repository root directory that contains the ``.muse`` folder.

    Returns:
        The ``repo_id`` value coerced to ``str``.

    Raises:
        OSError: If ``repo.json`` cannot be read.
        ValueError: If the file is not valid UTF-8 or not valid JSON.
        KeyError: If the JSON document has no ``repo_id`` key.
    """
    repo_json = root / ".muse" / "repo.json"
    document = json.loads(repo_json.read_text(encoding="utf-8"))
    return str(document["repo_id"])
84
85
def _read_branch(root: pathlib.Path) -> str:
    """Return the branch name stored in ``<root>/.muse/HEAD``.

    The file content is trimmed of surrounding whitespace, a leading
    ``refs/heads/`` is dropped when present, and the result is trimmed again.
    """
    prefix = "refs/heads/"
    raw_head = (root / ".muse" / "HEAD").read_text()
    ref_name = raw_head.strip()
    if ref_name.startswith(prefix):
        ref_name = ref_name[len(prefix):]
    return ref_name.strip()
89
90
class _CloneCluster:
    """A group of symbols that share one hash value within a clone tier.

    Attributes are plain instance fields:

    * ``tier`` — the tier this cluster belongs to.
    * ``hash_value`` — the full hash shared by the members.
    * ``members`` — ``(address, record)`` pairs.
    """

    def __init__(
        self,
        tier: CloneTier,
        hash_value: str,
        members: list[tuple[str, SymbolRecord]],
    ) -> None:
        self.tier = tier
        self.hash_value = hash_value
        self.members = members  # (address, record)

    def to_dict(self) -> dict[str, str | list[dict[str, str]]]:
        """Return a JSON-friendly summary of the cluster.

        Every hash is shortened to its first 8 characters and the member
        count is emitted as a string.
        """
        summary: dict[str, str | list[dict[str, str]]] = {
            "tier": self.tier,
            "hash": self.hash_value[:8],
            "count": str(len(self.members)),
        }

        rows: list[dict[str, str]] = []
        for address, record in self.members:
            # The part of the address before the first "::" is what
            # ``language_of`` is given.
            file_part = address.partition("::")[0]
            rows.append(
                {
                    "address": address,
                    "kind": record["kind"],
                    "language": language_of(file_part),
                    "body_hash": record["body_hash"][:8],
                    "signature_id": record["signature_id"][:8],
                    "content_id": record["content_id"][:8],
                }
            )

        summary["members"] = rows
        return summary
119
120
def find_clones(
    root: pathlib.Path,
    manifest: dict[str, str],
    tier: CloneTier,
    kind_filter: str | None,
    min_cluster: int,
) -> list[_CloneCluster]:
    """Build clone clusters from *manifest*.

    Symbols are loaded with ``symbols_for_snapshot``; records whose ``kind``
    is ``"import"`` are ignored. Two groupings are available:

    * exact — symbols grouped by ``body_hash``;
    * near — symbols grouped by ``signature_id`` where the group contains at
      least two different ``body_hash`` values.

    Args:
        root: Repository root, forwarded to ``symbols_for_snapshot``.
        manifest: Snapshot manifest, forwarded to ``symbols_for_snapshot``.
        tier: ``"exact"``, ``"near"`` or ``"both"`` — which groupings to run.
        kind_filter: Optional symbol kind, forwarded to
            ``symbols_for_snapshot``.
        min_cluster: Minimum number of members a cluster must have. Values
            below 2 are treated as 2.

    Returns:
        Clusters ordered by descending size, then tier, then hash value.
    """
    # A lone symbol is not a duplicate of anything. The near tier already
    # requires two distinct bodies (hence at least two members); apply the
    # same floor to the exact tier so a threshold below 2 cannot report every
    # symbol as its own one-member "clone".
    threshold = max(min_cluster, 2)

    sym_map = symbols_for_snapshot(root, manifest, kind_filter=kind_filter)

    # Flatten to a list of (address, record), in sorted order so the result
    # is deterministic.
    all_syms: list[tuple[str, SymbolRecord]] = [
        (addr, rec)
        for _fp, tree in sorted(sym_map.items())
        for addr, rec in sorted(tree.items())
        if rec["kind"] != "import"
    ]

    clusters: list[_CloneCluster] = []

    if tier in ("exact", "both"):
        body_index: dict[str, list[tuple[str, SymbolRecord]]] = {}
        for addr, rec in all_syms:
            body_index.setdefault(rec["body_hash"], []).append((addr, rec))
        for body_hash, members in sorted(body_index.items()):
            if len(members) >= threshold:
                clusters.append(_CloneCluster("exact", body_hash, members))

    if tier in ("near", "both"):
        sig_index: dict[str, list[tuple[str, SymbolRecord]]] = {}
        for addr, rec in all_syms:
            sig_index.setdefault(rec["signature_id"], []).append((addr, rec))
        for sig_id, members in sorted(sig_index.items()):
            # Near-clone: same signature, at least two DIFFERENT body hashes.
            # A group whose bodies are all identical is skipped here; it is
            # an exact-clone group, not a near-clone one.
            unique_bodies = {r["body_hash"] for _, r in members}
            if len(members) >= threshold and len(unique_bodies) > 1:
                clusters.append(_CloneCluster("near", sig_id, members))

    # Sort: largest clusters first, then by tier, then by hash.
    clusters.sort(key=lambda c: (-len(c.members), c.tier, c.hash_value))
    return clusters
163
164
@app.callback(invoke_without_command=True)
def clones(
    ctx: typer.Context,
    tier: str = typer.Option(
        "both", "--tier", "-t",
        help="Tier to report: exact, near, or both.",
    ),
    kind_filter: str | None = typer.Option(
        None, "--kind", "-k", metavar="KIND",
        help="Restrict to symbols of this kind.",
    ),
    min_cluster: int = typer.Option(
        2, "--min-cluster", "-m", metavar="N",
        help="Only show clusters with at least N members.",
    ),
    ref: str | None = typer.Option(
        None, "--commit", "-c", metavar="REF",
        help="Analyse this commit instead of HEAD.",
    ),
    as_json: bool = typer.Option(False, "--json", help="Emit results as JSON."),
) -> None:
    """Find exact and near-duplicate symbols in the committed snapshot.

    Exact clones share the same ``body_hash`` (identical implementation).
    Near-clones share the same ``signature_id`` but differ in body — same
    contract, different implementation. Both tiers are candidates for
    consolidation behind shared abstractions.

    Uses content-addressed hashes from the snapshot — no AST recomputation
    or file parsing at query time.
    """
    # NOTE: the docstring above is kept verbatim on the assumption that Typer
    # shows it as this command's help text; implementation notes live in
    # comments instead.
    #
    # Flow: locate the repo, validate --tier, resolve the commit, load its
    # manifest, build clusters, then print either JSON or a text report.
    # Exits with ExitCode.USER_ERROR on an invalid tier or an unknown commit.
    root = require_repo()
    repo_id = _read_repo_id(root)
    branch = _read_branch(root)

    if tier not in ("exact", "near", "both"):
        typer.echo(f"❌ --tier must be 'exact', 'near', or 'both' (got: {tier!r})", err=True)
        raise typer.Exit(code=ExitCode.USER_ERROR)

    commit = resolve_commit_ref(root, repo_id, branch, ref)
    if commit is None:
        typer.echo(f"❌ Commit '{ref or 'HEAD'}' not found.", err=True)
        raise typer.Exit(code=ExitCode.USER_ERROR)

    # A missing manifest is treated as an empty snapshot.
    manifest = get_commit_snapshot_manifest(root, commit.commit_id) or {}
    # Validated above — safe to narrow. The branches pass literal strings so
    # the argument matches the ``CloneTier`` literal type.
    if tier == "exact":
        cluster_list = find_clones(root, manifest, "exact", kind_filter, min_cluster)
    elif tier == "near":
        cluster_list = find_clones(root, manifest, "near", kind_filter, min_cluster)
    else:
        cluster_list = find_clones(root, manifest, "both", kind_filter, min_cluster)

    exact_clusters = [c for c in cluster_list if c.tier == "exact"]
    near_clusters = [c for c in cluster_list if c.tier == "near"]

    if as_json:
        typer.echo(json.dumps(
            {
                "schema_version": 1,
                "commit": commit.commit_id[:8],
                "tier": tier,
                "min_cluster": min_cluster,
                "kind_filter": kind_filter,
                "exact_clone_clusters": len(exact_clusters),
                "near_clone_clusters": len(near_clusters),
                "clusters": [c.to_dict() for c in cluster_list],
            },
            indent=2,
        ))
        return

    typer.echo(f"\nClone analysis — commit {commit.commit_id[:8]}")
    if kind_filter:
        typer.echo(f" (kind: {kind_filter})")
    typer.echo("─" * 62)

    if not cluster_list:
        typer.echo("\n ✅ No clones detected.")
        return

    if exact_clusters and tier in ("exact", "both"):
        typer.echo(f"\nExact clones ({len(exact_clusters)} cluster(s)):")
        for cl in exact_clusters:
            typer.echo(f" body_hash {cl.hash_value[:8]}:")
            for addr, rec in cl.members:
                typer.echo(f" {addr} {rec['kind']}")

    if near_clusters and tier in ("near", "both"):
        typer.echo(f"\nNear-clones — same signature ({len(near_clusters)} cluster(s)):")
        for cl in near_clusters:
            typer.echo(f" signature_id {cl.hash_value[:8]}:")
            for addr, rec in cl.members:
                typer.echo(f" {addr} {rec['kind']} (body {rec['body_hash'][:8]})")

    # A symbol can sit in an exact cluster and in a near cluster at the same
    # time. Count distinct addresses so "symbol(s) involved" does not count
    # such a symbol twice.
    involved = {addr for c in cluster_list for addr, _rec in c.members}
    total = len(involved)
    typer.echo(f"\n {len(cluster_list)} clone cluster(s), {total} total symbol(s) involved")
    typer.echo(" Consider consolidating behind shared abstractions.")