cgcardona / muse public
clones.py python
263 lines 9.1 KB
b4e8aaf2 feat(code): Phase 1 — lineage, api-surface, codemap, clones, checkout-s… Gabriel Cardona <cgcardona@gmail.com> 1d ago
1 """muse clones — find duplicate and near-duplicate symbols.
2
3 Detects two tiers of code duplication from committed snapshot data:
4
5 **Exact clones**
6 Symbols with the same ``body_hash`` at different addresses. The body is
7 character-for-character identical (after normalisation) even if the name or
8 surrounding context differs. These are true copy-paste duplicates.
9
10 **Near-clones**
11 Symbols with the same ``signature_id`` but different ``body_hash``. Same
12 function signature, different implementation — strong candidates for
13 consolidation behind a shared abstraction.
14
15 Git has no concept of these. Git stores file diffs; Muse stores symbol
16 identity hashes. Clone detection is a single pass over the snapshot index.
17
18 Usage::
19
20 muse clones
21 muse clones --tier exact
22 muse clones --tier near
23 muse clones --kind function
24 muse clones --commit HEAD~10
25 muse clones --min-cluster 3
26 muse clones --json
27
28 Output::
29
30 Clone analysis — commit a1b2c3d4
31 ──────────────────────────────────────────────────────────────
32
    Exact clones (1 cluster(s)):
      body_hash a1b2c3d4:
        src/billing.py::compute_hash function
        src/legacy.py::_hash function
        src/utils.py::compute_hash function

    Near-clones — same signature (1 cluster(s)):
      signature_id e5f6a7b8:
        src/auth.py::validate function (body 0f1e2d3c)
        src/billing.py::validate function (body 9a8b7c6d)

      2 clone cluster(s), 5 total symbol(s) involved
      Consider consolidating behind shared abstractions.
43
44 Flags:
45
46 ``--tier {exact|near|both}``
47 Which tier to report (default: both).
48
49 ``--kind KIND``
50 Restrict to symbols of this kind.
51
52 ``--min-cluster N``
53 Only show clusters with at least N members (default: 2).
54
55 ``--commit, -c REF``
56 Analyse a historical snapshot instead of HEAD.
57
58 ``--json``
59 Emit results as JSON.
60 """
61 from __future__ import annotations
62
63 import json
64 import logging
65 import pathlib
66 from typing import Literal
67
68 import typer
69
70 from muse.core.errors import ExitCode
71 from muse.core.repo import require_repo
72 from muse.core.store import get_commit_snapshot_manifest, resolve_commit_ref
73 from muse.plugins.code._query import language_of, symbols_for_snapshot
74 from muse.plugins.code.ast_parser import SymbolRecord
75
# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)

# Typer application object; the ``clones`` callback in this module is registered on it.
app = typer.Typer()

# The tier selectors understood by ``find_clones``: exact only, near only, or both.
CloneTier = Literal["exact", "near", "both"]
81
82
83 def _read_repo_id(root: pathlib.Path) -> str:
84 return str(json.loads((root / ".muse" / "repo.json").read_text())["repo_id"])
85
86
87 def _read_branch(root: pathlib.Path) -> str:
88 head_ref = (root / ".muse" / "HEAD").read_text().strip()
89 return head_ref.removeprefix("refs/heads/").strip()
90
91
92 class _CloneCluster:
93 def __init__(
94 self,
95 tier: CloneTier,
96 hash_value: str,
97 members: list[tuple[str, SymbolRecord]],
98 ) -> None:
99 self.tier = tier
100 self.hash_value = hash_value
101 self.members = members # (address, record)
102
103 def to_dict(self) -> dict[str, str | list[dict[str, str]]]:
104 return {
105 "tier": self.tier,
106 "hash": self.hash_value[:8],
107 "count": str(len(self.members)),
108 "members": [
109 {
110 "address": addr,
111 "kind": rec["kind"],
112 "language": language_of(addr.split("::")[0]),
113 "body_hash": rec["body_hash"][:8],
114 "signature_id": rec["signature_id"][:8],
115 "content_id": rec["content_id"][:8],
116 }
117 for addr, rec in self.members
118 ],
119 }
120
121
def find_clones(
    root: pathlib.Path,
    manifest: dict[str, str],
    tier: CloneTier,
    kind_filter: str | None,
    min_cluster: int,
) -> list[_CloneCluster]:
    """Build clone clusters from *manifest*.

    Symbols are obtained from ``symbols_for_snapshot`` and grouped twice:

    * **exact** — by ``body_hash``; every group that is large enough becomes
      an ``"exact"`` cluster.
    * **near** — by ``signature_id``; a group becomes a ``"near"`` cluster
      only when it is large enough *and* its members carry at least two
      distinct ``body_hash`` values.

    Symbols whose ``kind`` is ``"import"`` are never considered.

    Args:
        root: Repository root, forwarded to ``symbols_for_snapshot``.
        manifest: Snapshot manifest, forwarded to ``symbols_for_snapshot``.
        tier: ``"exact"``, ``"near"`` or ``"both"`` — which groupings to run.
        kind_filter: Forwarded to ``symbols_for_snapshot`` as ``kind_filter``.
        min_cluster: Minimum number of members a cluster must have. Values
            below 2 are treated as 2, because a single symbol cannot be a
            clone of anything.

    Returns:
        Clusters ordered by descending member count, then tier, then hash.
    """
    # A "cluster" of one symbol is not a clone. Without this floor a
    # min_cluster of 0 or 1 would report every unique symbol as an exact clone.
    threshold = max(min_cluster, 2)

    sym_map = symbols_for_snapshot(root, manifest, kind_filter=kind_filter)

    # Flatten to a list of (address, record) in deterministic (sorted) order
    # so cluster membership order is stable between runs.
    all_syms: list[tuple[str, SymbolRecord]] = [
        (addr, rec)
        for _fp, tree in sorted(sym_map.items())
        for addr, rec in sorted(tree.items())
        if rec["kind"] != "import"
    ]

    clusters: list[_CloneCluster] = []

    if tier in ("exact", "both"):
        body_index: dict[str, list[tuple[str, SymbolRecord]]] = {}
        for addr, rec in all_syms:
            body_index.setdefault(rec["body_hash"], []).append((addr, rec))
        for body_hash, members in sorted(body_index.items()):
            if len(members) >= threshold:
                clusters.append(_CloneCluster("exact", body_hash, members))

    if tier in ("near", "both"):
        sig_index: dict[str, list[tuple[str, SymbolRecord]]] = {}
        for addr, rec in all_syms:
            sig_index.setdefault(rec["signature_id"], []).append((addr, rec))
        for sig_id, members in sorted(sig_index.items()):
            # Near-clone: same signature, at least two DIFFERENT body hashes.
            # Requiring more than one distinct body is what keeps a group of
            # purely identical bodies (already an exact clone) out of this tier.
            unique_bodies = {r["body_hash"] for _, r in members}
            if len(members) >= threshold and len(unique_bodies) > 1:
                clusters.append(_CloneCluster("near", sig_id, members))

    # Sort: largest clusters first, then by tier, then by hash.
    clusters.sort(key=lambda c: (-len(c.members), c.tier, c.hash_value))
    return clusters
164
165
@app.callback(invoke_without_command=True)
def clones(
    ctx: typer.Context,
    tier: str = typer.Option(
        "both", "--tier", "-t",
        help="Tier to report: exact, near, or both.",
    ),
    kind_filter: str | None = typer.Option(
        None, "--kind", "-k", metavar="KIND",
        help="Restrict to symbols of this kind.",
    ),
    min_cluster: int = typer.Option(
        2, "--min-cluster", "-m", metavar="N",
        help="Only show clusters with at least N members.",
    ),
    ref: str | None = typer.Option(
        None, "--commit", "-c", metavar="REF",
        help="Analyse this commit instead of HEAD.",
    ),
    as_json: bool = typer.Option(False, "--json", help="Emit results as JSON."),
) -> None:
    """Find exact and near-duplicate symbols in the committed snapshot.

    Exact clones share the same ``body_hash`` (identical implementation).
    Near-clones share the same ``signature_id`` but differ in body — same
    contract, different implementation. Both tiers are candidates for
    consolidation behind shared abstractions.

    Uses content-addressed hashes from the snapshot — no AST recomputation
    or file parsing at query time.
    """
    # NOTE: the docstring above is kept verbatim; Typer shows it as help text.
    root = require_repo()
    repo_id = _read_repo_id(root)
    branch = _read_branch(root)

    # Reject unknown tier names before resolving the commit.
    valid_tiers = ("exact", "near", "both")
    if tier not in valid_tiers:
        typer.echo(f"❌ --tier must be 'exact', 'near', or 'both' (got: {tier!r})", err=True)
        raise typer.Exit(code=ExitCode.USER_ERROR)

    commit = resolve_commit_ref(root, repo_id, branch, ref)
    if commit is None:
        typer.echo(f"❌ Commit '{ref or 'HEAD'}' not found.", err=True)
        raise typer.Exit(code=ExitCode.USER_ERROR)
    short_commit = commit.commit_id[:8]

    manifest = get_commit_snapshot_manifest(root, commit.commit_id) or {}

    # ``tier`` was validated above; spell out the literal so the value handed
    # to find_clones has the narrowed CloneTier type.
    narrowed: CloneTier = (
        "exact" if tier == "exact" else "near" if tier == "near" else "both"
    )
    found = find_clones(root, manifest, narrowed, kind_filter, min_cluster)

    # Split the combined result back into its two tiers for reporting.
    exact_group = []
    near_group = []
    for cluster in found:
        if cluster.tier == "exact":
            exact_group.append(cluster)
        elif cluster.tier == "near":
            near_group.append(cluster)

    if as_json:
        payload = {
            "schema_version": 1,
            "commit": short_commit,
            "tier": tier,
            "min_cluster": min_cluster,
            "kind_filter": kind_filter,
            "exact_clone_clusters": len(exact_group),
            "near_clone_clusters": len(near_group),
            "clusters": [cluster.to_dict() for cluster in found],
        }
        typer.echo(json.dumps(payload, indent=2))
        return

    # Human-readable report: header first, then one section per tier.
    typer.echo(f"\nClone analysis — commit {short_commit}")
    if kind_filter:
        typer.echo(f" (kind: {kind_filter})")
    typer.echo("─" * 62)

    if not found:
        typer.echo("\n ✅ No clones detected.")
        return

    # tier is one of the three validated values, so "not near" means
    # exact-or-both and "not exact" means near-or-both.
    if tier != "near" and exact_group:
        typer.echo(f"\nExact clones ({len(exact_group)} cluster(s)):")
        for cluster in exact_group:
            typer.echo(f" body_hash {cluster.hash_value[:8]}:")
            for address, record in cluster.members:
                typer.echo(f" {address} {record['kind']}")

    if tier != "exact" and near_group:
        typer.echo(f"\nNear-clones — same signature ({len(near_group)} cluster(s)):")
        for cluster in near_group:
            typer.echo(f" signature_id {cluster.hash_value[:8]}:")
            for address, record in cluster.members:
                typer.echo(f" {address} {record['kind']} (body {record['body_hash'][:8]})")

    # Footer: how many clusters and how many symbols they cover in total.
    total = 0
    for cluster in found:
        total += len(cluster.members)
    typer.echo(f"\n {len(found)} clone cluster(s), {total} total symbol(s) involved")
    typer.echo(" Consider consolidating behind shared abstractions.")