muse/plugins/code/_refactor_classify.py · cgcardona/muse

_refactor_classify.py python

230 lines 8.6 KB

bda49bdb feat: redesign .museignore as TOML with domain-scoped sections (#100) Gabriel Cardona <cgcardona@gmail.com> 1d ago

1	"""Composite refactor classification for the code domain.
2
3	Provides two tiers of classification:
4
5	Exact classification (deterministic, hash-based):
6
7	``rename`` same body_hash, different name
8	``move`` same content_id, different file path
9	``rename+move`` same body_hash, different name AND different file
10	``signature_only`` same body_hash, different signature_id
11	``impl_only`` same signature_id, different body_hash
12	``metadata_only`` same body_hash + signature_id, different metadata_id
13	``full_rewrite`` both signature and body changed
14
15	Inferred refactor (best-effort, heuristic):
16
17	``extract`` a new symbol appeared whose body is a strict subset of an
18	existing deleted/modified symbol's body
19	``inline`` a symbol disappeared and its known callers expanded
20	``split`` one symbol became two — each shares a portion of the body
21	``merge`` two symbols became one — body is a union of the two old bodies
22
23	Each inferred classification carries a ``confidence`` float (0.0–1.0) and an
24	``evidence`` list of strings explaining the reasoning.
25
26	These are used by ``muse detect-refactor`` to produce the enhanced JSON output.
27	"""
28
29	from __future__ import annotations
30
31	import logging
32	from typing import Literal
33
34	from muse.plugins.code.ast_parser import SymbolRecord
35
logger = logging.getLogger(__name__)

# Deterministic, hash-derived verdicts returned by ``classify_exact``.
ExactClassification = Literal[
    # Identity changes: the body is intact, the name and/or file differ.
    "rename", "move", "rename+move",
    # Partial changes: exactly one facet of the symbol differs.
    "signature_only", "impl_only", "metadata_only",
    # Everything changed / nothing changed.
    "full_rewrite", "unchanged",
]

# Best-effort, heuristic verdicts; "none" means no inferred refactor applies.
InferredRefactor = Literal[
    "extract",
    "inline",
    "split",
    "merge",
    "none",
]
50
51
class RefactorClassification:
    """One classified refactoring event linking an old symbol to a new one.

    Holds both symbol addresses and records, the exact (hash-based) verdict,
    an optional inferred (heuristic) verdict, a confidence score, and a list
    of human-readable evidence strings.
    """

    def __init__(
        self,
        old_address: str,
        new_address: str,
        old_rec: SymbolRecord,
        new_rec: SymbolRecord,
        exact: ExactClassification,
        inferred: InferredRefactor = "none",
        confidence: float = 1.0,
        evidence: list[str] | None = None,
    ) -> None:
        # Where the symbol lived before and after the change.
        self.old_address, self.new_address = old_address, new_address
        # The symbol records on each side of the change.
        self.old_rec, self.new_rec = old_rec, new_rec
        # Verdicts and how much they can be trusted.
        self.exact, self.inferred = exact, inferred
        self.confidence = confidence
        # A missing (or empty) evidence argument becomes a fresh empty list.
        self.evidence: list[str] = evidence if evidence else []

    def to_dict(self) -> dict[str, str | float | list[str]]:
        """Return a flat dict view of this classification.

        Hash-like identifiers are truncated to their first eight characters
        and the confidence is rounded to three decimal places.
        """
        before, after = self.old_rec, self.new_rec
        width = 8  # length of the abbreviated hash prefix
        summary: dict[str, str | float | list[str]] = {
            "old_address": self.old_address,
            "new_address": self.new_address,
            "old_kind": before["kind"],
            "new_kind": after["kind"],
            "exact_classification": self.exact,
            "inferred_refactor": self.inferred,
            "confidence": round(self.confidence, 3),
            "evidence": self.evidence,
            "old_content_id": before["content_id"][:width],
            "new_content_id": after["content_id"][:width],
            "old_body_hash": before["body_hash"][:width],
            "new_body_hash": after["body_hash"][:width],
            "old_signature_id": before["signature_id"][:width],
            "new_signature_id": after["signature_id"][:width],
        }
        return summary
92
93
def classify_exact(
    old_addr: str,
    new_addr: str,
    old: SymbolRecord,
    new: SymbolRecord,
) -> ExactClassification:
    """Return the deterministic hash-based refactor classification.

    Args:
        old_addr: Address of the symbol before the change; the text before
            the first ``::`` is treated as the file path.
        new_addr: Address of the symbol after the change, same format.
        old: Record of the symbol before the change.
        new: Record of the symbol after the change.

    Returns:
        One of the ``ExactClassification`` literals. Identical ``content_id``
        yields ``"unchanged"`` within one file and ``"move"`` across files.
        ``"full_rewrite"`` is the fallback when no narrower category applies.
    """
    same_file = old_addr.split("::")[0] == new_addr.split("::")[0]
    same_name = old["name"] == new["name"]
    same_body = old["body_hash"] == new["body_hash"]
    same_sig = old["signature_id"] == new["signature_id"]
    # A record without ``metadata_id`` compares as if it were the empty string.
    same_meta = old.get("metadata_id", "") == new.get("metadata_id", "")

    if old["content_id"] == new["content_id"]:
        # Identical content at a different path is a pure move, not a no-op;
        # returning "unchanged" here would hide every cross-file move.
        return "unchanged" if same_file else "move"

    # Cross-file: an intact body means the symbol travelled, possibly renamed.
    if not same_file and same_body:
        return "move" if same_name else "rename+move"

    # Same file, or a cross-file change that also altered the body.
    if same_body:
        if not same_sig:
            return "signature_only"
        if not same_meta:
            return "metadata_only"
        if not same_name:
            return "rename"
    elif same_sig:
        return "impl_only"

    return "full_rewrite"
130
131
132	def _body_tokens(body_hash: str, body_src: str) -> frozenset[str]:
133	"""Very rough body tokenisation for subset detection (split words on spaces)."""
134	return frozenset(body_src.split())
135
136
def _first_unmatched(candidates: list[str], taken: set[str]) -> str | None:
    """Return the first address in *candidates* that is not in *taken*, else ``None``."""
    return next((addr for addr in candidates if addr not in taken), None)


def classify_composite(
    removed: dict[str, SymbolRecord],
    added: dict[str, SymbolRecord],
) -> list[RefactorClassification]:
    """Classify composite refactors across a batch of added/removed symbols.

    Runs in two stages:

    1. Exact pairing: each removed symbol (in sorted address order) is paired
       with an added symbol that has the same ``content_id`` or, failing that,
       the same ``body_hash``. Every added symbol is paired at most once.
    2. Inferred ``extract``: each still-unpaired added symbol is attributed to
       an unpaired removed symbol whose ``qualified_name`` contains the new
       symbol's name (case-insensitive). This is a name-only heuristic and is
       reported with confidence 0.5.

    Args:
        removed: Symbols deleted in this diff (address → record).
        added: Symbols inserted in this diff (address → record).

    Returns:
        List of :class:`RefactorClassification` objects. Only pairs/groups
        that pass a confidence threshold are included.
    """
    results: list[RefactorClassification] = []
    matched_removed: set[str] = set()
    matched_added: set[str] = set()

    # ── Exact matches first (rename / move / rename+move) ──────────────────
    # Keep every candidate per hash, in sorted address order. A one-address-
    # per-hash index would silently drop duplicates and make the surviving
    # candidate depend on dict insertion order.
    added_by_body: dict[str, list[str]] = {}
    added_by_content: dict[str, list[str]] = {}
    for addr, rec in sorted(added.items()):
        added_by_body.setdefault(rec["body_hash"], []).append(addr)
        added_by_content.setdefault(rec["content_id"], []).append(addr)

    for rem_addr, rem_rec in sorted(removed.items()):
        # Exact content match → moved/copied. Skip added symbols that were
        # already paired so one added symbol never serves two removed ones.
        new_addr = _first_unmatched(
            added_by_content.get(rem_rec["content_id"], []), matched_added
        )
        if new_addr is not None:
            evidence = f"content_id matches {rem_rec['content_id'][:8]}"
        else:
            # Same body, different name → rename (possibly with cross-file move).
            new_addr = _first_unmatched(
                added_by_body.get(rem_rec["body_hash"], []), matched_added
            )
            if new_addr is None:
                continue
            evidence = f"body_hash matches {rem_rec['body_hash'][:8]}"

        new_rec = added[new_addr]
        results.append(RefactorClassification(
            old_address=rem_addr,
            new_address=new_addr,
            old_rec=rem_rec,
            new_rec=new_rec,
            exact=classify_exact(rem_addr, new_addr, rem_rec, new_rec),
            evidence=[evidence],
        ))
        matched_removed.add(rem_addr)
        matched_added.add(new_addr)

    # ── Inferred: extract — new symbol, no prior hash match ────────────────
    # Heuristic: the new symbol's name occurs inside the qualified name of a
    # removed symbol that found no exact partner.
    name_overlap_confidence = 0.5  # Low confidence — name heuristic only.
    min_confidence = 0.5
    unmatched_added = {a: r for a, r in added.items() if a not in matched_added}
    unmatched_removed = {a: r for a, r in removed.items() if a not in matched_removed}

    for add_addr, add_rec in sorted(unmatched_added.items()):
        needle = add_rec["name"].lower()
        if not needle:
            # An empty name is a substring of everything and would match
            # every removed symbol spuriously.
            continue

        best_confidence = 0.0
        best_src_addr: str | None = None
        # Look for removed/source symbols that might have been extracted from.
        for rem_addr, rem_rec in sorted(unmatched_removed.items()):
            if needle not in rem_rec["qualified_name"].lower():
                continue
            # Strict ">" keeps the first (lowest-address) source on ties.
            if name_overlap_confidence > best_confidence:
                best_confidence = name_overlap_confidence
                best_src_addr = rem_addr

        if best_src_addr is None or best_confidence < min_confidence:
            continue

        src_rec = unmatched_removed[best_src_addr]
        results.append(RefactorClassification(
            old_address=best_src_addr,
            new_address=add_addr,
            old_rec=src_rec,
            new_rec=add_rec,
            exact="full_rewrite",
            inferred="extract",
            confidence=best_confidence,
            evidence=[
                f"new symbol '{add_rec['name']}' found inside "
                f"old qualified_name '{src_rec['qualified_name']}'"
            ],
        ))

    return results

Content Address

Object ID (SHA-256)

c072bab4b567a5075ca2217adff1a5852ecc10e7b30a5f8c7a12fa9def8e29ea

This file is immutable and content-addressed. The same SHA always refers to the same bytes, across every clone and every time.

File Info

Path muse/plugins/code/_refactor_classify.py

Lines 230

Size 8.6 KB

Language python

Ref bda49bdb

Snapshot ab8c2d1d5ac4…

Last Modified

bda49bdb

feat: redesign .museignore as TOML with domain-scoped sections (#100)

Gabriel Cardona <cgcardona@gmail.com> 1d ago

View commit →

Links

Browse tree at bda49bdb All commits View raw