cgcardona / muse public
_refactor_classify.py python
230 lines 8.6 KB
bda49bdb feat: redesign .museignore as TOML with domain-scoped sections (#100) Gabriel Cardona <cgcardona@gmail.com> 1d ago
1 """Composite refactor classification for the code domain.
2
3 Provides two tiers of classification:
4
5 **Exact classification** (deterministic, hash-based):
6
7 ``rename`` same body_hash, different name
8 ``move`` same content_id, different file path
9 ``rename+move`` same body_hash, different name AND different file
10 ``signature_only`` same body_hash, different signature_id
11 ``impl_only`` same signature_id, different body_hash
12 ``metadata_only`` same body_hash + signature_id, different metadata_id
13 ``full_rewrite`` both signature and body changed
14
15 **Inferred refactor** (best-effort, heuristic):
16
17 ``extract`` a new symbol appeared whose body is a strict subset of an
18 existing deleted/modified symbol's body
19 ``inline`` a symbol disappeared and its known callers expanded
20 ``split`` one symbol became two — each shares a portion of the body
21 ``merge`` two symbols became one — body is a union of the two old bodies
22
23 Each inferred classification carries a ``confidence`` float (0.0–1.0) and an
24 ``evidence`` list of strings explaining the reasoning.
25
26 These are used by ``muse detect-refactor`` to produce the enhanced JSON output.
27 """
28
29 from __future__ import annotations
30
31 import logging
32 from typing import Literal
33
34 from muse.plugins.code.ast_parser import SymbolRecord
35
36 logger = logging.getLogger(__name__)
37
# Labels produced by the deterministic, hash-based classifier
# (see ``classify_exact``).
ExactClassification = Literal[
    "rename", "move", "rename+move",
    "signature_only", "impl_only", "metadata_only",
    "full_rewrite", "unchanged",
]

# Labels for heuristic, best-effort composite refactors. "none" is the
# default value used by ``RefactorClassification``.
InferredRefactor = Literal[
    "extract",
    "inline",
    "split",
    "merge",
    "none",
]
50
51
class RefactorClassification:
    """A single classified refactoring event between an old and a new symbol.

    Holds both symbol addresses and their records, the exact (hash-based)
    classification, and an optional inferred refactor together with its
    confidence and the evidence strings that justify it.
    """

    def __init__(
        self,
        old_address: str,
        new_address: str,
        old_rec: SymbolRecord,
        new_rec: SymbolRecord,
        exact: ExactClassification,
        inferred: InferredRefactor = "none",
        confidence: float = 1.0,
        evidence: list[str] | None = None,
    ) -> None:
        self.old_address, self.new_address = old_address, new_address
        self.old_rec, self.new_rec = old_rec, new_rec
        self.exact = exact
        self.inferred = inferred
        self.confidence = confidence
        # A missing or empty ``evidence`` argument becomes a fresh empty list.
        self.evidence: list[str] = evidence if evidence else []

    def to_dict(self) -> dict[str, str | float | list[str]]:
        """Return a flat summary of this event.

        Confidence is rounded to three decimals and every hash is shortened
        to its first eight characters.
        """
        before, after = self.old_rec, self.new_rec
        short = slice(8)  # leading characters kept from each hash
        summary: dict[str, str | float | list[str]] = {
            "old_address": self.old_address,
            "new_address": self.new_address,
            "old_kind": before["kind"],
            "new_kind": after["kind"],
            "exact_classification": self.exact,
            "inferred_refactor": self.inferred,
            "confidence": round(self.confidence, 3),
            "evidence": self.evidence,
            "old_content_id": before["content_id"][short],
            "new_content_id": after["content_id"][short],
            "old_body_hash": before["body_hash"][short],
            "new_body_hash": after["body_hash"][short],
            "old_signature_id": before["signature_id"][short],
            "new_signature_id": after["signature_id"][short],
        }
        return summary
92
93
def classify_exact(
    old_addr: str,
    new_addr: str,
    old: SymbolRecord,
    new: SymbolRecord,
) -> ExactClassification:
    """Return the deterministic hash-based refactor classification.

    Decision order:

    1. Identical ``content_id``: ``unchanged`` when both addresses are in the
       same file; otherwise the symbol was relocated verbatim and the result
       is ``move`` (``rename+move`` if the names differ).
    2. Different files with identical ``body_hash``: ``move`` when the name
       is the same, ``rename+move`` otherwise.
    3. Remaining cases compare hashes only: ``signature_only``,
       ``metadata_only``, ``impl_only``, ``rename``, then ``full_rewrite``
       as the fallback.

    Args:
        old_addr: Address of the old symbol; the file path is the part
            before the first ``"::"``.
        new_addr: Address of the new symbol, same format.
        old: Record of the old symbol.
        new: Record of the new symbol.

    Returns:
        One of the ``ExactClassification`` labels.
    """
    same_file = old_addr.split("::")[0] == new_addr.split("::")[0]
    same_name = old["name"] == new["name"]
    same_body = old["body_hash"] == new["body_hash"]
    same_sig = old["signature_id"] == new["signature_id"]
    # ``metadata_id`` may be absent from a record; two absent ids compare equal.
    same_meta = old.get("metadata_id", "") == new.get("metadata_id", "")

    if old["content_id"] == new["content_id"]:
        if same_file:
            return "unchanged"
        # Identical content under a different file path is a move, not a
        # no-op; reporting "unchanged" here would hide the relocation.
        return "move" if same_name else "rename+move"

    # Cross-file move detection.
    if not same_file and same_body:
        return "move" if same_name else "rename+move"

    # Hash-only comparisons (also reached by cross-file pairs whose bodies
    # differ).
    if same_body and not same_sig:
        return "signature_only"
    if same_body and same_sig and not same_meta:
        return "metadata_only"
    if same_sig and not same_body:
        return "impl_only"
    if same_body and not same_name:
        return "rename"

    return "full_rewrite"
130
131
def _body_tokens(body_hash: str, body_src: str) -> frozenset[str]:
    """Return the distinct whitespace-separated tokens of *body_src*.

    A deliberately rough tokenisation intended for subset detection.
    ``body_hash`` is accepted but does not influence the result.
    """
    words = body_src.split()
    return frozenset(words)
135
136
def classify_composite(
    removed: dict[str, SymbolRecord],
    added: dict[str, SymbolRecord],
) -> list[RefactorClassification]:
    """Classify composite refactors across a batch of added/removed symbols.

    Runs two passes:

    1. **Exact pairing.** Each removed symbol (in sorted address order) is
       paired with an added symbol that has the same ``content_id`` or,
       failing that, the same ``body_hash``. Every added symbol is paired at
       most once; when several added symbols share a hash, the first
       unpaired one in sorted address order is used.
    2. **Inferred extract.** Each still-unpaired added symbol is attributed
       to the first unpaired removed symbol (sorted by address) whose
       ``qualified_name`` contains the added symbol's name,
       case-insensitively. These results carry confidence 0.5 because the
       match is based on names only. One removed symbol may be the source of
       several extracts.

    Args:
        removed: Symbols deleted in this diff (address → record).
        added: Symbols inserted in this diff (address → record).

    Returns:
        List of :class:`RefactorClassification` objects. Only pairs/groups
        that pass a confidence threshold are included.
    """
    results: list[RefactorClassification] = []
    matched_removed: set[str] = set()
    matched_added: set[str] = set()

    # ── Exact matches first (rename / move / rename+move) ──────────────────
    # Each hash maps to a list of addresses: several added symbols can share
    # a hash, and a one-address-per-hash dict would silently drop all but one.
    added_by_content: dict[str, list[str]] = {}
    added_by_body: dict[str, list[str]] = {}
    for addr, rec in sorted(added.items()):
        added_by_content.setdefault(rec["content_id"], []).append(addr)
        added_by_body.setdefault(rec["body_hash"], []).append(addr)

    def _first_unmatched(candidates: list[str]) -> str | None:
        """Return the first candidate address not yet paired, or ``None``."""
        return next((a for a in candidates if a not in matched_added), None)

    for rem_addr, rem_rec in sorted(removed.items()):
        # Prefer an exact content match (moved/copied symbol).
        new_addr = _first_unmatched(
            added_by_content.get(rem_rec["content_id"], [])
        )
        if new_addr is not None:
            reason = f"content_id matches {rem_rec['content_id'][:8]}"
        else:
            # Same body, different name → rename (possibly cross-file).
            new_addr = _first_unmatched(
                added_by_body.get(rem_rec["body_hash"], [])
            )
            if new_addr is None:
                continue
            reason = f"body_hash matches {rem_rec['body_hash'][:8]}"

        new_rec = added[new_addr]
        results.append(RefactorClassification(
            old_address=rem_addr,
            new_address=new_addr,
            old_rec=rem_rec,
            new_rec=new_rec,
            exact=classify_exact(rem_addr, new_addr, rem_rec, new_rec),
            evidence=[reason],
        ))
        matched_removed.add(rem_addr)
        matched_added.add(new_addr)

    # ── Inferred: extract — new symbol, no prior body_hash match ───────────
    # Heuristic: the new symbol's name occurs inside the qualified name of a
    # removed symbol. This is a name-only signal, hence the low confidence.
    name_match_confidence = 0.5
    unmatched_added = {a: r for a, r in added.items() if a not in matched_added}
    unmatched_removed = {
        a: r for a, r in removed.items() if a not in matched_removed
    }
    # Sorted once up front; the candidate order is the same for every added
    # symbol.
    source_candidates = sorted(unmatched_removed.items())

    for add_addr, add_rec in sorted(unmatched_added.items()):
        needle = add_rec["name"].lower()
        # All name matches share the same confidence, so the first one in
        # sorted order is the best one.
        best_src_addr = next(
            (
                rem_addr
                for rem_addr, rem_rec in source_candidates
                if needle in rem_rec["qualified_name"].lower()
            ),
            None,
        )
        if not best_src_addr:
            continue
        src_rec = unmatched_removed[best_src_addr]
        results.append(RefactorClassification(
            old_address=best_src_addr,
            new_address=add_addr,
            old_rec=src_rec,
            new_rec=add_rec,
            exact="full_rewrite",
            inferred="extract",
            confidence=name_match_confidence,
            evidence=[
                f"new symbol '{add_rec['name']}' found inside "
                f"old qualified_name '{src_rec['qualified_name']}'"
            ],
        ))

    return results