cgcardona / muse public
plugin.py python
676 lines 23.9 KB
062ae392 feat: code-domain semantic commands + code tour de force demo (#54) Gabriel Cardona <cgcardona@gmail.com> 1d ago
1 """Code domain plugin — semantic version control for source code.
2
3 This plugin implements :class:`~muse.domain.MuseDomainPlugin` and
4 :class:`~muse.domain.StructuredMergePlugin` for software repositories.
5
6 Philosophy
7 ----------
8 Git models files as sequences of lines. The code plugin models them as
9 **collections of named symbols** — functions, classes, methods, variables.
10 Two commits that only reformat a Python file (no semantic change) produce
11 identical symbol ``content_id`` values and therefore *no* structured delta.
12 Two commits that rename a function produce a ``ReplaceOp`` annotated
13 ``"renamed to bar"`` rather than a red/green line diff.
14
15 Live State
16 ----------
17 ``LiveState`` is either a ``pathlib.Path`` pointing to ``muse-work/`` or a
18 ``SnapshotManifest`` dict. The path form is used by the CLI; the dict form
19 is used by in-memory merge and diff operations.
20
21 Snapshot Format
22 ---------------
23 A code snapshot is a ``SnapshotManifest``:
24
25 .. code-block:: json
26
27 {
28 "files": {
29 "src/utils.py": "<sha256-of-raw-bytes>",
30 "README.md": "<sha256-of-raw-bytes>"
31 },
32 "domain": "code"
33 }
34
35 The ``files`` values are **raw-bytes SHA-256 hashes** (not AST hashes).
36 This ensures the object store can correctly restore files verbatim on
37 ``muse checkout``. Semantic identity (AST-based hashing) is used only
38 inside ``diff()`` when constructing the structured delta.
39
40 Delta Format
41 ------------
42 ``diff()`` returns a ``StructuredDelta``. For Python files (and other
43 languages with adapters) it produces ``PatchOp`` entries whose ``child_ops``
44 carry symbol-level operations:
45
46 - ``InsertOp`` — a symbol was added (address ``"src/utils.py::my_func"``).
47 - ``DeleteOp`` — a symbol was removed.
48 - ``ReplaceOp`` — a symbol changed. The ``new_summary`` field describes the
49 change: ``"renamed to bar"``, ``"implementation changed"``, etc.
50
51 Non-Python files produce coarse ``InsertOp`` / ``DeleteOp`` / ``ReplaceOp``
52 at the file level.
53
54 Merge Semantics
55 ---------------
56 The plugin implements :class:`~muse.domain.StructuredMergePlugin` so that
57 OT-aware merges detect conflicts at *symbol* granularity:
58
59 - Agent A modifies ``foo()`` and Agent B modifies ``bar()`` in the same
60 file → **auto-merge** (ops commute).
61 - Both agents modify ``foo()`` → **symbol-level conflict** at address
62 ``"src/utils.py::foo"`` rather than a coarse file conflict.
63
64 Schema
65 ------
66 The code domain schema declares five dimensions:
67
68 ``structure``
69 The module/file tree — ``TreeSchema`` with GumTree diff.
70
71 ``symbols``
72 The AST symbol tree — ``TreeSchema`` with GumTree diff.
73
74 ``imports``
75 The import set — ``SetSchema`` with ``by_content`` identity.
76
77 ``variables``
78 Top-level variable assignments — ``SetSchema``.
79
80 ``metadata``
81 Configuration and non-code files — ``SetSchema``.
82 """
83 from __future__ import annotations
84
85 import hashlib
86 import logging
87 import pathlib
88
89 from muse.core.ignore import is_ignored, load_patterns
90 from muse.core.object_store import read_object
91 from muse.core.op_transform import merge_op_lists, ops_commute
92 from muse.core.schema import (
93 DimensionSpec,
94 DomainSchema,
95 SetSchema,
96 TreeSchema,
97 )
98 from muse.domain import (
99 DeleteOp,
100 DomainOp,
101 DriftReport,
102 InsertOp,
103 LiveState,
104 MergeResult,
105 PatchOp,
106 ReplaceOp,
107 SnapshotManifest,
108 StateDelta,
109 StateSnapshot,
110 StructuredDelta,
111 )
112 from muse.plugins.code.ast_parser import (
113 SymbolTree,
114 adapter_for_path,
115 parse_symbols,
116 )
117 from muse.plugins.code.symbol_diff import (
118 build_diff_ops,
119 delta_summary,
120 )
121
logger = logging.getLogger(__name__)

_DOMAIN_NAME = "code"

# Path components that are never versioned, regardless of .museignore.
# ``snapshot`` compares every component of a file's path — directory names
# and the file name itself — against this set, which is why a plain file
# name such as ``.DS_Store`` can live here next to real directories.
_ALWAYS_IGNORE_DIRS: frozenset[str] = frozenset(
    (
        # Version-control metadata.
        ".git",
        ".muse",
        # Interpreter / tool caches.
        "__pycache__",
        ".mypy_cache",
        ".pytest_cache",
        ".ruff_cache",
        ".tox",
        # Dependency trees and virtual environments.
        "node_modules",
        ".venv",
        "venv",
        # Build and packaging output.
        "dist",
        "build",
        ".eggs",
        # macOS Finder metadata file.
        ".DS_Store",
    )
)
144
145
class CodePlugin:
    """Muse domain plugin for software source code repositories.

    Implements the six core protocol methods (:meth:`snapshot`,
    :meth:`diff`, :meth:`merge`, :meth:`drift`, :meth:`apply`,
    :meth:`schema`) plus the optional
    :class:`~muse.domain.StructuredMergePlugin` OT extension
    (:meth:`merge_ops`).  The plugin does not implement
    :class:`~muse.domain.CRDTPlugin` — source code is human-authored and
    benefits from explicit conflict resolution rather than automatic
    convergence.

    The plugin is stateless.  The module-level singleton :data:`plugin` is
    the standard entry point.
    """

    # ------------------------------------------------------------------
    # 1. snapshot
    # ------------------------------------------------------------------

    def snapshot(self, live_state: LiveState) -> StateSnapshot:
        """Capture the current ``muse-work/`` directory as a snapshot dict.

        Walks all regular files under *live_state*, hashing each one with
        SHA-256 (raw bytes).  Honours ``.museignore`` and always ignores
        known tool-generated directories (``__pycache__``, ``.git``, etc.).

        Args:
            live_state: A ``pathlib.Path`` pointing to ``muse-work/``, or an
                existing ``SnapshotManifest`` dict (returned as-is).

        Returns:
            A ``SnapshotManifest`` mapping workspace-relative POSIX paths to
            their SHA-256 raw-bytes digests.
        """
        if not isinstance(live_state, pathlib.Path):
            return live_state

        workdir = live_state
        # .museignore lives in the repo root (parent of muse-work/).
        patterns = load_patterns(workdir.parent)

        files: dict[str, str] = {}
        for entry in sorted(workdir.rglob("*")):
            if not entry.is_file():
                continue
            rel_path = entry.relative_to(workdir)
            # Match the implicit ignores against workspace-relative
            # components only.  Testing the full path would also match
            # ancestors of the workdir, so a checkout living under e.g.
            # ``~/build/`` or ``/srv/dist/`` would snapshot as empty.
            if not _ALWAYS_IGNORE_DIRS.isdisjoint(rel_path.parts):
                continue
            rel = rel_path.as_posix()
            if is_ignored(rel, patterns):
                continue
            files[rel] = _hash_file(entry)

        return SnapshotManifest(files=files, domain=_DOMAIN_NAME)

    # ------------------------------------------------------------------
    # 2. diff
    # ------------------------------------------------------------------

    def diff(
        self,
        base: StateSnapshot,
        target: StateSnapshot,
        *,
        repo_root: pathlib.Path | None = None,
    ) -> StateDelta:
        """Compute the structured delta between two snapshots.

        Without ``repo_root``
            Produces coarse file-level ops (``InsertOp`` / ``DeleteOp`` /
            ``ReplaceOp``).  Used by ``muse checkout`` which only needs file
            paths.

        With ``repo_root``
            Reads source bytes from the object store, parses AST for
            supported languages (Python), and produces ``PatchOp`` entries
            with symbol-level ``child_ops``.  Used by ``muse commit`` (to
            store the structured delta) and ``muse show`` / ``muse diff``.

        Args:
            base: Base snapshot (older state).
            target: Target snapshot (newer state).
            repo_root: Repository root for object-store access and symbol
                extraction.  ``None`` → file-level ops only.

        Returns:
            A ``StructuredDelta`` with ``domain="code"``.
        """
        base_files = base["files"]
        target_files = target["files"]

        if repo_root is None:
            ops = _file_level_ops(base_files, target_files)
        else:
            ops = _semantic_ops(base_files, target_files, repo_root)

        return StructuredDelta(
            domain=_DOMAIN_NAME,
            ops=ops,
            summary=delta_summary(ops),
        )

    # ------------------------------------------------------------------
    # 3. merge
    # ------------------------------------------------------------------

    def merge(
        self,
        base: StateSnapshot,
        left: StateSnapshot,
        right: StateSnapshot,
        *,
        repo_root: pathlib.Path | None = None,
    ) -> MergeResult:
        """Three-way merge at file granularity.

        Standard three-way logic:

        - Both sides agree → consensus wins (including both deleted).
        - Only one side changed → take that side.
        - Both sides changed differently → conflict.

        On conflict the path is recorded in ``conflicts`` and the merged
        manifest keeps the left digest when the file exists there,
        otherwise the right one.

        This is the fallback used by ``muse cherry-pick`` and contexts where
        the OT merge path is not available.  :meth:`merge_ops` provides
        symbol-level conflict detection when both sides have structured
        deltas.

        Args:
            base: Common ancestor snapshot.
            left: Our branch snapshot.
            right: Their branch snapshot.
            repo_root: Repository root (unused at file level; kept for
                protocol conformance and future attribute lookups).

        Returns:
            A ``MergeResult`` with the reconciled snapshot and any file-level
            conflicts.
        """
        base_files = base["files"]
        left_files = left["files"]
        right_files = right["files"]

        merged: dict[str, str] = dict(base_files)
        conflicts: list[str] = []

        for path in sorted(set(base_files) | set(left_files) | set(right_files)):
            # ``None`` means "file absent on that side".
            ancestor = base_files.get(path)
            ours = left_files.get(path)
            theirs = right_files.get(path)

            if ours == theirs:
                resolved = ours
            elif ancestor == ours:
                resolved = theirs
            elif ancestor == theirs:
                resolved = ours
            else:
                conflicts.append(path)
                merged[path] = ours or theirs or ancestor or ""
                continue

            if resolved is None:
                merged.pop(path, None)
            else:
                merged[path] = resolved

        return MergeResult(
            merged=SnapshotManifest(files=merged, domain=_DOMAIN_NAME),
            conflicts=conflicts,
        )

    # ------------------------------------------------------------------
    # 4. drift
    # ------------------------------------------------------------------

    def drift(self, committed: StateSnapshot, live: LiveState) -> DriftReport:
        """Report how much the working tree has drifted from the last commit.

        Called by ``muse status``.  Takes a snapshot of the current live
        state and diffs it against the committed snapshot.  No ``repo_root``
        is passed to :meth:`diff`, so the resulting delta is file-level.

        Args:
            committed: The last committed snapshot.
            live: Current live state (path or snapshot manifest).

        Returns:
            A ``DriftReport`` describing what has changed since the last
            commit.
        """
        current = self.snapshot(live)
        delta = self.diff(committed, current)
        return DriftReport(
            has_drift=len(delta["ops"]) > 0,
            summary=delta["summary"],
            delta=delta,
        )

    # ------------------------------------------------------------------
    # 5. apply
    # ------------------------------------------------------------------

    def apply(self, delta: StateDelta, live_state: LiveState) -> LiveState:
        """Apply a delta to the working tree.

        Called by ``muse checkout`` after the core engine has already
        restored file-level objects from the object store.  The code plugin
        has no domain-specific post-processing to perform, so this is a
        pass-through.

        Args:
            delta: The typed operation list (unused at post-checkout time).
            live_state: Current live state (returned unchanged).

        Returns:
            *live_state* unchanged.
        """
        return live_state

    # ------------------------------------------------------------------
    # 6. schema
    # ------------------------------------------------------------------

    def schema(self) -> DomainSchema:
        """Declare the structural schema of the code domain.

        Returns:
            A ``DomainSchema`` with five semantic dimensions:
            ``structure``, ``symbols``, ``imports``, ``variables``,
            and ``metadata``.
        """
        return DomainSchema(
            domain=_DOMAIN_NAME,
            description=(
                "Semantic version control for source code. "
                "Treats code as a structured system of named symbols "
                "(functions, classes, methods) rather than lines of text. "
                "Two commits that only reformat a file produce no delta. "
                "Renames and moves are detected via content-addressed "
                "symbol identity."
            ),
            top_level=TreeSchema(
                kind="tree",
                node_type="module",
                diff_algorithm="gumtree",
            ),
            dimensions=[
                DimensionSpec(
                    name="structure",
                    description=(
                        "Module / file tree. Tracks which files exist and "
                        "how they relate to each other."
                    ),
                    schema=TreeSchema(
                        kind="tree",
                        node_type="file",
                        diff_algorithm="gumtree",
                    ),
                    independent_merge=False,
                ),
                DimensionSpec(
                    name="symbols",
                    description=(
                        "AST symbol tree. Functions, classes, methods, and "
                        "variables — the primary unit of semantic change."
                    ),
                    schema=TreeSchema(
                        kind="tree",
                        node_type="symbol",
                        diff_algorithm="gumtree",
                    ),
                    independent_merge=True,
                ),
                DimensionSpec(
                    name="imports",
                    description=(
                        "Import set. Tracks added / removed import statements "
                        "as an unordered set — order is semantically irrelevant."
                    ),
                    schema=SetSchema(
                        kind="set",
                        element_type="import",
                        identity="by_content",
                    ),
                    independent_merge=True,
                ),
                DimensionSpec(
                    name="variables",
                    description=(
                        "Top-level variable and constant assignments. "
                        "Tracked as an unordered set."
                    ),
                    schema=SetSchema(
                        kind="set",
                        element_type="variable",
                        identity="by_content",
                    ),
                    independent_merge=True,
                ),
                DimensionSpec(
                    name="metadata",
                    description=(
                        "Non-code files: configuration, documentation, "
                        "build scripts, etc. Tracked at file granularity."
                    ),
                    schema=SetSchema(
                        kind="set",
                        element_type="file",
                        identity="by_content",
                    ),
                    independent_merge=True,
                ),
            ],
            merge_mode="three_way",
            schema_version=1,
        )

    # ------------------------------------------------------------------
    # StructuredMergePlugin — OT extension
    # ------------------------------------------------------------------

    def merge_ops(
        self,
        base: StateSnapshot,
        ours_snap: StateSnapshot,
        theirs_snap: StateSnapshot,
        ours_ops: list[DomainOp],
        theirs_ops: list[DomainOp],
        *,
        repo_root: pathlib.Path | None = None,
    ) -> MergeResult:
        """Operation-level three-way merge using Operational Transformation.

        Conflict detection runs in three passes:

        1. For files patched on *both* sides, every pair of ``child_ops`` is
           tested with :func:`~muse.core.op_transform.ops_commute`; pairs
           that do not commute yield a symbol-level conflict address.  Two
           agents modifying *different* functions in the same file therefore
           auto-merge, while concurrent modifications to the *same* function
           conflict.
        2. For files patched on one side and touched by a file-level op
           (insert / delete / replace) on the other, the file path is
           reported as a conflict unless both final snapshots hold the same
           digest for it.
        3. The remaining file-level ops are handed to
           :func:`~muse.core.op_transform.merge_op_lists`.

        The reconciled ``merged`` snapshot is produced by the file-level
        three-way :meth:`merge` fallback (we cannot reconstruct merged source
        bytes without a text-merge pass).  This is correct for all cases
        where the two sides touched *different* files.  For the same-file-
        different-symbol case the merged manifest holds the *ours* version of
        the file — annotated as a conflict-free merge — which may require the
        user to re-apply the theirs changes manually.  This limitation is
        documented and will be lifted in a future release that implements
        source-level patching.

        Args:
            base: Common ancestor snapshot.
            ours_snap: Our branch's final snapshot.
            theirs_snap: Their branch's final snapshot.
            ours_ops: Our branch's typed operation list.
            theirs_ops: Their branch's typed operation list.
            repo_root: Repository root for ``.museattributes`` lookup.

        Returns:
            A ``MergeResult`` whose ``conflicts`` is a sorted list of
            symbol-level addresses (e.g. ``"src/utils.py::calculate_total"``)
            for same-file patch conflicts, plus file paths for file-level
            conflicts.
        """
        # The core OT engine's _op_key for PatchOp hashes only the file path
        # and child_domain — not the child_ops themselves.  This means two
        # PatchOps for the same file are treated as "consensus" regardless of
        # whether they touch the same or different symbols.  We therefore
        # implement symbol-level conflict detection directly here.

        ours_patches: dict[str, PatchOp] = {
            op["address"]: op for op in ours_ops if op["op"] == "patch"
        }
        theirs_patches: dict[str, PatchOp] = {
            op["address"]: op for op in theirs_ops if op["op"] == "patch"
        }
        non_patch_ours: list[DomainOp] = [
            op for op in ours_ops if op["op"] != "patch"
        ]
        non_patch_theirs: list[DomainOp] = [
            op for op in theirs_ops if op["op"] != "patch"
        ]

        conflict_addresses: set[str] = set()

        # ── Step 1: symbol-level conflict detection for PatchOps ──────────
        for path, our_patch in ours_patches.items():
            their_patch = theirs_patches.get(path)
            if their_patch is None:
                continue
            for our_child in our_patch["child_ops"]:
                for their_child in their_patch["child_ops"]:
                    if not ops_commute(our_child, their_child):
                        conflict_addresses.add(our_child["address"])

        # ── Step 2: patch on one side vs. file-level op on the other ──────
        # Steps 1 and 3 each see only one kind of op, so e.g. "ours edited
        # a function, theirs deleted the file" would otherwise slip through
        # both — and the fallback merge's own conflict list is not returned.
        ours_file_level = {op["address"] for op in non_patch_ours}
        theirs_file_level = {op["address"] for op in non_patch_theirs}
        cross_kind = (ours_patches.keys() & theirs_file_level) | (
            theirs_patches.keys() & ours_file_level
        )
        ours_files = ours_snap["files"]
        theirs_files = theirs_snap["files"]
        for path in cross_kind:
            # Identical end states are consensus, not a conflict.
            if ours_files.get(path) != theirs_files.get(path):
                conflict_addresses.add(path)

        # ── Step 3: coarse OT for non-PatchOp ops (file-level inserts/deletes) ──
        file_result = merge_op_lists(
            base_ops=[],
            ours_ops=non_patch_ours,
            theirs_ops=non_patch_theirs,
        )
        for our_op, _ in file_result.conflict_ops:
            conflict_addresses.add(our_op["address"])

        conflicts: list[str] = sorted(conflict_addresses)
        # NOTE(review): ``ours_ops`` is appended in full, so our file-level
        # ops may appear twice if ``merged_ops`` already contains them —
        # confirm against ``merge_op_lists`` before deduplicating.
        merged_ops: list[DomainOp] = list(file_result.merged_ops) + list(ours_ops)

        # Fall back to file-level merge for the manifest.
        fallback = self.merge(base, ours_snap, theirs_snap, repo_root=repo_root)
        return MergeResult(
            merged=fallback.merged,
            conflicts=conflicts,
            applied_strategies=fallback.applied_strategies,
            dimension_reports=fallback.dimension_reports,
            op_log=merged_ops,
        )
552
553
554 # ---------------------------------------------------------------------------
555 # Private helpers
556 # ---------------------------------------------------------------------------
557
558
def _hash_file(path: pathlib.Path) -> str:
    """Compute the SHA-256 hex digest of the raw bytes stored at *path*.

    The file is read in 64 KiB pieces rather than all at once, so memory
    use does not grow with file size.
    """
    digest = hashlib.sha256()
    with path.open("rb") as stream:
        while True:
            piece = stream.read(65_536)
            if not piece:
                # An empty read signals end of file.
                break
            digest.update(piece)
    return digest.hexdigest()
566
567
def _file_level_ops(
    base_files: dict[str, str],
    target_files: dict[str, str],
) -> list[DomainOp]:
    """Build coarse per-file ops by comparing two manifests (no AST parsing).

    The result lists insertions first, then deletions, then replacements;
    each group is ordered by path.  A file present in both manifests yields
    a ``ReplaceOp`` only when its digest differs.
    """
    added = sorted(target_files.keys() - base_files.keys())
    removed = sorted(base_files.keys() - target_files.keys())
    modified = sorted(
        path
        for path in base_files.keys() & target_files.keys()
        if base_files[path] != target_files[path]
    )

    ops: list[DomainOp] = [
        InsertOp(
            op="insert",
            address=path,
            position=None,
            content_id=target_files[path],
            content_summary=f"added {path}",
        )
        for path in added
    ]
    ops.extend(
        DeleteOp(
            op="delete",
            address=path,
            position=None,
            content_id=base_files[path],
            content_summary=f"removed {path}",
        )
        for path in removed
    )
    ops.extend(
        ReplaceOp(
            op="replace",
            address=path,
            position=None,
            old_content_id=base_files[path],
            new_content_id=target_files[path],
            old_summary=f"{path} (before)",
            new_summary=f"{path} (after)",
        )
        for path in modified
    )
    return ops
605
606
def _semantic_ops(
    base_files: dict[str, str],
    target_files: dict[str, str],
    repo_root: pathlib.Path,
) -> list[DomainOp]:
    """Build symbol-level ops using file contents from the object store.

    Only paths that were added, removed, or whose digest changed are read
    and parsed.  A path whose object cannot be read gets no symbol tree on
    that side.  The collected trees and both manifests are passed to
    ``build_diff_ops``, which produces the final op list.
    """
    base_paths = set(base_files)
    target_paths = set(target_files)

    added = target_paths - base_paths
    removed = base_paths - target_paths
    modified = {
        p for p in base_paths & target_paths
        if base_files[p] != target_files[p]
    }
    changed_paths = added | removed | modified

    base_trees: dict[str, SymbolTree] = {}
    target_trees: dict[str, SymbolTree] = {}
    # Each changed path is looked up on the base side first, then the target.
    sides = ((base_files, base_trees), (target_files, target_trees))

    for path in changed_paths:
        for manifest, trees in sides:
            if path not in manifest:
                continue
            raw = read_object(repo_root, manifest[path])
            if raw is None:
                continue
            trees[path] = _parse_with_fallback(raw, path)

    return build_diff_ops(base_files, target_files, base_trees, target_trees)
639
640
def _parse_with_fallback(source: bytes, file_path: str) -> SymbolTree:
    """Parse symbols from *source*, returning an empty tree on any error.

    Parsing is best-effort: any exception raised by ``parse_symbols`` is
    caught so that one unparsable file does not abort the whole operation.

    Args:
        source: Raw file bytes to parse.
        file_path: Path of the file, passed to ``parse_symbols`` and used in
            the log message.

    Returns:
        The parsed ``SymbolTree``, or ``{}`` when parsing failed.
    """
    try:
        return parse_symbols(source, file_path)
    except Exception:
        # Deliberately broad: this is the boundary around the parser.  Keep
        # the traceback in the debug log so failures stay diagnosable.
        logger.debug(
            "Symbol parsing failed for %s — falling back to file-level.",
            file_path,
            exc_info=True,
        )
        return {}
648
649
def _load_symbol_trees_from_workdir(
    workdir: pathlib.Path,
    manifest: dict[str, str],
) -> dict[str, SymbolTree]:
    """Build symbol trees for all files in *manifest* that live in *workdir*.

    A manifest entry is skipped when it is not a regular file under
    *workdir*, when the adapter chosen for its path does not list the
    file's (lower-cased) suffix as supported, or when the file cannot be
    read.

    Args:
        workdir: Directory the manifest paths are relative to.
        manifest: Mapping whose keys are workspace-relative paths; the
            values are not used here.

    Returns:
        A dict mapping each parsed path to its ``SymbolTree``.
    """
    trees: dict[str, SymbolTree] = {}
    for rel_path in manifest:
        file_path = workdir / rel_path
        if not file_path.is_file():
            continue
        # Check adapter support before touching the file contents so that
        # unsupported (possibly large) files are never read.
        suffix = pathlib.PurePosixPath(rel_path).suffix.lower()
        if suffix not in adapter_for_path(rel_path).supported_extensions():
            continue
        try:
            source = file_path.read_bytes()
        except OSError:
            # The file vanished or is unreadable; skip it.
            continue
        trees[rel_path] = _parse_with_fallback(source, rel_path)
    return trees
669
670
671 # ---------------------------------------------------------------------------
672 # Module-level singleton
673 # ---------------------------------------------------------------------------
674
#: The singleton plugin instance registered in ``muse/plugins/registry.py``.
#: Created once at import time; ``CodePlugin`` takes no constructor arguments.
plugin = CodePlugin()