cgcardona / muse public
test_diff_algorithms.py python
498 lines 19.4 KB
d855b718 refactor: strip phase/v2 workflow labels from all source, tests, and docs Gabriel Cardona <cgcardona@gmail.com> 1d ago
1 """Tests for the diff algorithm library.
2
3 Covers all four algorithm modules (lcs, tree_edit, numerical, set_ops) and
4 the schema-driven dispatch in ``muse.core.diff_algorithms``.
5
6 Each algorithm is tested at three levels:
7 1. **Unit** — the core function in isolation.
8 2. **Output shape** — the returned ``StructuredDelta`` is well-formed.
9 3. **Dispatch** — ``diff_by_schema`` routes correctly for each schema kind.
10 """
11 from __future__ import annotations
12
13 import hashlib
14
15 import pytest
16 from typing import Literal
17
18 from muse.core.diff_algorithms import (
19 DiffInput,
20 MapInput,
21 SequenceInput,
22 SetInput,
23 TensorInput,
24 TreeInput,
25 TreeNode,
26 diff_by_schema,
27 )
28 from muse.core.diff_algorithms import lcs as lcs_mod
29 from muse.core.diff_algorithms import numerical as numerical_mod
30 from muse.core.diff_algorithms import set_ops as set_ops_mod
31 from muse.core.diff_algorithms import tree_edit as tree_edit_mod
32 from muse.core.schema import (
33 MapSchema,
34 SequenceSchema,
35 SetSchema,
36 TensorSchema,
37 TreeSchema,
38 )
39 from muse.domain import StructuredDelta
40
41
42 # ---------------------------------------------------------------------------
43 # Helpers
44 # ---------------------------------------------------------------------------
45
46
47 def _cid(s: str) -> str:
48 """Return a deterministic SHA-256 hex for a short string."""
49 return hashlib.sha256(s.encode()).hexdigest()
50
51
52 def _seq_schema(element_type: str = "item") -> SequenceSchema:
53 return SequenceSchema(
54 kind="sequence",
55 element_type=element_type,
56 identity="by_position",
57 diff_algorithm="lcs",
58 alphabet=None,
59 )
60
61
62 def _set_schema(element_type: str = "file") -> SetSchema:
63 return SetSchema(kind="set", element_type=element_type, identity="by_content")
64
65
66 DiffMode = Literal["sparse", "block", "full"]
67
68
69 def _tensor_schema(
70 mode: DiffMode = "sparse", epsilon: float = 0.0
71 ) -> TensorSchema:
72 return TensorSchema(
73 kind="tensor",
74 dtype="float32",
75 rank=1,
76 epsilon=epsilon,
77 diff_mode=mode,
78 )
79
80
81 def _tree_schema() -> TreeSchema:
82 return TreeSchema(kind="tree", node_type="node", diff_algorithm="zhang_shasha")
83
84
85 def _map_schema() -> MapSchema:
86 return MapSchema(
87 kind="map",
88 key_type="key",
89 value_schema=_seq_schema(),
90 identity="by_key",
91 )
92
93
94 def _leaf(label: str) -> TreeNode:
95 return TreeNode(id=label, label=label, content_id=_cid(label), children=())
96
97
98 def _node(label: str, *children: TreeNode) -> TreeNode:
99 return TreeNode(
100 id=label, label=label, content_id=_cid(label), children=tuple(children)
101 )
102
103
104 def _is_valid_delta(d: StructuredDelta) -> bool:
105 return isinstance(d["ops"], list) and isinstance(d["summary"], str)
106
107
108 # ===========================================================================
109 # LCS / Myers tests
110 # ===========================================================================
111
112
113 class TestLCSMyersSES:
114 def test_empty_to_empty_returns_no_steps(self) -> None:
115 steps = lcs_mod.myers_ses([], [])
116 assert steps == []
117
118 def test_empty_to_sequence_all_inserts(self) -> None:
119 steps = lcs_mod.myers_ses([], ["a", "b", "c"])
120 kinds = [s.kind for s in steps]
121 assert kinds == ["insert", "insert", "insert"]
122
123 def test_sequence_to_empty_all_deletes(self) -> None:
124 steps = lcs_mod.myers_ses(["a", "b", "c"], [])
125 kinds = [s.kind for s in steps]
126 assert kinds == ["delete", "delete", "delete"]
127
128 def test_identical_sequences_all_keeps(self) -> None:
129 ids = ["a", "b", "c"]
130 steps = lcs_mod.myers_ses(ids, ids)
131 assert all(s.kind == "keep" for s in steps)
132 assert len(steps) == 3
133
134 def test_single_insert_in_middle(self) -> None:
135 base = ["a", "c"]
136 target = ["a", "b", "c"]
137 steps = lcs_mod.myers_ses(base, target)
138 inserts = [s for s in steps if s.kind == "insert"]
139 assert len(inserts) == 1
140 assert inserts[0].item == "b"
141
142 def test_single_delete_in_middle(self) -> None:
143 base = ["a", "b", "c"]
144 target = ["a", "c"]
145 steps = lcs_mod.myers_ses(base, target)
146 deletes = [s for s in steps if s.kind == "delete"]
147 assert len(deletes) == 1
148 assert deletes[0].item == "b"
149
150 def test_lcs_is_minimal(self) -> None:
151 base = ["a", "b", "c", "d"]
152 target = ["a", "x", "c", "d"]
153 steps = lcs_mod.myers_ses(base, target)
154 keeps = [s for s in steps if s.kind == "keep"]
155 inserts = [s for s in steps if s.kind == "insert"]
156 deletes = [s for s in steps if s.kind == "delete"]
157 assert len(keeps) == 3 # a, c, d are kept
158 assert len(inserts) == 1
159 assert len(deletes) == 1
160
161 def test_step_indices_are_consistent(self) -> None:
162 base = ["x", "y", "z"]
163 target = ["y", "z", "w"]
164 steps = lcs_mod.myers_ses(base, target)
165 for s in steps:
166 if s.kind == "delete":
167 assert s.item == base[s.base_index]
168 elif s.kind == "insert":
169 assert s.item == target[s.target_index]
170
171
172 class TestLCSDetectMoves:
173 def test_paired_delete_insert_becomes_move(self) -> None:
174 from muse.domain import DeleteOp, InsertOp
175
176 cid = _cid("note")
177 ins_op = InsertOp(op="insert", address="", position=3, content_id=cid, content_summary="")
178 del_op = DeleteOp(op="delete", address="", position=0, content_id=cid, content_summary="")
179 moves, rem_ins, rem_del = lcs_mod.detect_moves([ins_op], [del_op])
180 assert len(moves) == 1
181 assert moves[0]["op"] == "move"
182 assert moves[0]["from_position"] == 0
183 assert moves[0]["to_position"] == 3
184 assert len(rem_ins) == 0
185 assert len(rem_del) == 0
186
187 def test_same_position_not_a_move(self) -> None:
188 from muse.domain import DeleteOp, InsertOp
189
190 cid = _cid("item")
191 ins_op = InsertOp(op="insert", address="", position=1, content_id=cid, content_summary="")
192 del_op = DeleteOp(op="delete", address="", position=1, content_id=cid, content_summary="")
193 moves, rem_ins, rem_del = lcs_mod.detect_moves([ins_op], [del_op])
194 assert len(moves) == 0
195 assert len(rem_ins) == 1
196 assert len(rem_del) == 1
197
198 def test_no_paired_content_no_moves(self) -> None:
199 from muse.domain import DeleteOp, InsertOp
200
201 ins_op = InsertOp(op="insert", address="", position=0, content_id=_cid("a"), content_summary="")
202 del_op = DeleteOp(op="delete", address="", position=0, content_id=_cid("b"), content_summary="")
203 moves, rem_ins, rem_del = lcs_mod.detect_moves([ins_op], [del_op])
204 assert len(moves) == 0
205 assert len(rem_ins) == 1
206 assert len(rem_del) == 1
207
208
209 class TestLCSDiff:
210 def test_empty_to_sequence_is_all_inserts(self) -> None:
211 delta = lcs_mod.diff(_seq_schema(), [], ["a", "b"], domain="test")
212 ops = [op for op in delta["ops"] if op["op"] == "insert"]
213 assert len(ops) == 2
214
215 def test_sequence_to_empty_is_all_deletes(self) -> None:
216 delta = lcs_mod.diff(_seq_schema(), ["a", "b"], [], domain="test")
217 ops = [op for op in delta["ops"] if op["op"] == "delete"]
218 assert len(ops) == 2
219
220 def test_identical_sequences_returns_no_ops(self) -> None:
221 delta = lcs_mod.diff(_seq_schema(), ["a", "b", "c"], ["a", "b", "c"], domain="test")
222 assert delta["ops"] == []
223
224 def test_produces_valid_structured_delta(self) -> None:
225 delta = lcs_mod.diff(_seq_schema("note"), ["x"], ["x", "y"], domain="music")
226 assert _is_valid_delta(delta)
227 assert delta["domain"] == "music"
228
229 def test_move_detected_from_delete_plus_insert(self) -> None:
230 a, b, c = _cid("a"), _cid("b"), _cid("c")
231 delta = lcs_mod.diff(_seq_schema(), [a, b, c], [b, c, a], domain="test")
232 ops_by_kind = {op["op"] for op in delta["ops"]}
233 assert "move" in ops_by_kind
234
235 def test_summary_is_human_readable(self) -> None:
236 delta = lcs_mod.diff(_seq_schema("note"), ["a"], ["a", "b"], domain="test")
237 assert "note" in delta["summary"]
238 assert "added" in delta["summary"]
239
240
241 # ===========================================================================
242 # Tree edit tests
243 # ===========================================================================
244
245
246 class TestTreeEditDiff:
247 def test_identical_trees_returns_no_ops(self) -> None:
248 root = _node("root", _leaf("A"), _leaf("B"))
249 delta = tree_edit_mod.diff(_tree_schema(), root, root, domain="test")
250 assert delta["ops"] == []
251
252 def test_leaf_relabel_is_replace(self) -> None:
253 base = _leaf("A")
254 old_cid = _cid("A")
255 new_node = TreeNode(id="A", label="A", content_id=_cid("A_new"), children=())
256 target = TreeNode(id="root", label="root", content_id=_cid("root"),
257 children=(new_node,))
258 base_root = TreeNode(id="root", label="root", content_id=_cid("root"),
259 children=(base,))
260 delta = tree_edit_mod.diff(_tree_schema(), base_root, target, domain="test")
261 replace_ops = [op for op in delta["ops"] if op["op"] == "replace"]
262 assert len(replace_ops) == 1
263
264 def test_node_insert(self) -> None:
265 base = _node("root", _leaf("A"))
266 target = _node("root", _leaf("A"), _leaf("B"))
267 delta = tree_edit_mod.diff(_tree_schema(), base, target, domain="test")
268 insert_ops = [op for op in delta["ops"] if op["op"] == "insert"]
269 assert len(insert_ops) >= 1
270
271 def test_node_delete(self) -> None:
272 base = _node("root", _leaf("A"), _leaf("B"))
273 target = _node("root", _leaf("A"))
274 delta = tree_edit_mod.diff(_tree_schema(), base, target, domain="test")
275 delete_ops = [op for op in delta["ops"] if op["op"] == "delete"]
276 assert len(delete_ops) >= 1
277
278 def test_subtree_move(self) -> None:
279 leaf_a = _leaf("A")
280 leaf_b = _leaf("B")
281 base = _node("root", leaf_a, leaf_b)
282 # Move: leaf_b before leaf_a
283 target = _node("root", leaf_b, leaf_a)
284 delta = tree_edit_mod.diff(_tree_schema(), base, target, domain="test")
285 # Should produce a move or a pair of delete/insert
286 op_kinds = {op["op"] for op in delta["ops"]}
287 assert op_kinds & {"move", "insert", "delete"}
288
289 def test_produces_valid_structured_delta(self) -> None:
290 base = _node("root", _leaf("X"))
291 target = _node("root", _leaf("Y"))
292 delta = tree_edit_mod.diff(_tree_schema(), base, target, domain="music")
293 assert _is_valid_delta(delta)
294 assert delta["domain"] == "music"
295
296 def test_summary_is_human_readable(self) -> None:
297 base = _node("root", _leaf("A"))
298 target = _node("root", _leaf("A"), _leaf("B"))
299 delta = tree_edit_mod.diff(_tree_schema(), base, target, domain="test")
300 assert isinstance(delta["summary"], str)
301 assert len(delta["summary"]) > 0
302
303
304 # ===========================================================================
305 # Numerical diff tests
306 # ===========================================================================
307
308
309 class TestNumericalDiff:
310 def test_within_epsilon_returns_no_ops(self) -> None:
311 schema = _tensor_schema(epsilon=1.0)
312 delta = numerical_mod.diff(schema, [1.0, 2.0, 3.0], [1.4, 2.0, 3.0], domain="test")
313 assert delta["ops"] == []
314
315 def test_outside_epsilon_returns_replace(self) -> None:
316 schema = _tensor_schema(epsilon=0.1)
317 delta = numerical_mod.diff(schema, [1.0, 2.0, 3.0], [1.0, 5.0, 3.0], domain="test")
318 assert len(delta["ops"]) == 1
319 assert delta["ops"][0]["op"] == "replace"
320
321 def test_identical_arrays_returns_no_ops(self) -> None:
322 schema = _tensor_schema()
323 delta = numerical_mod.diff(schema, [1.0, 2.0], [1.0, 2.0], domain="test")
324 assert delta["ops"] == []
325
326 def test_sparse_mode_one_op_per_element(self) -> None:
327 schema = _tensor_schema(mode="sparse", epsilon=0.0)
328 base = [1.0, 2.0, 3.0]
329 target = [9.0, 2.0, 9.0]
330 delta = numerical_mod.diff(schema, base, target, domain="test")
331 assert len(delta["ops"]) == 2 # positions 0 and 2
332 for op in delta["ops"]:
333 assert op["op"] == "replace"
334
335 def test_block_mode_groups_adjacent(self) -> None:
336 schema = _tensor_schema(mode="block", epsilon=0.0)
337 base = [1.0, 2.0, 3.0, 4.0, 5.0]
338 target = [9.0, 9.0, 3.0, 9.0, 9.0]
339 delta = numerical_mod.diff(schema, base, target, domain="test")
340 # Changes at 0,1 and 3,4 → two blocks
341 assert len(delta["ops"]) == 2
342
343 def test_full_mode_single_op(self) -> None:
344 schema = _tensor_schema(mode="full", epsilon=0.0)
345 base = [1.0, 2.0, 3.0]
346 target = [1.0, 99.0, 3.0]
347 delta = numerical_mod.diff(schema, base, target, domain="test")
348 assert len(delta["ops"]) == 1
349 assert delta["ops"][0]["op"] == "replace"
350
351 def test_length_mismatch_returns_single_replace(self) -> None:
352 schema = _tensor_schema()
353 delta = numerical_mod.diff(schema, [1.0, 2.0], [1.0, 2.0, 3.0], domain="test")
354 assert len(delta["ops"]) == 1
355 assert delta["ops"][0]["op"] == "replace"
356
357 def test_produces_valid_structured_delta(self) -> None:
358 schema = _tensor_schema(epsilon=0.5)
359 delta = numerical_mod.diff(schema, [0.0, 1.0], [0.0, 2.0], domain="music")
360 assert _is_valid_delta(delta)
361 assert delta["domain"] == "music"
362
363
364 # ===========================================================================
365 # Set ops tests
366 # ===========================================================================
367
368
369 class TestSetOpsDiff:
370 def test_add_returns_insert(self) -> None:
371 schema = _set_schema()
372 base: frozenset[str] = frozenset()
373 target = frozenset({_cid("file_a")})
374 delta = set_ops_mod.diff(schema, base, target, domain="test")
375 assert len(delta["ops"]) == 1
376 assert delta["ops"][0]["op"] == "insert"
377
378 def test_remove_returns_delete(self) -> None:
379 schema = _set_schema()
380 cid = _cid("file_a")
381 base = frozenset({cid})
382 target: frozenset[str] = frozenset()
383 delta = set_ops_mod.diff(schema, base, target, domain="test")
384 assert len(delta["ops"]) == 1
385 assert delta["ops"][0]["op"] == "delete"
386
387 def test_no_change_returns_empty(self) -> None:
388 schema = _set_schema()
389 cids = frozenset({_cid("a"), _cid("b")})
390 delta = set_ops_mod.diff(schema, cids, cids, domain="test")
391 assert delta["ops"] == []
392
393 def test_all_ops_have_none_position(self) -> None:
394 schema = _set_schema()
395 base: frozenset[str] = frozenset()
396 target = frozenset({_cid("x"), _cid("y")})
397 delta = set_ops_mod.diff(schema, base, target, domain="test")
398 for op in delta["ops"]:
399 assert op["position"] is None
400
401 def test_produces_valid_structured_delta(self) -> None:
402 schema = _set_schema("audio_file")
403 base = frozenset({_cid("drums"), _cid("bass")})
404 target = frozenset({_cid("drums"), _cid("guitar")})
405 delta = set_ops_mod.diff(schema, base, target, domain="music")
406 assert _is_valid_delta(delta)
407 assert delta["domain"] == "music"
408 assert "audio_file" in delta["summary"]
409
410
411 # ===========================================================================
412 # Schema dispatch (diff_by_schema) tests
413 # ===========================================================================
414
415
416 class TestDiffBySchema:
417 def test_dispatch_sequence_schema_calls_lcs(self) -> None:
418 schema = _seq_schema("note")
419 base: DiffInput = SequenceInput(kind="sequence", items=["a"])
420 target: DiffInput = SequenceInput(kind="sequence", items=["a", "b"])
421 delta = diff_by_schema(schema, base, target, domain="test")
422 assert delta["domain"] == "test"
423 insert_ops = [op for op in delta["ops"] if op["op"] == "insert"]
424 assert len(insert_ops) == 1
425
426 def test_dispatch_set_schema_calls_set_ops(self) -> None:
427 schema = _set_schema("file")
428 cid_a = _cid("a")
429 base: DiffInput = SetInput(kind="set", items=frozenset({cid_a}))
430 target: DiffInput = SetInput(kind="set", items=frozenset())
431 delta = diff_by_schema(schema, base, target, domain="test")
432 delete_ops = [op for op in delta["ops"] if op["op"] == "delete"]
433 assert len(delete_ops) == 1
434
435 def test_dispatch_tensor_schema_calls_numerical(self) -> None:
436 schema = _tensor_schema(epsilon=0.0)
437 base: DiffInput = TensorInput(kind="tensor", values=[1.0, 2.0])
438 target: DiffInput = TensorInput(kind="tensor", values=[1.0, 9.0])
439 delta = diff_by_schema(schema, base, target, domain="test")
440 replace_ops = [op for op in delta["ops"] if op["op"] == "replace"]
441 assert len(replace_ops) == 1
442
443 def test_dispatch_tree_schema_calls_tree_edit(self) -> None:
444 schema = _tree_schema()
445 base_tree = _node("root", _leaf("A"))
446 target_tree = _node("root", _leaf("A"), _leaf("B"))
447 base: DiffInput = TreeInput(kind="tree", root=base_tree)
448 target: DiffInput = TreeInput(kind="tree", root=target_tree)
449 delta = diff_by_schema(schema, base, target, domain="test")
450 assert _is_valid_delta(delta)
451
452 def test_dispatch_map_schema_recurses(self) -> None:
453 schema = _map_schema()
454 cid_a, cid_b = _cid("va"), _cid("vb")
455 base: DiffInput = MapInput(kind="map", entries={"key1": cid_a})
456 target: DiffInput = MapInput(kind="map", entries={"key1": cid_b, "key2": cid_a})
457 delta = diff_by_schema(schema, base, target, domain="test")
458 assert _is_valid_delta(delta)
459 # key2 added → insert op; key1 changed → replace op
460 op_kinds = [op["op"] for op in delta["ops"]]
461 assert "insert" in op_kinds
462 assert "replace" in op_kinds
463
464 def test_type_error_on_mismatched_schema_and_input(self) -> None:
465 schema = _seq_schema()
466 wrong_input: DiffInput = SetInput(kind="set", items=frozenset())
467 with pytest.raises(TypeError, match="sequence schema requires SequenceInput"):
468 diff_by_schema(schema, wrong_input, wrong_input, domain="test")
469
470 def test_identical_sequence_produces_no_ops(self) -> None:
471 schema = _seq_schema()
472 items = ["a", "b", "c"]
473 base: DiffInput = SequenceInput(kind="sequence", items=items)
474 target: DiffInput = SequenceInput(kind="sequence", items=items)
475 delta = diff_by_schema(schema, base, target, domain="test")
476 assert delta["ops"] == []
477
478 def test_map_add_key_is_insert(self) -> None:
479 schema = _map_schema()
480 base: DiffInput = MapInput(kind="map", entries={})
481 target: DiffInput = MapInput(kind="map", entries={"chr1": _cid("seq")})
482 delta = diff_by_schema(schema, base, target, domain="genomics")
483 assert delta["ops"][0]["op"] == "insert"
484
485 def test_map_remove_key_is_delete(self) -> None:
486 schema = _map_schema()
487 base: DiffInput = MapInput(kind="map", entries={"chr1": _cid("seq")})
488 target: DiffInput = MapInput(kind="map", entries={})
489 delta = diff_by_schema(schema, base, target, domain="genomics")
490 assert delta["ops"][0]["op"] == "delete"
491
492 def test_map_unchanged_returns_no_ops(self) -> None:
493 schema = _map_schema()
494 entries = {"k1": _cid("v1"), "k2": _cid("v2")}
495 base: DiffInput = MapInput(kind="map", entries=entries)
496 target: DiffInput = MapInput(kind="map", entries=entries)
497 delta = diff_by_schema(schema, base, target, domain="test")
498 assert delta["ops"] == []