maestro/muse_cli/snapshot.py · cgcardona/muse

snapshot.py python

160 lines 5.4 KB

12901c5a Initial extraction from tellurstori/maestro cgcardona <gabriel@tellurstori.com> 4d ago

1	"""Pure filesystem snapshot logic for ``muse commit``.
2
3	All functions here are side-effect-free (no DB, no I/O besides reading
4	files under ``workdir``). They are kept separate so they can be
5	unit-tested without a database.
6
7	ID derivation contract (deterministic, no random/UUID components):
8
9	object_id = sha256(file_bytes).hexdigest()
10	snapshot_id = sha256("\|".join(sorted(f"{path}:{oid}" for path, oid in manifest.items()))).hexdigest()
11	commit_id = sha256(
12	"\|".join(sorted(parent_ids))
13	+ "\|" + snapshot_id
14	+ "\|" + message
15	+ "\|" + committed_at_iso
16	).hexdigest()
17	"""
18	from __future__ import annotations
19
20	import hashlib
21	import pathlib
22
23
def hash_file(path: pathlib.Path) -> str:
    """Compute the ``object_id`` of a file: the sha256 hex digest of its bytes.

    The file is opened in binary mode and streamed through the hasher in
    64 KiB blocks, so peak memory does not grow with the size of the file.
    """
    digest = hashlib.sha256()
    with path.open("rb") as stream:
        while True:
            block = stream.read(65536)
            if not block:
                # An empty read marks end-of-file.
                break
            digest.update(block)
    return digest.hexdigest()
35
36
def build_snapshot_manifest(workdir: pathlib.Path) -> dict[str, str]:
    """Build the snapshot manifest for ``workdir``.

    This is the preferred public-API name for ``walk_workdir``: the call is
    forwarded unchanged and the resulting manifest is returned as-is.
    """
    manifest = walk_workdir(workdir)
    return manifest
40
41
def walk_workdir(workdir: pathlib.Path) -> dict[str, str]:
    """Walk ``workdir`` recursively and return ``{rel_path: object_id}``.

    Only regular files are included; directories and symlinks (including
    symlinks that point at regular files) are skipped. Keys are paths
    relative to ``workdir`` written with POSIX separators regardless of host
    OS, for cross-platform reproducibility. Values are the sha256 hex digests
    produced by ``hash_file``.

    Files whose own name starts with ``.`` are excluded. The check looks only
    at the final path component, so a non-hidden file that lives inside a
    dot-directory is still included.

    Args:
        workdir: Root directory to scan.

    Returns:
        Mapping of relative POSIX path to object ID, inserted in sorted path
        order. Empty when nothing under ``workdir`` qualifies.
    """
    manifest: dict[str, str] = {}
    for file_path in sorted(workdir.rglob("*")):
        # ``is_file()`` follows symlinks, so a link to a regular file would
        # otherwise slip through; reject links explicitly to honour the
        # "symlinks are skipped" contract.
        if file_path.is_symlink() or not file_path.is_file():
            continue
        if file_path.name.startswith("."):
            continue
        rel = file_path.relative_to(workdir).as_posix()
        manifest[rel] = hash_file(file_path)
    return manifest
58
59
def compute_snapshot_id(manifest: dict[str, str]) -> str:
    """Return the sha256 hex digest of the sorted ``path:object_id`` pairs.

    Each manifest entry is rendered as ``"{path}:{oid}"``; the rendered
    entries are sorted, joined with the separator and UTF-8 encoded before
    hashing. Sorting ensures two identical working trees always produce the
    same snapshot_id, regardless of filesystem traversal order.

    Args:
        manifest: Mapping of relative path to object ID.

    Returns:
        A 64-character lowercase hex SHA-256 digest. An empty manifest hashes
        the empty byte string.
    """
    # The backslash is escaped explicitly: this is the same two-character
    # separator the unescaped literal evaluated to, without relying on an
    # invalid escape sequence (a SyntaxWarning on recent Python versions).
    separator = "\\|"
    entries = sorted(f"{path}:{oid}" for path, oid in manifest.items())
    payload = separator.join(entries).encode()
    return hashlib.sha256(payload).hexdigest()
69
70
def diff_workdir_vs_snapshot(
    workdir: pathlib.Path,
    last_manifest: dict[str, str],
) -> tuple[set[str], set[str], set[str], set[str]]:
    """Classify working-tree paths relative to the previous commit's manifest.

    The result is a 4-tuple ``(added, modified, deleted, untracked)`` of
    disjoint path sets:

    - ``added``     — paths found in workdir but not in last_manifest.
    - ``modified``  — paths found in both whose object IDs differ.
    - ``deleted``   — paths in last_manifest that are missing from workdir.
    - ``untracked`` — populated only when last_manifest is empty (the branch
      has no commits yet); in that case every workdir file is reported here
      instead of under ``added``.

    If ``workdir`` does not exist, every path in last_manifest is reported as
    deleted. All paths use POSIX separators for cross-platform
    reproducibility.
    """
    previous = set(last_manifest)

    # A missing working directory means every committed path has vanished.
    if not workdir.exists():
        return set(), set(), previous, set()

    on_disk = walk_workdir(workdir)
    present = set(on_disk)

    # With no prior snapshot there is nothing to diff against: everything
    # currently on disk is untracked.
    if not previous:
        return set(), set(), set(), present

    changed = {
        path
        for path in present & previous
        if on_disk[path] != last_manifest[path]
    }
    return present - previous, changed, previous - present, set()
106
107
def compute_commit_id(
    parent_ids: list[str],
    snapshot_id: str,
    message: str,
    committed_at_iso: str,
) -> str:
    """Return the sha256 hex digest of the commit's canonical inputs.

    The hashed payload is the sorted parent IDs (joined with the separator),
    then ``snapshot_id``, ``message`` and ``committed_at_iso``, all joined
    with the same separator and UTF-8 encoded. Given the same arguments on
    two machines the result is identical; ``parent_ids`` is sorted before
    hashing so insertion order does not affect determinism.

    Args:
        parent_ids: Zero or more parent commit IDs. Sorted before hashing.
        snapshot_id: ID of the snapshot this commit points to.
        message: The commit message.
        committed_at_iso: Commit timestamp string, hashed verbatim.

    Returns:
        A 64-character lowercase hex SHA-256 digest.
    """
    # The backslash is escaped explicitly: this is the same two-character
    # separator the unescaped literal evaluated to, without relying on an
    # invalid escape sequence (a SyntaxWarning on recent Python versions).
    # NOTE: fields are joined without escaping, so the separator appearing
    # inside a field is not distinguished from a field boundary.
    separator = "\\|"
    parts = [
        separator.join(sorted(parent_ids)),
        snapshot_id,
        message,
        committed_at_iso,
    ]
    payload = separator.join(parts).encode()
    return hashlib.sha256(payload).hexdigest()
128
129
def compute_commit_tree_id(
    parent_ids: list[str],
    snapshot_id: str,
    message: str,
    author: str,
) -> str:
    """Return a deterministic commit ID for a raw plumbing commit (no timestamp).

    Unlike ``compute_commit_id``, this function omits ``committed_at`` so that
    the same (parent_ids, snapshot_id, message, author) tuple always produces
    the same commit_id. This guarantees idempotency for ``muse commit-tree``:
    re-running with identical inputs returns the same ID without inserting a
    duplicate row.

    Args:
        parent_ids: Zero or more parent commit IDs. Sorted before hashing.
        snapshot_id: The sha256 ID of the snapshot this commit points to.
        message: The commit message.
        author: The author name string.

    Returns:
        A 64-character lowercase hex SHA-256 digest.
    """
    # The backslash is escaped explicitly: this is the same two-character
    # separator the unescaped literal evaluated to, without relying on an
    # invalid escape sequence (a SyntaxWarning on recent Python versions).
    # NOTE: fields are joined without escaping, so the separator appearing
    # inside a field is not distinguished from a field boundary.
    separator = "\\|"
    parts = [
        separator.join(sorted(parent_ids)),
        snapshot_id,
        message,
        author,
    ]
    payload = separator.join(parts).encode()
    return hashlib.sha256(payload).hexdigest()

Content Address

Object ID (SHA-256)

a88739a68c7ece041c596e213868e4ea2f57ec353d2a39211dd86a1b635bdbf9

This file is immutable and content-addressed. The same SHA always refers to the same bytes, across every clone and every time.

File Info

Path maestro/muse_cli/snapshot.py

Lines 160

Size 5.4 KB

Language python

Ref 12901c5a

Snapshot 219767745448…

Last Modified

12901c5a

Initial extraction from tellurstori/maestro

cgcardona <gabriel@tellurstori.com> 4d ago

View commit →

Links

Browse tree at 12901c5a All commits View raw