cgcardona / muse public
snapshot.py python
160 lines 5.4 KB
12901c5a Initial extraction from tellurstori/maestro cgcardona <gabriel@tellurstori.com> 4d ago
1 """Pure filesystem snapshot logic for ``muse commit``.
2
3 All functions here are side-effect-free (no DB, no I/O besides reading
4 files under ``workdir``). They are kept separate so they can be
5 unit-tested without a database.
6
7 ID derivation contract (deterministic, no random/UUID components):
8
9 object_id = sha256(file_bytes).hexdigest()
10 snapshot_id = sha256("|".join(sorted(f"{path}:{oid}" for path, oid in manifest.items()))).hexdigest()
11 commit_id = sha256(
12 "|".join(sorted(parent_ids))
13 + "|" + snapshot_id
14 + "|" + message
15 + "|" + committed_at_iso
16 ).hexdigest()
17 """
18 from __future__ import annotations
19
20 import hashlib
21 import pathlib
22
23
def hash_file(path: pathlib.Path) -> str:
    """Compute the ``object_id`` of *path*: the sha256 hex digest of its bytes.

    The file is streamed in 64 KiB blocks, so memory use stays bounded no
    matter how large the file is.

    Args:
        path: File to hash; it is opened in binary mode.

    Returns:
        The hexadecimal sha256 digest of the file's raw contents.
    """
    digest = hashlib.sha256()
    with path.open("rb") as stream:
        while True:
            block = stream.read(65536)
            if not block:
                # An empty read means end-of-file.
                break
            digest.update(block)
    return digest.hexdigest()
35
36
def build_snapshot_manifest(workdir: pathlib.Path) -> dict[str, str]:
    """Build the ``{rel_path: object_id}`` manifest for *workdir*.

    This is the preferred public-API name for ``walk_workdir``; it delegates
    to that function and returns its result unchanged.
    """
    manifest = walk_workdir(workdir)
    return manifest
40
41
def walk_workdir(workdir: pathlib.Path) -> dict[str, str]:
    """Walk *workdir* recursively and return ``{rel_path: object_id}``.

    Only regular files are included; directories and symlinks are skipped.
    A symlink is skipped even when it resolves to a regular file, so the
    manifest never depends on content reached through a link.  Paths use
    POSIX separators regardless of host OS for cross-platform
    reproducibility.

    Files whose own name starts with ``.`` are excluded.  Only the final
    path component is checked: a non-hidden file that lives inside a hidden
    directory is still included.

    Args:
        workdir: Root directory to scan.

    Returns:
        A mapping from each included file's path (relative to *workdir*,
        POSIX-style) to the sha256 hex digest of its contents.
    """
    manifest: dict[str, str] = {}
    for file_path in sorted(workdir.rglob("*")):
        # ``is_file()`` follows symlinks, so a link to a regular file would
        # pass that check; reject links explicitly to honour the contract.
        if file_path.is_symlink() or not file_path.is_file():
            continue
        if file_path.name.startswith("."):
            continue
        rel = file_path.relative_to(workdir).as_posix()
        manifest[rel] = hash_file(file_path)
    return manifest
58
59
def compute_snapshot_id(manifest: dict[str, str]) -> str:
    """Derive the snapshot ID from a ``{path: object_id}`` manifest.

    Each entry is rendered as ``path:object_id``; the rendered entries are
    sorted, joined with ``|``, encoded, and hashed with sha256.  Because of
    the sort, the result does not depend on the manifest's insertion order.

    Args:
        manifest: Mapping of relative path to object ID.

    Returns:
        The sha256 hex digest of the canonical manifest string.
    """
    entries = [f"{path}:{oid}" for path, oid in manifest.items()]
    entries.sort()
    canonical = "|".join(entries)
    return hashlib.sha256(canonical.encode()).hexdigest()
69
70
def diff_workdir_vs_snapshot(
    workdir: pathlib.Path,
    last_manifest: dict[str, str],
) -> tuple[set[str], set[str], set[str], set[str]]:
    """Classify working-tree changes relative to the previous commit's manifest.

    Returns ``(added, modified, deleted, untracked)``, four disjoint sets of
    POSIX-style relative paths:

    - ``added``     — on disk but not in *last_manifest*.
    - ``modified``  — in both, but the object IDs differ.
    - ``deleted``   — in *last_manifest* but no longer on disk.
    - ``untracked`` — populated only when *last_manifest* is empty (no prior
                      snapshot); every file on disk is then reported here
                      instead of as ``added``.

    If *workdir* does not exist, every path in *last_manifest* is reported
    as deleted and the other three sets are empty.
    """
    previous = set(last_manifest)

    if not workdir.exists():
        # No working tree at all: everything previously committed is gone.
        return set(), set(), previous, set()

    on_disk = walk_workdir(workdir)
    present = set(on_disk)

    if not previous:
        # Nothing has been committed yet, so nothing can count as "added".
        return set(), set(), set(), present

    modified = {
        path
        for path in present & previous
        if on_disk[path] != last_manifest[path]
    }
    return present - previous, modified, previous - present, set()
106
107
def compute_commit_id(
    parent_ids: list[str],
    snapshot_id: str,
    message: str,
    committed_at_iso: str,
) -> str:
    """Derive a commit ID from the commit's canonical inputs.

    The hashed payload is the sorted parent IDs joined with ``|``, followed
    by *snapshot_id*, *message* and *committed_at_iso*, all separated by
    ``|``.  Parent IDs are sorted first, so their order in *parent_ids* does
    not influence the result; identical arguments always give the same ID.

    Returns:
        The sha256 hex digest of the canonical payload.
    """
    canonical_parents = "|".join(sorted(parent_ids))
    fields = (canonical_parents, snapshot_id, message, committed_at_iso)
    digest = hashlib.sha256("|".join(fields).encode())
    return digest.hexdigest()
128
129
def compute_commit_tree_id(
    parent_ids: list[str],
    snapshot_id: str,
    message: str,
    author: str,
) -> str:
    """Derive a timestamp-free commit ID for a raw plumbing commit.

    In contrast to ``compute_commit_id``, no commit time enters the hash:
    the ID is a function of (parent_ids, snapshot_id, message, author) only.
    Repeating a call with identical inputs therefore yields the same ID,
    which lets a caller such as ``muse commit-tree`` detect a repeat
    instead of recording a duplicate.

    Args:
        parent_ids: Zero or more parent commit IDs. Sorted before hashing.
        snapshot_id: ID of the snapshot this commit points to.
        message: The commit message.
        author: The author name string.

    Returns:
        The sha256 hex digest of the ``|``-separated canonical payload.
    """
    canonical_parents = "|".join(sorted(parent_ids))
    # Same layout as a four-element "|" join: parents|snapshot|message|author.
    canonical = canonical_parents + "|" + snapshot_id + "|" + message + "|" + author
    return hashlib.sha256(canonical.encode()).hexdigest()