snapshot.py
python
| 1 | """Pure filesystem snapshot logic for ``muse commit``. |
| 2 | |
| 3 | All functions here are side-effect-free (no DB, no I/O besides reading |
| 4 | files under ``workdir``). They are kept separate so they can be |
| 5 | unit-tested without a database. |
| 6 | |
| 7 | ID derivation contract (deterministic, no random/UUID components): |
| 8 | |
| 9 | object_id = sha256(file_bytes).hexdigest() |
| 10 | snapshot_id = sha256("|".join(sorted(f"{path}:{oid}" for path, oid in manifest.items()))).hexdigest() |
| 11 | commit_id = sha256( |
| 12 | "|".join(sorted(parent_ids)) |
| 13 | + "|" + snapshot_id |
| 14 | + "|" + message |
| 15 | + "|" + committed_at_iso |
| 16 | ).hexdigest() |
| 17 | """ |
| 18 | from __future__ import annotations |
| 19 | |
| 20 | import hashlib |
| 21 | import pathlib |
| 22 | |
| 23 | |
def hash_file(path: pathlib.Path) -> str:
    """Compute the ``object_id`` of a file: the sha256 hex digest of its bytes.

    The file is opened in binary mode and consumed in 64 KiB blocks, so only
    one block is held in memory at a time regardless of file size.

    Args:
        path: Location of the file to hash.

    Returns:
        The lowercase hexadecimal sha256 digest of the file's contents.
    """
    digest = hashlib.sha256()
    with path.open("rb") as stream:
        while True:
            block = stream.read(65536)
            if not block:
                # An empty read signals end-of-file.
                break
            digest.update(block)
    return digest.hexdigest()
| 35 | |
| 36 | |
def build_snapshot_manifest(workdir: pathlib.Path) -> dict[str, str]:
    """Build the ``{rel_path: object_id}`` manifest for *workdir*.

    This is the preferred public-API name; it is a thin alias that delegates
    to ``walk_workdir`` and returns its result unchanged.
    """
    manifest = walk_workdir(workdir)
    return manifest
| 40 | |
| 41 | |
def walk_workdir(workdir: pathlib.Path) -> dict[str, str]:
    """Walk *workdir* recursively and return ``{rel_path: object_id}``.

    Only regular files are included.  Directories are skipped, and so are
    symlinks -- including symlinks that point at a regular file.  Files whose
    own name starts with ``.`` are excluded; only the final path component is
    checked, so a normally-named file inside a dot-directory is still
    included.

    Args:
        workdir: Root directory to scan.

    Returns:
        A mapping from each file's path relative to *workdir* (POSIX
        separators regardless of host OS, for cross-platform
        reproducibility) to the sha256 hex digest of its contents.  Entries
        are inserted in sorted path order.
    """
    manifest: dict[str, str] = {}
    for file_path in sorted(workdir.rglob("*")):
        # ``is_file()`` follows symlinks, so a link to a regular file would
        # pass that check on its own; reject links explicitly to honour the
        # "symlinks are skipped" contract.
        if file_path.is_symlink() or not file_path.is_file():
            continue
        if file_path.name.startswith("."):
            continue
        rel = file_path.relative_to(workdir).as_posix()
        manifest[rel] = hash_file(file_path)
    return manifest
| 58 | |
| 59 | |
def compute_snapshot_id(manifest: dict[str, str]) -> str:
    """Derive the ``snapshot_id`` for a manifest.

    Each entry is rendered as ``path:object_id``; the rendered entries are
    sorted, joined with ``|`` and hashed with sha256.  Because of the sort,
    the result does not depend on the manifest's insertion order.

    Args:
        manifest: Mapping of relative path to object ID.

    Returns:
        The sha256 hex digest of the canonical manifest string.
    """
    entries = [f"{path}:{oid}" for path, oid in manifest.items()]
    entries.sort()
    canonical = "|".join(entries)
    digest = hashlib.sha256(canonical.encode())
    return digest.hexdigest()
| 69 | |
| 70 | |
def diff_workdir_vs_snapshot(
    workdir: pathlib.Path,
    last_manifest: dict[str, str],
) -> tuple[set[str], set[str], set[str], set[str]]:
    """Classify the differences between *workdir* and the previous snapshot.

    Args:
        workdir: Working-tree root to scan.
        last_manifest: ``{rel_path: object_id}`` mapping of the last commit;
            empty when there is no previous commit.

    Returns:
        A 4-tuple ``(added, modified, deleted, untracked)`` of path sets:

        - ``added``     -- on disk but not in *last_manifest*.
        - ``modified``  -- in both, with a different object ID.
        - ``deleted``   -- in *last_manifest* but no longer on disk.
        - ``untracked`` -- populated only when *last_manifest* is empty, in
          which case every file on disk is reported here instead of as
          ``added``.

        Paths are the keys produced by ``walk_workdir`` / *last_manifest*.
    """
    # A missing working tree means every committed path has been deleted.
    if not workdir.exists():
        return set(), set(), set(last_manifest), set()

    on_disk = walk_workdir(workdir)

    # Without a prior snapshot there is nothing to diff against.
    if not last_manifest:
        return set(), set(), set(), set(on_disk)

    added = on_disk.keys() - last_manifest.keys()
    deleted = last_manifest.keys() - on_disk.keys()

    modified = set()
    for path in on_disk.keys() & last_manifest.keys():
        if on_disk[path] != last_manifest[path]:
            modified.add(path)

    return added, modified, deleted, set()
| 106 | |
| 107 | |
def compute_commit_id(
    parent_ids: list[str],
    snapshot_id: str,
    message: str,
    committed_at_iso: str,
) -> str:
    """Derive the ``commit_id`` from the commit's canonical inputs.

    The hashed string is ``<sorted parents joined by |>|<snapshot_id>|
    <message>|<committed_at_iso>`` (no whitespace), encoded with the default
    ``str.encode()`` codec.  Parent IDs are sorted first, so the order in
    which they are supplied does not affect the result.

    Args:
        parent_ids: Parent commit IDs; may be empty.
        snapshot_id: ID of the snapshot the commit points to.
        message: The commit message.
        committed_at_iso: Commit timestamp as a string.

    Returns:
        The sha256 hex digest of the canonical string.
    """
    parents_field = "|".join(sorted(parent_ids))
    canonical = "|".join((parents_field, snapshot_id, message, committed_at_iso))
    hasher = hashlib.sha256()
    hasher.update(canonical.encode())
    return hasher.hexdigest()
| 128 | |
| 129 | |
def compute_commit_tree_id(
    parent_ids: list[str],
    snapshot_id: str,
    message: str,
    author: str,
) -> str:
    """Derive a timestamp-free commit ID for a raw plumbing commit.

    No ``committed_at`` value enters the hash, so an identical
    (parent_ids, snapshot_id, message, author) tuple always yields the same
    ID.  The hashed string is ``<sorted parents joined by |>|<snapshot_id>|
    <message>|<author>`` (no whitespace).

    Args:
        parent_ids: Zero or more parent commit IDs.  Sorted before hashing.
        snapshot_id: The ID of the snapshot this commit points to.
        message: The commit message.
        author: The author name string.

    Returns:
        A 64-character lowercase hex SHA-256 digest.
    """
    fields = ["|".join(sorted(parent_ids))]
    fields.extend((snapshot_id, message, author))
    canonical = "|".join(fields)
    return hashlib.sha256(canonical.encode()).hexdigest()