test_stress_object_store.py
python
| 1 | """Stress tests for the content-addressed object store. |
| 2 | |
| 3 | Exercises: |
- Write-then-read round-trip for varied payload sizes (1 byte … 2 MB).
| 5 | - Idempotency: writing the same object ID twice is a no-op. |
| 6 | - has_object before and after writes. |
| 7 | - object_path sharding: first two hex chars as directory. |
| 8 | - read_object returns None for absent objects. |
| 9 | - restore_object copies bytes faithfully. |
| 10 | - write_object_from_path uses copy semantics, not load. |
| 11 | - Content integrity: read(write(content)) == content. |
| 12 | - Multiple distinct objects coexist without collision. |
| 13 | """ |
| 14 | from __future__ import annotations |
| 15 | |
| 16 | import hashlib |
| 17 | import os |
| 18 | import pathlib |
| 19 | import secrets |
| 20 | |
| 21 | import pytest |
| 22 | |
| 23 | from muse.core.object_store import ( |
| 24 | has_object, |
| 25 | object_path, |
| 26 | objects_dir, |
| 27 | read_object, |
| 28 | restore_object, |
| 29 | write_object, |
| 30 | write_object_from_path, |
| 31 | ) |
| 32 | |
| 33 | |
| 34 | # --------------------------------------------------------------------------- |
| 35 | # Helpers |
| 36 | # --------------------------------------------------------------------------- |
| 37 | |
| 38 | |
| 39 | def _sha256(data: bytes) -> str: |
| 40 | return hashlib.sha256(data).hexdigest() |
| 41 | |
| 42 | |
@pytest.fixture
def repo(tmp_path: pathlib.Path) -> pathlib.Path:
    """Provide a repository root containing a fresh, empty ``.muse`` directory."""
    muse_dir = tmp_path / ".muse"
    muse_dir.mkdir()
    return tmp_path
| 47 | |
| 48 | |
| 49 | # --------------------------------------------------------------------------- |
| 50 | # Basic round-trip |
| 51 | # --------------------------------------------------------------------------- |
| 52 | |
| 53 | |
class TestRoundTrip:
    """Write-then-read round trips for payloads of assorted shapes and sizes."""

    def _assert_roundtrip(self, repo: pathlib.Path, payload: bytes) -> None:
        # Shared check: store the payload under its digest, then read it back.
        oid = _sha256(payload)
        write_object(repo, oid, payload)
        assert read_object(repo, oid) == payload

    def test_write_then_read_small(self, repo: pathlib.Path) -> None:
        self._assert_roundtrip(repo, b"hello muse")

    def test_write_then_read_empty(self, repo: pathlib.Path) -> None:
        self._assert_roundtrip(repo, b"")

    def test_write_then_read_single_byte(self, repo: pathlib.Path) -> None:
        self._assert_roundtrip(repo, b"\x00")

    def test_write_then_read_binary(self, repo: pathlib.Path) -> None:
        # All 256 byte values, repeated, to catch any text-mode mangling.
        self._assert_roundtrip(repo, bytes(range(256)) * 100)

    @pytest.mark.parametrize("size", [1, 100, 4096, 65536, 1_000_000])
    def test_write_then_read_various_sizes(self, repo: pathlib.Path, size: int) -> None:
        payload = secrets.token_bytes(size)
        oid = _sha256(payload)
        assert write_object(repo, oid, payload) is True
        assert read_object(repo, oid) == payload

    def test_content_integrity(self, repo: pathlib.Path) -> None:
        """Read back exactly what was written — not a truncated or padded version."""
        for i in range(20):
            payload = f"object-content-{i}-{'x' * i}".encode()
            oid = _sha256(payload)
            write_object(repo, oid, payload)
            recovered = read_object(repo, oid)
            assert recovered == payload
            assert len(recovered) == len(payload)
| 95 | |
| 96 | |
| 97 | # --------------------------------------------------------------------------- |
| 98 | # Idempotency |
| 99 | # --------------------------------------------------------------------------- |
| 100 | |
| 101 | |
class TestIdempotency:
    """Writing an object ID that already exists must be a silent no-op."""

    def test_double_write_returns_false_second_time(self, repo: pathlib.Path) -> None:
        payload = b"idempotent"
        oid = _sha256(payload)
        assert write_object(repo, oid, payload) is True
        # The store already holds this ID, so the second call reports a skip.
        assert write_object(repo, oid, payload) is False

    def test_double_write_does_not_corrupt(self, repo: pathlib.Path) -> None:
        original = b"original content"
        oid = _sha256(original)
        write_object(repo, oid, original)
        # Attempt to overwrite with different content using the same ID — should be silently skipped.
        write_object(repo, oid, b"different content")
        assert read_object(repo, oid) == original

    def test_triple_write_stays_stable(self, repo: pathlib.Path) -> None:
        payload = b"triple-write"
        oid = _sha256(payload)
        for _attempt in range(3):
            write_object(repo, oid, payload)
        assert read_object(repo, oid) == payload
| 123 | |
| 124 | |
| 125 | # --------------------------------------------------------------------------- |
| 126 | # has_object |
| 127 | # --------------------------------------------------------------------------- |
| 128 | |
| 129 | |
class TestHasObject:
    """has_object must reflect exactly which IDs have been written."""

    def test_absent_before_write(self, repo: pathlib.Path) -> None:
        unwritten = _sha256(b"not yet written")
        assert not has_object(repo, unwritten)

    def test_present_after_write(self, repo: pathlib.Path) -> None:
        payload = b"present"
        oid = _sha256(payload)
        write_object(repo, oid, payload)
        assert has_object(repo, oid)

    def test_other_objects_dont_shadow(self, repo: pathlib.Path) -> None:
        first, second = b"object-a", b"object-b"
        oid_first, oid_second = _sha256(first), _sha256(second)
        write_object(repo, oid_first, first)
        assert has_object(repo, oid_first)
        # Writing one object must not make an unrelated ID appear present.
        assert not has_object(repo, oid_second)
        write_object(repo, oid_second, second)
        assert has_object(repo, oid_second)
| 151 | |
| 152 | |
| 153 | # --------------------------------------------------------------------------- |
| 154 | # Absent objects |
| 155 | # --------------------------------------------------------------------------- |
| 156 | |
| 157 | |
class TestAbsentObjects:
    """Lookups and restores for IDs that were never written behave gracefully."""

    def test_read_absent_returns_none(self, repo: pathlib.Path) -> None:
        missing = "a" * 64
        assert read_object(repo, missing) is None

    def test_restore_absent_returns_false(self, repo: pathlib.Path, tmp_path: pathlib.Path) -> None:
        missing = "b" * 64
        target = tmp_path / "restored.bin"
        assert restore_object(repo, missing, target) is False
        # A failed restore must not leave a partial file behind.
        assert not target.exists()

    def test_has_object_false_for_random_id(self, repo: pathlib.Path) -> None:
        for _ in range(10):
            random_oid = secrets.token_hex(32)
            assert not has_object(repo, random_oid)
| 173 | |
| 174 | |
| 175 | # --------------------------------------------------------------------------- |
| 176 | # Sharding layout |
| 177 | # --------------------------------------------------------------------------- |
| 178 | |
| 179 | |
class TestSharding:
    """Objects are sharded into directories named by the ID's first two hex chars."""

    def test_object_path_uses_first_two_chars_as_dir(self, repo: pathlib.Path) -> None:
        oid = "ab" + "c" * 62
        stored_at = object_path(repo, oid)
        # Directory is the two-char prefix; filename is the remaining 62 chars.
        assert stored_at.parent.name == "ab"
        assert stored_at.name == "c" * 62

    def test_objects_with_same_prefix_go_to_same_shard(self, repo: pathlib.Path) -> None:
        shard_one = object_path(repo, "ff" + "0" * 62).parent
        shard_two = object_path(repo, "ff" + "1" * 62).parent
        assert shard_one == shard_two

    def test_objects_with_different_prefix_go_to_different_shards(self, repo: pathlib.Path) -> None:
        shard_one = object_path(repo, "aa" + "x" * 62).parent
        shard_two = object_path(repo, "bb" + "x" * 62).parent
        assert shard_one != shard_two

    def test_256_shards_can_all_be_created(self, repo: pathlib.Path) -> None:
        """Write one object per possible shard prefix (00-ff)."""
        for shard in range(256):
            prefix = f"{shard:02x}"
            payload = f"shard-{prefix}".encode()
            # Force the object into a specific shard by overriding the prefix.
            write_object(repo, prefix + _sha256(payload)[2:], payload)
        # Verify all 256 shard dirs exist.
        shard_dirs = [entry.name for entry in objects_dir(repo).iterdir() if entry.is_dir()]
        assert len(shard_dirs) == 256
| 206 | |
| 207 | |
| 208 | # --------------------------------------------------------------------------- |
| 209 | # write_object_from_path |
| 210 | # --------------------------------------------------------------------------- |
| 211 | |
| 212 | |
class TestWriteObjectFromPath:
    """write_object_from_path ingests a file from disk into the store.

    The module docstring promises copy semantics (the source file is copied,
    not loaded-and-moved), so this class verifies the source survives intact.
    """

    def test_from_path_round_trip(self, repo: pathlib.Path, tmp_path: pathlib.Path) -> None:
        src = tmp_path / "source.bin"
        data = b"from-path-content"
        src.write_bytes(data)
        oid = _sha256(data)
        assert write_object_from_path(repo, oid, src) is True
        assert read_object(repo, oid) == data

    def test_from_path_uses_copy_semantics(self, repo: pathlib.Path, tmp_path: pathlib.Path) -> None:
        """The source file must still exist, unchanged, after ingestion."""
        src = tmp_path / "keep-me.bin"
        data = b"copy-not-move"
        src.write_bytes(data)
        oid = _sha256(data)
        write_object_from_path(repo, oid, src)
        # Copy, not move: a move-based implementation would delete src.
        assert src.exists()
        assert src.read_bytes() == data

    def test_from_path_idempotent(self, repo: pathlib.Path, tmp_path: pathlib.Path) -> None:
        src = tmp_path / "idem.bin"
        data = b"idempotent-from-path"
        src.write_bytes(data)
        oid = _sha256(data)
        write_object_from_path(repo, oid, src)
        # Second ingestion of the same ID is reported as a skip.
        assert write_object_from_path(repo, oid, src) is False
| 229 | |
| 230 | |
| 231 | # --------------------------------------------------------------------------- |
| 232 | # restore_object |
| 233 | # --------------------------------------------------------------------------- |
| 234 | |
| 235 | |
class TestRestoreObject:
    """restore_object copies stored bytes out to an arbitrary destination path."""

    def test_restore_round_trip(self, repo: pathlib.Path, tmp_path: pathlib.Path) -> None:
        data = b"restore-me"
        oid = _sha256(data)
        write_object(repo, oid, data)
        dest = tmp_path / "sub" / "restored.bin"
        assert restore_object(repo, oid, dest) is True
        assert dest.read_bytes() == data

    def test_restore_creates_parent_dirs(self, repo: pathlib.Path, tmp_path: pathlib.Path) -> None:
        data = b"deep-restore"
        oid = _sha256(data)
        write_object(repo, oid, data)
        dest = tmp_path / "a" / "b" / "c" / "file.bin"
        # Pin success AND content — merely checking dest.exists() would let an
        # implementation that creates an empty file pass.
        assert restore_object(repo, oid, dest) is True
        assert dest.read_bytes() == data

    def test_restore_large_object_intact(self, repo: pathlib.Path, tmp_path: pathlib.Path) -> None:
        data = secrets.token_bytes(2_000_000)
        oid = _sha256(data)
        write_object(repo, oid, data)
        dest = tmp_path / "large.bin"
        assert restore_object(repo, oid, dest) is True
        assert dest.read_bytes() == data
| 260 | |
| 261 | |
| 262 | # --------------------------------------------------------------------------- |
| 263 | # Multiple distinct objects |
| 264 | # --------------------------------------------------------------------------- |
| 265 | |
| 266 | |
class TestMultipleObjects:
    """Many distinct objects can live in the store simultaneously."""

    def test_100_distinct_objects_coexist(self, repo: pathlib.Path) -> None:
        written: dict[str, bytes] = {}
        for i in range(100):
            data = f"payload-{i:03d}-{'z' * i}".encode()
            oid = _sha256(data)
            write_object(repo, oid, data)
            written[oid] = data

        for oid, data in written.items():
            assert read_object(repo, oid) == data

    def test_all_objects_independently_addressable(self, repo: pathlib.Path) -> None:
        """Verify no two distinct objects collide in the store."""
        expected: dict[str, bytes] = {}
        for _ in range(50):
            data = secrets.token_bytes(64)
            oid = _sha256(data)
            write_object(repo, oid, data)
            expected[oid] = data
        # All OIDs should be unique (probabilistic but essentially certain).
        assert len(expected) == 50
        # The real collision check: every object must read back as its own
        # bytes — OID uniqueness alone would not detect one write clobbering
        # another object's content in the store.
        for oid, data in expected.items():
            assert read_object(repo, oid) == data