cgcardona / muse public
git2muse.py python
582 lines 18.5 KB
de2a25b0 fix: two-bug root-cause for ghost files in muse status 5h ago
1 """git2muse — Replay a Git commit graph into a Muse repository.
2
3 Usage
4 -----
5 ::
6
7 python tools/git2muse.py [--repo-root PATH] [--dry-run] [--verbose]
8
9 Strategy
10 --------
11 1. Walk ``main`` branch commits oldest-first and create Muse commits on the
12 Muse ``main`` branch preserving the original author, timestamp, and message.
13 2. Walk ``dev`` branch commits oldest-first that are not already on ``main``
14 and replay them onto a Muse ``dev`` branch, branching from the correct
15 ancestor.
16 3. Skip merge commits (commits with more than one parent) — they carry no
17 unique file-state delta; the Muse DAG is reconstructed faithfully through
18 the parent chain on each branch.
19
20 For each Git commit the tool:
21 - Extracts the commit's file tree into a temporary working directory using ``git archive``.
22 - Removes files that Muse should not snapshot (build artefacts, caches, IDE
23 files, etc.) according to a hard-coded exclusion list that mirrors
24 ``.museignore``.
25 - Calls the Muse Python API directly (bypassing the CLI) so the original
26 Git author name, e-mail, and author timestamp (``%at``) are preserved
27 verbatim in the Muse ``CommitRecord``.
28 - Updates the Muse branch HEAD ref so the Muse repo tracks the same history.
29
30 After a successful run the Muse repo under ``.muse/`` contains a full code-
31 domain representation of the project history and is ready to push to MuseHub.
32 """
33
34 from __future__ import annotations
35
36 import argparse
37 import datetime
38 import hashlib
39 import logging
40 import pathlib
41 import shutil
42 import subprocess
43 import sys
44 import tarfile
45 import tempfile
46
47 # ---------------------------------------------------------------------------
48 # Bootstrap: make sure the project root is on sys.path so we can import muse
49 # even when running from the tools/ directory.
50 # ---------------------------------------------------------------------------
51 _REPO_ROOT = pathlib.Path(__file__).parent.parent
52 if str(_REPO_ROOT) not in sys.path:
53 sys.path.insert(0, str(_REPO_ROOT))
54
55 from muse.core.object_store import write_object
56 from muse.core.store import (
57 CommitRecord,
58 SnapshotRecord,
59 get_head_commit_id,
60 write_commit,
61 write_snapshot,
62 )
63 from muse.core.snapshot import compute_commit_id, compute_snapshot_id
64
# Logger shared by the helpers and the CLI entry point in this module.
logger = logging.getLogger("git2muse")
66
67 # ---------------------------------------------------------------------------
68 # Files / dirs that should never end up in a Muse snapshot.
69 # These mirror .museignore + the hidden-directory exclusion in walk_workdir.
70 # ---------------------------------------------------------------------------
71
72 _EXCLUDE_PREFIXES: tuple[str, ...] = (
73 ".git/",
74 ".muse/",
75 ".muse",
76 ".venv/",
77 ".tox/",
78 ".mypy_cache/",
79 ".pytest_cache/",
80 ".hypothesis/",
81 ".github/",
82 ".DS_Store",
83 "artifacts/",
84 "__pycache__/",
85 )
86
87 _EXCLUDE_SUFFIXES: tuple[str, ...] = (
88 ".pyc",
89 ".pyo",
90 ".egg-info",
91 ".swp",
92 ".swo",
93 ".tmp",
94 "Thumbs.db",
95 ".DS_Store",
96 )
97
98
def _should_exclude(rel_path: str) -> bool:
    """Decide whether *rel_path* must be kept out of the Muse snapshot.

    *rel_path* is a "/"-separated path relative to the tree root.  It is
    excluded when any of these holds:

    - it starts with an entry of ``_EXCLUDE_PREFIXES``, or equals such an
      entry with its trailing "/" removed;
    - it ends with an entry of ``_EXCLUDE_SUFFIXES``;
    - its first path component starts with "." (hidden top-level file/dir).
    """
    # str.startswith / str.endswith accept a tuple of candidates.
    if rel_path.startswith(_EXCLUDE_PREFIXES):
        return True
    if any(rel_path == prefix.rstrip("/") for prefix in _EXCLUDE_PREFIXES):
        return True
    if rel_path.endswith(_EXCLUDE_SUFFIXES):
        return True

    # Only the first component is inspected; hidden entries deeper in the
    # tree are not excluded by this rule.
    top_level, _, _ = rel_path.partition("/")
    return top_level.startswith(".")
112
113
114 # ---------------------------------------------------------------------------
115 # Git helpers
116 # ---------------------------------------------------------------------------
117
118
def _git(repo_root: pathlib.Path, *args: str) -> str:
    """Run ``git <args>`` inside *repo_root* and return its stripped stdout.

    Output is captured and decoded as text.  A non-zero exit status raises
    ``subprocess.CalledProcessError`` (``check=True``).
    """
    command = ["git"] + list(args)
    completed = subprocess.run(
        command,
        cwd=repo_root,
        capture_output=True,
        text=True,
        check=True,
    )
    return completed.stdout.strip()
129
130
def _git_commits_oldest_first(
    repo_root: pathlib.Path,
    branch: str,
    exclude_branches: list[str] | None = None,
) -> list[str]:
    """List the commit hashes of *branch*, oldest first.

    The hashes come from ``git log --topo-order --reverse``.  Each name in
    *exclude_branches* is passed to git as ``^name``, which drops the commits
    reachable from that branch (used to obtain the dev-only commits).
    """
    revisions = [branch]
    revisions.extend(f"^{excluded}" for excluded in exclude_branches or ())
    output = _git(
        repo_root, "log", "--topo-order", "--reverse", "--format=%H", *revisions
    )
    # Drop blank lines so an empty history yields an empty list.
    return [sha for sha in output.splitlines() if sha.strip()]
150
151
152 _META_SEP = "|||GIT2MUSE|||"
153
154
def _git_commit_meta(repo_root: pathlib.Path, sha: str) -> dict[str, str]:
    """Fetch author name, e-mail, timestamp and message for commit *sha*.

    Returns a dict with the keys ``name``, ``email``, ``ts`` (the ``%at``
    value as a string) and ``message`` (the ``%B`` body), each stripped of
    surrounding whitespace.  If the output cannot be split into four fields
    a placeholder record is returned whose message is the abbreviated hash.
    """
    pretty = _META_SEP.join(("%an", "%ae", "%at", "%B"))
    output = _git(repo_root, "show", "-s", f"--format={pretty}", sha)

    # maxsplit=3 keeps any separator-looking text inside the message intact.
    fields = output.split(_META_SEP, 3)
    if len(fields) < 4:
        return {"name": "unknown", "email": "", "ts": "0", "message": sha[:12]}

    keys = ("name", "email", "ts", "message")
    return {key: value.strip() for key, value in zip(keys, fields)}
169
170
def _git_parent_shas(repo_root: pathlib.Path, sha: str) -> list[str]:
    """List the parent hashes of *sha*; a root commit yields an empty list."""
    # %P prints the parents separated by spaces; str.split() with no argument
    # never produces empty strings, so no further filtering is needed.
    return _git(repo_root, "log", "-1", "--format=%P", sha).split()
175
176
def _is_merge_commit(repo_root: pathlib.Path, sha: str) -> bool:
    """Return True when commit *sha* has more than one parent."""
    parents = _git_parent_shas(repo_root, sha)
    return len(parents) >= 2
179
180
def _extract_tree_to(
    repo_root: pathlib.Path,
    sha: str,
    dest: pathlib.Path,
) -> None:
    """Extract the git tree for *sha* into *dest*, applying exclusions.

    *dest* is deleted (if present) and recreated, so it only ever contains
    the files of *sha*.  Only regular-file archive members are written;
    paths rejected by ``_should_exclude`` are skipped.

    Raises ``subprocess.CalledProcessError`` when ``git archive`` fails.
    """
    import io  # local import: only this helper needs it

    # Wipe and recreate dest for a clean slate.
    if dest.exists():
        shutil.rmtree(dest)
    dest.mkdir(parents=True)

    # git archive produces a tar stream of the commit tree.
    archive = subprocess.run(
        ["git", "archive", "--format=tar", sha],
        cwd=repo_root,
        capture_output=True,
        check=True,
    )

    # The archive is already fully buffered in memory, so read it from there
    # instead of round-tripping through a named temporary file (which also
    # had to be cleaned up by hand).
    with tarfile.open(fileobj=io.BytesIO(archive.stdout), mode="r") as tf:
        for member in tf.getmembers():
            if not member.isfile():
                continue
            # removeprefix strips only the literal "./" tar prefix, not
            # individual characters — lstrip("./") would turn
            # ".cursorignore" into "cursorignore" and ".github/" into
            # "github/", defeating the hidden-file exclusion.
            rel = member.name.removeprefix("./")
            if _should_exclude(rel):
                continue
            target = dest / rel
            target.parent.mkdir(parents=True, exist_ok=True)
            source = tf.extractfile(member)
            if source is not None:
                with source:
                    target.write_bytes(source.read())
221
222
223 # ---------------------------------------------------------------------------
224 # Muse snapshot helpers (bypass CLI to preserve git metadata)
225 # ---------------------------------------------------------------------------
226
227
228 def _sha256_file(path: pathlib.Path) -> str:
229 h = hashlib.sha256()
230 h.update(path.read_bytes())
231 return h.hexdigest()
232
233
def _build_manifest(workdir: pathlib.Path) -> dict[str, str]:
    """Walk *workdir* and return a ``{rel_path: sha256}`` manifest.

    Keys are paths relative to *workdir*, always "/"-separated so the
    manifest is the same on every platform.  Symlinks, non-regular files and
    anything whose first path component starts with "." are left out.
    Entries are inserted in sorted path order.
    """
    manifest: dict[str, str] = {}
    for fpath in sorted(workdir.rglob("*")):
        if fpath.is_symlink() or not fpath.is_file():
            continue
        rel = fpath.relative_to(workdir)
        # Inspect the first component via ``parts`` rather than splitting the
        # string on "/", which does not work where the OS separator is "\\".
        if rel.parts[0].startswith("."):
            continue
        manifest[rel.as_posix()] = _sha256_file(fpath)
    return manifest
248
249
def _store_objects(
    repo_root: pathlib.Path,
    workdir: pathlib.Path,
    manifest: dict[str, str],
) -> None:
    """Write every file listed in *manifest* to the object store.

    For each ``rel_path -> object id`` entry the file ``workdir / rel_path``
    is read and handed to ``write_object``.  Entries whose file does not
    exist are logged as a warning and skipped.
    """
    for rel, oid in manifest.items():
        source = workdir / rel
        if source.exists():
            write_object(repo_root, oid, source.read_bytes())
        else:
            logger.warning("⚠️ Missing file in workdir: %s", rel)
263
264
265 # ---------------------------------------------------------------------------
266 # Branch ref helpers (direct file I/O — mirrors store.py internal logic)
267 # ---------------------------------------------------------------------------
268
269
270 def _refs_dir(repo_root: pathlib.Path) -> pathlib.Path:
271 return repo_root / ".muse" / "refs" / "heads"
272
273
def _set_branch_head(
    repo_root: pathlib.Path, branch: str, commit_id: str
) -> None:
    """Point *branch* at *commit_id* by rewriting its ref file.

    The ref file's parent directory is created when missing; the file holds
    the commit ID followed by a newline.
    """
    ref_file = _refs_dir(repo_root) / branch
    ref_file.parent.mkdir(parents=True, exist_ok=True)
    ref_file.write_text(f"{commit_id}\n")
280
281
def _get_branch_head(repo_root: pathlib.Path, branch: str) -> str | None:
    """Return the commit ID stored in *branch*'s ref file.

    Returns ``None`` when the ref file is missing or contains only
    whitespace.
    """
    ref_file = _refs_dir(repo_root) / branch
    if ref_file.exists():
        # An empty ref file is treated like "no head yet".
        return ref_file.read_text().strip() or None
    return None
287
288
289 def _set_head_ref(repo_root: pathlib.Path, branch: str) -> None:
290 head_path = repo_root / ".muse" / "HEAD"
291 head_path.write_text(f"refs/heads/{branch}\n")
292
293
def _ensure_branch_exists(repo_root: pathlib.Path, branch: str) -> None:
    """Create an empty ref file for *branch* unless one already exists.

    The refs directory is created first when it is missing.  An existing
    ref file is left untouched.
    """
    heads_dir = _refs_dir(repo_root)
    heads_dir.mkdir(parents=True, exist_ok=True)
    ref_file = heads_dir / branch
    if ref_file.exists():
        return
    ref_file.write_text("")
299
300
301 # ---------------------------------------------------------------------------
302 # Core replay logic
303 # ---------------------------------------------------------------------------
304
305
def _replay_commit(
    repo_root: pathlib.Path,
    workdir: pathlib.Path,
    git_sha: str,
    muse_branch: str,
    parent_muse_id: str | None,
    meta: dict[str, str],
    repo_id: str,
    dry_run: bool,
) -> str:
    """Turn the tree currently in *workdir* into one Muse commit.

    *meta* supplies the Git metadata (keys ``name``, ``email``, ``ts``,
    ``message``).  The caller is expected to have populated *workdir*
    already.  With *dry_run* set, the commit ID is computed and logged but
    nothing is written.

    Returns the Muse commit ID.
    """
    # Snapshot identity is derived from the file manifest alone.
    manifest = _build_manifest(workdir)
    snapshot_id = compute_snapshot_id(manifest)

    # Carry the original Git metadata over; ``ts`` is a Unix timestamp that
    # is interpreted as UTC.
    committed_at = datetime.datetime.fromtimestamp(
        int(meta["ts"]), tz=datetime.timezone.utc
    )
    author = f"{meta['name']} <{meta['email']}>"
    # Fall back to the abbreviated Git hash when the message is empty.
    message = meta["message"] or git_sha[:12]

    commit_id = compute_commit_id(
        parent_ids=[parent_muse_id] if parent_muse_id else [],
        snapshot_id=snapshot_id,
        message=message,
        committed_at_iso=committed_at.isoformat(),
    )

    if dry_run:
        logger.info(
            "[dry-run] Would create commit %s (git: %s) on %s | %s",
            commit_id[:12],
            git_sha[:12],
            muse_branch,
            message[:60],
        )
        return commit_id

    # Persist in dependency order: objects, then the snapshot that refers to
    # them, then the commit that refers to the snapshot, then the branch ref.
    _store_objects(repo_root, workdir, manifest)
    write_snapshot(
        repo_root,
        SnapshotRecord(snapshot_id=snapshot_id, manifest=manifest),
    )
    write_commit(
        repo_root,
        CommitRecord(
            commit_id=commit_id,
            repo_id=repo_id,
            branch=muse_branch,
            snapshot_id=snapshot_id,
            message=message,
            committed_at=committed_at,
            parent_commit_id=parent_muse_id,
            author=author,
        ),
    )
    _set_branch_head(repo_root, muse_branch, commit_id)

    return commit_id
376
377
def _replay_branch(
    repo_root: pathlib.Path,
    workdir: pathlib.Path,
    git_shas: list[str],
    muse_branch: str,
    start_parent_muse_id: str | None,
    repo_id: str,
    dry_run: bool,
    verbose: bool,
) -> dict[str, str]:
    """Replay a list of git SHAs (oldest first) onto *muse_branch*.

    Each commit becomes the parent of the next one, starting from
    *start_parent_muse_id* (``None`` for a root commit).  Progress is logged
    for every commit when *verbose* is set, otherwise for the first, the
    last and every tenth commit.

    With *dry_run* set nothing is written: the branch ref is not created and
    trees are not extracted, so the manifests (and therefore the logged
    commit IDs) are computed from whatever *workdir* currently contains.

    Returns a mapping of git_sha → muse_commit_id for every replayed commit.
    """
    # A dry run must not touch the repository, so only create the (empty)
    # branch ref when changes are really going to be written.
    if not dry_run:
        _ensure_branch_exists(repo_root, muse_branch)

    git_to_muse: dict[str, str] = {}
    parent_muse_id = start_parent_muse_id
    total = len(git_shas)

    for i, git_sha in enumerate(git_shas, 1):
        meta = _git_commit_meta(repo_root, git_sha)

        if verbose or i % 10 == 0 or i == 1 or i == total:
            logger.info(
                "[%s] %d/%d git:%s '%s'",
                muse_branch,
                i,
                total,
                git_sha[:12],
                meta["message"][:60],
            )

        # Populate workdir with this commit's tree.
        if not dry_run:
            _extract_tree_to(repo_root, git_sha, workdir)

        muse_id = _replay_commit(
            repo_root=repo_root,
            workdir=workdir,
            git_sha=git_sha,
            muse_branch=muse_branch,
            parent_muse_id=parent_muse_id,
            meta=meta,
            repo_id=repo_id,
            dry_run=dry_run,
        )

        git_to_muse[git_sha] = muse_id
        # Chain: the commit just written is the parent of the next one.
        parent_muse_id = muse_id

    return git_to_muse
430
431
432 # ---------------------------------------------------------------------------
433 # Entry point
434 # ---------------------------------------------------------------------------
435
436
437 def _load_repo_id(repo_root: pathlib.Path) -> str:
438 import json
439 repo_json = repo_root / ".muse" / "repo.json"
440 data: dict[str, str] = json.loads(repo_json.read_text())
441 return data["repo_id"]
442
443
def main(argv: list[str] | None = None) -> int:
    """Command-line entry point: replay Git history into the Muse repo.

    *argv* is the argument list to parse (``None`` lets argparse read
    ``sys.argv``).  Phase 1 replays the non-merge commits of ``main``;
    phase 2 replays the non-merge commits of ``dev`` that are not reachable
    from ``main``.  ``--branch`` restricts the run to one phase.

    With ``--dry-run`` nothing in the repository is modified, including
    ``.muse/HEAD``.

    Returns the process exit status: 0 on success, 1 when
    ``.muse/repo.json`` does not exist under the repository root.
    """
    parser = argparse.ArgumentParser(
        description="Replay a Git commit graph into a Muse repository."
    )
    parser.add_argument(
        "--repo-root",
        type=pathlib.Path,
        default=_REPO_ROOT,
        help=(
            "Path to the repository root "
            "(default: parent of this script's directory)."
        ),
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Log what would happen without writing anything.",
    )
    parser.add_argument(
        "--verbose",
        "-v",
        action="store_true",
        help="Log every commit (default: log every 10 + first/last).",
    )
    parser.add_argument(
        "--branch",
        default="all",
        # Reject typos up front; an unknown value used to match neither phase
        # and the run silently did nothing.
        choices=("main", "dev", "all"),
        help="Which git branch(es) to replay: 'main', 'dev', or 'all' (default).",
    )
    args = parser.parse_args(argv)

    logging.basicConfig(
        level=logging.INFO,
        format="%(levelname)s %(message)s",
    )

    repo_root: pathlib.Path = args.repo_root.resolve()
    dry_run: bool = args.dry_run
    verbose: bool = args.verbose
    branch_arg: str = args.branch

    # Verify .muse/ exists.
    if not (repo_root / ".muse" / "repo.json").exists():
        logger.error(
            "❌ No .muse/repo.json found in %s — run 'muse init' first.", repo_root
        )
        return 1

    repo_id = _load_repo_id(repo_root)
    logger.info("✅ Muse repo ID: %s", repo_id)

    all_git_to_muse: dict[str, str] = {}

    # Use a temp directory for git archive extraction — the repo root IS the
    # working tree and must never be wiped between replays.
    with tempfile.TemporaryDirectory(prefix="git2muse-") as _tmpdir:
        workdir = pathlib.Path(_tmpdir)

        # -------------------------------------------------------------------
        # Phase 1: main branch
        # -------------------------------------------------------------------
        if branch_arg in ("main", "all"):
            logger.info("━━━ Phase 1: replaying main branch ━━━")
            main_shas = _git_commits_oldest_first(repo_root, "main")
            # Skip merge commits — they add no unique tree delta.
            main_shas = [
                s for s in main_shas
                if not _is_merge_commit(repo_root, s)
            ]
            logger.info(" %d non-merge commits on main", len(main_shas))

            # HEAD is part of the repository state: leave it alone in a
            # dry run.
            if not dry_run:
                _set_head_ref(repo_root, "main")
            mapping = _replay_branch(
                repo_root=repo_root,
                workdir=workdir,
                git_shas=main_shas,
                muse_branch="main",
                start_parent_muse_id=None,
                repo_id=repo_id,
                dry_run=dry_run,
                verbose=verbose,
            )
            all_git_to_muse.update(mapping)
            logger.info("✅ main: %d commits written", len(mapping))

        # -------------------------------------------------------------------
        # Phase 2: dev branch (commits not reachable from main)
        # -------------------------------------------------------------------
        if branch_arg in ("dev", "all"):
            logger.info("━━━ Phase 2: replaying dev branch ━━━")
            dev_only_shas = _git_commits_oldest_first(
                repo_root, "dev", exclude_branches=["main"]
            )
            dev_only_shas = [
                s for s in dev_only_shas
                if not _is_merge_commit(repo_root, s)
            ]
            logger.info(" %d dev-only non-merge commits", len(dev_only_shas))

            if dev_only_shas:
                # Find the git parent of the oldest dev-only commit — it
                # should already be in all_git_to_muse (it's a main commit).
                oldest_dev_sha = dev_only_shas[0]
                git_parents = _git_parent_shas(repo_root, oldest_dev_sha)
                branch_parent_muse_id: str | None = None
                for gp in git_parents:
                    if gp in all_git_to_muse:
                        branch_parent_muse_id = all_git_to_muse[gp]
                        break
                if branch_parent_muse_id is None:
                    # Fall back to current main HEAD (e.g. when only dev is
                    # replayed and the mapping from phase 1 is empty).
                    branch_parent_muse_id = _get_branch_head(repo_root, "main")

                # Same dry-run rule as in phase 1; without the guard a dry
                # run over both branches left HEAD pointing at dev.
                if not dry_run:
                    _set_head_ref(repo_root, "dev")
                mapping = _replay_branch(
                    repo_root=repo_root,
                    workdir=workdir,
                    git_shas=dev_only_shas,
                    muse_branch="dev",
                    start_parent_muse_id=branch_parent_muse_id,
                    repo_id=repo_id,
                    dry_run=dry_run,
                    verbose=verbose,
                )
                all_git_to_muse.update(mapping)
                logger.info("✅ dev: %d commits written", len(mapping))
            else:
                logger.info(" dev has no unique commits beyond main — skipping")

    # Leave HEAD pointing at main.
    if not dry_run:
        _set_head_ref(repo_root, "main")

    # Summary (counts commits across every replayed branch).
    total_commits = len(all_git_to_muse)
    logger.info("━━━ Done ━━━ total Muse commits written: %d", total_commits)

    return 0
579
580
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())