"""Pure filesystem snapshot logic for ``muse commit``.

All functions here are side-effect-free (no DB, no I/O besides reading
files under ``workdir``). They are kept separate so they can be
unit-tested without a database.

ID derivation contract (deterministic, no random/UUID components):

    object_id = sha256(file_bytes).hexdigest()
    snapshot_id = sha256("|".join(sorted(f"{path}:{oid}" for path, oid in manifest.items()))).hexdigest()
    commit_id = sha256(
                    "|".join(sorted(parent_ids))
                    + "|" + snapshot_id
                    + "|" + message
                    + "|" + committed_at_iso
                  ).hexdigest()
"""
from __future__ import annotations

import hashlib
import pathlib


def hash_file(path: pathlib.Path) -> str:
    """Compute the ``object_id`` of *path*: the sha256 hex digest of its bytes.

    The file is opened in binary mode and consumed in 64 KiB pieces, so
    memory use does not grow with the size of the file.
    """
    digest = hashlib.sha256()
    with path.open("rb") as stream:
        while True:
            block = stream.read(65536)
            if not block:
                # An empty read signals end-of-file.
                break
            digest.update(block)
    return digest.hexdigest()


def build_snapshot_manifest(workdir: pathlib.Path) -> dict[str, str]:
    """Return the ``{rel_path: object_id}`` manifest for *workdir*.

    Preferred public-API name; the work is delegated unchanged to
    ``walk_workdir``.
    """
    manifest = walk_workdir(workdir)
    return manifest


def walk_workdir(workdir: pathlib.Path) -> dict[str, str]:
    """Walk *workdir* recursively and return ``{rel_path: object_id}``.

    Only regular files are included; directories and symbolic links are
    skipped. Files whose own name starts with ``.`` are excluded (the check
    looks at the final path component only, not at parent directories).

    Args:
        workdir: Root directory to scan.

    Returns:
        A mapping from each included file's path relative to *workdir*
        (POSIX separators, regardless of host OS) to the sha256 hex digest
        of its contents as computed by ``hash_file``.
    """
    manifest: dict[str, str] = {}
    # Sorted traversal keeps hashing order reproducible across filesystems.
    for file_path in sorted(workdir.rglob("*")):
        # ``Path.is_file()`` follows symlinks, so a link that points at a
        # regular file would slip through; reject links explicitly first.
        if file_path.is_symlink() or not file_path.is_file():
            continue
        if file_path.name.startswith("."):
            continue
        rel = file_path.relative_to(workdir).as_posix()
        manifest[rel] = hash_file(file_path)
    return manifest


def compute_snapshot_id(manifest: dict[str, str]) -> str:
    """Derive the snapshot ID from a ``{path: object_id}`` manifest.

    Each entry is rendered as ``path:object_id``; the entries are sorted,
    joined with ``|`` and the resulting text is sha256-hashed. Because of
    the sort, the ID does not depend on the manifest's insertion order.
    """
    entries = [f"{path}:{oid}" for path, oid in manifest.items()]
    entries.sort()
    canonical = "|".join(entries)
    digest = hashlib.sha256(canonical.encode())
    return digest.hexdigest()


def diff_workdir_vs_snapshot(
    workdir: pathlib.Path,
    last_manifest: dict[str, str],
) -> tuple[set[str], set[str], set[str], set[str]]:
    """Classify working-tree changes relative to the previous commit's manifest.

    The result is ``(added, modified, deleted, untracked)``, four disjoint
    sets of POSIX-style relative paths:

    - ``added`` — on disk but not in *last_manifest*.
    - ``modified`` — in both, but the sha256 object IDs differ.
    - ``deleted`` — in *last_manifest* but no longer on disk.
    - ``untracked`` — populated only when *last_manifest* is empty (no
                      prior commit on the branch); in that case every file
                      on disk is reported here instead of as ``added``.

    If *workdir* does not exist, every path in *last_manifest* is reported
    as deleted and the other three sets are empty.
    """
    if not workdir.exists():
        # No directory at all: everything previously committed is gone.
        return set(), set(), set(last_manifest.keys()), set()

    on_disk = walk_workdir(workdir)
    present = set(on_disk.keys())
    previous = set(last_manifest.keys())

    if not previous:
        # First commit on the branch: nothing is tracked yet.
        return set(), set(), set(), present

    added = present.difference(previous)
    deleted = previous.difference(present)
    modified: set[str] = set()
    for path in present.intersection(previous):
        if on_disk[path] != last_manifest[path]:
            modified.add(path)
    return added, modified, deleted, set()


def compute_commit_id(
    parent_ids: list[str],
    snapshot_id: str,
    message: str,
    committed_at_iso: str,
) -> str:
    """Derive the commit ID by sha256-hashing the commit's canonical inputs.

    The hashed text is the sorted parent IDs joined with ``|``, followed by
    ``snapshot_id``, ``message`` and ``committed_at_iso``, each separated by
    ``|``. Sorting the parents means their order in *parent_ids* has no
    effect on the result, so identical arguments yield the same ID anywhere.
    """
    joined_parents = "|".join(sorted(parent_ids))
    canonical = (
        joined_parents
        + "|" + snapshot_id
        + "|" + message
        + "|" + committed_at_iso
    )
    return hashlib.sha256(canonical.encode()).hexdigest()


def compute_commit_tree_id(
    parent_ids: list[str],
    snapshot_id: str,
    message: str,
    author: str,
) -> str:
    """Derive a timestamp-free commit ID for a raw plumbing commit.

    In contrast to ``compute_commit_id``, no ``committed_at`` value takes
    part in the hash, so an identical (parent_ids, snapshot_id, message,
    author) tuple always maps to the same ID. This is intended to make
    ``muse commit-tree`` idempotent: repeating the command with the same
    inputs yields the same ID rather than a new one.

    Args:
        parent_ids: Zero or more parent commit IDs; sorted before hashing.
        snapshot_id: The sha256 ID of the snapshot this commit points to.
        message: The commit message.
        author: The author name string.

    Returns:
        The sha256 hex digest of the ``|``-joined canonical inputs.
    """
    joined_parents = "|".join(sorted(parent_ids))
    canonical = "|".join((joined_parents, snapshot_id, message, author))
    return hashlib.sha256(canonical.encode()).hexdigest()