"""Diagnose differences between two directories of source files.

For each file present in both dirs that differs, this script reports:
 - file size in bytes
 - raw MD5 of bytes
 - normalized MD5 (UTF-8 BOM removed; CRLF and lone CR normalized to LF)
 - number of physical lines (counted after newline normalization)
 - whether raw MD5, normalized MD5 and line count match

Usage:
    python tools/diagnose_diff.py <dirA> <dirB> [--ext .py,.txt]

Example:
    python tools/diagnose_diff.py pyucc baseline/target_simulator__20251126T144643_local/files
"""

import sys
from pathlib import Path
import hashlib
import argparse


def md5_bytes(b: bytes) -> str:
    """Return the MD5 digest of ``b`` as a lowercase hexadecimal string."""
    hasher = hashlib.md5()
    hasher.update(b)
    return hasher.hexdigest()


def normalize_bytes(b: bytes) -> bytes:
    """Return ``b`` without a leading UTF-8 BOM and with LF-only line endings.

    Only a BOM at the very start of the data is stripped. CRLF pairs are
    collapsed first so that they become a single LF; any CR that remains
    afterwards (a lone CR) is then converted to LF as well.
    """
    utf8_bom = b"\xef\xbb\xbf"
    payload = b[len(utf8_bom):] if b.startswith(utf8_bom) else b
    # Order matters: handling CRLF before lone CR avoids turning one
    # CRLF line ending into two LFs.
    return payload.replace(b"\r\n", b"\n").replace(b"\r", b"\n")


def phys_lines_count_raw(b: bytes) -> int:
    """Return the number of physical lines in ``b``.

    The data is first passed through ``normalize_bytes`` so every line
    ending is a single LF. Empty content has zero lines; a final line
    that lacks a trailing newline still counts as one line.
    """
    normalized = normalize_bytes(b)
    if not normalized:
        return 0
    newline_total = normalized.count(b"\n")
    if normalized.endswith(b"\n"):
        return newline_total
    # The last line is unterminated, so it is not covered by the LF count.
    return newline_total + 1


def analyze_file(path: Path):
    """Read ``path`` as bytes and return a dict of its comparison metrics.

    The returned dict has the keys ``path`` (the path as a string),
    ``raw_md5`` (MD5 of the bytes as read), ``norm_md5`` (MD5 of the
    output of ``normalize_bytes``), ``phys_lines`` (result of
    ``phys_lines_count_raw``) and ``size`` (byte length of the file).
    """
    data = path.read_bytes()
    return {
        "path": str(path),
        "raw_md5": md5_bytes(data),
        "norm_md5": md5_bytes(normalize_bytes(data)),
        "phys_lines": phys_lines_count_raw(data),
        "size": len(data),
    }


def collect_files(root: Path, exts=None):
    """Map forward-slash relative paths under ``root`` to their ``Path``.

    Args:
        root: Directory to walk recursively.
        exts: Optional collection of lowercase suffixes. When non-empty,
            only files whose lowercased path ends with one of them are
            kept; when empty or None, every regular file is kept.

    Returns:
        A dict whose keys are paths relative to ``root`` (backslashes
        replaced by forward slashes) and whose values are the ``Path``
        objects yielded by the walk.
    """
    wanted_suffixes = tuple(exts) if exts else ()
    found = {}
    for entry in root.rglob("*"):
        if not entry.is_file():
            continue
        # str.endswith accepts a tuple, so a single call tests every suffix.
        if wanted_suffixes and not str(entry).lower().endswith(wanted_suffixes):
            continue
        key = str(entry.relative_to(root)).replace("\\", "/")
        found[key] = entry
    return found


def main(argv=None):
    """Compare two directory trees and print a per-file difference report.

    Args:
        argv: Optional list of command-line arguments, without the program
            name. When None, argparse reads ``sys.argv[1:]``, so calling
            ``main()`` with no arguments behaves as a normal CLI entry point.

    The report lists how many files exist only on one side (with up to 20
    example paths per side) and, for files present on both sides, up to
    200 entries whose raw MD5, normalized MD5 or line count differ.

    Exits with status 2 if either positional argument is not an existing
    directory.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("dirA")
    parser.add_argument("dirB")
    parser.add_argument(
        "--ext", default=None, help="Comma-separated extensions to check, e.g. .py,.txt"
    )
    args = parser.parse_args(argv)

    exts = None
    if args.ext:
        exts = [e.strip().lower() for e in args.ext.split(",") if e.strip()]

    a = Path(args.dirA)
    b = Path(args.dirB)
    # is_dir() rather than exists(): a regular file passed by mistake would
    # otherwise be accepted and silently scanned as an empty tree.
    if not a.is_dir() or not b.is_dir():
        print("One of the provided dirs does not exist:", a, b)
        sys.exit(2)

    files_a = collect_files(a, exts)
    files_b = collect_files(b, exts)

    # Dict key views support set algebra directly.
    common = sorted(files_a.keys() & files_b.keys())
    only_a = sorted(files_a.keys() - files_b.keys())
    only_b = sorted(files_b.keys() - files_a.keys())

    print(
        f"Files only in A: {len(only_a)}; only in B: {len(only_b)}; common: {len(common)}"
    )
    # Only the first 20 one-sided paths are shown to keep the output short.
    if only_a:
        print("\nOnly in A (examples):")
        for p in only_a[:20]:
            print("  ", p)
    if only_b:
        print("\nOnly in B (examples):")
        for p in only_b[:20]:
            print("  ", p)

    diffs = []
    for rel in common:
        ra = analyze_file(files_a[rel])
        rb = analyze_file(files_b[rel])
        raw_same = ra["raw_md5"] == rb["raw_md5"]
        norm_same = ra["norm_md5"] == rb["norm_md5"]
        phys_same = ra["phys_lines"] == rb["phys_lines"]
        if not (raw_same and norm_same and phys_same):
            diffs.append((rel, ra, rb, raw_same, norm_same, phys_same))

    print(f"Found {len(diffs)} differing files (by raw/norm/phys checks)")
    # Detailed output is capped at the first 200 differing files.
    for rel, ra, rb, raw_same, norm_same, phys_same in diffs[:200]:
        print("\n--", rel)
        for label, info in (("A", ra), ("B", rb)):
            # MD5 values are abbreviated to 8 hex characters for readability.
            print(
                f"  {label} size:",
                info["size"],
                "raw_md5:",
                info["raw_md5"][:8],
                "norm_md5:",
                info["norm_md5"][:8],
                "lines:",
                info["phys_lines"],
            )
        print(
            "  same_raw:", raw_same, "same_norm:", norm_same, "same_lines:", phys_same
        )

    if not diffs:
        print("No differences detected (raw/norm/phys matched for all common files).")


# Run the comparison only when executed as a script, not when imported.
if __name__ == "__main__":
    main()