"""Diagnose differences between two directories of source files. For each file present in both dirs this script reports: - raw MD5 of bytes - normalized MD5 (normalize CRLF->LF and remove BOM) - number of physical lines (raw splitlines) - whether raw MD5 matches, normalized MD5 matches Usage: python tools/diagnose_diff.py [--ext .py,.txt] Example: python tools/diagnose_diff.py pyucc baseline/target_simulator__20251126T144643_local/files """ import sys from pathlib import Path import hashlib import argparse def md5_bytes(b: bytes) -> str: return hashlib.md5(b).hexdigest() def normalize_bytes(b: bytes) -> bytes: # Remove UTF-8 BOM if present if b.startswith(b"\xef\xbb\xbf"): b = b[3:] # Normalize CRLF -> LF b = b.replace(b"\r\n", b"\n") # Also normalize lone CR to LF b = b.replace(b"\r", b"\n") return b def phys_lines_count_raw(b: bytes) -> int: # Count physical lines as number of occurrences of '\n' when normalized to LF, # but preserve behavior for files without trailing newline nb = normalize_bytes(b) if len(nb) == 0: return 0 return nb.count(b"\n") + (0 if nb.endswith(b"\n") else 1) def analyze_file(path: Path): b = path.read_bytes() raw_md5 = md5_bytes(b) norm = normalize_bytes(b) norm_md5 = md5_bytes(norm) phys = phys_lines_count_raw(b) size = len(b) return { "path": str(path), "raw_md5": raw_md5, "norm_md5": norm_md5, "phys_lines": phys, "size": size, } def collect_files(root: Path, exts=None): files = {} for p in root.rglob("*"): if p.is_file(): if exts: if not any(str(p).lower().endswith(e) for e in exts): continue rel = str(p.relative_to(root)).replace("\\", "/") files[rel] = p return files def main(): parser = argparse.ArgumentParser() parser.add_argument("dirA") parser.add_argument("dirB") parser.add_argument( "--ext", default=None, help="Comma-separated extensions to check, e.g. .py,.txt" ) args = parser.parse_args() exts = None if args.ext: exts = [e.strip().lower() for e in args.ext.split(",") if e.strip()] a = Path(args.dirA) b = Path(args.dirB) if not a.exists() or not b.exists(): print("One of the provided dirs does not exist:", a, b) sys.exit(2) files_a = collect_files(a, exts) files_b = collect_files(b, exts) common = sorted(set(files_a.keys()) & set(files_b.keys())) only_a = sorted(set(files_a.keys()) - set(files_b.keys())) only_b = sorted(set(files_b.keys()) - set(files_a.keys())) print( f"Files only in A: {len(only_a)}; only in B: {len(only_b)}; common: {len(common)}" ) if only_a: print("\nOnly in A (examples):") for p in only_a[:20]: print(" ", p) if only_b: print("\nOnly in B (examples):") for p in only_b[:20]: print(" ", p) diffs = [] for rel in common: fa = files_a[rel] fb = files_b[rel] ra = analyze_file(fa) rb = analyze_file(fb) raw_same = ra["raw_md5"] == rb["raw_md5"] norm_same = ra["norm_md5"] == rb["norm_md5"] phys_same = ra["phys_lines"] == rb["phys_lines"] if not (raw_same and norm_same and phys_same): diffs.append((rel, ra, rb, raw_same, norm_same, phys_same)) print(f"Found {len(diffs)} differing files (by raw/norm/phys checks)") for rel, ra, rb, raw_same, norm_same, phys_same in diffs[:200]: print("\n--", rel) print( " A size:", ra["size"], "raw_md5:", ra["raw_md5"][:8], "norm_md5:", ra["norm_md5"][:8], "lines:", ra["phys_lines"], ) print( " B size:", rb["size"], "raw_md5:", rb["raw_md5"][:8], "norm_md5:", rb["norm_md5"][:8], "lines:", rb["phys_lines"], ) print( " same_raw:", raw_same, "same_norm:", norm_same, "same_lines:", phys_same ) if not diffs: print("No differences detected (raw/norm/phys matched for all common files).") if __name__ == "__main__": main()