"""Diagnose differences between two directories of source files. For each file present in both dirs this script reports: - raw MD5 of bytes - normalized MD5 (normalize CRLF->LF and remove BOM) - number of physical lines (raw splitlines) - whether raw MD5 matches, normalized MD5 matches Usage: python tools/diagnose_diff.py [--ext .py,.txt] Example: python tools/diagnose_diff.py pyucc baseline/target_simulator__20251126T144643_local/files """ import sys from pathlib import Path import hashlib import argparse def md5_bytes(b: bytes) -> str: return hashlib.md5(b).hexdigest() def normalize_bytes(b: bytes) -> bytes: # Remove UTF-8 BOM if present if b.startswith(b"\xef\xbb\xbf"): b = b[3:] # Normalize CRLF -> LF b = b.replace(b"\r\n", b"\n") # Also normalize lone CR to LF b = b.replace(b"\r", b"\n") return b def phys_lines_count_raw(b: bytes) -> int: # Count physical lines as number of occurrences of '\n' when normalized to LF, # but preserve behavior for files without trailing newline nb = normalize_bytes(b) if len(nb) == 0: return 0 return nb.count(b"\n") + (0 if nb.endswith(b"\n") else 1) def analyze_file(path: Path): b = path.read_bytes() raw_md5 = md5_bytes(b) norm = normalize_bytes(b) norm_md5 = md5_bytes(norm) phys = phys_lines_count_raw(b) size = len(b) return { "path": str(path), "raw_md5": raw_md5, "norm_md5": norm_md5, "phys_lines": phys, "size": size, } def collect_files(root: Path, exts=None): files = {} for p in root.rglob("*"): if p.is_file(): if exts: if not any(str(p).lower().endswith(e) for e in exts): continue rel = str(p.relative_to(root)).replace('\\', '/') files[rel] = p return files def main(): parser = argparse.ArgumentParser() parser.add_argument('dirA') parser.add_argument('dirB') parser.add_argument('--ext', default=None, help='Comma-separated extensions to check, e.g. .py,.txt') args = parser.parse_args() exts = None if args.ext: exts = [e.strip().lower() for e in args.ext.split(',') if e.strip()] a = Path(args.dirA) b = Path(args.dirB) if not a.exists() or not b.exists(): print('One of the provided dirs does not exist:', a, b) sys.exit(2) files_a = collect_files(a, exts) files_b = collect_files(b, exts) common = sorted(set(files_a.keys()) & set(files_b.keys())) only_a = sorted(set(files_a.keys()) - set(files_b.keys())) only_b = sorted(set(files_b.keys()) - set(files_a.keys())) print(f'Files only in A: {len(only_a)}; only in B: {len(only_b)}; common: {len(common)}') if only_a: print('\nOnly in A (examples):') for p in only_a[:20]: print(' ', p) if only_b: print('\nOnly in B (examples):') for p in only_b[:20]: print(' ', p) diffs = [] for rel in common: fa = files_a[rel] fb = files_b[rel] ra = analyze_file(fa) rb = analyze_file(fb) raw_same = ra['raw_md5'] == rb['raw_md5'] norm_same = ra['norm_md5'] == rb['norm_md5'] phys_same = ra['phys_lines'] == rb['phys_lines'] if not (raw_same and norm_same and phys_same): diffs.append((rel, ra, rb, raw_same, norm_same, phys_same)) print(f'Found {len(diffs)} differing files (by raw/norm/phys checks)') for rel, ra, rb, raw_same, norm_same, phys_same in diffs[:200]: print('\n--', rel) print(' A size:', ra['size'], 'raw_md5:', ra['raw_md5'][:8], 'norm_md5:', ra['norm_md5'][:8], 'lines:', ra['phys_lines']) print(' B size:', rb['size'], 'raw_md5:', rb['raw_md5'][:8], 'norm_md5:', rb['norm_md5'][:8], 'lines:', rb['phys_lines']) print(' same_raw:', raw_same, 'same_norm:', norm_same, 'same_lines:', phys_same) if not diffs: print('No differences detected (raw/norm/phys matched for all common files).') if __name__ == '__main__': main()