SXXXXXXX_PyUCC/tools/diagnose_diff.py

131 lines
4.0 KiB
Python

"""Diagnose differences between two directories of source files.
For each file present in both dirs this script reports:
- raw MD5 of bytes
- normalized MD5 (normalize CRLF->LF and remove BOM)
- number of physical lines (counted after BOM removal and newline normalization)
- whether raw MD5 matches, normalized MD5 matches
Usage:
python tools/diagnose_diff.py <dirA> <dirB> [--ext .py,.txt]
Example:
python tools/diagnose_diff.py pyucc baseline/target_simulator__20251126T144643_local/files
"""
import sys
from pathlib import Path
import hashlib
import argparse
def md5_bytes(b: bytes) -> str:
    """Return the MD5 digest of ``b`` as a hexadecimal string."""
    hasher = hashlib.md5()
    hasher.update(b)
    return hasher.hexdigest()
def normalize_bytes(b: bytes) -> bytes:
    """Return ``b`` without a leading UTF-8 BOM and with LF-only line endings.

    Both CRLF pairs and lone CR bytes are turned into a single LF. At most
    one BOM (the one at the very start) is stripped.
    """
    utf8_bom = b"\xef\xbb\xbf"
    body = b[len(utf8_bom):] if b[:len(utf8_bom)] == utf8_bom else b
    # Collapse CRLF first so each pair yields one LF, then map any CR left over.
    return body.replace(b"\r\n", b"\n").replace(b"\r", b"\n")
def phys_lines_count_raw(b: bytes) -> int:
    """Count the physical lines in ``b`` after BOM removal and newline normalization.

    Empty content (after normalization) has zero lines. A last line that
    lacks a trailing newline is still counted as a line.
    """
    normalized = normalize_bytes(b)
    if not normalized:
        return 0
    newline_total = normalized.count(b"\n")
    if normalized.endswith(b"\n"):
        return newline_total
    # The final line has no terminator, so the LF count misses it.
    return newline_total + 1
def analyze_file(path: Path):
    """Read ``path`` as bytes and summarize it for comparison.

    Returns:
        A dict with the keys ``path`` (string form of ``path``), ``raw_md5``
        (MD5 of the bytes as read), ``norm_md5`` (MD5 of the normalized
        bytes), ``phys_lines`` (physical line count) and ``size`` (number
        of bytes read).
    """
    data = path.read_bytes()
    return {
        "path": str(path),
        "raw_md5": md5_bytes(data),
        "norm_md5": md5_bytes(normalize_bytes(data)),
        "phys_lines": phys_lines_count_raw(data),
        "size": len(data),
    }
def collect_files(root: Path, exts=None):
    """Map every regular file under ``root`` to its path, keyed by relative path.

    Args:
        root: Directory that is walked recursively.
        exts: Optional iterable of path suffixes such as ``[".py", ".txt"]``.
            When given and non-empty, only files whose path ends with one of
            them are kept; the comparison ignores case on both sides.
            ``None`` or an empty collection keeps every file.

    Returns:
        A dict mapping each file's path relative to ``root`` (with ``/``
        separators) to the ``Path`` object found under ``root``.
    """
    # Lower-case the suffixes here as well: the path is lower-cased below, so
    # an upper-case suffix passed by a caller would otherwise never match.
    suffixes = tuple(e.lower() for e in exts) if exts else ()
    files = {}
    for p in root.rglob("*"):
        if not p.is_file():
            continue
        if suffixes and not str(p).lower().endswith(suffixes):
            continue
        # as_posix() yields '/' separators on every platform without
        # rewriting backslashes that are part of a POSIX file name.
        files[p.relative_to(root).as_posix()] = p
    return files
def main():
    """Command-line entry point: compare two directory trees and print a report.

    Parses ``dirA``, ``dirB`` and the optional ``--ext`` filter from the
    command line, collects the files of both trees, and prints:

    - how many files exist only in A, only in B, and in both (with up to 20
      example paths for each "only in" side);
    - for every common file whose raw MD5, normalized MD5 or physical line
      count differs, the size, shortened digests and line counts of both
      versions (at most 200 entries are listed in detail).

    Exits with status 2 when either argument is not an existing directory.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('dirA')
    parser.add_argument('dirB')
    parser.add_argument('--ext', default=None, help='Comma-separated extensions to check, e.g. .py,.txt')
    args = parser.parse_args()

    exts = None
    if args.ext:
        exts = [e.strip().lower() for e in args.ext.split(',') if e.strip()]

    a = Path(args.dirA)
    b = Path(args.dirB)
    # is_dir() rather than exists(): a regular file would pass an existence
    # check and then produce an empty, misleading "0 files" report.
    if not a.is_dir() or not b.is_dir():
        print('One of the provided dirs does not exist or is not a directory:', a, b)
        sys.exit(2)

    files_a = collect_files(a, exts)
    files_b = collect_files(b, exts)
    common = sorted(files_a.keys() & files_b.keys())
    only_a = sorted(files_a.keys() - files_b.keys())
    only_b = sorted(files_b.keys() - files_a.keys())
    print(f'Files only in A: {len(only_a)}; only in B: {len(only_b)}; common: {len(common)}')
    if only_a:
        print('\nOnly in A (examples):')
        for p in only_a[:20]:
            print(' ', p)
    if only_b:
        print('\nOnly in B (examples):')
        for p in only_b[:20]:
            print(' ', p)

    diffs = []
    for rel in common:
        ra = analyze_file(files_a[rel])
        rb = analyze_file(files_b[rel])
        raw_same = ra['raw_md5'] == rb['raw_md5']
        norm_same = ra['norm_md5'] == rb['norm_md5']
        phys_same = ra['phys_lines'] == rb['phys_lines']
        if not (raw_same and norm_same and phys_same):
            diffs.append((rel, ra, rb, raw_same, norm_same, phys_same))

    print(f'Found {len(diffs)} differing files (by raw/norm/phys checks)')
    max_shown = 200  # cap on detailed entries so huge trees stay readable
    for rel, ra, rb, raw_same, norm_same, phys_same in diffs[:max_shown]:
        print('\n--', rel)
        print(' A size:', ra['size'], 'raw_md5:', ra['raw_md5'][:8], 'norm_md5:', ra['norm_md5'][:8], 'lines:', ra['phys_lines'])
        print(' B size:', rb['size'], 'raw_md5:', rb['raw_md5'][:8], 'norm_md5:', rb['norm_md5'][:8], 'lines:', rb['phys_lines'])
        print(' same_raw:', raw_same, 'same_norm:', norm_same, 'same_lines:', phys_same)
    if len(diffs) > max_shown:
        # Make the truncation visible; the header above reports the full count.
        print(f'\n... {len(diffs) - max_shown} more differing files not shown')
    if not diffs:
        print('No differences detected (raw/norm/phys matched for all common files).')
# Run the comparison only when executed as a script, not when imported.
if __name__ == "__main__":
    main()