157 lines
4.3 KiB
Python
157 lines
4.3 KiB
Python
"""Diagnose differences between two directories of source files.

For each file present in both dirs this script reports:

- raw MD5 of bytes
- normalized MD5 (UTF-8 BOM removed; CRLF and lone CR normalized to LF)
- number of physical lines (counted on the normalized bytes)
- whether the raw MD5, the normalized MD5 and the line count match

Files present in only one of the two dirs are counted, and up to 20 of them
are listed as examples.

Usage:
    python tools/diagnose_diff.py <dirA> <dirB> [--ext .py,.txt]

Example:
    python tools/diagnose_diff.py pyucc baseline/target_simulator__20251126T144643_local/files
"""
|
|
|
|
import sys
|
|
from pathlib import Path
|
|
import hashlib
|
|
import argparse
|
|
|
|
|
|
def md5_bytes(b: bytes) -> str:
    """Return the hexadecimal MD5 digest of ``b``."""
    hasher = hashlib.md5()
    hasher.update(b)
    return hasher.hexdigest()
|
|
|
|
|
|
def normalize_bytes(b: bytes) -> bytes:
    """Return ``b`` without a leading UTF-8 BOM and with LF-only line endings.

    Only a BOM at the very start is dropped. CRLF pairs are converted first so
    that each one becomes a single LF; any CR still left afterwards is then
    converted to LF as well.
    """
    utf8_bom = b"\xef\xbb\xbf"
    payload = b[len(utf8_bom):] if b.startswith(utf8_bom) else b
    return payload.replace(b"\r\n", b"\n").replace(b"\r", b"\n")
|
|
|
|
|
|
def phys_lines_count_raw(b: bytes) -> int:
    """Return the number of physical lines in ``b`` after newline normalization.

    Empty input has zero lines. Otherwise every LF terminates one line, and a
    final fragment that is not LF-terminated counts as one more line.
    """
    normalized = normalize_bytes(b)
    if not normalized:
        return 0
    newline_total = normalized.count(b"\n")
    if normalized.endswith(b"\n"):
        return newline_total
    # The last line has no terminator, so count() did not see it.
    return newline_total + 1
|
|
|
|
|
|
def analyze_file(path: Path):
    """Read ``path`` once and summarize its content.

    Returns a dict with the keys ``path`` (the path as a string), ``raw_md5``
    (MD5 of the bytes as read), ``norm_md5`` (MD5 of the normalized bytes),
    ``phys_lines`` (physical line count) and ``size`` (length in bytes).
    """
    data = path.read_bytes()
    return {
        "path": str(path),
        "raw_md5": md5_bytes(data),
        "norm_md5": md5_bytes(normalize_bytes(data)),
        "phys_lines": phys_lines_count_raw(data),
        "size": len(data),
    }
|
|
|
|
|
|
def collect_files(root: Path, exts=None):
    """Map every file under ``root`` to its path, keyed by relative POSIX path.

    Args:
        root: Directory that is searched recursively.
        exts: Optional iterable of filename suffixes such as
            ``[".py", ".txt"]``; a single string is treated as one suffix.
            Matching is case-insensitive. When ``None`` or empty, every file
            is kept.

    Returns:
        A dict mapping root-relative paths written with ``/`` separators to
        the corresponding ``Path`` objects.
    """
    if isinstance(exts, str):
        # A bare string would otherwise be iterated character by character.
        exts = [exts]
    # Lower-case the suffixes too: the path is lower-cased before matching, so
    # an upper-case suffix could never match otherwise. Materializing a tuple
    # also lets endswith() test all suffixes in one call and keeps one-shot
    # iterables usable for every file.
    suffixes = tuple(e.lower() for e in exts) if exts else ()

    files = {}
    for p in root.rglob("*"):
        if not p.is_file():
            continue
        if suffixes and not str(p).lower().endswith(suffixes):
            continue
        # as_posix() only converts real path separators; a blanket
        # replace("\\", "/") would also rewrite literal backslashes that are
        # legal inside POSIX file names.
        files[p.relative_to(root).as_posix()] = p
    return files
|
|
|
|
|
|
def _print_side(label, info):
    """Print the size, shortened digests and line count for one side of a pair."""
    print(
        f" {label} size:",
        info["size"],
        "raw_md5:",
        info["raw_md5"][:8],
        "norm_md5:",
        info["norm_md5"][:8],
        "lines:",
        info["phys_lines"],
    )


def main(argv=None):
    """Compare two directory trees and print a report of differing files.

    Args:
        argv: Optional list of command-line arguments (without the program
            name). When ``None``, argparse reads ``sys.argv[1:]``.

    The report lists how many files exist on only one side (with up to 20
    examples each), then details for up to 200 files present on both sides
    whose raw MD5, normalized MD5 or physical line count differ.

    Exits with status 2 when either argument is not an existing directory.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("dirA")
    parser.add_argument("dirB")
    parser.add_argument(
        "--ext", default=None, help="Comma-separated extensions to check, e.g. .py,.txt"
    )
    args = parser.parse_args(argv)

    exts = None
    if args.ext:
        exts = [e.strip().lower() for e in args.ext.split(",") if e.strip()]

    a = Path(args.dirA)
    b = Path(args.dirB)
    # is_dir() rather than exists(): a regular file passes exists(), but
    # rglob() on it yields nothing, so the run would silently report an empty
    # tree instead of flagging the bad argument.
    if not a.is_dir() or not b.is_dir():
        print("One of the provided dirs does not exist or is not a directory:", a, b)
        sys.exit(2)

    files_a = collect_files(a, exts)
    files_b = collect_files(b, exts)

    common = sorted(files_a.keys() & files_b.keys())
    only_a = sorted(files_a.keys() - files_b.keys())
    only_b = sorted(files_b.keys() - files_a.keys())

    print(
        f"Files only in A: {len(only_a)}; only in B: {len(only_b)}; common: {len(common)}"
    )
    if only_a:
        print("\nOnly in A (examples):")
        for p in only_a[:20]:
            print(" ", p)
    if only_b:
        print("\nOnly in B (examples):")
        for p in only_b[:20]:
            print(" ", p)

    diffs = []
    for rel in common:
        ra = analyze_file(files_a[rel])
        rb = analyze_file(files_b[rel])
        raw_same = ra["raw_md5"] == rb["raw_md5"]
        norm_same = ra["norm_md5"] == rb["norm_md5"]
        phys_same = ra["phys_lines"] == rb["phys_lines"]
        if not (raw_same and norm_same and phys_same):
            diffs.append((rel, ra, rb, raw_same, norm_same, phys_same))

    print(f"Found {len(diffs)} differing files (by raw/norm/phys checks)")
    # Details are capped at 200 entries; the total is in the line printed above.
    for rel, ra, rb, raw_same, norm_same, phys_same in diffs[:200]:
        print("\n--", rel)
        _print_side("A", ra)
        _print_side("B", rb)
        print(
            " same_raw:", raw_same, "same_norm:", norm_same, "same_lines:", phys_same
        )

    if not diffs:
        print("No differences detected (raw/norm/phys matched for all common files).")
|
|
|
|
|
|
# Run the comparison only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|