# SXXXXXXX_PyUCC/test_pygount_determinism_deep.py (122 lines, 3.9 KiB, Python)

"""Deep test of pygount determinism - check EXACT output."""
import subprocess
import json
import hashlib
from pathlib import Path
import tempfile
import time
# Sample source whose lines pygount is asked to count. It is only written to
# disk and handed to pygount by path; this script never imports or runs it.
test_content = '''"""Test module."""
def function_one():
"""Function one."""
x = 1
y = 2
return x + y
class MyClass:
"""A class."""
def method(self):
"""Method."""
# Comment line
pass
if __name__ == "__main__":
print("Hello")
'''

# How many times pygount is invoked on the same, unchanged file.
RUN_COUNT = 10

# Count fields that must agree across runs for the "deterministic" verdict.
COMPARED_FIELDS = (
    'lineCount',
    'sourceCount',
    'codeCount',
    'documentationCount',
    'emptyCount',
)

# Fields copied from pygount's per-file entry into every run record.
# 'stringCount' and 'language' are recorded for inspection but not compared.
RECORDED_FIELDS = COMPARED_FIELDS + ('stringCount', 'language')


def extract_file_entry(data):
    """Return the first per-file entry from parsed pygount JSON output.

    Two layouts are accepted: a top-level list of file entries, or a mapping
    whose ``'files'`` key holds that list.

    Raises:
        IndexError: If the list of entries is empty.
        KeyError: If ``data`` is a mapping without a ``'files'`` key.
    """
    entries = data if isinstance(data, list) else data['files']
    return entries[0]


def summarize_run(run_number, raw_json, item):
    """Build the record kept for one pygount invocation.

    Args:
        run_number: 1-based index of the invocation.
        raw_json: The exact text pygount wrote to stdout.
        item: The per-file entry (a mapping) extracted from that output.

    Returns:
        A dict with ``'run'``, ``'raw_json'`` and one key per name in
        ``RECORDED_FIELDS``; fields missing from ``item`` are stored as None.
    """
    record = {'run': run_number, 'raw_json': raw_json}
    record.update((field, item.get(field)) for field in RECORDED_FIELDS)
    return record


def run_pygount_once(path, run_number):
    """Invoke pygount on ``path`` once, print a one-line summary, return the record.

    Raises:
        FileNotFoundError: If the ``pygount`` executable cannot be found.
        subprocess.CalledProcessError: If pygount exits with a non-zero status.
        json.JSONDecodeError: If stdout is not valid JSON.
    """
    proc = subprocess.run(
        ['pygount', '--format', 'json', str(path)],
        capture_output=True,
        text=True,
        check=True,
    )
    item = extract_file_entry(json.loads(proc.stdout))
    print(f"Run {run_number}: code={item.get('codeCount')}, doc={item.get('documentationCount')}, "
          f"empty={item.get('emptyCount')}, source={item.get('sourceCount')}")
    return summarize_run(run_number, proc.stdout, item)


def _format_counts(record):
    """Render the compared count fields of one run record for the diff report."""
    return (f"lineCount={record['lineCount']}, sourceCount={record['sourceCount']}, "
            f"codeCount={record['codeCount']}, doc={record['documentationCount']}, "
            f"empty={record['emptyCount']}")


def report_count_differences(results):
    """Compare every run's counts with the first run's and print any mismatch.

    Args:
        results: Run records as produced by ``summarize_run``, in run order.

    Returns:
        True when every record agrees with the first one on all
        ``COMPARED_FIELDS`` (vacuously True for an empty list), else False.
    """
    if not results:
        return True
    first = results[0]
    all_identical = True
    # Runs are numbered from 1, so the first record being compared is run 2.
    for run_number, result in enumerate(results[1:], 2):
        if any(result[field] != first[field] for field in COMPARED_FIELDS):
            print(f"\n❌ DIFFERENCE FOUND in run {run_number}:")
            print(f" Run 1: {_format_counts(first)}")
            print(f" Run {run_number}: {_format_counts(result)}")
            all_identical = False
    return all_identical


def report_json_differences(results):
    """Report whether the raw stdout text was identical for every run.

    The check is purely informational; the final verdict is based on the
    count fields only.

    NOTE(review): pygount's JSON may embed run-specific metadata (for example
    timing information), which would make the raw text differ even when all
    counts agree -- confirm against the installed pygount version.

    Returns:
        True when all raw outputs hash to the same digest (vacuously True for
        an empty list), else False.
    """
    if not results:
        return True
    json_hashes = [hashlib.md5(r['raw_json'].encode()).hexdigest() for r in results]
    # Tally in first-seen order so the report itself is stable between runs.
    occurrences = {}
    for digest in json_hashes:
        occurrences[digest] = occurrences.get(digest, 0) + 1
    print()
    if len(occurrences) == 1:
        print("✅ All JSON outputs are BYTE-FOR-BYTE IDENTICAL")
        print(f" JSON hash: {json_hashes[0]}")
        return True
    print(f"❌ Found {len(occurrences)} DIFFERENT JSON outputs:")
    for digest, count in occurrences.items():
        print(f" Hash {digest[:8]}... appeared {count} times")
    # Show the first output that deviates from run 1, next to run 1 itself.
    print("\n First two different outputs:")
    for position, digest in enumerate(json_hashes):
        if digest != json_hashes[0]:
            print(f"\n Run 1 JSON:\n{results[0]['raw_json']}")
            print(f"\n Run {position + 1} JSON:\n{results[position]['raw_json']}")
            break
    return False


def main(runs=RUN_COUNT):
    """Run pygount ``runs`` times on one temporary file and report determinism.

    Args:
        runs: Number of pygount invocations; must be at least 1.

    Returns:
        True when all compared count fields were identical in every run.

    Raises:
        ValueError: If ``runs`` is smaller than 1.
    """
    if runs < 1:
        raise ValueError("runs must be at least 1")
    # tempfile.mktemp() only returns a name and is deprecated because another
    # process can claim that name first; NamedTemporaryFile creates the file
    # itself. delete=False keeps it on disk after the handle is closed so that
    # pygount can open it by path.
    with tempfile.NamedTemporaryFile(suffix=".py", delete=False) as handle:
        temp_file = Path(handle.name)
    try:
        temp_file.write_text(test_content, encoding='utf-8')
        print(f"Test file: {temp_file}")
        print(f"Content hash: {hashlib.md5(test_content.encode()).hexdigest()}")
        print()
        # Run pygount repeatedly and keep the EXACT JSON output of every run.
        results = [run_pygount_once(temp_file, number) for number in range(1, runs + 1)]
        print()
        print("=" * 70)
        print("CHECKING FOR DIFFERENCES")
        print("=" * 70)
        all_identical = report_count_differences(results)
        report_json_differences(results)
        print()
        if all_identical:
            print("🎉 CONCLUSION: pygount IS deterministic (all numeric values identical)")
        else:
            print("⚠️ CONCLUSION: pygount is NOT deterministic (values differ between runs)")
            print(" → This tool cannot be used for reliable baseline comparison")
            print(" → Recommend switching to a deterministic counting library")
        return all_identical
    finally:
        # Remove the temporary file even when pygount fails or emits bad JSON.
        temp_file.unlink()


# Guarded so that importing this module (e.g. during pytest collection of
# test_*.py files) does not launch pygount.
if __name__ == "__main__":
    main()