# SXXXXXXX_PyUCC/tests/test_pygount_determinism_deep.py

"""Deep test of pygount determinism - check EXACT output."""

import subprocess
import json
import hashlib
from pathlib import Path
import tempfile
import time

# Fixture source that is written to a temporary file and handed to pygount.
# It deliberately mixes plain code lines, docstrings, a "#" comment and blank
# lines; presumably so that each counter read back from pygount (code,
# documentation, empty) is non-zero -- TODO confirm against pygount's rules.
# The text is hashed and analysed as-is, so it must not be reformatted.
test_content = '''"""Test module."""

def function_one():
    """Function one."""
    x = 1
    y = 2
    return x + y

class MyClass:
    """A class."""

    def method(self):
        """Method."""
        # Comment line
        pass

if __name__ == "__main__":
    print("Hello")
'''

# Create the fixture file on disk.  NamedTemporaryFile(delete=False) creates
# the file atomically (unlike the deprecated, race-prone tempfile.mktemp, which
# only returns a name) and leaves it in place after it is closed, so the
# pygount subprocess can open it on every platform.
handle = tempfile.NamedTemporaryFile(
    mode="w", suffix=".py", delete=False, encoding="utf-8"
)
temp_file = Path(handle.name)

# Fields read from pygount's per-file entry.  Every field captured for a run
# is also compared between runs, so none of them can drift unnoticed.
compared_fields = (
    "lineCount",
    "sourceCount",
    "codeCount",
    "documentationCount",
    "emptyCount",
    "stringCount",
    "language",
)

try:
    # Close the handle before pygount runs; an open handle would block other
    # processes from reading the file on Windows.
    with handle:
        handle.write(test_content)

    print(f"Test file: {temp_file}")
    print(f"Content hash: {hashlib.md5(test_content.encode()).hexdigest()}")
    print()

    # Run pygount 10 times and collect EXACT JSON output
    results = []
    for i in range(10):
        # check=True: a failing pygount run aborts the script instead of
        # silently producing an empty/partial result set.
        proc = subprocess.run(
            ["pygount", "--format", "json", str(temp_file)],
            capture_output=True,
            text=True,
            check=True,
        )

        # Parse JSON
        data = json.loads(proc.stdout)

        # Extract the file entry: the report is handled either as a bare list
        # of file entries or as an object carrying them under "files".
        if isinstance(data, list):
            item = data[0]
        else:
            item = data["files"][0]

        record = {"run": i + 1, "raw_json": proc.stdout}
        # .get() keeps the script working when a field is absent from the
        # report (the value is then None for every run).
        record.update((field, item.get(field)) for field in compared_fields)
        results.append(record)

        print(
            f"Run {i+1}: code={item.get('codeCount')}, doc={item.get('documentationCount')}, "
            f"empty={item.get('emptyCount')}, source={item.get('sourceCount')}"
        )

    # Check if ALL runs produced IDENTICAL results
    print()
    print("=" * 70)
    print("CHECKING FOR DIFFERENCES")
    print("=" * 70)

    first = results[0]
    all_identical = True

    for i, result in enumerate(results[1:], 2):
        differing = [
            field for field in compared_fields if result[field] != first[field]
        ]
        if not differing:
            continue

        print(f"\n❌ DIFFERENCE FOUND in run {i}:")
        print(
            f"  Run 1: lineCount={first['lineCount']}, sourceCount={first['sourceCount']}, "
            f"codeCount={first['codeCount']}, doc={first['documentationCount']}, empty={first['emptyCount']}"
        )
        print(
            f"  Run {i}: lineCount={result['lineCount']}, sourceCount={result['sourceCount']}, "
            f"codeCount={result['codeCount']}, doc={result['documentationCount']}, empty={result['emptyCount']}"
        )
        # Spell out exactly which fields moved; the summary lines above do not
        # show every compared field.
        for field in differing:
            print(f"  {field}: run 1={first[field]!r}, run {i}={result[field]!r}")
        all_identical = False

    # Check if raw JSON is byte-for-byte identical
    # NOTE(review): the raw report may contain run-specific metadata (for
    # example timing information) -- confirm against pygount's JSON format.
    # If it does, the hashes differ even when all counts match, which is why
    # the conclusion below is based on the compared fields only.
    json_hashes = [hashlib.md5(r["raw_json"].encode()).hexdigest() for r in results]
    unique_hashes = set(json_hashes)

    print()
    if len(unique_hashes) == 1:
        print("✅ All JSON outputs are BYTE-FOR-BYTE IDENTICAL")
        print(f"   JSON hash: {json_hashes[0]}")
    else:
        print(f"❌ Found {len(unique_hashes)} DIFFERENT JSON outputs:")
        for h in unique_hashes:
            count = json_hashes.count(h)
            print(f"   Hash {h[:8]}... appeared {count} times")

        # Show first difference in detail
        print("\n   First two different outputs:")
        for i, result in enumerate(results):
            if json_hashes[i] != json_hashes[0]:
                print(f"\n   Run 1 JSON:\n{results[0]['raw_json']}")
                print(f"\n   Run {i+1} JSON:\n{result['raw_json']}")
                break

    print()
    if all_identical:
        print("🎉 CONCLUSION: pygount IS deterministic (all numeric values identical)")
    else:
        print("⚠️  CONCLUSION: pygount is NOT deterministic (values differ between runs)")
        print("    → This tool cannot be used for reliable baseline comparison")
        print("    → Recommend switching to a deterministic counting library")
finally:
    # Cleanup: runs even when pygount is not installed, exits non-zero, or
    # prints output that cannot be parsed, so the fixture never leaks.
    if temp_file.exists():
        temp_file.unlink()