# SXXXXXXX_PyUCC/tests/test_real_project_files.py
# (211 lines, 6.4 KiB, Python)

"""Test pygount determinism with REAL project files."""
import subprocess
import json
import hashlib
from pathlib import Path
import tempfile
import time
import shutil
# Real project directory whose files are analyzed by this test.
source_dir = Path(r"C:\src\____GitProjects\S1005403_RisCC\target_simulator")

if not source_dir.exists():
    print(f"ERROR: Directory not found: {source_dir}")
    raise SystemExit(1)

# Gather every Python source file below the project root (recursive).
py_files = list(source_dir.rglob("*.py"))
print(f"Found {len(py_files)} Python files in {source_dir}")
print()

if not py_files:
    print("ERROR: No Python files found!")
    raise SystemExit(1)

# Cap the sample at the first 15 files to keep the test run short.
test_files = py_files[:15]
def run_pygount_on_file(file_path):
    """Run pygount on one file and return its count summary - NO CACHE.

    A fresh ``pygount --format json`` process is spawned on every call,
    so no in-process caching layer can influence the numbers.
    """
    completed = subprocess.run(
        ["pygount", "--format", "json", str(file_path)],
        capture_output=True,
        text=True,
        check=True,
    )
    payload = json.loads(completed.stdout)
    # pygount emits either a bare list of entries or an object holding a
    # "files" list (varies by version); accept both shapes.
    entry = payload[0] if isinstance(payload, list) else payload["files"][0]
    wanted = (
        "lineCount",
        "sourceCount",
        "codeCount",
        "documentationCount",
        "emptyCount",
        "language",
    )
    return {key: entry.get(key) for key in wanted}
print("=" * 80)
print("REAL PROJECT TEST - NO CACHE")
print("=" * 80)
print(f"Testing {len(test_files)} files from real project")
print()

# Record an MD5 content hash per file so the later analysis can show that
# any count differences are NOT caused by the file content changing.
# (pathlib read_bytes replaces the manual open/read pair.)
file_hashes = {
    file_path.name: hashlib.md5(file_path.read_bytes()).hexdigest()
    for file_path in test_files
}
def _analyze_files(paths):
    """Run pygount over *paths*; print and return {filename: counts}.

    Factors out the loop that was previously duplicated verbatim for all
    four runs. Results are keyed by file name (not full path) so runs on
    the original and on the copied location can be compared directly.
    """
    results = {}
    for path in paths:
        counts = run_pygount_on_file(path)
        results[path.name] = counts
        print(
            f" {path.name}: code={counts['codeCount']}, doc={counts['documentationCount']}, empty={counts['emptyCount']}"
        )
    return results

# RUN 1: Analyze files in original location
print("RUN 1: Analyzing files in ORIGINAL location")
print("-" * 80)
run1_results = _analyze_files(test_files)
print()

# Pause so that any time-based caching in pygount would be exposed.
print("Waiting 10 seconds...")
time.sleep(10)
print()

# RUN 2: Re-analyze the SAME files in the same location.
print("RUN 2: Re-analyzing SAME files (same location, 10s later)")
print("-" * 80)
run2_results = _analyze_files(test_files)
print()

# Copy files to a temporary location so RUN 3 sees identical content
# under a different path (path-keyed caching would show up here).
temp_dir = Path(tempfile.mkdtemp(prefix="test_copy_"))
print(f"Copying files to: {temp_dir}")
for file_path in test_files:
    shutil.copy2(file_path, temp_dir / file_path.name)
print("Files copied")
print()

print("Waiting 5 seconds...")
time.sleep(5)
print()

# RUN 3: Analyze the copies in the NEW location.
print("RUN 3: Analyzing files in NEW location (different path, same content)")
print("-" * 80)
run3_results = _analyze_files([temp_dir / file_path.name for file_path in test_files])
print()

print("Waiting 10 seconds...")
time.sleep(10)
print()

# RUN 4: Re-analyze the original location once more, well after RUN 1.
print("RUN 4: Re-analyzing ORIGINAL location again (20s after Run 1)")
print("-" * 80)
run4_results = _analyze_files(test_files)
print()
# ANALYSIS
print("=" * 80)
print("DETAILED ANALYSIS")
print("=" * 80)
print()

# Metrics that must agree across all four runs for a file to count as
# deterministic.
_METRICS = ("lineCount", "codeCount", "documentationCount", "emptyCount")

differences_found = False
for file_path in test_files:
    filename = file_path.name
    file_hash = file_hashes[filename]
    runs = (
        run1_results[filename],
        run2_results[filename],
        run3_results[filename],
        run4_results[filename],
    )
    r1, r2, r3, r4 = runs
    # Consistent when every metric takes exactly one value across runs.
    if all(len({r[metric] for r in runs}) == 1 for metric in _METRICS):
        # BUGFIX: these two messages were f-strings with no placeholder
        # that printed the literal "(unknown)" instead of the file name.
        print(f"{filename} - CONSISTENT across all runs")
    else:
        print(f"\n{filename} - DIFFERENCES FOUND!")
        print(f" Content hash: {file_hash}")
        for label, r in (
            ("Run 1 (original)", r1),
            ("Run 2 (same, 10s)", r2),
            ("Run 3 (copy, 15s)", r3),
            ("Run 4 (original, 25s)", r4),
        ):
            print(
                f" {label}: code={r['codeCount']:4d}, doc={r['documentationCount']:4d}, empty={r['emptyCount']:4d}, total={r['lineCount']:4d}"
            )
        # Pinpoint which run pairs disagree with the baseline run.
        if r1 != r2:
            print(f" ⚠️ Run 1 ≠ Run 2 (same location, different time)")
        if r1 != r3:
            print(f" ⚠️ Run 1 ≠ Run 3 (different location)")
        if r1 != r4:
            print(f" ⚠️ Run 1 ≠ Run 4 (same location, much later)")
        differences_found = True
print()
print("=" * 80)
print("FINAL CONCLUSION")
print("=" * 80)
if not differences_found:
    print("✅ SUCCESS: All files produced IDENTICAL results across all runs")
    print(" → pygount IS deterministic with real project files")
else:
    print("❌ FAILURE: Some files produced DIFFERENT results!")
    print(" → pygount is NOT deterministic with these real files")
    print(" → This explains why baseline comparison shows differences")
    print(" → SOLUTION NEEDED: Either fix pygount usage or find alternative")
# Best-effort cleanup: ignore delete errors (e.g. Windows file locks or
# antivirus holds) so a failed rmtree cannot mask the verdict above with
# a spurious traceback at the very end of the run.
shutil.rmtree(temp_dir, ignore_errors=True)