SXXXXXXX_PyUCC/tests/test_ucc_batch_validation.py

"""Batch validation test comparing PyUCC extended counting with UCC results.
Reads UCC output files and compares with PyUCC results on the same files.
"""
from pathlib import Path
import sys
import re
sys.path.insert(0, str(Path(__file__).parent.parent))
from pyucc.core.ucc_complete_counter import UCCCompleteCounter


def parse_ucc_outfile(outfile_path: Path):
    """Parse UCC output file and extract file metrics.

    Returns:
        dict: {filename: {total, blank, whole_cmt, embed_cmt, directive,
                          data_decl, exec_inst, logical, physical}}
    """
    results = {}
    with open(outfile_path, "r", encoding="utf-8", errors="ignore") as f:
        lines = f.readlines()

    # Skip header lines until we find the separator line
    start_idx = 0
    for i, line in enumerate(lines):
        if "---" in line and "++" in line:
            start_idx = i + 1
            break

    # Parse data lines
    for line in lines[start_idx:]:
        line = line.strip()
        if not line or line.startswith("---") or "Total" in line or "Number" in line:
            continue

        # Parse line format:
        # Total Blank | Whole Embedded | Compiler Data Exec. | Logical Physical | Type Name
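        # Illustrative example of such a row (the values and file name are made
        # up, not taken from a real UCC report; only the column layout matters):
        #   120     10 |     15      2 |      3     20     60 |     80    110 | CODE  REP\foo.c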
parts = line.split("|")
if len(parts) < 5:
continue
try:
# Part 0: Total and Blank
total_blank = parts[0].strip().split()
total = int(total_blank[0])
blank = int(total_blank[1])
# Part 1: Comments
comments = parts[1].strip().split()
whole_cmt = int(comments[0])
embed_cmt = int(comments[1])
# Part 2: Directives, Data, Exec
metrics = parts[2].strip().split()
directive = int(metrics[0])
data_decl = int(metrics[1])
exec_inst = int(metrics[2])
# Part 3: Logical and Physical SLOC
sloc = parts[3].strip().split()
logical = int(sloc[0])
physical = int(sloc[1])
# Part 4: File type and name
file_info = parts[4].strip().split(None, 1)
if len(file_info) < 2:
continue
file_type = file_info[0]
filename = file_info[1]
            # Derive the bare filename (UCC prints paths like _25_10\REP\...);
            # results stay keyed by the full name as printed by UCC, and the
            # actual matching happens later in find_source_file().
            filename_clean = Path(filename).name

            results[filename] = {
                "total": total,
                "blank": blank,
                "whole_cmt": whole_cmt,
                "embed_cmt": embed_cmt,
                "directive": directive,
                "data_decl": data_decl,
                "exec_inst": exec_inst,
                "logical": logical,
                "physical": physical,
                "full_path": filename,
            }
        except (ValueError, IndexError):
            # Skip malformed lines
            continue

    return results
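
# Usage sketch for the parser above (the output file name and dict key are made
# up for illustration; real keys are whatever paths UCC printed in its report):
#   ucc_metrics = parse_ucc_outfile(Path("Baseline-A-C_CPP_outfile.txt"))
#   ucc_metrics["REP\\foo.c"]["logical"]   # logical SLOC as counted by UCC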


def find_source_file(filename: str, search_paths: list[Path]) -> Path | None:
    """Find a source file in the given search paths, or return None."""
    # Extract just the filename (no path)
    just_filename = Path(filename).name
    for search_path in search_paths:
        if not search_path.exists():
            continue
        # Try exact match by filename
        matches = list(search_path.rglob(just_filename))
        if matches:
            return matches[0]
    return None


def test_batch_validation():
    """Test PyUCC extended counting against UCC results on multiple files."""
    # Parse UCC results
    ucc_outfile = Path(
        r"c:\src\____GitProjects\SXXXXXXX_PyUcc\__UCC\Metrics\DSP-diff\Baseline-A-C_CPP_outfile.txt"
    )
    if not ucc_outfile.exists():
        print(f"ERROR: UCC output file not found: {ucc_outfile}")
        return

    print(">> Parsing UCC results...")
    ucc_results = parse_ucc_outfile(ucc_outfile)
    print(f" Found {len(ucc_results)} files in UCC output\n")

    # Search paths for source files
    search_paths = [
        Path(r"C:\__temp\Metrics\attuale\REP"),
        Path(r"C:\__temp\Metrics\_25_10\REP"),
    ]

    # Initialize counter
    counter = UCCCompleteCounter(language="C")

    # Statistics
    stats = {
        "total_files": 0,
        "files_found": 0,
        "files_not_found": 0,
        "perfect_matches": 0,
        "good_matches": 0,  # >= 90% on all metrics
        "differences": {
            "total": [],
            "blank": [],
            "whole_cmt": [],
            "embed_cmt": [],
            "directive": [],
            "data_decl": [],
            "exec_inst": [],
            "logical": [],
            "physical": [],
        },
    }

    # Test a sample of files (first 50 for better statistics)
    sample_size = 50
    sample_files = list(ucc_results.items())[:sample_size]
    print(f">> Testing {len(sample_files)} files...\n")
    print("=" * 120)

    for filename, ucc_data in sample_files:
        stats["total_files"] += 1

        # Find source file
        source_file = find_source_file(filename, search_paths)
        if not source_file:
            stats["files_not_found"] += 1
            print(f"WARNING: File not found: {filename}")
            continue
        stats["files_found"] += 1

        # Analyze with PyUCC
        try:
            pyucc_result = counter.analyze_file(source_file)
        except Exception as e:
            print(f"ERROR analyzing {filename}: {e}")
            continue

        # Compare results (PyUCC minus UCC, per metric)
        diffs = {
            "total": pyucc_result["total_lines"] - ucc_data["total"],
            "blank": pyucc_result["blank_lines"] - ucc_data["blank"],
            "whole_cmt": pyucc_result["comment_whole"] - ucc_data["whole_cmt"],
            "embed_cmt": pyucc_result["comment_embedded"] - ucc_data["embed_cmt"],
            "directive": pyucc_result["compiler_directives"] - ucc_data["directive"],
            "data_decl": pyucc_result["data_declarations"] - ucc_data["data_decl"],
            "exec_inst": pyucc_result["exec_instructions"] - ucc_data["exec_inst"],
            "logical": pyucc_result["logical_sloc"] - ucc_data["logical"],
            "physical": pyucc_result["physical_sloc"] - ucc_data["physical"],
        }

        # Record differences
        for key, diff in diffs.items():
            stats["differences"][key].append(diff)

        # Check if perfect match (every metric identical)
        if all(d == 0 for d in diffs.values()):
            stats["perfect_matches"] += 1
            status = "[PERFECT]"
        else:
            # Check if good match (within 10% on all metrics)
            good_match = True
            for key, diff in diffs.items():
                if ucc_data[key] > 0:
                    accuracy = 100 * (1 - abs(diff) / ucc_data[key])
                    if accuracy < 90:
                        good_match = False
                        break
                elif diff != 0:
                    good_match = False
                    break
            if good_match:
                stats["good_matches"] += 1
                status = "[GOOD]"
            else:
                status = "[DIFF]"

        # Print comparison
        print(f"\n{status} {filename}")
        print(f" {'Metric':<15} {'UCC':>6} {'PyUCC':>6} {'Diff':>6} {'Accuracy':>8}")
        print(f" {'-'*15} {'-'*6} {'-'*6} {'-'*6} {'-'*8}")
        for key in [
            "total",
            "blank",
            "directive",
            "data_decl",
            "exec_inst",
            "logical",
            "physical",
        ]:
            ucc_val = ucc_data[key]
            pyucc_val = pyucc_result[
                {
                    "total": "total_lines",
                    "blank": "blank_lines",
                    "directive": "compiler_directives",
                    "data_decl": "data_declarations",
                    "exec_inst": "exec_instructions",
                    "logical": "logical_sloc",
                    "physical": "physical_sloc",
                }[key]
            ]
            diff = diffs[key]
            if ucc_val > 0:
                accuracy = 100 * (1 - abs(diff) / ucc_val)
                acc_str = f"{accuracy:.1f}%"
            else:
                acc_str = "N/A" if diff == 0 else "ERROR"
            print(f" {key:<15} {ucc_val:>6} {pyucc_val:>6} {diff:>+6} {acc_str:>8}")

    # Print summary statistics
    print("\n" + "=" * 120)
    print("\n>> SUMMARY STATISTICS")
    print(f"\n Files processed: {stats['files_found']}/{stats['total_files']}")
    print(f" Files not found: {stats['files_not_found']}")
    print(
        f" Perfect matches (all diffs == 0): {stats['perfect_matches']} ({100*stats['perfect_matches']/max(1,stats['files_found']):.1f}%)"
    )
    print(
        f" Good matches (all >= 90%): {stats['good_matches']} ({100*stats['good_matches']/max(1,stats['files_found']):.1f}%)"
    )
if stats["files_found"] > 0:
print(f"\n Average differences:")
for key, diffs in stats["differences"].items():
if diffs:
avg_diff = sum(diffs) / len(diffs)
avg_abs_diff = sum(abs(d) for d in diffs) / len(diffs)
print(
f" {key:<15}: avg={avg_diff:>+7.1f} abs_avg={avg_abs_diff:>6.1f}"
)
print("\n" + "=" * 120)

    # Overall assessment
    if stats["files_found"] > 0:
        success_rate = (
            100
            * (stats["perfect_matches"] + stats["good_matches"])
            / stats["files_found"]
        )

        # Calculate overall accuracy by metric
        print("\n Accuracy by metric (based on absolute average difference):")
        metric_keys = [
            "total",
            "blank",
            "directive",
            "physical",
            "whole_cmt",
            "embed_cmt",
            "data_decl",
            "exec_inst",
            "logical",
        ]
        total_accuracy = 0
        for key in metric_keys:
            diffs = stats["differences"][key]
            if diffs:
                avg_abs = sum(abs(d) for d in diffs) / len(diffs)
                # Calculate accuracy: if the avg diff is 0, accuracy is 100%;
                # each unit of average absolute difference costs 2% accuracy.
                accuracy = max(0, 100 - (avg_abs * 2))  # 2% loss per unit diff
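                # Illustrative check of this heuristic (made-up numbers): an
                # average absolute difference of 5 lines gives 100 - 5 * 2 = 90%,
                # and anything above 50 lines is clamped to 0%.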
                total_accuracy += accuracy
                print(
                    f" {key:15s}: {accuracy:>5.1f}% (avg abs diff: {avg_abs:>5.1f})"
                )
        overall_accuracy = total_accuracy / len(metric_keys)
        print(f"\n Overall accuracy: {overall_accuracy:.1f}%")

        if overall_accuracy >= 95:
            print(
                f"\n>> EXCELLENT! {overall_accuracy:.1f}% overall accuracy - PyUCC matches UCC very well!"
            )
        elif overall_accuracy >= 90:
            print(
                f"\n>> GOOD! {overall_accuracy:.1f}% overall accuracy - PyUCC is close to UCC"
            )
        elif overall_accuracy >= 85:
            print(
                f"\n>> FAIR: {overall_accuracy:.1f}% overall accuracy - Some improvements needed"
            )
        else:
            print(
                f"\n>> NEEDS WORK: {overall_accuracy:.1f}% overall accuracy - Significant differences"
            )
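

# Note: this module doubles as a standalone script. Running it directly prints
# the full comparison report; under pytest the prints are only visible with
# output capture disabled (pytest -s). This is just a suggested invocation,
# not a project requirement.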
if __name__ == "__main__":
test_batch_validation()