"""Batch validation test comparing PyUCC extended counting with UCC results.
|
|
|
|
Reads UCC output files and compares with PyUCC results on the same files.
|
|
"""
|
|
|
|
from pathlib import Path
|
|
import sys
|
|
import re
|
|
|
|
sys.path.insert(0, str(Path(__file__).parent.parent))
|
|
|
|
from pyucc.core.ucc_complete_counter import UCCCompleteCounter
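
# UCCCompleteCounter.analyze_file() is expected to return a dict with the keys
# consumed below: total_lines, blank_lines, comment_whole, comment_embedded,
# compiler_directives, data_declarations, exec_instructions, logical_sloc,
# physical_sloc.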


def parse_ucc_outfile(outfile_path: Path):
    """Parse UCC output file and extract file metrics.

    Returns:
        dict: {filename: {total, blank, whole_cmt, embed_cmt, directive,
                          data_decl, exec_inst, logical, physical}}
    """
    results = {}

    with open(outfile_path, "r", encoding="utf-8", errors="ignore") as f:
        lines = f.readlines()

    # Skip header lines until we find the separator line
    start_idx = 0
    for i, line in enumerate(lines):
        if "---" in line and "++" in line:
            start_idx = i + 1
            break

    # Parse data lines
    for line in lines[start_idx:]:
        line = line.strip()
        if not line or line.startswith("---") or "Total" in line or "Number" in line:
            continue

        # Parse line format:
        # Total Blank | Whole Embedded | Compiler Data Exec. | Logical Physical | Type Name
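        # A hypothetical data row in that layout (values invented for illustration):
        #   120   15 |   22    3 |    5   18   45 |   68   105 | CODE  src\main.c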
        parts = line.split("|")
        if len(parts) < 5:
            continue

        try:
            # Part 0: Total and Blank
            total_blank = parts[0].strip().split()
            total = int(total_blank[0])
            blank = int(total_blank[1])

            # Part 1: Comments
            comments = parts[1].strip().split()
            whole_cmt = int(comments[0])
            embed_cmt = int(comments[1])

            # Part 2: Directives, Data, Exec
            metrics = parts[2].strip().split()
            directive = int(metrics[0])
            data_decl = int(metrics[1])
            exec_inst = int(metrics[2])

            # Part 3: Logical and Physical SLOC
            sloc = parts[3].strip().split()
            logical = int(sloc[0])
            physical = int(sloc[1])

            # Part 4: File type and name
            file_info = parts[4].strip().split(None, 1)
            if len(file_info) < 2:
                continue

            file_type = file_info[0]  # UCC file-type column; currently unused
            filename = file_info[1]

            # UCC reports names with path prefixes (e.g. _25_10\REP\...);
            # find_source_file() later reduces these to the bare filename.

            results[filename] = {
                "total": total,
                "blank": blank,
                "whole_cmt": whole_cmt,
                "embed_cmt": embed_cmt,
                "directive": directive,
                "data_decl": data_decl,
                "exec_inst": exec_inst,
                "logical": logical,
                "physical": physical,
                "full_path": filename,
            }
        except (ValueError, IndexError):
            # Skip malformed lines
            continue

    return results


def find_source_file(filename: str, search_paths: list[Path]) -> Path | None:
    """Find source file in search paths; return None if not found."""
    # Extract just the filename (no path)
    just_filename = Path(filename).name

    for search_path in search_paths:
        if not search_path.exists():
            continue
        # Try exact match by filename
        matches = list(search_path.rglob(just_filename))
        if matches:
            return matches[0]
    return None
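
# NOTE: when the same filename exists under several subdirectories, the first
# rglob() match wins, so name collisions across the search tree can resolve to
# the wrong file.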


def test_batch_validation():
    """Test PyUCC extended counting against UCC results on multiple files."""

    # Parse UCC results
    ucc_outfile = Path(
        r"c:\src\____GitProjects\SXXXXXXX_PyUcc\__UCC\Metrics\DSP-diff\Baseline-A-C_CPP_outfile.txt"
    )

    if not ucc_outfile.exists():
        print(f"ERROR: UCC output file not found: {ucc_outfile}")
        return

    print(">> Parsing UCC results...")
    ucc_results = parse_ucc_outfile(ucc_outfile)
    print(f"   Found {len(ucc_results)} files in UCC output\n")

    # Search paths for source files
    search_paths = [
        Path(r"C:\__temp\Metrics\attuale\REP"),
        Path(r"C:\__temp\Metrics\_25_10\REP"),
    ]

    # Initialize counter
    counter = UCCCompleteCounter(language="C")

    # Statistics
    stats = {
        "total_files": 0,
        "files_found": 0,
        "files_not_found": 0,
        "perfect_matches": 0,
        "good_matches": 0,  # >= 90% on all metrics
        "differences": {
            "total": [],
            "blank": [],
            "whole_cmt": [],
            "embed_cmt": [],
            "directive": [],
            "data_decl": [],
            "exec_inst": [],
            "logical": [],
            "physical": [],
        },
    }

    # Test a sample of files (first 50 for better statistics)
    sample_size = 50
    sample_files = list(ucc_results.items())[:sample_size]

    print(f">> Testing {len(sample_files)} files...\n")
    print("=" * 120)

    for filename, ucc_data in sample_files:
        stats["total_files"] += 1

        # Find source file
        source_file = find_source_file(filename, search_paths)

        if not source_file:
            stats["files_not_found"] += 1
            print(f"WARNING: File not found: {filename}")
            continue

        stats["files_found"] += 1

        # Analyze with PyUCC
        try:
            pyucc_result = counter.analyze_file(source_file)
        except Exception as e:
            print(f"ERROR analyzing {filename}: {e}")
            continue

        # Compare results
        diffs = {
            "total": pyucc_result["total_lines"] - ucc_data["total"],
            "blank": pyucc_result["blank_lines"] - ucc_data["blank"],
            "whole_cmt": pyucc_result["comment_whole"] - ucc_data["whole_cmt"],
            "embed_cmt": pyucc_result["comment_embedded"] - ucc_data["embed_cmt"],
            "directive": pyucc_result["compiler_directives"] - ucc_data["directive"],
            "data_decl": pyucc_result["data_declarations"] - ucc_data["data_decl"],
            "exec_inst": pyucc_result["exec_instructions"] - ucc_data["exec_inst"],
            "logical": pyucc_result["logical_sloc"] - ucc_data["logical"],
            "physical": pyucc_result["physical_sloc"] - ucc_data["physical"],
        }

        # Record differences
        for key, diff in diffs.items():
            stats["differences"][key].append(diff)

        # Check if perfect match
        if all(d == 0 for d in diffs.values()):
            stats["perfect_matches"] += 1
            status = "[PERFECT]"
        else:
            # Check if good match (within 10% on all metrics)
            good_match = True
            for key, diff in diffs.items():
                if ucc_data[key] > 0:
                    accuracy = 100 * (1 - abs(diff) / ucc_data[key])
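                    # e.g. UCC=50 and diff=3 gives 100 * (1 - 3/50) = 94.0%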
                    if accuracy < 90:
                        good_match = False
                        break
                elif diff != 0:
                    good_match = False
                    break

            if good_match:
                stats["good_matches"] += 1
                status = "[GOOD]"
            else:
                status = "[DIFF]"

        # Print comparison
        print(f"\n{status} {filename}")
        print(f"  {'Metric':<15} {'UCC':>6} {'PyUCC':>6} {'Diff':>6} {'Accuracy':>8}")
        print(f"  {'-'*15} {'-'*6} {'-'*6} {'-'*6} {'-'*8}")

        # Map UCC metric keys to the corresponding PyUCC result keys
        key_map = {
            "total": "total_lines",
            "blank": "blank_lines",
            "directive": "compiler_directives",
            "data_decl": "data_declarations",
            "exec_inst": "exec_instructions",
            "logical": "logical_sloc",
            "physical": "physical_sloc",
        }
        for key in key_map:
            ucc_val = ucc_data[key]
            pyucc_val = pyucc_result[key_map[key]]
            diff = diffs[key]

            if ucc_val > 0:
                accuracy = 100 * (1 - abs(diff) / ucc_val)
                acc_str = f"{accuracy:.1f}%"
            else:
                acc_str = "N/A" if diff == 0 else "ERROR"

            print(f"  {key:<15} {ucc_val:>6} {pyucc_val:>6} {diff:>+6} {acc_str:>8}")

    # Print summary statistics
    print("\n" + "=" * 120)
    print("\n>> SUMMARY STATISTICS")
    print(f"\n   Files processed: {stats['files_found']}/{stats['total_files']}")
    print(f"   Files not found: {stats['files_not_found']}")
    print(
        f"   Perfect matches (all diffs = 0): {stats['perfect_matches']} ({100*stats['perfect_matches']/max(1,stats['files_found']):.1f}%)"
    )
    print(
        f"   Good matches (all metrics >= 90%): {stats['good_matches']} ({100*stats['good_matches']/max(1,stats['files_found']):.1f}%)"
    )

    if stats["files_found"] > 0:
        print("\n   Average differences:")
        for key, diff_list in stats["differences"].items():
            if diff_list:
                avg_diff = sum(diff_list) / len(diff_list)
                avg_abs_diff = sum(abs(d) for d in diff_list) / len(diff_list)
                print(
                    f"      {key:<15}: avg={avg_diff:>+7.1f}  abs_avg={avg_abs_diff:>6.1f}"
                )

    print("\n" + "=" * 120)

    # Overall assessment
    if stats["files_found"] > 0:
        # Calculate overall accuracy by metric
        print("\n   Accuracy by metric (based on absolute average difference):")
        metric_keys = [
            "total",
            "blank",
            "directive",
            "physical",
            "whole_cmt",
            "embed_cmt",
            "data_decl",
            "exec_inst",
            "logical",
        ]

        total_accuracy = 0
        for key in metric_keys:
            diff_list = stats["differences"][key]
            if diff_list:
                avg_abs = sum(abs(d) for d in diff_list) / len(diff_list)
                # Heuristic accuracy score: 100% when the average absolute
                # difference is 0, minus 2% per unit of difference
                # (so an avg abs diff of 10 costs 20%).
                accuracy = max(0, 100 - (avg_abs * 2))
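                # e.g. avg_abs = 2.5 -> 100 - 5.0 = 95.0% for this metric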
                total_accuracy += accuracy
                print(
                    f"      {key:15s}: {accuracy:>5.1f}%  (avg abs diff: {avg_abs:>5.1f})"
                )

        overall_accuracy = total_accuracy / len(metric_keys)
        print(f"\n   Overall accuracy: {overall_accuracy:.1f}%")

        if overall_accuracy >= 95:
            print(
                f"\n>> EXCELLENT! {overall_accuracy:.1f}% overall accuracy - PyUCC matches UCC very well!"
            )
        elif overall_accuracy >= 90:
            print(
                f"\n>> GOOD! {overall_accuracy:.1f}% overall accuracy - PyUCC is close to UCC"
            )
        elif overall_accuracy >= 85:
            print(
                f"\n>> FAIR: {overall_accuracy:.1f}% overall accuracy - Some improvements needed"
            )
        else:
            print(
                f"\n>> NEEDS WORK: {overall_accuracy:.1f}% overall accuracy - Significant differences"
            )


if __name__ == "__main__":
    test_batch_validation()
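
# NOTE: the test_* naming also lets pytest collect test_batch_validation()
# directly (assuming pytest is used in this repo); running the module as a
# script exercises the same path via the __main__ guard above.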