"""Batch validation test comparing PyUCC extended counting with UCC results. Reads UCC output files and compares with PyUCC results on the same files. """ from pathlib import Path import sys import re sys.path.insert(0, str(Path(__file__).parent.parent)) from pyucc.core.ucc_complete_counter import UCCCompleteCounter def parse_ucc_outfile(outfile_path: Path): """Parse UCC output file and extract file metrics. Returns: dict: {filename: {total, blank, whole_cmt, embed_cmt, directive, data_decl, exec_inst, logical, physical}} """ results = {} with open(outfile_path, "r", encoding="utf-8", errors="ignore") as f: lines = f.readlines() # Skip header lines until we find the separator line start_idx = 0 for i, line in enumerate(lines): if "---" in line and "++" in line: start_idx = i + 1 break # Parse data lines for line in lines[start_idx:]: line = line.strip() if not line or line.startswith("---") or "Total" in line or "Number" in line: continue # Parse line format: # Total Blank | Whole Embedded | Compiler Data Exec. | Logical Physical | Type Name parts = line.split("|") if len(parts) < 5: continue try: # Part 0: Total and Blank total_blank = parts[0].strip().split() total = int(total_blank[0]) blank = int(total_blank[1]) # Part 1: Comments comments = parts[1].strip().split() whole_cmt = int(comments[0]) embed_cmt = int(comments[1]) # Part 2: Directives, Data, Exec metrics = parts[2].strip().split() directive = int(metrics[0]) data_decl = int(metrics[1]) exec_inst = int(metrics[2]) # Part 3: Logical and Physical SLOC sloc = parts[3].strip().split() logical = int(sloc[0]) physical = int(sloc[1]) # Part 4: File type and name file_info = parts[4].strip().split(None, 1) if len(file_info) < 2: continue file_type = file_info[0] filename = file_info[1] # Clean filename (remove path prefixes like _25_10\REP\...) 

def find_source_file(filename: str, search_paths: list[Path]) -> Path | None:
    """Find a source file in the given search paths."""
    # Extract just the filename (no path)
    just_filename = Path(filename).name

    for search_path in search_paths:
        if not search_path.exists():
            continue

        # Try exact match by filename
        matches = list(search_path.rglob(just_filename))
        if matches:
            return matches[0]

    return None


def test_batch_validation():
    """Test PyUCC extended counting against UCC results on multiple files."""
    # Parse UCC results
    ucc_outfile = Path(
        r"c:\src\____GitProjects\SXXXXXXX_PyUcc\__UCC\Metrics\DSP-diff\Baseline-A-C_CPP_outfile.txt"
    )

    if not ucc_outfile.exists():
        print(f"ERROR: UCC output file not found: {ucc_outfile}")
        return

    print(">> Parsing UCC results...")
    ucc_results = parse_ucc_outfile(ucc_outfile)
    print(f" Found {len(ucc_results)} files in UCC output\n")

    # Search paths for source files
    search_paths = [
        Path(r"C:\__temp\Metrics\attuale\REP"),
        Path(r"C:\__temp\Metrics\_25_10\REP"),
    ]

    # Initialize counter
    counter = UCCCompleteCounter(language="C")

    # Statistics
    stats = {
        "total_files": 0,
        "files_found": 0,
        "files_not_found": 0,
        "perfect_matches": 0,
        "good_matches": 0,  # >= 90% on all metrics
        "differences": {
            "total": [],
            "blank": [],
            "whole_cmt": [],
            "embed_cmt": [],
            "directive": [],
            "data_decl": [],
            "exec_inst": [],
            "logical": [],
            "physical": [],
        },
    }

    # Test a sample of files (first 50 for better statistics)
    sample_size = 50
    sample_files = list(ucc_results.items())[:sample_size]

    print(f">> Testing {len(sample_files)} files...\n")
    print("=" * 120)

    for filename, ucc_data in sample_files:
        stats["total_files"] += 1

        # Find source file
        source_file = find_source_file(filename, search_paths)
        if not source_file:
            stats["files_not_found"] += 1
            print(f"WARNING: File not found: {filename}")
            continue

        stats["files_found"] += 1

        # Analyze with PyUCC
        try:
            pyucc_result = counter.analyze_file(source_file)
        except Exception as e:
            print(f"ERROR analyzing {filename}: {e}")
            continue

        # Compare results
        diffs = {
            "total": pyucc_result["total_lines"] - ucc_data["total"],
            "blank": pyucc_result["blank_lines"] - ucc_data["blank"],
            "whole_cmt": pyucc_result["comment_whole"] - ucc_data["whole_cmt"],
            "embed_cmt": pyucc_result["comment_embedded"] - ucc_data["embed_cmt"],
            "directive": pyucc_result["compiler_directives"] - ucc_data["directive"],
            "data_decl": pyucc_result["data_declarations"] - ucc_data["data_decl"],
            "exec_inst": pyucc_result["exec_instructions"] - ucc_data["exec_inst"],
            "logical": pyucc_result["logical_sloc"] - ucc_data["logical"],
            "physical": pyucc_result["physical_sloc"] - ucc_data["physical"],
        }

        # Record differences
        for key, diff in diffs.items():
            stats["differences"][key].append(diff)

        # Check if perfect match
        if all(d == 0 for d in diffs.values()):
            stats["perfect_matches"] += 1
            status = "[PERFECT]"
        else:
            # Check if good match (within 10% on all metrics)
            good_match = True
            for key, diff in diffs.items():
                if ucc_data[key] > 0:
                    accuracy = 100 * (1 - abs(diff) / ucc_data[key])
                    if accuracy < 90:
                        good_match = False
                        break
                elif diff != 0:
                    good_match = False
                    break

            if good_match:
                stats["good_matches"] += 1
                status = "[GOOD]"
            else:
                status = "[DIFF]"
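
        # Worked example of the classification above (numbers invented): if UCC reports
        # logical SLOC = 50 and PyUCC reports 47, the diff is -3 and the per-metric
        # accuracy is 100 * (1 - 3 / 50) = 94.0%, which clears the 90% [GOOD] threshold
        # but not [PERFECT], since that requires every diff to be exactly 0.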
"[DIFF]" # Print comparison print(f"\n{status} {filename}") print(f" {'Metric':<15} {'UCC':>6} {'PyUCC':>6} {'Diff':>6} {'Accuracy':>8}") print(f" {'-'*15} {'-'*6} {'-'*6} {'-'*6} {'-'*8}") for key in [ "total", "blank", "directive", "data_decl", "exec_inst", "logical", "physical", ]: ucc_val = ucc_data[key] pyucc_val = pyucc_result[ { "total": "total_lines", "blank": "blank_lines", "directive": "compiler_directives", "data_decl": "data_declarations", "exec_inst": "exec_instructions", "logical": "logical_sloc", "physical": "physical_sloc", }[key] ] diff = diffs[key] if ucc_val > 0: accuracy = 100 * (1 - abs(diff) / ucc_val) acc_str = f"{accuracy:.1f}%" else: acc_str = "N/A" if diff == 0 else "ERROR" print(f" {key:<15} {ucc_val:>6} {pyucc_val:>6} {diff:>+6} {acc_str:>8}") # Print summary statistics print("\n" + "=" * 120) print(f"\n>> SUMMARY STATISTICS") print(f"\n Files processed: {stats['files_found']}/{stats['total_files']}") print(f" Files not found: {stats['files_not_found']}") print( f" Perfect matches (all <=2 diff): {stats['perfect_matches']} ({100*stats['perfect_matches']/max(1,stats['files_found']):.1f}%)" ) print( f" Good matches (all >=90%): {stats['good_matches']} ({100*stats['good_matches']/max(1,stats['files_found']):.1f}%)" ) if stats["files_found"] > 0: print(f"\n Average differences:") for key, diffs in stats["differences"].items(): if diffs: avg_diff = sum(diffs) / len(diffs) avg_abs_diff = sum(abs(d) for d in diffs) / len(diffs) print( f" {key:<15}: avg={avg_diff:>+7.1f} abs_avg={avg_abs_diff:>6.1f}" ) print("\n" + "=" * 120) # Overall assessment if stats["files_found"] > 0: success_rate = ( 100 * (stats["perfect_matches"] + stats["good_matches"]) / stats["files_found"] ) # Calculate overall accuracy by metric print(f"\n Accuracy by metric (based on absolute average difference):") metric_keys = [ "total", "blank", "directive", "physical", "whole_cmt", "embed_cmt", "data_decl", "exec_inst", "logical", ] total_accuracy = 0 for key in metric_keys: diffs = stats["differences"][key] if diffs: avg_abs = sum(abs(d) for d in diffs) / len(diffs) # Calculate accuracy: if avg diff is 0, accuracy is 100% # For each unit of difference, reduce accuracy proportionally # Assume reasonable scale: 10 diff = 10% loss accuracy = max(0, 100 - (avg_abs * 2)) # 2% loss per unit diff total_accuracy += accuracy print( f" {key:15s}: {accuracy:>5.1f}% (avg abs diff: {avg_abs:>5.1f})" ) overall_accuracy = total_accuracy / len(metric_keys) print(f"\n Overall accuracy: {overall_accuracy:.1f}%") if overall_accuracy >= 95: print( f"\n>> EXCELLENT! {overall_accuracy:.1f}% overall accuracy - PyUCC matches UCC very well!" ) elif overall_accuracy >= 90: print( f"\n>> GOOD! {overall_accuracy:.1f}% overall accuracy - PyUCC is close to UCC" ) elif overall_accuracy >= 85: print( f"\n>> FAIR: {overall_accuracy:.1f}% overall accuracy - Some improvements needed" ) else: print( f"\n>> NEEDS WORK: {overall_accuracy:.1f}% overall accuracy - Significant differences" ) if __name__ == "__main__": test_batch_validation()