"""Complete UCC-compatible counter with full preprocessing pipeline. This module implements the complete UCC counting flow: 1. PreCountProcess - Remove strings, normalize whitespace 2. CountCommentsSLOC - Remove all comments (block and line) 3. CountBlankSLOC - Identify blank lines 4. CountDirectiveSLOC - Extract and count directives 5. LanguageSpecificProcess - LSLOC state machine with keyword classification Target: 90-95% accuracy matching UCC v.2018.07 for C/C++ """ import re from pathlib import Path from typing import Dict, Any, List, Tuple import logging _LOG = logging.getLogger(__name__) class UCCCompleteCounter: """Complete UCC-compatible counter with full preprocessing.""" def __init__(self, language: str = "C"): self.language = language.upper() self._setup_language() def _setup_language(self): """Setup language-specific patterns and keywords.""" if self.language in ["C", "C++", "C_CPP"]: self._setup_c_cpp() else: raise NotImplementedError(f"Language {self.language} not yet supported") def _setup_c_cpp(self): """Setup C/C++ specific patterns and keywords from UCC source.""" # Comment patterns self.line_comment_start = "//" self.block_comment_start = "/*" self.block_comment_end = "*/" # String quote patterns self.string_quote = '"' self.char_quote = "'" self.escape_char = "\\" # Continuation line self.continuation = "\\" # Compiler directives (from UCC CCJavaCsScalaCounter.cpp) self.directive_keywords = [ "define", "undef", "if", "ifdef", "ifndef", "else", "elif", "endif", "include", "pragma", "error", "warning", "line", "region", "endregion", ] # Data declaration keywords (from UCC exec_name_list) self.data_keywords = [ "auto", "bool", "char", "class", "const", "double", "enum", "extern", "float", "int", "long", "private", "protected", "public", "register", "short", "signed", "static", "struct", "typedef", "union", "unsigned", "virtual", "void", "volatile", # C++ specific "namespace", "template", "typename", "explicit", # Common types "size_t", "uint8_t", "uint16_t", "uint32_t", "uint64_t", "int8_t", "int16_t", "int32_t", "int64_t", "wchar_t", "ptrdiff_t", ] # Executable instruction keywords (from UCC exec_name_list) self.exec_keywords = [ "break", "case", "catch", "continue", "default", "delete", "do", "else", "for", "goto", "if", "new", "return", "switch", "throw", "try", "while", # Additional "sizeof", "typeid", "const_cast", "dynamic_cast", "reinterpret_cast", "static_cast", ] # For/if/while control structures self.control_keywords = ["for", "if", "while"] def analyze_file(self, file_path: Path) -> Dict[str, Any]: """ Analyze file with complete UCC preprocessing pipeline. Returns dict with UCC-compatible metrics. """ if not file_path.exists(): raise FileNotFoundError(f"File not found: {file_path}") try: with open(file_path, "r", encoding="utf-8", errors="ignore") as f: lines = f.readlines() except Exception as e: _LOG.error(f"Error reading {file_path}: {e}") raise # Store original lines original_lines = lines.copy() total_lines = len(lines) # STEP 1: PreCountProcess - Remove quotes and normalize processed_lines = self._precount_process(lines) # STEP 2: CountBlankSLOC - Identify blank lines (BEFORE removing comments!) blank_lines = self._count_blank_sloc(processed_lines) # STEP 3: CountCommentsSLOC - Remove all comments no_comment_lines, comment_whole, comment_embedded = self._count_comments_sloc( processed_lines, original_lines ) # STEP 4: CountDirectiveSLOC - Extract directives no_directive_lines, directive_count = self._count_directive_sloc( no_comment_lines, original_lines ) # STEP 5: LanguageSpecificProcess - LSLOC with state machine lsloc_result = self._language_specific_process( no_directive_lines, original_lines ) # Calculate physical SLOC (non-blank, non-comment-only) physical_sloc = total_lines - blank_lines - comment_whole return { "total_lines": total_lines, "blank_lines": blank_lines, "comment_whole": comment_whole, "comment_embedded": comment_embedded, "compiler_directives": directive_count, "data_declarations": lsloc_result["data_decl"], "exec_instructions": lsloc_result["exec_inst"], "logical_sloc": lsloc_result["logical_sloc"], "physical_sloc": physical_sloc, } def _precount_process(self, lines: List[str]) -> List[str]: """ PreCountProcess: Remove string literals and normalize. Replaces quoted strings with empty quotes to avoid counting keywords/terminators inside strings. """ processed = [] for line in lines: # Remove string literals but keep the quotes cleaned = self._remove_string_literals(line) processed.append(cleaned) return processed def _remove_string_literals(self, line: str) -> str: """Remove content of string and char literals, keep quotes.""" result = [] i = 0 while i < len(line): char = line[i] # Check for string literal if char == self.string_quote: result.append(char) i += 1 # Skip until closing quote or end of line while i < len(line): if line[i] == self.escape_char and i + 1 < len(line): # Skip escaped character i += 2 elif line[i] == self.string_quote: result.append(line[i]) i += 1 break else: # Don't include string content i += 1 # Check for char literal elif char == self.char_quote: result.append(char) i += 1 # Skip until closing quote or end of line while i < len(line): if line[i] == self.escape_char and i + 1 < len(line): # Skip escaped character i += 2 elif line[i] == self.char_quote: result.append(line[i]) i += 1 break else: # Don't include char content i += 1 else: result.append(char) i += 1 return "".join(result) def _count_comments_sloc( self, lines: List[str], original_lines: List[str] ) -> Tuple[List[str], int, int]: """ CountCommentsSLOC: Remove all comments and count whole/embedded. UCC counts EVERY line in a multi-line block comment as comment_whole. """ no_comment_lines = [] comment_whole = 0 comment_embedded = 0 in_block_comment = False for i, line in enumerate(lines): original_stripped = line.strip() cleaned = line # Handle being inside a block comment from previous line if in_block_comment: # Count this continuation line as comment_whole comment_whole += 1 end_pos = cleaned.find(self.block_comment_end) if end_pos != -1: # Block comment ends after_comment = cleaned[ end_pos + len(self.block_comment_end) : ].strip() cleaned = cleaned[end_pos + len(self.block_comment_end) :] in_block_comment = False # If has code after, it's embedded if after_comment: comment_embedded += 1 # But we already counted as whole above, so subtract 1 comment_whole -= 1 else: # Still in block comment cleaned = "" no_comment_lines.append(cleaned) continue # Check for whole line comments (line or block) if original_stripped.startswith(self.line_comment_start): comment_whole += 1 no_comment_lines.append("") continue if original_stripped.startswith(self.block_comment_start): # Block comment starting at line beginning comment_whole += 1 end_pos = cleaned.find(self.block_comment_end) if end_pos != -1: # Block comment ends on same line after_comment = cleaned[ end_pos + len(self.block_comment_end) : ].strip() if after_comment: # Has code after - it's embedded, not whole comment_embedded += 1 comment_whole -= 1 cleaned = cleaned[end_pos + len(self.block_comment_end) :] else: cleaned = "" else: # Block comment continues to next line in_block_comment = True cleaned = "" no_comment_lines.append(cleaned) continue # Check for embedded comments line_comment_pos = cleaned.find(self.line_comment_start) block_comment_pos = cleaned.find(self.block_comment_start) # Find first comment first_comment_pos = -1 if line_comment_pos != -1 and block_comment_pos != -1: first_comment_pos = min(line_comment_pos, block_comment_pos) elif line_comment_pos != -1: first_comment_pos = line_comment_pos elif block_comment_pos != -1: first_comment_pos = block_comment_pos if first_comment_pos != -1: code_before = cleaned[:first_comment_pos].strip() if code_before: comment_embedded += 1 else: comment_whole += 1 # Remove comment if first_comment_pos == line_comment_pos: cleaned = cleaned[:line_comment_pos] else: # Block comment end_pos = cleaned.find( self.block_comment_end, block_comment_pos + len(self.block_comment_start), ) if end_pos != -1: cleaned = ( cleaned[:block_comment_pos] + cleaned[end_pos + len(self.block_comment_end) :] ) else: cleaned = cleaned[:block_comment_pos] in_block_comment = True no_comment_lines.append(cleaned) return no_comment_lines, comment_whole, comment_embedded def _count_blank_sloc(self, lines: List[str]) -> int: """Count blank lines (lines with no code after comment removal).""" blank_count = 0 for line in lines: if not line.strip(): blank_count += 1 return blank_count def _count_directive_sloc( self, lines: List[str], original_lines: List[str] ) -> Tuple[List[str], int]: """ CountDirectiveSLOC: Extract and count compiler directives. Returns: - Lines with directives blanked - Count of directive statements (logical) """ no_directive_lines = [] directive_count = 0 in_directive = False directive_statement = "" for i, line in enumerate(lines): stripped = line.lstrip() # Check if this is a directive line if stripped.startswith("#"): # Check if it's a recognized directive is_directive = False for keyword in self.directive_keywords: if re.match(r"#\s*" + keyword + r"\b", stripped): is_directive = True break if is_directive: in_directive = True directive_statement += stripped # Check for continuation if stripped.rstrip().endswith(self.continuation): # Directive continues on next line no_directive_lines.append("") continue else: # Directive complete directive_count += 1 directive_statement = "" in_directive = False no_directive_lines.append("") continue elif in_directive: # Continuation of directive directive_statement += stripped if stripped.rstrip().endswith(self.continuation): no_directive_lines.append("") continue else: # Directive complete directive_count += 1 directive_statement = "" in_directive = False no_directive_lines.append("") continue no_directive_lines.append(line) return no_directive_lines, directive_count def _language_specific_process( self, lines: List[str], original_lines: List[str] ) -> Dict[str, int]: """ LanguageSpecificProcess: LSLOC counting with state machine. Implements UCC's LSLOC algorithm with: - Statement terminator detection (;, {, }) - Parenthesis tracking for for/if/while - Keyword-based data vs exec classification - Multi-line statement accumulation """ data_decl = 0 exec_inst = 0 logical_sloc = 0 # State machine variables (maintained across lines) paren_count = 0 brace_count = 0 bracket_count = 0 # For arrays [] for_flag = False found_forifwhile = False statement_buffer = "" for line in lines: stripped = line.strip() if not stripped: continue # Process each character looking for terminators i = 0 while i < len(stripped): char = stripped[i] statement_buffer += char # Track brackets, parentheses, braces if char == "[": bracket_count += 1 elif char == "]": bracket_count = max(0, bracket_count - 1) elif char == "(": paren_count += 1 # Check if this starts a for/if/while # Look for keyword before the ( before_paren = statement_buffer[ : statement_buffer.rfind("(") ].strip() words = before_paren.split() if words and words[-1] in self.control_keywords: found_forifwhile = True if words[-1] == "for": for_flag = True elif char == ")": paren_count = max(0, paren_count - 1) # If for/if/while condition closed, count it as exec if paren_count == 0 and found_forifwhile: logical_sloc += 1 exec_inst += 1 found_forifwhile = False for_flag = False # Check for statement terminators elif char == ";": if paren_count == 0 and bracket_count == 0: # End of statement stmt = statement_buffer.strip() if stmt and len(stmt) > 1: # Not just ; # Remove trailing ; stmt = stmt[:-1].strip() if stmt: logical_sloc += 1 # Classify as data or exec if self._is_data_declaration(stmt): data_decl += 1 else: exec_inst += 1 statement_buffer = "" for_flag = False elif char == "{": brace_count += 1 if paren_count == 0 and bracket_count == 0: # Start of block stmt = statement_buffer.strip()[:-1].strip() # Remove { if stmt and not found_forifwhile: logical_sloc += 1 if self._is_data_declaration(stmt): data_decl += 1 else: exec_inst += 1 statement_buffer = "" found_forifwhile = False elif char == "}": brace_count = max(0, brace_count - 1) if paren_count == 0 and bracket_count == 0: # End of block stmt = statement_buffer.strip()[:-1].strip() # Remove } if stmt: logical_sloc += 1 if self._is_data_declaration(stmt): data_decl += 1 else: exec_inst += 1 statement_buffer = "" i += 1 # Handle any remaining statement if statement_buffer.strip(): logical_sloc += 1 if self._is_data_declaration(statement_buffer): data_decl += 1 else: exec_inst += 1 return { "data_decl": data_decl, "exec_inst": exec_inst, "logical_sloc": logical_sloc, } def _is_data_declaration(self, statement: str) -> bool: """ Determine if statement is a data declaration or executable instruction. Uses keyword matching similar to UCC. """ stmt_lower = statement.lower() # Remove common prefixes stmt_lower = re.sub( r"^\s*(public|private|protected|static|extern|const|volatile)\s+", "", stmt_lower, ) # Check for data keywords for keyword in self.data_keywords: if re.search(r"\b" + keyword + r"\b", stmt_lower): return True # Check for exec keywords (takes precedence) for keyword in self.exec_keywords: if re.search(r"\b" + keyword + r"\b", stmt_lower): return False # Check for function call pattern (name followed by parenthesis) if re.search(r"\w+\s*\(", statement): # Could be function call (exec) or function declaration (data) # If no type keyword before, likely a call has_type = any( re.search(r"\b" + kw + r"\b", stmt_lower) for kw in self.data_keywords ) return has_type # Check for assignment (likely exec) if "=" in statement and not "==" in statement: # Could be initialization or assignment # If has type keyword, it's data declaration with initialization has_type = any( re.search(r"\b" + kw + r"\b", stmt_lower) for kw in self.data_keywords ) return has_type # Default: if has pointer or array, likely data if "*" in statement or "[" in statement: return True # Default to exec return False def analyze_file_ucc_complete(file_path: Path) -> Dict[str, Any]: """Convenience function to analyze a file with complete UCC counter.""" counter = UCCCompleteCounter(language="C") return counter.analyze_file(file_path)