"""Extended counting module to match UCC detailed metrics. This module provides counting capabilities that match UCC's detailed breakdown: - Whole vs Embedded comments - Compiler directives - Data declarations - Executable instructions - Logical SLOC (statement-based counting) """ import re from pathlib import Path from typing import Dict, Any, Tuple, List import logging _LOG = logging.getLogger(__name__) class UCCExtendedCounter: """Extended counter that provides UCC-compatible detailed metrics.""" def __init__(self, language: str = "C"): self.language = language.lower() self._setup_patterns() def _setup_patterns(self): """Setup regex patterns based on language.""" if self.language in ["c", "c++"]: self._setup_c_cpp_patterns() elif self.language == "python": self._setup_python_patterns() else: self._setup_generic_patterns() def _setup_c_cpp_patterns(self): """Setup patterns for C/C++.""" # Compiler directives self.directive_pattern = re.compile( r"^\s*#\s*(include|define|undef|ifdef|ifndef|endif|if|elif|else|" r"pragma|error|warning|line)\b", re.IGNORECASE, ) # Data declarations (simplified - real implementation needs more sophisticated parsing) self.data_decl_pattern = re.compile( r"^\s*(?:extern\s+)?(?:static\s+)?(?:const\s+)?" r"(?:unsigned\s+)?(?:signed\s+)?" r"(?:int|char|short|long|float|double|void|bool|size_t|uint\w*|int\w*)\s+" r"(?:\*\s*)*\w+\s*(?:\[[^\]]*\])?\s*[;=]", re.MULTILINE, ) # Comments self.line_comment_pattern = re.compile(r"//.*$", re.MULTILINE) self.block_comment_pattern = re.compile(r"/\*.*?\*/", re.DOTALL) # Statement terminators for logical SLOC self.statement_terminators = [";", "{", "}"] def _setup_python_patterns(self): """Setup patterns for Python.""" self.directive_pattern = re.compile(r"^\s*(?:import|from)\s+", re.MULTILINE) self.data_decl_pattern = re.compile( r"^\s*\w+\s*:\s*(?:int|str|float|bool|list|dict|tuple|set)\s*[=]?", re.MULTILINE, ) self.line_comment_pattern = re.compile(r"#.*$", re.MULTILINE) self.statement_terminators = ["\n"] # Python uses newlines def _setup_generic_patterns(self): """Setup generic fallback patterns.""" self.directive_pattern = re.compile(r"^\s*#", re.MULTILINE) self.data_decl_pattern = None self.line_comment_pattern = re.compile(r"#.*$|//.*$", re.MULTILINE) self.block_comment_pattern = re.compile( r'/\*.*?\*/|""".*?"""|\'\'\'.*?\'\'\'', re.DOTALL ) self.statement_terminators = [";"] def analyze_file_extended(self, file_path: Path) -> Dict[str, Any]: """ Analyze file with UCC-compatible detailed metrics. Returns dict with: - total_lines: Total lines in file - blank_lines: Completely empty lines - comment_whole: Comments on their own line - comment_embedded: Comments on same line as code - compiler_directives: Preprocessor directives count - data_declarations: Data/variable declaration count - exec_instructions: Executable statement count - logical_sloc: Logical source lines of code - physical_sloc: Physical source lines of code (non-blank, non-comment-only) """ if not file_path.exists(): raise FileNotFoundError(f"File not found: {file_path}") try: with open(file_path, "r", encoding="utf-8", errors="ignore") as f: lines = f.readlines() except Exception as e: _LOG.error(f"Failed to read {file_path}: {e}") return self._empty_result() return self._analyze_lines(lines) def _analyze_lines(self, lines: List[str]) -> Dict[str, Any]: """Analyze list of lines and return detailed metrics.""" result = { "total_lines": len(lines), "blank_lines": 0, "comment_whole": 0, "comment_embedded": 0, "compiler_directives": 0, "data_declarations": 0, "exec_instructions": 0, "logical_sloc": 0, "physical_sloc": 0, } # First pass: identify block comment boundaries in_block_comment = False block_comment_lines = set() for line_num, line in enumerate(lines): stripped = line.strip() # Track block comment state if "/*" in line: in_block_comment = True block_comment_lines.add(line_num) if in_block_comment: block_comment_lines.add(line_num) if "*/" in line: in_block_comment = False # Second pass: classify each line logical_statements = 0 for line_num, line in enumerate(lines): stripped = line.strip() # Count blank lines if not stripped: result["blank_lines"] += 1 continue # Check if this line is part of a block comment is_in_block = line_num in block_comment_lines # Analyze line type if is_in_block: # Check if there's also code on this line (embedded comment) has_code_before = ( "/*" in line and line.index("/*") > 0 and line[: line.index("/*")].strip() ) has_code_after = ( "*/" in line and line.index("*/") < len(line) - 2 and line[line.index("*/") + 2 :].strip() ) if has_code_before or has_code_after: result["comment_embedded"] += 1 result["physical_sloc"] += 1 # Extract and analyze code parts code_part = self._extract_code_from_mixed_line(line) if code_part: self._classify_code_line(code_part, result) logical_statements += self._count_logical_statements(code_part) else: # Pure comment line result["comment_whole"] += 1 elif stripped.startswith("//"): # Line comment at start result["comment_whole"] += 1 elif "//" in stripped: # Line has embedded comment code_part = stripped[: stripped.index("//")].strip() if code_part: result["comment_embedded"] += 1 result["physical_sloc"] += 1 self._classify_code_line(code_part, result) logical_statements += self._count_logical_statements(code_part) else: # Comment at start after whitespace result["comment_whole"] += 1 else: # Pure code line - no comments result["physical_sloc"] += 1 self._classify_code_line(stripped, result) logical_statements += self._count_logical_statements(stripped) result["logical_sloc"] = logical_statements return result def _find_comment_regions(self, content: str) -> List[Tuple[int, int]]: """Find all block comment regions (start_line, end_line).""" regions = [] if not hasattr(self, "block_comment_pattern"): return regions for match in self.block_comment_pattern.finditer(content): start_pos = match.start() end_pos = match.end() # Convert byte positions to line numbers start_line = content[:start_pos].count("\n") end_line = content[:end_pos].count("\n") regions.append((start_line, end_line)) return regions def _is_line_in_comment_region( self, line_num: int, regions: List[Tuple[int, int]] ) -> bool: """Check if line is within a block comment region.""" for start, end in regions: if start <= line_num <= end: return True return False def _extract_code_part(self, line: str) -> str: """Extract code part from line with embedded comment.""" # Remove line comments if "//" in line: line = line[: line.index("//")] # Remove inline block comments (simple case) line = re.sub(r"/\*.*?\*/", "", line) return line.strip() def _extract_code_from_mixed_line(self, line: str) -> str: """Extract code from line that has both code and block comments.""" result = line # Remove block comment parts if "/*" in result and "*/" in result: # Inline block comment start = result.index("/*") end = result.index("*/") + 2 result = result[:start] + result[end:] elif "/*" in result: # Comment starts on this line result = result[: result.index("/*")] elif "*/" in result: # Comment ends on this line result = result[result.index("*/") + 2 :] return result.strip() def _classify_code_line(self, code: str, result: Dict): """Classify code line as directive, data declaration, or executable.""" # Empty code doesn't count if not code or code == ";": return # Check compiler directive (must be at start of code, ignoring whitespace) if code.lstrip().startswith("#"): result["compiler_directives"] += 1 return # Check if it's a type/class/struct/enum definition or typedef if re.match(r"^(?:typedef|struct|class|enum|union)\s+", code, re.IGNORECASE): result["data_declarations"] += 1 return # Check for variable declarations (more comprehensive patterns) # Pattern for C/C++ declarations is_declaration = False # Check for common type keywords type_keywords = [ "int", "char", "short", "long", "float", "double", "void", "bool", "unsigned", "signed", "const", "static", "extern", "volatile", "size_t", "uint8_t", "uint16_t", "uint32_t", "uint64_t", "int8_t", "int16_t", "int32_t", "int64_t", ] # Check if line starts with a type keyword (after storage class specifiers) code_lower = code.lower() words = code.split() if words: # Skip storage class specifiers first_word = words[0] if first_word in ["static", "extern", "const", "volatile"]: words = words[1:] if len(words) > 1 else [] if words and words[0].lower() in type_keywords: # Likely a declaration if it has = or ; if "=" in code or code.rstrip().endswith(";"): is_declaration = True # Check for function declarations (ending with );) if re.search(r"\([^)]*\)\s*;", code): is_declaration = True # Check for pointer/array declarations if re.search(r"\*\s*\w+\s*[;=\[]", code) or re.search( r"\w+\s*\[[^\]]*\]", code ): # But not if it's an assignment to existing var if not re.match(r"^\s*\w+\s*\[", code): # Not array access is_declaration = True if is_declaration: result["data_declarations"] += 1 return # Default: executable instruction # Must have actual content (not just braces or semicolons) has_executable_content = False # Check for control flow keywords if any( kw in code for kw in [ "if", "else", "while", "for", "do", "switch", "case", "return", "break", "continue", "goto", ] ): has_executable_content = True # Check for function calls (word followed by parentheses) if re.search(r"\w+\s*\(", code): has_executable_content = True # Check for assignments if "=" in code and not "==" in code: has_executable_content = True # Check for braces (block delimiters count as executable) if "{" in code or "}" in code: has_executable_content = True # Check for standalone semicolon (end of previous statement) if code.strip() == ";": has_executable_content = False if has_executable_content: result["exec_instructions"] += 1 def _count_logical_statements(self, code: str) -> int: """ Count logical statements in code line. For C/C++: count semicolons, braces """ if self.language in ["c", "c++"]: count = 0 count += code.count(";") count += code.count("{") count += code.count("}") return count # For other languages, 1 statement per non-empty line return 1 if code.strip() else 0 def _empty_result(self) -> Dict[str, Any]: """Return empty result dict.""" return { "total_lines": 0, "blank_lines": 0, "comment_whole": 0, "comment_embedded": 0, "compiler_directives": 0, "data_declarations": 0, "exec_instructions": 0, "logical_sloc": 0, "physical_sloc": 0, } def analyze_file_ucc_style(file_path: Path, language: str = None) -> Dict[str, Any]: """ Analyze file with UCC-style detailed metrics. Args: file_path: Path to source file language: Language hint (auto-detected if None) Returns: Dict with detailed UCC-compatible metrics """ if language is None: # Auto-detect from extension ext = file_path.suffix.lower() if ext in [".c", ".h"]: language = "C" elif ext in [".cpp", ".cc", ".cxx", ".hpp", ".hh"]: language = "C++" elif ext == ".py": language = "Python" else: language = "generic" counter = UCCExtendedCounter(language) result = counter.analyze_file_extended(file_path) result["language"] = language result["file"] = str(file_path) return result def format_ucc_table_line(result: Dict[str, Any], file_label: str = None) -> str: """ Format result as UCC-style table line. Args: result: Result dict from analyze_file_ucc_style() file_label: Optional custom file label (default: uses result['file']) Returns: Formatted string matching UCC output format """ if file_label is None: file_label = Path(result.get("file", "unknown")).name return ( f" {result['total_lines']:4} {result['blank_lines']:3} |" f" {result['comment_whole']:3} {result['comment_embedded']:3} |" f" {result['compiler_directives']:3} {result['data_declarations']:3} {result['exec_instructions']:3} |" f" {result['logical_sloc']:3} {result['physical_sloc']:3} |" f" CODE {file_label}" ) def format_ucc_table_header() -> str: """Return UCC-style table header.""" return """ Total Blank | Comments | Compiler Data Exec. | Logical Physical | File Module Lines Lines | Whole Embedded | Direct. Decl. Instr. | SLOC SLOC | Type Name -----------------+------------------+-------------------------+------------------+---------------------------"""