"""Extended counting module to match UCC detailed metrics. This module provides counting capabilities that match UCC's detailed breakdown: - Whole vs Embedded comments - Compiler directives - Data declarations - Executable instructions - Logical SLOC (statement-based counting) """ import re from pathlib import Path from typing import Dict, Any, Tuple, List import logging _LOG = logging.getLogger(__name__) class UCCExtendedCounter: """Extended counter that provides UCC-compatible detailed metrics.""" def __init__(self, language: str = "C"): self.language = language.lower() self._setup_patterns() def _setup_patterns(self): """Setup regex patterns based on language.""" if self.language in ["c", "c++"]: self._setup_c_cpp_patterns() elif self.language == "python": self._setup_python_patterns() else: self._setup_generic_patterns() def _setup_c_cpp_patterns(self): """Setup patterns for C/C++.""" # Compiler directives self.directive_pattern = re.compile( r'^\s*#\s*(include|define|undef|ifdef|ifndef|endif|if|elif|else|' r'pragma|error|warning|line)\b', re.IGNORECASE ) # Data declarations (simplified - real implementation needs more sophisticated parsing) self.data_decl_pattern = re.compile( r'^\s*(?:extern\s+)?(?:static\s+)?(?:const\s+)?' r'(?:unsigned\s+)?(?:signed\s+)?' r'(?:int|char|short|long|float|double|void|bool|size_t|uint\w*|int\w*)\s+' r'(?:\*\s*)*\w+\s*(?:\[[^\]]*\])?\s*[;=]', re.MULTILINE ) # Comments self.line_comment_pattern = re.compile(r'//.*$', re.MULTILINE) self.block_comment_pattern = re.compile(r'/\*.*?\*/', re.DOTALL) # Statement terminators for logical SLOC self.statement_terminators = [';', '{', '}'] def _setup_python_patterns(self): """Setup patterns for Python.""" self.directive_pattern = re.compile(r'^\s*(?:import|from)\s+', re.MULTILINE) self.data_decl_pattern = re.compile( r'^\s*\w+\s*:\s*(?:int|str|float|bool|list|dict|tuple|set)\s*[=]?', re.MULTILINE ) self.line_comment_pattern = re.compile(r'#.*$', re.MULTILINE) self.statement_terminators = ['\n'] # Python uses newlines def _setup_generic_patterns(self): """Setup generic fallback patterns.""" self.directive_pattern = re.compile(r'^\s*#', re.MULTILINE) self.data_decl_pattern = None self.line_comment_pattern = re.compile(r'#.*$|//.*$', re.MULTILINE) self.block_comment_pattern = re.compile(r'/\*.*?\*/|""".*?"""|\'\'\'.*?\'\'\'', re.DOTALL) self.statement_terminators = [';'] def analyze_file_extended(self, file_path: Path) -> Dict[str, Any]: """ Analyze file with UCC-compatible detailed metrics. Returns dict with: - total_lines: Total lines in file - blank_lines: Completely empty lines - comment_whole: Comments on their own line - comment_embedded: Comments on same line as code - compiler_directives: Preprocessor directives count - data_declarations: Data/variable declaration count - exec_instructions: Executable statement count - logical_sloc: Logical source lines of code - physical_sloc: Physical source lines of code (non-blank, non-comment-only) """ if not file_path.exists(): raise FileNotFoundError(f"File not found: {file_path}") try: with open(file_path, 'r', encoding='utf-8', errors='ignore') as f: lines = f.readlines() except Exception as e: _LOG.error(f"Failed to read {file_path}: {e}") return self._empty_result() return self._analyze_lines(lines) def _analyze_lines(self, lines: List[str]) -> Dict[str, Any]: """Analyze list of lines and return detailed metrics.""" result = { 'total_lines': len(lines), 'blank_lines': 0, 'comment_whole': 0, 'comment_embedded': 0, 'compiler_directives': 0, 'data_declarations': 0, 'exec_instructions': 0, 'logical_sloc': 0, 'physical_sloc': 0, } # First pass: identify block comment boundaries in_block_comment = False block_comment_lines = set() for line_num, line in enumerate(lines): stripped = line.strip() # Track block comment state if '/*' in line: in_block_comment = True block_comment_lines.add(line_num) if in_block_comment: block_comment_lines.add(line_num) if '*/' in line: in_block_comment = False # Second pass: classify each line logical_statements = 0 for line_num, line in enumerate(lines): stripped = line.strip() # Count blank lines if not stripped: result['blank_lines'] += 1 continue # Check if this line is part of a block comment is_in_block = line_num in block_comment_lines # Analyze line type if is_in_block: # Check if there's also code on this line (embedded comment) has_code_before = '/*' in line and line.index('/*') > 0 and line[:line.index('/*')].strip() has_code_after = '*/' in line and line.index('*/') < len(line) - 2 and line[line.index('*/') + 2:].strip() if has_code_before or has_code_after: result['comment_embedded'] += 1 result['physical_sloc'] += 1 # Extract and analyze code parts code_part = self._extract_code_from_mixed_line(line) if code_part: self._classify_code_line(code_part, result) logical_statements += self._count_logical_statements(code_part) else: # Pure comment line result['comment_whole'] += 1 elif stripped.startswith('//'): # Line comment at start result['comment_whole'] += 1 elif '//' in stripped: # Line has embedded comment code_part = stripped[:stripped.index('//')].strip() if code_part: result['comment_embedded'] += 1 result['physical_sloc'] += 1 self._classify_code_line(code_part, result) logical_statements += self._count_logical_statements(code_part) else: # Comment at start after whitespace result['comment_whole'] += 1 else: # Pure code line - no comments result['physical_sloc'] += 1 self._classify_code_line(stripped, result) logical_statements += self._count_logical_statements(stripped) result['logical_sloc'] = logical_statements return result def _find_comment_regions(self, content: str) -> List[Tuple[int, int]]: """Find all block comment regions (start_line, end_line).""" regions = [] if not hasattr(self, 'block_comment_pattern'): return regions for match in self.block_comment_pattern.finditer(content): start_pos = match.start() end_pos = match.end() # Convert byte positions to line numbers start_line = content[:start_pos].count('\n') end_line = content[:end_pos].count('\n') regions.append((start_line, end_line)) return regions def _is_line_in_comment_region(self, line_num: int, regions: List[Tuple[int, int]]) -> bool: """Check if line is within a block comment region.""" for start, end in regions: if start <= line_num <= end: return True return False def _extract_code_part(self, line: str) -> str: """Extract code part from line with embedded comment.""" # Remove line comments if '//' in line: line = line[:line.index('//')] # Remove inline block comments (simple case) line = re.sub(r'/\*.*?\*/', '', line) return line.strip() def _extract_code_from_mixed_line(self, line: str) -> str: """Extract code from line that has both code and block comments.""" result = line # Remove block comment parts if '/*' in result and '*/' in result: # Inline block comment start = result.index('/*') end = result.index('*/') + 2 result = result[:start] + result[end:] elif '/*' in result: # Comment starts on this line result = result[:result.index('/*')] elif '*/' in result: # Comment ends on this line result = result[result.index('*/') + 2:] return result.strip() def _classify_code_line(self, code: str, result: Dict): """Classify code line as directive, data declaration, or executable.""" # Empty code doesn't count if not code or code == ';': return # Check compiler directive (must be at start of code, ignoring whitespace) if code.lstrip().startswith('#'): result['compiler_directives'] += 1 return # Check if it's a type/class/struct/enum definition or typedef if re.match(r'^(?:typedef|struct|class|enum|union)\s+', code, re.IGNORECASE): result['data_declarations'] += 1 return # Check for variable declarations (more comprehensive patterns) # Pattern for C/C++ declarations is_declaration = False # Check for common type keywords type_keywords = [ 'int', 'char', 'short', 'long', 'float', 'double', 'void', 'bool', 'unsigned', 'signed', 'const', 'static', 'extern', 'volatile', 'size_t', 'uint8_t', 'uint16_t', 'uint32_t', 'uint64_t', 'int8_t', 'int16_t', 'int32_t', 'int64_t' ] # Check if line starts with a type keyword (after storage class specifiers) code_lower = code.lower() words = code.split() if words: # Skip storage class specifiers first_word = words[0] if first_word in ['static', 'extern', 'const', 'volatile']: words = words[1:] if len(words) > 1 else [] if words and words[0].lower() in type_keywords: # Likely a declaration if it has = or ; if '=' in code or code.rstrip().endswith(';'): is_declaration = True # Check for function declarations (ending with );) if re.search(r'\([^)]*\)\s*;', code): is_declaration = True # Check for pointer/array declarations if re.search(r'\*\s*\w+\s*[;=\[]', code) or re.search(r'\w+\s*\[[^\]]*\]', code): # But not if it's an assignment to existing var if not re.match(r'^\s*\w+\s*\[', code): # Not array access is_declaration = True if is_declaration: result['data_declarations'] += 1 return # Default: executable instruction # Must have actual content (not just braces or semicolons) has_executable_content = False # Check for control flow keywords if any(kw in code for kw in ['if', 'else', 'while', 'for', 'do', 'switch', 'case', 'return', 'break', 'continue', 'goto']): has_executable_content = True # Check for function calls (word followed by parentheses) if re.search(r'\w+\s*\(', code): has_executable_content = True # Check for assignments if '=' in code and not '==' in code: has_executable_content = True # Check for braces (block delimiters count as executable) if '{' in code or '}' in code: has_executable_content = True # Check for standalone semicolon (end of previous statement) if code.strip() == ';': has_executable_content = False if has_executable_content: result['exec_instructions'] += 1 def _count_logical_statements(self, code: str) -> int: """ Count logical statements in code line. For C/C++: count semicolons, braces """ if self.language in ["c", "c++"]: count = 0 count += code.count(';') count += code.count('{') count += code.count('}') return count # For other languages, 1 statement per non-empty line return 1 if code.strip() else 0 def _empty_result(self) -> Dict[str, Any]: """Return empty result dict.""" return { 'total_lines': 0, 'blank_lines': 0, 'comment_whole': 0, 'comment_embedded': 0, 'compiler_directives': 0, 'data_declarations': 0, 'exec_instructions': 0, 'logical_sloc': 0, 'physical_sloc': 0, } def analyze_file_ucc_style(file_path: Path, language: str = None) -> Dict[str, Any]: """ Analyze file with UCC-style detailed metrics. Args: file_path: Path to source file language: Language hint (auto-detected if None) Returns: Dict with detailed UCC-compatible metrics """ if language is None: # Auto-detect from extension ext = file_path.suffix.lower() if ext in ['.c', '.h']: language = 'C' elif ext in ['.cpp', '.cc', '.cxx', '.hpp', '.hh']: language = 'C++' elif ext == '.py': language = 'Python' else: language = 'generic' counter = UCCExtendedCounter(language) result = counter.analyze_file_extended(file_path) result['language'] = language result['file'] = str(file_path) return result def format_ucc_table_line(result: Dict[str, Any], file_label: str = None) -> str: """ Format result as UCC-style table line. Args: result: Result dict from analyze_file_ucc_style() file_label: Optional custom file label (default: uses result['file']) Returns: Formatted string matching UCC output format """ if file_label is None: file_label = Path(result.get('file', 'unknown')).name return ( f" {result['total_lines']:4} {result['blank_lines']:3} |" f" {result['comment_whole']:3} {result['comment_embedded']:3} |" f" {result['compiler_directives']:3} {result['data_declarations']:3} {result['exec_instructions']:3} |" f" {result['logical_sloc']:3} {result['physical_sloc']:3} |" f" CODE {file_label}" ) def format_ucc_table_header() -> str: """Return UCC-style table header.""" return """ Total Blank | Comments | Compiler Data Exec. | Logical Physical | File Module Lines Lines | Whole Embedded | Direct. Decl. Instr. | SLOC SLOC | Type Name -----------------+------------------+-------------------------+------------------+---------------------------"""