# SXXXXXXX_PyUCC/pyucc/core/ucc_extended_counting.py

"""Extended counting module to match UCC detailed metrics.
This module provides counting capabilities that match UCC's detailed breakdown:
- Whole vs Embedded comments
- Compiler directives
- Data declarations
- Executable instructions
- Logical SLOC (statement-based counting)
"""
import logging
import re
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

_LOG = logging.getLogger(__name__)


class UCCExtendedCounter:
    """Extended counter that provides UCC-compatible detailed metrics."""

    def __init__(self, language: str = "C"):
        self.language = language.lower()
        self._setup_patterns()

    def _setup_patterns(self):
        """Set up regex patterns based on language."""
        if self.language in ["c", "c++"]:
            self._setup_c_cpp_patterns()
        elif self.language == "python":
            self._setup_python_patterns()
        else:
            self._setup_generic_patterns()

    def _setup_c_cpp_patterns(self):
        """Set up patterns for C/C++."""
        # Compiler directives
        self.directive_pattern = re.compile(
            r'^\s*#\s*(include|define|undef|ifdef|ifndef|endif|if|elif|else|'
            r'pragma|error|warning|line)\b',
            re.IGNORECASE
        )
        # Data declarations (simplified - a real implementation needs more sophisticated parsing)
        self.data_decl_pattern = re.compile(
            r'^\s*(?:extern\s+)?(?:static\s+)?(?:const\s+)?'
            r'(?:unsigned\s+)?(?:signed\s+)?'
            r'(?:int|char|short|long|float|double|void|bool|size_t|uint\w*|int\w*)\s+'
            r'(?:\*\s*)*\w+\s*(?:\[[^\]]*\])?\s*[;=]',
            re.MULTILINE
        )
        # Comments
        self.line_comment_pattern = re.compile(r'//.*$', re.MULTILINE)
        self.block_comment_pattern = re.compile(r'/\*.*?\*/', re.DOTALL)
        # Statement terminators for logical SLOC
        self.statement_terminators = [';', '{', '}']

    def _setup_python_patterns(self):
        """Set up patterns for Python."""
        self.directive_pattern = re.compile(r'^\s*(?:import|from)\s+', re.MULTILINE)
        self.data_decl_pattern = re.compile(
            r'^\s*\w+\s*:\s*(?:int|str|float|bool|list|dict|tuple|set)\s*[=]?',
            re.MULTILINE
        )
        self.line_comment_pattern = re.compile(r'#.*$', re.MULTILINE)
        self.statement_terminators = ['\n']  # Python statements end at newlines

    def _setup_generic_patterns(self):
        """Set up generic fallback patterns."""
        self.directive_pattern = re.compile(r'^\s*#', re.MULTILINE)
        self.data_decl_pattern = None
        self.line_comment_pattern = re.compile(r'#.*$|//.*$', re.MULTILINE)
        self.block_comment_pattern = re.compile(r'/\*.*?\*/|""".*?"""|\'\'\'.*?\'\'\'', re.DOTALL)
        self.statement_terminators = [';']
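    # Illustrative matches for the compiled C/C++ patterns above: directive_pattern
    # matches lines such as "#include <stdio.h>" or "  #pragma once", and
    # data_decl_pattern matches simple declarations such as "static const int count = 3;".
    # Note that these compiled patterns are not consulted by _analyze_lines() below,
    # which uses its own lightweight checks.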
    def analyze_file_extended(self, file_path: Path) -> Dict[str, Any]:
        """
        Analyze a file with UCC-compatible detailed metrics.

        Returns a dict with:
        - total_lines: total lines in the file
        - blank_lines: completely empty lines
        - comment_whole: comments on their own line
        - comment_embedded: comments on the same line as code
        - compiler_directives: preprocessor directive count
        - data_declarations: data/variable declaration count
        - exec_instructions: executable statement count
        - logical_sloc: logical source lines of code
        - physical_sloc: physical source lines of code (non-blank, non-comment-only)
        """
        if not file_path.exists():
            raise FileNotFoundError(f"File not found: {file_path}")
        try:
            with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                lines = f.readlines()
        except Exception as e:
            _LOG.error(f"Failed to read {file_path}: {e}")
            return self._empty_result()
        return self._analyze_lines(lines)
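    # Example of the returned shape (values are illustrative only):
    #   {'total_lines': 120, 'blank_lines': 10, 'comment_whole': 12, 'comment_embedded': 3,
    #    'compiler_directives': 5, 'data_declarations': 8, 'exec_instructions': 60,
    #    'logical_sloc': 73, 'physical_sloc': 95}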
    def _analyze_lines(self, lines: List[str]) -> Dict[str, Any]:
        """Analyze a list of lines and return detailed metrics."""
        result = {
            'total_lines': len(lines),
            'blank_lines': 0,
            'comment_whole': 0,
            'comment_embedded': 0,
            'compiler_directives': 0,
            'data_declarations': 0,
            'exec_instructions': 0,
            'logical_sloc': 0,
            'physical_sloc': 0,
        }
        # First pass: identify block comment boundaries (C/C++-style /* ... */;
        # string literals are not parsed, so this is a heuristic).
        in_block_comment = False
        block_comment_lines = set()
        for line_num, line in enumerate(lines):
            # Track block comment state
            if '/*' in line:
                in_block_comment = True
                block_comment_lines.add(line_num)
            if in_block_comment:
                block_comment_lines.add(line_num)
            if '*/' in line:
                in_block_comment = False
        # Second pass: classify each line
        logical_statements = 0
        for line_num, line in enumerate(lines):
            stripped = line.strip()
            # Count blank lines
            if not stripped:
                result['blank_lines'] += 1
                continue
            # Check whether this line is part of a block comment
            is_in_block = line_num in block_comment_lines
            # Analyze line type
            if is_in_block:
                # Check whether there is also code on this line (embedded comment)
                has_code_before = '/*' in line and line.index('/*') > 0 and line[:line.index('/*')].strip()
                has_code_after = '*/' in line and line.index('*/') < len(line) - 2 and line[line.index('*/') + 2:].strip()
                if has_code_before or has_code_after:
                    result['comment_embedded'] += 1
                    result['physical_sloc'] += 1
                    # Extract and analyze the code part
                    code_part = self._extract_code_from_mixed_line(line)
                    if code_part:
                        self._classify_code_line(code_part, result)
                        logical_statements += self._count_logical_statements(code_part)
                else:
                    # Pure comment line
                    result['comment_whole'] += 1
            elif stripped.startswith('//'):
                # Line comment at the start of the line
                result['comment_whole'] += 1
            elif '//' in stripped:
                # Line has an embedded comment
                code_part = stripped[:stripped.index('//')].strip()
                if code_part:
                    result['comment_embedded'] += 1
                    result['physical_sloc'] += 1
                    self._classify_code_line(code_part, result)
                    logical_statements += self._count_logical_statements(code_part)
                else:
                    # Comment starts after leading whitespace only
                    result['comment_whole'] += 1
            else:
                # Pure code line - no comments
                result['physical_sloc'] += 1
                self._classify_code_line(stripped, result)
                logical_statements += self._count_logical_statements(stripped)
        result['logical_sloc'] = logical_statements
        return result
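    # Worked example: for a C line such as
    #     x = 1; /* set x */
    # the two passes above record one embedded comment, one physical SLOC, one
    # executable instruction, and one logical statement (the single ';'), while a
    # comment sitting alone on its line is counted under comment_whole instead.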
    def _find_comment_regions(self, content: str) -> List[Tuple[int, int]]:
        """Find all block comment regions as (start_line, end_line) pairs."""
        regions = []
        if not hasattr(self, 'block_comment_pattern'):
            return regions
        for match in self.block_comment_pattern.finditer(content):
            start_pos = match.start()
            end_pos = match.end()
            # Convert character positions to line numbers
            start_line = content[:start_pos].count('\n')
            end_line = content[:end_pos].count('\n')
            regions.append((start_line, end_line))
        return regions

    def _is_line_in_comment_region(self, line_num: int, regions: List[Tuple[int, int]]) -> bool:
        """Check whether a line is within a block comment region."""
        for start, end in regions:
            if start <= line_num <= end:
                return True
        return False

    def _extract_code_part(self, line: str) -> str:
        """Extract the code part from a line with an embedded comment."""
        # Remove line comments
        if '//' in line:
            line = line[:line.index('//')]
        # Remove inline block comments (simple case)
        line = re.sub(r'/\*.*?\*/', '', line)
        return line.strip()

    def _extract_code_from_mixed_line(self, line: str) -> str:
        """Extract code from a line that has both code and block comments."""
        result = line
        # Remove block comment parts
        if '/*' in result and '*/' in result:
            # Inline block comment
            start = result.index('/*')
            end = result.index('*/') + 2
            result = result[:start] + result[end:]
        elif '/*' in result:
            # Comment starts on this line
            result = result[:result.index('/*')]
        elif '*/' in result:
            # Comment ends on this line
            result = result[result.index('*/') + 2:]
        return result.strip()
    def _classify_code_line(self, code: str, result: Dict):
        """Classify a code line as a directive, data declaration, or executable statement."""
        # Empty code doesn't count
        if not code or code == ';':
            return
        # Compiler directive (must be at the start of the code, ignoring whitespace)
        if code.lstrip().startswith('#'):
            result['compiler_directives'] += 1
            return
        # Type/class/struct/enum definition or typedef
        if re.match(r'^(?:typedef|struct|class|enum|union)\s+', code, re.IGNORECASE):
            result['data_declarations'] += 1
            return
        # Check for variable declarations (heuristic patterns for C/C++)
        is_declaration = False
        # Common type keywords
        type_keywords = [
            'int', 'char', 'short', 'long', 'float', 'double', 'void', 'bool',
            'unsigned', 'signed', 'const', 'static', 'extern', 'volatile',
            'size_t', 'uint8_t', 'uint16_t', 'uint32_t', 'uint64_t',
            'int8_t', 'int16_t', 'int32_t', 'int64_t'
        ]
        # Check whether the line starts with a type keyword (after storage class specifiers)
        words = code.split()
        if words:
            # Skip storage class specifiers
            first_word = words[0]
            if first_word in ['static', 'extern', 'const', 'volatile']:
                words = words[1:] if len(words) > 1 else []
            if words and words[0].lower() in type_keywords:
                # Likely a declaration if it has '=' or ends with ';'
                if '=' in code or code.rstrip().endswith(';'):
                    is_declaration = True
        # Check for function declarations (ending with ");")
        if re.search(r'\([^)]*\)\s*;', code):
            is_declaration = True
        # Check for pointer/array declarations
        if re.search(r'\*\s*\w+\s*[;=\[]', code) or re.search(r'\w+\s*\[[^\]]*\]', code):
            # ...but not if it is an assignment to an existing variable
            if not re.match(r'^\s*\w+\s*\[', code):  # not plain array access
                is_declaration = True
        if is_declaration:
            result['data_declarations'] += 1
            return
        # Default: executable instruction.
        # It must have actual content (not just braces or semicolons).
        has_executable_content = False
        # Control flow keywords (word-boundary match so identifiers like "gift" don't count)
        if re.search(r'\b(?:if|else|while|for|do|switch|case|return|break|continue|goto)\b', code):
            has_executable_content = True
        # Function calls (identifier followed by parentheses)
        if re.search(r'\w+\s*\(', code):
            has_executable_content = True
        # Assignments
        if '=' in code and '==' not in code:
            has_executable_content = True
        # Braces (block delimiters count as executable)
        if '{' in code or '}' in code:
            has_executable_content = True
        # A standalone semicolon only ends a previous statement
        if code.strip() == ';':
            has_executable_content = False
        if has_executable_content:
            result['exec_instructions'] += 1
    def _count_logical_statements(self, code: str) -> int:
        """
        Count logical statements in a line of code.

        For C/C++ this counts statement terminators: semicolons and braces.
        """
        if self.language in ["c", "c++"]:
            count = 0
            count += code.count(';')
            count += code.count('{')
            count += code.count('}')
            return count
        # For other languages: one statement per non-empty line
        return 1 if code.strip() else 0
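    # Worked example: with language "c", the line
    #     if (x) { y(); }
    # counts as 3 logical statements (one ';' plus the '{' and '}'), while under any
    # other language setting a non-empty line counts as a single statement.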
    def _empty_result(self) -> Dict[str, Any]:
        """Return an empty result dict."""
        return {
            'total_lines': 0,
            'blank_lines': 0,
            'comment_whole': 0,
            'comment_embedded': 0,
            'compiler_directives': 0,
            'data_declarations': 0,
            'exec_instructions': 0,
            'logical_sloc': 0,
            'physical_sloc': 0,
        }


def analyze_file_ucc_style(file_path: Path, language: Optional[str] = None) -> Dict[str, Any]:
    """
    Analyze a file with UCC-style detailed metrics.

    Args:
        file_path: Path to the source file.
        language: Language hint (auto-detected from the file extension if None).

    Returns:
        Dict with detailed UCC-compatible metrics.
    """
    if language is None:
        # Auto-detect from the file extension
        ext = file_path.suffix.lower()
        if ext in ['.c', '.h']:
            language = 'C'
        elif ext in ['.cpp', '.cc', '.cxx', '.hpp', '.hh']:
            language = 'C++'
        elif ext == '.py':
            language = 'Python'
        else:
            language = 'generic'
    counter = UCCExtendedCounter(language)
    result = counter.analyze_file_extended(file_path)
    result['language'] = language
    result['file'] = str(file_path)
    return result


def format_ucc_table_line(result: Dict[str, Any], file_label: Optional[str] = None) -> str:
    """
    Format a result as a UCC-style table line.

    Args:
        result: Result dict from analyze_file_ucc_style().
        file_label: Optional custom file label (defaults to the basename of result['file']).

    Returns:
        Formatted string matching the UCC output format.
    """
    if file_label is None:
        file_label = Path(result.get('file', 'unknown')).name
    return (
        f" {result['total_lines']:4} {result['blank_lines']:3} |"
        f" {result['comment_whole']:3} {result['comment_embedded']:3} |"
        f" {result['compiler_directives']:3} {result['data_declarations']:3} {result['exec_instructions']:3} |"
        f" {result['logical_sloc']:3} {result['physical_sloc']:3} |"
        f" CODE {file_label}"
    )
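
# Illustrative example (the numbers are made up; the spacing follows the format string above):
#   format_ucc_table_line({'total_lines': 120, 'blank_lines': 10, 'comment_whole': 12,
#                          'comment_embedded': 3, 'compiler_directives': 5,
#                          'data_declarations': 8, 'exec_instructions': 60,
#                          'logical_sloc': 73, 'physical_sloc': 95, 'file': 'main.c'})
# returns:
#   '  120  10 |  12   3 |   5   8  60 |  73  95 | CODE main.c'
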
def format_ucc_table_header() -> str:
    """Return the UCC-style table header."""
    return """ Total Blank | Comments | Compiler Data Exec. | Logical Physical | File Module
Lines Lines | Whole Embedded | Direct. Decl. Instr. | SLOC SLOC | Type Name
-----------------+------------------+-------------------------+------------------+---------------------------"""
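

# Minimal usage sketch. Assumes this module is run directly with a source-file path as
# the first command-line argument; it is illustrative only and not required by callers.
if __name__ == "__main__":
    import sys

    if len(sys.argv) > 1:
        metrics = analyze_file_ucc_style(Path(sys.argv[1]))
        print(format_ucc_table_header())
        print(format_ucc_table_line(metrics))
    else:
        print("usage: python ucc_extended_counting.py <source-file>")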