427 lines
16 KiB
Python
427 lines
16 KiB
Python
"""Extended counting module to match UCC detailed metrics.
|
|
|
|
This module provides counting capabilities that match UCC's detailed breakdown:
|
|
- Whole vs Embedded comments
|
|
- Compiler directives
|
|
- Data declarations
|
|
- Executable instructions
|
|
- Logical SLOC (statement-based counting)
|
|
"""
|
|
|
|
import re
|
|
from pathlib import Path
|
|
from typing import Dict, Any, Tuple, List
|
|
import logging
|
|
|
|
_LOG = logging.getLogger(__name__)
|
|
|
|
|
|
class UCCExtendedCounter:
|
|
"""Extended counter that provides UCC-compatible detailed metrics."""
|
|
|
|
def __init__(self, language: str = "C"):
|
|
self.language = language.lower()
|
|
self._setup_patterns()
|
|
|
|
def _setup_patterns(self):
|
|
"""Setup regex patterns based on language."""
|
|
if self.language in ["c", "c++"]:
|
|
self._setup_c_cpp_patterns()
|
|
elif self.language == "python":
|
|
self._setup_python_patterns()
|
|
else:
|
|
self._setup_generic_patterns()
|
|
|
|
def _setup_c_cpp_patterns(self):
|
|
"""Setup patterns for C/C++."""
|
|
# Compiler directives
|
|
self.directive_pattern = re.compile(
|
|
r'^\s*#\s*(include|define|undef|ifdef|ifndef|endif|if|elif|else|'
|
|
r'pragma|error|warning|line)\b',
|
|
re.IGNORECASE
|
|
)
|
|
|
|
# Data declarations (simplified - real implementation needs more sophisticated parsing)
|
|
self.data_decl_pattern = re.compile(
|
|
r'^\s*(?:extern\s+)?(?:static\s+)?(?:const\s+)?'
|
|
r'(?:unsigned\s+)?(?:signed\s+)?'
|
|
r'(?:int|char|short|long|float|double|void|bool|size_t|uint\w*|int\w*)\s+'
|
|
r'(?:\*\s*)*\w+\s*(?:\[[^\]]*\])?\s*[;=]',
|
|
re.MULTILINE
|
|
)
|
|
|
|
# Comments
|
|
self.line_comment_pattern = re.compile(r'//.*$', re.MULTILINE)
|
|
self.block_comment_pattern = re.compile(r'/\*.*?\*/', re.DOTALL)
|
|
|
|
# Statement terminators for logical SLOC
|
|
self.statement_terminators = [';', '{', '}']
|
|
|
|
def _setup_python_patterns(self):
|
|
"""Setup patterns for Python."""
|
|
self.directive_pattern = re.compile(r'^\s*(?:import|from)\s+', re.MULTILINE)
|
|
self.data_decl_pattern = re.compile(
|
|
r'^\s*\w+\s*:\s*(?:int|str|float|bool|list|dict|tuple|set)\s*[=]?',
|
|
re.MULTILINE
|
|
)
|
|
self.line_comment_pattern = re.compile(r'#.*$', re.MULTILINE)
|
|
self.statement_terminators = ['\n'] # Python uses newlines
|
|
|
|
def _setup_generic_patterns(self):
|
|
"""Setup generic fallback patterns."""
|
|
self.directive_pattern = re.compile(r'^\s*#', re.MULTILINE)
|
|
self.data_decl_pattern = None
|
|
self.line_comment_pattern = re.compile(r'#.*$|//.*$', re.MULTILINE)
|
|
self.block_comment_pattern = re.compile(r'/\*.*?\*/|""".*?"""|\'\'\'.*?\'\'\'', re.DOTALL)
|
|
self.statement_terminators = [';']
|
|
|
|
def analyze_file_extended(self, file_path: Path) -> Dict[str, Any]:
|
|
"""
|
|
Analyze file with UCC-compatible detailed metrics.
|
|
|
|
Returns dict with:
|
|
- total_lines: Total lines in file
|
|
- blank_lines: Completely empty lines
|
|
- comment_whole: Comments on their own line
|
|
- comment_embedded: Comments on same line as code
|
|
- compiler_directives: Preprocessor directives count
|
|
- data_declarations: Data/variable declaration count
|
|
- exec_instructions: Executable statement count
|
|
- logical_sloc: Logical source lines of code
|
|
- physical_sloc: Physical source lines of code (non-blank, non-comment-only)
|
|
"""
|
|
|
|
if not file_path.exists():
|
|
raise FileNotFoundError(f"File not found: {file_path}")
|
|
|
|
try:
|
|
with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
|
|
lines = f.readlines()
|
|
except Exception as e:
|
|
_LOG.error(f"Failed to read {file_path}: {e}")
|
|
return self._empty_result()
|
|
|
|
return self._analyze_lines(lines)
|
|
|
|
def _analyze_lines(self, lines: List[str]) -> Dict[str, Any]:
|
|
"""Analyze list of lines and return detailed metrics."""
|
|
|
|
result = {
|
|
'total_lines': len(lines),
|
|
'blank_lines': 0,
|
|
'comment_whole': 0,
|
|
'comment_embedded': 0,
|
|
'compiler_directives': 0,
|
|
'data_declarations': 0,
|
|
'exec_instructions': 0,
|
|
'logical_sloc': 0,
|
|
'physical_sloc': 0,
|
|
}
|
|
|
|
# First pass: identify block comment boundaries
|
|
in_block_comment = False
|
|
block_comment_lines = set()
|
|
|
|
for line_num, line in enumerate(lines):
|
|
stripped = line.strip()
|
|
|
|
# Track block comment state
|
|
if '/*' in line:
|
|
in_block_comment = True
|
|
block_comment_lines.add(line_num)
|
|
|
|
if in_block_comment:
|
|
block_comment_lines.add(line_num)
|
|
|
|
if '*/' in line:
|
|
in_block_comment = False
|
|
|
|
# Second pass: classify each line
|
|
logical_statements = 0
|
|
|
|
for line_num, line in enumerate(lines):
|
|
stripped = line.strip()
|
|
|
|
# Count blank lines
|
|
if not stripped:
|
|
result['blank_lines'] += 1
|
|
continue
|
|
|
|
# Check if this line is part of a block comment
|
|
is_in_block = line_num in block_comment_lines
|
|
|
|
# Analyze line type
|
|
if is_in_block:
|
|
# Check if there's also code on this line (embedded comment)
|
|
has_code_before = '/*' in line and line.index('/*') > 0 and line[:line.index('/*')].strip()
|
|
has_code_after = '*/' in line and line.index('*/') < len(line) - 2 and line[line.index('*/') + 2:].strip()
|
|
|
|
if has_code_before or has_code_after:
|
|
result['comment_embedded'] += 1
|
|
result['physical_sloc'] += 1
|
|
|
|
# Extract and analyze code parts
|
|
code_part = self._extract_code_from_mixed_line(line)
|
|
if code_part:
|
|
self._classify_code_line(code_part, result)
|
|
logical_statements += self._count_logical_statements(code_part)
|
|
else:
|
|
# Pure comment line
|
|
result['comment_whole'] += 1
|
|
|
|
elif stripped.startswith('//'):
|
|
# Line comment at start
|
|
result['comment_whole'] += 1
|
|
|
|
elif '//' in stripped:
|
|
# Line has embedded comment
|
|
code_part = stripped[:stripped.index('//')].strip()
|
|
if code_part:
|
|
result['comment_embedded'] += 1
|
|
result['physical_sloc'] += 1
|
|
self._classify_code_line(code_part, result)
|
|
logical_statements += self._count_logical_statements(code_part)
|
|
else:
|
|
# Comment at start after whitespace
|
|
result['comment_whole'] += 1
|
|
|
|
else:
|
|
# Pure code line - no comments
|
|
result['physical_sloc'] += 1
|
|
self._classify_code_line(stripped, result)
|
|
logical_statements += self._count_logical_statements(stripped)
|
|
|
|
result['logical_sloc'] = logical_statements
|
|
|
|
return result
|
|
|
|
def _find_comment_regions(self, content: str) -> List[Tuple[int, int]]:
|
|
"""Find all block comment regions (start_line, end_line)."""
|
|
regions = []
|
|
|
|
if not hasattr(self, 'block_comment_pattern'):
|
|
return regions
|
|
|
|
for match in self.block_comment_pattern.finditer(content):
|
|
start_pos = match.start()
|
|
end_pos = match.end()
|
|
|
|
# Convert byte positions to line numbers
|
|
start_line = content[:start_pos].count('\n')
|
|
end_line = content[:end_pos].count('\n')
|
|
|
|
regions.append((start_line, end_line))
|
|
|
|
return regions
|
|
|
|
def _is_line_in_comment_region(self, line_num: int, regions: List[Tuple[int, int]]) -> bool:
|
|
"""Check if line is within a block comment region."""
|
|
for start, end in regions:
|
|
if start <= line_num <= end:
|
|
return True
|
|
return False
|
|
|
|
def _extract_code_part(self, line: str) -> str:
|
|
"""Extract code part from line with embedded comment."""
|
|
# Remove line comments
|
|
if '//' in line:
|
|
line = line[:line.index('//')]
|
|
|
|
# Remove inline block comments (simple case)
|
|
line = re.sub(r'/\*.*?\*/', '', line)
|
|
|
|
return line.strip()
|
|
|
|
def _extract_code_from_mixed_line(self, line: str) -> str:
|
|
"""Extract code from line that has both code and block comments."""
|
|
result = line
|
|
|
|
# Remove block comment parts
|
|
if '/*' in result and '*/' in result:
|
|
# Inline block comment
|
|
start = result.index('/*')
|
|
end = result.index('*/') + 2
|
|
result = result[:start] + result[end:]
|
|
elif '/*' in result:
|
|
# Comment starts on this line
|
|
result = result[:result.index('/*')]
|
|
elif '*/' in result:
|
|
# Comment ends on this line
|
|
result = result[result.index('*/') + 2:]
|
|
|
|
return result.strip()
|
|
|
|
def _classify_code_line(self, code: str, result: Dict):
|
|
"""Classify code line as directive, data declaration, or executable."""
|
|
|
|
# Empty code doesn't count
|
|
if not code or code == ';':
|
|
return
|
|
|
|
# Check compiler directive (must be at start of code, ignoring whitespace)
|
|
if code.lstrip().startswith('#'):
|
|
result['compiler_directives'] += 1
|
|
return
|
|
|
|
# Check if it's a type/class/struct/enum definition or typedef
|
|
if re.match(r'^(?:typedef|struct|class|enum|union)\s+', code, re.IGNORECASE):
|
|
result['data_declarations'] += 1
|
|
return
|
|
|
|
# Check for variable declarations (more comprehensive patterns)
|
|
# Pattern for C/C++ declarations
|
|
is_declaration = False
|
|
|
|
# Check for common type keywords
|
|
type_keywords = [
|
|
'int', 'char', 'short', 'long', 'float', 'double', 'void', 'bool',
|
|
'unsigned', 'signed', 'const', 'static', 'extern', 'volatile',
|
|
'size_t', 'uint8_t', 'uint16_t', 'uint32_t', 'uint64_t',
|
|
'int8_t', 'int16_t', 'int32_t', 'int64_t'
|
|
]
|
|
|
|
# Check if line starts with a type keyword (after storage class specifiers)
|
|
code_lower = code.lower()
|
|
words = code.split()
|
|
if words:
|
|
# Skip storage class specifiers
|
|
first_word = words[0]
|
|
if first_word in ['static', 'extern', 'const', 'volatile']:
|
|
words = words[1:] if len(words) > 1 else []
|
|
|
|
if words and words[0].lower() in type_keywords:
|
|
# Likely a declaration if it has = or ;
|
|
if '=' in code or code.rstrip().endswith(';'):
|
|
is_declaration = True
|
|
|
|
# Check for function declarations (ending with );)
|
|
if re.search(r'\([^)]*\)\s*;', code):
|
|
is_declaration = True
|
|
|
|
# Check for pointer/array declarations
|
|
if re.search(r'\*\s*\w+\s*[;=\[]', code) or re.search(r'\w+\s*\[[^\]]*\]', code):
|
|
# But not if it's an assignment to existing var
|
|
if not re.match(r'^\s*\w+\s*\[', code): # Not array access
|
|
is_declaration = True
|
|
|
|
if is_declaration:
|
|
result['data_declarations'] += 1
|
|
return
|
|
|
|
# Default: executable instruction
|
|
# Must have actual content (not just braces or semicolons)
|
|
has_executable_content = False
|
|
|
|
# Check for control flow keywords
|
|
if any(kw in code for kw in ['if', 'else', 'while', 'for', 'do', 'switch', 'case', 'return', 'break', 'continue', 'goto']):
|
|
has_executable_content = True
|
|
|
|
# Check for function calls (word followed by parentheses)
|
|
if re.search(r'\w+\s*\(', code):
|
|
has_executable_content = True
|
|
|
|
# Check for assignments
|
|
if '=' in code and not '==' in code:
|
|
has_executable_content = True
|
|
|
|
# Check for braces (block delimiters count as executable)
|
|
if '{' in code or '}' in code:
|
|
has_executable_content = True
|
|
|
|
# Check for standalone semicolon (end of previous statement)
|
|
if code.strip() == ';':
|
|
has_executable_content = False
|
|
|
|
if has_executable_content:
|
|
result['exec_instructions'] += 1
|
|
|
|
def _count_logical_statements(self, code: str) -> int:
|
|
"""
|
|
Count logical statements in code line.
|
|
For C/C++: count semicolons, braces
|
|
"""
|
|
if self.language in ["c", "c++"]:
|
|
count = 0
|
|
count += code.count(';')
|
|
count += code.count('{')
|
|
count += code.count('}')
|
|
return count
|
|
|
|
# For other languages, 1 statement per non-empty line
|
|
return 1 if code.strip() else 0
|
|
|
|
def _empty_result(self) -> Dict[str, Any]:
|
|
"""Return empty result dict."""
|
|
return {
|
|
'total_lines': 0,
|
|
'blank_lines': 0,
|
|
'comment_whole': 0,
|
|
'comment_embedded': 0,
|
|
'compiler_directives': 0,
|
|
'data_declarations': 0,
|
|
'exec_instructions': 0,
|
|
'logical_sloc': 0,
|
|
'physical_sloc': 0,
|
|
}
|
|
|
|
|
|
def analyze_file_ucc_style(file_path: Path, language: str = None) -> Dict[str, Any]:
|
|
"""
|
|
Analyze file with UCC-style detailed metrics.
|
|
|
|
Args:
|
|
file_path: Path to source file
|
|
language: Language hint (auto-detected if None)
|
|
|
|
Returns:
|
|
Dict with detailed UCC-compatible metrics
|
|
"""
|
|
if language is None:
|
|
# Auto-detect from extension
|
|
ext = file_path.suffix.lower()
|
|
if ext in ['.c', '.h']:
|
|
language = 'C'
|
|
elif ext in ['.cpp', '.cc', '.cxx', '.hpp', '.hh']:
|
|
language = 'C++'
|
|
elif ext == '.py':
|
|
language = 'Python'
|
|
else:
|
|
language = 'generic'
|
|
|
|
counter = UCCExtendedCounter(language)
|
|
result = counter.analyze_file_extended(file_path)
|
|
result['language'] = language
|
|
result['file'] = str(file_path)
|
|
|
|
return result
|
|
|
|
|
|
def format_ucc_table_line(result: Dict[str, Any], file_label: str = None) -> str:
|
|
"""
|
|
Format result as UCC-style table line.
|
|
|
|
Args:
|
|
result: Result dict from analyze_file_ucc_style()
|
|
file_label: Optional custom file label (default: uses result['file'])
|
|
|
|
Returns:
|
|
Formatted string matching UCC output format
|
|
"""
|
|
if file_label is None:
|
|
file_label = Path(result.get('file', 'unknown')).name
|
|
|
|
return (
|
|
f" {result['total_lines']:4} {result['blank_lines']:3} |"
|
|
f" {result['comment_whole']:3} {result['comment_embedded']:3} |"
|
|
f" {result['compiler_directives']:3} {result['data_declarations']:3} {result['exec_instructions']:3} |"
|
|
f" {result['logical_sloc']:3} {result['physical_sloc']:3} |"
|
|
f" CODE {file_label}"
|
|
)
|
|
|
|
|
|
def format_ucc_table_header() -> str:
|
|
"""Return UCC-style table header."""
|
|
return """ Total Blank | Comments | Compiler Data Exec. | Logical Physical | File Module
|
|
Lines Lines | Whole Embedded | Direct. Decl. Instr. | SLOC SLOC | Type Name
|
|
-----------------+------------------+-------------------------+------------------+---------------------------"""
|