# SXXXXXXX_PyUCC/pyucc/core/ucc_extended_counting.py
# (file-viewer metadata, not code: 475 lines, 16 KiB, Python)
"""Extended counting module to match UCC detailed metrics.
This module provides counting capabilities that match UCC's detailed breakdown:
- Whole vs Embedded comments
- Compiler directives
- Data declarations
- Executable instructions
- Logical SLOC (statement-based counting)
"""
import logging
import re
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple
_LOG = logging.getLogger(__name__)
class UCCExtendedCounter:
    """Extended counter that provides UCC-compatible detailed metrics."""

    def __init__(self, language: str = "C"):
        """Create a counter for *language* (case-insensitive; default "C")."""
        # Normalize once so every later comparison can assume lowercase.
        self.language = language.lower()
        self._setup_patterns()
def _setup_patterns(self):
"""Setup regex patterns based on language."""
if self.language in ["c", "c++"]:
self._setup_c_cpp_patterns()
elif self.language == "python":
self._setup_python_patterns()
else:
self._setup_generic_patterns()
def _setup_c_cpp_patterns(self):
"""Setup patterns for C/C++."""
# Compiler directives
self.directive_pattern = re.compile(
r"^\s*#\s*(include|define|undef|ifdef|ifndef|endif|if|elif|else|"
r"pragma|error|warning|line)\b",
re.IGNORECASE,
)
# Data declarations (simplified - real implementation needs more sophisticated parsing)
self.data_decl_pattern = re.compile(
r"^\s*(?:extern\s+)?(?:static\s+)?(?:const\s+)?"
r"(?:unsigned\s+)?(?:signed\s+)?"
r"(?:int|char|short|long|float|double|void|bool|size_t|uint\w*|int\w*)\s+"
r"(?:\*\s*)*\w+\s*(?:\[[^\]]*\])?\s*[;=]",
re.MULTILINE,
)
# Comments
self.line_comment_pattern = re.compile(r"//.*$", re.MULTILINE)
self.block_comment_pattern = re.compile(r"/\*.*?\*/", re.DOTALL)
# Statement terminators for logical SLOC
self.statement_terminators = [";", "{", "}"]
def _setup_python_patterns(self):
"""Setup patterns for Python."""
self.directive_pattern = re.compile(r"^\s*(?:import|from)\s+", re.MULTILINE)
self.data_decl_pattern = re.compile(
r"^\s*\w+\s*:\s*(?:int|str|float|bool|list|dict|tuple|set)\s*[=]?",
re.MULTILINE,
)
self.line_comment_pattern = re.compile(r"#.*$", re.MULTILINE)
self.statement_terminators = ["\n"] # Python uses newlines
def _setup_generic_patterns(self):
"""Setup generic fallback patterns."""
self.directive_pattern = re.compile(r"^\s*#", re.MULTILINE)
self.data_decl_pattern = None
self.line_comment_pattern = re.compile(r"#.*$|//.*$", re.MULTILINE)
self.block_comment_pattern = re.compile(
r'/\*.*?\*/|""".*?"""|\'\'\'.*?\'\'\'', re.DOTALL
)
self.statement_terminators = [";"]
def analyze_file_extended(self, file_path: Path) -> Dict[str, Any]:
"""
Analyze file with UCC-compatible detailed metrics.
Returns dict with:
- total_lines: Total lines in file
- blank_lines: Completely empty lines
- comment_whole: Comments on their own line
- comment_embedded: Comments on same line as code
- compiler_directives: Preprocessor directives count
- data_declarations: Data/variable declaration count
- exec_instructions: Executable statement count
- logical_sloc: Logical source lines of code
- physical_sloc: Physical source lines of code (non-blank, non-comment-only)
"""
if not file_path.exists():
raise FileNotFoundError(f"File not found: {file_path}")
try:
with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
lines = f.readlines()
except Exception as e:
_LOG.error(f"Failed to read {file_path}: {e}")
return self._empty_result()
return self._analyze_lines(lines)
def _analyze_lines(self, lines: List[str]) -> Dict[str, Any]:
"""Analyze list of lines and return detailed metrics."""
result = {
"total_lines": len(lines),
"blank_lines": 0,
"comment_whole": 0,
"comment_embedded": 0,
"compiler_directives": 0,
"data_declarations": 0,
"exec_instructions": 0,
"logical_sloc": 0,
"physical_sloc": 0,
}
# First pass: identify block comment boundaries
in_block_comment = False
block_comment_lines = set()
for line_num, line in enumerate(lines):
stripped = line.strip()
# Track block comment state
if "/*" in line:
in_block_comment = True
block_comment_lines.add(line_num)
if in_block_comment:
block_comment_lines.add(line_num)
if "*/" in line:
in_block_comment = False
# Second pass: classify each line
logical_statements = 0
for line_num, line in enumerate(lines):
stripped = line.strip()
# Count blank lines
if not stripped:
result["blank_lines"] += 1
continue
# Check if this line is part of a block comment
is_in_block = line_num in block_comment_lines
# Analyze line type
if is_in_block:
# Check if there's also code on this line (embedded comment)
has_code_before = (
"/*" in line
and line.index("/*") > 0
and line[: line.index("/*")].strip()
)
has_code_after = (
"*/" in line
and line.index("*/") < len(line) - 2
and line[line.index("*/") + 2 :].strip()
)
if has_code_before or has_code_after:
result["comment_embedded"] += 1
result["physical_sloc"] += 1
# Extract and analyze code parts
code_part = self._extract_code_from_mixed_line(line)
if code_part:
self._classify_code_line(code_part, result)
logical_statements += self._count_logical_statements(code_part)
else:
# Pure comment line
result["comment_whole"] += 1
elif stripped.startswith("//"):
# Line comment at start
result["comment_whole"] += 1
elif "//" in stripped:
# Line has embedded comment
code_part = stripped[: stripped.index("//")].strip()
if code_part:
result["comment_embedded"] += 1
result["physical_sloc"] += 1
self._classify_code_line(code_part, result)
logical_statements += self._count_logical_statements(code_part)
else:
# Comment at start after whitespace
result["comment_whole"] += 1
else:
# Pure code line - no comments
result["physical_sloc"] += 1
self._classify_code_line(stripped, result)
logical_statements += self._count_logical_statements(stripped)
result["logical_sloc"] = logical_statements
return result
def _find_comment_regions(self, content: str) -> List[Tuple[int, int]]:
"""Find all block comment regions (start_line, end_line)."""
regions = []
if not hasattr(self, "block_comment_pattern"):
return regions
for match in self.block_comment_pattern.finditer(content):
start_pos = match.start()
end_pos = match.end()
# Convert byte positions to line numbers
start_line = content[:start_pos].count("\n")
end_line = content[:end_pos].count("\n")
regions.append((start_line, end_line))
return regions
def _is_line_in_comment_region(
self, line_num: int, regions: List[Tuple[int, int]]
) -> bool:
"""Check if line is within a block comment region."""
for start, end in regions:
if start <= line_num <= end:
return True
return False
def _extract_code_part(self, line: str) -> str:
"""Extract code part from line with embedded comment."""
# Remove line comments
if "//" in line:
line = line[: line.index("//")]
# Remove inline block comments (simple case)
line = re.sub(r"/\*.*?\*/", "", line)
return line.strip()
def _extract_code_from_mixed_line(self, line: str) -> str:
"""Extract code from line that has both code and block comments."""
result = line
# Remove block comment parts
if "/*" in result and "*/" in result:
# Inline block comment
start = result.index("/*")
end = result.index("*/") + 2
result = result[:start] + result[end:]
elif "/*" in result:
# Comment starts on this line
result = result[: result.index("/*")]
elif "*/" in result:
# Comment ends on this line
result = result[result.index("*/") + 2 :]
return result.strip()
def _classify_code_line(self, code: str, result: Dict):
"""Classify code line as directive, data declaration, or executable."""
# Empty code doesn't count
if not code or code == ";":
return
# Check compiler directive (must be at start of code, ignoring whitespace)
if code.lstrip().startswith("#"):
result["compiler_directives"] += 1
return
# Check if it's a type/class/struct/enum definition or typedef
if re.match(r"^(?:typedef|struct|class|enum|union)\s+", code, re.IGNORECASE):
result["data_declarations"] += 1
return
# Check for variable declarations (more comprehensive patterns)
# Pattern for C/C++ declarations
is_declaration = False
# Check for common type keywords
type_keywords = [
"int",
"char",
"short",
"long",
"float",
"double",
"void",
"bool",
"unsigned",
"signed",
"const",
"static",
"extern",
"volatile",
"size_t",
"uint8_t",
"uint16_t",
"uint32_t",
"uint64_t",
"int8_t",
"int16_t",
"int32_t",
"int64_t",
]
# Check if line starts with a type keyword (after storage class specifiers)
code_lower = code.lower()
words = code.split()
if words:
# Skip storage class specifiers
first_word = words[0]
if first_word in ["static", "extern", "const", "volatile"]:
words = words[1:] if len(words) > 1 else []
if words and words[0].lower() in type_keywords:
# Likely a declaration if it has = or ;
if "=" in code or code.rstrip().endswith(";"):
is_declaration = True
# Check for function declarations (ending with );)
if re.search(r"\([^)]*\)\s*;", code):
is_declaration = True
# Check for pointer/array declarations
if re.search(r"\*\s*\w+\s*[;=\[]", code) or re.search(
r"\w+\s*\[[^\]]*\]", code
):
# But not if it's an assignment to existing var
if not re.match(r"^\s*\w+\s*\[", code): # Not array access
is_declaration = True
if is_declaration:
result["data_declarations"] += 1
return
# Default: executable instruction
# Must have actual content (not just braces or semicolons)
has_executable_content = False
# Check for control flow keywords
if any(
kw in code
for kw in [
"if",
"else",
"while",
"for",
"do",
"switch",
"case",
"return",
"break",
"continue",
"goto",
]
):
has_executable_content = True
# Check for function calls (word followed by parentheses)
if re.search(r"\w+\s*\(", code):
has_executable_content = True
# Check for assignments
if "=" in code and not "==" in code:
has_executable_content = True
# Check for braces (block delimiters count as executable)
if "{" in code or "}" in code:
has_executable_content = True
# Check for standalone semicolon (end of previous statement)
if code.strip() == ";":
has_executable_content = False
if has_executable_content:
result["exec_instructions"] += 1
def _count_logical_statements(self, code: str) -> int:
"""
Count logical statements in code line.
For C/C++: count semicolons, braces
"""
if self.language in ["c", "c++"]:
count = 0
count += code.count(";")
count += code.count("{")
count += code.count("}")
return count
# For other languages, 1 statement per non-empty line
return 1 if code.strip() else 0
def _empty_result(self) -> Dict[str, Any]:
"""Return empty result dict."""
return {
"total_lines": 0,
"blank_lines": 0,
"comment_whole": 0,
"comment_embedded": 0,
"compiler_directives": 0,
"data_declarations": 0,
"exec_instructions": 0,
"logical_sloc": 0,
"physical_sloc": 0,
}
def analyze_file_ucc_style(file_path: Path, language: Optional[str] = None) -> Dict[str, Any]:
    """
    Analyze a file with UCC-style detailed metrics.

    Args:
        file_path: Path to the source file.
        language: Language name; auto-detected from the file extension when
            None.  (Annotation fixed: the old ``language: str = None`` was
            wrong for a None default.)

    Returns:
        Dict of UCC-compatible metrics plus ``language`` and ``file`` keys.

    Raises:
        FileNotFoundError: if *file_path* does not exist.
    """
    if language is None:
        # Map extension -> language; anything unrecognized counts generically.
        suffix_map = {
            ".c": "C",
            ".h": "C",
            ".cpp": "C++",
            ".cc": "C++",
            ".cxx": "C++",
            ".hpp": "C++",
            ".hh": "C++",
            ".py": "Python",
        }
        language = suffix_map.get(file_path.suffix.lower(), "generic")
    counter = UCCExtendedCounter(language)
    result = counter.analyze_file_extended(file_path)
    result["language"] = language
    result["file"] = str(file_path)
    return result
def format_ucc_table_line(result: Dict[str, Any], file_label: Optional[str] = None) -> str:
    """
    Format a metrics dict as one UCC-style table row.

    Args:
        result: Result dict from ``analyze_file_ucc_style()``.
        file_label: Custom file label; defaults to the basename of
            ``result['file']``.  (Annotation fixed: the old
            ``file_label: str = None`` was wrong for a None default.)

    Returns:
        A formatted row matching UCC's console output.
    """
    if file_label is None:
        # Fall back to the file's basename; 'unknown' if no path was recorded.
        file_label = Path(result.get("file", "unknown")).name
    return (
        f" {result['total_lines']:4} {result['blank_lines']:3} |"
        f" {result['comment_whole']:3} {result['comment_embedded']:3} |"
        f" {result['compiler_directives']:3} {result['data_declarations']:3} {result['exec_instructions']:3} |"
        f" {result['logical_sloc']:3} {result['physical_sloc']:3} |"
        f" CODE {file_label}"
    )
def format_ucc_table_header() -> str:
    """Return the three-line UCC-style table header."""
    header_rows = (
        " Total Blank | Comments | Compiler Data Exec. | Logical Physical | File Module",
        "Lines Lines | Whole Embedded | Direct. Decl. Instr. | SLOC SLOC | Type Name",
        "-----------------+------------------+-------------------------+------------------+---------------------------",
    )
    return "\n".join(header_rows)