"""Extended counting module to match UCC detailed metrics.

This module provides counting capabilities that match UCC's detailed breakdown:
- Whole vs Embedded comments
- Compiler directives
- Data declarations
- Executable instructions
- Logical SLOC (statement-based counting)
"""

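# Quick usage sketch (illustrative file name; the helpers are defined below):
#
#     result = analyze_file_ucc_style(Path("foo.c"))
#     print(format_ucc_table_header())
#     print(format_ucc_table_line(result))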
import re
from pathlib import Path
from typing import Dict, Any, Tuple, List, Optional
import logging

_LOG = logging.getLogger(__name__)


class UCCExtendedCounter:
    """Extended counter that provides UCC-compatible detailed metrics."""

    def __init__(self, language: str = "C"):
        self.language = language.lower()
        self._setup_patterns()

    def _setup_patterns(self):
        """Set up regex patterns based on language."""
        if self.language in ["c", "c++"]:
            self._setup_c_cpp_patterns()
        elif self.language == "python":
            self._setup_python_patterns()
        else:
            self._setup_generic_patterns()

    def _setup_c_cpp_patterns(self):
        """Set up patterns for C/C++."""
        # Compiler directives
        self.directive_pattern = re.compile(
            r"^\s*#\s*(include|define|undef|ifdef|ifndef|endif|if|elif|else|"
            r"pragma|error|warning|line)\b",
            re.IGNORECASE,
        )

        # Data declarations (simplified - real implementation needs more sophisticated parsing)
        self.data_decl_pattern = re.compile(
            r"^\s*(?:extern\s+)?(?:static\s+)?(?:const\s+)?"
            r"(?:unsigned\s+)?(?:signed\s+)?"
            r"(?:int|char|short|long|float|double|void|bool|size_t|uint\w*|int\w*)\s+"
            r"(?:\*\s*)*\w+\s*(?:\[[^\]]*\])?\s*[;=]",
            re.MULTILINE,
        )

        # Comments
        self.line_comment_pattern = re.compile(r"//.*$", re.MULTILINE)
        self.block_comment_pattern = re.compile(r"/\*.*?\*/", re.DOTALL)

        # Statement terminators for logical SLOC
        self.statement_terminators = [";", "{", "}"]

    def _setup_python_patterns(self):
        """Set up patterns for Python."""
        self.directive_pattern = re.compile(r"^\s*(?:import|from)\s+", re.MULTILINE)
        self.data_decl_pattern = re.compile(
            r"^\s*\w+\s*:\s*(?:int|str|float|bool|list|dict|tuple|set)\s*[=]?",
            re.MULTILINE,
        )
        self.line_comment_pattern = re.compile(r"#.*$", re.MULTILINE)
        self.statement_terminators = ["\n"]  # Python uses newlines

    def _setup_generic_patterns(self):
        """Set up generic fallback patterns."""
        self.directive_pattern = re.compile(r"^\s*#", re.MULTILINE)
        self.data_decl_pattern = None
        self.line_comment_pattern = re.compile(r"#.*$|//.*$", re.MULTILINE)
        self.block_comment_pattern = re.compile(
            r'/\*.*?\*/|""".*?"""|\'\'\'.*?\'\'\'', re.DOTALL
        )
        self.statement_terminators = [";"]

    def analyze_file_extended(self, file_path: Path) -> Dict[str, Any]:
        """
        Analyze file with UCC-compatible detailed metrics.

        Returns dict with:
        - total_lines: Total lines in file
        - blank_lines: Completely empty lines
        - comment_whole: Comments on their own line
        - comment_embedded: Comments on same line as code
        - compiler_directives: Preprocessor directives count
        - data_declarations: Data/variable declaration count
        - exec_instructions: Executable statement count
        - logical_sloc: Logical source lines of code
        - physical_sloc: Physical source lines of code (non-blank, non-comment-only)
        """

        if not file_path.exists():
            raise FileNotFoundError(f"File not found: {file_path}")

        try:
            with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
                lines = f.readlines()
        except Exception as e:
            _LOG.error(f"Failed to read {file_path}: {e}")
            return self._empty_result()

        return self._analyze_lines(lines)

    def _analyze_lines(self, lines: List[str]) -> Dict[str, Any]:
        """Analyze a list of lines and return detailed metrics.

        Note: comment detection below assumes C-style "//" and "/* */" markers.
        """

        result = {
            "total_lines": len(lines),
            "blank_lines": 0,
            "comment_whole": 0,
            "comment_embedded": 0,
            "compiler_directives": 0,
            "data_declarations": 0,
            "exec_instructions": 0,
            "logical_sloc": 0,
            "physical_sloc": 0,
        }

        # First pass: identify block comment boundaries
        in_block_comment = False
        block_comment_lines = set()

        for line_num, line in enumerate(lines):
            # Track block comment state (heuristic: "/*" or "*/" inside string
            # literals or after "//" is not recognized)
            if "/*" in line:
                in_block_comment = True
                block_comment_lines.add(line_num)

            if in_block_comment:
                block_comment_lines.add(line_num)

            if "*/" in line:
                in_block_comment = False

        # Second pass: classify each line
        logical_statements = 0

        for line_num, line in enumerate(lines):
            stripped = line.strip()

            # Count blank lines
            if not stripped:
                result["blank_lines"] += 1
                continue

            # Check if this line is part of a block comment
            is_in_block = line_num in block_comment_lines

            # Analyze line type
            if is_in_block:
                # Check if there's also code on this line (embedded comment)
                has_code_before = (
                    "/*" in line
                    and line.index("/*") > 0
                    and line[: line.index("/*")].strip()
                )
                has_code_after = (
                    "*/" in line
                    and line.index("*/") < len(line) - 2
                    and line[line.index("*/") + 2 :].strip()
                )

                if has_code_before or has_code_after:
                    result["comment_embedded"] += 1
                    result["physical_sloc"] += 1

                    # Extract and analyze code parts
                    code_part = self._extract_code_from_mixed_line(line)
                    if code_part:
                        self._classify_code_line(code_part, result)
                        logical_statements += self._count_logical_statements(code_part)
                else:
                    # Pure comment line
                    result["comment_whole"] += 1

            elif stripped.startswith("//"):
                # Line comment at start
                result["comment_whole"] += 1

            elif "//" in stripped:
                # Line has embedded comment
                code_part = stripped[: stripped.index("//")].strip()
                if code_part:
                    result["comment_embedded"] += 1
                    result["physical_sloc"] += 1
                    self._classify_code_line(code_part, result)
                    logical_statements += self._count_logical_statements(code_part)
                else:
                    # Defensive fallback: treat as a whole-line comment
                    result["comment_whole"] += 1

            else:
                # Pure code line - no comments
                result["physical_sloc"] += 1
                self._classify_code_line(stripped, result)
                logical_statements += self._count_logical_statements(stripped)

        result["logical_sloc"] = logical_statements

        return result

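    # Worked example for _analyze_lines() above, on illustrative C input:
    #
    #   int x = 0;            -> data declaration (physical + logical SLOC)
    #   /* whole comment */   -> whole-line comment
    #   y = x + 1; // note    -> executable instruction + embedded comment
    #   (a blank line)        -> blank line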
    def _find_comment_regions(self, content: str) -> List[Tuple[int, int]]:
        """Find all block comment regions (start_line, end_line)."""
        regions = []

        if not hasattr(self, "block_comment_pattern"):
            return regions

        for match in self.block_comment_pattern.finditer(content):
            start_pos = match.start()
            end_pos = match.end()

            # Convert character offsets to line numbers
            start_line = content[:start_pos].count("\n")
            end_line = content[:end_pos].count("\n")

            regions.append((start_line, end_line))

        return regions

    def _is_line_in_comment_region(
        self, line_num: int, regions: List[Tuple[int, int]]
    ) -> bool:
        """Check if line is within a block comment region."""
        for start, end in regions:
            if start <= line_num <= end:
                return True
        return False

    def _extract_code_part(self, line: str) -> str:
        """Extract code part from line with embedded comment."""
        # Remove line comments
        if "//" in line:
            line = line[: line.index("//")]

        # Remove inline block comments (simple case)
        line = re.sub(r"/\*.*?\*/", "", line)

        return line.strip()

    def _extract_code_from_mixed_line(self, line: str) -> str:
        """Extract code from line that has both code and block comments."""
        result = line

        # Remove block comment parts
        if "/*" in result and "*/" in result:
            # Inline block comment
            start = result.index("/*")
            end = result.index("*/") + 2
            result = result[:start] + result[end:]
        elif "/*" in result:
            # Comment starts on this line
            result = result[: result.index("/*")]
        elif "*/" in result:
            # Comment ends on this line
            result = result[result.index("*/") + 2 :]

        return result.strip()

    def _classify_code_line(self, code: str, result: Dict):
        """Classify code line as directive, data declaration, or executable."""

        # Empty code doesn't count
        if not code or code == ";":
            return

        # Check compiler directive (must be at start of code, ignoring whitespace)
        if code.lstrip().startswith("#"):
            result["compiler_directives"] += 1
            return

        # Check if it's a type/class/struct/enum definition or typedef
        if re.match(r"^(?:typedef|struct|class|enum|union)\s+", code, re.IGNORECASE):
            result["data_declarations"] += 1
            return

        # Check for variable declarations (more comprehensive patterns)
        # Pattern for C/C++ declarations
        is_declaration = False

        # Check for common type keywords
        type_keywords = [
            "int",
            "char",
            "short",
            "long",
            "float",
            "double",
            "void",
            "bool",
            "unsigned",
            "signed",
            "const",
            "static",
            "extern",
            "volatile",
            "size_t",
            "uint8_t",
            "uint16_t",
            "uint32_t",
            "uint64_t",
            "int8_t",
            "int16_t",
            "int32_t",
            "int64_t",
        ]

        # Check if line starts with a type keyword (after storage class specifiers)
        words = code.split()
        if words:
            # Skip storage class specifiers
            first_word = words[0]
            if first_word in ["static", "extern", "const", "volatile"]:
                words = words[1:] if len(words) > 1 else []

            if words and words[0].lower() in type_keywords:
                # Likely a declaration if it has = or ;
                if "=" in code or code.rstrip().endswith(";"):
                    is_declaration = True

        # Check for function prototypes: a return type followed by "name(args);".
        # A bare call such as "foo(a);" has no type word before the name, so it
        # falls through to the executable-instruction checks below.
        if re.search(
            r"^\s*\w[\w\s\*]*\s+\**\w+\s*\([^)]*\)\s*;", code
        ) and not re.match(r"^\s*(?:return|goto|case|else)\b", code):
            is_declaration = True

        # Check for pointer/array declarations
        if re.search(r"\*\s*\w+\s*[;=\[]", code) or re.search(
            r"\w+\s*\[[^\]]*\]", code
        ):
            # But not if it's an assignment to existing var
            if not re.match(r"^\s*\w+\s*\[", code):  # Not array access
                is_declaration = True

        if is_declaration:
            result["data_declarations"] += 1
            return

        # Default: executable instruction
        # Must have actual content (not just braces or semicolons)
        has_executable_content = False

        # Check for control flow keywords (matched as whole words, so identifiers
        # such as "diff" or "forward" do not trigger this)
        if re.search(
            r"\b(?:if|else|while|for|do|switch|case|return|break|continue|goto)\b",
            code,
        ):
            has_executable_content = True

        # Check for function calls (word followed by parentheses)
        if re.search(r"\w+\s*\(", code):
            has_executable_content = True

        # Check for assignments ("=" that is not part of ==, !=, <=, or >=)
        if re.search(r"(?<![=!<>])=(?!=)", code):
            has_executable_content = True

        # Check for braces (block delimiters count as executable)
        if "{" in code or "}" in code:
            has_executable_content = True

        # Check for standalone semicolon (end of previous statement)
        if code.strip() == ";":
            has_executable_content = False

        if has_executable_content:
            result["exec_instructions"] += 1

    def _count_logical_statements(self, code: str) -> int:
        """
        Count logical statements in a code line.
        For C/C++, semicolons and braces are counted as statement delimiters.
        """
        if self.language in ["c", "c++"]:
            count = 0
            count += code.count(";")
            count += code.count("{")
            count += code.count("}")
            return count

        # For other languages, 1 statement per non-empty line
        return 1 if code.strip() else 0

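    # Example (illustrative): for the C line 'for (i = 0; i < n; i++) {',
    # _count_logical_statements() above returns 3 (two semicolons plus one
    # opening brace).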
    def _empty_result(self) -> Dict[str, Any]:
        """Return empty result dict."""
        return {
            "total_lines": 0,
            "blank_lines": 0,
            "comment_whole": 0,
            "comment_embedded": 0,
            "compiler_directives": 0,
            "data_declarations": 0,
            "exec_instructions": 0,
            "logical_sloc": 0,
            "physical_sloc": 0,
        }


def analyze_file_ucc_style(
    file_path: Path, language: Optional[str] = None
) -> Dict[str, Any]:
    """
    Analyze file with UCC-style detailed metrics.

    Args:
        file_path: Path to source file
        language: Language hint (auto-detected if None)

    Returns:
        Dict with detailed UCC-compatible metrics
    """
    if language is None:
        # Auto-detect from extension
        ext = file_path.suffix.lower()
        if ext in [".c", ".h"]:
            language = "C"
        elif ext in [".cpp", ".cc", ".cxx", ".hpp", ".hh"]:
            language = "C++"
        elif ext == ".py":
            language = "Python"
        else:
            language = "generic"

    counter = UCCExtendedCounter(language)
    result = counter.analyze_file_extended(file_path)
    result["language"] = language
    result["file"] = str(file_path)

    return result


def format_ucc_table_line(
    result: Dict[str, Any], file_label: Optional[str] = None
) -> str:
    """
    Format result as UCC-style table line.

    Args:
        result: Result dict from analyze_file_ucc_style()
        file_label: Optional custom file label (default: uses result['file'])

    Returns:
        Formatted string matching UCC output format
    """
    if file_label is None:
        file_label = Path(result.get("file", "unknown")).name

    return (
        f" {result['total_lines']:4} {result['blank_lines']:3} |"
        f" {result['comment_whole']:3} {result['comment_embedded']:3} |"
        f" {result['compiler_directives']:3} {result['data_declarations']:3} {result['exec_instructions']:3} |"
        f" {result['logical_sloc']:3} {result['physical_sloc']:3} |"
        f" CODE {file_label}"
    )


def format_ucc_table_header() -> str:
    """Return UCC-style table header."""
    return """ Total Blank | Comments | Compiler Data Exec. | Logical Physical | File Module
 Lines Lines | Whole Embedded | Direct. Decl. Instr. | SLOC SLOC | Type Name
-----------------+------------------+-------------------------+------------------+---------------------------"""
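

# Minimal self-contained demo (illustrative only): writes a small temporary C
# file, runs the UCC-style analysis on it, and prints the result using the
# table helpers above.  The sample C snippet and the use of tempfile are
# arbitrary choices for the demo, not part of the counting API.
if __name__ == "__main__":
    import tempfile

    sample_c = (
        "#include <stdio.h>\n"
        "\n"
        "/* Print a greeting. */\n"
        "int main(void) {\n"
        "    int count = 1;  // loop counter\n"
        "    printf(\"hello %d\\n\", count);\n"
        "    return 0;\n"
        "}\n"
    )

    with tempfile.TemporaryDirectory() as tmp_dir:
        sample_path = Path(tmp_dir) / "sample.c"
        sample_path.write_text(sample_c, encoding="utf-8")

        metrics = analyze_file_ucc_style(sample_path)
        print(format_ucc_table_header())
        print(format_ucc_table_line(metrics))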