637 lines
21 KiB
Python
637 lines
21 KiB
Python
"""Complete UCC-compatible counter with full preprocessing pipeline.
|
|
|
|
This module implements the complete UCC counting flow, in the order
in which ``analyze_file`` runs the steps:

1. PreCountProcess - Remove the contents of string and character literals
2. CountBlankSLOC - Identify blank lines (before comments are removed)
3. CountCommentsSLOC - Remove all comments (block and line)
4. CountDirectiveSLOC - Extract and count directives
5. LanguageSpecificProcess - LSLOC state machine with keyword classification
|
|
|
|
Target: 90-95% accuracy matching UCC v.2018.07 for C/C++
|
|
"""
|
|
|
|
import re
|
|
from pathlib import Path
|
|
from typing import Dict, Any, List, Tuple
|
|
import logging
|
|
|
|
_LOG = logging.getLogger(__name__)


class UCCCompleteCounter:
    """UCC-style physical/logical source line counter for C and C++.

    ``analyze_file`` runs the following passes over the lines of a file:

    1. ``_precount_process`` empties string/char literals found in code
       (comment text is left untouched).
    2. ``_count_blank_sloc`` counts whitespace-only lines.
    3. ``_count_comments_sloc`` removes comments and counts whole-line and
       embedded comment lines.
    4. ``_count_directive_sloc`` blanks and counts preprocessor directives.
    5. ``_language_specific_process`` counts logical statements and splits
       them into data declarations and executable instructions.
    """

    def __init__(self, language: str = "C"):
        """Configure the counter for *language* (case-insensitive).

        Raises:
            NotImplementedError: if the language is not one of the C/C++
                aliases accepted by ``_setup_language``.
        """
        self.language = language.upper()
        self._setup_language()

    def _setup_language(self):
        """Install the patterns and keyword lists for ``self.language``."""
        if self.language in ("C", "C++", "C_CPP"):
            self._setup_c_cpp()
        else:
            raise NotImplementedError(f"Language {self.language} not yet supported")

    def _setup_c_cpp(self):
        """Define C/C++ comment tokens, quote characters and keyword lists."""
        # Comment patterns
        self.line_comment_start = "//"
        self.block_comment_start = "/*"
        self.block_comment_end = "*/"

        # String quote patterns
        self.string_quote = '"'
        self.char_quote = "'"
        self.escape_char = "\\"

        # Continuation line
        self.continuation = "\\"

        # Compiler directives (from UCC CCJavaCsScalaCounter.cpp)
        self.directive_keywords = [
            "define",
            "undef",
            "if",
            "ifdef",
            "ifndef",
            "else",
            "elif",
            "endif",
            "include",
            "pragma",
            "error",
            "warning",
            "line",
            "region",
            "endregion",
        ]

        # Data declaration keywords (from UCC exec_name_list)
        self.data_keywords = [
            "auto",
            "bool",
            "char",
            "class",
            "const",
            "double",
            "enum",
            "extern",
            "float",
            "int",
            "long",
            "private",
            "protected",
            "public",
            "register",
            "short",
            "signed",
            "static",
            "struct",
            "typedef",
            "union",
            "unsigned",
            "virtual",
            "void",
            "volatile",
            # C++ specific
            "namespace",
            "template",
            "typename",
            "explicit",
            # Common types
            "size_t",
            "uint8_t",
            "uint16_t",
            "uint32_t",
            "uint64_t",
            "int8_t",
            "int16_t",
            "int32_t",
            "int64_t",
            "wchar_t",
            "ptrdiff_t",
        ]

        # Executable instruction keywords (from UCC exec_name_list)
        self.exec_keywords = [
            "break",
            "case",
            "catch",
            "continue",
            "default",
            "delete",
            "do",
            "else",
            "for",
            "goto",
            "if",
            "new",
            "return",
            "switch",
            "throw",
            "try",
            "while",
            # Additional
            "sizeof",
            "typeid",
            "const_cast",
            "dynamic_cast",
            "reinterpret_cast",
            "static_cast",
        ]

        # For/if/while control structures
        self.control_keywords = ["for", "if", "while"]

    def analyze_file(self, file_path: Path) -> Dict[str, Any]:
        """Run the full counting pipeline on one source file.

        Args:
            file_path: Path (or path string) of the file to analyze. The
                file is read as UTF-8 with undecodable bytes ignored.

        Returns:
            Dict with the keys ``total_lines``, ``blank_lines``,
            ``comment_whole``, ``comment_embedded``, ``compiler_directives``,
            ``data_declarations``, ``exec_instructions``, ``logical_sloc``
            and ``physical_sloc``.

        Raises:
            FileNotFoundError: if *file_path* does not exist.
            OSError: if the file cannot be read (logged, then re-raised).
        """
        file_path = Path(file_path)
        if not file_path.exists():
            raise FileNotFoundError(f"File not found: {file_path}")

        try:
            with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
                lines = f.readlines()
        except OSError as e:
            _LOG.error("Error reading %s: %s", file_path, e)
            raise

        # None of the passes mutate their input, so the raw lines can be
        # handed on as the "original" view without copying.
        original_lines = lines
        total_lines = len(lines)

        # STEP 1: PreCountProcess - empty string/char literals in code
        processed_lines = self._precount_process(lines)

        # STEP 2: CountBlankSLOC - identify blank lines (BEFORE removing comments!)
        blank_lines = self._count_blank_sloc(processed_lines)

        # STEP 3: CountCommentsSLOC - remove all comments
        no_comment_lines, comment_whole, comment_embedded = self._count_comments_sloc(
            processed_lines, original_lines
        )

        # STEP 4: CountDirectiveSLOC - extract directives
        no_directive_lines, directive_count = self._count_directive_sloc(
            no_comment_lines, original_lines
        )

        # STEP 5: LanguageSpecificProcess - LSLOC with state machine
        lsloc_result = self._language_specific_process(
            no_directive_lines, original_lines
        )

        # Physical SLOC = lines that are neither blank nor comment-only.
        # Blank lines inside block comments are counted as blank only, so
        # the two subtractions never remove the same line twice.
        physical_sloc = total_lines - blank_lines - comment_whole

        return {
            "total_lines": total_lines,
            "blank_lines": blank_lines,
            "comment_whole": comment_whole,
            "comment_embedded": comment_embedded,
            "compiler_directives": directive_count,
            "data_declarations": lsloc_result["data_decl"],
            "exec_instructions": lsloc_result["exec_inst"],
            "logical_sloc": lsloc_result["logical_sloc"],
            "physical_sloc": physical_sloc,
        }

    def _precount_process(self, lines: List[str]) -> List[str]:
        """Empty string and char literals so later passes ignore their text.

        Literals are reduced to their bare quotes (``"abc"`` -> ``""``) so
        keywords, terminators and comment tokens inside them are not seen.
        Text inside ``//`` and ``/* ... */`` comments is copied verbatim:
        quote characters there (e.g. the apostrophe in "don't") are not
        literal delimiters and must not swallow a closing ``*/``.

        Returns:
            A new list with one processed line per input line.
        """
        processed = []
        in_block_comment = False
        for line in lines:
            cleaned, in_block_comment = self._strip_literals_outside_comments(
                line, in_block_comment
            )
            processed.append(cleaned)
        return processed

    def _skip_literal(self, line: str, start: int, quote: str) -> Tuple[int, bool]:
        """Find the end of the literal whose opening quote is at ``line[start]``.

        Returns ``(next_index, closed)`` where ``next_index`` is the position
        just past the closing quote, or ``len(line)`` when the literal is not
        closed on this line (``closed`` is then False).
        """
        i = start + 1
        n = len(line)
        while i < n:
            current = line[i]
            if current == self.escape_char and i + 1 < n:
                # Skip the escape and the character it protects.
                i += 2
            elif current == quote:
                return i + 1, True
            else:
                i += 1
        return i, False

    def _strip_literals_outside_comments(
        self, line: str, in_block_comment: bool
    ) -> Tuple[str, bool]:
        """Empty literals in the code part of *line*, keeping comments intact.

        Args:
            line: One source line.
            in_block_comment: True if a ``/*`` from an earlier line is open.

        Returns:
            ``(processed_line, in_block_comment_after_this_line)``.
        """
        out = []
        i = 0
        n = len(line)
        while i < n:
            if in_block_comment:
                end = line.find(self.block_comment_end, i)
                if end == -1:
                    out.append(line[i:])
                    break
                stop = end + len(self.block_comment_end)
                out.append(line[i:stop])
                i = stop
                in_block_comment = False
            elif line.startswith(self.line_comment_start, i):
                # The rest of the line is comment text.
                out.append(line[i:])
                break
            elif line.startswith(self.block_comment_start, i):
                out.append(self.block_comment_start)
                i += len(self.block_comment_start)
                in_block_comment = True
            elif line[i] in (self.string_quote, self.char_quote):
                quote = line[i]
                i, closed = self._skip_literal(line, i, quote)
                out.append(quote * 2 if closed else quote)
            else:
                out.append(line[i])
                i += 1
        return "".join(out), in_block_comment

    def _remove_string_literals(self, line: str) -> str:
        """Remove the content of string and char literals, keeping the quotes.

        This helper is not comment-aware: every quote character starts a
        literal. An unterminated literal drops everything up to the end of
        the line and keeps only its opening quote.
        """
        out = []
        i = 0
        while i < len(line):
            char = line[i]
            if char in (self.string_quote, self.char_quote):
                i, closed = self._skip_literal(line, i, char)
                out.append(char * 2 if closed else char)
            else:
                out.append(char)
                i += 1
        return "".join(out)

    def _strip_comments_in_line(
        self, line: str, in_block_comment: bool
    ) -> Tuple[str, bool, bool]:
        """Remove every comment from *line*.

        Args:
            line: One (literal-stripped) source line.
            in_block_comment: True if a ``/*`` from an earlier line is open.

        Returns:
            ``(code, has_comment, in_block_comment_after_this_line)``. Each
            block comment closed on the line is replaced by a single space so
            the tokens around it are not glued together.
        """
        parts = []
        pos = 0
        n = len(line)
        has_comment = in_block_comment
        while pos < n:
            if in_block_comment:
                end = line.find(self.block_comment_end, pos)
                if end == -1:
                    break
                pos = end + len(self.block_comment_end)
                parts.append(" ")
                in_block_comment = False
                continue

            line_pos = line.find(self.line_comment_start, pos)
            block_pos = line.find(self.block_comment_start, pos)
            if line_pos == -1 and block_pos == -1:
                parts.append(line[pos:])
                break
            if block_pos == -1 or (line_pos != -1 and line_pos < block_pos):
                # A line comment comes first: it ends the code on this line.
                parts.append(line[pos:line_pos])
                has_comment = True
                break
            parts.append(line[pos:block_pos])
            has_comment = True
            in_block_comment = True
            # Resume after the opener so "/*/" is not read as "/*" + "*/".
            pos = block_pos + len(self.block_comment_start)
        return "".join(parts), has_comment, in_block_comment

    def _count_comments_sloc(
        self, lines: List[str], original_lines: List[str]
    ) -> Tuple[List[str], int, int]:
        """Remove all comments and count whole-line and embedded comment lines.

        A line containing comment text and nothing else counts as
        ``comment_whole``; a line containing both code and comment text counts
        as ``comment_embedded``. Whitespace-only lines are never counted here,
        even inside a block comment, because ``_count_blank_sloc`` already
        counts them and each line must land in one category only.

        Args:
            lines: Output of ``_precount_process``.
            original_lines: Unused; kept for interface compatibility.

        Returns:
            ``(lines_without_comments, comment_whole, comment_embedded)``.
            Comment-only lines become ``""`` in the returned list.
        """
        no_comment_lines = []
        comment_whole = 0
        comment_embedded = 0
        in_block_comment = False

        for line in lines:
            if not line.strip():
                no_comment_lines.append("" if in_block_comment else line)
                continue

            code, has_comment, in_block_comment = self._strip_comments_in_line(
                line, in_block_comment
            )
            if has_comment:
                if code.strip():
                    comment_embedded += 1
                else:
                    comment_whole += 1
                    code = ""
            no_comment_lines.append(code)

        return no_comment_lines, comment_whole, comment_embedded

    def _count_blank_sloc(self, lines: List[str]) -> int:
        """Return the number of lines that contain only whitespace."""
        return sum(1 for line in lines if not line.strip())

    def _count_directive_sloc(
        self, lines: List[str], original_lines: List[str]
    ) -> Tuple[List[str], int]:
        """Blank out preprocessor directives and count them.

        A directive is a line whose first non-blank text is ``#`` followed by
        one of ``self.directive_keywords``. A directive ending in a backslash
        continues on the next line; the whole group counts as one directive.

        Args:
            lines: Comment-free lines.
            original_lines: Unused; kept for interface compatibility.

        Returns:
            ``(lines_with_directives_blanked, directive_count)``.
        """
        directive_re = None
        if self.directive_keywords:
            alternatives = "|".join(re.escape(kw) for kw in self.directive_keywords)
            directive_re = re.compile(r"#\s*(?:" + alternatives + r")\b")

        no_directive_lines = []
        directive_count = 0
        in_directive = False

        for line in lines:
            stripped = line.lstrip()
            starts_directive = bool(directive_re and directive_re.match(stripped))

            if in_directive or starts_directive:
                no_directive_lines.append("")
                if stripped.rstrip().endswith(self.continuation):
                    # Directive continues on the next line.
                    in_directive = True
                else:
                    directive_count += 1
                    in_directive = False
                continue

            no_directive_lines.append(line)

        return no_directive_lines, directive_count

    def _language_specific_process(
        self, lines: List[str], original_lines: List[str]
    ) -> Dict[str, int]:
        """Count logical statements with a character-level state machine.

        Rules:
        - ``;``, ``{`` and ``}`` outside parentheses/brackets end a statement;
          non-empty text before the terminator counts as one statement and
          is classified by ``_is_data_declaration``.
        - A ``for``/``if``/``while`` header counts as one executable
          instruction when its parenthesis closes. The header text is then
          dropped, so a following ``{`` does not count it a second time.
        - Statements may span lines; a space is inserted at each line
          boundary so tokens from adjacent lines are not concatenated.
        - Text left over at the end of input counts as one statement.

        Args:
            lines: Lines with comments and directives removed.
            original_lines: Unused; kept for interface compatibility.

        Returns:
            Dict with ``data_decl``, ``exec_inst`` and ``logical_sloc``
            (the sum of the other two).
        """
        data_decl = 0
        exec_inst = 0
        paren_count = 0
        bracket_count = 0  # For arrays []
        in_control_header = False
        buffer: List[str] = []

        def classify(stmt: str) -> None:
            nonlocal data_decl, exec_inst
            if self._is_data_declaration(stmt):
                data_decl += 1
            else:
                exec_inst += 1

        for line in lines:
            stripped = line.strip()
            if not stripped:
                continue

            if buffer:
                # Keep "else" / "if" on adjacent lines from becoming "elseif".
                buffer.append(" ")

            for char in stripped:
                buffer.append(char)

                if char == "[":
                    bracket_count += 1
                elif char == "]":
                    bracket_count = max(0, bracket_count - 1)
                elif char == "(":
                    paren_count += 1
                    # A control keyword directly before "(" starts a header.
                    words = "".join(buffer[:-1]).split()
                    if words and words[-1] in self.control_keywords:
                        in_control_header = True
                elif char == ")":
                    paren_count = max(0, paren_count - 1)
                    if paren_count == 0 and in_control_header:
                        exec_inst += 1
                        in_control_header = False
                        buffer.clear()
                elif char in ";{}" and paren_count == 0 and bracket_count == 0:
                    stmt = "".join(buffer[:-1]).strip()
                    if stmt:
                        classify(stmt)
                    buffer.clear()

        leftover = "".join(buffer).strip()
        if leftover:
            classify(leftover)

        return {
            "data_decl": data_decl,
            "exec_inst": exec_inst,
            "logical_sloc": data_decl + exec_inst,
        }

    @staticmethod
    def _contains_keyword(text: str, keywords: List[str]) -> bool:
        """Return True if any of *keywords* occurs in *text* as a whole word."""
        if not keywords:
            return False
        alternatives = "|".join(re.escape(kw) for kw in keywords)
        return re.search(r"\b(?:" + alternatives + r")\b", text) is not None

    def _is_data_declaration(self, statement: str) -> bool:
        """Classify *statement* as data declaration (True) or executable (False).

        Matching is case-insensitive. One leading access/storage modifier
        (``public``, ``static``, ``const`` ...) is stripped first. Then, in
        order:

        1. any remaining data keyword  -> data declaration
        2. any exec keyword            -> executable
        3. ``name(`` call pattern      -> executable
        4. ``=`` without ``==``        -> executable
        5. ``*`` or ``[`` present      -> data declaration
        6. otherwise                   -> executable

        Note that data keywords are tested before exec keywords, so a
        statement containing both (e.g. ``return sizeof(int)``) is reported
        as a data declaration.
        """
        stmt_lower = statement.lower()

        # Remove one common leading modifier.
        stmt_lower = re.sub(
            r"^\s*(public|private|protected|static|extern|const|volatile)\s+",
            "",
            stmt_lower,
        )

        if self._contains_keyword(stmt_lower, self.data_keywords):
            return True

        if self._contains_keyword(stmt_lower, self.exec_keywords):
            return False

        # No type keyword is present from here on, so a call or a plain
        # assignment is executable code.
        if re.search(r"\w+\s*\(", statement):
            return False

        if "=" in statement and "==" not in statement:
            return False

        # Default: a pointer or array marker suggests a declaration.
        return "*" in statement or "[" in statement
|
|
|
|
|
|
def analyze_file_ucc_complete(file_path: Path) -> Dict[str, Any]:
    """Analyze *file_path* with a ``UCCCompleteCounter`` configured for C.

    Returns whatever ``UCCCompleteCounter.analyze_file`` returns for the file.
    """
    return UCCCompleteCounter(language="C").analyze_file(file_path)
|