"""Complete UCC-compatible counter with full preprocessing pipeline.
|
|
|
|
This module implements the complete UCC counting flow:
|
|
1. PreCountProcess - Remove strings, normalize whitespace
|
|
2. CountCommentsSLOC - Remove all comments (block and line)
|
|
3. CountBlankSLOC - Identify blank lines
|
|
4. CountDirectiveSLOC - Extract and count directives
|
|
5. LanguageSpecificProcess - LSLOC state machine with keyword classification
|
|
|
|
Target: 90-95% accuracy matching UCC v.2018.07 for C/C++
|
|
"""

import re
import logging
from pathlib import Path
from typing import Dict, Any, List, Tuple

_LOG = logging.getLogger(__name__)


class UCCCompleteCounter:
    """Complete UCC-compatible counter with full preprocessing."""

    def __init__(self, language: str = "C"):
        self.language = language.upper()
        self._setup_language()

    def _setup_language(self):
        """Set up language-specific patterns and keywords."""
        if self.language in ["C", "C++", "C_CPP"]:
            self._setup_c_cpp()
        else:
            raise NotImplementedError(f"Language {self.language} not yet supported")

    def _setup_c_cpp(self):
        """Set up C/C++ specific patterns and keywords from the UCC source."""

        # Comment patterns
        self.line_comment_start = "//"
        self.block_comment_start = "/*"
        self.block_comment_end = "*/"

        # String quote patterns
        self.string_quote = '"'
        self.char_quote = "'"
        self.escape_char = '\\'

        # Continuation line
        self.continuation = '\\'

        # Compiler directives (from UCC CCJavaCsScalaCounter.cpp)
        self.directive_keywords = [
            "define", "undef", "if", "ifdef", "ifndef", "else",
            "elif", "endif", "include", "pragma", "error",
            "warning", "line", "region", "endregion"
        ]

        # Data declaration keywords (from UCC's data keyword list)
        self.data_keywords = [
            "auto", "bool", "char", "class", "const", "double",
            "enum", "extern", "float", "int", "long", "private",
            "protected", "public", "register", "short", "signed",
            "static", "struct", "typedef", "union", "unsigned",
            "virtual", "void", "volatile",
            # C++ specific
            "namespace", "template", "typename", "explicit",
            # Common types
            "size_t", "uint8_t", "uint16_t", "uint32_t", "uint64_t",
            "int8_t", "int16_t", "int32_t", "int64_t",
            "wchar_t", "ptrdiff_t"
        ]

        # Executable instruction keywords (from UCC exec_name_list)
        self.exec_keywords = [
            "break", "case", "catch", "continue", "default",
            "delete", "do", "else", "for", "goto", "if",
            "new", "return", "switch", "throw", "try", "while",
            # Additional
            "sizeof", "typeid", "const_cast", "dynamic_cast",
            "reinterpret_cast", "static_cast"
        ]

        # for/if/while control structures
        self.control_keywords = ["for", "if", "while"]

    def analyze_file(self, file_path: Path) -> Dict[str, Any]:
        """
        Analyze a file with the complete UCC preprocessing pipeline.

        Returns a dict with UCC-compatible metrics.
        """
        if not file_path.exists():
            raise FileNotFoundError(f"File not found: {file_path}")

        try:
            with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                lines = f.readlines()
        except Exception as e:
            _LOG.error(f"Error reading {file_path}: {e}")
            raise

        # Store original lines
        original_lines = lines.copy()
        total_lines = len(lines)

        # STEP 1: PreCountProcess - blank out string/char literal contents
        processed_lines = self._precount_process(lines)

        # STEP 2: CountBlankSLOC - identify blank lines (BEFORE removing comments,
        # so comment-only lines are not later miscounted as blank)
        blank_lines = self._count_blank_sloc(processed_lines)

        # STEP 3: CountCommentsSLOC - remove all comments
        no_comment_lines, comment_whole, comment_embedded = self._count_comments_sloc(
            processed_lines, original_lines
        )

        # STEP 4: CountDirectiveSLOC - extract directives
        no_directive_lines, directive_count = self._count_directive_sloc(
            no_comment_lines, original_lines
        )

        # STEP 5: LanguageSpecificProcess - LSLOC with state machine
        lsloc_result = self._language_specific_process(
            no_directive_lines, original_lines
        )

        # Physical SLOC = non-blank, non-comment-only lines
        physical_sloc = total_lines - blank_lines - comment_whole

        return {
            'total_lines': total_lines,
            'blank_lines': blank_lines,
            'comment_whole': comment_whole,
            'comment_embedded': comment_embedded,
            'compiler_directives': directive_count,
            'data_declarations': lsloc_result['data_decl'],
            'exec_instructions': lsloc_result['exec_inst'],
            'logical_sloc': lsloc_result['logical_sloc'],
            'physical_sloc': physical_sloc
        }

    def _precount_process(self, lines: List[str]) -> List[str]:
        """
        PreCountProcess: Remove string literals and normalize.

        Replaces quoted string contents with empty quotes to avoid counting
        keywords/terminators inside strings.
        """
        processed = []

        for line in lines:
            # Remove string literal contents but keep the quotes
            cleaned = self._remove_string_literals(line)
            processed.append(cleaned)

        return processed

    def _remove_string_literals(self, line: str) -> str:
        """Remove the content of string and char literals, keeping the quotes."""
        result = []
        i = 0

        while i < len(line):
            char = line[i]

            # String ("...") and char ('...') literals are handled identically:
            # keep the opening/closing quote, drop everything in between.
            if char in (self.string_quote, self.char_quote):
                quote = char
                result.append(char)
                i += 1
                # Skip until the matching closing quote or end of line
                while i < len(line):
                    if line[i] == self.escape_char and i + 1 < len(line):
                        # Skip escaped character
                        i += 2
                    elif line[i] == quote:
                        result.append(line[i])
                        i += 1
                        break
                    else:
                        # Don't include literal content
                        i += 1
            else:
                result.append(char)
                i += 1

        return ''.join(result)

    def _count_comments_sloc(
        self,
        lines: List[str],
        original_lines: List[str]
    ) -> Tuple[List[str], int, int]:
        """
        CountCommentsSLOC: Remove all comments and count whole/embedded comments.

        UCC counts EVERY line in a multi-line block comment as comment_whole.
        """
        no_comment_lines = []
        comment_whole = 0
        comment_embedded = 0
        in_block_comment = False

        for line in lines:
            original_stripped = line.strip()
            cleaned = line

            # Handle being inside a block comment from a previous line
            if in_block_comment:
                # Count this continuation line as comment_whole
                comment_whole += 1

                end_pos = cleaned.find(self.block_comment_end)
                if end_pos != -1:
                    # Block comment ends on this line
                    after_comment = cleaned[end_pos + len(self.block_comment_end):].strip()
                    cleaned = cleaned[end_pos + len(self.block_comment_end):]
                    in_block_comment = False
                    # If code follows the comment, it is embedded
                    if after_comment:
                        comment_embedded += 1
                        # But we already counted it as whole above, so subtract 1
                        comment_whole -= 1
                else:
                    # Still inside the block comment
                    cleaned = ""

                no_comment_lines.append(cleaned)
                continue

            # Check for whole-line comments (line or block)
            if original_stripped.startswith(self.line_comment_start):
                comment_whole += 1
                no_comment_lines.append("")
                continue

            if original_stripped.startswith(self.block_comment_start):
                # Block comment starting at the beginning of the line
                comment_whole += 1

                end_pos = cleaned.find(self.block_comment_end)
                if end_pos != -1:
                    # Block comment ends on the same line
                    after_comment = cleaned[end_pos + len(self.block_comment_end):].strip()
                    if after_comment:
                        # Code follows the comment - it's embedded, not whole
                        comment_embedded += 1
                        comment_whole -= 1
                        cleaned = cleaned[end_pos + len(self.block_comment_end):]
                    else:
                        cleaned = ""
                else:
                    # Block comment continues on the next line
                    in_block_comment = True
                    cleaned = ""

                no_comment_lines.append(cleaned)
                continue

            # Check for embedded comments
            line_comment_pos = cleaned.find(self.line_comment_start)
            block_comment_pos = cleaned.find(self.block_comment_start)

            # Find the first comment marker on the line
            first_comment_pos = -1
            if line_comment_pos != -1 and block_comment_pos != -1:
                first_comment_pos = min(line_comment_pos, block_comment_pos)
            elif line_comment_pos != -1:
                first_comment_pos = line_comment_pos
            elif block_comment_pos != -1:
                first_comment_pos = block_comment_pos

            if first_comment_pos != -1:
                code_before = cleaned[:first_comment_pos].strip()
                if code_before:
                    comment_embedded += 1
                else:
                    comment_whole += 1

                # Remove the comment
                if first_comment_pos == line_comment_pos:
                    cleaned = cleaned[:line_comment_pos]
                else:
                    # Block comment
                    end_pos = cleaned.find(self.block_comment_end, block_comment_pos + len(self.block_comment_start))
                    if end_pos != -1:
                        cleaned = cleaned[:block_comment_pos] + cleaned[end_pos + len(self.block_comment_end):]
                    else:
                        cleaned = cleaned[:block_comment_pos]
                        in_block_comment = True

            no_comment_lines.append(cleaned)

        return no_comment_lines, comment_whole, comment_embedded

    def _count_blank_sloc(self, lines: List[str]) -> int:
        """Count blank lines (lines that are empty or whitespace-only)."""
        blank_count = 0

        for line in lines:
            if not line.strip():
                blank_count += 1

        return blank_count

    def _count_directive_sloc(
        self,
        lines: List[str],
        original_lines: List[str]
    ) -> Tuple[List[str], int]:
        """
        CountDirectiveSLOC: Extract and count compiler directives.

        Returns:
            - Lines with directives blanked out
            - Count of directive statements (logical)
        """
        no_directive_lines = []
        directive_count = 0
        in_directive = False

        for line in lines:
            stripped = line.lstrip()

            # Check whether this line starts a directive
            if stripped.startswith('#'):
                # Check whether it is a recognized directive
                is_directive = False
                for keyword in self.directive_keywords:
                    if re.match(r'#\s*' + keyword + r'\b', stripped):
                        is_directive = True
                        break

                if is_directive:
                    in_directive = True

                    # A trailing backslash continues the directive on the next line
                    if stripped.rstrip().endswith(self.continuation):
                        no_directive_lines.append("")
                        continue
                    else:
                        # Directive complete
                        directive_count += 1
                        in_directive = False
                        no_directive_lines.append("")
                        continue

            elif in_directive:
                # Continuation of a directive
                if stripped.rstrip().endswith(self.continuation):
                    no_directive_lines.append("")
                    continue
                else:
                    # Directive complete
                    directive_count += 1
                    in_directive = False
                    no_directive_lines.append("")
                    continue

            no_directive_lines.append(line)

        return no_directive_lines, directive_count

    def _language_specific_process(
        self,
        lines: List[str],
        original_lines: List[str]
    ) -> Dict[str, int]:
        """
        LanguageSpecificProcess: LSLOC counting with a state machine.

        Implements UCC's LSLOC algorithm with:
        - Statement terminator detection (;, {, })
        - Parenthesis tracking for for/if/while
        - Keyword-based data vs exec classification
        - Multi-line statement accumulation
        """
        data_decl = 0
        exec_inst = 0
        logical_sloc = 0

        # State machine variables (maintained across lines)
        paren_count = 0
        brace_count = 0
        bracket_count = 0  # For arrays []
        for_flag = False
        found_forifwhile = False
        statement_buffer = ""

        for line in lines:
            stripped = line.strip()

            if not stripped:
                continue

            # Process each character looking for terminators
            i = 0

            while i < len(stripped):
                char = stripped[i]
                statement_buffer += char

                # Track brackets, parentheses, braces
                if char == '[':
                    bracket_count += 1
                elif char == ']':
                    bracket_count = max(0, bracket_count - 1)
                elif char == '(':
                    paren_count += 1
                    # Check whether a for/if/while keyword immediately precedes the (
                    before_paren = statement_buffer[:statement_buffer.rfind('(')].strip()
                    words = before_paren.split()
                    if words and words[-1] in self.control_keywords:
                        found_forifwhile = True
                        if words[-1] == "for":
                            for_flag = True

                elif char == ')':
                    paren_count = max(0, paren_count - 1)

                    # When a for/if/while condition closes, count it as exec
                    if paren_count == 0 and found_forifwhile:
                        logical_sloc += 1
                        exec_inst += 1
                        found_forifwhile = False
                        for_flag = False
                        # Clear the buffer so the same statement is not counted
                        # again when the opening brace of its block is reached
                        statement_buffer = ""

                # Check for statement terminators
                elif char == ';':
                    if paren_count == 0 and bracket_count == 0:
                        # End of statement
                        stmt = statement_buffer.strip()
                        if stmt and len(stmt) > 1:  # Not just ;
                            # Remove trailing ;
                            stmt = stmt[:-1].strip()
                            if stmt:
                                logical_sloc += 1
                                # Classify as data or exec
                                if self._is_data_declaration(stmt):
                                    data_decl += 1
                                else:
                                    exec_inst += 1

                        statement_buffer = ""
                        for_flag = False

                elif char == '{':
                    brace_count += 1
                    if paren_count == 0 and bracket_count == 0:
                        # Start of block
                        stmt = statement_buffer.strip()[:-1].strip()  # Remove {
                        if stmt and not found_forifwhile:
                            logical_sloc += 1
                            if self._is_data_declaration(stmt):
                                data_decl += 1
                            else:
                                exec_inst += 1

                        statement_buffer = ""
                        found_forifwhile = False

                elif char == '}':
                    brace_count = max(0, brace_count - 1)
                    if paren_count == 0 and bracket_count == 0:
                        # End of block
                        stmt = statement_buffer.strip()[:-1].strip()  # Remove }
                        if stmt:
                            logical_sloc += 1
                            if self._is_data_declaration(stmt):
                                data_decl += 1
                            else:
                                exec_inst += 1
                        statement_buffer = ""

                i += 1

            # Keep a token boundary between the physical lines of a
            # multi-line statement so keywords are not fused together
            if statement_buffer and not statement_buffer.endswith(" "):
                statement_buffer += " "

        # Handle any remaining statement
        if statement_buffer.strip():
            logical_sloc += 1
            if self._is_data_declaration(statement_buffer):
                data_decl += 1
            else:
                exec_inst += 1

        return {
            'data_decl': data_decl,
            'exec_inst': exec_inst,
            'logical_sloc': logical_sloc
        }

    def _is_data_declaration(self, statement: str) -> bool:
        """
        Determine whether a statement is a data declaration or an executable
        instruction, using keyword matching similar to UCC's.
        """
        stmt_lower = statement.lower()

        # Remove common storage/access-specifier prefixes
        stmt_lower = re.sub(r'^\s*(public|private|protected|static|extern|const|volatile)\s+', '', stmt_lower)

        # Check for data keywords first
        for keyword in self.data_keywords:
            if re.search(r'\b' + keyword + r'\b', stmt_lower):
                return True

        # Then check for exec keywords
        for keyword in self.exec_keywords:
            if re.search(r'\b' + keyword + r'\b', stmt_lower):
                return False

        # Check for a call-like pattern (name followed by a parenthesis)
        if re.search(r'\w+\s*\(', statement):
            # Could be a function call (exec) or a function declaration (data).
            # With no type keyword in front, treat it as a call.
            has_type = any(re.search(r'\b' + kw + r'\b', stmt_lower) for kw in self.data_keywords)
            return has_type

        # Check for assignment (likely exec)
        if '=' in statement and '==' not in statement:
            # If a type keyword is present, it is a declaration with
            # initialization; otherwise a plain assignment
            has_type = any(re.search(r'\b' + kw + r'\b', stmt_lower) for kw in self.data_keywords)
            return has_type

        # Default: a pointer or array suggests data
        if '*' in statement or '[' in statement:
            return True

        # Default to exec
        return False


def analyze_file_ucc_complete(file_path: Path) -> Dict[str, Any]:
    """Convenience function to analyze a file with the complete UCC counter."""
    counter = UCCCompleteCounter(language="C")
    return counter.analyze_file(file_path)
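

# Minimal usage sketch (not part of the original module): runs the counter on a
# C/C++ file given on the command line and prints the resulting metrics. The
# fallback filename "example.c" is illustrative only and assumed to exist in
# the working directory.
if __name__ == "__main__":
    import sys

    target = Path(sys.argv[1]) if len(sys.argv) > 1 else Path("example.c")
    try:
        metrics = analyze_file_ucc_complete(target)
    except FileNotFoundError:
        print(f"No such file: {target}")
    else:
        for name, value in metrics.items():
            print(f"{name}: {value}")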