# SXXXXXXX_PyUCC/pyucc/core/ucc_complete_counter.py
# (562 lines, 21 KiB, Python)
"""Complete UCC-compatible counter with full preprocessing pipeline.
This module implements the complete UCC counting flow:
1. PreCountProcess - Remove strings, normalize whitespace
2. CountCommentsSLOC - Remove all comments (block and line)
3. CountBlankSLOC - Identify blank lines
4. CountDirectiveSLOC - Extract and count directives
5. LanguageSpecificProcess - LSLOC state machine with keyword classification
Target: 90-95% accuracy matching UCC v.2018.07 for C/C++
"""
import re
from pathlib import Path
from typing import Dict, Any, List, Tuple
import logging
_LOG = logging.getLogger(__name__)
class UCCCompleteCounter:
"""Complete UCC-compatible counter with full preprocessing."""
def __init__(self, language: str = "C"):
self.language = language.upper()
self._setup_language()
def _setup_language(self):
"""Setup language-specific patterns and keywords."""
if self.language in ["C", "C++", "C_CPP"]:
self._setup_c_cpp()
else:
raise NotImplementedError(f"Language {self.language} not yet supported")
def _setup_c_cpp(self):
"""Setup C/C++ specific patterns and keywords from UCC source."""
# Comment patterns
self.line_comment_start = "//"
self.block_comment_start = "/*"
self.block_comment_end = "*/"
# String quote patterns
self.string_quote = '"'
self.char_quote = "'"
self.escape_char = '\\'
# Continuation line
self.continuation = '\\'
# Compiler directives (from UCC CCJavaCsScalaCounter.cpp)
self.directive_keywords = [
"define", "undef", "if", "ifdef", "ifndef", "else",
"elif", "endif", "include", "pragma", "error",
"warning", "line", "region", "endregion"
]
# Data declaration keywords (from UCC exec_name_list)
self.data_keywords = [
"auto", "bool", "char", "class", "const", "double",
"enum", "extern", "float", "int", "long", "private",
"protected", "public", "register", "short", "signed",
"static", "struct", "typedef", "union", "unsigned",
"virtual", "void", "volatile",
# C++ specific
"namespace", "template", "typename", "explicit",
# Common types
"size_t", "uint8_t", "uint16_t", "uint32_t", "uint64_t",
"int8_t", "int16_t", "int32_t", "int64_t",
"wchar_t", "ptrdiff_t"
]
# Executable instruction keywords (from UCC exec_name_list)
self.exec_keywords = [
"break", "case", "catch", "continue", "default",
"delete", "do", "else", "for", "goto", "if",
"new", "return", "switch", "throw", "try", "while",
# Additional
"sizeof", "typeid", "const_cast", "dynamic_cast",
"reinterpret_cast", "static_cast"
]
# For/if/while control structures
self.control_keywords = ["for", "if", "while"]
def analyze_file(self, file_path: Path) -> Dict[str, Any]:
"""
Analyze file with complete UCC preprocessing pipeline.
Returns dict with UCC-compatible metrics.
"""
if not file_path.exists():
raise FileNotFoundError(f"File not found: {file_path}")
try:
with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
lines = f.readlines()
except Exception as e:
_LOG.error(f"Error reading {file_path}: {e}")
raise
# Store original lines
original_lines = lines.copy()
total_lines = len(lines)
# STEP 1: PreCountProcess - Remove quotes and normalize
processed_lines = self._precount_process(lines)
# STEP 2: CountBlankSLOC - Identify blank lines (BEFORE removing comments!)
blank_lines = self._count_blank_sloc(processed_lines)
# STEP 3: CountCommentsSLOC - Remove all comments
no_comment_lines, comment_whole, comment_embedded = self._count_comments_sloc(
processed_lines, original_lines
)
# STEP 4: CountDirectiveSLOC - Extract directives
no_directive_lines, directive_count = self._count_directive_sloc(
no_comment_lines, original_lines
)
# STEP 5: LanguageSpecificProcess - LSLOC with state machine
lsloc_result = self._language_specific_process(
no_directive_lines, original_lines
)
# Calculate physical SLOC (non-blank, non-comment-only)
physical_sloc = total_lines - blank_lines - comment_whole
return {
'total_lines': total_lines,
'blank_lines': blank_lines,
'comment_whole': comment_whole,
'comment_embedded': comment_embedded,
'compiler_directives': directive_count,
'data_declarations': lsloc_result['data_decl'],
'exec_instructions': lsloc_result['exec_inst'],
'logical_sloc': lsloc_result['logical_sloc'],
'physical_sloc': physical_sloc
}
def _precount_process(self, lines: List[str]) -> List[str]:
"""
PreCountProcess: Remove string literals and normalize.
Replaces quoted strings with empty quotes to avoid counting
keywords/terminators inside strings.
"""
processed = []
for line in lines:
# Remove string literals but keep the quotes
cleaned = self._remove_string_literals(line)
processed.append(cleaned)
return processed
def _remove_string_literals(self, line: str) -> str:
"""Remove content of string and char literals, keep quotes."""
result = []
i = 0
while i < len(line):
char = line[i]
# Check for string literal
if char == self.string_quote:
result.append(char)
i += 1
# Skip until closing quote or end of line
while i < len(line):
if line[i] == self.escape_char and i + 1 < len(line):
# Skip escaped character
i += 2
elif line[i] == self.string_quote:
result.append(line[i])
i += 1
break
else:
# Don't include string content
i += 1
# Check for char literal
elif char == self.char_quote:
result.append(char)
i += 1
# Skip until closing quote or end of line
while i < len(line):
if line[i] == self.escape_char and i + 1 < len(line):
# Skip escaped character
i += 2
elif line[i] == self.char_quote:
result.append(line[i])
i += 1
break
else:
# Don't include char content
i += 1
else:
result.append(char)
i += 1
return ''.join(result)
def _count_comments_sloc(
self,
lines: List[str],
original_lines: List[str]
) -> Tuple[List[str], int, int]:
"""
CountCommentsSLOC: Remove all comments and count whole/embedded.
UCC counts EVERY line in a multi-line block comment as comment_whole.
"""
no_comment_lines = []
comment_whole = 0
comment_embedded = 0
in_block_comment = False
for i, line in enumerate(lines):
original_stripped = line.strip()
cleaned = line
# Handle being inside a block comment from previous line
if in_block_comment:
# Count this continuation line as comment_whole
comment_whole += 1
end_pos = cleaned.find(self.block_comment_end)
if end_pos != -1:
# Block comment ends
after_comment = cleaned[end_pos + len(self.block_comment_end):].strip()
cleaned = cleaned[end_pos + len(self.block_comment_end):]
in_block_comment = False
# If has code after, it's embedded
if after_comment:
comment_embedded += 1
# But we already counted as whole above, so subtract 1
comment_whole -= 1
else:
# Still in block comment
cleaned = ""
no_comment_lines.append(cleaned)
continue
# Check for whole line comments (line or block)
if original_stripped.startswith(self.line_comment_start):
comment_whole += 1
no_comment_lines.append("")
continue
if original_stripped.startswith(self.block_comment_start):
# Block comment starting at line beginning
comment_whole += 1
end_pos = cleaned.find(self.block_comment_end)
if end_pos != -1:
# Block comment ends on same line
after_comment = cleaned[end_pos + len(self.block_comment_end):].strip()
if after_comment:
# Has code after - it's embedded, not whole
comment_embedded += 1
comment_whole -= 1
cleaned = cleaned[end_pos + len(self.block_comment_end):]
else:
cleaned = ""
else:
# Block comment continues to next line
in_block_comment = True
cleaned = ""
no_comment_lines.append(cleaned)
continue
# Check for embedded comments
line_comment_pos = cleaned.find(self.line_comment_start)
block_comment_pos = cleaned.find(self.block_comment_start)
# Find first comment
first_comment_pos = -1
if line_comment_pos != -1 and block_comment_pos != -1:
first_comment_pos = min(line_comment_pos, block_comment_pos)
elif line_comment_pos != -1:
first_comment_pos = line_comment_pos
elif block_comment_pos != -1:
first_comment_pos = block_comment_pos
if first_comment_pos != -1:
code_before = cleaned[:first_comment_pos].strip()
if code_before:
comment_embedded += 1
else:
comment_whole += 1
# Remove comment
if first_comment_pos == line_comment_pos:
cleaned = cleaned[:line_comment_pos]
else:
# Block comment
end_pos = cleaned.find(self.block_comment_end, block_comment_pos + len(self.block_comment_start))
if end_pos != -1:
cleaned = cleaned[:block_comment_pos] + cleaned[end_pos + len(self.block_comment_end):]
else:
cleaned = cleaned[:block_comment_pos]
in_block_comment = True
no_comment_lines.append(cleaned)
return no_comment_lines, comment_whole, comment_embedded
def _count_blank_sloc(self, lines: List[str]) -> int:
"""Count blank lines (lines with no code after comment removal)."""
blank_count = 0
for line in lines:
if not line.strip():
blank_count += 1
return blank_count
def _count_directive_sloc(
self,
lines: List[str],
original_lines: List[str]
) -> Tuple[List[str], int]:
"""
CountDirectiveSLOC: Extract and count compiler directives.
Returns:
- Lines with directives blanked
- Count of directive statements (logical)
"""
no_directive_lines = []
directive_count = 0
in_directive = False
directive_statement = ""
for i, line in enumerate(lines):
stripped = line.lstrip()
# Check if this is a directive line
if stripped.startswith('#'):
# Check if it's a recognized directive
is_directive = False
for keyword in self.directive_keywords:
if re.match(r'#\s*' + keyword + r'\b', stripped):
is_directive = True
break
if is_directive:
in_directive = True
directive_statement += stripped
# Check for continuation
if stripped.rstrip().endswith(self.continuation):
# Directive continues on next line
no_directive_lines.append("")
continue
else:
# Directive complete
directive_count += 1
directive_statement = ""
in_directive = False
no_directive_lines.append("")
continue
elif in_directive:
# Continuation of directive
directive_statement += stripped
if stripped.rstrip().endswith(self.continuation):
no_directive_lines.append("")
continue
else:
# Directive complete
directive_count += 1
directive_statement = ""
in_directive = False
no_directive_lines.append("")
continue
no_directive_lines.append(line)
return no_directive_lines, directive_count
def _language_specific_process(
self,
lines: List[str],
original_lines: List[str]
) -> Dict[str, int]:
"""
LanguageSpecificProcess: LSLOC counting with state machine.
Implements UCC's LSLOC algorithm with:
- Statement terminator detection (;, {, })
- Parenthesis tracking for for/if/while
- Keyword-based data vs exec classification
- Multi-line statement accumulation
"""
data_decl = 0
exec_inst = 0
logical_sloc = 0
# State machine variables (maintained across lines)
paren_count = 0
brace_count = 0
bracket_count = 0 # For arrays []
for_flag = False
found_forifwhile = False
statement_buffer = ""
for line in lines:
stripped = line.strip()
if not stripped:
continue
# Process each character looking for terminators
i = 0
while i < len(stripped):
char = stripped[i]
statement_buffer += char
# Track brackets, parentheses, braces
if char == '[':
bracket_count += 1
elif char == ']':
bracket_count = max(0, bracket_count - 1)
elif char == '(':
paren_count += 1
# Check if this starts a for/if/while
# Look for keyword before the (
before_paren = statement_buffer[:statement_buffer.rfind('(')].strip()
words = before_paren.split()
if words and words[-1] in self.control_keywords:
found_forifwhile = True
if words[-1] == "for":
for_flag = True
elif char == ')':
paren_count = max(0, paren_count - 1)
# If for/if/while condition closed, count it as exec
if paren_count == 0 and found_forifwhile:
logical_sloc += 1
exec_inst += 1
found_forifwhile = False
for_flag = False
# Check for statement terminators
elif char == ';':
if paren_count == 0 and bracket_count == 0:
# End of statement
stmt = statement_buffer.strip()
if stmt and len(stmt) > 1: # Not just ;
# Remove trailing ;
stmt = stmt[:-1].strip()
if stmt:
logical_sloc += 1
# Classify as data or exec
if self._is_data_declaration(stmt):
data_decl += 1
else:
exec_inst += 1
statement_buffer = ""
for_flag = False
elif char == '{':
brace_count += 1
if paren_count == 0 and bracket_count == 0:
# Start of block
stmt = statement_buffer.strip()[:-1].strip() # Remove {
if stmt and not found_forifwhile:
logical_sloc += 1
if self._is_data_declaration(stmt):
data_decl += 1
else:
exec_inst += 1
statement_buffer = ""
found_forifwhile = False
elif char == '}':
brace_count = max(0, brace_count - 1)
if paren_count == 0 and bracket_count == 0:
# End of block
stmt = statement_buffer.strip()[:-1].strip() # Remove }
if stmt:
logical_sloc += 1
if self._is_data_declaration(stmt):
data_decl += 1
else:
exec_inst += 1
statement_buffer = ""
i += 1
# Handle any remaining statement
if statement_buffer.strip():
logical_sloc += 1
if self._is_data_declaration(statement_buffer):
data_decl += 1
else:
exec_inst += 1
return {
'data_decl': data_decl,
'exec_inst': exec_inst,
'logical_sloc': logical_sloc
}
def _is_data_declaration(self, statement: str) -> bool:
"""
Determine if statement is a data declaration or executable instruction.
Uses keyword matching similar to UCC.
"""
stmt_lower = statement.lower()
# Remove common prefixes
stmt_lower = re.sub(r'^\s*(public|private|protected|static|extern|const|volatile)\s+', '', stmt_lower)
# Check for data keywords
for keyword in self.data_keywords:
if re.search(r'\b' + keyword + r'\b', stmt_lower):
return True
# Check for exec keywords (takes precedence)
for keyword in self.exec_keywords:
if re.search(r'\b' + keyword + r'\b', stmt_lower):
return False
# Check for function call pattern (name followed by parenthesis)
if re.search(r'\w+\s*\(', statement):
# Could be function call (exec) or function declaration (data)
# If no type keyword before, likely a call
has_type = any(re.search(r'\b' + kw + r'\b', stmt_lower) for kw in self.data_keywords)
return has_type
# Check for assignment (likely exec)
if '=' in statement and not '==' in statement:
# Could be initialization or assignment
# If has type keyword, it's data declaration with initialization
has_type = any(re.search(r'\b' + kw + r'\b', stmt_lower) for kw in self.data_keywords)
return has_type
# Default: if has pointer or array, likely data
if '*' in statement or '[' in statement:
return True
# Default to exec
return False
def analyze_file_ucc_complete(file_path: Path) -> Dict[str, Any]:
    """Convenience wrapper: run the complete UCC counter (C) on *file_path*."""
    return UCCCompleteCounter(language="C").analyze_file(file_path)