# Source: SXXXXXXX_PyUCC/pyucc/core/ucc_python_counter.py (409 lines, 15 KiB, Python)

"""
UCC-compatible counter for Python files.
Implements UCC algorithms for Python with the following metrics:
- Comment Whole Lines
- Comment Embedded Lines
- Compiler Directives (import/from statements)
- Exec Instructions (all executable code in Python)
- Logical SLOC (statement count)
- Physical SLOC (non-blank, non-comment lines)
Note: Python does not distinguish between data declarations and executable instructions,
so data_declarations is always 0.
"""
import re
from pathlib import Path
from typing import Dict, List, Tuple
class UCCPythonCounter:
    """UCC-compatible counter for Python files.

    A file is analysed in successive passes over its physical lines:

    1. blank lines are counted;
    2. string contents are masked with ``$`` so later passes cannot mistake
       text inside a string for syntax;
    3. comments (``#`` comments and docstring-style triple-quoted blocks)
       are counted and removed;
    4. import directives are counted and removed;
    5. what remains is counted as physical / logical SLOC.

    Every call to :meth:`analyze_file` starts from zeroed counters, so one
    instance can safely be reused for several files.
    """

    # Python directives (imports)
    DIRECTIVES = {
        'import', 'from', 'as'
    }
    # Python exec keywords (all are exec, no data declarations)
    EXEC_KEYWORDS = {
        'and', 'as', 'assert', 'break', 'continue', 'def', 'del',
        'elif', 'else', 'except', 'exec', 'exit', 'finally', 'for',
        'global', 'if', 'in', 'is', 'lambda', 'not', 'or', 'pass',
        'print', 'raise', 'return', 'try', 'while', 'with', 'yield',
        'class', 'async', 'await', 'nonlocal'
    }
    # Continuation indicators
    CONTINUATION_CHARS = {'+', '-', '*', '/', '=', '<', '>', '|', '&', '%', '^', '\\', '~', ','}
    CONTINUATION_KEYWORDS = {'is', 'in', 'not', 'and', 'or'}

    # Text allowed in front of a triple quote for it to still be treated as a
    # docstring-style block (nothing at all, or a string prefix such as r/u/b/f).
    _STRING_PREFIXES = frozenset({'', 'r', 'u', 'b', 'f', 'rb', 'br', 'rf', 'fr'})
    # Tokens that start a comment in a masked line.
    _COMMENT_MARKERS = ('#', '"""', "'''")

    def __init__(self):
        self.results = self._empty_results()

    @staticmethod
    def _empty_results() -> Dict[str, int]:
        """Return a fresh metrics dict with every counter at zero."""
        return {
            'comment_whole': 0,
            'comment_embedded': 0,
            'compiler_directives': 0,
            'data_declarations': 0,  # Always 0 for Python
            'exec_instructions': 0,
            'logical_sloc': 0,
            'physical_sloc': 0,
            'blank_lines': 0,
        }

    def analyze_file(self, file_path: Path) -> Dict[str, int]:
        """
        Analyze a Python file using UCC algorithms.

        Args:
            file_path: Path of the file to read (decoded as UTF-8, undecodable
                bytes are ignored).

        Returns:
            A copy of the metrics dict for this file only. If the file cannot
            be opened or read, every metric is 0.
        """
        try:
            with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                lines = f.readlines()
        except (OSError, ValueError):
            # Best effort: an unreadable file is reported as empty. The counters
            # are reset so results of a previous file are not returned by mistake.
            self.results = self._empty_results()
            return self.results.copy()
        return self._analyze_lines(lines)

    def _analyze_lines(self, lines: List[str]) -> Dict[str, int]:
        """Run all counting passes over already-loaded lines and return the metrics."""
        # Start from zero so repeated calls on one instance do not accumulate.
        self.results = self._empty_results()
        # Step 1: Count blank lines BEFORE any processing
        self._count_blank_lines(lines)
        # Step 2: Process strings and comments
        processed_lines, original_lines = self._preprocess_lines(lines)
        # Step 3: Count and remove comments (updates comment_whole, comment_embedded)
        processed_lines = self._count_and_remove_comments(processed_lines, original_lines)
        # Step 4: Count directives (import/from statements)
        processed_lines = self._count_directives(processed_lines, original_lines)
        # Step 5: Count logical SLOC and exec instructions
        self._count_logical_sloc(processed_lines, original_lines)
        return self.results.copy()

    def _count_blank_lines(self, lines: List[str]) -> None:
        """Count blank lines before any processing (UCC counts originally blank lines)."""
        self.results['blank_lines'] += sum(1 for line in lines if not line.strip())

    @staticmethod
    def _bracket_depth(text: str, depth: int = 0) -> int:
        """Return the ()/[]/{} nesting depth after scanning *text*.

        The depth never drops below zero, so a stray closing bracket cannot
        throw off the tracking of every later line.
        """
        for ch in text:
            if ch in '([{':
                depth += 1
            elif ch in ')]}' and depth > 0:
                depth -= 1
        return depth

    @staticmethod
    def _mask_string(line: str, start: int) -> Tuple[str, int]:
        """Mask the single-line string literal that opens at ``line[start]``.

        Returns:
            ``(masked, end)`` where *masked* keeps the quote characters and has
            every character of the body replaced by ``$``, and *end* is the
            index just past the literal (``len(line)`` if it is unterminated).
        """
        quote = line[start]
        i = start + 1
        n = len(line)
        while i < n:
            if line[i] == '\\' and i + 1 < n:
                i += 2  # skip the escaped character as well
            elif line[i] == quote:
                return quote + '$' * (i - start - 1) + quote, i + 1
            else:
                i += 1
        return quote + '$' * (n - start - 1), n

    def _preprocess_lines(self, lines: List[str]) -> Tuple[List[str], List[str]]:
        """
        Preprocess lines: remove string contents but keep structure.

        Each line is scanned left to right so that quotes inside ``#`` comments
        and triple quotes inside ordinary strings are not misread.

        * Ordinary strings keep their quotes; the body becomes ``$``.
        * A triple-quoted string that starts a line (optionally with a string
          prefix) outside any bracket is treated as a docstring-style block:
          its delimiters are kept so the comment pass can recognise it, and
          its body becomes ``$``.
        * Any other triple-quoted string is part of an expression and is
          masked completely (delimiters included), i.e. it stays code.
        * Text after ``#`` is copied unchanged.

        NOTE: the lines of a multi-line string used in an expression are still
        counted as separate statements by the logical SLOC pass.

        Returns (processed_lines, original_lines), both without line endings.
        """
        processed = []
        original = []
        delim = None        # delimiter of the triple-quoted string currently open
        as_comment = False  # whether that string is a docstring-style block
        depth = 0           # bracket nesting of the code seen so far
        for line in lines:
            text = line.rstrip('\n\r')
            original.append(text)
            out = []
            i = 0
            n = len(text)
            while i < n:
                if delim is not None:
                    end = text.find(delim, i)
                    if end == -1:
                        # Entire remainder is inside the string
                        out.append('$' * (n - i))
                        break
                    out.append('$' * (end - i))
                    out.append(delim if as_comment else '$$$')
                    i = end + 3
                    delim = None
                    continue
                ch = text[i]
                if ch == '#':
                    # Rest of the line is a comment; quotes in it mean nothing.
                    out.append(text[i:])
                    break
                if ch in ('"', "'"):
                    if text.startswith(ch * 3, i):
                        before = ''.join(out)
                        delim = ch * 3
                        as_comment = (depth == 0 and
                                      before.strip().lower() in self._STRING_PREFIXES)
                        if as_comment:
                            # Drop a string prefix (e.g. r) so it is not left
                            # behind as a one-letter "statement".
                            out = [' ' * len(before)]
                        out.append(delim if as_comment else '$$$')
                        i += 3
                    else:
                        masked, i = self._mask_string(text, i)
                        out.append(masked)
                    continue
                if ch in '([{':
                    depth += 1
                elif ch in ')]}' and depth > 0:
                    depth -= 1
                out.append(ch)
                i += 1
            processed.append(''.join(out))
        return processed, original

    def _replace_quotes(self, line: str) -> str:
        """Replace content of single and double quoted strings with $.

        Standalone single-line helper: quotes are kept, escape sequences are
        honoured, and an unterminated string is masked to the end of the line.
        """
        pieces = []
        i = 0
        while i < len(line):
            if line[i] in ('"', "'"):
                masked, i = self._mask_string(line, i)
                pieces.append(masked)
            else:
                pieces.append(line[i])
                i += 1
        return ''.join(pieces)

    def _count_and_remove_comments(self, processed: List[str], original: List[str]) -> List[str]:
        """
        Count whole and embedded comments, then remove them.

        Works on the masked lines from ``_preprocess_lines``. A line that holds
        only comment text counts as ``comment_whole`` (every non-blank line of
        a multi-line block counts); a line that holds comment text and code
        counts as ``comment_embedded`` and only its code is kept. Blank lines
        are passed through untouched because they were already counted.

        Args:
            processed: Masked lines.
            original: Unused; accepted for interface compatibility.

        Returns:
            One entry per input line with comments removed ('' for lines that
            were blank or comment-only).
        """
        result = []
        block_delim = None  # delimiter of the block comment currently open
        for proc_line in processed:
            # Skip blank lines (already processed and counted)
            if not proc_line.strip():
                result.append('')
                continue
            code_parts = []
            has_comment = block_delim is not None
            pos = 0
            line_len = len(proc_line)
            while pos < line_len:
                if block_delim is not None:
                    close = proc_line.find(block_delim, pos)
                    if close == -1:
                        break  # block continues on the next line
                    pos = close + 3
                    block_delim = None
                    continue
                # Find whichever comment marker comes first from here.
                starts = []
                for marker in self._COMMENT_MARKERS:
                    idx = proc_line.find(marker, pos)
                    if idx != -1:
                        starts.append((idx, marker))
                if not starts:
                    code_parts.append(proc_line[pos:])
                    break
                idx, marker = min(starts)
                code_parts.append(proc_line[pos:idx])
                has_comment = True
                if marker == '#':
                    break  # a # comment runs to the end of the line
                block_delim = marker
                pos = idx + 3
            if not has_comment:
                result.append(proc_line)
                continue
            code = ' '.join(part.strip() for part in code_parts if part.strip())
            if code:
                self.results['comment_embedded'] += 1
                result.append(code)
            else:
                self.results['comment_whole'] += 1
                result.append('')
        return result

    def _count_directives(self, processed: List[str], original: List[str]) -> List[str]:
        """
        Count and extract compiler directives (import/from statements).

        A directive starts on a line whose first token is in ``DIRECTIVES`` and
        continues while the line ends with a backslash or a bracket opened by
        the directive is still unclosed (``from x import (`` ... ``)``). Every
        physical line of a directive adds one to ``compiler_directives``.

        Args:
            processed: Lines with comments already removed.
            original: Unused; accepted for interface compatibility.

        Returns lines with directives removed.
        """
        result = []
        in_directive = False
        depth = 0
        for proc_line in processed:
            stripped = proc_line.strip()
            if not stripped:
                result.append('')
                continue
            if not in_directive:
                if stripped.split()[0] not in self.DIRECTIVES:
                    result.append(proc_line)
                    continue
                depth = 0
            self.results['compiler_directives'] += 1
            depth = self._bracket_depth(stripped, depth)
            in_directive = depth > 0 or stripped.endswith('\\')
            result.append('')
        return result

    @staticmethod
    def _colon_terminates(statement: str, idx: int) -> bool:
        """Tell whether the ':' at *idx* ends a statement (not ':=' and not 'else:')."""
        if statement.startswith(':=', idx):
            return False
        return re.search(r'\belse\s*$', statement[:idx]) is None

    def _count_statements(self, statement: str) -> int:
        """Count the statements packed into one logical line.

        ``;`` and ``:`` act as terminators only outside brackets, so dict
        literals, slices and annotations inside parentheses are not split.
        A terminator with nothing after it does not start another statement.
        The colon of ``else:`` and the ``:=`` operator are not terminators.
        At least 1 is returned.
        """
        count = 0
        depth = 0
        has_text = False
        for idx, ch in enumerate(statement):
            if ch in '([{':
                depth += 1
            elif ch in ')]}':
                if depth > 0:
                    depth -= 1
            elif depth == 0 and (
                    ch == ';' or (ch == ':' and self._colon_terminates(statement, idx))):
                if has_text:
                    count += 1
                has_text = False
                continue
            if not ch.isspace():
                has_text = True
        if has_text:
            count += 1
        return max(1, count)

    def _count_logical_sloc(self, processed: List[str], original: List[str]) -> None:
        """
        Count logical SLOC and exec instructions.
        Python: all non-blank, non-comment, non-directive lines are exec.

        Physical lines are merged into one logical line while a bracket is
        open, the line ends with a backslash, or the line ends with a
        continuation operator/keyword. A completed logical line adds one to
        ``exec_instructions`` and its statement count to ``logical_sloc``.
        A line ending in ``else:`` is a physical line but not a statement;
        ``elif <cond>:`` headers are counted like any other statement.

        NOTE(review): ``physical_sloc`` here excludes import lines, which were
        removed by the directive pass; confirm callers add them if needed.

        Args:
            processed: Lines with comments and directives removed.
            original: Unused; accepted for interface compatibility.
        """
        accumulated_statement = ''
        depth = 0
        for proc_line in processed:
            stripped = proc_line.strip()
            if not stripped:
                continue
            # This is a physical line (non-blank, non-comment, non-directive)
            self.results['physical_sloc'] += 1
            depth = self._bracket_depth(stripped, depth)
            accumulated_statement += ' ' + stripped
            # Statement continues if:
            # 1. Inside parentheses/brackets/braces
            if depth > 0:
                continue
            # 2. Ends with continuation character
            if stripped.endswith('\\'):
                accumulated_statement = accumulated_statement.rstrip('\\')
                continue
            # 3. Ends with continuation operator/keyword
            last_token = self._get_last_token(stripped)
            if last_token in self.CONTINUATION_CHARS or last_token in self.CONTINUATION_KEYWORDS:
                continue
            # 4. Special case: else: is not counted as a separate statement
            if stripped.endswith('else:'):
                accumulated_statement = ''
                continue
            # Count as exec instruction and logical SLOC
            if self._is_exec_instruction(accumulated_statement):
                self.results['exec_instructions'] += 1
            self.results['logical_sloc'] += self._count_statements(accumulated_statement)
            accumulated_statement = ''
        # Handle incomplete statement at end of file
        if accumulated_statement.strip():
            if self._is_exec_instruction(accumulated_statement):
                self.results['exec_instructions'] += 1
            self.results['logical_sloc'] += 1

    def _get_last_token(self, line: str) -> str:
        """Extract last meaningful token from line.

        Returns the final character when it is a continuation character,
        otherwise the last whitespace-separated word ('' for an empty line).
        """
        line = line.rstrip()
        if not line:
            return ''
        # Check if last char is an operator
        if line[-1] in self.CONTINUATION_CHARS:
            return line[-1]
        # Extract last word
        tokens = line.split()
        return tokens[-1] if tokens else ''

    def _is_exec_instruction(self, statement: str) -> bool:
        """Check if statement is executable.

        Python has no data declarations, so any statement with content is
        executable; only an empty or whitespace-only string returns False.
        """
        return bool(statement.strip())