461 lines
15 KiB
Python
461 lines
15 KiB
Python
"""
UCC-compatible counter for Python files.

Implements UCC algorithms for Python with the following metrics:
- Comment Whole Lines
- Comment Embedded Lines
- Compiler Directives (import/from statements)
- Exec Instructions (all executable code in Python)
- Logical SLOC (statement count)
- Physical SLOC (non-blank, non-comment, non-directive lines)

Note: Python does not distinguish between data declarations and executable instructions,
so data_declarations is always 0.
"""
|
|
|
|
import re
|
|
from pathlib import Path
|
|
from typing import Dict, List, Tuple
|
|
|
|
|
|
class UCCPythonCounter:
    """UCC-compatible counter for Python files.

    ``analyze_file`` runs a fixed pipeline over the lines of one file:

    1. count blank lines,
    2. lex every line (mask string contents, collapse comments),
    3. count whole/embedded comments and strip them,
    4. count and strip import directives,
    5. count physical SLOC, logical SLOC and exec instructions on the rest.

    Both ``#`` comments and triple-quoted strings are treated as comments.
    The metrics of the most recent ``analyze_file`` call are kept in
    ``self.results``.
    """

    # Python directives (imports)
    DIRECTIVES = {"import", "from", "as"}

    # Python exec keywords (all are exec, no data declarations)
    EXEC_KEYWORDS = {
        "and",
        "as",
        "assert",
        "break",
        "continue",
        "def",
        "del",
        "elif",
        "else",
        "except",
        "exec",
        "exit",
        "finally",
        "for",
        "global",
        "if",
        "in",
        "is",
        "lambda",
        "not",
        "or",
        "pass",
        "print",
        "raise",
        "return",
        "try",
        "while",
        "with",
        "yield",
        "class",
        "async",
        "await",
        "nonlocal",
    }

    # Continuation indicators
    CONTINUATION_CHARS = {
        "+",
        "-",
        "*",
        "/",
        "=",
        "<",
        ">",
        "|",
        "&",
        "%",
        "^",
        "\\",
        "~",
        ",",
    }
    CONTINUATION_KEYWORDS = {"is", "in", "not", "and", "or"}

    # Keywords that open a compound-statement header ("<keyword> ... :").
    # Only for these does a top-level colon separate two statements.
    _BLOCK_KEYWORDS = frozenset(
        {
            "if",
            "elif",
            "else",
            "for",
            "while",
            "try",
            "except",
            "finally",
            "with",
            "def",
            "class",
            "async",
        }
    )
    # Letters that may prefix a string literal (r"", b"", f"", rb"", ...).
    _STRING_PREFIX_CHARS = "rRbBuUfF"
    # A line consisting only of a bare "else:" header.
    _ELSE_HEADER = re.compile(r"else\s*:")
    # Leading identifier/keyword of a statement.
    _LEADING_WORD = re.compile(r"[A-Za-z_]\w*")

    def __init__(self) -> None:
        """Create a counter with all metrics set to zero."""
        self.results = self._new_results()

    @staticmethod
    def _new_results() -> Dict[str, int]:
        """Return a fresh metrics dict with every counter at zero."""
        return {
            "comment_whole": 0,
            "comment_embedded": 0,
            "compiler_directives": 0,
            "data_declarations": 0,  # Always 0 for Python
            "exec_instructions": 0,
            "logical_sloc": 0,
            "physical_sloc": 0,
            "blank_lines": 0,
        }

    def analyze_file(self, file_path: Path) -> Dict[str, int]:
        """
        Analyze a Python file using UCC algorithms.

        The counters are reset first, so every call reports the metrics of
        that file only, even when the same instance is reused.

        Args:
            file_path: Path of the file to read (decoded as UTF-8, undecodable
                bytes are ignored).

        Returns:
            A copy of the metrics dict. If the file cannot be opened or read,
            all metrics are zero.
        """
        self.results = self._new_results()

        try:
            with open(file_path, "r", encoding="utf-8", errors="ignore") as handle:
                lines = handle.readlines()
        except (OSError, ValueError):
            # Best effort: an unreadable file yields all-zero metrics.
            return self.results.copy()

        # Step 1: Count blank lines BEFORE any processing
        self._count_blank_lines(lines)

        # Step 2: Mask strings and collapse comments
        processed_lines, original_lines = self._preprocess_lines(lines)

        # Step 3: Count and remove comments (updates comment_whole, comment_embedded)
        processed_lines = self._count_and_remove_comments(
            processed_lines, original_lines
        )

        # Step 4: Count directives (import/from statements)
        processed_lines = self._count_directives(processed_lines, original_lines)

        # Step 5: Count physical SLOC, logical SLOC and exec instructions
        self._count_logical_sloc(processed_lines, original_lines)

        return self.results.copy()

    def _count_blank_lines(self, lines: List[str]) -> None:
        """Count blank lines before any processing (UCC counts originally blank lines)."""
        self.results["blank_lines"] += sum(1 for line in lines if not line.strip())

    def _preprocess_lines(self, lines: List[str]) -> Tuple[List[str], List[str]]:
        """
        Lex every line so later passes only see code structure.

        In each processed line:
        - the content of a single/double quoted string is replaced by ``$``
          (the quotes are kept),
        - a triple-quoted string is replaced by a single ``$`` placeholder on
          the line where it opens; its text is dropped,
        - the text of a ``#`` comment is dropped,
        - a trailing ``#`` marker is appended when the line contained any
          comment text (``#`` comment or part of a triple-quoted string).

        Whitespace-only lines inside a triple-quoted string stay empty so they
        are not counted again as comments.

        Returns (processed_lines, original_lines); ``original_lines`` are the
        input lines without their line terminators.
        """
        processed: List[str] = []
        original: List[str] = []
        open_triple = ""  # delimiter of the triple-quoted string we are inside

        for line in lines:
            text = line.rstrip("\n\r")
            original.append(text)
            scanned, open_triple = self._scan_line(text, open_triple)
            processed.append(scanned)

        return processed, original

    def _scan_line(self, line: str, open_triple: str) -> Tuple[str, str]:
        """
        Lex one physical line (without its line terminator).

        Args:
            line: The raw line text.
            open_triple: ``'\"\"\"'`` or ``\"'''\"`` if the line starts inside
                a triple-quoted string, otherwise ``""``.

        Returns:
            ``(processed_line, open_triple)`` where ``open_triple`` is the
            state to use for the next line.
        """
        code: List[str] = []
        has_comment = False
        pos = 0
        end = len(line)

        while pos < end:
            if open_triple:
                # Inside a triple-quoted string: skip to its end (or end of line).
                close_at = line.find(open_triple, pos)
                stop = end if close_at == -1 else close_at + 3
                if line[pos:stop].strip():
                    has_comment = True
                if close_at != -1:
                    open_triple = ""
                pos = stop
                continue

            char = line[pos]
            if char == "#":
                # Outside any string, '#' starts a comment running to end of line.
                has_comment = True
                break
            if char not in "\"'":
                code.append(char)
                pos += 1
                continue

            # A string starts here; a prefix such as r/b/f belongs to it, not to the code.
            prefix_len = self._string_prefix_length(line, pos)
            if prefix_len:
                del code[-prefix_len:]

            if line.startswith(char * 3, pos):
                open_triple = char * 3
                has_comment = True
                # Placeholder keeps statements like 'x = """..."""' from ending in '='.
                code.append("$")
                pos += 3
            else:
                masked, pos = self._mask_string(line, pos)
                code.append(masked)

        text = "".join(code).rstrip()
        if has_comment:
            text += "#"
        return text, open_triple

    def _string_prefix_length(self, line: str, quote_pos: int) -> int:
        """Return the length (0-2) of a string prefix ending right before ``quote_pos``."""
        start = quote_pos
        while (
            start > 0
            and quote_pos - start < 2
            and line[start - 1] in self._STRING_PREFIX_CHARS
        ):
            start -= 1
        # Letters that are the tail of a longer identifier are not a prefix.
        if start > 0 and (line[start - 1].isalnum() or line[start - 1] == "_"):
            return 0
        return quote_pos - start

    def _mask_string(self, line: str, start: int) -> Tuple[str, int]:
        """
        Mask one single/double quoted string starting at ``line[start]``.

        Returns ``(masked_text, next_pos)``: the string with every content
        character replaced by ``$`` (an escape sequence becomes ``$$``), and
        the index just past the closing quote, or ``len(line)`` if the string
        is not closed on this line.
        """
        quote = line[start]
        masked = [quote]
        pos = start + 1
        end = len(line)

        while pos < end:
            if line[pos] == "\\" and pos + 1 < end:
                # Escaped character: never terminates the string.
                masked.append("$$")
                pos += 2
            elif line[pos] == quote:
                masked.append(quote)
                pos += 1
                break
            else:
                masked.append("$")
                pos += 1

        return "".join(masked), pos

    def _replace_quotes(self, line: str) -> str:
        """Replace content of single and double quoted strings with $."""
        result: List[str] = []
        pos = 0
        while pos < len(line):
            if line[pos] in "\"'":
                masked, pos = self._mask_string(line, pos)
                result.append(masked)
            else:
                result.append(line[pos])
                pos += 1
        return "".join(result)

    def _count_and_remove_comments(
        self, processed: List[str], original: List[str]
    ) -> List[str]:
        """
        Count whole and embedded comments, then remove them.

        Expects lines as produced by ``_preprocess_lines``: the first ``#`` in
        a line marks where its comment starts, and ``$`` placeholders stand
        for triple-quoted strings. A line whose only content is comment (or a
        bare triple-quoted string such as a docstring) is a whole comment; a
        line with both code and comment is an embedded comment. Every
        non-blank line of a multi-line triple-quoted block is therefore
        counted as whole.

        Returns the lines with comments removed (whole comment lines become
        ``""``); ``original`` is not inspected.
        """
        result: List[str] = []

        for proc_line, _ in zip(processed, original):
            # Blank lines were already counted.
            if not proc_line.strip():
                result.append("")
                continue

            code, marker, _ = proc_line.partition("#")
            if not marker:
                result.append(proc_line)
                continue

            code = code.strip()
            # A lone triple-quote placeholder is not code.
            if code.replace("$", "").strip():
                self.results["comment_embedded"] += 1
                result.append(code)
            else:
                self.results["comment_whole"] += 1
                result.append("")

        return result

    def _count_directives(self, processed: List[str], original: List[str]) -> List[str]:
        """
        Count and extract compiler directives (import/from statements).

        Every physical line of a directive is counted, including continuation
        lines joined by a trailing backslash or by open brackets, e.g.
        ``from x import (`` ... ``)``.

        Returns lines with directives removed (replaced by ``""``).
        """
        result: List[str] = []
        in_directive = False
        depth = 0  # open brackets of the current directive

        for proc_line, _ in zip(processed, original):
            stripped = proc_line.strip()

            if not stripped:
                result.append("")
                continue

            if in_directive or stripped.split()[0] in self.DIRECTIVES:
                self.results["compiler_directives"] += 1
                depth = max(0, depth + self._bracket_delta(stripped))
                in_directive = depth > 0 or stripped.endswith("\\")
                result.append("")
            else:
                result.append(proc_line)

        return result

    def _count_logical_sloc(self, processed: List[str], original: List[str]) -> None:
        """
        Count physical SLOC, logical SLOC and exec instructions.

        Python: all non-blank, non-comment, non-directive lines are exec.
        Physical lines are joined into one statement while brackets are open,
        the line ends with a backslash, or it ends with a continuation
        operator/keyword. A bare ``else:`` line counts as a physical line but
        not as a statement.
        """
        accumulated_statement = ""
        depth = 0  # currently open (), [] and {}

        for proc_line, _ in zip(processed, original):
            stripped = proc_line.strip()

            if not stripped:
                continue

            # This is a physical line (non-blank, non-comment, non-directive)
            self.results["physical_sloc"] += 1

            # Never go negative: a stray closer must not hide later open brackets.
            depth = max(0, depth + self._bracket_delta(proc_line))
            accumulated_statement += " " + stripped

            # Statement continues if:
            # 1. Inside parentheses/brackets/braces
            if depth > 0:
                continue

            # 2. Ends with an explicit line continuation
            if stripped.endswith("\\"):
                accumulated_statement = accumulated_statement.rstrip("\\")
                continue

            # 3. Ends with continuation operator/keyword
            last_token = self._get_last_token(stripped)
            if (
                last_token in self.CONTINUATION_CHARS
                or last_token in self.CONTINUATION_KEYWORDS
            ):
                continue

            # 4. Special case: a bare 'else:' is not counted as a separate statement
            if self._ELSE_HEADER.fullmatch(stripped):
                accumulated_statement = ""
                continue

            num_statements = self._count_statements(accumulated_statement)

            # Count as exec instruction and logical SLOC
            if self._is_exec_instruction(accumulated_statement):
                self.results["exec_instructions"] += num_statements
            self.results["logical_sloc"] += num_statements

            accumulated_statement = ""

        # Handle incomplete statement at end of file
        if accumulated_statement.strip():
            if self._is_exec_instruction(accumulated_statement):
                self.results["exec_instructions"] += 1
            self.results["logical_sloc"] += 1

    @staticmethod
    def _bracket_delta(text: str) -> int:
        """Return opening minus closing bracket count of ``text`` for (), [] and {}."""
        opened = sum(text.count(char) for char in "([{")
        closed = sum(text.count(char) for char in ")]}")
        return opened - closed

    @staticmethod
    def _top_level_positions(text: str, target: str) -> List[int]:
        """
        Return the indexes of ``target`` in ``text`` that are outside brackets.

        For ``":"`` the colon of a walrus operator (``:=``) is skipped.
        """
        positions: List[int] = []
        depth = 0
        for index, char in enumerate(text):
            if char in "([{":
                depth += 1
            elif char in ")]}":
                depth = max(0, depth - 1)
            elif char == target and depth == 0:
                if target == ":" and text[index + 1 : index + 2] == "=":
                    continue
                positions.append(index)
        return positions

    def _count_statements(self, statement: str) -> int:
        """
        Count the logical statements in one (already joined) statement text.

        Statements are separated by top-level ``;``. A compound header such as
        ``if x: y = 1`` counts as two statements (header and inline body); an
        inline ``else: y`` counts only its body. Colons that belong to
        annotations, lambdas, slices or dict displays do not separate
        statements. String contents are expected to be masked already.

        Returns at least 1.
        """
        total = 0
        start = 0
        cuts = self._top_level_positions(statement, ";") + [len(statement)]

        for cut in cuts:
            piece = statement[start:cut].strip()
            start = cut + 1
            if not piece:
                continue  # e.g. a trailing ';'

            word = self._LEADING_WORD.match(piece)
            colons = self._top_level_positions(piece, ":")
            if word and word.group() in self._BLOCK_KEYWORDS and colons:
                if word.group() != "else":
                    total += 1  # the header itself
                if piece[colons[0] + 1 :].strip():
                    total += 1  # simple statement(s) after the header colon
            else:
                total += 1

        return max(1, total)

    def _get_last_token(self, line: str) -> str:
        """Extract last meaningful token from line.

        Returns the final character if it is a continuation character,
        otherwise the last whitespace-separated word, or ``""`` for an empty
        line.
        """
        line = line.rstrip()
        if not line:
            return ""

        # Check if last char is an operator
        if line[-1] in self.CONTINUATION_CHARS:
            return line[-1]

        # Extract last word
        tokens = line.split()
        return tokens[-1] if tokens else ""

    def _is_exec_instruction(self, statement: str) -> bool:
        """Return True if ``statement`` has any non-whitespace content.

        Python has no data declarations, so every non-empty statement is an
        exec instruction.
        """
        return bool(statement.strip())
|