SXXXXXXX_PyUCC/pyucc/core/ucc_java_counter.py

331 lines
10 KiB
Python

"""
UCC-compatible counter for Java files.
Implements UCC algorithms for Java with the following metrics:
- Comment Whole Lines (/* */, //, /** */)
- Comment Embedded Lines
- Compiler Directives (import/package)
- Data Declarations (class, interface, variable declarations)
- Exec Instructions (executable statements)
- Logical SLOC (statement count)
- Physical SLOC (non-blank, non-comment lines; import/package directive lines are tallied separately as compiler directives)
Java follows C-style syntax but with Java-specific keywords.
"""
import re
from pathlib import Path
from typing import Dict, List, Tuple
class UCCJavaCounter:
    """UCC-compatible counter for Java files.

    The analysis is a line-oriented pipeline: blank lines are counted, string
    and character literal contents are masked, comments are counted and
    stripped, ``import``/``package`` lines are counted and blanked, and the
    remaining code lines are grouped into statements that are classified as
    data declarations or executable instructions.

    Results are stored in ``self.results`` (a ``dict`` of metric name to
    count) and returned as a copy from :meth:`analyze_file`.
    """

    # Java directives
    DIRECTIVES = {"import", "package"}

    # Java data declaration keywords
    DATA_KEYWORDS = {
        "abstract",
        "ArrayList",
        "boolean",
        "byte",
        "char",
        "class",
        "double",
        "extends",
        "float",
        "HashMap",
        "HashSet",
        "implements",
        "int",
        "interface",
        "LinkedHashMap",
        "LinkedList",
        "long",
        "native",
        "private",
        "protected",
        "public",
        "short",
        "static",
        "String",
        "TreeMap",
        "Vector",
        "void",
        "volatile",
        "enum",
        "final",
        "transient",
        "synchronized",
    }

    # Java exec keywords
    EXEC_KEYWORDS = {
        "break",
        "case",
        "catch",
        "continue",
        "default",
        "do",
        "else",
        "finally",
        "for",
        "if",
        "new",
        "return",
        "super",
        "switch",
        "this",
        "throw",
        "throws",
        "try",
        "while",
        "instanceof",
        "assert",
    }

    # Metric names, in the order they appear in the results dictionary.
    _RESULT_KEYS = (
        "comment_whole",
        "comment_embedded",
        "compiler_directives",
        "data_declarations",
        "exec_instructions",
        "logical_sloc",
        "physical_sloc",
        "blank_lines",
    )

    # One alternation so literals are consumed strictly left to right; this
    # keeps a char literal such as '"' from being read as a string opener.
    _LITERAL_RE = re.compile(r'"(?:[^"\\]|\\.)*"' r"|'(?:[^'\\]|\\.)*'")

    # Whole-word match, so identifiers like "classify" are not mistaken for
    # a type declaration.
    _TYPE_DECL_RE = re.compile(r"\b(?:class|interface)\b")

    # Plain or compound assignment; excludes the comparisons ==, !=, <=, >=.
    _ASSIGNMENT_RE = re.compile(r"(?:<<|>>)=|(?<![=!<>])=(?!=)")

    def __init__(self):
        self.results = self._empty_results()

    @classmethod
    def _empty_results(cls) -> Dict[str, int]:
        """Return a fresh results dictionary with every metric at zero."""
        return dict.fromkeys(cls._RESULT_KEYS, 0)

    def analyze_file(self, file_path: Path) -> Dict[str, int]:
        """Analyze a Java file using UCC algorithms.

        Args:
            file_path: Path of the Java source file to read (decoded as
                UTF-8, undecodable bytes are ignored).

        Returns:
            A copy of the metric dictionary for this file. If the file
            cannot be opened or read, every metric is zero.
        """
        # Start from zero so that reusing one counter for several files does
        # not leak the previous file's counts into this one.
        self.results = self._empty_results()
        try:
            with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
                lines = f.readlines()
        except (OSError, ValueError):
            # Best effort: an unreadable file yields all-zero metrics.
            return self.results.copy()

        # Step 1: Count blank lines BEFORE processing
        self._count_blank_lines(lines)
        # Step 2: Remove string literals (keep structure)
        processed_lines = self._remove_strings(lines)
        # Step 3: Count and remove comments
        processed_lines = self._count_and_remove_comments(processed_lines, lines)
        # Step 4: Count directives
        processed_lines = self._count_directives(processed_lines, lines)
        # Step 5: Count logical SLOC and classify data/exec
        self._count_logical_sloc(processed_lines, lines)
        return self.results.copy()

    def _count_blank_lines(self, lines: List[str]) -> None:
        """Add the number of whitespace-only lines to ``blank_lines``."""
        self.results["blank_lines"] += sum(1 for line in lines if not line.strip())

    def _remove_strings(self, lines: List[str]) -> List[str]:
        """Mask string and char literal contents with ``$``, keeping quotes.

        The masked literal has the same length as the original, so column
        positions are preserved. Masking prevents comment markers, braces
        and semicolons inside literals from being interpreted as code.
        """

        def mask(match: "re.Match") -> str:
            literal = match.group(0)
            quote = literal[0]
            return quote + "$" * (len(literal) - 2) + quote

        return [self._LITERAL_RE.sub(mask, line) for line in lines]

    @staticmethod
    def _strip_line_comments(text: str, in_block: bool) -> Tuple[str, bool, bool]:
        """Remove every comment from one line.

        Args:
            text: The line, with literals already masked.
            in_block: Whether the line starts inside a ``/* ... */`` comment.

        Returns:
            ``(code, saw_comment, in_block)``: the remaining code fragments
            joined by single spaces, whether any comment text was present on
            the line, and whether a block comment is still open at its end.
        """
        pieces = []
        saw_comment = in_block
        pos = 0
        while pos < len(text):
            if in_block:
                end = text.find("*/", pos)
                if end == -1:
                    break  # the rest of the line is still comment
                in_block = False
                pos = end + 2
                continue
            line_idx = text.find("//", pos)
            block_idx = text.find("/*", pos)
            # Whichever marker comes first wins: a "/*" that appears after
            # "//" is just comment text and must not open a block.
            if line_idx != -1 and (block_idx == -1 or line_idx < block_idx):
                pieces.append(text[pos:line_idx])
                saw_comment = True
                break
            if block_idx == -1:
                pieces.append(text[pos:])
                break
            pieces.append(text[pos:block_idx])
            saw_comment = True
            in_block = True
            pos = block_idx + 2
        code = " ".join(piece.strip() for piece in pieces if piece.strip())
        return code, saw_comment, in_block

    def _count_and_remove_comments(
        self, processed: List[str], original: List[str]
    ) -> List[str]:
        """Count whole and embedded comments, then remove them.

        A line counts as ``comment_whole`` when it holds only comment text
        or begins inside an open block comment, and as ``comment_embedded``
        when a comment shares the line with code. Blank lines are never
        counted, even inside a block comment.

        Args:
            processed: Lines with literals already masked.
            original: Unused; retained for interface compatibility.

        Returns:
            One entry per input line: the code with comments removed, or
            ``""`` when no code remains.
        """
        result = []
        in_block = False
        for line in processed:
            if not line.strip():
                result.append("")
                continue
            started_in_block = in_block
            code, saw_comment, in_block = self._strip_line_comments(line, in_block)
            if started_in_block:
                # Continuation of a block comment; code after "*/" is kept.
                self.results["comment_whole"] += 1
                result.append(code)
            elif not saw_comment:
                result.append(line)
            elif code:
                self.results["comment_embedded"] += 1
                result.append(code)
            else:
                self.results["comment_whole"] += 1
                result.append("")
        return result

    def _count_directives(self, processed: List[str], original: List[str]) -> List[str]:
        """Count compiler directives (import/package) and blank them out.

        A line is a directive when its first whitespace-separated token is
        in ``DIRECTIVES``. Directive lines are replaced by ``""`` so later
        steps do not count them as physical or logical SLOC.

        Args:
            processed: Lines with comments already removed.
            original: Unused; retained for interface compatibility.

        Returns:
            The lines with directive and blank lines replaced by ``""``.
        """
        result = []
        for line in processed:
            tokens = line.split()
            if not tokens:
                result.append("")
            elif tokens[0] in self.DIRECTIVES:
                self.results["compiler_directives"] += 1
                result.append("")
            else:
                result.append(line)
        return result

    def _classify_statement(self, statement: str) -> None:
        """Bump the data or exec counter for one statement group, if either applies."""
        if self._is_data_declaration(statement):
            self.results["data_declarations"] += 1
        elif self._is_exec_instruction(statement):
            self.results["exec_instructions"] += 1

    def _count_logical_sloc(self, processed: List[str], original: List[str]) -> None:
        """Count physical and logical SLOC and classify statements.

        Every non-blank line is one physical SLOC. Lines are accumulated
        until parentheses and brackets are balanced and the current line
        contains ``;``, ``{`` or ``}``. The accumulated text then adds
        ``max(1, semicolons + opening braces)`` to ``logical_sloc`` and is
        classified once as data or exec. Text left over at end of input
        counts as one logical SLOC.

        Args:
            processed: Lines with comments and directives already removed.
            original: Unused; retained for interface compatibility.
        """
        pending = []
        paren_depth = 0
        bracket_depth = 0
        for line in processed:
            code = line.strip()
            if not code:
                continue
            # This is a physical SLOC
            self.results["physical_sloc"] += 1
            paren_depth += code.count("(") - code.count(")")
            bracket_depth += code.count("[") - code.count("]")
            pending.append(code)
            # A statement cannot end while a "(" or "[" is still open
            # (e.g. the semicolons inside a for-loop header).
            if paren_depth != 0 or bracket_depth != 0:
                continue
            if not any(mark in code for mark in ";{}"):
                continue
            statement = " ".join(pending)
            num_statements = max(1, statement.count(";") + statement.count("{"))
            self._classify_statement(statement)
            self.results["logical_sloc"] += num_statements
            pending = []

        # Handle incomplete statement at EOF
        if pending:
            self._classify_statement(" ".join(pending))
            self.results["logical_sloc"] += 1

    @staticmethod
    def _contains_keyword(statement: str, keywords) -> bool:
        """Return True if any keyword occurs in ``statement`` as a whole word.

        Both the statement and the keywords are lower-cased first, so the
        match is case-insensitive. ``keywords`` is any iterable of strings.
        """
        lowered = statement.lower()
        return any(
            re.search(r"\b" + re.escape(keyword.lower()) + r"\b", lowered)
            for keyword in keywords
        )

    def _is_data_declaration(self, statement: str) -> bool:
        """Check if statement contains any ``DATA_KEYWORDS`` entry (case-insensitive)."""
        return self._contains_keyword(statement, self.DATA_KEYWORDS)

    def _is_exec_instruction(self, statement: str) -> bool:
        """Check if statement looks executable.

        True when the statement contains an ``EXEC_KEYWORDS`` entry, or a
        ``(`` without the word ``class``/``interface`` (treated as a call),
        or an assignment operator.
        """
        if self._contains_keyword(statement, self.EXEC_KEYWORDS):
            return True
        # Check for method calls (contains '(')
        if "(" in statement and not self._TYPE_DECL_RE.search(statement.lower()):
            return True
        # Check for assignment; comparisons (==, !=, <=, >=) do not count
        return self._ASSIGNMENT_RE.search(statement) is not None