331 lines
10 KiB
Python
331 lines
10 KiB
Python
"""
|
|
UCC-compatible counter for Java files.
|
|
|
|
Implements UCC algorithms for Java with the following metrics:
|
|
- Comment Whole Lines (/* */, //, /** */)
|
|
- Comment Embedded Lines
|
|
- Compiler Directives (import/package)
|
|
- Data Declarations (class, interface, variable declarations)
|
|
- Exec Instructions (executable statements)
|
|
- Logical SLOC (statement count)
|
|
- Physical SLOC (non-blank, non-comment lines)
|
|
|
|
Java follows C-style syntax but with Java-specific keywords.
|
|
"""
|
|
|
|
import re
|
|
from pathlib import Path
|
|
from typing import Dict, List
|
|
|
|
|
|
class UCCJavaCounter:
    """UCC-compatible counter for Java files.

    ``analyze_file`` runs a fixed pipeline over the lines of one file:

    1. count blank lines,
    2. mask the contents of string/char literals,
    3. count and strip comments,
    4. count and strip ``import``/``package`` lines,
    5. count physical/logical SLOC and classify statements as data or exec.

    The counters for the most recently analyzed file are kept in
    ``self.results``.
    """

    # Java directives
    DIRECTIVES = {"import", "package"}

    # Java data declaration keywords
    DATA_KEYWORDS = {
        "abstract",
        "ArrayList",
        "boolean",
        "byte",
        "char",
        "class",
        "double",
        "extends",
        "float",
        "HashMap",
        "HashSet",
        "implements",
        "int",
        "interface",
        "LinkedHashMap",
        "LinkedList",
        "long",
        "native",
        "private",
        "protected",
        "public",
        "short",
        "static",
        "String",
        "TreeMap",
        "Vector",
        "void",
        "volatile",
        "enum",
        "final",
        "transient",
        "synchronized",
    }

    # Java exec keywords
    EXEC_KEYWORDS = {
        "break",
        "case",
        "catch",
        "continue",
        "default",
        "do",
        "else",
        "finally",
        "for",
        "if",
        "new",
        "return",
        "super",
        "switch",
        "this",
        "throw",
        "throws",
        "try",
        "while",
        "instanceof",
        "assert",
    }

    # One pattern for everything that can "hide" other lexemes: a line
    # comment, the start of a block comment, a double-quoted literal or a
    # single-quoted literal.  Searching with a single alternation means the
    # LEFTMOST lexeme wins, so a quote inside a comment (or a comment marker
    # inside a literal) is never misread.
    _LEXEME_RE = re.compile(
        r"//.*"
        r"|/\*"
        r'|"[^"\\]*(?:\\.[^"\\]*)*"'
        r"|'[^'\\]*(?:\\.[^'\\]*)*'"
    )

    def __init__(self):
        self.results = self._empty_results()

    @staticmethod
    def _empty_results() -> Dict[str, int]:
        """Return a fresh, all-zero metrics dictionary."""
        return {
            "comment_whole": 0,
            "comment_embedded": 0,
            "compiler_directives": 0,
            "data_declarations": 0,
            "exec_instructions": 0,
            "logical_sloc": 0,
            "physical_sloc": 0,
            "blank_lines": 0,
        }

    def analyze_file(self, file_path: Path) -> Dict[str, int]:
        """Analyze a Java file using UCC algorithms.

        Args:
            file_path: Path of the file to read (decoded as UTF-8, undecodable
                bytes are dropped).

        Returns:
            A copy of the metrics for this file.  If the file cannot be
            opened or read, the all-zero metrics are returned.
        """
        # Start from zero so that re-using one counter for several files does
        # not leak the previous file's counts into this result.
        self.results = self._empty_results()

        try:
            with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
                lines = f.readlines()
        except (OSError, ValueError):
            # Best effort: an unreadable file yields empty metrics.
            return self.results.copy()

        # Step 1: Count blank lines BEFORE processing
        self._count_blank_lines(lines)

        # Step 2: Mask string literals (keep structure)
        processed_lines = self._remove_strings(lines)

        # Step 3: Count and remove comments
        processed_lines = self._count_and_remove_comments(processed_lines, lines)

        # Step 4: Count directives
        processed_lines = self._count_directives(processed_lines, lines)

        # Step 5: Count logical SLOC and classify data/exec
        self._count_logical_sloc(processed_lines, lines)

        return self.results.copy()

    def _count_blank_lines(self, lines: List[str]) -> None:
        """Add the number of whitespace-only lines to ``blank_lines``."""
        self.results["blank_lines"] += sum(1 for line in lines if not line.strip())

    def _remove_strings(self, lines: List[str]) -> List[str]:
        """Mask the contents of string and char literals, keeping the quotes.

        Every character between the quotes of a literal is replaced by ``$``
        so that later passes cannot mistake literal text for comment markers,
        braces or semicolons.  Comments are copied through unchanged: quotes
        and apostrophes inside a comment are not literals, and masking them
        could hide the ``*/`` that ends a block comment.

        Args:
            lines: Raw source lines.

        Returns:
            A list of the same length with literal contents masked.
        """
        result = []
        in_block = False  # inside a /* ... */ comment carried over lines
        for line in lines:
            pieces = []
            pos = 0
            end_of_line = len(line)
            while pos < end_of_line:
                if in_block:
                    close = line.find("*/", pos)
                    if close == -1:
                        # Comment continues on the next line.
                        pieces.append(line[pos:])
                        pos = end_of_line
                    else:
                        pieces.append(line[pos : close + 2])
                        pos = close + 2
                        in_block = False
                    continue

                match = self._LEXEME_RE.search(line, pos)
                if match is None:
                    pieces.append(line[pos:])
                    break

                pieces.append(line[pos : match.start()])
                lexeme = match.group(0)
                quote = lexeme[0]
                if quote in "\"'":
                    # Literal: keep the quotes, mask what is between them.
                    pieces.append(quote + "$" * (len(lexeme) - 2) + quote)
                else:
                    # Comment text is passed through untouched.
                    pieces.append(lexeme)
                    if lexeme == "/*":
                        in_block = True
                pos = match.end()
            result.append("".join(pieces))
        return result

    @staticmethod
    def _split_code_and_comments(line: str, in_block: bool) -> tuple:
        """Strip every comment from one line.

        Args:
            line: A source line whose literals have already been masked.
            in_block: True if the line starts inside a ``/* ... */`` comment.

        Returns:
            ``(code, has_comment, in_block)`` where ``code`` is the line with
            comments removed (each block comment replaced by one space so
            neighbouring tokens do not fuse), ``has_comment`` tells whether
            any comment text was on the line, and ``in_block`` is the
            block-comment state at the end of the line.
        """
        pieces = []
        has_comment = in_block
        pos = 0
        while True:
            if in_block:
                close = line.find("*/", pos)
                if close == -1:
                    break  # rest of the line is comment
                pos = close + 2
                in_block = False
                pieces.append(" ")
                continue

            line_start = line.find("//", pos)
            block_start = line.find("/*", pos)
            if line_start == -1 and block_start == -1:
                pieces.append(line[pos:])
                break

            has_comment = True
            # Whichever marker comes first decides: a "/*" after "//" is just
            # comment text and must not open a block comment.
            if block_start == -1 or (line_start != -1 and line_start < block_start):
                pieces.append(line[pos:line_start])
                break

            pieces.append(line[pos:block_start])
            # Skip past "/*" before looking for "*/" so "/*/" is not taken
            # for a complete comment.
            pos = block_start + 2
            in_block = True
        return "".join(pieces), has_comment, in_block

    def _count_and_remove_comments(
        self, processed: List[str], original: List[str]
    ) -> List[str]:
        """Count whole and embedded comments, then remove them.

        A non-blank line that carries comment text counts as
        ``comment_embedded`` when code remains after stripping the comments
        and as ``comment_whole`` otherwise.  Blank lines are never counted,
        even inside a block comment.

        Args:
            processed: Lines with literals masked.
            original: The raw lines; only used to bound the iteration.

        Returns:
            One entry per line pair: ``""`` for blank or comment-only lines,
            the stripped code for lines with embedded comments, and the
            unchanged input line when it has no comment.
        """
        result = []
        in_block = False

        for proc_line, _ in zip(processed, original):
            if not proc_line.strip():
                result.append("")
                continue

            code, has_comment, in_block = self._split_code_and_comments(
                proc_line, in_block
            )
            code = code.strip()

            if not has_comment:
                result.append(proc_line)
            elif code:
                self.results["comment_embedded"] += 1
                result.append(code)
            else:
                self.results["comment_whole"] += 1
                result.append("")

        return result

    def _count_directives(self, processed: List[str], original: List[str]) -> List[str]:
        """Count compiler directives (import/package) and blank them out.

        A line is a directive when its first whitespace-separated token is in
        ``DIRECTIVES``.

        Args:
            processed: Comment-free lines.
            original: The raw lines; only used to bound the iteration.

        Returns:
            The input lines with directive lines (and blank lines) replaced
            by ``""``.
        """
        result = []

        for proc_line, _ in zip(processed, original):
            tokens = proc_line.split()
            if not tokens:
                result.append("")
            elif tokens[0] in self.DIRECTIVES:
                self.results["compiler_directives"] += 1
                result.append("")
            else:
                result.append(proc_line)

        return result

    def _classify(self, statement: str) -> None:
        """Bump ``data_declarations`` or ``exec_instructions`` for a statement.

        Data takes precedence; a statement matching neither is not counted.
        """
        if self._is_data_declaration(statement):
            self.results["data_declarations"] += 1
        elif self._is_exec_instruction(statement):
            self.results["exec_instructions"] += 1

    def _count_logical_sloc(self, processed: List[str], original: List[str]) -> None:
        """Count physical/logical SLOC and classify as data or exec.

        Every non-blank line is one physical SLOC.  Lines are accumulated
        until parentheses and brackets are balanced and the current line
        contains ``;``, ``{`` or ``}``; the accumulated text then contributes
        ``max(1, semicolons + open braces)`` logical SLOC and is classified
        once.  Text left over at the end of input counts as one statement.

        Args:
            processed: Lines free of comments and directives.
            original: The raw lines; only used to bound the iteration.
        """
        pending: List[str] = []
        paren_depth = 0
        bracket_depth = 0

        for proc_line, _ in zip(processed, original):
            stripped = proc_line.strip()
            if not stripped:
                continue

            # This is a physical SLOC
            self.results["physical_sloc"] += 1

            paren_depth += stripped.count("(") - stripped.count(")")
            bracket_depth += stripped.count("[") - stripped.count("]")
            pending.append(stripped)

            # Still inside (...) or [...]: the statement continues.
            if paren_depth != 0 or bracket_depth != 0:
                continue
            if not any(mark in stripped for mark in ";{}"):
                continue

            statement = " ".join(pending)
            num_statements = max(1, statement.count(";") + statement.count("{"))

            self._classify(statement)
            self.results["logical_sloc"] += num_statements
            pending = []

        # Handle incomplete statement at EOF
        if pending:
            self._classify(" ".join(pending))
            self.results["logical_sloc"] += 1

    @staticmethod
    def _contains_keyword(statement: str, keywords) -> bool:
        """Return True if any keyword occurs as a whole word in ``statement``.

        The comparison is case-insensitive: both sides are lower-cased.
        """
        lowered = statement.lower()
        return any(
            re.search(r"\b" + re.escape(keyword.lower()) + r"\b", lowered)
            for keyword in keywords
        )

    def _is_data_declaration(self, statement: str) -> bool:
        """Check if statement contains a data declaration keyword."""
        return self._contains_keyword(statement, self.DATA_KEYWORDS)

    def _is_exec_instruction(self, statement: str) -> bool:
        """Check if statement looks executable.

        True when it contains an exec keyword, a ``(`` (treated as a call)
        without ``class``/``interface`` anywhere in the text, or an ``=``
        with no ``==`` in the statement.
        """
        if self._contains_keyword(statement, self.EXEC_KEYWORDS):
            return True

        # Check for method calls (contains '(')
        lowered = statement.lower()
        if "(" in statement and "class" not in lowered and "interface" not in lowered:
            return True

        # Check for assignment (contains '=')
        return "=" in statement and "==" not in statement
|