SXXXXXXX_PyUCC/pyucc/core/ucc_java_counter.py

275 lines
10 KiB
Python

"""
UCC-compatible counter for Java files.
Implements UCC algorithms for Java with the following metrics:
- Comment Whole Lines (/* */, //, /** */)
- Comment Embedded Lines
- Compiler Directives (import/package)
- Data Declarations (class, interface, variable declarations)
- Exec Instructions (executable statements)
- Logical SLOC (statement count)
- Physical SLOC (non-blank, non-comment lines)
Java follows C-style syntax but with Java-specific keywords.
"""
import re
from pathlib import Path
from typing import Dict, List
class UCCJavaCounter:
    """UCC-compatible counter for Java files.

    The analysis is a pipeline over the lines of one file:

    1. count blank lines,
    2. mask the contents of string/char literals (so that comment markers
       and punctuation inside literals are not misinterpreted),
    3. count and strip comments,
    4. count and strip ``import``/``package`` directives,
    5. count physical/logical SLOC and classify statements.

    Counters live in ``self.results`` and are reset at the start of every
    ``analyze_file`` call.
    """

    # Java directives
    DIRECTIVES = {'import', 'package'}

    # Java data declaration keywords
    DATA_KEYWORDS = {
        'abstract', 'ArrayList', 'boolean', 'byte', 'char', 'class',
        'double', 'extends', 'float', 'HashMap', 'HashSet', 'implements',
        'int', 'interface', 'LinkedHashMap', 'LinkedList', 'long',
        'native', 'private', 'protected', 'public', 'short', 'static',
        'String', 'TreeMap', 'Vector', 'void', 'volatile', 'enum',
        'final', 'transient', 'synchronized'
    }

    # Java exec keywords
    EXEC_KEYWORDS = {
        'break', 'case', 'catch', 'continue', 'default', 'do', 'else',
        'finally', 'for', 'if', 'new', 'return', 'super', 'switch',
        'this', 'throw', 'throws', 'try', 'while', 'instanceof', 'assert'
    }

    def __init__(self):
        self.results = self._empty_results()

    @staticmethod
    def _empty_results() -> Dict[str, int]:
        """Return a fresh counter dictionary with every metric at zero."""
        return {
            'comment_whole': 0,
            'comment_embedded': 0,
            'compiler_directives': 0,
            'data_declarations': 0,
            'exec_instructions': 0,
            'logical_sloc': 0,
            'physical_sloc': 0,
            'blank_lines': 0,
        }

    def analyze_file(self, file_path: Path) -> Dict[str, int]:
        """Analyze a Java file using UCC algorithms.

        Args:
            file_path: Path of the Java source file to read (UTF-8,
                undecodable bytes are ignored).

        Returns:
            A copy of the counters for this file only. If the file cannot
            be read, all counters are zero.
        """
        # Start from zero so that reusing one instance for several files
        # does not leak counts from a previous call into this result.
        self.results = self._empty_results()

        try:
            with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                lines = f.readlines()
        except Exception:
            # Deliberate best-effort: an unreadable file yields zero counts.
            return self.results.copy()

        # Step 1: Count blank lines BEFORE processing
        self._count_blank_lines(lines)
        # Step 2: Remove string literals (keep structure)
        processed_lines = self._remove_strings(lines)
        # Step 3: Count and remove comments
        processed_lines = self._count_and_remove_comments(processed_lines, lines)
        # Step 4: Count directives
        processed_lines = self._count_directives(processed_lines, lines)
        # Step 5: Count logical SLOC and classify data/exec
        self._count_logical_sloc(processed_lines, lines)
        return self.results.copy()

    def _count_blank_lines(self, lines: List[str]) -> None:
        """Add the number of whitespace-only lines to ``blank_lines``."""
        self.results['blank_lines'] += sum(1 for line in lines if not line.strip())

    @staticmethod
    def _find_closing_quote(line: str, start: int) -> int:
        """Return the index of the quote closing the literal opened at *start*.

        ``line[start]`` is the opening quote character. Backslash escapes
        are skipped. Returns -1 when the literal is not closed on this line.
        """
        quote = line[start]
        pos = start + 1
        length = len(line)
        while pos < length:
            ch = line[pos]
            if ch == '\\':
                # Skip the escaped character, whatever it is.
                pos += 2
                continue
            if ch == quote:
                return pos
            pos += 1
        return -1

    def _remove_strings(self, lines: List[str]) -> List[str]:
        """Mask string/char literal contents with ``$``, keeping the quotes.

        Each literal keeps its original length. Text inside ``//`` and
        ``/* */`` comments is copied verbatim: quotes or apostrophes that
        appear in a comment must not be paired up, otherwise a comment
        terminator could be masked away. Both quote kinds are handled in a
        single left-to-right scan so that a ``'"'`` char literal is not
        mistaken for the start of a string.

        Args:
            lines: Raw source lines.

        Returns:
            A new list with one masked line per input line.
        """
        result = []
        in_block = False  # inside a /* ... */ comment carried across lines
        for line in lines:
            pieces = []
            pos = 0
            length = len(line)
            while pos < length:
                if in_block:
                    close = line.find('*/', pos)
                    if close == -1:
                        pieces.append(line[pos:])
                        pos = length
                    else:
                        pieces.append(line[pos:close + 2])
                        pos = close + 2
                        in_block = False
                    continue

                ch = line[pos]
                if line.startswith('//', pos):
                    # Line comment: the remainder is comment text.
                    pieces.append(line[pos:])
                    pos = length
                elif line.startswith('/*', pos):
                    pieces.append('/*')
                    pos += 2
                    in_block = True
                elif ch == '"' or ch == "'":
                    close = self._find_closing_quote(line, pos)
                    if close == -1:
                        # Unterminated literal: leave the quote untouched.
                        pieces.append(ch)
                        pos += 1
                    else:
                        pieces.append(ch + '$' * (close - pos - 1) + ch)
                        pos = close + 1
                else:
                    pieces.append(ch)
                    pos += 1
            result.append(''.join(pieces))
        return result

    @staticmethod
    def _split_code_and_comments(line: str, in_block: bool):
        """Separate the code on *line* from its comments.

        Args:
            line: One source line whose literals are already masked.
            in_block: True when the line starts inside a block comment.

        Returns:
            A ``(code, has_comment, in_block)`` tuple: the stripped code
            fragments joined by single spaces, whether any comment text was
            seen on the line, and the block-comment state after the line.
        """
        fragments = []
        has_comment = in_block
        pos = 0
        length = len(line)
        while pos < length:
            if in_block:
                close = line.find('*/', pos)
                if close == -1:
                    break
                in_block = False
                pos = close + 2
                continue

            line_idx = line.find('//', pos)
            block_idx = line.find('/*', pos)
            # Whichever marker comes first wins: a '/*' that appears after
            # '//' is just comment text and must not open a block comment.
            if line_idx != -1 and (block_idx == -1 or line_idx < block_idx):
                fragments.append(line[pos:line_idx])
                has_comment = True
                break
            if block_idx == -1:
                fragments.append(line[pos:])
                break
            fragments.append(line[pos:block_idx])
            has_comment = True
            in_block = True
            # Search for '*/' only after the opener so '/*/' stays open.
            pos = block_idx + 2

        code = ' '.join(part.strip() for part in fragments if part.strip())
        return code, has_comment, in_block

    def _count_and_remove_comments(self, processed: List[str], original: List[str]) -> List[str]:
        """Count whole and embedded comments, then remove them.

        A line that carries a comment and no code increments
        ``comment_whole``; a line that carries both increments
        ``comment_embedded``. Whitespace-only lines are never counted as
        comments, even inside a block comment.

        Args:
            processed: Lines with literals masked.
            original: Raw lines; only used to bound the iteration.

        Returns:
            One entry per line: the untouched line when it has no comment,
            the stripped code when it has code plus a comment, else ``''``.
        """
        result = []
        in_block = False
        for proc_line, _ in zip(processed, original):
            if not proc_line.strip():
                result.append('')
                continue

            code, has_comment, in_block = self._split_code_and_comments(proc_line, in_block)
            if not has_comment:
                result.append(proc_line)
            elif code:
                self.results['comment_embedded'] += 1
                result.append(code)
            else:
                self.results['comment_whole'] += 1
                result.append('')
        return result

    def _count_directives(self, processed: List[str], original: List[str]) -> List[str]:
        """Count compiler directives (import/package) and blank them out.

        A line is a directive when its first whitespace-separated token is
        in ``DIRECTIVES``.

        Returns:
            One entry per line; directive and empty lines become ``''``.
        """
        result = []
        for proc_line, _ in zip(processed, original):
            tokens = proc_line.split()
            if not tokens:
                result.append('')
            elif tokens[0] in self.DIRECTIVES:
                self.results['compiler_directives'] += 1
                result.append('')
            else:
                result.append(proc_line)
        return result

    def _classify_statement(self, statement: str) -> None:
        """Increment the data or exec counter for *statement*, data first."""
        if self._is_data_declaration(statement):
            self.results['data_declarations'] += 1
        elif self._is_exec_instruction(statement):
            self.results['exec_instructions'] += 1

    def _count_logical_sloc(self, processed: List[str], original: List[str]) -> None:
        """Count physical/logical SLOC and classify statements as data or exec.

        Every non-empty line is one physical SLOC. Lines are accumulated
        until a line containing ``;``, ``{`` or ``}`` is reached while no
        parenthesis or bracket is open; the accumulated text then counts as
        ``max(1, semicolons + opening braces)`` logical SLOC and is
        classified once.

        Args:
            processed: Lines with literals masked, comments and directives
                removed.
            original: Raw lines; only used to bound the iteration.
        """
        accumulated = ''
        paren_depth = 0
        bracket_depth = 0
        for proc_line, _ in zip(processed, original):
            stripped = proc_line.strip()
            if not stripped:
                continue

            # This is a physical SLOC
            self.results['physical_sloc'] += 1

            # Clamp at zero: a stray closer in malformed input would
            # otherwise keep the depth negative and no later statement
            # would ever terminate.
            paren_depth = max(0, paren_depth + stripped.count('(') - stripped.count(')'))
            bracket_depth = max(0, bracket_depth + stripped.count('[') - stripped.count(']'))
            accumulated += ' ' + stripped

            if paren_depth or bracket_depth:
                continue
            if not (';' in stripped or '{' in stripped or '}' in stripped):
                continue

            num_statements = max(1, accumulated.count(';') + accumulated.count('{'))
            self._classify_statement(accumulated)
            self.results['logical_sloc'] += num_statements
            accumulated = ''

        # Handle incomplete statement at EOF
        if accumulated.strip():
            self._classify_statement(accumulated)
            self.results['logical_sloc'] += 1

    @staticmethod
    def _has_keyword(statement: str, keywords) -> bool:
        """Return True if any of *keywords* occurs in *statement* as a word.

        NOTE(review): matching is case-insensitive although Java itself is
        case-sensitive; kept as-is so existing counts do not change.
        """
        lowered = statement.lower()
        return any(
            re.search(r'\b' + re.escape(keyword.lower()) + r'\b', lowered)
            for keyword in keywords
        )

    def _is_data_declaration(self, statement: str) -> bool:
        """Check if statement contains any data declaration keyword."""
        return self._has_keyword(statement, self.DATA_KEYWORDS)

    def _is_exec_instruction(self, statement: str) -> bool:
        """Check if statement looks executable.

        True when it contains an exec keyword, a ``(`` (unless the text
        mentions ``class``/``interface``), or an ``=`` without any ``==``.
        """
        if self._has_keyword(statement, self.EXEC_KEYWORDS):
            return True

        # Check for method calls (contains '(')
        lowered = statement.lower()
        if '(' in statement and 'class' not in lowered and 'interface' not in lowered:
            return True

        # Check for assignment (contains '=')
        return '=' in statement and '==' not in statement