"""
UCC-compatible counter for Java files.

Implements UCC algorithms for Java with the following metrics:
- Comment Whole Lines (/* */, //, /** */)
- Comment Embedded Lines
- Compiler Directives (import/package)
- Data Declarations (class, interface, variable declarations)
- Exec Instructions (executable statements)
- Logical SLOC (statement count)
- Physical SLOC (non-blank, non-comment, non-directive lines)

Java follows C-style syntax but with Java-specific keywords.
"""

import re
from pathlib import Path
from typing import Dict, List


class UCCJavaCounter:
    """UCC-compatible counter for Java files.

    Counters live in ``self.results`` and are only ever incremented, so
    calling :meth:`analyze_file` several times on the same instance
    accumulates the totals of every file analyzed so far.
    """

    # Java directives
    DIRECTIVES = {"import", "package"}

    # Java data declaration keywords
    DATA_KEYWORDS = {
        "abstract",
        "ArrayList",
        "boolean",
        "byte",
        "char",
        "class",
        "double",
        "extends",
        "float",
        "HashMap",
        "HashSet",
        "implements",
        "int",
        "interface",
        "LinkedHashMap",
        "LinkedList",
        "long",
        "native",
        "private",
        "protected",
        "public",
        "short",
        "static",
        "String",
        "TreeMap",
        "Vector",
        "void",
        "volatile",
        "enum",
        "final",
        "transient",
        "synchronized",
    }

    # Java exec keywords
    EXEC_KEYWORDS = {
        "break",
        "case",
        "catch",
        "continue",
        "default",
        "do",
        "else",
        "finally",
        "for",
        "if",
        "new",
        "return",
        "super",
        "switch",
        "this",
        "throw",
        "throws",
        "try",
        "while",
        "instanceof",
        "assert",
    }

    # Quoted literals, allowing backslash escapes inside the quotes.
    _DOUBLE_QUOTED = re.compile(r'"([^"\\]*(\\.[^"\\]*)*)"')
    _SINGLE_QUOTED = re.compile(r"'([^'\\]*(\\.[^'\\]*)*)'")

    def __init__(self):
        self.results = {
            "comment_whole": 0,
            "comment_embedded": 0,
            "compiler_directives": 0,
            "data_declarations": 0,
            "exec_instructions": 0,
            "logical_sloc": 0,
            "physical_sloc": 0,
            "blank_lines": 0,
        }

    def analyze_file(self, file_path: Path) -> Dict[str, int]:
        """Analyze a Java file using UCC algorithms.

        Args:
            file_path: Path of the file to read (decoded as UTF-8, with
                undecodable bytes ignored).

        Returns:
            A copy of the instance counters after this file has been added
            to them. If the file cannot be read, the counters are returned
            unchanged.
        """
        try:
            with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
                lines = f.readlines()
        except (OSError, ValueError):
            # Best effort: an unreadable file contributes nothing.
            return self.results.copy()

        # Step 1: Count blank lines BEFORE processing
        self._count_blank_lines(lines)

        # Step 2: Remove string literals (keep structure)
        processed_lines = self._remove_strings(lines)

        # Step 3: Count and remove comments
        processed_lines = self._count_and_remove_comments(processed_lines, lines)

        # Step 4: Count directives
        processed_lines = self._count_directives(processed_lines, lines)

        # Step 5: Count logical SLOC and classify data/exec
        self._count_logical_sloc(processed_lines, lines)

        return self.results.copy()

    def _count_blank_lines(self, lines: List[str]) -> None:
        """Add the number of whitespace-only lines to ``blank_lines``."""
        self.results["blank_lines"] += sum(1 for line in lines if not line.strip())

    @staticmethod
    def _mask_literal(match) -> str:
        """Return the matched literal with its contents replaced by ``$``.

        The surrounding quote characters and the overall length are kept so
        that column positions in the line do not shift.
        """
        literal = match.group(0)
        quote = literal[0]
        return quote + "$" * (len(literal) - 2) + quote

    def _remove_strings(self, lines: List[str]) -> List[str]:
        """Mask string and char literal contents, keeping the quotes.

        Masking happens before comment detection so that comment markers or
        statement punctuation inside a literal are not misread as code.
        Double-quoted literals are masked first, then single-quoted ones.
        """
        masked_lines = []
        for line in lines:
            masked = self._DOUBLE_QUOTED.sub(self._mask_literal, line)
            masked = self._SINGLE_QUOTED.sub(self._mask_literal, masked)
            masked_lines.append(masked)
        return masked_lines

    def _record_comment(self, code: str) -> str:
        """Tally one comment line and return the code that remains on it.

        A line with leftover code counts as an embedded comment; a line with
        none counts as a whole comment line.
        """
        if code:
            self.results["comment_embedded"] += 1
        else:
            self.results["comment_whole"] += 1
        return code

    def _count_and_remove_comments(
        self, processed: List[str], original: List[str]
    ) -> List[str]:
        """Count whole and embedded comments, then remove them.

        Args:
            processed: Lines with string literals already masked.
            original: Unmodified lines; only used to bound the iteration
                (``zip`` stops at the shorter list).

        Returns:
            One entry per input line: the code left on the line once its
            comment is stripped, or ``""`` for blank and comment-only lines.
            Lines without any comment are passed through untouched.
        """
        result = []
        in_block = False

        for proc_line, _orig_line in zip(processed, original):
            if not proc_line.strip():
                result.append("")
                continue

            if in_block:
                # Every line touched by a running block comment is tallied
                # as a whole comment line, even when code follows the "*/".
                self.results["comment_whole"] += 1
                end_idx = proc_line.find("*/")
                if end_idx == -1:
                    result.append("")
                else:
                    result.append(proc_line[end_idx + 2 :].strip())
                    in_block = False
                continue

            block_idx = proc_line.find("/*")
            line_idx = proc_line.find("//")

            # Whichever marker comes first wins: a "/*" that appears after
            # "//" is just comment text and must not open a block comment.
            line_comment_first = line_idx != -1 and (
                block_idx == -1 or line_idx < block_idx
            )

            if line_comment_first:
                result.append(self._record_comment(proc_line[:line_idx].strip()))
            elif block_idx != -1:
                before = proc_line[:block_idx].strip()
                # Search past the opener so that "/*/" is not mistaken for
                # a comment that is already closed.
                end_idx = proc_line.find("*/", block_idx + 2)
                if end_idx != -1:
                    after = proc_line[end_idx + 2 :].strip()
                    combined = (before + " " + after).strip()
                    result.append(self._record_comment(combined))
                else:
                    # Multi-line block starts
                    in_block = True
                    result.append(self._record_comment(before))
            else:
                result.append(proc_line)

        return result

    def _count_directives(self, processed: List[str], original: List[str]) -> List[str]:
        """Count compiler directives (import/package) and blank them out.

        Args:
            processed: Lines with strings masked and comments removed.
            original: Unmodified lines; only used to bound the iteration.

        Returns:
            The input lines, with every line whose first token is in
            ``DIRECTIVES`` (and every blank line) replaced by ``""``.
        """
        result = []
        for proc_line, _orig_line in zip(processed, original):
            tokens = proc_line.split()
            if not tokens:
                result.append("")
            elif tokens[0] in self.DIRECTIVES:
                self.results["compiler_directives"] += 1
                result.append("")
            else:
                result.append(proc_line)
        return result

    def _classify_statement(self, statement: str) -> None:
        """Tally *statement* as a data declaration or an exec instruction.

        The data check runs first, so a statement matching both is counted
        as data. A statement matching neither is not tallied.
        """
        if self._is_data_declaration(statement):
            self.results["data_declarations"] += 1
        elif self._is_exec_instruction(statement):
            self.results["exec_instructions"] += 1

    def _count_logical_sloc(self, processed: List[str], original: List[str]) -> None:
        """Count physical and logical SLOC and classify as data or exec.

        Every non-blank processed line is one physical SLOC. Lines are
        accumulated until one contains ``;``, ``{`` or ``}`` while all
        parentheses and square brackets are balanced; the accumulated text
        then contributes ``max(1, semicolons + open braces)`` logical SLOC
        and a single data/exec classification.

        Args:
            processed: Lines with strings masked and comments and
                directives removed.
            original: Unmodified lines; only used to bound the iteration.
        """
        accumulated = ""
        paren_count = 0
        bracket_count = 0

        for proc_line, _orig_line in zip(processed, original):
            stripped = proc_line.strip()
            if not stripped:
                continue

            # This is a physical SLOC
            self.results["physical_sloc"] += 1

            paren_count += stripped.count("(") - stripped.count(")")
            bracket_count += stripped.count("[") - stripped.count("]")
            accumulated += " " + stripped

            # Still inside (...) or [...]: the statement continues.
            if paren_count != 0 or bracket_count != 0:
                continue
            if not any(mark in stripped for mark in ";{}"):
                continue

            num_statements = max(1, accumulated.count(";") + accumulated.count("{"))
            self._classify_statement(accumulated)
            self.results["logical_sloc"] += num_statements
            accumulated = ""

        # Handle incomplete statement at EOF
        if accumulated.strip():
            self._classify_statement(accumulated)
            self.results["logical_sloc"] += 1

    @staticmethod
    def _contains_keyword(statement: str, keywords) -> bool:
        """Return True if any keyword occurs in *statement* as a whole word.

        Both sides are lower-cased, so the comparison is case-insensitive.
        """
        lowered = statement.lower()
        return any(
            re.search(r"\b" + re.escape(keyword.lower()) + r"\b", lowered)
            for keyword in keywords
        )

    def _is_data_declaration(self, statement: str) -> bool:
        """Check if statement contains any ``DATA_KEYWORDS`` entry."""
        return self._contains_keyword(statement, self.DATA_KEYWORDS)

    def _is_exec_instruction(self, statement: str) -> bool:
        """Check if statement looks executable.

        True when the statement contains an ``EXEC_KEYWORDS`` entry, or a
        ``(`` without the substrings ``class``/``interface``, or an ``=``
        without any ``==``.
        """
        if self._contains_keyword(statement, self.EXEC_KEYWORDS):
            return True

        # Check for method calls (contains '(')
        lowered = statement.lower()
        if "(" in statement and "class" not in lowered and "interface" not in lowered:
            return True

        # Check for assignment (contains '=')
        return "=" in statement and "==" not in statement