# SXXXXXXX_PyUCC/pyucc/core/ucc_java_counter.py

"""
UCC-compatible counter for Java files.

Implements UCC algorithms for Java with the following metrics:
- Comment Whole Lines (/* */, //, /** */)
- Comment Embedded Lines
- Compiler Directives (import/package)
- Data Declarations (class, interface, variable declarations)
- Exec Instructions (executable statements)
- Logical SLOC (statement count)
- Physical SLOC (non-blank, non-comment lines)

Java follows C-style syntax but with Java-specific keywords.
"""

import re
from pathlib import Path
from typing import Dict, List


class UCCJavaCounter:
    """UCC-compatible counter for Java files.

    The analysis is a pipeline of line-oriented passes. Every pass receives
    and returns a list with one entry per physical line, so line positions
    stay aligned between passes:

    1. blank lines are counted on the raw text;
    2. string/char literal contents are masked with ``$`` so that quotes,
       comment markers and punctuation inside literals cannot confuse the
       later passes;
    3. comments are counted (whole line vs. embedded) and stripped;
    4. ``import``/``package`` lines are counted and blanked;
    5. the remaining code is grouped into statements, which are counted as
       logical SLOC and classified as data declarations or exec instructions.

    ``results`` always holds the totals of the most recent ``analyze_file``
    call; each call starts from zero.
    """

    # Java directives
    DIRECTIVES = {"import", "package"}

    # Java data declaration keywords
    DATA_KEYWORDS = {
        "abstract",
        "ArrayList",
        "boolean",
        "byte",
        "char",
        "class",
        "double",
        "extends",
        "float",
        "HashMap",
        "HashSet",
        "implements",
        "int",
        "interface",
        "LinkedHashMap",
        "LinkedList",
        "long",
        "native",
        "private",
        "protected",
        "public",
        "short",
        "static",
        "String",
        "TreeMap",
        "Vector",
        "void",
        "volatile",
        "enum",
        "final",
        "transient",
        "synchronized",
    }

    # Java exec keywords
    EXEC_KEYWORDS = {
        "break",
        "case",
        "catch",
        "continue",
        "default",
        "do",
        "else",
        "finally",
        "for",
        "if",
        "new",
        "return",
        "super",
        "switch",
        "this",
        "throw",
        "throws",
        "try",
        "while",
        "instanceof",
        "assert",
    }

    # Leftmost-first scanner for the lexical elements that matter when masking
    # literals: a complete string literal, a complete char literal, or a
    # comment opener. Scanning them in one pass (instead of "all strings, then
    # all chars") keeps a char literal such as '"' from pairing with a later
    # string quote, and lets quotes inside comments be left alone.
    _LITERAL_OR_COMMENT_RE = re.compile(
        r'"[^"\\]*(?:\\.[^"\\]*)*"'
        r"|'[^'\\]*(?:\\.[^'\\]*)*'"
        r"|//|/\*"
    )

    def __init__(self):
        self.results = self._empty_results()

    @staticmethod
    def _empty_results() -> Dict[str, int]:
        """Return a fresh metrics dictionary with every counter at zero."""
        return {
            "comment_whole": 0,
            "comment_embedded": 0,
            "compiler_directives": 0,
            "data_declarations": 0,
            "exec_instructions": 0,
            "logical_sloc": 0,
            "physical_sloc": 0,
            "blank_lines": 0,
        }

    def analyze_file(self, file_path: Path) -> Dict[str, int]:
        """Analyze a Java file using UCC algorithms.

        Args:
            file_path: Path of the Java source file. It is read as UTF-8 and
                undecodable bytes are ignored.

        Returns:
            A copy of the metrics for this file only. If the file cannot be
            opened or read, every metric is zero.
        """
        # Start from zero so that reusing one counter for several files does
        # not leak totals from a previous file into this result.
        self.results = self._empty_results()

        try:
            with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
                lines = f.readlines()
        except (OSError, ValueError, TypeError):
            # Best effort: an unreadable or invalid path yields empty metrics.
            return self.results.copy()

        # Step 1: Count blank lines BEFORE processing
        self._count_blank_lines(lines)

        # Step 2: Mask string/char literal contents (keep structure)
        processed_lines = self._remove_strings(lines)

        # Step 3: Count and remove comments
        processed_lines = self._count_and_remove_comments(processed_lines, lines)

        # Step 4: Count directives
        processed_lines = self._count_directives(processed_lines, lines)

        # Step 5: Count logical SLOC and classify data/exec
        self._count_logical_sloc(processed_lines, lines)

        return self.results.copy()

    def _count_blank_lines(self, lines: List[str]) -> None:
        """Add the number of whitespace-only lines to ``blank_lines``."""
        self.results["blank_lines"] += sum(1 for line in lines if not line.strip())

    def _remove_strings(self, lines: List[str]) -> List[str]:
        """Mask the contents of string and char literals with ``$``.

        The quotes themselves and the literal's length are preserved, so
        column positions are unchanged. Text inside ``//`` and ``/* */``
        comments is copied through untouched; block-comment state is carried
        from one line to the next so that quotes or apostrophes inside a
        comment are never treated as literal delimiters.

        Args:
            lines: Raw source lines.

        Returns:
            A list of the same length with literal contents masked.
        """
        masked_lines = []
        in_block = False

        for line in lines:
            pieces = []
            pos = 0
            length = len(line)

            while pos < length:
                if in_block:
                    close = line.find("*/", pos)
                    if close == -1:
                        # Comment continues on the next line.
                        pieces.append(line[pos:])
                        break
                    pieces.append(line[pos : close + 2])
                    pos = close + 2
                    in_block = False
                    continue

                match = self._LITERAL_OR_COMMENT_RE.search(line, pos)
                if match is None:
                    pieces.append(line[pos:])
                    break

                pieces.append(line[pos : match.start()])
                token = match.group(0)

                if token == "//":
                    # Rest of the line is a comment: keep it verbatim.
                    pieces.append(line[match.start() :])
                    break

                if token == "/*":
                    pieces.append(token)
                    in_block = True
                else:
                    quote = token[0]
                    pieces.append(quote + "$" * (len(token) - 2) + quote)
                pos = match.end()

            masked_lines.append("".join(pieces))

        return masked_lines

    def _count_and_remove_comments(
        self, processed: List[str], original: List[str]
    ) -> List[str]:
        """Count whole and embedded comments, then remove them.

        A non-blank line that contains any comment text is counted exactly
        once: as ``comment_embedded`` if code remains after the comments are
        stripped, otherwise as ``comment_whole``. Blank lines (including
        blank lines inside a block comment) are not counted here.

        Args:
            processed: Lines with literal contents already masked, so any
                ``//`` or ``/*`` still present is a real comment marker.
            original: Raw lines; only used to bound the iteration.

        Returns:
            One entry per line holding the code that remains ("" if none).
            Lines without any comment are passed through unchanged.
        """
        result = []
        in_block = False

        for proc_line, _ in zip(processed, original):
            if not proc_line.strip():
                result.append("")
                continue

            code_parts = []
            # A line that starts inside a block comment contains comment text.
            saw_comment = in_block
            pos = 0
            length = len(proc_line)

            while pos < length:
                if in_block:
                    close = proc_line.find("*/", pos)
                    if close == -1:
                        pos = length
                    else:
                        pos = close + 2
                        in_block = False
                    continue

                line_idx = proc_line.find("//", pos)
                block_idx = proc_line.find("/*", pos)

                if line_idx == -1 and block_idx == -1:
                    code_parts.append(proc_line[pos:])
                    break

                saw_comment = True
                # Whichever opener comes first wins: "// see /* x" is a line
                # comment and must not open a block comment.
                if block_idx == -1 or (line_idx != -1 and line_idx < block_idx):
                    code_parts.append(proc_line[pos:line_idx])
                    break

                code_parts.append(proc_line[pos:block_idx])
                in_block = True
                # Skip the opener so that "/*/" is not read as open+close.
                pos = block_idx + 2

            if not saw_comment:
                result.append(proc_line)
                continue

            code = " ".join(part.strip() for part in code_parts if part.strip())
            if code:
                self.results["comment_embedded"] += 1
            else:
                self.results["comment_whole"] += 1
            result.append(code)

        return result

    def _count_directives(self, processed: List[str], original: List[str]) -> List[str]:
        """Count compiler directives (import/package).

        A line is a directive when its first whitespace-separated token is in
        ``DIRECTIVES``. Directive lines are replaced by "" in the returned
        list, so the logical-SLOC pass skips them and they do not contribute
        to ``physical_sloc`` or ``logical_sloc``.

        Args:
            processed: Comment-free lines.
            original: Raw lines; only used to bound the iteration.

        Returns:
            The lines with directive lines blanked out.
        """
        result = []

        for proc_line, _ in zip(processed, original):
            tokens = proc_line.split()

            if not tokens:
                result.append("")
            elif tokens[0] in self.DIRECTIVES:
                self.results["compiler_directives"] += 1
                result.append("")
            else:
                result.append(proc_line)

        return result

    def _count_logical_sloc(self, processed: List[str], original: List[str]) -> None:
        """Count physical/logical SLOC and classify statements as data or exec.

        Non-blank lines are accumulated until a line containing ``;``, ``{``
        or ``}`` is reached while parentheses and brackets are balanced. The
        accumulated text is then split into statements (see
        ``_split_statements``); each statement adds one to ``logical_sloc``
        and is classified on its own. Accumulated text without any statement
        terminator (for example a lone ``}``) still counts as one logical
        line, as does unterminated text left over at end of input.

        Args:
            processed: Lines with comments and directives removed.
            original: Raw lines; only used to bound the iteration.
        """
        accumulated = ""
        paren_count = 0
        bracket_count = 0

        for proc_line, _ in zip(processed, original):
            stripped = proc_line.strip()
            if not stripped:
                continue

            # This is a physical SLOC
            self.results["physical_sloc"] += 1

            paren_count += stripped.count("(") - stripped.count(")")
            bracket_count += stripped.count("[") - stripped.count("]")
            accumulated += " " + stripped

            # A statement can only end outside of (...) and [...].
            if paren_count != 0 or bracket_count != 0:
                continue
            if not any(mark in stripped for mark in ";{}"):
                continue

            statements = self._split_statements(accumulated) or [accumulated]
            for statement in statements:
                self._record_statement_kind(statement)
            self.results["logical_sloc"] += len(statements)
            accumulated = ""

        # Handle incomplete statement at EOF
        if accumulated.strip():
            self._record_statement_kind(accumulated)
            self.results["logical_sloc"] += 1

    @staticmethod
    def _split_statements(chunk: str) -> List[str]:
        """Split accumulated code into statements.

        A statement ends at every ``{`` and at every ``;`` that is not
        directly inside parentheses. This keeps the two semicolons of a
        ``for (init; cond; step)`` header from being counted as statements,
        while statements inside a ``{...}`` body that is itself nested in a
        call (anonymous classes, lambdas) are still counted.

        Args:
            chunk: Code text with literals masked and comments removed.

        Returns:
            The statements in order, each including its terminator. Text after
            the last terminator (typically a closing brace) is not returned.
        """
        statements = []
        nesting = []  # currently open "(" / "{" characters, innermost last
        start = 0

        for idx, ch in enumerate(chunk):
            if ch == "(":
                nesting.append(ch)
            elif ch == ")" or ch == "}":
                if nesting:
                    nesting.pop()
            elif ch == "{":
                nesting.append(ch)
                statements.append(chunk[start : idx + 1])
                start = idx + 1
            elif ch == ";" and (not nesting or nesting[-1] == "{"):
                statements.append(chunk[start : idx + 1])
                start = idx + 1

        return statements

    def _record_statement_kind(self, statement: str) -> None:
        """Increment the data or exec counter for one statement.

        Data keywords take precedence; a statement matching neither category
        is left unclassified.
        """
        if self._is_data_declaration(statement):
            self.results["data_declarations"] += 1
        elif self._is_exec_instruction(statement):
            self.results["exec_instructions"] += 1

    @staticmethod
    def _contains_keyword(statement: str, keywords) -> bool:
        """Return True if any keyword occurs in *statement* as a whole word.

        Matching is case-insensitive: both the statement and the keywords are
        lower-cased before the word-boundary search.
        """
        lowered = statement.lower()
        return any(
            re.search(r"\b" + re.escape(keyword.lower()) + r"\b", lowered)
            for keyword in keywords
        )

    def _is_data_declaration(self, statement: str) -> bool:
        """Check if statement contains any data declaration keyword."""
        return self._contains_keyword(statement, self.DATA_KEYWORDS)

    def _is_exec_instruction(self, statement: str) -> bool:
        """Check if statement looks like an executable instruction.

        A statement is executable when it contains an exec keyword, a call
        (an opening parenthesis, unless the statement uses the word ``class``
        or ``interface``), or an assignment (``=`` without any ``==``).
        """
        if self._contains_keyword(statement, self.EXEC_KEYWORDS):
            return True

        # Method call. The class/interface guard is matched on whole words so
        # that identifiers such as getClass() are still recognised as calls.
        if "(" in statement and not self._contains_keyword(
            statement, ("class", "interface")
        ):
            return True

        # Check for assignment (contains '=')
        if "=" in statement and "==" not in statement:
            return True

        return False