# SXXXXXXX_PyUCC/pyucc/core/ucc_python_counter.py

"""
UCC-compatible counter for Python files.

Implements UCC algorithms for Python with the following metrics:
- Comment Whole Lines
- Comment Embedded Lines
- Compiler Directives (import/from statements)
- Exec Instructions (all executable code in Python)
- Logical SLOC (statement count)
- Physical SLOC (non-blank, non-comment, non-directive lines)

Note: Python does not distinguish between data declarations and executable instructions,
so data_declarations is always 0.
"""

import re
from pathlib import Path
from typing import Dict, List, Tuple


class UCCPythonCounter:
    """UCC-compatible counter for Python files.

    One call to :meth:`analyze_file` runs a fixed pipeline over the lines of
    a file:

    1. count blank lines,
    2. mask string contents (so quotes, ``#`` and brackets inside strings
       cannot confuse the later passes),
    3. count and strip comments (``#`` comments and docstrings),
    4. count and strip directives (``import`` / ``from`` statements),
    5. count physical SLOC, logical SLOC and exec instructions.

    The totals are kept in ``self.results`` and are reset at the start of
    every :meth:`analyze_file` call.
    """

    # Python directives (imports)
    DIRECTIVES = {"import", "from", "as"}

    # Python exec keywords (all are exec, no data declarations)
    EXEC_KEYWORDS = {
        "and",
        "as",
        "assert",
        "break",
        "continue",
        "def",
        "del",
        "elif",
        "else",
        "except",
        "exec",
        "exit",
        "finally",
        "for",
        "global",
        "if",
        "in",
        "is",
        "lambda",
        "not",
        "or",
        "pass",
        "print",
        "raise",
        "return",
        "try",
        "while",
        "with",
        "yield",
        "class",
        "async",
        "await",
        "nonlocal",
    }

    # Continuation indicators
    CONTINUATION_CHARS = {
        "+",
        "-",
        "*",
        "/",
        "=",
        "<",
        ">",
        "|",
        "&",
        "%",
        "^",
        "\\",
        "~",
        ",",
    }
    CONTINUATION_KEYWORDS = {"is", "in", "not", "and", "or"}

    # Characters that may prefix a string literal (r"", b"", f"", rb"", ...).
    _STRING_PREFIX_CHARS = frozenset("rRbBuUfF")

    # Tokens that open a comment in a preprocessed line.  Triple quotes only
    # survive preprocessing when they delimit a standalone docstring.
    _COMMENT_OPENERS = ("#", '"""', "'''")

    # Keywords whose clause header ends with ':' and may be followed by a
    # statement on the same line ("if x: y = 1").
    _COMPOUND_KEYWORDS = frozenset(
        {
            "if",
            "elif",
            "else",
            "for",
            "while",
            "try",
            "except",
            "finally",
            "with",
            "def",
            "class",
            "async",
            "match",
            "case",
        }
    )

    _FIRST_WORD_RE = re.compile(r"[A-Za-z_]\w*")
    # Tokens relevant to splitting one physical statement into logical ones.
    _STATEMENT_TOKEN_RE = re.compile(r"\blambda\b|\belse\s*:|:=|[()\[\]{}:;]")

    @staticmethod
    def _empty_results() -> Dict[str, int]:
        """Return a fresh, zeroed metrics dictionary."""
        return {
            "comment_whole": 0,
            "comment_embedded": 0,
            "compiler_directives": 0,
            "data_declarations": 0,  # Always 0 for Python
            "exec_instructions": 0,
            "logical_sloc": 0,
            "physical_sloc": 0,
            "blank_lines": 0,
        }

    def __init__(self):
        self.results = self._empty_results()

    def analyze_file(self, file_path: Path) -> Dict[str, int]:
        """
        Analyze a Python file using UCC algorithms.

        The counters are reset first, so the same instance can be reused for
        several files without the totals of one file leaking into the next.

        Args:
            file_path: Path of the file to read (decoded as UTF-8, undecodable
                bytes are ignored).

        Returns:
            A copy of the metrics dictionary.  If the file cannot be opened
            or read, every metric is 0.
        """
        self.results = self._empty_results()

        try:
            with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
                lines = f.readlines()
        except (OSError, ValueError, TypeError):
            # Best effort: an unreadable file yields all-zero metrics.
            return self.results.copy()

        # Step 1: Count blank lines BEFORE any processing
        self._count_blank_lines(lines)

        # Step 2: Mask strings and comment text
        processed_lines, original_lines = self._preprocess_lines(lines)

        # Step 3: Count and remove comments (updates comment_whole, comment_embedded)
        processed_lines = self._count_and_remove_comments(
            processed_lines, original_lines
        )

        # Step 4: Count directives (import/from statements)
        processed_lines = self._count_directives(processed_lines, original_lines)

        # Step 5: Count logical SLOC and exec instructions
        self._count_logical_sloc(processed_lines, original_lines)

        return self.results.copy()

    def _count_blank_lines(self, lines: List[str]) -> None:
        """Add the number of whitespace-only raw lines to ``blank_lines``."""
        self.results["blank_lines"] += sum(1 for line in lines if not line.strip())

    @staticmethod
    def _mask_quoted(line: str, start: int) -> Tuple[str, int]:
        """
        Mask one single- or double-quoted string starting at ``line[start]``.

        The delimiters are kept, every character between them becomes ``$``
        and a backslash escape is masked together with the character it
        escapes.  An unterminated string is masked up to the end of the line.

        Returns:
            ``(masked_text, next_index)`` where ``next_index`` is the position
            right after the string.
        """
        quote = line[start]
        out = [quote]
        i = start + 1
        n = len(line)
        while i < n:
            ch = line[i]
            if ch == "\\" and i + 1 < n:
                out.append("$$")
                i += 2
            elif ch == quote:
                out.append(quote)
                i += 1
                break
            else:
                out.append("$")
                i += 1
        return "".join(out), i

    def _preprocess_lines(self, lines: List[str]) -> Tuple[List[str], List[str]]:
        """
        Mask string contents and comment text but keep the line structure.

        Each line is scanned left to right, so a quote inside a ``#`` comment
        or a ``#`` inside a string is never misread:

        * ``#`` outside a string: the ``#`` is kept and the comment text is
          replaced with ``$``.
        * single/double quoted strings: delimiters kept, contents ``$``.
        * triple-quoted strings used as values (anything precedes them on the
          line, or they sit inside brackets): masked entirely, delimiters
          included, so later passes treat them as code.
        * standalone triple-quoted strings (docstrings): contents masked but
          delimiters kept, so the comment pass can count them as comments.
          A string prefix such as ``r`` in front of them is blanked out.
        * whitespace-only lines inside a multi-line string stay empty; they
          are already counted as blank lines.

        Returns:
            ``(processed_lines, original_lines)``, both without line endings.
        """
        processed = []
        original = []

        open_delim = None  # delimiter of a triple-quoted string still open
        open_is_doc = False  # True when that string is a standalone docstring
        depth = 0  # bracket nesting depth of the code scanned so far

        for line in lines:
            text = line.rstrip("\n\r")
            original.append(text)

            length = len(text)
            out = []
            i = 0

            if open_delim is not None:
                end = text.find(open_delim)
                if end == -1:
                    # Entire line is inside the string
                    processed.append("$" * length if text.strip() else "")
                    continue
                out.append("$" * end)
                out.append(open_delim if open_is_doc else "$$$")
                i = end + 3
                open_delim = None

            while i < length:
                ch = text[i]

                if ch == "#":
                    # Rest of the line is a comment; hide its text so quotes
                    # in it cannot look like string delimiters later.
                    out.append("#" + "$" * (length - i - 1))
                    break

                if ch in "\"'":
                    if text.startswith(ch * 3, i):
                        delim = ch * 3
                        lead = "".join(out)
                        lead_text = lead.strip()
                        is_doc = (
                            depth == 0
                            and len(lead_text) <= 2
                            and all(c in self._STRING_PREFIX_CHARS for c in lead_text)
                        )
                        if is_doc:
                            # Drop a string prefix (r, b, f, ...) so it is not
                            # mistaken for code in front of the docstring.
                            out = [" " * len(lead)]
                        shown = delim if is_doc else "$$$"
                        end = text.find(delim, i + 3)
                        if end == -1:
                            # Start of multi-line triple-quoted string
                            out.append(shown + "$" * (length - i - 3))
                            open_delim = delim
                            open_is_doc = is_doc
                            i = length
                        else:
                            # Complete triple-quoted string on one line
                            out.append(shown + "$" * (end - i - 3) + shown)
                            i = end + 3
                    else:
                        masked, i = self._mask_quoted(text, i)
                        out.append(masked)
                    continue

                if ch in "([{":
                    depth += 1
                elif ch in ")]}":
                    depth = max(0, depth - 1)
                out.append(ch)
                i += 1

            processed.append("".join(out))

        return processed, original

    def _replace_quotes(self, line: str) -> str:
        """Replace content of single and double quoted strings with $."""
        out = []
        i = 0
        while i < len(line):
            if line[i] in "\"'":
                masked, i = self._mask_quoted(line, i)
                out.append(masked)
            else:
                out.append(line[i])
                i += 1
        return "".join(out)

    def _count_and_remove_comments(
        self, processed: List[str], original: List[str]
    ) -> List[str]:
        """
        Count whole and embedded comments, then remove them.

        A line that holds only comment text (``#`` comment, docstring line)
        increments ``comment_whole``; a line that holds both code and comment
        increments ``comment_embedded`` and keeps only its code.  Every
        non-blank line of a multi-line docstring counts as a whole comment.
        Blank lines are passed through untouched (they were counted earlier).

        Args:
            processed: Lines produced by :meth:`_preprocess_lines`.
            original: Unused; kept so the pipeline signature stays unchanged.

        Returns:
            One entry per input line with all comment text removed.
        """
        result = []
        block_delim = None  # delimiter of the docstring currently open

        for proc_line in processed:
            if not proc_line.strip():
                result.append("")
                continue

            code_parts = []
            has_comment = False
            rest = proc_line

            while rest:
                if block_delim is not None:
                    has_comment = True
                    end = rest.find(block_delim)
                    if end == -1:
                        break  # the docstring continues on the next line
                    # Keep scanning: code may follow the closing delimiter.
                    rest = rest[end + 3 :]
                    block_delim = None
                    continue

                # Locate the earliest comment opener in the remaining text.
                best_pos = -1
                best_token = None
                for token in self._COMMENT_OPENERS:
                    pos = rest.find(token)
                    if pos != -1 and (best_pos == -1 or pos < best_pos):
                        best_pos, best_token = pos, token

                if best_token is None:
                    code_parts.append(rest)
                    break

                code_parts.append(rest[:best_pos])
                has_comment = True
                if best_token == "#":
                    break  # a '#' comment runs to the end of the line
                block_delim = best_token
                rest = rest[best_pos + 3 :]

            if not has_comment:
                result.append(proc_line)
                continue

            code = " ".join(part.strip() for part in code_parts if part.strip())
            if code:
                self.results["comment_embedded"] += 1
                result.append(code)
            else:
                self.results["comment_whole"] += 1
                result.append("")

        return result

    def _count_directives(self, processed: List[str], original: List[str]) -> List[str]:
        """
        Count and extract compiler directives (import/from statements).

        Every physical line of a directive is counted, including the
        continuation lines of a backslash-continued or parenthesised import.
        Consuming the parenthesised form completely also keeps its closing
        ``)`` from reaching the bracket tracking of the statement counter.

        Args:
            processed: Comment-free lines.
            original: Unused; kept so the pipeline signature stays unchanged.

        Returns:
            Lines with directives replaced by empty strings.
        """
        result = []
        in_directive = False
        depth = 0  # open parentheses of the directive being consumed

        for proc_line in processed:
            stripped = proc_line.strip()

            if not stripped:
                result.append("")
                continue

            if not in_directive and stripped.split()[0] not in self.DIRECTIVES:
                result.append(proc_line)
                continue

            self.results["compiler_directives"] += 1
            depth = max(0, depth + stripped.count("(") - stripped.count(")"))
            # The directive continues while a '(' is open or the line ends
            # with an explicit line-joining backslash.
            in_directive = depth > 0 or stripped.endswith("\\")
            result.append("")

        return result

    def _count_statements(self, statement: str) -> int:
        """
        Return how many logical statements one accumulated statement holds.

        ``;`` and the ``:`` of a compound-statement header separate
        statements, but only outside brackets and only when something
        follows them, so ``if x:`` is 1 and ``if x: y = 1`` is 2.  Colons
        belonging to ``else:``, ``lambda``, ``:=`` or an annotation
        (statement not starting with a compound keyword) are not separators.
        """
        text = statement.strip()
        first_word = self._FIRST_WORD_RE.match(text)
        is_compound = (
            first_word is not None and first_word.group(0) in self._COMPOUND_KEYWORDS
        )

        depth = 0
        pending_lambdas = 0
        count = 1

        for match in self._STATEMENT_TOKEN_RE.finditer(text):
            token = match.group(0)
            if token in ("(", "[", "{"):
                depth += 1
            elif token in (")", "]", "}"):
                depth = max(0, depth - 1)
            elif depth > 0 or token == ":=" or token.startswith("else"):
                continue
            elif token == "lambda":
                pending_lambdas += 1
            elif not text[match.end() :].strip():
                continue  # trailing terminator, nothing follows it
            elif token == ";":
                count += 1
            elif pending_lambdas:
                pending_lambdas -= 1  # this ':' closes a lambda parameter list
            elif is_compound:
                count += 1

        return count

    def _count_logical_sloc(self, processed: List[str], original: List[str]) -> None:
        """
        Count physical SLOC, logical SLOC and exec instructions.

        Every non-empty input line is one physical SLOC.  Lines are
        accumulated into a statement while brackets are open, the line ends
        with a backslash, or it ends with a continuation operator/keyword.
        A line ending in ``else:`` is dropped without being counted as a
        statement.  Each completed statement adds one exec instruction and
        :meth:`_count_statements` logical SLOC.

        Args:
            processed: Lines with comments and directives already removed.
            original: Unused; kept so the pipeline signature stays unchanged.
        """
        accumulated_statement = ""
        paren_count = 0
        bracket_count = 0
        brace_count = 0

        for proc_line in processed:
            stripped = proc_line.strip()

            if not stripped:
                continue

            # This is a physical line (non-blank, non-comment, non-directive)
            self.results["physical_sloc"] += 1

            # Track nesting; clamp at 0 so a stray closer cannot leave the
            # counters negative for the rest of the file.
            paren_count = max(
                0, paren_count + proc_line.count("(") - proc_line.count(")")
            )
            bracket_count = max(
                0, bracket_count + proc_line.count("[") - proc_line.count("]")
            )
            brace_count = max(
                0, brace_count + proc_line.count("{") - proc_line.count("}")
            )

            accumulated_statement += " " + stripped

            # 1. Inside parentheses/brackets/braces
            if paren_count or bracket_count or brace_count:
                continue

            # 2. Explicit line continuation
            if stripped.endswith("\\"):
                accumulated_statement = accumulated_statement.rstrip("\\")
                continue

            # 3. Ends with continuation operator/keyword
            last_token = self._get_last_token(stripped)
            if (
                last_token in self.CONTINUATION_CHARS
                or last_token in self.CONTINUATION_KEYWORDS
            ):
                continue

            # 4. Special case: else: or elif: - not counted as separate statement
            if stripped.endswith("else:") or stripped.endswith("elif:"):
                accumulated_statement = ""
                continue

            if self._is_exec_instruction(accumulated_statement):
                self.results["exec_instructions"] += 1

            self.results["logical_sloc"] += self._count_statements(
                accumulated_statement
            )

            accumulated_statement = ""

        # Handle incomplete statement at end of file
        if accumulated_statement.strip():
            if self._is_exec_instruction(accumulated_statement):
                self.results["exec_instructions"] += 1
            self.results["logical_sloc"] += 1

    def _get_last_token(self, line: str) -> str:
        """
        Extract last meaningful token from line.

        Returns the final character when it is one of ``CONTINUATION_CHARS``,
        otherwise the last whitespace-separated word, or ``""`` for an empty
        line.
        """
        line = line.rstrip()
        if not line:
            return ""

        # Check if last char is an operator
        if line[-1] in self.CONTINUATION_CHARS:
            return line[-1]

        # Extract last word
        tokens = line.split()
        return tokens[-1] if tokens else ""

    def _is_exec_instruction(self, statement: str) -> bool:
        """
        Return True when ``statement`` counts as an exec instruction.

        Python has no separate data declarations here, so any statement with
        content is executable.  Scanning for ``EXEC_KEYWORDS``, ``=`` or ``(``
        could never change that answer, so the check is reduced to a
        non-empty test.
        """
        return bool(statement.strip())