# SXXXXXXX_PyUCC/pyucc/core/ucc_complete_counter.py
# (562 lines, 21 KiB, Python)
"""Complete UCC-compatible counter with full preprocessing pipeline.
This module implements the complete UCC counting flow:
1. PreCountProcess - Remove strings, normalize whitespace
2. CountCommentsSLOC - Remove all comments (block and line)
3. CountBlankSLOC - Identify blank lines
4. CountDirectiveSLOC - Extract and count directives
5. LanguageSpecificProcess - LSLOC state machine with keyword classification
Target: 90-95% accuracy matching UCC v.2018.07 for C/C++
"""
import re
from pathlib import Path
from typing import Dict, Any, List, Tuple
import logging
_LOG = logging.getLogger(__name__)
class UCCCompleteCounter:
"""Complete UCC-compatible counter with full preprocessing."""
def __init__(self, language: str = "C"):
self.language = language.upper()
self._setup_language()
def _setup_language(self):
"""Setup language-specific patterns and keywords."""
if self.language in ["C", "C++", "C_CPP"]:
self._setup_c_cpp()
else:
raise NotImplementedError(f"Language {self.language} not yet supported")
def _setup_c_cpp(self):
"""Setup C/C++ specific patterns and keywords from UCC source."""
# Comment patterns
self.line_comment_start = "//"
self.block_comment_start = "/*"
self.block_comment_end = "*/"
# String quote patterns
self.string_quote = '"'
self.char_quote = "'"
self.escape_char = '\\'
# Continuation line
self.continuation = '\\'
# Compiler directives (from UCC CCJavaCsScalaCounter.cpp)
self.directive_keywords = [
"define", "undef", "if", "ifdef", "ifndef", "else",
"elif", "endif", "include", "pragma", "error",
"warning", "line", "region", "endregion"
]
# Data declaration keywords (from UCC exec_name_list)
self.data_keywords = [
"auto", "bool", "char", "class", "const", "double",
"enum", "extern", "float", "int", "long", "private",
"protected", "public", "register", "short", "signed",
"static", "struct", "typedef", "union", "unsigned",
"virtual", "void", "volatile",
# C++ specific
"namespace", "template", "typename", "explicit",
# Common types
"size_t", "uint8_t", "uint16_t", "uint32_t", "uint64_t",
"int8_t", "int16_t", "int32_t", "int64_t",
"wchar_t", "ptrdiff_t"
]
# Executable instruction keywords (from UCC exec_name_list)
self.exec_keywords = [
"break", "case", "catch", "continue", "default",
"delete", "do", "else", "for", "goto", "if",
"new", "return", "switch", "throw", "try", "while",
# Additional
"sizeof", "typeid", "const_cast", "dynamic_cast",
"reinterpret_cast", "static_cast"
]
# For/if/while control structures
self.control_keywords = ["for", "if", "while"]
def analyze_file(self, file_path: Path) -> Dict[str, Any]:
"""
Analyze file with complete UCC preprocessing pipeline.
Returns dict with UCC-compatible metrics.
"""
if not file_path.exists():
raise FileNotFoundError(f"File not found: {file_path}")
try:
with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
lines = f.readlines()
except Exception as e:
_LOG.error(f"Error reading {file_path}: {e}")
raise
# Store original lines
original_lines = lines.copy()
total_lines = len(lines)
# STEP 1: PreCountProcess - Remove quotes and normalize
processed_lines = self._precount_process(lines)
# STEP 2: CountBlankSLOC - Identify blank lines (BEFORE removing comments!)
blank_lines = self._count_blank_sloc(processed_lines)
# STEP 3: CountCommentsSLOC - Remove all comments
no_comment_lines, comment_whole, comment_embedded = self._count_comments_sloc(
processed_lines, original_lines
)
# STEP 4: CountDirectiveSLOC - Extract directives
no_directive_lines, directive_count = self._count_directive_sloc(
no_comment_lines, original_lines
)
# STEP 5: LanguageSpecificProcess - LSLOC with state machine
lsloc_result = self._language_specific_process(
no_directive_lines, original_lines
)
# Calculate physical SLOC (non-blank, non-comment-only)
physical_sloc = total_lines - blank_lines - comment_whole
return {
'total_lines': total_lines,
'blank_lines': blank_lines,
'comment_whole': comment_whole,
'comment_embedded': comment_embedded,
'compiler_directives': directive_count,
'data_declarations': lsloc_result['data_decl'],
'exec_instructions': lsloc_result['exec_inst'],
'logical_sloc': lsloc_result['logical_sloc'],
'physical_sloc': physical_sloc
}
def _precount_process(self, lines: List[str]) -> List[str]:
"""
PreCountProcess: Remove string literals and normalize.
Replaces quoted strings with empty quotes to avoid counting
keywords/terminators inside strings.
"""
processed = []
for line in lines:
# Remove string literals but keep the quotes
cleaned = self._remove_string_literals(line)
processed.append(cleaned)
return processed
def _remove_string_literals(self, line: str) -> str:
"""Remove content of string and char literals, keep quotes."""
result = []
i = 0
while i < len(line):
char = line[i]
# Check for string literal
if char == self.string_quote:
result.append(char)
i += 1
# Skip until closing quote or end of line
while i < len(line):
if line[i] == self.escape_char and i + 1 < len(line):
# Skip escaped character
i += 2
elif line[i] == self.string_quote:
result.append(line[i])
i += 1
break
else:
# Don't include string content
i += 1
# Check for char literal
elif char == self.char_quote:
result.append(char)
i += 1
# Skip until closing quote or end of line
while i < len(line):
if line[i] == self.escape_char and i + 1 < len(line):
# Skip escaped character
i += 2
elif line[i] == self.char_quote:
result.append(line[i])
i += 1
break
else:
# Don't include char content
i += 1
else:
result.append(char)
i += 1
return ''.join(result)
def _count_comments_sloc(
self,
lines: List[str],
original_lines: List[str]
) -> Tuple[List[str], int, int]:
"""
CountCommentsSLOC: Remove all comments and count whole/embedded.
UCC counts EVERY line in a multi-line block comment as comment_whole.
"""
no_comment_lines = []
comment_whole = 0
comment_embedded = 0
in_block_comment = False
for i, line in enumerate(lines):
original_stripped = line.strip()
cleaned = line
# Handle being inside a block comment from previous line
if in_block_comment:
# Count this continuation line as comment_whole
comment_whole += 1
end_pos = cleaned.find(self.block_comment_end)
if end_pos != -1:
# Block comment ends
after_comment = cleaned[end_pos + len(self.block_comment_end):].strip()
cleaned = cleaned[end_pos + len(self.block_comment_end):]
in_block_comment = False
# If has code after, it's embedded
if after_comment:
comment_embedded += 1
# But we already counted as whole above, so subtract 1
comment_whole -= 1
else:
# Still in block comment
cleaned = ""
no_comment_lines.append(cleaned)
continue
# Check for whole line comments (line or block)
if original_stripped.startswith(self.line_comment_start):
comment_whole += 1
no_comment_lines.append("")
continue
if original_stripped.startswith(self.block_comment_start):
# Block comment starting at line beginning
comment_whole += 1
end_pos = cleaned.find(self.block_comment_end)
if end_pos != -1:
# Block comment ends on same line
after_comment = cleaned[end_pos + len(self.block_comment_end):].strip()
if after_comment:
# Has code after - it's embedded, not whole
comment_embedded += 1
comment_whole -= 1
cleaned = cleaned[end_pos + len(self.block_comment_end):]
else:
cleaned = ""
else:
# Block comment continues to next line
in_block_comment = True
cleaned = ""
no_comment_lines.append(cleaned)
continue
# Check for embedded comments
line_comment_pos = cleaned.find(self.line_comment_start)
block_comment_pos = cleaned.find(self.block_comment_start)
# Find first comment
first_comment_pos = -1
if line_comment_pos != -1 and block_comment_pos != -1:
first_comment_pos = min(line_comment_pos, block_comment_pos)
elif line_comment_pos != -1:
first_comment_pos = line_comment_pos
elif block_comment_pos != -1:
first_comment_pos = block_comment_pos
if first_comment_pos != -1:
code_before = cleaned[:first_comment_pos].strip()
if code_before:
comment_embedded += 1
else:
comment_whole += 1
# Remove comment
if first_comment_pos == line_comment_pos:
cleaned = cleaned[:line_comment_pos]
else:
# Block comment
end_pos = cleaned.find(self.block_comment_end, block_comment_pos + len(self.block_comment_start))
if end_pos != -1:
cleaned = cleaned[:block_comment_pos] + cleaned[end_pos + len(self.block_comment_end):]
else:
cleaned = cleaned[:block_comment_pos]
in_block_comment = True
no_comment_lines.append(cleaned)
return no_comment_lines, comment_whole, comment_embedded
def _count_blank_sloc(self, lines: List[str]) -> int:
"""Count blank lines (lines with no code after comment removal)."""
blank_count = 0
for line in lines:
if not line.strip():
blank_count += 1
return blank_count
def _count_directive_sloc(
self,
lines: List[str],
original_lines: List[str]
) -> Tuple[List[str], int]:
"""
CountDirectiveSLOC: Extract and count compiler directives.
Returns:
- Lines with directives blanked
- Count of directive statements (logical)
"""
no_directive_lines = []
directive_count = 0
in_directive = False
directive_statement = ""
for i, line in enumerate(lines):
stripped = line.lstrip()
# Check if this is a directive line
if stripped.startswith('#'):
# Check if it's a recognized directive
is_directive = False
for keyword in self.directive_keywords:
if re.match(r'#\s*' + keyword + r'\b', stripped):
is_directive = True
break
if is_directive:
in_directive = True
directive_statement += stripped
# Check for continuation
if stripped.rstrip().endswith(self.continuation):
# Directive continues on next line
no_directive_lines.append("")
continue
else:
# Directive complete
directive_count += 1
directive_statement = ""
in_directive = False
no_directive_lines.append("")
continue
elif in_directive:
# Continuation of directive
directive_statement += stripped
if stripped.rstrip().endswith(self.continuation):
no_directive_lines.append("")
continue
else:
# Directive complete
directive_count += 1
directive_statement = ""
in_directive = False
no_directive_lines.append("")
continue
no_directive_lines.append(line)
return no_directive_lines, directive_count
def _language_specific_process(
self,
lines: List[str],
original_lines: List[str]
) -> Dict[str, int]:
"""
LanguageSpecificProcess: LSLOC counting with state machine.
Implements UCC's LSLOC algorithm with:
- Statement terminator detection (;, {, })
- Parenthesis tracking for for/if/while
- Keyword-based data vs exec classification
- Multi-line statement accumulation
"""
data_decl = 0
exec_inst = 0
logical_sloc = 0
# State machine variables (maintained across lines)
paren_count = 0
brace_count = 0
bracket_count = 0 # For arrays []
for_flag = False
found_forifwhile = False
statement_buffer = ""
for line in lines:
stripped = line.strip()
if not stripped:
continue
# Process each character looking for terminators
i = 0
while i < len(stripped):
char = stripped[i]
statement_buffer += char
# Track brackets, parentheses, braces
if char == '[':
bracket_count += 1
elif char == ']':
bracket_count = max(0, bracket_count - 1)
elif char == '(':
paren_count += 1
# Check if this starts a for/if/while
# Look for keyword before the (
before_paren = statement_buffer[:statement_buffer.rfind('(')].strip()
words = before_paren.split()
if words and words[-1] in self.control_keywords:
found_forifwhile = True
if words[-1] == "for":
for_flag = True
elif char == ')':
paren_count = max(0, paren_count - 1)
# If for/if/while condition closed, count it as exec
if paren_count == 0 and found_forifwhile:
logical_sloc += 1
exec_inst += 1
found_forifwhile = False
for_flag = False
# Check for statement terminators
elif char == ';':
if paren_count == 0 and bracket_count == 0:
# End of statement
stmt = statement_buffer.strip()
if stmt and len(stmt) > 1: # Not just ;
# Remove trailing ;
stmt = stmt[:-1].strip()
if stmt:
logical_sloc += 1
# Classify as data or exec
if self._is_data_declaration(stmt):
data_decl += 1
else:
exec_inst += 1
statement_buffer = ""
for_flag = False
elif char == '{':
brace_count += 1
if paren_count == 0 and bracket_count == 0:
# Start of block
stmt = statement_buffer.strip()[:-1].strip() # Remove {
if stmt and not found_forifwhile:
logical_sloc += 1
if self._is_data_declaration(stmt):
data_decl += 1
else:
exec_inst += 1
statement_buffer = ""
found_forifwhile = False
elif char == '}':
brace_count = max(0, brace_count - 1)
if paren_count == 0 and bracket_count == 0:
# End of block
stmt = statement_buffer.strip()[:-1].strip() # Remove }
if stmt:
logical_sloc += 1
if self._is_data_declaration(stmt):
data_decl += 1
else:
exec_inst += 1
statement_buffer = ""
i += 1
# Handle any remaining statement
if statement_buffer.strip():
logical_sloc += 1
if self._is_data_declaration(statement_buffer):
data_decl += 1
else:
exec_inst += 1
return {
'data_decl': data_decl,
'exec_inst': exec_inst,
'logical_sloc': logical_sloc
}
def _is_data_declaration(self, statement: str) -> bool:
"""
Determine if statement is a data declaration or executable instruction.
Uses keyword matching similar to UCC.
"""
stmt_lower = statement.lower()
# Remove common prefixes
stmt_lower = re.sub(r'^\s*(public|private|protected|static|extern|const|volatile)\s+', '', stmt_lower)
# Check for data keywords
for keyword in self.data_keywords:
if re.search(r'\b' + keyword + r'\b', stmt_lower):
return True
# Check for exec keywords (takes precedence)
for keyword in self.exec_keywords:
if re.search(r'\b' + keyword + r'\b', stmt_lower):
return False
# Check for function call pattern (name followed by parenthesis)
if re.search(r'\w+\s*\(', statement):
# Could be function call (exec) or function declaration (data)
# If no type keyword before, likely a call
has_type = any(re.search(r'\b' + kw + r'\b', stmt_lower) for kw in self.data_keywords)
return has_type
# Check for assignment (likely exec)
if '=' in statement and not '==' in statement:
# Could be initialization or assignment
# If has type keyword, it's data declaration with initialization
has_type = any(re.search(r'\b' + kw + r'\b', stmt_lower) for kw in self.data_keywords)
return has_type
# Default: if has pointer or array, likely data
if '*' in statement or '[' in statement:
return True
# Default to exec
return False
def analyze_file_ucc_complete(file_path: Path) -> Dict[str, Any]:
    """Convenience wrapper: run the complete UCC counter (C) on *file_path*."""
    return UCCCompleteCounter(language="C").analyze_file(file_path)