# SXXXXXXX_PyUCC/pyucc/core/ucc_extended_counting.py
# (file-viewer metadata, not code: 475 lines, 16 KiB, Python)
"""Extended counting module to match UCC detailed metrics.
This module provides counting capabilities that match UCC's detailed breakdown:
- Whole vs Embedded comments
- Compiler directives
- Data declarations
- Executable instructions
- Logical SLOC (statement-based counting)
"""
import logging
import re
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple
_LOG = logging.getLogger(__name__)
class UCCExtendedCounter:
    """Extended counter that provides UCC-compatible detailed metrics."""

    def __init__(self, language: str = "C"):
        """Create a counter for *language* (case-insensitive; default "C")."""
        # Normalize once so every later comparison can assume lowercase.
        self.language = language.lower()
        self._setup_patterns()
def _setup_patterns(self):
"""Setup regex patterns based on language."""
if self.language in ["c", "c++"]:
self._setup_c_cpp_patterns()
elif self.language == "python":
self._setup_python_patterns()
else:
self._setup_generic_patterns()
def _setup_c_cpp_patterns(self):
"""Setup patterns for C/C++."""
# Compiler directives
self.directive_pattern = re.compile(
r"^\s*#\s*(include|define|undef|ifdef|ifndef|endif|if|elif|else|"
r"pragma|error|warning|line)\b",
re.IGNORECASE,
)
# Data declarations (simplified - real implementation needs more sophisticated parsing)
self.data_decl_pattern = re.compile(
r"^\s*(?:extern\s+)?(?:static\s+)?(?:const\s+)?"
r"(?:unsigned\s+)?(?:signed\s+)?"
r"(?:int|char|short|long|float|double|void|bool|size_t|uint\w*|int\w*)\s+"
r"(?:\*\s*)*\w+\s*(?:\[[^\]]*\])?\s*[;=]",
re.MULTILINE,
)
# Comments
self.line_comment_pattern = re.compile(r"//.*$", re.MULTILINE)
self.block_comment_pattern = re.compile(r"/\*.*?\*/", re.DOTALL)
# Statement terminators for logical SLOC
self.statement_terminators = [";", "{", "}"]
def _setup_python_patterns(self):
"""Setup patterns for Python."""
self.directive_pattern = re.compile(r"^\s*(?:import|from)\s+", re.MULTILINE)
self.data_decl_pattern = re.compile(
r"^\s*\w+\s*:\s*(?:int|str|float|bool|list|dict|tuple|set)\s*[=]?",
re.MULTILINE,
)
self.line_comment_pattern = re.compile(r"#.*$", re.MULTILINE)
self.statement_terminators = ["\n"] # Python uses newlines
def _setup_generic_patterns(self):
"""Setup generic fallback patterns."""
self.directive_pattern = re.compile(r"^\s*#", re.MULTILINE)
self.data_decl_pattern = None
self.line_comment_pattern = re.compile(r"#.*$|//.*$", re.MULTILINE)
self.block_comment_pattern = re.compile(
r'/\*.*?\*/|""".*?"""|\'\'\'.*?\'\'\'', re.DOTALL
)
self.statement_terminators = [";"]
def analyze_file_extended(self, file_path: Path) -> Dict[str, Any]:
"""
Analyze file with UCC-compatible detailed metrics.
Returns dict with:
- total_lines: Total lines in file
- blank_lines: Completely empty lines
- comment_whole: Comments on their own line
- comment_embedded: Comments on same line as code
- compiler_directives: Preprocessor directives count
- data_declarations: Data/variable declaration count
- exec_instructions: Executable statement count
- logical_sloc: Logical source lines of code
- physical_sloc: Physical source lines of code (non-blank, non-comment-only)
"""
if not file_path.exists():
raise FileNotFoundError(f"File not found: {file_path}")
try:
with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
lines = f.readlines()
except Exception as e:
_LOG.error(f"Failed to read {file_path}: {e}")
return self._empty_result()
return self._analyze_lines(lines)
def _analyze_lines(self, lines: List[str]) -> Dict[str, Any]:
"""Analyze list of lines and return detailed metrics."""
result = {
"total_lines": len(lines),
"blank_lines": 0,
"comment_whole": 0,
"comment_embedded": 0,
"compiler_directives": 0,
"data_declarations": 0,
"exec_instructions": 0,
"logical_sloc": 0,
"physical_sloc": 0,
}
# First pass: identify block comment boundaries
in_block_comment = False
block_comment_lines = set()
for line_num, line in enumerate(lines):
stripped = line.strip()
# Track block comment state
if "/*" in line:
in_block_comment = True
block_comment_lines.add(line_num)
if in_block_comment:
block_comment_lines.add(line_num)
if "*/" in line:
in_block_comment = False
# Second pass: classify each line
logical_statements = 0
for line_num, line in enumerate(lines):
stripped = line.strip()
# Count blank lines
if not stripped:
result["blank_lines"] += 1
continue
# Check if this line is part of a block comment
is_in_block = line_num in block_comment_lines
# Analyze line type
if is_in_block:
# Check if there's also code on this line (embedded comment)
has_code_before = (
"/*" in line
and line.index("/*") > 0
and line[: line.index("/*")].strip()
)
has_code_after = (
"*/" in line
and line.index("*/") < len(line) - 2
and line[line.index("*/") + 2 :].strip()
)
if has_code_before or has_code_after:
result["comment_embedded"] += 1
result["physical_sloc"] += 1
# Extract and analyze code parts
code_part = self._extract_code_from_mixed_line(line)
if code_part:
self._classify_code_line(code_part, result)
logical_statements += self._count_logical_statements(code_part)
else:
# Pure comment line
result["comment_whole"] += 1
elif stripped.startswith("//"):
# Line comment at start
result["comment_whole"] += 1
elif "//" in stripped:
# Line has embedded comment
code_part = stripped[: stripped.index("//")].strip()
if code_part:
result["comment_embedded"] += 1
result["physical_sloc"] += 1
self._classify_code_line(code_part, result)
logical_statements += self._count_logical_statements(code_part)
else:
# Comment at start after whitespace
result["comment_whole"] += 1
else:
# Pure code line - no comments
result["physical_sloc"] += 1
self._classify_code_line(stripped, result)
logical_statements += self._count_logical_statements(stripped)
result["logical_sloc"] = logical_statements
return result
def _find_comment_regions(self, content: str) -> List[Tuple[int, int]]:
"""Find all block comment regions (start_line, end_line)."""
regions = []
if not hasattr(self, "block_comment_pattern"):
return regions
for match in self.block_comment_pattern.finditer(content):
start_pos = match.start()
end_pos = match.end()
# Convert byte positions to line numbers
start_line = content[:start_pos].count("\n")
end_line = content[:end_pos].count("\n")
regions.append((start_line, end_line))
return regions
def _is_line_in_comment_region(
self, line_num: int, regions: List[Tuple[int, int]]
) -> bool:
"""Check if line is within a block comment region."""
for start, end in regions:
if start <= line_num <= end:
return True
return False
def _extract_code_part(self, line: str) -> str:
"""Extract code part from line with embedded comment."""
# Remove line comments
if "//" in line:
line = line[: line.index("//")]
# Remove inline block comments (simple case)
line = re.sub(r"/\*.*?\*/", "", line)
return line.strip()
def _extract_code_from_mixed_line(self, line: str) -> str:
"""Extract code from line that has both code and block comments."""
result = line
# Remove block comment parts
if "/*" in result and "*/" in result:
# Inline block comment
start = result.index("/*")
end = result.index("*/") + 2
result = result[:start] + result[end:]
elif "/*" in result:
# Comment starts on this line
result = result[: result.index("/*")]
elif "*/" in result:
# Comment ends on this line
result = result[result.index("*/") + 2 :]
return result.strip()
def _classify_code_line(self, code: str, result: Dict):
"""Classify code line as directive, data declaration, or executable."""
# Empty code doesn't count
if not code or code == ";":
return
# Check compiler directive (must be at start of code, ignoring whitespace)
if code.lstrip().startswith("#"):
result["compiler_directives"] += 1
return
# Check if it's a type/class/struct/enum definition or typedef
if re.match(r"^(?:typedef|struct|class|enum|union)\s+", code, re.IGNORECASE):
result["data_declarations"] += 1
return
# Check for variable declarations (more comprehensive patterns)
# Pattern for C/C++ declarations
is_declaration = False
# Check for common type keywords
type_keywords = [
"int",
"char",
"short",
"long",
"float",
"double",
"void",
"bool",
"unsigned",
"signed",
"const",
"static",
"extern",
"volatile",
"size_t",
"uint8_t",
"uint16_t",
"uint32_t",
"uint64_t",
"int8_t",
"int16_t",
"int32_t",
"int64_t",
]
# Check if line starts with a type keyword (after storage class specifiers)
code_lower = code.lower()
words = code.split()
if words:
# Skip storage class specifiers
first_word = words[0]
if first_word in ["static", "extern", "const", "volatile"]:
words = words[1:] if len(words) > 1 else []
if words and words[0].lower() in type_keywords:
# Likely a declaration if it has = or ;
if "=" in code or code.rstrip().endswith(";"):
is_declaration = True
# Check for function declarations (ending with );)
if re.search(r"\([^)]*\)\s*;", code):
is_declaration = True
# Check for pointer/array declarations
if re.search(r"\*\s*\w+\s*[;=\[]", code) or re.search(
r"\w+\s*\[[^\]]*\]", code
):
# But not if it's an assignment to existing var
if not re.match(r"^\s*\w+\s*\[", code): # Not array access
is_declaration = True
if is_declaration:
result["data_declarations"] += 1
return
# Default: executable instruction
# Must have actual content (not just braces or semicolons)
has_executable_content = False
# Check for control flow keywords
if any(
kw in code
for kw in [
"if",
"else",
"while",
"for",
"do",
"switch",
"case",
"return",
"break",
"continue",
"goto",
]
):
has_executable_content = True
# Check for function calls (word followed by parentheses)
if re.search(r"\w+\s*\(", code):
has_executable_content = True
# Check for assignments
if "=" in code and not "==" in code:
has_executable_content = True
# Check for braces (block delimiters count as executable)
if "{" in code or "}" in code:
has_executable_content = True
# Check for standalone semicolon (end of previous statement)
if code.strip() == ";":
has_executable_content = False
if has_executable_content:
result["exec_instructions"] += 1
def _count_logical_statements(self, code: str) -> int:
"""
Count logical statements in code line.
For C/C++: count semicolons, braces
"""
if self.language in ["c", "c++"]:
count = 0
count += code.count(";")
count += code.count("{")
count += code.count("}")
return count
# For other languages, 1 statement per non-empty line
return 1 if code.strip() else 0
def _empty_result(self) -> Dict[str, Any]:
"""Return empty result dict."""
return {
"total_lines": 0,
"blank_lines": 0,
"comment_whole": 0,
"comment_embedded": 0,
"compiler_directives": 0,
"data_declarations": 0,
"exec_instructions": 0,
"logical_sloc": 0,
"physical_sloc": 0,
}
def analyze_file_ucc_style(file_path: Path, language: Optional[str] = None) -> Dict[str, Any]:
    """
    Analyze a file with UCC-style detailed metrics.

    Args:
        file_path: Path to the source file.
        language: Language name; auto-detected from the file extension when
            None.  (Annotation fixed: the old ``language: str = None`` was
            wrong for a None default.)

    Returns:
        Dict of UCC-compatible metrics plus ``language`` and ``file`` keys.

    Raises:
        FileNotFoundError: if *file_path* does not exist.
    """
    if language is None:
        # Map extension -> language; anything unrecognized counts generically.
        suffix_map = {
            ".c": "C",
            ".h": "C",
            ".cpp": "C++",
            ".cc": "C++",
            ".cxx": "C++",
            ".hpp": "C++",
            ".hh": "C++",
            ".py": "Python",
        }
        language = suffix_map.get(file_path.suffix.lower(), "generic")
    counter = UCCExtendedCounter(language)
    result = counter.analyze_file_extended(file_path)
    result["language"] = language
    result["file"] = str(file_path)
    return result
def format_ucc_table_line(result: Dict[str, Any], file_label: Optional[str] = None) -> str:
    """
    Format a metrics dict as one UCC-style table row.

    Args:
        result: Result dict from ``analyze_file_ucc_style()``.
        file_label: Custom file label; defaults to the basename of
            ``result['file']``.  (Annotation fixed: the old
            ``file_label: str = None`` was wrong for a None default.)

    Returns:
        A formatted row matching UCC's console output.
    """
    if file_label is None:
        # Fall back to the file's basename; 'unknown' if no path was recorded.
        file_label = Path(result.get("file", "unknown")).name
    return (
        f" {result['total_lines']:4} {result['blank_lines']:3} |"
        f" {result['comment_whole']:3} {result['comment_embedded']:3} |"
        f" {result['compiler_directives']:3} {result['data_declarations']:3} {result['exec_instructions']:3} |"
        f" {result['logical_sloc']:3} {result['physical_sloc']:3} |"
        f" CODE {file_label}"
    )
def format_ucc_table_header() -> str:
    """Return the three-line UCC-style table header."""
    header_rows = (
        " Total Blank | Comments | Compiler Data Exec. | Logical Physical | File Module",
        "Lines Lines | Whole Embedded | Direct. Decl. Instr. | SLOC SLOC | Type Name",
        "-----------------+------------------+-------------------------+------------------+---------------------------",
    )
    return "\n".join(header_rows)