# SXXXXXXX_PyUCC/pyucc/core/ucc_extended_counting.py

"""Extended counting module to match UCC detailed metrics.
This module provides counting capabilities that match UCC's detailed breakdown:
- Whole vs Embedded comments
- Compiler directives
- Data declarations
- Executable instructions
- Logical SLOC (statement-based counting)
"""
import logging
import re
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

_LOG = logging.getLogger(__name__)


class UCCExtendedCounter:
    """Extended counter that provides UCC-compatible detailed metrics."""

    def __init__(self, language: str = "C"):
        self.language = language.lower()
        self._setup_patterns()

    def _setup_patterns(self):
        """Set up regex patterns based on language."""
        if self.language in ["c", "c++"]:
            self._setup_c_cpp_patterns()
        elif self.language == "python":
            self._setup_python_patterns()
        else:
            self._setup_generic_patterns()

    def _setup_c_cpp_patterns(self):
        """Set up patterns for C/C++."""
        # Compiler directives
        self.directive_pattern = re.compile(
            r'^\s*#\s*(include|define|undef|ifdef|ifndef|endif|if|elif|else|'
            r'pragma|error|warning|line)\b',
            re.IGNORECASE
        )
        # Data declarations (simplified - a real implementation needs more sophisticated parsing)
        self.data_decl_pattern = re.compile(
            r'^\s*(?:extern\s+)?(?:static\s+)?(?:const\s+)?'
            r'(?:unsigned\s+)?(?:signed\s+)?'
            r'(?:int|char|short|long|float|double|void|bool|size_t|uint\w*|int\w*)\s+'
            r'(?:\*\s*)*\w+\s*(?:\[[^\]]*\])?\s*[;=]',
            re.MULTILINE
        )
        # Comments
        self.line_comment_pattern = re.compile(r'//.*$', re.MULTILINE)
        self.block_comment_pattern = re.compile(r'/\*.*?\*/', re.DOTALL)
        # Statement terminators for logical SLOC
        self.statement_terminators = [';', '{', '}']

    def _setup_python_patterns(self):
        """Set up patterns for Python."""
        self.directive_pattern = re.compile(r'^\s*(?:import|from)\s+', re.MULTILINE)
        self.data_decl_pattern = re.compile(
            r'^\s*\w+\s*:\s*(?:int|str|float|bool|list|dict|tuple|set)\s*[=]?',
            re.MULTILINE
        )
        self.line_comment_pattern = re.compile(r'#.*$', re.MULTILINE)
        self.statement_terminators = ['\n']  # Python statements end at newlines

    def _setup_generic_patterns(self):
        """Set up generic fallback patterns."""
        self.directive_pattern = re.compile(r'^\s*#', re.MULTILINE)
        self.data_decl_pattern = None
        self.line_comment_pattern = re.compile(r'#.*$|//.*$', re.MULTILINE)
        self.block_comment_pattern = re.compile(r'/\*.*?\*/|""".*?"""|\'\'\'.*?\'\'\'', re.DOTALL)
        self.statement_terminators = [';']
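    # Illustrative matches for the compiled C/C++ patterns above: directive_pattern
    # matches lines such as "#include <stdio.h>" or "  #pragma once", and
    # data_decl_pattern matches simple declarations such as "static const int count = 3;".
    # Note that these compiled patterns are not consulted by _analyze_lines() below,
    # which uses its own lightweight checks.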
    def analyze_file_extended(self, file_path: Path) -> Dict[str, Any]:
        """
        Analyze a file with UCC-compatible detailed metrics.

        Returns a dict with:
        - total_lines: total lines in the file
        - blank_lines: completely empty lines
        - comment_whole: comments on their own line
        - comment_embedded: comments on the same line as code
        - compiler_directives: preprocessor directive count
        - data_declarations: data/variable declaration count
        - exec_instructions: executable statement count
        - logical_sloc: logical source lines of code
        - physical_sloc: physical source lines of code (non-blank, non-comment-only)
        """
        if not file_path.exists():
            raise FileNotFoundError(f"File not found: {file_path}")
        try:
            with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                lines = f.readlines()
        except Exception as e:
            _LOG.error(f"Failed to read {file_path}: {e}")
            return self._empty_result()
        return self._analyze_lines(lines)
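    # Example of the returned shape (values are illustrative only):
    #   {'total_lines': 120, 'blank_lines': 10, 'comment_whole': 12, 'comment_embedded': 3,
    #    'compiler_directives': 5, 'data_declarations': 8, 'exec_instructions': 60,
    #    'logical_sloc': 73, 'physical_sloc': 95}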
    def _analyze_lines(self, lines: List[str]) -> Dict[str, Any]:
        """Analyze a list of lines and return detailed metrics."""
        result = {
            'total_lines': len(lines),
            'blank_lines': 0,
            'comment_whole': 0,
            'comment_embedded': 0,
            'compiler_directives': 0,
            'data_declarations': 0,
            'exec_instructions': 0,
            'logical_sloc': 0,
            'physical_sloc': 0,
        }
        # First pass: identify block comment boundaries (C/C++-style /* ... */;
        # string literals are not parsed, so this is a heuristic).
        in_block_comment = False
        block_comment_lines = set()
        for line_num, line in enumerate(lines):
            # Track block comment state
            if '/*' in line:
                in_block_comment = True
                block_comment_lines.add(line_num)
            if in_block_comment:
                block_comment_lines.add(line_num)
            if '*/' in line:
                in_block_comment = False
        # Second pass: classify each line
        logical_statements = 0
        for line_num, line in enumerate(lines):
            stripped = line.strip()
            # Count blank lines
            if not stripped:
                result['blank_lines'] += 1
                continue
            # Check whether this line is part of a block comment
            is_in_block = line_num in block_comment_lines
            # Analyze line type
            if is_in_block:
                # Check whether there is also code on this line (embedded comment)
                has_code_before = '/*' in line and line.index('/*') > 0 and line[:line.index('/*')].strip()
                has_code_after = '*/' in line and line.index('*/') < len(line) - 2 and line[line.index('*/') + 2:].strip()
                if has_code_before or has_code_after:
                    result['comment_embedded'] += 1
                    result['physical_sloc'] += 1
                    # Extract and analyze the code part
                    code_part = self._extract_code_from_mixed_line(line)
                    if code_part:
                        self._classify_code_line(code_part, result)
                        logical_statements += self._count_logical_statements(code_part)
                else:
                    # Pure comment line
                    result['comment_whole'] += 1
            elif stripped.startswith('//'):
                # Line comment at the start of the line
                result['comment_whole'] += 1
            elif '//' in stripped:
                # Line has an embedded comment
                code_part = stripped[:stripped.index('//')].strip()
                if code_part:
                    result['comment_embedded'] += 1
                    result['physical_sloc'] += 1
                    self._classify_code_line(code_part, result)
                    logical_statements += self._count_logical_statements(code_part)
                else:
                    # Comment starts after leading whitespace only
                    result['comment_whole'] += 1
            else:
                # Pure code line - no comments
                result['physical_sloc'] += 1
                self._classify_code_line(stripped, result)
                logical_statements += self._count_logical_statements(stripped)
        result['logical_sloc'] = logical_statements
        return result
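    # Worked example: for a C line such as
    #     x = 1; /* set x */
    # the two passes above record one embedded comment, one physical SLOC, one
    # executable instruction, and one logical statement (the single ';'), while a
    # comment sitting alone on its line is counted under comment_whole instead.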
    def _find_comment_regions(self, content: str) -> List[Tuple[int, int]]:
        """Find all block comment regions as (start_line, end_line) pairs."""
        regions = []
        if not hasattr(self, 'block_comment_pattern'):
            return regions
        for match in self.block_comment_pattern.finditer(content):
            start_pos = match.start()
            end_pos = match.end()
            # Convert character positions to line numbers
            start_line = content[:start_pos].count('\n')
            end_line = content[:end_pos].count('\n')
            regions.append((start_line, end_line))
        return regions

    def _is_line_in_comment_region(self, line_num: int, regions: List[Tuple[int, int]]) -> bool:
        """Check whether a line is within a block comment region."""
        for start, end in regions:
            if start <= line_num <= end:
                return True
        return False

    def _extract_code_part(self, line: str) -> str:
        """Extract the code part from a line with an embedded comment."""
        # Remove line comments
        if '//' in line:
            line = line[:line.index('//')]
        # Remove inline block comments (simple case)
        line = re.sub(r'/\*.*?\*/', '', line)
        return line.strip()

    def _extract_code_from_mixed_line(self, line: str) -> str:
        """Extract code from a line that has both code and block comments."""
        result = line
        # Remove block comment parts
        if '/*' in result and '*/' in result:
            # Inline block comment
            start = result.index('/*')
            end = result.index('*/') + 2
            result = result[:start] + result[end:]
        elif '/*' in result:
            # Comment starts on this line
            result = result[:result.index('/*')]
        elif '*/' in result:
            # Comment ends on this line
            result = result[result.index('*/') + 2:]
        return result.strip()
    def _classify_code_line(self, code: str, result: Dict):
        """Classify a code line as a directive, data declaration, or executable statement."""
        # Empty code doesn't count
        if not code or code == ';':
            return
        # Compiler directive (must be at the start of the code, ignoring whitespace)
        if code.lstrip().startswith('#'):
            result['compiler_directives'] += 1
            return
        # Type/class/struct/enum definition or typedef
        if re.match(r'^(?:typedef|struct|class|enum|union)\s+', code, re.IGNORECASE):
            result['data_declarations'] += 1
            return
        # Check for variable declarations (heuristic patterns for C/C++)
        is_declaration = False
        # Common type keywords
        type_keywords = [
            'int', 'char', 'short', 'long', 'float', 'double', 'void', 'bool',
            'unsigned', 'signed', 'const', 'static', 'extern', 'volatile',
            'size_t', 'uint8_t', 'uint16_t', 'uint32_t', 'uint64_t',
            'int8_t', 'int16_t', 'int32_t', 'int64_t'
        ]
        # Check whether the line starts with a type keyword (after storage class specifiers)
        words = code.split()
        if words:
            # Skip storage class specifiers
            first_word = words[0]
            if first_word in ['static', 'extern', 'const', 'volatile']:
                words = words[1:] if len(words) > 1 else []
            if words and words[0].lower() in type_keywords:
                # Likely a declaration if it has '=' or ends with ';'
                if '=' in code or code.rstrip().endswith(';'):
                    is_declaration = True
        # Check for function declarations (ending with ");")
        if re.search(r'\([^)]*\)\s*;', code):
            is_declaration = True
        # Check for pointer/array declarations
        if re.search(r'\*\s*\w+\s*[;=\[]', code) or re.search(r'\w+\s*\[[^\]]*\]', code):
            # ...but not if it is an assignment to an existing variable
            if not re.match(r'^\s*\w+\s*\[', code):  # not plain array access
                is_declaration = True
        if is_declaration:
            result['data_declarations'] += 1
            return
        # Default: executable instruction.
        # It must have actual content (not just braces or semicolons).
        has_executable_content = False
        # Control flow keywords (word-boundary match so identifiers like "gift" don't count)
        if re.search(r'\b(?:if|else|while|for|do|switch|case|return|break|continue|goto)\b', code):
            has_executable_content = True
        # Function calls (identifier followed by parentheses)
        if re.search(r'\w+\s*\(', code):
            has_executable_content = True
        # Assignments
        if '=' in code and '==' not in code:
            has_executable_content = True
        # Braces (block delimiters count as executable)
        if '{' in code or '}' in code:
            has_executable_content = True
        # A standalone semicolon only ends a previous statement
        if code.strip() == ';':
            has_executable_content = False
        if has_executable_content:
            result['exec_instructions'] += 1
    def _count_logical_statements(self, code: str) -> int:
        """
        Count logical statements in a line of code.

        For C/C++ this counts statement terminators: semicolons and braces.
        """
        if self.language in ["c", "c++"]:
            count = 0
            count += code.count(';')
            count += code.count('{')
            count += code.count('}')
            return count
        # For other languages: one statement per non-empty line
        return 1 if code.strip() else 0
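    # Worked example: with language "c", the line
    #     if (x) { y(); }
    # counts as 3 logical statements (one ';' plus the '{' and '}'), while under any
    # other language setting a non-empty line counts as a single statement.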
    def _empty_result(self) -> Dict[str, Any]:
        """Return an empty result dict."""
        return {
            'total_lines': 0,
            'blank_lines': 0,
            'comment_whole': 0,
            'comment_embedded': 0,
            'compiler_directives': 0,
            'data_declarations': 0,
            'exec_instructions': 0,
            'logical_sloc': 0,
            'physical_sloc': 0,
        }


def analyze_file_ucc_style(file_path: Path, language: Optional[str] = None) -> Dict[str, Any]:
    """
    Analyze a file with UCC-style detailed metrics.

    Args:
        file_path: Path to the source file.
        language: Language hint (auto-detected from the file extension if None).

    Returns:
        Dict with detailed UCC-compatible metrics.
    """
    if language is None:
        # Auto-detect from the file extension
        ext = file_path.suffix.lower()
        if ext in ['.c', '.h']:
            language = 'C'
        elif ext in ['.cpp', '.cc', '.cxx', '.hpp', '.hh']:
            language = 'C++'
        elif ext == '.py':
            language = 'Python'
        else:
            language = 'generic'
    counter = UCCExtendedCounter(language)
    result = counter.analyze_file_extended(file_path)
    result['language'] = language
    result['file'] = str(file_path)
    return result


def format_ucc_table_line(result: Dict[str, Any], file_label: Optional[str] = None) -> str:
    """
    Format a result as a UCC-style table line.

    Args:
        result: Result dict from analyze_file_ucc_style().
        file_label: Optional custom file label (defaults to the basename of result['file']).

    Returns:
        Formatted string matching the UCC output format.
    """
    if file_label is None:
        file_label = Path(result.get('file', 'unknown')).name
    return (
        f" {result['total_lines']:4} {result['blank_lines']:3} |"
        f" {result['comment_whole']:3} {result['comment_embedded']:3} |"
        f" {result['compiler_directives']:3} {result['data_declarations']:3} {result['exec_instructions']:3} |"
        f" {result['logical_sloc']:3} {result['physical_sloc']:3} |"
        f" CODE {file_label}"
    )
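
# Illustrative example (the numbers are made up; the spacing follows the format string above):
#   format_ucc_table_line({'total_lines': 120, 'blank_lines': 10, 'comment_whole': 12,
#                          'comment_embedded': 3, 'compiler_directives': 5,
#                          'data_declarations': 8, 'exec_instructions': 60,
#                          'logical_sloc': 73, 'physical_sloc': 95, 'file': 'main.c'})
# returns:
#   '  120  10 |  12   3 |   5   8  60 |  73  95 | CODE main.c'
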
def format_ucc_table_header() -> str:
    """Return the UCC-style table header."""
    return """ Total Blank | Comments | Compiler Data Exec. | Logical Physical | File Module
Lines Lines | Whole Embedded | Direct. Decl. Instr. | SLOC SLOC | Type Name
-----------------+------------------+-------------------------+------------------+---------------------------"""
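

# Minimal usage sketch. Assumes this module is run directly with a source-file path as
# the first command-line argument; it is illustrative only and not required by callers.
if __name__ == "__main__":
    import sys

    if len(sys.argv) > 1:
        metrics = analyze_file_ucc_style(Path(sys.argv[1]))
        print(format_ucc_table_header())
        print(format_ucc_table_line(metrics))
    else:
        print("usage: python ucc_extended_counting.py <source-file>")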