# SXXXXXXX_PyUCC/pyucc/core/ucc_assembly_counter.py

"""
UCC-compatible counter for Assembly files.

Implements UCC algorithms for Assembly with the following metrics:
- Comment Whole Lines (;, #, |, /* */)
- Comment Embedded Lines
- Compiler Directives (assembler directives starting with .)
- Data Declarations (in .data, .bss sections)
- Exec Instructions (in .text sections)
- Logical SLOC (instruction count)
- Physical SLOC (non-blank, non-comment lines)

Assembly has distinct data and code sections (.data/.bss vs .text).
"""

import re
from pathlib import Path
from typing import Dict, List


class UCCAssemblyCounter:
    """UCC-compatible counter for Assembly files.

    The counter reads a file, strips comments while tallying them, and then
    classifies every remaining line as a section switch, an assembler
    directive, a data declaration or an executable instruction.  All
    classification is prefix/substring based (no real assembler parsing), so
    the numbers are heuristic.

    Attributes:
        results: Running counters for the file most recently analyzed.
        detected_comment_marker: Line-comment marker chosen for that file
            (one of ``COMMENT_MARKERS``), or ``None`` before any detection.
    """

    # Candidate line-comment markers, listed in detection priority order.
    COMMENT_MARKERS = ["#", ";", "|"]

    # Line prefixes (compared lower-cased) that switch to a data section.
    DATA_SECTION_MARKERS = [
        ".data",
        ".bss",
        ".const",
        ".rdata",
        ".sdata",
        ".kdata",
        ".sbss",
        ".lit",
        "section .data",
        "section .bss",
    ]

    # Line prefixes (compared lower-cased) that switch to a code section.
    TEXT_SECTION_MARKERS = [
        ".text",
        ".code",
        "section .text",
        "section .txt",
        ".init",
        ".fini",
        ".ktext",
    ]

    # Directives are lines starting with "." or "%".
    DIRECTIVE_PREFIXES = [".", "%"]

    def __init__(self):
        self.results = self._empty_results()
        self.detected_comment_marker = None

    @staticmethod
    def _empty_results() -> Dict[str, int]:
        """Return a fresh counter dictionary with every metric at zero."""
        return {
            "comment_whole": 0,
            "comment_embedded": 0,
            "compiler_directives": 0,
            "data_declarations": 0,
            "exec_instructions": 0,
            "logical_sloc": 0,
            "physical_sloc": 0,
            "blank_lines": 0,
        }

    def analyze_file(self, file_path: Path) -> Dict[str, int]:
        """Analyze one Assembly file and return its metrics.

        Counters are reset at the start of every call, so reusing one
        instance for several files yields per-file results rather than a
        running total.

        Args:
            file_path: Path of the file to read (decoded as UTF-8,
                undecodable bytes are dropped).

        Returns:
            A copy of the counter dictionary.  If the file cannot be opened
            or read, every counter is zero.
        """
        self.results = self._empty_results()
        self.detected_comment_marker = None

        try:
            with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
                lines = f.readlines()
        except Exception:
            # Deliberate best-effort: an unreadable file reports zero counts
            # instead of aborting the caller's run.
            return self.results.copy()

        # Step 1: blank lines are counted on the raw text.
        self._count_blank_lines(lines)

        # Step 2: pick the line-comment marker used by this file.
        self._detect_comment_marker(lines)

        # Step 3: tally comments and blank them out.
        processed_lines = self._count_and_remove_comments(lines)

        # Step 4: classify what is left (sections, directives, instructions).
        self._process_assembly_logic(processed_lines, lines)

        return self.results.copy()

    def _count_blank_lines(self, lines: List[str]) -> None:
        """Add the number of whitespace-only lines to ``blank_lines``."""
        self.results["blank_lines"] += sum(1 for line in lines if not line.strip())

    def _detect_comment_marker(self, lines: List[str]) -> None:
        """Choose the line-comment marker for this file.

        The first non-blank line containing a candidate marker either at its
        start or directly after a space decides the marker; candidates are
        tried in ``COMMENT_MARKERS`` order and only the first occurrence of
        each candidate in a line is inspected.  Falls back to ``";"`` when
        nothing matches.
        """
        for line in lines:
            stripped = line.strip()
            if not stripped:
                continue

            for marker in self.COMMENT_MARKERS:
                idx = stripped.find(marker)
                if idx == 0 or (idx > 0 and stripped[idx - 1] == " "):
                    self.detected_comment_marker = marker
                    return

        # Default to semicolon if no marker was detected.
        self.detected_comment_marker = ";"

    def _tally_comment(self, code: str) -> str:
        """Count one comment-bearing line and return the code left on it.

        The line is an embedded comment when ``code`` is non-empty and a
        whole-line comment otherwise.
        """
        key = "comment_embedded" if code else "comment_whole"
        self.results[key] += 1
        return code

    def _count_and_remove_comments(self, lines: List[str]) -> List[str]:
        """Tally whole/embedded comments and return the lines without them.

        Handles ``/* ... */`` block comments (single- and multi-line) and
        line comments introduced by ``detected_comment_marker``.  Block
        comments are checked first; only the first ``/*`` on a line is
        processed.

        Args:
            lines: Raw file lines.

        Returns:
            A list of the same length as ``lines``.  Blank and comment-only
            lines become ``""``; lines that carried a comment are reduced to
            their stripped code; untouched lines are passed through as-is.
        """
        result = []
        in_block = False
        marker = self.detected_comment_marker

        for line in lines:
            if not line.strip():
                result.append("")
                continue

            # Inside a multi-line /* ... */ block.
            if in_block:
                end_idx = line.find("*/")
                if end_idx == -1:
                    result.append(self._tally_comment(""))
                else:
                    # Code after the closing "*/" makes this an embedded
                    # comment, matching the single-line block case below.
                    after = line[end_idx + 2 :].strip()
                    result.append(self._tally_comment(after))
                    in_block = False
                continue

            # Block comment opening on this line.
            start_idx = line.find("/*")
            if start_idx != -1:
                before = line[:start_idx].strip()
                end_idx = line.find("*/", start_idx)

                if end_idx != -1:
                    # Opens and closes on the same line: keep code on both sides.
                    after = line[end_idx + 2 :].strip()
                    remaining = (before + " " + after).strip()
                else:
                    # Multi-line block starts here.
                    in_block = True
                    remaining = before

                result.append(self._tally_comment(remaining))
                continue

            # Line comment using the detected marker.
            if marker and marker in line:
                before = line[: line.find(marker)].strip()
                result.append(self._tally_comment(before))
            else:
                result.append(line)

        return result

    def _process_assembly_logic(
        self, processed: List[str], original: List[str]
    ) -> None:
        """Classify comment-free lines and update the SLOC counters.

        For every non-blank processed line:

        - ``physical_sloc`` is incremented (continuation fragments included).
        - Lines ending with a backslash are joined with the following line
          before classification.
        - A section marker switches between data and code mode; the marker
          line itself counts as one exec instruction and one logical SLOC.
        - Lines starting with ``.`` or ``%`` count as compiler directives,
          except ``.end*`` / ``%end*`` which add no logical SLOC.
        - Label-only lines (ending with ``:``) add no logical SLOC.
        - Anything else is split on ``;`` and each statement (after dropping
          a leading ``label:``) counts as a data declaration or an exec
          instruction depending on the current section.

        Args:
            processed: Lines with comments removed.
            original: Raw lines; kept for interface compatibility and only
                used to bound the iteration to the shorter of the two lists.
        """
        data_markers = tuple(self.DATA_SECTION_MARKERS)
        text_markers = tuple(self.TEXT_SECTION_MARKERS)
        directive_prefixes = tuple(self.DIRECTIVE_PREFIXES)

        is_data_section = False  # True while inside a data section
        accumulated = ""
        continuation = False

        for proc_line, _ in zip(processed, original):
            stripped = proc_line.strip()
            if not stripped:
                continue

            # Every non-blank, non-comment line is a physical SLOC, even when
            # it is only one fragment of a backslash-continued statement.
            self.results["physical_sloc"] += 1

            # Line continuation: stash the fragment and wait for the rest.
            if stripped.endswith("\\"):
                accumulated += " " + stripped[:-1]
                continuation = True
                continue

            if continuation:
                stripped = (accumulated + " " + stripped).strip()
                accumulated = ""
                continuation = False

            lower = stripped.lower()

            # Section switch; data markers take precedence over text markers.
            if lower.startswith(data_markers) or lower.startswith(text_markers):
                is_data_section = lower.startswith(data_markers)
                # The section declaration itself is counted as exec.
                self.results["exec_instructions"] += 1
                self.results["logical_sloc"] += 1
                continue

            # Assembler directives.
            if stripped.startswith(directive_prefixes):
                # Closing directives (.endm, %endif, ...) add no logical SLOC.
                if not lower.startswith((".end", "%end")):
                    self.results["compiler_directives"] += 1
                    self.results["logical_sloc"] += 1
                continue

            # Label-only line: nothing to count.
            if stripped.endswith(":"):
                continue

            # ";" separates statements once line comments are gone.
            for statement in (part.strip() for part in stripped.split(";")):
                if not statement:
                    continue

                # Drop a leading "label:"; skip if nothing follows it.
                if ":" in statement and not statement.split(":", 1)[1].strip():
                    continue

                self._count_statement(is_data_section)

        # A trailing continuation that never completed still counts once.
        if accumulated.strip():
            self._count_statement(is_data_section)

    def _count_statement(self, is_data_section: bool) -> None:
        """Record one logical statement as data or exec for the given section."""
        key = "data_declarations" if is_data_section else "exec_instructions"
        self.results[key] += 1
        self.results["logical_sloc"] += 1