"""
UCC-compatible counter for Python files.

Implements UCC algorithms for Python with the following metrics:
- Comment Whole Lines
- Comment Embedded Lines
- Compiler Directives (import/from statements)
- Exec Instructions (all executable code in Python)
- Logical SLOC (statement count)
- Physical SLOC (non-blank, non-comment lines)

Note: Python does not distinguish between data declarations and executable
instructions, so data_declarations is always 0.
"""

import re
from pathlib import Path
from typing import Dict, List, Optional, Tuple


class UCCPythonCounter:
    """UCC-compatible counter for Python files.

    The analysis is a pipeline over the physical lines of one file:

    1. count blank lines,
    2. mask string contents with ``$`` so later passes never mistake string
       text for code or comments,
    3. count and strip comments (``#`` comments and triple-quoted blocks),
    4. count and strip import directives,
    5. count physical SLOC, logical SLOC and exec instructions on what is left.

    The counters live in ``self.results`` and are reset by every call to
    :meth:`analyze_file`.
    """

    # Python directives (imports)
    DIRECTIVES = {
        'import', 'from', 'as'
    }

    # Python exec keywords (all are exec, no data declarations)
    EXEC_KEYWORDS = {
        'and', 'as', 'assert', 'break', 'continue', 'def', 'del', 'elif',
        'else', 'except', 'exec', 'exit', 'finally', 'for', 'global', 'if',
        'in', 'is', 'lambda', 'not', 'or', 'pass', 'print', 'raise', 'return',
        'try', 'while', 'with', 'yield', 'class', 'async', 'await', 'nonlocal'
    }

    # Continuation indicators
    CONTINUATION_CHARS = {'+', '-', '*', '/', '=', '<', '>', '|', '&', '%',
                          '^', '\\', '~', ','}
    CONTINUATION_KEYWORDS = {'is', 'in', 'not', 'and', 'or'}

    # Keywords that open a compound statement; only for these does a
    # top-level ':' end the header (as opposed to an annotation such as
    # ``x: int = 1``).
    _COMPOUND_KEYWORDS = frozenset({
        'if', 'elif', 'else', 'for', 'while', 'try', 'except', 'finally',
        'with', 'def', 'class', 'async',
    })
    _QUOTES = ('"', "'")
    _COMMENT_MARKERS = ('"""', "'''", '#')
    _OPENERS = frozenset('([{')
    _CLOSERS = frozenset(')]}')
    # String prefix letters (r, b, u, f and pairs) directly before a quote.
    _STRING_PREFIX = re.compile(r'(?<!\w)[rRbBuUfF]{1,2}$')
    # Identifiers as one token, every other non-space character on its own.
    _STATEMENT_TOKEN = re.compile(r'[A-Za-z_]\w*|\S')
    # A line that ends in a bare ``else:`` (or ``elif:``) header.
    _BARE_ELSE = re.compile(r'\b(?:else|elif)\s*:$')

    def __init__(self):
        self.results = self._empty_results()

    @staticmethod
    def _empty_results() -> Dict[str, int]:
        """Return a fresh, zeroed metrics dictionary."""
        return {
            'comment_whole': 0,
            'comment_embedded': 0,
            'compiler_directives': 0,
            'data_declarations': 0,  # Always 0 for Python
            'exec_instructions': 0,
            'logical_sloc': 0,
            'physical_sloc': 0,
            'blank_lines': 0,
        }

    def analyze_file(self, file_path: Path) -> Dict[str, int]:
        """
        Analyze a Python file using UCC algorithms.

        The counters are reset first, so the returned values describe this
        file only, even when the same instance is reused for several files.

        Args:
            file_path: Path of the file to analyze.

        Returns:
            A copy of the metrics dictionary. All values are 0 when the file
            cannot be opened or read.
        """
        self.results = self._empty_results()

        try:
            # utf-8-sig also reads plain UTF-8 and drops a leading BOM, which
            # would otherwise stick to the first token of the file.
            with open(file_path, 'r', encoding='utf-8-sig', errors='ignore') as f:
                lines = f.readlines()
        except (OSError, ValueError):
            # Best effort: an unreadable file yields all-zero metrics.
            return self.results.copy()

        # Step 1: Count blank lines BEFORE any processing
        self._count_blank_lines(lines)

        # Step 2: Mask string contents
        processed_lines, original_lines = self._preprocess_lines(lines)

        # Step 3: Count and remove comments (comment_whole, comment_embedded)
        processed_lines = self._count_and_remove_comments(processed_lines, original_lines)

        # Step 4: Count directives (import/from statements)
        processed_lines = self._count_directives(processed_lines, original_lines)

        # Step 5: Count physical SLOC, logical SLOC and exec instructions
        self._count_logical_sloc(processed_lines, original_lines)

        return self.results.copy()

    def _count_blank_lines(self, lines: List[str]) -> None:
        """Count blank lines before any processing (UCC counts originally blank lines)."""
        self.results['blank_lines'] += sum(1 for line in lines if not line.strip())

    def _preprocess_lines(self, lines: List[str]) -> Tuple[List[str], List[str]]:
        """
        Mask string and comment text while keeping the line structure.

        Every character inside a string literal, and every character after a
        ``#`` that starts a comment, is replaced by ``$``. Quote delimiters
        (single, double and triple) and the ``#`` itself are kept, so the
        comment pass can find them without being fooled by string contents.

        Args:
            lines: Raw lines of the file, with or without line endings.

        Returns:
            ``(processed_lines, original_lines)``; both lists have one entry
            per input line, with line endings removed.
        """
        processed = []
        original = []
        open_triple = None  # quote character of a still-open triple-quoted string

        for raw in lines:
            text = raw.rstrip('\n\r')
            original.append(text)

            if open_triple is not None and not text.strip():
                # A blank line inside a triple-quoted string stays blank; it
                # was already counted as a blank line.
                processed.append('')
                continue

            masked, open_triple = self._mask_line(text, open_triple)
            processed.append(masked)

        return processed, original

    def _mask_line(self, text: str, open_triple: Optional[str]) -> Tuple[str, Optional[str]]:
        """
        Mask one line, carrying triple-quoted string state across lines.

        Args:
            text: The line without its line ending.
            open_triple: Quote character of the triple-quoted string that is
                open at the start of the line, or None.

        Returns:
            ``(masked_line, open_triple)`` where ``open_triple`` is the state
            at the end of the line.
        """
        out = []
        pos = 0
        size = len(text)

        while pos < size:
            if open_triple is not None:
                delimiter = open_triple * 3
                end = pos
                # A backslash escapes the next character, so skip both.
                while end < size and not text.startswith(delimiter, end):
                    end += 2 if text[end] == '\\' else 1
                end = min(end, size)
                out.append('$' * (end - pos))
                if end < size:
                    out.append(delimiter)
                    open_triple = None
                    pos = end + 3
                else:
                    pos = size
                continue

            char = text[pos]
            if char == '#':
                # Rest of the line is comment text; mask it so quotes inside
                # the comment cannot be mistaken for string delimiters.
                out.append('#' + '$' * (size - pos - 1))
                break
            if char in self._QUOTES:
                if text.startswith(char * 3, pos):
                    out.append(char * 3)
                    open_triple = char
                    pos += 3
                else:
                    masked, pos = self._mask_short_string(text, pos)
                    out.append(masked)
                continue
            out.append(char)
            pos += 1

        return ''.join(out), open_triple

    @staticmethod
    def _mask_short_string(text: str, start: int) -> Tuple[str, int]:
        """
        Mask the single- or double-quoted literal that starts at ``text[start]``.

        Returns:
            ``(masked, next_index)``: the literal with its content replaced by
            ``$`` (delimiters kept) and the index just past it. A literal that
            is not closed on this line is masked up to the end of the line.
        """
        quote = text[start]
        pieces = [quote]
        pos = start + 1
        size = len(text)

        while pos < size:
            char = text[pos]
            if char == '\\' and pos + 1 < size:
                # Escape sequence: mask the backslash and the escaped character.
                pieces.append('$$')
                pos += 2
            elif char == quote:
                pieces.append(quote)
                pos += 1
                break
            else:
                pieces.append('$')
                pos += 1

        return ''.join(pieces), pos

    def _replace_quotes(self, line: str) -> str:
        """Replace content of single and double quoted strings with $.

        Triple quotes and ``#`` get no special treatment here; the line is
        scanned only for single- and double-quoted literals.
        """
        result = []
        pos = 0

        while pos < len(line):
            if line[pos] in self._QUOTES:
                masked, pos = self._mask_short_string(line, pos)
                result.append(masked)
            else:
                result.append(line[pos])
                pos += 1

        return ''.join(result)

    def _count_and_remove_comments(self, processed: List[str], original: List[str]) -> List[str]:
        """
        Count whole and embedded comments, then remove them.

        UCC counts EVERY line in a multi-line comment block as whole.
        Triple-quoted strings are treated as block comments. A line holding
        only comment text counts as ``comment_whole``; a line holding both
        code and comment text counts as ``comment_embedded``. Blank lines are
        skipped, also inside a block.

        Args:
            processed: Masked lines from :meth:`_preprocess_lines`.
            original: The matching unmasked lines (used only for pairing).

        Returns:
            One entry per line pair: the code left on that line, or ``''``.
        """
        result = []
        open_quote = None  # quote character of a still-open block comment

        for proc_line, _orig_line in zip(processed, original):
            # Skip blank lines (already counted)
            if not proc_line.strip():
                result.append('')
                continue

            code, has_comment, open_quote = self._split_code_and_comment(proc_line, open_quote)

            if not has_comment:
                result.append(proc_line)
            elif code:
                self.results['comment_embedded'] += 1
                result.append(code)
            else:
                self.results['comment_whole'] += 1
                result.append('')

        return result

    def _split_code_and_comment(
        self, line: str, open_quote: Optional[str]
    ) -> Tuple[str, bool, Optional[str]]:
        """
        Separate the code of one masked line from its comment text.

        Args:
            line: A masked, non-blank line.
            open_quote: Quote character of the block comment that is open at
                the start of the line, or None.

        Returns:
            ``(code, has_comment, open_quote)``. ``code`` is ``''`` when the
            line holds no code. Where a triple-quoted string is used as a
            value, a ``$`` placeholder stands in for it so that
            ``x = <string>`` does not end in a dangling ``=``.
        """
        pieces = []
        has_code = False
        has_comment = False
        pos = 0
        size = len(line)

        while pos < size:
            if open_quote is not None:
                has_comment = True
                end = line.find(open_quote * 3, pos)
                if end == -1:
                    break  # the block runs past the end of this line
                open_quote = None
                pos = end + 3
                continue

            starts = [line.find(marker, pos) for marker in self._COMMENT_MARKERS]
            starts = [index for index in starts if index != -1]
            first = min(starts) if starts else size

            piece = line[pos:first]
            if first < size and line[first] != '#':
                # A prefix such as r or f belongs to the string, not the code.
                piece = self._STRING_PREFIX.sub('', piece)
            if piece.strip():
                has_code = True
            pieces.append(piece)

            if first == size:
                break
            has_comment = True
            if line[first] == '#':
                break  # everything after '#' is comment text
            pieces.append(' $ ')
            open_quote = line[first]
            pos = first + 3

        code = ''.join(pieces).strip() if has_code else ''
        return code, has_comment, open_quote

    def _bracket_delta(self, text: str) -> int:
        """Return opening minus closing brackets ((), [], {}) found in text."""
        opened = sum(text.count(char) for char in self._OPENERS)
        closed = sum(text.count(char) for char in self._CLOSERS)
        return opened - closed

    def _count_directives(self, processed: List[str], original: List[str]) -> List[str]:
        """
        Count and extract compiler directives (import/from statements).

        A directive starts on a line whose first token is in ``DIRECTIVES``
        and continues while the line ends with a backslash or a bracket is
        still open (``from x import (`` ... ``)``). Every non-blank physical
        line of a directive adds one to ``compiler_directives``.

        Returns:
            The lines with directive lines replaced by ``''``.
        """
        result = []
        in_directive = False
        depth = 0  # open brackets inside the current directive

        for proc_line, _orig_line in zip(processed, original):
            stripped = proc_line.strip()

            if not stripped:
                result.append('')
                continue

            if not in_directive:
                if stripped.split()[0] not in self.DIRECTIVES:
                    result.append(proc_line)
                    continue
                depth = 0

            self.results['compiler_directives'] += 1
            depth = max(0, depth + self._bracket_delta(stripped))
            in_directive = stripped.endswith('\\') or depth > 0
            result.append('')

        return result

    def _count_logical_sloc(self, processed: List[str], original: List[str]) -> None:
        """
        Count physical SLOC, logical SLOC and exec instructions.

        Python: all non-blank, non-comment, non-directive lines are exec.
        Physical lines are joined into one statement while a bracket is open,
        the line ends with a backslash, or the line ends with a continuation
        operator/keyword. A bare ``else:`` line counts as a physical line but
        not as a statement.
        """
        pending = ''
        depth = 0  # open brackets; clamped so a stray closer cannot go negative

        for proc_line, _orig_line in zip(processed, original):
            stripped = proc_line.strip()

            if not stripped:
                continue

            # This is a physical line (non-blank, non-comment, non-directive)
            self.results['physical_sloc'] += 1

            depth = max(0, depth + self._bracket_delta(stripped))
            pending += ' ' + stripped

            # Statement continues if:
            # 1. Inside parentheses/brackets/braces
            if depth > 0:
                continue

            # 2. Ends with continuation character
            if stripped.endswith('\\'):
                pending = pending.rstrip('\\')
                continue

            # 3. Ends with continuation operator/keyword
            last_token = self._get_last_token(stripped)
            if last_token in self.CONTINUATION_CHARS or last_token in self.CONTINUATION_KEYWORDS:
                continue

            # 4. Special case: a bare else: is not counted as a statement
            if self._BARE_ELSE.search(stripped):
                pending = ''
                continue

            self._record_statement(pending)
            pending = ''

        # Handle incomplete statement at end of file
        if pending.strip():
            self._record_statement(pending)

    def _record_statement(self, statement: str) -> None:
        """Add one completed (possibly multi-part) statement to the counters."""
        if self._is_exec_instruction(statement):
            self.results['exec_instructions'] += 1
        self.results['logical_sloc'] += self._count_statements(statement)

    def _count_statements(self, statement: str) -> int:
        """
        Count the simple statements packed into one joined statement.

        Statements are separated by ``;`` and by the ``:`` that ends a
        compound-statement header (``if x: y`` is two statements). Colons
        inside brackets, after ``lambda``, or in annotations (``x: int = 1``)
        are not separators. A header that is only ``else`` is not counted.

        Returns:
            The number of statements, at least 1.
        """
        total = 0
        depth = 0
        pending_lambdas = 0
        segment: List[str] = []  # tokens of the statement being scanned

        for token in self._STATEMENT_TOKEN.findall(statement):
            if token in self._OPENERS:
                depth += 1
            elif token in self._CLOSERS:
                depth = max(0, depth - 1)
            elif depth == 0:
                if token == 'lambda':
                    pending_lambdas += 1
                elif token == ':' and pending_lambdas:
                    pending_lambdas -= 1  # this colon belongs to a lambda
                elif token == ';' or (
                    token == ':' and segment and segment[0] in self._COMPOUND_KEYWORDS
                ):
                    if segment and segment != ['else']:
                        total += 1
                    segment = []
                    pending_lambdas = 0
                    continue
            segment.append(token)

        if segment:
            total += 1
        return max(1, total)

    def _get_last_token(self, line: str) -> str:
        """Extract last meaningful token from line.

        Returns the final character when it is a continuation character,
        otherwise the last whitespace-separated word ('' for a blank line).
        """
        trimmed = line.rstrip()
        if not trimmed:
            return ''

        # Check if last char is an operator
        if trimmed[-1] in self.CONTINUATION_CHARS:
            return trimmed[-1]

        # Extract last word
        words = trimmed.split()
        return words[-1] if words else ''

    def _is_exec_instruction(self, statement: str) -> bool:
        """Return True when the statement has any non-blank content.

        Python has no data declarations, so every statement that survives
        comment and directive removal is an exec instruction. Keyword,
        assignment and call checks would not change the answer.
        """
        return bool(statement.strip())