# ucc_py/core/scanner.py
from pathlib import Path
from typing import List, Iterable, Optional
import fnmatch

# Suffixes skipped when the caller does not supply `allowed_extensions`.
_DEFAULT_IGNORED_EXTENSIONS = frozenset(
    {
        ".exe", ".dll", ".so", ".o", ".a", ".lib",
        ".jpg", ".jpeg", ".png", ".gif", ".bmp",
        ".zip", ".tar", ".gz", ".rar", ".7z",
        ".pdf", ".doc", ".docx", ".xls", ".xlsx",
        ".db", ".sqlite",
    }
)


def _as_items(values: Iterable[str]) -> List[str]:
    """Return `values` as a list, splitting a lone string on commas.

    A bare string is itself an iterable of characters, so without this
    special case "py,cpp" would be treated as six one-character entries.
    """
    if isinstance(values, str):
        return values.split(",")
    return list(values)


def _normalize_extensions(exts: Iterable[str]) -> set:
    """Return the extensions lower-cased, stripped and dot-prefixed.

    Args:
        exts: An iterable of extensions (with or without a leading dot), or
            a single comma-separated string of them.

    Returns:
        A set such as {".py", ".cpp"}; blank entries are dropped.
    """
    out = set()
    for ext in _as_items(exts):
        e = ext.strip().lower()
        if not e:
            continue
        if not e.startswith("."):
            e = f".{e}"
        out.add(e)
    return out


def _matches_ignore(relative_path: Path, patterns: List[str]) -> bool:
    """Return True if any component of `relative_path` matches a pattern.

    `patterns` are expected to be lower-cased already; each path component
    is lower-cased before matching so the comparison is case-insensitive.
    """
    return any(
        fnmatch.fnmatch(part.lower(), pattern)
        for part in relative_path.parts
        for pattern in patterns
    )


def find_source_files(
    directory: Path,
    allowed_extensions: Optional[Iterable[str]] = None,
    ignore_patterns: Optional[Iterable[str]] = None,
) -> List[Path]:
    """
    Recursively finds files in a directory.

    If `allowed_extensions` is provided (an iterable of extensions like
    ['.py', '.cpp'], or a single comma-separated string), only files with
    those suffixes (case-insensitive) are returned. Otherwise every file is
    returned except those whose suffix is in a small built-in list of binary,
    archive, image and document extensions.

    If `ignore_patterns` is provided (an iterable of fnmatch-style patterns,
    or a single comma-separated string), any file whose name, or whose
    containing directory name, matches a pattern is skipped. Patterns are
    matched case-insensitively against each path component *below*
    `directory`; the names of `directory` itself and of its ancestors are
    never tested, so scanning a root that happens to live under e.g.
    `build/` is not affected by an ignore pattern of `build`.

    Ignored directories are still traversed; their files are filtered out
    afterwards.

    Args:
        directory: The root directory to start scanning from.
        allowed_extensions: Optional extensions to include.
        ignore_patterns: Optional fnmatch patterns to exclude.

    Returns:
        A list of Path objects for each source file found, in the order
        produced by `Path.rglob`.
    """
    allowed = None
    if allowed_extensions:
        allowed = _normalize_extensions(allowed_extensions)

    ignores = None
    if ignore_patterns:
        # Normalize patterns to lower-case and drop blank entries.
        ignores = [
            p.strip().lower()
            for p in _as_items(ignore_patterns)
            if p and p.strip()
        ]

    source_files: List[Path] = []
    for path in directory.rglob("*"):
        if not path.is_file():
            continue

        # Match only the components under the scan root; testing the full
        # ancestor chain would let the root's own location hide every file.
        if ignores and _matches_ignore(path.relative_to(directory), ignores):
            continue

        suffix = path.suffix.lower()
        if allowed is not None:
            if suffix in allowed:
                source_files.append(path)
        elif suffix not in _DEFAULT_IGNORED_EXTENSIONS:
            source_files.append(path)

    return source_files