SXXXXXXX_PyUCC/pyucc/core/scanner.py

118 lines
3.2 KiB
Python

# pyucc/core/scanner.py
from pathlib import Path
from typing import List, Iterable, Optional
import fnmatch
def _normalize_extensions(exts: Iterable[str]) -> set:
    """Return the given extensions as a set of lower-case, dot-prefixed suffixes.

    Each entry is stripped of surrounding whitespace and lower-cased, a leading
    "." is added when it is missing, and blank entries are dropped.

    A single string is treated as a comma-separated list (for example
    ``"py, .CPP"``) instead of being iterated character by character. This
    matches how ``find_source_files`` treats a string ``ignore_patterns``.

    Args:
        exts: Iterable of extension strings, or one comma-separated string.

    Returns:
        A set of normalized extensions such as ``{".py", ".cpp"}``.
    """
    if isinstance(exts, str):
        # A bare string is itself an iterable of characters; split it so that
        # "py,cpp" yields {".py", ".cpp"} rather than {".p", ".y", ".c", ...}.
        exts = exts.split(",")
    cleaned = (ext.strip().lower() for ext in exts)
    return {e if e.startswith(".") else f".{e}" for e in cleaned if e}
def find_source_files(
    directory: Path,
    allowed_extensions: Optional[Iterable[str]] = None,
    ignore_patterns: Optional[Iterable[str]] = None,
) -> List[Path]:
    """
    Recursively find files below a directory.

    If `allowed_extensions` is provided (an iterable of extensions like
    ['.py', '.cpp']), only files with those suffixes (case-insensitive) are
    returned. Otherwise every file is returned except those whose suffix is in
    a small built-in set of binary/archive/document extensions.

    If `ignore_patterns` is provided, any file whose own name, or the name of
    any directory between `directory` and the file, matches one of the
    patterns (fnmatch-style, case-insensitive) is skipped. Only path
    components *below* `directory` are tested: the names of the scan root and
    of its ancestors never cause a file to be skipped.

    Args:
        directory: The root directory to start scanning from (a Path, or a
            string path).
        allowed_extensions: Optional iterable of extensions to include.
        ignore_patterns: Optional iterable of fnmatch patterns to exclude, or
            a single comma-separated string of patterns.

    Returns:
        A list of Path objects for each matching file, in the order produced
        by `Path.rglob`.
    """
    # A very basic set of extensions to ignore when no allowed_extensions is provided.
    ignored_extensions = {
        ".exe",
        ".dll",
        ".so",
        ".o",
        ".a",
        ".lib",
        ".jpg",
        ".jpeg",
        ".png",
        ".gif",
        ".bmp",
        ".zip",
        ".tar",
        ".gz",
        ".rar",
        ".7z",
        ".pdf",
        ".doc",
        ".docx",
        ".xls",
        ".xlsx",
        ".db",
        ".sqlite",
    }

    # Accept plain string paths as well as Path objects.
    root = Path(directory)

    allowed = _normalize_extensions(allowed_extensions) if allowed_extensions else None

    ignores: List[str] = []
    if ignore_patterns:
        # Accept either a single comma-separated string or an iterable of strings.
        if isinstance(ignore_patterns, str):
            raw_patterns = ignore_patterns.split(",")
        else:
            raw_patterns = list(ignore_patterns)
        # Normalize patterns: strip whitespace, lower-case, drop blanks.
        ignores = [p.strip().lower() for p in raw_patterns if p and p.strip()]

    source_files: List[Path] = []
    for path in root.rglob("*"):
        if not path.is_file():
            continue

        if ignores:
            # Match only against components below the scan root. Using
            # path.parents here would also test ancestors of the root, so a
            # project living under e.g. ".../build/" would lose every file
            # to an ignore pattern of "build".
            components = [part.lower() for part in path.relative_to(root).parts]
            if any(
                fnmatch.fnmatch(component, pattern)
                for pattern in ignores
                for component in components
            ):
                continue

        suffix = path.suffix.lower()
        if allowed is not None:
            if suffix in allowed:
                source_files.append(path)
        elif suffix not in ignored_extensions:
            source_files.append(path)
    return source_files