SXXXXXXX_PyUCC/pyucc/core/scanner.py

141 lines
3.9 KiB
Python

# pyucc/core/scanner.py
from pathlib import Path
from typing import List, Iterable, Optional
import fnmatch
def _normalize_extensions(exts: Iterable[str]) -> set:
out = set()
for ext in exts:
e = ext.strip().lower()
if not e:
continue
if not e.startswith("."):
e = f".{e}"
out.add(e)
return out
def normalize_ignore_patterns(patterns: Optional[Iterable[str]]) -> Optional[List[str]]:
    """Prepare user-supplied ignore patterns for fnmatch matching.

    - The input may be one comma-separated string or an iterable of patterns.
    - Every pattern is stripped of surrounding whitespace and lower-cased;
      empty entries are discarded.
    - A pattern that starts with '.' and contains no '*', '?' or path
      separator is treated as a bare extension and prefixed with '*'
      (so '.bak' becomes '*.bak').

    Returns None when `patterns` is empty or None, otherwise the list of
    cleaned patterns (which may itself be empty).
    """
    if not patterns:
        return None

    raw_items = patterns.split(",") if isinstance(patterns, str) else list(patterns)

    # Characters whose presence means the pattern is not a bare extension.
    special_chars = ("*", "?", "/", "\\")

    cleaned_patterns = []
    for raw in raw_items:
        if not raw:
            continue
        candidate = raw.strip().lower()
        if not candidate:
            continue
        is_bare_extension = candidate.startswith(".") and not any(
            ch in candidate for ch in special_chars
        )
        cleaned_patterns.append(f"*{candidate}" if is_bare_extension else candidate)
    return cleaned_patterns
def find_source_files(
    directory: Path,
    allowed_extensions: Optional[Iterable[str]] = None,
    ignore_patterns: Optional[Iterable[str]] = None,
) -> List[Path]:
    """
    Recursively finds files in a directory.

    If `allowed_extensions` is provided (an iterable of extensions like ['.py', '.cpp']),
    only files with those suffixes (case-insensitive) are returned. Otherwise every
    file is returned except those whose suffix is in a small built-in list of
    binary/archive/document extensions.

    If `ignore_patterns` is provided, any file or directory matching any of the
    patterns (fnmatch-style) will be skipped. Patterns are matched case-insensitively
    against each path component (file and directory names) *below* `directory`;
    the names of `directory` itself and of its ancestors are never considered, so
    the location of the scan root cannot cause every file to be excluded.

    Args:
        directory: The root directory to start scanning from.
        allowed_extensions: Optional iterable of extensions to include.
        ignore_patterns: Optional iterable of fnmatch patterns to exclude.

    Returns:
        A list of Path objects for each source file found, in the order
        produced by `Path.rglob`.
    """
    # A very basic set of extensions to ignore when no allowed_extensions is provided.
    ignored_extensions = {
        ".exe",
        ".dll",
        ".so",
        ".o",
        ".a",
        ".lib",
        ".jpg",
        ".jpeg",
        ".png",
        ".gif",
        ".bmp",
        ".zip",
        ".tar",
        ".gz",
        ".rar",
        ".7z",
        ".pdf",
        ".doc",
        ".docx",
        ".xls",
        ".xlsx",
        ".db",
        ".sqlite",
    }

    allowed = _normalize_extensions(allowed_extensions) if allowed_extensions else None
    # An absent pattern list and an empty one behave the same: nothing is ignored.
    ignores = normalize_ignore_patterns(ignore_patterns) or []

    source_files: List[Path] = []
    for path in directory.rglob("*"):
        if not path.is_file():
            continue

        # Match only the components beneath the scan root (sub-directories plus
        # the file name). Using path.parents here would also test the root and
        # everything above it against the ignore patterns.
        components = [part.lower() for part in path.relative_to(directory).parts]
        if any(
            fnmatch.fnmatch(component, pattern)
            for pattern in ignores
            for component in components
        ):
            continue

        suffix = path.suffix.lower()
        if allowed is not None:
            if suffix in allowed:
                source_files.append(path)
        elif suffix not in ignored_extensions:
            source_files.append(path)
    return source_files