99 lines
3.1 KiB
Python
99 lines
3.1 KiB
Python
# ucc_py/core/scanner.py
|
|
|
|
from pathlib import Path
|
|
from typing import List, Iterable, Optional
|
|
import fnmatch
|
|
|
|
|
|
def _normalize_extensions(exts: Iterable[str]) -> set:
|
|
out = set()
|
|
for ext in exts:
|
|
e = ext.strip().lower()
|
|
if not e:
|
|
continue
|
|
if not e.startswith('.'):
|
|
e = f'.{e}'
|
|
out.add(e)
|
|
return out
|
|
|
|
|
|
def find_source_files(
    directory: Path,
    allowed_extensions: Optional[Iterable[str]] = None,
    ignore_patterns: Optional[Iterable[str]] = None,
) -> List[Path]:
    """
    Recursively finds files in a directory.

    If `allowed_extensions` is provided (an iterable of extensions like ['.py', '.cpp']),
    only files with those suffixes (case-insensitive) are returned. Otherwise every
    file is returned except those whose suffix is in a small built-in list of
    binary/archive/document extensions.

    If `ignore_patterns` is provided, any file or directory matching any of the
    patterns (fnmatch-style) will be skipped. Patterns are matched case-insensitively
    against each path component (file and directory names) *below* `directory`;
    the scan root and its ancestors are never matched, so the result does not
    depend on where `directory` happens to live on disk.

    Args:
        directory: The root directory to start scanning from.
        allowed_extensions: Optional iterable of extensions to include.
        ignore_patterns: Optional iterable of fnmatch patterns to exclude, or a
            single comma-separated string of patterns.

    Returns:
        A list of Path objects for each source file found, in the order
        produced by `Path.rglob`.
    """
    # A very basic set of extensions to ignore when no allowed_extensions is provided.
    ignored_extensions = {
        ".exe", ".dll", ".so", ".o", ".a", ".lib",
        ".jpg", ".jpeg", ".png", ".gif", ".bmp",
        ".zip", ".tar", ".gz", ".rar", ".7z",
        ".pdf", ".doc", ".docx", ".xls", ".xlsx",
        ".db", ".sqlite",
    }

    allowed = None
    if allowed_extensions:
        allowed = _normalize_extensions(allowed_extensions)

    ignores: List[str] = []
    if ignore_patterns:
        # Accept either a single comma-separated string or an iterable of strings.
        if isinstance(ignore_patterns, str):
            parts = ignore_patterns.split(",")
        else:
            parts = list(ignore_patterns)
        # Normalize patterns to lower-case and drop blank entries.
        ignores = [p.strip().lower() for p in parts if p and p.strip()]

    source_files: List[Path] = []

    for path in directory.rglob("*"):
        if not path.is_file():
            continue

        if ignores:
            # Only components below the scan root take part in matching.
            # Walking `path.parents` would also test the root and every
            # ancestor above it, letting an unrelated directory name higher
            # up the tree suppress all results.
            components = [
                part.lower() for part in path.relative_to(directory).parts
            ]
            # Both sides are already lower-cased, so use the variant that
            # does no further platform-dependent case normalization.
            if any(
                fnmatch.fnmatchcase(component, pat)
                for component in components
                for pat in ignores
            ):
                continue

        suffix = path.suffix.lower()
        if allowed is not None:
            keep = suffix in allowed
        else:
            keep = suffix not in ignored_extensions
        if keep:
            source_files.append(path)

    return source_files