141 lines
3.9 KiB
Python
141 lines
3.9 KiB
Python
# ucc_py/core/scanner.py
|
|
|
|
from pathlib import Path
|
|
from typing import List, Iterable, Optional
|
|
import fnmatch
|
|
|
|
|
|
def _normalize_extensions(exts: Iterable[str]) -> set:
|
|
out = set()
|
|
for ext in exts:
|
|
e = ext.strip().lower()
|
|
if not e:
|
|
continue
|
|
if not e.startswith("."):
|
|
e = f".{e}"
|
|
out.add(e)
|
|
return out
|
|
|
|
|
|
def normalize_ignore_patterns(patterns: Optional[Iterable[str]]) -> Optional[List[str]]:
    """Prepare user-supplied ignore patterns for fnmatch-style matching.

    The input may be one comma-separated string or any iterable of patterns.
    Each pattern is stripped of surrounding whitespace and lower-cased; empty
    entries are discarded. A pattern that starts with '.' and contains none of
    '*', '?', '/' or a backslash is treated as a bare extension and gets a
    leading '*' (so '.bak' becomes '*.bak').

    Returns None when `patterns` is falsy, otherwise the list of normalized
    patterns (which may be empty if every entry was blank).
    """
    if not patterns:
        return None

    raw_items = patterns.split(",") if isinstance(patterns, str) else list(patterns)

    # Any of these characters means the pattern is not a plain extension.
    blocking_chars = ("*", "?", "/", "\\")

    normalized = []
    for raw in raw_items:
        if not raw:
            continue
        cleaned = raw.strip().lower()
        if not cleaned:
            continue
        is_bare_extension = cleaned.startswith(".") and not any(
            ch in cleaned for ch in blocking_chars
        )
        normalized.append(f"*{cleaned}" if is_bare_extension else cleaned)
    return normalized
|
|
|
|
|
|
def find_source_files(
    directory: Path,
    allowed_extensions: Optional[Iterable[str]] = None,
    ignore_patterns: Optional[Iterable[str]] = None,
) -> List[Path]:
    """
    Recursively finds files in a directory.

    If `allowed_extensions` is provided (an iterable of extensions like ['.py', '.cpp']),
    only files with those suffixes (case-insensitive) are returned. Otherwise every
    file is returned except those whose suffix is in a small built-in list of
    binary/archive/document extensions.

    If `ignore_patterns` is provided, any file or directory matching any of the
    patterns (fnmatch-style) will be skipped. Patterns are matched case-insensitively
    against each path component (file and directory names) *below* `directory`.
    Components of `directory` itself and of its ancestors are never matched, so
    the location of the scan root on disk cannot cause files to be skipped.

    Args:
        directory: The root directory to start scanning from.
        allowed_extensions: Optional iterable of extensions to include.
        ignore_patterns: Optional iterable of fnmatch patterns to exclude.

    Returns:
        A list of Path objects for each source file found.
    """
    # A very basic set of extensions to ignore when no allowed_extensions is provided.
    ignored_extensions = {
        ".exe",
        ".dll",
        ".so",
        ".o",
        ".a",
        ".lib",
        ".jpg",
        ".jpeg",
        ".png",
        ".gif",
        ".bmp",
        ".zip",
        ".tar",
        ".gz",
        ".rar",
        ".7z",
        ".pdf",
        ".doc",
        ".docx",
        ".xls",
        ".xlsx",
        ".db",
        ".sqlite",
    }

    # Accept a plain string path as well as a Path.
    root = Path(directory)

    allowed = _normalize_extensions(allowed_extensions) if allowed_extensions else None
    ignores = normalize_ignore_patterns(ignore_patterns) or []

    source_files: List[Path] = []

    for path in root.rglob("*"):
        if not path.is_file():
            continue

        if ignores:
            # Only look at components below the scan root: matching against
            # `path.parents` would also test directories above `root` and
            # wrongly drop every file when an ancestor happens to match.
            components = [part.lower() for part in path.relative_to(root).parts]
            if any(
                fnmatch.fnmatch(component, pat)
                for component in components
                for pat in ignores
            ):
                continue

        suffix = path.suffix.lower()
        if allowed is not None:
            keep = suffix in allowed
        else:
            keep = suffix not in ignored_extensions

        if keep:
            source_files.append(path)

    return source_files
|