SXXXXXXX_PyUCC/pyucc/core/scanner.py
VALLONGOL 4fdd646d60 Chore: Stop tracking files based on .gitignore update.
Untracked files matching the following rules:
- Rule "*.zip": 1 file
2025-11-24 10:15:59 +01:00

99 lines
3.1 KiB
Python

# pyucc/core/scanner.py
from pathlib import Path
from typing import List, Iterable, Optional
import fnmatch
def _normalize_extensions(exts: Iterable[str]) -> set:
    """Normalize file extensions to lower-case, dot-prefixed form.

    Args:
        exts: Extensions with or without a leading dot (e.g. ``"py"`` or
            ``".CPP"``). A single string is treated as a comma-separated
            list, mirroring how ``find_source_files`` treats
            ``ignore_patterns``, instead of being iterated one character
            at a time.

    Returns:
        A set of unique extensions such as ``{".py", ".cpp"}``. Entries that
        are empty after stripping whitespace are dropped.
    """
    if isinstance(exts, str):
        # Iterating a bare string would yield single characters
        # ("py" -> {".p", ".y"}), so split it into entries first.
        exts = exts.split(",")
    cleaned = (ext.strip().lower() for ext in exts)
    return {e if e.startswith(".") else f".{e}" for e in cleaned if e}
def find_source_files(
    directory: Path,
    allowed_extensions: Optional[Iterable[str]] = None,
    ignore_patterns: Optional[Iterable[str]] = None,
) -> List[Path]:
    """Recursively find files under a directory.

    If `allowed_extensions` is provided (an iterable of extensions like
    ['.py', '.cpp']), only files with those suffixes (case-insensitive) are
    returned. Otherwise every file is returned except those whose suffix is
    in a small built-in set of binary/archive/document extensions.

    If `ignore_patterns` is provided, any file or directory matching any of
    the patterns (fnmatch-style) is skipped. Patterns are matched
    case-insensitively against each path component *below* `directory`
    (directory names and the file name). Components of `directory` itself
    and of its ancestors are never tested, so the location of the scanned
    tree on disk cannot cause its whole content to be excluded.

    Args:
        directory: The root directory to start scanning from.
        allowed_extensions: Optional iterable of extensions to include.
        ignore_patterns: Optional iterable of fnmatch patterns to exclude,
            or a single comma-separated string of patterns.

    Returns:
        A list of Path objects for each source file found, in the order
        produced by ``Path.rglob``.
    """
    # A very basic set of extensions to ignore when no allowed_extensions
    # is provided.
    ignored_extensions = {
        ".exe", ".dll", ".so", ".o", ".a", ".lib",
        ".jpg", ".jpeg", ".png", ".gif", ".bmp",
        ".zip", ".tar", ".gz", ".rar", ".7z",
        ".pdf", ".doc", ".docx", ".xls", ".xlsx",
        ".db", ".sqlite",
    }

    allowed = None
    if allowed_extensions:
        allowed = _normalize_extensions(allowed_extensions)

    ignores: List[str] = []
    if ignore_patterns:
        # Accept either a single comma-separated string or an iterable of
        # strings.
        if isinstance(ignore_patterns, str):
            parts = ignore_patterns.split(",")
        else:
            parts = list(ignore_patterns)
        # Normalize patterns to lower-case and drop blank entries.
        ignores = [p.strip().lower() for p in parts if p and p.strip()]

    source_files: List[Path] = []
    for path in directory.rglob("*"):
        if not path.is_file():
            continue

        if ignores:
            # Only look at components relative to the scan root; matching
            # against path.parents would also test ancestors of `directory`.
            components = [
                part.lower() for part in path.relative_to(directory).parts
            ]
            if any(
                fnmatch.fnmatch(component, pattern)
                for component in components
                for pattern in ignores
            ):
                continue

        suffix = path.suffix.lower()
        if allowed is not None:
            if suffix in allowed:
                source_files.append(path)
        elif suffix not in ignored_extensions:
            source_files.append(path)

    return source_files