SXXXXXXX_BackupTools/backuptools/core/file_scanner.py
2025-05-07 14:02:56 +02:00

113 lines
5.1 KiB
Python

# BackupApp/backup_app/core/file_scanner.py
import os
from collections import defaultdict
from pathlib import Path
from typing import Any, Callable, Dict, List, Optional, Tuple
# Tuple format for file details: (filename: str, size_mb: float, full_path: str)
#   filename  - the file's base name (no directory part)
#   size_mb   - the file size in megabytes, computed as bytes / (1024 * 1024)
#   full_path - the file's path as a string
FileDetail = Tuple[str, float, str]
def get_file_extension_stats(file_details_list: List[FileDetail]) -> Dict[str, Dict[str, Any]]:
    """
    Calculates statistics (count and total size) for each file extension.

    Extensions are compared case-insensitively (keys are lowercased). Only the
    last suffix counts, so "archive.tar.gz" is grouped under ".gz". A dotfile
    such as ".bashrc" has no suffix and is grouped under its own lowercased
    name. Every other file without an extension (e.g. "README", "Makefile")
    is grouped under the single key "(no extension)".

    Args:
        file_details_list: A list of tuples, where each tuple contains
            (filename, size_in_mb, full_path). The full_path is used to
            extract the extension.

    Returns:
        A dictionary where keys are extensions (e.g., '.txt') and values are
        dictionaries {'count': int, 'size': float (in MB)}.
    """
    extension_data: Dict[str, Dict[str, Any]] = defaultdict(lambda: {'count': 0, 'size': 0.0})
    for _, size_mb, file_path_str in file_details_list:
        file_path = Path(file_path_str)
        suffix = file_path.suffix
        if suffix:
            ext = suffix.lower()
        elif file_path.name.startswith('.'):
            # Path.suffix is empty for dotfiles; keep them identifiable by name.
            ext = file_path.name.lower()
        else:
            # Extension-less files share one bucket instead of one key per filename.
            ext = "(no extension)"
        bucket = extension_data[ext]
        bucket['count'] += 1
        bucket['size'] += size_mb
    return dict(extension_data)  # Plain dict so lookups of missing keys do not create entries
def _normalize_exclusion_suffixes(exclusion_patterns: Optional[List[str]]) -> Tuple[str, ...]:
    """Turns raw exclusion patterns into lowercase, dot-prefixed filename suffixes."""
    suffixes = []
    for pattern in exclusion_patterns or ():
        cleaned = pattern.strip().lower()
        if not cleaned:
            # A blank pattern would otherwise become "." and match any
            # filename that merely ends with a dot.
            continue
        if not cleaned.startswith('.'):
            # The dot prefix ensures "txt" matches "a.txt" but not "myfiletxt".
            cleaned = '.' + cleaned
        suffixes.append(cleaned)
    return tuple(suffixes)


def scan_directory_for_files(
    source_directory_str: str,
    exclusion_patterns: List[str],
    progress_callback: Optional[Callable[[int, int, str], None]] = None
) -> Tuple[List[FileDetail], List[FileDetail], int, float]:
    """
    Scans a directory, categorizing files into included or excluded based on patterns.

    The directory tree is walked once to collect every file path (so the total
    is known before progress reporting starts), then each file is stat'ed and
    classified. Files whose size cannot be read (e.g. broken symlinks or
    inaccessible files) are skipped and appear in neither list.

    Args:
        source_directory_str: The path to the source directory to scan.
        exclusion_patterns: A list of file extension patterns to exclude
            (e.g., [".log", ".tmp"]). Patterns are matched case-insensitively
            at the end of the filename. A leading dot is added when missing,
            surrounding whitespace is ignored, and blank patterns are skipped.
        progress_callback: An optional function called once per collected file
            with (current_file_index (0-based), total_files_to_scan,
            current_file_path_str). It is called before the file is stat'ed,
            so it also fires for files that end up being skipped.

    Returns:
        A tuple containing:
        - included_files: List of FileDetail tuples for files to be included.
        - excluded_files: List of FileDetail tuples for files to be excluded.
        - total_included_count: Total number of files to be included.
        - total_included_size_mb: Total size in MB of files to be included.

    Raises:
        ValueError: If source_directory_str does not exist or is not a directory.
    """
    source_path = Path(source_directory_str)
    if not source_path.is_dir():
        # This case should ideally be caught by the GUI before calling core logic
        raise ValueError(f"Source directory '{source_directory_str}' does not exist or is not a directory.")

    # Patterns are loop-invariant: normalize them once instead of per file.
    excluded_suffixes = _normalize_exclusion_suffixes(exclusion_patterns)

    # First, collect all files to scan so the callback can report a total.
    all_file_paths_to_scan: List[Path] = [
        Path(root) / file_name
        for root, _, files in os.walk(source_path)
        for file_name in files
    ]
    total_files_to_scan = len(all_file_paths_to_scan)

    included_files_list: List[FileDetail] = []
    excluded_files_list: List[FileDetail] = []
    for idx, file_path in enumerate(all_file_paths_to_scan):
        if progress_callback is not None:
            # GUI callers may need to marshal UI updates onto their own thread.
            progress_callback(idx, total_files_to_scan, str(file_path))

        try:
            file_size_bytes = file_path.stat().st_size
        except OSError:  # File might be a broken symlink, or inaccessible
            continue  # Skip this file
        file_size_mb = file_size_bytes / (1024 * 1024)

        file_detail: FileDetail = (file_path.name, file_size_mb, str(file_path))
        # str.endswith accepts a tuple of suffixes; an empty tuple never matches.
        if file_path.name.lower().endswith(excluded_suffixes):
            excluded_files_list.append(file_detail)
        else:
            included_files_list.append(file_detail)

    total_included_count = len(included_files_list)
    total_included_size_mb = sum(size for _, size, _ in included_files_list)
    return included_files_list, excluded_files_list, total_included_count, total_included_size_mb