# BackupApp/backup_app/core/file_scanner.py
import os
from pathlib import Path
from typing import List, Tuple, Dict, Callable, Any, Optional
from collections import defaultdict

# Tuple format for file details: (filename: str, size_mb: float, full_path: str)
FileDetail = Tuple[str, float, str]

# Divisor used to convert a byte count into the MB figure stored in FileDetail.
_BYTES_PER_MB = 1024 * 1024


def get_file_extension_stats(file_details_list: List[FileDetail]) -> Dict[str, Dict[str, Any]]:
    """
    Calculates statistics (count and total size) for each file extension.

    The grouping key is derived from the full path of each entry:

    * the lower-cased final suffix when there is one
      ("archive.tar.gz" -> ".gz", "A.TXT" -> ".txt");
    * otherwise the lower-cased file name itself, so dotfiles and
      extension-less files are keyed by name (".bashrc" -> ".bashrc",
      "Makefile" -> "makefile");
    * "(no extension)" only when the path has no file name at all
      (e.g. an empty path string).

    Args:
        file_details_list: A list of tuples, where each tuple contains
                           (filename, size_in_mb, full_path). Only the size
                           and the full path are read.

    Returns:
        A dictionary where keys are the grouping keys described above and
        values are dictionaries {'count': int, 'size': float (in MB)}.
    """
    extension_data: Dict[str, Dict[str, Any]] = defaultdict(lambda: {'count': 0, 'size': 0.0})

    for _, size_mb, file_path_str in file_details_list:
        file_path = Path(file_path_str)
        # Path.suffix is empty for dotfiles and extension-less names, in which
        # case the whole file name is used as the key instead.
        ext = (file_path.suffix or file_path.name).lower()
        if not ext:
            # Only reachable when the path has neither a suffix nor a name.
            ext = "(no extension)"

        extension_data[ext]['count'] += 1
        extension_data[ext]['size'] += size_mb

    return dict(extension_data)  # Plain dict so lookups of unknown keys don't create entries


def _normalize_exclusion_patterns(exclusion_patterns: Optional[List[str]]) -> Tuple[str, ...]:
    """Return lower-cased, dot-prefixed suffixes for the usable exclusion patterns."""
    normalized: List[str] = []
    for pattern in exclusion_patterns or ():
        cleaned = pattern.strip().lower()
        if not cleaned:
            # A blank entry would otherwise turn into "." and exclude every
            # file whose name happens to end with a dot.
            continue
        if not cleaned.startswith('.'):
            # The dot prefix makes "txt" match "a.txt" but not "myfiletxt".
            cleaned = '.' + cleaned
        normalized.append(cleaned)
    return tuple(normalized)


def scan_directory_for_files(
    source_directory_str: str,
    exclusion_patterns: List[str],
    progress_callback: Optional[Callable[[int, int, str], None]] = None
) -> Tuple[List[FileDetail], List[FileDetail], int, float]:
    """
    Scans a directory, categorizing files into included or excluded based on patterns.

    The directory tree is walked once to collect every file path (so the
    total is known for progress reporting), then each file is sized and
    classified.

    Args:
        source_directory_str: The path to the source directory to scan.
        exclusion_patterns: A list of file extension patterns to exclude
                            (e.g., [".log", ".tmp"]). Patterns are matched
                            case-insensitively at the end of the filename.
                            A leading dot is added when missing, surrounding
                            whitespace is ignored, and blank entries (or a
                            None list) exclude nothing.
        progress_callback: An optional function called once per discovered
                           file, before the file is sized, with
                           (current_file_index (0-based), total_files_to_scan,
                           current_file_path_str).

    Returns:
        A tuple containing:
        - included_files: List of FileDetail tuples for files to be included.
        - excluded_files: List of FileDetail tuples for files to be excluded.
        - total_included_count: Total number of files to be included.
        - total_included_size_mb: Total size in MB of files to be included.

        Files whose size cannot be read (OSError from stat) are skipped and
        appear in neither list.

    Raises:
        ValueError: If source_directory_str is not an existing directory.
    """
    source_path = Path(source_directory_str)
    if not source_path.is_dir():
        # This case should ideally be caught by the GUI before calling core logic
        raise ValueError(f"Source directory '{source_directory_str}' does not exist or is not a directory.")

    # Normalize the patterns once; str.endswith accepts a tuple of suffixes,
    # so each file needs a single call instead of a per-pattern loop.
    excluded_suffixes = _normalize_exclusion_patterns(exclusion_patterns)

    # First, collect all files to scan so the callback can be given a total.
    all_file_paths_to_scan: List[Path] = [
        Path(root) / file_name
        for root, _, files in os.walk(source_path)
        for file_name in files
    ]
    total_files_to_scan = len(all_file_paths_to_scan)

    included_files_list: List[FileDetail] = []
    excluded_files_list: List[FileDetail] = []

    for idx, file_path in enumerate(all_file_paths_to_scan):
        if progress_callback:
            # GUI callers might need to schedule UI updates (e.g. root.after)
            # from inside this callback rather than touching widgets directly.
            progress_callback(idx, total_files_to_scan, str(file_path))

        try:
            file_size_mb = file_path.stat().st_size / _BYTES_PER_MB
        except OSError:
            # File might be a broken symlink, or inaccessible: skip it.
            continue

        file_detail: FileDetail = (file_path.name, file_size_mb, str(file_path))

        if file_path.name.lower().endswith(excluded_suffixes):
            excluded_files_list.append(file_detail)
        else:
            included_files_list.append(file_detail)

    total_included_count = len(included_files_list)
    total_included_size_mb = sum(size for _, size, _ in included_files_list)

    return included_files_list, excluded_files_list, total_included_count, total_included_size_mb