# dependency_analyzer/core/analyzer.py
"""
Core static analysis functionality for Python projects.

This module provides the tools to parse Python source files, extract
import statements, and classify them as standard library, external, or
project-internal modules.
"""

import ast
import importlib.metadata
import logging
import os
from pathlib import Path
from typing import Dict, Optional, Set, Tuple, Union

from .stdlib_detector import is_standard_library

# --- Logger Configuration ---
logger = logging.getLogger(__name__)

# --- Type Aliases ---
# Structure: {'pypi_name': {'locations': {'file1', 'file2'}, 'version': '1.2.3', ...}}
DependencyInfo = Dict[str, Dict[str, Union[Set[str], Optional[str], str]]]

# --- Constants ---
# Maps common import names to their actual PyPI package names.
MODULE_NAME_TO_PACKAGE_NAME_MAP: Dict[str, str] = {
    "PIL": "Pillow",
    "cv2": "opencv-python",
    # Add other common mappings here if needed
}

# Modules that are often incorrectly identified as external.
FALSE_POSITIVE_EXTERNAL_MODULES: Set[str] = {"mpl_toolkits"}

# Modules that are known to be local to the project's ecosystem but are
# not in the standard scan path (e.g., sibling directories).
PROJECT_SPECIFIC_LOCAL_MODULES: Set[str] = {"geoelevation"}

# Directory names that must never be walked during the scan.
# NOTE: membership here is an *exact* name match; patterns like
# "*.egg-info" cannot live in this set (a set lookup does no globbing),
# so egg-info directories are pruned by suffix in the walk loop below.
_EXCLUDED_DIR_NAMES: Set[str] = {
    "venv",
    ".venv",
    "env",
    ".env",
    "docs",
    "tests",
    "test",
    "site-packages",
    "dist-packages",
    "__pycache__",
    ".git",
    ".hg",
    ".svn",
    ".tox",
    ".nox",
    "build",
    "dist",
}


class ImportExtractor(ast.NodeVisitor):
    """
    An AST (Abstract Syntax Tree) visitor that extracts top-level module
    imports from a Python source file.

    After ``visit()`` runs, :attr:`imported_modules` holds a set of
    ``(top_level_module_name, file_path)`` tuples.
    """

    def __init__(self, file_path_str: str):
        """
        Initializes the visitor.

        Args:
            file_path_str: The relative path to the file being analyzed,
                used for tracking where modules are imported.
        """
        super().__init__()
        self.file_path_str = file_path_str
        # Set of (top-level module name, file path) pairs found so far.
        self.imported_modules: Set[Tuple[str, str]] = set()

    def visit_Import(self, node: ast.Import) -> None:
        """Handles 'import module' statements."""
        for alias in node.names:
            # We only care about the top-level package (e.g., 'os' from 'os.path')
            module_name = alias.name.split(".")[0]
            if module_name:
                self.imported_modules.add((module_name, self.file_path_str))
        self.generic_visit(node)

    def visit_ImportFrom(self, node: ast.ImportFrom) -> None:
        """Handles 'from module import something' statements."""
        # We only consider absolute imports (level=0). Relative imports are local.
        if node.module and node.level == 0:
            module_name = node.module.split(".")[0]
            if module_name:
                self.imported_modules.add((module_name, self.file_path_str))
        self.generic_visit(node)


def _collect_imports(repo_path: Path, scan_path: Path) -> Dict[str, Set[str]]:
    """
    Walk ``scan_path``, parse every Python file, and gather top-level imports.

    Args:
        repo_path: Repository root, used to report user-friendly relative paths.
        scan_path: Directory to walk.

    Returns:
        Mapping of top-level module name -> set of relative file paths
        that import it.
    """
    all_imports_locations: Dict[str, Set[str]] = {}
    file_count = 0

    logger.info("Analyzing Python files for imports in '%s'...", scan_path)

    for root, dirs, files in os.walk(scan_path, topdown=True):
        # Prune the directory list in place so os.walk never descends into
        # excluded, hidden, or *.egg-info folders.
        dirs[:] = [
            d
            for d in dirs
            if d not in _EXCLUDED_DIR_NAMES
            and not d.startswith(".")
            and not d.endswith(".egg-info")
        ]
        current_root_path = Path(root)

        for file_name in files:
            if not file_name.endswith(".py"):
                continue

            file_path_obj = current_root_path / file_name
            file_count += 1

            try:
                # Use repo_path as the base for user-friendly relative paths
                report_rel_path_str = str(file_path_obj.relative_to(repo_path))
            except ValueError:
                report_rel_path_str = str(file_path_obj)
                logger.warning(
                    "File path '%s' is not relative to repo root '%s'.",
                    file_path_obj,
                    repo_path,
                )

            logger.debug("Parsing: %s", report_rel_path_str)
            try:
                # errors="ignore" keeps the scan going over files with odd
                # encodings; a mangled file at worst yields a SyntaxError below.
                with open(file_path_obj, "r", encoding="utf-8", errors="ignore") as f:
                    source_code = f.read()
                tree = ast.parse(source_code, filename=str(file_path_obj))
                extractor = ImportExtractor(file_path_str=report_rel_path_str)
                extractor.visit(tree)
                for module, path in extractor.imported_modules:
                    all_imports_locations.setdefault(module, set()).add(path)
            except SyntaxError as e:
                logger.warning(
                    "Syntax error in '%s': %s. Skipping.", report_rel_path_str, e
                )
            except Exception:
                # Best-effort scan: log the full traceback but keep walking.
                logger.exception("Error processing file '%s'", report_rel_path_str)

    logger.info(
        "Analyzed %d Python files. Found %d unique top-level imports.",
        file_count,
        len(all_imports_locations),
    )
    return all_imports_locations


def _classify_imports(
    all_imports_locations: Dict[str, Set[str]],
    main_project_package_name: Optional[str],
) -> Tuple[DependencyInfo, DependencyInfo]:
    """
    Split collected imports into standard-library and external dependencies.

    Project-internal names (the main package, known local modules, and
    known false positives) are skipped entirely.

    Args:
        all_imports_locations: Module name -> set of importing file paths.
        main_project_package_name: Name of the project's own package, or
            None if it could not be determined; matching imports are ignored.

    Returns:
        ``(std_lib_info, external_deps_info)`` dictionaries.
    """
    logger.info("Classifying imports and fetching package versions...")
    std_libs: DependencyInfo = {}
    external_deps: DependencyInfo = {}

    for imp_module, locations in all_imports_locations.items():
        # Ignore if it matches the main project package name (project-internal import)
        if main_project_package_name and imp_module == main_project_package_name:
            logger.info(
                "Skipping '%s' as it matches the main project package.", imp_module
            )
            continue

        # Ignore known project-specific local modules that are not on PyPI
        if imp_module in PROJECT_SPECIFIC_LOCAL_MODULES:
            logger.info(
                "Skipping known project-specific local module: '%s'", imp_module
            )
            continue

        # Ignore known false positives
        if imp_module in FALSE_POSITIVE_EXTERNAL_MODULES:
            logger.info("Skipping known false positive: '%s'", imp_module)
            continue

        # Classify as standard library or external
        if is_standard_library(imp_module):
            logger.debug("'%s' classified as standard library.", imp_module)
            std_libs[imp_module] = {"locations": locations, "version": None}
            continue

        # It's an external dependency: resolve the PyPI name and version.
        pypi_name = MODULE_NAME_TO_PACKAGE_NAME_MAP.get(imp_module, imp_module)
        orig_imp = imp_module if pypi_name != imp_module else None
        logger.debug(
            "'%s' (PyPI: '%s') is external. Fetching version...",
            imp_module,
            pypi_name,
        )
        version: Optional[str] = None
        try:
            # Version of the package installed in the *current* environment.
            version = importlib.metadata.version(pypi_name)
        except importlib.metadata.PackageNotFoundError:
            logger.warning(
                "Version for '%s' not found in the current environment.", pypi_name
            )

        # Aggregate information for the dependency (several import names
        # may map to the same PyPI package).
        dep_data = external_deps.setdefault(
            pypi_name,
            {"locations": set(), "version": version, "original_import_name": None},
        )
        dep_data["locations"].update(locations)
        # Record the original import name if it was mapped
        if orig_imp and dep_data.get("original_import_name") is None:
            dep_data["original_import_name"] = orig_imp  # type: ignore
        # Update version if it was found now but not before (unlikely but safe)
        if dep_data.get("version") is None and version is not None:
            dep_data["version"] = version

    logger.info(
        "Classification complete: %d stdlib modules, %d external dependencies.",
        len(std_libs),
        len(external_deps),
    )
    return std_libs, external_deps


def find_project_modules_and_dependencies(
    repo_path: Path,
    scan_path: Path,
) -> Tuple[DependencyInfo, DependencyInfo]:
    """
    Analyzes Python files, identifies project modules, and classifies dependencies.

    This function walks through the `scan_path`, parses each Python file to
    find imports, and then categorizes each import as either part of the
    standard library or an external dependency. It intelligently ignores
    imports that refer to the project's own modules.

    Args:
        repo_path: The root path of the repository, used for reporting relative paths.
        scan_path: The specific directory to start the analysis from (can be a subfolder).

    Returns:
        A tuple containing two dictionaries:
        - std_lib_info: Information about used standard library modules.
        - external_deps_info: Information about detected external dependencies.
    """
    # If scanning a sub-directory that matches the repo name, assume it's the
    # main package; imports of this name will be ignored as project-internal.
    # NOTE(review): only the repo name is lowercased here, so a scan dir whose
    # own name contains uppercase never matches — presumably intentional for
    # the conventional lowercase-package layout; confirm against callers.
    main_project_package_name: Optional[str] = None
    if repo_path != scan_path and scan_path.name == repo_path.name.lower():
        main_project_package_name = scan_path.name
        logger.info(
            "Assuming '%s' is the main project package.", main_project_package_name
        )

    all_imports_locations = _collect_imports(repo_path, scan_path)
    return _classify_imports(all_imports_locations, main_project_package_name)