221 lines
8.9 KiB
Python
221 lines
8.9 KiB
Python
# dependency_analyzer/core/analyzer.py
"""
Core static analysis functionality for Python projects.

This module provides the tools to parse Python source files, extract
import statements, and classify them as standard library, external,
or project-internal modules.
"""
|
|
import ast
|
|
import importlib.metadata
|
|
import logging
|
|
import os
|
|
from pathlib import Path
|
|
from typing import Dict, Set, Tuple, Union, Optional
|
|
|
|
from .stdlib_detector import is_standard_library
|
|
|
|
# --- Logger Configuration ---
|
|
logger = logging.getLogger(__name__)
|
|
|
|
# --- Type Aliases ---
|
|
# Structure: {'pypi_name': {'locations': {'file1', 'file2'}, 'version': '1.2.3', ...}}
|
|
DependencyInfo = Dict[str, Dict[str, Union[Set[str], Optional[str], str]]]
|
|
|
|
# --- Constants ---
|
|
# Maps common import names to their actual PyPI package names.
|
|
MODULE_NAME_TO_PACKAGE_NAME_MAP: Dict[str, str] = {
|
|
"PIL": "Pillow",
|
|
"cv2": "opencv-python",
|
|
# Add other common mappings here if needed
|
|
}
|
|
|
|
# Modules that are often incorrectly identified as external.
|
|
FALSE_POSITIVE_EXTERNAL_MODULES: Set[str] = {"mpl_toolkits"}
|
|
|
|
# Modules that are known to be local to the project's ecosystem but are
|
|
# not in the standard scan path (e.g., sibling directories).
|
|
PROJECT_SPECIFIC_LOCAL_MODULES: Set[str] = {"geoelevation"}
|
|
|
|
|
|
class ImportExtractor(ast.NodeVisitor):
|
|
"""
|
|
An AST (Abstract Syntax Tree) visitor that extracts top-level module imports
|
|
from a Python source file.
|
|
"""
|
|
|
|
def __init__(self, file_path_str: str):
|
|
"""
|
|
Initializes the visitor.
|
|
Args:
|
|
file_path_str: The relative path to the file being analyzed,
|
|
used for tracking where modules are imported.
|
|
"""
|
|
super().__init__()
|
|
self.file_path_str = file_path_str
|
|
self.imported_modules: Set[Tuple[str, str]] = set()
|
|
|
|
def visit_Import(self, node: ast.Import):
|
|
"""Handles 'import module' statements."""
|
|
for alias in node.names:
|
|
# We only care about the top-level package (e.g., 'os' from 'os.path')
|
|
module_name = alias.name.split(".")[0]
|
|
if module_name:
|
|
self.imported_modules.add((module_name, self.file_path_str))
|
|
self.generic_visit(node)
|
|
|
|
def visit_ImportFrom(self, node: ast.ImportFrom):
|
|
"""Handles 'from module import something' statements."""
|
|
# We only consider absolute imports (level=0). Relative imports are local.
|
|
if node.module and node.level == 0:
|
|
module_name = node.module.split(".")[0]
|
|
if module_name:
|
|
self.imported_modules.add((module_name, self.file_path_str))
|
|
|
|
|
|
def find_project_modules_and_dependencies(
|
|
repo_path: Path,
|
|
scan_path: Path,
|
|
) -> Tuple[DependencyInfo, DependencyInfo]:
|
|
"""
|
|
Analyzes Python files, identifies project modules, and classifies dependencies.
|
|
|
|
This function walks through the `scan_path`, parses each Python file to find
|
|
imports, and then categorizes each import as either part of the standard
|
|
library or an external dependency. It intelligently ignores imports that
|
|
refer to the project's own modules.
|
|
|
|
Args:
|
|
repo_path: The root path of the repository, used for reporting relative paths.
|
|
scan_path: The specific directory to start the analysis from (can be a subfolder).
|
|
|
|
Returns:
|
|
A tuple containing two dictionaries:
|
|
- std_lib_info: Information about used standard library modules.
|
|
- external_deps_info: Information about detected external dependencies.
|
|
"""
|
|
all_imports_locations: Dict[str, Set[str]] = {}
|
|
|
|
# If scanning a sub-directory that matches the repo name, assume it's the main package.
|
|
# Imports of this name will be ignored as project-internal.
|
|
main_project_package_name: Optional[str] = None
|
|
if repo_path != scan_path and scan_path.name == repo_path.name.lower():
|
|
main_project_package_name = scan_path.name
|
|
logger.info(
|
|
f"Assuming '{main_project_package_name}' is the main project package."
|
|
)
|
|
|
|
logger.info(f"Analyzing Python files for imports in '{scan_path}'...")
|
|
excluded_dirs = {
|
|
"venv", ".venv", "env", ".env", "docs", "tests", "test",
|
|
"site-packages", "dist-packages", "__pycache__", ".git", ".hg",
|
|
".svn", ".tox", ".nox", "build", "dist", "*.egg-info",
|
|
}
|
|
file_count = 0
|
|
|
|
for root, dirs, files in os.walk(scan_path, topdown=True):
|
|
# Prune the directory list to avoid walking into excluded folders
|
|
dirs[:] = [d for d in dirs if d not in excluded_dirs and not d.startswith(".")]
|
|
|
|
current_root_path = Path(root)
|
|
for file_name in files:
|
|
if not file_name.endswith(".py"):
|
|
continue
|
|
|
|
file_path_obj = current_root_path / file_name
|
|
file_count += 1
|
|
|
|
try:
|
|
# Use repo_path as the base for user-friendly relative paths
|
|
report_rel_path_str = str(file_path_obj.relative_to(repo_path))
|
|
except ValueError:
|
|
report_rel_path_str = str(file_path_obj)
|
|
logger.warning(
|
|
f"File path '{file_path_obj}' is not relative to repo root '{repo_path}'."
|
|
)
|
|
|
|
logger.debug(f"Parsing: {report_rel_path_str}")
|
|
try:
|
|
with open(file_path_obj, "r", encoding="utf-8", errors="ignore") as f:
|
|
source_code = f.read()
|
|
|
|
tree = ast.parse(source_code, filename=str(file_path_obj))
|
|
extractor = ImportExtractor(file_path_str=report_rel_path_str)
|
|
extractor.visit(tree)
|
|
|
|
for module, path in extractor.imported_modules:
|
|
all_imports_locations.setdefault(module, set()).add(path)
|
|
|
|
except SyntaxError as e:
|
|
logger.warning(f"Syntax error in '{report_rel_path_str}': {e}. Skipping.")
|
|
except Exception as e:
|
|
logger.exception(f"Error processing file '{report_rel_path_str}': {e}")
|
|
|
|
logger.info(
|
|
f"Analyzed {file_count} Python files. Found {len(all_imports_locations)} unique top-level imports."
|
|
)
|
|
logger.info("Classifying imports and fetching package versions...")
|
|
|
|
std_libs: DependencyInfo = {}
|
|
external_deps: DependencyInfo = {}
|
|
|
|
for imp_module, locations in all_imports_locations.items():
|
|
# Ignore if it matches the main project package name (project-internal import)
|
|
if main_project_package_name and imp_module == main_project_package_name:
|
|
logger.info(
|
|
f"Skipping '{imp_module}' as it matches the main project package."
|
|
)
|
|
continue
|
|
|
|
# Ignore known project-specific local modules that are not on PyPI
|
|
if imp_module in PROJECT_SPECIFIC_LOCAL_MODULES:
|
|
logger.info(
|
|
f"Skipping known project-specific local module: '{imp_module}'"
|
|
)
|
|
continue
|
|
|
|
# Ignore known false positives
|
|
if imp_module in FALSE_POSITIVE_EXTERNAL_MODULES:
|
|
logger.info(f"Skipping known false positive: '{imp_module}'")
|
|
continue
|
|
|
|
# Classify as standard library or external
|
|
if is_standard_library(imp_module):
|
|
logger.debug(f"'{imp_module}' classified as standard library.")
|
|
std_libs[imp_module] = {"locations": locations, "version": None}
|
|
else:
|
|
# It's an external dependency, process it
|
|
pypi_name = MODULE_NAME_TO_PACKAGE_NAME_MAP.get(imp_module, imp_module)
|
|
orig_imp = imp_module if pypi_name != imp_module else None
|
|
logger.debug(
|
|
f"'{imp_module}' (PyPI: '{pypi_name}') is external. Fetching version..."
|
|
)
|
|
|
|
version: Optional[str] = None
|
|
try:
|
|
version = importlib.metadata.version(pypi_name)
|
|
except importlib.metadata.PackageNotFoundError:
|
|
logger.warning(
|
|
f"Version for '{pypi_name}' not found in the current environment."
|
|
)
|
|
|
|
# Aggregate information for the dependency
|
|
dep_data = external_deps.setdefault(
|
|
pypi_name,
|
|
{"locations": set(), "version": version, "original_import_name": None},
|
|
)
|
|
dep_data["locations"].update(locations)
|
|
|
|
# Record the original import name if it was mapped
|
|
if orig_imp and dep_data.get("original_import_name") is None:
|
|
dep_data["original_import_name"] = orig_imp # type: ignore
|
|
|
|
# Update version if it was found now but not before (unlikely but safe)
|
|
if dep_data.get("version") is None and version is not None:
|
|
dep_data["version"] = version
|
|
|
|
logger.info(
|
|
f"Classification complete: {len(std_libs)} stdlib modules, "
|
|
f"{len(external_deps)} external dependencies."
|
|
)
|
|
return std_libs, external_deps |