# dependency_analyzer/core/analyzer.py
"""
Core static analysis functionality for Python projects.
This module provides the tools to parse Python source files, extract
import statements, and classify them as standard library, external,
or project-internal modules.
"""
import ast
import importlib.metadata
import logging
import os
from pathlib import Path
from typing import Dict, Set, Tuple, Union, Optional
from .stdlib_detector import is_standard_library

# --- Logger Configuration ---
logger = logging.getLogger(__name__)

# --- Type Aliases ---
# Structure: {'pypi_name': {'locations': {'file1', 'file2'}, 'version': '1.2.3', ...}}
DependencyInfo = Dict[str, Dict[str, Union[Set[str], Optional[str]]]]
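# Example value (illustrative):
#   {"numpy": {"locations": {"src/a.py", "src/b.py"}, "version": "1.26.4",
#              "original_import_name": None}}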

# --- Constants ---
# Maps common import names to their actual PyPI package names.
MODULE_NAME_TO_PACKAGE_NAME_MAP: Dict[str, str] = {
"PIL": "Pillow",
"cv2": "opencv-python",
# Add other common mappings here if needed
}
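# For example, source code that does `import PIL` actually depends on the
# PyPI package "Pillow"; the analyzer reports the PyPI name and records the
# import name separately (see `original_import_name` below).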
# Modules that are often incorrectly identified as external.
FALSE_POSITIVE_EXTERNAL_MODULES: Set[str] = {"mpl_toolkits"}
# Modules that are known to be local to the project's ecosystem but are
# not in the standard scan path (e.g., sibling directories).
PROJECT_SPECIFIC_LOCAL_MODULES: Set[str] = {"geoelevation"}


class ImportExtractor(ast.NodeVisitor):
"""
An AST (Abstract Syntax Tree) visitor that extracts top-level module imports
from a Python source file.
"""
def __init__(self, file_path_str: str):
"""
Initializes the visitor.
Args:
file_path_str: The relative path to the file being analyzed,
used for tracking where modules are imported.
"""
super().__init__()
self.file_path_str = file_path_str
self.imported_modules: Set[Tuple[str, str]] = set()

    def visit_Import(self, node: ast.Import):
"""Handles 'import module' statements."""
for alias in node.names:
# We only care about the top-level package (e.g., 'os' from 'os.path')
module_name = alias.name.split(".")[0]
if module_name:
self.imported_modules.add((module_name, self.file_path_str))
        self.generic_visit(node)

    def visit_ImportFrom(self, node: ast.ImportFrom):
        """Handles 'from module import something' statements."""
        # Only absolute imports (level == 0) are considered; relative imports
        # (level >= 1) always refer to the project's own modules.
        if node.module and node.level == 0:
            module_name = node.module.split(".")[0]
            if module_name:
                self.imported_modules.add((module_name, self.file_path_str))
        self.generic_visit(node)
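

# Illustrative usage sketch for ImportExtractor: the helper below and its
# doctest inputs are placeholders added for demonstration, not package API.
def _extract_imports_from_source(source_code: str, file_label: str) -> Set[Tuple[str, str]]:
    """Run ImportExtractor over a source string and return the
    (module, location) pairs it finds. `file_label` is whatever path
    string should be recorded alongside each discovered import.

    >>> sorted(_extract_imports_from_source("import os.path", "demo.py"))
    [('os', 'demo.py')]
    """
    extractor = ImportExtractor(file_path_str=file_label)
    extractor.visit(ast.parse(source_code))
    return extractor.imported_modules

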
def find_project_modules_and_dependencies(
repo_path: Path,
scan_path: Path,
) -> Tuple[DependencyInfo, DependencyInfo]:
"""
Analyzes Python files, identifies project modules, and classifies dependencies.
    This function walks `scan_path`, parses each Python file it finds, and
    categorizes each imported module as either part of the standard library
    or an external dependency. Imports that refer to the project's own
    modules are ignored.
Args:
repo_path: The root path of the repository, used for reporting relative paths.
scan_path: The specific directory to start the analysis from (can be a subfolder).
Returns:
A tuple containing two dictionaries:
- std_lib_info: Information about used standard library modules.
- external_deps_info: Information about detected external dependencies.
"""
all_imports_locations: Dict[str, Set[str]] = {}
    # If scanning a sub-directory whose name equals the lower-cased repo name,
    # assume it is the main package; imports of that name are project-internal.
main_project_package_name: Optional[str] = None
if repo_path != scan_path and scan_path.name == repo_path.name.lower():
main_project_package_name = scan_path.name
logger.info(
f"Assuming '{main_project_package_name}' is the main project package."
)
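    # For example, a repo checked out at "MyTool/" and scanned at
    # "MyTool/mytool" treats `import mytool` as project-internal.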
logger.info(f"Analyzing Python files for imports in '{scan_path}'...")
    excluded_dirs = {
        "venv", ".venv", "env", ".env", "docs", "tests", "test",
        "site-packages", "dist-packages", "__pycache__", ".git", ".hg",
        ".svn", ".tox", ".nox", "build", "dist",
    }
    file_count = 0
    for root, dirs, files in os.walk(scan_path, topdown=True):
        # Prune the directory list in place (required with topdown=True) so
        # os.walk never descends into excluded folders. A glob such as
        # "*.egg-info" can never match a plain set-membership test, so
        # egg-info directories are excluded with an explicit suffix check.
        dirs[:] = [
            d for d in dirs
            if d not in excluded_dirs
            and not d.startswith(".")
            and not d.endswith(".egg-info")
        ]
current_root_path = Path(root)
for file_name in files:
if not file_name.endswith(".py"):
continue
file_path_obj = current_root_path / file_name
file_count += 1
try:
# Use repo_path as the base for user-friendly relative paths
report_rel_path_str = str(file_path_obj.relative_to(repo_path))
except ValueError:
report_rel_path_str = str(file_path_obj)
logger.warning(
f"File path '{file_path_obj}' is not relative to repo root '{repo_path}'."
)
logger.debug(f"Parsing: {report_rel_path_str}")
try:
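                # errors="ignore" silently drops undecodable bytes so files
                # with unexpected encodings are still parsed best-effort.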
with open(file_path_obj, "r", encoding="utf-8", errors="ignore") as f:
source_code = f.read()
tree = ast.parse(source_code, filename=str(file_path_obj))
extractor = ImportExtractor(file_path_str=report_rel_path_str)
extractor.visit(tree)
for module, path in extractor.imported_modules:
all_imports_locations.setdefault(module, set()).add(path)
except SyntaxError as e:
logger.warning(f"Syntax error in '{report_rel_path_str}': {e}. Skipping.")
except Exception as e:
logger.exception(f"Error processing file '{report_rel_path_str}': {e}")
logger.info(
f"Analyzed {file_count} Python files. Found {len(all_imports_locations)} unique top-level imports."
)
logger.info("Classifying imports and fetching package versions...")
std_libs: DependencyInfo = {}
external_deps: DependencyInfo = {}
for imp_module, locations in all_imports_locations.items():
# Ignore if it matches the main project package name (project-internal import)
if main_project_package_name and imp_module == main_project_package_name:
logger.info(
f"Skipping '{imp_module}' as it matches the main project package."
)
continue
# Ignore known project-specific local modules that are not on PyPI
if imp_module in PROJECT_SPECIFIC_LOCAL_MODULES:
logger.info(
f"Skipping known project-specific local module: '{imp_module}'"
)
continue
# Ignore known false positives
if imp_module in FALSE_POSITIVE_EXTERNAL_MODULES:
logger.info(f"Skipping known false positive: '{imp_module}'")
continue
# Classify as standard library or external
if is_standard_library(imp_module):
logger.debug(f"'{imp_module}' classified as standard library.")
std_libs[imp_module] = {"locations": locations, "version": None}
else:
# It's an external dependency, process it
pypi_name = MODULE_NAME_TO_PACKAGE_NAME_MAP.get(imp_module, imp_module)
orig_imp = imp_module if pypi_name != imp_module else None
logger.debug(
f"'{imp_module}' (PyPI: '{pypi_name}') is external. Fetching version..."
)
version: Optional[str] = None
try:
version = importlib.metadata.version(pypi_name)
except importlib.metadata.PackageNotFoundError:
logger.warning(
f"Version for '{pypi_name}' not found in the current environment."
)
# Aggregate information for the dependency
dep_data = external_deps.setdefault(
pypi_name,
{"locations": set(), "version": version, "original_import_name": None},
)
dep_data["locations"].update(locations)
# Record the original import name if it was mapped
if orig_imp and dep_data.get("original_import_name") is None:
dep_data["original_import_name"] = orig_imp # type: ignore
            # Backfill the version if an earlier alias of this package was
            # recorded before its version could be resolved (rare but safe).
if dep_data.get("version") is None and version is not None:
dep_data["version"] = version
logger.info(
f"Classification complete: {len(std_libs)} stdlib modules, "
f"{len(external_deps)} external dependencies."
)
return std_libs, external_deps
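

if __name__ == "__main__":
    # Minimal smoke test (an illustrative sketch, not the package's real CLI
    # entry point). Because of the relative `.stdlib_detector` import, run it
    # with `python -m` on this module's dotted path; it then analyzes the
    # current working directory and prints each external dependency found.
    logging.basicConfig(level=logging.INFO)
    project_root = Path.cwd()
    std_info, ext_info = find_project_modules_and_dependencies(
        repo_path=project_root, scan_path=project_root
    )
    print(f"stdlib modules used: {len(std_info)}")
    for name, info in sorted(ext_info.items()):
        version = info["version"] or "version unknown"
        print(f"{name} ({version}) - imported in {len(info['locations'])} file(s)")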