"""
|
|
Baseline manager and differ prototype.
|
|
|
|
- create baseline from directory (snapshot by default)
|
|
- load baseline metadata
|
|
- diff baseline vs current directory
|
|
- output results as dict / JSON-serializable
|
|
|
|
This is a minimal, self-contained implementation inspired by UCC's DiffTool.
|
|
"""

from __future__ import annotations

import difflib
import fnmatch
import hashlib
import json
import os
import shutil
import stat
import subprocess
import tempfile
import time
from concurrent.futures import ThreadPoolExecutor
from dataclasses import dataclass, asdict
from pathlib import Path
from typing import Dict, List, Optional, Tuple

BASELINE_ROOT_DIRNAME = ".pyucc_baselines"


def _sha1_of_file(path: Path, chunk_size: int = 8192) -> str:
    h = hashlib.sha1()
    with path.open("rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            h.update(chunk)
    return h.hexdigest()


@dataclass
class FileMeta:
    path: str  # relative path
    size: int
    mtime: float
    sha1: Optional[str] = None
    countings: Optional[Dict] = None
    metrics: Optional[Dict] = None


@dataclass
class BaselineMetadata:
    baseline_id: str
    created_at: float
    source: str  # 'local' or 'git'
    origin: Optional[str]
    project_root: str
    files: List[FileMeta]
    profile: Optional[str] = None
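
# A serialized metadata.json (as written by BaselineManager below) looks roughly like:
#   {
#     "baseline_id": "myprofile__20240101T120000_local",
#     "created_at": 1704110400.0,
#     "source": "local",            # or "git"
#     "origin": null,               # commit ref for git baselines
#     "project_root": "/abs/path/to/project",
#     "files": [{"path": "pkg/mod.py", "size": 1234, "mtime": 1704110000.0,
#                "sha1": "...", "countings": {...}, "metrics": {...}}],
#     "profile": "myprofile"
#   }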


class BaselineManager:
    def __init__(self, workspace_root: str, baselines_root: Optional[str] = None):
"""Manage baselines storage.
|
|
|
|
Args:
|
|
workspace_root: path to the project/workspace (kept for metadata usage).
|
|
baselines_root: optional absolute or relative path where baselines are stored.
|
|
If omitted, the environment variable `PYUCC_BASELINE_DIR` is consulted;
|
|
if that's not set, defaults to `./baseline` in the current working dir.
|
|
"""
        self.workspace_root = os.path.abspath(workspace_root)
        if baselines_root:
            self.baselines_root = os.path.abspath(baselines_root)
        else:
            # priority: env var, app settings, fallback to ./baseline
            env = os.getenv("PYUCC_BASELINE_DIR")
            if env:
                self.baselines_root = os.path.abspath(env)
            else:
                # try app settings if available
                try:
                    from ..config import settings as app_settings

                    sdir = app_settings.get_baseline_dir()
                except Exception:
                    sdir = None
                if sdir:
                    self.baselines_root = os.path.abspath(sdir)
                else:
                    self.baselines_root = os.path.join(os.getcwd(), "baseline")
        os.makedirs(self.baselines_root, exist_ok=True)

    def _baseline_dir(self, baseline_id: str) -> str:
        return os.path.join(self.baselines_root, baseline_id)

    def get_baseline_files_dir(self, baseline_id: str) -> str:
        """Get the directory containing the baseline snapshot files."""
        return os.path.join(self._baseline_dir(baseline_id), "files")

    def list_baselines(self) -> List[str]:
        return [
            d
            for d in os.listdir(self.baselines_root)
            if os.path.isdir(os.path.join(self.baselines_root, d))
        ]

    def get_metadata_path(self, baseline_id: str) -> str:
        return os.path.join(self._baseline_dir(baseline_id), "metadata.json")

    def create_baseline_from_dir(
        self,
        dir_path: str,
        baseline_id: Optional[str] = None,
        snapshot: bool = True,
        compute_sha1: bool = True,
        ignore_patterns: Optional[List[str]] = None,
        profile_name: Optional[str] = None,
        max_keep: int = 5,
    ) -> str:
        dir_path = os.path.abspath(dir_path)
        if baseline_id is None:
            ts = time.strftime("%Y%m%dT%H%M%S")
            # include profile name in baseline id when available
            if profile_name:
                safe_profile = profile_name.replace(" ", "_")
                baseline_id = f"{safe_profile}__{ts}_local"
            else:
                baseline_id = f"{ts}_local"
        dest = self._baseline_dir(baseline_id)
        if os.path.exists(dest):
            raise FileExistsError(dest)
        os.makedirs(dest, exist_ok=False)

        files_meta: List[FileMeta] = []

        # Walk source dir and collect metadata
        ignore_patterns = ignore_patterns or []
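        # Note: patterns are plain fnmatch globs tested against both the
        # unix-style relative path and the bare filename; fnmatch's "*" also
        # crosses "/" boundaries, so e.g. "build/*" skips everything under
        # build/ and "*.pyc" skips compiled files anywhere in the tree.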
        for root, dirs, files in os.walk(dir_path):
            for fn in files:
                fpath = os.path.join(root, fn)
                # skip the baseline storage area itself if it lives under the
                # scanned directory (os.path.commonpath raises ValueError for
                # paths on different drives, which simply means fpath is not
                # under baselines_root)
                try:
                    if (
                        os.path.commonpath([self.baselines_root, fpath])
                        == self.baselines_root
                    ):
                        continue
                except ValueError:
                    pass
                rel = os.path.relpath(fpath, dir_path)
                # check ignore patterns against relative path (unix-style)
                rel_unix = rel.replace("\\", "/")
                ignored = False
                for pat in ignore_patterns:
                    if fnmatch.fnmatch(rel_unix, pat) or fnmatch.fnmatch(fn, pat):
                        ignored = True
                        break
                if ignored:
                    continue
                try:
                    st = os.stat(fpath)
                except OSError:
                    continue
                sha1 = None
                if compute_sha1:  # also compute for 0-byte files
                    try:
                        sha1 = _sha1_of_file(Path(fpath))
                    except Exception:
                        sha1 = None
                files_meta.append(
                    FileMeta(
                        path=rel_unix, size=st.st_size, mtime=st.st_mtime, sha1=sha1
                    )
                )

        # Run per-file analyzers (countings + metrics) and attach results to each FileMeta
        try:
            from ..core.countings_impl import (
                analyze_file_counts as _analyze_file_counts,
            )
            from ..core.metrics import analyze_file_metrics as _analyze_metrics

            for fm in files_meta:
                abs_path = os.path.join(dir_path, fm.path)
                # per-file counts
                try:
                    c = _analyze_file_counts(Path(abs_path))
                    fm.countings = {
                        "physical_lines": int(c.get("physical_lines", 0)),
                        "code_lines": int(c.get("code_lines", 0)),
                        "comment_lines": int(c.get("comment_lines", 0)),
                        "blank_lines": int(c.get("blank_lines", 0)),
                    }
                except Exception:
                    fm.countings = None
                # per-file metrics
                try:
                    m = _analyze_metrics(abs_path)
                    fm.metrics = {
                        "avg_cc": float(m.get("avg_cc", 0.0)),
                        "max_cc": int(m.get("max_cc", 0)),
                        "func_count": int(m.get("func_count", 0)),
                        "mi": float(m.get("mi", 0.0)),
                    }
                except Exception:
                    fm.metrics = None
        except Exception:
            pass

        metadata = BaselineMetadata(
            baseline_id=baseline_id,
            created_at=time.time(),
            source="local",
            origin=None,
            project_root=dir_path,
            files=files_meta,
            profile=profile_name,
        )

        # Save metadata
        meta_path = self.get_metadata_path(baseline_id)
        with open(meta_path, "w", encoding="utf-8") as f:
            json.dump(self._metadata_to_dict(metadata), f, indent=2)

        # Optionally store a snapshot
        if snapshot:
            snapshot_dir = os.path.join(dest, "files")
            os.makedirs(snapshot_dir, exist_ok=True)
            # Copy only the files that were included in the baseline (respecting ignore patterns)
            for fm in files_meta:
                src_file = os.path.join(dir_path, fm.path)
                dst_file = os.path.join(snapshot_dir, fm.path)
                # Create parent directories if needed
                dst_parent = os.path.dirname(dst_file)
                if dst_parent:
                    os.makedirs(dst_parent, exist_ok=True)
                try:
                    shutil.copy2(src_file, dst_file)  # copy2 preserves metadata
                except Exception:
                    pass  # skip files that cannot be copied

            # Optionally create a zip archive (controlled by settings); only
            # meaningful when a snapshot exists, and trades speed for disk space
            try:
                from ..config import settings as app_settings

                if app_settings.get_zip_baselines():
                    zip_path = os.path.join(dest, "files.zip")
                    shutil.make_archive(
                        base_name=zip_path[:-4], format="zip", root_dir=snapshot_dir
                    )
            except Exception:
                pass  # if settings not available or zip fails, continue without zip

        # Prune old baselines if requested
        if max_keep > 0:
            self._prune_old_baselines(dir_path, profile_name, max_keep)

        return baseline_id

    def _prune_old_baselines(
        self, project_root: str, profile_name: Optional[str], keep: int = 5
    ):
        """Prune older baselines for the same project and profile, keeping `keep` newest."""
        # scan baselines root and load metadata for each baseline
        entries = []  # list of (created_at, baseline_id, path)
        for bn in os.listdir(self.baselines_root):
            bdir = os.path.join(self.baselines_root, bn)
            if not os.path.isdir(bdir):
                continue
            meta_path = os.path.join(bdir, "metadata.json")
            if not os.path.exists(meta_path):
                continue
            try:
                with open(meta_path, "r", encoding="utf-8") as f:
                    j = json.load(f)
            except Exception:
                continue
            # match by project_root and profile
            if j.get("project_root") != project_root:
                continue
            if profile_name is None:
                if j.get("profile") is not None:
                    continue
            else:
                if j.get("profile") != profile_name:
                    continue
            entries.append((j.get("created_at", 0), j.get("baseline_id", bn), bdir))

        # sort by created_at descending (newest first)
        entries.sort(key=lambda x: x[0], reverse=True)
        # remove entries beyond keep
        for _, bid, path in entries[keep:]:
            try:
                shutil.rmtree(path)
            except Exception:
                pass

    def create_baseline_from_git(
        self,
        repo_path: str,
        commit_ref: str = "HEAD",
        baseline_id: Optional[str] = None,
        snapshot: bool = True,
        compute_sha1: bool = True,
        ignore_patterns: Optional[List[str]] = None,
        profile_name: Optional[str] = None,
        max_keep: int = 5,
    ) -> str:
        """Create a baseline by exporting a git commit (using `git archive`).

        This method requires that `git` is available in PATH. It will create a zip
        archive of the requested commit and then build the baseline metadata from
        the extracted tree.
        """
        repo_path = os.path.abspath(repo_path)
        if baseline_id is None:
            ts = time.strftime("%Y%m%dT%H%M%S")
            if profile_name:
                safe_profile = profile_name.replace(" ", "_")
                baseline_id = f"{safe_profile}__{ts}_git_{commit_ref}"
            else:
                baseline_id = f"{ts}_git_{commit_ref}"
        dest = self._baseline_dir(baseline_id)
        if os.path.exists(dest):
            raise FileExistsError(dest)
        os.makedirs(dest, exist_ok=False)

        # create a temporary zip with git archive
        zip_tmp = os.path.join(dest, "export.zip")
        try:
            subprocess.run(
                ["git", "archive", "--format=zip", "-o", zip_tmp, commit_ref],
                cwd=repo_path,
                check=True,
            )
        except subprocess.CalledProcessError as e:
            raise RuntimeError(f"git archive failed: {e}")

        # extract zip to a temp dir and build metadata similarly to dir baseline
        extract_dir = os.path.join(dest, "extracted")
        os.makedirs(extract_dir, exist_ok=True)
        shutil.unpack_archive(zip_tmp, extract_dir)

        # reuse create_baseline_from_dir logic but avoid creating nested baseline dir
        files_meta: List[FileMeta] = []
        ignore_patterns = ignore_patterns or []
        for root, dirs, files in os.walk(extract_dir):
            for fn in files:
                fpath = os.path.join(root, fn)
                rel = os.path.relpath(fpath, extract_dir)
                rel_unix = rel.replace("\\", "/")
                # apply ignore patterns
                ignored = False
                for pat in ignore_patterns:
                    if fnmatch.fnmatch(rel_unix, pat) or fnmatch.fnmatch(fn, pat):
                        ignored = True
                        break
                if ignored:
                    continue
                try:
                    st = os.stat(fpath)
                except OSError:
                    continue
                sha1 = None
                if compute_sha1 and st.st_size > 0:
                    try:
                        sha1 = _sha1_of_file(Path(fpath))
                    except Exception:
                        sha1 = None
                files_meta.append(
                    FileMeta(
                        path=rel_unix, size=st.st_size, mtime=st.st_mtime, sha1=sha1
                    )
                )

        # attempt to run per-file analyzers on the extracted tree and attach results
        try:
            from ..core.countings_impl import (
                analyze_file_counts as _analyze_file_counts,
            )
            from ..core.metrics import analyze_file_metrics as _analyze_metrics

            for fm in files_meta:
                abs_path = os.path.join(extract_dir, fm.path)
                try:
                    c = _analyze_file_counts(Path(abs_path))
                    fm.countings = {
                        "physical_lines": int(c.get("physical_lines", 0)),
                        "code_lines": int(c.get("code_lines", 0)),
                        "comment_lines": int(c.get("comment_lines", 0)),
                        "blank_lines": int(c.get("blank_lines", 0)),
                    }
                except Exception:
                    fm.countings = None
                try:
                    m = _analyze_metrics(abs_path)
                    fm.metrics = {
                        "avg_cc": float(m.get("avg_cc", 0.0)),
                        "max_cc": int(m.get("max_cc", 0)),
                        "func_count": int(m.get("func_count", 0)),
                        "mi": float(m.get("mi", 0.0)),
                    }
                except Exception:
                    fm.metrics = None
        except Exception:
            pass

        metadata = BaselineMetadata(
            baseline_id=baseline_id,
            created_at=time.time(),
            source="git",
            origin=commit_ref,
            project_root=repo_path,
            files=files_meta,
            profile=profile_name,
        )

        meta_path = self.get_metadata_path(baseline_id)
        with open(meta_path, "w", encoding="utf-8") as f:
            json.dump(self._metadata_to_dict(metadata), f, indent=2)

        # Optionally keep the extracted tree (snapshot)
        if snapshot:
            # move extracted content into dest/files
            snapshot_dir = os.path.join(dest, "files")
            shutil.move(extract_dir, snapshot_dir)

            # Optionally create zip archive from the files directory
            try:
                from ..config import settings as app_settings

                if app_settings.get_zip_baselines():
                    # Keep both files/ and create files.zip
                    zip_archive = os.path.join(dest, "files.zip")
                    shutil.make_archive(
                        base_name=zip_archive[:-4], format="zip", root_dir=snapshot_dir
                    )
            except Exception:
                pass

            # Always remove the git export zip (it was just temporary)
            try:
                os.remove(zip_tmp)
            except Exception:
                pass
        else:
            # remove extracted files and zip
            shutil.rmtree(extract_dir, ignore_errors=True)
            try:
                os.remove(zip_tmp)
            except Exception:
                pass

        # prune old baselines if requested
        if max_keep > 0:
            self._prune_old_baselines(repo_path, profile_name, max_keep)

        return baseline_id

    def load_metadata(self, baseline_id: str) -> BaselineMetadata:
        meta_path = self.get_metadata_path(baseline_id)
        with open(meta_path, "r", encoding="utf-8") as f:
            j = json.load(f)
        files = [FileMeta(**fm) for fm in j["files"]]
        return BaselineMetadata(
            baseline_id=j["baseline_id"],
            created_at=j["created_at"],
            source=j.get("source", "local"),
            origin=j.get("origin"),
            project_root=j.get("project_root", ""),
            files=files,
            profile=j.get("profile"),
        )

    def _metadata_to_dict(self, meta: BaselineMetadata) -> Dict:
        d = asdict(meta)
        # asdict already converts nested FileMeta dataclasses; the explicit pass
        # below keeps the JSON layout of "files" obvious at a glance
        d["files"] = [asdict(fm) for fm in meta.files]
        return d


class Differ:
    def __init__(
        self,
        baseline: BaselineMetadata,
        current_dir: str,
        max_workers: int = 4,
        ignore_patterns: Optional[List[str]] = None,
        baseline_files_dir: Optional[str] = None,
    ):
        self.baseline = baseline
        self.current_dir = os.path.abspath(current_dir)
        self.max_workers = max_workers
        self.ignore_patterns = ignore_patterns or []
        self._current_files_cache: Optional[List[FileMeta]] = None
        # baseline_files_dir is the directory containing the baseline snapshot files.
        # If not provided, falls back to baseline.project_root (for backwards compatibility).
        self.baseline_files_dir = (
            baseline_files_dir if baseline_files_dir else baseline.project_root
        )

    def build_current_file_list(self) -> List[FileMeta]:
        # Return cached result if already computed
        if self._current_files_cache is not None:
            return self._current_files_cache

        files_meta: List[FileMeta] = []
        for root, dirs, files in os.walk(self.current_dir):
            for fn in files:
                fpath = os.path.join(root, fn)
                rel = os.path.relpath(fpath, self.current_dir)
                # apply ignore patterns from profile: test against relative path (unix-style) and filename
                rel_unix = rel.replace("\\", "/")
                ignored = False
                for pat in self.ignore_patterns or []:
                    if fnmatch.fnmatch(rel_unix, pat) or fnmatch.fnmatch(fn, pat):
                        ignored = True
                        break
                if ignored:
                    continue
                try:
                    st = os.stat(fpath)
                except OSError:
                    continue
                sha1 = None
                # Compute SHA1 for all files, including 0-byte files
                try:
                    sha1 = _sha1_of_file(Path(fpath))
                except Exception:
                    sha1 = None
                files_meta.append(
                    FileMeta(
                        path=rel_unix,
                        size=st.st_size,
                        mtime=st.st_mtime,
                        sha1=sha1,
                    )
                )

        # Run per-file analyzers (countings + metrics) and attach results to each FileMeta.
        # This ensures current files have the same data as baseline files for comparison.
        try:
            from ..core.countings_impl import (
                analyze_file_counts as _analyze_file_counts,
            )
            from ..core.metrics import analyze_file_metrics as _analyze_metrics

            for fm in files_meta:
                abs_path = os.path.join(self.current_dir, fm.path)
                # per-file counts
                try:
                    c = _analyze_file_counts(Path(abs_path))
                    fm.countings = {
                        "physical_lines": int(c.get("physical_lines", 0)),
                        "code_lines": int(c.get("code_lines", 0)),
                        "comment_lines": int(c.get("comment_lines", 0)),
                        "blank_lines": int(c.get("blank_lines", 0)),
                    }
                except Exception:
                    fm.countings = None
                # per-file metrics
                try:
                    m = _analyze_metrics(abs_path)
                    fm.metrics = {
                        "avg_cc": float(m.get("avg_cc", 0.0)),
                        "max_cc": int(m.get("max_cc", 0)),
                        "func_count": int(m.get("func_count", 0)),
                        "mi": float(m.get("mi", 0.0)),
                    }
                except Exception:
                    fm.metrics = None
        except Exception:
            pass

        # Cache the result to avoid recomputation
        self._current_files_cache = files_meta
        return files_meta

    @staticmethod
    def _index_by_name(files: List[FileMeta]) -> Dict[str, List[FileMeta]]:
        idx: Dict[str, List[FileMeta]] = {}
        for f in files:
            name = os.path.basename(f.path)
            idx.setdefault(name, []).append(f)
        return idx

    @staticmethod
    def _levenshtein(a: str, b: str) -> int:
        # simple DP implementation
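        # e.g. _levenshtein("kitten", "sitting") == 3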
        la, lb = len(a), len(b)
        if la == 0:
            return lb
        if lb == 0:
            return la
        prev = list(range(lb + 1))
        for i, ca in enumerate(a, start=1):
            cur = [i] + [0] * lb
            for j, cb in enumerate(b, start=1):
                add = prev[j] + 1
                delete = cur[j - 1] + 1
                change = prev[j - 1] + (0 if ca == cb else 1)
                cur[j] = min(add, delete, change)
            prev = cur
        return prev[lb]

    def match_files(
        self, baseline_files: List[FileMeta], current_files: List[FileMeta]
    ) -> List[Tuple[Optional[FileMeta], Optional[FileMeta]]]:
        # Implement Gale-Shapley stable matching inspired by UCC logic.
        # Build maps by filename only (candidates must share the same filename)
        mapA_by_name = self._index_by_name(baseline_files)
        mapB_by_name = self._index_by_name(current_files)

        # Build preference lists (for A: list of B candidates sorted by path distance)
        prefsA: Dict[str, List[FileMeta]] = {}  # key: a.path
        prefsB: Dict[str, List[FileMeta]] = {}

        # helper: compute preference value between two file paths (parent dirs)
        def pref_val(pa: str, pb: str) -> int:
            parent_a = os.path.dirname(pa)
            parent_b = os.path.dirname(pb)
            return self._levenshtein(parent_a, parent_b)
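
        # e.g. for baseline "src/util.py", the candidate "src/util.py" scores
        # pref_val == 0 (identical parent dirs) while "lib/util.py" scores 3,
        # so the closer path is preferred when the lists are sorted below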

        # populate preferences A -> Bs
        for a in baseline_files:
            candidates = mapB_by_name.get(os.path.basename(a.path), [])
            # compute scores and sort
            scored = [(pref_val(a.path, b.path), b) for b in candidates]
            scored.sort(key=lambda x: x[0])
            prefsA[a.path] = [b for (_s, b) in scored]

        # populate preferences B -> As
        for b in current_files:
            candidates = mapA_by_name.get(os.path.basename(b.path), [])
            scored = [(pref_val(a.path, b.path), a) for a in candidates]
            scored.sort(key=lambda x: x[0])
            prefsB[b.path] = [a for (_s, a) in scored]

        # Prepare Gale-Shapley structures
        freeA = [a for a in baseline_files]
        next_proposal_index: Dict[str, int] = {a.path: 0 for a in baseline_files}
        matchA: Dict[str, Optional[FileMeta]] = {a.path: None for a in baseline_files}
        matchB: Dict[str, Optional[FileMeta]] = {b.path: None for b in current_files}

        # For quick comparison, build rank maps for B preferences
        rankB: Dict[str, Dict[str, int]] = {}
        for b in current_files:
            rank = {}
            plist = prefsB.get(b.path, [])
            for idx, a in enumerate(plist):
                rank[a.path] = idx
            rankB[b.path] = rank

        while freeA:
            a = freeA.pop(0)
            a_key = a.path
            plist = prefsA.get(a_key, [])
            if not plist:
                # no candidates
                matchA[a_key] = None
                continue
            # propose to next candidate
            i = next_proposal_index[a_key]
            if i >= len(plist):
                matchA[a_key] = None
                continue
            b = plist[i]
            next_proposal_index[a_key] = i + 1
            b_key = b.path
            current = matchB.get(b_key)
            if current is None:
                # b accepts
                matchA[a_key] = b
                matchB[b_key] = a
            else:
                # b decides preference between current and proposer
                rank_map = rankB.get(b_key, {})
                r_current = rank_map.get(current.path, float("inf"))
                r_proposer = rank_map.get(a_key, float("inf"))
                if r_proposer < r_current:
                    # b prefers new proposer
                    matchA[a_key] = b
                    matchB[b_key] = a
                    # previous current becomes free again
                    matchA[current.path] = None
                    freeA.append(current)
                else:
                    # b rejects proposer -> proposer remains free (if more prefs)
                    freeA.append(a)

        # Build results list: pairs for matched A entries
        results: List[Tuple[Optional[FileMeta], Optional[FileMeta]]] = []
        usedB = set()
        for a in baseline_files:
            b = matchA.get(a.path)
            if b is None:
                results.append((a, None))
            else:
                results.append((a, b))
                usedB.add(b.path)

        # Any B not matched are added as (None, b)
        for b in current_files:
            if b.path not in usedB:
                results.append((None, b))

        return results

    @staticmethod
    def _diff_file_pair(fileA_path: Optional[str], fileB_path: Optional[str]) -> Dict:
        res = {"added": 0, "deleted": 0, "modified": 0, "unmodified": 0}
        if fileA_path is None and fileB_path is None:
            return res
        if fileA_path is None:
            # all lines are added
            try:
                with open(fileB_path, "r", encoding="utf-8", errors="ignore") as f:
                    lines = f.readlines()
                res["added"] = len(lines)
            except Exception:
                res["added"] = 0
            return res
        if fileB_path is None:
            try:
                with open(fileA_path, "r", encoding="utf-8", errors="ignore") as f:
                    lines = f.readlines()
                res["deleted"] = len(lines)
            except Exception:
                res["deleted"] = 0
            return res

        # both exist; line-based diff
        try:
            with open(fileA_path, "r", encoding="utf-8", errors="ignore") as fa:
                a_lines = fa.readlines()
        except Exception:
            a_lines = []
        try:
            with open(fileB_path, "r", encoding="utf-8", errors="ignore") as fb:
                b_lines = fb.readlines()
        except Exception:
            b_lines = []

        sm = difflib.SequenceMatcher(a=a_lines, b=b_lines)

        # DEBUG: Log if files have the same line count but difflib finds differences
        has_differences = False
        for tag, i1, i2, j1, j2 in sm.get_opcodes():
            if tag != "equal":
                has_differences = True
                break

        if has_differences and len(a_lines) == len(b_lines):
            # Files have same line count but difflib sees differences
            print("[DIFFER] ANOMALY DETECTED:")
            print(f"  FileA: {fileA_path}")
            print(f"  FileB: {fileB_path}")
            print(f"  Lines: {len(a_lines)} vs {len(b_lines)}")
            # Check first differing line
            for i, (line_a, line_b) in enumerate(zip(a_lines, b_lines)):
                if line_a != line_b:
                    print(f"  First diff at line {i + 1}:")
                    print(f"    A: {repr(line_a[:80])}")
                    print(f"    B: {repr(line_b[:80])}")
                    break
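
        # A "replace" opcode spanning la baseline lines and lb current lines is
        # counted below as min(la, lb) modified lines plus the surplus as added
        # or deleted, e.g. 3 lines replaced by 5 -> 3 modified + 2 added.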
        for tag, i1, i2, j1, j2 in sm.get_opcodes():
            if tag == "equal":
                res["unmodified"] += i2 - i1
            elif tag == "delete":
                res["deleted"] += i2 - i1
            elif tag == "insert":
                res["added"] += j2 - j1
            elif tag == "replace":
                la = i2 - i1
                lb = j2 - j1
                res["modified"] += min(la, lb)
                if la > lb:
                    res["deleted"] += la - lb
                elif lb > la:
                    res["added"] += lb - la
        return res

    def diff(self) -> Dict:
        baseline_files = self.baseline.files
        current_files = self.build_current_file_list()
        pairs = self.match_files(baseline_files, current_files)

        total = {"added": 0, "deleted": 0, "modified": 0, "unmodified": 0}
        matched_results = []

        # helper to construct absolute paths (kept for convenience; not used below)
        def abs_path_for(meta: Optional[FileMeta]) -> Optional[str]:
            return (
                os.path.join(self.current_dir, meta.path) if meta is not None else None
            )

        # process pairs possibly in parallel
        tasks = []
        with ThreadPoolExecutor(max_workers=self.max_workers) as ex:
            futures = []
            for a, b in pairs:
                fa = (
                    os.path.join(self.baseline_files_dir, a.path)
                    if a is not None
                    else None
                )
                fb = os.path.join(self.current_dir, b.path) if b is not None else None
                futures.append(ex.submit(self._diff_file_pair, fa, fb))
            for (a, b), fut in zip(pairs, futures):
                res = fut.result()
                total["added"] += res["added"]
                total["deleted"] += res["deleted"]
                total["modified"] += res["modified"]
                total["unmodified"] += res["unmodified"]

                # Extract countings and metrics from baseline and current files
                baseline_countings = (
                    a.countings if (a is not None and hasattr(a, "countings")) else None
                )
                baseline_metrics = (
                    a.metrics if (a is not None and hasattr(a, "metrics")) else None
                )
                current_countings = (
                    b.countings if (b is not None and hasattr(b, "countings")) else None
                )
                current_metrics = (
                    b.metrics if (b is not None and hasattr(b, "metrics")) else None
                )

                # Compute deltas for countings
                countings_delta = None
                if baseline_countings and current_countings:
                    countings_delta = {
                        "physical_lines": current_countings.get("physical_lines", 0)
                        - baseline_countings.get("physical_lines", 0),
                        "code_lines": current_countings.get("code_lines", 0)
                        - baseline_countings.get("code_lines", 0),
                        "comment_lines": current_countings.get("comment_lines", 0)
                        - baseline_countings.get("comment_lines", 0),
                        "blank_lines": current_countings.get("blank_lines", 0)
                        - baseline_countings.get("blank_lines", 0),
                        # UCC extended deltas
                        "comment_whole": current_countings.get("comment_whole", 0)
                        - baseline_countings.get("comment_whole", 0),
                        "comment_embedded": current_countings.get("comment_embedded", 0)
                        - baseline_countings.get("comment_embedded", 0),
                        "compiler_directives": current_countings.get("compiler_directives", 0)
                        - baseline_countings.get("compiler_directives", 0),
                        "data_declarations": current_countings.get("data_declarations", 0)
                        - baseline_countings.get("data_declarations", 0),
                        "exec_instructions": current_countings.get("exec_instructions", 0)
                        - baseline_countings.get("exec_instructions", 0),
                        "logical_sloc": current_countings.get("logical_sloc", 0)
                        - baseline_countings.get("logical_sloc", 0),
                        "physical_sloc": current_countings.get("physical_sloc", 0)
                        - baseline_countings.get("physical_sloc", 0),
                    }
                    # DEBUG LOGGING: Show comparison details when there's a delta
                    if any(v != 0 for v in countings_delta.values()):
                        fileA_path = a.path if a else "None"
                        fileB_path = b.path if b else "None"
                        print(
                            f"[DIFFER] DELTA DETECTED for {fileA_path} vs {fileB_path}"
                        )
                        print(f"  Baseline: {baseline_countings}")
                        print(f"  Current: {current_countings}")
                        print(f"  Delta: {countings_delta}")

                # Compute deltas for metrics
                metrics_delta = None
                if baseline_metrics and current_metrics:
                    metrics_delta = {
                        "func_count": current_metrics.get("func_count", 0)
                        - baseline_metrics.get("func_count", 0),
                        "avg_cc": current_metrics.get("avg_cc", 0.0)
                        - baseline_metrics.get("avg_cc", 0.0),
                        "max_cc": current_metrics.get("max_cc", 0)
                        - baseline_metrics.get("max_cc", 0),
                        "mi": current_metrics.get("mi", 0.0)
                        - baseline_metrics.get("mi", 0.0),
                    }

                matched_results.append(
                    {
                        "fileA": a.path if a is not None else None,
                        "fileB": b.path if b is not None else None,
                        "counts": res,
                        "baseline_countings": baseline_countings,
                        "current_countings": current_countings,
                        "countings_delta": countings_delta,
                        "baseline_metrics": baseline_metrics,
                        "current_metrics": current_metrics,
                        "metrics_delta": metrics_delta,
                    }
                )
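
        # Shape of the returned report (sketch):
        #   {
        #     "baseline_id": ..., "compared_at": <epoch seconds>,
        #     "total": {"added": n, "deleted": n, "modified": n, "unmodified": n},
        #     "pairs": [per-file entries as appended above],
        #     "summary": {"baseline": ..., "current": ..., "delta": ...},  # added below
        #   }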
        result = {
            "baseline_id": self.baseline.baseline_id,
            "compared_at": time.time(),
            "total": total,
            "pairs": matched_results,
        }

        # Compute summary statistics from baseline and current file metadata
        try:
            # Calculate baseline summary from baseline files (which have embedded countings/metrics)
            baseline_counts = {
                "physical_lines": 0,
                "code_lines": 0,
                "comment_lines": 0,
                "blank_lines": 0,
                "file_count": 0,
            }
            baseline_metrics = {
                "file_count": 0,
                "total_func_count": 0,
                "avg_avg_cc": 0.0,
                "avg_mi": 0.0,
            }
            baseline_metrics_count = 0

            for fm in baseline_files:
                if fm.countings:
                    baseline_counts["physical_lines"] += fm.countings.get(
                        "physical_lines", 0
                    )
                    baseline_counts["code_lines"] += fm.countings.get("code_lines", 0)
                    baseline_counts["comment_lines"] += fm.countings.get(
                        "comment_lines", 0
                    )
                    baseline_counts["blank_lines"] += fm.countings.get("blank_lines", 0)
                    baseline_counts["file_count"] += 1
                if fm.metrics:
                    baseline_metrics["total_func_count"] += fm.metrics.get(
                        "func_count", 0
                    )
                    baseline_metrics["avg_avg_cc"] += fm.metrics.get("avg_cc", 0.0)
                    baseline_metrics["avg_mi"] += fm.metrics.get("mi", 0.0)
                    baseline_metrics_count += 1

            if baseline_metrics_count > 0:
                baseline_metrics["avg_avg_cc"] /= baseline_metrics_count
                baseline_metrics["avg_mi"] /= baseline_metrics_count
            baseline_metrics["file_count"] = baseline_metrics_count

            # Calculate current summary from current files (which have embedded countings/metrics)
            current_counts = {
                "physical_lines": 0,
                "code_lines": 0,
                "comment_lines": 0,
                "blank_lines": 0,
                "file_count": 0,
            }
            current_metrics = {
                "file_count": 0,
                "total_func_count": 0,
                "avg_avg_cc": 0.0,
                "avg_mi": 0.0,
            }
            current_metrics_count = 0

            for fm in current_files:
                if fm.countings:
                    current_counts["physical_lines"] += fm.countings.get(
                        "physical_lines", 0
                    )
                    current_counts["code_lines"] += fm.countings.get("code_lines", 0)
                    current_counts["comment_lines"] += fm.countings.get(
                        "comment_lines", 0
                    )
                    current_counts["blank_lines"] += fm.countings.get("blank_lines", 0)
                    current_counts["file_count"] += 1
                if fm.metrics:
                    current_metrics["total_func_count"] += fm.metrics.get(
                        "func_count", 0
                    )
                    current_metrics["avg_avg_cc"] += fm.metrics.get("avg_cc", 0.0)
                    current_metrics["avg_mi"] += fm.metrics.get("mi", 0.0)
                    current_metrics_count += 1

            if current_metrics_count > 0:
                current_metrics["avg_avg_cc"] /= current_metrics_count
                current_metrics["avg_mi"] /= current_metrics_count
            current_metrics["file_count"] = current_metrics_count

            # Compute deltas
            delta_counts = {
                "physical_lines": current_counts["physical_lines"]
                - baseline_counts["physical_lines"],
                "code_lines": current_counts["code_lines"]
                - baseline_counts["code_lines"],
                "comment_lines": current_counts["comment_lines"]
                - baseline_counts["comment_lines"],
                "blank_lines": current_counts["blank_lines"]
                - baseline_counts["blank_lines"],
                "file_count": current_counts["file_count"]
                - baseline_counts["file_count"],
            }
            delta_metrics = {
                "total_func_count": current_metrics["total_func_count"]
                - baseline_metrics["total_func_count"],
                "avg_avg_cc": current_metrics["avg_avg_cc"]
                - baseline_metrics["avg_avg_cc"],
                "avg_mi": current_metrics["avg_mi"] - baseline_metrics["avg_mi"],
            }

            result["summary"] = {
                "baseline": {"countings": baseline_counts, "metrics": baseline_metrics},
                "current": {"countings": current_counts, "metrics": current_metrics},
                "delta": {"countings": delta_counts, "metrics": delta_metrics},
            }
        except Exception:
            pass

        return result