"""Minimal duplicate detection utilities inspired by UCC. This module provides a simple, portable duplicate finder for use from the CLI. It implements: - exact duplicate detection via normalized SHA1 - a simple fuzzy duplicate check using line-based SequenceMatcher The implementation favors clarity over performance and is meant as a first-step integration. For large projects a winnowing/fingerprinting approach should be preferred. """ from __future__ import annotations import hashlib import os from pathlib import Path from typing import Dict, List, Optional, Tuple from collections import Counter, defaultdict import math try: from pygount import analysis as _pg_analysis # type: ignore _HAS_PYGOUNT = True except Exception: _HAS_PYGOUNT = False def _normalize_bytes(b: bytes) -> bytes: if b.startswith(b"\xef\xbb\xbf"): b = b[3:] b = b.replace(b"\r\n", b"\n") b = b.replace(b"\r", b"\n") return b def _sha1_of_bytes(b: bytes) -> str: h = hashlib.sha1() h.update(b) return h.hexdigest() def _read_normalized_text(path: Path) -> Optional[str]: try: raw = path.read_bytes() except Exception: return None norm = _normalize_bytes(raw) try: return norm.decode("utf-8", errors="replace") except Exception: return None def _lines_for_similarity(path: Path) -> List[str]: """Return a list of normalized lines suitable for sequence comparison. Prefer using pygount to strip comments/blank when available; otherwise a simple fallback (strip blank lines and rstrip) is used. """ text = _read_normalized_text(path) if text is None: return [] if _HAS_PYGOUNT: try: sa = _pg_analysis.SourceAnalysis.from_string(text, filename=str(path)) lines = [] # SourceAnalysis keeps per-line categories in .lines (pygount 3.x) for ln in getattr(sa, "lines", []): # ln is a Line object with .type in {"empty", "documentation", "source", ...} if getattr(ln, "type", None) == "empty": continue if getattr(ln, "type", None) == "documentation": continue # treat source and string lines as code lines.append(getattr(ln, "text", "").rstrip()) if lines: return lines except Exception: # fall back to simple approach pass # fallback: keep non-empty lines and strip trailing spaces return [l.rstrip() for l in text.splitlines() if l.strip()] def _normalize_for_fingerprinting(text: str) -> str: """Normalize text for fingerprinting: remove leading/trailing whitespace and collapse all whitespace to single space. Keep case (code is case-sensitive). 
""" # Replace all whitespace sequences with a single space return " ".join(text.split()) def _compute_kgram_hashes(s: str, k: int) -> List[int]: """Compute rolling Rabin-Karp 64-bit hashes for all k-grams in s.""" if len(s) < k: return [] mask = (1 << 64) - 1 base = 257 # initial hash h = 0 for i in range(k): h = ((h * base) + ord(s[i])) & mask hashes = [h] # precompute base^k pow_k = 1 for _ in range(k): pow_k = (pow_k * base) & mask for i in range(k, len(s)): h = (h * base - ord(s[i - k]) * pow_k + ord(s[i])) & mask hashes.append(h) return hashes def _winnow_hashes(hashes: List[int], window: int) -> set: """Apply winnowing to list of hashes, returning a set of selected fingerprints.""" if not hashes: return set() if window <= 1: return set(hashes) fingerprints = set() # Use deque-like sliding min selection but simple O(n*window) n = len(hashes) for i in range(0, n - window + 1): window_slice = hashes[i : i + window] min_hash = min(window_slice) fingerprints.add(min_hash) return fingerprints def _fingerprints_for_text(text: str, k: int = 25, window: int = 4) -> set: """Return fingerprint set for given text using k-gram winnowing. Defaults chosen to be conservative; for shorter inputs parameters adapt. """ norm = _normalize_for_fingerprinting(text) if not norm: return set() # For small content adapt k k_eff = min(k, max(3, len(norm) // 10)) hashes = _compute_kgram_hashes(norm, k_eff) w_eff = min(window, max(1, len(hashes))) fps = _winnow_hashes(hashes, w_eff) return fps def find_duplicates_in_dir( root: Optional[str] = None, extensions: Optional[List[str]] = None, dup_threshold: float = 5.0, k: int = 25, window: int = 4, file_list: Optional[List[str]] = None, ) -> Dict[str, List[Tuple[str, str]]]: """Scan `root` for duplicate files. Returns a dict with two keys: `exact` and `fuzzy` listing pairs of duplicate paths. `dup_threshold` is the maximal percent of changed lines allowed to mark two files as duplicates (0..100). """ # Normalize extension list to a set with leading dots and lower-case allowed_exts = None if extensions: allowed_exts = set() for e in extensions: ee = e.strip().lower() if not ee: continue if not ee.startswith("."): ee = "." 

def find_duplicates_in_dir(
    root: Optional[str] = None,
    extensions: Optional[List[str]] = None,
    dup_threshold: float = 5.0,
    k: int = 25,
    window: int = 4,
    file_list: Optional[List[str]] = None,
) -> Dict[str, List[Tuple[str, str]]]:
    """Scan `root` (or an explicit `file_list`) for duplicate files.

    Returns a dict with two keys, `exact` and `fuzzy`, each listing pairs of
    duplicate paths. `dup_threshold` is the maximum allowed dissimilarity in
    percent (0..100), measured as 100 * (1 - Jaccard similarity) over the
    winnowed fingerprint sets.
    """
    # Normalize the extension list to a set with leading dots, lower-cased
    allowed_exts = None
    if extensions:
        allowed_exts = set()
        for e in extensions:
            ee = e.strip().lower()
            if not ee:
                continue
            if not ee.startswith("."):
                ee = "." + ee
            allowed_exts.add(ee)

    files: List[Path] = []
    # If the caller provided an explicit file list, use it (no additional filesystem walk)
    if file_list is not None:
        for f in file_list:
            p = Path(f)
            if not p.is_file():
                continue
            if allowed_exts and p.suffix.lower() not in allowed_exts:
                continue
            files.append(p)
    else:
        if not root:
            return {"exact": [], "fuzzy": []}
        rootp = Path(root)
        for p in rootp.rglob("*"):
            if not p.is_file():
                continue
            if allowed_exts and p.suffix.lower() not in allowed_exts:
                continue
            files.append(p)

    # Normalize contents and hash with SHA1 for exact duplicate detection
    idx_by_sha: Dict[str, List[Path]] = {}
    content_cache: Dict[Path, str] = {}
    for p in files:
        try:
            raw = p.read_bytes()
        except Exception:
            continue
        norm = _normalize_bytes(raw)
        sha = _sha1_of_bytes(norm)
        idx_by_sha.setdefault(sha, []).append(p)
        # store normalized text for the later fuzzy comparison
        try:
            content_cache[p] = norm.decode("utf-8", errors="replace")
        except Exception:
            content_cache[p] = ""

    exact: List[Tuple[str, str]] = []
    fuzzy: List[Tuple[str, str]] = []

    # collect exact duplicates
    for sha, group in idx_by_sha.items():
        if len(group) > 1:
            base = str(group[0])
            for other in group[1:]:
                exact.append((base, str(other)))

    # fuzzy duplicates: winnowing + inverted index to avoid a full O(n^2) scan.
    # Compute fingerprints for all files (use normalized content from the cache).
    fps_list: List[set] = []
    for p in files:
        txt = content_cache.get(p)
        if txt is None:
            txt = _read_normalized_text(p) or ""
        # prefer lines-only text to reduce noise from blank and comment lines
        lines = _lines_for_similarity(p)
        text_for_fp = "\n".join(lines) if lines else txt
        fps = _fingerprints_for_text(text_for_fp, k=k, window=window)
        fps_list.append(fps)

    # Build an inverted index from fingerprint -> file indices
    inverted = defaultdict(set)  # hash -> set(file_idx)
    for idx, fps in enumerate(fps_list):
        for h in fps:
            inverted[h].add(idx)

    for i, fps in enumerate(fps_list):
        if not fps:
            continue
        # gather candidate files that share at least one fingerprint
        candidate_counts = Counter()
        for h in fps:
            for j in inverted.get(h, []):
                if j == i:
                    continue
                candidate_counts[j] += 1
        # Evaluate each candidate (only those with some shared fingerprints)
        for j, cnt in candidate_counts.items():
            if i >= j:
                continue  # handle each pair only once
            # quick size filter
            try:
                sa = files[i].stat().st_size
                sb = files[j].stat().st_size
            except Exception:
                sa = sb = 0
            size_diff_pct = 100.0 * abs(sa - sb) / max(sa, sb, 1)
            if size_diff_pct > dup_threshold + 20.0:
                continue
            set_i = fps_list[i]
            set_j = fps_list[j]
            if not set_i or not set_j:
                continue
            inter = len(set_i & set_j)
            union = len(set_i | set_j)
            if union == 0:
                continue
            jaccard = inter / union
            pct_change = 100.0 * (1.0 - jaccard)
            if pct_change <= dup_threshold:
                fuzzy.append((str(files[i]), str(files[j])))

    return {"exact": exact, "fuzzy": fuzzy}
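
# Minimal usage sketch, intended for ad-hoc testing only: scan a directory
# passed on the command line (default ".") for duplicate Python files. The
# argument handling and the chosen extension are illustrative, not a CLI contract.
if __name__ == "__main__":
    import json
    import sys

    _root = sys.argv[1] if len(sys.argv) > 1 else "."
    _result = find_duplicates_in_dir(root=_root, extensions=["py"], dup_threshold=5.0)
    print(json.dumps(_result, indent=2))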