# SXXXXXXX_PyUCC/tools/diagnose_pygount.py

"""Run pygount on two directories and compare per-file counting results.

Usage:
    python tools/diagnose_pygount.py <dirA> <dirB> [--ext .py,.txt]

Output: lists files with differing counting values or missing files.
"""
import argparse
import json
import subprocess
from pathlib import Path
from typing import Dict, Any, List
import sys


def collect_files(directory: Path, exts=None) -> List[Path]:
    """Recursively gather the regular files found under *directory*.

    Args:
        directory: Root of the tree to walk.
        exts: Optional collection of suffix strings. When truthy, only files
            whose lower-cased path ends with one of the suffixes are kept;
            the suffixes are compared exactly as given.

    Returns:
        The selected paths, in the order ``Path.rglob`` yields them.
    """
    def wanted(path: Path) -> bool:
        # No filter configured: every regular file qualifies.
        if not exts:
            return True
        return any(str(path).lower().endswith(suffix) for suffix in exts)

    return [
        entry
        for entry in directory.rglob("*")
        if entry.is_file() and wanted(entry)
    ]


def _parse_pygount_output(text: str) -> List[Dict[str, Any]]:
    """Decode pygount's stdout into a list of per-file records."""
    text = text.strip()
    if not text:
        return []
    try:
        parsed = json.loads(text)
    except ValueError:
        # Not a single JSON document: fall back to one JSON value per line and
        # skip every line that is not JSON.
        records = []
        for line in text.splitlines():
            line = line.strip()
            if not line:
                continue
            try:
                records.append(json.loads(line))
            except ValueError:
                continue
        return records
    if isinstance(parsed, dict):
        if 'files' in parsed:
            # Summary document: the per-file records live under 'files'. An
            # empty or malformed entry means "no records", never the dict.
            files = parsed['files']
            return files if isinstance(files, list) else []
        # A lone JSON object is handled like one line of the per-line fallback.
        return [parsed]
    return parsed if isinstance(parsed, list) else []


def run_pygount_on_files(files: List[Path]) -> List[Dict[str, Any]]:
    """Run ``pygount --format json`` on *files* and return the decoded records.

    A non-zero exit status is reported on stdout together with pygount's
    stderr, and whatever stdout was produced is still parsed. The result is
    always a list, so callers can iterate it without type checks.

    Args:
        files: Paths handed to pygount on the command line. An empty list
            short-circuits without starting a process.

    Returns:
        The decoded records; an empty list when there is no usable output.

    Raises:
        OSError: Propagated from ``subprocess.run`` when the ``pygount``
            executable cannot be started.
    """
    if not files:
        return []
    cmd = ["pygount", "--format", "json"] + [str(p) for p in files]
    proc = subprocess.run(cmd, check=False, capture_output=True, text=True)
    if proc.returncode != 0:
        print(f"pygount failed for {len(files)} files: returncode={proc.returncode}")
        print("stderr:\n", proc.stderr)
    return _parse_pygount_output(proc.stdout or "")


def _first_present(item: Dict[str, Any], keys, default: Any = 0) -> Any:
    """Return the value of the first key in *keys* whose value is not None."""
    for key in keys:
        value = item.get(key)
        if value is not None:
            return value
    return default


def map_item(item: Dict[str, Any]) -> Dict[str, Any]:
    """Normalise one pygount JSON record into a fixed set of fields.

    Several alternative key spellings are accepted for each value; the first
    one that is present with a non-None value wins (so an explicit ``0`` is
    kept), and ``0`` is used when none is present.

    Args:
        item: One decoded record.

    Returns:
        A dict with the keys ``file`` (path with forward slashes),
        ``physical_lines``, ``code_lines``, ``comment_lines``, ``blank_lines``
        (all converted with ``int``) and ``language`` (``"unknown"`` when no
        language key holds a truthy value).

    Raises:
        ValueError, TypeError: If a selected count cannot be converted by
            ``int``.
    """
    # Similar mapping as in countings_impl._map_pygount_json_item
    physical = _first_present(
        item,
        ("lineCount", "raw_total_lines", "n_lines", "lines", "raw_lines", "line_count"),
    )
    code = _first_present(
        item,
        ("sourceCount", "codeCount", "code", "n_code", "n_code_lines", "code_lines"),
    )
    comment = _first_present(
        item,
        ("documentationCount", "comment", "n_comment", "n_comment_lines", "comment_lines"),
    )
    blank = _first_present(
        item,
        ("emptyCount", "blank", "n_blank", "blank_lines", "empty_count"),
    )
    language = item.get("language") or item.get("languageName") or item.get("lang") or "unknown"
    file_path = (
        item.get("filename")
        or item.get("file")
        or item.get("path")
        or item.get("name")
        or ""
    )
    # Collapse doubled backslashes first (as before), then convert the
    # remaining single backslashes, which the previous pattern never matched.
    normalised_path = str(file_path).replace('\\\\', '/').replace('\\', '/')
    return {
        "file": normalised_path,
        "physical_lines": int(physical),
        "code_lines": int(code),
        "comment_lines": int(comment),
        "blank_lines": int(blank),
        "language": language,
    }


def _relative_key(fname: str, base: Path) -> str:
    """Return *fname* relative to *base* with forward slashes, else *fname*."""
    if fname:
        path = Path(fname)
        # Try the paths exactly as given first, then their resolved forms so a
        # relative base still matches absolute reported paths and vice versa.
        for resolve in (False, True):
            try:
                candidate = path.resolve() if resolve else path
                root = base.resolve() if resolve else base
                return str(candidate.relative_to(root)).replace('\\', '/')
            except (ValueError, OSError, RuntimeError):
                continue
    return fname.replace('\\', '/')


def build_map(parsed: List[Dict[str, Any]], base: Path) -> Dict[str, Dict[str, Any]]:
    """Index normalised records by their path relative to *base*.

    Both absolute and relative reported paths are made relative to *base*
    when they lie under it, so the same file in two different root
    directories gets the same key. Paths outside *base* keep their reported
    form, with backslashes turned into forward slashes.

    Args:
        parsed: Decoded records; entries that are not dicts are skipped.
        base: Directory the keys should be relative to.

    Returns:
        Mapping from key to the record produced by ``map_item``. When two
        records yield the same key, the later one wins.
    """
    result: Dict[str, Dict[str, Any]] = {}
    for item in parsed:
        if not isinstance(item, dict):
            # skip unexpected output lines
            continue
        mapped = map_item(item)
        result[_relative_key(mapped['file'], base)] = mapped
    return result


def compare_maps(mapA: Dict[str, Dict[str, Any]], mapB: Dict[str, Dict[str, Any]]):
    """Print a report of how two per-file result maps differ.

    The report lists how many keys exist on one side only (with up to 20
    examples per side), then every shared key whose line counts or language
    differ (up to 200 entries). Nothing is returned.

    Args:
        mapA: Results for side A, keyed by file.
        mapB: Results for side B, keyed by file.
    """
    count_fields = ('physical_lines', 'code_lines', 'comment_lines', 'blank_lines')

    names_a = set(mapA)
    names_b = set(mapB)
    missing_in_b = sorted(names_a - names_b)
    missing_in_a = sorted(names_b - names_a)
    shared = sorted(names_a & names_b)

    print(f'Files only in A: {len(missing_in_b)}; only in B: {len(missing_in_a)}; common: {len(shared)}')
    one_sided = (
        ('\nOnly in A (examples):', missing_in_b),
        ('\nOnly in B (examples):', missing_in_a),
    )
    for heading, names in one_sided:
        if not names:
            continue
        print(heading)
        for name in names[:20]:
            print('  ', name)

    mismatches = []
    for name in shared:
        left = mapA[name]
        right = mapB[name]
        # Compare the numeric counters, and separately the detected language.
        changed = [field for field in count_fields if left.get(field) != right.get(field)]
        language_changed = left.get('language') != right.get('language')
        if changed or language_changed:
            mismatches.append((name, left, right, changed, language_changed))

    print(f'Found {len(mismatches)} differing files (by counting differences)')
    if not mismatches:
        print('No counting differences detected (pygount results matched for all common files).')
        return

    for name, left, right, changed, language_changed in mismatches[:200]:
        print('\n--', name)
        for field in count_fields:
            print(f"  {field}: A={left.get(field)} B={right.get(field)}")
        if language_changed:
            print(f"  language: A={left.get('language')} B={right.get('language')}")
        if changed:
            print('  diff fields:', changed)


def main():
    """Command-line entry point: compare pygount results for two directories.

    Parses ``dirA``, ``dirB`` and the optional ``--ext`` filter from the
    command line, collects the files below each directory, runs pygount on
    both sets and prints the comparison report.

    Exits with status 2 when either argument is not an existing directory.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('dirA')
    parser.add_argument('dirB')
    parser.add_argument('--ext', default=None, help='Comma-separated extensions to check, e.g. .py,.txt (not used by pygount run)')
    args = parser.parse_args()

    a = Path(args.dirA)
    b = Path(args.dirB)
    # is_dir() rather than exists(): a regular file would pass an existence
    # check and then silently yield an empty file list.
    if not a.is_dir() or not b.is_dir():
        print('One of the provided dirs does not exist or is not a directory:', a, b)
        sys.exit(2)

    exts = None
    if args.ext:
        # Lower-cased because collect_files matches against lower-cased paths.
        exts = [e.strip().lower() for e in args.ext.split(',') if e.strip()]

    filesA = collect_files(a, exts)
    filesB = collect_files(b, exts)

    parsedA = run_pygount_on_files(filesA)
    parsedB = run_pygount_on_files(filesB)

    mapA = build_map(parsedA, a)
    mapB = build_map(parsedB, b)

    compare_maps(mapA, mapB)

# Run the comparison only when this file is executed as a script.
if __name__ == '__main__':
    main()