SXXXXXXX_PyUCC/tests/test_duplicates_fingerprinting.py

25 lines
943 B
Python

import pytest
from pyucc.core import duplicates as dupmod
def test_fingerprints_vary_with_k():
    """Fingerprint one sample with two ``k`` values and compare the results.

    The sample is passed to ``dupmod._fingerprints_for_text`` with ``k=3``
    and with ``k=15`` (``window=4`` in both calls). Each result must be a
    ``set``, and the two sets must not be equal.
    """
    sample = "def func():\n return 1\n\n# comment\nprint(func())\n"

    # Same sample and window each time; only k changes between the calls.
    by_k = {
        k: dupmod._fingerprints_for_text(sample, k=k, window=4)
        for k in (3, 15)
    }

    for fingerprints in by_k.values():
        assert isinstance(fingerprints, set)

    # Only inequality is checked here, not which set is larger.
    assert by_k[3] != by_k[15]
def test_fingerprints_vary_with_window():
    """Fingerprint one sample with two window sizes and compare the counts.

    The sample is passed to ``dupmod._fingerprints_for_text`` with
    ``window=1`` and with ``window=4`` (``k=5`` in both calls). Each result
    must be a ``set``, and the ``window=4`` result must not contain more
    fingerprints than the ``window=1`` result.
    """
    sample = "line1 line2 line3 line4 line5 line6 line7"

    # Same sample and k each time; only the window size changes.
    narrow, wide = [
        dupmod._fingerprints_for_text(sample, k=5, window=size)
        for size in (1, 4)
    ]

    assert isinstance(narrow, set)
    assert isinstance(wide, set)

    # The larger window may tie with the smaller one but must not exceed it.
    assert len(wide) <= len(narrow)