Skip to content

Commit 1ced18d

Browse files
authored
fix(tests): skip gitignored-dataset tests on CI (#47)
tests/benchmarks/datasets/ is gitignored, so CI never has the DBLP-ACM files. Two tests (test_autoconfig_parity_pins_unchanged, test_auto_configure_df_dblp_acm_does_not_crash) tried to read them and failed with FileNotFoundError. They now call pytest.skip when the data file is absent. Both tests remain effective on developer machines that have pulled the Leipzig dataset. The parity pin contract is unchanged — the capture script at tests/parity/capture_autoconfig_output.py is still the source of truth for regenerating pins.
1 parent 5da842b commit 1ced18d

2 files changed

Lines changed: 22 additions & 1 deletion

File tree

tests/test_autoconfig.py

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1321,14 +1321,26 @@ def test_classify_by_data_prose_not_multi_name():
13211321
def test_autoconfig_parity_pins_unchanged():
13221322
"""Pin test: AutoConfigDecisions refactor must not change output for the
13231323
three benchmarks. If this fails, the refactor changed behavior - fix the
1324-
refactor, not the pin file."""
1324+
refactor, not the pin file.
1325+
1326+
Benchmark datasets are gitignored; this test is effectively local-dev
1327+
only. CI skips it gracefully. Contributors who want to exercise the
1328+
parity guard should run `python tests/parity/capture_autoconfig_output.py`
1329+
after pulling the Leipzig DBLP-ACM dataset into tests/benchmarks/datasets/.
1330+
"""
13251331
import json
13261332
import sys
13271333
from pathlib import Path
13281334

13291335
import polars as pl
1336+
import pytest
13301337

13311338
repo_root = Path(__file__).parent.parent
1339+
dblp_path = repo_root / "tests" / "benchmarks" / "datasets" / "DBLP-ACM" / "DBLP2.csv"
1340+
if not dblp_path.exists():
1341+
pytest.skip(
1342+
"DBLP-ACM dataset not present (gitignored, local-dev only)"
1343+
)
13321344
# Import pin_config from the capture script (not a normal test import)
13331345
sys.path.insert(0, str(repo_root / "tests" / "parity"))
13341346
from capture_autoconfig_output import pin_config # type: ignore

tests/test_autoconfig_verify.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -408,9 +408,18 @@ def test_preflight_check5_drops_empty_matchkey_after_record_embedding_removal():
408408

409409

410410
def test_auto_configure_df_dblp_acm_does_not_crash():
411+
"""Regression test for the v1.5.0 preflight fix: zero-config dedupe_df
412+
on biblio-style data no longer crashes with 'Missing required columns'.
413+
414+
Benchmark datasets are gitignored; CI skips gracefully. Full integration
415+
coverage lives in tests/test_autoconfig_benchmarks.py (marked @benchmark).
416+
"""
411417
from pathlib import Path
418+
import pytest
412419
from goldenmatch._api import dedupe_df
413420
d = Path("tests/benchmarks/datasets/DBLP-ACM")
421+
if not (d / "DBLP2.csv").exists():
422+
pytest.skip("DBLP-ACM dataset not present (gitignored, local-dev only)")
414423
dblp = pl.read_csv(d / "DBLP2.csv", encoding="utf8-lossy", ignore_errors=True)
415424
acm = pl.read_csv(d / "ACM.csv", encoding="utf8-lossy", ignore_errors=True)
416425
df = pl.concat([dblp, acm], how="diagonal_relaxed")

0 commit comments

Comments
 (0)