
Commit d8252d7

intermediate push
1 parent 0c5f1cf commit d8252d7

4 files changed: +38 -141 lines changed

tests/integration/release/conftest.py

Lines changed: 10 additions & 6 deletions
@@ -93,9 +93,10 @@ def pytest_addoption(parser: pytest.Parser) -> None:
 def release_output_dir() -> Path:
     # TODO: [MIC-5522] define correct output dir
     # output_dir = os.environ.get("PSP_TEST_OUTPUT_DIR")
-    output_dir_name = (
-        "/mnt/team/simulation_science/priv/engineering/pseudopeople_release_testing"
-    )
+    # output_dir_name = (
+    #     "/mnt/team/simulation_science/priv/engineering/pseudopeople_release_testing"
+    # )
+    output_dir_name = "/home/hjafari/ppl_testing"
     # if not output_dir_name:
     #     raise ValueError("PSP_TEST_OUTPUT_DIR environment variable not set")
     output_dir = Path(output_dir_name) / f"{time.strftime('%Y%m%d_%H%M%S')}"
@@ -132,14 +133,16 @@ def dataset_params(
 
 
 @pytest.fixture(scope="session")
-def data(
+def noised_data(
     dataset_params: tuple[str | int | Callable[..., pd.DataFrame] | None, ...],
     release_output_dir: Path,
+    request: pytest.FixtureRequest,
     config: dict[str, Any],
 ) -> pd.DataFrame:
     _, dataset_func, source, year, state, engine = dataset_params
 
-    if source is None:
+    run_slow = request.config.getoption("--runslow")
+    if run_slow:  # get sample data
         return dataset_func(seed=SEED, year=None, config=config)  # type: ignore [misc, operator]
 
     kwargs = {
@@ -162,7 +165,8 @@ def unnoised_dataset(
     dataset_arg, dataset_func, source, year, state, engine = dataset_params
     dataset_name = DATASET_ARG_TO_FULL_NAME_MAPPER[dataset_arg]  # type: ignore [index]
 
-    if source is None:
+    run_slow = request.config.getoption("--runslow")
+    if run_slow:  # get sample data
         return initialize_dataset_with_sample(dataset_name)
 
     kwargs = {
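The reworked fixtures gate on request.config.getoption("--runslow"), but this commit does not show where that flag is registered. A minimal sketch of how such an option could be declared in pytest_addoption, offered as an assumption about the surrounding conftest.py rather than its actual contents:

import pytest

def pytest_addoption(parser: pytest.Parser) -> None:
    # Hypothetical registration of the --runslow flag assumed by the fixtures above.
    parser.addoption(
        "--runslow",
        action="store_true",
        default=False,
        help="When set, the release fixtures fall back to generating sample data.",
    )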

tests/integration/release/test_release.py

Lines changed: 10 additions & 8 deletions
@@ -2,10 +2,15 @@
 
 from typing import Any
 
+import pandas as pd
+import pytest
+
 from _pytest.fixtures import FixtureRequest
 from vivarium_testing_utils import FuzzyChecker
 
+from pseudopeople.dataset import Dataset
 from pseudopeople.schema_entities import COLUMNS, DATASET_SCHEMAS
+from tests.constants import DATASET_GENERATION_FUNCS
 from tests.integration.conftest import IDX_COLS, _get_common_datasets, get_unnoised_data
 from tests.utilities import (
     initialize_dataset_with_sample,
@@ -15,30 +20,27 @@
 
 
 def test_column_noising(
+    unnoised_dataset: Dataset,
+    noised_data: pd.DataFrame,
     config: dict[str, Any],
     dataset_name: str,
-    request: FixtureRequest,
     fuzzy_checker: FuzzyChecker,
 ) -> None:
     """Tests that columns are noised as expected"""
-    original = request.getfixturevalue("unnoised_dataset")
-    noised_data = request.getfixturevalue("data")
-
-    check_noised, check_original, shared_idx = _get_common_datasets(original, noised_data)
+    check_noised, check_original, shared_idx = _get_common_datasets(unnoised_dataset, noised_data)
 
     run_column_noising_tests(
         dataset_name, config, fuzzy_checker, check_noised, check_original, shared_idx
     )
 
 
 def test_row_noising_omit_row_or_do_not_respond(
-    dataset_name: str, config: dict[str, Any], request: FixtureRequest
+    noised_data: pd.DataFrame, dataset_name: str, config: dict[str, Any], request: FixtureRequest
 ) -> None:
     """Tests that omit_row and do_not_respond row noising are being applied"""
     idx_cols = IDX_COLS.get(dataset_name)
     original = get_unnoised_data(dataset_name)
     original_data = original.data.set_index(idx_cols)
-    noised_data = request.getfixturevalue("data")
     noised_data = noised_data.set_index(idx_cols)
 
     run_omit_row_or_do_not_respond_tests(dataset_name, config, original_data, noised_data)
@@ -52,7 +54,7 @@ def test_unnoised_id_cols(dataset_name: str, request: FixtureRequest) -> None:
     if dataset_name != DATASET_SCHEMAS.ssa.name:
         unnoised_id_cols.append(COLUMNS.household_id.name)
     original = initialize_dataset_with_sample(dataset_name)
-    noised_data = request.getfixturevalue("data")
+    noised_data = request.getfixturevalue("noised_data")
    check_noised, check_original, _ = _get_common_datasets(original, noised_data)
     assert (
         (
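The pattern this diff moves toward is requesting fixtures directly in the test signature instead of looking them up with request.getfixturevalue. A small, generic illustration of the difference (the fixture below is a stand-in, not the repo's session-scoped noised_data):

import pytest


@pytest.fixture(scope="session")
def noised_data() -> str:
    return "noised"


def test_direct_injection(noised_data: str) -> None:
    # pytest resolves the fixture from the signature at collection time.
    assert noised_data == "noised"


def test_indirect_lookup(request: pytest.FixtureRequest) -> None:
    # Works, but the dependency is invisible to pytest until the test body runs.
    assert request.getfixturevalue("noised_data") == "noised"

Requesting the fixture in the signature makes the dependency explicit, which is what lets test_column_noising above drop its request parameter entirely.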

tests/integration/release/test_runner.py

Lines changed: 10 additions & 0 deletions
@@ -23,3 +23,13 @@ def test_runner(pytest_args: list[str]) -> None:
     cmd = base_cmd + pytest_args
     result = subprocess.run(cmd, capture_output=True, text=True)
     assert result.returncode == 0
+
+# mark this as slow
+@pytest.mark.parametrize("dataset", ["acs", "cps"])
+def test_slow_tests() -> None:
+    os.chdir(Path(__file__).parent)  # need this to access cli options from conftest.py
+    base_cmd = ["pytest", "--release", "test_release.py"]
+    cmd = base_cmd + pytest_args
+    result = subprocess.run(cmd, capture_output=True, text=True)
+    assert result.returncode == 0
+    pass
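As committed, test_slow_tests will not run: it parametrizes dataset without accepting it, references the undefined name pytest_args, and ends in a dead pass. A possible corrected form, under the assumption (not shown in this commit) that the release conftest exposes a --dataset option and a registered "slow" marker:

import os
import subprocess
from pathlib import Path

import pytest


@pytest.mark.slow  # assumes a "slow" marker is registered in the project's pytest config
@pytest.mark.parametrize("dataset", ["acs", "cps"])
def test_slow_tests(dataset: str) -> None:
    os.chdir(Path(__file__).parent)  # need this to access cli options from conftest.py
    base_cmd = ["pytest", "--release", "test_release.py"]
    cmd = base_cmd + ["--dataset", dataset]  # hypothetical --dataset option
    result = subprocess.run(cmd, capture_output=True, text=True)
    assert result.returncode == 0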

tests/integration/test_interface.py

Lines changed: 8 additions & 127 deletions
@@ -225,96 +225,6 @@ def test_column_dtypes(
         assert noised_data[col.name].dtype == expected_dtype
 
 
-@pytest.mark.parametrize(
-    "dataset_name",
-    [
-        DATASET_SCHEMAS.census.name,
-        DATASET_SCHEMAS.acs.name,
-        DATASET_SCHEMAS.cps.name,
-        DATASET_SCHEMAS.ssa.name,
-        DATASET_SCHEMAS.tax_w2_1099.name,
-        DATASET_SCHEMAS.wic.name,
-        DATASET_SCHEMAS.tax_1040.name,
-    ],
-)
-@pytest.mark.parametrize(
-    "engine",
-    [
-        "pandas",
-        "dask",
-    ],
-)
-def test_column_noising(
-    dataset_name: str,
-    engine: str,
-    config: dict[str, Any],
-    request: FixtureRequest,
-    fuzzy_checker: FuzzyChecker,
-) -> None:
-    """Tests that columns are noised as expected"""
-    if "TODO" in dataset_name:
-        pytest.skip(reason=dataset_name)
-    original = initialize_dataset_with_sample(dataset_name)
-    if engine == "dask":
-        generation_function = DATASET_GENERATION_FUNCS[dataset_name]
-        noised_data = generation_function(
-            seed=SEED,
-            year=None,
-            config=config,
-            engine=engine,
-        ).compute()
-    else:
-        noised_data = request.getfixturevalue(f"noised_sample_data_{dataset_name}")
-    check_noised, check_original, shared_idx = _get_common_datasets(original, noised_data)
-
-    run_column_noising_tests(
-        dataset_name, config, fuzzy_checker, check_noised, check_original, shared_idx
-    )
-
-
-@pytest.mark.parametrize(
-    "dataset_name",
-    [
-        DATASET_SCHEMAS.census.name,
-        DATASET_SCHEMAS.acs.name,
-        DATASET_SCHEMAS.cps.name,
-        DATASET_SCHEMAS.ssa.name,
-        DATASET_SCHEMAS.tax_w2_1099.name,
-        DATASET_SCHEMAS.wic.name,
-        DATASET_SCHEMAS.tax_1040.name,
-    ],
-)
-@pytest.mark.parametrize(
-    "engine",
-    [
-        "pandas",
-        "dask",
-    ],
-)
-def test_row_noising_omit_row_or_do_not_respond(
-    dataset_name: str, engine: str, config: dict[str, Any], request: FixtureRequest
-) -> None:
-    """Tests that omit_row and do_not_respond row noising are being applied"""
-    if "TODO" in dataset_name:
-        pytest.skip(reason=dataset_name)
-    idx_cols = IDX_COLS.get(dataset_name)
-    original = get_unnoised_data(dataset_name)
-    original_data = original.data.set_index(idx_cols)
-    if engine == "dask":
-        generation_function = DATASET_GENERATION_FUNCS[dataset_name]
-        noised_data = generation_function(
-            seed=SEED,
-            year=None,
-            config=config,
-            engine=engine,
-        ).compute()
-    else:
-        noised_data = request.getfixturevalue(f"noised_sample_data_{dataset_name}")
-    noised_data = noised_data.set_index(idx_cols)
-
-    run_omit_row_or_do_not_respond_tests(dataset_name, config, original_data, noised_data)
-
-
 @pytest.mark.skip(reason="TODO: Implement duplication row noising")
 @pytest.mark.parametrize(
     "dataset_name",
@@ -336,42 +246,13 @@ def test_row_noising_duplication(dataset_name: str) -> None:
 @pytest.mark.parametrize(
     "dataset_name",
     [
-        DATASET_SCHEMAS.census.name,
+        # DATASET_SCHEMAS.census.name,
+        # DATASET_SCHEMAS.tax_w2_1099.name,
+        # DATASET_SCHEMAS.wic.name,
+        # DATASET_SCHEMAS.tax_1040.name,
         DATASET_SCHEMAS.acs.name,
         DATASET_SCHEMAS.cps.name,
         DATASET_SCHEMAS.ssa.name,
-        DATASET_SCHEMAS.tax_w2_1099.name,
-        DATASET_SCHEMAS.wic.name,
-        DATASET_SCHEMAS.tax_1040.name,
-    ],
-)
-@pytest.mark.parametrize(
-    "engine",
-    [
-        "pandas",
-        "dask",
-    ],
-)
-def test_generate_dataset_with_year(dataset_name: str, engine: str) -> None:
-    if "TODO" in dataset_name:
-        pytest.skip(reason=dataset_name)
-    year = 2030  # not default 2020
-    generation_function = DATASET_GENERATION_FUNCS[dataset_name]
-    original = get_unnoised_data(dataset_name)
-    # Generate a new (non-fixture) noised dataset for a single year
-    noised_data = generation_function(year=year, engine=engine)
-    if engine == "dask":
-        noised_data = noised_data.compute()
-    assert not original.data.equals(noised_data)
-
-
-@pytest.mark.parametrize(
-    "dataset_name",
-    [
-        DATASET_SCHEMAS.census.name,
-        DATASET_SCHEMAS.tax_w2_1099.name,
-        DATASET_SCHEMAS.wic.name,
-        DATASET_SCHEMAS.tax_1040.name,
     ],
 )
 @pytest.mark.parametrize(
@@ -391,16 +272,16 @@ def test_dataset_filter_by_year(
         pytest.skip(reason=dataset_name)
     year = 2030  # not default 2020
 
-    # Generate a new (non-fixture) noised dataset for a single year but mocked such
+    # Generate a new (non-fixture) dataset for a single year but mocked such
     # that no noise actually happens (otherwise the years would get noised and
     # we couldn't tell if the filter was working properly)
     mocker.patch("pseudopeople.dataset.Dataset._noise_dataset")
     generation_function = DATASET_GENERATION_FUNCS[dataset_name]
-    noised_data = generation_function(year=year, engine=engine)
+    data = generation_function(year=year, engine=engine)
     if engine == "dask":
-        noised_data = noised_data.compute()
+        data = data.compute()
     dataset = DATASET_SCHEMAS.get_dataset_schema(dataset_name)
-    assert (noised_data[dataset.date_column_name] == year).all()
+    assert (data[dataset.date_column_name] == year).all()
 
 
 @pytest.mark.parametrize(
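Both the removed tests and the surviving test_dataset_filter_by_year share one pattern: with engine="dask" the generation function returns a lazy frame that must be materialized with .compute() before pandas-style comparisons. A self-contained illustration of that pattern with toy data (generate below is a stand-in, not pseudopeople's API):

import dask.dataframe as dd
import pandas as pd


def generate(engine: str = "pandas") -> pd.DataFrame | dd.DataFrame:
    # Toy stand-in for a dataset generation function.
    frame = pd.DataFrame({"year": [2030, 2030], "value": [1, 2]})
    if engine == "dask":
        return dd.from_pandas(frame, npartitions=1)  # lazy until .compute()
    return frame


data = generate(engine="dask")
if hasattr(data, "compute"):
    data = data.compute()  # materialize the Dask frame into pandas
assert (data["year"] == 2030).all()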
