-
Notifications
You must be signed in to change notification settings - Fork 151
Expand file tree
/
Copy pathtest_results.py
More file actions
357 lines (268 loc) · 15.1 KB
/
test_results.py
File metadata and controls
357 lines (268 loc) · 15.1 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
from __future__ import annotations
from unittest.mock import MagicMock, patch
import pytest
import data_designer.lazy_heavy_imports as lazy
from data_designer.config.analysis.dataset_profiler import DatasetProfilerResults
from data_designer.config.config_builder import DataDesignerConfigBuilder
from data_designer.config.dataset_metadata import DatasetMetadata
from data_designer.config.preview_results import PreviewResults
from data_designer.config.utils.errors import DatasetSampleDisplayError
from data_designer.config.utils.visualization import display_sample_record as display_fn
from data_designer.engine.storage.artifact_storage import ArtifactStorage
from data_designer.interface.results import DatasetCreationResults
@pytest.fixture
def stub_artifact_storage(stub_dataframe):
    """Artifact-storage stub whose load_dataset returns the test DataFrame."""
    mock_storage = MagicMock(spec=ArtifactStorage)
    mock_storage.load_dataset.return_value = stub_dataframe
    return mock_storage
@pytest.fixture
def stub_dataset_metadata():
    """Provide a default-constructed DatasetMetadata for the tests below."""
    metadata = DatasetMetadata()
    return metadata
@pytest.fixture
def stub_dataset_creation_results(
    stub_artifact_storage, stub_dataset_profiler_results, stub_complete_builder, stub_dataset_metadata
):
    """Assemble a DatasetCreationResults wired to the stub dependencies."""
    init_kwargs = {
        "artifact_storage": stub_artifact_storage,
        "analysis": stub_dataset_profiler_results,
        "config_builder": stub_complete_builder,
        "dataset_metadata": stub_dataset_metadata,
    }
    return DatasetCreationResults(**init_kwargs)
def test_init(stub_artifact_storage, stub_dataset_profiler_results, stub_complete_builder, stub_dataset_metadata):
    """Each constructor argument lands on the expected attribute."""
    results = DatasetCreationResults(
        artifact_storage=stub_artifact_storage,
        analysis=stub_dataset_profiler_results,
        config_builder=stub_complete_builder,
        dataset_metadata=stub_dataset_metadata,
    )
    # Public attributes.
    assert results.artifact_storage == stub_artifact_storage
    assert results.dataset_metadata == stub_dataset_metadata
    # Private attributes backing load_analysis() and config access.
    assert results._analysis == stub_dataset_profiler_results
    assert results._config_builder == stub_complete_builder
def test_load_dataset(stub_dataset_creation_results, stub_artifact_storage, stub_dataframe):
    """load_dataset delegates to artifact storage and returns its DataFrame."""
    loaded = stub_dataset_creation_results.load_dataset()
    stub_artifact_storage.load_dataset.assert_called_once()
    assert isinstance(loaded, lazy.pd.DataFrame)
    lazy.pd.testing.assert_frame_equal(loaded, stub_dataframe)
def test_load_analysis(stub_dataset_creation_results, stub_dataset_profiler_results):
    """load_analysis returns the profiler results supplied at construction."""
    loaded_analysis = stub_dataset_creation_results.load_analysis()
    assert loaded_analysis == stub_dataset_profiler_results
    assert isinstance(loaded_analysis, DatasetProfilerResults)
def test_load_analysis_returns_same_instance(stub_dataset_creation_results):
    """Repeated load_analysis calls hand back the identical object."""
    first = stub_dataset_creation_results.load_analysis()
    second = stub_dataset_creation_results.load_analysis()
    assert first is second
def test_record_sampler_dataset_initialization(stub_dataset_creation_results, stub_artifact_storage):
    """The _record_sampler_dataset cached property loads via artifact storage."""
    # First attribute access triggers the lazy load.
    sampled = stub_dataset_creation_results._record_sampler_dataset
    stub_artifact_storage.load_dataset.assert_called_once()
    lazy.pd.testing.assert_frame_equal(sampled, stub_artifact_storage.load_dataset.return_value)
@patch("data_designer.config.utils.visualization.display_sample_record", autospec=True)
def test_display_sample_record_with_default_params(
    mock_display_sample_record, stub_dataset_creation_results, stub_dataframe
):
    """With no arguments, record 0 is shown with the default styling."""
    stub_dataset_creation_results.display_sample_record()
    mock_display_sample_record.assert_called_once()
    kwargs = mock_display_sample_record.call_args.kwargs
    assert kwargs["record_index"] == 0
    assert kwargs["syntax_highlighting_theme"] == "dracula"
    assert kwargs["background_color"] is None
    # The record forwarded to the display function is the dataframe's first row.
    lazy.pd.testing.assert_series_equal(kwargs["record"], stub_dataframe.iloc[0])
@patch("data_designer.config.utils.visualization.display_sample_record", autospec=True)
def test_display_sample_record_with_custom_index(
    mock_display_sample_record, stub_dataset_creation_results, stub_dataframe
):
    """An explicit index selects that row while keeping default styling."""
    stub_dataset_creation_results.display_sample_record(index=5)
    mock_display_sample_record.assert_called_once()
    kwargs = mock_display_sample_record.call_args.kwargs
    assert kwargs["record_index"] == 5
    assert kwargs["syntax_highlighting_theme"] == "dracula"
    assert kwargs["background_color"] is None
    # The forwarded record must be the row at the requested index.
    lazy.pd.testing.assert_series_equal(kwargs["record"], stub_dataframe.iloc[5])
@patch("data_designer.config.utils.visualization.display_sample_record", autospec=True)
def test_display_sample_record_with_custom_theme(mock_display_sample_record, stub_dataset_creation_results):
    """A custom syntax highlighting theme is forwarded unchanged."""
    stub_dataset_creation_results.display_sample_record(syntax_highlighting_theme="monokai")
    mock_display_sample_record.assert_called_once()
    kwargs = mock_display_sample_record.call_args.kwargs
    assert kwargs["background_color"] is None
    assert kwargs["syntax_highlighting_theme"] == "monokai"
@patch("data_designer.config.utils.visualization.display_sample_record", autospec=True)
def test_display_sample_record_with_custom_background_color(mock_display_sample_record, stub_dataset_creation_results):
    """A custom background color is forwarded unchanged."""
    stub_dataset_creation_results.display_sample_record(background_color="#282a36")
    mock_display_sample_record.assert_called_once()
    kwargs = mock_display_sample_record.call_args.kwargs
    assert kwargs["background_color"] == "#282a36"
    assert kwargs["syntax_highlighting_theme"] == "dracula"
@patch("data_designer.config.utils.visualization.display_sample_record", autospec=True)
def test_display_sample_record_with_all_custom_params(mock_display_sample_record, stub_dataset_creation_results):
    """Every customizable parameter reaches the underlying display call."""
    stub_dataset_creation_results.display_sample_record(
        index=3,
        syntax_highlighting_theme="monokai",
        background_color="#1e1e1e",
    )
    mock_display_sample_record.assert_called_once()
    kwargs = mock_display_sample_record.call_args.kwargs
    expected = {
        "record_index": 3,
        "syntax_highlighting_theme": "monokai",
        "background_color": "#1e1e1e",
    }
    for key, value in expected.items():
        assert kwargs[key] == value
@patch("data_designer.config.utils.visualization.display_sample_record", autospec=True)
def test_display_sample_record_multiple_calls(
    mock_display_sample_record, stub_dataset_creation_results, stub_dataframe
):
    """Repeated calls cycle through record indices 0..num_records-1 and wrap.

    Fixes the original's unused loop variable (`for i in range(5)` where `i`
    was never read) and names the repeated magic number 5 once.
    """
    num_records = len(stub_dataframe)
    num_calls = 5
    # The index is chosen internally, so no argument is passed.
    for _ in range(num_calls):
        stub_dataset_creation_results.display_sample_record()
    assert mock_display_sample_record.call_count == num_calls
    # Indices must follow 0, 1, ..., num_records - 1, 0, ... (modulo cycling).
    for call_number in range(num_calls):
        call_kwargs = mock_display_sample_record.call_args_list[call_number].kwargs
        assert call_kwargs["record_index"] == call_number % num_records
def test_display_sample_record_with_empty_dataset():
    """display_sample_record fails when the stored dataset has no rows."""
    empty_storage = MagicMock(spec=ArtifactStorage)
    empty_storage.load_dataset.return_value = lazy.pd.DataFrame()
    results = DatasetCreationResults(
        artifact_storage=empty_storage,
        analysis=MagicMock(spec=DatasetProfilerResults),
        config_builder=MagicMock(spec=DataDesignerConfigBuilder),
        dataset_metadata=DatasetMetadata(),
    )
    # An empty DataFrame is still a valid DataFrame, so _record_sampler_dataset
    # resolves fine; the failure comes when record index 0 is accessed.
    # NOTE: the implementation currently raises UnboundLocalError (error-handling
    # bug) instead of the intended DatasetSampleDisplayError, so both are accepted.
    with pytest.raises((DatasetSampleDisplayError, UnboundLocalError)):
        results.display_sample_record()
def test_display_sample_record_with_none_dataset():
    """A None dataset makes display_sample_record raise DatasetSampleDisplayError."""
    none_storage = MagicMock(spec=ArtifactStorage)
    none_storage.load_dataset.return_value = None
    results = DatasetCreationResults(
        artifact_storage=none_storage,
        analysis=MagicMock(spec=DatasetProfilerResults),
        config_builder=MagicMock(spec=DataDesignerConfigBuilder),
        dataset_metadata=DatasetMetadata(),
    )
    # The mixin rejects a missing dataset outright.
    with pytest.raises(DatasetSampleDisplayError, match="No valid dataset found"):
        results.display_sample_record()
def test_results_protocol_conformance(stub_dataset_creation_results):
    """DatasetCreationResults exposes every callable required by ResultsProtocol."""
    required_methods = ("load_dataset", "load_analysis", "display_sample_record")
    for method_name in required_methods:
        assert hasattr(stub_dataset_creation_results, method_name)
        assert callable(getattr(stub_dataset_creation_results, method_name))
def test_artifact_storage_load_dataset_called_once_for_caching(stub_dataset_creation_results, stub_artifact_storage):
    """_record_sampler_dataset is cached, so storage is hit exactly once."""
    # Two accesses; the second must be served from the cached value.
    _ = stub_dataset_creation_results._record_sampler_dataset
    _ = stub_dataset_creation_results._record_sampler_dataset
    assert stub_artifact_storage.load_dataset.call_count == 1
def test_load_dataset_independent_of_record_sampler_cache(stub_dataset_creation_results, stub_artifact_storage):
    """load_dataset always hits storage, even after the sampler cache is warm."""
    # Warm the _record_sampler_dataset cache, then forget that call.
    _ = stub_dataset_creation_results._record_sampler_dataset
    stub_artifact_storage.load_dataset.reset_mock()
    # A fresh load must go back to storage rather than reuse the cache.
    stub_dataset_creation_results.load_dataset()
    assert stub_artifact_storage.load_dataset.call_count == 1
@pytest.mark.parametrize("fmt", ["jsonl", "csv", "parquet"])
def test_export_writes_file(stub_dataset_creation_results, tmp_path, fmt):
    """export() creates a non-empty file for every supported format."""
    target = tmp_path / f"out.{fmt}"
    returned = stub_dataset_creation_results.export(target, format=fmt)
    assert returned == target
    assert target.exists()
    assert target.stat().st_size > 0
def test_export_jsonl_content(stub_dataset_creation_results, stub_dataframe, tmp_path):
    """JSONL export emits exactly one parseable JSON object per record."""
    import json

    target = tmp_path / "out.jsonl"
    stub_dataset_creation_results.export(target, format="jsonl")
    lines = target.read_text(encoding="utf-8").splitlines()
    assert len(lines) == len(stub_dataframe)
    # Every line must parse as standalone JSON (raises on failure).
    for line in lines:
        json.loads(line)
def test_export_csv_content(stub_dataset_creation_results, stub_dataframe, tmp_path):
    """CSV export has a header row and one data row per record.

    The redundant function-local ``import data_designer.lazy_heavy_imports as
    lazy`` was removed: the module already imports it as ``lazy`` at top level,
    and the local copy only shadowed it.
    """
    out = tmp_path / "out.csv"
    stub_dataset_creation_results.export(out, format="csv")
    loaded = lazy.pd.read_csv(out)
    # Header must match the source columns; row count must match the records.
    assert list(loaded.columns) == list(stub_dataframe.columns)
    assert len(loaded) == len(stub_dataframe)
def test_export_parquet_content(stub_dataset_creation_results, stub_dataframe, tmp_path):
    """Parquet export round-trips to the original DataFrame.

    The redundant function-local ``import data_designer.lazy_heavy_imports as
    lazy`` was removed: the module already imports it as ``lazy`` at top level.
    """
    out = tmp_path / "out.parquet"
    stub_dataset_creation_results.export(out, format="parquet")
    loaded = lazy.pd.read_parquet(out)
    # Reset both indices so only values/columns are compared.
    lazy.pd.testing.assert_frame_equal(loaded.reset_index(drop=True), stub_dataframe.reset_index(drop=True))
def test_export_default_format_is_jsonl(stub_dataset_creation_results, tmp_path):
    """Omitting format makes export() fall back to JSONL output."""
    import json

    target = tmp_path / "out.jsonl"
    stub_dataset_creation_results.export(target)
    # Every exported line parsing as JSON confirms the JSONL default.
    for line in target.read_text(encoding="utf-8").splitlines():
        json.loads(line)
def test_export_unsupported_format_raises(stub_dataset_creation_results, tmp_path):
    """Unknown formats are rejected with a ValueError."""
    target = tmp_path / "out.xyz"
    with pytest.raises(ValueError, match="Unsupported export format"):
        stub_dataset_creation_results.export(target, format="xlsx")  # type: ignore[arg-type]
def test_export_returns_path_object(stub_dataset_creation_results, tmp_path):
    """export() normalizes a str destination into a Path return value."""
    from pathlib import Path

    destination = str(tmp_path / "out.jsonl")
    returned = stub_dataset_creation_results.export(destination)
    assert isinstance(returned, Path)
def test_preview_results_dataset_metadata() -> None:
    """PreviewResults threads its DatasetMetadata into display_sample_record."""
    config_builder = MagicMock(spec=DataDesignerConfigBuilder)
    config_builder.get_columns_of_type.return_value = []
    dataset_metadata = DatasetMetadata(seed_column_names=["name", "age"])
    preview = PreviewResults(
        config_builder=config_builder,
        dataset=lazy.pd.DataFrame({"name": ["Alice"], "age": [25], "greeting": ["Hello"]}),
        dataset_metadata=dataset_metadata,
    )
    # Metadata is exposed publicly and keeps the configured seed columns.
    assert preview.dataset_metadata == dataset_metadata
    assert preview.dataset_metadata.seed_column_names == ["name", "age"]
    # Wrap the real display function so the forwarded kwargs can be inspected.
    with patch("data_designer.config.utils.visualization.display_sample_record", wraps=display_fn) as mock_display:
        preview.display_sample_record(index=0)
    mock_display.assert_called_once()
    assert mock_display.call_args.kwargs["seed_column_names"] == ["name", "age"]