Skip to content

Commit 03521ef

Browse files
authored
Merge pull request #182 from MITLibraries/USE-306-handle-missing-metadata-or-embeddings
USE 306 - Incremental table read error handling
2 parents 7ae3b96 + e8fa5bf commit 03521ef

File tree

5 files changed

+87
-23
lines changed

5 files changed

+87
-23
lines changed

tests/test_embeddings.py

Lines changed: 26 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,6 @@
1010
import pytest
1111

1212
from tests.utils import generate_sample_embeddings_for_run
13-
from timdex_dataset_api import TIMDEXDataset
1413
from timdex_dataset_api.embeddings import (
1514
METADATA_SELECT_FILTER_COLUMNS,
1615
TIMDEX_DATASET_EMBEDDINGS_SCHEMA,
@@ -302,9 +301,7 @@ def test_current_embeddings_view_single_run(timdex_dataset_for_embeddings_views)
302301

303302
# write embeddings for run "apple-1"
304303
td.embeddings.write(generate_sample_embeddings_for_run(td, run_id="apple-1"))
305-
306-
# NOTE: at time of test creation, this manual reload is required
307-
td = TIMDEXDataset(td.location)
304+
td.refresh()
308305

309306
# query current_embeddings for apple source using read_dataframe
310307
result = td.embeddings.read_dataframe(table="current_embeddings", source="apple")
@@ -320,9 +317,7 @@ def test_current_embeddings_view_multiple_runs(timdex_dataset_for_embeddings_vie
320317
# write embeddings for runs "orange-1" and "orange-2"
321318
td.embeddings.write(generate_sample_embeddings_for_run(td, run_id="orange-1"))
322319
td.embeddings.write(generate_sample_embeddings_for_run(td, run_id="orange-2"))
323-
324-
# NOTE: at time of test creation, this manual reload is required
325-
td = TIMDEXDataset(td.location)
320+
td.refresh()
326321

327322
# query current_embeddings for orange source using read_dataframe
328323
result = td.embeddings.read_dataframe(table="current_embeddings", source="orange")
@@ -363,9 +358,7 @@ def test_current_embeddings_view_handles_duplicate_run_embeddings(
363358
td, run_id="lemon-2", embedding_timestamp="2025-08-03T00:00:00+00:00"
364359
)
365360
)
366-
367-
# NOTE: at time of test creation, this manual reload is required
368-
td = TIMDEXDataset(td.location)
361+
td.refresh()
369362

370363
# check all embeddings for lemon-2 to verify both writes exist
371364
all_lemon_2 = td.embeddings.read_dataframe(table="embeddings", run_id="lemon-2")
@@ -416,9 +409,7 @@ def test_embeddings_view_includes_all_embeddings(timdex_dataset_for_embeddings_v
416409
td, run_id="lemon-2", embedding_timestamp="2025-08-03T00:00:00+00:00"
417410
)
418411
)
419-
420-
# NOTE: at time of test creation, this manual reload is required
421-
td = TIMDEXDataset(td.location)
412+
td.refresh()
422413

423414
# query all embeddings for lemon source
424415
result = td.embeddings.read_dataframe(table="embeddings", source="lemon")
@@ -435,3 +426,25 @@ def test_embeddings_view_includes_all_embeddings(timdex_dataset_for_embeddings_v
435426
lemon_2_embeddings = result[result["run_id"] == "lemon-2"]
436427
assert len(lemon_2_embeddings) == 10 # 5 from each write
437428
assert (lemon_2_embeddings["run_date"] == date(2025, 8, 2)).all()
429+
430+
431+
def test_embeddings_read_batches_iter_returns_empty_when_embeddings_missing(
432+
timdex_dataset_empty, caplog
433+
):
434+
result = list(timdex_dataset_empty.embeddings.read_batches_iter())
435+
assert result == []
436+
assert (
437+
"Table 'embeddings' not found in DuckDB context. Embeddings may not yet exist "
438+
"or TIMDEXDataset.refresh() may be required." in caplog.text
439+
)
440+
441+
442+
def test_embeddings_read_batches_iter_returns_empty_for_invalid_table(
443+
timdex_embeddings_with_runs, caplog
444+
):
445+
"""read_batches_iter returns empty iterator for nonexistent table name."""
446+
with pytest.raises(
447+
ValueError,
448+
match="Invalid table: 'nonexistent'",
449+
):
450+
list(timdex_embeddings_with_runs.read_batches_iter(table="nonexistent"))

tests/test_read.py

Lines changed: 26 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
# ruff: noqa: D205, D209, PLR2004
2-
2+
import re
33

44
import pandas as pd
55
import pyarrow as pa
@@ -280,3 +280,28 @@ def test_read_batches_iter_limit_returns_n_rows(timdex_dataset_multi_source):
280280
batches = timdex_dataset_multi_source.read_batches_iter(limit=10)
281281
table = pa.Table.from_batches(batches)
282282
assert len(table) == 10
283+
284+
285+
def test_read_batches_iter_returns_empty_when_metadata_missing(
286+
timdex_dataset_empty, caplog
287+
):
288+
with pytest.raises(
289+
ValueError,
290+
match=re.escape(
291+
"Table 'records' not found in DuckDB context. If this is a new dataset, "
292+
"either records do not yet exist or a "
293+
"TIMDEXDataset.metadata.rebuild_dataset_metadata() may be required."
294+
),
295+
):
296+
list(timdex_dataset_empty.read_batches_iter())
297+
298+
299+
def test_read_batches_iter_returns_empty_for_invalid_table(
300+
timdex_dataset_multi_source, caplog
301+
):
302+
"""read_batches_iter returns empty iterator for nonexistent table name."""
303+
with pytest.raises(
304+
ValueError,
305+
match="Invalid table: 'nonexistent'",
306+
):
307+
list(timdex_dataset_multi_source.read_batches_iter(table="nonexistent"))

timdex_dataset_api/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
from timdex_dataset_api.metadata import TIMDEXDatasetMetadata
66
from timdex_dataset_api.record import DatasetRecord
77

8-
__version__ = "3.9.0"
8+
__version__ = "3.10.0"
99

1010
__all__ = [
1111
"DatasetEmbedding",

timdex_dataset_api/dataset.py

Lines changed: 21 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -443,17 +443,30 @@ def read_batches_iter(
443443
"""
444444
start_time = time.perf_counter()
445445

446+
# ensure valid table
447+
if table not in ["records", "current_records"]:
448+
raise ValueError(f"Invalid table: '{table}'")
449+
450+
# ensure table exists
451+
try:
452+
self.get_sa_table("metadata", table)
453+
except ValueError as exc:
454+
raise ValueError(
455+
f"Table '{table}' not found in DuckDB context. If this is a new "
456+
"dataset, either records do not yet exist or a "
457+
"TIMDEXDataset.metadata.rebuild_dataset_metadata() may be required."
458+
) from exc
459+
446460
temp_table_name = "read_meta_chunk"
447461
total_yield_count = 0
448462

449-
for i, meta_chunk_df in enumerate(
450-
self._iter_meta_chunks(
451-
table,
452-
limit=limit,
453-
where=where,
454-
**filters,
455-
)
456-
):
463+
meta_chunks = self._iter_meta_chunks(
464+
table,
465+
limit=limit,
466+
where=where,
467+
**filters,
468+
)
469+
for i, meta_chunk_df in enumerate(meta_chunks):
457470
batch_time = time.perf_counter()
458471
batch_yield_count = len(meta_chunk_df)
459472
total_yield_count += batch_yield_count

timdex_dataset_api/embeddings.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -358,6 +358,19 @@ def read_batches_iter(
358358
"""
359359
start_time = time.perf_counter()
360360

361+
if table not in ["embeddings", "current_embeddings", "current_run_embeddings"]:
362+
raise ValueError(f"Invalid table: '{table}'")
363+
364+
# ensure table exists
365+
try:
366+
self.timdex_dataset.get_sa_table("data", table)
367+
except ValueError:
368+
logger.warning(
369+
f"Table '{table}' not found in DuckDB context. Embeddings may not yet "
370+
"exist or TIMDEXDataset.refresh() may be required."
371+
)
372+
return
373+
361374
data_query = self._build_query(
362375
table,
363376
columns,

0 commit comments

Comments
 (0)