
Commit e8fa5bf

Incremental table read error handling
Why these changes are being introduced:

There are a couple of primary ways in which a read method can fail based on the table requested:

1. The table name is invalid and will never work.
2. The table name is valid, but the table does not yet exist in the DuckDB context.

For the first, we want to raise an error immediately. For the second, there is a bit more nuance depending on the table requested.

How this addresses that need:

For TIMDEXDataset.read_* methods, we operate from the assumption that `records` and `current_records` should always be available, so we raise an error indicating a metadata rebuild is required. But for TIMDEXEmbeddings, and potentially other data sources as they are added, we may legitimately not have data yet. For those, we log a warning suggesting a refresh and simply return an empty set.

Side effects of this change:

* Applications like TIM, which now attempt to read embeddings for the `reindex-source` CLI command, will no longer encounter an error if embeddings don't yet exist.
* In the rare edge case of a brand new dataset, we get better error raising and logging.

Relevant ticket(s):

* https://mitlibraries.atlassian.net/browse/USE-306
1 parent 7ae3b96 commit e8fa5bf
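
A rough sketch of the caller-visible difference, assuming a brand new dataset location where neither records nor embeddings have been written yet (the location string and setup below are illustrative, not part of this commit):

from timdex_dataset_api import TIMDEXDataset

td = TIMDEXDataset("path/to/brand-new-dataset")  # illustrative location

# records/current_records are expected to always exist, so reading raises,
# pointing at TIMDEXDataset.metadata.rebuild_dataset_metadata() as the remedy
try:
    list(td.read_batches_iter())
except ValueError as exc:
    print(exc)

# embeddings may legitimately not exist yet: a warning suggesting
# TIMDEXDataset.refresh() is logged, and an empty result is returned
assert list(td.embeddings.read_batches_iter()) == []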

5 files changed: +87 −23 lines changed


tests/test_embeddings.py

Lines changed: 26 additions & 13 deletions
@@ -10,7 +10,6 @@
 import pytest

 from tests.utils import generate_sample_embeddings_for_run
-from timdex_dataset_api import TIMDEXDataset
 from timdex_dataset_api.embeddings import (
     METADATA_SELECT_FILTER_COLUMNS,
     TIMDEX_DATASET_EMBEDDINGS_SCHEMA,
@@ -302,9 +301,7 @@ def test_current_embeddings_view_single_run(timdex_dataset_for_embeddings_views)

     # write embeddings for run "apple-1"
     td.embeddings.write(generate_sample_embeddings_for_run(td, run_id="apple-1"))
-
-    # NOTE: at time of test creation, this manual reload is required
-    td = TIMDEXDataset(td.location)
+    td.refresh()

     # query current_embeddings for apple source using read_dataframe
     result = td.embeddings.read_dataframe(table="current_embeddings", source="apple")
@@ -320,9 +317,7 @@ def test_current_embeddings_view_multiple_runs(timdex_dataset_for_embeddings_vie
     # write embeddings for runs "orange-1" and "orange-2"
     td.embeddings.write(generate_sample_embeddings_for_run(td, run_id="orange-1"))
     td.embeddings.write(generate_sample_embeddings_for_run(td, run_id="orange-2"))
-
-    # NOTE: at time of test creation, this manual reload is required
-    td = TIMDEXDataset(td.location)
+    td.refresh()

     # query current_embeddings for orange source using read_dataframe
     result = td.embeddings.read_dataframe(table="current_embeddings", source="orange")
@@ -363,9 +358,7 @@ def test_current_embeddings_view_handles_duplicate_run_embeddings(
             td, run_id="lemon-2", embedding_timestamp="2025-08-03T00:00:00+00:00"
         )
     )
-
-    # NOTE: at time of test creation, this manual reload is required
-    td = TIMDEXDataset(td.location)
+    td.refresh()

     # check all embeddings for lemon-2 to verify both writes exist
     all_lemon_2 = td.embeddings.read_dataframe(table="embeddings", run_id="lemon-2")
@@ -416,9 +409,7 @@ def test_embeddings_view_includes_all_embeddings(timdex_dataset_for_embeddings_v
             td, run_id="lemon-2", embedding_timestamp="2025-08-03T00:00:00+00:00"
         )
     )
-
-    # NOTE: at time of test creation, this manual reload is required
-    td = TIMDEXDataset(td.location)
+    td.refresh()

     # query all embeddings for lemon source
     result = td.embeddings.read_dataframe(table="embeddings", source="lemon")
@@ -435,3 +426,25 @@ def test_embeddings_view_includes_all_embeddings(timdex_dataset_for_embeddings_v
     lemon_2_embeddings = result[result["run_id"] == "lemon-2"]
     assert len(lemon_2_embeddings) == 10  # 5 from each write
     assert (lemon_2_embeddings["run_date"] == date(2025, 8, 2)).all()
+
+
+def test_embeddings_read_batches_iter_returns_empty_when_embeddings_missing(
+    timdex_dataset_empty, caplog
+):
+    result = list(timdex_dataset_empty.embeddings.read_batches_iter())
+    assert result == []
+    assert (
+        "Table 'embeddings' not found in DuckDB context. Embeddings may not yet exist "
+        "or TIMDEXDataset.refresh() may be required." in caplog.text
+    )
+
+
+def test_embeddings_read_batches_iter_returns_empty_for_invalid_table(
+    timdex_embeddings_with_runs, caplog
+):
+    """read_batches_iter returns empty iterator for nonexistent table name."""
+    with pytest.raises(
+        ValueError,
+        match="Invalid table: 'nonexistent'",
+    ):
+        list(timdex_embeddings_with_runs.read_batches_iter(table="nonexistent"))
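
The first new test leans on pytest's built-in caplog fixture: log records emitted during the test are captured, and caplog.text exposes them as one searchable string. A minimal standalone illustration of the pattern (logger name and message are made up):

import logging

logger = logging.getLogger("example")

def emit_warning():
    logger.warning("embeddings table missing")

def test_warning_is_logged(caplog):
    emit_warning()
    # caplog.text holds the formatted text of every captured record
    assert "embeddings table missing" in caplog.text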

tests/test_read.py

Lines changed: 26 additions & 1 deletion
@@ -1,5 +1,5 @@
 # ruff: noqa: D205, D209, PLR2004
-
+import re

 import pandas as pd
 import pyarrow as pa
@@ -280,3 +280,28 @@ def test_read_batches_iter_limit_returns_n_rows(timdex_dataset_multi_source):
     batches = timdex_dataset_multi_source.read_batches_iter(limit=10)
     table = pa.Table.from_batches(batches)
     assert len(table) == 10
+
+
+def test_read_batches_iter_returns_empty_when_metadata_missing(
+    timdex_dataset_empty, caplog
+):
+    with pytest.raises(
+        ValueError,
+        match=re.escape(
+            "Table 'records' not found in DuckDB context. If this is a new dataset, "
+            "either records do not yet exist or a "
+            "TIMDEXDataset.metadata.rebuild_dataset_metadata() may be required."
+        ),
+    ):
+        list(timdex_dataset_empty.read_batches_iter())
+
+
+def test_read_batches_iter_returns_empty_for_invalid_table(
+    timdex_dataset_multi_source, caplog
+):
+    """read_batches_iter returns empty iterator for nonexistent table name."""
+    with pytest.raises(
+        ValueError,
+        match="Invalid table: 'nonexistent'",
+    ):
+        list(timdex_dataset_multi_source.read_batches_iter(table="nonexistent"))
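
One detail worth noting in the first new test: pytest.raises(match=...) treats its argument as a regular expression searched against the exception message, so re.escape keeps the literal parentheses and periods in the expected text from being read as regex syntax. A small standalone illustration (function and message are made up):

import re

import pytest

def rebuild_required():
    raise ValueError("rebuild_dataset_metadata() may be required.")

def test_rebuild_required_message():
    # without re.escape, "()" and "." would act as regex metacharacters
    with pytest.raises(
        ValueError, match=re.escape("rebuild_dataset_metadata() may be required.")
    ):
        rebuild_required()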

timdex_dataset_api/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -5,7 +5,7 @@
 from timdex_dataset_api.metadata import TIMDEXDatasetMetadata
 from timdex_dataset_api.record import DatasetRecord

-__version__ = "3.9.0"
+__version__ = "3.10.0"

 __all__ = [
     "DatasetEmbedding",

timdex_dataset_api/dataset.py

Lines changed: 21 additions & 8 deletions
@@ -443,17 +443,30 @@ def read_batches_iter(
         """
         start_time = time.perf_counter()

+        # ensure valid table
+        if table not in ["records", "current_records"]:
+            raise ValueError(f"Invalid table: '{table}'")
+
+        # ensure table exists
+        try:
+            self.get_sa_table("metadata", table)
+        except ValueError as exc:
+            raise ValueError(
+                f"Table '{table}' not found in DuckDB context. If this is a new "
+                "dataset, either records do not yet exist or a "
+                "TIMDEXDataset.metadata.rebuild_dataset_metadata() may be required."
+            ) from exc
+
         temp_table_name = "read_meta_chunk"
         total_yield_count = 0

-        for i, meta_chunk_df in enumerate(
-            self._iter_meta_chunks(
-                table,
-                limit=limit,
-                where=where,
-                **filters,
-            )
-        ):
+        meta_chunks = self._iter_meta_chunks(
+            table,
+            limit=limit,
+            where=where,
+            **filters,
+        )
+        for i, meta_chunk_df in enumerate(meta_chunks):
             batch_time = time.perf_counter()
             batch_yield_count = len(meta_chunk_df)
             total_yield_count += batch_yield_count
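
The re-raise uses exception chaining (raise ... from exc), so the lower-level ValueError from get_sa_table remains attached as the new error's __cause__ and still appears in tracebacks. A minimal sketch of that pattern, independent of this codebase:

def lookup(table: str) -> None:
    raise ValueError(f"no such table: {table}")

def read(table: str) -> None:
    try:
        lookup(table)
    except ValueError as exc:
        # the original error surfaces in the traceback as the direct cause
        raise ValueError(
            f"Table '{table}' not found; a metadata rebuild may be required."
        ) from exc

read("records")  # raises, showing both errors chained in the traceback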

timdex_dataset_api/embeddings.py

Lines changed: 13 additions & 0 deletions
@@ -358,6 +358,19 @@ def read_batches_iter(
         """
         start_time = time.perf_counter()

+        if table not in ["embeddings", "current_embeddings", "current_run_embeddings"]:
+            raise ValueError(f"Invalid table: '{table}'")
+
+        # ensure table exists
+        try:
+            self.timdex_dataset.get_sa_table("data", table)
+        except ValueError:
+            logger.warning(
+                f"Table '{table}' not found in DuckDB context. Embeddings may not yet "
+                "exist or TIMDEXDataset.refresh() may be required."
+            )
+            return
+
         data_query = self._build_query(
             table,
             columns,
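
Because read_batches_iter is a generator, the bare return after the warning simply ends iteration, which is why callers see an empty result rather than an exception. A small standalone sketch of that pattern (names are illustrative):

import logging
from collections.abc import Iterator

logger = logging.getLogger(__name__)

def read_batches_iter(table_exists: bool) -> Iterator[int]:
    if not table_exists:
        logger.warning("table not found; yielding nothing")
        return  # ends the generator immediately
    yield from range(3)

assert list(read_batches_iter(table_exists=False)) == []
assert list(read_batches_iter(table_exists=True)) == [0, 1, 2]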
