Skip to content

Commit 3aa0ed7

Browse files
committed
feat(libcommon): use async parquet_index.query() methods everywhere
1 parent 67fb0b4 commit 3aa0ed7

File tree

6 files changed

+64
-64
lines changed

6 files changed

+64
-64
lines changed

libs/libcommon/src/libcommon/parquet_utils.py

Lines changed: 11 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -3,11 +3,12 @@
33
import os
44
from collections.abc import Iterable
55
from dataclasses import dataclass, field
6-
from functools import lru_cache
6+
from functools import lru_cache, partial
77
from pathlib import Path
88
from typing import Optional, TypedDict
99
from urllib.parse import unquote
1010

11+
import anyio
1112
import numpy as np
1213
import pyarrow as pa
1314
import pyarrow.compute as pc
@@ -526,7 +527,7 @@ def _init_viewer_index(
526527

527528
# note that this cache size is global for the class, not per instance
528529
@lru_cache(maxsize=1)
529-
def query(self, offset: int, length: int) -> tuple[pa.Table, list[str]]:
530+
async def query(self, offset: int, length: int) -> tuple[pa.Table, list[str]]:
530531
"""Query the parquet files
531532
532533
Note that this implementation will always read at least one row group, to get the list of columns and always
@@ -541,11 +542,11 @@ def query(self, offset: int, length: int) -> tuple[pa.Table, list[str]]:
541542
`list[str]`: List of truncated columns.
542543
"""
543544
if self._use_libviewer:
544-
return self.query_libviewer_index(offset=offset, length=length)
545+
return await self.query_libviewer_index(offset=offset, length=length)
545546
else:
546-
return self.query_parquet_index(offset=offset, length=length)
547+
return await self.query_parquet_index(offset=offset, length=length)
547548

548-
def query_parquet_index(self, offset: int, length: int) -> tuple[pa.Table, list[str]]:
549+
async def query_parquet_index(self, offset: int, length: int) -> tuple[pa.Table, list[str]]:
549550
"""Query the parquet files using ParquetIndexWithMetadata.
550551
551552
This is the old implementation without libviewer doing row-group pruning using pyarrow.
@@ -554,9 +555,11 @@ def query_parquet_index(self, offset: int, length: int) -> tuple[pa.Table, list[
554555
f"Query {type(self.parquet_index).__name__} for dataset={self.dataset}, config={self.config},"
555556
f" split={self.split}, offset={offset}, length={length}"
556557
)
557-
return self.parquet_index.query(offset=offset, length=length)
558+
# run_sync doesn't support keyword arguments, so use partial
559+
queryfn = partial(self.parquet_index.query, offset=offset, length=length)
560+
return await anyio.to_thread.run_sync(queryfn)
558561

559-
def query_libviewer_index(self, offset: int, length: int) -> tuple[pa.Table, list[str]]:
562+
async def query_libviewer_index(self, offset: int, length: int) -> tuple[pa.Table, list[str]]:
560563
"""Query the parquet files using libviewer.
561564
562565
This is the new implementation using libviewer doing row-group and page pruning.
@@ -574,7 +577,7 @@ def query_libviewer_index(self, offset: int, length: int) -> tuple[pa.Table, lis
574577
raise IndexError("Length must be non-negative")
575578

576579
try:
577-
batches, _files_to_index = self.viewer_index.sync_scan(
580+
batches, _files_to_index = await self.viewer_index.scan(
578581
offset=offset, limit=length, scan_size_limit=self.max_scan_size
579582
)
580583
except lv.DatasetError as e:

libs/libcommon/tests/test_parquet_utils.py

Lines changed: 33 additions & 28 deletions
Original file line number | Diff line number | Diff line change
@@ -454,37 +454,39 @@ def test_indexer_get_rows_index_sharded_with_parquet_metadata(
454454
assert metadata_path.exists()
455455

456456

457-
def test_rows_index_query_with_parquet_metadata(
457+
@pytest.mark.asyncio
458+
async def test_rows_index_query_with_parquet_metadata(
458459
rows_index_with_parquet_metadata: RowsIndex, ds_sharded: Dataset
459460
) -> None:
460461
assert isinstance(rows_index_with_parquet_metadata.parquet_index, ParquetIndexWithMetadata)
461462
assert not hasattr(rows_index_with_parquet_metadata, "viewer_index")
462-
result, _ = rows_index_with_parquet_metadata.query_parquet_index(offset=1, length=3)
463+
result, _ = await rows_index_with_parquet_metadata.query_parquet_index(offset=1, length=3)
463464
assert result.to_pydict() == ds_sharded[1:4]
464465

465-
result, _ = rows_index_with_parquet_metadata.query_parquet_index(offset=1, length=-1)
466+
result, _ = await rows_index_with_parquet_metadata.query_parquet_index(offset=1, length=-1)
466467
assert result.to_pydict() == ds_sharded[:0]
467468

468-
result, _ = rows_index_with_parquet_metadata.query_parquet_index(offset=1, length=0)
469+
result, _ = await rows_index_with_parquet_metadata.query_parquet_index(offset=1, length=0)
469470
assert result.to_pydict() == ds_sharded[:0]
470471

471-
result, _ = rows_index_with_parquet_metadata.query_parquet_index(offset=999999, length=1)
472+
result, _ = await rows_index_with_parquet_metadata.query_parquet_index(offset=999999, length=1)
472473
assert result.to_pydict() == ds_sharded[:0]
473474

474-
result, _ = rows_index_with_parquet_metadata.query_parquet_index(offset=1, length=99999999)
475+
result, _ = await rows_index_with_parquet_metadata.query_parquet_index(offset=1, length=99999999)
475476
assert result.to_pydict() == ds_sharded[1:]
476477

477478
with pytest.raises(IndexError):
478-
rows_index_with_parquet_metadata.query_parquet_index(offset=-1, length=2)
479+
await rows_index_with_parquet_metadata.query_parquet_index(offset=-1, length=2)
479480

480481
# test that the other query() calls query_parquet_index() rather than query_libviewer_index()
481-
result, _ = rows_index_with_parquet_metadata.query(offset=1, length=3)
482+
result, _ = await rows_index_with_parquet_metadata.query(offset=1, length=3)
482483
assert result.to_pydict() == ds_sharded[1:4]
483484
with pytest.raises(AttributeError):
484-
rows_index_with_parquet_metadata.query_libviewer_index(offset=1, length=3)
485+
await rows_index_with_parquet_metadata.query_libviewer_index(offset=1, length=3)
485486

486487

487-
def test_rows_index_query_with_parquet_metadata_libviewer(
488+
@pytest.mark.asyncio
489+
async def test_rows_index_query_with_parquet_metadata_libviewer(
488490
ds_sharded: Dataset,
489491
ds_sharded_fs: AbstractFileSystem,
490492
dataset_sharded_with_config_parquet_metadata: dict[str, Any],
@@ -508,27 +510,28 @@ def test_rows_index_query_with_parquet_metadata_libviewer(
508510

509511
assert isinstance(rows_index_with_parquet_metadata.viewer_index, lv.Dataset)
510512
assert not hasattr(rows_index_with_parquet_metadata, "parquet_index")
511-
result, _truncated_cols = rows_index_with_parquet_metadata.query_libviewer_index(offset=1, length=3)
513+
result, _truncated_cols = await rows_index_with_parquet_metadata.query_libviewer_index(offset=1, length=3)
512514
assert result.to_pydict() == ds_sharded[1:4]
513-
result, _truncated_cols = rows_index_with_parquet_metadata.query_libviewer_index(offset=1, length=0)
515+
result, _truncated_cols = await rows_index_with_parquet_metadata.query_libviewer_index(offset=1, length=0)
514516
assert result.to_pydict() == ds_sharded[:0]
515-
result, _truncated_cols = rows_index_with_parquet_metadata.query_libviewer_index(offset=999999, length=1)
517+
result, _truncated_cols = await rows_index_with_parquet_metadata.query_libviewer_index(offset=999999, length=1)
516518
assert result.to_pydict() == ds_sharded[:0]
517-
result, _truncated_cols = rows_index_with_parquet_metadata.query_libviewer_index(offset=1, length=99999999)
519+
result, _truncated_cols = await rows_index_with_parquet_metadata.query_libviewer_index(offset=1, length=99999999)
518520
assert result.to_pydict() == ds_sharded[1:]
519521
with pytest.raises(IndexError):
520-
rows_index_with_parquet_metadata.query_libviewer_index(offset=0, length=-1)
522+
await rows_index_with_parquet_metadata.query_libviewer_index(offset=0, length=-1)
521523
with pytest.raises(IndexError):
522-
rows_index_with_parquet_metadata.query_libviewer_index(offset=-1, length=2)
524+
await rows_index_with_parquet_metadata.query_libviewer_index(offset=-1, length=2)
523525

524526
# test that the other query() calls query_libviewer_index() rather than query_parquet_index()
525-
result, _ = rows_index_with_parquet_metadata.query(offset=1, length=3)
527+
result, _ = await rows_index_with_parquet_metadata.query(offset=1, length=3)
526528
assert result.to_pydict() == ds_sharded[1:4]
527529
with pytest.raises(AttributeError):
528-
rows_index_with_parquet_metadata.query_parquet_index(offset=1, length=3)
530+
await rows_index_with_parquet_metadata.query_parquet_index(offset=1, length=3)
529531

530532

531-
def test_rows_index_query_with_too_big_rows(
533+
@pytest.mark.asyncio
534+
async def test_rows_index_query_with_too_big_rows(
532535
parquet_metadata_directory: StrPath,
533536
ds_sharded: Dataset,
534537
ds_sharded_fs: AbstractFileSystem,
@@ -546,7 +549,7 @@ def test_rows_index_query_with_too_big_rows(
546549
)
547550

548551
with pytest.raises(TooBigRows):
549-
index.query_parquet_index(offset=0, length=3)
552+
await index.query_parquet_index(offset=0, length=3)
550553

551554
with patch("libcommon.parquet_utils.libviewer_config", LibviewerConfig(enable_for_datasets=True)):
552555
index = RowsIndex(
@@ -563,10 +566,11 @@ def test_rows_index_query_with_too_big_rows(
563566

564567
# test the same with page pruning API
565568
with pytest.raises(TooBigRows):
566-
index.query_libviewer_index(offset=0, length=2)
569+
await index.query_libviewer_index(offset=0, length=2)
567570

568571

569-
def test_rows_index_query_with_empty_dataset(
572+
@pytest.mark.asyncio
573+
async def test_rows_index_query_with_empty_dataset(
570574
ds_empty: Dataset,
571575
ds_empty_fs: AbstractFileSystem,
572576
dataset_empty_with_config_parquet_metadata: dict[str, Any],
@@ -585,10 +589,10 @@ def test_rows_index_query_with_empty_dataset(
585589

586590
assert isinstance(index.parquet_index, ParquetIndexWithMetadata)
587591
assert not hasattr(index, "viewer_index")
588-
result, _ = index.query_parquet_index(offset=0, length=1)
592+
result, _ = await index.query_parquet_index(offset=0, length=1)
589593
assert result.to_pydict() == ds_empty[:0]
590594
with pytest.raises(IndexError):
591-
index.query_parquet_index(offset=-1, length=2)
595+
await index.query_parquet_index(offset=-1, length=2)
592596

593597
# test the same with page pruning API
594598
import libviewer as lv
@@ -608,13 +612,14 @@ def test_rows_index_query_with_empty_dataset(
608612

609613
assert isinstance(index.viewer_index, lv.Dataset)
610614
assert not hasattr(index, "parquet_index")
611-
result, _ = index.query_libviewer_index(offset=0, length=1)
615+
result, _ = await index.query_libviewer_index(offset=0, length=1)
612616
assert result.to_pydict() == ds_empty[:0]
613617
with pytest.raises(IndexError):
614-
index.query_libviewer_index(offset=-1, length=2)
618+
await index.query_libviewer_index(offset=-1, length=2)
615619

616620

617-
def test_indexer_schema_mistmatch_error(
621+
@pytest.mark.asyncio
622+
async def test_indexer_schema_mistmatch_error(
618623
ds_sharded_fs: AbstractFileSystem,
619624
ds_sharded_fs_with_different_schema: AbstractFileSystem,
620625
dataset_sharded_with_config_parquet_metadata: dict[str, Any],
@@ -632,7 +637,7 @@ def test_indexer_schema_mistmatch_error(
632637
max_arrow_data_in_memory=9999999999,
633638
)
634639
with pytest.raises(SchemaMismatchError):
635-
index.query_parquet_index(offset=0, length=3)
640+
await index.query_parquet_index(offset=0, length=3)
636641

637642

638643
@pytest.mark.parametrize(

libs/libviewer/libviewer/dataset.py

Lines changed: 14 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,7 @@
11
import os
2+
import functools
23

4+
import anyio
35
from huggingface_hub import hf_hub_download, list_repo_files
46

57
from ._internal import PyDataset, PyDatasetError as DatasetError # noqa: F401
@@ -101,3 +103,15 @@ def from_cache(repo, metadata_store, revision=None, download=False):
101103
data_store="file://",
102104
metadata_store=metadata_store,
103105
)
106+
107+
def sync_scan(
108+
self, limit=None, offset=None, scan_size_limit=1 * 1024 * 1024 * 1024
109+
):
110+
fn = functools.partial(
111+
self.scan, limit=limit, offset=offset, scan_size_limit=scan_size_limit
112+
)
113+
return anyio.run(fn)
114+
115+
def sync_index(self, files=None):
116+
fn = functools.partial(self.index, files=files)
117+
return anyio.run(fn)

libs/libviewer/src/lib.rs

Lines changed: 1 addition & 26 deletions
Original file line number | Diff line number | Diff line change
@@ -73,25 +73,6 @@ impl PyDataset {
7373
Ok(self.dataset.files.clone())
7474
}
7575

76-
#[pyo3(signature = (limit=None, offset=None, scan_size_limit=DEFAULT_SCAN_SIZE_LIMIT))]
77-
fn sync_scan(
78-
&self,
79-
py: Python<'_>,
80-
limit: Option<u64>,
81-
offset: Option<u64>,
82-
scan_size_limit: u64,
83-
) -> PyResult<(Vec<Py<PyAny>>, Vec<IndexedFile>)> {
84-
let rt = tokio::runtime::Runtime::new()?;
85-
let (record_batches, files_to_index) =
86-
rt.block_on(self.dataset.scan(limit, offset, scan_size_limit))?;
87-
let pyarrow_batches = record_batches
88-
.into_iter()
89-
.map(|batch| Ok(batch.into_pyarrow(py)?.unbind()))
90-
.collect::<PyResult<Vec<_>>>()?;
91-
92-
Ok((pyarrow_batches, files_to_index))
93-
}
94-
9576
#[pyo3(signature = (limit=None, offset=None, scan_size_limit=DEFAULT_SCAN_SIZE_LIMIT))]
9677
fn scan<'py>(
9778
&self,
@@ -117,13 +98,6 @@ impl PyDataset {
11798
})
11899
}
119100

120-
#[pyo3(signature = (files=None))]
121-
fn sync_index(&self, files: Option<Vec<IndexedFile>>) -> PyResult<Vec<IndexedFile>> {
122-
let rt = tokio::runtime::Runtime::new()?;
123-
let indexed_files = rt.block_on(self.dataset.index(files.as_deref()))?;
124-
Ok(indexed_files)
125-
}
126-
127101
#[pyo3(signature = (files=None))]
128102
fn index<'py>(
129103
&self,
@@ -143,6 +117,7 @@ impl PyDataset {
143117
fn dv(m: &Bound<'_, PyModule>) -> PyResult<()> {
144118
// Bridge the Rust log crate with the Python logging module
145119
// pyo3_log::init();
120+
env_logger::init();
146121

147122
m.add_class::<PyDataset>()?;
148123
m.add("PyDatasetError", m.py().get_type::<PyDatasetError>())?;

services/rows/src/rows/routes/rows.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -103,7 +103,7 @@ async def rows_endpoint(request: Request) -> Response:
103103
with StepProfiler(method="rows_endpoint", step="query the rows"):
104104
try:
105105
# Some datasets have very long binary data that we truncate
106-
pa_table, truncated_columns = rows_index.query(offset=offset, length=length)
106+
pa_table, truncated_columns = await rows_index.query(offset=offset, length=length)
107107
except TooBigRows as err:
108108
raise TooBigContentError(str(err)) from None
109109
with StepProfiler(method="rows_endpoint", step="transform to a list"):

services/worker/src/worker/job_runners/split/first_rows.py

Lines changed: 4 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -2,10 +2,12 @@
22
# Copyright 2022 The HuggingFace Authors.
33

44

5+
import functools
56
import logging
67
from pathlib import Path
78
from typing import Optional
89

10+
import anyio
911
from datasets import IterableDataset, get_dataset_config_info, load_dataset
1012
from fsspec.implementations.http import HTTPFileSystem
1113
from libcommon.constants import MAX_NUM_ROWS_PER_PAGE
@@ -117,7 +119,8 @@ def compute_first_rows_from_parquet_response(
117119
def get_rows_content(rows_max_number: int) -> RowsContent:
118120
try:
119121
# Some datasets have very long binary data that we truncate
120-
pa_table, truncated_columns = rows_index.query(offset=0, length=rows_max_number)
122+
queryfn = functools.partial(rows_index.query, offset=0, length=rows_max_number)
123+
pa_table, truncated_columns = anyio.run(queryfn)
121124
return RowsContent(
122125
rows=pa_table.to_pylist(),
123126
all_fetched=rows_index.num_rows_total <= rows_max_number,

0 commit comments

Comments (0)