Skip to content

Commit b3aed5b

Browse files
authored
fix dataset reading order; limit the number of simultaneously open file descriptors during download (#114)
1 parent 6766d96 commit b3aed5b

File tree

3 files changed

+154
-11
lines changed

3 files changed

+154
-11
lines changed

src/yandex_cloud_ml_sdk/_datasets/dataset.py

Lines changed: 34 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -3,12 +3,11 @@
33

44
import asyncio
55
import dataclasses
6-
import os
76
import tempfile
87
from collections.abc import AsyncIterator, Iterator
98
from datetime import datetime
109
from pathlib import Path
11-
from typing import TYPE_CHECKING, Any, Iterable, TypeVar
10+
from typing import TYPE_CHECKING, Any, Final, Iterable, TypeVar
1211

1312
import aiofiles
1413
import httpx
@@ -38,7 +37,7 @@
3837
logger = get_logger(__name__)
3938

4039
DEFAULT_CHUNK_SIZE = 100 * 1024 ** 2
41-
40+
DEFAULT_MAX_PARALLEL_DOWNLOADS: Final[int] = 16 # maximum number of files open for writing during download
4241

4342
@dataclasses.dataclass(frozen=True)
4443
class ValidationErrorInfo:
@@ -153,6 +152,7 @@ async def _download(
153152
download_path: PathLike,
154153
timeout: float = 60,
155154
exist_ok: bool = False,
155+
max_parallel_downloads: int = DEFAULT_MAX_PARALLEL_DOWNLOADS
156156
) -> tuple[Path, ...]:
157157
logger.debug("Downloading dataset %s", self.id)
158158

@@ -167,6 +167,7 @@ async def _download(
167167
base_path=base_path,
168168
exist_ok=exist_ok,
169169
timeout=timeout,
170+
max_parallel_downloads=max_parallel_downloads,
170171
), timeout)
171172

172173
async def _read(
@@ -176,10 +177,24 @@ async def _read(
176177
batch_size: UndefinedOr[int],
177178
) -> AsyncIterator[dict[Any, Any]]:
178179
batch_size_ = get_defined_value(batch_size, None)
180+
179181
urls = await self._get_download_urls(timeout=timeout)
182+
183+
def key_comparator(item: tuple[str, str]) -> tuple[int, int | str]:
184+
key_, _ = item
185+
key_ = Path(key_).stem
186+
if key_.isdigit():
187+
return 0, int(key_)
188+
else:
189+
return 1, key_ # Non-numeric keys come after numeric keys
190+
191+
sorted_urls = sorted(urls, key=key_comparator)
192+
180193
async with self._client.httpx() as client:
181-
for _, url in urls:
182-
fd, filename = tempfile.mkstemp()
194+
for key, url in sorted_urls:
195+
with tempfile.NamedTemporaryFile(delete=False) as tmp:
196+
filename = tmp.name
197+
183198
path = Path(filename)
184199
try:
185200
await self.__download_file(
@@ -188,29 +203,37 @@ async def _read(
188203
client=client,
189204
timeout=timeout
190205
)
191-
192206
async for record in read_dataset_records(filename, batch_size=batch_size_):
193207
yield record
194208
finally:
195-
os.close(fd)
196-
path.unlink()
209+
if path.exists():
210+
path.unlink()
197211

198212
async def __download_impl(
199213
self,
200214
base_path: Path,
201215
exist_ok: bool,
202216
timeout: float,
217+
max_parallel_downloads: int = DEFAULT_MAX_PARALLEL_DOWNLOADS
203218
) -> tuple[Path, ...]:
204219
urls = await self._get_download_urls(timeout=timeout)
220+
205221
async with self._client.httpx() as client:
222+
223+
semaphore = asyncio.Semaphore(max_parallel_downloads)
224+
225+
async def limited_download(file_path, url) -> None:
    # Gate each transfer on the shared semaphore so that at most
    # max_parallel_downloads files are open for writing at any moment.
    async with semaphore:
        await self.__download_file(file_path, url, client, timeout=timeout)
228+
206229
coroutines = []
207230
for key, url in urls:
208231
file_path = base_path / key
209232
if file_path.exists() and not exist_ok:
210233
raise ValueError(f"{file_path} already exists")
211234

212235
coroutines.append(
213-
self.__download_file(file_path, url, client, timeout=timeout),
236+
limited_download(file_path, url)
214237
)
215238

216239
await asyncio.gather(*coroutines)
@@ -395,11 +418,13 @@ async def download(
395418
download_path: PathLike,
396419
timeout: float = 60,
397420
exist_ok: bool = False,
421+
max_parallel_downloads: int = DEFAULT_MAX_PARALLEL_DOWNLOADS,
398422
) -> tuple[Path, ...]:
399423
return await self._download(
400424
download_path=download_path,
401425
timeout=timeout,
402426
exist_ok=exist_ok,
427+
max_parallel_downloads=max_parallel_downloads,
403428
)
404429

405430
@requires_package('pyarrow', '>=19', 'AsyncDataset.read')

tests/datasets/test_download_datasets.py

Lines changed: 55 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,10 @@
22
# pylint: disable=redefined-outer-name
33
from __future__ import annotations
44

5+
import contextlib
56
from pathlib import Path
67

8+
import aiofiles
79
import httpx
810
import pytest
911
from pytest_httpx import HTTPXMock
@@ -44,7 +46,7 @@ async def test_download_to_temp_dir(mock_dataset, httpx_mock: HTTPXMock, mocker,
4446

4547
paths = await mock_dataset.download(timeout=30, download_path=tmp_path)
4648

47-
assert paths == (tmp_path / "file1.txt", )
49+
assert paths == (tmp_path / "file1.txt",)
4850
assert paths[0].read_bytes() == b"test file content"
4951

5052

@@ -178,5 +180,56 @@ async def test_download_with_exist_ok(mock_dataset, httpx_mock: HTTPXMock, mocke
178180

179181
paths = await mock_dataset.download(timeout=30, download_path=tmp_path, exist_ok=False)
180182

181-
assert paths == (tmp_path / "file1.txt", )
183+
assert paths == (tmp_path / "file1.txt",)
182184
assert paths[0].read_bytes() == b"test file content"
185+
186+
187+
@pytest.mark.asyncio
async def test_download_fd_num(mock_dataset, httpx_mock: HTTPXMock, mocker, tmp_path: Path):
    """Ensure the downloader never holds more than ``max_parallel_downloads`` files open for writing."""
    max_fd_num = 5
    fake_file_num = 10
    peak_open = 0
    currently_open = 0
    real_aiofiles_open = aiofiles.open

    @contextlib.asynccontextmanager
    async def counting_open(*args, **kwargs):
        # Wrap aiofiles.open to record the high-water mark of concurrently open files.
        nonlocal currently_open, peak_open
        currently_open += 1
        peak_open = max(peak_open, currently_open)
        handle = await real_aiofiles_open(*args, **kwargs)
        try:
            yield handle
        finally:
            currently_open -= 1

    mocker.patch("aiofiles.open", counting_open)

    non_empty_dir = tmp_path / "non_empty"
    non_empty_dir.mkdir()
    (non_empty_dir / "file1.txt").write_text("existing content")

    mocker.patch.object(
        mock_dataset,
        "_get_download_urls",
        return_value=[(f"file{i}.txt", f"https://example.com/file{i}.txt") for i in range(fake_file_num)],
    )
    for i in range(fake_file_num):
        httpx_mock.add_response(
            url=f"https://example.com/file{i}.txt",
            content=f"test file{i} content".encode(),
        )

    paths = await mock_dataset.download(
        timeout=30,
        download_path=tmp_path,
        exist_ok=False,
        max_parallel_downloads=max_fd_num,
    )

    assert paths == tuple(tmp_path / f"file{i}.txt" for i in range(fake_file_num))
    assert paths[0].read_bytes() == b"test file0 content"

    assert peak_open <= max_fd_num

tests/datasets/test_read.py

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,42 @@
1+
# pylint: disable=no-name-in-module
2+
# pylint: disable=redefined-outer-name
13
from __future__ import annotations
24

5+
import io
36
import uuid
47
from pathlib import Path
58

9+
import httpx
610
import psutil
11+
import pyarrow as pa
12+
import pyarrow.parquet as pq
713
import pytest
14+
from pytest_httpx import HTTPXMock
15+
from yandex.cloud.ai.dataset.v1.dataset_pb2 import DatasetInfo
816

917
from yandex_cloud_ml_sdk import AsyncYCloudML
18+
from yandex_cloud_ml_sdk._datasets.dataset import AsyncDataset
1019

1120
pytestmark = [pytest.mark.asyncio, pytest.mark.require_env('pyarrow')]
1221

22+
23+
@pytest.fixture
def mock_dataset(mocker) -> AsyncDataset:
    """Build an AsyncDataset whose SDK is mocked but whose HTTP client is real."""
    sdk = mocker.MagicMock()
    # Dataset download/read paths go through sdk._client.httpx(); give them a live client.
    sdk._client.httpx.return_value = httpx.AsyncClient()
    return AsyncDataset._from_proto(
        sdk=sdk,
        proto=DatasetInfo(dataset_id="id"),
    )
37+
38+
39+
1340
@pytest.mark.allow_grpc
1441
@pytest.mark.vcr
1542
async def test_simple_read(async_sdk: AsyncYCloudML, completions_jsonlines: Path) -> None:
@@ -35,3 +62,41 @@ async def test_simple_read(async_sdk: AsyncYCloudML, completions_jsonlines: Path
3562
assert 'response' in line
3663

3764
await dataset.delete()
65+
66+
def make_parquet_bytes(name: str) -> bytes:
    """Serialize a one-row table with a single "name" column to parquet bytes in memory."""
    buffer = io.BytesIO()
    pq.write_table(pa.table({"name": [name]}), buffer)
    return buffer.getvalue()
71+
72+
@pytest.mark.asyncio
async def test_reading_order(mock_dataset, httpx_mock: HTTPXMock, mocker, tmp_path: Path) -> None:
    """Records must be yielded in numeric-key order (non-numeric keys last) with no fd leaks."""
    non_empty_dir = tmp_path / "non_empty"
    non_empty_dir.mkdir()

    # Deliberately shuffled on purpose: read() is expected to sort them.
    file_names = ["1.parquet", "3.parquet", "test.parquet", "4.parquet", "2.parquet"]

    mocker.patch.object(
        mock_dataset,
        "_get_download_urls",
        return_value=[(non_empty_dir / fname, f"https://example.com/{fname}") for fname in file_names],
    )
    for fname in file_names:
        httpx_mock.add_response(
            url=f"https://example.com/{fname}",
            content=make_parquet_bytes(fname),
        )

    process = psutil.Process()
    fds_before = process.num_fds()

    records = [record async for record in mock_dataset.read()]

    # Reading must not leak any file descriptors.
    assert process.num_fds() == fds_before
    assert records == [
        {'name': name}
        for name in ('1.parquet', '2.parquet', '3.parquet', '4.parquet', 'test.parquet')
    ]

0 commit comments

Comments
 (0)