22from __future__ import annotations
33
44import asyncio
5+ import contextlib
56import dataclasses
7+ import tempfile
8+ from collections .abc import AsyncIterator , Iterator
69from datetime import datetime
710from pathlib import Path
8- from typing import TYPE_CHECKING , Any , Iterable , TypeVar , AsyncIterator
11+ from typing import TYPE_CHECKING , Any , Iterable , TypeVar , cast
912
1013import aiofiles
1114import httpx
1417from yandex .cloud .ai .dataset .v1 .dataset_pb2 import ValidationError as ProtoValidationError
1518from yandex .cloud .ai .dataset .v1 .dataset_service_pb2 import (
1619 DeleteDatasetRequest , DeleteDatasetResponse , FinishMultipartUploadDraftRequest , FinishMultipartUploadDraftResponse ,
17- GetUploadDraftUrlRequest , GetUploadDraftUrlResponse , StartMultipartUploadDraftRequest ,
18- StartMultipartUploadDraftResponse , UpdateDatasetRequest , UpdateDatasetResponse , UploadedPartInfo ,
19- GetDownloadUrlsRequest , GetDownloadUrlsResponse
20+ GetDownloadUrlsRequest , GetDownloadUrlsResponse , GetUploadDraftUrlRequest , GetUploadDraftUrlResponse ,
21+ StartMultipartUploadDraftRequest , StartMultipartUploadDraftResponse , UpdateDatasetRequest , UpdateDatasetResponse ,
22+ UploadedPartInfo
2023)
2124from yandex .cloud .ai .dataset .v1 .dataset_service_pb2_grpc import DatasetServiceStub
2225
2326from yandex_cloud_ml_sdk ._logging import get_logger
24- from yandex_cloud_ml_sdk ._types .misc import UNDEFINED , UndefinedOr , get_defined_value , PathLike , coerce_path
27+ from yandex_cloud_ml_sdk ._types .misc import UNDEFINED , PathLike , UndefinedOr , coerce_path , get_defined_value
2528from yandex_cloud_ml_sdk ._types .resource import BaseDeleteableResource , safe_on_delete
26- from yandex_cloud_ml_sdk ._utils .sync import run_sync
29+ from yandex_cloud_ml_sdk ._utils .packages import requires_package
30+ from yandex_cloud_ml_sdk ._utils .pyarrow import read_dataset_records
31+ from yandex_cloud_ml_sdk ._utils .sync import run_sync , run_sync_generator
2732
2833from .status import DatasetStatus
2934
@@ -161,14 +166,41 @@ async def _download(
161166 return await asyncio .wait_for (self .__download_impl (
162167 base_path = base_path ,
163168 exist_ok = exist_ok ,
169+ timeout = timeout ,
164170 ), timeout )
165171
async def _read(
    self,
    *,
    timeout: float,
    batch_size: UndefinedOr[int],
) -> AsyncIterator[dict[Any, Any]]:
    """Download the dataset files one at a time and yield their records.

    Every download URL is fetched into its own temporary file, the records
    read from that file are yielded, and the temporary file is removed
    before the next URL is processed (or when the generator is closed or
    fails part-way through).

    :param timeout: forwarded to the download-URL request and to every
        individual file download.
    :param batch_size: forwarded to ``read_dataset_records``; an undefined
        value is passed on as ``None``.
    :yields: the records produced by ``read_dataset_records`` for each
        downloaded file, in the order the URLs were returned.
    """
    batch_size_ = get_defined_value(batch_size, None)
    urls = await self._get_download_urls(timeout=timeout)
    async with self._client.httpx() as client:
        for _, url in urls:
            # Reserve a unique temporary file name.  The handle is closed as
            # soon as the ``with`` exits, so no OS-level descriptor is left
            # open (``mkstemp`` hands back a raw descriptor that must be
            # closed by hand); ``delete=False`` keeps the file on disk so it
            # can be reopened by name below.
            with tempfile.NamedTemporaryFile(delete=False) as tmp:
                filename = tmp.name
            path = Path(filename)
            try:
                await self.__download_file(
                    path=path,
                    url=url,
                    client=client,
                    timeout=timeout
                )

                async for record in read_dataset_records(filename, batch_size=batch_size_):
                    yield record
            finally:
                # Always remove the temporary file, including when the
                # consumer stops iterating early or the download fails.
                path.unlink(missing_ok=True)
196+
166197 async def __download_impl (
167198 self ,
168199 base_path : Path ,
169200 exist_ok : bool ,
201+ timeout : float ,
170202 ) -> tuple [Path , ...]:
171- urls = await self ._get_download_urls ()
203+ urls = await self ._get_download_urls (timeout = timeout )
172204 async with self ._client .httpx () as client :
173205 coroutines = []
174206 for key , url in urls :
@@ -177,7 +209,7 @@ async def __download_impl(
177209 raise ValueError (f"{ file_path } already exists" )
178210
179211 coroutines .append (
180- self .__download_file (file_path , url , client ),
212+ self .__download_file (file_path , url , client , timeout = timeout ),
181213 )
182214
183215 await asyncio .gather (* coroutines )
@@ -186,21 +218,27 @@ async def __download_impl(
186218
async def __download_file(
    self,
    path: Path | str,
    url: str,
    client: httpx.AsyncClient,
    timeout: float,
) -> None:
    """Write the content served at ``url`` into the file at ``path``.

    The destination is opened for binary writing (an existing file is
    truncated by the ``"wb"`` mode) and filled chunk by chunk with the
    bytes produced by ``__read_from_url``.

    :param path: destination file.
    :param url: address to download from.
    :param client: HTTP client used for the request.
    :param timeout: forwarded to the HTTP request.
    """
    async with aiofiles.open(path, "wb") as destination:
        logger.debug(
            'Going to download file for dataset %s from url %s to %s',
            self.id, url, destination.name
        )
        chunks = self.__read_from_url(url, client, timeout=timeout)
        async for piece in chunks:
            await destination.write(piece)
196233
async def __read_from_url(
    self,
    url: str,
    client: httpx.AsyncClient,
    timeout: float,
    chunk_size: int = 1024 * 1024 * 8,  # 8Mb
) -> AsyncIterator[bytes]:
    """Yield the body of a GET request to ``url`` in chunks.

    The request is issued in streaming mode so the body is pulled from the
    connection as it is consumed instead of being loaded into memory in
    full before the first chunk is produced.

    :param url: address to download from.
    :param client: HTTP client used for the request.
    :param timeout: timeout passed to the HTTP request.
    :param chunk_size: size in bytes requested for each yielded chunk.
    :yields: consecutive chunks of the response body.
    :raises httpx.HTTPStatusError: if the response status is an error
        (raised by ``raise_for_status``).
    """
    # ``client.get`` would read the whole body before returning; ``stream``
    # defers reading to ``aiter_bytes`` and closes the response on exit,
    # including when the consumer abandons the generator early.
    async with client.stream("GET", url, timeout=timeout) as resp:
        resp.raise_for_status()
        async for chunk in resp.aiter_bytes(chunk_size=chunk_size):
            yield chunk
@@ -257,6 +295,8 @@ async def _get_download_urls(
257295 expected_type = GetDownloadUrlsResponse ,
258296 )
259297
298+ logger .debug ("Dataset %s returned next download urls: %r" , self .id , result .download_urls )
299+
260300 return [
261301 (r .key , r .url ) for r in result .download_urls
262302 ]
@@ -361,12 +401,50 @@ async def download(
361401 exist_ok = exist_ok ,
362402 )
363403
@requires_package('pyarrow', '>=19', 'AsyncDataset.read')
async def read(
    self,
    *,
    timeout: float = 60,
    batch_size: UndefinedOr[int] = UNDEFINED,
) -> AsyncIterator[dict[Any, Any]]:
    """Read the dataset from the backend and yield its records one by one.

    Records are produced lazily: the dataset files are downloaded and
    parsed one after another instead of being loaded all at once.

    .. note::
        Temporary files are created in the system's default temporary
        directory while reading; see :func:`tempfile.gettempdir` for how
        that location is chosen. Each temporary file is removed once its
        records have been consumed.

    :param timeout: Timeout in seconds, forwarded to the request for the
        download URLs and to every file download. Defaults to 60.
    :type timeout: float
    :param batch_size: Number of records to load into memory per chunk.
        When left UNDEFINED (default), no explicit size is passed and the
        reader decides the chunking itself.
    :type batch_size: int or Undefined
    :yields: Dictionary representing a single record.
    :rtype: AsyncIterator[dict[Any, Any]]

    """

    records = self._read(
        timeout=timeout,
        batch_size=batch_size,
    )
    async for item in records:
        yield item
440+
364441
365442class Dataset (BaseDataset ):
366443 __update = run_sync (BaseDataset ._update )
367444 __delete = run_sync (BaseDataset ._delete )
368445 __list_upload_formats = run_sync (BaseDataset ._list_upload_formats )
369446 __download = run_sync (BaseDataset ._download )
447+ __read = run_sync_generator (BaseDataset ._read )
370448
371449 def update (
372450 self ,
@@ -410,5 +488,41 @@ def download(
410488 exist_ok = exist_ok ,
411489 )
412490
@requires_package('pyarrow', '>=19', 'Dataset.read')
def read(
    self,
    *,
    timeout: float = 60,
    batch_size: UndefinedOr[int] = UNDEFINED,
) -> Iterator[dict[Any, Any]]:
    """Read the dataset from the backend and yield its records one by one.

    Synchronous counterpart of the asynchronous reader: records are
    produced lazily, with the dataset files downloaded and parsed one
    after another instead of being loaded all at once.

    .. note::
        Temporary files are created in the system's default temporary
        directory while reading; see :func:`tempfile.gettempdir` for how
        that location is chosen. Each temporary file is removed once its
        records have been consumed.

    :param timeout: Timeout in seconds, forwarded to the request for the
        download URLs and to every file download. Defaults to 60.
    :type timeout: float
    :param batch_size: Number of records to load into memory per chunk.
        When left UNDEFINED (default), no explicit size is passed and the
        reader decides the chunking itself.
    :type batch_size: int or Undefined
    :yields: Dictionary representing a single record.
    :rtype: Iterator[dict[Any, Any]]

    """

    # Delegation via ``yield from`` so that closing this generator also
    # closes the wrapped one.
    records = self.__read(
        timeout=timeout,
        batch_size=batch_size,
    )
    yield from records
526+
413527
414528DatasetTypeT = TypeVar ('DatasetTypeT' , bound = BaseDataset )
0 commit comments