NOTE: PolarsCursor handles the CSV file in memory. Pay attention to the memory capacity.

Chunksize Options
~~~~~~~~~~~~~~~~~

PolarsCursor supports memory-efficient chunked processing of large query results
using Polars' native lazy evaluation APIs. This allows processing datasets that
are too large to fit in memory.

The chunksize option can be enabled by specifying an integer value in the ``cursor_kwargs``
argument of the connect method or as an argument to the cursor method.

.. code:: python

    from pyathena import connect
    from pyathena.polars.cursor import PolarsCursor

    cursor = connect(s3_staging_dir="s3://YOUR_S3_BUCKET/path/to/",
                     region_name="us-west-2",
                     cursor_class=PolarsCursor,
                     cursor_kwargs={
                         "chunksize": 50_000
                     }).cursor()

.. code:: python

    from pyathena import connect
    from pyathena.polars.cursor import PolarsCursor

    cursor = connect(s3_staging_dir="s3://YOUR_S3_BUCKET/path/to/",
                     region_name="us-west-2",
                     cursor_class=PolarsCursor).cursor(chunksize=50_000)

When the chunksize option is enabled, data is loaded lazily in chunks. This applies
to all data access methods:

**Standard DB-API fetch methods** - ``fetchone()``, ``fetchmany()``, and plain cursor
iteration load data chunk by chunk as needed, keeping memory usage bounded:

.. code:: python

    from pyathena import connect
    from pyathena.polars.cursor import PolarsCursor

    cursor = connect(s3_staging_dir="s3://YOUR_S3_BUCKET/path/to/",
                     region_name="us-west-2",
                     cursor_class=PolarsCursor).cursor(chunksize=50_000)

    cursor.execute("SELECT * FROM large_table")
    # Data is loaded in 50,000 row chunks as you iterate
    for row in cursor:
        process_row(row)

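The same loading behavior applies to the explicit DB-API calls. A minimal sketch using
``fetchmany()``, with the same placeholder ``process_row`` function as above:

.. code:: python

    cursor.execute("SELECT * FROM large_table")
    while True:
        # Each batch is served from the lazily loaded chunks
        rows = cursor.fetchmany(1_000)
        if not rows:
            break
        for row in rows:
            process_row(row)
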
**iter_chunks() method** - Use this when you want to process data as Polars DataFrames
in chunks, which is more efficient for batch processing:

.. code:: python

    import polars as pl

    from pyathena import connect
    from pyathena.polars.cursor import PolarsCursor

    cursor = connect(s3_staging_dir="s3://YOUR_S3_BUCKET/path/to/",
                     region_name="us-west-2",
                     cursor_class=PolarsCursor).cursor(chunksize=50_000)

    cursor.execute("SELECT * FROM large_table")
    for chunk in cursor.iter_chunks():
        # Process each chunk - chunk is a polars.DataFrame
        processed = chunk.group_by("category").agg(pl.sum("value"))
        print(f"Processed chunk with {chunk.height} rows")

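Note that each chunk is aggregated independently, so per-chunk results such as the
sums above are partial. A minimal sketch of one way to combine them into global
totals (re-executing the query here so the sketch stands on its own):

.. code:: python

    import polars as pl

    partials = []
    cursor.execute("SELECT * FROM large_table")
    for chunk in cursor.iter_chunks():
        # Partial sums for the categories present in this chunk
        partials.append(chunk.group_by("category").agg(pl.sum("value")))

    # Concatenate the partial results and re-aggregate for global totals
    totals = pl.concat(partials).group_by("category").agg(pl.sum("value"))
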
This method uses Polars' ``scan_csv()`` and ``scan_parquet()`` with ``collect_batches()``
for efficient lazy evaluation, minimizing memory usage when processing large datasets.

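For intuition, the following is a simplified, illustrative sketch of lazy chunked
evaluation in plain Polars against a local file. The ``result.csv`` path is a
placeholder, and ``LazyFrame.slice()`` is used here as a simple stand-in for the
batched collection the cursor performs:

.. code:: python

    import polars as pl

    lazy = pl.scan_csv("result.csv")  # nothing is read into memory yet
    chunksize = 50_000
    offset = 0
    while True:
        # Materialize only a fixed-size window of rows at a time
        chunk = lazy.slice(offset, chunksize).collect()
        if chunk.height == 0:
            break
        print(f"chunk with {chunk.height} rows")
        offset += chunksize
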
The chunked iteration also works with the unload option:

.. code:: python

    from pyathena import connect
    from pyathena.polars.cursor import PolarsCursor

    cursor = connect(s3_staging_dir="s3://YOUR_S3_BUCKET/path/to/",
                     region_name="us-west-2",
                     cursor_class=PolarsCursor).cursor(chunksize=100_000, unload=True)

    cursor.execute("SELECT * FROM huge_table")
    for chunk in cursor.iter_chunks():
        # Process Parquet data in chunks
        process_chunk(chunk)

.. _async-polars-cursor:

AsyncPolarsCursor
-----------------

As with PolarsCursor, the unload option is also available.

.. code:: python

    from pyathena import connect
    from pyathena.polars.async_cursor import AsyncPolarsCursor

    cursor = connect(s3_staging_dir="s3://YOUR_S3_BUCKET/path/to/",
                     region_name="us-west-2",
                     cursor_class=AsyncPolarsCursor).cursor(unload=True)

As with PolarsCursor, the chunksize option is also available for memory-efficient processing.
When chunksize is specified, data is loaded lazily in chunks for both standard fetch methods
and ``iter_chunks()``.

.. code:: python

    from pyathena import connect
    from pyathena.polars.async_cursor import AsyncPolarsCursor

    cursor = connect(s3_staging_dir="s3://YOUR_S3_BUCKET/path/to/",
                     region_name="us-west-2",
                     cursor_class=AsyncPolarsCursor).cursor(chunksize=50_000)

    query_id, future = cursor.execute("SELECT * FROM large_table")
    result_set = future.result()

    # Standard iteration - data loaded in chunks
    for row in result_set:
        process_row(row)

.. code:: python

    from pyathena import connect
    from pyathena.polars.async_cursor import AsyncPolarsCursor

    cursor = connect(s3_staging_dir="s3://YOUR_S3_BUCKET/path/to/",
                     region_name="us-west-2",
                     cursor_class=AsyncPolarsCursor).cursor(chunksize=50_000)

    query_id, future = cursor.execute("SELECT * FROM large_table")
    result_set = future.result()

    # Process as DataFrame chunks
    for chunk in result_set.iter_chunks():
        process_chunk(chunk)

.. _`polars.DataFrame object`: https://docs.pola.rs/api/python/stable/reference/dataframe/index.html
.. _`Polars`: https://pola.rs/
.. _`Unload options`: arrow.html#unload-options