@@ -476,33 +476,54 @@ interface RowCount {
476
476
}
477
477
478
478
/**
 * Options accepted by `scanParquet`.
 *
 * Every field is optional; the object is forwarded to the native parquet scanner.
 */
interface ScanParquetOptions {
  /** Stop reading from the parquet file after reading `nRows` rows. */
  nRows?: number;
  /** Cache the result after reading. */
  cache?: boolean;
  /**
   * Direction of parallelism. `"auto"` will try to determine the optimal direction.
   */
  parallel?: "auto" | "columns" | "row_groups" | "none";
  /**
   * Row index column settings.
   * NOTE(review): presumably carries the row index column name and its starting
   * offset -- confirm against the `RowCount` declaration.
   */
  rowCount?: RowCount;
  /**
   * When reading multiple files via a glob pattern, rechunk the final DataFrame
   * into contiguous memory chunks.
   */
  rechunk?: boolean;
  /** Reduce memory pressure at the expense of performance. */
  lowMemory?: boolean;
  /** Use statistics in the parquet file to determine if pages can be skipped from reading. */
  useStatistics?: boolean;
  /** Infer statistics and schema from a hive partitioned URL and use them to prune reads. */
  hivePartitioning?: boolean;
  /**
   * Options that indicate how to connect to a cloud provider, as key/value pairs.
   * NOTE(review): believed to be the setting described as "storage options" in the
   * upstream Polars documentation -- confirm the accepted keys with the native binding.
   */
  cloudOptions?: Map<string, string>;
  /** Number of retries if accessing a cloud instance fails. */
  retries?: number;
}
487
490
488
491
/**
 * Lazily read from a local or cloud-hosted parquet file (or files).
 *
 * This function allows the query optimizer to push down predicates and projections to
 * the scan level, typically increasing performance and reducing memory overhead.
 *
 * @param source - Path to a file. A single path can be a globbing pattern.
 * @param options.nRows - Stop reading from the parquet file after reading `nRows` rows.
 * @param options.rowCount - Row index column settings (presumably the column name and
 *   the offset at which the index starts -- confirm against `RowCount`).
 * @param options.parallel - One of `"auto"`, `"columns"`, `"row_groups"`, `"none"`.
 *   This determines the direction of parallelism. `"auto"` will try to determine the
 *   optimal direction. Defaults to `"auto"` when omitted or `undefined`.
 * @param options.useStatistics - Use statistics in the parquet file to determine if pages
 *   can be skipped from reading.
 * @param options.hivePartitioning - Infer statistics and schema from a hive partitioned URL
 *   and use them to prune reads.
 * @param options.rechunk - In case of reading multiple files via a glob pattern, rechunk
 *   the final DataFrame into contiguous memory chunks.
 * @param options.lowMemory - Reduce memory pressure at the expense of performance.
 * @param options.cache - Cache the result after reading.
 * @param options.cloudOptions - Options that indicate how to connect to a cloud provider.
 *
 *   The cloud providers currently supported are AWS, GCP, and Azure.
 *   See supported keys here:
 *
 *   - [aws](https://docs.rs/object_store/latest/object_store/aws/enum.AmazonS3ConfigKey.html)
 *   - [gcp](https://docs.rs/object_store/latest/object_store/gcp/enum.GoogleConfigKey.html)
 *   - [azure](https://docs.rs/object_store/latest/object_store/azure/enum.AzureConfigKey.html)
 *
 *   If `cloudOptions` is not provided, Polars will try to infer the information from
 *   environment variables.
 * @param options.retries - Number of retries if accessing a cloud instance fails.
 * @returns The lazy frame produced by wrapping the native parquet scan.
 */
export function scanParquet(source: string, options: ScanParquetOptions = {}) {
  const pliOptions = {
    ...options,
    // Resolve the default after spreading: a plain `{ parallel: "auto", ...options }`
    // would let an explicit `parallel: undefined` erase the default.
    parallel: options?.parallel ?? "auto",
  };
  return _LazyDataFrame(pli.scanParquet(source, pliOptions));
}
507
528
508
529
export interface ReadIPCOptions {
0 commit comments