@@ -4,13 +4,18 @@ config const filename = "data.parquet";
44
55
66proc getArray(type eltType, ref data_chunks, num_rows) {
7+ // Creates an array to store all the unified chunks
78 var arr: [ 0 ..< num_rows] eltType;
89 var i = 0 ;
10+ // Traverse the list of Python `Value`s provided to us
911 for chunk in data_chunks {
12+ // Call `to_numpy` on the individual Python `Value`s
13+ // This returns a `PyArray` of the specified element type
1014 var chunk_arr =
1115 chunk!.call(
1216 owned PyArray(eltType, 1 ),
1317 " to_numpy" , kwargs= [ " zero_copy_only" = > false , " writable" = > true ] );
18+ // (Continue to) fill in `arr` with the contents of that numpy array
1419 arr[ i..#chunk_arr.size] = chunk_arr.array();
1520 i += chunk_arr.size;
1621 }
@@ -24,23 +29,30 @@ proc main() {
2429 var pa = interp.importModule(" pyarrow" );
2530 var pq = interp.importModule(" pyarrow.parquet" );
2631
32+ // Open a Parquet file
2733 var parquet_file = pq.call(" ParquetFile" , filename);
2834 var columns = parquet_file.get(" schema" ).get(" names" ): list(string );
2935 var num_rows = parquet_file.get(" metadata" ).get(" num_rows" ): int ;
36+ // Create an array of lists to store the result
3037 var data_chunks: [ 0 ..< columns.size] list(owned Value?);
3138
39+ // Iterate over the Parquet file
3240 for batch in parquet_file.call(" iter_batches" , kwargs= [ " batch_size" = > 300 ] ) {
41+ // In each batch, get each column and store it into a list in the array
3342 for (col, idx) in zip (columns, 0 ..) {
3443 data_chunks[ idx] .pushBack(batch.call(" __getitem__" , col));
3544 }
3645 }
3746
47+ // schema_arrow is used to get the type stored in the column
3848 var schema_arrow = parquet_file.get(' schema_arrow' );
3949 for (col, idx) in zip (columns, 0 ..) {
4050 write (" Column: " , col);
4151
4252 var rowType = schema_arrow.call(' field' , col).get(' type' );
4353 if pa.call(" int64" ) == rowType {
54+ // getArray traverses the list stored at each idx, using numpy to gather
55+ // chunks into a single array
4456 var arr = getArray(int (64 ), data_chunks[ idx] , num_rows);
4557 writeln (" Sum: " , + reduce arr);
4658 } else if pa.call(" float64" ) == rowType {