Skip to content

Commit f0f3c79

Browse files
authored
Add comments to this test file (#27836)
[reviewed by @jabraham17] I went through this file for slides and made some notes for myself to understand it, so I figured it would be useful to add them as comments to the test itself. I double-checked that the test still ran okay.
2 parents 44aa370 + dd08734 commit f0f3c79

File tree

1 file changed

+12
-0
lines changed

1 file changed

+12
-0
lines changed

test/library/packages/Python/examples/parquet/readParquet.chpl

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,13 +4,18 @@ config const filename = "data.parquet";
44

55

66
proc getArray(type eltType, ref data_chunks, num_rows) {
7+
// Creates an array to store all the unified chunks
78
var arr: [0..<num_rows] eltType;
89
var i = 0;
10+
// Traverse the list of Python `Value`s provided to us
911
for chunk in data_chunks {
12+
// Call `to_numpy` on the individual Python `Value`s
13+
// This returns a `PyArray` of the specified element type
1014
var chunk_arr =
1115
chunk!.call(
1216
owned PyArray(eltType, 1),
1317
"to_numpy", kwargs=["zero_copy_only"=>false, "writable"=>true]);
18+
// (Continue to) fill in `arr` with the contents of that numpy array
1419
arr[i..#chunk_arr.size] = chunk_arr.array();
1520
i += chunk_arr.size;
1621
}
@@ -24,23 +29,30 @@ proc main() {
2429
var pa = interp.importModule("pyarrow");
2530
var pq = interp.importModule("pyarrow.parquet");
2631

32+
// Open a Parquet file
2733
var parquet_file = pq.call("ParquetFile", filename);
2834
var columns = parquet_file.get("schema").get("names"):list(string);
2935
var num_rows = parquet_file.get("metadata").get("num_rows"):int;
36+
// Create an array of lists to store the result
3037
var data_chunks: [0..<columns.size] list(owned Value?);
3138

39+
// Iterate over the Parquet file
3240
for batch in parquet_file.call("iter_batches", kwargs=["batch_size"=>300]) {
41+
// In each batch, get each column and store it into a list in the array
3342
for (col, idx) in zip(columns, 0..) {
3443
data_chunks[idx].pushBack(batch.call("__getitem__", col));
3544
}
3645
}
3746

47+
// schema_arrow is used to get the type stored in the column
3848
var schema_arrow = parquet_file.get('schema_arrow');
3949
for (col, idx) in zip(columns, 0..) {
4050
write("Column: ", col);
4151

4252
var rowType = schema_arrow.call('field', col).get('type');
4353
if pa.call("int64") == rowType {
54+
// getArray traverses the list stored at each idx, using numpy to gather
55+
// chunks into a single array
4456
var arr = getArray(int(64), data_chunks[idx], num_rows);
4557
writeln(" Sum: ", + reduce arr);
4658
} else if pa.call("float64") == rowType {

0 commit comments

Comments
 (0)