Skip to content

Add a new Python module example with parquet files #27084

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 1 commit into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions test/library/packages/Python/examples/parquet/CLEANFILES
1 change: 1 addition & 0 deletions test/library/packages/Python/examples/parquet/COMPOPTS
1 change: 1 addition & 0 deletions test/library/packages/Python/examples/parquet/EXECENV
1 change: 1 addition & 0 deletions test/library/packages/Python/examples/parquet/NUMLOCALES
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
2
4 changes: 4 additions & 0 deletions test/library/packages/Python/examples/parquet/PRETEST
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
#!/usr/bin/env bash
#
# PRETEST hook for the parquet example: hands this test directory and the
# list of Python packages the example imports to the shared
# checkAndInstallPackage.sh helper located two directories up.
# NOTE(review): the helper presumably verifies/installs the packages --
# confirm against checkAndInstallPackage.sh itself.

# Absolute path of the directory containing this script. Every expansion is
# quoted so a checkout path with spaces or glob characters is not split, and
# `&&` keeps `pwd` from reporting an unrelated directory if `cd` fails.
FILE_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)

# First argument: this test directory; remaining arguments: package names.
"$FILE_DIR/../../checkAndInstallPackage.sh" "$FILE_DIR" pyarrow pandas numpy
13 changes: 13 additions & 0 deletions test/library/packages/Python/examples/parquet/create_file.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
import pandas as pd
import numpy as np

# Generate a small random table, write it to "data.parquet", and print the
# sum of every column (rounded to one decimal place) to stdout.
row_count = 1000

# Draw the columns one at a time, in the same order they appear in the table:
# one integer column first, then two float columns.
integers = np.random.randint(1, 100, size=row_count)
floats1 = np.random.uniform(1.0, 100.0, size=row_count)
floats2 = np.random.uniform(1.0, 100.0, size=row_count)

df = pd.DataFrame({
    'Integers': integers,
    'Floats1': floats1,
    'Floats2': floats2,
})

# No index column is stored; row_group_size=100 is passed through to the writer.
df.to_parquet("data.parquet", index=False, row_group_size=100)

# One output line per column, in column order.
for c in df.columns:
    print(f"Column: {c} Sum: {round(df[c].sum(), 1)}")
Empty file.
55 changes: 55 additions & 0 deletions test/library/packages/Python/examples/parquet/readParquet.chpl
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
use Python, List;

config const filename = "data.parquet";


/*
  Gather every chunk in `data_chunks` into a single Chapel array.

  eltType     : element type of the returned array and of each converted chunk
  data_chunks : iterable of nilable Python values; each one is converted by
                calling its `to_numpy` method
  num_rows    : length of the returned array (indices `0..<num_rows`)

  Returns the filled array. Chunks are stored back to back in iteration
  order, so the chunk sizes are expected to add up to at most `num_rows`.
*/
proc getArray(type eltType, ref data_chunks, num_rows) {
  var combined: [0..<num_rows] eltType;
  var offset = 0;

  for chunk in data_chunks {
    // Call the chunk's Python-side `to_numpy(zero_copy_only=False,
    // writable=True)` and receive the result as a PyArray of eltType.
    // NOTE(review): the `1` passed to PyArray is presumably the rank --
    // confirm against the Python module documentation.
    var npChunk = chunk!.call(owned PyArray(eltType, 1),
                              "to_numpy",
                              kwargs=["zero_copy_only"=>false,
                                      "writable"=>true]);
    const len = npChunk.size;

    // `offset..#len` is the range of `len` indices starting at `offset`.
    combined[offset..#len] = npChunk.array();
    offset += len;
  }

  return combined;
}

/*
  Read the parquet file named by the `filename` config constant through
  Python's pyarrow and print one line per column: "Column: <name>" followed
  by " Sum: <sum>" for int64/float64 columns, or " Unknown type" otherwise.

  Steps:
    1. Start an embedded Python interpreter and import pyarrow.
    2. Read the column names and row count from the file's schema/metadata.
    3. Iterate the file in batches, collecting each column's per-batch
       values into a per-column list.
    4. For each column, pick the Chapel element type from the arrow schema,
       assemble the batches with getArray, and print the sum.
*/
proc main() {

  var interp = new Interpreter();

  var pa = interp.importModule("pyarrow");
  var pq = interp.importModule("pyarrow.parquet");

  var parquet_file = pq.call("ParquetFile", filename);
  var columns = parquet_file.get("schema").get("names"):list(string);
  var num_rows = parquet_file.get("metadata").get("num_rows"):int;

  // One list of Python values per column; slot idx belongs to columns[idx].
  var data_chunks: [0..<columns.size] list(owned Value?);

  // Walk the file batch by batch (batch_size=300 is passed to pyarrow) and
  // stash each column's slice of the batch in that column's list.
  for batch in parquet_file.call("iter_batches", kwargs=["batch_size"=>300]) {
    for (col, idx) in zip(columns, 0..) {
      data_chunks[idx].pushBack(batch.call("__getitem__", col));
    }
  }

  var schema_arrow = parquet_file.get('schema_arrow');
  for (col, idx) in zip(columns, 0..) {
    write("Column: ", col);

    // Compare the column's arrow type against pyarrow's int64()/float64()
    // type objects to choose the Chapel element type.
    var rowType = schema_arrow.call('field', col).get('type');
    if pa.call("int64") == rowType {
      var arr = getArray(int(64), data_chunks[idx], num_rows);
      writeln(" Sum: ", + reduce arr);
    } else if pa.call("float64") == rowType {
      var arr = getArray(real(64), data_chunks[idx], num_rows);
      writeln(" Sum: ", + reduce arr);
    } else {
      // Leading space keeps this from running into the column name written
      // above, matching the " Sum: " branches.
      writeln(" Unknown type");
    }
  }
}


Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
data.parquet
readParquet.good
10 changes: 10 additions & 0 deletions test/library/packages/Python/examples/parquet/readParquet.precomp
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
#!/usr/bin/env bash
#
# Precomp hook: runs create_file.py with the selected Python interpreter and
# saves its stdout as "<first argument>.good".
#
# Arguments:
#   $1  base name for the generated .good file
#
# All expansions are quoted so paths containing spaces or glob characters
# are passed through intact.

# respect CHPL_TEST_VENV_DIR if it is set and not none
if [ -n "$CHPL_TEST_VENV_DIR" ] && [ "$CHPL_TEST_VENV_DIR" != "none" ]; then
  chpl_python="$CHPL_TEST_VENV_DIR/bin/python3"
else
  # Otherwise use whatever interpreter find-python.sh reports.
  chpl_python=$("$CHPL_HOME/util/config/find-python.sh")
fi

"$chpl_python" create_file.py > "$1.good"