Skip to content

Commit 798d0ee

Browse files
NJManganelli (Nick Manganelli), pre-commit-ci[bot], ianna, ikrommyd
authored
feat: Add a parquet uuid calculation (#3440)
* Calculate a uuid from parquet metadata, using detailed info from the first and last row_groups plus the col_counts of all row_groups of the file or dataset
* pre-commit fixups
* test for parquet uuid
* try more selective key-value pairs for hashing, which might be less vulnerable to OS-specific effects
* Update the testing hash and print the uuid in case this is still insufficient for an OS-agnostic uuid
* style: pre-commit fixes
* explicit importorskip for pyarrow.parquet
* Debug commit, prints
* Debug commit, fail assertion
* style: pre-commit fixes
* Remove columns::statistics, which contains a distinct_counts key whose value can be None or 0 depending on versions; sorting_columns also differs between versions and was removed from this list
* Remove DEBUG print statements, but not a full revert since one change is still desired
* Updated hash for parquet uuid
* style: pre-commit fixes

---------

Co-authored-by: Nick Manganelli <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Ianna Osborne <[email protected]>
Co-authored-by: Iason Krommydas <[email protected]>
1 parent d4c0e94 commit 798d0ee

File tree

3 files changed

+113
-7
lines changed

3 files changed

+113
-7
lines changed

src/awkward/operations/ak_from_parquet.py

+43-5
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,9 @@
22

33
from __future__ import annotations
44

5+
import hashlib
6+
import json
7+
58
import fsspec.parquet
69

710
import awkward as ak
@@ -65,11 +68,8 @@ def from_parquet(
6568
See also #ak.to_parquet, #ak.metadata_from_parquet.
6669
"""
6770

68-
parquet_columns, subform, actual_paths, fs, subrg, row_counts, meta = metadata(
69-
path,
70-
storage_options,
71-
row_groups,
72-
columns,
71+
parquet_columns, subform, actual_paths, fs, subrg, row_counts, meta, uuid = (
72+
metadata(path, storage_options, row_groups, columns, calculate_uuid=True)
7373
)
7474
return _load(
7575
actual_paths,
@@ -95,6 +95,7 @@ def metadata(
9595
columns=None,
9696
ignore_metadata=False,
9797
scan_files=True,
98+
calculate_uuid=False,
9899
):
99100
# early exit if missing deps
100101
pyarrow_parquet = awkward._connect.pyarrow.import_pyarrow_parquet("ak.from_parquet")
@@ -193,6 +194,43 @@ def metadata(
193194
list_indicator=list_indicator, column_prefix=column_prefix
194195
)
195196

197+
# generate hash from the col_counts, first row_group and last row_group to calculate approximate parquet uuid
198+
uuid = None
199+
if calculate_uuid:
200+
uuids = [repr({"col_counts": col_counts})]
201+
for row_group_index in (0, metadata.num_row_groups - 1):
202+
row_group_info = metadata.row_group(row_group_index).to_dict()
203+
for k, v in row_group_info.items():
204+
# sorting columns, and columns::statistics have some version skew in underlying library
205+
# with latter's 'distinct_counts' showing None vs 0 for example, so they're not used
206+
if k in ["num_rows", "num_columns"]:
207+
uuids.append(repr({k: v}))
208+
if k == "columns":
209+
for subitem in v:
210+
for subkey in subitem:
211+
if subkey not in [
212+
"file_offset",
213+
"file_path",
214+
"physical_type",
215+
"path_in_schema",
216+
"compression",
217+
"encodings",
218+
"total_compressed_size",
219+
]:
220+
continue
221+
uuids.append(repr({subkey: subitem[subkey]}))
222+
uuid = hashlib.sha256(json.dumps(",".join(uuids)).encode()).hexdigest()
223+
return (
224+
parquet_columns,
225+
subform,
226+
actual_paths,
227+
fs,
228+
subrg,
229+
col_counts,
230+
metadata,
231+
uuid,
232+
)
233+
196234
return parquet_columns, subform, actual_paths, fs, subrg, col_counts, metadata
197235

198236

src/awkward/operations/ak_metadata_from_parquet.py

+11-2
Original file line numberDiff line numberDiff line change
@@ -76,16 +76,25 @@ def _impl(
7676
path, storage_options, row_groups=None, ignore_metadata=False, scan_files=True
7777
):
7878
results = ak.operations.ak_from_parquet.metadata(
79-
path, storage_options, row_groups, None, ignore_metadata, scan_files
79+
path,
80+
storage_options,
81+
row_groups,
82+
None,
83+
ignore_metadata,
84+
scan_files,
85+
calculate_uuid=True,
86+
)
87+
parquet_columns, subform, actual_paths, fs, subrg, col_counts, metadata, uuid = (
88+
results
8089
)
81-
parquet_columns, subform, actual_paths, fs, subrg, col_counts, metadata = results
8290

8391
out = {
8492
"form": subform,
8593
"fs": fs,
8694
"paths": actual_paths,
8795
"col_counts": col_counts,
8896
"columns": parquet_columns,
97+
"uuid": uuid,
8998
}
9099
if col_counts:
91100
out["num_rows"] = sum(col_counts)
+59
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
# BSD 3-Clause License; see https://github.com/scikit-hep/awkward/blob/main/LICENSE
2+
3+
from __future__ import annotations
4+
5+
import pathlib
6+
7+
import pytest
8+
9+
import awkward
10+
11+
pytest.importorskip("pyarrow.parquet")
12+
13+
metadata_from_parquet_submodule = pytest.importorskip(
14+
"awkward.operations.ak_metadata_from_parquet"
15+
)
16+
metadata_from_parquet = metadata_from_parquet_submodule.metadata_from_parquet
17+
18+
SAMPLES_DIR = pathlib.Path(__file__).parent / "samples"
19+
input = SAMPLES_DIR / "nullable-record-primitives.parquet"
20+
21+
22+
def test_parquet_uuid():
23+
meta = metadata_from_parquet(input)
24+
assert (
25+
meta["uuid"]
26+
== "582dabdb8c87bfa17bc930676ed26b8d4ab22a900f92357751dc380c41acb593"
27+
)
28+
29+
30+
@pytest.mark.parametrize("calculate_uuid", [True, False])
31+
def test_return_tuple_with_or_without_uuid(calculate_uuid):
32+
results = awkward.operations.ak_from_parquet.metadata(
33+
input,
34+
{},
35+
None,
36+
None,
37+
False,
38+
True,
39+
calculate_uuid=calculate_uuid,
40+
)
41+
if calculate_uuid:
42+
assert len(results) == 8, "Expected 8 items in the result tuple"
43+
(
44+
parquet_columns,
45+
subform,
46+
actual_paths,
47+
fs,
48+
subrg,
49+
col_counts,
50+
metadata,
51+
uuid,
52+
) = results
53+
assert uuid is not None, "UUID should be present when calculate_uuid is True"
54+
print("uuid:", uuid)
55+
else:
56+
assert len(results) == 7, "Expected 7 items in the result tuple"
57+
parquet_columns, subform, actual_paths, fs, subrg, col_counts, metadata = (
58+
results
59+
)

0 commit comments

Comments
 (0)