Skip to content

Commit 1b40153

Browse files
authored
Merge pull request #234 from aodn/224-co-pull-metadata-from-input-parquet-files
224 co pull metadata from input parquet files
2 parents e0cfb89 + b8ef16f commit 1b40153

File tree

6 files changed

+237
-101
lines changed

6 files changed

+237
-101
lines changed

.pre-commit-config.yaml

Lines changed: 5 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -71,13 +71,10 @@ repos:
7171
types: [json]
7272
pass_filenames: false
7373
always_run: true
74-
- id: check-data-query-version-bump
75-
name: Check version bump in DataQuery.py
76-
entry: .pre-commit-hooks/check_version_bump.sh
77-
language: system
78-
pass_filenames: false
79-
- id: check-protected-env-vars
80-
name: Ensure protected env vars unchanged in DataQuery.py
81-
entry: .pre-commit-hooks/check_protected_env_vars.sh
74+
- id: check-data-query-protected-vars
75+
name: Validate DataQuery.py
76+
entry: python3 aodn_cloud_optimised/bin/validate_data_query.py
8277
language: system
78+
types: [json]
8379
pass_filenames: false
80+
always_run: true

.pre-commit-hooks/check_protected_env_vars.sh

Lines changed: 0 additions & 40 deletions
This file was deleted.

.pre-commit-hooks/check_version_bump.sh

Lines changed: 0 additions & 27 deletions
This file was deleted.

aodn_cloud_optimised/bin/create_dataset_config.py

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -818,7 +818,21 @@ def main():
818818
dataset_config_schema = dict()
819819

820820
for field in schema:
821-
dataset_config_schema[field.name] = {"type": str(field.type)}
821+
822+
# Extract core schema information
823+
dataset_config_schema[field.name] = {
824+
"type": str(field.type),
825+
"nullable": str(field.nullable),
826+
}
827+
828+
# Extract additional metadata if it exists
829+
if isinstance(field.metadata, dict):
830+
dataset_config_schema[field.name].update(
831+
{
832+
key.decode(): value.decode()
833+
for key, value in field.metadata.items()
834+
}
835+
)
822836

823837
regex_filter = [".*\\.parquet$"]
824838

Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,97 @@
1+
import datetime
2+
import pathlib
3+
import re
4+
import subprocess
5+
6+
import aodn_cloud_optimised.lib.DataQuery
7+
8+
# Regex pattern to find and capture the version string
9+
# Matches a whole line of the form `__version__ = "<value>"`; group 1 is the
# text between the double quotes. Single-quoted values and lines carrying a
# trailing comment are not matched.
VERSION_PATTERN = re.compile(r'^__version__\s*=\s*"(.*)"\s*$')
10+
11+
12+
def get_file_content(ref: str, file_path: str) -> str | None:
    """
    Retrieves the content of a specific file at a given git reference.

    Args:
        ref: Git revision to read from (e.g. ``"HEAD"``). An empty string
            yields ``git show :<file_path>``, which reads the copy of the
            file stored in the index (the staged content).
        file_path: Path of the file, relative to the repository root.

    Returns:
        The file content as text.

    Raises:
        subprocess.CalledProcessError: if ``git show`` exits with a non-zero
            status (for example when the path does not exist at ``ref``).
    """

    # Use git show to get the file content
    result = subprocess.run(
        ["git", "show", f"{ref}:{file_path}"],
        capture_output=True,
        check=True,
        text=True,
        # Python source files are UTF-8 by default; decode explicitly rather
        # than relying on the locale encoding, which may differ (e.g. cp1252).
        encoding="utf-8",
    )
    return result.stdout
25+
26+
27+
def extract_version(content: str) -> str | None:
28+
"""
29+
Extracts the __version__ string from the file content using a regex.
30+
"""
31+
32+
for line in content.splitlines():
33+
match = VERSION_PATTERN.match(line.strip())
34+
if match:
35+
36+
# Return the captured group (the version string)
37+
return match.group(1)
38+
39+
return None
40+
41+
42+
def main():
    """
    Validation of Data Query global vars.

    These often get tinkered with in local development of notebooks.

    We must enforce they are set back to normal before commits.

    The function returns without doing anything when
    ``aodn_cloud_optimised/lib/DataQuery.py`` is not among the staged files.
    Otherwise it checks that ``__version__`` differs between the staged copy
    and ``HEAD``, checks the protected module variables against their expected
    values, and prints the module version.

    Raises:
        ValueError: if ``__version__`` cannot be found in the staged file, if
            it is unchanged relative to ``HEAD``, or if any protected variable
            differs from its expected value.
        subprocess.CalledProcessError: if a git command exits with a non-zero
            status.
    """
    data_query_path = "aodn_cloud_optimised/lib/DataQuery.py"

    # Check DataQuery.py is in the git diff
    staged_files = subprocess.run(
        args=["git", "diff", "--cached", "--name-only"],
        check=True,
        text=True,
        capture_output=True,
    ).stdout.splitlines()

    # Nothing to validate if DataQuery.py is not in the changes
    if data_query_path not in staged_files:
        return

    # Extract versions; an empty ref reads the staged (index) copy of the file
    staged_version = extract_version(
        get_file_content(ref="", file_path=data_query_path)
    )
    head_version = extract_version(
        get_file_content(ref="HEAD", file_path=data_query_path)
    )

    # A staged file without a version line must not pass as a "bump"
    if staged_version is None:
        raise ValueError(
            f"DataQuery.__version__ could not be found in staged {data_query_path}"
        )

    if staged_version == head_version:
        raise ValueError(
            f"DataQuery.__version__ must be updated. Bump from `{head_version}`"
        )

    # Check the variables align to expected. Explicit checks are used rather
    # than `assert`, which is removed when Python runs with -O.
    # NOTE(review): these values are read from the imported module, i.e. the
    # copy Python resolves at import time, which is not necessarily the staged
    # content -- confirm this is the intended source of truth.
    data_query = aodn_cloud_optimised.lib.DataQuery
    expected_values = {
        "REGION": "ap-southeast-2",
        "ENDPOINT_URL": "https://s3.ap-southeast-2.amazonaws.com",
        "BUCKET_OPTIMISED_DEFAULT": "aodn-cloud-optimised",
        "ROOT_PREFIX_CLOUD_OPTIMISED_PATH": "",
        "DEFAULT_TIME": datetime.datetime(1900, 1, 1),
    }
    mismatches = []
    for name, expected in expected_values.items():
        actual = getattr(data_query, name)
        if actual != expected:
            mismatches.append(f"{name} = {actual!r} (expected {expected!r})")

    if mismatches:
        raise ValueError(
            "DataQuery protected variables must be reset before committing: "
            + "; ".join(mismatches)
        )

    print(data_query.__version__)
94+
95+
96+
# Run the validation only when this file is executed as a script, not when it
# is imported.
if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)