aodn_cloud_optimised/bin/create_dataset_config.py
Lines changed: 78 additions & 24 deletions
@@ -43,10 +43,12 @@
 import uuid
 from collections import OrderedDict
 from importlib.resources import files
+from urllib.parse import urlparse

 import nbformat
 import pandas as pd
 import polars as pl
+import pyarrow.dataset as ds
 import pyarrow.parquet as pq
 import s3fs
 from s3path import PureS3Path
@@ -813,28 +815,54 @@ def main():
             regex_filter = [".*\\.csv$"]

         case ".parquet":
-            with fs.open(fp, "rb") as f:
-                schema = pq.read_schema(f)
-            dataset_config_schema = dict()
-
-            for field in schema:
-
-                # Extract core schema information
-                dataset_config_schema[field.name] = {
-                    "type": str(field.type),
-                    "nullable": str(field.nullable),
-                }
+            dataset_config_schema = dict()

-                # Extract additional metadata if it exists
-                if isinstance(field.metadata, dict):
-                    dataset_config_schema[field.name].update(
-                        {
-                            key.decode(): value.decode()
-                            for key, value in field.metadata.items()
-                        }
-                    )
+            # TODO: at this stage, we don't know yet if it's a hive or single parquet file. Could add another option in the create_dataset_config script for parquet only.
+            if fs.isfile(fp):
+                # Try reading as a single Parquet file
+                with fs.open(fp, "rb") as f:
+                    schema = pq.read_schema(f)
+
+            parquet_partitioning = None
+            if fs.isdir(fp):
+                # If that fails, assume it's a Hive-partitioned dataset
+
+                # Strip "s3://" if present, since s3fs expects only the key
+                parsed = urlparse(fp)
+                # TODO: this works but seems very ugly. Need to improve
+                dataset_path = (
+                    f"{parsed.netloc}{parsed.path}"  # ✅ keep the leading slash
+                )
+                parquet_partitioning = "hive"
+                dataset = ds.dataset(
+                    dataset_path,
+                    format="parquet",
+                    partitioning=parquet_partitioning,
+                    filesystem=fs,
+                )
+                schema = dataset.schema
+
+            for field in schema:
+                # Extract core schema information
+                dataset_config_schema[field.name] = {
+                    "type": str(field.type),
+                    "nullable": str(field.nullable),
+                }
+
+                # Extract additional metadata if it exists
+                if isinstance(field.metadata, dict):
+                    dataset_config_schema[field.name].update(
+                        {
+                            key.decode(): value.decode()
+                            for key, value in field.metadata.items()
+                        }
+                    )
+
+        case ".zarr":
+            # TODO: implement a zarr reader

-            regex_filter = [".*\\.parquet$"]
+            raise NotImplementedError(
+                f"input file type `{obj_key_suffix}` not yet implemented"
(parameter documentation hunk from elsewhere in this diff; the file header is truncated in this view)
-year_range: Year filter: None, one year, or a two-year inclusive range, or a list of exclusive years to process.
+type: Type of dataset. Can be "files", "parquet", or "zarr".
+partitioning: Optional, used only for Parquet datasets (e.g., "hive").
+filter: List of regex patterns to filter files (only valid for type="files").
+year_range: Optional year filter: None, one year, a two-year inclusive range, or a list of exclusive years to process (only valid for type="files").
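For illustration only, here is how those documented options might be combined at call time. The function itself is not visible in this truncated hunk, so process_dataset below is a hypothetical stand-in that only reuses the documented parameter names.

# Hypothetical stand-in; the real function name and signature are not shown
# in this truncated hunk.
def process_dataset(type, partitioning=None, filter=None, year_range=None):
    ...

# Hive-partitioned Parquet input: "partitioning" applies, "filter" and
# "year_range" do not.
process_dataset(type="parquet", partitioning="hive")

# Plain file listing: the regex filter and year range only apply here.
process_dataset(
    type="files",
    filter=[".*\\.csv$"],
    year_range=[2010, 2015],  # two-year inclusive range, per the docstring
)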