 import importlib.util
 import json
 import os
+import pathlib
 import uuid
 from collections import OrderedDict
 from importlib.resources import files
 
 import nbformat
 import pandas as pd
+import pyarrow as pa
+import pyarrow.parquet as pq
 import s3fs
 from s3path import PureS3Path
 from termcolor import colored
@@ -734,21 +737,19 @@ def main():
 
     # Handle S3 path
     if args.file.startswith("s3://"):
-        nc_file = args.file
-        p = PureS3Path.from_uri(nc_file)
-        bucket = p.bucket
-        obj_key = str(p.key)
+        fp = args.file
+        s3_path = PureS3Path.from_uri(fp)
+        bucket = s3_path.bucket
+        obj_key = str(s3_path.key)
     else:
         obj_key = args.file
         bucket = args.bucket
-        nc_file = (
-            PureS3Path.from_uri(f"s3://{args.bucket}").joinpath(args.file).as_uri()
-        )
+        fp = PureS3Path.from_uri(f"s3://{args.bucket}").joinpath(args.file).as_uri()
 
     # Create an empty NetCDF with NaN variables alongside the JSON files. Acts as the source of truth for restoring missing dimensions.
     # only useful for Zarr to concatenate NetCDF together with missing var/dim in some NetCDF files
     if args.cloud_format == "zarr":
-        nc_nullify_path = nullify_netcdf_variables(nc_file, args.dataset_name)
+        nc_nullify_path = nullify_netcdf_variables(fp, args.dataset_name)
 
     # optional s3fs options
     if args.s3fs_opts:
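For reference, a minimal sketch (not part of the commit) of how s3path splits an S3 URI into the bucket and object key that the refactored branch above relies on; the bucket and key names here are made up:

from s3path import PureS3Path

fp = "s3://example-bucket/imos/data/file_001.nc"  # hypothetical URI
s3_path = PureS3Path.from_uri(fp)
s3_path.bucket            # "example-bucket"
str(s3_path.key)          # "imos/data/file_001.nc"
s3_path.parent.as_uri()   # "s3://example-bucket/imos/data"

# Rebuilding the URI from a bucket and a relative key, as the else branch does:
PureS3Path.from_uri("s3://example-bucket").joinpath("imos/data/file_001.nc").as_uri()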
@@ -758,43 +759,61 @@ def main():
         anon=False,
     )
 
-    # Generate schema based on input type (NetCDF or CSV)
-    if obj_key.lower().endswith(".csv"):
-        csv_file = nc_file  # TODO: rename
-
-        csv_opts = json.loads(args.csv_opts) if args.csv_opts else {}
-        with fs.open(csv_file, "rb") as f:
-            df = pd.read_csv(f, **csv_opts)
-
-        dataset_config_schema = {"type": "object", "properties": {}}
-        for col, dtype in df.dtypes.items():
-            if pd.api.types.is_integer_dtype(dtype):
-                js_type = "integer"
-            elif pd.api.types.is_float_dtype(dtype):
-                js_type = "number"
-            elif pd.api.types.is_bool_dtype(dtype):
-                js_type = "boolean"
-            elif pd.api.types.is_object_dtype(dtype) | pd.api.types.is_string_dtype(
-                dtype
-            ):
-                js_type = "string"
-            else:
-                raise NotImplementedError(
-                    f"found dtype that did not fit into configured categories: `{dtype}`"
-                )
+    # Route by file type
+    obj_key_suffix = pathlib.Path(obj_key.lower()).suffix
+    match obj_key_suffix:
 
-            dataset_config_schema["properties"][col] = {"type": js_type}
+        case ".nc":
 
-    elif obj_key.lower().endswith(".nc"):
-        # Generate JSON schema from the NetCDF file
-        temp_file_path = generate_json_schema_from_s3_netcdf(
-            nc_file, cloud_format=args.cloud_format, s3_fs=fs
-        )
-        with open(temp_file_path, "r") as file:
-            dataset_config_schema = json.load(file)
-        os.remove(temp_file_path)
-    else:
-        raise NotImplementedError(f"input file type `{obj_key}` not implemented")
+            # Generate JSON schema from the NetCDF file
+            temp_file_path = generate_json_schema_from_s3_netcdf(
+                fp, cloud_format=args.cloud_format, s3_fs=fs
+            )
+            with open(temp_file_path, "r") as file:
+                dataset_config_schema = json.load(file)
+            os.remove(temp_file_path)
+
+        case ".csv":
+
+            # Load the CSV using options
+            csv_opts = json.loads(args.csv_opts) if args.csv_opts else {}
+            with fs.open(fp, "rb") as f:
+                df = pd.read_csv(f, **csv_opts)
+
+            # Update column types for the dataset config
+            dataset_config_schema = dict()
+            for col, dtype in df.dtypes.items():
+                if pd.api.types.is_integer_dtype(dtype):
+                    js_type = "integer"
+                elif pd.api.types.is_float_dtype(dtype):
+                    js_type = "number"
+                elif pd.api.types.is_bool_dtype(dtype):
+                    js_type = "boolean"
+                elif pd.api.types.is_object_dtype(dtype) | pd.api.types.is_string_dtype(
+                    dtype
+                ):
+                    js_type = "string"
+                else:
+                    raise NotImplementedError(
+                        f"found dtype that did not fit into configured categories: `{dtype}`"
+                    )
+
+                dataset_config_schema[col] = {"type": js_type}
+
+        case ".parquet":
+
+            with fs.open(fp, "rb") as f:
+                schema = pq.read_schema(f)
+            dataset_config_schema = dict()
+
+            for field in schema:
+                dataset_config_schema[field.name] = {"type": str(field.type)}
+
+        # Default: raise NotImplementedError
+        case _:
+            raise NotImplementedError(
+                f"input file type `{obj_key_suffix}` not implemented"
+            )
 
     dataset_config = {"schema": dataset_config_schema}
     # Define the path to the validation schema file
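As an illustration of the new ".parquet" case, here is a minimal, self-contained sketch of the pyarrow schema-to-config mapping, using an in-memory buffer in place of the S3 object; the table and column names are made up:

import io

import pyarrow as pa
import pyarrow.parquet as pq

# Hypothetical two-column table standing in for a real parquet file on S3.
table = pa.table({"TEMP": [24.1, 24.3], "SITE": ["site-1", "site-2"]})
buf = io.BytesIO()
pq.write_table(table, buf)
buf.seek(0)

schema = pq.read_schema(buf)
dataset_config_schema = {field.name: {"type": str(field.type)} for field in schema}
# -> {"TEMP": {"type": "double"}, "SITE": {"type": "string"}}

Note that, unlike the CSV case, which maps pandas dtypes to JSON-schema types ("integer", "number", "boolean", "string"), the parquet case records pyarrow type names such as "double" directly.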
@@ -835,7 +854,7 @@ def main():
         "mode": f"{TO_REPLACE_PLACEHOLDER}",
         "restart_every_path": False,
     }
-    parent_s3_path = PureS3Path.from_uri(nc_file).parent.as_uri()
+    parent_s3_path = PureS3Path.from_uri(fp).parent.as_uri()
     dataset_config["run_settings"]["paths"] = [
         {"s3_uri": parent_s3_path, "filter": [".*\\.nc"], "year_range": []}
     ]
@@ -941,9 +960,7 @@ def main():
     with open(f"{module_path}/config/dataset/{args.dataset_name}.json", "w") as f:
         json.dump(dataset_config, f, indent=2)
 
-    create_dataset_script(
-        args.dataset_name, f"{args.dataset_name}.json", nc_file, bucket
-    )
+    create_dataset_script(args.dataset_name, f"{args.dataset_name}.json", fp, bucket)
     update_pyproject_toml(args.dataset_name)
 
     # fill up aws registry with GN3 uuid