
Commit fd2e18d

Merge pull request #222 from aodn/165-rnd-create-a-co-like-partitioned-parquet-file-v2
165 rnd create a co like partitioned parquet file v2
2 parents 7b54630 + 933b960 commit fd2e18d

7 files changed: +586 −69 lines changed


aodn_cloud_optimised/bin/create_dataset_config.py

Lines changed: 62 additions & 48 deletions
@@ -37,12 +37,15 @@
 import importlib.util
 import json
 import os
+import pathlib
 import uuid
 from collections import OrderedDict
 from importlib.resources import files
 
 import nbformat
 import pandas as pd
+import pyarrow as pa
+import pyarrow.parquet as pq
 import s3fs
 from s3path import PureS3Path
 from termcolor import colored
@@ -734,21 +737,19 @@ def main():
 
     # Handle S3 path
     if args.file.startswith("s3://"):
-        nc_file = args.file
-        p = PureS3Path.from_uri(nc_file)
-        bucket = p.bucket
-        obj_key = str(p.key)
+        fp = args.file
+        s3_path = PureS3Path.from_uri(fp)
+        bucket = s3_path.bucket
+        obj_key = str(s3_path.key)
     else:
         obj_key = args.file
         bucket = args.bucket
-        nc_file = (
-            PureS3Path.from_uri(f"s3://{args.bucket}").joinpath(args.file).as_uri()
-        )
+        fp = PureS3Path.from_uri(f"s3://{args.bucket}").joinpath(args.file).as_uri()
 
     # Create an empty NetCDF with NaN variables alongside the JSON files. Acts as the source of truth for restoring missing dimensions.
     # only useful for Zarr to concatenate NetCDF together with missing var/dim in some NetCDF files
     if args.cloud_format == "zarr":
-        nc_nullify_path = nullify_netcdf_variables(nc_file, args.dataset_name)
+        nc_nullify_path = nullify_netcdf_variables(fp, args.dataset_name)
 
     # optionals s3fs options
     if args.s3fs_opts:
@@ -758,43 +759,58 @@ def main():
             anon=False,
         )
 
-    # Generate schema based on input type (NetCDF or CSV)
-    if obj_key.lower().endswith(".csv"):
-        csv_file = nc_file  # TODO: rename
-
-        csv_opts = json.loads(args.csv_opts) if args.csv_opts else {}
-        with fs.open(csv_file, "rb") as f:
-            df = pd.read_csv(f, **csv_opts)
-
-        dataset_config_schema = {"type": "object", "properties": {}}
-        for col, dtype in df.dtypes.items():
-            if pd.api.types.is_integer_dtype(dtype):
-                js_type = "integer"
-            elif pd.api.types.is_float_dtype(dtype):
-                js_type = "number"
-            elif pd.api.types.is_bool_dtype(dtype):
-                js_type = "boolean"
-            elif pd.api.types.is_object_dtype(dtype) | pd.api.types.is_string_dtype(
-                dtype
-            ):
-                js_type = "string"
-            else:
-                raise NotImplementedError(
-                    f"found dtype that did not fit into configured categories: `{dtype}`"
-                )
+    # Route by file type
+    obj_key_suffix = pathlib.Path(obj_key.lower()).suffix
+    match obj_key_suffix:
+        case ".nc":
 
-            dataset_config_schema["properties"][col] = {"type": js_type}
-
-    elif obj_key.lower().endswith(".nc"):
-        # Generate JSON schema from the NetCDF file
-        temp_file_path = generate_json_schema_from_s3_netcdf(
-            nc_file, cloud_format=args.cloud_format, s3_fs=fs
-        )
-        with open(temp_file_path, "r") as file:
-            dataset_config_schema = json.load(file)
-        os.remove(temp_file_path)
-    else:
-        raise NotImplementedError(f"input file type `{obj_key}` not implemented")
+            # Generate JSON schema from the NetCDF file
+            temp_file_path = generate_json_schema_from_s3_netcdf(
+                fp, cloud_format=args.cloud_format, s3_fs=fs
+            )
+            with open(temp_file_path, "r") as file:
+                dataset_config_schema = json.load(file)
+            os.remove(temp_file_path)
+
+        case ".csv":
+
+            csv_opts = json.loads(args.csv_opts) if args.csv_opts else {}
+            with fs.open(fp, "rb") as f:
+                df = pd.read_csv(f, **csv_opts)
+
+            dataset_config_schema = {"type": "object", "properties": {}}
+            for col, dtype in df.dtypes.items():
+                if pd.api.types.is_integer_dtype(dtype):
+                    js_type = "integer"
+                elif pd.api.types.is_float_dtype(dtype):
+                    js_type = "number"
+                elif pd.api.types.is_bool_dtype(dtype):
+                    js_type = "boolean"
+                elif pd.api.types.is_object_dtype(dtype) | pd.api.types.is_string_dtype(
+                    dtype
+                ):
+                    js_type = "string"
+                else:
+                    raise NotImplementedError(
+                        f"found dtype that did not fit into configured categories: `{dtype}`"
+                    )
+
+                dataset_config_schema["properties"][col] = {"type": js_type}
+
+        case ".parquet":
+
+            with fs.open(fp, "rb") as f:
+                schema = pq.read_schema(f)
+            dataset_config_schema = dict()
+
+            for field in schema:
+                dataset_config_schema[field.name] = {"type": str(field.type)}
+
+        # Default: Raise NotImplemented
+        case _:
+            raise NotImplementedError(
+                f"input file type `{obj_key_suffix}` not implemented"
+            )
 
     dataset_config = {"schema": dataset_config_schema}
     # Define the path to the validation schema file
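A minimal standalone sketch of what the new `.parquet` branch produces: `pyarrow.parquet.read_schema` is read once and each field is mapped to a `{"type": ...}` entry. The file name and columns below are hypothetical, and the file is written and read locally rather than through `s3fs`:

```python
import pyarrow as pa
import pyarrow.parquet as pq

# Hypothetical table; the real script reads the object from S3 via s3fs.
table = pa.table({
    "TIME": pa.array([1, 2], type=pa.int64()),
    "TEMP": pa.array([12.3, 12.5], type=pa.float64()),
})
pq.write_table(table, "example.parquet")

# Mirrors the `case ".parquet":` branch above
schema = pq.read_schema("example.parquet")
dataset_config_schema = {field.name: {"type": str(field.type)} for field in schema}
print(dataset_config_schema)
# {'TIME': {'type': 'int64'}, 'TEMP': {'type': 'double'}}
```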
@@ -835,7 +851,7 @@ def main():
         "mode": f"{TO_REPLACE_PLACEHOLDER}",
         "restart_every_path": False,
     }
-    parent_s3_path = PureS3Path.from_uri(nc_file).parent.as_uri()
+    parent_s3_path = PureS3Path.from_uri(fp).parent.as_uri()
     dataset_config["run_settings"]["paths"] = [
         {"s3_uri": parent_s3_path, "filter": [".*\\.nc"], "year_range": []}
     ]
@@ -941,9 +957,7 @@ def main():
     with open(f"{module_path}/config/dataset/{args.dataset_name}.json", "w") as f:
         json.dump(dataset_config, f, indent=2)
 
-    create_dataset_script(
-        args.dataset_name, f"{args.dataset_name}.json", nc_file, bucket
-    )
+    create_dataset_script(args.dataset_name, f"{args.dataset_name}.json", fp, bucket)
     update_pyproject_toml(args.dataset_name)
 
     # fill up aws registry with GN3 uuid

aodn_cloud_optimised/lib/GenericParquetHandler.py

Lines changed: 144 additions & 12 deletions
@@ -1,6 +1,7 @@
 import gc
 import importlib.resources
 import os
+import pathlib
 import re
 import timeit
 import traceback
@@ -15,6 +16,7 @@
 import pandas as pd
 import pyarrow as pa
 import pyarrow.parquet as pq
+import s3fs.core
 import xarray as xr
 from dask.distributed import wait
 from shapely.geometry import Point, Polygon
@@ -226,8 +228,60 @@ def preprocess_data_netcdf(
                 f"{self.uuid_log}: The NetCDF file does not conform to the pre-defined schema."
             )
 
+    def preprocess_data_parquet(
+        self, parquet_fp
+    ) -> Generator[Tuple[pd.DataFrame, xr.Dataset], None, None]:
+        """
+        Preprocesses a parquet file using pyarrow and converts it into an xarray Dataset based on the dataset configuration.
+
+        Args:
+            parquet_fp (str or s3fs.core.S3File): File path or s3fs object of the parquet file to be processed.
+
+        Yields:
+            Tuple[pd.DataFrame, xr.Dataset]: A generator yielding a tuple containing the processed pandas DataFrame
+            and its corresponding xarray Dataset.
+
+        This method reads a parquet file (`parquet_fp`) using the pyarrow.parquet `read_table` function.
+
+        The resulting DataFrame (`df`) is then converted into an xarray Dataset using `xr.Dataset.from_dataframe()`.
+
+        # TODO: Document `pq.read_table` options
+
+        The method also uses the 'schema' from the dataset configuration to assign attributes to variables in the
+        xarray Dataset. Each variable's attributes are extracted from the 'schema' and assigned to the Dataset variable's
+        attributes. The 'type' attribute from the `pyarrow_schema` is removed from the Dataset variables' attributes since it
+        is considered unnecessary.
+
+        If a variable in the Dataset is not found in the schema, an error is logged.
+
+        Notes:
+            Ensure that the config schema includes a column named "index" of type int64. When the internal conversions
+            occur between xarray, pandas and pyarrow, an "index" column is added to the pyarrow table. Rather than
+            detect when "index" should not have been added, it is easier to add "index" as an expected column that is
+            added by the cloud optimisation process.
+        """
+
+        table = pq.read_table(parquet_fp)
+        df = table.to_pandas()
+        df = df.drop(columns=self.drop_variables, errors="ignore")
+        ds = xr.Dataset.from_dataframe(df)
+
+        for var in ds.variables:
+            if var not in self.schema:
+                self.logger.error(
+                    f"{self.uuid_log}: Missing variable: {var} from dataset config"
+                )
+            else:
+                ds[var].attrs = self.schema.get(var)
+                del ds[var].attrs[
+                    "type"
+                ]  # remove the type attribute which is not necessary at all
+
+        yield df, ds
+
     def preprocess_data(
-        self, fp
+        self,
+        fp: str | s3fs.core.S3File,
     ) -> Generator[Tuple[pd.DataFrame, xr.Dataset], None, None]:
         """
         Overwrites the preprocess_data method from CommonHandler.
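A self-contained sketch of the round trip `preprocess_data_parquet` relies on (Parquet → pandas → xarray), and of one place the extra "index" column mentioned in the Notes comes from: `xr.Dataset.from_dataframe()` promotes an unnamed DataFrame index to a dimension and variable called `index`. Column names and the local file are hypothetical; schema/attribute handling is omitted:

```python
import pyarrow as pa
import pyarrow.parquet as pq
import xarray as xr

pq.write_table(pa.table({"TIME": [0, 1], "TEMP": [12.3, 12.5]}), "example.parquet")

# Same chain as preprocess_data_parquet, without the schema/attrs handling
df = pq.read_table("example.parquet").to_pandas()
ds = xr.Dataset.from_dataframe(df)

print(ds.sizes)                  # the unnamed RangeIndex becomes an "index" dimension
print("index" in ds.variables)   # True -> hence "index" is expected in the config schema
```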
@@ -239,12 +293,31 @@ def preprocess_data(
             tuple: A tuple containing DataFrame and Dataset.
 
         If `fp` ends with ".nc", it delegates to `self.preprocess_data_netcdf(fp)`.
-        If `fp` ends with ".csv", it delegates to `self.preprocess_data_csv(fp)`.
+        Elif `fp` ends with ".csv", it delegates to `self.preprocess_data_csv(fp)`.
+        Elif `fp` ends with ".parquet", it delegates to `self.preprocess_data_parquet(fp)`.
+        Else raises a NotImplementedError
+
+        Raises:
+            NotImplementedError: Where the file type is not yet implemented
         """
-        if fp.path.endswith(".nc"):
-            return self.preprocess_data_netcdf(fp)
-        if fp.path.endswith(".csv"):
-            return self.preprocess_data_csv(fp)
+        # Extract file suffix
+        if isinstance(fp, str):
+            file_suffix = pathlib.Path(fp).suffix
+        elif isinstance(fp, s3fs.core.S3File):
+            file_suffix = pathlib.Path(fp.path).suffix
+
+        # Match preprocess method
+        match file_suffix.lower():
+            case ".nc":
+                return self.preprocess_data_netcdf(fp)
+            case ".csv":
+                return self.preprocess_data_csv(fp)
+            case ".parquet":
+                return self.preprocess_data_parquet(fp)
+            case _:
+                raise NotImplementedError(
+                    f"files with suffix `{file_suffix}` not yet implemented in preprocess_data"
+                )
 
     @staticmethod
     def cast_table_by_schema(table, schema) -> pa.Table:
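The dispatch above keys on the file suffix rather than on `fp.path`, so plain string paths now work as well as `s3fs` file handles. A tiny sketch of the suffix normalisation, with hypothetical paths:

```python
import pathlib

for path in ["s3://bucket/data/file.nc", "data/profile.CSV", "s3://bucket/part-0.parquet"]:
    # Same normalisation as preprocess_data: take the suffix, then lower-case it
    print(path, "->", pathlib.Path(path).suffix.lower())
# s3://bucket/data/file.nc -> .nc
# data/profile.CSV -> .csv
# s3://bucket/part-0.parquet -> .parquet
```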
@@ -396,9 +469,9 @@ def _add_polygon(self, df: pd.DataFrame) -> pd.DataFrame:
             self.logger.warning(
                 f"{self.uuid_log}: The NetCDF contains NaN values of {geo_var}. Removing corresponding data"
             )
-            df = df.dropna(
-                subset=[geo_var]
-            ).reset_index()  # .reset_index(drop=True)
+            df = df.dropna(subset=[geo_var]).reset_index(
+                drop=False
+            )  # For now leaving drop false to ensure no breaking changes
 
         point_geometry = [
             Point(lon, lat) for lon, lat in zip(df[lon_varname], df[lat_varname])
@@ -451,9 +524,11 @@ def _add_timestamp_df(self, df: pd.DataFrame, f) -> pd.DataFrame:
             if item.get("time_extent") is not None:
                 timestamp_info = item
 
+        # Extract time partition information
         timestamp_varname = timestamp_info.get("source_variable")
         time_varname = timestamp_info["time_extent"].get("time_varname", "TIME")
         partition_period = timestamp_info["time_extent"].get("partition_period")
+
         # look for the variable or column with datetime64 type
         if isinstance(df.index, pd.MultiIndex) and (time_varname in df.index.names):
             # for example, files with timeSeries and TIME dimensions such as
@@ -476,6 +551,37 @@ def _add_timestamp_df(self, df: pd.DataFrame, f) -> pd.DataFrame:
             if pd.api.types.is_datetime64_any_dtype(df.index):
                 datetime_var = df.index
 
+        # Finally attempt to validate the defined time partition column
+        if "datetime_var" not in locals():
+
+            # Else look for the time columns with a different time related dtype
+            time_partition_column = df[time_varname]
+
+            # Validate no missing values
+            if time_partition_column.isnull().any():
+                raise ValueError(
+                    "time partition column may not contain null values"
+                )
+
+            # Validate that the time partition column translated via pd.to_datetime
+            try:
+                pd.to_datetime(time_partition_column)
+            except Exception as e:
+                raise ValueError(
+                    f"time partition column failed to translate to pandas datetime dtype: {e}"
+                )
+
+            # Because the df does not have a date time index, we have to create and fill the column in separately here
+            datetime_index = pd.DatetimeIndex(pd.to_datetime(time_partition_column))
+            df[timestamp_varname] = (
+                np.int64(datetime_index.to_period(partition_period).to_timestamp())
+                / 10**9
+            )
+            return df
+
+        if "datetime_var" not in locals():
+            raise ValueError("could not determine the datetime column/variable")
+
         if not isinstance(df.index, pd.MultiIndex) and (time_varname in df.index.names):
             today = datetime.today()
             # assume that todays year + 1 is the future, and no in-situ data should be in the future, since we're not dealing
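A standalone sketch of the fallback added above: validate a non-datetime time column, then collapse each timestamp to the start of its partition period and express it as epoch seconds. The values and the `"M"` (monthly) period are hypothetical, and `.astype("int64")` is used here for the array-wide integer conversion; the committed code wraps the same expression with `np.int64`:

```python
import pandas as pd

time_partition_column = pd.Series(["2021-03-05T10:00:00", "2021-07-20T23:30:00"])

# Validation mirrors the new branch: no nulls, and convertible to datetime
assert not time_partition_column.isnull().any()
datetime_index = pd.DatetimeIndex(pd.to_datetime(time_partition_column))

# Collapse to the start of the partition period, then convert to epoch seconds
partition_period = "M"
epoch_seconds = (
    datetime_index.to_period(partition_period).to_timestamp().astype("int64") / 10**9
)
print(epoch_seconds.tolist())  # [1614556800.0, 1625097600.0] -> 2021-03-01, 2021-07-01
```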
@@ -661,13 +767,16 @@ def _rm_bad_timestamp_df(self, df: pd.DataFrame, f) -> pd.DataFrame:
         timestamp_varname = timestamp_info.get("source_variable")
         time_varname = timestamp_info["time_extent"].get("time_varname", "TIME")
 
-        if any(df[timestamp_varname] <= 0):
+        # Check any timestamps are before `1900-01-01 00:00:00`
+        if any(df[timestamp_varname] < -2208988800):
             self.logger.warning(
                 f"{self.uuid_log}: {f.path}: Bad values detected in {time_varname} time variable. Trimming corresponding data."
             )
-            df2 = df[df[timestamp_varname] > 0].copy()
+            df2 = df[df[timestamp_varname] >= -2208988800].copy()
             df = df2
-            df = df.reset_index()
+            df = df.reset_index(
+                drop=False
+            )  # For now leaving drop false to ensure no breaking changes
 
         if df.empty:
             self.logger.error(
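The new cutoff `-2208988800` is `1900-01-01 00:00:00 UTC` expressed as seconds since the Unix epoch, so only pre-1900 timestamps are trimmed now rather than anything negative (i.e. pre-1970). For reference:

```python
import pandas as pd

# 1900-01-01 as seconds since the Unix epoch (negative because it pre-dates 1970)
print(pd.Timestamp("1900-01-01", tz="UTC").timestamp())  # -2208988800.0
```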
@@ -788,6 +897,28 @@ def check_var_attributes(self, ds):
         else:
             return True
 
+    def validate_dataset_dimensions(self, ds: xr.Dataset) -> None:
+        """Validate that all dataset dimensions have corresponding variables as defined in the schema.
+        For each dimension present in the dataset (TIME, LATITUDE, LONGITUDE), this function checks whether the
+        dimension is declared in ``dataset_config["schema"]``. If it is, it ensures
+        that a variable of the same name exists in the dataset (for example, a dimension such as id won't be defined). If a required
+        variable is missing, a ``ValueError`` is raised.
+        Args:
+            ds: The xarray Dataset to validate.
+            dataset_config: Configuration dictionary containing a ``"schema"`` key
+                mapping variable names to their definitions.
+        Raises:
+            ValueError: If a dimension is defined in the schema but the corresponding
+                variable is missing in the dataset.
+        """
+        schema = self.dataset_config.get("schema", {})
+
+        for dim in ds.dims:
+            if dim in schema and dim not in ds.variables:
+                raise ValueError(
+                    f"{self.uuid_log}: Dimension '{dim}' is defined in schema but missing as a variable in dataset."
+                )
+
     def publish_cloud_optimised(
         self, df: pd.DataFrame, ds: xr.Dataset, s3_file_handle
     ) -> None:
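A minimal sketch of the condition `validate_dataset_dimensions` enforces, using a hypothetical two-entry schema: any dimension named in the schema must also exist as a variable in the dataset.

```python
import xarray as xr

schema = {"TIME": {"type": "timestamp[ns]"}, "TEMP": {"type": "double"}}  # hypothetical

ds_bad = xr.Dataset({"TEMP": ("TIME", [12.3, 12.5])})   # TIME dimension, no TIME variable
ds_ok = ds_bad.assign_coords(TIME=[0, 1])               # TIME variable now present

for ds in (ds_bad, ds_ok):
    missing = [d for d in ds.dims if d in schema and d not in ds.variables]
    print(missing)  # ['TIME'] for ds_bad, [] for ds_ok -> the handler raises on the first
```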
@@ -805,6 +936,7 @@ def publish_cloud_optimised(
             x["source_variable"]
             for x in self.dataset_config["schema_transformation"]["partitioning"]
         ]
+        self.validate_dataset_dimensions(ds)
         df = self._fix_datetimejulian(df)
         df = self._add_timestamp_df(df, s3_file_handle)
         df = self._add_columns_df(df, ds, s3_file_handle)
