
Commit 40f72cf

Merge pull request #235 from aodn/ProcessInputParquetHive
Feat: improve parquet parser to read hived parquet dataset
2 parents 1b40153 + 52df45f commit 40f72cf
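
The change below teaches the tooling to read the schema of a Hive-partitioned Parquet dataset with pyarrow.dataset rather than pyarrow.parquet.read_schema on a single file. As background, a minimal standalone sketch of that pattern (the bucket, key and anonymous access below are assumptions, not taken from this repository):

import pyarrow.dataset as ds
import s3fs

# Hypothetical public bucket holding a Hive layout such as
# .../example_dataset.parquet/year=2024/month=01/part-0.parquet
fs = s3fs.S3FileSystem(anon=True)

dataset = ds.dataset(
    "example-bucket/products/example_dataset.parquet",  # bucket/key, no "s3://" prefix
    format="parquet",
    partitioning="hive",  # partition columns are inferred from directory names
    filesystem=fs,
)

# The unified schema includes both file columns and inferred partition columns.
for field in dataset.schema:
    print(field.name, field.type, field.nullable)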

File tree

4 files changed, +260 -78 lines changed


aodn_cloud_optimised/bin/create_dataset_config.py

Lines changed: 78 additions & 24 deletions
@@ -43,10 +43,12 @@
 import uuid
 from collections import OrderedDict
 from importlib.resources import files
+from urllib.parse import urlparse
 
 import nbformat
 import pandas as pd
 import polars as pl
+import pyarrow.dataset as ds
 import pyarrow.parquet as pq
 import s3fs
 from s3path import PureS3Path
@@ -813,28 +815,54 @@ def main():
             regex_filter = [".*\\.csv$"]
 
         case ".parquet":
-            with fs.open(fp, "rb") as f:
-                schema = pq.read_schema(f)
-            dataset_config_schema = dict()
-
-            for field in schema:
-
-                # Extract core schema information
-                dataset_config_schema[field.name] = {
-                    "type": str(field.type),
-                    "nullable": str(field.nullable),
-                }
+            dataset_config_schema = dict()
 
-                # Extract additional metadata if it exists
-                if isinstance(field.metadata, dict):
-                    dataset_config_schema[field.name].update(
-                        {
-                            key.decode(): value.decode()
-                            for key, value in field.metadata.items()
-                        }
-                    )
+            # TODO: at this stage, we don't know yet if it's a hive or single parquet file. Could add another option in the create_dataset_config script for parquet only.
+            if fs.isfile(fp):
+                # Try reading as a single Parquet file
+                with fs.open(fp, "rb") as f:
+                    schema = pq.read_schema(f)
+
+            parquet_partitioning = None
+            if fs.isdir(fp):
+                # If that fails, assume it's a Hive-partitioned dataset
+
+                # Strip "s3://" if present, since s3fs expects only the key
+                parsed = urlparse(fp)
+                # TODO: this works but seems very ugly. Need to improve
+                dataset_path = (
+                    f"{parsed.netloc}{parsed.path}"  # ✅ keep the leading slash
+                )
+                parquet_partitioning = "hive"
+                dataset = ds.dataset(
+                    dataset_path,
+                    format="parquet",
+                    partitioning=parquet_partitioning,
+                    filesystem=fs,
+                )
+                schema = dataset.schema
+
+            for field in schema:
+                # Extract core schema information
+                dataset_config_schema[field.name] = {
+                    "type": str(field.type),
+                    "nullable": str(field.nullable),
+                }
+
+                # Extract additional metadata if it exists
+                if isinstance(field.metadata, dict):
+                    dataset_config_schema[field.name].update(
+                        {
+                            key.decode(): value.decode()
+                            for key, value in field.metadata.items()
+                        }
+                    )
+        case ".zarr":
+            # TODO: implement a zarr reader
 
-            regex_filter = [".*\\.parquet$"]
+            raise NotImplementedError(
+                f"input file type `{obj_key_suffix}` not yet implemented"
+            )
 
         # Default: Raise NotImplemented
         case _:
@@ -881,10 +909,36 @@ def main():
         "mode": f"{TO_REPLACE_PLACEHOLDER}",
         "restart_every_path": False,
     }
-    parent_s3_path = PureS3Path.from_uri(fp).parent.as_uri()
-    dataset_config["run_settings"]["paths"] = [
-        {"s3_uri": parent_s3_path, "filter": regex_filter, "year_range": []}
-    ]
+
+    match obj_key_suffix:
+        case ".nc" | ".csv":
+            parent_s3_path = PureS3Path.from_uri(fp).parent.as_uri()
+            dataset_config["run_settings"]["paths"] = [
+                {
+                    "type": "files",
+                    "s3_uri": parent_s3_path,
+                    "filter": regex_filter,
+                    "year_range": [],
+                }
+            ]
+        case ".zarr":
+            # TODO: partially implemented
+            parent_s3_path = PureS3Path.from_uri(fp).as_uri()
+            dataset_config["run_settings"]["paths"] = [
+                {
+                    "type": "zarr",
+                    "s3_uri": parent_s3_path,
+                }
+            ]
+        case ".parquet":
+            parent_s3_path = PureS3Path.from_uri(fp).as_uri()
+            dataset_config["run_settings"]["paths"] = [
+                {
+                    "type": "parquet",
+                    "partitioning": parquet_partitioning,
+                    "s3_uri": parent_s3_path,
+                }
+            ]
 
     if args.s3fs_opts:
         dataset_config.setdefault("run_settings", {})["s3_bucket_opts"] = {
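
With the new match on obj_key_suffix above, the generated run_settings no longer uses a regex filter for Parquet inputs; it records the dataset root plus the partitioning scheme instead. A hedged sketch of the emitted structure (the URI is hypothetical; partitioning would be None for a single .parquet file and "hive" for a partitioned dataset):

# Sketch of the "paths" entry written into the dataset config for a parquet input.
run_settings_paths = [
    {
        "type": "parquet",
        "partitioning": "hive",
        "s3_uri": "s3://example-bucket/products/example_dataset.parquet",
    }
]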

aodn_cloud_optimised/bin/generic_cloud_optimised_creation.py

Lines changed: 140 additions & 47 deletions
@@ -72,11 +72,22 @@ class PathConfig(BaseModel):
 
     Attributes:
         s3_uri: S3 URI as a POSIX path string.
-        filter: List of regex patterns to filter files.
-        year_range: Year filter: None, one year, or a two-year inclusive range, or a list of exclusive years to process.
+        type: Type of dataset. Can be "files", "parquet", or "zarr".
+        partitioning: Optional, used only for Parquet datasets (e.g., "hive").
+        filter: List of regex patterns to filter files (only valid for type="files").
+        year_range: Optional Year filter: None, one year, or a two-year inclusive range, or a list of exclusive years to process. (only valid for type="files")
+
     """
 
     s3_uri: str
+    type: Optional[Literal["files", "parquet", "zarr"]] = Field(
+        default=None,
+        description="Dataset type. One of 'files', 'parquet', or 'zarr'. Defaults to 'files' if not specified.",
+    )
+    partitioning: Optional[str] = Field(
+        default=None,
+        description="Partitioning scheme, only valid when type='parquet'. Currently supports 'hive'.",
+    )
     filter: List[str] = Field(
         default_factory=list,
         description="List of regular expression patterns used to filter matching files.",
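
Assuming PathConfig can be imported from this module, the new type and partitioning fields would be used roughly as follows (a sketch based only on the fields shown in this diff; the import path and URIs are assumptions):

from aodn_cloud_optimised.bin.generic_cloud_optimised_creation import PathConfig

# Hive-partitioned Parquet dataset: no regex filter, no year_range
parquet_path = PathConfig(
    type="parquet",
    partitioning="hive",
    s3_uri="s3://example-bucket/products/example_dataset.parquet",
)

# Regular file collection keeps the existing filter/year_range behaviour
files_path = PathConfig(
    type="files",
    s3_uri="s3://example-bucket/raw/netcdf",
    filter=[".*\\.nc$"],
)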
@@ -125,10 +136,11 @@ def validate_s3_uri(cls, v: str) -> str:
         parsed = urlparse(v)
         if not parsed.netloc:
             raise ValueError("s3_uri must include a bucket name after 's3://'")
-        if not parsed.path or parsed.path == "/":
-            raise ValueError(
-                "s3_uri must include a valid key path after the bucket"
-            )
+        # TODO: remove the commented lines below. This used to be a good test, but now dataset could be a parquet hive partitioned at the root of the bucket.
+        # if not parsed.path or parsed.path == "/":
+        #     raise ValueError(
+        #         "s3_uri must include a valid key path after the bucket"
+        #     )
         try:
             PurePosixPath(parsed.path.lstrip("/"))
         except Exception as e:
@@ -151,6 +163,55 @@ def validate_regex(cls, v):
                 raise ValueError(f"Invalid regex: {pattern} ({e})")
         return v
 
+    @model_validator(mode="after")
+    def validate_cross_fields(cls, values):
+        dataset_type = values.type or "files"
+        if values.type is None:
+            warnings.warn(
+                "No 'type' specified in PathConfig. Assuming 'files' as default.",
+                UserWarning,
+                stacklevel=2,
+            )
+            values.type = "files"
+            if (
+                any(".parquet" in f for f in values.filter)
+                or ".parquet" in values.s3_uri
+            ):
+                raise ValueError(
+                    "type must be defined as 'parquet' in run_settings.paths config if ingesting a parquet dataset."
+                )
+            elif any(".zarr" in f for f in values.filter) or ".zarr" in values.s3_uri:
+                raise ValueError(
+                    "type must be defined as 'zarr' in run_settings.paths config if ingesting a zarr dataset."
+                )
+
+        if dataset_type == "parquet":
+            if values.filter:
+                raise ValueError("filter must not be defined when type='parquet'")
+            if values.year_range:
+                raise ValueError("year_range must not be defined when type='parquet'")
+            if values.partitioning not in (None, "hive"):
+                raise ValueError(
+                    f"Invalid partitioning='{values.partitioning}' for parquet dataset. Only 'hive' is supported."
+                )
+
+        elif dataset_type == "zarr":
+            if values.filter:
+                raise ValueError("filter must not be defined when type='zarr'")
+            if values.year_range:
+                raise ValueError("year_range must not be defined when type='zarr'")
+            if values.partitioning:
+                raise ValueError("partitioning is not applicable when type='zarr'")
+
+        elif dataset_type == "files":
+            if values.partitioning:
+                raise ValueError("partitioning is not applicable when type='files'")
+
+        else:
+            raise ValueError(f"Unsupported dataset type: {dataset_type}")
+
+        return values
+
 
 class WorkerOptions(BaseModel):
     """Worker configuration for Coiled clusters.
@@ -1138,67 +1199,99 @@ def load_config_and_validate(config_filename: str) -> DatasetConfig:
     return DatasetConfig.model_validate(dataset_config)
 
 
-def json_update(base: dict, updates: dict) -> dict:
-    """Recursively update nested dictionaries."""
-    for k, v in updates.items():
-        if isinstance(v, dict) and isinstance(base.get(k), dict):
-            base[k] = json_update(base[k], v)
-        else:
-            base[k] = v
-    return base
-
-
 def collect_files(
     path_cfg: PathConfig,
-    suffix: str,
+    suffix: Optional[str],
     exclude: Optional[str],
     bucket_raw: Optional[str],
     s3_client_opts: Optional[dict] = None,
 ) -> List[str]:
-    """Collect files from an S3 bucket using suffix and optional regex filtering.
+    """Collect dataset paths from S3 based on dataset type.
+
+    Supports:
+    - 'files': lists and filters regular files (e.g., NetCDF, CSV)
+    - 'parquet': handles both single Parquet files and Hive-partitioned datasets
+    - 'zarr': returns the Zarr store path directly
 
     Args:
-        path_cfg: Configuration object including the S3 URI and optional regex filters.
+        path_cfg: Configuration object including type, S3 URI, and optional regex filters.
        suffix: File suffix to filter by, e.g., '.nc'. Set to None to disable suffix filtering.
         exclude: Optional regex string to exclude files.
         bucket_raw: Required if `path_cfg.s3_uri` is not a full S3 URI.
+        s3_client_opts: Optional dict with boto3 S3 client options.
 
     Returns:
-        List of matching file keys (paths) as strings.
+        List of dataset paths (files or root URIs) as strings.
     """
-    s3_uri = path_cfg.s3_uri
+    dataset_type = getattr(path_cfg, "type", "files")  # default value
+    s3_uri = path_cfg.s3_uri.rstrip("/")
+
+    # ---------------------------------------------------------------------
+    # Handle 'file' collection (NetCDF, CSV)
+    # ---------------------------------------------------------------------
+    if dataset_type == "files":
+        if s3_uri.startswith("s3://"):
+            parsed = urlparse(s3_uri)
+            bucket = parsed.netloc
+            prefix = parsed.path.lstrip("/")
+        else:
+            if not bucket_raw:
+                raise ValueError(
+                    "bucket_raw must be provided when s3_uri is not a full S3 URI."
+                )
+            bucket = bucket_raw
+            prefix = s3_uri
 
-    if s3_uri.startswith("s3://"):
-        parsed = urlparse(s3_uri)
-        bucket = parsed.netloc
-        prefix = parsed.path.lstrip("/")
-    else:
-        if not bucket_raw:
-            raise ValueError(
-                "bucket_raw must be provided when s3_uri is not a full S3 URI."
-            )
-        bucket = bucket_raw
-        prefix = s3_uri
+        prefix = str(PurePosixPath(prefix))  # normalise path
 
-    prefix = str(PurePosixPath(prefix))  # normalise path
+        matching_files = s3_ls(
+            bucket,
+            prefix,
+            suffix=suffix,
+            exclude=exclude,
+            s3_client_opts=s3_client_opts,
+        )
 
-    # matching_files = s3_ls(bucket, prefix, suffix=suffix, exclude=exclude)
-    matching_files = s3_ls(
-        bucket, prefix, suffix=None, exclude=exclude, s3_client_opts=s3_client_opts
-    )
+        for pattern in path_cfg.filter or []:
+            logger.info(f"Filtering files with regex pattern: {pattern}")
+            regex = re.compile(pattern)
+            matching_files = [f for f in matching_files if regex.search(f)]
+            if not matching_files:
+                raise ValueError(
+                    f"No files matching {pattern} under {s3_uri}. Modify regexp filter or path in configuration file. Abort"
+                )
 
-    for pattern in path_cfg.filter or []:
-        logger.info(f"Filtering files with regex pattern: {pattern}")
-        regex = re.compile(pattern)
-        matching_files = [f for f in matching_files if regex.search(f)]
-        if matching_files == []:
-            raise ValueError(
-                f"No files matching {pattern} under {s3_uri}. Modify regexp filter or path in configuration file. Abort"
-            )
+        logger.info(f"Matched {len(matching_files)} files")
+
+        return matching_files
+
+    # ---------------------------------------------------------------------
+    # Handle 'parquet' (single Parquet file or Hive-partitioned dataset)
+    # ---------------------------------------------------------------------
+    elif dataset_type == "parquet":
+        # No filters
+        return [s3_uri]
 
-    logger.info(f"Matched {len(matching_files)} files")
+    # ---------------------------------------------------------------------
+    # Handle 'zarr' (Zarr store)
+    # ---------------------------------------------------------------------
+    elif dataset_type == "zarr":
+        raise ValueError("zarr store as an input dataset is not yet implemented")
+        # return [s3_uri]
 
-    return matching_files
+    # Unsupported type
+    else:
+        raise ValueError(f"Unsupported dataset type: {dataset_type}")
+
+
+def json_update(base: dict, updates: dict) -> dict:
+    """Recursively update nested dictionaries."""
+    for k, v in updates.items():
+        if isinstance(v, dict) and isinstance(base.get(k), dict):
+            base[k] = json_update(base[k], v)
+        else:
+            base[k] = v
+    return base
 
 
 def join_s3_uri(base_uri: str, *parts: str) -> str:
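
With the rework above, collect_files short-circuits for parquet and zarr instead of listing objects: a parquet path is returned as a single dataset root, and zarr currently raises until a reader is implemented. A usage sketch (import path and URI are assumptions):

from aodn_cloud_optimised.bin.generic_cloud_optimised_creation import (
    PathConfig,
    collect_files,
)

cfg = PathConfig(
    type="parquet",
    partitioning="hive",
    s3_uri="s3://example-bucket/products/example_dataset.parquet/",  # hypothetical URI
)

# No suffix or regex filtering is applied for parquet; the trailing slash is stripped.
paths = collect_files(cfg, suffix=None, exclude=None, bucket_raw=None)
print(paths)  # ['s3://example-bucket/products/example_dataset.parquet']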

aodn_cloud_optimised/config/dataset/diver_photoquadrat_score_qc.json

Lines changed: 2 additions & 5 deletions
@@ -5,11 +5,8 @@
   "run_settings": {
     "paths": [
       {
-        "s3_uri": "s3://data-uplift-public/products/reef_life_survey",
-        "filter": [
-          "public_reef_life_survey_2025-11-04T03:14:37\\.parquet$"
-        ],
-        "year_range": []
+        "type": "parquet",
+        "s3_uri": "s3://data-uplift-public/products/reef_life_survey/public_reef_life_survey_2025-11-04T03:14:37.parquet"
       }
     ],
     "cluster": {
