
Commit 5d4543a

Feat: improve hive partitioning detection for parquet reader
1 parent b67d976 commit 5d4543a

4 files changed: +206 additions, -83 deletions

aodn_cloud_optimised/bin/create_dataset_config.py

Lines changed: 40 additions & 5 deletions
@@ -817,10 +817,13 @@ def main():
         case ".parquet":
             dataset_config_schema = dict()

+            # TODO: at this stage, we don't know yet if it's a hive or single parquet file. Could add another option in the create_dataset_config script for parquet only.
             try:
                 # Try reading as a single Parquet file
                 with fs.open(fp, "rb") as f:
                     schema = pq.read_schema(f)
+
+                parquet_partitioning = None
             except Exception:
                 # If that fails, assume it's a Hive-partitioned dataset

@@ -834,12 +837,18 @@ def main():
                     dataset_path, format="parquet", partitioning="hive", filesystem=fs
                 )
                 schema = dataset.schema
+                parquet_partitioning = "hive"

             dataset_config_schema = dict()
             for field in schema:
                 dataset_config_schema[field.name] = {"type": str(field.type)}

-            regex_filter = [".*\\.parquet$"]
+        case ".zarr":
+            # TODO: implement a zarr reader
+
+            raise NotImplementedError(
+                f"input file type `{obj_key_suffix}` not yet implemented"
+            )

         # Default: Raise NotImplemented
         case _:
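
The hunks above decide between a single Parquet file and a Hive-partitioned dataset by first trying to read the Parquet footer, then falling back to pyarrow's dataset discovery. A minimal standalone sketch of that detection idea, assuming an s3fs filesystem; the bucket, key, and anon=True credentials are placeholders, not values from the script:

import pyarrow.dataset as pds
import pyarrow.parquet as pq
import s3fs

fs = s3fs.S3FileSystem(anon=True)  # illustrative credentials
fp = "example-bucket/products/some_dataset.parquet"  # hypothetical key

try:
    # A single Parquet file exposes its schema directly from the footer.
    with fs.open(fp, "rb") as f:
        schema = pq.read_schema(f)
    parquet_partitioning = None
except Exception:
    # A Hive-partitioned dataset is a directory of key=value/ folders, so
    # opening the root as one object fails; discover it as a pyarrow dataset.
    dataset = pds.dataset(fp, format="parquet", partitioning="hive", filesystem=fs)
    schema = dataset.schema
    parquet_partitioning = "hive"

dataset_config_schema = {field.name: {"type": str(field.type)} for field in schema}
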
@@ -886,10 +895,36 @@ def main():
         "mode": f"{TO_REPLACE_PLACEHOLDER}",
         "restart_every_path": False,
     }
-    parent_s3_path = PureS3Path.from_uri(fp).parent.as_uri()
-    dataset_config["run_settings"]["paths"] = [
-        {"s3_uri": parent_s3_path, "filter": regex_filter, "year_range": []}
-    ]
+
+    match obj_key_suffix:
+        case ".nc" | ".csv":
+            parent_s3_path = PureS3Path.from_uri(fp).parent.as_uri()
+            dataset_config["run_settings"]["paths"] = [
+                {
+                    "type": "files",
+                    "s3_uri": parent_s3_path,
+                    "filter": regex_filter,
+                    "year_range": [],
+                }
+            ]
+        case ".zarr":
+            # TODO: partially implemented
+            parent_s3_path = PureS3Path.from_uri(fp).as_uri()
+            dataset_config["run_settings"]["paths"] = [
+                {
+                    "type": "zarr",
+                    "s3_uri": parent_s3_path,
+                }
+            ]
+        case ".parquet":
+            parent_s3_path = PureS3Path.from_uri(fp).as_uri()
+            dataset_config["run_settings"]["paths"] = [
+                {
+                    "type": "parquet",
+                    "partitioning": parquet_partitioning,
+                    "s3_uri": parent_s3_path,
+                }
+            ]

     if args.s3fs_opts:
         dataset_config.setdefault("run_settings", {})["s3_bucket_opts"] = {
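
For reference, the three shapes of run_settings["paths"] entries the new match statement can produce, written as Python literals with placeholder URIs (not values from the repository):

paths_for_files = [
    {
        "type": "files",
        "s3_uri": "s3://example-bucket/raw/dataset",
        "filter": [".*\\.nc$"],
        "year_range": [],
    }
]
paths_for_zarr = [
    {"type": "zarr", "s3_uri": "s3://example-bucket/products/dataset.zarr"}
]
paths_for_parquet = [
    {
        "type": "parquet",
        "partitioning": "hive",  # or None when the input is a single Parquet file
        "s3_uri": "s3://example-bucket/products/dataset.parquet",
    }
]
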

aodn_cloud_optimised/bin/generic_cloud_optimised_creation.py

Lines changed: 131 additions & 60 deletions
@@ -72,11 +72,22 @@ class PathConfig(BaseModel):

     Attributes:
         s3_uri: S3 URI as a POSIX path string.
-        filter: List of regex patterns to filter files.
-        year_range: Year filter: None, one year, or a two-year inclusive range, or a list of exclusive years to process.
+        type: Type of dataset. Can be "files", "parquet", or "zarr".
+        partitioning: Optional, used only for Parquet datasets (e.g., "hive").
+        filter: List of regex patterns to filter files (only valid for type="files").
+        year_range: Optional Year filter: None, one year, or a two-year inclusive range, or a list of exclusive years to process. (only valid for type="files")
+
     """

     s3_uri: str
+    type: Optional[Literal["files", "parquet", "zarr"]] = Field(
+        default=None,
+        description="Dataset type. One of 'files', 'parquet', or 'zarr'. Defaults to 'files' if not specified.",
+    )
+    partitioning: Optional[str] = Field(
+        default=None,
+        description="Partitioning scheme, only valid when type='parquet'. Currently supports 'hive'.",
+    )
     filter: List[str] = Field(
         default_factory=list,
         description="List of regular expression patterns used to filter matching files.",
@@ -152,6 +163,55 @@ def validate_regex(cls, v):
                 raise ValueError(f"Invalid regex: {pattern} ({e})")
         return v

+    @model_validator(mode="after")
+    def validate_cross_fields(cls, values):
+        dataset_type = values.type or "files"
+        if values.type is None:
+            warnings.warn(
+                "No 'type' specified in PathConfig. Assuming 'files' as default.",
+                UserWarning,
+                stacklevel=2,
+            )
+            values.type = "files"
+            if (
+                any(".parquet" in f for f in values.filter)
+                or ".parquet" in values.s3_uri
+            ):
+                raise ValueError(
+                    "type must be defined as 'parquet' in run_settings.paths config if ingesting a parquet dataset."
+                )
+            elif any(".zarr" in f for f in values.filter) or ".zarr" in values.s3_uri:
+                raise ValueError(
+                    "type must be defined as 'zarr' in run_settings.paths config if ingesting a zarr dataset."
+                )
+
+        if dataset_type == "parquet":
+            if values.filter:
+                raise ValueError("filter must not be defined when type='parquet'")
+            if values.year_range:
+                raise ValueError("year_range must not be defined when type='parquet'")
+            if values.partitioning not in (None, "hive"):
+                raise ValueError(
+                    f"Invalid partitioning='{values.partitioning}' for parquet dataset. Only 'hive' is supported."
+                )
+
+        elif dataset_type == "zarr":
+            if values.filter:
+                raise ValueError("filter must not be defined when type='zarr'")
+            if values.year_range:
+                raise ValueError("year_range must not be defined when type='zarr'")
+            if values.partitioning:
+                raise ValueError("partitioning is not applicable when type='zarr'")
+
+        elif dataset_type == "files":
+            if values.partitioning:
+                raise ValueError("partitioning is not applicable when type='files'")
+
+        else:
+            raise ValueError(f"Unsupported dataset type: {dataset_type}")
+
+        return values
+

 class WorkerOptions(BaseModel):
     """Worker configuration for Coiled clusters.
@@ -1139,88 +1199,99 @@ def load_config_and_validate(config_filename: str) -> DatasetConfig:
     return DatasetConfig.model_validate(dataset_config)


-def json_update(base: dict, updates: dict) -> dict:
-    """Recursively update nested dictionaries."""
-    for k, v in updates.items():
-        if isinstance(v, dict) and isinstance(base.get(k), dict):
-            base[k] = json_update(base[k], v)
-        else:
-            base[k] = v
-    return base
-
-
 def collect_files(
     path_cfg: PathConfig,
-    suffix: str,
+    suffix: Optional[str],
     exclude: Optional[str],
     bucket_raw: Optional[str],
     s3_client_opts: Optional[dict] = None,
 ) -> List[str]:
-    """Collect files from an S3 bucket using suffix and optional regex filtering.
+    """Collect dataset paths from S3 based on dataset type.
+
+    Supports:
+    - 'files': lists and filters regular files (e.g., NetCDF, CSV)
+    - 'parquet': handles both single Parquet files and Hive-partitioned datasets
+    - 'zarr': returns the Zarr store path directly

     Args:
-        path_cfg: Configuration object including the S3 URI and optional regex filters.
+        path_cfg: Configuration object including type, S3 URI, and optional regex filters.
         suffix: File suffix to filter by, e.g., '.nc'. Set to None to disable suffix filtering.
         exclude: Optional regex string to exclude files.
         bucket_raw: Required if `path_cfg.s3_uri` is not a full S3 URI.
+        s3_client_opts: Optional dict with boto3 S3 client options.

     Returns:
-        List of matching file keys (paths) as strings.
+        List of dataset paths (files or root URIs) as strings.
     """
-    s3_uri = path_cfg.s3_uri
-
-    if s3_uri.startswith("s3://"):
-        parsed = urlparse(s3_uri)
-        bucket = parsed.netloc
-        prefix = parsed.path.lstrip("/")
-    else:
-        if not bucket_raw:
-            raise ValueError(
-                "bucket_raw must be provided when s3_uri is not a full S3 URI."
-            )
-        bucket = bucket_raw
-        prefix = s3_uri
-
-    prefix = str(PurePosixPath(prefix))  # normalise path
+    dataset_type = getattr(path_cfg, "type", "files")  # default value
+    s3_uri = path_cfg.s3_uri.rstrip("/")
+
+    # ---------------------------------------------------------------------
+    # Handle 'file' collection (NetCDF, CSV)
+    # ---------------------------------------------------------------------
+    if dataset_type == "files":
+        if s3_uri.startswith("s3://"):
+            parsed = urlparse(s3_uri)
+            bucket = parsed.netloc
+            prefix = parsed.path.lstrip("/")
+        else:
+            if not bucket_raw:
+                raise ValueError(
+                    "bucket_raw must be provided when s3_uri is not a full S3 URI."
+                )
+            bucket = bucket_raw
+            prefix = s3_uri

-    # handle case when collecting files for a parquet hive partition
-    for f in path_cfg.filter:
-        s3_uri = path_cfg.s3_uri.rstrip("/") + "/"
-        pattern_simplified = f.rstrip("$")
+        prefix = str(PurePosixPath(prefix))  # normalise path

-        # Only handle .parquet ending
-        if pattern_simplified.endswith(".parquet") or pattern_simplified.endswith(
-            ".parquet/"
-        ):
-            filename_candidate = pattern_simplified.split("/")[-1]
+        matching_files = s3_ls(
+            bucket,
+            prefix,
+            suffix=suffix,
+            exclude=exclude,
+            s3_client_opts=s3_client_opts,
+        )

-            # Check for unsupported regex characters
-            if re.search(r"[\*\[\]\(\)\+\?]", filename_candidate):
+        for pattern in path_cfg.filter or []:
+            logger.info(f"Filtering files with regex pattern: {pattern}")
+            regex = re.compile(pattern)
+            matching_files = [f for f in matching_files if regex.search(f)]
+            if not matching_files:
                 raise ValueError(
-                    f"In the case of a parquet dataset input, the filter value should match a dataset name without complex regex patterns. Filter '{f}' is too complex to convert to a filename. Please modify config"
+                    f"No files matching {pattern} under {s3_uri}. Modify regexp filter or path in configuration file. Abort"
                 )

-            # Remove escaped characters like \.
-            filename = re.sub(r"\\(.)", r"\1", filename_candidate)
-            return [s3_uri + filename]
+        logger.info(f"Matched {len(matching_files)} files")

-    # matching_files = s3_ls(bucket, prefix, suffix=suffix, exclude=exclude)
-    matching_files = s3_ls(
-        bucket, prefix, suffix=None, exclude=exclude, s3_client_opts=s3_client_opts
-    )
+        return matching_files

-    for pattern in path_cfg.filter or []:
-        logger.info(f"Filtering files with regex pattern: {pattern}")
-        regex = re.compile(pattern)
-        matching_files = [f for f in matching_files if regex.search(f)]
-        if matching_files == []:
-            raise ValueError(
-                f"No files matching {pattern} under {s3_uri}. Modify regexp filter or path in configuration file. Abort"
-            )
+    # ---------------------------------------------------------------------
+    # Handle 'parquet' (single Parquet file or Hive-partitioned dataset)
+    # ---------------------------------------------------------------------
+    elif dataset_type == "parquet":
+        # No filters
+        return [s3_uri]

-    logger.info(f"Matched {len(matching_files)} files")
+    # ---------------------------------------------------------------------
+    # Handle 'zarr' (Zarr store)
+    # ---------------------------------------------------------------------
+    elif dataset_type == "zarr":
+        raise ValueError("zarr store as an input dataset is not yet implemented")
+        # return [s3_uri]

-    return matching_files
+    # Unsupported type
+    else:
+        raise ValueError(f"Unsupported dataset type: {dataset_type}")
+
+
+def json_update(base: dict, updates: dict) -> dict:
+    """Recursively update nested dictionaries."""
+    for k, v in updates.items():
+        if isinstance(v, dict) and isinstance(base.get(k), dict):
+            base[k] = json_update(base[k], v)
+        else:
+            base[k] = v
+    return base


 def join_s3_uri(base_uri: str, *parts: str) -> str:
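
Hypothetical call sites for the reworked collect_files(), using only the signature shown above; bucket names, suffixes, and filters are placeholders:

# 'files': list objects under the prefix, then apply the regex filters.
nc_cfg = PathConfig(
    type="files",
    s3_uri="s3://example-bucket/raw/netcdf",
    filter=[".*\\.nc$"],
)
nc_files = collect_files(nc_cfg, suffix=".nc", exclude=None, bucket_raw=None)

# 'parquet': no listing or filtering; the dataset root URI is returned as-is.
pq_cfg = PathConfig(
    type="parquet",
    partitioning="hive",
    s3_uri="s3://example-bucket/products/dataset.parquet",
)
pq_paths = collect_files(pq_cfg, suffix=None, exclude=None, bucket_raw=None)
assert pq_paths == ["s3://example-bucket/products/dataset.parquet"]
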

aodn_cloud_optimised/config/dataset/diver_photoquadrat_score_qc.json

Lines changed: 2 additions & 5 deletions
@@ -5,11 +5,8 @@
   "run_settings": {
     "paths": [
       {
-        "s3_uri": "s3://data-uplift-public/products/reef_life_survey",
-        "filter": [
-          "public_reef_life_survey_2025-11-04T03:14:37\\.parquet$"
-        ],
-        "year_range": []
+        "type": "parquet",
+        "s3_uri": "s3://data-uplift-public/products/reef_life_survey/public_reef_life_survey_2025-11-04T03:14:37.parquet"
       }
     ],
     "cluster": {

aodn_cloud_optimised/lib/GenericParquetHandler.py

Lines changed: 33 additions & 13 deletions
@@ -293,19 +293,39 @@ def preprocess_data_parquet(
         added by the cloud optimisation process.
         """

-        # Try reading as a single Parquet file
-        try:
-            table = pq.read_table(parquet_fp)
-        except (pa.ArrowInvalid, OSError):
-            # Treat as Hive-partitioned dataset
-            # parquet_fp is a file-like object: extract the key prefix
-            key_prefix = parquet_fp.path  # S3File objects have `.path` attribute
-            table = pds.dataset(
-                key_prefix,
-                format="parquet",
-                partitioning="hive",
-                filesystem=self.s3_fs_output,
-            ).to_table()
+        key_path = getattr(parquet_fp, "path", None)
+        full_path = key_path if key_path.startswith("s3://") else f"s3://{key_path}"
+
+        # matching the parquet file with the correct config in the paths array
+        matched_cfg = None
+        for path_cfg in self.dataset_config["run_settings"]["paths"]:
+            s3_uri = path_cfg.get("s3_uri", "").rstrip("/")
+            if full_path.startswith(s3_uri):
+                matched_cfg = path_cfg
+                break
+
+        if matched_cfg is None:
+            raise ValueError(f"No matching path configuration found for {full_path}")
+
+        partitioning = matched_cfg.get("partitioning", None)
+
+        match partitioning:
+            case None:
+                # reading as a single Parquet file
+                table = pq.read_table(parquet_fp)
+
+            case "hive":
+                key_prefix = parquet_fp.path  # S3File objects have `.path` attribute
+                table = pds.dataset(
+                    key_prefix,
+                    format="parquet",
+                    partitioning=partitioning,
+                    filesystem=self.s3_fs_output,
+                ).to_table()
+            case _:
+                raise ValueError(
+                    f"Partitioning value {partitioning} is not yet supported"
+                )

         df = table.to_pandas()
         df = df.drop(columns=self.drop_variables, errors="ignore")