
Commit 015b2c7

Merge pull request #225 from aodn/FeatNrmn
FeatNrmn
2 parents 7197c33 + 747af7d commit 015b2c7

22 files changed: +36021 −70 lines

aodn_cloud_optimised/bin/create_dataset_config.py

Lines changed: 7 additions & 1 deletion
@@ -772,6 +772,8 @@ def main():
             dataset_config_schema = json.load(file)
             os.remove(temp_file_path)

+            regex_filter = [".*\\.nc$"]
+
         case ".csv":
             csv_opts = json.loads(args.csv_opts) if args.csv_opts else {}

@@ -808,6 +810,8 @@ def main():
                 # dataset_config_schema["properties"][col] = {"type": js_type}
                 dataset_config_schema[col] = {"type": js_type}

+            regex_filter = [".*\\.csv$"]
+
         case ".parquet":
             with fs.open(fp, "rb") as f:
                 schema = pq.read_schema(f)
@@ -816,6 +820,8 @@ def main():
             for field in schema:
                 dataset_config_schema[field.name] = {"type": str(field.type)}

+            regex_filter = [".*\\.parquet$"]
+
         # Default: Raise NotImplemented
         case _:
             raise NotImplementedError(
@@ -863,7 +869,7 @@ def main():
     }
     parent_s3_path = PureS3Path.from_uri(fp).parent.as_uri()
     dataset_config["run_settings"]["paths"] = [
-        {"s3_uri": parent_s3_path, "filter": [".*\\.nc"], "year_range": []}
+        {"s3_uri": parent_s3_path, "filter": regex_filter, "year_range": []}
     ]

     if args.s3fs_opts:
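
The net effect of this change is that the generated run_settings filter now matches the detected input type instead of the previously hard-coded ".*\\.nc" pattern. A minimal sketch of the paths entry produced for a CSV input; the S3 prefix below is a made-up example, only the structure mirrors the diff above.

# Sketch of the generated run_settings paths entry for a CSV input.
# "s3://example-bucket/some/prefix" is hypothetical.
regex_filter = [".*\\.csv$"]                       # set in the ".csv" case branch
parent_s3_path = "s3://example-bucket/some/prefix"
dataset_config = {"run_settings": {}}
dataset_config["run_settings"]["paths"] = [
    {"s3_uri": parent_s3_path, "filter": regex_filter, "year_range": []}
]
# -> [{'s3_uri': 's3://example-bucket/some/prefix', 'filter': ['.*\\.csv$'], 'year_range': []}]
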
Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
+generic_launcher.py

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
+generic_launcher.py

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
+generic_launcher.py

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
+generic_launcher.py

aodn_cloud_optimised/bin/generic_cloud_optimised_creation.py

Lines changed: 24 additions & 2 deletions
@@ -643,6 +643,9 @@ class ParquetSchemaTransformation(BaseModel):
         default=None,
         description="Custom functions used to extract metadata from object keys and turn into variables, required if @function: is used in add_variables.",
     )
+    skip_partitioning_validation: bool = Field(
+        False, description="Set to true to skip required partitioning validation."
+    )

     @field_validator("add_variables")
     @classmethod
@@ -719,6 +722,9 @@ def validate_add_variables(cls, value):

     @model_validator(mode="after")
     def validate_required_patitions(self):
+        if self.skip_partitioning_validation:
+            return self
+
         if not self.partitioning:
             raise ValueError("'partitioning' key missing")

@@ -727,7 +733,7 @@ def validate_required_patitions(self):
         required_partitioning_keys = ["polygon", "timestamp"]
         if not all(key in partition_keys for key in required_partitioning_keys):
             raise ValueError(
-                f"Required variables {required_partitioning_keys} must be present in the 'partitioning' key. Only {partition_keys} available"
+                f"Required variables {required_partitioning_keys} must be present in the 'partitioning' key. Only {partition_keys} available.\n If you think those partitions shouldn't exist, set '\"skip_partitioning_validation\" : true' in the schema_transformation configuration"
             )

         return self
@@ -1241,14 +1247,21 @@ def main():
         description="Run cloud-optimised creation using config."
     )
     parser.add_argument(
-        "--config", required=False, help="JSON filename in config/dataset/"
+        "-c", "--config", required=False, help="JSON filename in config/dataset/"
     )
     parser.add_argument(
+        "-o",
         "--json-overwrite",
         type=str,
         help='JSON string to override config fields. Example: \'{"run_settings": {"cluster": {"mode": null}, "raise_error": true}}\' ',
     )

+    parser.add_argument(
+        "-t",
+        "--test",
+        action="store_true",
+        help="Use integration testing bucket instead of the default optimised bucket.",
+    )
     args = parser.parse_args()

     try:
@@ -1289,6 +1302,15 @@ def main():
         or load_variable_from_config("ROOT_PREFIX_CLOUD_OPTIMISED_PATH")
     )

+    # Override for test mode
+    if args.test:
+        bucket_optimised = load_variable_from_config(
+            "BUCKET_INTEGRATION_TESTING_OPTIMISED_DEFAULT"
+        )
+        root_prefix = load_variable_from_config(
+            "ROOT_PREFIX_CLOUD_OPTIMISED_INTEGRATION_TESTING_PATH"
+        )
+
     s3_fs_common_opts = config.run_settings.s3_fs_common_opts
     s3_client_opts = boto3_from_opts_dict(s3_fs_common_opts)
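
The new -c, -o and -t short flags and the skip_partitioning_validation field can be exercised together from the command line. A hedged sketch of building a --json-overwrite string follows; the config filename is hypothetical, and whether --json-overwrite reaches nested schema_transformation fields is an assumption not confirmed by this diff.

# Sketch: building a JSON string for -o/--json-overwrite and running against the
# integration-testing bucket with -t. The schema_transformation override is an
# assumption; the run_settings part mirrors the help text in the diff above.
import json

override = {
    "run_settings": {"cluster": {"mode": None}, "raise_error": True},
    "schema_transformation": {"skip_partitioning_validation": True},
}
print(json.dumps(override))

# Example invocation (config filename is hypothetical):
#   python aodn_cloud_optimised/bin/generic_cloud_optimised_creation.py \
#       -c my_dataset.json -o '<printed JSON>' -t
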

aodn_cloud_optimised/config/common.json

Lines changed: 1 addition & 1 deletion
@@ -4,5 +4,5 @@
   "ROOT_PREFIX_CLOUD_OPTIMISED_PATH": "",
   "BUCKET_INTEGRATION_TESTING_RAW_DEFAULT": "imos-data",
   "BUCKET_INTEGRATION_TESTING_OPTIMISED_DEFAULT": "imos-data-lab-optimised",
-  "ROOT_PREFIX_CLOUD_OPTIMISED_INTEGRATION_TESTING_PATH": "cloud_optimised/integration_testing"
+  "ROOT_PREFIX_CLOUD_OPTIMISED_INTEGRATION_TESTING_PATH": ""
 }
Lines changed: 257 additions & 0 deletions
@@ -0,0 +1,257 @@
{
  "dataset_name": "diver_cryptobenthic_fish_abundance_qc",
  "logger_name": "diver_cryptobenthic_fish_abundance_qc",
  "cloud_optimised_format": "parquet",
  "run_settings": {
    "paths": [
      {
        "s3_uri": "s3://imos-data/IMOS/NRMN",
        "filter": [
          "ep_m2_cryptic_fish_public_data\\.csv"
        ],
        "year_range": []
      }
    ],
    "cluster": {
      "mode": "local",
      "restart_every_path": false
    },
    "clear_existing_data": true,
    "raise_error": false,
    "coiled_cluster_options": {
      "n_workers": [
        1,
        20
      ],
      "scheduler_vm_types": "m7i-flex.large",
      "worker_vm_types": "m7i-flex.large",
      "allow_ingress_from": "me",
      "compute_purchase_option": "spot_with_fallback",
      "worker_options": {
        "nthreads": 4,
        "memory_limit": "8GB"
      }
    },
    "batch_size": 1,
    "force_previous_parquet_deletion": false
  },
  "metadata_uuid": "9efa25cd-4da4-47b5-9385-45e3cbd11705",
  "schema": {
    "survey_id": {
      "type": "int32"
    },
    "country": {
      "type": "string"
    },
    "area": {
      "type": "string"
    },
    "ecoregion": {
      "type": "string"
    },
    "realm": {
      "type": "string"
    },
    "location": {
      "type": "string"
    },
    "site_code": {
      "type": "string"
    },
    "site_name": {
      "type": "string"
    },
    "latitude": {
      "type": "float32"
    },
    "longitude": {
      "type": "float32"
    },
    "survey_date": {
      "type": "string"
    },
    "depth": {
      "type": "float32"
    },
    "program": {
      "type": "string"
    },
    "visibility": {
      "type": "float32"
    },
    "hour": {
      "type": "string"
    },
    "survey_latitude": {
      "type": "float32"
    },
    "survey_longitude": {
      "type": "float32"
    },
    "method": {
      "type": "int32"
    },
    "block": {
      "type": "int32"
    },
    "phylum": {
      "type": "string"
    },
    "class_": {
      "type": "string"
    },
    "order": {
      "type": "string"
    },
    "family": {
      "type": "string"
    },
    "species_name": {
      "type": "string"
    },
    "reporting_name": {
      "type": "string"
    },
    "size_class": {
      "type": "float32"
    },
    "total": {
      "type": "int32"
    },
    "biomass": {
      "type": "float32"
    },
    "geom": {
      "type": "string"
    }
  },
  "aws_opendata_registry": {
    "Name": "IMOS - National Reef Monitoring Network Sub-Facility - Global cryptobenthic fish abundance",
    "Description": "The National Reef Monitoring Network brings together shallow reef surveys conducted around Australia into a centralised database. The IMOS National Reef Monitoring Network sub-Facility collates, cleans, stores and makes this data rapidly available from contributors including: Reef Life Survey, Parks Australia, Department of Biodiversity, Conservation and Attractions (Western Australia), Department of Environment, Water and Natural Resources (South Australia), Department of Primary Industries (New South Wales), Tasmanian Parks and Wildlife Service and Parks Victoria.\n\nThe data provided by the National Reef Monitoring Network contributes to establishing and supporting national marine baselines, and assisting with the management of Commonwealth and State marine reserves. \n\nReef Life Survey (RLS) and the Australian Temperate Reef Network (ATRC) aims to improve biodiversity conservation and the sustainable management of marine resources by coordinating surveys of rocky and coral reefs using scientific methods, with the ultimate goal to improve coastal stewardship. Our activities depend on the skills of marine scientists, experienced and motivated recreational SCUBA divers, partnerships with management agencies and university researchers, and active input from the ATRC partners and RLS Advisory Committee.\n\nRLS and ATRC data are freely available to the public for non-profit purposes, so not only managers, but also groups such as local dive clubs or schools may use these data to look at changes over time in their own local reefs. By making data freely available and through public outputs, RLS and ATRC aims to raise broader community awareness of the status of Australia's marine biodiversity and associated conservation issues.\n\nThis dataset contains records of cryptobenthic fishes collected by RLS and ATRC divers and partners along 50m transects on shallow rocky and coral reefs using standard methods. Abundance information is available for all species recorded within quantitative survey limits (50 x 1 m swathes either side of the transect line, each distinguished as a 'Block'), with divers searching the reef surface (including cracks) carefully for hidden fishes. These observations are recorded concurrently with the macroinvertebrate observations and together make up the 'Method 2' component of the surveys. For this method, typically one 'Block' is completed per 50 m transect for the program ATRC and 2 blocks are completed for RLS – although exceptions to this rule exist.\n\nThis dataset supersedes the RLS specific \"Reef Life Survey (RLS): Cryptic Fish\" collection that was available at https://catalogue-rls.imas.utas.edu.au/geonetwork/srv/en/metadata.show?uuid=6a56db3f-d1b2-438d-98c6-bd7dd540a4d5 (provision of data was stopped in June 2021).",
    "Documentation": "https://catalogue-imos.aodn.org.au/geonetwork/srv/eng/catalog.search#/metadata/9efa25cd-4da4-47b5-9385-45e3cbd11705",
    "Contact": "info@aodn.org.au",
    "ManagedBy": "AODN",
    "UpdateFrequency": "As Needed",
    "Tags": [
      "FILL UP MANUALLY - CHECK DOCUMENTATION"
    ],
    "License": "http://creativecommons.org/licenses/by/4.0/",
    "Resources": [
      {
        "Description": "Cloud Optimised AODN dataset of IMOS - National Reef Monitoring Network Sub-Facility - Global cryptobenthic fish abundance",
        "ARN": "arn:aws:s3:::aodn-cloud-optimised/diver_cryptobenthic_fish_abundance_qc.parquet",
        "Region": "ap-southeast-2",
        "Type": "S3 Bucket"
      }
    ],
    "DataAtWork": {
      "Tutorials": [
        {
          "Title": "Accessing IMOS - National Reef Monitoring Network Sub-Facility - Global cryptobenthic fish abundance",
          "URL": "https://github.com/aodn/aodn_cloud_optimised/blob/main/notebooks/diver_cryptobenthic_fish_abundance_qc.ipynb",
          "NotebookURL": "https://githubtocolab.com/aodn/aodn_cloud_optimised/blob/main/notebooks/diver_cryptobenthic_fish_abundance_qc.ipynb",
          "AuthorName": "Laurent Besnard",
          "AuthorURL": "https://github.com/aodn/aodn_cloud_optimised"
        },
        {
          "Title": "Accessing and search for any AODN dataset",
          "URL": "https://github.com/aodn/aodn_cloud_optimised/blob/main/notebooks/GetAodnData.ipynb",
          "NotebookURL": "https://githubtocolab.com/aodn/aodn_cloud_optimised/blob/main/notebooks/GetAodnData.ipynb",
          "AuthorName": "Laurent Besnard",
          "AuthorURL": "https://github.com/aodn/aodn_cloud_optimised"
        }
      ]
    },
    "Citation": "IMOS [year-of-data-download], [Title], [data-access-URL], accessed [date-of-access]"
  },
  "csv_config": {
    "polars_read_csv_config": {
      "separator": ",",
      "has_header": true,
      "null_values": [
        "N/A",
        "NaN"
      ],
      "try_parse_dates": false,
      "infer_schema_length": 1000,
      "encoding": "utf-8"
    }
  },
  "schema_transformation": {
    "drop_variables": [],
    "add_variables": {
      "filename": {
        "source": "@filename",
        "schema": {
          "type": "string",
          "units": "1",
          "long_name": "Filename of the source file"
        }
      },
      "timestamp": {
        "source": "@partitioning:time_extent",
        "schema": {
          "type": "int64",
          "units": "1",
          "long_name": "Partition timestamp"
        }
      },
      "polygon": {
        "source": "@partitioning:spatial_extent",
        "schema": {
          "type": "string",
          "units": "1",
          "long_name": "Spatial partition polygon"
        }
      },
      "TIME": {
        "source": "@function:time_creation",
        "schema": {
          "type": "timestamp[ns]",
          "units": "days since 1970-01-01T00:00:00Z",
          "_FillValue": "",
          "long_name": "Derived timestamp"
        }
      }
    },
    "functions": {
      "time_creation": {
        "extract_method": "from_variables",
        "method": {
          "creation_code": "def time_creation_from_variables(df):\n    import pandas as pd\n    date_col = df.get('survey_date')\n    hour_col = df.get('hour')\n\n    # Fill missing hour with 00:00:00\n    if hour_col is None:\n        hour_col = pd.Series(['00:00:00']*len(df), index=df.index)\n    else:\n        hour_col = hour_col.fillna('00:00:00')\n\n    # Combine date and hour strings\n    dt_str = date_col.astype(str) + ' ' + hour_col.astype(str)\n    result = pd.to_datetime(dt_str, errors='coerce', format='%Y-%m-%d %H:%M:%S')\n    # fallback to just date if conversion failed\n    mask = result.isna()\n    if mask.any():\n        result.loc[mask] = pd.to_datetime(date_col[mask], errors='coerce')\n    return result"
        }
      }
    },
    "partitioning": [
      {
        "source_variable": "timestamp",
        "type": "time_extent",
        "time_extent": {
          "time_varname": "TIME",
          "partition_period": "Y"
        }
      },
      {
        "source_variable": "polygon",
        "type": "spatial_extent",
        "spatial_extent": {
          "lat_varname": "latitude",
          "lon_varname": "longitude",
          "spatial_resolution": 10
        }
      }
    ],
    "global_attributes": {
      "delete": [
        "geospatial_lat_max",
        "geospatial_lat_min",
        "geospatial_lon_max",
        "geospatial_lon_min",
        "date_created"
      ],
      "set": {
        "title": "IMOS - National Reef Monitoring Network Sub-Facility - Global off-transect species observations"
      }
    }
  }
}
