
Commit 8184af2

Merge pull request #118 from aodn/AwsRegistry
Feat: add function to retrieve info from CSV to populate AWS Registry
2 parents 3c4d3b6 + 614aab0 · commit 8184af2

42 files changed: +454, -103 lines changed
(Large commits have some content hidden by default; only a subset of the changed files is shown below.)

.github/workflows/build.yml

Lines changed: 1 addition & 1 deletion
@@ -79,7 +79,7 @@ jobs:
       - name: Upload coverage report
         uses: actions/upload-artifact@v4
         with:
-          name: coverage-report-${{ github.job }}-${{ github.run_number }}
+          name: coverage-report-${{ github.job }}-${{ github.run_number }}-${{ matrix.python-version }}
           path: coverage.xml

       - name: Build package

aodn_cloud_optimised/bin/create_aws_registry_dataset.py

Lines changed: 120 additions & 27 deletions
@@ -27,21 +27,23 @@
 import io
 import json
 import os
+import sys
 import tempfile
 from argparse import RawTextHelpFormatter
 from importlib.resources import files
 
+import pandas as pd
 import requests
 import xmltodict
-from colorama import init, Fore, Style
+from colorama import Fore, Style, init
 
+from aodn_cloud_optimised.lib.common import list_json_files
 from aodn_cloud_optimised.lib.CommonHandler import CommonHandler
 from aodn_cloud_optimised.lib.config import (
-    load_dataset_config,
     load_config,
+    load_dataset_config,
     load_variable_from_config,
 )
-from aodn_cloud_optimised.lib.common import list_json_files
 
 
 def retrieve_geonetwork_metadata(
@@ -192,6 +194,63 @@ def update_nested_dict_key(dataset_config, keys, new_value):
     return dataset_config
 
 
+def populate_dataset_config_with_metadata_from_csv(json_file, csv_path):
+    json_path = str(files("aodn_cloud_optimised.config.dataset").joinpath(json_file))
+    dataset_config = load_dataset_config(json_path)
+
+    csv_data = pd.read_csv(
+        csv_path,
+        index_col="Cloud_Optimised_Collection_Name",
+        encoding="ISO-8859-1",
+        na_filter=False,
+    )
+
+    dataset_name = dataset_config["dataset_name"]
+    try:
+        csv_dataset = csv_data.loc[dataset_name]
+    except Exception as err:
+        print(f"{dataset_name} NOT FOUND in CSV file")
+        return
+
+    if not csv_dataset["AWS_Title"] == "":
+        dataset_config = update_nested_dict_key(
+            dataset_config,
+            ["aws_opendata_registry", "Name"],
+            csv_dataset["AWS_Title"],
+        )
+    else:
+        Warning(f"AWS_Title for {dataset_name} is missing from {csv_path}")
+
+    if not csv_dataset["AWS_Tags"] == "":
+        aws_tags = [keyword.strip() for keyword in csv_dataset["AWS_Tags"].split(";")]
+        dataset_config = update_nested_dict_key(
+            dataset_config, ["aws_opendata_registry", "Tags"], aws_tags
+        )
+    else:
+        Warning(f"AWS_Tags for {dataset_name} is missing from {csv_path}")
+
+    if not csv_dataset["AWS_Citation"] == "":
+        dataset_config = update_nested_dict_key(
+            dataset_config,
+            ["aws_opendata_registry", "Citation"],
+            csv_dataset["AWS_Citation"],
+        )
+    else:
+        Warning(f"AWS_Citation for {dataset_name} is missing from {csv_path}")
+
+    # dataset config coming from load_dataset_config is the result of parent and child configuration. When writing back
+    # the configuration, we only want to write the child data back
+    dataset_config_child = load_config(json_path)
+    # Overwrite the original JSON file with the modified dataset_config
+    with open(json_path, "w") as f:
+        dataset_config_child["aws_opendata_registry"] = dataset_config[
+            "aws_opendata_registry"
+        ]
+        json.dump(dataset_config_child, f, indent=2)
+
+    print(f"Updated JSON file saved at: {json_path}")
+
+
 def populate_dataset_config_with_geonetwork_metadata(json_file):
     """ """
 
@@ -286,7 +345,7 @@ def populate_dataset_config_with_geonetwork_metadata(json_file):
         dataset_config, ["aws_opendata_registry", "Resources"], dataset_location
     )
 
-    # dataset confi coming from load_dataset_config is the result of parent and child configuration. When writing back
+    # dataset config coming from load_dataset_config is the result of parent and child configuration. When writing back
     # the configuration, we only want to write the child data back
     dataset_config_child = load_config(json_path)
    # Overwrite the original JSON file with the modified dataset_config
@@ -317,29 +376,7 @@ def convert_to_opendata_registry(json_file, output_directory):
     handler.create_metadata_aws_registry(target_directory=output_directory)
 
 
-def main():
-    """
-    Main function to convert JSON files to AWS OpenData Registry format.
-
-    The script can be run in different ways:
-
-    1. Convert a specific JSON file to AWS OpenData Registry format.
-    2. Convert all JSON files in the directory.
-    3. Run interactively to list all available JSON files and prompt the user to choose one to convert.
-
-    Important:
-        If the -g option is provided, the script will download metadata from the GeoNetwork metadata
-        record and prompt the user to choose to replace existing values or not.
-
-
-    Args (optional):
-        -f, --file (str): Name of a specific JSON file to convert.
-        -d, --directory (str): Output directory to save converted YAML files.
-        -a, --all: Convert all JSON files in the directory.
-        -g, --geonetwork: Retrieve metadata fields from GeoNetwork3 metadata record
-
-    If the directory is not specified, a temporary directory is created.
-    """
+def parse_args(arg_list: list[str] | None):
     parser = argparse.ArgumentParser(
         description="""
   Create AWS OpenData Registry YAML files from the dataset configuration, ready to be added to the OpenData Github
@@ -371,24 +408,71 @@ def main():
         help="Retrieve metadata from Geonetwork instance to populate OpenData Registry format. Interactive mode",
     )
 
+    parser.add_argument(
+        "-c",
+        "--csv-path",
+        help="Add specific metadata from an external CSV file",
+    )
+
     args = parser.parse_args()
 
+    return args
+
+
+def main(arg_list: list[str] | None = None):
+    """
+    Main function to convert JSON files to AWS OpenData Registry format.
+
+    The script can be run in different ways:
+
+    1. Convert a specific JSON file to AWS OpenData Registry format.
+    2. Convert all JSON files in the directory.
+    3. Run interactively to list all available JSON files and prompt the user to choose one to convert.
+
+    Important:
+        If the -g option is provided, the script will download metadata from the GeoNetwork metadata
+        record and prompt the user to choose to replace existing values or not.
+
+
+    Args (optional):
+        -f, --file (str): Name of a specific JSON file to convert.
+        -d, --directory (str): Output directory to save converted YAML files.
+        -a, --all: Convert all JSON files in the directory.
+        -g, --geonetwork: Retrieve metadata fields from GeoNetwork3 metadata record
+
+    If the directory is not specified, a temporary directory is created.
+    """
     json_directory = str(files("aodn_cloud_optimised.config.dataset")._paths[0])
 
+    args = parse_args(sys.argv[1:])
+
     if args.all:
         json_files = list_json_files(json_directory)
         if json_files:
             output_dir = args.directory or tempfile.mkdtemp()
             for file in json_files:
                 if args.geonetwork:
                     populate_dataset_config_with_geonetwork_metadata(file)
+
+                if args.csv_path:
+                    if os.path.exists(args.csv_path):
+                        populate_dataset_config_with_metadata_from_csv(
+                            file, args.csv_path
+                        )
+                    else:
+                        raise ValueError(f"{args.csv_path} does not exist")
                 convert_to_opendata_registry(file, output_dir)
         else:
             print(f"No JSON files found in {json_directory}.")
     elif args.file:
         output_dir = args.directory or tempfile.mkdtemp()
         if args.geonetwork:
             populate_dataset_config_with_geonetwork_metadata(args.file)
+        if args.csv_path:
+            if os.path.exists(args.csv_path):
+                populate_dataset_config_with_metadata_from_csv(args.file, args.csv_path)
+            else:
+                raise ValueError(f"{args.csv_path} does not exist")
 
         convert_to_opendata_registry(args.file, output_dir)
     else:
@@ -406,6 +490,15 @@ def main():
                 populate_dataset_config_with_geonetwork_metadata(
                     json_files[choice_idx]
                 )
+
+            if args.csv_path:
+                if os.path.exists(args.csv_path):
+                    populate_dataset_config_with_metadata_from_csv(
+                        json_files[choice_idx], args.csv_path
+                    )
+                else:
+                    raise ValueError(f"{args.csv_path} does not exist")
+
             convert_to_opendata_registry(json_files[choice_idx], output_dir)
         else:
            print("Invalid choice. Aborting.")

aodn_cloud_optimised/config/dataset/animal_acoustic_tracking_delayed_qc.json

Lines changed: 3 additions & 2 deletions
@@ -201,7 +201,8 @@
     "UpdateFrequency": "As Needed",
     "Tags": [
       "oceans",
-      "marine mammals"
+      "marine mammals",
+      "biology"
     ],
     "License": "http://creativecommons.org/licenses/by/4.0/",
     "Resources": [
@@ -230,6 +231,6 @@
         }
       ]
     },
-    "Citation": "IMOS [year-of-data-download], [Title], [data-access-URL], accessed [date-of-access]"
+    "Citation": "The citation in a list of references is: \"IMOS [year-of-data-download], [Title], [data-access-URL], accessed [date-of-access].\""
   }
 }

aodn_cloud_optimised/config/dataset/animal_ctd_satellite_relay_tagging_delayed_qc.json

Lines changed: 7 additions & 3 deletions
@@ -432,14 +432,18 @@
   },
   "force_old_pq_del": false,
   "aws_opendata_registry": {
-    "Name": "Satellite Relay Tagging Program - Southern Ocean - MEOP Quality Controlled CTD Profiles",
+    "Name": "Marine Animal - Satellite Relay Tagging - Quality controlled profiles",
     "Description": "CTD (Conductivity-Temperature_Depth)-Satellite Relay Data Loggers (CTD-SRDLs) are used to explore how marine animal behaviour relates to their oceanic environment. Loggers developed at the University of St Andrews Sea Mammal Research Unit transmit data in near real-time via the Argo satellite system. Data represented here was collected in the Southern Ocean, from elephant, fur and Weddell Seals. In 2024 data was added from flatback and olive ridley turtles, from a pilot study co-funded by the Royal Australian Navy in collaboration with the Australian Institute of Marine Science and Indigenous Ranger groups.\n\nData parameters measured by the instruments include time, conductivity (salinity), temperature, pressure and depth. The data represented by this record have been Qc'd and are the Australian subset of the MEOP-CTD database (MEOP: Marine Mammals Exploring the Oceans Pole to Pole), complemented with the most recent Southern Ocean deployment data. This Australian subset of the Southern Ocean database represents about one quarter of the entire MEOP-CTD database, which currently is about 52,000 profiles obtained from 275 CTD-SRDL tag deployments. The Australian dataset originated in 2004, and was initially collected by Mark Hindell's team based at the University of Tasmania, and in later years his data has formed part of the Animal Tracking Facility of Integrated Marine Observing System (IMOS).",
     "Documentation": "https://catalogue-imos.aodn.org.au/geonetwork/srv/eng/catalog.search#/metadata/95d6314c-cfc7-40ae-b439-85f14541db71",
     "Contact": "info@aodn.org.au",
     "ManagedBy": "AODN",
     "UpdateFrequency": "As Needed",
     "Tags": [
-      "FILL UP MANUALLY - CHECK DOCUMENTATION"
+      "oceans",
+      "marine mammals",
+      "biology",
+      "chemistry",
+      "chemical biology"
     ],
     "License": "http://creativecommons.org/licenses/by/4.0/",
     "Resources": [
@@ -468,6 +472,6 @@
         }
       ]
     },
-    "Citation": "IMOS [year-of-data-download], [Title], [data-access-URL], accessed [date-of-access]"
+    "Citation": "The citation in a list of references is: \"Integrated Marine Observing System (IMOS); Hindell, Mark [year-of-data-download], [Title], Antarctic Climate and Ecosystem Cooperative Research Centre (ACE CRC); Institute for Marine and Antarctic Studies (IMAS), University of Tasmania (UTAS), [data-access-URL], accessed [date-of-access].\" If data includes turtle data between January 2022 - June 2024, please cite as follows:\n\"Integrated Marine Observing System (IMOS); Hindell, Mark; Department of Defence [year-of-data-download], [Title], Antarctic Climate and Ecosystem Cooperative Research Centre (ACE CRC); Institute for Marine and Antarctic Studies (IMAS), University of Tasmania (UTAS), [data-access-URL], accessed [date-of-access].\""
   }
 }

aodn_cloud_optimised/config/dataset/autonomous_underwater_vehicle.json

Lines changed: 5 additions & 3 deletions
@@ -179,14 +179,16 @@
   },
   "force_old_pq_del": false,
   "aws_opendata_registry": {
-    "Name": "IMOS - Autonomous Underwater Vehicle (AUV) Facility",
+    "Name": "Autonomous underwater vehicles",
     "Description": "The IMOS Autonomous Underwater Vehicle (AUV) Facility operates multiple ocean-going AUVs capable of undertaking high resolution, geo-referenced survey work. AUV Sirius is a modified version of a mid-size robotic vehicle Seabed built at the Woods Hole Oceanographic Institution. This class of AUV has been designed specifically for low speed, high resolution imaging and is passively stable in pitch and roll. AUV Nimbus is a custom design with the aim of deploying off smaller vessels than Sirius whilst remaining able to operate in similar environments. AUV Iver is an extended Ocean Server Iver2 class platform with additional navigation and imaging sensors but lacks the ability to work over complex underwater terrain. Based on its small size, it can be deployed from RHIBs and other small vessels. The main objective of the IMOS AUV Facility is to support sustained observations of the benthos, in support of the IMOS integrated benthic monitoring program. . The AUV facility is based at the Australian Centre for Field Robotics (ACFR) within the School of Aerospace, Mechanical and Mechatronic Engineering at the University of Sydney.\n\nThis IMOS Facility finished in June 2023, with data still available through the AODN Portal.",
     "Documentation": "https://catalogue-imos.aodn.org.au/geonetwork/srv/eng/catalog.search#/metadata/af5d0ff9-bb9c-4b7c-a63c-854a630b6984",
     "Contact": "info@aodn.org.au",
     "ManagedBy": "AODN",
     "UpdateFrequency": "As Needed",
     "Tags": [
-      "FILL UP MANUALLY - CHECK DOCUMENTATION"
+      "oceans",
+      "chemistry",
+      "chemical biology"
     ],
     "License": "http://creativecommons.org/licenses/by/4.0/",
     "Resources": [
@@ -215,6 +217,6 @@
         }
       ]
     },
-    "Citation": "IMOS [year-of-data-download], [Title], [data-access-URL], accessed [date-of-access]"
+    "Citation": "The citation in a list of references is: \"IMOS [year-of-data-download], [Title], [data-access-URL], accessed [date-of-access].\""
   }
 }

aodn_cloud_optimised/config/dataset/model_sea_level_anomaly_gridded_realtime.json

Lines changed: 1 addition & 1 deletion
@@ -170,6 +170,6 @@
         }
       ]
     },
-    "Citation": "IMOS [year-of-data-download], [Title], [data-access-URL], accessed [date-of-access]"
+    "Citation": "The citation in a list of references is: \"IMOS [year-of-data-download], [Title], [data-access-URL], accessed [date-of-access].\""
   }
 }

aodn_cloud_optimised/config/dataset/mooring_hourly_timeseries_delayed_qc.json

Lines changed: 1 addition & 1 deletion
@@ -813,6 +813,6 @@
         }
       ]
     },
-    "Citation": "IMOS [year-of-data-download], [Title], [data-access-URL], accessed [date-of-access]. If using data from the Ningaloo (TAN100) mooring, please add to the citation - \"Department of Jobs, Tourism, Science and Innovation (DJTSI), Western Australian Government\". "
+    "Citation": "The citation in a list of references is: \"IMOS [year-of-data-download], [Title], [data-access-URL], accessed [date-of-access].\" If using data from the Ningaloo (TAN100) mooring, please add to the citation - \"Department of Jobs, Tourism, Science and Innovation (DJTSI), Western Australian Government\". "
   }
 }

aodn_cloud_optimised/config/dataset/mooring_satellite_altimetry_calibration_validation.json

Lines changed: 5 additions & 3 deletions
@@ -302,14 +302,16 @@
   },
   "force_old_pq_del": false,
   "aws_opendata_registry": {
-    "Name": "IMOS - SRS Satellite Altimetry Calibration and Validation Sub-Facility",
+    "Name": "Satellite - Altimetry calibration and validation",
     "Description": "High precision satellite altimeter missions including TOPEX/Poseidon (T/P), Jason-1 and now OSTM/Jason-2, have contributed fundamental advances in our understanding of regional and global ocean circulation and its role in the Earth's climate and regional applications. These altimeter satellites essentially observe the height of the global oceans \u2013 as such, they have become the tool of choice for scientists to measure sea level rise \u2013 both at regional and global scales as well as giving information about ocean currents and large- and small-scale variability. The determination of changes in global mean sea level is of fundamental importance in understanding the response of the ocean to a continuing warming climate \u2013 both through thermal expansion of the ocean, melting of the major ice sheets of Greenland and Antarctica, and mountain glaciers, and redistribution of water over the continents and atmosphere. As with all scientific observations, it is vital that the measurement tool is operating within its specifications \u2013 calibration and validation is therefore an important component in this regard. \n\nThis IMOS sub-facility provides the sole southern hemisphere in situ calibration site to provide an ongoing calibration and validation data stream directly to the international (NASA and CNES sponsored) Ocean Surface Topography Science Team (OSTST). This contribution, importantly, spans multiple altimeter missions, thus enabling the assimilation of multi-mission satellite data to determine ongoing changes in sea level with flow-on effects to other uses of this data. The OSTST collectively oversees the operation and calibration of the precision altimetry missions, and ensures each of these missions is performing as accurately as possible in order to meet mission objectives. The IMOS supported calibration site in Australia is one of four primary in situ calibration/validation sites that contribute to the OSTST. The remaining sites include Harvest (USA), Corsica (France), and Gavdos (Greece). The Australian calibration site includes two comparison points where in situ data is compared against the altimeter \u2013 Bass Strait and Storm Bay. These two locations both lie on descending (N -> S) pass 088 of the satellite altimeter, and thus share similar satellite orbit characteristics. The use of these two sites allows detailed investigation into the accuracy of the altimeter over two different wave climates. The average significant wave height at Storm Bay is approximately double that observed at the comparatively sheltered Bass Strait location. One of the ongoing issues with satellite altimeter missions is the \u201cSea-state bias\u201d correction which is related to wave shape and height. We plan to use the different wave climates at the two locations, coupled with the fact that some other things (such as orbit errors) are the same at both locations to improve the quality of this correction.",
     "Documentation": "https://catalogue-imos.aodn.org.au/geonetwork/srv/eng/catalog.search#/metadata/78d588ed-79dd-47e2-b806-d39025194e7e",
     "Contact": "info@aodn.org.au",
     "ManagedBy": "AODN",
     "UpdateFrequency": "As Needed",
     "Tags": [
-      "FILL UP MANUALLY - CHECK DOCUMENTATION"
+      "oceans",
+      "ocean currents",
+      "chemistry"
     ],
     "License": "http://creativecommons.org/licenses/by/4.0/",
     "Resources": [
@@ -338,6 +340,6 @@
         }
       ]
     },
-    "Citation": "IMOS [year-of-data-download], [Title], [data-access-URL], accessed [date-of-access]"
+    "Citation": "The citation in a list of references is: \"IMOS [year-of-data-download], [Title], [data-access-URL], accessed [date-of-access].\""
   }
 }
