From e096fc0ec6fddf7a207dc3d7c2f648e0fe3b29ce Mon Sep 17 00:00:00 2001 From: eecavanna Date: Wed, 29 Jan 2025 18:39:17 -0800 Subject: [PATCH 1/4] Replace misleading schema filename reference in default param value --- metadata-translation/src/bin/validate_json.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/metadata-translation/src/bin/validate_json.py b/metadata-translation/src/bin/validate_json.py index e6e75bda..f8cb6911 100644 --- a/metadata-translation/src/bin/validate_json.py +++ b/metadata-translation/src/bin/validate_json.py @@ -11,6 +11,10 @@ def validate_json(data_path, schema_path, log_file): + r""" + TODO: Document this function. + TODO: Add type hints for this function's parameters and its return value. + """ with open(data_path, "r") as json_file: # load data data = json.load(json_file) @@ -32,9 +36,13 @@ def validate_json(data_path, schema_path, log_file): def test_gold_study_json( data_path="output/nmdc_etl/gold_study.json", - schema_path="../../../schema/nmdc.schema.json", + schema_path="/path/to/nmdc_materialized_patterns.schema.json", log_file="study_error.log", ): + r""" + TODO: Document this function. + TODO: Add type hints for this function's parameters and its return value. 
+ """ valid = validate_json(data_path, schema_path, log_file) assert valid From 24367619b84828fd95a55aa7af8b77d6c2ac3698 Mon Sep 17 00:00:00 2001 From: eecavanna Date: Wed, 29 Jan 2025 18:40:55 -0800 Subject: [PATCH 2/4] Replace obsolete schema filename in Python notebook Markdown prose --- .../notebooks/ghissue_252_253_linked_samples.ipynb | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/metadata-translation/notebooks/ghissue_252_253_linked_samples.ipynb b/metadata-translation/notebooks/ghissue_252_253_linked_samples.ipynb index 7e06bf41..2e12f28b 100644 --- a/metadata-translation/notebooks/ghissue_252_253_linked_samples.ipynb +++ b/metadata-translation/notebooks/ghissue_252_253_linked_samples.ipynb @@ -301,9 +301,7 @@ { "cell_type": "markdown", "metadata": {}, - "source": [ - "Need to update mongo collection schema to account for updated fields in nmdc.schema.json:" - ] + "source": "Need to update mongo collection schema to account for updated fields in `nmdc_materialized_patterns.schema.json`:" }, { "cell_type": "code", From 1e0e1b25f938028eae730ff412f1a3745a442d10 Mon Sep 17 00:00:00 2001 From: eecavanna Date: Wed, 29 Jan 2025 18:43:42 -0800 Subject: [PATCH 3/4] Replace obsolete schema filename and URL in JSON Validation tutorial --- docs/tutorials/json.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tutorials/json.md b/docs/tutorials/json.md index e6db42d7..08b77e67 100644 --- a/docs/tutorials/json.md +++ b/docs/tutorials/json.md @@ -137,7 +137,7 @@ endpoint allows you to get a filtered list of documents from one of the NMDC Sch The `collection_name` must be one defined for a [nmdc:Database](https://microbiomedata.github.io/nmdc-schema/Database/), in the form expected by the JSON Schema, -[nmdc.schema.json](https://github.com/microbiomedata/nmdc-schema/blob/69fd1ee91afac1a943b2cc9bfbfdecd0e2cdd089/jsonschema/nmdc.schema.json#L987). 
+[nmdc_materialized_patterns.schema.json](https://github.com/microbiomedata/nmdc-schema/blob/1b42cef7e3a47930d25bde35b4bca0aa4391b283/nmdc_schema/nmdc_materialized_patterns.schema.json#L6699). This typically means that any spaces in the name should be entered as underscores (`_`) instead. The `filter`, if provided, is a JSON document in the form of the From 2042c09e61277468a6a55326ec3d7400e798303a Mon Sep 17 00:00:00 2001 From: eecavanna Date: Fri, 31 Jan 2025 19:44:57 -0800 Subject: [PATCH 4/4] Replace hard-coded obsolete file paths with references to CLI args --- metadata-translation/src/bin/validate_json.py | 49 ++++++++++++------- 1 file changed, 30 insertions(+), 19 deletions(-) diff --git a/metadata-translation/src/bin/validate_json.py b/metadata-translation/src/bin/validate_json.py index f8cb6911..b5144cd5 100644 --- a/metadata-translation/src/bin/validate_json.py +++ b/metadata-translation/src/bin/validate_json.py @@ -1,19 +1,12 @@ -import jsonschema +import sys + from jsonschema import Draft7Validator import json -# test using pytest; call with (python -m) pytest validate_json.py -# def test_always_pass(): -# assert True - -# def test_always_fail(): -# assert False - -def validate_json(data_path, schema_path, log_file): +def validate_json(data_path: str, schema_path: str, log_file: str) -> bool: r""" TODO: Document this function. - TODO: Add type hints for this function's parameters and its return value. 
""" with open(data_path, "r") as json_file: # load data @@ -28,26 +21,44 @@ def validate_json(data_path, schema_path, log_file): if not valid: with open(log_file, "w") as fp: for error in sorted(validator.iter_errors(data), key=lambda e: e.path): - # print(error.message) fp.write(error.message) return valid def test_gold_study_json( - data_path="output/nmdc_etl/gold_study.json", - schema_path="/path/to/nmdc_materialized_patterns.schema.json", - log_file="study_error.log", -): + schema_path: str, + data_path: str, + log_file_path: str = "error.log", +) -> bool: r""" - TODO: Document this function. - TODO: Add type hints for this function's parameters and its return value. + Validates the specified data against the specified schema, writing any validation errors to the specified log file. + + :param schema_path: Path to JSON-formatted NMDC Schema file against which you want to validate the data. + Example value: `/path/to/nmdc_materialized_patterns.schema.json` + :param data_path: Path to JSON-formatted data file you want to validate. + Example value: `/path/to/nmdc_etl/gold_study.json` + :param log_file_path: Path to log file to which you want the function to write validation error messages. """ - valid = validate_json(data_path, schema_path, log_file) + valid = validate_json(data_path, schema_path, log_file_path) assert valid return valid if __name__ == "__main__": - print("study test", test_gold_study_json()) + r""" + Note: In 2025, this script was updated ("quick 'n dirty"-ly) to allow the user to specify the various file paths via + CLI arguments. That update was prompted by team members noticing the hard-coded file paths in this script were + obsolete (i.e. they were paths to files that no longer existed in the repository). + """ + + # If an invalid number of CLI arguments was specified, abort and display a usage string. 
+    if len(sys.argv) not in (3, 4):  # reject too few *and* too many arguments, per the usage string
+        raise SystemExit("Usage: script.py SCHEMA_PATH DATA_PATH [LOG_FILE_PATH]")
+
+    print("study test", test_gold_study_json(
+        schema_path=sys.argv[1],
+        data_path=sys.argv[2],
+        log_file_path=sys.argv[3] if len(sys.argv) == 4 else "error.log",  # never None: it reaches open()
+    ))