diff --git a/docs/tutorials/json.md b/docs/tutorials/json.md
index e6db42d7..08b77e67 100644
--- a/docs/tutorials/json.md
+++ b/docs/tutorials/json.md
@@ -137,7 +137,7 @@ endpoint allows you to get a filtered list of documents from one of the NMDC Sch
 
 The `collection_name` must be one defined for a [nmdc:Database](https://microbiomedata.github.io/nmdc-schema/Database/),
 in the form expected by the JSON Schema,
-[nmdc.schema.json](https://github.com/microbiomedata/nmdc-schema/blob/69fd1ee91afac1a943b2cc9bfbfdecd0e2cdd089/jsonschema/nmdc.schema.json#L987).
+[nmdc_materialized_patterns.schema.json](https://github.com/microbiomedata/nmdc-schema/blob/1b42cef7e3a47930d25bde35b4bca0aa4391b283/nmdc_schema/nmdc_materialized_patterns.schema.json#L6699).
 This typically means that any spaces in the name should be entered as underscores (`_`) instead.
 
 The `filter`, if provided, is a JSON document in the form of the
diff --git a/metadata-translation/notebooks/ghissue_252_253_linked_samples.ipynb b/metadata-translation/notebooks/ghissue_252_253_linked_samples.ipynb
index 7e06bf41..2e12f28b 100644
--- a/metadata-translation/notebooks/ghissue_252_253_linked_samples.ipynb
+++ b/metadata-translation/notebooks/ghissue_252_253_linked_samples.ipynb
@@ -301,9 +301,7 @@
   {
    "cell_type": "markdown",
    "metadata": {},
-   "source": [
-    "Need to update mongo collection schema to account for updated fields in nmdc.schema.json:"
-   ]
+   "source": "Need to update mongo collection schema to account for updated fields in `nmdc_materialized_patterns.schema.json`:"
   },
   {
    "cell_type": "code",
diff --git a/metadata-translation/src/bin/validate_json.py b/metadata-translation/src/bin/validate_json.py
index e6e75bda..b5144cd5 100644
--- a/metadata-translation/src/bin/validate_json.py
+++ b/metadata-translation/src/bin/validate_json.py
@@ -1,16 +1,13 @@
-import jsonschema
+import sys
+
 from jsonschema import Draft7Validator
 import json
 
-# test using pytest; call with (python -m) pytest validate_json.py
-# def test_always_pass():
-#     assert True
-
-# def test_always_fail():
-#     assert False
-
-def validate_json(data_path, schema_path, log_file):
+def validate_json(data_path: str, schema_path: str, log_file: str) -> bool:
+    r"""
+    TODO: Document this function.
+    """
     with open(data_path, "r") as json_file:
         # load data
         data = json.load(json_file)
 
@@ -24,22 +21,47 @@ def validate_json(data_path, schema_path, log_file):
 
     if not valid:
         with open(log_file, "w") as fp:
             for error in sorted(validator.iter_errors(data), key=lambda e: e.path):
-                # print(error.message)
                 fp.write(error.message)
 
     return valid
 
 
 def test_gold_study_json(
-    data_path="output/nmdc_etl/gold_study.json",
-    schema_path="../../../schema/nmdc.schema.json",
-    log_file="study_error.log",
-):
-    valid = validate_json(data_path, schema_path, log_file)
+    schema_path: str,
+    data_path: str,
+    log_file_path: str = "error.log",
+) -> bool:
+    r"""
+    Validates the specified data against the specified schema, writing any validation errors to the specified log file.
+
+    :param schema_path: Path to JSON-formatted NMDC Schema file against which you want to validate the data.
+                        Example value: `/path/to/nmdc_materialized_patterns.schema.json`
+    :param data_path: Path to JSON-formatted data file you want to validate.
+                      Example value: `/path/to/nmdc_etl/gold_study.json`
+    :param log_file_path: Path to log file to which you want the function to write validation error messages.
+    """
+    valid = validate_json(data_path, schema_path, log_file_path)
     assert valid
     return valid
 
 
 if __name__ == "__main__":
-    print("study test", test_gold_study_json())
+    r"""
+    Note: In 2025, this script was updated ("quick 'n dirty"-ly) to allow the user to specify the various file paths via
+    CLI arguments. That update was prompted by team members noticing the hard-coded file paths in this script were
+    obsolete (i.e. they were paths to files that no longer existed in the repository).
+    """
+
+    # If an invalid number of CLI arguments was specified, abort and display a usage string.
+    if len(sys.argv) < 3:
+        raise SystemExit("Usage: script.py SCHEMA_PATH DATA_PATH [LOG_FILE_PATH]")
+
+    print("study test", test_gold_study_json(
+        schema_path=sys.argv[1],
+        data_path=sys.argv[2],
+        # Fall back to the parameter's documented default ("error.log") rather than
+        # passing None, which would both violate the `str` annotation and make
+        # `open(None, "w")` raise TypeError when validation errors need logging.
+        log_file_path=sys.argv[3] if len(sys.argv) > 3 else "error.log",
+    ))