From d3f8eadfce2a97c4eac0859c7e8347cd0f6d27c1 Mon Sep 17 00:00:00 2001
From: Orion Banks <49208907+Bankso@users.noreply.github.com>
Date: Mon, 19 May 2025 11:45:48 -0700
Subject: [PATCH 01/36] Create generate_duo_schema.py

Initial script version, created using ChatGPT
---
 utils/generate_duo_schema.py | 80 ++++++++++++++++++++++++++++++++++++
 1 file changed, 80 insertions(+)
 create mode 100644 utils/generate_duo_schema.py

diff --git a/utils/generate_duo_schema.py b/utils/generate_duo_schema.py
new file mode 100644
index 00000000..88acaf2c
--- /dev/null
+++ b/utils/generate_duo_schema.py
@@ -0,0 +1,80 @@
+import pandas as pd
+import json
+import argparse
+from collections import OrderedDict
+
+def build_condition(row):
+    """
+    Builds a JSON Schema if-then rule based on a row from the data dictionary.
+    """
+    condition = {
+        "if": {
+            "properties": {
+                "duoCodes": {
+                    "type": "array",
+                    "contains": { "const": row["DUO_Code"] }
+                }
+            },
+            "required": ["duoCodes"]
+        },
+        "then": {
+            "properties": {
+                "_accessRequirementIds": {
+                    "type": "array",
+                    "contains": { "const": int(row["Access_Requirement_ID"]) }
+                }
+            }
+        }
+    }
+
+    # Optional conditional fields
+    additional_conditions = {}
+    required_fields = ["duoCodes"]
+
+    if "Grant_Number" in row and pd.notna(row["Grant_Number"]):
+        additional_conditions["grantNumber"] = { "const": row["Grant_Number"] }
+        required_fields.append("grantNumber")
+
+    if "Data_Type" in row and pd.notna(row["Data_Type"]):
+        additional_conditions["dataType"] = { "const": row["Data_Type"] }
+        required_fields.append("dataType")
+
+    if additional_conditions:
+        condition["if"]["properties"].update(additional_conditions)
+        condition["if"]["required"] = required_fields
+
+    return condition
+
+
+def generate_json_schema(csv_path, output_path, title="DUO Access Schema", version="1.0.0", org_id="MC2-Custom"):
+    df = pd.read_csv(csv_path)
+
+    conditions = []
+    for _, row in df.iterrows():
+        condition = build_condition(row)
+        conditions.append(condition)
+
+    schema = OrderedDict({
+        "$schema": "http://json-schema.org/draft-07/schema",
+        "title": title,
+        "$id": f"{org_id}-duoCodeAR-{version}",
+        "description": "Auto-generated schema defining DUO-based access restrictions.",
+        "allOf": conditions
+    })
+
+    with open(output_path, 'w') as f:
+        json.dump(schema, f, indent=2)
+
+    print(f"✅ JSON Schema written to {output_path}")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Generate DUO JSON Schema from Data Dictionary CSV")
+    parser.add_argument("csv_path", help="Path to the data_dictionary.csv")
+    parser.add_argument("output_path", help="Path to output JSON schema")
+    parser.add_argument("--title", default="DUO Access Schema", help="Schema title")
+    parser.add_argument("--version", default="1.0.0", help="Schema version")
+    parser.add_argument("--org_id", default="MC2-Custom", help="Organization ID for $id field")
+
+    args = parser.parse_args()
+    generate_json_schema(args.csv_path, args.output_path, args.title, args.version, args.org_id)
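
For reference, the if-then shape this first version emits for a single dictionary row looks like the sketch below; the row values are invented, and the dict mirrors what build_condition in the patch constructs rather than importing it:

    import json

    # Hypothetical data dictionary row (both values are assumptions for illustration)
    row = {"DUO_Code": "DUO:0000007", "Access_Requirement_ID": "9606012"}

    condition = {
        "if": {
            # The entity's duoCodes annotation must contain this DUO code
            "properties": {"duoCodes": {"type": "array", "contains": {"const": row["DUO_Code"]}}},
            "required": ["duoCodes"],
        },
        "then": {
            # ...which derives the matching access requirement id
            "properties": {
                "_accessRequirementIds": {"type": "array", "contains": {"const": int(row["Access_Requirement_ID"])}}
            }
        },
    }
    print(json.dumps(condition, indent=2))
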
+annotations/output/*
+mapped_metadata.csv

From d7ebc25176d58e98f2f0cc6b4ce98b1180160f14 Mon Sep 17 00:00:00 2001
From: Orion Banks <49208907+Bankso@users.noreply.github.com>
Date: Wed, 21 May 2025 09:44:12 -0700
Subject: [PATCH 03/36] Update generate_duo_schema.py

Adjust annotation names and argparse arguments; add an additional
condition based on the "Activated_By_Attribute" column in the source AR
data dictionary
---
 utils/generate_duo_schema.py | 35 +++++++++++++++++++++++++----------
 1 file changed, 25 insertions(+), 10 deletions(-)

diff --git a/utils/generate_duo_schema.py b/utils/generate_duo_schema.py
index 88acaf2c..b72cfdd7 100644
--- a/utils/generate_duo_schema.py
+++ b/utils/generate_duo_schema.py
@@ -10,17 +10,23 @@ def build_condition(row):
     condition = {
         "if": {
             "properties": {
-                "duoCodes": {
+                "dataUseModifiers": {
                     "type": "array",
+                    "items": {
+                        "type": "string"
+                    },
                     "contains": { "const": row["DUO_Code"] }
                 }
             },
-            "required": ["duoCodes"]
+            "required": ["dataUseModifiers"]
         },
         "then": {
             "properties": {
                 "_accessRequirementIds": {
                     "type": "array",
+                    "items": {
+                        "type": "string"
+                    },
                     "contains": { "const": int(row["Access_Requirement_ID"]) }
                 }
             }
@@ -29,7 +35,7 @@ def build_condition(row):

     # Optional conditional fields
     additional_conditions = {}
-    required_fields = ["duoCodes"]
+    required_fields = ["dataUseModifiers"]

     if "Grant_Number" in row and pd.notna(row["Grant_Number"]):
         additional_conditions["grantNumber"] = { "const": row["Grant_Number"] }
@@ -39,6 +45,10 @@ def build_condition(row):
         additional_conditions["dataType"] = { "const": row["Data_Type"] }
         required_fields.append("dataType")

+    if "Activated_By_Attribute" in row and pd.notna(row["Activated_By_Attribute"]):
+        additional_conditions["activatedByAttribute"] = { "const": row["Activated_By_Attribute"] }
+        required_fields.append("activatedByAttribute")
+
     if additional_conditions:
         condition["if"]["properties"].update(additional_conditions)
         condition["if"]["required"] = required_fields
@@ -46,18 +56,20 @@ def build_condition(row):
     return condition


-def generate_json_schema(csv_path, output_path, title="DUO Access Schema", version="1.0.0", org_id="MC2-Custom"):
+def generate_json_schema(csv_path, output_path, title, version, org_id, grant_id):
     df = pd.read_csv(csv_path)

     conditions = []
     for _, row in df.iterrows():
+        if grant_id and row["Grant_Number"] != grant_id:
+            continue
         condition = build_condition(row)
         conditions.append(condition)

     schema = OrderedDict({
         "$schema": "http://json-schema.org/draft-07/schema",
         "title": title,
-        "$id": f"{org_id}-duoCodeAR-{version}",
+        "$id": f"{org_id}-{grant_id}-AccessRequirementSchema-{version}",
         "description": "Auto-generated schema defining DUO-based access restrictions.",
         "allOf": conditions
     })
@@ -71,10 +83,13 @@ def generate_json_schema(csv_path, output_path, title="DUO Access Schema", versi
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description="Generate DUO JSON Schema from Data Dictionary CSV")
     parser.add_argument("csv_path", help="Path to the data_dictionary.csv")
-    parser.add_argument("output_path", help="Path to output JSON schema")
-    parser.add_argument("--title", default="DUO Access Schema", help="Schema title")
-    parser.add_argument("--version", default="1.0.0", help="Schema version")
-    parser.add_argument("--org_id", default="MC2-Custom", help="Organization ID for $id field")
+    parser.add_argument("output_path", help="Path to output directory for the JSON schema")
+    parser.add_argument("-t", "--title", default="AccessRequirementSchema", help="Schema title")
+    parser.add_argument("-v", "--version", default="v1.0.0", help="Schema version")
+    parser.add_argument("-o", "--org_id", default="MC2", help="Organization ID for $id field")
+    parser.add_argument("-g", "--grant_id", help="Grant number to select conditions for from reference table")

     args = parser.parse_args()
-    generate_json_schema(args.csv_path, args.output_path, args.title, args.version, args.org_id)
+
+    output_path = "".join([args.output_path, "/", args.org_id, ".", f"{args.grant_id}-" if args.grant_id else "", "AccessRequirement-", args.version, ".schema.json"])
+    generate_json_schema(args.csv_path, output_path, args.title, args.version, args.org_id, args.grant_id)
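
With these arguments, a run such as `python utils/generate_duo_schema.py data_dictionary.csv json_schemas -o MC2 -g CA274499` writes the schema to a path built as in this sketch (the directory, grant, and version values are assumptions):

    output_dir, org_id, grant_id, version = "json_schemas", "MC2", "CA274499", "v1.0.0"
    output_path = "".join([output_dir, "/", org_id, ".", f"{grant_id}-" if grant_id else "",
                           "AccessRequirement-", version, ".schema.json"])
    print(output_path)  # json_schemas/MC2.CA274499-AccessRequirement-v1.0.0.schema.json
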
From e02477ddc61743d249816f35874bbb241dfe59dd Mon Sep 17 00:00:00 2001
From: Orion Banks <49208907+Bankso@users.noreply.github.com>
Date: Wed, 21 May 2025 11:43:56 -0700
Subject: [PATCH 04/36] Update generate_duo_schema.py

Add attribute typing and adjust auto-generated schema name
---
 utils/generate_duo_schema.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/utils/generate_duo_schema.py b/utils/generate_duo_schema.py
index b72cfdd7..fc290af8 100644
--- a/utils/generate_duo_schema.py
+++ b/utils/generate_duo_schema.py
@@ -38,16 +38,16 @@ def build_condition(row):
     required_fields = ["dataUseModifiers"]

     if "Grant_Number" in row and pd.notna(row["Grant_Number"]):
-        additional_conditions["grantNumber"] = { "const": row["Grant_Number"] }
+        additional_conditions["grantNumber"] = { "type": "array", "items": { "type": "string" }, "contains": { "const": row["Grant_Number"] } }
         required_fields.append("grantNumber")

     if "Data_Type" in row and pd.notna(row["Data_Type"]):
-        additional_conditions["dataType"] = { "const": row["Data_Type"] }
+        additional_conditions["dataType"] = { "type": "array", "items": { "type": "string" }, "contains": { "const": row["Data_Type"] } }
         required_fields.append("dataType")

     if "Activated_By_Attribute" in row and pd.notna(row["Activated_By_Attribute"]):
-        additional_conditions["activatedByAttribute"] = { "const": row["Activated_By_Attribute"] }
-        required_fields.append("activatedByAttribute")
+        additional_conditions[row["Activated_By_Attribute"]] = { "type": "array", "items": { "type": "string" }, "contains": { "const": "True" } }
+        required_fields.append(row["Activated_By_Attribute"])

     if additional_conditions:
         condition["if"]["properties"].update(additional_conditions)
@@ -91,5 +91,5 @@ def generate_json_schema(csv_path, output_path, title, version, org_id, grant_id

     args = parser.parse_args()

-    output_path = "".join([args.output_path, "/", args.org_id, ".", f"{args.grant_id}-" if args.grant_id else "", "AccessRequirement-", args.version, ".schema.json"])
+    output_path = "".join([args.output_path, "/", args.org_id, ".", "AccessRequirement-", f"{args.grant_id}-" if args.grant_id else "", args.version, "-schema.json"])
     generate_json_schema(args.csv_path, output_path, args.title, args.version, args.org_id, args.grant_id)
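
The switch from bare "const" values to array-typed properties with "contains" reflects that the annotation values arrive as lists. A minimal check with the third-party jsonschema package (not used by the scripts themselves; all values invented, and the "then" consequence simplified to a required-key check) shows a list-valued annotation satisfying such a condition:

    from jsonschema import Draft7Validator

    schema = {
        "if": {
            "properties": {"grantNumber": {"type": "array", "items": {"type": "string"},
                                           "contains": {"const": "CA274499"}}},
            "required": ["grantNumber"],
        },
        "then": {"required": ["_accessRequirementIds"]},  # simplified consequence for the demo
    }
    annotations = {"grantNumber": ["CA274499"], "_accessRequirementIds": [9606012]}
    print(list(Draft7Validator(schema).iter_errors(annotations)))  # [] -> rule satisfied
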
From bbdac942233aa386504c4104e871060d46a5fad3 Mon Sep 17 00:00:00 2001
From: Orion Banks <49208907+Bankso@users.noreply.github.com>
Date: Wed, 21 May 2025 11:46:29 -0700
Subject: [PATCH 05/36] Update synapse_json_schema_bind.py

- simplify input arguments to take either a path or url that points to
  a JSON schema
- update expected naming convention for JSON
- update path/url parsing to accept AR and non-AR schemas
---
 utils/synapse_json_schema_bind.py | 116 +++++++++++++-----------------
 1 file changed, 50 insertions(+), 66 deletions(-)

diff --git a/utils/synapse_json_schema_bind.py b/utils/synapse_json_schema_bind.py
index 8c77e316..98ecac35 100644
--- a/utils/synapse_json_schema_bind.py
+++ b/utils/synapse_json_schema_bind.py
@@ -20,41 +20,37 @@ def get_args():
     parser.add_argument(
         "-t",
         type=str,
+        default=None,
         help="Synapse Id of an entity to which a schema will be bound.",
-        required=False
+        required=True
     )
     parser.add_argument(
-        "-c",
+        "-l",
         type=str,
-        help="The Component name of the schema that will be bound to the requested entity.",
+        default=None,
+        help="The URL for the JSON schema to be bound to the requested entity.",
         required=False
     )
     parser.add_argument(
-        "-v",
-        type=str,
-        help="The release version of the schema. This should match the release version tag on GitHub.",
-        required=True
-    )
-    parser.add_argument(
-        "-g",
+        "-p",
         type=str,
-        help="Grant number associated with a duoCodeAR type schema, in CAxxxxxx format (e.g., CA274499).",
+        default=None,
+        help="The file path for the JSON schema to be bound to the requested entity.",
         required=False
     )
     parser.add_argument(
-        "-s",
+        "-n",
         type=str,
-        help="Path to a CSV file with entity Synapse Ids and Components on each row.",
-        required=False
+        default="Multi Consortia Coordinating Center",
+        help="The name of the organization with which the JSON schema should be associated.",
+        required=True
     )

     return parser.parse_args()


-def get_schema_organization(service) -> tuple:
+def get_schema_organization(service, org_name: str) -> tuple:
     """Create or access the MC2 Center Synapse organization,
     return a tuple of schema service object, organization object, and organization name"""
-
-    org_name = "Multi Consortia Coordinating Center"

     print(f"Creating organization: {org_name}")
@@ -72,7 +68,7 @@ def register_json_schema(org, schema_type: str, schema_json: json, version: str,
     """Register or access a previously registered JSON schema and return the uri.
     If the schema was previously registered, the constructed uri will be returned.
     uri format: [schema_org_name]-[schema_type]-[num_version]
-    Example uri: MultiConsortiaCoordinatingCenter-CA987654duoCodeAR-2.0.0
+    Example uri: MultiConsortiaCoordinatingCenter-CA987654AccessRequirement-2.0.0
     """

     num_version = version.split("v")[1]
@@ -95,7 +91,7 @@ def bind_schema_to_entity(syn, service, schema_uri: str, entity_id: str, compone
     For JSON schemas associated with DUO-based access restrictions, use the REST API and enable derived annotations,
     For non-AR schemas, use the python client bind_json_schema function"""

-    if component_type != "duoCodeAR":
+    if component_type != "AccessRequirement":
         print(f"Binding non-AR schema {schema_uri}")
         service.bind_json_schema(schema_uri, entity_id)
@@ -111,44 +107,48 @@ def bind_schema_to_entity(syn, service, schema_uri: str, entity_id: str, compone

-def get_schema_from_url(component: str, version: str, grant: str) -> tuple[any, str]:
-    """Access a JSON schema stored in the MC2 Center data-models GitHub repo,
-    based on the version release tag, data type, and grant number (if of type duoCodeAR).
-    Return request JSON and adjusted component name.
+def get_schema_from_url(url: str, path: str) -> tuple[any, str, str, str]:
+    """Access a JSON schema via a provided path or URL.
+    Return request JSON and parsed schema name elements.
     Note that the filename must match expected conventions:
-    Non-AR schema example: mc2.DatasetView.schema.json
-    AR schema example: mc2.CA987654duoCodeAR.schema.json
+    Non-AR schema example: mc2.DatasetView-v1.0.0-schema.json
+    AR schema example: MC2.AccessRequirement-CA000001-v3.0.2-schema.json
     """

-    #base_schema_url = "".join(["https://raw.githubusercontent.com/mc2-center/data-models/refs/tags/v", version, "/json_schemas/"])
-    base_schema_url = "".join(["https://raw.githubusercontent.com/mc2-center/data-models/refs/heads/136-173-dataset-schema/json_schemas/"])
-
-    if grant is not None and component == "duoCodeAR":
-        component = "".join([grant, component])
-
-    component_json_name = ".".join(["mc2", component, "schema", "json"])
-
-    schema_url = "".join([base_schema_url, component_json_name])
-
-    source_schema = requests.get(schema_url)
-
-    schema_json = source_schema.json()
+    if url or path is not None:
+        if url is not None:
+            schema = url
+            source_schema = requests.get(url)
+            schema_json = source_schema.json()
+        else:
+            schema = path
+            source_schema = open(path, "r")
+            schema_json = json.load(source_schema)
+
+        schema_info = schema.split("/")[-1]
+        base_component = schema_info.split(".")[1].split("-")[0]
+        if base_component == "AccessRequirement":
+            component = "".join(schema_info.split("-")[0:2]).split(".")[1]
+            version = schema_info.split("-")[2]
+        else:
+            component = base_component
+            version = schema_info.split("-")[1]

     print(f"JSON schema {component} {version} successfully acquired from repository")

-    return schema_json, component
+    return schema_json, component, base_component, version


-def get_register_bind_schema(syn, component: str, grant: str, version: str, target: str, schema_org_name: str, org, service):
+def get_register_bind_schema(syn, target: str, schema_org_name: str, org, service, path, url):
     """Access JSON from URL, register the JSON schema,
     and bind the schema to the target entity."""
-
-    schema_json, component_adjusted = get_schema_from_url(component, version, grant)
+
+    schema_json, component_adjusted, base_component, version = get_schema_from_url(url, path)

     print(f"Registering JSON schema {component_adjusted} {version}")

     uri = register_json_schema(org, component_adjusted, schema_json, version, schema_org_name)
-
-    bound_schema = bind_schema_to_entity(syn, service, uri, target, component)
+
+    bind_schema_to_entity(syn, service, uri, target, base_component)

     print(f"\nSchema {component_adjusted} {version} successfully bound to entity {target}")
@@ -158,34 +158,18 @@ def main():

     args = get_args()

-    target, component, version, grant, sheet = args.t, args.c, args.v, args.g, args.s
+    target, url, path, org_name = args.t, args.l, args.p, args.n

     syn.get_available_services()

     schema_service = syn.service("json_schema")

-    service, org, schema_org_name = get_schema_organization(schema_service)
-
-    if sheet:
-        id_set = pd.read_csv(sheet, header=None)
-        if id_set.iat[0,0] == "entity" and id_set.iat[0,1] == "component":
-            print(f"\nInput sheet read successfully!\n\nBinding schemas now...")
-            id_set = id_set.iloc[1:,:]
-            count = 0
-            for row in id_set.itertuples(index=False):
-                target = row[0]
-                component = row[1]
-                get_register_bind_schema(syn, component, grant, version, target, schema_org_name, org, service)
-                count += 1
-            print(f"\n\nDONE ✅\n{count} schemas bound")
-        else:
-            print(f"\n❗❗❗ The table provided does not appear to be formatted for this operation.❗❗❗\nPlease check its contents and try again.")
+    service, org, schema_org_name = get_schema_organization(schema_service, org_name)

-    else: # if no sheet provided, run process for one round of inputs only
-        if target and component:
-            get_register_bind_schema(syn, component, grant, version, target, schema_org_name, org, service)
-        else:
-            print(f"\n❗❗❗ No dataset information provided.❗❗❗\nPlease check your command line inputs and try again.")
+    if target:
+        get_register_bind_schema(syn, target, schema_org_name, org, service, path, url)
+    else:
+        print(f"\n❗❗❗ No dataset information provided.❗❗❗\nPlease check your command line inputs and try again.")

 if __name__ == "__main__":
     main()
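
The two naming conventions above parse as follows; this sketch repeats the same split logic on the example filenames from the docstring:

    for name in ["mc2.DatasetView-v1.0.0-schema.json",
                 "MC2.AccessRequirement-CA000001-v3.0.2-schema.json"]:
        base_component = name.split(".")[1].split("-")[0]
        if base_component == "AccessRequirement":
            component = "".join(name.split("-")[0:2]).split(".")[1]
            version = name.split("-")[2]
        else:
            component = base_component
            version = name.split("-")[1]
        print(component, version)
    # DatasetView v1.0.0
    # AccessRequirementCA000001 v3.0.2
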
From 6bbf136e1e00436fa30e40b76be125a8b9d34a33 Mon Sep 17 00:00:00 2001
From: Orion Banks <49208907+Bankso@users.noreply.github.com>
Date: Wed, 21 May 2025 12:47:53 -0700
Subject: [PATCH 06/36] Update generate_duo_schema.py

Modify script to automatically generate additional conditions based on
columns in the data dictionary CSV, provided they are not considered a
"base condition", as defined on line 60
---
 utils/generate_duo_schema.py | 38 ++++++++++++++++++------------------
 1 file changed, 19 insertions(+), 19 deletions(-)

diff --git a/utils/generate_duo_schema.py b/utils/generate_duo_schema.py
index fc290af8..34bd78af 100644
--- a/utils/generate_duo_schema.py
+++ b/utils/generate_duo_schema.py
@@ -3,7 +3,7 @@
 import argparse
 from collections import OrderedDict

-def build_condition(row):
+def build_condition(row, col_names):
     """
     Builds a JSON Schema if-then rule based on a row from the data dictionary.
     """
@@ -15,7 +15,7 @@ def build_condition(row):
                     "items": {
                         "type": "string"
                     },
-                    "contains": { "const": row["DUO_Code"] }
+                    "contains": { "const": row["dataUseModifiers"] }
                 }
             },
             "required": ["dataUseModifiers"]
@@ -27,7 +27,7 @@ def build_condition(row):
                     "items": {
                         "type": "string"
                     },
-                    "contains": { "const": int(row["Access_Requirement_ID"]) }
+                    "contains": { "const": int(row["accessRequirementId"]) }
                 }
             }
         }
@@ -37,17 +37,14 @@ def build_condition(row):
     additional_conditions = {}
     required_fields = ["dataUseModifiers"]

-    if "Grant_Number" in row and pd.notna(row["Grant_Number"]):
-        additional_conditions["grantNumber"] = { "type": "array", "items": { "type": "string" }, "contains": { "const": row["Grant_Number"] } }
-        required_fields.append("grantNumber")
+    if "activatedByAttribute" in row and pd.notna(row["activatedByAttribute"]):
+        additional_conditions[row["activatedByAttribute"]] = { "type": "array", "items": { "type": "string" }, "contains": { "const": row["activationValue"] } }
+        required_fields.append(row["activatedByAttribute"])

-    if "Data_Type" in row and pd.notna(row["Data_Type"]):
-        additional_conditions["dataType"] = { "type": "array", "items": { "type": "string" }, "contains": { "const": row["Data_Type"] } }
-        required_fields.append("dataType")
-
-    if "Activated_By_Attribute" in row and pd.notna(row["Activated_By_Attribute"]):
-        additional_conditions[row["Activated_By_Attribute"]] = { "type": "array", "items": { "type": "string" }, "contains": { "const": "True" } }
-        required_fields.append(row["Activated_By_Attribute"])
+    for col in col_names:
+        if col in row and pd.notna(row[col]):
+            additional_conditions[col] = { "type": "array", "items": { "type": "string" }, "contains": { "const": row[col] } }
+            required_fields.append(col)

     if additional_conditions:
         condition["if"]["properties"].update(additional_conditions)
@@ -57,13 +54,16 @@ def build_condition(row):

 def generate_json_schema(csv_path, output_path, title, version, org_id, grant_id):
-    df = pd.read_csv(csv_path)
+    df = pd.read_csv(csv_path, header=0, dtype=str)

     conditions = []
+    base_conditions = ["dataUseModifiers", "accessRequirementId", "activatedByAttribute", "activationValue", "entityIdList"]
+    col_names = df.columns.tolist()
+    col_names = [col for col in col_names if col not in base_conditions]
     for _, row in df.iterrows():
-        if grant_id and row["Grant_Number"] != grant_id:
+        if grant_id != "Project" and row["grantNumber"] != grant_id:
             continue
-        condition = build_condition(row)
+        condition = build_condition(row, col_names)
         conditions.append(condition)

     schema = OrderedDict({
@@ -86,10 +86,10 @@ def generate_json_schema(csv_path, output_path, title, version, org_id, grant_id
     parser.add_argument("output_path", help="Path to output directory for the JSON schema")
     parser.add_argument("-t", "--title", default="AccessRequirementSchema", help="Schema title")
     parser.add_argument("-v", "--version", default="v1.0.0", help="Schema version")
-    parser.add_argument("-o", "--org_id", default="MC2", help="Organization ID for $id field")
+    parser.add_argument("-o", "--org_id", default="mc2", help="Organization ID for $id field")
     parser.add_argument("-g", "--grant_id", help="Grant number to select conditions for from reference table")

     args = parser.parse_args()

-    output_path = "".join([args.output_path, "/", args.org_id, ".", "AccessRequirement-", f"{args.grant_id}-" if args.grant_id else "", args.version, "-schema.json"])
-    generate_json_schema(args.csv_path, output_path, args.title, args.version, args.org_id, args.grant_id)
+    output_path = "".join([args.output_path, "/", args.org_id, ".", "AccessRequirement-", f"{args.grant_id}-" if args.grant_id else "Project-", args.version, "-schema.json"])
+    generate_json_schema(args.csv_path, output_path, args.title, args.version, args.org_id, grant_id = args.grant_id if args.grant_id else "Project")

From f9528288824eae559930f4cca12579a6f7fc16a8 Mon Sep 17 00:00:00 2001
From: Orion Banks <49208907+Bankso@users.noreply.github.com>
Date: Wed, 21 May 2025 12:48:20 -0700
Subject: [PATCH 07/36] Update synapse_json_schema_bind.py

Don't require org name, since it has a default
---
 utils/synapse_json_schema_bind.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/utils/synapse_json_schema_bind.py b/utils/synapse_json_schema_bind.py
index 98ecac35..da94f4bc 100644
--- a/utils/synapse_json_schema_bind.py
+++ b/utils/synapse_json_schema_bind.py
@@ -43,7 +43,7 @@ def get_args():
         type=str,
         default="Multi Consortia Coordinating Center",
         help="The name of the organization with which the JSON schema should be associated.",
-        required=True
+        required=False
     )

     return parser.parse_args()
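
To make the base/extra column split introduced in PATCH 06 concrete, here is a hypothetical data dictionary fragment (all values invented). Every column outside base_conditions, here grantNumber, becomes an additional condition on the rows where it is populated:

    import io
    import pandas as pd

    csv = io.StringIO(
        "dataUseModifiers,accessRequirementId,activatedByAttribute,activationValue,entityIdList,grantNumber\n"
        "DUO:0000007,9606012,,,syn111,CA274499\n"
        "DUO:0000026,9606013,userConsent,True,syn222,CA274499\n"
    )
    df = pd.read_csv(csv, header=0, dtype=str)
    base_conditions = ["dataUseModifiers", "accessRequirementId", "activatedByAttribute", "activationValue", "entityIdList"]
    extra = [col for col in df.columns.tolist() if col not in base_conditions]
    print(extra)  # ['grantNumber']
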
From 705edb2e79e6f484e137df391fdd658ecfdcd9e2 Mon Sep 17 00:00:00 2001
From: Orion Banks <49208907+Bankso@users.noreply.github.com>
Date: Wed, 21 May 2025 13:22:32 -0700
Subject: [PATCH 08/36] Add multi_condition parameter

Passing -m at runtime will generate a JSON schema with additional
conditions beyond dataUseModifiers, as defined in the AR data
dictionary CSV. If -m is not given, the JSON schema "if-then"
statements will only be controlled by the value of the
"dataUseModifiers" annotation
---
 utils/generate_duo_schema.py | 27 ++++++++++++++++-----------
 1 file changed, 16 insertions(+), 11 deletions(-)

diff --git a/utils/generate_duo_schema.py b/utils/generate_duo_schema.py
index 34bd78af..e113024d 100644
--- a/utils/generate_duo_schema.py
+++ b/utils/generate_duo_schema.py
@@ -3,7 +3,7 @@
 import argparse
 from collections import OrderedDict

-def build_condition(row, col_names):
+def build_condition(row, col_names, multi_condition):
     """
     Builds a JSON Schema if-then rule based on a row from the data dictionary.
     """
@@ -37,14 +37,15 @@ def build_condition(row, col_names):
     additional_conditions = {}
     required_fields = ["dataUseModifiers"]

-    if "activatedByAttribute" in row and pd.notna(row["activatedByAttribute"]):
-        additional_conditions[row["activatedByAttribute"]] = { "type": "array", "items": { "type": "string" }, "contains": { "const": row["activationValue"] } }
-        required_fields.append(row["activatedByAttribute"])
+    if multi_condition is not None:
+        if "activatedByAttribute" in row and pd.notna(row["activatedByAttribute"]):
+            additional_conditions[row["activatedByAttribute"]] = { "type": "array", "items": { "type": "string" }, "contains": { "const": row["activationValue"] } }
+            required_fields.append(row["activatedByAttribute"])

-    for col in col_names:
-        if col in row and pd.notna(row[col]):
-            additional_conditions[col] = { "type": "array", "items": { "type": "string" }, "contains": { "const": row[col] } }
-            required_fields.append(col)
+        for col in col_names:
+            if col in row and pd.notna(row[col]):
+                additional_conditions[col] = { "type": "array", "items": { "type": "string" }, "contains": { "const": row[col] } }
+                required_fields.append(col)

     if additional_conditions:
         condition["if"]["properties"].update(additional_conditions)
@@ -53,7 +54,10 @@ def build_condition(row, col_names):
     return condition


-def generate_json_schema(csv_path, output_path, title, version, org_id, grant_id):
+def generate_json_schema(csv_path, output_path, title, version, org_id, grant_id, multi_condition):
+    """
+    Generates a JSON Schema from a CSV file containing DUO-based access restrictions.
+    """
     df = pd.read_csv(csv_path, header=0, dtype=str)

     conditions = []
@@ -63,7 +67,7 @@ def generate_json_schema(csv_path, output_path, title, version, org_id, grant_id
     for _, row in df.iterrows():
         if grant_id != "Project" and row["grantNumber"] != grant_id:
             continue
-        condition = build_condition(row, col_names)
+        condition = build_condition(row, col_names, multi_condition)
         conditions.append(condition)

     schema = OrderedDict({
@@ -88,8 +92,9 @@ def generate_json_schema(csv_path, output_path, title, version, org_id, grant_id
     parser.add_argument("-v", "--version", default="v1.0.0", help="Schema version")
     parser.add_argument("-o", "--org_id", default="mc2", help="Organization ID for $id field")
     parser.add_argument("-g", "--grant_id", help="Grant number to select conditions for from reference table")
+    parser.add_argument("-m", "--multi_condition", help="Generate schema with multiple conditions defined in the CSV", action="store_true", default=None)

     args = parser.parse_args()

     output_path = "".join([args.output_path, "/", args.org_id, ".", "AccessRequirement-", f"{args.grant_id}-" if args.grant_id else "Project-", args.version, "-schema.json"])
-    generate_json_schema(args.csv_path, output_path, args.title, args.version, args.org_id, grant_id = args.grant_id if args.grant_id else "Project")
+    generate_json_schema(args.csv_path, output_path, args.title, args.version, args.org_id, grant_id = args.grant_id if args.grant_id else "Project", multi_condition=args.multi_condition)

From 692c526171f96e31fd8eb48872b48cd5fd4f2788 Mon Sep 17 00:00:00 2001
From: Orion Banks <49208907+Bankso@users.noreply.github.com>
Date: Fri, 23 May 2025 12:40:48 -0700
Subject: [PATCH 09/36] Update generate_duo_schema.py

---
 utils/generate_duo_schema.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/utils/generate_duo_schema.py b/utils/generate_duo_schema.py
index e113024d..0581c862 100644
--- a/utils/generate_duo_schema.py
+++ b/utils/generate_duo_schema.py
@@ -56,7 +56,7 @@ def build_condition(row, col_names, multi_condition):

 def generate_json_schema(csv_path, output_path, title, version, org_id, grant_id, multi_condition):
     """
-    Generates a JSON Schema from a CSV file containing DUO-based access restrictions.
+    Generates a JSON Schema from a CSV file containing annotation-based access restrictions.
     """
     df = pd.read_csv(csv_path, header=0, dtype=str)

From 3cd8d35cf493f01acc9482442262e2ba3059873f Mon Sep 17 00:00:00 2001
From: Orion Banks <49208907+Bankso@users.noreply.github.com>
Date: Wed, 28 May 2025 16:55:14 -0700
Subject: [PATCH 10/36] Add study and grant options

Added inputs for study id and study col, to designate an additional Id
by which to filter ARs.
Adjusted existing inputs to have dedicated grant id and grant col.
Added logic to filter by study id if it is provided.
Added additional identifiers to schema Ids and output file names: study
id and "mc" to indicate a multi-condition schema.
---
 utils/generate_duo_schema.py | 19 ++++++++++++-------
 1 file changed, 12 insertions(+), 7 deletions(-)

diff --git a/utils/generate_duo_schema.py b/utils/generate_duo_schema.py
index 0581c862..72aba0ea 100644
--- a/utils/generate_duo_schema.py
+++ b/utils/generate_duo_schema.py
@@ -54,7 +54,7 @@ def build_condition(row, col_names, multi_condition):
     return condition


-def generate_json_schema(csv_path, output_path, title, version, org_id, grant_id, multi_condition):
+def generate_json_schema(csv_path, output_path, title, version, org_id, grant_id, multi_condition, study_id, grant_col, study_col):
     """
     Generates a JSON Schema from a CSV file containing annotation-based access restrictions.
     """
@@ -65,7 +65,9 @@ def generate_json_schema(csv_path, output_path, title, version, org_id, grant_id
     col_names = [col for col in col_names if col not in base_conditions]
     for _, row in df.iterrows():
-        if grant_id != "Project" and row["grantNumber"] != grant_id:
+        if grant_id != "Project" and row[grant_col] != grant_id:
             continue
+        if study_id is not None and row[study_col] != study_id:
+            continue
         condition = build_condition(row, col_names, multi_condition)
         conditions.append(condition)

     schema = OrderedDict({
         "$schema": "http://json-schema.org/draft-07/schema",
         "title": title,
-        "$id": f"{org_id}-{grant_id}-AccessRequirementSchema-{version}",
+        "$id": f"{org_id}-{grant_id}-{study_id + '-' if study_id else ''}{'mc-' if multi_condition is not None else ''}AccessRequirementSchema-{version}",
         "description": "Auto-generated schema defining DUO-based access restrictions.",
         "allOf": conditions
     })
@@ -91,10 +93,13 @@ def generate_json_schema(csv_path, output_path, title, version, org_id, grant_id
     parser.add_argument("-t", "--title", default="AccessRequirementSchema", help="Schema title")
     parser.add_argument("-v", "--version", default="v1.0.0", help="Schema version")
     parser.add_argument("-o", "--org_id", default="mc2", help="Organization ID for $id field")
-    parser.add_argument("-g", "--grant_id", help="Grant number to select conditions for from reference table")
-    parser.add_argument("-m", "--multi_condition", help="Generate schema with multiple conditions defined in the CSV", action="store_true", default=None)
+    parser.add_argument("-g", "--grant_id", help="Grant number to select conditions for from reference table. If nothing is provided, the JSON schema will include all conditions listed in the input table.", default="Project")
+    parser.add_argument("-m", "--multi_condition", help="Boolean. Generate schema with multiple conditions defined in the CSV", action="store_true", default=None)
+    parser.add_argument("-gc", "--grant_col", help="Name of the column in the DCC AR data dictionary that will contain the identifier for the grant", default="grantNumber")
+    parser.add_argument("-s", "--study_id", help="Study ID to select conditions for from reference table. If nothing is provided, the JSON schema will include all applicable studies listed in the input table.", default=None)
+    parser.add_argument("-sc", "--study_col", help="Name of the column in the DCC AR data dictionary that will contain the identifier for the study", default="studyKey")

     args = parser.parse_args()

-    output_path = "".join([args.output_path, "/", args.org_id, ".", "AccessRequirement-", f"{args.grant_id}-" if args.grant_id else "Project-", args.version, "-schema.json"])
-    generate_json_schema(args.csv_path, output_path, args.title, args.version, args.org_id, grant_id = args.grant_id if args.grant_id else "Project", multi_condition=args.multi_condition)
+    output_path = "".join([args.output_path, "/", args.org_id, ".", "AccessRequirement-", f"{args.grant_id}-" if args.grant_id else "Project-", args.version, f"-{args.study_id}" if args.study_id else "", "-mc." if args.multi_condition else "-", "schema.json"])
+    generate_json_schema(args.csv_path, output_path, args.title, args.version, args.org_id, grant_id = args.grant_id, multi_condition=args.multi_condition, study_id = args.study_id, grant_col=args.grant_col, study_col=args.study_col)
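
The effect of -m from PATCH 08 shows up in the "required" list of each generated "if"; a sketch with an invented dictionary row that has one activation attribute and one extra column:

    row = {"dataUseModifiers": "DUO:0000026", "activatedByAttribute": "userConsent",
           "activationValue": "True", "grantNumber": "CA274499"}
    for multi_condition in (None, True):
        required = ["dataUseModifiers"]
        if multi_condition is not None:
            required += [row["activatedByAttribute"], "grantNumber"]
        print(multi_condition, required)
    # None ['dataUseModifiers']
    # True ['dataUseModifiers', 'userConsent', 'grantNumber']
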
From 43ccc8cc3c6d41b014ea7126d9122312004d09ad Mon Sep 17 00:00:00 2001
From: Orion Banks <49208907+Bankso@users.noreply.github.com>
Date: Wed, 28 May 2025 17:41:23 -0700
Subject: [PATCH 11/36] Adjust output file naming

Make version the second to last argument, to simplify parsing
---
 utils/generate_duo_schema.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/utils/generate_duo_schema.py b/utils/generate_duo_schema.py
index 72aba0ea..418e8a8f 100644
--- a/utils/generate_duo_schema.py
+++ b/utils/generate_duo_schema.py
@@ -101,5 +101,5 @@ def generate_json_schema(csv_path, output_path, title, version, org_id, grant_id

     args = parser.parse_args()

-    output_path = "".join([args.output_path, "/", args.org_id, ".", "AccessRequirement-", f"{args.grant_id}-" if args.grant_id else "Project-", args.version, f"-{args.study_id}" if args.study_id else "", "-mc." if args.multi_condition else "-", "schema.json"])
+    output_path = "".join([args.output_path, "/", args.org_id, ".", "AccessRequirement-", f"{args.grant_id}-" if args.grant_id else "Project-", f"{args.study_id}-" if args.study_id else "", "mc-" if args.multi_condition else "", args.version, "-schema.json"])
     generate_json_schema(args.csv_path, output_path, args.title, args.version, args.org_id, grant_id = args.grant_id, multi_condition=args.multi_condition, study_id = args.study_id, grant_col=args.grant_col, study_col=args.study_col)

From b832cf1c48bf1c672ac86acddb656005558ce6e2 Mon Sep 17 00:00:00 2001
From: Orion Banks <49208907+Bankso@users.noreply.github.com>
Date: Wed, 28 May 2025 17:43:04 -0700
Subject: [PATCH 12/36] Account for version position in name

Adjusted slicing/list position references when parsing input URL/file
path to ensure all info is accurately captured
---
 utils/synapse_json_schema_bind.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/utils/synapse_json_schema_bind.py b/utils/synapse_json_schema_bind.py
index da94f4bc..9029f889 100644
--- a/utils/synapse_json_schema_bind.py
+++ b/utils/synapse_json_schema_bind.py
@@ -128,9 +128,10 @@ def get_schema_from_url(url: str, path: str) -> tuple[any, str, str, str]:

         schema_info = schema.split("/")[-1]
         base_component = schema_info.split(".")[1].split("-")[0]
+
         if base_component == "AccessRequirement":
-            component = "".join(schema_info.split("-")[0:2]).split(".")[1]
-            version = schema_info.split("-")[2]
+            component = "".join(schema_info.split("-")[0:-2]).split(".")[1]
+            version = schema_info.split("-")[-2]
         else:
             component = base_component
             version = schema_info.split("-")[1]
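
Now that names can carry study and "mc" segments, indexing from the end keeps the parse stable; a sketch with an invented filename:

    name = "mc2.AccessRequirement-CA274499-StudyA-mc-v1.0.0-schema.json"
    component = "".join(name.split("-")[0:-2]).split(".")[1]
    version = name.split("-")[-2]
    print(component, version)  # AccessRequirementCA274499StudyAmc v1.0.0
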
From 04dbb0073e9cc49ffb578dd7c933c62d500373dd Mon Sep 17 00:00:00 2001
From: Orion Banks <49208907+Bankso@users.noreply.github.com>
Date: Fri, 30 May 2025 17:38:22 -0700
Subject: [PATCH 13/36] Update synapse_json_schema_bind.py

Add option to identify that a schema has AR-related information
integrated, which will ensure "enableDerivedAnnotations" is used when
binding the JSON
---
 utils/synapse_json_schema_bind.py | 28 +++++++++++++++++-----------
 1 file changed, 17 insertions(+), 11 deletions(-)

diff --git a/utils/synapse_json_schema_bind.py b/utils/synapse_json_schema_bind.py
index 9029f889..dec8dd93 100644
--- a/utils/synapse_json_schema_bind.py
+++ b/utils/synapse_json_schema_bind.py
@@ -45,6 +45,13 @@ def get_args():
         help="The name of the organization with which the JSON schema should be associated.",
         required=False
     )
+    parser.add_argument(
+        "-ar",
+        action="store_true",
+        help="Indicates if the schema includes Access Requirement information.",
+        required=False,
+        default=None
+    )

     return parser.parse_args()
@@ -86,16 +93,12 @@ def register_json_schema(org, schema_type: str, schema_json: json, version: str,
     return uri


-def bind_schema_to_entity(syn, service, schema_uri: str, entity_id: str, component_type: str):
+def bind_schema_to_entity(syn, service, schema_uri: str, entity_id: str, component_type: str, includes_ar: bool):
     """Associate a registered JSON schema with a Synapse entity.
     For JSON schemas associated with DUO-based access restrictions, use the REST API and enable derived annotations,
     For non-AR schemas, use the python client bind_json_schema function"""

-    if component_type != "AccessRequirement":
-        print(f"Binding non-AR schema {schema_uri}")
-        service.bind_json_schema(schema_uri, entity_id)
-
-    else:
+    if component_type == "AccessRequirement" or includes_ar is not None:
         print(f"Binding AR schema {schema_uri}")
         request_body = {
             "entityId": entity_id,
@@ -105,7 +108,10 @@ def bind_schema_to_entity(syn, service, schema_uri: str, entity_id: str, compone
         syn.restPUT(
             f"/entity/{entity_id}/schema/binding", body=json.dumps(request_body)
         )
-
+
+    else:
+        print(f"Binding non-AR schema {schema_uri}")
+        service.bind_json_schema(schema_uri, entity_id)

 def get_schema_from_url(url: str, path: str) -> tuple[any, str, str, str]:
     """Access a JSON schema via a provided path or URL.
@@ -141,7 +147,7 @@ def get_schema_from_url(url: str, path: str) -> tuple[any, str, str, str]:
     return schema_json, component, base_component, version


-def get_register_bind_schema(syn, target: str, schema_org_name: str, org, service, path, url):
+def get_register_bind_schema(syn, target: str, schema_org_name: str, org, service, path, url, includes_ar: bool):
     """Access JSON from URL, register the JSON schema,
     and bind the schema to the target entity."""

     schema_json, component_adjusted, base_component, version = get_schema_from_url(url, path)
@@ -149,7 +155,7 @@ def get_register_bind_schema(syn, target: str, schema_org_name: str, org, servic

     uri = register_json_schema(org, component_adjusted, schema_json, version, schema_org_name)

-    bind_schema_to_entity(syn, service, uri, target, base_component)
+    bind_schema_to_entity(syn, service, uri, target, base_component, includes_ar)

     print(f"\nSchema {component_adjusted} {version} successfully bound to entity {target}")
@@ -159,7 +165,7 @@ def main():

     args = get_args()

-    target, url, path, org_name = args.t, args.l, args.p, args.n
+    target, url, path, org_name, includes_ar = args.t, args.l, args.p, args.n, args.ar

     syn.get_available_services()
@@ -168,7 +174,7 @@ def main():
     service, org, schema_org_name = get_schema_organization(schema_service, org_name)

     if target:
-        get_register_bind_schema(syn, target, schema_org_name, org, service, path, url)
+        get_register_bind_schema(syn, target, schema_org_name, org, service, path, url, includes_ar)
     else:
         print(f"\n❗❗❗ No dataset information provided.❗❗❗\nPlease check your command line inputs and try again.")

From 121b4c86a30ed46438c7c35cd0409b4d511abd19 Mon Sep 17 00:00:00 2001
From: Orion Banks <49208907+Bankso@users.noreply.github.com>
Date: Tue, 3 Jun 2025 14:51:51 -0700
Subject: [PATCH 14/36] Update synapse_json_schema_bind.py

---
 utils/synapse_json_schema_bind.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/utils/synapse_json_schema_bind.py b/utils/synapse_json_schema_bind.py
index dec8dd93..d9a64c9a 100644
--- a/utils/synapse_json_schema_bind.py
+++ b/utils/synapse_json_schema_bind.py
@@ -2,7 +2,7 @@

 This script will create and bind a JSON schema to an entity

-Usage: python synapse_json_schema_bind.py -t [Entity Synapse Id] -c [Schema data type] -v [Data model release version] -g [Grant number in CA format] -s [Path to CSV for schema binding in bulk]
+Usage: python synapse_json_schema_bind.py -t [Entity Synapse Id] -l [JSON Schema URL] -p [JSON Schema File Path] -n [Organization Name] -ar [Access Requirement Flag]

 author: orion.banks
 """
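
Concretely, the updated usage line corresponds to runs like this one (the Synapse id and file path are invented for illustration):

    python synapse_json_schema_bind.py -t syn12345678 \
        -p json_schemas/mc2.AccessRequirement-CA274499-v1.0.0-schema.json -ar
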
From f932b7f9c3ac7eea5e2f3abe7231aa12a6584422 Mon Sep 17 00:00:00 2001
From: Orion Banks <49208907+Bankso@users.noreply.github.com>
Date: Tue, 3 Jun 2025 14:52:59 -0700
Subject: [PATCH 15/36] Add flag to select based on data type

Select ARs based on data type.
Add data type designation to file name and schema id if provided.
---
 utils/generate_duo_schema.py | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/utils/generate_duo_schema.py b/utils/generate_duo_schema.py
index 418e8a8f..4da07b0f 100644
--- a/utils/generate_duo_schema.py
+++ b/utils/generate_duo_schema.py
@@ -54,7 +54,7 @@ def build_condition(row, col_names, multi_condition):
     return condition


-def generate_json_schema(csv_path, output_path, title, version, org_id, grant_id, multi_condition, study_id, grant_col, study_col):
+def generate_json_schema(csv_path, output_path, title, version, org_id, grant_id, multi_condition, study_id, grant_col, study_col, data_type, data_col):
     """
     Generates a JSON Schema from a CSV file containing annotation-based access restrictions.
     """
@@ -69,13 +69,15 @@ def generate_json_schema(csv_path, output_path, title, version, org_id, grant_id
             continue
         if study_id is not None and row[study_col] != study_id:
             continue
+        if data_type is not None and row[data_col] != data_type:
+            continue
         condition = build_condition(row, col_names, multi_condition)
         conditions.append(condition)

     schema = OrderedDict({
         "$schema": "http://json-schema.org/draft-07/schema",
         "title": title,
-        "$id": f"{org_id}-{grant_id}-{study_id + '-' if study_id else ''}{'mc-' if multi_condition is not None else ''}AccessRequirementSchema-{version}",
+        "$id": f"{org_id}-{grant_id}-{study_id + '-' if study_id is not None else ''}{data_type + '-' if data_type is not None else ''}{'mc-' if multi_condition is not None else ''}AccessRequirementSchema-{version}",
         "description": "Auto-generated schema defining DUO-based access restrictions.",
         "allOf": conditions
     })
@@ -98,8 +100,10 @@ def generate_json_schema(csv_path, output_path, title, version, org_id, grant_id
     parser.add_argument("-gc", "--grant_col", help="Name of the column in the DCC AR data dictionary that will contain the identifier for the grant", default="grantNumber")
     parser.add_argument("-s", "--study_id", help="Study ID to select conditions for from reference table. If nothing is provided, the JSON schema will include all applicable studies listed in the input table.", default=None)
     parser.add_argument("-sc", "--study_col", help="Name of the column in the DCC AR data dictionary that will contain the identifier for the study", default="studyKey")
+    parser.add_argument("-dt", "--data_type", help="Data type to select conditions for from reference table. If nothing is provided, the JSON schema will include all applicable data types listed in the input table.", default=None)
+    parser.add_argument("-dc", "--data_col", help="Name of the column in the DCC AR data dictionary that will contain the identifier for the data type", default="dataType")

     args = parser.parse_args()

-    output_path = "".join([args.output_path, "/", args.org_id, ".", "AccessRequirement-", f"{args.grant_id}-" if args.grant_id else "Project-", f"{args.study_id}-" if args.study_id else "", "mc-" if args.multi_condition else "", args.version, "-schema.json"])
-    generate_json_schema(args.csv_path, output_path, args.title, args.version, args.org_id, grant_id = args.grant_id, multi_condition=args.multi_condition, study_id = args.study_id, grant_col=args.grant_col, study_col=args.study_col)
+    output_path = "".join([args.output_path, "/", args.org_id, ".", "AccessRequirement-", f"{args.grant_id}-" if args.grant_id else "Project-", f"{args.study_id}-" if args.study_id else "", f"{args.data_type}-" if args.data_type else "", "mc-" if args.multi_condition else "", args.version, "-schema.json"])
+    generate_json_schema(args.csv_path, output_path, args.title, args.version, args.org_id, grant_id = args.grant_id, multi_condition=args.multi_condition, study_id = args.study_id, grant_col=args.grant_col, study_col=args.study_col, data_type = args.data_type, data_col=args.data_col)

From 27cd48301b1afa7c9e4b46525ac29f6d96dbe645 Mon Sep 17 00:00:00 2001
From: Orion Banks <49208907+Bankso@users.noreply.github.com>
Date: Tue, 3 Jun 2025 15:09:34 -0700
Subject: [PATCH 16/36] Add flag to select by species

Add option to provide species when selecting conditions from input
table.
Add filtering conditions and integrate into file name + schema id
---
 utils/generate_duo_schema.py | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/utils/generate_duo_schema.py b/utils/generate_duo_schema.py
index 4da07b0f..f9d5720b 100644
--- a/utils/generate_duo_schema.py
+++ b/utils/generate_duo_schema.py
@@ -54,7 +54,7 @@ def build_condition(row, col_names, multi_condition):
     return condition


-def generate_json_schema(csv_path, output_path, title, version, org_id, grant_id, multi_condition, study_id, grant_col, study_col, data_type, data_col):
+def generate_json_schema(csv_path, output_path, title, version, org_id, grant_id, multi_condition, study_id, grant_col, study_col, data_type, data_col, species_type, species_col):
     """
     Generates a JSON Schema from a CSV file containing annotation-based access restrictions.
     """
@@ -71,13 +71,15 @@ def generate_json_schema(csv_path, output_path, title, version, org_id, grant_id
             continue
         if data_type is not None and row[data_col] != data_type:
             continue
+        if species_type is not None and row[species_col] != species_type:
+            continue
         condition = build_condition(row, col_names, multi_condition)
         conditions.append(condition)

     schema = OrderedDict({
         "$schema": "http://json-schema.org/draft-07/schema",
         "title": title,
-        "$id": f"{org_id}-{grant_id}-{study_id + '-' if study_id is not None else ''}{data_type + '-' if data_type is not None else ''}{'mc-' if multi_condition is not None else ''}AccessRequirementSchema-{version}",
+        "$id": f"{org_id}-{grant_id}-{study_id + '-' if study_id is not None else ''}{data_type + '-' if data_type is not None else ''}{species_type + '-' if species_type is not None else ''}{'mc-' if multi_condition is not None else ''}AccessRequirementSchema-{version}",
         "description": "Auto-generated schema defining DUO-based access restrictions.",
         "allOf": conditions
     })
@@ -100,10 +102,12 @@ def generate_json_schema(csv_path, output_path, title, version, org_id, grant_id
     parser.add_argument("-gc", "--grant_col", help="Name of the column in the DCC AR data dictionary that will contain the identifier for the grant", default="grantNumber")
     parser.add_argument("-s", "--study_id", help="Study ID to select conditions for from reference table. If nothing is provided, the JSON schema will include all applicable studies listed in the input table.", default=None)
     parser.add_argument("-sc", "--study_col", help="Name of the column in the DCC AR data dictionary that will contain the identifier for the study", default="studyKey")
-    parser.add_argument("-dt", "--data_type", help="Data type to select conditions for from reference table. If nothing is provided, the JSON schema will include all applicable data types listed in the input table.", default=None)
+    parser.add_argument("-d", "--data_type", help="Data type to select conditions for from reference table. If nothing is provided, the JSON schema will include all applicable data types listed in the input table.", default=None)
     parser.add_argument("-dc", "--data_col", help="Name of the column in the DCC AR data dictionary that will contain the identifier for the data type", default="dataType")
+    parser.add_argument("-p", "--species_type", help="Species to select conditions for from reference table. If nothing is provided, the JSON schema will include all applicable species listed in the input table.", default=None)
+    parser.add_argument("-pc", "--species_col", help="Name of the column in the DCC AR data dictionary that will contain the identifier for the species", default="speciesType")

     args = parser.parse_args()

-    output_path = "".join([args.output_path, "/", args.org_id, ".", "AccessRequirement-", f"{args.grant_id}-" if args.grant_id else "Project-", f"{args.study_id}-" if args.study_id else "", f"{args.data_type}-" if args.data_type else "", "mc-" if args.multi_condition else "", args.version, "-schema.json"])
-    generate_json_schema(args.csv_path, output_path, args.title, args.version, args.org_id, grant_id = args.grant_id, multi_condition=args.multi_condition, study_id = args.study_id, grant_col=args.grant_col, study_col=args.study_col, data_type = args.data_type, data_col=args.data_col)
+    output_path = "".join([args.output_path, "/", args.org_id, ".", "AccessRequirement-", f"{args.grant_id}-" if args.grant_id else "Project-", f"{args.study_id}-" if args.study_id else "", f"{args.data_type}-" if args.data_type else "", f"{args.species_type}-" if args.species_type else "", "mc-" if args.multi_condition else "", args.version, "-schema.json"])
+    generate_json_schema(args.csv_path, output_path, args.title, args.version, args.org_id, grant_id = args.grant_id, multi_condition=args.multi_condition, study_id = args.study_id, grant_col=args.grant_col, study_col=args.study_col, data_type = args.data_type, data_col=args.data_col, species_type = args.species_type, species_col=args.species_col)
From 644946f7fefd462307b19dbddc50af679c1e408f Mon Sep 17 00:00:00 2001
From: Orion Banks <49208907+Bankso@users.noreply.github.com>
Date: Tue, 3 Jun 2025 15:58:54 -0700
Subject: [PATCH 17/36] Make default org generic

---
 utils/generate_duo_schema.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/utils/generate_duo_schema.py b/utils/generate_duo_schema.py
index f9d5720b..1581b7a4 100644
--- a/utils/generate_duo_schema.py
+++ b/utils/generate_duo_schema.py
@@ -96,7 +96,7 @@ def generate_json_schema(csv_path, output_path, title, version, org_id, grant_id
     parser.add_argument("output_path", help="Path to output directory for the JSON schema")
     parser.add_argument("-t", "--title", default="AccessRequirementSchema", help="Schema title")
     parser.add_argument("-v", "--version", default="v1.0.0", help="Schema version")
-    parser.add_argument("-o", "--org_id", default="mc2", help="Organization ID for $id field")
+    parser.add_argument("-o", "--org_id", default="Project", help="Organization ID for $id field")
     parser.add_argument("-g", "--grant_id", help="Grant number to select conditions for from reference table. If nothing is provided, the JSON schema will include all conditions listed in the input table.", default="Project")
     parser.add_argument("-m", "--multi_condition", help="Boolean. Generate schema with multiple conditions defined in the CSV", action="store_true", default=None)
     parser.add_argument("-gc", "--grant_col", help="Name of the column in the DCC AR data dictionary that will contain the identifier for the grant", default="grantNumber")

From 66af239fe27a2ad264770445eb5ed07946d5304f Mon Sep 17 00:00:00 2001
From: Orion Banks <49208907+Bankso@users.noreply.github.com>
Date: Tue, 3 Jun 2025 15:59:15 -0700
Subject: [PATCH 18/36] Add no bind flag

Supports option to not bind schema
---
 utils/synapse_json_schema_bind.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/utils/synapse_json_schema_bind.py b/utils/synapse_json_schema_bind.py
index d9a64c9a..9c5f3a8f 100644
--- a/utils/synapse_json_schema_bind.py
+++ b/utils/synapse_json_schema_bind.py
@@ -22,7 +22,7 @@ def get_args():
         type=str,
         default=None,
         help="Synapse Id of an entity to which a schema will be bound.",
-        required=True
+        required=False
     )
     parser.add_argument(
         "-l",
@@ -52,6 +52,13 @@ def get_args():
         required=False,
         default=None
     )
+    parser.add_argument(
+        "--no_bind",
+        action="store_true",
+        help="Indicates the schema should not be bound to the entity.",
+        required=False,
+        default=None
+    )

     return parser.parse_args()
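
Combined with the binding logic that lands in the next patch, this flag enables a registration-only workflow, for example (invented path):

    python synapse_json_schema_bind.py --no_bind \
        -p json_schemas/mc2.DatasetView-v1.0.0-schema.json

The registered schema's printed id can then be referenced from other Synapse JSON schemas instead of being bound to an entity directly.
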
{target}") - + if no_bind is None: + bind_schema_to_entity(syn, service, uri, target, base_component, includes_ar) + print(f"\nSchema {component_adjusted} {version} successfully bound to entity {target}") + def main(): @@ -172,7 +175,10 @@ def main(): args = get_args() - target, url, path, org_name, includes_ar = args.t, args.l, args.p, args.n, args.ar + target, url, path, org_name, includes_ar, no_bind = args.t, args.l, args.p, args.n, args.ar, args.no_bind + + if no_bind is not None: + print(f"Warning ❗❗❗ Schema will not be bound to the entity if one was provided.") syn.get_available_services() @@ -180,9 +186,9 @@ def main(): service, org, schema_org_name = get_schema_organization(schema_service, org_name) - if target: - get_register_bind_schema(syn, target, schema_org_name, org, service, path, url, includes_ar) - else: + get_register_bind_schema(syn, target, schema_org_name, org, service, path, url, includes_ar, no_bind) + + if target is None and no_bind is None: print(f"\n❗❗❗ No dataset information provided.❗❗❗\nPlease check your command line inputs and try again.") if __name__ == "__main__": From 98bfafdc5fc652ca1a075b9dc67daef4046bd774 Mon Sep 17 00:00:00 2001 From: Orion Banks <49208907+Bankso@users.noreply.github.com> Date: Tue, 3 Jun 2025 16:35:20 -0700 Subject: [PATCH 20/36] Make default organization name generic --- utils/synapse_json_schema_bind.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/utils/synapse_json_schema_bind.py b/utils/synapse_json_schema_bind.py index 2db3a8f7..08bae5f0 100644 --- a/utils/synapse_json_schema_bind.py +++ b/utils/synapse_json_schema_bind.py @@ -41,8 +41,8 @@ def get_args(): parser.add_argument( "-n", type=str, - default="Multi Consortia Coordinating Center", - help="The name of the organization with which the JSON schema should be associated.", + default="Example Organization", + help="The name of the organization with which the JSON schema should be associated. Default: 'Example Organization'.", required=False ) parser.add_argument( From 31c16a65f37d13804e1f905d0afad55e0871ab52 Mon Sep 17 00:00:00 2001 From: Orion Banks <49208907+Bankso@users.noreply.github.com> Date: Mon, 9 Jun 2025 16:14:53 -0700 Subject: [PATCH 21/36] Add flag to select AR ID --- utils/generate_duo_schema.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/utils/generate_duo_schema.py b/utils/generate_duo_schema.py index 1581b7a4..0385058c 100644 --- a/utils/generate_duo_schema.py +++ b/utils/generate_duo_schema.py @@ -54,7 +54,7 @@ def build_condition(row, col_names, multi_condition): return condition -def generate_json_schema(csv_path, output_path, title, version, org_id, grant_id, multi_condition, study_id, grant_col, study_col, data_type, data_col, species_type, species_col): +def generate_json_schema(csv_path, output_path, title, version, org_id, grant_id, multi_condition, study_id, grant_col, study_col, data_type, data_col, species_type, species_col, access_requirement): """ Generates a JSON Schema from a CSV file containing annotation-based access restrictions. 
""" @@ -65,6 +65,8 @@ def generate_json_schema(csv_path, output_path, title, version, org_id, grant_id col_names = df.columns.tolist() col_names = [col for col in col_names if col not in base_conditions] for _, row in df.iterrows(): + if access_requirement is not None and row["accessRequirementId"] != access_requirement: + continue if grant_id != "Project" and row[grant_col] != grant_id: continue if study_id is not None and row[study_col] != study_id: @@ -97,6 +99,7 @@ def generate_json_schema(csv_path, output_path, title, version, org_id, grant_id parser.add_argument("-t", "--title", default="AccessRequirementSchema", help="Schema title") parser.add_argument("-v", "--version", default="v1.0.0", help="Schema version") parser.add_argument("-o", "--org_id", default="Project", help="Organization ID for $id field") + parser.add_argument("-a", "--access_requirement", default=None, help="Access requirement ID to select conditions for from reference table. If nothing is provided, the JSON schema will include all applicable conditions listed in the input table.") parser.add_argument("-g", "--grant_id", help="Grant number to select conditions for from reference table. If nothing is provided, the JSON schema will include all conditions listed in the input table.", default="Project") parser.add_argument("-m", "--multi_condition", help="Boolean. Generate schema with multiple conditions defined in the CSV", action="store_true", default=None) parser.add_argument("-gc", "--grant_col", help="Name of the column in the DCC AR data dictionary that will contain the identifier for the grant", default="grantNumber") @@ -109,5 +112,5 @@ def generate_json_schema(csv_path, output_path, title, version, org_id, grant_id args = parser.parse_args() - output_path = "".join([args.output_path, "/", args.org_id, ".", "AccessRequirement-", f"{args.grant_id}-" if args.grant_id else "Project-", f"{args.study_id}-" if args.study_id else "", f"{args.data_type}-" if args.data_type else "", f"{args.species_type}-" if args.species_type else "", "mc-" if args.multi_condition else "", args.version, "-schema.json"]) - generate_json_schema(args.csv_path, output_path, args.title, args.version, args.org_id, grant_id = args.grant_id, multi_condition=args.multi_condition, study_id = args.study_id, grant_col=args.grant_col, study_col=args.study_col, data_type = args.data_type, data_col=args.data_col, species_type = args.species_type, species_col=args.species_col) + output_path = "".join([args.output_path, "/", args.org_id, ".", "AccessRequirement-", f"{args.grant_id}-" if args.grant_id else "Project-", f"{args.study_id}-" if args.study_id else "", f"{args.data_type}-" if args.data_type else "", f"{args.species_type}-" if args.species_type else "", "mc-" if args.multi_condition else "", f"{args.access_requirement}-" if args.access_requirement else "", args.version, "-schema.json"]) + generate_json_schema(args.csv_path, output_path, args.title, args.version, args.org_id, grant_id = args.grant_id, multi_condition=args.multi_condition, study_id = args.study_id, grant_col=args.grant_col, study_col=args.study_col, data_type = args.data_type, data_col=args.data_col, species_type = args.species_type, species_col=args.species_col, access_requirement=args.access_requirement) From dd3d9af9a25c7103701f91c21cd4f57b16da1cd3 Mon Sep 17 00:00:00 2001 From: Orion Banks <49208907+Bankso@users.noreply.github.com> Date: Mon, 9 Jun 2025 17:00:11 -0700 Subject: [PATCH 22/36] Add AR ID to id field --- utils/generate_duo_schema.py | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/utils/generate_duo_schema.py b/utils/generate_duo_schema.py index 0385058c..d2e38855 100644 --- a/utils/generate_duo_schema.py +++ b/utils/generate_duo_schema.py @@ -81,7 +81,7 @@ def generate_json_schema(csv_path, output_path, title, version, org_id, grant_id schema = OrderedDict({ "$schema": "http://json-schema.org/draft-07/schema", "title": title, - "$id": f"{org_id}-{grant_id}-{study_id + '-' if study_id is not None else ''}{data_type + '-' if data_type is not None else ''}{species_type + '-' if species_type is not None else ''}{'mc-' if multi_condition is not None else ''}AccessRequirementSchema-{version}", + "$id": f"{org_id}-{grant_id}-{study_id + '-' if study_id is not None else ''}{data_type + '-' if data_type is not None else ''}{species_type + '-' if species_type is not None else ''}{'mc-' if multi_condition is not None else ''}AccessRequirementSchema-{access_requirement + '-' if access_requirement is not None else ''}{version}", "description": "Auto-generated schema defining DUO-based access restrictions.", "allOf": conditions }) From 9de0769f5c9b597106028b0d17842fec699b3f76 Mon Sep 17 00:00:00 2001 From: Orion Banks <49208907+Bankso@users.noreply.github.com> Date: Tue, 10 Jun 2025 12:35:17 -0700 Subject: [PATCH 23/36] Update JSON description format and arg help --- utils/generate_duo_schema.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/utils/generate_duo_schema.py b/utils/generate_duo_schema.py index d2e38855..2035120e 100644 --- a/utils/generate_duo_schema.py +++ b/utils/generate_duo_schema.py @@ -82,7 +82,7 @@ def generate_json_schema(csv_path, output_path, title, version, org_id, grant_id "$schema": "http://json-schema.org/draft-07/schema", "title": title, "$id": f"{org_id}-{grant_id}-{study_id + '-' if study_id is not None else ''}{data_type + '-' if data_type is not None else ''}{species_type + '-' if species_type is not None else ''}{'mc-' if multi_condition is not None else ''}AccessRequirementSchema-{access_requirement + '-' if access_requirement is not None else ''}{version}", - "description": "Auto-generated schema defining DUO-based access restrictions.", + "description": f"Auto-generated schema that defines access requirements for biomedical data. Organization: {org_id}, Grant number or Project designation: {grant_id}, Study ID: {study_id if study_id else 'N/A'}, Data Type: {data_type if data_type else 'N/A'}, Species Type: {species_type if species_type else 'N/A'}, Multi-condition: {'Yes' if multi_condition else 'No'}, Selected Access Requirement ID: {access_requirement if access_requirement else 'N/A'}", "allOf": conditions }) @@ -93,8 +93,8 @@ def generate_json_schema(csv_path, output_path, title, version, org_id, grant_id if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Generate DUO JSON Schema from Data Dictionary CSV") - parser.add_argument("csv_path", help="Path to the data_dictionary.csv") + parser = argparse.ArgumentParser(description="Generate Access Requirement JSON Schema from Data Dictionary CSV") + parser.add_argument("csv_path", help="Path to the data_dictionary.csv. 
See an example at https://github.com/Sage-Bionetworks/governanceDUO/blob/main/access_requirement_JSON/example_annotation_AR_reference.csv")
     parser.add_argument("output_path", help="Path to output directory for the JSON schema")
     parser.add_argument("-t", "--title", default="AccessRequirementSchema", help="Schema title")
     parser.add_argument("-v", "--version", default="v1.0.0", help="Schema version")
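As a worked example of the $id template above (all argument values are hypothetical), invoking

    python utils/generate_duo_schema.py data_dictionary.csv out -o DCC -g CA123456 -a 12345

would emit a schema whose "$id" resolves, per the f-string template, to

    DCC-CA123456-AccessRequirementSchema-12345-v1.0.0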

From 8f81dbfc25b343ab0736edc81f101786e3ab5302 Mon Sep 17 00:00:00 2001
From: Orion Banks <49208907+Bankso@users.noreply.github.com>
Date: Tue, 10 Jun 2025 12:35:33 -0700
Subject: [PATCH 24/36] Update doc strings

---
 utils/synapse_json_schema_bind.py | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/utils/synapse_json_schema_bind.py b/utils/synapse_json_schema_bind.py
index 08bae5f0..4f7fc813 100644
--- a/utils/synapse_json_schema_bind.py
+++ b/utils/synapse_json_schema_bind.py
@@ -2,7 +2,13 @@
 
 This script will create and bind a JSON schema to an entity
 
-Usage: python synapse_json_schema_bind.py -t [Entity Synapse Id] -l [JSON Schema URL] -p [JSON Schema File Path] -n [Organization Name] -ar [Access Requirement Flag]
+Usage: python synapse_json_schema_bind.py -t [Entity Synapse Id] -l [JSON Schema URL] -p [JSON Schema File Path] -n [Organization Name] -ar --no_bind
+-t Synapse Id of an entity to which a schema will be bound.
+-l URL for the JSON schema to be bound to the requested entity.
+-p File path for the JSON schema to be bound to the requested entity.
+-n Name of the organization with which the JSON schema should be associated. Default: 'Example Organization'.
+-ar Indicates if the schema includes Access Requirement information.
+--no_bind Indicates the schema should not be bound to the entity.
 
 author: orion.banks
 """
@@ -63,7 +69,7 @@ def get_args():
 
 
 def get_schema_organization(service, org_name: str) -> tuple:
-    """Create or access the MC2 Center Synapse organization,
+    """Create or access the named Synapse organization,
     return a tuple of schema service object, organization object, and organization name"""
 
     print(f"Creating organization: {org_name}")
@@ -82,7 +88,7 @@ def register_json_schema(org, schema_type: str, schema_json: json, version: str,
     """Register or access a previously registered JSON schema and return the uri.
     If the schema was previously registered, the constructed uri will be returned.
     uri format: [schema_org_name]-[schema_type]-[num_version]
-    Example uri: MultiConsortiaCoordinatingCenter-CA987654AccessRequirement-2.0.0
+    Example uri: ExampleOrganization-CA987654AccessRequirement-2.0.0
     """

From c55a78e9b9d6b6d8bd2f6df010aa10c1481d9579 Mon Sep 17 00:00:00 2001
From: Orion Banks <49208907+Bankso@users.noreply.github.com>
Date: Fri, 11 Jul 2025 12:12:40 -0700
Subject: [PATCH 25/36] Update .gitignore

---
 .gitignore | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.gitignore b/.gitignore
index b17ddd0b..6fd6833b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -141,3 +141,4 @@ annotations/inputs/*
 annotations/outputs/*
 annotations/output/*
 mapped_metadata.csv
+utils/example_files/*

From 11da96e20b311efef4ba26c15f11d31bc0e5063b Mon Sep 17 00:00:00 2001
From: Orion Banks <49208907+Bankso@users.noreply.github.com>
Date: Tue, 9 Sep 2025 14:23:29 -0700
Subject: [PATCH 26/36] Update example CSV URL

---
 utils/generate_duo_schema.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/utils/generate_duo_schema.py b/utils/generate_duo_schema.py
index 2035120e..6213f8ce 100644
--- a/utils/generate_duo_schema.py
+++ b/utils/generate_duo_schema.py
@@ -94,7 +94,7 @@
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description="Generate Access Requirement JSON Schema from Data Dictionary CSV")
-    parser.add_argument("csv_path", help="Path to the data_dictionary.csv. See an example at https://github.com/Sage-Bionetworks/governanceDUO/blob/main/access_requirement_JSON/example_annotation_AR_reference.csv")
+    parser.add_argument("csv_path", help="Path to the data_dictionary.csv. See an example at https://github.com/Sage-Bionetworks/governanceDUO/blob/main/access_requirement_JSON/README.md")
     parser.add_argument("output_path", help="Path to output directory for the JSON schema")

From 63ff8fea263638c30720b0fc70537d78dca0993b Mon Sep 17 00:00:00 2001
From: Orion Banks <49208907+Bankso@users.noreply.github.com>
Date: Tue, 9 Sep 2025 14:56:36 -0700
Subject: [PATCH 27/36] Make default org tag DCC

Improves naming convention clarity
---
 utils/generate_duo_schema.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/utils/generate_duo_schema.py b/utils/generate_duo_schema.py
index 6213f8ce..bdf14566 100644
--- a/utils/generate_duo_schema.py
+++ b/utils/generate_duo_schema.py
@@ -98,7 +98,7 @@
     parser.add_argument("output_path", help="Path to output directory for the JSON schema")
     parser.add_argument("-t", "--title", default="AccessRequirementSchema", help="Schema title")
     parser.add_argument("-v", "--version", default="v1.0.0", help="Schema version")
-    parser.add_argument("-o", "--org_id", default="Project", help="Organization ID for $id field")
+    parser.add_argument("-o", "--org_id", default="DCC", help="Organization ID for $id field")
     parser.add_argument("-a", "--access_requirement", default=None, help="Access requirement ID to select conditions for from reference table. If nothing is provided, the JSON schema will include all applicable conditions listed in the input table.")
     parser.add_argument("-g", "--grant_id", help="Grant number to select conditions for from reference table. 
If nothing is provided, the JSON schema will include all conditions listed in the input table.", default="Project") parser.add_argument("-m", "--multi_condition", help="Boolean. Generate schema with multiple conditions defined in the CSV", action="store_true", default=None) From 01b3a0e47b5332f37f2b4beda4bfd6beffaa8918 Mon Sep 17 00:00:00 2001 From: Orion Banks <49208907+Bankso@users.noreply.github.com> Date: Tue, 9 Sep 2025 14:57:41 -0700 Subject: [PATCH 28/36] Get args before synapse login When using --help flag, placing get_args first ensures the help message is printed and the script is stopped before logging into synapse --- utils/synapse_json_schema_bind.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/utils/synapse_json_schema_bind.py b/utils/synapse_json_schema_bind.py index 4f7fc813..db61884c 100644 --- a/utils/synapse_json_schema_bind.py +++ b/utils/synapse_json_schema_bind.py @@ -177,9 +177,9 @@ def get_register_bind_schema(syn, target: str, schema_org_name: str, org, servic def main(): - syn = synapseclient.login() - args = get_args() + + syn = synapseclient.login() target, url, path, org_name, includes_ar, no_bind = args.t, args.l, args.p, args.n, args.ar, args.no_bind From 74a364e3069e68f0c3d05a452eefdc3eab389ccd Mon Sep 17 00:00:00 2001 From: Orion Banks <49208907+Bankso@users.noreply.github.com> Date: Wed, 17 Sep 2025 14:33:12 -0700 Subject: [PATCH 29/36] Update input handling and printing Ensure schema bind functions are not run if no target synId is provided Update print functions to improve readability --- utils/synapse_json_schema_bind.py | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/utils/synapse_json_schema_bind.py b/utils/synapse_json_schema_bind.py index db61884c..ce8dcd08 100644 --- a/utils/synapse_json_schema_bind.py +++ b/utils/synapse_json_schema_bind.py @@ -78,7 +78,7 @@ def get_schema_organization(service, org_name: str) -> tuple: schema_org = service.JsonSchemaOrganization(name = org_name) schema_org.create() except synapseclient.core.exceptions.SynapseHTTPError: - print(f"Organization {org_name} already exists, getting info now...") + print(f"\nOrganization {org_name} already exists, getting info now...") schema_org = service.get_organization(organization_name = org_name) return service, schema_org, org_name @@ -100,10 +100,10 @@ def register_json_schema(org, schema_type: str, schema_json: json, version: str, uri = schema.uri print(f"JSON schema {uri} was successfully registered.") except synapseclient.core.exceptions.SynapseHTTPError as error: - print(error) - print(f"JSON schema {uri} was previously registered and will not be updated.") + print("\n" + error) + print(f"JSON schema {uri} was previously registered and will not be updated.\n") - print(f"\nSchema is available at https://repo-prod.prod.sagebase.org/repo/v1/schema/type/registered/{uri}\nThe schema can be referenced using the id: {uri}\n") + print(f"\nSchema is available at https://repo-prod.prod.sagebase.org/repo/v1/schema/type/registered/{uri}\nThe schema can be referenced using the id: {uri}") return uri @@ -114,7 +114,7 @@ def bind_schema_to_entity(syn, service, schema_uri: str, entity_id: str, compone For non-AR schemas, use the python client bind_json_schema function""" if component_type == "AccessRequirement" or includes_ar is not None: - print(f"Binding AR schema {schema_uri}") + print(f"\nBinding AR schema {schema_uri}") request_body = { "entityId": entity_id, "schema$id": schema_uri, @@ -125,7 +125,7 @@ def 
bind_schema_to_entity(syn, service, schema_uri: str, entity_id: str, compone ) else: - print(f"Binding non-AR schema {schema_uri}") + print(f"\nBinding non-AR schema {schema_uri}") service.bind_json_schema(schema_uri, entity_id) def get_schema_from_url(url: str, path: str) -> tuple[any, str, str, str]: @@ -157,7 +157,7 @@ def get_schema_from_url(url: str, path: str) -> tuple[any, str, str, str]: component = base_component version = schema_info.split("-")[1] - print(f"JSON schema {component} {version} successfully acquired from repository") + print(f"\nJSON schema {component} {version} successfully acquired from repository") return schema_json, component, base_component, version @@ -166,13 +166,17 @@ def get_register_bind_schema(syn, target: str, schema_org_name: str, org, servic """Access JSON from URL, register the JSON schema, and bind the schema to the target entity.""" schema_json, component_adjusted, base_component, version = get_schema_from_url(url, path) - print(f"Registering JSON schema {component_adjusted} {version}") + print(f"\nRegistering JSON schema {component_adjusted} {version}\n") uri = register_json_schema(org, component_adjusted, schema_json, version, schema_org_name) - if no_bind is None: + if no_bind is None and target is not None: bind_schema_to_entity(syn, service, uri, target, base_component, includes_ar) print(f"\nSchema {component_adjusted} {version} successfully bound to entity {target}") + else: + print("\nSchema was not bound to an entity.") + + print("\nDONE ✅") def main(): @@ -184,7 +188,10 @@ def main(): target, url, path, org_name, includes_ar, no_bind = args.t, args.l, args.p, args.n, args.ar, args.no_bind if no_bind is not None: - print(f"Warning ❗❗❗ Schema will not be bound to the entity if one was provided.") + print(f"Warning ❗❗❗ Schema will not be bound to the entity if one was provided.\n") + + if target is None: + print(f"Warning ❗❗❗ No entity id provided. 
Schema will only be registered.\n") syn.get_available_services() @@ -193,9 +200,6 @@ def main(): service, org, schema_org_name = get_schema_organization(schema_service, org_name) get_register_bind_schema(syn, target, schema_org_name, org, service, path, url, includes_ar, no_bind) - - if target is None and no_bind is None: - print(f"\n❗❗❗ No dataset information provided.❗❗❗\nPlease check your command line inputs and try again.") if __name__ == "__main__": main() From d6d2fc4a1195953d075fb71c9b45a3e7f547109b Mon Sep 17 00:00:00 2001 From: Orion Banks <49208907+Bankso@users.noreply.github.com> Date: Thu, 18 Sep 2025 15:22:29 -0700 Subject: [PATCH 30/36] Remove newline from print argument --- utils/synapse_json_schema_bind.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/synapse_json_schema_bind.py b/utils/synapse_json_schema_bind.py index ce8dcd08..a4a65ea1 100644 --- a/utils/synapse_json_schema_bind.py +++ b/utils/synapse_json_schema_bind.py @@ -100,7 +100,7 @@ def register_json_schema(org, schema_type: str, schema_json: json, version: str, uri = schema.uri print(f"JSON schema {uri} was successfully registered.") except synapseclient.core.exceptions.SynapseHTTPError as error: - print("\n" + error) + print(error) print(f"JSON schema {uri} was previously registered and will not be updated.\n") print(f"\nSchema is available at https://repo-prod.prod.sagebase.org/repo/v1/schema/type/registered/{uri}\nThe schema can be referenced using the id: {uri}") From a03da9660192410a6a3f48cee92cb155a9fc3e4e Mon Sep 17 00:00:00 2001 From: Orion Banks <49208907+Bankso@users.noreply.github.com> Date: Thu, 25 Sep 2025 17:00:39 -0700 Subject: [PATCH 31/36] Create add_json_conditions.py Add script that modifies JSON schemas from schematic to include a "contains" label for attributes with enums in conditionals. This is to allow multiple conditions to be applied, based on a single input array --- utils/add_json_conditions.py | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) create mode 100644 utils/add_json_conditions.py diff --git a/utils/add_json_conditions.py b/utils/add_json_conditions.py new file mode 100644 index 00000000..f8bae42c --- /dev/null +++ b/utils/add_json_conditions.py @@ -0,0 +1,35 @@ +import json +import argparse +import pathlib + + +def generate_json_schema(input): + """ + Corrects conditional logic for arrays. + """ + + output = "".join([pathlib.Path(input).stem, ".updated.json"]) + + with open(input, 'r') as i: + model = json.load(i) + + conditions = model["allOf"] + + for property in conditions: + for prop in property["if"]["properties"]: + if property["if"]["properties"][prop]["enum"]: + property["if"]["properties"][prop] = { "contains": property["if"]["properties"][prop]} + + with open(output, 'w') as f: + json.dump(model, f, indent=2) + + print(f"✅ JSON Schema written to {output}") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Generate corrected JSON schema") + parser.add_argument("input_path", help="Path to a JSON schema file.") + args = parser.parse_args() + + + generate_json_schema(args.input_path) \ No newline at end of file From eb5ef7a3b10003b9af8f67c8f241524084527d43 Mon Sep 17 00:00:00 2001 From: Orion Banks <49208907+Bankso@users.noreply.github.com> Date: Tue, 30 Sep 2025 13:50:38 -0700 Subject: [PATCH 32/36] Fix output filename and formatting in JSON schema generator Changed the output filename separator from '.' to '-' for consistency. 
Also fixed minor formatting in the dictionary assignment for better readability. --- utils/add_json_conditions.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/utils/add_json_conditions.py b/utils/add_json_conditions.py index f8bae42c..9ce9cfd2 100644 --- a/utils/add_json_conditions.py +++ b/utils/add_json_conditions.py @@ -8,7 +8,7 @@ def generate_json_schema(input): Corrects conditional logic for arrays. """ - output = "".join([pathlib.Path(input).stem, ".updated.json"]) + output = "".join([pathlib.Path(input).stem, "-updated.json"]) with open(input, 'r') as i: model = json.load(i) @@ -18,7 +18,7 @@ def generate_json_schema(input): for property in conditions: for prop in property["if"]["properties"]: if property["if"]["properties"][prop]["enum"]: - property["if"]["properties"][prop] = { "contains": property["if"]["properties"][prop]} + property["if"]["properties"][prop] = {"contains": property["if"]["properties"][prop]} with open(output, 'w') as f: json.dump(model, f, indent=2) From ceb4e4be37323ed1e6e9bd12c1ed1d9aba923765 Mon Sep 17 00:00:00 2001 From: Orion Banks <49208907+Bankso@users.noreply.github.com> Date: Thu, 25 Sep 2025 10:34:54 -0700 Subject: [PATCH 33/36] Load input as strings Addresses bug `TypeError: expected string or bytes-like object` --- utils/merge_and_correct_manifests.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/merge_and_correct_manifests.py b/utils/merge_and_correct_manifests.py index 21dca286..c7ef9183 100644 --- a/utils/merge_and_correct_manifests.py +++ b/utils/merge_and_correct_manifests.py @@ -168,7 +168,7 @@ def main(): print(f"\nDatabase read successfully!") if new_entries is not None: - new_entries_df = pd.read_csv(new_entries, keep_default_na=False, index_col=False) + new_entries_df = pd.read_csv(new_entries, keep_default_na=False, index_col=False, dtype=str) print(f"\nNew_entries read successfully!") filtered_entries_df = filter_updated_manifest(new_entries_df, index_col, data_type) filtered_entries_df.drop(["entityId", "iconTags", "Source"], axis=1, errors="ignore", inplace=True) From 76074e2edf2501bfeadd69b2ed72749ad302f5c2 Mon Sep 17 00:00:00 2001 From: Orion Banks <49208907+Bankso@users.noreply.github.com> Date: Mon, 20 Oct 2025 12:54:02 -0700 Subject: [PATCH 34/36] Add tools from curator examples --- .../create_file_based_metadata_task.py | 466 ++++++++++++++++++ .../create_record_based_metadata_task.py | 278 +++++++++++ curator_tools/list_curation_task.py | 14 + curator_tools/validate_json_schema.py | 16 + 4 files changed, 774 insertions(+) create mode 100644 curator_tools/create_file_based_metadata_task.py create mode 100644 curator_tools/create_record_based_metadata_task.py create mode 100644 curator_tools/list_curation_task.py create mode 100644 curator_tools/validate_json_schema.py diff --git a/curator_tools/create_file_based_metadata_task.py b/curator_tools/create_file_based_metadata_task.py new file mode 100644 index 00000000..cb195fef --- /dev/null +++ b/curator_tools/create_file_based_metadata_task.py @@ -0,0 +1,466 @@ +""" +Create a file view and CurationTask for schema-bound folders following the file-based metadata workflow. +Pre-Requisites: + Requires conflicting versions of schematicpy and synapseclient. 
+    Install schematicpy dependencies first, then uninstall synapseclient and reinstall with pip install git+https://github.com/Sage-Bionetworks/synapsePythonClient.git@synpy-1653-metadata-tasks-and-recordsets
+Usage:
+    python create_file_based_metadata_task.py --folder-id syn12345678 --datatype MyDatatype.studyXYZ
+    python create_file_based_metadata_task.py --folder-id syn12345678 --datatype MyDatatype.studyXYZ \\
+        --instructions "Custom curation instructions"
+    python create_file_based_metadata_task.py --folder-id syn12345678 --datatype MyDatatype.studyXYZ --no-wiki
+Users can also set arguments using the global variables below,
+    but CLI arguments are used first.
+"""
+
+import argparse
+import warnings
+from typing import Any, Optional
+
+from synapseclient import Synapse  # type: ignore
+from synapseclient import Wiki  # type: ignore
+from synapseclient.core.exceptions import SynapseHTTPError  # type: ignore
+from synapseclient.models import (  # type: ignore
+    Column,
+    ColumnType,
+    EntityView,
+    Folder,
+    ViewTypeMask,
+)
+from synapseclient.models.curation import CurationTask, FileBasedMetadataTaskProperties
+
+FOLDER_ID = ""  # The Synapse ID of the entity you want to create the file view and CurationTask for
+ATTACH_WIKI = None  # Whether or not to attach the file view to the folder wiki. True or False
+DATATYPE = ""  # Data type name for the CurationTask (required)
+# Instructions for the curation task (required)
+INSTRUCTIONS = ""
+
+
+TYPE_DICT = {
+    "string": ColumnType.STRING,
+    "number": ColumnType.DOUBLE,
+    "integer": ColumnType.INTEGER,
+    "boolean": ColumnType.BOOLEAN,
+}
+
+LIST_TYPE_DICT = {
+    "string": ColumnType.STRING_LIST,
+    "integer": ColumnType.INTEGER_LIST,
+    "boolean": ColumnType.BOOLEAN_LIST,
+}
+
+
+def create_json_schema_entity_view(
+    syn: Synapse,
+    synapse_entity_id: str,
+    entity_view_name: str = "JSON Schema view",
+) -> str:
+    """
+    Creates a Synapse entity view based on a JSON Schema that is bound to a Synapse entity
+    This functionality is needed only temporarily. See note at top of module.
+    Args:
+        syn: A Synapse object that's been logged in
+        synapse_entity_id: The ID of the entity in Synapse that the JSON Schema is bound to
+        entity_view_name: The name the created entity view will have
+    Returns:
+        The Synapse id of the created entity view
+    """
+    warnings.warn(
+        "This function is a prototype, and could change or be removed at any point."
+    )
+    js_service = syn.service("json_schema")
+    json_schema = js_service.get_json_schema(synapse_entity_id)
+    org = js_service.JsonSchemaOrganization(
+        json_schema["jsonSchemaVersionInfo"]["organizationName"]
+    )
+    schema_version = js_service.JsonSchemaVersion(
+        org,
+        json_schema["jsonSchemaVersionInfo"]["schemaName"],
+        json_schema["jsonSchemaVersionInfo"]["semanticVersion"],
+    )
+    columns = _create_columns_from_json_schema(schema_version.body)
+    view = EntityView(
+        name=entity_view_name,
+        parent_id=synapse_entity_id,
+        scope_ids=[synapse_entity_id],
+        view_type_mask=ViewTypeMask.FILE,
+        columns=columns,
+    ).store(synapse_client=syn)
+    # This reorder is so that these show up in the front of the EntityView in Synapse
+    view.reorder_column(name="createdBy", index=0)
+    view.reorder_column(name="name", index=0)
+    view.reorder_column(name="id", index=0)
+    view.store(synapse_client=syn)
+    return view.id
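+
+
+# A minimal usage sketch (the Synapse ID below is hypothetical); assumes a
+# JSON schema is already bound to the target folder:
+#
+#     syn = Synapse()
+#     syn.login()
+#     view_id = create_json_schema_entity_view(
+#         syn=syn,
+#         synapse_entity_id="syn12345678",
+#         entity_view_name="JSON Schema view",
+#     )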
+
+
+def create_or_update_wiki_with_entity_view(
+    syn: Synapse,
+    entity_view_id: str,
+    owner_id: str,
+    title: Optional[str] = None,
+) -> Wiki:
+    """
+    Creates or updates a Wiki for an entity, depending on whether one already exists.
+    An EntityView query is added to the wiki markdown
+    This functionality is needed only temporarily. See note at top of module.
+    Args:
+        syn: A Synapse object that's been logged in
+        entity_view_id: The Synapse id of the EntityView for the query
+        owner_id: The ID of the entity in Synapse for which the wiki will be created/updated
+        title: The (new) title of the wiki to be created/updated
+    Returns:
+        The created Wiki object
+    """
+    warnings.warn(
+        "This function is a prototype, and could change or be removed at any point."
+    )
+    entity = syn.get(owner_id)
+
+    try:
+        wiki = syn.getWiki(entity)
+    except SynapseHTTPError:
+        wiki = None
+    if wiki:
+        return update_wiki_with_entity_view(syn, entity_view_id, owner_id, title)
+    return create_entity_view_wiki(syn, entity_view_id, owner_id, title)
+
+
+def create_entity_view_wiki(
+    syn: Synapse,
+    entity_view_id: str,
+    owner_id: str,
+    title: Optional[str] = None,
+) -> Wiki:
+    """
+    Creates a wiki with a query of an entity view
+    This functionality is needed only temporarily. See note at top of module.
+    Args:
+        syn: A Synapse object that's been logged in
+        entity_view_id: The Synapse id of the entity view to make the wiki for
+        owner_id: The ID of the entity in Synapse to put as owner of the wiki
+        title: The title of the wiki to be created
+    Returns:
+        The created wiki object
+    """
+    warnings.warn(
+        "This function is a prototype, and could change or be removed at any point."
+    )
+    content = (
+        "${synapsetable?query=select %2A from "
+        f"{entity_view_id}"
+        "&showquery=false&tableonly=false}"
+    )
+    if title is None:
+        title = "Entity View"
+    wiki = Wiki(title=title, owner=owner_id, markdown=content)
+    wiki = syn.store(wiki)
+    return wiki
+
+
+def update_wiki_with_entity_view(
+    syn: Synapse, entity_view_id: str, owner_id: str, title: Optional[str] = None
+) -> Wiki:
+    """
+    Updates a wiki to include a query of an entity view
+    This functionality is needed only temporarily. See note at top of module.
+    Args:
+        syn: A Synapse object that's been logged in
+        entity_view_id: The Synapse id of the entity view to make the query for
+        owner_id: The ID of the entity in Synapse to put as owner of the wiki
+        title: The title of the wiki to be updated
+    Returns:
+        The updated wiki object
+    """
+    warnings.warn(
+        "This function is a prototype, and could change or be removed at any point."
+    )
+    entity = syn.get(owner_id)
+    wiki = syn.getWiki(entity)
+
+    new_content = (
+        "\n"
+        "${synapsetable?query=select %2A from "
+        f"{entity_view_id}"
+        "&showquery=false&tableonly=false}"
+    )
+    wiki.markdown = wiki.markdown + new_content
+    if title:
+        wiki.title = title
+
+    syn.store(wiki)
+    return wiki
+
+
+def _create_columns_from_json_schema(json_schema: dict[str, Any]) -> list[Column]:
+    """Creates a list of Synapse Columns based on the JSON Schema type
+    Arguments:
+        json_schema: The JSON Schema in dict form
+    Raises:
+        ValueError: If the JSON Schema has no properties
+        ValueError: If the JSON Schema properties is not a dict
+    Returns:
+        A list of Synapse columns based on the JSON Schema
+    """
+    properties = json_schema.get("properties")
+    if properties is None:
+        raise ValueError("The JSON Schema is missing a 'properties' field.")
+    if not isinstance(properties, dict):
+        raise ValueError(
+            "The 'properties' field in the JSON Schema must be a dictionary."
+ ) + columns = [] + for name, prop_schema in properties.items(): + column_type = _get_column_type_from_js_property(prop_schema) + maximum_size = None + if column_type == "STRING": + maximum_size = 100 + if column_type in LIST_TYPE_DICT.values(): + maximum_size = 5 + + column = Column( + name=name, + column_type=column_type, + maximum_size=maximum_size, + default_value=None, + ) + columns.append(column) + return columns + + +def _get_column_type_from_js_property(js_property: dict[str, Any]) -> ColumnType: + """ + Gets the Synapse column type from a JSON Schema property. + The JSON Schema should be valid but that should not be assumed. + If the type can not be determined ColumnType.STRING will be returned. + Args: + js_property: A JSON Schema property in dict form. + Returns: + A Synapse ColumnType based on the JSON Schema type + """ + # Enums are always strings in Synapse tables + if "enum" in js_property: + return ColumnType.STRING + if "type" in js_property: + if js_property["type"] == "array": + return _get_list_column_type_from_js_property(js_property) + return TYPE_DICT.get(js_property["type"], ColumnType.STRING) + # A oneOf list usually indicates that the type could be one or more different things + if "oneOf" in js_property and isinstance(js_property["oneOf"], list): + return _get_column_type_from_js_one_of_list(js_property["oneOf"]) + return ColumnType.STRING + + +def _get_column_type_from_js_one_of_list(js_one_of_list: list[Any]) -> ColumnType: + """ + Gets the Synapse column type from a JSON Schema oneOf list. + Items in the oneOf list should be dicts, but that should not be assumed. + Args: + js_one_of_list: A list of items to check for type + Returns: + A Synapse ColumnType based on the JSON Schema type + """ + # items in a oneOf list should be dicts + items = [item for item in js_one_of_list if isinstance(item, dict)] + # Enums are always strings in Synapse tables + if [item for item in items if "enum" in item]: + return ColumnType.STRING + # For Synapse ColumnType we can ignore null types in JSON Schemas + type_items = [item for item in items if "type" in item if item["type"] != "null"] + if len(type_items) == 1: + type_item = type_items[0] + if type_item["type"] == "array": + return _get_list_column_type_from_js_property(type_item) + return TYPE_DICT.get(type_item["type"], ColumnType.STRING) + return ColumnType.STRING + + +def _get_list_column_type_from_js_property(js_property: dict[str, Any]) -> ColumnType: + """ + Gets the Synapse column type from a JSON Schema array property + Args: + js_property: A JSON Schema property in dict form. + Returns: + A Synapse ColumnType based on the JSON Schema type + """ + if "items" in js_property and isinstance(js_property["items"], dict): + # Enums are always strings in Synapse tables + if "enum" in js_property["items"]: + return ColumnType.STRING_LIST + if "type" in js_property["items"]: + return LIST_TYPE_DICT.get( + js_property["items"]["type"], ColumnType.STRING_LIST + ) + + return ColumnType.STRING_LIST + + +def create_file_view( + folder_id: str, + attach_wiki: bool, + datatype: str, + instructions: str +) -> tuple[str, str]: + """ + Create a file view for a schema-bound folder using schematic. 
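+    Assumes a JSON schema is already bound to the folder; the schema's
+    properties determine the entity view columns.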
+    Args:
+        folder_id: The Synapse Folder ID to create the file view for
+        attach_wiki (bool): Whether or not to attach a Synapse Wiki
+        datatype (str): Data type name for the CurationTask (required)
+        instructions (str): Instructions for the curation task (required)
+    Returns:
+        A tuple:
+            The first item is the Synapse ID of the entity view created
+            The second item is the task ID of the curation task created
+    """
+    syn = Synapse()
+    syn.login()
+
+    syn.logger.info("Attempting to create entity view.")
+    try:
+        entity_view_id = create_json_schema_entity_view(
+            syn=syn,
+            synapse_entity_id=folder_id
+        )
+    except Exception as e:
+        msg = f"Error creating entity view: {str(e)}"
+        syn.logger.error(msg)
+        raise e
+    syn.logger.info("Created entity view.")
+
+    if attach_wiki:
+        syn.logger.info("Attempting to attach wiki.")
+        try:
+            create_or_update_wiki_with_entity_view(
+                syn=syn,
+                entity_view_id=entity_view_id,
+                owner_id=folder_id
+            )
+        except Exception as e:
+            msg = f"Error creating wiki: {str(e)}"
+            syn.logger.error(msg)
+            raise e
+        syn.logger.info("Wiki attached.")
+
+    # Validate that the folder has an attached JSON schema
+    # The datatype parameter is now required and used directly for the CurationTask.
+
+    js = syn.service("json_schema")
+    syn.logger.info("Attempting to get the attached schema.")
+    try:
+        js.get_json_schema_from_entity(folder_id)
+    except Exception as e:
+        msg = "Error getting the attached schema."
+        syn.logger.exception(msg)
+        raise e
+    syn.logger.info("Schema retrieval successful")
+
+    # Use the provided datatype (required parameter)
+    task_datatype = datatype
+
+    syn.logger.info("Attempting to get the Synapse ID of the provided folder's project.")
+    try:
+        entity = Folder(folder_id).get(synapse_client=syn)
+        parent = syn.get(entity.parent_id)
+        project = None
+        while not project:
+            if parent.concreteType == "org.sagebionetworks.repo.model.Project":
+                project = parent
+                break
+            parent = syn.get(parent.parentId)
+    except Exception as e:
+        msg = "Error getting the Synapse ID of the provided folder's project"
+        syn.logger.exception(msg)
+        raise e
+    syn.logger.info("Got the Synapse ID of the provided folder's project.")
+
+    syn.logger.info("Attempting to create the CurationTask.")
+    try:
+        task = CurationTask(
+            data_type=task_datatype,
+            project_id=project.id,
+            instructions=instructions,
+            task_properties=FileBasedMetadataTaskProperties(
+                upload_folder_id=folder_id,
+                file_view_id=entity_view_id,
+            )
+        ).store(synapse_client=syn)
+    except Exception as e:
+        msg = f"Error creating the CurationTask: {str(e)}"
+        syn.logger.error(msg)
+        raise e
+    syn.logger.info("Created the CurationTask.")
+
+    return (entity_view_id, task.task_id)
+
+
+def main():
+    """Main function for command-line usage."""
+    parser = argparse.ArgumentParser(
+        description="Create file views for schema-bound folders"
+    )
+    parser.add_argument(
+        '--folder-id',
+        type=str,
+        # required=True,
+        help='Synapse folder ID'
+    )
+    parser.add_argument(
+        '--datatype',
+        type=str,
+        help='Data type name for the CurationTask (required)'
+    )
+    parser.add_argument(
+        '--instructions',
+        type=str,
+        help='Instructions for the curation task (required)'
+    )
+    parser.add_argument(
+        '--no-wiki',
+        action='store_false',
+        help='Do not attach view to folder wiki'
+    )
+    args = parser.parse_args()
+
+    if args.folder_id is not None:
+        folder_id = args.folder_id
+    elif FOLDER_ID:
+        folder_id = FOLDER_ID
+    else:
+        raise ValueError("folder_id must be provided via CLI or global in script")
+
+    if args.datatype is not None:
+        datatype = 
args.datatype + elif DATATYPE: + datatype = DATATYPE + else: + raise ValueError("datatype must be provided via CLI argument --datatype or set in global variable DATATYPE") + + if args.instructions is not None: + instructions = args.instructions + elif INSTRUCTIONS: + instructions = INSTRUCTIONS + else: + raise ValueError( + "instructions must be provided via CLI argument --instructions or set in global variable INSTRUCTIONS" + ) + + if not args.no_wiki: + attach_wiki = False + elif ATTACH_WIKI is not None: + attach_wiki = ATTACH_WIKI + else: + attach_wiki = True + + entity_view_id, curation_task_id = create_file_view( + folder_id=folder_id, + attach_wiki=attach_wiki, + datatype=datatype, + instructions=instructions + ) + print(f"Wiki attached: {attach_wiki}") + print(f"View ID: {entity_view_id}") + print(f"Task ID: {curation_task_id}") + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/curator_tools/create_record_based_metadata_task.py b/curator_tools/create_record_based_metadata_task.py new file mode 100644 index 00000000..7adb3182 --- /dev/null +++ b/curator_tools/create_record_based_metadata_task.py @@ -0,0 +1,278 @@ +""" +Generate and upload CSV templates as a RecordSet for record-based metadata, create a +CurationTask, and also create a Grid to bootstrap the ValidationStatistics. +Usage: + python create_record_based_metadata_task.py --folder-id syn12345678 --dcc AD \\ + --datatype BiospecimenMetadataTemplate --schema_path path/to/schema.json \\ + --schema_uri schema_uri --upsert_keys specimenID \\ + --instructions "Please curate this metadata according to the schema requirements" + + # Multiple upsert keys: + python create_record_based_metadata_task.py --folder-id syn12345678 --dcc AD \\ + --datatype BiospecimenMetadataTemplate --schema_uri schema_uri \\ + --upsert_keys specimenID participantID sampleDate +Users can also set arguments using the global variables below, + but CLI arguments are used first. +""" + +import argparse +import tempfile +import pandas as pd +from pprint import pprint +from typing import Dict, Any, List, Optional +import json + +import synapseclient +from synapseclient import Synapse +from synapseclient.models import RecordSet, CurationTask, RecordBasedMetadataTaskProperties, Grid +from synapseclient.services.json_schema import JsonSchemaService + +PROJECT_ID = "" # The Synapse ID of the project where the folder exists +FOLDER_ID = "" # The Synapse ID of the folder to upload to +DCC = "" # Data Coordination Center +DATATYPE = "" # Data type name +SCHEMA_URI = "" # JSON schema URI +SCHEMA_PATH = None # Path to JSON schema file located on your machine, alternative to SCHEMA_URI +UPSERT_KEYS = [] # List of column names to use as upsert keys, e.g., ['specimenID', 'participantID'] +# Instructions for the curation task (required) +INSTRUCTIONS = "These are my custom instructions to tell someone what to do" + +def extract_property_titles(schema_data: Dict[str, Any]) -> List[str]: + """ + Extract title fields from all properties in a JSON schema. 
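+    Properties that do not define a 'title' fall back to the property name.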
+ Args: + schema_data: The parsed JSON schema data + Returns: + List of title values from the properties + """ + titles = [] + + # Check if 'properties' exists in the schema + if 'properties' not in schema_data: + return titles + + properties = schema_data['properties'] + + for property_name, property_data in properties.items(): + if isinstance(property_data, dict): + if 'title' in property_data: + titles.append(property_data['title']) + else: + titles.append(property_name) + + return titles + + +def create_dataframe_from_titles(titles: List[str]) -> pd.DataFrame: + """ + Create an empty DataFrame with the extracted titles as column names. + Args: + titles: List of title strings to use as column names + Returns: + Empty DataFrame with titles as columns + """ + if not titles: + return pd.DataFrame() + + df = pd.DataFrame(columns=titles) + return df + + +def extract_schema_properties_from_dict(schema_data: Dict[str, Any]) -> pd.DataFrame: + """ + Process a JSON schema dictionary and return a DataFrame with property titles as columns. + Args: + schema_data: The parsed JSON schema data as a dictionary + Returns: + DataFrame with property titles as columns + """ + titles = extract_property_titles(schema_data) + + df = create_dataframe_from_titles(titles) + + return df + + +def extract_schema_properties_from_file(json_file_path: str) -> pd.DataFrame: + """ + Process a JSON schema file and return a DataFrame with property titles as columns. + Args: + json_file_path: Path to the JSON schema file + Returns: + DataFrame with property titles as columns + Raises: + FileNotFoundError: If the JSON file doesn't exist + json.JSONDecodeError: If the JSON file is malformed + ValueError: If the file doesn't contain a valid schema structure + """ + try: + with open(json_file_path, 'r', encoding='utf-8') as file: + schema_data = json.load(file) + + return extract_schema_properties_from_dict(schema_data) + + except FileNotFoundError as e: + raise FileNotFoundError(f"JSON schema file not found: {json_file_path}") from e + except json.JSONDecodeError as e: + raise json.JSONDecodeError(f"Invalid JSON in file '{json_file_path}': {e}", e.doc, e.pos) + + +def extract_schema_properties_from_web(syn: Synapse, schema_uri: str) -> pd.DataFrame: + """ + Extract schema properties from a web-based JSON schema URI using Synapse. + This function retrieves a JSON schema from a web URI through the Synapse platform + and extracts property titles to create a DataFrame with those titles as columns. + Args: + syn: Authenticated Synapse client instance + schema_uri: URI pointing to the JSON schema resource + Returns: + DataFrame with property titles from the schema as column names + """ + try: + org_name, schema_name, version = schema_uri.split("-") + except ValueError as e: + raise ValueError( + f"Invalid schema URI format: {schema_uri}. Expected format 'org-name-schema.name.schema-version'.") from e + + js = JsonSchemaService(synapse=syn) + schemas_list = js.list_json_schemas(organization_name=org_name) + if not any(schema_name == s["schemaName"] for s in schemas_list): + raise ValueError(f"Schema URI '{schema_uri}' not found in Synapse JSON schemas.") + + schema = js.get_json_schema_body(json_schema_uri=schema_uri) + return extract_schema_properties_from_dict(schema) + + +def extract_schema(syn: Synapse, schema_path: Optional[str] = None, schema_uri: Optional[str] = None) -> pd.DataFrame: + """ + Extract schema properties from either a local file or web URI. 
+    This function provides a unified interface for extracting JSON schema properties
+    from different sources. It accepts either a local file path or a web URI and
+    delegates to the appropriate extraction function.
+    Args:
+        syn: Authenticated Synapse client instance (required for web URI extraction)
+        schema_path: Optional path to a local JSON schema file
+        schema_uri: Optional URI pointing to a web-based JSON schema resource
+    Returns:
+        DataFrame with property titles from the schema as column names
+    Raises:
+        ValueError: If neither schema_path nor schema_uri is provided
+        FileNotFoundError: If schema_path is provided but the file doesn't exist
+        json.JSONDecodeError: If the local schema file contains invalid JSON
+        SynapseError: If there are issues retrieving the web-based schema
+    Note:
+        At least one of schema_path or schema_uri must be provided; if both are
+        given, the URI is used.
+    """
+    if schema_uri:
+        return extract_schema_properties_from_web(syn, schema_uri)
+    elif schema_path:
+        return extract_schema_properties_from_file(schema_path)
+    else:
+        raise ValueError("Either schema_path or schema_uri must be provided.")
+
+
+def main():
+    """Main function for command-line usage."""
+    parser = argparse.ArgumentParser(
+        description="Generate and upload CSV templates for record-based metadata"
+    )
+
+    parser.add_argument('--folder_id', type=str, required=False,
+                        help='Synapse folder ID for upload')
+    parser.add_argument('--dcc', type=str, required=False,
+                        help='Data Coordination Center')
+    parser.add_argument('--datatype', type=str, required=False,
+                        help='Data type name')
+    parser.add_argument('--schema_uri', type=str, required=False, default=None,
+                        help='JSON schema URI')
+    parser.add_argument('--schema_path', type=str, required=False, default=None,
+                        help='path to JSON schema')
+    parser.add_argument('--upsert_keys', type=str, nargs='+', required=False,
+                        help='Column names to use as upsert keys (one or more)')
+    parser.add_argument('--instructions', type=str, required=False,
+                        help='Instructions for the curation task (required)')
+
+    args = parser.parse_args()
+
+    # Use CLI arguments first, then fall back to constants
+    folder_id = args.folder_id if args.folder_id is not None else FOLDER_ID
+    dcc = args.dcc if args.dcc is not None else DCC
+    datatype = args.datatype if args.datatype is not None else DATATYPE
+    schema_uri = args.schema_uri if args.schema_uri is not None else SCHEMA_URI
+    schema_path = args.schema_path if args.schema_path is not None else SCHEMA_PATH
+    upsert_keys = args.upsert_keys if args.upsert_keys is not None else UPSERT_KEYS
+    instructions = args.instructions if args.instructions is not None else INSTRUCTIONS
+
+    # Validate required parameters; the globals default to empty strings/lists,
+    # so check for falsy values rather than None
+    if not folder_id:
+        raise ValueError("folder_id must be provided via CLI or global variable FOLDER_ID")
+    if not dcc:
+        raise ValueError("dcc must be provided via CLI or global variable DCC")
+    if not datatype:
+        raise ValueError("datatype must be provided via CLI or global variable DATATYPE")
+    if not upsert_keys:
+        raise ValueError("upsert_keys must be provided via CLI or global variable UPSERT_KEYS")
+    if not instructions:
+        raise ValueError("instructions must be provided via CLI or global variable INSTRUCTIONS")
+    if not PROJECT_ID:
+        raise ValueError("PROJECT_ID must be set via the global variable PROJECT_ID; it is required to create the CurationTask")
+
+    syn = synapseclient.Synapse()
+    syn.login()
+
+    template_df = extract_schema(syn=syn, schema_path=schema_path, schema_uri=schema_uri)
+    syn.logger.info(f"Extracted schema properties and created template: {template_df.columns.tolist()}")
+
+    tmp = 
tempfile.NamedTemporaryFile(delete=False, suffix=".csv") + try: + with open(tmp.name, 'w') as f: + template_df.to_csv(f, index=False) + except Exception as e: + syn.logger.error(f"Error writing template to temporary CSV file: {e}") + raise e + + try: + with open(tmp.name, 'r') as f: + recordset_with_data = RecordSet( + name=f"{dcc}_{datatype}_RecordSet", + parent_id=folder_id, + description=f"RecordSet for {dcc} {datatype}", + path=f.name, + upsert_keys=upsert_keys + ).store(synapse_client=syn) + recordset_id = recordset_with_data.id + syn.logger.info(f"Created RecordSet with ID: {recordset_id}") + pprint(recordset_with_data) + except Exception as e: + syn.logger.error(f"Error creating RecordSet in Synapse: {e}") + raise e + + try: + curation_task = CurationTask( + data_type=datatype, + project_id=PROJECT_ID, + instructions=instructions, + task_properties=RecordBasedMetadataTaskProperties( + record_set_id=recordset_id, + ) + ).store(synapse_client=syn) + syn.logger.info( + f"Created CurationTask ({curation_task.task_id}) in folder {folder_id} for data type {datatype}") + pprint(curation_task) + except Exception as e: + syn.logger.error(f"Error creating CurationTask in Synapse: {e}") + raise e + + try: + curation_grid: Grid = Grid( + record_set_id=recordset_id, + ) + curation_grid.create(synapse_client=syn) + curation_grid = curation_grid.export_to_record_set(synapse_client=syn) + syn.logger.info(f"Created Grid view for RecordSet ID: {recordset_id} for data type {datatype}") + pprint(curation_grid) + except Exception as e: + syn.logger.error(f"Error creating Grid view in Synapse: {e}") + raise e + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/curator_tools/list_curation_task.py b/curator_tools/list_curation_task.py new file mode 100644 index 00000000..0e0fa0fb --- /dev/null +++ b/curator_tools/list_curation_task.py @@ -0,0 +1,14 @@ +from pprint import pprint +from synapseclient import Synapse +from synapseclient.models.curation import CurationTask + +PROJECT_ID = "" # The Synapse ID of the project to list tasks from + +syn = Synapse() +syn.login() + + +for curation_task in CurationTask.list( + project_id=PROJECT_ID +): + pprint(curation_task) \ No newline at end of file diff --git a/curator_tools/validate_json_schema.py b/curator_tools/validate_json_schema.py new file mode 100644 index 00000000..e953b91d --- /dev/null +++ b/curator_tools/validate_json_schema.py @@ -0,0 +1,16 @@ +from synapseclient import Synapse +from synapseclient.models import Folder + +# Data from: https://synapse.org/Synapse:syn69735275/tables/ +# The URI of the JSON Schema you want to bind, for example: `sage.schemas.v2571-ad.IndividualAnimalMetadataTemplate.schema-0.1.0` +URI = "" +# The Synapse ID of the entity you want to bind the JSON Schema to. This should be the ID of a Folder where you want to enforce the schema. 
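+# e.g. FOLDER_ID = "syn12345678"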
+FOLDER_ID = "" + +syn = Synapse() +syn.login() + +folder = Folder(id=FOLDER_ID).get() +schema_validation = folder.validate_schema() + +print(f"Schema validation result for folder {FOLDER_ID}: {schema_validation}") \ No newline at end of file From 8b408c4c1ff97cd8ecf1c0425a93eb39facc60df Mon Sep 17 00:00:00 2001 From: Orion Banks <49208907+Bankso@users.noreply.github.com> Date: Mon, 20 Oct 2025 14:01:37 -0700 Subject: [PATCH 35/36] Create query_schema_registry.py --- curator_tools/query_schema_registry.py | 150 +++++++++++++++++++++++++ 1 file changed, 150 insertions(+) create mode 100644 curator_tools/query_schema_registry.py diff --git a/curator_tools/query_schema_registry.py b/curator_tools/query_schema_registry.py new file mode 100644 index 00000000..875dbf51 --- /dev/null +++ b/curator_tools/query_schema_registry.py @@ -0,0 +1,150 @@ +""" +Query the Synapse schema registry table to retrieve Schema URIs based on DCC and datatype. +This script queries the schema registry table at syn69735275 to find matching schemas +based on the provided DCC (Data Coordination Center) and datatype parameters. +Results are sorted by version and the URI is returned. +Usage: + python query_schema_registry.py --dcc ad --datatype IndividualAnimalMetadataTemplate + + # Or use the global variables in the script + python query_schema_registry.py +Users can also set arguments using the global variables below, +but CLI arguments take precedence. +""" + +import argparse +from typing import List, Optional +from synapseclient import Synapse +from synapseclient.models import Table + +# Global variables - set these if you don't want to use command line arguments +DCC = "" # Data Coordination Center (e.g., 'ad', 'amp', 'mc2') +DATATYPE = "" # Data type name from schema + +# The Synapse ID of the schema registry table +SCHEMA_REGISTRY_TABLE_ID = "" + + +def query_schema_registry( + dcc: str, + datatype: str, + synapse_client: Optional[Synapse] = None +) -> List[dict]: + """ + Query the schema registry table to find schemas matching DCC and datatype. + Arguments: + dcc: Data Coordination Center identifier (e.g., 'ad', 'amp', 'mc2') + datatype: Data type name from the schema + synapse_client: Authenticated Synapse client instance + Returns: + List of dictionaries containing schema information, sorted by version + """ + if synapse_client is None: + syn = Synapse() + syn.login() + else: + syn = synapse_client + + # Construct SQL query to search for schemas matching DCC and datatype + # The query looks for exact matches in DCC and contains match for datatype + # Results are sorted by version in descending order (newest first) + query = f""" + SELECT * FROM {SCHEMA_REGISTRY_TABLE_ID} + WHERE dcc = '{dcc}' + AND datatype LIKE '%{datatype}%' + ORDER BY version DESC + """ + + print(f"Querying schema registry with DCC='{dcc}' and datatype='{datatype}'...") + print(f"SQL Query: {query}") + + # Query the table and get results as a pandas DataFrame + table = Table(id=SCHEMA_REGISTRY_TABLE_ID) + results_df = table.query(query=query) + + if results_df.empty: + print(f"No schemas found for DCC='{dcc}' and datatype='{datatype}'") + return [] + + # Convert DataFrame to list of dictionaries for easier handling + results = results_df.to_dict('records') + + print(f"Found {len(results)} matching schema(s):") + for i, result in enumerate(results, 1): + print(f" {i}. 
URI: {result.get('uri', 'N/A')}") + print(f" Version: {result.get('version', 'N/A')}") + print(f" DCC: {result.get('dcc', 'N/A')}") + print(f" DataType: {result.get('datatype', 'N/A')}") + if i < len(results): + print() + + return results + + +def get_latest_schema_uri(dcc: str, datatype: str, synapse_client: Optional[Synapse] = None) -> Optional[str]: + """ + Get the URI of the latest schema version for the given DCC and datatype. + Arguments: + dcc: Data Coordination Center identifier + datatype: Data type name from the schema + synapse_client: Authenticated Synapse client instance + Returns: + URI string of the latest schema version, or None if not found + """ + results = query_schema_registry(dcc, datatype, synapse_client) + + if results: + latest_schema = results[0] # Results are sorted by version DESC, so first is latest + uri = latest_schema.get('uri') + print(f"\nLatest schema URI: {uri}") + return uri + else: + print(f"\nNo schema found for DCC='{dcc}' and datatype='{datatype}'") + return None + + +def main(): + """Main function for command-line usage.""" + parser = argparse.ArgumentParser( + description="Query the Synapse schema registry to find Schema URIs by DCC and datatype" + ) + parser.add_argument( + '--dcc', + type=str, + help='Data Coordination Center identifier (e.g., ad, amp, mc2)' + ) + parser.add_argument( + '--datatype', + type=str, + help='Data type name from the schema (e.g., IndividualAnimalMetadataTemplate)' + ) + + args = parser.parse_args() + + # Use command line arguments if provided, otherwise use global variables + if args.dcc is not None: + dcc = args.dcc + elif DCC: + dcc = DCC + else: + raise ValueError("DCC must be provided via CLI argument --dcc or set in global variable DCC") + + if args.datatype is not None: + datatype = args.datatype + elif DATATYPE: + datatype = DATATYPE + else: + raise ValueError("datatype must be provided via CLI argument --datatype or set in global variable DATATYPE") + + # Initialize Synapse client + syn = Synapse() + syn.login() + + # Get just the latest schema URI + latest_uri = get_latest_schema_uri(dcc, datatype, syn) + if latest_uri: + print(f"\nUse this URI in your scripts: {latest_uri}") + + +if __name__ == "__main__": + main() \ No newline at end of file From 3828493d2ce52058114e3944a9f3bc7bf78b756f Mon Sep 17 00:00:00 2001 From: Orion Banks <49208907+Bankso@users.noreply.github.com> Date: Tue, 21 Oct 2025 14:43:01 -0700 Subject: [PATCH 36/36] Add ref to AR-related schema if passed --- utils/add_json_conditions.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/utils/add_json_conditions.py b/utils/add_json_conditions.py index 9ce9cfd2..72bf5757 100644 --- a/utils/add_json_conditions.py +++ b/utils/add_json_conditions.py @@ -3,7 +3,7 @@ import pathlib -def generate_json_schema(input): +def generate_json_schema(input, ar_schema): """ Corrects conditional logic for arrays. 
""" @@ -20,6 +20,10 @@ def generate_json_schema(input): if property["if"]["properties"][prop]["enum"]: property["if"]["properties"][prop] = {"contains": property["if"]["properties"][prop]} + ref_conditions = {"$ref": f"{ar_schema}"} if ar_schema is not None else None + + conditions = conditions.append(ref_conditions) + with open(output, 'w') as f: json.dump(model, f, indent=2) @@ -29,7 +33,8 @@ def generate_json_schema(input): if __name__ == "__main__": parser = argparse.ArgumentParser(description="Generate corrected JSON schema") parser.add_argument("input_path", help="Path to a JSON schema file.") + parser.add_argument("ar_schema", help="URI for AR JSON schema") args = parser.parse_args() - generate_json_schema(args.input_path) \ No newline at end of file + generate_json_schema(args.input_path, args.ar_schema) \ No newline at end of file