From 07ecd8302fc05d1136c04aa98d61cbfd118ca1c4 Mon Sep 17 00:00:00 2001 From: Orion Banks <49208907+Bankso@users.noreply.github.com> Date: Thu, 8 May 2025 16:30:50 -0700 Subject: [PATCH 01/67] Create map_to_crdc.py --- utils/map_to_crdc.py | 118 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 118 insertions(+) create mode 100644 utils/map_to_crdc.py diff --git a/utils/map_to_crdc.py b/utils/map_to_crdc.py new file mode 100644 index 00000000..58b746a7 --- /dev/null +++ b/utils/map_to_crdc.py @@ -0,0 +1,118 @@ +"""Clean and prep MC2 database tables for backpopulation + +This script will reorder and modify database table manifest columns +to match the respective View-type schema. + +author: orion.banks +""" + +import argparse +import pandas as pd +import sys +import re + +def get_args(): + """Set up command-line interface and get arguments.""" + parser = argparse.ArgumentParser() + parser.add_argument( + "-t", + type=str, + help="Dataset metadata CSV", + required=True, + ) + parser.add_argument( + "-v", + type=str, + help="Grant metadata CSV", + required=False, + default=None, + ) + parser.add_argument( + "-f", + type=str, + help="Consortium metadata CSV", + required=False, + default=None, + ) + parser.add_argument( + "-s", + type=str, + help="Study metadata CSV", + required=False, + default=None, + ) + parser.add_argument( + "-i", + type=str, + help="Target metadata CSV", + required=False, + default=None, + ) + parser.add_argument( + "-m", + type=str, + help="Target-to-source mapping CSV", + required=False, + default=None, + ) + return parser.parse_args() + +def extract_lists(df: pd.DataFrame, list_columns, pattern) -> pd.DataFrame: + """Extract bracketed/quoted lists from sheets.""" + + for col in list_columns: + + df[col] = ( + df[col] + .apply(lambda x: re.findall(pattern, x)) + .str.join(", ")) + + return df + +def map_columns(df: pd.DataFrame, column_map: list[tuple]) -> pd.DataFrame: + """Map outdated columns to new column names and drop old columns.""" + + for start, end in column_map: + + df[f"{end}"] = [ + x for x in df[f"{start}"] + ] + + return df + + +def main(): + """Main function.""" + + args = get_args() + + dataset_input, grant_input, consortium_input, study_input, target_input, mapping = args.t, args.v, args.f, args.s, args.i, args.m + + dataset, grant, consortium, study = None, None, None, None + + source_metadata = [(dataset_input, dataset), (grant_input, grant), (consortium_input, consortium), (study_input, study)] + + target = pd.read_csv(target_input, header=0).fillna("") + + mapping = pd.read_csv(mapping, header=0).to_dict() + + for input_file in source_metadata: + input_file[1] = pd.read_csv(input_file[0], header=0).fillna("") + + for col in target: + for k, v in mapping.items(): + if col == k: + source_col = v + if source_col in dataset.columns: + target[col] = dataset[source_col] + elif source_col in grant.columns: + target[col] = grant[source_col] + elif source_col in consortium.columns: + target[col] = consortium[source_col] + elif source_col in study.columns: + target[col] = study[source_col] + + target.to_csv("mapped_metadata.csv", index=False) + +if __name__ == "__main__": + main() From 0b61f164511792b105f6402b025865dadcfd99af Mon Sep 17 00:00:00 2001 From: Orion Banks <49208907+Bankso@users.noreply.github.com> Date: Mon, 19 May 2025 11:53:55 -0700 Subject: [PATCH 02/67] Generalize functions and simplify inputs Current expectation is that this information will be pulled from Synapse tables, using an input CSV that denotes "Component", "Table Synapse 
Id" --- utils/map_to_crdc.py | 102 ++++++++++++++++++------------------------- 1 file changed, 43 insertions(+), 59 deletions(-) diff --git a/utils/map_to_crdc.py b/utils/map_to_crdc.py index 58b746a7..84ace9aa 100644 --- a/utils/map_to_crdc.py +++ b/utils/map_to_crdc.py @@ -8,6 +8,7 @@ import argparse import pandas as pd +import synapseclient import sys import re @@ -15,48 +16,42 @@ def get_args(): """Set up command-line interface and get arguments.""" parser = argparse.ArgumentParser() parser.add_argument( - "-t", + "-d", type=str, help="Dataset metadata CSV", required=True, - ) - parser.add_argument( - "-v", - type=str, - help="Grant metadata CSV", - required=False, - default=None, - ) - parser.add_argument( - "-f", - type=str, - help="Consortium metadata CSV", - required=False, default=None, - ) - parser.add_argument( - "-s", - type=str, - help="Study metadata CSV", - required=False, - default=None, - ) + ), parser.add_argument( - "-i", + "-t", type=str, - help="Target metadata CSV", - required=False, + help="Target template name", + required=True, default=None, ) parser.add_argument( "-m", type=str, help="Target-to-source mapping CSV", - required=False, + required=True, default=None, ) return parser.parse_args() + +def get_table(syn, source_id: str, cols: str | list = "*") -> pd.DataFrame: + """Collect columns from a Synapse table entity and return as a Dataframe.""" + + if type(cols) == list: + cols = ", ".join(["".join(['"', col, '"']) for col in cols]) + + query = f"SELECT {cols} FROM {source_id}" + table = syn.tableQuery(query).asDataFrame().fillna("") + print(f"Data acquired from Synapse table {source_id}") + + return table + + def extract_lists(df: pd.DataFrame, list_columns, pattern) -> pd.DataFrame: """Extract bracketed/quoted lists from sheets.""" @@ -69,50 +64,39 @@ def extract_lists(df: pd.DataFrame, list_columns, pattern) -> pd.DataFrame: return df -def map_columns(df: pd.DataFrame, column_map: list[tuple]) -> pd.DataFrame: - """Map outdated columns to new column names and drop old columns.""" - - for start, end in column_map: - - df[f"{end}"] = [ - x for x in df[f"{start}"] - ] - - return df - def main(): """Main function.""" args = get_args() - dataset_input, grant_input, consortium_input, study_input, target_input, mapping = args.t, args.v, args.f, args.s, args.i, args.m + manifests, target_output, mapping = args.d, args.t, args.m - dataset, grant, consortium, study = None, None, None, None + syn = synapseclient.login() - source_metadata = [(dataset_input, dataset), (grant_input, grant), (consortium_input, consortium), (study_input, study)] + manifests_df = pd.read_csv(manifests, header=0).fillna("") + mapping_df = pd.read_csv(mapping, header=0).fillna("") - target = pd.read_csv(target_input, header=0).fillna("") + source_metadata_dict = dict(zip(manifests_df["Component"], manifests_df["Table_syn_id"])) - mapping = pd.read_csv(mapping, header=0).to_dict() + gc_template_dict = dict(zip(mapping_df["Property"], (zip(mapping_df["Node"], mapping_df["Acceptable Values"])))) - for input_file in source_metadata: - input_file[1] = pd.read_csv(input_file[0], header=0).fillna("") - - for col in target: - for k, v in mapping.items(): - if col == k: - source_col = v - if source_col in dataset.columns: - target[col] = dataset[source_col] - elif source_col in grant.columns: - target[col] = grant[source_col] - elif source_col in consortium.columns: - target[col] = consortium[source_col] - elif source_col in study.columns: - target[col] = study[source_col] - - 
target.to_csv("mapped_metadata.csv", index=False) + gc_mc2_mapping_dict = dict(zip(mapping_df["Property"], mapping_df["MC2 attribute"])) + + for type, table in source_metadata_dict.items(): + table_df = get_table(syn, table, cols="*") + source_metadata_dict[type] = (table_df, table_df.columns.tolist()) + + template_df = pd.DataFrame() + + for attribute, (template, valid_values) in gc_template_dict.items(): + if template == target_output: + template_df[attribute] = "" + for component, (df, cols) in source_metadata_dict.items(): + if gc_mc2_mapping_dict[attribute] in cols: + template_df[attribute] = df[gc_mc2_mapping_dict[attribute]] + + template_df.to_csv("mapped_metadata.csv", index=False) if __name__ == "__main__": main() From fd29230b8a7b5166dfd5e1cbc12cf62530d4b3b4 Mon Sep 17 00:00:00 2001 From: Orion Banks <49208907+Bankso@users.noreply.github.com> Date: Mon, 19 May 2025 11:54:10 -0700 Subject: [PATCH 03/67] Update .gitignore --- .gitignore | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 43d4d097..b17ddd0b 100644 --- a/.gitignore +++ b/.gitignore @@ -139,4 +139,5 @@ upload_check* *manifests/ annotations/inputs/* annotations/outputs/* -annotations/output/* \ No newline at end of file +annotations/output/* +mapped_metadata.csv From 4aa12264b7798294f81a8725085e01211eb07789 Mon Sep 17 00:00:00 2001 From: Orion Banks <49208907+Bankso@users.noreply.github.com> Date: Wed, 21 May 2025 13:28:14 -0700 Subject: [PATCH 04/67] Add script description --- utils/map_to_crdc.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/utils/map_to_crdc.py b/utils/map_to_crdc.py index 84ace9aa..0fe85551 100644 --- a/utils/map_to_crdc.py +++ b/utils/map_to_crdc.py @@ -1,7 +1,12 @@ -"""Clean and prep MC2 database tables for backpopulation +"""Map MC2 Center metadata to GC models. -This script will reorder and modify database table manifest columns -to match the respective View-type schema. +This script maps metadata from MC2 Center to the Genomic Commons (GC) models. +It retrieves metadata from Synapse tables, extracts relevant information, +and generates a CSV file with the mapped metadata. +The script requires the following command-line arguments: +1. -d: Path to the dataset metadata CSV file. An example of this file can be found here: https://docs.google.com/spreadsheets/d/1LLpSIFAh12YdKnGfzXMxGpoKCaEH90nDx-QvncaIJlk/edit?gid=288959359#gid=288959359 +2. -t: Target template name. +3. -m: Path to the target-to-source mapping CSV file. 
author: orion.banks """ @@ -9,7 +14,6 @@ import argparse import pandas as pd import synapseclient -import sys import re def get_args(): From 7d6ede0c840f12723f950b8e33e44b0ff86ae94a Mon Sep 17 00:00:00 2001 From: Orion Banks <49208907+Bankso@users.noreply.github.com> Date: Wed, 4 Jun 2025 17:50:12 -0700 Subject: [PATCH 05/67] Create example_input_map_to_crdc.csv Example input metadata reference sheet for map_to_crdc.py --- utils/example_input_map_to_crdc.csv | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 utils/example_input_map_to_crdc.csv diff --git a/utils/example_input_map_to_crdc.csv b/utils/example_input_map_to_crdc.csv new file mode 100644 index 00000000..94299112 --- /dev/null +++ b/utils/example_input_map_to_crdc.csv @@ -0,0 +1,2 @@ +Component,Table_syn_id +Biospecimen,syn65484828 \ No newline at end of file From de410d5f06519043c76dc22fc478dfe3432ae6bb Mon Sep 17 00:00:00 2001 From: Orion Banks <49208907+Bankso@users.noreply.github.com> Date: Tue, 24 Jun 2025 13:30:42 -0700 Subject: [PATCH 06/67] Update map_to_crdc.py Add else: continue to column populating loop, to decrease indent count Replace valid_values with "_" in for loop variable assignment --- utils/map_to_crdc.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/utils/map_to_crdc.py b/utils/map_to_crdc.py index 0fe85551..7f39eb6e 100644 --- a/utils/map_to_crdc.py +++ b/utils/map_to_crdc.py @@ -93,12 +93,14 @@ def main(): template_df = pd.DataFrame() - for attribute, (template, valid_values) in gc_template_dict.items(): + for attribute, (template, _) in gc_template_dict.items(): if template == target_output: template_df[attribute] = "" - for component, (df, cols) in source_metadata_dict.items(): - if gc_mc2_mapping_dict[attribute] in cols: - template_df[attribute] = df[gc_mc2_mapping_dict[attribute]] + else: + continue + for component, (df, cols) in source_metadata_dict.items(): + if gc_mc2_mapping_dict[attribute] in cols: + template_df[attribute] = df[gc_mc2_mapping_dict[attribute]] template_df.to_csv("mapped_metadata.csv", index=False) From 1f9a2bf535b8f29f885282cb2ec02df591176365 Mon Sep 17 00:00:00 2001 From: Orion Banks <49208907+Bankso@users.noreply.github.com> Date: Wed, 25 Jun 2025 09:50:03 -0700 Subject: [PATCH 07/67] Update example_input_map_to_crdc.csv --- utils/example_input_map_to_crdc.csv | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/utils/example_input_map_to_crdc.csv b/utils/example_input_map_to_crdc.csv index 94299112..b933d1b6 100644 --- a/utils/example_input_map_to_crdc.csv +++ b/utils/example_input_map_to_crdc.csv @@ -1,2 +1,6 @@ Component,Table_syn_id -Biospecimen,syn65484828 \ No newline at end of file +Biospecimen,syn65877820 +FileView,syn65877828 +Individual,syn65877815 +ImagingLevel2,syn65880241 +Study,syn65877778 From 1dc4136d4d00a49c35e1705a81243bacf6d3b9b9 Mon Sep 17 00:00:00 2001 From: Orion Banks <49208907+Bankso@users.noreply.github.com> Date: Wed, 25 Jun 2025 09:50:13 -0700 Subject: [PATCH 08/67] Update map_to_crdc.py --- utils/map_to_crdc.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/utils/map_to_crdc.py b/utils/map_to_crdc.py index 7f39eb6e..723b18be 100644 --- a/utils/map_to_crdc.py +++ b/utils/map_to_crdc.py @@ -102,7 +102,8 @@ def main(): if gc_mc2_mapping_dict[attribute] in cols: template_df[attribute] = df[gc_mc2_mapping_dict[attribute]] - template_df.to_csv("mapped_metadata.csv", index=False) + template_df.to_csv(f"{target_output}_mapped_metadata.csv", index=False) + print(f"Mapped metadata saved 
to {target_output}_mapped_metadata.csv") if __name__ == "__main__": main() From 161ac28f81638246256749dd2017233a34115341 Mon Sep 17 00:00:00 2001 From: Orion Banks <49208907+Bankso@users.noreply.github.com> Date: Thu, 26 Jun 2025 17:45:34 -0700 Subject: [PATCH 09/67] Refactor code to pull info from datasets --- utils/map_to_crdc.py | 37 +++++++++++++++++++++++++------------ 1 file changed, 25 insertions(+), 12 deletions(-) diff --git a/utils/map_to_crdc.py b/utils/map_to_crdc.py index 723b18be..1a369d9e 100644 --- a/utils/map_to_crdc.py +++ b/utils/map_to_crdc.py @@ -14,6 +14,7 @@ import argparse import pandas as pd import synapseclient +from synapseclient.models import query import re def get_args(): @@ -43,13 +44,15 @@ def get_args(): return parser.parse_args() -def get_table(syn, source_id: str, cols: str | list = "*") -> pd.DataFrame: +def get_table(syn, source_id: str, cols: str | list = "*", conditions: str | None = None) -> pd.DataFrame: """Collect columns from a Synapse table entity and return as a Dataframe.""" if type(cols) == list: cols = ", ".join(["".join(['"', col, '"']) for col in cols]) query = f"SELECT {cols} FROM {source_id}" + if conditions is not None: + query += f" WHERE {conditions}" table = syn.tableQuery(query).asDataFrame().fillna("") print(f"Data acquired from Synapse table {source_id}") @@ -81,28 +84,38 @@ def main(): manifests_df = pd.read_csv(manifests, header=0).fillna("") mapping_df = pd.read_csv(mapping, header=0).fillna("") - source_metadata_dict = dict(zip(manifests_df["Component"], manifests_df["Table_syn_id"])) + source_metadata_dict = dict(zip(manifests_df["entity_id"], (zip(manifests_df["data_type"], manifests_df["study_key"])))) gc_template_dict = dict(zip(mapping_df["Property"], (zip(mapping_df["Node"], mapping_df["Acceptable Values"])))) gc_mc2_mapping_dict = dict(zip(mapping_df["Property"], mapping_df["MC2 attribute"])) - for type, table in source_metadata_dict.items(): - table_df = get_table(syn, table, cols="*") - source_metadata_dict[type] = (table_df, table_df.columns.tolist()) - template_df = pd.DataFrame() for attribute, (template, _) in gc_template_dict.items(): if template == target_output: - template_df[attribute] = "" + template_df[attribute] = "" # create GC template columns + print(f"{attribute} added to template \n") + + template_df["crdc_id"] = "" + attribute_list = template_df.columns.tolist() + + for id, (data_type, study_key) in source_metadata_dict.items(): + if data_type == "Study" and target_output in ["study", "image"]: + df = get_table(syn, id, cols="*", conditions=f"Study_id = '{study_key}'") + elif target_output != "study": + if data_type not in ["Study"]: + df = query(query=f"SELECT * FROM {id}") else: - continue - for component, (df, cols) in source_metadata_dict.items(): - if gc_mc2_mapping_dict[attribute] in cols: - template_df[attribute] = df[gc_mc2_mapping_dict[attribute]] + df = pd.DataFrame() + source_metadata_dict[id] = (data_type, df, df.columns.tolist()) + + for _, (data_type, df, cols) in source_metadata_dict.items(): + mapped_attributes = [attribute for attribute in attribute_list if "".join("".join(str(gc_mc2_mapping_dict[attribute]).split(" ")).split("-")) in cols] + mapped_df = df.rename(columns={"".join("".join(str(gc_mc2_mapping_dict[attribute]).split(" ")).split("-")): attribute for attribute in mapped_attributes}) + template_df = pd.concat([template_df, mapped_df]).drop_duplicates(subset=attribute_list, keep="first").reset_index(drop=True) - template_df.to_csv(f"{target_output}_mapped_metadata.csv", 
index=False) + template_df[attribute_list].to_csv(f"{target_output}_mapped_metadata.csv", index=False) print(f"Mapped metadata saved to {target_output}_mapped_metadata.csv") if __name__ == "__main__": From 3cca9a53f2caf459c49a2268cb8e669a01414f87 Mon Sep 17 00:00:00 2001 From: Orion Banks <49208907+Bankso@users.noreply.github.com> Date: Thu, 26 Jun 2025 17:45:43 -0700 Subject: [PATCH 10/67] Update example_input_map_to_crdc.csv --- utils/example_input_map_to_crdc.csv | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/utils/example_input_map_to_crdc.csv b/utils/example_input_map_to_crdc.csv index b933d1b6..7a9fa016 100644 --- a/utils/example_input_map_to_crdc.csv +++ b/utils/example_input_map_to_crdc.csv @@ -1,6 +1,5 @@ -Component,Table_syn_id -Biospecimen,syn65877820 -FileView,syn65877828 -Individual,syn65877815 -ImagingLevel2,syn65880241 -Study,syn65877778 +entity_id,data_type,study_key +syn64713344,ImagingLevel2,CA261841-CR_2024 +syn64713343,ImagingLevel2,CA261841-CR_2024 +syn64779989,ImagingLevel3,CA261841-CR_2024 +syn65877778,Study,CA261841-CR_2024 From 7b37f9d2452b30e6c1b74848dabf27ca2c8f279e Mon Sep 17 00:00:00 2001 From: Orion Banks <49208907+Bankso@users.noreply.github.com> Date: Thu, 26 Jun 2025 17:45:51 -0700 Subject: [PATCH 11/67] Update table_to_annotations.py --- utils/table_to_annotations.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/table_to_annotations.py b/utils/table_to_annotations.py index 68da605b..6a174659 100644 --- a/utils/table_to_annotations.py +++ b/utils/table_to_annotations.py @@ -353,7 +353,7 @@ def main(): collect_record_annotations(syn, model_info_tuple, model_dict, keys_to_drop) if datasetview_table is not None: - collect_dataset_annotations(syn, target, dataset_info_tuple, keys_to_drop) + collect_dataset_annotations(syn, target, dataset_info_tuple, keys_to_drop=None) if __name__ == "__main__": From e0500ddbab65f7f16fdd553115e2e6cb215f9e10 Mon Sep 17 00:00:00 2001 From: Orion Banks <49208907+Bankso@users.noreply.github.com> Date: Mon, 30 Jun 2025 14:24:59 -0700 Subject: [PATCH 12/67] Add functionality to group files in scope folder Addresses an issue where creating/adding files to a Dataset is "Forbidden", which appears to show up when large numbers of files are being added to a single dataset. The changes in this commit will identify the number of groups required to fit within a pre-defined file max, create additional dataset(s) required for the file groups, then add one file group to each dataset. 
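As an illustrative sketch only (not part of the patch below), the grouping step described above amounts to slicing the flat list of dataset items into groups of at most file_max entries; the helper name here is a placeholder, not the committed code:

def chunk_files(scope_files: list, file_max: int) -> list[list]:
    # one slice per group of up to file_max items; the last slice holds the remainder
    return [scope_files[i:i + file_max] for i in range(0, len(scope_files), file_max)]

# chunk_files(list(range(12)), 5) -> [[0, 1, 2, 3, 4], [5, 6, 7, 8, 9], [10, 11]]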
--- utils/build_datasets.py | 109 +++++++++++++++++++++++++++------------- 1 file changed, 73 insertions(+), 36 deletions(-) diff --git a/utils/build_datasets.py b/utils/build_datasets.py index 1f9f5c0f..cf8eae13 100644 --- a/utils/build_datasets.py +++ b/utils/build_datasets.py @@ -12,6 +12,7 @@ import argparse import os import pandas as pd +import random import re import synapseclient from synapseclient import Dataset @@ -47,34 +48,47 @@ def get_table(syn, source_id: str) -> pd.DataFrame: return table -def filter_files_in_folder(syn, scope: str, formats: list[str]) -> list: +def filter_files_in_folder(syn, scope: str, formats: list[str], folder_or_files: str) -> list: """Capture all files in provided scope and select files that match a list of formats, return list of dataset items""" dataset_items = [] walk_path = synapseutils.walk(syn, scope, ["file"]) for *_, filename in walk_path: - for f, entity_id in filename: - if any(f.endswith(fmt) for fmt in formats): # only select files of desired format - dataset_items.append({ + if folder_or_files == "files": + file_to_add = [entity_id for f, entity_id in filename if any(f.endswith(fmt) for fmt in formats)] # only select files of desired format + elif folder_or_files == "folder": + file_to_add = [entity_id for f, entity_id in filename] # select all files in folder + for entity_id in file_to_add: + dataset_items.append({ "entityId": entity_id, "versionNumber": syn.get(entity_id, downloadFile=False).versionLabel }) - + dataset_len = len(dataset_items) + print(f"--> {dataset_len} files found...") return dataset_items -def create_dataset_entity(syn, name: str, grant: str) -> Dataset: +def create_dataset_entity(syn, name: str, grant: str, multi_dataset: bool) -> Dataset: """Create an empty Synapse Dataset using the Project associated with the applicable grant number as parent. 
Return the Dataset object.""" query = f"SELECT grantId FROM syn21918972 WHERE grantViewId='{grant}'" project_id = syn.tableQuery(query).asDataFrame().iat[0, 0] + if multi_dataset: + name = f"{name}-{random.randint(1000, 9999)}" # append random number to name for multi-dataset dataset = Dataset(name=name, parent=project_id) return dataset +def chunk_files_for_dataset(scope_files: list[str], file_max: int, dataset_total: int) -> list[list[str]]: + """Chunk files into lists of size file_max for Dataset creation.""" + file_groups = [] + for i in range(dataset_total): + file_group = scope_files[i * file_max:(i + 1) * file_max] + file_groups.append(file_group) + return file_groups def main(): @@ -84,6 +98,8 @@ def main(): dsp, new_name = args.d, args.n update_dsp_sheet = None + + file_max = 10000 # maximum number of files per Dataset if os.path.exists(dsp): dsp_df = pd.read_csv(dsp, keep_default_na=False) @@ -97,7 +113,7 @@ def main(): ) exit() - if dsp_df.iat[1, 0] == "DataDSP": + if dsp_df.iat[0, 0] == "DataDSP": count = 0 for _, row in dsp_df.iterrows(): grant_id = row["GrantView Key"] @@ -110,44 +126,65 @@ def main(): print(f"Skipping Dataset {dataset_name} of type {level}") continue # move to next table entry if not data files - print(f"\nProcessing Dataset {dataset_name}") + dataset_id_list = [] + file_scope_list = [] + dataset_name_list = [] + + if formats: # only filter files if formats were specified + print(f"--> Filtering files from {scope_id}") + folder_or_files = "files" # filter files by extension/format + else: + folder_or_files = "folder" # whole folder should be added, don't filter files + + scope_files = filter_files_in_folder(syn, scope_id, formats, folder_or_files) + print(f"--> {scope_id} files acquired!\n {len(scope_files)} files will be added to the Dataset.") + if dataset_id: # check if a Dataset entity was previously recorded - print(f"--> Accessing Dataset {dataset_id}") - dataset = syn.get(dataset_id) - print(f"--> {dataset_id} accessed!") + print(f"--> Files will be added to Dataset {dataset_id}") + dataset = syn.get(dataset_id, downloadFile=False) else: - print( - f"--> A new Dataset will be created for files from {scope_id}" - ) - dataset = create_dataset_entity(syn, dataset_name, grant_id) + dataset = create_dataset_entity(syn, dataset_name, grant_id, multi_dataset=False) update_dsp_sheet = True # record the new DatasetView_id in DSP - print(f"--> New Dataset created!") + print(f"--> New Dataset created for files from {scope_id}") + + dataset_id_list.append(dataset.id) + dataset_name_list.append(dataset.name) - if formats: # only filter files if formats were specified - print(f"--> Filtering files from {scope_id}") - scope_files = filter_files_in_folder(syn, scope_id, formats) - folder_or_files = "files" # use add_items function + if len(scope_files) > file_max: + dataset_total = (len(scope_files) // file_max) + 1 + multi_dataset = True + update_dsp_sheet = True print( - f"--> {scope_id} files filtered!\n {len(scope_files)} files will be added to the Dataset." 
+ f"--> File count exceeds file max.\n--> Creating {dataset_total} new Datasets for files from {scope_id}" ) + for i in range(dataset_total): + dataset = create_dataset_entity(syn, dataset_name, grant_id, multi_dataset) + print(f"--> New Dataset created!") + dataset_id_list.append(dataset.id) + dataset_name_list.append(dataset.name) + file_scope_list = chunk_files_for_dataset(scope_files, file_max, dataset_total) else: - folder_or_files = "folder" # whole folder should be added, use add_folder function - - if folder_or_files == "folder": - print(f"--> Adding Folder {scope_id} to Dataset {dataset_id}") - dataset.add_folder(scope_id, force=True) - print(f"--> Folder added to Dataset!") - elif folder_or_files == "files": - print(f"--> Adding Files from {scope_id} to Dataset {dataset_id}") - dataset.add_items(dataset_items=scope_files, force=True) - print(f"--> Files added to Dataset!") + multi_dataset = False + file_scope_list = [scope_files] # single dataset, no chunking needed - dataset = syn.store(dataset) - print(f"Dataset {dataset_id} successfully stored in {dataset.parentId}") + dataset_tuples = zip(dataset_id_list, file_scope_list, dataset_name_list) - if update_dsp_sheet is not None: - dataset_id = dataset.id - dsp_df.at[_, "DatasetView Key"] = dataset_id + for dataset_id, scope_files, name in dataset_tuples: + dataset = syn.get(dataset_id, downloadFile=False) + dataset.add_items(dataset_items=scope_files, force=True) + print(f"--> Files added to Dataset!") + dataset = syn.store(dataset) + print(f"Dataset {dataset.id} successfully stored in {dataset.parentId}") + if update_dsp_sheet is not None: + temp_df = pd.DataFrame() + if multi_dataset: + temp_df.loc[_] = dsp_df.loc[_] + temp_df[_, "DatasetView Key"] = dataset.id + temp_df[_, "DSP Dataset Name"] = name + dsp_df = pd.concat([dsp_df, temp_df], ignore_index=True) + else: + dataset_id = dataset.id + dsp_df.at[_, "DatasetView Key"] = dataset_id count += 1 else: From 4979407bc5be6410b7b296def266edf21d83ad74 Mon Sep 17 00:00:00 2001 From: Orion Banks <49208907+Bankso@users.noreply.github.com> Date: Mon, 30 Jun 2025 17:24:30 -0700 Subject: [PATCH 13/67] Update build_datasets.py --- utils/build_datasets.py | 80 +++++++++++++++++++++++------------------ 1 file changed, 45 insertions(+), 35 deletions(-) diff --git a/utils/build_datasets.py b/utils/build_datasets.py index cf8eae13..81a95423 100644 --- a/utils/build_datasets.py +++ b/utils/build_datasets.py @@ -69,7 +69,7 @@ def filter_files_in_folder(syn, scope: str, formats: list[str], folder_or_files: return dataset_items -def create_dataset_entity(syn, name: str, grant: str, multi_dataset: bool) -> Dataset: +def create_dataset_entity(syn, name: str, grant: str, multi_dataset: bool, scope: list) -> tuple[Dataset, str]: """Create an empty Synapse Dataset using the Project associated with the applicable grant number as parent. 
Return the Dataset object.""" @@ -78,16 +78,20 @@ def create_dataset_entity(syn, name: str, grant: str, multi_dataset: bool) -> Da project_id = syn.tableQuery(query).asDataFrame().iat[0, 0] if multi_dataset: name = f"{name}-{random.randint(1000, 9999)}" # append random number to name for multi-dataset - dataset = Dataset(name=name, parent=project_id) + dataset = Dataset(name=name, parent=project_id, dataset_items=scope) + dataset = syn.store(dataset) return dataset def chunk_files_for_dataset(scope_files: list[str], file_max: int, dataset_total: int) -> list[list[str]]: """Chunk files into lists of size file_max for Dataset creation.""" file_groups = [] - for i in range(dataset_total): - file_group = scope_files[i * file_max:(i + 1) * file_max] - file_groups.append(file_group) + i = 0 + while i < dataset_total: + file_groups.append(scope_files[i * file_max:(i + 1) * file_max]) + i += 1 + if i == dataset_total: + file_groups.append(scope_files[i * file_max:]) return file_groups def main(): @@ -98,8 +102,11 @@ def main(): dsp, new_name = args.d, args.n update_dsp_sheet = None + create_dataset = False + multi_dataset = False + dataset_total = 0 - file_max = 10000 # maximum number of files per Dataset + file_max = 3 # maximum number of files per Dataset if os.path.exists(dsp): dsp_df = pd.read_csv(dsp, keep_default_na=False) @@ -130,6 +137,13 @@ def main(): file_scope_list = [] dataset_name_list = [] + if dataset_id: # check if a Dataset entity was previously recorded + print(f"--> Files will be added to Dataset {dataset_id}") + else: + dataset_total = dataset_total + 1 + create_dataset = True + update_dsp_sheet = True + if formats: # only filter files if formats were specified print(f"--> Filtering files from {scope_id}") folder_or_files = "files" # filter files by extension/format @@ -138,53 +152,49 @@ def main(): scope_files = filter_files_in_folder(syn, scope_id, formats, folder_or_files) print(f"--> {scope_id} files acquired!\n {len(scope_files)} files will be added to the Dataset.") - - if dataset_id: # check if a Dataset entity was previously recorded - print(f"--> Files will be added to Dataset {dataset_id}") - dataset = syn.get(dataset_id, downloadFile=False) - else: - dataset = create_dataset_entity(syn, dataset_name, grant_id, multi_dataset=False) - update_dsp_sheet = True # record the new DatasetView_id in DSP - print(f"--> New Dataset created for files from {scope_id}") - dataset_id_list.append(dataset.id) - dataset_name_list.append(dataset.name) - if len(scope_files) > file_max: - dataset_total = (len(scope_files) // file_max) + 1 + dataset_total = dataset_total + (len(scope_files) // file_max) multi_dataset = True update_dsp_sheet = True + create_dataset = True print( f"--> File count exceeds file max.\n--> Creating {dataset_total} new Datasets for files from {scope_id}" ) - for i in range(dataset_total): - dataset = create_dataset_entity(syn, dataset_name, grant_id, multi_dataset) - print(f"--> New Dataset created!") - dataset_id_list.append(dataset.id) - dataset_name_list.append(dataset.name) file_scope_list = chunk_files_for_dataset(scope_files, file_max, dataset_total) + else: multi_dataset = False file_scope_list = [scope_files] # single dataset, no chunking needed - dataset_tuples = zip(dataset_id_list, file_scope_list, dataset_name_list) - - for dataset_id, scope_files, name in dataset_tuples: + if dataset_id: dataset = syn.get(dataset_id, downloadFile=False) - dataset.add_items(dataset_items=scope_files, force=True) - print(f"--> Files added to Dataset!") + 
dataset_id_list.append(dataset.id) + dataset_name_list.append(dataset.name) + dataset.add_items(dataset_items=file_scope_list[0], force=True) dataset = syn.store(dataset) - print(f"Dataset {dataset.id} successfully stored in {dataset.parentId}") + print(f"--> Files added to existing Dataset {dataset.id}") + file_scope_list = file_scope_list[1:] # remove first item, already added + + if create_dataset: + for i, scope in zip(range(dataset_total), file_scope_list): + dataset = create_dataset_entity(syn, dataset_name, grant_id, multi_dataset, scope) + print(f"--> New Dataset created and populated with files!") + dataset_id_list.append(dataset.id) + dataset_name_list.append(dataset.name) + + dataset_tuples = zip(dataset_id_list, dataset_name_list) + + for dataset_id, name in dataset_tuples: if update_dsp_sheet is not None: - temp_df = pd.DataFrame() + temp_df = dsp_df.copy() if multi_dataset: - temp_df.loc[_] = dsp_df.loc[_] - temp_df[_, "DatasetView Key"] = dataset.id - temp_df[_, "DSP Dataset Name"] = name + temp_df.at[_, "DatasetView Key"] = dataset_id + temp_df.at[_, "DSP Dataset Name"] = name dsp_df = pd.concat([dsp_df, temp_df], ignore_index=True) else: - dataset_id = dataset.id dsp_df.at[_, "DatasetView Key"] = dataset_id + dsp_df.drop_duplicates(subset=["DatasetView Key"], keep="last", inplace=True) count += 1 else: @@ -193,7 +203,7 @@ def main(): ) exit() - print(f"\n\nDONE ✅\n{count} Datasets processed") + print(f"\n\nDONE ✅\n{count} DSP entries processed") if update_dsp_sheet is not None: dsp_path = f"{os.getcwd()}/{new_name}.csv" From 534353516b4d3335ab4a63adf70cdfbe91ccaed4 Mon Sep 17 00:00:00 2001 From: Orion Banks <49208907+Bankso@users.noreply.github.com> Date: Wed, 2 Jul 2025 15:55:35 -0700 Subject: [PATCH 14/67] Update .gitignore Don't track utils/example_files --- .gitignore | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index b17ddd0b..19d73c74 100644 --- a/.gitignore +++ b/.gitignore @@ -140,4 +140,4 @@ upload_check* annotations/inputs/* annotations/outputs/* annotations/output/* -mapped_metadata.csv +utils/example_files/* From 87802916c141a9f98e1de41912d09c97c2b8b2ba Mon Sep 17 00:00:00 2001 From: Orion Banks <49208907+Bankso@users.noreply.github.com> Date: Wed, 2 Jul 2025 16:02:36 -0700 Subject: [PATCH 15/67] Update build_datasets.py Additional updates to add large numbers of files to datasets. The script will now create an appropriate number of groups based on the number of files, taking into account the number of datasets that will be used to store the files. Newly created datasets are added as new rows to the DSP and include the values from the source row. 
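A minimal sketch of the DSP row handling described above (not the patch itself): each newly created Dataset gets a copy of the source row with the key and name columns overridden. The helper name is hypothetical; the column names match those used elsewhere in the script:

import pandas as pd

def append_new_dataset_rows(dsp_df: pd.DataFrame, source_idx: int, new_datasets: list[tuple[str, str]]) -> pd.DataFrame:
    rows = []
    for dataset_id, dataset_name in new_datasets:
        row = dsp_df.loc[source_idx].copy()  # carry over all values from the source row
        row["DatasetView Key"] = dataset_id
        row["DSP Dataset Name"] = dataset_name
        rows.append(row)
    return pd.concat([dsp_df, pd.DataFrame(rows)], ignore_index=True)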
--- utils/build_datasets.py | 45 ++++++++++++++++++++++------------------- 1 file changed, 24 insertions(+), 21 deletions(-) diff --git a/utils/build_datasets.py b/utils/build_datasets.py index 81a95423..0fb6052c 100644 --- a/utils/build_datasets.py +++ b/utils/build_datasets.py @@ -90,8 +90,11 @@ def chunk_files_for_dataset(scope_files: list[str], file_max: int, dataset_total while i < dataset_total: file_groups.append(scope_files[i * file_max:(i + 1) * file_max]) i += 1 - if i == dataset_total: - file_groups.append(scope_files[i * file_max:]) + if i == dataset_total: + if len(scope_files) % file_max != 0: + file_groups.append(scope_files[i * file_max:]) + else: + file_groups.append(scope_files[(i - 1) * file_max:i * file_max]) return file_groups def main(): @@ -101,15 +104,14 @@ def main(): args = get_args() dsp, new_name = args.d, args.n + update_dsp_sheet = None create_dataset = False multi_dataset = False - dataset_total = 0 + file_max = 5000 # maximum number of files per Dataset; set to 5000 to avoid web page latency issues - file_max = 3 # maximum number of files per Dataset - if os.path.exists(dsp): - dsp_df = pd.read_csv(dsp, keep_default_na=False) + dsp_df = pd.read_csv(dsp, keep_default_na=False, header=0) print("\nData Sharing Plan read successfully!") elif "syn" in dsp: dsp_df = get_table(syn, dsp) @@ -121,6 +123,7 @@ def main(): exit() if dsp_df.iat[0, 0] == "DataDSP": + updated_df = pd.DataFrame(columns=dsp_df.columns) count = 0 for _, row in dsp_df.iterrows(): grant_id = row["GrantView Key"] @@ -133,6 +136,7 @@ def main(): print(f"Skipping Dataset {dataset_name} of type {level}") continue # move to next table entry if not data files + dataset_total = 1 dataset_id_list = [] file_scope_list = [] dataset_name_list = [] @@ -140,7 +144,6 @@ def main(): if dataset_id: # check if a Dataset entity was previously recorded print(f"--> Files will be added to Dataset {dataset_id}") else: - dataset_total = dataset_total + 1 create_dataset = True update_dsp_sheet = True @@ -154,14 +157,15 @@ def main(): print(f"--> {scope_id} files acquired!\n {len(scope_files)} files will be added to the Dataset.") if len(scope_files) > file_max: - dataset_total = dataset_total + (len(scope_files) // file_max) + new_dataset_count = (len(scope_files) // file_max) + dataset_total = dataset_total + new_dataset_count multi_dataset = True update_dsp_sheet = True create_dataset = True print( - f"--> File count exceeds file max.\n--> Creating {dataset_total} new Datasets for files from {scope_id}" + f"--> File count exceeds file max.\n--> Creating {dataset_total} groups for files from {scope_id}" ) - file_scope_list = chunk_files_for_dataset(scope_files, file_max, dataset_total) + file_scope_list = chunk_files_for_dataset(scope_files, file_max, new_dataset_count) else: multi_dataset = False @@ -172,31 +176,30 @@ def main(): dataset_id_list.append(dataset.id) dataset_name_list.append(dataset.name) dataset.add_items(dataset_items=file_scope_list[0], force=True) - dataset = syn.store(dataset) + syn.store(dataset) print(f"--> Files added to existing Dataset {dataset.id}") file_scope_list = file_scope_list[1:] # remove first item, already added if create_dataset: - for i, scope in zip(range(dataset_total), file_scope_list): + for scope in file_scope_list: dataset = create_dataset_entity(syn, dataset_name, grant_id, multi_dataset, scope) print(f"--> New Dataset created and populated with files!") dataset_id_list.append(dataset.id) dataset_name_list.append(dataset.name) + + count += 1 dataset_tuples = 
zip(dataset_id_list, dataset_name_list) for dataset_id, name in dataset_tuples: if update_dsp_sheet is not None: temp_df = dsp_df.copy() - if multi_dataset: - temp_df.at[_, "DatasetView Key"] = dataset_id - temp_df.at[_, "DSP Dataset Name"] = name - dsp_df = pd.concat([dsp_df, temp_df], ignore_index=True) - else: - dsp_df.at[_, "DatasetView Key"] = dataset_id - dsp_df.drop_duplicates(subset=["DatasetView Key"], keep="last", inplace=True) + temp_df.iloc[[_]] = row + temp_df.at[_, "DatasetView Key"] = dataset_id + temp_df.at[_, "DSP Dataset Name"] = name + updated_df = pd.concat([updated_df, temp_df], ignore_index=True) + updated_df.drop_duplicates(subset=["DatasetView Key"], keep="last", inplace=True) - count += 1 else: print( f"❗❗❗ The table provided does not appear to be a Dataset Sharing Plan.❗❗❗\nPlease check its contents and try again." @@ -207,7 +210,7 @@ def main(): if update_dsp_sheet is not None: dsp_path = f"{os.getcwd()}/{new_name}.csv" - dsp_df.to_csv(path_or_buf=dsp_path, index=False) + updated_df.to_csv(path_or_buf=dsp_path, index=False) print(f"\nDSP sheet has been updated\nPath: {dsp_path}") From 589dde0e9cf53195a1e111e8e89402e52dcf22a9 Mon Sep 17 00:00:00 2001 From: Orion Banks <49208907+Bankso@users.noreply.github.com> Date: Wed, 2 Jul 2025 16:03:33 -0700 Subject: [PATCH 16/67] Moved crdc example input to example_files folder --- utils/example_input_map_to_crdc.csv | 5 ----- 1 file changed, 5 deletions(-) delete mode 100644 utils/example_input_map_to_crdc.csv diff --git a/utils/example_input_map_to_crdc.csv b/utils/example_input_map_to_crdc.csv deleted file mode 100644 index 7a9fa016..00000000 --- a/utils/example_input_map_to_crdc.csv +++ /dev/null @@ -1,5 +0,0 @@ -entity_id,data_type,study_key -syn64713344,ImagingLevel2,CA261841-CR_2024 -syn64713343,ImagingLevel2,CA261841-CR_2024 -syn64779989,ImagingLevel3,CA261841-CR_2024 -syn65877778,Study,CA261841-CR_2024 From b0fbf787e85c756da1861be2bd4b9e29430a93d7 Mon Sep 17 00:00:00 2001 From: Orion Banks <49208907+Bankso@users.noreply.github.com> Date: Fri, 11 Jul 2025 11:43:38 -0700 Subject: [PATCH 17/67] Update map_to_crdc.py --- utils/map_to_crdc.py | 31 ++++++++++++------------------- 1 file changed, 12 insertions(+), 19 deletions(-) diff --git a/utils/map_to_crdc.py b/utils/map_to_crdc.py index 1a369d9e..19f50515 100644 --- a/utils/map_to_crdc.py +++ b/utils/map_to_crdc.py @@ -41,6 +41,13 @@ def get_args(): required=True, default=None, ) + parser.add_argument( + "-o", + type=str, + help="Output directory", + required=False, + default="./output", + ) return parser.parse_args() @@ -59,25 +66,12 @@ def get_table(syn, source_id: str, cols: str | list = "*", conditions: str | Non return table -def extract_lists(df: pd.DataFrame, list_columns, pattern) -> pd.DataFrame: - """Extract bracketed/quoted lists from sheets.""" - - for col in list_columns: - - df[col] = ( - df[col] - .apply(lambda x: re.findall(pattern, x)) - .str.join(", ")) - - return df - - def main(): """Main function.""" args = get_args() - manifests, target_output, mapping = args.d, args.t, args.m + manifests, target_output, mapping, out_dir = args.d, args.t, args.m, args.o syn = synapseclient.login() @@ -103,9 +97,8 @@ def main(): for id, (data_type, study_key) in source_metadata_dict.items(): if data_type == "Study" and target_output in ["study", "image"]: df = get_table(syn, id, cols="*", conditions=f"Study_id = '{study_key}'") - elif target_output != "study": - if data_type not in ["Study"]: - df = query(query=f"SELECT * FROM {id}") + elif data_type != "Study" 
and target_output != "study": + df = query(query=f"SELECT * FROM {id}") else: df = pd.DataFrame() source_metadata_dict[id] = (data_type, df, df.columns.tolist()) @@ -115,8 +108,8 @@ def main(): mapped_df = df.rename(columns={"".join("".join(str(gc_mc2_mapping_dict[attribute]).split(" ")).split("-")): attribute for attribute in mapped_attributes}) template_df = pd.concat([template_df, mapped_df]).drop_duplicates(subset=attribute_list, keep="first").reset_index(drop=True) - template_df[attribute_list].to_csv(f"{target_output}_mapped_metadata.csv", index=False) - print(f"Mapped metadata saved to {target_output}_mapped_metadata.csv") + template_df[attribute_list].to_csv(f"{out_dir}/{target_output}_mapped_metadata.csv", index=False) + print(f"Mapped metadata saved to {out_dir}/{target_output}_mapped_metadata.csv") if __name__ == "__main__": main() From 7419461c593da1cb58276cfca91f81fd1be16b76 Mon Sep 17 00:00:00 2001 From: Orion Banks <49208907+Bankso@users.noreply.github.com> Date: Fri, 11 Jul 2025 12:21:28 -0700 Subject: [PATCH 18/67] Revert changes to build_datasets.py New features are being logged in a separate branch --- utils/build_datasets.py | 134 +++++++++++++--------------------------- 1 file changed, 42 insertions(+), 92 deletions(-) diff --git a/utils/build_datasets.py b/utils/build_datasets.py index 0fb6052c..1f9f5c0f 100644 --- a/utils/build_datasets.py +++ b/utils/build_datasets.py @@ -12,7 +12,6 @@ import argparse import os import pandas as pd -import random import re import synapseclient from synapseclient import Dataset @@ -48,54 +47,34 @@ def get_table(syn, source_id: str) -> pd.DataFrame: return table -def filter_files_in_folder(syn, scope: str, formats: list[str], folder_or_files: str) -> list: +def filter_files_in_folder(syn, scope: str, formats: list[str]) -> list: """Capture all files in provided scope and select files that match a list of formats, return list of dataset items""" dataset_items = [] walk_path = synapseutils.walk(syn, scope, ["file"]) for *_, filename in walk_path: - if folder_or_files == "files": - file_to_add = [entity_id for f, entity_id in filename if any(f.endswith(fmt) for fmt in formats)] # only select files of desired format - elif folder_or_files == "folder": - file_to_add = [entity_id for f, entity_id in filename] # select all files in folder - for entity_id in file_to_add: - dataset_items.append({ + for f, entity_id in filename: + if any(f.endswith(fmt) for fmt in formats): # only select files of desired format + dataset_items.append({ "entityId": entity_id, "versionNumber": syn.get(entity_id, downloadFile=False).versionLabel }) - dataset_len = len(dataset_items) - print(f"--> {dataset_len} files found...") + return dataset_items -def create_dataset_entity(syn, name: str, grant: str, multi_dataset: bool, scope: list) -> tuple[Dataset, str]: +def create_dataset_entity(syn, name: str, grant: str) -> Dataset: """Create an empty Synapse Dataset using the Project associated with the applicable grant number as parent. 
Return the Dataset object.""" query = f"SELECT grantId FROM syn21918972 WHERE grantViewId='{grant}'" project_id = syn.tableQuery(query).asDataFrame().iat[0, 0] - if multi_dataset: - name = f"{name}-{random.randint(1000, 9999)}" # append random number to name for multi-dataset - dataset = Dataset(name=name, parent=project_id, dataset_items=scope) - dataset = syn.store(dataset) + dataset = Dataset(name=name, parent=project_id) return dataset -def chunk_files_for_dataset(scope_files: list[str], file_max: int, dataset_total: int) -> list[list[str]]: - """Chunk files into lists of size file_max for Dataset creation.""" - file_groups = [] - i = 0 - while i < dataset_total: - file_groups.append(scope_files[i * file_max:(i + 1) * file_max]) - i += 1 - if i == dataset_total: - if len(scope_files) % file_max != 0: - file_groups.append(scope_files[i * file_max:]) - else: - file_groups.append(scope_files[(i - 1) * file_max:i * file_max]) - return file_groups def main(): @@ -104,14 +83,10 @@ def main(): args = get_args() dsp, new_name = args.d, args.n - update_dsp_sheet = None - create_dataset = False - multi_dataset = False - file_max = 5000 # maximum number of files per Dataset; set to 5000 to avoid web page latency issues - + if os.path.exists(dsp): - dsp_df = pd.read_csv(dsp, keep_default_na=False, header=0) + dsp_df = pd.read_csv(dsp, keep_default_na=False) print("\nData Sharing Plan read successfully!") elif "syn" in dsp: dsp_df = get_table(syn, dsp) @@ -122,8 +97,7 @@ def main(): ) exit() - if dsp_df.iat[0, 0] == "DataDSP": - updated_df = pd.DataFrame(columns=dsp_df.columns) + if dsp_df.iat[1, 0] == "DataDSP": count = 0 for _, row in dsp_df.iterrows(): grant_id = row["GrantView Key"] @@ -136,81 +110,57 @@ def main(): print(f"Skipping Dataset {dataset_name} of type {level}") continue # move to next table entry if not data files - dataset_total = 1 - dataset_id_list = [] - file_scope_list = [] - dataset_name_list = [] - + print(f"\nProcessing Dataset {dataset_name}") if dataset_id: # check if a Dataset entity was previously recorded - print(f"--> Files will be added to Dataset {dataset_id}") + print(f"--> Accessing Dataset {dataset_id}") + dataset = syn.get(dataset_id) + print(f"--> {dataset_id} accessed!") else: - create_dataset = True - update_dsp_sheet = True - + print( + f"--> A new Dataset will be created for files from {scope_id}" + ) + dataset = create_dataset_entity(syn, dataset_name, grant_id) + update_dsp_sheet = True # record the new DatasetView_id in DSP + print(f"--> New Dataset created!") + if formats: # only filter files if formats were specified print(f"--> Filtering files from {scope_id}") - folder_or_files = "files" # filter files by extension/format - else: - folder_or_files = "folder" # whole folder should be added, don't filter files - - scope_files = filter_files_in_folder(syn, scope_id, formats, folder_or_files) - print(f"--> {scope_id} files acquired!\n {len(scope_files)} files will be added to the Dataset.") - - if len(scope_files) > file_max: - new_dataset_count = (len(scope_files) // file_max) - dataset_total = dataset_total + new_dataset_count - multi_dataset = True - update_dsp_sheet = True - create_dataset = True + scope_files = filter_files_in_folder(syn, scope_id, formats) + folder_or_files = "files" # use add_items function print( - f"--> File count exceeds file max.\n--> Creating {dataset_total} groups for files from {scope_id}" + f"--> {scope_id} files filtered!\n {len(scope_files)} files will be added to the Dataset." 
) - file_scope_list = chunk_files_for_dataset(scope_files, file_max, new_dataset_count) - else: - multi_dataset = False - file_scope_list = [scope_files] # single dataset, no chunking needed - - if dataset_id: - dataset = syn.get(dataset_id, downloadFile=False) - dataset_id_list.append(dataset.id) - dataset_name_list.append(dataset.name) - dataset.add_items(dataset_items=file_scope_list[0], force=True) - syn.store(dataset) - print(f"--> Files added to existing Dataset {dataset.id}") - file_scope_list = file_scope_list[1:] # remove first item, already added - - if create_dataset: - for scope in file_scope_list: - dataset = create_dataset_entity(syn, dataset_name, grant_id, multi_dataset, scope) - print(f"--> New Dataset created and populated with files!") - dataset_id_list.append(dataset.id) - dataset_name_list.append(dataset.name) - - count += 1 + folder_or_files = "folder" # whole folder should be added, use add_folder function - dataset_tuples = zip(dataset_id_list, dataset_name_list) + if folder_or_files == "folder": + print(f"--> Adding Folder {scope_id} to Dataset {dataset_id}") + dataset.add_folder(scope_id, force=True) + print(f"--> Folder added to Dataset!") + elif folder_or_files == "files": + print(f"--> Adding Files from {scope_id} to Dataset {dataset_id}") + dataset.add_items(dataset_items=scope_files, force=True) + print(f"--> Files added to Dataset!") - for dataset_id, name in dataset_tuples: - if update_dsp_sheet is not None: - temp_df = dsp_df.copy() - temp_df.iloc[[_]] = row - temp_df.at[_, "DatasetView Key"] = dataset_id - temp_df.at[_, "DSP Dataset Name"] = name - updated_df = pd.concat([updated_df, temp_df], ignore_index=True) - updated_df.drop_duplicates(subset=["DatasetView Key"], keep="last", inplace=True) + dataset = syn.store(dataset) + print(f"Dataset {dataset_id} successfully stored in {dataset.parentId}") + if update_dsp_sheet is not None: + dataset_id = dataset.id + dsp_df.at[_, "DatasetView Key"] = dataset_id + + count += 1 else: print( f"❗❗❗ The table provided does not appear to be a Dataset Sharing Plan.❗❗❗\nPlease check its contents and try again." ) exit() - print(f"\n\nDONE ✅\n{count} DSP entries processed") + print(f"\n\nDONE ✅\n{count} Datasets processed") if update_dsp_sheet is not None: dsp_path = f"{os.getcwd()}/{new_name}.csv" - updated_df.to_csv(path_or_buf=dsp_path, index=False) + dsp_df.to_csv(path_or_buf=dsp_path, index=False) print(f"\nDSP sheet has been updated\nPath: {dsp_path}") From 998de6e7875e4406b309fdd8efb1641e2764c4e5 Mon Sep 17 00:00:00 2001 From: Orion Banks <49208907+Bankso@users.noreply.github.com> Date: Wed, 16 Jul 2025 17:32:14 -0700 Subject: [PATCH 19/67] Create process_arachne_mapping.py Initial version, created by ChatGPT, based off of example JSON mapping file. --- utils/process_arachne_mapping.py | 73 ++++++++++++++++++++++++++++++++ 1 file changed, 73 insertions(+) create mode 100644 utils/process_arachne_mapping.py diff --git a/utils/process_arachne_mapping.py b/utils/process_arachne_mapping.py new file mode 100644 index 00000000..cf687552 --- /dev/null +++ b/utils/process_arachne_mapping.py @@ -0,0 +1,73 @@ +""" +process_arachne_mapping.py + +Takes an input CSV of metadata and JSON formatted mapping, generated by the +arachne agent, as input. + +Returns a TSV file of General Commons metadata, built based on input CSV and +the mapping JSON. 
+ +author: orion.banks +""" + +import pandas as pd +import os + +# Define the mapping configuration +mapping_config = { + "source_file": "input_1/sample.csv", + "targets": [ + { + "output_file": "gdc_sample.csv", + "mappings": [ + {"target": "type", "source": "type"}, + {"target": "sample_id", "source": "sample_id"}, + {"target": "sample_description", "source": "sample_description"}, + {"target": "sample_tumor_status", "source": "sample_tumor_status"}, + {"target": "sample_anatomic_site", "source": "anatomic_site"}, + {"target": "sample_age_at_collection", "source": "participant_age_at_collection"}, + { + "target": "sample_type_category", + "source": "tumor_classification", + "value": { + "map": { + "Primary": "Primary Tumor", + "Metastatic": "Metastatic Tumor" + } + } + }, + {"target": "participant.study_participant_id", "source": "participant.participant_id"} + ] + } + ] +} + +def transform_csv_to_tsv(mapping_config): + source_file = mapping_config["source_file"] + for target in mapping_config["targets"]: + output_file = target["output_file"].replace(".csv", ".tsv") + mappings = target["mappings"] + + # Read the input CSV file + df = pd.read_csv(source_file) + + # Prepare the output DataFrame + transformed_df = pd.DataFrame() + + for mapping in mappings: + source_col = mapping["source"] + target_col = mapping["target"] + + if "value" in mapping and "map" in mapping["value"]: + value_map = mapping["value"]["map"] + transformed_df[target_col] = df[source_col].map(value_map).fillna(df[source_col]) + else: + transformed_df[target_col] = df[source_col] + + # Save the transformed DataFrame to TSV + os.makedirs(os.path.dirname(output_file), exist_ok=True) + transformed_df.to_csv(output_file, sep='\t', index=False) + print(f"Transformed file saved to {output_file}") + +# Run the transformation +transform_csv_to_tsv(mapping_config) From bcdf91779fb574780a29771a74254f7a8fe7f23e Mon Sep 17 00:00:00 2001 From: Orion Banks <49208907+Bankso@users.noreply.github.com> Date: Thu, 17 Jul 2025 17:47:31 -0700 Subject: [PATCH 20/67] Update process_arachne_mapping.py --- utils/process_arachne_mapping.py | 124 ++++++++++++++++++------------- 1 file changed, 72 insertions(+), 52 deletions(-) diff --git a/utils/process_arachne_mapping.py b/utils/process_arachne_mapping.py index cf687552..0631a9d8 100644 --- a/utils/process_arachne_mapping.py +++ b/utils/process_arachne_mapping.py @@ -10,64 +10,84 @@ author: orion.banks """ +import argparse +import json import pandas as pd import os -# Define the mapping configuration -mapping_config = { - "source_file": "input_1/sample.csv", - "targets": [ - { - "output_file": "gdc_sample.csv", - "mappings": [ - {"target": "type", "source": "type"}, - {"target": "sample_id", "source": "sample_id"}, - {"target": "sample_description", "source": "sample_description"}, - {"target": "sample_tumor_status", "source": "sample_tumor_status"}, - {"target": "sample_anatomic_site", "source": "anatomic_site"}, - {"target": "sample_age_at_collection", "source": "participant_age_at_collection"}, - { - "target": "sample_type_category", - "source": "tumor_classification", - "value": { - "map": { - "Primary": "Primary Tumor", - "Metastatic": "Metastatic Tumor" - } - } - }, - {"target": "participant.study_participant_id", "source": "participant.participant_id"} - ] - } - ] -} - -def transform_csv_to_tsv(mapping_config): - source_file = mapping_config["source_file"] - for target in mapping_config["targets"]: - output_file = target["output_file"].replace(".csv", ".tsv") - mappings = 
target["mappings"] +def get_args(): + """Set up command-line interface and get arguments.""" + parser = argparse.ArgumentParser() + parser.add_argument( + "-m", + "--mapping", + type=str, + help="Path to mapping file generated by the arachne agent", + required=True + ) + + return parser.parse_args() - # Read the input CSV file - df = pd.read_csv(source_file) - - # Prepare the output DataFrame - transformed_df = pd.DataFrame() +# Define the mapping configuration +def build_mapping_config(mapping_file: str) -> dict: + """ + Reads a mapping JSON file from the arachne agent. + Returns a dictionary of the parsed mapping configuration. + """ + with open(mapping_file, 'r') as mapping_json: + mapping = json.load(mapping_json) - for mapping in mappings: - source_col = mapping["source"] - target_col = mapping["target"] + return mapping - if "value" in mapping and "map" in mapping["value"]: - value_map = mapping["value"]["map"] - transformed_df[target_col] = df[source_col].map(value_map).fillna(df[source_col]) - else: - transformed_df[target_col] = df[source_col] +def transform_csv_to_tsv(mapping_config): + source_file = mapping_config["source_file"] + for target in mapping_config["targets"]: + output_file = target["output_file"].replace(".csv", ".tsv") + mappings = target["mappings"] + + # Read the input CSV file + df = pd.read_csv(source_file) + + # Prepare the output DataFrame + transformed_df = pd.DataFrame() + + for mapping in mappings: + try: + source_col = mapping["source"] + except KeyError: + print(f"'source' key not found in mapping {mapping}. Assuming this will be a constant or empty.") + source_col = None + target_col = mapping["target"] + + if "value" in mapping: + if source_col is None: + if mapping["value"]: + constant_value = mapping["value"] + transformed_df[target_col] = constant_value + else: + transformed_df[target_col] = None + elif "map" in mapping["value"]: + value_map = mapping["value"]["map"] + transformed_df[target_col] = df[source_col].map(value_map).fillna(df[source_col]) + else: + constant_value = mapping["value"] + transformed_df[target_col] = constant_value + else: + transformed_df[target_col] = df[source_col].fillna('') - # Save the transformed DataFrame to TSV - os.makedirs(os.path.dirname(output_file), exist_ok=True) - transformed_df.to_csv(output_file, sep='\t', index=False) - print(f"Transformed file saved to {output_file}") + # Save the transformed DataFrame to TSV + os.makedirs(os.path.dirname(output_file), exist_ok=True) + transformed_df.to_csv(output_file, sep='\t', index=False) + print(f"Transformed file saved to {output_file}") # Run the transformation -transform_csv_to_tsv(mapping_config) +def main(): + + args = get_args() + + mapping_config = build_mapping_config(args.mapping) + + transform_csv_to_tsv(mapping_config) + +if __name__ == "__main__": + main() From 031502eb5687a88c455242d1d4163c3cdc96d5d4 Mon Sep 17 00:00:00 2001 From: Orion Banks <49208907+Bankso@users.noreply.github.com> Date: Fri, 18 Jul 2025 11:40:35 -0700 Subject: [PATCH 21/67] Improve handling when value is None --- utils/process_arachne_mapping.py | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/utils/process_arachne_mapping.py b/utils/process_arachne_mapping.py index 0631a9d8..11307703 100644 --- a/utils/process_arachne_mapping.py +++ b/utils/process_arachne_mapping.py @@ -60,20 +60,18 @@ def transform_csv_to_tsv(mapping_config): target_col = mapping["target"] if "value" in mapping: - if source_col is None: - if mapping["value"]: - constant_value = 
mapping["value"] - transformed_df[target_col] = constant_value + if mapping["value"] is not None: + if "map" in mapping["value"]: + value_map = mapping["value"]["map"] + default = mapping["value"]["default_literal"] if mapping["value"]["default_literal"] else "" + transformed_df[target_col] = df[source_col].map(value_map).fillna(default) else: - transformed_df[target_col] = None - elif "map" in mapping["value"]: - value_map = mapping["value"]["map"] - transformed_df[target_col] = df[source_col].map(value_map).fillna(df[source_col]) + constant_value = mapping["value"] + transformed_df[target_col] = [constant_value] * df.shape[0] else: - constant_value = mapping["value"] - transformed_df[target_col] = constant_value + transformed_df[target_col] = None else: - transformed_df[target_col] = df[source_col].fillna('') + transformed_df[target_col] = df[source_col].fillna("") # Save the transformed DataFrame to TSV os.makedirs(os.path.dirname(output_file), exist_ok=True) From 197a14664358a38079b58bc3bfc0909a1aa931e7 Mon Sep 17 00:00:00 2001 From: Orion Banks <49208907+Bankso@users.noreply.github.com> Date: Wed, 27 Aug 2025 17:11:55 -0700 Subject: [PATCH 22/67] Create csv_to_ttl.py Initial version, generates valid ttl input for arachne agent, based on this input CSV: https://docs.google.com/spreadsheets/d/1LLpSIFAh12YdKnGfzXMxGpoKCaEH90nDx-QvncaIJlk/edit?gid=264257960#gid=264257960 The sheet linked above contains a version of the MC2 Center data model, which has been expanded and packaged for conversion to triples. Currently planning an update to the script that allows the use of a standard schematic-compatible data model CSV as the primary input. --- utils/csv_to_ttl.py | 89 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 89 insertions(+) create mode 100644 utils/csv_to_ttl.py diff --git a/utils/csv_to_ttl.py b/utils/csv_to_ttl.py new file mode 100644 index 00000000..71cc5b4b --- /dev/null +++ b/utils/csv_to_ttl.py @@ -0,0 +1,89 @@ +""" +csv_to_ttl.py + +Converts a CSV with formatted ttl info to a ttl file +ttl file can be used as a graph input for the arachne agent. 
+ +author: orion.banks +""" + +import argparse +import os +import pandas as pd + +def get_args(): + """Set up command-line interface and get arguments.""" + parser = argparse.ArgumentParser() + parser.add_argument( + "-m", + "--mapping", + type=str, + help="Path to ttl source content file", + required=True + ) + parser.add_argument( + "-o", + "--output", + type=str, + help="Path to folder where graph should be stored", + required=False, + default=os.getcwd() + ) + parser.add_argument( + "-g", + "--org_name", + type=str, + help="Abbreviation used to label the data model", + required=False, + default="new_org" + ) + return parser.parse_args() + +def main(): + + args = get_args() + + base_tag = "" + label_tag = "" + desc_tag = "" + node_tag = "" + type_tag = "" + req_tag = "" + cde_tag = "" + key_tag = "" + + ttl_df = pd.read_csv(args.mapping, header=0, keep_default_na=False) + + out_file = "/".join([args.output, f"{args.org_name}.ttl"]) + + with open(out_file, "w+") as f: + + for _, row in ttl_df.iterrows(): + ttl_dict = { + "term": row["term"], + label_tag: row["label"], + desc_tag: row["description"], + node_tag: row["node"], + type_tag: row["type"], + req_tag: row["required_by"], + cde_tag: row["is_cde"], + key_tag: row["is_key"] + } + + f.write(f"{ttl_dict['term']} {label_tag} {ttl_dict[label_tag]};"+"\n") + f.write("\t"+f"{desc_tag} {ttl_dict[desc_tag]};"+"\n") + f.write("\t"+f"{node_tag} {ttl_dict[node_tag]};"+"\n") + line_end = ";" if ttl_dict[req_tag] or ttl_dict[key_tag] or ttl_dict[cde_tag] else " ." + f.write("\t"+f"{type_tag} {ttl_dict[type_tag]}{line_end}"+"\n") + if ttl_dict[req_tag]: + line_end = ";\n" if ttl_dict[key_tag] or ttl_dict[cde_tag] else " .\n" + f.write("\t"+f"{req_tag} {''.join([ttl_dict[req_tag], line_end])}") + if ttl_dict[key_tag]: + line_end = ";\n" if ttl_dict[cde_tag] else " .\n" + f.write("\t"+f"{key_tag} {''.join([ttl_dict[key_tag], line_end])}") + if ttl_dict[cde_tag]: + f.write("\t"+f"{cde_tag} {' '.join([ttl_dict[cde_tag], '.'])}"+"\n") + f.write("\n") + +if __name__ == "__main__": + main() From a8e137bdb83ba1f2b925f32f8696ef09b03b9dea Mon Sep 17 00:00:00 2001 From: Orion Banks <49208907+Bankso@users.noreply.github.com> Date: Thu, 28 Aug 2025 17:46:41 -0700 Subject: [PATCH 23/67] Update process_arachne_mapping.py Add try/except block to create output dir if not defined in arachne mapping output --- utils/process_arachne_mapping.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/utils/process_arachne_mapping.py b/utils/process_arachne_mapping.py index 11307703..fd8b346c 100644 --- a/utils/process_arachne_mapping.py +++ b/utils/process_arachne_mapping.py @@ -74,7 +74,11 @@ def transform_csv_to_tsv(mapping_config): transformed_df[target_col] = df[source_col].fillna("") # Save the transformed DataFrame to TSV - os.makedirs(os.path.dirname(output_file), exist_ok=True) + try: + os.makedirs(os.path.dirname(output_file), exist_ok=True) + except FileNotFoundError: + os.makedirs("mappings", exist_ok=True) + output_file = "/".join(["mappings", output_file]) transformed_df.to_csv(output_file, sep='\t', index=False) print(f"Transformed file saved to {output_file}") From 77f8deedd0580ad68a97ed0c3fda6a51744e6fc4 Mon Sep 17 00:00:00 2001 From: Orion Banks <49208907+Bankso@users.noreply.github.com> Date: Fri, 5 Sep 2025 13:01:29 -0700 Subject: [PATCH 24/67] Create build_template_ttl.py Add a function that generates serialized RDF triples, representing a metadata template, from a metadata template CSV. 
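For context, rdflib (pinned in requirements.txt) could express the same template triples through a Graph instead of manual f.write calls. The sketch below is illustrative only: it reuses the http://syn.org base and the Template/hasColumn/ColumnPosition/header/position terms from the script added here, but the org segment, template name, and column list are hypothetical, and the elided conformsTo/defaultValue pieces are left out.

    from uuid import uuid4

    from rdflib import RDF, Graph, Literal, Namespace, URIRef
    from rdflib.namespace import XSD

    BASE = Namespace("http://syn.org/")         # same default prefix as build_template_ttl.py
    ORG = Namespace("http://syn.org/new_org/")  # hypothetical org segment

    def template_graph(template_name: str, columns: list[str]) -> Graph:
        """Build Template/hasColumn/ColumnPosition triples for one template."""
        g = Graph()
        template = ORG[template_name]
        g.add((template, RDF.type, BASE["Template"]))
        for position, header in enumerate(columns):
            clean_col = header.strip().lower().replace(" ", "_")
            column_uri = URIRef(f"{template}/{clean_col}")
            g.add((template, BASE["hasColumn"], column_uri))
            placement = BASE[str(uuid4())]  # one placement node per column, as in the script
            g.add((placement, RDF.type, BASE["ColumnPosition"]))
            g.add((placement, BASE["template"], template))
            g.add((placement, BASE["column"], column_uri))
            g.add((placement, BASE["header"], Literal(header)))
            g.add((placement, BASE["position"], Literal(position, datatype=XSD.integer)))
        return g

    if __name__ == "__main__":
        # Hypothetical template and columns, just to show the serialized output.
        g = template_graph("DatasetView", ["Component", "Dataset Name", "Dataset Description"])
        print(g.serialize(format="turtle"))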
--- utils/build_template_ttl.py | 101 ++++++++++++++++++++++++++++++++++++ 1 file changed, 101 insertions(+) create mode 100644 utils/build_template_ttl.py diff --git a/utils/build_template_ttl.py b/utils/build_template_ttl.py new file mode 100644 index 00000000..54140f26 --- /dev/null +++ b/utils/build_template_ttl.py @@ -0,0 +1,101 @@ +""" +build_template_ttl.py + +Converts a metadata template CSV info to a ttl file defining the template. +ttl file can be used as input for the arachne agent and would be available as a target. + +author: orion.banks +""" + +import argparse +import os +import pandas as pd +from pathlib import Path +from uuid import uuid4 + +def get_args(): + """Set up command-line interface and get arguments.""" + parser = argparse.ArgumentParser() + parser.add_argument( + "-t", + "--template", + type=str, + help="Path to metadata template CSV", + required=False + ) + parser.add_argument( + "-o", + "--output", + type=str, + help="Path to folder where graph should be stored", + required=False, + default=os.getcwd() + ) + parser.add_argument( + "-g", + "--org_name", + type=str, + help="Abbreviation for org, used in RDF prefixes", + required=False, + default="new_org" + ) + parser.add_argument( + "-p", + "--tag_prefix", + type=str, + help="The tag that will be used as a prefix in RDF", + required=False, + default="http://syn.org" + ) + return parser.parse_args() + + +def format_uri(column_name): + + return column_name.strip().lower().replace(" ", "_") + + +def main(): + + args = get_args() + + prefix_tag = args.tag_prefix + conform_tag = "" + int_tag = "" + + if args.template: + print(f"Processing model [{args.template}] to template.ttl...") + template_path = Path(args.template) + template_name = template_path.stem + template_df = pd.read_csv(args.template, header=0, keep_default_na=True) + + out_file = "/".join([args.output, f"{args.org_name}_{template_name}.ttl"]) + + with open(out_file, "w+") as f: + print(f"Building RDF triples and serializing to TTL...") + + # write template definition + f.write(f"<{prefix_tag}/{args.org_name}/{template_name}> a <{prefix_tag}/Template> ."+"\n") + f.write(f"<{prefix_tag}/{args.org_name}/{template_name}> {conform_tag} <{prefix_tag}/{args.org_name}> ."+"\n") + + # write column definitions + # set col position counter + col_position = 0 + for col in template_df.columns: + clean_col = format_uri(col) + col_uuid = uuid4() + if col in ["Component", "type"]: + f.write(f'<{prefix_tag}/{args.org_name}/{template_name}/{clean_col}> <{prefix_tag}/defaultValue> "{template_name}"'+"\n") + f.write(f"<{prefix_tag}/{args.org_name}/{template_name}> <{prefix_tag}/hasColumn> <{prefix_tag}/{args.org_name}/{template_name}/{clean_col}> ."+"\n") + f.write(f"<{prefix_tag}/{col_uuid}>a <{prefix_tag}/ColumnPosition> ;"+"\n") + f.write(f"<{prefix_tag}/template> <{prefix_tag}/{args.org_name}/{template_name}> ;"+"\n") + f.write("\t"+f"<{prefix_tag}/column> <{prefix_tag}/{args.org_name}/{template_name}/{clean_col}> ;"+"\n") + f.write("\t"+f'<{prefix_tag}/header> "{col}" ;'+"\n") + f.write("\t"+f'<{prefix_tag}/position> "{col_position}"^^{int_tag} .'+"\n") + col_position += 1 + + print(f"Done ✅") + print(f"{out_file} was written with {col_position} attributes!") + +if __name__ == "__main__": + main() From 2b3d4840125adf35d141c4ea7026efa11c25b1ba Mon Sep 17 00:00:00 2001 From: Orion Banks <49208907+Bankso@users.noreply.github.com> Date: Fri, 5 Sep 2025 16:27:26 -0700 Subject: [PATCH 25/67] Allow schematic CSV as input Added a couple functions to support extraction of RDF 
content directly from a schematic data model CSV. Functions use the upcoming version of the MC2 Center data model (v12.0.0) which includes the new columnType column and CDE:Public Id references in the Property column, where applicable. --- utils/csv_to_ttl.py | 101 ++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 98 insertions(+), 3 deletions(-) diff --git a/utils/csv_to_ttl.py b/utils/csv_to_ttl.py index 71cc5b4b..a586772a 100644 --- a/utils/csv_to_ttl.py +++ b/utils/csv_to_ttl.py @@ -16,10 +16,17 @@ def get_args(): parser = argparse.ArgumentParser() parser.add_argument( "-m", + "--model", + type=str, + help="Path to schematic data model CSV", + required=False + ) + parser.add_argument( + "-p", "--mapping", type=str, help="Path to ttl source content file", - required=True + required=False ) parser.add_argument( "-o", @@ -39,6 +46,82 @@ def get_args(): ) return parser.parse_args() + +def convert_schematic_model_to_ttl_format(input_df: pd.DataFrame, org_name) -> pd.DataFrame: + + out_df = pd.DataFrame() + + # Step 1: Identify all node rows (treat rows with non-empty DependsOn as nodes) + node_rows = input_df[input_df["DependsOn"].notna()] + attribute_rows = input_df[input_df["DependsOn"].isna()].set_index("Attribute") + attribute_to_node = {row["Attribute"]: str(row["DependsOn"]).split(", ") for _, row in node_rows.iterrows()} + + + attribute_info = [(attribute, node) for node, attribute_list in attribute_to_node.items() for attribute in attribute_list] + out_df["label"] = [entry[0] for entry in attribute_info] + out_df["Resolved_Node"] = [entry[1] for entry in attribute_info] + + # Step 2: Assign node URI for each attribute + out_df["Resolved_Node_URI"] = out_df["Resolved_Node"].apply( + lambda x: f"" + ) + + # Step 3: Construct term URIs for each attribute + out_df["term"] = out_df.apply(lambda row: format_uri(row["Resolved_Node"], row["label"], org_name), axis=1) + + # Step 4: TTL-compatible column formatting + for _, row in out_df.iterrows(): + out_df.at[_, "description"] = attribute_rows.loc[row["label"], "Description"] + out_df.at[_, "is_cde"] = get_cde_id(str(attribute_rows.loc[row["label"], "Properties"])) + out_df.at[_, "node"] = row["Resolved_Node_URI"] + out_df.at[_, "is_key"] = "true" if str(attribute_rows.loc[row["label"], "Validation Rules"]).strip().lower() == "unique" else "" + out_df.at[_, "required_by"] = row["Resolved_Node_URI"] if str(attribute_rows.loc[row["label"], "Required"]).strip().lower() == "true" else "" + col_type = attribute_rows.loc[row["label"], "columnType"] + is_enum = True if str(attribute_rows.loc[row["label"], "Valid Values"]) != "nan" else False + out_df.at[_, "type"] = '"' + str(convert_schematic_column_type(col_type, is_enum)) + '"' + + out_df["label"] = '"' + out_df["label"].fillna('') + '"' + out_df["description"] = '"' + out_df["description"].fillna('').replace('"', '') + '"' + out_df["is_cde"] = out_df["is_cde"].fillna("") + + # Final output + final_cols = ["term", "label", "description", "node", "type", "required_by", "is_cde", "is_key"] + return out_df[final_cols] + + +def format_uri(node, attribute, org_name): + + node_segment = node.strip().lower().replace(" ", "_") + attr_segment = attribute.strip().lower().replace(" ", "_") + + return f"" + + +def convert_schematic_column_type(type, is_enum): + + if type in ["string", "string_list"]: + string_type = "string;enum" if is_enum else "string" + if type == "string_list": + out_type = f"array[{string_type}]" + else: + out_type = string_type + else: + out_type = type + + return out_type 
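+
+# Illustrative conversions (hypothetical columnType values, not taken from the model):
+#   convert_schematic_column_type("string", is_enum=False)     -> "string"
+#   convert_schematic_column_type("string", is_enum=True)      -> "string;enum"
+#   convert_schematic_column_type("string_list", is_enum=True) -> "array[string;enum]"
+#   any other columnType (e.g. "integer") is passed through unchanged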
+ + +def get_cde_id(entry): + + entry = entry.split(", ") if len(entry.split(", ")) > 1 else entry + + if type(entry) == list: + for ref in entry: + if ref.split(":")[0] == "CDE": + return ref.split(":")[1] + else: + return entry.split(":")[1] if entry.split(":")[0] == "CDE" else "" + def main(): args = get_args() @@ -52,12 +135,21 @@ def main(): cde_tag = "" key_tag = "" - ttl_df = pd.read_csv(args.mapping, header=0, keep_default_na=False) + if args.mapping: + print(f"Processing RDF triples precursor CSV [{args.mapping}]...") + ttl_df = pd.read_csv(args.mapping, header=0, keep_default_na=False) + print(f"RDF triples will be built from pre-cursor file!") + + elif args.model: + print(f"Processing model [{args.model}] to RDF triples precursor CSV...") + model_df = pd.read_csv(args.model, header=0, keep_default_na=True) + ttl_df = convert_schematic_model_to_ttl_format(model_df, args.org_name) + print(f"RDF triples will be built from the generated precursor dataframe!") out_file = "/".join([args.output, f"{args.org_name}.ttl"]) with open(out_file, "w+") as f: - + print(f"Building RDF triples and serializing to TTL...") for _, row in ttl_df.iterrows(): ttl_dict = { "term": row["term"], @@ -85,5 +177,8 @@ def main(): f.write("\t"+f"{cde_tag} {' '.join([ttl_dict[cde_tag], '.'])}"+"\n") f.write("\n") + print(f"Done ✅") + print(f"{out_file} was written with {len(ttl_df)} triples!") + if __name__ == "__main__": main() From d0762aed841e4be47439990a8112dd392c425426 Mon Sep 17 00:00:00 2001 From: Orion Banks <49208907+Bankso@users.noreply.github.com> Date: Fri, 5 Sep 2025 16:42:37 -0700 Subject: [PATCH 26/67] Update csv_to_ttl.py - update typing and docstrings - allow base tag to be defined at input; use base tag to construct other RDF tags that are not external references --- utils/csv_to_ttl.py | 48 +++++++++++++++++++++++++++------------------ 1 file changed, 29 insertions(+), 19 deletions(-) diff --git a/utils/csv_to_ttl.py b/utils/csv_to_ttl.py index a586772a..e2cfd646 100644 --- a/utils/csv_to_ttl.py +++ b/utils/csv_to_ttl.py @@ -43,12 +43,20 @@ def get_args(): help="Abbreviation used to label the data model", required=False, default="new_org" + ) + parser.add_argument( + "-b", + "--base_tag", + type=str, + help="url applied to the beginning of internal tags", + required=False, + default="http://syn.org" ) return parser.parse_args() -def convert_schematic_model_to_ttl_format(input_df: pd.DataFrame, org_name) -> pd.DataFrame: - +def convert_schematic_model_to_ttl_format(input_df: pd.DataFrame, org_name: str, base_tag: str) -> pd.DataFrame: + """Convert schematic model DataFrame to TTL format.""" out_df = pd.DataFrame() # Step 1: Identify all node rows (treat rows with non-empty DependsOn as nodes) @@ -63,13 +71,13 @@ def convert_schematic_model_to_ttl_format(input_df: pd.DataFrame, org_name) -> p # Step 2: Assign node URI for each attribute out_df["Resolved_Node_URI"] = out_df["Resolved_Node"].apply( - lambda x: f"" + lambda x: f"<{base_tag}/{org_name}/{x.strip().lower().replace(' ', '_')}>" ) # Step 3: Construct term URIs for each attribute - out_df["term"] = out_df.apply(lambda row: format_uri(row["Resolved_Node"], row["label"], org_name), axis=1) + out_df["term"] = out_df.apply(lambda row: format_uri(base_tag, row["Resolved_Node"], row["label"], org_name), axis=1) - # Step 4: TTL-compatible column formatting + # Step 4: Info extraction and TTL-compatible column formatting for _, row in out_df.iterrows(): out_df.at[_, "description"] = attribute_rows.loc[row["label"], "Description"] out_df.at[_, 
"is_cde"] = get_cde_id(str(attribute_rows.loc[row["label"], "Properties"])) @@ -89,16 +97,18 @@ def convert_schematic_model_to_ttl_format(input_df: pd.DataFrame, org_name) -> p return out_df[final_cols] -def format_uri(node, attribute, org_name): - +def format_uri(base_tag:str, node:str, attribute:str, org_name:str) -> str: + """Format the URI for a given node and attribute.""" + node_segment = node.strip().lower().replace(" ", "_") attr_segment = attribute.strip().lower().replace(" ", "_") - return f"" + return f"<{base_tag}/{org_name}/{node_segment}/{attr_segment}>" -def convert_schematic_column_type(type, is_enum): - +def convert_schematic_column_type(type:str, is_enum:bool) -> str: + """Convert schematic column type to TTL-compatible format.""" + if type in ["string", "string_list"]: string_type = "string;enum" if is_enum else "string" if type == "string_list": @@ -111,8 +121,8 @@ def convert_schematic_column_type(type, is_enum): return out_type -def get_cde_id(entry): - +def get_cde_id(entry: str) -> str: + """Extract CDE ID from Properties entry.""" entry = entry.split(", ") if len(entry.split(", ")) > 1 else entry if type(entry) == list: @@ -126,14 +136,14 @@ def main(): args = get_args() - base_tag = "" + base_tag = args.base_tag label_tag = "" desc_tag = "" - node_tag = "" - type_tag = "" - req_tag = "" - cde_tag = "" - key_tag = "" + node_tag = f"<{base_tag}/node>" + type_tag = f"<{base_tag}/type>" + req_tag = f"<{base_tag}/requiredBy>" + cde_tag = f"<{base_tag}/isCDE>" + key_tag = f"<{base_tag}/isKey>" if args.mapping: print(f"Processing RDF triples precursor CSV [{args.mapping}]...") @@ -143,7 +153,7 @@ def main(): elif args.model: print(f"Processing model [{args.model}] to RDF triples precursor CSV...") model_df = pd.read_csv(args.model, header=0, keep_default_na=True) - ttl_df = convert_schematic_model_to_ttl_format(model_df, args.org_name) + ttl_df = convert_schematic_model_to_ttl_format(model_df, args.org_name, base_tag) print(f"RDF triples will be built from the generated precursor dataframe!") out_file = "/".join([args.output, f"{args.org_name}.ttl"]) From 1ce7c68c0556703ff5f7c69210f8b119b1aaea2b Mon Sep 17 00:00:00 2001 From: Orion Banks <49208907+Bankso@users.noreply.github.com> Date: Tue, 9 Sep 2025 15:06:04 -0700 Subject: [PATCH 27/67] Remove map_to_crdc.py from branch No longer being developed --- utils/map_to_crdc.py | 115 ------------------------------------------- 1 file changed, 115 deletions(-) delete mode 100644 utils/map_to_crdc.py diff --git a/utils/map_to_crdc.py b/utils/map_to_crdc.py deleted file mode 100644 index 19f50515..00000000 --- a/utils/map_to_crdc.py +++ /dev/null @@ -1,115 +0,0 @@ -"""Map MC2 Center metadata to GC models. - -This script maps metadata from MC2 Center to the Genomic Commons (GC) models. -It retrieves metadata from Synapse tables, extracts relevant information, -and generates a CSV file with the mapped metadata. -The script requires the following command-line arguments: -1. -d: Path to the dataset metadata CSV file. An example of this file can be found here: https://docs.google.com/spreadsheets/d/1LLpSIFAh12YdKnGfzXMxGpoKCaEH90nDx-QvncaIJlk/edit?gid=288959359#gid=288959359 -2. -t: Target template name. -3. -m: Path to the target-to-source mapping CSV file. 
- -author: orion.banks -""" - -import argparse -import pandas as pd -import synapseclient -from synapseclient.models import query -import re - -def get_args(): - """Set up command-line interface and get arguments.""" - parser = argparse.ArgumentParser() - parser.add_argument( - "-d", - type=str, - help="Dataset metadata CSV", - required=True, - default=None, - ), - parser.add_argument( - "-t", - type=str, - help="Target template name", - required=True, - default=None, - ) - parser.add_argument( - "-m", - type=str, - help="Target-to-source mapping CSV", - required=True, - default=None, - ) - parser.add_argument( - "-o", - type=str, - help="Output directory", - required=False, - default="./output", - ) - return parser.parse_args() - - -def get_table(syn, source_id: str, cols: str | list = "*", conditions: str | None = None) -> pd.DataFrame: - """Collect columns from a Synapse table entity and return as a Dataframe.""" - - if type(cols) == list: - cols = ", ".join(["".join(['"', col, '"']) for col in cols]) - - query = f"SELECT {cols} FROM {source_id}" - if conditions is not None: - query += f" WHERE {conditions}" - table = syn.tableQuery(query).asDataFrame().fillna("") - print(f"Data acquired from Synapse table {source_id}") - - return table - - -def main(): - """Main function.""" - - args = get_args() - - manifests, target_output, mapping, out_dir = args.d, args.t, args.m, args.o - - syn = synapseclient.login() - - manifests_df = pd.read_csv(manifests, header=0).fillna("") - mapping_df = pd.read_csv(mapping, header=0).fillna("") - - source_metadata_dict = dict(zip(manifests_df["entity_id"], (zip(manifests_df["data_type"], manifests_df["study_key"])))) - - gc_template_dict = dict(zip(mapping_df["Property"], (zip(mapping_df["Node"], mapping_df["Acceptable Values"])))) - - gc_mc2_mapping_dict = dict(zip(mapping_df["Property"], mapping_df["MC2 attribute"])) - - template_df = pd.DataFrame() - - for attribute, (template, _) in gc_template_dict.items(): - if template == target_output: - template_df[attribute] = "" # create GC template columns - print(f"{attribute} added to template \n") - - template_df["crdc_id"] = "" - attribute_list = template_df.columns.tolist() - - for id, (data_type, study_key) in source_metadata_dict.items(): - if data_type == "Study" and target_output in ["study", "image"]: - df = get_table(syn, id, cols="*", conditions=f"Study_id = '{study_key}'") - elif data_type != "Study" and target_output != "study": - df = query(query=f"SELECT * FROM {id}") - else: - df = pd.DataFrame() - source_metadata_dict[id] = (data_type, df, df.columns.tolist()) - - for _, (data_type, df, cols) in source_metadata_dict.items(): - mapped_attributes = [attribute for attribute in attribute_list if "".join("".join(str(gc_mc2_mapping_dict[attribute]).split(" ")).split("-")) in cols] - mapped_df = df.rename(columns={"".join("".join(str(gc_mc2_mapping_dict[attribute]).split(" ")).split("-")): attribute for attribute in mapped_attributes}) - template_df = pd.concat([template_df, mapped_df]).drop_duplicates(subset=attribute_list, keep="first").reset_index(drop=True) - - template_df[attribute_list].to_csv(f"{out_dir}/{target_output}_mapped_metadata.csv", index=False) - print(f"Mapped metadata saved to {out_dir}/{target_output}_mapped_metadata.csv") - -if __name__ == "__main__": - main() From 7ae79ea9883beba6208e65e831423e5124b9420a Mon Sep 17 00:00:00 2001 From: Orion Banks <49208907+Bankso@users.noreply.github.com> Date: Tue, 9 Sep 2025 15:20:28 -0700 Subject: [PATCH 28/67] Update requirements.txt --- 
requirements.txt | 186 ++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 184 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index 022bbbbc..b54b87e3 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,2 +1,184 @@ -pandas >= 1.5.1 -openpyxl >= 3.0.10 +altair==4.2.0 +anyio==4.10.0 +appnope==0.1.4 +argon2-cffi==25.1.0 +argon2-cffi-bindings==25.1.0 +arrow==1.3.0 +asttokens==3.0.0 +async-lru==2.0.5 +asyncio==3.4.3 +asyncio-atexit==1.0.1 +attrs==25.3.0 +babel==2.17.0 +beautifulsoup4==4.13.5 +bleach==6.2.0 +cachetools==5.5.2 +certifi==2025.8.3 +cffi==2.0.0 +charset-normalizer==3.4.3 +click==8.2.1 +click-log==0.4.0 +clickclick==20.10.2 +colorama==0.4.6 +comm==0.2.3 +connexion==2.14.2 +cryptography==45.0.7 +dataclasses-json==0.6.7 +dateparser==1.2.2 +debugpy==1.8.16 +decorator==5.2.1 +defusedxml==0.7.1 +Deprecated==1.2.18 +dill==0.4.0 +entrypoints==0.4 +et_xmlfile==2.0.0 +exceptiongroup==1.3.0 +executing==2.2.1 +fastjsonschema==2.21.2 +Flask==2.2.5 +Flask-Cors==4.0.2 +fqdn==1.5.1 +google-api-core==2.25.1 +google-api-python-client==2.123.0 +google-auth==2.40.3 +google-auth-httplib2==0.1.1 +google-auth-oauthlib==0.8.0 +googleapis-common-protos==1.70.0 +graphviz==0.20.3 +great-expectations==0.15.50 +h11==0.16.0 +httpcore==1.0.9 +httplib2==0.30.0 +httpx==0.28.1 +idna==3.10 +importlib_metadata==8.7.0 +inflection==0.5.1 +ipykernel==6.30.1 +ipython==8.37.0 +ipywidgets==8.1.7 +isodate==0.6.1 +isoduration==20.11.0 +itsdangerous==2.2.0 +jedi==0.19.2 +Jinja2==3.1.6 +json5==0.12.1 +jsonpatch==1.33 +jsonpointer==3.0.0 +jsonschema==4.25.1 +jsonschema-specifications==2025.9.1 +jupyter-events==0.12.0 +jupyter-lsp==2.3.0 +jupyter_client==8.6.3 +jupyter_core==5.8.1 +jupyter_server==2.17.0 +jupyter_server_terminals==0.5.3 +jupyterlab==4.4.7 +jupyterlab_pygments==0.3.0 +jupyterlab_server==2.27.3 +jupyterlab_widgets==3.0.15 +lark==1.2.2 +makefun==1.16.0 +MarkupSafe==3.0.2 +marshmallow==3.26.1 +matplotlib-inline==0.1.7 +mistune==3.1.4 +mypy_extensions==1.1.0 +nbclient==0.10.2 +nbconvert==7.16.6 +nbformat==5.10.4 +nest-asyncio==1.6.0 +networkx==3.4.2 +notebook==7.4.5 +notebook_shim==0.2.4 +numpy==1.26.4 +oauth2client==4.1.3 +oauthlib==3.3.1 +openpyxl==3.1.5 +opentelemetry-api==1.36.0 +opentelemetry-exporter-otlp-proto-common==1.36.0 +opentelemetry-exporter-otlp-proto-http==1.36.0 +opentelemetry-instrumentation==0.57b0 +opentelemetry-instrumentation-flask==0.57b0 +opentelemetry-instrumentation-httpx==0.57b0 +opentelemetry-instrumentation-requests==0.57b0 +opentelemetry-instrumentation-threading==0.57b0 +opentelemetry-instrumentation-urllib==0.57b0 +opentelemetry-instrumentation-wsgi==0.57b0 +opentelemetry-proto==1.36.0 +opentelemetry-sdk==1.36.0 +opentelemetry-semantic-conventions==0.57b0 +opentelemetry-util-http==0.57b0 +overrides==7.7.0 +packaging==25.0 +pandarallel==1.6.5 +pandas==2.3.2 +pandocfilters==1.5.1 +parso==0.8.5 +pexpect==4.9.0 +platformdirs==4.4.0 +prometheus_client==0.22.1 +prompt_toolkit==3.0.52 +proto-plus==1.26.1 +protobuf==6.32.0 +psutil==5.9.8 +ptyprocess==0.7.0 +pure_eval==0.2.3 +pyasn1==0.6.1 +pyasn1_modules==0.4.2 +pycparser==2.22 +pydantic==1.10.22 +Pygments==2.19.2 +pygsheets==2.0.6 +PyJWT==2.10.1 +pyparsing==3.2.3 +python-dateutil==2.9.0.post0 +python-dotenv==0.21.1 +python-json-logger==3.3.0 +pytz==2025.2 +PyYAML==6.0.2 +pyzmq==27.1.0 +rdflib==6.3.2 +referencing==0.36.2 +regex==2025.9.1 +requests==2.32.5 +requests-oauthlib==2.0.0 +rfc3339-validator==0.1.4 +rfc3986-validator==0.1.1 +rfc3987-syntax==1.1.0 +rpds-py==0.27.1 +rsa==4.9.1 
+ruamel.yaml==0.17.17 +schematicpy==25.8.1 +scipy==1.15.3 +Send2Trash==1.8.3 +six==1.17.0 +sniffio==1.3.1 +soupsieve==2.8 +stack-data==0.6.3 +swagger-ui-bundle==0.0.9 +synapseclient==4.8.0 +tenacity==8.5.0 +terminado==0.18.1 +tinycss2==1.4.0 +toml==0.10.2 +tomli==2.2.1 +toolz==1.0.0 +tornado==6.5.2 +tqdm==4.67.1 +traitlets==5.14.3 +types-python-dateutil==2.9.0.20250822 +typing-inspect==0.9.0 +typing_extensions==4.15.0 +tzdata==2025.2 +tzlocal==5.3.1 +uri-template==1.3.0 +uritemplate==4.2.0 +urllib3==1.26.20 +wcwidth==0.2.13 +webcolors==24.11.1 +webencodings==0.5.1 +websocket-client==1.8.0 +Werkzeug==2.2.3 +widgetsnbextension==4.0.14 +wrapt==1.17.3 +zipp==3.23.0 From 77e533002a7b42c31fad47e277151bc8365fa677 Mon Sep 17 00:00:00 2001 From: Orion Banks <49208907+Bankso@users.noreply.github.com> Date: Wed, 10 Sep 2025 12:57:42 -0700 Subject: [PATCH 29/67] Update csv_to_ttl.py - add function to convert TSV from CRDC Model Navigator into serialized RDF - add function to encode CRDC Type to valid RDF types - add code to document valid/permissible values in TTL triples - add input routing to direct CRDC models and schematic models to specific processing functions --- utils/csv_to_ttl.py | 83 +++++++++++++++++++++++++++++++++++++++------ 1 file changed, 73 insertions(+), 10 deletions(-) diff --git a/utils/csv_to_ttl.py b/utils/csv_to_ttl.py index e2cfd646..c17500a0 100644 --- a/utils/csv_to_ttl.py +++ b/utils/csv_to_ttl.py @@ -10,6 +10,8 @@ import argparse import os import pandas as pd +from pathlib import Path +import re def get_args(): """Set up command-line interface and get arguments.""" @@ -18,7 +20,7 @@ def get_args(): "-m", "--model", type=str, - help="Path to schematic data model CSV", + help="Path to schematic or CRDC data model CSV", required=False ) parser.add_argument( @@ -83,7 +85,8 @@ def convert_schematic_model_to_ttl_format(input_df: pd.DataFrame, org_name: str, out_df.at[_, "is_cde"] = get_cde_id(str(attribute_rows.loc[row["label"], "Properties"])) out_df.at[_, "node"] = row["Resolved_Node_URI"] out_df.at[_, "is_key"] = "true" if str(attribute_rows.loc[row["label"], "Validation Rules"]).strip().lower() == "unique" else "" - out_df.at[_, "required_by"] = row["Resolved_Node_URI"] if str(attribute_rows.loc[row["label"], "Required"]).strip().lower() == "true" else "" + out_df.at[_, "required_by"] = row["Resolved_Node_URI"] if str(attribute_rows.loc[row["label"], "Required"]).strip().lower() == "true" else "" + out_df.at[_, "has_enum"] = str(attribute_rows.loc[row["label"], "Required"]) if str(attribute_rows.loc[row["label"], "Valid Values"]) != "nan" else "" col_type = attribute_rows.loc[row["label"], "columnType"] is_enum = True if str(attribute_rows.loc[row["label"], "Valid Values"]) != "nan" else False out_df.at[_, "type"] = '"' + str(convert_schematic_column_type(col_type, is_enum)) + '"' @@ -93,7 +96,37 @@ def convert_schematic_model_to_ttl_format(input_df: pd.DataFrame, org_name: str, out_df["is_cde"] = out_df["is_cde"].fillna("") # Final output - final_cols = ["term", "label", "description", "node", "type", "required_by", "is_cde", "is_key"] + final_cols = ["term", "label", "description", "node", "type", "required_by", "is_cde", "is_key", "has_enum"] + return out_df[final_cols] + + +def convert_crdc_model_to_ttl_format(input_df: pd.DataFrame, org_name: str, base_tag: str) -> pd.DataFrame: + """Convert schematic model DataFrame to TTL format.""" + out_df = pd.DataFrame() + + out_df["term"] = input_df.apply( + lambda row: format_uri(base_tag, row["Node"], row["Property"], 
org_name), axis=1) + out_df["label"] = '"' + input_df["Property"].fillna('') + '"' + out_df["description"] = '"' + input_df["Description"].fillna('').replace('"', '') + '"' + out_df["node"] = input_df["Node"].apply( + lambda x: f"<{base_tag}/{org_name}/{x.strip().lower().replace(' ', '_')}>") + out_df["is_cde"] = input_df["CDECode"].fillna("").apply(lambda x: str(x).split(".")[0]) + out_df["is_key"] = input_df["Key Property"].apply(lambda x: str(x)).replace(["False", "True"], ["", "true"]) + out_df["required_by"] = input_df["Required"].apply(lambda x: str(x)) + out_df["type"] = input_df["Type"].apply(lambda x: str(x)) + out_df["has_enum"] = input_df["Acceptable Values"].fillna("").apply(lambda x: x.split(",")) + out_df["cde_name"] = input_df["CDEFullName"].apply(lambda x: str(x)) + + + for _, row in out_df.iterrows(): + col_type = row["type"] + is_enum = True if len(row["has_enum"]) > 1 else False + out_df.at[_, "type"] = '"' + str(convert_gc_column_type(col_type, is_enum)) + '"' + out_df.at[_, "required_by"] = row["node"] if row["required_by"] == "required" else "" + out_df.at[_, "has_enum"] = ", ".join(row["has_enum"]) + + # Final output + final_cols = ["term", "label", "description", "node", "type", "required_by", "is_cde", "cde_name", "is_key", "has_enum"] return out_df[final_cols] @@ -132,6 +165,26 @@ def get_cde_id(entry: str) -> str: else: return entry.split(":")[1] if entry.split(":")[0] == "CDE" else "" + +def convert_gc_column_type(type:str, is_enum:bool) -> str: + """Convert GC column type to TTL-compatible format.""" + + if type in ["string", "list"]: + string_type = "string;enum" if is_enum else "string" + if type == "list": + out_type = f"array[{string_type}]" + else: + out_type = string_type + elif re.match(r'{"pattern"', type) is not None: + out_type = "string" + elif re.match(r'{"value_type":"number"', type) is not None: + out_type = "number" + else: + out_type = type + + return out_type + + def main(): args = get_args() @@ -144,6 +197,7 @@ def main(): req_tag = f"<{base_tag}/requiredBy>" cde_tag = f"<{base_tag}/isCDE>" key_tag = f"<{base_tag}/isKey>" + enum_tag = f"<{base_tag}/acceptableValues>" if args.mapping: print(f"Processing RDF triples precursor CSV [{args.mapping}]...") @@ -152,8 +206,12 @@ def main(): elif args.model: print(f"Processing model [{args.model}] to RDF triples precursor CSV...") - model_df = pd.read_csv(args.model, header=0, keep_default_na=True) - ttl_df = convert_schematic_model_to_ttl_format(model_df, args.org_name, base_tag) + sep = "," if Path(args.model).suffix == ".csv" else "\t" + model_df = pd.read_csv(args.model, header=0, keep_default_na=True, sep=sep) + if str(args.org_name).lower() in ["new_org", "mc2", "nf", "adkp", "htan"]: + ttl_df = convert_schematic_model_to_ttl_format(model_df, args.org_name, base_tag) + if str(args.org_name).lower() in ["gc", "crdc", "dh"]: + ttl_df = convert_crdc_model_to_ttl_format(model_df, args.org_name, base_tag) print(f"RDF triples will be built from the generated precursor dataframe!") out_file = "/".join([args.output, f"{args.org_name}.ttl"]) @@ -169,22 +227,27 @@ def main(): type_tag: row["type"], req_tag: row["required_by"], cde_tag: row["is_cde"], - key_tag: row["is_key"] + key_tag: row["is_key"], + enum_tag: row["has_enum"] } f.write(f"{ttl_dict['term']} {label_tag} {ttl_dict[label_tag]};"+"\n") f.write("\t"+f"{desc_tag} {ttl_dict[desc_tag]};"+"\n") f.write("\t"+f"{node_tag} {ttl_dict[node_tag]};"+"\n") - line_end = ";" if ttl_dict[req_tag] or ttl_dict[key_tag] or ttl_dict[cde_tag] else " ." 
+ line_end = ";" if ttl_dict[req_tag] or ttl_dict[key_tag] or ttl_dict[cde_tag] or ttl_dict[enum_tag] else " ." f.write("\t"+f"{type_tag} {ttl_dict[type_tag]}{line_end}"+"\n") if ttl_dict[req_tag]: - line_end = ";\n" if ttl_dict[key_tag] or ttl_dict[cde_tag] else " .\n" + line_end = ";\n" if ttl_dict[key_tag] or ttl_dict[cde_tag] or ttl_dict[enum_tag] else " .\n" f.write("\t"+f"{req_tag} {''.join([ttl_dict[req_tag], line_end])}") if ttl_dict[key_tag]: - line_end = ";\n" if ttl_dict[cde_tag] else " .\n" + line_end = ";\n" if ttl_dict[cde_tag] or ttl_dict[enum_tag] else " .\n" f.write("\t"+f"{key_tag} {''.join([ttl_dict[key_tag], line_end])}") if ttl_dict[cde_tag]: - f.write("\t"+f"{cde_tag} {' '.join([ttl_dict[cde_tag], '.'])}"+"\n") + line_end = ";\n" if ttl_dict[enum_tag] else " .\n" + f.write("\t"+f"{cde_tag} {''.join([ttl_dict[cde_tag], line_end])}") + if ttl_dict[enum_tag]: + line_end = " .\n" + f.write("\t"+f"{enum_tag} {''.join([ttl_dict[enum_tag], line_end])}") f.write("\n") print(f"Done ✅") From 280206888af10b4ab85729eee4cead63b788a6f2 Mon Sep 17 00:00:00 2001 From: Orion Banks <49208907+Bankso@users.noreply.github.com> Date: Wed, 10 Sep 2025 13:10:43 -0700 Subject: [PATCH 30/67] Update csv_to_ttl.py Retain CDE full name in description for CRDC model attributes --- utils/csv_to_ttl.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/utils/csv_to_ttl.py b/utils/csv_to_ttl.py index c17500a0..2f05b926 100644 --- a/utils/csv_to_ttl.py +++ b/utils/csv_to_ttl.py @@ -107,7 +107,8 @@ def convert_crdc_model_to_ttl_format(input_df: pd.DataFrame, org_name: str, base out_df["term"] = input_df.apply( lambda row: format_uri(base_tag, row["Node"], row["Property"], org_name), axis=1) out_df["label"] = '"' + input_df["Property"].fillna('') + '"' - out_df["description"] = '"' + input_df["Description"].fillna('').replace('"', '') + '"' + out_df["description"] = input_df["Description"].fillna("").replace('"', '') + out_df["cde_name"] = input_df["CDEFullName"].fillna("") out_df["node"] = input_df["Node"].apply( lambda x: f"<{base_tag}/{org_name}/{x.strip().lower().replace(' ', '_')}>") out_df["is_cde"] = input_df["CDECode"].fillna("").apply(lambda x: str(x).split(".")[0]) @@ -124,6 +125,7 @@ def convert_crdc_model_to_ttl_format(input_df: pd.DataFrame, org_name: str, base out_df.at[_, "type"] = '"' + str(convert_gc_column_type(col_type, is_enum)) + '"' out_df.at[_, "required_by"] = row["node"] if row["required_by"] == "required" else "" out_df.at[_, "has_enum"] = ", ".join(row["has_enum"]) + out_df.at[_, "description"] = '"' + (f'{row["cde_name"]}: ' if str(row["cde_name"]) != "nan" else "") + row["description"] + '"' # Final output final_cols = ["term", "label", "description", "node", "type", "required_by", "is_cde", "cde_name", "is_key", "has_enum"] From 1995a7babb6e06faa02bdf0cd78a9916b2c6893b Mon Sep 17 00:00:00 2001 From: Orion Banks <49208907+Bankso@users.noreply.github.com> Date: Wed, 10 Sep 2025 13:20:04 -0700 Subject: [PATCH 31/67] Update description and usage --- utils/csv_to_ttl.py | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/utils/csv_to_ttl.py b/utils/csv_to_ttl.py index 2f05b926..5d5950cf 100644 --- a/utils/csv_to_ttl.py +++ b/utils/csv_to_ttl.py @@ -1,9 +1,20 @@ """ csv_to_ttl.py -Converts a CSV with formatted ttl info to a ttl file +Converts a schematic data model CSV or CRDC data model TSV to RDF triples +Serialized triples to a ttl file ttl file can be used as a graph input for the arachne agent. 
+usage: csv_to_ttl.py [-h] [-m MODEL] [-p MAPPING] [-o OUTPUT] [-g ORG_NAME] [-b BASE_TAG] + +options: + -h, --help show this help message and exit + -m MODEL, --model MODEL Path to schematic or CRDC data model CSV + -p MAPPING, --mapping MAPPING Path to ttl source content file + -o OUTPUT, --output OUTPUT Path to folder where graph should be stored (Default: current directory) + -g ORG_NAME, --org_name ORG_NAME Abbreviation used to label the data model and determine how model should be processed (Default: 'new_org', schematic processing) + -b BASE_TAG, --base_tag BASE_TAG url applied to the beginning of internal tags (Default: 'http://syn.org') + author: orion.banks """ @@ -34,7 +45,7 @@ def get_args(): "-o", "--output", type=str, - help="Path to folder where graph should be stored", + help="Path to folder where graph should be stored (Default: current directory)", required=False, default=os.getcwd() ) @@ -42,7 +53,7 @@ def get_args(): "-g", "--org_name", type=str, - help="Abbreviation used to label the data model", + help="Abbreviation used to label the data model and determine how model should be processed (Default: 'new_org', schematic processing)", required=False, default="new_org" ) @@ -50,7 +61,7 @@ def get_args(): "-b", "--base_tag", type=str, - help="url applied to the beginning of internal tags", + help="url applied to the beginning of internal tags (Default: 'http://syn.org')", required=False, default="http://syn.org" ) @@ -101,7 +112,7 @@ def convert_schematic_model_to_ttl_format(input_df: pd.DataFrame, org_name: str, def convert_crdc_model_to_ttl_format(input_df: pd.DataFrame, org_name: str, base_tag: str) -> pd.DataFrame: - """Convert schematic model DataFrame to TTL format.""" + """Convert CRDC model DataFrame to TTL format.""" out_df = pd.DataFrame() out_df["term"] = input_df.apply( @@ -118,7 +129,6 @@ def convert_crdc_model_to_ttl_format(input_df: pd.DataFrame, org_name: str, base out_df["has_enum"] = input_df["Acceptable Values"].fillna("").apply(lambda x: x.split(",")) out_df["cde_name"] = input_df["CDEFullName"].apply(lambda x: str(x)) - for _, row in out_df.iterrows(): col_type = row["type"] is_enum = True if len(row["has_enum"]) > 1 else False @@ -127,7 +137,6 @@ def convert_crdc_model_to_ttl_format(input_df: pd.DataFrame, org_name: str, base out_df.at[_, "has_enum"] = ", ".join(row["has_enum"]) out_df.at[_, "description"] = '"' + (f'{row["cde_name"]}: ' if str(row["cde_name"]) != "nan" else "") + row["description"] + '"' - # Final output final_cols = ["term", "label", "description", "node", "type", "required_by", "is_cde", "cde_name", "is_key", "has_enum"] return out_df[final_cols] From d996e380b475d12d9c337e970e139d0ebc4c2a65 Mon Sep 17 00:00:00 2001 From: Orion Banks <49208907+Bankso@users.noreply.github.com> Date: Wed, 10 Sep 2025 13:21:19 -0700 Subject: [PATCH 32/67] Update model input description --- utils/csv_to_ttl.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/utils/csv_to_ttl.py b/utils/csv_to_ttl.py index 5d5950cf..52565527 100644 --- a/utils/csv_to_ttl.py +++ b/utils/csv_to_ttl.py @@ -9,7 +9,7 @@ options: -h, --help show this help message and exit - -m MODEL, --model MODEL Path to schematic or CRDC data model CSV + -m MODEL, --model MODEL Path to schematic model CSV or CRDC data model TSV -p MAPPING, --mapping MAPPING Path to ttl source content file -o OUTPUT, --output OUTPUT Path to folder where graph should be stored (Default: current directory) -g ORG_NAME, --org_name ORG_NAME Abbreviation used to label the data model and 
determine how model should be processed (Default: 'new_org', schematic processing) @@ -31,7 +31,7 @@ def get_args(): "-m", "--model", type=str, - help="Path to schematic or CRDC data model CSV", + help="Path to schematic model CSV or CRDC data model TSV", required=False ) parser.add_argument( From 481d1cc475426f7f09dc2efe3b26f89aa9afa12d Mon Sep 17 00:00:00 2001 From: Orion Banks <49208907+Bankso@users.noreply.github.com> Date: Wed, 10 Sep 2025 13:23:19 -0700 Subject: [PATCH 33/67] Don't include CDE name in processed CRDC DF cols --- utils/csv_to_ttl.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/csv_to_ttl.py b/utils/csv_to_ttl.py index 52565527..fa4a4bc9 100644 --- a/utils/csv_to_ttl.py +++ b/utils/csv_to_ttl.py @@ -137,7 +137,7 @@ def convert_crdc_model_to_ttl_format(input_df: pd.DataFrame, org_name: str, base out_df.at[_, "has_enum"] = ", ".join(row["has_enum"]) out_df.at[_, "description"] = '"' + (f'{row["cde_name"]}: ' if str(row["cde_name"]) != "nan" else "") + row["description"] + '"' - final_cols = ["term", "label", "description", "node", "type", "required_by", "is_cde", "cde_name", "is_key", "has_enum"] + final_cols = ["term", "label", "description", "node", "type", "required_by", "is_cde", "is_key", "has_enum"] return out_df[final_cols] From c679d09445170d82d3050690a60675d2b4d8475a Mon Sep 17 00:00:00 2001 From: Orion Banks <49208907+Bankso@users.noreply.github.com> Date: Wed, 10 Sep 2025 15:04:56 -0700 Subject: [PATCH 34/67] Output additional tag from model ingest functions Output the node name (if only one node is detected) or "all", which is used in main function to build the ttl filename. This is to ensure that separate ttl files are created when a directory of CSV or TSV files are processed via bash script --- utils/csv_to_ttl.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/utils/csv_to_ttl.py b/utils/csv_to_ttl.py index fa4a4bc9..5902ad1b 100644 --- a/utils/csv_to_ttl.py +++ b/utils/csv_to_ttl.py @@ -105,10 +105,12 @@ def convert_schematic_model_to_ttl_format(input_df: pd.DataFrame, org_name: str, out_df["label"] = '"' + out_df["label"].fillna('') + '"' out_df["description"] = '"' + out_df["description"].fillna('').replace('"', '') + '"' out_df["is_cde"] = out_df["is_cde"].fillna("") + + node_name = "all" if len(out_df["node"].unique()) > 1 else str(out_df["node"].unique()).split("/")[-1].split(">")[0] # Final output final_cols = ["term", "label", "description", "node", "type", "required_by", "is_cde", "is_key", "has_enum"] - return out_df[final_cols] + return out_df[final_cols], node_name def convert_crdc_model_to_ttl_format(input_df: pd.DataFrame, org_name: str, base_tag: str) -> pd.DataFrame: @@ -137,8 +139,10 @@ def convert_crdc_model_to_ttl_format(input_df: pd.DataFrame, org_name: str, base out_df.at[_, "has_enum"] = ", ".join(row["has_enum"]) out_df.at[_, "description"] = '"' + (f'{row["cde_name"]}: ' if str(row["cde_name"]) != "nan" else "") + row["description"] + '"' + node_name = "all" if len(out_df["node"].unique()) > 1 else str(out_df["node"].unique()).split("/")[-1].split(">")[0] + final_cols = ["term", "label", "description", "node", "type", "required_by", "is_cde", "is_key", "has_enum"] - return out_df[final_cols] + return out_df[final_cols], node_name def format_uri(base_tag:str, node:str, attribute:str, org_name:str) -> str: @@ -213,6 +217,7 @@ def main(): if args.mapping: print(f"Processing RDF triples precursor CSV [{args.mapping}]...") ttl_df = pd.read_csv(args.mapping, header=0, 
keep_default_na=False) + node_name = "mapped" print(f"RDF triples will be built from pre-cursor file!") elif args.model: @@ -220,12 +225,12 @@ def main(): sep = "," if Path(args.model).suffix == ".csv" else "\t" model_df = pd.read_csv(args.model, header=0, keep_default_na=True, sep=sep) if str(args.org_name).lower() in ["new_org", "mc2", "nf", "adkp", "htan"]: - ttl_df = convert_schematic_model_to_ttl_format(model_df, args.org_name, base_tag) + ttl_df, node_name = convert_schematic_model_to_ttl_format(model_df, args.org_name, base_tag) if str(args.org_name).lower() in ["gc", "crdc", "dh"]: - ttl_df = convert_crdc_model_to_ttl_format(model_df, args.org_name, base_tag) + ttl_df, node_name = convert_crdc_model_to_ttl_format(model_df, args.org_name, base_tag) print(f"RDF triples will be built from the generated precursor dataframe!") - out_file = "/".join([args.output, f"{args.org_name}.ttl"]) + out_file = "/".join([args.output, f"{args.org_name}_{node_name}.ttl"]) with open(out_file, "w+") as f: print(f"Building RDF triples and serializing to TTL...") From efdd19819139e0389378d0d3abc0bcb1f559afa1 Mon Sep 17 00:00:00 2001 From: Orion Banks <49208907+Bankso@users.noreply.github.com> Date: Wed, 10 Sep 2025 15:26:17 -0700 Subject: [PATCH 35/67] Create build_ttl_graphs.sh #Usage: bash build_ttl_graphs.sh [SOURCE_DIR] [FILE_EXTENSION] [ORG_NAME] This script will convert files of the given extension to TTL-formatted RDF via csv_to_ttl.py It will concatenate multiple TTLs to a single file, if multiple were provided in the source directory --- build_ttl_graphs.sh | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) create mode 100644 build_ttl_graphs.sh diff --git a/build_ttl_graphs.sh b/build_ttl_graphs.sh new file mode 100644 index 00000000..4a6872bf --- /dev/null +++ b/build_ttl_graphs.sh @@ -0,0 +1,28 @@ +#/usr/bin/bash + +#Usage: bash build_ttl_graphs.sh [SOURCE_DIR] [FILE_EXTENSION] [ORG_NAME] [VERSION] + +#This script will convert files of the given extension to TTL-formatted RDF via csv_to_ttl.py +#It will concatenate multiple TTLs to a single file, if multiple were provided in the source directory + + +dir="$1" +datatype="$2" +org="$3" +version="$4" +outdir="ttl_graphs" + +mkdir -p ./"$outdir" + +for file in "$dir"/*."$datatype"; do + if [ -f "$file" ]; then + python utils/csv_to_ttl.py -m "$file" -g "$org" -o "$outdir" -v "$version" + fi +done + +for ttl in "$outdir"/*.ttl; do + if [ -f "$ttl" ]; then + cat "$ttl" >> "$org"_all.ttl + mv "$org"_all.ttl "$outdir" + fi +done From 7ba7042e63948f750edab582578825526656022c Mon Sep 17 00:00:00 2001 From: Orion Banks <49208907+Bankso@users.noreply.github.com> Date: Wed, 10 Sep 2025 15:29:33 -0700 Subject: [PATCH 36/67] Add inputs, add version to output Added inputs for: version (version number) and reference_type (allows the user to override the automatic model type selection, which is based on org name) Added version tag to output filename --- utils/csv_to_ttl.py | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/utils/csv_to_ttl.py b/utils/csv_to_ttl.py index 5902ad1b..7e10f74e 100644 --- a/utils/csv_to_ttl.py +++ b/utils/csv_to_ttl.py @@ -53,9 +53,17 @@ def get_args(): "-g", "--org_name", type=str, - help="Abbreviation used to label the data model and determine how model should be processed (Default: 'new_org', schematic processing)", + help="Abbreviation used in the data model name and RDF tags. 
(Default: 'new_org')", required=False, default="new_org" + ) + parser.add_argument( + "-r", + "--reference_type", + type=str, + help="The type of data model reference used as a basis for the input. One of 'schematic' or 'crdc'. If no input is given, the reference type will be automatically determined based on the provided org name (Default: None)", + required=False, + default=None ) parser.add_argument( "-b", @@ -64,6 +72,14 @@ def get_args(): help="url applied to the beginning of internal tags (Default: 'http://syn.org')", required=False, default="http://syn.org" + ) + parser.add_argument( + "-v", + "--version", + type=str, + help="Version applied to output ttl filename (Default: None)", + required=False, + default=None ) return parser.parse_args() @@ -230,7 +246,7 @@ def main(): ttl_df, node_name = convert_crdc_model_to_ttl_format(model_df, args.org_name, base_tag) print(f"RDF triples will be built from the generated precursor dataframe!") - out_file = "/".join([args.output, f"{args.org_name}_{node_name}.ttl"]) + out_file = "/".join([args.output, f"{args.org_name}_{node_name}_{args.version}.ttl"]) with open(out_file, "w+") as f: print(f"Building RDF triples and serializing to TTL...") From 4d951d8c31c3ecbbfc351b77220ad312b4072dd7 Mon Sep 17 00:00:00 2001 From: Orion Banks <49208907+Bankso@users.noreply.github.com> Date: Wed, 10 Sep 2025 16:04:28 -0700 Subject: [PATCH 37/67] Modify function selection logic to use ref Add reference_type to code that selects which ingestion function to use. This will override any selection based on org_name --- utils/csv_to_ttl.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/utils/csv_to_ttl.py b/utils/csv_to_ttl.py index 7e10f74e..a5173eee 100644 --- a/utils/csv_to_ttl.py +++ b/utils/csv_to_ttl.py @@ -61,6 +61,7 @@ def get_args(): "-r", "--reference_type", type=str, + choices=["schematic", "crdc"], help="The type of data model reference used as a basis for the input. One of 'schematic' or 'crdc'. 
If no input is given, the reference type will be automatically determined based on the provided org name (Default: None)", required=False, default=None @@ -237,12 +238,20 @@ def main(): print(f"RDF triples will be built from pre-cursor file!") elif args.model: - print(f"Processing model [{args.model}] to RDF triples precursor CSV...") + print(f"Processing model [{args.model}] to RDF triples precursor dataframe...") sep = "," if Path(args.model).suffix == ".csv" else "\t" model_df = pd.read_csv(args.model, header=0, keep_default_na=True, sep=sep) - if str(args.org_name).lower() in ["new_org", "mc2", "nf", "adkp", "htan"]: + ref = args.reference_type + if ref is None: + if str(args.org_name).lower() in ["new_org", "mc2", "nf", "adkp", "htan"]: + ref = "schematic" + if str(args.org_name).lower() in ["gc", "crdc", "dh"]: + ref = "crdc" + if ref == "schematic": + print(f"Processing model based on schematic CSV specification...") ttl_df, node_name = convert_schematic_model_to_ttl_format(model_df, args.org_name, base_tag) - if str(args.org_name).lower() in ["gc", "crdc", "dh"]: + if ref == "crdc": + print(f"Processing model based on CRDC TSV specification...") ttl_df, node_name = convert_crdc_model_to_ttl_format(model_df, args.org_name, base_tag) print(f"RDF triples will be built from the generated precursor dataframe!") From b8a90a2ec873c3b80afc99345aad5a7fcb9011f7 Mon Sep 17 00:00:00 2001 From: Orion Banks <49208907+Bankso@users.noreply.github.com> Date: Wed, 10 Sep 2025 16:09:14 -0700 Subject: [PATCH 38/67] Add version, allow CSV and TSV Added version as an input option, which will be added to the output file name. Added code to select the input file separator type, based on the file extension --- utils/build_template_ttl.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/utils/build_template_ttl.py b/utils/build_template_ttl.py index 54140f26..31c996c3 100644 --- a/utils/build_template_ttl.py +++ b/utils/build_template_ttl.py @@ -46,6 +46,14 @@ def get_args(): help="The tag that will be used as a prefix in RDF", required=False, default="http://syn.org" + ) + parser.add_argument( + "-v", + "--version", + type=str, + help="Version applied to output ttl filename (Default: None)", + required=False, + default=None ) return parser.parse_args() @@ -67,9 +75,10 @@ def main(): print(f"Processing model [{args.template}] to template.ttl...") template_path = Path(args.template) template_name = template_path.stem - template_df = pd.read_csv(args.template, header=0, keep_default_na=True) + sep = "," if template_path.suffix == ".csv" else "\t" + template_df = pd.read_csv(args.template, header=0, keep_default_na=True, sep=sep) - out_file = "/".join([args.output, f"{args.org_name}_{template_name}.ttl"]) + out_file = "/".join([args.output, f"{args.org_name}_{template_name}_{args.version}.ttl"]) with open(out_file, "w+") as f: print(f"Building RDF triples and serializing to TTL...") From 16a6f5d203d3acfe267a9ad84c58211944196160 Mon Sep 17 00:00:00 2001 From: Orion Banks <49208907+Bankso@users.noreply.github.com> Date: Wed, 10 Sep 2025 16:28:39 -0700 Subject: [PATCH 39/67] Give 1.0.0 as default --- utils/build_template_ttl.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/utils/build_template_ttl.py b/utils/build_template_ttl.py index 31c996c3..bc2df9cd 100644 --- a/utils/build_template_ttl.py +++ b/utils/build_template_ttl.py @@ -51,9 +51,9 @@ def get_args(): "-v", "--version", type=str, - help="Version applied to output ttl filename (Default: None)", + 
help="Version applied to output ttl filename (Default: 1.0.0)", required=False, - default=None + default="1.0.0" ) return parser.parse_args() From c8eb704cfcae15e1e67d269fa1d53eb220e421c1 Mon Sep 17 00:00:00 2001 From: Orion Banks <49208907+Bankso@users.noreply.github.com> Date: Wed, 10 Sep 2025 16:29:00 -0700 Subject: [PATCH 40/67] Move file once all RDF has been added --- build_ttl_graphs.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/build_ttl_graphs.sh b/build_ttl_graphs.sh index 4a6872bf..8d2d6fde 100644 --- a/build_ttl_graphs.sh +++ b/build_ttl_graphs.sh @@ -23,6 +23,7 @@ done for ttl in "$outdir"/*.ttl; do if [ -f "$ttl" ]; then cat "$ttl" >> "$org"_all.ttl - mv "$org"_all.ttl "$outdir" fi done + +mv "$org"_all.ttl "$outdir" From 5c57047142469c0624a7e66776d46b4f60daa99b Mon Sep 17 00:00:00 2001 From: Orion Banks <49208907+Bankso@users.noreply.github.com> Date: Wed, 10 Sep 2025 16:33:35 -0700 Subject: [PATCH 41/67] Update naming conventions --- build_ttl_graphs.sh | 4 ++-- build_ttl_templates.sh | 29 +++++++++++++++++++++++++++++ 2 files changed, 31 insertions(+), 2 deletions(-) create mode 100644 build_ttl_templates.sh diff --git a/build_ttl_graphs.sh b/build_ttl_graphs.sh index 8d2d6fde..7263d8dd 100644 --- a/build_ttl_graphs.sh +++ b/build_ttl_graphs.sh @@ -22,8 +22,8 @@ done for ttl in "$outdir"/*.ttl; do if [ -f "$ttl" ]; then - cat "$ttl" >> "$org"_all.ttl + cat "$ttl" >> "$org"_"$version"_model.ttl fi done -mv "$org"_all.ttl "$outdir" +mv "$org"_"$version"_model.ttl "$outdir" diff --git a/build_ttl_templates.sh b/build_ttl_templates.sh new file mode 100644 index 00000000..35e343a8 --- /dev/null +++ b/build_ttl_templates.sh @@ -0,0 +1,29 @@ +#/usr/bin/bash + +#Usage: bash build_ttl_templates.sh [SOURCE_DIR] [FILE_EXTENSION] [ORG_NAME] [VERSION] + +#This script will convert files of the given extension to TTL-formatted template specifications via build_template_ttl.py +#It will concatenate multiple TTLs to a single file, if multiple were provided in the source directory + + +dir="$1" +datatype="$2" +org="$3" +version="$4" +outdir="ttl_templates" + +mkdir -p ./"$outdir" + +for file in "$dir"/*."$datatype"; do + if [ -f "$file" ]; then + python utils/build_template_ttl.py -t "$file" -g "$org" -o "$outdir" -v "$version" + fi +done + +for ttl in "$outdir"/*.ttl; do + if [ -f "$ttl" ]; then + cat "$ttl" >> "$org"_"$version"_all_templates.ttl + fi +done + +mv "$org"_"$version"_all_templates.ttl "$outdir" From 628d390b3dd3a12fe1b823c74f9bc052f67c1486 Mon Sep 17 00:00:00 2001 From: Orion Banks <49208907+Bankso@users.noreply.github.com> Date: Wed, 10 Sep 2025 16:44:41 -0700 Subject: [PATCH 42/67] Update naming for output directory --- build_ttl_graphs.sh | 2 +- build_ttl_templates.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/build_ttl_graphs.sh b/build_ttl_graphs.sh index 7263d8dd..4069cb5b 100644 --- a/build_ttl_graphs.sh +++ b/build_ttl_graphs.sh @@ -10,7 +10,7 @@ dir="$1" datatype="$2" org="$3" version="$4" -outdir="ttl_graphs" +outdir="$org"_ttl_graphs mkdir -p ./"$outdir" diff --git a/build_ttl_templates.sh b/build_ttl_templates.sh index 35e343a8..5d020adb 100644 --- a/build_ttl_templates.sh +++ b/build_ttl_templates.sh @@ -10,7 +10,7 @@ dir="$1" datatype="$2" org="$3" version="$4" -outdir="ttl_templates" +outdir="$org"_ttl_templates mkdir -p ./"$outdir" From 8e09dc680178860f720cb6ebf3e1679a267ffbe2 Mon Sep 17 00:00:00 2001 From: Orion Banks <49208907+Bankso@users.noreply.github.com> Date: Wed, 10 Sep 2025 16:54:50 -0700 Subject: 
[PATCH 43/67] Parse name and version from filename For GC manifest TSVs --- utils/build_template_ttl.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/utils/build_template_ttl.py b/utils/build_template_ttl.py index bc2df9cd..c972b7e8 100644 --- a/utils/build_template_ttl.py +++ b/utils/build_template_ttl.py @@ -70,6 +70,7 @@ def main(): prefix_tag = args.tag_prefix conform_tag = "" int_tag = "" + version = args.version if args.template: print(f"Processing model [{args.template}] to template.ttl...") @@ -78,7 +79,11 @@ def main(): sep = "," if template_path.suffix == ".csv" else "\t" template_df = pd.read_csv(args.template, header=0, keep_default_na=True, sep=sep) - out_file = "/".join([args.output, f"{args.org_name}_{template_name}_{args.version}.ttl"]) + if template_name.startswith("GC_Data_Loading_Template"): + template_name = template_name.split("_")[-2] + version = template_name.split("_")[-1] + + out_file = "/".join([args.output, f"{args.org_name}_{template_name}_{version}.ttl"]) with open(out_file, "w+") as f: print(f"Building RDF triples and serializing to TTL...") From d100205172d96e6dfffc1c87a3ac084870d46e7e Mon Sep 17 00:00:00 2001 From: Orion Banks <49208907+Bankso@users.noreply.github.com> Date: Wed, 10 Sep 2025 17:09:14 -0700 Subject: [PATCH 44/67] Update description --- utils/csv_to_ttl.py | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/utils/csv_to_ttl.py b/utils/csv_to_ttl.py index a5173eee..3774880e 100644 --- a/utils/csv_to_ttl.py +++ b/utils/csv_to_ttl.py @@ -5,15 +5,25 @@ Serialized triples to a ttl file ttl file can be used as a graph input for the arachne agent. -usage: csv_to_ttl.py [-h] [-m MODEL] [-p MAPPING] [-o OUTPUT] [-g ORG_NAME] [-b BASE_TAG] +usage: csv_to_ttl.py [-h] [-m MODEL] [-p MAPPING] [-o OUTPUT] [-g ORG_NAME] [-r {schematic,crdc}] [-b BASE_TAG] [-v VERSION] options: - -h, --help show this help message and exit - -m MODEL, --model MODEL Path to schematic model CSV or CRDC data model TSV - -p MAPPING, --mapping MAPPING Path to ttl source content file - -o OUTPUT, --output OUTPUT Path to folder where graph should be stored (Default: current directory) - -g ORG_NAME, --org_name ORG_NAME Abbreviation used to label the data model and determine how model should be processed (Default: 'new_org', schematic processing) - -b BASE_TAG, --base_tag BASE_TAG url applied to the beginning of internal tags (Default: 'http://syn.org') + -h, --help show this help message and exit + -m MODEL, --model MODEL + Path to schematic model CSV or CRDC data model TSV + -p MAPPING, --mapping MAPPING + Path to ttl source content file + -o OUTPUT, --output OUTPUT + Path to folder where graph should be stored (Default: current directory) + -g ORG_NAME, --org_name ORG_NAME + Abbreviation used in the data model name and RDF tags. (Default: 'new_org') + -r {schematic,crdc}, --reference_type {schematic,crdc} + The type of data model reference used as a basis for the input. One of 'schematic' or 'crdc'. 
If no input is given, the reference type will be automatically determined based on + the provided org name (Default: None) + -b BASE_TAG, --base_tag BASE_TAG + url applied to the beginning of internal tags (Default: 'http://syn.org') + -v VERSION, --version VERSION + Version applied to output ttl filename (Default: None) author: orion.banks """ From b8b137252da4e9ce6304d8b4e5224aa944987c0d Mon Sep 17 00:00:00 2001 From: Orion Banks <49208907+Bankso@users.noreply.github.com> Date: Wed, 10 Sep 2025 17:10:07 -0700 Subject: [PATCH 45/67] Update description --- utils/build_template_ttl.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/utils/build_template_ttl.py b/utils/build_template_ttl.py index c972b7e8..43218d15 100644 --- a/utils/build_template_ttl.py +++ b/utils/build_template_ttl.py @@ -4,6 +4,21 @@ Converts a metadata template CSV info to a ttl file defining the template. ttl file can be used as input for the arachne agent and would be available as a target. +usage: build_template_ttl.py [-h] [-t TEMPLATE] [-o OUTPUT] [-g ORG_NAME] [-p TAG_PREFIX] [-v VERSION] + +options: + -h, --help show this help message and exit + -t TEMPLATE, --template TEMPLATE + Path to metadata template CSV + -o OUTPUT, --output OUTPUT + Path to folder where graph should be stored + -g ORG_NAME, --org_name ORG_NAME + Abbreviation for org, used in RDF prefixes + -p TAG_PREFIX, --tag_prefix TAG_PREFIX + The tag that will be used as a prefix in RDF + -v VERSION, --version VERSION + Version applied to output ttl filename (Default: 1.0.0) + author: orion.banks """ From 8c8301d844fa3141e05a0de3283b2b2fdaf9f1e3 Mon Sep 17 00:00:00 2001 From: Orion Banks <49208907+Bankso@users.noreply.github.com> Date: Thu, 11 Sep 2025 12:58:46 -0700 Subject: [PATCH 46/67] Add missing " ." 
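
Turtle requires every statement to be terminated with " ."; without it, the
defaultValue line makes the generated template TTL unparseable. A minimal
illustration of the corrected output, assuming the default prefix tag and a
hypothetical "Study" template for an org named "mc2":

    <http://syn.org/mc2/Study/Component> <http://syn.org/defaultValue> "Study" .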
--- utils/build_template_ttl.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/build_template_ttl.py b/utils/build_template_ttl.py index 43218d15..5a49550d 100644 --- a/utils/build_template_ttl.py +++ b/utils/build_template_ttl.py @@ -114,7 +114,7 @@ def main(): clean_col = format_uri(col) col_uuid = uuid4() if col in ["Component", "type"]: - f.write(f'<{prefix_tag}/{args.org_name}/{template_name}/{clean_col}> <{prefix_tag}/defaultValue> "{template_name}"'+"\n") + f.write(f'<{prefix_tag}/{args.org_name}/{template_name}/{clean_col}> <{prefix_tag}/defaultValue> "{template_name}" .'+"\n") f.write(f"<{prefix_tag}/{args.org_name}/{template_name}> <{prefix_tag}/hasColumn> <{prefix_tag}/{args.org_name}/{template_name}/{clean_col}> ."+"\n") f.write(f"<{prefix_tag}/{col_uuid}>a <{prefix_tag}/ColumnPosition> ;"+"\n") f.write(f"<{prefix_tag}/template> <{prefix_tag}/{args.org_name}/{template_name}> ;"+"\n") From 4bd93d28a04c6e4e26551c0f3455f16fc0448ca2 Mon Sep 17 00:00:00 2001 From: Orion Banks <49208907+Bankso@users.noreply.github.com> Date: Thu, 11 Sep 2025 13:12:26 -0700 Subject: [PATCH 47/67] Fix Valid Value parsing to build list --- utils/csv_to_ttl.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/csv_to_ttl.py b/utils/csv_to_ttl.py index 3774880e..f93e6cd9 100644 --- a/utils/csv_to_ttl.py +++ b/utils/csv_to_ttl.py @@ -124,7 +124,7 @@ def convert_schematic_model_to_ttl_format(input_df: pd.DataFrame, org_name: str, out_df.at[_, "node"] = row["Resolved_Node_URI"] out_df.at[_, "is_key"] = "true" if str(attribute_rows.loc[row["label"], "Validation Rules"]).strip().lower() == "unique" else "" out_df.at[_, "required_by"] = row["Resolved_Node_URI"] if str(attribute_rows.loc[row["label"], "Required"]).strip().lower() == "true" else "" - out_df.at[_, "has_enum"] = str(attribute_rows.loc[row["label"], "Required"]) if str(attribute_rows.loc[row["label"], "Valid Values"]) != "nan" else "" + out_df.at[_, "has_enum"] = '["' + '", "'.join(attribute_rows.loc[row["label"], "Valid Values"].split(", ")) + '"]' if str(attribute_rows.loc[row["label"], "Valid Values"]) != "nan" else "" col_type = attribute_rows.loc[row["label"], "columnType"] is_enum = True if str(attribute_rows.loc[row["label"], "Valid Values"]) != "nan" else False out_df.at[_, "type"] = '"' + str(convert_schematic_column_type(col_type, is_enum)) + '"' From 8657a5e1b97a9635ae2e90c76f997bf5f0a55913 Mon Sep 17 00:00:00 2001 From: Orion Banks <49208907+Bankso@users.noreply.github.com> Date: Thu, 11 Sep 2025 13:31:09 -0700 Subject: [PATCH 48/67] Adjust quoting to address read errors --- utils/csv_to_ttl.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/utils/csv_to_ttl.py b/utils/csv_to_ttl.py index f93e6cd9..2c848628 100644 --- a/utils/csv_to_ttl.py +++ b/utils/csv_to_ttl.py @@ -124,13 +124,13 @@ def convert_schematic_model_to_ttl_format(input_df: pd.DataFrame, org_name: str, out_df.at[_, "node"] = row["Resolved_Node_URI"] out_df.at[_, "is_key"] = "true" if str(attribute_rows.loc[row["label"], "Validation Rules"]).strip().lower() == "unique" else "" out_df.at[_, "required_by"] = row["Resolved_Node_URI"] if str(attribute_rows.loc[row["label"], "Required"]).strip().lower() == "true" else "" - out_df.at[_, "has_enum"] = '["' + '", "'.join(attribute_rows.loc[row["label"], "Valid Values"].split(", ")) + '"]' if str(attribute_rows.loc[row["label"], "Valid Values"]) != "nan" else "" + out_df.at[_, "has_enum"] = '"[' + ", ".join(attribute_rows.loc[row["label"], "Valid Values"].split(", 
")) + ']"' if str(attribute_rows.loc[row["label"], "Valid Values"]) != "nan" else "" col_type = attribute_rows.loc[row["label"], "columnType"] is_enum = True if str(attribute_rows.loc[row["label"], "Valid Values"]) != "nan" else False out_df.at[_, "type"] = '"' + str(convert_schematic_column_type(col_type, is_enum)) + '"' out_df["label"] = '"' + out_df["label"].fillna('') + '"' - out_df["description"] = '"' + out_df["description"].fillna('').replace('"', '') + '"' + out_df["description"] = '"' + out_df["description"].fillna('').apply(lambda x: x.replace('"', '')) + '"' out_df["is_cde"] = out_df["is_cde"].fillna("") node_name = "all" if len(out_df["node"].unique()) > 1 else str(out_df["node"].unique()).split("/")[-1].split(">")[0] From d157e7be20306ee1f2827d781d175f21e78c9eb4 Mon Sep 17 00:00:00 2001 From: Orion Banks <49208907+Bankso@users.noreply.github.com> Date: Thu, 11 Sep 2025 15:30:48 -0700 Subject: [PATCH 49/67] Fix quoting and don't write CDE if TBD --- utils/csv_to_ttl.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/utils/csv_to_ttl.py b/utils/csv_to_ttl.py index 2c848628..63d36227 100644 --- a/utils/csv_to_ttl.py +++ b/utils/csv_to_ttl.py @@ -147,7 +147,7 @@ def convert_crdc_model_to_ttl_format(input_df: pd.DataFrame, org_name: str, base out_df["term"] = input_df.apply( lambda row: format_uri(base_tag, row["Node"], row["Property"], org_name), axis=1) out_df["label"] = '"' + input_df["Property"].fillna('') + '"' - out_df["description"] = input_df["Description"].fillna("").replace('"', '') + out_df["description"] = input_df["Description"].fillna("") out_df["cde_name"] = input_df["CDEFullName"].fillna("") out_df["node"] = input_df["Node"].apply( lambda x: f"<{base_tag}/{org_name}/{x.strip().lower().replace(' ', '_')}>") @@ -163,9 +163,9 @@ def convert_crdc_model_to_ttl_format(input_df: pd.DataFrame, org_name: str, base is_enum = True if len(row["has_enum"]) > 1 else False out_df.at[_, "type"] = '"' + str(convert_gc_column_type(col_type, is_enum)) + '"' out_df.at[_, "required_by"] = row["node"] if row["required_by"] == "required" else "" - out_df.at[_, "has_enum"] = ", ".join(row["has_enum"]) - out_df.at[_, "description"] = '"' + (f'{row["cde_name"]}: ' if str(row["cde_name"]) != "nan" else "") + row["description"] + '"' - + out_df.at[_, "has_enum"] = (''.join(['"[', ', '.join(row["has_enum"]).replace('"', '').replace('[', '').replace(']', ''), ']"'])) if is_enum else "" + out_df.at[_, "description"] = '"' + ''.join([f'{str(row["cde_name"])}: ' if str(row["cde_name"]) != "nan" else "", row["description"]]).replace('"', '') + '"' + node_name = "all" if len(out_df["node"].unique()) > 1 else str(out_df["node"].unique()).split("/")[-1].split(">")[0] final_cols = ["term", "label", "description", "node", "type", "required_by", "is_cde", "is_key", "has_enum"] @@ -288,12 +288,12 @@ def main(): line_end = ";" if ttl_dict[req_tag] or ttl_dict[key_tag] or ttl_dict[cde_tag] or ttl_dict[enum_tag] else " ." 
f.write("\t"+f"{type_tag} {ttl_dict[type_tag]}{line_end}"+"\n") if ttl_dict[req_tag]: - line_end = ";\n" if ttl_dict[key_tag] or ttl_dict[cde_tag] or ttl_dict[enum_tag] else " .\n" + line_end = ";\n" if ttl_dict[key_tag] or (ttl_dict[cde_tag] and ttl_dict[cde_tag] != "TBD") or ttl_dict[enum_tag] else " .\n" f.write("\t"+f"{req_tag} {''.join([ttl_dict[req_tag], line_end])}") if ttl_dict[key_tag]: - line_end = ";\n" if ttl_dict[cde_tag] or ttl_dict[enum_tag] else " .\n" + line_end = ";\n" if (ttl_dict[cde_tag] and ttl_dict[cde_tag] != "TBD") or ttl_dict[enum_tag] else " .\n" f.write("\t"+f"{key_tag} {''.join([ttl_dict[key_tag], line_end])}") - if ttl_dict[cde_tag]: + if ttl_dict[cde_tag] and ttl_dict[cde_tag] != "TBD": line_end = ";\n" if ttl_dict[enum_tag] else " .\n" f.write("\t"+f"{cde_tag} {''.join([ttl_dict[cde_tag], line_end])}") if ttl_dict[enum_tag]: From 47b79f0e85c5b231f017c6442e8bc593ddd46cb4 Mon Sep 17 00:00:00 2001 From: Orion Banks <49208907+Bankso@users.noreply.github.com> Date: Fri, 12 Sep 2025 13:46:01 -0700 Subject: [PATCH 50/67] Remove "default_literal" reference --- utils/process_arachne_mapping.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/utils/process_arachne_mapping.py b/utils/process_arachne_mapping.py index fd8b346c..93dff29e 100644 --- a/utils/process_arachne_mapping.py +++ b/utils/process_arachne_mapping.py @@ -63,8 +63,7 @@ def transform_csv_to_tsv(mapping_config): if mapping["value"] is not None: if "map" in mapping["value"]: value_map = mapping["value"]["map"] - default = mapping["value"]["default_literal"] if mapping["value"]["default_literal"] else "" - transformed_df[target_col] = df[source_col].map(value_map).fillna(default) + transformed_df[target_col] = df[source_col].map(value_map).fillna("") else: constant_value = mapping["value"] transformed_df[target_col] = [constant_value] * df.shape[0] From eda3a4c335d5108136a4975f40cecefdcc2dbbd7 Mon Sep 17 00:00:00 2001 From: Orion Banks <49208907+Bankso@users.noreply.github.com> Date: Tue, 23 Sep 2025 14:10:03 -0700 Subject: [PATCH 51/67] Add functions to render model Model can be rendered as a multidirectional graph in an interactive window (by passing arg -ig) OR it can be displayed as a table diagram and saved as a PNG (by passing arg -bg) --- utils/csv_to_ttl.py | 48 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/utils/csv_to_ttl.py b/utils/csv_to_ttl.py index 63d36227..0693ae3f 100644 --- a/utils/csv_to_ttl.py +++ b/utils/csv_to_ttl.py @@ -24,16 +24,30 @@ url applied to the beginning of internal tags (Default: 'http://syn.org') -v VERSION, --version VERSION Version applied to output ttl filename (Default: None) + -bg, --build_graph + Boolean. Pass this flag to generate a PNG of the input model (Default: None) + -ig, --interactive_graph + Boolean. 
Pass this flag to generate an interactive visualization of the input model (Default: None) author: orion.banks """ import argparse +import io +from IPython.display import display, Image +import matplotlib.pyplot as plt +import networkx as nx import os import pandas as pd from pathlib import Path +from PIL import Image +import pydot +import rdflib +from rdflib.extras.external_graph_libs import rdflib_to_networkx_multidigraph +from rdflib.tools import rdf2dot import re + def get_args(): """Set up command-line interface and get arguments.""" parser = argparse.ArgumentParser() @@ -91,6 +105,22 @@ def get_args(): help="Version applied to output ttl filename (Default: None)", required=False, default=None + ) + parser.add_argument( + "-bg", + "--build_graph", + help="Boolean. Pass this flag to generate a PNG of the input model (Default: None)", + action="store_true", + required=False, + default=None + ) + parser.add_argument( + "-ig", + "--interactive_graph", + help="Boolean. Pass this flag to generate an interactive visualization of the input model (Default: None)", + action="store_true", + required=False, + default=None ) return parser.parse_args() @@ -304,5 +334,23 @@ def main(): print(f"Done ✅") print(f"{out_file} was written with {len(ttl_df)} triples!") + image_path = "/".join([args.output, f"{args.org_name}_{node_name}_{args.version}.png"]) + g = rdflib.Graph() + model_graph = g.parse(out_file, format="turtle") + + if args.build_graph is not None: + dot_stream = io.StringIO() + rdf2dot.rdf2dot(model_graph, dot_stream, opts={display}) + dot_string = dot_stream.getvalue() + dg = pydot.graph_from_dot_data(dot_string) + dg[0].write_png(image_path) + image = Image.open(image_path) + image.show() + + if args.interactive_graph is not None: + model_graph = rdflib_to_networkx_multidigraph(model_graph) + nx.draw_networkx(model_graph, arrows=False, with_labels=True, font_size=4, node_size=200) + plt.show() + if __name__ == "__main__": main() From 352b96e1c64c98d3cfb00e7c2aa84d35b23445f8 Mon Sep 17 00:00:00 2001 From: Orion Banks <49208907+Bankso@users.noreply.github.com> Date: Tue, 23 Sep 2025 17:33:32 -0700 Subject: [PATCH 52/67] Add code to extract schematic model components Can now build visualizations of graphs based on selected components of a schematic model, by passing one or more component names to arg "-s" at runtime --- utils/csv_to_ttl.py | 67 ++++++++++++++++++++++++++++++++++++++------- 1 file changed, 57 insertions(+), 10 deletions(-) diff --git a/utils/csv_to_ttl.py b/utils/csv_to_ttl.py index 0693ae3f..f155aafd 100644 --- a/utils/csv_to_ttl.py +++ b/utils/csv_to_ttl.py @@ -121,6 +121,14 @@ def get_args(): action="store_true", required=False, default=None + ) + parser.add_argument( + "-s", + "--subset", + type=str, + help="(For schematic models only) The name of one or more data model components to extract from model (Default: None)", + required=False, + default=None ) return parser.parse_args() @@ -256,6 +264,24 @@ def convert_gc_column_type(type:str, is_enum:bool) -> str: return out_type +def subset_model(model_df: pd.DataFrame, nodes: str) -> pd.DataFrame: + + node_subset_df = pd.DataFrame() + + nodes = nodes.split(", ") + + model_df = model_df.set_index("Attribute") + + for node in nodes: + node_attributes = str(model_df.loc[node, "DependsOn"]).split(", ") + node_attributes.append(node) + node_rows = model_df.loc[node_attributes] + node_subset_df = pd.concat([node_subset_df, node_rows]) + + node_subset_df = node_subset_df.drop_duplicates().reset_index() + + return node_subset_df + 
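# Illustrative only: a minimal sketch of the behavior expected from subset_model,
# assuming a hypothetical three-row schematic model in which "Study" is a
# component that depends on "Study Name":
#
#   toy = pd.DataFrame({
#       "Attribute": ["Study", "Component", "Study Name"],
#       "DependsOn": ["Component, Study Name", "", ""],
#   })
#   subset_model(toy, "Study")
#   # -> the "Study" row plus the rows for "Component" and "Study Name",
#   #    deduplicated, with "Attribute" restored as a regular column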
def main(): @@ -289,6 +315,8 @@ def main(): ref = "crdc" if ref == "schematic": print(f"Processing model based on schematic CSV specification...") + if args.subset is not None: + model_df = subset_model(model_df, f"{args.subset}") ttl_df, node_name = convert_schematic_model_to_ttl_format(model_df, args.org_name, base_tag) if ref == "crdc": print(f"Processing model based on CRDC TSV specification...") @@ -333,24 +361,43 @@ def main(): print(f"Done ✅") print(f"{out_file} was written with {len(ttl_df)} triples!") - - image_path = "/".join([args.output, f"{args.org_name}_{node_name}_{args.version}.png"]) + g = rdflib.Graph() model_graph = g.parse(out_file, format="turtle") - + image_path = "/".join([args.output, f"{args.org_name}_{node_name}_{args.version}.png"]) + if args.build_graph is not None: - dot_stream = io.StringIO() - rdf2dot.rdf2dot(model_graph, dot_stream, opts={display}) - dot_string = dot_stream.getvalue() - dg = pydot.graph_from_dot_data(dot_string) - dg[0].write_png(image_path) - image = Image.open(image_path) - image.show() + retry = 0 + image = None + while image is None: + if retry == 1: + value_tag = rdflib.URIRef("http://syn.org/acceptableValues") + model_graph = model_graph.remove((None, value_tag, None)) + dot_stream = io.StringIO() + rdf2dot.rdf2dot(model_graph, dot_stream) + dot_string = dot_stream.getvalue() + dg = pydot.graph_from_dot_data(dot_string) + try: + dg[0].write_png(image_path) + image = Image.open(image_path) + image.show() + print(f"Success! Graph visualization is available at {image_path}") + image = True + except: + print("Failed to generate a visualization of the graph. Retrying with fewer triples...") + retry += 1 + + if retry == 2: + print("Failed to generate a visualization of the graph. Skipping.") + break if args.interactive_graph is not None: + print("Generating interactive plot...") model_graph = rdflib_to_networkx_multidigraph(model_graph) nx.draw_networkx(model_graph, arrows=False, with_labels=True, font_size=4, node_size=200) plt.show() + + print(f"Done ✅") if __name__ == "__main__": main() From 0df5b93a4eaa9f1de02e9f73aeb8cdbb5bebb15a Mon Sep 17 00:00:00 2001 From: Orion Banks <49208907+Bankso@users.noreply.github.com> Date: Tue, 30 Sep 2025 17:28:26 -0700 Subject: [PATCH 53/67] Refactor to use prefixes in RDF --- utils/csv_to_ttl.py | 125 ++++++++++++++++++++++++++++---------------- 1 file changed, 79 insertions(+), 46 deletions(-) diff --git a/utils/csv_to_ttl.py b/utils/csv_to_ttl.py index f155aafd..0a43fa44 100644 --- a/utils/csv_to_ttl.py +++ b/utils/csv_to_ttl.py @@ -158,7 +158,7 @@ def convert_schematic_model_to_ttl_format(input_df: pd.DataFrame, org_name: str, # Step 4: Info extraction and TTL-compatible column formatting for _, row in out_df.iterrows(): out_df.at[_, "description"] = attribute_rows.loc[row["label"], "Description"] - out_df.at[_, "is_cde"] = get_cde_id(str(attribute_rows.loc[row["label"], "Properties"])) + out_df.at[_, "maps_to"] = get_reference_id(str(attribute_rows.loc[row["label"], "Properties"])) out_df.at[_, "node"] = row["Resolved_Node_URI"] out_df.at[_, "is_key"] = "true" if str(attribute_rows.loc[row["label"], "Validation Rules"]).strip().lower() == "unique" else "" out_df.at[_, "required_by"] = row["Resolved_Node_URI"] if str(attribute_rows.loc[row["label"], "Required"]).strip().lower() == "true" else "" @@ -169,12 +169,12 @@ def convert_schematic_model_to_ttl_format(input_df: pd.DataFrame, org_name: str, out_df["label"] = '"' + out_df["label"].fillna('') + '"' out_df["description"] = '"' + 
out_df["description"].fillna('').apply(lambda x: x.replace('"', '')) + '"' - out_df["is_cde"] = out_df["is_cde"].fillna("") + out_df["maps_to"] = out_df["maps_to"].fillna("") node_name = "all" if len(out_df["node"].unique()) > 1 else str(out_df["node"].unique()).split("/")[-1].split(">")[0] # Final output - final_cols = ["term", "label", "description", "node", "type", "required_by", "is_cde", "is_key", "has_enum"] + final_cols = ["term", "label", "description", "node", "type", "required_by", "maps_to", "is_key", "has_enum"] return out_df[final_cols], node_name @@ -186,10 +186,9 @@ def convert_crdc_model_to_ttl_format(input_df: pd.DataFrame, org_name: str, base lambda row: format_uri(base_tag, row["Node"], row["Property"], org_name), axis=1) out_df["label"] = '"' + input_df["Property"].fillna('') + '"' out_df["description"] = input_df["Description"].fillna("") - out_df["cde_name"] = input_df["CDEFullName"].fillna("") out_df["node"] = input_df["Node"].apply( lambda x: f"<{base_tag}/{org_name}/{x.strip().lower().replace(' ', '_')}>") - out_df["is_cde"] = input_df["CDECode"].fillna("").apply(lambda x: str(x).split(".")[0]) + out_df["maps_to"] = input_df["CDECode"].fillna("").apply(lambda x: str(x).split(".")[0]) out_df["is_key"] = input_df["Key Property"].apply(lambda x: str(x)).replace(["False", "True"], ["", "true"]) out_df["required_by"] = input_df["Required"].apply(lambda x: str(x)) out_df["type"] = input_df["Type"].apply(lambda x: str(x)) @@ -206,7 +205,7 @@ def convert_crdc_model_to_ttl_format(input_df: pd.DataFrame, org_name: str, base node_name = "all" if len(out_df["node"].unique()) > 1 else str(out_df["node"].unique()).split("/")[-1].split(">")[0] - final_cols = ["term", "label", "description", "node", "type", "required_by", "is_cde", "is_key", "has_enum"] + final_cols = ["term", "label", "description", "node", "type", "required_by", "maps_to", "is_key", "has_enum"] return out_df[final_cols], node_name @@ -234,16 +233,14 @@ def convert_schematic_column_type(type:str, is_enum:bool) -> str: return out_type -def get_cde_id(entry: str) -> str: +def get_reference_id(entry: str) -> list[tuple[str, str]]: """Extract CDE ID from Properties entry.""" entry = entry.split(", ") if len(entry.split(", ")) > 1 else entry if type(entry) == list: - for ref in entry: - if ref.split(":")[0] == "CDE": - return ref.split(":")[1] + return ", ".join([f"{ref.split(':')[0]}:{ref.split(':')[1]}" for ref in entry]) else: - return entry.split(":")[1] if entry.split(":")[0] == "CDE" else "" + return f"{entry.split(':')[0]}:{entry.split(':')[1]}" if len(entry.split(":")) > 1 else "" def convert_gc_column_type(type:str, is_enum:bool) -> str: @@ -288,14 +285,29 @@ def main(): args = get_args() base_tag = args.base_tag - label_tag = "" - desc_tag = "" - node_tag = f"<{base_tag}/node>" - type_tag = f"<{base_tag}/type>" - req_tag = f"<{base_tag}/requiredBy>" - cde_tag = f"<{base_tag}/isCDE>" - key_tag = f"<{base_tag}/isKey>" - enum_tag = f"<{base_tag}/acceptableValues>" + + label = "label" + desc = "desc" + node = "node" + type = "type" + reqby = "reqby" + key = "key" + enum = "enum" + duo = "duo" + cde = "cde" + + tag_dict = { + label : "", + desc : "", + node : f"<{base_tag}/node/>", + type : f"<{base_tag}/type/>", + reqby : f"<{base_tag}/requiredBy/>", + key : f"<{base_tag}/isKey/>", + enum : f"<{base_tag}/acceptableValues/>", + duo : f"", + cde : f"<{base_tag}/isCDE/>", + } + if args.mapping: print(f"Processing RDF triples precursor CSV [{args.mapping}]...") @@ -309,7 +321,7 @@ def main(): model_df = 
pd.read_csv(args.model, header=0, keep_default_na=True, sep=sep) ref = args.reference_type if ref is None: - if str(args.org_name).lower() in ["new_org", "mc2", "nf", "adkp", "htan"]: + if str(args.org_name).lower() in ["new_org", "mc2", "nf", "adkp", "htan", "ada"]: ref = "schematic" if str(args.org_name).lower() in ["gc", "crdc", "dh"]: ref = "crdc" @@ -325,39 +337,60 @@ def main(): out_file = "/".join([args.output, f"{args.org_name}_{node_name}_{args.version}.ttl"]) + prefix_list = [] + with open(out_file, "w+") as f: print(f"Building RDF triples and serializing to TTL...") for _, row in ttl_df.iterrows(): + props = None ttl_dict = { - "term": row["term"], - label_tag: row["label"], - desc_tag: row["description"], - node_tag: row["node"], - type_tag: row["type"], - req_tag: row["required_by"], - cde_tag: row["is_cde"], - key_tag: row["is_key"], - enum_tag: row["has_enum"] + label: row["label"], + desc: row["description"], + node: row["node"], + type: row["type"], + reqby: row["required_by"], + key: row["is_key"], + enum: row["has_enum"] } - f.write(f"{ttl_dict['term']} {label_tag} {ttl_dict[label_tag]};"+"\n") - f.write("\t"+f"{desc_tag} {ttl_dict[desc_tag]};"+"\n") - f.write("\t"+f"{node_tag} {ttl_dict[node_tag]};"+"\n") - line_end = ";" if ttl_dict[req_tag] or ttl_dict[key_tag] or ttl_dict[cde_tag] or ttl_dict[enum_tag] else " ." - f.write("\t"+f"{type_tag} {ttl_dict[type_tag]}{line_end}"+"\n") - if ttl_dict[req_tag]: - line_end = ";\n" if ttl_dict[key_tag] or (ttl_dict[cde_tag] and ttl_dict[cde_tag] != "TBD") or ttl_dict[enum_tag] else " .\n" - f.write("\t"+f"{req_tag} {''.join([ttl_dict[req_tag], line_end])}") - if ttl_dict[key_tag]: - line_end = ";\n" if (ttl_dict[cde_tag] and ttl_dict[cde_tag] != "TBD") or ttl_dict[enum_tag] else " .\n" - f.write("\t"+f"{key_tag} {''.join([ttl_dict[key_tag], line_end])}") - if ttl_dict[cde_tag] and ttl_dict[cde_tag] != "TBD": - line_end = ";\n" if ttl_dict[enum_tag] else " .\n" - f.write("\t"+f"{cde_tag} {''.join([ttl_dict[cde_tag], line_end])}") - if ttl_dict[enum_tag]: - line_end = " .\n" - f.write("\t"+f"{enum_tag} {''.join([ttl_dict[enum_tag], line_end])}") + if row["maps_to"]: + props = {f"{mapping.split(':')[0].lower()}":f"{mapping.split(':')[1]}" for mapping in row["maps_to"].split(", ")} + ttl_dict.update(props) + + new_prefixes = [item for item in ttl_dict] + prefix_list = prefix_list + new_prefixes + f.write("\n") + f.write(f"{row['term']} {label} {ttl_dict[label]};"+"\n") + f.write("\t"+f"{desc} {ttl_dict[desc]};"+"\n") + f.write("\t"+f"{node} {ttl_dict[node]};"+"\n") + line_end = ";" if ttl_dict[reqby] or ttl_dict[key] or props or ttl_dict[enum] else " ." 
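            # Turtle punctuation: ";" keeps the current subject open for further
            # predicate-object pairs, while " ." terminates the statement when the
            # type entry is the last field written for this term.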
+ f.write("\t"+f"{type} {ttl_dict[type]}{line_end}"+"\n") + if ttl_dict[reqby]: + line_end = ";\n" if ttl_dict[key] or props or ttl_dict[enum] else " .\n" + f.write("\t"+f"{reqby} {''.join([ttl_dict[reqby], line_end])}") + if ttl_dict[key]: + line_end = ";\n" if props or ttl_dict[enum] else " .\n" + f.write("\t"+f"{key} {''.join([ttl_dict[key], line_end])}") + if ttl_dict[enum]: + line_end = ";\n" if props else " .\n" + f.write("\t"+f"{enum} {''.join([ttl_dict[enum], line_end])}") + if props: + end = len(props) + i = 0 + for key in props: + i += 1 + line_end = ";\n" if i < end else " .\n" + if props[key] and props[key] != "TBD": + f.write("\t"+f"{key} {''.join([props[key], line_end])}") + + with open(out_file, "r") as f: + current_lines = f.read() + + with open(out_file, "w+") as f: + prefix_set = set(prefix_list) + lines = "".join([f"@prefix {prefix}: {tag_dict[prefix]}"+" .\n" for prefix in prefix_set]) + current_lines + f.write(lines) print(f"Done ✅") print(f"{out_file} was written with {len(ttl_df)} triples!") From 5fea4f45a7417a75f99afa2cc13fadfc21d2e65d Mon Sep 17 00:00:00 2001 From: Orion Banks <49208907+Bankso@users.noreply.github.com> Date: Wed, 1 Oct 2025 17:42:03 -0700 Subject: [PATCH 54/67] Continue RDF prefix refactor --- utils/csv_to_ttl.py | 144 ++++++++++++++++++++++++-------------------- 1 file changed, 79 insertions(+), 65 deletions(-) diff --git a/utils/csv_to_ttl.py b/utils/csv_to_ttl.py index 0a43fa44..e6bb8253 100644 --- a/utils/csv_to_ttl.py +++ b/utils/csv_to_ttl.py @@ -133,13 +133,14 @@ def get_args(): return parser.parse_args() -def convert_schematic_model_to_ttl_format(input_df: pd.DataFrame, org_name: str, base_tag: str) -> pd.DataFrame: +def convert_schematic_model_to_ttl_format(input_df: pd.DataFrame, org_name: str, base_tag: str) -> tuple[pd.DataFrame, str, list[str]]: """Convert schematic model DataFrame to TTL format.""" out_df = pd.DataFrame() # Step 1: Identify all node rows (treat rows with non-empty DependsOn as nodes) - node_rows = input_df[input_df["DependsOn"].notna()] - attribute_rows = input_df[input_df["DependsOn"].isna()].set_index("Attribute") + node_rows = input_df[input_df["DependsOn"].str.contains("Component")] + node_list = node_rows["Attribute"].to_list() + attribute_rows = input_df[~input_df["DependsOn"].str.contains("Component")].set_index("Attribute") attribute_to_node = {row["Attribute"]: str(row["DependsOn"]).split(", ") for _, row in node_rows.iterrows()} @@ -149,11 +150,11 @@ def convert_schematic_model_to_ttl_format(input_df: pd.DataFrame, org_name: str, # Step 2: Assign node URI for each attribute out_df["Resolved_Node_URI"] = out_df["Resolved_Node"].apply( - lambda x: f"<{base_tag}/{org_name}/{x.strip().lower().replace(' ', '_')}>" + lambda x: f"{org_name}:{str(x).strip().lower().replace(' ', '_')}" ) # Step 3: Construct term URIs for each attribute - out_df["term"] = out_df.apply(lambda row: format_uri(base_tag, row["Resolved_Node"], row["label"], org_name), axis=1) + out_df["term"] = out_df.apply(lambda row: format_uri(row["Resolved_Node"], row["label"]), axis=1) # Step 4: Info extraction and TTL-compatible column formatting for _, row in out_df.iterrows(): @@ -162,23 +163,24 @@ def convert_schematic_model_to_ttl_format(input_df: pd.DataFrame, org_name: str, out_df.at[_, "node"] = row["Resolved_Node_URI"] out_df.at[_, "is_key"] = "true" if str(attribute_rows.loc[row["label"], "Validation Rules"]).strip().lower() == "unique" else "" out_df.at[_, "required_by"] = row["Resolved_Node_URI"] if str(attribute_rows.loc[row["label"], 
"Required"]).strip().lower() == "true" else "" - out_df.at[_, "has_enum"] = '"[' + ", ".join(attribute_rows.loc[row["label"], "Valid Values"].split(", ")) + ']"' if str(attribute_rows.loc[row["label"], "Valid Values"]) != "nan" else "" + out_df.at[_, "has_enum"] = '"[' + ", ".join(str(attribute_rows.loc[row["label"], "Valid Values"]).split(", ")) + ']"' if str(attribute_rows.loc[row["label"], "Valid Values"]) != "nan" else "" col_type = attribute_rows.loc[row["label"], "columnType"] - is_enum = True if str(attribute_rows.loc[row["label"], "Valid Values"]) != "nan" else False - out_df.at[_, "type"] = '"' + str(convert_schematic_column_type(col_type, is_enum)) + '"' + validation = attribute_rows.loc[row["label"], "Validation Rules"] + is_enum = True if str(attribute_rows.loc[row["label"], "Valid Values"]) != "" else False + out_df.at[_, "type"] = '"' + convert_schematic_column_type(col_type, validation, is_enum) + '"' out_df["label"] = '"' + out_df["label"].fillna('') + '"' out_df["description"] = '"' + out_df["description"].fillna('').apply(lambda x: x.replace('"', '')) + '"' out_df["maps_to"] = out_df["maps_to"].fillna("") - node_name = "all" if len(out_df["node"].unique()) > 1 else str(out_df["node"].unique()).split("/")[-1].split(">")[0] + node_name = "all" if len(out_df["node"].unique()) > 1 else str(out_df["node"].unique()).split(":")[-1][:-2] # Final output final_cols = ["term", "label", "description", "node", "type", "required_by", "maps_to", "is_key", "has_enum"] - return out_df[final_cols], node_name + return out_df[final_cols], node_name, node_list -def convert_crdc_model_to_ttl_format(input_df: pd.DataFrame, org_name: str, base_tag: str) -> pd.DataFrame: +def convert_crdc_model_to_ttl_format(input_df: pd.DataFrame, org_name: str, base_tag: str) -> tuple[pd.DataFrame, str, list[str]]: """Convert CRDC model DataFrame to TTL format.""" out_df = pd.DataFrame() @@ -188,12 +190,13 @@ def convert_crdc_model_to_ttl_format(input_df: pd.DataFrame, org_name: str, base out_df["description"] = input_df["Description"].fillna("") out_df["node"] = input_df["Node"].apply( lambda x: f"<{base_tag}/{org_name}/{x.strip().lower().replace(' ', '_')}>") - out_df["maps_to"] = input_df["CDECode"].fillna("").apply(lambda x: str(x).split(".")[0]) - out_df["is_key"] = input_df["Key Property"].apply(lambda x: str(x)).replace(["False", "True"], ["", "true"]) + out_df["maps_to"] = input_df["CDECode"].fillna("").apply(lambda x: ":".join(["CDE", str(x).split(".")[0]]) if x not in ["", "TBD"] else "") + out_df["is_key"] = input_df["Key Property"].apply(lambda x: str(x)).replace(["FALSE", "True"], ["", "true"]) out_df["required_by"] = input_df["Required"].apply(lambda x: str(x)) out_df["type"] = input_df["Type"].apply(lambda x: str(x)) out_df["has_enum"] = input_df["Acceptable Values"].fillna("").apply(lambda x: x.split(",")) out_df["cde_name"] = input_df["CDEFullName"].apply(lambda x: str(x)) + node_list = input_df["Node"].to_list() for _, row in out_df.iterrows(): col_type = row["type"] @@ -201,34 +204,34 @@ def convert_crdc_model_to_ttl_format(input_df: pd.DataFrame, org_name: str, base out_df.at[_, "type"] = '"' + str(convert_gc_column_type(col_type, is_enum)) + '"' out_df.at[_, "required_by"] = row["node"] if row["required_by"] == "required" else "" out_df.at[_, "has_enum"] = (''.join(['"[', ', '.join(row["has_enum"]).replace('"', '').replace('[', '').replace(']', ''), ']"'])) if is_enum else "" - out_df.at[_, "description"] = '"' + ''.join([f'{str(row["cde_name"])}: ' if str(row["cde_name"]) != "nan" else "", 
row["description"]]).replace('"', '') + '"' + out_df.at[_, "description"] = '"' + ''.join([f'{str(row["cde_name"])}: ' if str(row["cde_name"]) != "" else "", row["description"]]).replace('"', '') + '"' - node_name = "all" if len(out_df["node"].unique()) > 1 else str(out_df["node"].unique()).split("/")[-1].split(">")[0] + node_name = "all" if len(out_df["node"].unique()) > 1 else str(out_df["node"].unique()).split(":")[-1] final_cols = ["term", "label", "description", "node", "type", "required_by", "maps_to", "is_key", "has_enum"] - return out_df[final_cols], node_name + return out_df[final_cols], node_name, node_list -def format_uri(base_tag:str, node:str, attribute:str, org_name:str) -> str: +def format_uri(node:str, attribute:str) -> str: """Format the URI for a given node and attribute.""" - node_segment = node.strip().lower().replace(" ", "_") - attr_segment = attribute.strip().lower().replace(" ", "_") + node_segment = str(node).strip().lower().replace(" ", "_").replace('10x_', '') + attr_segment = attribute.strip().lower().replace(" ", "_").replace('10x_', '') - return f"<{base_tag}/{org_name}/{node_segment}/{attr_segment}>" + return f"{node_segment}:{attr_segment}" -def convert_schematic_column_type(type:str, is_enum:bool) -> str: +def convert_schematic_column_type(type:str, validation: str, is_enum:bool) -> str: """Convert schematic column type to TTL-compatible format.""" - if type in ["string", "string_list"]: + if type in ["string", "string_list"] or validation in ["str", "list like"]: string_type = "string;enum" if is_enum else "string" - if type == "string_list": + if type == "string_list" or validation == "list like": out_type = f"array[{string_type}]" else: out_type = string_type else: - out_type = type + out_type = type if type else validation return out_type @@ -287,28 +290,28 @@ def main(): base_tag = args.base_tag label = "label" - desc = "desc" + desc = "description" node = "node" type = "type" - reqby = "reqby" - key = "key" - enum = "enum" - duo = "duo" - cde = "cde" + reqby = "requiredBy" + key = "isKey" + enum = "acceptableValues" + duo = "DUO_" + cde = "CDE" + tag_dict = { - label : "", - desc : "", - node : f"<{base_tag}/node/>", - type : f"<{base_tag}/type/>", - reqby : f"<{base_tag}/requiredBy/>", - key : f"<{base_tag}/isKey/>", - enum : f"<{base_tag}/acceptableValues/>", - duo : f"", - cde : f"<{base_tag}/isCDE/>", + label : ("rdfs", ""), + desc : ("purl", ""), + node : ("syn", f"<{base_tag}/>"), + type : ("syn", f"<{base_tag}/>"), + reqby : ("syn", f"<{base_tag}/>"), + key : ("syn", f"<{base_tag}/>"), + enum : ("syn", f"<{base_tag}/>"), + duo : ("obo", ""), + cde : ("syn", f"<{base_tag}/>"), } - if args.mapping: print(f"Processing RDF triples precursor CSV [{args.mapping}]...") ttl_df = pd.read_csv(args.mapping, header=0, keep_default_na=False) @@ -318,7 +321,7 @@ def main(): elif args.model: print(f"Processing model [{args.model}] to RDF triples precursor dataframe...") sep = "," if Path(args.model).suffix == ".csv" else "\t" - model_df = pd.read_csv(args.model, header=0, keep_default_na=True, sep=sep) + model_df = pd.read_csv(args.model, header=0, keep_default_na=False, na_values="nan", sep=sep, dtype=str) ref = args.reference_type if ref is None: if str(args.org_name).lower() in ["new_org", "mc2", "nf", "adkp", "htan", "ada"]: @@ -329,10 +332,12 @@ def main(): print(f"Processing model based on schematic CSV specification...") if args.subset is not None: model_df = subset_model(model_df, f"{args.subset}") - ttl_df, node_name = 
convert_schematic_model_to_ttl_format(model_df, args.org_name, base_tag) + ttl_df, node_name, node_list = convert_schematic_model_to_ttl_format(model_df, args.org_name, base_tag) if ref == "crdc": print(f"Processing model based on CRDC TSV specification...") - ttl_df, node_name = convert_crdc_model_to_ttl_format(model_df, args.org_name, base_tag) + if args.subset is not None: + model_df = model_df[model_df["Node"].isin(args.subset.split(", "))] + ttl_df, node_name, node_list = convert_crdc_model_to_ttl_format(model_df, args.org_name, base_tag) print(f"RDF triples will be built from the generated precursor dataframe!") out_file = "/".join([args.output, f"{args.org_name}_{node_name}_{args.version}.ttl"]) @@ -354,43 +359,51 @@ def main(): } if row["maps_to"]: - props = {f"{mapping.split(':')[0].lower()}":f"{mapping.split(':')[1]}" for mapping in row["maps_to"].split(", ")} + props = {f"{mapping.split(':')[0].upper()}":f"{mapping.split(':')[1]}" for mapping in row["maps_to"].split(", ")} ttl_dict.update(props) new_prefixes = [item for item in ttl_dict] prefix_list = prefix_list + new_prefixes f.write("\n") - f.write(f"{row['term']} {label} {ttl_dict[label]};"+"\n") - f.write("\t"+f"{desc} {ttl_dict[desc]};"+"\n") - f.write("\t"+f"{node} {ttl_dict[node]};"+"\n") - line_end = ";" if ttl_dict[reqby] or ttl_dict[key] or props or ttl_dict[enum] else " ." - f.write("\t"+f"{type} {ttl_dict[type]}{line_end}"+"\n") + f.write(f"{row['term']} {tag_dict[label][0]}:{label} {ttl_dict[label]};"+"\n") + f.write("\t"+f"{tag_dict[desc][0]}:{desc} {ttl_dict[desc]};"+"\n") + f.write("\t"+f"{tag_dict[node][0]}:{node} {ttl_dict[node]};"+"\n") + if ttl_dict[type] != "": + line_end = ";" if ttl_dict[reqby] or ttl_dict[key] or props or ttl_dict[enum] not in ['"[]"', ""] else " ." 
+ f.write("\t"+f"{tag_dict[type][0]}:{type} {ttl_dict[type]}{line_end}"+"\n") if ttl_dict[reqby]: - line_end = ";\n" if ttl_dict[key] or props or ttl_dict[enum] else " .\n" - f.write("\t"+f"{reqby} {''.join([ttl_dict[reqby], line_end])}") + line_end = ";\n" if ttl_dict[key] or props or ttl_dict[enum] not in ['"[]"', ""] else " .\n" + f.write("\t"+f"{tag_dict[reqby][0]}:{reqby} {''.join([ttl_dict[reqby], line_end])}") if ttl_dict[key]: - line_end = ";\n" if props or ttl_dict[enum] else " .\n" - f.write("\t"+f"{key} {''.join([ttl_dict[key], line_end])}") - if ttl_dict[enum]: + line_end = ";\n" if props or ttl_dict[enum] not in ['"[]"', ""] else " .\n" + f.write("\t"+f"{tag_dict[key][0]}:{key} {''.join([ttl_dict[key], line_end])}") + if ttl_dict[enum] not in ['"[]"', ""]: line_end = ";\n" if props else " .\n" - f.write("\t"+f"{enum} {''.join([ttl_dict[enum], line_end])}") + f.write("\t"+f"{tag_dict[enum][0]}:{enum} {''.join([ttl_dict[enum], line_end])}") if props: end = len(props) i = 0 - for key in props: + for prop in props: i += 1 line_end = ";\n" if i < end else " .\n" - if props[key] and props[key] != "TBD": - f.write("\t"+f"{key} {''.join([props[key], line_end])}") + if props[prop] and props[prop] != "TBD": + f.write("\t"+f"{tag_dict[prop][0]}:{prop} {''.join([props[prop], line_end])}") with open(out_file, "r") as f: current_lines = f.read() with open(out_file, "w+") as f: prefix_set = set(prefix_list) - lines = "".join([f"@prefix {prefix}: {tag_dict[prefix]}"+" .\n" for prefix in prefix_set]) + current_lines - f.write(lines) + node_set = set(node_list) + first_lines = [f"@prefix {tag_dict[prefix][0]}: {tag_dict[prefix][1]}"+" .\n" for prefix in prefix_set] + org_line = f"@prefix {args.org_name}: syn:{args.org_name} .\n" + node_lines = "".join([f"@prefix {node_type.lower().replace(' ', '_').replace('10x_', '')}: {args.org_name}:{node_type.lower().replace(' ', '_').replace('10x_', '')} .\n" for node_type in node_set]) + first_lines_set = "".join(set(first_lines)) + f.write(first_lines_set) + f.write(org_line) + f.write(node_lines) + f.write(current_lines) print(f"Done ✅") print(f"{out_file} was written with {len(ttl_df)} triples!") @@ -403,7 +416,7 @@ def main(): retry = 0 image = None while image is None: - if retry == 1: + if retry > 0: value_tag = rdflib.URIRef("http://syn.org/acceptableValues") model_graph = model_graph.remove((None, value_tag, None)) dot_stream = io.StringIO() @@ -419,10 +432,11 @@ def main(): except: print("Failed to generate a visualization of the graph. Retrying with fewer triples...") retry += 1 - - if retry == 2: - print("Failed to generate a visualization of the graph. Skipping.") - break + if retry == 2: + print("Failed to generate a visualization of the graph. 
Skipping.") + with open("graph_string_error.txt", "w+") as f: + f.write(dg[0].to_string()) + break if args.interactive_graph is not None: print("Generating interactive plot...") From 059ec7322a986339578d75d4fc67d8f5b8d1f7bb Mon Sep 17 00:00:00 2001 From: Orion Banks <49208907+Bankso@users.noreply.github.com> Date: Wed, 1 Oct 2025 17:48:29 -0700 Subject: [PATCH 55/67] Adjust node formatting Fixes syntax errors in dot strings, allows rendering of graph --- utils/csv_to_ttl.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/utils/csv_to_ttl.py b/utils/csv_to_ttl.py index e6bb8253..8b700756 100644 --- a/utils/csv_to_ttl.py +++ b/utils/csv_to_ttl.py @@ -397,8 +397,8 @@ def main(): prefix_set = set(prefix_list) node_set = set(node_list) first_lines = [f"@prefix {tag_dict[prefix][0]}: {tag_dict[prefix][1]}"+" .\n" for prefix in prefix_set] - org_line = f"@prefix {args.org_name}: syn:{args.org_name} .\n" - node_lines = "".join([f"@prefix {node_type.lower().replace(' ', '_').replace('10x_', '')}: {args.org_name}:{node_type.lower().replace(' ', '_').replace('10x_', '')} .\n" for node_type in node_set]) + org_line = f"@prefix {args.org_name}: .\n" + node_lines = "".join([f"@prefix {node_type.lower().replace(' ', '_').replace('10x_', '')}: <{args.org_name}:{node_type.lower().replace(' ', '_').replace('10x_', '')}/> .\n" for node_type in node_set]) first_lines_set = "".join(set(first_lines)) f.write(first_lines_set) f.write(org_line) From 8ed4f096812b63c8a50a8a129ea1e57a03be9ce8 Mon Sep 17 00:00:00 2001 From: Orion Banks <49208907+Bankso@users.noreply.github.com> Date: Thu, 2 Oct 2025 11:44:54 -0700 Subject: [PATCH 56/67] Replace CRDC tags with CURIEs, adjust attribute filtering --- utils/csv_to_ttl.py | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/utils/csv_to_ttl.py b/utils/csv_to_ttl.py index 8b700756..3c50f18c 100644 --- a/utils/csv_to_ttl.py +++ b/utils/csv_to_ttl.py @@ -139,12 +139,17 @@ def convert_schematic_model_to_ttl_format(input_df: pd.DataFrame, org_name: str, # Step 1: Identify all node rows (treat rows with non-empty DependsOn as nodes) node_rows = input_df[input_df["DependsOn"].str.contains("Component")] + attributes_with_deps = input_df[~input_df["DependsOn"].str.contains("^[Component]|\s")] node_list = node_rows["Attribute"].to_list() attribute_rows = input_df[~input_df["DependsOn"].str.contains("Component")].set_index("Attribute") attribute_to_node = {row["Attribute"]: str(row["DependsOn"]).split(", ") for _, row in node_rows.iterrows()} + attribute_to_attribute = {row["Attribute"]: str(row["DependsOn"]).split(", ") for _, row in attributes_with_deps.iterrows()} attribute_info = [(attribute, node) for node, attribute_list in attribute_to_node.items() for attribute in attribute_list] + if attribute_to_attribute: + conditional_attributes = [(attribute, nd) for node, attribute in attribute_to_attribute.items() for att, nd in attribute_info if node == att] + attribute_info = attribute_info + conditional_attributes out_df["label"] = [entry[0] for entry in attribute_info] out_df["Resolved_Node"] = [entry[1] for entry in attribute_info] @@ -185,11 +190,11 @@ def convert_crdc_model_to_ttl_format(input_df: pd.DataFrame, org_name: str, base out_df = pd.DataFrame() out_df["term"] = input_df.apply( - lambda row: format_uri(base_tag, row["Node"], row["Property"], org_name), axis=1) + lambda row: format_uri(row["Node"], row["Property"]), axis=1) out_df["label"] = '"' + input_df["Property"].fillna('') + '"' out_df["description"] = 
input_df["Description"].fillna("") out_df["node"] = input_df["Node"].apply( - lambda x: f"<{base_tag}/{org_name}/{x.strip().lower().replace(' ', '_')}>") + lambda x: f"{org_name}:{x.strip().lower().replace(' ', '_')}") out_df["maps_to"] = input_df["CDECode"].fillna("").apply(lambda x: ":".join(["CDE", str(x).split(".")[0]]) if x not in ["", "TBD"] else "") out_df["is_key"] = input_df["Key Property"].apply(lambda x: str(x)).replace(["FALSE", "True"], ["", "true"]) out_df["required_by"] = input_df["Required"].apply(lambda x: str(x)) @@ -206,7 +211,7 @@ def convert_crdc_model_to_ttl_format(input_df: pd.DataFrame, org_name: str, base out_df.at[_, "has_enum"] = (''.join(['"[', ', '.join(row["has_enum"]).replace('"', '').replace('[', '').replace(']', ''), ']"'])) if is_enum else "" out_df.at[_, "description"] = '"' + ''.join([f'{str(row["cde_name"])}: ' if str(row["cde_name"]) != "" else "", row["description"]]).replace('"', '') + '"' - node_name = "all" if len(out_df["node"].unique()) > 1 else str(out_df["node"].unique()).split(":")[-1] + node_name = "all" if len(out_df["node"].unique()) > 1 else str(out_df["node"].unique()).split(":")[-1][:-2] final_cols = ["term", "label", "description", "node", "type", "required_by", "maps_to", "is_key", "has_enum"] return out_df[final_cols], node_name, node_list @@ -276,9 +281,14 @@ def subset_model(model_df: pd.DataFrame, nodes: str) -> pd.DataFrame: node_attributes = str(model_df.loc[node, "DependsOn"]).split(", ") node_attributes.append(node) node_rows = model_df.loc[node_attributes] + for _, row in node_rows.iterrows(): + if row["Valid Values"]: + for value in row["Valid Values"]: + if value in model_df.index and model_df[value, "DependsOn"]: + node_rows = pd.concat([node_rows, model_df.loc[value]]) node_subset_df = pd.concat([node_subset_df, node_rows]) - node_subset_df = node_subset_df.drop_duplicates().reset_index() + node_subset_df = node_subset_df.drop_duplicates().reset_index().fillna("nan") return node_subset_df @@ -331,13 +341,13 @@ def main(): if ref == "schematic": print(f"Processing model based on schematic CSV specification...") if args.subset is not None: - model_df = subset_model(model_df, f"{args.subset}") - ttl_df, node_name, node_list = convert_schematic_model_to_ttl_format(model_df, args.org_name, base_tag) + model = subset_model(model_df, f"{args.subset}") + ttl_df, node_name, node_list = convert_schematic_model_to_ttl_format(model, args.org_name, base_tag) if ref == "crdc": print(f"Processing model based on CRDC TSV specification...") if args.subset is not None: - model_df = model_df[model_df["Node"].isin(args.subset.split(", "))] - ttl_df, node_name, node_list = convert_crdc_model_to_ttl_format(model_df, args.org_name, base_tag) + model = model_df[model_df["Node"].isin(args.subset.split(", "))] + ttl_df, node_name, node_list = convert_crdc_model_to_ttl_format(model, args.org_name, base_tag) print(f"RDF triples will be built from the generated precursor dataframe!") out_file = "/".join([args.output, f"{args.org_name}_{node_name}_{args.version}.ttl"]) From 791bb69d0f1a77eab1cb15e2113dec767793f5a5 Mon Sep 17 00:00:00 2001 From: Orion Banks <49208907+Bankso@users.noreply.github.com> Date: Fri, 3 Oct 2025 11:58:05 -0700 Subject: [PATCH 57/67] Update subset function to handle conditional attributes --- utils/csv_to_ttl.py | 52 ++++++++++++++++++++++++--------------------- 1 file changed, 28 insertions(+), 24 deletions(-) diff --git a/utils/csv_to_ttl.py b/utils/csv_to_ttl.py index 3c50f18c..e88ab3e2 100644 --- a/utils/csv_to_ttl.py +++ 
b/utils/csv_to_ttl.py @@ -133,23 +133,23 @@ def get_args(): return parser.parse_args() -def convert_schematic_model_to_ttl_format(input_df: pd.DataFrame, org_name: str, base_tag: str) -> tuple[pd.DataFrame, str, list[str]]: +def convert_schematic_model_to_ttl_format(input_df: pd.DataFrame, org_name: str, subset: None|str) -> tuple[pd.DataFrame, str, list[str]]: """Convert schematic model DataFrame to TTL format.""" out_df = pd.DataFrame() # Step 1: Identify all node rows (treat rows with non-empty DependsOn as nodes) + if subset is None: + node_rows = input_df[input_df["DependsOn"].str.contains("Component")] + node_list = node_rows["Attribute"].to_list() + input_df = subset_model(input_df, node_list) + node_rows = input_df[input_df["DependsOn"].str.contains("Component")] - attributes_with_deps = input_df[~input_df["DependsOn"].str.contains("^[Component]|\s")] node_list = node_rows["Attribute"].to_list() + attribute_rows = input_df[~input_df["DependsOn"].str.contains("Component")].set_index("Attribute") attribute_to_node = {row["Attribute"]: str(row["DependsOn"]).split(", ") for _, row in node_rows.iterrows()} - attribute_to_attribute = {row["Attribute"]: str(row["DependsOn"]).split(", ") for _, row in attributes_with_deps.iterrows()} + attribute_info = set([(attribute, node) for node, attribute_list in attribute_to_node.items() for attribute in attribute_list]) - - attribute_info = [(attribute, node) for node, attribute_list in attribute_to_node.items() for attribute in attribute_list] - if attribute_to_attribute: - conditional_attributes = [(attribute, nd) for node, attribute in attribute_to_attribute.items() for att, nd in attribute_info if node == att] - attribute_info = attribute_info + conditional_attributes out_df["label"] = [entry[0] for entry in attribute_info] out_df["Resolved_Node"] = [entry[1] for entry in attribute_info] @@ -271,24 +271,28 @@ def convert_gc_column_type(type:str, is_enum:bool) -> str: def subset_model(model_df: pd.DataFrame, nodes: str) -> pd.DataFrame: - node_subset_df = pd.DataFrame() - - nodes = nodes.split(", ") + nodes = nodes.split(", ") if type(nodes)==str else nodes model_df = model_df.set_index("Attribute") + out_df_list = [] + for node in nodes: - node_attributes = str(model_df.loc[node, "DependsOn"]).split(", ") + subset_df = pd.DataFrame() + node_attributes = model_df.loc[node, "DependsOn"].split(", ") + attribute_deps = model_df.loc[model_df["DependsOn"] != ""] + attribute_deps = attribute_deps.loc[~attribute_deps["DependsOn"].str.contains("Component", regex=False)] + attribute_attributes = attribute_deps["DependsOn"].tolist() node_attributes.append(node) + node_rows = model_df.loc[node_attributes] - for _, row in node_rows.iterrows(): - if row["Valid Values"]: - for value in row["Valid Values"]: - if value in model_df.index and model_df[value, "DependsOn"]: - node_rows = pd.concat([node_rows, model_df.loc[value]]) - node_subset_df = pd.concat([node_subset_df, node_rows]) - - node_subset_df = node_subset_df.drop_duplicates().reset_index().fillna("nan") + attribute_dep_rows = model_df.loc[attribute_attributes] + + subset_df = pd.concat([subset_df, node_rows, attribute_dep_rows]) + subset_df.at[node, "DependsOn"] = ", ".join(subset_df.index.drop(node).tolist()) + out_df_list.append(subset_df) + + node_subset_df = pd.concat(out_df_list).drop_duplicates().reset_index(names="Attribute").fillna("nan") return node_subset_df @@ -341,13 +345,13 @@ def main(): if ref == "schematic": print(f"Processing model based on schematic CSV specification...") if 
args.subset is not None: - model = subset_model(model_df, f"{args.subset}") - ttl_df, node_name, node_list = convert_schematic_model_to_ttl_format(model, args.org_name, base_tag) + model_df = subset_model(model_df, f"{args.subset}") + ttl_df, node_name, node_list = convert_schematic_model_to_ttl_format(model_df, args.org_name, args.subset) if ref == "crdc": print(f"Processing model based on CRDC TSV specification...") if args.subset is not None: - model = model_df[model_df["Node"].isin(args.subset.split(", "))] - ttl_df, node_name, node_list = convert_crdc_model_to_ttl_format(model, args.org_name, base_tag) + model_df = model_df[model_df["Node"].isin(args.subset.split(", "))] + ttl_df, node_name, node_list = convert_crdc_model_to_ttl_format(model_df, args.org_name, base_tag) print(f"RDF triples will be built from the generated precursor dataframe!") out_file = "/".join([args.output, f"{args.org_name}_{node_name}_{args.version}.ttl"]) From 424271d7d860e25096b27ddf67a4e43b1967542e Mon Sep 17 00:00:00 2001 From: Orion Banks <49208907+Bankso@users.noreply.github.com> Date: Fri, 3 Oct 2025 13:07:17 -0700 Subject: [PATCH 58/67] Adjust subset output functions and docstrings --- utils/csv_to_ttl.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/utils/csv_to_ttl.py b/utils/csv_to_ttl.py index e88ab3e2..a22f4570 100644 --- a/utils/csv_to_ttl.py +++ b/utils/csv_to_ttl.py @@ -4,6 +4,8 @@ Converts a schematic data model CSV or CRDC data model TSV to RDF triples Serialized triples to a ttl file ttl file can be used as a graph input for the arachne agent. +For schematic-based models, conditional dependencies are extracted and added to the data model graph. +Optionally generates a data model diagram and an interactive model viewer (WIP) usage: csv_to_ttl.py [-h] [-m MODEL] [-p MAPPING] [-o OUTPUT] [-g ORG_NAME] [-r {schematic,crdc}] [-b BASE_TAG] [-v VERSION] @@ -28,6 +30,8 @@ Boolean. Pass this flag to generate a PNG of the input model (Default: None) -ig, --interactive_graph Boolean. Pass this flag to generate an interactive visualization of the input model (Default: None) + -s SUBSET, --subset SUBSET + The name of one or more data types to extract from the model. Provide multiple as a quoted comma-separated list, e.g., 'Study, Biospecimen' (Default: None) author: orion.banks """ @@ -126,7 +130,7 @@ def get_args(): "-s", "--subset", type=str, - help="(For schematic models only) The name of one or more data model components to extract from model (Default: None)", + help="The name of one or more data types to extract from the model. 
Provide multiple as a quoted comma-separated list, e.g., 'Study, Biospecimen' (Default: None)", required=False, default=None ) @@ -292,7 +296,7 @@ def subset_model(model_df: pd.DataFrame, nodes: str) -> pd.DataFrame: subset_df.at[node, "DependsOn"] = ", ".join(subset_df.index.drop(node).tolist()) out_df_list.append(subset_df) - node_subset_df = pd.concat(out_df_list).drop_duplicates().reset_index(names="Attribute").fillna("nan") + node_subset_df = pd.concat(out_df_list).reset_index(names="Attribute").drop_duplicates().fillna("nan") return node_subset_df From 2a76a407dcea6b816b430620ea5ca9f5d17a064f Mon Sep 17 00:00:00 2001 From: Orion Banks <49208907+Bankso@users.noreply.github.com> Date: Fri, 3 Oct 2025 13:10:15 -0700 Subject: [PATCH 59/67] Adjust output naming node_name is used to label output ttl --- utils/csv_to_ttl.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/csv_to_ttl.py b/utils/csv_to_ttl.py index a22f4570..903d9067 100644 --- a/utils/csv_to_ttl.py +++ b/utils/csv_to_ttl.py @@ -182,7 +182,7 @@ def convert_schematic_model_to_ttl_format(input_df: pd.DataFrame, org_name: str, out_df["description"] = '"' + out_df["description"].fillna('').apply(lambda x: x.replace('"', '')) + '"' out_df["maps_to"] = out_df["maps_to"].fillna("") - node_name = "all" if len(out_df["node"].unique()) > 1 else str(out_df["node"].unique()).split(":")[-1][:-2] + node_name = "_".join(subset.split(", ")) if subset is not None else "all" # Final output final_cols = ["term", "label", "description", "node", "type", "required_by", "maps_to", "is_key", "has_enum"] From b2aab9f936969eb99494eb087e15697c64f2fa4b Mon Sep 17 00:00:00 2001 From: Orion Banks <49208907+Bankso@users.noreply.github.com> Date: Fri, 3 Oct 2025 13:31:30 -0700 Subject: [PATCH 60/67] Adjust input argument references --- utils/csv_to_ttl.py | 66 ++++++++++++++++++++++++++------------------- 1 file changed, 38 insertions(+), 28 deletions(-) diff --git a/utils/csv_to_ttl.py b/utils/csv_to_ttl.py index 903d9067..ce4f896f 100644 --- a/utils/csv_to_ttl.py +++ b/utils/csv_to_ttl.py @@ -2,12 +2,11 @@ csv_to_ttl.py Converts a schematic data model CSV or CRDC data model TSV to RDF triples -Serialized triples to a ttl file -ttl file can be used as a graph input for the arachne agent. +Serializes triples to a ttl file - ttl file can be used as a graph input for the arachne agent. For schematic-based models, conditional dependencies are extracted and added to the data model graph. Optionally generates a data model diagram and an interactive model viewer (WIP) -usage: csv_to_ttl.py [-h] [-m MODEL] [-p MAPPING] [-o OUTPUT] [-g ORG_NAME] [-r {schematic,crdc}] [-b BASE_TAG] [-v VERSION] +usage: csv_to_ttl.py [-h] [-m MODEL] [-p MAPPING] [-o OUTPUT] [-g ORG_NAME] [-r {schematic,crdc}] [-b BASE_TAG] [-f BASE_REF] [-v VERSION] [-s SUBSET] [-bg] [-ig] options: -h, --help show this help message and exit @@ -20,18 +19,20 @@ -g ORG_NAME, --org_name ORG_NAME Abbreviation used in the data model name and RDF tags. (Default: 'new_org') -r {schematic,crdc}, --reference_type {schematic,crdc} - The type of data model reference used as a basis for the input. One of 'schematic' or 'crdc'. If no input is given, the reference type will be automatically determined based on - the provided org name (Default: None) + The type of data model reference used as a basis for the input. One of 'schematic' or 'crdc'. 
If no input is given, the reference type will be + automatically determined based on the provided org name (Default: None) -b BASE_TAG, --base_tag BASE_TAG url applied to the beginning of internal tags (Default: 'http://syn.org') + -f BASE_REF, --base_ref BASE_REF + Reference tag used to represent base_tag in ttl header (Default: 'syn') -v VERSION, --version VERSION Version applied to output ttl filename (Default: None) - -bg, --build_graph - Boolean. Pass this flag to generate a PNG of the input model (Default: None) + -s SUBSET, --subset SUBSET + The name of one or more data types to extract from the model. Provide multiple as a quoted comma-separated list, e.g., 'Study, Biospecimen' (Default: + None) + -bg, --build_graph Boolean. Pass this flag to generate a PNG of the input model (Default: None) -ig, --interactive_graph Boolean. Pass this flag to generate an interactive visualization of the input model (Default: None) - -s SUBSET, --subset SUBSET - The name of one or more data types to extract from the model. Provide multiple as a quoted comma-separated list, e.g., 'Study, Biospecimen' (Default: None) author: orion.banks """ @@ -101,6 +102,14 @@ def get_args(): help="url applied to the beginning of internal tags (Default: 'http://syn.org')", required=False, default="http://syn.org" + ) + parser.add_argument( + "-f", + "--base_ref", + type=str, + help="Reference tag used to represent base_tag in ttl header (Default: 'syn')", + required=False, + default="syn" ) parser.add_argument( "-v", @@ -109,6 +118,14 @@ def get_args(): help="Version applied to output ttl filename (Default: None)", required=False, default=None + ) + parser.add_argument( + "-s", + "--subset", + type=str, + help="The name of one or more data types to extract from the model. Provide multiple as a quoted comma-separated list, e.g., 'Study, Biospecimen' (Default: None)", + required=False, + default=None ) parser.add_argument( "-bg", @@ -125,14 +142,6 @@ def get_args(): action="store_true", required=False, default=None - ) - parser.add_argument( - "-s", - "--subset", - type=str, - help="The name of one or more data types to extract from the model. 
Provide multiple as a quoted comma-separated list, e.g., 'Study, Biospecimen' (Default: None)", - required=False, - default=None ) return parser.parse_args() @@ -189,7 +198,7 @@ def convert_schematic_model_to_ttl_format(input_df: pd.DataFrame, org_name: str, return out_df[final_cols], node_name, node_list -def convert_crdc_model_to_ttl_format(input_df: pd.DataFrame, org_name: str, base_tag: str) -> tuple[pd.DataFrame, str, list[str]]: +def convert_crdc_model_to_ttl_format(input_df: pd.DataFrame, org_name: str, subset: str) -> tuple[pd.DataFrame, str, list[str]]: """Convert CRDC model DataFrame to TTL format.""" out_df = pd.DataFrame() @@ -215,7 +224,7 @@ def convert_crdc_model_to_ttl_format(input_df: pd.DataFrame, org_name: str, base out_df.at[_, "has_enum"] = (''.join(['"[', ', '.join(row["has_enum"]).replace('"', '').replace('[', '').replace(']', ''), ']"'])) if is_enum else "" out_df.at[_, "description"] = '"' + ''.join([f'{str(row["cde_name"])}: ' if str(row["cde_name"]) != "" else "", row["description"]]).replace('"', '') + '"' - node_name = "all" if len(out_df["node"].unique()) > 1 else str(out_df["node"].unique()).split(":")[-1][:-2] + node_name = "_".join(subset.split(", ")) if subset is not None else "all" final_cols = ["term", "label", "description", "node", "type", "required_by", "maps_to", "is_key", "has_enum"] return out_df[final_cols], node_name, node_list @@ -306,6 +315,7 @@ def main(): args = get_args() base_tag = args.base_tag + base_ref = args.base_ref label = "label" desc = "description" @@ -318,16 +328,16 @@ def main(): cde = "CDE" - tag_dict = { + tag_dict = { # Can replace tuples with alternative tag definitions label : ("rdfs", ""), desc : ("purl", ""), - node : ("syn", f"<{base_tag}/>"), - type : ("syn", f"<{base_tag}/>"), - reqby : ("syn", f"<{base_tag}/>"), - key : ("syn", f"<{base_tag}/>"), - enum : ("syn", f"<{base_tag}/>"), + node : (base_ref, f"<{base_tag}/>"), + type : (base_ref, f"<{base_tag}/>"), + reqby : (base_ref, f"<{base_tag}/>"), + key : (base_ref, f"<{base_tag}/>"), + enum : (base_ref, f"<{base_tag}/>"), duo : ("obo", ""), - cde : ("syn", f"<{base_tag}/>"), + cde : (base_ref, f"<{base_tag}/>"), } if args.mapping: @@ -355,7 +365,7 @@ def main(): print(f"Processing model based on CRDC TSV specification...") if args.subset is not None: model_df = model_df[model_df["Node"].isin(args.subset.split(", "))] - ttl_df, node_name, node_list = convert_crdc_model_to_ttl_format(model_df, args.org_name, base_tag) + ttl_df, node_name, node_list = convert_crdc_model_to_ttl_format(model_df, args.org_name, args.subset) print(f"RDF triples will be built from the generated precursor dataframe!") out_file = "/".join([args.output, f"{args.org_name}_{node_name}_{args.version}.ttl"]) @@ -435,7 +445,7 @@ def main(): image = None while image is None: if retry > 0: - value_tag = rdflib.URIRef("http://syn.org/acceptableValues") + value_tag = rdflib.URIRef(f"{base_tag}/acceptableValues") model_graph = model_graph.remove((None, value_tag, None)) dot_stream = io.StringIO() rdf2dot.rdf2dot(model_graph, dot_stream) From c4bf3b955e45f6a20bdec88f384bc6d741ca4367 Mon Sep 17 00:00:00 2001 From: Orion Banks <49208907+Bankso@users.noreply.github.com> Date: Fri, 3 Oct 2025 13:40:13 -0700 Subject: [PATCH 61/67] Move node_name function to main, adjust ins and outs This is now a common function, since it relies on the subset argument, not the processed input. 
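Since node_name is now derived directly from the --subset argument, the output filename can be predicted from the CLI inputs alone; the diff below moves that computation into main(). A minimal sketch of the naming rule, with hypothetical org name, subset, and version values:

    subset = "Study, Biospecimen"          # value passed to --subset; None when no subset is requested
    org_name, version = "my_org", "1.0.0"  # hypothetical --org_name and --version values
    node_name = "_".join(subset.split(", ")) if subset is not None else "all"
    out_file = f"{org_name}_{node_name}_{version}.ttl"
    # -> "my_org_Study_Biospecimen_1.0.0.ttl"; with no subset requested, "my_org_all_1.0.0.ttl"
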
--- utils/csv_to_ttl.py | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/utils/csv_to_ttl.py b/utils/csv_to_ttl.py index ce4f896f..45a44056 100644 --- a/utils/csv_to_ttl.py +++ b/utils/csv_to_ttl.py @@ -146,7 +146,7 @@ def get_args(): return parser.parse_args() -def convert_schematic_model_to_ttl_format(input_df: pd.DataFrame, org_name: str, subset: None|str) -> tuple[pd.DataFrame, str, list[str]]: +def convert_schematic_model_to_ttl_format(input_df: pd.DataFrame, org_name: str, subset: None|str) -> tuple[pd.DataFrame, list[str]]: """Convert schematic model DataFrame to TTL format.""" out_df = pd.DataFrame() @@ -190,15 +190,13 @@ def convert_schematic_model_to_ttl_format(input_df: pd.DataFrame, org_name: str, out_df["label"] = '"' + out_df["label"].fillna('') + '"' out_df["description"] = '"' + out_df["description"].fillna('').apply(lambda x: x.replace('"', '')) + '"' out_df["maps_to"] = out_df["maps_to"].fillna("") - - node_name = "_".join(subset.split(", ")) if subset is not None else "all" # Final output final_cols = ["term", "label", "description", "node", "type", "required_by", "maps_to", "is_key", "has_enum"] - return out_df[final_cols], node_name, node_list + return out_df[final_cols], node_list -def convert_crdc_model_to_ttl_format(input_df: pd.DataFrame, org_name: str, subset: str) -> tuple[pd.DataFrame, str, list[str]]: +def convert_crdc_model_to_ttl_format(input_df: pd.DataFrame, org_name: str) -> tuple[pd.DataFrame, list[str]]: """Convert CRDC model DataFrame to TTL format.""" out_df = pd.DataFrame() @@ -224,10 +222,8 @@ def convert_crdc_model_to_ttl_format(input_df: pd.DataFrame, org_name: str, subs out_df.at[_, "has_enum"] = (''.join(['"[', ', '.join(row["has_enum"]).replace('"', '').replace('[', '').replace(']', ''), ']"'])) if is_enum else "" out_df.at[_, "description"] = '"' + ''.join([f'{str(row["cde_name"])}: ' if str(row["cde_name"]) != "" else "", row["description"]]).replace('"', '') + '"' - node_name = "_".join(subset.split(", ")) if subset is not None else "all" - final_cols = ["term", "label", "description", "node", "type", "required_by", "maps_to", "is_key", "has_enum"] - return out_df[final_cols], node_name, node_list + return out_df[final_cols], node_list def format_uri(node:str, attribute:str) -> str: @@ -282,6 +278,7 @@ def convert_gc_column_type(type:str, is_enum:bool) -> str: return out_type + def subset_model(model_df: pd.DataFrame, nodes: str) -> pd.DataFrame: nodes = nodes.split(", ") if type(nodes)==str else nodes @@ -327,7 +324,6 @@ def main(): duo = "DUO_" cde = "CDE" - tag_dict = { # Can replace tuples with alternative tag definitions label : ("rdfs", ""), desc : ("purl", ""), @@ -360,14 +356,15 @@ def main(): print(f"Processing model based on schematic CSV specification...") if args.subset is not None: model_df = subset_model(model_df, f"{args.subset}") - ttl_df, node_name, node_list = convert_schematic_model_to_ttl_format(model_df, args.org_name, args.subset) + ttl_df, node_list = convert_schematic_model_to_ttl_format(model_df, args.org_name, args.subset) if ref == "crdc": print(f"Processing model based on CRDC TSV specification...") if args.subset is not None: model_df = model_df[model_df["Node"].isin(args.subset.split(", "))] - ttl_df, node_name, node_list = convert_crdc_model_to_ttl_format(model_df, args.org_name, args.subset) + ttl_df, node_list = convert_crdc_model_to_ttl_format(model_df, args.org_name) print(f"RDF triples will be built from the generated precursor dataframe!") - + + node_name = 
"_".join(args.subset.split(", ")) if args.subset is not None else "all" out_file = "/".join([args.output, f"{args.org_name}_{node_name}_{args.version}.ttl"]) prefix_list = [] From fe13a7900e3f8bb982c8c7aeadb46a1db42273b3 Mon Sep 17 00:00:00 2001 From: Orion Banks <49208907+Bankso@users.noreply.github.com> Date: Fri, 3 Oct 2025 14:05:12 -0700 Subject: [PATCH 62/67] Pass base_ref to ttl header --- utils/csv_to_ttl.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/csv_to_ttl.py b/utils/csv_to_ttl.py index 45a44056..42858825 100644 --- a/utils/csv_to_ttl.py +++ b/utils/csv_to_ttl.py @@ -422,7 +422,7 @@ def main(): prefix_set = set(prefix_list) node_set = set(node_list) first_lines = [f"@prefix {tag_dict[prefix][0]}: {tag_dict[prefix][1]}"+" .\n" for prefix in prefix_set] - org_line = f"@prefix {args.org_name}: .\n" + org_line = f"@prefix {args.org_name}: <{base_ref}:{args.org_name}/> .\n" node_lines = "".join([f"@prefix {node_type.lower().replace(' ', '_').replace('10x_', '')}: <{args.org_name}:{node_type.lower().replace(' ', '_').replace('10x_', '')}/> .\n" for node_type in node_set]) first_lines_set = "".join(set(first_lines)) f.write(first_lines_set) From ce072755b1d879c84242adcb2b0dd740520fe620 Mon Sep 17 00:00:00 2001 From: Orion Banks <49208907+Bankso@users.noreply.github.com> Date: Fri, 3 Oct 2025 15:58:18 -0700 Subject: [PATCH 63/67] Refactor to use prefixes --- utils/build_template_ttl.py | 75 ++++++++++++++++++++++++++++--------- 1 file changed, 58 insertions(+), 17 deletions(-) diff --git a/utils/build_template_ttl.py b/utils/build_template_ttl.py index 5a49550d..a8b2d0bb 100644 --- a/utils/build_template_ttl.py +++ b/utils/build_template_ttl.py @@ -55,12 +55,20 @@ def get_args(): default="new_org" ) parser.add_argument( - "-p", - "--tag_prefix", + "-b", + "--base_tag", type=str, help="The tag that will be used as a prefix in RDF", required=False, default="http://syn.org" + ) + parser.add_argument( + "-r", + "--base_ref", + type=str, + help="The prefix that will be used to represent the base_tag", + required=False, + default="syn" ) parser.add_argument( "-v", @@ -82,10 +90,36 @@ def main(): args = get_args() - prefix_tag = args.tag_prefix - conform_tag = "" - int_tag = "" + base_tag = args.base_tag + base_ref = args.base_ref + + org_tag = args.org_name + conform_tag = "conformsTo" + int_tag = "integer" version = args.version + template_tag = "Template" + col_tag = "hasColumn" + pos_tag = "ColumnPosition" + colname_tag = "column" + header_tag = "header" + value_tag = "position" + def_val_tag = "defaultValue" + uuid_tag = "uuid" + + tag_dict = { # Can replace tuples with alternative tag definitions + base_ref : (base_ref, f"<{base_tag}/>"), + org_tag : (org_tag, f"<{base_ref}:{org_tag}/>"), + conform_tag : ("purl", ""), + int_tag : ("xml", ""), + template_tag : (base_ref, f"<{base_tag}/>"), + col_tag : (base_ref, f"<{base_tag}/>"), + uuid_tag : (base_ref, f"<{base_tag}/>"), + pos_tag : (base_ref, f"<{base_tag}/>"), + colname_tag : (base_ref, f"<{base_tag}/>"), + header_tag : (base_ref, f"<{base_tag}/>"), + value_tag : (base_ref, f"<{base_tag}/>"), + def_val_tag: (base_ref, f"<{base_tag}/>") + } if args.template: print(f"Processing model [{args.template}] to template.ttl...") @@ -95,17 +129,24 @@ def main(): template_df = pd.read_csv(args.template, header=0, keep_default_na=True, sep=sep) if template_name.startswith("GC_Data_Loading_Template"): - template_name = template_name.split("_")[-2] - version = template_name.split("_")[-1] + template_name_split = 
template_name.split("_") + template_name = template_name_split[-2] + version = template_name_split[-1] out_file = "/".join([args.output, f"{args.org_name}_{template_name}_{version}.ttl"]) with open(out_file, "w+") as f: print(f"Building RDF triples and serializing to TTL...") - + prefix_set = [prefix for prefix in tag_dict.keys()] + first_lines = [f"@prefix {tag_dict[prefix][0]}: {tag_dict[prefix][1]}"+" .\n" for prefix in prefix_set] + template_name_tag = f"@prefix {template_name}: <{tag_dict[template_tag][0]}:{template_name}/>"+" .\n\n" + first_lines_set = "".join(sorted(set(first_lines))) + f.write(first_lines_set) + f.write(template_name_tag) + # write template definition - f.write(f"<{prefix_tag}/{args.org_name}/{template_name}> a <{prefix_tag}/Template> ."+"\n") - f.write(f"<{prefix_tag}/{args.org_name}/{template_name}> {conform_tag} <{prefix_tag}/{args.org_name}> ."+"\n") + f.write(f"{tag_dict[org_tag][0]}:{template_name} a {tag_dict[base_ref][0]}:{template_tag} ."+"\n") + f.write(f"{tag_dict[template_tag][0]}:{template_name} {tag_dict[conform_tag][0]}:{conform_tag} {tag_dict[base_ref][0]}:{org_tag} ."+"\n") # write column definitions # set col position counter @@ -114,13 +155,13 @@ def main(): clean_col = format_uri(col) col_uuid = uuid4() if col in ["Component", "type"]: - f.write(f'<{prefix_tag}/{args.org_name}/{template_name}/{clean_col}> <{prefix_tag}/defaultValue> "{template_name}" .'+"\n") - f.write(f"<{prefix_tag}/{args.org_name}/{template_name}> <{prefix_tag}/hasColumn> <{prefix_tag}/{args.org_name}/{template_name}/{clean_col}> ."+"\n") - f.write(f"<{prefix_tag}/{col_uuid}>a <{prefix_tag}/ColumnPosition> ;"+"\n") - f.write(f"<{prefix_tag}/template> <{prefix_tag}/{args.org_name}/{template_name}> ;"+"\n") - f.write("\t"+f"<{prefix_tag}/column> <{prefix_tag}/{args.org_name}/{template_name}/{clean_col}> ;"+"\n") - f.write("\t"+f'<{prefix_tag}/header> "{col}" ;'+"\n") - f.write("\t"+f'<{prefix_tag}/position> "{col_position}"^^{int_tag} .'+"\n") + f.write(f'{template_name}:{clean_col} {tag_dict[def_val_tag][0]}:{def_val_tag} "{template_name}" .'+"\n") + f.write(f"{tag_dict[template_tag][0]}:{template_name} {tag_dict[col_tag][0]}:{col_tag} {template_name}:{clean_col} ."+"\n") + f.write(f"{tag_dict[uuid_tag][0]}:{col_uuid} a {tag_dict[pos_tag][0]}:{pos_tag} ;"+"\n") + f.write(f"{tag_dict[template_tag][0]}:{template_tag} {tag_dict[org_tag][0]}:{template_name} ;"+"\n") + f.write("\t"+f"{tag_dict[colname_tag][0]}:{colname_tag} {template_name}:{clean_col} ;"+"\n") + f.write("\t"+f'{tag_dict[header_tag][0]}:{header_tag} "{clean_col}" ;'+"\n") + f.write("\t"+f'{tag_dict[value_tag][0]}:{value_tag} "{col_position}"^^{tag_dict[int_tag][0]}:{int_tag} .'+"\n") col_position += 1 print(f"Done ✅") From 6aab19dca0ce9b37fb7c68908275783f2bb2e21c Mon Sep 17 00:00:00 2001 From: Orion Banks <49208907+Bankso@users.noreply.github.com> Date: Fri, 3 Oct 2025 16:10:55 -0700 Subject: [PATCH 64/67] Add visualization function --- utils/build_template_ttl.py | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/utils/build_template_ttl.py b/utils/build_template_ttl.py index a8b2d0bb..39ce51a6 100644 --- a/utils/build_template_ttl.py +++ b/utils/build_template_ttl.py @@ -24,9 +24,17 @@ import argparse import os +import io import pandas as pd from pathlib import Path from uuid import uuid4 +from IPython.display import display, Image +from PIL import Image +import pydot +import rdflib +from rdflib.extras.external_graph_libs import rdflib_to_networkx_multidigraph +from rdflib.tools import 
rdf2dot + def get_args(): """Set up command-line interface and get arguments.""" @@ -92,6 +100,7 @@ def main(): base_tag = args.base_tag base_ref = args.base_ref + build_graph = True org_tag = args.org_name conform_tag = "conformsTo" @@ -167,5 +176,20 @@ def main(): print(f"Done ✅") print(f"{out_file} was written with {col_position} attributes!") + g = rdflib.Graph() + model_graph = g.parse(out_file, format="turtle") + image_path = "/".join([args.output, f"{args.org_name}_{template_name}_{version}.png"]) + + if build_graph is not None: + dot_stream = io.StringIO() + rdf2dot.rdf2dot(model_graph, dot_stream) + dot_string = dot_stream.getvalue() + dg = pydot.graph_from_dot_data(dot_string) + dg[0].write_png(image_path) + image = Image.open(image_path) + image.show() + print(f"Success! Graph visualization is available at {image_path}") + + if __name__ == "__main__": main() From 9785d3318ed75bfd7e02e2c7818ea95cd8bcdaca Mon Sep 17 00:00:00 2001 From: Orion Banks <49208907+Bankso@users.noreply.github.com> Date: Fri, 3 Oct 2025 16:18:42 -0700 Subject: [PATCH 65/67] Update docstrings, add bg flag -bg flag toggles if the template is visualized and saved as a PNG --- utils/build_template_ttl.py | 54 +++++++++++++++++++++---------------- 1 file changed, 31 insertions(+), 23 deletions(-) diff --git a/utils/build_template_ttl.py b/utils/build_template_ttl.py index 39ce51a6..6dc63c2c 100644 --- a/utils/build_template_ttl.py +++ b/utils/build_template_ttl.py @@ -4,37 +4,38 @@ Converts a metadata template CSV info to a ttl file defining the template. ttl file can be used as input for the arachne agent and would be available as a target. -usage: build_template_ttl.py [-h] [-t TEMPLATE] [-o OUTPUT] [-g ORG_NAME] [-p TAG_PREFIX] [-v VERSION] +usage: build_template_ttl.py [-h] [-t TEMPLATE] [-o OUTPUT] [-g ORG_NAME] [-b BASE_TAG] [-r BASE_REF] [-v VERSION] [-bg] options: -h, --help show this help message and exit -t TEMPLATE, --template TEMPLATE - Path to metadata template CSV + Path to metadata template in tabular format (Default: None) -o OUTPUT, --output OUTPUT - Path to folder where graph should be stored + Path to folder where graph should be stored (Default: current directory) -g ORG_NAME, --org_name ORG_NAME - Abbreviation for org, used in RDF prefixes - -p TAG_PREFIX, --tag_prefix TAG_PREFIX - The tag that will be used as a prefix in RDF + Abbreviation for org, used in RDF prefixes (Default: 'new_org) + -b BASE_TAG, --base_tag BASE_TAG + The tag that will be used as a prefix in RDF (Default: 'http://syn.org') + -r BASE_REF, --base_ref BASE_REF + The prefix that will be used to represent the base_tag (Default: 'syn') -v VERSION, --version VERSION Version applied to output ttl filename (Default: 1.0.0) + -bg, --build_graph Boolean. 
Pass this flag to generate a PNG of the input model (Default: None) author: orion.banks """ import argparse -import os import io +import os import pandas as pd -from pathlib import Path -from uuid import uuid4 -from IPython.display import display, Image -from PIL import Image import pydot import rdflib -from rdflib.extras.external_graph_libs import rdflib_to_networkx_multidigraph -from rdflib.tools import rdf2dot +from pathlib import Path +from PIL import Image +from rdflib.tools import rdf2dot +from uuid import uuid4 def get_args(): """Set up command-line interface and get arguments.""" @@ -43,14 +44,14 @@ def get_args(): "-t", "--template", type=str, - help="Path to metadata template CSV", + help="Path to metadata template in tabular format (Default: None)", required=False ) parser.add_argument( "-o", "--output", type=str, - help="Path to folder where graph should be stored", + help="Path to folder where graph should be stored (Default: current directory)", required=False, default=os.getcwd() ) @@ -58,7 +59,7 @@ def get_args(): "-g", "--org_name", type=str, - help="Abbreviation for org, used in RDF prefixes", + help="Abbreviation for org, used in RDF prefixes (Default: 'new_org)", required=False, default="new_org" ) @@ -66,7 +67,7 @@ def get_args(): "-b", "--base_tag", type=str, - help="The tag that will be used as a prefix in RDF", + help="The tag that will be used as a prefix in RDF (Default: 'http://syn.org')", required=False, default="http://syn.org" ) @@ -74,7 +75,7 @@ def get_args(): "-r", "--base_ref", type=str, - help="The prefix that will be used to represent the base_tag", + help="The prefix that will be used to represent the base_tag (Default: 'syn')", required=False, default="syn" ) @@ -85,6 +86,14 @@ def get_args(): help="Version applied to output ttl filename (Default: 1.0.0)", required=False, default="1.0.0" + ) + parser.add_argument( + "-bg", + "--build_graph", + help="Boolean. 
Pass this flag to generate a PNG of the input model (Default: None)", + action="store_true", + required=False, + default=None ) return parser.parse_args() @@ -100,7 +109,7 @@ def main(): base_tag = args.base_tag base_ref = args.base_ref - build_graph = True + build_graph = args.build_graph org_tag = args.org_name conform_tag = "conformsTo" @@ -176,11 +185,10 @@ def main(): print(f"Done ✅") print(f"{out_file} was written with {col_position} attributes!") - g = rdflib.Graph() - model_graph = g.parse(out_file, format="turtle") - image_path = "/".join([args.output, f"{args.org_name}_{template_name}_{version}.png"]) - if build_graph is not None: + g = rdflib.Graph() + model_graph = g.parse(out_file, format="turtle") + image_path = "/".join([args.output, f"{args.org_name}_{template_name}_{version}.png"]) dot_stream = io.StringIO() rdf2dot.rdf2dot(model_graph, dot_stream) dot_string = dot_stream.getvalue() From 0d301d97e0cc894dffd8ebedd5e78c6f3195fd0f Mon Sep 17 00:00:00 2001 From: Orion Banks <49208907+Bankso@users.noreply.github.com> Date: Tue, 21 Oct 2025 17:15:49 -0700 Subject: [PATCH 66/67] Use pydotplus; add graph engine input --- utils/csv_to_ttl.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/utils/csv_to_ttl.py b/utils/csv_to_ttl.py index 42858825..c58a8b6a 100644 --- a/utils/csv_to_ttl.py +++ b/utils/csv_to_ttl.py @@ -46,7 +46,7 @@ import pandas as pd from pathlib import Path from PIL import Image -import pydot +import pydotplus import rdflib from rdflib.extras.external_graph_libs import rdflib_to_networkx_multidigraph from rdflib.tools import rdf2dot @@ -438,7 +438,7 @@ def main(): image_path = "/".join([args.output, f"{args.org_name}_{node_name}_{args.version}.png"]) if args.build_graph is not None: - retry = 0 + retry = 1 image = None while image is None: if retry > 0: @@ -447,9 +447,9 @@ def main(): dot_stream = io.StringIO() rdf2dot.rdf2dot(model_graph, dot_stream) dot_string = dot_stream.getvalue() - dg = pydot.graph_from_dot_data(dot_string) + graph = pydotplus.graph_from_dot_data(dot_string) try: - dg[0].write_png(image_path) + graph.write_png(image_path, prog="sfdp") image = Image.open(image_path) image.show() print(f"Success! Graph visualization is available at {image_path}") @@ -460,7 +460,7 @@ def main(): if retry == 2: print("Failed to generate a visualization of the graph. Skipping.") with open("graph_string_error.txt", "w+") as f: - f.write(dg[0].to_string()) + f.write(graph.to_string()) break if args.interactive_graph is not None: From 4bb505b75707700cec5600a1b443ecea65b10803 Mon Sep 17 00:00:00 2001 From: Orion Banks <49208907+Bankso@users.noreply.github.com> Date: Wed, 22 Oct 2025 13:27:26 -0700 Subject: [PATCH 67/67] Add primary and foreign key handling In a schematic-based model, if 'primary_key' is listed in 'Properties', 'is_key' will be 'true' If the primary key of another model is listed (form component_id, e.g., 'Study_id') in Properties, a triple will be created to document the linkage. This is used by graphing software to render diagrams. 
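The key handling described above can be illustrated with a short sketch; the Properties value here is hypothetical, and the actual parsing is the get_reference_id function in the diff that follows:

    # Hypothetical Properties cell for an attribute of a Biospecimen component
    properties = "CDE:12345678, Study_id, primary_key"

    entries = properties.split(", ")
    cde_refs = [e for e in entries if len(e.split(":")) > 1]  # ["CDE:12345678"] -> recorded in maps_to
    key_refs = [e for e in entries if len(e.split("_")) > 1]  # ["Study_id", "primary_key"]

    is_key = "true" if "primary_key" in key_refs else ""      # flags the attribute as a primary key
    # "Study_id" ends in "_id", so a linkage triple from this component to the Study primary key is emitted
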
--- utils/csv_to_ttl.py | 35 ++++++++++++++++++++++++----------- 1 file changed, 24 insertions(+), 11 deletions(-) diff --git a/utils/csv_to_ttl.py b/utils/csv_to_ttl.py index c58a8b6a..aea2625a 100644 --- a/utils/csv_to_ttl.py +++ b/utils/csv_to_ttl.py @@ -175,17 +175,21 @@ def convert_schematic_model_to_ttl_format(input_df: pd.DataFrame, org_name: str, out_df["term"] = out_df.apply(lambda row: format_uri(row["Resolved_Node"], row["label"]), axis=1) # Step 4: Info extraction and TTL-compatible column formatting + key_tuples = [] for _, row in out_df.iterrows(): out_df.at[_, "description"] = attribute_rows.loc[row["label"], "Description"] - out_df.at[_, "maps_to"] = get_reference_id(str(attribute_rows.loc[row["label"], "Properties"])) + mappings = get_reference_id(str(attribute_rows.loc[row["label"], "Properties"])) + out_df.at[_, "maps_to"] = mappings[0] out_df.at[_, "node"] = row["Resolved_Node_URI"] - out_df.at[_, "is_key"] = "true" if str(attribute_rows.loc[row["label"], "Validation Rules"]).strip().lower() == "unique" else "" + out_df.at[_, "is_key"] = "true" if "primary_key" in mappings[1] else "" out_df.at[_, "required_by"] = row["Resolved_Node_URI"] if str(attribute_rows.loc[row["label"], "Required"]).strip().lower() == "true" else "" out_df.at[_, "has_enum"] = '"[' + ", ".join(str(attribute_rows.loc[row["label"], "Valid Values"]).split(", ")) + ']"' if str(attribute_rows.loc[row["label"], "Valid Values"]) != "nan" else "" col_type = attribute_rows.loc[row["label"], "columnType"] validation = attribute_rows.loc[row["label"], "Validation Rules"] is_enum = True if str(attribute_rows.loc[row["label"], "Valid Values"]) != "" else False out_df.at[_, "type"] = '"' + convert_schematic_column_type(col_type, validation, is_enum) + '"' + for e in mappings[1]: + key_tuples.append((e, row["Resolved_Node_URI"], row["term"])) out_df["label"] = '"' + out_df["label"].fillna('') + '"' out_df["description"] = '"' + out_df["description"].fillna('').apply(lambda x: x.replace('"', '')) + '"' @@ -193,7 +197,7 @@ def convert_schematic_model_to_ttl_format(input_df: pd.DataFrame, org_name: str, # Final output final_cols = ["term", "label", "description", "node", "type", "required_by", "maps_to", "is_key", "has_enum"] - return out_df[final_cols], node_list + return out_df[final_cols], node_list, [key_tuple for key_tuple in key_tuples if key_tuple is not None] def convert_crdc_model_to_ttl_format(input_df: pd.DataFrame, org_name: str) -> tuple[pd.DataFrame, list[str]]: @@ -250,15 +254,17 @@ def convert_schematic_column_type(type:str, validation: str, is_enum:bool) -> st return out_type -def get_reference_id(entry: str) -> list[tuple[str, str]]: +def get_reference_id(entry: str) -> tuple[str, list[str]]: """Extract CDE ID from Properties entry.""" - entry = entry.split(", ") if len(entry.split(", ")) > 1 else entry + entry = entry.split(", ") if len(entry.split(", ")) > 1 else [entry] + + refs = [e for e in entry if len(e.split(":")) > 1] + keys = [e for e in entry if len(e.split("_")) > 1] - if type(entry) == list: - return ", ".join([f"{ref.split(':')[0]}:{ref.split(':')[1]}" for ref in entry]) - else: - return f"{entry.split(':')[0]}:{entry.split(':')[1]}" if len(entry.split(":")) > 1 else "" + ref_out = ", ".join([f"{ref.split(':')[0]}:{ref.split(':')[1]}" for ref in refs]) + key_out = keys + return (ref_out, key_out) def convert_gc_column_type(type:str, is_enum:bool) -> str: """Convert GC column type to TTL-compatible format.""" @@ -356,12 +362,13 @@ def main(): print(f"Processing model based on 
schematic CSV specification...") if args.subset is not None: model_df = subset_model(model_df, f"{args.subset}") - ttl_df, node_list = convert_schematic_model_to_ttl_format(model_df, args.org_name, args.subset) + ttl_df, node_list, key_tuple_list = convert_schematic_model_to_ttl_format(model_df, args.org_name, args.subset) if ref == "crdc": print(f"Processing model based on CRDC TSV specification...") if args.subset is not None: model_df = model_df[model_df["Node"].isin(args.subset.split(", "))] ttl_df, node_list = convert_crdc_model_to_ttl_format(model_df, args.org_name) + key_tuple_list = None print(f"RDF triples will be built from the generated precursor dataframe!") node_name = "_".join(args.subset.split(", ")) if args.subset is not None else "all" @@ -429,6 +436,12 @@ def main(): f.write(org_line) f.write(node_lines) f.write(current_lines) + f.write("\n") + if key_tuple_list is not None: + for primary, schema, foreign in key_tuple_list: + if "id" in primary.split("_"): + line = f"{':'.join([str(primary).split('_')[0].lower(), str(primary).lower()])} {str(schema).lower()} {str(foreign).lower()} .\n" + f.write(line) print(f"Done ✅") print(f"{out_file} was written with {len(ttl_df)} triples!") @@ -449,7 +462,7 @@ def main(): dot_string = dot_stream.getvalue() graph = pydotplus.graph_from_dot_data(dot_string) try: - graph.write_png(image_path, prog="sfdp") + graph.write_png(image_path, prog="dot") image = Image.open(image_path) image.show() print(f"Success! Graph visualization is available at {image_path}")