Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -137,3 +137,6 @@ great_expectations/
outputs/
upload_check*
*manifests/
annotations/inputs/*
annotations/output/*
annotations/outputs/*
97 changes: 97 additions & 0 deletions annotations/add_boolean_columns.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
import pandas as pd
import synapseclient
import argparse
import pandas as pd
from pathlib import Path
import subprocess
import re

# Login to Synapse
def login():
    """Open a Synapse session and return the client object.

    Calls ``synapseclient.login()`` with no arguments and hands back
    whatever that call returns.
    """
    return synapseclient.login()

def get_tables(syn, tableIdList, mergeFlag):
    """Download Synapse tables, save each as a CSV, and return them with names.

    Each table is queried with ``SELECT * FROM {tableId}``, missing values are
    replaced by empty strings, and the result is written to
    ``output/<name>/<name>.csv`` (relative to the current working directory).

    Args:
        syn: Object exposing ``tableQuery(query)``; the query result must
            provide ``asDataFrame()``.
        tableIdList: Iterable of table IDs to download.
        mergeFlag: If truthy, return the downloaded DataFrames; otherwise
            return the paths of the CSV files that were written.

    Returns:
        A list of ``(table_or_path, name)`` tuples, in ``tableIdList`` order.

    Raises:
        IndexError: If a queried table contains no data, so no name can be
            read from it.
    """
    results = []

    for tableId in tableIdList:
        # Pull the table from Synapse; blank out missing values.
        table = syn.tableQuery(f"SELECT * FROM {tableId}").asDataFrame().fillna("")

        if table.empty:
            raise IndexError(
                f"Table {tableId} returned no data; cannot determine its name"
            )

        # The data type name is read from the first column ("Component" is
        # assumed to be the first column). Read the first row rather than the
        # second so that single-row tables work as well.
        name = table.iat[0, 0]

        # Build the CSV location and make sure its folder exists.
        manifestPath = Path(f"output/{name}/{name}.csv")
        manifestPath.parent.mkdir(parents=True, exist_ok=True)

        table.to_csv(manifestPath, index=False, lineterminator="\n")

        # When merging, downstream code needs the DataFrame itself;
        # otherwise it only needs to know where the CSV was written.
        results.append((table if mergeFlag else manifestPath, name))

    return results

def _has_reference(ids, foreign_keys):
    """Return a boolean array marking which ``ids`` occur in ``foreign_keys``."""
    # Exact-match, vectorized membership test. Returning a plain array keeps
    # the result positional, so assigning it as a column never depends on how
    # the DataFrame index is labelled.
    return ids.isin(foreign_keys).to_numpy()


def add_boolean_comparison_columns():
    """Flag, for each record, whether other resource tables reference it.

    Logs in to Synapse, downloads the grant, publication, dataset, tool and
    education tables via ``get_tables`` (which also writes each one to CSV),
    and adds boolean columns:

    * grants: ``Publication_boolean``, ``Dataset_boolean``, ``Tool_boolean``,
      ``Education_boolean`` -- whether ``GrantView_id`` appears in the other
      table's ``GrantView Key`` column.
    * publications: ``Dataset_boolean``, ``Tool_boolean``,
      ``Education_boolean`` -- same check using ``PublicationView_id`` /
      ``PublicationView Key``.
    * datasets: ``Tool_boolean``, ``Education_boolean`` -- using
      ``DatasetView_id`` / ``DatasetView Key``.
    * tools: ``Education_boolean`` -- using ``ToolView_id`` / ``ToolView Key``.

    Matching is by exact cell equality.
    NOTE(review): if a key cell can hold several IDs (e.g. comma-separated),
    those will not match -- confirm against the table schema.

    Returns:
        ``[grants, pubs, datasets, tools, educ]`` as DataFrames; the education
        table is returned without added columns.

    Raises:
        KeyError: If an expected ``*_id`` or ``* Key`` column is missing.
    """
    syn_login = login()

    # Order matters: it fixes both the unpacking below and the return order.
    table_ids = [
        'syn64590399',  # grants
        'syn52752398',  # publications
        'syn52752399',  # datasets
        'syn52820451',  # tools
        'syn52963530',  # education
    ]
    grants, pubs, datasets, tools, educ = (
        table for table, _name in get_tables(syn_login, table_ids, True)
    )

    # Membership is computed column-wise instead of with ``series[i]`` inside
    # ``range(len(df))``: integer ``[]`` lookups on a Series are label-based,
    # so they only worked through pandas' deprecated positional fallback when
    # the row index is not a plain 0..n-1 range.
    grant_ids = grants['GrantView_id']
    grants['Publication_boolean'] = _has_reference(grant_ids, pubs['GrantView Key'])
    grants['Dataset_boolean'] = _has_reference(grant_ids, datasets['GrantView Key'])
    grants['Tool_boolean'] = _has_reference(grant_ids, tools['GrantView Key'])
    grants['Education_boolean'] = _has_reference(grant_ids, educ['GrantView Key'])

    pub_ids = pubs['PublicationView_id']
    pubs['Dataset_boolean'] = _has_reference(pub_ids, datasets['PublicationView Key'])
    pubs['Tool_boolean'] = _has_reference(pub_ids, tools['PublicationView Key'])
    pubs['Education_boolean'] = _has_reference(pub_ids, educ['PublicationView Key'])

    dataset_ids = datasets['DatasetView_id']
    datasets['Tool_boolean'] = _has_reference(dataset_ids, tools['DatasetView Key'])
    datasets['Education_boolean'] = _has_reference(dataset_ids, educ['DatasetView Key'])

    tools['Education_boolean'] = _has_reference(tools['ToolView_id'], educ['ToolView Key'])

    return [grants, pubs, datasets, tools, educ]