Skip to content

Commit d14edc4

Browse files
authored
Merge pull request #917 from Sage-Bionetworks/develop-table-replace
Develop Table Uploads: Replace
2 parents 5f5fc67 + d06600e commit d14edc4

File tree

2 files changed

+101
-15
lines changed

2 files changed

+101
-15
lines changed

schematic/models/metadata.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -305,7 +305,7 @@ def submit_metadata_manifest(
305305
#TODO: avoid explicitly exposing Synapse store functionality
306306
# just instantiate a Store class and let it decide at runtime/config
307307
# the store type
308-
syn_store = SynapseStorage(input_token=input_token)
308+
syn_store = SynapseStorage(input_token = input_token, project_scope = project_scope)
309309
manifest_id=None
310310
censored_manifest_id=None
311311
restrict_maniest=False

schematic/store/synapse.py

Lines changed: 100 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -15,6 +15,7 @@
1515
import re
1616
import synapseclient
1717
import synapseutils
18+
from time import sleep
1819

1920
from synapseclient import (
2021
Synapse,
@@ -77,11 +78,12 @@ def __init__(
7778
"""
7879

7980
self.syn = self.login(token, access_token, input_token)
81+
self.project_scope = project_scope
8082
try:
8183
self.storageFileview = CONFIG["synapse"]["master_fileview"]
82-
if project_scope:
84+
if self.project_scope:
8385
self.storageFileviewTable = self.syn.tableQuery(
84-
f"SELECT * FROM {self.storageFileview} WHERE projectId IN {tuple(project_scope + [''])}"
86+
f"SELECT * FROM {self.storageFileview} WHERE projectId IN {tuple(self.project_scope + [''])}"
8587
).asDataFrame()
8688
else:
8789
# get data in administrative fileview for this pipeline
@@ -712,9 +714,26 @@ def get_synapse_table(self, synapse_id: str) -> Tuple[pd.DataFrame, CsvFileTable
712714

713715
return df, results
714716

717+
def _get_tables(self) -> List[Table]:
    """List the table children of the first project in ``self.project_scope``.

    Only the first entry of ``self.project_scope`` is consulted; any
    further entries are ignored.

    Returns:
        list: The items yielded by ``self.syn.getChildren`` for the project,
            restricted to ``includeTypes=["table"]``. An empty list is
            returned when no project scope is set, so callers see "no
            existing tables" instead of an opaque ``TypeError`` /
            ``IndexError`` from indexing a missing scope.
    """
    # project_scope may be None or empty; there is then no project to query.
    if not self.project_scope:
        return []

    project = self.syn.get(self.project_scope[0])
    # getChildren is consumed eagerly so callers get a reusable list.
    return list(self.syn.getChildren(project, includeTypes=["table"]))
720+
721+
def get_table_info(self) -> Dict:
    """Map the name of each table found by ``self._get_tables()`` to its id.

    Returns:
        dict: ``{table name: table id}``, built from the ``"name"`` and
            ``"id"`` keys of each item returned by ``self._get_tables()``.
            When no tables are found, the placeholder ``{None: None}`` is
            returned rather than an empty dict.
    """
    tables = self._get_tables()
    if not tables:
        # Placeholder kept for backward compatibility with existing callers;
        # a membership test on a real table name still evaluates to False.
        return {None: None}
    return {table["name"]: table["id"] for table in tables}
731+
715732
@missing_entity_handler
716-
def upload_format_manifest_table(self, se, manifest, datasetId, table_name, restrict, useSchemaLabel,):
733+
def upload_format_manifest_table(self, se, manifest, datasetId, table_name, restrict, useSchemaLabel):
717734
# Rename the manifest columns to display names to match fileview
735+
table_info = self.get_table_info()
736+
718737
blacklist_chars = ['(', ')', '.', ' ']
719738
manifest_columns = manifest.columns.tolist()
720739

@@ -748,8 +767,22 @@ def upload_format_manifest_table(self, se, manifest, datasetId, table_name, rest
748767

749768
# Put table manifest onto synapse
750769
schema = Schema(name=table_name, columns=col_schema, parent=self.getDatasetProject(datasetId))
751-
table = self.syn.store(Table(schema, table_manifest), isRestricted=restrict)
752-
manifest_table_id = table.schema.id
770+
if table_name not in table_info.keys():
771+
manifest_table_id = self.make_synapse_table(table_to_load = table_manifest,
772+
dataset_id = datasetId,
773+
existingTableId = None,
774+
table_name = table_name,
775+
column_type_dictionary = col_schema,
776+
restrict = restrict,
777+
manipulation = 'replace')
778+
else:
779+
manifest_table_id = self.make_synapse_table(table_to_load = table_manifest,
780+
dataset_id = datasetId,
781+
existingTableId = table_info[table_name],
782+
table_name = table_name,
783+
column_type_dictionary = col_schema,
784+
restrict = restrict,
785+
manipulation = 'replace')
753786

754787

755788
return manifest_table_id, manifest, table_manifest
@@ -1036,7 +1069,7 @@ def associateMetadataWithFiles(
10361069
# If specified, upload manifest as a table and get the SynID and manifest
10371070
if manifest_record_type == 'table' or manifest_record_type == 'both':
10381071
manifest_synapse_table_id, manifest, table_manifest = self.upload_format_manifest_table(
1039-
se, manifest, datasetId, table_name, restrict = restrict_manifest, useSchemaLabel=useSchemaLabel,)
1072+
se, manifest, datasetId, table_name, restrict = restrict_manifest, useSchemaLabel=useSchemaLabel)
10401073

10411074
# Iterate over manifest rows, create Synapse entities and store corresponding entity IDs in manifest if needed
10421075
# also set metadata for each synapse entity as Synapse annotations
@@ -1317,20 +1350,73 @@ def getDatasetAnnotationsBatch(
13171350

13181351
return table
13191352

1320-
def make_synapse_table(self, table_to_load, dataset_id, existingTableId, table_name,
1321-
update_col = 'entityId', column_type_dictionary = {}, specify_schema=True, restrict = False):
1353+
def make_synapse_table(self,
1354+
table_to_load: pd.DataFrame,
1355+
dataset_id: str, table_name: str,
1356+
existingTableId: str = None,
1357+
update_col: str = 'entityId',
1358+
column_type_dictionary: Dict = {},
1359+
specify_schema: bool = True,
1360+
restrict: bool = False,
1361+
manipulation: str = 'update') -> str:
13221362
'''
1323-
Record based data
1363+
Make a synapse table for record based data
1364+
1365+
Args:
1366+
table_to_load (pd.DataFrame): table to upload to synapse
1367+
dataset_id (str): synID for dataset related to manifest to be uploaded as table
1368+
existingTableId (str): Optional, synID of existing table to upload to
1369+
table_name (str): Name of the table that will be displayed on synapse
1370+
update_col (str): Optional, if updating a table by aligning on index, column to use as indices
1371+
column_type_dictionary (Dict): dictionary of column types
1372+
specify_schema (bool): specify a schema for the table at upload according to types in column_type_dictionary
1373+
restrict (bool): set to True if access restrictions need to be imposed on table when stored on synapse, False otherwise
1374+
manipulation (str): type of manipulation to do if a table exists already. Can be either "update" or "replace".
1375+
Defaults to "update" to preserve old behavior
1376+
1377+
Returns:
1378+
str: synId of table uploaded to synapse
1379+
13241380
'''
1325-
# create/update a table corresponding to this dataset in this dataset's parent project
1326-
# update_col is the column in the table that has a unique code that will allow Synapse to
1327-
# locate its position in the old and new table.
13281381
if existingTableId:
13291382
existing_table, existing_results = self.get_synapse_table(existingTableId)
1330-
table_to_load = update_df(existing_table, table_to_load, update_col)
1383+
1384+
manipulation = manipulation.lower()
1385+
if manipulation not in ['update', 'replace']:
1386+
raise NotImplementedError(
1387+
"Currently, only 'update' and 'replace' table operations are supported."
1388+
)
1389+
1390+
# create/update a table corresponding to this dataset in this dataset's parent project
1391+
# update_col is the column in the table that has a unique code that will allow Synapse to
1392+
# locate its position in the old and new table.
1393+
if manipulation == 'update':
1394+
table_to_load = update_df(existing_table, table_to_load, update_col)
1395+
1396+
elif manipulation == 'replace':
1397+
# remove rows
1398+
self.syn.delete(existing_results)
1399+
1400+
# wait for row deletion to finish on synapse before getting empty table
1401+
sleep(5)
1402+
# removes all current columns
1403+
current_table = self.syn.get(existingTableId)
1404+
current_columns = self.syn.getTableColumns(current_table)
1405+
for col in current_columns:
1406+
current_table.removeColumn(col)
1407+
1408+
# adds new columns to schema
1409+
new_columns = as_table_columns(table_to_load)
1410+
for col in new_columns:
1411+
current_table.addColumn(col)
1412+
self.syn.store(current_table, isRestricted = restrict)
1413+
1414+
# store table with existing etag data and impose restrictions as appropriate
13311415
self.syn.store(Table(existingTableId, table_to_load, etag = existing_results.etag), isRestricted = restrict)
1416+
13321417
# remove system metadata from manifest
13331418
existing_table.drop(columns = ['ROW_ID', 'ROW_VERSION'], inplace = True)
1419+
return existingTableId
13341420
else:
13351421
datasetEntity = self.syn.get(dataset_id, downloadFile = False)
13361422
datasetName = datasetEntity.name
@@ -1359,7 +1445,7 @@ def make_synapse_table(self, table_to_load, dataset_id, existingTableId, table_n
13591445
cols.append(Column(name=col, columnType='STRING', maximumSize=500))
13601446
schema = Schema(name=table_name, columns=cols, parent=datasetParentProject)
13611447
table = Table(schema, table_to_load)
1362-
table_id = self.syn.store(table, isRestricted = restrict)
1448+
table = self.syn.store(table, isRestricted = restrict)
13631449
return table.schema.id
13641450
else:
13651451
# For just uploading the tables to synapse using default

0 commit comments

Comments
 (0)