Skip to content

Commit ba1e879

Browse files
authored
Merge pull request #1254 from Sage-Bionetworks/develop-filename-annotations-FDS-654
BugFix: Fixed an issue where files on synapse were not being annotated correctly
2 parents 30a41f5 + 3ed79bd commit ba1e879

File tree

5 files changed

+95
-24
lines changed

5 files changed

+95
-24
lines changed

schematic/models/commands.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -110,9 +110,9 @@ def submit_manifest(
110110
"""
111111
Running CLI with manifest validation (optional) and submission options.
112112
"""
113-
if jsonld is None:
114-
jsonld = CONFIG.model_location
115-
log_value_from_config("jsonld", jsonld)
113+
114+
jsonld = CONFIG.model_location
115+
log_value_from_config("jsonld", jsonld)
116116

117117
metadata_model = MetadataModel(
118118
inputMModelLocation=jsonld, inputMModelLocationType="local"

schematic/store/synapse.py

Lines changed: 45 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -574,13 +574,7 @@ def updateDatasetManifestFiles(self, sg: SchemaGenerator, datasetId: str, store:
574574
# the columns Filename and entityId are assumed to be present in manifest schema
575575
# TODO: use idiomatic panda syntax
576576
if dataset_files:
577-
new_files = {"Filename": [], "entityId": []}
578-
579-
# find new files if any
580-
for file_id, file_name in dataset_files:
581-
if not file_id in manifest["entityId"].values:
582-
new_files["Filename"].append(file_name)
583-
new_files["entityId"].append(file_id)
577+
new_files = self._get_file_entityIds(dataset_files=dataset_files, only_new_files=True, manifest=manifest)
584578

585579
# update manifest so that it contain new files
586580
new_files = pd.DataFrame(new_files)
@@ -600,6 +594,38 @@ def updateDatasetManifestFiles(self, sg: SchemaGenerator, datasetId: str, store:
600594
manifest = manifest.fillna("")
601595

602596
return manifest_id, manifest
597+
598+
def _get_file_entityIds(self, dataset_files: List, only_new_files: bool = False, manifest: pd.DataFrame = None):
599+
"""
600+
Get a dictionary of files in a dataset. Either files that are not in the current manifest or all files
601+
602+
Args:
603+
manifest: metadata manifest
604+
dataset_file: List of all files in a dataset
605+
only_new_files: boolean to control whether only new files are returned or all files in the dataset
606+
Returns:
607+
files: dictionary of file names and entityIDs, with scope as specified by `only_new_files`
608+
"""
609+
files = {"Filename": [], "entityId": []}
610+
611+
if only_new_files:
612+
if manifest is None:
613+
raise UnboundLocalError(
614+
"No manifest was passed in, a manifest is required when `only_new_files` is True."
615+
)
616+
617+
# find new files (that are not in the current manifest) if any
618+
for file_id, file_name in dataset_files:
619+
if not file_id in manifest["entityId"].values:
620+
files["Filename"].append(file_name)
621+
files["entityId"].append(file_id)
622+
else:
623+
# get all files
624+
for file_id, file_name in dataset_files:
625+
files["Filename"].append(file_name)
626+
files["entityId"].append(file_id)
627+
628+
return files
603629

604630
def getProjectManifests(self, projectId: str) -> List[str]:
605631
"""Gets all metadata manifest files across all datasets in a specified project.
@@ -1343,6 +1369,18 @@ def add_entities(
13431369
manifest (pd.DataFrame): modified to add entitiyId as appropriate.
13441370
13451371
'''
1372+
1373+
# Expected behavior is to annotate files if `Filename` is present regardless of `-mrt` setting
1374+
if 'filename' in [col.lower() for col in manifest.columns]:
1375+
# get current list of files and store as dataframe
1376+
dataset_files = self.getFilesInStorageDataset(datasetId)
1377+
files_and_entityIds = self._get_file_entityIds(dataset_files=dataset_files, only_new_files=False)
1378+
file_df = pd.DataFrame(files_and_entityIds)
1379+
1380+
# Merge dataframes to add entityIds
1381+
manifest = manifest.merge(file_df, how = 'left', on='Filename', suffixes=['_x',None]).drop('entityId_x',axis=1)
1382+
1383+
# Fill `entityId` for each row if missing and annotate entity as appropriate
13461384
for idx, row in manifest.iterrows():
13471385
if not row["entityId"] and (manifest_record_type == 'file_and_entities' or
13481386
manifest_record_type == 'table_file_and_entities'):
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
1-
Component,CheckList,CheckRegexList,CheckRegexSingle,CheckNum,CheckFloat,CheckInt,CheckString,CheckURL,CheckMatchatLeast,CheckMatchatLeastvalues,CheckMatchExactly,CheckMatchExactlyvalues,CheckRecommended,CheckAges,CheckUnique,Uuid,entityId
1+
Component,CheckList,CheckRegexList,CheckRegexSingle,CheckNum,CheckFloat,CheckInt,CheckString,CheckURL,CheckMatchatLeast,CheckMatchatLeastvalues,CheckMatchExactly,CheckMatchExactlyvalues,CheckRecommended,CheckAges,CheckUnique,Id,entityId
22
MockComponent,"valid,list,values","a,c,f",a,6,99.65,7,valid,https://www.google.com/,1985,4891,23487492,24323472834,,6571,str1,0f7812cc-8a0e-4f54-b8c4-e497cb7b34d0,syn35367245
33
MockComponent,"valid,list,values","a,c,f",a,6,99.65,8.52,valid,https://www.google.com/,1985,4891,23487492,24323472834,,6571,str1,da82f8e2-c7b0-428f-8f9d-677252ef5f68,syn35367246
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
Filename,Sample ID,File Format,Component,Genome Build,Genome FASTA
2+
TestRNA-seqDataset1/TestRNA-seq-dummy-dataset.rtf,ABCD,BAM,BulkRNA-seqAssay,GRCh38,
3+
TestRNA-seqDataset1/TestRNA-seq-dummy-dataset2.rtf,EFGH,CRAM,BulkRNA-seqAssay,GRCm39,

tests/test_store.py

Lines changed: 43 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
from schematic.store.synapse import SynapseStorage, DatasetFileView, ManifestDownload
1515
from schematic.schemas.generator import SchemaGenerator
1616
from synapseclient.core.exceptions import SynapseHTTPError
17+
from synapseclient.entity import File
1718
from schematic.configuration.configuration import Configuration
1819

1920
logging.basicConfig(level=logging.DEBUG)
@@ -113,9 +114,37 @@ def test_getFileAnnotations(self, synapse_store):
113114

114115
assert expected_dict == actual_dict
115116

116-
def test_annotation_submission(self, synapse_store, helpers, config: Configuration):
117-
manifest_path = "mock_manifests/annotations_test_manifest.csv"
118-
117+
@pytest.mark.parametrize('only_new_files',[True, False])
118+
def test_get_file_entityIds(self, helpers, synapse_store, only_new_files):
119+
manifest_path = "mock_manifests/test_BulkRNAseq.csv"
120+
dataset_files = synapse_store.getFilesInStorageDataset('syn39241199')
121+
122+
if only_new_files:
123+
# Prepare manifest is getting Ids for new files only
124+
manifest = helpers.get_data_frame(manifest_path)
125+
entityIds = pd.DataFrame({'entityId': ['syn39242580', 'syn51900502']})
126+
manifest = manifest.join(entityIds)
127+
128+
# get entityIds for new files
129+
files_and_Ids = synapse_store._get_file_entityIds(dataset_files=dataset_files, only_new_files=only_new_files, manifest=manifest)
130+
131+
# Assert that there are no new files
132+
for value in files_and_Ids.values():
133+
assert value == []
134+
135+
else:
136+
# get entityIds for all files
137+
files_and_Ids = synapse_store._get_file_entityIds(dataset_files=dataset_files, only_new_files=only_new_files)
138+
139+
# assert that the correct number of files were found
140+
assert len(files_and_Ids['entityId']) == 2
141+
142+
@pytest.mark.parametrize('manifest_path, test_annotations, datasetId, manifest_record_type',
143+
[ ("mock_manifests/annotations_test_manifest.csv", {'CheckInt': '7', 'CheckList': 'valid, list, values'}, 'syn34295552', 'file_and_entities'),
144+
("mock_manifests/test_BulkRNAseq.csv", {'FileFormat': 'BAM', 'GenomeBuild': 'GRCh38'}, 'syn39241199', 'table_and_file')],
145+
ids = ['non file-based',
146+
'file-based'])
147+
def test_annotation_submission(self, synapse_store, helpers, manifest_path, test_annotations, datasetId, manifest_record_type, config: Configuration):
119148
# Upload dataset annotations
120149
sg = SchemaGenerator(config.model_location)
121150

@@ -129,8 +158,8 @@ def test_annotation_submission(self, synapse_store, helpers, config: Configurati
129158
manifest_id = synapse_store.associateMetadataWithFiles(
130159
schemaGenerator = sg,
131160
metadataManifestPath = helpers.get_data_path(manifest_path),
132-
datasetId = 'syn34295552',
133-
manifest_record_type = 'file_and_entities',
161+
datasetId = datasetId,
162+
manifest_record_type = manifest_record_type,
134163
useSchemaLabel = True,
135164
hideBlanks = True,
136165
restrict_manifest = False,
@@ -139,17 +168,19 @@ def test_annotation_submission(self, synapse_store, helpers, config: Configurati
139168
pass
140169

141170
# Retrive annotations
142-
entity_id, entity_id_spare = helpers.get_data_frame(manifest_path)["entityId"][0:2]
171+
entity_id = helpers.get_data_frame(manifest_path)["entityId"][0]
143172
annotations = synapse_store.getFileAnnotations(entity_id)
144173

145174
# Check annotations of interest
146-
assert annotations['CheckInt'] == '7'
147-
assert annotations['CheckList'] == 'valid, list, values'
148-
assert 'CheckRecommended' not in annotations.keys()
149-
150-
151-
175+
for key in test_annotations.keys():
176+
assert key in annotations.keys()
177+
assert annotations[key] == test_annotations[key]
152178

179+
if manifest_path.endswith('annotations_test_manifest.csv'):
180+
assert 'CheckRecommended' not in annotations.keys()
181+
elif manifest_path.endswith('test_BulkRNAseq.csv'):
182+
entity = synapse_store.syn.get(entity_id)
183+
assert type(entity) == File
153184

154185
@pytest.mark.parametrize("force_batch", [True, False], ids=["batch", "non_batch"])
155186
def test_getDatasetAnnotations(self, dataset_id, synapse_store, force_batch):
@@ -466,7 +497,6 @@ def test_upsertTable(self, helpers, synapse_store, config:Configuration, project
466497
# delete table
467498
synapse_store.syn.delete(tableId)
468499

469-
470500
class TestDownloadManifest:
471501
@pytest.mark.parametrize("datasetFileView", [{"id": ["syn51203973", "syn51203943"], "name": ["synapse_storage_manifest.csv", "synapse_storage_manifest_censored.csv"]}, {"id": ["syn51203973"], "name": ["synapse_storage_manifest.csv"]}, {"id": ["syn51203943"], "name": ["synapse_storage_manifest_censored.csv"]}])
472502
def test_get_manifest_id(self, synapse_store, datasetFileView):

0 commit comments

Comments
 (0)