Skip to content

Commit 26f489a

Browse files
author
Bruno Grande
authored
Merge pull request #425 from Sage-Bionetworks/bgrande/fill-in-filename-column
Fill in Filename column in new manifests
2 parents f4ca2f0 + 3c06bef commit 26f489a

File tree

8 files changed

+130
-39
lines changed

8 files changed

+130
-39
lines changed

schematic/manifest/commands.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,7 @@ def get_manifest(ctx, title, data_type, jsonld, dataset_id, sheet_url,
9595
if sheet_url:
9696
logger.info("Find the manifest template using this Google Sheet URL:")
9797
click.echo(result)
98+
9899
elif isinstance(result, pd.DataFrame):
99100
if output_csv is None:
100101
prefix, _ = os.path.splitext(jsonld)
@@ -106,6 +107,7 @@ def get_manifest(ctx, title, data_type, jsonld, dataset_id, sheet_url,
106107
logger.info(
107108
f"Find the manifest template using this CSV file path: {output_csv}"
108109
)
110+
109111
result.to_csv(output_csv, index=False)
110112

111113
return result

schematic/manifest/generator.py

Lines changed: 14 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,12 @@ def __init__(self,
7272
# additional metadata to add to manifest
7373
self.additional_metadata = additional_metadata
7474

75+
# Determine whether current data type is file-based
76+
is_file_based = False
77+
if self.root:
78+
is_file_based = "Filename" in self.sg.get_node_dependencies(self.root)
79+
self.is_file_based = is_file_based
80+
7581

7682
def _attribute_to_letter(self, attribute, manifest_fields):
7783
"""Map attribute to column letter in a google sheet
@@ -838,7 +844,7 @@ def get_manifest_with_annotations(
838844
# during empty manifest generation. For more info, search
839845
# for `additional_metadata` in `self.get_empty_manifest`.
840846
# Hence, the shared columns need to be updated separately.
841-
if self.use_annotations:
847+
if self.is_file_based and self.use_annotations:
842848
# This approach assumes that `update_df` returns
843849
# a data frame whose columns are in the same order
844850
manifest_df = update_df(manifest_df, annotations)
@@ -894,12 +900,16 @@ def get_manifest(self, dataset_id: str = None, sheet_url: bool = None,
894900
# Generate empty template and optionally fill in with annotations
895901
else:
896902

897-
# Get data frame of existing annotations to bootstrap empty manifest
898-
# Avoiding the retrieval of annotations by default due to slowness
903+
# Using getDatasetAnnotations() to retrieve file names and subset
904+
# entities to files and folders (ignoring tables/views)
899905
annotations = pd.DataFrame()
900-
if self.use_annotations:
906+
if self.is_file_based:
901907
annotations = syn_store.getDatasetAnnotations(dataset_id)
902908

909+
# Subset columns if not interested in user-defined annotations
910+
if self.is_file_based and not self.use_annotations:
911+
annotations = annotations[["Filename", "eTag", "entityId"]]
912+
903913
# Update `additional_metadata` and generate manifest
904914
manifest_url, manifest_df = self.get_manifest_with_annotations(annotations)
905915

@@ -908,8 +918,6 @@ def get_manifest(self, dataset_id: str = None, sheet_url: bool = None,
908918
else:
909919
return manifest_df
910920

911-
# This point is unreachable based on the above if-else conditionals
912-
913921

914922
def populate_manifest_spreadsheet(self, existing_manifest_path, empty_manifest_url):
915923
"""Creates a google sheet manifest based on existing manifest.

schematic/schemas/df_parser.py

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -538,7 +538,7 @@ def create_schema_classes(schema_extension: pd.DataFrame, se: SchemaExplorer) ->
538538
component_dependencies = attribute["DependsOn Component"]
539539
else:
540540
continue
541-
541+
542542
logger.debug(">>> Adding component dependencies for " + attribute["Attribute"])
543543

544544
# iterate over potentially multiple dependency components
@@ -572,7 +572,7 @@ def create_schema_classes(schema_extension: pd.DataFrame, se: SchemaExplorer) ->
572572

573573

574574
#TODO check for cycles in component dependencies schema subgraph
575-
575+
576576
logger.debug("<<< Done adding component dependencies for " + attribute["Attribute"])
577577

578578

@@ -718,7 +718,7 @@ def create_nx_schema_objects(schema_extension: pd.DataFrame, se: SchemaExplorer)
718718
description = description
719719
)
720720
se.add_schema_object_nx(new_property, **rel_dict)
721-
721+
722722
logger.debug("Done adding properties")
723723

724724
# # set range values and dependency requirements for each attribute
@@ -792,7 +792,7 @@ def create_nx_schema_objects(schema_extension: pd.DataFrame, se: SchemaExplorer)
792792
validation_rules = property_info["validation_rules"]
793793
)
794794
se.edit_schema_object_nx(property_range_edit)
795-
795+
796796
logger.debug(val + " added to value range")
797797

798798
# get validation rules for this attribute, if any are specified
@@ -829,7 +829,7 @@ def create_nx_schema_objects(schema_extension: pd.DataFrame, se: SchemaExplorer)
829829
validation_rules = property_info["validation_rules"]
830830
)
831831
se.edit_schema_object_nx(property_val_rule_edit)
832-
832+
833833
logger.debug(val + "validation rules added")
834834

835835
# get dependencies for this attribute, if any are specified
@@ -917,14 +917,14 @@ def create_nx_schema_objects(schema_extension: pd.DataFrame, se: SchemaExplorer)
917917
validation_rules = property_info["validation_rules"]
918918
)
919919
se.edit_schema_object_nx(property_dependencies_edit)
920-
920+
921921
logger.debug(dep + " added to dependencies")
922922

923923
#TODO check for cycles in attribute dependencies schema subgraph
924924

925925
# check if the attribute requires any components
926926
if not pd.isnull(attribute["DependsOn Component"]):
927-
component_dependencies = attribute["DependsOn Component"]
927+
component_dependencies = attribute["DependsOn Component"]
928928
else:
929929
continue
930930

@@ -956,7 +956,7 @@ def create_nx_schema_objects(schema_extension: pd.DataFrame, se: SchemaExplorer)
956956
requires_components = class_info["component_dependencies"]
957957
)
958958
se.edit_schema_object_nx(class_component_dependencies_edit)
959-
959+
960960
logger.debug(comp_dep + " added to dependencies")
961961

962962

schematic/store/synapse.py

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -551,30 +551,43 @@ def getDatasetAnnotations(
551551
"""
552552
# Get all files in given dataset
553553
dataset_files = self.getFilesInStorageDataset(datasetId)
554-
dataset_file_ids = [i for i, _ in dataset_files]
554+
dataset_files_map = dict(dataset_files)
555+
dataset_file_ids, _ = list(zip(*dataset_files))
555556

556557
# Get annotations for each file from Step 1
557-
558558
# Batch mode
559559
try_batch = len(dataset_files) >= 50 or force_batch
560560
if try_batch:
561561
try:
562+
logger.info("Trying batch mode for retrieving Synapse annotations")
562563
table = self.getDatasetAnnotationsBatch(datasetId, dataset_file_ids)
563564
except (SynapseAuthenticationError, SynapseHTTPError):
564565
logger.info(
565566
f"Unable to create a temporary file view bound to {datasetId}. "
566567
"Defaulting to slower iterative retrieval of annotations."
567568
)
568569
# Default to the slower non-batch method
570+
logger.info("Batch mode failed (probably due to permission error)")
569571
try_batch = False
570572

571573
# Non-batch mode
572574
if not try_batch:
575+
logger.info("Using slower (non-batch) sequential mode")
573576
records = [self.getFileAnnotations(i) for i in dataset_file_ids]
574577
# Remove any annotations for non-file/folders (stored as None)
575578
records = filter(None, records)
576579
table = pd.DataFrame.from_records(records)
577580

581+
# Add filenames for the files that "survived" annotation retrieval
582+
filenames = [dataset_files_map[i] for i in table["entityId"]]
583+
table.insert(0, "Filename", filenames)
584+
585+
# Ensure that entityId and eTag are at the end
586+
entity_ids = table.pop("entityId")
587+
etags = table.pop("eTag")
588+
table.insert(len(table.columns), "entityId", entity_ids)
589+
table.insert(len(table.columns), "eTag", etags)
590+
578591
# Missing values are filled in with empty strings for Google Sheets
579592
if fill_na:
580593
table.fillna("", inplace=True)

tests/data/example.model.csv

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,20 +1,20 @@
11
Attribute,Description,Valid Values,DependsOn,Properties,Required,Parent,DependsOn Component,Source,Validation Rules
2-
Patient,,,"Patient ID, Sex, Year of Birth, Diagnosis",,FALSE,DataType,,,
2+
Patient,,,"Patient ID, Sex, Year of Birth, Diagnosis, Component",,FALSE,DataType,,,
33
Patient ID,,,,,TRUE,DataProperty,,,
44
Sex,,"Female, Male, Other",,,TRUE,DataProperty,,,
55
Year of Birth,,,,,FALSE,DataProperty,,,
66
Diagnosis,,"Healthy, Cancer",,,TRUE,DataProperty,,,
77
Cancer,,,"Cancer Type, Family History",,FALSE,ValidValue,,,
88
Cancer Type,,"Breast, Colorectal, Lung, Prostate, Skin",,,TRUE,DataProperty,,,
99
Family History,,"Breast, Colorectal, Lung, Prostate, Skin",,,TRUE,DataProperty,,,list
10-
Biospecimen,,,"Sample ID, Patient ID, Tissue Status",,FALSE,DataType,,,
10+
Biospecimen,,,"Sample ID, Patient ID, Tissue Status, Component",,FALSE,DataType,,,
1111
Sample ID,,,,,TRUE,DataProperty,,,
1212
Tissue Status,,"Healthy, Malignant",,,TRUE,DataProperty,,,
13-
Bulk RNA-seq Assay,,,"Filename, Sample ID, File Format",,FALSE,DataType,,,
13+
Bulk RNA-seq Assay,,,"Filename, Sample ID, File Format, Component",,FALSE,DataType,,,
1414
Filename,,,,,TRUE,DataProperty,,,
1515
File Format,,"FASTQ, BAM, CRAM, CSV/TSV",,,TRUE,DataProperty,,,
1616
BAM,,,Genome Build,,FALSE,ValidValue,,,
1717
CRAM,,,"Genome Build, Genome FASTA",,FALSE,ValidValue,,,
1818
CSV/TSV,,,Genome Build,,FALSE,ValidValue,,,
1919
Genome Build,,"GRCh37, GRCh38, GRCm38, GRCm39",,,TRUE,DataProperty,,,
20-
Genome FASTA,,,,,TRUE,DataProperty,,,
20+
Genome FASTA,,,,,TRUE,DataProperty,,,

tests/data/example.model.jsonld

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1999,6 +1999,9 @@
19991999
},
20002000
{
20012001
"@id": "bts:Diagnosis"
2002+
},
2003+
{
2004+
"@id": "bts:Component"
20022005
}
20032006
],
20042007
"sms:validationRules": []
@@ -2209,6 +2212,9 @@
22092212
},
22102213
{
22112214
"@id": "bts:TissueStatus"
2215+
},
2216+
{
2217+
"@id": "bts:Component"
22122218
}
22132219
],
22142220
"sms:validationRules": []
@@ -2279,6 +2285,9 @@
22792285
},
22802286
{
22812287
"@id": "bts:FileFormat"
2288+
},
2289+
{
2290+
"@id": "bts:Component"
22822291
}
22832292
],
22842293
"sms:validationRules": []
@@ -2448,6 +2457,23 @@
24482457
"sms:required": "sms:true",
24492458
"sms:validationRules": []
24502459
},
2460+
{
2461+
"@id": "bts:Component",
2462+
"@type": "rdfs:Class",
2463+
"rdfs:comment": "TBD",
2464+
"rdfs:label": "Component",
2465+
"rdfs:subClassOf": [
2466+
{
2467+
"@id": "bts:Patient"
2468+
}
2469+
],
2470+
"schema:isPartOf": {
2471+
"@id": "http://schema.biothings.io"
2472+
},
2473+
"sms:displayName": "Component",
2474+
"sms:required": "sms:false",
2475+
"sms:validationRules": []
2476+
},
24512477
{
24522478
"@id": "bts:Female",
24532479
"@type": "rdfs:Class",

tests/test_manifest.py

Lines changed: 52 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -19,23 +19,33 @@ def mock_creds():
1919
yield mock_creds
2020

2121

22-
@pytest.fixture(params=[True, False], ids=["use_annotations", "skip_annotations"])
22+
@pytest.fixture(params=[
23+
(True, "Patient"),
24+
(False, "Patient"),
25+
(True, "BulkRNA-seqAssay"),
26+
(False, "BulkRNA-seqAssay"),
27+
], ids=[
28+
"use_annotations-Patient",
29+
"skip_annotations-Patient",
30+
"use_annotations-BulkRNAseqAssay",
31+
"skip_annotations-BulkRNAseqAssay",
32+
])
2333
def manifest_generator(helpers, request):
2434

2535
# Rename request param for readability
26-
use_annotations = request.param
36+
use_annotations, data_type = request.param
2737

2838
manifest_generator = ManifestGenerator(
2939
path_to_json_ld=helpers.get_data_path("example.model.jsonld"),
30-
root="Patient",
40+
root=data_type,
3141
use_annotations=use_annotations,
3242
)
3343

34-
yield manifest_generator, use_annotations
44+
yield manifest_generator, use_annotations, data_type
3545

3646
# Clean-up
3747
try:
38-
os.remove(helpers.get_data_path("example.Patient.schema.json"))
48+
os.remove(helpers.get_data_path(f"example.{data_type}.schema.json"))
3949
except FileNotFoundError:
4050
pass
4151

@@ -47,14 +57,14 @@ def manifest(manifest_generator, request):
4757
sheet_url = request.param
4858

4959
# See parameterization of the `manifest_generator` fixture
50-
generator, use_annotations = manifest_generator
60+
generator, use_annotations, data_type = manifest_generator
5161

5262
manifest = generator.get_manifest(
5363
dataset_id="syn25057021",
5464
sheet_url=sheet_url
5565
)
5666

57-
yield manifest, use_annotations, sheet_url
67+
yield manifest, use_annotations, data_type, sheet_url
5868

5969

6070

@@ -80,25 +90,53 @@ def test_init(self, monkeypatch, mock_creds, helpers):
8090
def test_get_manifest_first_time(self, manifest):
8191

8292
# See parameterization of the `manifest_generator` fixture
83-
output, use_annotations, sheet_url = manifest
93+
output, use_annotations, data_type, sheet_url = manifest
8494

8595
if sheet_url:
96+
logger.debug(output)
8697
assert isinstance(output, str)
8798
assert output.startswith("https://docs.google.com/spreadsheets/")
88-
print(output)
8999
return
90100

91101
# Beyond this point, the output is assumed to be a data frame
92-
assert "Year of Birth" in output
93102

103+
# Update expectations based on whether the data type is file-based
104+
is_file_based = data_type in ["BulkRNA-seqAssay"]
105+
106+
assert "Component" in output
107+
assert is_file_based == ("eTag" in output)
108+
assert is_file_based == ("Filename" in output)
109+
assert (is_file_based and use_annotations) == ("confidence" in output)
110+
111+
# Data type-specific columns
112+
assert (data_type == "Patient") == ("Diagnosis" in output)
113+
assert (data_type == "BulkRNA-seqAssay") == ("File Format" in output)
114+
115+
# The rest of the tests have to do with a file-based data type
116+
if data_type != "BulkRNA-seqAssay":
117+
assert output.shape[0] == 1 # Number of rows
118+
return
119+
120+
# Beyond this point, the output is to be from a file-based assay
121+
122+
# Confirm contents of Filename column
123+
assert output["Filename"].tolist() == [
124+
"TestDataset-Annotations-v2/Sample_A.txt",
125+
"TestDataset-Annotations-v2/Sample_B.txt",
126+
"TestDataset-Annotations-v2/Sample_C.txt",
127+
]
128+
129+
# Test dimensions of data frame
130+
assert output.shape[0] == 3 # Number of rows
94131
if use_annotations:
95132
assert output.shape[1] == 13 # Number of columns
96133
assert output.shape[0] == 3 # Number of rows
97134
assert "eTag" in output
98135
assert "confidence" in output
99136
assert output["Year of Birth"].tolist() == ["1980", "", ""]
100137
else:
101-
assert output.shape[1] == 6 # Number of columns
102-
assert output.shape[0] == 0 # Number of rows
103-
assert "confidence" not in output
104-
assert output["Year of Birth"].tolist() == []
138+
assert output.shape[1] == 8 # Number of columns
139+
140+
# An annotation merged with an attribute from the data model
141+
if use_annotations:
142+
assert output["File Format"].tolist() == ["txt", "csv", "fastq"]

0 commit comments

Comments
 (0)