Skip to content

Commit 26f489a

Browse files
author
Bruno Grande
authored
Merge pull request #425 from Sage-Bionetworks/bgrande/fill-in-filename-column
Fill in Filename column in new manifests
2 parents f4ca2f0 + 3c06bef commit 26f489a

File tree

8 files changed

+130
-39
lines changed

8 files changed

+130
-39
lines changed

schematic/manifest/commands.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,7 @@ def get_manifest(ctx, title, data_type, jsonld, dataset_id, sheet_url,
9595
if sheet_url:
9696
logger.info("Find the manifest template using this Google Sheet URL:")
9797
click.echo(result)
98+
9899
elif isinstance(result, pd.DataFrame):
99100
if output_csv is None:
100101
prefix, _ = os.path.splitext(jsonld)
@@ -106,6 +107,7 @@ def get_manifest(ctx, title, data_type, jsonld, dataset_id, sheet_url,
106107
logger.info(
107108
f"Find the manifest template using this CSV file path: {output_csv}"
108109
)
110+
109111
result.to_csv(output_csv, index=False)
110112

111113
return result

schematic/manifest/generator.py

Lines changed: 14 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,12 @@ def __init__(self,
7272
# additional metadata to add to manifest
7373
self.additional_metadata = additional_metadata
7474

75+
# Determine whether current data type is file-based
76+
is_file_based = False
77+
if self.root:
78+
is_file_based = "Filename" in self.sg.get_node_dependencies(self.root)
79+
self.is_file_based = is_file_based
80+
7581

7682
def _attribute_to_letter(self, attribute, manifest_fields):
7783
"""Map attribute to column letter in a google sheet
@@ -838,7 +844,7 @@ def get_manifest_with_annotations(
838844
# during empty manifest generation. For more info, search
839845
# for `additional_metadata` in `self.get_empty_manifest`.
840846
# Hence, the shared columns need to be updated separately.
841-
if self.use_annotations:
847+
if self.is_file_based and self.use_annotations:
842848
# This approach assumes that `update_df` returns
843849
# a data frame whose columns are in the same order
844850
manifest_df = update_df(manifest_df, annotations)
@@ -894,12 +900,16 @@ def get_manifest(self, dataset_id: str = None, sheet_url: bool = None,
894900
# Generate empty template and optionally fill in with annotations
895901
else:
896902

897-
# Get data frame of existing annotations to bootstrap empty manifest
898-
# Avoiding the retrieval of annotations by default due to slowness
903+
# Using getDatasetAnnotations() to retrieve file names and subset
904+
# entities to files and folders (ignoring tables/views)
899905
annotations = pd.DataFrame()
900-
if self.use_annotations:
906+
if self.is_file_based:
901907
annotations = syn_store.getDatasetAnnotations(dataset_id)
902908

909+
# Subset columns if not interested in user-defined annotations
910+
if self.is_file_based and not self.use_annotations:
911+
annotations = annotations[["Filename", "eTag", "entityId"]]
912+
903913
# Update `additional_metadata` and generate manifest
904914
manifest_url, manifest_df = self.get_manifest_with_annotations(annotations)
905915

@@ -908,8 +918,6 @@ def get_manifest(self, dataset_id: str = None, sheet_url: bool = None,
908918
else:
909919
return manifest_df
910920

911-
# This point is unreachable based on the above if-else conditionals
912-
913921

914922
def populate_manifest_spreadsheet(self, existing_manifest_path, empty_manifest_url):
915923
"""Creates a google sheet manifest based on existing manifest.

schematic/schemas/df_parser.py

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -538,7 +538,7 @@ def create_schema_classes(schema_extension: pd.DataFrame, se: SchemaExplorer) ->
538538
component_dependencies = attribute["DependsOn Component"]
539539
else:
540540
continue
541-
541+
542542
logger.debug(">>> Adding component dependencies for " + attribute["Attribute"])
543543

544544
# iterate over potentially multiple dependency components
@@ -572,7 +572,7 @@ def create_schema_classes(schema_extension: pd.DataFrame, se: SchemaExplorer) ->
572572

573573

574574
#TODO check for cycles in component dependencies schema subgraph
575-
575+
576576
logger.debug("<<< Done adding component dependencies for " + attribute["Attribute"])
577577

578578

@@ -718,7 +718,7 @@ def create_nx_schema_objects(schema_extension: pd.DataFrame, se: SchemaExplorer)
718718
description = description
719719
)
720720
se.add_schema_object_nx(new_property, **rel_dict)
721-
721+
722722
logger.debug("Done adding properties")
723723

724724
# # set range values and dependency requirements for each attribute
@@ -792,7 +792,7 @@ def create_nx_schema_objects(schema_extension: pd.DataFrame, se: SchemaExplorer)
792792
validation_rules = property_info["validation_rules"]
793793
)
794794
se.edit_schema_object_nx(property_range_edit)
795-
795+
796796
logger.debug(val + " added to value range")
797797

798798
# get validation rules for this attribute, if any are specified
@@ -829,7 +829,7 @@ def create_nx_schema_objects(schema_extension: pd.DataFrame, se: SchemaExplorer)
829829
validation_rules = property_info["validation_rules"]
830830
)
831831
se.edit_schema_object_nx(property_val_rule_edit)
832-
832+
833833
logger.debug(val + "validation rules added")
834834

835835
# get dependencies for this attribute, if any are specified
@@ -917,14 +917,14 @@ def create_nx_schema_objects(schema_extension: pd.DataFrame, se: SchemaExplorer)
917917
validation_rules = property_info["validation_rules"]
918918
)
919919
se.edit_schema_object_nx(property_dependencies_edit)
920-
920+
921921
logger.debug(dep + " added to dependencies")
922922

923923
#TODO check for cycles in attribute dependencies schema subgraph
924924

925925
# check if the attribute requires any components
926926
if not pd.isnull(attribute["DependsOn Component"]):
927-
component_dependencies = attribute["DependsOn Component"]
927+
component_dependencies = attribute["DependsOn Component"]
928928
else:
929929
continue
930930

@@ -956,7 +956,7 @@ def create_nx_schema_objects(schema_extension: pd.DataFrame, se: SchemaExplorer)
956956
requires_components = class_info["component_dependencies"]
957957
)
958958
se.edit_schema_object_nx(class_component_dependencies_edit)
959-
959+
960960
logger.debug(comp_dep + " added to dependencies")
961961

962962

schematic/store/synapse.py

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -551,30 +551,43 @@ def getDatasetAnnotations(
551551
"""
552552
# Get all files in given dataset
553553
dataset_files = self.getFilesInStorageDataset(datasetId)
554-
dataset_file_ids = [i for i, _ in dataset_files]
554+
dataset_files_map = dict(dataset_files)
555+
dataset_file_ids, _ = list(zip(*dataset_files))
555556

556557
# Get annotations for each file from Step 1
557-
558558
# Batch mode
559559
try_batch = len(dataset_files) >= 50 or force_batch
560560
if try_batch:
561561
try:
562+
logger.info("Trying batch mode for retrieving Synapse annotations")
562563
table = self.getDatasetAnnotationsBatch(datasetId, dataset_file_ids)
563564
except (SynapseAuthenticationError, SynapseHTTPError):
564565
logger.info(
565566
f"Unable to create a temporary file view bound to {datasetId}. "
566567
"Defaulting to slower iterative retrieval of annotations."
567568
)
568569
# Default to the slower non-batch method
570+
logger.info("Batch mode failed (probably due to permission error)")
569571
try_batch = False
570572

571573
# Non-batch mode
572574
if not try_batch:
575+
logger.info("Using slower (non-batch) sequential mode")
573576
records = [self.getFileAnnotations(i) for i in dataset_file_ids]
574577
# Remove any annotations for non-file/folders (stored as None)
575578
records = filter(None, records)
576579
table = pd.DataFrame.from_records(records)
577580

581+
# Add filenames for the files that "survived" annotation retrieval
582+
filenames = [dataset_files_map[i] for i in table["entityId"]]
583+
table.insert(0, "Filename", filenames)
584+
585+
# Ensure that entityId and eTag are at the end
586+
entity_ids = table.pop("entityId")
587+
etags = table.pop("eTag")
588+
table.insert(len(table.columns), "entityId", entity_ids)
589+
table.insert(len(table.columns), "eTag", etags)
590+
578591
# Missing values are filled in with empty strings for Google Sheets
579592
if fill_na:
580593
table.fillna("", inplace=True)

tests/data/example.model.csv

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,20 +1,20 @@
11
Attribute,Description,Valid Values,DependsOn,Properties,Required,Parent,DependsOn Component,Source,Validation Rules
2-
Patient,,,"Patient ID, Sex, Year of Birth, Diagnosis",,FALSE,DataType,,,
2+
Patient,,,"Patient ID, Sex, Year of Birth, Diagnosis, Component",,FALSE,DataType,,,
33
Patient ID,,,,,TRUE,DataProperty,,,
44
Sex,,"Female, Male, Other",,,TRUE,DataProperty,,,
55
Year of Birth,,,,,FALSE,DataProperty,,,
66
Diagnosis,,"Healthy, Cancer",,,TRUE,DataProperty,,,
77
Cancer,,,"Cancer Type, Family History",,FALSE,ValidValue,,,
88
Cancer Type,,"Breast, Colorectal, Lung, Prostate, Skin",,,TRUE,DataProperty,,,
99
Family History,,"Breast, Colorectal, Lung, Prostate, Skin",,,TRUE,DataProperty,,,list
10-
Biospecimen,,,"Sample ID, Patient ID, Tissue Status",,FALSE,DataType,,,
10+
Biospecimen,,,"Sample ID, Patient ID, Tissue Status, Component",,FALSE,DataType,,,
1111
Sample ID,,,,,TRUE,DataProperty,,,
1212
Tissue Status,,"Healthy, Malignant",,,TRUE,DataProperty,,,
13-
Bulk RNA-seq Assay,,,"Filename, Sample ID, File Format",,FALSE,DataType,,,
13+
Bulk RNA-seq Assay,,,"Filename, Sample ID, File Format, Component",,FALSE,DataType,,,
1414
Filename,,,,,TRUE,DataProperty,,,
1515
File Format,,"FASTQ, BAM, CRAM, CSV/TSV",,,TRUE,DataProperty,,,
1616
BAM,,,Genome Build,,FALSE,ValidValue,,,
1717
CRAM,,,"Genome Build, Genome FASTA",,FALSE,ValidValue,,,
1818
CSV/TSV,,,Genome Build,,FALSE,ValidValue,,,
1919
Genome Build,,"GRCh37, GRCh38, GRCm38, GRCm39",,,TRUE,DataProperty,,,
20-
Genome FASTA,,,,,TRUE,DataProperty,,,
20+
Genome FASTA,,,,,TRUE,DataProperty,,,

tests/data/example.model.jsonld

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1999,6 +1999,9 @@
19991999
},
20002000
{
20012001
"@id": "bts:Diagnosis"
2002+
},
2003+
{
2004+
"@id": "bts:Component"
20022005
}
20032006
],
20042007
"sms:validationRules": []
@@ -2209,6 +2212,9 @@
22092212
},
22102213
{
22112214
"@id": "bts:TissueStatus"
2215+
},
2216+
{
2217+
"@id": "bts:Component"
22122218
}
22132219
],
22142220
"sms:validationRules": []
@@ -2279,6 +2285,9 @@
22792285
},
22802286
{
22812287
"@id": "bts:FileFormat"
2288+
},
2289+
{
2290+
"@id": "bts:Component"
22822291
}
22832292
],
22842293
"sms:validationRules": []
@@ -2448,6 +2457,23 @@
24482457
"sms:required": "sms:true",
24492458
"sms:validationRules": []
24502459
},
2460+
{
2461+
"@id": "bts:Component",
2462+
"@type": "rdfs:Class",
2463+
"rdfs:comment": "TBD",
2464+
"rdfs:label": "Component",
2465+
"rdfs:subClassOf": [
2466+
{
2467+
"@id": "bts:Patient"
2468+
}
2469+
],
2470+
"schema:isPartOf": {
2471+
"@id": "http://schema.biothings.io"
2472+
},
2473+
"sms:displayName": "Component",
2474+
"sms:required": "sms:false",
2475+
"sms:validationRules": []
2476+
},
24512477
{
24522478
"@id": "bts:Female",
24532479
"@type": "rdfs:Class",

tests/test_manifest.py

Lines changed: 52 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -19,23 +19,33 @@ def mock_creds():
1919
yield mock_creds
2020

2121

22-
@pytest.fixture(params=[True, False], ids=["use_annotations", "skip_annotations"])
22+
@pytest.fixture(params=[
23+
(True, "Patient"),
24+
(False, "Patient"),
25+
(True, "BulkRNA-seqAssay"),
26+
(False, "BulkRNA-seqAssay"),
27+
], ids=[
28+
"use_annotations-Patient",
29+
"skip_annotations-Patient",
30+
"use_annotations-BulkRNAseqAssay",
31+
"skip_annotations-BulkRNAseqAssay",
32+
])
2333
def manifest_generator(helpers, request):
2434

2535
# Rename request param for readability
26-
use_annotations = request.param
36+
use_annotations, data_type = request.param
2737

2838
manifest_generator = ManifestGenerator(
2939
path_to_json_ld=helpers.get_data_path("example.model.jsonld"),
30-
root="Patient",
40+
root=data_type,
3141
use_annotations=use_annotations,
3242
)
3343

34-
yield manifest_generator, use_annotations
44+
yield manifest_generator, use_annotations, data_type
3545

3646
# Clean-up
3747
try:
38-
os.remove(helpers.get_data_path("example.Patient.schema.json"))
48+
os.remove(helpers.get_data_path(f"example.{data_type}.schema.json"))
3949
except FileNotFoundError:
4050
pass
4151

@@ -47,14 +57,14 @@ def manifest(manifest_generator, request):
4757
sheet_url = request.param
4858

4959
# See parameterization of the `manifest_generator` fixture
50-
generator, use_annotations = manifest_generator
60+
generator, use_annotations, data_type = manifest_generator
5161

5262
manifest = generator.get_manifest(
5363
dataset_id="syn25057021",
5464
sheet_url=sheet_url
5565
)
5666

57-
yield manifest, use_annotations, sheet_url
67+
yield manifest, use_annotations, data_type, sheet_url
5868

5969

6070

@@ -80,25 +90,53 @@ def test_init(self, monkeypatch, mock_creds, helpers):
8090
def test_get_manifest_first_time(self, manifest):
8191

8292
# See parameterization of the `manifest_generator` fixture
83-
output, use_annotations, sheet_url = manifest
93+
output, use_annotations, data_type, sheet_url = manifest
8494

8595
if sheet_url:
96+
logger.debug(output)
8697
assert isinstance(output, str)
8798
assert output.startswith("https://docs.google.com/spreadsheets/")
88-
print(output)
8999
return
90100

91101
# Beyond this point, the output is assumed to be a data frame
92-
assert "Year of Birth" in output
93102

103+
# Update expectations based on whether the data type is file-based
104+
is_file_based = data_type in ["BulkRNA-seqAssay"]
105+
106+
assert "Component" in output
107+
assert is_file_based == ("eTag" in output)
108+
assert is_file_based == ("Filename" in output)
109+
assert (is_file_based and use_annotations) == ("confidence" in output)
110+
111+
# Data type-specific columns
112+
assert (data_type == "Patient") == ("Diagnosis" in output)
113+
assert (data_type == "BulkRNA-seqAssay") == ("File Format" in output)
114+
115+
# The rest of the tests have to do with a file-based data type
116+
if data_type != "BulkRNA-seqAssay":
117+
assert output.shape[0] == 1 # Number of rows
118+
return
119+
120+
# Beyond this point, the output is to be from a file-based assay
121+
122+
# Confirm contents of Filename column
123+
assert output["Filename"].tolist() == [
124+
"TestDataset-Annotations-v2/Sample_A.txt",
125+
"TestDataset-Annotations-v2/Sample_B.txt",
126+
"TestDataset-Annotations-v2/Sample_C.txt",
127+
]
128+
129+
# Test dimensions of data frame
130+
assert output.shape[0] == 3 # Number of rows
94131
if use_annotations:
95132
assert output.shape[1] == 13 # Number of columns
96133
assert output.shape[0] == 3 # Number of rows
97134
assert "eTag" in output
98135
assert "confidence" in output
99136
assert output["Year of Birth"].tolist() == ["1980", "", ""]
100137
else:
101-
assert output.shape[1] == 6 # Number of columns
102-
assert output.shape[0] == 0 # Number of rows
103-
assert "confidence" not in output
104-
assert output["Year of Birth"].tolist() == []
138+
assert output.shape[1] == 8 # Number of columns
139+
140+
# An annotation merged with an attribute from the data model
141+
if use_annotations:
142+
assert output["File Format"].tolist() == ["txt", "csv", "fastq"]

0 commit comments

Comments
 (0)