
Commit 12d7707

Merge pull request #1283 from Sage-Bionetworks/develop
Schematic release 23.9.1
2 parents: d861295 + 1105499

7 files changed: +286 −63 lines changed

.github/workflows/api_test.yml

Lines changed: 2 additions & 2 deletions
@@ -76,7 +76,7 @@ jobs:
         if: ${{ inputs.perform_benchmarking }}
         run: >
           source .venv/bin/activate;
-          pytest -m "schematic_api"
+          pytest -m "schematic_api and not submission"
 
       - name: Run API tests + Exclude Benchmarks
         env:
@@ -85,4 +85,4 @@ jobs:
         if: ${{ false == inputs.perform_benchmarking }}
         run: >
           source .venv/bin/activate;
-          pytest -m "schematic_api and not rule_benchmark"
+          pytest -m "schematic_api and not submission and not rule_benchmark"

pyproject.toml

Lines changed: 3 additions & 0 deletions
@@ -126,6 +126,9 @@ markers = [
     Google credentials (skipped on GitHub CI) \
     """,
     """\
+    submission: tests that involve submitting manifests
+    """,
+    """\
     not_windows: tests that don't work on on windows machine
     """,
     """\

schematic/manifest/generator.py

Lines changed: 28 additions & 33 deletions
@@ -439,7 +439,7 @@ def _gather_dependency_requirements(self, json_schema, required_metadata_fields)
 
         return required_metadata_fields
 
-    def _add_root_to_component(self, required_metadata_fields):
+    def _add_root_to_component(self, required_metadata_fields: Dict[str, List]):
         """If 'Component' is in the column set, add root node as a
         metadata component entry in the first row of that column.
         Args:
@@ -462,7 +462,6 @@ def _add_root_to_component(self, required_metadata_fields):
             )
         else:
             self.additional_metadata["Component"] = [self.root]
-
         return
 
     def _get_additional_metadata(self, required_metadata_fields: dict) -> dict:
@@ -886,7 +885,7 @@ def _request_note_valid_values(self, i, req, validation_rules, valid_values):
         else:
             return
 
-    def _request_notes_comments(self, i, req, json_schema):
+    def _set_required_columns_color(self, i, req, json_schema):
         """Update background colors so that columns that are required are highlighted
         Args:
             i (int): column index
@@ -1126,10 +1125,10 @@ def _create_requests_body(
             if get_row_formatting:
                 requests_body["requests"].append(get_row_formatting)
 
-            # Add notes to headers to provide descriptions of the attribute
-            header_notes = self._request_notes_comments(i, req, json_schema)
-            if header_notes:
-                requests_body["requests"].append(header_notes)
+            # set color of required columns to blue
+            required_columns_color = self._set_required_columns_color(i, req, json_schema)
+            if required_columns_color:
+                requests_body["requests"].append(required_columns_color)
             # Add note on how to use multi-select, when appropriate
             note_vv = self._request_note_valid_values(
                 i, req, validation_rules, valid_values
@@ -1365,19 +1364,16 @@ def map_annotation_names_to_display_names(
         return annotations.rename(columns=label_map)
 
     def get_manifest_with_annotations(
-        self, annotations: pd.DataFrame, sheet_url: bool = None, strict: Optional[bool] = None,
+        self, annotations: pd.DataFrame, strict: Optional[bool] = None
     ) -> Tuple[ps.Spreadsheet, pd.DataFrame]:
         """Generate manifest, optionally with annotations (if requested).
-
         Args:
             annotations (pd.DataFrame): Annotations table (can be empty).
             strict (Optional[bool]): strictness with which to apply validation rules to Google Sheets; True blocks incorrect entries, False raises a warning
-            sheet_url (will be deprecated): boolean; determines whether a pandas dataframe or a Google Sheet URL gets returned
         Returns:
             Tuple[ps.Spreadsheet, pd.DataFrame]: Both the Google Sheet
             URL and the corresponding data frame are returned.
         """
-
         # Map annotation labels to display names to match manifest columns
         annotations = self.map_annotation_names_to_display_names(annotations)
 
@@ -1391,19 +1387,19 @@ def get_manifest_with_annotations(
         self.additional_metadata = annotations_dict
 
         # Generate empty manifest using `additional_metadata`
-        manifest_url = self.get_empty_manifest(sheet_url=sheet_url, strict=strict)
+        # With annotations added, regenerate empty manifest
+        manifest_url = self.get_empty_manifest(sheet_url=True, strict=strict)
         manifest_df = self.get_dataframe_by_url(manifest_url=manifest_url)
 
         # Annotations clashing with manifest attributes are skipped
         # during empty manifest generation. For more info, search
         # for `additional_metadata` in `self.get_empty_manifest`.
         # Hence, the shared columns need to be updated separately.
-        if self.is_file_based and self.use_annotations:
-            # This approach assumes that `update_df` returns
-            # a data frame whose columns are in the same order
-            manifest_df = update_df(manifest_df, annotations)
-            manifest_sh = self.set_dataframe_by_url(manifest_url, manifest_df)
-            manifest_url = manifest_sh.url
+        # This approach assumes that `update_df` returns
+        # a data frame whose columns are in the same order
+        manifest_df = update_df(manifest_df, annotations)
+        manifest_sh = self.set_dataframe_by_url(manifest_url, manifest_df)
+        manifest_url = manifest_sh.url
 
         return manifest_url, manifest_df
 
@@ -1527,7 +1523,7 @@ def get_manifest(
             manifest_record = store.updateDatasetManifestFiles(self.sg, datasetId=dataset_id, store=False)
 
             # get URL of an empty manifest file created based on schema component
-            empty_manifest_url = self.get_empty_manifest(strict=strict, sheet_url=sheet_url)
+            empty_manifest_url = self.get_empty_manifest(strict=strict, sheet_url=True)
 
             # Populate empty template with existing manifest
             if manifest_record:
@@ -1547,25 +1543,24 @@ def get_manifest(
                 return result
 
         # Generate empty template and optionally fill in with annotations
+        # If there is no existing manifest and use_annotations is set to True,
+        # pull annotations (in practice, annotations should be empty when there is no existing manifest)
         else:
             # Using getDatasetAnnotations() to retrieve file names and subset
             # entities to files and folders (ignoring tables/views)
-
             annotations = pd.DataFrame()
-            if self.is_file_based:
-                annotations = store.getDatasetAnnotations(dataset_id)
-
-            # if there are no files with annotations just generate an empty manifest
-            if annotations.empty:
-                manifest_url = self.get_empty_manifest(strict=strict)
-                manifest_df = self.get_dataframe_by_url(manifest_url)
+            if self.use_annotations:
+                if self.is_file_based:
+                    annotations = store.getDatasetAnnotations(dataset_id)
+                # Update `additional_metadata` and generate manifest
+                manifest_url, manifest_df = self.get_manifest_with_annotations(annotations, strict=strict)
             else:
-                # Subset columns if not interested in user-defined annotations and there are files present
-                if self.is_file_based and not self.use_annotations:
-                    annotations = annotations[["Filename", "eTag", "entityId"]]
-
-                # Update `additional_metadata` and generate manifest
-                manifest_url, manifest_df = self.get_manifest_with_annotations(annotations, sheet_url=sheet_url, strict=strict)
+                empty_manifest_df = self.get_dataframe_by_url(empty_manifest_url)
+                if self.is_file_based:
+                    # For file-based manifests, make sure the entityId and Filename columns still get filled even when use_annotations is False
+                    manifest_df = store.add_entity_id_and_filename(dataset_id, empty_manifest_df)
+                else:
+                    manifest_df = empty_manifest_df
 
             # Update df with existing manifest. Agnostic to output format
             updated_df, out_of_schema_columns = self._update_dataframe_with_existing_df(empty_manifest_url=empty_manifest_url, existing_df=manifest_df)
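
For callers, the visible effect of this file's changes is that `get_manifest_with_annotations` no longer accepts the soon-to-be-deprecated `sheet_url` flag and always generates a Google Sheet URL internally. A before/after sketch, assuming an already-initialized ManifestGenerator `mg` and an `annotations` dataframe (both stand-ins for real setup code):

# Before this release (sheet_url was a parameter slated for deprecation):
#   manifest_url, manifest_df = mg.get_manifest_with_annotations(
#       annotations, sheet_url=True, strict=True
#   )

# After this release -- sheet_url is gone; a sheet URL is always produced:
manifest_url, manifest_df = mg.get_manifest_with_annotations(annotations, strict=True)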

schematic/store/synapse.py

Lines changed: 93 additions & 17 deletions
@@ -10,7 +10,7 @@
 import shutil
 
 # allows specifying explicit variable types
-from typing import Dict, List, Tuple, Sequence, Union
+from typing import Dict, List, Tuple, Sequence, Union, Optional
 from collections import OrderedDict
 from tenacity import retry, stop_after_attempt, wait_chain, wait_fixed, retry_if_exception_type
 
@@ -35,7 +35,7 @@
 from synapseclient.table import CsvFileTable, build_table, Schema
 from synapseclient.annotations import from_synapse_annotations
 from synapseclient.core.exceptions import SynapseHTTPError, SynapseAuthenticationError, SynapseUnmetAccessRestrictions
-from synapseutils import walk
+import synapseutils
 from synapseutils.copy_functions import changeFileMetaData
 
 import uuid
@@ -413,7 +413,7 @@ def getFilesInStorageDataset(
         """
 
         # select all files within a given storage dataset folder (top level folder in a Synapse storage project or folder marked with contentType = 'dataset')
-        walked_path = walk(self.syn, datasetId)
+        walked_path = synapseutils.walk(self.syn, datasetId, includeTypes=["folder", "file"])
 
         file_list = []
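
The `includeTypes=["folder", "file"]` argument limits the walk to folders and files, so tables, views, and links under a dataset folder are skipped during traversal. A standalone sketch of the call, assuming cached Synapse credentials; the dataset ID is a placeholder:

import synapseclient
import synapseutils

syn = synapseclient.login()  # assumes credentials are cached locally

# Each step yields ((folder_name, folder_id), subfolders, files), where
# subfolders and files are lists of (name, synapse_id) tuples.
for dirpath, dirnames, filenames in synapseutils.walk(
    syn, "syn00000000", includeTypes=["folder", "file"]
):
    for file_name, file_id in filenames:
        print(file_name, file_id)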

@@ -538,6 +538,94 @@ def getDataTypeFromManifest(self, manifestId:str):
 
         return result_dict
 
+    def _get_files_metadata_from_dataset(self, datasetId: str, only_new_files: bool, manifest: pd.DataFrame = None) -> Optional[dict]:
+        """Retrieve file names and entity IDs under a particular datasetId.
+
+        Args:
+            datasetId (str): a dataset id
+            only_new_files (bool): whether to only include files that are not already in the manifest
+            manifest (pd.DataFrame): metadata manifest dataframe. Defaults to None.
+
+        Returns:
+            a dictionary of filenames and entity IDs under the given datasetId, or None if the dataset contains no files
+        """
+        dataset_files = self.getFilesInStorageDataset(datasetId)
+        if dataset_files:
+            dataset_file_names_id_dict = self._get_file_entityIds(dataset_files, only_new_files=only_new_files, manifest=manifest)
+            return dataset_file_names_id_dict
+        else:
+            return None
+
+    def add_entity_id_and_filename(self, datasetId: str, manifest: pd.DataFrame) -> pd.DataFrame:
+        """Add entityId and Filename columns to an existing manifest, assuming the entityId column is not already present.
+
+        Args:
+            datasetId (str): dataset syn id
+            manifest (pd.DataFrame): existing manifest dataframe; assumed to have no entityId column and a Filename column that is present but completely empty
+
+        Returns:
+            pd.DataFrame: the updated manifest dataframe
+        """
+        # get file names and entity ids of a given dataset
+        dataset_files_dict = self._get_files_metadata_from_dataset(datasetId, only_new_files=False)
+
+        if dataset_files_dict:
+            # turn manifest dataframe back to a dictionary for operation
+            manifest_dict = manifest.to_dict('list')
+
+            # update Filename column
+            # add entityId column to the end
+            manifest_dict.update(dataset_files_dict)
+
+            # if the component column exists in existing manifest, fill up that column
+            if "Component" in manifest_dict.keys():
+                manifest_dict["Component"] = manifest_dict["Component"] * max(1, len(manifest_dict["Filename"]))
+
+            # turn dictionary back to a dataframe
+            manifest_df_index = pd.DataFrame.from_dict(manifest_dict, orient='index')
+            manifest_df_updated = manifest_df_index.transpose()
+
+            # fill na with empty string
+            manifest_df_updated = manifest_df_updated.fillna("")
+
+            # drop index
+            manifest_df_updated = manifest_df_updated.reset_index(drop=True)
+
+            return manifest_df_updated
+        else:
+            return manifest
+
+    def fill_in_entity_id_filename(self, datasetId: str, manifest: pd.DataFrame) -> Tuple[List, pd.DataFrame]:
+        """Fill in the Filename and entityId columns, creating them if they are not already present.
+
+        Args:
+            datasetId (str): dataset syn id
+            manifest (pd.DataFrame): existing manifest dataframe.
+
+        Returns:
+            Tuple[List, pd.DataFrame]: a list of synIds under the given datasetId folder and the updated manifest dataframe
+        """
+        # get dataset file names and entity id as a list of tuple
+        dataset_files = self.getFilesInStorageDataset(datasetId)
+
+        # update manifest with additional filenames, if any
+        # note that if there is an existing manifest and there are files in the dataset
+        # the columns Filename and entityId are assumed to be present in manifest schema
+        # TODO: use idiomatic panda syntax
+        if dataset_files:
+            new_files = self._get_file_entityIds(dataset_files=dataset_files, only_new_files=True, manifest=manifest)
+
+            # update manifest so that it contains new dataset files
+            new_files = pd.DataFrame(new_files)
+            manifest = (
+                pd.concat([manifest, new_files], sort=False)
+                .reset_index()
+                .drop("index", axis=1)
+            )
+
+        manifest = manifest.fillna("")
+        return dataset_files, manifest
+
     def updateDatasetManifestFiles(self, sg: SchemaGenerator, datasetId: str, store: bool = True) -> Union[Tuple[str, pd.DataFrame], None]:
         """Fetch the names and entity IDs of all current files in dataset in store, if any; update dataset's manifest with new files, if any.
 
@@ -562,32 +650,20 @@ def updateDatasetManifestFiles(self, sg: SchemaGenerator, datasetId: str, store:
             manifest_filepath = self.syn.get(manifest_id).path
             manifest = load_df(manifest_filepath)
 
-            # get current list of files
-            dataset_files = self.getFilesInStorageDataset(datasetId)
-
             # update manifest with additional filenames, if any
             # note that if there is an existing manifest and there are files in the dataset
             # the columns Filename and entityId are assumed to be present in manifest schema
             # TODO: use idiomatic panda syntax
-            if dataset_files:
-                new_files = self._get_file_entityIds(dataset_files=dataset_files, only_new_files=True, manifest=manifest)
-
-                # update manifest so that it contains new files
-                new_files = pd.DataFrame(new_files)
-                manifest = (
-                    pd.concat([manifest, new_files], sort=False)
-                    .reset_index()
-                    .drop("index", axis=1)
-                )
 
+            dataset_files, manifest = self.fill_in_entity_id_filename(datasetId, manifest)
+            if dataset_files:
                 # update the manifest file, so that it contains the relevant entity IDs
                 if store:
                     manifest.to_csv(manifest_filepath, index=False)
 
                     # store manifest and update associated metadata with manifest on Synapse
                     manifest_id = self.associateMetadataWithFiles(sg, manifest_filepath, datasetId)
 
-            manifest = manifest.fillna("")
 
             return manifest_id, manifest
