Skip to content

Commit b79283e

Browse files
Merge pull request #1252 from Sage-Bionetworks/develop-decouple-vr-from-excel-FDS-537
Only add validation rules to google sheets
2 parents 5fb5743 + 967da31 commit b79283e

File tree

1 file changed

+32
-21
lines changed

1 file changed

+32
-21
lines changed

schematic/manifest/generator.py

Lines changed: 32 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
from pathlib import Path
1010
import pygsheets as ps
1111
from tempfile import NamedTemporaryFile
12-
from typing import Dict, List, Tuple, Union, Optional
12+
from typing import Dict, List, Optional, Tuple, Union
1313

1414
from schematic.schemas.generator import SchemaGenerator
1515
from schematic.utils.google_api_utils import (
@@ -288,7 +288,7 @@ def _get_column_data_validation_values(
288288
spreadsheet_id,
289289
valid_values,
290290
column_id,
291-
strict,
291+
strict:Optional[bool],
292292
validation_type="ONE_OF_LIST",
293293
custom_ui=True,
294294
input_message="Choose one from dropdown",
@@ -706,7 +706,7 @@ def _request_regex_vr(self, gs_formula, i:int, text_color={"red": 1}):
706706
return requests_vr
707707

708708
def _request_regex_match_vr_formatting(self, validation_rules: List[str], i: int,
709-
spreadsheet_id: str, requests_body: dict, strict: bool = None,
709+
spreadsheet_id: str, requests_body: dict, strict: Optional[bool],
710710
):
711711
"""
712712
Purpose:
@@ -1079,7 +1079,8 @@ def _create_requests_body(
10791079
ordered_metadata_fields,
10801080
json_schema,
10811081
spreadsheet_id,
1082-
strict=None,
1082+
sheet_url,
1083+
strict: Optional[bool],
10831084
):
10841085
"""Create and store all formatting changes for the google sheet to
10851086
execute at once.
@@ -1094,6 +1095,8 @@ def _create_requests_body(
10941095
representing the data model, including: '$schema', '$id', 'title',
10951096
'type', 'properties', 'required'
10961097
spreadsheet_id: str, of the id for the google sheet
1098+
sheet_url (Will be deprecated): a boolean; determines whether a pandas dataframe or a google sheet url gets returned
1099+
strict (Optional Bool): strictness with which to apply validation rules to google sheets. True, blocks incorrect entries, False, raises a warning
10971100
Return:
10981101
requests_body(dict):
10991102
containing all the update requests to add to the gs
@@ -1102,10 +1105,11 @@ def _create_requests_body(
11021105
requests_body = {}
11031106
requests_body["requests"] = []
11041107
for i, req in enumerate(ordered_metadata_fields[0]):
1105-
# Gather validation rules and valid values for attribute
1108+
# Gather validation rules and valid values for attribute.
11061109
validation_rules = self.sg.get_node_validation_rules(req)
1107-
1108-
if validation_rules:
1110+
1111+
# Add regex match validation rule to Google Sheets.
1112+
if validation_rules and sheet_url:
11091113
requests_body =self._request_regex_match_vr_formatting(
11101114
validation_rules, i, spreadsheet_id, requests_body, strict
11111115
)
@@ -1163,7 +1167,7 @@ def _create_requests_body(
11631167
requests_body["requests"].append(borders_formatting)
11641168
return requests_body
11651169

1166-
def _create_empty_gs(self, required_metadata_fields, json_schema, spreadsheet_id, strict=None):
1170+
def _create_empty_gs(self, required_metadata_fields, json_schema, spreadsheet_id, sheet_url, strict: Optional[bool]):
11671171
"""Generate requests to add columns and format the google sheet.
11681172
Args:
11691173
required_metadata_fields(dict):
@@ -1174,6 +1178,8 @@ def _create_empty_gs(self, required_metadata_fields, json_schema, spreadsheet_id
11741178
representing the data model, including: '$schema', '$id', 'title',
11751179
'type', 'properties', 'required'
11761180
spreadsheet_id: str, of the id for the google sheet
1181+
sheet_url (Will be deprecated): a boolean; determines whether a pandas dataframe or a google sheet url gets returned
1182+
strict (Optional Bool): strictness with which to apply validation rules to google sheets. True, blocks incorrect entries, False, raises a warning
11771183
Returns:
11781184
manifest_url (str): url of the google sheet manifest.
11791185
"""
@@ -1193,6 +1199,7 @@ def _create_empty_gs(self, required_metadata_fields, json_schema, spreadsheet_id
11931199
ordered_metadata_fields,
11941200
json_schema,
11951201
spreadsheet_id,
1202+
sheet_url,
11961203
strict,
11971204
)
11981205

@@ -1236,12 +1243,14 @@ def _gather_all_fields(self, fields, json_schema):
12361243
)
12371244
return required_metadata_fields
12381245

1239-
def get_empty_manifest(self, strict: Optional[bool], json_schema_filepath: str=None):
1246+
def get_empty_manifest(self, strict: Optional[bool], json_schema_filepath: str=None, sheet_url: Optional[bool]=None):
12401247
"""Create an empty manifest using specifications from the
12411248
json schema.
12421249
Args:
12431250
strict (bool): strictness with which to apply validation rules to google sheets. If true, blocks incorrect entries; if false, raises a warning
12441251
json_schema_filepath (str): path to json schema file
1252+
sheet_url (Will be deprecated): a boolean; determines whether a pandas dataframe or a google sheet url gets returned
1253+
strict (Optional Bool): strictness with which to apply validation rules to google sheets. True, blocks incorrect entries, False, raises a warning
12451254
Returns:
12461255
manifest_url (str): url of the google sheet manifest.
12471256
TODO:
@@ -1255,7 +1264,7 @@ def get_empty_manifest(self, strict: Optional[bool], json_schema_filepath: str=N
12551264
)
12561265

12571266
manifest_url = self._create_empty_gs(
1258-
required_metadata_fields, json_schema, spreadsheet_id, strict
1267+
required_metadata_fields, json_schema, spreadsheet_id, sheet_url=sheet_url, strict=strict,
12591268
)
12601269
return manifest_url
12611270

@@ -1356,13 +1365,14 @@ def map_annotation_names_to_display_names(
13561365
return annotations.rename(columns=label_map)
13571366

13581367
def get_manifest_with_annotations(
1359-
self, annotations: pd.DataFrame, strict: bool=None,
1368+
self, annotations: pd.DataFrame, sheet_url:bool=None, strict: Optional[bool]=None,
13601369
) -> Tuple[ps.Spreadsheet, pd.DataFrame]:
13611370
"""Generate manifest, optionally with annotations (if requested).
13621371
13631372
Args:
13641373
annotations (pd.DataFrame): Annotations table (can be empty).
1365-
1374+
strict (Optional Bool): strictness with which to apply validation rules to google sheets. True, blocks incorrect entries, False, raises a warning
1375+
sheet_url (Will be deprecated): a boolean; determines whether a pandas dataframe or a google sheet url gets returned
13661376
Returns:
13671377
Tuple[ps.Spreadsheet, pd.DataFrame]: Both the Google Sheet
13681378
URL and the corresponding data frame are returned.
@@ -1381,8 +1391,8 @@ def get_manifest_with_annotations(
13811391
self.additional_metadata = annotations_dict
13821392

13831393
# Generate empty manifest using `additional_metadata`
1384-
manifest_url = self.get_empty_manifest(strict=strict)
1385-
manifest_df = self.get_dataframe_by_url(manifest_url)
1394+
manifest_url = self.get_empty_manifest(sheet_url=sheet_url, strict=strict)
1395+
manifest_df = self.get_dataframe_by_url(manifest_url=manifest_url)
13861396

13871397
# Annotations clashing with manifest attributes are skipped
13881398
# during empty manifest generation. For more info, search
@@ -1474,14 +1484,14 @@ def _handle_output_format_logic(self, output_format: str = None, output_path: st
14741484
return dataframe
14751485

14761486
def get_manifest(
1477-
self, dataset_id: str = None, sheet_url: bool = None, json_schema: str = None, output_format: str = None, output_path: str = None, access_token: str = None, strict: bool = None,
1487+
self, dataset_id: str = None, sheet_url: bool = None, json_schema: str = None, output_format: str = None, output_path: str = None, access_token: str = None, strict: Optional[bool]=None,
14781488
) -> Union[str, pd.DataFrame]:
14791489
"""Gets manifest for a given dataset on Synapse.
14801490
TODO: move this function to class MetadataModel (after MetadataModel is refactored)
14811491
14821492
Args:
14831493
dataset_id: Synapse ID of the "dataset" entity on Synapse (for a given center/project).
1484-
sheet_url: Determines if googlesheet URL or pandas dataframe should be returned.
1494+
sheet_url (Will be deprecated): a boolean; determines whether a pandas dataframe or a google sheet url gets returned
14851495
output_format: Determines if Google sheet URL, pandas dataframe, or Excel spreadsheet gets returned.
14861496
output_path: Determines the output path of the exported manifest
14871497
access_token: Token in .synapseConfig. Since we could not pre-load access_token as an environment variable on AWS, we have to add this variable.
@@ -1492,7 +1502,7 @@ def get_manifest(
14921502

14931503
# Handle case when no dataset ID is provided
14941504
if not dataset_id:
1495-
manifest_url = self.get_empty_manifest(json_schema_filepath=json_schema, strict=strict)
1505+
manifest_url = self.get_empty_manifest(json_schema_filepath=json_schema, strict=strict, sheet_url=sheet_url)
14961506

14971507
# if output_form parameter is set to "excel", return an excel spreadsheet
14981508
if output_format == "excel":
@@ -1517,7 +1527,7 @@ def get_manifest(
15171527
manifest_record = store.updateDatasetManifestFiles(self.sg, datasetId = dataset_id, store = False)
15181528

15191529
# get URL of an empty manifest file created based on schema component
1520-
empty_manifest_url = self.get_empty_manifest(strict=strict)
1530+
empty_manifest_url = self.get_empty_manifest(strict=strict, sheet_url=sheet_url)
15211531

15221532
# Populate empty template with existing manifest
15231533
if manifest_record:
@@ -1555,7 +1565,8 @@ def get_manifest(
15551565
annotations = annotations[["Filename", "eTag", "entityId"]]
15561566

15571567
# Update `additional_metadata` and generate manifest
1558-
manifest_url, manifest_df = self.get_manifest_with_annotations(annotations)
1568+
manifest_url, manifest_df = self.get_manifest_with_annotations(annotations, sheet_url=sheet_url, strict=strict)
1569+
15591570
# Update df with existing manifest. Agnostic to output format
15601571
updated_df, out_of_schema_columns = self._update_dataframe_with_existing_df(empty_manifest_url=empty_manifest_url, existing_df=manifest_df)
15611572

@@ -1609,8 +1620,8 @@ def _update_dataframe_with_existing_df(self, empty_manifest_url: str, existing_d
16091620
"""
16101621

16111622
# Get headers for the current schema and existing manifest df.
1612-
current_schema_headers = list(self.get_dataframe_by_url(empty_manifest_url).columns)
1613-
existing_manifest_headers = list(existing_df.columns)
1623+
current_schema_headers = list(self.get_dataframe_by_url(manifest_url=empty_manifest_url).columns)
1624+
existing_manfiest_headers = list(existing_df.columns)
16141625

16151626
# Find columns that exist in the current schema, but are not in the manifest being downloaded.
16161627
new_columns = self._get_missing_columns(current_schema_headers, existing_manifest_headers)

0 commit comments

Comments
 (0)