Skip to content

Commit 728e717

Browse files
authored
Merge pull request #1429 from Sage-Bionetworks/develop
v24.5.1 fix
2 parents 0b26a01 + d3668c0 commit 728e717

15 files changed

+508
-114
lines changed

RELEASE.md

Lines changed: 0 additions & 51 deletions
This file was deleted.

schematic/models/validate_attribute.py

Lines changed: 16 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
np_array_to_str_list,
2424
iterable_to_str_list,
2525
rule_in_rule_list,
26+
get_list_robustness,
2627
)
2728

2829
from synapseclient.core.exceptions import SynapseNoCredentialsError
@@ -58,6 +59,7 @@ def generate_schema_error(
5859
error_col=attribute_name,
5960
error_message=error_message,
6061
error_val=invalid_entry,
62+
message_level="error",
6163
)
6264

6365
return error_list, warning_list
@@ -475,7 +477,12 @@ def _get_rule_attributes(
475477

476478
is_schema_error = rule_name == "schema"
477479
col_is_recommended = rule_name == "recommended"
478-
col_is_required = dmge.get_node_required(node_display_name=error_col_name)
480+
481+
if not is_schema_error:
482+
col_is_required = dmge.get_node_required(node_display_name=error_col_name)
483+
else:
484+
col_is_required = False
485+
479486
return (
480487
rule_parts,
481488
rule_name,
@@ -823,16 +830,17 @@ def list_validation(
823830
# white spaces removed.
824831
errors = []
825832
warnings = []
833+
replace_null = True
826834

827835
csv_re = comma_separated_list_regex()
828836

829-
rule_parts = val_rule.lower().split(" ")
830-
if len(rule_parts) > 1:
831-
list_robustness = rule_parts[1]
832-
else:
833-
list_robustness = "strict"
837+
# Check if lists -must- be a list, or can be a single value.
838+
list_robustness = get_list_robustness(val_rule=val_rule)
839+
840+
if list_robustness == "like":
841+
replace_null = False
834842

835-
if list_robustness == "strict":
843+
elif list_robustness == "strict":
836844
manifest_col = manifest_col.astype(str)
837845

838846
# This will capture any entry that is not formatted properly. Only for strict lists
@@ -864,7 +872,7 @@ def list_validation(
864872
warnings.append(vr_warnings)
865873

866874
# Convert string to list.
867-
manifest_col = parse_str_series_to_list(manifest_col)
875+
manifest_col = parse_str_series_to_list(manifest_col, replace_null=replace_null)
868876

869877
return errors, warnings, manifest_col
870878

schematic/models/validate_manifest.py

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,10 @@
2525
from schematic.store.synapse import SynapseStorage
2626
from schematic.models.GE_Helpers import GreatExpectationsHelpers
2727
from schematic.utils.validate_rules_utils import validation_rule_info
28-
from schematic.utils.validate_utils import rule_in_rule_list
28+
from schematic.utils.validate_utils import (
29+
rule_in_rule_list,
30+
convert_nan_entries_to_empty_strings,
31+
)
2932
from schematic.utils.schema_utils import extract_component_validation_rules
3033

3134
logger = logging.getLogger(__name__)
@@ -103,9 +106,9 @@ def validate_manifest_rules(
103106
manifest: pd.core.frame.DataFrame,
104107
dmge: DataModelGraphExplorer,
105108
restrict_rules: bool,
106-
project_scope: List,
109+
project_scope: list[str],
107110
access_token: Optional[str] = None,
108-
) -> (pd.core.frame.DataFrame, List[List[str]]):
111+
) -> (pd.core.frame.DataFrame, list[list[str]]):
109112
"""
110113
Purpose:
111114
Take validation rules set for a particular attribute
@@ -295,8 +298,7 @@ def validate_manifest_values(
295298
warnings = []
296299
col_attr = {} # save the mapping between column index and attribute name
297300

298-
# Replace nans with empty strings so jsonschema
299-
manifest = manifest.replace({np.nan: ""})
301+
manifest = convert_nan_entries_to_empty_strings(manifest=manifest)
300302

301303
# numerical values need to be type string for the jsonValidator
302304
for col in manifest.select_dtypes(
@@ -347,15 +349,18 @@ def validate_all(
347349
project_scope: List,
348350
access_token: str,
349351
):
352+
# Run Validation Rules
350353
vm = ValidateManifest(errors, manifest, manifestPath, dmge, jsonSchema)
351354
manifest, vmr_errors, vmr_warnings = vm.validate_manifest_rules(
352355
manifest, dmge, restrict_rules, project_scope, access_token
353356
)
357+
354358
if vmr_errors:
355359
errors.extend(vmr_errors)
356360
if vmr_warnings:
357361
warnings.extend(vmr_warnings)
358362

363+
# Run JSON Schema Validation
359364
vmv_errors, vmv_warnings = vm.validate_manifest_values(manifest, jsonSchema, dmge)
360365
if vmv_errors:
361366
errors.extend(vmv_errors)

schematic/utils/validate_utils.py

Lines changed: 76 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,33 @@ def comma_separated_list_regex() -> Pattern[str]:
5454
return csv_list_regex
5555

5656

57+
def convert_nan_entries_to_empty_strings(
    manifest: pd.core.frame.DataFrame,
) -> pd.core.frame.DataFrame:
    """
    Nans need to be converted to empty strings for JSON Schema Validation. This helper
    converts a list with a single '<NA>' string or a single np.nan to empty strings.
    These types of expected NANs come from different stages of conversion during import
    and validation.

    Args:
        manifest: pd.core.frame.DataFrame, manifest prior to removing nans and
            replacing with empty strings.
    Returns:
        manifest: pd.core.frame.DataFrame, manifest post removing nans and
            replacing with empty strings.
    """
    # Replace nans with empty strings for jsonschema. The option context opts in to the
    # future (no silent downcasting) behavior to address the replace type-inference deprecation.
    with pd.option_context("future.no_silent_downcasting", True):
        manifest = manifest.replace({np.nan: ""}).infer_objects(copy=False)  # type: ignore

    # Visit every cell; entries equal to ["<NA>"] are overwritten with [""].
    # NOTE(review): assignment is label-based (.loc) — presumably the manifest index is unique; confirm.
    for col in manifest.columns:
        for index, value in manifest[col].items():
            if value == ["<NA>"]:
                manifest.loc[index, col] = [""]  # type: ignore
    return manifest
82+
83+
5784
def rule_in_rule_list(rule: str, rule_list: list[str]) -> Optional[re.Match[str]]:
5885
"""
5986
Function to standardize
@@ -70,18 +97,62 @@ def rule_in_rule_list(rule: str, rule_list: list[str]) -> Optional[re.Match[str]
7097
return re.search(rule_type, rule_list_str, flags=re.IGNORECASE)
7198

7299

73-
def parse_str_series_to_list(col: pd.Series) -> pd.Series:
100+
def get_list_robustness(val_rule: str) -> str:
    """Helper function to extract list robustness from the validation rule.

    List robustness defines if the input -must- be a list (several values
    or a single value with a trailing comma), or if a user is allowed to
    submit a single value. List rules default to `strict` if not defined
    to be `like`.

    Args:
        val_rule: str, validation rule string, e.g. "list like".
    Returns:
        list_robustness: str, list robustness extracted from the validation
            rule, either "like" or "strict".
    """
    list_robustness_options = ("like", "strict")
    default_robustness = "strict"

    # Get the parts of a single rule; `list` is assumed to be in the first position,
    # so a robustness keyword can only be present when there is more than one part.
    rule_parts = val_rule.lower().split(" ")
    if len(rule_parts) <= 1:
        return default_robustness

    # The first recognised keyword wins; if the user defined none, use the default.
    return next(
        (part for part in rule_parts if part in list_robustness_options),
        default_robustness,
    )
131+
132+
133+
def parse_str_series_to_list(col: pd.Series, replace_null: bool = True) -> pd.Series:
    """
    Parse a pandas series of comma delimited strings
    into a series with values that are lists of strings.
    ex.
        Input:  'a,b,c'
        Output: ['a','b','c']

    Args:
        col: pd.Series, column whose entries are comma delimited strings.
        replace_null: bool, if True null entries are replaced with pd.NA;
            if False they are replaced with an empty list (for values whose
            type needs to be an array).
    Returns:
        pd.Series, series in which every non-null entry is a list of
            whitespace-stripped strings.
    """

    def split_entry(entry) -> list[str]:
        # Split on commas and trim the whitespace around every item.
        return [item.strip() for item in str(entry).split(",")]

    if replace_null:
        return col.apply(lambda x: split_entry(x) if not pd.isnull(x) else pd.NA)

    # NOTE(review): an ndarray with no truthy elements skips the null check and is
    # parsed from its str() form — confirm this is the intended result.
    return col.apply(
        lambda x: split_entry(x)
        if (isinstance(x, np.ndarray) and not x.any()) or not pd.isnull(x)
        else []
    )
86157

87158

tests/data/example.model.csv

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -19,9 +19,16 @@ CRAM,,,"Genome Build, Genome FASTA",,FALSE,ValidValue,,,
1919
CSV/TSV,,,Genome Build,,FALSE,ValidValue,,,
2020
Genome Build,,"GRCh37, GRCh38, GRCm38, GRCm39",,,TRUE,DataProperty,,,
2121
Genome FASTA,,,,,TRUE,DataProperty,,,
22-
MockComponent,,,"Component, Check List, Check Regex List, Check Regex Single, Check Regex Format, Check Regex Integer, Check Num, Check Float, Check Int, Check String, Check URL,Check Match at Least, Check Match at Least values, Check Match Exactly, Check Match Exactly values, Check Match None, Check Match None values, Check Recommended, Check Ages, Check Unique, Check Range, Check Date, Check NA",,FALSE,DataType,,,
23-
Check List,,"ab, cd, ef, gh",,,TRUE,DataProperty,,,list strict
24-
Check Regex List,,,,,TRUE,DataProperty,,,list strict::regex match [a-f]
22+
MockComponent,,,"Component, Check List, Check List Enum, Check List Like, Check List Like Enum, Check List Strict, Check List Enum Strict, Check Regex List, Check Regex List Like, Check Regex List Strict, Check Regex Single, Check Regex Format, Check Regex Integer, Check Num, Check Float, Check Int, Check String, Check URL,Check Match at Least, Check Match at Least values, Check Match Exactly, Check Match Exactly values, Check Match None, Check Match None values, Check Recommended, Check Ages, Check Unique, Check Range, Check Date, Check NA",,FALSE,DataType,,,
23+
Check List,,,,,TRUE,DataProperty,,,list
24+
Check List Enum,,"ab, cd, ef, gh",,,TRUE,DataProperty,,,list
25+
Check List Like,,,,,TRUE,DataProperty,,,list like
26+
Check List Like Enum,,"ab, cd, ef, gh",,,TRUE,DataProperty,,,list like
27+
Check List Strict,,,,,TRUE,DataProperty,,,list strict
28+
Check List Enum Strict,,"ab, cd, ef, gh",,,TRUE,DataProperty,,,list strict
29+
Check Regex List,,,,,TRUE,DataProperty,,,list::regex match [a-f]
30+
Check Regex List Strict,,,,,TRUE,DataProperty,,,list strict::regex match [a-f]
31+
Check Regex List Like,,,,,TRUE,DataProperty,,,list like::regex match [a-f]
2532
Check Regex Single,,,,,TRUE,DataProperty,,,regex search [a-f]
2633
Check Regex Format,,,,,TRUE,DataProperty,,,regex match [a-f]
2734
Check Regex Integer,,,,,TRUE,DataProperty,,,regex search ^\d+$

0 commit comments

Comments
 (0)