Skip to content

Commit c51af2d

Browse files
Merge pull request #1401 from Sage-Bionetworks/develop-required-validation-rule-FDS-1626
add functionality to set requirements based per component
2 parents 0b2d7b3 + 7d18e94 commit c51af2d

15 files changed

+1442
-59
lines changed

schematic/models/GE_Helpers.py

Lines changed: 36 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -38,11 +38,37 @@
3838
rule_in_rule_list,
3939
np_array_to_str_list,
4040
iterable_to_str_list,
41+
required_is_only_rule,
4142
)
4243

4344
logger = logging.getLogger(__name__)
4445

4546

47+
# List of modifiers that users can add to a rule, that arent rules themselves.
48+
# as additional modifiers are added will need to update this list
49+
50+
RULE_MODIFIERS = ["error", "warning", "strict", "like", "set", "value"]
51+
VALIDATION_EXPECTATION = {
52+
"int": "expect_column_values_to_be_in_type_list",
53+
"float": "expect_column_values_to_be_in_type_list",
54+
"str": "expect_column_values_to_be_of_type",
55+
"num": "expect_column_values_to_be_in_type_list",
56+
"date": "expect_column_values_to_be_dateutil_parseable",
57+
"recommended": "expect_column_values_to_not_be_null",
58+
"protectAges": "expect_column_values_to_be_between",
59+
"unique": "expect_column_values_to_be_unique",
60+
"inRange": "expect_column_values_to_be_between",
61+
"IsNA": "expect_column_values_to_match_regex_list",
62+
# To be implemented rules with possible expectations
63+
# "list": "expect_column_values_to_not_match_regex_list",
64+
# "regex": "expect_column_values_to_match_regex",
65+
# "url": "expect_column_values_to_be_valid_urls",
66+
# "matchAtLeastOne": "expect_foreign_keys_in_column_a_to_exist_in_column_b",
67+
# "matchExactlyOne": "expect_foreign_keys_in_column_a_to_exist_in_column_b",
68+
# "matchNone": "expect_compound_columns_to_be_unique",
69+
}
70+
71+
4672
class GreatExpectationsHelpers(object):
4773
"""
4874
Great Expectations helper class
@@ -134,25 +160,6 @@ def build_expectation_suite(
134160
saves expectation suite and identifier to self
135161
136162
"""
137-
validation_expectation = {
138-
"int": "expect_column_values_to_be_in_type_list",
139-
"float": "expect_column_values_to_be_in_type_list",
140-
"str": "expect_column_values_to_be_of_type",
141-
"num": "expect_column_values_to_be_in_type_list",
142-
"date": "expect_column_values_to_be_dateutil_parseable",
143-
"recommended": "expect_column_values_to_not_be_null",
144-
"protectAges": "expect_column_values_to_be_between",
145-
"unique": "expect_column_values_to_be_unique",
146-
"inRange": "expect_column_values_to_be_between",
147-
"IsNA": "expect_column_values_to_match_regex_list",
148-
# To be implemented rules with possible expectations
149-
# "list": "expect_column_values_to_not_match_regex_list",
150-
# "regex": "expect_column_values_to_match_regex",
151-
# "url": "expect_column_values_to_be_valid_urls",
152-
# "matchAtLeastOne": "expect_foreign_keys_in_column_a_to_exist_in_column_b",
153-
# "matchExactlyOne": "expect_foreign_keys_in_column_a_to_exist_in_column_b",
154-
# "matchNone": "expect_compound_columns_to_be_unique",
155-
}
156163

157164
# create blank expectation suite
158165
self.expectation_suite_name = "Manifest_test_suite"
@@ -185,7 +192,14 @@ def build_expectation_suite(
185192
base_rule = rule.split(" ")[0]
186193

187194
# check if rule has an implemented expectation
188-
if rule_in_rule_list(rule, self.unimplemented_expectations):
195+
if rule_in_rule_list(
196+
rule, self.unimplemented_expectations
197+
) or required_is_only_rule(
198+
rule=rule,
199+
attribute=col,
200+
rule_modifiers=RULE_MODIFIERS,
201+
validation_expectation=VALIDATION_EXPECTATION,
202+
):
189203
continue
190204

191205
args["column"] = col
@@ -328,7 +342,7 @@ def build_expectation_suite(
328342
rule=rule,
329343
args=args,
330344
meta=meta,
331-
validation_expectation=validation_expectation,
345+
validation_expectation=VALIDATION_EXPECTATION,
332346
)
333347

334348
self.context.update_expectation_suite(
@@ -368,7 +382,7 @@ def add_expectation(
368382
# Create an Expectation
369383
expectation_configuration = ExpectationConfiguration(
370384
# Name of expectation type being added
371-
expectation_type=validation_expectation[rule.split(" ")[0]],
385+
expectation_type=VALIDATION_EXPECTATION[rule.split(" ")[0]],
372386
# add arguments and meta message
373387
kwargs={**args},
374388
meta={**meta},

schematic/models/validate_manifest.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -80,11 +80,14 @@ def check_max_rule_num(
8080
errors, list[list[str]]: list of errors being compiled, with additional error list being appended if appropriate
8181
"""
8282
# Check that attribute rules conform to limits:
83-
# IsNa is operates differently than most rules, do not consider it as a rule for evaluating
83+
# IsNa and required is operate differently than most rules, do not consider it as a rule for evaluating
8484
# if the number of rule pairs has been exceeded.
8585
if "IsNa" in validation_rules:
8686
validation_rules.remove("IsNa")
8787

88+
if "required" in validation_rules:
89+
validation_rules.remove("required")
90+
8891
# no more than two rules for an attribute.
8992
# As more combinations get added, may want to bring out into its own function / or use validate_rules_utils?
9093
if len(validation_rules) > 2:

schematic/schemas/data_model_graph.py

Lines changed: 93 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,9 +14,12 @@
1414
get_property_label_from_display_name,
1515
get_class_label_from_display_name,
1616
DisplayLabelType,
17+
extract_component_validation_rules,
1718
)
1819
from schematic.utils.general import unlist
1920
from schematic.utils.viz_utils import visualize
21+
from schematic.utils.validate_utils import rule_in_rule_list
22+
2023

2124
logger = logging.getLogger(__name__)
2225

@@ -226,6 +229,95 @@ def get_adjacent_nodes_by_relationship(
226229

227230
return list(nodes)
228231

232+
def get_component_node_required(
233+
self,
234+
manifest_component: str,
235+
node_validation_rules: Optional[list[str]] = None,
236+
node_label: Optional[str] = None,
237+
node_display_name: Optional[str] = None,
238+
) -> bool:
239+
"""Check if a node is required taking into account the manifest component it is defined in
240+
(requirements can be set in validaiton rule as well as required column)
241+
Args:
242+
manifest_component: str, manifest component display name that the node belongs to.
243+
node_validation_rules: list[str], valdation rules for a given node and component.
244+
node_label: str, Label of the node you would want to get the comment for.
245+
node_display_name: str, node display name for the node being queried.
246+
Returns:
247+
True, if node is required, False if not
248+
"""
249+
node_required = False
250+
251+
if not node_validation_rules:
252+
# Get node validation rules for a given component
253+
node_validation_rules = self.get_component_node_validation_rules(
254+
manifest_component=manifest_component,
255+
node_label=node_label,
256+
node_display_name=node_display_name,
257+
)
258+
259+
# Check if the valdation rule specifies that the node is required for this particular
260+
# component.
261+
if rule_in_rule_list("required", node_validation_rules):
262+
node_required = True
263+
# To prevent any unintended errors, ensure the Required field for this node is False
264+
if self.get_node_required(
265+
node_label=node_label, node_display_name=node_display_name
266+
):
267+
if not node_display_name:
268+
assert node_label is not None
269+
node_display_name = self.graph.nodes[node_label][
270+
self.rel_dict["displayName"]["node_label"]
271+
]
272+
error_str = " ".join(
273+
[
274+
f"For component: {manifest_component} and attribute: {node_display_name}",
275+
"requirements are being specified in both the Required field and in the",
276+
"Validation Rules. If you desire to use validation rules to set component",
277+
"specific requirements for this attribute",
278+
"then the Required field needs to be set to False, or the validation may",
279+
"not work as intended, for other components where the attribute",
280+
"that should not be required.",
281+
]
282+
)
283+
284+
logger.error(error_str)
285+
else:
286+
# If requirements are not being set in the validaiton rule, then just pull the
287+
# standard node requirements from the model
288+
node_required = self.get_node_required(
289+
node_label=node_label, node_display_name=node_display_name
290+
)
291+
return node_required
292+
293+
def get_component_node_validation_rules(
294+
self,
295+
manifest_component: str,
296+
node_label: Optional[str] = None,
297+
node_display_name: Optional[str] = None,
298+
) -> list[str]:
299+
"""Get valdation rules for a given node and component.
300+
Args:
301+
manifest_component: str, manifest component display name that the node belongs to.
302+
node_label: str, Label of the node you would want to get the comment for.
303+
node_display_name: str, node display name for the node being queried.
304+
Returns:
305+
validation_rules: list[str], validation rules list for a given node and component.
306+
"""
307+
# get any additional validation rules associated with this node (e.g. can this node
308+
# be mapped to a list of other nodes)
309+
node_validation_rules = self.get_node_validation_rules(
310+
node_label=node_label, node_display_name=node_display_name
311+
)
312+
313+
# Parse the validation rules per component if applicable
314+
if node_validation_rules and isinstance(node_validation_rules, dict):
315+
node_validation_rules = extract_component_validation_rules(
316+
manifest_component=manifest_component,
317+
validation_rules_dict=node_validation_rules,
318+
)
319+
return node_validation_rules
320+
229321
def get_component_requirements(
230322
self,
231323
source_component: str,
@@ -671,7 +763,7 @@ def get_node_required(
671763

672764
def get_node_validation_rules(
673765
self, node_label: Optional[str] = None, node_display_name: Optional[str] = None
674-
) -> list:
766+
) -> Union[list, dict[str, str]]:
675767
"""Get validation rules associated with a node,
676768
677769
Args:

schematic/schemas/data_model_json_schema.py

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -187,15 +187,16 @@ def get_json_validation_schema(
187187
range_domain_map[node] = []
188188
range_domain_map[node].append(node_display_name)
189189

190-
# can this node be map to the empty set (if required no; if not required yes)
191-
# TODO: change "required" to different term, required may be a bit misleading
192-
# (i.e. is the node required in the schema)
193-
node_required = self.dmge.get_node_required(node_label=process_node)
194-
195-
# get any additional validation rules associated with this node (e.g. can this node
196-
# be mapped to a list of other nodes)
197-
node_validation_rules = self.dmge.get_node_validation_rules(
198-
node_display_name=node_display_name
190+
# Get node validation rules for the current node, and the given component
191+
node_validation_rules = self.dmge.get_component_node_validation_rules(
192+
manifest_component=source_node, node_display_name=node_display_name
193+
)
194+
195+
# Get if the node is required for the given component
196+
node_required = self.dmge.get_component_node_required(
197+
manifest_component=source_node,
198+
node_validation_rules=node_validation_rules,
199+
node_display_name=node_display_name,
199200
)
200201

201202
if node_display_name in reverse_dependencies:
@@ -231,7 +232,6 @@ def get_json_validation_schema(
231232
# set schema conditional dependencies
232233
for node in reverse_dependencies[node_display_name]:
233234
# set all of the conditional nodes that require this process node
234-
235235
# get node domain if any
236236
# ow this node is a conditional requirement
237237
if node in range_domain_map:

schematic/utils/validate_utils.py

Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44

55
import re
66
from collections.abc import Mapping
7+
import logging
78
from typing import Pattern, Union, Iterable, Any, Optional
89
from numbers import Number
910
from jsonschema import validate
@@ -12,6 +13,8 @@
1213
from schematic.utils.io_utils import load_json
1314
from schematic import LOADER
1415

16+
logger = logging.getLogger(__name__)
17+
1518

1619
def validate_schema(schema: Union[Mapping, bool]) -> None:
1720
"""Validate schema against schema.org standard"""
@@ -104,3 +107,73 @@ def iterable_to_str_list(obj: Union[str, Number, Iterable]) -> list[str]:
104107
# If the object is iterable and not a string, convert every element
105108
# to string and wrap as a list
106109
return [str(item) for item in obj]
110+
111+
112+
def required_is_only_rule(
113+
rule: str,
114+
attribute: str,
115+
rule_modifiers: list[str],
116+
validation_expectation: dict[str, str],
117+
) -> bool:
118+
"""Need to determine if required is the only rule being set. Do this way so we dont have
119+
to enforce a position for it (ie, it can only be before message and after the rule).
120+
This ensures that 'required' is not treated like a real rule, in the case it is
121+
accidentally combined with a rule modifier. The required rule is t
122+
123+
Args:
124+
rule: str, the validation rule string
125+
attribute: str, attribute the validation rule is set to
126+
rule_modifiers: list[str], list of rule modifiers available to add to rules
127+
validation_expectation: dict[str, str], currently implemented expectations.
128+
Returns:
129+
bool, True, if required is the only rule, false if it is not.
130+
"""
131+
# convert rule to lowercase to ensure punctuation does not throw off determination.
132+
rule = rule.lower()
133+
134+
# If required is not in the rule, it cant be the only rule, return False
135+
if "required" not in rule:
136+
return False
137+
138+
# If the entire rule is just 'required' then it is easily determined to be the only rule
139+
if rule == "required":
140+
return True
141+
142+
# Try to find an expectation rule in the rule, if there is one there log it and
143+
# continue
144+
# This function is called as part of an if that is already looking for in house rules
145+
# so don't worry about looking for them.
146+
rule_parts = rule.split(" ")
147+
for idx, rule_part in enumerate(rule_parts):
148+
if rule_part in validation_expectation:
149+
return False
150+
151+
# identify then remove all rule modifiers, all that should be left is required in the
152+
# case that someone used a standard modifier with required
153+
idx_to_remove = []
154+
if "required" in rule_parts:
155+
for idx, rule_part in enumerate(rule_parts):
156+
if rule_part in rule_modifiers:
157+
idx_to_remove.append(idx)
158+
159+
if idx_to_remove:
160+
for idx in sorted(idx_to_remove, reverse=True):
161+
del rule_parts[idx]
162+
163+
# In this case, rule modifiers have been added to required. This is not the expected use
164+
# so log a warning, but let user proceed.
165+
if rule_parts == ["required"]:
166+
warning_message = " ".join(
167+
[
168+
f"For Attribute: {attribute}, it looks like required was set as a single rule,"
169+
"with modifiers attached.",
170+
"Rule modifiers do not work in conjunction with the required validation rule.",
171+
"Please reformat your rule.",
172+
]
173+
)
174+
logger.warning(warning_message)
175+
return True
176+
177+
# Return false if no other condition has been met. In this case if the rule is not a real
178+
# rule an error will be raised from the containing function.
179+
return False
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
Attribute,Description,Valid Values,DependsOn,Properties,Required,Parent,DependsOn Component,Source,Validation Rules
2+
Component,,,,,TRUE,,,,
3+
Patient,,,"Patient ID, Sex, Year of Birth, Diagnosis, Component",,FALSE,DataType,,,
4+
Patient ID,,,,,FALSE,DataProperty,,,#Patient unique warning^^#Biospecimen unique required error
5+
Sex,,"Female, Male, Other",,,TRUE,DataProperty,,,
6+
Year of Birth,,,,,FALSE,DataProperty,,,
7+
Diagnosis,,"Healthy, Cancer",,,TRUE,DataProperty,,,
8+
Cancer,,,"Cancer Type, Family History",,FALSE,ValidValue,,,
9+
Cancer Type,,"Breast, Colorectal, Lung, Prostate, Skin",,,TRUE,DataProperty,,,
10+
Family History,,"Breast, Colorectal, Lung, Prostate, Skin",,,TRUE,DataProperty,,,list strict
11+
Biospecimen,,,"Sample ID, Patient ID, Tissue Status, Component",,FALSE,DataType,Patient,,
12+
Sample ID,,,,,TRUE,DataProperty,,,
13+
Tissue Status,,"Healthy, Malignant",,,TRUE,DataProperty,,,
14+
Bulk RNA-seq Assay,,,"Filename, Sample ID, File Format, Component",,FALSE,DataType,Biospecimen,,
15+
Filename,,,,,TRUE,DataProperty,,,
16+
File Format,,"FASTQ, BAM, CRAM, CSV/TSV",,,FALSE,DataProperty,,,^^#BulkRNA-seqAssay required
17+
BAM,,,Genome Build,,FALSE,ValidValue,,,
18+
CRAM,,,"Genome Build, Genome FASTA",,FALSE,ValidValue,,,
19+
CSV/TSV,,,Genome Build,,FALSE,ValidValue,,,
20+
Genome Build,,"GRCh37, GRCh38, GRCm38, GRCm39",,,TRUE,DataProperty,,,
21+
Genome FASTA,,,,,TRUE,DataProperty,,,

0 commit comments

Comments
 (0)