Skip to content

Commit 04a0965

Browse files
authored
[Schematic-294] Add pattern regex rule to jsonschemas (#1608)
* refactored DataModelJSONSchema into DataModelJSONSchema2 * refactored DataModelJSONSchema into DataModelJSONSchema2 * fix test * added minimum, maximum, and description to schemas * formatting changes * ran vlack * fix types that do not work with python3.9 * fix types that do not work with python3.9 * ran black * added test files * remove test files form git control * improve test * git rm test files * renamed class * improved tests * added two componets to JSON Schema test * added two componets to JSON Schema test * renamed test files * git rm test files * component generator now uses new schema generator class * removed description adding from component generator * fix formatting * added missing end of file lines * move test files * sonarcloud fixes * fix types not supported in python3.9 * fix bad path * remove uneeded test * synapse does not allow the type keyword to be an array * add integration test * fix formatting * add tests to cover missing coverage * added title keyword to all types in an anyOf list * regenerated expected test files * fix end of lines * change integration test to use Synapse fixture * Linglings suggestions * removed unused import * JSON schemas not written with display name * change schemas to not use display name * add integration test for validation schemas in Synapse * added json schema validation tests * added missing docstring * moved instances to files * fix bug in schema generation * fix mypy issues * cleaned up test functionality into fixtures * add test case * various improvements * integration tests now create schemas instead of using expected ones * fixed bug where conditiuonal dependencies where using display names * added additional schema validation tests * fixed bug in json_files_equal * files formatted * improved module level docstring * move function to method of PropertyData * improve PropertyData docstring * improvements to JSONSchema class * fix argument name * improved NodeProcessor docstring * ran pre-commit * 
fixed line in docstring * remove uneeded function * fix accidental commit * fix incorrect type is test * Add WHEN/GIVEN/THEN pattern to some tests * ran pre-commit * added validation rules to exception message * added validation rules to exception message * cleaned up comments, docstring and vareiable name * imrpive docstring * ran pre-commit * Linglings suggestions * ran pre-commit * ran pylint * added tests for _write_data_model * added unit test for DataModelGraphExplorer.get_adjacent_nodes_by_relationship * revert json_files_equal back to orginal state * created a metadata_model fixture in conftest * ran pre-commit * change name of output directory, possible collision with other tests * refactored JSONSchemaGenerator class into function * JSONSchemaComponentGnerator now using crate_json_schema function * ran pre-commit * can now write using display names * added attributes to example model * moved functioanlity for creating nodes to the Node class * moved functioanlity for creating nodes to the Node class * cleaned up formatting * remove dmr from classes * clean up tests * use new display names parameter * run pre-commit * move module-scoped synapse to conftest * remove unused imports * add more detail to test id * add ids to paramatized tests * fix malformed test ids * ran pre-commit * removed after yield teardowns with finalizers * ran pre-commit * regex validation rule is now translated to pattern keyword * update expected file * cleanup * cleanup tests * cleanup tests * changed strings into enum classes * moved enums and constants to constants file * ran precommit * handle merge * Linglings suggestions * moved code involving validation rules oout of pont_init and into helper function * added test * add pylint diable line * ran black
1 parent 15196c6 commit 04a0965

File tree

7 files changed

+383
-121
lines changed

7 files changed

+383
-121
lines changed

schematic/schemas/constants.py

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
"""A module for constants and enums to be shared across the schemas module"""
2+
3+
from enum import Enum
4+
5+
6+
class ValidationRule(Enum):
    """Names of the data model validation rules that affect JSON Schema creation

    Each value is the rule name as it is written in a validation rule string.
    """

    # Rules that are translated into JSON Schema keywords
    REGEX = "regex"
    IN_RANGE = "inRange"

    # Type rules, each one has a JSON Schema type equivalent
    STR = "str"
    FLOAT = "float"
    INT = "int"
    BOOL = "bool"
    NUM = "num"
16+
17+
18+
class JSONSchemaType(Enum):
    """The values allowed for the type keyword of a JSON Schema in a data model"""

    # Textual type
    STRING = "string"

    # Numeric types
    NUMBER = "number"
    INTEGER = "integer"

    # Logical type
    BOOLEAN = "boolean"
25+
26+
27+
class RegexModule(Enum):
    """The modules that are allowed as a parameter of the regex validation rule"""

    # Each value is the module name as written in the regex rule
    SEARCH = "search"
    MATCH = "match"
32+
33+
34+
# Maps the name of each type validation rule to the JSON Schema type it is written as.
# Both "num" and "float" become the JSON Schema "number" type.
TYPE_RULES: dict[str, str] = {
    validation_rule.value: schema_type.value
    for validation_rule, schema_type in (
        (ValidationRule.STR, JSONSchemaType.STRING),
        (ValidationRule.NUM, JSONSchemaType.NUMBER),
        (ValidationRule.FLOAT, JSONSchemaType.NUMBER),
        (ValidationRule.INT, JSONSchemaType.INTEGER),
        (ValidationRule.BOOL, JSONSchemaType.BOOLEAN),
    )
}

schematic/schemas/create_json_schema.py

Lines changed: 155 additions & 66 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@
66
The JSONSchema class is used to store all the data needed to write the final JSON Schema
77
"""
88

9+
# pylint: disable=too-many-lines
10+
911
import logging
1012
import os
1113
from typing import Union, Any, Optional
@@ -14,20 +16,17 @@
1416
from schematic.schemas.data_model_graph import DataModelGraphExplorer
1517
from schematic.utils.schema_utils import get_json_schema_log_file_path
1618
from schematic.utils.validate_utils import rule_in_rule_list
17-
1819
from schematic.utils.io_utils import export_json
20+
from schematic.schemas.constants import (
21+
ValidationRule,
22+
JSONSchemaType,
23+
RegexModule,
24+
TYPE_RULES,
25+
)
1926

2027

2128
logger = logging.getLogger(__name__)
2229

23-
# A dict where the keys are type validation rules, and the values are their JSON Schema equivalent
24-
TYPE_RULES = {
25-
"str": "string",
26-
"num": "number",
27-
"float": "number",
28-
"int": "integer",
29-
"bool": "boolean",
30-
}
3130

3231
# Complex types
3332
Items = dict[str, Union[str, float, list[str]]]
@@ -147,6 +146,7 @@ class Node: # pylint: disable=too-many-instance-attributes
147146
is_array: Whether or not the property is an array (inferred from validation_rules)
148147
minimum: The minimum value of the property (if numeric) (inferred from validation_rules)
149148
maximum: The maximum value of the property (if numeric) (inferred from validation_rules)
149+
pattern: The regex pattern of the property (if a string) (inferred from validation_rules)
150150
"""
151151

152152
name: str
@@ -162,14 +162,11 @@ class Node: # pylint: disable=too-many-instance-attributes
162162
is_array: bool = field(init=False)
163163
minimum: Optional[float] = field(init=False)
164164
maximum: Optional[float] = field(init=False)
165+
pattern: Optional[str] = field(init=False)
165166

166167
def __post_init__(self) -> None:
167168
"""
168169
Uses the dmge to fill in most of the fields of the dataclass
169-
170-
Raises:
171-
ValueError: If the type is not numeric, and there is an
172-
inRange rule in the validation rules
173170
"""
174171
self.display_name = self.dmge.get_nodes_display_names([self.name])[0]
175172
self.valid_values = sorted(self.dmge.get_node_range(node_label=self.name))
@@ -193,32 +190,83 @@ def __post_init__(self) -> None:
193190
node_display_name=self.display_name
194191
)
195192

196-
self.type = None
197-
self.is_array = False
198-
self.minimum = None
199-
self.maximum = None
193+
(
194+
self.type,
195+
self.is_array,
196+
self.minimum,
197+
self.maximum,
198+
self.pattern,
199+
) = _get_validation_rule_based_fields(validation_rules)
200200

201-
if validation_rules:
202-
if rule_in_rule_list("list", validation_rules):
203-
self.is_array = True
204201

205-
type_rule = _get_type_rule_from_rule_list(validation_rules)
206-
if type_rule:
207-
self.type = TYPE_RULES.get(type_rule)
202+
def _get_validation_rule_based_fields(
    validation_rules: list[str],
) -> tuple[Optional[str], bool, Optional[float], Optional[float], Optional[str]]:
    """
    Gets the fields for the Node class that are based on the validation rules

    Args:
        validation_rules: A list of validation rules

    Raises:
        ValueError: If both the inRange and regex rule are present
        ValueError: If the inRange rule is present along with a type validation rule
          whose JSON Schema type is not 'number' or 'integer'
        ValueError: If the regex rule is present along with a type validation rule
          whose JSON Schema type is not 'string'

    Returns:
        A tuple containing the type, is_array, minimum, maximum, and pattern fields for
          a Node object
    """
    prop_type: Optional[str] = None
    is_array = False
    minimum: Optional[float] = None
    maximum: Optional[float] = None
    pattern: Optional[str] = None

    # Without any validation rules every field keeps its default
    if not validation_rules:
        return (prop_type, is_array, minimum, maximum, pattern)

    if rule_in_rule_list("list", validation_rules):
        is_array = True

    type_rule = _get_type_rule_from_rule_list(validation_rules)
    if type_rule:
        prop_type = TYPE_RULES.get(type_rule)

    regex_rule = _get_rule_from_rule_list(ValidationRule.REGEX, validation_rules)
    range_rule = _get_rule_from_rule_list(ValidationRule.IN_RANGE, validation_rules)
    if range_rule and regex_rule:
        # Build a single message string, passing the rules as a second argument
        # would make the exception text render as a tuple
        msg = f"regex and inRange rules are incompatible: {validation_rules}"
        raise ValueError(msg)

    if range_rule:
        numeric_types = (
            JSONSchemaType.NUMBER.value,
            JSONSchemaType.INTEGER.value,
        )
        if prop_type is not None and prop_type not in numeric_types:
            raise ValueError(
                "Validation rules must be either 'int' or 'num' when using the inRange rule"
            )
        # An inRange rule without a type rule implies a number
        prop_type = prop_type or JSONSchemaType.NUMBER.value
        minimum, maximum = _get_range_from_in_range_rule(range_rule)

    if regex_rule:
        if prop_type not in (None, JSONSchemaType.STRING.value):
            raise ValueError("Type must be 'string' when using a regex rule")
        # A regex rule without a type rule implies a string
        prop_type = JSONSchemaType.STRING.value
        pattern = _get_pattern_from_regex_rule(regex_rule)

    return (prop_type, is_array, minimum, maximum, pattern)
220267

221-
def _get_ranges_from_range_rule(
268+
269+
def _get_range_from_in_range_rule(
222270
rule: str,
223271
) -> tuple[Optional[float], Optional[float]]:
224272
"""
@@ -230,8 +278,8 @@ def _get_ranges_from_range_rule(
230278
Returns:
231279
The min and max from the rule
232280
"""
233-
range_min: Union[float, None] = None
234-
range_max: Union[float, None] = None
281+
range_min: Optional[float] = None
282+
range_max: Optional[float] = None
235283
parameters = rule.split(" ")
236284
if len(parameters) > 1 and parameters[1].isnumeric():
237285
range_min = float(parameters[1])
@@ -240,28 +288,28 @@ def _get_ranges_from_range_rule(
240288
return (range_min, range_max)
241289

242290

243-
def _get_in_range_rule_from_rule_list(rule_list: list[str]) -> Optional[str]:
244-
"""
245-
Returns the inRange rule from a list of rules if there is only one
246-
Returns None if there are no inRange rules
291+
def _get_pattern_from_regex_rule(rule: str) -> Optional[str]:
    """Gets the JSON Schema pattern from the regex rule

    The rule is split on spaces and must have exactly three parameters:
    the rule name, the module, and the pattern.

    Arguments:
        rule: The full regex rule

    Returns:
        If the module parameter is search or match, and the pattern parameter exists
          the pattern is returned. For the match module the pattern is anchored to
          the beginning of the string.
        Otherwise None
    """
    parameters = rule.split(" ")
    if len(parameters) != 3:
        return None
    _, module, pattern = parameters

    # Do not translate other modules
    if module not in {item.value for item in RegexModule}:
        return None

    # The JSON Schema pattern keyword already behaves like search
    if module != RegexModule.MATCH.value:
        return pattern

    # Match is just search but only at the beginning of the string.
    # A bare "^" prefix binds to the first alternative only ("^a|b" matches "b"
    # anywhere), so a pattern with an alternation is grouped before it is anchored.
    if "|" in pattern:
        return f"^(?:{pattern})"
    if pattern.startswith("^"):
        return pattern
    return f"^{pattern}"
265313

266314

267315
def _get_type_rule_from_rule_list(rule_list: list[str]) -> Optional[str]:
@@ -279,14 +327,42 @@ def _get_type_rule_from_rule_list(rule_list: list[str]) -> Optional[str]:
279327
The type rule if one is found, or None
280328
"""
281329
rule_list = [rule.split(" ")[0] for rule in rule_list]
282-
type_rules = [rule for rule in rule_list if rule in TYPE_RULES]
283-
if len(type_rules) > 1:
330+
rule_list = [rule for rule in rule_list if rule in TYPE_RULES]
331+
if len(rule_list) > 1:
284332
raise ValueError(
285333
"Found more than one type rule in validation rules: ", rule_list
286334
)
287-
if len(type_rules) == 0:
335+
if len(rule_list) == 0:
288336
return None
289-
return type_rules[0]
337+
return rule_list[0]
338+
339+
340+
def _get_rule_from_rule_list(
    rule: ValidationRule, rule_list: list[str]
) -> Optional[str]:
    """
    Returns the rule of the given kind from a list of rules if there is only one

    A rule in the list matches when its name, the text before the first space,
    is equal to the value of the ValidationRule. Comparing the whole name means a
    different rule that only begins with the same characters is not picked up.

    Arguments:
        rule: A ValidationRule enum
        rule_list: A list of validation rules

    Raises:
        ValueError: When more than one of the rule is found

    Returns:
        The rule if one is found, otherwise None is returned
    """
    rule_value = rule.value
    matching_rules = [
        item for item in rule_list if item.split(" ")[0] == rule_value
    ]
    if len(matching_rules) > 1:
        msg = (
            f"Found more than one '{rule_value}' rule in validation rules: "
            f"{matching_rules}"
        )
        raise ValueError(msg)
    if not matching_rules:
        return None
    return matching_rules[0]
290366

291367

292368
@dataclass
@@ -501,7 +577,7 @@ def create_json_schema( # pylint: disable=too-many-arguments
501577
datatype: str,
502578
schema_name: str,
503579
write_schema: bool = True,
504-
schema_path: Union[str, None] = None,
580+
schema_path: Optional[str] = None,
505581
jsonld_path: Optional[str] = None,
506582
use_property_display_names: bool = True,
507583
use_valid_value_display_names: bool = True,
@@ -539,7 +615,6 @@ def create_json_schema( # pylint: disable=too-many-arguments
539615
Returns:
540616
JSON Schema as a dictionary.
541617
"""
542-
logger.info("Starting to create JSON Schema for %s", datatype)
543618
graph_state = GraphTraversalState(dmge, datatype)
544619

545620
json_schema = JSONSchema(
@@ -640,6 +715,11 @@ def _write_data_model(
640715
json_schema_dirname = os.path.dirname(json_schema_path)
641716
if json_schema_dirname != "":
642717
os.makedirs(json_schema_dirname, exist_ok=True)
718+
719+
logger.info(
720+
"The JSON schema file can be inspected by setting the following "
721+
"nested key in the configuration: (model > location)."
722+
)
643723
else:
644724
raise ValueError(
645725
"Either schema_path or both name and jsonld_path must be provided."
@@ -832,10 +912,7 @@ def _create_array_property(node: Node) -> Property:
832912
items: Items = {}
833913
if node.type:
834914
items["type"] = node.type
835-
if node.minimum is not None:
836-
items["minimum"] = node.minimum
837-
if node.maximum is not None:
838-
items["maximum"] = node.maximum
915+
_set_type_specific_keywords(items, node)
839916

840917
array_type_dict: TypeDict = {"type": "array", "title": "array"}
841918
null_type_dict: TypeDict = {"type": "null", "title": "null"}
@@ -916,9 +993,21 @@ def _create_simple_property(node: Node) -> Property:
916993
elif node.is_required:
917994
prop["not"] = {"type": "null"}
918995

919-
if node.minimum is not None:
920-
prop["minimum"] = node.minimum
921-
if node.maximum is not None:
922-
prop["maximum"] = node.maximum
996+
_set_type_specific_keywords(prop, node)
923997

924998
return prop
999+
1000+
1001+
def _set_type_specific_keywords(schema: dict[str, Any], node: Node) -> None:
    """Copies the type specific JSON Schema keywords from a node onto a schema

    The keywords are minimum, maximum and pattern. A keyword is only set when the
    matching attribute of the node is not None, and the schema is changed in place.

    Arguments:
        schema: The schema to set keywords on
        node (Node): The node that corresponds to the property which is being set
          in the JSON Schema
    """
    # Each keyword has the same name as the Node attribute it comes from
    for keyword in ("minimum", "maximum", "pattern"):
        value = getattr(node, keyword)
        if value is not None:
            schema[keyword] = value

0 commit comments

Comments
 (0)