Skip to content

Commit de8ed07

Browse files
feat: ✨ check primary key exists (#218)
# Description This PR implements the MUST constraint that primary key fields must exist (i.e. they must be fields on the same resource). Closes #216 Needs an in-depth review. ## Checklist - [x] Formatted Markdown - [x] Ran `just run-all` --------- Co-authored-by: Luke W. Johnston <[email protected]>
1 parent 6bf2abd commit de8ed07

File tree

2 files changed

+139
-11
lines changed

2 files changed

+139
-11
lines changed

src/check_datapackage/check.py

Lines changed: 85 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,9 @@
33
from dataclasses import dataclass, field
44
from functools import reduce
55
from types import TracebackType
6-
from typing import Any, Callable, Iterator, Optional
6+
from typing import Any, Callable, Iterator, Optional, cast
77

8+
from jsonpath import findall, resolve
89
from jsonschema import Draft7Validator, FormatChecker, ValidationError
910

1011
from check_datapackage.config import Config
@@ -16,8 +17,10 @@
1617
from check_datapackage.exclusion import exclude
1718
from check_datapackage.extensions import apply_extensions
1819
from check_datapackage.internals import (
20+
PropertyField,
1921
_filter,
2022
_flat_map,
23+
_get_fields_at_jsonpath,
2124
_map,
2225
)
2326
from check_datapackage.issue import Issue
@@ -126,6 +129,7 @@ class for more details, especially about the default values.
126129
_set_should_fields_to_required(schema)
127130

128131
issues = _check_object_against_json_schema(properties, schema)
132+
issues += _check_keys(properties, issues)
129133
issues += apply_extensions(properties, config.extensions)
130134
issues = exclude(issues, config.exclusions, properties)
131135
issues = sorted(set(issues))
@@ -136,6 +140,86 @@ class for more details, especially about the default values.
136140
return issues
137141

138142

143+
def _check_keys(properties: dict[str, Any], issues: list[Issue]) -> list[Issue]:
144+
"""Check that primary and foreign keys exist."""
145+
# Primary keys
146+
resources_with_pk = _get_fields_at_jsonpath(
147+
"$.resources[?(length(@.schema.primaryKey) > 0 || @.schema.primaryKey == '')]",
148+
properties,
149+
)
150+
resources_with_pk = _keep_resources_with_no_issue_at_property(
151+
resources_with_pk, issues, "schema.primaryKey"
152+
)
153+
key_issues = _flat_map(resources_with_pk, _check_primary_key)
154+
155+
# Foreign keys
156+
157+
return key_issues
158+
159+
160+
def _issues_at_property(
161+
resource: PropertyField, issues: list[Issue], jsonpath: str
162+
) -> list[Issue]:
163+
return _filter(
164+
issues,
165+
lambda issue: f"{resource.jsonpath}.{jsonpath}" in issue.jsonpath,
166+
)
167+
168+
169+
def _keep_resources_with_no_issue_at_property(
170+
resources: list[PropertyField], issues: list[Issue], jsonpath: str
171+
) -> list[PropertyField]:
172+
"""Filter out resources that have an issue at or under the given `jsonpath`."""
173+
return _filter(
174+
resources,
175+
lambda resource: not _issues_at_property(resource, issues, jsonpath),
176+
)
177+
178+
179+
def _check_primary_key(resource: PropertyField) -> list[Issue]:
180+
"""Check that primary key fields exist in the resource."""
181+
pk_fields = resolve("/schema/primaryKey", resource.value)
182+
pk_fields_list = _key_fields_as_str_list(pk_fields)
183+
unknown_fields = _get_unknown_key_fields(pk_fields_list, resource.value)
184+
185+
if not unknown_fields:
186+
return []
187+
188+
return [
189+
Issue(
190+
jsonpath=f"{resource.jsonpath}.schema.primaryKey",
191+
type="primary-key",
192+
message=(
193+
f"No fields found in resource for primary key fields: {unknown_fields}."
194+
),
195+
instance=pk_fields,
196+
)
197+
]
198+
199+
200+
def _key_fields_as_str_list(key_fields: Any) -> list[str]:
201+
"""Returns the list representation of primary and foreign key fields.
202+
203+
Key fields can be represented either as a string (containing one field name)
204+
or a list of strings.
205+
206+
The input should contain a correctly typed `key_fields` object.
207+
"""
208+
if not isinstance(key_fields, list):
209+
key_fields = [key_fields]
210+
return cast(list[str], key_fields)
211+
212+
213+
def _get_unknown_key_fields(
214+
key_fields: list[str], properties: dict[str, Any], resource_path: str = ""
215+
) -> str:
216+
"""Return the key fields that don't exist on the specified resource."""
217+
known_fields = findall(f"{resource_path}schema.fields[*].name", properties)
218+
unknown_fields = _filter(key_fields, lambda field: field not in known_fields)
219+
unknown_fields = _map(unknown_fields, lambda field: f"{field!r}")
220+
return ", ".join(unknown_fields)
221+
222+
139223
def _set_should_fields_to_required(schema: dict[str, Any]) -> dict[str, Any]:
140224
"""Set 'SHOULD' fields to 'REQUIRED' in the schema."""
141225
should_fields = ("name", "id", "licenses")

tests/test_check.py

Lines changed: 54 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,60 @@ def test_fails_properties_with_pattern_mismatch():
9191
assert issues[0].jsonpath == "$.contributors[0].path"
9292

9393

94+
@mark.parametrize("primary_key", ["id", ["id", "name"]])
95+
def test_pass_good_primary_key(primary_key):
96+
properties = example_package_properties()
97+
properties["resources"][0]["schema"]["primaryKey"] = primary_key
98+
properties["resources"][0]["schema"]["fields"].extend(
99+
[
100+
{"name": "id", "type": "integer"},
101+
{"name": "name", "type": "string"},
102+
]
103+
)
104+
105+
issues = check(properties)
106+
107+
assert issues == []
108+
109+
110+
@mark.parametrize("primary_key", ["", "last_name", ["first_name", "last_name"]])
111+
def test_fail_primary_key_with_unknown_fields(primary_key):
112+
properties = example_package_properties()
113+
properties["resources"][0]["schema"]["primaryKey"] = primary_key
114+
115+
issues = check(properties)
116+
117+
assert len(issues) == 1
118+
assert issues[0].jsonpath == "$.resources[0].schema.primaryKey"
119+
assert issues[0].type == "primary-key"
120+
assert issues[0].instance == primary_key
121+
122+
123+
@mark.parametrize("primary_key", [None, 123, [], [123, "a_field"]])
124+
def test_do_not_check_bad_primary_key_against_fields(primary_key):
125+
properties = example_package_properties()
126+
properties["resources"][0]["schema"]["primaryKey"] = primary_key
127+
128+
issues = check(properties)
129+
130+
assert len(issues) == 1
131+
assert issues[0].type != "primary-key"
132+
133+
134+
def test_do_not_check_primary_key_against_bad_field():
135+
properties = example_package_properties()
136+
properties["resources"][0]["schema"]["primaryKey"] = "eye-colour"
137+
properties["resources"][0]["schema"]["fields"].append(
138+
# Bad name
139+
{"name": 123, "type": "integer"},
140+
)
141+
142+
issues = check(properties)
143+
144+
assert len(issues) == 1
145+
assert issues[0].type != "primary-key"
146+
147+
94148
# "SHOULD" checks
95149

96150

@@ -597,16 +651,6 @@ def test_fail_foreign_keys_with_bad_array_item():
597651
)
598652

599653

600-
@mark.parametrize("primary_key", ["id", ["name", "address"]])
601-
def test_pass_good_primary_key(primary_key):
602-
properties = example_package_properties()
603-
properties["resources"][0]["schema"]["primaryKey"] = primary_key
604-
605-
issues = check(properties)
606-
607-
assert issues == []
608-
609-
610654
def test_fail_primary_key_of_bad_type():
611655
properties = example_package_properties()
612656
properties["resources"][0]["schema"]["primaryKey"] = 123

0 commit comments

Comments
 (0)