diff --git a/src/check_datapackage/check.py b/src/check_datapackage/check.py index 843e2ccd..affb206f 100644 --- a/src/check_datapackage/check.py +++ b/src/check_datapackage/check.py @@ -153,7 +153,17 @@ def _check_keys(properties: dict[str, Any], issues: list[Issue]) -> list[Issue]: key_issues = _flat_map(resources_with_pk, _check_primary_key) # Foreign keys - + resources_with_fk = _get_fields_at_jsonpath( + "$.resources[?(length(@.schema.foreignKeys) > 0)]", + properties, + ) + resources_with_fk = _keep_resources_with_no_issue_at_property( + resources_with_fk, issues, "schema.foreignKeys" + ) + key_issues += _flat_map( + resources_with_fk, + lambda resource: _check_foreign_keys(resource, properties), + ) return key_issues @@ -197,6 +207,35 @@ def _check_primary_key(resource: PropertyField) -> list[Issue]: ] +def _check_foreign_keys( + resource: PropertyField, properties: dict[str, Any] +) -> list[Issue]: + """Check that foreign key source and destination fields exist.""" + # Safe, as only FKs of the correct type here + foreign_keys = cast( + list[dict[str, Any]], resolve("/schema/foreignKeys", resource.value) + ) + foreign_keys_diff_resource = _filter( + foreign_keys, + lambda fk: "resource" in fk["reference"] and fk["reference"]["resource"] != "", + ) + foreign_keys_same_resource = _filter( + foreign_keys, lambda fk: fk not in foreign_keys_diff_resource + ) + + issues = _flat_map(foreign_keys, lambda fk: _check_fk_source_fields(fk, resource)) + issues += _flat_map( + foreign_keys_same_resource, + lambda fk: _check_fk_dest_fields_same_resource(fk, resource), + ) + issues += _flat_map( + foreign_keys_diff_resource, + lambda fk: _check_fk_dest_fields_diff_resource(fk, resource, properties), + ) + + return issues + + def _key_fields_as_str_list(key_fields: Any) -> list[str]: """Returns the list representation of primary and foreign key fields. @@ -220,6 +259,109 @@ def _get_unknown_key_fields( return ", ".join(unknown_fields) +def _check_fk_source_fields( + foreign_key: dict[str, Any], resource: PropertyField +) -> list[Issue]: + """Check that foreign key source fields exist and have the correct number.""" + issues = [] + source_fields = resolve("/fields", foreign_key) + source_field_list = _key_fields_as_str_list(source_fields) + unknown_fields = _get_unknown_key_fields(source_field_list, resource.value) + if unknown_fields: + issues.append( + Issue( + jsonpath=f"{resource.jsonpath}.schema.foreignKeys.fields", + type="foreign-key-source-fields", + message=( + "No fields found in resource for foreign key source fields: " + f"{unknown_fields}." + ), + instance=source_fields, + ) + ) + + dest_fields = _key_fields_as_str_list(resolve("/reference/fields", foreign_key)) + if len(source_field_list) != len(dest_fields): + issues.append( + Issue( + jsonpath=f"{resource.jsonpath}.schema.foreignKeys.fields", + type="foreign-key-source-fields", + message=( + "The number of foreign key source fields must be the same as " + "the number of foreign key destination fields." + ), + instance=source_fields, + ) + ) + return issues + + +def _check_fk_dest_fields_same_resource( + foreign_key: dict[str, Any], + resource: PropertyField, +) -> list[Issue]: + """Check that foreign key destination fields exist on the same resource.""" + dest_fields = resolve("/reference/fields", foreign_key) + dest_field_list = _key_fields_as_str_list(dest_fields) + unknown_fields = _get_unknown_key_fields(dest_field_list, resource.value) + if not unknown_fields: + return [] + + return [ + Issue( + jsonpath=f"{resource.jsonpath}.schema.foreignKeys.reference.fields", + type="foreign-key-destination-fields", + message=( + "No fields found in resource for foreign key " + f"destination fields: {unknown_fields}." + ), + instance=dest_fields, + ) + ] + + +def _check_fk_dest_fields_diff_resource( + foreign_key: dict[str, Any], resource: PropertyField, properties: dict[str, Any] +) -> list[Issue]: + """Check that foreign key destination fields exist on the destination resource.""" + dest_fields = resolve("/reference/fields", foreign_key) + dest_field_list = _key_fields_as_str_list(dest_fields) + # Safe, as only keys of the correct type here + dest_resource_name = cast(str, resolve("/reference/resource", foreign_key)) + + dest_resource_path = f"resources[?(@.name == '{dest_resource_name}')]" + if not findall(dest_resource_path, properties): + return [ + Issue( + jsonpath=f"{resource.jsonpath}.schema.foreignKeys.reference.resource", + type="foreign-key-destination-resource", + message=( + f"The destination resource {dest_resource_name!r} of this foreign " + "key doesn't exist in the package." + ), + instance=dest_resource_name, + ) + ] + + unknown_fields = _get_unknown_key_fields( + dest_field_list, properties, f"{dest_resource_path}." + ) + if not unknown_fields: + return [] + + return [ + Issue( + jsonpath=f"{resource.jsonpath}.schema.foreignKeys.reference.fields", + type="foreign-key-destination-fields", + message=( + f"No fields found in destination resource {dest_resource_name!r} " + f"for foreign key destination fields: {unknown_fields}." + ), + instance=dest_fields, + ) + ] + + def _set_should_fields_to_required(schema: dict[str, Any]) -> dict[str, Any]: """Set 'SHOULD' fields to 'REQUIRED' in the schema.""" should_fields = ("name", "id", "licenses") diff --git a/tests/test_check.py b/tests/test_check.py index 14388ee3..a59257e5 100644 --- a/tests/test_check.py +++ b/tests/test_check.py @@ -1,6 +1,6 @@ from typing import Any -from pytest import mark, raises +from pytest import fixture, mark, raises from check_datapackage.check import DataPackageError, check from check_datapackage.config import Config @@ -11,7 +11,7 @@ ) from check_datapackage.exclusion import Exclusion from check_datapackage.extensions import Extensions, RequiredCheck -from check_datapackage.internals import _map +from check_datapackage.internals import _filter, _map from tests.test_extensions import lowercase_check # "MUST" checks @@ -145,6 +145,265 @@ def test_do_not_check_primary_key_against_bad_field(): assert issues[0].type != "primary-key" +def test_pass_good_foreign_keys_same_resource(): + properties = example_package_properties() + properties["resources"][0]["schema"]["fields"].extend( + [ + {"name": "id", "type": "integer"}, + {"name": "best_friend_id", "type": "integer"}, + ] + ) + properties["resources"][0]["schema"]["foreignKeys"] = [ + {"fields": ["best_friend_id"], "reference": {"fields": ["id"]}}, + {"fields": "best_friend_id", "reference": {"fields": "id"}}, + { + "fields": "best_friend_id", + "reference": {"resource": "", "fields": "id"}, + }, + ] + + issues = check(properties) + + assert issues == [] + + +@fixture +def properties_fk() -> dict[str, Any]: + return { + "resources": [ + { + "name": "animals", + "path": "resources/animals/data.parquet", + "schema": { + "fields": [ + {"name": "id", "type": "integer"}, + {"name": "toy_id", "type": "integer"}, + ], + "foreignKeys": [ + { + "fields": "toy_id", + "reference": {"resource": "toys", "fields": "id"}, + }, + { + "fields": ["toy_id"], + "reference": {"resource": "toys", "fields": ["id"]}, + }, + ], + }, + }, + { + "name": "toys", + "path": "resources/toys/data.parquet", + "schema": { + "fields": [ + {"name": "id", "type": "integer"}, + {"name": "material", "type": "string"}, + ] + }, + }, + ] + } + + +def test_pass_good_foreign_keys_different_resource(properties_fk): + issues = check(properties_fk) + + assert issues == [] + + +def test_fail_mismatched_foreign_key_fields(): + properties = example_package_properties() + properties["resources"][0]["schema"]["fields"].extend( + [ + {"name": "known_field", "type": "integer"}, + {"name": "known_field2", "type": "integer"}, + ] + ) + properties["resources"][0]["schema"]["foreignKeys"] = [ + { + # Outer and inner `fields` not same length + "fields": ["known_field"], + "reference": {"fields": ["known_field", "known_field2"]}, + }, + ] + + issues = check(properties) + + assert len(issues) == 1 + assert issues[0].jsonpath == "$.resources[0].schema.foreignKeys.fields" + assert issues[0].type == "foreign-key-source-fields" + assert issues[0].instance == ["known_field"] + + +@mark.parametrize( + "source_fields, dest_fields", + [ + ("", "known_field"), + ("unknown_field", "known_field"), + (["unknown", "field"], ["known_field", "known_field2"]), + ], +) +def test_fail_bad_foreign_key_source_fields(source_fields, dest_fields): + properties = example_package_properties() + properties["resources"][0]["schema"]["fields"].extend( + [ + {"name": "known_field", "type": "integer"}, + {"name": "known_field2", "type": "integer"}, + ] + ) + properties["resources"][0]["schema"]["foreignKeys"] = [ + {"fields": source_fields, "reference": {"fields": dest_fields}}, + ] + + issues = check(properties) + + assert len(issues) == 1 + assert issues[0].jsonpath == "$.resources[0].schema.foreignKeys.fields" + assert issues[0].type == "foreign-key-source-fields" + assert issues[0].instance == source_fields + + +@mark.parametrize( + "source_fields, dest_fields", + [ + ("known_field", ""), + ("known_field", "unknown_field"), + (["known_field", "known_field2"], ["unknown", "field"]), + ], +) +def test_fail_bad_foreign_key_destination_fields_same_resource( + source_fields, dest_fields +): + properties = example_package_properties() + properties["resources"][0]["schema"]["fields"].extend( + [ + {"name": "known_field", "type": "integer"}, + {"name": "known_field2", "type": "integer"}, + ] + ) + properties["resources"][0]["schema"]["foreignKeys"] = [ + {"fields": source_fields, "reference": {"fields": dest_fields}}, + ] + + issues = check(properties) + + assert len(issues) == 1 + assert issues[0].jsonpath == "$.resources[0].schema.foreignKeys.reference.fields" + assert issues[0].type == "foreign-key-destination-fields" + assert issues[0].instance == dest_fields + + +def test_do_not_check_bad_foreign_keys_against_fields_same_resource(): + properties = example_package_properties() + properties["resources"][0]["schema"]["foreignKeys"] = [ + 123, + {}, + {"fields": None, "reference": {"fields": "unknown_field"}}, + {"fields": 123, "reference": {"fields": "unknown_field"}}, + {"fields": "unknown_field", "reference": {"fields": []}}, + { + "fields": [123, "unknown_field"], + "reference": {"fields": [123, "unknown_field"]}, + }, + { + "fields": "unknown_field", + "reference": {"resource": 123, "fields": "unknown_field"}, + }, + ] + + issues = check(properties) + + assert not _filter(issues, lambda issue: "foreign-key" in issue.type) + + +def test_do_not_check_foreign_keys_against_bad_field_same_resource(): + properties = example_package_properties() + properties["resources"][0]["schema"]["fields"].append( + # Bad name + {"name": 123, "type": "integer"}, + ) + properties["resources"][0]["schema"]["foreignKeys"] = [ + {"fields": "eye-colour", "reference": {"fields": "eye-colour"}}, + ] + + issues = check(properties) + + assert len(issues) == 1 + assert issues[0].type != "foreign-key" + + +def test_fail_foreign_key_with_unknown_destination_resource(properties_fk): + properties_fk["resources"][0]["schema"]["foreignKeys"].append( + { + "fields": ["toy_id"], + "reference": {"resource": "unknown", "fields": ["id"]}, + } + ) + + issues = check(properties_fk) + + assert len(issues) == 1 + assert issues[0].jsonpath == "$.resources[0].schema.foreignKeys.reference.resource" + assert issues[0].type == "foreign-key-destination-resource" + assert issues[0].instance == "unknown" + + +@mark.parametrize( + "source_fields, dest_fields", + [ + ("toy_id", ""), + ("toy_id", "unknown_field"), + (["toy_id", "id"], ["unknown", "field"]), + ], +) +def test_fail_bad_foreign_key_destination_fields_different_resource( + source_fields, dest_fields, properties_fk +): + properties_fk["resources"][0]["schema"]["foreignKeys"] = [ + { + "fields": source_fields, + "reference": {"resource": "toys", "fields": dest_fields}, + }, + ] + + issues = check(properties_fk) + + assert len(issues) == 1 + assert issues[0].jsonpath == "$.resources[0].schema.foreignKeys.reference.fields" + assert issues[0].type == "foreign-key-destination-fields" + assert issues[0].instance == dest_fields + + +def test_do_not_check_bad_foreign_keys_against_fields_different_resource(properties_fk): + properties_fk["resources"][0]["schema"]["foreignKeys"] = [ + 123, + {}, + {"fields": None, "reference": {"resource": "toys", "fields": "unknown_field"}}, + {"fields": 123, "reference": {"resource": "toys", "fields": "unknown_field"}}, + {"fields": "unknown_field", "reference": {"resource": "toys", "fields": []}}, + { + "fields": [123, "unknown_field"], + "reference": {"resource": "toys", "fields": [123, "unknown_field"]}, + }, + ] + + issues = check(properties_fk) + + assert not _filter(issues, lambda issue: "foreign-key" in issue.type) + + +def test_do_not_check_foreign_keys_against_bad_field_different_resource(properties_fk): + properties_fk["resources"][0]["schema"]["fields"].append( + # Bad name + {"name": 123, "type": "integer"}, + ) + + issues = check(properties_fk) + + assert len(issues) == 1 + assert issues[0].type != "foreign-key" + + # "SHOULD" checks @@ -494,30 +753,6 @@ def test_fail_field_with_mixed_type_enum_constraint(): assert issues[0].jsonpath == "$.resources[0].schema.fields[0].constraints.enum" -def test_pass_good_foreign_keys(): - properties = example_package_properties() - properties["resources"][0]["schema"]["foreignKeys"] = [ - { - "fields": "purchase", - "reference": { - "resource": "purchases", - "fields": "purchase_id", - }, - }, - { - "fields": ["first_name", "last_name"], - "reference": { - "resource": "customers", - "fields": ["first_name", "last_name"], - }, - }, - ] - - issues = check(properties) - - assert issues == [] - - def test_fail_foreign_keys_of_bad_type(): properties = example_package_properties() properties["resources"][0]["schema"]["foreignKeys"] = 123