Skip to content

Commit 7a5d00c

Browse files
authored
Generate json schema on the implemented metadata model (#465)
2 parents a59d849 + 0496c68 commit 7a5d00c

File tree

3 files changed

+419
-0
lines changed

3 files changed

+419
-0
lines changed
Lines changed: 244 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,244 @@
1+
import dataclasses
2+
import json
3+
import sys
4+
import argparse
5+
from pathlib import Path
6+
7+
import pandas as pd
8+
9+
from schema_inspector import get_initial_schema
10+
11+
# General-purpose ignore list; it is empty and not referenced elsewhere in this script.
IGNORES: list = []

# Classes whose implemented name differs from the conceptual model name.
# Layout: {implemented class name: conceptual model class name}
class_maps: dict[str, str] = {
    "News": "News Item",
    "IndustrialSector": "Business Sector",
    "Turnover": "Company Revenue",
    "NumberOfEmployees": "Company Size",
    "AIoDEntryRead": "AIoD Entry",
}

# Properties whose implemented name differs from the conceptual model name.
# Every name in this mapping (classes and properties alike) must be normalized.
# Layout: {class: {implemented property name: conceptual model property name}}
property_renames_by_class: dict[str, dict[str, str]] = {
    "aiodentry": {
        "datecreated": "entrycreated",
        "datemodified": "modified",
    },
    "airesource": {
        "alternatename": "alternativename",
        "industrialsector": "businesssector",
        "ispartof": "partof",
    },
    "computationalasset": {
        "type": "computationalassettype",
    },
    "educationalresource": {
        "prerequisite": "prerequisiteknowledge",
        "targetaudience": "targetseducationallevel",
    },
    "event": {
        "status": "currenteventstatus",
        "mode": "usesmode",
    },
    "experiment": {
        "executionsettings": "exemplaryexecutionsettings",
    },
    "publication": {
        "isbn": "hasisbn",
        "issn": "hasissn",
    },
    "person": {
        "languages": "language",
    },
    "aiasset": {
        "license": "licence",
    },
    "runnabledistribution": {
        "deploymenttimemilliseconds": "deploymenttimemsec",
    },
}

# Implemented classes that are left out of the comparison because they are
# artefacts of the implementation or serve other parts of the system.
class_ignores: set[str] = {
    # Convenience classes that support implementing the conceptual model
    "Taxonomy",
    "NamedRelation",
    "Text",
    "Body",
    # REST API classes that are unrelated to the conceptual model
    "Bookmark",
    "AIoDEntryCreate",
}

# Implemented properties ending in one of these suffixes are not reported.
property_suffix_ignores: set[str] = {"_id", "_identifier", "__"}
64+
65+
66+
def normalize(string: str) -> str:
    """Return *string* case-folded with every space and underscore removed."""
    drop_separators = str.maketrans("", "", " _")
    return string.casefold().translate(drop_separators)
68+
69+
70+
def main():
    """Parse the command line, load both schemas and print how they differ.

    Exits through ``parser.error`` when the source directory or the conceptual
    model file cannot be found.
    """
    parser = argparse.ArgumentParser(
        description="Compare implementation schema with conceptual model definition."
    )
    parser.add_argument(
        "source_path",
        type=str,
        help="Path to the implementation source directory",
    )
    parser.add_argument(
        "conceptual_model_path",
        type=str,
        help="Path to the model export definition file",
    )
    parser.add_argument(
        "--class",
        "-c",
        dest="class_name",
        type=str,
        help="Optional: Name of a specific class to inspect (as defined in the model)",
    )

    args = parser.parse_args()

    # `is_dir()` / `is_file()` already return False for paths that do not exist.
    source_path = Path(args.source_path)
    if not source_path.is_dir():
        parser.error(f"No source directory {source_path.absolute()} found.")

    conceptual_model_path = Path(args.conceptual_model_path)
    if not conceptual_model_path.is_file():
        parser.error(f"No conceptual model file {conceptual_model_path.absolute()} found.")

    implementation = load_implemented_schema(source_path)
    # JSON is UTF-8 by specification; do not depend on the platform's default encoding.
    definition = json.loads(conceptual_model_path.read_text(encoding="utf-8"))
    conceptual_model = {clazz["name"]: clazz for clazz in definition["classes"]}

    report_differences(implementation, conceptual_model, args.class_name)
107+
108+
109+
def load_implemented_schema(source_path) -> dict:
    """Load the implemented classes and annotate each with its ancestry.

    Every class metadata dict returned by ``get_initial_schema`` gains three keys:

    - ``"parents"``: all ancestors (direct and indirect) found among the inspected
      classes, keyed by class name.
    - ``"inherited_properties"``: properties provided by those ancestors, except
      the class's own ``<Class>Base`` ancestor.
    - ``"direct_properties"``: the class's properties that are not inherited.

    Args:
        source_path: Location of the implementation sources, passed on unchanged
            to ``get_initial_schema``.

    Returns:
        The annotated classes keyed by their conceptual model name (see
        ``class_maps``), without helper classes (names ending in ``Base``,
        ``ORM``, ``Table`` or ``Link``) and without classes in ``class_ignores``.
    """
    # The inspector's error report is not used by this comparison.
    implemented_classes, _errors, class_hierarchy = get_initial_schema(source_path)
    framework_bases = {"SQLModel", "BaseModel"}

    for clazz, metadata in implemented_classes.items():
        ancestors = {}
        pending = list(class_hierarchy.get(clazz, []))
        while pending:
            parent = pending.pop()
            if parent in framework_bases or parent in ancestors:
                # Framework bases carry no model properties; an ancestor that was
                # already visited does not need to be walked a second time.
                continue
            if parent not in implemented_classes:
                # A base class that was not inspected (e.g. one defined outside the
                # source tree) has no metadata, so it is skipped instead of
                # failing with a KeyError.
                continue
            ancestors[parent] = implemented_classes[parent]
            pending.extend(class_hierarchy.get(parent, []))
        metadata["parents"] = ancestors

        # Properties of `<Class>Base` count as direct definitions: in this code base
        # ItemBase defines the properties that become table columns, while Item
        # defines the relationships to other tables.
        inherited_properties = {
            name: prop
            for parent_name, parent in ancestors.items()
            if parent_name != f"{clazz}Base"
            for name, prop in parent["properties"].items()
        }
        metadata["inherited_properties"] = inherited_properties
        metadata["direct_properties"] = {
            name: prop
            for name, prop in metadata["properties"].items()
            if name not in inherited_properties
        }

    helper_suffixes = ("Base", "ORM", "Table", "Link")
    return {
        class_maps.get(name, name): metadata
        for name, metadata in implemented_classes.items()
        if not name.endswith(helper_suffixes) and name not in class_ignores
    }
143+
144+
145+
def report_differences(implementation: dict, definition: dict, class_name=None):
    """Print which classes and properties differ between implementation and definition.

    Class names on both sides are compared in normalized form (see ``normalize``).
    Three sorted lists are printed: classes found only in the implementation, only
    in the definition, and in both. For every class found in both, except those
    with a ``"Taxonomy"`` entry among their parents, a property comparison table
    is printed in full.

    Args:
        implementation: Implemented classes keyed by class name; each value must
            provide a ``"parents"`` mapping plus whatever ``report_difference`` reads.
        definition: Conceptual model classes keyed by class name.
        class_name: Optional class name. When given, the report is limited to that
            class, and a message is printed instead if neither side knows it.
    """
    names_implementation = {normalize(name): name for name in implementation}
    names_definition = {normalize(name): name for name in definition}

    matching = sorted(names_implementation.keys() & names_definition.keys())
    only_implementation = sorted(names_implementation.keys() - names_definition.keys())
    only_definition = sorted(names_definition.keys() - names_implementation.keys())

    if class_name:
        wanted = normalize(class_name)
        if wanted not in names_implementation and wanted not in names_definition:
            print(f"Class '{class_name}' not found in either implementation or definition.")
            return
        # Filter in place so the results stay lists and are printed in the same
        # format as an unfiltered run.
        matching = [name for name in matching if name == wanted]
        only_implementation = [name for name in only_implementation if name == wanted]
        only_definition = [name for name in only_definition if name == wanted]

    print("Classes only in the implementation:")
    print(only_implementation)

    print("Classes only in the definition:")
    print(only_definition)

    print("Classes in both")
    print(matching)

    columns = ["defined_as", "implemented_as", "defined_type", "implemented_type"]
    for clazz in matching:
        impl = implementation[names_implementation[clazz]]
        if "Taxonomy" in impl["parents"]:
            continue

        print("\n", clazz)
        diffs = report_difference(impl, definition[names_definition[clazz]], clazz)
        if diffs:
            records = [dataclasses.asdict(diff) for diff in diffs]
            table = pd.DataFrame.from_records(records).loc[:, columns]
            # `to_string()` renders every row and column in full; the default
            # DataFrame repr may elide rows or columns and shorten long cells.
            print(table.to_string())
186+
187+
188+
@dataclasses.dataclass
189+
class Comparison:
190+
normalized_name: str
191+
defined_as: str | None = None
192+
implemented_as: str | None = None
193+
defined_type: str | None = None
194+
implemented_type: str | None = None
195+
196+
197+
def report_difference(implementation: dict, definition: dict, clazz: str) -> list[Comparison]:
    """Compare the direct properties of one class in both schemas.

    Property names are matched in normalized form (see ``normalize``). Renames
    listed for ``clazz`` in ``property_renames_by_class`` are applied when looking
    up the conceptual model counterpart of an implemented property. Implemented
    properties ending in one of ``property_suffix_ignores`` are left out.

    Every remaining implemented property and every defined property is reported
    exactly once: matched to its counterpart when there is one, otherwise on its own.

    Args:
        implementation: Metadata of the implemented class; ``"direct_properties"``
            and ``"properties"`` are read.
        definition: Conceptual model class; ``"direct_properties"`` is read as an
            iterable of mappings with a ``"name"`` and a ``"range"`` entry.
        clazz: Normalized class name, used to select the property renames.

    Returns:
        The comparisons ordered as: matched properties, properties only in the
        definition, properties only in the implementation; each group is sorted
        alphabetically, ignoring case.
    """
    ignored_suffixes = tuple(property_suffix_ignores)
    implemented_properties = {
        normalize(prop): prop
        for prop in implementation["direct_properties"]
        if not prop.endswith(ignored_suffixes)
    }
    defined_properties = {normalize(prop["name"]): prop for prop in definition["direct_properties"]}
    property_renames = property_renames_by_class.get(clazz, {})

    def unwrap(range_):
        # A range given as a single-element list is reported as that element.
        if isinstance(range_, list) and len(range_) == 1:
            return range_[0]
        return range_

    comparisons = []
    matched_definitions = set()
    for normalized_name, implemented_as in implemented_properties.items():
        comparison = Comparison(
            normalized_name=normalized_name,
            implemented_as=implemented_as,
            implemented_type=implementation["properties"][implemented_as].get(
                "type", "TYPE_UNKNOWN"
            ),
        )
        defined_name = property_renames.get(normalized_name, normalized_name)
        if defined_name not in defined_properties:
            # The rename does not apply (any more); fall back to the unchanged name.
            defined_name = normalized_name
        defined = defined_properties.get(defined_name)
        if defined is not None:
            matched_definitions.add(defined_name)
            comparison.defined_as = defined["name"]
            comparison.defined_type = unwrap(defined["range"])
        comparisons.append(comparison)

    # Defined properties without an implemented counterpart. Tracking the matched
    # names, rather than dropping every rename target up front, keeps a defined
    # property in the report when its implemented counterpart is missing.
    for normalized_name, defined in defined_properties.items():
        if normalized_name in matched_definitions:
            continue
        comparisons.append(
            Comparison(
                normalized_name=normalized_name,
                defined_as=defined["name"],
                defined_type=unwrap(defined["range"]),
            )
        )

    def sort_key(comparison):
        # Matched properties first, then definition-only, then implementation-only.
        # Sorting on the whole case-folded name keeps the order deterministic.
        if comparison.implemented_as and comparison.defined_as:
            group, label = 0, comparison.defined_as
        elif comparison.defined_as:
            group, label = 1, comparison.defined_as
        else:
            group, label = 2, comparison.implemented_as or comparison.normalized_name
        return group, label.casefold(), comparison.normalized_name

    return sorted(comparisons, key=sort_key)
241+
242+
243+
# Script entry point: run the comparison only when this file is executed directly.
if __name__ == "__main__":
    main()

scripts/model_comparison/readme.md

Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
# Comparison to Conceptual Model
2+
3+
The REST API aims to implement the conceptual model definitions.
4+
However, the implementation may drift apart from the conceptual model over time due to
5+
practical reasons (e.g., no time to process updates).
6+
The `compare.py` script can be used to compare the implementation in the REST API with the
7+
definitions in the conceptual model.
8+
9+
## Requirements
10+
11+
Files:
12+
13+
- The REST API code repository
14+
- The `model-export.json` file from the `metadata-schema` repository's `export` directory
15+
16+
Python environment:
17+
18+
- The dependencies of the REST API need to be installed, as the code will import its modules.
19+
- The `pandas` package needs to be installed
20+
21+
To install the environment, from the root of the REST API repository execute:
22+
```bash
23+
python -m venv .venv
24+
source .venv/bin/activate
25+
python -m pip install uv
26+
uv pip install -r pyproject.toml
27+
uv pip install pandas
28+
```
29+
30+
## Usage
31+
Invoke the `compare.py` script, specifying the location of the REST API's `src` directory and the model export file:
32+
33+
```bash
34+
python compare.py <source_path> <conceptual_model_path>
35+
```
36+
37+
For example:
38+
```bash
39+
python compare.py /path/to/rest/api/src /path/to/model-export.json
40+
```
41+
42+
To inspect the differences for a single class only, use the `--class` or `-c` option:
43+
```bash
44+
python compare.py /path/to/rest/api/src /path/to/model-export.json --class "Person"
45+
```
46+
47+
To see all available options:
48+
```bash
49+
python compare.py --help
50+
```
51+
52+
The `compare.py` script may need to be updated over time:
53+
54+
- `class_maps`: mapping to clarify when the implemented name of a class differs from the name in the definition.
55+
- `property_renames_by_class`: mapping to clarify for each class when the implemented name of a property differs from the name in the definition.
56+
- `class_ignores`: classes which are implemented but should not be reported, useful to hide implementation details.
57+
- `property_suffix_ignores`: suffixes of properties which should not be reported, useful to hide implementation details.
58+
59+
Note that to compare implemented and defined names (for both classes and properties) they are typically _normalized_ by converting them to lowercase and removing spaces and underscores.
60+
Some of the mappings above use normalized names for some of the keys; this is documented in the `compare.py` file.
61+
62+
## Limitations
63+
This tool does not identify properties which are implemented at the wrong level of the hierarchy.
64+
For example, given the situation:
65+
66+
- `B` is defined as a subclass of `A`
67+
- `B` is implemented as a subclass of `A`
68+
- property `c` is defined on `A`
69+
- property `c` is implemented in `B`
70+
71+
then the tool will report that `A` is missing `c` and `B` has an extra `c` property.

0 commit comments

Comments
 (0)