|
| 1 | +import dataclasses |
| 2 | +import json |
| 3 | +import sys |
| 4 | +import argparse |
| 5 | +from pathlib import Path |
| 6 | + |
| 7 | +import pandas as pd |
| 8 | + |
| 9 | +from schema_inspector import get_initial_schema |
| 10 | + |
# NOTE(review): not referenced anywhere in the visible part of this file —
# confirm whether it is still needed.
IGNORES = []
| 12 | + |
# Some classes are implemented under a different name than the one used in the
# conceptual model. `load_implemented_schema` uses this map to re-key the
# implemented classes before they are compared.
# Implementation name: Conceptual Model name
class_maps = {
    "News": "News Item",
    "IndustrialSector": "Business Sector",
    "Turnover": "Company Revenue",
    "NumberOfEmployees": "Company Size",
    "AIoDEntryRead": "AIoD Entry",
}
| 22 | + |
# Properties which are named differently in the implementation than in the
# conceptual model. All names in this map (class names and property names)
# must be normalized, i.e. written the way `normalize` would return them.
# Class: {Implementation property: Conceptual Model property}
property_renames_by_class = {
    "aiodentry": {"datecreated": "entrycreated", "datemodified": "modified"},
    "airesource": {
        "alternatename": "alternativename",
        "industrialsector": "businesssector",
        "ispartof": "partof",
    },
    "computationalasset": {"type": "computationalassettype"},
    "educationalresource": {
        "prerequisite": "prerequisiteknowledge",
        "targetaudience": "targetseducationallevel",
    },
    "event": {
        "status": "currenteventstatus",
        "mode": "usesmode",
    },
    "experiment": {"executionsettings": "exemplaryexecutionsettings"},
    "publication": {
        "isbn": "hasisbn",
        "issn": "hasissn",
    },
    "person": {"languages": "language"},
    "aiasset": {"license": "licence"},
    "runnabledistribution": {"deploymenttimemilliseconds": "deploymenttimemsec"},
}
# Some classes are artefacts of the implementation or are required for
# other parts of the system. We can ignore them in this comparison.
# Entries are implementation class names; `load_implemented_schema` checks
# them before `class_maps` is applied.
class_ignores = {
    # Classes defined for convenience to help with the conceptual model implementation
    "Taxonomy",
    "NamedRelation",
    "Text",
    "Body",
    # Classes defined for the REST API unrelated to the conceptual model
    "Bookmark",
    "AIoDEntryCreate",
}
| 62 | + |
# Implemented properties whose (non-normalized) name ends with one of these
# suffixes are left out of the comparison in `report_difference`.
property_suffix_ignores = {"_id", "_identifier", "__"}
| 64 | + |
| 65 | + |
def normalize(string: str) -> str:
    """Return *string* casefolded, with every space and underscore removed.

    Used so that names such as ``"News Item"``, ``"news_item"`` and
    ``"NewsItem"`` all compare equal.
    """
    folded = string.casefold()
    return "".join(char for char in folded if char not in " _")
| 68 | + |
| 69 | + |
def main():
    """Command-line entry point.

    Parses the arguments, checks that the source directory and the conceptual
    model file exist, loads both schemas and prints their differences.
    Exits through ``ArgumentParser.error`` when a path is invalid.
    """
    cli = argparse.ArgumentParser(
        description="Compare implementation schema with conceptual model definition."
    )
    cli.add_argument(
        "source_path", type=str, help="Path to the implementation source directory"
    )
    cli.add_argument(
        "conceptual_model_path", type=str, help="Path to the model export definition file"
    )
    cli.add_argument(
        "--class",
        "-c",
        dest="class_name",
        type=str,
        help="Optional: Name of a specific class to inspect (as defined in the model)",
    )
    options = cli.parse_args()

    source_dir = Path(options.source_path)
    if not source_dir.exists() or not source_dir.is_dir():
        cli.error(f"No source directory {source_dir.absolute()} found.")

    model_file = Path(options.conceptual_model_path)
    if not model_file.exists() or not model_file.is_file():
        cli.error(f"No conceptual model file {model_file.absolute()} found.")

    implementation = load_implemented_schema(source_dir)
    definition = json.loads(model_file.read_text())

    # Index the conceptual model classes by their name for direct lookup.
    conceptual_model = {}
    for clazz in definition["classes"]:
        conceptual_model[clazz["name"]] = clazz

    report_differences(implementation, conceptual_model, options.class_name)
| 107 | + |
| 108 | + |
def load_implemented_schema(source_path) -> dict:
    """Load the implemented classes and annotate each one with its ancestry.

    For every class returned by ``get_initial_schema`` the metadata dict gains
    three keys:

    * ``"parents"``: metadata of every known ancestor, keyed by class name.
    * ``"inherited_properties"``: properties contributed by those ancestors,
      except the ancestor named ``<Class>Base``.
    * ``"direct_properties"``: the class's own properties minus the inherited
      ones.

    Args:
        source_path: Directory handed to ``get_initial_schema``.

    Returns:
        The implemented classes keyed by their conceptual model name (see
        ``class_maps``), without helper classes (names ending in ``Base``,
        ``ORM``, ``Table`` or ``Link``) and without ``class_ignores``.
    """
    # NOTE(review): the errors reported by the inspector are not surfaced here.
    implemented_classes, _errors, class_hierarchy = get_initial_schema(source_path)
    framework_bases = {"SQLModel", "BaseModel"}

    for clazz, metadata in implemented_classes.items():
        ancestors = {}
        pending = list(class_hierarchy.get(clazz, []))
        while pending:
            parent = pending.pop()
            if parent in framework_bases:
                continue
            parent_metadata = implemented_classes.get(parent)
            if parent_metadata is None:
                # A base class the inspector did not return (e.g. one defined
                # outside the inspected sources). It has no known properties,
                # so skip it instead of failing with a KeyError.
                print(
                    f"Warning: parent '{parent}' of class '{clazz}' is not an "
                    "implemented class; skipping it.",
                    file=sys.stderr,
                )
                continue
            ancestors[parent] = parent_metadata
            pending.extend(class_hierarchy.get(parent, []))
        metadata["parents"] = ancestors

        # If the name matches with a Base suffix, it should be considered as a direct
        # definition. The structure of the code base has ItemBase defining properties
        # which will be properties of the table in the database, and Item defines the
        # relationships to other tables.
        own_base = f"{clazz}Base"
        inherited_properties = {
            name: prop
            for parent_name, parent in ancestors.items()
            if parent_name != own_base
            for name, prop in parent["properties"].items()
        }
        metadata["inherited_properties"] = inherited_properties
        metadata["direct_properties"] = {
            name: prop
            for name, prop in metadata["properties"].items()
            if name not in inherited_properties
        }

    helper_suffixes = ("Base", "ORM", "Table", "Link")
    return {
        class_maps.get(name, name): metadata
        for name, metadata in implemented_classes.items()
        if not name.endswith(helper_suffixes) and name not in class_ignores
    }
| 143 | + |
| 144 | + |
def report_differences(implementation: dict, definition: dict, class_name=None):
    """Print which classes differ between implementation and conceptual model.

    Class names are matched on their normalized form. The classes found on
    only one side and on both sides are printed first; then, for every class
    present on both sides, a table of its property comparison is printed.
    Classes that have ``Taxonomy`` among their parents are skipped.

    Args:
        implementation: Implemented classes keyed by name.
        definition: Conceptual model classes keyed by name.
        class_name: When given, restrict the report to this class; a message
            is printed and nothing else happens if it exists on neither side.
    """
    impl_by_key = {normalize(name): name for name in implementation}
    model_by_key = {normalize(name): name for name in definition}

    impl_keys = set(impl_by_key)
    model_keys = set(model_by_key)
    matching = sorted(impl_keys & model_keys)
    only_implementation = sorted(impl_keys - model_keys)
    only_definition = sorted(model_keys - impl_keys)

    if class_name:
        wanted = {normalize(class_name)}
        matching = set(matching) & wanted
        only_implementation = set(only_implementation) & wanted
        only_definition = set(only_definition) & wanted
        if not (matching or only_definition or only_implementation):
            print(f"Class '{class_name}' not found in either implementation or definition.")
            return

    print("Classes only in the implementation:")
    print(only_implementation)

    print("Classes only in the definition:")
    print(only_definition)

    print("Classes in both")
    print(matching)

    shown_columns = ["defined_as", "implemented_as", "defined_type", "implemented_type"]
    for key in matching:
        impl = implementation[impl_by_key[key]]
        if "Taxonomy" in impl["parents"]:
            continue

        print("\n", key)
        diffs = report_difference(impl, definition[model_by_key[key]], key)
        if not diffs:
            continue
        table = pd.DataFrame.from_records([dataclasses.asdict(diff) for diff in diffs])
        print(table.loc[:, shown_columns])
| 186 | + |
| 187 | + |
@dataclasses.dataclass
class Comparison:
    """One property as seen by the conceptual model and by the implementation.

    A side that does not have the property keeps its fields at ``None``.
    """

    # Normalized property name this comparison was created for.
    normalized_name: str
    # Property name as written in the conceptual model.
    defined_as: str | None = None
    # Property name as written in the implementation.
    implemented_as: str | None = None
    # The "range" of the conceptual model property; a one-element list is
    # unwrapped by `report_difference`, any other value is stored as-is.
    defined_type: str | None = None
    # The "type" recorded for the implemented property, or "TYPE_UNKNOWN".
    implemented_type: str | None = None
| 195 | + |
| 196 | + |
def report_difference(implementation: dict, definition: dict, clazz: str) -> list[Comparison]:
    """Compare the direct properties of one class on both sides.

    Property names are matched on their normalized form. Properties listed in
    ``property_renames_by_class`` for ``clazz`` are matched under their
    conceptual model name and reported once.

    Args:
        implementation: Metadata of the implemented class; its
            ``"direct_properties"`` and ``"properties"`` entries are read.
        definition: Conceptual model class; its ``"direct_properties"`` entry
            is read as a list of dicts with ``"name"`` and ``"range"``.
        clazz: Normalized class name, used to look up property renames.

    Returns:
        One ``Comparison`` per property: properties found on both sides first,
        then those only in the definition, then those only in the
        implementation. Each group is ordered by name, ignoring case.

    Raises:
        NotImplementedError: If a property ends up on neither side, e.g. when
            a rename points at a name the definition does not contain.
    """
    ignored_suffixes = tuple(property_suffix_ignores)
    implemented_properties = {
        normalize(prop): prop
        for prop in implementation["direct_properties"]
        if not prop.endswith(ignored_suffixes)
    }
    defined_properties = {normalize(prop["name"]): prop for prop in definition["direct_properties"]}

    property_renames = property_renames_by_class.get(clazz, {})
    # To avoid reporting a property twice, we only pick one of two definitions for
    # properties which are named differently in implementation than the conceptual model.
    renamed_targets = set(property_renames.values())
    all_properties = (set(defined_properties) | set(implemented_properties)) - renamed_targets

    property_map = []
    for property_name in all_properties:
        prop = Comparison(normalized_name=property_name)
        if implemented_as := implemented_properties.get(property_name):
            prop.implemented_as = implemented_as
            prop.implemented_type = implementation["properties"][implemented_as].get(
                "type", "TYPE_UNKNOWN"
            )

        # Look the property up under its conceptual model name if it was renamed.
        defined_name = property_renames.get(property_name) or property_name
        if defined_as := defined_properties.get(defined_name):
            prop.defined_as = defined_as["name"]
            def_type = defined_as["range"]
            if isinstance(def_type, list) and len(def_type) == 1:
                def_type = def_type[0]
            prop.defined_type = def_type
        property_map.append(prop)

    def sort_properties(comparison):
        # We want the matched properties, then unmatched properties. The group
        # number comes first so the groups can never interleave; the names
        # make the order independent of set iteration order.
        if comparison.implemented_as and comparison.defined_as:
            group, label = 0, comparison.defined_as
        elif comparison.defined_as:
            group, label = 1, comparison.defined_as
        elif comparison.implemented_as:
            group, label = 2, comparison.implemented_as
        else:
            raise NotImplementedError(
                f"Property '{comparison.normalized_name}' is neither defined nor implemented."
            )
        return group, label.casefold(), comparison.normalized_name

    return sorted(property_map, key=sort_properties)
| 241 | + |
| 242 | + |
| 243 | +if __name__ == "__main__": |
| 244 | + main() |
0 commit comments