Skip to content

Commit 97f3c80

Browse files
committed
feat: implement CAS for Data Model deduplication
1 parent 316a7a6 commit 97f3c80

2 files changed

Lines changed: 45 additions & 5 deletions

File tree

api_app/analyzers_manager/models.py

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -121,14 +121,20 @@ def _create_data_model_dictionary(self) -> Dict:
121121
return result
122122

123123
def create_data_model(self) -> Optional[BaseDataModel]:
    """Attach a data model to this report, reusing an identical one if present.

    The report content is turned into a dictionary, fingerprinted, and looked
    up by fingerprint among the existing rows of ``self.data_model_class``.
    When a row with the same fingerprint exists it is linked to this report;
    otherwise a new row is created with that fingerprint and the dictionary
    is merged into it.

    Returns:
        The linked data model, or ``None`` when
        ``_validation_before_data_model()`` rejects the report.
    """
    import copy

    if not self._validation_before_data_model():
        return None

    dictionary = self._create_data_model_dictionary()

    # The fingerprint is computed from an unsaved instance so no row is
    # written before we know whether an equivalent one already exists.
    # A deep copy is passed so that the payload merged below is exactly what
    # the report produced, whatever the fingerprint helper does to its input.
    temp_instance = self.data_model_class()
    fingerprint = temp_instance.generate_fingerprint(data=copy.deepcopy(dictionary))

    existing_data_model = self.data_model_class.objects.filter(fingerprint=fingerprint).first()
    if existing_data_model:
        logger.info(f"Deduplicated: Linking existing Data Model {existing_data_model.pk}")
        self.data_model = existing_data_model
    else:
        self.data_model: BaseDataModel = self.data_model_class.objects.create(fingerprint=fingerprint)
        self.data_model.merge(dictionary)

    self.save()
    return self.data_model
134140

api_app/data_model_manager/models.py

Lines changed: 35 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
1+
import hashlib
12
import json
23
import logging
3-
from typing import Dict, Type, Union
4+
from typing import Any, Dict, Type, Union
45

56
from django.contrib.contenttypes.fields import GenericRelation
67
from django.contrib.contenttypes.models import ContentType
@@ -26,6 +27,14 @@
2627
logger = logging.getLogger(__name__)
2728

2829

30+
def normalize_dict(obj: Any) -> Any:
    """Recursively rebuild *obj* so every nested dict is in sorted-key order.

    Lists are rebuilt element by element (their order is kept); dicts are
    rebuilt with their items sorted; any other value is returned unchanged.
    """
    if isinstance(obj, list):
        return list(map(normalize_dict, obj))
    if not isinstance(obj, dict):
        return obj

    ordered = {}
    for key, value in sorted(obj.items()):
        ordered[key] = normalize_dict(value)
    return ordered
36+
37+
2938
class IETFReport(models.Model):
3039
rrname = LowercaseCharField(max_length=100)
3140
rrtype = LowercaseCharField(max_length=100)
@@ -91,6 +100,7 @@ class BaseDataModel(models.Model):
91100
default=dict
92101
) # field for additional information related to a specific analyzer
93102
date = models.DateTimeField(default=now)
103+
fingerprint = models.CharField(max_length=64, db_index=True, blank=True, default="")
94104
analyzers_report = GenericRelation(
95105
to="analyzers_manager.AnalyzerReport",
96106
object_id_field="data_model_object_id",
@@ -125,6 +135,30 @@ def owner(self) -> User:
125135
elif self.jobs.exists():
126136
return self.jobs.first().user
127137

138+
def get_content_map(self, data: Dict = None) -> Dict:
    """Build the normalized mapping of content used for fingerprinting.

    The map starts from this instance's concrete model fields and is then
    overlaid with ``data`` when given, so caller-supplied values take
    precedence over the instance's own (possibly default) values.
    Bookkeeping keys (``id``, ``date``, ``fingerprint``) and relation keys
    (``analyzers_report``, ``jobs``, ``user_events``) are never included.

    Args:
        data: optional extra content to fingerprint. It is read only; the
            caller's dictionary is not modified.

    Returns:
        A new dictionary, passed through ``normalize_dict``.
    """
    excluded = {
        "id",
        "date",
        "fingerprint",
        "analyzers_report",
        "jobs",
        "user_events",
    }

    def _serializable(value):
        # Date/time-like values expose isoformat(); store their string form.
        return value.isoformat() if hasattr(value, "isoformat") else value

    content = {}
    for field in self._meta.fields:
        name = field.name
        if name in excluded:
            continue
        content[name] = _serializable(getattr(self, name))

    # Overlay the caller's data last: previously the instance values were
    # written on top of it (and into the caller's own dict), which replaced
    # the real content with the instance's values.
    if data:
        for name, value in data.items():
            if name in excluded:
                continue
            content[name] = _serializable(value)

    return normalize_dict(content)
156+
157+
def generate_fingerprint(self, data: Dict = None) -> str:
    """Return the SHA-256 hex digest of this model's content map.

    The content map (see ``get_content_map``) is serialized to JSON with
    sorted keys, encoded as UTF-8 and hashed.
    """
    canonical_json = json.dumps(self.get_content_map(data), sort_keys=True)
    digest = hashlib.sha256()
    digest.update(canonical_json.encode("utf-8"))
    return digest.hexdigest()
161+
128162
def merge(self, other: Union["BaseDataModel", Dict], append: bool = True) -> "BaseDataModel":
129163
if not self.pk:
130164
raise ValueError("Unable to merge a model that was not saved.")

0 commit comments

Comments
 (0)