Skip to content

Commit 836a3fe

Browse files
committed
feat: implement Content-Addressable Storage for Data Models to prevent storage bloat
1 parent 316a7a6 commit 836a3fe

5 files changed

Lines changed: 240 additions & 35 deletions

File tree

api_app/analyzers_manager/models.py

Lines changed: 28 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
from django.contrib.contenttypes.fields import GenericForeignKey, GenericRelation
88
from django.contrib.contenttypes.models import ContentType
99
from django.core.exceptions import ValidationError
10-
from django.db import models
10+
from django.db import IntegrityError, models, transaction
1111

1212
from api_app.analyzers_manager.constants import HashChoices, TypeChoices
1313
from api_app.analyzers_manager.exceptions import AnalyzerConfigurationException
@@ -17,6 +17,7 @@
1717
from api_app.data_model_manager.models import BaseDataModel
1818
from api_app.decorators import classproperty
1919
from api_app.fields import ChoiceArrayField
20+
from api_app.helpers import calculate_json_fingerprint
2021
from api_app.models import AbstractReport, PythonConfig, PythonModule
2122

2223
logger = getLogger(__name__)
@@ -121,16 +122,35 @@ def _create_data_model_dictionary(self) -> Dict:
121122
return result
122123

123124
def create_data_model(self) -> Optional[BaseDataModel]:
    """
    Create the data model for this report, or reuse an existing identical one.

    The dictionary produced by ``_create_data_model_dictionary()`` is stripped
    of bookkeeping keys and empty values and then fingerprinted. Reports whose
    remaining content is identical are linked to the same data model row
    instead of each creating a new one.

    Returns:
        The data model now linked to this report, or ``None`` when
        ``_validation_before_data_model()`` fails.
    """
    if not self._validation_before_data_model():
        return None
    dictionary = self._create_data_model_dictionary()

    # Keys that must not influence the content fingerprint.
    excluded_fields = {"id", "date", "fingerprint", "analyzers_report", "jobs", "user_events"}
    data_to_hash = {
        key: value
        for key, value in dictionary.items()
        if key not in excluded_fields and value not in (None, "", [], {})
    }

    if not data_to_hash:
        # Nothing to fingerprint: keep one dedicated data model per report so
        # that unrelated empty reports are never collapsed together.
        data_model = self.data_model_class.objects.create()
        data_model.merge(dictionary)
        self.data_model = data_model
        self.save()
        return data_model

    fp = calculate_json_fingerprint(data_to_hash)
    # The outer atomic block keeps "create the data model" and "link the
    # report" as one unit, as before.
    with transaction.atomic():
        try:
            # The inner savepoint contains only the fingerprint lookup/insert,
            # so the fallback below reacts only to a lost race on the unique
            # fingerprint and never to an IntegrityError raised by saving the
            # report itself.
            with transaction.atomic():
                # The lookup kwargs are reused for creation, so no `defaults`
                # entry is needed for `fingerprint`.
                data_model, created = self.data_model_class.objects.get_or_create(fingerprint=fp)
                if created:
                    data_model.merge(dictionary)
        except IntegrityError:
            data_model = self.data_model_class.objects.get(fingerprint=fp)
        self.data_model = data_model
        self.save()
    return data_model
134154

135155

136156
class MimeTypes(models.TextChoices):
Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
# This file is a part of IntelOwl https://github.com/intelowlproject/IntelOwl
2+
# See the file 'LICENSE' for copying permission.
3+
# Generated by Django 4.2.27 on 2026-03-14 15:51
4+
import uuid
5+
6+
from django.db import migrations, models
7+
8+
9+
def gen_fingerprints(apps, schema_editor):
    """
    Give every existing data model row a unique placeholder fingerprint.

    Rows whose ``fingerprint`` is NULL receive ``gen-<uuid4>``. Rows are
    streamed and written back in batches rather than loaded all at once and
    saved one by one, so the backfill stays usable on large tables.

    Args:
        apps: historical app registry supplied by ``RunPython``.
        schema_editor: unused; part of the ``RunPython`` callable signature.
    """
    batch_size = 1000
    for model_name in ("DomainDataModel", "IPDataModel", "FileDataModel"):
        Model = apps.get_model("data_model_manager", model_name)
        pending = []
        # Only the primary key is needed to issue the UPDATE.
        rows = Model.objects.filter(fingerprint__isnull=True).only("pk")
        for obj in rows.iterator(chunk_size=batch_size):
            obj.fingerprint = f"gen-{uuid.uuid4()}"
            pending.append(obj)
            if len(pending) >= batch_size:
                Model.objects.bulk_update(pending, ["fingerprint"])
                # Start a fresh list instead of clearing the one just handed over.
                pending = []
        if pending:
            Model.objects.bulk_update(pending, ["fingerprint"])
15+
16+
17+
class Migration(migrations.Migration):
    """
    Add the ``fingerprint`` column to the domain, file and IP data models.

    The column is first added without a uniqueness constraint, then existing
    rows are backfilled by ``gen_fingerprints``, and only afterwards is the
    column altered to be unique and indexed.
    """

    dependencies = [("data_model_manager", "0011_data_model_date_index")]

    operations = [
        # Step 1: plain nullable column on each model.
        *(
            migrations.AddField(
                model_name=target,
                name="fingerprint",
                field=models.CharField(blank=True, max_length=64, null=True),
            )
            for target in ("domaindatamodel", "filedatamodel", "ipdatamodel")
        ),
        # Step 2: backfill existing rows; reversing this step is a no-op.
        migrations.RunPython(gen_fingerprints, reverse_code=migrations.RunPython.noop),
        # Step 3: make the column unique and indexed.
        *(
            migrations.AlterField(
                model_name=target,
                name="fingerprint",
                field=models.CharField(blank=True, db_index=True, max_length=64, null=True, unique=True),
            )
            for target in ("domaindatamodel", "filedatamodel", "ipdatamodel")
        ),
    ]

api_app/data_model_manager/models.py

Lines changed: 15 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
# This file is a part of IntelOwl https://github.com/intelowlproject/IntelOwl
2+
# See the file 'LICENSE' for copying permission.
13
import json
24
import logging
35
from typing import Dict, Type, Union
@@ -82,14 +84,11 @@ class BaseDataModel(models.Model):
8284
blank=True,
8385
default=list,
8486
)
85-
related_threats = SetField(
86-
LowercaseCharField(max_length=100), default=list, blank=True
87-
) # threats/related_threats, used as a pointer to other IOCs
87+
related_threats = SetField(LowercaseCharField(max_length=100), default=list, blank=True)
8888
tags = SetField(LowercaseCharField(max_length=100), null=True, blank=True, default=None)
8989
malware_family = LowercaseCharField(max_length=100, null=True, blank=True, default=None)
90-
additional_info = models.JSONField(
91-
default=dict
92-
) # field for additional information related to a specific analyzer
90+
additional_info = models.JSONField(default=dict)
91+
fingerprint = models.CharField(max_length=64, null=True, blank=True, unique=True, db_index=True)
9392
date = models.DateTimeField(default=now)
9493
analyzers_report = GenericRelation(
9594
to="analyzers_manager.AnalyzerReport",
@@ -108,8 +107,16 @@ class BaseDataModel(models.Model):
108107
content_type_field="data_model_content_type",
109108
)
110109

111-
TAGS = DataModelTags
110+
def save(self, *args, **kwargs):
    """
    Save the data model, assigning a placeholder fingerprint when it has none.

    A missing fingerprint is replaced with ``gen-<uuid4>``. When the caller
    passes ``update_fields`` as a keyword, ``fingerprint`` is added to it so
    the generated value is written as well.
    """
    if not self.fingerprint:
        import uuid

        self.fingerprint = f"gen-{uuid.uuid4()}"
        requested_fields = kwargs.get("update_fields")
        if requested_fields is not None:
            kwargs["update_fields"] = {*requested_fields, "fingerprint"}
    super().save(*args, **kwargs)
118+
119+
TAGS = DataModelTags
113120
EVALUATIONS = DataModelEvaluations
114121

115122
class Meta:
@@ -131,7 +138,7 @@ def merge(self, other: Union["BaseDataModel", Dict], append: bool = True) -> "Ba
131138
if not isinstance(other, (self.__class__, dict)):
132139
raise TypeError(f"Different class between {self} and {type(other)}")
133140
for field_name, field in self.get_fields().items():
134-
if field_name == "id":
141+
if field_name in {"id", "fingerprint"}:
135142
continue
136143
result_attr = getattr(self, field_name)
137144
if isinstance(other, dict):

api_app/helpers.py

Lines changed: 36 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,16 @@
11
# This file is a part of IntelOwl https://github.com/intelowlproject/IntelOwl
22
# See the file 'LICENSE' for copying permission.
3-
4-
# general helper functions used by the Django API
5-
3+
import datetime
64
import hashlib
75
import ipaddress
6+
import json
87
import logging
98
import random
109
import re
1110
import typing
11+
import uuid
1212
import warnings
13+
from typing import Any, Dict
1314

1415
from django.utils import timezone
1516

@@ -71,28 +72,49 @@ def get_now():
7172

7273

7374
def gen_random_colorhex() -> str:
    """Return a random colour as an uppercase ``#RRGGBB`` hex string."""
    # Three draws in red, green, blue order, one `random.randint` call each.
    channels = [random.randint(0, 255) for _ in range(3)]
    return "#" + "".join(f"{channel:02X}" for channel in channels)
7779

7880

7981
def calculate_md5(value: bytes) -> str:
    """Return the hexadecimal MD5 digest of ``value``."""
    hasher = hashlib.md5()
    hasher.update(value)
    return hasher.hexdigest()
8183

8284

8385
def calculate_sha1(value: bytes) -> str:
    """Return the hexadecimal SHA-1 digest of ``value``."""
    hasher = hashlib.sha1()
    hasher.update(value)
    return hasher.hexdigest()
8587

8688

8789
def calculate_sha256(value: bytes) -> str:
    """Return the hexadecimal SHA-256 digest of ``value``."""
    hasher = hashlib.sha256()
    hasher.update(value)
    return hasher.hexdigest()
91+
92+
93+
def normalize_dict(data: Any) -> Any:
    """
    Recursively convert ``data`` into a canonical, order-independent form.

    - ``dict``: values are normalized and keys are emitted in sorted order.
    - ``list`` / ``tuple`` / ``set`` / ``frozenset``: items are normalized and
      returned as a sorted ``list``, so element order never affects the result.
    - ``datetime`` / ``date``: ISO 8601 string.
    - ``UUID``: its string form.
    - anything else is returned unchanged.

    Items are ordered by ``(type name, text)``. The type name is part of the
    key so values that print alike (``1`` and ``"1"``) never tie. Without it a
    stable sort would keep their input order and make the result depend on it.
    """
    if isinstance(data, dict):
        return {key: normalize_dict(data[key]) for key in sorted(data)}
    if isinstance(data, (list, tuple, set, frozenset)):
        items = [normalize_dict(item) for item in data]

        def sort_key(item: Any):
            # Nested containers are already normalized, so only dict/list can
            # appear here; everything else is compared by its string form.
            if isinstance(item, (dict, list)):
                text = json.dumps(item, sort_keys=True)
            else:
                text = str(item)
            return type(item).__name__, text

        try:
            return sorted(items, key=sort_key)
        except (TypeError, ValueError):
            # An item could not be rendered for comparison: keep input order.
            return items
    if isinstance(data, (datetime.datetime, datetime.date)):
        return data.isoformat()
    if isinstance(data, uuid.UUID):
        return str(data)
    return data
109+
110+
111+
def calculate_json_fingerprint(data: Dict) -> str:
    """
    Return the SHA-256 hex digest of the canonical JSON form of ``data``.

    ``data`` is first passed through ``normalize_dict``, then serialized as
    compact JSON with sorted keys and hashed as UTF-8 bytes.
    """
    canonical_json = json.dumps(normalize_dict(data), sort_keys=True, separators=(",", ":"))
    return calculate_sha256(canonical_json.encode("utf-8"))
89115

90116

91117
def get_ip_version(ip_value):
92-
"""
93-
Returns ip version
94-
Supports IPv4 and IPv6
95-
"""
96118
ip_type = None
97119
try:
98120
ip = ipaddress.ip_address(ip_value)
@@ -103,23 +125,18 @@ def get_ip_version(ip_value):
103125

104126

105127
def get_hash_type(hash_value):
    """
    Return the hash type of ``hash_value``.

    Supports md5, sha1, sha256 and sha512, reported as ``"md5"``, ``"sha-1"``,
    ``"sha-256"`` and ``"sha-512"``. Returns ``None`` when the value is not a
    hexadecimal string of one of those lengths.
    """
    RE_HASH_MAP = {
        "md5": re.compile(r"^[a-f\d]{32}$", re.IGNORECASE | re.ASCII),
        "sha-1": re.compile(r"^[a-f\d]{40}$", re.IGNORECASE | re.ASCII),
        "sha-256": re.compile(r"^[a-f\d]{64}$", re.IGNORECASE | re.ASCII),
        "sha-512": re.compile(r"^[a-f\d]{128}$", re.IGNORECASE | re.ASCII),
    }
    for hash_type, re_hash in RE_HASH_MAP.items():
        # fullmatch, not match: `$` alone also matches just before a trailing
        # newline, which would accept "<hex digits>\n" as a valid hash.
        if re_hash.fullmatch(hash_value):
            return hash_type
    return None
123140

124141

125142
def deprecated(message: str):
Lines changed: 107 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,107 @@
1+
from api_app.analyzers_manager.models import AnalyzerReport, AnalyzerConfig
2+
from api_app.data_model_manager.models import IPDataModel
3+
from api_app.models import Job, Analyzable
4+
from api_app.choices import Classification
5+
from tests import CustomTestCase
6+
from django.utils import timezone
7+
import uuid
8+
9+
class CASDeduplicationTestCase(CustomTestCase):
    """Checks that identical report content is stored as a single data model."""

    @staticmethod
    def _python_module():
        """Return the AbuseIPDB python module, creating it when it is missing."""
        # Use an existing analyzer to avoid ModuleNotFoundError in signals
        from api_app.models import PythonModule

        existing = PythonModule.objects.filter(module__icontains="AbuseIPDB").first()
        return existing or PythonModule.objects.create(
            module="abuseipdb.AbuseIPDB", base_path="api_app.analyzers_manager.analyzers"
        )

    @staticmethod
    def _data_model_for(analyzable, config, report):
        """Create a new job and a successful report for it, then build its data model."""
        # Every report gets its own job to avoid the report unique constraint.
        job = Job.objects.create(analyzable=analyzable)
        analyzer_report = AnalyzerReport.objects.create(
            job=job,
            config=config,
            report=report,
            status=AnalyzerReport.STATUSES.SUCCESS.value,
            task_id=uuid.uuid4(),
            parameters={},
        )
        return analyzer_report.create_data_model()

    def test_cas_deduplication(self):
        analyzable = Analyzable.objects.create(name="1.1.1.1", classification=Classification.IP)
        config = AnalyzerConfig.objects.create(
            name="TestAnalyzer",
            python_module=self._python_module(),
            mapping_data_model={"result": "asn"},
        )

        # First report: a fingerprinted data model is created.
        first = self._data_model_for(analyzable, config, {"result": 12345})
        self.assertIsNotNone(first.fingerprint)
        self.assertEqual(first.asn, 12345)

        # Identical content: the very same data model must be reused.
        duplicate = self._data_model_for(analyzable, config, {"result": 12345})
        self.assertEqual(first.pk, duplicate.pk)
        self.assertEqual(first.fingerprint, duplicate.fingerprint)

        # Different content: a distinct data model with a distinct fingerprint.
        different = self._data_model_for(analyzable, config, {"result": 67890})
        self.assertNotEqual(first.pk, different.pk)
        self.assertNotEqual(first.fingerprint, different.fingerprint)

    def test_empty_data_no_deduplication(self):
        analyzable = Analyzable.objects.create(name="2.2.2.2", classification=Classification.IP)
        config = AnalyzerConfig.objects.create(
            name="EmptyAnalyzer",
            python_module=self._python_module(),
            mapping_data_model={},
        )

        # Empty reports get a generated placeholder fingerprint...
        first = self._data_model_for(analyzable, config, {})
        self.assertTrue(first.fingerprint.startswith("gen-"))

        # ...and are never merged with each other.
        second = self._data_model_for(analyzable, config, {})
        self.assertNotEqual(first.pk, second.pk)
        self.assertTrue(second.fingerprint.startswith("gen-"))

0 commit comments

Comments
 (0)