Skip to content

Commit 0d8417b

Browse files
Add files for open source release
0 parents  commit 0d8417b

22 files changed

Lines changed: 2891 additions & 0 deletions

.gitignore

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
.coverage
2+
.eggs
3+
.pytest_cache
4+
.test_venv
5+
.vscode
6+
.idea
7+
.venv
8+
.tests-venv
9+
.python-version
10+
.cid
11+
.cvol
12+
.DS_Store
13+
*~
14+
*.pyc
15+
*.egg-info
16+
*.isorted
17+
build
18+
dist
19+
_build
20+
git_version.py
21+
__pycache__
22+
faider-build
23+
pip-wheel-metadata
24+
azkaban_templates/development_config.yaml
25+
test_report.xml
26+
.di-build-cache/
27+
di-build-tools/

README.md

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
# OpenCTI STIX2 Attribution Tools
2+
3+
4+
## Description
5+
6+
The model is a multi-label classifier that predicts an Intrusion Set based on an Incident. It was trained using Naive Bayes, and both the training and validation sets were generated.
7+
8+
### How to call parser to convert incident from `json` format to `string`
9+
```python
10+
from attribution_tools import parsers
11+
parsers.incident_json_to_str(incident_json: dict) -> str
12+
```
13+
### How to retrain the model
14+
```python
15+
from attribution_tools.train_attribution_model import TrainingAttributionToolsModel
16+
trained_values = TrainingAttributionToolsModel(intrusion_sets_data: list, database_version: string)
17+
trained_values.retrain_model() -> (model, f1_score, incremented_database_version)
18+
```
19+
### Format of intrusion set
20+
The value of `intrusion_sets_data` is a list of dictionaries, where each dictionary is a JSON representation of an intrusion set. The `database_version` has a default value of `"(0, 0, 1)"`, which is used if no value is provided. The database version helps to track the model version and to check that the correct model version is used.
21+
### Retraining return value
22+
The retraining module returns a tuple, where the first value is the model and the second one is the F1 score. The F1 score is a value between 0 and 1; the closer the value is to 1, the more accurate the model is.
23+
### How to call predict function
24+
The value of `model` is `None` by default; if no value is provided, the default model shipped with the repository will be used.
25+
```python
26+
from attribution_tools.attribution_model import AttributionToolsModel
27+
attribution_tools_model = AttributionToolsModel(model_value -> trained_values[0], database_version -> trained_values[2])
28+
attribution_tools_model.predict(incident_str: str) -> json
29+
```
30+
### Format of prediction results
31+
```python
32+
{"label": {"labels": [str_intrusion-set, str_intrusion-set, str_intrusion-set], "probas": [double, double, double]}, "db_version": str}
33+
```
34+
In case of error, the `label` will take values:
35+
* `-1` in case of input parameter error;
36+
* `-2` if model is `None`;
37+
* `-3` in case of an unexpected exception.
38+
When everything passes successfully, a string with the top 3 intrusion sets and their probabilities is returned.
39+
### Example of returned value
40+
```
41+
{"label": {'labels': ['Aggah_intrusion-set--088d7359-97fb-591b-aeed-be46caf1027d', 'Kippis_intrusion-set--088d7359-2332-591b-aeed-be83caf1027d', 'UNC2891_intrusion-set--6520a731-fa8a-5232-ba9f-8e0bff785ad6'], 'probas': [0.9585474768119115, 0.04145252318808973, 0.03145252318808973]}, "db_version": "(0, 0, 1)"}
42+
```
43+
44+
### CC-Driver
45+
This package was developed as a part of [CC-Driver project](https://www.ccdriver-h2020.com/), funded by the European Union’s Horizon 2020 Research and Innovation Programme under Grant Agreement No. 883543

attribution_tools/__init__.py

Whitespace-only changes.
Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,86 @@
1+
"""
2+
Implementation of the Attribution Model
3+
"""
4+
5+
import json
6+
import logging
7+
from pathlib import Path, PurePath
8+
9+
import dill as pickle
10+
import pandas as pd
11+
12+
META_DATA_FILENAME = "meta_data.json"
MODEL_PATH_LOCATION = "data"
MODEL_NAME = "model.pickle"
DATABASE_VERSION = "(0, 0, 1)"
# Number of top-scoring intrusion sets returned by predict().
TOP_N = 3


class AttributionToolsModel:
    """The class used for initializing the model and making the predictions."""

    def __init__(
        self,
        model=None,
        database_version=DATABASE_VERSION,
        initial_path=None,
        model_name=MODEL_NAME,
        meta_file_name=META_DATA_FILENAME,
    ):
        """Initialize the model wrapper.

        :param model: pre-trained classifier exposing ``predict_proba`` and
            ``classes_``; when ``None``, the model is lazily loaded from disk
            on the first call to :meth:`predict`.
        :param database_version: version string attached to every prediction.
        :param initial_path: directory holding the pickled model and its meta
            data; defaults to the package-local ``data`` directory.
        :param model_name: file name of the pickled model.
        :param meta_file_name: file name of the JSON meta data.
        """
        self.log = logging.getLogger(__name__)
        self.db_version = database_version
        if model is None:
            if initial_path is None:
                initial_path = PurePath(Path(__file__).parent.resolve(), MODEL_PATH_LOCATION)
            self.log.info("Model is stored in %r", initial_path)
            self.meta_data_path_location = PurePath(initial_path, meta_file_name)
            self.model_path_location = PurePath(initial_path, model_name)
            self.model = None
        else:
            # NOTE(review): when a model is injected, the *_path_location
            # attributes are never set, so load_files() must not be called
            # on such an instance.
            self.model = model

    def predict(self, incident_str):
        """Predict the intrusion set based on an incident string.

        :param incident_str: incident serialized as a non-empty string.
        :returns: ``{"label": ..., "db_version": ...}`` where ``label`` is a
            dict with the ``TOP_N`` most probable ``labels`` and ``probas``,
            or an error code: ``-1`` for invalid input, ``-2`` when no model
            could be loaded, ``-3`` on an unexpected exception.
        """
        if not isinstance(incident_str, str) or not incident_str:
            return {"label": -1, "db_version": self.db_version}
        try:
            if self.model is None:
                self.model = self.load_files()
            if self.model is None:
                return {"label": -2, "db_version": self.db_version}
            y_test_pred = self.model.predict_proba([incident_str])
            df_pred = pd.DataFrame(data={"labels": self.model.classes_, "probas": y_test_pred[0]})
            label_val = df_pred.sort_values(by=["probas"], ascending=False).head(TOP_N).to_dict("list")
            return {"label": label_val, "db_version": self.db_version}
        except Exception as exception:
            self.log.warning("The exception happened and the score can not be predicted for %r", incident_str)
            self.log.exception(exception)
            return {"label": -3, "db_version": self.db_version}

    def load_files(self):
        """Load the model and its meta data from disk.

        :returns: the unpickled model, or ``None`` when loading failed.
        """
        try:
            # Only refresh the version from meta data while still on the
            # default version; a caller-supplied version is kept as-is.
            if self.db_version == DATABASE_VERSION:
                with open(self.meta_data_path_location, "rb") as meta_file:
                    meta_data = json.load(meta_file)
                self.log.info("The model meta data downloaded from %s: ", self.meta_data_path_location)
                self.db_version = meta_data["db_version"]
                self.log.info(
                    "The model version is %s, the meta data creation time is %s: ",
                    meta_data["db_version"],
                    meta_data["time_metadata_created"],
                )
        except Exception as exception:
            self.log.warning("The exception happened and the json file can not be loaded")
            self.log.exception(exception)

        model = None
        try:
            # NOTE: unpickling is only safe for trusted, package-shipped files.
            with open(self.model_path_location, "rb") as model_file:
                model = pickle.load(model_file)
            self.log.info("The pickle file with model was loaded from %r location", self.model_path_location)
        except Exception as exception:
            self.log.warning("The exception happened and the pickle file can not be loaded")
            self.log.exception(exception)
        # Bug fix: the original returned from a ``finally`` block, which would
        # swallow any exception raised inside the ``try`` (flake8-bugbear B012).
        return model

attribution_tools/generator.py

Lines changed: 112 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,112 @@
1+
"""The module used to genetate incidents data based on intrusion sets"""
2+
import logging
3+
import math
4+
import random
5+
from math import ceil
6+
from typing import List
7+
8+
import numpy
9+
from scipy.stats import betabinom
10+
from scipy.stats.sampling import DiscreteAliasUrn
11+
12+
from attribution_tools.parsers import AttackPattern, IntrusionSet, Malware, Tool
13+
14+
logger = logging.getLogger()
15+
logger.setLevel(logging.ERROR)
16+
17+
18+
def generate_incident_size(lbound: int, ubound: int) -> int:
    """Draw a random incident size from a beta-binomial distribution.

    The distribution parameters (alpha=1.5, beta=10.0) skew the probability
    mass toward ``lbound``, so small incidents are much more likely than
    large ones.

    :param lbound: inclusive lower bound of the size.
    :param ubound: upper bound of the size.
    :returns: a randomly drawn size in ``[lbound, ubound]``.
    :raises ValueError: if ``ubound`` is not greater than ``lbound``.
    """
    alpha, beta = 1.5, 10.0

    region_size = ceil(ubound - lbound)
    # Bug fix: input validation via ``assert`` is stripped under ``python -O``;
    # raise an explicit error instead.
    if region_size <= 0:
        raise ValueError(f"Wrong bound arguments: {lbound}, {ubound}")

    # NOTE(review): for discrete distributions ppf(0.0) is -1, so this support
    # runs from -1 (pmf 0) up to region_size - 1, and DiscreteAliasUrn samples
    # *indices* 0..region_size into pmf_values -- confirm this matches the
    # intended distribution before changing it.
    percent_point_func = numpy.arange(
        betabinom.ppf(0.0, region_size, alpha, beta), betabinom.ppf(1.0, region_size, alpha, beta)
    )
    random_variable = betabinom(region_size, alpha, beta)

    pmf_values = random_variable.pmf(percent_point_func).tolist()
    generator = DiscreteAliasUrn(pmf_values, random_state=numpy.random.default_rng())

    return (generator.rvs(size=1) + lbound)[0]
34+
35+
36+
class IncidentGenerator:
    """Class used to create an incident based on an intrusion set.

    An incident is a list of semantic ids sampled from the intrusion set's
    related entities, with a fixed budget fraction per entity category.
    """

    # expected size of an incident (lower and upper bounds)
    N_SIZE_MIN, N_SIZE_MAX = 10, 50
    # fraction taken by attack patterns
    FRAC_ATTACK_PATTERN = 0.5
    # fraction taken by tools
    FRAC_TOOLS = 0.2
    # fraction taken by malware
    FRAC_MALWARE = 0.2
    # fraction taken by other elements
    FRAC_OTHER = 0.1

    def generate(self, source: "IntrusionSet") -> list:
        """Generation of the incident content.

        :param source: the intrusion set whose related entities are sampled.
        :returns: list of semantic ids forming the synthetic incident.
        """
        n_size_max = sum(
            [
                len(source.attack_patterns),
                len(source.malwares),
                len(source.tools),
                len(source.indicators),
                len(source.identities),
                len(source.locations),
                len(source.vulnerabilities),
            ]
        )
        # Never target fewer than N_SIZE_MIN entities, even for tiny sets.
        if n_size_max < self.N_SIZE_MIN:
            n_size_max = self.N_SIZE_MIN

        n_size = generate_incident_size(self.N_SIZE_MIN, self.N_SIZE_MAX)
        n_size = min(n_size, n_size_max)

        content = []
        content.extend(self.sample_attack_patterns(source.attack_patterns, n_size))
        content.extend(self.sample_tools(source.tools, n_size))
        content.extend(self.sample_malwares(source.malwares, n_size))
        other_entities = source.indicators + source.vulnerabilities + source.identities + source.locations
        content.extend(self.sample_others(other_entities, n_size))

        return content

    def _sample_with_replacement(self, source: list, max_incident_size: int, fraction: float) -> "List[str]":
        """Shared sampler: draw with replacement, then de-duplicate.

        At most ``ceil(max_incident_size * fraction)`` distinct semantic ids
        are returned; their order is unspecified (set iteration).
        """
        if not source:
            return []
        sample_budget = math.ceil(max_incident_size * fraction)
        selection = set(random.choices(source, k=sample_budget))
        return [item.semantic_id for item in selection]

    def sample_attack_patterns(self, source: "List[AttackPattern]", max_incident_size) -> "List[str]":
        """Creates the sample list of attack patterns."""
        return self._sample_with_replacement(source, max_incident_size, self.FRAC_ATTACK_PATTERN)

    def sample_tools(self, source: "List[Tool]", max_incident_size) -> "List[str]":
        """Creates the sample list of tools."""
        return self._sample_with_replacement(source, max_incident_size, self.FRAC_TOOLS)

    def sample_malwares(self, source: "List[Malware]", max_incident_size) -> "List[str]":
        """Creates the sample list of malwares."""
        return self._sample_with_replacement(source, max_incident_size, self.FRAC_MALWARE)

    def sample_others(self, source: list, max_incident_size) -> "List[str]":
        """Creates the sample list of other STIX2 entities.

        Unlike the category samplers above, this draws *without* replacement
        (``random.sample``), capped at the available number of entities.
        """
        if not source:
            return []
        n_max_others = math.ceil(max_incident_size * self.FRAC_OTHER)
        selection = random.sample(source, min(len(source), n_max_others))
        return [item.semantic_id for item in selection]

0 commit comments

Comments
 (0)