Skip to content

Commit 0d8417b

Browse files
Add files for open source release
0 parents  commit 0d8417b

22 files changed

Lines changed: 2891 additions & 0 deletions

.gitignore

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
.coverage
2+
.eggs
3+
.pytest_cache
4+
.test_venv
5+
.vscode
6+
.idea
7+
.venv
8+
.tests-venv
9+
.python-version
10+
.cid
11+
.cvol
12+
.DS_Store
13+
*~
14+
*.pyc
15+
*.egg-info
16+
*.isorted
17+
build
18+
dist
19+
_build
20+
git_version.py
21+
__pycache__
22+
faider-build
23+
pip-wheel-metadata
24+
azkaban_templates/development_config.yaml
25+
test_report.xml
26+
.di-build-cache/
27+
di-build-tools/

README.md

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
# OpenCTI STIX2 Attribution Tools
2+
3+
4+
## Description
5+
6+
The model is a multi-label classifier that predicts an Intrusion Set based on an Incident. It was trained using Naive Bayes, and both the training and validation sets were generated.
7+
8+
### How to call parser to convert incident from `json` format to `string`
9+
```python
10+
from attribution_tools import parsers
11+
parsers.incident_json_to_str(incident_json: dict) -> str
12+
```
13+
### How to retrain the model
14+
```python
15+
from attribution_tools.train_attribution_model import TrainingAttributionToolsModel
16+
trained_values = TrainingAttributionToolsModel(intrusion_sets_data: list, database_version: string)
17+
trained_values.retrain_model() -> (model, f1_score, incremented_database_version)
18+
```
19+
### Format of intrusion set
20+
The value of `intrusion_sets_data` is a list of dictionaries, where each dictionary is a JSON representation of an intrusion set. The `database_version` has a default value of `"(0, 0, 1)"`, which is used if no value is provided. The database version helps to track the model version and to check that the correct model version is used.
21+
### Retraining return value
22+
The retraining module returns a tuple, where the first value is the model and the second one is the F1 score. The F1 score is a value between 0 and 1; the closer the value is to 1, the more accurate the model is.
23+
### How to call predict function
24+
The value of `model` is `None` by default; if no value is provided, the default model shipped with the repository will be used.
25+
```python
26+
from attribution_tools.attribution_model import AttributionToolsModel
27+
attribution_tools_model = AttributionToolsModel(model_value -> trained_values[0], database_version -> trained_values[2])
28+
attribution_tools_model.predict(incident_str: str) -> json
29+
```
30+
### Format of prediction results
31+
```python
32+
{"label": {"labels": [str_intrusion-set, str_intrusion-set, str_intrusion-set], "probas": [double, double, double]}, "db_version": str}
33+
```
34+
In case of error, the `label` will take values:
35+
* `-1` in case of input parameter error;
36+
* `-2` if model is `None`;
37+
* `-3` in case of an unexpected exception.
38+
When everything passes successfully, a string with the top 3 intrusion sets and their probabilities is returned.
39+
### Example of returned value
40+
```
41+
{"label": {'labels': ['Aggah_intrusion-set--088d7359-97fb-591b-aeed-be46caf1027d', 'Kippis_intrusion-set--088d7359-2332-591b-aeed-be83caf1027d', 'UNC2891_intrusion-set--6520a731-fa8a-5232-ba9f-8e0bff785ad6'], 'probas': [0.9585474768119115, 0.04145252318808973, 0.03145252318808973]}, "db_version": "(0, 0, 1)"}
42+
```
43+
44+
### CC-Driver
45+
This package was developed as a part of [CC-Driver project](https://www.ccdriver-h2020.com/), funded by the European Union’s Horizon 2020 Research and Innovation Programme under Grant Agreement No. 883543

attribution_tools/__init__.py

Whitespace-only changes.
Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,86 @@
1+
"""
2+
Implementation of the Attribution Model
3+
"""
4+
5+
import json
6+
import logging
7+
from pathlib import Path, PurePath
8+
9+
import dill as pickle
10+
import pandas as pd
11+
12+
META_DATA_FILENAME = "meta_data.json"
MODEL_PATH_LOCATION = "data"
MODEL_NAME = "model.pickle"
DATABASE_VERSION = "(0, 0, 1)"
# Number of top-scoring intrusion sets returned by predict().
TOP_N = 3


class AttributionToolsModel:
    """The class used for initializing the model and making the predictions."""

    def __init__(
        self,
        model=None,
        database_version=DATABASE_VERSION,
        initial_path=None,
        model_name=MODEL_NAME,
        meta_file_name=META_DATA_FILENAME,
    ):
        """Initialize the model wrapper.

        :param model: pre-trained classifier exposing ``predict_proba`` and
            ``classes_``; when ``None``, the model is lazily loaded from disk
            on the first call to :meth:`predict`.
        :param database_version: version string attached to every prediction.
        :param initial_path: directory holding the pickled model and its meta
            data; defaults to the package-local ``data`` directory.
        :param model_name: file name of the pickled model.
        :param meta_file_name: file name of the JSON meta data.
        """
        self.log = logging.getLogger(__name__)
        self.db_version = database_version
        if model is None:
            if initial_path is None:
                initial_path = PurePath(Path(__file__).parent.resolve(), MODEL_PATH_LOCATION)
            self.log.info("Model is stored in %r", initial_path)
            self.meta_data_path_location = PurePath(initial_path, meta_file_name)
            self.model_path_location = PurePath(initial_path, model_name)
            self.model = None
        else:
            # NOTE(review): when a model is injected, the *_path_location
            # attributes are never set, so load_files() must not be called
            # on such an instance.
            self.model = model

    def predict(self, incident_str):
        """Predict the intrusion set based on an incident string.

        :param incident_str: incident serialized as a non-empty string.
        :returns: ``{"label": ..., "db_version": ...}`` where ``label`` is a
            dict with the ``TOP_N`` most probable ``labels`` and ``probas``,
            or an error code: ``-1`` for invalid input, ``-2`` when no model
            could be loaded, ``-3`` on an unexpected exception.
        """
        if not isinstance(incident_str, str) or not incident_str:
            return {"label": -1, "db_version": self.db_version}
        try:
            if self.model is None:
                self.model = self.load_files()
            if self.model is None:
                return {"label": -2, "db_version": self.db_version}
            y_test_pred = self.model.predict_proba([incident_str])
            df_pred = pd.DataFrame(data={"labels": self.model.classes_, "probas": y_test_pred[0]})
            label_val = df_pred.sort_values(by=["probas"], ascending=False).head(TOP_N).to_dict("list")
            return {"label": label_val, "db_version": self.db_version}
        except Exception as exception:
            self.log.warning("The exception happened and the score can not be predicted for %r", incident_str)
            self.log.exception(exception)
            return {"label": -3, "db_version": self.db_version}

    def load_files(self):
        """Load the model and its meta data from disk.

        :returns: the unpickled model, or ``None`` when loading failed.
        """
        try:
            # Only refresh the version from meta data while still on the
            # default version; a caller-supplied version is kept as-is.
            if self.db_version == DATABASE_VERSION:
                with open(self.meta_data_path_location, "rb") as meta_file:
                    meta_data = json.load(meta_file)
                self.log.info("The model meta data downloaded from %s: ", self.meta_data_path_location)
                self.db_version = meta_data["db_version"]
                self.log.info(
                    "The model version is %s, the meta data creation time is %s: ",
                    meta_data["db_version"],
                    meta_data["time_metadata_created"],
                )
        except Exception as exception:
            self.log.warning("The exception happened and the json file can not be loaded")
            self.log.exception(exception)

        model = None
        try:
            # NOTE: unpickling is only safe for trusted, package-shipped files.
            with open(self.model_path_location, "rb") as model_file:
                model = pickle.load(model_file)
            self.log.info("The pickle file with model was loaded from %r location", self.model_path_location)
        except Exception as exception:
            self.log.warning("The exception happened and the pickle file can not be loaded")
            self.log.exception(exception)
        # Bug fix: the original returned from a ``finally`` block, which would
        # swallow any exception raised inside the ``try`` (flake8-bugbear B012).
        return model

attribution_tools/generator.py

Lines changed: 112 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,112 @@
1+
"""The module used to genetate incidents data based on intrusion sets"""
2+
import logging
3+
import math
4+
import random
5+
from math import ceil
6+
from typing import List
7+
8+
import numpy
9+
from scipy.stats import betabinom
10+
from scipy.stats.sampling import DiscreteAliasUrn
11+
12+
from attribution_tools.parsers import AttackPattern, IntrusionSet, Malware, Tool
13+
14+
logger = logging.getLogger()
15+
logger.setLevel(logging.ERROR)
16+
17+
18+
def generate_incident_size(lbound: int, ubound: int) -> int:
    """Draw a random incident size from a beta-binomial distribution.

    The distribution parameters (alpha=1.5, beta=10.0) skew the probability
    mass toward ``lbound``, so small incidents are much more likely than
    large ones.

    :param lbound: inclusive lower bound of the size.
    :param ubound: upper bound of the size.
    :returns: a randomly drawn size in ``[lbound, ubound]``.
    :raises ValueError: if ``ubound`` is not greater than ``lbound``.
    """
    alpha, beta = 1.5, 10.0

    region_size = ceil(ubound - lbound)
    # Bug fix: input validation via ``assert`` is stripped under ``python -O``;
    # raise an explicit error instead.
    if region_size <= 0:
        raise ValueError(f"Wrong bound arguments: {lbound}, {ubound}")

    # NOTE(review): for discrete distributions ppf(0.0) is -1, so this support
    # runs from -1 (pmf 0) up to region_size - 1, and DiscreteAliasUrn samples
    # *indices* 0..region_size into pmf_values -- confirm this matches the
    # intended distribution before changing it.
    percent_point_func = numpy.arange(
        betabinom.ppf(0.0, region_size, alpha, beta), betabinom.ppf(1.0, region_size, alpha, beta)
    )
    random_variable = betabinom(region_size, alpha, beta)

    pmf_values = random_variable.pmf(percent_point_func).tolist()
    generator = DiscreteAliasUrn(pmf_values, random_state=numpy.random.default_rng())

    return (generator.rvs(size=1) + lbound)[0]
34+
35+
36+
class IncidentGenerator:
    """Class used to create an incident based on an intrusion set.

    An incident is a list of semantic ids sampled from the intrusion set's
    related entities, with a fixed budget fraction per entity category.
    """

    # expected size of an incident (lower and upper bounds)
    N_SIZE_MIN, N_SIZE_MAX = 10, 50
    # fraction taken by attack patterns
    FRAC_ATTACK_PATTERN = 0.5
    # fraction taken by tools
    FRAC_TOOLS = 0.2
    # fraction taken by malware
    FRAC_MALWARE = 0.2
    # fraction taken by other elements
    FRAC_OTHER = 0.1

    def generate(self, source: "IntrusionSet") -> list:
        """Generation of the incident content.

        :param source: the intrusion set whose related entities are sampled.
        :returns: list of semantic ids forming the synthetic incident.
        """
        n_size_max = sum(
            [
                len(source.attack_patterns),
                len(source.malwares),
                len(source.tools),
                len(source.indicators),
                len(source.identities),
                len(source.locations),
                len(source.vulnerabilities),
            ]
        )
        # Never target fewer than N_SIZE_MIN entities, even for tiny sets.
        if n_size_max < self.N_SIZE_MIN:
            n_size_max = self.N_SIZE_MIN

        n_size = generate_incident_size(self.N_SIZE_MIN, self.N_SIZE_MAX)
        n_size = min(n_size, n_size_max)

        content = []
        content.extend(self.sample_attack_patterns(source.attack_patterns, n_size))
        content.extend(self.sample_tools(source.tools, n_size))
        content.extend(self.sample_malwares(source.malwares, n_size))
        other_entities = source.indicators + source.vulnerabilities + source.identities + source.locations
        content.extend(self.sample_others(other_entities, n_size))

        return content

    def _sample_with_replacement(self, source: list, max_incident_size: int, fraction: float) -> "List[str]":
        """Shared sampler: draw with replacement, then de-duplicate.

        At most ``ceil(max_incident_size * fraction)`` distinct semantic ids
        are returned; their order is unspecified (set iteration).
        """
        if not source:
            return []
        sample_budget = math.ceil(max_incident_size * fraction)
        selection = set(random.choices(source, k=sample_budget))
        return [item.semantic_id for item in selection]

    def sample_attack_patterns(self, source: "List[AttackPattern]", max_incident_size) -> "List[str]":
        """Creates the sample list of attack patterns."""
        return self._sample_with_replacement(source, max_incident_size, self.FRAC_ATTACK_PATTERN)

    def sample_tools(self, source: "List[Tool]", max_incident_size) -> "List[str]":
        """Creates the sample list of tools."""
        return self._sample_with_replacement(source, max_incident_size, self.FRAC_TOOLS)

    def sample_malwares(self, source: "List[Malware]", max_incident_size) -> "List[str]":
        """Creates the sample list of malwares."""
        return self._sample_with_replacement(source, max_incident_size, self.FRAC_MALWARE)

    def sample_others(self, source: list, max_incident_size) -> "List[str]":
        """Creates the sample list of other STIX2 entities.

        Unlike the category samplers above, this draws *without* replacement
        (``random.sample``), capped at the available number of entities.
        """
        if not source:
            return []
        n_max_others = math.ceil(max_incident_size * self.FRAC_OTHER)
        selection = random.sample(source, min(len(source), n_max_others))
        return [item.semantic_id for item in selection]

0 commit comments

Comments
 (0)