SYN flood scenario (krkn-chaos#668)

tsebastiani · web-flow · commit e02c6d128761 · 2024-07-29T15:31:37.000-04:00
* scenario config file

Signed-off-by: Tullio Sebastiani &lt;tsebasti@redhat.com&gt;

* syn flood plugin

Signed-off-by: Tullio Sebastiani &lt;tsebasti@redhat.com&gt;

* run_krkn.py updated

Signed-off-by: Tullio Sebastiani &lt;tsebasti@redhat.com&gt;

* requirements.txt + documentation + config.yaml

* set node selector defaults to worker

Signed-off-by: Tullio Sebastiani &lt;tsebasti@redhat.com&gt;

---------

Signed-off-by: Tullio Sebastiani &lt;tsebasti@redhat.com&gt;
diff --git a/README.md b/README.md
@@ -64,6 +64,7 @@ Scenario type               | Kubernetes
 [Network_Chaos](docs/network_chaos.md) | :heavy_check_mark: |
 [ManagedCluster Scenarios](docs/managedcluster_scenarios.md) | :heavy_check_mark: |
 [Service Hijacking Scenarios](docs/service_hijacking_scenarios.md) | :heavy_check_mark: |
+[SYN Flood Scenarios](docs/syn_flood_scenarios.md) | :heavy_check_mark: |
 
 
 ### Kraken scenario pass/fail criteria and report
diff --git a/config/config.yaml b/config/config.yaml
@@ -44,6 +44,8 @@ kraken:
             - scenarios/openshift/network_chaos.yaml
         - service_hijacking:
               - scenarios/kube/service_hijacking.yaml
+        - syn_flood:
+              - scenarios/kube/syn_flood.yaml
 
 cerberus:
     cerberus_enabled: False                                # Enable it when cerberus is previously installed
diff --git a/docs/syn_flood_scenarios.md b/docs/syn_flood_scenarios.md
@@ -0,0 +1,33 @@
+### SYN Flood Scenarios
+
+This scenario generates a substantial amount of TCP traffic directed at one or more Kubernetes services within 
+the cluster to test the server's resiliency under extreme traffic conditions. 
+It can also target hosts outside the cluster by specifying a reachable IP address or hostname. 
+This scenario leverages the distributed nature of Kubernetes clusters to instantiate multiple instances 
+of the same pod against a single host, significantly increasing the effectiveness of the attack. 
+The configuration also allows for the specification of multiple node selectors, enabling Kubernetes to schedule 
+the attacker pods on a user-defined subset of nodes to make the test more realistic.
+
+ ```yaml
+packet-size: 120 # hping3 packet size
+window-size: 64 # hping3 TCP window size
+duration: 10 # chaos scenario duration
+namespace: default # namespace where the target service(s) are deployed
+target-service: target-svc # target service name (if set target-service-label must be empty)
+target-port: 80 # target service TCP port
+target-service-label : "" # target service label, can be used to target multiple services at the same time
+                          # if they have the same label set (if set target-service must be empty)
+number-of-pods: 2 # number of attacker pods instantiated for each target
+image: quay.io/krkn-chaos/krkn-syn-flood # syn flood attacker container image
+attacker-nodes: # sets the node affinity used to schedule the attacker pods. Multiple values can be specified
+                # for each node label selector, so the kube scheduler can place the attacker pods
+                # in the best way possible based on the provided labels. Multiple labels can be specified
+  kubernetes.io/hostname:
+    - host_1
+    - host_2
+  kubernetes.io/os:
+    - linux
+
+ ```
+
+The attacker container source code is available [here](https://github.com/krkn-chaos/krkn-syn-flood).
diff --git a/kraken/syn_flood/__init__.py b/kraken/syn_flood/__init__.py
@@ -0,0 +1 @@
+from .syn_flood import *
diff --git a/kraken/syn_flood/syn_flood.py b/kraken/syn_flood/syn_flood.py
@@ -0,0 +1,132 @@
+import logging
+import os.path
+import time
+from typing import List
+
+import krkn_lib.utils
+import yaml
+from krkn_lib.k8s import KrknKubernetes
+from krkn_lib.models.telemetry import ScenarioTelemetry
+from krkn_lib.telemetry.k8s import KrknTelemetryKubernetes
+
+
+def run(scenarios_list: list[str], krkn_kubernetes: KrknKubernetes, telemetry: KrknTelemetryKubernetes) -> tuple[list[str], list[ScenarioTelemetry]]:
+    """Run every SYN flood scenario file and collect one telemetry entry per scenario.
+
+    For each scenario the config file is parsed, the target services are
+    resolved (by label when ``target-service-label`` is set, otherwise the single
+    ``target-service``), ``number-of-pods`` attacker pods are deployed against
+    each target, and the call blocks until none of the deployed pods is
+    reported as running.
+
+    Any exception raised while handling a scenario is logged, the scenario is
+    added to the failed list and its telemetry gets ``exit_status = 1``; the
+    remaining scenarios are still executed.
+
+    Args:
+        scenarios_list: paths of the scenario config files to run.
+        krkn_kubernetes: client used to look up services, deploy the attacker
+            pods and poll their state.
+        telemetry: object whose ``set_parameters_base64`` is called for each
+            scenario telemetry.
+
+    Returns:
+        A tuple ``(failed_post_scenarios, scenario_telemetries)``.
+    """
+    scenario_telemetries: list[ScenarioTelemetry] = []
+    failed_post_scenarios = []
+    for scenario in scenarios_list:
+        scenario_telemetry = ScenarioTelemetry()
+        scenario_telemetry.scenario = scenario
+        scenario_telemetry.start_timestamp = time.time()
+        telemetry.set_parameters_base64(scenario_telemetry, scenario)
+
+        try:
+            pod_names = []
+            config = parse_config(scenario)
+            if config["target-service-label"]:
+                target_services = krkn_kubernetes.select_service_by_label(config["namespace"], config["target-service-label"])
+            else:
+                target_services = [config["target-service"]]
+
+            # Without this guard a label matching no service would deploy nothing
+            # and the scenario would be reported as a success.
+            if not target_services:
+                raise Exception(
+                    f"no service found in namespace {config['namespace']} "
+                    f"matching label {config['target-service-label']}"
+                )
+
+            for target in target_services:
+                if not krkn_kubernetes.service_exists(target, config["namespace"]):
+                    raise Exception(f"{target} service not found")
+                for _ in range(config["number-of-pods"]):
+                    pod_name = "syn-flood-" + krkn_lib.utils.get_random_string(10)
+                    krkn_kubernetes.deploy_syn_flood(pod_name,
+                                                     config["namespace"],
+                                                     config["image"],
+                                                     target,
+                                                     config["target-port"],
+                                                     config["packet-size"],
+                                                     config["window-size"],
+                                                     config["duration"],
+                                                     config["attacker-nodes"]
+                                                     )
+                    pod_names.append(pod_name)
+
+            logging.info("waiting all the attackers to finish:")
+            # Poll only the pods that are still pending. Looping on the pending
+            # set (instead of a flag set inside the per-pod loop) guarantees
+            # termination even when no pod was deployed.
+            # NOTE(review): a pod counts as finished as soon as is_pod_running
+            # returns False; presumably that also covers pods that have not
+            # started yet - confirm against krkn-lib.
+            pending_pods = set(pod_names)
+            while pending_pods:
+                pending_pods = {
+                    pod_name for pod_name in pending_pods
+                    if krkn_kubernetes.is_pod_running(pod_name, config["namespace"])
+                }
+                if pending_pods:
+                    time.sleep(1)
+
+        except Exception as e:
+            logging.error(f"Failed to run syn flood scenario {scenario}: {e}")
+            failed_post_scenarios.append(scenario)
+            scenario_telemetry.exit_status = 1
+        else:
+            scenario_telemetry.exit_status = 0
+        scenario_telemetry.end_timestamp = time.time()
+        scenario_telemetries.append(scenario_telemetry)
+    return failed_post_scenarios, scenario_telemetries
+
+def parse_config(scenario_file: str) -> dict[str,any]:
+    """Load and validate a SYN flood scenario file.
+
+    Args:
+        scenario_file: path of the YAML scenario file.
+
+    Returns:
+        The parsed configuration mapping. ``attacker-nodes`` is always present
+        in the result: when the file omits it, it defaults to ``{}``.
+
+    Raises:
+        Exception: if the file does not exist, is not valid YAML, is not a
+            mapping, misses a required parameter, sets both or neither of
+            ``target-service`` / ``target-service-label``, or has a malformed
+            ``attacker-nodes`` value.
+    """
+    if not os.path.exists(scenario_file):
+        raise Exception(f"failed to load scenario file {scenario_file}")
+
+    try:
+        with open(scenario_file) as stream:
+            config = yaml.safe_load(stream)
+    except Exception as e:
+        # keep the parser error as the cause so the real problem stays visible
+        raise Exception(f"{scenario_file} is not a valid yaml file") from e
+
+    # An empty document parses to None and a scalar/list document is not a
+    # mapping: both would otherwise fail below with an unrelated TypeError.
+    if not isinstance(config, dict):
+        raise Exception(f"{scenario_file} must contain a yaml mapping of scenario parameters")
+
+    # parameters that must be present AND carry a non-empty value
+    valued_keys = (
+        "packet-size",
+        "window-size",
+        "duration",
+        "namespace",
+        "number-of-pods",
+        "target-port",
+        "image",
+    )
+    # parameters that must be present but may be empty (exactly one is filled)
+    present_keys = ("target-service", "target-service-label")
+
+    missing = [key for key in valued_keys if not check_key_value(config, key)]
+    missing.extend(key for key in present_keys if key not in config)
+
+    if missing:
+        raise Exception(f"{(',').join(missing)} parameter(s) are missing")
+
+    if not config["target-service"] and not config["target-service-label"]:
+        raise Exception("you have either to set a target service or a label")
+    if config["target-service"] and config["target-service-label"]:
+        raise Exception("you cannot select both target-service and target-service-label")
+
+    # attacker-nodes is optional. The previous check tested the truthiness of
+    # the string literal 'attacker-nodes' (always True), so a file without the
+    # key crashed with a bare KeyError. Default to an empty affinity instead,
+    # so consumers can always read config["attacker-nodes"].
+    if "attacker-nodes" not in config:
+        config["attacker-nodes"] = {}
+    elif not is_node_affinity_correct(config["attacker-nodes"]):
+        raise Exception("attacker-nodes format is not correct")
+    return config
+
+def check_key_value(dictionary, key):
+    """Tell whether ``key`` is set in ``dictionary`` to something other than ``None`` or ``''``."""
+    if key not in dictionary:
+        return False
+    entry = dictionary[key]
+    return entry is not None and bool(entry != '')
+
+def is_node_affinity_correct(obj) -> bool:
+    """Check that ``obj`` is a dict whose keys are strings and whose values are lists."""
+    if not isinstance(obj, dict):
+        return False
+    return all(
+        isinstance(label, str) and isinstance(values, list)
+        for label, values in obj.items()
+    )
+
+
+
+
diff --git a/requirements.txt b/requirements.txt
@@ -15,7 +15,7 @@ google-api-python-client==2.116.0
 ibm_cloud_sdk_core==3.18.0
 ibm_vpc==0.20.0
 jinja2==3.1.4
-krkn-lib==2.1.6
+krkn-lib==2.1.7
 lxml==5.1.0
 kubernetes==28.1.0
 oauth2client==4.1.3
diff --git a/run_kraken.py b/run_kraken.py
@@ -27,7 +27,7 @@
 import kraken.prometheus as prometheus_plugin
 import kraken.service_hijacking.service_hijacking as service_hijacking_plugin
 import server as server
-from kraken import plugins
+from kraken import plugins, syn_flood
 from krkn_lib.k8s import KrknKubernetes
 from krkn_lib.ocp import KrknOpenshift
 from krkn_lib.telemetry.elastic import KrknElastic
@@ -354,6 +354,10 @@ def main(cfg):
                             logging.info("Running Service Hijacking Chaos")
                             failed_post_scenarios, scenario_telemetries = service_hijacking_plugin.run(scenarios_list, wait_duration, kubecli, telemetry_k8s)
                             chaos_telemetry.scenarios.extend(scenario_telemetries)
+                        elif scenario_type == "syn_flood":
+                            logging.info("Running Syn Flood Chaos")
+                            failed_post_scenarios, scenario_telemetries = syn_flood.run(scenarios_list, kubecli, telemetry_k8s)
+                            chaos_telemetry.scenarios.extend(scenario_telemetries)
 
                         # Check for critical alerts when enabled
                         post_critical_alerts = 0
diff --git a/scenarios/kube/syn_flood.yaml b/scenarios/kube/syn_flood.yaml
@@ -0,0 +1,16 @@
+packet-size: 120 # hping3 packet size
+window-size: 64 # hping3 TCP window size
+duration: 10 # chaos scenario duration
+namespace: default # namespace where the target service(s) are deployed
+target-service: elasticsearch # target service name (if set target-service-label must be empty)
+target-port: 9200 # target service TCP port
+target-service-label : "" # target service label, can be used to target multiple services at the same time
+                          # if they have the same label set (if set target-service must be empty)
+number-of-pods: 2 # number of attacker pods instantiated for each target
+image: quay.io/krkn-chaos/krkn-syn-flood:v1.0.0 # syn flood attacker container image
+attacker-nodes:                       # sets the node affinity used to schedule the attacker pods. Multiple values can be
+    node-role.kubernetes.io/worker:   # specified for each node label selector, so the kube scheduler can place the attacker pods
+      - ""                            # in the best way possible based on the provided labels. Multiple labels can be specified.
+                                      # Set an empty value `attacker-nodes: {}` to let Kubernetes schedule the pods
+
+