From 3070c023f49abc6fce93af36a6740cd8034a09ca Mon Sep 17 00:00:00 2001
From: Sebas Garcia
Date: Mon, 29 Jul 2024 16:32:14 +0200
Subject: [PATCH 001/498] mlflow. Add a function to convert the state again

---
 modules/flowmldetection/flowmldetection.py | 155 ++++++++++++++++++++-
 1 file changed, 152 insertions(+), 3 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index b3e98c515..124ec61f9 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -5,6 +5,7 @@
 import json
 import datetime
 import traceback
+import sys
 
 from slips_files.common.imports import *
 from slips_files.core.evidence_structure.evidence import (
@@ -112,6 +113,141 @@ def train(self):
         except Exception:
             self.print("Error in train()", 0, 1)
             self.print(traceback.format_exc(), 0, 1)
+
+    def get_final_state_from_flags(self, state, pkts):
+        """
+        Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags
+        We receive the pakets to distinguish some Reset connections
+        """
+        try:
+            pre = state.split("_")[0]
+            try:
+                # Try suricata states
+                """
+                There are different states in which a flow can be.
+                Suricata distinguishes three flow-states for TCP and two for UDP. For TCP,
+                these are: New, Established and Closed,for UDP only new and established.
+                For each of these states Suricata can employ different timeouts.
+                """
+                if "new" in state or "established" in state:
+                    return "Established"
+                elif "closed" in state:
+                    return "Not Established"
+
+                # We have varius type of states depending on the type of flow.
+                # For Zeek
+                if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
+                    return "Not Established"
+                elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"):
+                    return "Established"
+
+                # For Argus
+                suf = state.split("_")[1]
+                if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
+                    """
+                    Examples:
+                    SA_SA
+                    SR_SA
+                    FSRA_SA
+                    SPA_SPA
+                    SRA_SPA
+                    FSA_FSA
+                    FSA_FSPA
+                    SAEC_SPA
+                    SRPA_SPA
+                    FSPA_SPA
+                    FSRPA_SPA
+                    FSPA_FSPA
+                    FSRA_FSPA
+                    SRAEC_SPA
+                    FSPA_FSRPA
+                    FSAEC_FSPA
+                    FSRPA_FSPA
+                    SRPAEC_SPA
+                    FSPAEC_FSPA
+                    SRPAEC_FSRPA
+                    """
+                    return "Established"
+                elif "PA" in pre and "PA" in suf:
+                    # Tipical flow that was reported in the middle
+                    """
+                    Examples:
+                    PA_PA
+                    FPA_FPA
+                    """
+                    return "Established"
+                elif "ECO" in pre:
+                    return "ICMP Echo"
+                elif "ECR" in pre:
+                    return "ICMP Reply"
+                elif "URH" in pre:
+                    return "ICMP Host Unreachable"
+                elif "URP" in pre:
+                    return "ICMP Port Unreachable"
+                else:
+                    """
+                    Examples:
+                    S_RA
+                    S_R
+                    A_R
+                    S_SA
+                    SR_SA
+                    FA_FA
+                    SR_RA
+                    SEC_RA
+                    """
+                    return "Not Established"
+            except IndexError:
+                # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP
+                if "ECO" in pre:
+                    # ICMP
+                    return "Established"
+                elif "UNK" in pre:
+                    # ICMP6 unknown upper layer
+                    return "Established"
+                elif "CON" in pre:
+                    # UDP
+                    return "Established"
+                elif "INT" in pre:
+                    # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there
+                    # is no confirmation of what happened.
+                    return "Not Established"
+                elif "EST" in pre:
+                    # TCP
+                    return "Established"
+                elif "RST" in pre:
+                    # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established
+                    # It can happen that is reseted being not established, but we can't tell without -z b.
+                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
+                    return (
+                        "Not Established" if int(pkts) <= 3 else "Established"
+                    )
+                elif "FIN" in pre:
+                    # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established
+                    # It can happen that is finished being not established, but we can't tell without -z b.
+                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
+                    return (
+                        "Not Established" if int(pkts) <= 3 else "Established"
+                    )
+                else:
+                    """
+                    Examples:
+                    S_
+                    FA_
+                    PA_
+                    FSA_
+                    SEC_
+                    SRPA_
+                    """
+                    return "Not Established"
+        except Exception:
+            exception_line = sys.exc_info()[2].tb_lineno
+            self.print(
+                f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}",
+                0,
+                1,
+            )
+            self.print(traceback.format_exc(), 0, 1)
 
     def process_features(self, dataset):
         """
@@ -124,14 +260,18 @@ def process_features(self, dataset):
         for proto in to_discard:
             dataset = dataset[dataset.proto != proto]
 
-        # For now, discard the ports
+        # For now, discard these
         to_drop = [
             "appproto",
             "daddr",
             "saddr",
             "ts",
             "origstate",
-            "flow_type",
+            "type_",
+            "dir_",
+            "history",
+            "dbytes",
+            "dpkts",
             "smac",
             "dmac",
         ]
         for field in to_drop:
             try:
@@ -141,13 +281,22 @@
         except ValueError:
             pass
 
+        # When flows are read from Slips sqlite, the state is not transformed to 'Established' or 'Not Established', it is still 'S0' and others
+        # So transform here
+        #new_state_column = dataset.state.apply(self.get_final_state_from_flags(dataset.state, dataset.pkts))
+        dataset['state'] = dataset.apply(lambda row: self.get_final_state_from_flags(row['state'], row['pkts']), axis=1)
+
+        #dataset.state = new_state_column
+
         # Convert state to categorical
         dataset.state = dataset.state.str.replace(
-            r"(^.*NotEstablished.*$)", "0", regex=True
+            r"(^.*Not Established.*$)", "0", regex=True
         )
         dataset.state = dataset.state.str.replace(
             r"(^.*Established.*$)", "1", regex=True
         )
+
+        # Convert categories to floats
         dataset.state = dataset.state.astype("float64")
 
         # Convert proto to categorical. For now we only have few states, so we can hardcode...

From df6e9196532d0ba050f4922a36ed1d2b1a2638b5 Mon Sep 17 00:00:00 2001
From: alya
Date: Tue, 30 Jul 2024 14:35:21 +0300
Subject: [PATCH 002/498] state_handler: split get_final_state_from_flags()
 into smaller functions

---
 slips_files/common/state_handler.py | 179 ++++++++++++++++++++++
 1 file changed, 179 insertions(+)
 create mode 100644 slips_files/common/state_handler.py

diff --git a/slips_files/common/state_handler.py b/slips_files/common/state_handler.py
new file mode 100644
index 000000000..b671a09a2
--- /dev/null
+++ b/slips_files/common/state_handler.py
@@ -0,0 +1,179 @@
+from typing import Optional
+import sys
+import traceback
+
+
+def check_suricata_states(state) -> Optional[str]:
+    """
+    There are different states in which a flow can be.
+    Suricata distinguishes three flow-states for TCP and two for
+    UDP. For TCP,
+    these are: New, Established and Closed,for UDP only new and
+    established.
+    For each of these states Suricata can employ different timeouts.
+    """
+    if "new" in state or "established" in state:
+        return "Established"
+    elif "closed" in state:
+        return "Not Established"
+
+
+def check_zeek_states(state) -> Optional[str]:
+    # We have varius type of states depending on the type of flow.
+ # For Zeek + if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): + return "Not Established" + elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): + return "Established" + + +def check_argus_states(state) -> Optional[str]: + pre = state.split("_")[0] + suf = state.split("_")[1] + if "S" in pre and "A" in pre and "S" in suf and "A" in suf: + """ + Examples: + SA_SA + SR_SA + FSRA_SA + SPA_SPA + SRA_SPA + FSA_FSA + FSA_FSPA + SAEC_SPA + SRPA_SPA + FSPA_SPA + FSRPA_SPA + FSPA_FSPA + FSRA_FSPA + SRAEC_SPA + FSPA_FSRPA + FSAEC_FSPA + FSRPA_FSPA + SRPAEC_SPA + FSPAEC_FSPA + SRPAEC_FSRPA + """ + return "Established" + elif "PA" in pre and "PA" in suf: + # Tipical flow that was reported in the middle + """ + Examples: + PA_PA + FPA_FPA + """ + return "Established" + elif "ECO" in pre: + return "ICMP Echo" + elif "ECR" in pre: + return "ICMP Reply" + elif "URH" in pre: + return "ICMP Host Unreachable" + elif "URP" in pre: + return "ICMP Port Unreachable" + else: + """ + Examples: + S_RA + S_R + A_R + S_SA + SR_SA + FA_FA + SR_RA + SEC_RA + """ + return "Not Established" + + +def check_tcp_states(state, pkts) -> Optional[str]: + pre = state.split("_")[0] + if "EST" in pre: + # TCP + return "Established" + elif "RST" in pre: + # TCP. When -z B is not used in argus, states are single words. + # Most connections are reseted when finished and therefore are + # established + # It can happen that is reseted being not established, but we + # can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is + # not established because the OS retries 3 times. + return "Not Established" if int(pkts) <= 3 else "Established" + elif "FIN" in pre: + # TCP. When -z B is not used in argus, states are single words. + # Most connections are finished with FIN when finished and + # therefore are established + # It can happen that is finished being not established, but we + # can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is + # not established because the OS retries 3 times. + return "Not Established" if int(pkts) <= 3 else "Established" + else: + """ + Examples: + S_ + FA_ + PA_ + FSA_ + SEC_ + SRPA_ + """ + return "Not Established" + + +def check_udp_states(state) -> Optional[str]: + pre = state.split("_")[0] + if "CON" in pre: + # UDP + return "Established" + elif "INT" in pre: + # UDP trying to connect, NOT preciselly not established but also + # NOT 'Established'. So we considered not established because there + # is no confirmation of what happened. + return "Not Established" + + +def check_icmp_states(state) -> Optional[str]: + pre = state.split("_")[0] + if "ECO" in pre: + # ICMP + return "Established" + elif "UNK" in pre: + # ICMP6 unknown upper layer + return "Established" + + +def get_final_state_from_flags(self, state, pkts) -> str: + """ + Analyze the flags given and return a summary of the state. 
+ Should work with Argus and Bro flags + We receive the pakets to distinguish some Reset connections + """ + try: + if state := check_suricata_states(state): + return state + if state := check_zeek_states(state): + return state + if state := check_argus_states(state): + return state + except IndexError: + # suf does not exist, which means that this is some ICMP or + # no response was sent for UDP or TCP + if state := check_icmp_states(state): + return state + if state := check_udp_states(state): + return state + if state := check_tcp_states(state, pkts): + return state + + return "Not Established" + + except Exception: + exception_line = sys.exc_info()[2].tb_lineno + self.print( + f"Error in get_final_state_from_flags() " f"line {exception_line}", + 0, + 1, + ) + self.print(traceback.format_exc(), 0, 1) From 1497681f7a9b2f7d20a1c1e570646ca3b2c2bdbc Mon Sep 17 00:00:00 2001 From: alya Date: Tue, 30 Jul 2024 14:48:16 +0300 Subject: [PATCH 003/498] state_handler: refactor get_final_state_from_flags() --- slips_files/common/state_handler.py | 67 +++++++++++++---------------- 1 file changed, 29 insertions(+), 38 deletions(-) diff --git a/slips_files/common/state_handler.py b/slips_files/common/state_handler.py index b671a09a2..d0a05115b 100644 --- a/slips_files/common/state_handler.py +++ b/slips_files/common/state_handler.py @@ -1,9 +1,7 @@ from typing import Optional -import sys -import traceback -def check_suricata_states(state) -> Optional[str]: +def interpret_suricata_states(state) -> Optional[str]: """ There are different states in which a flow can be. Suricata distinguishes three flow-states for TCP and two for @@ -18,7 +16,7 @@ def check_suricata_states(state) -> Optional[str]: return "Not Established" -def check_zeek_states(state) -> Optional[str]: +def interpret_zeek_states(state) -> Optional[str]: # We have varius type of states depending on the type of flow. # For Zeek if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): @@ -27,9 +25,13 @@ def check_zeek_states(state) -> Optional[str]: return "Established" -def check_argus_states(state) -> Optional[str]: +def interpret_argus_states(state) -> Optional[str]: pre = state.split("_")[0] - suf = state.split("_")[1] + try: + suf = state.split("_")[1] + except IndexError: + return + if "S" in pre and "A" in pre and "S" in suf and "A" in suf: """ Examples: @@ -86,7 +88,7 @@ def check_argus_states(state) -> Optional[str]: return "Not Established" -def check_tcp_states(state, pkts) -> Optional[str]: +def interpret_tcp_states(state, pkts) -> Optional[str]: pre = state.split("_")[0] if "EST" in pre: # TCP @@ -122,7 +124,7 @@ def check_tcp_states(state, pkts) -> Optional[str]: return "Not Established" -def check_udp_states(state) -> Optional[str]: +def interpret_udp_states(state) -> Optional[str]: pre = state.split("_")[0] if "CON" in pre: # UDP @@ -134,7 +136,7 @@ def check_udp_states(state) -> Optional[str]: return "Not Established" -def check_icmp_states(state) -> Optional[str]: +def interpret_icmp_states(state) -> Optional[str]: pre = state.split("_")[0] if "ECO" in pre: # ICMP @@ -144,36 +146,25 @@ def check_icmp_states(state) -> Optional[str]: return "Established" -def get_final_state_from_flags(self, state, pkts) -> str: +def get_final_state_from_flags(state, pkts) -> str: """ - Analyze the flags given and return a summary of the state. 
- Should work with Argus and Bro flags - We receive the pakets to distinguish some Reset connections + Converts the original flags from the flow, to a state that slips + understands + Works with Argus, suricata, and Bro flags + We receive the packets to distinguish some Reset connections """ - try: - if state := check_suricata_states(state): - return state - if state := check_zeek_states(state): - return state - if state := check_argus_states(state): - return state - except IndexError: - # suf does not exist, which means that this is some ICMP or - # no response was sent for UDP or TCP - if state := check_icmp_states(state): - return state - if state := check_udp_states(state): - return state - if state := check_tcp_states(state, pkts): - return state - return "Not Established" + for interpreter in ( + interpret_suricata_states, + interpret_zeek_states, + interpret_argus_states, + interpret_icmp_states, + interpret_udp_states, + ): + if interpreted_state := interpreter(state): + return interpreted_state + + if interpreted_state := interpret_tcp_states(state, pkts): + return interpreted_state - except Exception: - exception_line = sys.exc_info()[2].tb_lineno - self.print( - f"Error in get_final_state_from_flags() " f"line {exception_line}", - 0, - 1, - ) - self.print(traceback.format_exc(), 0, 1) + return "Not Established" From 2eeb3ceb889625d179e07beb5e01e589d553ccf2 Mon Sep 17 00:00:00 2001 From: alya Date: Tue, 30 Jul 2024 14:59:22 +0300 Subject: [PATCH 004/498] delete get_final_state_from_flags() from flowmldetection, profiler, and the db --- modules/flowmldetection/flowmldetection.py | 169 ++---------------- slips_files/core/database/database_manager.py | 3 - .../core/database/redis_db/profile_handler.py | 169 ++---------------- .../core/database/sqlite_db/database.py | 6 +- 4 files changed, 41 insertions(+), 306 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 124ec61f9..c57a7a358 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -5,9 +5,13 @@ import json import datetime import traceback -import sys +import warnings + -from slips_files.common.imports import * +from slips_files.common.state_handler import get_final_state_from_flags +from slips_files.common.parsers.config_parser import ConfigParser +from slips_files.common.slips_utils import utils +from slips_files.common.abstracts.module import IModule from slips_files.core.evidence_structure.evidence import ( Evidence, ProfileID, @@ -29,8 +33,6 @@ def warn(*args, **kwargs): pass -import warnings - warnings.warn = warn @@ -113,141 +115,6 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) - - def get_final_state_from_flags(self, state, pkts): - """ - Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags - We receive the pakets to distinguish some Reset connections - """ - try: - pre = state.split("_")[0] - try: - # Try suricata states - """ - There are different states in which a flow can be. - Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, - these are: New, Established and Closed,for UDP only new and established. - For each of these states Suricata can employ different timeouts. - """ - if "new" in state or "established" in state: - return "Established" - elif "closed" in state: - return "Not Established" - - # We have varius type of states depending on the type of flow. 
- # For Zeek - if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): - return "Not Established" - elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): - return "Established" - - # For Argus - suf = state.split("_")[1] - if "S" in pre and "A" in pre and "S" in suf and "A" in suf: - """ - Examples: - SA_SA - SR_SA - FSRA_SA - SPA_SPA - SRA_SPA - FSA_FSA - FSA_FSPA - SAEC_SPA - SRPA_SPA - FSPA_SPA - FSRPA_SPA - FSPA_FSPA - FSRA_FSPA - SRAEC_SPA - FSPA_FSRPA - FSAEC_FSPA - FSRPA_FSPA - SRPAEC_SPA - FSPAEC_FSPA - SRPAEC_FSRPA - """ - return "Established" - elif "PA" in pre and "PA" in suf: - # Tipical flow that was reported in the middle - """ - Examples: - PA_PA - FPA_FPA - """ - return "Established" - elif "ECO" in pre: - return "ICMP Echo" - elif "ECR" in pre: - return "ICMP Reply" - elif "URH" in pre: - return "ICMP Host Unreachable" - elif "URP" in pre: - return "ICMP Port Unreachable" - else: - """ - Examples: - S_RA - S_R - A_R - S_SA - SR_SA - FA_FA - SR_RA - SEC_RA - """ - return "Not Established" - except IndexError: - # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP - if "ECO" in pre: - # ICMP - return "Established" - elif "UNK" in pre: - # ICMP6 unknown upper layer - return "Established" - elif "CON" in pre: - # UDP - return "Established" - elif "INT" in pre: - # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there - # is no confirmation of what happened. - return "Not Established" - elif "EST" in pre: - # TCP - return "Established" - elif "RST" in pre: - # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established - # It can happen that is reseted being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. - return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - elif "FIN" in pre: - # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established - # It can happen that is finished being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. 
- return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - else: - """ - Examples: - S_ - FA_ - PA_ - FSA_ - SEC_ - SRPA_ - """ - return "Not Established" - except Exception: - exception_line = sys.exc_info()[2].tb_lineno - self.print( - f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", - 0, - 1, - ) - self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ @@ -281,12 +148,17 @@ def process_features(self, dataset): except ValueError: pass - # When flows are read from Slips sqlite, the state is not transformed to 'Established' or 'Not Established', it is still 'S0' and others + # When flows are read from Slips sqlite, + # the state is not transformed to 'Established' or + # 'Not Established', it is still 'S0' and others # So transform here - #new_state_column = dataset.state.apply(self.get_final_state_from_flags(dataset.state, dataset.pkts)) - dataset['state'] = dataset.apply(lambda row: self.get_final_state_from_flags(row['state'], row['pkts']), axis=1) - - #dataset.state = new_state_column + dataset["state"] = dataset.apply( + lambda row: get_final_state_from_flags( + row["state"], row["pkts"] + ), + axis=1, + ) + # dataset.state = new_state_column # Convert state to categorical dataset.state = dataset.state.str.replace( @@ -370,7 +242,7 @@ def process_flows(self): try: # We get all the flows so far # because this retraining happens in batches - flows = self.db.get_all_flows() + flows: list = self.db.get_all_flows() # Check how many different labels are in the DB # We need both normal and malware @@ -464,7 +336,7 @@ def detect(self): """ try: # Store the real label if there is one - y_flow = self.flow["label"] + # y_flow = self.flow["label"] # remove the real label column self.flow = self.flow.drop("label", axis=1) # remove the label predictions column of the other modules @@ -568,13 +440,10 @@ def pre_main(self): def main(self): if msg := self.get_msg("new_flow"): data = msg["data"] - # Convert from json to dict data = json.loads(data) - profileid = data["profileid"] + # profileid = data["profileid"] twid = data["twid"] - # Get flow that is now in json format flow = data["flow"] - # Convert flow to a dict flow = json.loads(flow) # Convert the common fields to something that can # be interpreted diff --git a/slips_files/core/database/database_manager.py b/slips_files/core/database/database_manager.py index fe7b02d04..f1ef1290c 100644 --- a/slips_files/core/database/database_manager.py +++ b/slips_files/core/database/database_manager.py @@ -569,9 +569,6 @@ def add_out_dns(self, *args, **kwargs): def add_port(self, *args, **kwargs): return self.rdb.add_port(*args, **kwargs) - def get_final_state_from_flags(self, *args, **kwargs): - return self.rdb.get_final_state_from_flags(*args, **kwargs) - def add_ips(self, *args, **kwargs): return self.rdb.add_ips(*args, **kwargs) diff --git a/slips_files/core/database/redis_db/profile_handler.py b/slips_files/core/database/redis_db/profile_handler.py index d785b51c9..23c23d3d4 100644 --- a/slips_files/core/database/redis_db/profile_handler.py +++ b/slips_files/core/database/redis_db/profile_handler.py @@ -14,7 +14,7 @@ import redis import validators - +from slips_files.common.state_handler import get_final_state_from_flags from slips_files.common.abstracts.observer import IObservable from slips_files.core.output import Output @@ -324,14 +324,15 @@ def add_port( state_hist = flow.state_hist if hasattr(flow, "state_hist") else "" if "^" in state_hist: - # The majority of the FP 
with horizontal port scan detection happen because a - # benign computer changes wifi, and many not established conns are redone, - # which look like a port scan to 10 webpages. To avoid this, we IGNORE all - # the flows that have in the history of flags (field history in zeek), the ^, + # The majority of the FP with horizontal port scan detection + # happen because a benign computer changes wifi, and many not + # established conns are redone, which look like a port scan to + # 10 webpages. To avoid this, we IGNORE all the flows that have + # in the history of flags (field history in zeek), the ^, # that means that the flow was swapped/flipped. - # The below key_name is only used by the portscan module to check for horizontal - # portscan, which means we can safely ignore it here and it won't affect the rest - # of slips + # The below key_name is only used by the portscan module to + # check for horizontal portscan, which means we can safely + # ignore it here and it won't affect the rest of slips return False # Choose which port to use based if we were asked Dst or Src @@ -342,10 +343,10 @@ def add_port( ip_key = "srcips" if role == "Server" else "dstips" # Get the state. Established, NotEstablished - summaryState = self.get_final_state_from_flags(state, pkts) + state = get_final_state_from_flags(state, pkts) old_profileid_twid_data = self.get_data_from_profile_tw( - profileid, twid, port_type, summaryState, proto, role, "Ports" + profileid, twid, port_type, state, proto, role, "Ports" ) try: @@ -355,7 +356,8 @@ def add_port( port_data["totalpkt"] += pkts port_data["totalbytes"] += totbytes - # if there's a conn from this ip on this port, update the pkts of this conn + # if there's a conn from this ip on this port, update the pkts + # of this conn if ip in port_data[ip_key]: port_data[ip_key][ip]["pkts"] += pkts port_data[ip_key][ip]["spkts"] += spkts @@ -386,145 +388,10 @@ def add_port( old_profileid_twid_data[port] = port_data data = json.dumps(old_profileid_twid_data) hash_key = f"{profileid}{self.separator}{twid}" - key_name = f"{port_type}Ports{role}{proto}{summaryState}" + key_name = f"{port_type}Ports{role}{proto}{state}" self.r.hset(hash_key, key_name, str(data)) self.mark_profile_tw_as_modified(profileid, twid, starttime) - def get_final_state_from_flags(self, state, pkts): - """ - Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags - We receive the pakets to distinguish some Reset connections - """ - try: - pre = state.split("_")[0] - try: - # Try suricata states - """ - There are different states in which a flow can be. - Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, - these are: New, Established and Closed,for UDP only new and established. - For each of these states Suricata can employ different timeouts. - """ - if "new" in state or "established" in state: - return "Established" - elif "closed" in state: - return "Not Established" - - # We have varius type of states depending on the type of flow. 
- # For Zeek - if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): - return "Not Established" - elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): - return "Established" - - # For Argus - suf = state.split("_")[1] - if "S" in pre and "A" in pre and "S" in suf and "A" in suf: - """ - Examples: - SA_SA - SR_SA - FSRA_SA - SPA_SPA - SRA_SPA - FSA_FSA - FSA_FSPA - SAEC_SPA - SRPA_SPA - FSPA_SPA - FSRPA_SPA - FSPA_FSPA - FSRA_FSPA - SRAEC_SPA - FSPA_FSRPA - FSAEC_FSPA - FSRPA_FSPA - SRPAEC_SPA - FSPAEC_FSPA - SRPAEC_FSRPA - """ - return "Established" - elif "PA" in pre and "PA" in suf: - # Tipical flow that was reported in the middle - """ - Examples: - PA_PA - FPA_FPA - """ - return "Established" - elif "ECO" in pre: - return "ICMP Echo" - elif "ECR" in pre: - return "ICMP Reply" - elif "URH" in pre: - return "ICMP Host Unreachable" - elif "URP" in pre: - return "ICMP Port Unreachable" - else: - """ - Examples: - S_RA - S_R - A_R - S_SA - SR_SA - FA_FA - SR_RA - SEC_RA - """ - return "Not Established" - except IndexError: - # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP - if "ECO" in pre: - # ICMP - return "Established" - elif "UNK" in pre: - # ICMP6 unknown upper layer - return "Established" - elif "CON" in pre: - # UDP - return "Established" - elif "INT" in pre: - # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there - # is no confirmation of what happened. - return "Not Established" - elif "EST" in pre: - # TCP - return "Established" - elif "RST" in pre: - # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established - # It can happen that is reseted being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. - return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - elif "FIN" in pre: - # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established - # It can happen that is finished being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. - return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - else: - """ - Examples: - S_ - FA_ - PA_ - FSA_ - SEC_ - SRPA_ - """ - return "Not Established" - except Exception: - exception_line = sys.exc_info()[2].tb_lineno - self.print( - f"Error in getFinalStateFromFlags() in database.py line {exception_line}", - 0, - 1, - ) - self.print(traceback.format_exc(), 0, 1) - def get_data_from_profile_tw( self, profileid: str, @@ -722,14 +589,14 @@ def add_ips(self, profileid, twid, flow, role): self.update_times_contacted(ip, direction, profileid, twid) # Get the state. 
Established, NotEstablished - summaryState = self.get_final_state_from_flags(flow.state, flow.pkts) - key_name = f"{direction}IPs{role}{flow.proto.upper()}{summaryState}" + state = get_final_state_from_flags(flow.state, flow.pkts) + key_name = f"{direction}IPs{role}{flow.proto.upper()}{state}" # Get the previous data about this key old_profileid_twid_data = self.get_data_from_profile_tw( profileid, twid, direction, - summaryState, + state, flow.proto, role, "IPs", @@ -806,7 +673,7 @@ def add_flow( The profileid is the main profile that this flow is related too. : param new_profile_added : is set to True for everytime we see a new srcaddr """ - summary_state = self.get_final_state_from_flags(flow.state, flow.pkts) + summary_state = get_final_state_from_flags(flow.state, flow.pkts) flow_dict = { "ts": flow.starttime, "dur": flow.dur, diff --git a/slips_files/core/database/sqlite_db/database.py b/slips_files/core/database/sqlite_db/database.py index 4792ea67c..4dd52dbfc 100644 --- a/slips_files/core/database/sqlite_db/database.py +++ b/slips_files/core/database/sqlite_db/database.py @@ -31,11 +31,13 @@ def connect(self): """ db_newly_created = False if not os.path.exists(self._flows_db): - # db not created, mark it as first time accessing it so we can init tables once we connect + # db not created, mark it as first time accessing it so we can + # init tables once we connect db_newly_created = True self._init_db() - # you can get multithreaded access on a single pysqlite connection by passing "check_same_thread=False" + # you can get multithreaded access on a single pysqlite connection + # by passing "check_same_thread=False" self.conn = sqlite3.connect( self._flows_db, check_same_thread=False, timeout=20 ) From f0eb12f0053b15d98e426a5459374d82f2919807 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Mon, 29 Jul 2024 16:36:55 +0200 Subject: [PATCH 005/498] mlflow. Ignore UID column --- modules/flowmldetection/flowmldetection.py | 1 + 1 file changed, 1 insertion(+) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index c57a7a358..e2aa1e0ee 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -141,6 +141,7 @@ def process_features(self, dataset): "dpkts", "smac", "dmac", + "uid", ] for field in to_drop: try: From 6bc8351cf891c12bf16f4d298a8f3f50c0506850 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Mon, 29 Jul 2024 16:32:14 +0200 Subject: [PATCH 006/498] mlflow. Add a function to convert the state again --- modules/flowmldetection/flowmldetection.py | 154 ++++++++++++++++++++- 1 file changed, 150 insertions(+), 4 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 9e0aa772c..c8226368c 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -11,6 +11,7 @@ import datetime import traceback import warnings +import sys from slips_files.common.parsers.config_parser import ConfigParser from slips_files.common.slips_utils import utils @@ -121,6 +122,141 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) + + def get_final_state_from_flags(self, state, pkts): + """ + Analyze the flags given and return a summary of the state. 
Should work with Argus and Bro flags + We receive the pakets to distinguish some Reset connections + """ + try: + pre = state.split("_")[0] + try: + # Try suricata states + """ + There are different states in which a flow can be. + Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, + these are: New, Established and Closed,for UDP only new and established. + For each of these states Suricata can employ different timeouts. + """ + if "new" in state or "established" in state: + return "Established" + elif "closed" in state: + return "Not Established" + + # We have varius type of states depending on the type of flow. + # For Zeek + if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): + return "Not Established" + elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): + return "Established" + + # For Argus + suf = state.split("_")[1] + if "S" in pre and "A" in pre and "S" in suf and "A" in suf: + """ + Examples: + SA_SA + SR_SA + FSRA_SA + SPA_SPA + SRA_SPA + FSA_FSA + FSA_FSPA + SAEC_SPA + SRPA_SPA + FSPA_SPA + FSRPA_SPA + FSPA_FSPA + FSRA_FSPA + SRAEC_SPA + FSPA_FSRPA + FSAEC_FSPA + FSRPA_FSPA + SRPAEC_SPA + FSPAEC_FSPA + SRPAEC_FSRPA + """ + return "Established" + elif "PA" in pre and "PA" in suf: + # Tipical flow that was reported in the middle + """ + Examples: + PA_PA + FPA_FPA + """ + return "Established" + elif "ECO" in pre: + return "ICMP Echo" + elif "ECR" in pre: + return "ICMP Reply" + elif "URH" in pre: + return "ICMP Host Unreachable" + elif "URP" in pre: + return "ICMP Port Unreachable" + else: + """ + Examples: + S_RA + S_R + A_R + S_SA + SR_SA + FA_FA + SR_RA + SEC_RA + """ + return "Not Established" + except IndexError: + # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP + if "ECO" in pre: + # ICMP + return "Established" + elif "UNK" in pre: + # ICMP6 unknown upper layer + return "Established" + elif "CON" in pre: + # UDP + return "Established" + elif "INT" in pre: + # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there + # is no confirmation of what happened. + return "Not Established" + elif "EST" in pre: + # TCP + return "Established" + elif "RST" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established + # It can happen that is reseted being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. + return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + elif "FIN" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established + # It can happen that is finished being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. 
+ return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + else: + """ + Examples: + S_ + FA_ + PA_ + FSA_ + SEC_ + SRPA_ + """ + return "Not Established" + except Exception: + exception_line = sys.exc_info()[2].tb_lineno + self.print( + f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", + 0, + 1, + ) + self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ @@ -133,7 +269,7 @@ def process_features(self, dataset): for proto in to_discard: dataset = dataset[dataset.proto != proto] - # For now, discard the ports + # For now, discard these to_drop = [ "appproto", "daddr", @@ -156,15 +292,25 @@ def process_features(self, dataset): except (ValueError, KeyError): pass + # When flows are read from Slips sqlite, the state is not transformed to 'Established' or 'Not Established', it is still 'S0' and others + # So transform here + #new_state_column = dataset.state.apply(self.get_final_state_from_flags(dataset.state, dataset.pkts)) + dataset['state'] = dataset.apply(lambda row: self.get_final_state_from_flags(row['state'], row['pkts']), axis=1) + + #dataset.state = new_state_column + # Convert state to categorical dataset.state = dataset.state.str.replace( - r"(^.*NotEstablished.*$)", "0", regex=True + r"(^.*Not Established.*$)", "0", regex=True ) dataset.state = dataset.state.str.replace( r"(^.*Established.*$)", "1", regex=True ) - # Convert proto to categorical. For now we only have few states, - # so we can hardcode... + + # Convert categories to floats + dataset.state = dataset.state.astype("float64") + + # Convert proto to categorical. For now we only have few states, so we can hardcode... # We dont use the data to create categories because in testing mode # we dont see all the protocols # Also we dont store the Categorizer because the user can retrain From 5489ab209a6c96f03f8afd73c7ce7f31a78382f2 Mon Sep 17 00:00:00 2001 From: alya Date: Tue, 30 Jul 2024 14:35:21 +0300 Subject: [PATCH 007/498] state_handler: split get_final_state_from_flags() into smaller functions --- slips_files/common/state_handler.py | 179 ++++++++++++++++++++++++++++ 1 file changed, 179 insertions(+) create mode 100644 slips_files/common/state_handler.py diff --git a/slips_files/common/state_handler.py b/slips_files/common/state_handler.py new file mode 100644 index 000000000..b671a09a2 --- /dev/null +++ b/slips_files/common/state_handler.py @@ -0,0 +1,179 @@ +from typing import Optional +import sys +import traceback + + +def check_suricata_states(state) -> Optional[str]: + """ + There are different states in which a flow can be. + Suricata distinguishes three flow-states for TCP and two for + UDP. For TCP, + these are: New, Established and Closed,for UDP only new and + established. + For each of these states Suricata can employ different timeouts. + """ + if "new" in state or "established" in state: + return "Established" + elif "closed" in state: + return "Not Established" + + +def check_zeek_states(state) -> Optional[str]: + # We have varius type of states depending on the type of flow. 
+ # For Zeek + if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): + return "Not Established" + elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): + return "Established" + + +def check_argus_states(state) -> Optional[str]: + pre = state.split("_")[0] + suf = state.split("_")[1] + if "S" in pre and "A" in pre and "S" in suf and "A" in suf: + """ + Examples: + SA_SA + SR_SA + FSRA_SA + SPA_SPA + SRA_SPA + FSA_FSA + FSA_FSPA + SAEC_SPA + SRPA_SPA + FSPA_SPA + FSRPA_SPA + FSPA_FSPA + FSRA_FSPA + SRAEC_SPA + FSPA_FSRPA + FSAEC_FSPA + FSRPA_FSPA + SRPAEC_SPA + FSPAEC_FSPA + SRPAEC_FSRPA + """ + return "Established" + elif "PA" in pre and "PA" in suf: + # Tipical flow that was reported in the middle + """ + Examples: + PA_PA + FPA_FPA + """ + return "Established" + elif "ECO" in pre: + return "ICMP Echo" + elif "ECR" in pre: + return "ICMP Reply" + elif "URH" in pre: + return "ICMP Host Unreachable" + elif "URP" in pre: + return "ICMP Port Unreachable" + else: + """ + Examples: + S_RA + S_R + A_R + S_SA + SR_SA + FA_FA + SR_RA + SEC_RA + """ + return "Not Established" + + +def check_tcp_states(state, pkts) -> Optional[str]: + pre = state.split("_")[0] + if "EST" in pre: + # TCP + return "Established" + elif "RST" in pre: + # TCP. When -z B is not used in argus, states are single words. + # Most connections are reseted when finished and therefore are + # established + # It can happen that is reseted being not established, but we + # can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is + # not established because the OS retries 3 times. + return "Not Established" if int(pkts) <= 3 else "Established" + elif "FIN" in pre: + # TCP. When -z B is not used in argus, states are single words. + # Most connections are finished with FIN when finished and + # therefore are established + # It can happen that is finished being not established, but we + # can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is + # not established because the OS retries 3 times. + return "Not Established" if int(pkts) <= 3 else "Established" + else: + """ + Examples: + S_ + FA_ + PA_ + FSA_ + SEC_ + SRPA_ + """ + return "Not Established" + + +def check_udp_states(state) -> Optional[str]: + pre = state.split("_")[0] + if "CON" in pre: + # UDP + return "Established" + elif "INT" in pre: + # UDP trying to connect, NOT preciselly not established but also + # NOT 'Established'. So we considered not established because there + # is no confirmation of what happened. + return "Not Established" + + +def check_icmp_states(state) -> Optional[str]: + pre = state.split("_")[0] + if "ECO" in pre: + # ICMP + return "Established" + elif "UNK" in pre: + # ICMP6 unknown upper layer + return "Established" + + +def get_final_state_from_flags(self, state, pkts) -> str: + """ + Analyze the flags given and return a summary of the state. 
+ Should work with Argus and Bro flags + We receive the pakets to distinguish some Reset connections + """ + try: + if state := check_suricata_states(state): + return state + if state := check_zeek_states(state): + return state + if state := check_argus_states(state): + return state + except IndexError: + # suf does not exist, which means that this is some ICMP or + # no response was sent for UDP or TCP + if state := check_icmp_states(state): + return state + if state := check_udp_states(state): + return state + if state := check_tcp_states(state, pkts): + return state + + return "Not Established" + + except Exception: + exception_line = sys.exc_info()[2].tb_lineno + self.print( + f"Error in get_final_state_from_flags() " f"line {exception_line}", + 0, + 1, + ) + self.print(traceback.format_exc(), 0, 1) From 3776649a496c4b2b40962752b19f961e047f21bd Mon Sep 17 00:00:00 2001 From: alya Date: Tue, 30 Jul 2024 14:48:16 +0300 Subject: [PATCH 008/498] state_handler: refactor get_final_state_from_flags() --- slips_files/common/state_handler.py | 67 +++++++++++++---------------- 1 file changed, 29 insertions(+), 38 deletions(-) diff --git a/slips_files/common/state_handler.py b/slips_files/common/state_handler.py index b671a09a2..d0a05115b 100644 --- a/slips_files/common/state_handler.py +++ b/slips_files/common/state_handler.py @@ -1,9 +1,7 @@ from typing import Optional -import sys -import traceback -def check_suricata_states(state) -> Optional[str]: +def interpret_suricata_states(state) -> Optional[str]: """ There are different states in which a flow can be. Suricata distinguishes three flow-states for TCP and two for @@ -18,7 +16,7 @@ def check_suricata_states(state) -> Optional[str]: return "Not Established" -def check_zeek_states(state) -> Optional[str]: +def interpret_zeek_states(state) -> Optional[str]: # We have varius type of states depending on the type of flow. # For Zeek if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): @@ -27,9 +25,13 @@ def check_zeek_states(state) -> Optional[str]: return "Established" -def check_argus_states(state) -> Optional[str]: +def interpret_argus_states(state) -> Optional[str]: pre = state.split("_")[0] - suf = state.split("_")[1] + try: + suf = state.split("_")[1] + except IndexError: + return + if "S" in pre and "A" in pre and "S" in suf and "A" in suf: """ Examples: @@ -86,7 +88,7 @@ def check_argus_states(state) -> Optional[str]: return "Not Established" -def check_tcp_states(state, pkts) -> Optional[str]: +def interpret_tcp_states(state, pkts) -> Optional[str]: pre = state.split("_")[0] if "EST" in pre: # TCP @@ -122,7 +124,7 @@ def check_tcp_states(state, pkts) -> Optional[str]: return "Not Established" -def check_udp_states(state) -> Optional[str]: +def interpret_udp_states(state) -> Optional[str]: pre = state.split("_")[0] if "CON" in pre: # UDP @@ -134,7 +136,7 @@ def check_udp_states(state) -> Optional[str]: return "Not Established" -def check_icmp_states(state) -> Optional[str]: +def interpret_icmp_states(state) -> Optional[str]: pre = state.split("_")[0] if "ECO" in pre: # ICMP @@ -144,36 +146,25 @@ def check_icmp_states(state) -> Optional[str]: return "Established" -def get_final_state_from_flags(self, state, pkts) -> str: +def get_final_state_from_flags(state, pkts) -> str: """ - Analyze the flags given and return a summary of the state. 
- Should work with Argus and Bro flags - We receive the pakets to distinguish some Reset connections + Converts the original flags from the flow, to a state that slips + understands + Works with Argus, suricata, and Bro flags + We receive the packets to distinguish some Reset connections """ - try: - if state := check_suricata_states(state): - return state - if state := check_zeek_states(state): - return state - if state := check_argus_states(state): - return state - except IndexError: - # suf does not exist, which means that this is some ICMP or - # no response was sent for UDP or TCP - if state := check_icmp_states(state): - return state - if state := check_udp_states(state): - return state - if state := check_tcp_states(state, pkts): - return state - return "Not Established" + for interpreter in ( + interpret_suricata_states, + interpret_zeek_states, + interpret_argus_states, + interpret_icmp_states, + interpret_udp_states, + ): + if interpreted_state := interpreter(state): + return interpreted_state + + if interpreted_state := interpret_tcp_states(state, pkts): + return interpreted_state - except Exception: - exception_line = sys.exc_info()[2].tb_lineno - self.print( - f"Error in get_final_state_from_flags() " f"line {exception_line}", - 0, - 1, - ) - self.print(traceback.format_exc(), 0, 1) + return "Not Established" From f75e88b9a312b22ca6b14af438bd43a0a428a36c Mon Sep 17 00:00:00 2001 From: alya Date: Tue, 30 Jul 2024 14:59:22 +0300 Subject: [PATCH 009/498] delete get_final_state_from_flags() from flowmldetection, profiler, and the db --- modules/flowmldetection/flowmldetection.py | 150 ++---------------- slips_files/core/database/database_manager.py | 3 - 2 files changed, 10 insertions(+), 143 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index c8226368c..9af514a70 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -122,141 +122,6 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) - - def get_final_state_from_flags(self, state, pkts): - """ - Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags - We receive the pakets to distinguish some Reset connections - """ - try: - pre = state.split("_")[0] - try: - # Try suricata states - """ - There are different states in which a flow can be. - Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, - these are: New, Established and Closed,for UDP only new and established. - For each of these states Suricata can employ different timeouts. - """ - if "new" in state or "established" in state: - return "Established" - elif "closed" in state: - return "Not Established" - - # We have varius type of states depending on the type of flow. 
- # For Zeek - if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): - return "Not Established" - elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): - return "Established" - - # For Argus - suf = state.split("_")[1] - if "S" in pre and "A" in pre and "S" in suf and "A" in suf: - """ - Examples: - SA_SA - SR_SA - FSRA_SA - SPA_SPA - SRA_SPA - FSA_FSA - FSA_FSPA - SAEC_SPA - SRPA_SPA - FSPA_SPA - FSRPA_SPA - FSPA_FSPA - FSRA_FSPA - SRAEC_SPA - FSPA_FSRPA - FSAEC_FSPA - FSRPA_FSPA - SRPAEC_SPA - FSPAEC_FSPA - SRPAEC_FSRPA - """ - return "Established" - elif "PA" in pre and "PA" in suf: - # Tipical flow that was reported in the middle - """ - Examples: - PA_PA - FPA_FPA - """ - return "Established" - elif "ECO" in pre: - return "ICMP Echo" - elif "ECR" in pre: - return "ICMP Reply" - elif "URH" in pre: - return "ICMP Host Unreachable" - elif "URP" in pre: - return "ICMP Port Unreachable" - else: - """ - Examples: - S_RA - S_R - A_R - S_SA - SR_SA - FA_FA - SR_RA - SEC_RA - """ - return "Not Established" - except IndexError: - # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP - if "ECO" in pre: - # ICMP - return "Established" - elif "UNK" in pre: - # ICMP6 unknown upper layer - return "Established" - elif "CON" in pre: - # UDP - return "Established" - elif "INT" in pre: - # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there - # is no confirmation of what happened. - return "Not Established" - elif "EST" in pre: - # TCP - return "Established" - elif "RST" in pre: - # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established - # It can happen that is reseted being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. - return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - elif "FIN" in pre: - # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established - # It can happen that is finished being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. 
- return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - else: - """ - Examples: - S_ - FA_ - PA_ - FSA_ - SEC_ - SRPA_ - """ - return "Not Established" - except Exception: - exception_line = sys.exc_info()[2].tb_lineno - self.print( - f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", - 0, - 1, - ) - self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ @@ -292,12 +157,17 @@ def process_features(self, dataset): except (ValueError, KeyError): pass - # When flows are read from Slips sqlite, the state is not transformed to 'Established' or 'Not Established', it is still 'S0' and others + # When flows are read from Slips sqlite, + # the state is not transformed to 'Established' or + # 'Not Established', it is still 'S0' and others # So transform here - #new_state_column = dataset.state.apply(self.get_final_state_from_flags(dataset.state, dataset.pkts)) - dataset['state'] = dataset.apply(lambda row: self.get_final_state_from_flags(row['state'], row['pkts']), axis=1) - - #dataset.state = new_state_column + dataset["state"] = dataset.apply( + lambda row: get_final_state_from_flags( + row["state"], row["pkts"] + ), + axis=1, + ) + # dataset.state = new_state_column # Convert state to categorical dataset.state = dataset.state.str.replace( diff --git a/slips_files/core/database/database_manager.py b/slips_files/core/database/database_manager.py index 98651fd08..2f81ecd8e 100644 --- a/slips_files/core/database/database_manager.py +++ b/slips_files/core/database/database_manager.py @@ -607,9 +607,6 @@ def add_out_dns(self, *args, **kwargs): def add_port(self, *args, **kwargs): return self.rdb.add_port(*args, **kwargs) - def get_final_state_from_flags(self, *args, **kwargs): - return self.rdb.get_final_state_from_flags(*args, **kwargs) - def add_ips(self, *args, **kwargs): return self.rdb.add_ips(*args, **kwargs) From 47193d79912875918ab9e5612b617b3c4ec42886 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Mon, 29 Jul 2024 16:32:14 +0200 Subject: [PATCH 010/498] mlflow. Add a function to convert the state again --- modules/flowmldetection/flowmldetection.py | 154 ++++++++++++++++++++- 1 file changed, 150 insertions(+), 4 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 9e0aa772c..c8226368c 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -11,6 +11,7 @@ import datetime import traceback import warnings +import sys from slips_files.common.parsers.config_parser import ConfigParser from slips_files.common.slips_utils import utils @@ -121,6 +122,141 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) + + def get_final_state_from_flags(self, state, pkts): + """ + Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags + We receive the pakets to distinguish some Reset connections + """ + try: + pre = state.split("_")[0] + try: + # Try suricata states + """ + There are different states in which a flow can be. + Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, + these are: New, Established and Closed,for UDP only new and established. + For each of these states Suricata can employ different timeouts. 
+ """ + if "new" in state or "established" in state: + return "Established" + elif "closed" in state: + return "Not Established" + + # We have varius type of states depending on the type of flow. + # For Zeek + if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): + return "Not Established" + elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): + return "Established" + + # For Argus + suf = state.split("_")[1] + if "S" in pre and "A" in pre and "S" in suf and "A" in suf: + """ + Examples: + SA_SA + SR_SA + FSRA_SA + SPA_SPA + SRA_SPA + FSA_FSA + FSA_FSPA + SAEC_SPA + SRPA_SPA + FSPA_SPA + FSRPA_SPA + FSPA_FSPA + FSRA_FSPA + SRAEC_SPA + FSPA_FSRPA + FSAEC_FSPA + FSRPA_FSPA + SRPAEC_SPA + FSPAEC_FSPA + SRPAEC_FSRPA + """ + return "Established" + elif "PA" in pre and "PA" in suf: + # Tipical flow that was reported in the middle + """ + Examples: + PA_PA + FPA_FPA + """ + return "Established" + elif "ECO" in pre: + return "ICMP Echo" + elif "ECR" in pre: + return "ICMP Reply" + elif "URH" in pre: + return "ICMP Host Unreachable" + elif "URP" in pre: + return "ICMP Port Unreachable" + else: + """ + Examples: + S_RA + S_R + A_R + S_SA + SR_SA + FA_FA + SR_RA + SEC_RA + """ + return "Not Established" + except IndexError: + # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP + if "ECO" in pre: + # ICMP + return "Established" + elif "UNK" in pre: + # ICMP6 unknown upper layer + return "Established" + elif "CON" in pre: + # UDP + return "Established" + elif "INT" in pre: + # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there + # is no confirmation of what happened. + return "Not Established" + elif "EST" in pre: + # TCP + return "Established" + elif "RST" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established + # It can happen that is reseted being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. + return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + elif "FIN" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established + # It can happen that is finished being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. 
+ return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + else: + """ + Examples: + S_ + FA_ + PA_ + FSA_ + SEC_ + SRPA_ + """ + return "Not Established" + except Exception: + exception_line = sys.exc_info()[2].tb_lineno + self.print( + f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", + 0, + 1, + ) + self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ @@ -133,7 +269,7 @@ def process_features(self, dataset): for proto in to_discard: dataset = dataset[dataset.proto != proto] - # For now, discard the ports + # For now, discard these to_drop = [ "appproto", "daddr", @@ -156,15 +292,25 @@ def process_features(self, dataset): except (ValueError, KeyError): pass + # When flows are read from Slips sqlite, the state is not transformed to 'Established' or 'Not Established', it is still 'S0' and others + # So transform here + #new_state_column = dataset.state.apply(self.get_final_state_from_flags(dataset.state, dataset.pkts)) + dataset['state'] = dataset.apply(lambda row: self.get_final_state_from_flags(row['state'], row['pkts']), axis=1) + + #dataset.state = new_state_column + # Convert state to categorical dataset.state = dataset.state.str.replace( - r"(^.*NotEstablished.*$)", "0", regex=True + r"(^.*Not Established.*$)", "0", regex=True ) dataset.state = dataset.state.str.replace( r"(^.*Established.*$)", "1", regex=True ) - # Convert proto to categorical. For now we only have few states, - # so we can hardcode... + + # Convert categories to floats + dataset.state = dataset.state.astype("float64") + + # Convert proto to categorical. For now we only have few states, so we can hardcode... # We dont use the data to create categories because in testing mode # we dont see all the protocols # Also we dont store the Categorizer because the user can retrain From 0de55cb022a1c84cc642febf6383e9d314510a23 Mon Sep 17 00:00:00 2001 From: alya Date: Tue, 30 Jul 2024 14:35:21 +0300 Subject: [PATCH 011/498] state_handler: split get_final_state_from_flags() into smaller functions --- slips_files/common/state_handler.py | 179 ++++++++++++++++++++++++++++ 1 file changed, 179 insertions(+) create mode 100644 slips_files/common/state_handler.py diff --git a/slips_files/common/state_handler.py b/slips_files/common/state_handler.py new file mode 100644 index 000000000..b671a09a2 --- /dev/null +++ b/slips_files/common/state_handler.py @@ -0,0 +1,179 @@ +from typing import Optional +import sys +import traceback + + +def check_suricata_states(state) -> Optional[str]: + """ + There are different states in which a flow can be. + Suricata distinguishes three flow-states for TCP and two for + UDP. For TCP, + these are: New, Established and Closed,for UDP only new and + established. + For each of these states Suricata can employ different timeouts. + """ + if "new" in state or "established" in state: + return "Established" + elif "closed" in state: + return "Not Established" + + +def check_zeek_states(state) -> Optional[str]: + # We have varius type of states depending on the type of flow. 
From 0de55cb022a1c84cc642febf6383e9d314510a23 Mon Sep 17 00:00:00 2001
From: alya
Date: Tue, 30 Jul 2024 14:35:21 +0300
Subject: [PATCH 011/498] state_handler: split get_final_state_from_flags()
 into smaller functions

---
 slips_files/common/state_handler.py | 179 ++++++++++++++++++++++
 1 file changed, 179 insertions(+)
 create mode 100644 slips_files/common/state_handler.py

diff --git a/slips_files/common/state_handler.py b/slips_files/common/state_handler.py
new file mode 100644
index 000000000..b671a09a2
--- /dev/null
+++ b/slips_files/common/state_handler.py
@@ -0,0 +1,179 @@
+from typing import Optional
+import sys
+import traceback
+
+
+def check_suricata_states(state) -> Optional[str]:
+    """
+    There are different states in which a flow can be.
+    Suricata distinguishes three flow-states for TCP and two for
+    UDP. For TCP,
+    these are: New, Established and Closed; for UDP, only new and
+    established.
+    For each of these states Suricata can employ different timeouts.
+    """
+    if "new" in state or "established" in state:
+        return "Established"
+    elif "closed" in state:
+        return "Not Established"
+
+
+def check_zeek_states(state) -> Optional[str]:
+    # We have various types of states depending on the type of flow.
+    # For Zeek
+    if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
+        return "Not Established"
+    elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"):
+        return "Established"
+
+
+def check_argus_states(state) -> Optional[str]:
+    pre = state.split("_")[0]
+    suf = state.split("_")[1]
+    if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
+        """
+        Examples:
+        SA_SA
+        SR_SA
+        FSRA_SA
+        SPA_SPA
+        SRA_SPA
+        FSA_FSA
+        FSA_FSPA
+        SAEC_SPA
+        SRPA_SPA
+        FSPA_SPA
+        FSRPA_SPA
+        FSPA_FSPA
+        FSRA_FSPA
+        SRAEC_SPA
+        FSPA_FSRPA
+        FSAEC_FSPA
+        FSRPA_FSPA
+        SRPAEC_SPA
+        FSPAEC_FSPA
+        SRPAEC_FSRPA
+        """
+        return "Established"
+    elif "PA" in pre and "PA" in suf:
+        # Typical flow that was reported in the middle
+        """
+        Examples:
+        PA_PA
+        FPA_FPA
+        """
+        return "Established"
+    elif "ECO" in pre:
+        return "ICMP Echo"
+    elif "ECR" in pre:
+        return "ICMP Reply"
+    elif "URH" in pre:
+        return "ICMP Host Unreachable"
+    elif "URP" in pre:
+        return "ICMP Port Unreachable"
+    else:
+        """
+        Examples:
+        S_RA
+        S_R
+        A_R
+        S_SA
+        SR_SA
+        FA_FA
+        SR_RA
+        SEC_RA
+        """
+        return "Not Established"
+
+
+def check_tcp_states(state, pkts) -> Optional[str]:
+    pre = state.split("_")[0]
+    if "EST" in pre:
+        # TCP
+        return "Established"
+    elif "RST" in pre:
+        # TCP. When -z B is not used in argus, states are single words.
+        # Most connections are reset when finished and are therefore
+        # established
+        # It can happen that a connection is reset without being
+        # established, but we can't tell without -z b.
+        # So we use as heuristic the amount of packets. If <=3, then it is
+        # not established because the OS retries 3 times.
+        return "Not Established" if int(pkts) <= 3 else "Established"
+    elif "FIN" in pre:
+        # TCP. When -z B is not used in argus, states are single words.
+        # Most connections are closed with FIN when finished and are
+        # therefore established
+        # It can happen that a connection is finished without being
+        # established, but we can't tell without -z b.
+        # So we use as heuristic the amount of packets. If <=3, then it is
+        # not established because the OS retries 3 times.
+        return "Not Established" if int(pkts) <= 3 else "Established"
+    else:
+        """
+        Examples:
+        S_
+        FA_
+        PA_
+        FSA_
+        SEC_
+        SRPA_
+        """
+        return "Not Established"
+
+
+def check_udp_states(state) -> Optional[str]:
+    pre = state.split("_")[0]
+    if "CON" in pre:
+        # UDP
+        return "Established"
+    elif "INT" in pre:
+        # UDP trying to connect, not precisely not established but also
+        # NOT 'Established'. So we consider it not established because there
+        # is no confirmation of what happened.
+        return "Not Established"
+
+
+def check_icmp_states(state) -> Optional[str]:
+    pre = state.split("_")[0]
+    if "ECO" in pre:
+        # ICMP
+        return "Established"
+    elif "UNK" in pre:
+        # ICMP6 unknown upper layer
+        return "Established"
+
+
+def get_final_state_from_flags(self, state, pkts) -> str:
+    """
+    Analyze the flags given and return a summary of the state.
+    Should work with Argus and Bro flags
+    We receive the packets to distinguish some Reset connections
+    """
+    try:
+        if state := check_suricata_states(state):
+            return state
+        if state := check_zeek_states(state):
+            return state
+        if state := check_argus_states(state):
+            return state
+    except IndexError:
+        # suf does not exist, which means that this is some ICMP or
+        # no response was sent for UDP or TCP
+        if state := check_icmp_states(state):
+            return state
+        if state := check_udp_states(state):
+            return state
+        if state := check_tcp_states(state, pkts):
+            return state
+
+        return "Not Established"
+
+    except Exception:
+        exception_line = sys.exc_info()[2].tb_lineno
+        self.print(
+            f"Error in get_final_state_from_flags() " f"line {exception_line}",
+            0,
+            1,
+        )
+        self.print(traceback.format_exc(), 0, 1)

From cfa52224d7aee90e9ce0cf5e68625360564b3181 Mon Sep 17 00:00:00 2001
From: alya
Date: Tue, 30 Jul 2024 14:48:16 +0300
Subject: [PATCH 012/498] state_handler: refactor get_final_state_from_flags()

---
 slips_files/common/state_handler.py | 67 +++++++++++++----------------
 1 file changed, 29 insertions(+), 38 deletions(-)

diff --git a/slips_files/common/state_handler.py b/slips_files/common/state_handler.py
index b671a09a2..d0a05115b 100644
--- a/slips_files/common/state_handler.py
+++ b/slips_files/common/state_handler.py
@@ -1,9 +1,7 @@
 from typing import Optional
-import sys
-import traceback
 
 
-def check_suricata_states(state) -> Optional[str]:
+def interpret_suricata_states(state) -> Optional[str]:
     """
     There are different states in which a flow can be.
     Suricata distinguishes three flow-states for TCP and two for
@@ -18,7 +16,7 @@ def check_suricata_states(state) -> Optional[str]:
     return "Not Established"
 
 
-def check_zeek_states(state) -> Optional[str]:
+def interpret_zeek_states(state) -> Optional[str]:
     # We have various types of states depending on the type of flow.
     # For Zeek
     if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
@@ -27,9 +25,13 @@ def check_zeek_states(state) -> Optional[str]:
         return "Established"
 
 
-def check_argus_states(state) -> Optional[str]:
+def interpret_argus_states(state) -> Optional[str]:
     pre = state.split("_")[0]
-    suf = state.split("_")[1]
+    try:
+        suf = state.split("_")[1]
+    except IndexError:
+        return
+
     if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
         """
         Examples:
@@ -86,7 +88,7 @@ def check_argus_states(state) -> Optional[str]:
         return "Not Established"
 
 
-def check_tcp_states(state, pkts) -> Optional[str]:
+def interpret_tcp_states(state, pkts) -> Optional[str]:
     pre = state.split("_")[0]
     if "EST" in pre:
         # TCP
@@ -122,7 +124,7 @@ def check_tcp_states(state, pkts) -> Optional[str]:
         return "Not Established"
 
 
-def check_udp_states(state) -> Optional[str]:
+def interpret_udp_states(state) -> Optional[str]:
     pre = state.split("_")[0]
     if "CON" in pre:
         # UDP
@@ -134,7 +136,7 @@ def check_udp_states(state) -> Optional[str]:
         return "Not Established"
 
 
-def check_icmp_states(state) -> Optional[str]:
+def interpret_icmp_states(state) -> Optional[str]:
     pre = state.split("_")[0]
     if "ECO" in pre:
         # ICMP
@@ -144,36 +146,25 @@ def check_icmp_states(state) -> Optional[str]:
         return "Established"
 
 
-def get_final_state_from_flags(self, state, pkts) -> str:
+def get_final_state_from_flags(state, pkts) -> str:
     """
-    Analyze the flags given and return a summary of the state.
-    Should work with Argus and Bro flags
-    We receive the packets to distinguish some Reset connections
+    Converts the original flags from the flow, to a state that slips
+    understands
+    Works with Argus, suricata, and Bro flags
+    We receive the packets to distinguish some Reset connections
     """
-    try:
-        if state := check_suricata_states(state):
-            return state
-        if state := check_zeek_states(state):
-            return state
-        if state := check_argus_states(state):
-            return state
-    except IndexError:
-        # suf does not exist, which means that this is some ICMP or
-        # no response was sent for UDP or TCP
-        if state := check_icmp_states(state):
-            return state
-        if state := check_udp_states(state):
-            return state
-        if state := check_tcp_states(state, pkts):
-            return state
-
-        return "Not Established"
-
-    except Exception:
-        exception_line = sys.exc_info()[2].tb_lineno
-        self.print(
-            f"Error in get_final_state_from_flags() " f"line {exception_line}",
-            0,
-            1,
-        )
-        self.print(traceback.format_exc(), 0, 1)
+    for interpreter in (
+        interpret_suricata_states,
+        interpret_zeek_states,
+        interpret_argus_states,
+        interpret_icmp_states,
+        interpret_udp_states,
+    ):
+        if interpreted_state := interpreter(state):
+            return interpreted_state
+
+    if interpreted_state := interpret_tcp_states(state, pkts):
+        return interpreted_state
+
+    return "Not Established"
From 242ab4633538e6632d8418cf5df33469d8dfc585 Mon Sep 17 00:00:00 2001
From: alya
Date: Tue, 30 Jul 2024 14:59:22 +0300
Subject: [PATCH 013/498] delete get_final_state_from_flags() from
 flowmldetection, profiler, and the db

---
 modules/flowmldetection/flowmldetection.py    | 150 ++----------------
 slips_files/core/database/database_manager.py |   3 -
 2 files changed, 10 insertions(+), 143 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 8917fef6a..fb17b57f2 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -121,141 +121,6 @@ def train(self):
         except Exception:
             self.print("Error in train()", 0, 1)
             self.print(traceback.format_exc(), 0, 1)
-
-    def get_final_state_from_flags(self, state, pkts):
-        """
-        Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags
-        We receive the packets to distinguish some Reset connections
-        """
-        try:
-            pre = state.split("_")[0]
-            try:
-                # Try suricata states
-                """
-                There are different states in which a flow can be.
-                Suricata distinguishes three flow-states for TCP and two for UDP. For TCP,
-                these are: New, Established and Closed; for UDP, only new and established.
-                For each of these states Suricata can employ different timeouts.
-                """
-                if "new" in state or "established" in state:
-                    return "Established"
-                elif "closed" in state:
-                    return "Not Established"
-
-                # We have various types of states depending on the type of flow.
-                # For Zeek
-                if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
-                    return "Not Established"
-                elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"):
-                    return "Established"
-
-                # For Argus
-                suf = state.split("_")[1]
-                if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
-                    """
-                    Examples:
-                    SA_SA
-                    SR_SA
-                    FSRA_SA
-                    SPA_SPA
-                    SRA_SPA
-                    FSA_FSA
-                    FSA_FSPA
-                    SAEC_SPA
-                    SRPA_SPA
-                    FSPA_SPA
-                    FSRPA_SPA
-                    FSPA_FSPA
-                    FSRA_FSPA
-                    SRAEC_SPA
-                    FSPA_FSRPA
-                    FSAEC_FSPA
-                    FSRPA_FSPA
-                    SRPAEC_SPA
-                    FSPAEC_FSPA
-                    SRPAEC_FSRPA
-                    """
-                    return "Established"
-                elif "PA" in pre and "PA" in suf:
-                    # Typical flow that was reported in the middle
-                    """
-                    Examples:
-                    PA_PA
-                    FPA_FPA
-                    """
-                    return "Established"
-                elif "ECO" in pre:
-                    return "ICMP Echo"
-                elif "ECR" in pre:
-                    return "ICMP Reply"
-                elif "URH" in pre:
-                    return "ICMP Host Unreachable"
-                elif "URP" in pre:
-                    return "ICMP Port Unreachable"
-                else:
-                    """
-                    Examples:
-                    S_RA
-                    S_R
-                    A_R
-                    S_SA
-                    SR_SA
-                    FA_FA
-                    SR_RA
-                    SEC_RA
-                    """
-                    return "Not Established"
-            except IndexError:
-                # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP
-                if "ECO" in pre:
-                    # ICMP
-                    return "Established"
-                elif "UNK" in pre:
-                    # ICMP6 unknown upper layer
-                    return "Established"
-                elif "CON" in pre:
-                    # UDP
-                    return "Established"
-                elif "INT" in pre:
-                    # UDP trying to connect, not precisely not established but also NOT 'Established'. So we consider it not established because there
-                    # is no confirmation of what happened.
-                    return "Not Established"
-                elif "EST" in pre:
-                    # TCP
-                    return "Established"
-                elif "RST" in pre:
-                    # TCP. When -z B is not used in argus, states are single words. Most connections are reset when finished and are therefore established
-                    # It can happen that a connection is reset without being established, but we can't tell without -z b.
-                    # So we use as heuristic the amount of packets: if <=3, then it is not established because the OS retries 3 times.
-                    return (
-                        "Not Established" if int(pkts) <= 3 else "Established"
-                    )
-                elif "FIN" in pre:
-                    # TCP. When -z B is not used in argus, states are single words. Most connections are closed with FIN when finished and are therefore established
-                    # It can happen that a connection is finished without being established, but we can't tell without -z b.
-                    # So we use as heuristic the amount of packets: if <=3, then it is not established because the OS retries 3 times.
-                    return (
-                        "Not Established" if int(pkts) <= 3 else "Established"
-                    )
-                else:
-                    """
-                    Examples:
-                    S_
-                    FA_
-                    PA_
-                    FSA_
-                    SEC_
-                    SRPA_
-                    """
-                    return "Not Established"
-        except Exception:
-            exception_line = sys.exc_info()[2].tb_lineno
-            self.print(
-                f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}",
-                0,
-                1,
-            )
-            self.print(traceback.format_exc(), 0, 1)
 
     def process_features(self, dataset):
         """
@@ -292,12 +157,17 @@ def process_features(self, dataset):
         except (ValueError, KeyError):
             pass
 
-            # When flows are read from Slips sqlite, the state is not transformed to 'Established' or 'Not Established', it is still 'S0' and others
-            # So transform here
-            #new_state_column = dataset.state.apply(self.get_final_state_from_flags(dataset.state, dataset.pkts))
-            dataset['state'] = dataset.apply(lambda row: self.get_final_state_from_flags(row['state'], row['pkts']), axis=1)
-
-            #dataset.state = new_state_column
+            # When flows are read from Slips sqlite,
+            # the state is not transformed to 'Established' or
+            # 'Not Established', it is still 'S0' and others
+            # So transform here
+            dataset["state"] = dataset.apply(
+                lambda row: get_final_state_from_flags(
+                    row["state"], row["pkts"]
+                ),
+                axis=1,
+            )
+            # dataset.state = new_state_column
 
             # Convert state to categorical
diff --git a/slips_files/core/database/database_manager.py b/slips_files/core/database/database_manager.py
index 9ecc16635..4de72c756 100644
--- a/slips_files/core/database/database_manager.py
+++ b/slips_files/core/database/database_manager.py
@@ -607,9 +604,6 @@ def add_out_dns(self, *args, **kwargs):
     def add_port(self, *args, **kwargs):
         return self.rdb.add_port(*args, **kwargs)
 
-    def get_final_state_from_flags(self, *args, **kwargs):
-        return self.rdb.get_final_state_from_flags(*args, **kwargs)
-
     def add_ips(self, *args, **kwargs):
         return self.rdb.add_ips(*args, **kwargs)
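The process_features() change above is the piece that makes training data and sqlite-replayed data agree on labels. A toy reproduction of that pipeline stage (the three-row frame is invented for illustration):

    import pandas as pd
    from slips_files.common.state_handler import get_final_state_from_flags

    df = pd.DataFrame({"state": ["S0", "CON", "FSPA_FSPA"], "pkts": [1, 5, 12]})
    # Normalize raw Zeek/Argus/Suricata states to the two labels.
    df["state"] = df.apply(
        lambda row: get_final_state_from_flags(row["state"], row["pkts"]),
        axis=1,
    )
    # 'Not Established' must be replaced before 'Established', since the
    # second pattern also matches every 'Not Established' row.
    df.state = df.state.str.replace(r"(^.*Not Established.*$)", "0", regex=True)
    df.state = df.state.str.replace(r"(^.*Established.*$)", "1", regex=True)
    df.state = df.state.astype("float64")
    print(df.state.tolist())  # [0.0, 1.0, 1.0]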
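One edge case the two regex substitutions do not cover: the Argus handling (whether in the monolithic method or in interpret_argus_states) can also return ICMP-specific labels such as 'ICMP Echo' or 'ICMP Port Unreachable'. Neither pattern matches those strings, so they would reach the astype("float64") cast unchanged and raise there; they stay harmless only if ICMP flows are filtered out beforehand, presumably by the to_discard protocol filter at the top of process_features(). A short demonstration with made-up values:

    import pandas as pd

    s = pd.Series(["Established", "Not Established", "ICMP Echo"])
    s = s.str.replace(r"(^.*Not Established.*$)", "0", regex=True)
    s = s.str.replace(r"(^.*Established.*$)", "1", regex=True)
    # ['1', '0', 'ICMP Echo'] -- the last value would make
    # s.astype("float64") raise a ValueError.
    print(s.tolist())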
When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established - # It can happen that is reseted being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. - return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - elif "FIN" in pre: - # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established - # It can happen that is finished being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. - return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - else: - """ - Examples: - S_ - FA_ - PA_ - FSA_ - SEC_ - SRPA_ - """ - return "Not Established" - except Exception: - exception_line = sys.exc_info()[2].tb_lineno - self.print( - f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", - 0, - 1, - ) - self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ From c0c5e537c723578ceae0ec4002b25d882d37ec36 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Mon, 29 Jul 2024 16:32:14 +0200 Subject: [PATCH 024/498] mlflow. Add a function to convert the state again --- modules/flowmldetection/flowmldetection.py | 135 +++++++++++++++++++++ 1 file changed, 135 insertions(+) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index fb17b57f2..12c3589ed 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -121,6 +121,141 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) + + def get_final_state_from_flags(self, state, pkts): + """ + Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags + We receive the pakets to distinguish some Reset connections + """ + try: + pre = state.split("_")[0] + try: + # Try suricata states + """ + There are different states in which a flow can be. + Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, + these are: New, Established and Closed,for UDP only new and established. + For each of these states Suricata can employ different timeouts. + """ + if "new" in state or "established" in state: + return "Established" + elif "closed" in state: + return "Not Established" + + # We have varius type of states depending on the type of flow. 
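+            # For example, Zeek's 'S0' (attempt seen, no reply) maps to
+            # 'Not Established', while 'SF' (normal establishment and
+            # termination) maps to 'Established'.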
+ # For Zeek + if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): + return "Not Established" + elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): + return "Established" + + # For Argus + suf = state.split("_")[1] + if "S" in pre and "A" in pre and "S" in suf and "A" in suf: + """ + Examples: + SA_SA + SR_SA + FSRA_SA + SPA_SPA + SRA_SPA + FSA_FSA + FSA_FSPA + SAEC_SPA + SRPA_SPA + FSPA_SPA + FSRPA_SPA + FSPA_FSPA + FSRA_FSPA + SRAEC_SPA + FSPA_FSRPA + FSAEC_FSPA + FSRPA_FSPA + SRPAEC_SPA + FSPAEC_FSPA + SRPAEC_FSRPA + """ + return "Established" + elif "PA" in pre and "PA" in suf: + # Tipical flow that was reported in the middle + """ + Examples: + PA_PA + FPA_FPA + """ + return "Established" + elif "ECO" in pre: + return "ICMP Echo" + elif "ECR" in pre: + return "ICMP Reply" + elif "URH" in pre: + return "ICMP Host Unreachable" + elif "URP" in pre: + return "ICMP Port Unreachable" + else: + """ + Examples: + S_RA + S_R + A_R + S_SA + SR_SA + FA_FA + SR_RA + SEC_RA + """ + return "Not Established" + except IndexError: + # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP + if "ECO" in pre: + # ICMP + return "Established" + elif "UNK" in pre: + # ICMP6 unknown upper layer + return "Established" + elif "CON" in pre: + # UDP + return "Established" + elif "INT" in pre: + # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there + # is no confirmation of what happened. + return "Not Established" + elif "EST" in pre: + # TCP + return "Established" + elif "RST" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established + # It can happen that is reseted being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. + return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + elif "FIN" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established + # It can happen that is finished being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. 
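+                # (Note: in this module 'pkts' is the total packet count
+                # of the flow, i.e. spkts + dpkts.)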
+ return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + else: + """ + Examples: + S_ + FA_ + PA_ + FSA_ + SEC_ + SRPA_ + """ + return "Not Established" + except Exception: + exception_line = sys.exc_info()[2].tb_lineno + self.print( + f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", + 0, + 1, + ) + self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ From 48cf9d05e63b9d09e44536dc77da6553118561ed Mon Sep 17 00:00:00 2001 From: alya Date: Tue, 30 Jul 2024 14:59:22 +0300 Subject: [PATCH 025/498] delete get_final_state_from_flags() from flowmldetection, profiler, and the db --- modules/flowmldetection/flowmldetection.py | 135 --------------------- 1 file changed, 135 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 12c3589ed..fb17b57f2 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -121,141 +121,6 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) - - def get_final_state_from_flags(self, state, pkts): - """ - Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags - We receive the pakets to distinguish some Reset connections - """ - try: - pre = state.split("_")[0] - try: - # Try suricata states - """ - There are different states in which a flow can be. - Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, - these are: New, Established and Closed,for UDP only new and established. - For each of these states Suricata can employ different timeouts. - """ - if "new" in state or "established" in state: - return "Established" - elif "closed" in state: - return "Not Established" - - # We have varius type of states depending on the type of flow. - # For Zeek - if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): - return "Not Established" - elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): - return "Established" - - # For Argus - suf = state.split("_")[1] - if "S" in pre and "A" in pre and "S" in suf and "A" in suf: - """ - Examples: - SA_SA - SR_SA - FSRA_SA - SPA_SPA - SRA_SPA - FSA_FSA - FSA_FSPA - SAEC_SPA - SRPA_SPA - FSPA_SPA - FSRPA_SPA - FSPA_FSPA - FSRA_FSPA - SRAEC_SPA - FSPA_FSRPA - FSAEC_FSPA - FSRPA_FSPA - SRPAEC_SPA - FSPAEC_FSPA - SRPAEC_FSRPA - """ - return "Established" - elif "PA" in pre and "PA" in suf: - # Tipical flow that was reported in the middle - """ - Examples: - PA_PA - FPA_FPA - """ - return "Established" - elif "ECO" in pre: - return "ICMP Echo" - elif "ECR" in pre: - return "ICMP Reply" - elif "URH" in pre: - return "ICMP Host Unreachable" - elif "URP" in pre: - return "ICMP Port Unreachable" - else: - """ - Examples: - S_RA - S_R - A_R - S_SA - SR_SA - FA_FA - SR_RA - SEC_RA - """ - return "Not Established" - except IndexError: - # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP - if "ECO" in pre: - # ICMP - return "Established" - elif "UNK" in pre: - # ICMP6 unknown upper layer - return "Established" - elif "CON" in pre: - # UDP - return "Established" - elif "INT" in pre: - # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there - # is no confirmation of what happened. - return "Not Established" - elif "EST" in pre: - # TCP - return "Established" - elif "RST" in pre: - # TCP. 
When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established - # It can happen that is reseted being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. - return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - elif "FIN" in pre: - # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established - # It can happen that is finished being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. - return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - else: - """ - Examples: - S_ - FA_ - PA_ - FSA_ - SEC_ - SRPA_ - """ - return "Not Established" - except Exception: - exception_line = sys.exc_info()[2].tb_lineno - self.print( - f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", - 0, - 1, - ) - self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ From e2aaf16170aefd3350c32122988f29633454c260 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Sat, 15 Mar 2025 19:23:29 +0100 Subject: [PATCH 026/498] Re add function that alya added --- slips_files/core/database/database_manager.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/slips_files/core/database/database_manager.py b/slips_files/core/database/database_manager.py index e0028e813..d0d586c4c 100644 --- a/slips_files/core/database/database_manager.py +++ b/slips_files/core/database/database_manager.py @@ -610,6 +610,9 @@ def add_out_dns(self, *args, **kwargs): def add_port(self, *args, **kwargs): return self.rdb.add_port(*args, **kwargs) + def get_final_state_from_flags(self, *args, **kwargs): + return self.rdb.get_final_state_from_flags(*args, **kwargs) + def add_ips(self, *args, **kwargs): return self.rdb.add_ips(*args, **kwargs) From 5b87d35ad971e343d73daa846350d6277682e3ba Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Sat, 15 Mar 2025 19:25:03 +0100 Subject: [PATCH 027/498] delete sys --- modules/flowmldetection/flowmldetection.py | 1 - 1 file changed, 1 deletion(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index fb17b57f2..286a397ef 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -10,7 +10,6 @@ import json import traceback import warnings -import sys from slips_files.common.parsers.config_parser import ConfigParser from slips_files.common.slips_utils import utils From 08dec989d4d1bb54ecf4922f294bcbee5c264ab3 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Sat, 15 Mar 2025 19:27:23 +0100 Subject: [PATCH 028/498] Delete file that was deleted from develop --- slips_files/common/state_handler.py | 170 ---------------------------- 1 file changed, 170 deletions(-) delete mode 100644 slips_files/common/state_handler.py diff --git a/slips_files/common/state_handler.py b/slips_files/common/state_handler.py deleted file mode 100644 index d0a05115b..000000000 --- a/slips_files/common/state_handler.py +++ /dev/null @@ -1,170 +0,0 @@ -from typing import Optional - - -def interpret_suricata_states(state) -> Optional[str]: - """ - There are different states in which a flow can be. - Suricata distinguishes three flow-states for TCP and two for - UDP. 
For TCP, - these are: New, Established and Closed,for UDP only new and - established. - For each of these states Suricata can employ different timeouts. - """ - if "new" in state or "established" in state: - return "Established" - elif "closed" in state: - return "Not Established" - - -def interpret_zeek_states(state) -> Optional[str]: - # We have varius type of states depending on the type of flow. - # For Zeek - if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): - return "Not Established" - elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): - return "Established" - - -def interpret_argus_states(state) -> Optional[str]: - pre = state.split("_")[0] - try: - suf = state.split("_")[1] - except IndexError: - return - - if "S" in pre and "A" in pre and "S" in suf and "A" in suf: - """ - Examples: - SA_SA - SR_SA - FSRA_SA - SPA_SPA - SRA_SPA - FSA_FSA - FSA_FSPA - SAEC_SPA - SRPA_SPA - FSPA_SPA - FSRPA_SPA - FSPA_FSPA - FSRA_FSPA - SRAEC_SPA - FSPA_FSRPA - FSAEC_FSPA - FSRPA_FSPA - SRPAEC_SPA - FSPAEC_FSPA - SRPAEC_FSRPA - """ - return "Established" - elif "PA" in pre and "PA" in suf: - # Tipical flow that was reported in the middle - """ - Examples: - PA_PA - FPA_FPA - """ - return "Established" - elif "ECO" in pre: - return "ICMP Echo" - elif "ECR" in pre: - return "ICMP Reply" - elif "URH" in pre: - return "ICMP Host Unreachable" - elif "URP" in pre: - return "ICMP Port Unreachable" - else: - """ - Examples: - S_RA - S_R - A_R - S_SA - SR_SA - FA_FA - SR_RA - SEC_RA - """ - return "Not Established" - - -def interpret_tcp_states(state, pkts) -> Optional[str]: - pre = state.split("_")[0] - if "EST" in pre: - # TCP - return "Established" - elif "RST" in pre: - # TCP. When -z B is not used in argus, states are single words. - # Most connections are reseted when finished and therefore are - # established - # It can happen that is reseted being not established, but we - # can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is - # not established because the OS retries 3 times. - return "Not Established" if int(pkts) <= 3 else "Established" - elif "FIN" in pre: - # TCP. When -z B is not used in argus, states are single words. - # Most connections are finished with FIN when finished and - # therefore are established - # It can happen that is finished being not established, but we - # can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is - # not established because the OS retries 3 times. - return "Not Established" if int(pkts) <= 3 else "Established" - else: - """ - Examples: - S_ - FA_ - PA_ - FSA_ - SEC_ - SRPA_ - """ - return "Not Established" - - -def interpret_udp_states(state) -> Optional[str]: - pre = state.split("_")[0] - if "CON" in pre: - # UDP - return "Established" - elif "INT" in pre: - # UDP trying to connect, NOT preciselly not established but also - # NOT 'Established'. So we considered not established because there - # is no confirmation of what happened. 
- return "Not Established" - - -def interpret_icmp_states(state) -> Optional[str]: - pre = state.split("_")[0] - if "ECO" in pre: - # ICMP - return "Established" - elif "UNK" in pre: - # ICMP6 unknown upper layer - return "Established" - - -def get_final_state_from_flags(state, pkts) -> str: - """ - Converts the original flags from the flow, to a state that slips - understands - Works with Argus, suricata, and Bro flags - We receive the packets to distinguish some Reset connections - """ - - for interpreter in ( - interpret_suricata_states, - interpret_zeek_states, - interpret_argus_states, - interpret_icmp_states, - interpret_udp_states, - ): - if interpreted_state := interpreter(state): - return interpreted_state - - if interpreted_state := interpret_tcp_states(state, pkts): - return interpreted_state - - return "Not Established" From 0c4455c108509246993c8aa081310f9c0ce5a240 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Sat, 15 Mar 2025 19:32:01 +0100 Subject: [PATCH 029/498] Flowmldetection. Fix missing db reference --- modules/flowmldetection/flowmldetection.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 286a397ef..fac5e674f 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -160,7 +160,7 @@ def process_features(self, dataset): # 'Not Established', it is still 'S0' and others # So transform here dataset["state"] = dataset.apply( - lambda row: get_final_state_from_flags( + lambda row: self.db.get_final_state_from_flags( row["state"], row["pkts"] ), axis=1, From 3429549c6326c9c7d7b9bc299fef48d6b754fb48 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Tue, 18 Mar 2025 12:08:08 +0100 Subject: [PATCH 030/498] Fix the training of flows with ML in new version --- modules/flowmldetection/flowmldetection.py | 144 +++++++++++---------- 1 file changed, 77 insertions(+), 67 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index fac5e674f..e6ea0b517 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -55,8 +55,12 @@ def init(self): # Set the output queue of our database instance # Read the configuration self.read_configuration() - # Minum amount of new lables needed to trigger the train - self.minimum_lables_to_retrain = 50 + # Minum amount of new labels needed to start the train + self.minimum_labels_to_start_train = 50 + # Minum amount of new labels needed to retrain + self.minimum_labels_to_retrain = 50 + # The number of flows when last trained + self.last_number_of_flows_when_trained = 0 # To plot the scores of training # self.scores = [] # The scaler trained during training and to use during testing @@ -67,26 +71,25 @@ def init(self): def read_configuration(self): conf = ConfigParser() self.mode = conf.get_ml_mode() + self.label = conf.label() def train(self): """ Train a model based on the flows we receive and the labels """ try: - # Process the labels to have only Normal and Malware - self.flows.label = self.flows.label.str.replace( - r"(^.*ormal.*$)", "Normal", regex=True - ) - self.flows.label = self.flows.label.str.replace( - r"(^.*alware.*$)", "Malware", regex=True - ) - self.flows.label = self.flows.label.str.replace( - r"(^.*alicious.*$)", "Malware", regex=True - ) + # Get the flows from the DB + # self.flows = self.db.get_all_flows_in_profileid_twid(self.profileid, self.twid) + # Convert to 
pandas df + # self.flows = pd.DataFrame(self.flows) + # Process the features + # X_flow = self.process_features(self.flows) - # Separate - y_flow = self.flows["label"] + # Create X_flow with the current flows minus the label X_flow = self.flows.drop("label", axis=1) + # Create y_flow with the label + y_flow = numpy.full(X_flow.shape[0], self.label) + # Drop the module_labels X_flow = X_flow.drop("module_labels", axis=1) # Normalize this batch of data so far. This can get progressivle slow @@ -95,7 +98,7 @@ def train(self): # Train try: self.clf.partial_fit( - X_flow, y_flow, classes=["Malware", "Normal"] + X_flow, y_flow, classes=["Malicious", "Benign"] ) except Exception: self.print("Error while calling clf.train()") @@ -118,7 +121,7 @@ def train(self): self.store_model() except Exception: - self.print("Error in train()", 0, 1) + self.print("Error in train().", 0, 1) self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): @@ -144,9 +147,7 @@ def process_features(self, dataset): "history", "uid", "dir_", - "dbytes", "endtime", - "bytes", "flow_source", ] for field in to_drop: @@ -161,11 +162,10 @@ def process_features(self, dataset): # So transform here dataset["state"] = dataset.apply( lambda row: self.db.get_final_state_from_flags( - row["state"], row["pkts"] + row["state"], (row["spkts"] + row["dpkts"]) ), axis=1, ) - # dataset.state = new_state_column # Convert state to categorical dataset.state = dataset.state.str.replace( @@ -199,7 +199,11 @@ def process_features(self, dataset): dataset.proto = dataset.proto.str.replace( r"(^.*arp.*$)", "4", regex=True ) - fields_to_convert_to_flow = [ + + dataset["allbytes"] = dataset["sbytes"] + dataset["dbytes"] + dataset["pkts"] = dataset["spkts"] + dataset["dpkts"] + + fields_to_convert_to_float = [ dataset.proto, dataset.dport, dataset.sport, @@ -210,10 +214,10 @@ def process_features(self, dataset): dataset.sbytes, dataset.state, ] - for field in fields_to_convert_to_flow: + for field in fields_to_convert_to_float: try: field = field.astype("float64") - except ValueError: + except (ValueError, AttributeError): pass return dataset @@ -222,9 +226,9 @@ def process_features(self, dataset): self.print("Error in process_features()") self.print(traceback.format_exc(), 0, 1) - def process_flows(self): + def process_training_flows(self): """ - Process all the flwos in the DB + Process all the flows in the DB Store the pandas df in self.flows """ try: @@ -240,44 +244,48 @@ def process_flows(self): # that are fake but representative of a normal and malware flow # they are only for the training process # At least 1 flow of each label is required - # self.print(f'Amount of labeled flows: {labels}', 0, 1) + + # These flows should be in the same format as the ones in the DB. + # Which means the satate is still SF, S0, etc. 
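+            # Note: train() calls partial_fit() with
+            # classes=["Malicious", "Benign"], so keeping one seed flow of
+            # each class guarantees both labels appear in every batch.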
flows.append( { - "ts": 1594417039.029793, + "starttime": 1594417039.029793, "dur": "1.9424750804901123", "saddr": "10.7.10.101", "sport": "49733", "daddr": "40.70.224.145", "dport": "443", "proto": "tcp", - "state": "Established", - "allbytes": 42764, - "spkts": 37, + "state": "SF", + "spkts": 17, + "dpkts": 27, "sbytes": 25517, + "dbytes": 17247, "appproto": "ssl", - "label": "Malware", + "label": "Malicious", "module_labels": { - "flowalerts-long-connection": "Malware" + "flowalerts-long-connection": "Malicious" }, } ) flows.append( { - "ts": 1382355032.706468, + "starttime": 1382355032.706468, "dur": "10.896695", "saddr": "147.32.83.52", "sport": "47956", "daddr": "80.242.138.72", "dport": "80", "proto": "tcp", - "state": "Established", - "allbytes": 67696, + "state": "SF", "spkts": 1, + "dpkts": 0, "sbytes": 100, + "dbytes": 67596, "appproto": "http", - "label": "Normal", + "label": "Benign", "module_labels": { - "flowalerts-long-connection": "Normal" + "flowalerts-long-connection": "Benign" }, } ) @@ -318,7 +326,6 @@ def detect(self, x_flow) -> Optional[numpy.ndarray]: and returns the predection array """ try: - given_x_flow = x_flow # clean the flow fields_to_drop = [ "label", @@ -326,10 +333,7 @@ def detect(self, x_flow) -> Optional[numpy.ndarray]: "uid", "history", "dir_", - "dbytes", - "dpkts", "endtime", - "bytes", "flow_source", ] for field in fields_to_drop: @@ -343,7 +347,7 @@ def detect(self, x_flow) -> Optional[numpy.ndarray]: return pred except Exception as e: self.print( - f"Error in detect() while processing " f"\n{given_x_flow}\n{e}" + f"Error in detect() while processing " f"\n{x_flow}\n{e}" ) self.print(traceback.format_exc(), 0, 1) @@ -435,18 +439,16 @@ def pre_main(self): def main(self): if msg := self.get_msg("new_flow"): + # When a new flow arrives msg = json.loads(msg["data"]) - twid = msg["twid"] + self.twid = msg["twid"] + self.profileid = msg["profileid"] self.flow = msg["flow"] - # these fields are expected in testing. update the original + # These following extra fields are expected in testing. update the original # flow dict to have them self.flow.update( { - "allbytes": (self.flow["sbytes"] + self.flow["dbytes"]), - # the flow["state"] is the origstate, we dont need that here - # we need the interpreted state "state": msg["interpreted_state"], - "pkts": self.flow["spkts"] + self.flow["dpkts"], "label": msg["label"], "module_labels": msg["module_labels"], } @@ -459,23 +461,31 @@ def main(self): # Use labeled flows labels = self.db.get_labels() sum_labeled_flows = sum(i[1] for i in labels) + + # The min labels to retrain is the min number of flows + # we should have seen so far in this capture to start training + # This is so we dont _start_ training with only 1 flow + + # Once we are over the start minimum, the second condition is + # to force to retrain every a minimum_labels_to_retrain number + # of flows. So we dont retrain every 1 flow. if ( - sum_labeled_flows >= self.minimum_lables_to_retrain - and sum_labeled_flows % self.minimum_lables_to_retrain == 1 + sum_labeled_flows >= self.minimum_labels_to_start_train ): - # We get here every 'self.minimum_lables_to_retrain' - # amount of labels - # So for example we retrain every 100 labels and only when - # we have at least 100 labels - self.print( - f"Training the model with the last group of " - f"flows and labels. Total flows: {sum_labeled_flows}." 
- ) - # Process all flows in the DB and make them ready - # for pandas - self.process_flows() - # Train an algorithm - self.train() + if (sum_labeled_flows - self.last_number_of_flows_when_trained >= self.minimum_labels_to_retrain): + # So for example we retrain every 50 labels and only when + # we have at least 50 labels + self.print( + f"Training the model with the last group of " + f"flows and labels. Total flows: {sum_labeled_flows}." + ) + # Process all flows in the DB and make them ready + # for pandas + self.process_training_flows() + # Train an algorithm + self.train() + self.last_number_of_flows_when_trained = sum_labeled_flows + elif self.mode == "test": # We are testing, which means using the model to detect processed_flow = self.process_flow(self.flow) @@ -495,8 +505,8 @@ def main(self): # and the label is diff from the prediction, # print in debug mode self.print( - f"Report Prediction {pred[0]} for label" - f' {label} flow {self.flow["saddr"]}:' + f"Predicted {pred[0]} for ground-truth label" + f' {label}. Flow {self.flow["saddr"]}:' f'{self.flow["sport"]} ->' f' {self.flow["daddr"]}:' f'{self.flow["dport"]}/' @@ -504,9 +514,9 @@ def main(self): 0, 3, ) - if pred[0] == "Malware": + if pred[0] == "Malicious": # Generate an alert - self.set_evidence_malicious_flow(self.flow, twid) + self.set_evidence_malicious_flow(self.flow, self.twid) self.print( f"Prediction {pred[0]} for label {label}" f' flow {self.flow["saddr"]}:' From a779358bb3a6f8d72446c45c8b3feaf1406c87f4 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Tue, 18 Mar 2025 12:08:29 +0100 Subject: [PATCH 031/498] Fix the profiler handler for cases of nan in state --- .../core/database/redis_db/profile_handler.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/slips_files/core/database/redis_db/profile_handler.py b/slips_files/core/database/redis_db/profile_handler.py index 0489372cd..1ea764464 100644 --- a/slips_files/core/database/redis_db/profile_handler.py +++ b/slips_files/core/database/redis_db/profile_handler.py @@ -379,7 +379,12 @@ def get_final_state_from_flags(self, state, pkts): We receive the pakets to distinguish some Reset connections """ try: - pre = state.split("_")[0] + # In some flows the state is a nan + try: + pre = state.split("_")[0] + except AttributeError: + pre = '' + try: # Try suricata states """ @@ -401,7 +406,11 @@ def get_final_state_from_flags(self, state, pkts): return "Established" # For Argus - suf = state.split("_")[1] + # In some flows the state is a nan + try: + suf = state.split("_")[1] + except AttributeError: + suf = '' if "S" in pre and "A" in pre and "S" in suf and "A" in suf: """ Examples: @@ -502,7 +511,7 @@ def get_final_state_from_flags(self, state, pkts): except Exception: exception_line = sys.exc_info()[2].tb_lineno self.print( - f"Error in getFinalStateFromFlags() in database.py line {exception_line}", + f"Error in get_final_state_from_flags() in profile_handler.py line {exception_line}", 0, 1, ) From 223d72d0948098bb30f3a0992ac978f2249a9c35 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Wed, 19 Mar 2025 14:20:42 +0100 Subject: [PATCH 032/498] slips.yaml. Update to have correct labels. By default test. 
 Default training label is Benign

---
 config/slips.yaml | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/config/slips.yaml b/config/slips.yaml
index 34f41e710..31847a6df 100644
--- a/config/slips.yaml
+++ b/config/slips.yaml
@@ -106,13 +106,12 @@ parameters:
   deletePrevdb: true

   # Set the label for all the flows that are being read.
-  # For now only normal and malware directly. No option for setting labels
-  # with a filter
+  # For now only Benign and Malicious (Capitalized)
   # The purpose is to be used in the training of ML models and to output
   # flows with labels for other tools.
-  # label: malicious
-  # label: unknown
-  label: normal
+  # label: Malicious
+  # label: Benign
+  label: Benign
   # If Zeek files are rotated or not to avoid running out of disk.
   # Zeek rotation is enabled by default when using an interface,
   # which means Slips will delete all Zeek log files after 1 day

From 18b9a9559b08a4675248e2437eae7b271ab9ec94 Mon Sep 17 00:00:00 2001
From: Sebas Garcia
Date: Wed, 19 Mar 2025 14:21:21 +0100
Subject: [PATCH 033/498] First ipython to test ML flow related models

---
 modules/flowmldetection/flowmlanalysis.ipynb | 76 ++++++++++++++++++++
 1 file changed, 76 insertions(+)
 create mode 100644 modules/flowmldetection/flowmlanalysis.ipynb

diff --git a/modules/flowmldetection/flowmlanalysis.ipynb b/modules/flowmldetection/flowmlanalysis.ipynb
new file mode 100644
index 000000000..d726cd280
--- /dev/null
+++ b/modules/flowmldetection/flowmlanalysis.ipynb
@@ -0,0 +1,76 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Analysis of Flows with Machine Learning for Slips"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Analysis of a fixed list of flows to try techniques and find parameters"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import numpy\n",
+    "from sklearn.linear_model import SGDClassifier\n",
+    "from sklearn.preprocessing import StandardScaler\n",
+    "import pickle\n",
+    "import pandas as pd\n",
+    "import json\n",
+    "import traceback\n",
+    "import warnings"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "slips-new",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.12"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}

From 5c89e4db5a40fda5b1cce21996c684d36c93d667 Mon Sep 17 00:00:00 2001
From: Sebas Garcia
Date: Wed, 19 Mar 2025 14:22:38 +0100
Subject: [PATCH 034/498] flowml. If the dataset has one flow and that is
 deleted, then return empty fast.
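
A minimal sketch of the behavior this patch guards against, using a
hypothetical single-flow DataFrame (assumes pandas imported as pd, as in
the module):

    import pandas as pd

    df = pd.DataFrame([{"proto": "arp", "dur": 0.1}])
    df = df[df.proto != "arp"]  # the only flow is discarded
    assert df.empty  # without an early return, the later feature
                     # processing would run on an empty frame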
---
 modules/flowmldetection/flowmldetection.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index e6ea0b517..0fa1e4d76 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -135,6 +135,11 @@ def process_features(self, dataset):
             for proto in to_discard:
                 dataset = dataset[dataset.proto != proto]

+            # If the proto is in the list to discard and there is only one flow, then the dataset will be empty
+            if dataset.empty:
+                # The DataFrame is empty now, so return it early
+                return dataset
+
             # For now, discard these
             to_drop = [
                 "appproto",

From ebbfd953cb028cf9ef0b75cd17168fc70f6921b0 Mon Sep 17 00:00:00 2001
From: Sebas Garcia
Date: Wed, 19 Mar 2025 14:23:05 +0100
Subject: [PATCH 035/498] flowml. If the dataset is empty, return None

---
 modules/flowmldetection/flowmldetection.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 0fa1e4d76..5c5f9943f 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -318,6 +318,8 @@ def process_flow(self, flow_to_process: dict):
             # Convert the flow to a pandas dataframe
             raw_flow = pd.DataFrame(flow_to_process, index=[0])
             dflow = self.process_features(raw_flow)
+            if dflow.empty:
+                return None
             # Update the flow to the processed version
             return dflow
         except Exception:

From 13287d134eb09ac30dcb0e056d5465544d545591 Mon Sep 17 00:00:00 2001
From: Sebas Garcia
Date: Wed, 19 Mar 2025 14:26:42 +0100
Subject: [PATCH 036/498] profile_handler. Fix a small bug in how we handled
 the state: we were using 'in' instead of == for established, so some not
 established flows may not have been correctly captured

---
 slips_files/core/database/redis_db/profile_handler.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/slips_files/core/database/redis_db/profile_handler.py b/slips_files/core/database/redis_db/profile_handler.py
index 1ea764464..85fdec5a6 100644
--- a/slips_files/core/database/redis_db/profile_handler.py
+++ b/slips_files/core/database/redis_db/profile_handler.py
@@ -393,9 +393,10 @@ def get_final_state_from_flags(self, state, pkts):
         these are: New, Established and Closed,for UDP only new and established.
         For each of these states Suricata can employ different timeouts.
         """
-        if "new" in state or "established" in state:
+        # This is controversial, but if we dont have a good state, we consider it not established for now
+        if "new" in state or state.lower() == "established":
             return "Established"
-        elif "closed" in state:
+        elif "closed" in state or state.lower() == 'not established':
             return "Not Established"

         # We have varius type of states depending on the type of flow.
@@ -406,7 +407,6 @@
             return "Established"

         # For Argus
-        # In some flows the state is a nan
         try:
             suf = state.split("_")[1]
         except AttributeError:

From 9588762aa88da736012b2b6f5844f3ca0c39f15c Mon Sep 17 00:00:00 2001
From: Sebas Garcia
Date: Wed, 19 Mar 2025 14:27:16 +0100
Subject: [PATCH 037/498] First new version of the model and scaler. Not good
 yet, but working.
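
For reference, a quick way to peek at the new binaries (a sketch; it
assumes model.bin and scaler.bin are plain pickles of the SGDClassifier
and StandardScaler, which is how this module appears to save them):

    import pickle

    with open("modules/flowmldetection/model.bin", "rb") as f:
        clf = pickle.load(f)
    with open("modules/flowmldetection/scaler.bin", "rb") as f:
        scaler = pickle.load(f)
    print(clf.classes_)  # e.g. ['Benign' 'Malicious'] after training
    print(scaler.mean_)  # per-feature means learned so far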
--- modules/flowmldetection/model.bin | Bin 1073 -> 1090 bytes modules/flowmldetection/scaler.bin | Bin 666 -> 890 bytes 2 files changed, 0 insertions(+), 0 deletions(-) diff --git a/modules/flowmldetection/model.bin b/modules/flowmldetection/model.bin index aef4cba35b7b18287b2be11df2c45e9187d053e0..7ea3d27c97ca27abe7bcef4f35f2057da4bf2b0f 100644 GIT binary patch delta 411 zcmdnUafpMZfn}=sMwVTSLVS7gX{m`NrA4X5@tJw?Q@pt+?`4dsSMFg=DXA<-oice! z4^yb+l+KR!DM3>-yqS9#ZKnA7`FX2&vv_lqOiAi=_HbfgU`PexJRr^l;&dn*NWpQXRJ|Emr(~!@tY-9P0&;YE zn9?i|&dD&Ce2~eg9z!XkH+S2V4C_wkh?WyPJF_Dc9WKfLlU4d@b|Cws{>t}{)DLw1 z+&}AzmCAv&TP(zH?>}O{hvWaISvjr_FKvFL2{5=iNW7SLz3Rc`{Vb{$-4P-p4qe>8 zsxB_naJaLsc#$rfl>^wdT$y<#sYS_psRbqRQ!+4J0dz3X4IB|u^h2FOBOrSC<0o%q b_GNYGp25H{nVChIi`7uiOwVviX_6iQocV_5 delta 380 zcmX@av5|wNfn}=NMwVTSlP@wxCo1)@rj%3`q)wSUrH3ihd`f3W`;?$58s5x3j5bsJ z{QSISyji?CN~R=rI$QWMFfb$naSjldL)k?@Iu(fhfH)tBLHt}GhN)wiJeNsPO)*2o zo1t||h8o0hMsFq{N4tk9%>rRzhW_LyOh)w>N*TR5+ps7Gy3Ol95P%792Cxe~W+qIF za^V+n06G8#1cs%Z;j-Fjug&$3=gY2%`}>OzXw^50I4Ib2ygQ=8?eISD+{&-(1rNNN zW*@ diff --git a/modules/flowmldetection/scaler.bin b/modules/flowmldetection/scaler.bin index 9292bda6a6f307edcbe83de0ccfac2437950fa6f..bfba4d107224e5e6e5a1e8c8f4d463b48131d111 100644 GIT binary patch literal 890 zcmZo*nVQec00uqM#o0NjiA8yO1x2X^Mfu68#l@L<>3Z=gi6x0sdia7%67y0Li&BD< z6LV6Frc9oq**K+kibfA-d1gsQd~RxD-jt3W4lt*|B)5DrlQdy7+(!%6#F{QHuBFo&v zSTn`X&(G^W5P%79`;sY1ov}cNGpCdm0WAP}wIm-%7Z>Cgl>q4!C|z8TT~Z9@gK4(n zq{@<1pa5G6l)+kDl30>DrH3OiCkHCT0@jlXw4sMDFCOY|h?l*&d-(I>ixYDTa)5zQ zoSF&@h78Ugwqjs#CW3-9L$rq}6UF5=Xf9^}n(NGw#R!S7DQ!Ui_ppKj9%vGG1|KL~ zG6axA1!!U#l8G7ms38F|uhTi=d{gv8|4E7tk+E%iFU@~!UvXb5a&PSo2flmT-L8E; z8bB?zwe6@e~Ltx1rmTHK2M45x=S>Y_k5fcAZRq4P``?^7;27=%1`Z_~Q$U{zGVUTcbp z!_f@A=ox0(4vo(kWe&dF=y3Qof57>#YaOosUa>d6=$d`i{)GMC=j?EBVaVAqgWVBg dy+AxPN5z+=78Pga=K-V3P|r-ya7t;C9st`jc!2-_ delta 525 zcmeyxHj9S2q>4Eng}#5gR6%pz9=<0Kd-o?s5H3*s5pan;tH91&ekaz{5?!*7E?Mq z+JPE0yqPnE{rvpA{sRG+@Mb8PlGN$!ajPlrTH5mhSAF{H=RwvmLf> z0f8s>T))|-e^~LtKJ)mxuX}VF9GJI$RyS%Ec8u45-?98hw!_K^*XJ>X{BW28G_HrG zEU^gaDyjMm8E*!l6XYOvGkSyEqXM*T;RVCs$??LDiHthAxexX@M(tU$UF%YfQ^>gH&y>a4SnPazubB5!cqc1YnCGT))m9-o7^A!Lv41W!_hYO<*VziI$XJV#W|*OtHau9f5TQzy=Sjs!`l%1;+FkJ z-k|Geeq3|-Iw{uqQ%0WS6}i^>);WhA{C@WaM)rg_g6$KCFV4d&dkpP QMw@}2k%^wsl+q+U0P2|NZU6uP From 531946f0f0d880cc68dd95991a08b387b6a78c39 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Thu, 20 Mar 2025 13:16:06 +0100 Subject: [PATCH 038/498] model and scaler with 1 malicious and 1 benign --- modules/flowmldetection/model.bin | Bin 1090 -> 1124 bytes modules/flowmldetection/scaler.bin | Bin 890 -> 890 bytes 2 files changed, 0 insertions(+), 0 deletions(-) diff --git a/modules/flowmldetection/model.bin b/modules/flowmldetection/model.bin index 7ea3d27c97ca27abe7bcef4f35f2057da4bf2b0f..0fac693b39f8e2f0e826471e72a52010709a2a4a 100644 GIT binary patch delta 132 zcmX@a@q~k=fn{psMix!x$(NZ_BO`RCb>5jSYK$N!g2|oC+8oRs(l>gtTsw(FScJ$7xocP4|5 zn$$WZz^$}67u&zYKfDP4vYKo~KfmcQ~I?m(e2&@mlG zqd?6RN*?ZoGC;4up#2u+D?s5YufU*hus|2)Nzwn}vOwP`V1tl2?mv8p{76u#!9bEa zjm~^LK$BYnCRf}Ju^9H~BS8=X&<+7gXFor!l)9lJXF)UUPqWf%b3r!tJEWTtnm`Ac zf^|*Gc0elJHxDoXoPgl=^Tj5WVAwXGOhP~3C<3D)c8d<%4*gy{6w?@|U z%s@r+gj&Fu5J2_kY{!Qn^*`?T5Tx7$dq73=gj&E@c0d))6KVlt!ao`IN`KC-h(JOu zU`*oR6_Z;6CRfo>BRz3ectB?@J_D&+&OhD+nN>A{w?PryCGc Date: Thu, 20 Mar 2025 13:16:27 +0100 Subject: [PATCH 039/498] cleaner jupyter --- modules/flowmldetection/flowmldetection.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/modules/flowmldetection/flowmldetection.py 
b/modules/flowmldetection/flowmldetection.py index 5c5f9943f..fe950ed4b 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -343,6 +343,23 @@ def detect(self, x_flow) -> Optional[numpy.ndarray]: "endtime", "flow_source", ] + # For argus binetflows this fails because ther is a field calle bytes that was not in other flows. It should be called allbytes. + # Error + ''' [Flow ML Detection] Error in detect() while processing + dur proto sport dport state pkts spkts dpkts bytes sbytes dbytes allbytes + 0 63.822830 0 56119 981 0.0 15 15 0 8764 1887 0 1887 + The feature names should match those that were passed during fit. + Feature names unseen at fit time: + - bytes + ''' + + # IF we delete here the filed bytes the error is + # [Flow ML Detection] Error in detect() while processing + # dur proto sport dport state pkts spkts dpkts sbytes dbytes allbytes + # 0 63.822830 0 56120 980 0.0 15 15 0 1887 0 1887 + # The feature names should match those that were passed during fit. + # Feature names must be in the same order as they were in fit. + for field in fields_to_drop: try: x_flow = x_flow.drop(field, axis=1) From 473b0958153803624f757ac7b3bb85ffb9d68930 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Thu, 20 Mar 2025 22:26:27 +0100 Subject: [PATCH 040/498] New models after 3rd ttrain --- modules/flowmldetection/model.bin | Bin 1124 -> 1124 bytes modules/flowmldetection/scaler.bin | Bin 890 -> 890 bytes 2 files changed, 0 insertions(+), 0 deletions(-) diff --git a/modules/flowmldetection/model.bin b/modules/flowmldetection/model.bin index 0fac693b39f8e2f0e826471e72a52010709a2a4a..5ebcb37205a7a3e8d0a20ab78a1219f0ddf95dd8 100644 GIT binary patch delta 99 zcmaFD@q}YTFtfkevYaFSxkd-_3;rCozh!)2lYmpEbEdGvA?N`Qv**dmWkY#6EnbNU>V$pfHc5?(6z~90v14#>uEKZXcoY2J@~Q; zm@vT916v-#h9y9THMMtpw*Eia3Y9($*ABq#7l@Rev)(_(4lO-NtuB+I1CayN#ekDG F1TN2UERg^J diff --git a/modules/flowmldetection/scaler.bin b/modules/flowmldetection/scaler.bin index 758909b289238ff282b2e056a9b3e83768b8472a..821344a0c69d116622b02e2a0daa1554cb5d308e 100644 GIT binary patch delta 43 zcmV+`0M!5b2KolDfdU!c4zU>a=p#W80?-ZtN@qVmt(3Za=p#W80?-ZtN@qVmt(3Z Date: Wed, 26 Mar 2025 00:08:50 +0100 Subject: [PATCH 041/498] Models after 4th train --- modules/flowmldetection/model.bin | Bin 1124 -> 1124 bytes modules/flowmldetection/scaler.bin | Bin 890 -> 890 bytes 2 files changed, 0 insertions(+), 0 deletions(-) diff --git a/modules/flowmldetection/model.bin b/modules/flowmldetection/model.bin index 5ebcb37205a7a3e8d0a20ab78a1219f0ddf95dd8..3ab5a240bb45f88d026d1d9d1959cfa384e2473b 100644 GIT binary patch delta 120 zcmV-;0EhqN2;>N`Qv+ChN@}Q2`wPIHjIJ0X6&b*sXgPyxz!<>7-o1V`boIY<&cGv* zMA;T^cz6USY()1i3vc*(ICW>9REM~el3OL8yLVz+8BZ7+Y~^P as{<PghU~{@&OfB0!%%<|y61;y>t*IZ8K{*+AXCvU*qB z%0Pe2QM!F)5kR-EtllLL_CF>M2ApwSd_Z`~g8^q;cR(Gb+gDca!aq;!#7^woi9l`m z{4q!A7L!{7CRc2&pGU{qctGGXl*-2r&p+4h*(lZXw?Qu&=iylTwLyoc9%Oz?d_V=A za=mcAXh1m7B9yUmbwL&Oq6=*NZ9yf7dx}OPz(0XAuDu67ghArqkBr`9lats2C|9(o zHuM(pGeE6wqA`7j-#@s^-O-hk(?CdW8YGjz(m<`3N;vgKH9+Ch9fX%7EI^{)^od!9 ovOtBo|Dy%^u0ZgT2LsNy);|K#Ksj}DxjML6cVlBO2|D9RL6T delta 290 zcmV+-0p0%k2KolDvjGBo1(US_Pgl4iHtM!%U_hWRf%FVpXFo>fmcQ~I?m(e2&@mlG zqd?6RN*?ZoGC;4up#2u+D?s5YufU*hus|2)Nzwn}vOwP`V1tl2?mv8p{76u#!9bEa zjm~^LK$BYnCRg4Lu^9H~BS8=X&<+7gXFor!l)9lJXF)RTPqWf%b3r!tJEWTtnm`Ac zf^|*Gc0elJHxDoXoty4FCWD From 9efd09bf3ca9fc3de4899135fd286db07b8df3d8 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Wed, 26 Mar 2025 08:28:59 +0100 Subject: [PATCH 042/498] Models of ml flow with the first good 
performance in small tests --- modules/flowmldetection/model.bin | Bin 1124 -> 1124 bytes modules/flowmldetection/scaler.bin | Bin 890 -> 890 bytes 2 files changed, 0 insertions(+), 0 deletions(-) diff --git a/modules/flowmldetection/model.bin b/modules/flowmldetection/model.bin index 3ab5a240bb45f88d026d1d9d1959cfa384e2473b..a6648cf72179520975b0e9ad1164f7d574e87140 100644 GIT binary patch delta 121 zcmV-<0EYkM2;>N`Qv+C~&P*9hb`!w*mV~|wLl?l5mFM4w$NRqlOAcXmpyfaAG1(); zYA3+ulRv$a;~zkIU#E>ocI-ba#L|>%Hv~ZN`Y?b6^limYW1McwvlQsk{8#y@u delta 121 zcmV-<0EYkM2;>N`Qv+ChN@}Q2`wPIHjIJ0X6&b*sXgPyxz!<>7-o1V`boIY<&cGv* zMA;T^cz6USY()1i3vc*(ICW>9REM~el3OL8yLVz+8BZ7+Y~^P bs{<fmcQ~I?m(e2&@mlG zqd?6RN*?ZoGC;4up#2u+D?s5YufU*hus|2)Nzwn}vOwP`V1tl2?mv8p{76u#!9bEa zjm~^LK$BYnCRf}Ju^9H~BS8=X&<+7gXFor!l)9lJXF)UUPqWf%b3r-wJEWTtnm`Ac zf^|*Gc0elJHxDoXoPghU~{@&OfB0!%%<|y61;y>t*IZ8K{*+AXCvU*qB z%0Pe2QM!F)5kR-EtllLL_CF>M2ApwSd_Z`~g8^q;cR(Gb+gDca!aq;!#7^woi9l`m z{4q!A7L!{7CRc2&pGU{qctGGXl*-2r&p+4h*(lZXw?Qu&=iylTwLyoc9%Oz?d_V=A za=mcAXh1m7B9yUmbwL&Oq6=*NZ9yf7dx}OPz(0XAuDu67ghArqkBr`9lats2C|9(o zHuM(pGeE6wqA`7j-#@s^-O-hk(?CdW8YGjz(m<`3N;vgKH9+Ch9fX%7EI^{)^od!9 ovOtBo|Dy%^u0ZgT2LsNy);|K#Ksj}DxjML6cVlBO2|D9RL6T From 978f87cb89f5ec6dfdea380afb76aa952b77bb38 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Mon, 29 Jul 2024 16:32:14 +0200 Subject: [PATCH 043/498] mlflow. Add a function to convert the state again --- modules/flowmldetection/flowmldetection.py | 154 ++++++++++++++++++++- 1 file changed, 150 insertions(+), 4 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index e44ac83f4..16b67e903 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -10,6 +10,7 @@ import json import traceback import warnings +import sys from slips_files.common.parsers.config_parser import ConfigParser from slips_files.common.slips_utils import utils @@ -120,6 +121,141 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) + + def get_final_state_from_flags(self, state, pkts): + """ + Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags + We receive the pakets to distinguish some Reset connections + """ + try: + pre = state.split("_")[0] + try: + # Try suricata states + """ + There are different states in which a flow can be. + Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, + these are: New, Established and Closed,for UDP only new and established. + For each of these states Suricata can employ different timeouts. + """ + if "new" in state or "established" in state: + return "Established" + elif "closed" in state: + return "Not Established" + + # We have varius type of states depending on the type of flow. 
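+            # (The Suricata states above are lowercase words such as
+            # 'established'; the Zeek and Argus states below are uppercase
+            # flag codes such as 'S0' or 'SPA_SPA'.)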
+ # For Zeek + if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): + return "Not Established" + elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): + return "Established" + + # For Argus + suf = state.split("_")[1] + if "S" in pre and "A" in pre and "S" in suf and "A" in suf: + """ + Examples: + SA_SA + SR_SA + FSRA_SA + SPA_SPA + SRA_SPA + FSA_FSA + FSA_FSPA + SAEC_SPA + SRPA_SPA + FSPA_SPA + FSRPA_SPA + FSPA_FSPA + FSRA_FSPA + SRAEC_SPA + FSPA_FSRPA + FSAEC_FSPA + FSRPA_FSPA + SRPAEC_SPA + FSPAEC_FSPA + SRPAEC_FSRPA + """ + return "Established" + elif "PA" in pre and "PA" in suf: + # Tipical flow that was reported in the middle + """ + Examples: + PA_PA + FPA_FPA + """ + return "Established" + elif "ECO" in pre: + return "ICMP Echo" + elif "ECR" in pre: + return "ICMP Reply" + elif "URH" in pre: + return "ICMP Host Unreachable" + elif "URP" in pre: + return "ICMP Port Unreachable" + else: + """ + Examples: + S_RA + S_R + A_R + S_SA + SR_SA + FA_FA + SR_RA + SEC_RA + """ + return "Not Established" + except IndexError: + # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP + if "ECO" in pre: + # ICMP + return "Established" + elif "UNK" in pre: + # ICMP6 unknown upper layer + return "Established" + elif "CON" in pre: + # UDP + return "Established" + elif "INT" in pre: + # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there + # is no confirmation of what happened. + return "Not Established" + elif "EST" in pre: + # TCP + return "Established" + elif "RST" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established + # It can happen that is reseted being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. + return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + elif "FIN" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established + # It can happen that is finished being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. 
+ return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + else: + """ + Examples: + S_ + FA_ + PA_ + FSA_ + SEC_ + SRPA_ + """ + return "Not Established" + except Exception: + exception_line = sys.exc_info()[2].tb_lineno + self.print( + f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", + 0, + 1, + ) + self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ @@ -132,7 +268,7 @@ def process_features(self, dataset): for proto in to_discard: dataset = dataset[dataset.proto != proto] - # For now, discard the ports + # For now, discard these to_drop = [ "appproto", "daddr", @@ -155,15 +291,25 @@ def process_features(self, dataset): except (ValueError, KeyError): pass + # When flows are read from Slips sqlite, the state is not transformed to 'Established' or 'Not Established', it is still 'S0' and others + # So transform here + #new_state_column = dataset.state.apply(self.get_final_state_from_flags(dataset.state, dataset.pkts)) + dataset['state'] = dataset.apply(lambda row: self.get_final_state_from_flags(row['state'], row['pkts']), axis=1) + + #dataset.state = new_state_column + # Convert state to categorical dataset.state = dataset.state.str.replace( - r"(^.*NotEstablished.*$)", "0", regex=True + r"(^.*Not Established.*$)", "0", regex=True ) dataset.state = dataset.state.str.replace( r"(^.*Established.*$)", "1", regex=True ) - # Convert proto to categorical. For now we only have few states, - # so we can hardcode... + + # Convert categories to floats + dataset.state = dataset.state.astype("float64") + + # Convert proto to categorical. For now we only have few states, so we can hardcode... # We dont use the data to create categories because in testing mode # we dont see all the protocols # Also we dont store the Categorizer because the user can retrain From a5dd40500fc88636982bcbb9dd8bf05803dbb3cc Mon Sep 17 00:00:00 2001 From: alya Date: Tue, 30 Jul 2024 14:35:21 +0300 Subject: [PATCH 044/498] state_handler: split get_final_state_from_flags() into smaller functions --- slips_files/common/state_handler.py | 179 ++++++++++++++++++++++++++++ 1 file changed, 179 insertions(+) create mode 100644 slips_files/common/state_handler.py diff --git a/slips_files/common/state_handler.py b/slips_files/common/state_handler.py new file mode 100644 index 000000000..b671a09a2 --- /dev/null +++ b/slips_files/common/state_handler.py @@ -0,0 +1,179 @@ +from typing import Optional +import sys +import traceback + + +def check_suricata_states(state) -> Optional[str]: + """ + There are different states in which a flow can be. + Suricata distinguishes three flow-states for TCP and two for + UDP. For TCP, + these are: New, Established and Closed,for UDP only new and + established. + For each of these states Suricata can employ different timeouts. + """ + if "new" in state or "established" in state: + return "Established" + elif "closed" in state: + return "Not Established" + + +def check_zeek_states(state) -> Optional[str]: + # We have varius type of states depending on the type of flow. 
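+    # Each check_* helper implicitly returns None when the state does not
+    # match its format, so the caller can fall through to the next check.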
+ # For Zeek + if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): + return "Not Established" + elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): + return "Established" + + +def check_argus_states(state) -> Optional[str]: + pre = state.split("_")[0] + suf = state.split("_")[1] + if "S" in pre and "A" in pre and "S" in suf and "A" in suf: + """ + Examples: + SA_SA + SR_SA + FSRA_SA + SPA_SPA + SRA_SPA + FSA_FSA + FSA_FSPA + SAEC_SPA + SRPA_SPA + FSPA_SPA + FSRPA_SPA + FSPA_FSPA + FSRA_FSPA + SRAEC_SPA + FSPA_FSRPA + FSAEC_FSPA + FSRPA_FSPA + SRPAEC_SPA + FSPAEC_FSPA + SRPAEC_FSRPA + """ + return "Established" + elif "PA" in pre and "PA" in suf: + # Tipical flow that was reported in the middle + """ + Examples: + PA_PA + FPA_FPA + """ + return "Established" + elif "ECO" in pre: + return "ICMP Echo" + elif "ECR" in pre: + return "ICMP Reply" + elif "URH" in pre: + return "ICMP Host Unreachable" + elif "URP" in pre: + return "ICMP Port Unreachable" + else: + """ + Examples: + S_RA + S_R + A_R + S_SA + SR_SA + FA_FA + SR_RA + SEC_RA + """ + return "Not Established" + + +def check_tcp_states(state, pkts) -> Optional[str]: + pre = state.split("_")[0] + if "EST" in pre: + # TCP + return "Established" + elif "RST" in pre: + # TCP. When -z B is not used in argus, states are single words. + # Most connections are reseted when finished and therefore are + # established + # It can happen that is reseted being not established, but we + # can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is + # not established because the OS retries 3 times. + return "Not Established" if int(pkts) <= 3 else "Established" + elif "FIN" in pre: + # TCP. When -z B is not used in argus, states are single words. + # Most connections are finished with FIN when finished and + # therefore are established + # It can happen that is finished being not established, but we + # can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is + # not established because the OS retries 3 times. + return "Not Established" if int(pkts) <= 3 else "Established" + else: + """ + Examples: + S_ + FA_ + PA_ + FSA_ + SEC_ + SRPA_ + """ + return "Not Established" + + +def check_udp_states(state) -> Optional[str]: + pre = state.split("_")[0] + if "CON" in pre: + # UDP + return "Established" + elif "INT" in pre: + # UDP trying to connect, NOT preciselly not established but also + # NOT 'Established'. So we considered not established because there + # is no confirmation of what happened. + return "Not Established" + + +def check_icmp_states(state) -> Optional[str]: + pre = state.split("_")[0] + if "ECO" in pre: + # ICMP + return "Established" + elif "UNK" in pre: + # ICMP6 unknown upper layer + return "Established" + + +def get_final_state_from_flags(self, state, pkts) -> str: + """ + Analyze the flags given and return a summary of the state. 
+ Should work with Argus and Bro flags + We receive the pakets to distinguish some Reset connections + """ + try: + if state := check_suricata_states(state): + return state + if state := check_zeek_states(state): + return state + if state := check_argus_states(state): + return state + except IndexError: + # suf does not exist, which means that this is some ICMP or + # no response was sent for UDP or TCP + if state := check_icmp_states(state): + return state + if state := check_udp_states(state): + return state + if state := check_tcp_states(state, pkts): + return state + + return "Not Established" + + except Exception: + exception_line = sys.exc_info()[2].tb_lineno + self.print( + f"Error in get_final_state_from_flags() " f"line {exception_line}", + 0, + 1, + ) + self.print(traceback.format_exc(), 0, 1) From 3579edc92ec3832c3116a3180af419029cb89b66 Mon Sep 17 00:00:00 2001 From: alya Date: Tue, 30 Jul 2024 14:48:16 +0300 Subject: [PATCH 045/498] state_handler: refactor get_final_state_from_flags() --- slips_files/common/state_handler.py | 67 +++++++++++++---------------- 1 file changed, 29 insertions(+), 38 deletions(-) diff --git a/slips_files/common/state_handler.py b/slips_files/common/state_handler.py index b671a09a2..d0a05115b 100644 --- a/slips_files/common/state_handler.py +++ b/slips_files/common/state_handler.py @@ -1,9 +1,7 @@ from typing import Optional -import sys -import traceback -def check_suricata_states(state) -> Optional[str]: +def interpret_suricata_states(state) -> Optional[str]: """ There are different states in which a flow can be. Suricata distinguishes three flow-states for TCP and two for @@ -18,7 +16,7 @@ def check_suricata_states(state) -> Optional[str]: return "Not Established" -def check_zeek_states(state) -> Optional[str]: +def interpret_zeek_states(state) -> Optional[str]: # We have varius type of states depending on the type of flow. # For Zeek if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): @@ -27,9 +25,13 @@ def check_zeek_states(state) -> Optional[str]: return "Established" -def check_argus_states(state) -> Optional[str]: +def interpret_argus_states(state) -> Optional[str]: pre = state.split("_")[0] - suf = state.split("_")[1] + try: + suf = state.split("_")[1] + except IndexError: + return + if "S" in pre and "A" in pre and "S" in suf and "A" in suf: """ Examples: @@ -86,7 +88,7 @@ def check_argus_states(state) -> Optional[str]: return "Not Established" -def check_tcp_states(state, pkts) -> Optional[str]: +def interpret_tcp_states(state, pkts) -> Optional[str]: pre = state.split("_")[0] if "EST" in pre: # TCP @@ -122,7 +124,7 @@ def check_tcp_states(state, pkts) -> Optional[str]: return "Not Established" -def check_udp_states(state) -> Optional[str]: +def interpret_udp_states(state) -> Optional[str]: pre = state.split("_")[0] if "CON" in pre: # UDP @@ -134,7 +136,7 @@ def check_udp_states(state) -> Optional[str]: return "Not Established" -def check_icmp_states(state) -> Optional[str]: +def interpret_icmp_states(state) -> Optional[str]: pre = state.split("_")[0] if "ECO" in pre: # ICMP @@ -144,36 +146,25 @@ def check_icmp_states(state) -> Optional[str]: return "Established" -def get_final_state_from_flags(self, state, pkts) -> str: +def get_final_state_from_flags(state, pkts) -> str: """ - Analyze the flags given and return a summary of the state. 
- Should work with Argus and Bro flags - We receive the pakets to distinguish some Reset connections + Converts the original flags from the flow, to a state that slips + understands + Works with Argus, suricata, and Bro flags + We receive the packets to distinguish some Reset connections """ - try: - if state := check_suricata_states(state): - return state - if state := check_zeek_states(state): - return state - if state := check_argus_states(state): - return state - except IndexError: - # suf does not exist, which means that this is some ICMP or - # no response was sent for UDP or TCP - if state := check_icmp_states(state): - return state - if state := check_udp_states(state): - return state - if state := check_tcp_states(state, pkts): - return state - return "Not Established" + for interpreter in ( + interpret_suricata_states, + interpret_zeek_states, + interpret_argus_states, + interpret_icmp_states, + interpret_udp_states, + ): + if interpreted_state := interpreter(state): + return interpreted_state + + if interpreted_state := interpret_tcp_states(state, pkts): + return interpreted_state - except Exception: - exception_line = sys.exc_info()[2].tb_lineno - self.print( - f"Error in get_final_state_from_flags() " f"line {exception_line}", - 0, - 1, - ) - self.print(traceback.format_exc(), 0, 1) + return "Not Established" From 47d65ed1ef5545777e0aef73e13ba14dd231b51c Mon Sep 17 00:00:00 2001 From: alya Date: Tue, 30 Jul 2024 14:59:22 +0300 Subject: [PATCH 046/498] delete get_final_state_from_flags() from flowmldetection, profiler, and the db --- modules/flowmldetection/flowmldetection.py | 150 ++---------------- slips_files/core/database/database_manager.py | 3 - 2 files changed, 10 insertions(+), 143 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 16b67e903..3379f5077 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -121,141 +121,6 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) - - def get_final_state_from_flags(self, state, pkts): - """ - Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags - We receive the pakets to distinguish some Reset connections - """ - try: - pre = state.split("_")[0] - try: - # Try suricata states - """ - There are different states in which a flow can be. - Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, - these are: New, Established and Closed,for UDP only new and established. - For each of these states Suricata can employ different timeouts. - """ - if "new" in state or "established" in state: - return "Established" - elif "closed" in state: - return "Not Established" - - # We have varius type of states depending on the type of flow. 
From 47d65ed1ef5545777e0aef73e13ba14dd231b51c Mon Sep 17 00:00:00 2001
From: alya
Date: Tue, 30 Jul 2024 14:59:22 +0300
Subject: [PATCH 046/498] delete get_final_state_from_flags() from
 flowmldetection, profiler, and the db

---
 modules/flowmldetection/flowmldetection.py    | 150 ++----------------
 slips_files/core/database/database_manager.py |   3 -
 2 files changed, 10 insertions(+), 143 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 16b67e903..3379f5077 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -121,141 +121,6 @@ def train(self):
         except Exception:
             self.print("Error in train()", 0, 1)
             self.print(traceback.format_exc(), 0, 1)
-
-    def get_final_state_from_flags(self, state, pkts):
-        """
-        Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags
-        We receive the packets to distinguish some Reset connections
-        """
-        try:
-            pre = state.split("_")[0]
-            try:
-                # Try suricata states
-                """
-                There are different states in which a flow can be.
-                Suricata distinguishes three flow-states for TCP and two for UDP. For TCP,
-                these are: New, Established and Closed, for UDP only new and established.
-                For each of these states Suricata can employ different timeouts.
-                """
-                if "new" in state or "established" in state:
-                    return "Established"
-                elif "closed" in state:
-                    return "Not Established"
-
-                # We have various types of states depending on the type of flow.
-                # For Zeek
-                if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
-                    return "Not Established"
-                elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"):
-                    return "Established"
-
-                # For Argus
-                suf = state.split("_")[1]
-                if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
-                    """
-                    Examples:
-                    SA_SA
-                    SR_SA
-                    FSRA_SA
-                    SPA_SPA
-                    SRA_SPA
-                    FSA_FSA
-                    FSA_FSPA
-                    SAEC_SPA
-                    SRPA_SPA
-                    FSPA_SPA
-                    FSRPA_SPA
-                    FSPA_FSPA
-                    FSRA_FSPA
-                    SRAEC_SPA
-                    FSPA_FSRPA
-                    FSAEC_FSPA
-                    FSRPA_FSPA
-                    SRPAEC_SPA
-                    FSPAEC_FSPA
-                    SRPAEC_FSRPA
-                    """
-                    return "Established"
-                elif "PA" in pre and "PA" in suf:
-                    # Typical flow that was reported in the middle
-                    """
-                    Examples:
-                    PA_PA
-                    FPA_FPA
-                    """
-                    return "Established"
-                elif "ECO" in pre:
-                    return "ICMP Echo"
-                elif "ECR" in pre:
-                    return "ICMP Reply"
-                elif "URH" in pre:
-                    return "ICMP Host Unreachable"
-                elif "URP" in pre:
-                    return "ICMP Port Unreachable"
-                else:
-                    """
-                    Examples:
-                    S_RA
-                    S_R
-                    A_R
-                    S_SA
-                    SR_SA
-                    FA_FA
-                    SR_RA
-                    SEC_RA
-                    """
-                    return "Not Established"
-            except IndexError:
-                # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP
-                if "ECO" in pre:
-                    # ICMP
-                    return "Established"
-                elif "UNK" in pre:
-                    # ICMP6 unknown upper layer
-                    return "Established"
-                elif "CON" in pre:
-                    # UDP
-                    return "Established"
-                elif "INT" in pre:
-                    # UDP trying to connect, NOT precisely not established but also NOT 'Established'. So we considered not established because there
-                    # is no confirmation of what happened.
-                    return "Not Established"
-                elif "EST" in pre:
-                    # TCP
-                    return "Established"
-                elif "RST" in pre:
-                    # TCP. When -z B is not used in argus, states are single words. Most connections are reset when finished and therefore are established
-                    # It can happen that is reset being not established, but we can't tell without -z b.
-                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
-                    return (
-                        "Not Established" if int(pkts) <= 3 else "Established"
-                    )
-                elif "FIN" in pre:
-                    # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established
-                    # It can happen that is finished being not established, but we can't tell without -z b.
-                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
-                    return (
-                        "Not Established" if int(pkts) <= 3 else "Established"
-                    )
-                else:
-                    """
-                    Examples:
-                    S_
-                    FA_
-                    PA_
-                    FSA_
-                    SEC_
-                    SRPA_
-                    """
-                    return "Not Established"
-        except Exception:
-            exception_line = sys.exc_info()[2].tb_lineno
-            self.print(
-                f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}",
-                0,
-                1,
-            )
-            self.print(traceback.format_exc(), 0, 1)
 
     def process_features(self, dataset):
         """
@@ -291,12 +156,17 @@ def process_features(self, dataset):
             except (ValueError, KeyError):
                 pass
 
-        # When flows are read from Slips sqlite, the state is not transformed to 'Established' or 'Not Established', it is still 'S0' and others
+        # When flows are read from Slips sqlite,
+        # the state is not transformed to 'Established' or
+        # 'Not Established', it is still 'S0' and others
        # So transform here
-        #new_state_column = dataset.state.apply(self.get_final_state_from_flags(dataset.state, dataset.pkts))
-        dataset['state'] = dataset.apply(lambda row: self.get_final_state_from_flags(row['state'], row['pkts']), axis=1)
-
-        #dataset.state = new_state_column
+        dataset["state"] = dataset.apply(
+            lambda row: get_final_state_from_flags(
+                row["state"], row["pkts"]
+            ),
+            axis=1,
+        )
+        # dataset.state = new_state_column
 
         # Convert state to categorical
         dataset.state = dataset.state.str.replace(
diff --git a/slips_files/core/database/database_manager.py b/slips_files/core/database/database_manager.py
index e8ca3aaf6..b4b2128d3 100644
--- a/slips_files/core/database/database_manager.py
+++ b/slips_files/core/database/database_manager.py
@@ -613,9 +613,6 @@ def add_out_dns(self, *args, **kwargs):
     def add_port(self, *args, **kwargs):
         return self.rdb.add_port(*args, **kwargs)
 
-    def get_final_state_from_flags(self, *args, **kwargs):
-        return self.rdb.get_final_state_from_flags(*args, **kwargs)
-
     def add_ips(self, *args, **kwargs):
         return self.rdb.add_ips(*args, **kwargs)
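To make the new row-wise conversion in process_features() concrete, here is a minimal, self-contained sketch of the same pandas apply pattern on a toy DataFrame; the stand-in helper and the column values are invented for the example and are not from the patch:

    # Illustrative sketch of the apply(..., axis=1) conversion.
    import pandas as pd

    def get_final_state_from_flags(state, pkts):
        # stand-in for the real helper; only two Zeek states handled here
        return "Not Established" if state in ("S0", "REJ") else "Established"

    dataset = pd.DataFrame({"state": ["S0", "SF", "REJ"], "pkts": [1, 12, 2]})
    dataset["state"] = dataset.apply(
        lambda row: get_final_state_from_flags(row["state"], row["pkts"]),
        axis=1,
    )
    print(dataset.state.tolist())
    # ['Not Established', 'Established', 'Not Established']

axis=1 hands each row to the lambda, so the helper can combine the state with the packet count, which a plain Series.apply on the state column alone could not do.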
From 55ce0bbf1fdb8ae5ebeea066fd3efe07cab9a0b8 Mon Sep 17 00:00:00 2001
From: Sebas Garcia
Date: Mon, 29 Jul 2024 16:32:14 +0200
Subject: [PATCH 047/498] mlflow. Add a function to convert the state again

---
 modules/flowmldetection/flowmldetection.py | 135 +++++++++++++++++++++
 1 file changed, 135 insertions(+)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 3379f5077..f052931c8 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -121,6 +121,141 @@ def train(self):
         except Exception:
             self.print("Error in train()", 0, 1)
             self.print(traceback.format_exc(), 0, 1)
+
+    def get_final_state_from_flags(self, state, pkts):
+        """
+        Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags
+        We receive the packets to distinguish some Reset connections
+        """
+        try:
+            pre = state.split("_")[0]
+            try:
+                # Try suricata states
+                """
+                There are different states in which a flow can be.
+                Suricata distinguishes three flow-states for TCP and two for UDP. For TCP,
+                these are: New, Established and Closed, for UDP only new and established.
+                For each of these states Suricata can employ different timeouts.
+                """
+                if "new" in state or "established" in state:
+                    return "Established"
+                elif "closed" in state:
+                    return "Not Established"
+
+                # We have various types of states depending on the type of flow.
+                # For Zeek
+                if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
+                    return "Not Established"
+                elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"):
+                    return "Established"
+
+                # For Argus
+                suf = state.split("_")[1]
+                if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
+                    """
+                    Examples:
+                    SA_SA
+                    SR_SA
+                    FSRA_SA
+                    SPA_SPA
+                    SRA_SPA
+                    FSA_FSA
+                    FSA_FSPA
+                    SAEC_SPA
+                    SRPA_SPA
+                    FSPA_SPA
+                    FSRPA_SPA
+                    FSPA_FSPA
+                    FSRA_FSPA
+                    SRAEC_SPA
+                    FSPA_FSRPA
+                    FSAEC_FSPA
+                    FSRPA_FSPA
+                    SRPAEC_SPA
+                    FSPAEC_FSPA
+                    SRPAEC_FSRPA
+                    """
+                    return "Established"
+                elif "PA" in pre and "PA" in suf:
+                    # Typical flow that was reported in the middle
+                    """
+                    Examples:
+                    PA_PA
+                    FPA_FPA
+                    """
+                    return "Established"
+                elif "ECO" in pre:
+                    return "ICMP Echo"
+                elif "ECR" in pre:
+                    return "ICMP Reply"
+                elif "URH" in pre:
+                    return "ICMP Host Unreachable"
+                elif "URP" in pre:
+                    return "ICMP Port Unreachable"
+                else:
+                    """
+                    Examples:
+                    S_RA
+                    S_R
+                    A_R
+                    S_SA
+                    SR_SA
+                    FA_FA
+                    SR_RA
+                    SEC_RA
+                    """
+                    return "Not Established"
+            except IndexError:
+                # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP
+                if "ECO" in pre:
+                    # ICMP
+                    return "Established"
+                elif "UNK" in pre:
+                    # ICMP6 unknown upper layer
+                    return "Established"
+                elif "CON" in pre:
+                    # UDP
+                    return "Established"
+                elif "INT" in pre:
+                    # UDP trying to connect, NOT precisely not established but also NOT 'Established'. So we considered not established because there
+                    # is no confirmation of what happened.
+                    return "Not Established"
+                elif "EST" in pre:
+                    # TCP
+                    return "Established"
+                elif "RST" in pre:
+                    # TCP. When -z B is not used in argus, states are single words. Most connections are reset when finished and therefore are established
+                    # It can happen that is reset being not established, but we can't tell without -z b.
+                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
+                    return (
+                        "Not Established" if int(pkts) <= 3 else "Established"
+                    )
+                elif "FIN" in pre:
+                    # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established
+                    # It can happen that is finished being not established, but we can't tell without -z b.
+                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
+                    return (
+                        "Not Established" if int(pkts) <= 3 else "Established"
+                    )
+                else:
+                    """
+                    Examples:
+                    S_
+                    FA_
+                    PA_
+                    FSA_
+                    SEC_
+                    SRPA_
+                    """
+                    return "Not Established"
+        except Exception:
+            exception_line = sys.exc_info()[2].tb_lineno
+            self.print(
+                f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}",
+                0,
+                1,
+            )
+            self.print(traceback.format_exc(), 0, 1)
 
     def process_features(self, dataset):
         """
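The packet-count heuristic in the re-added function can be checked with a small worked example (illustrative only, not from any patch). A TCP handshake that never completes typically shows only the SYN plus its operating-system retransmissions, about three packets, while a connection that carried data and was then reset or finished shows many more:

    # Worked example of the <=3 packet heuristic for bare "RST"/"FIN" states.
    def classify_terminated_tcp(pkts: int) -> str:
        return "Not Established" if int(pkts) <= 3 else "Established"

    assert classify_terminated_tcp(2) == "Not Established"   # SYN retries only
    assert classify_terminated_tcp(40) == "Established"      # data flowed, then RST
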
From a4446a54dfcb2299392a2e3a59d0d755de693153 Mon Sep 17 00:00:00 2001
From: alya
Date: Tue, 30 Jul 2024 14:59:22 +0300
Subject: [PATCH 048/498] delete get_final_state_from_flags() from
 flowmldetection, profiler, and the db

---
 modules/flowmldetection/flowmldetection.py | 135 ---------------------
 1 file changed, 135 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index f052931c8..3379f5077 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -121,141 +121,6 @@ def train(self):
         except Exception:
             self.print("Error in train()", 0, 1)
             self.print(traceback.format_exc(), 0, 1)
-
-    def get_final_state_from_flags(self, state, pkts):
-        """
-        Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags
-        We receive the packets to distinguish some Reset connections
-        """
-        try:
-            pre = state.split("_")[0]
-            try:
-                # Try suricata states
-                """
-                There are different states in which a flow can be.
-                Suricata distinguishes three flow-states for TCP and two for UDP. For TCP,
-                these are: New, Established and Closed, for UDP only new and established.
-                For each of these states Suricata can employ different timeouts.
-                """
-                if "new" in state or "established" in state:
-                    return "Established"
-                elif "closed" in state:
-                    return "Not Established"
-
-                # We have various types of states depending on the type of flow.
-                # For Zeek
-                if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
-                    return "Not Established"
-                elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"):
-                    return "Established"
-
-                # For Argus
-                suf = state.split("_")[1]
-                if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
-                    """
-                    Examples:
-                    SA_SA
-                    SR_SA
-                    FSRA_SA
-                    SPA_SPA
-                    SRA_SPA
-                    FSA_FSA
-                    FSA_FSPA
-                    SAEC_SPA
-                    SRPA_SPA
-                    FSPA_SPA
-                    FSRPA_SPA
-                    FSPA_FSPA
-                    FSRA_FSPA
-                    SRAEC_SPA
-                    FSPA_FSRPA
-                    FSAEC_FSPA
-                    FSRPA_FSPA
-                    SRPAEC_SPA
-                    FSPAEC_FSPA
-                    SRPAEC_FSRPA
-                    """
-                    return "Established"
-                elif "PA" in pre and "PA" in suf:
-                    # Typical flow that was reported in the middle
-                    """
-                    Examples:
-                    PA_PA
-                    FPA_FPA
-                    """
-                    return "Established"
-                elif "ECO" in pre:
-                    return "ICMP Echo"
-                elif "ECR" in pre:
-                    return "ICMP Reply"
-                elif "URH" in pre:
-                    return "ICMP Host Unreachable"
-                elif "URP" in pre:
-                    return "ICMP Port Unreachable"
-                else:
-                    """
-                    Examples:
-                    S_RA
-                    S_R
-                    A_R
-                    S_SA
-                    SR_SA
-                    FA_FA
-                    SR_RA
-                    SEC_RA
-                    """
-                    return "Not Established"
-            except IndexError:
-                # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP
-                if "ECO" in pre:
-                    # ICMP
-                    return "Established"
-                elif "UNK" in pre:
-                    # ICMP6 unknown upper layer
-                    return "Established"
-                elif "CON" in pre:
-                    # UDP
-                    return "Established"
-                elif "INT" in pre:
-                    # UDP trying to connect, NOT precisely not established but also NOT 'Established'. So we considered not established because there
-                    # is no confirmation of what happened.
-                    return "Not Established"
-                elif "EST" in pre:
-                    # TCP
-                    return "Established"
-                elif "RST" in pre:
-                    # TCP. When -z B is not used in argus, states are single words. Most connections are reset when finished and therefore are established
-                    # It can happen that is reset being not established, but we can't tell without -z b.
-                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
-                    return (
-                        "Not Established" if int(pkts) <= 3 else "Established"
-                    )
-                elif "FIN" in pre:
-                    # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established
-                    # It can happen that is finished being not established, but we can't tell without -z b.
-                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
-                    return (
-                        "Not Established" if int(pkts) <= 3 else "Established"
-                    )
-                else:
-                    """
-                    Examples:
-                    S_
-                    FA_
-                    PA_
-                    FSA_
-                    SEC_
-                    SRPA_
-                    """
-                    return "Not Established"
-        except Exception:
-            exception_line = sys.exc_info()[2].tb_lineno
-            self.print(
-                f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}",
-                0,
-                1,
-            )
-            self.print(traceback.format_exc(), 0, 1)
 
     def process_features(self, dataset):
         """

From ee417b90a570747d57c2bffc75ec54f0c3e22c73 Mon Sep 17 00:00:00 2001
From: Sebas Garcia
Date: Sat, 15 Mar 2025 19:23:29 +0100
Subject: [PATCH 057/498] Re add function that alya added

---
 slips_files/core/database/database_manager.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/slips_files/core/database/database_manager.py b/slips_files/core/database/database_manager.py
index b4b2128d3..e8ca3aaf6 100644
--- a/slips_files/core/database/database_manager.py
+++ b/slips_files/core/database/database_manager.py
@@ -613,6 +613,9 @@ def add_out_dns(self, *args, **kwargs):
     def add_port(self, *args, **kwargs):
         return self.rdb.add_port(*args, **kwargs)
 
+    def get_final_state_from_flags(self, *args, **kwargs):
+        return self.rdb.get_final_state_from_flags(*args, **kwargs)
+
     def add_ips(self, *args, **kwargs):
         return self.rdb.add_ips(*args, **kwargs)
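The wrapper re-added above is a plain pass-through; a minimal sketch of the delegation pattern it follows, with names simplified and the stub class invented for the example:

    # Illustrative sketch: DBManager-style delegation to the redis backend.
    class FakeRdb:
        def get_final_state_from_flags(self, state, pkts):
            return "Not Established" if state == "S0" else "Established"

    class DBManager:
        def __init__(self, rdb):
            self.rdb = rdb

        def get_final_state_from_flags(self, *args, **kwargs):
            # pure forwarding; the actual logic lives in the rdb object
            return self.rdb.get_final_state_from_flags(*args, **kwargs)

    db = DBManager(FakeRdb())
    print(db.get_final_state_from_flags("S0", 1))  # Not Established

Keeping the method on the manager means callers only ever hold one database handle, even though the state logic itself lives in the redis profile handler.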
From 774e03dc5598ccd9627fd1e1aece3b9e883f38fa Mon Sep 17 00:00:00 2001
From: Sebas Garcia
Date: Sat, 15 Mar 2025 19:25:03 +0100
Subject: [PATCH 058/498] delete sys

---
 modules/flowmldetection/flowmldetection.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 3379f5077..c06755a59 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -10,7 +10,6 @@
 import json
 import traceback
 import warnings
-import sys
 
 from slips_files.common.parsers.config_parser import ConfigParser
 from slips_files.common.slips_utils import utils

From 6220c230c86e0cbfd8148829e684335cc62f2a8e Mon Sep 17 00:00:00 2001
From: Sebas Garcia
Date: Sat, 15 Mar 2025 19:27:23 +0100
Subject: [PATCH 059/498] Delete file that was deleted from develop

---
 slips_files/common/state_handler.py | 170 ----------------------------
 1 file changed, 170 deletions(-)
 delete mode 100644 slips_files/common/state_handler.py

diff --git a/slips_files/common/state_handler.py b/slips_files/common/state_handler.py
deleted file mode 100644
index d0a05115b..000000000
--- a/slips_files/common/state_handler.py
+++ /dev/null
@@ -1,170 +0,0 @@
-from typing import Optional
-
-
-def interpret_suricata_states(state) -> Optional[str]:
-    """
-    There are different states in which a flow can be.
-    Suricata distinguishes three flow-states for TCP and two for
-    UDP. For TCP,
-    these are: New, Established and Closed, for UDP only new and
-    established.
-    For each of these states Suricata can employ different timeouts.
-    """
-    if "new" in state or "established" in state:
-        return "Established"
-    elif "closed" in state:
-        return "Not Established"
-
-
-def interpret_zeek_states(state) -> Optional[str]:
-    # We have various types of states depending on the type of flow.
-    # For Zeek
-    if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
-        return "Not Established"
-    elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"):
-        return "Established"
-
-
-def interpret_argus_states(state) -> Optional[str]:
-    pre = state.split("_")[0]
-    try:
-        suf = state.split("_")[1]
-    except IndexError:
-        return
-
-    if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
-        """
-        Examples:
-        SA_SA
-        SR_SA
-        FSRA_SA
-        SPA_SPA
-        SRA_SPA
-        FSA_FSA
-        FSA_FSPA
-        SAEC_SPA
-        SRPA_SPA
-        FSPA_SPA
-        FSRPA_SPA
-        FSPA_FSPA
-        FSRA_FSPA
-        SRAEC_SPA
-        FSPA_FSRPA
-        FSAEC_FSPA
-        FSRPA_FSPA
-        SRPAEC_SPA
-        FSPAEC_FSPA
-        SRPAEC_FSRPA
-        """
-        return "Established"
-    elif "PA" in pre and "PA" in suf:
-        # Typical flow that was reported in the middle
-        """
-        Examples:
-        PA_PA
-        FPA_FPA
-        """
-        return "Established"
-    elif "ECO" in pre:
-        return "ICMP Echo"
-    elif "ECR" in pre:
-        return "ICMP Reply"
-    elif "URH" in pre:
-        return "ICMP Host Unreachable"
-    elif "URP" in pre:
-        return "ICMP Port Unreachable"
-    else:
-        """
-        Examples:
-        S_RA
-        S_R
-        A_R
-        S_SA
-        SR_SA
-        FA_FA
-        SR_RA
-        SEC_RA
-        """
-        return "Not Established"
-
-
-def interpret_tcp_states(state, pkts) -> Optional[str]:
-    pre = state.split("_")[0]
-    if "EST" in pre:
-        # TCP
-        return "Established"
-    elif "RST" in pre:
-        # TCP. When -z B is not used in argus, states are single words.
-        # Most connections are reset when finished and therefore are
-        # established
-        # It can happen that is reset being not established, but we
-        # can't tell without -z b.
-        # So we use as heuristic the amount of packets. If <=3, then is
-        # not established because the OS retries 3 times.
-        return "Not Established" if int(pkts) <= 3 else "Established"
-    elif "FIN" in pre:
-        # TCP. When -z B is not used in argus, states are single words.
-        # Most connections are finished with FIN when finished and
-        # therefore are established
-        # It can happen that is finished being not established, but we
-        # can't tell without -z b.
-        # So we use as heuristic the amount of packets. If <=3, then is
-        # not established because the OS retries 3 times.
-        return "Not Established" if int(pkts) <= 3 else "Established"
-    else:
-        """
-        Examples:
-        S_
-        FA_
-        PA_
-        FSA_
-        SEC_
-        SRPA_
-        """
-        return "Not Established"
-
-
-def interpret_udp_states(state) -> Optional[str]:
-    pre = state.split("_")[0]
-    if "CON" in pre:
-        # UDP
-        return "Established"
-    elif "INT" in pre:
-        # UDP trying to connect, NOT precisely not established but also
-        # NOT 'Established'. So we considered not established because there
-        # is no confirmation of what happened.
-        return "Not Established"
-
-
-def interpret_icmp_states(state) -> Optional[str]:
-    pre = state.split("_")[0]
-    if "ECO" in pre:
-        # ICMP
-        return "Established"
-    elif "UNK" in pre:
-        # ICMP6 unknown upper layer
-        return "Established"
-
-
-def get_final_state_from_flags(state, pkts) -> str:
-    """
-    Converts the original flags from the flow, to a state that slips
-    understands
-    Works with Argus, suricata, and Bro flags
-    We receive the packets to distinguish some Reset connections
-    """
-
-    for interpreter in (
-        interpret_suricata_states,
-        interpret_zeek_states,
-        interpret_argus_states,
-        interpret_icmp_states,
-        interpret_udp_states,
-    ):
-        if interpreted_state := interpreter(state):
-            return interpreted_state
-
-    if interpreted_state := interpret_tcp_states(state, pkts):
-        return interpreted_state
-
-    return "Not Established"

From 4d2dd99cbec81de085e35ce087ab8ac634908768 Mon Sep 17 00:00:00 2001
From: Sebas Garcia
Date: Sat, 15 Mar 2025 19:32:01 +0100
Subject: [PATCH 060/498] Flowmldetection. Fix missing db reference

---
 modules/flowmldetection/flowmldetection.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index c06755a59..87e07c759 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -160,7 +160,7 @@ def process_features(self, dataset):
         # 'Not Established', it is still 'S0' and others
         # So transform here
         dataset["state"] = dataset.apply(
-            lambda row: get_final_state_from_flags(
+            lambda row: self.db.get_final_state_from_flags(
                 row["state"], row["pkts"]
             ),
            axis=1,
        )
From f0e53cfc658c31e6046ed2cf4741819c89517576 Mon Sep 17 00:00:00 2001
From: Sebas Garcia
Date: Tue, 18 Mar 2025 12:08:08 +0100
Subject: [PATCH 061/498] Fix the training of flows with ML in new version

---
 modules/flowmldetection/flowmldetection.py | 144 +++++++++++----------
 1 file changed, 77 insertions(+), 67 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 87e07c759..e91495d64 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -55,8 +55,12 @@ def init(self):
         # Set the output queue of our database instance
         # Read the configuration
         self.read_configuration()
-        # Minimum amount of new labels needed to trigger the train
-        self.minimum_lables_to_retrain = 50
+        # Minimum amount of new labels needed to start the train
+        self.minimum_labels_to_start_train = 50
+        # Minimum amount of new labels needed to retrain
+        self.minimum_labels_to_retrain = 50
+        # The number of flows when last trained
+        self.last_number_of_flows_when_trained = 0
         # To plot the scores of training
         # self.scores = []
         # The scaler trained during training and to use during testing
@@ -67,26 +71,25 @@ def read_configuration(self):
         conf = ConfigParser()
         self.mode = conf.get_ml_mode()
+        self.label = conf.label()
 
     def train(self):
         """
         Train a model based on the flows we receive and the labels
         """
         try:
-            # Process the labels to have only Normal and Malware
-            self.flows.label = self.flows.label.str.replace(
-                r"(^.*ormal.*$)", "Normal", regex=True
-            )
-            self.flows.label = self.flows.label.str.replace(
-                r"(^.*alware.*$)", "Malware", regex=True
-            )
-            self.flows.label = self.flows.label.str.replace(
-                r"(^.*alicious.*$)", "Malware", regex=True
-            )
+            # Get the flows from the DB
+            # self.flows = self.db.get_all_flows_in_profileid_twid(self.profileid, self.twid)
+            # Convert to pandas df
+            # self.flows = pd.DataFrame(self.flows)
+            # Process the features
+            # X_flow = self.process_features(self.flows)
 
-            # Separate
-            y_flow = self.flows["label"]
+            # Create X_flow with the current flows minus the label
             X_flow = self.flows.drop("label", axis=1)
+            # Create y_flow with the label
+            y_flow = numpy.full(X_flow.shape[0], self.label)
+
+            # Drop the module_labels
             X_flow = X_flow.drop("module_labels", axis=1)
 
             # Normalize this batch of data so far. This can get progressively slow
@@ -95,7 +98,7 @@ def train(self):
             # Train
             try:
                 self.clf.partial_fit(
-                    X_flow, y_flow, classes=["Malware", "Normal"]
+                    X_flow, y_flow, classes=["Malicious", "Benign"]
                 )
             except Exception:
                 self.print("Error while calling clf.train()")
@@ -118,7 +121,7 @@ def train(self):
             self.store_model()
 
         except Exception:
-            self.print("Error in train()", 0, 1)
+            self.print("Error in train().", 0, 1)
             self.print(traceback.format_exc(), 0, 1)
 
     def process_features(self, dataset):
         """
@@ -144,9 +147,7 @@ def process_features(self, dataset):
             "history",
             "uid",
             "dir_",
-            "dbytes",
             "endtime",
-            "bytes",
             "flow_source",
         ]
         for field in to_drop:
         # So transform here
         dataset["state"] = dataset.apply(
             lambda row: self.db.get_final_state_from_flags(
-                row["state"], row["pkts"]
+                row["state"], (row["spkts"] + row["dpkts"])
             ),
             axis=1,
         )
         dataset.proto = dataset.proto.str.replace(
             r"(^.*arp.*$)", "4", regex=True
         )
+
+        dataset["allbytes"] = dataset["sbytes"] + dataset["dbytes"]
+        dataset["pkts"] = dataset["spkts"] + dataset["dpkts"]
+
+        fields_to_convert_to_float = [
             dataset.proto,
             dataset.dport,
             dataset.sport,
             dataset.dur,
             dataset.pkts,
             dataset.spkts,
             dataset.allbytes,
             dataset.sbytes,
             dataset.state,
         ]
-        for field in fields_to_convert_to_flow:
+        for field in fields_to_convert_to_float:
             try:
                 field = field.astype("float64")
-            except ValueError:
+            except (ValueError, AttributeError):
                 pass
 
         return dataset
 
-    def process_flows(self):
+    def process_training_flows(self):
         """
-        Process all the flows in the DB
+        Process all the flows in the DB
         Store the pandas df in self.flows
         """
         try:
             # that are fake but representative of a normal and malware flow
             # they are only for the training process
             # At least 1 flow of each label is required
-            # self.print(f'Amount of labeled flows: {labels}', 0, 1)
+
+            # These flows should be in the same format as the ones in the DB.
+            # Which means the state is still SF, S0, etc.
             flows.append(
                 {
                     "starttime": 1594417039.029793,
                     "dur": "1.9424750804901123",
                     "saddr": "10.7.10.101",
                     "sport": "49733",
                     "daddr": "40.70.224.145",
                     "dport": "443",
                     "proto": "tcp",
                     "state": "SF",
                     "spkts": 17,
                     "dpkts": 27,
                     "sbytes": 25517,
                     "dbytes": 17247,
                     "appproto": "ssl",
                     "label": "Malicious",
                     "module_labels": {
                         "flowalerts-long-connection": "Malicious"
                     },
                 }
             )
             flows.append(
                 {
                     "starttime": 1382355032.706468,
                     "dur": "10.896695",
                     "saddr": "147.32.83.52",
                     "sport": "47956",
                     "daddr": "80.242.138.72",
                     "dport": "80",
                     "proto": "tcp",
                     "state": "SF",
                     "spkts": 1,
                     "dpkts": 0,
                     "sbytes": 100,
                     "dbytes": 67596,
                     "appproto": "http",
                     "label": "Benign",
                     "module_labels": {
                         "flowalerts-long-connection": "Benign"
                     },
                 }
             )
@@ -318,7 +326,6 @@ def detect(self, x_flow) -> Optional[numpy.ndarray]:
         and returns the prediction array
         """
         try:
-            given_x_flow = x_flow
             # clean the flow
             fields_to_drop = [
                 "label",
                 "module_labels",
                 "uid",
                 "history",
                 "dir_",
-                "dbytes",
-                "dpkts",
                 "endtime",
-                "bytes",
                 "flow_source",
                 "ground_truth_label",  # todo now we can use them
                 "detailed_ground_truth_label",
@@ -345,7 +349,7 @@ def detect(self, x_flow) -> Optional[numpy.ndarray]:
             return pred
         except Exception as e:
             self.print(
                f"Error in detect() while processing " f"\n{x_flow}\n{e}"
             )
             self.print(traceback.format_exc(), 0, 1)
@@ -437,18 +441,16 @@ def pre_main(self):
 
     def main(self):
         if msg := self.get_msg("new_flow"):
+            # When a new flow arrives
             msg = json.loads(msg["data"])
-            twid = msg["twid"]
+            self.twid = msg["twid"]
+            self.profileid = msg["profileid"]
             self.flow = msg["flow"]
-            # these fields are expected in testing. update the original
+            # These following extra fields are expected in testing. update the original
             # flow dict to have them
             self.flow.update(
                 {
-                    "allbytes": (self.flow["sbytes"] + self.flow["dbytes"]),
-                    # the flow["state"] is the origstate, we don't need that here
-                    # we need the interpreted state
                     "state": msg["interpreted_state"],
-                    "pkts": self.flow["spkts"] + self.flow["dpkts"],
                     "label": msg["label"],
                     "module_labels": msg["module_labels"],
                 }
             )
             # Use labeled flows
             labels = self.db.get_labels()
             sum_labeled_flows = sum(i[1] for i in labels)
+
+            # The min labels to retrain is the min number of flows
+            # we should have seen so far in this capture to start training
+            # This is so we don't _start_ training with only 1 flow
+
+            # Once we are over the start minimum, the second condition is
+            # to force to retrain every a minimum_labels_to_retrain number
+            # of flows. So we don't retrain every 1 flow.
             if (
-                sum_labeled_flows >= self.minimum_lables_to_retrain
-                and sum_labeled_flows % self.minimum_lables_to_retrain == 1
+                sum_labeled_flows >= self.minimum_labels_to_start_train
             ):
-                # We get here every 'self.minimum_lables_to_retrain'
-                # amount of labels
-                # So for example we retrain every 100 labels and only when
-                # we have at least 100 labels
-                self.print(
-                    f"Training the model with the last group of "
-                    f"flows and labels. Total flows: {sum_labeled_flows}."
-                )
-                # Process all flows in the DB and make them ready
-                # for pandas
-                self.process_flows()
-                # Train an algorithm
-                self.train()
+                if (sum_labeled_flows - self.last_number_of_flows_when_trained >= self.minimum_labels_to_retrain):
+                    # So for example we retrain every 50 labels and only when
+                    # we have at least 50 labels
+                    self.print(
+                        f"Training the model with the last group of "
+                        f"flows and labels. Total flows: {sum_labeled_flows}."
+                    )
+                    # Process all flows in the DB and make them ready
+                    # for pandas
+                    self.process_training_flows()
+                    # Train an algorithm
+                    self.train()
+                    self.last_number_of_flows_when_trained = sum_labeled_flows
+
             elif self.mode == "test":
                 # We are testing, which means using the model to detect
                 processed_flow = self.process_flow(self.flow)
- ) - # Process all flows in the DB and make them ready - # for pandas - self.process_flows() - # Train an algorithm - self.train() + if (sum_labeled_flows - self.last_number_of_flows_when_trained >= self.minimum_labels_to_retrain): + # So for example we retrain every 50 labels and only when + # we have at least 50 labels + self.print( + f"Training the model with the last group of " + f"flows and labels. Total flows: {sum_labeled_flows}." + ) + # Process all flows in the DB and make them ready + # for pandas + self.process_training_flows() + # Train an algorithm + self.train() + self.last_number_of_flows_when_trained = sum_labeled_flows + elif self.mode == "test": # We are testing, which means using the model to detect processed_flow = self.process_flow(self.flow) @@ -497,8 +507,8 @@ def main(self): # and the label is diff from the prediction, # print in debug mode self.print( - f"Report Prediction {pred[0]} for label" - f' {label} flow {self.flow["saddr"]}:' + f"Predicted {pred[0]} for ground-truth label" + f' {label}. Flow {self.flow["saddr"]}:' f'{self.flow["sport"]} ->' f' {self.flow["daddr"]}:' f'{self.flow["dport"]}/' @@ -506,9 +516,9 @@ def main(self): 0, 3, ) - if pred[0] == "Malware": + if pred[0] == "Malicious": # Generate an alert - self.set_evidence_malicious_flow(self.flow, twid) + self.set_evidence_malicious_flow(self.flow, self.twid) self.print( f"Prediction {pred[0]} for label {label}" f' flow {self.flow["saddr"]}:' From b7e82cf985596d60b66ee7ac7d2a7052a0b986dc Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Tue, 18 Mar 2025 12:08:29 +0100 Subject: [PATCH 062/498] Fix the profiler handler for cases of nan in state --- .../core/database/redis_db/profile_handler.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/slips_files/core/database/redis_db/profile_handler.py b/slips_files/core/database/redis_db/profile_handler.py index 0489372cd..1ea764464 100644 --- a/slips_files/core/database/redis_db/profile_handler.py +++ b/slips_files/core/database/redis_db/profile_handler.py @@ -379,7 +379,12 @@ def get_final_state_from_flags(self, state, pkts): We receive the pakets to distinguish some Reset connections """ try: - pre = state.split("_")[0] + # In some flows the state is a nan + try: + pre = state.split("_")[0] + except AttributeError: + pre = '' + try: # Try suricata states """ @@ -401,7 +406,11 @@ def get_final_state_from_flags(self, state, pkts): return "Established" # For Argus - suf = state.split("_")[1] + # In some flows the state is a nan + try: + suf = state.split("_")[1] + except AttributeError: + suf = '' if "S" in pre and "A" in pre and "S" in suf and "A" in suf: """ Examples: @@ -502,7 +511,7 @@ def get_final_state_from_flags(self, state, pkts): except Exception: exception_line = sys.exc_info()[2].tb_lineno self.print( - f"Error in getFinalStateFromFlags() in database.py line {exception_line}", + f"Error in get_final_state_from_flags() in profile_handler.py line {exception_line}", 0, 1, ) From ccde23ede2ac27f38809ce5f1bf2e5518c1d73c1 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Wed, 19 Mar 2025 14:20:42 +0100 Subject: [PATCH 063/498] slips.yaml. Update to have correct labels. By default test. 
 Default training label is Benign

---
 config/slips.yaml | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/config/slips.yaml b/config/slips.yaml
index f7089b41a..8736eaf51 100644
--- a/config/slips.yaml
+++ b/config/slips.yaml
@@ -106,13 +106,12 @@ parameters:
   deletePrevdb: true

   # Set the label for all the flows that are being read.
-  # For now only normal and malware directly. No option for setting labels
-  # with a filter
+  # For now only Benign and Malicious (Capitalized)
   # The purpose is to be used in the training of ML models and to output
   # flows with labels for other tools.
-  # label: malicious
-  # label: unknown
-  label: normal
+  # label: Malicious
+  # label: Benign
+  label: Benign
   # If Zeek files are rotated or not to avoid running out of disk.
   # Zeek rotation is enabled by default when using an interface,
   # which means Slips will delete all Zeek log files after 1 day

From 667faa3f1bc572053f530e0e8b3e8ca40ef19976 Mon Sep 17 00:00:00 2001
From: Sebas Garcia
Date: Wed, 19 Mar 2025 14:21:21 +0100
Subject: [PATCH 064/498] First ipython to test ML flow related models

---
 modules/flowmldetection/flowmlanalysis.ipynb | 76 ++++++++++++++++++++
 1 file changed, 76 insertions(+)
 create mode 100644 modules/flowmldetection/flowmlanalysis.ipynb

diff --git a/modules/flowmldetection/flowmlanalysis.ipynb b/modules/flowmldetection/flowmlanalysis.ipynb
new file mode 100644
index 000000000..d726cd280
--- /dev/null
+++ b/modules/flowmldetection/flowmlanalysis.ipynb
@@ -0,0 +1,76 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Analysis of Flows with Machine Learning for Slips"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Analysis of a fixed list of flows to try techniques and find parameters"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import numpy\n",
+    "from sklearn.linear_model import SGDClassifier\n",
+    "from sklearn.preprocessing import StandardScaler\n",
+    "import pickle\n",
+    "import pandas as pd\n",
+    "import json\n",
+    "import traceback\n",
+    "import warnings"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "slips-new",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.12"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
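
The next patch guards process_features() against a corner case where protocol filtering empties the batch. A minimal, self-contained sketch of that guard, assuming only pandas and a "proto" column; the names mirror the module but this is illustrative, not the actual Slips code:

import pandas as pd

def process_features(dataset: pd.DataFrame) -> pd.DataFrame:
    # Discard flow types that have no ports, as the module does
    for proto in ("arp", "ARP", "icmp", "igmp", "ipv6-icmp"):
        dataset = dataset[dataset.proto != proto]
    if dataset.empty:
        # A single-flow batch whose proto was discarded leaves zero rows;
        # return early so no feature engineering runs on an empty frame
        return dataset
    # ... feature engineering would continue here ...
    return dataset

single_flow = pd.DataFrame([{"proto": "arp", "dur": 0.1}])
print(process_features(single_flow).empty)  # True
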
From ad1488054068bc9d5bc3b596f04248523ef42a83 Mon Sep 17 00:00:00 2001
From: Sebas Garcia
Date: Wed, 19 Mar 2025 14:22:38 +0100
Subject: [PATCH 065/498] flowml. If the dataset has one flow and that is
 deleted, then return empty fast.

---
 modules/flowmldetection/flowmldetection.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index e91495d64..58b4ce1e4 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -135,6 +135,11 @@ def process_features(self, dataset):
             for proto in to_discard:
                 dataset = dataset[dataset.proto != proto]

+            # If the proto is in the list to delete and there is only one flow, then the dataset will be empty
+            if dataset.empty:
+                # DataFrame is empty now, so return empty
+                return dataset
+
             # For now, discard these
             to_drop = [
                 "appproto",

From 02804ca94b80f7a24374b36ec073af55aa272c3c Mon Sep 17 00:00:00 2001
From: Sebas Garcia
Date: Wed, 19 Mar 2025 14:23:05 +0100
Subject: [PATCH 066/498] flowml. If the dataset is empty, return None

---
 modules/flowmldetection/flowmldetection.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 58b4ce1e4..4a4d46e37 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -318,6 +318,8 @@ def process_flow(self, flow_to_process: dict):
             # Convert the flow to a pandas dataframe
             raw_flow = pd.DataFrame(flow_to_process, index=[0])
             dflow = self.process_features(raw_flow)
+            if dflow.empty:
+                return None
             # Update the flow to the processed version
             return dflow
         except Exception:

From dea7702d8b5518fc4fc2d2fd5262e45c0ddec65d Mon Sep 17 00:00:00 2001
From: Sebas Garcia
Date: Wed, 19 Mar 2025 14:26:42 +0100
Subject: [PATCH 067/498] profile_handler. Small bug in how we handled the
 profiles: we were using 'in' instead of == for established. Some not
 established states MAY not have been correctly captured

---
 slips_files/core/database/redis_db/profile_handler.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/slips_files/core/database/redis_db/profile_handler.py b/slips_files/core/database/redis_db/profile_handler.py
index 1ea764464..85fdec5a6 100644
--- a/slips_files/core/database/redis_db/profile_handler.py
+++ b/slips_files/core/database/redis_db/profile_handler.py
@@ -393,9 +393,10 @@ def get_final_state_from_flags(self, state, pkts):
             these are: New, Established and Closed,for UDP only new and
             established.
             For each of these states Suricata can employ different timeouts.
             """
-            if "new" in state or "established" in state:
+            # This is controversial, but if we dont have a good state, we consider it not established for now
+            if "new" in state or state.lower() == "established":
                 return "Established"
-            elif "closed" in state:
+            elif "closed" in state or state.lower() == 'not established':
                 return "Not Established"

             # We have varius type of states depending on the type of flow.
@@ -406,7 +407,6 @@ def get_final_state_from_flags(self, state, pkts):
                 return "Established"

             # For Argus
-            # In some flows the state is a nan
            try:
                suf = state.split("_")[1]
            except AttributeError:
                suf = ''
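
The 'in' versus '==' fix in the patch above is easy to see in plain Python: "established" is a substring of "not established", so substring matching sends already-summarized negative states down the wrong branch. A quick standalone demonstration:

state = "not established"

# Old check: substring membership wrongly matches the negative state
print("established" in state)              # True

# New check: exact comparison takes the correct branch
print(state.lower() == "established")      # False
print(state.lower() == "not established")  # True
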
From f7f2eb3b80d90e0dc31d3fcfe7394d11650f84f4 Mon Sep 17 00:00:00 2001
From: Sebas Garcia
Date: Wed, 19 Mar 2025 14:27:16 +0100
Subject: [PATCH 068/498] First new version of the model and scaler. Not good
 yet, but working.

---
 modules/flowmldetection/model.bin  | Bin 1073 -> 1090 bytes
 modules/flowmldetection/scaler.bin | Bin 666 -> 890 bytes
 2 files changed, 0 insertions(+), 0 deletions(-)

diff --git a/modules/flowmldetection/model.bin b/modules/flowmldetection/model.bin
[GIT binary patch data (base85) omitted; the size change is recorded in the diffstat above]

diff --git a/modules/flowmldetection/scaler.bin b/modules/flowmldetection/scaler.bin
[GIT binary patch data (base85) omitted; the size change is recorded in the diffstat above]

From 81b103d0dd8ef69f3cadec4ff92e8e6bbe2c0027 Mon Sep 17 00:00:00 2001
From: Sebas Garcia
Date: Thu, 20 Mar 2025 13:16:06 +0100
Subject: [PATCH 069/498] model and scaler with 1 malicious and 1 benign

---
 modules/flowmldetection/model.bin  | Bin 1090 -> 1124 bytes
 modules/flowmldetection/scaler.bin | Bin 890 -> 890 bytes
 2 files changed, 0 insertions(+), 0 deletions(-)

diff --git a/modules/flowmldetection/model.bin b/modules/flowmldetection/model.bin
[GIT binary patch data (base85) omitted; the size change is recorded in the diffstat above]

diff --git a/modules/flowmldetection/scaler.bin b/modules/flowmldetection/scaler.bin
[GIT binary patch data (base85) omitted; the size change is recorded in the diffstat above]

From 10fee830a3fecf11002d3037e75d8c094d72b4c8 Mon Sep 17 00:00:00 2001
From: Sebas Garcia
Date: Thu, 20 Mar 2025 13:16:27 +0100
Subject: [PATCH 070/498] cleaner jupyter

---
 modules/flowmldetection/flowmldetection.py | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/modules/flowmldetection/flowmldetection.py
b/modules/flowmldetection/flowmldetection.py index 4a4d46e37..d8e9ada27 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -345,6 +345,23 @@ def detect(self, x_flow) -> Optional[numpy.ndarray]: "ground_truth_label", # todo now we can use them "detailed_ground_truth_label", ] + # For argus binetflows this fails because ther is a field calle bytes that was not in other flows. It should be called allbytes. + # Error + ''' [Flow ML Detection] Error in detect() while processing + dur proto sport dport state pkts spkts dpkts bytes sbytes dbytes allbytes + 0 63.822830 0 56119 981 0.0 15 15 0 8764 1887 0 1887 + The feature names should match those that were passed during fit. + Feature names unseen at fit time: + - bytes + ''' + + # IF we delete here the filed bytes the error is + # [Flow ML Detection] Error in detect() while processing + # dur proto sport dport state pkts spkts dpkts sbytes dbytes allbytes + # 0 63.822830 0 56120 980 0.0 15 15 0 1887 0 1887 + # The feature names should match those that were passed during fit. + # Feature names must be in the same order as they were in fit. + for field in fields_to_drop: try: x_flow = x_flow.drop(field, axis=1) From 10fee830a3fecf11002d3037e75d8c094d72b4c8 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Thu, 20 Mar 2025 22:26:27 +0100 Subject: [PATCH 071/498] New models after 3rd ttrain --- modules/flowmldetection/model.bin | Bin 1124 -> 1124 bytes modules/flowmldetection/scaler.bin | Bin 890 -> 890 bytes 2 files changed, 0 insertions(+), 0 deletions(-) diff --git a/modules/flowmldetection/model.bin b/modules/flowmldetection/model.bin index 0fac693b39f8e2f0e826471e72a52010709a2a4a..5ebcb37205a7a3e8d0a20ab78a1219f0ddf95dd8 100644 GIT binary patch delta 99 zcmaFD@q}YTFtfkevYaFSxkd-_3;rCozh!)2lYmpEbEdGvA?N`Qv**dmWkY#6EnbNU>V$pfHc5?(6z~90v14#>uEKZXcoY2J@~Q; zm@vT916v-#h9y9THMMtpw*Eia3Y9($*ABq#7l@Rev)(_(4lO-NtuB+I1CayN#ekDG F1TN2UERg^J diff --git a/modules/flowmldetection/scaler.bin b/modules/flowmldetection/scaler.bin index 758909b289238ff282b2e056a9b3e83768b8472a..821344a0c69d116622b02e2a0daa1554cb5d308e 100644 GIT binary patch delta 43 zcmV+`0M!5b2KolDfdU!c4zU>a=p#W80?-ZtN@qVmt(3Za=p#W80?-ZtN@qVmt(3Z Date: Wed, 26 Mar 2025 00:08:50 +0100 Subject: [PATCH 072/498] Models after 4th train --- modules/flowmldetection/model.bin | Bin 1124 -> 1124 bytes modules/flowmldetection/scaler.bin | Bin 890 -> 890 bytes 2 files changed, 0 insertions(+), 0 deletions(-) diff --git a/modules/flowmldetection/model.bin b/modules/flowmldetection/model.bin index 5ebcb37205a7a3e8d0a20ab78a1219f0ddf95dd8..3ab5a240bb45f88d026d1d9d1959cfa384e2473b 100644 GIT binary patch delta 120 zcmV-;0EhqN2;>N`Qv+ChN@}Q2`wPIHjIJ0X6&b*sXgPyxz!<>7-o1V`boIY<&cGv* zMA;T^cz6USY()1i3vc*(ICW>9REM~el3OL8yLVz+8BZ7+Y~^P as{<PghU~{@&OfB0!%%<|y61;y>t*IZ8K{*+AXCvU*qB z%0Pe2QM!F)5kR-EtllLL_CF>M2ApwSd_Z`~g8^q;cR(Gb+gDca!aq;!#7^woi9l`m z{4q!A7L!{7CRc2&pGU{qctGGXl*-2r&p+4h*(lZXw?Qu&=iylTwLyoc9%Oz?d_V=A za=mcAXh1m7B9yUmbwL&Oq6=*NZ9yf7dx}OPz(0XAuDu67ghArqkBr`9lats2C|9(o zHuM(pGeE6wqA`7j-#@s^-O-hk(?CdW8YGjz(m<`3N;vgKH9+Ch9fX%7EI^{)^od!9 ovOtBo|Dy%^u0ZgT2LsNy);|K#Ksj}DxjML6cVlBO2|D9RL6T delta 290 zcmV+-0p0%k2KolDvjGBo1(US_Pgl4iHtM!%U_hWRf%FVpXFo>fmcQ~I?m(e2&@mlG zqd?6RN*?ZoGC;4up#2u+D?s5YufU*hus|2)Nzwn}vOwP`V1tl2?mv8p{76u#!9bEa zjm~^LK$BYnCRg4Lu^9H~BS8=X&<+7gXFor!l)9lJXF)RTPqWf%b3r!tJEWTtnm`Ac zf^|*Gc0elJHxDoXoty4FCWD From cbe0718e114b9413874ab6ccccb42da441dee2c4 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Wed, 26 Mar 2025 08:28:59 +0100 Subject: 
[PATCH 073/498] Models of ml flow with the first good performance in small
 tests

---
 modules/flowmldetection/model.bin  | Bin 1124 -> 1124 bytes
 modules/flowmldetection/scaler.bin | Bin 890 -> 890 bytes
 2 files changed, 0 insertions(+), 0 deletions(-)

diff --git a/modules/flowmldetection/model.bin b/modules/flowmldetection/model.bin
[GIT binary patch data (base85) omitted; the size change is recorded in the diffstat above]

diff --git a/modules/flowmldetection/scaler.bin b/modules/flowmldetection/scaler.bin
[GIT binary patch data (base85) omitted; the size change is recorded in the diffstat above]

From d74032c06c327b368e035142c939570182d4571c Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Thu, 1 May 2025 07:43:50 +0000
Subject: [PATCH 074/498] build(deps): bump termcolor from 3.0.1 to 3.1.0 in
 /install

Bumps [termcolor](https://github.com/termcolor/termcolor) from 3.0.1 to 3.1.0.
- [Release notes](https://github.com/termcolor/termcolor/releases)
- [Changelog](https://github.com/termcolor/termcolor/blob/main/CHANGES.md)
- [Commits](https://github.com/termcolor/termcolor/compare/3.0.1...3.1.0)

---
updated-dependencies:
- dependency-name: termcolor
  dependency-version: 3.1.0
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
---
 install/requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/install/requirements.txt b/install/requirements.txt
index 040d72c2b..88d0ada48 100644
--- a/install/requirements.txt
+++ b/install/requirements.txt
@@ -29,7 +29,7 @@ pytest-dependency==0.6.0
 whois==1.20240129.2
 flask
 tldextract==5.3.0
-termcolor==3.0.1
+termcolor==3.1.0
 yappi==1.6.10
 pytest-sugar==1.0.0
 aid_hash

From 47d259cb44d935721d253ccd5ec92d68488b732c Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Thu, 1 May 2025 07:44:03 +0000
Subject: [PATCH 075/498] build(deps): bump validators from 0.34.0 to 0.35.0
 in /install

Bumps [validators](https://github.com/python-validators/validators) from 0.34.0 to 0.35.0.
- [Release notes](https://github.com/python-validators/validators/releases)
- [Changelog](https://github.com/python-validators/validators/blob/master/CHANGES.md)
- [Commits](https://github.com/python-validators/validators/compare/0.34.0...0.35.0)

---
updated-dependencies:
- dependency-name: validators
  dependency-version: 0.35.0
  dependency-type: direct:production
  update-type: version-update:semver-minor
...
Signed-off-by: dependabot[bot] --- install/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/install/requirements.txt b/install/requirements.txt index 040d72c2b..d5ecb3bb3 100644 --- a/install/requirements.txt +++ b/install/requirements.txt @@ -10,7 +10,7 @@ stix2==3.0.1 certifi==2025.4.26 tensorflow==2.16.1 Keras -validators==0.34.0 +validators==0.35.0 ipwhois==1.2.0 matplotlib==3.10.1 scikit_learn From 49db4ea471595c68ce27bc7e47a175b0d08dc0b8 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 2 May 2025 08:04:14 +0000 Subject: [PATCH 076/498] build(deps): bump ruff from 0.11.7 to 0.11.8 in /install Bumps [ruff](https://github.com/astral-sh/ruff) from 0.11.7 to 0.11.8. - [Release notes](https://github.com/astral-sh/ruff/releases) - [Changelog](https://github.com/astral-sh/ruff/blob/main/CHANGELOG.md) - [Commits](https://github.com/astral-sh/ruff/compare/0.11.7...0.11.8) --- updated-dependencies: - dependency-name: ruff dependency-version: 0.11.8 dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] --- install/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/install/requirements.txt b/install/requirements.txt index 040d72c2b..0fbbb581a 100644 --- a/install/requirements.txt +++ b/install/requirements.txt @@ -34,7 +34,7 @@ yappi==1.6.10 pytest-sugar==1.0.0 aid_hash black==24.10.0 -ruff==0.11.7 +ruff==0.11.8 pre-commit==4.0.1 coverage==7.8.0 netifaces==0.11.0 From 0d95d433c8f2dbabb22b9197edf4e677f0134691 Mon Sep 17 00:00:00 2001 From: alya Date: Fri, 2 May 2025 15:13:12 +0300 Subject: [PATCH 077/498] add an unblocker interface --- slips_files/common/abstracts/unblocker.py | 91 +++++++++++++++++++++++ slips_files/core/input.py | 8 +- 2 files changed, 97 insertions(+), 2 deletions(-) create mode 100644 slips_files/common/abstracts/unblocker.py diff --git a/slips_files/common/abstracts/unblocker.py b/slips_files/common/abstracts/unblocker.py new file mode 100644 index 000000000..a07255b52 --- /dev/null +++ b/slips_files/common/abstracts/unblocker.py @@ -0,0 +1,91 @@ +# SPDX-FileCopyrightText: 2021 Sebastian Garcia +# SPDX-License-Identifier: GPL-2.0-only +from abc import ABC, abstractmethod +import time +from datetime import datetime +from threading import Thread +from slips_files.core.database.database_manager import DBManager +from slips_files.core.structures.evidence import TimeWindow + + +class Unblocker(ABC): + """ + For every blocking method in slips, there should be an unblocker + implemented + """ + + @property + @abstractmethod + def name(self) -> str: + pass + + def __init__(self, db: DBManager): + self.db = db + self.checker = Thread( + target=self._check_if_time_to_unblock, + daemon=True, + name=f"{self.name}_unblocking_checker", + ) + self.requests = {} + + @abstractmethod + def _add_req(self, *args, **kwargs): + """Add an unblocking request to self.requests""" + + @abstractmethod + def _del_request(self, *args, **kwargs): + """Delete an unblocking request from self.requests""" + + @abstractmethod + def unblock_request(self, *args, **kwargs): + """ + Only public method. + Used by the blocking module to request an unblock + """ + + @abstractmethod + def _unblock(self, *args, **kwargs): + """ + Should contain the logic to unblock, throught the FW for + example. 
+ is called whenever a ts is reached in _check_if_time_to_unblock() + to do the actual unblocking + """ + + def _check_if_time_to_unblock(self): + """ + This method should be called in a thread that checks the timestamps + in self.requests regularly. + Each time a ts is reached, it should call _unblock() + """ + while True: + requests_to_del = [] + + now = datetime.now().replace(microsecond=0) + for ip, request in self.requests.items(): + ts = self.request["ts_to_unblock"] + if ts >= now: + if self._unblock(ip): + requests_to_del.append(ip) + + for ip in requests_to_del: + self._del_req(ip) + + time.sleep(1) # sleep 1 second between checks + + def _calc_unblock_time( + self, ip: str, cur_tw: TimeWindow, how_many_tws_to_block + ) -> TimeWindow: + """ + Calculates the timestamp to unblock. + It adds how_many_tws_to_block to the current time window and + returns the resulting timewindow + """ + # we unblock at the end of this tw + tw_to_unblock: int = cur_tw.number + how_many_tws_to_block + tw_start, tw_end = self.db.get_tw_limits( + f"profile_{ip}", f"timewindow{tw_to_unblock}" + ) + return TimeWindow( + number=tw_to_unblock, start_time=tw_start, end_time=tw_end + ) diff --git a/slips_files/core/input.py b/slips_files/core/input.py index 72ca8306c..b35a4abaa 100644 --- a/slips_files/core/input.py +++ b/slips_files/core/input.py @@ -83,7 +83,9 @@ def init( # create the remover thread self.remover_thread = threading.Thread( - target=self.remove_old_zeek_files, daemon=True + target=self.remove_old_zeek_files, + daemon=True, + name="input_remover_thread", ) self.open_file_handlers = {} self.c1 = self.db.subscribe("remove_old_files") @@ -91,7 +93,9 @@ def init( self.timeout = None # zeek rotated files to be deleted after a period of time self.to_be_deleted = [] - self.zeek_thread = threading.Thread(target=self.run_zeek, daemon=True) + self.zeek_thread = threading.Thread( + target=self.run_zeek, daemon=True, name="run_zeek_thread" + ) # used to give the profiler the total amount of flows to # read with the first flow only self.is_first_flow = True From b9baedeaaafea62a873179e22b05b0fb5a2901ad Mon Sep 17 00:00:00 2001 From: alya Date: Fri, 2 May 2025 15:34:17 +0300 Subject: [PATCH 078/498] move exec_iptables_command() to a separate file to be used by the blocker and the unblocker --- modules/blocking/blocking.py | 80 ++------------------------- modules/blocking/exec_iptables_cmd.py | 29 ++++++++++ 2 files changed, 35 insertions(+), 74 deletions(-) create mode 100644 modules/blocking/exec_iptables_cmd.py diff --git a/modules/blocking/blocking.py b/modules/blocking/blocking.py index dd44089d2..df20fb1e5 100644 --- a/modules/blocking/blocking.py +++ b/modules/blocking/blocking.py @@ -9,6 +9,8 @@ import subprocess import time +from .exec_iptables_cmd import exec_iptables_command + class Blocking(IModule): """Data should be passed to this module as a json encoded python dict, @@ -169,31 +171,6 @@ def initialize_chains_in_firewall(self): os.system(f"{self.sudo}nft add table inet slipsBlocking") # TODO: HANDLE NFT TABLE - def exec_iptables_command(self, action, ip_to_block, flag, options): - """ - Constructs the iptables rule/command based on the options sent in the message - flag options: - -s : to block traffic from source ip - -d : to block to destination ip - action options: - insert : to insert a new rule at the top of slipsBlocking list - delete : to delete an existing rule - """ - - command = ( - f"{self.sudo}iptables --{action} slipsBlocking {flag} {ip_to_block} " - f'-m comment --comment "Slips 
rule" >/dev/null 2>&1' - ) - # Add the options constructed in block_ip or unblock_ip to the iptables command - for key in options.keys(): - command += options[key] - command += " -j DROP" - # Execute - exit_status = os.system(command) - - # 0 is the success value - return exit_status == 0 - def is_ip_blocked(self, ip) -> bool: """Checks if ip is already blocked or not""" @@ -240,7 +217,8 @@ def block_ip( if from_: # Add rule to block traffic from source ip_to_block (-s) - blocked = self.exec_iptables_command( + blocked = exec_iptables_command( + self.sudo, action="insert", ip_to_block=ip_to_block, flag="-s", @@ -251,7 +229,8 @@ def block_ip( if to: # Add rule to block traffic to ip_to_block (-d) - blocked = self.exec_iptables_command( + blocked = exec_iptables_command( + self.sudo, action="insert", ip_to_block=ip_to_block, flag="-d", @@ -285,53 +264,6 @@ def block_ip( return False - def unblock_ip( - self, - ip_to_unblock, - from_=None, - to=None, - dport=None, - sport=None, - protocol=None, - ): - """Unblocks an ip based on the flags passed in the message""" - # This dictionary will be used to construct the rule - options = { - "protocol": f" -p {protocol}" if protocol else "", - "dport": f" --dport {dport}" if dport else "", - "sport": f" --sport {sport}" if sport else "", - } - # Set the default behaviour to unblock all traffic from and to an ip - if from_ is None and to is None: - from_, to = True, True - # Set the appropriate iptables flag to use in the command - # The module sending the message HAS TO specify either 'from_' or 'to' or both - # so that this function knows which rule to delete - # if both or none were specified we'll be executing 2 commands/deleting 2 rules - - # Block traffic from source ip - if from_: - unblocked = self.exec_iptables_command( - action="delete", - ip_to_block=ip_to_unblock, - flag="-s", - options=options, - ) - # Block traffic from distination ip - if to: - unblocked = self.exec_iptables_command( - action="delete", - ip_to_block=ip_to_unblock, - flag="-d", - options=options, - ) - - if unblocked: - # Successfully blocked an ip - self.print(f"Unblocked: {ip_to_unblock}") - return True - return False - def check_for_ips_to_unblock(self): unblocked_ips = set() # check if any ip needs to be unblocked diff --git a/modules/blocking/exec_iptables_cmd.py b/modules/blocking/exec_iptables_cmd.py new file mode 100644 index 000000000..2d5fe08df --- /dev/null +++ b/modules/blocking/exec_iptables_cmd.py @@ -0,0 +1,29 @@ +import os + + +def exec_iptables_command(sudo: str, action, ip_to_block, flag, options): + """ + Constructs the iptables rule/command based on the options sent + + flag options: + -s : to block traffic from source ip + -d : to block to destination ip + action options: + insert : to insert a new rule at the top of slipsBlocking list + delete : to delete an existing rule + """ + + command = ( + f"{sudo}iptables --{action} slipsBlocking {flag} {ip_to_block} " + f'-m comment --comment "Slips rule" >/dev/null 2>&1' + ) + # Add the options constructed in block_ip or unblock_ip to the + # iptables command + for key in options.keys(): + command += options[key] + command += " -j DROP" + # Execute + exit_status = os.system(command) + + # 0 is the success value + return exit_status == 0 From 02ce12008f8d03f7f403a730673a07e750c996e2 Mon Sep 17 00:00:00 2001 From: alya Date: Fri, 2 May 2025 15:40:44 +0300 Subject: [PATCH 079/498] unblocker interface: remove the implemnetation of _check_if_time_to_unblock() from the interface --- 
slips_files/common/abstracts/unblocker.py | 26 ++++------------------- 1 file changed, 4 insertions(+), 22 deletions(-) diff --git a/slips_files/common/abstracts/unblocker.py b/slips_files/common/abstracts/unblocker.py index a07255b52..b976ab55d 100644 --- a/slips_files/common/abstracts/unblocker.py +++ b/slips_files/common/abstracts/unblocker.py @@ -1,14 +1,12 @@ # SPDX-FileCopyrightText: 2021 Sebastian Garcia # SPDX-License-Identifier: GPL-2.0-only from abc import ABC, abstractmethod -import time -from datetime import datetime from threading import Thread from slips_files.core.database.database_manager import DBManager from slips_files.core.structures.evidence import TimeWindow -class Unblocker(ABC): +class IUnblocker(ABC): """ For every blocking method in slips, there should be an unblocker implemented @@ -52,26 +50,10 @@ def _unblock(self, *args, **kwargs): to do the actual unblocking """ + @abstractmethod def _check_if_time_to_unblock(self): - """ - This method should be called in a thread that checks the timestamps - in self.requests regularly. - Each time a ts is reached, it should call _unblock() - """ - while True: - requests_to_del = [] - - now = datetime.now().replace(microsecond=0) - for ip, request in self.requests.items(): - ts = self.request["ts_to_unblock"] - if ts >= now: - if self._unblock(ip): - requests_to_del.append(ip) - - for ip in requests_to_del: - self._del_req(ip) - - time.sleep(1) # sleep 1 second between checks + """a bg thread that unblocks ips once their ts is reached""" + ... def _calc_unblock_time( self, ip: str, cur_tw: TimeWindow, how_many_tws_to_block From 89be7277ead51cce64b5ffd5838ffb91bb55d69a Mon Sep 17 00:00:00 2001 From: alya Date: Fri, 2 May 2025 16:18:52 +0300 Subject: [PATCH 080/498] blocking: handle unblocking through the Unblocker() helper class --- modules/blocking/blocking.py | 177 +++++++++++++++-------------------- 1 file changed, 78 insertions(+), 99 deletions(-) diff --git a/modules/blocking/blocking.py b/modules/blocking/blocking.py index df20fb1e5..d1eeffd54 100644 --- a/modules/blocking/blocking.py +++ b/modules/blocking/blocking.py @@ -1,6 +1,5 @@ # SPDX-FileCopyrightText: 2021 Sebastian Garcia # SPDX-License-Identifier: GPL-2.0-only -from slips_files.common.abstracts.module import IModule import platform import sys import os @@ -9,7 +8,9 @@ import subprocess import time +from slips_files.common.abstracts.module import IModule from .exec_iptables_cmd import exec_iptables_command +from modules.blocking.unblocker import Unblocker class Blocking(IModule): @@ -26,21 +27,19 @@ def init(self): self.channels = { "new_blocking": self.c1, } - self.os = platform.system() - if self.os == "Darwin": + if platform.system() == "Darwin": self.print("Mac OS blocking is not supported yet.") sys.exit() + self.firewall = self.determine_linux_firewall() self.set_sudo_according_to_env() self.initialize_chains_in_firewall() - # this will keep track of ips that are blocked only for a specific time - # format {ip: (block_for(seconds), time_of_blocking(epoch))} - self.unblock_ips = {} - + self.unblocker = Unblocker(self.db) # self.test() def test(self): - """For debugging purposes, once we're done with the module we'll delete it""" + """For debugging purposes, once we're done with the module we'll + delete it""" if not self.is_ip_blocked("2.2.0.0"): blocking_data = { @@ -62,7 +61,8 @@ def test(self): # self.unblock_ip("2.2.0.0",True,True) def set_sudo_according_to_env(self): - """Check if running in host or in docker and sets sudo string accordingly. 
+ """ + Check if running in host or in docker and sets sudo string accordingly. There's no sudo in docker so we need to execute all commands without it """ # This env variable is defined in the Dockerfile @@ -72,13 +72,12 @@ def set_sudo_according_to_env(self): self.sudo = "" if self.running_in_docker else "sudo " def determine_linux_firewall(self): - """Returns the currently installed firewall and installs iptables if none was found""" + """Returns the currently installed firewall and installs iptables if + none was found""" if shutil.which("iptables"): # comes pre installed in docker return "iptables" - elif shutil.which("nftables"): - return "nftables" else: # no firewall installed # user doesn't have a firewall @@ -87,9 +86,10 @@ def determine_linux_firewall(self): ) sys.exit() - def delete_slipsBlocking_chain(self): + def delete_slips_blocking_chain(self): """Flushes and deletes everything in slipsBlocking chain""" - # check if slipsBlocking chain exists before flushing it and suppress stderr and stdout while checking + # check if slipsBlocking chain exists before flushing it and suppress + # stderr and stdout while checking # 0 means it exists chain_exists = ( os.system( @@ -98,82 +98,78 @@ def delete_slipsBlocking_chain(self): == 0 ) if self.firewall == "iptables" and chain_exists: - # Delete all references to slipsBlocking inserted in INPUT OUTPUT and FORWARD before deleting the chain - cmd = f"{self.sudo}iptables -D INPUT -j slipsBlocking >/dev/null 2>&1 ; {self.sudo}iptables -D OUTPUT -j slipsBlocking >/dev/null 2>&1 ; {self.sudo}iptables -D FORWARD -j slipsBlocking >/dev/null 2>&1" + # Delete all references to slipsBlocking inserted in INPUT OUTPUT + # and FORWARD before deleting the chain + cmd = ( + f"{self.sudo}iptables -D INPUT -j slipsBlocking " + f">/dev/null 2>&1 ; {self.sudo}iptables -D OUTPUT " + f"-j slipsBlocking >/dev/null 2>&1 ; " + f"{self.sudo}iptables -D FORWARD -j " + f"slipsBlocking >/dev/null 2>&1" + ) os.system(cmd) # flush and delete all the rules in slipsBlocking - cmd = f"{self.sudo}iptables -F slipsBlocking >/dev/null 2>&1 ; {self.sudo} iptables -X slipsBlocking >/dev/null 2>&1" + cmd = ( + f"{self.sudo}iptables -F slipsBlocking >/dev/null 2>&1 ; " + f"{self.sudo} iptables -X slipsBlocking >/dev/null 2>&1" + ) os.system(cmd) print("Successfully deleted slipsBlocking chain.") return True - elif self.firewall == "nftables": - # TODO: handle the creation of the slipsBlocking chain in nftables - # Flush rules in slipsBlocking chain because you can't delete a chain without flushing first - os.system(f"{self.sudo}nft flush chain inet slipsBlocking") - # Delete slipsBlocking chain from nftables - os.system(f"{self.sudo}nft delete chain inet slipsBlocking") - return True + return False def get_cmd_output(self, command): """Executes a command and returns the output""" - - # Execute command result = subprocess.run(command.split(), stdout=subprocess.PIPE) - # Get command output return result.stdout.decode("utf-8") def initialize_chains_in_firewall(self): """For linux: Adds a chain to iptables or a table to nftables called slipsBlocking where all the rules will reside""" - if self.firewall == "iptables": - # delete any pre existing slipsBlocking rules that may conflict before adding a new one - # self.delete_iptables_chain() - self.print('Executing "sudo iptables -N slipsBlocking"', 6, 0) - # Add a new chain to iptables - os.system(f"{self.sudo}iptables -N slipsBlocking >/dev/null 2>&1") + if self.firewall != "iptables": + return - # Check if we're already 
redirecting to slipsBlocking chain - INPUT_chain_rules = self.get_cmd_output( - f"{self.sudo} iptables -nvL INPUT" - ) - OUTPUT_chain_rules = self.get_cmd_output( - f"{self.sudo} iptables -nvL OUTPUT" + # delete any pre existing slipsBlocking rules that may conflict before + # adding a new one + # self.delete_iptables_chain() + self.print('Executing "sudo iptables -N slipsBlocking"', 6, 0) + # Add a new chain to iptables + os.system(f"{self.sudo}iptables -N slipsBlocking >/dev/null 2>&1") + + # Check if we're already redirecting to slipsBlocking chain + input_chain_rules = self.get_cmd_output( + f"{self.sudo} iptables -nvL INPUT" + ) + output_chain_rules = self.get_cmd_output( + f"{self.sudo} iptables -nvL OUTPUT" + ) + forward_chain_rules = self.get_cmd_output( + f"{self.sudo} iptables -nvL FORWARD" + ) + # Redirect the traffic from all other chains to slipsBlocking so rules + # in any pre-existing chains dont override it + # -I to insert slipsBlocking at the top of the INPUT, OUTPUT and + # FORWARD chains + if "slipsBlocking" not in input_chain_rules: + os.system( + self.sudo + + "iptables -I INPUT -j slipsBlocking >/dev/null 2>&1" ) - FORWARD_chain_rules = self.get_cmd_output( - f"{self.sudo} iptables -nvL FORWARD" + if "slipsBlocking" not in output_chain_rules: + os.system( + self.sudo + + "iptables -I OUTPUT -j slipsBlocking >/dev/null 2>&1" ) - # Redirect the traffic from all other chains to slipsBlocking so rules - # in any pre-existing chains dont override it - # -I to insert slipsBlocking at the top of the INPUT, OUTPUT and FORWARD chains - if "slipsBlocking" not in INPUT_chain_rules: - os.system( - self.sudo - + "iptables -I INPUT -j slipsBlocking >/dev/null 2>&1" - ) - if "slipsBlocking" not in OUTPUT_chain_rules: - os.system( - self.sudo - + "iptables -I OUTPUT -j slipsBlocking >/dev/null 2>&1" - ) - if "slipsBlocking" not in FORWARD_chain_rules: - os.system( - self.sudo - + "iptables -I FORWARD -j slipsBlocking >/dev/null 2>&1" - ) - - elif self.firewall == "nftables": - self.print( - 'Executing "sudo nft add table inet slipsBlocking"', 6, 0 + if "slipsBlocking" not in forward_chain_rules: + os.system( + self.sudo + + "iptables -I FORWARD -j slipsBlocking >/dev/null 2>&1" ) - # Add a new nft table that uses the inet family (ipv4,ipv6) - os.system(f"{self.sudo}nft add table inet slipsBlocking") - # TODO: HANDLE NFT TABLE def is_ip_blocked(self, ip) -> bool: """Checks if ip is already blocked or not""" - command = f"{self.sudo}iptables -L slipsBlocking -v -n" # Execute command result = subprocess.run(command.split(), stdout=subprocess.PIPE) @@ -214,7 +210,7 @@ def block_ip( "dport": f" --dport {str(dport)}" if dport is not None else "", "sport": f" --sport {str(sport)}" if sport is not None else "", } - + blocked = False if from_: # Add rule to block traffic from source ip_to_block (-s) blocked = exec_iptables_command( @@ -264,35 +260,6 @@ def block_ip( return False - def check_for_ips_to_unblock(self): - unblocked_ips = set() - # check if any ip needs to be unblocked - for ip, info in self.unblock_ips.items(): - # info is a dict with: - # 'block_for': block_for, - # 'time_of_blocking': time_of_blocking, - # 'blocking_details': { - # "from" : from_ , - # "to" : to, - # "dport" : dport, - # "sport" : sport, - # "protocol" : protocol}}} - if time.time() >= info["time_of_blocking"] + info["block_for"]: - blocking_details = info["blocking_details"] - self.unblock_ip( - ip, - blocking_details["from"], - blocking_details["to"], - blocking_details["dport"], - blocking_details["sport"], 
- blocking_details["protocol"], - ) - # make a list of unblocked IPs to remove from dict - unblocked_ips.add(ip) - - for ip in unblocked_ips: - self.unblock_ips.pop(ip) - def main(self): # There's an IP that needs to be blocked if msg := self.get_msg("new_blocking"): @@ -302,6 +269,7 @@ def main(self): # (notice you have to specify from,to,dport,sport,protocol or at least 2 of them when unblocking) # blocking_data = { # "ip" : "0.0.0.0" + # "tw" : 1 # "block" : True to block - False to unblock # "from" : True to block traffic from ip (default) - False does nothing # "to" : True to block traffic to ip (default) - False does nothing @@ -318,6 +286,7 @@ def main(self): data = json.loads(msg["data"]) # Parse the data dictionary ip = data.get("ip") + tw: int = data.get("tw") block = data.get("block") from_ = data.get("from") to = data.get("to") @@ -325,8 +294,18 @@ def main(self): sport = data.get("sport") protocol = data.get("protocol") block_for = data.get("block_for") + if block: self.block_ip(ip, from_, to, dport, sport, protocol, block_for) else: - self.unblock_ip(ip, from_, to, dport, sport, protocol) - self.check_for_ips_to_unblock() + how_many_tws_to_block = 1 + flags = { + "from_": from_, + "to": to, + "dport": dport, + "sport": sport, + "protocol": protocol, + } + self.unblocker.unblock_request( + ip, how_many_tws_to_block, tw, flags + ) From 96a9a8ed5b8095d1b2943db8a06b2ddbeb22a69e Mon Sep 17 00:00:00 2001 From: alya Date: Fri, 2 May 2025 16:19:24 +0300 Subject: [PATCH 081/498] evidence_handler.py: tell the blocking module which tw the ip is blocked in --- slips_files/core/evidence_handler.py | 10 ++++++++-- slips_files/core/helpers/checker.py | 2 +- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/slips_files/core/evidence_handler.py b/slips_files/core/evidence_handler.py index 8cb365fc4..890bfbdd0 100644 --- a/slips_files/core/evidence_handler.py +++ b/slips_files/core/evidence_handler.py @@ -47,6 +47,7 @@ Evidence, Victim, EvidenceType, + TimeWindow, ) from slips_files.core.structures.alerts import ( Alert, @@ -389,14 +390,18 @@ def handle_new_alert( if self.popup_alerts: self.show_popup(alert) - is_blocked: bool = self.decide_blocking(alert.profile.ip) + is_blocked: bool = self.decide_blocking( + alert.profile.ip, alert.timewindow + ) if is_blocked: self.db.mark_profile_and_timewindow_as_blocked( str(alert.profile), str(alert.timewindow) ) self.log_alert(alert, blocked=is_blocked) - def decide_blocking(self, ip_to_block: str) -> bool: + def decide_blocking( + self, ip_to_block: str, timewindow: TimeWindow + ) -> bool: """ Decide whether to block or not and send to the blocking module returns True if the given IP was blocked by Slips blocking module @@ -419,6 +424,7 @@ def decide_blocking(self, ip_to_block: str) -> bool: blocking_data = { "ip": ip_to_block, "block": True, + "tw": timewindow.number, } blocking_data = json.dumps(blocking_data) self.db.publish("new_blocking", blocking_data) diff --git a/slips_files/core/helpers/checker.py b/slips_files/core/helpers/checker.py index 3edc45699..1ed1a65f1 100644 --- a/slips_files/core/helpers/checker.py +++ b/slips_files/core/helpers/checker.py @@ -183,7 +183,7 @@ def delete_blocking_chain(self): blocking = Blocking(Queue()) blocking.start() - blocking.delete_slipsBlocking_chain() + blocking.delete_slips_blocking_chain() # kill the blocking module manually because we can't # run shutdown_gracefully here (not all modules has started) for child in active_children(): From 78b54939976eb4c4897ae0f64de0d1e7d7336b0a Mon Sep 17 
00:00:00 2001 From: alya Date: Fri, 2 May 2025 16:21:08 +0300 Subject: [PATCH 082/498] unblocker: add unblocking request handling logic --- modules/blocking/unblocker.py | 66 +++++++++++++++++++++++ slips_files/common/abstracts/unblocker.py | 4 +- 2 files changed, 68 insertions(+), 2 deletions(-) create mode 100644 modules/blocking/unblocker.py diff --git a/modules/blocking/unblocker.py b/modules/blocking/unblocker.py new file mode 100644 index 000000000..d1b59bf81 --- /dev/null +++ b/modules/blocking/unblocker.py @@ -0,0 +1,66 @@ +from threading import Lock +from typing import Dict +from slips_files.common.abstracts.unblocker import IUnblocker +from slips_files.core.structures.evidence import TimeWindow + + +class Unblocker(IUnblocker): + """ + For every blocking method in slips, there should be an unblocker + implemented + """ + + name = "iptables_unblocker" + + def __init__(self, db): + IUnblocker.__init__(self, db) + self.requests_lock = Lock() + self.requests = {} + + def unblock_request( + self, + ip: str, + how_many_tws_to_block: int, + current_tw: int, + flags: Dict[str, str], + ): + tw_to_unblock_at: TimeWindow = self._calc_unblock_time( + ip, current_tw, how_many_tws_to_block + ) + self._add_req(ip, tw_to_unblock_at, flags) + + def _check_if_time_to_unblock(self): + """ + This method should be called in a thread that checks the timestamps + in self.requests regularly. + Each time a ts is reached, it should call _unblock() + """ + ... + + def _add_req( + self, ip: str, tw_to_unblock_at: TimeWindow, flags: Dict[str, str] + ): + """ + Add an unblocking request to self.requests + :param ts_to_unblock: unix ts to unblock the given ip at + """ + with self.requests_lock: + self.requests[ip] = { + "tw_to_unblock": tw_to_unblock_at, + "flags": flags, + } + + def _del_request(self, ip): + """Delete an unblocking request from self.requests""" + if ip in self.requests: + with self.requests_lock: + del self.requests[ip] + + def _unblock( + self, + ip_to_unblock, + flags: Dict[str, str], + ): + """Unblocks an ip based on the given flags""" + + ... diff --git a/slips_files/common/abstracts/unblocker.py b/slips_files/common/abstracts/unblocker.py index b976ab55d..ae2922fd0 100644 --- a/slips_files/common/abstracts/unblocker.py +++ b/slips_files/common/abstracts/unblocker.py @@ -56,7 +56,7 @@ def _check_if_time_to_unblock(self): ... def _calc_unblock_time( - self, ip: str, cur_tw: TimeWindow, how_many_tws_to_block + self, ip: str, cur_tw: int, how_many_tws_to_block: int ) -> TimeWindow: """ Calculates the timestamp to unblock. 
@@ -64,7 +64,7 @@ def _calc_unblock_time(
         returns the resulting timewindow
         """
         # we unblock at the end of this tw
-        tw_to_unblock: int = cur_tw.number + how_many_tws_to_block
+        tw_to_unblock: int = cur_tw + how_many_tws_to_block
         tw_start, tw_end = self.db.get_tw_limits(
             f"profile_{ip}", f"timewindow{tw_to_unblock}"
         )

From 0bb8df1c2af516bd7565a2cc542283fc506550db Mon Sep 17 00:00:00 2001
From: alya
Date: Fri, 2 May 2025 16:22:11 +0300
Subject: [PATCH 083/498] unblocker: delete the blocking rule in slips chain
 once the ts for unblocking is reached

---
 modules/blocking/unblocker.py | 65 +++++++++++++++++++++++++++++++++--
 1 file changed, 63 insertions(+), 2 deletions(-)

diff --git a/modules/blocking/unblocker.py b/modules/blocking/unblocker.py
index d1b59bf81..35212259b 100644
--- a/modules/blocking/unblocker.py
+++ b/modules/blocking/unblocker.py
@@ -1,4 +1,5 @@
 from threading import Lock
+import time
 from typing import Dict
 from slips_files.common.abstracts.unblocker import IUnblocker
 from slips_files.core.structures.evidence import TimeWindow
@@ -35,7 +36,22 @@ def _check_if_time_to_unblock(self):
         in self.requests regularly.
         Each time a ts is reached, it should call _unblock()
         """
-        ...
+        while True:
+            now = time.time()
+            requests_to_del = []
+
+            for ip, request in self.requests.items():
+                ts: float = request["tw_to_unblock"].end_time
+                flags: Dict[str, str] = request["flags"]
+
+                if now >= ts:
+                    if self._unblock(ip, flags):
+                        requests_to_del.append(ip)
+
+            for ip in requests_to_del:
+                self._del_request(ip)
+
+            time.sleep(5)

     def _add_req(
         self, ip: str, tw_to_unblock_at: TimeWindow, flags: Dict[str, str]
@@ -63,4 +79,49 @@ def _unblock(
         ip_to_unblock,
         flags: Dict[str, str],
     ):
         """Unblocks an ip based on the given flags"""

-        ...
+        from_ = flags.get("from_")
+        to = flags.get("to")
+        dport = flags.get("dport")
+        sport = flags.get("sport")
+        protocol = flags.get("protocol")
+
+        # This dictionary will be used to construct the rule
+        options = {
+            "protocol": f" -p {protocol}" if protocol else "",
+            "dport": f" --dport {dport}" if dport else "",
+            "sport": f" --sport {sport}" if sport else "",
+        }
+        # Set the default behaviour to unblock all traffic from and to an ip
+        if from_ is None and to is None:
+            from_, to = True, True
+
+        # Set the appropriate iptables flag to use in the command
+        # The module sending the message HAS TO specify either
+        # 'from_' or 'to' or both
+        # so that this function knows which rule to delete
+        # if both or none were specified we'll be unblocking all traffic from
+        # and to the given ip
+        unblocked = False
+        # Delete the rule blocking traffic from the source ip
+        if from_:
+            unblocked = self.exec_iptables_command(
+                action="delete",
+                ip_to_block=ip_to_unblock,
+                flag="-s",
+                options=options,
+            )
+
+        # Delete the rule blocking traffic to the destination ip
+        if to:
+            unblocked = self.exec_iptables_command(
+                action="delete",
+                ip_to_block=ip_to_unblock,
+                flag="-d",
+                options=options,
+            )
+
+        if unblocked:
+            self.print(f"Unblocked: {ip_to_unblock}")
+            return True
+
+        return False
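
The checker thread added above follows a common deadline-polling pattern: keep pending requests in a dict, scan it periodically, and fire once a timestamp passes. A self-contained sketch of the same pattern, with invented names and a print() standing in for the iptables rule deletion:

import threading
import time

class UnblockScheduler:
    def __init__(self):
        self.requests = {}  # ip -> unix timestamp to unblock at
        self.lock = threading.Lock()
        threading.Thread(target=self._loop, daemon=True).start()

    def schedule(self, ip: str, ts: float):
        with self.lock:
            self.requests[ip] = ts

    def _loop(self):
        while True:
            now = time.time()
            with self.lock:
                # collect first, then delete, so the dict is not mutated
                # while being iterated
                due = [ip for ip, ts in self.requests.items() if now >= ts]
                for ip in due:
                    print(f"unblocking {ip}")  # the real code removes the rule here
                    del self.requests[ip]
            time.sleep(1)

scheduler = UnblockScheduler()
scheduler.schedule("10.0.0.5", time.time() + 2)
time.sleep(3)  # prints "unblocking 10.0.0.5" once the deadline passes
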
From cf6b939823f5d935a8afa647bb21c3d86d353aa9 Mon Sep 17 00:00:00 2001
From: Seba Garcia
Date: Sat, 3 May 2025 16:49:23 +0000
Subject: [PATCH 084/498] Add plot for flowml train scores

---
 modules/flowmldetection/plot_train_score.py | 56 +++++++++++++++++++++
 1 file changed, 56 insertions(+)
 create mode 100644 modules/flowmldetection/plot_train_score.py

diff --git a/modules/flowmldetection/plot_train_score.py b/modules/flowmldetection/plot_train_score.py
new file mode 100644
index 000000000..0b5b5b72b
--- /dev/null
+++ b/modules/flowmldetection/plot_train_score.py
@@ -0,0 +1,56 @@
+import pandas as pd
+import matplotlib.pyplot as plt
+import re
+import sys
+
+def plot_log_data(file_path):
+    # Read the log data from the file
+    with open(file_path, 'r') as file:
+        log_data = file.read()
+
+    # Define regex pattern to extract relevant data from each line
+    pattern = r"Background: (\d+). Benign: (\d+). Malicious: (\d+). Total labels: (\d+\.\d+). Score: (\d+\.\d+)"
+
+    # Parse the log file
+    data = re.findall(pattern, log_data)
+
+    # Convert data to a DataFrame
+    df = pd.DataFrame(data, columns=["Background", "Benign", "Malicious", "Total labels", "Score"])
+    df = df.astype({
+        "Background": int,
+        "Benign": int,
+        "Malicious": int,
+        "Total labels": float,
+        "Score": float
+    })
+
+    # Plotting the values
+    fig, ax1 = plt.subplots(figsize=(10, 6))
+
+    # Plotting Score on the left y-axis
+    ax1.plot(df.index, df["Score"], label="Score", color='tab:blue')
+    ax1.set_xlabel('Index')
+    ax1.set_ylabel('Score', color='tab:blue')
+    ax1.tick_params(axis='y', labelcolor='tab:blue')
+
+    # Create the second y-axis for the Total labels
+    ax2 = ax1.twinx()
+    ax2.plot(df.index, df["Total labels"], label="Total labels", color='tab:red')
+    ax2.set_ylabel('Total labels', color='tab:red')
+    ax2.tick_params(axis='y', labelcolor='tab:red')
+
+    # Adding title and legend
+    plt.title('Log Data Visualization')
+    fig.tight_layout()
+
+    # Save plot to a PNG file
+    plt.savefig('log_data_plot_with_two_scales.png')
+
+    # Display the plot
+    plt.show()
+
+# Make sure the file path is passed as an argument
+if len(sys.argv) < 2:
+    print("Please provide the path to the log file as a parameter.")
+else:
+    plot_log_data(sys.argv[1])
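
The script above expects training.log lines shaped like the message the logging patches below write; the counts in this sample are invented, and note the regex requires "Total labels" and "Score" to contain a decimal point:

    Training completed. Background: 120. Benign: 48. Malicious: 2. Total labels: 170.0. Score: 0.96

It is run with the log path as its only argument (paths illustrative):

    python modules/flowmldetection/plot_train_score.py modules/flowmldetection/training.log
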
+ """ + try: + with open(self.training_log_path, "a") as log_file: + log_file.write(message + "\n") + except Exception as e: + self.print(f"Error writing to training log: {e}", 0, 1) + + def train(self, sum_labeled_flows): """ Train a model based on the flows we receive and the labels """ From f817b6dcfb79c98ca770649447d788ad7bf0f50f Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 16:50:32 +0000 Subject: [PATCH 086/498] Store data in the log file of training --- modules/flowmldetection/flowmldetection.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index f9a303c1b..e97f4de53 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -137,9 +137,13 @@ def train(self, sum_labeled_flows): # Store the models on disk self.store_model() + # Log training information + self.write_to_training_log(f"Training completed. Background: {epoch_label_counts['Background']}. Benign: {epoch_label_counts['Benign']}. Malicious: {epoch_label_counts['Malicious']}. Total labels: {sum_labeled_flows}. Score: {score}") + #self.write_to_training_log(f"Model parameters: {self.clf.coef_}") except Exception: self.print("Error in train().", 0, 1) self.print(traceback.format_exc(), 0, 1) + self.write_to_training_log("Error occurred during training.") def process_features(self, dataset): """ From 656264d4ddf93b7f3588202b0f88394a5fae4ca4 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 16:50:53 +0000 Subject: [PATCH 087/498] better comments --- modules/flowmldetection/flowmldetection.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index e97f4de53..3aa030790 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -59,10 +59,9 @@ def init(self): self.minimum_labels_to_start_train = 50 # Minum amount of new labels needed to retrain self.minimum_labels_to_retrain = 50 - # The number of flows when last trained + # The number of flows when last trained. 
Used internally only to know
+        # when to retrain
         self.last_number_of_flows_when_trained = 0
-        # To plot the scores of training
-        # self.scores = []
         # The scaler trained during training and to use during testing
         self.scaler = StandardScaler()
         self.model_path = "./modules/flowmldetection/model.bin"

From e33862c2792b964556310abe33b938ca6864d9e1 Mon Sep 17 00:00:00 2001
From: Seba Garcia
Date: Sat, 3 May 2025 16:51:30 +0000
Subject: [PATCH 088/498] Fix issue not dropping detailed labels

---
 modules/flowmldetection/flowmldetection.py | 16 ++++++----------
 1 file changed, 6 insertions(+), 10 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 3aa030790..4b05c9b47 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -94,23 +94,19 @@ def train(self, sum_labeled_flows):
         Train a model based on the flows we receive and the labels
         """
         try:
-            # Get the flows from the DB
-            # self.flows = self.db.get_all_flows_in_profileid_twid(self.profileid, self.twid)
-            # Convert to pandas df
-            # self.flows = pd.DataFrame(self.flows)
-            # Process the features
-            # X_flow = self.process_features(self.flows)
-
             # Create X_flow with the current flows minus the label
-            X_flow = self.flows.drop("label", axis=1)
-            # Create y_flow with the label
-            y_flow = numpy.full(X_flow.shape[0], self.label)
+            X_flow = self.flows.drop("ground_truth_label", axis=1)
+            # Drop the detailed labels
+            X_flow = X_flow.drop("detailed_ground_truth_label", axis=1)
             # Drop the module_labels
             X_flow = X_flow.drop("module_labels", axis=1)
+            # Create y_flow with the label
+            y_flow = numpy.full(X_flow.shape[0], self.flows.ground_truth_label)
 
             # Normalize this batch of data so far. This can get progressivle slow
             X_flow = self.scaler.fit_transform(X_flow)
 
+            # Train
             try:

From ce583a878fa64066e593cb802aae70057db81122 Mon Sep 17 00:00:00 2001
From: Seba Garcia
Date: Sat, 3 May 2025 16:51:53 +0000
Subject: [PATCH 089/498] Fix issue that not all labels were given to the partial fit

---
 modules/flowmldetection/flowmldetection.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 4b05c9b47..f12bfaaa6 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -109,8 +109,9 @@ def train(self, sum_labeled_flows):
 
         # Train
         try:
+            # Online incremental learning
            self.clf.partial_fit(
-                X_flow, y_flow, classes=["Malicious", "Benign"]
+                X_flow, y_flow, classes=["Background", "Malicious", "Benign"]
            )
         except Exception:
             self.print("Error while calling clf.train()")
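
The widened class list matters because of how scikit-learn's online API works: the first call to partial_fit() fixes the set of classes, and a later batch containing a label that was never declared raises an error. A minimal hedged sketch with toy data (invented numbers, not Slips flows):

    # Toy illustration of incremental training; data is invented.
    import numpy as np
    from sklearn.linear_model import SGDClassifier

    clf = SGDClassifier(warm_start=True, loss="hinge", penalty="l1")

    X1 = np.array([[0.1, 1.0], [0.9, 0.2]])
    y1 = np.array(["Benign", "Malicious"])
    # First call: declare every label the stream may ever contain,
    # even if this batch does not include all of them
    clf.partial_fit(X1, y1, classes=["Background", "Malicious", "Benign"])

    X2 = np.array([[0.5, 0.5]])
    y2 = np.array(["Background"])  # first seen here, but already declared
    clf.partial_fit(X2, y2)  # later calls reuse the declared classes

From 8ec673f039e86599b1260e3a97d7658c0aa81ac5 Mon Sep 17 00:00:00 2001
From: Seba Garcia
Date: Sat, 3 May 2025 16:52:08 +0000
Subject: [PATCH 090/498] count partial labels in this epoch

---
 modules/flowmldetection/flowmldetection.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index f12bfaaa6..0fffda271 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -106,6 +106,12 @@ def train(self, sum_labeled_flows):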
         # Normalize this batch of data so far. This can get progressivle slow
         X_flow = self.scaler.fit_transform(X_flow)
 
+        # Count the number of labels of each type in this epoch
+        epoch_label_counts = {
+            "Background": (y_flow == "Background").sum(),
+            "Malicious": (y_flow == "Malicious").sum(),
+            "Benign": (y_flow == "Benign").sum(),
+        }
 
         # Train
         try:

From d4b39eea28c5ff30c1a5ee10ec7c3e874cbaa5bf Mon Sep 17 00:00:00 2001
From: Seba Garcia
Date: Sat, 3 May 2025 16:55:09 +0000
Subject: [PATCH 091/498] Don't print training in screen

---
 modules/flowmldetection/flowmldetection.py | 11 ++---------
 1 file changed, 2 insertions(+), 9 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 0fffda271..f374c2926 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -126,15 +126,8 @@ def train(self, sum_labeled_flows):
 
         # See score so far in training
         score = self.clf.score(X_flow, y_flow)
-        # To debug the training score
-        # self.scores.append(score)
-
-        self.print(f"	Training Score: {score}", 0, 1)
-        # self.print(f'	Model Parameters: {self.clf.coef_}')
-
-        # Debug code to store a plot in a png of the scores
-        # plt.plot(self.scores)
-        # plt.savefig('train-scores.png')
+        #self.print(f"	Training Score: {score}", 1, 0)
+        #self.print(f'	Model Parameters: {self.clf.coef_}', 1, 0)
 
         # Store the models on disk
         self.store_model()

From a2d50c96523ce3f3e344e813286b0653854007cf Mon Sep 17 00:00:00 2001
From: Seba Garcia
Date: Sat, 3 May 2025 16:55:28 +0000
Subject: [PATCH 092/498] Add function to write to train log

---
 modules/flowmldetection/flowmldetection.py | 28 +++++++++++-----------
 1 file changed, 14 insertions(+), 14 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index f374c2926..679e7c0cc 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -247,28 +247,28 @@ def process_features(self, dataset):
             self.print("Error in process_features()")
             self.print(traceback.format_exc(), 0, 1)
 
-    def process_training_flows(self):
+    def process_training_flows(self, last_number_of_flows_when_trained):
         """
-        Process all the flows in the DB
+        Process only the new flows in the DB since the last training.
         Store the pandas df in self.flows
         """
         try:
+            # Ensure the index is an integer
+            if last_number_of_flows_when_trained is None:
+                last_number_of_flows_when_trained = 0
+            else:
+                last_number_of_flows_when_trained = int(last_number_of_flows_when_trained)
+
             # We get all the flows so far
-            # because this retraining happens in batches
             flows = self.db.get_all_flows()
-            # Check how many different labels are in the DB
-            # We need both normal and malware
+            # Only process new flows since last training
+            new_flows = flows[last_number_of_flows_when_trained:]
+
+            # Check how many **different** labels are in the DB
             labels = self.db.get_labels()
             if len(labels) == 1:
-                # Only 1 label has flows
-                # There are not enough different labels, so insert two flows
-                # that are fake but representative of a normal and malware flow
-                # they are only for the training process
-                # At least 1 flow of each label is required
-
-                # These flows should be in the same format as the ones in the DB.
-                # Which means the satate is still SF, S0, etc. 
- flows.append( + # Insert fake flows for both classes if needed + new_flows.append( { "starttime": 1594417039.029793, "dur": "1.9424750804901123", From 7e6325dab56e081fbb88ec996572fa4bea30e464 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 16:57:27 +0000 Subject: [PATCH 093/498] Fix label in dummy flow --- modules/flowmldetection/flowmldetection.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 679e7c0cc..95c9b82a7 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -283,13 +283,13 @@ def process_training_flows(self, last_number_of_flows_when_trained): "sbytes": 25517, "dbytes": 17247, "appproto": "ssl", - "label": "Malicious", + "ground_truth_label": "Malicious", "module_labels": { "flowalerts-long-connection": "Malicious" }, } ) - flows.append( + new_flows.append( { "starttime": 1382355032.706468, "dur": "10.896695", From 683d7c17e081820b4df383742c7d481442801188 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 16:57:39 +0000 Subject: [PATCH 094/498] Fix dummy flow --- modules/flowmldetection/flowmldetection.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 95c9b82a7..5ea48fbc4 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -304,7 +304,7 @@ def process_training_flows(self, last_number_of_flows_when_trained): "sbytes": 100, "dbytes": 67596, "appproto": "http", - "label": "Benign", + "ground_truth_label": "Benign", "module_labels": { "flowalerts-long-connection": "Benign" }, From 26a1482c18bdc02ed46b815b60ba720200fafa8e Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 16:58:28 +0000 Subject: [PATCH 095/498] Rename variable --- modules/flowmldetection/flowmldetection.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 5ea48fbc4..ff68b8a27 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -310,10 +310,9 @@ def process_training_flows(self, last_number_of_flows_when_trained): }, } ) - # If there are enough flows, we dont insert them anymore # Convert to pandas df - df_flows = pd.DataFrame(flows) + df_flows = pd.DataFrame(new_flows) # Process features df_flows = self.process_features(df_flows) @@ -321,7 +320,6 @@ def process_training_flows(self, last_number_of_flows_when_trained): # Update the flow to the processed version self.flows = df_flows except Exception: - # Stop the timer self.print("Error in process_flows()") self.print(traceback.format_exc(), 0, 1) From 34b754a257ce29bc1abe83c883dcd9b6a4076e35 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 17:00:32 +0000 Subject: [PATCH 096/498] Fix dummy flow label --- modules/flowmldetection/flowmldetection.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index ff68b8a27..6b41b4029 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -356,7 +356,7 @@ def detect(self, x_flow) -> Optional[numpy.ndarray]: "dir_", "endtime", "flow_source", - "ground_truth_label", # todo now we can use them + "ground_truth_label", 
"detailed_ground_truth_label", ] # For argus binetflows this fails because ther is a field calle bytes that was not in other flows. It should be called allbytes. From 54f958d42542ed7041fbe43584deb422ac46c591 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 17:00:47 +0000 Subject: [PATCH 097/498] Pass values to train function --- modules/flowmldetection/flowmldetection.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 6b41b4029..4d66aab85 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -521,9 +521,9 @@ def main(self): ) # Process all flows in the DB and make them ready # for pandas - self.process_training_flows() + self.process_training_flows(self.last_number_of_flows_when_trained) # Train an algorithm - self.train() + self.train(sum_labeled_flows) self.last_number_of_flows_when_trained = sum_labeled_flows elif self.mode == "test": From a9236e6297c888d029bab09ffecb7270c0c9914a Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 17:01:47 +0000 Subject: [PATCH 098/498] import os --- modules/flowmldetection/flowmldetection.py | 1 + 1 file changed, 1 insertion(+) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 4d66aab85..766178e12 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -10,6 +10,7 @@ import json import traceback import warnings +import os from slips_files.common.parsers.config_parser import ConfigParser from slips_files.common.slips_utils import utils From 3fe1eaf3d4d4a7446dfee38749eb7349254e38ae Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 17:02:15 +0000 Subject: [PATCH 099/498] Get issue of total flows zero --- slips_files/core/database/database_manager.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/slips_files/core/database/database_manager.py b/slips_files/core/database/database_manager.py index e8ca3aaf6..892b923b4 100644 --- a/slips_files/core/database/database_manager.py +++ b/slips_files/core/database/database_manager.py @@ -661,7 +661,8 @@ def add_software_to_profile(self, *args, **kwargs): return self.rdb.add_software_to_profile(*args, **kwargs) def get_total_flows(self, *args, **kwargs): - return int(self.rdb.get_total_flows(*args, **kwargs)) + total_flows = self.rdb.get_total_flows(*args, **kwargs) + return int(total_flows) if total_flows is not None else 0 def increment_processed_flows(self, *args, **kwargs): return self.rdb.increment_processed_flows(*args, **kwargs) From 73a19e5a500dd615b7e991733a471a3e9ec9aa6c Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 17:02:32 +0000 Subject: [PATCH 100/498] Add comments --- slips_files/core/database/database_manager.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/slips_files/core/database/database_manager.py b/slips_files/core/database/database_manager.py index 892b923b4..6dd1d9952 100644 --- a/slips_files/core/database/database_manager.py +++ b/slips_files/core/database/database_manager.py @@ -879,7 +879,10 @@ def get_flow(self, *args, **kwargs): """returns the raw flow as read from the log file""" return self.sqlite.get_flow(*args, **kwargs) - def add_flow(self, flow, profileid: str, twid: str, label="benign"): + def add_flow(self, flow, profileid: str, twid: str, label="Benign"): + """ + Just in case, by default if there are 
no labels in the flow, we consider it Benign + """ # stores it in the db self.sqlite.add_flow(flow, profileid, twid, label=label) # handles the channels and labels etc. From 3e3443af1b2ec4c6d91acf4ed69c76a6928696b7 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 17:02:51 +0000 Subject: [PATCH 101/498] Rename var name to be more clear --- slips_files/core/profiler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/slips_files/core/profiler.py b/slips_files/core/profiler.py index a05557b9f..c0a426189 100644 --- a/slips_files/core/profiler.py +++ b/slips_files/core/profiler.py @@ -119,7 +119,7 @@ def read_configuration(self): self.local_whitelist_path = conf.local_whitelist_path() self.timeformat = conf.ts_format() self.analysis_direction = conf.analysis_direction() - self.label = conf.label() + self.configuration_label = conf.label() self.width = conf.get_tw_width_as_float() self.client_ips: List[ Union[IPv4Network, IPv6Network, IPv4Address, IPv6Address] From 4c3c3149d67b1dcf2d573a4879fcaab0078f971f Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 17:03:10 +0000 Subject: [PATCH 102/498] Rename var name --- slips_files/core/profiler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/slips_files/core/profiler.py b/slips_files/core/profiler.py index c0a426189..42bf3355e 100644 --- a/slips_files/core/profiler.py +++ b/slips_files/core/profiler.py @@ -377,7 +377,7 @@ def store_features_going_in(self, profileid: str, twid: str, flow): flow, profileid=profileid, twid=twid, - label=self.label, + label=self.configuration_label, ) self.db.mark_profile_tw_as_modified(profileid, twid, "") From 18b7544ce9c6554b1bf95c4d7d19458df01f4105 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 17:03:31 +0000 Subject: [PATCH 103/498] Fix processeed flows being zero --- slips/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/slips/main.py b/slips/main.py index bd7890f5b..d960ce318 100644 --- a/slips/main.py +++ b/slips/main.py @@ -414,7 +414,7 @@ def get_analyzed_flows_percentage(self) -> str: self.total_flows = self.db.get_total_flows() flows_percentage = int( - (self.db.get_processed_flows_so_far() / self.total_flows) * 100 + (self.db.get_processed_flows_so_far() / self.total_flows) * 100 if self.total_flows != 0 else 0 ) return f"Analyzed Flows: {green(flows_percentage)}{green('%')}. 
" From c221fe75a1a8027f86a35e8080165d37dde8da97 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 17:03:53 +0000 Subject: [PATCH 104/498] Delete old comments --- modules/flowmldetection/flowmldetection.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 766178e12..6c3bfc127 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -28,10 +28,6 @@ Method, ) -# Only for debbuging -# from matplotlib import pyplot as plt - - # This horrible hack is only to stop sklearn from printing those warnings def warn(*args, **kwargs): pass From 320e0fedf1ebed269a1c369e6716bb1440a94eca Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 17:13:22 +0000 Subject: [PATCH 105/498] Fix plots --- modules/flowmldetection/plot_train_score.py | 48 ++++++++++++++++----- 1 file changed, 38 insertions(+), 10 deletions(-) diff --git a/modules/flowmldetection/plot_train_score.py b/modules/flowmldetection/plot_train_score.py index 0b5b5b72b..359df04ef 100644 --- a/modules/flowmldetection/plot_train_score.py +++ b/modules/flowmldetection/plot_train_score.py @@ -2,6 +2,8 @@ import matplotlib.pyplot as plt import re import sys +import argparse +import os def plot_log_data(file_path): # Read the log data from the file @@ -24,33 +26,59 @@ def plot_log_data(file_path): "Score": float }) + # Get the directory of the log file to store the plot in the same folder + dir_name = os.path.dirname(file_path) + plot_file = os.path.join(dir_name, 'log_data_plot_with_two_scales.png') + # Plotting the values fig, ax1 = plt.subplots(figsize=(10, 6)) - # Plotting Score on the left y-axis + # Plotting Score on the left y-axis (with proper scaling from 0 to 1) ax1.plot(df.index, df["Score"], label="Score", color='tab:blue') ax1.set_xlabel('Index') ax1.set_ylabel('Score', color='tab:blue') + ax1.set_ylim(0, 1) # Set y-axis for Score from 0 to 1 ax1.tick_params(axis='y', labelcolor='tab:blue') - # Create the second y-axis for the Total labels + # Create the second y-axis for the Background, Benign, Malicious, Total labels ax2 = ax1.twinx() + ax2.plot(df.index, df["Background"], label="Background", color='tab:green', linestyle='--') + ax2.plot(df.index, df["Benign"], label="Benign", color='tab:orange', linestyle='--') + ax2.plot(df.index, df["Malicious"], label="Malicious", color='tab:pink', linestyle='--') ax2.plot(df.index, df["Total labels"], label="Total labels", color='tab:red') - ax2.set_ylabel('Total labels', color='tab:red') + ax2.set_ylabel('Background, Benign, Malicious, Total labels', color='tab:red') + + # Set appropriate scale for right y-axis based on the data + ax2.set_ylim(0, df[["Background", "Benign", "Malicious", "Total labels"]].max().max()) ax2.tick_params(axis='y', labelcolor='tab:red') # Adding title and legend plt.title('Log Data Visualization') fig.tight_layout() - # Save plot to a PNG file - plt.savefig('log_data_plot_with_two_scales.png') + # Adding the legend with increased space for readability + ax1.legend(loc='upper left', bbox_to_anchor=(1, 1), fontsize='small') + ax2.legend(loc='upper left', bbox_to_anchor=(1, 0.7), fontsize='small') + + # Increase right margin for better readability of legend + plt.subplots_adjust(right=0.75) + + # Save plot to the same folder as the log file + plt.savefig(plot_file) # Display the plot plt.show() -# Make sure the file path is passed as an argument -if len(sys.argv) < 2: - print("Please provide the path to 
the log file as a parameter.") -else: - plot_log_data(sys.argv[1]) +def main(): + # Parse command-line arguments + parser = argparse.ArgumentParser(description="Process a log file and plot the data with two y-axes.") + parser.add_argument('log_file', metavar='log_file', type=str, help="Path to the log file") + + # Handle -h / --help + args = parser.parse_args() + + # Call the function to process the log file + plot_log_data(args.log_file) + +if __name__ == "__main__": + main() From 1adc33a6d6de83ef13cad648ea6ccfb9f6ceda02 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 17:14:58 +0000 Subject: [PATCH 106/498] Fix plot --- modules/flowmldetection/plot_train_score.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/modules/flowmldetection/plot_train_score.py b/modules/flowmldetection/plot_train_score.py index 359df04ef..c7f374a7f 100644 --- a/modules/flowmldetection/plot_train_score.py +++ b/modules/flowmldetection/plot_train_score.py @@ -40,18 +40,21 @@ def plot_log_data(file_path): ax1.set_ylim(0, 1) # Set y-axis for Score from 0 to 1 ax1.tick_params(axis='y', labelcolor='tab:blue') - # Create the second y-axis for the Background, Benign, Malicious, Total labels + # Create the second y-axis for the Background, Benign, Malicious ax2 = ax1.twinx() ax2.plot(df.index, df["Background"], label="Background", color='tab:green', linestyle='--') ax2.plot(df.index, df["Benign"], label="Benign", color='tab:orange', linestyle='--') ax2.plot(df.index, df["Malicious"], label="Malicious", color='tab:pink', linestyle='--') - ax2.plot(df.index, df["Total labels"], label="Total labels", color='tab:red') - ax2.set_ylabel('Background, Benign, Malicious, Total labels', color='tab:red') + ax2.set_ylabel('Background, Benign, Malicious', color='tab:red') # Set appropriate scale for right y-axis based on the data - ax2.set_ylim(0, df[["Background", "Benign", "Malicious", "Total labels"]].max().max()) + ax2.set_ylim(0, df[["Background", "Benign", "Malicious"]].max().max()) ax2.tick_params(axis='y', labelcolor='tab:red') + # Annotating Total labels as text on the plot + for i, value in enumerate(df["Total labels"]): + ax1.text(i, value, f'{value:.1f}', color='tab:gray', fontsize=8, ha='center', va='bottom') + # Adding title and legend plt.title('Log Data Visualization') fig.tight_layout() From 010fbcda3c6183a3a309726519519ffcd0b61927 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 17:16:23 +0000 Subject: [PATCH 107/498] Fix plot --- modules/flowmldetection/plot_train_score.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/modules/flowmldetection/plot_train_score.py b/modules/flowmldetection/plot_train_score.py index c7f374a7f..4099c47c1 100644 --- a/modules/flowmldetection/plot_train_score.py +++ b/modules/flowmldetection/plot_train_score.py @@ -42,10 +42,10 @@ def plot_log_data(file_path): # Create the second y-axis for the Background, Benign, Malicious ax2 = ax1.twinx() - ax2.plot(df.index, df["Background"], label="Background", color='tab:green', linestyle='--') - ax2.plot(df.index, df["Benign"], label="Benign", color='tab:orange', linestyle='--') - ax2.plot(df.index, df["Malicious"], label="Malicious", color='tab:pink', linestyle='--') - ax2.set_ylabel('Background, Benign, Malicious', color='tab:red') + ax2.plot(df.index, df["Background"], label="Background Labels", color='tab:green', linestyle='--') + ax2.plot(df.index, df["Benign"], label="Benign Labels", color='tab:orange', linestyle='--') + ax2.plot(df.index, 
df["Malicious"], label="Malicious Labels", color='tab:pink', linestyle='--') + ax2.set_ylabel('Background, Benign, Malicious Labels', color='tab:red') # Set appropriate scale for right y-axis based on the data ax2.set_ylim(0, df[["Background", "Benign", "Malicious"]].max().max()) @@ -56,7 +56,7 @@ def plot_log_data(file_path): ax1.text(i, value, f'{value:.1f}', color='tab:gray', fontsize=8, ha='center', va='bottom') # Adding title and legend - plt.title('Log Data Visualization') + plt.title('Training performance') fig.tight_layout() # Adding the legend with increased space for readability From 978eaa02e2d48e6d27ab1c579a90b8a21b666b41 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 17:24:43 +0000 Subject: [PATCH 108/498] Fix plot --- modules/flowmldetection/plot_train_score.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/modules/flowmldetection/plot_train_score.py b/modules/flowmldetection/plot_train_score.py index 4099c47c1..8437e968a 100644 --- a/modules/flowmldetection/plot_train_score.py +++ b/modules/flowmldetection/plot_train_score.py @@ -59,12 +59,12 @@ def plot_log_data(file_path): plt.title('Training performance') fig.tight_layout() - # Adding the legend with increased space for readability - ax1.legend(loc='upper left', bbox_to_anchor=(1, 1), fontsize='small') - ax2.legend(loc='upper left', bbox_to_anchor=(1, 0.7), fontsize='small') + # Move both legends further to the right + ax1.legend(loc='upper right', bbox_to_anchor=(1.26, 1), fontsize='small', ncol=1) + ax2.legend(loc='upper right', bbox_to_anchor=(1.4, 0.95), fontsize='small', ncol=1) # Increase right margin for better readability of legend - plt.subplots_adjust(right=0.75) + plt.subplots_adjust(right=0.7) # Save plot to the same folder as the log file plt.savefig(plot_file) From 3571750a84fc29ee775f42eb9b90851818defa56 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 18:02:34 +0000 Subject: [PATCH 109/498] Plot testing performance from a log --- .../plot_testing_performance.py | 89 +++++++++++++++++++ 1 file changed, 89 insertions(+) create mode 100644 modules/flowmldetection/plot_testing_performance.py diff --git a/modules/flowmldetection/plot_testing_performance.py b/modules/flowmldetection/plot_testing_performance.py new file mode 100644 index 000000000..a38c7f059 --- /dev/null +++ b/modules/flowmldetection/plot_testing_performance.py @@ -0,0 +1,89 @@ +import matplotlib.pyplot as plt +import sys +import numpy as np + +def process_file(file_path): + # Initialize the counters for the values + FPR_values = [] + FNR_values = [] + TNR_values = [] + TPR_values = [] + F1_values = [] + accuracy_values = [] + precision_values = [] + MCC_values = [] + recall_values = [] + + # Read the file and extract the data + with open(file_path, 'r') as file: + for line in file: + if "TP:" in line: + # Extract the values from the line + parts = line.split(',') + TP = int(parts[0].split(':')[1].strip()) + TN = int(parts[1].split(':')[1].strip()) + FP = int(parts[2].split(':')[1].strip()) + FN = int(parts[3].split(':')[1].strip()) + + # Calculate metrics + FPR = FP / (FP + TN) if (FP + TN) != 0 else 0 + FNR = FN / (FN + TP) if (FN + TP) != 0 else 0 + TNR = TN / (TN + FP) if (TN + FP) != 0 else 0 + TPR = TP / (TP + FN) if (TP + FN) != 0 else 0 + Precision = TP / (TP + FP) if (TP + FP) != 0 else 0 + Recall = TPR # Recall is the same as TPR + F1 = 2 * (Precision * Recall) / (Precision + Recall) if (Precision + Recall) != 0 else 0 + Accuracy = (TP + TN) / (TP + TN + FP + FN) + MCC = ((TP 
* TN) - (FP * FN)) / np.sqrt((TP + FP) * (TP + FN) * (TN + FP) * (TN + FN)) if ((TP + FP) * (TP + FN) * (TN + FP) * (TN + FN)) != 0 else 0
+
+                # Append the values to the respective lists
+                FPR_values.append(FPR)
+                FNR_values.append(FNR)
+                TNR_values.append(TNR)
+                TPR_values.append(TPR)
+                F1_values.append(F1)
+                accuracy_values.append(Accuracy)
+                precision_values.append(Precision)
+                MCC_values.append(MCC)
+                recall_values.append(Recall)
+
+    return FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values
+
+def plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values):
+    # Create the plot
+    plt.figure(figsize=(12, 8))
+
+    # Plot each metric
+    plt.plot(FPR_values, label='False Positive Rate (FPR)', marker='o')
+    plt.plot(FNR_values, label='False Negative Rate (FNR)', marker='o')
+    plt.plot(TNR_values, label='True Negative Rate (TNR)', marker='o')
+    plt.plot(TPR_values, label='True Positive Rate (TPR)', marker='o')
+    plt.plot(F1_values, label='F1 Score', marker='o')
+    plt.plot(accuracy_values, label='Accuracy', marker='o')
+    plt.plot(precision_values, label='Precision', marker='o')
+    plt.plot(MCC_values, label='Matthews Correlation Coefficient (MCC)', marker='o')
+    plt.plot(recall_values, label='Recall (TPR)', marker='o')
+
+    # Add labels and title
+    plt.xlabel('Index')
+    plt.ylabel('Metric Value')
+    plt.title('Evaluation Metrics Over Time')
+
+    # Add a legend
+    plt.legend()
+
+    # Save the plot as a PNG file
+    plt.savefig('metrics_plot.png')
+    plt.close()
+
+def main():
+    if len(sys.argv) != 2:
+        print("Usage: python script.py <log_file>")
+        sys.exit(1)
+
+    file_path = sys.argv[1]
+    FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values = process_file(file_path)
+    plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values)
+
+if __name__ == "__main__":
+    main()

From 1c0ea51fad5afbd9753a1d52c5369baca086a7d3 Mon Sep 17 00:00:00 2001
From: Seba Garcia
Date: Sat, 3 May 2025 18:04:32 +0000
Subject: [PATCH 110/498] Fix the plot

---
 modules/flowmldetection/plot_testing_performance.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/modules/flowmldetection/plot_testing_performance.py b/modules/flowmldetection/plot_testing_performance.py
index a38c7f059..fac0acd64 100644
--- a/modules/flowmldetection/plot_testing_performance.py
+++ b/modules/flowmldetection/plot_testing_performance.py
@@ -64,16 +64,19 @@ def plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accu
     plt.plot(MCC_values, label='Matthews Correlation Coefficient (MCC)', marker='o')
     plt.plot(recall_values, label='Recall (TPR)', marker='o')
 
+    # Set logarithmic scale on the y-axis
+    plt.yscale('log')
+
     # Add labels and title
     plt.xlabel('Index')
-    plt.ylabel('Metric Value')
-    plt.title('Evaluation Metrics Over Time')
+    plt.ylabel('Metric Value (Log Scale)')
+    plt.title('Evaluation Metrics Over Time (Log Scale)')
 
     # Add a legend
     plt.legend()
 
     # Save the plot as a PNG file
-    plt.savefig('metrics_plot.png')
+    plt.savefig('metrics_plot_log_scale.png')
     plt.close()
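
As a quick sanity check of the metric formulas used in process_file(), here is a small worked example on invented confusion-matrix counts (illustrative only, not output from Slips):

    # Hedged example: made-up counts to verify the formulas above.
    TP, TN, FP, FN = 90, 95, 5, 10

    FPR = FP / (FP + TN)                          # 5 / 100 = 0.05
    TPR = TP / (TP + FN)                          # 90 / 100 = 0.90 (same as Recall)
    Precision = TP / (TP + FP)                    # 90 / 95 ~ 0.947
    F1 = 2 * Precision * TPR / (Precision + TPR)  # ~ 0.923
    Accuracy = (TP + TN) / (TP + TN + FP + FN)    # 185 / 200 = 0.925
    MCC = ((TP * TN) - (FP * FN)) / (
        ((TP + FP) * (TP + FN) * (TN + FP) * (TN + FN)) ** 0.5
    )                                             # 8500 / ~9987.5 ~ 0.851

From 1bcca14a5068fbb68c8a38962f7b995314cc65d7 Mon Sep 17 00:00:00 2001
From: Seba Garcia
Date: Sat, 3 May 2025 18:12:40 +0000
Subject: [PATCH 111/498] Fix the plots

---
 .../plot_testing_performance.py | 76 ++++++++++++++-----
 1 file changed, 55 insertions(+), 21 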
deletions(-) diff --git a/modules/flowmldetection/plot_testing_performance.py b/modules/flowmldetection/plot_testing_performance.py index fac0acd64..5581c72cd 100644 --- a/modules/flowmldetection/plot_testing_performance.py +++ b/modules/flowmldetection/plot_testing_performance.py @@ -50,33 +50,66 @@ def process_file(file_path): return FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values def plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values): - # Create the plot - plt.figure(figsize=(12, 8)) + # Separate the values into two groups based on their proximity to 0 or 1 + close_to_0 = { + 'FPR': [], 'FNR': [] + } + close_to_1 = { + 'TNR': [], 'TPR': [], 'F1': [], 'accuracy': [], 'precision': [], 'MCC': [], 'recall': [] + } - # Plot each metric - plt.plot(FPR_values, label='False Positive Rate (FPR)', marker='o') - plt.plot(FNR_values, label='False Negative Rate (FNR)', marker='o') - plt.plot(TNR_values, label='True Negative Rate (TNR)', marker='o') - plt.plot(TPR_values, label='True Positive Rate (TPR)', marker='o') - plt.plot(F1_values, label='F1 Score', marker='o') - plt.plot(accuracy_values, label='Accuracy', marker='o') - plt.plot(precision_values, label='Precision', marker='o') - plt.plot(MCC_values, label='Matthews Correlation Coefficient (MCC)', marker='o') - plt.plot(recall_values, label='Recall (TPR)', marker='o') + # Categorize the metrics into two groups + for i in range(len(FPR_values)): + close_to_0['FPR'].append(FPR_values[i]) + close_to_0['FNR'].append(FNR_values[i]) + + close_to_1['TNR'].append(TNR_values[i]) + close_to_1['TPR'].append(TPR_values[i]) + close_to_1['F1'].append(F1_values[i]) + close_to_1['accuracy'].append(accuracy_values[i]) + close_to_1['precision'].append(precision_values[i]) + close_to_1['MCC'].append(MCC_values[i]) + close_to_1['recall'].append(recall_values[i]) + + # Plot metrics for values close to 0 + plot_single_group(close_to_0, 'metrics_plot_close_to_0.png') - # Set logarithmic scale on the y-axis - plt.yscale('log') + # Plot metrics for values close to 1 + plot_single_group(close_to_1, 'metrics_plot_close_to_1.png') + +def plot_single_group(metrics_dict, output_filename): + plt.figure(figsize=(12, 8)) - # Add labels and title + # Only plot the metrics that exist in the dictionary + if 'FPR' in metrics_dict: + plt.plot(metrics_dict['FPR'], label='False Positive Rate (FPR)', marker='o') + if 'FNR' in metrics_dict: + plt.plot(metrics_dict['FNR'], label='False Negative Rate (FNR)', marker='o') + if 'TNR' in metrics_dict: + plt.plot(metrics_dict['TNR'], label='True Negative Rate (TNR)', marker='o') + if 'TPR' in metrics_dict: + plt.plot(metrics_dict['TPR'], label='True Positive Rate (TPR)', marker='o') + if 'F1' in metrics_dict: + plt.plot(metrics_dict['F1'], label='F1 Score', marker='o') + if 'accuracy' in metrics_dict: + plt.plot(metrics_dict['accuracy'], label='Accuracy', marker='o') + if 'precision' in metrics_dict: + plt.plot(metrics_dict['precision'], label='Precision', marker='o') + if 'MCC' in metrics_dict: + plt.plot(metrics_dict['MCC'], label='Matthews Correlation Coefficient (MCC)', marker='o') + if 'recall' in metrics_dict: + plt.plot(metrics_dict['recall'], label='Recall (TPR)', marker='o') + + # Apply log scale by default + plt.yscale('log') + plt.xlabel('Index') - plt.ylabel('Metric Value (Log Scale)') - plt.title('Evaluation Metrics Over Time (Log Scale)') - - # Add a legend + 
plt.ylabel('Metric Value') + plt.title(f'Evaluation Metrics Over Time ({output_filename.split("_")[2].replace(".png", "")})') plt.legend() - # Save the plot as a PNG file - plt.savefig('metrics_plot_log_scale.png') + # Save the plot + plt.savefig(output_filename) plt.close() def main(): @@ -85,6 +118,7 @@ def main(): sys.exit(1) file_path = sys.argv[1] + FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values = process_file(file_path) plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values) From ab4bcd82169f802615ea28755e6735a0c611e2e7 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 18:16:50 +0000 Subject: [PATCH 112/498] Fix plot --- .../plot_testing_performance.py | 22 +++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/modules/flowmldetection/plot_testing_performance.py b/modules/flowmldetection/plot_testing_performance.py index 5581c72cd..8f9e12cd8 100644 --- a/modules/flowmldetection/plot_testing_performance.py +++ b/modules/flowmldetection/plot_testing_performance.py @@ -72,12 +72,24 @@ def plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accu close_to_1['recall'].append(recall_values[i]) # Plot metrics for values close to 0 - plot_single_group(close_to_0, 'metrics_plot_close_to_0.png') + plot_single_group(close_to_0, 'metrics_plot_close_to_0.png', is_close_to_0=True) # Plot metrics for values close to 1 plot_single_group(close_to_1, 'metrics_plot_close_to_1.png') -def plot_single_group(metrics_dict, output_filename): + # Print the final values + print("\nFinal Metric Values:") + print(f"Final FPR: {FPR_values[-1]:.4f}") + print(f"Final FNR: {FNR_values[-1]:.4f}") + print(f"Final TNR: {TNR_values[-1]:.4f}") + print(f"Final TPR: {TPR_values[-1]:.4f}") + print(f"Final F1 Score: {F1_values[-1]:.4f}") + print(f"Final Accuracy: {accuracy_values[-1]:.4f}") + print(f"Final Precision: {precision_values[-1]:.4f}") + print(f"Final MCC: {MCC_values[-1]:.4f}") + print(f"Final Recall: {recall_values[-1]:.4f}") + +def plot_single_group(metrics_dict, output_filename, is_close_to_0=False): plt.figure(figsize=(12, 8)) # Only plot the metrics that exist in the dictionary @@ -103,6 +115,12 @@ def plot_single_group(metrics_dict, output_filename): # Apply log scale by default plt.yscale('log') + # If the plot is close to 0, set custom ticks + if is_close_to_0: + # Manually set more Y-ticks for better visibility + plt.ylim(0.0001, 1) # Set Y-axis limits between 0.0001 and 1 + plt.yticks([0.0001, 0.001, 0.01, 0.1, 1], ['0.0001', '0.001', '0.01', '0.1', '1']) # Adjust Y-ticks + plt.xlabel('Index') plt.ylabel('Metric Value') plt.title(f'Evaluation Metrics Over Time ({output_filename.split("_")[2].replace(".png", "")})') From b7e0c6f6b4cecc6a446dc322e320183999092fb6 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 18:20:22 +0000 Subject: [PATCH 113/498] Fix plots --- modules/flowmldetection/flowmldetection.py | 709 +++++---------------- 1 file changed, 143 insertions(+), 566 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 6c3bfc127..37f076110 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -1,566 +1,143 @@ -# SPDX-FileCopyrightText: 2021 Sebastian Garcia -from typing import Optional - -# SPDX-License-Identifier: GPL-2.0-only -import numpy -from 
sklearn.linear_model import SGDClassifier -from sklearn.preprocessing import StandardScaler -import pickle -import pandas as pd -import json -import traceback -import warnings -import os - -from slips_files.common.parsers.config_parser import ConfigParser -from slips_files.common.slips_utils import utils -from slips_files.common.abstracts.module import IModule -from slips_files.core.structures.evidence import ( - Evidence, - ProfileID, - TimeWindow, - Attacker, - ThreatLevel, - EvidenceType, - IoCType, - Direction, - Victim, - Method, -) - -# This horrible hack is only to stop sklearn from printing those warnings -def warn(*args, **kwargs): - pass - - -warnings.warn = warn - - -class FlowMLDetection(IModule): - # Name: short name of the module. Do not use spaces - name = "Flow ML Detection" - description = ( - "Train or test a Machine Learning model to detect malicious flows" - ) - authors = ["Sebastian Garcia"] - - def init(self): - # Subscribe to the channel - self.c1 = self.db.subscribe("new_flow") - self.channels = {"new_flow": self.c1} - self.fieldseparator = self.db.get_field_separator() - # Set the output queue of our database instance - # Read the configuration - self.read_configuration() - # Minum amount of new labels needed to start the train - self.minimum_labels_to_start_train = 50 - # Minum amount of new labels needed to retrain - self.minimum_labels_to_retrain = 50 - # The number of flows when last trained. Used internally only to know - # when to retrain - self.last_number_of_flows_when_trained = 0 - # The scaler trained during training and to use during testing - self.scaler = StandardScaler() - self.model_path = "./modules/flowmldetection/model.bin" - self.scaler_path = "./modules/flowmldetection/scaler.bin" - - # Initialize the training log file - self.training_log_path = "./modules/flowmldetection/training.log" - with open(self.training_log_path, "w") as log_file: - log_file.write("Training Log Initialized\n") - - def read_configuration(self): - conf = ConfigParser() - self.mode = conf.get_ml_mode() - # This is the global label in the configuration, - # in case the flows do not have a label themselves - self.label = conf.label() - - def write_to_training_log(self, message: str): - """ - Write a message to the training log file. - """ - try: - with open(self.training_log_path, "a") as log_file: - log_file.write(message + "\n") - except Exception as e: - self.print(f"Error writing to training log: {e}", 0, 1) - - def train(self, sum_labeled_flows): - """ - Train a model based on the flows we receive and the labels - """ - try: - # Create X_flow with the current flows minus the label - X_flow = self.flows.drop("ground_truth_label", axis=1) - # Drop the detailed labels - X_flow = X_flow.drop("detailed_ground_truth_label", axis=1) - # Drop the module_labels - X_flow = X_flow.drop("module_labels", axis=1) - # Create y_flow with the label - y_flow = numpy.full(X_flow.shape[0], self.flows.ground_truth_label) - - # Normalize this batch of data so far. 
This can get progressivle slow - X_flow = self.scaler.fit_transform(X_flow) - - # Count the number of labels of each type in this epoc - epoch_label_counts = { - "Background": (y_flow == "Background").sum(), - "Malicious": (y_flow == "Malicious").sum(), - "Benign": (y_flow == "Benign").sum(), - } - - # Train - try: - # Online incremental learning - self.clf.partial_fit( - X_flow, y_flow, classes=["Background", "Malicious", "Benign"] - ) - except Exception: - self.print("Error while calling clf.train()") - self.print(traceback.format_exc(), 0, 1) - - # See score so far in training - score = self.clf.score(X_flow, y_flow) - - #self.print(f" Training Score: {score}", 1, 0) - #self.print(f' Model Parameters: {self.clf.coef_}', 1, 0) - - # Store the models on disk - self.store_model() - - # Log training information - self.write_to_training_log(f"Training completed. Background: {epoch_label_counts['Background']}. Benign: {epoch_label_counts['Benign']}. Malicious: {epoch_label_counts['Malicious']}. Total labels: {sum_labeled_flows}. Score: {score}") - #self.write_to_training_log(f"Model parameters: {self.clf.coef_}") - except Exception: - self.print("Error in train().", 0, 1) - self.print(traceback.format_exc(), 0, 1) - self.write_to_training_log("Error occurred during training.") - - def process_features(self, dataset): - """ - Discards some features of the dataset and can create new. - Clean the dataset - """ - try: - # Discard some type of flows that dont have ports - to_discard = ["arp", "ARP", "icmp", "igmp", "ipv6-icmp", ""] - for proto in to_discard: - dataset = dataset[dataset.proto != proto] - - # If te proto is in the list to delete and there is only one flow, then the dataset will be empty - if dataset.empty: - # DataFrame is empty now, so return empty - return dataset - - # For now, discard these - to_drop = [ - "appproto", - "daddr", - "saddr", - "starttime", - "type_", - "smac", - "dmac", - "history", - "uid", - "dir_", - "endtime", - "flow_source", - ] - for field in to_drop: - try: - dataset = dataset.drop(field, axis=1) - except (ValueError, KeyError): - pass - - # When flows are read from Slips sqlite, - # the state is not transformed to 'Established' or - # 'Not Established', it is still 'S0' and others - # So transform here - dataset["state"] = dataset.apply( - lambda row: self.db.get_final_state_from_flags( - row["state"], (row["spkts"] + row["dpkts"]) - ), - axis=1, - ) - - # Convert state to categorical - dataset.state = dataset.state.str.replace( - r"(^.*Not Established.*$)", "0", regex=True - ) - dataset.state = dataset.state.str.replace( - r"(^.*Established.*$)", "1", regex=True - ) - - # Convert categories to floats - dataset.state = dataset.state.astype("float64") - - # Convert proto to categorical. For now we only have few states, so we can hardcode... - # We dont use the data to create categories because in testing mode - # we dont see all the protocols - # Also we dont store the Categorizer because the user can retrain - # with its own data. 
- dataset.proto = dataset.proto.str.lower() - dataset.proto = dataset.proto.str.replace( - r"(^.*tcp.*$)", "0", regex=True - ) - dataset.proto = dataset.proto.str.replace( - r"(^.*udp.*$)", "1", regex=True - ) - dataset.proto = dataset.proto.str.replace( - r"(^.*icmp.*$)", "2", regex=True - ) - dataset.proto = dataset.proto.str.replace( - r"(^.*icmp-ipv6.*$)", "3", regex=True - ) - dataset.proto = dataset.proto.str.replace( - r"(^.*arp.*$)", "4", regex=True - ) - - dataset["allbytes"] = dataset["sbytes"] + dataset["dbytes"] - dataset["pkts"] = dataset["spkts"] + dataset["dpkts"] - - fields_to_convert_to_float = [ - dataset.proto, - dataset.dport, - dataset.sport, - dataset.dur, - dataset.pkts, - dataset.spkts, - dataset.allbytes, - dataset.sbytes, - dataset.state, - ] - for field in fields_to_convert_to_float: - try: - field = field.astype("float64") - except (ValueError, AttributeError): - pass - - return dataset - except Exception: - # Stop the timer - self.print("Error in process_features()") - self.print(traceback.format_exc(), 0, 1) - - def process_training_flows(self, last_number_of_flows_when_trained): - """ - Process only the new flows in the DB since the last training. - Store the pandas df in self.flows - """ - try: - # Ensure the index is an integer - if last_number_of_flows_when_trained is None: - last_number_of_flows_when_trained = 0 - else: - last_number_of_flows_when_trained = int(last_number_of_flows_when_trained) - - # We get all the flows so far - flows = self.db.get_all_flows() - # Only process new flows since last training - new_flows = flows[last_number_of_flows_when_trained:] - - # Check how many **different** labels are in the DB - labels = self.db.get_labels() - if len(labels) == 1: - # Insert fake flows for both classes if needed - new_flows.append( - { - "starttime": 1594417039.029793, - "dur": "1.9424750804901123", - "saddr": "10.7.10.101", - "sport": "49733", - "daddr": "40.70.224.145", - "dport": "443", - "proto": "tcp", - "state": "SF", - "spkts": 17, - "dpkts": 27, - "sbytes": 25517, - "dbytes": 17247, - "appproto": "ssl", - "ground_truth_label": "Malicious", - "module_labels": { - "flowalerts-long-connection": "Malicious" - }, - } - ) - new_flows.append( - { - "starttime": 1382355032.706468, - "dur": "10.896695", - "saddr": "147.32.83.52", - "sport": "47956", - "daddr": "80.242.138.72", - "dport": "80", - "proto": "tcp", - "state": "SF", - "spkts": 1, - "dpkts": 0, - "sbytes": 100, - "dbytes": 67596, - "appproto": "http", - "ground_truth_label": "Benign", - "module_labels": { - "flowalerts-long-connection": "Benign" - }, - } - ) - - # Convert to pandas df - df_flows = pd.DataFrame(new_flows) - - # Process features - df_flows = self.process_features(df_flows) - - # Update the flow to the processed version - self.flows = df_flows - except Exception: - self.print("Error in process_flows()") - self.print(traceback.format_exc(), 0, 1) - - def process_flow(self, flow_to_process: dict): - """ - Process one flow. 
Only used during detection in testing - returns the pandas df with the processed flow - """ - try: - # Convert the flow to a pandas dataframe - raw_flow = pd.DataFrame(flow_to_process, index=[0]) - dflow = self.process_features(raw_flow) - if dflow.empty: - return None - # Update the flow to the processed version - return dflow - except Exception: - # Stop the timer - self.print("Error in process_flow()") - self.print(traceback.format_exc(), 0, 1) - - def detect(self, x_flow) -> Optional[numpy.ndarray]: - """ - Detects the given flow with the current model stored - and returns the predection array - """ - try: - # clean the flow - fields_to_drop = [ - "label", - "module_labels", - "uid", - "history", - "dir_", - "endtime", - "flow_source", - "ground_truth_label", - "detailed_ground_truth_label", - ] - # For argus binetflows this fails because ther is a field calle bytes that was not in other flows. It should be called allbytes. - # Error - ''' [Flow ML Detection] Error in detect() while processing - dur proto sport dport state pkts spkts dpkts bytes sbytes dbytes allbytes - 0 63.822830 0 56119 981 0.0 15 15 0 8764 1887 0 1887 - The feature names should match those that were passed during fit. - Feature names unseen at fit time: - - bytes - ''' - - # IF we delete here the filed bytes the error is - # [Flow ML Detection] Error in detect() while processing - # dur proto sport dport state pkts spkts dpkts sbytes dbytes allbytes - # 0 63.822830 0 56120 980 0.0 15 15 0 1887 0 1887 - # The feature names should match those that were passed during fit. - # Feature names must be in the same order as they were in fit. - - for field in fields_to_drop: - try: - x_flow = x_flow.drop(field, axis=1) - except (KeyError, ValueError): - pass - # Scale the flow - x_flow: numpy.ndarray = self.scaler.transform(x_flow) - pred: numpy.ndarray = self.clf.predict(x_flow) - return pred - except Exception as e: - self.print( - f"Error in detect() while processing " f"\n{x_flow}\n{e}" - ) - self.print(traceback.format_exc(), 0, 1) - - def store_model(self): - """ - Store the trained model on disk - """ - self.print("Storing the trained model and scaler on disk.", 0, 2) - with open(self.model_path, "wb") as f: - data = pickle.dumps(self.clf) - f.write(data) - with open(self.scaler_path, "wb") as g: - data = pickle.dumps(self.scaler) - g.write(data) - - def read_model(self): - """ - Read the trained model from disk - """ - try: - self.print("Reading the trained model from disk.", 0, 2) - with open(self.model_path, "rb") as f: - self.clf = pickle.load(f) - self.print("Reading the trained scaler from disk.", 0, 2) - with open(self.scaler_path, "rb") as g: - self.scaler = pickle.load(g) - except FileNotFoundError: - # If there is no model, create one empty - self.print( - "There was no model. " "Creating a new empty model.", 0, 2 - ) - self.clf = SGDClassifier( - warm_start=True, loss="hinge", penalty="l1" - ) - except EOFError: - self.print( - "Error reading model from disk. " - "Creating a new empty model.", - 0, - 2, - ) - self.clf = SGDClassifier( - warm_start=True, loss="hinge", penalty="l1" - ) - - def set_evidence_malicious_flow(self, flow: dict, twid: str): - confidence: float = 0.1 - description = ( - f"Flow with malicious characteristics by ML. 
Src IP" - f" {flow['saddr']}:{flow['sport']} to " - f"{flow['daddr']}:{flow['dport']}" - ) - twid_number = int(twid.replace("timewindow", "")) - evidence: Evidence = Evidence( - evidence_type=EvidenceType.MALICIOUS_FLOW, - attacker=Attacker( - direction=Direction.SRC, - ioc_type=IoCType.IP, - value=flow["saddr"], - ), - victim=Victim( - direction=Direction.DST, - ioc_type=IoCType.IP, - value=flow["daddr"], - ), - threat_level=ThreatLevel.LOW, - confidence=confidence, - description=description, - profile=ProfileID(ip=flow["saddr"]), - timewindow=TimeWindow(twid_number), - uid=[flow["uid"]], - timestamp=flow["starttime"], - method=Method.AI, - src_port=flow["sport"], - dst_port=flow["dport"], - ) - - self.db.set_evidence(evidence) - - def shutdown_gracefully(self): - # Confirm that the module is done processing - if self.mode == "train": - self.store_model() - - def pre_main(self): - utils.drop_root_privs() - # Load the model - self.read_model() - - def main(self): - if msg := self.get_msg("new_flow"): - # When a new flow arrives - msg = json.loads(msg["data"]) - self.twid = msg["twid"] - self.profileid = msg["profileid"] - self.flow = msg["flow"] - # These following extra fields are expected in testing. update the original - # flow dict to have them - self.flow.update( - { - "state": msg["interpreted_state"], - "label": msg["label"], - "module_labels": msg["module_labels"], - } - ) - - if self.mode == "train": - # We are training - - # Is the amount in the DB of labels enough to retrain? - # Use labeled flows - labels = self.db.get_labels() - sum_labeled_flows = sum(i[1] for i in labels) - - # The min labels to retrain is the min number of flows - # we should have seen so far in this capture to start training - # This is so we dont _start_ training with only 1 flow - - # Once we are over the start minimum, the second condition is - # to force to retrain every a minimum_labels_to_retrain number - # of flows. So we dont retrain every 1 flow. - if ( - sum_labeled_flows >= self.minimum_labels_to_start_train - ): - if (sum_labeled_flows - self.last_number_of_flows_when_trained >= self.minimum_labels_to_retrain): - # So for example we retrain every 50 labels and only when - # we have at least 50 labels - self.print( - f"Training the model with the last group of " - f"flows and labels. Total flows: {sum_labeled_flows}." - ) - # Process all flows in the DB and make them ready - # for pandas - self.process_training_flows(self.last_number_of_flows_when_trained) - # Train an algorithm - self.train(sum_labeled_flows) - self.last_number_of_flows_when_trained = sum_labeled_flows - - elif self.mode == "test": - # We are testing, which means using the model to detect - processed_flow = self.process_flow(self.flow) - - # After processing the flow, it may happen that we - # delete icmp/arp/etc so the dataframe can be empty - if processed_flow is not None and not processed_flow.empty: - # Predict - pred: numpy.ndarray = self.detect(processed_flow) - if not pred: - # an error occurred - return - - label = self.flow["label"] - if label and label != "unknown" and label != pred[0]: - # If the user specified a label in test mode, - # and the label is diff from the prediction, - # print in debug mode - self.print( - f"Predicted {pred[0]} for ground-truth label" - f' {label}. 
Flow {self.flow["saddr"]}:' - f'{self.flow["sport"]} ->' - f' {self.flow["daddr"]}:' - f'{self.flow["dport"]}/' - f'{self.flow["proto"]}', - 0, - 3, - ) - if pred[0] == "Malicious": - # Generate an alert - self.set_evidence_malicious_flow(self.flow, self.twid) - self.print( - f"Prediction {pred[0]} for label {label}" - f' flow {self.flow["saddr"]}:' - f'{self.flow["sport"]} -> ' - f'{self.flow["daddr"]}:' - f'{self.flow["dport"]}/' - f'{self.flow["proto"]}', - 0, - 2, - ) +import matplotlib.pyplot as plt +import sys +import numpy as np + +def process_file(file_path): + # Initialize the counters for the values + FPR_values = [] + FNR_values = [] + TNR_values = [] + TPR_values = [] + F1_values = [] + accuracy_values = [] + precision_values = [] + MCC_values = [] + recall_values = [] + + # Read the file and extract the data + with open(file_path, 'r') as file: + for line in file: + if "TP:" in line: + # Extract the values from the line + parts = line.split(',') + TP = int(parts[0].split(':')[1].strip()) + TN = int(parts[1].split(':')[1].strip()) + FP = int(parts[2].split(':')[1].strip()) + FN = int(parts[3].split(':')[1].strip()) + + # Calculate metrics + FPR = FP / (FP + TN) if (FP + TN) != 0 else 0 + FNR = FN / (FN + TP) if (FN + TP) != 0 else 0 + TNR = TN / (TN + FP) if (TN + FP) != 0 else 0 + TPR = TP / (TP + FN) if (TP + FN) != 0 else 0 + Precision = TP / (TP + FP) if (TP + FP) != 0 else 0 + Recall = TPR # Recall is the same as TPR + F1 = 2 * (Precision * Recall) / (Precision + Recall) if (Precision + Recall) != 0 else 0 + Accuracy = (TP + TN) / (TP + TN + FP + FN) + MCC = ((TP * TN) - (FP * FN)) / np.sqrt((TP + FP) * (TP + FN) * (TN + FP) * (TN + FN)) if ((TP + FP) * (TP + FN) * (TN + FP) * (TN + FN)) != 0 else 0 + + # Append the values to the respective lists + FPR_values.append(FPR) + FNR_values.append(FNR) + TNR_values.append(TNR) + TPR_values.append(TPR) + F1_values.append(F1) + accuracy_values.append(Accuracy) + precision_values.append(Precision) + MCC_values.append(MCC) + recall_values.append(Recall) + + return FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values + +def plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values): + # Separate the values into two groups based on their proximity to 0 or 1 + close_to_0 = { + 'FPR': [], 'FNR': [] + } + close_to_1 = { + 'TNR': [], 'TPR': [], 'F1': [], 'accuracy': [], 'precision': [], 'MCC': [], 'recall': [] + } + + # Categorize the metrics into two groups + for i in range(len(FPR_values)): + close_to_0['FPR'].append(FPR_values[i]) + close_to_0['FNR'].append(FNR_values[i]) + + close_to_1['TNR'].append(TNR_values[i]) + close_to_1['TPR'].append(TPR_values[i]) + close_to_1['F1'].append(F1_values[i]) + close_to_1['accuracy'].append(accuracy_values[i]) + close_to_1['precision'].append(precision_values[i]) + close_to_1['MCC'].append(MCC_values[i]) + close_to_1['recall'].append(recall_values[i]) + + # Plot metrics for values close to 0 + plot_single_group(close_to_0, 'metrics_plot_close_to_0.png', is_close_to_0=True) + + # Plot metrics for values close to 1 + plot_single_group(close_to_1, 'metrics_plot_close_to_1.png') + + # Print the final values + print("\nFinal Metric Values:") + print(f"Final FPR: {FPR_values[-1]:.4f}") + print(f"Final FNR: {FNR_values[-1]:.4f}") + print(f"Final TNR: {TNR_values[-1]:.4f}") + print(f"Final TPR: {TPR_values[-1]:.4f}") + print(f"Final F1 Score: {F1_values[-1]:.4f}") + 
print(f"Final Accuracy: {accuracy_values[-1]:.4f}") + print(f"Final Precision: {precision_values[-1]:.4f}") + print(f"Final MCC: {MCC_values[-1]:.4f}") + print(f"Final Recall: {recall_values[-1]:.4f}") + +def plot_single_group(metrics_dict, output_filename, is_close_to_0=False): + plt.figure(figsize=(12, 8)) + + # Only plot the metrics that exist in the dictionary + if 'FPR' in metrics_dict: + plt.plot(metrics_dict['FPR'], label='False Positive Rate (FPR)', marker='o') + if 'FNR' in metrics_dict: + plt.plot(metrics_dict['FNR'], label='False Negative Rate (FNR)', marker='o') + if 'TNR' in metrics_dict: + plt.plot(metrics_dict['TNR'], label='True Negative Rate (TNR)', marker='o') + if 'TPR' in metrics_dict: + plt.plot(metrics_dict['TPR'], label='True Positive Rate (TPR)', marker='o') + if 'F1' in metrics_dict: + plt.plot(metrics_dict['F1'], label='F1 Score', marker='o') + if 'accuracy' in metrics_dict: + plt.plot(metrics_dict['accuracy'], label='Accuracy', marker='o') + if 'precision' in metrics_dict: + plt.plot(metrics_dict['precision'], label='Precision', marker='o') + if 'MCC' in metrics_dict: + plt.plot(metrics_dict['MCC'], label='Matthews Correlation Coefficient (MCC)', marker='o') + if 'recall' in metrics_dict: + plt.plot(metrics_dict['recall'], label='Recall (TPR)', marker='o') + + # Apply log scale by default + plt.yscale('log') + + # If the plot is close to 0, set custom ticks + if is_close_to_0: + # Add more ticks between 0 and 1 (using a logarithmic scale) + plt.yticks([0.01, 0.1, 1, 10, 100], ['0.01', '0.1', '1', '10', '100']) + + plt.xlabel('Index') + plt.ylabel('Metric Value') + plt.title(f'Evaluation Metrics Over Time ({output_filename.split("_")[2].replace(".png", "")})') + plt.legend() + + # Save the plot + plt.savefig(output_filename) + plt.close() + +def main(): + if len(sys.argv) != 2: + print("Usage: python script.py ") + sys.exit(1) + + file_path = sys.argv[1] + + FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values = process_file(file_path) + plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values) + +if __name__ == "__main__": + main() From 511291517c0ef8a3b791ba1accc72b83363e0425 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 18:20:52 +0000 Subject: [PATCH 114/498] Fix plots --- .../plot_testing_performance.py | 25 ++++++++++++------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/modules/flowmldetection/plot_testing_performance.py b/modules/flowmldetection/plot_testing_performance.py index 8f9e12cd8..69b8c96a8 100644 --- a/modules/flowmldetection/plot_testing_performance.py +++ b/modules/flowmldetection/plot_testing_performance.py @@ -71,11 +71,11 @@ def plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accu close_to_1['MCC'].append(MCC_values[i]) close_to_1['recall'].append(recall_values[i]) - # Plot metrics for values close to 0 + # Plot metrics for values close to 0 (linear scale) plot_single_group(close_to_0, 'metrics_plot_close_to_0.png', is_close_to_0=True) - # Plot metrics for values close to 1 - plot_single_group(close_to_1, 'metrics_plot_close_to_1.png') + # Plot metrics for values close to 1 (log scale) + plot_single_group(close_to_1, 'metrics_plot_close_to_1.png', is_close_to_0=False) # Print the final values print("\nFinal Metric Values:") @@ -112,14 +112,21 @@ def plot_single_group(metrics_dict, output_filename, is_close_to_0=False): if 'recall' in 
metrics_dict: plt.plot(metrics_dict['recall'], label='Recall (TPR)', marker='o') - # Apply log scale by default - plt.yscale('log') + # If the plot is close to 1, apply log scale + if not is_close_to_0: + plt.yscale('log') - # If the plot is close to 0, set custom ticks + # If the plot is close to 0, set dynamic Y-ticks based on the min/max values of the series if is_close_to_0: - # Manually set more Y-ticks for better visibility - plt.ylim(0.0001, 1) # Set Y-axis limits between 0.0001 and 1 - plt.yticks([0.0001, 0.001, 0.01, 0.1, 1], ['0.0001', '0.001', '0.01', '0.1', '1']) # Adjust Y-ticks + min_val = min(min(metrics_dict['FPR']), min(metrics_dict['FNR'])) + max_val = max(max(metrics_dict['FPR']), max(metrics_dict['FNR'])) + + # Avoid log(0), so set the minimum limit a little higher than zero + if min_val == 0: + min_val = 1e-4 # Avoid zero values on the logarithmic scale + + plt.ylim(min_val, max_val) # Set Y-axis limits based on the data range + plt.yticks(np.logspace(np.log10(min_val), np.log10(max_val), num=6)) # Set ticks logarithmically plt.xlabel('Index') plt.ylabel('Metric Value') From 17a9c9a356bc8cf489c80dcc736124a3dc22b7b9 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 18:23:02 +0000 Subject: [PATCH 115/498] Fix plots --- modules/flowmldetection/plot_testing_performance.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/flowmldetection/plot_testing_performance.py b/modules/flowmldetection/plot_testing_performance.py index 69b8c96a8..de4ada38b 100644 --- a/modules/flowmldetection/plot_testing_performance.py +++ b/modules/flowmldetection/plot_testing_performance.py @@ -123,10 +123,10 @@ def plot_single_group(metrics_dict, output_filename, is_close_to_0=False): # Avoid log(0), so set the minimum limit a little higher than zero if min_val == 0: - min_val = 1e-4 # Avoid zero values on the logarithmic scale + min_val = 1e-8 # Avoid zero values on the logarithmic scale plt.ylim(min_val, max_val) # Set Y-axis limits based on the data range - plt.yticks(np.logspace(np.log10(min_val), np.log10(max_val), num=6)) # Set ticks logarithmically + plt.yticks(np.logspace(np.log10(min_val), np.log10(max_val), num=60)) # Set ticks logarithmically plt.xlabel('Index') plt.ylabel('Metric Value') From 8561011f8b5d0d3a50932d6f1ff16d90b9986a18 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 18:25:58 +0000 Subject: [PATCH 116/498] Change plot names --- modules/flowmldetection/plot_testing_performance.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/flowmldetection/plot_testing_performance.py b/modules/flowmldetection/plot_testing_performance.py index de4ada38b..1b4152c6e 100644 --- a/modules/flowmldetection/plot_testing_performance.py +++ b/modules/flowmldetection/plot_testing_performance.py @@ -72,10 +72,10 @@ def plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accu close_to_1['recall'].append(recall_values[i]) # Plot metrics for values close to 0 (linear scale) - plot_single_group(close_to_0, 'metrics_plot_close_to_0.png', is_close_to_0=True) + plot_single_group(close_to_0, 'performance_metrics_testing_close_to_0.png', is_close_to_0=True) # Plot metrics for values close to 1 (log scale) - plot_single_group(close_to_1, 'metrics_plot_close_to_1.png', is_close_to_0=False) + plot_single_group(close_to_1, 'performnace_metrics_teting_close_to_1.png', is_close_to_0=False) # Print the final values print("\nFinal Metric Values:") From 75db21d8225a7e8ad9ae41e33b1f64f6e1ccf598 Mon Sep 17 
00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 18:26:09 +0000 Subject: [PATCH 117/498] Rename file --- .../{plot_train_score.py => plot_train_performance.py} | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) rename modules/flowmldetection/{plot_train_score.py => plot_train_performance.py} (97%) diff --git a/modules/flowmldetection/plot_train_score.py b/modules/flowmldetection/plot_train_performance.py similarity index 97% rename from modules/flowmldetection/plot_train_score.py rename to modules/flowmldetection/plot_train_performance.py index 8437e968a..80e13e951 100644 --- a/modules/flowmldetection/plot_train_score.py +++ b/modules/flowmldetection/plot_train_performance.py @@ -28,7 +28,7 @@ def plot_log_data(file_path): # Get the directory of the log file to store the plot in the same folder dir_name = os.path.dirname(file_path) - plot_file = os.path.join(dir_name, 'log_data_plot_with_two_scales.png') + plot_file = os.path.join(dir_name, 'performance_metrics_training.png') # Plotting the values fig, ax1 = plt.subplots(figsize=(10, 6)) From 4a16fd6ebe7893df77dd14898c3270a989193e21 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 18:31:32 +0000 Subject: [PATCH 118/498] Recover good flowmldetection deleted by mistake --- modules/flowmldetection/flowmldetection.py | 709 ++++++++++++++++----- 1 file changed, 566 insertions(+), 143 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 37f076110..5e4e9aa46 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -1,143 +1,566 @@ -import matplotlib.pyplot as plt -import sys -import numpy as np - -def process_file(file_path): - # Initialize the counters for the values - FPR_values = [] - FNR_values = [] - TNR_values = [] - TPR_values = [] - F1_values = [] - accuracy_values = [] - precision_values = [] - MCC_values = [] - recall_values = [] - - # Read the file and extract the data - with open(file_path, 'r') as file: - for line in file: - if "TP:" in line: - # Extract the values from the line - parts = line.split(',') - TP = int(parts[0].split(':')[1].strip()) - TN = int(parts[1].split(':')[1].strip()) - FP = int(parts[2].split(':')[1].strip()) - FN = int(parts[3].split(':')[1].strip()) - - # Calculate metrics - FPR = FP / (FP + TN) if (FP + TN) != 0 else 0 - FNR = FN / (FN + TP) if (FN + TP) != 0 else 0 - TNR = TN / (TN + FP) if (TN + FP) != 0 else 0 - TPR = TP / (TP + FN) if (TP + FN) != 0 else 0 - Precision = TP / (TP + FP) if (TP + FP) != 0 else 0 - Recall = TPR # Recall is the same as TPR - F1 = 2 * (Precision * Recall) / (Precision + Recall) if (Precision + Recall) != 0 else 0 - Accuracy = (TP + TN) / (TP + TN + FP + FN) - MCC = ((TP * TN) - (FP * FN)) / np.sqrt((TP + FP) * (TP + FN) * (TN + FP) * (TN + FN)) if ((TP + FP) * (TP + FN) * (TN + FP) * (TN + FN)) != 0 else 0 - - # Append the values to the respective lists - FPR_values.append(FPR) - FNR_values.append(FNR) - TNR_values.append(TNR) - TPR_values.append(TPR) - F1_values.append(F1) - accuracy_values.append(Accuracy) - precision_values.append(Precision) - MCC_values.append(MCC) - recall_values.append(Recall) - - return FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values - -def plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values): - # Separate the values into two groups based on their proximity to 0 or 
1 - close_to_0 = { - 'FPR': [], 'FNR': [] - } - close_to_1 = { - 'TNR': [], 'TPR': [], 'F1': [], 'accuracy': [], 'precision': [], 'MCC': [], 'recall': [] - } - - # Categorize the metrics into two groups - for i in range(len(FPR_values)): - close_to_0['FPR'].append(FPR_values[i]) - close_to_0['FNR'].append(FNR_values[i]) - - close_to_1['TNR'].append(TNR_values[i]) - close_to_1['TPR'].append(TPR_values[i]) - close_to_1['F1'].append(F1_values[i]) - close_to_1['accuracy'].append(accuracy_values[i]) - close_to_1['precision'].append(precision_values[i]) - close_to_1['MCC'].append(MCC_values[i]) - close_to_1['recall'].append(recall_values[i]) - - # Plot metrics for values close to 0 - plot_single_group(close_to_0, 'metrics_plot_close_to_0.png', is_close_to_0=True) - - # Plot metrics for values close to 1 - plot_single_group(close_to_1, 'metrics_plot_close_to_1.png') - - # Print the final values - print("\nFinal Metric Values:") - print(f"Final FPR: {FPR_values[-1]:.4f}") - print(f"Final FNR: {FNR_values[-1]:.4f}") - print(f"Final TNR: {TNR_values[-1]:.4f}") - print(f"Final TPR: {TPR_values[-1]:.4f}") - print(f"Final F1 Score: {F1_values[-1]:.4f}") - print(f"Final Accuracy: {accuracy_values[-1]:.4f}") - print(f"Final Precision: {precision_values[-1]:.4f}") - print(f"Final MCC: {MCC_values[-1]:.4f}") - print(f"Final Recall: {recall_values[-1]:.4f}") - -def plot_single_group(metrics_dict, output_filename, is_close_to_0=False): - plt.figure(figsize=(12, 8)) - - # Only plot the metrics that exist in the dictionary - if 'FPR' in metrics_dict: - plt.plot(metrics_dict['FPR'], label='False Positive Rate (FPR)', marker='o') - if 'FNR' in metrics_dict: - plt.plot(metrics_dict['FNR'], label='False Negative Rate (FNR)', marker='o') - if 'TNR' in metrics_dict: - plt.plot(metrics_dict['TNR'], label='True Negative Rate (TNR)', marker='o') - if 'TPR' in metrics_dict: - plt.plot(metrics_dict['TPR'], label='True Positive Rate (TPR)', marker='o') - if 'F1' in metrics_dict: - plt.plot(metrics_dict['F1'], label='F1 Score', marker='o') - if 'accuracy' in metrics_dict: - plt.plot(metrics_dict['accuracy'], label='Accuracy', marker='o') - if 'precision' in metrics_dict: - plt.plot(metrics_dict['precision'], label='Precision', marker='o') - if 'MCC' in metrics_dict: - plt.plot(metrics_dict['MCC'], label='Matthews Correlation Coefficient (MCC)', marker='o') - if 'recall' in metrics_dict: - plt.plot(metrics_dict['recall'], label='Recall (TPR)', marker='o') - - # Apply log scale by default - plt.yscale('log') - - # If the plot is close to 0, set custom ticks - if is_close_to_0: - # Add more ticks between 0 and 1 (using a logarithmic scale) - plt.yticks([0.01, 0.1, 1, 10, 100], ['0.01', '0.1', '1', '10', '100']) - - plt.xlabel('Index') - plt.ylabel('Metric Value') - plt.title(f'Evaluation Metrics Over Time ({output_filename.split("_")[2].replace(".png", "")})') - plt.legend() - - # Save the plot - plt.savefig(output_filename) - plt.close() - -def main(): - if len(sys.argv) != 2: - print("Usage: python script.py ") - sys.exit(1) - - file_path = sys.argv[1] - - FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values = process_file(file_path) - plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values) - -if __name__ == "__main__": - main() +# SPDX-FileCopyrightText: 2021 Sebastian Garcia +from typing import Optional + +# SPDX-License-Identifier: GPL-2.0-only +import numpy +from 
sklearn.linear_model import SGDClassifier +from sklearn.preprocessing import StandardScaler +import pickle +import pandas as pd +import json +import traceback +import warnings +import os + +from slips_files.common.parsers.config_parser import ConfigParser +from slips_files.common.slips_utils import utils +from slips_files.common.abstracts.module import IModule +from slips_files.core.structures.evidence import ( + Evidence, + ProfileID, + TimeWindow, + Attacker, + ThreatLevel, + EvidenceType, + IoCType, + Direction, + Victim, + Method, +) + +# This horrible hack is only to stop sklearn from printing those warnings +def warn(*args, **kwargs): + pass + + +warnings.warn = warn + + +class FlowMLDetection(IModule): + # Name: short name of the module. Do not use spaces + name = "Flow ML Detection" + description = ( + "Train or test a Machine Learning model to detect malicious flows" + ) + authors = ["Sebastian Garcia"] + + def init(self): + # Subscribe to the channel + self.c1 = self.db.subscribe("new_flow") + self.channels = {"new_flow": self.c1} + self.fieldseparator = self.db.get_field_separator() + # Set the output queue of our database instance + # Read the configuration + self.read_configuration() + # Minum amount of new labels needed to start the train + self.minimum_labels_to_start_train = 50 + # Minum amount of new labels needed to retrain + self.minimum_labels_to_retrain = 50 + # The number of flows when last trained. Used internally only to know + # when to retrain + self.last_number_of_flows_when_trained = 0 + # The scaler trained during training and to use during testing + self.scaler = StandardScaler() + self.model_path = "./modules/flowmldetection/model.bin" + self.scaler_path = "./modules/flowmldetection/scaler.bin" + + # Initialize the training log file + self.training_log_path = "./modules/flowmldetection/training.log" + with open(self.training_log_path, "w") as log_file: + log_file.write("Training Log Initialized\n") + + def read_configuration(self): + conf = ConfigParser() + self.mode = conf.get_ml_mode() + # This is the global label in the configuration, + # in case the flows do not have a label themselves + self.label = conf.label() + + def write_to_training_log(self, message: str): + """ + Write a message to the training log file. + """ + try: + with open(self.training_log_path, "a") as log_file: + log_file.write(message + "\n") + except Exception as e: + self.print(f"Error writing to training log: {e}", 0, 1) + + def train(self, sum_labeled_flows): + """ + Train a model based on the flows we receive and the labels + """ + try: + # Create X_flow with the current flows minus the label + X_flow = self.flows.drop("ground_truth_label", axis=1) + # Drop the detailed labels + X_flow = X_flow.drop("detailed_ground_truth_label", axis=1) + # Drop the module_labels + X_flow = X_flow.drop("module_labels", axis=1) + # Create y_flow with the label + y_flow = numpy.full(X_flow.shape[0], self.flows.ground_truth_label) + + # Normalize this batch of data so far. 
This can get progressivle slow + X_flow = self.scaler.fit_transform(X_flow) + + # Count the number of labels of each type in this epoc + epoch_label_counts = { + "Background": (y_flow == "Background").sum(), + "Malicious": (y_flow == "Malicious").sum(), + "Benign": (y_flow == "Benign").sum(), + } + + # Train + try: + # Online incremental learning + self.clf.partial_fit( + X_flow, y_flow, classes=["Background", "Malicious", "Benign"] + ) + except Exception: + self.print("Error while calling clf.train()") + self.print(traceback.format_exc(), 0, 1) + + # See score so far in training + score = self.clf.score(X_flow, y_flow) + + #self.print(f" Training Score: {score}", 1, 0) + #self.print(f' Model Parameters: {self.clf.coef_}', 1, 0) + + # Store the models on disk + self.store_model() + + # Log training information + self.write_to_training_log(f"Training completed. Background: {epoch_label_counts['Background']}. Benign: {epoch_label_counts['Benign']}. Malicious: {epoch_label_counts['Malicious']}. Total labels: {sum_labeled_flows}. Score: {score}") + #self.write_to_training_log(f"Model parameters: {self.clf.coef_}") + except Exception: + self.print("Error in train().", 0, 1) + self.print(traceback.format_exc(), 0, 1) + self.write_to_training_log("Error occurred during training.") + + def process_features(self, dataset): + """ + Discards some features of the dataset and can create new. + Clean the dataset + """ + try: + # Discard some type of flows that dont have ports + to_discard = ["arp", "ARP", "icmp", "igmp", "ipv6-icmp", ""] + for proto in to_discard: + dataset = dataset[dataset.proto != proto] + + # If te proto is in the list to delete and there is only one flow, then the dataset will be empty + if dataset.empty: + # DataFrame is empty now, so return empty + return dataset + + # For now, discard these + to_drop = [ + "appproto", + "daddr", + "saddr", + "starttime", + "type_", + "smac", + "dmac", + "history", + "uid", + "dir_", + "endtime", + "flow_source", + ] + for field in to_drop: + try: + dataset = dataset.drop(field, axis=1) + except (ValueError, KeyError): + pass + + # When flows are read from Slips sqlite, + # the state is not transformed to 'Established' or + # 'Not Established', it is still 'S0' and others + # So transform here + dataset["state"] = dataset.apply( + lambda row: self.db.get_final_state_from_flags( + row["state"], (row["spkts"] + row["dpkts"]) + ), + axis=1, + ) + + # Convert state to categorical + dataset.state = dataset.state.str.replace( + r"(^.*Not Established.*$)", "0", regex=True + ) + dataset.state = dataset.state.str.replace( + r"(^.*Established.*$)", "1", regex=True + ) + + # Convert categories to floats + dataset.state = dataset.state.astype("float64") + + # Convert proto to categorical. For now we only have few states, so we can hardcode... + # We dont use the data to create categories because in testing mode + # we dont see all the protocols + # Also we dont store the Categorizer because the user can retrain + # with its own data. 
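+            # The regex replacements below amount to a fixed ordinal
+            # encoding of the protocol: tcp -> 0, udp -> 1, icmp -> 2,
+            # icmp-ipv6 -> 3, arp -> 4. A rough equivalent sketch
+            # (illustrative only, not what this module executes):
+            #   proto_map = {"tcp": "0", "udp": "1", "icmp": "2",
+            #                "icmp-ipv6": "3", "arp": "4"}
+            #   dataset.proto = dataset.proto.str.lower().map(proto_map)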
+ dataset.proto = dataset.proto.str.lower() + dataset.proto = dataset.proto.str.replace( + r"(^.*tcp.*$)", "0", regex=True + ) + dataset.proto = dataset.proto.str.replace( + r"(^.*udp.*$)", "1", regex=True + ) + dataset.proto = dataset.proto.str.replace( + r"(^.*icmp.*$)", "2", regex=True + ) + dataset.proto = dataset.proto.str.replace( + r"(^.*icmp-ipv6.*$)", "3", regex=True + ) + dataset.proto = dataset.proto.str.replace( + r"(^.*arp.*$)", "4", regex=True + ) + + dataset["allbytes"] = dataset["sbytes"] + dataset["dbytes"] + dataset["pkts"] = dataset["spkts"] + dataset["dpkts"] + + fields_to_convert_to_float = [ + dataset.proto, + dataset.dport, + dataset.sport, + dataset.dur, + dataset.pkts, + dataset.spkts, + dataset.allbytes, + dataset.sbytes, + dataset.state, + ] + for field in fields_to_convert_to_float: + try: + field = field.astype("float64") + except (ValueError, AttributeError): + pass + + return dataset + except Exception: + # Stop the timer + self.print("Error in process_features()") + self.print(traceback.format_exc(), 0, 1) + + def process_training_flows(self, last_number_of_flows_when_trained): + """ + Process only the new flows in the DB since the last training. + Store the pandas df in self.flows + """ + try: + # Ensure the index is an integer + if last_number_of_flows_when_trained is None: + last_number_of_flows_when_trained = 0 + else: + last_number_of_flows_when_trained = int(last_number_of_flows_when_trained) + + # We get all the flows so far + flows = self.db.get_all_flows() + # Only process new flows since last training + new_flows = flows[last_number_of_flows_when_trained:] + + # Check how many **different** labels are in the DB + labels = self.db.get_labels() + if len(labels) == 1: + # Insert fake flows for both classes if needed + new_flows.append( + { + "starttime": 1594417039.029793, + "dur": "1.9424750804901123", + "saddr": "10.7.10.101", + "sport": "49733", + "daddr": "40.70.224.145", + "dport": "443", + "proto": "tcp", + "state": "SF", + "spkts": 17, + "dpkts": 27, + "sbytes": 25517, + "dbytes": 17247, + "appproto": "ssl", + "ground_truth_label": "Malicious", + "module_labels": { + "flowalerts-long-connection": "Malicious" + }, + } + ) + new_flows.append( + { + "starttime": 1382355032.706468, + "dur": "10.896695", + "saddr": "147.32.83.52", + "sport": "47956", + "daddr": "80.242.138.72", + "dport": "80", + "proto": "tcp", + "state": "SF", + "spkts": 1, + "dpkts": 0, + "sbytes": 100, + "dbytes": 67596, + "appproto": "http", + "ground_truth_label": "Benign", + "module_labels": { + "flowalerts-long-connection": "Benign" + }, + } + ) + + # Convert to pandas df + df_flows = pd.DataFrame(new_flows) + + # Process features + df_flows = self.process_features(df_flows) + + # Update the flow to the processed version + self.flows = df_flows + except Exception: + self.print("Error in process_flows()") + self.print(traceback.format_exc(), 0, 1) + + def process_flow(self, flow_to_process: dict): + """ + Process one flow. 
Only used during detection in testing + returns the pandas df with the processed flow + """ + try: + # Convert the flow to a pandas dataframe + raw_flow = pd.DataFrame(flow_to_process, index=[0]) + dflow = self.process_features(raw_flow) + if dflow.empty: + return None + # Update the flow to the processed version + return dflow + except Exception: + # Stop the timer + self.print("Error in process_flow()") + self.print(traceback.format_exc(), 0, 1) + + def detect(self, x_flow) -> Optional[numpy.ndarray]: + """ + Detects the given flow with the current model stored + and returns the predection array + """ + try: + # clean the flow + fields_to_drop = [ + "label", + "module_labels", + "uid", + "history", + "dir_", + "endtime", + "flow_source", + "ground_truth_label", + "detailed_ground_truth_label", + ] + # For argus binetflows this fails because ther is a field calle bytes that was not in other flows. It should be called allbytes. + # Error + ''' [Flow ML Detection] Error in detect() while processing + dur proto sport dport state pkts spkts dpkts bytes sbytes dbytes allbytes + 0 63.822830 0 56119 981 0.0 15 15 0 8764 1887 0 1887 + The feature names should match those that were passed during fit. + Feature names unseen at fit time: + - bytes + ''' + + # IF we delete here the filed bytes the error is + # [Flow ML Detection] Error in detect() while processing + # dur proto sport dport state pkts spkts dpkts sbytes dbytes allbytes + # 0 63.822830 0 56120 980 0.0 15 15 0 1887 0 1887 + # The feature names should match those that were passed during fit. + # Feature names must be in the same order as they were in fit. + + for field in fields_to_drop: + try: + x_flow = x_flow.drop(field, axis=1) + except (KeyError, ValueError): + pass + # Scale the flow + x_flow: numpy.ndarray = self.scaler.transform(x_flow) + pred: numpy.ndarray = self.clf.predict(x_flow) + return pred + except Exception as e: + self.print( + f"Error in detect() while processing " f"\n{x_flow}\n{e}" + ) + self.print(traceback.format_exc(), 0, 1) + + def store_model(self): + """ + Store the trained model on disk + """ + self.print("Storing the trained model and scaler on disk.", 0, 2) + with open(self.model_path, "wb") as f: + data = pickle.dumps(self.clf) + f.write(data) + with open(self.scaler_path, "wb") as g: + data = pickle.dumps(self.scaler) + g.write(data) + + def read_model(self): + """ + Read the trained model from disk + """ + try: + self.print("Reading the trained model from disk.", 0, 2) + with open(self.model_path, "rb") as f: + self.clf = pickle.load(f) + self.print("Reading the trained scaler from disk.", 0, 2) + with open(self.scaler_path, "rb") as g: + self.scaler = pickle.load(g) + except FileNotFoundError: + # If there is no model, create one empty + self.print( + "There was no model. " "Creating a new empty model.", 0, 2 + ) + self.clf = SGDClassifier( + warm_start=True, loss="hinge", penalty="l1" + ) + except EOFError: + self.print( + "Error reading model from disk. " + "Creating a new empty model.", + 0, + 2, + ) + self.clf = SGDClassifier( + warm_start=True, loss="hinge", penalty="l1" + ) + + def set_evidence_malicious_flow(self, flow: dict, twid: str): + confidence: float = 0.1 + description = ( + f"Flow with malicious characteristics by ML. 
Src IP" + f" {flow['saddr']}:{flow['sport']} to " + f"{flow['daddr']}:{flow['dport']}" + ) + twid_number = int(twid.replace("timewindow", "")) + evidence: Evidence = Evidence( + evidence_type=EvidenceType.MALICIOUS_FLOW, + attacker=Attacker( + direction=Direction.SRC, + ioc_type=IoCType.IP, + value=flow["saddr"], + ), + victim=Victim( + direction=Direction.DST, + ioc_type=IoCType.IP, + value=flow["daddr"], + ), + threat_level=ThreatLevel.LOW, + confidence=confidence, + description=description, + profile=ProfileID(ip=flow["saddr"]), + timewindow=TimeWindow(twid_number), + uid=[flow["uid"]], + timestamp=flow["starttime"], + method=Method.AI, + src_port=flow["sport"], + dst_port=flow["dport"], + ) + + self.db.set_evidence(evidence) + + def shutdown_gracefully(self): + # Confirm that the module is done processing + if self.mode == "train": + self.store_model() + + def pre_main(self): + utils.drop_root_privs() + # Load the model + self.read_model() + + def main(self): + if msg := self.get_msg("new_flow"): + # When a new flow arrives + msg = json.loads(msg["data"]) + self.twid = msg["twid"] + self.profileid = msg["profileid"] + self.flow = msg["flow"] + # These following extra fields are expected in testing. update the original + # flow dict to have them + self.flow.update( + { + "state": msg["interpreted_state"], + "label": msg["label"], + "module_labels": msg["module_labels"], + } + ) + + if self.mode == "train": + # We are training + + # Is the amount in the DB of labels enough to retrain? + # Use labeled flows + labels = self.db.get_labels() + sum_labeled_flows = sum(i[1] for i in labels) + + # The min labels to retrain is the min number of flows + # we should have seen so far in this capture to start training + # This is so we dont _start_ training with only 1 flow + + # Once we are over the start minimum, the second condition is + # to force to retrain every a minimum_labels_to_retrain number + # of flows. So we dont retrain every 1 flow. + if ( + sum_labeled_flows >= self.minimum_labels_to_start_train + ): + if (sum_labeled_flows - self.last_number_of_flows_when_trained >= self.minimum_labels_to_retrain): + # So for example we retrain every 50 labels and only when + # we have at least 50 labels + self.print( + f"Training the model with the last group of " + f"flows and labels. Total flows: {sum_labeled_flows}." + ) + # Process all flows in the DB and make them ready + # for pandas + self.process_training_flows(self.last_number_of_flows_when_trained) + # Train an algorithm + self.train(sum_labeled_flows) + self.last_number_of_flows_when_trained = sum_labeled_flows + + elif self.mode == "test": + # We are testing, which means using the model to detect + processed_flow = self.process_flow(self.flow) + + # After processing the flow, it may happen that we + # delete icmp/arp/etc so the dataframe can be empty + if processed_flow is not None and not processed_flow.empty: + # Predict + pred: numpy.ndarray = self.detect(processed_flow) + if not pred: + # an error occurred + return + + label = self.flow["label"] + if label and label != "unknown" and label != pred[0]: + # If the user specified a label in test mode, + # and the label is diff from the prediction, + # print in debug mode + self.print( + f"Predicted {pred[0]} for ground-truth label" + f' {label}. 
Flow {self.flow["saddr"]}:' + f'{self.flow["sport"]} ->' + f' {self.flow["daddr"]}:' + f'{self.flow["dport"]}/' + f'{self.flow["proto"]}', + 0, + 3, + ) + if pred[0] == "Malicious": + # Generate an alert + self.set_evidence_malicious_flow(self.flow, self.twid) + self.print( + f"Prediction {pred[0]} for label {label}" + f' flow {self.flow["saddr"]}:' + f'{self.flow["sport"]} -> ' + f'{self.flow["daddr"]}:' + f'{self.flow["dport"]}/' + f'{self.flow["proto"]}', + 0, + 2, + ) \ No newline at end of file From 1fcb086b1a756442a338e39e63634b1c95402d21 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 18:31:43 +0000 Subject: [PATCH 119/498] Fix plot test --- modules/flowmldetection/plot_testing_performance.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/modules/flowmldetection/plot_testing_performance.py b/modules/flowmldetection/plot_testing_performance.py index 1b4152c6e..977a68b2d 100644 --- a/modules/flowmldetection/plot_testing_performance.py +++ b/modules/flowmldetection/plot_testing_performance.py @@ -75,7 +75,7 @@ def plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accu plot_single_group(close_to_0, 'performance_metrics_testing_close_to_0.png', is_close_to_0=True) # Plot metrics for values close to 1 (log scale) - plot_single_group(close_to_1, 'performnace_metrics_teting_close_to_1.png', is_close_to_0=False) + plot_single_group(close_to_1, 'performance_metrics_testing_close_to_1.png', is_close_to_0=False) # Print the final values print("\nFinal Metric Values:") @@ -123,10 +123,10 @@ def plot_single_group(metrics_dict, output_filename, is_close_to_0=False): # Avoid log(0), so set the minimum limit a little higher than zero if min_val == 0: - min_val = 1e-8 # Avoid zero values on the logarithmic scale + min_val = 1e-4 # Avoid zero values on the logarithmic scale plt.ylim(min_val, max_val) # Set Y-axis limits based on the data range - plt.yticks(np.logspace(np.log10(min_val), np.log10(max_val), num=60)) # Set ticks logarithmically + plt.yticks(np.logspace(np.log10(min_val), np.log10(max_val), num=6)) # Set ticks logarithmically plt.xlabel('Index') plt.ylabel('Metric Value') From 7b18a530e0525f810109cc4ea78138707a588d24 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 18:50:33 +0000 Subject: [PATCH 120/498] Add testing code to evaluate performance. It is optional with a varible --- modules/flowmldetection/flowmldetection.py | 60 +++++++++++++++------- 1 file changed, 42 insertions(+), 18 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 5e4e9aa46..b17a1baaf 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -526,36 +526,21 @@ def main(self): elif self.mode == "test": # We are testing, which means using the model to detect processed_flow = self.process_flow(self.flow) - # After processing the flow, it may happen that we # delete icmp/arp/etc so the dataframe can be empty if processed_flow is not None and not processed_flow.empty: + original_label = processed_flow["ground_truth_label"].iloc[0] # Predict pred: numpy.ndarray = self.detect(processed_flow) if not pred: # an error occurred return - label = self.flow["label"] - if label and label != "unknown" and label != pred[0]: - # If the user specified a label in test mode, - # and the label is diff from the prediction, - # print in debug mode - self.print( - f"Predicted {pred[0]} for ground-truth label" - f' {label}. 
Flow {self.flow["saddr"]}:' - f'{self.flow["sport"]} ->' - f' {self.flow["daddr"]}:' - f'{self.flow["dport"]}/' - f'{self.flow["proto"]}', - 0, - 3, - ) if pred[0] == "Malicious": # Generate an alert self.set_evidence_malicious_flow(self.flow, self.twid) self.print( - f"Prediction {pred[0]} for label {label}" + f"Prediction {pred[0]} for label {original_label}" f' flow {self.flow["saddr"]}:' f'{self.flow["sport"]} -> ' f'{self.flow["daddr"]}:' @@ -563,4 +548,43 @@ def main(self): f'{self.flow["proto"]}', 0, 2, - ) \ No newline at end of file + ) + + # So you can disable this code easily. Since it is used only for evaluating a testing + log_testing_data = True + if log_testing_data: + # Initialize counters if not already done + if not hasattr(self, 'tp'): + self.tp = 0 + if not hasattr(self, 'tn'): + self.tn = 0 + if not hasattr(self, 'fp'): + self.fp = 0 + if not hasattr(self, 'fn'): + self.fn = 0 + + + # Update counters based on predictions and labels + if pred[0] == "Malicious" and original_label == "Malicious": + self.tp += 1 + elif pred[0] == "Benign" and original_label == "Benign": + self.tn += 1 + elif pred[0] == "Malicious" and original_label == "Benign": + self.fp += 1 + elif pred[0] == "Benign" and original_label == "Malicious": + self.fn += 1 + + testing_log_path = "./modules/flowmldetection/testing_performance.log" + try: + with open(testing_log_path, "a") as log_file: + log_file.write("Testing Performance Log Initialized\n") + # Log the testing performance metrics + log_file.write(f"TP: {self.tp}, TN: {self.tn}, FP: {self.fp}, FN: {self.fn}\n") + + # Log the original flow for false positives and false negatives + if pred[0] == "Malicious" and original_label == "Benign": + log_file.write(f"False Positive Flow: {self.flow}\n") + elif pred[0] == "Benign" and original_label == "Malicious": + log_file.write(f"False Negative Flow: {self.flow}\n") + except Exception as e: + self.print(f"Error initializing testing performance log: {e}", 0, 1) \ No newline at end of file From 4e8cbda03b1b8c7357c818f241b53b67afc86567 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 19:04:00 +0000 Subject: [PATCH 121/498] Fix plots --- .../plot_testing_performance.py | 30 +++++++++++-------- 1 file changed, 18 insertions(+), 12 deletions(-) diff --git a/modules/flowmldetection/plot_testing_performance.py b/modules/flowmldetection/plot_testing_performance.py index 977a68b2d..6865415cd 100644 --- a/modules/flowmldetection/plot_testing_performance.py +++ b/modules/flowmldetection/plot_testing_performance.py @@ -1,6 +1,7 @@ import matplotlib.pyplot as plt import sys import numpy as np +import argparse def process_file(file_path): # Initialize the counters for the values @@ -49,7 +50,7 @@ def process_file(file_path): return FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values -def plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values): +def plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values, experiment_number): # Separate the values into two groups based on their proximity to 0 or 1 close_to_0 = { 'FPR': [], 'FNR': [] @@ -72,13 +73,13 @@ def plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accu close_to_1['recall'].append(recall_values[i]) # Plot metrics for values close to 0 (linear scale) - plot_single_group(close_to_0, 
'performance_metrics_testing_close_to_0.png', is_close_to_0=True) + plot_single_group(close_to_0, f'performance_metrics_testing_close_to_0_experiment_{experiment_number}.png', experiment_number, is_close_to_0=True) # Plot metrics for values close to 1 (log scale) - plot_single_group(close_to_1, 'performance_metrics_testing_close_to_1.png', is_close_to_0=False) + plot_single_group(close_to_1, f'performance_metrics_testing_close_to_1_experiment_{experiment_number}.png', experiment_number, is_close_to_0=False) # Print the final values - print("\nFinal Metric Values:") + print("\nFinal Metric Values for Experiment", experiment_number) print(f"Final FPR: {FPR_values[-1]:.4f}") print(f"Final FNR: {FNR_values[-1]:.4f}") print(f"Final TNR: {TNR_values[-1]:.4f}") @@ -89,7 +90,7 @@ def plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accu print(f"Final MCC: {MCC_values[-1]:.4f}") print(f"Final Recall: {recall_values[-1]:.4f}") -def plot_single_group(metrics_dict, output_filename, is_close_to_0=False): +def plot_single_group(metrics_dict, output_filename, experiment_number, is_close_to_0=False): plt.figure(figsize=(12, 8)) # Only plot the metrics that exist in the dictionary @@ -126,11 +127,12 @@ def plot_single_group(metrics_dict, output_filename, is_close_to_0=False): min_val = 1e-4 # Avoid zero values on the logarithmic scale plt.ylim(min_val, max_val) # Set Y-axis limits based on the data range - plt.yticks(np.logspace(np.log10(min_val), np.log10(max_val), num=6)) # Set ticks logarithmically + plt.yticks(np.logspace(np.log10(min_val), np.log10(max_val), num=60)) # Set ticks logarithmically + # Add the experiment number to the plot title plt.xlabel('Index') plt.ylabel('Metric Value') - plt.title(f'Evaluation Metrics Over Time ({output_filename.split("_")[2].replace(".png", "")})') + plt.title(f'Experiment {experiment_number} - Evaluation Metrics Over Time') plt.legend() # Save the plot @@ -138,14 +140,18 @@ def plot_single_group(metrics_dict, output_filename, is_close_to_0=False): plt.close() def main(): - if len(sys.argv) != 2: - print("Usage: python script.py ") - sys.exit(1) + # Set up argument parsing + parser = argparse.ArgumentParser(description='Plot testing performance metrics.') + parser.add_argument('-f', '--file', type=str, required=True, help='Path to the testing performance log file') + parser.add_argument('-e', '--experiment', type=str, required=True, help='Experiment number') + + args = parser.parse_args() - file_path = sys.argv[1] + file_path = args.file + experiment_number = args.experiment FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values = process_file(file_path) - plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values) + plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values, experiment_number) if __name__ == "__main__": main() From d4cc5625cb18e8207c7aa6e1a42a5a88e3d57134 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 19:14:51 +0000 Subject: [PATCH 122/498] Fix train plot --- .../flowmldetection/plot_train_performance.py | 22 ++++++++++--------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/modules/flowmldetection/plot_train_performance.py b/modules/flowmldetection/plot_train_performance.py index 80e13e951..244df13d2 100644 --- a/modules/flowmldetection/plot_train_performance.py +++ 
b/modules/flowmldetection/plot_train_performance.py @@ -5,7 +5,7 @@ import argparse import os -def plot_log_data(file_path): +def plot_log_data(file_path, experiment_number): # Read the log data from the file with open(file_path, 'r') as file: log_data = file.read() @@ -28,7 +28,8 @@ def plot_log_data(file_path): # Get the directory of the log file to store the plot in the same folder dir_name = os.path.dirname(file_path) - plot_file = os.path.join(dir_name, 'performance_metrics_training.png') + # Append experiment number to the filename + plot_file = os.path.join(dir_name, f'performance_metrics_training_{experiment_number}.png') # Plotting the values fig, ax1 = plt.subplots(figsize=(10, 6)) @@ -55,18 +56,18 @@ def plot_log_data(file_path): for i, value in enumerate(df["Total labels"]): ax1.text(i, value, f'{value:.1f}', color='tab:gray', fontsize=8, ha='center', va='bottom') - # Adding title and legend - plt.title('Training performance') + # Adding title and legend with experiment number in title + plt.title(f'Training performance - Experiment {experiment_number}') fig.tight_layout() # Move both legends further to the right - ax1.legend(loc='upper right', bbox_to_anchor=(1.26, 1), fontsize='small', ncol=1) - ax2.legend(loc='upper right', bbox_to_anchor=(1.4, 0.95), fontsize='small', ncol=1) + ax1.legend(loc='upper right', bbox_to_anchor=(1.3, 1), fontsize='small', ncol=1) + ax2.legend(loc='upper right', bbox_to_anchor=(1.3, 0.85), fontsize='small', ncol=1) # Increase right margin for better readability of legend - plt.subplots_adjust(right=0.7) + plt.subplots_adjust(right=0.75) - # Save plot to the same folder as the log file + # Save plot to the same folder as the log file with experiment number in filename plt.savefig(plot_file) # Display the plot @@ -75,13 +76,14 @@ def plot_log_data(file_path): def main(): # Parse command-line arguments parser = argparse.ArgumentParser(description="Process a log file and plot the data with two y-axes.") - parser.add_argument('log_file', metavar='log_file', type=str, help="Path to the log file") + parser.add_argument('-f', '--file', metavar='log_file', type=str, required=True, help="Path to the log file") + parser.add_argument('-e', '--experiment', metavar='experiment_number', type=str, required=True, help="Experiment number to add to the filename") # Handle -h / --help args = parser.parse_args() # Call the function to process the log file - plot_log_data(args.log_file) + plot_log_data(args.file, args.experiment) if __name__ == "__main__": main() From d3b0190e39beb89cffaf8ad51a2cec0d787f7920 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 21:14:48 +0000 Subject: [PATCH 123/498] Fix plots --- .../flowmldetection/plot_train_performance.py | 122 ++++++++++-------- 1 file changed, 71 insertions(+), 51 deletions(-) diff --git a/modules/flowmldetection/plot_train_performance.py b/modules/flowmldetection/plot_train_performance.py index 244df13d2..5212dfeea 100644 --- a/modules/flowmldetection/plot_train_performance.py +++ b/modules/flowmldetection/plot_train_performance.py @@ -4,85 +4,105 @@ import sys import argparse import os +import matplotlib.ticker as ticker def plot_log_data(file_path, experiment_number): # Read the log data from the file with open(file_path, 'r') as file: log_data = file.read() - # Define regex pattern to extract relevant data from each line - pattern = r"Background: (\d+). Benign: (\d+). Malicious: (\d+). Total labels: (\d+\.\d+). 
Score: (\d+\.\d+)" + # Regex pattern for the new log format + pattern = ( + r"Total labels: ([\d\.]+), Background: (\d+). Benign: (\d+). Malicious: (\d+). Metrics: " + r"FPR=([\d\.]+), TNR=([\d\.]+), TPR=([\d\.]+), FNR=([\d\.]+), " + r"F1=([\d\.]+), Precision=([\d\.]+), Accuracy=([\d\.]+), MCC=([\d\.]+), Recall=([\d\.]+)\." + ) # Parse the log file data = re.findall(pattern, log_data) # Convert data to a DataFrame - df = pd.DataFrame(data, columns=["Background", "Benign", "Malicious", "Total labels", "Score"]) + columns = [ + "Total labels", "Background", "Benign", "Malicious", + "FPR", "TNR", "TPR", "FNR", "F1", "Precision", "Accuracy", "MCC", "Recall" + ] + df = pd.DataFrame(data, columns=columns) df = df.astype({ + "Total labels": float, "Background": int, "Benign": int, "Malicious": int, - "Total labels": float, - "Score": float + "FPR": float, + "TNR": float, + "TPR": float, + "FNR": float, + "F1": float, + "Precision": float, + "Accuracy": float, + "MCC": float, + "Recall": float, }) - # Get the directory of the log file to store the plot in the same folder dir_name = os.path.dirname(file_path) - # Append experiment number to the filename - plot_file = os.path.join(dir_name, f'performance_metrics_training_{experiment_number}.png') - - # Plotting the values - fig, ax1 = plt.subplots(figsize=(10, 6)) - # Plotting Score on the left y-axis (with proper scaling from 0 to 1) - ax1.plot(df.index, df["Score"], label="Score", color='tab:blue') + # --- Plot 1: Number of labels (linear scale, no total labels) --- + fig1, ax1 = plt.subplots(figsize=(10, 6)) + ax1.plot(df.index, df["Background"], label="Background", color='black') + ax1.plot(df.index, df["Benign"], label="Benign", color='cyan') + ax1.plot(df.index, df["Malicious"], label="Malicious", color='magenta') ax1.set_xlabel('Index') - ax1.set_ylabel('Score', color='tab:blue') - ax1.set_ylim(0, 1) # Set y-axis for Score from 0 to 1 - ax1.tick_params(axis='y', labelcolor='tab:blue') - - # Create the second y-axis for the Background, Benign, Malicious - ax2 = ax1.twinx() - ax2.plot(df.index, df["Background"], label="Background Labels", color='tab:green', linestyle='--') - ax2.plot(df.index, df["Benign"], label="Benign Labels", color='tab:orange', linestyle='--') - ax2.plot(df.index, df["Malicious"], label="Malicious Labels", color='tab:pink', linestyle='--') - ax2.set_ylabel('Background, Benign, Malicious Labels', color='tab:red') - - # Set appropriate scale for right y-axis based on the data - ax2.set_ylim(0, df[["Background", "Benign", "Malicious"]].max().max()) - ax2.tick_params(axis='y', labelcolor='tab:red') - - # Annotating Total labels as text on the plot - for i, value in enumerate(df["Total labels"]): - ax1.text(i, value, f'{value:.1f}', color='tab:gray', fontsize=8, ha='center', va='bottom') - - # Adding title and legend with experiment number in title - plt.title(f'Training performance - Experiment {experiment_number}') - fig.tight_layout() + ax1.set_ylabel('Label Counts') + # No log scale here + ax1.set_title(f'Label Counts - Experiment {experiment_number}') + ax1.legend() + ax1.yaxis.set_major_locator(ticker.MaxNLocator(70)) + plt.tight_layout() + plt.savefig(os.path.join(dir_name, f'performance_metrics_training_0_labels.png')) + + # --- Plot 2: FNR and FPR (log scale) --- + fig2, ax2 = plt.subplots(figsize=(10, 6)) + ax2.plot(df.index, df["FNR"], label="FNR", color='red') + ax2.plot(df.index, df["FPR"], label="FPR", color='blue') + ax2.set_xlabel('Index') + ax2.set_ylabel('Rate') + ax2.set_yscale('log') + ax2.set_title(f'FNR 
and FPR - Experiment {experiment_number}') + ax2.legend() + ax2.yaxis.set_major_locator(ticker.MaxNLocator(100)) + plt.tight_layout() + plt.savefig(os.path.join(dir_name, f'performance_metrics_training_0_fnr_fpr.png')) + + # --- Plot 3: Other metrics (log scale) --- + fig3, ax3 = plt.subplots(figsize=(12, 7)) + metrics_rest = ["TNR", "TPR", "F1", "Precision", "Accuracy", "MCC", "Recall"] + colors_rest = [ + 'tab:blue', 'tab:green', 'tab:purple', 'tab:brown', + 'tab:gray', 'tab:pink', 'tab:olive' + ] + for metric, color in zip(metrics_rest, colors_rest): + ax3.plot(df.index, df[metric], label=metric, color=color) + ax3.set_xlabel('Index') + ax3.set_ylabel('Metric Value') + ax3.set_yscale('log') + ax3.set_title(f'Performance Metrics (except FNR/FPR) - Experiment {experiment_number}') + ax3.legend() + ax3.yaxis.set_major_locator(ticker.MaxNLocator(50)) + plt.tight_layout() + plt.savefig(os.path.join(dir_name, f'performance_metrics_training_0_other_metrics.png')) - # Move both legends further to the right - ax1.legend(loc='upper right', bbox_to_anchor=(1.3, 1), fontsize='small', ncol=1) - ax2.legend(loc='upper right', bbox_to_anchor=(1.3, 0.85), fontsize='small', ncol=1) - - # Increase right margin for better readability of legend - plt.subplots_adjust(right=0.75) - - # Save plot to the same folder as the log file with experiment number in filename - plt.savefig(plot_file) - - # Display the plot plt.show() + # --- Print final values in terminal --- + print("\nFinal values at last training step:") + for col in ["Total labels", "Background", "Benign", "Malicious", + "FPR", "TNR", "TPR", "FNR", "F1", "Precision", "Accuracy", "MCC", "Recall"]: + print(f"{col}: {df[col].iloc[-1]}") + def main(): - # Parse command-line arguments parser = argparse.ArgumentParser(description="Process a log file and plot the data with two y-axes.") parser.add_argument('-f', '--file', metavar='log_file', type=str, required=True, help="Path to the log file") parser.add_argument('-e', '--experiment', metavar='experiment_number', type=str, required=True, help="Experiment number to add to the filename") - - # Handle -h / --help args = parser.parse_args() - - # Call the function to process the log file plot_log_data(args.file, args.experiment) if __name__ == "__main__": From aa8331fa4417cf3912a623528607f7480edcb796 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 21:16:01 +0000 Subject: [PATCH 124/498] Add performance metrics to the training evaluation --- modules/flowmldetection/flowmldetection.py | 58 +++++++++++++++++----- 1 file changed, 46 insertions(+), 12 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index b17a1baaf..2c60cd403 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -10,7 +10,16 @@ import json import traceback import warnings -import os +from sklearn.metrics import classification_report, confusion_matrix +from sklearn.metrics import ( + confusion_matrix, + f1_score, + precision_score, + accuracy_score, + matthews_corrcoef, + recall_score, +) + from slips_files.common.parsers.config_parser import ConfigParser from slips_files.common.slips_utils import utils @@ -86,21 +95,21 @@ def write_to_training_log(self, message: str): except Exception as e: self.print(f"Error writing to training log: {e}", 0, 1) - def train(self, sum_labeled_flows): + def train(self, sum_labeled_flows, last_number_of_flows_when_trained): """ Train a model based on the flows we receive and the 
labels """ try: + # Create y_flow with the label + y_flow = numpy.full(self.flows.shape[0], self.flows.ground_truth_label) # Create X_flow with the current flows minus the label X_flow = self.flows.drop("ground_truth_label", axis=1) # Drop the detailed labels X_flow = X_flow.drop("detailed_ground_truth_label", axis=1) # Drop the module_labels X_flow = X_flow.drop("module_labels", axis=1) - # Create y_flow with the label - y_flow = numpy.full(X_flow.shape[0], self.flows.ground_truth_label) - # Normalize this batch of data so far. This can get progressivle slow + # Normalize this batch of data so far. This can get progressively slow X_flow = self.scaler.fit_transform(X_flow) # Count the number of labels of each type in this epoc @@ -120,18 +129,43 @@ def train(self, sum_labeled_flows): self.print("Error while calling clf.train()") self.print(traceback.format_exc(), 0, 1) - # See score so far in training - score = self.clf.score(X_flow, y_flow) + # Predict on the training data + y_pred = self.clf.predict(X_flow) - #self.print(f" Training Score: {score}", 1, 0) - #self.print(f' Model Parameters: {self.clf.coef_}', 1, 0) + # For metrics, let's focus on Malicious vs Benign (ignore Background) + mask = (y_flow == "Malicious") | (y_flow == "Benign") + y_true_bin = y_flow[mask] + y_pred_bin = y_pred[mask] + + # Map to binary: Malicious=1, Benign=0 + y_true_bin = numpy.where(y_true_bin == "Malicious", 1, 0) + y_pred_bin = numpy.where(y_pred_bin == "Malicious", 1, 0) + + # Compute confusion matrix: tn, fp, fn, tp + tn, fp, fn, tp = confusion_matrix(y_true_bin, y_pred_bin, labels=[0,1]).ravel() if len(set(y_true_bin)) > 1 else (0,0,0,0) + + # Compute metrics + FPR = fp / (fp + tn) if (fp + tn) > 0 else 0 + TNR = tn / (tn + fp) if (tn + fp) > 0 else 0 + TPR = tp / (tp + fn) if (tp + fn) > 0 else 0 + FNR = fn / (fn + tp) if (fn + tp) > 0 else 0 + F1 = f1_score(y_true_bin, y_pred_bin, zero_division=0) + PREC = precision_score(y_true_bin, y_pred_bin, zero_division=0) + ACCU = accuracy_score(y_true_bin, y_pred_bin) + MCC = matthews_corrcoef(y_true_bin, y_pred_bin) if len(set(y_true_bin)) > 1 else 0 + RECALL = recall_score(y_true_bin, y_pred_bin, zero_division=0) # Store the models on disk self.store_model() # Log training information - self.write_to_training_log(f"Training completed. Background: {epoch_label_counts['Background']}. Benign: {epoch_label_counts['Benign']}. Malicious: {epoch_label_counts['Malicious']}. Total labels: {sum_labeled_flows}. Score: {score}") - #self.write_to_training_log(f"Model parameters: {self.clf.coef_}") + self.write_to_training_log( + f"Total labels: {sum_labeled_flows}, " + f"Background: {epoch_label_counts['Background']}. " + f"Benign: {epoch_label_counts['Benign']}. Malicious: {epoch_label_counts['Malicious']}. " + f"Metrics: FPR={FPR:.4f}, TNR={TNR:.4f}, TPR={TPR:.4f}, FNR={FNR:.4f}, " + f"F1={F1:.4f}, Precision={PREC:.4f}, Accuracy={ACCU:.4f}, MCC={MCC:.4f}, Recall={RECALL:.4f}." 
+ ) except Exception: self.print("Error in train().", 0, 1) self.print(traceback.format_exc(), 0, 1) @@ -520,7 +554,7 @@ def main(self): # for pandas self.process_training_flows(self.last_number_of_flows_when_trained) # Train an algorithm - self.train(sum_labeled_flows) + self.train(sum_labeled_flows, self.last_number_of_flows_when_trained) self.last_number_of_flows_when_trained = sum_labeled_flows elif self.mode == "test": From bbd6e0a0e40db29a29481ac4839b4efa42252b34 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sun, 4 May 2025 12:50:46 +0000 Subject: [PATCH 125/498] Fix experiment names --- modules/flowmldetection/plot_train_performance.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/modules/flowmldetection/plot_train_performance.py b/modules/flowmldetection/plot_train_performance.py index 5212dfeea..304f0f4ea 100644 --- a/modules/flowmldetection/plot_train_performance.py +++ b/modules/flowmldetection/plot_train_performance.py @@ -52,12 +52,12 @@ def plot_log_data(file_path, experiment_number): ax1.plot(df.index, df["Malicious"], label="Malicious", color='magenta') ax1.set_xlabel('Index') ax1.set_ylabel('Label Counts') - # No log scale here ax1.set_title(f'Label Counts - Experiment {experiment_number}') ax1.legend() ax1.yaxis.set_major_locator(ticker.MaxNLocator(70)) + ax1.xaxis.set_major_locator(ticker.MaxNLocator(50)) plt.tight_layout() - plt.savefig(os.path.join(dir_name, f'performance_metrics_training_0_labels.png')) + plt.savefig(os.path.join(dir_name, f'performance_metrics_training_{experiment_number}_labels.png')) # --- Plot 2: FNR and FPR (log scale) --- fig2, ax2 = plt.subplots(figsize=(10, 6)) @@ -69,8 +69,9 @@ def plot_log_data(file_path, experiment_number): ax2.set_title(f'FNR and FPR - Experiment {experiment_number}') ax2.legend() ax2.yaxis.set_major_locator(ticker.MaxNLocator(100)) + ax2.xaxis.set_major_locator(ticker.MaxNLocator(50)) plt.tight_layout() - plt.savefig(os.path.join(dir_name, f'performance_metrics_training_0_fnr_fpr.png')) + plt.savefig(os.path.join(dir_name, f'performance_metrics_training_{experiment_number}_fnr_fpr.png')) # --- Plot 3: Other metrics (log scale) --- fig3, ax3 = plt.subplots(figsize=(12, 7)) @@ -87,8 +88,9 @@ def plot_log_data(file_path, experiment_number): ax3.set_title(f'Performance Metrics (except FNR/FPR) - Experiment {experiment_number}') ax3.legend() ax3.yaxis.set_major_locator(ticker.MaxNLocator(50)) + ax3.xaxis.set_major_locator(ticker.MaxNLocator(50)) plt.tight_layout() - plt.savefig(os.path.join(dir_name, f'performance_metrics_training_0_other_metrics.png')) + plt.savefig(os.path.join(dir_name, f'performance_metrics_training_{experiment_number}_other_metrics.png')) plt.show() From 416bc48fd70e9f92b8a4cf4a192ae9f05a2ce4fc Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Mon, 29 Jul 2024 16:32:14 +0200 Subject: [PATCH 126/498] mlflow. 
Add a function to convert the state again
---
 modules/flowmldetection/flowmldetection.py | 154 ++++++++++++++++++++-
 1 file changed, 150 insertions(+), 4 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index e44ac83f4..16b67e903 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -10,6 +10,7 @@ import json
 import traceback
 import warnings
+import sys
 
 from slips_files.common.parsers.config_parser import ConfigParser
 from slips_files.common.slips_utils import utils
@@ -120,6 +121,141 @@ def train(self):
         except Exception:
             self.print("Error in train()", 0, 1)
             self.print(traceback.format_exc(), 0, 1)
+
+    def get_final_state_from_flags(self, state, pkts):
+        """
+        Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags
+        We receive the packets to distinguish some Reset connections
+        """
+        try:
+            pre = state.split("_")[0]
+            try:
+                # Try suricata states
+                """
+                There are different states in which a flow can be.
+                Suricata distinguishes three flow-states for TCP and two for UDP. For TCP,
+                these are: New, Established and Closed, for UDP only new and established.
+                For each of these states Suricata can employ different timeouts.
+                """
+                if "new" in state or "established" in state:
+                    return "Established"
+                elif "closed" in state:
+                    return "Not Established"
+
+                # We have various types of states depending on the type of flow.
+                # For Zeek
+                if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
+                    return "Not Established"
+                elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"):
+                    return "Established"
+
+                # For Argus
+                suf = state.split("_")[1]
+                if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
+                    """
+                    Examples:
+                    SA_SA
+                    SR_SA
+                    FSRA_SA
+                    SPA_SPA
+                    SRA_SPA
+                    FSA_FSA
+                    FSA_FSPA
+                    SAEC_SPA
+                    SRPA_SPA
+                    FSPA_SPA
+                    FSRPA_SPA
+                    FSPA_FSPA
+                    FSRA_FSPA
+                    SRAEC_SPA
+                    FSPA_FSRPA
+                    FSAEC_FSPA
+                    FSRPA_FSPA
+                    SRPAEC_SPA
+                    FSPAEC_FSPA
+                    SRPAEC_FSRPA
+                    """
+                    return "Established"
+                elif "PA" in pre and "PA" in suf:
+                    # Typical flow that was reported in the middle
+                    """
+                    Examples:
+                    PA_PA
+                    FPA_FPA
+                    """
+                    return "Established"
+                elif "ECO" in pre:
+                    return "ICMP Echo"
+                elif "ECR" in pre:
+                    return "ICMP Reply"
+                elif "URH" in pre:
+                    return "ICMP Host Unreachable"
+                elif "URP" in pre:
+                    return "ICMP Port Unreachable"
+                else:
+                    """
+                    Examples:
+                    S_RA
+                    S_R
+                    A_R
+                    S_SA
+                    SR_SA
+                    FA_FA
+                    SR_RA
+                    SEC_RA
+                    """
+                    return "Not Established"
+            except IndexError:
+                # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP
+                if "ECO" in pre:
+                    # ICMP
+                    return "Established"
+                elif "UNK" in pre:
+                    # ICMP6 unknown upper layer
+                    return "Established"
+                elif "CON" in pre:
+                    # UDP
+                    return "Established"
+                elif "INT" in pre:
+                    # UDP trying to connect, NOT precisely not established but also NOT 'Established'. So we consider it not established because there
+                    # is no confirmation of what happened.
+                    return "Not Established"
+                elif "EST" in pre:
+                    # TCP
+                    return "Established"
+                elif "RST" in pre:
+                    # TCP. When -z B is not used in argus, states are single words. Most connections are reset when finished and therefore are established
+                    # A connection can be reset without being established, but we can't tell without -z b.
+                    # So we use as heuristic the amount of packets. If <=3, then it is not established because the OS retries 3 times.
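+                    # Example (illustrative, not from any capture): an Argus
+                    # flow summarized as "RST" with pkts=2 maps to
+                    # "Not Established", while the same state with pkts=12
+                    # maps to "Established".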
From 82ff65455c8cea8514ef0285aaf98846a34eb8e8 Mon Sep 17 00:00:00 2001
From: alya
Date: Tue, 30 Jul 2024 14:35:21 +0300
Subject: [PATCH 127/498] state_handler: split get_final_state_from_flags()
 into smaller functions

---
 slips_files/common/state_handler.py | 179 ++++++++++++++++++++++
 1 file changed, 179 insertions(+)
 create mode 100644 slips_files/common/state_handler.py

diff --git a/slips_files/common/state_handler.py b/slips_files/common/state_handler.py
new file mode 100644
index 000000000..b671a09a2
--- /dev/null
+++ b/slips_files/common/state_handler.py
@@ -0,0 +1,179 @@
+from typing import Optional
+import sys
+import traceback
+
+
+def check_suricata_states(state) -> Optional[str]:
+    """
+    There are different states in which a flow can be.
+    Suricata distinguishes three flow-states for TCP and two for
+    UDP. For TCP,
+    these are: New, Established and Closed, for UDP only new and
+    established.
+    For each of these states Suricata can employ different timeouts.
+    """
+    if "new" in state or "established" in state:
+        return "Established"
+    elif "closed" in state:
+        return "Not Established"
+
+
+def check_zeek_states(state) -> Optional[str]:
+    # We have various types of states depending on the type of flow.
+    # For Zeek
+    if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
+        return "Not Established"
+    elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"):
+        return "Established"
+
+
+def check_argus_states(state) -> Optional[str]:
+    pre = state.split("_")[0]
+    suf = state.split("_")[1]
+    if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
+        """
+        Examples:
+        SA_SA
+        SR_SA
+        FSRA_SA
+        SPA_SPA
+        SRA_SPA
+        FSA_FSA
+        FSA_FSPA
+        SAEC_SPA
+        SRPA_SPA
+        FSPA_SPA
+        FSRPA_SPA
+        FSPA_FSPA
+        FSRA_FSPA
+        SRAEC_SPA
+        FSPA_FSRPA
+        FSAEC_FSPA
+        FSRPA_FSPA
+        SRPAEC_SPA
+        FSPAEC_FSPA
+        SRPAEC_FSRPA
+        """
+        return "Established"
+    elif "PA" in pre and "PA" in suf:
+        # Typical flow that was reported in the middle
+        """
+        Examples:
+        PA_PA
+        FPA_FPA
+        """
+        return "Established"
+    elif "ECO" in pre:
+        return "ICMP Echo"
+    elif "ECR" in pre:
+        return "ICMP Reply"
+    elif "URH" in pre:
+        return "ICMP Host Unreachable"
+    elif "URP" in pre:
+        return "ICMP Port Unreachable"
+    else:
+        """
+        Examples:
+        S_RA
+        S_R
+        A_R
+        S_SA
+        SR_SA
+        FA_FA
+        SR_RA
+        SEC_RA
+        """
+        return "Not Established"
+
+
+def check_tcp_states(state, pkts) -> Optional[str]:
+    pre = state.split("_")[0]
+    if "EST" in pre:
+        # TCP
+        return "Established"
+    elif "RST" in pre:
+        # TCP. When -z B is not used in argus, states are single words.
+        # Most connections are reset when finished and therefore are
+        # established
+        # It can happen that it is reset without being established, but we
+        # can't tell without -z b.
+        # So we use as heuristic the amount of packets. If <=3, then it is
+        # not established because the OS retries 3 times.
+        return "Not Established" if int(pkts) <= 3 else "Established"
+    elif "FIN" in pre:
+        # TCP. When -z B is not used in argus, states are single words.
+        # Most connections are finished with FIN when they end and
+        # therefore are established
+        # It can happen that it is finished without being established, but we
+        # can't tell without -z b.
+        # So we use as heuristic the amount of packets. If <=3, then it is
+        # not established because the OS retries 3 times.
+        return "Not Established" if int(pkts) <= 3 else "Established"
+    else:
+        """
+        Examples:
+        S_
+        FA_
+        PA_
+        FSA_
+        SEC_
+        SRPA_
+        """
+        return "Not Established"
+
+
+def check_udp_states(state) -> Optional[str]:
+    pre = state.split("_")[0]
+    if "CON" in pre:
+        # UDP
+        return "Established"
+    elif "INT" in pre:
+        # UDP trying to connect, NOT precisely not established but also
+        # NOT 'Established'. So we consider it not established because there
+        # is no confirmation of what happened.
+        return "Not Established"
+
+
+def check_icmp_states(state) -> Optional[str]:
+    pre = state.split("_")[0]
+    if "ECO" in pre:
+        # ICMP
+        return "Established"
+    elif "UNK" in pre:
+        # ICMP6 unknown upper layer
+        return "Established"
+
+
+def get_final_state_from_flags(self, state, pkts) -> str:
+    """
+    Analyze the flags given and return a summary of the state.
+    Should work with Argus and Bro flags.
+    We receive the packets to distinguish some Reset connections.
+    """
+    try:
+        if state := check_suricata_states(state):
+            return state
+        if state := check_zeek_states(state):
+            return state
+        if state := check_argus_states(state):
+            return state
+    except IndexError:
+        # suf does not exist, which means that this is some ICMP or
+        # no response was sent for UDP or TCP
+        if state := check_icmp_states(state):
+            return state
+        if state := check_udp_states(state):
+            return state
+        if state := check_tcp_states(state, pkts):
+            return state
+
+        return "Not Established"
+
+    except Exception:
+        exception_line = sys.exc_info()[2].tb_lineno
+        self.print(
+            f"Error in get_final_state_from_flags() " f"line {exception_line}",
+            0,
+            1,
+        )
+        self.print(traceback.format_exc(), 0, 1)
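Each helper in the new state_handler.py follows one contract: it returns a summary string when it recognizes the state, and None otherwise, so the caller can fall through to the next checker. A minimal sketch of that dispatch pattern, reproducing only the Zeek checker from the patch:

    from typing import Callable, Optional, Tuple

    def check_zeek_states(state: str) -> Optional[str]:
        if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
            return "Not Established"
        elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"):
            return "Established"
        return None  # unknown to this checker; let the next one try

    CHECKERS: Tuple[Callable[[str], Optional[str]], ...] = (check_zeek_states,)

    def summarize(state: str) -> str:
        for checker in CHECKERS:
            if result := checker(state):
                return result
        return "Not Established"

    print(summarize("SF"))   # Established
    print(summarize("S0"))   # Not Established
    print(summarize("???"))  # Not Established (no checker matched)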
From 3e4bf3fbb9df71feb63e125ddae50e54b6a375f1 Mon Sep 17 00:00:00 2001
From: alya
Date: Tue, 30 Jul 2024 14:48:16 +0300
Subject: [PATCH 128/498] state_handler: refactor get_final_state_from_flags()

---
 slips_files/common/state_handler.py | 67 +++++++++++++----------------
 1 file changed, 29 insertions(+), 38 deletions(-)

diff --git a/slips_files/common/state_handler.py b/slips_files/common/state_handler.py
index b671a09a2..d0a05115b 100644
--- a/slips_files/common/state_handler.py
+++ b/slips_files/common/state_handler.py
@@ -1,9 +1,7 @@
 from typing import Optional
-import sys
-import traceback


-def check_suricata_states(state) -> Optional[str]:
+def interpret_suricata_states(state) -> Optional[str]:
     """
     There are different states in which a flow can be.
     Suricata distinguishes three flow-states for TCP and two for
@@ -18,7 +16,7 @@ def check_suricata_states(state) -> Optional[str]:
         return "Not Established"


-def check_zeek_states(state) -> Optional[str]:
+def interpret_zeek_states(state) -> Optional[str]:
     # We have various types of states depending on the type of flow.
     # For Zeek
@@ -27,9 +25,13 @@ def check_zeek_states(state) -> Optional[str]:
         return "Established"


-def check_argus_states(state) -> Optional[str]:
+def interpret_argus_states(state) -> Optional[str]:
     pre = state.split("_")[0]
-    suf = state.split("_")[1]
+    try:
+        suf = state.split("_")[1]
+    except IndexError:
+        return
+
     if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
         """
         Examples:
@@ -86,7 +88,7 @@ def check_argus_states(state) -> Optional[str]:
         return "Not Established"


-def check_tcp_states(state, pkts) -> Optional[str]:
+def interpret_tcp_states(state, pkts) -> Optional[str]:
     pre = state.split("_")[0]
     if "EST" in pre:
         # TCP
@@ -122,7 +124,7 @@ def check_tcp_states(state, pkts) -> Optional[str]:
         return "Not Established"


-def check_udp_states(state) -> Optional[str]:
+def interpret_udp_states(state) -> Optional[str]:
     pre = state.split("_")[0]
     if "CON" in pre:
         # UDP
@@ -134,7 +136,7 @@ def check_udp_states(state) -> Optional[str]:
         return "Not Established"


-def check_icmp_states(state) -> Optional[str]:
+def interpret_icmp_states(state) -> Optional[str]:
     pre = state.split("_")[0]
     if "ECO" in pre:
         # ICMP
@@ -144,36 +146,25 @@ def check_icmp_states(state) -> Optional[str]:
         return "Established"


-def get_final_state_from_flags(self, state, pkts) -> str:
+def get_final_state_from_flags(state, pkts) -> str:
     """
-    Analyze the flags given and return a summary of the state.
- Should work with Argus and Bro flags - We receive the pakets to distinguish some Reset connections + Converts the original flags from the flow, to a state that slips + understands + Works with Argus, suricata, and Bro flags + We receive the packets to distinguish some Reset connections """ - try: - if state := check_suricata_states(state): - return state - if state := check_zeek_states(state): - return state - if state := check_argus_states(state): - return state - except IndexError: - # suf does not exist, which means that this is some ICMP or - # no response was sent for UDP or TCP - if state := check_icmp_states(state): - return state - if state := check_udp_states(state): - return state - if state := check_tcp_states(state, pkts): - return state - return "Not Established" + for interpreter in ( + interpret_suricata_states, + interpret_zeek_states, + interpret_argus_states, + interpret_icmp_states, + interpret_udp_states, + ): + if interpreted_state := interpreter(state): + return interpreted_state + + if interpreted_state := interpret_tcp_states(state, pkts): + return interpreted_state - except Exception: - exception_line = sys.exc_info()[2].tb_lineno - self.print( - f"Error in get_final_state_from_flags() " f"line {exception_line}", - 0, - 1, - ) - self.print(traceback.format_exc(), 0, 1) + return "Not Established" From 45710b72db50551053c09ed71059fa5d1bfcf712 Mon Sep 17 00:00:00 2001 From: alya Date: Tue, 30 Jul 2024 14:59:22 +0300 Subject: [PATCH 129/498] delete get_final_state_from_flags() from flowmldetection, profiler, and the db --- modules/flowmldetection/flowmldetection.py | 150 ++---------------- slips_files/core/database/database_manager.py | 3 - 2 files changed, 10 insertions(+), 143 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 16b67e903..3379f5077 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -121,141 +121,6 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) - - def get_final_state_from_flags(self, state, pkts): - """ - Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags - We receive the pakets to distinguish some Reset connections - """ - try: - pre = state.split("_")[0] - try: - # Try suricata states - """ - There are different states in which a flow can be. - Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, - these are: New, Established and Closed,for UDP only new and established. - For each of these states Suricata can employ different timeouts. - """ - if "new" in state or "established" in state: - return "Established" - elif "closed" in state: - return "Not Established" - - # We have varius type of states depending on the type of flow. 
- # For Zeek - if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): - return "Not Established" - elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): - return "Established" - - # For Argus - suf = state.split("_")[1] - if "S" in pre and "A" in pre and "S" in suf and "A" in suf: - """ - Examples: - SA_SA - SR_SA - FSRA_SA - SPA_SPA - SRA_SPA - FSA_FSA - FSA_FSPA - SAEC_SPA - SRPA_SPA - FSPA_SPA - FSRPA_SPA - FSPA_FSPA - FSRA_FSPA - SRAEC_SPA - FSPA_FSRPA - FSAEC_FSPA - FSRPA_FSPA - SRPAEC_SPA - FSPAEC_FSPA - SRPAEC_FSRPA - """ - return "Established" - elif "PA" in pre and "PA" in suf: - # Tipical flow that was reported in the middle - """ - Examples: - PA_PA - FPA_FPA - """ - return "Established" - elif "ECO" in pre: - return "ICMP Echo" - elif "ECR" in pre: - return "ICMP Reply" - elif "URH" in pre: - return "ICMP Host Unreachable" - elif "URP" in pre: - return "ICMP Port Unreachable" - else: - """ - Examples: - S_RA - S_R - A_R - S_SA - SR_SA - FA_FA - SR_RA - SEC_RA - """ - return "Not Established" - except IndexError: - # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP - if "ECO" in pre: - # ICMP - return "Established" - elif "UNK" in pre: - # ICMP6 unknown upper layer - return "Established" - elif "CON" in pre: - # UDP - return "Established" - elif "INT" in pre: - # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there - # is no confirmation of what happened. - return "Not Established" - elif "EST" in pre: - # TCP - return "Established" - elif "RST" in pre: - # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established - # It can happen that is reseted being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. - return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - elif "FIN" in pre: - # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established - # It can happen that is finished being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. 
- return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - else: - """ - Examples: - S_ - FA_ - PA_ - FSA_ - SEC_ - SRPA_ - """ - return "Not Established" - except Exception: - exception_line = sys.exc_info()[2].tb_lineno - self.print( - f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", - 0, - 1, - ) - self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ @@ -291,12 +156,17 @@ def process_features(self, dataset): except (ValueError, KeyError): pass - # When flows are read from Slips sqlite, the state is not transformed to 'Established' or 'Not Established', it is still 'S0' and others + # When flows are read from Slips sqlite, + # the state is not transformed to 'Established' or + # 'Not Established', it is still 'S0' and others # So transform here - #new_state_column = dataset.state.apply(self.get_final_state_from_flags(dataset.state, dataset.pkts)) - dataset['state'] = dataset.apply(lambda row: self.get_final_state_from_flags(row['state'], row['pkts']), axis=1) - - #dataset.state = new_state_column + dataset["state"] = dataset.apply( + lambda row: get_final_state_from_flags( + row["state"], row["pkts"] + ), + axis=1, + ) + # dataset.state = new_state_column # Convert state to categorical dataset.state = dataset.state.str.replace( diff --git a/slips_files/core/database/database_manager.py b/slips_files/core/database/database_manager.py index e8ca3aaf6..b4b2128d3 100644 --- a/slips_files/core/database/database_manager.py +++ b/slips_files/core/database/database_manager.py @@ -613,9 +613,6 @@ def add_out_dns(self, *args, **kwargs): def add_port(self, *args, **kwargs): return self.rdb.add_port(*args, **kwargs) - def get_final_state_from_flags(self, *args, **kwargs): - return self.rdb.get_final_state_from_flags(*args, **kwargs) - def add_ips(self, *args, **kwargs): return self.rdb.add_ips(*args, **kwargs) From 014ee473003b36f3680b7f40aa60dd9c7d4ae759 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Mon, 29 Jul 2024 16:32:14 +0200 Subject: [PATCH 130/498] mlflow. Add a function to convert the state again --- modules/flowmldetection/flowmldetection.py | 135 +++++++++++++++++++++ 1 file changed, 135 insertions(+) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 3379f5077..f052931c8 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -121,6 +121,141 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) + + def get_final_state_from_flags(self, state, pkts): + """ + Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags + We receive the pakets to distinguish some Reset connections + """ + try: + pre = state.split("_")[0] + try: + # Try suricata states + """ + There are different states in which a flow can be. + Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, + these are: New, Established and Closed,for UDP only new and established. + For each of these states Suricata can employ different timeouts. + """ + if "new" in state or "established" in state: + return "Established" + elif "closed" in state: + return "Not Established" + + # We have varius type of states depending on the type of flow. 
+ # For Zeek + if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): + return "Not Established" + elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): + return "Established" + + # For Argus + suf = state.split("_")[1] + if "S" in pre and "A" in pre and "S" in suf and "A" in suf: + """ + Examples: + SA_SA + SR_SA + FSRA_SA + SPA_SPA + SRA_SPA + FSA_FSA + FSA_FSPA + SAEC_SPA + SRPA_SPA + FSPA_SPA + FSRPA_SPA + FSPA_FSPA + FSRA_FSPA + SRAEC_SPA + FSPA_FSRPA + FSAEC_FSPA + FSRPA_FSPA + SRPAEC_SPA + FSPAEC_FSPA + SRPAEC_FSRPA + """ + return "Established" + elif "PA" in pre and "PA" in suf: + # Tipical flow that was reported in the middle + """ + Examples: + PA_PA + FPA_FPA + """ + return "Established" + elif "ECO" in pre: + return "ICMP Echo" + elif "ECR" in pre: + return "ICMP Reply" + elif "URH" in pre: + return "ICMP Host Unreachable" + elif "URP" in pre: + return "ICMP Port Unreachable" + else: + """ + Examples: + S_RA + S_R + A_R + S_SA + SR_SA + FA_FA + SR_RA + SEC_RA + """ + return "Not Established" + except IndexError: + # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP + if "ECO" in pre: + # ICMP + return "Established" + elif "UNK" in pre: + # ICMP6 unknown upper layer + return "Established" + elif "CON" in pre: + # UDP + return "Established" + elif "INT" in pre: + # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there + # is no confirmation of what happened. + return "Not Established" + elif "EST" in pre: + # TCP + return "Established" + elif "RST" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established + # It can happen that is reseted being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. + return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + elif "FIN" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established + # It can happen that is finished being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. 
+ return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + else: + """ + Examples: + S_ + FA_ + PA_ + FSA_ + SEC_ + SRPA_ + """ + return "Not Established" + except Exception: + exception_line = sys.exc_info()[2].tb_lineno + self.print( + f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", + 0, + 1, + ) + self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ From d5f6330c3e6bdb0a8f81d0f1349f927bfda8636e Mon Sep 17 00:00:00 2001 From: alya Date: Tue, 30 Jul 2024 14:59:22 +0300 Subject: [PATCH 131/498] delete get_final_state_from_flags() from flowmldetection, profiler, and the db --- modules/flowmldetection/flowmldetection.py | 135 --------------------- 1 file changed, 135 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index f052931c8..3379f5077 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -121,141 +121,6 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) - - def get_final_state_from_flags(self, state, pkts): - """ - Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags - We receive the pakets to distinguish some Reset connections - """ - try: - pre = state.split("_")[0] - try: - # Try suricata states - """ - There are different states in which a flow can be. - Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, - these are: New, Established and Closed,for UDP only new and established. - For each of these states Suricata can employ different timeouts. - """ - if "new" in state or "established" in state: - return "Established" - elif "closed" in state: - return "Not Established" - - # We have varius type of states depending on the type of flow. - # For Zeek - if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): - return "Not Established" - elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): - return "Established" - - # For Argus - suf = state.split("_")[1] - if "S" in pre and "A" in pre and "S" in suf and "A" in suf: - """ - Examples: - SA_SA - SR_SA - FSRA_SA - SPA_SPA - SRA_SPA - FSA_FSA - FSA_FSPA - SAEC_SPA - SRPA_SPA - FSPA_SPA - FSRPA_SPA - FSPA_FSPA - FSRA_FSPA - SRAEC_SPA - FSPA_FSRPA - FSAEC_FSPA - FSRPA_FSPA - SRPAEC_SPA - FSPAEC_FSPA - SRPAEC_FSRPA - """ - return "Established" - elif "PA" in pre and "PA" in suf: - # Tipical flow that was reported in the middle - """ - Examples: - PA_PA - FPA_FPA - """ - return "Established" - elif "ECO" in pre: - return "ICMP Echo" - elif "ECR" in pre: - return "ICMP Reply" - elif "URH" in pre: - return "ICMP Host Unreachable" - elif "URP" in pre: - return "ICMP Port Unreachable" - else: - """ - Examples: - S_RA - S_R - A_R - S_SA - SR_SA - FA_FA - SR_RA - SEC_RA - """ - return "Not Established" - except IndexError: - # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP - if "ECO" in pre: - # ICMP - return "Established" - elif "UNK" in pre: - # ICMP6 unknown upper layer - return "Established" - elif "CON" in pre: - # UDP - return "Established" - elif "INT" in pre: - # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there - # is no confirmation of what happened. - return "Not Established" - elif "EST" in pre: - # TCP - return "Established" - elif "RST" in pre: - # TCP. 
When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established - # It can happen that is reseted being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. - return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - elif "FIN" in pre: - # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established - # It can happen that is finished being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. - return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - else: - """ - Examples: - S_ - FA_ - PA_ - FSA_ - SEC_ - SRPA_ - """ - return "Not Established" - except Exception: - exception_line = sys.exc_info()[2].tb_lineno - self.print( - f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", - 0, - 1, - ) - self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ From 2ad9ccb25fcd46b9da91c72f3400de5ae3ec364e Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Mon, 29 Jul 2024 16:32:14 +0200 Subject: [PATCH 132/498] mlflow. Add a function to convert the state again --- modules/flowmldetection/flowmldetection.py | 135 +++++++++++++++++++++ 1 file changed, 135 insertions(+) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 3379f5077..f052931c8 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -121,6 +121,141 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) + + def get_final_state_from_flags(self, state, pkts): + """ + Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags + We receive the pakets to distinguish some Reset connections + """ + try: + pre = state.split("_")[0] + try: + # Try suricata states + """ + There are different states in which a flow can be. + Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, + these are: New, Established and Closed,for UDP only new and established. + For each of these states Suricata can employ different timeouts. + """ + if "new" in state or "established" in state: + return "Established" + elif "closed" in state: + return "Not Established" + + # We have varius type of states depending on the type of flow. 
+ # For Zeek + if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): + return "Not Established" + elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): + return "Established" + + # For Argus + suf = state.split("_")[1] + if "S" in pre and "A" in pre and "S" in suf and "A" in suf: + """ + Examples: + SA_SA + SR_SA + FSRA_SA + SPA_SPA + SRA_SPA + FSA_FSA + FSA_FSPA + SAEC_SPA + SRPA_SPA + FSPA_SPA + FSRPA_SPA + FSPA_FSPA + FSRA_FSPA + SRAEC_SPA + FSPA_FSRPA + FSAEC_FSPA + FSRPA_FSPA + SRPAEC_SPA + FSPAEC_FSPA + SRPAEC_FSRPA + """ + return "Established" + elif "PA" in pre and "PA" in suf: + # Tipical flow that was reported in the middle + """ + Examples: + PA_PA + FPA_FPA + """ + return "Established" + elif "ECO" in pre: + return "ICMP Echo" + elif "ECR" in pre: + return "ICMP Reply" + elif "URH" in pre: + return "ICMP Host Unreachable" + elif "URP" in pre: + return "ICMP Port Unreachable" + else: + """ + Examples: + S_RA + S_R + A_R + S_SA + SR_SA + FA_FA + SR_RA + SEC_RA + """ + return "Not Established" + except IndexError: + # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP + if "ECO" in pre: + # ICMP + return "Established" + elif "UNK" in pre: + # ICMP6 unknown upper layer + return "Established" + elif "CON" in pre: + # UDP + return "Established" + elif "INT" in pre: + # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there + # is no confirmation of what happened. + return "Not Established" + elif "EST" in pre: + # TCP + return "Established" + elif "RST" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established + # It can happen that is reseted being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. + return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + elif "FIN" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established + # It can happen that is finished being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. 
+ return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + else: + """ + Examples: + S_ + FA_ + PA_ + FSA_ + SEC_ + SRPA_ + """ + return "Not Established" + except Exception: + exception_line = sys.exc_info()[2].tb_lineno + self.print( + f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", + 0, + 1, + ) + self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ From 88cb54b68e0da3088fc76cc8b351c5653aa93857 Mon Sep 17 00:00:00 2001 From: alya Date: Tue, 30 Jul 2024 14:59:22 +0300 Subject: [PATCH 133/498] delete get_final_state_from_flags() from flowmldetection, profiler, and the db --- modules/flowmldetection/flowmldetection.py | 135 --------------------- 1 file changed, 135 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index f052931c8..3379f5077 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -121,141 +121,6 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) - - def get_final_state_from_flags(self, state, pkts): - """ - Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags - We receive the pakets to distinguish some Reset connections - """ - try: - pre = state.split("_")[0] - try: - # Try suricata states - """ - There are different states in which a flow can be. - Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, - these are: New, Established and Closed,for UDP only new and established. - For each of these states Suricata can employ different timeouts. - """ - if "new" in state or "established" in state: - return "Established" - elif "closed" in state: - return "Not Established" - - # We have varius type of states depending on the type of flow. - # For Zeek - if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): - return "Not Established" - elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): - return "Established" - - # For Argus - suf = state.split("_")[1] - if "S" in pre and "A" in pre and "S" in suf and "A" in suf: - """ - Examples: - SA_SA - SR_SA - FSRA_SA - SPA_SPA - SRA_SPA - FSA_FSA - FSA_FSPA - SAEC_SPA - SRPA_SPA - FSPA_SPA - FSRPA_SPA - FSPA_FSPA - FSRA_FSPA - SRAEC_SPA - FSPA_FSRPA - FSAEC_FSPA - FSRPA_FSPA - SRPAEC_SPA - FSPAEC_FSPA - SRPAEC_FSRPA - """ - return "Established" - elif "PA" in pre and "PA" in suf: - # Tipical flow that was reported in the middle - """ - Examples: - PA_PA - FPA_FPA - """ - return "Established" - elif "ECO" in pre: - return "ICMP Echo" - elif "ECR" in pre: - return "ICMP Reply" - elif "URH" in pre: - return "ICMP Host Unreachable" - elif "URP" in pre: - return "ICMP Port Unreachable" - else: - """ - Examples: - S_RA - S_R - A_R - S_SA - SR_SA - FA_FA - SR_RA - SEC_RA - """ - return "Not Established" - except IndexError: - # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP - if "ECO" in pre: - # ICMP - return "Established" - elif "UNK" in pre: - # ICMP6 unknown upper layer - return "Established" - elif "CON" in pre: - # UDP - return "Established" - elif "INT" in pre: - # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there - # is no confirmation of what happened. - return "Not Established" - elif "EST" in pre: - # TCP - return "Established" - elif "RST" in pre: - # TCP. 
When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established - # It can happen that is reseted being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. - return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - elif "FIN" in pre: - # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established - # It can happen that is finished being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. - return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - else: - """ - Examples: - S_ - FA_ - PA_ - FSA_ - SEC_ - SRPA_ - """ - return "Not Established" - except Exception: - exception_line = sys.exc_info()[2].tb_lineno - self.print( - f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", - 0, - 1, - ) - self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ From 13dfd28bb8915c4c61798efa0051daaa7ee9daa9 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Mon, 29 Jul 2024 16:32:14 +0200 Subject: [PATCH 134/498] mlflow. Add a function to convert the state again --- modules/flowmldetection/flowmldetection.py | 135 +++++++++++++++++++++ 1 file changed, 135 insertions(+) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 3379f5077..f052931c8 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -121,6 +121,141 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) + + def get_final_state_from_flags(self, state, pkts): + """ + Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags + We receive the pakets to distinguish some Reset connections + """ + try: + pre = state.split("_")[0] + try: + # Try suricata states + """ + There are different states in which a flow can be. + Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, + these are: New, Established and Closed,for UDP only new and established. + For each of these states Suricata can employ different timeouts. + """ + if "new" in state or "established" in state: + return "Established" + elif "closed" in state: + return "Not Established" + + # We have varius type of states depending on the type of flow. 
+ # For Zeek + if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): + return "Not Established" + elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): + return "Established" + + # For Argus + suf = state.split("_")[1] + if "S" in pre and "A" in pre and "S" in suf and "A" in suf: + """ + Examples: + SA_SA + SR_SA + FSRA_SA + SPA_SPA + SRA_SPA + FSA_FSA + FSA_FSPA + SAEC_SPA + SRPA_SPA + FSPA_SPA + FSRPA_SPA + FSPA_FSPA + FSRA_FSPA + SRAEC_SPA + FSPA_FSRPA + FSAEC_FSPA + FSRPA_FSPA + SRPAEC_SPA + FSPAEC_FSPA + SRPAEC_FSRPA + """ + return "Established" + elif "PA" in pre and "PA" in suf: + # Tipical flow that was reported in the middle + """ + Examples: + PA_PA + FPA_FPA + """ + return "Established" + elif "ECO" in pre: + return "ICMP Echo" + elif "ECR" in pre: + return "ICMP Reply" + elif "URH" in pre: + return "ICMP Host Unreachable" + elif "URP" in pre: + return "ICMP Port Unreachable" + else: + """ + Examples: + S_RA + S_R + A_R + S_SA + SR_SA + FA_FA + SR_RA + SEC_RA + """ + return "Not Established" + except IndexError: + # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP + if "ECO" in pre: + # ICMP + return "Established" + elif "UNK" in pre: + # ICMP6 unknown upper layer + return "Established" + elif "CON" in pre: + # UDP + return "Established" + elif "INT" in pre: + # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there + # is no confirmation of what happened. + return "Not Established" + elif "EST" in pre: + # TCP + return "Established" + elif "RST" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established + # It can happen that is reseted being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. + return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + elif "FIN" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established + # It can happen that is finished being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. 
+ return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + else: + """ + Examples: + S_ + FA_ + PA_ + FSA_ + SEC_ + SRPA_ + """ + return "Not Established" + except Exception: + exception_line = sys.exc_info()[2].tb_lineno + self.print( + f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", + 0, + 1, + ) + self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ From d2a5935f05c2aee6c5cb4ae7285151258f836e13 Mon Sep 17 00:00:00 2001 From: alya Date: Tue, 30 Jul 2024 14:59:22 +0300 Subject: [PATCH 135/498] delete get_final_state_from_flags() from flowmldetection, profiler, and the db --- modules/flowmldetection/flowmldetection.py | 135 --------------------- 1 file changed, 135 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index f052931c8..3379f5077 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -121,141 +121,6 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) - - def get_final_state_from_flags(self, state, pkts): - """ - Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags - We receive the pakets to distinguish some Reset connections - """ - try: - pre = state.split("_")[0] - try: - # Try suricata states - """ - There are different states in which a flow can be. - Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, - these are: New, Established and Closed,for UDP only new and established. - For each of these states Suricata can employ different timeouts. - """ - if "new" in state or "established" in state: - return "Established" - elif "closed" in state: - return "Not Established" - - # We have varius type of states depending on the type of flow. - # For Zeek - if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): - return "Not Established" - elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): - return "Established" - - # For Argus - suf = state.split("_")[1] - if "S" in pre and "A" in pre and "S" in suf and "A" in suf: - """ - Examples: - SA_SA - SR_SA - FSRA_SA - SPA_SPA - SRA_SPA - FSA_FSA - FSA_FSPA - SAEC_SPA - SRPA_SPA - FSPA_SPA - FSRPA_SPA - FSPA_FSPA - FSRA_FSPA - SRAEC_SPA - FSPA_FSRPA - FSAEC_FSPA - FSRPA_FSPA - SRPAEC_SPA - FSPAEC_FSPA - SRPAEC_FSRPA - """ - return "Established" - elif "PA" in pre and "PA" in suf: - # Tipical flow that was reported in the middle - """ - Examples: - PA_PA - FPA_FPA - """ - return "Established" - elif "ECO" in pre: - return "ICMP Echo" - elif "ECR" in pre: - return "ICMP Reply" - elif "URH" in pre: - return "ICMP Host Unreachable" - elif "URP" in pre: - return "ICMP Port Unreachable" - else: - """ - Examples: - S_RA - S_R - A_R - S_SA - SR_SA - FA_FA - SR_RA - SEC_RA - """ - return "Not Established" - except IndexError: - # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP - if "ECO" in pre: - # ICMP - return "Established" - elif "UNK" in pre: - # ICMP6 unknown upper layer - return "Established" - elif "CON" in pre: - # UDP - return "Established" - elif "INT" in pre: - # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there - # is no confirmation of what happened. - return "Not Established" - elif "EST" in pre: - # TCP - return "Established" - elif "RST" in pre: - # TCP. 
When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established - # It can happen that is reseted being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. - return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - elif "FIN" in pre: - # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established - # It can happen that is finished being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. - return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - else: - """ - Examples: - S_ - FA_ - PA_ - FSA_ - SEC_ - SRPA_ - """ - return "Not Established" - except Exception: - exception_line = sys.exc_info()[2].tb_lineno - self.print( - f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", - 0, - 1, - ) - self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ From 60f0b286cb0b1172b01416bd66a45a117fa55577 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Mon, 29 Jul 2024 16:32:14 +0200 Subject: [PATCH 136/498] mlflow. Add a function to convert the state again --- modules/flowmldetection/flowmldetection.py | 135 +++++++++++++++++++++ 1 file changed, 135 insertions(+) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 3379f5077..f052931c8 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -121,6 +121,141 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) + + def get_final_state_from_flags(self, state, pkts): + """ + Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags + We receive the pakets to distinguish some Reset connections + """ + try: + pre = state.split("_")[0] + try: + # Try suricata states + """ + There are different states in which a flow can be. + Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, + these are: New, Established and Closed,for UDP only new and established. + For each of these states Suricata can employ different timeouts. + """ + if "new" in state or "established" in state: + return "Established" + elif "closed" in state: + return "Not Established" + + # We have varius type of states depending on the type of flow. 
+ # For Zeek + if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): + return "Not Established" + elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): + return "Established" + + # For Argus + suf = state.split("_")[1] + if "S" in pre and "A" in pre and "S" in suf and "A" in suf: + """ + Examples: + SA_SA + SR_SA + FSRA_SA + SPA_SPA + SRA_SPA + FSA_FSA + FSA_FSPA + SAEC_SPA + SRPA_SPA + FSPA_SPA + FSRPA_SPA + FSPA_FSPA + FSRA_FSPA + SRAEC_SPA + FSPA_FSRPA + FSAEC_FSPA + FSRPA_FSPA + SRPAEC_SPA + FSPAEC_FSPA + SRPAEC_FSRPA + """ + return "Established" + elif "PA" in pre and "PA" in suf: + # Tipical flow that was reported in the middle + """ + Examples: + PA_PA + FPA_FPA + """ + return "Established" + elif "ECO" in pre: + return "ICMP Echo" + elif "ECR" in pre: + return "ICMP Reply" + elif "URH" in pre: + return "ICMP Host Unreachable" + elif "URP" in pre: + return "ICMP Port Unreachable" + else: + """ + Examples: + S_RA + S_R + A_R + S_SA + SR_SA + FA_FA + SR_RA + SEC_RA + """ + return "Not Established" + except IndexError: + # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP + if "ECO" in pre: + # ICMP + return "Established" + elif "UNK" in pre: + # ICMP6 unknown upper layer + return "Established" + elif "CON" in pre: + # UDP + return "Established" + elif "INT" in pre: + # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there + # is no confirmation of what happened. + return "Not Established" + elif "EST" in pre: + # TCP + return "Established" + elif "RST" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established + # It can happen that is reseted being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. + return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + elif "FIN" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established + # It can happen that is finished being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. 
+ return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + else: + """ + Examples: + S_ + FA_ + PA_ + FSA_ + SEC_ + SRPA_ + """ + return "Not Established" + except Exception: + exception_line = sys.exc_info()[2].tb_lineno + self.print( + f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", + 0, + 1, + ) + self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ From 6a05fa3efdb4254380541b7b7c32c4c02f829cf7 Mon Sep 17 00:00:00 2001 From: alya Date: Tue, 30 Jul 2024 14:59:22 +0300 Subject: [PATCH 137/498] delete get_final_state_from_flags() from flowmldetection, profiler, and the db --- modules/flowmldetection/flowmldetection.py | 135 --------------------- 1 file changed, 135 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index f052931c8..3379f5077 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -121,141 +121,6 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) - - def get_final_state_from_flags(self, state, pkts): - """ - Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags - We receive the pakets to distinguish some Reset connections - """ - try: - pre = state.split("_")[0] - try: - # Try suricata states - """ - There are different states in which a flow can be. - Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, - these are: New, Established and Closed,for UDP only new and established. - For each of these states Suricata can employ different timeouts. - """ - if "new" in state or "established" in state: - return "Established" - elif "closed" in state: - return "Not Established" - - # We have varius type of states depending on the type of flow. - # For Zeek - if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): - return "Not Established" - elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): - return "Established" - - # For Argus - suf = state.split("_")[1] - if "S" in pre and "A" in pre and "S" in suf and "A" in suf: - """ - Examples: - SA_SA - SR_SA - FSRA_SA - SPA_SPA - SRA_SPA - FSA_FSA - FSA_FSPA - SAEC_SPA - SRPA_SPA - FSPA_SPA - FSRPA_SPA - FSPA_FSPA - FSRA_FSPA - SRAEC_SPA - FSPA_FSRPA - FSAEC_FSPA - FSRPA_FSPA - SRPAEC_SPA - FSPAEC_FSPA - SRPAEC_FSRPA - """ - return "Established" - elif "PA" in pre and "PA" in suf: - # Tipical flow that was reported in the middle - """ - Examples: - PA_PA - FPA_FPA - """ - return "Established" - elif "ECO" in pre: - return "ICMP Echo" - elif "ECR" in pre: - return "ICMP Reply" - elif "URH" in pre: - return "ICMP Host Unreachable" - elif "URP" in pre: - return "ICMP Port Unreachable" - else: - """ - Examples: - S_RA - S_R - A_R - S_SA - SR_SA - FA_FA - SR_RA - SEC_RA - """ - return "Not Established" - except IndexError: - # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP - if "ECO" in pre: - # ICMP - return "Established" - elif "UNK" in pre: - # ICMP6 unknown upper layer - return "Established" - elif "CON" in pre: - # UDP - return "Established" - elif "INT" in pre: - # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there - # is no confirmation of what happened. - return "Not Established" - elif "EST" in pre: - # TCP - return "Established" - elif "RST" in pre: - # TCP. 
When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established - # It can happen that is reseted being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. - return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - elif "FIN" in pre: - # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established - # It can happen that is finished being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. - return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - else: - """ - Examples: - S_ - FA_ - PA_ - FSA_ - SEC_ - SRPA_ - """ - return "Not Established" - except Exception: - exception_line = sys.exc_info()[2].tb_lineno - self.print( - f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", - 0, - 1, - ) - self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ From 77c9a10ce27447cf87d9d4720132554bc1cb9f5c Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Mon, 29 Jul 2024 16:32:14 +0200 Subject: [PATCH 138/498] mlflow. Add a function to convert the state again --- modules/flowmldetection/flowmldetection.py | 135 +++++++++++++++++++++ 1 file changed, 135 insertions(+) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 3379f5077..f052931c8 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -121,6 +121,141 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) + + def get_final_state_from_flags(self, state, pkts): + """ + Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags + We receive the pakets to distinguish some Reset connections + """ + try: + pre = state.split("_")[0] + try: + # Try suricata states + """ + There are different states in which a flow can be. + Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, + these are: New, Established and Closed,for UDP only new and established. + For each of these states Suricata can employ different timeouts. + """ + if "new" in state or "established" in state: + return "Established" + elif "closed" in state: + return "Not Established" + + # We have varius type of states depending on the type of flow. 
+ # For Zeek + if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): + return "Not Established" + elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): + return "Established" + + # For Argus + suf = state.split("_")[1] + if "S" in pre and "A" in pre and "S" in suf and "A" in suf: + """ + Examples: + SA_SA + SR_SA + FSRA_SA + SPA_SPA + SRA_SPA + FSA_FSA + FSA_FSPA + SAEC_SPA + SRPA_SPA + FSPA_SPA + FSRPA_SPA + FSPA_FSPA + FSRA_FSPA + SRAEC_SPA + FSPA_FSRPA + FSAEC_FSPA + FSRPA_FSPA + SRPAEC_SPA + FSPAEC_FSPA + SRPAEC_FSRPA + """ + return "Established" + elif "PA" in pre and "PA" in suf: + # Tipical flow that was reported in the middle + """ + Examples: + PA_PA + FPA_FPA + """ + return "Established" + elif "ECO" in pre: + return "ICMP Echo" + elif "ECR" in pre: + return "ICMP Reply" + elif "URH" in pre: + return "ICMP Host Unreachable" + elif "URP" in pre: + return "ICMP Port Unreachable" + else: + """ + Examples: + S_RA + S_R + A_R + S_SA + SR_SA + FA_FA + SR_RA + SEC_RA + """ + return "Not Established" + except IndexError: + # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP + if "ECO" in pre: + # ICMP + return "Established" + elif "UNK" in pre: + # ICMP6 unknown upper layer + return "Established" + elif "CON" in pre: + # UDP + return "Established" + elif "INT" in pre: + # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there + # is no confirmation of what happened. + return "Not Established" + elif "EST" in pre: + # TCP + return "Established" + elif "RST" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established + # It can happen that is reseted being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. + return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + elif "FIN" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established + # It can happen that is finished being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. 
+ return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + else: + """ + Examples: + S_ + FA_ + PA_ + FSA_ + SEC_ + SRPA_ + """ + return "Not Established" + except Exception: + exception_line = sys.exc_info()[2].tb_lineno + self.print( + f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", + 0, + 1, + ) + self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ From 6e4841ffcffad999d14dcbc1354dbccc8f2cc546 Mon Sep 17 00:00:00 2001 From: alya Date: Tue, 30 Jul 2024 14:59:22 +0300 Subject: [PATCH 139/498] delete get_final_state_from_flags() from flowmldetection, profiler, and the db --- modules/flowmldetection/flowmldetection.py | 135 --------------------- 1 file changed, 135 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index f052931c8..3379f5077 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -121,141 +121,6 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) - - def get_final_state_from_flags(self, state, pkts): - """ - Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags - We receive the pakets to distinguish some Reset connections - """ - try: - pre = state.split("_")[0] - try: - # Try suricata states - """ - There are different states in which a flow can be. - Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, - these are: New, Established and Closed,for UDP only new and established. - For each of these states Suricata can employ different timeouts. - """ - if "new" in state or "established" in state: - return "Established" - elif "closed" in state: - return "Not Established" - - # We have varius type of states depending on the type of flow. - # For Zeek - if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): - return "Not Established" - elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): - return "Established" - - # For Argus - suf = state.split("_")[1] - if "S" in pre and "A" in pre and "S" in suf and "A" in suf: - """ - Examples: - SA_SA - SR_SA - FSRA_SA - SPA_SPA - SRA_SPA - FSA_FSA - FSA_FSPA - SAEC_SPA - SRPA_SPA - FSPA_SPA - FSRPA_SPA - FSPA_FSPA - FSRA_FSPA - SRAEC_SPA - FSPA_FSRPA - FSAEC_FSPA - FSRPA_FSPA - SRPAEC_SPA - FSPAEC_FSPA - SRPAEC_FSRPA - """ - return "Established" - elif "PA" in pre and "PA" in suf: - # Tipical flow that was reported in the middle - """ - Examples: - PA_PA - FPA_FPA - """ - return "Established" - elif "ECO" in pre: - return "ICMP Echo" - elif "ECR" in pre: - return "ICMP Reply" - elif "URH" in pre: - return "ICMP Host Unreachable" - elif "URP" in pre: - return "ICMP Port Unreachable" - else: - """ - Examples: - S_RA - S_R - A_R - S_SA - SR_SA - FA_FA - SR_RA - SEC_RA - """ - return "Not Established" - except IndexError: - # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP - if "ECO" in pre: - # ICMP - return "Established" - elif "UNK" in pre: - # ICMP6 unknown upper layer - return "Established" - elif "CON" in pre: - # UDP - return "Established" - elif "INT" in pre: - # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there - # is no confirmation of what happened. - return "Not Established" - elif "EST" in pre: - # TCP - return "Established" - elif "RST" in pre: - # TCP. 
When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established - # It can happen that is reseted being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. - return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - elif "FIN" in pre: - # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established - # It can happen that is finished being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. - return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - else: - """ - Examples: - S_ - FA_ - PA_ - FSA_ - SEC_ - SRPA_ - """ - return "Not Established" - except Exception: - exception_line = sys.exc_info()[2].tb_lineno - self.print( - f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", - 0, - 1, - ) - self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ From 0d8414e9937ce34aac0b6fbb7fe328c7d207ead6 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Sat, 15 Mar 2025 19:23:29 +0100 Subject: [PATCH 140/498] Re add function that alya added --- slips_files/core/database/database_manager.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/slips_files/core/database/database_manager.py b/slips_files/core/database/database_manager.py index b4b2128d3..e8ca3aaf6 100644 --- a/slips_files/core/database/database_manager.py +++ b/slips_files/core/database/database_manager.py @@ -613,6 +613,9 @@ def add_out_dns(self, *args, **kwargs): def add_port(self, *args, **kwargs): return self.rdb.add_port(*args, **kwargs) + def get_final_state_from_flags(self, *args, **kwargs): + return self.rdb.get_final_state_from_flags(*args, **kwargs) + def add_ips(self, *args, **kwargs): return self.rdb.add_ips(*args, **kwargs) From 97f86a5709fad2144a97f643b8381cd48f86b148 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Sat, 15 Mar 2025 19:25:03 +0100 Subject: [PATCH 141/498] delete sys --- modules/flowmldetection/flowmldetection.py | 1 - 1 file changed, 1 deletion(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 3379f5077..c06755a59 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -10,7 +10,6 @@ import json import traceback import warnings -import sys from slips_files.common.parsers.config_parser import ConfigParser from slips_files.common.slips_utils import utils From c468366fca89d0fe5d6d00ec8c660f62ed616b46 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Sat, 15 Mar 2025 19:27:23 +0100 Subject: [PATCH 142/498] Delete file that was deleted from develop --- slips_files/common/state_handler.py | 170 ---------------------------- 1 file changed, 170 deletions(-) delete mode 100644 slips_files/common/state_handler.py diff --git a/slips_files/common/state_handler.py b/slips_files/common/state_handler.py deleted file mode 100644 index d0a05115b..000000000 --- a/slips_files/common/state_handler.py +++ /dev/null @@ -1,170 +0,0 @@ -from typing import Optional - - -def interpret_suricata_states(state) -> Optional[str]: - """ - There are different states in which a flow can be. - Suricata distinguishes three flow-states for TCP and two for - UDP. 
For TCP, - these are: New, Established and Closed,for UDP only new and - established. - For each of these states Suricata can employ different timeouts. - """ - if "new" in state or "established" in state: - return "Established" - elif "closed" in state: - return "Not Established" - - -def interpret_zeek_states(state) -> Optional[str]: - # We have varius type of states depending on the type of flow. - # For Zeek - if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): - return "Not Established" - elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): - return "Established" - - -def interpret_argus_states(state) -> Optional[str]: - pre = state.split("_")[0] - try: - suf = state.split("_")[1] - except IndexError: - return - - if "S" in pre and "A" in pre and "S" in suf and "A" in suf: - """ - Examples: - SA_SA - SR_SA - FSRA_SA - SPA_SPA - SRA_SPA - FSA_FSA - FSA_FSPA - SAEC_SPA - SRPA_SPA - FSPA_SPA - FSRPA_SPA - FSPA_FSPA - FSRA_FSPA - SRAEC_SPA - FSPA_FSRPA - FSAEC_FSPA - FSRPA_FSPA - SRPAEC_SPA - FSPAEC_FSPA - SRPAEC_FSRPA - """ - return "Established" - elif "PA" in pre and "PA" in suf: - # Tipical flow that was reported in the middle - """ - Examples: - PA_PA - FPA_FPA - """ - return "Established" - elif "ECO" in pre: - return "ICMP Echo" - elif "ECR" in pre: - return "ICMP Reply" - elif "URH" in pre: - return "ICMP Host Unreachable" - elif "URP" in pre: - return "ICMP Port Unreachable" - else: - """ - Examples: - S_RA - S_R - A_R - S_SA - SR_SA - FA_FA - SR_RA - SEC_RA - """ - return "Not Established" - - -def interpret_tcp_states(state, pkts) -> Optional[str]: - pre = state.split("_")[0] - if "EST" in pre: - # TCP - return "Established" - elif "RST" in pre: - # TCP. When -z B is not used in argus, states are single words. - # Most connections are reseted when finished and therefore are - # established - # It can happen that is reseted being not established, but we - # can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is - # not established because the OS retries 3 times. - return "Not Established" if int(pkts) <= 3 else "Established" - elif "FIN" in pre: - # TCP. When -z B is not used in argus, states are single words. - # Most connections are finished with FIN when finished and - # therefore are established - # It can happen that is finished being not established, but we - # can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is - # not established because the OS retries 3 times. - return "Not Established" if int(pkts) <= 3 else "Established" - else: - """ - Examples: - S_ - FA_ - PA_ - FSA_ - SEC_ - SRPA_ - """ - return "Not Established" - - -def interpret_udp_states(state) -> Optional[str]: - pre = state.split("_")[0] - if "CON" in pre: - # UDP - return "Established" - elif "INT" in pre: - # UDP trying to connect, NOT preciselly not established but also - # NOT 'Established'. So we considered not established because there - # is no confirmation of what happened. 
- return "Not Established" - - -def interpret_icmp_states(state) -> Optional[str]: - pre = state.split("_")[0] - if "ECO" in pre: - # ICMP - return "Established" - elif "UNK" in pre: - # ICMP6 unknown upper layer - return "Established" - - -def get_final_state_from_flags(state, pkts) -> str: - """ - Converts the original flags from the flow, to a state that slips - understands - Works with Argus, suricata, and Bro flags - We receive the packets to distinguish some Reset connections - """ - - for interpreter in ( - interpret_suricata_states, - interpret_zeek_states, - interpret_argus_states, - interpret_icmp_states, - interpret_udp_states, - ): - if interpreted_state := interpreter(state): - return interpreted_state - - if interpreted_state := interpret_tcp_states(state, pkts): - return interpreted_state - - return "Not Established" From 952c3b29c7c6637bd78b66c1c2fc9a333f72a5d0 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Sat, 15 Mar 2025 19:32:01 +0100 Subject: [PATCH 143/498] Flowmldetection. Fix missing db reference --- modules/flowmldetection/flowmldetection.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index c06755a59..87e07c759 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -160,7 +160,7 @@ def process_features(self, dataset): # 'Not Established', it is still 'S0' and others # So transform here dataset["state"] = dataset.apply( - lambda row: get_final_state_from_flags( + lambda row: self.db.get_final_state_from_flags( row["state"], row["pkts"] ), axis=1, From e7af6dc61e06a5759e5c5ddfcfe3ffadcdf67fb6 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Tue, 18 Mar 2025 12:08:08 +0100 Subject: [PATCH 144/498] Fix the training of flows with ML in new version --- modules/flowmldetection/flowmldetection.py | 144 +++++++++++---------- 1 file changed, 77 insertions(+), 67 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 87e07c759..e91495d64 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -55,8 +55,12 @@ def init(self): # Set the output queue of our database instance # Read the configuration self.read_configuration() - # Minum amount of new lables needed to trigger the train - self.minimum_lables_to_retrain = 50 + # Minum amount of new labels needed to start the train + self.minimum_labels_to_start_train = 50 + # Minum amount of new labels needed to retrain + self.minimum_labels_to_retrain = 50 + # The number of flows when last trained + self.last_number_of_flows_when_trained = 0 # To plot the scores of training # self.scores = [] # The scaler trained during training and to use during testing @@ -67,26 +71,25 @@ def init(self): def read_configuration(self): conf = ConfigParser() self.mode = conf.get_ml_mode() + self.label = conf.label() def train(self): """ Train a model based on the flows we receive and the labels """ try: - # Process the labels to have only Normal and Malware - self.flows.label = self.flows.label.str.replace( - r"(^.*ormal.*$)", "Normal", regex=True - ) - self.flows.label = self.flows.label.str.replace( - r"(^.*alware.*$)", "Malware", regex=True - ) - self.flows.label = self.flows.label.str.replace( - r"(^.*alicious.*$)", "Malware", regex=True - ) + # Get the flows from the DB + # self.flows = self.db.get_all_flows_in_profileid_twid(self.profileid, self.twid) + # Convert to 
pandas df + # self.flows = pd.DataFrame(self.flows) + # Process the features + # X_flow = self.process_features(self.flows) - # Separate - y_flow = self.flows["label"] + # Create X_flow with the current flows minus the label X_flow = self.flows.drop("label", axis=1) + # Create y_flow with the label + y_flow = numpy.full(X_flow.shape[0], self.label) + # Drop the module_labels X_flow = X_flow.drop("module_labels", axis=1) # Normalize this batch of data so far. This can get progressivle slow @@ -95,7 +98,7 @@ def train(self): # Train try: self.clf.partial_fit( - X_flow, y_flow, classes=["Malware", "Normal"] + X_flow, y_flow, classes=["Malicious", "Benign"] ) except Exception: self.print("Error while calling clf.train()") @@ -118,7 +121,7 @@ def train(self): self.store_model() except Exception: - self.print("Error in train()", 0, 1) + self.print("Error in train().", 0, 1) self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): @@ -144,9 +147,7 @@ def process_features(self, dataset): "history", "uid", "dir_", - "dbytes", "endtime", - "bytes", "flow_source", ] for field in to_drop: @@ -161,11 +162,10 @@ def process_features(self, dataset): # So transform here dataset["state"] = dataset.apply( lambda row: self.db.get_final_state_from_flags( - row["state"], row["pkts"] + row["state"], (row["spkts"] + row["dpkts"]) ), axis=1, ) - # dataset.state = new_state_column # Convert state to categorical dataset.state = dataset.state.str.replace( @@ -199,7 +199,11 @@ def process_features(self, dataset): dataset.proto = dataset.proto.str.replace( r"(^.*arp.*$)", "4", regex=True ) - fields_to_convert_to_flow = [ + + dataset["allbytes"] = dataset["sbytes"] + dataset["dbytes"] + dataset["pkts"] = dataset["spkts"] + dataset["dpkts"] + + fields_to_convert_to_float = [ dataset.proto, dataset.dport, dataset.sport, @@ -210,10 +214,10 @@ def process_features(self, dataset): dataset.sbytes, dataset.state, ] - for field in fields_to_convert_to_flow: + for field in fields_to_convert_to_float: try: field = field.astype("float64") - except ValueError: + except (ValueError, AttributeError): pass return dataset @@ -222,9 +226,9 @@ def process_features(self, dataset): self.print("Error in process_features()") self.print(traceback.format_exc(), 0, 1) - def process_flows(self): + def process_training_flows(self): """ - Process all the flwos in the DB + Process all the flows in the DB Store the pandas df in self.flows """ try: @@ -240,44 +244,48 @@ def process_flows(self): # that are fake but representative of a normal and malware flow # they are only for the training process # At least 1 flow of each label is required - # self.print(f'Amount of labeled flows: {labels}', 0, 1) + + # These flows should be in the same format as the ones in the DB. + # Which means the satate is still SF, S0, etc. 
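+            # For example (illustrative values, not part of the original
+            # patch): a raw Zeek state of "SF" is later interpreted as
+            # "Established" by get_final_state_from_flags("SF", pkts),
+            # while a raw "S0" is interpreted as "Not Established".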
flows.append( { - "ts": 1594417039.029793, + "starttime": 1594417039.029793, "dur": "1.9424750804901123", "saddr": "10.7.10.101", "sport": "49733", "daddr": "40.70.224.145", "dport": "443", "proto": "tcp", - "state": "Established", - "allbytes": 42764, - "spkts": 37, + "state": "SF", + "spkts": 17, + "dpkts": 27, "sbytes": 25517, + "dbytes": 17247, "appproto": "ssl", - "label": "Malware", + "label": "Malicious", "module_labels": { - "flowalerts-long-connection": "Malware" + "flowalerts-long-connection": "Malicious" }, } ) flows.append( { - "ts": 1382355032.706468, + "starttime": 1382355032.706468, "dur": "10.896695", "saddr": "147.32.83.52", "sport": "47956", "daddr": "80.242.138.72", "dport": "80", "proto": "tcp", - "state": "Established", - "allbytes": 67696, + "state": "SF", "spkts": 1, + "dpkts": 0, "sbytes": 100, + "dbytes": 67596, "appproto": "http", - "label": "Normal", + "label": "Benign", "module_labels": { - "flowalerts-long-connection": "Normal" + "flowalerts-long-connection": "Benign" }, } ) @@ -318,7 +326,6 @@ def detect(self, x_flow) -> Optional[numpy.ndarray]: and returns the predection array """ try: - given_x_flow = x_flow # clean the flow fields_to_drop = [ "label", @@ -326,10 +333,7 @@ def detect(self, x_flow) -> Optional[numpy.ndarray]: "uid", "history", "dir_", - "dbytes", - "dpkts", "endtime", - "bytes", "flow_source", "ground_truth_label", # todo now we can use them "detailed_ground_truth_label", @@ -345,7 +349,7 @@ def detect(self, x_flow) -> Optional[numpy.ndarray]: return pred except Exception as e: self.print( - f"Error in detect() while processing " f"\n{given_x_flow}\n{e}" + f"Error in detect() while processing " f"\n{x_flow}\n{e}" ) self.print(traceback.format_exc(), 0, 1) @@ -437,18 +441,16 @@ def pre_main(self): def main(self): if msg := self.get_msg("new_flow"): + # When a new flow arrives msg = json.loads(msg["data"]) - twid = msg["twid"] + self.twid = msg["twid"] + self.profileid = msg["profileid"] self.flow = msg["flow"] - # these fields are expected in testing. update the original + # These following extra fields are expected in testing. update the original # flow dict to have them self.flow.update( { - "allbytes": (self.flow["sbytes"] + self.flow["dbytes"]), - # the flow["state"] is the origstate, we dont need that here - # we need the interpreted state "state": msg["interpreted_state"], - "pkts": self.flow["spkts"] + self.flow["dpkts"], "label": msg["label"], "module_labels": msg["module_labels"], } @@ -461,23 +463,31 @@ def main(self): # Use labeled flows labels = self.db.get_labels() sum_labeled_flows = sum(i[1] for i in labels) + + # The min labels to retrain is the min number of flows + # we should have seen so far in this capture to start training + # This is so we dont _start_ training with only 1 flow + + # Once we are over the start minimum, the second condition is + # to force to retrain every a minimum_labels_to_retrain number + # of flows. So we dont retrain every 1 flow. if ( - sum_labeled_flows >= self.minimum_lables_to_retrain - and sum_labeled_flows % self.minimum_lables_to_retrain == 1 + sum_labeled_flows >= self.minimum_labels_to_start_train ): - # We get here every 'self.minimum_lables_to_retrain' - # amount of labels - # So for example we retrain every 100 labels and only when - # we have at least 100 labels - self.print( - f"Training the model with the last group of " - f"flows and labels. Total flows: {sum_labeled_flows}." 
- ) - # Process all flows in the DB and make them ready - # for pandas - self.process_flows() - # Train an algorithm - self.train() + if (sum_labeled_flows - self.last_number_of_flows_when_trained >= self.minimum_labels_to_retrain): + # So for example we retrain every 50 labels and only when + # we have at least 50 labels + self.print( + f"Training the model with the last group of " + f"flows and labels. Total flows: {sum_labeled_flows}." + ) + # Process all flows in the DB and make them ready + # for pandas + self.process_training_flows() + # Train an algorithm + self.train() + self.last_number_of_flows_when_trained = sum_labeled_flows + elif self.mode == "test": # We are testing, which means using the model to detect processed_flow = self.process_flow(self.flow) @@ -497,8 +507,8 @@ def main(self): # and the label is diff from the prediction, # print in debug mode self.print( - f"Report Prediction {pred[0]} for label" - f' {label} flow {self.flow["saddr"]}:' + f"Predicted {pred[0]} for ground-truth label" + f' {label}. Flow {self.flow["saddr"]}:' f'{self.flow["sport"]} ->' f' {self.flow["daddr"]}:' f'{self.flow["dport"]}/' @@ -506,9 +516,9 @@ def main(self): 0, 3, ) - if pred[0] == "Malware": + if pred[0] == "Malicious": # Generate an alert - self.set_evidence_malicious_flow(self.flow, twid) + self.set_evidence_malicious_flow(self.flow, self.twid) self.print( f"Prediction {pred[0]} for label {label}" f' flow {self.flow["saddr"]}:' From 1dbde99abda8734e06222b1149806e1b626d2602 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Tue, 18 Mar 2025 12:08:29 +0100 Subject: [PATCH 145/498] Fix the profiler handler for cases of nan in state --- .../core/database/redis_db/profile_handler.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/slips_files/core/database/redis_db/profile_handler.py b/slips_files/core/database/redis_db/profile_handler.py index 0489372cd..1ea764464 100644 --- a/slips_files/core/database/redis_db/profile_handler.py +++ b/slips_files/core/database/redis_db/profile_handler.py @@ -379,7 +379,12 @@ def get_final_state_from_flags(self, state, pkts): We receive the pakets to distinguish some Reset connections """ try: - pre = state.split("_")[0] + # In some flows the state is a nan + try: + pre = state.split("_")[0] + except AttributeError: + pre = '' + try: # Try suricata states """ @@ -401,7 +406,11 @@ def get_final_state_from_flags(self, state, pkts): return "Established" # For Argus - suf = state.split("_")[1] + # In some flows the state is a nan + try: + suf = state.split("_")[1] + except AttributeError: + suf = '' if "S" in pre and "A" in pre and "S" in suf and "A" in suf: """ Examples: @@ -502,7 +511,7 @@ def get_final_state_from_flags(self, state, pkts): except Exception: exception_line = sys.exc_info()[2].tb_lineno self.print( - f"Error in getFinalStateFromFlags() in database.py line {exception_line}", + f"Error in get_final_state_from_flags() in profile_handler.py line {exception_line}", 0, 1, ) From 2c3a9eb2363c14e89d08bda2d8f7698c41f148a3 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Wed, 19 Mar 2025 14:20:42 +0100 Subject: [PATCH 146/498] slips.yaml. Update to have correct labels. By default test. 
 Default training label is Benign
---
 config/slips.yaml | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/config/slips.yaml b/config/slips.yaml
index f7089b41a..8736eaf51 100644
--- a/config/slips.yaml
+++ b/config/slips.yaml
@@ -106,13 +106,12 @@ parameters:
   deletePrevdb: true
 
   # Set the label for all the flows that are being read.
-  # For now only normal and malware directly. No option for setting labels
-  # with a filter
+  # For now only Benign and Malicious (Capitalized)
   # The purpose is to be used in the training of ML models and to output
   # flows with labels for other tools.
-  # label: malicious
-  # label: unknown
-  label: normal
+  # label: Malicious
+  # label: Benign
+  label: Benign
   # If Zeek files are rotated or not to avoid running out of disk.
   # Zeek rotation is enabled by default when using an interface,
   # which means Slips will delete all Zeek log files after 1 day

From 38bdc30b059f670dde6817a575504f7f308f9ad0 Mon Sep 17 00:00:00 2001
From: Sebas Garcia
Date: Wed, 19 Mar 2025 14:21:21 +0100
Subject: [PATCH 147/498] First ipython notebook to test ML flow related models

---
 modules/flowmldetection/flowmlanalysis.ipynb | 76 ++++++++++++++++++++
 1 file changed, 76 insertions(+)
 create mode 100644 modules/flowmldetection/flowmlanalysis.ipynb

diff --git a/modules/flowmldetection/flowmlanalysis.ipynb b/modules/flowmldetection/flowmlanalysis.ipynb
new file mode 100644
index 000000000..d726cd280
--- /dev/null
+++ b/modules/flowmldetection/flowmlanalysis.ipynb
@@ -0,0 +1,76 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Analysis of Flows with Machine Learning for Slips"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Analysis of a fixed list of flows to try techniques and find parameters"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import numpy\n",
+    "from sklearn.linear_model import SGDClassifier\n",
+    "from sklearn.preprocessing import StandardScaler\n",
+    "import pickle\n",
+    "import pandas as pd\n",
+    "import json\n",
+    "import traceback\n",
+    "import warnings"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "slips-new",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.12"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}

From 8b2e850f150389ad00d4c10d65abd7c94f5b58fb Mon Sep 17 00:00:00 2001
From: Sebas Garcia
Date: Wed, 19 Mar 2025 14:22:38 +0100
Subject: [PATCH 148/498] flowml. If the dataset has one flow and that flow is
 discarded, then return empty fast.
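
A hedged sketch of the shape of this guard (illustrative only: the
discard list is assumed from the module's to_discard, and this is not
the literal hunk):

    import pandas as pd

    def keep_supported_protos(dataset: pd.DataFrame) -> pd.DataFrame:
        # Protocols the model does not use are filtered out first.
        for proto in ("arp", "ARP", "icmp", "igmp"):
            dataset = dataset[dataset.proto != proto]
        return dataset

    batch = pd.DataFrame([{"proto": "arp", "dur": 0.1}])
    batch = keep_supported_protos(batch)
    if batch.empty:
        # A one-flow batch whose only flow was discarded: nothing is
        # left to featurize, so the caller returns the empty frame early.
        print("empty dataset, returning early")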
---
 modules/flowmldetection/flowmldetection.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index e91495d64..58b4ce1e4 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -135,6 +135,11 @@ def process_features(self, dataset):
             for proto in to_discard:
                 dataset = dataset[dataset.proto != proto]
 
+            # If the proto is in the list to discard and there was only one flow, the dataset will now be empty
+            if dataset.empty:
+                # DataFrame is empty now, so return empty
+                return dataset
+
             # For now, discard these
             to_drop = [
                 "appproto",

From b179fac4ade82c0d1716ad13428a606e25f4fae9 Mon Sep 17 00:00:00 2001
From: Sebas Garcia
Date: Wed, 19 Mar 2025 14:23:05 +0100
Subject: [PATCH 149/498] flowml. If the dataset is empty, return None

---
 modules/flowmldetection/flowmldetection.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 58b4ce1e4..4a4d46e37 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -318,6 +318,8 @@ def process_flow(self, flow_to_process: dict):
             # Convert the flow to a pandas dataframe
             raw_flow = pd.DataFrame(flow_to_process, index=[0])
             dflow = self.process_features(raw_flow)
+            if dflow.empty:
+                return None
             # Update the flow to the processed version
             return dflow
         except Exception:

From dd98ff1307fc64e517d3eff4a80301e6be8dd1e3 Mon Sep 17 00:00:00 2001
From: Sebas Garcia
Date: Wed, 19 Mar 2025 14:26:42 +0100
Subject: [PATCH 150/498] profile_handler. Small bug in how we handled the
 states: we were using 'in' instead of '==' for established, so some
 not-established flows MAY not have been correctly captured

---
 slips_files/core/database/redis_db/profile_handler.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/slips_files/core/database/redis_db/profile_handler.py b/slips_files/core/database/redis_db/profile_handler.py
index 1ea764464..85fdec5a6 100644
--- a/slips_files/core/database/redis_db/profile_handler.py
+++ b/slips_files/core/database/redis_db/profile_handler.py
@@ -393,9 +393,10 @@ def get_final_state_from_flags(self, state, pkts):
                 these are: New, Established and Closed,for UDP only new and established.
                 For each of these states Suricata can employ different timeouts.
                 """
-                if "new" in state or "established" in state:
+                # This is controversial, but if we don't have a good state, we consider it not established for now
+                if "new" in state or state.lower() == "established":
                     return "Established"
-                elif "closed" in state:
+                elif "closed" in state or state.lower() == 'not established':
                     return "Not Established"
 
                 # We have varius type of states depending on the type of flow.
@@ -406,7 +407,6 @@ def get_final_state_from_flags(self, state, pkts):
                     return "Established"
 
                 # For Argus
-                # In some flows the state is a nan
                 try:
                     suf = state.split("_")[1]
                 except AttributeError:

From 4de77d6fcb7a8acf5e2a1510950e28a285084344 Mon Sep 17 00:00:00 2001
From: Sebas Garcia
Date: Wed, 19 Mar 2025 14:27:16 +0100
Subject: [PATCH 151/498] First new version of the model and scaler. Not good
 yet, but working.
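
The binary diffs below replace model.bin and scaler.bin with the
retrained artifacts. store_model() itself is not shown in this part of
the series, so the following is only a hedged sketch of how such
artifacts are typically persisted, assuming the usual pickle round-trip
(file names from this module; the loader is hypothetical):

    import pickle

    def store_model(clf, scaler):
        # Persist the classifier and the scaler side by side, so that
        # testing can normalize flows exactly the way training did.
        with open("modules/flowmldetection/model.bin", "wb") as f:
            pickle.dump(clf, f)
        with open("modules/flowmldetection/scaler.bin", "wb") as f:
            pickle.dump(scaler, f)

    def read_model():
        # Hypothetical loader mirroring store_model().
        with open("modules/flowmldetection/model.bin", "rb") as f:
            clf = pickle.load(f)
        with open("modules/flowmldetection/scaler.bin", "rb") as f:
            scaler = pickle.load(f)
        return clf, scaler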
--- modules/flowmldetection/model.bin | Bin 1073 -> 1090 bytes modules/flowmldetection/scaler.bin | Bin 666 -> 890 bytes 2 files changed, 0 insertions(+), 0 deletions(-) diff --git a/modules/flowmldetection/model.bin b/modules/flowmldetection/model.bin index aef4cba35b7b18287b2be11df2c45e9187d053e0..7ea3d27c97ca27abe7bcef4f35f2057da4bf2b0f 100644 GIT binary patch delta 411 zcmdnUafpMZfn}=sMwVTSLVS7gX{m`NrA4X5@tJw?Q@pt+?`4dsSMFg=DXA<-oice! z4^yb+l+KR!DM3>-yqS9#ZKnA7`FX2&vv_lqOiAi=_HbfgU`PexJRr^l;&dn*NWpQXRJ|Emr(~!@tY-9P0&;YE zn9?i|&dD&Ce2~eg9z!XkH+S2V4C_wkh?WyPJF_Dc9WKfLlU4d@b|Cws{>t}{)DLw1 z+&}AzmCAv&TP(zH?>}O{hvWaISvjr_FKvFL2{5=iNW7SLz3Rc`{Vb{$-4P-p4qe>8 zsxB_naJaLsc#$rfl>^wdT$y<#sYS_psRbqRQ!+4J0dz3X4IB|u^h2FOBOrSC<0o%q b_GNYGp25H{nVChIi`7uiOwVviX_6iQocV_5 delta 380 zcmX@av5|wNfn}=NMwVTSlP@wxCo1)@rj%3`q)wSUrH3ihd`f3W`;?$58s5x3j5bsJ z{QSISyji?CN~R=rI$QWMFfb$naSjldL)k?@Iu(fhfH)tBLHt}GhN)wiJeNsPO)*2o zo1t||h8o0hMsFq{N4tk9%>rRzhW_LyOh)w>N*TR5+ps7Gy3Ol95P%792Cxe~W+qIF za^V+n06G8#1cs%Z;j-Fjug&$3=gY2%`}>OzXw^50I4Ib2ygQ=8?eISD+{&-(1rNNN zW*@ diff --git a/modules/flowmldetection/scaler.bin b/modules/flowmldetection/scaler.bin index 9292bda6a6f307edcbe83de0ccfac2437950fa6f..bfba4d107224e5e6e5a1e8c8f4d463b48131d111 100644 GIT binary patch literal 890 zcmZo*nVQec00uqM#o0NjiA8yO1x2X^Mfu68#l@L<>3Z=gi6x0sdia7%67y0Li&BD< z6LV6Frc9oq**K+kibfA-d1gsQd~RxD-jt3W4lt*|B)5DrlQdy7+(!%6#F{QHuBFo&v zSTn`X&(G^W5P%79`;sY1ov}cNGpCdm0WAP}wIm-%7Z>Cgl>q4!C|z8TT~Z9@gK4(n zq{@<1pa5G6l)+kDl30>DrH3OiCkHCT0@jlXw4sMDFCOY|h?l*&d-(I>ixYDTa)5zQ zoSF&@h78Ugwqjs#CW3-9L$rq}6UF5=Xf9^}n(NGw#R!S7DQ!Ui_ppKj9%vGG1|KL~ zG6axA1!!U#l8G7ms38F|uhTi=d{gv8|4E7tk+E%iFU@~!UvXb5a&PSo2flmT-L8E; z8bB?zwe6@e~Ltx1rmTHK2M45x=S>Y_k5fcAZRq4P``?^7;27=%1`Z_~Q$U{zGVUTcbp z!_f@A=ox0(4vo(kWe&dF=y3Qof57>#YaOosUa>d6=$d`i{)GMC=j?EBVaVAqgWVBg dy+AxPN5z+=78Pga=K-V3P|r-ya7t;C9st`jc!2-_ delta 525 zcmeyxHj9S2q>4Eng}#5gR6%pz9=<0Kd-o?s5H3*s5pan;tH91&ekaz{5?!*7E?Mq z+JPE0yqPnE{rvpA{sRG+@Mb8PlGN$!ajPlrTH5mhSAF{H=RwvmLf> z0f8s>T))|-e^~LtKJ)mxuX}VF9GJI$RyS%Ec8u45-?98hw!_K^*XJ>X{BW28G_HrG zEU^gaDyjMm8E*!l6XYOvGkSyEqXM*T;RVCs$??LDiHthAxexX@M(tU$UF%YfQ^>gH&y>a4SnPazubB5!cqc1YnCGT))m9-o7^A!Lv41W!_hYO<*VziI$XJV#W|*OtHau9f5TQzy=Sjs!`l%1;+FkJ z-k|Geeq3|-Iw{uqQ%0WS6}i^>);WhA{C@WaM)rg_g6$KCFV4d&dkpP QMw@}2k%^wsl+q+U0P2|NZU6uP From 03e408119146041e40ad6c29370f1694eb1e40a8 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Thu, 20 Mar 2025 13:16:06 +0100 Subject: [PATCH 152/498] model and scaler with 1 malicious and 1 benign --- modules/flowmldetection/model.bin | Bin 1090 -> 1124 bytes modules/flowmldetection/scaler.bin | Bin 890 -> 890 bytes 2 files changed, 0 insertions(+), 0 deletions(-) diff --git a/modules/flowmldetection/model.bin b/modules/flowmldetection/model.bin index 7ea3d27c97ca27abe7bcef4f35f2057da4bf2b0f..0fac693b39f8e2f0e826471e72a52010709a2a4a 100644 GIT binary patch delta 132 zcmX@a@q~k=fn{psMix!x$(NZ_BO`RCb>5jSYK$N!g2|oC+8oRs(l>gtTsw(FScJ$7xocP4|5 zn$$WZz^$}67u&zYKfDP4vYKo~KfmcQ~I?m(e2&@mlG zqd?6RN*?ZoGC;4up#2u+D?s5YufU*hus|2)Nzwn}vOwP`V1tl2?mv8p{76u#!9bEa zjm~^LK$BYnCRf}Ju^9H~BS8=X&<+7gXFor!l)9lJXF)UUPqWf%b3r!tJEWTtnm`Ac zf^|*Gc0elJHxDoXoPgl=^Tj5WVAwXGOhP~3C<3D)c8d<%4*gy{6w?@|U z%s@r+gj&Fu5J2_kY{!Qn^*`?T5Tx7$dq73=gj&E@c0d))6KVlt!ao`IN`KC-h(JOu zU`*oR6_Z;6CRfo>BRz3ectB?@J_D&+&OhD+nN>A{w?PryCGc Date: Thu, 20 Mar 2025 13:16:27 +0100 Subject: [PATCH 153/498] cleaner jupyter --- modules/flowmldetection/flowmldetection.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/modules/flowmldetection/flowmldetection.py 
b/modules/flowmldetection/flowmldetection.py index 4a4d46e37..d8e9ada27 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -345,6 +345,23 @@ def detect(self, x_flow) -> Optional[numpy.ndarray]: "ground_truth_label", # todo now we can use them "detailed_ground_truth_label", ] + # For argus binetflows this fails because ther is a field calle bytes that was not in other flows. It should be called allbytes. + # Error + ''' [Flow ML Detection] Error in detect() while processing + dur proto sport dport state pkts spkts dpkts bytes sbytes dbytes allbytes + 0 63.822830 0 56119 981 0.0 15 15 0 8764 1887 0 1887 + The feature names should match those that were passed during fit. + Feature names unseen at fit time: + - bytes + ''' + + # IF we delete here the filed bytes the error is + # [Flow ML Detection] Error in detect() while processing + # dur proto sport dport state pkts spkts dpkts sbytes dbytes allbytes + # 0 63.822830 0 56120 980 0.0 15 15 0 1887 0 1887 + # The feature names should match those that were passed during fit. + # Feature names must be in the same order as they were in fit. + for field in fields_to_drop: try: x_flow = x_flow.drop(field, axis=1) From 17ccb096b61ea71a780e182a2bb0626985e4c755 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Thu, 20 Mar 2025 22:26:27 +0100 Subject: [PATCH 154/498] New models after 3rd ttrain --- modules/flowmldetection/model.bin | Bin 1124 -> 1124 bytes modules/flowmldetection/scaler.bin | Bin 890 -> 890 bytes 2 files changed, 0 insertions(+), 0 deletions(-) diff --git a/modules/flowmldetection/model.bin b/modules/flowmldetection/model.bin index 0fac693b39f8e2f0e826471e72a52010709a2a4a..5ebcb37205a7a3e8d0a20ab78a1219f0ddf95dd8 100644 GIT binary patch delta 99 zcmaFD@q}YTFtfkevYaFSxkd-_3;rCozh!)2lYmpEbEdGvA?N`Qv**dmWkY#6EnbNU>V$pfHc5?(6z~90v14#>uEKZXcoY2J@~Q; zm@vT916v-#h9y9THMMtpw*Eia3Y9($*ABq#7l@Rev)(_(4lO-NtuB+I1CayN#ekDG F1TN2UERg^J diff --git a/modules/flowmldetection/scaler.bin b/modules/flowmldetection/scaler.bin index 758909b289238ff282b2e056a9b3e83768b8472a..821344a0c69d116622b02e2a0daa1554cb5d308e 100644 GIT binary patch delta 43 zcmV+`0M!5b2KolDfdU!c4zU>a=p#W80?-ZtN@qVmt(3Za=p#W80?-ZtN@qVmt(3Z Date: Wed, 26 Mar 2025 00:08:50 +0100 Subject: [PATCH 155/498] Models after 4th train --- modules/flowmldetection/model.bin | Bin 1124 -> 1124 bytes modules/flowmldetection/scaler.bin | Bin 890 -> 890 bytes 2 files changed, 0 insertions(+), 0 deletions(-) diff --git a/modules/flowmldetection/model.bin b/modules/flowmldetection/model.bin index 5ebcb37205a7a3e8d0a20ab78a1219f0ddf95dd8..3ab5a240bb45f88d026d1d9d1959cfa384e2473b 100644 GIT binary patch delta 120 zcmV-;0EhqN2;>N`Qv+ChN@}Q2`wPIHjIJ0X6&b*sXgPyxz!<>7-o1V`boIY<&cGv* zMA;T^cz6USY()1i3vc*(ICW>9REM~el3OL8yLVz+8BZ7+Y~^P as{<PghU~{@&OfB0!%%<|y61;y>t*IZ8K{*+AXCvU*qB z%0Pe2QM!F)5kR-EtllLL_CF>M2ApwSd_Z`~g8^q;cR(Gb+gDca!aq;!#7^woi9l`m z{4q!A7L!{7CRc2&pGU{qctGGXl*-2r&p+4h*(lZXw?Qu&=iylTwLyoc9%Oz?d_V=A za=mcAXh1m7B9yUmbwL&Oq6=*NZ9yf7dx}OPz(0XAuDu67ghArqkBr`9lats2C|9(o zHuM(pGeE6wqA`7j-#@s^-O-hk(?CdW8YGjz(m<`3N;vgKH9+Ch9fX%7EI^{)^od!9 ovOtBo|Dy%^u0ZgT2LsNy);|K#Ksj}DxjML6cVlBO2|D9RL6T delta 290 zcmV+-0p0%k2KolDvjGBo1(US_Pgl4iHtM!%U_hWRf%FVpXFo>fmcQ~I?m(e2&@mlG zqd?6RN*?ZoGC;4up#2u+D?s5YufU*hus|2)Nzwn}vOwP`V1tl2?mv8p{76u#!9bEa zjm~^LK$BYnCRg4Lu^9H~BS8=X&<+7gXFor!l)9lJXF)RTPqWf%b3r!tJEWTtnm`Ac zf^|*Gc0elJHxDoXoty4FCWD From b58ca823684868b27c4357999e55d97d2b75ad4a Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Wed, 26 Mar 2025 08:28:59 +0100 Subject: 
[PATCH 156/498] Models of ml flow with the first good performance in small tests --- modules/flowmldetection/model.bin | Bin 1124 -> 1124 bytes modules/flowmldetection/scaler.bin | Bin 890 -> 890 bytes 2 files changed, 0 insertions(+), 0 deletions(-) diff --git a/modules/flowmldetection/model.bin b/modules/flowmldetection/model.bin index 3ab5a240bb45f88d026d1d9d1959cfa384e2473b..a6648cf72179520975b0e9ad1164f7d574e87140 100644 GIT binary patch delta 121 zcmV-<0EYkM2;>N`Qv+C~&P*9hb`!w*mV~|wLl?l5mFM4w$NRqlOAcXmpyfaAG1(); zYA3+ulRv$a;~zkIU#E>ocI-ba#L|>%Hv~ZN`Y?b6^limYW1McwvlQsk{8#y@u delta 121 zcmV-<0EYkM2;>N`Qv+ChN@}Q2`wPIHjIJ0X6&b*sXgPyxz!<>7-o1V`boIY<&cGv* zMA;T^cz6USY()1i3vc*(ICW>9REM~el3OL8yLVz+8BZ7+Y~^P bs{<fmcQ~I?m(e2&@mlG zqd?6RN*?ZoGC;4up#2u+D?s5YufU*hus|2)Nzwn}vOwP`V1tl2?mv8p{76u#!9bEa zjm~^LK$BYnCRf}Ju^9H~BS8=X&<+7gXFor!l)9lJXF)UUPqWf%b3r-wJEWTtnm`Ac zf^|*Gc0elJHxDoXoPghU~{@&OfB0!%%<|y61;y>t*IZ8K{*+AXCvU*qB z%0Pe2QM!F)5kR-EtllLL_CF>M2ApwSd_Z`~g8^q;cR(Gb+gDca!aq;!#7^woi9l`m z{4q!A7L!{7CRc2&pGU{qctGGXl*-2r&p+4h*(lZXw?Qu&=iylTwLyoc9%Oz?d_V=A za=mcAXh1m7B9yUmbwL&Oq6=*NZ9yf7dx}OPz(0XAuDu67ghArqkBr`9lats2C|9(o zHuM(pGeE6wqA`7j-#@s^-O-hk(?CdW8YGjz(m<`3N;vgKH9+Ch9fX%7EI^{)^od!9 ovOtBo|Dy%^u0ZgT2LsNy);|K#Ksj}DxjML6cVlBO2|D9RL6T From cdfd04f0667647f4d3d4a47bb56d7f6d7edc00d6 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Mon, 29 Jul 2024 16:32:14 +0200 Subject: [PATCH 157/498] mlflow. Add a function to convert the state again --- modules/flowmldetection/flowmldetection.py | 317 +++++++++++++-------- 1 file changed, 206 insertions(+), 111 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index d8e9ada27..8917fef6a 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -10,6 +10,7 @@ import json import traceback import warnings +import sys from slips_files.common.parsers.config_parser import ConfigParser from slips_files.common.slips_utils import utils @@ -55,12 +56,8 @@ def init(self): # Set the output queue of our database instance # Read the configuration self.read_configuration() - # Minum amount of new labels needed to start the train - self.minimum_labels_to_start_train = 50 - # Minum amount of new labels needed to retrain - self.minimum_labels_to_retrain = 50 - # The number of flows when last trained - self.last_number_of_flows_when_trained = 0 + # Minum amount of new lables needed to trigger the train + self.minimum_lables_to_retrain = 50 # To plot the scores of training # self.scores = [] # The scaler trained during training and to use during testing @@ -71,25 +68,26 @@ def init(self): def read_configuration(self): conf = ConfigParser() self.mode = conf.get_ml_mode() - self.label = conf.label() def train(self): """ Train a model based on the flows we receive and the labels """ try: - # Get the flows from the DB - # self.flows = self.db.get_all_flows_in_profileid_twid(self.profileid, self.twid) - # Convert to pandas df - # self.flows = pd.DataFrame(self.flows) - # Process the features - # X_flow = self.process_features(self.flows) + # Process the labels to have only Normal and Malware + self.flows.label = self.flows.label.str.replace( + r"(^.*ormal.*$)", "Normal", regex=True + ) + self.flows.label = self.flows.label.str.replace( + r"(^.*alware.*$)", "Malware", regex=True + ) + self.flows.label = self.flows.label.str.replace( + r"(^.*alicious.*$)", "Malware", regex=True + ) - # Create X_flow with the current flows minus the label + # Separate + y_flow = 
self.flows["label"] X_flow = self.flows.drop("label", axis=1) - # Create y_flow with the label - y_flow = numpy.full(X_flow.shape[0], self.label) - # Drop the module_labels X_flow = X_flow.drop("module_labels", axis=1) # Normalize this batch of data so far. This can get progressivle slow @@ -98,7 +96,7 @@ def train(self): # Train try: self.clf.partial_fit( - X_flow, y_flow, classes=["Malicious", "Benign"] + X_flow, y_flow, classes=["Malware", "Normal"] ) except Exception: self.print("Error while calling clf.train()") @@ -121,7 +119,142 @@ def train(self): self.store_model() except Exception: - self.print("Error in train().", 0, 1) + self.print("Error in train()", 0, 1) + self.print(traceback.format_exc(), 0, 1) + + def get_final_state_from_flags(self, state, pkts): + """ + Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags + We receive the pakets to distinguish some Reset connections + """ + try: + pre = state.split("_")[0] + try: + # Try suricata states + """ + There are different states in which a flow can be. + Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, + these are: New, Established and Closed,for UDP only new and established. + For each of these states Suricata can employ different timeouts. + """ + if "new" in state or "established" in state: + return "Established" + elif "closed" in state: + return "Not Established" + + # We have varius type of states depending on the type of flow. + # For Zeek + if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): + return "Not Established" + elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): + return "Established" + + # For Argus + suf = state.split("_")[1] + if "S" in pre and "A" in pre and "S" in suf and "A" in suf: + """ + Examples: + SA_SA + SR_SA + FSRA_SA + SPA_SPA + SRA_SPA + FSA_FSA + FSA_FSPA + SAEC_SPA + SRPA_SPA + FSPA_SPA + FSRPA_SPA + FSPA_FSPA + FSRA_FSPA + SRAEC_SPA + FSPA_FSRPA + FSAEC_FSPA + FSRPA_FSPA + SRPAEC_SPA + FSPAEC_FSPA + SRPAEC_FSRPA + """ + return "Established" + elif "PA" in pre and "PA" in suf: + # Tipical flow that was reported in the middle + """ + Examples: + PA_PA + FPA_FPA + """ + return "Established" + elif "ECO" in pre: + return "ICMP Echo" + elif "ECR" in pre: + return "ICMP Reply" + elif "URH" in pre: + return "ICMP Host Unreachable" + elif "URP" in pre: + return "ICMP Port Unreachable" + else: + """ + Examples: + S_RA + S_R + A_R + S_SA + SR_SA + FA_FA + SR_RA + SEC_RA + """ + return "Not Established" + except IndexError: + # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP + if "ECO" in pre: + # ICMP + return "Established" + elif "UNK" in pre: + # ICMP6 unknown upper layer + return "Established" + elif "CON" in pre: + # UDP + return "Established" + elif "INT" in pre: + # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there + # is no confirmation of what happened. + return "Not Established" + elif "EST" in pre: + # TCP + return "Established" + elif "RST" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established + # It can happen that is reseted being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. + return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + elif "FIN" in pre: + # TCP. 
When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established + # It can happen that is finished being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. + return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + else: + """ + Examples: + S_ + FA_ + PA_ + FSA_ + SEC_ + SRPA_ + """ + return "Not Established" + except Exception: + exception_line = sys.exc_info()[2].tb_lineno + self.print( + f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", + 0, + 1, + ) self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): @@ -135,11 +268,6 @@ def process_features(self, dataset): for proto in to_discard: dataset = dataset[dataset.proto != proto] - # If te proto is in the list to delete and there is only one flow, then the dataset will be empty - if dataset.empty: - # DataFrame is empty now, so return empty - return dataset - # For now, discard these to_drop = [ "appproto", @@ -152,7 +280,9 @@ def process_features(self, dataset): "history", "uid", "dir_", + "dbytes", "endtime", + "bytes", "flow_source", ] for field in to_drop: @@ -161,16 +291,12 @@ def process_features(self, dataset): except (ValueError, KeyError): pass - # When flows are read from Slips sqlite, - # the state is not transformed to 'Established' or - # 'Not Established', it is still 'S0' and others + # When flows are read from Slips sqlite, the state is not transformed to 'Established' or 'Not Established', it is still 'S0' and others # So transform here - dataset["state"] = dataset.apply( - lambda row: self.db.get_final_state_from_flags( - row["state"], (row["spkts"] + row["dpkts"]) - ), - axis=1, - ) + #new_state_column = dataset.state.apply(self.get_final_state_from_flags(dataset.state, dataset.pkts)) + dataset['state'] = dataset.apply(lambda row: self.get_final_state_from_flags(row['state'], row['pkts']), axis=1) + + #dataset.state = new_state_column # Convert state to categorical dataset.state = dataset.state.str.replace( @@ -204,11 +330,7 @@ def process_features(self, dataset): dataset.proto = dataset.proto.str.replace( r"(^.*arp.*$)", "4", regex=True ) - - dataset["allbytes"] = dataset["sbytes"] + dataset["dbytes"] - dataset["pkts"] = dataset["spkts"] + dataset["dpkts"] - - fields_to_convert_to_float = [ + fields_to_convert_to_flow = [ dataset.proto, dataset.dport, dataset.sport, @@ -219,10 +341,10 @@ def process_features(self, dataset): dataset.sbytes, dataset.state, ] - for field in fields_to_convert_to_float: + for field in fields_to_convert_to_flow: try: field = field.astype("float64") - except (ValueError, AttributeError): + except ValueError: pass return dataset @@ -231,9 +353,9 @@ def process_features(self, dataset): self.print("Error in process_features()") self.print(traceback.format_exc(), 0, 1) - def process_training_flows(self): + def process_flows(self): """ - Process all the flows in the DB + Process all the flwos in the DB Store the pandas df in self.flows """ try: @@ -249,48 +371,44 @@ def process_training_flows(self): # that are fake but representative of a normal and malware flow # they are only for the training process # At least 1 flow of each label is required - - # These flows should be in the same format as the ones in the DB. - # Which means the satate is still SF, S0, etc. 
+ # self.print(f'Amount of labeled flows: {labels}', 0, 1) flows.append( { - "starttime": 1594417039.029793, + "ts": 1594417039.029793, "dur": "1.9424750804901123", "saddr": "10.7.10.101", "sport": "49733", "daddr": "40.70.224.145", "dport": "443", "proto": "tcp", - "state": "SF", - "spkts": 17, - "dpkts": 27, + "state": "Established", + "allbytes": 42764, + "spkts": 37, "sbytes": 25517, - "dbytes": 17247, "appproto": "ssl", - "label": "Malicious", + "label": "Malware", "module_labels": { - "flowalerts-long-connection": "Malicious" + "flowalerts-long-connection": "Malware" }, } ) flows.append( { - "starttime": 1382355032.706468, + "ts": 1382355032.706468, "dur": "10.896695", "saddr": "147.32.83.52", "sport": "47956", "daddr": "80.242.138.72", "dport": "80", "proto": "tcp", - "state": "SF", + "state": "Established", + "allbytes": 67696, "spkts": 1, - "dpkts": 0, "sbytes": 100, - "dbytes": 67596, "appproto": "http", - "label": "Benign", + "label": "Normal", "module_labels": { - "flowalerts-long-connection": "Benign" + "flowalerts-long-connection": "Normal" }, } ) @@ -318,8 +436,6 @@ def process_flow(self, flow_to_process: dict): # Convert the flow to a pandas dataframe raw_flow = pd.DataFrame(flow_to_process, index=[0]) dflow = self.process_features(raw_flow) - if dflow.empty: - return None # Update the flow to the processed version return dflow except Exception: @@ -333,6 +449,7 @@ def detect(self, x_flow) -> Optional[numpy.ndarray]: and returns the predection array """ try: + given_x_flow = x_flow # clean the flow fields_to_drop = [ "label", @@ -340,28 +457,12 @@ def detect(self, x_flow) -> Optional[numpy.ndarray]: "uid", "history", "dir_", + "dbytes", + "dpkts", "endtime", + "bytes", "flow_source", - "ground_truth_label", # todo now we can use them - "detailed_ground_truth_label", ] - # For argus binetflows this fails because ther is a field calle bytes that was not in other flows. It should be called allbytes. - # Error - ''' [Flow ML Detection] Error in detect() while processing - dur proto sport dport state pkts spkts dpkts bytes sbytes dbytes allbytes - 0 63.822830 0 56119 981 0.0 15 15 0 8764 1887 0 1887 - The feature names should match those that were passed during fit. - Feature names unseen at fit time: - - bytes - ''' - - # IF we delete here the filed bytes the error is - # [Flow ML Detection] Error in detect() while processing - # dur proto sport dport state pkts spkts dpkts sbytes dbytes allbytes - # 0 63.822830 0 56120 980 0.0 15 15 0 1887 0 1887 - # The feature names should match those that were passed during fit. - # Feature names must be in the same order as they were in fit. - for field in fields_to_drop: try: x_flow = x_flow.drop(field, axis=1) @@ -373,7 +474,7 @@ def detect(self, x_flow) -> Optional[numpy.ndarray]: return pred except Exception as e: self.print( - f"Error in detect() while processing " f"\n{x_flow}\n{e}" + f"Error in detect() while processing " f"\n{given_x_flow}\n{e}" ) self.print(traceback.format_exc(), 0, 1) @@ -465,16 +566,18 @@ def pre_main(self): def main(self): if msg := self.get_msg("new_flow"): - # When a new flow arrives msg = json.loads(msg["data"]) - self.twid = msg["twid"] - self.profileid = msg["profileid"] + twid = msg["twid"] self.flow = msg["flow"] - # These following extra fields are expected in testing. update the original + # these fields are expected in testing. 
update the original # flow dict to have them self.flow.update( { + "allbytes": (self.flow["sbytes"] + self.flow["dbytes"]), + # the flow["state"] is the origstate, we dont need that here + # we need the interpreted state "state": msg["interpreted_state"], + "pkts": self.flow["spkts"] + self.flow["dpkts"], "label": msg["label"], "module_labels": msg["module_labels"], } @@ -487,31 +590,23 @@ def main(self): # Use labeled flows labels = self.db.get_labels() sum_labeled_flows = sum(i[1] for i in labels) - - # The min labels to retrain is the min number of flows - # we should have seen so far in this capture to start training - # This is so we dont _start_ training with only 1 flow - - # Once we are over the start minimum, the second condition is - # to force to retrain every a minimum_labels_to_retrain number - # of flows. So we dont retrain every 1 flow. if ( - sum_labeled_flows >= self.minimum_labels_to_start_train + sum_labeled_flows >= self.minimum_lables_to_retrain + and sum_labeled_flows % self.minimum_lables_to_retrain == 1 ): - if (sum_labeled_flows - self.last_number_of_flows_when_trained >= self.minimum_labels_to_retrain): - # So for example we retrain every 50 labels and only when - # we have at least 50 labels - self.print( - f"Training the model with the last group of " - f"flows and labels. Total flows: {sum_labeled_flows}." - ) - # Process all flows in the DB and make them ready - # for pandas - self.process_training_flows() - # Train an algorithm - self.train() - self.last_number_of_flows_when_trained = sum_labeled_flows - + # We get here every 'self.minimum_lables_to_retrain' + # amount of labels + # So for example we retrain every 100 labels and only when + # we have at least 100 labels + self.print( + f"Training the model with the last group of " + f"flows and labels. Total flows: {sum_labeled_flows}." + ) + # Process all flows in the DB and make them ready + # for pandas + self.process_flows() + # Train an algorithm + self.train() elif self.mode == "test": # We are testing, which means using the model to detect processed_flow = self.process_flow(self.flow) @@ -531,8 +626,8 @@ def main(self): # and the label is diff from the prediction, # print in debug mode self.print( - f"Predicted {pred[0]} for ground-truth label" - f' {label}. Flow {self.flow["saddr"]}:' + f"Report Prediction {pred[0]} for label" + f' {label} flow {self.flow["saddr"]}:' f'{self.flow["sport"]} ->' f' {self.flow["daddr"]}:' f'{self.flow["dport"]}/' @@ -540,9 +635,9 @@ def main(self): 0, 3, ) - if pred[0] == "Malicious": + if pred[0] == "Malware": # Generate an alert - self.set_evidence_malicious_flow(self.flow, self.twid) + self.set_evidence_malicious_flow(self.flow, twid) self.print( f"Prediction {pred[0]} for label {label}" f' flow {self.flow["saddr"]}:' From 6f548d14a61187b042083d8233a9d68f4dc9e525 Mon Sep 17 00:00:00 2001 From: alya Date: Tue, 30 Jul 2024 14:35:21 +0300 Subject: [PATCH 158/498] state_handler: split get_final_state_from_flags() into smaller functions --- slips_files/common/state_handler.py | 179 ++++++++++++++++++++++++++++ 1 file changed, 179 insertions(+) create mode 100644 slips_files/common/state_handler.py diff --git a/slips_files/common/state_handler.py b/slips_files/common/state_handler.py new file mode 100644 index 000000000..b671a09a2 --- /dev/null +++ b/slips_files/common/state_handler.py @@ -0,0 +1,179 @@ +from typing import Optional +import sys +import traceback + + +def check_suricata_states(state) -> Optional[str]: + """ + There are different states in which a flow can be. 
+ Suricata distinguishes three flow-states for TCP and two for + UDP. For TCP, + these are: New, Established and Closed,for UDP only new and + established. + For each of these states Suricata can employ different timeouts. + """ + if "new" in state or "established" in state: + return "Established" + elif "closed" in state: + return "Not Established" + + +def check_zeek_states(state) -> Optional[str]: + # We have varius type of states depending on the type of flow. + # For Zeek + if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): + return "Not Established" + elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): + return "Established" + + +def check_argus_states(state) -> Optional[str]: + pre = state.split("_")[0] + suf = state.split("_")[1] + if "S" in pre and "A" in pre and "S" in suf and "A" in suf: + """ + Examples: + SA_SA + SR_SA + FSRA_SA + SPA_SPA + SRA_SPA + FSA_FSA + FSA_FSPA + SAEC_SPA + SRPA_SPA + FSPA_SPA + FSRPA_SPA + FSPA_FSPA + FSRA_FSPA + SRAEC_SPA + FSPA_FSRPA + FSAEC_FSPA + FSRPA_FSPA + SRPAEC_SPA + FSPAEC_FSPA + SRPAEC_FSRPA + """ + return "Established" + elif "PA" in pre and "PA" in suf: + # Tipical flow that was reported in the middle + """ + Examples: + PA_PA + FPA_FPA + """ + return "Established" + elif "ECO" in pre: + return "ICMP Echo" + elif "ECR" in pre: + return "ICMP Reply" + elif "URH" in pre: + return "ICMP Host Unreachable" + elif "URP" in pre: + return "ICMP Port Unreachable" + else: + """ + Examples: + S_RA + S_R + A_R + S_SA + SR_SA + FA_FA + SR_RA + SEC_RA + """ + return "Not Established" + + +def check_tcp_states(state, pkts) -> Optional[str]: + pre = state.split("_")[0] + if "EST" in pre: + # TCP + return "Established" + elif "RST" in pre: + # TCP. When -z B is not used in argus, states are single words. + # Most connections are reseted when finished and therefore are + # established + # It can happen that is reseted being not established, but we + # can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is + # not established because the OS retries 3 times. + return "Not Established" if int(pkts) <= 3 else "Established" + elif "FIN" in pre: + # TCP. When -z B is not used in argus, states are single words. + # Most connections are finished with FIN when finished and + # therefore are established + # It can happen that is finished being not established, but we + # can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is + # not established because the OS retries 3 times. + return "Not Established" if int(pkts) <= 3 else "Established" + else: + """ + Examples: + S_ + FA_ + PA_ + FSA_ + SEC_ + SRPA_ + """ + return "Not Established" + + +def check_udp_states(state) -> Optional[str]: + pre = state.split("_")[0] + if "CON" in pre: + # UDP + return "Established" + elif "INT" in pre: + # UDP trying to connect, NOT preciselly not established but also + # NOT 'Established'. So we considered not established because there + # is no confirmation of what happened. + return "Not Established" + + +def check_icmp_states(state) -> Optional[str]: + pre = state.split("_")[0] + if "ECO" in pre: + # ICMP + return "Established" + elif "UNK" in pre: + # ICMP6 unknown upper layer + return "Established" + + +def get_final_state_from_flags(self, state, pkts) -> str: + """ + Analyze the flags given and return a summary of the state. 
+ Should work with Argus and Bro flags + We receive the pakets to distinguish some Reset connections + """ + try: + if state := check_suricata_states(state): + return state + if state := check_zeek_states(state): + return state + if state := check_argus_states(state): + return state + except IndexError: + # suf does not exist, which means that this is some ICMP or + # no response was sent for UDP or TCP + if state := check_icmp_states(state): + return state + if state := check_udp_states(state): + return state + if state := check_tcp_states(state, pkts): + return state + + return "Not Established" + + except Exception: + exception_line = sys.exc_info()[2].tb_lineno + self.print( + f"Error in get_final_state_from_flags() " f"line {exception_line}", + 0, + 1, + ) + self.print(traceback.format_exc(), 0, 1) From 63dc0bd420f3ad6a4390d17b5ee9ce34de8774f5 Mon Sep 17 00:00:00 2001 From: alya Date: Tue, 30 Jul 2024 14:48:16 +0300 Subject: [PATCH 159/498] state_handler: refactor get_final_state_from_flags() --- slips_files/common/state_handler.py | 67 +++++++++++++---------------- 1 file changed, 29 insertions(+), 38 deletions(-) diff --git a/slips_files/common/state_handler.py b/slips_files/common/state_handler.py index b671a09a2..d0a05115b 100644 --- a/slips_files/common/state_handler.py +++ b/slips_files/common/state_handler.py @@ -1,9 +1,7 @@ from typing import Optional -import sys -import traceback -def check_suricata_states(state) -> Optional[str]: +def interpret_suricata_states(state) -> Optional[str]: """ There are different states in which a flow can be. Suricata distinguishes three flow-states for TCP and two for @@ -18,7 +16,7 @@ def check_suricata_states(state) -> Optional[str]: return "Not Established" -def check_zeek_states(state) -> Optional[str]: +def interpret_zeek_states(state) -> Optional[str]: # We have varius type of states depending on the type of flow. # For Zeek if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): @@ -27,9 +25,13 @@ def check_zeek_states(state) -> Optional[str]: return "Established" -def check_argus_states(state) -> Optional[str]: +def interpret_argus_states(state) -> Optional[str]: pre = state.split("_")[0] - suf = state.split("_")[1] + try: + suf = state.split("_")[1] + except IndexError: + return + if "S" in pre and "A" in pre and "S" in suf and "A" in suf: """ Examples: @@ -86,7 +88,7 @@ def check_argus_states(state) -> Optional[str]: return "Not Established" -def check_tcp_states(state, pkts) -> Optional[str]: +def interpret_tcp_states(state, pkts) -> Optional[str]: pre = state.split("_")[0] if "EST" in pre: # TCP @@ -122,7 +124,7 @@ def check_tcp_states(state, pkts) -> Optional[str]: return "Not Established" -def check_udp_states(state) -> Optional[str]: +def interpret_udp_states(state) -> Optional[str]: pre = state.split("_")[0] if "CON" in pre: # UDP @@ -134,7 +136,7 @@ def check_udp_states(state) -> Optional[str]: return "Not Established" -def check_icmp_states(state) -> Optional[str]: +def interpret_icmp_states(state) -> Optional[str]: pre = state.split("_")[0] if "ECO" in pre: # ICMP @@ -144,36 +146,25 @@ def check_icmp_states(state) -> Optional[str]: return "Established" -def get_final_state_from_flags(self, state, pkts) -> str: +def get_final_state_from_flags(state, pkts) -> str: """ - Analyze the flags given and return a summary of the state. 
- Should work with Argus and Bro flags - We receive the pakets to distinguish some Reset connections + Converts the original flags from the flow, to a state that slips + understands + Works with Argus, suricata, and Bro flags + We receive the packets to distinguish some Reset connections """ - try: - if state := check_suricata_states(state): - return state - if state := check_zeek_states(state): - return state - if state := check_argus_states(state): - return state - except IndexError: - # suf does not exist, which means that this is some ICMP or - # no response was sent for UDP or TCP - if state := check_icmp_states(state): - return state - if state := check_udp_states(state): - return state - if state := check_tcp_states(state, pkts): - return state - return "Not Established" + for interpreter in ( + interpret_suricata_states, + interpret_zeek_states, + interpret_argus_states, + interpret_icmp_states, + interpret_udp_states, + ): + if interpreted_state := interpreter(state): + return interpreted_state + + if interpreted_state := interpret_tcp_states(state, pkts): + return interpreted_state - except Exception: - exception_line = sys.exc_info()[2].tb_lineno - self.print( - f"Error in get_final_state_from_flags() " f"line {exception_line}", - 0, - 1, - ) - self.print(traceback.format_exc(), 0, 1) + return "Not Established" From 606fc6713ea8a9973d59696e813c708c2cdd64d6 Mon Sep 17 00:00:00 2001 From: alya Date: Tue, 30 Jul 2024 14:59:22 +0300 Subject: [PATCH 160/498] delete get_final_state_from_flags() from flowmldetection, profiler, and the db --- modules/flowmldetection/flowmldetection.py | 150 ++---------------- slips_files/core/database/database_manager.py | 3 - 2 files changed, 10 insertions(+), 143 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 8917fef6a..fb17b57f2 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -121,141 +121,6 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) - - def get_final_state_from_flags(self, state, pkts): - """ - Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags - We receive the pakets to distinguish some Reset connections - """ - try: - pre = state.split("_")[0] - try: - # Try suricata states - """ - There are different states in which a flow can be. - Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, - these are: New, Established and Closed,for UDP only new and established. - For each of these states Suricata can employ different timeouts. - """ - if "new" in state or "established" in state: - return "Established" - elif "closed" in state: - return "Not Established" - - # We have varius type of states depending on the type of flow. 
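For reference, the refactor in PATCH 159 above reduces the nested conditionals to a first-match dispatch: each interpreter returns a verdict string or None, and the walrus operator keeps the first non-None answer. A minimal, runnable sketch of that pattern with stand-in interpreter bodies (illustrative only, not Slips code; requires Python >= 3.8):

from typing import Callable, Optional, Tuple

def suricata_verdict(state: str) -> Optional[str]:
    # Stand-in: only knows the textual Suricata states.
    if "established" in state or "new" in state:
        return "Established"
    if "closed" in state:
        return "Not Established"
    return None

def zeek_verdict(state: str) -> Optional[str]:
    # Stand-in: a tiny subset of the Zeek conn_state table.
    return {"S0": "Not Established", "SF": "Established"}.get(state)

INTERPRETERS: Tuple[Callable[[str], Optional[str]], ...] = (
    suricata_verdict,
    zeek_verdict,
)

def classify(state: str) -> str:
    for interpreter in INTERPRETERS:
        if verdict := interpreter(state):  # first non-None answer wins
            return verdict
    return "Not Established"  # conservative default, as in the patch

assert classify("established") == "Established"
assert classify("S0") == "Not Established"
assert classify("???") == "Not Established"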
- # For Zeek - if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): - return "Not Established" - elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): - return "Established" - - # For Argus - suf = state.split("_")[1] - if "S" in pre and "A" in pre and "S" in suf and "A" in suf: - """ - Examples: - SA_SA - SR_SA - FSRA_SA - SPA_SPA - SRA_SPA - FSA_FSA - FSA_FSPA - SAEC_SPA - SRPA_SPA - FSPA_SPA - FSRPA_SPA - FSPA_FSPA - FSRA_FSPA - SRAEC_SPA - FSPA_FSRPA - FSAEC_FSPA - FSRPA_FSPA - SRPAEC_SPA - FSPAEC_FSPA - SRPAEC_FSRPA - """ - return "Established" - elif "PA" in pre and "PA" in suf: - # Tipical flow that was reported in the middle - """ - Examples: - PA_PA - FPA_FPA - """ - return "Established" - elif "ECO" in pre: - return "ICMP Echo" - elif "ECR" in pre: - return "ICMP Reply" - elif "URH" in pre: - return "ICMP Host Unreachable" - elif "URP" in pre: - return "ICMP Port Unreachable" - else: - """ - Examples: - S_RA - S_R - A_R - S_SA - SR_SA - FA_FA - SR_RA - SEC_RA - """ - return "Not Established" - except IndexError: - # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP - if "ECO" in pre: - # ICMP - return "Established" - elif "UNK" in pre: - # ICMP6 unknown upper layer - return "Established" - elif "CON" in pre: - # UDP - return "Established" - elif "INT" in pre: - # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there - # is no confirmation of what happened. - return "Not Established" - elif "EST" in pre: - # TCP - return "Established" - elif "RST" in pre: - # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established - # It can happen that is reseted being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. - return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - elif "FIN" in pre: - # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established - # It can happen that is finished being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. 
- return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - else: - """ - Examples: - S_ - FA_ - PA_ - FSA_ - SEC_ - SRPA_ - """ - return "Not Established" - except Exception: - exception_line = sys.exc_info()[2].tb_lineno - self.print( - f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", - 0, - 1, - ) - self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ @@ -291,12 +156,17 @@ def process_features(self, dataset): except (ValueError, KeyError): pass - # When flows are read from Slips sqlite, the state is not transformed to 'Established' or 'Not Established', it is still 'S0' and others + # When flows are read from Slips sqlite, + # the state is not transformed to 'Established' or + # 'Not Established', it is still 'S0' and others # So transform here - #new_state_column = dataset.state.apply(self.get_final_state_from_flags(dataset.state, dataset.pkts)) - dataset['state'] = dataset.apply(lambda row: self.get_final_state_from_flags(row['state'], row['pkts']), axis=1) - - #dataset.state = new_state_column + dataset["state"] = dataset.apply( + lambda row: get_final_state_from_flags( + row["state"], row["pkts"] + ), + axis=1, + ) + # dataset.state = new_state_column # Convert state to categorical dataset.state = dataset.state.str.replace( diff --git a/slips_files/core/database/database_manager.py b/slips_files/core/database/database_manager.py index e8ca3aaf6..b4b2128d3 100644 --- a/slips_files/core/database/database_manager.py +++ b/slips_files/core/database/database_manager.py @@ -613,9 +613,6 @@ def add_out_dns(self, *args, **kwargs): def add_port(self, *args, **kwargs): return self.rdb.add_port(*args, **kwargs) - def get_final_state_from_flags(self, *args, **kwargs): - return self.rdb.get_final_state_from_flags(*args, **kwargs) - def add_ips(self, *args, **kwargs): return self.rdb.add_ips(*args, **kwargs) From 3a13d07707eb85b773bcc61abd93d4d8294dc846 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Mon, 29 Jul 2024 16:32:14 +0200 Subject: [PATCH 161/498] mlflow. Add a function to convert the state again --- modules/flowmldetection/flowmldetection.py | 135 +++++++++++++++++++++ 1 file changed, 135 insertions(+) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index fb17b57f2..12c3589ed 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -121,6 +121,141 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) + + def get_final_state_from_flags(self, state, pkts): + """ + Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags + We receive the pakets to distinguish some Reset connections + """ + try: + pre = state.split("_")[0] + try: + # Try suricata states + """ + There are different states in which a flow can be. + Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, + these are: New, Established and Closed,for UDP only new and established. + For each of these states Suricata can employ different timeouts. + """ + if "new" in state or "established" in state: + return "Established" + elif "closed" in state: + return "Not Established" + + # We have varius type of states depending on the type of flow. 
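The process_features() hunk above applies the state helper row by row and then encodes the labels numerically. A toy frame makes the whole pipeline visible; to_final_state is a hypothetical stand-in for the real helper, and pandas is assumed to be installed:

import pandas as pd

def to_final_state(state: str, pkts: int) -> str:
    # Stand-in for get_final_state_from_flags().
    return "Not Established" if state == "S0" or int(pkts) <= 3 else "Established"

dataset = pd.DataFrame({"state": ["S0", "SF", "FSPA_FSPA"], "pkts": [1, 9, 12]})
dataset["state"] = dataset.apply(
    lambda row: to_final_state(row["state"], row["pkts"]), axis=1
)
# Order matters: "Not Established" must be rewritten before "Established",
# because the second pattern also matches inside the first label.
dataset.state = dataset.state.str.replace(r"(^.*Not Established.*$)", "0", regex=True)
dataset.state = dataset.state.str.replace(r"(^.*Established.*$)", "1", regex=True)
dataset.state = dataset.state.astype("float64")
print(dataset.state.tolist())  # [0.0, 1.0, 1.0]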
+ # For Zeek + if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): + return "Not Established" + elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): + return "Established" + + # For Argus + suf = state.split("_")[1] + if "S" in pre and "A" in pre and "S" in suf and "A" in suf: + """ + Examples: + SA_SA + SR_SA + FSRA_SA + SPA_SPA + SRA_SPA + FSA_FSA + FSA_FSPA + SAEC_SPA + SRPA_SPA + FSPA_SPA + FSRPA_SPA + FSPA_FSPA + FSRA_FSPA + SRAEC_SPA + FSPA_FSRPA + FSAEC_FSPA + FSRPA_FSPA + SRPAEC_SPA + FSPAEC_FSPA + SRPAEC_FSRPA + """ + return "Established" + elif "PA" in pre and "PA" in suf: + # Tipical flow that was reported in the middle + """ + Examples: + PA_PA + FPA_FPA + """ + return "Established" + elif "ECO" in pre: + return "ICMP Echo" + elif "ECR" in pre: + return "ICMP Reply" + elif "URH" in pre: + return "ICMP Host Unreachable" + elif "URP" in pre: + return "ICMP Port Unreachable" + else: + """ + Examples: + S_RA + S_R + A_R + S_SA + SR_SA + FA_FA + SR_RA + SEC_RA + """ + return "Not Established" + except IndexError: + # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP + if "ECO" in pre: + # ICMP + return "Established" + elif "UNK" in pre: + # ICMP6 unknown upper layer + return "Established" + elif "CON" in pre: + # UDP + return "Established" + elif "INT" in pre: + # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there + # is no confirmation of what happened. + return "Not Established" + elif "EST" in pre: + # TCP + return "Established" + elif "RST" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established + # It can happen that is reseted being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. + return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + elif "FIN" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established + # It can happen that is finished being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. 
+ return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + else: + """ + Examples: + S_ + FA_ + PA_ + FSA_ + SEC_ + SRPA_ + """ + return "Not Established" + except Exception: + exception_line = sys.exc_info()[2].tb_lineno + self.print( + f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", + 0, + 1, + ) + self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ From b09101cf20d63c822cd82269e40dd9edb17ee624 Mon Sep 17 00:00:00 2001 From: alya Date: Tue, 30 Jul 2024 14:59:22 +0300 Subject: [PATCH 162/498] delete get_final_state_from_flags() from flowmldetection, profiler, and the db --- modules/flowmldetection/flowmldetection.py | 135 --------------------- 1 file changed, 135 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 12c3589ed..fb17b57f2 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -121,141 +121,6 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) - - def get_final_state_from_flags(self, state, pkts): - """ - Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags - We receive the pakets to distinguish some Reset connections - """ - try: - pre = state.split("_")[0] - try: - # Try suricata states - """ - There are different states in which a flow can be. - Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, - these are: New, Established and Closed,for UDP only new and established. - For each of these states Suricata can employ different timeouts. - """ - if "new" in state or "established" in state: - return "Established" - elif "closed" in state: - return "Not Established" - - # We have varius type of states depending on the type of flow. - # For Zeek - if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): - return "Not Established" - elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): - return "Established" - - # For Argus - suf = state.split("_")[1] - if "S" in pre and "A" in pre and "S" in suf and "A" in suf: - """ - Examples: - SA_SA - SR_SA - FSRA_SA - SPA_SPA - SRA_SPA - FSA_FSA - FSA_FSPA - SAEC_SPA - SRPA_SPA - FSPA_SPA - FSRPA_SPA - FSPA_FSPA - FSRA_FSPA - SRAEC_SPA - FSPA_FSRPA - FSAEC_FSPA - FSRPA_FSPA - SRPAEC_SPA - FSPAEC_FSPA - SRPAEC_FSRPA - """ - return "Established" - elif "PA" in pre and "PA" in suf: - # Tipical flow that was reported in the middle - """ - Examples: - PA_PA - FPA_FPA - """ - return "Established" - elif "ECO" in pre: - return "ICMP Echo" - elif "ECR" in pre: - return "ICMP Reply" - elif "URH" in pre: - return "ICMP Host Unreachable" - elif "URP" in pre: - return "ICMP Port Unreachable" - else: - """ - Examples: - S_RA - S_R - A_R - S_SA - SR_SA - FA_FA - SR_RA - SEC_RA - """ - return "Not Established" - except IndexError: - # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP - if "ECO" in pre: - # ICMP - return "Established" - elif "UNK" in pre: - # ICMP6 unknown upper layer - return "Established" - elif "CON" in pre: - # UDP - return "Established" - elif "INT" in pre: - # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there - # is no confirmation of what happened. - return "Not Established" - elif "EST" in pre: - # TCP - return "Established" - elif "RST" in pre: - # TCP. 
When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established - # It can happen that is reseted being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. - return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - elif "FIN" in pre: - # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established - # It can happen that is finished being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. - return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - else: - """ - Examples: - S_ - FA_ - PA_ - FSA_ - SEC_ - SRPA_ - """ - return "Not Established" - except Exception: - exception_line = sys.exc_info()[2].tb_lineno - self.print( - f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", - 0, - 1, - ) - self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ From 7c50d01107a6bac1ad4e22e67d9a56c9e75af2ca Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Mon, 29 Jul 2024 16:32:14 +0200 Subject: [PATCH 163/498] mlflow. Add a function to convert the state again --- modules/flowmldetection/flowmldetection.py | 135 +++++++++++++++++++++ 1 file changed, 135 insertions(+) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index fb17b57f2..12c3589ed 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -121,6 +121,141 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) + + def get_final_state_from_flags(self, state, pkts): + """ + Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags + We receive the pakets to distinguish some Reset connections + """ + try: + pre = state.split("_")[0] + try: + # Try suricata states + """ + There are different states in which a flow can be. + Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, + these are: New, Established and Closed,for UDP only new and established. + For each of these states Suricata can employ different timeouts. + """ + if "new" in state or "established" in state: + return "Established" + elif "closed" in state: + return "Not Established" + + # We have varius type of states depending on the type of flow. 
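The packet-count heuristic shown above is worth pinning down: without argus -z B the state collapses to a single word such as RST or FIN, so the code falls back on flow size. A TCP client that never gets an answer sends the initial SYN plus roughly three retries, hence <=3 packets is read as a handshake that never completed. A worked sketch of the fallback (not the Slips implementation):

def tcp_word_state_verdict(state: str, pkts: int) -> str:
    if state in ("RST", "FIN"):
        # <=3 packets: likely just SYN retransmissions, never established.
        return "Not Established" if int(pkts) <= 3 else "Established"
    return "Established" if state == "EST" else "Not Established"

assert tcp_word_state_verdict("RST", 2) == "Not Established"   # reset during handshake
assert tcp_word_state_verdict("RST", 40) == "Established"      # reset at teardown
assert tcp_word_state_verdict("FIN", 3) == "Not Established"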
+ # For Zeek + if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): + return "Not Established" + elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): + return "Established" + + # For Argus + suf = state.split("_")[1] + if "S" in pre and "A" in pre and "S" in suf and "A" in suf: + """ + Examples: + SA_SA + SR_SA + FSRA_SA + SPA_SPA + SRA_SPA + FSA_FSA + FSA_FSPA + SAEC_SPA + SRPA_SPA + FSPA_SPA + FSRPA_SPA + FSPA_FSPA + FSRA_FSPA + SRAEC_SPA + FSPA_FSRPA + FSAEC_FSPA + FSRPA_FSPA + SRPAEC_SPA + FSPAEC_FSPA + SRPAEC_FSRPA + """ + return "Established" + elif "PA" in pre and "PA" in suf: + # Tipical flow that was reported in the middle + """ + Examples: + PA_PA + FPA_FPA + """ + return "Established" + elif "ECO" in pre: + return "ICMP Echo" + elif "ECR" in pre: + return "ICMP Reply" + elif "URH" in pre: + return "ICMP Host Unreachable" + elif "URP" in pre: + return "ICMP Port Unreachable" + else: + """ + Examples: + S_RA + S_R + A_R + S_SA + SR_SA + FA_FA + SR_RA + SEC_RA + """ + return "Not Established" + except IndexError: + # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP + if "ECO" in pre: + # ICMP + return "Established" + elif "UNK" in pre: + # ICMP6 unknown upper layer + return "Established" + elif "CON" in pre: + # UDP + return "Established" + elif "INT" in pre: + # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there + # is no confirmation of what happened. + return "Not Established" + elif "EST" in pre: + # TCP + return "Established" + elif "RST" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established + # It can happen that is reseted being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. + return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + elif "FIN" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established + # It can happen that is finished being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. 
+ return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + else: + """ + Examples: + S_ + FA_ + PA_ + FSA_ + SEC_ + SRPA_ + """ + return "Not Established" + except Exception: + exception_line = sys.exc_info()[2].tb_lineno + self.print( + f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", + 0, + 1, + ) + self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ From e7c5d824bac46fd7d95499f020a2183e981efdb1 Mon Sep 17 00:00:00 2001 From: alya Date: Tue, 30 Jul 2024 14:59:22 +0300 Subject: [PATCH 164/498] delete get_final_state_from_flags() from flowmldetection, profiler, and the db --- modules/flowmldetection/flowmldetection.py | 135 --------------------- 1 file changed, 135 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 12c3589ed..fb17b57f2 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -121,141 +121,6 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) - - def get_final_state_from_flags(self, state, pkts): - """ - Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags - We receive the pakets to distinguish some Reset connections - """ - try: - pre = state.split("_")[0] - try: - # Try suricata states - """ - There are different states in which a flow can be. - Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, - these are: New, Established and Closed,for UDP only new and established. - For each of these states Suricata can employ different timeouts. - """ - if "new" in state or "established" in state: - return "Established" - elif "closed" in state: - return "Not Established" - - # We have varius type of states depending on the type of flow. - # For Zeek - if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): - return "Not Established" - elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): - return "Established" - - # For Argus - suf = state.split("_")[1] - if "S" in pre and "A" in pre and "S" in suf and "A" in suf: - """ - Examples: - SA_SA - SR_SA - FSRA_SA - SPA_SPA - SRA_SPA - FSA_FSA - FSA_FSPA - SAEC_SPA - SRPA_SPA - FSPA_SPA - FSRPA_SPA - FSPA_FSPA - FSRA_FSPA - SRAEC_SPA - FSPA_FSRPA - FSAEC_FSPA - FSRPA_FSPA - SRPAEC_SPA - FSPAEC_FSPA - SRPAEC_FSRPA - """ - return "Established" - elif "PA" in pre and "PA" in suf: - # Tipical flow that was reported in the middle - """ - Examples: - PA_PA - FPA_FPA - """ - return "Established" - elif "ECO" in pre: - return "ICMP Echo" - elif "ECR" in pre: - return "ICMP Reply" - elif "URH" in pre: - return "ICMP Host Unreachable" - elif "URP" in pre: - return "ICMP Port Unreachable" - else: - """ - Examples: - S_RA - S_R - A_R - S_SA - SR_SA - FA_FA - SR_RA - SEC_RA - """ - return "Not Established" - except IndexError: - # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP - if "ECO" in pre: - # ICMP - return "Established" - elif "UNK" in pre: - # ICMP6 unknown upper layer - return "Established" - elif "CON" in pre: - # UDP - return "Established" - elif "INT" in pre: - # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there - # is no confirmation of what happened. - return "Not Established" - elif "EST" in pre: - # TCP - return "Established" - elif "RST" in pre: - # TCP. 
When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established - # It can happen that is reseted being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. - return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - elif "FIN" in pre: - # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established - # It can happen that is finished being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. - return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - else: - """ - Examples: - S_ - FA_ - PA_ - FSA_ - SEC_ - SRPA_ - """ - return "Not Established" - except Exception: - exception_line = sys.exc_info()[2].tb_lineno - self.print( - f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", - 0, - 1, - ) - self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ From b350dcea25090b195f6befbf434f2b4506350b2e Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Mon, 29 Jul 2024 16:32:14 +0200 Subject: [PATCH 165/498] mlflow. Add a function to convert the state again --- modules/flowmldetection/flowmldetection.py | 135 +++++++++++++++++++++ 1 file changed, 135 insertions(+) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index fb17b57f2..12c3589ed 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -121,6 +121,141 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) + + def get_final_state_from_flags(self, state, pkts): + """ + Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags + We receive the pakets to distinguish some Reset connections + """ + try: + pre = state.split("_")[0] + try: + # Try suricata states + """ + There are different states in which a flow can be. + Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, + these are: New, Established and Closed,for UDP only new and established. + For each of these states Suricata can employ different timeouts. + """ + if "new" in state or "established" in state: + return "Established" + elif "closed" in state: + return "Not Established" + + # We have varius type of states depending on the type of flow. 
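For the Argus two-sided states listed in the Examples blocks above, the split on "_" yields the client-side and server-side flag sets; seeing SYN and ACK in both halves is what the code treats as a completed three-way handshake. A condensed sketch of that check (using partition() here, an assumption of this sketch, which avoids the IndexError that split("_")[1] raises on one-sided states):

def argus_handshake_completed(state: str) -> bool:
    # "FSPA_FSPA" -> pre="FSPA", suf="FSPA"
    pre, _, suf = state.partition("_")
    return all(flag in pre for flag in "SA") and all(flag in suf for flag in "SA")

assert argus_handshake_completed("FSPA_FSPA")   # SYN+ACK seen both ways
assert argus_handshake_completed("SRPA_SPA")
assert not argus_handshake_completed("S_RA")    # SYN answered by RST+ACK only
assert not argus_handshake_completed("CON")     # single-word state, no suffix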
+ # For Zeek + if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): + return "Not Established" + elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): + return "Established" + + # For Argus + suf = state.split("_")[1] + if "S" in pre and "A" in pre and "S" in suf and "A" in suf: + """ + Examples: + SA_SA + SR_SA + FSRA_SA + SPA_SPA + SRA_SPA + FSA_FSA + FSA_FSPA + SAEC_SPA + SRPA_SPA + FSPA_SPA + FSRPA_SPA + FSPA_FSPA + FSRA_FSPA + SRAEC_SPA + FSPA_FSRPA + FSAEC_FSPA + FSRPA_FSPA + SRPAEC_SPA + FSPAEC_FSPA + SRPAEC_FSRPA + """ + return "Established" + elif "PA" in pre and "PA" in suf: + # Tipical flow that was reported in the middle + """ + Examples: + PA_PA + FPA_FPA + """ + return "Established" + elif "ECO" in pre: + return "ICMP Echo" + elif "ECR" in pre: + return "ICMP Reply" + elif "URH" in pre: + return "ICMP Host Unreachable" + elif "URP" in pre: + return "ICMP Port Unreachable" + else: + """ + Examples: + S_RA + S_R + A_R + S_SA + SR_SA + FA_FA + SR_RA + SEC_RA + """ + return "Not Established" + except IndexError: + # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP + if "ECO" in pre: + # ICMP + return "Established" + elif "UNK" in pre: + # ICMP6 unknown upper layer + return "Established" + elif "CON" in pre: + # UDP + return "Established" + elif "INT" in pre: + # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there + # is no confirmation of what happened. + return "Not Established" + elif "EST" in pre: + # TCP + return "Established" + elif "RST" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established + # It can happen that is reseted being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. + return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + elif "FIN" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established + # It can happen that is finished being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. 
+ return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + else: + """ + Examples: + S_ + FA_ + PA_ + FSA_ + SEC_ + SRPA_ + """ + return "Not Established" + except Exception: + exception_line = sys.exc_info()[2].tb_lineno + self.print( + f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", + 0, + 1, + ) + self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ From 49ddfddfd34f0754927332be8e7b61cfa23553f3 Mon Sep 17 00:00:00 2001 From: alya Date: Tue, 30 Jul 2024 14:59:22 +0300 Subject: [PATCH 166/498] delete get_final_state_from_flags() from flowmldetection, profiler, and the db --- modules/flowmldetection/flowmldetection.py | 135 --------------------- 1 file changed, 135 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 12c3589ed..fb17b57f2 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -121,141 +121,6 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) - - def get_final_state_from_flags(self, state, pkts): - """ - Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags - We receive the pakets to distinguish some Reset connections - """ - try: - pre = state.split("_")[0] - try: - # Try suricata states - """ - There are different states in which a flow can be. - Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, - these are: New, Established and Closed,for UDP only new and established. - For each of these states Suricata can employ different timeouts. - """ - if "new" in state or "established" in state: - return "Established" - elif "closed" in state: - return "Not Established" - - # We have varius type of states depending on the type of flow. - # For Zeek - if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): - return "Not Established" - elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): - return "Established" - - # For Argus - suf = state.split("_")[1] - if "S" in pre and "A" in pre and "S" in suf and "A" in suf: - """ - Examples: - SA_SA - SR_SA - FSRA_SA - SPA_SPA - SRA_SPA - FSA_FSA - FSA_FSPA - SAEC_SPA - SRPA_SPA - FSPA_SPA - FSRPA_SPA - FSPA_FSPA - FSRA_FSPA - SRAEC_SPA - FSPA_FSRPA - FSAEC_FSPA - FSRPA_FSPA - SRPAEC_SPA - FSPAEC_FSPA - SRPAEC_FSRPA - """ - return "Established" - elif "PA" in pre and "PA" in suf: - # Tipical flow that was reported in the middle - """ - Examples: - PA_PA - FPA_FPA - """ - return "Established" - elif "ECO" in pre: - return "ICMP Echo" - elif "ECR" in pre: - return "ICMP Reply" - elif "URH" in pre: - return "ICMP Host Unreachable" - elif "URP" in pre: - return "ICMP Port Unreachable" - else: - """ - Examples: - S_RA - S_R - A_R - S_SA - SR_SA - FA_FA - SR_RA - SEC_RA - """ - return "Not Established" - except IndexError: - # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP - if "ECO" in pre: - # ICMP - return "Established" - elif "UNK" in pre: - # ICMP6 unknown upper layer - return "Established" - elif "CON" in pre: - # UDP - return "Established" - elif "INT" in pre: - # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there - # is no confirmation of what happened. - return "Not Established" - elif "EST" in pre: - # TCP - return "Established" - elif "RST" in pre: - # TCP. 
When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established - # It can happen that is reseted being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. - return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - elif "FIN" in pre: - # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established - # It can happen that is finished being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. - return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - else: - """ - Examples: - S_ - FA_ - PA_ - FSA_ - SEC_ - SRPA_ - """ - return "Not Established" - except Exception: - exception_line = sys.exc_info()[2].tb_lineno - self.print( - f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", - 0, - 1, - ) - self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ From 456bd7208ababe3b0081b46380466f1301f02c2f Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Mon, 29 Jul 2024 16:32:14 +0200 Subject: [PATCH 167/498] mlflow. Add a function to convert the state again --- modules/flowmldetection/flowmldetection.py | 163 +++++++++++++++++++-- 1 file changed, 149 insertions(+), 14 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index fb17b57f2..c8226368c 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -8,6 +8,7 @@ import pickle import pandas as pd import json +import datetime import traceback import warnings import sys @@ -121,6 +122,141 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) + + def get_final_state_from_flags(self, state, pkts): + """ + Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags + We receive the pakets to distinguish some Reset connections + """ + try: + pre = state.split("_")[0] + try: + # Try suricata states + """ + There are different states in which a flow can be. + Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, + these are: New, Established and Closed,for UDP only new and established. + For each of these states Suricata can employ different timeouts. + """ + if "new" in state or "established" in state: + return "Established" + elif "closed" in state: + return "Not Established" + + # We have varius type of states depending on the type of flow. 
+ # For Zeek + if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): + return "Not Established" + elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): + return "Established" + + # For Argus + suf = state.split("_")[1] + if "S" in pre and "A" in pre and "S" in suf and "A" in suf: + """ + Examples: + SA_SA + SR_SA + FSRA_SA + SPA_SPA + SRA_SPA + FSA_FSA + FSA_FSPA + SAEC_SPA + SRPA_SPA + FSPA_SPA + FSRPA_SPA + FSPA_FSPA + FSRA_FSPA + SRAEC_SPA + FSPA_FSRPA + FSAEC_FSPA + FSRPA_FSPA + SRPAEC_SPA + FSPAEC_FSPA + SRPAEC_FSRPA + """ + return "Established" + elif "PA" in pre and "PA" in suf: + # Tipical flow that was reported in the middle + """ + Examples: + PA_PA + FPA_FPA + """ + return "Established" + elif "ECO" in pre: + return "ICMP Echo" + elif "ECR" in pre: + return "ICMP Reply" + elif "URH" in pre: + return "ICMP Host Unreachable" + elif "URP" in pre: + return "ICMP Port Unreachable" + else: + """ + Examples: + S_RA + S_R + A_R + S_SA + SR_SA + FA_FA + SR_RA + SEC_RA + """ + return "Not Established" + except IndexError: + # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP + if "ECO" in pre: + # ICMP + return "Established" + elif "UNK" in pre: + # ICMP6 unknown upper layer + return "Established" + elif "CON" in pre: + # UDP + return "Established" + elif "INT" in pre: + # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there + # is no confirmation of what happened. + return "Not Established" + elif "EST" in pre: + # TCP + return "Established" + elif "RST" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established + # It can happen that is reseted being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. + return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + elif "FIN" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established + # It can happen that is finished being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. 
+ return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + else: + """ + Examples: + S_ + FA_ + PA_ + FSA_ + SEC_ + SRPA_ + """ + return "Not Established" + except Exception: + exception_line = sys.exc_info()[2].tb_lineno + self.print( + f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", + 0, + 1, + ) + self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ @@ -156,17 +292,12 @@ def process_features(self, dataset): except (ValueError, KeyError): pass - # When flows are read from Slips sqlite, - # the state is not transformed to 'Established' or - # 'Not Established', it is still 'S0' and others + # When flows are read from Slips sqlite, the state is not transformed to 'Established' or 'Not Established', it is still 'S0' and others # So transform here - dataset["state"] = dataset.apply( - lambda row: get_final_state_from_flags( - row["state"], row["pkts"] - ), - axis=1, - ) - # dataset.state = new_state_column + #new_state_column = dataset.state.apply(self.get_final_state_from_flags(dataset.state, dataset.pkts)) + dataset['state'] = dataset.apply(lambda row: self.get_final_state_from_flags(row['state'], row['pkts']), axis=1) + + #dataset.state = new_state_column # Convert state to categorical dataset.state = dataset.state.str.replace( @@ -393,21 +524,25 @@ def read_model(self): def set_evidence_malicious_flow(self, flow: dict, twid: str): confidence: float = 0.1 description = ( - f"Flow with malicious characteristics by ML. Src IP" + f"Malicious flow by ML. Src IP" f" {flow['saddr']}:{flow['sport']} to " f"{flow['daddr']}:{flow['dport']}" ) + + timestamp = utils.convert_format( + datetime.datetime.now(), utils.alerts_format + ) twid_number = int(twid.replace("timewindow", "")) evidence: Evidence = Evidence( evidence_type=EvidenceType.MALICIOUS_FLOW, attacker=Attacker( direction=Direction.SRC, - ioc_type=IoCType.IP, + attacker_type=IoCType.IP, value=flow["saddr"], ), victim=Victim( direction=Direction.DST, - ioc_type=IoCType.IP, + victim_type=IoCType.IP, value=flow["daddr"], ), threat_level=ThreatLevel.LOW, @@ -416,7 +551,7 @@ def set_evidence_malicious_flow(self, flow: dict, twid: str): profile=ProfileID(ip=flow["saddr"]), timewindow=TimeWindow(twid_number), uid=[flow["uid"]], - timestamp=flow["starttime"], + timestamp=timestamp, method=Method.AI, src_port=flow["sport"], dst_port=flow["dport"], From 592edafb650e53bd0d2bcbc5bf94e5488e2807f7 Mon Sep 17 00:00:00 2001 From: alya Date: Tue, 30 Jul 2024 14:35:21 +0300 Subject: [PATCH 168/498] state_handler: split get_final_state_from_flags() into smaller functions --- slips_files/common/state_handler.py | 67 ++++++++++++++++------------- 1 file changed, 38 insertions(+), 29 deletions(-) diff --git a/slips_files/common/state_handler.py b/slips_files/common/state_handler.py index d0a05115b..b671a09a2 100644 --- a/slips_files/common/state_handler.py +++ b/slips_files/common/state_handler.py @@ -1,7 +1,9 @@ from typing import Optional +import sys +import traceback -def interpret_suricata_states(state) -> Optional[str]: +def check_suricata_states(state) -> Optional[str]: """ There are different states in which a flow can be. Suricata distinguishes three flow-states for TCP and two for @@ -16,7 +18,7 @@ def interpret_suricata_states(state) -> Optional[str]: return "Not Established" -def interpret_zeek_states(state) -> Optional[str]: +def check_zeek_states(state) -> Optional[str]: # We have varius type of states depending on the type of flow. 
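process_features() also drops the identifier-like columns (addresses, MACs, timestamps) one field at a time inside a try/except, so a column that is absent from a given input does not abort the whole cleanup. The pattern on a toy frame, with pandas assumed installed and the column names chosen for illustration:

import pandas as pd

df = pd.DataFrame({"saddr": ["10.0.0.1"], "dur": [0.5], "pkts": [4]})
for field in ("saddr", "smac", "ts"):  # "smac" and "ts" are missing here
    try:
        df = df.drop(field, axis=1)
    except KeyError:
        pass  # tolerate inputs that never had this column
print(list(df.columns))  # ['dur', 'pkts']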
# For Zeek if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): @@ -25,13 +27,9 @@ def interpret_zeek_states(state) -> Optional[str]: return "Established" -def interpret_argus_states(state) -> Optional[str]: +def check_argus_states(state) -> Optional[str]: pre = state.split("_")[0] - try: - suf = state.split("_")[1] - except IndexError: - return - + suf = state.split("_")[1] if "S" in pre and "A" in pre and "S" in suf and "A" in suf: """ Examples: @@ -88,7 +86,7 @@ def interpret_argus_states(state) -> Optional[str]: return "Not Established" -def interpret_tcp_states(state, pkts) -> Optional[str]: +def check_tcp_states(state, pkts) -> Optional[str]: pre = state.split("_")[0] if "EST" in pre: # TCP @@ -124,7 +122,7 @@ def interpret_tcp_states(state, pkts) -> Optional[str]: return "Not Established" -def interpret_udp_states(state) -> Optional[str]: +def check_udp_states(state) -> Optional[str]: pre = state.split("_")[0] if "CON" in pre: # UDP @@ -136,7 +134,7 @@ def interpret_udp_states(state) -> Optional[str]: return "Not Established" -def interpret_icmp_states(state) -> Optional[str]: +def check_icmp_states(state) -> Optional[str]: pre = state.split("_")[0] if "ECO" in pre: # ICMP @@ -146,25 +144,36 @@ def interpret_icmp_states(state) -> Optional[str]: return "Established" -def get_final_state_from_flags(state, pkts) -> str: +def get_final_state_from_flags(self, state, pkts) -> str: """ - Converts the original flags from the flow, to a state that slips - understands - Works with Argus, suricata, and Bro flags - We receive the packets to distinguish some Reset connections + Analyze the flags given and return a summary of the state. + Should work with Argus and Bro flags + We receive the pakets to distinguish some Reset connections """ + try: + if state := check_suricata_states(state): + return state + if state := check_zeek_states(state): + return state + if state := check_argus_states(state): + return state + except IndexError: + # suf does not exist, which means that this is some ICMP or + # no response was sent for UDP or TCP + if state := check_icmp_states(state): + return state + if state := check_udp_states(state): + return state + if state := check_tcp_states(state, pkts): + return state - for interpreter in ( - interpret_suricata_states, - interpret_zeek_states, - interpret_argus_states, - interpret_icmp_states, - interpret_udp_states, - ): - if interpreted_state := interpreter(state): - return interpreted_state - - if interpreted_state := interpret_tcp_states(state, pkts): - return interpreted_state + return "Not Established" - return "Not Established" + except Exception: + exception_line = sys.exc_info()[2].tb_lineno + self.print( + f"Error in get_final_state_from_flags() " f"line {exception_line}", + 0, + 1, + ) + self.print(traceback.format_exc(), 0, 1) From 759e8597228c569727eb85c9c40aa5130903602f Mon Sep 17 00:00:00 2001 From: alya Date: Tue, 30 Jul 2024 14:48:16 +0300 Subject: [PATCH 169/498] state_handler: refactor get_final_state_from_flags() --- slips_files/common/state_handler.py | 67 +++++++++++++---------------- 1 file changed, 29 insertions(+), 38 deletions(-) diff --git a/slips_files/common/state_handler.py b/slips_files/common/state_handler.py index b671a09a2..d0a05115b 100644 --- a/slips_files/common/state_handler.py +++ b/slips_files/common/state_handler.py @@ -1,9 +1,7 @@ from typing import Optional -import sys -import traceback -def check_suricata_states(state) -> Optional[str]: +def interpret_suricata_states(state) -> Optional[str]: """ There are different 
states in which a flow can be. Suricata distinguishes three flow-states for TCP and two for @@ -18,7 +16,7 @@ def check_suricata_states(state) -> Optional[str]: return "Not Established" -def check_zeek_states(state) -> Optional[str]: +def interpret_zeek_states(state) -> Optional[str]: # We have varius type of states depending on the type of flow. # For Zeek if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): @@ -27,9 +25,13 @@ def check_zeek_states(state) -> Optional[str]: return "Established" -def check_argus_states(state) -> Optional[str]: +def interpret_argus_states(state) -> Optional[str]: pre = state.split("_")[0] - suf = state.split("_")[1] + try: + suf = state.split("_")[1] + except IndexError: + return + if "S" in pre and "A" in pre and "S" in suf and "A" in suf: """ Examples: @@ -86,7 +88,7 @@ def check_argus_states(state) -> Optional[str]: return "Not Established" -def check_tcp_states(state, pkts) -> Optional[str]: +def interpret_tcp_states(state, pkts) -> Optional[str]: pre = state.split("_")[0] if "EST" in pre: # TCP @@ -122,7 +124,7 @@ def check_tcp_states(state, pkts) -> Optional[str]: return "Not Established" -def check_udp_states(state) -> Optional[str]: +def interpret_udp_states(state) -> Optional[str]: pre = state.split("_")[0] if "CON" in pre: # UDP @@ -134,7 +136,7 @@ def check_udp_states(state) -> Optional[str]: return "Not Established" -def check_icmp_states(state) -> Optional[str]: +def interpret_icmp_states(state) -> Optional[str]: pre = state.split("_")[0] if "ECO" in pre: # ICMP @@ -144,36 +146,25 @@ def check_icmp_states(state) -> Optional[str]: return "Established" -def get_final_state_from_flags(self, state, pkts) -> str: +def get_final_state_from_flags(state, pkts) -> str: """ - Analyze the flags given and return a summary of the state. 
- Should work with Argus and Bro flags - We receive the pakets to distinguish some Reset connections + Converts the original flags from the flow, to a state that slips + understands + Works with Argus, suricata, and Bro flags + We receive the packets to distinguish some Reset connections """ - try: - if state := check_suricata_states(state): - return state - if state := check_zeek_states(state): - return state - if state := check_argus_states(state): - return state - except IndexError: - # suf does not exist, which means that this is some ICMP or - # no response was sent for UDP or TCP - if state := check_icmp_states(state): - return state - if state := check_udp_states(state): - return state - if state := check_tcp_states(state, pkts): - return state - return "Not Established" + for interpreter in ( + interpret_suricata_states, + interpret_zeek_states, + interpret_argus_states, + interpret_icmp_states, + interpret_udp_states, + ): + if interpreted_state := interpreter(state): + return interpreted_state + + if interpreted_state := interpret_tcp_states(state, pkts): + return interpreted_state - except Exception: - exception_line = sys.exc_info()[2].tb_lineno - self.print( - f"Error in get_final_state_from_flags() " f"line {exception_line}", - 0, - 1, - ) - self.print(traceback.format_exc(), 0, 1) + return "Not Established" From 07542a4d60f8828af3adb6b11de50356cd760dee Mon Sep 17 00:00:00 2001 From: alya Date: Tue, 30 Jul 2024 14:59:22 +0300 Subject: [PATCH 170/498] delete get_final_state_from_flags() from flowmldetection, profiler, and the db --- modules/flowmldetection/flowmldetection.py | 150 ++------------------- 1 file changed, 10 insertions(+), 140 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index c8226368c..9af514a70 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -122,141 +122,6 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) - - def get_final_state_from_flags(self, state, pkts): - """ - Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags - We receive the pakets to distinguish some Reset connections - """ - try: - pre = state.split("_")[0] - try: - # Try suricata states - """ - There are different states in which a flow can be. - Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, - these are: New, Established and Closed,for UDP only new and established. - For each of these states Suricata can employ different timeouts. - """ - if "new" in state or "established" in state: - return "Established" - elif "closed" in state: - return "Not Established" - - # We have varius type of states depending on the type of flow. 
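With the final shape of get_final_state_from_flags() from PATCH 169 above, a few sample verdicts show the precedence of the interpreters. This assumes the module is importable as slips_files.common.state_handler (the path the patches create):

from slips_files.common.state_handler import get_final_state_from_flags

assert get_final_state_from_flags("S0", 1) == "Not Established"       # Zeek: no reply
assert get_final_state_from_flags("FSPA_FSPA", 12) == "Established"   # Argus: handshake both ways
assert get_final_state_from_flags("CON", 4) == "Established"          # Argus UDP single-word state
assert get_final_state_from_flags("RST", 2) == "Not Established"      # TCP reset after <=3 packets
assert get_final_state_from_flags("established", 9) == "Established"  # Suricata textual state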
- # For Zeek - if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): - return "Not Established" - elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): - return "Established" - - # For Argus - suf = state.split("_")[1] - if "S" in pre and "A" in pre and "S" in suf and "A" in suf: - """ - Examples: - SA_SA - SR_SA - FSRA_SA - SPA_SPA - SRA_SPA - FSA_FSA - FSA_FSPA - SAEC_SPA - SRPA_SPA - FSPA_SPA - FSRPA_SPA - FSPA_FSPA - FSRA_FSPA - SRAEC_SPA - FSPA_FSRPA - FSAEC_FSPA - FSRPA_FSPA - SRPAEC_SPA - FSPAEC_FSPA - SRPAEC_FSRPA - """ - return "Established" - elif "PA" in pre and "PA" in suf: - # Tipical flow that was reported in the middle - """ - Examples: - PA_PA - FPA_FPA - """ - return "Established" - elif "ECO" in pre: - return "ICMP Echo" - elif "ECR" in pre: - return "ICMP Reply" - elif "URH" in pre: - return "ICMP Host Unreachable" - elif "URP" in pre: - return "ICMP Port Unreachable" - else: - """ - Examples: - S_RA - S_R - A_R - S_SA - SR_SA - FA_FA - SR_RA - SEC_RA - """ - return "Not Established" - except IndexError: - # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP - if "ECO" in pre: - # ICMP - return "Established" - elif "UNK" in pre: - # ICMP6 unknown upper layer - return "Established" - elif "CON" in pre: - # UDP - return "Established" - elif "INT" in pre: - # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there - # is no confirmation of what happened. - return "Not Established" - elif "EST" in pre: - # TCP - return "Established" - elif "RST" in pre: - # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established - # It can happen that is reseted being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. - return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - elif "FIN" in pre: - # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established - # It can happen that is finished being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. 
- return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - else: - """ - Examples: - S_ - FA_ - PA_ - FSA_ - SEC_ - SRPA_ - """ - return "Not Established" - except Exception: - exception_line = sys.exc_info()[2].tb_lineno - self.print( - f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", - 0, - 1, - ) - self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ @@ -292,12 +157,17 @@ def process_features(self, dataset): except (ValueError, KeyError): pass - # When flows are read from Slips sqlite, the state is not transformed to 'Established' or 'Not Established', it is still 'S0' and others + # When flows are read from Slips sqlite, + # the state is not transformed to 'Established' or + # 'Not Established', it is still 'S0' and others # So transform here - #new_state_column = dataset.state.apply(self.get_final_state_from_flags(dataset.state, dataset.pkts)) - dataset['state'] = dataset.apply(lambda row: self.get_final_state_from_flags(row['state'], row['pkts']), axis=1) - - #dataset.state = new_state_column + dataset["state"] = dataset.apply( + lambda row: get_final_state_from_flags( + row["state"], row["pkts"] + ), + axis=1, + ) + # dataset.state = new_state_column # Convert state to categorical dataset.state = dataset.state.str.replace( From 748f2d35ebab0b22a0e993f7165c7fb6140d2749 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Mon, 29 Jul 2024 16:32:14 +0200 Subject: [PATCH 171/498] mlflow. Add a function to convert the state again --- modules/flowmldetection/flowmldetection.py | 135 +++++++++++++++++++++ 1 file changed, 135 insertions(+) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 9af514a70..94eb27afd 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -122,6 +122,141 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) + + def get_final_state_from_flags(self, state, pkts): + """ + Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags + We receive the pakets to distinguish some Reset connections + """ + try: + pre = state.split("_")[0] + try: + # Try suricata states + """ + There are different states in which a flow can be. + Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, + these are: New, Established and Closed,for UDP only new and established. + For each of these states Suricata can employ different timeouts. + """ + if "new" in state or "established" in state: + return "Established" + elif "closed" in state: + return "Not Established" + + # We have varius type of states depending on the type of flow. 
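The Zeek branch is a fixed vocabulary, so the two membership tests can equally be written as one lookup table. A sketch of that alternative (a style choice, not what the patches do):

ZEEK_VERDICT = {
    **dict.fromkeys(("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"), "Not Established"),
    **dict.fromkeys(("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"), "Established"),
}

def zeek_verdict(state: str):
    return ZEEK_VERDICT.get(state)  # None for non-Zeek states, as in the helpers

assert zeek_verdict("REJ") == "Not Established"
assert zeek_verdict("SF") == "Established"
assert zeek_verdict("FSPA_FSPA") is None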
+ # For Zeek + if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): + return "Not Established" + elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): + return "Established" + + # For Argus + suf = state.split("_")[1] + if "S" in pre and "A" in pre and "S" in suf and "A" in suf: + """ + Examples: + SA_SA + SR_SA + FSRA_SA + SPA_SPA + SRA_SPA + FSA_FSA + FSA_FSPA + SAEC_SPA + SRPA_SPA + FSPA_SPA + FSRPA_SPA + FSPA_FSPA + FSRA_FSPA + SRAEC_SPA + FSPA_FSRPA + FSAEC_FSPA + FSRPA_FSPA + SRPAEC_SPA + FSPAEC_FSPA + SRPAEC_FSRPA + """ + return "Established" + elif "PA" in pre and "PA" in suf: + # Tipical flow that was reported in the middle + """ + Examples: + PA_PA + FPA_FPA + """ + return "Established" + elif "ECO" in pre: + return "ICMP Echo" + elif "ECR" in pre: + return "ICMP Reply" + elif "URH" in pre: + return "ICMP Host Unreachable" + elif "URP" in pre: + return "ICMP Port Unreachable" + else: + """ + Examples: + S_RA + S_R + A_R + S_SA + SR_SA + FA_FA + SR_RA + SEC_RA + """ + return "Not Established" + except IndexError: + # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP + if "ECO" in pre: + # ICMP + return "Established" + elif "UNK" in pre: + # ICMP6 unknown upper layer + return "Established" + elif "CON" in pre: + # UDP + return "Established" + elif "INT" in pre: + # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there + # is no confirmation of what happened. + return "Not Established" + elif "EST" in pre: + # TCP + return "Established" + elif "RST" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established + # It can happen that is reseted being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. + return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + elif "FIN" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established + # It can happen that is finished being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. 
+ return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + else: + """ + Examples: + S_ + FA_ + PA_ + FSA_ + SEC_ + SRPA_ + """ + return "Not Established" + except Exception: + exception_line = sys.exc_info()[2].tb_lineno + self.print( + f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", + 0, + 1, + ) + self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ From 689dc79ef2926c581b2f0b9d7a4fd75a186f12ba Mon Sep 17 00:00:00 2001 From: alya Date: Tue, 30 Jul 2024 14:59:22 +0300 Subject: [PATCH 172/498] delete get_final_state_from_flags() from flowmldetection, profiler, and the db --- modules/flowmldetection/flowmldetection.py | 135 --------------------- 1 file changed, 135 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 94eb27afd..9af514a70 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -122,141 +122,6 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) - - def get_final_state_from_flags(self, state, pkts): - """ - Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags - We receive the pakets to distinguish some Reset connections - """ - try: - pre = state.split("_")[0] - try: - # Try suricata states - """ - There are different states in which a flow can be. - Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, - these are: New, Established and Closed,for UDP only new and established. - For each of these states Suricata can employ different timeouts. - """ - if "new" in state or "established" in state: - return "Established" - elif "closed" in state: - return "Not Established" - - # We have varius type of states depending on the type of flow. - # For Zeek - if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): - return "Not Established" - elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): - return "Established" - - # For Argus - suf = state.split("_")[1] - if "S" in pre and "A" in pre and "S" in suf and "A" in suf: - """ - Examples: - SA_SA - SR_SA - FSRA_SA - SPA_SPA - SRA_SPA - FSA_FSA - FSA_FSPA - SAEC_SPA - SRPA_SPA - FSPA_SPA - FSRPA_SPA - FSPA_FSPA - FSRA_FSPA - SRAEC_SPA - FSPA_FSRPA - FSAEC_FSPA - FSRPA_FSPA - SRPAEC_SPA - FSPAEC_FSPA - SRPAEC_FSRPA - """ - return "Established" - elif "PA" in pre and "PA" in suf: - # Tipical flow that was reported in the middle - """ - Examples: - PA_PA - FPA_FPA - """ - return "Established" - elif "ECO" in pre: - return "ICMP Echo" - elif "ECR" in pre: - return "ICMP Reply" - elif "URH" in pre: - return "ICMP Host Unreachable" - elif "URP" in pre: - return "ICMP Port Unreachable" - else: - """ - Examples: - S_RA - S_R - A_R - S_SA - SR_SA - FA_FA - SR_RA - SEC_RA - """ - return "Not Established" - except IndexError: - # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP - if "ECO" in pre: - # ICMP - return "Established" - elif "UNK" in pre: - # ICMP6 unknown upper layer - return "Established" - elif "CON" in pre: - # UDP - return "Established" - elif "INT" in pre: - # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there - # is no confirmation of what happened. - return "Not Established" - elif "EST" in pre: - # TCP - return "Established" - elif "RST" in pre: - # TCP. 
When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established - # It can happen that is reseted being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. - return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - elif "FIN" in pre: - # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established - # It can happen that is finished being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. - return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - else: - """ - Examples: - S_ - FA_ - PA_ - FSA_ - SEC_ - SRPA_ - """ - return "Not Established" - except Exception: - exception_line = sys.exc_info()[2].tb_lineno - self.print( - f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", - 0, - 1, - ) - self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ From 34ca9a52592e632e6ea5d28dd486b84c0175fee1 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Mon, 29 Jul 2024 16:32:14 +0200 Subject: [PATCH 173/498] mlflow. Add a function to convert the state again --- modules/flowmldetection/flowmldetection.py | 150 +++++++++++++++++++-- 1 file changed, 140 insertions(+), 10 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 9af514a70..c8226368c 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -122,6 +122,141 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) + + def get_final_state_from_flags(self, state, pkts): + """ + Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags + We receive the pakets to distinguish some Reset connections + """ + try: + pre = state.split("_")[0] + try: + # Try suricata states + """ + There are different states in which a flow can be. + Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, + these are: New, Established and Closed,for UDP only new and established. + For each of these states Suricata can employ different timeouts. + """ + if "new" in state or "established" in state: + return "Established" + elif "closed" in state: + return "Not Established" + + # We have varius type of states depending on the type of flow. 
+                # For Zeek
+                if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
+                    return "Not Established"
+                elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"):
+                    return "Established"
+
+                # For Argus
+                suf = state.split("_")[1]
+                if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
+                    """
+                    Examples:
+                    SA_SA
+                    SR_SA
+                    FSRA_SA
+                    SPA_SPA
+                    SRA_SPA
+                    FSA_FSA
+                    FSA_FSPA
+                    SAEC_SPA
+                    SRPA_SPA
+                    FSPA_SPA
+                    FSRPA_SPA
+                    FSPA_FSPA
+                    FSRA_FSPA
+                    SRAEC_SPA
+                    FSPA_FSRPA
+                    FSAEC_FSPA
+                    FSRPA_FSPA
+                    SRPAEC_SPA
+                    FSPAEC_FSPA
+                    SRPAEC_FSRPA
+                    """
+                    return "Established"
+                elif "PA" in pre and "PA" in suf:
+                    # Typical flow that was reported in the middle
+                    """
+                    Examples:
+                    PA_PA
+                    FPA_FPA
+                    """
+                    return "Established"
+                elif "ECO" in pre:
+                    return "ICMP Echo"
+                elif "ECR" in pre:
+                    return "ICMP Reply"
+                elif "URH" in pre:
+                    return "ICMP Host Unreachable"
+                elif "URP" in pre:
+                    return "ICMP Port Unreachable"
+                else:
+                    """
+                    Examples:
+                    S_RA
+                    S_R
+                    A_R
+                    S_SA
+                    SR_SA
+                    FA_FA
+                    SR_RA
+                    SEC_RA
+                    """
+                    return "Not Established"
+            except IndexError:
+                # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP
+                if "ECO" in pre:
+                    # ICMP
+                    return "Established"
+                elif "UNK" in pre:
+                    # ICMP6 unknown upper layer
+                    return "Established"
+                elif "CON" in pre:
+                    # UDP
+                    return "Established"
+                elif "INT" in pre:
+                    # UDP trying to connect, NOT precisely not established but also NOT 'Established'. So we consider it not established because there
+                    # is no confirmation of what happened.
+                    return "Not Established"
+                elif "EST" in pre:
+                    # TCP
+                    return "Established"
+                elif "RST" in pre:
+                    # TCP. When -z B is not used in argus, states are single words. Most connections are reset when finished and are therefore established.
+                    # It can happen that a connection is reset without being established, but we can't tell without -z b.
+                    # So we use the amount of packets as a heuristic: if <=3, it is not established, because the OS retries 3 times.
+                    return (
+                        "Not Established" if int(pkts) <= 3 else "Established"
+                    )
+                elif "FIN" in pre:
+                    # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN and are therefore established.
+                    # It can happen that a connection finishes without being established, but we can't tell without -z b.
+                    # So we use the amount of packets as a heuristic: if <=3, it is not established, because the OS retries 3 times.
+ return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + else: + """ + Examples: + S_ + FA_ + PA_ + FSA_ + SEC_ + SRPA_ + """ + return "Not Established" + except Exception: + exception_line = sys.exc_info()[2].tb_lineno + self.print( + f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", + 0, + 1, + ) + self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ @@ -157,17 +292,12 @@ def process_features(self, dataset): except (ValueError, KeyError): pass - # When flows are read from Slips sqlite, - # the state is not transformed to 'Established' or - # 'Not Established', it is still 'S0' and others + # When flows are read from Slips sqlite, the state is not transformed to 'Established' or 'Not Established', it is still 'S0' and others # So transform here - dataset["state"] = dataset.apply( - lambda row: get_final_state_from_flags( - row["state"], row["pkts"] - ), - axis=1, - ) - # dataset.state = new_state_column + #new_state_column = dataset.state.apply(self.get_final_state_from_flags(dataset.state, dataset.pkts)) + dataset['state'] = dataset.apply(lambda row: self.get_final_state_from_flags(row['state'], row['pkts']), axis=1) + + #dataset.state = new_state_column # Convert state to categorical dataset.state = dataset.state.str.replace( From 4bd6701d5fbdc655bf1b08b34cbfd3089ea0b852 Mon Sep 17 00:00:00 2001 From: alya Date: Tue, 30 Jul 2024 14:35:21 +0300 Subject: [PATCH 174/498] state_handler: split get_final_state_from_flags() into smaller functions --- slips_files/common/state_handler.py | 67 ++++++++++++++++------------- 1 file changed, 38 insertions(+), 29 deletions(-) diff --git a/slips_files/common/state_handler.py b/slips_files/common/state_handler.py index d0a05115b..b671a09a2 100644 --- a/slips_files/common/state_handler.py +++ b/slips_files/common/state_handler.py @@ -1,7 +1,9 @@ from typing import Optional +import sys +import traceback -def interpret_suricata_states(state) -> Optional[str]: +def check_suricata_states(state) -> Optional[str]: """ There are different states in which a flow can be. Suricata distinguishes three flow-states for TCP and two for @@ -16,7 +18,7 @@ def interpret_suricata_states(state) -> Optional[str]: return "Not Established" -def interpret_zeek_states(state) -> Optional[str]: +def check_zeek_states(state) -> Optional[str]: # We have varius type of states depending on the type of flow. 
# For Zeek if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): @@ -25,13 +27,9 @@ def interpret_zeek_states(state) -> Optional[str]: return "Established" -def interpret_argus_states(state) -> Optional[str]: +def check_argus_states(state) -> Optional[str]: pre = state.split("_")[0] - try: - suf = state.split("_")[1] - except IndexError: - return - + suf = state.split("_")[1] if "S" in pre and "A" in pre and "S" in suf and "A" in suf: """ Examples: @@ -88,7 +86,7 @@ def interpret_argus_states(state) -> Optional[str]: return "Not Established" -def interpret_tcp_states(state, pkts) -> Optional[str]: +def check_tcp_states(state, pkts) -> Optional[str]: pre = state.split("_")[0] if "EST" in pre: # TCP @@ -124,7 +122,7 @@ def interpret_tcp_states(state, pkts) -> Optional[str]: return "Not Established" -def interpret_udp_states(state) -> Optional[str]: +def check_udp_states(state) -> Optional[str]: pre = state.split("_")[0] if "CON" in pre: # UDP @@ -136,7 +134,7 @@ def interpret_udp_states(state) -> Optional[str]: return "Not Established" -def interpret_icmp_states(state) -> Optional[str]: +def check_icmp_states(state) -> Optional[str]: pre = state.split("_")[0] if "ECO" in pre: # ICMP @@ -146,25 +144,36 @@ def interpret_icmp_states(state) -> Optional[str]: return "Established" -def get_final_state_from_flags(state, pkts) -> str: +def get_final_state_from_flags(self, state, pkts) -> str: """ - Converts the original flags from the flow, to a state that slips - understands - Works with Argus, suricata, and Bro flags - We receive the packets to distinguish some Reset connections + Analyze the flags given and return a summary of the state. + Should work with Argus and Bro flags + We receive the pakets to distinguish some Reset connections """ + try: + if state := check_suricata_states(state): + return state + if state := check_zeek_states(state): + return state + if state := check_argus_states(state): + return state + except IndexError: + # suf does not exist, which means that this is some ICMP or + # no response was sent for UDP or TCP + if state := check_icmp_states(state): + return state + if state := check_udp_states(state): + return state + if state := check_tcp_states(state, pkts): + return state - for interpreter in ( - interpret_suricata_states, - interpret_zeek_states, - interpret_argus_states, - interpret_icmp_states, - interpret_udp_states, - ): - if interpreted_state := interpreter(state): - return interpreted_state - - if interpreted_state := interpret_tcp_states(state, pkts): - return interpreted_state + return "Not Established" - return "Not Established" + except Exception: + exception_line = sys.exc_info()[2].tb_lineno + self.print( + f"Error in get_final_state_from_flags() " f"line {exception_line}", + 0, + 1, + ) + self.print(traceback.format_exc(), 0, 1) From 0fa7bb66ea522aeaa6bc7ef6a128436cc38f61d9 Mon Sep 17 00:00:00 2001 From: alya Date: Tue, 30 Jul 2024 14:48:16 +0300 Subject: [PATCH 175/498] state_handler: refactor get_final_state_from_flags() --- slips_files/common/state_handler.py | 67 +++++++++++++---------------- 1 file changed, 29 insertions(+), 38 deletions(-) diff --git a/slips_files/common/state_handler.py b/slips_files/common/state_handler.py index b671a09a2..d0a05115b 100644 --- a/slips_files/common/state_handler.py +++ b/slips_files/common/state_handler.py @@ -1,9 +1,7 @@ from typing import Optional -import sys -import traceback -def check_suricata_states(state) -> Optional[str]: +def interpret_suricata_states(state) -> Optional[str]: """ There are different 
states in which a flow can be. Suricata distinguishes three flow-states for TCP and two for @@ -18,7 +16,7 @@ def check_suricata_states(state) -> Optional[str]: return "Not Established" -def check_zeek_states(state) -> Optional[str]: +def interpret_zeek_states(state) -> Optional[str]: # We have varius type of states depending on the type of flow. # For Zeek if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): @@ -27,9 +25,13 @@ def check_zeek_states(state) -> Optional[str]: return "Established" -def check_argus_states(state) -> Optional[str]: +def interpret_argus_states(state) -> Optional[str]: pre = state.split("_")[0] - suf = state.split("_")[1] + try: + suf = state.split("_")[1] + except IndexError: + return + if "S" in pre and "A" in pre and "S" in suf and "A" in suf: """ Examples: @@ -86,7 +88,7 @@ def check_argus_states(state) -> Optional[str]: return "Not Established" -def check_tcp_states(state, pkts) -> Optional[str]: +def interpret_tcp_states(state, pkts) -> Optional[str]: pre = state.split("_")[0] if "EST" in pre: # TCP @@ -122,7 +124,7 @@ def check_tcp_states(state, pkts) -> Optional[str]: return "Not Established" -def check_udp_states(state) -> Optional[str]: +def interpret_udp_states(state) -> Optional[str]: pre = state.split("_")[0] if "CON" in pre: # UDP @@ -134,7 +136,7 @@ def check_udp_states(state) -> Optional[str]: return "Not Established" -def check_icmp_states(state) -> Optional[str]: +def interpret_icmp_states(state) -> Optional[str]: pre = state.split("_")[0] if "ECO" in pre: # ICMP @@ -144,36 +146,25 @@ def check_icmp_states(state) -> Optional[str]: return "Established" -def get_final_state_from_flags(self, state, pkts) -> str: +def get_final_state_from_flags(state, pkts) -> str: """ - Analyze the flags given and return a summary of the state. 
- Should work with Argus and Bro flags - We receive the pakets to distinguish some Reset connections + Converts the original flags from the flow, to a state that slips + understands + Works with Argus, suricata, and Bro flags + We receive the packets to distinguish some Reset connections """ - try: - if state := check_suricata_states(state): - return state - if state := check_zeek_states(state): - return state - if state := check_argus_states(state): - return state - except IndexError: - # suf does not exist, which means that this is some ICMP or - # no response was sent for UDP or TCP - if state := check_icmp_states(state): - return state - if state := check_udp_states(state): - return state - if state := check_tcp_states(state, pkts): - return state - return "Not Established" + for interpreter in ( + interpret_suricata_states, + interpret_zeek_states, + interpret_argus_states, + interpret_icmp_states, + interpret_udp_states, + ): + if interpreted_state := interpreter(state): + return interpreted_state + + if interpreted_state := interpret_tcp_states(state, pkts): + return interpreted_state - except Exception: - exception_line = sys.exc_info()[2].tb_lineno - self.print( - f"Error in get_final_state_from_flags() " f"line {exception_line}", - 0, - 1, - ) - self.print(traceback.format_exc(), 0, 1) + return "Not Established" From 26ef89d64d54e0b89815867791b76e31164fc076 Mon Sep 17 00:00:00 2001 From: alya Date: Tue, 30 Jul 2024 14:59:22 +0300 Subject: [PATCH 176/498] delete get_final_state_from_flags() from flowmldetection, profiler, and the db --- modules/flowmldetection/flowmldetection.py | 150 ++------------------- 1 file changed, 10 insertions(+), 140 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index c8226368c..9af514a70 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -122,141 +122,6 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) - - def get_final_state_from_flags(self, state, pkts): - """ - Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags - We receive the pakets to distinguish some Reset connections - """ - try: - pre = state.split("_")[0] - try: - # Try suricata states - """ - There are different states in which a flow can be. - Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, - these are: New, Established and Closed,for UDP only new and established. - For each of these states Suricata can employ different timeouts. - """ - if "new" in state or "established" in state: - return "Established" - elif "closed" in state: - return "Not Established" - - # We have varius type of states depending on the type of flow. 
-                # For Zeek
-                if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
-                    return "Not Established"
-                elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"):
-                    return "Established"
-
-                # For Argus
-                suf = state.split("_")[1]
-                if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
-                    """
-                    Examples:
-                    SA_SA
-                    SR_SA
-                    FSRA_SA
-                    SPA_SPA
-                    SRA_SPA
-                    FSA_FSA
-                    FSA_FSPA
-                    SAEC_SPA
-                    SRPA_SPA
-                    FSPA_SPA
-                    FSRPA_SPA
-                    FSPA_FSPA
-                    FSRA_FSPA
-                    SRAEC_SPA
-                    FSPA_FSRPA
-                    FSAEC_FSPA
-                    FSRPA_FSPA
-                    SRPAEC_SPA
-                    FSPAEC_FSPA
-                    SRPAEC_FSRPA
-                    """
-                    return "Established"
-                elif "PA" in pre and "PA" in suf:
-                    # Typical flow that was reported in the middle
-                    """
-                    Examples:
-                    PA_PA
-                    FPA_FPA
-                    """
-                    return "Established"
-                elif "ECO" in pre:
-                    return "ICMP Echo"
-                elif "ECR" in pre:
-                    return "ICMP Reply"
-                elif "URH" in pre:
-                    return "ICMP Host Unreachable"
-                elif "URP" in pre:
-                    return "ICMP Port Unreachable"
-                else:
-                    """
-                    Examples:
-                    S_RA
-                    S_R
-                    A_R
-                    S_SA
-                    SR_SA
-                    FA_FA
-                    SR_RA
-                    SEC_RA
-                    """
-                    return "Not Established"
-            except IndexError:
-                # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP
-                if "ECO" in pre:
-                    # ICMP
-                    return "Established"
-                elif "UNK" in pre:
-                    # ICMP6 unknown upper layer
-                    return "Established"
-                elif "CON" in pre:
-                    # UDP
-                    return "Established"
-                elif "INT" in pre:
-                    # UDP trying to connect, NOT precisely not established but also NOT 'Established'. So we consider it not established because there
-                    # is no confirmation of what happened.
-                    return "Not Established"
-                elif "EST" in pre:
-                    # TCP
-                    return "Established"
-                elif "RST" in pre:
-                    # TCP. When -z B is not used in argus, states are single words. Most connections are reset when finished and are therefore established.
-                    # It can happen that a connection is reset without being established, but we can't tell without -z b.
-                    # So we use the amount of packets as a heuristic: if <=3, it is not established, because the OS retries 3 times.
-                    return (
-                        "Not Established" if int(pkts) <= 3 else "Established"
-                    )
-                elif "FIN" in pre:
-                    # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN and are therefore established.
-                    # It can happen that a connection finishes without being established, but we can't tell without -z b.
-                    # So we use the amount of packets as a heuristic: if <=3, it is not established, because the OS retries 3 times.
- return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - else: - """ - Examples: - S_ - FA_ - PA_ - FSA_ - SEC_ - SRPA_ - """ - return "Not Established" - except Exception: - exception_line = sys.exc_info()[2].tb_lineno - self.print( - f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", - 0, - 1, - ) - self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ @@ -292,12 +157,17 @@ def process_features(self, dataset): except (ValueError, KeyError): pass - # When flows are read from Slips sqlite, the state is not transformed to 'Established' or 'Not Established', it is still 'S0' and others + # When flows are read from Slips sqlite, + # the state is not transformed to 'Established' or + # 'Not Established', it is still 'S0' and others # So transform here - #new_state_column = dataset.state.apply(self.get_final_state_from_flags(dataset.state, dataset.pkts)) - dataset['state'] = dataset.apply(lambda row: self.get_final_state_from_flags(row['state'], row['pkts']), axis=1) - - #dataset.state = new_state_column + dataset["state"] = dataset.apply( + lambda row: get_final_state_from_flags( + row["state"], row["pkts"] + ), + axis=1, + ) + # dataset.state = new_state_column # Convert state to categorical dataset.state = dataset.state.str.replace( From e5902bd3d82d7454fabf81106c7df10f5ca2472f Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Mon, 29 Jul 2024 16:32:14 +0200 Subject: [PATCH 177/498] mlflow. Add a function to convert the state again --- modules/flowmldetection/flowmldetection.py | 438 +++++++++++++-------- 1 file changed, 278 insertions(+), 160 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 9af514a70..124ec61f9 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -1,8 +1,3 @@ -# SPDX-FileCopyrightText: 2021 Sebastian Garcia -from typing import Optional - -# SPDX-License-Identifier: GPL-2.0-only -import numpy from sklearn.linear_model import SGDClassifier from sklearn.preprocessing import StandardScaler import pickle @@ -10,13 +5,10 @@ import json import datetime import traceback -import warnings import sys -from slips_files.common.parsers.config_parser import ConfigParser -from slips_files.common.slips_utils import utils -from slips_files.common.abstracts.module import IModule -from slips_files.core.structures.evidence import ( +from slips_files.common.imports import * +from slips_files.core.evidence_structure.evidence import ( Evidence, ProfileID, TimeWindow, @@ -25,8 +17,7 @@ EvidenceType, IoCType, Direction, - Victim, - Method, + IDEACategory, ) # Only for debbuging @@ -38,6 +29,8 @@ def warn(*args, **kwargs): pass +import warnings + warnings.warn = warn @@ -63,8 +56,6 @@ def init(self): # self.scores = [] # The scaler trained during training and to use during testing self.scaler = StandardScaler() - self.model_path = "./modules/flowmldetection/model.bin" - self.scaler_path = "./modules/flowmldetection/scaler.bin" def read_configuration(self): conf = ConfigParser() @@ -122,6 +113,141 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) + + def get_final_state_from_flags(self, state, pkts): + """ + Analyze the flags given and return a summary of the state. 
Should work with Argus and Bro flags
+        We receive the packets to distinguish some Reset connections
+        """
+        try:
+            pre = state.split("_")[0]
+            try:
+                # Try suricata states
+                """
+                There are different states in which a flow can be.
+                Suricata distinguishes three flow-states for TCP and two for UDP. For TCP,
+                these are: New, Established and Closed; for UDP only new and established.
+                For each of these states Suricata can employ different timeouts.
+                """
+                if "new" in state or "established" in state:
+                    return "Established"
+                elif "closed" in state:
+                    return "Not Established"
+
+                # We have various types of states depending on the type of flow.
+                # For Zeek
+                if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
+                    return "Not Established"
+                elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"):
+                    return "Established"
+
+                # For Argus
+                suf = state.split("_")[1]
+                if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
+                    """
+                    Examples:
+                    SA_SA
+                    SR_SA
+                    FSRA_SA
+                    SPA_SPA
+                    SRA_SPA
+                    FSA_FSA
+                    FSA_FSPA
+                    SAEC_SPA
+                    SRPA_SPA
+                    FSPA_SPA
+                    FSRPA_SPA
+                    FSPA_FSPA
+                    FSRA_FSPA
+                    SRAEC_SPA
+                    FSPA_FSRPA
+                    FSAEC_FSPA
+                    FSRPA_FSPA
+                    SRPAEC_SPA
+                    FSPAEC_FSPA
+                    SRPAEC_FSRPA
+                    """
+                    return "Established"
+                elif "PA" in pre and "PA" in suf:
+                    # Typical flow that was reported in the middle
+                    """
+                    Examples:
+                    PA_PA
+                    FPA_FPA
+                    """
+                    return "Established"
+                elif "ECO" in pre:
+                    return "ICMP Echo"
+                elif "ECR" in pre:
+                    return "ICMP Reply"
+                elif "URH" in pre:
+                    return "ICMP Host Unreachable"
+                elif "URP" in pre:
+                    return "ICMP Port Unreachable"
+                else:
+                    """
+                    Examples:
+                    S_RA
+                    S_R
+                    A_R
+                    S_SA
+                    SR_SA
+                    FA_FA
+                    SR_RA
+                    SEC_RA
+                    """
+                    return "Not Established"
+            except IndexError:
+                # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP
+                if "ECO" in pre:
+                    # ICMP
+                    return "Established"
+                elif "UNK" in pre:
+                    # ICMP6 unknown upper layer
+                    return "Established"
+                elif "CON" in pre:
+                    # UDP
+                    return "Established"
+                elif "INT" in pre:
+                    # UDP trying to connect, NOT precisely not established but also NOT 'Established'. So we consider it not established because there
+                    # is no confirmation of what happened.
+                    return "Not Established"
+                elif "EST" in pre:
+                    # TCP
+                    return "Established"
+                elif "RST" in pre:
+                    # TCP. When -z B is not used in argus, states are single words. Most connections are reset when finished and are therefore established.
+                    # It can happen that a connection is reset without being established, but we can't tell without -z b.
+                    # So we use the amount of packets as a heuristic: if <=3, it is not established, because the OS retries 3 times.
+                    return (
+                        "Not Established" if int(pkts) <= 3 else "Established"
+                    )
+                elif "FIN" in pre:
+                    # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN and are therefore established.
+                    # It can happen that a connection finishes without being established, but we can't tell without -z b.
+                    # So we use the amount of packets as a heuristic: if <=3, it is not established, because the OS retries 3 times.
+ return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + else: + """ + Examples: + S_ + FA_ + PA_ + FSA_ + SEC_ + SRPA_ + """ + return "Not Established" + except Exception: + exception_line = sys.exc_info()[2].tb_lineno + self.print( + f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", + 0, + 1, + ) + self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ @@ -130,7 +256,7 @@ def process_features(self, dataset): """ try: # Discard some type of flows that dont have ports - to_discard = ["arp", "ARP", "icmp", "igmp", "ipv6-icmp", ""] + to_discard = ["arp", "ARP", "icmp", "igmp", "ipv6-icmp"] for proto in to_discard: dataset = dataset[dataset.proto != proto] @@ -139,35 +265,28 @@ def process_features(self, dataset): "appproto", "daddr", "saddr", - "starttime", + "ts", + "origstate", "type_", - "smac", - "dmac", - "history", - "uid", "dir_", + "history", "dbytes", - "endtime", - "bytes", - "flow_source", + "dpkts", + "smac", + "dmac", ] for field in to_drop: try: dataset = dataset.drop(field, axis=1) - except (ValueError, KeyError): + except ValueError: pass - # When flows are read from Slips sqlite, - # the state is not transformed to 'Established' or - # 'Not Established', it is still 'S0' and others + # When flows are read from Slips sqlite, the state is not transformed to 'Established' or 'Not Established', it is still 'S0' and others # So transform here - dataset["state"] = dataset.apply( - lambda row: get_final_state_from_flags( - row["state"], row["pkts"] - ), - axis=1, - ) - # dataset.state = new_state_column + #new_state_column = dataset.state.apply(self.get_final_state_from_flags(dataset.state, dataset.pkts)) + dataset['state'] = dataset.apply(lambda row: self.get_final_state_from_flags(row['state'], row['pkts']), axis=1) + + #dataset.state = new_state_column # Convert state to categorical dataset.state = dataset.state.str.replace( @@ -201,23 +320,42 @@ def process_features(self, dataset): dataset.proto = dataset.proto.str.replace( r"(^.*arp.*$)", "4", regex=True ) - fields_to_convert_to_flow = [ - dataset.proto, - dataset.dport, - dataset.sport, - dataset.dur, - dataset.pkts, - dataset.spkts, - dataset.allbytes, - dataset.sbytes, - dataset.state, - ] - for field in fields_to_convert_to_flow: - try: - field = field.astype("float64") - except ValueError: - pass - + dataset.proto = dataset.proto.astype("float64") + try: + # Convert dport to float + dataset.dport = dataset.dport.astype("float") + except ValueError: + pass + try: + # Convert sport to float + dataset.sport = dataset.sport.astype("float") + except ValueError: + pass + try: + # Convert Dur to float + dataset.dur = dataset.dur.astype("float") + except ValueError: + pass + try: + # Convert TotPkts to float + dataset.pkts = dataset.pkts.astype("float") + except ValueError: + pass + try: + # Convert SrcPkts to float + dataset.spkts = dataset.spkts.astype("float") + except ValueError: + pass + try: + # Convert TotBytes to float + dataset.allbytes = dataset.allbytes.astype("float") + except ValueError: + pass + try: + # Convert SrcBytes to float + dataset.sbytes = dataset.sbytes.astype("float") + except ValueError: + pass return dataset except Exception: # Stop the timer @@ -233,6 +371,7 @@ def process_flows(self): # We get all the flows so far # because this retraining happens in batches flows = self.db.get_all_flows() + # Check how many different labels are in the DB # We need both normal and malware labels = self.db.get_labels() @@ -252,7 +391,9 @@ 
def process_flows(self): "daddr": "40.70.224.145", "dport": "443", "proto": "tcp", + "origstate": "SRPA_SPA", "state": "Established", + "pkts": 84, "allbytes": 42764, "spkts": 37, "sbytes": 25517, @@ -272,7 +413,9 @@ def process_flows(self): "daddr": "80.242.138.72", "dport": "80", "proto": "tcp", + "origstate": "SRPA_SPA", "state": "Established", + "pkts": 67, "allbytes": 67696, "spkts": 1, "sbytes": 100, @@ -298,55 +441,42 @@ def process_flows(self): self.print("Error in process_flows()") self.print(traceback.format_exc(), 0, 1) - def process_flow(self, flow_to_process: dict): + def process_flow(self): """ Process one flow. Only used during detection in testing - returns the pandas df with the processed flow + Store the pandas df in self.flow """ try: # Convert the flow to a pandas dataframe - raw_flow = pd.DataFrame(flow_to_process, index=[0]) + raw_flow = pd.DataFrame(self.flow_dict, index=[0]) + # Process features dflow = self.process_features(raw_flow) # Update the flow to the processed version - return dflow + self.flow = dflow except Exception: # Stop the timer self.print("Error in process_flow()") self.print(traceback.format_exc(), 0, 1) - def detect(self, x_flow) -> Optional[numpy.ndarray]: + def detect(self): """ - Detects the given flow with the current model stored - and returns the predection array + Detect this flow with the current model stored """ try: - given_x_flow = x_flow - # clean the flow - fields_to_drop = [ - "label", - "module_labels", - "uid", - "history", - "dir_", - "dbytes", - "dpkts", - "endtime", - "bytes", - "flow_source", - ] - for field in fields_to_drop: - try: - x_flow = x_flow.drop(field, axis=1) - except (KeyError, ValueError): - pass + # Store the real label if there is one + y_flow = self.flow["label"] + # remove the real label column + self.flow = self.flow.drop("label", axis=1) + # remove the label predictions column of the other modules + X_flow = self.flow.drop("module_labels", axis=1) # Scale the flow - x_flow: numpy.ndarray = self.scaler.transform(x_flow) - pred: numpy.ndarray = self.clf.predict(x_flow) + X_flow = self.scaler.transform(X_flow) + pred = self.clf.predict(X_flow) return pred - except Exception as e: - self.print( - f"Error in detect() while processing " f"\n{given_x_flow}\n{e}" - ) + except Exception: + # Stop the timer + self.print("Error in detect() X_flow:") + self.print(X_flow) self.print(traceback.format_exc(), 0, 1) def store_model(self): @@ -354,10 +484,10 @@ def store_model(self): Store the trained model on disk """ self.print("Storing the trained model and scaler on disk.", 0, 2) - with open(self.model_path, "wb") as f: + with open("./modules/flowmldetection/model.bin", "wb") as f: data = pickle.dumps(self.clf) f.write(data) - with open(self.scaler_path, "wb") as g: + with open("./modules/flowmldetection/scaler.bin", "wb") as g: data = pickle.dumps(self.scaler) g.write(data) @@ -367,23 +497,20 @@ def read_model(self): """ try: self.print("Reading the trained model from disk.", 0, 2) - with open(self.model_path, "rb") as f: + with open("./modules/flowmldetection/model.bin", "rb") as f: self.clf = pickle.load(f) self.print("Reading the trained scaler from disk.", 0, 2) - with open(self.scaler_path, "rb") as g: + with open("./modules/flowmldetection/scaler.bin", "rb") as g: self.scaler = pickle.load(g) except FileNotFoundError: # If there is no model, create one empty - self.print( - "There was no model. " "Creating a new empty model.", 0, 2 - ) + self.print("There was no model. 
Creating a new empty model.", 0, 2) self.clf = SGDClassifier( warm_start=True, loss="hinge", penalty="l1" ) except EOFError: self.print( - "Error reading model from disk. " - "Creating a new empty model.", + "Error reading model from disk. Creating a new empty model.", 0, 2, ) @@ -391,40 +518,39 @@ def read_model(self): warm_start=True, loss="hinge", penalty="l1" ) - def set_evidence_malicious_flow(self, flow: dict, twid: str): + def set_evidence_malicious_flow( + self, + saddr: str, + sport: str, + daddr: str, + dport: str, + twid: str, + uid: str, + ): confidence: float = 0.1 + ip_identification = self.db.get_ip_identification(daddr) description = ( - f"Malicious flow by ML. Src IP" - f" {flow['saddr']}:{flow['sport']} to " - f"{flow['daddr']}:{flow['dport']}" + f"Malicious flow by ML. Src IP {saddr}:{sport} to " + f"{daddr}:{dport} {ip_identification}" ) timestamp = utils.convert_format( datetime.datetime.now(), utils.alerts_format ) - twid_number = int(twid.replace("timewindow", "")) + evidence: Evidence = Evidence( evidence_type=EvidenceType.MALICIOUS_FLOW, attacker=Attacker( - direction=Direction.SRC, - attacker_type=IoCType.IP, - value=flow["saddr"], - ), - victim=Victim( - direction=Direction.DST, - victim_type=IoCType.IP, - value=flow["daddr"], + direction=Direction.SRC, attacker_type=IoCType.IP, value=saddr ), threat_level=ThreatLevel.LOW, confidence=confidence, description=description, - profile=ProfileID(ip=flow["saddr"]), - timewindow=TimeWindow(twid_number), - uid=[flow["uid"]], + profile=ProfileID(ip=saddr), + timewindow=TimeWindow(number=int(twid.replace("timewindow", ""))), + uid=[uid], timestamp=timestamp, - method=Method.AI, - src_port=flow["sport"], - dst_port=flow["dport"], + category=IDEACategory.ANOMALY_TRAFFIC, ) self.db.set_evidence(evidence) @@ -441,22 +567,20 @@ def pre_main(self): def main(self): if msg := self.get_msg("new_flow"): - msg = json.loads(msg["data"]) - twid = msg["twid"] - self.flow = msg["flow"] - # these fields are expected in testing. update the original - # flow dict to have them - self.flow.update( - { - "allbytes": (self.flow["sbytes"] + self.flow["dbytes"]), - # the flow["state"] is the origstate, we dont need that here - # we need the interpreted state - "state": msg["interpreted_state"], - "pkts": self.flow["spkts"] + self.flow["dpkts"], - "label": msg["label"], - "module_labels": msg["module_labels"], - } - ) + data = msg["data"] + # Convert from json to dict + data = json.loads(data) + profileid = data["profileid"] + twid = data["twid"] + # Get flow that is now in json format + flow = data["flow"] + # Convert flow to a dict + flow = json.loads(flow) + # Convert the common fields to something that can + # be interpreted + # Get the uid which is the key + uid = next(iter(flow)) + self.flow_dict = json.loads(flow[uid]) if self.mode == "train": # We are training @@ -469,57 +593,51 @@ def main(self): sum_labeled_flows >= self.minimum_lables_to_retrain and sum_labeled_flows % self.minimum_lables_to_retrain == 1 ): - # We get here every 'self.minimum_lables_to_retrain' - # amount of labels - # So for example we retrain every 100 labels and only when - # we have at least 100 labels + # We get here every 'self.minimum_lables_to_retrain' amount of labels + # So for example we retrain every 100 labels and only when we have at least 100 labels self.print( - f"Training the model with the last group of " - f"flows and labels. Total flows: {sum_labeled_flows}." + f"Training the model with the last group of flows and labels. 
Total flows: {sum_labeled_flows}." ) - # Process all flows in the DB and make them ready - # for pandas + # Process all flows in the DB and make them ready for pandas self.process_flows() # Train an algorithm self.train() elif self.mode == "test": # We are testing, which means using the model to detect - processed_flow = self.process_flow(self.flow) + self.process_flow() - # After processing the flow, it may happen that we - # delete icmp/arp/etc so the dataframe can be empty - if processed_flow is not None and not processed_flow.empty: + # After processing the flow, it may happen that we delete icmp/arp/etc + # so the dataframe can be empty + if self.flow is not None and not self.flow.empty: # Predict - pred: numpy.ndarray = self.detect(processed_flow) - if not pred: - # an error occurred - return + pred = self.detect() + label = self.flow_dict["label"] - label = self.flow["label"] + # Report if label and label != "unknown" and label != pred[0]: - # If the user specified a label in test mode, - # and the label is diff from the prediction, - # print in debug mode + # If the user specified a label in test mode, and the label + # is diff from the prediction, print in debug mode self.print( - f"Report Prediction {pred[0]} for label" - f' {label} flow {self.flow["saddr"]}:' - f'{self.flow["sport"]} ->' - f' {self.flow["daddr"]}:' - f'{self.flow["dport"]}/' - f'{self.flow["proto"]}', + f'Report Prediction {pred[0]} for label {label} flow {self.flow_dict["saddr"]}:' + f'{self.flow_dict["sport"]} -> {self.flow_dict["daddr"]}:' + f'{self.flow_dict["dport"]}/{self.flow_dict["proto"]}', 0, 3, ) if pred[0] == "Malware": # Generate an alert - self.set_evidence_malicious_flow(self.flow, twid) + self.set_evidence_malicious_flow( + self.flow_dict["saddr"], + self.flow_dict["sport"], + self.flow_dict["daddr"], + self.flow_dict["dport"], + twid, + uid, + ) self.print( - f"Prediction {pred[0]} for label {label}" - f' flow {self.flow["saddr"]}:' - f'{self.flow["sport"]} -> ' - f'{self.flow["daddr"]}:' - f'{self.flow["dport"]}/' - f'{self.flow["proto"]}', + f'Prediction {pred[0]} for label {label} flow {self.flow_dict["saddr"]}:' + f'{self.flow_dict["sport"]} -> {self.flow_dict["daddr"]}:' + f'{self.flow_dict["dport"]}/{self.flow_dict["proto"]}', 0, 2, ) From e5ee4b746411b114c9a96fc98aa97d130a75faee Mon Sep 17 00:00:00 2001 From: alya Date: Tue, 30 Jul 2024 14:35:21 +0300 Subject: [PATCH 178/498] state_handler: split get_final_state_from_flags() into smaller functions --- slips_files/common/state_handler.py | 67 ++++++++++++++++------------- 1 file changed, 38 insertions(+), 29 deletions(-) diff --git a/slips_files/common/state_handler.py b/slips_files/common/state_handler.py index d0a05115b..b671a09a2 100644 --- a/slips_files/common/state_handler.py +++ b/slips_files/common/state_handler.py @@ -1,7 +1,9 @@ from typing import Optional +import sys +import traceback -def interpret_suricata_states(state) -> Optional[str]: +def check_suricata_states(state) -> Optional[str]: """ There are different states in which a flow can be. Suricata distinguishes three flow-states for TCP and two for @@ -16,7 +18,7 @@ def interpret_suricata_states(state) -> Optional[str]: return "Not Established" -def interpret_zeek_states(state) -> Optional[str]: +def check_zeek_states(state) -> Optional[str]: # We have varius type of states depending on the type of flow. 
# For Zeek if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): @@ -25,13 +27,9 @@ def interpret_zeek_states(state) -> Optional[str]: return "Established" -def interpret_argus_states(state) -> Optional[str]: +def check_argus_states(state) -> Optional[str]: pre = state.split("_")[0] - try: - suf = state.split("_")[1] - except IndexError: - return - + suf = state.split("_")[1] if "S" in pre and "A" in pre and "S" in suf and "A" in suf: """ Examples: @@ -88,7 +86,7 @@ def interpret_argus_states(state) -> Optional[str]: return "Not Established" -def interpret_tcp_states(state, pkts) -> Optional[str]: +def check_tcp_states(state, pkts) -> Optional[str]: pre = state.split("_")[0] if "EST" in pre: # TCP @@ -124,7 +122,7 @@ def interpret_tcp_states(state, pkts) -> Optional[str]: return "Not Established" -def interpret_udp_states(state) -> Optional[str]: +def check_udp_states(state) -> Optional[str]: pre = state.split("_")[0] if "CON" in pre: # UDP @@ -136,7 +134,7 @@ def interpret_udp_states(state) -> Optional[str]: return "Not Established" -def interpret_icmp_states(state) -> Optional[str]: +def check_icmp_states(state) -> Optional[str]: pre = state.split("_")[0] if "ECO" in pre: # ICMP @@ -146,25 +144,36 @@ def interpret_icmp_states(state) -> Optional[str]: return "Established" -def get_final_state_from_flags(state, pkts) -> str: +def get_final_state_from_flags(self, state, pkts) -> str: """ - Converts the original flags from the flow, to a state that slips - understands - Works with Argus, suricata, and Bro flags - We receive the packets to distinguish some Reset connections + Analyze the flags given and return a summary of the state. + Should work with Argus and Bro flags + We receive the pakets to distinguish some Reset connections """ + try: + if state := check_suricata_states(state): + return state + if state := check_zeek_states(state): + return state + if state := check_argus_states(state): + return state + except IndexError: + # suf does not exist, which means that this is some ICMP or + # no response was sent for UDP or TCP + if state := check_icmp_states(state): + return state + if state := check_udp_states(state): + return state + if state := check_tcp_states(state, pkts): + return state - for interpreter in ( - interpret_suricata_states, - interpret_zeek_states, - interpret_argus_states, - interpret_icmp_states, - interpret_udp_states, - ): - if interpreted_state := interpreter(state): - return interpreted_state - - if interpreted_state := interpret_tcp_states(state, pkts): - return interpreted_state + return "Not Established" - return "Not Established" + except Exception: + exception_line = sys.exc_info()[2].tb_lineno + self.print( + f"Error in get_final_state_from_flags() " f"line {exception_line}", + 0, + 1, + ) + self.print(traceback.format_exc(), 0, 1) From 840822e5c232b4b3fefa206b99b331759ff2877d Mon Sep 17 00:00:00 2001 From: alya Date: Tue, 30 Jul 2024 14:48:16 +0300 Subject: [PATCH 179/498] state_handler: refactor get_final_state_from_flags() --- slips_files/common/state_handler.py | 67 +++++++++++++---------------- 1 file changed, 29 insertions(+), 38 deletions(-) diff --git a/slips_files/common/state_handler.py b/slips_files/common/state_handler.py index b671a09a2..d0a05115b 100644 --- a/slips_files/common/state_handler.py +++ b/slips_files/common/state_handler.py @@ -1,9 +1,7 @@ from typing import Optional -import sys -import traceback -def check_suricata_states(state) -> Optional[str]: +def interpret_suricata_states(state) -> Optional[str]: """ There are different 
states in which a flow can be. Suricata distinguishes three flow-states for TCP and two for @@ -18,7 +16,7 @@ def check_suricata_states(state) -> Optional[str]: return "Not Established" -def check_zeek_states(state) -> Optional[str]: +def interpret_zeek_states(state) -> Optional[str]: # We have varius type of states depending on the type of flow. # For Zeek if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): @@ -27,9 +25,13 @@ def check_zeek_states(state) -> Optional[str]: return "Established" -def check_argus_states(state) -> Optional[str]: +def interpret_argus_states(state) -> Optional[str]: pre = state.split("_")[0] - suf = state.split("_")[1] + try: + suf = state.split("_")[1] + except IndexError: + return + if "S" in pre and "A" in pre and "S" in suf and "A" in suf: """ Examples: @@ -86,7 +88,7 @@ def check_argus_states(state) -> Optional[str]: return "Not Established" -def check_tcp_states(state, pkts) -> Optional[str]: +def interpret_tcp_states(state, pkts) -> Optional[str]: pre = state.split("_")[0] if "EST" in pre: # TCP @@ -122,7 +124,7 @@ def check_tcp_states(state, pkts) -> Optional[str]: return "Not Established" -def check_udp_states(state) -> Optional[str]: +def interpret_udp_states(state) -> Optional[str]: pre = state.split("_")[0] if "CON" in pre: # UDP @@ -134,7 +136,7 @@ def check_udp_states(state) -> Optional[str]: return "Not Established" -def check_icmp_states(state) -> Optional[str]: +def interpret_icmp_states(state) -> Optional[str]: pre = state.split("_")[0] if "ECO" in pre: # ICMP @@ -144,36 +146,25 @@ def check_icmp_states(state) -> Optional[str]: return "Established" -def get_final_state_from_flags(self, state, pkts) -> str: +def get_final_state_from_flags(state, pkts) -> str: """ - Analyze the flags given and return a summary of the state. 
- Should work with Argus and Bro flags - We receive the pakets to distinguish some Reset connections + Converts the original flags from the flow, to a state that slips + understands + Works with Argus, suricata, and Bro flags + We receive the packets to distinguish some Reset connections """ - try: - if state := check_suricata_states(state): - return state - if state := check_zeek_states(state): - return state - if state := check_argus_states(state): - return state - except IndexError: - # suf does not exist, which means that this is some ICMP or - # no response was sent for UDP or TCP - if state := check_icmp_states(state): - return state - if state := check_udp_states(state): - return state - if state := check_tcp_states(state, pkts): - return state - return "Not Established" + for interpreter in ( + interpret_suricata_states, + interpret_zeek_states, + interpret_argus_states, + interpret_icmp_states, + interpret_udp_states, + ): + if interpreted_state := interpreter(state): + return interpreted_state + + if interpreted_state := interpret_tcp_states(state, pkts): + return interpreted_state - except Exception: - exception_line = sys.exc_info()[2].tb_lineno - self.print( - f"Error in get_final_state_from_flags() " f"line {exception_line}", - 0, - 1, - ) - self.print(traceback.format_exc(), 0, 1) + return "Not Established" From a30b45c3016d25e45c6038b66e25eb155c6a72c3 Mon Sep 17 00:00:00 2001 From: alya Date: Tue, 30 Jul 2024 14:59:22 +0300 Subject: [PATCH 180/498] delete get_final_state_from_flags() from flowmldetection, profiler, and the db --- modules/flowmldetection/flowmldetection.py | 169 +++------------------ 1 file changed, 19 insertions(+), 150 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 124ec61f9..c57a7a358 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -5,9 +5,13 @@ import json import datetime import traceback -import sys +import warnings + -from slips_files.common.imports import * +from slips_files.common.state_handler import get_final_state_from_flags +from slips_files.common.parsers.config_parser import ConfigParser +from slips_files.common.slips_utils import utils +from slips_files.common.abstracts.module import IModule from slips_files.core.evidence_structure.evidence import ( Evidence, ProfileID, @@ -29,8 +33,6 @@ def warn(*args, **kwargs): pass -import warnings - warnings.warn = warn @@ -113,141 +115,6 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) - - def get_final_state_from_flags(self, state, pkts): - """ - Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags - We receive the pakets to distinguish some Reset connections - """ - try: - pre = state.split("_")[0] - try: - # Try suricata states - """ - There are different states in which a flow can be. - Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, - these are: New, Established and Closed,for UDP only new and established. - For each of these states Suricata can employ different timeouts. - """ - if "new" in state or "established" in state: - return "Established" - elif "closed" in state: - return "Not Established" - - # We have varius type of states depending on the type of flow. 
-                # For Zeek
-                if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
-                    return "Not Established"
-                elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"):
-                    return "Established"
-
-                # For Argus
-                suf = state.split("_")[1]
-                if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
-                    """
-                    Examples:
-                    SA_SA
-                    SR_SA
-                    FSRA_SA
-                    SPA_SPA
-                    SRA_SPA
-                    FSA_FSA
-                    FSA_FSPA
-                    SAEC_SPA
-                    SRPA_SPA
-                    FSPA_SPA
-                    FSRPA_SPA
-                    FSPA_FSPA
-                    FSRA_FSPA
-                    SRAEC_SPA
-                    FSPA_FSRPA
-                    FSAEC_FSPA
-                    FSRPA_FSPA
-                    SRPAEC_SPA
-                    FSPAEC_FSPA
-                    SRPAEC_FSRPA
-                    """
-                    return "Established"
-                elif "PA" in pre and "PA" in suf:
-                    # Typical flow that was reported in the middle
-                    """
-                    Examples:
-                    PA_PA
-                    FPA_FPA
-                    """
-                    return "Established"
-                elif "ECO" in pre:
-                    return "ICMP Echo"
-                elif "ECR" in pre:
-                    return "ICMP Reply"
-                elif "URH" in pre:
-                    return "ICMP Host Unreachable"
-                elif "URP" in pre:
-                    return "ICMP Port Unreachable"
-                else:
-                    """
-                    Examples:
-                    S_RA
-                    S_R
-                    A_R
-                    S_SA
-                    SR_SA
-                    FA_FA
-                    SR_RA
-                    SEC_RA
-                    """
-                    return "Not Established"
-            except IndexError:
-                # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP
-                if "ECO" in pre:
-                    # ICMP
-                    return "Established"
-                elif "UNK" in pre:
-                    # ICMP6 unknown upper layer
-                    return "Established"
-                elif "CON" in pre:
-                    # UDP
-                    return "Established"
-                elif "INT" in pre:
-                    # UDP trying to connect, NOT precisely not established but also NOT 'Established'. So we consider it not established because there
-                    # is no confirmation of what happened.
-                    return "Not Established"
-                elif "EST" in pre:
-                    # TCP
-                    return "Established"
-                elif "RST" in pre:
-                    # TCP. When -z B is not used in argus, states are single words. Most connections are reset when finished and are therefore established.
-                    # It can happen that a connection is reset without being established, but we can't tell without -z b.
-                    # So we use the amount of packets as a heuristic: if <=3, it is not established, because the OS retries 3 times.
-                    return (
-                        "Not Established" if int(pkts) <= 3 else "Established"
-                    )
-                elif "FIN" in pre:
-                    # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN and are therefore established.
-                    # It can happen that a connection finishes without being established, but we can't tell without -z b.
-                    # So we use the amount of packets as a heuristic: if <=3, it is not established, because the OS retries 3 times.
- return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - else: - """ - Examples: - S_ - FA_ - PA_ - FSA_ - SEC_ - SRPA_ - """ - return "Not Established" - except Exception: - exception_line = sys.exc_info()[2].tb_lineno - self.print( - f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", - 0, - 1, - ) - self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ @@ -281,12 +148,17 @@ def process_features(self, dataset): except ValueError: pass - # When flows are read from Slips sqlite, the state is not transformed to 'Established' or 'Not Established', it is still 'S0' and others + # When flows are read from Slips sqlite, + # the state is not transformed to 'Established' or + # 'Not Established', it is still 'S0' and others # So transform here - #new_state_column = dataset.state.apply(self.get_final_state_from_flags(dataset.state, dataset.pkts)) - dataset['state'] = dataset.apply(lambda row: self.get_final_state_from_flags(row['state'], row['pkts']), axis=1) - - #dataset.state = new_state_column + dataset["state"] = dataset.apply( + lambda row: get_final_state_from_flags( + row["state"], row["pkts"] + ), + axis=1, + ) + # dataset.state = new_state_column # Convert state to categorical dataset.state = dataset.state.str.replace( @@ -370,7 +242,7 @@ def process_flows(self): try: # We get all the flows so far # because this retraining happens in batches - flows = self.db.get_all_flows() + flows: list = self.db.get_all_flows() # Check how many different labels are in the DB # We need both normal and malware @@ -464,7 +336,7 @@ def detect(self): """ try: # Store the real label if there is one - y_flow = self.flow["label"] + # y_flow = self.flow["label"] # remove the real label column self.flow = self.flow.drop("label", axis=1) # remove the label predictions column of the other modules @@ -568,13 +440,10 @@ def pre_main(self): def main(self): if msg := self.get_msg("new_flow"): data = msg["data"] - # Convert from json to dict data = json.loads(data) - profileid = data["profileid"] + # profileid = data["profileid"] twid = data["twid"] - # Get flow that is now in json format flow = data["flow"] - # Convert flow to a dict flow = json.loads(flow) # Convert the common fields to something that can # be interpreted From bf4c8cf95ca6cfa2d28ca270560e9001fd6f127c Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Mon, 29 Jul 2024 16:36:55 +0200 Subject: [PATCH 181/498] mlflow. 
Ignore UID column --- modules/flowmldetection/flowmldetection.py | 1 + 1 file changed, 1 insertion(+) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index c57a7a358..e2aa1e0ee 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -141,6 +141,7 @@ def process_features(self, dataset): "dpkts", "smac", "dmac", + "uid", ] for field in to_drop: try: From 59a109713f00126acf7633e9435156c49b5ec580 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Sat, 15 Mar 2025 19:23:29 +0100 Subject: [PATCH 182/498] Re add function that alya added --- slips_files/core/database/database_manager.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/slips_files/core/database/database_manager.py b/slips_files/core/database/database_manager.py index b4b2128d3..e8ca3aaf6 100644 --- a/slips_files/core/database/database_manager.py +++ b/slips_files/core/database/database_manager.py @@ -613,6 +613,9 @@ def add_out_dns(self, *args, **kwargs): def add_port(self, *args, **kwargs): return self.rdb.add_port(*args, **kwargs) + def get_final_state_from_flags(self, *args, **kwargs): + return self.rdb.get_final_state_from_flags(*args, **kwargs) + def add_ips(self, *args, **kwargs): return self.rdb.add_ips(*args, **kwargs) From bb582a55c2a8460bcf408204dc175207b2499682 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Sat, 15 Mar 2025 19:27:23 +0100 Subject: [PATCH 183/498] Delete file that was deleted from develop --- slips_files/common/state_handler.py | 170 ---------------------------- 1 file changed, 170 deletions(-) delete mode 100644 slips_files/common/state_handler.py diff --git a/slips_files/common/state_handler.py b/slips_files/common/state_handler.py deleted file mode 100644 index d0a05115b..000000000 --- a/slips_files/common/state_handler.py +++ /dev/null @@ -1,170 +0,0 @@ -from typing import Optional - - -def interpret_suricata_states(state) -> Optional[str]: - """ - There are different states in which a flow can be. - Suricata distinguishes three flow-states for TCP and two for - UDP. For TCP, - these are: New, Established and Closed,for UDP only new and - established. - For each of these states Suricata can employ different timeouts. - """ - if "new" in state or "established" in state: - return "Established" - elif "closed" in state: - return "Not Established" - - -def interpret_zeek_states(state) -> Optional[str]: - # We have varius type of states depending on the type of flow. 
- # For Zeek - if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): - return "Not Established" - elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): - return "Established" - - -def interpret_argus_states(state) -> Optional[str]: - pre = state.split("_")[0] - try: - suf = state.split("_")[1] - except IndexError: - return - - if "S" in pre and "A" in pre and "S" in suf and "A" in suf: - """ - Examples: - SA_SA - SR_SA - FSRA_SA - SPA_SPA - SRA_SPA - FSA_FSA - FSA_FSPA - SAEC_SPA - SRPA_SPA - FSPA_SPA - FSRPA_SPA - FSPA_FSPA - FSRA_FSPA - SRAEC_SPA - FSPA_FSRPA - FSAEC_FSPA - FSRPA_FSPA - SRPAEC_SPA - FSPAEC_FSPA - SRPAEC_FSRPA - """ - return "Established" - elif "PA" in pre and "PA" in suf: - # Tipical flow that was reported in the middle - """ - Examples: - PA_PA - FPA_FPA - """ - return "Established" - elif "ECO" in pre: - return "ICMP Echo" - elif "ECR" in pre: - return "ICMP Reply" - elif "URH" in pre: - return "ICMP Host Unreachable" - elif "URP" in pre: - return "ICMP Port Unreachable" - else: - """ - Examples: - S_RA - S_R - A_R - S_SA - SR_SA - FA_FA - SR_RA - SEC_RA - """ - return "Not Established" - - -def interpret_tcp_states(state, pkts) -> Optional[str]: - pre = state.split("_")[0] - if "EST" in pre: - # TCP - return "Established" - elif "RST" in pre: - # TCP. When -z B is not used in argus, states are single words. - # Most connections are reseted when finished and therefore are - # established - # It can happen that is reseted being not established, but we - # can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is - # not established because the OS retries 3 times. - return "Not Established" if int(pkts) <= 3 else "Established" - elif "FIN" in pre: - # TCP. When -z B is not used in argus, states are single words. - # Most connections are finished with FIN when finished and - # therefore are established - # It can happen that is finished being not established, but we - # can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is - # not established because the OS retries 3 times. - return "Not Established" if int(pkts) <= 3 else "Established" - else: - """ - Examples: - S_ - FA_ - PA_ - FSA_ - SEC_ - SRPA_ - """ - return "Not Established" - - -def interpret_udp_states(state) -> Optional[str]: - pre = state.split("_")[0] - if "CON" in pre: - # UDP - return "Established" - elif "INT" in pre: - # UDP trying to connect, NOT preciselly not established but also - # NOT 'Established'. So we considered not established because there - # is no confirmation of what happened. 
- return "Not Established" - - -def interpret_icmp_states(state) -> Optional[str]: - pre = state.split("_")[0] - if "ECO" in pre: - # ICMP - return "Established" - elif "UNK" in pre: - # ICMP6 unknown upper layer - return "Established" - - -def get_final_state_from_flags(state, pkts) -> str: - """ - Converts the original flags from the flow, to a state that slips - understands - Works with Argus, suricata, and Bro flags - We receive the packets to distinguish some Reset connections - """ - - for interpreter in ( - interpret_suricata_states, - interpret_zeek_states, - interpret_argus_states, - interpret_icmp_states, - interpret_udp_states, - ): - if interpreted_state := interpreter(state): - return interpreted_state - - if interpreted_state := interpret_tcp_states(state, pkts): - return interpreted_state - - return "Not Established" From b586ac78776b01465a9476771ccec69b3df635c3 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Sat, 15 Mar 2025 19:32:01 +0100 Subject: [PATCH 184/498] Flowmldetection. Fix missing db reference --- modules/flowmldetection/flowmldetection.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index e2aa1e0ee..9269b6701 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -154,7 +154,7 @@ def process_features(self, dataset): # 'Not Established', it is still 'S0' and others # So transform here dataset["state"] = dataset.apply( - lambda row: get_final_state_from_flags( + lambda row: self.db.get_final_state_from_flags( row["state"], row["pkts"] ), axis=1, From 5ccc0dd3da3eb9f31c5b4a2ab5dbdf89e9b32898 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Tue, 18 Mar 2025 12:08:08 +0100 Subject: [PATCH 185/498] Fix the training of flows with ML in new version --- modules/flowmldetection/flowmldetection.py | 378 +++++++++++---------- 1 file changed, 197 insertions(+), 181 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 9269b6701..e6ea0b517 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -1,18 +1,20 @@ +# SPDX-FileCopyrightText: 2021 Sebastian Garcia +from typing import Optional + +# SPDX-License-Identifier: GPL-2.0-only +import numpy from sklearn.linear_model import SGDClassifier from sklearn.preprocessing import StandardScaler import pickle import pandas as pd import json -import datetime import traceback import warnings - -from slips_files.common.state_handler import get_final_state_from_flags from slips_files.common.parsers.config_parser import ConfigParser from slips_files.common.slips_utils import utils from slips_files.common.abstracts.module import IModule -from slips_files.core.evidence_structure.evidence import ( +from slips_files.core.structures.evidence import ( Evidence, ProfileID, TimeWindow, @@ -21,7 +23,8 @@ EvidenceType, IoCType, Direction, - IDEACategory, + Victim, + Method, ) # Only for debbuging @@ -52,36 +55,41 @@ def init(self): # Set the output queue of our database instance # Read the configuration self.read_configuration() - # Minum amount of new lables needed to trigger the train - self.minimum_lables_to_retrain = 50 + # Minum amount of new labels needed to start the train + self.minimum_labels_to_start_train = 50 + # Minum amount of new labels needed to retrain + self.minimum_labels_to_retrain = 50 + # The number of flows when last trained + 
self.last_number_of_flows_when_trained = 0 # To plot the scores of training # self.scores = [] # The scaler trained during training and to use during testing self.scaler = StandardScaler() + self.model_path = "./modules/flowmldetection/model.bin" + self.scaler_path = "./modules/flowmldetection/scaler.bin" def read_configuration(self): conf = ConfigParser() self.mode = conf.get_ml_mode() + self.label = conf.label() def train(self): """ Train a model based on the flows we receive and the labels """ try: - # Process the labels to have only Normal and Malware - self.flows.label = self.flows.label.str.replace( - r"(^.*ormal.*$)", "Normal", regex=True - ) - self.flows.label = self.flows.label.str.replace( - r"(^.*alware.*$)", "Malware", regex=True - ) - self.flows.label = self.flows.label.str.replace( - r"(^.*alicious.*$)", "Malware", regex=True - ) + # Get the flows from the DB + # self.flows = self.db.get_all_flows_in_profileid_twid(self.profileid, self.twid) + # Convert to pandas df + # self.flows = pd.DataFrame(self.flows) + # Process the features + # X_flow = self.process_features(self.flows) - # Separate - y_flow = self.flows["label"] + # Create X_flow with the current flows minus the label X_flow = self.flows.drop("label", axis=1) + # Create y_flow with the label + y_flow = numpy.full(X_flow.shape[0], self.label) + # Drop the module_labels X_flow = X_flow.drop("module_labels", axis=1) # Normalize this batch of data so far. This can get progressivle slow @@ -90,7 +98,7 @@ def train(self): # Train try: self.clf.partial_fit( - X_flow, y_flow, classes=["Malware", "Normal"] + X_flow, y_flow, classes=["Malicious", "Benign"] ) except Exception: self.print("Error while calling clf.train()") @@ -113,7 +121,7 @@ def train(self): self.store_model() except Exception: - self.print("Error in train()", 0, 1) + self.print("Error in train().", 0, 1) self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): @@ -123,7 +131,7 @@ def process_features(self, dataset): """ try: # Discard some type of flows that dont have ports - to_discard = ["arp", "ARP", "icmp", "igmp", "ipv6-icmp"] + to_discard = ["arp", "ARP", "icmp", "igmp", "ipv6-icmp", ""] for proto in to_discard: dataset = dataset[dataset.proto != proto] @@ -132,21 +140,20 @@ def process_features(self, dataset): "appproto", "daddr", "saddr", - "ts", - "origstate", + "starttime", "type_", - "dir_", - "history", - "dbytes", - "dpkts", "smac", "dmac", + "history", "uid", + "dir_", + "endtime", + "flow_source", ] for field in to_drop: try: dataset = dataset.drop(field, axis=1) - except ValueError: + except (ValueError, KeyError): pass # When flows are read from Slips sqlite, @@ -155,11 +162,10 @@ def process_features(self, dataset): # So transform here dataset["state"] = dataset.apply( lambda row: self.db.get_final_state_from_flags( - row["state"], row["pkts"] + row["state"], (row["spkts"] + row["dpkts"]) ), axis=1, ) - # dataset.state = new_state_column # Convert state to categorical dataset.state = dataset.state.str.replace( @@ -193,58 +199,42 @@ def process_features(self, dataset): dataset.proto = dataset.proto.str.replace( r"(^.*arp.*$)", "4", regex=True ) - dataset.proto = dataset.proto.astype("float64") - try: - # Convert dport to float - dataset.dport = dataset.dport.astype("float") - except ValueError: - pass - try: - # Convert sport to float - dataset.sport = dataset.sport.astype("float") - except ValueError: - pass - try: - # Convert Dur to float - dataset.dur = dataset.dur.astype("float") - except ValueError: - pass - try: - # 
Convert TotPkts to float - dataset.pkts = dataset.pkts.astype("float") - except ValueError: - pass - try: - # Convert SrcPkts to float - dataset.spkts = dataset.spkts.astype("float") - except ValueError: - pass - try: - # Convert TotBytes to float - dataset.allbytes = dataset.allbytes.astype("float") - except ValueError: - pass - try: - # Convert SrcBytes to float - dataset.sbytes = dataset.sbytes.astype("float") - except ValueError: - pass + + dataset["allbytes"] = dataset["sbytes"] + dataset["dbytes"] + dataset["pkts"] = dataset["spkts"] + dataset["dpkts"] + + fields_to_convert_to_float = [ + dataset.proto, + dataset.dport, + dataset.sport, + dataset.dur, + dataset.pkts, + dataset.spkts, + dataset.allbytes, + dataset.sbytes, + dataset.state, + ] + for field in fields_to_convert_to_float: + try: + field = field.astype("float64") + except (ValueError, AttributeError): + pass + return dataset except Exception: # Stop the timer self.print("Error in process_features()") self.print(traceback.format_exc(), 0, 1) - def process_flows(self): + def process_training_flows(self): """ - Process all the flwos in the DB + Process all the flows in the DB Store the pandas df in self.flows """ try: # We get all the flows so far # because this retraining happens in batches - flows: list = self.db.get_all_flows() - + flows = self.db.get_all_flows() # Check how many different labels are in the DB # We need both normal and malware labels = self.db.get_labels() @@ -254,48 +244,48 @@ def process_flows(self): # that are fake but representative of a normal and malware flow # they are only for the training process # At least 1 flow of each label is required - # self.print(f'Amount of labeled flows: {labels}', 0, 1) + + # These flows should be in the same format as the ones in the DB. + # Which means the satate is still SF, S0, etc. flows.append( { - "ts": 1594417039.029793, + "starttime": 1594417039.029793, "dur": "1.9424750804901123", "saddr": "10.7.10.101", "sport": "49733", "daddr": "40.70.224.145", "dport": "443", "proto": "tcp", - "origstate": "SRPA_SPA", - "state": "Established", - "pkts": 84, - "allbytes": 42764, - "spkts": 37, + "state": "SF", + "spkts": 17, + "dpkts": 27, "sbytes": 25517, + "dbytes": 17247, "appproto": "ssl", - "label": "Malware", + "label": "Malicious", "module_labels": { - "flowalerts-long-connection": "Malware" + "flowalerts-long-connection": "Malicious" }, } ) flows.append( { - "ts": 1382355032.706468, + "starttime": 1382355032.706468, "dur": "10.896695", "saddr": "147.32.83.52", "sport": "47956", "daddr": "80.242.138.72", "dport": "80", "proto": "tcp", - "origstate": "SRPA_SPA", - "state": "Established", - "pkts": 67, - "allbytes": 67696, + "state": "SF", "spkts": 1, + "dpkts": 0, "sbytes": 100, + "dbytes": 67596, "appproto": "http", - "label": "Normal", + "label": "Benign", "module_labels": { - "flowalerts-long-connection": "Normal" + "flowalerts-long-connection": "Benign" }, } ) @@ -314,42 +304,51 @@ def process_flows(self): self.print("Error in process_flows()") self.print(traceback.format_exc(), 0, 1) - def process_flow(self): + def process_flow(self, flow_to_process: dict): """ Process one flow. 
Only used during detection in testing - Store the pandas df in self.flow + returns the pandas df with the processed flow """ try: # Convert the flow to a pandas dataframe - raw_flow = pd.DataFrame(self.flow_dict, index=[0]) - # Process features + raw_flow = pd.DataFrame(flow_to_process, index=[0]) dflow = self.process_features(raw_flow) # Update the flow to the processed version - self.flow = dflow + return dflow except Exception: # Stop the timer self.print("Error in process_flow()") self.print(traceback.format_exc(), 0, 1) - def detect(self): + def detect(self, x_flow) -> Optional[numpy.ndarray]: """ - Detect this flow with the current model stored + Detects the given flow with the current model stored + and returns the predection array """ try: - # Store the real label if there is one - # y_flow = self.flow["label"] - # remove the real label column - self.flow = self.flow.drop("label", axis=1) - # remove the label predictions column of the other modules - X_flow = self.flow.drop("module_labels", axis=1) + # clean the flow + fields_to_drop = [ + "label", + "module_labels", + "uid", + "history", + "dir_", + "endtime", + "flow_source", + ] + for field in fields_to_drop: + try: + x_flow = x_flow.drop(field, axis=1) + except (KeyError, ValueError): + pass # Scale the flow - X_flow = self.scaler.transform(X_flow) - pred = self.clf.predict(X_flow) + x_flow: numpy.ndarray = self.scaler.transform(x_flow) + pred: numpy.ndarray = self.clf.predict(x_flow) return pred - except Exception: - # Stop the timer - self.print("Error in detect() X_flow:") - self.print(X_flow) + except Exception as e: + self.print( + f"Error in detect() while processing " f"\n{x_flow}\n{e}" + ) self.print(traceback.format_exc(), 0, 1) def store_model(self): @@ -357,10 +356,10 @@ def store_model(self): Store the trained model on disk """ self.print("Storing the trained model and scaler on disk.", 0, 2) - with open("./modules/flowmldetection/model.bin", "wb") as f: + with open(self.model_path, "wb") as f: data = pickle.dumps(self.clf) f.write(data) - with open("./modules/flowmldetection/scaler.bin", "wb") as g: + with open(self.scaler_path, "wb") as g: data = pickle.dumps(self.scaler) g.write(data) @@ -370,20 +369,23 @@ def read_model(self): """ try: self.print("Reading the trained model from disk.", 0, 2) - with open("./modules/flowmldetection/model.bin", "rb") as f: + with open(self.model_path, "rb") as f: self.clf = pickle.load(f) self.print("Reading the trained scaler from disk.", 0, 2) - with open("./modules/flowmldetection/scaler.bin", "rb") as g: + with open(self.scaler_path, "rb") as g: self.scaler = pickle.load(g) except FileNotFoundError: # If there is no model, create one empty - self.print("There was no model. Creating a new empty model.", 0, 2) + self.print( + "There was no model. " "Creating a new empty model.", 0, 2 + ) self.clf = SGDClassifier( warm_start=True, loss="hinge", penalty="l1" ) except EOFError: self.print( - "Error reading model from disk. Creating a new empty model.", + "Error reading model from disk. " + "Creating a new empty model.", 0, 2, ) @@ -391,39 +393,36 @@ def read_model(self): warm_start=True, loss="hinge", penalty="l1" ) - def set_evidence_malicious_flow( - self, - saddr: str, - sport: str, - daddr: str, - dport: str, - twid: str, - uid: str, - ): + def set_evidence_malicious_flow(self, flow: dict, twid: str): confidence: float = 0.1 - ip_identification = self.db.get_ip_identification(daddr) description = ( - f"Malicious flow by ML. 
Src IP {saddr}:{sport} to " - f"{daddr}:{dport} {ip_identification}" - ) - - timestamp = utils.convert_format( - datetime.datetime.now(), utils.alerts_format + f"Flow with malicious characteristics by ML. Src IP" + f" {flow['saddr']}:{flow['sport']} to " + f"{flow['daddr']}:{flow['dport']}" ) - + twid_number = int(twid.replace("timewindow", "")) evidence: Evidence = Evidence( evidence_type=EvidenceType.MALICIOUS_FLOW, attacker=Attacker( - direction=Direction.SRC, attacker_type=IoCType.IP, value=saddr + direction=Direction.SRC, + ioc_type=IoCType.IP, + value=flow["saddr"], + ), + victim=Victim( + direction=Direction.DST, + ioc_type=IoCType.IP, + value=flow["daddr"], ), threat_level=ThreatLevel.LOW, confidence=confidence, description=description, - profile=ProfileID(ip=saddr), - timewindow=TimeWindow(number=int(twid.replace("timewindow", ""))), - uid=[uid], - timestamp=timestamp, - category=IDEACategory.ANOMALY_TRAFFIC, + profile=ProfileID(ip=flow["saddr"]), + timewindow=TimeWindow(twid_number), + uid=[flow["uid"]], + timestamp=flow["starttime"], + method=Method.AI, + src_port=flow["sport"], + dst_port=flow["dport"], ) self.db.set_evidence(evidence) @@ -440,17 +439,20 @@ def pre_main(self): def main(self): if msg := self.get_msg("new_flow"): - data = msg["data"] - data = json.loads(data) - # profileid = data["profileid"] - twid = data["twid"] - flow = data["flow"] - flow = json.loads(flow) - # Convert the common fields to something that can - # be interpreted - # Get the uid which is the key - uid = next(iter(flow)) - self.flow_dict = json.loads(flow[uid]) + # When a new flow arrives + msg = json.loads(msg["data"]) + self.twid = msg["twid"] + self.profileid = msg["profileid"] + self.flow = msg["flow"] + # These following extra fields are expected in testing. update the original + # flow dict to have them + self.flow.update( + { + "state": msg["interpreted_state"], + "label": msg["label"], + "module_labels": msg["module_labels"], + } + ) if self.mode == "train": # We are training @@ -459,55 +461,69 @@ def main(self): # Use labeled flows labels = self.db.get_labels() sum_labeled_flows = sum(i[1] for i in labels) + + # The min labels to retrain is the min number of flows + # we should have seen so far in this capture to start training + # This is so we dont _start_ training with only 1 flow + + # Once we are over the start minimum, the second condition is + # to force to retrain every a minimum_labels_to_retrain number + # of flows. So we dont retrain every 1 flow. if ( - sum_labeled_flows >= self.minimum_lables_to_retrain - and sum_labeled_flows % self.minimum_lables_to_retrain == 1 + sum_labeled_flows >= self.minimum_labels_to_start_train ): - # We get here every 'self.minimum_lables_to_retrain' amount of labels - # So for example we retrain every 100 labels and only when we have at least 100 labels - self.print( - f"Training the model with the last group of flows and labels. Total flows: {sum_labeled_flows}." - ) - # Process all flows in the DB and make them ready for pandas - self.process_flows() - # Train an algorithm - self.train() + if (sum_labeled_flows - self.last_number_of_flows_when_trained >= self.minimum_labels_to_retrain): + # So for example we retrain every 50 labels and only when + # we have at least 50 labels + self.print( + f"Training the model with the last group of " + f"flows and labels. Total flows: {sum_labeled_flows}." 
+                    )
+                    # Process all flows in the DB and make them ready
+                    # for pandas
+                    self.process_training_flows()
+                    # Train an algorithm
+                    self.train()
+                    self.last_number_of_flows_when_trained = sum_labeled_flows
+
         elif self.mode == "test":
             # We are testing, which means using the model to detect
-            self.process_flow()
+            processed_flow = self.process_flow(self.flow)

-            # After processing the flow, it may happen that we delete icmp/arp/etc
-            # so the dataframe can be empty
-            if self.flow is not None and not self.flow.empty:
+            # After processing the flow, it may happen that we
+            # delete icmp/arp/etc so the dataframe can be empty
+            if processed_flow is not None and not processed_flow.empty:
                 # Predict
-                pred = self.detect()
-                label = self.flow_dict["label"]
+                pred: numpy.ndarray = self.detect(processed_flow)
+                if not pred:
+                    # an error occurred
+                    return

-                # Report
+                label = self.flow["label"]
                 if label and label != "unknown" and label != pred[0]:
-                    # If the user specified a label in test mode, and the label
-                    # is diff from the prediction, print in debug mode
+                    # If the user specified a label in test mode,
+                    # and the label is diff from the prediction,
+                    # print in debug mode
                     self.print(
-                        f'Report Prediction {pred[0]} for label {label} flow {self.flow_dict["saddr"]}:'
-                        f'{self.flow_dict["sport"]} -> {self.flow_dict["daddr"]}:'
-                        f'{self.flow_dict["dport"]}/{self.flow_dict["proto"]}',
+                        f"Predicted {pred[0]} for ground-truth label"
+                        f' {label}. Flow {self.flow["saddr"]}:'
+                        f'{self.flow["sport"]} ->'
+                        f' {self.flow["daddr"]}:'
+                        f'{self.flow["dport"]}/'
+                        f'{self.flow["proto"]}',
                         0,
                         3,
                     )
-                if pred[0] == "Malware":
+                if pred[0] == "Malicious":
                     # Generate an alert
-                    self.set_evidence_malicious_flow(
-                        self.flow_dict["saddr"],
-                        self.flow_dict["sport"],
-                        self.flow_dict["daddr"],
-                        self.flow_dict["dport"],
-                        twid,
-                        uid,
-                    )
+                    self.set_evidence_malicious_flow(self.flow, self.twid)
                     self.print(
-                        f'Prediction {pred[0]} for label {label} flow {self.flow_dict["saddr"]}:'
-                        f'{self.flow_dict["sport"]} -> {self.flow_dict["daddr"]}:'
-                        f'{self.flow_dict["dport"]}/{self.flow_dict["proto"]}',
+                        f"Prediction {pred[0]} for label {label}"
+                        f' flow {self.flow["saddr"]}:'
+                        f'{self.flow["sport"]} -> '
+                        f'{self.flow["daddr"]}:'
+                        f'{self.flow["dport"]}/'
+                        f'{self.flow["proto"]}',
                         0,
                         2,
                     )

From fe91a3c6a427b86f3957864dcdea67a52b7a861d Mon Sep 17 00:00:00 2001
From: Sebas Garcia
Date: Wed, 19 Mar 2025 14:22:38 +0100
Subject: [PATCH 186/498] flowml. If the dataset has one flow and that is
 deleted, then return empty fast.

---
 modules/flowmldetection/flowmldetection.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index e6ea0b517..0fa1e4d76 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -135,6 +135,11 @@ def process_features(self, dataset):
             for proto in to_discard:
                 dataset = dataset[dataset.proto != proto]

+            # If the proto is in the list to delete and there is only one flow, then the dataset will be empty
+            if dataset.empty:
+                # DataFrame is empty now, so return empty
+                return dataset
+
             # For now, discard these
             to_drop = [
                 "appproto",
                 "daddr",
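
A note on the guard added above: when a batch holds a single flow and its protocol is in the discard list, every row is filtered out and the resulting DataFrame is empty, so the early return keeps zero rows from reaching the scaler and the classifier. A minimal sketch of the failure mode, assuming a one-row pandas DataFrame (the column names here are illustrative, not the module's full feature set):

import pandas as pd

# A batch holding a single ARP flow.
batch = pd.DataFrame([{"proto": "arp", "sport": 0, "dport": 0}])

# The same protocol filter used in process_features().
for proto in ["arp", "ARP", "icmp", "igmp", "ipv6-icmp", ""]:
    batch = batch[batch.proto != proto]

# Every row was discarded; without the guard this empty frame would
# flow into scaler.transform() / clf.predict(), which fail on zero rows.
assert batch.empty
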
From 31d8b921d59719a665de7b0195eeac37e2ad7d81 Mon Sep 17 00:00:00 2001
From: Sebas Garcia
Date: Wed, 19 Mar 2025 14:23:05 +0100
Subject: [PATCH 187/498] flowml. If the dataset is empty. Return None

---
 modules/flowmldetection/flowmldetection.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 0fa1e4d76..5c5f9943f 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -318,6 +318,8 @@ def process_flow(self, flow_to_process: dict):
             # Convert the flow to a pandas dataframe
             raw_flow = pd.DataFrame(flow_to_process, index=[0])
             dflow = self.process_features(raw_flow)
+            if dflow.empty:
+                return None
             # Update the flow to the processed version
             return dflow
         except Exception:

From 689b570abe330277d9af665e0d99b6ae2354d384 Mon Sep 17 00:00:00 2001
From: Sebas Garcia
Date: Wed, 19 Mar 2025 14:27:16 +0100
Subject: [PATCH 188/498] First new version of the model and scaler. Not good
 yet, but working.

---
 modules/flowmldetection/model.bin  | Bin 1124 -> 1090 bytes
 modules/flowmldetection/scaler.bin | Bin 890 -> 890 bytes
 2 files changed, 0 insertions(+), 0 deletions(-)

From 847d98fad712a48a85fee8e992738cf72b2a425f Mon Sep 17 00:00:00 2001
From: Sebas Garcia
Date: Thu, 20 Mar 2025 13:16:06 +0100
Subject: [PATCH 189/498] model and scaler with 1 malicious and 1 benign

---
 modules/flowmldetection/model.bin  | Bin 1090 -> 1124 bytes
 modules/flowmldetection/scaler.bin | Bin 890 -> 890 bytes
 2 files changed, 0 insertions(+), 0 deletions(-)

From 5e3284b85b7075746e2fc02f93a8c981aa314207 Mon Sep 17 00:00:00 2001
From: Sebas Garcia
Date: Thu, 20 Mar 2025 13:16:27 +0100
Subject: [PATCH 190/498] cleaner jupyter

---
 modules/flowmldetection/flowmldetection.py | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 5c5f9943f..fe950ed4b 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -343,6 +343,23 @@ def detect(self, x_flow) -> Optional[numpy.ndarray]:
             "endtime",
             "flow_source",
         ]
+        # For argus binetflows this fails because there is a field called bytes that was not in other flows. It should be called allbytes.
+        # Error
+        '''  [Flow ML Detection] Error in detect() while processing
+                dur  proto  sport  dport  state  pkts  spkts  dpkts  bytes  sbytes  dbytes  allbytes
+        0  63.822830      0  56119    981    0.0    15     15      0   8764    1887       0      1887
+        The feature names should match those that were passed during fit.
+        Feature names unseen at fit time:
+        - bytes
+        '''
+
+        # IF we delete the field bytes here, the error is
+        # [Flow ML Detection] Error in detect() while processing
+        #    dur  proto  sport  dport  state  pkts  spkts  dpkts  sbytes  dbytes  allbytes
+        # 0  63.822830      0  56120    980    0.0    15     15      0    1887       0      1887
+        # The feature names should match those that were passed during fit.
+        # Feature names must be in the same order as they were in fit.
+
         for field in fields_to_drop:
             try:
                 x_flow = x_flow.drop(field, axis=1)

From fba965a9409ff15bbb4ed677fe658f85c1b1b02a Mon Sep 17 00:00:00 2001
From: Sebas Garcia
Date: Thu, 20 Mar 2025 22:26:27 +0100
Subject: [PATCH 191/498] New models after 3rd train

---
 modules/flowmldetection/model.bin  | Bin 1124 -> 1124 bytes
 modules/flowmldetection/scaler.bin | Bin 890 -> 890 bytes
 2 files changed, 0 insertions(+), 0 deletions(-)

From da5d1875a5f4ce9ec016e5cfa8f41e31ed5862b5 Mon Sep 17 00:00:00 2001
From: Sebas Garcia
Date: Wed, 26 Mar 2025 00:08:50 +0100
Subject: [PATCH 192/498] Models after 4th train

---
 modules/flowmldetection/model.bin  | Bin 1124 -> 1124 bytes
 modules/flowmldetection/scaler.bin | Bin 890 -> 890 bytes
 2 files changed, 0 insertions(+), 0 deletions(-)

From 844a04314f76516c8ec2afaf8c3cc040955c62a2 Mon Sep 17 00:00:00 2001
From: Sebas Garcia
Date: Wed, 26 Mar 2025 08:28:59 +0100
Subject: [PATCH 193/498] Models of ml flow with the first good performance in
 small tests

---
 modules/flowmldetection/model.bin  | Bin 1124 -> 1124 bytes
 modules/flowmldetection/scaler.bin | Bin 890 -> 890 bytes
 2 files changed, 0 insertions(+), 0 deletions(-)
From 70c222ea6b8661e903dfc4ae93855d8ee2614ca5 Mon Sep 17 00:00:00 2001
From: Seba Garcia
Date: Sat, 3 May 2025 16:49:23 +0000
Subject: [PATCH 194/498] Add plot for flowml train scores

---
 modules/flowmldetection/plot_train_score.py | 56 +++++++++++++++++++++
 1 file changed, 56 insertions(+)
 create mode 100644 modules/flowmldetection/plot_train_score.py

diff --git a/modules/flowmldetection/plot_train_score.py b/modules/flowmldetection/plot_train_score.py
new file mode 100644
index 000000000..0b5b5b72b
--- /dev/null
+++ b/modules/flowmldetection/plot_train_score.py
@@ -0,0 +1,56 @@
+import pandas as pd
+import matplotlib.pyplot as plt
+import re
+import sys
+
+def plot_log_data(file_path):
+    # Read the log data from the file
+    with open(file_path, 'r') as file:
+        log_data = file.read()
+
+    # Define regex pattern to extract relevant data from each line
+    pattern = r"Background: (\d+). Benign: (\d+). Malicious: (\d+). Total labels: (\d+\.\d+). Score: (\d+\.\d+)"
+
+    # Parse the log file
+    data = re.findall(pattern, log_data)
+
+    # Convert data to a DataFrame
+    df = pd.DataFrame(data, columns=["Background", "Benign", "Malicious", "Total labels", "Score"])
+    df = df.astype({
+        "Background": int,
+        "Benign": int,
+        "Malicious": int,
+        "Total labels": float,
+        "Score": float
+    })
+
+    # Plotting the values
+    fig, ax1 = plt.subplots(figsize=(10, 6))
+
+    # Plotting Score on the left y-axis
+    ax1.plot(df.index, df["Score"], label="Score", color='tab:blue')
+    ax1.set_xlabel('Index')
+    ax1.set_ylabel('Score', color='tab:blue')
+    ax1.tick_params(axis='y', labelcolor='tab:blue')
+
+    # Create the second y-axis for the Total labels
+    ax2 = ax1.twinx()
+    ax2.plot(df.index, df["Total labels"], label="Total labels", color='tab:red')
+    ax2.set_ylabel('Total labels', color='tab:red')
+    ax2.tick_params(axis='y', labelcolor='tab:red')
+
+    # Adding title and legend
+    plt.title('Log Data Visualization')
+    fig.tight_layout()
+
+    # Save plot to a PNG file
+    plt.savefig('log_data_plot_with_two_scales.png')
+
+    # Display the plot
+    plt.show()
+
+# Make sure the file path is passed as an argument
+if len(sys.argv) < 2:
+    print("Please provide the path to the log file as a parameter.")
+else:
+    plot_log_data(sys.argv[1])

From a721639f4e90e0db5e9464b7fda27454e305ab5f Mon Sep 17 00:00:00 2001
From: Seba Garcia
Date: Sat, 3 May 2025 16:50:04 +0000
Subject: [PATCH 195/498] Add a log file to store the training data output

---
 modules/flowmldetection/flowmldetection.py | 19 ++++++++++++++++++-
 1 file changed, 18 insertions(+), 1 deletion(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index fe950ed4b..60217ada2 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -68,12 +68,29 @@ def init(self):
         self.model_path = "./modules/flowmldetection/model.bin"
         self.scaler_path = "./modules/flowmldetection/scaler.bin"

+        # Initialize the training log file
+        self.training_log_path = "./modules/flowmldetection/training.log"
+        with open(self.training_log_path, "w") as log_file:
+            log_file.write("Training Log Initialized\n")
+
     def read_configuration(self):
         conf = ConfigParser()
         self.mode = conf.get_ml_mode()
+        # This is the global label in the configuration,
+        # in case the flows do not have a label themselves
         self.label = conf.label()

-    def train(self):
+    def write_to_training_log(self, message: str):
+        """
+        Write a message to the training log file.
+        """
+        try:
+            with open(self.training_log_path, "a") as log_file:
+                log_file.write(message + "\n")
+        except Exception as e:
+            self.print(f"Error writing to training log: {e}", 0, 1)
+
+    def train(self, sum_labeled_flows):
         """
         Train a model based on the flows we receive and the labels
         """

From 2d65486fa55caae847d9cfb709e8aedf57b2b7d6 Mon Sep 17 00:00:00 2001
From: Seba Garcia
Date: Sat, 3 May 2025 16:50:32 +0000
Subject: [PATCH 196/498] Store data in the log file of training

---
 modules/flowmldetection/flowmldetection.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 60217ada2..6f732da63 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -137,9 +137,13 @@ def train(self, sum_labeled_flows):

             # Store the models on disk
             self.store_model()

+            # Log training information
+            self.write_to_training_log(f"Training completed. Background: {epoch_label_counts['Background']}. Benign: {epoch_label_counts['Benign']}. Malicious: {epoch_label_counts['Malicious']}. Total labels: {sum_labeled_flows}. Score: {score}")
+            #self.write_to_training_log(f"Model parameters: {self.clf.coef_}")
         except Exception:
             self.print("Error in train().", 0, 1)
             self.print(traceback.format_exc(), 0, 1)
+            self.write_to_training_log("Error occurred during training.")

From b0324a55a34f5e2f5780bfb755863fbe6662dcc7 Mon Sep 17 00:00:00 2001
From: Seba Garcia
Date: Sat, 3 May 2025 16:50:53 +0000
Subject: [PATCH 197/498] better comments

---
 modules/flowmldetection/flowmldetection.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 6f732da63..ed3aecf1b 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -59,10 +59,9 @@ def init(self):
         self.minimum_labels_to_start_train = 50
         # Minum amount of new labels needed to retrain
         self.minimum_labels_to_retrain = 50
-        # The number of flows when last trained
+        # The number of flows when last trained. Used internally only to know
+        # when to retrain
         self.last_number_of_flows_when_trained = 0
-        # To plot the scores of training
-        # self.scores = []
         # The scaler trained during training and to use during testing
         self.scaler = StandardScaler()

From 1e91a10fa051a06cb27ebf5e9e0c505fe4210f32 Mon Sep 17 00:00:00 2001
From: Seba Garcia
Date: Sat, 3 May 2025 16:51:30 +0000
Subject: [PATCH 198/498] Fix issue not dropping detailed labels

---
 modules/flowmldetection/flowmldetection.py | 16 ++++++----------
 1 file changed, 6 insertions(+), 10 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index ed3aecf1b..25b30cf51 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -94,23 +94,19 @@ def train(self, sum_labeled_flows):
         Train a model based on the flows we receive and the labels
         """
         try:
-            # Get the flows from the DB
-            # self.flows = self.db.get_all_flows_in_profileid_twid(self.profileid, self.twid)
-            # Convert to pandas df
-            # self.flows = pd.DataFrame(self.flows)
-            # Process the features
-            # X_flow = self.process_features(self.flows)
-
             # Create X_flow with the current flows minus the label
-            X_flow = self.flows.drop("label", axis=1)
+            X_flow = self.flows.drop("ground_truth_label", axis=1)
+            # Drop the detailed labels
+            X_flow = X_flow.drop("detailed_ground_truth_label", axis=1)
             # Drop the module_labels
             X_flow = X_flow.drop("module_labels", axis=1)
+
             # Create y_flow with the label
-            y_flow = numpy.full(X_flow.shape[0], self.label)
+            y_flow = numpy.full(X_flow.shape[0], self.flows.ground_truth_label)

             # Normalize this batch of data so far. This can get progressivle slow
             X_flow = self.scaler.fit_transform(X_flow)

From d97a4ddb3e8af4bee1cbe98d980e55fe5b8f8139 Mon Sep 17 00:00:00 2001
From: Seba Garcia
Date: Sat, 3 May 2025 16:51:53 +0000
Subject: [PATCH 199/498] Fix issue that not all labels were given to the
 partial fit

---
 modules/flowmldetection/flowmldetection.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 25b30cf51..b2d0db5e5 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -109,8 +109,9 @@ def train(self, sum_labeled_flows):

             # Train
             try:
+                # Online incremental learning
                 self.clf.partial_fit(
-                    X_flow, y_flow, classes=["Malicious", "Benign"]
+                    X_flow, y_flow, classes=["Background", "Malicious", "Benign"]
                 )
             except Exception:
                 self.print("Error while calling clf.train()")
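
The class list in the patch above matters because SGDClassifier.partial_fit() fixes the label set on its first call, while any later batch may legally contain only a subset of the labels; that is exactly the situation during incremental training. A minimal sketch of the pattern, with made-up feature vectors:

import numpy
from sklearn.linear_model import SGDClassifier

clf = SGDClassifier(warm_start=True, loss="hinge", penalty="l1")

# First batch: only two labels are present, so all three expected
# classes must be declared up front or later batches would fail.
x1 = numpy.array([[0.1, 10.0], [0.9, 2000.0]])
y1 = numpy.array(["Benign", "Malicious"])
clf.partial_fit(x1, y1, classes=["Background", "Malicious", "Benign"])

# Later batches reuse the learned weights and may omit the classes argument.
x2 = numpy.array([[0.2, 15.0]])
y2 = numpy.array(["Background"])
clf.partial_fit(x2, y2)
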
From 10560192bfae39975002f518114f03ad2d56ed83 Mon Sep 17 00:00:00 2001
From: Seba Garcia
Date: Sat, 3 May 2025 16:52:08 +0000
Subject: [PATCH 200/498] count partial labels in this epoch

---
 modules/flowmldetection/flowmldetection.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index b2d0db5e5..1146091a9 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -106,6 +106,12 @@ def train(self, sum_labeled_flows):

             # Normalize this batch of data so far. This can get progressivle slow
             X_flow = self.scaler.fit_transform(X_flow)
+            # Count the number of labels of each type in this epoch
+            epoch_label_counts = {
+                "Background": (y_flow == "Background").sum(),
+                "Malicious": (y_flow == "Malicious").sum(),
+                "Benign": (y_flow == "Benign").sum(),
+            }

             # Train
             try:

From 2a61b4608e234655f284cac29951f33c756bc7f9 Mon Sep 17 00:00:00 2001
From: Seba Garcia
Date: Sat, 3 May 2025 16:55:09 +0000
Subject: [PATCH 201/498] Don't print training in screen

---
 modules/flowmldetection/flowmldetection.py | 11 ++---------
 1 file changed, 2 insertions(+), 9 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 1146091a9..4bb2ad7db 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -126,15 +126,8 @@ def train(self, sum_labeled_flows):
             # See score so far in training
             score = self.clf.score(X_flow, y_flow)

-            # To debug the training score
-            # self.scores.append(score)
-
-            self.print(f" Training Score: {score}", 0, 1)
-            # self.print(f' Model Parameters: {self.clf.coef_}')
-
-            # Debug code to store a plot in a png of the scores
-            # plt.plot(self.scores)
-            # plt.savefig('train-scores.png')
+            #self.print(f" Training Score: {score}", 1, 0)
+            #self.print(f' Model Parameters: {self.clf.coef_}', 1, 0)

             # Store the models on disk
             self.store_model()
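
The patch below stops re-reading the whole flow table on every retrain: the DB result is sliced so that only the flows added since the previous training round are turned into a DataFrame. A minimal sketch of the slicing, assuming get_all_flows() returns flows in insertion order and with hypothetical sizes:

# Hypothetical numbers: 120 flows stored, 50 seen at the last training.
all_flows = [{"uid": i} for i in range(120)]
last_number_of_flows_when_trained = 50

# Only the 70 newer flows are prepared for this round; the older ones
# were already consumed by earlier partial_fit() calls.
new_flows = all_flows[last_number_of_flows_when_trained:]
assert len(new_flows) == 70
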
From eef7992b26c5e8ff0db0ec8c14ce9bd3064f7fd6 Mon Sep 17 00:00:00 2001
From: Seba Garcia
Date: Sat, 3 May 2025 16:55:28 +0000
Subject: [PATCH 202/498] Add function to write to train log

---
 modules/flowmldetection/flowmldetection.py | 28 +++++++++++-----------
 1 file changed, 14 insertions(+), 14 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 4bb2ad7db..d4b2762f5 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -247,28 +247,28 @@ def process_features(self, dataset):
             self.print("Error in process_features()")
             self.print(traceback.format_exc(), 0, 1)

-    def process_training_flows(self):
+    def process_training_flows(self, last_number_of_flows_when_trained):
         """
-        Process all the flows in the DB
+        Process only the new flows in the DB since the last training.
         Store the pandas df in self.flows
         """
         try:
+            # Ensure the index is an integer
+            if last_number_of_flows_when_trained is None:
+                last_number_of_flows_when_trained = 0
+            else:
+                last_number_of_flows_when_trained = int(last_number_of_flows_when_trained)
+
             # We get all the flows so far
-            # because this retraining happens in batches
             flows = self.db.get_all_flows()
-            # Check how many different labels are in the DB
-            # We need both normal and malware
+            # Only process new flows since last training
+            new_flows = flows[last_number_of_flows_when_trained:]
+
+            # Check how many **different** labels are in the DB
             labels = self.db.get_labels()
             if len(labels) == 1:
-                # Only 1 label has flows
-                # There are not enough different labels, so insert two flows
-                # that are fake but representative of a normal and malware flow
-                # they are only for the training process
-                # At least 1 flow of each label is required
-
-                # These flows should be in the same format as the ones in the DB.
-                # Which means the satate is still SF, S0, etc.
-                flows.append(
+                # Insert fake flows for both classes if needed
+                new_flows.append(

From b253aecbdf6797bee21511fc6faa84f0dcf6dd08 Mon Sep 17 00:00:00 2001
From: Seba Garcia
Date: Sat, 3 May 2025 16:57:27 +0000
Subject: [PATCH 203/498] Fix label in dummy flow

---
 modules/flowmldetection/flowmldetection.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index d4b2762f5..6a44422cc 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -283,13 +283,13 @@ def process_training_flows(self, last_number_of_flows_when_trained):
                     "sbytes": 25517,
                     "dbytes": 17247,
                     "appproto": "ssl",
-                    "label": "Malicious",
+                    "ground_truth_label": "Malicious",
                     "module_labels": {
                         "flowalerts-long-connection": "Malicious"
                     },
                 }
             )
-            flows.append(
+            new_flows.append(

From 8b5dccc0afc99f5a2bd1c6175d034b890135178d Mon Sep 17 00:00:00 2001
From: Seba Garcia
Date: Sat, 3 May 2025 16:57:39 +0000
Subject: [PATCH 204/498] Fix dummy flow

---
 modules/flowmldetection/flowmldetection.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 6a44422cc..20f1f8ca8 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -304,7 +304,7 @@ def process_training_flows(self, last_number_of_flows_when_trained):
                     "sbytes": 100,
                     "dbytes": 67596,
                     "appproto": "http",
-                    "label": "Benign",
+                    "ground_truth_label": "Benign",
                     "module_labels": {
                         "flowalerts-long-connection": "Benign"
                     },
                 }

From 11fb0096098f3ac57267593712f8b545b1ca84a2 Mon Sep 17 00:00:00 2001
From: Seba Garcia
Date: Sat, 3 May 2025 16:58:28 +0000
Subject: [PATCH 205/498] Rename variable

---
 modules/flowmldetection/flowmldetection.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 20f1f8ca8..59064d61a 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -310,10 +310,9 @@ def process_training_flows(self, last_number_of_flows_when_trained):
                     },
                 }
             )
-            # If there are enough flows, we dont insert them anymore

             # Convert to pandas df
-            df_flows = pd.DataFrame(flows)
+            df_flows = pd.DataFrame(new_flows)

             # Process features
             df_flows = self.process_features(df_flows)
@@ -321,7 +320,6 @@ def process_training_flows(self, last_number_of_flows_when_trained):
             # Update the flow to the processed version
             self.flows = df_flows
         except Exception:
-            # Stop the timer
             self.print("Error in process_flows()")
             self.print(traceback.format_exc(), 0, 1)

From 1acb03086bc424093508484dfa70176c696f8777 Mon Sep 17 00:00:00 2001
From: Seba Garcia
Date: Sat, 3 May 2025 17:00:32 +0000
Subject: [PATCH 206/498] Fix dummy flow label

---
 modules/flowmldetection/flowmldetection.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 59064d61a..6b41b4029 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -356,6 +356,8 @@ def detect(self, x_flow) -> Optional[numpy.ndarray]:
             "dir_",
             "endtime",
             "flow_source",
+            "ground_truth_label",
+            "detailed_ground_truth_label",
         ]
         # For argus binetflows this fails because there is a field called bytes that was not in other flows. It should be called allbytes.
From 5f61978998876e7e30511a2e7a378bf914ec022a Mon Sep 17 00:00:00 2001
From: Seba Garcia
Date: Sat, 3 May 2025 17:00:47 +0000
Subject: [PATCH 207/498] Pass values to train function

---
 modules/flowmldetection/flowmldetection.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 6b41b4029..4d66aab85 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -521,9 +521,9 @@ def main(self):
                     )
                     # Process all flows in the DB and make them ready
                     # for pandas
-                    self.process_training_flows()
+                    self.process_training_flows(self.last_number_of_flows_when_trained)
                     # Train an algorithm
-                    self.train()
+                    self.train(sum_labeled_flows)
                     self.last_number_of_flows_when_trained = sum_labeled_flows

From 4a486284e59952de7c793ee55cd2e627fd7f2830 Mon Sep 17 00:00:00 2001
From: Seba Garcia
Date: Sat, 3 May 2025 17:01:47 +0000
Subject: [PATCH 208/498] import os

---
 modules/flowmldetection/flowmldetection.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 4d66aab85..766178e12 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -10,6 +10,7 @@
 import json
 import traceback
 import warnings
+import os

 from slips_files.common.parsers.config_parser import ConfigParser
 from slips_files.common.slips_utils import utils

From 19b5bdde44678c80365f8c6aeda8b9d3b67f7a6f Mon Sep 17 00:00:00 2001
From: Seba Garcia
Date: Sat, 3 May 2025 17:02:15 +0000
Subject: [PATCH 209/498] Fix issue of total flows zero

---
 slips_files/core/database/database_manager.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/slips_files/core/database/database_manager.py b/slips_files/core/database/database_manager.py
index e8ca3aaf6..892b923b4 100644
--- a/slips_files/core/database/database_manager.py
+++ b/slips_files/core/database/database_manager.py
@@ -661,7 +661,8 @@ def add_software_to_profile(self, *args, **kwargs):
         return self.rdb.add_software_to_profile(*args, **kwargs)

     def get_total_flows(self, *args, **kwargs):
-        return int(self.rdb.get_total_flows(*args, **kwargs))
+        total_flows = self.rdb.get_total_flows(*args, **kwargs)
+        return int(total_flows) if total_flows is not None else 0

     def increment_processed_flows(self, *args, **kwargs):
         return self.rdb.increment_processed_flows(*args, **kwargs)
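
The None check in the patch above matters because the total-flows key may not exist in Redis at the very start of a capture, and int(None) raises a TypeError. A minimal sketch of the fixed behaviour, where to_total_flows is a hypothetical stand-in for the wrapped getter:

def to_total_flows(raw):
    # Missing key: the raw getter returns None, which now maps to 0
    # instead of raising TypeError inside int().
    return int(raw) if raw is not None else 0

assert to_total_flows(None) == 0
assert to_total_flows("42") == 42
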
From cf87d4260a971d8e81d1474b0d0968dba12e68b7 Mon Sep 17 00:00:00 2001
From: Seba Garcia
Date: Sat, 3 May 2025 17:02:32 +0000
Subject: [PATCH 210/498] Add comments

---
 slips_files/core/database/database_manager.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/slips_files/core/database/database_manager.py b/slips_files/core/database/database_manager.py
index 892b923b4..6dd1d9952 100644
--- a/slips_files/core/database/database_manager.py
+++ b/slips_files/core/database/database_manager.py
@@ -879,7 +879,10 @@ def get_flow(self, *args, **kwargs):
         """returns the raw flow as read from the log file"""
         return self.sqlite.get_flow(*args, **kwargs)

-    def add_flow(self, flow, profileid: str, twid: str, label="benign"):
+    def add_flow(self, flow, profileid: str, twid: str, label="Benign"):
+        """
+        Just in case: by default, if the flow has no labels, we consider it Benign
+        """
         # stores it in the db
         self.sqlite.add_flow(flow, profileid, twid, label=label)
         # handles the channels and labels etc.

From 5a7c0ded0fcf0c46666839a155556f09409687cc Mon Sep 17 00:00:00 2001
From: Seba Garcia
Date: Sat, 3 May 2025 17:02:51 +0000
Subject: [PATCH 211/498] Rename var name to be more clear

---
 slips_files/core/profiler.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/slips_files/core/profiler.py b/slips_files/core/profiler.py
index 3c4d59db2..3dd478dcf 100644
--- a/slips_files/core/profiler.py
+++ b/slips_files/core/profiler.py
@@ -119,7 +119,7 @@ def read_configuration(self):
         self.local_whitelist_path = conf.local_whitelist_path()
         self.timeformat = conf.ts_format()
         self.analysis_direction = conf.analysis_direction()
-        self.label = conf.label()
+        self.configuration_label = conf.label()
         self.width = conf.get_tw_width_as_float()
         self.client_ips: List[
             Union[IPv4Network, IPv6Network, IPv4Address, IPv6Address]

From 24e638bdba4dedacff0e2af93b701f3d1b75403e Mon Sep 17 00:00:00 2001
From: Seba Garcia
Date: Sat, 3 May 2025 17:03:10 +0000
Subject: [PATCH 212/498] Rename var name

---
 slips_files/core/profiler.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/slips_files/core/profiler.py b/slips_files/core/profiler.py
index 3dd478dcf..429faae5c 100644
--- a/slips_files/core/profiler.py
+++ b/slips_files/core/profiler.py
@@ -377,7 +377,7 @@ def store_features_going_in(self, profileid: str, twid: str, flow):
             flow,
             profileid=profileid,
             twid=twid,
-            label=self.label,
+            label=self.configuration_label,
         )
         self.db.mark_profile_tw_as_modified(profileid, twid, "")

From f872498d1f7848c293a1c71e03b21f35b0eba1d3 Mon Sep 17 00:00:00 2001
From: Seba Garcia
Date: Sat, 3 May 2025 17:03:31 +0000
Subject: [PATCH 213/498] Fix processed flows being zero

---
 slips/main.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/slips/main.py b/slips/main.py
index df49ffb97..39e8b2a67 100644
--- a/slips/main.py
+++ b/slips/main.py
@@ -414,7 +414,7 @@ def get_analyzed_flows_percentage(self) -> str:
             self.total_flows = self.db.get_total_flows()

         flows_percentage = int(
-            (self.db.get_processed_flows_so_far() / self.total_flows) * 100
+            (self.db.get_processed_flows_so_far() / self.total_flows) * 100 if self.total_flows != 0 else 0
         )
         return f"Analyzed Flows: {green(flows_percentage)}{green('%')}. 
" From 19c3116d79ae35e0138b623dd05d0994dcabd679 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 17:03:53 +0000 Subject: [PATCH 214/498] Delete old comments --- modules/flowmldetection/flowmldetection.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 766178e12..6c3bfc127 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -28,10 +28,6 @@ Method, ) -# Only for debbuging -# from matplotlib import pyplot as plt - - # This horrible hack is only to stop sklearn from printing those warnings def warn(*args, **kwargs): pass From 0d6d1da5f8494e912ceb600fcc14c93c7dd36204 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 17:13:22 +0000 Subject: [PATCH 215/498] Fix plots --- modules/flowmldetection/plot_train_score.py | 48 ++++++++++++++++----- 1 file changed, 38 insertions(+), 10 deletions(-) diff --git a/modules/flowmldetection/plot_train_score.py b/modules/flowmldetection/plot_train_score.py index 0b5b5b72b..359df04ef 100644 --- a/modules/flowmldetection/plot_train_score.py +++ b/modules/flowmldetection/plot_train_score.py @@ -2,6 +2,8 @@ import matplotlib.pyplot as plt import re import sys +import argparse +import os def plot_log_data(file_path): # Read the log data from the file @@ -24,33 +26,59 @@ def plot_log_data(file_path): "Score": float }) + # Get the directory of the log file to store the plot in the same folder + dir_name = os.path.dirname(file_path) + plot_file = os.path.join(dir_name, 'log_data_plot_with_two_scales.png') + # Plotting the values fig, ax1 = plt.subplots(figsize=(10, 6)) - # Plotting Score on the left y-axis + # Plotting Score on the left y-axis (with proper scaling from 0 to 1) ax1.plot(df.index, df["Score"], label="Score", color='tab:blue') ax1.set_xlabel('Index') ax1.set_ylabel('Score', color='tab:blue') + ax1.set_ylim(0, 1) # Set y-axis for Score from 0 to 1 ax1.tick_params(axis='y', labelcolor='tab:blue') - # Create the second y-axis for the Total labels + # Create the second y-axis for the Background, Benign, Malicious, Total labels ax2 = ax1.twinx() + ax2.plot(df.index, df["Background"], label="Background", color='tab:green', linestyle='--') + ax2.plot(df.index, df["Benign"], label="Benign", color='tab:orange', linestyle='--') + ax2.plot(df.index, df["Malicious"], label="Malicious", color='tab:pink', linestyle='--') ax2.plot(df.index, df["Total labels"], label="Total labels", color='tab:red') - ax2.set_ylabel('Total labels', color='tab:red') + ax2.set_ylabel('Background, Benign, Malicious, Total labels', color='tab:red') + + # Set appropriate scale for right y-axis based on the data + ax2.set_ylim(0, df[["Background", "Benign", "Malicious", "Total labels"]].max().max()) ax2.tick_params(axis='y', labelcolor='tab:red') # Adding title and legend plt.title('Log Data Visualization') fig.tight_layout() - # Save plot to a PNG file - plt.savefig('log_data_plot_with_two_scales.png') + # Adding the legend with increased space for readability + ax1.legend(loc='upper left', bbox_to_anchor=(1, 1), fontsize='small') + ax2.legend(loc='upper left', bbox_to_anchor=(1, 0.7), fontsize='small') + + # Increase right margin for better readability of legend + plt.subplots_adjust(right=0.75) + + # Save plot to the same folder as the log file + plt.savefig(plot_file) # Display the plot plt.show() -# Make sure the file path is passed as an argument -if len(sys.argv) < 2: - print("Please provide the path to 
the log file as a parameter.") -else: - plot_log_data(sys.argv[1]) +def main(): + # Parse command-line arguments + parser = argparse.ArgumentParser(description="Process a log file and plot the data with two y-axes.") + parser.add_argument('log_file', metavar='log_file', type=str, help="Path to the log file") + + # Handle -h / --help + args = parser.parse_args() + + # Call the function to process the log file + plot_log_data(args.log_file) + +if __name__ == "__main__": + main() From da5d1875a5f4ce9ec016e5cfa8f41e31ed5862b5 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 17:14:58 +0000 Subject: [PATCH 216/498] Fix plot --- modules/flowmldetection/plot_train_score.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/modules/flowmldetection/plot_train_score.py b/modules/flowmldetection/plot_train_score.py index 359df04ef..c7f374a7f 100644 --- a/modules/flowmldetection/plot_train_score.py +++ b/modules/flowmldetection/plot_train_score.py @@ -40,18 +40,21 @@ def plot_log_data(file_path): ax1.set_ylim(0, 1) # Set y-axis for Score from 0 to 1 ax1.tick_params(axis='y', labelcolor='tab:blue') - # Create the second y-axis for the Background, Benign, Malicious, Total labels + # Create the second y-axis for the Background, Benign, Malicious ax2 = ax1.twinx() ax2.plot(df.index, df["Background"], label="Background", color='tab:green', linestyle='--') ax2.plot(df.index, df["Benign"], label="Benign", color='tab:orange', linestyle='--') ax2.plot(df.index, df["Malicious"], label="Malicious", color='tab:pink', linestyle='--') - ax2.plot(df.index, df["Total labels"], label="Total labels", color='tab:red') - ax2.set_ylabel('Background, Benign, Malicious, Total labels', color='tab:red') + ax2.set_ylabel('Background, Benign, Malicious', color='tab:red') # Set appropriate scale for right y-axis based on the data - ax2.set_ylim(0, df[["Background", "Benign", "Malicious", "Total labels"]].max().max()) + ax2.set_ylim(0, df[["Background", "Benign", "Malicious"]].max().max()) ax2.tick_params(axis='y', labelcolor='tab:red') + # Annotating Total labels as text on the plot + for i, value in enumerate(df["Total labels"]): + ax1.text(i, value, f'{value:.1f}', color='tab:gray', fontsize=8, ha='center', va='bottom') + # Adding title and legend plt.title('Log Data Visualization') fig.tight_layout() From 0f3d1f5b26d0a8c25cfdfc9b758e249fa48fface Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 17:16:23 +0000 Subject: [PATCH 217/498] Fix plot --- modules/flowmldetection/plot_train_score.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/modules/flowmldetection/plot_train_score.py b/modules/flowmldetection/plot_train_score.py index c7f374a7f..4099c47c1 100644 --- a/modules/flowmldetection/plot_train_score.py +++ b/modules/flowmldetection/plot_train_score.py @@ -42,10 +42,10 @@ def plot_log_data(file_path): # Create the second y-axis for the Background, Benign, Malicious ax2 = ax1.twinx() - ax2.plot(df.index, df["Background"], label="Background", color='tab:green', linestyle='--') - ax2.plot(df.index, df["Benign"], label="Benign", color='tab:orange', linestyle='--') - ax2.plot(df.index, df["Malicious"], label="Malicious", color='tab:pink', linestyle='--') - ax2.set_ylabel('Background, Benign, Malicious', color='tab:red') + ax2.plot(df.index, df["Background"], label="Background Labels", color='tab:green', linestyle='--') + ax2.plot(df.index, df["Benign"], label="Benign Labels", color='tab:orange', linestyle='--') + ax2.plot(df.index, 
df["Malicious"], label="Malicious Labels", color='tab:pink', linestyle='--') + ax2.set_ylabel('Background, Benign, Malicious Labels', color='tab:red') # Set appropriate scale for right y-axis based on the data ax2.set_ylim(0, df[["Background", "Benign", "Malicious"]].max().max()) @@ -56,7 +56,7 @@ def plot_log_data(file_path): ax1.text(i, value, f'{value:.1f}', color='tab:gray', fontsize=8, ha='center', va='bottom') # Adding title and legend - plt.title('Log Data Visualization') + plt.title('Training performance') fig.tight_layout() # Adding the legend with increased space for readability From b000f176f8278d4fa86a2f4fb2d994da9813aaca Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 17:24:43 +0000 Subject: [PATCH 218/498] Fix plot --- modules/flowmldetection/plot_train_score.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/modules/flowmldetection/plot_train_score.py b/modules/flowmldetection/plot_train_score.py index 4099c47c1..8437e968a 100644 --- a/modules/flowmldetection/plot_train_score.py +++ b/modules/flowmldetection/plot_train_score.py @@ -59,12 +59,12 @@ def plot_log_data(file_path): plt.title('Training performance') fig.tight_layout() - # Adding the legend with increased space for readability - ax1.legend(loc='upper left', bbox_to_anchor=(1, 1), fontsize='small') - ax2.legend(loc='upper left', bbox_to_anchor=(1, 0.7), fontsize='small') + # Move both legends further to the right + ax1.legend(loc='upper right', bbox_to_anchor=(1.26, 1), fontsize='small', ncol=1) + ax2.legend(loc='upper right', bbox_to_anchor=(1.4, 0.95), fontsize='small', ncol=1) # Increase right margin for better readability of legend - plt.subplots_adjust(right=0.75) + plt.subplots_adjust(right=0.7) # Save plot to the same folder as the log file plt.savefig(plot_file) From bd1f21b2101ae36b11bc5e3a866de745a8c3e2e8 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 18:02:34 +0000 Subject: [PATCH 219/498] Plot testing performance from a log --- .../plot_testing_performance.py | 89 +++++++++++++++++++ 1 file changed, 89 insertions(+) create mode 100644 modules/flowmldetection/plot_testing_performance.py diff --git a/modules/flowmldetection/plot_testing_performance.py b/modules/flowmldetection/plot_testing_performance.py new file mode 100644 index 000000000..a38c7f059 --- /dev/null +++ b/modules/flowmldetection/plot_testing_performance.py @@ -0,0 +1,89 @@ +import matplotlib.pyplot as plt +import sys +import numpy as np + +def process_file(file_path): + # Initialize the counters for the values + FPR_values = [] + FNR_values = [] + TNR_values = [] + TPR_values = [] + F1_values = [] + accuracy_values = [] + precision_values = [] + MCC_values = [] + recall_values = [] + + # Read the file and extract the data + with open(file_path, 'r') as file: + for line in file: + if "TP:" in line: + # Extract the values from the line + parts = line.split(',') + TP = int(parts[0].split(':')[1].strip()) + TN = int(parts[1].split(':')[1].strip()) + FP = int(parts[2].split(':')[1].strip()) + FN = int(parts[3].split(':')[1].strip()) + + # Calculate metrics + FPR = FP / (FP + TN) if (FP + TN) != 0 else 0 + FNR = FN / (FN + TP) if (FN + TP) != 0 else 0 + TNR = TN / (TN + FP) if (TN + FP) != 0 else 0 + TPR = TP / (TP + FN) if (TP + FN) != 0 else 0 + Precision = TP / (TP + FP) if (TP + FP) != 0 else 0 + Recall = TPR # Recall is the same as TPR + F1 = 2 * (Precision * Recall) / (Precision + Recall) if (Precision + Recall) != 0 else 0 + Accuracy = (TP + TN) / (TP + TN + FP + FN) + MCC = ((TP 
* TN) - (FP * FN)) / np.sqrt((TP + FP) * (TP + FN) * (TN + FP) * (TN + FN)) if ((TP + FP) * (TP + FN) * (TN + FP) * (TN + FN)) != 0 else 0 + + # Append the values to the respective lists + FPR_values.append(FPR) + FNR_values.append(FNR) + TNR_values.append(TNR) + TPR_values.append(TPR) + F1_values.append(F1) + accuracy_values.append(Accuracy) + precision_values.append(Precision) + MCC_values.append(MCC) + recall_values.append(Recall) + + return FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values + +def plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values): + # Create the plot + plt.figure(figsize=(12, 8)) + + # Plot each metric + plt.plot(FPR_values, label='False Positive Rate (FPR)', marker='o') + plt.plot(FNR_values, label='False Negative Rate (FNR)', marker='o') + plt.plot(TNR_values, label='True Negative Rate (TNR)', marker='o') + plt.plot(TPR_values, label='True Positive Rate (TPR)', marker='o') + plt.plot(F1_values, label='F1 Score', marker='o') + plt.plot(accuracy_values, label='Accuracy', marker='o') + plt.plot(precision_values, label='Precision', marker='o') + plt.plot(MCC_values, label='Matthews Correlation Coefficient (MCC)', marker='o') + plt.plot(recall_values, label='Recall (TPR)', marker='o') + + # Add labels and title + plt.xlabel('Index') + plt.ylabel('Metric Value') + plt.title('Evaluation Metrics Over Time') + + # Add a legend + plt.legend() + + # Save the plot as a PNG file + plt.savefig('metrics_plot.png') + plt.close() + +def main(): + if len(sys.argv) != 2: + print("Usage: python script.py ") + sys.exit(1) + + file_path = sys.argv[1] + FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values = process_file(file_path) + plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values) + +if __name__ == "__main__": + main() From fd21630441d02796cd0aae52b5e13492a2d731d0 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 18:04:32 +0000 Subject: [PATCH 220/498] Fix the plot --- modules/flowmldetection/plot_testing_performance.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/modules/flowmldetection/plot_testing_performance.py b/modules/flowmldetection/plot_testing_performance.py index a38c7f059..fac0acd64 100644 --- a/modules/flowmldetection/plot_testing_performance.py +++ b/modules/flowmldetection/plot_testing_performance.py @@ -64,16 +64,19 @@ def plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accu plt.plot(MCC_values, label='Matthews Correlation Coefficient (MCC)', marker='o') plt.plot(recall_values, label='Recall (TPR)', marker='o') + # Set logarithmic scale on the y-axis + plt.yscale('log') + # Add labels and title plt.xlabel('Index') - plt.ylabel('Metric Value') - plt.title('Evaluation Metrics Over Time') + plt.ylabel('Metric Value (Log Scale)') + plt.title('Evaluation Metrics Over Time (Log Scale)') # Add a legend plt.legend() # Save the plot as a PNG file - plt.savefig('metrics_plot.png') + plt.savefig('metrics_plot_log_scale.png') plt.close() def main(): From ee0deaf2a3229c26a5c734a314878b9b0a393c01 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 18:12:40 +0000 Subject: [PATCH 221/498] Fix the plots --- .../plot_testing_performance.py | 76 ++++++++++++++----- 1 file changed, 55 insertions(+), 21 
deletions(-) diff --git a/modules/flowmldetection/plot_testing_performance.py b/modules/flowmldetection/plot_testing_performance.py index fac0acd64..5581c72cd 100644 --- a/modules/flowmldetection/plot_testing_performance.py +++ b/modules/flowmldetection/plot_testing_performance.py @@ -50,33 +50,66 @@ def process_file(file_path): return FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values def plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values): - # Create the plot - plt.figure(figsize=(12, 8)) + # Separate the values into two groups based on their proximity to 0 or 1 + close_to_0 = { + 'FPR': [], 'FNR': [] + } + close_to_1 = { + 'TNR': [], 'TPR': [], 'F1': [], 'accuracy': [], 'precision': [], 'MCC': [], 'recall': [] + } - # Plot each metric - plt.plot(FPR_values, label='False Positive Rate (FPR)', marker='o') - plt.plot(FNR_values, label='False Negative Rate (FNR)', marker='o') - plt.plot(TNR_values, label='True Negative Rate (TNR)', marker='o') - plt.plot(TPR_values, label='True Positive Rate (TPR)', marker='o') - plt.plot(F1_values, label='F1 Score', marker='o') - plt.plot(accuracy_values, label='Accuracy', marker='o') - plt.plot(precision_values, label='Precision', marker='o') - plt.plot(MCC_values, label='Matthews Correlation Coefficient (MCC)', marker='o') - plt.plot(recall_values, label='Recall (TPR)', marker='o') + # Categorize the metrics into two groups + for i in range(len(FPR_values)): + close_to_0['FPR'].append(FPR_values[i]) + close_to_0['FNR'].append(FNR_values[i]) + + close_to_1['TNR'].append(TNR_values[i]) + close_to_1['TPR'].append(TPR_values[i]) + close_to_1['F1'].append(F1_values[i]) + close_to_1['accuracy'].append(accuracy_values[i]) + close_to_1['precision'].append(precision_values[i]) + close_to_1['MCC'].append(MCC_values[i]) + close_to_1['recall'].append(recall_values[i]) + + # Plot metrics for values close to 0 + plot_single_group(close_to_0, 'metrics_plot_close_to_0.png') - # Set logarithmic scale on the y-axis - plt.yscale('log') + # Plot metrics for values close to 1 + plot_single_group(close_to_1, 'metrics_plot_close_to_1.png') + +def plot_single_group(metrics_dict, output_filename): + plt.figure(figsize=(12, 8)) - # Add labels and title + # Only plot the metrics that exist in the dictionary + if 'FPR' in metrics_dict: + plt.plot(metrics_dict['FPR'], label='False Positive Rate (FPR)', marker='o') + if 'FNR' in metrics_dict: + plt.plot(metrics_dict['FNR'], label='False Negative Rate (FNR)', marker='o') + if 'TNR' in metrics_dict: + plt.plot(metrics_dict['TNR'], label='True Negative Rate (TNR)', marker='o') + if 'TPR' in metrics_dict: + plt.plot(metrics_dict['TPR'], label='True Positive Rate (TPR)', marker='o') + if 'F1' in metrics_dict: + plt.plot(metrics_dict['F1'], label='F1 Score', marker='o') + if 'accuracy' in metrics_dict: + plt.plot(metrics_dict['accuracy'], label='Accuracy', marker='o') + if 'precision' in metrics_dict: + plt.plot(metrics_dict['precision'], label='Precision', marker='o') + if 'MCC' in metrics_dict: + plt.plot(metrics_dict['MCC'], label='Matthews Correlation Coefficient (MCC)', marker='o') + if 'recall' in metrics_dict: + plt.plot(metrics_dict['recall'], label='Recall (TPR)', marker='o') + + # Apply log scale by default + plt.yscale('log') + plt.xlabel('Index') - plt.ylabel('Metric Value (Log Scale)') - plt.title('Evaluation Metrics Over Time (Log Scale)') - - # Add a legend + 
plt.ylabel('Metric Value') + plt.title(f'Evaluation Metrics Over Time ({output_filename.split("_")[2].replace(".png", "")})') plt.legend() - # Save the plot as a PNG file - plt.savefig('metrics_plot_log_scale.png') + # Save the plot + plt.savefig(output_filename) plt.close() def main(): @@ -85,6 +118,7 @@ def main(): sys.exit(1) file_path = sys.argv[1] + FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values = process_file(file_path) plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values) From f9d8806d2c2035b3cb57e69a70b462cec05e5f57 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 18:16:50 +0000 Subject: [PATCH 222/498] Fix plot --- .../plot_testing_performance.py | 22 +++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/modules/flowmldetection/plot_testing_performance.py b/modules/flowmldetection/plot_testing_performance.py index 5581c72cd..8f9e12cd8 100644 --- a/modules/flowmldetection/plot_testing_performance.py +++ b/modules/flowmldetection/plot_testing_performance.py @@ -72,12 +72,24 @@ def plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accu close_to_1['recall'].append(recall_values[i]) # Plot metrics for values close to 0 - plot_single_group(close_to_0, 'metrics_plot_close_to_0.png') + plot_single_group(close_to_0, 'metrics_plot_close_to_0.png', is_close_to_0=True) # Plot metrics for values close to 1 plot_single_group(close_to_1, 'metrics_plot_close_to_1.png') -def plot_single_group(metrics_dict, output_filename): + # Print the final values + print("\nFinal Metric Values:") + print(f"Final FPR: {FPR_values[-1]:.4f}") + print(f"Final FNR: {FNR_values[-1]:.4f}") + print(f"Final TNR: {TNR_values[-1]:.4f}") + print(f"Final TPR: {TPR_values[-1]:.4f}") + print(f"Final F1 Score: {F1_values[-1]:.4f}") + print(f"Final Accuracy: {accuracy_values[-1]:.4f}") + print(f"Final Precision: {precision_values[-1]:.4f}") + print(f"Final MCC: {MCC_values[-1]:.4f}") + print(f"Final Recall: {recall_values[-1]:.4f}") + +def plot_single_group(metrics_dict, output_filename, is_close_to_0=False): plt.figure(figsize=(12, 8)) # Only plot the metrics that exist in the dictionary @@ -103,6 +115,12 @@ def plot_single_group(metrics_dict, output_filename): # Apply log scale by default plt.yscale('log') + # If the plot is close to 0, set custom ticks + if is_close_to_0: + # Manually set more Y-ticks for better visibility + plt.ylim(0.0001, 1) # Set Y-axis limits between 0.0001 and 1 + plt.yticks([0.0001, 0.001, 0.01, 0.1, 1], ['0.0001', '0.001', '0.01', '0.1', '1']) # Adjust Y-ticks + plt.xlabel('Index') plt.ylabel('Metric Value') plt.title(f'Evaluation Metrics Over Time ({output_filename.split("_")[2].replace(".png", "")})') From 15e37d2d67dc27f0aaabb5cb40dbc3fe397d64ec Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 18:20:22 +0000 Subject: [PATCH 223/498] Fix plots --- modules/flowmldetection/flowmldetection.py | 709 +++++---------------- 1 file changed, 143 insertions(+), 566 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 6c3bfc127..37f076110 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -1,566 +1,143 @@ -# SPDX-FileCopyrightText: 2021 Sebastian Garcia -from typing import Optional - -# SPDX-License-Identifier: GPL-2.0-only -import numpy -from 
sklearn.linear_model import SGDClassifier -from sklearn.preprocessing import StandardScaler -import pickle -import pandas as pd -import json -import traceback -import warnings -import os - -from slips_files.common.parsers.config_parser import ConfigParser -from slips_files.common.slips_utils import utils -from slips_files.common.abstracts.module import IModule -from slips_files.core.structures.evidence import ( - Evidence, - ProfileID, - TimeWindow, - Attacker, - ThreatLevel, - EvidenceType, - IoCType, - Direction, - Victim, - Method, -) - -# This horrible hack is only to stop sklearn from printing those warnings -def warn(*args, **kwargs): - pass - - -warnings.warn = warn - - -class FlowMLDetection(IModule): - # Name: short name of the module. Do not use spaces - name = "Flow ML Detection" - description = ( - "Train or test a Machine Learning model to detect malicious flows" - ) - authors = ["Sebastian Garcia"] - - def init(self): - # Subscribe to the channel - self.c1 = self.db.subscribe("new_flow") - self.channels = {"new_flow": self.c1} - self.fieldseparator = self.db.get_field_separator() - # Set the output queue of our database instance - # Read the configuration - self.read_configuration() - # Minum amount of new labels needed to start the train - self.minimum_labels_to_start_train = 50 - # Minum amount of new labels needed to retrain - self.minimum_labels_to_retrain = 50 - # The number of flows when last trained. Used internally only to know - # when to retrain - self.last_number_of_flows_when_trained = 0 - # The scaler trained during training and to use during testing - self.scaler = StandardScaler() - self.model_path = "./modules/flowmldetection/model.bin" - self.scaler_path = "./modules/flowmldetection/scaler.bin" - - # Initialize the training log file - self.training_log_path = "./modules/flowmldetection/training.log" - with open(self.training_log_path, "w") as log_file: - log_file.write("Training Log Initialized\n") - - def read_configuration(self): - conf = ConfigParser() - self.mode = conf.get_ml_mode() - # This is the global label in the configuration, - # in case the flows do not have a label themselves - self.label = conf.label() - - def write_to_training_log(self, message: str): - """ - Write a message to the training log file. - """ - try: - with open(self.training_log_path, "a") as log_file: - log_file.write(message + "\n") - except Exception as e: - self.print(f"Error writing to training log: {e}", 0, 1) - - def train(self, sum_labeled_flows): - """ - Train a model based on the flows we receive and the labels - """ - try: - # Create X_flow with the current flows minus the label - X_flow = self.flows.drop("ground_truth_label", axis=1) - # Drop the detailed labels - X_flow = X_flow.drop("detailed_ground_truth_label", axis=1) - # Drop the module_labels - X_flow = X_flow.drop("module_labels", axis=1) - # Create y_flow with the label - y_flow = numpy.full(X_flow.shape[0], self.flows.ground_truth_label) - - # Normalize this batch of data so far. 
This can get progressivle slow - X_flow = self.scaler.fit_transform(X_flow) - - # Count the number of labels of each type in this epoc - epoch_label_counts = { - "Background": (y_flow == "Background").sum(), - "Malicious": (y_flow == "Malicious").sum(), - "Benign": (y_flow == "Benign").sum(), - } - - # Train - try: - # Online incremental learning - self.clf.partial_fit( - X_flow, y_flow, classes=["Background", "Malicious", "Benign"] - ) - except Exception: - self.print("Error while calling clf.train()") - self.print(traceback.format_exc(), 0, 1) - - # See score so far in training - score = self.clf.score(X_flow, y_flow) - - #self.print(f" Training Score: {score}", 1, 0) - #self.print(f' Model Parameters: {self.clf.coef_}', 1, 0) - - # Store the models on disk - self.store_model() - - # Log training information - self.write_to_training_log(f"Training completed. Background: {epoch_label_counts['Background']}. Benign: {epoch_label_counts['Benign']}. Malicious: {epoch_label_counts['Malicious']}. Total labels: {sum_labeled_flows}. Score: {score}") - #self.write_to_training_log(f"Model parameters: {self.clf.coef_}") - except Exception: - self.print("Error in train().", 0, 1) - self.print(traceback.format_exc(), 0, 1) - self.write_to_training_log("Error occurred during training.") - - def process_features(self, dataset): - """ - Discards some features of the dataset and can create new. - Clean the dataset - """ - try: - # Discard some type of flows that dont have ports - to_discard = ["arp", "ARP", "icmp", "igmp", "ipv6-icmp", ""] - for proto in to_discard: - dataset = dataset[dataset.proto != proto] - - # If te proto is in the list to delete and there is only one flow, then the dataset will be empty - if dataset.empty: - # DataFrame is empty now, so return empty - return dataset - - # For now, discard these - to_drop = [ - "appproto", - "daddr", - "saddr", - "starttime", - "type_", - "smac", - "dmac", - "history", - "uid", - "dir_", - "endtime", - "flow_source", - ] - for field in to_drop: - try: - dataset = dataset.drop(field, axis=1) - except (ValueError, KeyError): - pass - - # When flows are read from Slips sqlite, - # the state is not transformed to 'Established' or - # 'Not Established', it is still 'S0' and others - # So transform here - dataset["state"] = dataset.apply( - lambda row: self.db.get_final_state_from_flags( - row["state"], (row["spkts"] + row["dpkts"]) - ), - axis=1, - ) - - # Convert state to categorical - dataset.state = dataset.state.str.replace( - r"(^.*Not Established.*$)", "0", regex=True - ) - dataset.state = dataset.state.str.replace( - r"(^.*Established.*$)", "1", regex=True - ) - - # Convert categories to floats - dataset.state = dataset.state.astype("float64") - - # Convert proto to categorical. For now we only have few states, so we can hardcode... - # We dont use the data to create categories because in testing mode - # we dont see all the protocols - # Also we dont store the Categorizer because the user can retrain - # with its own data. 
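
A minimal standalone sketch of the hardcoded proto mapping shown in the surrounding hunk (illustrative only, not part of the patch; it assumes a plain pandas Series rather than Slips' full flow DataFrame):

    import pandas as pd

    # Protocol names are lower-cased first, then regex-mapped to
    # numeric category strings, as in process_features().
    proto = pd.Series(["TCP", "udp", "ICMP"]).str.lower()
    proto = proto.str.replace(r"(^.*tcp.*$)", "0", regex=True)
    proto = proto.str.replace(r"(^.*udp.*$)", "1", regex=True)
    proto = proto.str.replace(r"(^.*icmp.*$)", "2", regex=True)
    print(proto.astype("float64").tolist())  # [0.0, 1.0, 2.0]
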
- dataset.proto = dataset.proto.str.lower() - dataset.proto = dataset.proto.str.replace( - r"(^.*tcp.*$)", "0", regex=True - ) - dataset.proto = dataset.proto.str.replace( - r"(^.*udp.*$)", "1", regex=True - ) - dataset.proto = dataset.proto.str.replace( - r"(^.*icmp.*$)", "2", regex=True - ) - dataset.proto = dataset.proto.str.replace( - r"(^.*icmp-ipv6.*$)", "3", regex=True - ) - dataset.proto = dataset.proto.str.replace( - r"(^.*arp.*$)", "4", regex=True - ) - - dataset["allbytes"] = dataset["sbytes"] + dataset["dbytes"] - dataset["pkts"] = dataset["spkts"] + dataset["dpkts"] - - fields_to_convert_to_float = [ - dataset.proto, - dataset.dport, - dataset.sport, - dataset.dur, - dataset.pkts, - dataset.spkts, - dataset.allbytes, - dataset.sbytes, - dataset.state, - ] - for field in fields_to_convert_to_float: - try: - field = field.astype("float64") - except (ValueError, AttributeError): - pass - - return dataset - except Exception: - # Stop the timer - self.print("Error in process_features()") - self.print(traceback.format_exc(), 0, 1) - - def process_training_flows(self, last_number_of_flows_when_trained): - """ - Process only the new flows in the DB since the last training. - Store the pandas df in self.flows - """ - try: - # Ensure the index is an integer - if last_number_of_flows_when_trained is None: - last_number_of_flows_when_trained = 0 - else: - last_number_of_flows_when_trained = int(last_number_of_flows_when_trained) - - # We get all the flows so far - flows = self.db.get_all_flows() - # Only process new flows since last training - new_flows = flows[last_number_of_flows_when_trained:] - - # Check how many **different** labels are in the DB - labels = self.db.get_labels() - if len(labels) == 1: - # Insert fake flows for both classes if needed - new_flows.append( - { - "starttime": 1594417039.029793, - "dur": "1.9424750804901123", - "saddr": "10.7.10.101", - "sport": "49733", - "daddr": "40.70.224.145", - "dport": "443", - "proto": "tcp", - "state": "SF", - "spkts": 17, - "dpkts": 27, - "sbytes": 25517, - "dbytes": 17247, - "appproto": "ssl", - "ground_truth_label": "Malicious", - "module_labels": { - "flowalerts-long-connection": "Malicious" - }, - } - ) - new_flows.append( - { - "starttime": 1382355032.706468, - "dur": "10.896695", - "saddr": "147.32.83.52", - "sport": "47956", - "daddr": "80.242.138.72", - "dport": "80", - "proto": "tcp", - "state": "SF", - "spkts": 1, - "dpkts": 0, - "sbytes": 100, - "dbytes": 67596, - "appproto": "http", - "ground_truth_label": "Benign", - "module_labels": { - "flowalerts-long-connection": "Benign" - }, - } - ) - - # Convert to pandas df - df_flows = pd.DataFrame(new_flows) - - # Process features - df_flows = self.process_features(df_flows) - - # Update the flow to the processed version - self.flows = df_flows - except Exception: - self.print("Error in process_flows()") - self.print(traceback.format_exc(), 0, 1) - - def process_flow(self, flow_to_process: dict): - """ - Process one flow. 
Only used during detection in testing - returns the pandas df with the processed flow - """ - try: - # Convert the flow to a pandas dataframe - raw_flow = pd.DataFrame(flow_to_process, index=[0]) - dflow = self.process_features(raw_flow) - if dflow.empty: - return None - # Update the flow to the processed version - return dflow - except Exception: - # Stop the timer - self.print("Error in process_flow()") - self.print(traceback.format_exc(), 0, 1) - - def detect(self, x_flow) -> Optional[numpy.ndarray]: - """ - Detects the given flow with the current model stored - and returns the predection array - """ - try: - # clean the flow - fields_to_drop = [ - "label", - "module_labels", - "uid", - "history", - "dir_", - "endtime", - "flow_source", - "ground_truth_label", - "detailed_ground_truth_label", - ] - # For argus binetflows this fails because ther is a field calle bytes that was not in other flows. It should be called allbytes. - # Error - ''' [Flow ML Detection] Error in detect() while processing - dur proto sport dport state pkts spkts dpkts bytes sbytes dbytes allbytes - 0 63.822830 0 56119 981 0.0 15 15 0 8764 1887 0 1887 - The feature names should match those that were passed during fit. - Feature names unseen at fit time: - - bytes - ''' - - # IF we delete here the filed bytes the error is - # [Flow ML Detection] Error in detect() while processing - # dur proto sport dport state pkts spkts dpkts sbytes dbytes allbytes - # 0 63.822830 0 56120 980 0.0 15 15 0 1887 0 1887 - # The feature names should match those that were passed during fit. - # Feature names must be in the same order as they were in fit. - - for field in fields_to_drop: - try: - x_flow = x_flow.drop(field, axis=1) - except (KeyError, ValueError): - pass - # Scale the flow - x_flow: numpy.ndarray = self.scaler.transform(x_flow) - pred: numpy.ndarray = self.clf.predict(x_flow) - return pred - except Exception as e: - self.print( - f"Error in detect() while processing " f"\n{x_flow}\n{e}" - ) - self.print(traceback.format_exc(), 0, 1) - - def store_model(self): - """ - Store the trained model on disk - """ - self.print("Storing the trained model and scaler on disk.", 0, 2) - with open(self.model_path, "wb") as f: - data = pickle.dumps(self.clf) - f.write(data) - with open(self.scaler_path, "wb") as g: - data = pickle.dumps(self.scaler) - g.write(data) - - def read_model(self): - """ - Read the trained model from disk - """ - try: - self.print("Reading the trained model from disk.", 0, 2) - with open(self.model_path, "rb") as f: - self.clf = pickle.load(f) - self.print("Reading the trained scaler from disk.", 0, 2) - with open(self.scaler_path, "rb") as g: - self.scaler = pickle.load(g) - except FileNotFoundError: - # If there is no model, create one empty - self.print( - "There was no model. " "Creating a new empty model.", 0, 2 - ) - self.clf = SGDClassifier( - warm_start=True, loss="hinge", penalty="l1" - ) - except EOFError: - self.print( - "Error reading model from disk. " - "Creating a new empty model.", - 0, - 2, - ) - self.clf = SGDClassifier( - warm_start=True, loss="hinge", penalty="l1" - ) - - def set_evidence_malicious_flow(self, flow: dict, twid: str): - confidence: float = 0.1 - description = ( - f"Flow with malicious characteristics by ML. 
Src IP" - f" {flow['saddr']}:{flow['sport']} to " - f"{flow['daddr']}:{flow['dport']}" - ) - twid_number = int(twid.replace("timewindow", "")) - evidence: Evidence = Evidence( - evidence_type=EvidenceType.MALICIOUS_FLOW, - attacker=Attacker( - direction=Direction.SRC, - ioc_type=IoCType.IP, - value=flow["saddr"], - ), - victim=Victim( - direction=Direction.DST, - ioc_type=IoCType.IP, - value=flow["daddr"], - ), - threat_level=ThreatLevel.LOW, - confidence=confidence, - description=description, - profile=ProfileID(ip=flow["saddr"]), - timewindow=TimeWindow(twid_number), - uid=[flow["uid"]], - timestamp=flow["starttime"], - method=Method.AI, - src_port=flow["sport"], - dst_port=flow["dport"], - ) - - self.db.set_evidence(evidence) - - def shutdown_gracefully(self): - # Confirm that the module is done processing - if self.mode == "train": - self.store_model() - - def pre_main(self): - utils.drop_root_privs() - # Load the model - self.read_model() - - def main(self): - if msg := self.get_msg("new_flow"): - # When a new flow arrives - msg = json.loads(msg["data"]) - self.twid = msg["twid"] - self.profileid = msg["profileid"] - self.flow = msg["flow"] - # These following extra fields are expected in testing. update the original - # flow dict to have them - self.flow.update( - { - "state": msg["interpreted_state"], - "label": msg["label"], - "module_labels": msg["module_labels"], - } - ) - - if self.mode == "train": - # We are training - - # Is the amount in the DB of labels enough to retrain? - # Use labeled flows - labels = self.db.get_labels() - sum_labeled_flows = sum(i[1] for i in labels) - - # The min labels to retrain is the min number of flows - # we should have seen so far in this capture to start training - # This is so we dont _start_ training with only 1 flow - - # Once we are over the start minimum, the second condition is - # to force to retrain every a minimum_labels_to_retrain number - # of flows. So we dont retrain every 1 flow. - if ( - sum_labeled_flows >= self.minimum_labels_to_start_train - ): - if (sum_labeled_flows - self.last_number_of_flows_when_trained >= self.minimum_labels_to_retrain): - # So for example we retrain every 50 labels and only when - # we have at least 50 labels - self.print( - f"Training the model with the last group of " - f"flows and labels. Total flows: {sum_labeled_flows}." - ) - # Process all flows in the DB and make them ready - # for pandas - self.process_training_flows(self.last_number_of_flows_when_trained) - # Train an algorithm - self.train(sum_labeled_flows) - self.last_number_of_flows_when_trained = sum_labeled_flows - - elif self.mode == "test": - # We are testing, which means using the model to detect - processed_flow = self.process_flow(self.flow) - - # After processing the flow, it may happen that we - # delete icmp/arp/etc so the dataframe can be empty - if processed_flow is not None and not processed_flow.empty: - # Predict - pred: numpy.ndarray = self.detect(processed_flow) - if not pred: - # an error occurred - return - - label = self.flow["label"] - if label and label != "unknown" and label != pred[0]: - # If the user specified a label in test mode, - # and the label is diff from the prediction, - # print in debug mode - self.print( - f"Predicted {pred[0]} for ground-truth label" - f' {label}. 
Flow {self.flow["saddr"]}:' - f'{self.flow["sport"]} ->' - f' {self.flow["daddr"]}:' - f'{self.flow["dport"]}/' - f'{self.flow["proto"]}', - 0, - 3, - ) - if pred[0] == "Malicious": - # Generate an alert - self.set_evidence_malicious_flow(self.flow, self.twid) - self.print( - f"Prediction {pred[0]} for label {label}" - f' flow {self.flow["saddr"]}:' - f'{self.flow["sport"]} -> ' - f'{self.flow["daddr"]}:' - f'{self.flow["dport"]}/' - f'{self.flow["proto"]}', - 0, - 2, - ) +import matplotlib.pyplot as plt +import sys +import numpy as np + +def process_file(file_path): + # Initialize the counters for the values + FPR_values = [] + FNR_values = [] + TNR_values = [] + TPR_values = [] + F1_values = [] + accuracy_values = [] + precision_values = [] + MCC_values = [] + recall_values = [] + + # Read the file and extract the data + with open(file_path, 'r') as file: + for line in file: + if "TP:" in line: + # Extract the values from the line + parts = line.split(',') + TP = int(parts[0].split(':')[1].strip()) + TN = int(parts[1].split(':')[1].strip()) + FP = int(parts[2].split(':')[1].strip()) + FN = int(parts[3].split(':')[1].strip()) + + # Calculate metrics + FPR = FP / (FP + TN) if (FP + TN) != 0 else 0 + FNR = FN / (FN + TP) if (FN + TP) != 0 else 0 + TNR = TN / (TN + FP) if (TN + FP) != 0 else 0 + TPR = TP / (TP + FN) if (TP + FN) != 0 else 0 + Precision = TP / (TP + FP) if (TP + FP) != 0 else 0 + Recall = TPR # Recall is the same as TPR + F1 = 2 * (Precision * Recall) / (Precision + Recall) if (Precision + Recall) != 0 else 0 + Accuracy = (TP + TN) / (TP + TN + FP + FN) + MCC = ((TP * TN) - (FP * FN)) / np.sqrt((TP + FP) * (TP + FN) * (TN + FP) * (TN + FN)) if ((TP + FP) * (TP + FN) * (TN + FP) * (TN + FN)) != 0 else 0 + + # Append the values to the respective lists + FPR_values.append(FPR) + FNR_values.append(FNR) + TNR_values.append(TNR) + TPR_values.append(TPR) + F1_values.append(F1) + accuracy_values.append(Accuracy) + precision_values.append(Precision) + MCC_values.append(MCC) + recall_values.append(Recall) + + return FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values + +def plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values): + # Separate the values into two groups based on their proximity to 0 or 1 + close_to_0 = { + 'FPR': [], 'FNR': [] + } + close_to_1 = { + 'TNR': [], 'TPR': [], 'F1': [], 'accuracy': [], 'precision': [], 'MCC': [], 'recall': [] + } + + # Categorize the metrics into two groups + for i in range(len(FPR_values)): + close_to_0['FPR'].append(FPR_values[i]) + close_to_0['FNR'].append(FNR_values[i]) + + close_to_1['TNR'].append(TNR_values[i]) + close_to_1['TPR'].append(TPR_values[i]) + close_to_1['F1'].append(F1_values[i]) + close_to_1['accuracy'].append(accuracy_values[i]) + close_to_1['precision'].append(precision_values[i]) + close_to_1['MCC'].append(MCC_values[i]) + close_to_1['recall'].append(recall_values[i]) + + # Plot metrics for values close to 0 + plot_single_group(close_to_0, 'metrics_plot_close_to_0.png', is_close_to_0=True) + + # Plot metrics for values close to 1 + plot_single_group(close_to_1, 'metrics_plot_close_to_1.png') + + # Print the final values + print("\nFinal Metric Values:") + print(f"Final FPR: {FPR_values[-1]:.4f}") + print(f"Final FNR: {FNR_values[-1]:.4f}") + print(f"Final TNR: {TNR_values[-1]:.4f}") + print(f"Final TPR: {TPR_values[-1]:.4f}") + print(f"Final F1 Score: {F1_values[-1]:.4f}") + 
print(f"Final Accuracy: {accuracy_values[-1]:.4f}") + print(f"Final Precision: {precision_values[-1]:.4f}") + print(f"Final MCC: {MCC_values[-1]:.4f}") + print(f"Final Recall: {recall_values[-1]:.4f}") + +def plot_single_group(metrics_dict, output_filename, is_close_to_0=False): + plt.figure(figsize=(12, 8)) + + # Only plot the metrics that exist in the dictionary + if 'FPR' in metrics_dict: + plt.plot(metrics_dict['FPR'], label='False Positive Rate (FPR)', marker='o') + if 'FNR' in metrics_dict: + plt.plot(metrics_dict['FNR'], label='False Negative Rate (FNR)', marker='o') + if 'TNR' in metrics_dict: + plt.plot(metrics_dict['TNR'], label='True Negative Rate (TNR)', marker='o') + if 'TPR' in metrics_dict: + plt.plot(metrics_dict['TPR'], label='True Positive Rate (TPR)', marker='o') + if 'F1' in metrics_dict: + plt.plot(metrics_dict['F1'], label='F1 Score', marker='o') + if 'accuracy' in metrics_dict: + plt.plot(metrics_dict['accuracy'], label='Accuracy', marker='o') + if 'precision' in metrics_dict: + plt.plot(metrics_dict['precision'], label='Precision', marker='o') + if 'MCC' in metrics_dict: + plt.plot(metrics_dict['MCC'], label='Matthews Correlation Coefficient (MCC)', marker='o') + if 'recall' in metrics_dict: + plt.plot(metrics_dict['recall'], label='Recall (TPR)', marker='o') + + # Apply log scale by default + plt.yscale('log') + + # If the plot is close to 0, set custom ticks + if is_close_to_0: + # Add more ticks between 0 and 1 (using a logarithmic scale) + plt.yticks([0.01, 0.1, 1, 10, 100], ['0.01', '0.1', '1', '10', '100']) + + plt.xlabel('Index') + plt.ylabel('Metric Value') + plt.title(f'Evaluation Metrics Over Time ({output_filename.split("_")[2].replace(".png", "")})') + plt.legend() + + # Save the plot + plt.savefig(output_filename) + plt.close() + +def main(): + if len(sys.argv) != 2: + print("Usage: python script.py ") + sys.exit(1) + + file_path = sys.argv[1] + + FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values = process_file(file_path) + plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values) + +if __name__ == "__main__": + main() From 9ddaf31f83a34962af33188b0f88176dc8ec33fd Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 18:20:52 +0000 Subject: [PATCH 224/498] Fix plots --- .../plot_testing_performance.py | 25 ++++++++++++------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/modules/flowmldetection/plot_testing_performance.py b/modules/flowmldetection/plot_testing_performance.py index 8f9e12cd8..69b8c96a8 100644 --- a/modules/flowmldetection/plot_testing_performance.py +++ b/modules/flowmldetection/plot_testing_performance.py @@ -71,11 +71,11 @@ def plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accu close_to_1['MCC'].append(MCC_values[i]) close_to_1['recall'].append(recall_values[i]) - # Plot metrics for values close to 0 + # Plot metrics for values close to 0 (linear scale) plot_single_group(close_to_0, 'metrics_plot_close_to_0.png', is_close_to_0=True) - # Plot metrics for values close to 1 - plot_single_group(close_to_1, 'metrics_plot_close_to_1.png') + # Plot metrics for values close to 1 (log scale) + plot_single_group(close_to_1, 'metrics_plot_close_to_1.png', is_close_to_0=False) # Print the final values print("\nFinal Metric Values:") @@ -112,14 +112,21 @@ def plot_single_group(metrics_dict, output_filename, is_close_to_0=False): if 'recall' in 
metrics_dict: plt.plot(metrics_dict['recall'], label='Recall (TPR)', marker='o') - # Apply log scale by default - plt.yscale('log') + # If the plot is close to 1, apply log scale + if not is_close_to_0: + plt.yscale('log') - # If the plot is close to 0, set custom ticks + # If the plot is close to 0, set dynamic Y-ticks based on the min/max values of the series if is_close_to_0: - # Manually set more Y-ticks for better visibility - plt.ylim(0.0001, 1) # Set Y-axis limits between 0.0001 and 1 - plt.yticks([0.0001, 0.001, 0.01, 0.1, 1], ['0.0001', '0.001', '0.01', '0.1', '1']) # Adjust Y-ticks + min_val = min(min(metrics_dict['FPR']), min(metrics_dict['FNR'])) + max_val = max(max(metrics_dict['FPR']), max(metrics_dict['FNR'])) + + # Avoid log(0), so set the minimum limit a little higher than zero + if min_val == 0: + min_val = 1e-4 # Avoid zero values on the logarithmic scale + + plt.ylim(min_val, max_val) # Set Y-axis limits based on the data range + plt.yticks(np.logspace(np.log10(min_val), np.log10(max_val), num=6)) # Set ticks logarithmically plt.xlabel('Index') plt.ylabel('Metric Value') From 878812adb8ffbdb24c82525a2b45580dd2aad4d5 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 18:23:02 +0000 Subject: [PATCH 225/498] Fix plots --- modules/flowmldetection/plot_testing_performance.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/flowmldetection/plot_testing_performance.py b/modules/flowmldetection/plot_testing_performance.py index 69b8c96a8..de4ada38b 100644 --- a/modules/flowmldetection/plot_testing_performance.py +++ b/modules/flowmldetection/plot_testing_performance.py @@ -123,10 +123,10 @@ def plot_single_group(metrics_dict, output_filename, is_close_to_0=False): # Avoid log(0), so set the minimum limit a little higher than zero if min_val == 0: - min_val = 1e-4 # Avoid zero values on the logarithmic scale + min_val = 1e-8 # Avoid zero values on the logarithmic scale plt.ylim(min_val, max_val) # Set Y-axis limits based on the data range - plt.yticks(np.logspace(np.log10(min_val), np.log10(max_val), num=6)) # Set ticks logarithmically + plt.yticks(np.logspace(np.log10(min_val), np.log10(max_val), num=60)) # Set ticks logarithmically plt.xlabel('Index') plt.ylabel('Metric Value') From b1909a50ed00fe86cebd6b037556ee7f5a419403 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 18:25:58 +0000 Subject: [PATCH 226/498] Change plot names --- modules/flowmldetection/plot_testing_performance.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/flowmldetection/plot_testing_performance.py b/modules/flowmldetection/plot_testing_performance.py index de4ada38b..1b4152c6e 100644 --- a/modules/flowmldetection/plot_testing_performance.py +++ b/modules/flowmldetection/plot_testing_performance.py @@ -72,10 +72,10 @@ def plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accu close_to_1['recall'].append(recall_values[i]) # Plot metrics for values close to 0 (linear scale) - plot_single_group(close_to_0, 'metrics_plot_close_to_0.png', is_close_to_0=True) + plot_single_group(close_to_0, 'performance_metrics_testing_close_to_0.png', is_close_to_0=True) # Plot metrics for values close to 1 (log scale) - plot_single_group(close_to_1, 'metrics_plot_close_to_1.png', is_close_to_0=False) + plot_single_group(close_to_1, 'performnace_metrics_teting_close_to_1.png', is_close_to_0=False) # Print the final values print("\nFinal Metric Values:") From 213b6a5b6597b8b568ee45755d44b5e334c668b7 Mon Sep 17 
00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 18:26:09 +0000 Subject: [PATCH 227/498] Rename file --- .../{plot_train_score.py => plot_train_performance.py} | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) rename modules/flowmldetection/{plot_train_score.py => plot_train_performance.py} (97%) diff --git a/modules/flowmldetection/plot_train_score.py b/modules/flowmldetection/plot_train_performance.py similarity index 97% rename from modules/flowmldetection/plot_train_score.py rename to modules/flowmldetection/plot_train_performance.py index 8437e968a..80e13e951 100644 --- a/modules/flowmldetection/plot_train_score.py +++ b/modules/flowmldetection/plot_train_performance.py @@ -28,7 +28,7 @@ def plot_log_data(file_path): # Get the directory of the log file to store the plot in the same folder dir_name = os.path.dirname(file_path) - plot_file = os.path.join(dir_name, 'log_data_plot_with_two_scales.png') + plot_file = os.path.join(dir_name, 'performance_metrics_training.png') # Plotting the values fig, ax1 = plt.subplots(figsize=(10, 6)) From 20db5dbd1db02d06af5a6a9d7b6bb27e0e40a66f Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 18:31:32 +0000 Subject: [PATCH 228/498] Recover good flowmldetection deleted by mistake --- modules/flowmldetection/flowmldetection.py | 709 ++++++++++++++++----- 1 file changed, 566 insertions(+), 143 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 37f076110..5e4e9aa46 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -1,143 +1,566 @@ -import matplotlib.pyplot as plt -import sys -import numpy as np - -def process_file(file_path): - # Initialize the counters for the values - FPR_values = [] - FNR_values = [] - TNR_values = [] - TPR_values = [] - F1_values = [] - accuracy_values = [] - precision_values = [] - MCC_values = [] - recall_values = [] - - # Read the file and extract the data - with open(file_path, 'r') as file: - for line in file: - if "TP:" in line: - # Extract the values from the line - parts = line.split(',') - TP = int(parts[0].split(':')[1].strip()) - TN = int(parts[1].split(':')[1].strip()) - FP = int(parts[2].split(':')[1].strip()) - FN = int(parts[3].split(':')[1].strip()) - - # Calculate metrics - FPR = FP / (FP + TN) if (FP + TN) != 0 else 0 - FNR = FN / (FN + TP) if (FN + TP) != 0 else 0 - TNR = TN / (TN + FP) if (TN + FP) != 0 else 0 - TPR = TP / (TP + FN) if (TP + FN) != 0 else 0 - Precision = TP / (TP + FP) if (TP + FP) != 0 else 0 - Recall = TPR # Recall is the same as TPR - F1 = 2 * (Precision * Recall) / (Precision + Recall) if (Precision + Recall) != 0 else 0 - Accuracy = (TP + TN) / (TP + TN + FP + FN) - MCC = ((TP * TN) - (FP * FN)) / np.sqrt((TP + FP) * (TP + FN) * (TN + FP) * (TN + FN)) if ((TP + FP) * (TP + FN) * (TN + FP) * (TN + FN)) != 0 else 0 - - # Append the values to the respective lists - FPR_values.append(FPR) - FNR_values.append(FNR) - TNR_values.append(TNR) - TPR_values.append(TPR) - F1_values.append(F1) - accuracy_values.append(Accuracy) - precision_values.append(Precision) - MCC_values.append(MCC) - recall_values.append(Recall) - - return FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values - -def plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values): - # Separate the values into two groups based on their proximity to 0 or 
1 - close_to_0 = { - 'FPR': [], 'FNR': [] - } - close_to_1 = { - 'TNR': [], 'TPR': [], 'F1': [], 'accuracy': [], 'precision': [], 'MCC': [], 'recall': [] - } - - # Categorize the metrics into two groups - for i in range(len(FPR_values)): - close_to_0['FPR'].append(FPR_values[i]) - close_to_0['FNR'].append(FNR_values[i]) - - close_to_1['TNR'].append(TNR_values[i]) - close_to_1['TPR'].append(TPR_values[i]) - close_to_1['F1'].append(F1_values[i]) - close_to_1['accuracy'].append(accuracy_values[i]) - close_to_1['precision'].append(precision_values[i]) - close_to_1['MCC'].append(MCC_values[i]) - close_to_1['recall'].append(recall_values[i]) - - # Plot metrics for values close to 0 - plot_single_group(close_to_0, 'metrics_plot_close_to_0.png', is_close_to_0=True) - - # Plot metrics for values close to 1 - plot_single_group(close_to_1, 'metrics_plot_close_to_1.png') - - # Print the final values - print("\nFinal Metric Values:") - print(f"Final FPR: {FPR_values[-1]:.4f}") - print(f"Final FNR: {FNR_values[-1]:.4f}") - print(f"Final TNR: {TNR_values[-1]:.4f}") - print(f"Final TPR: {TPR_values[-1]:.4f}") - print(f"Final F1 Score: {F1_values[-1]:.4f}") - print(f"Final Accuracy: {accuracy_values[-1]:.4f}") - print(f"Final Precision: {precision_values[-1]:.4f}") - print(f"Final MCC: {MCC_values[-1]:.4f}") - print(f"Final Recall: {recall_values[-1]:.4f}") - -def plot_single_group(metrics_dict, output_filename, is_close_to_0=False): - plt.figure(figsize=(12, 8)) - - # Only plot the metrics that exist in the dictionary - if 'FPR' in metrics_dict: - plt.plot(metrics_dict['FPR'], label='False Positive Rate (FPR)', marker='o') - if 'FNR' in metrics_dict: - plt.plot(metrics_dict['FNR'], label='False Negative Rate (FNR)', marker='o') - if 'TNR' in metrics_dict: - plt.plot(metrics_dict['TNR'], label='True Negative Rate (TNR)', marker='o') - if 'TPR' in metrics_dict: - plt.plot(metrics_dict['TPR'], label='True Positive Rate (TPR)', marker='o') - if 'F1' in metrics_dict: - plt.plot(metrics_dict['F1'], label='F1 Score', marker='o') - if 'accuracy' in metrics_dict: - plt.plot(metrics_dict['accuracy'], label='Accuracy', marker='o') - if 'precision' in metrics_dict: - plt.plot(metrics_dict['precision'], label='Precision', marker='o') - if 'MCC' in metrics_dict: - plt.plot(metrics_dict['MCC'], label='Matthews Correlation Coefficient (MCC)', marker='o') - if 'recall' in metrics_dict: - plt.plot(metrics_dict['recall'], label='Recall (TPR)', marker='o') - - # Apply log scale by default - plt.yscale('log') - - # If the plot is close to 0, set custom ticks - if is_close_to_0: - # Add more ticks between 0 and 1 (using a logarithmic scale) - plt.yticks([0.01, 0.1, 1, 10, 100], ['0.01', '0.1', '1', '10', '100']) - - plt.xlabel('Index') - plt.ylabel('Metric Value') - plt.title(f'Evaluation Metrics Over Time ({output_filename.split("_")[2].replace(".png", "")})') - plt.legend() - - # Save the plot - plt.savefig(output_filename) - plt.close() - -def main(): - if len(sys.argv) != 2: - print("Usage: python script.py ") - sys.exit(1) - - file_path = sys.argv[1] - - FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values = process_file(file_path) - plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values) - -if __name__ == "__main__": - main() +# SPDX-FileCopyrightText: 2021 Sebastian Garcia +from typing import Optional + +# SPDX-License-Identifier: GPL-2.0-only +import numpy +from 
sklearn.linear_model import SGDClassifier +from sklearn.preprocessing import StandardScaler +import pickle +import pandas as pd +import json +import traceback +import warnings +import os + +from slips_files.common.parsers.config_parser import ConfigParser +from slips_files.common.slips_utils import utils +from slips_files.common.abstracts.module import IModule +from slips_files.core.structures.evidence import ( + Evidence, + ProfileID, + TimeWindow, + Attacker, + ThreatLevel, + EvidenceType, + IoCType, + Direction, + Victim, + Method, +) + +# This horrible hack is only to stop sklearn from printing those warnings +def warn(*args, **kwargs): + pass + + +warnings.warn = warn + + +class FlowMLDetection(IModule): + # Name: short name of the module. Do not use spaces + name = "Flow ML Detection" + description = ( + "Train or test a Machine Learning model to detect malicious flows" + ) + authors = ["Sebastian Garcia"] + + def init(self): + # Subscribe to the channel + self.c1 = self.db.subscribe("new_flow") + self.channels = {"new_flow": self.c1} + self.fieldseparator = self.db.get_field_separator() + # Set the output queue of our database instance + # Read the configuration + self.read_configuration() + # Minum amount of new labels needed to start the train + self.minimum_labels_to_start_train = 50 + # Minum amount of new labels needed to retrain + self.minimum_labels_to_retrain = 50 + # The number of flows when last trained. Used internally only to know + # when to retrain + self.last_number_of_flows_when_trained = 0 + # The scaler trained during training and to use during testing + self.scaler = StandardScaler() + self.model_path = "./modules/flowmldetection/model.bin" + self.scaler_path = "./modules/flowmldetection/scaler.bin" + + # Initialize the training log file + self.training_log_path = "./modules/flowmldetection/training.log" + with open(self.training_log_path, "w") as log_file: + log_file.write("Training Log Initialized\n") + + def read_configuration(self): + conf = ConfigParser() + self.mode = conf.get_ml_mode() + # This is the global label in the configuration, + # in case the flows do not have a label themselves + self.label = conf.label() + + def write_to_training_log(self, message: str): + """ + Write a message to the training log file. + """ + try: + with open(self.training_log_path, "a") as log_file: + log_file.write(message + "\n") + except Exception as e: + self.print(f"Error writing to training log: {e}", 0, 1) + + def train(self, sum_labeled_flows): + """ + Train a model based on the flows we receive and the labels + """ + try: + # Create X_flow with the current flows minus the label + X_flow = self.flows.drop("ground_truth_label", axis=1) + # Drop the detailed labels + X_flow = X_flow.drop("detailed_ground_truth_label", axis=1) + # Drop the module_labels + X_flow = X_flow.drop("module_labels", axis=1) + # Create y_flow with the label + y_flow = numpy.full(X_flow.shape[0], self.flows.ground_truth_label) + + # Normalize this batch of data so far. 
This can get progressivle slow + X_flow = self.scaler.fit_transform(X_flow) + + # Count the number of labels of each type in this epoc + epoch_label_counts = { + "Background": (y_flow == "Background").sum(), + "Malicious": (y_flow == "Malicious").sum(), + "Benign": (y_flow == "Benign").sum(), + } + + # Train + try: + # Online incremental learning + self.clf.partial_fit( + X_flow, y_flow, classes=["Background", "Malicious", "Benign"] + ) + except Exception: + self.print("Error while calling clf.train()") + self.print(traceback.format_exc(), 0, 1) + + # See score so far in training + score = self.clf.score(X_flow, y_flow) + + #self.print(f" Training Score: {score}", 1, 0) + #self.print(f' Model Parameters: {self.clf.coef_}', 1, 0) + + # Store the models on disk + self.store_model() + + # Log training information + self.write_to_training_log(f"Training completed. Background: {epoch_label_counts['Background']}. Benign: {epoch_label_counts['Benign']}. Malicious: {epoch_label_counts['Malicious']}. Total labels: {sum_labeled_flows}. Score: {score}") + #self.write_to_training_log(f"Model parameters: {self.clf.coef_}") + except Exception: + self.print("Error in train().", 0, 1) + self.print(traceback.format_exc(), 0, 1) + self.write_to_training_log("Error occurred during training.") + + def process_features(self, dataset): + """ + Discards some features of the dataset and can create new. + Clean the dataset + """ + try: + # Discard some type of flows that dont have ports + to_discard = ["arp", "ARP", "icmp", "igmp", "ipv6-icmp", ""] + for proto in to_discard: + dataset = dataset[dataset.proto != proto] + + # If te proto is in the list to delete and there is only one flow, then the dataset will be empty + if dataset.empty: + # DataFrame is empty now, so return empty + return dataset + + # For now, discard these + to_drop = [ + "appproto", + "daddr", + "saddr", + "starttime", + "type_", + "smac", + "dmac", + "history", + "uid", + "dir_", + "endtime", + "flow_source", + ] + for field in to_drop: + try: + dataset = dataset.drop(field, axis=1) + except (ValueError, KeyError): + pass + + # When flows are read from Slips sqlite, + # the state is not transformed to 'Established' or + # 'Not Established', it is still 'S0' and others + # So transform here + dataset["state"] = dataset.apply( + lambda row: self.db.get_final_state_from_flags( + row["state"], (row["spkts"] + row["dpkts"]) + ), + axis=1, + ) + + # Convert state to categorical + dataset.state = dataset.state.str.replace( + r"(^.*Not Established.*$)", "0", regex=True + ) + dataset.state = dataset.state.str.replace( + r"(^.*Established.*$)", "1", regex=True + ) + + # Convert categories to floats + dataset.state = dataset.state.astype("float64") + + # Convert proto to categorical. For now we only have few states, so we can hardcode... + # We dont use the data to create categories because in testing mode + # we dont see all the protocols + # Also we dont store the Categorizer because the user can retrain + # with its own data. 
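
The state column above gets a two-step encoding: get_final_state_from_flags() first summarizes the raw flags into 'Established'/'Not Established', and the two str.replace() calls then map those to binary category strings. A minimal sketch of the second step (illustrative only, not part of the patch; it assumes a plain pandas Series):

    import pandas as pd

    state = pd.Series(["Established", "Not Established"])
    # Order matters: "Not Established" contains "Established" as a
    # substring, so it must be rewritten to "0" before the broader
    # pattern runs, mirroring the order used in process_features().
    state = state.str.replace(r"(^.*Not Established.*$)", "0", regex=True)
    state = state.str.replace(r"(^.*Established.*$)", "1", regex=True)
    print(state.astype("float64").tolist())  # [1.0, 0.0]
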
+ dataset.proto = dataset.proto.str.lower() + dataset.proto = dataset.proto.str.replace( + r"(^.*tcp.*$)", "0", regex=True + ) + dataset.proto = dataset.proto.str.replace( + r"(^.*udp.*$)", "1", regex=True + ) + dataset.proto = dataset.proto.str.replace( + r"(^.*icmp.*$)", "2", regex=True + ) + dataset.proto = dataset.proto.str.replace( + r"(^.*icmp-ipv6.*$)", "3", regex=True + ) + dataset.proto = dataset.proto.str.replace( + r"(^.*arp.*$)", "4", regex=True + ) + + dataset["allbytes"] = dataset["sbytes"] + dataset["dbytes"] + dataset["pkts"] = dataset["spkts"] + dataset["dpkts"] + + fields_to_convert_to_float = [ + dataset.proto, + dataset.dport, + dataset.sport, + dataset.dur, + dataset.pkts, + dataset.spkts, + dataset.allbytes, + dataset.sbytes, + dataset.state, + ] + for field in fields_to_convert_to_float: + try: + field = field.astype("float64") + except (ValueError, AttributeError): + pass + + return dataset + except Exception: + # Stop the timer + self.print("Error in process_features()") + self.print(traceback.format_exc(), 0, 1) + + def process_training_flows(self, last_number_of_flows_when_trained): + """ + Process only the new flows in the DB since the last training. + Store the pandas df in self.flows + """ + try: + # Ensure the index is an integer + if last_number_of_flows_when_trained is None: + last_number_of_flows_when_trained = 0 + else: + last_number_of_flows_when_trained = int(last_number_of_flows_when_trained) + + # We get all the flows so far + flows = self.db.get_all_flows() + # Only process new flows since last training + new_flows = flows[last_number_of_flows_when_trained:] + + # Check how many **different** labels are in the DB + labels = self.db.get_labels() + if len(labels) == 1: + # Insert fake flows for both classes if needed + new_flows.append( + { + "starttime": 1594417039.029793, + "dur": "1.9424750804901123", + "saddr": "10.7.10.101", + "sport": "49733", + "daddr": "40.70.224.145", + "dport": "443", + "proto": "tcp", + "state": "SF", + "spkts": 17, + "dpkts": 27, + "sbytes": 25517, + "dbytes": 17247, + "appproto": "ssl", + "ground_truth_label": "Malicious", + "module_labels": { + "flowalerts-long-connection": "Malicious" + }, + } + ) + new_flows.append( + { + "starttime": 1382355032.706468, + "dur": "10.896695", + "saddr": "147.32.83.52", + "sport": "47956", + "daddr": "80.242.138.72", + "dport": "80", + "proto": "tcp", + "state": "SF", + "spkts": 1, + "dpkts": 0, + "sbytes": 100, + "dbytes": 67596, + "appproto": "http", + "ground_truth_label": "Benign", + "module_labels": { + "flowalerts-long-connection": "Benign" + }, + } + ) + + # Convert to pandas df + df_flows = pd.DataFrame(new_flows) + + # Process features + df_flows = self.process_features(df_flows) + + # Update the flow to the processed version + self.flows = df_flows + except Exception: + self.print("Error in process_flows()") + self.print(traceback.format_exc(), 0, 1) + + def process_flow(self, flow_to_process: dict): + """ + Process one flow. 
Only used during detection in testing + returns the pandas df with the processed flow + """ + try: + # Convert the flow to a pandas dataframe + raw_flow = pd.DataFrame(flow_to_process, index=[0]) + dflow = self.process_features(raw_flow) + if dflow.empty: + return None + # Update the flow to the processed version + return dflow + except Exception: + # Stop the timer + self.print("Error in process_flow()") + self.print(traceback.format_exc(), 0, 1) + + def detect(self, x_flow) -> Optional[numpy.ndarray]: + """ + Detects the given flow with the current model stored + and returns the predection array + """ + try: + # clean the flow + fields_to_drop = [ + "label", + "module_labels", + "uid", + "history", + "dir_", + "endtime", + "flow_source", + "ground_truth_label", + "detailed_ground_truth_label", + ] + # For argus binetflows this fails because ther is a field calle bytes that was not in other flows. It should be called allbytes. + # Error + ''' [Flow ML Detection] Error in detect() while processing + dur proto sport dport state pkts spkts dpkts bytes sbytes dbytes allbytes + 0 63.822830 0 56119 981 0.0 15 15 0 8764 1887 0 1887 + The feature names should match those that were passed during fit. + Feature names unseen at fit time: + - bytes + ''' + + # IF we delete here the filed bytes the error is + # [Flow ML Detection] Error in detect() while processing + # dur proto sport dport state pkts spkts dpkts sbytes dbytes allbytes + # 0 63.822830 0 56120 980 0.0 15 15 0 1887 0 1887 + # The feature names should match those that were passed during fit. + # Feature names must be in the same order as they were in fit. + + for field in fields_to_drop: + try: + x_flow = x_flow.drop(field, axis=1) + except (KeyError, ValueError): + pass + # Scale the flow + x_flow: numpy.ndarray = self.scaler.transform(x_flow) + pred: numpy.ndarray = self.clf.predict(x_flow) + return pred + except Exception as e: + self.print( + f"Error in detect() while processing " f"\n{x_flow}\n{e}" + ) + self.print(traceback.format_exc(), 0, 1) + + def store_model(self): + """ + Store the trained model on disk + """ + self.print("Storing the trained model and scaler on disk.", 0, 2) + with open(self.model_path, "wb") as f: + data = pickle.dumps(self.clf) + f.write(data) + with open(self.scaler_path, "wb") as g: + data = pickle.dumps(self.scaler) + g.write(data) + + def read_model(self): + """ + Read the trained model from disk + """ + try: + self.print("Reading the trained model from disk.", 0, 2) + with open(self.model_path, "rb") as f: + self.clf = pickle.load(f) + self.print("Reading the trained scaler from disk.", 0, 2) + with open(self.scaler_path, "rb") as g: + self.scaler = pickle.load(g) + except FileNotFoundError: + # If there is no model, create one empty + self.print( + "There was no model. " "Creating a new empty model.", 0, 2 + ) + self.clf = SGDClassifier( + warm_start=True, loss="hinge", penalty="l1" + ) + except EOFError: + self.print( + "Error reading model from disk. " + "Creating a new empty model.", + 0, + 2, + ) + self.clf = SGDClassifier( + warm_start=True, loss="hinge", penalty="l1" + ) + + def set_evidence_malicious_flow(self, flow: dict, twid: str): + confidence: float = 0.1 + description = ( + f"Flow with malicious characteristics by ML. 
+
+    def store_model(self):
+        """
+        Store the trained model on disk
+        """
+        self.print("Storing the trained model and scaler on disk.", 0, 2)
+        with open(self.model_path, "wb") as f:
+            data = pickle.dumps(self.clf)
+            f.write(data)
+        with open(self.scaler_path, "wb") as g:
+            data = pickle.dumps(self.scaler)
+            g.write(data)
+
+    def read_model(self):
+        """
+        Read the trained model from disk
+        """
+        try:
+            self.print("Reading the trained model from disk.", 0, 2)
+            with open(self.model_path, "rb") as f:
+                self.clf = pickle.load(f)
+            self.print("Reading the trained scaler from disk.", 0, 2)
+            with open(self.scaler_path, "rb") as g:
+                self.scaler = pickle.load(g)
+        except FileNotFoundError:
+            # If there is no model, create a new empty one
+            self.print(
+                "There was no model. " "Creating a new empty model.", 0, 2
+            )
+            self.clf = SGDClassifier(
+                warm_start=True, loss="hinge", penalty="l1"
+            )
+        except EOFError:
+            self.print(
+                "Error reading model from disk. "
+                "Creating a new empty model.",
+                0,
+                2,
+            )
+            self.clf = SGDClassifier(
+                warm_start=True, loss="hinge", penalty="l1"
+            )
+
+    def set_evidence_malicious_flow(self, flow: dict, twid: str):
+        confidence: float = 0.1
+        description = (
+            f"Flow with malicious characteristics by ML. Src IP"
+            f" {flow['saddr']}:{flow['sport']} to "
+            f"{flow['daddr']}:{flow['dport']}"
+        )
+        twid_number = int(twid.replace("timewindow", ""))
+        evidence: Evidence = Evidence(
+            evidence_type=EvidenceType.MALICIOUS_FLOW,
+            attacker=Attacker(
+                direction=Direction.SRC,
+                ioc_type=IoCType.IP,
+                value=flow["saddr"],
+            ),
+            victim=Victim(
+                direction=Direction.DST,
+                ioc_type=IoCType.IP,
+                value=flow["daddr"],
+            ),
+            threat_level=ThreatLevel.LOW,
+            confidence=confidence,
+            description=description,
+            profile=ProfileID(ip=flow["saddr"]),
+            timewindow=TimeWindow(twid_number),
+            uid=[flow["uid"]],
+            timestamp=flow["starttime"],
+            method=Method.AI,
+            src_port=flow["sport"],
+            dst_port=flow["dport"],
+        )
+
+        self.db.set_evidence(evidence)
+
+    def shutdown_gracefully(self):
+        # Confirm that the module is done processing
+        if self.mode == "train":
+            self.store_model()
+
+    def pre_main(self):
+        utils.drop_root_privs()
+        # Load the model
+        self.read_model()
+
+    def main(self):
+        if msg := self.get_msg("new_flow"):
+            # When a new flow arrives
+            msg = json.loads(msg["data"])
+            self.twid = msg["twid"]
+            self.profileid = msg["profileid"]
+            self.flow = msg["flow"]
+            # The following extra fields are expected in testing. Update
+            # the original flow dict to have them
+            self.flow.update(
+                {
+                    "state": msg["interpreted_state"],
+                    "label": msg["label"],
+                    "module_labels": msg["module_labels"],
+                }
+            )
+
+            if self.mode == "train":
+                # We are training
+
+                # Is the amount in the DB of labels enough to retrain?
+                # Use labeled flows
+                labels = self.db.get_labels()
+                sum_labeled_flows = sum(i[1] for i in labels)
+
+                # The min labels to retrain is the min number of flows
+                # we should have seen so far in this capture to start training
+                # This is so we don't _start_ training with only 1 flow
+
+                # Once we are over the start minimum, the second condition is
+                # to force a retrain after every minimum_labels_to_retrain
+                # flows, so we don't retrain on every single flow
+                if (
+                    sum_labeled_flows >= self.minimum_labels_to_start_train
+                ):
+                    if (sum_labeled_flows - self.last_number_of_flows_when_trained >= self.minimum_labels_to_retrain):
+                        # So, for example, we retrain every 50 labels, and
+                        # only when we have at least 50 labels
+                        self.print(
+                            f"Training the model with the last group of "
+                            f"flows and labels. Total flows: {sum_labeled_flows}."
+                        )
+                        # Process all flows in the DB and make them ready
+                        # for pandas
+                        self.process_training_flows(self.last_number_of_flows_when_trained)
+                        # Train an algorithm
+                        self.train(sum_labeled_flows)
+                        self.last_number_of_flows_when_trained = sum_labeled_flows
+
+            elif self.mode == "test":
+                # We are testing, which means using the model to detect
+                processed_flow = self.process_flow(self.flow)
+
+                # After processing the flow, it may happen that we
+                # delete icmp/arp/etc so the dataframe can be empty
+                if processed_flow is not None and not processed_flow.empty:
+                    # Predict
+                    pred: numpy.ndarray = self.detect(processed_flow)
+                    if not pred:
+                        # an error occurred
+                        return
+
+                    label = self.flow["label"]
+                    if label and label != "unknown" and label != pred[0]:
+                        # If the user specified a label in test mode,
+                        # and the label is diff from the prediction,
+                        # print in debug mode
+                        self.print(
+                            f"Predicted {pred[0]} for ground-truth label"
+                            f' {label}. 
Flow {self.flow["saddr"]}:' + f'{self.flow["sport"]} ->' + f' {self.flow["daddr"]}:' + f'{self.flow["dport"]}/' + f'{self.flow["proto"]}', + 0, + 3, + ) + if pred[0] == "Malicious": + # Generate an alert + self.set_evidence_malicious_flow(self.flow, self.twid) + self.print( + f"Prediction {pred[0]} for label {label}" + f' flow {self.flow["saddr"]}:' + f'{self.flow["sport"]} -> ' + f'{self.flow["daddr"]}:' + f'{self.flow["dport"]}/' + f'{self.flow["proto"]}', + 0, + 2, + ) \ No newline at end of file From 01a1a6156e0d0626e327d683cb828d44475e9eab Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 18:31:43 +0000 Subject: [PATCH 229/498] Fix plot test --- modules/flowmldetection/plot_testing_performance.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/modules/flowmldetection/plot_testing_performance.py b/modules/flowmldetection/plot_testing_performance.py index 1b4152c6e..977a68b2d 100644 --- a/modules/flowmldetection/plot_testing_performance.py +++ b/modules/flowmldetection/plot_testing_performance.py @@ -75,7 +75,7 @@ def plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accu plot_single_group(close_to_0, 'performance_metrics_testing_close_to_0.png', is_close_to_0=True) # Plot metrics for values close to 1 (log scale) - plot_single_group(close_to_1, 'performnace_metrics_teting_close_to_1.png', is_close_to_0=False) + plot_single_group(close_to_1, 'performance_metrics_testing_close_to_1.png', is_close_to_0=False) # Print the final values print("\nFinal Metric Values:") @@ -123,10 +123,10 @@ def plot_single_group(metrics_dict, output_filename, is_close_to_0=False): # Avoid log(0), so set the minimum limit a little higher than zero if min_val == 0: - min_val = 1e-8 # Avoid zero values on the logarithmic scale + min_val = 1e-4 # Avoid zero values on the logarithmic scale plt.ylim(min_val, max_val) # Set Y-axis limits based on the data range - plt.yticks(np.logspace(np.log10(min_val), np.log10(max_val), num=60)) # Set ticks logarithmically + plt.yticks(np.logspace(np.log10(min_val), np.log10(max_val), num=6)) # Set ticks logarithmically plt.xlabel('Index') plt.ylabel('Metric Value') From 0b51f71948efe37e361836cb04bfcedba58dad66 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 18:50:33 +0000 Subject: [PATCH 230/498] Add testing code to evaluate performance. It is optional with a varible --- modules/flowmldetection/flowmldetection.py | 60 +++++++++++++++------- 1 file changed, 42 insertions(+), 18 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 5e4e9aa46..b17a1baaf 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -526,36 +526,21 @@ def main(self): elif self.mode == "test": # We are testing, which means using the model to detect processed_flow = self.process_flow(self.flow) - # After processing the flow, it may happen that we # delete icmp/arp/etc so the dataframe can be empty if processed_flow is not None and not processed_flow.empty: + original_label = processed_flow["ground_truth_label"].iloc[0] # Predict pred: numpy.ndarray = self.detect(processed_flow) if not pred: # an error occurred return - label = self.flow["label"] - if label and label != "unknown" and label != pred[0]: - # If the user specified a label in test mode, - # and the label is diff from the prediction, - # print in debug mode - self.print( - f"Predicted {pred[0]} for ground-truth label" - f' {label}. 
Flow {self.flow["saddr"]}:' - f'{self.flow["sport"]} ->' - f' {self.flow["daddr"]}:' - f'{self.flow["dport"]}/' - f'{self.flow["proto"]}', - 0, - 3, - ) if pred[0] == "Malicious": # Generate an alert self.set_evidence_malicious_flow(self.flow, self.twid) self.print( - f"Prediction {pred[0]} for label {label}" + f"Prediction {pred[0]} for label {original_label}" f' flow {self.flow["saddr"]}:' f'{self.flow["sport"]} -> ' f'{self.flow["daddr"]}:' @@ -563,4 +548,43 @@ def main(self): f'{self.flow["proto"]}', 0, 2, - ) \ No newline at end of file + ) + + # So you can disable this code easily. Since it is used only for evaluating a testing + log_testing_data = True + if log_testing_data: + # Initialize counters if not already done + if not hasattr(self, 'tp'): + self.tp = 0 + if not hasattr(self, 'tn'): + self.tn = 0 + if not hasattr(self, 'fp'): + self.fp = 0 + if not hasattr(self, 'fn'): + self.fn = 0 + + + # Update counters based on predictions and labels + if pred[0] == "Malicious" and original_label == "Malicious": + self.tp += 1 + elif pred[0] == "Benign" and original_label == "Benign": + self.tn += 1 + elif pred[0] == "Malicious" and original_label == "Benign": + self.fp += 1 + elif pred[0] == "Benign" and original_label == "Malicious": + self.fn += 1 + + testing_log_path = "./modules/flowmldetection/testing_performance.log" + try: + with open(testing_log_path, "a") as log_file: + log_file.write("Testing Performance Log Initialized\n") + # Log the testing performance metrics + log_file.write(f"TP: {self.tp}, TN: {self.tn}, FP: {self.fp}, FN: {self.fn}\n") + + # Log the original flow for false positives and false negatives + if pred[0] == "Malicious" and original_label == "Benign": + log_file.write(f"False Positive Flow: {self.flow}\n") + elif pred[0] == "Benign" and original_label == "Malicious": + log_file.write(f"False Negative Flow: {self.flow}\n") + except Exception as e: + self.print(f"Error initializing testing performance log: {e}", 0, 1) \ No newline at end of file From e2da4cbde7d3b54ce2e90749bcd9e4c7bdbb8be2 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 19:04:00 +0000 Subject: [PATCH 231/498] Fix plots --- .../plot_testing_performance.py | 30 +++++++++++-------- 1 file changed, 18 insertions(+), 12 deletions(-) diff --git a/modules/flowmldetection/plot_testing_performance.py b/modules/flowmldetection/plot_testing_performance.py index 977a68b2d..6865415cd 100644 --- a/modules/flowmldetection/plot_testing_performance.py +++ b/modules/flowmldetection/plot_testing_performance.py @@ -1,6 +1,7 @@ import matplotlib.pyplot as plt import sys import numpy as np +import argparse def process_file(file_path): # Initialize the counters for the values @@ -49,7 +50,7 @@ def process_file(file_path): return FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values -def plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values): +def plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values, experiment_number): # Separate the values into two groups based on their proximity to 0 or 1 close_to_0 = { 'FPR': [], 'FNR': [] @@ -72,13 +73,13 @@ def plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accu close_to_1['recall'].append(recall_values[i]) # Plot metrics for values close to 0 (linear scale) - plot_single_group(close_to_0, 
'performance_metrics_testing_close_to_0.png', is_close_to_0=True) + plot_single_group(close_to_0, f'performance_metrics_testing_close_to_0_experiment_{experiment_number}.png', experiment_number, is_close_to_0=True) # Plot metrics for values close to 1 (log scale) - plot_single_group(close_to_1, 'performance_metrics_testing_close_to_1.png', is_close_to_0=False) + plot_single_group(close_to_1, f'performance_metrics_testing_close_to_1_experiment_{experiment_number}.png', experiment_number, is_close_to_0=False) # Print the final values - print("\nFinal Metric Values:") + print("\nFinal Metric Values for Experiment", experiment_number) print(f"Final FPR: {FPR_values[-1]:.4f}") print(f"Final FNR: {FNR_values[-1]:.4f}") print(f"Final TNR: {TNR_values[-1]:.4f}") @@ -89,7 +90,7 @@ def plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accu print(f"Final MCC: {MCC_values[-1]:.4f}") print(f"Final Recall: {recall_values[-1]:.4f}") -def plot_single_group(metrics_dict, output_filename, is_close_to_0=False): +def plot_single_group(metrics_dict, output_filename, experiment_number, is_close_to_0=False): plt.figure(figsize=(12, 8)) # Only plot the metrics that exist in the dictionary @@ -126,11 +127,12 @@ def plot_single_group(metrics_dict, output_filename, is_close_to_0=False): min_val = 1e-4 # Avoid zero values on the logarithmic scale plt.ylim(min_val, max_val) # Set Y-axis limits based on the data range - plt.yticks(np.logspace(np.log10(min_val), np.log10(max_val), num=6)) # Set ticks logarithmically + plt.yticks(np.logspace(np.log10(min_val), np.log10(max_val), num=60)) # Set ticks logarithmically + # Add the experiment number to the plot title plt.xlabel('Index') plt.ylabel('Metric Value') - plt.title(f'Evaluation Metrics Over Time ({output_filename.split("_")[2].replace(".png", "")})') + plt.title(f'Experiment {experiment_number} - Evaluation Metrics Over Time') plt.legend() # Save the plot @@ -138,14 +140,18 @@ def plot_single_group(metrics_dict, output_filename, is_close_to_0=False): plt.close() def main(): - if len(sys.argv) != 2: - print("Usage: python script.py ") - sys.exit(1) + # Set up argument parsing + parser = argparse.ArgumentParser(description='Plot testing performance metrics.') + parser.add_argument('-f', '--file', type=str, required=True, help='Path to the testing performance log file') + parser.add_argument('-e', '--experiment', type=str, required=True, help='Experiment number') + + args = parser.parse_args() - file_path = sys.argv[1] + file_path = args.file + experiment_number = args.experiment FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values = process_file(file_path) - plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values) + plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values, experiment_number) if __name__ == "__main__": main() From e174fc4574b68e1aa2dedfdab223d3b42c60f282 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 19:14:51 +0000 Subject: [PATCH 232/498] Fix train plot --- .../flowmldetection/plot_train_performance.py | 22 ++++++++++--------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/modules/flowmldetection/plot_train_performance.py b/modules/flowmldetection/plot_train_performance.py index 80e13e951..244df13d2 100644 --- a/modules/flowmldetection/plot_train_performance.py +++ 
b/modules/flowmldetection/plot_train_performance.py @@ -5,7 +5,7 @@ import argparse import os -def plot_log_data(file_path): +def plot_log_data(file_path, experiment_number): # Read the log data from the file with open(file_path, 'r') as file: log_data = file.read() @@ -28,7 +28,8 @@ def plot_log_data(file_path): # Get the directory of the log file to store the plot in the same folder dir_name = os.path.dirname(file_path) - plot_file = os.path.join(dir_name, 'performance_metrics_training.png') + # Append experiment number to the filename + plot_file = os.path.join(dir_name, f'performance_metrics_training_{experiment_number}.png') # Plotting the values fig, ax1 = plt.subplots(figsize=(10, 6)) @@ -55,18 +56,18 @@ def plot_log_data(file_path): for i, value in enumerate(df["Total labels"]): ax1.text(i, value, f'{value:.1f}', color='tab:gray', fontsize=8, ha='center', va='bottom') - # Adding title and legend - plt.title('Training performance') + # Adding title and legend with experiment number in title + plt.title(f'Training performance - Experiment {experiment_number}') fig.tight_layout() # Move both legends further to the right - ax1.legend(loc='upper right', bbox_to_anchor=(1.26, 1), fontsize='small', ncol=1) - ax2.legend(loc='upper right', bbox_to_anchor=(1.4, 0.95), fontsize='small', ncol=1) + ax1.legend(loc='upper right', bbox_to_anchor=(1.3, 1), fontsize='small', ncol=1) + ax2.legend(loc='upper right', bbox_to_anchor=(1.3, 0.85), fontsize='small', ncol=1) # Increase right margin for better readability of legend - plt.subplots_adjust(right=0.7) + plt.subplots_adjust(right=0.75) - # Save plot to the same folder as the log file + # Save plot to the same folder as the log file with experiment number in filename plt.savefig(plot_file) # Display the plot @@ -75,13 +76,14 @@ def plot_log_data(file_path): def main(): # Parse command-line arguments parser = argparse.ArgumentParser(description="Process a log file and plot the data with two y-axes.") - parser.add_argument('log_file', metavar='log_file', type=str, help="Path to the log file") + parser.add_argument('-f', '--file', metavar='log_file', type=str, required=True, help="Path to the log file") + parser.add_argument('-e', '--experiment', metavar='experiment_number', type=str, required=True, help="Experiment number to add to the filename") # Handle -h / --help args = parser.parse_args() # Call the function to process the log file - plot_log_data(args.log_file) + plot_log_data(args.file, args.experiment) if __name__ == "__main__": main() From e7fdbfdbd1b5c3de8bb60227c4e02454abe5c993 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 21:14:48 +0000 Subject: [PATCH 233/498] Fix plots --- .../flowmldetection/plot_train_performance.py | 122 ++++++++++-------- 1 file changed, 71 insertions(+), 51 deletions(-) diff --git a/modules/flowmldetection/plot_train_performance.py b/modules/flowmldetection/plot_train_performance.py index 244df13d2..5212dfeea 100644 --- a/modules/flowmldetection/plot_train_performance.py +++ b/modules/flowmldetection/plot_train_performance.py @@ -4,85 +4,105 @@ import sys import argparse import os +import matplotlib.ticker as ticker def plot_log_data(file_path, experiment_number): # Read the log data from the file with open(file_path, 'r') as file: log_data = file.read() - # Define regex pattern to extract relevant data from each line - pattern = r"Background: (\d+). Benign: (\d+). Malicious: (\d+). Total labels: (\d+\.\d+). 
Score: (\d+\.\d+)" + # Regex pattern for the new log format + pattern = ( + r"Total labels: ([\d\.]+), Background: (\d+). Benign: (\d+). Malicious: (\d+). Metrics: " + r"FPR=([\d\.]+), TNR=([\d\.]+), TPR=([\d\.]+), FNR=([\d\.]+), " + r"F1=([\d\.]+), Precision=([\d\.]+), Accuracy=([\d\.]+), MCC=([\d\.]+), Recall=([\d\.]+)\." + ) # Parse the log file data = re.findall(pattern, log_data) # Convert data to a DataFrame - df = pd.DataFrame(data, columns=["Background", "Benign", "Malicious", "Total labels", "Score"]) + columns = [ + "Total labels", "Background", "Benign", "Malicious", + "FPR", "TNR", "TPR", "FNR", "F1", "Precision", "Accuracy", "MCC", "Recall" + ] + df = pd.DataFrame(data, columns=columns) df = df.astype({ + "Total labels": float, "Background": int, "Benign": int, "Malicious": int, - "Total labels": float, - "Score": float + "FPR": float, + "TNR": float, + "TPR": float, + "FNR": float, + "F1": float, + "Precision": float, + "Accuracy": float, + "MCC": float, + "Recall": float, }) - # Get the directory of the log file to store the plot in the same folder dir_name = os.path.dirname(file_path) - # Append experiment number to the filename - plot_file = os.path.join(dir_name, f'performance_metrics_training_{experiment_number}.png') - - # Plotting the values - fig, ax1 = plt.subplots(figsize=(10, 6)) - # Plotting Score on the left y-axis (with proper scaling from 0 to 1) - ax1.plot(df.index, df["Score"], label="Score", color='tab:blue') + # --- Plot 1: Number of labels (linear scale, no total labels) --- + fig1, ax1 = plt.subplots(figsize=(10, 6)) + ax1.plot(df.index, df["Background"], label="Background", color='black') + ax1.plot(df.index, df["Benign"], label="Benign", color='cyan') + ax1.plot(df.index, df["Malicious"], label="Malicious", color='magenta') ax1.set_xlabel('Index') - ax1.set_ylabel('Score', color='tab:blue') - ax1.set_ylim(0, 1) # Set y-axis for Score from 0 to 1 - ax1.tick_params(axis='y', labelcolor='tab:blue') - - # Create the second y-axis for the Background, Benign, Malicious - ax2 = ax1.twinx() - ax2.plot(df.index, df["Background"], label="Background Labels", color='tab:green', linestyle='--') - ax2.plot(df.index, df["Benign"], label="Benign Labels", color='tab:orange', linestyle='--') - ax2.plot(df.index, df["Malicious"], label="Malicious Labels", color='tab:pink', linestyle='--') - ax2.set_ylabel('Background, Benign, Malicious Labels', color='tab:red') - - # Set appropriate scale for right y-axis based on the data - ax2.set_ylim(0, df[["Background", "Benign", "Malicious"]].max().max()) - ax2.tick_params(axis='y', labelcolor='tab:red') - - # Annotating Total labels as text on the plot - for i, value in enumerate(df["Total labels"]): - ax1.text(i, value, f'{value:.1f}', color='tab:gray', fontsize=8, ha='center', va='bottom') - - # Adding title and legend with experiment number in title - plt.title(f'Training performance - Experiment {experiment_number}') - fig.tight_layout() + ax1.set_ylabel('Label Counts') + # No log scale here + ax1.set_title(f'Label Counts - Experiment {experiment_number}') + ax1.legend() + ax1.yaxis.set_major_locator(ticker.MaxNLocator(70)) + plt.tight_layout() + plt.savefig(os.path.join(dir_name, f'performance_metrics_training_0_labels.png')) + + # --- Plot 2: FNR and FPR (log scale) --- + fig2, ax2 = plt.subplots(figsize=(10, 6)) + ax2.plot(df.index, df["FNR"], label="FNR", color='red') + ax2.plot(df.index, df["FPR"], label="FPR", color='blue') + ax2.set_xlabel('Index') + ax2.set_ylabel('Rate') + ax2.set_yscale('log') + ax2.set_title(f'FNR 
and FPR - Experiment {experiment_number}') + ax2.legend() + ax2.yaxis.set_major_locator(ticker.MaxNLocator(100)) + plt.tight_layout() + plt.savefig(os.path.join(dir_name, f'performance_metrics_training_0_fnr_fpr.png')) + + # --- Plot 3: Other metrics (log scale) --- + fig3, ax3 = plt.subplots(figsize=(12, 7)) + metrics_rest = ["TNR", "TPR", "F1", "Precision", "Accuracy", "MCC", "Recall"] + colors_rest = [ + 'tab:blue', 'tab:green', 'tab:purple', 'tab:brown', + 'tab:gray', 'tab:pink', 'tab:olive' + ] + for metric, color in zip(metrics_rest, colors_rest): + ax3.plot(df.index, df[metric], label=metric, color=color) + ax3.set_xlabel('Index') + ax3.set_ylabel('Metric Value') + ax3.set_yscale('log') + ax3.set_title(f'Performance Metrics (except FNR/FPR) - Experiment {experiment_number}') + ax3.legend() + ax3.yaxis.set_major_locator(ticker.MaxNLocator(50)) + plt.tight_layout() + plt.savefig(os.path.join(dir_name, f'performance_metrics_training_0_other_metrics.png')) - # Move both legends further to the right - ax1.legend(loc='upper right', bbox_to_anchor=(1.3, 1), fontsize='small', ncol=1) - ax2.legend(loc='upper right', bbox_to_anchor=(1.3, 0.85), fontsize='small', ncol=1) - - # Increase right margin for better readability of legend - plt.subplots_adjust(right=0.75) - - # Save plot to the same folder as the log file with experiment number in filename - plt.savefig(plot_file) - - # Display the plot plt.show() + # --- Print final values in terminal --- + print("\nFinal values at last training step:") + for col in ["Total labels", "Background", "Benign", "Malicious", + "FPR", "TNR", "TPR", "FNR", "F1", "Precision", "Accuracy", "MCC", "Recall"]: + print(f"{col}: {df[col].iloc[-1]}") + def main(): - # Parse command-line arguments parser = argparse.ArgumentParser(description="Process a log file and plot the data with two y-axes.") parser.add_argument('-f', '--file', metavar='log_file', type=str, required=True, help="Path to the log file") parser.add_argument('-e', '--experiment', metavar='experiment_number', type=str, required=True, help="Experiment number to add to the filename") - - # Handle -h / --help args = parser.parse_args() - - # Call the function to process the log file plot_log_data(args.file, args.experiment) if __name__ == "__main__": From fdbbbb5e9b127117ca089dab05bd1fe49f4e5508 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 21:16:01 +0000 Subject: [PATCH 234/498] Add performance metrics to the training evaluation --- modules/flowmldetection/flowmldetection.py | 58 +++++++++++++++++----- 1 file changed, 46 insertions(+), 12 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index b17a1baaf..2c60cd403 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -10,7 +10,16 @@ import json import traceback import warnings -import os +from sklearn.metrics import classification_report, confusion_matrix +from sklearn.metrics import ( + confusion_matrix, + f1_score, + precision_score, + accuracy_score, + matthews_corrcoef, + recall_score, +) + from slips_files.common.parsers.config_parser import ConfigParser from slips_files.common.slips_utils import utils @@ -86,21 +95,21 @@ def write_to_training_log(self, message: str): except Exception as e: self.print(f"Error writing to training log: {e}", 0, 1) - def train(self, sum_labeled_flows): + def train(self, sum_labeled_flows, last_number_of_flows_when_trained): """ Train a model based on the flows we receive and the 
labels """ try: + # Create y_flow with the label + y_flow = numpy.full(self.flows.shape[0], self.flows.ground_truth_label) # Create X_flow with the current flows minus the label X_flow = self.flows.drop("ground_truth_label", axis=1) # Drop the detailed labels X_flow = X_flow.drop("detailed_ground_truth_label", axis=1) # Drop the module_labels X_flow = X_flow.drop("module_labels", axis=1) - # Create y_flow with the label - y_flow = numpy.full(X_flow.shape[0], self.flows.ground_truth_label) - # Normalize this batch of data so far. This can get progressivle slow + # Normalize this batch of data so far. This can get progressively slow X_flow = self.scaler.fit_transform(X_flow) # Count the number of labels of each type in this epoc @@ -120,18 +129,43 @@ def train(self, sum_labeled_flows): self.print("Error while calling clf.train()") self.print(traceback.format_exc(), 0, 1) - # See score so far in training - score = self.clf.score(X_flow, y_flow) + # Predict on the training data + y_pred = self.clf.predict(X_flow) - #self.print(f" Training Score: {score}", 1, 0) - #self.print(f' Model Parameters: {self.clf.coef_}', 1, 0) + # For metrics, let's focus on Malicious vs Benign (ignore Background) + mask = (y_flow == "Malicious") | (y_flow == "Benign") + y_true_bin = y_flow[mask] + y_pred_bin = y_pred[mask] + + # Map to binary: Malicious=1, Benign=0 + y_true_bin = numpy.where(y_true_bin == "Malicious", 1, 0) + y_pred_bin = numpy.where(y_pred_bin == "Malicious", 1, 0) + + # Compute confusion matrix: tn, fp, fn, tp + tn, fp, fn, tp = confusion_matrix(y_true_bin, y_pred_bin, labels=[0,1]).ravel() if len(set(y_true_bin)) > 1 else (0,0,0,0) + + # Compute metrics + FPR = fp / (fp + tn) if (fp + tn) > 0 else 0 + TNR = tn / (tn + fp) if (tn + fp) > 0 else 0 + TPR = tp / (tp + fn) if (tp + fn) > 0 else 0 + FNR = fn / (fn + tp) if (fn + tp) > 0 else 0 + F1 = f1_score(y_true_bin, y_pred_bin, zero_division=0) + PREC = precision_score(y_true_bin, y_pred_bin, zero_division=0) + ACCU = accuracy_score(y_true_bin, y_pred_bin) + MCC = matthews_corrcoef(y_true_bin, y_pred_bin) if len(set(y_true_bin)) > 1 else 0 + RECALL = recall_score(y_true_bin, y_pred_bin, zero_division=0) # Store the models on disk self.store_model() # Log training information - self.write_to_training_log(f"Training completed. Background: {epoch_label_counts['Background']}. Benign: {epoch_label_counts['Benign']}. Malicious: {epoch_label_counts['Malicious']}. Total labels: {sum_labeled_flows}. Score: {score}") - #self.write_to_training_log(f"Model parameters: {self.clf.coef_}") + self.write_to_training_log( + f"Total labels: {sum_labeled_flows}, " + f"Background: {epoch_label_counts['Background']}. " + f"Benign: {epoch_label_counts['Benign']}. Malicious: {epoch_label_counts['Malicious']}. " + f"Metrics: FPR={FPR:.4f}, TNR={TNR:.4f}, TPR={TPR:.4f}, FNR={FNR:.4f}, " + f"F1={F1:.4f}, Precision={PREC:.4f}, Accuracy={ACCU:.4f}, MCC={MCC:.4f}, Recall={RECALL:.4f}." 
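+                # Worked example with made-up numbers (not real output):
+                # tp=90, fn=10, fp=5, tn=895 would log
+                # FPR=5/(5+895)=0.0056 and TPR=90/(90+10)=0.9000,
+                # per the confusion-matrix definitions computed above.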
+ ) except Exception: self.print("Error in train().", 0, 1) self.print(traceback.format_exc(), 0, 1) @@ -520,7 +554,7 @@ def main(self): # for pandas self.process_training_flows(self.last_number_of_flows_when_trained) # Train an algorithm - self.train(sum_labeled_flows) + self.train(sum_labeled_flows, self.last_number_of_flows_when_trained) self.last_number_of_flows_when_trained = sum_labeled_flows elif self.mode == "test": From b7b2477f4939479d223c699e240cf3f6a33d2c10 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sun, 4 May 2025 12:50:46 +0000 Subject: [PATCH 235/498] Fix experiment names --- modules/flowmldetection/plot_train_performance.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/modules/flowmldetection/plot_train_performance.py b/modules/flowmldetection/plot_train_performance.py index 5212dfeea..304f0f4ea 100644 --- a/modules/flowmldetection/plot_train_performance.py +++ b/modules/flowmldetection/plot_train_performance.py @@ -52,12 +52,12 @@ def plot_log_data(file_path, experiment_number): ax1.plot(df.index, df["Malicious"], label="Malicious", color='magenta') ax1.set_xlabel('Index') ax1.set_ylabel('Label Counts') - # No log scale here ax1.set_title(f'Label Counts - Experiment {experiment_number}') ax1.legend() ax1.yaxis.set_major_locator(ticker.MaxNLocator(70)) + ax1.xaxis.set_major_locator(ticker.MaxNLocator(50)) plt.tight_layout() - plt.savefig(os.path.join(dir_name, f'performance_metrics_training_0_labels.png')) + plt.savefig(os.path.join(dir_name, f'performance_metrics_training_{experiment_number}_labels.png')) # --- Plot 2: FNR and FPR (log scale) --- fig2, ax2 = plt.subplots(figsize=(10, 6)) @@ -69,8 +69,9 @@ def plot_log_data(file_path, experiment_number): ax2.set_title(f'FNR and FPR - Experiment {experiment_number}') ax2.legend() ax2.yaxis.set_major_locator(ticker.MaxNLocator(100)) + ax2.xaxis.set_major_locator(ticker.MaxNLocator(50)) plt.tight_layout() - plt.savefig(os.path.join(dir_name, f'performance_metrics_training_0_fnr_fpr.png')) + plt.savefig(os.path.join(dir_name, f'performance_metrics_training_{experiment_number}_fnr_fpr.png')) # --- Plot 3: Other metrics (log scale) --- fig3, ax3 = plt.subplots(figsize=(12, 7)) @@ -87,8 +88,9 @@ def plot_log_data(file_path, experiment_number): ax3.set_title(f'Performance Metrics (except FNR/FPR) - Experiment {experiment_number}') ax3.legend() ax3.yaxis.set_major_locator(ticker.MaxNLocator(50)) + ax3.xaxis.set_major_locator(ticker.MaxNLocator(50)) plt.tight_layout() - plt.savefig(os.path.join(dir_name, f'performance_metrics_training_0_other_metrics.png')) + plt.savefig(os.path.join(dir_name, f'performance_metrics_training_{experiment_number}_other_metrics.png')) plt.show() From 27b2b567ea395023664434d1bbb11819e3625776 Mon Sep 17 00:00:00 2001 From: alya Date: Mon, 5 May 2025 15:24:12 +0300 Subject: [PATCH 236/498] test_profiler: update unit tests --- tests/test_profiler.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/test_profiler.py b/tests/test_profiler.py index b967c7880..e62bdd8e7 100644 --- a/tests/test_profiler.py +++ b/tests/test_profiler.py @@ -481,7 +481,6 @@ def test_read_configuration( mock_conf.local_whitelist_path.return_value = "path/to/whitelist" mock_conf.ts_format.return_value = "unixtimestamp" mock_conf.analysis_direction.return_value = "all" - mock_conf.label.return_value = "malicious" mock_conf.get_tw_width_as_float.return_value = 1.0 mock_conf.client_ips.return_value = ["192.168.1.1", "10.0.0.1"] @@ -490,7 +489,6 @@ def test_read_configuration( assert 
profiler.local_whitelist_path == "path/to/whitelist" assert profiler.timeformat == "unixtimestamp" assert profiler.analysis_direction == "all" - assert profiler.label == "malicious" assert profiler.width == 1.0 assert profiler.client_ips == ["192.168.1.1", "10.0.0.1"] From 493f8738cfd5e498c0a9f6edee15424e0e365e2e Mon Sep 17 00:00:00 2001 From: alya Date: Mon, 5 May 2025 16:16:16 +0300 Subject: [PATCH 237/498] add debugging prints --- config/slips.yaml | 2 +- modules/blocking/blocking.py | 180 ++++++++++++--------------- modules/blocking/unblocker.py | 26 +++- slips_files/core/evidence_handler.py | 8 ++ slips_files/core/helpers/checker.py | 2 +- tests/test_blocking.py | 14 +-- 6 files changed, 121 insertions(+), 111 deletions(-) diff --git a/config/slips.yaml b/config/slips.yaml index f7089b41a..d758fb9f7 100644 --- a/config/slips.yaml +++ b/config/slips.yaml @@ -182,7 +182,7 @@ detection: # - 0.43: Use this threshold If you want Slips to be insensitive. # Using this means Slips will need so many evidence to trigger an alert # May lead to false negatives - evidence_detection_threshold: 0.25 + evidence_detection_threshold: 0.08 # Make Slips pop up alerts? Both Linux and Macos popup_alerts: false diff --git a/modules/blocking/blocking.py b/modules/blocking/blocking.py index d1eeffd54..cce2a901b 100644 --- a/modules/blocking/blocking.py +++ b/modules/blocking/blocking.py @@ -6,7 +6,7 @@ import shutil import json import subprocess -import time +from typing import Dict from slips_files.common.abstracts.module import IModule from .exec_iptables_cmd import exec_iptables_command @@ -31,9 +31,9 @@ def init(self): self.print("Mac OS blocking is not supported yet.") sys.exit() - self.firewall = self.determine_linux_firewall() - self.set_sudo_according_to_env() - self.initialize_chains_in_firewall() + self.firewall = self._determine_linux_firewall() + self._set_sudo_according_to_env() + self._init_chains_in_firewall() self.unblocker = Unblocker(self.db) # self.test() @@ -41,7 +41,7 @@ def test(self): """For debugging purposes, once we're done with the module we'll delete it""" - if not self.is_ip_blocked("2.2.0.0"): + if not self._is_ip_blocked("2.2.0.0"): blocking_data = { "ip": "2.2.0.0", "block": True, @@ -60,7 +60,7 @@ def test(self): self.print("[test] IP is already blocked") # self.unblock_ip("2.2.0.0",True,True) - def set_sudo_according_to_env(self): + def _set_sudo_according_to_env(self): """ Check if running in host or in docker and sets sudo string accordingly. 
There's no sudo in docker so we need to execute all commands without it @@ -71,7 +71,7 @@ def set_sudo_according_to_env(self): ) self.sudo = "" if self.running_in_docker else "sudo " - def determine_linux_firewall(self): + def _determine_linux_firewall(self): """Returns the currently installed firewall and installs iptables if none was found""" @@ -86,7 +86,7 @@ def determine_linux_firewall(self): ) sys.exit() - def delete_slips_blocking_chain(self): + def _del_slips_blocking_chain(self): """Flushes and deletes everything in slipsBlocking chain""" # check if slipsBlocking chain exists before flushing it and suppress # stderr and stdout while checking @@ -119,12 +119,12 @@ def delete_slips_blocking_chain(self): return False - def get_cmd_output(self, command): + def _get_cmd_output(self, command): """Executes a command and returns the output""" result = subprocess.run(command.split(), stdout=subprocess.PIPE) return result.stdout.decode("utf-8") - def initialize_chains_in_firewall(self): + def _init_chains_in_firewall(self): """For linux: Adds a chain to iptables or a table to nftables called slipsBlocking where all the rules will reside""" @@ -139,13 +139,13 @@ def initialize_chains_in_firewall(self): os.system(f"{self.sudo}iptables -N slipsBlocking >/dev/null 2>&1") # Check if we're already redirecting to slipsBlocking chain - input_chain_rules = self.get_cmd_output( + input_chain_rules = self._get_cmd_output( f"{self.sudo} iptables -nvL INPUT" ) - output_chain_rules = self.get_cmd_output( + output_chain_rules = self._get_cmd_output( f"{self.sudo} iptables -nvL OUTPUT" ) - forward_chain_rules = self.get_cmd_output( + forward_chain_rules = self._get_cmd_output( f"{self.sudo} iptables -nvL FORWARD" ) # Redirect the traffic from all other chains to slipsBlocking so rules @@ -168,7 +168,7 @@ def initialize_chains_in_firewall(self): + "iptables -I FORWARD -j slipsBlocking >/dev/null 2>&1" ) - def is_ip_blocked(self, ip) -> bool: + def _is_ip_blocked(self, ip) -> bool: """Checks if ip is already blocked or not""" command = f"{self.sudo}iptables -L slipsBlocking -v -n" # Execute command @@ -176,97 +176,72 @@ def is_ip_blocked(self, ip) -> bool: result = result.stdout.decode("utf-8") return ip in result - def block_ip( - self, - ip_to_block=None, - from_=True, - to=True, - dport=None, - sport=None, - protocol=None, - block_for=False, - ): + def _block_ip(self, ip_to_block: str, flags: Dict[str, str]) -> bool: """ This function determines the user's platform and firewall and calls the appropriate function to add the rules to the used firewall. By default this function blocks all traffic from and to the given ip. 
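+        Example call (illustrative values only; the keys mirror the ones
+        read below):
+            flags = {"from_": True, "to": True, "dport": None,
+                     "sport": None, "protocol": None}
+            self._block_ip("2.2.0.0", flags)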
+ return strue if the ip is successfully blocked """ + if self.firewall != "iptables": + return + if not isinstance(ip_to_block, str): return False # Make sure ip isn't already blocked before blocking - if self.is_ip_blocked(ip_to_block): + if self._is_ip_blocked(ip_to_block): return False - if self.firewall == "iptables": - # Blocking in iptables - # Set the default behaviour to block all traffic from and to an ip - if from_ is None and to is None: - from_, to = True, True - # This dictionary will be used to construct the rule - options = { - "protocol": f" -p {protocol}" if protocol is not None else "", - "dport": f" --dport {str(dport)}" if dport is not None else "", - "sport": f" --sport {str(sport)}" if sport is not None else "", - } - blocked = False - if from_: - # Add rule to block traffic from source ip_to_block (-s) - blocked = exec_iptables_command( - self.sudo, - action="insert", - ip_to_block=ip_to_block, - flag="-s", - options=options, - ) - if blocked: - self.print(f"Blocked all traffic from: {ip_to_block}") - - if to: - # Add rule to block traffic to ip_to_block (-d) - blocked = exec_iptables_command( - self.sudo, - action="insert", - ip_to_block=ip_to_block, - flag="-d", - options=options, - ) - if blocked: - self.print(f"Blocked all traffic to: {ip_to_block}") - - if block_for: - time_of_blocking = time.time() - # unblock ip after block_for period passes - self.unblock_ips.update( - { - ip_to_block: { - "block_for": block_for, - "time_of_blocking": time_of_blocking, - "blocking_details": { - "from": from_, - "to": to, - "dport": dport, - "sport": sport, - "protocol": protocol, - }, - } - } - ) + from_ = flags.get("from_") + to = flags.get("to") + dport = flags.get("dport") + sport = flags.get("sport") + protocol = flags.get("protocol") + # Set the default behaviour to block all traffic from and to an ip + if from_ is None and to is None: + from_, to = True, True + # This dictionary will be used to construct the rule + options = { + "protocol": f" -p {protocol}" if protocol is not None else "", + "dport": f" --dport {str(dport)}" if dport is not None else "", + "sport": f" --sport {str(sport)}" if sport is not None else "", + } + blocked = False + if from_: + # Add rule to block traffic from source ip_to_block (-s) + blocked = exec_iptables_command( + self.sudo, + action="insert", + ip_to_block=ip_to_block, + flag="-s", + options=options, + ) + if blocked: + self.print(f"Blocked all traffic from: {ip_to_block}") + if to: + # Add rule to block traffic to ip_to_block (-d) + blocked = exec_iptables_command( + self.sudo, + action="insert", + ip_to_block=ip_to_block, + flag="-d", + options=options, + ) if blocked: - # Successfully blocked an ip - return True + self.print(f"Blocked all traffic to: {ip_to_block}") - return False + return blocked def main(self): - # There's an IP that needs to be blocked if msg := self.get_msg("new_blocking"): # message['data'] in the new_blocking channel is a dictionary that contains # the ip and the blocking options # Example of the data dictionary to block or unblock an ip: - # (notice you have to specify from,to,dport,sport,protocol or at least 2 of them when unblocking) + # (notice you have to specify from,to,dport,sport,protocol or at + # least 2 of them when unblocking) # blocking_data = { # "ip" : "0.0.0.0" # "tw" : 1 @@ -288,24 +263,31 @@ def main(self): ip = data.get("ip") tw: int = data.get("tw") block = data.get("block") - from_ = data.get("from") - to = data.get("to") - dport = data.get("dport") - sport = data.get("sport") - protocol 
= data.get("protocol") - block_for = data.get("block_for") + # number of tws to block for + # blocking should last until the end of the next + # timewindow by default. we'll be blocking in the cur + # timewindow anyways, this number is "how many tws AFTER the + # cur tw to keep this ip blocked in" + how_many_tws_to_block = data.get("block_for", 1) + + flags = { + "from_": data.get("from"), + "to": data.get("to"), + "dport": data.get("dport"), + "sport": data.get("sport"), + "protocol": data.get("protocol"), + } if block: - self.block_ip(ip, from_, to, dport, sport, protocol, block_for) + # blocking request + blocked = self._block_ip(ip, flags) + if blocked: + print(f"@@@@@@@@@@@@@@@@ all good {ip} is blocked") + self.unblocker.unblock_request( + ip, how_many_tws_to_block, tw, flags + ) else: - how_many_tws_to_block = 1 - flags = { - "from_": from_, - "to": to, - "dport": dport, - "sport": sport, - "protocol": protocol, - } + # unblocking request self.unblocker.unblock_request( ip, how_many_tws_to_block, tw, flags ) diff --git a/modules/blocking/unblocker.py b/modules/blocking/unblocker.py index 35212259b..6633a5db3 100644 --- a/modules/blocking/unblocker.py +++ b/modules/blocking/unblocker.py @@ -25,9 +25,13 @@ def unblock_request( current_tw: int, flags: Dict[str, str], ): + print(f"@@@@@@@@@@@@@@@@ unblock_request for ip {ip}") tw_to_unblock_at: TimeWindow = self._calc_unblock_time( ip, current_tw, how_many_tws_to_block ) + print( + f"@@@@@@@@@@@@@@@@ unblocking {ip} at the end of {tw_to_unblock_at}" + ) self._add_req(ip, tw_to_unblock_at, flags) def _check_if_time_to_unblock(self): @@ -43,14 +47,26 @@ def _check_if_time_to_unblock(self): for ip, request in self.requests.items(): ts: float = self.request["tw_to_unblock"].end_time flags: Dict[str, str] = self.request["flags"] - + print( + f"@@@@@@@@@@@@@@@@ [_check_if_time_to_unblock]" + f" checking if time to unvblock {ip} {request}" + ) if ts >= now: + print( + f"@@@@@@@@@@@@@@@@ time to unblock {ip} in the " + f"fw {request}" + ) if self._unblock(ip, flags): requests_to_del.append(ip) for ip in requests_to_del: - self._del_req(ip) + print( + f"@@@@@@@@@@@@@@@@ [_check_if_time_to_unblock] " + f"seleting request for {ip}" + ) + self._del_req(ip) + print("@@@@@@@@@@@@@@@@ [_check_if_time_to_unblock] sleeping 5") time.sleep(5) def _add_req( @@ -65,6 +81,10 @@ def _add_req( "tw_to_unblock": tw_to_unblock_at, "flags": flags, } + print(f"@@@@@@@@@@@@@@@@ added req for {ip} ") + from pprint import pp + + pp(self.requests) def _del_request(self, ip): """Delete an unblocking request from self.requests""" @@ -78,7 +98,6 @@ def _unblock( flags: Dict[str, str], ): """Unblocks an ip based on the given flags""" - from_ = flags.get("from_") to = flags.get("to") dport = flags.get("dport") @@ -122,6 +141,7 @@ def _unblock( if unblocked: self.print(f"Unblocked: {ip_to_unblock}") + print(f"@@@@@@@@@@@@@@@@ unblocked {ip_to_unblock} in the fw") return True return False diff --git a/slips_files/core/evidence_handler.py b/slips_files/core/evidence_handler.py index 890bfbdd0..70b9e2582 100644 --- a/slips_files/core/evidence_handler.py +++ b/slips_files/core/evidence_handler.py @@ -408,6 +408,10 @@ def decide_blocking( """ # send ip to the blocking module if not self.is_blocking_module_supported(): + print( + "@@@@@@@@@@@@@@@@ decide_blocking blocking module " + "unsupported" + ) return False # now since this source ip(profileid) caused an alert, # it means it caused so many evidence(attacked others a lot) @@ -415,6 +419,9 @@ def decide_blocking( # First, Make 
sure we don't block our own IP if ip_to_block in self.our_ips: + print( + f"@@@@@@@@@@@@@@@@ decide_blocking thats own ip! {ip_to_block}" + ) return False # TODO: edit the options here. by default it'll block @@ -428,6 +435,7 @@ def decide_blocking( } blocking_data = json.dumps(blocking_data) self.db.publish("new_blocking", blocking_data) + print("@@@@@@@@@@@@@@@@ published st in new_blocking") return True def increment_attack_counter( diff --git a/slips_files/core/helpers/checker.py b/slips_files/core/helpers/checker.py index 1ed1a65f1..690de5b64 100644 --- a/slips_files/core/helpers/checker.py +++ b/slips_files/core/helpers/checker.py @@ -183,7 +183,7 @@ def delete_blocking_chain(self): blocking = Blocking(Queue()) blocking.start() - blocking.delete_slips_blocking_chain() + blocking._del_slips_blocking_chain() # kill the blocking module manually because we can't # run shutdown_gracefully here (not all modules has started) for child in active_children(): diff --git a/tests/test_blocking.py b/tests/test_blocking.py index 513c7a954..263d76d95 100644 --- a/tests/test_blocking.py +++ b/tests/test_blocking.py @@ -47,7 +47,7 @@ def has_netadmin_cap(): @has_net_admin_cap def is_slipschain_initialized() -> bool: blocking = ModuleFactory().create_blocking_obj() - output = blocking.get_cmd_output(f"{blocking.sudo} iptables -S") + output = blocking._get_cmd_output(f"{blocking.sudo} iptables -S") rules = [ "-A INPUT -j slipsBlocking", "-A FORWARD -j slipsBlocking", @@ -63,7 +63,7 @@ def test_initialize_chains_in_firewall(): blocking = ModuleFactory().create_blocking_obj() # manually set the firewall blocking.firewall = "iptables" - blocking.initialize_chains_in_firewall() + blocking._init_chains_in_firewall() assert is_slipschain_initialized() is True @@ -82,12 +82,12 @@ def test_initialize_chains_in_firewall(): @has_net_admin_cap def test_block_ip(): blocking = ModuleFactory().create_blocking_obj() - blocking.initialize_chains_in_firewall() - if not blocking.is_ip_blocked("2.2.0.0"): + blocking._init_chains_in_firewall() + if not blocking._is_ip_blocked("2.2.0.0"): ip = "2.2.0.0" from_ = True to = True - assert blocking.block_ip(ip, from_, to) is True + assert blocking._block_ip(ip, from_, to) is True @linuxOS @@ -99,6 +99,6 @@ def test_unblock_ip(): from_ = True to = True # first make sure that it's blocked - if not blocking.is_ip_blocked("2.2.0.0"): - assert blocking.block_ip(ip, from_, to) is True + if not blocking._is_ip_blocked("2.2.0.0"): + assert blocking._block_ip(ip, from_, to) is True assert blocking.unblock_ip(ip, from_, to) is True From c20f8477d16c893899be337994957ff23db2e9e5 Mon Sep 17 00:00:00 2001 From: alya Date: Mon, 5 May 2025 17:31:52 +0300 Subject: [PATCH 238/498] fix problem getting the start and end time of a tw in the future --- docs/architecture.md | 2 +- .../core/database/redis_db/alert_handler.py | 27 ++++++++++++++++++- .../core/database/redis_db/profile_handler.py | 21 +++++++++------ slips_files/core/input.py | 3 --- 4 files changed, 40 insertions(+), 13 deletions(-) diff --git a/docs/architecture.md b/docs/architecture.md index ae941a49d..9a23b1efa 100644 --- a/docs/architecture.md +++ b/docs/architecture.md @@ -12,7 +12,7 @@ Slips is heavily based on the Zeek monitoring tool as input tool for packets fro Figure 1 shows how the data is analyzed by Slips. As we can see, Slips internally uses Zeek, an open source network security monitoring tool. Slips divides flows into profiles and -each profile into a timewindows. 
+each profile into a timewindows, timewindows are numbered from 1 to infinity. Slips runs detection modules on each flow and stores all evidence, alerts and features in an appropriate profile structure. All profile info, performed detections, profiles and timewindows' data, diff --git a/slips_files/core/database/redis_db/alert_handler.py b/slips_files/core/database/redis_db/alert_handler.py index aaf9b023a..7c12fafcb 100644 --- a/slips_files/core/database/redis_db/alert_handler.py +++ b/slips_files/core/database/redis_db/alert_handler.py @@ -141,8 +141,33 @@ def get_victim(self, profileid, attacker): return "" def get_tw_limits(self, profileid, twid: str) -> Tuple[float, float]: - """returns the timewindow start and endtime""" + """ + returns the timewindow start and endtime + """ twid_start_time: float = self.get_tw_start_time(profileid, twid) + if not twid_start_time: + # the given tw is in the future + # calc the start time of the twid manually based on the first + # twid + first_twid_start_time: float = self.get_first_flow_time() + print( + f"@@@@@@@@@@@@@@@@ first_twid_start_time {first_twid_start_time}" + ) + given_twid: int = int(twid.replace("timewindow", "")) + print(f"@@@@@@@@@@@@@@@@ given_twid {twid} -> {given_twid}") + # tws in slips start from 1. + # tw1 tw2 tw3 tw4 + # 0 ──────┬─────┬──────┬────── + # │ │ │ + # 2 4 6 + twid_start_time = first_twid_start_time + ( + self.width * (given_twid - 1) + ) + print( + f"@@@@@@@@@@@@@@@@ given twid ({twid}) start time" + f" {twid_start_time}" + ) + twid_end_time: float = twid_start_time + self.width return twid_start_time, twid_end_time diff --git a/slips_files/core/database/redis_db/profile_handler.py b/slips_files/core/database/redis_db/profile_handler.py index 0489372cd..aadffc88a 100644 --- a/slips_files/core/database/redis_db/profile_handler.py +++ b/slips_files/core/database/redis_db/profile_handler.py @@ -81,6 +81,18 @@ def set_dhcp_flow(self, profileid, twid, requested_addr, uid): json.dumps(flow), ) + def get_first_flow_time(self) -> Optional[float]: + """ + Get the starttime of the first timewindow + aka ts of the first flow + first tw is always timewindow1 + """ + starttime_of_first_tw: str = self.r.hget( + self.constants.ANALYSIS, "file_start" + ) + if starttime_of_first_tw: + return float(starttime_of_first_tw) + def get_timewindow(self, flowtime, profileid): """ This function returns the TW in the database where the flow belongs. 
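The timewindow arithmetic that get_first_flow_time() and get_tw_limits()
rely on can be sketched as follows. A minimal illustration with made-up
numbers (a width of 300 seconds and a first flow at t=1000.0), not code
from the module:

    from math import floor

    # tws in slips start from 1, so timewindow N covers
    # [first_flow_time + width * (N - 1), first_flow_time + width * N)
    def tw_start_time(first_flow_time: float, width: float, n: int) -> float:
        return first_flow_time + width * (n - 1)

    def tw_number_of(first_flow_time: float, width: float, flowtime: float) -> int:
        return floor((flowtime - first_flow_time) / width) + 1

    assert tw_number_of(1000.0, 300.0, 1000.0) == 1  # the first flow is in timewindow1
    assert tw_start_time(1000.0, 300.0, 3) == 1600.0  # timewindow3 starts two widths in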
@@ -111,11 +123,8 @@ def get_timewindow(self, flowtime, profileid): tw_start = float(flowtime - (31536000 * 100)) tw_number: int = 1 else: - starttime_of_first_tw: str = self.r.hget( - self.constants.ANALYSIS, "file_start" - ) + starttime_of_first_tw: float = self.get_first_flow_time() if starttime_of_first_tw: - starttime_of_first_tw = float(starttime_of_first_tw) tw_number: int = ( floor((flowtime - starttime_of_first_tw) / self.width) + 1 ) @@ -1150,7 +1159,6 @@ def add_new_tw(self, profileid, timewindow: str, startoftw: float): def get_tw_start_time(self, profileid, twid): """Return the time when this TW in this profile was created""" - # Get all the TW for this profile # We need to encode it to 'search' because the data in the # sorted set is encoded return self.r.zscore(f"tws{profileid}", twid.encode("utf-8")) @@ -1451,9 +1459,6 @@ def mark_profile_as_dhcp(self, profileid): if not is_dhcp_set: self.r.hset(profileid, "dhcp", "true") - def get_first_flow_time(self) -> Optional[str]: - return self.r.hget(self.constants.ANALYSIS, "file_start") - def add_profile(self, profileid, starttime): """ Add a new profile to the DB. Both the list of profiles and the diff --git a/slips_files/core/input.py b/slips_files/core/input.py index b35a4abaa..fccb495ef 100644 --- a/slips_files/core/input.py +++ b/slips_files/core/input.py @@ -96,9 +96,6 @@ def init( self.zeek_thread = threading.Thread( target=self.run_zeek, daemon=True, name="run_zeek_thread" ) - # used to give the profiler the total amount of flows to - # read with the first flow only - self.is_first_flow = True # is set by the profiler to tell this proc that we it is done processing # the input process and shut down and close the profiler queue no issue self.is_profiler_done_event = is_profiler_done_event From 8632675f640b5dd90aed7557a07c83df91d0c230 Mon Sep 17 00:00:00 2001 From: alya Date: Mon, 5 May 2025 17:44:50 +0300 Subject: [PATCH 239/498] unblocking: convert timewindow limits to iso before creating timewindow object --- slips_files/common/abstracts/unblocker.py | 6 ++++++ slips_files/common/slips_utils.py | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/slips_files/common/abstracts/unblocker.py b/slips_files/common/abstracts/unblocker.py index ae2922fd0..f9ddccbc0 100644 --- a/slips_files/common/abstracts/unblocker.py +++ b/slips_files/common/abstracts/unblocker.py @@ -2,6 +2,8 @@ # SPDX-License-Identifier: GPL-2.0-only from abc import ABC, abstractmethod from threading import Thread + +from slips_files.common.slips_utils import utils from slips_files.core.database.database_manager import DBManager from slips_files.core.structures.evidence import TimeWindow @@ -68,6 +70,10 @@ def _calc_unblock_time( tw_start, tw_end = self.db.get_tw_limits( f"profile_{ip}", f"timewindow{tw_to_unblock}" ) + + tw_start: str = utils.convert_format(tw_start, "iso") + tw_end: str = utils.convert_format(tw_end, "iso") + return TimeWindow( number=tw_to_unblock, start_time=tw_start, end_time=tw_end ) diff --git a/slips_files/common/slips_utils.py b/slips_files/common/slips_utils.py index ba4e6fd69..aa2216ea2 100644 --- a/slips_files/common/slips_utils.py +++ b/slips_files/common/slips_utils.py @@ -90,7 +90,7 @@ def is_iso_format(self, date_time: str) -> bool: try: datetime.fromisoformat(date_time) return True - except ValueError: + except (ValueError, TypeError): return False def extract_hostname(self, url: str) -> str: From 2c9fea74846d842820fa227c36742c1f91eb153e Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Mon, 5 May 2025 16:43:05 
+0000 Subject: [PATCH 240/498] Fix that the training and testing logs files were appened instead of rewritten --- modules/flowmldetection/flowmldetection.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 2c60cd403..9a920b4e2 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -90,7 +90,7 @@ def write_to_training_log(self, message: str): Write a message to the training log file. """ try: - with open(self.training_log_path, "a") as log_file: + with open(self.training_log_path, "w") as log_file: log_file.write(message + "\n") except Exception as e: self.print(f"Error writing to training log: {e}", 0, 1) @@ -610,8 +610,7 @@ def main(self): testing_log_path = "./modules/flowmldetection/testing_performance.log" try: - with open(testing_log_path, "a") as log_file: - log_file.write("Testing Performance Log Initialized\n") + with open(testing_log_path, "w") as log_file: # Log the testing performance metrics log_file.write(f"TP: {self.tp}, TN: {self.tn}, FP: {self.fp}, FN: {self.fn}\n") From f5b28994ab20da76a77c42ebea793d31f81d9850 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Mon, 5 May 2025 22:45:16 +0000 Subject: [PATCH 241/498] Fix an issue of storing the new log files --- modules/flowmldetection/flowmldetection.py | 49 ++++++++++------------ 1 file changed, 23 insertions(+), 26 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 9a920b4e2..9139066f0 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -72,11 +72,19 @@ def init(self): self.scaler = StandardScaler() self.model_path = "./modules/flowmldetection/model.bin" self.scaler_path = "./modules/flowmldetection/scaler.bin" - - # Initialize the training log file - self.training_log_path = "./modules/flowmldetection/training.log" - with open(self.training_log_path, "w") as log_file: - log_file.write("Training Log Initialized\n") + self.init_log_file() + + def init_log_file(self): + """ + Init the log file for training or testing + """ + if self.mode == "train": + # Initialize the training log file + self.log_path = "./modules/flowmldetection/training.log" + elif self.mode == "test": + # Initialize the testing log file + self.log_path = "./modules/flowmldetection/testing.log" + self.log_file = open(self.log_path, "w") def read_configuration(self): conf = ConfigParser() @@ -85,15 +93,14 @@ def read_configuration(self): # in case the flows do not have a label themselves self.label = conf.label() - def write_to_training_log(self, message: str): + def write_to_log(self, message: str): """ - Write a message to the training log file. + Write a message to the local log file. """ try: - with open(self.training_log_path, "w") as log_file: - log_file.write(message + "\n") + self.log_file.write(message + "\n") except Exception as e: - self.print(f"Error writing to training log: {e}", 0, 1) + self.print(f"Error writing to log: {e}", 0, 1) def train(self, sum_labeled_flows, last_number_of_flows_when_trained): """ @@ -159,7 +166,7 @@ def train(self, sum_labeled_flows, last_number_of_flows_when_trained): self.store_model() # Log training information - self.write_to_training_log( + self.write_to_log( f"Total labels: {sum_labeled_flows}, " f"Background: {epoch_label_counts['Background']}. " f"Benign: {epoch_label_counts['Benign']}. 
Malicious: {epoch_label_counts['Malicious']}. " @@ -169,7 +176,7 @@ def train(self, sum_labeled_flows, last_number_of_flows_when_trained): except Exception: self.print("Error in train().", 0, 1) self.print(traceback.format_exc(), 0, 1) - self.write_to_training_log("Error occurred during training.") + self.write_to_log("Error occurred during training.") def process_features(self, dataset): """ @@ -597,7 +604,6 @@ def main(self): if not hasattr(self, 'fn'): self.fn = 0 - # Update counters based on predictions and labels if pred[0] == "Malicious" and original_label == "Malicious": self.tp += 1 @@ -605,19 +611,10 @@ def main(self): self.tn += 1 elif pred[0] == "Malicious" and original_label == "Benign": self.fp += 1 + self.write_to_log(f"False Positive Flow: {self.flow}") elif pred[0] == "Benign" and original_label == "Malicious": self.fn += 1 + self.write_to_log(f"False Negative Flow: {self.flow}") - testing_log_path = "./modules/flowmldetection/testing_performance.log" - try: - with open(testing_log_path, "w") as log_file: - # Log the testing performance metrics - log_file.write(f"TP: {self.tp}, TN: {self.tn}, FP: {self.fp}, FN: {self.fn}\n") - - # Log the original flow for false positives and false negatives - if pred[0] == "Malicious" and original_label == "Benign": - log_file.write(f"False Positive Flow: {self.flow}\n") - elif pred[0] == "Benign" and original_label == "Malicious": - log_file.write(f"False Negative Flow: {self.flow}\n") - except Exception as e: - self.print(f"Error initializing testing performance log: {e}", 0, 1) \ No newline at end of file + # Log the testing performance metrics + self.write_to_log(f"TP: {self.tp}, TN: {self.tn}, FP: {self.fp}, FN: {self.fn}") \ No newline at end of file From 105c679823a42959be21b28cad950a639a0da29d Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 6 May 2025 07:53:44 +0000 Subject: [PATCH 242/498] build(deps): bump maxminddb from 2.6.3 to 2.7.0 in /install Bumps [maxminddb](https://github.com/maxmind/MaxMind-DB-Reader-python) from 2.6.3 to 2.7.0. - [Release notes](https://github.com/maxmind/MaxMind-DB-Reader-python/releases) - [Changelog](https://github.com/maxmind/MaxMind-DB-Reader-python/blob/main/HISTORY.rst) - [Commits](https://github.com/maxmind/MaxMind-DB-Reader-python/compare/v2.6.3...v2.7.0) --- updated-dependencies: - dependency-name: maxminddb dependency-version: 2.7.0 dependency-type: direct:production update-type: version-update:semver-minor ... 
Signed-off-by: dependabot[bot] --- install/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/install/requirements.txt b/install/requirements.txt index 040d72c2b..0d93b075a 100644 --- a/install/requirements.txt +++ b/install/requirements.txt @@ -1,4 +1,4 @@ -maxminddb==2.6.3 +maxminddb==2.7.0 numpy==1.26.4 watchdog==5.0.0 redis==5.2.1 From 5ec46c3e2365ca2df3516a8573150974332382a2 Mon Sep 17 00:00:00 2001 From: alya Date: Tue, 6 May 2025 18:17:54 +0300 Subject: [PATCH 243/498] arg_parser: remove -cb from params because argparse thinks it's -c "b", as if b were a config file --- slips_files/common/parsers/arg_parser.py | 1 - 1 file changed, 1 deletion(-) diff --git a/slips_files/common/parsers/arg_parser.py b/slips_files/common/parsers/arg_parser.py index 54ad285b3..46ae5ec44 100644 --- a/slips_files/common/parsers/arg_parser.py +++ b/slips_files/common/parsers/arg_parser.py @@ -188,7 +188,6 @@ def parse_arguments(self): action="store_true", ) self.add_argument( - "-cb", "--clearblocking", help="Flush and delete slipsBlocking iptables chain", required=False, From 49fcf8a01e5a2d57458d6705fde9687746d33368 Mon Sep 17 00:00:00 2001 From: alya Date: Tue, 6 May 2025 18:19:10 +0300 Subject: [PATCH 244/498] blocking: move clearing the blocking chain to a new file to be able to use it on --clearblocking without having to start the blocking module --- modules/blocking/blocking.py | 50 ++----------------------- modules/blocking/slips_chain_manager.py | 43 +++++++++++++++++++++ 2 files changed, 46 insertions(+), 47 deletions(-) create mode 100644 modules/blocking/slips_chain_manager.py diff --git a/modules/blocking/blocking.py b/modules/blocking/blocking.py index cce2a901b..c2f19f7e2 100644 --- a/modules/blocking/blocking.py +++ b/modules/blocking/blocking.py @@ -9,6 +9,7 @@ from typing import Dict from slips_files.common.abstracts.module import IModule +from slips_files.common.slips_utils import utils from .exec_iptables_cmd import exec_iptables_command from modules.blocking.unblocker import Unblocker @@ -32,15 +33,14 @@ def init(self): sys.exit() self.firewall = self._determine_linux_firewall() - self._set_sudo_according_to_env() + self.sudo = utils.get_sudo_according_to_env() self._init_chains_in_firewall() - self.unblocker = Unblocker(self.db) + self.unblocker = Unblocker(self.db, self.sudo) # self.test() def test(self): """For debugging purposes, once we're done with the module we'll delete it""" - if not self._is_ip_blocked("2.2.0.0"): blocking_data = { "ip": "2.2.0.0", "block": True, "from": True, "to": True, "block_for": 5, # "dport" : Optional destination port number # "sport" : Optional source port number # "protocol" : Optional protocol } # Example of passing blocking_data to this module: blocking_data = json.dumps(blocking_data) self.db.publish("new_blocking", blocking_data) self.print("[test] Blocked ip.") else: self.print("[test] IP is already blocked") # self.unblock_ip("2.2.0.0",True,True)
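The parsing quirk behind PATCH 243 is standard argparse behavior: a short option that takes a value also accepts that value glued directly to it, so a token like -cb can be read as -c with the value "b" rather than as a separate flag. A minimal repro (standalone, not Slips' actual parser):

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("-c", "--config", help="path to a config file")

    # the glued-value form: argparse reads this token as -c "b"
    args = parser.parse_args(["-cb"])
    print(args.config)  # -> "b"

Dropping the -cb short form and keeping only --clearblocking sidesteps the collision entirely.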
- There's no sudo in docker so we need to execute all commands without it - """ - # This env variable is defined in the Dockerfile - self.running_in_docker = os.environ.get( - "IS_IN_A_DOCKER_CONTAINER", False - ) - self.sudo = "" if self.running_in_docker else "sudo " - def _determine_linux_firewall(self): """Returns the currently installed firewall and installs iptables if none was found""" @@ -86,39 +75,6 @@ def _determine_linux_firewall(self): ) sys.exit() - def _del_slips_blocking_chain(self): - """Flushes and deletes everything in slipsBlocking chain""" - # check if slipsBlocking chain exists before flushing it and suppress - # stderr and stdout while checking - # 0 means it exists - chain_exists = ( - os.system( - f"{self.sudo}iptables -nvL slipsBlocking >/dev/null 2>&1" - ) - == 0 - ) - if self.firewall == "iptables" and chain_exists: - # Delete all references to slipsBlocking inserted in INPUT OUTPUT - # and FORWARD before deleting the chain - cmd = ( - f"{self.sudo}iptables -D INPUT -j slipsBlocking " - f">/dev/null 2>&1 ; {self.sudo}iptables -D OUTPUT " - f"-j slipsBlocking >/dev/null 2>&1 ; " - f"{self.sudo}iptables -D FORWARD -j " - f"slipsBlocking >/dev/null 2>&1" - ) - os.system(cmd) - # flush and delete all the rules in slipsBlocking - cmd = ( - f"{self.sudo}iptables -F slipsBlocking >/dev/null 2>&1 ; " - f"{self.sudo} iptables -X slipsBlocking >/dev/null 2>&1" - ) - os.system(cmd) - print("Successfully deleted slipsBlocking chain.") - return True - - return False - def _get_cmd_output(self, command): """Executes a command and returns the output""" result = subprocess.run(command.split(), stdout=subprocess.PIPE) diff --git a/modules/blocking/slips_chain_manager.py b/modules/blocking/slips_chain_manager.py new file mode 100644 index 000000000..50b8abc7d --- /dev/null +++ b/modules/blocking/slips_chain_manager.py @@ -0,0 +1,43 @@ +from slips_files.common.slips_utils import utils +import os + + +def _chain_exists() -> bool: + """ + Check if the slipsBlocking chain exists + :return: True if it exists, False otherwise + """ + sudo = utils.get_sudo_according_to_env() + # check if slipsBlocking chain exists before flushing it and suppress + # stderr and stdout while checking + # 0 means it exists + return os.system(f"{sudo}iptables -nvL slipsBlocking >/dev/null 2>&1") == 0 + + +def del_slips_blocking_chain() -> bool: + """Flushes and deletes everything in slipsBlocking chain""" + if not _chain_exists(): + return False + + sudo = utils.get_sudo_according_to_env() + + # Delete all references to slipsBlocking inserted in INPUT OUTPUT + # and FORWARD before deleting the chain + cmd = ( + f"{sudo}iptables -D INPUT -j slipsBlocking " + f">/dev/null 2>&1 ; {sudo}iptables -D OUTPUT " + f"-j slipsBlocking >/dev/null 2>&1 ; " + f"{sudo}iptables -D FORWARD -j " + f"slipsBlocking >/dev/null 2>&1" + ) + os.system(cmd) + + # flush and delete all the rules in slipsBlocking + cmd = ( + f"{sudo}iptables -F slipsBlocking >/dev/null 2>&1 ; " + f"{sudo} iptables -X slipsBlocking >/dev/null 2>&1" + ) + os.system(cmd) + + print("Successfully deleted slipsBlocking chain.") + return True From 87796845a88f8f0b67fb2557640926c17cfbcee3 Mon Sep 17 00:00:00 2001 From: alya Date: Tue, 6 May 2025 18:21:18 +0300 Subject: [PATCH 245/498] process_manager.py: refactor get_modules() --- managers/process_manager.py | 162 +++++++++++++++++++++--------------- 1 file changed, 95 insertions(+), 67 deletions(-) diff --git a/managers/process_manager.py b/managers/process_manager.py index 238b462b0..836f8cd12 100644 --- 
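The chain-existence check in slips_chain_manager.py above shells out with os.system and keys off the exit status. A rough equivalent with subprocess, which avoids shell parsing and makes the return code explicit; the sudo prefix from get_sudo_according_to_env() is omitted here for brevity, and actually running this needs iptables and sufficient privileges:

    import subprocess

    def chain_exists(chain: str = "slipsBlocking") -> bool:
        # iptables exits with status 0 when the chain can be listed
        result = subprocess.run(
            ["iptables", "-nvL", chain],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )
        return result.returncode == 0

    print(chain_exists())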
a/managers/process_manager.py +++ b/managers/process_manager.py @@ -242,92 +242,120 @@ def is_abstract_module(self, obj) -> bool: def get_modules(self): """ - Get modules from the 'modules' folder. + get modules to load from the modules/ dir and ignore the ones in + the disable param in the config file. + Starts the blocking module only if --clearblocking in given + and returns a list of modules to load in the correct order if + applicable. """ - # This plugins import will automatically load the modules - # and put them in the __modules__ variable plugins = {} failed_to_load_modules = 0 + for module_name in self._discover_module_names(): + if not self._should_load_module(module_name): + continue + + module = self._import_module(module_name) + if not module: + failed_to_load_modules += 1 + continue + + plugins = self._load_valid_classes_from_module(module, plugins) + + plugins = self._reorder_modules(plugins) + return plugins, failed_to_load_modules + + def _reorder_modules(self, plugins): + plugins = self._prioritize_blocking_module(plugins) + plugins = self._start_cyst_module_last(plugins) + return plugins + + def _discover_module_names(self): + """ + walk recursively through all modules and packages found in modules/ + """ # __path__ is the current path of this python program look_for_modules_in = modules.__path__ prefix = f"{modules.__name__}." - # Walk recursively through all modules and packages found on the . - # folder. + for loader, module_name, ispkg in pkgutil.walk_packages( look_for_modules_in, prefix ): - # If current item is a package, skip. if ispkg: - continue + continue # skip if current item is a package + + dir_name, file_name = module_name.split(".")[1:3] # to avoid loading everything in the dir, # only load modules that have the same name as the dir name - dir_name = module_name.split(".")[1] - file_name = module_name.split(".")[2] - if dir_name != file_name: - continue - - if self.bootstrap_p2p: # if bootstrapping the p2p network - if not self.is_bootstrapping_module( - module_name - ): # keep only the bootstrapping-necessary modules - continue - else: # if not bootstrappig mode - if self.is_ignored_module( - module_name - ): # ignore blacklisted modules - continue - - # Try to import the module, otherwise skip. - try: - # "level specifies whether to use absolute or relative imports. - # The default is -1 which - # indicates both absolute and relative imports will - # be attempted. - # 0 means only perform absolute imports. - # Positive values for level indicate the number of parent - # directories to search relative to the directory of the - # module calling __import__()." - module = importlib.import_module(module_name) - except ImportError as e: - print( - f"Something wrong happened while " - f"importing the module {module_name}: {e}" - ) - print(traceback.format_exc()) - failed_to_load_modules += 1 - continue - - # Walk through all members of currently imported modules. - for member_name, member_object in inspect.getmembers(module): - # Check if current member is a class. 
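The helpers this refactor splits out of get_modules() follow a common plugin-discovery shape: walk a package, import each candidate module, then collect the concrete subclasses of a base interface. The same shape in isolation, with myapp.plugins and PluginBase as placeholder names for whatever package and base class apply:

    import importlib
    import inspect
    import pkgutil

    def discover_plugins(package, base_class) -> dict:
        """Map class name -> class for base_class subclasses in a package."""
        found = {}
        prefix = f"{package.__name__}."
        for _, module_name, ispkg in pkgutil.walk_packages(
            package.__path__, prefix
        ):
            if ispkg:
                continue  # only import leaf modules, not sub-packages
            module = importlib.import_module(module_name)
            for _, obj in inspect.getmembers(module, inspect.isclass):
                if issubclass(obj, base_class) and obj is not base_class:
                    found[obj.__name__] = obj
        return found

    # usage, assuming such a package exists:
    # import myapp.plugins
    # plugins = discover_plugins(myapp.plugins, PluginBase)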
- if inspect.isclass(member_object) and ( - issubclass(member_object, IModule) - and not self.is_abstract_module(member_object) - ): - plugins[member_object.name] = dict( - obj=member_object, - description=member_object.description, - ) + if dir_name == file_name: + yield module_name + + def _should_load_module(self, module_name): + # filter modules based on bootstrapping or blacklist conditions + if self.bootstrap_p2p: + if not self.is_bootstrapping_module(module_name): + return False # keep only the bootstrapping-necessary modules + else: + if self.is_ignored_module(module_name): + return False # ignore blacklisted modules + return True - # Change the order of the blocking module(load it first) + def _import_module(self, module_name): + # try to import the module, otherwise return None + try: + # "level" specifies how importlib should resolve the module + return importlib.import_module(module_name) + except ImportError as e: + print( + f"Something wrong happened while importing the module" + f" {module_name}: {e}" + ) + print(traceback.format_exc()) + return None + + def _load_valid_classes_from_module(self, module, plugins): + # walk through all members of the given module + for member_name, member_object in inspect.getmembers(module): + if inspect.isclass(member_object): + if issubclass( + member_object, IModule + ) and not self.is_abstract_module(member_object): + plugins[member_object.name] = { + "obj": member_object, + "description": member_object.description, + } + return plugins + + def _prioritize_blocking_module(self, plugins): + # change the order of the blocking module (load it first) # so it can receive msgs sent from other modules - if "Blocking" in plugins: - plugins = OrderedDict(plugins) - # last=False to move to the beginning of the dict - plugins.move_to_end("Blocking", last=False) - + if "Blocking" not in plugins: + return plugins + + ordered = OrderedDict(plugins) + ordered.move_to_end( + "Blocking", last=False + ) # last=False to move to the beginning of the dict + plugins.clear() + plugins.update(ordered) + return plugins + + def _start_cyst_module_last(self, plugins): # when cyst starts first, as soon as slips connects to cyst, # cyst sends slips the flows, # but the inputprocess didn't even start yet so the flows are lost - # to fix this, change the order of the CYST module(load it last) - if "cyst" in plugins: - plugins = OrderedDict(plugins) - # last=False to move to the beginning of the dict - plugins.move_to_end("cyst", last=True) - - return plugins, failed_to_load_modules + # to fix this, change the order of the CYST module (load it last) + if "cyst" not in plugins: + return plugins + + ordered = OrderedDict(plugins) + ordered.move_to_end( + "cyst", last=True + ) # last=True to move to the end of the dict + plugins.clear() + plugins.update(ordered) + return plugins def print_disabled_modules(self): print("-" * 27) From 7aa17c3793c4de84c0bbc1295063285008ded528 Mon Sep 17 00:00:00 2001 From: alya Date: Tue, 6 May 2025 18:22:22 +0300 Subject: [PATCH 246/498] slips_utils: add a function to determine whether to use sudo or not according to the docker env --- slips_files/common/slips_utils.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/slips_files/common/slips_utils.py b/slips_files/common/slips_utils.py index aa2216ea2..9abd1ddf4 100644 --- a/slips_files/common/slips_utils.py +++ b/slips_files/common/slips_utils.py @@ -538,6 +538,15 @@ def get_sha256_hash_of_file_contents(self, filename: str): return file_hash.hexdigest() + def 
get_sudo_according_to_env(self) -> str: + """ + Check if running in host or in docker and sets sudo string accordingly. + There's no sudo in docker so we need to execute all commands without it + """ + # This env variable is defined in the Dockerfile + running_in_docker = os.environ.get("IS_IN_A_DOCKER_CONTAINER", False) + return "" if running_in_docker else "sudo " + def is_msg_intended_for(self, message, channel): """ Function to check From cae77d6e3bfa09523628277659548d14eb075d79 Mon Sep 17 00:00:00 2001 From: alya Date: Tue, 6 May 2025 18:23:58 +0300 Subject: [PATCH 247/498] clear blocking chain and exit if --clearblocking is given without starting modules --- modules/blocking/unblocker.py | 3 ++- slips_files/core/helpers/checker.py | 19 +++++++------------ 2 files changed, 9 insertions(+), 13 deletions(-) diff --git a/modules/blocking/unblocker.py b/modules/blocking/unblocker.py index 6633a5db3..ed5e08f1a 100644 --- a/modules/blocking/unblocker.py +++ b/modules/blocking/unblocker.py @@ -13,8 +13,9 @@ class Unblocker(IUnblocker): name = "iptables_unblocker" - def __init__(self, db): + def __init__(self, db, sudo): IUnblocker.__init__(self, db) + self.sudo = sudo self.requests_lock = Lock() self.requests = {} diff --git a/slips_files/core/helpers/checker.py b/slips_files/core/helpers/checker.py index 690de5b64..12e775f1e 100644 --- a/slips_files/core/helpers/checker.py +++ b/slips_files/core/helpers/checker.py @@ -159,7 +159,7 @@ def check_given_flags(self): ): # If the user wants to blocks, we need permission to modify # iptables - print("Run Slips with sudo to enable the blocking module.") + print("Run Slips with sudo to use the blocking module.") self.main.terminate_slips() if self.main.args.clearblocking: @@ -171,23 +171,18 @@ def check_given_flags(self): else: self.delete_blocking_chain() self.main.terminate_slips() + # Check if user want to save and load a db at the same time if self.main.args.save and self.main.args.db: print("Can't use -s and -d together") self.main.terminate_slips() def delete_blocking_chain(self): - # start only the blocking module process and the db - from multiprocessing import Queue, active_children - from modules.blocking.blocking import Blocking - - blocking = Blocking(Queue()) - blocking.start() - blocking._del_slips_blocking_chain() - # kill the blocking module manually because we can't - # run shutdown_gracefully here (not all modules has started) - for child in active_children(): - child.kill() + from modules.blocking.slips_chain_manager import ( + del_slips_blocking_chain, + ) + + del_slips_blocking_chain() def clear_redis_cache(self): redis_cache_default_server_port = 6379 From 5bc18dc6453d67a2cd0152311960c9a476db6825 Mon Sep 17 00:00:00 2001 From: alya Date: Wed, 7 May 2025 02:59:19 +0300 Subject: [PATCH 248/498] input.py: don't drop root privs as they're needed to create zeek dirs inside the output dir that is created by root --- slips_files/core/input.py | 1 - 1 file changed, 1 deletion(-) diff --git a/slips_files/core/input.py b/slips_files/core/input.py index fccb495ef..cac9b9049 100644 --- a/slips_files/core/input.py +++ b/slips_files/core/input.py @@ -903,7 +903,6 @@ def give_profiler(self, line): self.profiler_queue.put(to_send) def main(self): - utils.drop_root_privs() if self.is_running_non_stop: # this thread should be started from run() to get the PID of inputprocess and have shared variables # if it started from __init__() it will have the PID of slips.py therefore, From c0cd6b7a907ac82500b3886aa6af763f02ea824f Mon Sep 17 00:00:00 2001 
From: alya Date: Wed, 7 May 2025 03:16:06 +0300 Subject: [PATCH 249/498] don't drop root privs in all files that require logging to files in the output dir --- modules/update_manager/update_manager.py | 1 - slips_files/core/helpers/filemonitor.py | 2 -- slips_files/core/profiler.py | 1 - slips_files/core/structures/evidence.py | 6 +++++- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/modules/update_manager/update_manager.py b/modules/update_manager/update_manager.py index c6bf0013e..ba8106aa5 100644 --- a/modules/update_manager/update_manager.py +++ b/modules/update_manager/update_manager.py @@ -1765,7 +1765,6 @@ def shutdown_gracefully(self): def pre_main(self): """this method runs only once""" - utils.drop_root_privs() try: # only one instance of slips should be able to update TI files at a time # so this function will only be allowed to run from 1 slips instance. diff --git a/slips_files/core/helpers/filemonitor.py b/slips_files/core/helpers/filemonitor.py index 3ad91723b..de9e9694d 100644 --- a/slips_files/core/helpers/filemonitor.py +++ b/slips_files/core/helpers/filemonitor.py @@ -22,7 +22,6 @@ import json import time from watchdog.events import RegexMatchingEventHandler -from slips_files.common.slips_utils import utils class FileEventHandler(RegexMatchingEventHandler): @@ -31,7 +30,6 @@ class FileEventHandler(RegexMatchingEventHandler): def __init__(self, dir_to_monitor, input_type, db): super().__init__(regexes=self.REGEX) self.dir_to_monitor = dir_to_monitor - utils.drop_root_privs() self.db = db self.input_type = input_type diff --git a/slips_files/core/profiler.py b/slips_files/core/profiler.py index 3c4d59db2..b417bfa2a 100644 --- a/slips_files/core/profiler.py +++ b/slips_files/core/profiler.py @@ -679,7 +679,6 @@ def shutdown_gracefully(self): self.mark_process_as_done_processing() def pre_main(self): - utils.drop_root_privs() client_ips = [str(ip) for ip in self.client_ips] if client_ips: self.print(f"Used client IPs: {green(', '.join(client_ips))}") diff --git a/slips_files/core/structures/evidence.py b/slips_files/core/structures/evidence.py index 0bf6d8524..b4d3424fd 100644 --- a/slips_files/core/structures/evidence.py +++ b/slips_files/core/structures/evidence.py @@ -235,7 +235,11 @@ def __post_init__(self): ) def __repr__(self): - return f"timewindow{self.number}" + return ( + f"timewindow{self.number}, " + f"start_time: {self.start_time}, " + f"end_time: {self.end_time}" + ) class Method(Enum): From 4455138faa50e42fad91d8d2297599cb51db4ced Mon Sep 17 00:00:00 2001 From: alya Date: Wed, 7 May 2025 03:30:06 +0300 Subject: [PATCH 250/498] fix waiting forever for profiler to stop when ctrl c is pressed while running on an interface --- slips/main.py | 2 +- slips_files/common/abstracts/module.py | 15 +++-- slips_files/core/input.py | 80 ++++++++++++++------------ slips_files/core/profiler.py | 1 + 4 files changed, 54 insertions(+), 44 deletions(-) diff --git a/slips/main.py b/slips/main.py index df49ffb97..21b8aa123 100644 --- a/slips/main.py +++ b/slips/main.py @@ -397,7 +397,7 @@ def update_stats(self): f"Evidence: {green(evidence_number)}. " f"Number of IPs seen in the last ({self.twid_width}):" f" {green(modified_ips_in_the_last_tw)}. " - f"Analyzed {flow_per_min} flows/min." + f"Analyzed {green(flow_per_min)} flows/min." 
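green() in the update_stats() hunk above is Slips' own colorizer; the underlying mechanism is just ANSI escape codes on the terminal. A minimal stand-in, not the project's helper:

    def green(txt) -> str:
        # \033[32m switches the foreground to green, \033[0m resets it
        return f"\033[32m{txt}\033[0m"

    print(f"Analyzed {green(42)} flows/min.")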
) self.print(stats) sys.stdout.flush() # Make sure the output is displayed immediately diff --git a/slips_files/common/abstracts/module.py b/slips_files/common/abstracts/module.py index 59568f187..e2128ad7b 100644 --- a/slips_files/common/abstracts/module.py +++ b/slips_files/common/abstracts/module.py @@ -133,13 +133,16 @@ def pre_main(self) -> bool: """ def get_msg(self, channel: str) -> Optional[dict]: - message = self.db.get_message(self.channels[channel]) - if utils.is_msg_intended_for(message, channel): - self.channel_tracker[channel]["msg_received"] = True - self.db.incr_msgs_received_in_channel(self.name, channel) - return message + try: + message = self.db.get_message(self.channels[channel]) + if utils.is_msg_intended_for(message, channel): + self.channel_tracker[channel]["msg_received"] = True + self.db.incr_msgs_received_in_channel(self.name, channel) + return message - self.channel_tracker[channel]["msg_received"] = False + self.channel_tracker[channel]["msg_received"] = False + except KeyboardInterrupt: + return None def print_traceback(self): exception_line = sys.exc_info()[2].tb_lineno diff --git a/slips_files/core/input.py b/slips_files/core/input.py index cac9b9049..b5db653c3 100644 --- a/slips_files/core/input.py +++ b/slips_files/core/input.py @@ -337,49 +337,55 @@ def get_earliest_line(self): return earliest_line, file_with_earliest_flow def read_zeek_files(self) -> int: - self.zeek_files = self.db.get_all_zeek_files() - self.open_file_handlers = {} - # stores zeek_log_file_name: timestamp of the last flow read from - # that file - self.file_time = {} - self.cache_lines = {} - # Try to keep track of when was the last update so we stop this reading - self.last_updated_file_time = datetime.datetime.now() - while not self.should_stop(): - self.check_if_time_to_del_rotated_files() - # Go to all the files generated by Zeek and read 1 - # line from each of them - for filename in self.zeek_files: - if utils.is_ignored_zeek_log_file(filename): - continue - - # reads 1 line from the given file and cache it - # from in self.cache_lines - self.cache_nxt_line_in_file(filename) + try: + self.zeek_files = self.db.get_all_zeek_files() + self.open_file_handlers = {} + # stores zeek_log_file_name: timestamp of the last flow read from + # that file + self.file_time = {} + self.cache_lines = {} + # Try to keep track of when was the last update so we stop this reading + self.last_updated_file_time = datetime.datetime.now() + while not self.should_stop(): + self.check_if_time_to_del_rotated_files() + # Go to all the files generated by Zeek and read 1 + # line from each of them + for filename in self.zeek_files: + if utils.is_ignored_zeek_log_file(filename): + continue + + # reads 1 line from the given file and cache it + # from in self.cache_lines + self.cache_nxt_line_in_file(filename) + + if self.reached_timeout(): + break - if self.reached_timeout(): - break + earliest_line, file_with_earliest_flow = ( + self.get_earliest_line() + ) + if not file_with_earliest_flow: + continue - earliest_line, file_with_earliest_flow = self.get_earliest_line() - if not file_with_earliest_flow: - continue + # self.print(' > Sent Line: {}'.format(earliest_line), 0, 3) - # self.print(' > Sent Line: {}'.format(earliest_line), 0, 3) + self.give_profiler(earliest_line) + self.lines += 1 + # when testing, no need to read the whole file! 
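The try/except added to get_msg() above turns a Ctrl-C during a blocking channel read into a plain "no message" result, letting the caller's loop fall through to its should_stop() check instead of dying mid-read. The pattern in miniature, with queue.Queue standing in for the Redis channel:

    import queue
    from typing import Optional

    def get_msg(q: queue.Queue) -> Optional[dict]:
        try:
            return q.get(timeout=1)
        except queue.Empty:
            return None
        except KeyboardInterrupt:
            # let the main loop notice shutdown instead of crashing here
            return None

    q = queue.Queue()
    q.put({"channel": "new_flow", "data": "..."})
    print(get_msg(q))  # -> the message
    print(get_msg(q))  # -> None, after the 1s timeout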
+ if self.lines == 10 and self.testing: + break + # Delete this line from the cache and the time list + del self.cache_lines[file_with_earliest_flow] + del self.file_time[file_with_earliest_flow] - self.give_profiler(earliest_line) - self.lines += 1 - # when testing, no need to read the whole file! - if self.lines == 10 and self.testing: - break - # Delete this line from the cache and the time list - del self.cache_lines[file_with_earliest_flow] - del self.file_time[file_with_earliest_flow] + # Get the new list of files. Since new files may have been created by + # Zeek while we were processing them. + self.zeek_files = self.db.get_all_zeek_files() - # Get the new list of files. Since new files may have been created by - # Zeek while we were processing them. - self.zeek_files = self.db.get_all_zeek_files() + self.close_all_handles() + except KeyboardInterrupt: + pass - self.close_all_handles() return self.lines def _make_gen(self, reader): diff --git a/slips_files/core/profiler.py b/slips_files/core/profiler.py index b417bfa2a..4a504e566 100644 --- a/slips_files/core/profiler.py +++ b/slips_files/core/profiler.py @@ -715,3 +715,4 @@ def main(self): self.pending_flows_queue_lock.acquire() self.flows_to_process_q.put(msg) self.pending_flows_queue_lock.release() + return None From ff71a47c3ff42971873413937465e5286c4e8944 Mon Sep 17 00:00:00 2001 From: alya Date: Wed, 7 May 2025 03:31:40 +0300 Subject: [PATCH 251/498] Handle starting and shutting down the unblocker thread --- modules/blocking/blocking.py | 10 +++++++++- modules/blocking/unblocker.py | 25 ++++++++++++++++++++----- 2 files changed, 29 insertions(+), 6 deletions(-) diff --git a/modules/blocking/blocking.py b/modules/blocking/blocking.py index c2f19f7e2..b5dda6f31 100644 --- a/modules/blocking/blocking.py +++ b/modules/blocking/blocking.py @@ -35,7 +35,7 @@ def init(self): self.firewall = self._determine_linux_firewall() self.sudo = utils.get_sudo_according_to_env() self._init_chains_in_firewall() - self.unblocker = Unblocker(self.db, self.sudo) + # self.test() def test(self): @@ -191,6 +191,14 @@ def _block_ip(self, ip_to_block: str, flags: Dict[str, str]) -> bool: return blocked + def shutdown_gracefully(self): + self.unblocker.unblocker_thread.join(30) + if self.unblocker.unblocker_thread.is_alive(): + self.print("Problem shutting down unblocker thread.") + + def pre_main(self): + self.unblocker = Unblocker(self.db, self.sudo, self.should_stop) + def main(self): if msg := self.get_msg("new_blocking"): # message['data'] in the new_blocking channel is a dictionary that contains diff --git a/modules/blocking/unblocker.py b/modules/blocking/unblocker.py index ed5e08f1a..f754b31b3 100644 --- a/modules/blocking/unblocker.py +++ b/modules/blocking/unblocker.py @@ -1,7 +1,9 @@ from threading import Lock import time -from typing import Dict +import threading +from typing import Dict, Callable from slips_files.common.abstracts.unblocker import IUnblocker +from slips_files.common.slips_utils import utils from slips_files.core.structures.evidence import TimeWindow @@ -13,11 +15,24 @@ class Unblocker(IUnblocker): name = "iptables_unblocker" - def __init__(self, db, sudo): + def __init__(self, db, sudo, should_stop: Callable): IUnblocker.__init__(self, db) + # this is the blocking module's should_stop method + # the goal is to stop the threads started by this module when the + # blocking module's should_stop returns True + self.should_stop = should_stop self.sudo = sudo self.requests_lock = Lock() self.requests = {} + 
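The unblocker lifecycle handling in PATCH 251 reduces to a common pattern: the worker thread polls a should_stop callable, and shutdown joins with a timeout so a wedged thread cannot hang the module forever. A standalone sketch of that shape, not the module itself:

    import threading
    import time

    stop = threading.Event()

    def worker(should_stop) -> None:
        while not should_stop():
            # ... check pending unblock requests here ...
            time.sleep(1)

    t = threading.Thread(target=worker, args=(stop.is_set,), daemon=True)
    t.start()

    stop.set()           # signal shutdown
    t.join(timeout=30)   # bounded wait, mirroring shutdown_gracefully()
    if t.is_alive():
        print("Problem shutting down worker thread.")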
self._start_checker_thread() + + def _start_checker_thread(self): + self.unblocker_thread = threading.Thread( + target=self._check_if_time_to_unblock, + daemon=True, + name="iptables_unblocker_thread", + ) + utils.start_thread(self.unblocker_thread, self.db) def unblock_request( self, @@ -41,7 +56,7 @@ def _check_if_time_to_unblock(self): in self.requests regularly. Each time a ts is reached, it should call _unblock() """ - while True: + while not self.should_stop(): now = time.time() requests_to_del = [] @@ -67,8 +82,8 @@ def _check_if_time_to_unblock(self): ) self._del_req(ip) - print("@@@@@@@@@@@@@@@@ [_check_if_time_to_unblock] sleeping 5") - time.sleep(5) + print("@@@@@@@@@@@@@@@@ [_check_if_time_to_unblock] sleeping 10") + time.sleep(10) def _add_req( self, ip: str, tw_to_unblock_at: TimeWindow, flags: Dict[str, str] From 8d0a12774c0bf20001fbad1a698a042591f0838f Mon Sep 17 00:00:00 2001 From: alya Date: Wed, 7 May 2025 13:55:00 +0300 Subject: [PATCH 252/498] rename convert_format to convert_ts_format() for clarity --- managers/metadata_manager.py | 2 +- managers/process_manager.py | 2 +- modules/blocking/unblocker.py | 12 ++++++++-- modules/flowalerts/set_evidence.py | 4 +++- modules/leak_detector/leak_detector.py | 2 +- modules/p2ptrust/utils/go_director.py | 6 ++--- modules/rnn_cc_detection/rnn_cc_detection.py | 4 +++- .../threat_intelligence.py | 24 ++++++++++--------- modules/timeline/timeline.py | 2 +- slips/main.py | 4 ++-- slips_files/common/abstracts/unblocker.py | 4 ++-- slips_files/common/idmefv2.py | 6 ++--- slips_files/common/slips_utils.py | 8 +++---- .../core/database/redis_db/alert_handler.py | 2 +- .../core/database/sqlite_db/database.py | 2 +- slips_files/core/evidence_handler.py | 2 +- slips_files/core/helpers/flow_handler.py | 8 +++++-- slips_files/core/input.py | 4 ++-- slips_files/core/input_profilers/nfdump.py | 4 ++-- slips_files/core/input_profilers/suricata.py | 4 ++-- slips_files/core/profiler.py | 2 +- slips_files/core/structures/alerts.py | 8 +++---- slips_files/core/text_formatters/evidence.py | 6 ++--- tests/test_metadata_manager.py | 2 +- tests/test_process_manager.py | 4 ++-- tests/test_redis_manager.py | 2 +- tests/test_slips_utils.py | 4 +++- webinterface/analysis/analysis.py | 4 ++-- 28 files changed, 79 insertions(+), 59 deletions(-) diff --git a/managers/metadata_manager.py b/managers/metadata_manager.py index 746b9366b..ea1075bc0 100644 --- a/managers/metadata_manager.py +++ b/managers/metadata_manager.py @@ -77,7 +77,7 @@ def set_analysis_end_date(self, end_date): if not self.enable_metadata: return - end_date = utils.convert_format(end_date, utils.alerts_format) + end_date = utils.convert_ts_format(end_date, utils.alerts_format) self.main.db.set_input_metadata({"analysis_end": end_date}) # add slips end date in the metadata dir diff --git a/managers/process_manager.py b/managers/process_manager.py index 836f8cd12..26f8d6356 100644 --- a/managers/process_manager.py +++ b/managers/process_manager.py @@ -543,7 +543,7 @@ def get_analysis_time(self) -> Tuple[str, str]: returns analysis_time in minutes and slips end_time as a date """ start_time = self.main.db.get_slips_start_time() - end_time = utils.convert_format(datetime.now(), "unixtimestamp") + end_time = utils.convert_ts_format(datetime.now(), "unixtimestamp") return ( utils.get_time_diff(start_time, end_time, return_type="minutes"), end_time, diff --git a/modules/blocking/unblocker.py b/modules/blocking/unblocker.py index f754b31b3..76b84ab61 100644 --- a/modules/blocking/unblocker.py +++ 
b/modules/blocking/unblocker.py @@ -61,8 +61,8 @@ def _check_if_time_to_unblock(self): requests_to_del = [] for ip, request in self.requests.items(): - ts: float = self.request["tw_to_unblock"].end_time - flags: Dict[str, str] = self.request["flags"] + ts: str = self.request["tw_to_unblock"].end_time + ts: float = utils.convert_ts_format(ts, "unixtimestamp") print( f"@@@@@@@@@@@@@@@@ [_check_if_time_to_unblock]" f" checking if time to unvblock {ip} {request}" @@ -72,6 +72,7 @@ def _check_if_time_to_unblock(self): f"@@@@@@@@@@@@@@@@ time to unblock {ip} in the " f"fw {request}" ) + flags: Dict[str, str] = self.request["flags"] if self._unblock(ip, flags): requests_to_del.append(ip) @@ -93,6 +94,13 @@ def _add_req( :param ts_to_unblock: unix ts to unblock the given ip at """ with self.requests_lock: + ts = utils.convert_ts_format(time.time() + 30, "iso") + tw_to_unblock_at.end_time = ts # @@@@@@@@@@@@@ + # del this + + print( + f"@@@@@@@@@@@@@@@@ tw_to_unblock_at.end_time {tw_to_unblock_at.end_time}" + ) self.requests[ip] = { "tw_to_unblock": tw_to_unblock_at, "flags": flags, diff --git a/modules/flowalerts/set_evidence.py b/modules/flowalerts/set_evidence.py index 5fa7accac..7be9437c4 100644 --- a/modules/flowalerts/set_evidence.py +++ b/modules/flowalerts/set_evidence.py @@ -1315,7 +1315,9 @@ def data_exfiltration( ) -> None: saddr: str = profileid.split("_")[-1] description: str = f"Large data upload. {src_mbs} MBs sent to {daddr}" - timestamp: str = utils.convert_format(timestamp, utils.alerts_format) + timestamp: str = utils.convert_ts_format( + timestamp, utils.alerts_format + ) twid_number = int(twid.replace("timewindow", "")) # to add a correlation the 2 evidence in alerts.json evidence_id_of_dstip_as_the_attacker = str(uuid4()) diff --git a/modules/leak_detector/leak_detector.py b/modules/leak_detector/leak_detector.py index 8eb57bfe7..fa202f70b 100644 --- a/modules/leak_detector/leak_detector.py +++ b/modules/leak_detector/leak_detector.py @@ -218,7 +218,7 @@ def set_evidence_yara_match(self, info: dict): # in which tw is this ts? 
twid = self.db.get_tw_of_ts(profileid, ts) # convert ts to a readable format - ts = utils.convert_format(ts, utils.alerts_format) + ts = utils.convert_ts_format(ts, utils.alerts_format) if not twid: return diff --git a/modules/p2ptrust/utils/go_director.py b/modules/p2ptrust/utils/go_director.py index e9d1132f1..a7ffc0519 100644 --- a/modules/p2ptrust/utils/go_director.py +++ b/modules/p2ptrust/utils/go_director.py @@ -91,7 +91,7 @@ def log(self, text: str): Writes the log text to p2p_reports.log """ now = time.time() - human_readable_datetime = utils.convert_format( + human_readable_datetime = utils.convert_ts_format( now, utils.alerts_format ) self.reports_logfile.write(f"{human_readable_datetime} - {text}\n") @@ -453,7 +453,7 @@ def process_evaluation_score_confidence( # convert ts to human readable format report_info = { "reporter": reporter, - "report_time": utils.convert_format( + "report_time": utils.convert_ts_format( report_time, utils.alerts_format ), } @@ -512,7 +512,7 @@ def set_evidence_p2p_report( # report time to add this evidence to twid = self.db.get_timewindow(timestamp, profileid_of_attacker) - timestamp = utils.convert_format(timestamp, utils.alerts_format) + timestamp = utils.convert_ts_format(timestamp, utils.alerts_format) evidence = Evidence( evidence_type=EvidenceType.P2P_REPORT, attacker=Attacker( diff --git a/modules/rnn_cc_detection/rnn_cc_detection.py b/modules/rnn_cc_detection/rnn_cc_detection.py index aad8f8b53..127f86175 100644 --- a/modules/rnn_cc_detection/rnn_cc_detection.py +++ b/modules/rnn_cc_detection/rnn_cc_detection.py @@ -74,7 +74,9 @@ def set_evidence_cc_channel( f'score: {format(score, ".4f")}.' ) - timestamp: str = utils.convert_format(timestamp, utils.alerts_format) + timestamp: str = utils.convert_ts_format( + timestamp, utils.alerts_format + ) twid_int = int(twid.replace("timewindow", "")) # to add a correlation between the 2 evidence in alerts.json evidence_id_of_dstip_as_the_attacker = str(uuid4()) diff --git a/modules/threat_intelligence/threat_intelligence.py b/modules/threat_intelligence/threat_intelligence.py index dcaddc3b4..3d662b0b7 100644 --- a/modules/threat_intelligence/threat_intelligence.py +++ b/modules/threat_intelligence/threat_intelligence.py @@ -216,7 +216,7 @@ def set_evidence_malicious_asn( profile=ProfileID(ip=saddr), timewindow=TimeWindow(number=twid_int), uid=[uid], - timestamp=utils.convert_format(timestamp, utils.alerts_format), + timestamp=utils.convert_ts_format(timestamp, utils.alerts_format), ) self.db.set_evidence(evidence) @@ -236,7 +236,7 @@ def set_evidence_malicious_asn( profile=ProfileID(ip=daddr), timewindow=TimeWindow(number=twid_int), uid=[uid], - timestamp=utils.convert_format(timestamp, utils.alerts_format), + timestamp=utils.convert_ts_format(timestamp, utils.alerts_format), ) self.db.set_evidence(evidence) @@ -314,7 +314,7 @@ def set_evidence_malicious_ip_in_dns_response( profile=ProfileID(ip=ip), timewindow=TimeWindow(number=twid_int), uid=[uid], - timestamp=utils.convert_format(timestamp, utils.alerts_format), + timestamp=utils.convert_ts_format(timestamp, utils.alerts_format), ) self.db.set_evidence(evidence) @@ -337,7 +337,7 @@ def set_evidence_malicious_ip_in_dns_response( profile=ProfileID(ip=saddr), timewindow=TimeWindow(number=twid_int), uid=[uid], - timestamp=utils.convert_format(timestamp, utils.alerts_format), + timestamp=utils.convert_ts_format(timestamp, utils.alerts_format), ) self.db.set_evidence(evidence) @@ -407,7 +407,7 @@ def set_evidence_conn_from_malicious_ip( 
profile=ProfileID(ip=saddr), timewindow=TimeWindow(number=twid_int), uid=[uid], - timestamp=utils.convert_format(timestamp, utils.alerts_format), + timestamp=utils.convert_ts_format(timestamp, utils.alerts_format), ) self.db.set_evidence(evidence) # mark this ip as malicious in our database @@ -457,7 +457,7 @@ def set_evidence_conn_to_malicious_ip( profile=ProfileID(ip=daddr), timewindow=TimeWindow(number=twid_int), uid=[uid], - timestamp=utils.convert_format(timestamp, utils.alerts_format), + timestamp=utils.convert_ts_format(timestamp, utils.alerts_format), ) self.db.set_evidence(evidence) @@ -477,7 +477,7 @@ def set_evidence_conn_to_malicious_ip( profile=ProfileID(ip=saddr), timewindow=TimeWindow(number=twid_int), uid=[uid], - timestamp=utils.convert_format(timestamp, utils.alerts_format), + timestamp=utils.convert_ts_format(timestamp, utils.alerts_format), ) self.db.set_evidence(evidence) # mark this ip as malicious in our database @@ -571,7 +571,7 @@ def set_evidence_malicious_domain( profile=ProfileID(ip=srcip), timewindow=TimeWindow(number=twid_number), uid=[uid], - timestamp=utils.convert_format(timestamp, utils.alerts_format), + timestamp=utils.convert_ts_format(timestamp, utils.alerts_format), ) self.db.set_evidence(evidence) @@ -598,7 +598,9 @@ def set_evidence_malicious_domain( profile=ProfileID(ip=domain_resolution), timewindow=TimeWindow(number=twid_number), uid=[uid], - timestamp=utils.convert_format(timestamp, utils.alerts_format), + timestamp=utils.convert_ts_format( + timestamp, utils.alerts_format + ), ) self.db.set_evidence(evidence) @@ -1092,7 +1094,7 @@ def set_evidence_malicious_hash(self, file_info: Dict[str, any]): f'Detected by: {file_info["blacklist"]}. ' f"Confidence: {confidence}. " ) - ts = utils.convert_format( + ts = utils.convert_ts_format( file_info["flow"]["starttime"], utils.alerts_format ) twid = TimeWindow( @@ -1579,7 +1581,7 @@ def set_evidence_malicious_cname_in_dns_response( profile=ProfileID(ip=srcip), timewindow=TimeWindow(number=int(twid.replace("timewindow", ""))), uid=[uid], - timestamp=utils.convert_format(timestamp, utils.alerts_format), + timestamp=utils.convert_ts_format(timestamp, utils.alerts_format), ) self.db.set_evidence(evidence) diff --git a/modules/timeline/timeline.py b/modules/timeline/timeline.py index 12cdcadcb..45f8603c8 100644 --- a/modules/timeline/timeline.py +++ b/modules/timeline/timeline.py @@ -41,7 +41,7 @@ def read_configuration(self): def convert_timestamp_to_slips_format(self, timestamp: float) -> str: if self.is_human_timestamp: - timestamp = utils.convert_format(timestamp, utils.alerts_format) + timestamp = utils.convert_ts_format(timestamp, utils.alerts_format) return str(timestamp) def ensure_int_bytes(self, bytes: Any) -> int: diff --git a/slips/main.py b/slips/main.py index 21b8aa123..b00cc8f3d 100644 --- a/slips/main.py +++ b/slips/main.py @@ -205,7 +205,7 @@ def prepare_output_dir(self): ), # get pcap name from path ) # add timestamp to avoid conflicts wlp3s0_2022-03-1_03:55 - ts = utils.convert_format(datetime.now(), "%Y-%m-%d_%H:%M:%S") + ts = utils.convert_ts_format(datetime.now(), "%Y-%m-%d_%H:%M:%S") self.args.output += f"_{ts}/" os.makedirs(self.args.output) @@ -386,7 +386,7 @@ def update_stats(self): return self.last_updated_stats_time = now - now = utils.convert_format(now, "%Y/%m/%d %H:%M:%S") + now = utils.convert_ts_format(now, "%Y/%m/%d %H:%M:%S") modified_ips_in_the_last_tw = self.db.get_modified_ips_in_the_last_tw() profiles_len = self.db.get_profiles_len() evidence_number = 
self.db.get_evidence_number() or 0 diff --git a/slips_files/common/abstracts/unblocker.py b/slips_files/common/abstracts/unblocker.py index f9ddccbc0..b46e84874 100644 --- a/slips_files/common/abstracts/unblocker.py +++ b/slips_files/common/abstracts/unblocker.py @@ -71,8 +71,8 @@ def _calc_unblock_time( f"profile_{ip}", f"timewindow{tw_to_unblock}" ) - tw_start: str = utils.convert_format(tw_start, "iso") - tw_end: str = utils.convert_format(tw_end, "iso") + tw_start: str = utils.convert_ts_format(tw_start, "iso") + tw_end: str = utils.convert_ts_format(tw_end, "iso") return TimeWindow( number=tw_to_unblock, start_time=tw_start, end_time=tw_end diff --git a/slips_files/common/idmefv2.py b/slips_files/common/idmefv2.py index 43ff722f8..cfd90a39d 100644 --- a/slips_files/common/idmefv2.py +++ b/slips_files/common/idmefv2.py @@ -129,10 +129,10 @@ def convert_to_idmef_alert(self, alert: Alert) -> Message: """ try: now = datetime.now(utils.local_tz).isoformat("T") - iso_start_time = utils.convert_format( + iso_start_time = utils.convert_ts_format( alert.timewindow.start_time, "iso" ).replace(" ", "T") - iso_end_time = utils.convert_format( + iso_end_time = utils.convert_ts_format( alert.timewindow.end_time, "iso" ).replace(" ", "T") @@ -184,7 +184,7 @@ def convert_to_idmef_event(self, evidence: Evidence) -> Message: """ try: now = datetime.now(utils.local_tz).isoformat("T") - iso_ts: str = utils.convert_format( + iso_ts: str = utils.convert_ts_format( evidence.timestamp, "iso" ).replace(" ", "T") attacker, attacker_type = self.extract_role_type( diff --git a/slips_files/common/slips_utils.py b/slips_files/common/slips_utils.py index 9abd1ddf4..315af1057 100644 --- a/slips_files/common/slips_utils.py +++ b/slips_files/common/slips_utils.py @@ -315,7 +315,7 @@ def start_thread(self, thread: Thread, db): thread.start() db.store_pid(thread.name, int(thread._native_id)) - def convert_format(self, ts, required_format: str): + def convert_ts_format(self, ts, required_format: str): """ Detects and converts the given ts to the given format PS: it sets iso format datetime in the local timezone @@ -399,7 +399,7 @@ def to_delta(self, time_in_seconds): return timedelta(seconds=int(time_in_seconds)) def get_human_readable_datetime(self) -> str: - return utils.convert_format(datetime.now(), self.alerts_format) + return utils.convert_ts_format(datetime.now(), self.alerts_format) def get_own_ips(self, ret=Dict) -> Union[Dict[str, List[str]], List[str]]: """ @@ -677,7 +677,7 @@ def assert_microseconds(self, ts: str): :param ts: unix ts :return: ts """ - ts = self.convert_format(ts, "unixtimestamp") + ts = self.convert_ts_format(ts, "unixtimestamp") ts = str(ts) # pattern of unix ts with microseconds @@ -698,7 +698,7 @@ def get_aid(self, flow): proto = flow.proto.lower() # aid_hash lib only accepts unix ts - ts = utils.convert_format(flow.starttime, "unixtimestamp") + ts = utils.convert_ts_format(flow.starttime, "unixtimestamp") ts: str = self.assert_microseconds(ts) cases = { diff --git a/slips_files/core/database/redis_db/alert_handler.py b/slips_files/core/database/redis_db/alert_handler.py index 7c12fafcb..74dfab4a7 100644 --- a/slips_files/core/database/redis_db/alert_handler.py +++ b/slips_files/core/database/redis_db/alert_handler.py @@ -450,7 +450,7 @@ def update_past_threat_levels(self, profileid, threat_level, confidence): if the past threat level and confidence are the same as the ones we wanna store, we replace the timestamp only """ - now = utils.convert_format(time.time(), utils.alerts_format) + 
now = utils.convert_ts_format(time.time(), utils.alerts_format) confidence = f"confidence: {confidence}" # this is what we'll be storing in the db, tl, ts, and confidence threat_level_data = (threat_level, now, confidence) diff --git a/slips_files/core/database/sqlite_db/database.py b/slips_files/core/database/sqlite_db/database.py index c3d5062f3..683284675 100644 --- a/slips_files/core/database/sqlite_db/database.py +++ b/slips_files/core/database/sqlite_db/database.py @@ -305,7 +305,7 @@ def add_alert(self, alert: Alert): """ adds an alert to the alerts table """ - now = utils.convert_format(datetime.now(), "unixtimestamp") + now = utils.convert_ts_format(datetime.now(), "unixtimestamp") self.execute( "INSERT OR REPLACE INTO alerts " "(alert_id, ip_alerted, timewindow, tw_start, tw_end, label, alert_time) " diff --git a/slips_files/core/evidence_handler.py b/slips_files/core/evidence_handler.py index 70b9e2582..3047f35bd 100644 --- a/slips_files/core/evidence_handler.py +++ b/slips_files/core/evidence_handler.py @@ -526,7 +526,7 @@ def main(self): timestamp: datetime = utils.convert_to_local_timezone( timestamp ) - flow_datetime = utils.convert_format(timestamp, "iso") + flow_datetime = utils.convert_ts_format(timestamp, "iso") evidence: Evidence = ( self.formatter.add_threat_level_to_evidence_description( diff --git a/slips_files/core/helpers/flow_handler.py b/slips_files/core/helpers/flow_handler.py index 469fa4cbd..2ccb1a7c3 100644 --- a/slips_files/core/helpers/flow_handler.py +++ b/slips_files/core/helpers/flow_handler.py @@ -201,7 +201,9 @@ def handle_smtp(self): def handle_software(self): self.db.add_software_to_profile(self.profileid, self.flow) - epoch_time = utils.convert_format(self.flow.starttime, "unixtimestamp") + epoch_time = utils.convert_ts_format( + self.flow.starttime, "unixtimestamp" + ) self.flow.starttime = epoch_time self.publisher.new_software(self.profileid, self.flow) @@ -220,7 +222,9 @@ def handle_dhcp(self): self.db.store_dhcp_server(self.flow.server_addr) self.db.mark_profile_as_dhcp(self.profileid) - epoch_time = utils.convert_format(self.flow.starttime, "unixtimestamp") + epoch_time = utils.convert_ts_format( + self.flow.starttime, "unixtimestamp" + ) self.flow.starttime = epoch_time self.publisher.new_dhcp(self.profileid, self.flow) diff --git a/slips_files/core/input.py b/slips_files/core/input.py index b5db653c3..36d646d5e 100644 --- a/slips_files/core/input.py +++ b/slips_files/core/input.py @@ -169,7 +169,7 @@ def check_if_time_to_del_rotated_files(self): return False now = float( - utils.convert_format(datetime.datetime.now(), "unixtimestamp") + utils.convert_ts_format(datetime.datetime.now(), "unixtimestamp") ) time_to_delete = now >= self.time_rotated + self.keep_rotated_files_for if time_to_delete: @@ -740,7 +740,7 @@ def remove_old_zeek_files(self): # delete the old log file (the one with the ts) self.to_be_deleted.append(old_log_file) self.time_rotated = float( - utils.convert_format( + utils.convert_ts_format( datetime.datetime.now(), "unixtimestamp" ) ) diff --git a/slips_files/core/input_profilers/nfdump.py b/slips_files/core/input_profilers/nfdump.py index 401013159..016077430 100644 --- a/slips_files/core/input_profilers/nfdump.py +++ b/slips_files/core/input_profilers/nfdump.py @@ -26,8 +26,8 @@ def get_value_at(indx, default_=False): except (IndexError, KeyError): return default_ - starttime = utils.convert_format(get_value_at(0), "unixtimestamp") - endtime = utils.convert_format(get_value_at(1), "unixtimestamp") + starttime = 
utils.convert_ts_format(get_value_at(0), "unixtimestamp") + endtime = utils.convert_ts_format(get_value_at(1), "unixtimestamp") self.flow: NfdumpConn = NfdumpConn( starttime, endtime, diff --git a/slips_files/core/input_profilers/suricata.py b/slips_files/core/input_profilers/suricata.py index eca742192..e5ed50c77 100644 --- a/slips_files/core/input_profilers/suricata.py +++ b/slips_files/core/input_profilers/suricata.py @@ -76,10 +76,10 @@ def get_value_at(field, subfield, default_=False): return default_ if event_type == "flow": - starttime = utils.convert_format( + starttime = utils.convert_ts_format( get_value_at("flow", "start"), "unixtimestamp" ) - endtime = utils.convert_format( + endtime = utils.convert_ts_format( get_value_at("flow", "end"), "unixtimestamp" ) self.flow: SuricataFlow = SuricataFlow( diff --git a/slips_files/core/profiler.py b/slips_files/core/profiler.py index 4a504e566..0d9b11bd2 100644 --- a/slips_files/core/profiler.py +++ b/slips_files/core/profiler.py @@ -128,7 +128,7 @@ def read_configuration(self): def convert_starttime_to_epoch(self, starttime) -> str: try: - return utils.convert_format(starttime, "unixtimestamp") + return utils.convert_ts_format(starttime, "unixtimestamp") except ValueError: self.print( f"We can not recognize time format of " diff --git a/slips_files/core/structures/alerts.py b/slips_files/core/structures/alerts.py index dc6249589..c84e896ac 100644 --- a/slips_files/core/structures/alerts.py +++ b/slips_files/core/structures/alerts.py @@ -78,7 +78,7 @@ def __post_init__(self): # timestamp of the flow causing the last evidence of this alert if not self.last_flow_datetime: last_flow_timestamp: str = self.last_evidence.timestamp - self.last_flow_datetime = utils.convert_format( + self.last_flow_datetime = utils.convert_ts_format( last_flow_timestamp, "iso" ) @@ -103,14 +103,14 @@ def dict_to_alert(alert: dict) -> Alert: ), timewindow=TimeWindow( alert["timewindow"]["number"], - utils.convert_format(alert["timewindow"]["start_time"], "iso"), - utils.convert_format(alert["timewindow"]["end_time"], "iso"), + utils.convert_ts_format(alert["timewindow"]["start_time"], "iso"), + utils.convert_ts_format(alert["timewindow"]["end_time"], "iso"), ), last_evidence=dict_to_evidence(alert["last_evidence"]), accumulated_threat_level=alert.get("accumulated_threat_level"), id=alert.get("id", ""), correl_id=alert.get("correl_id"), - last_flow_datetime=utils.convert_format( + last_flow_datetime=utils.convert_ts_format( alert["last_flow_datetime"], "iso" ), threat_level=ThreatLevel[alert["threat_level"].upper()], diff --git a/slips_files/core/text_formatters/evidence.py b/slips_files/core/text_formatters/evidence.py index 68f53af3c..a1b99ebad 100644 --- a/slips_files/core/text_formatters/evidence.py +++ b/slips_files/core/text_formatters/evidence.py @@ -65,10 +65,10 @@ def get_printable_alert(self, alert: Alert) -> str: aka the start and end time of the timewindow causing the alert """ time_format = "%Y/%m/%d %H:%M:%S" - twid_start_time: str = utils.convert_format( + twid_start_time: str = utils.convert_ts_format( alert.timewindow.start_time, time_format ) - tw_stop_time: str = utils.convert_format( + tw_stop_time: str = utils.convert_ts_format( alert.timewindow.end_time, time_format ) @@ -114,7 +114,7 @@ def format_evidence_for_printing( # Add the timestamp to the alert. 
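The function renamed throughout PATCH 252 detects the incoming timestamp format and converts it to the requested one. A simplified stand-in covering just the unix and iso cases seen in these hunks; the real Slips helper handles more formats and local timezones:

    from datetime import datetime, timezone

    def convert_ts_format(ts, required_format: str):
        # accept a unix timestamp or an ISO 8601 string
        if isinstance(ts, (int, float)):
            dt = datetime.fromtimestamp(ts, tz=timezone.utc)
        else:
            dt = datetime.fromisoformat(str(ts))
        if required_format == "unixtimestamp":
            return dt.timestamp()
        if required_format == "iso":
            return dt.isoformat()
        return dt.strftime(required_format)  # e.g. "%Y/%m/%d %H:%M:%S"

    print(convert_ts_format(0, "iso"))
    print(convert_ts_format("1970-01-01T00:00:00+00:00", "unixtimestamp"))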
# this datetime, the one that is printed, will be of the last # evidence only - readable_datetime: str = utils.convert_format( + readable_datetime: str = utils.convert_ts_format( alert.last_evidence.timestamp, utils.alerts_format ) alert_to_print: str = red(f"{readable_datetime} ") + alert_to_print diff --git a/tests/test_metadata_manager.py b/tests/test_metadata_manager.py index be2af307c..722c3baf1 100644 --- a/tests/test_metadata_manager.py +++ b/tests/test_metadata_manager.py @@ -31,7 +31,7 @@ def test_set_analysis_end_date( enable_metadata ) - utils.convert_format = Mock(return_value=expected_end_date) + utils.convert_ts_format = Mock(return_value=expected_end_date) with patch("builtins.open", create=True) as mock_open: result = metadata_manager.set_analysis_end_date("dummy_end_date") diff --git a/tests/test_process_manager.py b/tests/test_process_manager.py index a8eccff4f..0f232ed4b 100644 --- a/tests/test_process_manager.py +++ b/tests/test_process_manager.py @@ -1,7 +1,7 @@ # SPDX-FileCopyrightText: 2021 Sebastian Garcia # SPDX-License-Identifier: GPL-2.0-only import pytest -from unittest.mock import Mock, patch, MagicMock +from unittest.mock import Mock, patch from managers.process_manager import ProcessManager from tests.module_factory import ModuleFactory from slips_files.common.slips_utils import utils @@ -241,7 +241,7 @@ def test_get_analysis_time( end_date_str, start_time_str, expected_analysis_time ): process_manager = ModuleFactory().create_process_manager_obj() - utils.convert_format = Mock(return_value=end_date_str) + utils.convert_ts_format = Mock(return_value=end_date_str) process_manager.main.db.get_slips_start_time.return_value = start_time_str analysis_time = process_manager.get_analysis_time() diff --git a/tests/test_redis_manager.py b/tests/test_redis_manager.py index 8f8c527ba..9e9627c55 100644 --- a/tests/test_redis_manager.py +++ b/tests/test_redis_manager.py @@ -52,7 +52,7 @@ def test_log_redis_server_pid_normal_ports( redis_manager.main.args.daemon = is_daemon redis_manager.main.args.save = save_db redis_manager.remove_old_logline = Mock() - slips_files.common.slips_utils.utils.convert_format = Mock( + slips_files.common.slips_utils.utils.convert_ts_format = Mock( return_value="Date" ) diff --git a/tests/test_slips_utils.py b/tests/test_slips_utils.py index 1a8fbae87..3c2f86d3c 100644 --- a/tests/test_slips_utils.py +++ b/tests/test_slips_utils.py @@ -185,7 +185,9 @@ def test_calculate_confidence(input_value, expected_output): def test_convert_format(input_value, input_format, expected_output): utils = ModuleFactory().create_utils_obj() utils.local_tz = datetime.timezone.utc - assert utils.convert_format(input_value, input_format) == expected_output + assert ( + utils.convert_ts_format(input_value, input_format) == expected_output + ) @pytest.mark.parametrize( diff --git a/webinterface/analysis/analysis.py b/webinterface/analysis/analysis.py index 00e204e0c..fbc075fab 100644 --- a/webinterface/analysis/analysis.py +++ b/webinterface/analysis/analysis.py @@ -22,8 +22,8 @@ # ---------------------------------------- def ts_to_date(ts, seconds=False): if seconds: - return utils.convert_format(ts, "%Y/%m/%d %H:%M:%S.%f") - return utils.convert_format(ts, "%Y/%m/%d %H:%M:%S") + return utils.convert_ts_format(ts, "%Y/%m/%d %H:%M:%S.%f") + return utils.convert_ts_format(ts, "%Y/%m/%d %H:%M:%S") def get_all_tw_with_ts(profileid): From 2a5096dafd16b8e8c124b966a799917ef7c44dfc Mon Sep 17 00:00:00 2001 From: alya Date: Wed, 7 May 2025 14:04:08 +0300 Subject: [PATCH 
253/498] change the __repr__ of timewindow obj for backwards compatibility --- slips_files/core/database/redis_db/alert_handler.py | 6 ++++++ slips_files/core/database/redis_db/profile_handler.py | 6 ------ slips_files/core/structures/evidence.py | 6 +----- 3 files changed, 7 insertions(+), 11 deletions(-) diff --git a/slips_files/core/database/redis_db/alert_handler.py b/slips_files/core/database/redis_db/alert_handler.py index 74dfab4a7..7353bde2c 100644 --- a/slips_files/core/database/redis_db/alert_handler.py +++ b/slips_files/core/database/redis_db/alert_handler.py @@ -140,6 +140,12 @@ def get_victim(self, profileid, attacker): # the victim is the whole network return "" + def get_tw_start_time(self, profileid, twid): + """Return the time when this TW in this profile was created""" + # We need to encode it to 'search' because the data in the + # sorted set is encoded + return self.r.zscore(f"tws{profileid}", twid.encode("utf-8")) + def get_tw_limits(self, profileid, twid: str) -> Tuple[float, float]: """ returns the timewindow start and endtime diff --git a/slips_files/core/database/redis_db/profile_handler.py b/slips_files/core/database/redis_db/profile_handler.py index aadffc88a..553cb0363 100644 --- a/slips_files/core/database/redis_db/profile_handler.py +++ b/slips_files/core/database/redis_db/profile_handler.py @@ -1157,12 +1157,6 @@ def add_new_tw(self, profileid, timewindow: str, startoftw: float): self.print("Error in addNewTW", 0, 1) self.print(traceback.format_exc(), 0, 1) - def get_tw_start_time(self, profileid, twid): - """Return the time when this TW in this profile was created""" - # We need to encode it to 'search' because the data in the - # sorted set is encoded - return self.r.zscore(f"tws{profileid}", twid.encode("utf-8")) - def get_number_of_tws(self, profileid): """Return the number of tws for this profile id""" return self.r.zcard(f"tws{profileid}") if profileid else False diff --git a/slips_files/core/structures/evidence.py b/slips_files/core/structures/evidence.py index b4d3424fd..0bf6d8524 100644 --- a/slips_files/core/structures/evidence.py +++ b/slips_files/core/structures/evidence.py @@ -235,11 +235,7 @@ def __post_init__(self): ) def __repr__(self): - return ( - f"timewindow{self.number}, " - f"start_time: {self.start_time}, " - f"end_time: {self.end_time}" - ) + return f"timewindow{self.number}" class Method(Enum): From f11137092f8f1b05d0900e0e61bba223a0c92784 Mon Sep 17 00:00:00 2001 From: alya Date: Wed, 7 May 2025 14:20:35 +0300 Subject: [PATCH 254/498] give the unblocker object access to the printer --- modules/blocking/blocking.py | 4 +++- modules/blocking/exec_iptables_cmd.py | 1 + modules/blocking/unblocker.py | 23 ++++++++++++++++------- 3 files changed, 20 insertions(+), 8 deletions(-) diff --git a/modules/blocking/blocking.py b/modules/blocking/blocking.py index b5dda6f31..b6ba2abdd 100644 --- a/modules/blocking/blocking.py +++ b/modules/blocking/blocking.py @@ -197,7 +197,9 @@ def shutdown_gracefully(self): self.print("Problem shutting down unblocker thread.") def pre_main(self): - self.unblocker = Unblocker(self.db, self.sudo, self.should_stop) + self.unblocker = Unblocker( + self.db, self.sudo, self.should_stop, self.logger + ) def main(self): if msg := self.get_msg("new_blocking"): diff --git a/modules/blocking/exec_iptables_cmd.py b/modules/blocking/exec_iptables_cmd.py index 2d5fe08df..69c78fa54 100644 --- a/modules/blocking/exec_iptables_cmd.py +++ b/modules/blocking/exec_iptables_cmd.py @@ -17,6 +17,7 @@ def exec_iptables_command(sudo: 
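get_tw_start_time(), moved in PATCH 253 above, works because each profile keeps its timewindows in a Redis sorted set whose scores are the tw start times, so a zscore lookup returns the start time directly. Illustrated against a throwaway key; the key name is made up here, and running this requires a reachable Redis server:

    import redis

    r = redis.Redis()  # assumes Redis on localhost:6379

    key = "twsprofile_10.0.0.1"  # illustrative key name
    r.zadd(key, {"timewindow1": 0.0, "timewindow2": 3600.0})

    # zscore returns the member's score (the tw start time), or None
    print(r.zscore(key, "timewindow1"))  # -> 0.0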
str, action, ip_to_block, flag, options): f"{sudo}iptables --{action} slipsBlocking {flag} {ip_to_block} " f'-m comment --comment "Slips rule" >/dev/null 2>&1' ) + print(f"@@@@@@@@@@@@@@@@ executing {command}") # Add the options constructed in block_ip or unblock_ip to the # iptables command for key in options.keys(): diff --git a/modules/blocking/unblocker.py b/modules/blocking/unblocker.py index 76b84ab61..c6423d365 100644 --- a/modules/blocking/unblocker.py +++ b/modules/blocking/unblocker.py @@ -3,8 +3,10 @@ import threading from typing import Dict, Callable from slips_files.common.abstracts.unblocker import IUnblocker +from slips_files.common.printer import Printer from slips_files.common.slips_utils import utils from slips_files.core.structures.evidence import TimeWindow +from modules.blocking.exec_iptables_cmd import exec_iptables_command class Unblocker(IUnblocker): @@ -15,17 +17,22 @@ class Unblocker(IUnblocker): name = "iptables_unblocker" - def __init__(self, db, sudo, should_stop: Callable): + def __init__(self, db, sudo, should_stop: Callable, logger): IUnblocker.__init__(self, db) # this is the blocking module's should_stop method # the goal is to stop the threads started by this module when the # blocking module's should_stop returns True self.should_stop = should_stop + self.logger = logger + self.printer = Printer(self.logger, self.name) self.sudo = sudo self.requests_lock = Lock() self.requests = {} self._start_checker_thread() + def print(self, *args, **kwargs): + return self.printer.print(*args, **kwargs) + def _start_checker_thread(self): self.unblocker_thread = threading.Thread( target=self._check_if_time_to_unblock, @@ -61,7 +68,7 @@ def _check_if_time_to_unblock(self): requests_to_del = [] for ip, request in self.requests.items(): - ts: str = self.request["tw_to_unblock"].end_time + ts: str = request["tw_to_unblock"].end_time ts: float = utils.convert_ts_format(ts, "unixtimestamp") print( f"@@@@@@@@@@@@@@@@ [_check_if_time_to_unblock]" @@ -72,7 +79,7 @@ def _check_if_time_to_unblock(self): f"@@@@@@@@@@@@@@@@ time to unblock {ip} in the " f"fw {request}" ) - flags: Dict[str, str] = self.request["flags"] + flags: Dict[str, str] = request["flags"] if self._unblock(ip, flags): requests_to_del.append(ip) @@ -82,7 +89,7 @@ def _check_if_time_to_unblock(self): f"seleting request for {ip}" ) - self._del_req(ip) + self._del_request(ip) print("@@@@@@@@@@@@@@@@ [_check_if_time_to_unblock] sleeping 10") time.sleep(10) @@ -91,7 +98,7 @@ def _add_req( ): """ Add an unblocking request to self.requests - :param ts_to_unblock: unix ts to unblock the given ip at + :param tw_to_unblock_at: unix ts to unblock the given ip at """ with self.requests_lock: ts = utils.convert_ts_format(time.time() + 30, "iso") @@ -147,7 +154,8 @@ def _unblock( unblocked = False # Block traffic from source ip if from_: - unblocked = self.exec_iptables_command( + unblocked = exec_iptables_command( + self.sudo, action="delete", ip_to_block=ip_to_unblock, flag="-s", @@ -156,7 +164,8 @@ def _unblock( # Block traffic to distination ip if to: - unblocked = self.exec_iptables_command( + unblocked = exec_iptables_command( + self.sudo, action="delete", ip_to_block=ip_to_unblock, flag="-d", From e2e5481b4c5eb8418ea1f5be974914363e1ff452 Mon Sep 17 00:00:00 2001 From: alya Date: Wed, 7 May 2025 15:31:57 +0300 Subject: [PATCH 255/498] log blocking and unblocking requests and store them in the database --- modules/blocking/blocking.py | 49 +++++++++---------- modules/blocking/unblocker.py | 42 ++++++++++++++-- 
slips_files/core/database/database_manager.py | 9 ++++ .../core/database/redis_db/alert_handler.py | 13 +++++ 4 files changed, 84 insertions(+), 29 deletions(-) diff --git a/modules/blocking/blocking.py b/modules/blocking/blocking.py index b6ba2abdd..a0675aeef 100644 --- a/modules/blocking/blocking.py +++ b/modules/blocking/blocking.py @@ -7,6 +7,8 @@ import json import subprocess from typing import Dict +import time +from threading import Lock from slips_files.common.abstracts.module import IModule from slips_files.common.slips_utils import utils @@ -35,30 +37,20 @@ def init(self): self.firewall = self._determine_linux_firewall() self.sudo = utils.get_sudo_according_to_env() self._init_chains_in_firewall() + self.blocking_log_path = os.path.join(self.output_dir, "blocking.log") + self.blocking_logfile_lock = Lock() + # clear it + open(self.blocking_log_path, "w").close() - # self.test() - - def test(self): - """For debugging purposes, once we're done with the module we'll - delete it""" - if not self._is_ip_blocked("2.2.0.0"): - blocking_data = { - "ip": "2.2.0.0", - "block": True, - "from": True, - "to": True, - "block_for": 5, - # "dport" : Optional destination port number - # "sport" : Optional source port number - # "protocol" : Optional protocol - } - # Example of passing blocking_data to this module: - blocking_data = json.dumps(blocking_data) - self.db.publish("new_blocking", blocking_data) - self.print("[test] Blocked ip.") - else: - self.print("[test] IP is already blocked") - # self.unblock_ip("2.2.0.0",True,True) + def log(self, text: str): + """Logs the given text to the blocking log file""" + with self.blocking_logfile_lock: + with open(self.blocking_log_path, "a") as f: + now = time.time() + human_readable_datetime = utils.convert_ts_format( + now, utils.alerts_format + ) + f.write(f"{human_readable_datetime} - {text}\n") def _determine_linux_firewall(self): """Returns the currently installed firewall and installs iptables if @@ -175,7 +167,9 @@ def _block_ip(self, ip_to_block: str, flags: Dict[str, str]) -> bool: options=options, ) if blocked: - self.print(f"Blocked all traffic from: {ip_to_block}") + txt = f"Blocked all traffic from: {ip_to_block}" + self.print(txt) + self.log(txt) if to: # Add rule to block traffic to ip_to_block (-d) @@ -187,7 +181,10 @@ def _block_ip(self, ip_to_block: str, flags: Dict[str, str]) -> bool: options=options, ) if blocked: - self.print(f"Blocked all traffic to: {ip_to_block}") + txt = f"Blocked all traffic to: {ip_to_block}" + self.print(txt) + self.log(f"Blocked all traffic to: {ip_to_block}") + self.db.set_blocked_ip(ip_to_block) return blocked @@ -198,7 +195,7 @@ def shutdown_gracefully(self): def pre_main(self): self.unblocker = Unblocker( - self.db, self.sudo, self.should_stop, self.logger + self.db, self.sudo, self.should_stop, self.logger, self.log ) def main(self): diff --git a/modules/blocking/unblocker.py b/modules/blocking/unblocker.py index c6423d365..6dfd10c17 100644 --- a/modules/blocking/unblocker.py +++ b/modules/blocking/unblocker.py @@ -17,15 +17,18 @@ class Unblocker(IUnblocker): name = "iptables_unblocker" - def __init__(self, db, sudo, should_stop: Callable, logger): + def __init__(self, db, sudo, should_stop: Callable, logger, log: Callable): IUnblocker.__init__(self, db) # this is the blocking module's should_stop method # the goal is to stop the threads started by this module when the # blocking module's should_stop returns True self.should_stop = should_stop + # this logger's main purpose is to start the printer 
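+        # (the printer created from this logger is what self.print()
+        # below wraps, so the unblocker reports through slips' printer
+        # instead of writing to stdout directly)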
self.logger = logger self.printer = Printer(self.logger, self.name) self.sudo = sudo + # this log method is used to log unblocking requests to blocking.log + self.log = log self.requests_lock = Lock() self.requests = {} self._start_checker_thread() @@ -81,6 +84,8 @@ def _check_if_time_to_unblock(self): ) flags: Dict[str, str] = request["flags"] if self._unblock(ip, flags): + self._log_successful_unblock(ip) + self.db.del_blocked_ip(ip) requests_to_del.append(ip) for ip in requests_to_del: @@ -93,6 +98,22 @@ def _check_if_time_to_unblock(self): print("@@@@@@@@@@@@@@@@ [_check_if_time_to_unblock] sleeping 10") time.sleep(10) + def _log_successful_unblock(self, ip): + blocking_ts: float = self.db.get_blocking_timestamp(ip) + now = time.time() + blocking_hrs: int = utils.get_time_diff(blocking_ts, now, "hours") + blocking_tws: int = self.db.get_equivalent_tws(blocking_hrs) + printable_blocking_ts = utils.convert_ts_format( + blocking_ts, utils.alerts_format + ) + printable_now = utils.convert_ts_format(now, utils.alerts_format) + txt = ( + f"The blocking of {ip} lasted {blocking_tws} timewindows. " + f"({blocking_hrs}hrs - " + f"From {printable_blocking_ts} to {printable_now})" + ) + self.log(txt) + def _add_req( self, ip: str, tw_to_unblock_at: TimeWindow, flags: Dict[str, str] ): @@ -104,7 +125,6 @@ def _add_req( ts = utils.convert_ts_format(time.time() + 30, "iso") tw_to_unblock_at.end_time = ts # @@@@@@@@@@@@@ # del this - print( f"@@@@@@@@@@@@@@@@ tw_to_unblock_at.end_time {tw_to_unblock_at.end_time}" ) @@ -112,6 +132,13 @@ def _add_req( "tw_to_unblock": tw_to_unblock_at, "flags": flags, } + + self.log( + f"Registered unblocking request to unblock {ip} at the end " + f"of the next timewindow. " + f"Timewindow to unblock: {tw_to_unblock_at} " + f"Timestamp to unblock: {tw_to_unblock_at.end_time}) " + ) print(f"@@@@@@@@@@@@@@@@ added req for {ip} ") from pprint import pp @@ -173,8 +200,17 @@ def _unblock( ) if unblocked: - self.print(f"Unblocked: {ip_to_unblock}") + cur_timewindow = self.db.get_timewindow( + time.time(), f"profile_{ip_to_unblock}" + ) + txt = f"IP {ip_to_unblock} is unblocked in {cur_timewindow}." + self.print(txt) + self.log(txt) print(f"@@@@@@@@@@@@@@@@ unblocked {ip_to_unblock} in the fw") return True + else: + txt = f"An errror occured. 
Unable to unblock {ip_to_unblock}" + self.print(txt) + self.log(txt) return False diff --git a/slips_files/core/database/database_manager.py b/slips_files/core/database/database_manager.py index e8ca3aaf6..0b805976d 100644 --- a/slips_files/core/database/database_manager.py +++ b/slips_files/core/database/database_manager.py @@ -823,6 +823,15 @@ def get_profiled_tw_timeline(self, *args, **kwargs): def mark_profile_as_gateway(self, *args, **kwargs): return self.rdb.mark_profile_as_gateway(*args, **kwargs) + def set_blocked_ip(self, *args, **kwargs): + return self.rdb.set_blocked_ip(*args, **kwargs) + + def get_blocking_timestamp(self, *args, **kwargs): + return self.rdb.is_ip_blocked(*args, **kwargs) + + def del_blocked_ip(self, *args, **kwargs): + return self.rdb.del_blocked_ip(*args, **kwargs) + def set_ipv6_of_profile(self, *args, **kwargs): return self.rdb.set_ipv6_of_profile(*args, **kwargs) diff --git a/slips_files/core/database/redis_db/alert_handler.py b/slips_files/core/database/redis_db/alert_handler.py index 7353bde2c..584817d21 100644 --- a/slips_files/core/database/redis_db/alert_handler.py +++ b/slips_files/core/database/redis_db/alert_handler.py @@ -140,6 +140,19 @@ def get_victim(self, profileid, attacker): # the victim is the whole network return "" + def set_blocked_ip(self, ip: str): + self.r.zadd("blocked_ips", {ip: time.time()}) + + def is_ip_blocked(self, ip: str) -> Optional[float]: + ts = self.r.zscore("blocked_ips", ip) + if ts is not None: + return ts + return None + + def del_blocked_ip(self, ip: str): + # remove ip from the blocked_ips sorted set + self.r.zrem("blocked_ips", ip) + def get_tw_start_time(self, profileid, twid): """Return the time when this TW in this profile was created""" # We need to encode it to 'search' because the data in the From 156d4e188393c70024e7432f100d11b93296643e Mon Sep 17 00:00:00 2001 From: alya Date: Wed, 7 May 2025 16:19:57 +0300 Subject: [PATCH 256/498] if a blocked profile generated more than 1 alert, extend its blocking by 1 timewindow for each one --- modules/blocking/unblocker.py | 35 +++++++++++++++---- .../core/database/redis_db/constants.py | 1 + slips_files/core/evidence_handler.py | 23 ++++++++---- 3 files changed, 46 insertions(+), 13 deletions(-) diff --git a/modules/blocking/unblocker.py b/modules/blocking/unblocker.py index 6dfd10c17..5b12d9f9c 100644 --- a/modules/blocking/unblocker.py +++ b/modules/blocking/unblocker.py @@ -51,13 +51,31 @@ def unblock_request( current_tw: int, flags: Dict[str, str], ): - print(f"@@@@@@@@@@@@@@@@ unblock_request for ip {ip}") - tw_to_unblock_at: TimeWindow = self._calc_unblock_time( - ip, current_tw, how_many_tws_to_block - ) - print( - f"@@@@@@@@@@@@@@@@ unblocking {ip} at the end of {tw_to_unblock_at}" - ) + # if this ip was blocked, and is still setting alerts, how many tws + # to extend its blocking? + extend_blocking_for = 1 + # first check if there's already an unblocking request, if so, + # we extend the blocking 1 more timewindow. + try: + tw_to_unblock_at: TimeWindow = self.requests[ip]["tw_to_unblock"] + tw_to_unblock_at: TimeWindow = self._calc_unblock_time( + ip, + tw_to_unblock_at.number + extend_blocking_for, + how_many_tws_to_block, + ) + self.log( + f"Extending the blocking period for {ip} for" + f" {extend_blocking_for} extra timewindow." 
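+        # e.g. an ip that was going to be unblocked at the end of
+        # timewindow5 and then generates another alert stays blocked
+        # until the end of timewindow6: one extra timewindow per alert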
+ ) + except KeyError: + print(f"@@@@@@@@@@@@@@@@ unblock_request for ip {ip}") + tw_to_unblock_at: TimeWindow = self._calc_unblock_time( + ip, current_tw, how_many_tws_to_block + ) + print( + f"@@@@@@@@@@@@@@@@ unblocking {ip} at the end of {tw_to_unblock_at}" + ) + self._add_req(ip, tw_to_unblock_at, flags) def _check_if_time_to_unblock(self): @@ -101,7 +119,10 @@ def _check_if_time_to_unblock(self): def _log_successful_unblock(self, ip): blocking_ts: float = self.db.get_blocking_timestamp(ip) now = time.time() + blocking_hrs: int = utils.get_time_diff(blocking_ts, now, "hours") + blocking_hrs = round(blocking_hrs, 1) + blocking_tws: int = self.db.get_equivalent_tws(blocking_hrs) printable_blocking_ts = utils.convert_ts_format( blocking_ts, utils.alerts_format diff --git a/slips_files/core/database/redis_db/constants.py b/slips_files/core/database/redis_db/constants.py index 33c15da37..084be9f41 100644 --- a/slips_files/core/database/redis_db/constants.py +++ b/slips_files/core/database/redis_db/constants.py @@ -61,6 +61,7 @@ class Constants: PORT_INFO = "portinfo" DHCP_FLOWS = "DHCP_flows" REDIS_USED_PORT = "port" + # used in the web interface BLOCKED_PROFILES_AND_TWS = "BlockedProfTW" PROFILES = "profiles" NUMBER_OF_ALERTS = "number_of_alerts" diff --git a/slips_files/core/evidence_handler.py b/slips_files/core/evidence_handler.py index 3047f35bd..1f3b8fec3 100644 --- a/slips_files/core/evidence_handler.py +++ b/slips_files/core/evidence_handler.py @@ -379,6 +379,20 @@ def handle_new_alert( """ saves alert details in the db and informs exporting modules about it """ + # like in the firewall + profile_already_blocked: bool = self.db.is_blocked_profile_and_tw( + str(alert.profile), str(alert.timewindow) + ) + + if profile_already_blocked: + print( + f"@@@@@@@@@@@@@@@@ [handle_new_alert] profiler already " + f"blocked and is setting another alert!!! {alert}" + ) + # send another blocking request to extend the blocking period + self.decide_blocking(alert.profile.ip, alert.timewindow) + return + self.db.set_alert(alert, evidence_causing_the_alert) self.send_to_exporting_module(evidence_causing_the_alert) alert_to_print: str = self.formatter.format_evidence_for_printing( @@ -397,6 +411,7 @@ def handle_new_alert( self.db.mark_profile_and_timewindow_as_blocked( str(alert.profile), str(alert.timewindow) ) + self.log_alert(alert, blocked=is_blocked) def decide_blocking( @@ -432,6 +447,8 @@ def decide_blocking( "ip": ip_to_block, "block": True, "tw": timewindow.number, + # block until the end of the next 1 timewindow + "block_for": 1, } blocking_data = json.dumps(blocking_data) self.db.publish("new_blocking", blocking_data) @@ -569,11 +586,6 @@ def main(self): evidence_dict: dict = utils.to_dict(evidence) self.db.publish("report_to_peers", json.dumps(evidence_dict)) - # if the profile was already blocked in - # this twid, we shouldn't alert - profile_already_blocked = self.db.is_blocked_profile_and_tw( - profileid, twid - ) # This is the part to detect if the accumulated # evidence was enough for generating a detection # The detection should be done in attacks per minute. 
@@ -584,7 +596,6 @@ def main(self): if ( accumulated_threat_level >= self.detection_threshold_in_this_width - and not profile_already_blocked ): tw_evidence: Dict[str, Evidence] tw_evidence = self.get_evidence_for_tw(profileid, twid) From 757c0fe8c81e521794b76e8e0d4055632621b886 Mon Sep 17 00:00:00 2001 From: alya Date: Thu, 8 May 2025 14:01:50 +0300 Subject: [PATCH 257/498] host_ip_manager.py: get the host ip using netifaces --- managers/host_ip_manager.py | 32 ++++++++++------------------ slips_files/core/evidence_handler.py | 5 +++-- 2 files changed, 14 insertions(+), 23 deletions(-) diff --git a/managers/host_ip_manager.py b/managers/host_ip_manager.py index b4320b43c..fbaba6644 100644 --- a/managers/host_ip_manager.py +++ b/managers/host_ip_manager.py @@ -1,7 +1,7 @@ # SPDX-FileCopyrightText: 2021 Sebastian Garcia # SPDX-License-Identifier: GPL-2.0-only -import socket import time +import netifaces from typing import ( Set, Optional, @@ -16,29 +16,19 @@ def __init__(self, main): def get_host_ip(self) -> Optional[str]: """ - tries to determine the machine's IP address by creating a UDP - connection to cloudflare - returns ipv4 or ipv6 of the current computer + tries to determine the machine's IP """ - for address_family in (socket.AF_INET, socket.AF_INET6): - try: - s = socket.socket(address_family, socket.SOCK_DGRAM) + interfaces = netifaces.interfaces() - test_address = ( - ("1.1.1.1", 80) - if address_family == socket.AF_INET - else ("2606:4700:4700::1111", 80) - ) - - s.connect(test_address) - ipaddr_check = s.getsockname()[0] - s.close() - return ipaddr_check - except socket.error: + for iface in interfaces: + addrs = netifaces.ifaddresses(iface) + # check for IPv4 address + if netifaces.AF_INET not in addrs: continue - - # neither ipv4 nor ipv6 worked - return None + for addr in addrs[netifaces.AF_INET]: + ip = addr.get("addr") + if ip and not ip.startswith("127."): + return ip def store_host_ip(self) -> Optional[str]: """ diff --git a/slips_files/core/evidence_handler.py b/slips_files/core/evidence_handler.py index 1f3b8fec3..1ef17de7f 100644 --- a/slips_files/core/evidence_handler.py +++ b/slips_files/core/evidence_handler.py @@ -92,6 +92,7 @@ def init(self): utils.change_logfiles_ownership(self.logfile.name, self.UID, self.GID) self.is_running_non_stop = self.db.is_running_non_stop() + self.blocking_module_supported = self.is_blocking_module_supported() # clear output/alerts.json self.jsonfile = self.clean_file(self.output_dir, "alerts.json") @@ -422,7 +423,7 @@ def decide_blocking( returns True if the given IP was blocked by Slips blocking module """ # send ip to the blocking module - if not self.is_blocking_module_supported(): + if not self.blocking_module_supported: print( "@@@@@@@@@@@@@@@@ decide_blocking blocking module " "unsupported" @@ -452,7 +453,7 @@ def decide_blocking( } blocking_data = json.dumps(blocking_data) self.db.publish("new_blocking", blocking_data) - print("@@@@@@@@@@@@@@@@ published st in new_blocking") + print(f"@@@@@@@@@@@@@@@@ published {blocking_data} in new_blocking") return True def increment_attack_counter( From d14c80af776d07422f4b12690b7b9d47130d627e Mon Sep 17 00:00:00 2001 From: alya Date: Thu, 8 May 2025 14:15:03 +0300 Subject: [PATCH 258/498] unblocker: remove debugging logic to test blocking extension --- modules/blocking/unblocker.py | 17 +++++++---------- slips_files/common/abstracts/unblocker.py | 2 +- 2 files changed, 8 insertions(+), 11 deletions(-) diff --git a/modules/blocking/unblocker.py b/modules/blocking/unblocker.py index 
5b12d9f9c..458eb7800 100644
--- a/modules/blocking/unblocker.py
+++ b/modules/blocking/unblocker.py
@@ -57,8 +57,12 @@ def unblock_request(
         # first check if there's already an unblocking request, if so,
         # we extend the blocking 1 more timewindow.
         try:
+            print(
+                f"@@@@@@@@@@@@@@@@ !!!!!!!!!!!!!!!!!!!!!!!! Extending "
+                f"the blockinnnnggg for {ip}"
+            )
             tw_to_unblock_at: TimeWindow = self.requests[ip]["tw_to_unblock"]
-            tw_to_unblock_at: TimeWindow = self._calc_unblock_time(
+            tw_to_unblock_at: TimeWindow = self._get_tw_to_unblock_at(
                 ip,
                 tw_to_unblock_at.number + extend_blocking_for,
                 how_many_tws_to_block,
@@ -69,7 +73,7 @@ def unblock_request(
             )
         except KeyError:
             print(f"@@@@@@@@@@@@@@@@ unblock_request for ip {ip}")
-            tw_to_unblock_at: TimeWindow = self._calc_unblock_time(
+            tw_to_unblock_at: TimeWindow = self._get_tw_to_unblock_at(
                 ip, current_tw, how_many_tws_to_block
            )
            print(
@@ -143,12 +147,6 @@ def _add_req(
         :param tw_to_unblock_at: unix ts to unblock the given ip at
         """
         with self.requests_lock:
-            ts = utils.convert_ts_format(time.time() + 30, "iso")
-            tw_to_unblock_at.end_time = ts  # @@@@@@@@@@@@@
-            # del this
-            print(
-                f"@@@@@@@@@@@@@@@@ tw_to_unblock_at.end_time {tw_to_unblock_at.end_time}"
-            )
             self.requests[ip] = {
                 "tw_to_unblock": tw_to_unblock_at,
                 "flags": flags,
@@ -233,5 +231,4 @@ def _unblock(
             txt = f"An errror occured. Unable to unblock {ip_to_unblock}"
             self.print(txt)
             self.log(txt)
-
-        return False
+            return False
diff --git a/slips_files/common/abstracts/unblocker.py b/slips_files/common/abstracts/unblocker.py
index b46e84874..d13147f07 100644
--- a/slips_files/common/abstracts/unblocker.py
+++ b/slips_files/common/abstracts/unblocker.py
@@ -57,7 +57,7 @@ def _check_if_time_to_unblock(self):
         """a bg thread that unblocks ips once their ts is reached"""
         ...

-    def _calc_unblock_time(
+    def _get_tw_to_unblock_at(
         self, ip: str, cur_tw: int, how_many_tws_to_block: int
     ) -> TimeWindow:
         """

From 090ab9415dc4135e2231c55377cd455929374787 Mon Sep 17 00:00:00 2001
From: alya
Date: Thu, 8 May 2025 14:27:40 +0300
Subject: [PATCH 259/498] unblocker: fix issue comparing the ts to unblock
 with the current ts

---
 modules/blocking/unblocker.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/modules/blocking/unblocker.py b/modules/blocking/unblocker.py
index 458eb7800..c96df83f4 100644
--- a/modules/blocking/unblocker.py
+++ b/modules/blocking/unblocker.py
@@ -99,7 +99,7 @@ def _check_if_time_to_unblock(self):
                     f"@@@@@@@@@@@@@@@@ [_check_if_time_to_unblock]"
                     f" checking if time to unvblock {ip} {request}"
                 )
-                if ts >= now:
+                if now >= ts:
                     print(
                         f"@@@@@@@@@@@@@@@@ time to unblock {ip} in the "
                         f"fw {request}"
@@ -113,7 +113,7 @@ def _check_if_time_to_unblock(self):
             for ip in requests_to_del:
                 print(
                     f"@@@@@@@@@@@@@@@@ [_check_if_time_to_unblock] "
-                    f"seleting request for {ip}"
+                    f"deleting request for {ip}"
                 )
                 self._del_request(ip)

From 19b4c7fb8d8c5696c10b05539feb15c8da0a0af5 Mon Sep 17 00:00:00 2001
From: alya
Date: Thu, 8 May 2025 14:56:26 +0300
Subject: [PATCH 260/498] edit debugging prints

---
 slips_files/core/evidence_handler.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/slips_files/core/evidence_handler.py b/slips_files/core/evidence_handler.py
index 1ef17de7f..e2d2291b2 100644
--- a/slips_files/core/evidence_handler.py
+++ b/slips_files/core/evidence_handler.py
@@ -388,7 +388,7 @@ def handle_new_alert(
         if profile_already_blocked:
             print(
                 f"@@@@@@@@@@@@@@@@ [handle_new_alert] profiler already "
-                f"blocked and is setting another alert!!! 
{alert}" + f"blocked and is setting another alert!!! {alert.profile}" ) # send another blocking request to extend the blocking period self.decide_blocking(alert.profile.ip, alert.timewindow) From 73898b12c70f0a7173aae4c529bc575ea27acc07 Mon Sep 17 00:00:00 2001 From: alya Date: Thu, 8 May 2025 15:04:16 +0300 Subject: [PATCH 261/498] edit debugging prints --- modules/blocking/unblocker.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/modules/blocking/unblocker.py b/modules/blocking/unblocker.py index c96df83f4..8fd5d04a6 100644 --- a/modules/blocking/unblocker.py +++ b/modules/blocking/unblocker.py @@ -57,11 +57,11 @@ def unblock_request( # first check if there's already an unblocking request, if so, # we extend the blocking 1 more timewindow. try: + tw_to_unblock_at: TimeWindow = self.requests[ip]["tw_to_unblock"] print( f"@@@@@@@@@@@@@@@@ !!!!!!!!!!!!!!!!!!!!!!!! Extending " f"the blockinnnnggg for {ip}" ) - tw_to_unblock_at: TimeWindow = self.requests[ip]["tw_to_unblock"] tw_to_unblock_at: TimeWindow = self._get_tw_to_unblock_at( ip, tw_to_unblock_at.number + extend_blocking_for, @@ -117,7 +117,6 @@ def _check_if_time_to_unblock(self): ) self._del_request(ip) - print("@@@@@@@@@@@@@@@@ [_check_if_time_to_unblock] sleeping 10") time.sleep(10) def _log_successful_unblock(self, ip): @@ -158,7 +157,7 @@ def _add_req( f"Timewindow to unblock: {tw_to_unblock_at} " f"Timestamp to unblock: {tw_to_unblock_at.end_time}) " ) - print(f"@@@@@@@@@@@@@@@@ added req for {ip} ") + print(f"@@@@@@@@@@@@@@@@ [_add_req] DONEE. added req for {ip} to ") from pprint import pp pp(self.requests) From 8599dfda711e12cdcee816dfbbb749edb27de92f Mon Sep 17 00:00:00 2001 From: alya Date: Thu, 8 May 2025 15:36:55 +0300 Subject: [PATCH 262/498] fix problem extending the blocking of an already blocked ip --- modules/blocking/blocking.py | 10 +++++++--- modules/blocking/unblocker.py | 4 ++++ slips_files/core/database/redis_db/alert_handler.py | 4 ---- 3 files changed, 11 insertions(+), 7 deletions(-) diff --git a/modules/blocking/blocking.py b/modules/blocking/blocking.py index a0675aeef..5f200f75a 100644 --- a/modules/blocking/blocking.py +++ b/modules/blocking/blocking.py @@ -117,7 +117,7 @@ def _init_chains_in_firewall(self): ) def _is_ip_blocked(self, ip) -> bool: - """Checks if ip is already blocked or not""" + """Checks if ip is already blocked or not using iptables""" command = f"{self.sudo}iptables -L slipsBlocking -v -n" # Execute command result = subprocess.run(command.split(), stdout=subprocess.PIPE) @@ -244,11 +244,15 @@ def main(self): if block: # blocking request blocked = self._block_ip(ip, flags) - if blocked: - print(f"@@@@@@@@@@@@@@@@ all good {ip} is blocked") + if blocked or self._is_ip_blocked(ip): + print( + f"@@@@@@@@@@@@@@@@ calling unblocker for ip {ip} " + f".. whether extend the blocking OR block." + ) self.unblocker.unblock_request( ip, how_many_tws_to_block, tw, flags ) + else: # unblocking request self.unblocker.unblock_request( diff --git a/modules/blocking/unblocker.py b/modules/blocking/unblocker.py index 8fd5d04a6..8e5225f79 100644 --- a/modules/blocking/unblocker.py +++ b/modules/blocking/unblocker.py @@ -51,6 +51,10 @@ def unblock_request( current_tw: int, flags: Dict[str, str], ): + """ + schedules unblocking for th egiven ip for the next timewindow. + and extends the blocking by 1 tw if the given ip is already blocked + """ # if this ip was blocked, and is still setting alerts, how many tws # to extend its blocking? 
extend_blocking_for = 1 diff --git a/slips_files/core/database/redis_db/alert_handler.py b/slips_files/core/database/redis_db/alert_handler.py index 584817d21..8460f9853 100644 --- a/slips_files/core/database/redis_db/alert_handler.py +++ b/slips_files/core/database/redis_db/alert_handler.py @@ -169,11 +169,7 @@ def get_tw_limits(self, profileid, twid: str) -> Tuple[float, float]: # calc the start time of the twid manually based on the first # twid first_twid_start_time: float = self.get_first_flow_time() - print( - f"@@@@@@@@@@@@@@@@ first_twid_start_time {first_twid_start_time}" - ) given_twid: int = int(twid.replace("timewindow", "")) - print(f"@@@@@@@@@@@@@@@@ given_twid {twid} -> {given_twid}") # tws in slips start from 1. # tw1 tw2 tw3 tw4 # 0 ──────┬─────┬──────┬────── From a69242139adeb7988b27d786fdd809eeba6888d1 Mon Sep 17 00:00:00 2001 From: alya Date: Thu, 8 May 2025 16:48:23 +0300 Subject: [PATCH 263/498] keep track of how many extra tws ips are blocked for --- modules/blocking/blocking.py | 41 ++++++--------- modules/blocking/unblocker.py | 75 +++++++++++++++------------- slips_files/core/evidence_handler.py | 3 -- tests/test_blocking.py | 4 +- 4 files changed, 57 insertions(+), 66 deletions(-) diff --git a/modules/blocking/blocking.py b/modules/blocking/blocking.py index 5f200f75a..45bb65123 100644 --- a/modules/blocking/blocking.py +++ b/modules/blocking/blocking.py @@ -27,8 +27,10 @@ class Blocking(IModule): def init(self): self.c1 = self.db.subscribe("new_blocking") + self.c2 = self.db.subscribe("tw_closed") self.channels = { "new_blocking": self.c1, + "tw_closed": self.c2, } if platform.system() == "Darwin": self.print("Mac OS blocking is not supported yet.") @@ -37,6 +39,7 @@ def init(self): self.firewall = self._determine_linux_firewall() self.sudo = utils.get_sudo_according_to_env() self._init_chains_in_firewall() + self.blocked_ips = {} self.blocking_log_path = os.path.join(self.output_dir, "blocking.log") self.blocking_logfile_lock = Lock() # clear it @@ -116,7 +119,7 @@ def _init_chains_in_firewall(self): + "iptables -I FORWARD -j slipsBlocking >/dev/null 2>&1" ) - def _is_ip_blocked(self, ip) -> bool: + def _is_ip_already_blocked(self, ip) -> bool: """Checks if ip is already blocked or not using iptables""" command = f"{self.sudo}iptables -L slipsBlocking -v -n" # Execute command @@ -139,7 +142,7 @@ def _block_ip(self, ip_to_block: str, flags: Dict[str, str]) -> bool: return False # Make sure ip isn't already blocked before blocking - if self._is_ip_blocked(ip_to_block): + if self._is_ip_already_blocked(ip_to_block): return False from_ = flags.get("from_") @@ -185,7 +188,6 @@ def _block_ip(self, ip_to_block: str, flags: Dict[str, str]) -> bool: self.print(txt) self.log(f"Blocked all traffic to: {ip_to_block}") self.db.set_blocked_ip(ip_to_block) - return blocked def shutdown_gracefully(self): @@ -214,7 +216,6 @@ def main(self): # "dport" : Optional destination port number # "sport" : Optional source port number # "protocol" : Optional protocol - # 'block_for': Optional, after this time (in seconds) this ip will be unblocked # } # Example of passing blocking_data to this module: # blocking_data = json.dumps(blocking_data) @@ -226,12 +227,6 @@ def main(self): ip = data.get("ip") tw: int = data.get("tw") block = data.get("block") - # number of tws to block for - # blocking should last until the end of the next - # timewindow by default. 
we'll be blocking in the cur - # timewindow anyways, this number is "how many tws AFTER the - # cur tw to keep this ip blocked in" - how_many_tws_to_block = data.get("block_for", 1) flags = { "from_": data.get("from"), @@ -240,21 +235,15 @@ def main(self): "sport": data.get("sport"), "protocol": data.get("protocol"), } - if block: - # blocking request - blocked = self._block_ip(ip, flags) - if blocked or self._is_ip_blocked(ip): - print( - f"@@@@@@@@@@@@@@@@ calling unblocker for ip {ip} " - f".. whether extend the blocking OR block." - ) - self.unblocker.unblock_request( - ip, how_many_tws_to_block, tw, flags - ) + self._block_ip(ip, flags) + # whether this ip is blocked now, or was already blocked, make an unblocking request to either extend its + # blocking period, or block it until the next timewindow is over. + print( + f"@@@@@@@@@@@@@@@@ calling unblocker for ip {ip} " + f".. whether extend the blocking OR block." + ) + self.unblocker.unblock_request(ip, tw, flags) - else: - # unblocking request - self.unblocker.unblock_request( - ip, how_many_tws_to_block, tw, flags - ) + if msg := self.get_msg("tw_closed"): + self.unblocker.update_requests() diff --git a/modules/blocking/unblocker.py b/modules/blocking/unblocker.py index 8e5225f79..d6dfdb62f 100644 --- a/modules/blocking/unblocker.py +++ b/modules/blocking/unblocker.py @@ -47,44 +47,24 @@ def _start_checker_thread(self): def unblock_request( self, ip: str, - how_many_tws_to_block: int, current_tw: int, flags: Dict[str, str], ): """ - schedules unblocking for th egiven ip for the next timewindow. - and extends the blocking by 1 tw if the given ip is already blocked + schedules unblocking for the given ip for the next timewindow. """ - # if this ip was blocked, and is still setting alerts, how many tws - # to extend its blocking? - extend_blocking_for = 1 - # first check if there's already an unblocking request, if so, - # we extend the blocking 1 more timewindow. - try: - tw_to_unblock_at: TimeWindow = self.requests[ip]["tw_to_unblock"] - print( - f"@@@@@@@@@@@@@@@@ !!!!!!!!!!!!!!!!!!!!!!!! Extending " - f"the blockinnnnggg for {ip}" - ) - tw_to_unblock_at: TimeWindow = self._get_tw_to_unblock_at( - ip, - tw_to_unblock_at.number + extend_blocking_for, - how_many_tws_to_block, - ) - self.log( - f"Extending the blocking period for {ip} for" - f" {extend_blocking_for} extra timewindow." - ) - except KeyError: - print(f"@@@@@@@@@@@@@@@@ unblock_request for ip {ip}") - tw_to_unblock_at: TimeWindow = self._get_tw_to_unblock_at( - ip, current_tw, how_many_tws_to_block - ) - print( - f"@@@@@@@@@@@@@@@@ unblocking {ip} at the end of {tw_to_unblock_at}" - ) + if ip in self.requests: + # ip is already blocked, extend the blocking by 1 tw + tws = self.requests[ip]["block_this_ip_for"] + block_this_ip_for = tws + 1 + else: + # measured in tws + block_this_ip_for = 1 - self._add_req(ip, tw_to_unblock_at, flags) + tw_to_unblock_at: TimeWindow = self._get_tw_to_unblock_at( + ip, current_tw, block_this_ip_for + ) + self._add_req(ip, tw_to_unblock_at, flags, block_this_ip_for) def _check_if_time_to_unblock(self): """ @@ -142,23 +122,48 @@ def _log_successful_unblock(self, ip): ) self.log(txt) + def update_requests(self): + """ + is called whenever a new timewindow starts. (on msgs to tw_closed) + the only purpose of this is to keep track of how many tws the ips in + self.requests will stay blocked for. + it answers this question + "how many extra tws should IP X stay blocked in?" 
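+        e.g. an ip that had 2 extra timewindows left to stay blocked
+        in has 1 left once the timewindow that just closed is counted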
+ """ + new_requests = {} + with self.requests_lock: + for ip, req in self.requests.items(): + new_req = req + new_req["block_this_ip_for"] = req["block_this_ip_for"] - 1 + new_requests[ip] = new_req + + return new_requests + def _add_req( - self, ip: str, tw_to_unblock_at: TimeWindow, flags: Dict[str, str] + self, + ip: str, + tw_to_unblock_at: TimeWindow, + flags: Dict[str, str], + block_this_ip_for: int, ): """ Add an unblocking request to self.requests :param tw_to_unblock_at: unix ts to unblock the given ip at + :param block_this_ip_for: number of following timewindows this ip + will remain blocked in. """ with self.requests_lock: self.requests[ip] = { "tw_to_unblock": tw_to_unblock_at, + "block_this_ip_for": block_this_ip_for, "flags": flags, } + interval = self.requests[ip]["block_this_ip_for"] self.log( f"Registered unblocking request to unblock {ip} at the end " - f"of the next timewindow. " - f"Timewindow to unblock: {tw_to_unblock_at} " + f"of the next timewindow. {tw_to_unblock_at}. IP will be " + f"blocked for {interval} timewindows. " f"Timestamp to unblock: {tw_to_unblock_at.end_time}) " ) print(f"@@@@@@@@@@@@@@@@ [_add_req] DONEE. added req for {ip} to ") diff --git a/slips_files/core/evidence_handler.py b/slips_files/core/evidence_handler.py index e2d2291b2..b7704b083 100644 --- a/slips_files/core/evidence_handler.py +++ b/slips_files/core/evidence_handler.py @@ -448,8 +448,6 @@ def decide_blocking( "ip": ip_to_block, "block": True, "tw": timewindow.number, - # block until the end of the next 1 timewindow - "block_for": 1, } blocking_data = json.dumps(blocking_data) self.db.publish("new_blocking", blocking_data) @@ -651,7 +649,6 @@ def main(self): "block": True, "to": True, "from": True, - "block_for": self.width * 2, # block for 2 timewindows } blocking_data = json.dumps(blocking_data) self.db.publish("new_blocking", blocking_data) diff --git a/tests/test_blocking.py b/tests/test_blocking.py index 263d76d95..36994775a 100644 --- a/tests/test_blocking.py +++ b/tests/test_blocking.py @@ -83,7 +83,7 @@ def test_initialize_chains_in_firewall(): def test_block_ip(): blocking = ModuleFactory().create_blocking_obj() blocking._init_chains_in_firewall() - if not blocking._is_ip_blocked("2.2.0.0"): + if not blocking._is_ip_already_blocked("2.2.0.0"): ip = "2.2.0.0" from_ = True to = True @@ -99,6 +99,6 @@ def test_unblock_ip(): from_ = True to = True # first make sure that it's blocked - if not blocking._is_ip_blocked("2.2.0.0"): + if not blocking._is_ip_already_blocked("2.2.0.0"): assert blocking._block_ip(ip, from_, to) is True assert blocking.unblock_ip(ip, from_, to) is True From 8b2636994eb630bd4f481c39766e6566554f0b68 Mon Sep 17 00:00:00 2001 From: alya Date: Thu, 8 May 2025 17:02:02 +0300 Subject: [PATCH 264/498] unblocker: fix problem updating self.requests when a tw is closed --- modules/blocking/unblocker.py | 31 ++++++++++++------- .../core/database/redis_db/alert_handler.py | 4 --- 2 files changed, 19 insertions(+), 16 deletions(-) diff --git a/modules/blocking/unblocker.py b/modules/blocking/unblocker.py index d6dfdb62f..19770eb34 100644 --- a/modules/blocking/unblocker.py +++ b/modules/blocking/unblocker.py @@ -53,13 +53,25 @@ def unblock_request( """ schedules unblocking for the given ip for the next timewindow. 
""" + print( + f"@@@@@@@@@@@@@@@@ [unblock_request] recvd an unblock request for {ip} in" + f" {current_tw}" + ) if ip in self.requests: # ip is already blocked, extend the blocking by 1 tw tws = self.requests[ip]["block_this_ip_for"] block_this_ip_for = tws + 1 + print( + f"@@@@@@@@@@@@@@@@ [unblock_request] extended the " + f"blocking for ip {ip}" + ) else: # measured in tws block_this_ip_for = 1 + print( + f"@@@@@@@@@@@@@@@@ [unblock_request] first time blocking " + f"for ip {ip}" + ) tw_to_unblock_at: TimeWindow = self._get_tw_to_unblock_at( ip, current_tw, block_this_ip_for @@ -79,13 +91,10 @@ def _check_if_time_to_unblock(self): for ip, request in self.requests.items(): ts: str = request["tw_to_unblock"].end_time ts: float = utils.convert_ts_format(ts, "unixtimestamp") - print( - f"@@@@@@@@@@@@@@@@ [_check_if_time_to_unblock]" - f" checking if time to unvblock {ip} {request}" - ) if now >= ts: print( - f"@@@@@@@@@@@@@@@@ time to unblock {ip} in the " + f"@@@@@@@@@@@@@@@@ [ringringringringggg] time to " + f"unblock {ip} in the " f"fw {request}" ) flags: Dict[str, str] = request["flags"] @@ -95,11 +104,6 @@ def _check_if_time_to_unblock(self): requests_to_del.append(ip) for ip in requests_to_del: - print( - f"@@@@@@@@@@@@@@@@ [_check_if_time_to_unblock] " - f"deleting request for {ip}" - ) - self._del_request(ip) time.sleep(10) @@ -136,8 +140,11 @@ def update_requests(self): new_req = req new_req["block_this_ip_for"] = req["block_this_ip_for"] - 1 new_requests[ip] = new_req + self.requests = new_requests + print("@@@@@@@@@@@@@@@@ tw closed!! requests updatedd!!") + from pprint import pp - return new_requests + pp(self.requests) def _add_req( self, @@ -166,7 +173,7 @@ def _add_req( f"blocked for {interval} timewindows. " f"Timestamp to unblock: {tw_to_unblock_at.end_time}) " ) - print(f"@@@@@@@@@@@@@@@@ [_add_req] DONEE. added req for {ip} to ") + print(f"@@@@@@@@@@@@@@@@ [_add_req] DONEE. added req for {ip} ... 
") from pprint import pp pp(self.requests) diff --git a/slips_files/core/database/redis_db/alert_handler.py b/slips_files/core/database/redis_db/alert_handler.py index 8460f9853..4eba13b72 100644 --- a/slips_files/core/database/redis_db/alert_handler.py +++ b/slips_files/core/database/redis_db/alert_handler.py @@ -178,10 +178,6 @@ def get_tw_limits(self, profileid, twid: str) -> Tuple[float, float]: twid_start_time = first_twid_start_time + ( self.width * (given_twid - 1) ) - print( - f"@@@@@@@@@@@@@@@@ given twid ({twid}) start time" - f" {twid_start_time}" - ) twid_end_time: float = twid_start_time + self.width return twid_start_time, twid_end_time From b91230b9f42ff112fbe9240c130601cc297c2d0e Mon Sep 17 00:00:00 2001 From: alya Date: Thu, 8 May 2025 17:26:39 +0300 Subject: [PATCH 265/498] evidence: if a profiler generates 1+ alerts in the esame tw, log the first one only, and extend the blocking by 1 tw starting from the second one --- config/slips.yaml | 2 +- slips_files/core/evidence_handler.py | 14 ++++++++++---- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/config/slips.yaml b/config/slips.yaml index d758fb9f7..e364806b4 100644 --- a/config/slips.yaml +++ b/config/slips.yaml @@ -27,7 +27,7 @@ parameters: # time_window_width : 300 # For 1 hour # time_window_width : 3600 - time_window_width: 3600 + time_window_width: 30 # For 1 day # time_window_width = 86400 # Make Slips use only one time window (also like if no TW is used) diff --git a/slips_files/core/evidence_handler.py b/slips_files/core/evidence_handler.py index b7704b083..f532edce2 100644 --- a/slips_files/core/evidence_handler.py +++ b/slips_files/core/evidence_handler.py @@ -379,22 +379,28 @@ def handle_new_alert( ): """ saves alert details in the db and informs exporting modules about it + + if a profile already generated an alert in this tw, we send a + blocking request (to extend its blocking period), and log the alert + in the db only, without printing it to cli. """ + + self.db.set_alert(alert, evidence_causing_the_alert) + self.decide_blocking(alert.profile.ip, alert.timewindow) # like in the firewall profile_already_blocked: bool = self.db.is_blocked_profile_and_tw( str(alert.profile), str(alert.timewindow) ) - if profile_already_blocked: print( f"@@@@@@@@@@@@@@@@ [handle_new_alert] profiler already " f"blocked and is setting another alert!!! {alert.profile}" ) - # send another blocking request to extend the blocking period - self.decide_blocking(alert.profile.ip, alert.timewindow) + + # that's it, dont keep logging new alerts if 1 alerts is logged + # in this tw. 
return - self.db.set_alert(alert, evidence_causing_the_alert) self.send_to_exporting_module(evidence_causing_the_alert) alert_to_print: str = self.formatter.format_evidence_for_printing( alert, evidence_causing_the_alert From 9f3f2d719d8ae607f1769ed437b873e7b3f4546d Mon Sep 17 00:00:00 2001 From: alya Date: Thu, 8 May 2025 17:47:59 +0300 Subject: [PATCH 266/498] evidence: fix problem getting evidence that were part of a past alert --- slips_files/core/database/redis_db/alert_handler.py | 3 +-- slips_files/core/evidence_handler.py | 13 +++++++------ 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/slips_files/core/database/redis_db/alert_handler.py b/slips_files/core/database/redis_db/alert_handler.py index 4eba13b72..bba2a2514 100644 --- a/slips_files/core/database/redis_db/alert_handler.py +++ b/slips_files/core/database/redis_db/alert_handler.py @@ -59,7 +59,6 @@ def set_evidence_causing_alert(self, alert: Alert): """ When we have a bunch of evidence causing an alert, we associate all evidence IDs with the alert ID in our database - this function stores evidence in 'alerts_profile_twid' key only """ old_profileid_twid_alerts: Dict[str, List[str]] @@ -368,7 +367,7 @@ def get_profileid_twid_alerts( ) -> Dict[str, List[str]]: """ The format for the returned dict is - {profile123_twid1_: [ev_uuid1, ev_uuid2, ev_uuid3]} + {: [ev_uuid1, ev_uuid2, ev_uuid3]} """ alerts: str = self.r.hget(f"{profileid}_{twid}", "alerts") if not alerts: diff --git a/slips_files/core/evidence_handler.py b/slips_files/core/evidence_handler.py index f532edce2..4fc6b10eb 100644 --- a/slips_files/core/evidence_handler.py +++ b/slips_files/core/evidence_handler.py @@ -246,12 +246,13 @@ def get_evidence_that_were_part_of_a_past_alert( given timewindow """ past_alerts: dict = self.db.get_profileid_twid_alerts(profileid, twid) - try: - past_evidence_ids = list(past_alerts.values())[0] - past_evidence_ids: List[str] = json.loads(past_evidence_ids) - except IndexError: - # no past evidence - past_evidence_ids = [] + + past_evidence_ids = [] + if past_alerts: + for evidence_id_list in list(past_alerts.values()): + evidence_id_list: List[str] = json.loads(evidence_id_list) + past_evidence_ids += evidence_id_list + return past_evidence_ids def is_evidence_done_by_others(self, evidence: Evidence) -> bool: From e3db63b7844e5a38a88951bcfa2099fc59a40453 Mon Sep 17 00:00:00 2001 From: alya Date: Thu, 8 May 2025 17:56:08 +0300 Subject: [PATCH 267/498] remove debugging prints --- .secrets.baseline | 6 +++--- config/slips.yaml | 5 ++--- modules/blocking/blocking.py | 4 ---- modules/blocking/exec_iptables_cmd.py | 1 - modules/blocking/unblocker.py | 20 -------------------- slips_files/core/evidence_handler.py | 13 ------------- 6 files changed, 5 insertions(+), 44 deletions(-) diff --git a/.secrets.baseline b/.secrets.baseline index 37fe2abcb..fc1ac4872 100644 --- a/.secrets.baseline +++ b/.secrets.baseline @@ -149,14 +149,14 @@ "filename": "config/slips.yaml", "hashed_secret": "4cac50cee3ad8e462728e711eac3e670753d5016", "is_verified": false, - "line_number": 224 + "line_number": 223 }, { "type": "Secret Keyword", "filename": "config/slips.yaml", "hashed_secret": "d033e22ae348aeb5660fc2140aec35850c4da997", "is_verified": false, - "line_number": 394 + "line_number": 393 } ], "dataset/test14-malicious-zeek-dir/http.log": [ @@ -7192,5 +7192,5 @@ } ] }, - "generated_at": "2025-02-13T22:47:52Z" + "generated_at": "2025-05-08T14:51:28Z" } diff --git a/config/slips.yaml b/config/slips.yaml index e364806b4..02adc7f1b 100644 --- 
a/config/slips.yaml +++ b/config/slips.yaml @@ -26,8 +26,7 @@ parameters: # For 5 min # time_window_width : 300 # For 1 hour - # time_window_width : 3600 - time_window_width: 30 + time_window_width: 3600 # For 1 day # time_window_width = 86400 # Make Slips use only one time window (also like if no TW is used) @@ -182,7 +181,7 @@ detection: # - 0.43: Use this threshold If you want Slips to be insensitive. # Using this means Slips will need so many evidence to trigger an alert # May lead to false negatives - evidence_detection_threshold: 0.08 + evidence_detection_threshold: 0.25 # Make Slips pop up alerts? Both Linux and Macos popup_alerts: false diff --git a/modules/blocking/blocking.py b/modules/blocking/blocking.py index 45bb65123..53640b3f3 100644 --- a/modules/blocking/blocking.py +++ b/modules/blocking/blocking.py @@ -239,10 +239,6 @@ def main(self): self._block_ip(ip, flags) # whether this ip is blocked now, or was already blocked, make an unblocking request to either extend its # blocking period, or block it until the next timewindow is over. - print( - f"@@@@@@@@@@@@@@@@ calling unblocker for ip {ip} " - f".. whether extend the blocking OR block." - ) self.unblocker.unblock_request(ip, tw, flags) if msg := self.get_msg("tw_closed"): diff --git a/modules/blocking/exec_iptables_cmd.py b/modules/blocking/exec_iptables_cmd.py index 69c78fa54..2d5fe08df 100644 --- a/modules/blocking/exec_iptables_cmd.py +++ b/modules/blocking/exec_iptables_cmd.py @@ -17,7 +17,6 @@ def exec_iptables_command(sudo: str, action, ip_to_block, flag, options): f"{sudo}iptables --{action} slipsBlocking {flag} {ip_to_block} " f'-m comment --comment "Slips rule" >/dev/null 2>&1' ) - print(f"@@@@@@@@@@@@@@@@ executing {command}") # Add the options constructed in block_ip or unblock_ip to the # iptables command for key in options.keys(): diff --git a/modules/blocking/unblocker.py b/modules/blocking/unblocker.py index 19770eb34..d119e6fd9 100644 --- a/modules/blocking/unblocker.py +++ b/modules/blocking/unblocker.py @@ -53,25 +53,13 @@ def unblock_request( """ schedules unblocking for the given ip for the next timewindow. """ - print( - f"@@@@@@@@@@@@@@@@ [unblock_request] recvd an unblock request for {ip} in" - f" {current_tw}" - ) if ip in self.requests: # ip is already blocked, extend the blocking by 1 tw tws = self.requests[ip]["block_this_ip_for"] block_this_ip_for = tws + 1 - print( - f"@@@@@@@@@@@@@@@@ [unblock_request] extended the " - f"blocking for ip {ip}" - ) else: # measured in tws block_this_ip_for = 1 - print( - f"@@@@@@@@@@@@@@@@ [unblock_request] first time blocking " - f"for ip {ip}" - ) tw_to_unblock_at: TimeWindow = self._get_tw_to_unblock_at( ip, current_tw, block_this_ip_for @@ -92,11 +80,6 @@ def _check_if_time_to_unblock(self): ts: str = request["tw_to_unblock"].end_time ts: float = utils.convert_ts_format(ts, "unixtimestamp") if now >= ts: - print( - f"@@@@@@@@@@@@@@@@ [ringringringringggg] time to " - f"unblock {ip} in the " - f"fw {request}" - ) flags: Dict[str, str] = request["flags"] if self._unblock(ip, flags): self._log_successful_unblock(ip) @@ -141,7 +124,6 @@ def update_requests(self): new_req["block_this_ip_for"] = req["block_this_ip_for"] - 1 new_requests[ip] = new_req self.requests = new_requests - print("@@@@@@@@@@@@@@@@ tw closed!! requests updatedd!!") from pprint import pp pp(self.requests) @@ -173,7 +155,6 @@ def _add_req( f"blocked for {interval} timewindows. " f"Timestamp to unblock: {tw_to_unblock_at.end_time}) " ) - print(f"@@@@@@@@@@@@@@@@ [_add_req] DONEE. 
added req for {ip} ... ") from pprint import pp pp(self.requests) @@ -240,7 +221,6 @@ def _unblock( txt = f"IP {ip_to_unblock} is unblocked in {cur_timewindow}." self.print(txt) self.log(txt) - print(f"@@@@@@@@@@@@@@@@ unblocked {ip_to_unblock} in the fw") return True else: txt = f"An errror occured. Unable to unblock {ip_to_unblock}" diff --git a/slips_files/core/evidence_handler.py b/slips_files/core/evidence_handler.py index 4fc6b10eb..efa838691 100644 --- a/slips_files/core/evidence_handler.py +++ b/slips_files/core/evidence_handler.py @@ -393,11 +393,6 @@ def handle_new_alert( str(alert.profile), str(alert.timewindow) ) if profile_already_blocked: - print( - f"@@@@@@@@@@@@@@@@ [handle_new_alert] profiler already " - f"blocked and is setting another alert!!! {alert.profile}" - ) - # that's it, dont keep logging new alerts if 1 alerts is logged # in this tw. return @@ -431,10 +426,6 @@ def decide_blocking( """ # send ip to the blocking module if not self.blocking_module_supported: - print( - "@@@@@@@@@@@@@@@@ decide_blocking blocking module " - "unsupported" - ) return False # now since this source ip(profileid) caused an alert, # it means it caused so many evidence(attacked others a lot) @@ -442,9 +433,6 @@ def decide_blocking( # First, Make sure we don't block our own IP if ip_to_block in self.our_ips: - print( - f"@@@@@@@@@@@@@@@@ decide_blocking thats own ip! {ip_to_block}" - ) return False # TODO: edit the options here. by default it'll block @@ -458,7 +446,6 @@ def decide_blocking( } blocking_data = json.dumps(blocking_data) self.db.publish("new_blocking", blocking_data) - print(f"@@@@@@@@@@@@@@@@ published {blocking_data} in new_blocking") return True def increment_attack_counter( From b9da2164936038f3ed3efe36be4c267679c7f737 Mon Sep 17 00:00:00 2001 From: alya Date: Thu, 8 May 2025 18:41:15 +0300 Subject: [PATCH 268/498] update blocking module unit tests --- modules/blocking/blocking.py | 15 +- modules/blocking/exec_iptables_cmd.py | 2 +- modules/blocking/slips_chain_manager.py | 14 +- slips_files/common/slips_utils.py | 2 +- tests/module_factory.py | 9 +- tests/test_blocking.py | 229 ++++++++++++++++++------ 6 files changed, 199 insertions(+), 72 deletions(-) diff --git a/modules/blocking/blocking.py b/modules/blocking/blocking.py index 53640b3f3..2147f06cc 100644 --- a/modules/blocking/blocking.py +++ b/modules/blocking/blocking.py @@ -43,7 +43,10 @@ def init(self): self.blocking_log_path = os.path.join(self.output_dir, "blocking.log") self.blocking_logfile_lock = Lock() # clear it - open(self.blocking_log_path, "w").close() + try: + open(self.blocking_log_path, "w").close() + except FileNotFoundError: + pass def log(self, text: str): """Logs the given text to the blocking log file""" @@ -87,7 +90,7 @@ def _init_chains_in_firewall(self): # self.delete_iptables_chain() self.print('Executing "sudo iptables -N slipsBlocking"', 6, 0) # Add a new chain to iptables - os.system(f"{self.sudo}iptables -N slipsBlocking >/dev/null 2>&1") + os.system(f"{self.sudo} iptables -N slipsBlocking >/dev/null 2>&1") # Check if we're already redirecting to slipsBlocking chain input_chain_rules = self._get_cmd_output( @@ -106,22 +109,22 @@ def _init_chains_in_firewall(self): if "slipsBlocking" not in input_chain_rules: os.system( self.sudo - + "iptables -I INPUT -j slipsBlocking >/dev/null 2>&1" + + " iptables -I INPUT -j slipsBlocking >/dev/null 2>&1" ) if "slipsBlocking" not in output_chain_rules: os.system( self.sudo - + "iptables -I OUTPUT -j slipsBlocking >/dev/null 2>&1" + + " iptables -I 
OUTPUT -j slipsBlocking >/dev/null 2>&1" ) if "slipsBlocking" not in forward_chain_rules: os.system( self.sudo - + "iptables -I FORWARD -j slipsBlocking >/dev/null 2>&1" + + " iptables -I FORWARD -j slipsBlocking >/dev/null 2>&1" ) def _is_ip_already_blocked(self, ip) -> bool: """Checks if ip is already blocked or not using iptables""" - command = f"{self.sudo}iptables -L slipsBlocking -v -n" + command = f"{self.sudo} iptables -L slipsBlocking -v -n" # Execute command result = subprocess.run(command.split(), stdout=subprocess.PIPE) result = result.stdout.decode("utf-8") diff --git a/modules/blocking/exec_iptables_cmd.py b/modules/blocking/exec_iptables_cmd.py index 2d5fe08df..704c34a18 100644 --- a/modules/blocking/exec_iptables_cmd.py +++ b/modules/blocking/exec_iptables_cmd.py @@ -14,7 +14,7 @@ def exec_iptables_command(sudo: str, action, ip_to_block, flag, options): """ command = ( - f"{sudo}iptables --{action} slipsBlocking {flag} {ip_to_block} " + f"{sudo} iptables --{action} slipsBlocking {flag} {ip_to_block} " f'-m comment --comment "Slips rule" >/dev/null 2>&1' ) # Add the options constructed in block_ip or unblock_ip to the diff --git a/modules/blocking/slips_chain_manager.py b/modules/blocking/slips_chain_manager.py index 50b8abc7d..ff670c697 100644 --- a/modules/blocking/slips_chain_manager.py +++ b/modules/blocking/slips_chain_manager.py @@ -11,7 +11,9 @@ def _chain_exists() -> bool: # check if slipsBlocking chain exists before flushing it and suppress # stderr and stdout while checking # 0 means it exists - return os.system(f"{sudo}iptables -nvL slipsBlocking >/dev/null 2>&1") == 0 + return ( + os.system(f"{sudo} iptables -nvL slipsBlocking >/dev/null 2>&1") == 0 + ) def del_slips_blocking_chain() -> bool: @@ -24,17 +26,15 @@ def del_slips_blocking_chain() -> bool: # Delete all references to slipsBlocking inserted in INPUT OUTPUT # and FORWARD before deleting the chain cmd = ( - f"{sudo}iptables -D INPUT -j slipsBlocking " - f">/dev/null 2>&1 ; {sudo}iptables -D OUTPUT " - f"-j slipsBlocking >/dev/null 2>&1 ; " - f"{sudo}iptables -D FORWARD -j " - f"slipsBlocking >/dev/null 2>&1" + f"{sudo} iptables -D INPUT -j slipsBlocking >/dev/null 2>&1 ;" + f" {sudo} iptables -D OUTPUT -j slipsBlocking >/dev/null 2>&1 ; " + f"{sudo} iptables -D FORWARD -j slipsBlocking >/dev/null 2>&1" ) os.system(cmd) # flush and delete all the rules in slipsBlocking cmd = ( - f"{sudo}iptables -F slipsBlocking >/dev/null 2>&1 ; " + f"{sudo} iptables -F slipsBlocking >/dev/null 2>&1 ; " f"{sudo} iptables -X slipsBlocking >/dev/null 2>&1" ) os.system(cmd) diff --git a/slips_files/common/slips_utils.py b/slips_files/common/slips_utils.py index 315af1057..a9af32aaa 100644 --- a/slips_files/common/slips_utils.py +++ b/slips_files/common/slips_utils.py @@ -545,7 +545,7 @@ def get_sudo_according_to_env(self) -> str: """ # This env variable is defined in the Dockerfile running_in_docker = os.environ.get("IS_IN_A_DOCKER_CONTAINER", False) - return "" if running_in_docker else "sudo " + return "" if running_in_docker else "sudo" def is_msg_intended_for(self, message, channel): """ diff --git a/tests/module_factory.py b/tests/module_factory.py index 9485bd7ee..09e3ff0bc 100644 --- a/tests/module_factory.py +++ b/tests/module_factory.py @@ -214,6 +214,8 @@ def create_blocking_obj(self, mock_db): ) # override the print function to avoid broken pipes blocking.print = Mock() + blocking.blocking_log_path = Mock() + blocking.unblocker = Mock() return blocking @patch(MODULE_DB_MANAGER, name="mock_db") @@ -679,9 +681,12 @@ 
def create_profile_handler_obj(self): def create_process_manager_obj(self): main_mock = Mock() main_mock.conf.get_disabled_modules.return_value = [] - #main_mock.conf.get_bootstrapping_setting.return_value = (False, []) + # main_mock.conf.get_bootstrapping_setting.return_value = (False, []) main_mock.conf.is_bootstrapping_node.return_value = False - main_mock.conf.get_bootstrapping_modules.return_value = ["fidesModule", "irisModule"] + main_mock.conf.get_bootstrapping_modules.return_value = [ + "fidesModule", + "irisModule", + ] main_mock.input_type = "pcap" main_mock.mode = "normal" main_mock.stdout = "" diff --git a/tests/test_blocking.py b/tests/test_blocking.py index 36994775a..92e119e38 100644 --- a/tests/test_blocking.py +++ b/tests/test_blocking.py @@ -3,12 +3,17 @@ """Unit test for modules/blocking/blocking.py this file needs sudoroot to run """ - -from tests.common_test_utils import IS_IN_A_DOCKER_CONTAINER from tests.module_factory import ModuleFactory -import platform +import subprocess +from unittest.mock import patch import pytest +import json + +import platform import os +from unittest.mock import call +from unittest import mock +from tests.common_test_utils import IS_IN_A_DOCKER_CONTAINER def has_netadmin_cap(): @@ -42,63 +47,177 @@ def has_netadmin_cap(): ) -@linuxOS -@isroot -@has_net_admin_cap -def is_slipschain_initialized() -> bool: +def test_init_chains_in_firewall(): blocking = ModuleFactory().create_blocking_obj() - output = blocking._get_cmd_output(f"{blocking.sudo} iptables -S") - rules = [ - "-A INPUT -j slipsBlocking", - "-A FORWARD -j slipsBlocking", - "-A OUTPUT -j slipsBlocking", - ] - return all(rule in output for rule in rules) - - -@linuxOS -@isroot -@has_net_admin_cap -def test_initialize_chains_in_firewall(): + with patch("os.system") as mock_system, patch.object( + blocking.__class__, "_get_cmd_output" + ) as mock_get_output: + + # simulate slipsBlocking not in any chain + mock_get_output.side_effect = ["", "", ""] # input, output, forward + + blocking._init_chains_in_firewall() + + # ensure the chain is created + mock_system.assert_any_call( + f"{blocking.sudo} iptables -N slipsBlocking >/dev/null 2>&1" + ) + + # ensure the redirections are added + expected_calls = [ + call( + f"{blocking.sudo} iptables -I INPUT -j " + f"slipsBlocking >/dev/null 2>&1" + ), + call( + f"{blocking.sudo} iptables -I OUTPUT -j " + f"slipsBlocking >/dev/null 2>&1" + ), + call( + f"{blocking.sudo} iptables -I FORWARD -j " + f"slipsBlocking >/dev/null 2>&1" + ), + ] + mock_system.assert_has_calls(expected_calls, any_order=True) + + # ensure _get_cmd_output was called with correct chain checks + mock_get_output.assert_has_calls( + [ + call(f"{blocking.sudo} iptables -nvL INPUT"), + call(f"{blocking.sudo} iptables -nvL OUTPUT"), + call(f"{blocking.sudo} iptables -nvL FORWARD"), + ] + ) + + +def test_is_ip_already_blocked(): blocking = ModuleFactory().create_blocking_obj() - # manually set the firewall - blocking.firewall = "iptables" - blocking._init_chains_in_firewall() - assert is_slipschain_initialized() is True + # define the fake output that subprocess.run should return + fake_output = "Chain slipsBlocking (1 references)\n target prot opt source destination\n REJECT all -- 192.168.1.100 anywhere" + + # mock subprocess.run to return the fake output + with mock.patch("subprocess.run") as mock_run: + mock_run.return_value.stdout = fake_output.encode("utf-8") + ip = "192.168.1.100" -# todo -# def test_delete_slipsBlocking_chain(): -# blocking = 
ModuleFactory().create_blocking_obj() -# # first make sure they are initialized -# if not is_slipschain_initialized(output_queue): -# blocking.initialize_chains_in_firewall() -# os.system('./slips.py -cb') -# assert is_slipschain_initialized(output_queue) == False + result = blocking._is_ip_already_blocked(ip) + # assert the result is True because the IP is in the fake output + assert result is True -@linuxOS -@isroot -@has_net_admin_cap -def test_block_ip(): + # assert subprocess.run was called with the correct command + mock_run.assert_called_once_with( + ["sudo", "iptables", "-L", "slipsBlocking", "-v", "-n"], + stdout=subprocess.PIPE, + ) + + +@pytest.mark.parametrize( + "ip,flags,already_blocked,expected", + [ + ("192.168.1.10", {}, False, True), # normal block + ("192.168.1.10", {"from_": True}, False, True), # only from + ("192.168.1.10", {"to": True}, False, True), # only to + ("192.168.1.10", {}, True, False), # already blocked + (None, {}, False, False), # invalid ip type + ], +) +def test_block_ip(ip, flags, already_blocked, expected): blocking = ModuleFactory().create_blocking_obj() - blocking._init_chains_in_firewall() - if not blocking._is_ip_already_blocked("2.2.0.0"): - ip = "2.2.0.0" - from_ = True - to = True - assert blocking._block_ip(ip, from_, to) is True - - -@linuxOS -@isroot -@has_net_admin_cap -def test_unblock_ip(): + blocking.firewall = "iptables" + blocking.sudo = "sudo" + + with patch.object( + blocking, "_is_ip_already_blocked", return_value=already_blocked + ), patch( + "modules.blocking.exec_iptables_cmd.exec_iptables_command", + return_value=True, + ) as _, patch.object( + blocking, "print" + ), patch.object( + blocking, "log" + ), patch.object( + blocking.db, "set_blocked_ip" + ): + + result = blocking._block_ip(ip, flags) + assert result is expected + + +@pytest.mark.parametrize( + "block,expected_block_called", + [ + (True, True), + (False, False), + ], +) +def test_main_blocking_logic(block, expected_block_called): + blocking = ModuleFactory().create_blocking_obj() + blocking_data = { + "ip": "1.2.3.4", + "tw": 5, + "block": block, + "from": True, + "to": False, + "dport": 80, + "sport": 12345, + "protocol": "tcp", + } + + msg_block = {"data": json.dumps(blocking_data)} + msg_tw_closed = None + + with patch.object( + blocking, "get_msg", side_effect=[msg_block, msg_tw_closed] + ): + with patch.object(blocking, "_block_ip") as mock_block, patch.object( + blocking.unblocker, "unblock_request" + ) as mock_unblock_req, patch.object( + blocking.unblocker, "update_requests" + ) as mock_update: + + blocking.main() + + if expected_block_called: + mock_block.assert_called_once_with( + "1.2.3.4", + { + "from_": True, + "to": False, + "dport": 80, + "sport": 12345, + "protocol": "tcp", + }, + ) + else: + mock_block.assert_not_called() + + mock_unblock_req.assert_called_once_with( + "1.2.3.4", + 5, + { + "from_": True, + "to": False, + "dport": 80, + "sport": 12345, + "protocol": "tcp", + }, + ) + mock_update.assert_not_called() + + +def test_main_tw_closed_triggers_update(): blocking = ModuleFactory().create_blocking_obj() - ip = "2.2.0.0" - from_ = True - to = True - # first make sure that it's blocked - if not blocking._is_ip_already_blocked("2.2.0.0"): - assert blocking._block_ip(ip, from_, to) is True - assert blocking.unblock_ip(ip, from_, to) is True + + msg_block = None + msg_tw_closed = {"data": "whatever"} + + with patch.object( + blocking, "get_msg", side_effect=[msg_block, msg_tw_closed] + ): + with patch.object( + blocking.unblocker, "update_requests" 
+ ) as mock_update: + blocking.main() + mock_update.assert_called_once() From 3f8302ab7bda9da757877ab7b257cb34bfda074b Mon Sep 17 00:00:00 2001 From: alya Date: Fri, 9 May 2025 15:59:50 +0300 Subject: [PATCH 269/498] add unblocker unit tests --- .github/workflows/unit-tests.yml | 1 + modules/blocking/unblocker.py | 6 ++--- tests/module_factory.py | 13 +++++++++++ tests/test_blocking.py | 38 +------------------------------- 4 files changed, 17 insertions(+), 41 deletions(-) diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index de44ad1d0..75f986920 100644 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -37,6 +37,7 @@ jobs: - test_whitelist.py - test_arp.py - test_blocking.py + - test_unblocker.py - test_flow_handler.py - test_horizontal_portscans.py - test_http_analyzer.py diff --git a/modules/blocking/unblocker.py b/modules/blocking/unblocker.py index d119e6fd9..3033d6b2e 100644 --- a/modules/blocking/unblocker.py +++ b/modules/blocking/unblocker.py @@ -11,8 +11,9 @@ class Unblocker(IUnblocker): """ - For every blocking method in slips, there should be an unblocker + For every blocking module in slips, there should be an unblocker implemented + this is the one for the firewall blocker. """ name = "iptables_unblocker" @@ -124,9 +125,6 @@ def update_requests(self): new_req["block_this_ip_for"] = req["block_this_ip_for"] - 1 new_requests[ip] = new_req self.requests = new_requests - from pprint import pp - - pp(self.requests) def _add_req( self, diff --git a/tests/module_factory.py b/tests/module_factory.py index 09e3ff0bc..f7142dfec 100644 --- a/tests/module_factory.py +++ b/tests/module_factory.py @@ -13,6 +13,7 @@ from managers.host_ip_manager import HostIPManager from managers.metadata_manager import MetadataManager from managers.profilers_manager import ProfilersManager +from modules.blocking.unblocker import Unblocker from modules.flowalerts.conn import Conn from modules.threat_intelligence.circl_lu import Circllu from modules.threat_intelligence.spamhaus import Spamhaus @@ -218,6 +219,18 @@ def create_blocking_obj(self, mock_db): blocking.unblocker = Mock() return blocking + @patch(MODULE_DB_MANAGER, name="mock_db") + def create_unblocker_obj(self, mock_db): + unblocker = Unblocker( + mock_db, + "", # sudo + Mock(return_value=False), + self.logger, + Mock(), # mocking log() + ) + unblocker.print = Mock() + return unblocker + @patch(MODULE_DB_MANAGER, name="mock_db") def create_flowalerts_obj(self, mock_db): flowalerts = FlowAlerts( diff --git a/tests/test_blocking.py b/tests/test_blocking.py index 92e119e38..23fce22f1 100644 --- a/tests/test_blocking.py +++ b/tests/test_blocking.py @@ -1,50 +1,14 @@ # SPDX-FileCopyrightText: 2021 Sebastian Garcia # SPDX-License-Identifier: GPL-2.0-only -"""Unit test for modules/blocking/blocking.py -this file needs sudoroot to run -""" +"""Unit test for modules/blocking/blocking.py""" from tests.module_factory import ModuleFactory import subprocess from unittest.mock import patch import pytest import json -import platform -import os from unittest.mock import call from unittest import mock -from tests.common_test_utils import IS_IN_A_DOCKER_CONTAINER - - -def has_netadmin_cap(): - """Check the capabilities given to this docker container""" - cmd = ( - 'capsh --print | grep "Current:" | cut -d' " -f3 | grep cap_net_admin" - ) - output = os.popen(cmd).read() - return "cap_net_admin" in output - - -IS_DEPENDENCY_IMAGE = os.environ.get("IS_DEPENDENCY_IMAGE", False) -# ignore all tests if not 
using linux
-linuxOS = pytest.mark.skipif(
-    platform.system() != "Linux",
-    reason="Blocking is supported only in Linux with root priveledges",
-)
-# When using docker in github actions, we can't use --cap-add NET_ADMIN
-# so all blocking module unit tests will fail because we don't have admin privs
-# we use this environment variable to check if slips is
-# running in github actions
-isroot = pytest.mark.skipif(
-    os.geteuid() != 0 or IS_DEPENDENCY_IMAGE is not False,
-    reason="Blocking is supported only with root priveledges",
-)
-
-# blocking requires net admin capabilities in docker, otherwise skips blocking tests
-has_net_admin_cap = pytest.mark.skipif(
-    IS_IN_A_DOCKER_CONTAINER and not has_netadmin_cap(),
-    reason="Blocking is supported only with --cap-add=NET_ADMIN",
-)


 def test_init_chains_in_firewall():

From fbcfc6db639ad182480e56c0d71f74a7919161b6 Mon Sep 17 00:00:00 2001
From: alya
Date: Fri, 9 May 2025 17:33:58 +0300
Subject: [PATCH 270/498] update unit tests

---
 tests/test_host_ip_manager.py | 54 ++++++++++++++++++----------------
 1 file changed, 24 insertions(+), 30 deletions(-)

diff --git a/tests/test_host_ip_manager.py b/tests/test_host_ip_manager.py
index cd2a34ffb..bea10ff8d 100644
--- a/tests/test_host_ip_manager.py
+++ b/tests/test_host_ip_manager.py
@@ -1,6 +1,5 @@
 # SPDX-FileCopyrightText: 2021 Sebastian Garcia
 # SPDX-License-Identifier: GPL-2.0-only
-import socket
 from unittest.mock import MagicMock, patch, Mock
 import pytest
 from tests.module_factory import ModuleFactory
@@ -36,38 +35,33 @@ def test_update_host_ip(
     assert host_ip_man.get_host_ip.call_count == expected_calls


-def test_get_host_ip_success():
-    host_ip_man = ModuleFactory().create_host_ip_manager_obj()
-    expected_ip = "192.168.1.100"
-
-    with patch("socket.socket") as mock_socket:
-        mock_instance = MagicMock()
-        mock_socket.return_value = mock_instance
-
-        mock_instance.getsockname.return_value = (expected_ip, 80)
-
-        result = host_ip_man.get_host_ip()
-
-        assert result == expected_ip
-        mock_instance.connect.assert_any_call(("1.1.1.1", 80))
-        mock_instance.getsockname.assert_called_once()
-
-
-def test_get_host_ip_failure():
+@pytest.mark.parametrize(
+    "interfaces, ifaddresses, expected",
+    [
+        (  # 2 here is AF_INET
+            ["lo", "eth0"],
+            {"lo": {}, "eth0": {2: [{"addr": "192.168.1.10"}]}},
+            "192.168.1.10",
+        ),
+        (
+            ["lo", "eth0"],
+            {
+                "lo": {2: [{"addr": "127.0.0.1"}]},
+                "eth0": {2: [{"addr": "127.0.0.2"}]},
+            },
+            None,
+        ),
+        (["lo"], {"lo": {2: [{"addr": "127.0.0.1"}]}}, None),
+    ],
+)
+def test_get_host_ip(interfaces, ifaddresses, expected):
     host_ip_man = ModuleFactory().create_host_ip_manager_obj()
-    with patch("socket.socket") as mock_socket:
-        mock_instance = MagicMock()
-        mock_socket.return_value = mock_instance
-
-        mock_instance.connect.side_effect = socket.error()
-
+    with patch("netifaces.interfaces", return_value=interfaces), patch(
+        "netifaces.ifaddresses", side_effect=lambda iface: ifaddresses[iface]
+    ), patch("netifaces.AF_INET", 2):
         result = host_ip_man.get_host_ip()
-
-        assert result is None
-        mock_instance.connect.assert_any_call(("1.1.1.1", 80))
-        mock_instance.connect.assert_any_call(("2606:4700:4700::1111", 80))
-        mock_instance.getsockname.assert_not_called()
+        assert result == expected


 @pytest.mark.parametrize(

From da2ecf501681ebe0275c5f506fb239d82b77f59f Mon Sep 17 00:00:00 2001
From: alya
Date: Fri, 9 May 2025 17:43:30 +0300
Subject: [PATCH 271/498] fix convert_ts_format() function name in all unit
 tests

---
 tests/test_evidence_formatter.py | 12 ++++++------
tests/test_metadata_manager.py | 2 +- tests/test_output.py | 6 +++--- tests/test_profiler.py | 10 +++++----- tests/test_rnn_cc_detection.py | 2 +- tests/test_slips_utils.py | 2 +- tests/test_timeline.py | 2 +- 7 files changed, 18 insertions(+), 18 deletions(-) diff --git a/tests/test_evidence_formatter.py b/tests/test_evidence_formatter.py index 2c4d1dfa4..c837dce4c 100644 --- a/tests/test_evidence_formatter.py +++ b/tests/test_evidence_formatter.py @@ -88,9 +88,9 @@ def test_format_evidence_for_printing( with patch.object( formatter, "get_printable_alert" ) as mock_get_alert_time, patch( - "slips_files.common.slips_utils.utils.convert_format" - ) as mock_convert_format: - mock_convert_format.return_value = "converted_time" + "slips_files.common.slips_utils.utils.convert_ts_format" + ) as mock_convert_ts_format: + mock_convert_ts_format.return_value = "converted_time" mock_get_alert_time.return_value = ( f"IP {profileid.ip} detected as malicious " @@ -219,9 +219,9 @@ def test_get_printable_alert( last_flow_datetime="", ) with patch( - "slips_files.common.slips_utils.utils.convert_format" - ) as mock_convert_format: - mock_convert_format.return_value = "converted_time" + "slips_files.common.slips_utils.utils.convert_ts_format" + ) as mock_convert_ts_format: + mock_convert_ts_format.return_value = "converted_time" result = formatter.get_printable_alert(alert) diff --git a/tests/test_metadata_manager.py b/tests/test_metadata_manager.py index 722c3baf1..2ae52ca0c 100644 --- a/tests/test_metadata_manager.py +++ b/tests/test_metadata_manager.py @@ -143,7 +143,7 @@ def test_add_metadata( with patch("os.mkdir"), patch("shutil.copy"), patch( "builtins.open", create=True ), patch.object( - utils, "convert_format", return_value="2023-01-01 00:00:00" + utils, "convert_ts_format", return_value="2023-01-01 00:00:00" ): result = metadata_manager._add_metadata() assert result == expected_result diff --git a/tests/test_output.py b/tests/test_output.py index 185a45e20..836d5e686 100644 --- a/tests/test_output.py +++ b/tests/test_output.py @@ -26,11 +26,11 @@ ), ], ) -@patch("slips_files.common.slips_utils.Utils.convert_format") -def test_log_line(mock_convert_format, msg, expected_log_content): +@patch("slips_files.common.slips_utils.Utils.convert_ts_format") +def test_log_line(mock_convert_ts_format, msg, expected_log_content): """Test that the log_line method logs the correct message to the slips.log file.""" - mock_convert_format.return_value = "formatted_datetime" + mock_convert_ts_format.return_value = "formatted_datetime" output = ModuleFactory().create_output_obj() output.slips_logfile = "path/to/slips.log" diff --git a/tests/test_profiler.py b/tests/test_profiler.py index b967c7880..50b537732 100644 --- a/tests/test_profiler.py +++ b/tests/test_profiler.py @@ -351,13 +351,13 @@ def test_convert_starttime_to_epoch(): starttime = "2023-04-04 12:00:00" with patch( - "slips_files.core.profiler.utils.convert_format" - ) as mock_convert_format: - mock_convert_format.return_value = 1680604800 + "slips_files.core.profiler.utils.convert_ts_format" + ) as mock_convert_ts_format: + mock_convert_ts_format.return_value = 1680604800 converted = profiler.convert_starttime_to_epoch(starttime) - mock_convert_format.assert_called_once_with( + mock_convert_ts_format.assert_called_once_with( "2023-04-04 12:00:00", "unixtimestamp" ) assert converted == 1680604800 @@ -367,7 +367,7 @@ def test_convert_starttime_to_epoch_invalid_format(monkeypatch): profiler = ModuleFactory().create_profiler_obj() starttime = "not a real 
time" monkeypatch.setattr( - "slips_files.core.profiler.utils.convert_format", + "slips_files.core.profiler.utils.convert_ts_format", Mock(side_effect=ValueError), ) converted = profiler.convert_starttime_to_epoch(starttime) diff --git a/tests/test_rnn_cc_detection.py b/tests/test_rnn_cc_detection.py index e603d2e29..45bae988a 100644 --- a/tests/test_rnn_cc_detection.py +++ b/tests/test_rnn_cc_detection.py @@ -90,7 +90,7 @@ def test_set_evidence_cc_channel( cc_detection.db.get_ip_identification.return_value = "Some IP info" with patch( - "slips_files.common.slips_utils.utils.convert_format", + "slips_files.common.slips_utils.utils.convert_ts_format", return_value=timestamp, ): cc_detection.set_evidence_cc_channel( diff --git a/tests/test_slips_utils.py b/tests/test_slips_utils.py index 3c2f86d3c..9c467da83 100644 --- a/tests/test_slips_utils.py +++ b/tests/test_slips_utils.py @@ -182,7 +182,7 @@ def test_calculate_confidence(input_value, expected_output): ), ], ) -def test_convert_format(input_value, input_format, expected_output): +def test_convert_ts_format(input_value, input_format, expected_output): utils = ModuleFactory().create_utils_obj() utils.local_tz = datetime.timezone.utc assert ( diff --git a/tests/test_timeline.py b/tests/test_timeline.py index 98e94beea..cec7472dc 100644 --- a/tests/test_timeline.py +++ b/tests/test_timeline.py @@ -522,7 +522,7 @@ def test_convert_timestamp_to_slips_format(timestamp, is_human, expected): timeline = ModuleFactory().create_timeline_object() timeline.is_human_timestamp = is_human with patch( - "slips_files.common.slips_utils.utils.convert_format", + "slips_files.common.slips_utils.utils.convert_ts_format", return_value=expected, ): result = timeline.convert_timestamp_to_slips_format(timestamp) From e2d3b962b046483425ea5bebd535425da3005723 Mon Sep 17 00:00:00 2001 From: alya Date: Fri, 9 May 2025 18:17:27 +0300 Subject: [PATCH 272/498] update evidence handler unit tests --- tests/test_evidence_handler.py | 112 +++++++++++++++++++-------------- 1 file changed, 65 insertions(+), 47 deletions(-) diff --git a/tests/test_evidence_handler.py b/tests/test_evidence_handler.py index 8b9a54e91..233321e44 100644 --- a/tests/test_evidence_handler.py +++ b/tests/test_evidence_handler.py @@ -2,7 +2,7 @@ # SPDX-License-Identifier: GPL-2.0-only import pytest import os -from unittest.mock import Mock, patch, call +from unittest.mock import Mock, MagicMock, patch, call from slips_files.core.structures.alerts import Alert from slips_files.core.structures.evidence import ( @@ -35,10 +35,13 @@ def test_decide_blocking( profileid, our_ips, expected_result, expected_publish_call_count ): evidence_handler = ModuleFactory().create_evidence_handler_obj() - evidence_handler.is_blocking_module_supported = Mock(return_value=True) + evidence_handler.blocking_module_supported = True evidence_handler.our_ips = our_ips with patch.object(evidence_handler.db, "publish") as mock_publish: - result = evidence_handler.decide_blocking(profileid) + tw = TimeWindow( + 2, "2025-05-09T13:27:45.123456", "2025-05-09T13:27:45.123456" + ) + result = evidence_handler.decide_blocking(profileid, tw) assert result == expected_result assert mock_publish.call_count == expected_publish_call_count @@ -80,56 +83,71 @@ def test_get_evidence_that_were_part_of_a_past_alert( assert result == expected_output -@pytest.mark.parametrize( - "profile_ip, timewindow, tw_evidence, block", - [ - # testcase1: Basic alert - ("192.168.1.1", 1, {"evidence1": Mock(spec=Evidence)}, True), - # testcase2: Multiple 
evidence - ( - "10.0.0.1", - 2, - { - "evidence1": Mock(spec=Evidence), - "evidence2": Mock(spec=Evidence), - }, - False, - ), - ], -) -def test_handle_new_alert(profile_ip, timewindow, tw_evidence, block): - evidence_handler = ModuleFactory().create_evidence_handler_obj() +def setup_handler(popup_enabled, blocked, mark_blocked=None): + handler = ModuleFactory().create_evidence_handler_obj() + handler.popup_alerts = popup_enabled + alert = Alert( - profile=ProfileID(profile_ip), - timewindow=TimeWindow(timewindow), + profile=ProfileID("1.2.3.4"), + timewindow=TimeWindow(1), last_evidence=Mock(), accumulated_threat_level=12.2, last_flow_datetime="2024/10/04 15:45:30.123456+0000", ) - evidence_handler.db.set_alert = Mock() - evidence_handler.db.mark_profile_and_timewindow_as_blocked = Mock() - evidence_handler.send_to_exporting_module = Mock() - evidence_handler.formatter.format_evidence_for_printing = Mock( - return_value="evidence to print" + evidence = {"k": MagicMock(spec=Evidence)} + + handler.db.set_alert = MagicMock() + handler.decide_blocking = MagicMock(side_effect=[None, mark_blocked]) + handler.db.is_blocked_profile_and_tw = MagicMock(return_value=blocked) + handler.send_to_exporting_module = MagicMock() + handler.formatter.format_evidence_for_printing = MagicMock( + return_value="formatted_alert" ) - evidence_handler.log_alert = Mock() - evidence_handler.decide_blocking = Mock(return_value=block) - evidence_handler.show_popup = Mock() - evidence_handler.print = Mock() - evidence_handler.db._set_accumulated_threat_level = Mock() - - evidence_handler.handle_new_alert(alert, tw_evidence) - if evidence_handler.popup_alerts: - evidence_handler.show_popup.assert_called_once() - if block: - ( - evidence_handler.db.mark_profile_and_timewindow_as_blocked.assert_called_once() - ) - evidence_handler.decide_blocking.assert_called_once() - evidence_handler.send_to_exporting_module.assert_called_once() - evidence_handler.print.assert_called_once_with("evidence to print", 1, 0) - evidence_handler.db.set_alert.assert_called_once() - evidence_handler.log_alert.assert_called_once() + handler.print = MagicMock() + handler.show_popup = MagicMock() + handler.db.mark_profile_and_timewindow_as_blocked = MagicMock() + handler.log_alert = MagicMock() + + return handler, alert, evidence + + +@pytest.mark.parametrize("popup_enabled", [True, False]) +def test_handle_new_alert_already_blocked(popup_enabled): + handler, alert, evidence = setup_handler(popup_enabled, blocked=True) + handler.handle_new_alert(alert, evidence) + + handler.db.set_alert.assert_called_once_with(alert, evidence) + handler.db.is_blocked_profile_and_tw.assert_called_once() + handler.send_to_exporting_module.assert_not_called() + handler.print.assert_not_called() + handler.show_popup.assert_not_called() + handler.db.mark_profile_and_timewindow_as_blocked.assert_not_called() + handler.log_alert.assert_not_called() + + +@pytest.mark.parametrize( + "popup_enabled, expect_popup", + [ + (True, True), + (False, False), + ], +) +def test_handle_new_alert_not_blocked(popup_enabled, expect_popup): + handler, alert, evidence = setup_handler( + popup_enabled, blocked=False, mark_blocked=True + ) + handler.handle_new_alert(alert, evidence) + + handler.send_to_exporting_module.assert_called_once_with(evidence) + handler.print.assert_called_once_with("formatted_alert", 1, 0) + + if expect_popup: + handler.show_popup.assert_called_once_with(alert) + else: + handler.show_popup.assert_not_called() + + 
handler.db.mark_profile_and_timewindow_as_blocked.assert_called_once()
+    handler.log_alert.assert_called_once_with(alert, blocked=True)


 @pytest.mark.parametrize(

From 07f6f9d3e865999d2f24c47751a5260cf1a4c6b Mon Sep 17 00:00:00 2001
From: alya
Date: Sat, 10 May 2025 00:05:14 +0300
Subject: [PATCH 273/498] profile_handler: handle the case where the starttime
 of the first flow is 0

---
 .../core/database/redis_db/profile_handler.py |  3 ++-
 tests/test_profile_handler.py                 | 14 +++++++-------
 2 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/slips_files/core/database/redis_db/profile_handler.py b/slips_files/core/database/redis_db/profile_handler.py
index 553cb0363..5bccb894d 100644
--- a/slips_files/core/database/redis_db/profile_handler.py
+++ b/slips_files/core/database/redis_db/profile_handler.py
@@ -124,7 +124,8 @@ def get_timewindow(self, flowtime, profileid):
             tw_number: int = 1
         else:
             starttime_of_first_tw: float = self.get_first_flow_time()
-            if starttime_of_first_tw:
+            if starttime_of_first_tw is not None:  # because 0 is a valid
+                # value
                 tw_number: int = (
                     floor((flowtime - starttime_of_first_tw) / self.width) + 1
                 )
diff --git a/tests/test_profile_handler.py b/tests/test_profile_handler.py
index 828af7ef9..9cc180cb6 100644
--- a/tests/test_profile_handler.py
+++ b/tests/test_profile_handler.py
@@ -2216,14 +2216,14 @@ def test_get_tw_of_ts():


 @pytest.mark.parametrize(
-    "flowtime, width, hget_return_value, expected_twid, "
-    "expected_tw_start, expected_add_new_tw_call",
+    "flowtime, width, first_flow_time, "
+    "expected_twid, expected_tw_start, expected_add_new_tw_call",
     [
         # Testcase 1: Normal case, existing start time
         (
             26,
             5,
-            "0",
+            0,
             "timewindow6",
             25,
             call("profile_1", "timewindow6", 25),
@@ -2232,7 +2232,7 @@
         (
             1600000100.0,
             100.0,
-            "1600000000.0",
+            1600000000.0,
             "timewindow2",
             1600000100.0,
             call("profile_1", "timewindow2", 1600000100.0),
@@ -2251,7 +2251,7 @@
 def test_get_timewindow(
     flowtime,
     width,
-    hget_return_value,
+    first_flow_time,
     expected_twid,
     expected_tw_start,
     expected_add_new_tw_call,
@@ -2260,11 +2260,11 @@
     profileid = "profile_1"
     handler.add_new_tw = MagicMock()
     handler.width = width
-    handler.r.hget.return_value = hget_return_value
+    handler.get_first_flow_time = Mock(return_value=first_flow_time)

     twid = handler.get_timewindow(flowtime, profileid)

-    handler.r.hget.assert_called_once_with("analysis", "file_start")
+    handler.get_first_flow_time.assert_called_once()
     handler.add_new_tw.assert_called_once_with(*expected_add_new_tw_call.args)
     assert twid == expected_twid

From 9fe676ce48c3bdcf494b0f5673be73f12c8bcfba Mon Sep 17 00:00:00 2001
From: alya
Date: Sat, 10 May 2025 00:20:04 +0300
Subject: [PATCH 274/498] update the db unit tests

---
 .../core/database/redis_db/alert_handler.py   |  6 ---
 .../core/database/redis_db/profile_handler.py |  6 +++
 tests/test_profile_handler.py                 | 50 +++++++++----------
 3 files changed, 31 insertions(+), 31 deletions(-)

diff --git a/slips_files/core/database/redis_db/alert_handler.py b/slips_files/core/database/redis_db/alert_handler.py
index bba2a2514..234ccb55b 100644
--- a/slips_files/core/database/redis_db/alert_handler.py
+++ b/slips_files/core/database/redis_db/alert_handler.py
@@ -152,12 +152,6 @@ def del_blocked_ip(self, ip: str):
         # remove ip from the blocked_ips sorted set
         self.r.zrem("blocked_ips", ip)

-    def get_tw_start_time(self, profileid, twid):
-        """Return the time when this TW in this profile was created"""
-        # We need to encode it to 'search' because the data in the
-        # sorted set is encoded
-        return self.r.zscore(f"tws{profileid}", twid.encode("utf-8"))
-
     def get_tw_limits(self, profileid, twid: str) -> Tuple[float, float]:
         """
         returns the timewindow start and endtime
diff --git a/slips_files/core/database/redis_db/profile_handler.py b/slips_files/core/database/redis_db/profile_handler.py
index 5bccb894d..edbbf3a12 100644
--- a/slips_files/core/database/redis_db/profile_handler.py
+++ b/slips_files/core/database/redis_db/profile_handler.py
@@ -81,6 +81,12 @@ def set_dhcp_flow(self, profileid, twid, requested_addr, uid):
             json.dumps(flow),
         )

+    def get_tw_start_time(self, profileid, twid):
+        """Return the time when this TW in this profile was created"""
+        # We need to encode it to 'search' because the data in the
+        # sorted set is encoded
+        return self.r.zscore(f"tws{profileid}", twid.encode("utf-8"))
+
     def get_first_flow_time(self) -> Optional[float]:
         """
         Get the starttime of the first timewindow
diff --git a/tests/test_profile_handler.py b/tests/test_profile_handler.py
index 9cc180cb6..e2a7625eb 100644
--- a/tests/test_profile_handler.py
+++ b/tests/test_profile_handler.py
@@ -811,30 +811,6 @@ def test_add_new_tw(
     )


-@pytest.mark.parametrize(
-    "zscore_return_value, expected_start_time",
-    [  # Testcase 1: TW exists and has a start time
-        (1100.0, 1100.0),
-        # Testcase 2: TW does not exist
-        (None, None),
-    ],
-)
-def test_get_tw_start_time(zscore_return_value, expected_start_time):
-    handler = ModuleFactory().create_profile_handler_obj()
-
-    profileid = "profile_1"
-    twid = "timewindow2"
-
-    handler.r.zscore.return_value = zscore_return_value
-
-    start_time = handler.get_tw_start_time(profileid, twid)
-
-    handler.r.zscore.assert_called_once_with(
-        f"tws{profileid}", twid.encode("utf-8")
-    )
-    assert start_time == expected_start_time
-
-
 @pytest.mark.parametrize(
     "profileid, zcard_return_value, expected_num_tws",
     [  # Testcase 1: Profile with 3 timewindows
@@ -2506,10 +2482,34 @@ def test_mark_profile_as_dhcp_profile_already_dhcp():
     assert result is None


+@pytest.mark.parametrize(
+    "zscore_return_value, expected_start_time",
+    [  # Testcase 1: TW exists and has a start time
+        (1100.0, 1100.0),
+        # Testcase 2: TW does not exist
+        (None, None),
+    ],
+)
+def test_get_tw_start_time(zscore_return_value, expected_start_time):
+    handler = ModuleFactory().create_profile_handler_obj()
+
+    profileid = "profile_1"
+    twid = "timewindow2"
+
+    handler.r.zscore.return_value = zscore_return_value
+
+    start_time = handler.get_tw_start_time(profileid, twid)
+
+    handler.r.zscore.assert_called_once_with(
+        f"tws{profileid}", twid.encode("utf-8")
+    )
+    assert start_time == expected_start_time
+
+
 @pytest.mark.parametrize(
     "hget_return_value, expected_first_flow_time",
     [  # Testcase 1: First flow time exists
-        ("1600000000.0", "1600000000.0"),
+        ("1600000000.0", 1600000000.0),
         # Testcase 2: First flow time does not exist
         (None, None),
     ],

From ff686316fa4cf68ebeab5d3b42cf7303737cc547 Mon Sep 17 00:00:00 2001
From: alya
Date: Sat, 10 May 2025 00:31:13 +0300
Subject: [PATCH 275/498] test_profiler: remove the unit test checking for
 dropping root privs

---
 tests/test_profiler.py | 14 --------------
 1 file changed, 14 deletions(-)

diff --git a/tests/test_profiler.py b/tests/test_profiler.py
index 50b537732..36733d2b8 100644
--- a/tests/test_profiler.py
+++ b/tests/test_profiler.py
@@ -405,20 +405,6 @@ def test_check_for_stop_msg(monkeypatch):
     assert profiler.is_stop_msg("not_stop") is False


-def test_pre_main(monkeypatch):
-    profiler = ModuleFactory().create_profiler_obj()
-
-    with monkeypatch.context() as m:
-        mock_drop_root_privs = Mock()
-        m.setattr(
-            "slips_files.core.profiler.utils.drop_root_privs",
-            mock_drop_root_privs,
-        )
-        profiler.pre_main()
-
-        mock_drop_root_privs.assert_called_once()
-
-
 def test_main_stop_msg_received():
     profiler = ModuleFactory().create_profiler_obj()
     profiler.should_stop = Mock(side_effect=[False, True])

From fb416e58531e5bc68f94afbf5035336a7e6b17f7 Mon Sep 17 00:00:00 2001
From: alya
Date: Sat, 10 May 2025 00:40:54 +0300
Subject: [PATCH 276/498] add test_unblocker.py

---
 tests/test_unblocker.py | 113 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 113 insertions(+)
 create mode 100644 tests/test_unblocker.py

diff --git a/tests/test_unblocker.py b/tests/test_unblocker.py
new file mode 100644
index 000000000..717b82e84
--- /dev/null
+++ b/tests/test_unblocker.py
@@ -0,0 +1,113 @@
+# SPDX-FileCopyrightText: 2021 Sebastian Garcia
+# SPDX-License-Identifier: GPL-2.0-only
+"""Unit test for modules/blocking/unblocker.py"""
+from tests.module_factory import ModuleFactory
+from unittest.mock import patch
+import pytest
+
+from unittest.mock import MagicMock
+
+
+@pytest.mark.parametrize(
+    "ip, existing_requests, current_tw, expected_block_duration",
+    [
+        # ip being blocked for the first time
+        ("1.2.3.4", {}, 100, 1),
+        # test ip in self.requests
+        ("5.6.7.8", {"5.6.7.8": {"block_this_ip_for": 2}}, 200, 3),
+    ],
+)
+def test_unblock_request(
+    ip, existing_requests, current_tw, expected_block_duration
+):
+    unblocker = ModuleFactory().create_unblocker_obj()
+    unblocker.requests = existing_requests
+
+    with patch.object(
+        unblocker, "_get_tw_to_unblock_at"
+    ) as mock_get_tw, patch.object(unblocker, "_add_req") as mock_add_req:
+        mock_get_tw.return_value = "fake-tw"
+
+        flags = {"reason": "test"}
+        unblocker.unblock_request(ip, current_tw, flags=flags)
+
+        mock_get_tw.assert_called_once_with(
+            ip, current_tw, expected_block_duration
+        )
+        mock_add_req.assert_called_once_with(
+            ip, "fake-tw", flags, expected_block_duration
+        )
+
+
+def test__check_if_time_to_unblock():
+    unblocker = ModuleFactory().create_unblocker_obj()
+
+    unblocker.requests = {
+        "1.2.3.4": {
+            "tw_to_unblock": MagicMock(end_time="2025-01-01T00:00:00"),
+            "flags": {"src": "test"},
+        }
+    }
+    # loop only once
+    unblocker.should_stop = MagicMock(side_effect=[False, True])
+
+    with patch("time.sleep"), patch(
+        "time.time", return_value=1735689600.0
+    ), patch(
+        "modules.blocking.unblocker.utils.convert_ts_format",
+        return_value=1735689600.0,
+    ), patch.object(
+        unblocker, "_unblock", return_value=True
+    ) as mock_unblock, patch.object(
+        unblocker, "_log_successful_unblock"
+    ) as mock_log, patch.object(
+        unblocker.db, "del_blocked_ip"
+    ) as mock_del, patch.object(
+        unblocker, "_del_request"
+    ) as mock_del_req:
+
+        unblocker._check_if_time_to_unblock()
+
+        mock_unblock.assert_called_once_with("1.2.3.4", {"src": "test"})
+        mock_log.assert_called_once_with("1.2.3.4")
+        mock_del.assert_called_once_with("1.2.3.4")
+        mock_del_req.assert_called_once_with("1.2.3.4")
+
+
+@pytest.mark.parametrize(
+    "flags, expected_calls, unblock_success",
+    [
+        ({"from_": True}, 1, True),
+        ({"to": True}, 1, True),
+        ({"from_": True, "to": True}, 2, True),
+        ({}, 2, True),  # defaults to both True
+        ({"from_": True}, 1, False),
+    ],
+)
+def test__unblock(flags, expected_calls, unblock_success):
+    unblocker = ModuleFactory().create_unblocker_obj()
+    unblocker.db.get_timewindow.return_value = "tw-1337"
+
+    ip = "1.2.3.4"
+    path = "modules.blocking.unblocker.exec_iptables_command"
+
+    with patch(path, return_value=unblock_success) as mock_exec:
+        result = unblocker._unblock(ip, flags)
+
+        assert result == unblock_success
+        assert mock_exec.call_count == expected_calls
+
+        if unblock_success:
+            unblocker.print.assert_called_once_with(
+                f"IP {ip} is unblocked in tw-1337."
+            )
+            unblocker.log.assert_called_once_with(
+                f"IP {ip} is unblocked in tw-1337."
+            )
+        else:
+            unblocker.print.assert_called_once_with(
+                f"An errror occured. Unable to unblock {ip}"
+            )
+            unblocker.log.assert_called_once_with(
+                f"An errror occured. Unable to unblock {ip}"
+            )

From 1e6d0d18f18e03a35ba414072ad58c4d033b4383 Mon Sep 17 00:00:00 2001
From: alya
Date: Sat, 10 May 2025 16:21:08 +0300
Subject: [PATCH 277/498] enable/disable training and testing.log with a param
 in the config file

---
 .secrets.baseline                           |   6 +-
 config/slips.yaml                           |   3 +
 modules/flowmldetection/flowmldetection.py  | 140 +++++++++++++------
 modules/riskiq/riskiq.py                    |   2 +-
 modules/update_manager/update_manager.py    |   2 +-
 slips_files/common/parsers/config_parser.py |   7 +-
 6 files changed, 109 insertions(+), 51 deletions(-)

diff --git a/.secrets.baseline b/.secrets.baseline
index 37fe2abcb..aa5615109 100644
--- a/.secrets.baseline
+++ b/.secrets.baseline
@@ -149,14 +149,14 @@
         "filename": "config/slips.yaml",
         "hashed_secret": "4cac50cee3ad8e462728e711eac3e670753d5016",
         "is_verified": false,
-        "line_number": 224
+        "line_number": 226
       },
       {
         "type": "Secret Keyword",
         "filename": "config/slips.yaml",
         "hashed_secret": "d033e22ae348aeb5660fc2140aec35850c4da997",
         "is_verified": false,
-        "line_number": 394
+        "line_number": 396
       }
     ],
     "dataset/test14-malicious-zeek-dir/http.log": [
@@ -7192,5 +7192,5 @@
     }
   ]
 },
-  "generated_at": "2025-02-13T22:47:52Z"
+  "generated_at": "2025-05-10T13:18:46Z"
 }

diff --git a/config/slips.yaml b/config/slips.yaml
index 8736eaf51..dabb388c0 100644
--- a/config/slips.yaml
+++ b/config/slips.yaml
@@ -214,6 +214,9 @@ flowmldetection:
   # You should have trained at least once with 'Normal' data and once with
   # 'Malicious' data in order for the test to work.
   mode: test
+  # creates an extra log file called training.log/testing.log in the
+  # output dir with performance metrics depending on the mode.
+ create_performance_metrics_log_files: False ############################# virustotal: diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 9139066f0..2a515d0cf 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -10,9 +10,8 @@ import json import traceback import warnings -from sklearn.metrics import classification_report, confusion_matrix +from sklearn.metrics import confusion_matrix from sklearn.metrics import ( - confusion_matrix, f1_score, precision_score, accuracy_score, @@ -37,6 +36,7 @@ Method, ) + # This horrible hack is only to stop sklearn from printing those warnings def warn(*args, **kwargs): pass @@ -73,7 +73,7 @@ def init(self): self.model_path = "./modules/flowmldetection/model.bin" self.scaler_path = "./modules/flowmldetection/scaler.bin" self.init_log_file() - + def init_log_file(self): """ Init the log file for training or testing @@ -92,11 +92,16 @@ def read_configuration(self): # This is the global label in the configuration, # in case the flows do not have a label themselves self.label = conf.label() + self.enable_logs: bool = conf.create_performance_metrics_log_files() def write_to_log(self, message: str): """ - Write a message to the local log file. + Write a message to the local log file if + create_performance_metrics_log_files is enabled in slips.yaml """ + if not self.enable_logs: + return + try: self.log_file.write(message + "\n") except Exception as e: @@ -108,7 +113,9 @@ def train(self, sum_labeled_flows, last_number_of_flows_when_trained): """ try: # Create y_flow with the label - y_flow = numpy.full(self.flows.shape[0], self.flows.ground_truth_label) + y_flow = numpy.full( + self.flows.shape[0], self.flows.ground_truth_label + ) # Create X_flow with the current flows minus the label X_flow = self.flows.drop("ground_truth_label", axis=1) # Drop the detailed labels @@ -130,7 +137,9 @@ def train(self, sum_labeled_flows, last_number_of_flows_when_trained): try: # Online incremental learning self.clf.partial_fit( - X_flow, y_flow, classes=["Background", "Malicious", "Benign"] + X_flow, + y_flow, + classes=["Background", "Malicious", "Benign"], ) except Exception: self.print("Error while calling clf.train()") @@ -149,7 +158,11 @@ def train(self, sum_labeled_flows, last_number_of_flows_when_trained): y_pred_bin = numpy.where(y_pred_bin == "Malicious", 1, 0) # Compute confusion matrix: tn, fp, fn, tp - tn, fp, fn, tp = confusion_matrix(y_true_bin, y_pred_bin, labels=[0,1]).ravel() if len(set(y_true_bin)) > 1 else (0,0,0,0) + tn, fp, fn, tp = ( + confusion_matrix(y_true_bin, y_pred_bin, labels=[0, 1]).ravel() + if len(set(y_true_bin)) > 1 + else (0, 0, 0, 0) + ) # Compute metrics FPR = fp / (fp + tn) if (fp + tn) > 0 else 0 @@ -159,7 +172,11 @@ def train(self, sum_labeled_flows, last_number_of_flows_when_trained): F1 = f1_score(y_true_bin, y_pred_bin, zero_division=0) PREC = precision_score(y_true_bin, y_pred_bin, zero_division=0) ACCU = accuracy_score(y_true_bin, y_pred_bin) - MCC = matthews_corrcoef(y_true_bin, y_pred_bin) if len(set(y_true_bin)) > 1 else 0 + MCC = ( + matthews_corrcoef(y_true_bin, y_pred_bin) + if len(set(y_true_bin)) > 1 + else 0 + ) RECALL = recall_score(y_true_bin, y_pred_bin, zero_division=0) # Store the models on disk @@ -189,7 +206,8 @@ def process_features(self, dataset): for proto in to_discard: dataset = dataset[dataset.proto != proto] - # If te proto is in the list to delete and there is only one flow, then the dataset will be 
empty + # If te proto is in the list to delete and there is only one flow, + # then the dataset will be empty if dataset.empty: # DataFrame is empty now, so return empty return dataset @@ -295,7 +313,9 @@ def process_training_flows(self, last_number_of_flows_when_trained): if last_number_of_flows_when_trained is None: last_number_of_flows_when_trained = 0 else: - last_number_of_flows_when_trained = int(last_number_of_flows_when_trained) + last_number_of_flows_when_trained = int( + last_number_of_flows_when_trained + ) # We get all the flows so far flows = self.db.get_all_flows() @@ -399,21 +419,21 @@ def detect(self, x_flow) -> Optional[numpy.ndarray]: ] # For argus binetflows this fails because ther is a field calle bytes that was not in other flows. It should be called allbytes. # Error - ''' [Flow ML Detection] Error in detect() while processing - dur proto sport dport state pkts spkts dpkts bytes sbytes dbytes allbytes - 0 63.822830 0 56119 981 0.0 15 15 0 8764 1887 0 1887 - The feature names should match those that were passed during fit. - Feature names unseen at fit time: - - bytes - ''' + """ [Flow ML Detection] Error in detect() while processing + dur proto sport dport state pkts spkts dpkts bytes sbytes dbytes allbytes + 0 63.822830 0 56119 981 0.0 15 15 0 8764 1887 0 1887 + The feature names should match those that were passed during fit. + Feature names unseen at fit time: + - bytes + """ # IF we delete here the filed bytes the error is - # [Flow ML Detection] Error in detect() while processing - # dur proto sport dport state pkts spkts dpkts sbytes dbytes allbytes - # 0 63.822830 0 56120 980 0.0 15 15 0 1887 0 1887 - # The feature names should match those that were passed during fit. - # Feature names must be in the same order as they were in fit. - + # [Flow ML Detection] Error in detect() while processing + # dur proto sport dport state pkts spkts dpkts sbytes dbytes allbytes + # 0 63.822830 0 56120 980 0.0 15 15 0 1887 0 1887 + # The feature names should match those that were passed during fit. + # Feature names must be in the same order as they were in fit. + for field in fields_to_drop: try: x_flow = x_flow.drop(field, axis=1) @@ -540,17 +560,19 @@ def main(self): labels = self.db.get_labels() sum_labeled_flows = sum(i[1] for i in labels) - # The min labels to retrain is the min number of flows + # The min labels to retrain is the min number of flows # we should have seen so far in this capture to start training # This is so we dont _start_ training with only 1 flow - # Once we are over the start minimum, the second condition is + # Once we are over the start minimum, the second condition is # to force to retrain every a minimum_labels_to_retrain number # of flows. So we dont retrain every 1 flow. 
- if ( - sum_labeled_flows >= self.minimum_labels_to_start_train - ): - if (sum_labeled_flows - self.last_number_of_flows_when_trained >= self.minimum_labels_to_retrain): + if sum_labeled_flows >= self.minimum_labels_to_start_train: + if ( + sum_labeled_flows + - self.last_number_of_flows_when_trained + >= self.minimum_labels_to_retrain + ): # So for example we retrain every 50 labels and only when # we have at least 50 labels self.print( @@ -559,10 +581,17 @@ def main(self): ) # Process all flows in the DB and make them ready # for pandas - self.process_training_flows(self.last_number_of_flows_when_trained) + self.process_training_flows( + self.last_number_of_flows_when_trained + ) # Train an algorithm - self.train(sum_labeled_flows, self.last_number_of_flows_when_trained) - self.last_number_of_flows_when_trained = sum_labeled_flows + self.train( + sum_labeled_flows, + self.last_number_of_flows_when_trained, + ) + self.last_number_of_flows_when_trained = ( + sum_labeled_flows + ) elif self.mode == "test": # We are testing, which means using the model to detect @@ -570,7 +599,9 @@ def main(self): # After processing the flow, it may happen that we # delete icmp/arp/etc so the dataframe can be empty if processed_flow is not None and not processed_flow.empty: - original_label = processed_flow["ground_truth_label"].iloc[0] + original_label = processed_flow["ground_truth_label"].iloc[ + 0 + ] # Predict pred: numpy.ndarray = self.detect(processed_flow) if not pred: @@ -591,30 +622,49 @@ def main(self): 2, ) - # So you can disable this code easily. Since it is used only for evaluating a testing + # So you can disable this code easily. Since it is used + # only for evaluating a testing log_testing_data = True if log_testing_data: # Initialize counters if not already done - if not hasattr(self, 'tp'): + if not hasattr(self, "tp"): self.tp = 0 - if not hasattr(self, 'tn'): + if not hasattr(self, "tn"): self.tn = 0 - if not hasattr(self, 'fp'): + if not hasattr(self, "fp"): self.fp = 0 - if not hasattr(self, 'fn'): + if not hasattr(self, "fn"): self.fn = 0 # Update counters based on predictions and labels - if pred[0] == "Malicious" and original_label == "Malicious": + if ( + pred[0] == "Malicious" + and original_label == "Malicious" + ): self.tp += 1 - elif pred[0] == "Benign" and original_label == "Benign": + elif ( + pred[0] == "Benign" and original_label == "Benign" + ): self.tn += 1 - elif pred[0] == "Malicious" and original_label == "Benign": + elif ( + pred[0] == "Malicious" + and original_label == "Benign" + ): self.fp += 1 - self.write_to_log(f"False Positive Flow: {self.flow}") - elif pred[0] == "Benign" and original_label == "Malicious": + self.write_to_log( + f"False Positive Flow: {self.flow}" + ) + elif ( + pred[0] == "Benign" + and original_label == "Malicious" + ): self.fn += 1 - self.write_to_log(f"False Negative Flow: {self.flow}") + self.write_to_log( + f"False Negative Flow: {self.flow}" + ) # Log the testing performance metrics - self.write_to_log(f"TP: {self.tp}, TN: {self.tn}, FP: {self.fp}, FN: {self.fn}") \ No newline at end of file + self.write_to_log( + f"TP: {self.tp}, TN: {self.tn}," + f" FP: {self.fp}, FN: {self.fn}" + ) diff --git a/modules/riskiq/riskiq.py b/modules/riskiq/riskiq.py index 5abf2ddb1..7b5653997 100644 --- a/modules/riskiq/riskiq.py +++ b/modules/riskiq/riskiq.py @@ -25,7 +25,7 @@ def init(self): def read_configuration(self): conf = ConfigParser() - risk_iq_credentials_path = conf.RiskIQ_credentials_path() + risk_iq_credentials_path = 
conf.risk_iq_credentials_path() try: with open(risk_iq_credentials_path, "r") as f: self.riskiq_email = f.readline().replace("\n", "") diff --git a/modules/update_manager/update_manager.py b/modules/update_manager/update_manager.py index c6bf0013e..2de0abf8e 100644 --- a/modules/update_manager/update_manager.py +++ b/modules/update_manager/update_manager.py @@ -119,7 +119,7 @@ def read_riskiq_creds(risk_iq_credentials_path): self.ssl_feeds_path = conf.ssl_feeds() self.ssl_feeds = self.get_feed_details(self.ssl_feeds_path) - risk_iq_credentials_path = conf.RiskIQ_credentials_path() + risk_iq_credentials_path = conf.risk_iq_credentials_path() read_riskiq_creds(risk_iq_credentials_path) self.riskiq_update_period = conf.riskiq_update_period() diff --git a/slips_files/common/parsers/config_parser.py b/slips_files/common/parsers/config_parser.py index 40f1b044b..e208f7881 100644 --- a/slips_files/common/parsers/config_parser.py +++ b/slips_files/common/parsers/config_parser.py @@ -418,7 +418,12 @@ def data_exfiltration_threshold(self): def get_ml_mode(self): return self.read_configuration("flowmldetection", "mode", "test") - def RiskIQ_credentials_path(self): + def create_performance_metrics_log_files(self) -> bool: + return self.read_configuration( + "flowmldetection", "create_performance_metrics_log_files", False + ) + + def risk_iq_credentials_path(self): return self.read_configuration( "threatintelligence", "RiskIQ_credentials_path", "" ) From 65206b61a2009dfebd8bdc938ffe0a23fd90c943 Mon Sep 17 00:00:00 2001 From: alya Date: Sat, 10 May 2025 16:23:58 +0300 Subject: [PATCH 278/498] dont create an empty logfile when create_performance_metrics_log_files is set to false --- modules/flowmldetection/flowmldetection.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 2a515d0cf..9305197d3 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -78,6 +78,9 @@ def init_log_file(self): """ Init the log file for training or testing """ + if not self.enable_logs: + return + if self.mode == "train": # Initialize the training log file self.log_path = "./modules/flowmldetection/training.log" From cdbf9d386f4c4063bbf237e952bbadafef307d7f Mon Sep 17 00:00:00 2001 From: alya Date: Sat, 10 May 2025 16:29:30 +0300 Subject: [PATCH 279/498] when enabled, create testing.log or training.log in the current output dir --- modules/flowmldetection/flowmldetection.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 9305197d3..f618195bc 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -3,6 +3,7 @@ # SPDX-License-Identifier: GPL-2.0-only import numpy +import os from sklearn.linear_model import SGDClassifier from sklearn.preprocessing import StandardScaler import pickle @@ -83,10 +84,10 @@ def init_log_file(self): if self.mode == "train": # Initialize the training log file - self.log_path = "./modules/flowmldetection/training.log" + self.log_path = os.path.join(self.output_dir, "training.log") elif self.mode == "test": # Initialize the testing log file - self.log_path = "./modules/flowmldetection/testing.log" + self.log_path = os.path.join(self.output_dir, "testing.log") self.log_file = open(self.log_path, "w") def read_configuration(self): From 68e588ab828fbd2da1b0251e3e3c2fd00f736796 Mon Sep 17 00:00:00 
2001 From: alya Date: Sat, 10 May 2025 16:43:32 +0300 Subject: [PATCH 280/498] Add an enum called labels with either Benign or Malicious so the labels are unified. --- modules/flowmldetection/flowmldetection.py | 65 +++++++++++----------- slips_files/core/structures/labels.py | 11 ++++ 2 files changed, 43 insertions(+), 33 deletions(-) create mode 100644 slips_files/core/structures/labels.py diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index f618195bc..e828058ee 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -19,11 +19,10 @@ matthews_corrcoef, recall_score, ) - - from slips_files.common.parsers.config_parser import ConfigParser from slips_files.common.slips_utils import utils from slips_files.common.abstracts.module import IModule +from slips_files.core.structures.labels import Label from slips_files.core.structures.evidence import ( Evidence, ProfileID, @@ -45,6 +44,10 @@ def warn(*args, **kwargs): warnings.warn = warn +BACKGROUND = Label.BACKGROUND.name +BENIGN = Label.BENIGN.name +MALICIOUS = Label.MALICIOUS.name + class FlowMLDetection(IModule): # Name: short name of the module. Do not use spaces @@ -132,9 +135,9 @@ def train(self, sum_labeled_flows, last_number_of_flows_when_trained): # Count the number of labels of each type in this epoc epoch_label_counts = { - "Background": (y_flow == "Background").sum(), - "Malicious": (y_flow == "Malicious").sum(), - "Benign": (y_flow == "Benign").sum(), + BACKGROUND: (y_flow == BACKGROUND).sum(), + MALICIOUS: (y_flow == MALICIOUS).sum(), + BENIGN: (y_flow == BENIGN).sum(), } # Train @@ -143,7 +146,7 @@ def train(self, sum_labeled_flows, last_number_of_flows_when_trained): self.clf.partial_fit( X_flow, y_flow, - classes=["Background", "Malicious", "Benign"], + classes=[BACKGROUND, MALICIOUS, BENIGN], ) except Exception: self.print("Error while calling clf.train()") @@ -153,13 +156,13 @@ def train(self, sum_labeled_flows, last_number_of_flows_when_trained): y_pred = self.clf.predict(X_flow) # For metrics, let's focus on Malicious vs Benign (ignore Background) - mask = (y_flow == "Malicious") | (y_flow == "Benign") + mask = (y_flow == MALICIOUS) | (y_flow == BENIGN) y_true_bin = y_flow[mask] y_pred_bin = y_pred[mask] # Map to binary: Malicious=1, Benign=0 - y_true_bin = numpy.where(y_true_bin == "Malicious", 1, 0) - y_pred_bin = numpy.where(y_pred_bin == "Malicious", 1, 0) + y_true_bin = numpy.where(y_true_bin == MALICIOUS, 1, 0) + y_pred_bin = numpy.where(y_pred_bin == MALICIOUS, 1, 0) # Compute confusion matrix: tn, fp, fn, tp tn, fp, fn, tp = ( @@ -190,9 +193,12 @@ def train(self, sum_labeled_flows, last_number_of_flows_when_trained): self.write_to_log( f"Total labels: {sum_labeled_flows}, " f"Background: {epoch_label_counts['Background']}. " - f"Benign: {epoch_label_counts['Benign']}. Malicious: {epoch_label_counts['Malicious']}. " - f"Metrics: FPR={FPR:.4f}, TNR={TNR:.4f}, TPR={TPR:.4f}, FNR={FNR:.4f}, " - f"F1={F1:.4f}, Precision={PREC:.4f}, Accuracy={ACCU:.4f}, MCC={MCC:.4f}, Recall={RECALL:.4f}." + f"Benign: {epoch_label_counts['Benign']}. " + f"Malicious: {epoch_label_counts[MALICIOUS]}. " + f"Metrics: FPR={FPR:.4f}, TNR={TNR:.4f}, " + f"TPR={TPR:.4f}, FNR={FNR:.4f}, " + f"F1={F1:.4f}, Precision={PREC:.4f}, " + f"Accuracy={ACCU:.4f}, MCC={MCC:.4f}, Recall={RECALL:.4f}." 
) except Exception: self.print("Error in train().", 0, 1) @@ -345,9 +351,9 @@ def process_training_flows(self, last_number_of_flows_when_trained): "sbytes": 25517, "dbytes": 17247, "appproto": "ssl", - "ground_truth_label": "Malicious", + "ground_truth_label": MALICIOUS, "module_labels": { - "flowalerts-long-connection": "Malicious" + "flowalerts-long-connection": MALICIOUS }, } ) @@ -366,9 +372,9 @@ def process_training_flows(self, last_number_of_flows_when_trained): "sbytes": 100, "dbytes": 67596, "appproto": "http", - "ground_truth_label": "Benign", + "ground_truth_label": BENIGN, "module_labels": { - "flowalerts-long-connection": "Benign" + "flowalerts-long-connection": BENIGN }, } ) @@ -421,7 +427,8 @@ def detect(self, x_flow) -> Optional[numpy.ndarray]: "ground_truth_label", "detailed_ground_truth_label", ] - # For argus binetflows this fails because ther is a field calle bytes that was not in other flows. It should be called allbytes. + # For argus binetflows this fails because ther is a field calle + # bytes that was not in other flows. It should be called allbytes. # Error """ [Flow ML Detection] Error in detect() while processing dur proto sport dport state pkts spkts dpkts bytes sbytes dbytes allbytes @@ -546,8 +553,8 @@ def main(self): self.twid = msg["twid"] self.profileid = msg["profileid"] self.flow = msg["flow"] - # These following extra fields are expected in testing. update the original - # flow dict to have them + # These following extra fields are expected in testing. + # update the original flow dict to have them self.flow.update( { "state": msg["interpreted_state"], @@ -612,7 +619,7 @@ def main(self): # an error occurred return - if pred[0] == "Malicious": + if pred[0] == MALICIOUS: # Generate an alert self.set_evidence_malicious_flow(self.flow, self.twid) self.print( @@ -642,26 +649,18 @@ def main(self): # Update counters based on predictions and labels if ( - pred[0] == "Malicious" - and original_label == "Malicious" + pred[0] == MALICIOUS + and original_label == MALICIOUS ): self.tp += 1 - elif ( - pred[0] == "Benign" and original_label == "Benign" - ): + elif pred[0] == BENIGN and original_label == BENIGN: self.tn += 1 - elif ( - pred[0] == "Malicious" - and original_label == "Benign" - ): + elif pred[0] == MALICIOUS and original_label == BENIGN: self.fp += 1 self.write_to_log( f"False Positive Flow: {self.flow}" ) - elif ( - pred[0] == "Benign" - and original_label == "Malicious" - ): + elif pred[0] == BENIGN and original_label == MALICIOUS: self.fn += 1 self.write_to_log( f"False Negative Flow: {self.flow}" diff --git a/slips_files/core/structures/labels.py b/slips_files/core/structures/labels.py new file mode 100644 index 000000000..b1dc64234 --- /dev/null +++ b/slips_files/core/structures/labels.py @@ -0,0 +1,11 @@ +from enum import Enum + + +class Label(Enum): + """ + label of flows should be one of the following + """ + + MALICIOUS = "Malicious" + BENIGN = "Benign" + BACKGROUND = "Background" From 705f63d56c98f536e52a1b0cd0c02836c14aa4b4 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Mon, 29 Jul 2024 16:32:14 +0200 Subject: [PATCH 281/498] mlflow. 
Add a function to convert the state again --- modules/flowmldetection/flowmldetection.py | 154 ++++++++++++++++++++- 1 file changed, 150 insertions(+), 4 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index e44ac83f4..16b67e903 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -10,6 +10,7 @@ import json import traceback import warnings +import sys from slips_files.common.parsers.config_parser import ConfigParser from slips_files.common.slips_utils import utils @@ -120,6 +121,141 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) + + def get_final_state_from_flags(self, state, pkts): + """ + Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags + We receive the pakets to distinguish some Reset connections + """ + try: + pre = state.split("_")[0] + try: + # Try suricata states + """ + There are different states in which a flow can be. + Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, + these are: New, Established and Closed,for UDP only new and established. + For each of these states Suricata can employ different timeouts. + """ + if "new" in state or "established" in state: + return "Established" + elif "closed" in state: + return "Not Established" + + # We have varius type of states depending on the type of flow. + # For Zeek + if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): + return "Not Established" + elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): + return "Established" + + # For Argus + suf = state.split("_")[1] + if "S" in pre and "A" in pre and "S" in suf and "A" in suf: + """ + Examples: + SA_SA + SR_SA + FSRA_SA + SPA_SPA + SRA_SPA + FSA_FSA + FSA_FSPA + SAEC_SPA + SRPA_SPA + FSPA_SPA + FSRPA_SPA + FSPA_FSPA + FSRA_FSPA + SRAEC_SPA + FSPA_FSRPA + FSAEC_FSPA + FSRPA_FSPA + SRPAEC_SPA + FSPAEC_FSPA + SRPAEC_FSRPA + """ + return "Established" + elif "PA" in pre and "PA" in suf: + # Tipical flow that was reported in the middle + """ + Examples: + PA_PA + FPA_FPA + """ + return "Established" + elif "ECO" in pre: + return "ICMP Echo" + elif "ECR" in pre: + return "ICMP Reply" + elif "URH" in pre: + return "ICMP Host Unreachable" + elif "URP" in pre: + return "ICMP Port Unreachable" + else: + """ + Examples: + S_RA + S_R + A_R + S_SA + SR_SA + FA_FA + SR_RA + SEC_RA + """ + return "Not Established" + except IndexError: + # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP + if "ECO" in pre: + # ICMP + return "Established" + elif "UNK" in pre: + # ICMP6 unknown upper layer + return "Established" + elif "CON" in pre: + # UDP + return "Established" + elif "INT" in pre: + # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there + # is no confirmation of what happened. + return "Not Established" + elif "EST" in pre: + # TCP + return "Established" + elif "RST" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established + # It can happen that is reseted being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. 
+ return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + elif "FIN" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established + # It can happen that is finished being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. + return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + else: + """ + Examples: + S_ + FA_ + PA_ + FSA_ + SEC_ + SRPA_ + """ + return "Not Established" + except Exception: + exception_line = sys.exc_info()[2].tb_lineno + self.print( + f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", + 0, + 1, + ) + self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ @@ -132,7 +268,7 @@ def process_features(self, dataset): for proto in to_discard: dataset = dataset[dataset.proto != proto] - # For now, discard the ports + # For now, discard these to_drop = [ "appproto", "daddr", @@ -155,15 +291,25 @@ def process_features(self, dataset): except (ValueError, KeyError): pass + # When flows are read from Slips sqlite, the state is not transformed to 'Established' or 'Not Established', it is still 'S0' and others + # So transform here + #new_state_column = dataset.state.apply(self.get_final_state_from_flags(dataset.state, dataset.pkts)) + dataset['state'] = dataset.apply(lambda row: self.get_final_state_from_flags(row['state'], row['pkts']), axis=1) + + #dataset.state = new_state_column + # Convert state to categorical dataset.state = dataset.state.str.replace( - r"(^.*NotEstablished.*$)", "0", regex=True + r"(^.*Not Established.*$)", "0", regex=True ) dataset.state = dataset.state.str.replace( r"(^.*Established.*$)", "1", regex=True ) - # Convert proto to categorical. For now we only have few states, - # so we can hardcode... + + # Convert categories to floats + dataset.state = dataset.state.astype("float64") + + # Convert proto to categorical. For now we only have few states, so we can hardcode... # We dont use the data to create categories because in testing mode # we dont see all the protocols # Also we dont store the Categorizer because the user can retrain From b690ea70e919e7ca95227684396e811a349dd771 Mon Sep 17 00:00:00 2001 From: alya Date: Tue, 30 Jul 2024 14:35:21 +0300 Subject: [PATCH 282/498] state_handler: split get_final_state_from_flags() into smaller functions --- slips_files/common/state_handler.py | 179 ++++++++++++++++++++++++++++ 1 file changed, 179 insertions(+) create mode 100644 slips_files/common/state_handler.py diff --git a/slips_files/common/state_handler.py b/slips_files/common/state_handler.py new file mode 100644 index 000000000..b671a09a2 --- /dev/null +++ b/slips_files/common/state_handler.py @@ -0,0 +1,179 @@ +from typing import Optional +import sys +import traceback + + +def check_suricata_states(state) -> Optional[str]: + """ + There are different states in which a flow can be. + Suricata distinguishes three flow-states for TCP and two for + UDP. For TCP, + these are: New, Established and Closed,for UDP only new and + established. + For each of these states Suricata can employ different timeouts. + """ + if "new" in state or "established" in state: + return "Established" + elif "closed" in state: + return "Not Established" + + +def check_zeek_states(state) -> Optional[str]: + # We have varius type of states depending on the type of flow. 
+ # For Zeek + if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): + return "Not Established" + elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): + return "Established" + + +def check_argus_states(state) -> Optional[str]: + pre = state.split("_")[0] + suf = state.split("_")[1] + if "S" in pre and "A" in pre and "S" in suf and "A" in suf: + """ + Examples: + SA_SA + SR_SA + FSRA_SA + SPA_SPA + SRA_SPA + FSA_FSA + FSA_FSPA + SAEC_SPA + SRPA_SPA + FSPA_SPA + FSRPA_SPA + FSPA_FSPA + FSRA_FSPA + SRAEC_SPA + FSPA_FSRPA + FSAEC_FSPA + FSRPA_FSPA + SRPAEC_SPA + FSPAEC_FSPA + SRPAEC_FSRPA + """ + return "Established" + elif "PA" in pre and "PA" in suf: + # Tipical flow that was reported in the middle + """ + Examples: + PA_PA + FPA_FPA + """ + return "Established" + elif "ECO" in pre: + return "ICMP Echo" + elif "ECR" in pre: + return "ICMP Reply" + elif "URH" in pre: + return "ICMP Host Unreachable" + elif "URP" in pre: + return "ICMP Port Unreachable" + else: + """ + Examples: + S_RA + S_R + A_R + S_SA + SR_SA + FA_FA + SR_RA + SEC_RA + """ + return "Not Established" + + +def check_tcp_states(state, pkts) -> Optional[str]: + pre = state.split("_")[0] + if "EST" in pre: + # TCP + return "Established" + elif "RST" in pre: + # TCP. When -z B is not used in argus, states are single words. + # Most connections are reseted when finished and therefore are + # established + # It can happen that is reseted being not established, but we + # can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is + # not established because the OS retries 3 times. + return "Not Established" if int(pkts) <= 3 else "Established" + elif "FIN" in pre: + # TCP. When -z B is not used in argus, states are single words. + # Most connections are finished with FIN when finished and + # therefore are established + # It can happen that is finished being not established, but we + # can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is + # not established because the OS retries 3 times. + return "Not Established" if int(pkts) <= 3 else "Established" + else: + """ + Examples: + S_ + FA_ + PA_ + FSA_ + SEC_ + SRPA_ + """ + return "Not Established" + + +def check_udp_states(state) -> Optional[str]: + pre = state.split("_")[0] + if "CON" in pre: + # UDP + return "Established" + elif "INT" in pre: + # UDP trying to connect, NOT preciselly not established but also + # NOT 'Established'. So we considered not established because there + # is no confirmation of what happened. + return "Not Established" + + +def check_icmp_states(state) -> Optional[str]: + pre = state.split("_")[0] + if "ECO" in pre: + # ICMP + return "Established" + elif "UNK" in pre: + # ICMP6 unknown upper layer + return "Established" + + +def get_final_state_from_flags(self, state, pkts) -> str: + """ + Analyze the flags given and return a summary of the state. 
+ Should work with Argus and Bro flags + We receive the pakets to distinguish some Reset connections + """ + try: + if state := check_suricata_states(state): + return state + if state := check_zeek_states(state): + return state + if state := check_argus_states(state): + return state + except IndexError: + # suf does not exist, which means that this is some ICMP or + # no response was sent for UDP or TCP + if state := check_icmp_states(state): + return state + if state := check_udp_states(state): + return state + if state := check_tcp_states(state, pkts): + return state + + return "Not Established" + + except Exception: + exception_line = sys.exc_info()[2].tb_lineno + self.print( + f"Error in get_final_state_from_flags() " f"line {exception_line}", + 0, + 1, + ) + self.print(traceback.format_exc(), 0, 1) From 00415c7c2bdf9900eee91682602db8ff609ec19d Mon Sep 17 00:00:00 2001 From: alya Date: Tue, 30 Jul 2024 14:48:16 +0300 Subject: [PATCH 283/498] state_handler: refactor get_final_state_from_flags() --- slips_files/common/state_handler.py | 67 +++++++++++++---------------- 1 file changed, 29 insertions(+), 38 deletions(-) diff --git a/slips_files/common/state_handler.py b/slips_files/common/state_handler.py index b671a09a2..d0a05115b 100644 --- a/slips_files/common/state_handler.py +++ b/slips_files/common/state_handler.py @@ -1,9 +1,7 @@ from typing import Optional -import sys -import traceback -def check_suricata_states(state) -> Optional[str]: +def interpret_suricata_states(state) -> Optional[str]: """ There are different states in which a flow can be. Suricata distinguishes three flow-states for TCP and two for @@ -18,7 +16,7 @@ def check_suricata_states(state) -> Optional[str]: return "Not Established" -def check_zeek_states(state) -> Optional[str]: +def interpret_zeek_states(state) -> Optional[str]: # We have varius type of states depending on the type of flow. # For Zeek if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): @@ -27,9 +25,13 @@ def check_zeek_states(state) -> Optional[str]: return "Established" -def check_argus_states(state) -> Optional[str]: +def interpret_argus_states(state) -> Optional[str]: pre = state.split("_")[0] - suf = state.split("_")[1] + try: + suf = state.split("_")[1] + except IndexError: + return + if "S" in pre and "A" in pre and "S" in suf and "A" in suf: """ Examples: @@ -86,7 +88,7 @@ def check_argus_states(state) -> Optional[str]: return "Not Established" -def check_tcp_states(state, pkts) -> Optional[str]: +def interpret_tcp_states(state, pkts) -> Optional[str]: pre = state.split("_")[0] if "EST" in pre: # TCP @@ -122,7 +124,7 @@ def check_tcp_states(state, pkts) -> Optional[str]: return "Not Established" -def check_udp_states(state) -> Optional[str]: +def interpret_udp_states(state) -> Optional[str]: pre = state.split("_")[0] if "CON" in pre: # UDP @@ -134,7 +136,7 @@ def check_udp_states(state) -> Optional[str]: return "Not Established" -def check_icmp_states(state) -> Optional[str]: +def interpret_icmp_states(state) -> Optional[str]: pre = state.split("_")[0] if "ECO" in pre: # ICMP @@ -144,36 +146,25 @@ def check_icmp_states(state) -> Optional[str]: return "Established" -def get_final_state_from_flags(self, state, pkts) -> str: +def get_final_state_from_flags(state, pkts) -> str: """ - Analyze the flags given and return a summary of the state. 
- Should work with Argus and Bro flags - We receive the pakets to distinguish some Reset connections + Converts the original flags from the flow, to a state that slips + understands + Works with Argus, suricata, and Bro flags + We receive the packets to distinguish some Reset connections """ - try: - if state := check_suricata_states(state): - return state - if state := check_zeek_states(state): - return state - if state := check_argus_states(state): - return state - except IndexError: - # suf does not exist, which means that this is some ICMP or - # no response was sent for UDP or TCP - if state := check_icmp_states(state): - return state - if state := check_udp_states(state): - return state - if state := check_tcp_states(state, pkts): - return state - return "Not Established" + for interpreter in ( + interpret_suricata_states, + interpret_zeek_states, + interpret_argus_states, + interpret_icmp_states, + interpret_udp_states, + ): + if interpreted_state := interpreter(state): + return interpreted_state + + if interpreted_state := interpret_tcp_states(state, pkts): + return interpreted_state - except Exception: - exception_line = sys.exc_info()[2].tb_lineno - self.print( - f"Error in get_final_state_from_flags() " f"line {exception_line}", - 0, - 1, - ) - self.print(traceback.format_exc(), 0, 1) + return "Not Established" From f2de4e978cc9755565a87f168ee6d7c2cbd4abba Mon Sep 17 00:00:00 2001 From: alya Date: Tue, 30 Jul 2024 14:59:22 +0300 Subject: [PATCH 284/498] delete get_final_state_from_flags() from flowmldetection, profiler, and the db --- modules/flowmldetection/flowmldetection.py | 150 ++---------------- slips_files/core/database/database_manager.py | 3 - 2 files changed, 10 insertions(+), 143 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 16b67e903..3379f5077 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -121,141 +121,6 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) - - def get_final_state_from_flags(self, state, pkts): - """ - Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags - We receive the pakets to distinguish some Reset connections - """ - try: - pre = state.split("_")[0] - try: - # Try suricata states - """ - There are different states in which a flow can be. - Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, - these are: New, Established and Closed,for UDP only new and established. - For each of these states Suricata can employ different timeouts. - """ - if "new" in state or "established" in state: - return "Established" - elif "closed" in state: - return "Not Established" - - # We have varius type of states depending on the type of flow. 
- # For Zeek - if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): - return "Not Established" - elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): - return "Established" - - # For Argus - suf = state.split("_")[1] - if "S" in pre and "A" in pre and "S" in suf and "A" in suf: - """ - Examples: - SA_SA - SR_SA - FSRA_SA - SPA_SPA - SRA_SPA - FSA_FSA - FSA_FSPA - SAEC_SPA - SRPA_SPA - FSPA_SPA - FSRPA_SPA - FSPA_FSPA - FSRA_FSPA - SRAEC_SPA - FSPA_FSRPA - FSAEC_FSPA - FSRPA_FSPA - SRPAEC_SPA - FSPAEC_FSPA - SRPAEC_FSRPA - """ - return "Established" - elif "PA" in pre and "PA" in suf: - # Tipical flow that was reported in the middle - """ - Examples: - PA_PA - FPA_FPA - """ - return "Established" - elif "ECO" in pre: - return "ICMP Echo" - elif "ECR" in pre: - return "ICMP Reply" - elif "URH" in pre: - return "ICMP Host Unreachable" - elif "URP" in pre: - return "ICMP Port Unreachable" - else: - """ - Examples: - S_RA - S_R - A_R - S_SA - SR_SA - FA_FA - SR_RA - SEC_RA - """ - return "Not Established" - except IndexError: - # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP - if "ECO" in pre: - # ICMP - return "Established" - elif "UNK" in pre: - # ICMP6 unknown upper layer - return "Established" - elif "CON" in pre: - # UDP - return "Established" - elif "INT" in pre: - # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there - # is no confirmation of what happened. - return "Not Established" - elif "EST" in pre: - # TCP - return "Established" - elif "RST" in pre: - # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established - # It can happen that is reseted being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. - return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - elif "FIN" in pre: - # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established - # It can happen that is finished being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. 
- return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - else: - """ - Examples: - S_ - FA_ - PA_ - FSA_ - SEC_ - SRPA_ - """ - return "Not Established" - except Exception: - exception_line = sys.exc_info()[2].tb_lineno - self.print( - f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", - 0, - 1, - ) - self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ @@ -291,12 +156,17 @@ def process_features(self, dataset): except (ValueError, KeyError): pass - # When flows are read from Slips sqlite, the state is not transformed to 'Established' or 'Not Established', it is still 'S0' and others + # When flows are read from Slips sqlite, + # the state is not transformed to 'Established' or + # 'Not Established', it is still 'S0' and others # So transform here - #new_state_column = dataset.state.apply(self.get_final_state_from_flags(dataset.state, dataset.pkts)) - dataset['state'] = dataset.apply(lambda row: self.get_final_state_from_flags(row['state'], row['pkts']), axis=1) - - #dataset.state = new_state_column + dataset["state"] = dataset.apply( + lambda row: get_final_state_from_flags( + row["state"], row["pkts"] + ), + axis=1, + ) + # dataset.state = new_state_column # Convert state to categorical dataset.state = dataset.state.str.replace( diff --git a/slips_files/core/database/database_manager.py b/slips_files/core/database/database_manager.py index 0b805976d..3a7f783ea 100644 --- a/slips_files/core/database/database_manager.py +++ b/slips_files/core/database/database_manager.py @@ -613,9 +613,6 @@ def add_out_dns(self, *args, **kwargs): def add_port(self, *args, **kwargs): return self.rdb.add_port(*args, **kwargs) - def get_final_state_from_flags(self, *args, **kwargs): - return self.rdb.get_final_state_from_flags(*args, **kwargs) - def add_ips(self, *args, **kwargs): return self.rdb.add_ips(*args, **kwargs) From bfc1221692fc0d0e8d72ad157f2eeff254706cc5 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Mon, 29 Jul 2024 16:32:14 +0200 Subject: [PATCH 285/498] mlflow. Add a function to convert the state again --- modules/flowmldetection/flowmldetection.py | 135 +++++++++++++++++++++ 1 file changed, 135 insertions(+) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 3379f5077..f052931c8 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -121,6 +121,141 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) + + def get_final_state_from_flags(self, state, pkts): + """ + Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags + We receive the pakets to distinguish some Reset connections + """ + try: + pre = state.split("_")[0] + try: + # Try suricata states + """ + There are different states in which a flow can be. + Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, + these are: New, Established and Closed,for UDP only new and established. + For each of these states Suricata can employ different timeouts. + """ + if "new" in state or "established" in state: + return "Established" + elif "closed" in state: + return "Not Established" + + # We have varius type of states depending on the type of flow. 
+ # For Zeek + if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): + return "Not Established" + elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): + return "Established" + + # For Argus + suf = state.split("_")[1] + if "S" in pre and "A" in pre and "S" in suf and "A" in suf: + """ + Examples: + SA_SA + SR_SA + FSRA_SA + SPA_SPA + SRA_SPA + FSA_FSA + FSA_FSPA + SAEC_SPA + SRPA_SPA + FSPA_SPA + FSRPA_SPA + FSPA_FSPA + FSRA_FSPA + SRAEC_SPA + FSPA_FSRPA + FSAEC_FSPA + FSRPA_FSPA + SRPAEC_SPA + FSPAEC_FSPA + SRPAEC_FSRPA + """ + return "Established" + elif "PA" in pre and "PA" in suf: + # Tipical flow that was reported in the middle + """ + Examples: + PA_PA + FPA_FPA + """ + return "Established" + elif "ECO" in pre: + return "ICMP Echo" + elif "ECR" in pre: + return "ICMP Reply" + elif "URH" in pre: + return "ICMP Host Unreachable" + elif "URP" in pre: + return "ICMP Port Unreachable" + else: + """ + Examples: + S_RA + S_R + A_R + S_SA + SR_SA + FA_FA + SR_RA + SEC_RA + """ + return "Not Established" + except IndexError: + # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP + if "ECO" in pre: + # ICMP + return "Established" + elif "UNK" in pre: + # ICMP6 unknown upper layer + return "Established" + elif "CON" in pre: + # UDP + return "Established" + elif "INT" in pre: + # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there + # is no confirmation of what happened. + return "Not Established" + elif "EST" in pre: + # TCP + return "Established" + elif "RST" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established + # It can happen that is reseted being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. + return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + elif "FIN" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established + # It can happen that is finished being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. 
+ return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + else: + """ + Examples: + S_ + FA_ + PA_ + FSA_ + SEC_ + SRPA_ + """ + return "Not Established" + except Exception: + exception_line = sys.exc_info()[2].tb_lineno + self.print( + f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", + 0, + 1, + ) + self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ From e9c16da10372297e2c4258b11dd94f02475c6f2d Mon Sep 17 00:00:00 2001 From: alya Date: Tue, 30 Jul 2024 14:59:22 +0300 Subject: [PATCH 286/498] delete get_final_state_from_flags() from flowmldetection, profiler, and the db --- modules/flowmldetection/flowmldetection.py | 135 --------------------- 1 file changed, 135 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index f052931c8..3379f5077 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -121,141 +121,6 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) - - def get_final_state_from_flags(self, state, pkts): - """ - Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags - We receive the pakets to distinguish some Reset connections - """ - try: - pre = state.split("_")[0] - try: - # Try suricata states - """ - There are different states in which a flow can be. - Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, - these are: New, Established and Closed,for UDP only new and established. - For each of these states Suricata can employ different timeouts. - """ - if "new" in state or "established" in state: - return "Established" - elif "closed" in state: - return "Not Established" - - # We have varius type of states depending on the type of flow. - # For Zeek - if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): - return "Not Established" - elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): - return "Established" - - # For Argus - suf = state.split("_")[1] - if "S" in pre and "A" in pre and "S" in suf and "A" in suf: - """ - Examples: - SA_SA - SR_SA - FSRA_SA - SPA_SPA - SRA_SPA - FSA_FSA - FSA_FSPA - SAEC_SPA - SRPA_SPA - FSPA_SPA - FSRPA_SPA - FSPA_FSPA - FSRA_FSPA - SRAEC_SPA - FSPA_FSRPA - FSAEC_FSPA - FSRPA_FSPA - SRPAEC_SPA - FSPAEC_FSPA - SRPAEC_FSRPA - """ - return "Established" - elif "PA" in pre and "PA" in suf: - # Tipical flow that was reported in the middle - """ - Examples: - PA_PA - FPA_FPA - """ - return "Established" - elif "ECO" in pre: - return "ICMP Echo" - elif "ECR" in pre: - return "ICMP Reply" - elif "URH" in pre: - return "ICMP Host Unreachable" - elif "URP" in pre: - return "ICMP Port Unreachable" - else: - """ - Examples: - S_RA - S_R - A_R - S_SA - SR_SA - FA_FA - SR_RA - SEC_RA - """ - return "Not Established" - except IndexError: - # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP - if "ECO" in pre: - # ICMP - return "Established" - elif "UNK" in pre: - # ICMP6 unknown upper layer - return "Established" - elif "CON" in pre: - # UDP - return "Established" - elif "INT" in pre: - # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there - # is no confirmation of what happened. - return "Not Established" - elif "EST" in pre: - # TCP - return "Established" - elif "RST" in pre: - # TCP. 
When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established
-                    # It can happen that is reseted being not established, but we can't tell without -z b.
-                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
-                    return (
-                        "Not Established" if int(pkts) <= 3 else "Established"
-                    )
-                elif "FIN" in pre:
-                    # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established
-                    # It can happen that is finished being not established, but we can't tell without -z b.
-                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
-                    return (
-                        "Not Established" if int(pkts) <= 3 else "Established"
-                    )
-                else:
-                    """
-                    Examples:
-                    S_
-                    FA_
-                    PA_
-                    FSA_
-                    SEC_
-                    SRPA_
-                    """
-                    return "Not Established"
-        except Exception:
-            exception_line = sys.exc_info()[2].tb_lineno
-            self.print(
-                f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}",
-                0,
-                1,
-            )
-            self.print(traceback.format_exc(), 0, 1)

     def process_features(self, dataset):
         """
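After PATCH 286, the flag-interpretation helper lives only in slips_files/common/state_handler.py and process_features() consumes it as a plain module-level function. The following is a minimal sketch of that call site, assuming the PATCH 283 version of the module is importable; the two flows (a Zeek "S0" scan-like state and an Argus "FSPA_FSPA" handshake) are made-up sample data:

import pandas as pd

from slips_files.common.state_handler import get_final_state_from_flags

# Two hypothetical flows: a Zeek "S0" (no reply seen) and an Argus
# "FSPA_FSPA" (full handshake seen in both directions).
dataset = pd.DataFrame({"state": ["S0", "FSPA_FSPA"], "pkts": [2, 40]})

# Same row-wise transformation as in process_features() above:
dataset["state"] = dataset.apply(
    lambda row: get_final_state_from_flags(row["state"], row["pkts"]),
    axis=1,
)

# Collapse the summary strings to 0/1. "Not Established" must be
# replaced first, because the "Established" regex also matches it.
dataset.state = dataset.state.str.replace(
    r"(^.*Not Established.*$)", "0", regex=True
)
dataset.state = dataset.state.str.replace(
    r"(^.*Established.*$)", "1", regex=True
)
dataset.state = dataset.state.astype("float64")

print(dataset.state.tolist())  # expected: [0.0, 1.0]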
From f06b6a3ff035031735ec93a106d4ea0a4315d50e Mon Sep 17 00:00:00 2001
From: Sebas Garcia
Date: Sat, 15 Mar 2025 19:23:29 +0100
Subject: [PATCH 295/498] Re add function that alya added

---
 slips_files/core/database/database_manager.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/slips_files/core/database/database_manager.py b/slips_files/core/database/database_manager.py
index 3a7f783ea..0b805976d 100644
--- a/slips_files/core/database/database_manager.py
+++ b/slips_files/core/database/database_manager.py
@@ -613,6 +613,9 @@ def add_out_dns(self, *args, **kwargs):
     def add_port(self, *args, **kwargs):
         return self.rdb.add_port(*args, **kwargs)

+    def get_final_state_from_flags(self, *args, **kwargs):
+        return self.rdb.get_final_state_from_flags(*args, **kwargs)
+
     def add_ips(self, *args, **kwargs):
         return self.rdb.add_ips(*args, **kwargs)
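PATCH 295 restores the thin wrapper so callers keep going through DBManager instead of touching the Redis-backed rdb object directly. Below is a stripped-down sketch of that delegation pattern; DBManagerSketch and FakeRedisDB are illustrative stand-ins, not real Slips classes:

class FakeRedisDB:
    """Stand-in for the Redis-backed database object (self.rdb)."""

    def get_final_state_from_flags(self, state, pkts):
        # The real logic lives in the redis db class; this stub only
        # shows that the forwarded call arrives with the same args.
        return "Established" if int(pkts) > 3 else "Not Established"


class DBManagerSketch:
    """Facade: exposes one flat API and forwards to self.rdb."""

    def __init__(self):
        self.rdb = FakeRedisDB()

    def get_final_state_from_flags(self, *args, **kwargs):
        # Same one-line delegation as the restored method above.
        return self.rdb.get_final_state_from_flags(*args, **kwargs)


db = DBManagerSketch()
print(db.get_final_state_from_flags("RST", 2))  # Not Established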
From 9e0355a012f073928a7edcb388701a0e7e26748c Mon Sep 17 00:00:00 2001
From: Sebas Garcia
Date: Sat, 15 Mar 2025 19:25:03 +0100
Subject: [PATCH 296/498] delete sys

---
 modules/flowmldetection/flowmldetection.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 3379f5077..c06755a59 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -10,7 +10,6 @@
 import json
 import traceback
 import warnings
-import sys

 from slips_files.common.parsers.config_parser import ConfigParser
 from slips_files.common.slips_utils import utils
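PATCH 296 drops import sys, which was only needed by the error handler of the deleted method, and PATCH 297 below removes state_handler.py as well. Before it goes, the packet-count heuristic this series keeps re-adding deserves a concrete statement: bare Argus RST/FIN states carry no suffix to inspect, so the code treats <= 3 packets as a connection that was never established, because the OS retries a SYN three times. A self-contained sketch of just that rule; tcp_state_heuristic is a made-up name, not a Slips function:

def tcp_state_heuristic(state: str, pkts: int) -> str:
    """Sketch of the RST/FIN fallback used throughout the patches.

    Without argus -z B, states are single words, so a reset or
    FIN-closed flow may or may not have been established; <= 3
    packets means only SYN retries, i.e. never established.
    """
    pre = state.split("_")[0]
    if "RST" in pre or "FIN" in pre:
        return "Not Established" if int(pkts) <= 3 else "Established"
    return "Not Established"


assert tcp_state_heuristic("RST", 2) == "Not Established"
assert tcp_state_heuristic("FIN", 12) == "Established"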
When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established - # It can happen that is reseted being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. - return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - elif "FIN" in pre: - # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established - # It can happen that is finished being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. - return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - else: - """ - Examples: - S_ - FA_ - PA_ - FSA_ - SEC_ - SRPA_ - """ - return "Not Established" - except Exception: - exception_line = sys.exc_info()[2].tb_lineno - self.print( - f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", - 0, - 1, - ) - self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ From f06b6a3ff035031735ec93a106d4ea0a4315d50e Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Sat, 15 Mar 2025 19:23:29 +0100 Subject: [PATCH 295/498] Re add function that alya added --- slips_files/core/database/database_manager.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/slips_files/core/database/database_manager.py b/slips_files/core/database/database_manager.py index 3a7f783ea..0b805976d 100644 --- a/slips_files/core/database/database_manager.py +++ b/slips_files/core/database/database_manager.py @@ -613,6 +613,9 @@ def add_out_dns(self, *args, **kwargs): def add_port(self, *args, **kwargs): return self.rdb.add_port(*args, **kwargs) + def get_final_state_from_flags(self, *args, **kwargs): + return self.rdb.get_final_state_from_flags(*args, **kwargs) + def add_ips(self, *args, **kwargs): return self.rdb.add_ips(*args, **kwargs) From 9e0355a012f073928a7edcb388701a0e7e26748c Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Sat, 15 Mar 2025 19:25:03 +0100 Subject: [PATCH 296/498] delete sys --- modules/flowmldetection/flowmldetection.py | 1 - 1 file changed, 1 deletion(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 3379f5077..c06755a59 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -10,7 +10,6 @@ import json import traceback import warnings -import sys from slips_files.common.parsers.config_parser import ConfigParser from slips_files.common.slips_utils import utils From c98a3cd4ea7da549834fee1a3d5d34c33f068266 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Sat, 15 Mar 2025 19:27:23 +0100 Subject: [PATCH 297/498] Delete file that was deleted from develop --- slips_files/common/state_handler.py | 170 ---------------------------- 1 file changed, 170 deletions(-) delete mode 100644 slips_files/common/state_handler.py diff --git a/slips_files/common/state_handler.py b/slips_files/common/state_handler.py deleted file mode 100644 index d0a05115b..000000000 --- a/slips_files/common/state_handler.py +++ /dev/null @@ -1,170 +0,0 @@ -from typing import Optional - - -def interpret_suricata_states(state) -> Optional[str]: - """ - There are different states in which a flow can be. - Suricata distinguishes three flow-states for TCP and two for - UDP. 
For TCP, - these are: New, Established and Closed,for UDP only new and - established. - For each of these states Suricata can employ different timeouts. - """ - if "new" in state or "established" in state: - return "Established" - elif "closed" in state: - return "Not Established" - - -def interpret_zeek_states(state) -> Optional[str]: - # We have varius type of states depending on the type of flow. - # For Zeek - if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): - return "Not Established" - elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): - return "Established" - - -def interpret_argus_states(state) -> Optional[str]: - pre = state.split("_")[0] - try: - suf = state.split("_")[1] - except IndexError: - return - - if "S" in pre and "A" in pre and "S" in suf and "A" in suf: - """ - Examples: - SA_SA - SR_SA - FSRA_SA - SPA_SPA - SRA_SPA - FSA_FSA - FSA_FSPA - SAEC_SPA - SRPA_SPA - FSPA_SPA - FSRPA_SPA - FSPA_FSPA - FSRA_FSPA - SRAEC_SPA - FSPA_FSRPA - FSAEC_FSPA - FSRPA_FSPA - SRPAEC_SPA - FSPAEC_FSPA - SRPAEC_FSRPA - """ - return "Established" - elif "PA" in pre and "PA" in suf: - # Tipical flow that was reported in the middle - """ - Examples: - PA_PA - FPA_FPA - """ - return "Established" - elif "ECO" in pre: - return "ICMP Echo" - elif "ECR" in pre: - return "ICMP Reply" - elif "URH" in pre: - return "ICMP Host Unreachable" - elif "URP" in pre: - return "ICMP Port Unreachable" - else: - """ - Examples: - S_RA - S_R - A_R - S_SA - SR_SA - FA_FA - SR_RA - SEC_RA - """ - return "Not Established" - - -def interpret_tcp_states(state, pkts) -> Optional[str]: - pre = state.split("_")[0] - if "EST" in pre: - # TCP - return "Established" - elif "RST" in pre: - # TCP. When -z B is not used in argus, states are single words. - # Most connections are reseted when finished and therefore are - # established - # It can happen that is reseted being not established, but we - # can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is - # not established because the OS retries 3 times. - return "Not Established" if int(pkts) <= 3 else "Established" - elif "FIN" in pre: - # TCP. When -z B is not used in argus, states are single words. - # Most connections are finished with FIN when finished and - # therefore are established - # It can happen that is finished being not established, but we - # can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is - # not established because the OS retries 3 times. - return "Not Established" if int(pkts) <= 3 else "Established" - else: - """ - Examples: - S_ - FA_ - PA_ - FSA_ - SEC_ - SRPA_ - """ - return "Not Established" - - -def interpret_udp_states(state) -> Optional[str]: - pre = state.split("_")[0] - if "CON" in pre: - # UDP - return "Established" - elif "INT" in pre: - # UDP trying to connect, NOT preciselly not established but also - # NOT 'Established'. So we considered not established because there - # is no confirmation of what happened. 
- return "Not Established" - - -def interpret_icmp_states(state) -> Optional[str]: - pre = state.split("_")[0] - if "ECO" in pre: - # ICMP - return "Established" - elif "UNK" in pre: - # ICMP6 unknown upper layer - return "Established" - - -def get_final_state_from_flags(state, pkts) -> str: - """ - Converts the original flags from the flow, to a state that slips - understands - Works with Argus, suricata, and Bro flags - We receive the packets to distinguish some Reset connections - """ - - for interpreter in ( - interpret_suricata_states, - interpret_zeek_states, - interpret_argus_states, - interpret_icmp_states, - interpret_udp_states, - ): - if interpreted_state := interpreter(state): - return interpreted_state - - if interpreted_state := interpret_tcp_states(state, pkts): - return interpreted_state - - return "Not Established" From 1a133431aba6f1a40e525206cc3ea14749136ffd Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Sat, 15 Mar 2025 19:32:01 +0100 Subject: [PATCH 298/498] Flowmldetection. Fix missing db reference --- modules/flowmldetection/flowmldetection.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index c06755a59..87e07c759 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -160,7 +160,7 @@ def process_features(self, dataset): # 'Not Established', it is still 'S0' and others # So transform here dataset["state"] = dataset.apply( - lambda row: get_final_state_from_flags( + lambda row: self.db.get_final_state_from_flags( row["state"], row["pkts"] ), axis=1, From b7af797fc757d7e3cbfc2317edc7381e5ee1e203 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Tue, 18 Mar 2025 12:08:08 +0100 Subject: [PATCH 299/498] Fix the training of flows with ML in new version --- modules/flowmldetection/flowmldetection.py | 144 +++++++++++---------- 1 file changed, 77 insertions(+), 67 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 87e07c759..e91495d64 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -55,8 +55,12 @@ def init(self): # Set the output queue of our database instance # Read the configuration self.read_configuration() - # Minum amount of new lables needed to trigger the train - self.minimum_lables_to_retrain = 50 + # Minum amount of new labels needed to start the train + self.minimum_labels_to_start_train = 50 + # Minum amount of new labels needed to retrain + self.minimum_labels_to_retrain = 50 + # The number of flows when last trained + self.last_number_of_flows_when_trained = 0 # To plot the scores of training # self.scores = [] # The scaler trained during training and to use during testing @@ -67,26 +71,25 @@ def init(self): def read_configuration(self): conf = ConfigParser() self.mode = conf.get_ml_mode() + self.label = conf.label() def train(self): """ Train a model based on the flows we receive and the labels """ try: - # Process the labels to have only Normal and Malware - self.flows.label = self.flows.label.str.replace( - r"(^.*ormal.*$)", "Normal", regex=True - ) - self.flows.label = self.flows.label.str.replace( - r"(^.*alware.*$)", "Malware", regex=True - ) - self.flows.label = self.flows.label.str.replace( - r"(^.*alicious.*$)", "Malware", regex=True - ) + # Get the flows from the DB + # self.flows = self.db.get_all_flows_in_profileid_twid(self.profileid, self.twid) + # Convert to 
pandas df + # self.flows = pd.DataFrame(self.flows) + # Process the features + # X_flow = self.process_features(self.flows) - # Separate - y_flow = self.flows["label"] + # Create X_flow with the current flows minus the label X_flow = self.flows.drop("label", axis=1) + # Create y_flow with the label + y_flow = numpy.full(X_flow.shape[0], self.label) + # Drop the module_labels X_flow = X_flow.drop("module_labels", axis=1) # Normalize this batch of data so far. This can get progressivle slow @@ -95,7 +98,7 @@ def train(self): # Train try: self.clf.partial_fit( - X_flow, y_flow, classes=["Malware", "Normal"] + X_flow, y_flow, classes=["Malicious", "Benign"] ) except Exception: self.print("Error while calling clf.train()") @@ -118,7 +121,7 @@ def train(self): self.store_model() except Exception: - self.print("Error in train()", 0, 1) + self.print("Error in train().", 0, 1) self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): @@ -144,9 +147,7 @@ def process_features(self, dataset): "history", "uid", "dir_", - "dbytes", "endtime", - "bytes", "flow_source", ] for field in to_drop: @@ -161,11 +162,10 @@ def process_features(self, dataset): # So transform here dataset["state"] = dataset.apply( lambda row: self.db.get_final_state_from_flags( - row["state"], row["pkts"] + row["state"], (row["spkts"] + row["dpkts"]) ), axis=1, ) - # dataset.state = new_state_column # Convert state to categorical dataset.state = dataset.state.str.replace( @@ -199,7 +199,11 @@ def process_features(self, dataset): dataset.proto = dataset.proto.str.replace( r"(^.*arp.*$)", "4", regex=True ) - fields_to_convert_to_flow = [ + + dataset["allbytes"] = dataset["sbytes"] + dataset["dbytes"] + dataset["pkts"] = dataset["spkts"] + dataset["dpkts"] + + fields_to_convert_to_float = [ dataset.proto, dataset.dport, dataset.sport, @@ -210,10 +214,10 @@ def process_features(self, dataset): dataset.sbytes, dataset.state, ] - for field in fields_to_convert_to_flow: + for field in fields_to_convert_to_float: try: field = field.astype("float64") - except ValueError: + except (ValueError, AttributeError): pass return dataset @@ -222,9 +226,9 @@ def process_features(self, dataset): self.print("Error in process_features()") self.print(traceback.format_exc(), 0, 1) - def process_flows(self): + def process_training_flows(self): """ - Process all the flwos in the DB + Process all the flows in the DB Store the pandas df in self.flows """ try: @@ -240,44 +244,48 @@ def process_flows(self): # that are fake but representative of a normal and malware flow # they are only for the training process # At least 1 flow of each label is required - # self.print(f'Amount of labeled flows: {labels}', 0, 1) + + # These flows should be in the same format as the ones in the DB. + # Which means the satate is still SF, S0, etc. 
flows.append( { - "ts": 1594417039.029793, + "starttime": 1594417039.029793, "dur": "1.9424750804901123", "saddr": "10.7.10.101", "sport": "49733", "daddr": "40.70.224.145", "dport": "443", "proto": "tcp", - "state": "Established", - "allbytes": 42764, - "spkts": 37, + "state": "SF", + "spkts": 17, + "dpkts": 27, "sbytes": 25517, + "dbytes": 17247, "appproto": "ssl", - "label": "Malware", + "label": "Malicious", "module_labels": { - "flowalerts-long-connection": "Malware" + "flowalerts-long-connection": "Malicious" }, } ) flows.append( { - "ts": 1382355032.706468, + "starttime": 1382355032.706468, "dur": "10.896695", "saddr": "147.32.83.52", "sport": "47956", "daddr": "80.242.138.72", "dport": "80", "proto": "tcp", - "state": "Established", - "allbytes": 67696, + "state": "SF", "spkts": 1, + "dpkts": 0, "sbytes": 100, + "dbytes": 67596, "appproto": "http", - "label": "Normal", + "label": "Benign", "module_labels": { - "flowalerts-long-connection": "Normal" + "flowalerts-long-connection": "Benign" }, } ) @@ -318,7 +326,6 @@ def detect(self, x_flow) -> Optional[numpy.ndarray]: and returns the predection array """ try: - given_x_flow = x_flow # clean the flow fields_to_drop = [ "label", @@ -326,10 +333,7 @@ def detect(self, x_flow) -> Optional[numpy.ndarray]: "uid", "history", "dir_", - "dbytes", - "dpkts", "endtime", - "bytes", "flow_source", "ground_truth_label", # todo now we can use them "detailed_ground_truth_label", @@ -345,7 +349,7 @@ def detect(self, x_flow) -> Optional[numpy.ndarray]: return pred except Exception as e: self.print( - f"Error in detect() while processing " f"\n{given_x_flow}\n{e}" + f"Error in detect() while processing " f"\n{x_flow}\n{e}" ) self.print(traceback.format_exc(), 0, 1) @@ -437,18 +441,16 @@ def pre_main(self): def main(self): if msg := self.get_msg("new_flow"): + # When a new flow arrives msg = json.loads(msg["data"]) - twid = msg["twid"] + self.twid = msg["twid"] + self.profileid = msg["profileid"] self.flow = msg["flow"] - # these fields are expected in testing. update the original + # These following extra fields are expected in testing. update the original # flow dict to have them self.flow.update( { - "allbytes": (self.flow["sbytes"] + self.flow["dbytes"]), - # the flow["state"] is the origstate, we dont need that here - # we need the interpreted state "state": msg["interpreted_state"], - "pkts": self.flow["spkts"] + self.flow["dpkts"], "label": msg["label"], "module_labels": msg["module_labels"], } @@ -461,23 +463,31 @@ def main(self): # Use labeled flows labels = self.db.get_labels() sum_labeled_flows = sum(i[1] for i in labels) + + # The min labels to retrain is the min number of flows + # we should have seen so far in this capture to start training + # This is so we dont _start_ training with only 1 flow + + # Once we are over the start minimum, the second condition is + # to force to retrain every a minimum_labels_to_retrain number + # of flows. So we dont retrain every 1 flow. if ( - sum_labeled_flows >= self.minimum_lables_to_retrain - and sum_labeled_flows % self.minimum_lables_to_retrain == 1 + sum_labeled_flows >= self.minimum_labels_to_start_train ): - # We get here every 'self.minimum_lables_to_retrain' - # amount of labels - # So for example we retrain every 100 labels and only when - # we have at least 100 labels - self.print( - f"Training the model with the last group of " - f"flows and labels. Total flows: {sum_labeled_flows}." 
- ) - # Process all flows in the DB and make them ready - # for pandas - self.process_flows() - # Train an algorithm - self.train() + if (sum_labeled_flows - self.last_number_of_flows_when_trained >= self.minimum_labels_to_retrain): + # So for example we retrain every 50 labels and only when + # we have at least 50 labels + self.print( + f"Training the model with the last group of " + f"flows and labels. Total flows: {sum_labeled_flows}." + ) + # Process all flows in the DB and make them ready + # for pandas + self.process_training_flows() + # Train an algorithm + self.train() + self.last_number_of_flows_when_trained = sum_labeled_flows + elif self.mode == "test": # We are testing, which means using the model to detect processed_flow = self.process_flow(self.flow) @@ -497,8 +507,8 @@ def main(self): # and the label is diff from the prediction, # print in debug mode self.print( - f"Report Prediction {pred[0]} for label" - f' {label} flow {self.flow["saddr"]}:' + f"Predicted {pred[0]} for ground-truth label" + f' {label}. Flow {self.flow["saddr"]}:' f'{self.flow["sport"]} ->' f' {self.flow["daddr"]}:' f'{self.flow["dport"]}/' @@ -506,9 +516,9 @@ def main(self): 0, 3, ) - if pred[0] == "Malware": + if pred[0] == "Malicious": # Generate an alert - self.set_evidence_malicious_flow(self.flow, twid) + self.set_evidence_malicious_flow(self.flow, self.twid) self.print( f"Prediction {pred[0]} for label {label}" f' flow {self.flow["saddr"]}:' From 3faff9b5bd3aeb53c306324572e39e743f43272d Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Tue, 18 Mar 2025 12:08:29 +0100 Subject: [PATCH 300/498] Fix the profiler handler for cases of nan in state --- .../core/database/redis_db/profile_handler.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/slips_files/core/database/redis_db/profile_handler.py b/slips_files/core/database/redis_db/profile_handler.py index edbbf3a12..4d91b43a9 100644 --- a/slips_files/core/database/redis_db/profile_handler.py +++ b/slips_files/core/database/redis_db/profile_handler.py @@ -395,7 +395,12 @@ def get_final_state_from_flags(self, state, pkts): We receive the pakets to distinguish some Reset connections """ try: - pre = state.split("_")[0] + # In some flows the state is a nan + try: + pre = state.split("_")[0] + except AttributeError: + pre = '' + try: # Try suricata states """ @@ -417,7 +422,11 @@ def get_final_state_from_flags(self, state, pkts): return "Established" # For Argus - suf = state.split("_")[1] + # In some flows the state is a nan + try: + suf = state.split("_")[1] + except AttributeError: + suf = '' if "S" in pre and "A" in pre and "S" in suf and "A" in suf: """ Examples: @@ -518,7 +527,7 @@ def get_final_state_from_flags(self, state, pkts): except Exception: exception_line = sys.exc_info()[2].tb_lineno self.print( - f"Error in getFinalStateFromFlags() in database.py line {exception_line}", + f"Error in get_final_state_from_flags() in profile_handler.py line {exception_line}", 0, 1, ) From 2e0603b2c8e0adb327bf5249a30d2894a7d02adb Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Wed, 19 Mar 2025 14:20:42 +0100 Subject: [PATCH 301/498] slips.yaml. Update to have correct labels. By default test. 
 Default training label is Benign

---
 config/slips.yaml | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/config/slips.yaml b/config/slips.yaml
index 02adc7f1b..1b73e7b54 100644
--- a/config/slips.yaml
+++ b/config/slips.yaml
@@ -105,13 +105,12 @@ parameters:
   deletePrevdb: true

   # Set the label for all the flows that are being read.
-  # For now only normal and malware directly. No option for setting labels
-  # with a filter
+  # For now only Benign and Malicious (Capitalized)
   # The purpose is to be used in the training of ML models and to output
   # flows with labels for other tools.
-  # label: malicious
-  # label: unknown
-  label: normal
+  # label: Malicious
+  # label: Benign
+  label: Benign
   # If Zeek files are rotated or not to avoid running out of disk.
   # Zeek rotation is enabled by default when using an interface,
   # which means Slips will delete all Zeek log files after 1 day

From 6f2e3c3be24352300ad435be5734a92cb917ab52 Mon Sep 17 00:00:00 2001
From: Sebas Garcia
Date: Wed, 19 Mar 2025 14:21:21 +0100
Subject: [PATCH 302/498] First ipython to test ML flow related models

---
 modules/flowmldetection/flowmlanalysis.ipynb | 76 ++++++++++++++++++++
 1 file changed, 76 insertions(+)
 create mode 100644 modules/flowmldetection/flowmlanalysis.ipynb

diff --git a/modules/flowmldetection/flowmlanalysis.ipynb b/modules/flowmldetection/flowmlanalysis.ipynb
new file mode 100644
index 000000000..d726cd280
--- /dev/null
+++ b/modules/flowmldetection/flowmlanalysis.ipynb
@@ -0,0 +1,76 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Analysis of Flows with Machine Learning for Slips"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Analysis of a fixed list of flows to try techniques and find parameters"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import numpy\n",
+    "from sklearn.linear_model import SGDClassifier\n",
+    "from sklearn.preprocessing import StandardScaler\n",
+    "import pickle\n",
+    "import pandas as pd\n",
+    "import json\n",
+    "import traceback\n",
+    "import warnings"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "slips-new",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.12"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
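The notebook added in PATCH 302 is still an empty scaffold. As a rough sketch of the kind of experiment it is meant to host (illustrative only, not part of any patch: the CSV file name and its columns are hypothetical), the module's scale-then-partial_fit pipeline can be reproduced offline like this:

    import pandas as pd
    from sklearn.linear_model import SGDClassifier
    from sklearn.preprocessing import StandardScaler

    # Hypothetical export of already-processed flows:
    # numeric features plus a "label" column
    flows = pd.read_csv("labeled_flows.csv")
    X = flows.drop("label", axis=1)
    y = flows["label"]

    # Mirror the module's training path: normalize the batch,
    # then train incrementally
    scaler = StandardScaler()
    X = scaler.fit_transform(X)
    clf = SGDClassifier()
    clf.partial_fit(X, y, classes=["Malicious", "Benign"])

    # Training accuracy, only as a quick sanity check of the fit
    print(clf.score(X, y))

The class names match the labels the surrounding patches standardize on ("Malicious"/"Benign", capitalized, as configured in slips.yaml above).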
From 9a91a801f64855f3d9dbb64a013160e7ebc97d2d Mon Sep 17 00:00:00 2001
From: Sebas Garcia
Date: Wed, 19 Mar 2025 14:22:38 +0100
Subject: [PATCH 303/498] flowml. If the dataset has one flow and that is
 deleted, then return empty fast.

---
 modules/flowmldetection/flowmldetection.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index e91495d64..58b4ce1e4 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -135,6 +135,11 @@ def process_features(self, dataset):
         for proto in to_discard:
             dataset = dataset[dataset.proto != proto]

+        # If te proto is in the list to delete and there is only one flow, then the dataset will be empty
+        if dataset.empty:
+            # DataFrame is empty now, so return empty
+            return dataset
+
         # For now, discard these
         to_drop = [
             "appproto",

From b7c55c1fb89e829950ff3f1e4075135f92eb0f8d Mon Sep 17 00:00:00 2001
From: Sebas Garcia
Date: Wed, 19 Mar 2025 14:23:05 +0100
Subject: [PATCH 304/498] flowml. If the dataset is empty, return None

---
 modules/flowmldetection/flowmldetection.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 58b4ce1e4..4a4d46e37 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -318,6 +318,8 @@ def process_flow(self, flow_to_process: dict):
             # Convert the flow to a pandas dataframe
             raw_flow = pd.DataFrame(flow_to_process, index=[0])
             dflow = self.process_features(raw_flow)
+            if dflow.empty:
+                return None
             # Update the flow to the processed version
             return dflow
         except Exception:

From 1336ced589060f2382bfdcc41b883aab7cff2530 Mon Sep 17 00:00:00 2001
From: Sebas Garcia
Date: Wed, 19 Mar 2025 14:26:42 +0100
Subject: [PATCH 305/498] profile_handler. Small bug in how we handled the
 states: we were using 'in' instead of '==' for 'established', so some
 not-established flows MAY not have been correctly captured

---
 slips_files/core/database/redis_db/profile_handler.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/slips_files/core/database/redis_db/profile_handler.py b/slips_files/core/database/redis_db/profile_handler.py
index 4d91b43a9..a6669c92a 100644
--- a/slips_files/core/database/redis_db/profile_handler.py
+++ b/slips_files/core/database/redis_db/profile_handler.py
@@ -409,9 +409,10 @@ def get_final_state_from_flags(self, state, pkts):
                 these are: New, Established and Closed,for UDP only new and established.
                 For each of these states Suricata can employ different timeouts.
                 """
-                if "new" in state or "established" in state:
+                # This is controversial, but if we dont have a good state, we consider it not established for now
+                if "new" in state or state.lower() == "established":
                     return "Established"
-                elif "closed" in state:
+                elif "closed" in state or state.lower() == 'not established':
                     return "Not Established"

                 # We have varius type of states depending on the type of flow.
@@ -422,7 +423,6 @@ def get_final_state_from_flags(self, state, pkts):
                 return "Established"

             # For Argus
-            # In some flows the state is a nan
             try:
                 suf = state.split("_")[1]
             except AttributeError:

From 9dc77cd61c1b6431af32903d5003111405945ff3 Mon Sep 17 00:00:00 2001
From: Sebas Garcia
Date: Wed, 19 Mar 2025 14:27:16 +0100
Subject: [PATCH 306/498] First new version of the model and scaler. Not good
 yet, but working.
--- modules/flowmldetection/model.bin | Bin 1073 -> 1090 bytes modules/flowmldetection/scaler.bin | Bin 666 -> 890 bytes 2 files changed, 0 insertions(+), 0 deletions(-) diff --git a/modules/flowmldetection/model.bin b/modules/flowmldetection/model.bin index aef4cba35b7b18287b2be11df2c45e9187d053e0..7ea3d27c97ca27abe7bcef4f35f2057da4bf2b0f 100644 GIT binary patch delta 411 zcmdnUafpMZfn}=sMwVTSLVS7gX{m`NrA4X5@tJw?Q@pt+?`4dsSMFg=DXA<-oice! z4^yb+l+KR!DM3>-yqS9#ZKnA7`FX2&vv_lqOiAi=_HbfgU`PexJRr^l;&dn*NWpQXRJ|Emr(~!@tY-9P0&;YE zn9?i|&dD&Ce2~eg9z!XkH+S2V4C_wkh?WyPJF_Dc9WKfLlU4d@b|Cws{>t}{)DLw1 z+&}AzmCAv&TP(zH?>}O{hvWaISvjr_FKvFL2{5=iNW7SLz3Rc`{Vb{$-4P-p4qe>8 zsxB_naJaLsc#$rfl>^wdT$y<#sYS_psRbqRQ!+4J0dz3X4IB|u^h2FOBOrSC<0o%q b_GNYGp25H{nVChIi`7uiOwVviX_6iQocV_5 delta 380 zcmX@av5|wNfn}=NMwVTSlP@wxCo1)@rj%3`q)wSUrH3ihd`f3W`;?$58s5x3j5bsJ z{QSISyji?CN~R=rI$QWMFfb$naSjldL)k?@Iu(fhfH)tBLHt}GhN)wiJeNsPO)*2o zo1t||h8o0hMsFq{N4tk9%>rRzhW_LyOh)w>N*TR5+ps7Gy3Ol95P%792Cxe~W+qIF za^V+n06G8#1cs%Z;j-Fjug&$3=gY2%`}>OzXw^50I4Ib2ygQ=8?eISD+{&-(1rNNN zW*@ diff --git a/modules/flowmldetection/scaler.bin b/modules/flowmldetection/scaler.bin index 9292bda6a6f307edcbe83de0ccfac2437950fa6f..bfba4d107224e5e6e5a1e8c8f4d463b48131d111 100644 GIT binary patch literal 890 zcmZo*nVQec00uqM#o0NjiA8yO1x2X^Mfu68#l@L<>3Z=gi6x0sdia7%67y0Li&BD< z6LV6Frc9oq**K+kibfA-d1gsQd~RxD-jt3W4lt*|B)5DrlQdy7+(!%6#F{QHuBFo&v zSTn`X&(G^W5P%79`;sY1ov}cNGpCdm0WAP}wIm-%7Z>Cgl>q4!C|z8TT~Z9@gK4(n zq{@<1pa5G6l)+kDl30>DrH3OiCkHCT0@jlXw4sMDFCOY|h?l*&d-(I>ixYDTa)5zQ zoSF&@h78Ugwqjs#CW3-9L$rq}6UF5=Xf9^}n(NGw#R!S7DQ!Ui_ppKj9%vGG1|KL~ zG6axA1!!U#l8G7ms38F|uhTi=d{gv8|4E7tk+E%iFU@~!UvXb5a&PSo2flmT-L8E; z8bB?zwe6@e~Ltx1rmTHK2M45x=S>Y_k5fcAZRq4P``?^7;27=%1`Z_~Q$U{zGVUTcbp z!_f@A=ox0(4vo(kWe&dF=y3Qof57>#YaOosUa>d6=$d`i{)GMC=j?EBVaVAqgWVBg dy+AxPN5z+=78Pga=K-V3P|r-ya7t;C9st`jc!2-_ delta 525 zcmeyxHj9S2q>4Eng}#5gR6%pz9=<0Kd-o?s5H3*s5pan;tH91&ekaz{5?!*7E?Mq z+JPE0yqPnE{rvpA{sRG+@Mb8PlGN$!ajPlrTH5mhSAF{H=RwvmLf> z0f8s>T))|-e^~LtKJ)mxuX}VF9GJI$RyS%Ec8u45-?98hw!_K^*XJ>X{BW28G_HrG zEU^gaDyjMm8E*!l6XYOvGkSyEqXM*T;RVCs$??LDiHthAxexX@M(tU$UF%YfQ^>gH&y>a4SnPazubB5!cqc1YnCGT))m9-o7^A!Lv41W!_hYO<*VziI$XJV#W|*OtHau9f5TQzy=Sjs!`l%1;+FkJ z-k|Geeq3|-Iw{uqQ%0WS6}i^>);WhA{C@WaM)rg_g6$KCFV4d&dkpP QMw@}2k%^wsl+q+U0P2|NZU6uP From 12e3d93823589e3314325b158b8becc66e8d5d21 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Thu, 20 Mar 2025 13:16:06 +0100 Subject: [PATCH 307/498] model and scaler with 1 malicious and 1 benign --- modules/flowmldetection/model.bin | Bin 1090 -> 1124 bytes modules/flowmldetection/scaler.bin | Bin 890 -> 890 bytes 2 files changed, 0 insertions(+), 0 deletions(-) diff --git a/modules/flowmldetection/model.bin b/modules/flowmldetection/model.bin index 7ea3d27c97ca27abe7bcef4f35f2057da4bf2b0f..0fac693b39f8e2f0e826471e72a52010709a2a4a 100644 GIT binary patch delta 132 zcmX@a@q~k=fn{psMix!x$(NZ_BO`RCb>5jSYK$N!g2|oC+8oRs(l>gtTsw(FScJ$7xocP4|5 zn$$WZz^$}67u&zYKfDP4vYKo~KfmcQ~I?m(e2&@mlG zqd?6RN*?ZoGC;4up#2u+D?s5YufU*hus|2)Nzwn}vOwP`V1tl2?mv8p{76u#!9bEa zjm~^LK$BYnCRf}Ju^9H~BS8=X&<+7gXFor!l)9lJXF)UUPqWf%b3r!tJEWTtnm`Ac zf^|*Gc0elJHxDoXoPgl=^Tj5WVAwXGOhP~3C<3D)c8d<%4*gy{6w?@|U z%s@r+gj&Fu5J2_kY{!Qn^*`?T5Tx7$dq73=gj&E@c0d))6KVlt!ao`IN`KC-h(JOu zU`*oR6_Z;6CRfo>BRz3ectB?@J_D&+&OhD+nN>A{w?PryCGc Date: Thu, 20 Mar 2025 13:16:27 +0100 Subject: [PATCH 308/498] cleaner jupyter --- modules/flowmldetection/flowmldetection.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/modules/flowmldetection/flowmldetection.py 
b/modules/flowmldetection/flowmldetection.py index 4a4d46e37..d8e9ada27 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -345,6 +345,23 @@ def detect(self, x_flow) -> Optional[numpy.ndarray]: "ground_truth_label", # todo now we can use them "detailed_ground_truth_label", ] + # For argus binetflows this fails because ther is a field calle bytes that was not in other flows. It should be called allbytes. + # Error + ''' [Flow ML Detection] Error in detect() while processing + dur proto sport dport state pkts spkts dpkts bytes sbytes dbytes allbytes + 0 63.822830 0 56119 981 0.0 15 15 0 8764 1887 0 1887 + The feature names should match those that were passed during fit. + Feature names unseen at fit time: + - bytes + ''' + + # IF we delete here the filed bytes the error is + # [Flow ML Detection] Error in detect() while processing + # dur proto sport dport state pkts spkts dpkts sbytes dbytes allbytes + # 0 63.822830 0 56120 980 0.0 15 15 0 1887 0 1887 + # The feature names should match those that were passed during fit. + # Feature names must be in the same order as they were in fit. + for field in fields_to_drop: try: x_flow = x_flow.drop(field, axis=1) From 83a9128ed9c44fbf9d55c05523a627a97bd60766 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Thu, 20 Mar 2025 22:26:27 +0100 Subject: [PATCH 309/498] New models after 3rd ttrain --- modules/flowmldetection/model.bin | Bin 1124 -> 1124 bytes modules/flowmldetection/scaler.bin | Bin 890 -> 890 bytes 2 files changed, 0 insertions(+), 0 deletions(-) diff --git a/modules/flowmldetection/model.bin b/modules/flowmldetection/model.bin index 0fac693b39f8e2f0e826471e72a52010709a2a4a..5ebcb37205a7a3e8d0a20ab78a1219f0ddf95dd8 100644 GIT binary patch delta 99 zcmaFD@q}YTFtfkevYaFSxkd-_3;rCozh!)2lYmpEbEdGvA?N`Qv**dmWkY#6EnbNU>V$pfHc5?(6z~90v14#>uEKZXcoY2J@~Q; zm@vT916v-#h9y9THMMtpw*Eia3Y9($*ABq#7l@Rev)(_(4lO-NtuB+I1CayN#ekDG F1TN2UERg^J diff --git a/modules/flowmldetection/scaler.bin b/modules/flowmldetection/scaler.bin index 758909b289238ff282b2e056a9b3e83768b8472a..821344a0c69d116622b02e2a0daa1554cb5d308e 100644 GIT binary patch delta 43 zcmV+`0M!5b2KolDfdU!c4zU>a=p#W80?-ZtN@qVmt(3Za=p#W80?-ZtN@qVmt(3Z Date: Wed, 26 Mar 2025 00:08:50 +0100 Subject: [PATCH 310/498] Models after 4th train --- modules/flowmldetection/model.bin | Bin 1124 -> 1124 bytes modules/flowmldetection/scaler.bin | Bin 890 -> 890 bytes 2 files changed, 0 insertions(+), 0 deletions(-) diff --git a/modules/flowmldetection/model.bin b/modules/flowmldetection/model.bin index 5ebcb37205a7a3e8d0a20ab78a1219f0ddf95dd8..3ab5a240bb45f88d026d1d9d1959cfa384e2473b 100644 GIT binary patch delta 120 zcmV-;0EhqN2;>N`Qv+ChN@}Q2`wPIHjIJ0X6&b*sXgPyxz!<>7-o1V`boIY<&cGv* zMA;T^cz6USY()1i3vc*(ICW>9REM~el3OL8yLVz+8BZ7+Y~^P as{<PghU~{@&OfB0!%%<|y61;y>t*IZ8K{*+AXCvU*qB z%0Pe2QM!F)5kR-EtllLL_CF>M2ApwSd_Z`~g8^q;cR(Gb+gDca!aq;!#7^woi9l`m z{4q!A7L!{7CRc2&pGU{qctGGXl*-2r&p+4h*(lZXw?Qu&=iylTwLyoc9%Oz?d_V=A za=mcAXh1m7B9yUmbwL&Oq6=*NZ9yf7dx}OPz(0XAuDu67ghArqkBr`9lats2C|9(o zHuM(pGeE6wqA`7j-#@s^-O-hk(?CdW8YGjz(m<`3N;vgKH9+Ch9fX%7EI^{)^od!9 ovOtBo|Dy%^u0ZgT2LsNy);|K#Ksj}DxjML6cVlBO2|D9RL6T delta 290 zcmV+-0p0%k2KolDvjGBo1(US_Pgl4iHtM!%U_hWRf%FVpXFo>fmcQ~I?m(e2&@mlG zqd?6RN*?ZoGC;4up#2u+D?s5YufU*hus|2)Nzwn}vOwP`V1tl2?mv8p{76u#!9bEa zjm~^LK$BYnCRg4Lu^9H~BS8=X&<+7gXFor!l)9lJXF)RTPqWf%b3r!tJEWTtnm`Ac zf^|*Gc0elJHxDoXoty4FCWD From 71b93a508e1d8d625fb51ae4a698360044f2af34 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Wed, 26 Mar 2025 08:28:59 +0100 Subject: 
[PATCH 311/498] Models of ml flow with the first good performance in small tests --- modules/flowmldetection/model.bin | Bin 1124 -> 1124 bytes modules/flowmldetection/scaler.bin | Bin 890 -> 890 bytes 2 files changed, 0 insertions(+), 0 deletions(-) diff --git a/modules/flowmldetection/model.bin b/modules/flowmldetection/model.bin index 3ab5a240bb45f88d026d1d9d1959cfa384e2473b..a6648cf72179520975b0e9ad1164f7d574e87140 100644 GIT binary patch delta 121 zcmV-<0EYkM2;>N`Qv+C~&P*9hb`!w*mV~|wLl?l5mFM4w$NRqlOAcXmpyfaAG1(); zYA3+ulRv$a;~zkIU#E>ocI-ba#L|>%Hv~ZN`Y?b6^limYW1McwvlQsk{8#y@u delta 121 zcmV-<0EYkM2;>N`Qv+ChN@}Q2`wPIHjIJ0X6&b*sXgPyxz!<>7-o1V`boIY<&cGv* zMA;T^cz6USY()1i3vc*(ICW>9REM~el3OL8yLVz+8BZ7+Y~^P bs{<fmcQ~I?m(e2&@mlG zqd?6RN*?ZoGC;4up#2u+D?s5YufU*hus|2)Nzwn}vOwP`V1tl2?mv8p{76u#!9bEa zjm~^LK$BYnCRf}Ju^9H~BS8=X&<+7gXFor!l)9lJXF)UUPqWf%b3r-wJEWTtnm`Ac zf^|*Gc0elJHxDoXoPghU~{@&OfB0!%%<|y61;y>t*IZ8K{*+AXCvU*qB z%0Pe2QM!F)5kR-EtllLL_CF>M2ApwSd_Z`~g8^q;cR(Gb+gDca!aq;!#7^woi9l`m z{4q!A7L!{7CRc2&pGU{qctGGXl*-2r&p+4h*(lZXw?Qu&=iylTwLyoc9%Oz?d_V=A za=mcAXh1m7B9yUmbwL&Oq6=*NZ9yf7dx}OPz(0XAuDu67ghArqkBr`9lats2C|9(o zHuM(pGeE6wqA`7j-#@s^-O-hk(?CdW8YGjz(m<`3N;vgKH9+Ch9fX%7EI^{)^od!9 ovOtBo|Dy%^u0ZgT2LsNy);|K#Ksj}DxjML6cVlBO2|D9RL6T From 2c70aa760e24cc16268efd553a3f94747b12a15e Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Mon, 29 Jul 2024 16:32:14 +0200 Subject: [PATCH 312/498] mlflow. Add a function to convert the state again --- modules/flowmldetection/flowmldetection.py | 317 +++++++++++++-------- 1 file changed, 206 insertions(+), 111 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index d8e9ada27..8917fef6a 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -10,6 +10,7 @@ import json import traceback import warnings +import sys from slips_files.common.parsers.config_parser import ConfigParser from slips_files.common.slips_utils import utils @@ -55,12 +56,8 @@ def init(self): # Set the output queue of our database instance # Read the configuration self.read_configuration() - # Minum amount of new labels needed to start the train - self.minimum_labels_to_start_train = 50 - # Minum amount of new labels needed to retrain - self.minimum_labels_to_retrain = 50 - # The number of flows when last trained - self.last_number_of_flows_when_trained = 0 + # Minum amount of new lables needed to trigger the train + self.minimum_lables_to_retrain = 50 # To plot the scores of training # self.scores = [] # The scaler trained during training and to use during testing @@ -71,25 +68,26 @@ def init(self): def read_configuration(self): conf = ConfigParser() self.mode = conf.get_ml_mode() - self.label = conf.label() def train(self): """ Train a model based on the flows we receive and the labels """ try: - # Get the flows from the DB - # self.flows = self.db.get_all_flows_in_profileid_twid(self.profileid, self.twid) - # Convert to pandas df - # self.flows = pd.DataFrame(self.flows) - # Process the features - # X_flow = self.process_features(self.flows) + # Process the labels to have only Normal and Malware + self.flows.label = self.flows.label.str.replace( + r"(^.*ormal.*$)", "Normal", regex=True + ) + self.flows.label = self.flows.label.str.replace( + r"(^.*alware.*$)", "Malware", regex=True + ) + self.flows.label = self.flows.label.str.replace( + r"(^.*alicious.*$)", "Malware", regex=True + ) - # Create X_flow with the current flows minus the label + # Separate + y_flow = 
self.flows["label"] X_flow = self.flows.drop("label", axis=1) - # Create y_flow with the label - y_flow = numpy.full(X_flow.shape[0], self.label) - # Drop the module_labels X_flow = X_flow.drop("module_labels", axis=1) # Normalize this batch of data so far. This can get progressivle slow @@ -98,7 +96,7 @@ def train(self): # Train try: self.clf.partial_fit( - X_flow, y_flow, classes=["Malicious", "Benign"] + X_flow, y_flow, classes=["Malware", "Normal"] ) except Exception: self.print("Error while calling clf.train()") @@ -121,7 +119,142 @@ def train(self): self.store_model() except Exception: - self.print("Error in train().", 0, 1) + self.print("Error in train()", 0, 1) + self.print(traceback.format_exc(), 0, 1) + + def get_final_state_from_flags(self, state, pkts): + """ + Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags + We receive the pakets to distinguish some Reset connections + """ + try: + pre = state.split("_")[0] + try: + # Try suricata states + """ + There are different states in which a flow can be. + Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, + these are: New, Established and Closed,for UDP only new and established. + For each of these states Suricata can employ different timeouts. + """ + if "new" in state or "established" in state: + return "Established" + elif "closed" in state: + return "Not Established" + + # We have varius type of states depending on the type of flow. + # For Zeek + if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): + return "Not Established" + elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): + return "Established" + + # For Argus + suf = state.split("_")[1] + if "S" in pre and "A" in pre and "S" in suf and "A" in suf: + """ + Examples: + SA_SA + SR_SA + FSRA_SA + SPA_SPA + SRA_SPA + FSA_FSA + FSA_FSPA + SAEC_SPA + SRPA_SPA + FSPA_SPA + FSRPA_SPA + FSPA_FSPA + FSRA_FSPA + SRAEC_SPA + FSPA_FSRPA + FSAEC_FSPA + FSRPA_FSPA + SRPAEC_SPA + FSPAEC_FSPA + SRPAEC_FSRPA + """ + return "Established" + elif "PA" in pre and "PA" in suf: + # Tipical flow that was reported in the middle + """ + Examples: + PA_PA + FPA_FPA + """ + return "Established" + elif "ECO" in pre: + return "ICMP Echo" + elif "ECR" in pre: + return "ICMP Reply" + elif "URH" in pre: + return "ICMP Host Unreachable" + elif "URP" in pre: + return "ICMP Port Unreachable" + else: + """ + Examples: + S_RA + S_R + A_R + S_SA + SR_SA + FA_FA + SR_RA + SEC_RA + """ + return "Not Established" + except IndexError: + # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP + if "ECO" in pre: + # ICMP + return "Established" + elif "UNK" in pre: + # ICMP6 unknown upper layer + return "Established" + elif "CON" in pre: + # UDP + return "Established" + elif "INT" in pre: + # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there + # is no confirmation of what happened. + return "Not Established" + elif "EST" in pre: + # TCP + return "Established" + elif "RST" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established + # It can happen that is reseted being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. + return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + elif "FIN" in pre: + # TCP. 
When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established + # It can happen that is finished being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. + return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + else: + """ + Examples: + S_ + FA_ + PA_ + FSA_ + SEC_ + SRPA_ + """ + return "Not Established" + except Exception: + exception_line = sys.exc_info()[2].tb_lineno + self.print( + f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", + 0, + 1, + ) self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): @@ -135,11 +268,6 @@ def process_features(self, dataset): for proto in to_discard: dataset = dataset[dataset.proto != proto] - # If te proto is in the list to delete and there is only one flow, then the dataset will be empty - if dataset.empty: - # DataFrame is empty now, so return empty - return dataset - # For now, discard these to_drop = [ "appproto", @@ -152,7 +280,9 @@ def process_features(self, dataset): "history", "uid", "dir_", + "dbytes", "endtime", + "bytes", "flow_source", ] for field in to_drop: @@ -161,16 +291,12 @@ def process_features(self, dataset): except (ValueError, KeyError): pass - # When flows are read from Slips sqlite, - # the state is not transformed to 'Established' or - # 'Not Established', it is still 'S0' and others + # When flows are read from Slips sqlite, the state is not transformed to 'Established' or 'Not Established', it is still 'S0' and others # So transform here - dataset["state"] = dataset.apply( - lambda row: self.db.get_final_state_from_flags( - row["state"], (row["spkts"] + row["dpkts"]) - ), - axis=1, - ) + #new_state_column = dataset.state.apply(self.get_final_state_from_flags(dataset.state, dataset.pkts)) + dataset['state'] = dataset.apply(lambda row: self.get_final_state_from_flags(row['state'], row['pkts']), axis=1) + + #dataset.state = new_state_column # Convert state to categorical dataset.state = dataset.state.str.replace( @@ -204,11 +330,7 @@ def process_features(self, dataset): dataset.proto = dataset.proto.str.replace( r"(^.*arp.*$)", "4", regex=True ) - - dataset["allbytes"] = dataset["sbytes"] + dataset["dbytes"] - dataset["pkts"] = dataset["spkts"] + dataset["dpkts"] - - fields_to_convert_to_float = [ + fields_to_convert_to_flow = [ dataset.proto, dataset.dport, dataset.sport, @@ -219,10 +341,10 @@ def process_features(self, dataset): dataset.sbytes, dataset.state, ] - for field in fields_to_convert_to_float: + for field in fields_to_convert_to_flow: try: field = field.astype("float64") - except (ValueError, AttributeError): + except ValueError: pass return dataset @@ -231,9 +353,9 @@ def process_features(self, dataset): self.print("Error in process_features()") self.print(traceback.format_exc(), 0, 1) - def process_training_flows(self): + def process_flows(self): """ - Process all the flows in the DB + Process all the flwos in the DB Store the pandas df in self.flows """ try: @@ -249,48 +371,44 @@ def process_training_flows(self): # that are fake but representative of a normal and malware flow # they are only for the training process # At least 1 flow of each label is required - - # These flows should be in the same format as the ones in the DB. - # Which means the satate is still SF, S0, etc. 
+ # self.print(f'Amount of labeled flows: {labels}', 0, 1) flows.append( { - "starttime": 1594417039.029793, + "ts": 1594417039.029793, "dur": "1.9424750804901123", "saddr": "10.7.10.101", "sport": "49733", "daddr": "40.70.224.145", "dport": "443", "proto": "tcp", - "state": "SF", - "spkts": 17, - "dpkts": 27, + "state": "Established", + "allbytes": 42764, + "spkts": 37, "sbytes": 25517, - "dbytes": 17247, "appproto": "ssl", - "label": "Malicious", + "label": "Malware", "module_labels": { - "flowalerts-long-connection": "Malicious" + "flowalerts-long-connection": "Malware" }, } ) flows.append( { - "starttime": 1382355032.706468, + "ts": 1382355032.706468, "dur": "10.896695", "saddr": "147.32.83.52", "sport": "47956", "daddr": "80.242.138.72", "dport": "80", "proto": "tcp", - "state": "SF", + "state": "Established", + "allbytes": 67696, "spkts": 1, - "dpkts": 0, "sbytes": 100, - "dbytes": 67596, "appproto": "http", - "label": "Benign", + "label": "Normal", "module_labels": { - "flowalerts-long-connection": "Benign" + "flowalerts-long-connection": "Normal" }, } ) @@ -318,8 +436,6 @@ def process_flow(self, flow_to_process: dict): # Convert the flow to a pandas dataframe raw_flow = pd.DataFrame(flow_to_process, index=[0]) dflow = self.process_features(raw_flow) - if dflow.empty: - return None # Update the flow to the processed version return dflow except Exception: @@ -333,6 +449,7 @@ def detect(self, x_flow) -> Optional[numpy.ndarray]: and returns the predection array """ try: + given_x_flow = x_flow # clean the flow fields_to_drop = [ "label", @@ -340,28 +457,12 @@ def detect(self, x_flow) -> Optional[numpy.ndarray]: "uid", "history", "dir_", + "dbytes", + "dpkts", "endtime", + "bytes", "flow_source", - "ground_truth_label", # todo now we can use them - "detailed_ground_truth_label", ] - # For argus binetflows this fails because ther is a field calle bytes that was not in other flows. It should be called allbytes. - # Error - ''' [Flow ML Detection] Error in detect() while processing - dur proto sport dport state pkts spkts dpkts bytes sbytes dbytes allbytes - 0 63.822830 0 56119 981 0.0 15 15 0 8764 1887 0 1887 - The feature names should match those that were passed during fit. - Feature names unseen at fit time: - - bytes - ''' - - # IF we delete here the filed bytes the error is - # [Flow ML Detection] Error in detect() while processing - # dur proto sport dport state pkts spkts dpkts sbytes dbytes allbytes - # 0 63.822830 0 56120 980 0.0 15 15 0 1887 0 1887 - # The feature names should match those that were passed during fit. - # Feature names must be in the same order as they were in fit. - for field in fields_to_drop: try: x_flow = x_flow.drop(field, axis=1) @@ -373,7 +474,7 @@ def detect(self, x_flow) -> Optional[numpy.ndarray]: return pred except Exception as e: self.print( - f"Error in detect() while processing " f"\n{x_flow}\n{e}" + f"Error in detect() while processing " f"\n{given_x_flow}\n{e}" ) self.print(traceback.format_exc(), 0, 1) @@ -465,16 +566,18 @@ def pre_main(self): def main(self): if msg := self.get_msg("new_flow"): - # When a new flow arrives msg = json.loads(msg["data"]) - self.twid = msg["twid"] - self.profileid = msg["profileid"] + twid = msg["twid"] self.flow = msg["flow"] - # These following extra fields are expected in testing. update the original + # these fields are expected in testing. 
update the original # flow dict to have them self.flow.update( { + "allbytes": (self.flow["sbytes"] + self.flow["dbytes"]), + # the flow["state"] is the origstate, we dont need that here + # we need the interpreted state "state": msg["interpreted_state"], + "pkts": self.flow["spkts"] + self.flow["dpkts"], "label": msg["label"], "module_labels": msg["module_labels"], } @@ -487,31 +590,23 @@ def main(self): # Use labeled flows labels = self.db.get_labels() sum_labeled_flows = sum(i[1] for i in labels) - - # The min labels to retrain is the min number of flows - # we should have seen so far in this capture to start training - # This is so we dont _start_ training with only 1 flow - - # Once we are over the start minimum, the second condition is - # to force to retrain every a minimum_labels_to_retrain number - # of flows. So we dont retrain every 1 flow. if ( - sum_labeled_flows >= self.minimum_labels_to_start_train + sum_labeled_flows >= self.minimum_lables_to_retrain + and sum_labeled_flows % self.minimum_lables_to_retrain == 1 ): - if (sum_labeled_flows - self.last_number_of_flows_when_trained >= self.minimum_labels_to_retrain): - # So for example we retrain every 50 labels and only when - # we have at least 50 labels - self.print( - f"Training the model with the last group of " - f"flows and labels. Total flows: {sum_labeled_flows}." - ) - # Process all flows in the DB and make them ready - # for pandas - self.process_training_flows() - # Train an algorithm - self.train() - self.last_number_of_flows_when_trained = sum_labeled_flows - + # We get here every 'self.minimum_lables_to_retrain' + # amount of labels + # So for example we retrain every 100 labels and only when + # we have at least 100 labels + self.print( + f"Training the model with the last group of " + f"flows and labels. Total flows: {sum_labeled_flows}." + ) + # Process all flows in the DB and make them ready + # for pandas + self.process_flows() + # Train an algorithm + self.train() elif self.mode == "test": # We are testing, which means using the model to detect processed_flow = self.process_flow(self.flow) @@ -531,8 +626,8 @@ def main(self): # and the label is diff from the prediction, # print in debug mode self.print( - f"Predicted {pred[0]} for ground-truth label" - f' {label}. Flow {self.flow["saddr"]}:' + f"Report Prediction {pred[0]} for label" + f' {label} flow {self.flow["saddr"]}:' f'{self.flow["sport"]} ->' f' {self.flow["daddr"]}:' f'{self.flow["dport"]}/' @@ -540,9 +635,9 @@ def main(self): 0, 3, ) - if pred[0] == "Malicious": + if pred[0] == "Malware": # Generate an alert - self.set_evidence_malicious_flow(self.flow, self.twid) + self.set_evidence_malicious_flow(self.flow, twid) self.print( f"Prediction {pred[0]} for label {label}" f' flow {self.flow["saddr"]}:' From e04e6c61fe8584afe0247f8b21fe2b865cdafe71 Mon Sep 17 00:00:00 2001 From: alya Date: Tue, 30 Jul 2024 14:35:21 +0300 Subject: [PATCH 313/498] state_handler: split get_final_state_from_flags() into smaller functions --- slips_files/common/state_handler.py | 179 ++++++++++++++++++++++++++++ 1 file changed, 179 insertions(+) create mode 100644 slips_files/common/state_handler.py diff --git a/slips_files/common/state_handler.py b/slips_files/common/state_handler.py new file mode 100644 index 000000000..b671a09a2 --- /dev/null +++ b/slips_files/common/state_handler.py @@ -0,0 +1,179 @@ +from typing import Optional +import sys +import traceback + + +def check_suricata_states(state) -> Optional[str]: + """ + There are different states in which a flow can be. 
+ Suricata distinguishes three flow-states for TCP and two for + UDP. For TCP, + these are: New, Established and Closed,for UDP only new and + established. + For each of these states Suricata can employ different timeouts. + """ + if "new" in state or "established" in state: + return "Established" + elif "closed" in state: + return "Not Established" + + +def check_zeek_states(state) -> Optional[str]: + # We have varius type of states depending on the type of flow. + # For Zeek + if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): + return "Not Established" + elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): + return "Established" + + +def check_argus_states(state) -> Optional[str]: + pre = state.split("_")[0] + suf = state.split("_")[1] + if "S" in pre and "A" in pre and "S" in suf and "A" in suf: + """ + Examples: + SA_SA + SR_SA + FSRA_SA + SPA_SPA + SRA_SPA + FSA_FSA + FSA_FSPA + SAEC_SPA + SRPA_SPA + FSPA_SPA + FSRPA_SPA + FSPA_FSPA + FSRA_FSPA + SRAEC_SPA + FSPA_FSRPA + FSAEC_FSPA + FSRPA_FSPA + SRPAEC_SPA + FSPAEC_FSPA + SRPAEC_FSRPA + """ + return "Established" + elif "PA" in pre and "PA" in suf: + # Tipical flow that was reported in the middle + """ + Examples: + PA_PA + FPA_FPA + """ + return "Established" + elif "ECO" in pre: + return "ICMP Echo" + elif "ECR" in pre: + return "ICMP Reply" + elif "URH" in pre: + return "ICMP Host Unreachable" + elif "URP" in pre: + return "ICMP Port Unreachable" + else: + """ + Examples: + S_RA + S_R + A_R + S_SA + SR_SA + FA_FA + SR_RA + SEC_RA + """ + return "Not Established" + + +def check_tcp_states(state, pkts) -> Optional[str]: + pre = state.split("_")[0] + if "EST" in pre: + # TCP + return "Established" + elif "RST" in pre: + # TCP. When -z B is not used in argus, states are single words. + # Most connections are reseted when finished and therefore are + # established + # It can happen that is reseted being not established, but we + # can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is + # not established because the OS retries 3 times. + return "Not Established" if int(pkts) <= 3 else "Established" + elif "FIN" in pre: + # TCP. When -z B is not used in argus, states are single words. + # Most connections are finished with FIN when finished and + # therefore are established + # It can happen that is finished being not established, but we + # can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is + # not established because the OS retries 3 times. + return "Not Established" if int(pkts) <= 3 else "Established" + else: + """ + Examples: + S_ + FA_ + PA_ + FSA_ + SEC_ + SRPA_ + """ + return "Not Established" + + +def check_udp_states(state) -> Optional[str]: + pre = state.split("_")[0] + if "CON" in pre: + # UDP + return "Established" + elif "INT" in pre: + # UDP trying to connect, NOT preciselly not established but also + # NOT 'Established'. So we considered not established because there + # is no confirmation of what happened. + return "Not Established" + + +def check_icmp_states(state) -> Optional[str]: + pre = state.split("_")[0] + if "ECO" in pre: + # ICMP + return "Established" + elif "UNK" in pre: + # ICMP6 unknown upper layer + return "Established" + + +def get_final_state_from_flags(self, state, pkts) -> str: + """ + Analyze the flags given and return a summary of the state. 
+ Should work with Argus and Bro flags + We receive the pakets to distinguish some Reset connections + """ + try: + if state := check_suricata_states(state): + return state + if state := check_zeek_states(state): + return state + if state := check_argus_states(state): + return state + except IndexError: + # suf does not exist, which means that this is some ICMP or + # no response was sent for UDP or TCP + if state := check_icmp_states(state): + return state + if state := check_udp_states(state): + return state + if state := check_tcp_states(state, pkts): + return state + + return "Not Established" + + except Exception: + exception_line = sys.exc_info()[2].tb_lineno + self.print( + f"Error in get_final_state_from_flags() " f"line {exception_line}", + 0, + 1, + ) + self.print(traceback.format_exc(), 0, 1) From 8a30e90ccdcecc165d280d5f47bde3d370fabe00 Mon Sep 17 00:00:00 2001 From: alya Date: Tue, 30 Jul 2024 14:48:16 +0300 Subject: [PATCH 314/498] state_handler: refactor get_final_state_from_flags() --- slips_files/common/state_handler.py | 67 +++++++++++++---------------- 1 file changed, 29 insertions(+), 38 deletions(-) diff --git a/slips_files/common/state_handler.py b/slips_files/common/state_handler.py index b671a09a2..d0a05115b 100644 --- a/slips_files/common/state_handler.py +++ b/slips_files/common/state_handler.py @@ -1,9 +1,7 @@ from typing import Optional -import sys -import traceback -def check_suricata_states(state) -> Optional[str]: +def interpret_suricata_states(state) -> Optional[str]: """ There are different states in which a flow can be. Suricata distinguishes three flow-states for TCP and two for @@ -18,7 +16,7 @@ def check_suricata_states(state) -> Optional[str]: return "Not Established" -def check_zeek_states(state) -> Optional[str]: +def interpret_zeek_states(state) -> Optional[str]: # We have varius type of states depending on the type of flow. # For Zeek if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): @@ -27,9 +25,13 @@ def check_zeek_states(state) -> Optional[str]: return "Established" -def check_argus_states(state) -> Optional[str]: +def interpret_argus_states(state) -> Optional[str]: pre = state.split("_")[0] - suf = state.split("_")[1] + try: + suf = state.split("_")[1] + except IndexError: + return + if "S" in pre and "A" in pre and "S" in suf and "A" in suf: """ Examples: @@ -86,7 +88,7 @@ def check_argus_states(state) -> Optional[str]: return "Not Established" -def check_tcp_states(state, pkts) -> Optional[str]: +def interpret_tcp_states(state, pkts) -> Optional[str]: pre = state.split("_")[0] if "EST" in pre: # TCP @@ -122,7 +124,7 @@ def check_tcp_states(state, pkts) -> Optional[str]: return "Not Established" -def check_udp_states(state) -> Optional[str]: +def interpret_udp_states(state) -> Optional[str]: pre = state.split("_")[0] if "CON" in pre: # UDP @@ -134,7 +136,7 @@ def check_udp_states(state) -> Optional[str]: return "Not Established" -def check_icmp_states(state) -> Optional[str]: +def interpret_icmp_states(state) -> Optional[str]: pre = state.split("_")[0] if "ECO" in pre: # ICMP @@ -144,36 +146,25 @@ def check_icmp_states(state) -> Optional[str]: return "Established" -def get_final_state_from_flags(self, state, pkts) -> str: +def get_final_state_from_flags(state, pkts) -> str: """ - Analyze the flags given and return a summary of the state. 
- Should work with Argus and Bro flags - We receive the pakets to distinguish some Reset connections + Converts the original flags from the flow, to a state that slips + understands + Works with Argus, suricata, and Bro flags + We receive the packets to distinguish some Reset connections """ - try: - if state := check_suricata_states(state): - return state - if state := check_zeek_states(state): - return state - if state := check_argus_states(state): - return state - except IndexError: - # suf does not exist, which means that this is some ICMP or - # no response was sent for UDP or TCP - if state := check_icmp_states(state): - return state - if state := check_udp_states(state): - return state - if state := check_tcp_states(state, pkts): - return state - return "Not Established" + for interpreter in ( + interpret_suricata_states, + interpret_zeek_states, + interpret_argus_states, + interpret_icmp_states, + interpret_udp_states, + ): + if interpreted_state := interpreter(state): + return interpreted_state + + if interpreted_state := interpret_tcp_states(state, pkts): + return interpreted_state - except Exception: - exception_line = sys.exc_info()[2].tb_lineno - self.print( - f"Error in get_final_state_from_flags() " f"line {exception_line}", - 0, - 1, - ) - self.print(traceback.format_exc(), 0, 1) + return "Not Established" From 3c7af271be30bc4b2a1f8fdf466941f9bfa5b5a9 Mon Sep 17 00:00:00 2001 From: alya Date: Tue, 30 Jul 2024 14:59:22 +0300 Subject: [PATCH 315/498] delete get_final_state_from_flags() from flowmldetection, profiler, and the db --- modules/flowmldetection/flowmldetection.py | 150 ++---------------- slips_files/core/database/database_manager.py | 3 - 2 files changed, 10 insertions(+), 143 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 8917fef6a..fb17b57f2 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -121,141 +121,6 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) - - def get_final_state_from_flags(self, state, pkts): - """ - Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags - We receive the pakets to distinguish some Reset connections - """ - try: - pre = state.split("_")[0] - try: - # Try suricata states - """ - There are different states in which a flow can be. - Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, - these are: New, Established and Closed,for UDP only new and established. - For each of these states Suricata can employ different timeouts. - """ - if "new" in state or "established" in state: - return "Established" - elif "closed" in state: - return "Not Established" - - # We have varius type of states depending on the type of flow. 
- # For Zeek - if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): - return "Not Established" - elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): - return "Established" - - # For Argus - suf = state.split("_")[1] - if "S" in pre and "A" in pre and "S" in suf and "A" in suf: - """ - Examples: - SA_SA - SR_SA - FSRA_SA - SPA_SPA - SRA_SPA - FSA_FSA - FSA_FSPA - SAEC_SPA - SRPA_SPA - FSPA_SPA - FSRPA_SPA - FSPA_FSPA - FSRA_FSPA - SRAEC_SPA - FSPA_FSRPA - FSAEC_FSPA - FSRPA_FSPA - SRPAEC_SPA - FSPAEC_FSPA - SRPAEC_FSRPA - """ - return "Established" - elif "PA" in pre and "PA" in suf: - # Tipical flow that was reported in the middle - """ - Examples: - PA_PA - FPA_FPA - """ - return "Established" - elif "ECO" in pre: - return "ICMP Echo" - elif "ECR" in pre: - return "ICMP Reply" - elif "URH" in pre: - return "ICMP Host Unreachable" - elif "URP" in pre: - return "ICMP Port Unreachable" - else: - """ - Examples: - S_RA - S_R - A_R - S_SA - SR_SA - FA_FA - SR_RA - SEC_RA - """ - return "Not Established" - except IndexError: - # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP - if "ECO" in pre: - # ICMP - return "Established" - elif "UNK" in pre: - # ICMP6 unknown upper layer - return "Established" - elif "CON" in pre: - # UDP - return "Established" - elif "INT" in pre: - # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there - # is no confirmation of what happened. - return "Not Established" - elif "EST" in pre: - # TCP - return "Established" - elif "RST" in pre: - # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established - # It can happen that is reseted being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. - return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - elif "FIN" in pre: - # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established - # It can happen that is finished being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. 
- return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - else: - """ - Examples: - S_ - FA_ - PA_ - FSA_ - SEC_ - SRPA_ - """ - return "Not Established" - except Exception: - exception_line = sys.exc_info()[2].tb_lineno - self.print( - f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", - 0, - 1, - ) - self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ @@ -291,12 +156,17 @@ def process_features(self, dataset): except (ValueError, KeyError): pass - # When flows are read from Slips sqlite, the state is not transformed to 'Established' or 'Not Established', it is still 'S0' and others + # When flows are read from Slips sqlite, + # the state is not transformed to 'Established' or + # 'Not Established', it is still 'S0' and others # So transform here - #new_state_column = dataset.state.apply(self.get_final_state_from_flags(dataset.state, dataset.pkts)) - dataset['state'] = dataset.apply(lambda row: self.get_final_state_from_flags(row['state'], row['pkts']), axis=1) - - #dataset.state = new_state_column + dataset["state"] = dataset.apply( + lambda row: get_final_state_from_flags( + row["state"], row["pkts"] + ), + axis=1, + ) + # dataset.state = new_state_column # Convert state to categorical dataset.state = dataset.state.str.replace( diff --git a/slips_files/core/database/database_manager.py b/slips_files/core/database/database_manager.py index 0b805976d..3a7f783ea 100644 --- a/slips_files/core/database/database_manager.py +++ b/slips_files/core/database/database_manager.py @@ -613,9 +613,6 @@ def add_out_dns(self, *args, **kwargs): def add_port(self, *args, **kwargs): return self.rdb.add_port(*args, **kwargs) - def get_final_state_from_flags(self, *args, **kwargs): - return self.rdb.get_final_state_from_flags(*args, **kwargs) - def add_ips(self, *args, **kwargs): return self.rdb.add_ips(*args, **kwargs) From 561049fd9988c8435cff5ac5027e3602c2409088 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Mon, 29 Jul 2024 16:32:14 +0200 Subject: [PATCH 316/498] mlflow. Add a function to convert the state again --- modules/flowmldetection/flowmldetection.py | 135 +++++++++++++++++++++ 1 file changed, 135 insertions(+) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index fb17b57f2..12c3589ed 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -121,6 +121,141 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) + + def get_final_state_from_flags(self, state, pkts): + """ + Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags + We receive the pakets to distinguish some Reset connections + """ + try: + pre = state.split("_")[0] + try: + # Try suricata states + """ + There are different states in which a flow can be. + Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, + these are: New, Established and Closed,for UDP only new and established. + For each of these states Suricata can employ different timeouts. + """ + if "new" in state or "established" in state: + return "Established" + elif "closed" in state: + return "Not Established" + + # We have varius type of states depending on the type of flow. 
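+            # e.g. Zeek's conn_state "S0" (connection attempt, no reply)
+            # maps below to "Not Established"; "SF" (normal handshake and
+            # teardown) maps to "Established"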
+ # For Zeek + if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): + return "Not Established" + elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): + return "Established" + + # For Argus + suf = state.split("_")[1] + if "S" in pre and "A" in pre and "S" in suf and "A" in suf: + """ + Examples: + SA_SA + SR_SA + FSRA_SA + SPA_SPA + SRA_SPA + FSA_FSA + FSA_FSPA + SAEC_SPA + SRPA_SPA + FSPA_SPA + FSRPA_SPA + FSPA_FSPA + FSRA_FSPA + SRAEC_SPA + FSPA_FSRPA + FSAEC_FSPA + FSRPA_FSPA + SRPAEC_SPA + FSPAEC_FSPA + SRPAEC_FSRPA + """ + return "Established" + elif "PA" in pre and "PA" in suf: + # Tipical flow that was reported in the middle + """ + Examples: + PA_PA + FPA_FPA + """ + return "Established" + elif "ECO" in pre: + return "ICMP Echo" + elif "ECR" in pre: + return "ICMP Reply" + elif "URH" in pre: + return "ICMP Host Unreachable" + elif "URP" in pre: + return "ICMP Port Unreachable" + else: + """ + Examples: + S_RA + S_R + A_R + S_SA + SR_SA + FA_FA + SR_RA + SEC_RA + """ + return "Not Established" + except IndexError: + # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP + if "ECO" in pre: + # ICMP + return "Established" + elif "UNK" in pre: + # ICMP6 unknown upper layer + return "Established" + elif "CON" in pre: + # UDP + return "Established" + elif "INT" in pre: + # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there + # is no confirmation of what happened. + return "Not Established" + elif "EST" in pre: + # TCP + return "Established" + elif "RST" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established + # It can happen that is reseted being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. + return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + elif "FIN" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established + # It can happen that is finished being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. 
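+                # e.g. (illustrative values) pkts=2 -> "Not Established",
+                # pkts=20 -> "Established"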
+ return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + else: + """ + Examples: + S_ + FA_ + PA_ + FSA_ + SEC_ + SRPA_ + """ + return "Not Established" + except Exception: + exception_line = sys.exc_info()[2].tb_lineno + self.print( + f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", + 0, + 1, + ) + self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ From da9a6b009a0cf1899f2739b9061558ff730ca3b6 Mon Sep 17 00:00:00 2001 From: alya Date: Tue, 30 Jul 2024 14:59:22 +0300 Subject: [PATCH 317/498] delete get_final_state_from_flags() from flowmldetection, profiler, and the db --- modules/flowmldetection/flowmldetection.py | 135 --------------------- 1 file changed, 135 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 12c3589ed..fb17b57f2 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -121,141 +121,6 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) - - def get_final_state_from_flags(self, state, pkts): - """ - Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags - We receive the pakets to distinguish some Reset connections - """ - try: - pre = state.split("_")[0] - try: - # Try suricata states - """ - There are different states in which a flow can be. - Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, - these are: New, Established and Closed,for UDP only new and established. - For each of these states Suricata can employ different timeouts. - """ - if "new" in state or "established" in state: - return "Established" - elif "closed" in state: - return "Not Established" - - # We have varius type of states depending on the type of flow. - # For Zeek - if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): - return "Not Established" - elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): - return "Established" - - # For Argus - suf = state.split("_")[1] - if "S" in pre and "A" in pre and "S" in suf and "A" in suf: - """ - Examples: - SA_SA - SR_SA - FSRA_SA - SPA_SPA - SRA_SPA - FSA_FSA - FSA_FSPA - SAEC_SPA - SRPA_SPA - FSPA_SPA - FSRPA_SPA - FSPA_FSPA - FSRA_FSPA - SRAEC_SPA - FSPA_FSRPA - FSAEC_FSPA - FSRPA_FSPA - SRPAEC_SPA - FSPAEC_FSPA - SRPAEC_FSRPA - """ - return "Established" - elif "PA" in pre and "PA" in suf: - # Tipical flow that was reported in the middle - """ - Examples: - PA_PA - FPA_FPA - """ - return "Established" - elif "ECO" in pre: - return "ICMP Echo" - elif "ECR" in pre: - return "ICMP Reply" - elif "URH" in pre: - return "ICMP Host Unreachable" - elif "URP" in pre: - return "ICMP Port Unreachable" - else: - """ - Examples: - S_RA - S_R - A_R - S_SA - SR_SA - FA_FA - SR_RA - SEC_RA - """ - return "Not Established" - except IndexError: - # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP - if "ECO" in pre: - # ICMP - return "Established" - elif "UNK" in pre: - # ICMP6 unknown upper layer - return "Established" - elif "CON" in pre: - # UDP - return "Established" - elif "INT" in pre: - # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there - # is no confirmation of what happened. - return "Not Established" - elif "EST" in pre: - # TCP - return "Established" - elif "RST" in pre: - # TCP. 
When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established
-                # It can happen that is reseted being not established, but we can't tell without -z b.
-                # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
-                return (
-                    "Not Established" if int(pkts) <= 3 else "Established"
-                )
-            elif "FIN" in pre:
-                # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established
-                # It can happen that is finished being not established, but we can't tell without -z b.
-                # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
-                return (
-                    "Not Established" if int(pkts) <= 3 else "Established"
-                )
-            else:
-                """
-                Examples:
-                S_
-                FA_
-                PA_
-                FSA_
-                SEC_
-                SRPA_
-                """
-                return "Not Established"
-        except Exception:
-            exception_line = sys.exc_info()[2].tb_lineno
-            self.print(
-                f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}",
-                0,
-                1,
-            )
-            self.print(traceback.format_exc(), 0, 1)

    def process_features(self, dataset):
        """

From 4c52dd2a3fff6acfaa6e4c51593818fbedf73a39 Mon Sep 17 00:00:00 2001
From: Sebas Garcia
Date: Mon, 29 Jul 2024 16:32:14 +0200
Subject: [PATCH 322/498] mlflow. Add a function to convert the state again

---
 modules/flowmldetection/flowmldetection.py | 163 +++++++++++++++--
 1 file changed, 149 insertions(+), 14 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index fb17b57f2..c8226368c 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -8,6 +8,7 @@ import pickle
 import pandas as pd
 import json
+import datetime
 import traceback
 import warnings
 import sys
@@ -121,6 +122,141 @@ def train(self):
         except Exception:
             self.print("Error in train()", 0, 1)
             self.print(traceback.format_exc(), 0, 1)
+
+    def get_final_state_from_flags(self, state, pkts):
+        """
+        Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags
+        We receive the pakets to distinguish some Reset connections
+        """
+        try:
+            pre = state.split("_")[0]
+            try:
+                # Try suricata states
+                """
+                There are different states in which a flow can be.
+                Suricata distinguishes three flow-states for TCP and two for UDP. For TCP,
+                these are: New, Established and Closed,for UDP only new and established.
+                For each of these states Suricata can employ different timeouts.
+                """
+                if "new" in state or "established" in state:
+                    return "Established"
+                elif "closed" in state:
+                    return "Not Established"
+
+                # We have varius type of states depending on the type of flow.
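+            # the two tuples below summarize the Zeek conn_states; the
+            # Argus "pre_suf" flag pairs are handled right after them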
+ # For Zeek + if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): + return "Not Established" + elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): + return "Established" + + # For Argus + suf = state.split("_")[1] + if "S" in pre and "A" in pre and "S" in suf and "A" in suf: + """ + Examples: + SA_SA + SR_SA + FSRA_SA + SPA_SPA + SRA_SPA + FSA_FSA + FSA_FSPA + SAEC_SPA + SRPA_SPA + FSPA_SPA + FSRPA_SPA + FSPA_FSPA + FSRA_FSPA + SRAEC_SPA + FSPA_FSRPA + FSAEC_FSPA + FSRPA_FSPA + SRPAEC_SPA + FSPAEC_FSPA + SRPAEC_FSRPA + """ + return "Established" + elif "PA" in pre and "PA" in suf: + # Tipical flow that was reported in the middle + """ + Examples: + PA_PA + FPA_FPA + """ + return "Established" + elif "ECO" in pre: + return "ICMP Echo" + elif "ECR" in pre: + return "ICMP Reply" + elif "URH" in pre: + return "ICMP Host Unreachable" + elif "URP" in pre: + return "ICMP Port Unreachable" + else: + """ + Examples: + S_RA + S_R + A_R + S_SA + SR_SA + FA_FA + SR_RA + SEC_RA + """ + return "Not Established" + except IndexError: + # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP + if "ECO" in pre: + # ICMP + return "Established" + elif "UNK" in pre: + # ICMP6 unknown upper layer + return "Established" + elif "CON" in pre: + # UDP + return "Established" + elif "INT" in pre: + # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there + # is no confirmation of what happened. + return "Not Established" + elif "EST" in pre: + # TCP + return "Established" + elif "RST" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established + # It can happen that is reseted being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. + return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + elif "FIN" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established + # It can happen that is finished being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. 
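+                # e.g. a short RST-terminated probe of 1-2 packets is
+                # summarized as "Not Established"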
+ return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + else: + """ + Examples: + S_ + FA_ + PA_ + FSA_ + SEC_ + SRPA_ + """ + return "Not Established" + except Exception: + exception_line = sys.exc_info()[2].tb_lineno + self.print( + f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", + 0, + 1, + ) + self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ @@ -156,17 +292,12 @@ def process_features(self, dataset): except (ValueError, KeyError): pass - # When flows are read from Slips sqlite, - # the state is not transformed to 'Established' or - # 'Not Established', it is still 'S0' and others + # When flows are read from Slips sqlite, the state is not transformed to 'Established' or 'Not Established', it is still 'S0' and others # So transform here - dataset["state"] = dataset.apply( - lambda row: get_final_state_from_flags( - row["state"], row["pkts"] - ), - axis=1, - ) - # dataset.state = new_state_column + #new_state_column = dataset.state.apply(self.get_final_state_from_flags(dataset.state, dataset.pkts)) + dataset['state'] = dataset.apply(lambda row: self.get_final_state_from_flags(row['state'], row['pkts']), axis=1) + + #dataset.state = new_state_column # Convert state to categorical dataset.state = dataset.state.str.replace( @@ -393,21 +524,25 @@ def read_model(self): def set_evidence_malicious_flow(self, flow: dict, twid: str): confidence: float = 0.1 description = ( - f"Flow with malicious characteristics by ML. Src IP" + f"Malicious flow by ML. Src IP" f" {flow['saddr']}:{flow['sport']} to " f"{flow['daddr']}:{flow['dport']}" ) + + timestamp = utils.convert_format( + datetime.datetime.now(), utils.alerts_format + ) twid_number = int(twid.replace("timewindow", "")) evidence: Evidence = Evidence( evidence_type=EvidenceType.MALICIOUS_FLOW, attacker=Attacker( direction=Direction.SRC, - ioc_type=IoCType.IP, + attacker_type=IoCType.IP, value=flow["saddr"], ), victim=Victim( direction=Direction.DST, - ioc_type=IoCType.IP, + victim_type=IoCType.IP, value=flow["daddr"], ), threat_level=ThreatLevel.LOW, @@ -416,7 +551,7 @@ def set_evidence_malicious_flow(self, flow: dict, twid: str): profile=ProfileID(ip=flow["saddr"]), timewindow=TimeWindow(twid_number), uid=[flow["uid"]], - timestamp=flow["starttime"], + timestamp=timestamp, method=Method.AI, src_port=flow["sport"], dst_port=flow["dport"], From 0b646faa189b0097648fb7283e91121aa211f19f Mon Sep 17 00:00:00 2001 From: alya Date: Tue, 30 Jul 2024 14:35:21 +0300 Subject: [PATCH 323/498] state_handler: split get_final_state_from_flags() into smaller functions --- slips_files/common/state_handler.py | 67 ++++++++++++++++------------- 1 file changed, 38 insertions(+), 29 deletions(-) diff --git a/slips_files/common/state_handler.py b/slips_files/common/state_handler.py index d0a05115b..b671a09a2 100644 --- a/slips_files/common/state_handler.py +++ b/slips_files/common/state_handler.py @@ -1,7 +1,9 @@ from typing import Optional +import sys +import traceback -def interpret_suricata_states(state) -> Optional[str]: +def check_suricata_states(state) -> Optional[str]: """ There are different states in which a flow can be. Suricata distinguishes three flow-states for TCP and two for @@ -16,7 +18,7 @@ def interpret_suricata_states(state) -> Optional[str]: return "Not Established" -def interpret_zeek_states(state) -> Optional[str]: +def check_zeek_states(state) -> Optional[str]: # We have varius type of states depending on the type of flow. 
# For Zeek if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): @@ -25,13 +27,9 @@ def interpret_zeek_states(state) -> Optional[str]: return "Established" -def interpret_argus_states(state) -> Optional[str]: +def check_argus_states(state) -> Optional[str]: pre = state.split("_")[0] - try: - suf = state.split("_")[1] - except IndexError: - return - + suf = state.split("_")[1] if "S" in pre and "A" in pre and "S" in suf and "A" in suf: """ Examples: @@ -88,7 +86,7 @@ def interpret_argus_states(state) -> Optional[str]: return "Not Established" -def interpret_tcp_states(state, pkts) -> Optional[str]: +def check_tcp_states(state, pkts) -> Optional[str]: pre = state.split("_")[0] if "EST" in pre: # TCP @@ -124,7 +122,7 @@ def interpret_tcp_states(state, pkts) -> Optional[str]: return "Not Established" -def interpret_udp_states(state) -> Optional[str]: +def check_udp_states(state) -> Optional[str]: pre = state.split("_")[0] if "CON" in pre: # UDP @@ -136,7 +134,7 @@ def interpret_udp_states(state) -> Optional[str]: return "Not Established" -def interpret_icmp_states(state) -> Optional[str]: +def check_icmp_states(state) -> Optional[str]: pre = state.split("_")[0] if "ECO" in pre: # ICMP @@ -146,25 +144,36 @@ def interpret_icmp_states(state) -> Optional[str]: return "Established" -def get_final_state_from_flags(state, pkts) -> str: +def get_final_state_from_flags(self, state, pkts) -> str: """ - Converts the original flags from the flow, to a state that slips - understands - Works with Argus, suricata, and Bro flags - We receive the packets to distinguish some Reset connections + Analyze the flags given and return a summary of the state. + Should work with Argus and Bro flags + We receive the pakets to distinguish some Reset connections """ + try: + if state := check_suricata_states(state): + return state + if state := check_zeek_states(state): + return state + if state := check_argus_states(state): + return state + except IndexError: + # suf does not exist, which means that this is some ICMP or + # no response was sent for UDP or TCP + if state := check_icmp_states(state): + return state + if state := check_udp_states(state): + return state + if state := check_tcp_states(state, pkts): + return state - for interpreter in ( - interpret_suricata_states, - interpret_zeek_states, - interpret_argus_states, - interpret_icmp_states, - interpret_udp_states, - ): - if interpreted_state := interpreter(state): - return interpreted_state - - if interpreted_state := interpret_tcp_states(state, pkts): - return interpreted_state + return "Not Established" - return "Not Established" + except Exception: + exception_line = sys.exc_info()[2].tb_lineno + self.print( + f"Error in get_final_state_from_flags() " f"line {exception_line}", + 0, + 1, + ) + self.print(traceback.format_exc(), 0, 1) From a477d089a3d8dd0391bb34de0261d7dafe23af2a Mon Sep 17 00:00:00 2001 From: alya Date: Tue, 30 Jul 2024 14:48:16 +0300 Subject: [PATCH 324/498] state_handler: refactor get_final_state_from_flags() --- slips_files/common/state_handler.py | 67 +++++++++++++---------------- 1 file changed, 29 insertions(+), 38 deletions(-) diff --git a/slips_files/common/state_handler.py b/slips_files/common/state_handler.py index b671a09a2..d0a05115b 100644 --- a/slips_files/common/state_handler.py +++ b/slips_files/common/state_handler.py @@ -1,9 +1,7 @@ from typing import Optional -import sys -import traceback -def check_suricata_states(state) -> Optional[str]: +def interpret_suricata_states(state) -> Optional[str]: """ There are different 
states in which a flow can be. Suricata distinguishes three flow-states for TCP and two for @@ -18,7 +16,7 @@ def check_suricata_states(state) -> Optional[str]: return "Not Established" -def check_zeek_states(state) -> Optional[str]: +def interpret_zeek_states(state) -> Optional[str]: # We have varius type of states depending on the type of flow. # For Zeek if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): @@ -27,9 +25,13 @@ def check_zeek_states(state) -> Optional[str]: return "Established" -def check_argus_states(state) -> Optional[str]: +def interpret_argus_states(state) -> Optional[str]: pre = state.split("_")[0] - suf = state.split("_")[1] + try: + suf = state.split("_")[1] + except IndexError: + return + if "S" in pre and "A" in pre and "S" in suf and "A" in suf: """ Examples: @@ -86,7 +88,7 @@ def check_argus_states(state) -> Optional[str]: return "Not Established" -def check_tcp_states(state, pkts) -> Optional[str]: +def interpret_tcp_states(state, pkts) -> Optional[str]: pre = state.split("_")[0] if "EST" in pre: # TCP @@ -122,7 +124,7 @@ def check_tcp_states(state, pkts) -> Optional[str]: return "Not Established" -def check_udp_states(state) -> Optional[str]: +def interpret_udp_states(state) -> Optional[str]: pre = state.split("_")[0] if "CON" in pre: # UDP @@ -134,7 +136,7 @@ def check_udp_states(state) -> Optional[str]: return "Not Established" -def check_icmp_states(state) -> Optional[str]: +def interpret_icmp_states(state) -> Optional[str]: pre = state.split("_")[0] if "ECO" in pre: # ICMP @@ -144,36 +146,25 @@ def check_icmp_states(state) -> Optional[str]: return "Established" -def get_final_state_from_flags(self, state, pkts) -> str: +def get_final_state_from_flags(state, pkts) -> str: """ - Analyze the flags given and return a summary of the state. 
- Should work with Argus and Bro flags - We receive the pakets to distinguish some Reset connections + Converts the original flags from the flow, to a state that slips + understands + Works with Argus, suricata, and Bro flags + We receive the packets to distinguish some Reset connections """ - try: - if state := check_suricata_states(state): - return state - if state := check_zeek_states(state): - return state - if state := check_argus_states(state): - return state - except IndexError: - # suf does not exist, which means that this is some ICMP or - # no response was sent for UDP or TCP - if state := check_icmp_states(state): - return state - if state := check_udp_states(state): - return state - if state := check_tcp_states(state, pkts): - return state - return "Not Established" + for interpreter in ( + interpret_suricata_states, + interpret_zeek_states, + interpret_argus_states, + interpret_icmp_states, + interpret_udp_states, + ): + if interpreted_state := interpreter(state): + return interpreted_state + + if interpreted_state := interpret_tcp_states(state, pkts): + return interpreted_state - except Exception: - exception_line = sys.exc_info()[2].tb_lineno - self.print( - f"Error in get_final_state_from_flags() " f"line {exception_line}", - 0, - 1, - ) - self.print(traceback.format_exc(), 0, 1) + return "Not Established" From a74d1c5c6fc38842a6a3143ba91e8aae0c4c8599 Mon Sep 17 00:00:00 2001 From: alya Date: Tue, 30 Jul 2024 14:59:22 +0300 Subject: [PATCH 325/498] delete get_final_state_from_flags() from flowmldetection, profiler, and the db --- modules/flowmldetection/flowmldetection.py | 150 ++------------------- 1 file changed, 10 insertions(+), 140 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index c8226368c..9af514a70 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -122,141 +122,6 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) - - def get_final_state_from_flags(self, state, pkts): - """ - Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags - We receive the pakets to distinguish some Reset connections - """ - try: - pre = state.split("_")[0] - try: - # Try suricata states - """ - There are different states in which a flow can be. - Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, - these are: New, Established and Closed,for UDP only new and established. - For each of these states Suricata can employ different timeouts. - """ - if "new" in state or "established" in state: - return "Established" - elif "closed" in state: - return "Not Established" - - # We have varius type of states depending on the type of flow. 
- # For Zeek - if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): - return "Not Established" - elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): - return "Established" - - # For Argus - suf = state.split("_")[1] - if "S" in pre and "A" in pre and "S" in suf and "A" in suf: - """ - Examples: - SA_SA - SR_SA - FSRA_SA - SPA_SPA - SRA_SPA - FSA_FSA - FSA_FSPA - SAEC_SPA - SRPA_SPA - FSPA_SPA - FSRPA_SPA - FSPA_FSPA - FSRA_FSPA - SRAEC_SPA - FSPA_FSRPA - FSAEC_FSPA - FSRPA_FSPA - SRPAEC_SPA - FSPAEC_FSPA - SRPAEC_FSRPA - """ - return "Established" - elif "PA" in pre and "PA" in suf: - # Tipical flow that was reported in the middle - """ - Examples: - PA_PA - FPA_FPA - """ - return "Established" - elif "ECO" in pre: - return "ICMP Echo" - elif "ECR" in pre: - return "ICMP Reply" - elif "URH" in pre: - return "ICMP Host Unreachable" - elif "URP" in pre: - return "ICMP Port Unreachable" - else: - """ - Examples: - S_RA - S_R - A_R - S_SA - SR_SA - FA_FA - SR_RA - SEC_RA - """ - return "Not Established" - except IndexError: - # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP - if "ECO" in pre: - # ICMP - return "Established" - elif "UNK" in pre: - # ICMP6 unknown upper layer - return "Established" - elif "CON" in pre: - # UDP - return "Established" - elif "INT" in pre: - # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there - # is no confirmation of what happened. - return "Not Established" - elif "EST" in pre: - # TCP - return "Established" - elif "RST" in pre: - # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established - # It can happen that is reseted being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. - return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - elif "FIN" in pre: - # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established - # It can happen that is finished being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. 
- return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - else: - """ - Examples: - S_ - FA_ - PA_ - FSA_ - SEC_ - SRPA_ - """ - return "Not Established" - except Exception: - exception_line = sys.exc_info()[2].tb_lineno - self.print( - f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", - 0, - 1, - ) - self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ @@ -292,12 +157,17 @@ def process_features(self, dataset): except (ValueError, KeyError): pass - # When flows are read from Slips sqlite, the state is not transformed to 'Established' or 'Not Established', it is still 'S0' and others + # When flows are read from Slips sqlite, + # the state is not transformed to 'Established' or + # 'Not Established', it is still 'S0' and others # So transform here - #new_state_column = dataset.state.apply(self.get_final_state_from_flags(dataset.state, dataset.pkts)) - dataset['state'] = dataset.apply(lambda row: self.get_final_state_from_flags(row['state'], row['pkts']), axis=1) - - #dataset.state = new_state_column + dataset["state"] = dataset.apply( + lambda row: get_final_state_from_flags( + row["state"], row["pkts"] + ), + axis=1, + ) + # dataset.state = new_state_column # Convert state to categorical dataset.state = dataset.state.str.replace( From 560a37b8ef1724010ec2f653ab6e686efbfe9fdb Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Mon, 29 Jul 2024 16:32:14 +0200 Subject: [PATCH 326/498] mlflow. Add a function to convert the state again --- modules/flowmldetection/flowmldetection.py | 135 +++++++++++++++++++++ 1 file changed, 135 insertions(+) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 9af514a70..94eb27afd 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -122,6 +122,141 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) + + def get_final_state_from_flags(self, state, pkts): + """ + Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags + We receive the pakets to distinguish some Reset connections + """ + try: + pre = state.split("_")[0] + try: + # Try suricata states + """ + There are different states in which a flow can be. + Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, + these are: New, Established and Closed,for UDP only new and established. + For each of these states Suricata can employ different timeouts. + """ + if "new" in state or "established" in state: + return "Established" + elif "closed" in state: + return "Not Established" + + # We have varius type of states depending on the type of flow. 
+ # For Zeek + if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): + return "Not Established" + elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): + return "Established" + + # For Argus + suf = state.split("_")[1] + if "S" in pre and "A" in pre and "S" in suf and "A" in suf: + """ + Examples: + SA_SA + SR_SA + FSRA_SA + SPA_SPA + SRA_SPA + FSA_FSA + FSA_FSPA + SAEC_SPA + SRPA_SPA + FSPA_SPA + FSRPA_SPA + FSPA_FSPA + FSRA_FSPA + SRAEC_SPA + FSPA_FSRPA + FSAEC_FSPA + FSRPA_FSPA + SRPAEC_SPA + FSPAEC_FSPA + SRPAEC_FSRPA + """ + return "Established" + elif "PA" in pre and "PA" in suf: + # Tipical flow that was reported in the middle + """ + Examples: + PA_PA + FPA_FPA + """ + return "Established" + elif "ECO" in pre: + return "ICMP Echo" + elif "ECR" in pre: + return "ICMP Reply" + elif "URH" in pre: + return "ICMP Host Unreachable" + elif "URP" in pre: + return "ICMP Port Unreachable" + else: + """ + Examples: + S_RA + S_R + A_R + S_SA + SR_SA + FA_FA + SR_RA + SEC_RA + """ + return "Not Established" + except IndexError: + # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP + if "ECO" in pre: + # ICMP + return "Established" + elif "UNK" in pre: + # ICMP6 unknown upper layer + return "Established" + elif "CON" in pre: + # UDP + return "Established" + elif "INT" in pre: + # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there + # is no confirmation of what happened. + return "Not Established" + elif "EST" in pre: + # TCP + return "Established" + elif "RST" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established + # It can happen that is reseted being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. + return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + elif "FIN" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established + # It can happen that is finished being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. 
+ return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + else: + """ + Examples: + S_ + FA_ + PA_ + FSA_ + SEC_ + SRPA_ + """ + return "Not Established" + except Exception: + exception_line = sys.exc_info()[2].tb_lineno + self.print( + f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", + 0, + 1, + ) + self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ From 5190917ba7031d744def42bf9d0d1510a59746cc Mon Sep 17 00:00:00 2001 From: alya Date: Tue, 30 Jul 2024 14:59:22 +0300 Subject: [PATCH 327/498] delete get_final_state_from_flags() from flowmldetection, profiler, and the db --- modules/flowmldetection/flowmldetection.py | 135 --------------------- 1 file changed, 135 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 94eb27afd..9af514a70 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -122,141 +122,6 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) - - def get_final_state_from_flags(self, state, pkts): - """ - Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags - We receive the pakets to distinguish some Reset connections - """ - try: - pre = state.split("_")[0] - try: - # Try suricata states - """ - There are different states in which a flow can be. - Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, - these are: New, Established and Closed,for UDP only new and established. - For each of these states Suricata can employ different timeouts. - """ - if "new" in state or "established" in state: - return "Established" - elif "closed" in state: - return "Not Established" - - # We have varius type of states depending on the type of flow. - # For Zeek - if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): - return "Not Established" - elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): - return "Established" - - # For Argus - suf = state.split("_")[1] - if "S" in pre and "A" in pre and "S" in suf and "A" in suf: - """ - Examples: - SA_SA - SR_SA - FSRA_SA - SPA_SPA - SRA_SPA - FSA_FSA - FSA_FSPA - SAEC_SPA - SRPA_SPA - FSPA_SPA - FSRPA_SPA - FSPA_FSPA - FSRA_FSPA - SRAEC_SPA - FSPA_FSRPA - FSAEC_FSPA - FSRPA_FSPA - SRPAEC_SPA - FSPAEC_FSPA - SRPAEC_FSRPA - """ - return "Established" - elif "PA" in pre and "PA" in suf: - # Tipical flow that was reported in the middle - """ - Examples: - PA_PA - FPA_FPA - """ - return "Established" - elif "ECO" in pre: - return "ICMP Echo" - elif "ECR" in pre: - return "ICMP Reply" - elif "URH" in pre: - return "ICMP Host Unreachable" - elif "URP" in pre: - return "ICMP Port Unreachable" - else: - """ - Examples: - S_RA - S_R - A_R - S_SA - SR_SA - FA_FA - SR_RA - SEC_RA - """ - return "Not Established" - except IndexError: - # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP - if "ECO" in pre: - # ICMP - return "Established" - elif "UNK" in pre: - # ICMP6 unknown upper layer - return "Established" - elif "CON" in pre: - # UDP - return "Established" - elif "INT" in pre: - # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there - # is no confirmation of what happened. - return "Not Established" - elif "EST" in pre: - # TCP - return "Established" - elif "RST" in pre: - # TCP. 
When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established - # It can happen that is reseted being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. - return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - elif "FIN" in pre: - # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established - # It can happen that is finished being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. - return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - else: - """ - Examples: - S_ - FA_ - PA_ - FSA_ - SEC_ - SRPA_ - """ - return "Not Established" - except Exception: - exception_line = sys.exc_info()[2].tb_lineno - self.print( - f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", - 0, - 1, - ) - self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ From 567f4393ad7832b554e8684c026fad71fe6d0b3e Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Mon, 29 Jul 2024 16:32:14 +0200 Subject: [PATCH 328/498] mlflow. Add a function to convert the state again --- modules/flowmldetection/flowmldetection.py | 150 +++++++++++++++++++-- 1 file changed, 140 insertions(+), 10 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 9af514a70..c8226368c 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -122,6 +122,141 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) + + def get_final_state_from_flags(self, state, pkts): + """ + Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags + We receive the pakets to distinguish some Reset connections + """ + try: + pre = state.split("_")[0] + try: + # Try suricata states + """ + There are different states in which a flow can be. + Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, + these are: New, Established and Closed,for UDP only new and established. + For each of these states Suricata can employ different timeouts. + """ + if "new" in state or "established" in state: + return "Established" + elif "closed" in state: + return "Not Established" + + # We have varius type of states depending on the type of flow. 
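# Worked examples for the Suricata branch above, as an isolated sketch of
# the same substring checks:
def suricata_summary(state: str):
    if "new" in state or "established" in state:
        return "Established"
    if "closed" in state:
        return "Not Established"
    return None  # not a Suricata state; the later checks decide

assert suricata_summary("established") == "Established"
assert suricata_summary("closed") == "Not Established"
assert suricata_summary("S0") is None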
+ # For Zeek + if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): + return "Not Established" + elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): + return "Established" + + # For Argus + suf = state.split("_")[1] + if "S" in pre and "A" in pre and "S" in suf and "A" in suf: + """ + Examples: + SA_SA + SR_SA + FSRA_SA + SPA_SPA + SRA_SPA + FSA_FSA + FSA_FSPA + SAEC_SPA + SRPA_SPA + FSPA_SPA + FSRPA_SPA + FSPA_FSPA + FSRA_FSPA + SRAEC_SPA + FSPA_FSRPA + FSAEC_FSPA + FSRPA_FSPA + SRPAEC_SPA + FSPAEC_FSPA + SRPAEC_FSRPA + """ + return "Established" + elif "PA" in pre and "PA" in suf: + # Tipical flow that was reported in the middle + """ + Examples: + PA_PA + FPA_FPA + """ + return "Established" + elif "ECO" in pre: + return "ICMP Echo" + elif "ECR" in pre: + return "ICMP Reply" + elif "URH" in pre: + return "ICMP Host Unreachable" + elif "URP" in pre: + return "ICMP Port Unreachable" + else: + """ + Examples: + S_RA + S_R + A_R + S_SA + SR_SA + FA_FA + SR_RA + SEC_RA + """ + return "Not Established" + except IndexError: + # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP + if "ECO" in pre: + # ICMP + return "Established" + elif "UNK" in pre: + # ICMP6 unknown upper layer + return "Established" + elif "CON" in pre: + # UDP + return "Established" + elif "INT" in pre: + # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there + # is no confirmation of what happened. + return "Not Established" + elif "EST" in pre: + # TCP + return "Established" + elif "RST" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established + # It can happen that is reseted being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. + return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + elif "FIN" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established + # It can happen that is finished being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. 
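# The Argus branch above reduces to: the flow is Established when both the
# client half (pre) and the server half (suf) saw SYN and ACK. Standalone
# sketch, exercised on a few of the listed examples:
def both_sides_syn_ack(state: str) -> bool:
    pre, suf = state.split("_")
    return all(f in half for half in (pre, suf) for f in ("S", "A"))

assert both_sides_syn_ack("SRPA_SPA")    # from the Established examples
assert both_sides_syn_ack("FSPA_FSPA")
assert not both_sides_syn_ack("S_RA")    # from the Not Established examples
assert not both_sides_syn_ack("FA_FA")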
+ return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + else: + """ + Examples: + S_ + FA_ + PA_ + FSA_ + SEC_ + SRPA_ + """ + return "Not Established" + except Exception: + exception_line = sys.exc_info()[2].tb_lineno + self.print( + f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", + 0, + 1, + ) + self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ @@ -157,17 +292,12 @@ def process_features(self, dataset): except (ValueError, KeyError): pass - # When flows are read from Slips sqlite, - # the state is not transformed to 'Established' or - # 'Not Established', it is still 'S0' and others + # When flows are read from Slips sqlite, the state is not transformed to 'Established' or 'Not Established', it is still 'S0' and others # So transform here - dataset["state"] = dataset.apply( - lambda row: get_final_state_from_flags( - row["state"], row["pkts"] - ), - axis=1, - ) - # dataset.state = new_state_column + #new_state_column = dataset.state.apply(self.get_final_state_from_flags(dataset.state, dataset.pkts)) + dataset['state'] = dataset.apply(lambda row: self.get_final_state_from_flags(row['state'], row['pkts']), axis=1) + + #dataset.state = new_state_column # Convert state to categorical dataset.state = dataset.state.str.replace( From 626a5c3d5bb9f9cb94d5b1d91f4c61c4913247a1 Mon Sep 17 00:00:00 2001 From: alya Date: Tue, 30 Jul 2024 14:35:21 +0300 Subject: [PATCH 329/498] state_handler: split get_final_state_from_flags() into smaller functions --- slips_files/common/state_handler.py | 67 ++++++++++++++++------------- 1 file changed, 38 insertions(+), 29 deletions(-) diff --git a/slips_files/common/state_handler.py b/slips_files/common/state_handler.py index d0a05115b..b671a09a2 100644 --- a/slips_files/common/state_handler.py +++ b/slips_files/common/state_handler.py @@ -1,7 +1,9 @@ from typing import Optional +import sys +import traceback -def interpret_suricata_states(state) -> Optional[str]: +def check_suricata_states(state) -> Optional[str]: """ There are different states in which a flow can be. Suricata distinguishes three flow-states for TCP and two for @@ -16,7 +18,7 @@ def interpret_suricata_states(state) -> Optional[str]: return "Not Established" -def interpret_zeek_states(state) -> Optional[str]: +def check_zeek_states(state) -> Optional[str]: # We have varius type of states depending on the type of flow. 
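# This split gives each flag format its own checker returning Optional[str];
# the dispatcher further down in this patch tries them in order and the
# first non-None verdict wins. A minimal sketch of that shape (names here
# are illustrative, not the module's). One pitfall worth noting: writing
# `if state := check(state)` rebinds `state` to None whenever a checker
# declines, so the later checks see None; binding a fresh name avoids that.
from typing import Callable, Iterable, Optional

def first_verdict(state: str,
                  checkers: Iterable[Callable[[str], Optional[str]]]) -> str:
    for check in checkers:
        if (verdict := check(state)) is not None:
            return verdict
    return "Not Established"  # conservative default when nothing matches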
# For Zeek if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): @@ -25,13 +27,9 @@ def interpret_zeek_states(state) -> Optional[str]: return "Established" -def interpret_argus_states(state) -> Optional[str]: +def check_argus_states(state) -> Optional[str]: pre = state.split("_")[0] - try: - suf = state.split("_")[1] - except IndexError: - return - + suf = state.split("_")[1] if "S" in pre and "A" in pre and "S" in suf and "A" in suf: """ Examples: @@ -88,7 +86,7 @@ def interpret_argus_states(state) -> Optional[str]: return "Not Established" -def interpret_tcp_states(state, pkts) -> Optional[str]: +def check_tcp_states(state, pkts) -> Optional[str]: pre = state.split("_")[0] if "EST" in pre: # TCP @@ -124,7 +122,7 @@ def interpret_tcp_states(state, pkts) -> Optional[str]: return "Not Established" -def interpret_udp_states(state) -> Optional[str]: +def check_udp_states(state) -> Optional[str]: pre = state.split("_")[0] if "CON" in pre: # UDP @@ -136,7 +134,7 @@ def interpret_udp_states(state) -> Optional[str]: return "Not Established" -def interpret_icmp_states(state) -> Optional[str]: +def check_icmp_states(state) -> Optional[str]: pre = state.split("_")[0] if "ECO" in pre: # ICMP @@ -146,25 +144,36 @@ def interpret_icmp_states(state) -> Optional[str]: return "Established" -def get_final_state_from_flags(state, pkts) -> str: +def get_final_state_from_flags(self, state, pkts) -> str: """ - Converts the original flags from the flow, to a state that slips - understands - Works with Argus, suricata, and Bro flags - We receive the packets to distinguish some Reset connections + Analyze the flags given and return a summary of the state. + Should work with Argus and Bro flags + We receive the pakets to distinguish some Reset connections """ + try: + if state := check_suricata_states(state): + return state + if state := check_zeek_states(state): + return state + if state := check_argus_states(state): + return state + except IndexError: + # suf does not exist, which means that this is some ICMP or + # no response was sent for UDP or TCP + if state := check_icmp_states(state): + return state + if state := check_udp_states(state): + return state + if state := check_tcp_states(state, pkts): + return state - for interpreter in ( - interpret_suricata_states, - interpret_zeek_states, - interpret_argus_states, - interpret_icmp_states, - interpret_udp_states, - ): - if interpreted_state := interpreter(state): - return interpreted_state - - if interpreted_state := interpret_tcp_states(state, pkts): - return interpreted_state + return "Not Established" - return "Not Established" + except Exception: + exception_line = sys.exc_info()[2].tb_lineno + self.print( + f"Error in get_final_state_from_flags() " f"line {exception_line}", + 0, + 1, + ) + self.print(traceback.format_exc(), 0, 1) From 2c2212290619b7bccb25ef045f3a2ba3f4f5a270 Mon Sep 17 00:00:00 2001 From: alya Date: Tue, 30 Jul 2024 14:48:16 +0300 Subject: [PATCH 330/498] state_handler: refactor get_final_state_from_flags() --- slips_files/common/state_handler.py | 67 +++++++++++++---------------- 1 file changed, 29 insertions(+), 38 deletions(-) diff --git a/slips_files/common/state_handler.py b/slips_files/common/state_handler.py index b671a09a2..d0a05115b 100644 --- a/slips_files/common/state_handler.py +++ b/slips_files/common/state_handler.py @@ -1,9 +1,7 @@ from typing import Optional -import sys -import traceback -def check_suricata_states(state) -> Optional[str]: +def interpret_suricata_states(state) -> Optional[str]: """ There are different 
states in which a flow can be. Suricata distinguishes three flow-states for TCP and two for @@ -18,7 +16,7 @@ def check_suricata_states(state) -> Optional[str]: return "Not Established" -def check_zeek_states(state) -> Optional[str]: +def interpret_zeek_states(state) -> Optional[str]: # We have varius type of states depending on the type of flow. # For Zeek if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): @@ -27,9 +25,13 @@ def check_zeek_states(state) -> Optional[str]: return "Established" -def check_argus_states(state) -> Optional[str]: +def interpret_argus_states(state) -> Optional[str]: pre = state.split("_")[0] - suf = state.split("_")[1] + try: + suf = state.split("_")[1] + except IndexError: + return + if "S" in pre and "A" in pre and "S" in suf and "A" in suf: """ Examples: @@ -86,7 +88,7 @@ def check_argus_states(state) -> Optional[str]: return "Not Established" -def check_tcp_states(state, pkts) -> Optional[str]: +def interpret_tcp_states(state, pkts) -> Optional[str]: pre = state.split("_")[0] if "EST" in pre: # TCP @@ -122,7 +124,7 @@ def check_tcp_states(state, pkts) -> Optional[str]: return "Not Established" -def check_udp_states(state) -> Optional[str]: +def interpret_udp_states(state) -> Optional[str]: pre = state.split("_")[0] if "CON" in pre: # UDP @@ -134,7 +136,7 @@ def check_udp_states(state) -> Optional[str]: return "Not Established" -def check_icmp_states(state) -> Optional[str]: +def interpret_icmp_states(state) -> Optional[str]: pre = state.split("_")[0] if "ECO" in pre: # ICMP @@ -144,36 +146,25 @@ def check_icmp_states(state) -> Optional[str]: return "Established" -def get_final_state_from_flags(self, state, pkts) -> str: +def get_final_state_from_flags(state, pkts) -> str: """ - Analyze the flags given and return a summary of the state. 
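# interpret_argus_states() above guards the suffix lookup with try/except
# IndexError so that single-word states fall through to the remaining
# interpreters. str.partition() gives the same guard without the except,
# sketched here:
def split_state(state: str):
    pre, _, suf = state.partition("_")
    return pre, (suf or None)

assert split_state("SRPA_SPA") == ("SRPA", "SPA")
assert split_state("RST") == ("RST", None)  # no suffix to inspect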
- Should work with Argus and Bro flags - We receive the pakets to distinguish some Reset connections + Converts the original flags from the flow, to a state that slips + understands + Works with Argus, suricata, and Bro flags + We receive the packets to distinguish some Reset connections """ - try: - if state := check_suricata_states(state): - return state - if state := check_zeek_states(state): - return state - if state := check_argus_states(state): - return state - except IndexError: - # suf does not exist, which means that this is some ICMP or - # no response was sent for UDP or TCP - if state := check_icmp_states(state): - return state - if state := check_udp_states(state): - return state - if state := check_tcp_states(state, pkts): - return state - return "Not Established" + for interpreter in ( + interpret_suricata_states, + interpret_zeek_states, + interpret_argus_states, + interpret_icmp_states, + interpret_udp_states, + ): + if interpreted_state := interpreter(state): + return interpreted_state + + if interpreted_state := interpret_tcp_states(state, pkts): + return interpreted_state - except Exception: - exception_line = sys.exc_info()[2].tb_lineno - self.print( - f"Error in get_final_state_from_flags() " f"line {exception_line}", - 0, - 1, - ) - self.print(traceback.format_exc(), 0, 1) + return "Not Established" From 6bed5ff1a0bef41b33a1cd5b07dcf89cb2a43ab6 Mon Sep 17 00:00:00 2001 From: alya Date: Tue, 30 Jul 2024 14:59:22 +0300 Subject: [PATCH 331/498] delete get_final_state_from_flags() from flowmldetection, profiler, and the db --- modules/flowmldetection/flowmldetection.py | 150 ++------------------- 1 file changed, 10 insertions(+), 140 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index c8226368c..9af514a70 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -122,141 +122,6 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) - - def get_final_state_from_flags(self, state, pkts): - """ - Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags - We receive the pakets to distinguish some Reset connections - """ - try: - pre = state.split("_")[0] - try: - # Try suricata states - """ - There are different states in which a flow can be. - Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, - these are: New, Established and Closed,for UDP only new and established. - For each of these states Suricata can employ different timeouts. - """ - if "new" in state or "established" in state: - return "Established" - elif "closed" in state: - return "Not Established" - - # We have varius type of states depending on the type of flow. 
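# After the refactor in the previous patch, the helper is a plain module
# function. A hedged usage sketch (the import path assumes the repository
# root is on sys.path; expected values follow the mappings in the code
# above):
from slips_files.common.state_handler import get_final_state_from_flags

assert get_final_state_from_flags("S0", 1) == "Not Established"     # Zeek
assert get_final_state_from_flags("SRPA_SPA", 84) == "Established"  # Argus
assert get_final_state_from_flags("RST", 2) == "Not Established"    # pkts heuristic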
- # For Zeek - if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): - return "Not Established" - elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): - return "Established" - - # For Argus - suf = state.split("_")[1] - if "S" in pre and "A" in pre and "S" in suf and "A" in suf: - """ - Examples: - SA_SA - SR_SA - FSRA_SA - SPA_SPA - SRA_SPA - FSA_FSA - FSA_FSPA - SAEC_SPA - SRPA_SPA - FSPA_SPA - FSRPA_SPA - FSPA_FSPA - FSRA_FSPA - SRAEC_SPA - FSPA_FSRPA - FSAEC_FSPA - FSRPA_FSPA - SRPAEC_SPA - FSPAEC_FSPA - SRPAEC_FSRPA - """ - return "Established" - elif "PA" in pre and "PA" in suf: - # Tipical flow that was reported in the middle - """ - Examples: - PA_PA - FPA_FPA - """ - return "Established" - elif "ECO" in pre: - return "ICMP Echo" - elif "ECR" in pre: - return "ICMP Reply" - elif "URH" in pre: - return "ICMP Host Unreachable" - elif "URP" in pre: - return "ICMP Port Unreachable" - else: - """ - Examples: - S_RA - S_R - A_R - S_SA - SR_SA - FA_FA - SR_RA - SEC_RA - """ - return "Not Established" - except IndexError: - # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP - if "ECO" in pre: - # ICMP - return "Established" - elif "UNK" in pre: - # ICMP6 unknown upper layer - return "Established" - elif "CON" in pre: - # UDP - return "Established" - elif "INT" in pre: - # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there - # is no confirmation of what happened. - return "Not Established" - elif "EST" in pre: - # TCP - return "Established" - elif "RST" in pre: - # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established - # It can happen that is reseted being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. - return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - elif "FIN" in pre: - # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established - # It can happen that is finished being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. 
- return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - else: - """ - Examples: - S_ - FA_ - PA_ - FSA_ - SEC_ - SRPA_ - """ - return "Not Established" - except Exception: - exception_line = sys.exc_info()[2].tb_lineno - self.print( - f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", - 0, - 1, - ) - self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ @@ -292,12 +157,17 @@ def process_features(self, dataset): except (ValueError, KeyError): pass - # When flows are read from Slips sqlite, the state is not transformed to 'Established' or 'Not Established', it is still 'S0' and others + # When flows are read from Slips sqlite, + # the state is not transformed to 'Established' or + # 'Not Established', it is still 'S0' and others # So transform here - #new_state_column = dataset.state.apply(self.get_final_state_from_flags(dataset.state, dataset.pkts)) - dataset['state'] = dataset.apply(lambda row: self.get_final_state_from_flags(row['state'], row['pkts']), axis=1) - - #dataset.state = new_state_column + dataset["state"] = dataset.apply( + lambda row: get_final_state_from_flags( + row["state"], row["pkts"] + ), + axis=1, + ) + # dataset.state = new_state_column # Convert state to categorical dataset.state = dataset.state.str.replace( From d5ea6803c87520eee8061d06dfce7a75159238b3 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Mon, 29 Jul 2024 16:32:14 +0200 Subject: [PATCH 332/498] mlflow. Add a function to convert the state again --- modules/flowmldetection/flowmldetection.py | 438 +++++++++++++-------- 1 file changed, 278 insertions(+), 160 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 9af514a70..124ec61f9 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -1,8 +1,3 @@ -# SPDX-FileCopyrightText: 2021 Sebastian Garcia -from typing import Optional - -# SPDX-License-Identifier: GPL-2.0-only -import numpy from sklearn.linear_model import SGDClassifier from sklearn.preprocessing import StandardScaler import pickle @@ -10,13 +5,10 @@ import json import datetime import traceback -import warnings import sys -from slips_files.common.parsers.config_parser import ConfigParser -from slips_files.common.slips_utils import utils -from slips_files.common.abstracts.module import IModule -from slips_files.core.structures.evidence import ( +from slips_files.common.imports import * +from slips_files.core.evidence_structure.evidence import ( Evidence, ProfileID, TimeWindow, @@ -25,8 +17,7 @@ EvidenceType, IoCType, Direction, - Victim, - Method, + IDEACategory, ) # Only for debbuging @@ -38,6 +29,8 @@ def warn(*args, **kwargs): pass +import warnings + warnings.warn = warn @@ -63,8 +56,6 @@ def init(self): # self.scores = [] # The scaler trained during training and to use during testing self.scaler = StandardScaler() - self.model_path = "./modules/flowmldetection/model.bin" - self.scaler_path = "./modules/flowmldetection/scaler.bin" def read_configuration(self): conf = ConfigParser() @@ -122,6 +113,141 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) + + def get_final_state_from_flags(self, state, pkts): + """ + Analyze the flags given and return a summary of the state. 
Should work with Argus and Bro flags + We receive the pakets to distinguish some Reset connections + """ + try: + pre = state.split("_")[0] + try: + # Try suricata states + """ + There are different states in which a flow can be. + Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, + these are: New, Established and Closed,for UDP only new and established. + For each of these states Suricata can employ different timeouts. + """ + if "new" in state or "established" in state: + return "Established" + elif "closed" in state: + return "Not Established" + + # We have varius type of states depending on the type of flow. + # For Zeek + if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): + return "Not Established" + elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): + return "Established" + + # For Argus + suf = state.split("_")[1] + if "S" in pre and "A" in pre and "S" in suf and "A" in suf: + """ + Examples: + SA_SA + SR_SA + FSRA_SA + SPA_SPA + SRA_SPA + FSA_FSA + FSA_FSPA + SAEC_SPA + SRPA_SPA + FSPA_SPA + FSRPA_SPA + FSPA_FSPA + FSRA_FSPA + SRAEC_SPA + FSPA_FSRPA + FSAEC_FSPA + FSRPA_FSPA + SRPAEC_SPA + FSPAEC_FSPA + SRPAEC_FSRPA + """ + return "Established" + elif "PA" in pre and "PA" in suf: + # Tipical flow that was reported in the middle + """ + Examples: + PA_PA + FPA_FPA + """ + return "Established" + elif "ECO" in pre: + return "ICMP Echo" + elif "ECR" in pre: + return "ICMP Reply" + elif "URH" in pre: + return "ICMP Host Unreachable" + elif "URP" in pre: + return "ICMP Port Unreachable" + else: + """ + Examples: + S_RA + S_R + A_R + S_SA + SR_SA + FA_FA + SR_RA + SEC_RA + """ + return "Not Established" + except IndexError: + # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP + if "ECO" in pre: + # ICMP + return "Established" + elif "UNK" in pre: + # ICMP6 unknown upper layer + return "Established" + elif "CON" in pre: + # UDP + return "Established" + elif "INT" in pre: + # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there + # is no confirmation of what happened. + return "Not Established" + elif "EST" in pre: + # TCP + return "Established" + elif "RST" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established + # It can happen that is reseted being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. + return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + elif "FIN" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established + # It can happen that is finished being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. 
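# The ICMP branch above maps Argus prefixes to human-readable summaries.
# The same lookup in isolation (sketch):
ICMP_STATES = {
    "ECO": "ICMP Echo",
    "ECR": "ICMP Reply",
    "URH": "ICMP Host Unreachable",
    "URP": "ICMP Port Unreachable",
}

def icmp_summary(pre: str):
    return next((v for k, v in ICMP_STATES.items() if k in pre), None)

assert icmp_summary("ECO") == "ICMP Echo"
assert icmp_summary("URP") == "ICMP Port Unreachable"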
+ return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + else: + """ + Examples: + S_ + FA_ + PA_ + FSA_ + SEC_ + SRPA_ + """ + return "Not Established" + except Exception: + exception_line = sys.exc_info()[2].tb_lineno + self.print( + f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", + 0, + 1, + ) + self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ @@ -130,7 +256,7 @@ def process_features(self, dataset): """ try: # Discard some type of flows that dont have ports - to_discard = ["arp", "ARP", "icmp", "igmp", "ipv6-icmp", ""] + to_discard = ["arp", "ARP", "icmp", "igmp", "ipv6-icmp"] for proto in to_discard: dataset = dataset[dataset.proto != proto] @@ -139,35 +265,28 @@ def process_features(self, dataset): "appproto", "daddr", "saddr", - "starttime", + "ts", + "origstate", "type_", - "smac", - "dmac", - "history", - "uid", "dir_", + "history", "dbytes", - "endtime", - "bytes", - "flow_source", + "dpkts", + "smac", + "dmac", ] for field in to_drop: try: dataset = dataset.drop(field, axis=1) - except (ValueError, KeyError): + except ValueError: pass - # When flows are read from Slips sqlite, - # the state is not transformed to 'Established' or - # 'Not Established', it is still 'S0' and others + # When flows are read from Slips sqlite, the state is not transformed to 'Established' or 'Not Established', it is still 'S0' and others # So transform here - dataset["state"] = dataset.apply( - lambda row: get_final_state_from_flags( - row["state"], row["pkts"] - ), - axis=1, - ) - # dataset.state = new_state_column + #new_state_column = dataset.state.apply(self.get_final_state_from_flags(dataset.state, dataset.pkts)) + dataset['state'] = dataset.apply(lambda row: self.get_final_state_from_flags(row['state'], row['pkts']), axis=1) + + #dataset.state = new_state_column # Convert state to categorical dataset.state = dataset.state.str.replace( @@ -201,23 +320,42 @@ def process_features(self, dataset): dataset.proto = dataset.proto.str.replace( r"(^.*arp.*$)", "4", regex=True ) - fields_to_convert_to_flow = [ - dataset.proto, - dataset.dport, - dataset.sport, - dataset.dur, - dataset.pkts, - dataset.spkts, - dataset.allbytes, - dataset.sbytes, - dataset.state, - ] - for field in fields_to_convert_to_flow: - try: - field = field.astype("float64") - except ValueError: - pass - + dataset.proto = dataset.proto.astype("float64") + try: + # Convert dport to float + dataset.dport = dataset.dport.astype("float") + except ValueError: + pass + try: + # Convert sport to float + dataset.sport = dataset.sport.astype("float") + except ValueError: + pass + try: + # Convert Dur to float + dataset.dur = dataset.dur.astype("float") + except ValueError: + pass + try: + # Convert TotPkts to float + dataset.pkts = dataset.pkts.astype("float") + except ValueError: + pass + try: + # Convert SrcPkts to float + dataset.spkts = dataset.spkts.astype("float") + except ValueError: + pass + try: + # Convert TotBytes to float + dataset.allbytes = dataset.allbytes.astype("float") + except ValueError: + pass + try: + # Convert SrcBytes to float + dataset.sbytes = dataset.sbytes.astype("float") + except ValueError: + pass return dataset except Exception: # Stop the timer @@ -233,6 +371,7 @@ def process_flows(self): # We get all the flows so far # because this retraining happens in batches flows = self.db.get_all_flows() + # Check how many different labels are in the DB # We need both normal and malware labels = self.db.get_labels() @@ -252,7 +391,9 @@ 
def process_flows(self): "daddr": "40.70.224.145", "dport": "443", "proto": "tcp", + "origstate": "SRPA_SPA", "state": "Established", + "pkts": 84, "allbytes": 42764, "spkts": 37, "sbytes": 25517, @@ -272,7 +413,9 @@ def process_flows(self): "daddr": "80.242.138.72", "dport": "80", "proto": "tcp", + "origstate": "SRPA_SPA", "state": "Established", + "pkts": 67, "allbytes": 67696, "spkts": 1, "sbytes": 100, @@ -298,55 +441,42 @@ def process_flows(self): self.print("Error in process_flows()") self.print(traceback.format_exc(), 0, 1) - def process_flow(self, flow_to_process: dict): + def process_flow(self): """ Process one flow. Only used during detection in testing - returns the pandas df with the processed flow + Store the pandas df in self.flow """ try: # Convert the flow to a pandas dataframe - raw_flow = pd.DataFrame(flow_to_process, index=[0]) + raw_flow = pd.DataFrame(self.flow_dict, index=[0]) + # Process features dflow = self.process_features(raw_flow) # Update the flow to the processed version - return dflow + self.flow = dflow except Exception: # Stop the timer self.print("Error in process_flow()") self.print(traceback.format_exc(), 0, 1) - def detect(self, x_flow) -> Optional[numpy.ndarray]: + def detect(self): """ - Detects the given flow with the current model stored - and returns the predection array + Detect this flow with the current model stored """ try: - given_x_flow = x_flow - # clean the flow - fields_to_drop = [ - "label", - "module_labels", - "uid", - "history", - "dir_", - "dbytes", - "dpkts", - "endtime", - "bytes", - "flow_source", - ] - for field in fields_to_drop: - try: - x_flow = x_flow.drop(field, axis=1) - except (KeyError, ValueError): - pass + # Store the real label if there is one + y_flow = self.flow["label"] + # remove the real label column + self.flow = self.flow.drop("label", axis=1) + # remove the label predictions column of the other modules + X_flow = self.flow.drop("module_labels", axis=1) # Scale the flow - x_flow: numpy.ndarray = self.scaler.transform(x_flow) - pred: numpy.ndarray = self.clf.predict(x_flow) + X_flow = self.scaler.transform(X_flow) + pred = self.clf.predict(X_flow) return pred - except Exception as e: - self.print( - f"Error in detect() while processing " f"\n{given_x_flow}\n{e}" - ) + except Exception: + # Stop the timer + self.print("Error in detect() X_flow:") + self.print(X_flow) self.print(traceback.format_exc(), 0, 1) def store_model(self): @@ -354,10 +484,10 @@ def store_model(self): Store the trained model on disk """ self.print("Storing the trained model and scaler on disk.", 0, 2) - with open(self.model_path, "wb") as f: + with open("./modules/flowmldetection/model.bin", "wb") as f: data = pickle.dumps(self.clf) f.write(data) - with open(self.scaler_path, "wb") as g: + with open("./modules/flowmldetection/scaler.bin", "wb") as g: data = pickle.dumps(self.scaler) g.write(data) @@ -367,23 +497,20 @@ def read_model(self): """ try: self.print("Reading the trained model from disk.", 0, 2) - with open(self.model_path, "rb") as f: + with open("./modules/flowmldetection/model.bin", "rb") as f: self.clf = pickle.load(f) self.print("Reading the trained scaler from disk.", 0, 2) - with open(self.scaler_path, "rb") as g: + with open("./modules/flowmldetection/scaler.bin", "rb") as g: self.scaler = pickle.load(g) except FileNotFoundError: # If there is no model, create one empty - self.print( - "There was no model. " "Creating a new empty model.", 0, 2 - ) + self.print("There was no model. 
Creating a new empty model.", 0, 2) self.clf = SGDClassifier( warm_start=True, loss="hinge", penalty="l1" ) except EOFError: self.print( - "Error reading model from disk. " - "Creating a new empty model.", + "Error reading model from disk. Creating a new empty model.", 0, 2, ) @@ -391,40 +518,39 @@ def read_model(self): warm_start=True, loss="hinge", penalty="l1" ) - def set_evidence_malicious_flow(self, flow: dict, twid: str): + def set_evidence_malicious_flow( + self, + saddr: str, + sport: str, + daddr: str, + dport: str, + twid: str, + uid: str, + ): confidence: float = 0.1 + ip_identification = self.db.get_ip_identification(daddr) description = ( - f"Malicious flow by ML. Src IP" - f" {flow['saddr']}:{flow['sport']} to " - f"{flow['daddr']}:{flow['dport']}" + f"Malicious flow by ML. Src IP {saddr}:{sport} to " + f"{daddr}:{dport} {ip_identification}" ) timestamp = utils.convert_format( datetime.datetime.now(), utils.alerts_format ) - twid_number = int(twid.replace("timewindow", "")) + evidence: Evidence = Evidence( evidence_type=EvidenceType.MALICIOUS_FLOW, attacker=Attacker( - direction=Direction.SRC, - attacker_type=IoCType.IP, - value=flow["saddr"], - ), - victim=Victim( - direction=Direction.DST, - victim_type=IoCType.IP, - value=flow["daddr"], + direction=Direction.SRC, attacker_type=IoCType.IP, value=saddr ), threat_level=ThreatLevel.LOW, confidence=confidence, description=description, - profile=ProfileID(ip=flow["saddr"]), - timewindow=TimeWindow(twid_number), - uid=[flow["uid"]], + profile=ProfileID(ip=saddr), + timewindow=TimeWindow(number=int(twid.replace("timewindow", ""))), + uid=[uid], timestamp=timestamp, - method=Method.AI, - src_port=flow["sport"], - dst_port=flow["dport"], + category=IDEACategory.ANOMALY_TRAFFIC, ) self.db.set_evidence(evidence) @@ -441,22 +567,20 @@ def pre_main(self): def main(self): if msg := self.get_msg("new_flow"): - msg = json.loads(msg["data"]) - twid = msg["twid"] - self.flow = msg["flow"] - # these fields are expected in testing. update the original - # flow dict to have them - self.flow.update( - { - "allbytes": (self.flow["sbytes"] + self.flow["dbytes"]), - # the flow["state"] is the origstate, we dont need that here - # we need the interpreted state - "state": msg["interpreted_state"], - "pkts": self.flow["spkts"] + self.flow["dpkts"], - "label": msg["label"], - "module_labels": msg["module_labels"], - } - ) + data = msg["data"] + # Convert from json to dict + data = json.loads(data) + profileid = data["profileid"] + twid = data["twid"] + # Get flow that is now in json format + flow = data["flow"] + # Convert flow to a dict + flow = json.loads(flow) + # Convert the common fields to something that can + # be interpreted + # Get the uid which is the key + uid = next(iter(flow)) + self.flow_dict = json.loads(flow[uid]) if self.mode == "train": # We are training @@ -469,57 +593,51 @@ def main(self): sum_labeled_flows >= self.minimum_lables_to_retrain and sum_labeled_flows % self.minimum_lables_to_retrain == 1 ): - # We get here every 'self.minimum_lables_to_retrain' - # amount of labels - # So for example we retrain every 100 labels and only when - # we have at least 100 labels + # We get here every 'self.minimum_lables_to_retrain' amount of labels + # So for example we retrain every 100 labels and only when we have at least 100 labels self.print( - f"Training the model with the last group of " - f"flows and labels. Total flows: {sum_labeled_flows}." + f"Training the model with the last group of flows and labels. 
Total flows: {sum_labeled_flows}." ) - # Process all flows in the DB and make them ready - # for pandas + # Process all flows in the DB and make them ready for pandas self.process_flows() # Train an algorithm self.train() elif self.mode == "test": # We are testing, which means using the model to detect - processed_flow = self.process_flow(self.flow) + self.process_flow() - # After processing the flow, it may happen that we - # delete icmp/arp/etc so the dataframe can be empty - if processed_flow is not None and not processed_flow.empty: + # After processing the flow, it may happen that we delete icmp/arp/etc + # so the dataframe can be empty + if self.flow is not None and not self.flow.empty: # Predict - pred: numpy.ndarray = self.detect(processed_flow) - if not pred: - # an error occurred - return + pred = self.detect() + label = self.flow_dict["label"] - label = self.flow["label"] + # Report if label and label != "unknown" and label != pred[0]: - # If the user specified a label in test mode, - # and the label is diff from the prediction, - # print in debug mode + # If the user specified a label in test mode, and the label + # is diff from the prediction, print in debug mode self.print( - f"Report Prediction {pred[0]} for label" - f' {label} flow {self.flow["saddr"]}:' - f'{self.flow["sport"]} ->' - f' {self.flow["daddr"]}:' - f'{self.flow["dport"]}/' - f'{self.flow["proto"]}', + f'Report Prediction {pred[0]} for label {label} flow {self.flow_dict["saddr"]}:' + f'{self.flow_dict["sport"]} -> {self.flow_dict["daddr"]}:' + f'{self.flow_dict["dport"]}/{self.flow_dict["proto"]}', 0, 3, ) if pred[0] == "Malware": # Generate an alert - self.set_evidence_malicious_flow(self.flow, twid) + self.set_evidence_malicious_flow( + self.flow_dict["saddr"], + self.flow_dict["sport"], + self.flow_dict["daddr"], + self.flow_dict["dport"], + twid, + uid, + ) self.print( - f"Prediction {pred[0]} for label {label}" - f' flow {self.flow["saddr"]}:' - f'{self.flow["sport"]} -> ' - f'{self.flow["daddr"]}:' - f'{self.flow["dport"]}/' - f'{self.flow["proto"]}', + f'Prediction {pred[0]} for label {label} flow {self.flow_dict["saddr"]}:' + f'{self.flow_dict["sport"]} -> {self.flow_dict["daddr"]}:' + f'{self.flow_dict["dport"]}/{self.flow_dict["proto"]}', 0, 2, ) From 0e07e32ecc9922fb33f034bf05c3f8888b0938ab Mon Sep 17 00:00:00 2001 From: alya Date: Tue, 30 Jul 2024 14:35:21 +0300 Subject: [PATCH 333/498] state_handler: split get_final_state_from_flags() into smaller functions --- slips_files/common/state_handler.py | 67 ++++++++++++++++------------- 1 file changed, 38 insertions(+), 29 deletions(-) diff --git a/slips_files/common/state_handler.py b/slips_files/common/state_handler.py index d0a05115b..b671a09a2 100644 --- a/slips_files/common/state_handler.py +++ b/slips_files/common/state_handler.py @@ -1,7 +1,9 @@ from typing import Optional +import sys +import traceback -def interpret_suricata_states(state) -> Optional[str]: +def check_suricata_states(state) -> Optional[str]: """ There are different states in which a flow can be. Suricata distinguishes three flow-states for TCP and two for @@ -16,7 +18,7 @@ def interpret_suricata_states(state) -> Optional[str]: return "Not Established" -def interpret_zeek_states(state) -> Optional[str]: +def check_zeek_states(state) -> Optional[str]: # We have varius type of states depending on the type of flow. 
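# A condensed sketch of the test-mode decision in the previous patch's
# main(): a flow whose ground-truth label disagrees with the prediction is
# only printed in debug output, while a "Malware" prediction additionally
# raises evidence. Both branches can fire for the same flow.
def reactions(pred: str, label: str) -> list:
    actions = []
    if label and label != "unknown" and label != pred:
        actions.append("debug-print the mismatch")
    if pred == "Malware":
        actions.append("set evidence for the malicious flow")
    return actions

assert reactions("Malware", "Normal") == [
    "debug-print the mismatch",
    "set evidence for the malicious flow",
]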
# For Zeek if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): @@ -25,13 +27,9 @@ def interpret_zeek_states(state) -> Optional[str]: return "Established" -def interpret_argus_states(state) -> Optional[str]: +def check_argus_states(state) -> Optional[str]: pre = state.split("_")[0] - try: - suf = state.split("_")[1] - except IndexError: - return - + suf = state.split("_")[1] if "S" in pre and "A" in pre and "S" in suf and "A" in suf: """ Examples: @@ -88,7 +86,7 @@ def interpret_argus_states(state) -> Optional[str]: return "Not Established" -def interpret_tcp_states(state, pkts) -> Optional[str]: +def check_tcp_states(state, pkts) -> Optional[str]: pre = state.split("_")[0] if "EST" in pre: # TCP @@ -124,7 +122,7 @@ def interpret_tcp_states(state, pkts) -> Optional[str]: return "Not Established" -def interpret_udp_states(state) -> Optional[str]: +def check_udp_states(state) -> Optional[str]: pre = state.split("_")[0] if "CON" in pre: # UDP @@ -136,7 +134,7 @@ def interpret_udp_states(state) -> Optional[str]: return "Not Established" -def interpret_icmp_states(state) -> Optional[str]: +def check_icmp_states(state) -> Optional[str]: pre = state.split("_")[0] if "ECO" in pre: # ICMP @@ -146,25 +144,36 @@ def interpret_icmp_states(state) -> Optional[str]: return "Established" -def get_final_state_from_flags(state, pkts) -> str: +def get_final_state_from_flags(self, state, pkts) -> str: """ - Converts the original flags from the flow, to a state that slips - understands - Works with Argus, suricata, and Bro flags - We receive the packets to distinguish some Reset connections + Analyze the flags given and return a summary of the state. + Should work with Argus and Bro flags + We receive the pakets to distinguish some Reset connections """ + try: + if state := check_suricata_states(state): + return state + if state := check_zeek_states(state): + return state + if state := check_argus_states(state): + return state + except IndexError: + # suf does not exist, which means that this is some ICMP or + # no response was sent for UDP or TCP + if state := check_icmp_states(state): + return state + if state := check_udp_states(state): + return state + if state := check_tcp_states(state, pkts): + return state - for interpreter in ( - interpret_suricata_states, - interpret_zeek_states, - interpret_argus_states, - interpret_icmp_states, - interpret_udp_states, - ): - if interpreted_state := interpreter(state): - return interpreted_state - - if interpreted_state := interpret_tcp_states(state, pkts): - return interpreted_state + return "Not Established" - return "Not Established" + except Exception: + exception_line = sys.exc_info()[2].tb_lineno + self.print( + f"Error in get_final_state_from_flags() " f"line {exception_line}", + 0, + 1, + ) + self.print(traceback.format_exc(), 0, 1) From 000e8926166c4c4f4af17b8cf157bf2d37472950 Mon Sep 17 00:00:00 2001 From: alya Date: Tue, 30 Jul 2024 14:48:16 +0300 Subject: [PATCH 334/498] state_handler: refactor get_final_state_from_flags() --- slips_files/common/state_handler.py | 67 +++++++++++++---------------- 1 file changed, 29 insertions(+), 38 deletions(-) diff --git a/slips_files/common/state_handler.py b/slips_files/common/state_handler.py index b671a09a2..d0a05115b 100644 --- a/slips_files/common/state_handler.py +++ b/slips_files/common/state_handler.py @@ -1,9 +1,7 @@ from typing import Optional -import sys -import traceback -def check_suricata_states(state) -> Optional[str]: +def interpret_suricata_states(state) -> Optional[str]: """ There are different 
states in which a flow can be. Suricata distinguishes three flow-states for TCP and two for @@ -18,7 +16,7 @@ def check_suricata_states(state) -> Optional[str]: return "Not Established" -def check_zeek_states(state) -> Optional[str]: +def interpret_zeek_states(state) -> Optional[str]: # We have varius type of states depending on the type of flow. # For Zeek if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): @@ -27,9 +25,13 @@ def check_zeek_states(state) -> Optional[str]: return "Established" -def check_argus_states(state) -> Optional[str]: +def interpret_argus_states(state) -> Optional[str]: pre = state.split("_")[0] - suf = state.split("_")[1] + try: + suf = state.split("_")[1] + except IndexError: + return + if "S" in pre and "A" in pre and "S" in suf and "A" in suf: """ Examples: @@ -86,7 +88,7 @@ def check_argus_states(state) -> Optional[str]: return "Not Established" -def check_tcp_states(state, pkts) -> Optional[str]: +def interpret_tcp_states(state, pkts) -> Optional[str]: pre = state.split("_")[0] if "EST" in pre: # TCP @@ -122,7 +124,7 @@ def check_tcp_states(state, pkts) -> Optional[str]: return "Not Established" -def check_udp_states(state) -> Optional[str]: +def interpret_udp_states(state) -> Optional[str]: pre = state.split("_")[0] if "CON" in pre: # UDP @@ -134,7 +136,7 @@ def check_udp_states(state) -> Optional[str]: return "Not Established" -def check_icmp_states(state) -> Optional[str]: +def interpret_icmp_states(state) -> Optional[str]: pre = state.split("_")[0] if "ECO" in pre: # ICMP @@ -144,36 +146,25 @@ def check_icmp_states(state) -> Optional[str]: return "Established" -def get_final_state_from_flags(self, state, pkts) -> str: +def get_final_state_from_flags(state, pkts) -> str: """ - Analyze the flags given and return a summary of the state. 
- Should work with Argus and Bro flags - We receive the pakets to distinguish some Reset connections + Converts the original flags from the flow, to a state that slips + understands + Works with Argus, suricata, and Bro flags + We receive the packets to distinguish some Reset connections """ - try: - if state := check_suricata_states(state): - return state - if state := check_zeek_states(state): - return state - if state := check_argus_states(state): - return state - except IndexError: - # suf does not exist, which means that this is some ICMP or - # no response was sent for UDP or TCP - if state := check_icmp_states(state): - return state - if state := check_udp_states(state): - return state - if state := check_tcp_states(state, pkts): - return state - return "Not Established" + for interpreter in ( + interpret_suricata_states, + interpret_zeek_states, + interpret_argus_states, + interpret_icmp_states, + interpret_udp_states, + ): + if interpreted_state := interpreter(state): + return interpreted_state + + if interpreted_state := interpret_tcp_states(state, pkts): + return interpreted_state - except Exception: - exception_line = sys.exc_info()[2].tb_lineno - self.print( - f"Error in get_final_state_from_flags() " f"line {exception_line}", - 0, - 1, - ) - self.print(traceback.format_exc(), 0, 1) + return "Not Established" From 0955f66abeb7f5e0f97459abc63d276730ab6868 Mon Sep 17 00:00:00 2001 From: alya Date: Tue, 30 Jul 2024 14:59:22 +0300 Subject: [PATCH 335/498] delete get_final_state_from_flags() from flowmldetection, profiler, and the db --- modules/flowmldetection/flowmldetection.py | 169 +++------------------ 1 file changed, 19 insertions(+), 150 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 124ec61f9..c57a7a358 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -5,9 +5,13 @@ import json import datetime import traceback -import sys +import warnings + -from slips_files.common.imports import * +from slips_files.common.state_handler import get_final_state_from_flags +from slips_files.common.parsers.config_parser import ConfigParser +from slips_files.common.slips_utils import utils +from slips_files.common.abstracts.module import IModule from slips_files.core.evidence_structure.evidence import ( Evidence, ProfileID, @@ -29,8 +33,6 @@ def warn(*args, **kwargs): pass -import warnings - warnings.warn = warn @@ -113,141 +115,6 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) - - def get_final_state_from_flags(self, state, pkts): - """ - Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags - We receive the pakets to distinguish some Reset connections - """ - try: - pre = state.split("_")[0] - try: - # Try suricata states - """ - There are different states in which a flow can be. - Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, - these are: New, Established and Closed,for UDP only new and established. - For each of these states Suricata can employ different timeouts. - """ - if "new" in state or "established" in state: - return "Established" - elif "closed" in state: - return "Not Established" - - # We have varius type of states depending on the type of flow. 
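# A few hedged test cases for the module-level helper produced by the
# refactor above (hypothetical pytest usage; the expected values follow
# the comments in the code, not independently captured traffic):
import pytest
from slips_files.common.state_handler import get_final_state_from_flags

@pytest.mark.parametrize("state,pkts,expected", [
    ("S0", 1, "Not Established"),   # Zeek: no reply seen
    ("SF", 10, "Established"),      # Zeek: normal termination
    ("CON", 4, "Established"),      # Argus UDP
    ("INT", 1, "Not Established"),  # Argus UDP attempt
    ("FIN", 2, "Not Established"),  # TCP single-word state, <= 3 packets
])
def test_get_final_state_from_flags(state, pkts, expected):
    assert get_final_state_from_flags(state, pkts) == expected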
- # For Zeek - if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): - return "Not Established" - elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): - return "Established" - - # For Argus - suf = state.split("_")[1] - if "S" in pre and "A" in pre and "S" in suf and "A" in suf: - """ - Examples: - SA_SA - SR_SA - FSRA_SA - SPA_SPA - SRA_SPA - FSA_FSA - FSA_FSPA - SAEC_SPA - SRPA_SPA - FSPA_SPA - FSRPA_SPA - FSPA_FSPA - FSRA_FSPA - SRAEC_SPA - FSPA_FSRPA - FSAEC_FSPA - FSRPA_FSPA - SRPAEC_SPA - FSPAEC_FSPA - SRPAEC_FSRPA - """ - return "Established" - elif "PA" in pre and "PA" in suf: - # Tipical flow that was reported in the middle - """ - Examples: - PA_PA - FPA_FPA - """ - return "Established" - elif "ECO" in pre: - return "ICMP Echo" - elif "ECR" in pre: - return "ICMP Reply" - elif "URH" in pre: - return "ICMP Host Unreachable" - elif "URP" in pre: - return "ICMP Port Unreachable" - else: - """ - Examples: - S_RA - S_R - A_R - S_SA - SR_SA - FA_FA - SR_RA - SEC_RA - """ - return "Not Established" - except IndexError: - # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP - if "ECO" in pre: - # ICMP - return "Established" - elif "UNK" in pre: - # ICMP6 unknown upper layer - return "Established" - elif "CON" in pre: - # UDP - return "Established" - elif "INT" in pre: - # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there - # is no confirmation of what happened. - return "Not Established" - elif "EST" in pre: - # TCP - return "Established" - elif "RST" in pre: - # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established - # It can happen that is reseted being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. - return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - elif "FIN" in pre: - # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established - # It can happen that is finished being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. 
- return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - else: - """ - Examples: - S_ - FA_ - PA_ - FSA_ - SEC_ - SRPA_ - """ - return "Not Established" - except Exception: - exception_line = sys.exc_info()[2].tb_lineno - self.print( - f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", - 0, - 1, - ) - self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ @@ -281,12 +148,17 @@ def process_features(self, dataset): except ValueError: pass - # When flows are read from Slips sqlite, the state is not transformed to 'Established' or 'Not Established', it is still 'S0' and others + # When flows are read from Slips sqlite, + # the state is not transformed to 'Established' or + # 'Not Established', it is still 'S0' and others # So transform here - #new_state_column = dataset.state.apply(self.get_final_state_from_flags(dataset.state, dataset.pkts)) - dataset['state'] = dataset.apply(lambda row: self.get_final_state_from_flags(row['state'], row['pkts']), axis=1) - - #dataset.state = new_state_column + dataset["state"] = dataset.apply( + lambda row: get_final_state_from_flags( + row["state"], row["pkts"] + ), + axis=1, + ) + # dataset.state = new_state_column # Convert state to categorical dataset.state = dataset.state.str.replace( @@ -370,7 +242,7 @@ def process_flows(self): try: # We get all the flows so far # because this retraining happens in batches - flows = self.db.get_all_flows() + flows: list = self.db.get_all_flows() # Check how many different labels are in the DB # We need both normal and malware @@ -464,7 +336,7 @@ def detect(self): """ try: # Store the real label if there is one - y_flow = self.flow["label"] + # y_flow = self.flow["label"] # remove the real label column self.flow = self.flow.drop("label", axis=1) # remove the label predictions column of the other modules @@ -568,13 +440,10 @@ def pre_main(self): def main(self): if msg := self.get_msg("new_flow"): data = msg["data"] - # Convert from json to dict data = json.loads(data) - profileid = data["profileid"] + # profileid = data["profileid"] twid = data["twid"] - # Get flow that is now in json format flow = data["flow"] - # Convert flow to a dict flow = json.loads(flow) # Convert the common fields to something that can # be interpreted From 088d9270622d332b34eb39fe23d1e540257188b6 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Mon, 29 Jul 2024 16:36:55 +0200 Subject: [PATCH 336/498] mlflow. 
Ignore UID column --- modules/flowmldetection/flowmldetection.py | 1 + 1 file changed, 1 insertion(+) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index c57a7a358..e2aa1e0ee 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -141,6 +141,7 @@ def process_features(self, dataset): "dpkts", "smac", "dmac", + "uid", ] for field in to_drop: try: From 51f5f2f76934d8add93b8ec09190317d421cdc93 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Sat, 15 Mar 2025 19:23:29 +0100 Subject: [PATCH 337/498] Re add function that alya added --- slips_files/core/database/database_manager.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/slips_files/core/database/database_manager.py b/slips_files/core/database/database_manager.py index 3a7f783ea..0b805976d 100644 --- a/slips_files/core/database/database_manager.py +++ b/slips_files/core/database/database_manager.py @@ -613,6 +613,9 @@ def add_out_dns(self, *args, **kwargs): def add_port(self, *args, **kwargs): return self.rdb.add_port(*args, **kwargs) + def get_final_state_from_flags(self, *args, **kwargs): + return self.rdb.get_final_state_from_flags(*args, **kwargs) + def add_ips(self, *args, **kwargs): return self.rdb.add_ips(*args, **kwargs) From 38c5d55481cc57d81ccba540ffbb2d4811c39e6d Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Sat, 15 Mar 2025 19:27:23 +0100 Subject: [PATCH 338/498] Delete file that was deleted from develop --- slips_files/common/state_handler.py | 170 ---------------------------- 1 file changed, 170 deletions(-) delete mode 100644 slips_files/common/state_handler.py diff --git a/slips_files/common/state_handler.py b/slips_files/common/state_handler.py deleted file mode 100644 index d0a05115b..000000000 --- a/slips_files/common/state_handler.py +++ /dev/null @@ -1,170 +0,0 @@ -from typing import Optional - - -def interpret_suricata_states(state) -> Optional[str]: - """ - There are different states in which a flow can be. - Suricata distinguishes three flow-states for TCP and two for - UDP. For TCP, - these are: New, Established and Closed,for UDP only new and - established. - For each of these states Suricata can employ different timeouts. - """ - if "new" in state or "established" in state: - return "Established" - elif "closed" in state: - return "Not Established" - - -def interpret_zeek_states(state) -> Optional[str]: - # We have varius type of states depending on the type of flow. 
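# Patch 337 above exposes the helper through the database facade. The
# pattern in isolation (class names here are illustrative):
class StoreFacade:
    def __init__(self, backend):
        self.rdb = backend

    def get_final_state_from_flags(self, *args, **kwargs):
        # forward unchanged, so callers keep a single entry point
        return self.rdb.get_final_state_from_flags(*args, **kwargs)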
- # For Zeek - if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): - return "Not Established" - elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): - return "Established" - - -def interpret_argus_states(state) -> Optional[str]: - pre = state.split("_")[0] - try: - suf = state.split("_")[1] - except IndexError: - return - - if "S" in pre and "A" in pre and "S" in suf and "A" in suf: - """ - Examples: - SA_SA - SR_SA - FSRA_SA - SPA_SPA - SRA_SPA - FSA_FSA - FSA_FSPA - SAEC_SPA - SRPA_SPA - FSPA_SPA - FSRPA_SPA - FSPA_FSPA - FSRA_FSPA - SRAEC_SPA - FSPA_FSRPA - FSAEC_FSPA - FSRPA_FSPA - SRPAEC_SPA - FSPAEC_FSPA - SRPAEC_FSRPA - """ - return "Established" - elif "PA" in pre and "PA" in suf: - # Tipical flow that was reported in the middle - """ - Examples: - PA_PA - FPA_FPA - """ - return "Established" - elif "ECO" in pre: - return "ICMP Echo" - elif "ECR" in pre: - return "ICMP Reply" - elif "URH" in pre: - return "ICMP Host Unreachable" - elif "URP" in pre: - return "ICMP Port Unreachable" - else: - """ - Examples: - S_RA - S_R - A_R - S_SA - SR_SA - FA_FA - SR_RA - SEC_RA - """ - return "Not Established" - - -def interpret_tcp_states(state, pkts) -> Optional[str]: - pre = state.split("_")[0] - if "EST" in pre: - # TCP - return "Established" - elif "RST" in pre: - # TCP. When -z B is not used in argus, states are single words. - # Most connections are reseted when finished and therefore are - # established - # It can happen that is reseted being not established, but we - # can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is - # not established because the OS retries 3 times. - return "Not Established" if int(pkts) <= 3 else "Established" - elif "FIN" in pre: - # TCP. When -z B is not used in argus, states are single words. - # Most connections are finished with FIN when finished and - # therefore are established - # It can happen that is finished being not established, but we - # can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is - # not established because the OS retries 3 times. - return "Not Established" if int(pkts) <= 3 else "Established" - else: - """ - Examples: - S_ - FA_ - PA_ - FSA_ - SEC_ - SRPA_ - """ - return "Not Established" - - -def interpret_udp_states(state) -> Optional[str]: - pre = state.split("_")[0] - if "CON" in pre: - # UDP - return "Established" - elif "INT" in pre: - # UDP trying to connect, NOT preciselly not established but also - # NOT 'Established'. So we considered not established because there - # is no confirmation of what happened. 
- return "Not Established" - - -def interpret_icmp_states(state) -> Optional[str]: - pre = state.split("_")[0] - if "ECO" in pre: - # ICMP - return "Established" - elif "UNK" in pre: - # ICMP6 unknown upper layer - return "Established" - - -def get_final_state_from_flags(state, pkts) -> str: - """ - Converts the original flags from the flow, to a state that slips - understands - Works with Argus, suricata, and Bro flags - We receive the packets to distinguish some Reset connections - """ - - for interpreter in ( - interpret_suricata_states, - interpret_zeek_states, - interpret_argus_states, - interpret_icmp_states, - interpret_udp_states, - ): - if interpreted_state := interpreter(state): - return interpreted_state - - if interpreted_state := interpret_tcp_states(state, pkts): - return interpreted_state - - return "Not Established" From c15b430c419997b224a9ef1b4d5a8cd99195d0b8 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Sat, 15 Mar 2025 19:32:01 +0100 Subject: [PATCH 339/498] Flowmldetection. Fix missing db reference --- modules/flowmldetection/flowmldetection.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index e2aa1e0ee..9269b6701 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -154,7 +154,7 @@ def process_features(self, dataset): # 'Not Established', it is still 'S0' and others # So transform here dataset["state"] = dataset.apply( - lambda row: get_final_state_from_flags( + lambda row: self.db.get_final_state_from_flags( row["state"], row["pkts"] ), axis=1, From dc2ced3b23a3dac2e11b8d71a3d3bb236d7a7703 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Tue, 18 Mar 2025 12:08:08 +0100 Subject: [PATCH 340/498] Fix the training of flows with ML in new version --- modules/flowmldetection/flowmldetection.py | 378 +++++++++++---------- 1 file changed, 197 insertions(+), 181 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 9269b6701..e6ea0b517 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -1,18 +1,20 @@ +# SPDX-FileCopyrightText: 2021 Sebastian Garcia +from typing import Optional + +# SPDX-License-Identifier: GPL-2.0-only +import numpy from sklearn.linear_model import SGDClassifier from sklearn.preprocessing import StandardScaler import pickle import pandas as pd import json -import datetime import traceback import warnings - -from slips_files.common.state_handler import get_final_state_from_flags from slips_files.common.parsers.config_parser import ConfigParser from slips_files.common.slips_utils import utils from slips_files.common.abstracts.module import IModule -from slips_files.core.evidence_structure.evidence import ( +from slips_files.core.structures.evidence import ( Evidence, ProfileID, TimeWindow, @@ -21,7 +23,8 @@ EvidenceType, IoCType, Direction, - IDEACategory, + Victim, + Method, ) # Only for debbuging @@ -52,36 +55,41 @@ def init(self): # Set the output queue of our database instance # Read the configuration self.read_configuration() - # Minum amount of new lables needed to trigger the train - self.minimum_lables_to_retrain = 50 + # Minum amount of new labels needed to start the train + self.minimum_labels_to_start_train = 50 + # Minum amount of new labels needed to retrain + self.minimum_labels_to_retrain = 50 + # The number of flows when last trained + 
self.last_number_of_flows_when_trained = 0 # To plot the scores of training # self.scores = [] # The scaler trained during training and to use during testing self.scaler = StandardScaler() + self.model_path = "./modules/flowmldetection/model.bin" + self.scaler_path = "./modules/flowmldetection/scaler.bin" def read_configuration(self): conf = ConfigParser() self.mode = conf.get_ml_mode() + self.label = conf.label() def train(self): """ Train a model based on the flows we receive and the labels """ try: - # Process the labels to have only Normal and Malware - self.flows.label = self.flows.label.str.replace( - r"(^.*ormal.*$)", "Normal", regex=True - ) - self.flows.label = self.flows.label.str.replace( - r"(^.*alware.*$)", "Malware", regex=True - ) - self.flows.label = self.flows.label.str.replace( - r"(^.*alicious.*$)", "Malware", regex=True - ) + # Get the flows from the DB + # self.flows = self.db.get_all_flows_in_profileid_twid(self.profileid, self.twid) + # Convert to pandas df + # self.flows = pd.DataFrame(self.flows) + # Process the features + # X_flow = self.process_features(self.flows) - # Separate - y_flow = self.flows["label"] + # Create X_flow with the current flows minus the label X_flow = self.flows.drop("label", axis=1) + # Create y_flow with the label + y_flow = numpy.full(X_flow.shape[0], self.label) + # Drop the module_labels X_flow = X_flow.drop("module_labels", axis=1) # Normalize this batch of data so far. This can get progressivle slow @@ -90,7 +98,7 @@ def train(self): # Train try: self.clf.partial_fit( - X_flow, y_flow, classes=["Malware", "Normal"] + X_flow, y_flow, classes=["Malicious", "Benign"] ) except Exception: self.print("Error while calling clf.train()") @@ -113,7 +121,7 @@ def train(self): self.store_model() except Exception: - self.print("Error in train()", 0, 1) + self.print("Error in train().", 0, 1) self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): @@ -123,7 +131,7 @@ def process_features(self, dataset): """ try: # Discard some type of flows that dont have ports - to_discard = ["arp", "ARP", "icmp", "igmp", "ipv6-icmp"] + to_discard = ["arp", "ARP", "icmp", "igmp", "ipv6-icmp", ""] for proto in to_discard: dataset = dataset[dataset.proto != proto] @@ -132,21 +140,20 @@ def process_features(self, dataset): "appproto", "daddr", "saddr", - "ts", - "origstate", + "starttime", "type_", - "dir_", - "history", - "dbytes", - "dpkts", "smac", "dmac", + "history", "uid", + "dir_", + "endtime", + "flow_source", ] for field in to_drop: try: dataset = dataset.drop(field, axis=1) - except ValueError: + except (ValueError, KeyError): pass # When flows are read from Slips sqlite, @@ -155,11 +162,10 @@ def process_features(self, dataset): # So transform here dataset["state"] = dataset.apply( lambda row: self.db.get_final_state_from_flags( - row["state"], row["pkts"] + row["state"], (row["spkts"] + row["dpkts"]) ), axis=1, ) - # dataset.state = new_state_column # Convert state to categorical dataset.state = dataset.state.str.replace( @@ -193,58 +199,42 @@ def process_features(self, dataset): dataset.proto = dataset.proto.str.replace( r"(^.*arp.*$)", "4", regex=True ) - dataset.proto = dataset.proto.astype("float64") - try: - # Convert dport to float - dataset.dport = dataset.dport.astype("float") - except ValueError: - pass - try: - # Convert sport to float - dataset.sport = dataset.sport.astype("float") - except ValueError: - pass - try: - # Convert Dur to float - dataset.dur = dataset.dur.astype("float") - except ValueError: - pass - try: - # 
Convert TotPkts to float - dataset.pkts = dataset.pkts.astype("float") - except ValueError: - pass - try: - # Convert SrcPkts to float - dataset.spkts = dataset.spkts.astype("float") - except ValueError: - pass - try: - # Convert TotBytes to float - dataset.allbytes = dataset.allbytes.astype("float") - except ValueError: - pass - try: - # Convert SrcBytes to float - dataset.sbytes = dataset.sbytes.astype("float") - except ValueError: - pass + + dataset["allbytes"] = dataset["sbytes"] + dataset["dbytes"] + dataset["pkts"] = dataset["spkts"] + dataset["dpkts"] + + fields_to_convert_to_float = [ + dataset.proto, + dataset.dport, + dataset.sport, + dataset.dur, + dataset.pkts, + dataset.spkts, + dataset.allbytes, + dataset.sbytes, + dataset.state, + ] + for field in fields_to_convert_to_float: + try: + field = field.astype("float64") + except (ValueError, AttributeError): + pass + return dataset except Exception: # Stop the timer self.print("Error in process_features()") self.print(traceback.format_exc(), 0, 1) - def process_flows(self): + def process_training_flows(self): """ - Process all the flwos in the DB + Process all the flows in the DB Store the pandas df in self.flows """ try: # We get all the flows so far # because this retraining happens in batches - flows: list = self.db.get_all_flows() - + flows = self.db.get_all_flows() # Check how many different labels are in the DB # We need both normal and malware labels = self.db.get_labels() @@ -254,48 +244,48 @@ def process_flows(self): # that are fake but representative of a normal and malware flow # they are only for the training process # At least 1 flow of each label is required - # self.print(f'Amount of labeled flows: {labels}', 0, 1) + + # These flows should be in the same format as the ones in the DB. + # Which means the satate is still SF, S0, etc. flows.append( { - "ts": 1594417039.029793, + "starttime": 1594417039.029793, "dur": "1.9424750804901123", "saddr": "10.7.10.101", "sport": "49733", "daddr": "40.70.224.145", "dport": "443", "proto": "tcp", - "origstate": "SRPA_SPA", - "state": "Established", - "pkts": 84, - "allbytes": 42764, - "spkts": 37, + "state": "SF", + "spkts": 17, + "dpkts": 27, "sbytes": 25517, + "dbytes": 17247, "appproto": "ssl", - "label": "Malware", + "label": "Malicious", "module_labels": { - "flowalerts-long-connection": "Malware" + "flowalerts-long-connection": "Malicious" }, } ) flows.append( { - "ts": 1382355032.706468, + "starttime": 1382355032.706468, "dur": "10.896695", "saddr": "147.32.83.52", "sport": "47956", "daddr": "80.242.138.72", "dport": "80", "proto": "tcp", - "origstate": "SRPA_SPA", - "state": "Established", - "pkts": 67, - "allbytes": 67696, + "state": "SF", "spkts": 1, + "dpkts": 0, "sbytes": 100, + "dbytes": 67596, "appproto": "http", - "label": "Normal", + "label": "Benign", "module_labels": { - "flowalerts-long-connection": "Normal" + "flowalerts-long-connection": "Benign" }, } ) @@ -314,42 +304,51 @@ def process_flows(self): self.print("Error in process_flows()") self.print(traceback.format_exc(), 0, 1) - def process_flow(self): + def process_flow(self, flow_to_process: dict): """ Process one flow. 
Only used during detection in testing - Store the pandas df in self.flow + returns the pandas df with the processed flow """ try: # Convert the flow to a pandas dataframe - raw_flow = pd.DataFrame(self.flow_dict, index=[0]) - # Process features + raw_flow = pd.DataFrame(flow_to_process, index=[0]) dflow = self.process_features(raw_flow) # Update the flow to the processed version - self.flow = dflow + return dflow except Exception: # Stop the timer self.print("Error in process_flow()") self.print(traceback.format_exc(), 0, 1) - def detect(self): + def detect(self, x_flow) -> Optional[numpy.ndarray]: """ - Detect this flow with the current model stored + Detects the given flow with the current model stored + and returns the predection array """ try: - # Store the real label if there is one - # y_flow = self.flow["label"] - # remove the real label column - self.flow = self.flow.drop("label", axis=1) - # remove the label predictions column of the other modules - X_flow = self.flow.drop("module_labels", axis=1) + # clean the flow + fields_to_drop = [ + "label", + "module_labels", + "uid", + "history", + "dir_", + "endtime", + "flow_source", + ] + for field in fields_to_drop: + try: + x_flow = x_flow.drop(field, axis=1) + except (KeyError, ValueError): + pass # Scale the flow - X_flow = self.scaler.transform(X_flow) - pred = self.clf.predict(X_flow) + x_flow: numpy.ndarray = self.scaler.transform(x_flow) + pred: numpy.ndarray = self.clf.predict(x_flow) return pred - except Exception: - # Stop the timer - self.print("Error in detect() X_flow:") - self.print(X_flow) + except Exception as e: + self.print( + f"Error in detect() while processing " f"\n{x_flow}\n{e}" + ) self.print(traceback.format_exc(), 0, 1) def store_model(self): @@ -357,10 +356,10 @@ def store_model(self): Store the trained model on disk """ self.print("Storing the trained model and scaler on disk.", 0, 2) - with open("./modules/flowmldetection/model.bin", "wb") as f: + with open(self.model_path, "wb") as f: data = pickle.dumps(self.clf) f.write(data) - with open("./modules/flowmldetection/scaler.bin", "wb") as g: + with open(self.scaler_path, "wb") as g: data = pickle.dumps(self.scaler) g.write(data) @@ -370,20 +369,23 @@ def read_model(self): """ try: self.print("Reading the trained model from disk.", 0, 2) - with open("./modules/flowmldetection/model.bin", "rb") as f: + with open(self.model_path, "rb") as f: self.clf = pickle.load(f) self.print("Reading the trained scaler from disk.", 0, 2) - with open("./modules/flowmldetection/scaler.bin", "rb") as g: + with open(self.scaler_path, "rb") as g: self.scaler = pickle.load(g) except FileNotFoundError: # If there is no model, create one empty - self.print("There was no model. Creating a new empty model.", 0, 2) + self.print( + "There was no model. " "Creating a new empty model.", 0, 2 + ) self.clf = SGDClassifier( warm_start=True, loss="hinge", penalty="l1" ) except EOFError: self.print( - "Error reading model from disk. Creating a new empty model.", + "Error reading model from disk. " + "Creating a new empty model.", 0, 2, ) @@ -391,39 +393,36 @@ def read_model(self): warm_start=True, loss="hinge", penalty="l1" ) - def set_evidence_malicious_flow( - self, - saddr: str, - sport: str, - daddr: str, - dport: str, - twid: str, - uid: str, - ): + def set_evidence_malicious_flow(self, flow: dict, twid: str): confidence: float = 0.1 - ip_identification = self.db.get_ip_identification(daddr) description = ( - f"Malicious flow by ML. 
Src IP {saddr}:{sport} to " - f"{daddr}:{dport} {ip_identification}" - ) - - timestamp = utils.convert_format( - datetime.datetime.now(), utils.alerts_format + f"Flow with malicious characteristics by ML. Src IP" + f" {flow['saddr']}:{flow['sport']} to " + f"{flow['daddr']}:{flow['dport']}" ) - + twid_number = int(twid.replace("timewindow", "")) evidence: Evidence = Evidence( evidence_type=EvidenceType.MALICIOUS_FLOW, attacker=Attacker( - direction=Direction.SRC, attacker_type=IoCType.IP, value=saddr + direction=Direction.SRC, + ioc_type=IoCType.IP, + value=flow["saddr"], + ), + victim=Victim( + direction=Direction.DST, + ioc_type=IoCType.IP, + value=flow["daddr"], ), threat_level=ThreatLevel.LOW, confidence=confidence, description=description, - profile=ProfileID(ip=saddr), - timewindow=TimeWindow(number=int(twid.replace("timewindow", ""))), - uid=[uid], - timestamp=timestamp, - category=IDEACategory.ANOMALY_TRAFFIC, + profile=ProfileID(ip=flow["saddr"]), + timewindow=TimeWindow(twid_number), + uid=[flow["uid"]], + timestamp=flow["starttime"], + method=Method.AI, + src_port=flow["sport"], + dst_port=flow["dport"], ) self.db.set_evidence(evidence) @@ -440,17 +439,20 @@ def pre_main(self): def main(self): if msg := self.get_msg("new_flow"): - data = msg["data"] - data = json.loads(data) - # profileid = data["profileid"] - twid = data["twid"] - flow = data["flow"] - flow = json.loads(flow) - # Convert the common fields to something that can - # be interpreted - # Get the uid which is the key - uid = next(iter(flow)) - self.flow_dict = json.loads(flow[uid]) + # When a new flow arrives + msg = json.loads(msg["data"]) + self.twid = msg["twid"] + self.profileid = msg["profileid"] + self.flow = msg["flow"] + # These following extra fields are expected in testing. update the original + # flow dict to have them + self.flow.update( + { + "state": msg["interpreted_state"], + "label": msg["label"], + "module_labels": msg["module_labels"], + } + ) if self.mode == "train": # We are training @@ -459,55 +461,69 @@ def main(self): # Use labeled flows labels = self.db.get_labels() sum_labeled_flows = sum(i[1] for i in labels) + + # The min labels to retrain is the min number of flows + # we should have seen so far in this capture to start training + # This is so we dont _start_ training with only 1 flow + + # Once we are over the start minimum, the second condition is + # to force to retrain every a minimum_labels_to_retrain number + # of flows. So we dont retrain every 1 flow. if ( - sum_labeled_flows >= self.minimum_lables_to_retrain - and sum_labeled_flows % self.minimum_lables_to_retrain == 1 + sum_labeled_flows >= self.minimum_labels_to_start_train ): - # We get here every 'self.minimum_lables_to_retrain' amount of labels - # So for example we retrain every 100 labels and only when we have at least 100 labels - self.print( - f"Training the model with the last group of flows and labels. Total flows: {sum_labeled_flows}." - ) - # Process all flows in the DB and make them ready for pandas - self.process_flows() - # Train an algorithm - self.train() + if (sum_labeled_flows - self.last_number_of_flows_when_trained >= self.minimum_labels_to_retrain): + # So for example we retrain every 50 labels and only when + # we have at least 50 labels + self.print( + f"Training the model with the last group of " + f"flows and labels. Total flows: {sum_labeled_flows}." 
+                    )
+                    # Process all flows in the DB and make them ready
+                    # for pandas
+                    self.process_training_flows()
+                    # Train an algorithm
+                    self.train()
+                    self.last_number_of_flows_when_trained = sum_labeled_flows
+
         elif self.mode == "test":
             # We are testing, which means using the model to detect
-            self.process_flow()
+            processed_flow = self.process_flow(self.flow)

-            # After processing the flow, it may happen that we delete icmp/arp/etc
-            # so the dataframe can be empty
-            if self.flow is not None and not self.flow.empty:
+            # After processing the flow, it may happen that we
+            # delete icmp/arp/etc so the dataframe can be empty
+            if processed_flow is not None and not processed_flow.empty:
                 # Predict
-                pred = self.detect()
-                label = self.flow_dict["label"]
+                pred: numpy.ndarray = self.detect(processed_flow)
+                if not pred:
+                    # an error occurred
+                    return

-                # Report
+                label = self.flow["label"]
                 if label and label != "unknown" and label != pred[0]:
-                    # If the user specified a label in test mode, and the label
-                    # is diff from the prediction, print in debug mode
+                    # If the user specified a label in test mode,
+                    # and the label is diff from the prediction,
+                    # print in debug mode
                     self.print(
-                        f'Report Prediction {pred[0]} for label {label} flow {self.flow_dict["saddr"]}:'
-                        f'{self.flow_dict["sport"]} -> {self.flow_dict["daddr"]}:'
-                        f'{self.flow_dict["dport"]}/{self.flow_dict["proto"]}',
+                        f"Predicted {pred[0]} for ground-truth label"
+                        f' {label}. Flow {self.flow["saddr"]}:'
+                        f'{self.flow["sport"]} ->'
+                        f' {self.flow["daddr"]}:'
+                        f'{self.flow["dport"]}/'
+                        f'{self.flow["proto"]}',
                         0,
                         3,
                     )
-                if pred[0] == "Malware":
+                if pred[0] == "Malicious":
                     # Generate an alert
-                    self.set_evidence_malicious_flow(
-                        self.flow_dict["saddr"],
-                        self.flow_dict["sport"],
-                        self.flow_dict["daddr"],
-                        self.flow_dict["dport"],
-                        twid,
-                        uid,
-                    )
+                    self.set_evidence_malicious_flow(self.flow, self.twid)
                 self.print(
-                    f'Prediction {pred[0]} for label {label} flow {self.flow_dict["saddr"]}:'
-                    f'{self.flow_dict["sport"]} -> {self.flow_dict["daddr"]}:'
-                    f'{self.flow_dict["dport"]}/{self.flow_dict["proto"]}',
+                    f"Prediction {pred[0]} for label {label}"
+                    f' flow {self.flow["saddr"]}:'
+                    f'{self.flow["sport"]} -> '
+                    f'{self.flow["daddr"]}:'
+                    f'{self.flow["dport"]}/'
+                    f'{self.flow["proto"]}',
                     0,
                     2,
                 )

From 76ae27f6a3389245e3fd6365f6176415ae1d7b61 Mon Sep 17 00:00:00 2001
From: Sebas Garcia
Date: Wed, 19 Mar 2025 14:22:38 +0100
Subject: [PATCH 341/498] flowml. If the dataset has one flow and that is
 deleted, then return empty fast.

---
 modules/flowmldetection/flowmldetection.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index e6ea0b517..0fa1e4d76 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -135,6 +135,11 @@ def process_features(self, dataset):
         for proto in to_discard:
             dataset = dataset[dataset.proto != proto]

+        # If the proto is in the list to delete and there is only one flow, then the dataset will be empty
+        if dataset.empty:
+            # DataFrame is empty now, so return empty
+            return dataset
+
         # For now, discard these
         to_drop = [
             "appproto",

From e216d5bce7de6261f5b9f4cf99d5a6212d79338d Mon Sep 17 00:00:00 2001
From: Sebas Garcia
Date: Wed, 19 Mar 2025 14:23:05 +0100
Subject: [PATCH 342/498] flowml. If the dataset is empty.
Return none --- modules/flowmldetection/flowmldetection.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 0fa1e4d76..5c5f9943f 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -318,6 +318,8 @@ def process_flow(self, flow_to_process: dict): # Convert the flow to a pandas dataframe raw_flow = pd.DataFrame(flow_to_process, index=[0]) dflow = self.process_features(raw_flow) + if dflow.empty: + return None # Update the flow to the processed version return dflow except Exception: From 90e2344f104ac3bc43ad17e6c18151b7939764e2 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Wed, 19 Mar 2025 14:27:16 +0100 Subject: [PATCH 343/498] First new version of the model and scaler. Not good yet, but working. --- modules/flowmldetection/model.bin | Bin 1124 -> 1090 bytes modules/flowmldetection/scaler.bin | Bin 890 -> 890 bytes 2 files changed, 0 insertions(+), 0 deletions(-) diff --git a/modules/flowmldetection/model.bin b/modules/flowmldetection/model.bin index a6648cf72179520975b0e9ad1164f7d574e87140..7ea3d27c97ca27abe7bcef4f35f2057da4bf2b0f 100644 GIT binary patch delta 130 zcmV-|0Db@D2*L;ifCQB{u>>gtTsw(FScJ$7xocP4|5 zn$$WZz^$}67u&zYKfDP4vYKo~KL^fCQCUu>>gtlhXnvSy-jcOc_je6TteGguV+y7r>O2=ii6N`@aE8 z4q_05T(v)g91VHfmFeIMvRKFpJJ~89v lBES;fQwQSX>_3x<11kt29Z*;7CnUg=yaQDN?(vfo1TSe6JFWl# diff --git a/modules/flowmldetection/scaler.bin b/modules/flowmldetection/scaler.bin index 17115724b9536f6093f9d72f3b58a5c22c562a9a..bfba4d107224e5e6e5a1e8c8f4d463b48131d111 100644 GIT binary patch delta 290 zcmV+-0p0%k2KolDvjGBX0h6@>Pgl=^Tj5WVAwXGOhP~3C<3D)c8d<%4*gy{6w?@|U z%s@r+gj&Fu5J2_kY{!Qn^*`?T5Tx7$dq73=gj&E@c0d))6KVlt!ao`IN`KC-h(JOu zU`*oR6_Z;6CRfo>BRz3ectB?@J_D&+&OhD+nN>A{w?PryCGcfmcQ~I?m(e2&@mlG zqd?6RN*?ZoGC;4up#2u+D?s5YufU*hus|2)Nzwn}vOwP`V1tl2?mv8p{76u#!9bEa zjm~^LK$BYnCRf}Ju^9H~BS8=X&<+7gXFor!l)9lJXF)UUPqWf%b3r-wJEWTtnm`Ac zf^|*Gc0elJHxDoXo Date: Thu, 20 Mar 2025 13:16:06 +0100 Subject: [PATCH 344/498] model and scaler with 1 malicious and 1 benign --- modules/flowmldetection/model.bin | Bin 1090 -> 1124 bytes modules/flowmldetection/scaler.bin | Bin 890 -> 890 bytes 2 files changed, 0 insertions(+), 0 deletions(-) diff --git a/modules/flowmldetection/model.bin b/modules/flowmldetection/model.bin index 7ea3d27c97ca27abe7bcef4f35f2057da4bf2b0f..0fac693b39f8e2f0e826471e72a52010709a2a4a 100644 GIT binary patch delta 132 zcmX@a@q~k=fn{psMix!x$(NZ_BO`RCb>5jSYK$N!g2|oC+8oRs(l>gtTsw(FScJ$7xocP4|5 zn$$WZz^$}67u&zYKfDP4vYKo~KfmcQ~I?m(e2&@mlG zqd?6RN*?ZoGC;4up#2u+D?s5YufU*hus|2)Nzwn}vOwP`V1tl2?mv8p{76u#!9bEa zjm~^LK$BYnCRf}Ju^9H~BS8=X&<+7gXFor!l)9lJXF)UUPqWf%b3r!tJEWTtnm`Ac zf^|*Gc0elJHxDoXoPgl=^Tj5WVAwXGOhP~3C<3D)c8d<%4*gy{6w?@|U z%s@r+gj&Fu5J2_kY{!Qn^*`?T5Tx7$dq73=gj&E@c0d))6KVlt!ao`IN`KC-h(JOu zU`*oR6_Z;6CRfo>BRz3ectB?@J_D&+&OhD+nN>A{w?PryCGc Date: Thu, 20 Mar 2025 13:16:27 +0100 Subject: [PATCH 345/498] cleaner jupyter --- modules/flowmldetection/flowmldetection.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 5c5f9943f..fe950ed4b 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -343,6 +343,23 @@ def detect(self, x_flow) -> Optional[numpy.ndarray]: "endtime", "flow_source", ] + # For argus binetflows this fails because ther is a field calle bytes that was not in other flows. 
It should be called allbytes. + # Error + ''' [Flow ML Detection] Error in detect() while processing + dur proto sport dport state pkts spkts dpkts bytes sbytes dbytes allbytes + 0 63.822830 0 56119 981 0.0 15 15 0 8764 1887 0 1887 + The feature names should match those that were passed during fit. + Feature names unseen at fit time: + - bytes + ''' + + # IF we delete here the filed bytes the error is + # [Flow ML Detection] Error in detect() while processing + # dur proto sport dport state pkts spkts dpkts sbytes dbytes allbytes + # 0 63.822830 0 56120 980 0.0 15 15 0 1887 0 1887 + # The feature names should match those that were passed during fit. + # Feature names must be in the same order as they were in fit. + for field in fields_to_drop: try: x_flow = x_flow.drop(field, axis=1) From b57b591133d2579418191ead001227c27d258432 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Thu, 20 Mar 2025 22:26:27 +0100 Subject: [PATCH 346/498] New models after 3rd ttrain --- modules/flowmldetection/model.bin | Bin 1124 -> 1124 bytes modules/flowmldetection/scaler.bin | Bin 890 -> 890 bytes 2 files changed, 0 insertions(+), 0 deletions(-) diff --git a/modules/flowmldetection/model.bin b/modules/flowmldetection/model.bin index 0fac693b39f8e2f0e826471e72a52010709a2a4a..5ebcb37205a7a3e8d0a20ab78a1219f0ddf95dd8 100644 GIT binary patch delta 99 zcmaFD@q}YTFtfkevYaFSxkd-_3;rCozh!)2lYmpEbEdGvA?N`Qv**dmWkY#6EnbNU>V$pfHc5?(6z~90v14#>uEKZXcoY2J@~Q; zm@vT916v-#h9y9THMMtpw*Eia3Y9($*ABq#7l@Rev)(_(4lO-NtuB+I1CayN#ekDG F1TN2UERg^J diff --git a/modules/flowmldetection/scaler.bin b/modules/flowmldetection/scaler.bin index 758909b289238ff282b2e056a9b3e83768b8472a..821344a0c69d116622b02e2a0daa1554cb5d308e 100644 GIT binary patch delta 43 zcmV+`0M!5b2KolDfdU!c4zU>a=p#W80?-ZtN@qVmt(3Za=p#W80?-ZtN@qVmt(3Z Date: Wed, 26 Mar 2025 00:08:50 +0100 Subject: [PATCH 347/498] Models after 4th train --- modules/flowmldetection/model.bin | Bin 1124 -> 1124 bytes modules/flowmldetection/scaler.bin | Bin 890 -> 890 bytes 2 files changed, 0 insertions(+), 0 deletions(-) diff --git a/modules/flowmldetection/model.bin b/modules/flowmldetection/model.bin index 5ebcb37205a7a3e8d0a20ab78a1219f0ddf95dd8..3ab5a240bb45f88d026d1d9d1959cfa384e2473b 100644 GIT binary patch delta 120 zcmV-;0EhqN2;>N`Qv+ChN@}Q2`wPIHjIJ0X6&b*sXgPyxz!<>7-o1V`boIY<&cGv* zMA;T^cz6USY()1i3vc*(ICW>9REM~el3OL8yLVz+8BZ7+Y~^P as{<PghU~{@&OfB0!%%<|y61;y>t*IZ8K{*+AXCvU*qB z%0Pe2QM!F)5kR-EtllLL_CF>M2ApwSd_Z`~g8^q;cR(Gb+gDca!aq;!#7^woi9l`m z{4q!A7L!{7CRc2&pGU{qctGGXl*-2r&p+4h*(lZXw?Qu&=iylTwLyoc9%Oz?d_V=A za=mcAXh1m7B9yUmbwL&Oq6=*NZ9yf7dx}OPz(0XAuDu67ghArqkBr`9lats2C|9(o zHuM(pGeE6wqA`7j-#@s^-O-hk(?CdW8YGjz(m<`3N;vgKH9+Ch9fX%7EI^{)^od!9 ovOtBo|Dy%^u0ZgT2LsNy);|K#Ksj}DxjML6cVlBO2|D9RL6T delta 290 zcmV+-0p0%k2KolDvjGBo1(US_Pgl4iHtM!%U_hWRf%FVpXFo>fmcQ~I?m(e2&@mlG zqd?6RN*?ZoGC;4up#2u+D?s5YufU*hus|2)Nzwn}vOwP`V1tl2?mv8p{76u#!9bEa zjm~^LK$BYnCRg4Lu^9H~BS8=X&<+7gXFor!l)9lJXF)RTPqWf%b3r!tJEWTtnm`Ac zf^|*Gc0elJHxDoXoty4FCWD From 259169c206001f6495880b8fcc942fd7b87878e9 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Wed, 26 Mar 2025 08:28:59 +0100 Subject: [PATCH 348/498] Models of ml flow with the first good performance in small tests --- modules/flowmldetection/model.bin | Bin 1124 -> 1124 bytes modules/flowmldetection/scaler.bin | Bin 890 -> 890 bytes 2 files changed, 0 insertions(+), 0 deletions(-) diff --git a/modules/flowmldetection/model.bin b/modules/flowmldetection/model.bin index 3ab5a240bb45f88d026d1d9d1959cfa384e2473b..a6648cf72179520975b0e9ad1164f7d574e87140 100644 GIT 
binary patch delta 121 zcmV-<0EYkM2;>N`Qv+C~&P*9hb`!w*mV~|wLl?l5mFM4w$NRqlOAcXmpyfaAG1(); zYA3+ulRv$a;~zkIU#E>ocI-ba#L|>%Hv~ZN`Y?b6^limYW1McwvlQsk{8#y@u delta 121 zcmV-<0EYkM2;>N`Qv+ChN@}Q2`wPIHjIJ0X6&b*sXgPyxz!<>7-o1V`boIY<&cGv* zMA;T^cz6USY()1i3vc*(ICW>9REM~el3OL8yLVz+8BZ7+Y~^P bs{<fmcQ~I?m(e2&@mlG zqd?6RN*?ZoGC;4up#2u+D?s5YufU*hus|2)Nzwn}vOwP`V1tl2?mv8p{76u#!9bEa zjm~^LK$BYnCRf}Ju^9H~BS8=X&<+7gXFor!l)9lJXF)UUPqWf%b3r-wJEWTtnm`Ac zf^|*Gc0elJHxDoXoPghU~{@&OfB0!%%<|y61;y>t*IZ8K{*+AXCvU*qB z%0Pe2QM!F)5kR-EtllLL_CF>M2ApwSd_Z`~g8^q;cR(Gb+gDca!aq;!#7^woi9l`m z{4q!A7L!{7CRc2&pGU{qctGGXl*-2r&p+4h*(lZXw?Qu&=iylTwLyoc9%Oz?d_V=A za=mcAXh1m7B9yUmbwL&Oq6=*NZ9yf7dx}OPz(0XAuDu67ghArqkBr`9lats2C|9(o zHuM(pGeE6wqA`7j-#@s^-O-hk(?CdW8YGjz(m<`3N;vgKH9+Ch9fX%7EI^{)^od!9 ovOtBo|Dy%^u0ZgT2LsNy);|K#Ksj}DxjML6cVlBO2|D9RL6T From 0789af56c5c7b8d00382002ef30f5b5d30e9a92f Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 16:49:23 +0000 Subject: [PATCH 349/498] Add plot for flowml train scores --- modules/flowmldetection/plot_train_score.py | 56 +++++++++++++++++++++ 1 file changed, 56 insertions(+) create mode 100644 modules/flowmldetection/plot_train_score.py diff --git a/modules/flowmldetection/plot_train_score.py b/modules/flowmldetection/plot_train_score.py new file mode 100644 index 000000000..0b5b5b72b --- /dev/null +++ b/modules/flowmldetection/plot_train_score.py @@ -0,0 +1,56 @@ +import pandas as pd +import matplotlib.pyplot as plt +import re +import sys + +def plot_log_data(file_path): + # Read the log data from the file + with open(file_path, 'r') as file: + log_data = file.read() + + # Define regex pattern to extract relevant data from each line + pattern = r"Background: (\d+). Benign: (\d+). Malicious: (\d+). Total labels: (\d+\.\d+). Score: (\d+\.\d+)" + + # Parse the log file + data = re.findall(pattern, log_data) + + # Convert data to a DataFrame + df = pd.DataFrame(data, columns=["Background", "Benign", "Malicious", "Total labels", "Score"]) + df = df.astype({ + "Background": int, + "Benign": int, + "Malicious": int, + "Total labels": float, + "Score": float + }) + + # Plotting the values + fig, ax1 = plt.subplots(figsize=(10, 6)) + + # Plotting Score on the left y-axis + ax1.plot(df.index, df["Score"], label="Score", color='tab:blue') + ax1.set_xlabel('Index') + ax1.set_ylabel('Score', color='tab:blue') + ax1.tick_params(axis='y', labelcolor='tab:blue') + + # Create the second y-axis for the Total labels + ax2 = ax1.twinx() + ax2.plot(df.index, df["Total labels"], label="Total labels", color='tab:red') + ax2.set_ylabel('Total labels', color='tab:red') + ax2.tick_params(axis='y', labelcolor='tab:red') + + # Adding title and legend + plt.title('Log Data Visualization') + fig.tight_layout() + + # Save plot to a PNG file + plt.savefig('log_data_plot_with_two_scales.png') + + # Display the plot + plt.show() + +# Make sure the file path is passed as an argument +if len(sys.argv) < 2: + print("Please provide the path to the log file as a parameter.") +else: + plot_log_data(sys.argv[1]) From 6c4e7f16e84bc7d501031d7209fc3975087ef1c3 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 16:50:04 +0000 Subject: [PATCH 350/498] Add a log file to store the training data output --- modules/flowmldetection/flowmldetection.py | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index fe950ed4b..60217ada2 100644 --- a/modules/flowmldetection/flowmldetection.py +++ 
b/modules/flowmldetection/flowmldetection.py @@ -68,12 +68,29 @@ def init(self): self.model_path = "./modules/flowmldetection/model.bin" self.scaler_path = "./modules/flowmldetection/scaler.bin" + # Initialize the training log file + self.training_log_path = "./modules/flowmldetection/training.log" + with open(self.training_log_path, "w") as log_file: + log_file.write("Training Log Initialized\n") + def read_configuration(self): conf = ConfigParser() self.mode = conf.get_ml_mode() + # This is the global label in the configuration, + # in case the flows do not have a label themselves self.label = conf.label() - def train(self): + def write_to_training_log(self, message: str): + """ + Write a message to the training log file. + """ + try: + with open(self.training_log_path, "a") as log_file: + log_file.write(message + "\n") + except Exception as e: + self.print(f"Error writing to training log: {e}", 0, 1) + + def train(self, sum_labeled_flows): """ Train a model based on the flows we receive and the labels """ From d1f4f4873e56c4a5ffea27e384d75a244c3dc717 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 16:50:32 +0000 Subject: [PATCH 351/498] Store data in the log file of training --- modules/flowmldetection/flowmldetection.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 60217ada2..6f732da63 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -137,9 +137,13 @@ def train(self, sum_labeled_flows): # Store the models on disk self.store_model() + # Log training information + self.write_to_training_log(f"Training completed. Background: {epoch_label_counts['Background']}. Benign: {epoch_label_counts['Benign']}. Malicious: {epoch_label_counts['Malicious']}. Total labels: {sum_labeled_flows}. Score: {score}") + #self.write_to_training_log(f"Model parameters: {self.clf.coef_}") except Exception: self.print("Error in train().", 0, 1) self.print(traceback.format_exc(), 0, 1) + self.write_to_training_log("Error occurred during training.") def process_features(self, dataset): """ From 38347dcbcd0a5bd2f8f0313160d26aadb4d460aa Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 16:50:53 +0000 Subject: [PATCH 352/498] better comments --- modules/flowmldetection/flowmldetection.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 6f732da63..ed3aecf1b 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -59,10 +59,9 @@ def init(self): self.minimum_labels_to_start_train = 50 # Minum amount of new labels needed to retrain self.minimum_labels_to_retrain = 50 - # The number of flows when last trained + # The number of flows when last trained. 
Used internally only to know
+        # when to retrain
         self.last_number_of_flows_when_trained = 0

From b9ff8e3090942b37c032fb535a31d6518b22fae7 Mon Sep 17 00:00:00 2001
From: Seba Garcia
Date: Sat, 3 May 2025 16:51:30 +0000
Subject: [PATCH 353/498] Fix issue not dropping detailed labels

---
 modules/flowmldetection/flowmldetection.py | 16 ++++++----------
 1 file changed, 6 insertions(+), 10 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index ed3aecf1b..25b30cf51 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -94,23 +94,19 @@ def train(self, sum_labeled_flows):
         Train a model based on the flows we receive and the labels
         """
         try:
-            # Get the flows from the DB
-            # self.flows = self.db.get_all_flows_in_profileid_twid(self.profileid, self.twid)
-            # Convert to pandas df
-            # self.flows = pd.DataFrame(self.flows)
-            # Process the features
-            # X_flow = self.process_features(self.flows)
-
             # Create X_flow with the current flows minus the label
-            X_flow = self.flows.drop("label", axis=1)
-            # Create y_flow with the label
-            y_flow = numpy.full(X_flow.shape[0], self.label)
+            X_flow = self.flows.drop("ground_truth_label", axis=1)
+            # Drop the detailed labels
+            X_flow = X_flow.drop("detailed_ground_truth_label", axis=1)
             # Drop the module_labels
             X_flow = X_flow.drop("module_labels", axis=1)
+            # Create y_flow with the label
+            y_flow = numpy.full(X_flow.shape[0], self.flows.ground_truth_label)

             # Normalize this batch of data so far. This can get progressivle slow
             X_flow = self.scaler.fit_transform(X_flow)
+
             # Train
             try:

From 8da38939309e7bc3cb878b4c4c20ae2dd8bb56e1 Mon Sep 17 00:00:00 2001
From: Seba Garcia
Date: Sat, 3 May 2025 16:51:53 +0000
Subject: [PATCH 354/498] Fix issue that not all labels were given to the
 partial fit

---
 modules/flowmldetection/flowmldetection.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 25b30cf51..b2d0db5e5 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -109,8 +109,9 @@ def train(self, sum_labeled_flows):

         # Train
         try:
+            # Online incremental learning
             self.clf.partial_fit(
-                X_flow, y_flow, classes=["Malicious", "Benign"]
+                X_flow, y_flow, classes=["Background", "Malicious", "Benign"]
             )
         except Exception:
             self.print("Error while calling clf.train()")
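A note on patch 354 above: SGDClassifier.partial_fit() must be given the
complete set of classes on the first call, because a later batch may
contain a label that the first batch did not. A minimal, self-contained
sketch of this behavior (toy feature values, not Slips data; assumes
only numpy and scikit-learn):

import numpy
from sklearn.linear_model import SGDClassifier

clf = SGDClassifier(warm_start=True, loss="hinge", penalty="l1")
# The first batch happens to contain only "Benign" flows, but all three
# possible labels are declared up front
clf.partial_fit(
    numpy.array([[1.0, 2.0], [3.0, 4.0]]),
    ["Benign", "Benign"],
    classes=["Background", "Malicious", "Benign"],
)
# A later batch can now introduce "Malicious"; it succeeds because that
# label was declared on the first call
clf.partial_fit(numpy.array([[5.0, 6.0]]), ["Malicious"])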
From f1b5b683153abe35d4b28dbc03152bebfa4cb8a2 Mon Sep 17 00:00:00 2001
From: Seba Garcia
Date: Sat, 3 May 2025 16:52:08 +0000
Subject: [PATCH 355/498] Count partial labels in this epoch

---
 modules/flowmldetection/flowmldetection.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index b2d0db5e5..1146091a9 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -106,6 +106,12 @@ def train(self, sum_labeled_flows):
             # Normalize this batch of data so far. This can get progressivle slow
             X_flow = self.scaler.fit_transform(X_flow)

+            # Count the number of labels of each type in this epoch
+            epoch_label_counts = {
+                "Background": (y_flow == "Background").sum(),
+                "Malicious": (y_flow == "Malicious").sum(),
+                "Benign": (y_flow == "Benign").sum(),
+            }

             # Train
             try:

From 84480185bdbd1eb9887b86fcc75a889e43f57964 Mon Sep 17 00:00:00 2001
From: Seba Garcia
Date: Sat, 3 May 2025 16:55:09 +0000
Subject: [PATCH 356/498] Don't print training in screen

---
 modules/flowmldetection/flowmldetection.py | 11 ++---------
 1 file changed, 2 insertions(+), 9 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 1146091a9..4bb2ad7db 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -126,15 +126,8 @@ def train(self, sum_labeled_flows):

             # See score so far in training
             score = self.clf.score(X_flow, y_flow)

-            # To debug the training score
-            # self.scores.append(score)
-
-            self.print(f" Training Score: {score}", 0, 1)
-            # self.print(f' Model Parameters: {self.clf.coef_}')
-
-            # Debug code to store a plot in a png of the scores
-            # plt.plot(self.scores)
-            # plt.savefig('train-scores.png')
+            #self.print(f" Training Score: {score}", 1, 0)
+            #self.print(f' Model Parameters: {self.clf.coef_}', 1, 0)

             # Store the models on disk
             self.store_model()

From 7c2b383edbda7283716ebc5b894fd5d8fc62f7da Mon Sep 17 00:00:00 2001
From: Seba Garcia
Date: Sat, 3 May 2025 16:55:28 +0000
Subject: [PATCH 357/498] Add function to write to train log

---
 modules/flowmldetection/flowmldetection.py | 28 +++++++++++-----------
 1 file changed, 14 insertions(+), 14 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 4bb2ad7db..d4b2762f5 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -247,28 +247,28 @@ def process_features(self, dataset):
             self.print("Error in process_features()")
             self.print(traceback.format_exc(), 0, 1)

-    def process_training_flows(self):
+    def process_training_flows(self, last_number_of_flows_when_trained):
         """
-        Process all the flows in the DB
+        Process only the new flows in the DB since the last training.
         Store the pandas df in self.flows
         """
         try:
+            # Ensure the index is an integer
+            if last_number_of_flows_when_trained is None:
+                last_number_of_flows_when_trained = 0
+            else:
+                last_number_of_flows_when_trained = int(last_number_of_flows_when_trained)
+
             # We get all the flows so far
-            # because this retraining happens in batches
             flows = self.db.get_all_flows()
-            # Check how many different labels are in the DB
-            # We need both normal and malware
+            # Only process new flows since last training
+            new_flows = flows[last_number_of_flows_when_trained:]
+
+            # Check how many **different** labels are in the DB
             labels = self.db.get_labels()
             if len(labels) == 1:
-                # Only 1 label has flows
-                # There are not enough different labels, so insert two flows
-                # that are fake but representative of a normal and malware flow
-                # they are only for the training process
-                # At least 1 flow of each label is required
-                # self.print(f'Amount of labeled flows: {labels}', 0, 1)
+                # Insert fake flows for both classes if needed

-                # These flows should be in the same format as the ones in the DB.
-                # Which means the satate is still SF, S0, etc.
- flows.append( + # Insert fake flows for both classes if needed + new_flows.append( { "starttime": 1594417039.029793, "dur": "1.9424750804901123", From ad07f7c245eea515e4395b1216f3c564068067ae Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 16:57:27 +0000 Subject: [PATCH 358/498] Fix label in dummy flow --- modules/flowmldetection/flowmldetection.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index d4b2762f5..6a44422cc 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -283,13 +283,13 @@ def process_training_flows(self, last_number_of_flows_when_trained): "sbytes": 25517, "dbytes": 17247, "appproto": "ssl", - "label": "Malicious", + "ground_truth_label": "Malicious", "module_labels": { "flowalerts-long-connection": "Malicious" }, } ) - flows.append( + new_flows.append( { "starttime": 1382355032.706468, "dur": "10.896695", From d3736905508aa9dbcfbd7044532d0aed3501db5f Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 16:57:39 +0000 Subject: [PATCH 359/498] Fix dummy flow --- modules/flowmldetection/flowmldetection.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 6a44422cc..20f1f8ca8 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -304,7 +304,7 @@ def process_training_flows(self, last_number_of_flows_when_trained): "sbytes": 100, "dbytes": 67596, "appproto": "http", - "label": "Benign", + "ground_truth_label": "Benign", "module_labels": { "flowalerts-long-connection": "Benign" }, From 867da84a20fb4c6b695906f94c9ba1b7b967d38d Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 16:58:28 +0000 Subject: [PATCH 360/498] Rename variable --- modules/flowmldetection/flowmldetection.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 20f1f8ca8..59064d61a 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -310,10 +310,9 @@ def process_training_flows(self, last_number_of_flows_when_trained): }, } ) - # If there are enough flows, we dont insert them anymore # Convert to pandas df - df_flows = pd.DataFrame(flows) + df_flows = pd.DataFrame(new_flows) # Process features df_flows = self.process_features(df_flows) @@ -321,7 +320,6 @@ def process_training_flows(self, last_number_of_flows_when_trained): # Update the flow to the processed version self.flows = df_flows except Exception: - # Stop the timer self.print("Error in process_flows()") self.print(traceback.format_exc(), 0, 1) From aeebcbc24872621b69dd030456ccea86053e2948 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 17:00:32 +0000 Subject: [PATCH 361/498] Fix dummy flow label --- modules/flowmldetection/flowmldetection.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 59064d61a..6b41b4029 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -356,6 +356,8 @@ def detect(self, x_flow) -> Optional[numpy.ndarray]: "dir_", "endtime", "flow_source", + "ground_truth_label", + "detailed_ground_truth_label", ] # For argus binetflows this fails 
because ther is a field calle bytes that was not in other flows. It should be called allbytes. # Error From 5fef371864f1faa6d45f5ad54813dd4b5354171f Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 17:00:47 +0000 Subject: [PATCH 362/498] Pass values to train function --- modules/flowmldetection/flowmldetection.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 6b41b4029..4d66aab85 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -521,9 +521,9 @@ def main(self): ) # Process all flows in the DB and make them ready # for pandas - self.process_training_flows() + self.process_training_flows(self.last_number_of_flows_when_trained) # Train an algorithm - self.train() + self.train(sum_labeled_flows) self.last_number_of_flows_when_trained = sum_labeled_flows elif self.mode == "test": From 3d8f125ec27114c35e5c552cbbf7c1c5d3baadb4 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 17:01:47 +0000 Subject: [PATCH 363/498] import os --- modules/flowmldetection/flowmldetection.py | 1 + 1 file changed, 1 insertion(+) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 4d66aab85..766178e12 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -10,6 +10,7 @@ import json import traceback import warnings +import os from slips_files.common.parsers.config_parser import ConfigParser from slips_files.common.slips_utils import utils From 260d6845ce3775c84f93cc6a79f04812c9ca50be Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 17:02:15 +0000 Subject: [PATCH 364/498] Get issue of total flows zero --- slips_files/core/database/database_manager.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/slips_files/core/database/database_manager.py b/slips_files/core/database/database_manager.py index 0b805976d..b32c004a3 100644 --- a/slips_files/core/database/database_manager.py +++ b/slips_files/core/database/database_manager.py @@ -661,7 +661,8 @@ def add_software_to_profile(self, *args, **kwargs): return self.rdb.add_software_to_profile(*args, **kwargs) def get_total_flows(self, *args, **kwargs): - return int(self.rdb.get_total_flows(*args, **kwargs)) + total_flows = self.rdb.get_total_flows(*args, **kwargs) + return int(total_flows) if total_flows is not None else 0 def increment_processed_flows(self, *args, **kwargs): return self.rdb.increment_processed_flows(*args, **kwargs) From c65e8f15d3e641afe585428f2526c6f50117c791 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 17:02:32 +0000 Subject: [PATCH 365/498] Add comments --- slips_files/core/database/database_manager.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/slips_files/core/database/database_manager.py b/slips_files/core/database/database_manager.py index b32c004a3..1d339685f 100644 --- a/slips_files/core/database/database_manager.py +++ b/slips_files/core/database/database_manager.py @@ -888,7 +888,10 @@ def get_flow(self, *args, **kwargs): """returns the raw flow as read from the log file""" return self.sqlite.get_flow(*args, **kwargs) - def add_flow(self, flow, profileid: str, twid: str, label="benign"): + def add_flow(self, flow, profileid: str, twid: str, label="Benign"): + """ + Just in case, by default if there are no labels in the flow, we consider it Benign + """ # stores 
it in the db
        self.sqlite.add_flow(flow, profileid, twid, label=label)
        # handles the channels and labels etc.

From 8ae122121f8d9ccca31942e3d7b7f64cd48c8bad Mon Sep 17 00:00:00 2001
From: Seba Garcia
Date: Sat, 3 May 2025 17:02:51 +0000
Subject: [PATCH 366/498] Rename var name to be more clear

---
 slips_files/core/profiler.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/slips_files/core/profiler.py b/slips_files/core/profiler.py
index 0d9b11bd2..d22069d9e 100644
--- a/slips_files/core/profiler.py
+++ b/slips_files/core/profiler.py
@@ -119,7 +119,7 @@ def read_configuration(self):
         self.local_whitelist_path = conf.local_whitelist_path()
         self.timeformat = conf.ts_format()
         self.analysis_direction = conf.analysis_direction()
-        self.label = conf.label()
+        self.configuration_label = conf.label()
         self.width = conf.get_tw_width_as_float()
         self.client_ips: List[
             Union[IPv4Network, IPv6Network, IPv4Address, IPv6Address]

From 5fbe43ad6bb445795cb8a7c2317cf6b91acecfd0 Mon Sep 17 00:00:00 2001
From: Seba Garcia
Date: Sat, 3 May 2025 17:03:10 +0000
Subject: [PATCH 367/498] Rename var name

---
 slips_files/core/profiler.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/slips_files/core/profiler.py b/slips_files/core/profiler.py
index d22069d9e..e8fdf5cc5 100644
--- a/slips_files/core/profiler.py
+++ b/slips_files/core/profiler.py
@@ -377,7 +377,7 @@ def store_features_going_in(self, profileid: str, twid: str, flow):
             flow,
             profileid=profileid,
             twid=twid,
-            label=self.label,
+            label=self.configuration_label,
         )
         self.db.mark_profile_tw_as_modified(profileid, twid, "")

From 85ac73dca750a6467e9e345b2daa42ebe4dded90 Mon Sep 17 00:00:00 2001
From: Seba Garcia
Date: Sat, 3 May 2025 17:03:31 +0000
Subject: [PATCH 368/498] Fix processed flows being zero

---
 slips/main.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/slips/main.py b/slips/main.py
index b00cc8f3d..3f661c884 100644
--- a/slips/main.py
+++ b/slips/main.py
@@ -414,7 +414,7 @@ def get_analyzed_flows_percentage(self) -> str:
             self.total_flows = self.db.get_total_flows()

         flows_percentage = int(
-            (self.db.get_processed_flows_so_far() / self.total_flows) * 100
+            (self.db.get_processed_flows_so_far() / self.total_flows) * 100 if self.total_flows != 0 else 0
         )
         return f"Analyzed Flows: {green(flows_percentage)}{green('%')}. 
" From 058b603df40e65aa6dad514fbed6aaa1c9362bcb Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 17:03:53 +0000 Subject: [PATCH 369/498] Delete old comments --- modules/flowmldetection/flowmldetection.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 766178e12..6c3bfc127 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -28,10 +28,6 @@ Method, ) -# Only for debbuging -# from matplotlib import pyplot as plt - - # This horrible hack is only to stop sklearn from printing those warnings def warn(*args, **kwargs): pass From ff9eff155b4989bdecf1b60d34e97f739a5510f7 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 17:13:22 +0000 Subject: [PATCH 370/498] Fix plots --- modules/flowmldetection/plot_train_score.py | 48 ++++++++++++++++----- 1 file changed, 38 insertions(+), 10 deletions(-) diff --git a/modules/flowmldetection/plot_train_score.py b/modules/flowmldetection/plot_train_score.py index 0b5b5b72b..359df04ef 100644 --- a/modules/flowmldetection/plot_train_score.py +++ b/modules/flowmldetection/plot_train_score.py @@ -2,6 +2,8 @@ import matplotlib.pyplot as plt import re import sys +import argparse +import os def plot_log_data(file_path): # Read the log data from the file @@ -24,33 +26,59 @@ def plot_log_data(file_path): "Score": float }) + # Get the directory of the log file to store the plot in the same folder + dir_name = os.path.dirname(file_path) + plot_file = os.path.join(dir_name, 'log_data_plot_with_two_scales.png') + # Plotting the values fig, ax1 = plt.subplots(figsize=(10, 6)) - # Plotting Score on the left y-axis + # Plotting Score on the left y-axis (with proper scaling from 0 to 1) ax1.plot(df.index, df["Score"], label="Score", color='tab:blue') ax1.set_xlabel('Index') ax1.set_ylabel('Score', color='tab:blue') + ax1.set_ylim(0, 1) # Set y-axis for Score from 0 to 1 ax1.tick_params(axis='y', labelcolor='tab:blue') - # Create the second y-axis for the Total labels + # Create the second y-axis for the Background, Benign, Malicious, Total labels ax2 = ax1.twinx() + ax2.plot(df.index, df["Background"], label="Background", color='tab:green', linestyle='--') + ax2.plot(df.index, df["Benign"], label="Benign", color='tab:orange', linestyle='--') + ax2.plot(df.index, df["Malicious"], label="Malicious", color='tab:pink', linestyle='--') ax2.plot(df.index, df["Total labels"], label="Total labels", color='tab:red') - ax2.set_ylabel('Total labels', color='tab:red') + ax2.set_ylabel('Background, Benign, Malicious, Total labels', color='tab:red') + + # Set appropriate scale for right y-axis based on the data + ax2.set_ylim(0, df[["Background", "Benign", "Malicious", "Total labels"]].max().max()) ax2.tick_params(axis='y', labelcolor='tab:red') # Adding title and legend plt.title('Log Data Visualization') fig.tight_layout() - # Save plot to a PNG file - plt.savefig('log_data_plot_with_two_scales.png') + # Adding the legend with increased space for readability + ax1.legend(loc='upper left', bbox_to_anchor=(1, 1), fontsize='small') + ax2.legend(loc='upper left', bbox_to_anchor=(1, 0.7), fontsize='small') + + # Increase right margin for better readability of legend + plt.subplots_adjust(right=0.75) + + # Save plot to the same folder as the log file + plt.savefig(plot_file) # Display the plot plt.show() -# Make sure the file path is passed as an argument -if len(sys.argv) < 2: - print("Please provide the path to 
the log file as a parameter.") -else: - plot_log_data(sys.argv[1]) +def main(): + # Parse command-line arguments + parser = argparse.ArgumentParser(description="Process a log file and plot the data with two y-axes.") + parser.add_argument('log_file', metavar='log_file', type=str, help="Path to the log file") + + # Handle -h / --help + args = parser.parse_args() + + # Call the function to process the log file + plot_log_data(args.log_file) + +if __name__ == "__main__": + main() From e55edf8709ac90ca8e30de4d3bf1d3d381c7ff3b Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 17:14:58 +0000 Subject: [PATCH 371/498] Fix plot --- modules/flowmldetection/plot_train_score.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/modules/flowmldetection/plot_train_score.py b/modules/flowmldetection/plot_train_score.py index 359df04ef..c7f374a7f 100644 --- a/modules/flowmldetection/plot_train_score.py +++ b/modules/flowmldetection/plot_train_score.py @@ -40,18 +40,21 @@ def plot_log_data(file_path): ax1.set_ylim(0, 1) # Set y-axis for Score from 0 to 1 ax1.tick_params(axis='y', labelcolor='tab:blue') - # Create the second y-axis for the Background, Benign, Malicious, Total labels + # Create the second y-axis for the Background, Benign, Malicious ax2 = ax1.twinx() ax2.plot(df.index, df["Background"], label="Background", color='tab:green', linestyle='--') ax2.plot(df.index, df["Benign"], label="Benign", color='tab:orange', linestyle='--') ax2.plot(df.index, df["Malicious"], label="Malicious", color='tab:pink', linestyle='--') - ax2.plot(df.index, df["Total labels"], label="Total labels", color='tab:red') - ax2.set_ylabel('Background, Benign, Malicious, Total labels', color='tab:red') + ax2.set_ylabel('Background, Benign, Malicious', color='tab:red') # Set appropriate scale for right y-axis based on the data - ax2.set_ylim(0, df[["Background", "Benign", "Malicious", "Total labels"]].max().max()) + ax2.set_ylim(0, df[["Background", "Benign", "Malicious"]].max().max()) ax2.tick_params(axis='y', labelcolor='tab:red') + # Annotating Total labels as text on the plot + for i, value in enumerate(df["Total labels"]): + ax1.text(i, value, f'{value:.1f}', color='tab:gray', fontsize=8, ha='center', va='bottom') + # Adding title and legend plt.title('Log Data Visualization') fig.tight_layout() From 5fbff61521b897f5cc047040bbe9adc54eeee126 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 17:16:23 +0000 Subject: [PATCH 372/498] Fix plot --- modules/flowmldetection/plot_train_score.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/modules/flowmldetection/plot_train_score.py b/modules/flowmldetection/plot_train_score.py index c7f374a7f..4099c47c1 100644 --- a/modules/flowmldetection/plot_train_score.py +++ b/modules/flowmldetection/plot_train_score.py @@ -42,10 +42,10 @@ def plot_log_data(file_path): # Create the second y-axis for the Background, Benign, Malicious ax2 = ax1.twinx() - ax2.plot(df.index, df["Background"], label="Background", color='tab:green', linestyle='--') - ax2.plot(df.index, df["Benign"], label="Benign", color='tab:orange', linestyle='--') - ax2.plot(df.index, df["Malicious"], label="Malicious", color='tab:pink', linestyle='--') - ax2.set_ylabel('Background, Benign, Malicious', color='tab:red') + ax2.plot(df.index, df["Background"], label="Background Labels", color='tab:green', linestyle='--') + ax2.plot(df.index, df["Benign"], label="Benign Labels", color='tab:orange', linestyle='--') + ax2.plot(df.index, 
df["Malicious"], label="Malicious Labels", color='tab:pink', linestyle='--') + ax2.set_ylabel('Background, Benign, Malicious Labels', color='tab:red') # Set appropriate scale for right y-axis based on the data ax2.set_ylim(0, df[["Background", "Benign", "Malicious"]].max().max()) @@ -56,7 +56,7 @@ def plot_log_data(file_path): ax1.text(i, value, f'{value:.1f}', color='tab:gray', fontsize=8, ha='center', va='bottom') # Adding title and legend - plt.title('Log Data Visualization') + plt.title('Training performance') fig.tight_layout() # Adding the legend with increased space for readability From ff987fc2450326739b4635275f24648799f32659 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 17:24:43 +0000 Subject: [PATCH 373/498] Fix plot --- modules/flowmldetection/plot_train_score.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/modules/flowmldetection/plot_train_score.py b/modules/flowmldetection/plot_train_score.py index 4099c47c1..8437e968a 100644 --- a/modules/flowmldetection/plot_train_score.py +++ b/modules/flowmldetection/plot_train_score.py @@ -59,12 +59,12 @@ def plot_log_data(file_path): plt.title('Training performance') fig.tight_layout() - # Adding the legend with increased space for readability - ax1.legend(loc='upper left', bbox_to_anchor=(1, 1), fontsize='small') - ax2.legend(loc='upper left', bbox_to_anchor=(1, 0.7), fontsize='small') + # Move both legends further to the right + ax1.legend(loc='upper right', bbox_to_anchor=(1.26, 1), fontsize='small', ncol=1) + ax2.legend(loc='upper right', bbox_to_anchor=(1.4, 0.95), fontsize='small', ncol=1) # Increase right margin for better readability of legend - plt.subplots_adjust(right=0.75) + plt.subplots_adjust(right=0.7) # Save plot to the same folder as the log file plt.savefig(plot_file) From bf9d7200d01b9f941612b2f0a83e308225396ab0 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 18:02:34 +0000 Subject: [PATCH 374/498] Plot testing performance from a log --- .../plot_testing_performance.py | 89 +++++++++++++++++++ 1 file changed, 89 insertions(+) create mode 100644 modules/flowmldetection/plot_testing_performance.py diff --git a/modules/flowmldetection/plot_testing_performance.py b/modules/flowmldetection/plot_testing_performance.py new file mode 100644 index 000000000..a38c7f059 --- /dev/null +++ b/modules/flowmldetection/plot_testing_performance.py @@ -0,0 +1,89 @@ +import matplotlib.pyplot as plt +import sys +import numpy as np + +def process_file(file_path): + # Initialize the counters for the values + FPR_values = [] + FNR_values = [] + TNR_values = [] + TPR_values = [] + F1_values = [] + accuracy_values = [] + precision_values = [] + MCC_values = [] + recall_values = [] + + # Read the file and extract the data + with open(file_path, 'r') as file: + for line in file: + if "TP:" in line: + # Extract the values from the line + parts = line.split(',') + TP = int(parts[0].split(':')[1].strip()) + TN = int(parts[1].split(':')[1].strip()) + FP = int(parts[2].split(':')[1].strip()) + FN = int(parts[3].split(':')[1].strip()) + + # Calculate metrics + FPR = FP / (FP + TN) if (FP + TN) != 0 else 0 + FNR = FN / (FN + TP) if (FN + TP) != 0 else 0 + TNR = TN / (TN + FP) if (TN + FP) != 0 else 0 + TPR = TP / (TP + FN) if (TP + FN) != 0 else 0 + Precision = TP / (TP + FP) if (TP + FP) != 0 else 0 + Recall = TPR # Recall is the same as TPR + F1 = 2 * (Precision * Recall) / (Precision + Recall) if (Precision + Recall) != 0 else 0 + Accuracy = (TP + TN) / (TP + TN + FP + FN) + MCC = ((TP 
* TN) - (FP * FN)) / np.sqrt((TP + FP) * (TP + FN) * (TN + FP) * (TN + FN)) if ((TP + FP) * (TP + FN) * (TN + FP) * (TN + FN)) != 0 else 0 + + # Append the values to the respective lists + FPR_values.append(FPR) + FNR_values.append(FNR) + TNR_values.append(TNR) + TPR_values.append(TPR) + F1_values.append(F1) + accuracy_values.append(Accuracy) + precision_values.append(Precision) + MCC_values.append(MCC) + recall_values.append(Recall) + + return FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values + +def plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values): + # Create the plot + plt.figure(figsize=(12, 8)) + + # Plot each metric + plt.plot(FPR_values, label='False Positive Rate (FPR)', marker='o') + plt.plot(FNR_values, label='False Negative Rate (FNR)', marker='o') + plt.plot(TNR_values, label='True Negative Rate (TNR)', marker='o') + plt.plot(TPR_values, label='True Positive Rate (TPR)', marker='o') + plt.plot(F1_values, label='F1 Score', marker='o') + plt.plot(accuracy_values, label='Accuracy', marker='o') + plt.plot(precision_values, label='Precision', marker='o') + plt.plot(MCC_values, label='Matthews Correlation Coefficient (MCC)', marker='o') + plt.plot(recall_values, label='Recall (TPR)', marker='o') + + # Add labels and title + plt.xlabel('Index') + plt.ylabel('Metric Value') + plt.title('Evaluation Metrics Over Time') + + # Add a legend + plt.legend() + + # Save the plot as a PNG file + plt.savefig('metrics_plot.png') + plt.close() + +def main(): + if len(sys.argv) != 2: + print("Usage: python script.py ") + sys.exit(1) + + file_path = sys.argv[1] + FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values = process_file(file_path) + plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values) + +if __name__ == "__main__": + main() From f146fbf84544323511db94d721e971b6da33ad0f Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 18:04:32 +0000 Subject: [PATCH 375/498] Fix the plot --- modules/flowmldetection/plot_testing_performance.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/modules/flowmldetection/plot_testing_performance.py b/modules/flowmldetection/plot_testing_performance.py index a38c7f059..fac0acd64 100644 --- a/modules/flowmldetection/plot_testing_performance.py +++ b/modules/flowmldetection/plot_testing_performance.py @@ -64,16 +64,19 @@ def plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accu plt.plot(MCC_values, label='Matthews Correlation Coefficient (MCC)', marker='o') plt.plot(recall_values, label='Recall (TPR)', marker='o') + # Set logarithmic scale on the y-axis + plt.yscale('log') + # Add labels and title plt.xlabel('Index') - plt.ylabel('Metric Value') - plt.title('Evaluation Metrics Over Time') + plt.ylabel('Metric Value (Log Scale)') + plt.title('Evaluation Metrics Over Time (Log Scale)') # Add a legend plt.legend() # Save the plot as a PNG file - plt.savefig('metrics_plot.png') + plt.savefig('metrics_plot_log_scale.png') plt.close() def main(): From 37bf4f6a0c187b76a443c3f1f855f0278da65065 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 18:12:40 +0000 Subject: [PATCH 376/498] Fix the plots --- .../plot_testing_performance.py | 76 ++++++++++++++----- 1 file changed, 55 insertions(+), 21 
deletions(-) diff --git a/modules/flowmldetection/plot_testing_performance.py b/modules/flowmldetection/plot_testing_performance.py index fac0acd64..5581c72cd 100644 --- a/modules/flowmldetection/plot_testing_performance.py +++ b/modules/flowmldetection/plot_testing_performance.py @@ -50,33 +50,66 @@ def process_file(file_path): return FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values def plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values): - # Create the plot - plt.figure(figsize=(12, 8)) + # Separate the values into two groups based on their proximity to 0 or 1 + close_to_0 = { + 'FPR': [], 'FNR': [] + } + close_to_1 = { + 'TNR': [], 'TPR': [], 'F1': [], 'accuracy': [], 'precision': [], 'MCC': [], 'recall': [] + } - # Plot each metric - plt.plot(FPR_values, label='False Positive Rate (FPR)', marker='o') - plt.plot(FNR_values, label='False Negative Rate (FNR)', marker='o') - plt.plot(TNR_values, label='True Negative Rate (TNR)', marker='o') - plt.plot(TPR_values, label='True Positive Rate (TPR)', marker='o') - plt.plot(F1_values, label='F1 Score', marker='o') - plt.plot(accuracy_values, label='Accuracy', marker='o') - plt.plot(precision_values, label='Precision', marker='o') - plt.plot(MCC_values, label='Matthews Correlation Coefficient (MCC)', marker='o') - plt.plot(recall_values, label='Recall (TPR)', marker='o') + # Categorize the metrics into two groups + for i in range(len(FPR_values)): + close_to_0['FPR'].append(FPR_values[i]) + close_to_0['FNR'].append(FNR_values[i]) + + close_to_1['TNR'].append(TNR_values[i]) + close_to_1['TPR'].append(TPR_values[i]) + close_to_1['F1'].append(F1_values[i]) + close_to_1['accuracy'].append(accuracy_values[i]) + close_to_1['precision'].append(precision_values[i]) + close_to_1['MCC'].append(MCC_values[i]) + close_to_1['recall'].append(recall_values[i]) + + # Plot metrics for values close to 0 + plot_single_group(close_to_0, 'metrics_plot_close_to_0.png') - # Set logarithmic scale on the y-axis - plt.yscale('log') + # Plot metrics for values close to 1 + plot_single_group(close_to_1, 'metrics_plot_close_to_1.png') + +def plot_single_group(metrics_dict, output_filename): + plt.figure(figsize=(12, 8)) - # Add labels and title + # Only plot the metrics that exist in the dictionary + if 'FPR' in metrics_dict: + plt.plot(metrics_dict['FPR'], label='False Positive Rate (FPR)', marker='o') + if 'FNR' in metrics_dict: + plt.plot(metrics_dict['FNR'], label='False Negative Rate (FNR)', marker='o') + if 'TNR' in metrics_dict: + plt.plot(metrics_dict['TNR'], label='True Negative Rate (TNR)', marker='o') + if 'TPR' in metrics_dict: + plt.plot(metrics_dict['TPR'], label='True Positive Rate (TPR)', marker='o') + if 'F1' in metrics_dict: + plt.plot(metrics_dict['F1'], label='F1 Score', marker='o') + if 'accuracy' in metrics_dict: + plt.plot(metrics_dict['accuracy'], label='Accuracy', marker='o') + if 'precision' in metrics_dict: + plt.plot(metrics_dict['precision'], label='Precision', marker='o') + if 'MCC' in metrics_dict: + plt.plot(metrics_dict['MCC'], label='Matthews Correlation Coefficient (MCC)', marker='o') + if 'recall' in metrics_dict: + plt.plot(metrics_dict['recall'], label='Recall (TPR)', marker='o') + + # Apply log scale by default + plt.yscale('log') + plt.xlabel('Index') - plt.ylabel('Metric Value (Log Scale)') - plt.title('Evaluation Metrics Over Time (Log Scale)') - - # Add a legend + 
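
As a sanity check on the hand-rolled formulas in process_file(), the same quantities can be compared against scikit-learn's reference implementations. A minimal, self-contained check with illustrative toy labels (not data from these patches):

import numpy as np
from sklearn.metrics import f1_score, matthews_corrcoef

y_true = np.array([1, 1, 0, 0, 1, 0, 1, 0])  # 1 = Malicious, 0 = Benign
y_pred = np.array([1, 0, 0, 1, 1, 0, 1, 0])

TP = int(((y_true == 1) & (y_pred == 1)).sum())
TN = int(((y_true == 0) & (y_pred == 0)).sum())
FP = int(((y_true == 0) & (y_pred == 1)).sum())
FN = int(((y_true == 1) & (y_pred == 0)).sum())

precision = TP / (TP + FP)
recall = TP / (TP + FN)
f1_manual = 2 * precision * recall / (precision + recall)
mcc_manual = ((TP * TN) - (FP * FN)) / np.sqrt(
    (TP + FP) * (TP + FN) * (TN + FP) * (TN + FN)
)

assert np.isclose(f1_manual, f1_score(y_true, y_pred))
assert np.isclose(mcc_manual, matthews_corrcoef(y_true, y_pred))

Agreement here gives some confidence that the per-batch metrics plotted above are computed correctly, modulo the zero-division guards.
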
plt.ylabel('Metric Value') + plt.title(f'Evaluation Metrics Over Time ({output_filename.split("_")[2].replace(".png", "")})') plt.legend() - # Save the plot as a PNG file - plt.savefig('metrics_plot_log_scale.png') + # Save the plot + plt.savefig(output_filename) plt.close() def main(): @@ -85,6 +118,7 @@ def main(): sys.exit(1) file_path = sys.argv[1] + FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values = process_file(file_path) plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values) From 5936fc882ebfb7a8e82c4b8696891d6ead982194 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 18:16:50 +0000 Subject: [PATCH 377/498] Fix plot --- .../plot_testing_performance.py | 22 +++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/modules/flowmldetection/plot_testing_performance.py b/modules/flowmldetection/plot_testing_performance.py index 5581c72cd..8f9e12cd8 100644 --- a/modules/flowmldetection/plot_testing_performance.py +++ b/modules/flowmldetection/plot_testing_performance.py @@ -72,12 +72,24 @@ def plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accu close_to_1['recall'].append(recall_values[i]) # Plot metrics for values close to 0 - plot_single_group(close_to_0, 'metrics_plot_close_to_0.png') + plot_single_group(close_to_0, 'metrics_plot_close_to_0.png', is_close_to_0=True) # Plot metrics for values close to 1 plot_single_group(close_to_1, 'metrics_plot_close_to_1.png') -def plot_single_group(metrics_dict, output_filename): + # Print the final values + print("\nFinal Metric Values:") + print(f"Final FPR: {FPR_values[-1]:.4f}") + print(f"Final FNR: {FNR_values[-1]:.4f}") + print(f"Final TNR: {TNR_values[-1]:.4f}") + print(f"Final TPR: {TPR_values[-1]:.4f}") + print(f"Final F1 Score: {F1_values[-1]:.4f}") + print(f"Final Accuracy: {accuracy_values[-1]:.4f}") + print(f"Final Precision: {precision_values[-1]:.4f}") + print(f"Final MCC: {MCC_values[-1]:.4f}") + print(f"Final Recall: {recall_values[-1]:.4f}") + +def plot_single_group(metrics_dict, output_filename, is_close_to_0=False): plt.figure(figsize=(12, 8)) # Only plot the metrics that exist in the dictionary @@ -103,6 +115,12 @@ def plot_single_group(metrics_dict, output_filename): # Apply log scale by default plt.yscale('log') + # If the plot is close to 0, set custom ticks + if is_close_to_0: + # Manually set more Y-ticks for better visibility + plt.ylim(0.0001, 1) # Set Y-axis limits between 0.0001 and 1 + plt.yticks([0.0001, 0.001, 0.01, 0.1, 1], ['0.0001', '0.001', '0.01', '0.1', '1']) # Adjust Y-ticks + plt.xlabel('Index') plt.ylabel('Metric Value') plt.title(f'Evaluation Metrics Over Time ({output_filename.split("_")[2].replace(".png", "")})') From bfc10bea2cf0ec9e6ce3f2a66484cd023f58e4ad Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 18:20:22 +0000 Subject: [PATCH 378/498] Fix plots --- modules/flowmldetection/flowmldetection.py | 709 +++++---------------- 1 file changed, 143 insertions(+), 566 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 6c3bfc127..37f076110 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -1,566 +1,143 @@ -# SPDX-FileCopyrightText: 2021 Sebastian Garcia -from typing import Optional - -# SPDX-License-Identifier: GPL-2.0-only -import numpy -from 
sklearn.linear_model import SGDClassifier -from sklearn.preprocessing import StandardScaler -import pickle -import pandas as pd -import json -import traceback -import warnings -import os - -from slips_files.common.parsers.config_parser import ConfigParser -from slips_files.common.slips_utils import utils -from slips_files.common.abstracts.module import IModule -from slips_files.core.structures.evidence import ( - Evidence, - ProfileID, - TimeWindow, - Attacker, - ThreatLevel, - EvidenceType, - IoCType, - Direction, - Victim, - Method, -) - -# This horrible hack is only to stop sklearn from printing those warnings -def warn(*args, **kwargs): - pass - - -warnings.warn = warn - - -class FlowMLDetection(IModule): - # Name: short name of the module. Do not use spaces - name = "Flow ML Detection" - description = ( - "Train or test a Machine Learning model to detect malicious flows" - ) - authors = ["Sebastian Garcia"] - - def init(self): - # Subscribe to the channel - self.c1 = self.db.subscribe("new_flow") - self.channels = {"new_flow": self.c1} - self.fieldseparator = self.db.get_field_separator() - # Set the output queue of our database instance - # Read the configuration - self.read_configuration() - # Minum amount of new labels needed to start the train - self.minimum_labels_to_start_train = 50 - # Minum amount of new labels needed to retrain - self.minimum_labels_to_retrain = 50 - # The number of flows when last trained. Used internally only to know - # when to retrain - self.last_number_of_flows_when_trained = 0 - # The scaler trained during training and to use during testing - self.scaler = StandardScaler() - self.model_path = "./modules/flowmldetection/model.bin" - self.scaler_path = "./modules/flowmldetection/scaler.bin" - - # Initialize the training log file - self.training_log_path = "./modules/flowmldetection/training.log" - with open(self.training_log_path, "w") as log_file: - log_file.write("Training Log Initialized\n") - - def read_configuration(self): - conf = ConfigParser() - self.mode = conf.get_ml_mode() - # This is the global label in the configuration, - # in case the flows do not have a label themselves - self.label = conf.label() - - def write_to_training_log(self, message: str): - """ - Write a message to the training log file. - """ - try: - with open(self.training_log_path, "a") as log_file: - log_file.write(message + "\n") - except Exception as e: - self.print(f"Error writing to training log: {e}", 0, 1) - - def train(self, sum_labeled_flows): - """ - Train a model based on the flows we receive and the labels - """ - try: - # Create X_flow with the current flows minus the label - X_flow = self.flows.drop("ground_truth_label", axis=1) - # Drop the detailed labels - X_flow = X_flow.drop("detailed_ground_truth_label", axis=1) - # Drop the module_labels - X_flow = X_flow.drop("module_labels", axis=1) - # Create y_flow with the label - y_flow = numpy.full(X_flow.shape[0], self.flows.ground_truth_label) - - # Normalize this batch of data so far. 
This can get progressivle slow - X_flow = self.scaler.fit_transform(X_flow) - - # Count the number of labels of each type in this epoc - epoch_label_counts = { - "Background": (y_flow == "Background").sum(), - "Malicious": (y_flow == "Malicious").sum(), - "Benign": (y_flow == "Benign").sum(), - } - - # Train - try: - # Online incremental learning - self.clf.partial_fit( - X_flow, y_flow, classes=["Background", "Malicious", "Benign"] - ) - except Exception: - self.print("Error while calling clf.train()") - self.print(traceback.format_exc(), 0, 1) - - # See score so far in training - score = self.clf.score(X_flow, y_flow) - - #self.print(f" Training Score: {score}", 1, 0) - #self.print(f' Model Parameters: {self.clf.coef_}', 1, 0) - - # Store the models on disk - self.store_model() - - # Log training information - self.write_to_training_log(f"Training completed. Background: {epoch_label_counts['Background']}. Benign: {epoch_label_counts['Benign']}. Malicious: {epoch_label_counts['Malicious']}. Total labels: {sum_labeled_flows}. Score: {score}") - #self.write_to_training_log(f"Model parameters: {self.clf.coef_}") - except Exception: - self.print("Error in train().", 0, 1) - self.print(traceback.format_exc(), 0, 1) - self.write_to_training_log("Error occurred during training.") - - def process_features(self, dataset): - """ - Discards some features of the dataset and can create new. - Clean the dataset - """ - try: - # Discard some type of flows that dont have ports - to_discard = ["arp", "ARP", "icmp", "igmp", "ipv6-icmp", ""] - for proto in to_discard: - dataset = dataset[dataset.proto != proto] - - # If te proto is in the list to delete and there is only one flow, then the dataset will be empty - if dataset.empty: - # DataFrame is empty now, so return empty - return dataset - - # For now, discard these - to_drop = [ - "appproto", - "daddr", - "saddr", - "starttime", - "type_", - "smac", - "dmac", - "history", - "uid", - "dir_", - "endtime", - "flow_source", - ] - for field in to_drop: - try: - dataset = dataset.drop(field, axis=1) - except (ValueError, KeyError): - pass - - # When flows are read from Slips sqlite, - # the state is not transformed to 'Established' or - # 'Not Established', it is still 'S0' and others - # So transform here - dataset["state"] = dataset.apply( - lambda row: self.db.get_final_state_from_flags( - row["state"], (row["spkts"] + row["dpkts"]) - ), - axis=1, - ) - - # Convert state to categorical - dataset.state = dataset.state.str.replace( - r"(^.*Not Established.*$)", "0", regex=True - ) - dataset.state = dataset.state.str.replace( - r"(^.*Established.*$)", "1", regex=True - ) - - # Convert categories to floats - dataset.state = dataset.state.astype("float64") - - # Convert proto to categorical. For now we only have few states, so we can hardcode... - # We dont use the data to create categories because in testing mode - # we dont see all the protocols - # Also we dont store the Categorizer because the user can retrain - # with its own data. 
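
The protocol encoding that follows chains several whole-string regex replaces. An equivalent formulation, shown here only as a sketch (it is not code from any of these patches, and assumes dataset is the pandas DataFrame used in process_features()), keeps the hardcoded categories in one mapping and checks the more specific name first:

import pandas as pd

PROTO_CODES = {"tcp": "0", "udp": "1", "icmp-ipv6": "3", "icmp": "2", "arp": "4"}

def encode_proto(proto: str) -> str:
    proto = proto.lower()
    for name, code in PROTO_CODES.items():  # most specific names first
        if name in proto:
            return code
    return proto  # unknown protocols fall through unchanged

# Coerce anything unmapped to NaN rather than failing later in astype().
dataset["proto"] = pd.to_numeric(dataset["proto"].map(encode_proto), errors="coerce")
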
- dataset.proto = dataset.proto.str.lower() - dataset.proto = dataset.proto.str.replace( - r"(^.*tcp.*$)", "0", regex=True - ) - dataset.proto = dataset.proto.str.replace( - r"(^.*udp.*$)", "1", regex=True - ) - dataset.proto = dataset.proto.str.replace( - r"(^.*icmp.*$)", "2", regex=True - ) - dataset.proto = dataset.proto.str.replace( - r"(^.*icmp-ipv6.*$)", "3", regex=True - ) - dataset.proto = dataset.proto.str.replace( - r"(^.*arp.*$)", "4", regex=True - ) - - dataset["allbytes"] = dataset["sbytes"] + dataset["dbytes"] - dataset["pkts"] = dataset["spkts"] + dataset["dpkts"] - - fields_to_convert_to_float = [ - dataset.proto, - dataset.dport, - dataset.sport, - dataset.dur, - dataset.pkts, - dataset.spkts, - dataset.allbytes, - dataset.sbytes, - dataset.state, - ] - for field in fields_to_convert_to_float: - try: - field = field.astype("float64") - except (ValueError, AttributeError): - pass - - return dataset - except Exception: - # Stop the timer - self.print("Error in process_features()") - self.print(traceback.format_exc(), 0, 1) - - def process_training_flows(self, last_number_of_flows_when_trained): - """ - Process only the new flows in the DB since the last training. - Store the pandas df in self.flows - """ - try: - # Ensure the index is an integer - if last_number_of_flows_when_trained is None: - last_number_of_flows_when_trained = 0 - else: - last_number_of_flows_when_trained = int(last_number_of_flows_when_trained) - - # We get all the flows so far - flows = self.db.get_all_flows() - # Only process new flows since last training - new_flows = flows[last_number_of_flows_when_trained:] - - # Check how many **different** labels are in the DB - labels = self.db.get_labels() - if len(labels) == 1: - # Insert fake flows for both classes if needed - new_flows.append( - { - "starttime": 1594417039.029793, - "dur": "1.9424750804901123", - "saddr": "10.7.10.101", - "sport": "49733", - "daddr": "40.70.224.145", - "dport": "443", - "proto": "tcp", - "state": "SF", - "spkts": 17, - "dpkts": 27, - "sbytes": 25517, - "dbytes": 17247, - "appproto": "ssl", - "ground_truth_label": "Malicious", - "module_labels": { - "flowalerts-long-connection": "Malicious" - }, - } - ) - new_flows.append( - { - "starttime": 1382355032.706468, - "dur": "10.896695", - "saddr": "147.32.83.52", - "sport": "47956", - "daddr": "80.242.138.72", - "dport": "80", - "proto": "tcp", - "state": "SF", - "spkts": 1, - "dpkts": 0, - "sbytes": 100, - "dbytes": 67596, - "appproto": "http", - "ground_truth_label": "Benign", - "module_labels": { - "flowalerts-long-connection": "Benign" - }, - } - ) - - # Convert to pandas df - df_flows = pd.DataFrame(new_flows) - - # Process features - df_flows = self.process_features(df_flows) - - # Update the flow to the processed version - self.flows = df_flows - except Exception: - self.print("Error in process_flows()") - self.print(traceback.format_exc(), 0, 1) - - def process_flow(self, flow_to_process: dict): - """ - Process one flow. 
Only used during detection in testing - returns the pandas df with the processed flow - """ - try: - # Convert the flow to a pandas dataframe - raw_flow = pd.DataFrame(flow_to_process, index=[0]) - dflow = self.process_features(raw_flow) - if dflow.empty: - return None - # Update the flow to the processed version - return dflow - except Exception: - # Stop the timer - self.print("Error in process_flow()") - self.print(traceback.format_exc(), 0, 1) - - def detect(self, x_flow) -> Optional[numpy.ndarray]: - """ - Detects the given flow with the current model stored - and returns the predection array - """ - try: - # clean the flow - fields_to_drop = [ - "label", - "module_labels", - "uid", - "history", - "dir_", - "endtime", - "flow_source", - "ground_truth_label", - "detailed_ground_truth_label", - ] - # For argus binetflows this fails because ther is a field calle bytes that was not in other flows. It should be called allbytes. - # Error - ''' [Flow ML Detection] Error in detect() while processing - dur proto sport dport state pkts spkts dpkts bytes sbytes dbytes allbytes - 0 63.822830 0 56119 981 0.0 15 15 0 8764 1887 0 1887 - The feature names should match those that were passed during fit. - Feature names unseen at fit time: - - bytes - ''' - - # IF we delete here the filed bytes the error is - # [Flow ML Detection] Error in detect() while processing - # dur proto sport dport state pkts spkts dpkts sbytes dbytes allbytes - # 0 63.822830 0 56120 980 0.0 15 15 0 1887 0 1887 - # The feature names should match those that were passed during fit. - # Feature names must be in the same order as they were in fit. - - for field in fields_to_drop: - try: - x_flow = x_flow.drop(field, axis=1) - except (KeyError, ValueError): - pass - # Scale the flow - x_flow: numpy.ndarray = self.scaler.transform(x_flow) - pred: numpy.ndarray = self.clf.predict(x_flow) - return pred - except Exception as e: - self.print( - f"Error in detect() while processing " f"\n{x_flow}\n{e}" - ) - self.print(traceback.format_exc(), 0, 1) - - def store_model(self): - """ - Store the trained model on disk - """ - self.print("Storing the trained model and scaler on disk.", 0, 2) - with open(self.model_path, "wb") as f: - data = pickle.dumps(self.clf) - f.write(data) - with open(self.scaler_path, "wb") as g: - data = pickle.dumps(self.scaler) - g.write(data) - - def read_model(self): - """ - Read the trained model from disk - """ - try: - self.print("Reading the trained model from disk.", 0, 2) - with open(self.model_path, "rb") as f: - self.clf = pickle.load(f) - self.print("Reading the trained scaler from disk.", 0, 2) - with open(self.scaler_path, "rb") as g: - self.scaler = pickle.load(g) - except FileNotFoundError: - # If there is no model, create one empty - self.print( - "There was no model. " "Creating a new empty model.", 0, 2 - ) - self.clf = SGDClassifier( - warm_start=True, loss="hinge", penalty="l1" - ) - except EOFError: - self.print( - "Error reading model from disk. " - "Creating a new empty model.", - 0, - 2, - ) - self.clf = SGDClassifier( - warm_start=True, loss="hinge", penalty="l1" - ) - - def set_evidence_malicious_flow(self, flow: dict, twid: str): - confidence: float = 0.1 - description = ( - f"Flow with malicious characteristics by ML. 
Src IP" - f" {flow['saddr']}:{flow['sport']} to " - f"{flow['daddr']}:{flow['dport']}" - ) - twid_number = int(twid.replace("timewindow", "")) - evidence: Evidence = Evidence( - evidence_type=EvidenceType.MALICIOUS_FLOW, - attacker=Attacker( - direction=Direction.SRC, - ioc_type=IoCType.IP, - value=flow["saddr"], - ), - victim=Victim( - direction=Direction.DST, - ioc_type=IoCType.IP, - value=flow["daddr"], - ), - threat_level=ThreatLevel.LOW, - confidence=confidence, - description=description, - profile=ProfileID(ip=flow["saddr"]), - timewindow=TimeWindow(twid_number), - uid=[flow["uid"]], - timestamp=flow["starttime"], - method=Method.AI, - src_port=flow["sport"], - dst_port=flow["dport"], - ) - - self.db.set_evidence(evidence) - - def shutdown_gracefully(self): - # Confirm that the module is done processing - if self.mode == "train": - self.store_model() - - def pre_main(self): - utils.drop_root_privs() - # Load the model - self.read_model() - - def main(self): - if msg := self.get_msg("new_flow"): - # When a new flow arrives - msg = json.loads(msg["data"]) - self.twid = msg["twid"] - self.profileid = msg["profileid"] - self.flow = msg["flow"] - # These following extra fields are expected in testing. update the original - # flow dict to have them - self.flow.update( - { - "state": msg["interpreted_state"], - "label": msg["label"], - "module_labels": msg["module_labels"], - } - ) - - if self.mode == "train": - # We are training - - # Is the amount in the DB of labels enough to retrain? - # Use labeled flows - labels = self.db.get_labels() - sum_labeled_flows = sum(i[1] for i in labels) - - # The min labels to retrain is the min number of flows - # we should have seen so far in this capture to start training - # This is so we dont _start_ training with only 1 flow - - # Once we are over the start minimum, the second condition is - # to force to retrain every a minimum_labels_to_retrain number - # of flows. So we dont retrain every 1 flow. - if ( - sum_labeled_flows >= self.minimum_labels_to_start_train - ): - if (sum_labeled_flows - self.last_number_of_flows_when_trained >= self.minimum_labels_to_retrain): - # So for example we retrain every 50 labels and only when - # we have at least 50 labels - self.print( - f"Training the model with the last group of " - f"flows and labels. Total flows: {sum_labeled_flows}." - ) - # Process all flows in the DB and make them ready - # for pandas - self.process_training_flows(self.last_number_of_flows_when_trained) - # Train an algorithm - self.train(sum_labeled_flows) - self.last_number_of_flows_when_trained = sum_labeled_flows - - elif self.mode == "test": - # We are testing, which means using the model to detect - processed_flow = self.process_flow(self.flow) - - # After processing the flow, it may happen that we - # delete icmp/arp/etc so the dataframe can be empty - if processed_flow is not None and not processed_flow.empty: - # Predict - pred: numpy.ndarray = self.detect(processed_flow) - if not pred: - # an error occurred - return - - label = self.flow["label"] - if label and label != "unknown" and label != pred[0]: - # If the user specified a label in test mode, - # and the label is diff from the prediction, - # print in debug mode - self.print( - f"Predicted {pred[0]} for ground-truth label" - f' {label}. 
Flow {self.flow["saddr"]}:' - f'{self.flow["sport"]} ->' - f' {self.flow["daddr"]}:' - f'{self.flow["dport"]}/' - f'{self.flow["proto"]}', - 0, - 3, - ) - if pred[0] == "Malicious": - # Generate an alert - self.set_evidence_malicious_flow(self.flow, self.twid) - self.print( - f"Prediction {pred[0]} for label {label}" - f' flow {self.flow["saddr"]}:' - f'{self.flow["sport"]} -> ' - f'{self.flow["daddr"]}:' - f'{self.flow["dport"]}/' - f'{self.flow["proto"]}', - 0, - 2, - ) +import matplotlib.pyplot as plt +import sys +import numpy as np + +def process_file(file_path): + # Initialize the counters for the values + FPR_values = [] + FNR_values = [] + TNR_values = [] + TPR_values = [] + F1_values = [] + accuracy_values = [] + precision_values = [] + MCC_values = [] + recall_values = [] + + # Read the file and extract the data + with open(file_path, 'r') as file: + for line in file: + if "TP:" in line: + # Extract the values from the line + parts = line.split(',') + TP = int(parts[0].split(':')[1].strip()) + TN = int(parts[1].split(':')[1].strip()) + FP = int(parts[2].split(':')[1].strip()) + FN = int(parts[3].split(':')[1].strip()) + + # Calculate metrics + FPR = FP / (FP + TN) if (FP + TN) != 0 else 0 + FNR = FN / (FN + TP) if (FN + TP) != 0 else 0 + TNR = TN / (TN + FP) if (TN + FP) != 0 else 0 + TPR = TP / (TP + FN) if (TP + FN) != 0 else 0 + Precision = TP / (TP + FP) if (TP + FP) != 0 else 0 + Recall = TPR # Recall is the same as TPR + F1 = 2 * (Precision * Recall) / (Precision + Recall) if (Precision + Recall) != 0 else 0 + Accuracy = (TP + TN) / (TP + TN + FP + FN) + MCC = ((TP * TN) - (FP * FN)) / np.sqrt((TP + FP) * (TP + FN) * (TN + FP) * (TN + FN)) if ((TP + FP) * (TP + FN) * (TN + FP) * (TN + FN)) != 0 else 0 + + # Append the values to the respective lists + FPR_values.append(FPR) + FNR_values.append(FNR) + TNR_values.append(TNR) + TPR_values.append(TPR) + F1_values.append(F1) + accuracy_values.append(Accuracy) + precision_values.append(Precision) + MCC_values.append(MCC) + recall_values.append(Recall) + + return FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values + +def plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values): + # Separate the values into two groups based on their proximity to 0 or 1 + close_to_0 = { + 'FPR': [], 'FNR': [] + } + close_to_1 = { + 'TNR': [], 'TPR': [], 'F1': [], 'accuracy': [], 'precision': [], 'MCC': [], 'recall': [] + } + + # Categorize the metrics into two groups + for i in range(len(FPR_values)): + close_to_0['FPR'].append(FPR_values[i]) + close_to_0['FNR'].append(FNR_values[i]) + + close_to_1['TNR'].append(TNR_values[i]) + close_to_1['TPR'].append(TPR_values[i]) + close_to_1['F1'].append(F1_values[i]) + close_to_1['accuracy'].append(accuracy_values[i]) + close_to_1['precision'].append(precision_values[i]) + close_to_1['MCC'].append(MCC_values[i]) + close_to_1['recall'].append(recall_values[i]) + + # Plot metrics for values close to 0 + plot_single_group(close_to_0, 'metrics_plot_close_to_0.png', is_close_to_0=True) + + # Plot metrics for values close to 1 + plot_single_group(close_to_1, 'metrics_plot_close_to_1.png') + + # Print the final values + print("\nFinal Metric Values:") + print(f"Final FPR: {FPR_values[-1]:.4f}") + print(f"Final FNR: {FNR_values[-1]:.4f}") + print(f"Final TNR: {TNR_values[-1]:.4f}") + print(f"Final TPR: {TPR_values[-1]:.4f}") + print(f"Final F1 Score: {F1_values[-1]:.4f}") + 
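
One edge case worth noting in this plotting script: if the log contains no lines with "TP:", process_file() returns nine empty lists, and both the [-1] indexing in these final prints and the min()/max() calls in plot_single_group() will raise. A minimal guard inside main(), sketched here rather than taken from any of the patches:

results = process_file(file_path)
if not results[0]:  # no "TP:" lines were parsed
    print("No 'TP:' lines found in the log; nothing to plot.")
    sys.exit(1)
plot_metrics(*results)
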
print(f"Final Accuracy: {accuracy_values[-1]:.4f}") + print(f"Final Precision: {precision_values[-1]:.4f}") + print(f"Final MCC: {MCC_values[-1]:.4f}") + print(f"Final Recall: {recall_values[-1]:.4f}") + +def plot_single_group(metrics_dict, output_filename, is_close_to_0=False): + plt.figure(figsize=(12, 8)) + + # Only plot the metrics that exist in the dictionary + if 'FPR' in metrics_dict: + plt.plot(metrics_dict['FPR'], label='False Positive Rate (FPR)', marker='o') + if 'FNR' in metrics_dict: + plt.plot(metrics_dict['FNR'], label='False Negative Rate (FNR)', marker='o') + if 'TNR' in metrics_dict: + plt.plot(metrics_dict['TNR'], label='True Negative Rate (TNR)', marker='o') + if 'TPR' in metrics_dict: + plt.plot(metrics_dict['TPR'], label='True Positive Rate (TPR)', marker='o') + if 'F1' in metrics_dict: + plt.plot(metrics_dict['F1'], label='F1 Score', marker='o') + if 'accuracy' in metrics_dict: + plt.plot(metrics_dict['accuracy'], label='Accuracy', marker='o') + if 'precision' in metrics_dict: + plt.plot(metrics_dict['precision'], label='Precision', marker='o') + if 'MCC' in metrics_dict: + plt.plot(metrics_dict['MCC'], label='Matthews Correlation Coefficient (MCC)', marker='o') + if 'recall' in metrics_dict: + plt.plot(metrics_dict['recall'], label='Recall (TPR)', marker='o') + + # Apply log scale by default + plt.yscale('log') + + # If the plot is close to 0, set custom ticks + if is_close_to_0: + # Add more ticks between 0 and 1 (using a logarithmic scale) + plt.yticks([0.01, 0.1, 1, 10, 100], ['0.01', '0.1', '1', '10', '100']) + + plt.xlabel('Index') + plt.ylabel('Metric Value') + plt.title(f'Evaluation Metrics Over Time ({output_filename.split("_")[2].replace(".png", "")})') + plt.legend() + + # Save the plot + plt.savefig(output_filename) + plt.close() + +def main(): + if len(sys.argv) != 2: + print("Usage: python script.py ") + sys.exit(1) + + file_path = sys.argv[1] + + FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values = process_file(file_path) + plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values) + +if __name__ == "__main__": + main() From 672a109958264697c25f80d7a25881c93752ce2e Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 18:20:52 +0000 Subject: [PATCH 379/498] Fix plots --- .../plot_testing_performance.py | 25 ++++++++++++------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/modules/flowmldetection/plot_testing_performance.py b/modules/flowmldetection/plot_testing_performance.py index 8f9e12cd8..69b8c96a8 100644 --- a/modules/flowmldetection/plot_testing_performance.py +++ b/modules/flowmldetection/plot_testing_performance.py @@ -71,11 +71,11 @@ def plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accu close_to_1['MCC'].append(MCC_values[i]) close_to_1['recall'].append(recall_values[i]) - # Plot metrics for values close to 0 + # Plot metrics for values close to 0 (linear scale) plot_single_group(close_to_0, 'metrics_plot_close_to_0.png', is_close_to_0=True) - # Plot metrics for values close to 1 - plot_single_group(close_to_1, 'metrics_plot_close_to_1.png') + # Plot metrics for values close to 1 (log scale) + plot_single_group(close_to_1, 'metrics_plot_close_to_1.png', is_close_to_0=False) # Print the final values print("\nFinal Metric Values:") @@ -112,14 +112,21 @@ def plot_single_group(metrics_dict, output_filename, is_close_to_0=False): if 'recall' in 
metrics_dict: plt.plot(metrics_dict['recall'], label='Recall (TPR)', marker='o') - # Apply log scale by default - plt.yscale('log') + # If the plot is close to 1, apply log scale + if not is_close_to_0: + plt.yscale('log') - # If the plot is close to 0, set custom ticks + # If the plot is close to 0, set dynamic Y-ticks based on the min/max values of the series if is_close_to_0: - # Manually set more Y-ticks for better visibility - plt.ylim(0.0001, 1) # Set Y-axis limits between 0.0001 and 1 - plt.yticks([0.0001, 0.001, 0.01, 0.1, 1], ['0.0001', '0.001', '0.01', '0.1', '1']) # Adjust Y-ticks + min_val = min(min(metrics_dict['FPR']), min(metrics_dict['FNR'])) + max_val = max(max(metrics_dict['FPR']), max(metrics_dict['FNR'])) + + # Avoid log(0), so set the minimum limit a little higher than zero + if min_val == 0: + min_val = 1e-4 # Avoid zero values on the logarithmic scale + + plt.ylim(min_val, max_val) # Set Y-axis limits based on the data range + plt.yticks(np.logspace(np.log10(min_val), np.log10(max_val), num=6)) # Set ticks logarithmically plt.xlabel('Index') plt.ylabel('Metric Value') From aa87ed17add17251345579b8963bda7230043c6b Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 18:23:02 +0000 Subject: [PATCH 380/498] Fix plots --- modules/flowmldetection/plot_testing_performance.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/flowmldetection/plot_testing_performance.py b/modules/flowmldetection/plot_testing_performance.py index 69b8c96a8..de4ada38b 100644 --- a/modules/flowmldetection/plot_testing_performance.py +++ b/modules/flowmldetection/plot_testing_performance.py @@ -123,10 +123,10 @@ def plot_single_group(metrics_dict, output_filename, is_close_to_0=False): # Avoid log(0), so set the minimum limit a little higher than zero if min_val == 0: - min_val = 1e-4 # Avoid zero values on the logarithmic scale + min_val = 1e-8 # Avoid zero values on the logarithmic scale plt.ylim(min_val, max_val) # Set Y-axis limits based on the data range - plt.yticks(np.logspace(np.log10(min_val), np.log10(max_val), num=6)) # Set ticks logarithmically + plt.yticks(np.logspace(np.log10(min_val), np.log10(max_val), num=60)) # Set ticks logarithmically plt.xlabel('Index') plt.ylabel('Metric Value') From 148181f2d4f0d08df508dc85b545c5a18f2a6c3b Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 18:25:58 +0000 Subject: [PATCH 381/498] Change plot names --- modules/flowmldetection/plot_testing_performance.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/flowmldetection/plot_testing_performance.py b/modules/flowmldetection/plot_testing_performance.py index de4ada38b..1b4152c6e 100644 --- a/modules/flowmldetection/plot_testing_performance.py +++ b/modules/flowmldetection/plot_testing_performance.py @@ -72,10 +72,10 @@ def plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accu close_to_1['recall'].append(recall_values[i]) # Plot metrics for values close to 0 (linear scale) - plot_single_group(close_to_0, 'metrics_plot_close_to_0.png', is_close_to_0=True) + plot_single_group(close_to_0, 'performance_metrics_testing_close_to_0.png', is_close_to_0=True) # Plot metrics for values close to 1 (log scale) - plot_single_group(close_to_1, 'metrics_plot_close_to_1.png', is_close_to_0=False) + plot_single_group(close_to_1, 'performnace_metrics_teting_close_to_1.png', is_close_to_0=False) # Print the final values print("\nFinal Metric Values:") From 057beb3ae401f31c605fe6845957090faec1e195 Mon Sep 17 
00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 18:26:09 +0000 Subject: [PATCH 382/498] Rename file --- .../{plot_train_score.py => plot_train_performance.py} | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) rename modules/flowmldetection/{plot_train_score.py => plot_train_performance.py} (97%) diff --git a/modules/flowmldetection/plot_train_score.py b/modules/flowmldetection/plot_train_performance.py similarity index 97% rename from modules/flowmldetection/plot_train_score.py rename to modules/flowmldetection/plot_train_performance.py index 8437e968a..80e13e951 100644 --- a/modules/flowmldetection/plot_train_score.py +++ b/modules/flowmldetection/plot_train_performance.py @@ -28,7 +28,7 @@ def plot_log_data(file_path): # Get the directory of the log file to store the plot in the same folder dir_name = os.path.dirname(file_path) - plot_file = os.path.join(dir_name, 'log_data_plot_with_two_scales.png') + plot_file = os.path.join(dir_name, 'performance_metrics_training.png') # Plotting the values fig, ax1 = plt.subplots(figsize=(10, 6)) From f8aa2eb76ccca709d051497f7ca76b8316de4a47 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 18:31:32 +0000 Subject: [PATCH 383/498] Recover good flowmldetection deleted by mistake --- modules/flowmldetection/flowmldetection.py | 709 ++++++++++++++++----- 1 file changed, 566 insertions(+), 143 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 37f076110..5e4e9aa46 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -1,143 +1,566 @@ -import matplotlib.pyplot as plt -import sys -import numpy as np - -def process_file(file_path): - # Initialize the counters for the values - FPR_values = [] - FNR_values = [] - TNR_values = [] - TPR_values = [] - F1_values = [] - accuracy_values = [] - precision_values = [] - MCC_values = [] - recall_values = [] - - # Read the file and extract the data - with open(file_path, 'r') as file: - for line in file: - if "TP:" in line: - # Extract the values from the line - parts = line.split(',') - TP = int(parts[0].split(':')[1].strip()) - TN = int(parts[1].split(':')[1].strip()) - FP = int(parts[2].split(':')[1].strip()) - FN = int(parts[3].split(':')[1].strip()) - - # Calculate metrics - FPR = FP / (FP + TN) if (FP + TN) != 0 else 0 - FNR = FN / (FN + TP) if (FN + TP) != 0 else 0 - TNR = TN / (TN + FP) if (TN + FP) != 0 else 0 - TPR = TP / (TP + FN) if (TP + FN) != 0 else 0 - Precision = TP / (TP + FP) if (TP + FP) != 0 else 0 - Recall = TPR # Recall is the same as TPR - F1 = 2 * (Precision * Recall) / (Precision + Recall) if (Precision + Recall) != 0 else 0 - Accuracy = (TP + TN) / (TP + TN + FP + FN) - MCC = ((TP * TN) - (FP * FN)) / np.sqrt((TP + FP) * (TP + FN) * (TN + FP) * (TN + FN)) if ((TP + FP) * (TP + FN) * (TN + FP) * (TN + FN)) != 0 else 0 - - # Append the values to the respective lists - FPR_values.append(FPR) - FNR_values.append(FNR) - TNR_values.append(TNR) - TPR_values.append(TPR) - F1_values.append(F1) - accuracy_values.append(Accuracy) - precision_values.append(Precision) - MCC_values.append(MCC) - recall_values.append(Recall) - - return FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values - -def plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values): - # Separate the values into two groups based on their proximity to 0 or 
1 - close_to_0 = { - 'FPR': [], 'FNR': [] - } - close_to_1 = { - 'TNR': [], 'TPR': [], 'F1': [], 'accuracy': [], 'precision': [], 'MCC': [], 'recall': [] - } - - # Categorize the metrics into two groups - for i in range(len(FPR_values)): - close_to_0['FPR'].append(FPR_values[i]) - close_to_0['FNR'].append(FNR_values[i]) - - close_to_1['TNR'].append(TNR_values[i]) - close_to_1['TPR'].append(TPR_values[i]) - close_to_1['F1'].append(F1_values[i]) - close_to_1['accuracy'].append(accuracy_values[i]) - close_to_1['precision'].append(precision_values[i]) - close_to_1['MCC'].append(MCC_values[i]) - close_to_1['recall'].append(recall_values[i]) - - # Plot metrics for values close to 0 - plot_single_group(close_to_0, 'metrics_plot_close_to_0.png', is_close_to_0=True) - - # Plot metrics for values close to 1 - plot_single_group(close_to_1, 'metrics_plot_close_to_1.png') - - # Print the final values - print("\nFinal Metric Values:") - print(f"Final FPR: {FPR_values[-1]:.4f}") - print(f"Final FNR: {FNR_values[-1]:.4f}") - print(f"Final TNR: {TNR_values[-1]:.4f}") - print(f"Final TPR: {TPR_values[-1]:.4f}") - print(f"Final F1 Score: {F1_values[-1]:.4f}") - print(f"Final Accuracy: {accuracy_values[-1]:.4f}") - print(f"Final Precision: {precision_values[-1]:.4f}") - print(f"Final MCC: {MCC_values[-1]:.4f}") - print(f"Final Recall: {recall_values[-1]:.4f}") - -def plot_single_group(metrics_dict, output_filename, is_close_to_0=False): - plt.figure(figsize=(12, 8)) - - # Only plot the metrics that exist in the dictionary - if 'FPR' in metrics_dict: - plt.plot(metrics_dict['FPR'], label='False Positive Rate (FPR)', marker='o') - if 'FNR' in metrics_dict: - plt.plot(metrics_dict['FNR'], label='False Negative Rate (FNR)', marker='o') - if 'TNR' in metrics_dict: - plt.plot(metrics_dict['TNR'], label='True Negative Rate (TNR)', marker='o') - if 'TPR' in metrics_dict: - plt.plot(metrics_dict['TPR'], label='True Positive Rate (TPR)', marker='o') - if 'F1' in metrics_dict: - plt.plot(metrics_dict['F1'], label='F1 Score', marker='o') - if 'accuracy' in metrics_dict: - plt.plot(metrics_dict['accuracy'], label='Accuracy', marker='o') - if 'precision' in metrics_dict: - plt.plot(metrics_dict['precision'], label='Precision', marker='o') - if 'MCC' in metrics_dict: - plt.plot(metrics_dict['MCC'], label='Matthews Correlation Coefficient (MCC)', marker='o') - if 'recall' in metrics_dict: - plt.plot(metrics_dict['recall'], label='Recall (TPR)', marker='o') - - # Apply log scale by default - plt.yscale('log') - - # If the plot is close to 0, set custom ticks - if is_close_to_0: - # Add more ticks between 0 and 1 (using a logarithmic scale) - plt.yticks([0.01, 0.1, 1, 10, 100], ['0.01', '0.1', '1', '10', '100']) - - plt.xlabel('Index') - plt.ylabel('Metric Value') - plt.title(f'Evaluation Metrics Over Time ({output_filename.split("_")[2].replace(".png", "")})') - plt.legend() - - # Save the plot - plt.savefig(output_filename) - plt.close() - -def main(): - if len(sys.argv) != 2: - print("Usage: python script.py ") - sys.exit(1) - - file_path = sys.argv[1] - - FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values = process_file(file_path) - plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values) - -if __name__ == "__main__": - main() +# SPDX-FileCopyrightText: 2021 Sebastian Garcia +from typing import Optional + +# SPDX-License-Identifier: GPL-2.0-only +import numpy +from 
sklearn.linear_model import SGDClassifier +from sklearn.preprocessing import StandardScaler +import pickle +import pandas as pd +import json +import traceback +import warnings +import os + +from slips_files.common.parsers.config_parser import ConfigParser +from slips_files.common.slips_utils import utils +from slips_files.common.abstracts.module import IModule +from slips_files.core.structures.evidence import ( + Evidence, + ProfileID, + TimeWindow, + Attacker, + ThreatLevel, + EvidenceType, + IoCType, + Direction, + Victim, + Method, +) + +# This horrible hack is only to stop sklearn from printing those warnings +def warn(*args, **kwargs): + pass + + +warnings.warn = warn + + +class FlowMLDetection(IModule): + # Name: short name of the module. Do not use spaces + name = "Flow ML Detection" + description = ( + "Train or test a Machine Learning model to detect malicious flows" + ) + authors = ["Sebastian Garcia"] + + def init(self): + # Subscribe to the channel + self.c1 = self.db.subscribe("new_flow") + self.channels = {"new_flow": self.c1} + self.fieldseparator = self.db.get_field_separator() + # Set the output queue of our database instance + # Read the configuration + self.read_configuration() + # Minum amount of new labels needed to start the train + self.minimum_labels_to_start_train = 50 + # Minum amount of new labels needed to retrain + self.minimum_labels_to_retrain = 50 + # The number of flows when last trained. Used internally only to know + # when to retrain + self.last_number_of_flows_when_trained = 0 + # The scaler trained during training and to use during testing + self.scaler = StandardScaler() + self.model_path = "./modules/flowmldetection/model.bin" + self.scaler_path = "./modules/flowmldetection/scaler.bin" + + # Initialize the training log file + self.training_log_path = "./modules/flowmldetection/training.log" + with open(self.training_log_path, "w") as log_file: + log_file.write("Training Log Initialized\n") + + def read_configuration(self): + conf = ConfigParser() + self.mode = conf.get_ml_mode() + # This is the global label in the configuration, + # in case the flows do not have a label themselves + self.label = conf.label() + + def write_to_training_log(self, message: str): + """ + Write a message to the training log file. + """ + try: + with open(self.training_log_path, "a") as log_file: + log_file.write(message + "\n") + except Exception as e: + self.print(f"Error writing to training log: {e}", 0, 1) + + def train(self, sum_labeled_flows): + """ + Train a model based on the flows we receive and the labels + """ + try: + # Create X_flow with the current flows minus the label + X_flow = self.flows.drop("ground_truth_label", axis=1) + # Drop the detailed labels + X_flow = X_flow.drop("detailed_ground_truth_label", axis=1) + # Drop the module_labels + X_flow = X_flow.drop("module_labels", axis=1) + # Create y_flow with the label + y_flow = numpy.full(X_flow.shape[0], self.flows.ground_truth_label) + + # Normalize this batch of data so far. 
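
As the surrounding comment notes ("Normalize this batch of data so far. This can get progressively slow"), scaler.fit_transform() refits the scaler from scratch on every training round. StandardScaler also supports incremental statistics via partial_fit(); a minimal sketch with stand-in data, assuming the feature layout stays fixed between batches:

import numpy as np
from sklearn.preprocessing import StandardScaler

batches = [np.random.rand(50, 4) for _ in range(3)]  # stand-in feature batches

scaler = StandardScaler()
for X_batch in batches:
    scaler.partial_fit(X_batch)   # update running mean/variance incrementally
    X_scaled = scaler.transform(X_batch)

Relatedly, SGDClassifier.partial_fit() in the training code below needs the classes argument on its first call; passing the same list again on later calls is accepted.
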
This can get progressivle slow + X_flow = self.scaler.fit_transform(X_flow) + + # Count the number of labels of each type in this epoc + epoch_label_counts = { + "Background": (y_flow == "Background").sum(), + "Malicious": (y_flow == "Malicious").sum(), + "Benign": (y_flow == "Benign").sum(), + } + + # Train + try: + # Online incremental learning + self.clf.partial_fit( + X_flow, y_flow, classes=["Background", "Malicious", "Benign"] + ) + except Exception: + self.print("Error while calling clf.train()") + self.print(traceback.format_exc(), 0, 1) + + # See score so far in training + score = self.clf.score(X_flow, y_flow) + + #self.print(f" Training Score: {score}", 1, 0) + #self.print(f' Model Parameters: {self.clf.coef_}', 1, 0) + + # Store the models on disk + self.store_model() + + # Log training information + self.write_to_training_log(f"Training completed. Background: {epoch_label_counts['Background']}. Benign: {epoch_label_counts['Benign']}. Malicious: {epoch_label_counts['Malicious']}. Total labels: {sum_labeled_flows}. Score: {score}") + #self.write_to_training_log(f"Model parameters: {self.clf.coef_}") + except Exception: + self.print("Error in train().", 0, 1) + self.print(traceback.format_exc(), 0, 1) + self.write_to_training_log("Error occurred during training.") + + def process_features(self, dataset): + """ + Discards some features of the dataset and can create new. + Clean the dataset + """ + try: + # Discard some type of flows that dont have ports + to_discard = ["arp", "ARP", "icmp", "igmp", "ipv6-icmp", ""] + for proto in to_discard: + dataset = dataset[dataset.proto != proto] + + # If te proto is in the list to delete and there is only one flow, then the dataset will be empty + if dataset.empty: + # DataFrame is empty now, so return empty + return dataset + + # For now, discard these + to_drop = [ + "appproto", + "daddr", + "saddr", + "starttime", + "type_", + "smac", + "dmac", + "history", + "uid", + "dir_", + "endtime", + "flow_source", + ] + for field in to_drop: + try: + dataset = dataset.drop(field, axis=1) + except (ValueError, KeyError): + pass + + # When flows are read from Slips sqlite, + # the state is not transformed to 'Established' or + # 'Not Established', it is still 'S0' and others + # So transform here + dataset["state"] = dataset.apply( + lambda row: self.db.get_final_state_from_flags( + row["state"], (row["spkts"] + row["dpkts"]) + ), + axis=1, + ) + + # Convert state to categorical + dataset.state = dataset.state.str.replace( + r"(^.*Not Established.*$)", "0", regex=True + ) + dataset.state = dataset.state.str.replace( + r"(^.*Established.*$)", "1", regex=True + ) + + # Convert categories to floats + dataset.state = dataset.state.astype("float64") + + # Convert proto to categorical. For now we only have few states, so we can hardcode... + # We dont use the data to create categories because in testing mode + # we dont see all the protocols + # Also we dont store the Categorizer because the user can retrain + # with its own data. 
+ dataset.proto = dataset.proto.str.lower() + dataset.proto = dataset.proto.str.replace( + r"(^.*tcp.*$)", "0", regex=True + ) + dataset.proto = dataset.proto.str.replace( + r"(^.*udp.*$)", "1", regex=True + ) + dataset.proto = dataset.proto.str.replace( + r"(^.*icmp.*$)", "2", regex=True + ) + dataset.proto = dataset.proto.str.replace( + r"(^.*icmp-ipv6.*$)", "3", regex=True + ) + dataset.proto = dataset.proto.str.replace( + r"(^.*arp.*$)", "4", regex=True + ) + + dataset["allbytes"] = dataset["sbytes"] + dataset["dbytes"] + dataset["pkts"] = dataset["spkts"] + dataset["dpkts"] + + fields_to_convert_to_float = [ + dataset.proto, + dataset.dport, + dataset.sport, + dataset.dur, + dataset.pkts, + dataset.spkts, + dataset.allbytes, + dataset.sbytes, + dataset.state, + ] + for field in fields_to_convert_to_float: + try: + field = field.astype("float64") + except (ValueError, AttributeError): + pass + + return dataset + except Exception: + # Stop the timer + self.print("Error in process_features()") + self.print(traceback.format_exc(), 0, 1) + + def process_training_flows(self, last_number_of_flows_when_trained): + """ + Process only the new flows in the DB since the last training. + Store the pandas df in self.flows + """ + try: + # Ensure the index is an integer + if last_number_of_flows_when_trained is None: + last_number_of_flows_when_trained = 0 + else: + last_number_of_flows_when_trained = int(last_number_of_flows_when_trained) + + # We get all the flows so far + flows = self.db.get_all_flows() + # Only process new flows since last training + new_flows = flows[last_number_of_flows_when_trained:] + + # Check how many **different** labels are in the DB + labels = self.db.get_labels() + if len(labels) == 1: + # Insert fake flows for both classes if needed + new_flows.append( + { + "starttime": 1594417039.029793, + "dur": "1.9424750804901123", + "saddr": "10.7.10.101", + "sport": "49733", + "daddr": "40.70.224.145", + "dport": "443", + "proto": "tcp", + "state": "SF", + "spkts": 17, + "dpkts": 27, + "sbytes": 25517, + "dbytes": 17247, + "appproto": "ssl", + "ground_truth_label": "Malicious", + "module_labels": { + "flowalerts-long-connection": "Malicious" + }, + } + ) + new_flows.append( + { + "starttime": 1382355032.706468, + "dur": "10.896695", + "saddr": "147.32.83.52", + "sport": "47956", + "daddr": "80.242.138.72", + "dport": "80", + "proto": "tcp", + "state": "SF", + "spkts": 1, + "dpkts": 0, + "sbytes": 100, + "dbytes": 67596, + "appproto": "http", + "ground_truth_label": "Benign", + "module_labels": { + "flowalerts-long-connection": "Benign" + }, + } + ) + + # Convert to pandas df + df_flows = pd.DataFrame(new_flows) + + # Process features + df_flows = self.process_features(df_flows) + + # Update the flow to the processed version + self.flows = df_flows + except Exception: + self.print("Error in process_flows()") + self.print(traceback.format_exc(), 0, 1) + + def process_flow(self, flow_to_process: dict): + """ + Process one flow. 
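
The long error comments in detect() below document feature-name mismatches between fit time and predict time (for example, Argus binetflows carrying a bytes column instead of allbytes). One defensive option is to realign the test-time columns with whatever the scaler saw at fit time; a hypothetical sketch, assuming a scikit-learn version (>= 1.0) that sets feature_names_in_ when the scaler is fitted on a DataFrame with named columns:

# Inside detect(), before scaling (hypothetical guard, not in these patches):
expected = getattr(self.scaler, "feature_names_in_", None)
if expected is not None:
    # Drop unexpected columns and restore the fit-time order;
    # columns missing from this flow become NaN and fail fast.
    x_flow = x_flow.reindex(columns=list(expected))
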
Only used during detection in testing + returns the pandas df with the processed flow + """ + try: + # Convert the flow to a pandas dataframe + raw_flow = pd.DataFrame(flow_to_process, index=[0]) + dflow = self.process_features(raw_flow) + if dflow.empty: + return None + # Update the flow to the processed version + return dflow + except Exception: + # Stop the timer + self.print("Error in process_flow()") + self.print(traceback.format_exc(), 0, 1) + + def detect(self, x_flow) -> Optional[numpy.ndarray]: + """ + Detects the given flow with the current model stored + and returns the predection array + """ + try: + # clean the flow + fields_to_drop = [ + "label", + "module_labels", + "uid", + "history", + "dir_", + "endtime", + "flow_source", + "ground_truth_label", + "detailed_ground_truth_label", + ] + # For argus binetflows this fails because ther is a field calle bytes that was not in other flows. It should be called allbytes. + # Error + ''' [Flow ML Detection] Error in detect() while processing + dur proto sport dport state pkts spkts dpkts bytes sbytes dbytes allbytes + 0 63.822830 0 56119 981 0.0 15 15 0 8764 1887 0 1887 + The feature names should match those that were passed during fit. + Feature names unseen at fit time: + - bytes + ''' + + # IF we delete here the filed bytes the error is + # [Flow ML Detection] Error in detect() while processing + # dur proto sport dport state pkts spkts dpkts sbytes dbytes allbytes + # 0 63.822830 0 56120 980 0.0 15 15 0 1887 0 1887 + # The feature names should match those that were passed during fit. + # Feature names must be in the same order as they were in fit. + + for field in fields_to_drop: + try: + x_flow = x_flow.drop(field, axis=1) + except (KeyError, ValueError): + pass + # Scale the flow + x_flow: numpy.ndarray = self.scaler.transform(x_flow) + pred: numpy.ndarray = self.clf.predict(x_flow) + return pred + except Exception as e: + self.print( + f"Error in detect() while processing " f"\n{x_flow}\n{e}" + ) + self.print(traceback.format_exc(), 0, 1) + + def store_model(self): + """ + Store the trained model on disk + """ + self.print("Storing the trained model and scaler on disk.", 0, 2) + with open(self.model_path, "wb") as f: + data = pickle.dumps(self.clf) + f.write(data) + with open(self.scaler_path, "wb") as g: + data = pickle.dumps(self.scaler) + g.write(data) + + def read_model(self): + """ + Read the trained model from disk + """ + try: + self.print("Reading the trained model from disk.", 0, 2) + with open(self.model_path, "rb") as f: + self.clf = pickle.load(f) + self.print("Reading the trained scaler from disk.", 0, 2) + with open(self.scaler_path, "rb") as g: + self.scaler = pickle.load(g) + except FileNotFoundError: + # If there is no model, create one empty + self.print( + "There was no model. " "Creating a new empty model.", 0, 2 + ) + self.clf = SGDClassifier( + warm_start=True, loss="hinge", penalty="l1" + ) + except EOFError: + self.print( + "Error reading model from disk. " + "Creating a new empty model.", + 0, + 2, + ) + self.clf = SGDClassifier( + warm_start=True, loss="hinge", penalty="l1" + ) + + def set_evidence_malicious_flow(self, flow: dict, twid: str): + confidence: float = 0.1 + description = ( + f"Flow with malicious characteristics by ML. 
Src IP" + f" {flow['saddr']}:{flow['sport']} to " + f"{flow['daddr']}:{flow['dport']}" + ) + twid_number = int(twid.replace("timewindow", "")) + evidence: Evidence = Evidence( + evidence_type=EvidenceType.MALICIOUS_FLOW, + attacker=Attacker( + direction=Direction.SRC, + ioc_type=IoCType.IP, + value=flow["saddr"], + ), + victim=Victim( + direction=Direction.DST, + ioc_type=IoCType.IP, + value=flow["daddr"], + ), + threat_level=ThreatLevel.LOW, + confidence=confidence, + description=description, + profile=ProfileID(ip=flow["saddr"]), + timewindow=TimeWindow(twid_number), + uid=[flow["uid"]], + timestamp=flow["starttime"], + method=Method.AI, + src_port=flow["sport"], + dst_port=flow["dport"], + ) + + self.db.set_evidence(evidence) + + def shutdown_gracefully(self): + # Confirm that the module is done processing + if self.mode == "train": + self.store_model() + + def pre_main(self): + utils.drop_root_privs() + # Load the model + self.read_model() + + def main(self): + if msg := self.get_msg("new_flow"): + # When a new flow arrives + msg = json.loads(msg["data"]) + self.twid = msg["twid"] + self.profileid = msg["profileid"] + self.flow = msg["flow"] + # These following extra fields are expected in testing. update the original + # flow dict to have them + self.flow.update( + { + "state": msg["interpreted_state"], + "label": msg["label"], + "module_labels": msg["module_labels"], + } + ) + + if self.mode == "train": + # We are training + + # Is the amount in the DB of labels enough to retrain? + # Use labeled flows + labels = self.db.get_labels() + sum_labeled_flows = sum(i[1] for i in labels) + + # The min labels to retrain is the min number of flows + # we should have seen so far in this capture to start training + # This is so we dont _start_ training with only 1 flow + + # Once we are over the start minimum, the second condition is + # to force to retrain every a minimum_labels_to_retrain number + # of flows. So we dont retrain every 1 flow. + if ( + sum_labeled_flows >= self.minimum_labels_to_start_train + ): + if (sum_labeled_flows - self.last_number_of_flows_when_trained >= self.minimum_labels_to_retrain): + # So for example we retrain every 50 labels and only when + # we have at least 50 labels + self.print( + f"Training the model with the last group of " + f"flows and labels. Total flows: {sum_labeled_flows}." + ) + # Process all flows in the DB and make them ready + # for pandas + self.process_training_flows(self.last_number_of_flows_when_trained) + # Train an algorithm + self.train(sum_labeled_flows) + self.last_number_of_flows_when_trained = sum_labeled_flows + + elif self.mode == "test": + # We are testing, which means using the model to detect + processed_flow = self.process_flow(self.flow) + + # After processing the flow, it may happen that we + # delete icmp/arp/etc so the dataframe can be empty + if processed_flow is not None and not processed_flow.empty: + # Predict + pred: numpy.ndarray = self.detect(processed_flow) + if not pred: + # an error occurred + return + + label = self.flow["label"] + if label and label != "unknown" and label != pred[0]: + # If the user specified a label in test mode, + # and the label is diff from the prediction, + # print in debug mode + self.print( + f"Predicted {pred[0]} for ground-truth label" + f' {label}. 
Flow {self.flow["saddr"]}:'
                        f'{self.flow["sport"]} ->'
                        f' {self.flow["daddr"]}:'
                        f'{self.flow["dport"]}/'
                        f'{self.flow["proto"]}',
                        0,
                        3,
                    )
                if pred[0] == "Malicious":
                    # Generate an alert
                    self.set_evidence_malicious_flow(self.flow, self.twid)
                    self.print(
                        f"Prediction {pred[0]} for label {label}"
                        f' flow {self.flow["saddr"]}:'
                        f'{self.flow["sport"]} -> '
                        f'{self.flow["daddr"]}:'
                        f'{self.flow["dport"]}/'
                        f'{self.flow["proto"]}',
                        0,
                        2,
                    )
\ No newline at end of file

From f53d7e6c8528af2bf011039e37324b1249bfbaa8 Mon Sep 17 00:00:00 2001
From: Seba Garcia
Date: Sat, 3 May 2025 18:31:43 +0000
Subject: [PATCH 384/498] Fix plot test

---
 modules/flowmldetection/plot_testing_performance.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/modules/flowmldetection/plot_testing_performance.py b/modules/flowmldetection/plot_testing_performance.py
index 1b4152c6e..977a68b2d 100644
--- a/modules/flowmldetection/plot_testing_performance.py
+++ b/modules/flowmldetection/plot_testing_performance.py
@@ -75,7 +75,7 @@ def plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accu
     plot_single_group(close_to_0, 'performance_metrics_testing_close_to_0.png', is_close_to_0=True)

     # Plot metrics for values close to 1 (log scale)
-    plot_single_group(close_to_1, 'performnace_metrics_teting_close_to_1.png', is_close_to_0=False)
+    plot_single_group(close_to_1, 'performance_metrics_testing_close_to_1.png', is_close_to_0=False)

     # Print the final values
     print("\nFinal Metric Values:")
@@ -123,10 +123,10 @@ def plot_single_group(metrics_dict, output_filename, is_close_to_0=False):

     # Avoid log(0), so set the minimum limit a little higher than zero
     if min_val == 0:
-        min_val = 1e-8  # Avoid zero values on the logarithmic scale
+        min_val = 1e-4  # Avoid zero values on the logarithmic scale

     plt.ylim(min_val, max_val)  # Set Y-axis limits based on the data range
-    plt.yticks(np.logspace(np.log10(min_val), np.log10(max_val), num=60))  # Set ticks logarithmically
+    plt.yticks(np.logspace(np.log10(min_val), np.log10(max_val), num=6))  # Set ticks logarithmically

     plt.xlabel('Index')
     plt.ylabel('Metric Value')
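The TP/TN/FP/FN counters that the next patch starts logging are the raw inputs
for every rate the script above plots. As a minimal standalone sketch of that
relationship (the counter values below are invented for illustration, and the
snippet is not part of the module itself):

    # Hypothetical counts, as logged per flow in testing_performance.log
    tp, tn, fp, fn = 90, 800, 10, 5

    # Standard rate definitions, guarding against empty denominators
    fpr = fp / (fp + tn) if (fp + tn) else 0.0   # false positive rate
    fnr = fn / (fn + tp) if (fn + tp) else 0.0   # false negative rate
    tpr = tp / (tp + fn) if (tp + fn) else 0.0   # true positive rate / recall
    precision = tp / (tp + fp) if (tp + fp) else 0.0
    f1 = 2 * precision * tpr / (precision + tpr) if (precision + tpr) else 0.0
    print(f"FPR={fpr:.4f}, FNR={fnr:.4f}, TPR={tpr:.4f}, F1={f1:.4f}")

The same denominator guards appear later in the module's own training-metrics
code.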
From 6a2c1379d07b8d65b1e9fbbd3c6c64061723f8b7 Mon Sep 17 00:00:00 2001
From: Seba Garcia
Date: Sat, 3 May 2025 18:50:33 +0000
Subject: [PATCH 385/498] Add testing code to evaluate performance. It is
 optional with a variable

---
 modules/flowmldetection/flowmldetection.py | 60 +++++++++++++++------
 1 file changed, 42 insertions(+), 18 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 5e4e9aa46..b17a1baaf 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -526,36 +526,21 @@ def main(self):
         elif self.mode == "test":
             # We are testing, which means using the model to detect
             processed_flow = self.process_flow(self.flow)
-
             # After processing the flow, it may happen that we
             # delete icmp/arp/etc so the dataframe can be empty
             if processed_flow is not None and not processed_flow.empty:
+                original_label = processed_flow["ground_truth_label"].iloc[0]
                 # Predict
                 pred: numpy.ndarray = self.detect(processed_flow)
                 if not pred:
                     # an error occurred
                     return

-                label = self.flow["label"]
-                if label and label != "unknown" and label != pred[0]:
-                    # If the user specified a label in test mode,
-                    # and the label is diff from the prediction,
-                    # print in debug mode
-                    self.print(
-                        f"Predicted {pred[0]} for ground-truth label"
-                        f' {label}. Flow {self.flow["saddr"]}:'
-                        f'{self.flow["sport"]} ->'
-                        f' {self.flow["daddr"]}:'
-                        f'{self.flow["dport"]}/'
-                        f'{self.flow["proto"]}',
-                        0,
-                        3,
-                    )
                 if pred[0] == "Malicious":
                     # Generate an alert
                     self.set_evidence_malicious_flow(self.flow, self.twid)
                     self.print(
-                        f"Prediction {pred[0]} for label {label}"
+                        f"Prediction {pred[0]} for label {original_label}"
                         f' flow {self.flow["saddr"]}:'
                         f'{self.flow["sport"]} -> '
                         f'{self.flow["daddr"]}:'
@@ -563,4 +548,43 @@ def main(self):
                         f'{self.flow["proto"]}',
                         0,
                         2,
-                    )
\ No newline at end of file
+                    )
+
+            # Set log_testing_data to False to disable this block easily;
+            # it is only used to evaluate the testing performance
+            log_testing_data = True
+            if log_testing_data:
+                # Initialize counters if not already done
+                if not hasattr(self, 'tp'):
+                    self.tp = 0
+                if not hasattr(self, 'tn'):
+                    self.tn = 0
+                if not hasattr(self, 'fp'):
+                    self.fp = 0
+                if not hasattr(self, 'fn'):
+                    self.fn = 0
+
+                # Update counters based on predictions and labels
+                if pred[0] == "Malicious" and original_label == "Malicious":
+                    self.tp += 1
+                elif pred[0] == "Benign" and original_label == "Benign":
+                    self.tn += 1
+                elif pred[0] == "Malicious" and original_label == "Benign":
+                    self.fp += 1
+                elif pred[0] == "Benign" and original_label == "Malicious":
+                    self.fn += 1
+
+                testing_log_path = "./modules/flowmldetection/testing_performance.log"
+                try:
+                    with open(testing_log_path, "a") as log_file:
+                        log_file.write("Testing Performance Log Initialized\n")
+                        # Log the testing performance metrics
+                        log_file.write(f"TP: {self.tp}, TN: {self.tn}, FP: {self.fp}, FN: {self.fn}\n")
+
+                        # Log the original flow for false positives and false negatives
+                        if pred[0] == "Malicious" and original_label == "Benign":
+                            log_file.write(f"False Positive Flow: {self.flow}\n")
+                        elif pred[0] == "Benign" and original_label == "Malicious":
+                            log_file.write(f"False Negative Flow: {self.flow}\n")
+                except Exception as e:
+                    self.print(f"Error writing to the testing performance log: {e}", 0, 1)
\ No newline at end of file
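The plotting scripts above floor the log axis at min_val = 1e-4 because log(0)
is undefined, and rates that span several decades are unreadable without a
logarithmic scale. A self-contained sketch of that floor trick (the series
values and the output filename are invented for illustration):

    import numpy as np
    import matplotlib.pyplot as plt

    # Toy rate series spanning several decades, including an exact zero
    rates = [0.2, 0.05, 0.01, 0.0, 0.004]
    floored = [max(r, 1e-4) for r in rates]  # same floor idea as min_val above
    plt.plot(floored, label="rate")
    plt.yscale("log")  # log scale keeps the small values distinguishable
    plt.yticks(np.logspace(np.log10(1e-4), 0, num=5))
    plt.legend()
    plt.savefig("log_floor_example.png")  # illustrative filename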
From 9fd5cff376977d9a4d970033c5e824d80fed51a6 Mon Sep 17 00:00:00 2001
From: Seba Garcia
Date: Sat, 3 May 2025 19:04:00 +0000
Subject: [PATCH 386/498] Fix plots

---
 .../plot_testing_performance.py | 30 +++++++++++--------
 1 file changed, 18 insertions(+), 12 deletions(-)

diff --git a/modules/flowmldetection/plot_testing_performance.py b/modules/flowmldetection/plot_testing_performance.py
index 977a68b2d..6865415cd 100644
--- a/modules/flowmldetection/plot_testing_performance.py
+++ b/modules/flowmldetection/plot_testing_performance.py
@@ -1,6 +1,7 @@
 import matplotlib.pyplot as plt
 import sys
 import numpy as np
+import argparse

 def process_file(file_path):
     # Initialize the counters for the values
@@ -49,7 +50,7 @@ def process_file(file_path):

     return FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values

-def plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values):
+def plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values, experiment_number):
     # Separate the values into two groups based on their proximity to 0 or 1
     close_to_0 = {
         'FPR': [], 'FNR': []
@@ -72,13 +73,13 @@ def plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accu
         close_to_1['recall'].append(recall_values[i])

     # Plot metrics for values close to 0 (linear scale)
-    plot_single_group(close_to_0, 'performance_metrics_testing_close_to_0.png', is_close_to_0=True)
+    plot_single_group(close_to_0, f'performance_metrics_testing_close_to_0_experiment_{experiment_number}.png', experiment_number, is_close_to_0=True)

     # Plot metrics for values close to 1 (log scale)
-    plot_single_group(close_to_1, 'performance_metrics_testing_close_to_1.png', is_close_to_0=False)
+    plot_single_group(close_to_1, f'performance_metrics_testing_close_to_1_experiment_{experiment_number}.png', experiment_number, is_close_to_0=False)

     # Print the final values
-    print("\nFinal Metric Values:")
+    print("\nFinal Metric Values for Experiment", experiment_number)
     print(f"Final FPR: {FPR_values[-1]:.4f}")
     print(f"Final FNR: {FNR_values[-1]:.4f}")
     print(f"Final TNR: {TNR_values[-1]:.4f}")
@@ -89,7 +90,7 @@ def plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accu
     print(f"Final MCC: {MCC_values[-1]:.4f}")
     print(f"Final Recall: {recall_values[-1]:.4f}")

-def plot_single_group(metrics_dict, output_filename, is_close_to_0=False):
+def plot_single_group(metrics_dict, output_filename, experiment_number, is_close_to_0=False):
     plt.figure(figsize=(12, 8))

     # Only plot the metrics that exist in the dictionary
@@ -126,11 +127,12 @@ def plot_single_group(metrics_dict, output_filename, is_close_to_0=False):
         min_val = 1e-4  # Avoid zero values on the logarithmic scale

     plt.ylim(min_val, max_val)  # Set Y-axis limits based on the data range
-    plt.yticks(np.logspace(np.log10(min_val), np.log10(max_val), num=6))  # Set ticks logarithmically
+    plt.yticks(np.logspace(np.log10(min_val), np.log10(max_val), num=60))  # Set ticks logarithmically
+    # Add the experiment number to the plot title

     plt.xlabel('Index')
     plt.ylabel('Metric Value')
-    plt.title(f'Evaluation Metrics Over Time ({output_filename.split("_")[2].replace(".png", "")})')
+    plt.title(f'Experiment {experiment_number} - Evaluation Metrics Over Time')
     plt.legend()

     # Save the plot
@@ -138,14 +140,18 @@ def plot_single_group(metrics_dict, output_filename, is_close_to_0=False):
     plt.close()

 def main():
-    if len(sys.argv) != 2:
-        print("Usage: python script.py ")
-        sys.exit(1)
+    # Set up argument parsing
+    parser = argparse.ArgumentParser(description='Plot testing performance metrics.')
+    parser.add_argument('-f', '--file', type=str, required=True, help='Path to the testing performance log file')
+    parser.add_argument('-e', '--experiment', type=str, required=True, help='Experiment number')
+
+    args = parser.parse_args()

-    file_path = sys.argv[1]
+    file_path = args.file
+    experiment_number = args.experiment

     FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values = process_file(file_path)
-    plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values)
+    plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values, experiment_number)

 if __name__ == "__main__":
     main()

From 3b88f410d4eebf2c8bc5cc7fc8056756d18d5e73 Mon Sep 17 00:00:00 2001
From: Seba Garcia
Date: Sat, 3 May 2025 19:14:51 +0000
Subject: [PATCH 387/498] Fix train plot

---
 .../flowmldetection/plot_train_performance.py | 22 ++++++++++---------
 1 file changed, 12 insertions(+), 10 deletions(-)

diff --git a/modules/flowmldetection/plot_train_performance.py b/modules/flowmldetection/plot_train_performance.py
index 80e13e951..244df13d2 100644
--- a/modules/flowmldetection/plot_train_performance.py
+++ 
b/modules/flowmldetection/plot_train_performance.py @@ -5,7 +5,7 @@ import argparse import os -def plot_log_data(file_path): +def plot_log_data(file_path, experiment_number): # Read the log data from the file with open(file_path, 'r') as file: log_data = file.read() @@ -28,7 +28,8 @@ def plot_log_data(file_path): # Get the directory of the log file to store the plot in the same folder dir_name = os.path.dirname(file_path) - plot_file = os.path.join(dir_name, 'performance_metrics_training.png') + # Append experiment number to the filename + plot_file = os.path.join(dir_name, f'performance_metrics_training_{experiment_number}.png') # Plotting the values fig, ax1 = plt.subplots(figsize=(10, 6)) @@ -55,18 +56,18 @@ def plot_log_data(file_path): for i, value in enumerate(df["Total labels"]): ax1.text(i, value, f'{value:.1f}', color='tab:gray', fontsize=8, ha='center', va='bottom') - # Adding title and legend - plt.title('Training performance') + # Adding title and legend with experiment number in title + plt.title(f'Training performance - Experiment {experiment_number}') fig.tight_layout() # Move both legends further to the right - ax1.legend(loc='upper right', bbox_to_anchor=(1.26, 1), fontsize='small', ncol=1) - ax2.legend(loc='upper right', bbox_to_anchor=(1.4, 0.95), fontsize='small', ncol=1) + ax1.legend(loc='upper right', bbox_to_anchor=(1.3, 1), fontsize='small', ncol=1) + ax2.legend(loc='upper right', bbox_to_anchor=(1.3, 0.85), fontsize='small', ncol=1) # Increase right margin for better readability of legend - plt.subplots_adjust(right=0.7) + plt.subplots_adjust(right=0.75) - # Save plot to the same folder as the log file + # Save plot to the same folder as the log file with experiment number in filename plt.savefig(plot_file) # Display the plot @@ -75,13 +76,14 @@ def plot_log_data(file_path): def main(): # Parse command-line arguments parser = argparse.ArgumentParser(description="Process a log file and plot the data with two y-axes.") - parser.add_argument('log_file', metavar='log_file', type=str, help="Path to the log file") + parser.add_argument('-f', '--file', metavar='log_file', type=str, required=True, help="Path to the log file") + parser.add_argument('-e', '--experiment', metavar='experiment_number', type=str, required=True, help="Experiment number to add to the filename") # Handle -h / --help args = parser.parse_args() # Call the function to process the log file - plot_log_data(args.log_file) + plot_log_data(args.file, args.experiment) if __name__ == "__main__": main() From 9e683fa5a09f6e25d7bc4cd09a382c999400e85b Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 21:14:48 +0000 Subject: [PATCH 388/498] Fix plots --- .../flowmldetection/plot_train_performance.py | 122 ++++++++++-------- 1 file changed, 71 insertions(+), 51 deletions(-) diff --git a/modules/flowmldetection/plot_train_performance.py b/modules/flowmldetection/plot_train_performance.py index 244df13d2..5212dfeea 100644 --- a/modules/flowmldetection/plot_train_performance.py +++ b/modules/flowmldetection/plot_train_performance.py @@ -4,85 +4,105 @@ import sys import argparse import os +import matplotlib.ticker as ticker def plot_log_data(file_path, experiment_number): # Read the log data from the file with open(file_path, 'r') as file: log_data = file.read() - # Define regex pattern to extract relevant data from each line - pattern = r"Background: (\d+). Benign: (\d+). Malicious: (\d+). Total labels: (\d+\.\d+). 
Score: (\d+\.\d+)" + # Regex pattern for the new log format + pattern = ( + r"Total labels: ([\d\.]+), Background: (\d+). Benign: (\d+). Malicious: (\d+). Metrics: " + r"FPR=([\d\.]+), TNR=([\d\.]+), TPR=([\d\.]+), FNR=([\d\.]+), " + r"F1=([\d\.]+), Precision=([\d\.]+), Accuracy=([\d\.]+), MCC=([\d\.]+), Recall=([\d\.]+)\." + ) # Parse the log file data = re.findall(pattern, log_data) # Convert data to a DataFrame - df = pd.DataFrame(data, columns=["Background", "Benign", "Malicious", "Total labels", "Score"]) + columns = [ + "Total labels", "Background", "Benign", "Malicious", + "FPR", "TNR", "TPR", "FNR", "F1", "Precision", "Accuracy", "MCC", "Recall" + ] + df = pd.DataFrame(data, columns=columns) df = df.astype({ + "Total labels": float, "Background": int, "Benign": int, "Malicious": int, - "Total labels": float, - "Score": float + "FPR": float, + "TNR": float, + "TPR": float, + "FNR": float, + "F1": float, + "Precision": float, + "Accuracy": float, + "MCC": float, + "Recall": float, }) - # Get the directory of the log file to store the plot in the same folder dir_name = os.path.dirname(file_path) - # Append experiment number to the filename - plot_file = os.path.join(dir_name, f'performance_metrics_training_{experiment_number}.png') - - # Plotting the values - fig, ax1 = plt.subplots(figsize=(10, 6)) - # Plotting Score on the left y-axis (with proper scaling from 0 to 1) - ax1.plot(df.index, df["Score"], label="Score", color='tab:blue') + # --- Plot 1: Number of labels (linear scale, no total labels) --- + fig1, ax1 = plt.subplots(figsize=(10, 6)) + ax1.plot(df.index, df["Background"], label="Background", color='black') + ax1.plot(df.index, df["Benign"], label="Benign", color='cyan') + ax1.plot(df.index, df["Malicious"], label="Malicious", color='magenta') ax1.set_xlabel('Index') - ax1.set_ylabel('Score', color='tab:blue') - ax1.set_ylim(0, 1) # Set y-axis for Score from 0 to 1 - ax1.tick_params(axis='y', labelcolor='tab:blue') - - # Create the second y-axis for the Background, Benign, Malicious - ax2 = ax1.twinx() - ax2.plot(df.index, df["Background"], label="Background Labels", color='tab:green', linestyle='--') - ax2.plot(df.index, df["Benign"], label="Benign Labels", color='tab:orange', linestyle='--') - ax2.plot(df.index, df["Malicious"], label="Malicious Labels", color='tab:pink', linestyle='--') - ax2.set_ylabel('Background, Benign, Malicious Labels', color='tab:red') - - # Set appropriate scale for right y-axis based on the data - ax2.set_ylim(0, df[["Background", "Benign", "Malicious"]].max().max()) - ax2.tick_params(axis='y', labelcolor='tab:red') - - # Annotating Total labels as text on the plot - for i, value in enumerate(df["Total labels"]): - ax1.text(i, value, f'{value:.1f}', color='tab:gray', fontsize=8, ha='center', va='bottom') - - # Adding title and legend with experiment number in title - plt.title(f'Training performance - Experiment {experiment_number}') - fig.tight_layout() + ax1.set_ylabel('Label Counts') + # No log scale here + ax1.set_title(f'Label Counts - Experiment {experiment_number}') + ax1.legend() + ax1.yaxis.set_major_locator(ticker.MaxNLocator(70)) + plt.tight_layout() + plt.savefig(os.path.join(dir_name, f'performance_metrics_training_0_labels.png')) + + # --- Plot 2: FNR and FPR (log scale) --- + fig2, ax2 = plt.subplots(figsize=(10, 6)) + ax2.plot(df.index, df["FNR"], label="FNR", color='red') + ax2.plot(df.index, df["FPR"], label="FPR", color='blue') + ax2.set_xlabel('Index') + ax2.set_ylabel('Rate') + ax2.set_yscale('log') + ax2.set_title(f'FNR 
and FPR - Experiment {experiment_number}') + ax2.legend() + ax2.yaxis.set_major_locator(ticker.MaxNLocator(100)) + plt.tight_layout() + plt.savefig(os.path.join(dir_name, f'performance_metrics_training_0_fnr_fpr.png')) + + # --- Plot 3: Other metrics (log scale) --- + fig3, ax3 = plt.subplots(figsize=(12, 7)) + metrics_rest = ["TNR", "TPR", "F1", "Precision", "Accuracy", "MCC", "Recall"] + colors_rest = [ + 'tab:blue', 'tab:green', 'tab:purple', 'tab:brown', + 'tab:gray', 'tab:pink', 'tab:olive' + ] + for metric, color in zip(metrics_rest, colors_rest): + ax3.plot(df.index, df[metric], label=metric, color=color) + ax3.set_xlabel('Index') + ax3.set_ylabel('Metric Value') + ax3.set_yscale('log') + ax3.set_title(f'Performance Metrics (except FNR/FPR) - Experiment {experiment_number}') + ax3.legend() + ax3.yaxis.set_major_locator(ticker.MaxNLocator(50)) + plt.tight_layout() + plt.savefig(os.path.join(dir_name, f'performance_metrics_training_0_other_metrics.png')) - # Move both legends further to the right - ax1.legend(loc='upper right', bbox_to_anchor=(1.3, 1), fontsize='small', ncol=1) - ax2.legend(loc='upper right', bbox_to_anchor=(1.3, 0.85), fontsize='small', ncol=1) - - # Increase right margin for better readability of legend - plt.subplots_adjust(right=0.75) - - # Save plot to the same folder as the log file with experiment number in filename - plt.savefig(plot_file) - - # Display the plot plt.show() + # --- Print final values in terminal --- + print("\nFinal values at last training step:") + for col in ["Total labels", "Background", "Benign", "Malicious", + "FPR", "TNR", "TPR", "FNR", "F1", "Precision", "Accuracy", "MCC", "Recall"]: + print(f"{col}: {df[col].iloc[-1]}") + def main(): - # Parse command-line arguments parser = argparse.ArgumentParser(description="Process a log file and plot the data with two y-axes.") parser.add_argument('-f', '--file', metavar='log_file', type=str, required=True, help="Path to the log file") parser.add_argument('-e', '--experiment', metavar='experiment_number', type=str, required=True, help="Experiment number to add to the filename") - - # Handle -h / --help args = parser.parse_args() - - # Call the function to process the log file plot_log_data(args.file, args.experiment) if __name__ == "__main__": From 632ddbcd650375a5b6a41d0bb724c20fd3766e4f Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 21:16:01 +0000 Subject: [PATCH 389/498] Add performance metrics to the training evaluation --- modules/flowmldetection/flowmldetection.py | 58 +++++++++++++++++----- 1 file changed, 46 insertions(+), 12 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index b17a1baaf..2c60cd403 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -10,7 +10,16 @@ import json import traceback import warnings -import os +from sklearn.metrics import classification_report, confusion_matrix +from sklearn.metrics import ( + confusion_matrix, + f1_score, + precision_score, + accuracy_score, + matthews_corrcoef, + recall_score, +) + from slips_files.common.parsers.config_parser import ConfigParser from slips_files.common.slips_utils import utils @@ -86,21 +95,21 @@ def write_to_training_log(self, message: str): except Exception as e: self.print(f"Error writing to training log: {e}", 0, 1) - def train(self, sum_labeled_flows): + def train(self, sum_labeled_flows, last_number_of_flows_when_trained): """ Train a model based on the flows we receive and the 
labels """ try: + # Create y_flow with the label + y_flow = numpy.full(self.flows.shape[0], self.flows.ground_truth_label) # Create X_flow with the current flows minus the label X_flow = self.flows.drop("ground_truth_label", axis=1) # Drop the detailed labels X_flow = X_flow.drop("detailed_ground_truth_label", axis=1) # Drop the module_labels X_flow = X_flow.drop("module_labels", axis=1) - # Create y_flow with the label - y_flow = numpy.full(X_flow.shape[0], self.flows.ground_truth_label) - # Normalize this batch of data so far. This can get progressivle slow + # Normalize this batch of data so far. This can get progressively slow X_flow = self.scaler.fit_transform(X_flow) # Count the number of labels of each type in this epoc @@ -120,18 +129,43 @@ def train(self, sum_labeled_flows): self.print("Error while calling clf.train()") self.print(traceback.format_exc(), 0, 1) - # See score so far in training - score = self.clf.score(X_flow, y_flow) + # Predict on the training data + y_pred = self.clf.predict(X_flow) - #self.print(f" Training Score: {score}", 1, 0) - #self.print(f' Model Parameters: {self.clf.coef_}', 1, 0) + # For metrics, let's focus on Malicious vs Benign (ignore Background) + mask = (y_flow == "Malicious") | (y_flow == "Benign") + y_true_bin = y_flow[mask] + y_pred_bin = y_pred[mask] + + # Map to binary: Malicious=1, Benign=0 + y_true_bin = numpy.where(y_true_bin == "Malicious", 1, 0) + y_pred_bin = numpy.where(y_pred_bin == "Malicious", 1, 0) + + # Compute confusion matrix: tn, fp, fn, tp + tn, fp, fn, tp = confusion_matrix(y_true_bin, y_pred_bin, labels=[0,1]).ravel() if len(set(y_true_bin)) > 1 else (0,0,0,0) + + # Compute metrics + FPR = fp / (fp + tn) if (fp + tn) > 0 else 0 + TNR = tn / (tn + fp) if (tn + fp) > 0 else 0 + TPR = tp / (tp + fn) if (tp + fn) > 0 else 0 + FNR = fn / (fn + tp) if (fn + tp) > 0 else 0 + F1 = f1_score(y_true_bin, y_pred_bin, zero_division=0) + PREC = precision_score(y_true_bin, y_pred_bin, zero_division=0) + ACCU = accuracy_score(y_true_bin, y_pred_bin) + MCC = matthews_corrcoef(y_true_bin, y_pred_bin) if len(set(y_true_bin)) > 1 else 0 + RECALL = recall_score(y_true_bin, y_pred_bin, zero_division=0) # Store the models on disk self.store_model() # Log training information - self.write_to_training_log(f"Training completed. Background: {epoch_label_counts['Background']}. Benign: {epoch_label_counts['Benign']}. Malicious: {epoch_label_counts['Malicious']}. Total labels: {sum_labeled_flows}. Score: {score}") - #self.write_to_training_log(f"Model parameters: {self.clf.coef_}") + self.write_to_training_log( + f"Total labels: {sum_labeled_flows}, " + f"Background: {epoch_label_counts['Background']}. " + f"Benign: {epoch_label_counts['Benign']}. Malicious: {epoch_label_counts['Malicious']}. " + f"Metrics: FPR={FPR:.4f}, TNR={TNR:.4f}, TPR={TPR:.4f}, FNR={FNR:.4f}, " + f"F1={F1:.4f}, Precision={PREC:.4f}, Accuracy={ACCU:.4f}, MCC={MCC:.4f}, Recall={RECALL:.4f}." 
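+                # Note: one line is appended per retraining batch; this exact
+                # format is what the regex in plot_train_performance.py parses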
+ ) except Exception: self.print("Error in train().", 0, 1) self.print(traceback.format_exc(), 0, 1) @@ -520,7 +554,7 @@ def main(self): # for pandas self.process_training_flows(self.last_number_of_flows_when_trained) # Train an algorithm - self.train(sum_labeled_flows) + self.train(sum_labeled_flows, self.last_number_of_flows_when_trained) self.last_number_of_flows_when_trained = sum_labeled_flows elif self.mode == "test": From 1d3346dbeb3653238427b291b9b8d90e01a2f578 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sun, 4 May 2025 12:50:46 +0000 Subject: [PATCH 390/498] Fix experiment names --- modules/flowmldetection/plot_train_performance.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/modules/flowmldetection/plot_train_performance.py b/modules/flowmldetection/plot_train_performance.py index 5212dfeea..304f0f4ea 100644 --- a/modules/flowmldetection/plot_train_performance.py +++ b/modules/flowmldetection/plot_train_performance.py @@ -52,12 +52,12 @@ def plot_log_data(file_path, experiment_number): ax1.plot(df.index, df["Malicious"], label="Malicious", color='magenta') ax1.set_xlabel('Index') ax1.set_ylabel('Label Counts') - # No log scale here ax1.set_title(f'Label Counts - Experiment {experiment_number}') ax1.legend() ax1.yaxis.set_major_locator(ticker.MaxNLocator(70)) + ax1.xaxis.set_major_locator(ticker.MaxNLocator(50)) plt.tight_layout() - plt.savefig(os.path.join(dir_name, f'performance_metrics_training_0_labels.png')) + plt.savefig(os.path.join(dir_name, f'performance_metrics_training_{experiment_number}_labels.png')) # --- Plot 2: FNR and FPR (log scale) --- fig2, ax2 = plt.subplots(figsize=(10, 6)) @@ -69,8 +69,9 @@ def plot_log_data(file_path, experiment_number): ax2.set_title(f'FNR and FPR - Experiment {experiment_number}') ax2.legend() ax2.yaxis.set_major_locator(ticker.MaxNLocator(100)) + ax2.xaxis.set_major_locator(ticker.MaxNLocator(50)) plt.tight_layout() - plt.savefig(os.path.join(dir_name, f'performance_metrics_training_0_fnr_fpr.png')) + plt.savefig(os.path.join(dir_name, f'performance_metrics_training_{experiment_number}_fnr_fpr.png')) # --- Plot 3: Other metrics (log scale) --- fig3, ax3 = plt.subplots(figsize=(12, 7)) @@ -87,8 +88,9 @@ def plot_log_data(file_path, experiment_number): ax3.set_title(f'Performance Metrics (except FNR/FPR) - Experiment {experiment_number}') ax3.legend() ax3.yaxis.set_major_locator(ticker.MaxNLocator(50)) + ax3.xaxis.set_major_locator(ticker.MaxNLocator(50)) plt.tight_layout() - plt.savefig(os.path.join(dir_name, f'performance_metrics_training_0_other_metrics.png')) + plt.savefig(os.path.join(dir_name, f'performance_metrics_training_{experiment_number}_other_metrics.png')) plt.show() From 36129e51da4879ee590f2c76ad502372fb6954e7 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Mon, 29 Jul 2024 16:32:14 +0200 Subject: [PATCH 391/498] mlflow. 
Add a function to convert the state again --- modules/flowmldetection/flowmldetection.py | 480 +++++++++++---------- 1 file changed, 254 insertions(+), 226 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 2c60cd403..16b67e903 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -10,16 +10,7 @@ import json import traceback import warnings -from sklearn.metrics import classification_report, confusion_matrix -from sklearn.metrics import ( - confusion_matrix, - f1_score, - precision_score, - accuracy_score, - matthews_corrcoef, - recall_score, -) - +import sys from slips_files.common.parsers.config_parser import ConfigParser from slips_files.common.slips_utils import utils @@ -37,6 +28,10 @@ Method, ) +# Only for debbuging +# from matplotlib import pyplot as plt + + # This horrible hack is only to stop sklearn from printing those warnings def warn(*args, **kwargs): pass @@ -61,115 +56,206 @@ def init(self): # Set the output queue of our database instance # Read the configuration self.read_configuration() - # Minum amount of new labels needed to start the train - self.minimum_labels_to_start_train = 50 - # Minum amount of new labels needed to retrain - self.minimum_labels_to_retrain = 50 - # The number of flows when last trained. Used internally only to know - # when to retrain - self.last_number_of_flows_when_trained = 0 + # Minum amount of new lables needed to trigger the train + self.minimum_lables_to_retrain = 50 + # To plot the scores of training + # self.scores = [] # The scaler trained during training and to use during testing self.scaler = StandardScaler() self.model_path = "./modules/flowmldetection/model.bin" self.scaler_path = "./modules/flowmldetection/scaler.bin" - # Initialize the training log file - self.training_log_path = "./modules/flowmldetection/training.log" - with open(self.training_log_path, "w") as log_file: - log_file.write("Training Log Initialized\n") - def read_configuration(self): conf = ConfigParser() self.mode = conf.get_ml_mode() - # This is the global label in the configuration, - # in case the flows do not have a label themselves - self.label = conf.label() - - def write_to_training_log(self, message: str): - """ - Write a message to the training log file. 
- """ - try: - with open(self.training_log_path, "a") as log_file: - log_file.write(message + "\n") - except Exception as e: - self.print(f"Error writing to training log: {e}", 0, 1) - def train(self, sum_labeled_flows, last_number_of_flows_when_trained): + def train(self): """ Train a model based on the flows we receive and the labels """ try: - # Create y_flow with the label - y_flow = numpy.full(self.flows.shape[0], self.flows.ground_truth_label) - # Create X_flow with the current flows minus the label - X_flow = self.flows.drop("ground_truth_label", axis=1) - # Drop the detailed labels - X_flow = X_flow.drop("detailed_ground_truth_label", axis=1) - # Drop the module_labels + # Process the labels to have only Normal and Malware + self.flows.label = self.flows.label.str.replace( + r"(^.*ormal.*$)", "Normal", regex=True + ) + self.flows.label = self.flows.label.str.replace( + r"(^.*alware.*$)", "Malware", regex=True + ) + self.flows.label = self.flows.label.str.replace( + r"(^.*alicious.*$)", "Malware", regex=True + ) + + # Separate + y_flow = self.flows["label"] + X_flow = self.flows.drop("label", axis=1) X_flow = X_flow.drop("module_labels", axis=1) - # Normalize this batch of data so far. This can get progressively slow + # Normalize this batch of data so far. This can get progressivle slow X_flow = self.scaler.fit_transform(X_flow) - # Count the number of labels of each type in this epoc - epoch_label_counts = { - "Background": (y_flow == "Background").sum(), - "Malicious": (y_flow == "Malicious").sum(), - "Benign": (y_flow == "Benign").sum(), - } - # Train try: - # Online incremental learning self.clf.partial_fit( - X_flow, y_flow, classes=["Background", "Malicious", "Benign"] + X_flow, y_flow, classes=["Malware", "Normal"] ) except Exception: self.print("Error while calling clf.train()") self.print(traceback.format_exc(), 0, 1) - # Predict on the training data - y_pred = self.clf.predict(X_flow) + # See score so far in training + score = self.clf.score(X_flow, y_flow) - # For metrics, let's focus on Malicious vs Benign (ignore Background) - mask = (y_flow == "Malicious") | (y_flow == "Benign") - y_true_bin = y_flow[mask] - y_pred_bin = y_pred[mask] + # To debug the training score + # self.scores.append(score) - # Map to binary: Malicious=1, Benign=0 - y_true_bin = numpy.where(y_true_bin == "Malicious", 1, 0) - y_pred_bin = numpy.where(y_pred_bin == "Malicious", 1, 0) + self.print(f" Training Score: {score}", 0, 1) + # self.print(f' Model Parameters: {self.clf.coef_}') - # Compute confusion matrix: tn, fp, fn, tp - tn, fp, fn, tp = confusion_matrix(y_true_bin, y_pred_bin, labels=[0,1]).ravel() if len(set(y_true_bin)) > 1 else (0,0,0,0) - - # Compute metrics - FPR = fp / (fp + tn) if (fp + tn) > 0 else 0 - TNR = tn / (tn + fp) if (tn + fp) > 0 else 0 - TPR = tp / (tp + fn) if (tp + fn) > 0 else 0 - FNR = fn / (fn + tp) if (fn + tp) > 0 else 0 - F1 = f1_score(y_true_bin, y_pred_bin, zero_division=0) - PREC = precision_score(y_true_bin, y_pred_bin, zero_division=0) - ACCU = accuracy_score(y_true_bin, y_pred_bin) - MCC = matthews_corrcoef(y_true_bin, y_pred_bin) if len(set(y_true_bin)) > 1 else 0 - RECALL = recall_score(y_true_bin, y_pred_bin, zero_division=0) + # Debug code to store a plot in a png of the scores + # plt.plot(self.scores) + # plt.savefig('train-scores.png') # Store the models on disk self.store_model() - # Log training information - self.write_to_training_log( - f"Total labels: {sum_labeled_flows}, " - f"Background: {epoch_label_counts['Background']}. 
" - f"Benign: {epoch_label_counts['Benign']}. Malicious: {epoch_label_counts['Malicious']}. " - f"Metrics: FPR={FPR:.4f}, TNR={TNR:.4f}, TPR={TPR:.4f}, FNR={FNR:.4f}, " - f"F1={F1:.4f}, Precision={PREC:.4f}, Accuracy={ACCU:.4f}, MCC={MCC:.4f}, Recall={RECALL:.4f}." - ) except Exception: - self.print("Error in train().", 0, 1) + self.print("Error in train()", 0, 1) + self.print(traceback.format_exc(), 0, 1) + + def get_final_state_from_flags(self, state, pkts): + """ + Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags + We receive the pakets to distinguish some Reset connections + """ + try: + pre = state.split("_")[0] + try: + # Try suricata states + """ + There are different states in which a flow can be. + Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, + these are: New, Established and Closed,for UDP only new and established. + For each of these states Suricata can employ different timeouts. + """ + if "new" in state or "established" in state: + return "Established" + elif "closed" in state: + return "Not Established" + + # We have varius type of states depending on the type of flow. + # For Zeek + if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): + return "Not Established" + elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): + return "Established" + + # For Argus + suf = state.split("_")[1] + if "S" in pre and "A" in pre and "S" in suf and "A" in suf: + """ + Examples: + SA_SA + SR_SA + FSRA_SA + SPA_SPA + SRA_SPA + FSA_FSA + FSA_FSPA + SAEC_SPA + SRPA_SPA + FSPA_SPA + FSRPA_SPA + FSPA_FSPA + FSRA_FSPA + SRAEC_SPA + FSPA_FSRPA + FSAEC_FSPA + FSRPA_FSPA + SRPAEC_SPA + FSPAEC_FSPA + SRPAEC_FSRPA + """ + return "Established" + elif "PA" in pre and "PA" in suf: + # Tipical flow that was reported in the middle + """ + Examples: + PA_PA + FPA_FPA + """ + return "Established" + elif "ECO" in pre: + return "ICMP Echo" + elif "ECR" in pre: + return "ICMP Reply" + elif "URH" in pre: + return "ICMP Host Unreachable" + elif "URP" in pre: + return "ICMP Port Unreachable" + else: + """ + Examples: + S_RA + S_R + A_R + S_SA + SR_SA + FA_FA + SR_RA + SEC_RA + """ + return "Not Established" + except IndexError: + # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP + if "ECO" in pre: + # ICMP + return "Established" + elif "UNK" in pre: + # ICMP6 unknown upper layer + return "Established" + elif "CON" in pre: + # UDP + return "Established" + elif "INT" in pre: + # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there + # is no confirmation of what happened. + return "Not Established" + elif "EST" in pre: + # TCP + return "Established" + elif "RST" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established + # It can happen that is reseted being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. + return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + elif "FIN" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established + # It can happen that is finished being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. 
If <=3, then is not established because the OS retries 3 times. + return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + else: + """ + Examples: + S_ + FA_ + PA_ + FSA_ + SEC_ + SRPA_ + """ + return "Not Established" + except Exception: + exception_line = sys.exc_info()[2].tb_lineno + self.print( + f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", + 0, + 1, + ) self.print(traceback.format_exc(), 0, 1) - self.write_to_training_log("Error occurred during training.") def process_features(self, dataset): """ @@ -182,11 +268,6 @@ def process_features(self, dataset): for proto in to_discard: dataset = dataset[dataset.proto != proto] - # If te proto is in the list to delete and there is only one flow, then the dataset will be empty - if dataset.empty: - # DataFrame is empty now, so return empty - return dataset - # For now, discard these to_drop = [ "appproto", @@ -199,7 +280,9 @@ def process_features(self, dataset): "history", "uid", "dir_", + "dbytes", "endtime", + "bytes", "flow_source", ] for field in to_drop: @@ -208,16 +291,12 @@ def process_features(self, dataset): except (ValueError, KeyError): pass - # When flows are read from Slips sqlite, - # the state is not transformed to 'Established' or - # 'Not Established', it is still 'S0' and others + # When flows are read from Slips sqlite, the state is not transformed to 'Established' or 'Not Established', it is still 'S0' and others # So transform here - dataset["state"] = dataset.apply( - lambda row: self.db.get_final_state_from_flags( - row["state"], (row["spkts"] + row["dpkts"]) - ), - axis=1, - ) + #new_state_column = dataset.state.apply(self.get_final_state_from_flags(dataset.state, dataset.pkts)) + dataset['state'] = dataset.apply(lambda row: self.get_final_state_from_flags(row['state'], row['pkts']), axis=1) + + #dataset.state = new_state_column # Convert state to categorical dataset.state = dataset.state.str.replace( @@ -251,11 +330,7 @@ def process_features(self, dataset): dataset.proto = dataset.proto.str.replace( r"(^.*arp.*$)", "4", regex=True ) - - dataset["allbytes"] = dataset["sbytes"] + dataset["dbytes"] - dataset["pkts"] = dataset["spkts"] + dataset["dpkts"] - - fields_to_convert_to_float = [ + fields_to_convert_to_flow = [ dataset.proto, dataset.dport, dataset.sport, @@ -266,10 +341,10 @@ def process_features(self, dataset): dataset.sbytes, dataset.state, ] - for field in fields_to_convert_to_float: + for field in fields_to_convert_to_flow: try: field = field.astype("float64") - except (ValueError, AttributeError): + except ValueError: pass return dataset @@ -278,72 +353,69 @@ def process_features(self, dataset): self.print("Error in process_features()") self.print(traceback.format_exc(), 0, 1) - def process_training_flows(self, last_number_of_flows_when_trained): + def process_flows(self): """ - Process only the new flows in the DB since the last training. 
+ Process all the flwos in the DB Store the pandas df in self.flows """ try: - # Ensure the index is an integer - if last_number_of_flows_when_trained is None: - last_number_of_flows_when_trained = 0 - else: - last_number_of_flows_when_trained = int(last_number_of_flows_when_trained) - # We get all the flows so far + # because this retraining happens in batches flows = self.db.get_all_flows() - # Only process new flows since last training - new_flows = flows[last_number_of_flows_when_trained:] - - # Check how many **different** labels are in the DB + # Check how many different labels are in the DB + # We need both normal and malware labels = self.db.get_labels() if len(labels) == 1: - # Insert fake flows for both classes if needed - new_flows.append( + # Only 1 label has flows + # There are not enough different labels, so insert two flows + # that are fake but representative of a normal and malware flow + # they are only for the training process + # At least 1 flow of each label is required + # self.print(f'Amount of labeled flows: {labels}', 0, 1) + flows.append( { - "starttime": 1594417039.029793, + "ts": 1594417039.029793, "dur": "1.9424750804901123", "saddr": "10.7.10.101", "sport": "49733", "daddr": "40.70.224.145", "dport": "443", "proto": "tcp", - "state": "SF", - "spkts": 17, - "dpkts": 27, + "state": "Established", + "allbytes": 42764, + "spkts": 37, "sbytes": 25517, - "dbytes": 17247, "appproto": "ssl", - "ground_truth_label": "Malicious", + "label": "Malware", "module_labels": { - "flowalerts-long-connection": "Malicious" + "flowalerts-long-connection": "Malware" }, } ) - new_flows.append( + flows.append( { - "starttime": 1382355032.706468, + "ts": 1382355032.706468, "dur": "10.896695", "saddr": "147.32.83.52", "sport": "47956", "daddr": "80.242.138.72", "dport": "80", "proto": "tcp", - "state": "SF", + "state": "Established", + "allbytes": 67696, "spkts": 1, - "dpkts": 0, "sbytes": 100, - "dbytes": 67596, "appproto": "http", - "ground_truth_label": "Benign", + "label": "Normal", "module_labels": { - "flowalerts-long-connection": "Benign" + "flowalerts-long-connection": "Normal" }, } ) + # If there are enough flows, we dont insert them anymore # Convert to pandas df - df_flows = pd.DataFrame(new_flows) + df_flows = pd.DataFrame(flows) # Process features df_flows = self.process_features(df_flows) @@ -351,6 +423,7 @@ def process_training_flows(self, last_number_of_flows_when_trained): # Update the flow to the processed version self.flows = df_flows except Exception: + # Stop the timer self.print("Error in process_flows()") self.print(traceback.format_exc(), 0, 1) @@ -363,8 +436,6 @@ def process_flow(self, flow_to_process: dict): # Convert the flow to a pandas dataframe raw_flow = pd.DataFrame(flow_to_process, index=[0]) dflow = self.process_features(raw_flow) - if dflow.empty: - return None # Update the flow to the processed version return dflow except Exception: @@ -378,6 +449,7 @@ def detect(self, x_flow) -> Optional[numpy.ndarray]: and returns the predection array """ try: + given_x_flow = x_flow # clean the flow fields_to_drop = [ "label", @@ -385,28 +457,14 @@ def detect(self, x_flow) -> Optional[numpy.ndarray]: "uid", "history", "dir_", + "dbytes", + "dpkts", "endtime", + "bytes", "flow_source", - "ground_truth_label", + "ground_truth_label", # todo now we can use them "detailed_ground_truth_label", ] - # For argus binetflows this fails because ther is a field calle bytes that was not in other flows. It should be called allbytes. 
- # Error - ''' [Flow ML Detection] Error in detect() while processing - dur proto sport dport state pkts spkts dpkts bytes sbytes dbytes allbytes - 0 63.822830 0 56119 981 0.0 15 15 0 8764 1887 0 1887 - The feature names should match those that were passed during fit. - Feature names unseen at fit time: - - bytes - ''' - - # IF we delete here the filed bytes the error is - # [Flow ML Detection] Error in detect() while processing - # dur proto sport dport state pkts spkts dpkts sbytes dbytes allbytes - # 0 63.822830 0 56120 980 0.0 15 15 0 1887 0 1887 - # The feature names should match those that were passed during fit. - # Feature names must be in the same order as they were in fit. - for field in fields_to_drop: try: x_flow = x_flow.drop(field, axis=1) @@ -418,7 +476,7 @@ def detect(self, x_flow) -> Optional[numpy.ndarray]: return pred except Exception as e: self.print( - f"Error in detect() while processing " f"\n{x_flow}\n{e}" + f"Error in detect() while processing " f"\n{given_x_flow}\n{e}" ) self.print(traceback.format_exc(), 0, 1) @@ -510,16 +568,18 @@ def pre_main(self): def main(self): if msg := self.get_msg("new_flow"): - # When a new flow arrives msg = json.loads(msg["data"]) - self.twid = msg["twid"] - self.profileid = msg["profileid"] + twid = msg["twid"] self.flow = msg["flow"] - # These following extra fields are expected in testing. update the original + # these fields are expected in testing. update the original # flow dict to have them self.flow.update( { + "allbytes": (self.flow["sbytes"] + self.flow["dbytes"]), + # the flow["state"] is the origstate, we dont need that here + # we need the interpreted state "state": msg["interpreted_state"], + "pkts": self.flow["spkts"] + self.flow["dpkts"], "label": msg["label"], "module_labels": msg["module_labels"], } @@ -532,49 +592,56 @@ def main(self): # Use labeled flows labels = self.db.get_labels() sum_labeled_flows = sum(i[1] for i in labels) - - # The min labels to retrain is the min number of flows - # we should have seen so far in this capture to start training - # This is so we dont _start_ training with only 1 flow - - # Once we are over the start minimum, the second condition is - # to force to retrain every a minimum_labels_to_retrain number - # of flows. So we dont retrain every 1 flow. if ( - sum_labeled_flows >= self.minimum_labels_to_start_train + sum_labeled_flows >= self.minimum_lables_to_retrain + and sum_labeled_flows % self.minimum_lables_to_retrain == 1 ): - if (sum_labeled_flows - self.last_number_of_flows_when_trained >= self.minimum_labels_to_retrain): - # So for example we retrain every 50 labels and only when - # we have at least 50 labels - self.print( - f"Training the model with the last group of " - f"flows and labels. Total flows: {sum_labeled_flows}." - ) - # Process all flows in the DB and make them ready - # for pandas - self.process_training_flows(self.last_number_of_flows_when_trained) - # Train an algorithm - self.train(sum_labeled_flows, self.last_number_of_flows_when_trained) - self.last_number_of_flows_when_trained = sum_labeled_flows - + # We get here every 'self.minimum_lables_to_retrain' + # amount of labels + # So for example we retrain every 100 labels and only when + # we have at least 100 labels + self.print( + f"Training the model with the last group of " + f"flows and labels. Total flows: {sum_labeled_flows}." 
+ ) + # Process all flows in the DB and make them ready + # for pandas + self.process_flows() + # Train an algorithm + self.train() elif self.mode == "test": # We are testing, which means using the model to detect processed_flow = self.process_flow(self.flow) + # After processing the flow, it may happen that we # delete icmp/arp/etc so the dataframe can be empty if processed_flow is not None and not processed_flow.empty: - original_label = processed_flow["ground_truth_label"].iloc[0] # Predict pred: numpy.ndarray = self.detect(processed_flow) if not pred: # an error occurred return - if pred[0] == "Malicious": + label = self.flow["label"] + if label and label != "unknown" and label != pred[0]: + # If the user specified a label in test mode, + # and the label is diff from the prediction, + # print in debug mode + self.print( + f"Report Prediction {pred[0]} for label" + f' {label} flow {self.flow["saddr"]}:' + f'{self.flow["sport"]} ->' + f' {self.flow["daddr"]}:' + f'{self.flow["dport"]}/' + f'{self.flow["proto"]}', + 0, + 3, + ) + if pred[0] == "Malware": # Generate an alert - self.set_evidence_malicious_flow(self.flow, self.twid) + self.set_evidence_malicious_flow(self.flow, twid) self.print( - f"Prediction {pred[0]} for label {original_label}" + f"Prediction {pred[0]} for label {label}" f' flow {self.flow["saddr"]}:' f'{self.flow["sport"]} -> ' f'{self.flow["daddr"]}:' @@ -583,42 +650,3 @@ def main(self): 0, 2, ) - - # So you can disable this code easily. Since it is used only for evaluating a testing - log_testing_data = True - if log_testing_data: - # Initialize counters if not already done - if not hasattr(self, 'tp'): - self.tp = 0 - if not hasattr(self, 'tn'): - self.tn = 0 - if not hasattr(self, 'fp'): - self.fp = 0 - if not hasattr(self, 'fn'): - self.fn = 0 - - - # Update counters based on predictions and labels - if pred[0] == "Malicious" and original_label == "Malicious": - self.tp += 1 - elif pred[0] == "Benign" and original_label == "Benign": - self.tn += 1 - elif pred[0] == "Malicious" and original_label == "Benign": - self.fp += 1 - elif pred[0] == "Benign" and original_label == "Malicious": - self.fn += 1 - - testing_log_path = "./modules/flowmldetection/testing_performance.log" - try: - with open(testing_log_path, "a") as log_file: - log_file.write("Testing Performance Log Initialized\n") - # Log the testing performance metrics - log_file.write(f"TP: {self.tp}, TN: {self.tn}, FP: {self.fp}, FN: {self.fn}\n") - - # Log the original flow for false positives and false negatives - if pred[0] == "Malicious" and original_label == "Benign": - log_file.write(f"False Positive Flow: {self.flow}\n") - elif pred[0] == "Benign" and original_label == "Malicious": - log_file.write(f"False Negative Flow: {self.flow}\n") - except Exception as e: - self.print(f"Error initializing testing performance log: {e}", 0, 1) \ No newline at end of file From a9a38be1d23ebb45330d8bc616c9701c5181db61 Mon Sep 17 00:00:00 2001 From: alya Date: Tue, 30 Jul 2024 14:35:21 +0300 Subject: [PATCH 392/498] state_handler: split get_final_state_from_flags() into smaller functions --- slips_files/common/state_handler.py | 179 ++++++++++++++++++++++++++++ 1 file changed, 179 insertions(+) create mode 100644 slips_files/common/state_handler.py diff --git a/slips_files/common/state_handler.py b/slips_files/common/state_handler.py new file mode 100644 index 000000000..b671a09a2 --- /dev/null +++ b/slips_files/common/state_handler.py @@ -0,0 +1,179 @@ +from typing import Optional +import sys +import traceback + + +def 
check_suricata_states(state) -> Optional[str]: + """ + There are different states in which a flow can be. + Suricata distinguishes three flow-states for TCP and two for + UDP. For TCP, + these are: New, Established and Closed,for UDP only new and + established. + For each of these states Suricata can employ different timeouts. + """ + if "new" in state or "established" in state: + return "Established" + elif "closed" in state: + return "Not Established" + + +def check_zeek_states(state) -> Optional[str]: + # We have varius type of states depending on the type of flow. + # For Zeek + if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): + return "Not Established" + elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): + return "Established" + + +def check_argus_states(state) -> Optional[str]: + pre = state.split("_")[0] + suf = state.split("_")[1] + if "S" in pre and "A" in pre and "S" in suf and "A" in suf: + """ + Examples: + SA_SA + SR_SA + FSRA_SA + SPA_SPA + SRA_SPA + FSA_FSA + FSA_FSPA + SAEC_SPA + SRPA_SPA + FSPA_SPA + FSRPA_SPA + FSPA_FSPA + FSRA_FSPA + SRAEC_SPA + FSPA_FSRPA + FSAEC_FSPA + FSRPA_FSPA + SRPAEC_SPA + FSPAEC_FSPA + SRPAEC_FSRPA + """ + return "Established" + elif "PA" in pre and "PA" in suf: + # Tipical flow that was reported in the middle + """ + Examples: + PA_PA + FPA_FPA + """ + return "Established" + elif "ECO" in pre: + return "ICMP Echo" + elif "ECR" in pre: + return "ICMP Reply" + elif "URH" in pre: + return "ICMP Host Unreachable" + elif "URP" in pre: + return "ICMP Port Unreachable" + else: + """ + Examples: + S_RA + S_R + A_R + S_SA + SR_SA + FA_FA + SR_RA + SEC_RA + """ + return "Not Established" + + +def check_tcp_states(state, pkts) -> Optional[str]: + pre = state.split("_")[0] + if "EST" in pre: + # TCP + return "Established" + elif "RST" in pre: + # TCP. When -z B is not used in argus, states are single words. + # Most connections are reseted when finished and therefore are + # established + # It can happen that is reseted being not established, but we + # can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is + # not established because the OS retries 3 times. + return "Not Established" if int(pkts) <= 3 else "Established" + elif "FIN" in pre: + # TCP. When -z B is not used in argus, states are single words. + # Most connections are finished with FIN when finished and + # therefore are established + # It can happen that is finished being not established, but we + # can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is + # not established because the OS retries 3 times. + return "Not Established" if int(pkts) <= 3 else "Established" + else: + """ + Examples: + S_ + FA_ + PA_ + FSA_ + SEC_ + SRPA_ + """ + return "Not Established" + + +def check_udp_states(state) -> Optional[str]: + pre = state.split("_")[0] + if "CON" in pre: + # UDP + return "Established" + elif "INT" in pre: + # UDP trying to connect, NOT preciselly not established but also + # NOT 'Established'. So we considered not established because there + # is no confirmation of what happened. + return "Not Established" + + +def check_icmp_states(state) -> Optional[str]: + pre = state.split("_")[0] + if "ECO" in pre: + # ICMP + return "Established" + elif "UNK" in pre: + # ICMP6 unknown upper layer + return "Established" + + +def get_final_state_from_flags(self, state, pkts) -> str: + """ + Analyze the flags given and return a summary of the state. 
+ Should work with Argus and Bro flags + We receive the pakets to distinguish some Reset connections + """ + try: + if state := check_suricata_states(state): + return state + if state := check_zeek_states(state): + return state + if state := check_argus_states(state): + return state + except IndexError: + # suf does not exist, which means that this is some ICMP or + # no response was sent for UDP or TCP + if state := check_icmp_states(state): + return state + if state := check_udp_states(state): + return state + if state := check_tcp_states(state, pkts): + return state + + return "Not Established" + + except Exception: + exception_line = sys.exc_info()[2].tb_lineno + self.print( + f"Error in get_final_state_from_flags() " f"line {exception_line}", + 0, + 1, + ) + self.print(traceback.format_exc(), 0, 1) From 96e0e65f772b4d7542b762fb500b73aff90b262b Mon Sep 17 00:00:00 2001 From: alya Date: Tue, 30 Jul 2024 14:48:16 +0300 Subject: [PATCH 393/498] state_handler: refactor get_final_state_from_flags() --- slips_files/common/state_handler.py | 67 +++++++++++++---------------- 1 file changed, 29 insertions(+), 38 deletions(-) diff --git a/slips_files/common/state_handler.py b/slips_files/common/state_handler.py index b671a09a2..d0a05115b 100644 --- a/slips_files/common/state_handler.py +++ b/slips_files/common/state_handler.py @@ -1,9 +1,7 @@ from typing import Optional -import sys -import traceback -def check_suricata_states(state) -> Optional[str]: +def interpret_suricata_states(state) -> Optional[str]: """ There are different states in which a flow can be. Suricata distinguishes three flow-states for TCP and two for @@ -18,7 +16,7 @@ def check_suricata_states(state) -> Optional[str]: return "Not Established" -def check_zeek_states(state) -> Optional[str]: +def interpret_zeek_states(state) -> Optional[str]: # We have varius type of states depending on the type of flow. # For Zeek if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): @@ -27,9 +25,13 @@ def check_zeek_states(state) -> Optional[str]: return "Established" -def check_argus_states(state) -> Optional[str]: +def interpret_argus_states(state) -> Optional[str]: pre = state.split("_")[0] - suf = state.split("_")[1] + try: + suf = state.split("_")[1] + except IndexError: + return + if "S" in pre and "A" in pre and "S" in suf and "A" in suf: """ Examples: @@ -86,7 +88,7 @@ def check_argus_states(state) -> Optional[str]: return "Not Established" -def check_tcp_states(state, pkts) -> Optional[str]: +def interpret_tcp_states(state, pkts) -> Optional[str]: pre = state.split("_")[0] if "EST" in pre: # TCP @@ -122,7 +124,7 @@ def check_tcp_states(state, pkts) -> Optional[str]: return "Not Established" -def check_udp_states(state) -> Optional[str]: +def interpret_udp_states(state) -> Optional[str]: pre = state.split("_")[0] if "CON" in pre: # UDP @@ -134,7 +136,7 @@ def check_udp_states(state) -> Optional[str]: return "Not Established" -def check_icmp_states(state) -> Optional[str]: +def interpret_icmp_states(state) -> Optional[str]: pre = state.split("_")[0] if "ECO" in pre: # ICMP @@ -144,36 +146,25 @@ def check_icmp_states(state) -> Optional[str]: return "Established" -def get_final_state_from_flags(self, state, pkts) -> str: +def get_final_state_from_flags(state, pkts) -> str: """ - Analyze the flags given and return a summary of the state. 
- Should work with Argus and Bro flags - We receive the pakets to distinguish some Reset connections + Converts the original flags from the flow, to a state that slips + understands + Works with Argus, suricata, and Bro flags + We receive the packets to distinguish some Reset connections """ - try: - if state := check_suricata_states(state): - return state - if state := check_zeek_states(state): - return state - if state := check_argus_states(state): - return state - except IndexError: - # suf does not exist, which means that this is some ICMP or - # no response was sent for UDP or TCP - if state := check_icmp_states(state): - return state - if state := check_udp_states(state): - return state - if state := check_tcp_states(state, pkts): - return state - return "Not Established" + for interpreter in ( + interpret_suricata_states, + interpret_zeek_states, + interpret_argus_states, + interpret_icmp_states, + interpret_udp_states, + ): + if interpreted_state := interpreter(state): + return interpreted_state + + if interpreted_state := interpret_tcp_states(state, pkts): + return interpreted_state - except Exception: - exception_line = sys.exc_info()[2].tb_lineno - self.print( - f"Error in get_final_state_from_flags() " f"line {exception_line}", - 0, - 1, - ) - self.print(traceback.format_exc(), 0, 1) + return "Not Established" From 5d655d2d2d16440bc9bf6eb07262cbbba7bddb3d Mon Sep 17 00:00:00 2001 From: alya Date: Tue, 30 Jul 2024 14:59:22 +0300 Subject: [PATCH 394/498] delete get_final_state_from_flags() from flowmldetection, profiler, and the db --- modules/flowmldetection/flowmldetection.py | 150 ++---------------- slips_files/core/database/database_manager.py | 3 - 2 files changed, 10 insertions(+), 143 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 16b67e903..3379f5077 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -121,141 +121,6 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) - - def get_final_state_from_flags(self, state, pkts): - """ - Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags - We receive the pakets to distinguish some Reset connections - """ - try: - pre = state.split("_")[0] - try: - # Try suricata states - """ - There are different states in which a flow can be. - Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, - these are: New, Established and Closed,for UDP only new and established. - For each of these states Suricata can employ different timeouts. - """ - if "new" in state or "established" in state: - return "Established" - elif "closed" in state: - return "Not Established" - - # We have varius type of states depending on the type of flow. 
- # For Zeek - if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): - return "Not Established" - elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): - return "Established" - - # For Argus - suf = state.split("_")[1] - if "S" in pre and "A" in pre and "S" in suf and "A" in suf: - """ - Examples: - SA_SA - SR_SA - FSRA_SA - SPA_SPA - SRA_SPA - FSA_FSA - FSA_FSPA - SAEC_SPA - SRPA_SPA - FSPA_SPA - FSRPA_SPA - FSPA_FSPA - FSRA_FSPA - SRAEC_SPA - FSPA_FSRPA - FSAEC_FSPA - FSRPA_FSPA - SRPAEC_SPA - FSPAEC_FSPA - SRPAEC_FSRPA - """ - return "Established" - elif "PA" in pre and "PA" in suf: - # Tipical flow that was reported in the middle - """ - Examples: - PA_PA - FPA_FPA - """ - return "Established" - elif "ECO" in pre: - return "ICMP Echo" - elif "ECR" in pre: - return "ICMP Reply" - elif "URH" in pre: - return "ICMP Host Unreachable" - elif "URP" in pre: - return "ICMP Port Unreachable" - else: - """ - Examples: - S_RA - S_R - A_R - S_SA - SR_SA - FA_FA - SR_RA - SEC_RA - """ - return "Not Established" - except IndexError: - # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP - if "ECO" in pre: - # ICMP - return "Established" - elif "UNK" in pre: - # ICMP6 unknown upper layer - return "Established" - elif "CON" in pre: - # UDP - return "Established" - elif "INT" in pre: - # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there - # is no confirmation of what happened. - return "Not Established" - elif "EST" in pre: - # TCP - return "Established" - elif "RST" in pre: - # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established - # It can happen that is reseted being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. - return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - elif "FIN" in pre: - # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established - # It can happen that is finished being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. 
- return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - else: - """ - Examples: - S_ - FA_ - PA_ - FSA_ - SEC_ - SRPA_ - """ - return "Not Established" - except Exception: - exception_line = sys.exc_info()[2].tb_lineno - self.print( - f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", - 0, - 1, - ) - self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ @@ -291,12 +156,17 @@ def process_features(self, dataset): except (ValueError, KeyError): pass - # When flows are read from Slips sqlite, the state is not transformed to 'Established' or 'Not Established', it is still 'S0' and others + # When flows are read from Slips sqlite, + # the state is not transformed to 'Established' or + # 'Not Established', it is still 'S0' and others # So transform here - #new_state_column = dataset.state.apply(self.get_final_state_from_flags(dataset.state, dataset.pkts)) - dataset['state'] = dataset.apply(lambda row: self.get_final_state_from_flags(row['state'], row['pkts']), axis=1) - - #dataset.state = new_state_column + dataset["state"] = dataset.apply( + lambda row: get_final_state_from_flags( + row["state"], row["pkts"] + ), + axis=1, + ) + # dataset.state = new_state_column # Convert state to categorical dataset.state = dataset.state.str.replace( diff --git a/slips_files/core/database/database_manager.py b/slips_files/core/database/database_manager.py index 1d339685f..568e78ff4 100644 --- a/slips_files/core/database/database_manager.py +++ b/slips_files/core/database/database_manager.py @@ -613,9 +613,6 @@ def add_out_dns(self, *args, **kwargs): def add_port(self, *args, **kwargs): return self.rdb.add_port(*args, **kwargs) - def get_final_state_from_flags(self, *args, **kwargs): - return self.rdb.get_final_state_from_flags(*args, **kwargs) - def add_ips(self, *args, **kwargs): return self.rdb.add_ips(*args, **kwargs) From 8cd019f174817eee464c90c05ba2a3d60365a852 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Mon, 29 Jul 2024 16:32:14 +0200 Subject: [PATCH 395/498] mlflow. Add a function to convert the state again --- modules/flowmldetection/flowmldetection.py | 135 +++++++++++++++++++++ 1 file changed, 135 insertions(+) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 3379f5077..f052931c8 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -121,6 +121,141 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) + + def get_final_state_from_flags(self, state, pkts): + """ + Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags + We receive the pakets to distinguish some Reset connections + """ + try: + pre = state.split("_")[0] + try: + # Try suricata states + """ + There are different states in which a flow can be. + Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, + these are: New, Established and Closed,for UDP only new and established. + For each of these states Suricata can employ different timeouts. + """ + if "new" in state or "established" in state: + return "Established" + elif "closed" in state: + return "Not Established" + + # We have varius type of states depending on the type of flow. 
+ # For Zeek + if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): + return "Not Established" + elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): + return "Established" + + # For Argus + suf = state.split("_")[1] + if "S" in pre and "A" in pre and "S" in suf and "A" in suf: + """ + Examples: + SA_SA + SR_SA + FSRA_SA + SPA_SPA + SRA_SPA + FSA_FSA + FSA_FSPA + SAEC_SPA + SRPA_SPA + FSPA_SPA + FSRPA_SPA + FSPA_FSPA + FSRA_FSPA + SRAEC_SPA + FSPA_FSRPA + FSAEC_FSPA + FSRPA_FSPA + SRPAEC_SPA + FSPAEC_FSPA + SRPAEC_FSRPA + """ + return "Established" + elif "PA" in pre and "PA" in suf: + # Tipical flow that was reported in the middle + """ + Examples: + PA_PA + FPA_FPA + """ + return "Established" + elif "ECO" in pre: + return "ICMP Echo" + elif "ECR" in pre: + return "ICMP Reply" + elif "URH" in pre: + return "ICMP Host Unreachable" + elif "URP" in pre: + return "ICMP Port Unreachable" + else: + """ + Examples: + S_RA + S_R + A_R + S_SA + SR_SA + FA_FA + SR_RA + SEC_RA + """ + return "Not Established" + except IndexError: + # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP + if "ECO" in pre: + # ICMP + return "Established" + elif "UNK" in pre: + # ICMP6 unknown upper layer + return "Established" + elif "CON" in pre: + # UDP + return "Established" + elif "INT" in pre: + # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there + # is no confirmation of what happened. + return "Not Established" + elif "EST" in pre: + # TCP + return "Established" + elif "RST" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established + # It can happen that is reseted being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. + return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + elif "FIN" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established + # It can happen that is finished being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. 
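+                    # (Illustrative example of this heuristic: a FIN-terminated
+                    # flow with pkts=3 is summarized as "Not Established",
+                    # while pkts=4 is summarized as "Established".)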
+ return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + else: + """ + Examples: + S_ + FA_ + PA_ + FSA_ + SEC_ + SRPA_ + """ + return "Not Established" + except Exception: + exception_line = sys.exc_info()[2].tb_lineno + self.print( + f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", + 0, + 1, + ) + self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ From fdfd7fa0e06079e258530995ee65436f0f56bbf9 Mon Sep 17 00:00:00 2001 From: alya Date: Tue, 30 Jul 2024 14:59:22 +0300 Subject: [PATCH 396/498] delete get_final_state_from_flags() from flowmldetection, profiler, and the db --- modules/flowmldetection/flowmldetection.py | 135 --------------------- 1 file changed, 135 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index f052931c8..3379f5077 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -121,141 +121,6 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) - - def get_final_state_from_flags(self, state, pkts): - """ - Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags - We receive the pakets to distinguish some Reset connections - """ - try: - pre = state.split("_")[0] - try: - # Try suricata states - """ - There are different states in which a flow can be. - Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, - these are: New, Established and Closed,for UDP only new and established. - For each of these states Suricata can employ different timeouts. - """ - if "new" in state or "established" in state: - return "Established" - elif "closed" in state: - return "Not Established" - - # We have varius type of states depending on the type of flow. - # For Zeek - if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): - return "Not Established" - elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): - return "Established" - - # For Argus - suf = state.split("_")[1] - if "S" in pre and "A" in pre and "S" in suf and "A" in suf: - """ - Examples: - SA_SA - SR_SA - FSRA_SA - SPA_SPA - SRA_SPA - FSA_FSA - FSA_FSPA - SAEC_SPA - SRPA_SPA - FSPA_SPA - FSRPA_SPA - FSPA_FSPA - FSRA_FSPA - SRAEC_SPA - FSPA_FSRPA - FSAEC_FSPA - FSRPA_FSPA - SRPAEC_SPA - FSPAEC_FSPA - SRPAEC_FSRPA - """ - return "Established" - elif "PA" in pre and "PA" in suf: - # Tipical flow that was reported in the middle - """ - Examples: - PA_PA - FPA_FPA - """ - return "Established" - elif "ECO" in pre: - return "ICMP Echo" - elif "ECR" in pre: - return "ICMP Reply" - elif "URH" in pre: - return "ICMP Host Unreachable" - elif "URP" in pre: - return "ICMP Port Unreachable" - else: - """ - Examples: - S_RA - S_R - A_R - S_SA - SR_SA - FA_FA - SR_RA - SEC_RA - """ - return "Not Established" - except IndexError: - # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP - if "ECO" in pre: - # ICMP - return "Established" - elif "UNK" in pre: - # ICMP6 unknown upper layer - return "Established" - elif "CON" in pre: - # UDP - return "Established" - elif "INT" in pre: - # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there - # is no confirmation of what happened. - return "Not Established" - elif "EST" in pre: - # TCP - return "Established" - elif "RST" in pre: - # TCP. 
When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established
-                    # It can happen that is reseted being not established, but we can't tell without -z b.
-                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
-                    return (
-                        "Not Established" if int(pkts) <= 3 else "Established"
-                    )
-                elif "FIN" in pre:
-                    # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established
-                    # It can happen that is finished being not established, but we can't tell without -z b.
-                    # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times.
-                    return (
-                        "Not Established" if int(pkts) <= 3 else "Established"
-                    )
-                else:
-                    """
-                    Examples:
-                    S_
-                    FA_
-                    PA_
-                    FSA_
-                    SEC_
-                    SRPA_
-                    """
-                    return "Not Established"
-        except Exception:
-            exception_line = sys.exc_info()[2].tb_lineno
-            self.print(
-                f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}",
-                0,
-                1,
-            )
-            self.print(traceback.format_exc(), 0, 1)
 
     def process_features(self, dataset):
         """

From 5de25cdb8e5b0d027fbc3df2f8f0467c2a53d489 Mon Sep 17 00:00:00 2001
From: Sebas Garcia
Date: Sat, 15 Mar 2025 19:23:29 +0100
Subject: [PATCH 405/498] Re add function that alya added

---
 slips_files/core/database/database_manager.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/slips_files/core/database/database_manager.py b/slips_files/core/database/database_manager.py
index 568e78ff4..1d339685f 100644
--- a/slips_files/core/database/database_manager.py
+++ b/slips_files/core/database/database_manager.py
@@ -613,6 +613,9 @@ def add_out_dns(self, *args, **kwargs):
     def add_port(self, *args, **kwargs):
         return self.rdb.add_port(*args, **kwargs)
 
+    def get_final_state_from_flags(self, *args, **kwargs):
+        return self.rdb.get_final_state_from_flags(*args, **kwargs)
+
     def add_ips(self, *args, **kwargs):
         return self.rdb.add_ips(*args, **kwargs)
 
From 2b614c84fb077b37ecff4613981bc5e7bc031574 Mon Sep 17 00:00:00 2001
From: Sebas Garcia
Date: Sat, 15 Mar 2025 19:25:03 +0100
Subject: [PATCH 406/498] delete sys

---
 modules/flowmldetection/flowmldetection.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 3379f5077..c06755a59 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -10,7 +10,6 @@ import json
 import traceback
 import warnings
-import sys
 
 from slips_files.common.parsers.config_parser import ConfigParser
 from slips_files.common.slips_utils import utils
 
From 7bce2ca4fc01178dddafb04b4dcb64a8295e142c Mon Sep 17 00:00:00 2001
From: Sebas Garcia
Date: Sat, 15 Mar 2025 19:27:23 +0100
Subject: [PATCH 407/498] Delete file that was deleted from develop

---
 slips_files/common/state_handler.py | 170 ----------------------------
 1 file changed, 170 deletions(-)
 delete mode 100644 slips_files/common/state_handler.py

diff --git a/slips_files/common/state_handler.py b/slips_files/common/state_handler.py
deleted file mode 100644
index d0a05115b..000000000
--- a/slips_files/common/state_handler.py
+++ /dev/null
@@ -1,170 +0,0 @@
-from typing import Optional
-
-
-def interpret_suricata_states(state) -> Optional[str]:
-    """
-    There are different states in which a flow can be.
-    Suricata distinguishes three flow-states for TCP and two for
-    UDP. 
For TCP, - these are: New, Established and Closed,for UDP only new and - established. - For each of these states Suricata can employ different timeouts. - """ - if "new" in state or "established" in state: - return "Established" - elif "closed" in state: - return "Not Established" - - -def interpret_zeek_states(state) -> Optional[str]: - # We have varius type of states depending on the type of flow. - # For Zeek - if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): - return "Not Established" - elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): - return "Established" - - -def interpret_argus_states(state) -> Optional[str]: - pre = state.split("_")[0] - try: - suf = state.split("_")[1] - except IndexError: - return - - if "S" in pre and "A" in pre and "S" in suf and "A" in suf: - """ - Examples: - SA_SA - SR_SA - FSRA_SA - SPA_SPA - SRA_SPA - FSA_FSA - FSA_FSPA - SAEC_SPA - SRPA_SPA - FSPA_SPA - FSRPA_SPA - FSPA_FSPA - FSRA_FSPA - SRAEC_SPA - FSPA_FSRPA - FSAEC_FSPA - FSRPA_FSPA - SRPAEC_SPA - FSPAEC_FSPA - SRPAEC_FSRPA - """ - return "Established" - elif "PA" in pre and "PA" in suf: - # Tipical flow that was reported in the middle - """ - Examples: - PA_PA - FPA_FPA - """ - return "Established" - elif "ECO" in pre: - return "ICMP Echo" - elif "ECR" in pre: - return "ICMP Reply" - elif "URH" in pre: - return "ICMP Host Unreachable" - elif "URP" in pre: - return "ICMP Port Unreachable" - else: - """ - Examples: - S_RA - S_R - A_R - S_SA - SR_SA - FA_FA - SR_RA - SEC_RA - """ - return "Not Established" - - -def interpret_tcp_states(state, pkts) -> Optional[str]: - pre = state.split("_")[0] - if "EST" in pre: - # TCP - return "Established" - elif "RST" in pre: - # TCP. When -z B is not used in argus, states are single words. - # Most connections are reseted when finished and therefore are - # established - # It can happen that is reseted being not established, but we - # can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is - # not established because the OS retries 3 times. - return "Not Established" if int(pkts) <= 3 else "Established" - elif "FIN" in pre: - # TCP. When -z B is not used in argus, states are single words. - # Most connections are finished with FIN when finished and - # therefore are established - # It can happen that is finished being not established, but we - # can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is - # not established because the OS retries 3 times. - return "Not Established" if int(pkts) <= 3 else "Established" - else: - """ - Examples: - S_ - FA_ - PA_ - FSA_ - SEC_ - SRPA_ - """ - return "Not Established" - - -def interpret_udp_states(state) -> Optional[str]: - pre = state.split("_")[0] - if "CON" in pre: - # UDP - return "Established" - elif "INT" in pre: - # UDP trying to connect, NOT preciselly not established but also - # NOT 'Established'. So we considered not established because there - # is no confirmation of what happened. 
- return "Not Established" - - -def interpret_icmp_states(state) -> Optional[str]: - pre = state.split("_")[0] - if "ECO" in pre: - # ICMP - return "Established" - elif "UNK" in pre: - # ICMP6 unknown upper layer - return "Established" - - -def get_final_state_from_flags(state, pkts) -> str: - """ - Converts the original flags from the flow, to a state that slips - understands - Works with Argus, suricata, and Bro flags - We receive the packets to distinguish some Reset connections - """ - - for interpreter in ( - interpret_suricata_states, - interpret_zeek_states, - interpret_argus_states, - interpret_icmp_states, - interpret_udp_states, - ): - if interpreted_state := interpreter(state): - return interpreted_state - - if interpreted_state := interpret_tcp_states(state, pkts): - return interpreted_state - - return "Not Established" From 62cf6cd7fd287ff669faa225e315eed8ef045b73 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Sat, 15 Mar 2025 19:32:01 +0100 Subject: [PATCH 408/498] Flowmldetection. Fix missing db reference --- modules/flowmldetection/flowmldetection.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index c06755a59..87e07c759 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -160,7 +160,7 @@ def process_features(self, dataset): # 'Not Established', it is still 'S0' and others # So transform here dataset["state"] = dataset.apply( - lambda row: get_final_state_from_flags( + lambda row: self.db.get_final_state_from_flags( row["state"], row["pkts"] ), axis=1, From 4c8f42673eac97e521e16d94d3bbbe03138d3e4f Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Tue, 18 Mar 2025 12:08:08 +0100 Subject: [PATCH 409/498] Fix the training of flows with ML in new version --- modules/flowmldetection/flowmldetection.py | 144 +++++++++++---------- 1 file changed, 77 insertions(+), 67 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 87e07c759..e91495d64 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -55,8 +55,12 @@ def init(self): # Set the output queue of our database instance # Read the configuration self.read_configuration() - # Minum amount of new lables needed to trigger the train - self.minimum_lables_to_retrain = 50 + # Minum amount of new labels needed to start the train + self.minimum_labels_to_start_train = 50 + # Minum amount of new labels needed to retrain + self.minimum_labels_to_retrain = 50 + # The number of flows when last trained + self.last_number_of_flows_when_trained = 0 # To plot the scores of training # self.scores = [] # The scaler trained during training and to use during testing @@ -67,26 +71,25 @@ def init(self): def read_configuration(self): conf = ConfigParser() self.mode = conf.get_ml_mode() + self.label = conf.label() def train(self): """ Train a model based on the flows we receive and the labels """ try: - # Process the labels to have only Normal and Malware - self.flows.label = self.flows.label.str.replace( - r"(^.*ormal.*$)", "Normal", regex=True - ) - self.flows.label = self.flows.label.str.replace( - r"(^.*alware.*$)", "Malware", regex=True - ) - self.flows.label = self.flows.label.str.replace( - r"(^.*alicious.*$)", "Malware", regex=True - ) + # Get the flows from the DB + # self.flows = self.db.get_all_flows_in_profileid_twid(self.profileid, self.twid) + # Convert to 
pandas df + # self.flows = pd.DataFrame(self.flows) + # Process the features + # X_flow = self.process_features(self.flows) - # Separate - y_flow = self.flows["label"] + # Create X_flow with the current flows minus the label X_flow = self.flows.drop("label", axis=1) + # Create y_flow with the label + y_flow = numpy.full(X_flow.shape[0], self.label) + # Drop the module_labels X_flow = X_flow.drop("module_labels", axis=1) # Normalize this batch of data so far. This can get progressivle slow @@ -95,7 +98,7 @@ def train(self): # Train try: self.clf.partial_fit( - X_flow, y_flow, classes=["Malware", "Normal"] + X_flow, y_flow, classes=["Malicious", "Benign"] ) except Exception: self.print("Error while calling clf.train()") @@ -118,7 +121,7 @@ def train(self): self.store_model() except Exception: - self.print("Error in train()", 0, 1) + self.print("Error in train().", 0, 1) self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): @@ -144,9 +147,7 @@ def process_features(self, dataset): "history", "uid", "dir_", - "dbytes", "endtime", - "bytes", "flow_source", ] for field in to_drop: @@ -161,11 +162,10 @@ def process_features(self, dataset): # So transform here dataset["state"] = dataset.apply( lambda row: self.db.get_final_state_from_flags( - row["state"], row["pkts"] + row["state"], (row["spkts"] + row["dpkts"]) ), axis=1, ) - # dataset.state = new_state_column # Convert state to categorical dataset.state = dataset.state.str.replace( @@ -199,7 +199,11 @@ def process_features(self, dataset): dataset.proto = dataset.proto.str.replace( r"(^.*arp.*$)", "4", regex=True ) - fields_to_convert_to_flow = [ + + dataset["allbytes"] = dataset["sbytes"] + dataset["dbytes"] + dataset["pkts"] = dataset["spkts"] + dataset["dpkts"] + + fields_to_convert_to_float = [ dataset.proto, dataset.dport, dataset.sport, @@ -210,10 +214,10 @@ def process_features(self, dataset): dataset.sbytes, dataset.state, ] - for field in fields_to_convert_to_flow: + for field in fields_to_convert_to_float: try: field = field.astype("float64") - except ValueError: + except (ValueError, AttributeError): pass return dataset @@ -222,9 +226,9 @@ def process_features(self, dataset): self.print("Error in process_features()") self.print(traceback.format_exc(), 0, 1) - def process_flows(self): + def process_training_flows(self): """ - Process all the flwos in the DB + Process all the flows in the DB Store the pandas df in self.flows """ try: @@ -240,44 +244,48 @@ def process_flows(self): # that are fake but representative of a normal and malware flow # they are only for the training process # At least 1 flow of each label is required - # self.print(f'Amount of labeled flows: {labels}', 0, 1) + + # These flows should be in the same format as the ones in the DB. + # Which means the satate is still SF, S0, etc. 
             flows.append(
                 {
-                    "ts": 1594417039.029793,
+                    "starttime": 1594417039.029793,
                     "dur": "1.9424750804901123",
                     "saddr": "10.7.10.101",
                     "sport": "49733",
                     "daddr": "40.70.224.145",
                     "dport": "443",
                     "proto": "tcp",
-                    "state": "Established",
-                    "allbytes": 42764,
-                    "spkts": 37,
+                    "state": "SF",
+                    "spkts": 17,
+                    "dpkts": 27,
                     "sbytes": 25517,
+                    "dbytes": 17247,
                     "appproto": "ssl",
-                    "label": "Malware",
+                    "label": "Malicious",
                     "module_labels": {
-                        "flowalerts-long-connection": "Malware"
+                        "flowalerts-long-connection": "Malicious"
                     },
                 }
             )
             flows.append(
                 {
-                    "ts": 1382355032.706468,
+                    "starttime": 1382355032.706468,
                     "dur": "10.896695",
                     "saddr": "147.32.83.52",
                     "sport": "47956",
                     "daddr": "80.242.138.72",
                     "dport": "80",
                     "proto": "tcp",
-                    "state": "Established",
-                    "allbytes": 67696,
+                    "state": "SF",
                     "spkts": 1,
+                    "dpkts": 0,
                     "sbytes": 100,
+                    "dbytes": 67596,
                     "appproto": "http",
-                    "label": "Normal",
+                    "label": "Benign",
                     "module_labels": {
-                        "flowalerts-long-connection": "Normal"
+                        "flowalerts-long-connection": "Benign"
                     },
                 }
             )
@@ -318,7 +326,6 @@ def detect(self, x_flow) -> Optional[numpy.ndarray]:
         and returns the predection array
         """
         try:
-            given_x_flow = x_flow
             # clean the flow
             fields_to_drop = [
                 "label",
@@ -326,10 +333,7 @@ def detect(self, x_flow) -> Optional[numpy.ndarray]:
                 "uid",
                 "history",
                 "dir_",
-                "dbytes",
-                "dpkts",
                 "endtime",
-                "bytes",
                 "flow_source",
                 "ground_truth_label",  # todo now we can use them
                 "detailed_ground_truth_label",
             ]
@@ -345,7 +349,7 @@ def detect(self, x_flow) -> Optional[numpy.ndarray]:
             return pred
         except Exception as e:
             self.print(
-                f"Error in detect() while processing " f"\n{given_x_flow}\n{e}"
+                f"Error in detect() while processing " f"\n{x_flow}\n{e}"
             )
             self.print(traceback.format_exc(), 0, 1)
@@ -437,18 +441,16 @@ def pre_main(self):

     def main(self):
         if msg := self.get_msg("new_flow"):
+            # When a new flow arrives
             msg = json.loads(msg["data"])
-            twid = msg["twid"]
+            self.twid = msg["twid"]
+            self.profileid = msg["profileid"]
             self.flow = msg["flow"]
-            # these fields are expected in testing. update the original
+            # These following extra fields are expected in testing. update the original
            # flow dict to have them
             self.flow.update(
                 {
-                    "allbytes": (self.flow["sbytes"] + self.flow["dbytes"]),
-                    # the flow["state"] is the origstate, we dont need that here
-                    # we need the interpreted state
                     "state": msg["interpreted_state"],
-                    "pkts": self.flow["spkts"] + self.flow["dpkts"],
                     "label": msg["label"],
                     "module_labels": msg["module_labels"],
                 }
@@ -465,23 +463,31 @@ def main(self):
             # Use labeled flows
             labels = self.db.get_labels()
             sum_labeled_flows = sum(i[1] for i in labels)
+
+            # The min labels to retrain is the min number of flows
+            # we should have seen so far in this capture to start training
+            # This is so we dont _start_ training with only 1 flow
+
+            # Once we are over the start minimum, the second condition is
+            # to force to retrain every a minimum_labels_to_retrain number
+            # of flows. So we dont retrain every 1 flow.
             if (
-                sum_labeled_flows >= self.minimum_lables_to_retrain
-                and sum_labeled_flows % self.minimum_lables_to_retrain == 1
+                sum_labeled_flows >= self.minimum_labels_to_start_train
             ):
-                # We get here every 'self.minimum_lables_to_retrain'
-                # amount of labels
-                # So for example we retrain every 100 labels and only when
-                # we have at least 100 labels
-                self.print(
-                    f"Training the model with the last group of "
-                    f"flows and labels. Total flows: {sum_labeled_flows}."
-                )
-                # Process all flows in the DB and make them ready
-                # for pandas
-                self.process_flows()
-                # Train an algorithm
-                self.train()
+                if (sum_labeled_flows - self.last_number_of_flows_when_trained >= self.minimum_labels_to_retrain):
+                    # So for example we retrain every 50 labels and only when
+                    # we have at least 50 labels
+                    self.print(
+                        f"Training the model with the last group of "
+                        f"flows and labels. Total flows: {sum_labeled_flows}."
+                    )
+                    # Process all flows in the DB and make them ready
+                    # for pandas
+                    self.process_training_flows()
+                    # Train an algorithm
+                    self.train()
+                    self.last_number_of_flows_when_trained = sum_labeled_flows
+
         elif self.mode == "test":
             # We are testing, which means using the model to detect
             processed_flow = self.process_flow(self.flow)
@@ -497,8 +507,8 @@ def main(self):
                     # and the label is diff from the prediction,
                     # print in debug mode
                     self.print(
-                        f"Report Prediction {pred[0]} for label"
-                        f' {label} flow {self.flow["saddr"]}:'
+                        f"Predicted {pred[0]} for ground-truth label"
+                        f' {label}. Flow {self.flow["saddr"]}:'
                        f'{self.flow["sport"]} ->'
                         f' {self.flow["daddr"]}:'
                         f'{self.flow["dport"]}/'
@@ -506,9 +516,9 @@ def main(self):
                         0,
                         3,
                     )
-                    if pred[0] == "Malware":
+                    if pred[0] == "Malicious":
                         # Generate an alert
-                        self.set_evidence_malicious_flow(self.flow, twid)
+                        self.set_evidence_malicious_flow(self.flow, self.twid)
                         self.print(
                             f"Prediction {pred[0]} for label {label}"
                             f' flow {self.flow["saddr"]}:'
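A note on the retraining scheme the patch above introduces: from this point on, training no longer trusts per-flow labels; every flow in the batch gets the single label configured for the capture, and retraining fires only after a minimum number of new labeled flows has accumulated. A minimal sketch of that incremental loop, assuming sklearn's SGDClassifier and StandardScaler as used in this module; the train_batch() name and the sample class list are illustrative, not part of the patch:

    import numpy
    import pandas as pd
    from sklearn.linear_model import SGDClassifier
    from sklearn.preprocessing import StandardScaler

    clf = SGDClassifier()  # supports incremental learning via partial_fit
    scaler = StandardScaler()

    def train_batch(flows: pd.DataFrame, capture_label: str) -> None:
        # X: numeric features only; y: the same configured label for every row
        X = flows.drop(columns=["label", "module_labels"], errors="ignore")
        y = numpy.full(X.shape[0], capture_label)
        # The scaler is re-fit on every batch, exactly as train() above does
        X = scaler.fit_transform(X)
        # partial_fit must be given the complete class list on every call
        clf.partial_fit(X, y, classes=["Malicious", "Benign"])

Re-fitting the scaler on each batch means the feature scales can drift between trainings, which is presumably why the module persists the scaler next to the model.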
From 7a1e10fb8a2e19c8a158e05aa9c9fda0157cdbd6 Mon Sep 17 00:00:00 2001
From: Sebas Garcia
Date: Tue, 18 Mar 2025 12:08:29 +0100
Subject: [PATCH 410/498] Fix the profiler handler for cases of nan in state

---
 slips_files/core/database/redis_db/profile_handler.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/slips_files/core/database/redis_db/profile_handler.py b/slips_files/core/database/redis_db/profile_handler.py
index a6669c92a..ab53cc4ab 100644
--- a/slips_files/core/database/redis_db/profile_handler.py
+++ b/slips_files/core/database/redis_db/profile_handler.py
@@ -423,6 +423,7 @@ def get_final_state_from_flags(self, state, pkts):
             return "Established"

         # For Argus
+        # In some flows the state is a nan
         try:
             suf = state.split("_")[1]
         except AttributeError:

From c76c96344d42a17d3c3e5d51c868abe3896e5d76 Mon Sep 17 00:00:00 2001
From: Sebas Garcia
Date: Wed, 19 Mar 2025 14:22:38 +0100
Subject: [PATCH 411/498] flowml. If the dataset has one flow and that is
 deleted, then return empty fast.

---
 modules/flowmldetection/flowmldetection.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index e91495d64..58b4ce1e4 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -135,6 +135,11 @@ def process_features(self, dataset):
             for proto in to_discard:
                 dataset = dataset[dataset.proto != proto]

+            # If te proto is in the list to delete and there is only one flow, then the dataset will be empty
+            if dataset.empty:
+                # DataFrame is empty now, so return empty
+                return dataset
+
             # For now, discard these

From 74007e82690dbbd14787bd237f37e5507ca62b90 Mon Sep 17 00:00:00 2001
From: Sebas Garcia
Date: Wed, 19 Mar 2025 14:23:05 +0100
Subject: [PATCH 412/498] flowml. If the dataset is empty.
Return none --- modules/flowmldetection/flowmldetection.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 58b4ce1e4..4a4d46e37 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -318,6 +318,8 @@ def process_flow(self, flow_to_process: dict): # Convert the flow to a pandas dataframe raw_flow = pd.DataFrame(flow_to_process, index=[0]) dflow = self.process_features(raw_flow) + if dflow.empty: + return None # Update the flow to the processed version return dflow except Exception: From deefde05178f98f7b1ef9ee9c7b54c6b549b0f5b Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Wed, 19 Mar 2025 14:27:16 +0100 Subject: [PATCH 413/498] First new version of the model and scaler. Not good yet, but working. --- modules/flowmldetection/model.bin | Bin 1124 -> 1090 bytes modules/flowmldetection/scaler.bin | Bin 890 -> 890 bytes 2 files changed, 0 insertions(+), 0 deletions(-) diff --git a/modules/flowmldetection/model.bin b/modules/flowmldetection/model.bin index a6648cf72179520975b0e9ad1164f7d574e87140..7ea3d27c97ca27abe7bcef4f35f2057da4bf2b0f 100644 GIT binary patch delta 130 zcmV-|0Db@D2*L;ifCQB{u>>gtTsw(FScJ$7xocP4|5 zn$$WZz^$}67u&zYKfDP4vYKo~KL^fCQCUu>>gtlhXnvSy-jcOc_je6TteGguV+y7r>O2=ii6N`@aE8 z4q_05T(v)g91VHfmFeIMvRKFpJJ~89v lBES;fQwQSX>_3x<11kt29Z*;7CnUg=yaQDN?(vfo1TSe6JFWl# diff --git a/modules/flowmldetection/scaler.bin b/modules/flowmldetection/scaler.bin index 17115724b9536f6093f9d72f3b58a5c22c562a9a..bfba4d107224e5e6e5a1e8c8f4d463b48131d111 100644 GIT binary patch delta 290 zcmV+-0p0%k2KolDvjGBX0h6@>Pgl=^Tj5WVAwXGOhP~3C<3D)c8d<%4*gy{6w?@|U z%s@r+gj&Fu5J2_kY{!Qn^*`?T5Tx7$dq73=gj&E@c0d))6KVlt!ao`IN`KC-h(JOu zU`*oR6_Z;6CRfo>BRz3ectB?@J_D&+&OhD+nN>A{w?PryCGcfmcQ~I?m(e2&@mlG zqd?6RN*?ZoGC;4up#2u+D?s5YufU*hus|2)Nzwn}vOwP`V1tl2?mv8p{76u#!9bEa zjm~^LK$BYnCRf}Ju^9H~BS8=X&<+7gXFor!l)9lJXF)UUPqWf%b3r-wJEWTtnm`Ac zf^|*Gc0elJHxDoXo Date: Thu, 20 Mar 2025 13:16:06 +0100 Subject: [PATCH 414/498] model and scaler with 1 malicious and 1 benign --- modules/flowmldetection/model.bin | Bin 1090 -> 1124 bytes modules/flowmldetection/scaler.bin | Bin 890 -> 890 bytes 2 files changed, 0 insertions(+), 0 deletions(-) diff --git a/modules/flowmldetection/model.bin b/modules/flowmldetection/model.bin index 7ea3d27c97ca27abe7bcef4f35f2057da4bf2b0f..0fac693b39f8e2f0e826471e72a52010709a2a4a 100644 GIT binary patch delta 132 zcmX@a@q~k=fn{psMix!x$(NZ_BO`RCb>5jSYK$N!g2|oC+8oRs(l>gtTsw(FScJ$7xocP4|5 zn$$WZz^$}67u&zYKfDP4vYKo~KfmcQ~I?m(e2&@mlG zqd?6RN*?ZoGC;4up#2u+D?s5YufU*hus|2)Nzwn}vOwP`V1tl2?mv8p{76u#!9bEa zjm~^LK$BYnCRf}Ju^9H~BS8=X&<+7gXFor!l)9lJXF)UUPqWf%b3r!tJEWTtnm`Ac zf^|*Gc0elJHxDoXoPgl=^Tj5WVAwXGOhP~3C<3D)c8d<%4*gy{6w?@|U z%s@r+gj&Fu5J2_kY{!Qn^*`?T5Tx7$dq73=gj&E@c0d))6KVlt!ao`IN`KC-h(JOu zU`*oR6_Z;6CRfo>BRz3ectB?@J_D&+&OhD+nN>A{w?PryCGc Date: Thu, 20 Mar 2025 13:16:27 +0100 Subject: [PATCH 415/498] cleaner jupyter --- modules/flowmldetection/flowmldetection.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 4a4d46e37..d8e9ada27 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -345,6 +345,23 @@ def detect(self, x_flow) -> Optional[numpy.ndarray]: "ground_truth_label", # todo now we can use them "detailed_ground_truth_label", ] + # For argus binetflows this fails because ther is a field calle 
bytes that was not in other flows. It should be called allbytes. + # Error + ''' [Flow ML Detection] Error in detect() while processing + dur proto sport dport state pkts spkts dpkts bytes sbytes dbytes allbytes + 0 63.822830 0 56119 981 0.0 15 15 0 8764 1887 0 1887 + The feature names should match those that were passed during fit. + Feature names unseen at fit time: + - bytes + ''' + + # IF we delete here the filed bytes the error is + # [Flow ML Detection] Error in detect() while processing + # dur proto sport dport state pkts spkts dpkts sbytes dbytes allbytes + # 0 63.822830 0 56120 980 0.0 15 15 0 1887 0 1887 + # The feature names should match those that were passed during fit. + # Feature names must be in the same order as they were in fit. + for field in fields_to_drop: try: x_flow = x_flow.drop(field, axis=1) From b558c05d455ee9651e29e7eef3d4045ad1241ade Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Thu, 20 Mar 2025 22:26:27 +0100 Subject: [PATCH 416/498] New models after 3rd ttrain --- modules/flowmldetection/model.bin | Bin 1124 -> 1124 bytes modules/flowmldetection/scaler.bin | Bin 890 -> 890 bytes 2 files changed, 0 insertions(+), 0 deletions(-) diff --git a/modules/flowmldetection/model.bin b/modules/flowmldetection/model.bin index 0fac693b39f8e2f0e826471e72a52010709a2a4a..5ebcb37205a7a3e8d0a20ab78a1219f0ddf95dd8 100644 GIT binary patch delta 99 zcmaFD@q}YTFtfkevYaFSxkd-_3;rCozh!)2lYmpEbEdGvA?N`Qv**dmWkY#6EnbNU>V$pfHc5?(6z~90v14#>uEKZXcoY2J@~Q; zm@vT916v-#h9y9THMMtpw*Eia3Y9($*ABq#7l@Rev)(_(4lO-NtuB+I1CayN#ekDG F1TN2UERg^J diff --git a/modules/flowmldetection/scaler.bin b/modules/flowmldetection/scaler.bin index 758909b289238ff282b2e056a9b3e83768b8472a..821344a0c69d116622b02e2a0daa1554cb5d308e 100644 GIT binary patch delta 43 zcmV+`0M!5b2KolDfdU!c4zU>a=p#W80?-ZtN@qVmt(3Za=p#W80?-ZtN@qVmt(3Z Date: Wed, 26 Mar 2025 00:08:50 +0100 Subject: [PATCH 417/498] Models after 4th train --- modules/flowmldetection/model.bin | Bin 1124 -> 1124 bytes modules/flowmldetection/scaler.bin | Bin 890 -> 890 bytes 2 files changed, 0 insertions(+), 0 deletions(-) diff --git a/modules/flowmldetection/model.bin b/modules/flowmldetection/model.bin index 5ebcb37205a7a3e8d0a20ab78a1219f0ddf95dd8..3ab5a240bb45f88d026d1d9d1959cfa384e2473b 100644 GIT binary patch delta 120 zcmV-;0EhqN2;>N`Qv+ChN@}Q2`wPIHjIJ0X6&b*sXgPyxz!<>7-o1V`boIY<&cGv* zMA;T^cz6USY()1i3vc*(ICW>9REM~el3OL8yLVz+8BZ7+Y~^P as{<PghU~{@&OfB0!%%<|y61;y>t*IZ8K{*+AXCvU*qB z%0Pe2QM!F)5kR-EtllLL_CF>M2ApwSd_Z`~g8^q;cR(Gb+gDca!aq;!#7^woi9l`m z{4q!A7L!{7CRc2&pGU{qctGGXl*-2r&p+4h*(lZXw?Qu&=iylTwLyoc9%Oz?d_V=A za=mcAXh1m7B9yUmbwL&Oq6=*NZ9yf7dx}OPz(0XAuDu67ghArqkBr`9lats2C|9(o zHuM(pGeE6wqA`7j-#@s^-O-hk(?CdW8YGjz(m<`3N;vgKH9+Ch9fX%7EI^{)^od!9 ovOtBo|Dy%^u0ZgT2LsNy);|K#Ksj}DxjML6cVlBO2|D9RL6T delta 290 zcmV+-0p0%k2KolDvjGBo1(US_Pgl4iHtM!%U_hWRf%FVpXFo>fmcQ~I?m(e2&@mlG zqd?6RN*?ZoGC;4up#2u+D?s5YufU*hus|2)Nzwn}vOwP`V1tl2?mv8p{76u#!9bEa zjm~^LK$BYnCRg4Lu^9H~BS8=X&<+7gXFor!l)9lJXF)RTPqWf%b3r!tJEWTtnm`Ac zf^|*Gc0elJHxDoXoty4FCWD From 4a448bc3b8ece80ad6b783d0809e6c93ad0c452e Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Wed, 26 Mar 2025 08:28:59 +0100 Subject: [PATCH 418/498] Models of ml flow with the first good performance in small tests --- modules/flowmldetection/model.bin | Bin 1124 -> 1124 bytes modules/flowmldetection/scaler.bin | Bin 890 -> 890 bytes 2 files changed, 0 insertions(+), 0 deletions(-) diff --git a/modules/flowmldetection/model.bin b/modules/flowmldetection/model.bin index 
3ab5a240bb45f88d026d1d9d1959cfa384e2473b..a6648cf72179520975b0e9ad1164f7d574e87140 100644 GIT binary patch delta 121 zcmV-<0EYkM2;>N`Qv+C~&P*9hb`!w*mV~|wLl?l5mFM4w$NRqlOAcXmpyfaAG1(); zYA3+ulRv$a;~zkIU#E>ocI-ba#L|>%Hv~ZN`Y?b6^limYW1McwvlQsk{8#y@u delta 121 zcmV-<0EYkM2;>N`Qv+ChN@}Q2`wPIHjIJ0X6&b*sXgPyxz!<>7-o1V`boIY<&cGv* zMA;T^cz6USY()1i3vc*(ICW>9REM~el3OL8yLVz+8BZ7+Y~^P bs{<fmcQ~I?m(e2&@mlG zqd?6RN*?ZoGC;4up#2u+D?s5YufU*hus|2)Nzwn}vOwP`V1tl2?mv8p{76u#!9bEa zjm~^LK$BYnCRf}Ju^9H~BS8=X&<+7gXFor!l)9lJXF)UUPqWf%b3r-wJEWTtnm`Ac zf^|*Gc0elJHxDoXoPghU~{@&OfB0!%%<|y61;y>t*IZ8K{*+AXCvU*qB z%0Pe2QM!F)5kR-EtllLL_CF>M2ApwSd_Z`~g8^q;cR(Gb+gDca!aq;!#7^woi9l`m z{4q!A7L!{7CRc2&pGU{qctGGXl*-2r&p+4h*(lZXw?Qu&=iylTwLyoc9%Oz?d_V=A za=mcAXh1m7B9yUmbwL&Oq6=*NZ9yf7dx}OPz(0XAuDu67ghArqkBr`9lats2C|9(o zHuM(pGeE6wqA`7j-#@s^-O-hk(?CdW8YGjz(m<`3N;vgKH9+Ch9fX%7EI^{)^od!9 ovOtBo|Dy%^u0ZgT2LsNy);|K#Ksj}DxjML6cVlBO2|D9RL6T From a2b5b9917a802f3810fa3c7b4719e69dfbb1b37c Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Mon, 29 Jul 2024 16:32:14 +0200 Subject: [PATCH 419/498] mlflow. Add a function to convert the state again --- modules/flowmldetection/flowmldetection.py | 319 +++++++++++++-------- 1 file changed, 207 insertions(+), 112 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index d8e9ada27..1fa77de01 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -10,6 +10,7 @@ import json import traceback import warnings +import sys from slips_files.common.parsers.config_parser import ConfigParser from slips_files.common.slips_utils import utils @@ -55,12 +56,8 @@ def init(self): # Set the output queue of our database instance # Read the configuration self.read_configuration() - # Minum amount of new labels needed to start the train - self.minimum_labels_to_start_train = 50 - # Minum amount of new labels needed to retrain - self.minimum_labels_to_retrain = 50 - # The number of flows when last trained - self.last_number_of_flows_when_trained = 0 + # Minum amount of new lables needed to trigger the train + self.minimum_lables_to_retrain = 50 # To plot the scores of training # self.scores = [] # The scaler trained during training and to use during testing @@ -71,25 +68,26 @@ def init(self): def read_configuration(self): conf = ConfigParser() self.mode = conf.get_ml_mode() - self.label = conf.label() def train(self): """ Train a model based on the flows we receive and the labels """ try: - # Get the flows from the DB - # self.flows = self.db.get_all_flows_in_profileid_twid(self.profileid, self.twid) - # Convert to pandas df - # self.flows = pd.DataFrame(self.flows) - # Process the features - # X_flow = self.process_features(self.flows) + # Process the labels to have only Normal and Malware + self.flows.label = self.flows.label.str.replace( + r"(^.*ormal.*$)", "Normal", regex=True + ) + self.flows.label = self.flows.label.str.replace( + r"(^.*alware.*$)", "Malware", regex=True + ) + self.flows.label = self.flows.label.str.replace( + r"(^.*alicious.*$)", "Malware", regex=True + ) - # Create X_flow with the current flows minus the label + # Separate + y_flow = self.flows["label"] X_flow = self.flows.drop("label", axis=1) - # Create y_flow with the label - y_flow = numpy.full(X_flow.shape[0], self.label) - # Drop the module_labels X_flow = X_flow.drop("module_labels", axis=1) # Normalize this batch of data so far. 
This can get progressivle slow @@ -98,7 +96,7 @@ def train(self): # Train try: self.clf.partial_fit( - X_flow, y_flow, classes=["Malicious", "Benign"] + X_flow, y_flow, classes=["Malware", "Normal"] ) except Exception: self.print("Error while calling clf.train()") @@ -121,7 +119,142 @@ def train(self): self.store_model() except Exception: - self.print("Error in train().", 0, 1) + self.print("Error in train()", 0, 1) + self.print(traceback.format_exc(), 0, 1) + + def get_final_state_from_flags(self, state, pkts): + """ + Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags + We receive the pakets to distinguish some Reset connections + """ + try: + pre = state.split("_")[0] + try: + # Try suricata states + """ + There are different states in which a flow can be. + Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, + these are: New, Established and Closed,for UDP only new and established. + For each of these states Suricata can employ different timeouts. + """ + if "new" in state or "established" in state: + return "Established" + elif "closed" in state: + return "Not Established" + + # We have varius type of states depending on the type of flow. + # For Zeek + if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): + return "Not Established" + elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): + return "Established" + + # For Argus + suf = state.split("_")[1] + if "S" in pre and "A" in pre and "S" in suf and "A" in suf: + """ + Examples: + SA_SA + SR_SA + FSRA_SA + SPA_SPA + SRA_SPA + FSA_FSA + FSA_FSPA + SAEC_SPA + SRPA_SPA + FSPA_SPA + FSRPA_SPA + FSPA_FSPA + FSRA_FSPA + SRAEC_SPA + FSPA_FSRPA + FSAEC_FSPA + FSRPA_FSPA + SRPAEC_SPA + FSPAEC_FSPA + SRPAEC_FSRPA + """ + return "Established" + elif "PA" in pre and "PA" in suf: + # Tipical flow that was reported in the middle + """ + Examples: + PA_PA + FPA_FPA + """ + return "Established" + elif "ECO" in pre: + return "ICMP Echo" + elif "ECR" in pre: + return "ICMP Reply" + elif "URH" in pre: + return "ICMP Host Unreachable" + elif "URP" in pre: + return "ICMP Port Unreachable" + else: + """ + Examples: + S_RA + S_R + A_R + S_SA + SR_SA + FA_FA + SR_RA + SEC_RA + """ + return "Not Established" + except IndexError: + # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP + if "ECO" in pre: + # ICMP + return "Established" + elif "UNK" in pre: + # ICMP6 unknown upper layer + return "Established" + elif "CON" in pre: + # UDP + return "Established" + elif "INT" in pre: + # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there + # is no confirmation of what happened. + return "Not Established" + elif "EST" in pre: + # TCP + return "Established" + elif "RST" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established + # It can happen that is reseted being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. + return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + elif "FIN" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established + # It can happen that is finished being not established, but we can't tell without -z b. 
+ # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. + return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + else: + """ + Examples: + S_ + FA_ + PA_ + FSA_ + SEC_ + SRPA_ + """ + return "Not Established" + except Exception: + exception_line = sys.exc_info()[2].tb_lineno + self.print( + f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", + 0, + 1, + ) self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): @@ -135,12 +268,7 @@ def process_features(self, dataset): for proto in to_discard: dataset = dataset[dataset.proto != proto] - # If te proto is in the list to delete and there is only one flow, then the dataset will be empty - if dataset.empty: - # DataFrame is empty now, so return empty - return dataset - - # For now, discard these + # For now, discard the ports to_drop = [ "appproto", "daddr", @@ -152,7 +280,9 @@ def process_features(self, dataset): "history", "uid", "dir_", + "dbytes", "endtime", + "bytes", "flow_source", ] for field in to_drop: @@ -161,16 +291,12 @@ def process_features(self, dataset): except (ValueError, KeyError): pass - # When flows are read from Slips sqlite, - # the state is not transformed to 'Established' or - # 'Not Established', it is still 'S0' and others + # When flows are read from Slips sqlite, the state is not transformed to 'Established' or 'Not Established', it is still 'S0' and others # So transform here - dataset["state"] = dataset.apply( - lambda row: self.db.get_final_state_from_flags( - row["state"], (row["spkts"] + row["dpkts"]) - ), - axis=1, - ) + #new_state_column = dataset.state.apply(self.get_final_state_from_flags(dataset.state, dataset.pkts)) + dataset['state'] = dataset.apply(lambda row: self.get_final_state_from_flags(row['state'], row['pkts']), axis=1) + + #dataset.state = new_state_column # Convert state to categorical dataset.state = dataset.state.str.replace( @@ -204,11 +330,7 @@ def process_features(self, dataset): dataset.proto = dataset.proto.str.replace( r"(^.*arp.*$)", "4", regex=True ) - - dataset["allbytes"] = dataset["sbytes"] + dataset["dbytes"] - dataset["pkts"] = dataset["spkts"] + dataset["dpkts"] - - fields_to_convert_to_float = [ + fields_to_convert_to_flow = [ dataset.proto, dataset.dport, dataset.sport, @@ -219,10 +341,10 @@ def process_features(self, dataset): dataset.sbytes, dataset.state, ] - for field in fields_to_convert_to_float: + for field in fields_to_convert_to_flow: try: field = field.astype("float64") - except (ValueError, AttributeError): + except ValueError: pass return dataset @@ -231,9 +353,9 @@ def process_features(self, dataset): self.print("Error in process_features()") self.print(traceback.format_exc(), 0, 1) - def process_training_flows(self): + def process_flows(self): """ - Process all the flows in the DB + Process all the flwos in the DB Store the pandas df in self.flows """ try: @@ -249,48 +371,44 @@ def process_training_flows(self): # that are fake but representative of a normal and malware flow # they are only for the training process # At least 1 flow of each label is required - - # These flows should be in the same format as the ones in the DB. - # Which means the satate is still SF, S0, etc. 
+            # self.print(f'Amount of labeled flows: {labels}', 0, 1)
             flows.append(
                 {
-                    "starttime": 1594417039.029793,
+                    "ts": 1594417039.029793,
                     "dur": "1.9424750804901123",
                     "saddr": "10.7.10.101",
                     "sport": "49733",
                     "daddr": "40.70.224.145",
                     "dport": "443",
                     "proto": "tcp",
-                    "state": "SF",
-                    "spkts": 17,
-                    "dpkts": 27,
+                    "state": "Established",
+                    "allbytes": 42764,
+                    "spkts": 37,
                     "sbytes": 25517,
-                    "dbytes": 17247,
                     "appproto": "ssl",
-                    "label": "Malicious",
+                    "label": "Malware",
                     "module_labels": {
-                        "flowalerts-long-connection": "Malicious"
+                        "flowalerts-long-connection": "Malware"
                     },
                 }
             )
             flows.append(
                 {
-                    "starttime": 1382355032.706468,
+                    "ts": 1382355032.706468,
                     "dur": "10.896695",
                     "saddr": "147.32.83.52",
                     "sport": "47956",
                     "daddr": "80.242.138.72",
                     "dport": "80",
                     "proto": "tcp",
-                    "state": "SF",
+                    "state": "Established",
+                    "allbytes": 67696,
                     "spkts": 1,
-                    "dpkts": 0,
                     "sbytes": 100,
-                    "dbytes": 67596,
                     "appproto": "http",
-                    "label": "Benign",
+                    "label": "Normal",
                     "module_labels": {
-                        "flowalerts-long-connection": "Benign"
+                        "flowalerts-long-connection": "Normal"
                     },
                 }
             )
@@ -318,8 +436,6 @@ def process_flow(self, flow_to_process: dict):
             # Convert the flow to a pandas dataframe
             raw_flow = pd.DataFrame(flow_to_process, index=[0])
             dflow = self.process_features(raw_flow)
-            if dflow.empty:
-                return None
             # Update the flow to the processed version
             return dflow
         except Exception:
@@ -333,6 +449,7 @@ def detect(self, x_flow) -> Optional[numpy.ndarray]:
         and returns the predection array
         """
         try:
+            given_x_flow = x_flow
             # clean the flow
             fields_to_drop = [
                 "label",
@@ -340,28 +457,12 @@ def detect(self, x_flow) -> Optional[numpy.ndarray]:
                 "uid",
                 "history",
                 "dir_",
+                "dbytes",
+                "dpkts",
                 "endtime",
+                "bytes",
                 "flow_source",
-                "ground_truth_label",  # todo now we can use them
-                "detailed_ground_truth_label",
             ]
-            # For argus binetflows this fails because ther is a field calle bytes that was not in other flows. It should be called allbytes.
-            # Error
-            ''' [Flow ML Detection] Error in detect() while processing
-                dur  proto  sport  dport  state  pkts  spkts  dpkts  bytes  sbytes  dbytes  allbytes
-            0  63.822830      0  56119    981    0.0    15     15      0   8764    1887       0      1887
-            The feature names should match those that were passed during fit.
-            Feature names unseen at fit time:
-            - bytes
-            '''
-
-            # IF we delete here the filed bytes the error is
-            # [Flow ML Detection] Error in detect() while processing
-            #    dur  proto  sport  dport  state  pkts  spkts  dpkts  sbytes  dbytes  allbytes
-            # 0  63.822830      0  56120    980    0.0    15     15      0    1887       0      1887
-            # The feature names should match those that were passed during fit.
-            # Feature names must be in the same order as they were in fit.
-
             for field in fields_to_drop:
                 try:
                     x_flow = x_flow.drop(field, axis=1)
@@ -373,7 +474,7 @@ def detect(self, x_flow) -> Optional[numpy.ndarray]:
             return pred
         except Exception as e:
             self.print(
-                f"Error in detect() while processing " f"\n{x_flow}\n{e}"
+                f"Error in detect() while processing " f"\n{given_x_flow}\n{e}"
             )
             self.print(traceback.format_exc(), 0, 1)
@@ -441,16 +566,18 @@ def pre_main(self):

     def main(self):
         if msg := self.get_msg("new_flow"):
-            # When a new flow arrives
             msg = json.loads(msg["data"])
-            self.twid = msg["twid"]
-            self.profileid = msg["profileid"]
+            twid = msg["twid"]
             self.flow = msg["flow"]
-            # These following extra fields are expected in testing. update the original
+            # these fields are expected in testing. update the original
             # flow dict to have them
             self.flow.update(
                 {
+                    "allbytes": (self.flow["sbytes"] + self.flow["dbytes"]),
+                    # the flow["state"] is the origstate, we dont need that here
+                    # we need the interpreted state
                     "state": msg["interpreted_state"],
+                    "pkts": self.flow["spkts"] + self.flow["dpkts"],
                     "label": msg["label"],
                     "module_labels": msg["module_labels"],
                 }
@@ -487,31 +590,23 @@ def main(self):
             # Use labeled flows
             labels = self.db.get_labels()
             sum_labeled_flows = sum(i[1] for i in labels)
-
-            # The min labels to retrain is the min number of flows
-            # we should have seen so far in this capture to start training
-            # This is so we dont _start_ training with only 1 flow
-
-            # Once we are over the start minimum, the second condition is
-            # to force to retrain every a minimum_labels_to_retrain number
-            # of flows. So we dont retrain every 1 flow.
             if (
-                sum_labeled_flows >= self.minimum_labels_to_start_train
+                sum_labeled_flows >= self.minimum_lables_to_retrain
+                and sum_labeled_flows % self.minimum_lables_to_retrain == 1
             ):
-                if (sum_labeled_flows - self.last_number_of_flows_when_trained >= self.minimum_labels_to_retrain):
-                    # So for example we retrain every 50 labels and only when
-                    # we have at least 50 labels
-                    self.print(
-                        f"Training the model with the last group of "
-                        f"flows and labels. Total flows: {sum_labeled_flows}."
-                    )
-                    # Process all flows in the DB and make them ready
-                    # for pandas
-                    self.process_training_flows()
-                    # Train an algorithm
-                    self.train()
-                    self.last_number_of_flows_when_trained = sum_labeled_flows
-
+                # We get here every 'self.minimum_lables_to_retrain'
+                # amount of labels
+                # So for example we retrain every 100 labels and only when
+                # we have at least 100 labels
+                self.print(
+                    f"Training the model with the last group of "
+                    f"flows and labels. Total flows: {sum_labeled_flows}."
+                )
+                # Process all flows in the DB and make them ready
+                # for pandas
+                self.process_flows()
+                # Train an algorithm
+                self.train()
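The process_features() restored by the patch above does its categorical encoding with whole-value regex rewrites. A condensed sketch of that trick on a toy frame, with hedges: the column names follow the module, the sample rows are invented, and the exact label spelling ("NotEstablished" versus "Not Established") shifts between the patches in this series:

    import pandas as pd

    df = pd.DataFrame(
        {
            "proto": ["tcp", "udp", "icmp"],
            "state": ["Not Established", "Established", "Established"],
        }
    )
    # The narrower "Not Established" pattern must run first; once rewritten
    # to "0" it can no longer be caught by the broader "Established" pattern.
    df.state = df.state.str.replace(r"(^.*Not Established.*$)", "0", regex=True)
    df.state = df.state.str.replace(r"(^.*Established.*$)", "1", regex=True)
    df.state = df.state.astype("float64")

    # Protocols are hardcoded to small integers the same way
    for pattern, code in (
        (r"(^.*tcp.*$)", "0"),
        (r"(^.*udp.*$)", "1"),
        (r"(^.*icmp.*$)", "2"),
        (r"(^.*arp.*$)", "4"),
    ):
        df.proto = df.proto.str.replace(pattern, code, regex=True)
    df.proto = df.proto.astype("float64")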
From 5df2e70c0ea96004493eca3423768d6ab4347cab Mon Sep 17 00:00:00 2001
From: alya
Date: Tue, 30 Jul 2024 14:35:21 +0300
Subject: [PATCH 420/498] state_handler: split get_final_state_from_flags()
 into smaller functions

---
 slips_files/common/state_handler.py | 179 ++++++++++++++++++++++
 1 file changed, 179 insertions(+)
 create mode 100644 slips_files/common/state_handler.py

diff --git a/slips_files/common/state_handler.py b/slips_files/common/state_handler.py
new file mode 100644
index 000000000..b671a09a2
--- /dev/null
+++ b/slips_files/common/state_handler.py
@@ -0,0 +1,179 @@
+from typing import Optional
+import sys
+import traceback
+
+
+def check_suricata_states(state) -> Optional[str]:
+    """
+    There are different states in which a flow can be.
+    Suricata distinguishes three flow-states for TCP and two for
+    UDP. For TCP,
+    these are: New, Established and Closed,for UDP only new and
+    established.
+    For each of these states Suricata can employ different timeouts.
+    """
+    if "new" in state or "established" in state:
+        return "Established"
+    elif "closed" in state:
+        return "Not Established"
+
+
+def check_zeek_states(state) -> Optional[str]:
+    # We have varius type of states depending on the type of flow.
+    # For Zeek
+    if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
+        return "Not Established"
+    elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"):
+        return "Established"
+
+
+def check_argus_states(state) -> Optional[str]:
+    pre = state.split("_")[0]
+    suf = state.split("_")[1]
+    if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
+        """
+        Examples:
+        SA_SA
+        SR_SA
+        FSRA_SA
+        SPA_SPA
+        SRA_SPA
+        FSA_FSA
+        FSA_FSPA
+        SAEC_SPA
+        SRPA_SPA
+        FSPA_SPA
+        FSRPA_SPA
+        FSPA_FSPA
+        FSRA_FSPA
+        SRAEC_SPA
+        FSPA_FSRPA
+        FSAEC_FSPA
+        FSRPA_FSPA
+        SRPAEC_SPA
+        FSPAEC_FSPA
+        SRPAEC_FSRPA
+        """
+        return "Established"
+    elif "PA" in pre and "PA" in suf:
+        # Tipical flow that was reported in the middle
+        """
+        Examples:
+        PA_PA
+        FPA_FPA
+        """
+        return "Established"
+    elif "ECO" in pre:
+        return "ICMP Echo"
+    elif "ECR" in pre:
+        return "ICMP Reply"
+    elif "URH" in pre:
+        return "ICMP Host Unreachable"
+    elif "URP" in pre:
+        return "ICMP Port Unreachable"
+    else:
+        """
+        Examples:
+        S_RA
+        S_R
+        A_R
+        S_SA
+        SR_SA
+        FA_FA
+        SR_RA
+        SEC_RA
+        """
+        return "Not Established"
+
+
+def check_tcp_states(state, pkts) -> Optional[str]:
+    pre = state.split("_")[0]
+    if "EST" in pre:
+        # TCP
+        return "Established"
+    elif "RST" in pre:
+        # TCP. When -z B is not used in argus, states are single words.
+        # Most connections are reseted when finished and therefore are
+        # established
+        # It can happen that is reseted being not established, but we
+        # can't tell without -z b.
+        # So we use as heuristic the amount of packets. If <=3, then is
+        # not established because the OS retries 3 times.
+        return "Not Established" if int(pkts) <= 3 else "Established"
+    elif "FIN" in pre:
+        # TCP. When -z B is not used in argus, states are single words.
+        # Most connections are finished with FIN when finished and
+        # therefore are established
+        # It can happen that is finished being not established, but we
+        # can't tell without -z b.
+        # So we use as heuristic the amount of packets. If <=3, then is
+        # not established because the OS retries 3 times.
+        return "Not Established" if int(pkts) <= 3 else "Established"
+    else:
+        """
+        Examples:
+        S_
+        FA_
+        PA_
+        FSA_
+        SEC_
+        SRPA_
+        """
+        return "Not Established"
+
+
+def check_udp_states(state) -> Optional[str]:
+    pre = state.split("_")[0]
+    if "CON" in pre:
+        # UDP
+        return "Established"
+    elif "INT" in pre:
+        # UDP trying to connect, NOT preciselly not established but also
+        # NOT 'Established'. So we considered not established because there
+        # is no confirmation of what happened.
+        return "Not Established"
+
+
+def check_icmp_states(state) -> Optional[str]:
+    pre = state.split("_")[0]
+    if "ECO" in pre:
+        # ICMP
+        return "Established"
+    elif "UNK" in pre:
+        # ICMP6 unknown upper layer
+        return "Established"
+
+
+def get_final_state_from_flags(self, state, pkts) -> str:
+    """
+    Analyze the flags given and return a summary of the state.
+    Should work with Argus and Bro flags
+    We receive the pakets to distinguish some Reset connections
+    """
+    try:
+        if state := check_suricata_states(state):
+            return state
+        if state := check_zeek_states(state):
+            return state
+        if state := check_argus_states(state):
+            return state
+    except IndexError:
+        # suf does not exist, which means that this is some ICMP or
+        # no response was sent for UDP or TCP
+        if state := check_icmp_states(state):
+            return state
+        if state := check_udp_states(state):
+            return state
+        if state := check_tcp_states(state, pkts):
+            return state
+
+        return "Not Established"
+
+    except Exception:
+        exception_line = sys.exc_info()[2].tb_lineno
+        self.print(
+            f"Error in get_final_state_from_flags() " f"line {exception_line}",
+            0,
+            1,
+        )
+        self.print(traceback.format_exc(), 0, 1)
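One payoff of this split is that each interpreter is now testable in isolation, which the single monolithic method never allowed. A small, hedged self-check against the helpers created above (the flag strings are illustrative samples; the expected values follow the code's own comments). Note that get_final_state_from_flags() still takes a stray self parameter at this point, a leftover the next patch removes:

    from slips_files.common.state_handler import (
        check_tcp_states,
        check_udp_states,
        check_zeek_states,
    )

    assert check_zeek_states("SF") == "Established"
    assert check_zeek_states("S0") == "Not Established"
    # Bare RST with 3 packets or fewer looks like failed OS retries
    assert check_tcp_states("RST", 2) == "Not Established"
    assert check_tcp_states("RST", 10) == "Established"
    assert check_udp_states("CON") == "Established"
    assert check_udp_states("INT") == "Not Established"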
From 92316cf2520fa980dcc14d808a1393e7e0968eb5 Mon Sep 17 00:00:00 2001
From: alya
Date: Tue, 30 Jul 2024 14:48:16 +0300
Subject: [PATCH 421/498] state_handler: refactor get_final_state_from_flags()

---
 slips_files/common/state_handler.py | 67 +++++++++++++----------------
 1 file changed, 29 insertions(+), 38 deletions(-)

diff --git a/slips_files/common/state_handler.py b/slips_files/common/state_handler.py
index b671a09a2..d0a05115b 100644
--- a/slips_files/common/state_handler.py
+++ b/slips_files/common/state_handler.py
@@ -1,9 +1,7 @@
 from typing import Optional
-import sys
-import traceback


-def check_suricata_states(state) -> Optional[str]:
+def interpret_suricata_states(state) -> Optional[str]:
     """
     There are different states in which a flow can be.
     Suricata distinguishes three flow-states for TCP and two for
@@ -18,7 +16,7 @@ def check_suricata_states(state) -> Optional[str]:
         return "Not Established"


-def check_zeek_states(state) -> Optional[str]:
+def interpret_zeek_states(state) -> Optional[str]:
     # We have varius type of states depending on the type of flow.
     # For Zeek
     if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
@@ -27,9 +25,13 @@ def check_zeek_states(state) -> Optional[str]:
         return "Established"


-def check_argus_states(state) -> Optional[str]:
+def interpret_argus_states(state) -> Optional[str]:
     pre = state.split("_")[0]
-    suf = state.split("_")[1]
+    try:
+        suf = state.split("_")[1]
+    except IndexError:
+        return
+
     if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
         """
         Examples:
@@ -86,7 +88,7 @@ def check_argus_states(state) -> Optional[str]:
         return "Not Established"


-def check_tcp_states(state, pkts) -> Optional[str]:
+def interpret_tcp_states(state, pkts) -> Optional[str]:
     pre = state.split("_")[0]
     if "EST" in pre:
         # TCP
@@ -122,7 +124,7 @@ def check_tcp_states(state, pkts) -> Optional[str]:
         return "Not Established"


-def check_udp_states(state) -> Optional[str]:
+def interpret_udp_states(state) -> Optional[str]:
     pre = state.split("_")[0]
     if "CON" in pre:
         # UDP
@@ -134,7 +136,7 @@ def check_udp_states(state) -> Optional[str]:
         return "Not Established"


-def check_icmp_states(state) -> Optional[str]:
+def interpret_icmp_states(state) -> Optional[str]:
     pre = state.split("_")[0]
     if "ECO" in pre:
         # ICMP
@@ -144,36 +146,25 @@ def check_icmp_states(state) -> Optional[str]:
         return "Established"


-def get_final_state_from_flags(self, state, pkts) -> str:
+def get_final_state_from_flags(state, pkts) -> str:
     """
-    Analyze the flags given and return a summary of the state.
- Should work with Argus and Bro flags - We receive the pakets to distinguish some Reset connections + Converts the original flags from the flow, to a state that slips + understands + Works with Argus, suricata, and Bro flags + We receive the packets to distinguish some Reset connections """ - try: - if state := check_suricata_states(state): - return state - if state := check_zeek_states(state): - return state - if state := check_argus_states(state): - return state - except IndexError: - # suf does not exist, which means that this is some ICMP or - # no response was sent for UDP or TCP - if state := check_icmp_states(state): - return state - if state := check_udp_states(state): - return state - if state := check_tcp_states(state, pkts): - return state - return "Not Established" + for interpreter in ( + interpret_suricata_states, + interpret_zeek_states, + interpret_argus_states, + interpret_icmp_states, + interpret_udp_states, + ): + if interpreted_state := interpreter(state): + return interpreted_state + + if interpreted_state := interpret_tcp_states(state, pkts): + return interpreted_state - except Exception: - exception_line = sys.exc_info()[2].tb_lineno - self.print( - f"Error in get_final_state_from_flags() " f"line {exception_line}", - 0, - 1, - ) - self.print(traceback.format_exc(), 0, 1) + return "Not Established" From eb778265b8d6f98c27489081a478a2b0ae744da0 Mon Sep 17 00:00:00 2001 From: alya Date: Tue, 30 Jul 2024 14:59:22 +0300 Subject: [PATCH 422/498] delete get_final_state_from_flags() from flowmldetection, profiler, and the db --- modules/flowmldetection/flowmldetection.py | 150 ++---------------- slips_files/core/database/database_manager.py | 3 - 2 files changed, 10 insertions(+), 143 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 1fa77de01..0e7c4b78e 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -121,141 +121,6 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) - - def get_final_state_from_flags(self, state, pkts): - """ - Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags - We receive the pakets to distinguish some Reset connections - """ - try: - pre = state.split("_")[0] - try: - # Try suricata states - """ - There are different states in which a flow can be. - Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, - these are: New, Established and Closed,for UDP only new and established. - For each of these states Suricata can employ different timeouts. - """ - if "new" in state or "established" in state: - return "Established" - elif "closed" in state: - return "Not Established" - - # We have varius type of states depending on the type of flow. 
- # For Zeek - if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): - return "Not Established" - elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): - return "Established" - - # For Argus - suf = state.split("_")[1] - if "S" in pre and "A" in pre and "S" in suf and "A" in suf: - """ - Examples: - SA_SA - SR_SA - FSRA_SA - SPA_SPA - SRA_SPA - FSA_FSA - FSA_FSPA - SAEC_SPA - SRPA_SPA - FSPA_SPA - FSRPA_SPA - FSPA_FSPA - FSRA_FSPA - SRAEC_SPA - FSPA_FSRPA - FSAEC_FSPA - FSRPA_FSPA - SRPAEC_SPA - FSPAEC_FSPA - SRPAEC_FSRPA - """ - return "Established" - elif "PA" in pre and "PA" in suf: - # Tipical flow that was reported in the middle - """ - Examples: - PA_PA - FPA_FPA - """ - return "Established" - elif "ECO" in pre: - return "ICMP Echo" - elif "ECR" in pre: - return "ICMP Reply" - elif "URH" in pre: - return "ICMP Host Unreachable" - elif "URP" in pre: - return "ICMP Port Unreachable" - else: - """ - Examples: - S_RA - S_R - A_R - S_SA - SR_SA - FA_FA - SR_RA - SEC_RA - """ - return "Not Established" - except IndexError: - # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP - if "ECO" in pre: - # ICMP - return "Established" - elif "UNK" in pre: - # ICMP6 unknown upper layer - return "Established" - elif "CON" in pre: - # UDP - return "Established" - elif "INT" in pre: - # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there - # is no confirmation of what happened. - return "Not Established" - elif "EST" in pre: - # TCP - return "Established" - elif "RST" in pre: - # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established - # It can happen that is reseted being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. - return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - elif "FIN" in pre: - # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established - # It can happen that is finished being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. 
- return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - else: - """ - Examples: - S_ - FA_ - PA_ - FSA_ - SEC_ - SRPA_ - """ - return "Not Established" - except Exception: - exception_line = sys.exc_info()[2].tb_lineno - self.print( - f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", - 0, - 1, - ) - self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ @@ -291,12 +156,17 @@ def process_features(self, dataset): except (ValueError, KeyError): pass - # When flows are read from Slips sqlite, the state is not transformed to 'Established' or 'Not Established', it is still 'S0' and others + # When flows are read from Slips sqlite, + # the state is not transformed to 'Established' or + # 'Not Established', it is still 'S0' and others # So transform here - #new_state_column = dataset.state.apply(self.get_final_state_from_flags(dataset.state, dataset.pkts)) - dataset['state'] = dataset.apply(lambda row: self.get_final_state_from_flags(row['state'], row['pkts']), axis=1) - - #dataset.state = new_state_column + dataset["state"] = dataset.apply( + lambda row: get_final_state_from_flags( + row["state"], row["pkts"] + ), + axis=1, + ) + # dataset.state = new_state_column # Convert state to categorical dataset.state = dataset.state.str.replace( diff --git a/slips_files/core/database/database_manager.py b/slips_files/core/database/database_manager.py index 1d339685f..568e78ff4 100644 --- a/slips_files/core/database/database_manager.py +++ b/slips_files/core/database/database_manager.py @@ -613,9 +613,6 @@ def add_out_dns(self, *args, **kwargs): def add_port(self, *args, **kwargs): return self.rdb.add_port(*args, **kwargs) - def get_final_state_from_flags(self, *args, **kwargs): - return self.rdb.get_final_state_from_flags(*args, **kwargs) - def add_ips(self, *args, **kwargs): return self.rdb.add_ips(*args, **kwargs) From 28d2199e094edbaab33620c6cd8c56252d67c0be Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Mon, 29 Jul 2024 16:32:14 +0200 Subject: [PATCH 423/498] mlflow. Add a function to convert the state again --- modules/flowmldetection/flowmldetection.py | 135 +++++++++++++++++++++ 1 file changed, 135 insertions(+) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 0e7c4b78e..19e829e11 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -121,6 +121,141 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) + + def get_final_state_from_flags(self, state, pkts): + """ + Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags + We receive the pakets to distinguish some Reset connections + """ + try: + pre = state.split("_")[0] + try: + # Try suricata states + """ + There are different states in which a flow can be. + Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, + these are: New, Established and Closed,for UDP only new and established. + For each of these states Suricata can employ different timeouts. + """ + if "new" in state or "established" in state: + return "Established" + elif "closed" in state: + return "Not Established" + + # We have varius type of states depending on the type of flow. 
+ # For Zeek + if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): + return "Not Established" + elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): + return "Established" + + # For Argus + suf = state.split("_")[1] + if "S" in pre and "A" in pre and "S" in suf and "A" in suf: + """ + Examples: + SA_SA + SR_SA + FSRA_SA + SPA_SPA + SRA_SPA + FSA_FSA + FSA_FSPA + SAEC_SPA + SRPA_SPA + FSPA_SPA + FSRPA_SPA + FSPA_FSPA + FSRA_FSPA + SRAEC_SPA + FSPA_FSRPA + FSAEC_FSPA + FSRPA_FSPA + SRPAEC_SPA + FSPAEC_FSPA + SRPAEC_FSRPA + """ + return "Established" + elif "PA" in pre and "PA" in suf: + # Tipical flow that was reported in the middle + """ + Examples: + PA_PA + FPA_FPA + """ + return "Established" + elif "ECO" in pre: + return "ICMP Echo" + elif "ECR" in pre: + return "ICMP Reply" + elif "URH" in pre: + return "ICMP Host Unreachable" + elif "URP" in pre: + return "ICMP Port Unreachable" + else: + """ + Examples: + S_RA + S_R + A_R + S_SA + SR_SA + FA_FA + SR_RA + SEC_RA + """ + return "Not Established" + except IndexError: + # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP + if "ECO" in pre: + # ICMP + return "Established" + elif "UNK" in pre: + # ICMP6 unknown upper layer + return "Established" + elif "CON" in pre: + # UDP + return "Established" + elif "INT" in pre: + # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there + # is no confirmation of what happened. + return "Not Established" + elif "EST" in pre: + # TCP + return "Established" + elif "RST" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established + # It can happen that is reseted being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. + return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + elif "FIN" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established + # It can happen that is finished being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. 
+ return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + else: + """ + Examples: + S_ + FA_ + PA_ + FSA_ + SEC_ + SRPA_ + """ + return "Not Established" + except Exception: + exception_line = sys.exc_info()[2].tb_lineno + self.print( + f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", + 0, + 1, + ) + self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ From cbe80f8e80d05d147a1e54544f01ee4b2ab18cab Mon Sep 17 00:00:00 2001 From: alya Date: Tue, 30 Jul 2024 14:59:22 +0300 Subject: [PATCH 424/498] delete get_final_state_from_flags() from flowmldetection, profiler, and the db --- modules/flowmldetection/flowmldetection.py | 135 --------------------- 1 file changed, 135 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 19e829e11..0e7c4b78e 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -121,141 +121,6 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) - - def get_final_state_from_flags(self, state, pkts): - """ - Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags - We receive the pakets to distinguish some Reset connections - """ - try: - pre = state.split("_")[0] - try: - # Try suricata states - """ - There are different states in which a flow can be. - Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, - these are: New, Established and Closed,for UDP only new and established. - For each of these states Suricata can employ different timeouts. - """ - if "new" in state or "established" in state: - return "Established" - elif "closed" in state: - return "Not Established" - - # We have varius type of states depending on the type of flow. - # For Zeek - if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): - return "Not Established" - elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): - return "Established" - - # For Argus - suf = state.split("_")[1] - if "S" in pre and "A" in pre and "S" in suf and "A" in suf: - """ - Examples: - SA_SA - SR_SA - FSRA_SA - SPA_SPA - SRA_SPA - FSA_FSA - FSA_FSPA - SAEC_SPA - SRPA_SPA - FSPA_SPA - FSRPA_SPA - FSPA_FSPA - FSRA_FSPA - SRAEC_SPA - FSPA_FSRPA - FSAEC_FSPA - FSRPA_FSPA - SRPAEC_SPA - FSPAEC_FSPA - SRPAEC_FSRPA - """ - return "Established" - elif "PA" in pre and "PA" in suf: - # Tipical flow that was reported in the middle - """ - Examples: - PA_PA - FPA_FPA - """ - return "Established" - elif "ECO" in pre: - return "ICMP Echo" - elif "ECR" in pre: - return "ICMP Reply" - elif "URH" in pre: - return "ICMP Host Unreachable" - elif "URP" in pre: - return "ICMP Port Unreachable" - else: - """ - Examples: - S_RA - S_R - A_R - S_SA - SR_SA - FA_FA - SR_RA - SEC_RA - """ - return "Not Established" - except IndexError: - # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP - if "ECO" in pre: - # ICMP - return "Established" - elif "UNK" in pre: - # ICMP6 unknown upper layer - return "Established" - elif "CON" in pre: - # UDP - return "Established" - elif "INT" in pre: - # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there - # is no confirmation of what happened. - return "Not Established" - elif "EST" in pre: - # TCP - return "Established" - elif "RST" in pre: - # TCP. 
When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established - # It can happen that is reseted being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. - return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - elif "FIN" in pre: - # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established - # It can happen that is finished being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. - return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - else: - """ - Examples: - S_ - FA_ - PA_ - FSA_ - SEC_ - SRPA_ - """ - return "Not Established" - except Exception: - exception_line = sys.exc_info()[2].tb_lineno - self.print( - f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", - 0, - 1, - ) - self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ From aa68a909bb8309e70b15ca70958076a368dbe0c7 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Mon, 29 Jul 2024 16:32:14 +0200 Subject: [PATCH 425/498] mlflow. Add a function to convert the state again --- modules/flowmldetection/flowmldetection.py | 135 +++++++++++++++++++++ 1 file changed, 135 insertions(+) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 0e7c4b78e..19e829e11 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -121,6 +121,141 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) + + def get_final_state_from_flags(self, state, pkts): + """ + Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags + We receive the pakets to distinguish some Reset connections + """ + try: + pre = state.split("_")[0] + try: + # Try suricata states + """ + There are different states in which a flow can be. + Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, + these are: New, Established and Closed,for UDP only new and established. + For each of these states Suricata can employ different timeouts. + """ + if "new" in state or "established" in state: + return "Established" + elif "closed" in state: + return "Not Established" + + # We have varius type of states depending on the type of flow. 
+ # For Zeek + if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): + return "Not Established" + elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): + return "Established" + + # For Argus + suf = state.split("_")[1] + if "S" in pre and "A" in pre and "S" in suf and "A" in suf: + """ + Examples: + SA_SA + SR_SA + FSRA_SA + SPA_SPA + SRA_SPA + FSA_FSA + FSA_FSPA + SAEC_SPA + SRPA_SPA + FSPA_SPA + FSRPA_SPA + FSPA_FSPA + FSRA_FSPA + SRAEC_SPA + FSPA_FSRPA + FSAEC_FSPA + FSRPA_FSPA + SRPAEC_SPA + FSPAEC_FSPA + SRPAEC_FSRPA + """ + return "Established" + elif "PA" in pre and "PA" in suf: + # Tipical flow that was reported in the middle + """ + Examples: + PA_PA + FPA_FPA + """ + return "Established" + elif "ECO" in pre: + return "ICMP Echo" + elif "ECR" in pre: + return "ICMP Reply" + elif "URH" in pre: + return "ICMP Host Unreachable" + elif "URP" in pre: + return "ICMP Port Unreachable" + else: + """ + Examples: + S_RA + S_R + A_R + S_SA + SR_SA + FA_FA + SR_RA + SEC_RA + """ + return "Not Established" + except IndexError: + # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP + if "ECO" in pre: + # ICMP + return "Established" + elif "UNK" in pre: + # ICMP6 unknown upper layer + return "Established" + elif "CON" in pre: + # UDP + return "Established" + elif "INT" in pre: + # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there + # is no confirmation of what happened. + return "Not Established" + elif "EST" in pre: + # TCP + return "Established" + elif "RST" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established + # It can happen that is reseted being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. + return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + elif "FIN" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established + # It can happen that is finished being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. 
+ return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + else: + """ + Examples: + S_ + FA_ + PA_ + FSA_ + SEC_ + SRPA_ + """ + return "Not Established" + except Exception: + exception_line = sys.exc_info()[2].tb_lineno + self.print( + f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", + 0, + 1, + ) + self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ From aee1e13912d8bf414b5f924e6684187b7c114a68 Mon Sep 17 00:00:00 2001 From: alya Date: Tue, 30 Jul 2024 14:59:22 +0300 Subject: [PATCH 426/498] delete get_final_state_from_flags() from flowmldetection, profiler, and the db --- modules/flowmldetection/flowmldetection.py | 135 --------------------- 1 file changed, 135 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 19e829e11..0e7c4b78e 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -121,141 +121,6 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) - - def get_final_state_from_flags(self, state, pkts): - """ - Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags - We receive the pakets to distinguish some Reset connections - """ - try: - pre = state.split("_")[0] - try: - # Try suricata states - """ - There are different states in which a flow can be. - Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, - these are: New, Established and Closed,for UDP only new and established. - For each of these states Suricata can employ different timeouts. - """ - if "new" in state or "established" in state: - return "Established" - elif "closed" in state: - return "Not Established" - - # We have varius type of states depending on the type of flow. - # For Zeek - if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): - return "Not Established" - elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): - return "Established" - - # For Argus - suf = state.split("_")[1] - if "S" in pre and "A" in pre and "S" in suf and "A" in suf: - """ - Examples: - SA_SA - SR_SA - FSRA_SA - SPA_SPA - SRA_SPA - FSA_FSA - FSA_FSPA - SAEC_SPA - SRPA_SPA - FSPA_SPA - FSRPA_SPA - FSPA_FSPA - FSRA_FSPA - SRAEC_SPA - FSPA_FSRPA - FSAEC_FSPA - FSRPA_FSPA - SRPAEC_SPA - FSPAEC_FSPA - SRPAEC_FSRPA - """ - return "Established" - elif "PA" in pre and "PA" in suf: - # Tipical flow that was reported in the middle - """ - Examples: - PA_PA - FPA_FPA - """ - return "Established" - elif "ECO" in pre: - return "ICMP Echo" - elif "ECR" in pre: - return "ICMP Reply" - elif "URH" in pre: - return "ICMP Host Unreachable" - elif "URP" in pre: - return "ICMP Port Unreachable" - else: - """ - Examples: - S_RA - S_R - A_R - S_SA - SR_SA - FA_FA - SR_RA - SEC_RA - """ - return "Not Established" - except IndexError: - # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP - if "ECO" in pre: - # ICMP - return "Established" - elif "UNK" in pre: - # ICMP6 unknown upper layer - return "Established" - elif "CON" in pre: - # UDP - return "Established" - elif "INT" in pre: - # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there - # is no confirmation of what happened. - return "Not Established" - elif "EST" in pre: - # TCP - return "Established" - elif "RST" in pre: - # TCP. 
When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established - # It can happen that is reseted being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. - return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - elif "FIN" in pre: - # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established - # It can happen that is finished being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. - return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - else: - """ - Examples: - S_ - FA_ - PA_ - FSA_ - SEC_ - SRPA_ - """ - return "Not Established" - except Exception: - exception_line = sys.exc_info()[2].tb_lineno - self.print( - f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", - 0, - 1, - ) - self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ From fc14125fe16615de2e29e40fc98e215bd4648bbd Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Mon, 29 Jul 2024 16:32:14 +0200 Subject: [PATCH 427/498] mlflow. Add a function to convert the state again --- modules/flowmldetection/flowmldetection.py | 135 +++++++++++++++++++++ 1 file changed, 135 insertions(+) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 0e7c4b78e..19e829e11 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -121,6 +121,141 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) + + def get_final_state_from_flags(self, state, pkts): + """ + Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags + We receive the pakets to distinguish some Reset connections + """ + try: + pre = state.split("_")[0] + try: + # Try suricata states + """ + There are different states in which a flow can be. + Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, + these are: New, Established and Closed,for UDP only new and established. + For each of these states Suricata can employ different timeouts. + """ + if "new" in state or "established" in state: + return "Established" + elif "closed" in state: + return "Not Established" + + # We have varius type of states depending on the type of flow. 
+ # For Zeek + if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): + return "Not Established" + elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): + return "Established" + + # For Argus + suf = state.split("_")[1] + if "S" in pre and "A" in pre and "S" in suf and "A" in suf: + """ + Examples: + SA_SA + SR_SA + FSRA_SA + SPA_SPA + SRA_SPA + FSA_FSA + FSA_FSPA + SAEC_SPA + SRPA_SPA + FSPA_SPA + FSRPA_SPA + FSPA_FSPA + FSRA_FSPA + SRAEC_SPA + FSPA_FSRPA + FSAEC_FSPA + FSRPA_FSPA + SRPAEC_SPA + FSPAEC_FSPA + SRPAEC_FSRPA + """ + return "Established" + elif "PA" in pre and "PA" in suf: + # Tipical flow that was reported in the middle + """ + Examples: + PA_PA + FPA_FPA + """ + return "Established" + elif "ECO" in pre: + return "ICMP Echo" + elif "ECR" in pre: + return "ICMP Reply" + elif "URH" in pre: + return "ICMP Host Unreachable" + elif "URP" in pre: + return "ICMP Port Unreachable" + else: + """ + Examples: + S_RA + S_R + A_R + S_SA + SR_SA + FA_FA + SR_RA + SEC_RA + """ + return "Not Established" + except IndexError: + # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP + if "ECO" in pre: + # ICMP + return "Established" + elif "UNK" in pre: + # ICMP6 unknown upper layer + return "Established" + elif "CON" in pre: + # UDP + return "Established" + elif "INT" in pre: + # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there + # is no confirmation of what happened. + return "Not Established" + elif "EST" in pre: + # TCP + return "Established" + elif "RST" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established + # It can happen that is reseted being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. + return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + elif "FIN" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established + # It can happen that is finished being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. 
+ return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + else: + """ + Examples: + S_ + FA_ + PA_ + FSA_ + SEC_ + SRPA_ + """ + return "Not Established" + except Exception: + exception_line = sys.exc_info()[2].tb_lineno + self.print( + f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", + 0, + 1, + ) + self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ From 9c95c76b54f429f9eaf2c8035d60b98f5bf8dffe Mon Sep 17 00:00:00 2001 From: alya Date: Tue, 30 Jul 2024 14:59:22 +0300 Subject: [PATCH 428/498] delete get_final_state_from_flags() from flowmldetection, profiler, and the db --- modules/flowmldetection/flowmldetection.py | 135 --------------------- 1 file changed, 135 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 19e829e11..0e7c4b78e 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -121,141 +121,6 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) - - def get_final_state_from_flags(self, state, pkts): - """ - Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags - We receive the pakets to distinguish some Reset connections - """ - try: - pre = state.split("_")[0] - try: - # Try suricata states - """ - There are different states in which a flow can be. - Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, - these are: New, Established and Closed,for UDP only new and established. - For each of these states Suricata can employ different timeouts. - """ - if "new" in state or "established" in state: - return "Established" - elif "closed" in state: - return "Not Established" - - # We have varius type of states depending on the type of flow. - # For Zeek - if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): - return "Not Established" - elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): - return "Established" - - # For Argus - suf = state.split("_")[1] - if "S" in pre and "A" in pre and "S" in suf and "A" in suf: - """ - Examples: - SA_SA - SR_SA - FSRA_SA - SPA_SPA - SRA_SPA - FSA_FSA - FSA_FSPA - SAEC_SPA - SRPA_SPA - FSPA_SPA - FSRPA_SPA - FSPA_FSPA - FSRA_FSPA - SRAEC_SPA - FSPA_FSRPA - FSAEC_FSPA - FSRPA_FSPA - SRPAEC_SPA - FSPAEC_FSPA - SRPAEC_FSRPA - """ - return "Established" - elif "PA" in pre and "PA" in suf: - # Tipical flow that was reported in the middle - """ - Examples: - PA_PA - FPA_FPA - """ - return "Established" - elif "ECO" in pre: - return "ICMP Echo" - elif "ECR" in pre: - return "ICMP Reply" - elif "URH" in pre: - return "ICMP Host Unreachable" - elif "URP" in pre: - return "ICMP Port Unreachable" - else: - """ - Examples: - S_RA - S_R - A_R - S_SA - SR_SA - FA_FA - SR_RA - SEC_RA - """ - return "Not Established" - except IndexError: - # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP - if "ECO" in pre: - # ICMP - return "Established" - elif "UNK" in pre: - # ICMP6 unknown upper layer - return "Established" - elif "CON" in pre: - # UDP - return "Established" - elif "INT" in pre: - # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there - # is no confirmation of what happened. - return "Not Established" - elif "EST" in pre: - # TCP - return "Established" - elif "RST" in pre: - # TCP. 
When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established - # It can happen that is reseted being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. - return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - elif "FIN" in pre: - # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established - # It can happen that is finished being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. - return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - else: - """ - Examples: - S_ - FA_ - PA_ - FSA_ - SEC_ - SRPA_ - """ - return "Not Established" - except Exception: - exception_line = sys.exc_info()[2].tb_lineno - self.print( - f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", - 0, - 1, - ) - self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ From 1b20f2ab937725762ca307dee70a3cb517d8d579 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Mon, 29 Jul 2024 16:32:14 +0200 Subject: [PATCH 429/498] mlflow. Add a function to convert the state again --- modules/flowmldetection/flowmldetection.py | 165 +++++++++++++++++++-- 1 file changed, 150 insertions(+), 15 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 0e7c4b78e..c8226368c 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -8,6 +8,7 @@ import pickle import pandas as pd import json +import datetime import traceback import warnings import sys @@ -121,6 +122,141 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) + + def get_final_state_from_flags(self, state, pkts): + """ + Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags + We receive the pakets to distinguish some Reset connections + """ + try: + pre = state.split("_")[0] + try: + # Try suricata states + """ + There are different states in which a flow can be. + Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, + these are: New, Established and Closed,for UDP only new and established. + For each of these states Suricata can employ different timeouts. + """ + if "new" in state or "established" in state: + return "Established" + elif "closed" in state: + return "Not Established" + + # We have varius type of states depending on the type of flow. 
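The Argus branch splits a composite state such as "FSPA_FSPA" into the flags sent (prefix) and the flags received (suffix). A hedged sketch of that parsing, with the ICMP mappings omitted for brevity; summarize_argus is a made-up name, not Slips code:

from typing import Optional

def summarize_argus(state: str) -> Optional[str]:
    pre, sep, suf = state.partition("_")
    if not sep:
        return None  # single-word state, handled by the IndexError branch
    if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
        return "Established"  # SYN and ACK seen in both directions
    if "PA" in pre and "PA" in suf:
        return "Established"  # PSH+ACK both ways: flow reported mid-stream
    return "Not Established"

assert summarize_argus("FSPA_FSPA") == "Established"
assert summarize_argus("S_RA") == "Not Established"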
+ # For Zeek + if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): + return "Not Established" + elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): + return "Established" + + # For Argus + suf = state.split("_")[1] + if "S" in pre and "A" in pre and "S" in suf and "A" in suf: + """ + Examples: + SA_SA + SR_SA + FSRA_SA + SPA_SPA + SRA_SPA + FSA_FSA + FSA_FSPA + SAEC_SPA + SRPA_SPA + FSPA_SPA + FSRPA_SPA + FSPA_FSPA + FSRA_FSPA + SRAEC_SPA + FSPA_FSRPA + FSAEC_FSPA + FSRPA_FSPA + SRPAEC_SPA + FSPAEC_FSPA + SRPAEC_FSRPA + """ + return "Established" + elif "PA" in pre and "PA" in suf: + # Tipical flow that was reported in the middle + """ + Examples: + PA_PA + FPA_FPA + """ + return "Established" + elif "ECO" in pre: + return "ICMP Echo" + elif "ECR" in pre: + return "ICMP Reply" + elif "URH" in pre: + return "ICMP Host Unreachable" + elif "URP" in pre: + return "ICMP Port Unreachable" + else: + """ + Examples: + S_RA + S_R + A_R + S_SA + SR_SA + FA_FA + SR_RA + SEC_RA + """ + return "Not Established" + except IndexError: + # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP + if "ECO" in pre: + # ICMP + return "Established" + elif "UNK" in pre: + # ICMP6 unknown upper layer + return "Established" + elif "CON" in pre: + # UDP + return "Established" + elif "INT" in pre: + # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there + # is no confirmation of what happened. + return "Not Established" + elif "EST" in pre: + # TCP + return "Established" + elif "RST" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established + # It can happen that is reseted being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. + return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + elif "FIN" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established + # It can happen that is finished being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. 
+ return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + else: + """ + Examples: + S_ + FA_ + PA_ + FSA_ + SEC_ + SRPA_ + """ + return "Not Established" + except Exception: + exception_line = sys.exc_info()[2].tb_lineno + self.print( + f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", + 0, + 1, + ) + self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ @@ -133,7 +269,7 @@ def process_features(self, dataset): for proto in to_discard: dataset = dataset[dataset.proto != proto] - # For now, discard the ports + # For now, discard these to_drop = [ "appproto", "daddr", @@ -156,17 +292,12 @@ def process_features(self, dataset): except (ValueError, KeyError): pass - # When flows are read from Slips sqlite, - # the state is not transformed to 'Established' or - # 'Not Established', it is still 'S0' and others + # When flows are read from Slips sqlite, the state is not transformed to 'Established' or 'Not Established', it is still 'S0' and others # So transform here - dataset["state"] = dataset.apply( - lambda row: get_final_state_from_flags( - row["state"], row["pkts"] - ), - axis=1, - ) - # dataset.state = new_state_column + #new_state_column = dataset.state.apply(self.get_final_state_from_flags(dataset.state, dataset.pkts)) + dataset['state'] = dataset.apply(lambda row: self.get_final_state_from_flags(row['state'], row['pkts']), axis=1) + + #dataset.state = new_state_column # Convert state to categorical dataset.state = dataset.state.str.replace( @@ -393,21 +524,25 @@ def read_model(self): def set_evidence_malicious_flow(self, flow: dict, twid: str): confidence: float = 0.1 description = ( - f"Flow with malicious characteristics by ML. Src IP" + f"Malicious flow by ML. Src IP" f" {flow['saddr']}:{flow['sport']} to " f"{flow['daddr']}:{flow['dport']}" ) + + timestamp = utils.convert_format( + datetime.datetime.now(), utils.alerts_format + ) twid_number = int(twid.replace("timewindow", "")) evidence: Evidence = Evidence( evidence_type=EvidenceType.MALICIOUS_FLOW, attacker=Attacker( direction=Direction.SRC, - ioc_type=IoCType.IP, + attacker_type=IoCType.IP, value=flow["saddr"], ), victim=Victim( direction=Direction.DST, - ioc_type=IoCType.IP, + victim_type=IoCType.IP, value=flow["daddr"], ), threat_level=ThreatLevel.LOW, @@ -416,7 +551,7 @@ def set_evidence_malicious_flow(self, flow: dict, twid: str): profile=ProfileID(ip=flow["saddr"]), timewindow=TimeWindow(twid_number), uid=[flow["uid"]], - timestamp=flow["starttime"], + timestamp=timestamp, method=Method.AI, src_port=flow["sport"], dst_port=flow["dport"], From 2b9ed84a6a2bdbe9a2ec8a109da92df4d627b994 Mon Sep 17 00:00:00 2001 From: alya Date: Tue, 30 Jul 2024 14:35:21 +0300 Subject: [PATCH 430/498] state_handler: split get_final_state_from_flags() into smaller functions --- slips_files/common/state_handler.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/slips_files/common/state_handler.py b/slips_files/common/state_handler.py index d0a05115b..43d9b5461 100644 --- a/slips_files/common/state_handler.py +++ b/slips_files/common/state_handler.py @@ -1,4 +1,6 @@ from typing import Optional +import sys +import traceback def interpret_suricata_states(state) -> Optional[str]: From 736cf0b76411e510c34b586f644895cbf9250e75 Mon Sep 17 00:00:00 2001 From: alya Date: Tue, 30 Jul 2024 14:48:16 +0300 Subject: [PATCH 431/498] state_handler: refactor get_final_state_from_flags() --- slips_files/common/state_handler.py | 2 -- 1 file changed, 2 deletions(-) diff --git 
a/slips_files/common/state_handler.py b/slips_files/common/state_handler.py index 43d9b5461..d0a05115b 100644 --- a/slips_files/common/state_handler.py +++ b/slips_files/common/state_handler.py @@ -1,6 +1,4 @@ from typing import Optional -import sys -import traceback def interpret_suricata_states(state) -> Optional[str]: From 2b576c42258e49f2bdcc008964e04e35b7aeb972 Mon Sep 17 00:00:00 2001 From: alya Date: Tue, 30 Jul 2024 14:59:22 +0300 Subject: [PATCH 432/498] delete get_final_state_from_flags() from flowmldetection, profiler, and the db --- modules/flowmldetection/flowmldetection.py | 150 ++------------------- 1 file changed, 10 insertions(+), 140 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index c8226368c..9af514a70 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -122,141 +122,6 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) - - def get_final_state_from_flags(self, state, pkts): - """ - Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags - We receive the pakets to distinguish some Reset connections - """ - try: - pre = state.split("_")[0] - try: - # Try suricata states - """ - There are different states in which a flow can be. - Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, - these are: New, Established and Closed,for UDP only new and established. - For each of these states Suricata can employ different timeouts. - """ - if "new" in state or "established" in state: - return "Established" - elif "closed" in state: - return "Not Established" - - # We have varius type of states depending on the type of flow. - # For Zeek - if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): - return "Not Established" - elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): - return "Established" - - # For Argus - suf = state.split("_")[1] - if "S" in pre and "A" in pre and "S" in suf and "A" in suf: - """ - Examples: - SA_SA - SR_SA - FSRA_SA - SPA_SPA - SRA_SPA - FSA_FSA - FSA_FSPA - SAEC_SPA - SRPA_SPA - FSPA_SPA - FSRPA_SPA - FSPA_FSPA - FSRA_FSPA - SRAEC_SPA - FSPA_FSRPA - FSAEC_FSPA - FSRPA_FSPA - SRPAEC_SPA - FSPAEC_FSPA - SRPAEC_FSRPA - """ - return "Established" - elif "PA" in pre and "PA" in suf: - # Tipical flow that was reported in the middle - """ - Examples: - PA_PA - FPA_FPA - """ - return "Established" - elif "ECO" in pre: - return "ICMP Echo" - elif "ECR" in pre: - return "ICMP Reply" - elif "URH" in pre: - return "ICMP Host Unreachable" - elif "URP" in pre: - return "ICMP Port Unreachable" - else: - """ - Examples: - S_RA - S_R - A_R - S_SA - SR_SA - FA_FA - SR_RA - SEC_RA - """ - return "Not Established" - except IndexError: - # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP - if "ECO" in pre: - # ICMP - return "Established" - elif "UNK" in pre: - # ICMP6 unknown upper layer - return "Established" - elif "CON" in pre: - # UDP - return "Established" - elif "INT" in pre: - # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there - # is no confirmation of what happened. - return "Not Established" - elif "EST" in pre: - # TCP - return "Established" - elif "RST" in pre: - # TCP. When -z B is not used in argus, states are single words. 
Most connections are reseted when finished and therefore are established - # It can happen that is reseted being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. - return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - elif "FIN" in pre: - # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established - # It can happen that is finished being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. - return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - else: - """ - Examples: - S_ - FA_ - PA_ - FSA_ - SEC_ - SRPA_ - """ - return "Not Established" - except Exception: - exception_line = sys.exc_info()[2].tb_lineno - self.print( - f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", - 0, - 1, - ) - self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ @@ -292,12 +157,17 @@ def process_features(self, dataset): except (ValueError, KeyError): pass - # When flows are read from Slips sqlite, the state is not transformed to 'Established' or 'Not Established', it is still 'S0' and others + # When flows are read from Slips sqlite, + # the state is not transformed to 'Established' or + # 'Not Established', it is still 'S0' and others # So transform here - #new_state_column = dataset.state.apply(self.get_final_state_from_flags(dataset.state, dataset.pkts)) - dataset['state'] = dataset.apply(lambda row: self.get_final_state_from_flags(row['state'], row['pkts']), axis=1) - - #dataset.state = new_state_column + dataset["state"] = dataset.apply( + lambda row: get_final_state_from_flags( + row["state"], row["pkts"] + ), + axis=1, + ) + # dataset.state = new_state_column # Convert state to categorical dataset.state = dataset.state.str.replace( From 47d05a060ed6f78fb47892d9756998e775e05b94 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Mon, 29 Jul 2024 16:32:14 +0200 Subject: [PATCH 433/498] mlflow. Add a function to convert the state again --- modules/flowmldetection/flowmldetection.py | 135 +++++++++++++++++++++ 1 file changed, 135 insertions(+) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 9af514a70..94eb27afd 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -122,6 +122,141 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) + + def get_final_state_from_flags(self, state, pkts): + """ + Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags + We receive the pakets to distinguish some Reset connections + """ + try: + pre = state.split("_")[0] + try: + # Try suricata states + """ + There are different states in which a flow can be. + Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, + these are: New, Established and Closed,for UDP only new and established. + For each of these states Suricata can employ different timeouts. + """ + if "new" in state or "established" in state: + return "Established" + elif "closed" in state: + return "Not Established" + + # We have varius type of states depending on the type of flow. 
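The single-word RST/FIN states fall back to a packet-count heuristic. A minimal sketch, assuming pkts may arrive as a string from the flow dict; rst_fin_heuristic is an illustrative name:

# Three packets or fewer look like the OS retrying an unanswered SYN,
# so the flow is treated as not established.
def rst_fin_heuristic(pkts) -> str:
    return "Not Established" if int(pkts) <= 3 else "Established"

assert rst_fin_heuristic(3) == "Not Established"
assert rst_fin_heuristic("84") == "Established"  # counts may arrive as strings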
+ # For Zeek + if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): + return "Not Established" + elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): + return "Established" + + # For Argus + suf = state.split("_")[1] + if "S" in pre and "A" in pre and "S" in suf and "A" in suf: + """ + Examples: + SA_SA + SR_SA + FSRA_SA + SPA_SPA + SRA_SPA + FSA_FSA + FSA_FSPA + SAEC_SPA + SRPA_SPA + FSPA_SPA + FSRPA_SPA + FSPA_FSPA + FSRA_FSPA + SRAEC_SPA + FSPA_FSRPA + FSAEC_FSPA + FSRPA_FSPA + SRPAEC_SPA + FSPAEC_FSPA + SRPAEC_FSRPA + """ + return "Established" + elif "PA" in pre and "PA" in suf: + # Tipical flow that was reported in the middle + """ + Examples: + PA_PA + FPA_FPA + """ + return "Established" + elif "ECO" in pre: + return "ICMP Echo" + elif "ECR" in pre: + return "ICMP Reply" + elif "URH" in pre: + return "ICMP Host Unreachable" + elif "URP" in pre: + return "ICMP Port Unreachable" + else: + """ + Examples: + S_RA + S_R + A_R + S_SA + SR_SA + FA_FA + SR_RA + SEC_RA + """ + return "Not Established" + except IndexError: + # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP + if "ECO" in pre: + # ICMP + return "Established" + elif "UNK" in pre: + # ICMP6 unknown upper layer + return "Established" + elif "CON" in pre: + # UDP + return "Established" + elif "INT" in pre: + # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there + # is no confirmation of what happened. + return "Not Established" + elif "EST" in pre: + # TCP + return "Established" + elif "RST" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established + # It can happen that is reseted being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. + return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + elif "FIN" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established + # It can happen that is finished being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. 
+ return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + else: + """ + Examples: + S_ + FA_ + PA_ + FSA_ + SEC_ + SRPA_ + """ + return "Not Established" + except Exception: + exception_line = sys.exc_info()[2].tb_lineno + self.print( + f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", + 0, + 1, + ) + self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ From e197df04e3e44f4318289706ede7a3483ec7feb2 Mon Sep 17 00:00:00 2001 From: alya Date: Tue, 30 Jul 2024 14:59:22 +0300 Subject: [PATCH 434/498] delete get_final_state_from_flags() from flowmldetection, profiler, and the db --- modules/flowmldetection/flowmldetection.py | 135 --------------------- 1 file changed, 135 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 94eb27afd..9af514a70 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -122,141 +122,6 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) - - def get_final_state_from_flags(self, state, pkts): - """ - Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags - We receive the pakets to distinguish some Reset connections - """ - try: - pre = state.split("_")[0] - try: - # Try suricata states - """ - There are different states in which a flow can be. - Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, - these are: New, Established and Closed,for UDP only new and established. - For each of these states Suricata can employ different timeouts. - """ - if "new" in state or "established" in state: - return "Established" - elif "closed" in state: - return "Not Established" - - # We have varius type of states depending on the type of flow. - # For Zeek - if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): - return "Not Established" - elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): - return "Established" - - # For Argus - suf = state.split("_")[1] - if "S" in pre and "A" in pre and "S" in suf and "A" in suf: - """ - Examples: - SA_SA - SR_SA - FSRA_SA - SPA_SPA - SRA_SPA - FSA_FSA - FSA_FSPA - SAEC_SPA - SRPA_SPA - FSPA_SPA - FSRPA_SPA - FSPA_FSPA - FSRA_FSPA - SRAEC_SPA - FSPA_FSRPA - FSAEC_FSPA - FSRPA_FSPA - SRPAEC_SPA - FSPAEC_FSPA - SRPAEC_FSRPA - """ - return "Established" - elif "PA" in pre and "PA" in suf: - # Tipical flow that was reported in the middle - """ - Examples: - PA_PA - FPA_FPA - """ - return "Established" - elif "ECO" in pre: - return "ICMP Echo" - elif "ECR" in pre: - return "ICMP Reply" - elif "URH" in pre: - return "ICMP Host Unreachable" - elif "URP" in pre: - return "ICMP Port Unreachable" - else: - """ - Examples: - S_RA - S_R - A_R - S_SA - SR_SA - FA_FA - SR_RA - SEC_RA - """ - return "Not Established" - except IndexError: - # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP - if "ECO" in pre: - # ICMP - return "Established" - elif "UNK" in pre: - # ICMP6 unknown upper layer - return "Established" - elif "CON" in pre: - # UDP - return "Established" - elif "INT" in pre: - # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there - # is no confirmation of what happened. - return "Not Established" - elif "EST" in pre: - # TCP - return "Established" - elif "RST" in pre: - # TCP. 
When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established - # It can happen that is reseted being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. - return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - elif "FIN" in pre: - # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established - # It can happen that is finished being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. - return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - else: - """ - Examples: - S_ - FA_ - PA_ - FSA_ - SEC_ - SRPA_ - """ - return "Not Established" - except Exception: - exception_line = sys.exc_info()[2].tb_lineno - self.print( - f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", - 0, - 1, - ) - self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ From d95f4c938e6fdf0ca5bf7ccd607cfb71e2a34c34 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Mon, 29 Jul 2024 16:32:14 +0200 Subject: [PATCH 435/498] mlflow. Add a function to convert the state again --- modules/flowmldetection/flowmldetection.py | 150 +++++++++++++++++++-- 1 file changed, 140 insertions(+), 10 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 9af514a70..c8226368c 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -122,6 +122,141 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) + + def get_final_state_from_flags(self, state, pkts): + """ + Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags + We receive the pakets to distinguish some Reset connections + """ + try: + pre = state.split("_")[0] + try: + # Try suricata states + """ + There are different states in which a flow can be. + Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, + these are: New, Established and Closed,for UDP only new and established. + For each of these states Suricata can employ different timeouts. + """ + if "new" in state or "established" in state: + return "Established" + elif "closed" in state: + return "Not Established" + + # We have varius type of states depending on the type of flow. 
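Around this function, the module pairs a StandardScaler with a hinge-loss SGDClassifier for training and detection. A hedged sketch of that pairing with invented feature values; this is not the module's actual training code:

import numpy as np
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler

# Fit a scaler and a hinge-loss SGD classifier on labeled flows, then scale
# each new flow with the same scaler before predicting.
X = np.array([[80.0, 1.0, 10.0], [443.0, 1.0, 2000.0], [22.0, 0.0, 3.0]])
y = np.array(["Normal", "Normal", "Malware"])

scaler = StandardScaler().fit(X)
clf = SGDClassifier(warm_start=True, loss="hinge", penalty="l1")
clf.fit(scaler.transform(X), y)

flow = np.array([[23.0, 0.0, 4.0]])
print(clf.predict(scaler.transform(flow))[0])  # label depends on the toy data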
+ # For Zeek + if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): + return "Not Established" + elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): + return "Established" + + # For Argus + suf = state.split("_")[1] + if "S" in pre and "A" in pre and "S" in suf and "A" in suf: + """ + Examples: + SA_SA + SR_SA + FSRA_SA + SPA_SPA + SRA_SPA + FSA_FSA + FSA_FSPA + SAEC_SPA + SRPA_SPA + FSPA_SPA + FSRPA_SPA + FSPA_FSPA + FSRA_FSPA + SRAEC_SPA + FSPA_FSRPA + FSAEC_FSPA + FSRPA_FSPA + SRPAEC_SPA + FSPAEC_FSPA + SRPAEC_FSRPA + """ + return "Established" + elif "PA" in pre and "PA" in suf: + # Tipical flow that was reported in the middle + """ + Examples: + PA_PA + FPA_FPA + """ + return "Established" + elif "ECO" in pre: + return "ICMP Echo" + elif "ECR" in pre: + return "ICMP Reply" + elif "URH" in pre: + return "ICMP Host Unreachable" + elif "URP" in pre: + return "ICMP Port Unreachable" + else: + """ + Examples: + S_RA + S_R + A_R + S_SA + SR_SA + FA_FA + SR_RA + SEC_RA + """ + return "Not Established" + except IndexError: + # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP + if "ECO" in pre: + # ICMP + return "Established" + elif "UNK" in pre: + # ICMP6 unknown upper layer + return "Established" + elif "CON" in pre: + # UDP + return "Established" + elif "INT" in pre: + # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there + # is no confirmation of what happened. + return "Not Established" + elif "EST" in pre: + # TCP + return "Established" + elif "RST" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established + # It can happen that is reseted being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. + return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + elif "FIN" in pre: + # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established + # It can happen that is finished being not established, but we can't tell without -z b. + # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. 
+ return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + else: + """ + Examples: + S_ + FA_ + PA_ + FSA_ + SEC_ + SRPA_ + """ + return "Not Established" + except Exception: + exception_line = sys.exc_info()[2].tb_lineno + self.print( + f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", + 0, + 1, + ) + self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ @@ -157,17 +292,12 @@ def process_features(self, dataset): except (ValueError, KeyError): pass - # When flows are read from Slips sqlite, - # the state is not transformed to 'Established' or - # 'Not Established', it is still 'S0' and others + # When flows are read from Slips sqlite, the state is not transformed to 'Established' or 'Not Established', it is still 'S0' and others # So transform here - dataset["state"] = dataset.apply( - lambda row: get_final_state_from_flags( - row["state"], row["pkts"] - ), - axis=1, - ) - # dataset.state = new_state_column + #new_state_column = dataset.state.apply(self.get_final_state_from_flags(dataset.state, dataset.pkts)) + dataset['state'] = dataset.apply(lambda row: self.get_final_state_from_flags(row['state'], row['pkts']), axis=1) + + #dataset.state = new_state_column # Convert state to categorical dataset.state = dataset.state.str.replace( From c9d2395cd1bfd3f19b1ec80bbde1a6b322e866f5 Mon Sep 17 00:00:00 2001 From: alya Date: Tue, 30 Jul 2024 14:59:22 +0300 Subject: [PATCH 436/498] delete get_final_state_from_flags() from flowmldetection, profiler, and the db --- modules/flowmldetection/flowmldetection.py | 150 ++------------------- 1 file changed, 10 insertions(+), 140 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index c8226368c..9af514a70 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -122,141 +122,6 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) - - def get_final_state_from_flags(self, state, pkts): - """ - Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags - We receive the pakets to distinguish some Reset connections - """ - try: - pre = state.split("_")[0] - try: - # Try suricata states - """ - There are different states in which a flow can be. - Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, - these are: New, Established and Closed,for UDP only new and established. - For each of these states Suricata can employ different timeouts. - """ - if "new" in state or "established" in state: - return "Established" - elif "closed" in state: - return "Not Established" - - # We have varius type of states depending on the type of flow. 
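process_features() applies the state summarizer row by row and then converts the summary to a numeric category. A minimal sketch of that pipeline, with a stub standing in for the real helper:

import pandas as pd

# Stub for the real get_final_state_from_flags(); illustrative only.
def get_final_state_from_flags(state, pkts):
    return "Established" if state in ("SF", "EST", "CON") else "Not Established"

df = pd.DataFrame({"state": ["S0", "SF"], "pkts": [2, 84]})
df["state"] = df.apply(
    lambda row: get_final_state_from_flags(row["state"], row["pkts"]), axis=1
)
# Regex order matters: "Not Established" must be replaced before the broader
# "Established" pattern, which would otherwise also match it.
df.state = df.state.str.replace(r"(^.*Not Established.*$)", "0", regex=True)
df.state = df.state.str.replace(r"(^.*Established.*$)", "1", regex=True)
df.state = df.state.astype("float64")
print(df.state.tolist())  # [0.0, 1.0]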
- # For Zeek - if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): - return "Not Established" - elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): - return "Established" - - # For Argus - suf = state.split("_")[1] - if "S" in pre and "A" in pre and "S" in suf and "A" in suf: - """ - Examples: - SA_SA - SR_SA - FSRA_SA - SPA_SPA - SRA_SPA - FSA_FSA - FSA_FSPA - SAEC_SPA - SRPA_SPA - FSPA_SPA - FSRPA_SPA - FSPA_FSPA - FSRA_FSPA - SRAEC_SPA - FSPA_FSRPA - FSAEC_FSPA - FSRPA_FSPA - SRPAEC_SPA - FSPAEC_FSPA - SRPAEC_FSRPA - """ - return "Established" - elif "PA" in pre and "PA" in suf: - # Tipical flow that was reported in the middle - """ - Examples: - PA_PA - FPA_FPA - """ - return "Established" - elif "ECO" in pre: - return "ICMP Echo" - elif "ECR" in pre: - return "ICMP Reply" - elif "URH" in pre: - return "ICMP Host Unreachable" - elif "URP" in pre: - return "ICMP Port Unreachable" - else: - """ - Examples: - S_RA - S_R - A_R - S_SA - SR_SA - FA_FA - SR_RA - SEC_RA - """ - return "Not Established" - except IndexError: - # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP - if "ECO" in pre: - # ICMP - return "Established" - elif "UNK" in pre: - # ICMP6 unknown upper layer - return "Established" - elif "CON" in pre: - # UDP - return "Established" - elif "INT" in pre: - # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there - # is no confirmation of what happened. - return "Not Established" - elif "EST" in pre: - # TCP - return "Established" - elif "RST" in pre: - # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established - # It can happen that is reseted being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. - return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - elif "FIN" in pre: - # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established - # It can happen that is finished being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. 
- return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - else: - """ - Examples: - S_ - FA_ - PA_ - FSA_ - SEC_ - SRPA_ - """ - return "Not Established" - except Exception: - exception_line = sys.exc_info()[2].tb_lineno - self.print( - f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", - 0, - 1, - ) - self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ @@ -292,12 +157,17 @@ def process_features(self, dataset): except (ValueError, KeyError): pass - # When flows are read from Slips sqlite, the state is not transformed to 'Established' or 'Not Established', it is still 'S0' and others + # When flows are read from Slips sqlite, + # the state is not transformed to 'Established' or + # 'Not Established', it is still 'S0' and others # So transform here - #new_state_column = dataset.state.apply(self.get_final_state_from_flags(dataset.state, dataset.pkts)) - dataset['state'] = dataset.apply(lambda row: self.get_final_state_from_flags(row['state'], row['pkts']), axis=1) - - #dataset.state = new_state_column + dataset["state"] = dataset.apply( + lambda row: get_final_state_from_flags( + row["state"], row["pkts"] + ), + axis=1, + ) + # dataset.state = new_state_column # Convert state to categorical dataset.state = dataset.state.str.replace( From f6de6fe7db854dcd9ee932e602b7d15af93f80cd Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Mon, 29 Jul 2024 16:32:14 +0200 Subject: [PATCH 437/498] mlflow. Add a function to convert the state again --- modules/flowmldetection/flowmldetection.py | 438 +++++++++++++-------- 1 file changed, 278 insertions(+), 160 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 9af514a70..124ec61f9 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -1,8 +1,3 @@ -# SPDX-FileCopyrightText: 2021 Sebastian Garcia -from typing import Optional - -# SPDX-License-Identifier: GPL-2.0-only -import numpy from sklearn.linear_model import SGDClassifier from sklearn.preprocessing import StandardScaler import pickle @@ -10,13 +5,10 @@ import json import datetime import traceback -import warnings import sys -from slips_files.common.parsers.config_parser import ConfigParser -from slips_files.common.slips_utils import utils -from slips_files.common.abstracts.module import IModule -from slips_files.core.structures.evidence import ( +from slips_files.common.imports import * +from slips_files.core.evidence_structure.evidence import ( Evidence, ProfileID, TimeWindow, @@ -25,8 +17,7 @@ EvidenceType, IoCType, Direction, - Victim, - Method, + IDEACategory, ) # Only for debbuging @@ -38,6 +29,8 @@ def warn(*args, **kwargs): pass +import warnings + warnings.warn = warn @@ -63,8 +56,6 @@ def init(self): # self.scores = [] # The scaler trained during training and to use during testing self.scaler = StandardScaler() - self.model_path = "./modules/flowmldetection/model.bin" - self.scaler_path = "./modules/flowmldetection/scaler.bin" def read_configuration(self): conf = ConfigParser() @@ -122,6 +113,141 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) + + def get_final_state_from_flags(self, state, pkts): + """ + Analyze the flags given and return a summary of the state. 
Should work with Argus and Bro flags
+ We receive the packet count to distinguish some Reset connections
+ """
+ try:
+ pre = state.split("_")[0]
+ try:
+ # Try suricata states
+ """
+ There are different states in which a flow can be.
+ Suricata distinguishes three flow-states for TCP and two for UDP. For TCP,
+ these are: New, Established and Closed; for UDP, only New and Established.
+ For each of these states Suricata can employ different timeouts.
+ """
+ if "new" in state or "established" in state:
+ return "Established"
+ elif "closed" in state:
+ return "Not Established"
+
+ # We have various types of states depending on the type of flow.
+ # For Zeek
+ if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"):
+ return "Not Established"
+ elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"):
+ return "Established"
+
+ # For Argus
+ suf = state.split("_")[1]
+ if "S" in pre and "A" in pre and "S" in suf and "A" in suf:
+ """
+ Examples:
+ SA_SA
+ SR_SA
+ FSRA_SA
+ SPA_SPA
+ SRA_SPA
+ FSA_FSA
+ FSA_FSPA
+ SAEC_SPA
+ SRPA_SPA
+ FSPA_SPA
+ FSRPA_SPA
+ FSPA_FSPA
+ FSRA_FSPA
+ SRAEC_SPA
+ FSPA_FSRPA
+ FSAEC_FSPA
+ FSRPA_FSPA
+ SRPAEC_SPA
+ FSPAEC_FSPA
+ SRPAEC_FSRPA
+ """
+ return "Established"
+ elif "PA" in pre and "PA" in suf:
+ # Typical flow that was reported in the middle
+ """
+ Examples:
+ PA_PA
+ FPA_FPA
+ """
+ return "Established"
+ elif "ECO" in pre:
+ return "ICMP Echo"
+ elif "ECR" in pre:
+ return "ICMP Reply"
+ elif "URH" in pre:
+ return "ICMP Host Unreachable"
+ elif "URP" in pre:
+ return "ICMP Port Unreachable"
+ else:
+ """
+ Examples:
+ S_RA
+ S_R
+ A_R
+ S_SA
+ SR_SA
+ FA_FA
+ SR_RA
+ SEC_RA
+ """
+ return "Not Established"
+ except IndexError:
+ # suf does not exist, which means this is some ICMP or no response was sent for UDP or TCP
+ if "ECO" in pre:
+ # ICMP
+ return "Established"
+ elif "UNK" in pre:
+ # ICMP6 unknown upper layer
+ return "Established"
+ elif "CON" in pre:
+ # UDP
+ return "Established"
+ elif "INT" in pre:
+ # UDP trying to connect, not precisely 'not established' but also NOT 'Established'. We consider it not established because there
+ # is no confirmation of what happened.
+ return "Not Established"
+ elif "EST" in pre:
+ # TCP
+ return "Established"
+ elif "RST" in pre:
+ # TCP. When -z B is not used in argus, states are single words. Most connections are reset when finished and are therefore established
+ # A connection can be reset without being established, but we can't tell without -z B.
+ # So we use the number of packets as a heuristic. If <=3, it is not established because the OS retries 3 times.
+ return (
+ "Not Established" if int(pkts) <= 3 else "Established"
+ )
+ elif "FIN" in pre:
+ # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN and are therefore established
+ # A connection can finish without being established, but we can't tell without -z B.
+ # So we use the number of packets as a heuristic. If <=3, it is not established because the OS retries 3 times.
+ return ( + "Not Established" if int(pkts) <= 3 else "Established" + ) + else: + """ + Examples: + S_ + FA_ + PA_ + FSA_ + SEC_ + SRPA_ + """ + return "Not Established" + except Exception: + exception_line = sys.exc_info()[2].tb_lineno + self.print( + f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", + 0, + 1, + ) + self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ @@ -130,7 +256,7 @@ def process_features(self, dataset): """ try: # Discard some type of flows that dont have ports - to_discard = ["arp", "ARP", "icmp", "igmp", "ipv6-icmp", ""] + to_discard = ["arp", "ARP", "icmp", "igmp", "ipv6-icmp"] for proto in to_discard: dataset = dataset[dataset.proto != proto] @@ -139,35 +265,28 @@ def process_features(self, dataset): "appproto", "daddr", "saddr", - "starttime", + "ts", + "origstate", "type_", - "smac", - "dmac", - "history", - "uid", "dir_", + "history", "dbytes", - "endtime", - "bytes", - "flow_source", + "dpkts", + "smac", + "dmac", ] for field in to_drop: try: dataset = dataset.drop(field, axis=1) - except (ValueError, KeyError): + except ValueError: pass - # When flows are read from Slips sqlite, - # the state is not transformed to 'Established' or - # 'Not Established', it is still 'S0' and others + # When flows are read from Slips sqlite, the state is not transformed to 'Established' or 'Not Established', it is still 'S0' and others # So transform here - dataset["state"] = dataset.apply( - lambda row: get_final_state_from_flags( - row["state"], row["pkts"] - ), - axis=1, - ) - # dataset.state = new_state_column + #new_state_column = dataset.state.apply(self.get_final_state_from_flags(dataset.state, dataset.pkts)) + dataset['state'] = dataset.apply(lambda row: self.get_final_state_from_flags(row['state'], row['pkts']), axis=1) + + #dataset.state = new_state_column # Convert state to categorical dataset.state = dataset.state.str.replace( @@ -201,23 +320,42 @@ def process_features(self, dataset): dataset.proto = dataset.proto.str.replace( r"(^.*arp.*$)", "4", regex=True ) - fields_to_convert_to_flow = [ - dataset.proto, - dataset.dport, - dataset.sport, - dataset.dur, - dataset.pkts, - dataset.spkts, - dataset.allbytes, - dataset.sbytes, - dataset.state, - ] - for field in fields_to_convert_to_flow: - try: - field = field.astype("float64") - except ValueError: - pass - + dataset.proto = dataset.proto.astype("float64") + try: + # Convert dport to float + dataset.dport = dataset.dport.astype("float") + except ValueError: + pass + try: + # Convert sport to float + dataset.sport = dataset.sport.astype("float") + except ValueError: + pass + try: + # Convert Dur to float + dataset.dur = dataset.dur.astype("float") + except ValueError: + pass + try: + # Convert TotPkts to float + dataset.pkts = dataset.pkts.astype("float") + except ValueError: + pass + try: + # Convert SrcPkts to float + dataset.spkts = dataset.spkts.astype("float") + except ValueError: + pass + try: + # Convert TotBytes to float + dataset.allbytes = dataset.allbytes.astype("float") + except ValueError: + pass + try: + # Convert SrcBytes to float + dataset.sbytes = dataset.sbytes.astype("float") + except ValueError: + pass return dataset except Exception: # Stop the timer @@ -233,6 +371,7 @@ def process_flows(self): # We get all the flows so far # because this retraining happens in batches flows = self.db.get_all_flows() + # Check how many different labels are in the DB # We need both normal and malware labels = self.db.get_labels() @@ -252,7 +391,9 @@ 
def process_flows(self): "daddr": "40.70.224.145", "dport": "443", "proto": "tcp", + "origstate": "SRPA_SPA", "state": "Established", + "pkts": 84, "allbytes": 42764, "spkts": 37, "sbytes": 25517, @@ -272,7 +413,9 @@ def process_flows(self): "daddr": "80.242.138.72", "dport": "80", "proto": "tcp", + "origstate": "SRPA_SPA", "state": "Established", + "pkts": 67, "allbytes": 67696, "spkts": 1, "sbytes": 100, @@ -298,55 +441,42 @@ def process_flows(self): self.print("Error in process_flows()") self.print(traceback.format_exc(), 0, 1) - def process_flow(self, flow_to_process: dict): + def process_flow(self): """ Process one flow. Only used during detection in testing - returns the pandas df with the processed flow + Store the pandas df in self.flow """ try: # Convert the flow to a pandas dataframe - raw_flow = pd.DataFrame(flow_to_process, index=[0]) + raw_flow = pd.DataFrame(self.flow_dict, index=[0]) + # Process features dflow = self.process_features(raw_flow) # Update the flow to the processed version - return dflow + self.flow = dflow except Exception: # Stop the timer self.print("Error in process_flow()") self.print(traceback.format_exc(), 0, 1) - def detect(self, x_flow) -> Optional[numpy.ndarray]: + def detect(self): """ - Detects the given flow with the current model stored - and returns the predection array + Detect this flow with the current model stored """ try: - given_x_flow = x_flow - # clean the flow - fields_to_drop = [ - "label", - "module_labels", - "uid", - "history", - "dir_", - "dbytes", - "dpkts", - "endtime", - "bytes", - "flow_source", - ] - for field in fields_to_drop: - try: - x_flow = x_flow.drop(field, axis=1) - except (KeyError, ValueError): - pass + # Store the real label if there is one + y_flow = self.flow["label"] + # remove the real label column + self.flow = self.flow.drop("label", axis=1) + # remove the label predictions column of the other modules + X_flow = self.flow.drop("module_labels", axis=1) # Scale the flow - x_flow: numpy.ndarray = self.scaler.transform(x_flow) - pred: numpy.ndarray = self.clf.predict(x_flow) + X_flow = self.scaler.transform(X_flow) + pred = self.clf.predict(X_flow) return pred - except Exception as e: - self.print( - f"Error in detect() while processing " f"\n{given_x_flow}\n{e}" - ) + except Exception: + # Stop the timer + self.print("Error in detect() X_flow:") + self.print(X_flow) self.print(traceback.format_exc(), 0, 1) def store_model(self): @@ -354,10 +484,10 @@ def store_model(self): Store the trained model on disk """ self.print("Storing the trained model and scaler on disk.", 0, 2) - with open(self.model_path, "wb") as f: + with open("./modules/flowmldetection/model.bin", "wb") as f: data = pickle.dumps(self.clf) f.write(data) - with open(self.scaler_path, "wb") as g: + with open("./modules/flowmldetection/scaler.bin", "wb") as g: data = pickle.dumps(self.scaler) g.write(data) @@ -367,23 +497,20 @@ def read_model(self): """ try: self.print("Reading the trained model from disk.", 0, 2) - with open(self.model_path, "rb") as f: + with open("./modules/flowmldetection/model.bin", "rb") as f: self.clf = pickle.load(f) self.print("Reading the trained scaler from disk.", 0, 2) - with open(self.scaler_path, "rb") as g: + with open("./modules/flowmldetection/scaler.bin", "rb") as g: self.scaler = pickle.load(g) except FileNotFoundError: # If there is no model, create one empty - self.print( - "There was no model. " "Creating a new empty model.", 0, 2 - ) + self.print("There was no model. 
Creating a new empty model.", 0, 2) self.clf = SGDClassifier( warm_start=True, loss="hinge", penalty="l1" ) except EOFError: self.print( - "Error reading model from disk. " - "Creating a new empty model.", + "Error reading model from disk. Creating a new empty model.", 0, 2, ) @@ -391,40 +518,39 @@ def read_model(self): warm_start=True, loss="hinge", penalty="l1" ) - def set_evidence_malicious_flow(self, flow: dict, twid: str): + def set_evidence_malicious_flow( + self, + saddr: str, + sport: str, + daddr: str, + dport: str, + twid: str, + uid: str, + ): confidence: float = 0.1 + ip_identification = self.db.get_ip_identification(daddr) description = ( - f"Malicious flow by ML. Src IP" - f" {flow['saddr']}:{flow['sport']} to " - f"{flow['daddr']}:{flow['dport']}" + f"Malicious flow by ML. Src IP {saddr}:{sport} to " + f"{daddr}:{dport} {ip_identification}" ) timestamp = utils.convert_format( datetime.datetime.now(), utils.alerts_format ) - twid_number = int(twid.replace("timewindow", "")) + evidence: Evidence = Evidence( evidence_type=EvidenceType.MALICIOUS_FLOW, attacker=Attacker( - direction=Direction.SRC, - attacker_type=IoCType.IP, - value=flow["saddr"], - ), - victim=Victim( - direction=Direction.DST, - victim_type=IoCType.IP, - value=flow["daddr"], + direction=Direction.SRC, attacker_type=IoCType.IP, value=saddr ), threat_level=ThreatLevel.LOW, confidence=confidence, description=description, - profile=ProfileID(ip=flow["saddr"]), - timewindow=TimeWindow(twid_number), - uid=[flow["uid"]], + profile=ProfileID(ip=saddr), + timewindow=TimeWindow(number=int(twid.replace("timewindow", ""))), + uid=[uid], timestamp=timestamp, - method=Method.AI, - src_port=flow["sport"], - dst_port=flow["dport"], + category=IDEACategory.ANOMALY_TRAFFIC, ) self.db.set_evidence(evidence) @@ -441,22 +567,20 @@ def pre_main(self): def main(self): if msg := self.get_msg("new_flow"): - msg = json.loads(msg["data"]) - twid = msg["twid"] - self.flow = msg["flow"] - # these fields are expected in testing. update the original - # flow dict to have them - self.flow.update( - { - "allbytes": (self.flow["sbytes"] + self.flow["dbytes"]), - # the flow["state"] is the origstate, we dont need that here - # we need the interpreted state - "state": msg["interpreted_state"], - "pkts": self.flow["spkts"] + self.flow["dpkts"], - "label": msg["label"], - "module_labels": msg["module_labels"], - } - ) + data = msg["data"] + # Convert from json to dict + data = json.loads(data) + profileid = data["profileid"] + twid = data["twid"] + # Get flow that is now in json format + flow = data["flow"] + # Convert flow to a dict + flow = json.loads(flow) + # Convert the common fields to something that can + # be interpreted + # Get the uid which is the key + uid = next(iter(flow)) + self.flow_dict = json.loads(flow[uid]) if self.mode == "train": # We are training @@ -469,57 +593,51 @@ def main(self): sum_labeled_flows >= self.minimum_lables_to_retrain and sum_labeled_flows % self.minimum_lables_to_retrain == 1 ): - # We get here every 'self.minimum_lables_to_retrain' - # amount of labels - # So for example we retrain every 100 labels and only when - # we have at least 100 labels + # We get here every 'self.minimum_lables_to_retrain' amount of labels + # So for example we retrain every 100 labels and only when we have at least 100 labels self.print( - f"Training the model with the last group of " - f"flows and labels. Total flows: {sum_labeled_flows}." + f"Training the model with the last group of flows and labels. 
Total flows: {sum_labeled_flows}." ) - # Process all flows in the DB and make them ready - # for pandas + # Process all flows in the DB and make them ready for pandas self.process_flows() # Train an algorithm self.train() elif self.mode == "test": # We are testing, which means using the model to detect - processed_flow = self.process_flow(self.flow) + self.process_flow() - # After processing the flow, it may happen that we - # delete icmp/arp/etc so the dataframe can be empty - if processed_flow is not None and not processed_flow.empty: + # After processing the flow, it may happen that we delete icmp/arp/etc + # so the dataframe can be empty + if self.flow is not None and not self.flow.empty: # Predict - pred: numpy.ndarray = self.detect(processed_flow) - if not pred: - # an error occurred - return + pred = self.detect() + label = self.flow_dict["label"] - label = self.flow["label"] + # Report if label and label != "unknown" and label != pred[0]: - # If the user specified a label in test mode, - # and the label is diff from the prediction, - # print in debug mode + # If the user specified a label in test mode, and the label + # is diff from the prediction, print in debug mode self.print( - f"Report Prediction {pred[0]} for label" - f' {label} flow {self.flow["saddr"]}:' - f'{self.flow["sport"]} ->' - f' {self.flow["daddr"]}:' - f'{self.flow["dport"]}/' - f'{self.flow["proto"]}', + f'Report Prediction {pred[0]} for label {label} flow {self.flow_dict["saddr"]}:' + f'{self.flow_dict["sport"]} -> {self.flow_dict["daddr"]}:' + f'{self.flow_dict["dport"]}/{self.flow_dict["proto"]}', 0, 3, ) if pred[0] == "Malware": # Generate an alert - self.set_evidence_malicious_flow(self.flow, twid) + self.set_evidence_malicious_flow( + self.flow_dict["saddr"], + self.flow_dict["sport"], + self.flow_dict["daddr"], + self.flow_dict["dport"], + twid, + uid, + ) self.print( - f"Prediction {pred[0]} for label {label}" - f' flow {self.flow["saddr"]}:' - f'{self.flow["sport"]} -> ' - f'{self.flow["daddr"]}:' - f'{self.flow["dport"]}/' - f'{self.flow["proto"]}', + f'Prediction {pred[0]} for label {label} flow {self.flow_dict["saddr"]}:' + f'{self.flow_dict["sport"]} -> {self.flow_dict["daddr"]}:' + f'{self.flow_dict["dport"]}/{self.flow_dict["proto"]}', 0, 2, ) From 1b46d82aa527373f28ad89932d12fbf7775a8561 Mon Sep 17 00:00:00 2001 From: alya Date: Tue, 30 Jul 2024 14:59:22 +0300 Subject: [PATCH 438/498] delete get_final_state_from_flags() from flowmldetection, profiler, and the db --- modules/flowmldetection/flowmldetection.py | 169 +++------------------ 1 file changed, 19 insertions(+), 150 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 124ec61f9..c57a7a358 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -5,9 +5,13 @@ import json import datetime import traceback -import sys +import warnings + -from slips_files.common.imports import * +from slips_files.common.state_handler import get_final_state_from_flags +from slips_files.common.parsers.config_parser import ConfigParser +from slips_files.common.slips_utils import utils +from slips_files.common.abstracts.module import IModule from slips_files.core.evidence_structure.evidence import ( Evidence, ProfileID, @@ -29,8 +33,6 @@ def warn(*args, **kwargs): pass -import warnings - warnings.warn = warn @@ -113,141 +115,6 @@ def train(self): except Exception: self.print("Error in train()", 0, 1) self.print(traceback.format_exc(), 0, 1) - - def 
get_final_state_from_flags(self, state, pkts): - """ - Analyze the flags given and return a summary of the state. Should work with Argus and Bro flags - We receive the pakets to distinguish some Reset connections - """ - try: - pre = state.split("_")[0] - try: - # Try suricata states - """ - There are different states in which a flow can be. - Suricata distinguishes three flow-states for TCP and two for UDP. For TCP, - these are: New, Established and Closed,for UDP only new and established. - For each of these states Suricata can employ different timeouts. - """ - if "new" in state or "established" in state: - return "Established" - elif "closed" in state: - return "Not Established" - - # We have varius type of states depending on the type of flow. - # For Zeek - if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): - return "Not Established" - elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): - return "Established" - - # For Argus - suf = state.split("_")[1] - if "S" in pre and "A" in pre and "S" in suf and "A" in suf: - """ - Examples: - SA_SA - SR_SA - FSRA_SA - SPA_SPA - SRA_SPA - FSA_FSA - FSA_FSPA - SAEC_SPA - SRPA_SPA - FSPA_SPA - FSRPA_SPA - FSPA_FSPA - FSRA_FSPA - SRAEC_SPA - FSPA_FSRPA - FSAEC_FSPA - FSRPA_FSPA - SRPAEC_SPA - FSPAEC_FSPA - SRPAEC_FSRPA - """ - return "Established" - elif "PA" in pre and "PA" in suf: - # Tipical flow that was reported in the middle - """ - Examples: - PA_PA - FPA_FPA - """ - return "Established" - elif "ECO" in pre: - return "ICMP Echo" - elif "ECR" in pre: - return "ICMP Reply" - elif "URH" in pre: - return "ICMP Host Unreachable" - elif "URP" in pre: - return "ICMP Port Unreachable" - else: - """ - Examples: - S_RA - S_R - A_R - S_SA - SR_SA - FA_FA - SR_RA - SEC_RA - """ - return "Not Established" - except IndexError: - # suf does not exist, which means that this is some ICMP or no response was sent for UDP or TCP - if "ECO" in pre: - # ICMP - return "Established" - elif "UNK" in pre: - # ICMP6 unknown upper layer - return "Established" - elif "CON" in pre: - # UDP - return "Established" - elif "INT" in pre: - # UDP trying to connect, NOT preciselly not established but also NOT 'Established'. So we considered not established because there - # is no confirmation of what happened. - return "Not Established" - elif "EST" in pre: - # TCP - return "Established" - elif "RST" in pre: - # TCP. When -z B is not used in argus, states are single words. Most connections are reseted when finished and therefore are established - # It can happen that is reseted being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. - return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - elif "FIN" in pre: - # TCP. When -z B is not used in argus, states are single words. Most connections are finished with FIN when finished and therefore are established - # It can happen that is finished being not established, but we can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is not established because the OS retries 3 times. 
- return ( - "Not Established" if int(pkts) <= 3 else "Established" - ) - else: - """ - Examples: - S_ - FA_ - PA_ - FSA_ - SEC_ - SRPA_ - """ - return "Not Established" - except Exception: - exception_line = sys.exc_info()[2].tb_lineno - self.print( - f"Error in get_final_state_from_flags() in FlowMLDetection.py line {exception_line}", - 0, - 1, - ) - self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): """ @@ -281,12 +148,17 @@ def process_features(self, dataset): except ValueError: pass - # When flows are read from Slips sqlite, the state is not transformed to 'Established' or 'Not Established', it is still 'S0' and others + # When flows are read from Slips sqlite, + # the state is not transformed to 'Established' or + # 'Not Established', it is still 'S0' and others # So transform here - #new_state_column = dataset.state.apply(self.get_final_state_from_flags(dataset.state, dataset.pkts)) - dataset['state'] = dataset.apply(lambda row: self.get_final_state_from_flags(row['state'], row['pkts']), axis=1) - - #dataset.state = new_state_column + dataset["state"] = dataset.apply( + lambda row: get_final_state_from_flags( + row["state"], row["pkts"] + ), + axis=1, + ) + # dataset.state = new_state_column # Convert state to categorical dataset.state = dataset.state.str.replace( @@ -370,7 +242,7 @@ def process_flows(self): try: # We get all the flows so far # because this retraining happens in batches - flows = self.db.get_all_flows() + flows: list = self.db.get_all_flows() # Check how many different labels are in the DB # We need both normal and malware @@ -464,7 +336,7 @@ def detect(self): """ try: # Store the real label if there is one - y_flow = self.flow["label"] + # y_flow = self.flow["label"] # remove the real label column self.flow = self.flow.drop("label", axis=1) # remove the label predictions column of the other modules @@ -568,13 +440,10 @@ def pre_main(self): def main(self): if msg := self.get_msg("new_flow"): data = msg["data"] - # Convert from json to dict data = json.loads(data) - profileid = data["profileid"] + # profileid = data["profileid"] twid = data["twid"] - # Get flow that is now in json format flow = data["flow"] - # Convert flow to a dict flow = json.loads(flow) # Convert the common fields to something that can # be interpreted From 299d2ab8fd04e70a3a7b4f9bc287a3a642faf542 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Mon, 29 Jul 2024 16:36:55 +0200 Subject: [PATCH 439/498] mlflow. 
Ignore UID column

---
 modules/flowmldetection/flowmldetection.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index c57a7a358..e2aa1e0ee 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -141,6 +141,7 @@ def process_features(self, dataset):
             "dpkts",
             "smac",
             "dmac",
+            "uid",
         ]
         for field in to_drop:
             try:

From 06bbbcfd5bdbefc4da9940c62949a5178fe58209 Mon Sep 17 00:00:00 2001
From: Sebas Garcia
Date: Sat, 15 Mar 2025 19:23:29 +0100
Subject: [PATCH 440/498] Re-add the function that alya added

---
 slips_files/core/database/database_manager.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/slips_files/core/database/database_manager.py b/slips_files/core/database/database_manager.py
index 568e78ff4..1d339685f 100644
--- a/slips_files/core/database/database_manager.py
+++ b/slips_files/core/database/database_manager.py
@@ -613,6 +613,9 @@ def add_out_dns(self, *args, **kwargs):
     def add_port(self, *args, **kwargs):
         return self.rdb.add_port(*args, **kwargs)

+    def get_final_state_from_flags(self, *args, **kwargs):
+        return self.rdb.get_final_state_from_flags(*args, **kwargs)
+
     def add_ips(self, *args, **kwargs):
         return self.rdb.add_ips(*args, **kwargs)

From 98e29a6c43277e0577924a1d8c130f300c3cdca2 Mon Sep 17 00:00:00 2001
From: Sebas Garcia
Date: Sat, 15 Mar 2025 19:27:23 +0100
Subject: [PATCH 441/498] Delete file that was deleted from develop

---
 slips_files/common/state_handler.py | 170 ----------------------------
 1 file changed, 170 deletions(-)
 delete mode 100644 slips_files/common/state_handler.py

diff --git a/slips_files/common/state_handler.py b/slips_files/common/state_handler.py
deleted file mode 100644
index d0a05115b..000000000
--- a/slips_files/common/state_handler.py
+++ /dev/null
@@ -1,170 +0,0 @@
-from typing import Optional
-
-
-def interpret_suricata_states(state) -> Optional[str]:
-    """
-    There are different states in which a flow can be.
-    Suricata distinguishes three flow-states for TCP and two for
-    UDP. For TCP,
-    these are: New, Established and Closed,for UDP only new and
-    established.
-    For each of these states Suricata can employ different timeouts.
-    """
-    if "new" in state or "established" in state:
-        return "Established"
-    elif "closed" in state:
-        return "Not Established"
-
-
-def interpret_zeek_states(state) -> Optional[str]:
-    # We have varius type of states depending on the type of flow. 
- # For Zeek - if state in ("S0", "REJ", "RSTOS0", "RSTRH", "SH", "SHR"): - return "Not Established" - elif state in ("S1", "SF", "S2", "S3", "RSTO", "RSTP", "OTH"): - return "Established" - - -def interpret_argus_states(state) -> Optional[str]: - pre = state.split("_")[0] - try: - suf = state.split("_")[1] - except IndexError: - return - - if "S" in pre and "A" in pre and "S" in suf and "A" in suf: - """ - Examples: - SA_SA - SR_SA - FSRA_SA - SPA_SPA - SRA_SPA - FSA_FSA - FSA_FSPA - SAEC_SPA - SRPA_SPA - FSPA_SPA - FSRPA_SPA - FSPA_FSPA - FSRA_FSPA - SRAEC_SPA - FSPA_FSRPA - FSAEC_FSPA - FSRPA_FSPA - SRPAEC_SPA - FSPAEC_FSPA - SRPAEC_FSRPA - """ - return "Established" - elif "PA" in pre and "PA" in suf: - # Tipical flow that was reported in the middle - """ - Examples: - PA_PA - FPA_FPA - """ - return "Established" - elif "ECO" in pre: - return "ICMP Echo" - elif "ECR" in pre: - return "ICMP Reply" - elif "URH" in pre: - return "ICMP Host Unreachable" - elif "URP" in pre: - return "ICMP Port Unreachable" - else: - """ - Examples: - S_RA - S_R - A_R - S_SA - SR_SA - FA_FA - SR_RA - SEC_RA - """ - return "Not Established" - - -def interpret_tcp_states(state, pkts) -> Optional[str]: - pre = state.split("_")[0] - if "EST" in pre: - # TCP - return "Established" - elif "RST" in pre: - # TCP. When -z B is not used in argus, states are single words. - # Most connections are reseted when finished and therefore are - # established - # It can happen that is reseted being not established, but we - # can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is - # not established because the OS retries 3 times. - return "Not Established" if int(pkts) <= 3 else "Established" - elif "FIN" in pre: - # TCP. When -z B is not used in argus, states are single words. - # Most connections are finished with FIN when finished and - # therefore are established - # It can happen that is finished being not established, but we - # can't tell without -z b. - # So we use as heuristic the amount of packets. If <=3, then is - # not established because the OS retries 3 times. - return "Not Established" if int(pkts) <= 3 else "Established" - else: - """ - Examples: - S_ - FA_ - PA_ - FSA_ - SEC_ - SRPA_ - """ - return "Not Established" - - -def interpret_udp_states(state) -> Optional[str]: - pre = state.split("_")[0] - if "CON" in pre: - # UDP - return "Established" - elif "INT" in pre: - # UDP trying to connect, NOT preciselly not established but also - # NOT 'Established'. So we considered not established because there - # is no confirmation of what happened. 
- return "Not Established" - - -def interpret_icmp_states(state) -> Optional[str]: - pre = state.split("_")[0] - if "ECO" in pre: - # ICMP - return "Established" - elif "UNK" in pre: - # ICMP6 unknown upper layer - return "Established" - - -def get_final_state_from_flags(state, pkts) -> str: - """ - Converts the original flags from the flow, to a state that slips - understands - Works with Argus, suricata, and Bro flags - We receive the packets to distinguish some Reset connections - """ - - for interpreter in ( - interpret_suricata_states, - interpret_zeek_states, - interpret_argus_states, - interpret_icmp_states, - interpret_udp_states, - ): - if interpreted_state := interpreter(state): - return interpreted_state - - if interpreted_state := interpret_tcp_states(state, pkts): - return interpreted_state - - return "Not Established" From 045947ffdfb935b57f705baba86df81216eef573 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Sat, 15 Mar 2025 19:32:01 +0100 Subject: [PATCH 442/498] Flowmldetection. Fix missing db reference --- modules/flowmldetection/flowmldetection.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index e2aa1e0ee..9269b6701 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -154,7 +154,7 @@ def process_features(self, dataset): # 'Not Established', it is still 'S0' and others # So transform here dataset["state"] = dataset.apply( - lambda row: get_final_state_from_flags( + lambda row: self.db.get_final_state_from_flags( row["state"], row["pkts"] ), axis=1, From e793c517a247a98ea25d278c35f38c9e16c8772d Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Tue, 18 Mar 2025 12:08:08 +0100 Subject: [PATCH 443/498] Fix the training of flows with ML in new version --- modules/flowmldetection/flowmldetection.py | 378 +++++++++++---------- 1 file changed, 197 insertions(+), 181 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 9269b6701..1cfbaf925 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -1,18 +1,20 @@ +# SPDX-FileCopyrightText: 2021 Sebastian Garcia +from typing import Optional + +# SPDX-License-Identifier: GPL-2.0-only +import numpy from sklearn.linear_model import SGDClassifier from sklearn.preprocessing import StandardScaler import pickle import pandas as pd import json -import datetime import traceback import warnings - -from slips_files.common.state_handler import get_final_state_from_flags from slips_files.common.parsers.config_parser import ConfigParser from slips_files.common.slips_utils import utils from slips_files.common.abstracts.module import IModule -from slips_files.core.evidence_structure.evidence import ( +from slips_files.core.structures.evidence import ( Evidence, ProfileID, TimeWindow, @@ -21,7 +23,8 @@ EvidenceType, IoCType, Direction, - IDEACategory, + Victim, + Method, ) # Only for debbuging @@ -52,36 +55,41 @@ def init(self): # Set the output queue of our database instance # Read the configuration self.read_configuration() - # Minum amount of new lables needed to trigger the train - self.minimum_lables_to_retrain = 50 + # Minum amount of new labels needed to start the train + self.minimum_labels_to_start_train = 50 + # Minum amount of new labels needed to retrain + self.minimum_labels_to_retrain = 50 + # The number of flows when last trained + 
self.last_number_of_flows_when_trained = 0 # To plot the scores of training # self.scores = [] # The scaler trained during training and to use during testing self.scaler = StandardScaler() + self.model_path = "./modules/flowmldetection/model.bin" + self.scaler_path = "./modules/flowmldetection/scaler.bin" def read_configuration(self): conf = ConfigParser() self.mode = conf.get_ml_mode() + self.label = conf.label() def train(self): """ Train a model based on the flows we receive and the labels """ try: - # Process the labels to have only Normal and Malware - self.flows.label = self.flows.label.str.replace( - r"(^.*ormal.*$)", "Normal", regex=True - ) - self.flows.label = self.flows.label.str.replace( - r"(^.*alware.*$)", "Malware", regex=True - ) - self.flows.label = self.flows.label.str.replace( - r"(^.*alicious.*$)", "Malware", regex=True - ) + # Get the flows from the DB + # self.flows = self.db.get_all_flows_in_profileid_twid(self.profileid, self.twid) + # Convert to pandas df + # self.flows = pd.DataFrame(self.flows) + # Process the features + # X_flow = self.process_features(self.flows) - # Separate - y_flow = self.flows["label"] + # Create X_flow with the current flows minus the label X_flow = self.flows.drop("label", axis=1) + # Create y_flow with the label + y_flow = numpy.full(X_flow.shape[0], self.label) + # Drop the module_labels X_flow = X_flow.drop("module_labels", axis=1) # Normalize this batch of data so far. This can get progressivle slow @@ -90,7 +98,7 @@ def train(self): # Train try: self.clf.partial_fit( - X_flow, y_flow, classes=["Malware", "Normal"] + X_flow, y_flow, classes=["Malicious", "Benign"] ) except Exception: self.print("Error while calling clf.train()") @@ -113,7 +121,7 @@ def train(self): self.store_model() except Exception: - self.print("Error in train()", 0, 1) + self.print("Error in train().", 0, 1) self.print(traceback.format_exc(), 0, 1) def process_features(self, dataset): @@ -123,7 +131,7 @@ def process_features(self, dataset): """ try: # Discard some type of flows that dont have ports - to_discard = ["arp", "ARP", "icmp", "igmp", "ipv6-icmp"] + to_discard = ["arp", "ARP", "icmp", "igmp", "ipv6-icmp", ""] for proto in to_discard: dataset = dataset[dataset.proto != proto] @@ -132,21 +140,20 @@ def process_features(self, dataset): "appproto", "daddr", "saddr", - "ts", - "origstate", + "starttime", "type_", - "dir_", - "history", - "dbytes", - "dpkts", "smac", "dmac", + "history", "uid", + "dir_", + "endtime", + "flow_source", ] for field in to_drop: try: dataset = dataset.drop(field, axis=1) - except ValueError: + except (ValueError, KeyError): pass # When flows are read from Slips sqlite, @@ -155,11 +162,10 @@ def process_features(self, dataset): # So transform here dataset["state"] = dataset.apply( lambda row: self.db.get_final_state_from_flags( - row["state"], row["pkts"] + row["state"], (row["spkts"] + row["dpkts"]) ), axis=1, ) - # dataset.state = new_state_column # Convert state to categorical dataset.state = dataset.state.str.replace( @@ -193,58 +199,42 @@ def process_features(self, dataset): dataset.proto = dataset.proto.str.replace( r"(^.*arp.*$)", "4", regex=True ) - dataset.proto = dataset.proto.astype("float64") - try: - # Convert dport to float - dataset.dport = dataset.dport.astype("float") - except ValueError: - pass - try: - # Convert sport to float - dataset.sport = dataset.sport.astype("float") - except ValueError: - pass - try: - # Convert Dur to float - dataset.dur = dataset.dur.astype("float") - except ValueError: - pass - try: - # 
Convert TotPkts to float - dataset.pkts = dataset.pkts.astype("float") - except ValueError: - pass - try: - # Convert SrcPkts to float - dataset.spkts = dataset.spkts.astype("float") - except ValueError: - pass - try: - # Convert TotBytes to float - dataset.allbytes = dataset.allbytes.astype("float") - except ValueError: - pass - try: - # Convert SrcBytes to float - dataset.sbytes = dataset.sbytes.astype("float") - except ValueError: - pass + + dataset["allbytes"] = dataset["sbytes"] + dataset["dbytes"] + dataset["pkts"] = dataset["spkts"] + dataset["dpkts"] + + fields_to_convert_to_float = [ + dataset.proto, + dataset.dport, + dataset.sport, + dataset.dur, + dataset.pkts, + dataset.spkts, + dataset.allbytes, + dataset.sbytes, + dataset.state, + ] + for field in fields_to_convert_to_float: + try: + field = field.astype("float64") + except (ValueError, AttributeError): + pass + return dataset except Exception: # Stop the timer self.print("Error in process_features()") self.print(traceback.format_exc(), 0, 1) - def process_flows(self): + def process_training_flows(self): """ - Process all the flwos in the DB + Process all the flows in the DB Store the pandas df in self.flows """ try: # We get all the flows so far # because this retraining happens in batches - flows: list = self.db.get_all_flows() - + flows = self.db.get_all_flows() # Check how many different labels are in the DB # We need both normal and malware labels = self.db.get_labels() @@ -254,48 +244,48 @@ def process_flows(self): # that are fake but representative of a normal and malware flow # they are only for the training process # At least 1 flow of each label is required - # self.print(f'Amount of labeled flows: {labels}', 0, 1) + + # These flows should be in the same format as the ones in the DB. + # Which means the satate is still SF, S0, etc. flows.append( { - "ts": 1594417039.029793, + "starttime": 1594417039.029793, "dur": "1.9424750804901123", "saddr": "10.7.10.101", "sport": "49733", "daddr": "40.70.224.145", "dport": "443", "proto": "tcp", - "origstate": "SRPA_SPA", - "state": "Established", - "pkts": 84, - "allbytes": 42764, - "spkts": 37, + "state": "SF", + "spkts": 17, + "dpkts": 27, "sbytes": 25517, + "dbytes": 17247, "appproto": "ssl", - "label": "Malware", + "label": "Malicious", "module_labels": { - "flowalerts-long-connection": "Malware" + "flowalerts-long-connection": "Malicious" }, } ) flows.append( { - "ts": 1382355032.706468, + "starttime": 1382355032.706468, "dur": "10.896695", "saddr": "147.32.83.52", "sport": "47956", "daddr": "80.242.138.72", "dport": "80", "proto": "tcp", - "origstate": "SRPA_SPA", - "state": "Established", - "pkts": 67, - "allbytes": 67696, + "state": "SF", "spkts": 1, + "dpkts": 0, "sbytes": 100, + "dbytes": 67596, "appproto": "http", - "label": "Normal", + "label": "Benign", "module_labels": { - "flowalerts-long-connection": "Normal" + "flowalerts-long-connection": "Benign" }, } ) @@ -314,42 +304,51 @@ def process_flows(self): self.print("Error in process_flows()") self.print(traceback.format_exc(), 0, 1) - def process_flow(self): + def process_flow(self, flow_to_process: dict): """ Process one flow. 
Only used during detection in testing - Store the pandas df in self.flow + returns the pandas df with the processed flow """ try: # Convert the flow to a pandas dataframe - raw_flow = pd.DataFrame(self.flow_dict, index=[0]) - # Process features + raw_flow = pd.DataFrame(flow_to_process, index=[0]) dflow = self.process_features(raw_flow) # Update the flow to the processed version - self.flow = dflow + return dflow except Exception: # Stop the timer self.print("Error in process_flow()") self.print(traceback.format_exc(), 0, 1) - def detect(self): + def detect(self, x_flow) -> Optional[numpy.ndarray]: """ - Detect this flow with the current model stored + Detects the given flow with the current model stored + and returns the predection array """ try: - # Store the real label if there is one - # y_flow = self.flow["label"] - # remove the real label column - self.flow = self.flow.drop("label", axis=1) - # remove the label predictions column of the other modules - X_flow = self.flow.drop("module_labels", axis=1) + # clean the flow + fields_to_drop = [ + "label", + "module_labels", + "uid", + "history", + "dir_", + "endtime", + "flow_source", + ] + for field in fields_to_drop: + try: + x_flow = x_flow.drop(field, axis=1) + except (KeyError, ValueError): + pass # Scale the flow - X_flow = self.scaler.transform(X_flow) - pred = self.clf.predict(X_flow) + x_flow: numpy.ndarray = self.scaler.transform(x_flow) + pred: numpy.ndarray = self.clf.predict(x_flow) return pred - except Exception: - # Stop the timer - self.print("Error in detect() X_flow:") - self.print(X_flow) + except Exception as e: + self.print( + f"Error in detect() while processing " f"\n{x_flow}\n{e}" + ) self.print(traceback.format_exc(), 0, 1) def store_model(self): @@ -357,10 +356,10 @@ def store_model(self): Store the trained model on disk """ self.print("Storing the trained model and scaler on disk.", 0, 2) - with open("./modules/flowmldetection/model.bin", "wb") as f: + with open(self.model_path, "wb") as f: data = pickle.dumps(self.clf) f.write(data) - with open("./modules/flowmldetection/scaler.bin", "wb") as g: + with open(self.scaler_path, "wb") as g: data = pickle.dumps(self.scaler) g.write(data) @@ -370,20 +369,23 @@ def read_model(self): """ try: self.print("Reading the trained model from disk.", 0, 2) - with open("./modules/flowmldetection/model.bin", "rb") as f: + with open(self.model_path, "rb") as f: self.clf = pickle.load(f) self.print("Reading the trained scaler from disk.", 0, 2) - with open("./modules/flowmldetection/scaler.bin", "rb") as g: + with open(self.scaler_path, "rb") as g: self.scaler = pickle.load(g) except FileNotFoundError: # If there is no model, create one empty - self.print("There was no model. Creating a new empty model.", 0, 2) + self.print( + "There was no model. " "Creating a new empty model.", 0, 2 + ) self.clf = SGDClassifier( warm_start=True, loss="hinge", penalty="l1" ) except EOFError: self.print( - "Error reading model from disk. Creating a new empty model.", + "Error reading model from disk. " + "Creating a new empty model.", 0, 2, ) @@ -391,39 +393,36 @@ def read_model(self): warm_start=True, loss="hinge", penalty="l1" ) - def set_evidence_malicious_flow( - self, - saddr: str, - sport: str, - daddr: str, - dport: str, - twid: str, - uid: str, - ): + def set_evidence_malicious_flow(self, flow: dict, twid: str): confidence: float = 0.1 - ip_identification = self.db.get_ip_identification(daddr) description = ( - f"Malicious flow by ML. 
Src IP {saddr}:{sport} to " - f"{daddr}:{dport} {ip_identification}" - ) - - timestamp = utils.convert_format( - datetime.datetime.now(), utils.alerts_format + f"Flow with malicious characteristics by ML. Src IP" + f" {flow['saddr']}:{flow['sport']} to " + f"{flow['daddr']}:{flow['dport']}" ) - + twid_number = int(twid.replace("timewindow", "")) evidence: Evidence = Evidence( evidence_type=EvidenceType.MALICIOUS_FLOW, attacker=Attacker( - direction=Direction.SRC, attacker_type=IoCType.IP, value=saddr + direction=Direction.SRC, + ioc_type=IoCType.IP, + value=flow["saddr"], + ), + victim=Victim( + direction=Direction.DST, + ioc_type=IoCType.IP, + value=flow["daddr"], ), threat_level=ThreatLevel.LOW, confidence=confidence, description=description, - profile=ProfileID(ip=saddr), - timewindow=TimeWindow(number=int(twid.replace("timewindow", ""))), - uid=[uid], - timestamp=timestamp, - category=IDEACategory.ANOMALY_TRAFFIC, + profile=ProfileID(ip=flow["saddr"]), + timewindow=TimeWindow(twid_number), + uid=[flow["uid"]], + timestamp=flow["starttime"], + method=Method.AI, + src_port=flow["sport"], + dst_port=flow["dport"], ) self.db.set_evidence(evidence) @@ -440,17 +439,20 @@ def pre_main(self): def main(self): if msg := self.get_msg("new_flow"): - data = msg["data"] - data = json.loads(data) - # profileid = data["profileid"] - twid = data["twid"] - flow = data["flow"] - flow = json.loads(flow) - # Convert the common fields to something that can - # be interpreted - # Get the uid which is the key - uid = next(iter(flow)) - self.flow_dict = json.loads(flow[uid]) + # When a new flow arrives + msg = json.loads(msg["data"]) + self.twid = msg["twid"] + self.profileid = msg["profileid"] + self.flow = msg["flow"] + # These following extra fields are expected in testing. update the original + # flow dict to have them + self.flow.update( + { + "state": msg["interpreted_state"], + "label": msg["label"], + "module_labels": msg["module_labels"], + } + ) if self.mode == "train": # We are training @@ -459,55 +461,69 @@ def main(self): # Use labeled flows labels = self.db.get_labels() sum_labeled_flows = sum(i[1] for i in labels) + + # The min labels to retrain is the min number of flows + # we should have seen so far in this capture to start training + # This is so we dont _start_ training with only 1 flow + + # Once we are over the start minimum, the second condition is + # to force to retrain every a minimum_labels_to_retrain number + # of flows. So we dont retrain every 1 flow. if ( - sum_labeled_flows >= self.minimum_lables_to_retrain - and sum_labeled_flows % self.minimum_lables_to_retrain == 1 + sum_labeled_flows >= self.minimum_labels_to_start_train ): - # We get here every 'self.minimum_lables_to_retrain' amount of labels - # So for example we retrain every 100 labels and only when we have at least 100 labels - self.print( - f"Training the model with the last group of flows and labels. Total flows: {sum_labeled_flows}." - ) - # Process all flows in the DB and make them ready for pandas - self.process_flows() - # Train an algorithm - self.train() + if (sum_labeled_flows - self.last_number_of_flows_when_trained >= self.minimum_labels_to_retrain): + # So for example we retrain every 50 labels and only when + # we have at least 50 labels + self.print( + f"Training the model with the last group of " + f"flows and labels. Total flows: {sum_labeled_flows}." 
+ ) + # Process all flows in the DB and make them ready + # for pandas + self.process_training_flows() + # Train an algorithm + self.train() + self.last_number_of_flows_when_trained = sum_labeled_flows + elif self.mode == "test": # We are testing, which means using the model to detect - self.process_flow() + processed_flow = self.process_flow(self.flow) - # After processing the flow, it may happen that we delete icmp/arp/etc - # so the dataframe can be empty - if self.flow is not None and not self.flow.empty: + # After processing the flow, it may happen that we + # delete icmp/arp/etc so the dataframe can be empty + if processed_flow is not None and not processed_flow.empty: # Predict - pred = self.detect() - label = self.flow_dict["label"] + pred: numpy.ndarray = self.detect(processed_flow) + if not pred: + # an error occurred + return - # Report + label = self.flow["label"] if label and label != "unknown" and label != pred[0]: - # If the user specified a label in test mode, and the label - # is diff from the prediction, print in debug mode + # If the user specified a label in test mode, + # and the label is diff from the prediction, + # print in debug mode self.print( - f'Report Prediction {pred[0]} for label {label} flow {self.flow_dict["saddr"]}:' - f'{self.flow_dict["sport"]} -> {self.flow_dict["daddr"]}:' - f'{self.flow_dict["dport"]}/{self.flow_dict["proto"]}', + f"Predicted {pred[0]} for ground-truth label" + f' {label}. Flow {self.flow["saddr"]}:' + f'{self.flow["sport"]} ->' + f' {self.flow["daddr"]}:' + f'{self.flow["dport"]}/' + f'{self.flow["proto"]}', 0, 3, ) - if pred[0] == "Malware": + if pred[0] == "Malicious": # Generate an alert - self.set_evidence_malicious_flow( - self.flow_dict["saddr"], - self.flow_dict["sport"], - self.flow_dict["daddr"], - self.flow_dict["dport"], - twid, - uid, - ) + self.set_evidence_malicious_flow(self.flow, self.twid) self.print( - f'Prediction {pred[0]} for label {label} flow {self.flow_dict["saddr"]}:' - f'{self.flow_dict["sport"]} -> {self.flow_dict["daddr"]}:' - f'{self.flow_dict["dport"]}/{self.flow_dict["proto"]}', + f"Prediction {pred[0]} for label {label}" + f' flow {self.flow["saddr"]}:' + f'{self.flow["sport"]} -> ' + f'{self.flow["daddr"]}:' + f'{self.flow["dport"]}/' + f'{self.flow["proto"]}', 0, 2, ) From 57e144cc7fe5f3dda58e0db65af60bd23cac5aa2 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Wed, 19 Mar 2025 14:22:38 +0100 Subject: [PATCH 444/498] flowml. If the dataset has one flow and that is deleted, then return empty fast. --- modules/flowmldetection/flowmldetection.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 1cfbaf925..0bfaef283 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -135,6 +135,11 @@ def process_features(self, dataset): for proto in to_discard: dataset = dataset[dataset.proto != proto] + # If te proto is in the list to delete and there is only one flow, then the dataset will be empty + if dataset.empty: + # DataFrame is empty now, so return empty + return dataset + # For now, discard these to_drop = [ "appproto", From 5c562206d67d1e98ff72f75af90a2c27685724c5 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Wed, 19 Mar 2025 14:23:05 +0100 Subject: [PATCH 445/498] flowml. If the datasert is empty. 
Return none --- modules/flowmldetection/flowmldetection.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 0bfaef283..df1572fa5 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -318,6 +318,8 @@ def process_flow(self, flow_to_process: dict): # Convert the flow to a pandas dataframe raw_flow = pd.DataFrame(flow_to_process, index=[0]) dflow = self.process_features(raw_flow) + if dflow.empty: + return None # Update the flow to the processed version return dflow except Exception: From a8c11a868b4bc7d5919344c8211c6bfac164c343 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Wed, 19 Mar 2025 14:27:16 +0100 Subject: [PATCH 446/498] First new version of the model and scaler. Not good yet, but working. --- modules/flowmldetection/model.bin | Bin 1124 -> 1090 bytes modules/flowmldetection/scaler.bin | Bin 890 -> 890 bytes 2 files changed, 0 insertions(+), 0 deletions(-) diff --git a/modules/flowmldetection/model.bin b/modules/flowmldetection/model.bin index a6648cf72179520975b0e9ad1164f7d574e87140..7ea3d27c97ca27abe7bcef4f35f2057da4bf2b0f 100644 GIT binary patch delta 130 zcmV-|0Db@D2*L;ifCQB{u>>gtTsw(FScJ$7xocP4|5 zn$$WZz^$}67u&zYKfDP4vYKo~KL^fCQCUu>>gtlhXnvSy-jcOc_je6TteGguV+y7r>O2=ii6N`@aE8 z4q_05T(v)g91VHfmFeIMvRKFpJJ~89v lBES;fQwQSX>_3x<11kt29Z*;7CnUg=yaQDN?(vfo1TSe6JFWl# diff --git a/modules/flowmldetection/scaler.bin b/modules/flowmldetection/scaler.bin index 17115724b9536f6093f9d72f3b58a5c22c562a9a..bfba4d107224e5e6e5a1e8c8f4d463b48131d111 100644 GIT binary patch delta 290 zcmV+-0p0%k2KolDvjGBX0h6@>Pgl=^Tj5WVAwXGOhP~3C<3D)c8d<%4*gy{6w?@|U z%s@r+gj&Fu5J2_kY{!Qn^*`?T5Tx7$dq73=gj&E@c0d))6KVlt!ao`IN`KC-h(JOu zU`*oR6_Z;6CRfo>BRz3ectB?@J_D&+&OhD+nN>A{w?PryCGcfmcQ~I?m(e2&@mlG zqd?6RN*?ZoGC;4up#2u+D?s5YufU*hus|2)Nzwn}vOwP`V1tl2?mv8p{76u#!9bEa zjm~^LK$BYnCRf}Ju^9H~BS8=X&<+7gXFor!l)9lJXF)UUPqWf%b3r-wJEWTtnm`Ac zf^|*Gc0elJHxDoXo Date: Thu, 20 Mar 2025 13:16:06 +0100 Subject: [PATCH 447/498] model and scaler with 1 malicious and 1 benign --- modules/flowmldetection/model.bin | Bin 1090 -> 1124 bytes modules/flowmldetection/scaler.bin | Bin 890 -> 890 bytes 2 files changed, 0 insertions(+), 0 deletions(-) diff --git a/modules/flowmldetection/model.bin b/modules/flowmldetection/model.bin index 7ea3d27c97ca27abe7bcef4f35f2057da4bf2b0f..0fac693b39f8e2f0e826471e72a52010709a2a4a 100644 GIT binary patch delta 132 zcmX@a@q~k=fn{psMix!x$(NZ_BO`RCb>5jSYK$N!g2|oC+8oRs(l>gtTsw(FScJ$7xocP4|5 zn$$WZz^$}67u&zYKfDP4vYKo~KfmcQ~I?m(e2&@mlG zqd?6RN*?ZoGC;4up#2u+D?s5YufU*hus|2)Nzwn}vOwP`V1tl2?mv8p{76u#!9bEa zjm~^LK$BYnCRf}Ju^9H~BS8=X&<+7gXFor!l)9lJXF)UUPqWf%b3r!tJEWTtnm`Ac zf^|*Gc0elJHxDoXoPgl=^Tj5WVAwXGOhP~3C<3D)c8d<%4*gy{6w?@|U z%s@r+gj&Fu5J2_kY{!Qn^*`?T5Tx7$dq73=gj&E@c0d))6KVlt!ao`IN`KC-h(JOu zU`*oR6_Z;6CRfo>BRz3ectB?@J_D&+&OhD+nN>A{w?PryCGc Date: Thu, 20 Mar 2025 13:16:27 +0100 Subject: [PATCH 448/498] cleaner jupyter --- modules/flowmldetection/flowmldetection.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index df1572fa5..a9b8a1358 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -343,6 +343,23 @@ def detect(self, x_flow) -> Optional[numpy.ndarray]: "endtime", "flow_source", ] + # For argus binetflows this fails because ther is a field calle bytes that was not in other flows. 
It should be called allbytes. + # Error + ''' [Flow ML Detection] Error in detect() while processing + dur proto sport dport state pkts spkts dpkts bytes sbytes dbytes allbytes + 0 63.822830 0 56119 981 0.0 15 15 0 8764 1887 0 1887 + The feature names should match those that were passed during fit. + Feature names unseen at fit time: + - bytes + ''' + + # IF we delete here the filed bytes the error is + # [Flow ML Detection] Error in detect() while processing + # dur proto sport dport state pkts spkts dpkts sbytes dbytes allbytes + # 0 63.822830 0 56120 980 0.0 15 15 0 1887 0 1887 + # The feature names should match those that were passed during fit. + # Feature names must be in the same order as they were in fit. + for field in fields_to_drop: try: x_flow = x_flow.drop(field, axis=1) From 9682f8c59aaa9a372f73447a7579c1ee2bfc478c Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Thu, 20 Mar 2025 22:26:27 +0100 Subject: [PATCH 449/498] New models after 3rd ttrain --- modules/flowmldetection/model.bin | Bin 1124 -> 1124 bytes modules/flowmldetection/scaler.bin | Bin 890 -> 890 bytes 2 files changed, 0 insertions(+), 0 deletions(-) diff --git a/modules/flowmldetection/model.bin b/modules/flowmldetection/model.bin index 0fac693b39f8e2f0e826471e72a52010709a2a4a..5ebcb37205a7a3e8d0a20ab78a1219f0ddf95dd8 100644 GIT binary patch delta 99 zcmaFD@q}YTFtfkevYaFSxkd-_3;rCozh!)2lYmpEbEdGvA?N`Qv**dmWkY#6EnbNU>V$pfHc5?(6z~90v14#>uEKZXcoY2J@~Q; zm@vT916v-#h9y9THMMtpw*Eia3Y9($*ABq#7l@Rev)(_(4lO-NtuB+I1CayN#ekDG F1TN2UERg^J diff --git a/modules/flowmldetection/scaler.bin b/modules/flowmldetection/scaler.bin index 758909b289238ff282b2e056a9b3e83768b8472a..821344a0c69d116622b02e2a0daa1554cb5d308e 100644 GIT binary patch delta 43 zcmV+`0M!5b2KolDfdU!c4zU>a=p#W80?-ZtN@qVmt(3Za=p#W80?-ZtN@qVmt(3Z Date: Wed, 26 Mar 2025 00:08:50 +0100 Subject: [PATCH 450/498] Models after 4th train --- modules/flowmldetection/model.bin | Bin 1124 -> 1124 bytes modules/flowmldetection/scaler.bin | Bin 890 -> 890 bytes 2 files changed, 0 insertions(+), 0 deletions(-) diff --git a/modules/flowmldetection/model.bin b/modules/flowmldetection/model.bin index 5ebcb37205a7a3e8d0a20ab78a1219f0ddf95dd8..3ab5a240bb45f88d026d1d9d1959cfa384e2473b 100644 GIT binary patch delta 120 zcmV-;0EhqN2;>N`Qv+ChN@}Q2`wPIHjIJ0X6&b*sXgPyxz!<>7-o1V`boIY<&cGv* zMA;T^cz6USY()1i3vc*(ICW>9REM~el3OL8yLVz+8BZ7+Y~^P as{<PghU~{@&OfB0!%%<|y61;y>t*IZ8K{*+AXCvU*qB z%0Pe2QM!F)5kR-EtllLL_CF>M2ApwSd_Z`~g8^q;cR(Gb+gDca!aq;!#7^woi9l`m z{4q!A7L!{7CRc2&pGU{qctGGXl*-2r&p+4h*(lZXw?Qu&=iylTwLyoc9%Oz?d_V=A za=mcAXh1m7B9yUmbwL&Oq6=*NZ9yf7dx}OPz(0XAuDu67ghArqkBr`9lats2C|9(o zHuM(pGeE6wqA`7j-#@s^-O-hk(?CdW8YGjz(m<`3N;vgKH9+Ch9fX%7EI^{)^od!9 ovOtBo|Dy%^u0ZgT2LsNy);|K#Ksj}DxjML6cVlBO2|D9RL6T delta 290 zcmV+-0p0%k2KolDvjGBo1(US_Pgl4iHtM!%U_hWRf%FVpXFo>fmcQ~I?m(e2&@mlG zqd?6RN*?ZoGC;4up#2u+D?s5YufU*hus|2)Nzwn}vOwP`V1tl2?mv8p{76u#!9bEa zjm~^LK$BYnCRg4Lu^9H~BS8=X&<+7gXFor!l)9lJXF)RTPqWf%b3r!tJEWTtnm`Ac zf^|*Gc0elJHxDoXoty4FCWD From 237b6ef13aca3eddca3de9b5cf8f255260238bb6 Mon Sep 17 00:00:00 2001 From: Sebas Garcia Date: Wed, 26 Mar 2025 08:28:59 +0100 Subject: [PATCH 451/498] Models of ml flow with the first good performance in small tests --- modules/flowmldetection/model.bin | Bin 1124 -> 1124 bytes modules/flowmldetection/scaler.bin | Bin 890 -> 890 bytes 2 files changed, 0 insertions(+), 0 deletions(-) diff --git a/modules/flowmldetection/model.bin b/modules/flowmldetection/model.bin index 3ab5a240bb45f88d026d1d9d1959cfa384e2473b..a6648cf72179520975b0e9ad1164f7d574e87140 100644 GIT 
binary patch delta 121 zcmV-<0EYkM2;>N`Qv+C~&P*9hb`!w*mV~|wLl?l5mFM4w$NRqlOAcXmpyfaAG1(); zYA3+ulRv$a;~zkIU#E>ocI-ba#L|>%Hv~ZN`Y?b6^limYW1McwvlQsk{8#y@u delta 121 zcmV-<0EYkM2;>N`Qv+ChN@}Q2`wPIHjIJ0X6&b*sXgPyxz!<>7-o1V`boIY<&cGv* zMA;T^cz6USY()1i3vc*(ICW>9REM~el3OL8yLVz+8BZ7+Y~^P bs{<fmcQ~I?m(e2&@mlG zqd?6RN*?ZoGC;4up#2u+D?s5YufU*hus|2)Nzwn}vOwP`V1tl2?mv8p{76u#!9bEa zjm~^LK$BYnCRf}Ju^9H~BS8=X&<+7gXFor!l)9lJXF)UUPqWf%b3r-wJEWTtnm`Ac zf^|*Gc0elJHxDoXoPghU~{@&OfB0!%%<|y61;y>t*IZ8K{*+AXCvU*qB z%0Pe2QM!F)5kR-EtllLL_CF>M2ApwSd_Z`~g8^q;cR(Gb+gDca!aq;!#7^woi9l`m z{4q!A7L!{7CRc2&pGU{qctGGXl*-2r&p+4h*(lZXw?Qu&=iylTwLyoc9%Oz?d_V=A za=mcAXh1m7B9yUmbwL&Oq6=*NZ9yf7dx}OPz(0XAuDu67ghArqkBr`9lats2C|9(o zHuM(pGeE6wqA`7j-#@s^-O-hk(?CdW8YGjz(m<`3N;vgKH9+Ch9fX%7EI^{)^od!9 ovOtBo|Dy%^u0ZgT2LsNy);|K#Ksj}DxjML6cVlBO2|D9RL6T From 43aae2e88f823e4a3d5e751b02b521d5487d231e Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 16:49:23 +0000 Subject: [PATCH 452/498] Add plot for flowml train scores --- modules/flowmldetection/plot_train_score.py | 56 +++++++++++++++++++++ 1 file changed, 56 insertions(+) create mode 100644 modules/flowmldetection/plot_train_score.py diff --git a/modules/flowmldetection/plot_train_score.py b/modules/flowmldetection/plot_train_score.py new file mode 100644 index 000000000..0b5b5b72b --- /dev/null +++ b/modules/flowmldetection/plot_train_score.py @@ -0,0 +1,56 @@ +import pandas as pd +import matplotlib.pyplot as plt +import re +import sys + +def plot_log_data(file_path): + # Read the log data from the file + with open(file_path, 'r') as file: + log_data = file.read() + + # Define regex pattern to extract relevant data from each line + pattern = r"Background: (\d+). Benign: (\d+). Malicious: (\d+). Total labels: (\d+\.\d+). Score: (\d+\.\d+)" + + # Parse the log file + data = re.findall(pattern, log_data) + + # Convert data to a DataFrame + df = pd.DataFrame(data, columns=["Background", "Benign", "Malicious", "Total labels", "Score"]) + df = df.astype({ + "Background": int, + "Benign": int, + "Malicious": int, + "Total labels": float, + "Score": float + }) + + # Plotting the values + fig, ax1 = plt.subplots(figsize=(10, 6)) + + # Plotting Score on the left y-axis + ax1.plot(df.index, df["Score"], label="Score", color='tab:blue') + ax1.set_xlabel('Index') + ax1.set_ylabel('Score', color='tab:blue') + ax1.tick_params(axis='y', labelcolor='tab:blue') + + # Create the second y-axis for the Total labels + ax2 = ax1.twinx() + ax2.plot(df.index, df["Total labels"], label="Total labels", color='tab:red') + ax2.set_ylabel('Total labels', color='tab:red') + ax2.tick_params(axis='y', labelcolor='tab:red') + + # Adding title and legend + plt.title('Log Data Visualization') + fig.tight_layout() + + # Save plot to a PNG file + plt.savefig('log_data_plot_with_two_scales.png') + + # Display the plot + plt.show() + +# Make sure the file path is passed as an argument +if len(sys.argv) < 2: + print("Please provide the path to the log file as a parameter.") +else: + plot_log_data(sys.argv[1]) From 6f045c72b8ac57f7b866f8cd14b0fe98fc668a9c Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 16:50:04 +0000 Subject: [PATCH 453/498] Add a log file to store the training data output --- modules/flowmldetection/flowmldetection.py | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index a9b8a1358..8a319cb4e 100644 --- a/modules/flowmldetection/flowmldetection.py +++ 
b/modules/flowmldetection/flowmldetection.py
@@ -68,12 +68,29 @@ def init(self):
         self.model_path = "./modules/flowmldetection/model.bin"
         self.scaler_path = "./modules/flowmldetection/scaler.bin"

+        # Initialize the training log file
+        self.training_log_path = "./modules/flowmldetection/training.log"
+        with open(self.training_log_path, "w") as log_file:
+            log_file.write("Training Log Initialized\n")
+
     def read_configuration(self):
         conf = ConfigParser()
         self.mode = conf.get_ml_mode()
+        # This is the global label in the configuration,
+        # in case the flows do not have a label themselves
         self.label = conf.label()

-    def train(self):
+    def write_to_training_log(self, message: str):
+        """
+        Write a message to the training log file.
+        """
+        try:
+            with open(self.training_log_path, "a") as log_file:
+                log_file.write(message + "\n")
+        except Exception as e:
+            self.print(f"Error writing to training log: {e}", 0, 1)
+
+    def train(self, sum_labeled_flows):
         """
         Train a model based on the flows we receive and the labels
         """

From 8a42f14ad61b5230c8426dbfef1f8bc0bd839a0b Mon Sep 17 00:00:00 2001
From: Seba Garcia
Date: Sat, 3 May 2025 16:50:32 +0000
Subject: [PATCH 454/498] Store training data in the log file

---
 modules/flowmldetection/flowmldetection.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 8a319cb4e..28e8e7eca 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -137,9 +137,13 @@ def train(self, sum_labeled_flows):
             # Store the models on disk
             self.store_model()

+            # Log training information
+            self.write_to_training_log(f"Training completed. Background: {epoch_label_counts['Background']}. Benign: {epoch_label_counts['Benign']}. Malicious: {epoch_label_counts['Malicious']}. Total labels: {sum_labeled_flows}. Score: {score}")
+            #self.write_to_training_log(f"Model parameters: {self.clf.coef_}")
         except Exception:
             self.print("Error in train().", 0, 1)
             self.print(traceback.format_exc(), 0, 1)
+            self.write_to_training_log("Error occurred during training.")
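For reference, the "Training completed. ..." line written by write_to_training_log() above is the same line that plot_train_score.py (added in PATCH 452) parses back with a regular expression. Below is a minimal, self-contained sketch of that round trip; the counts and score are invented for illustration. Note that the parser's pattern requires a decimal point in "Total labels", so the line only matches when that value is logged as a float:

    import re

    # Example line in the format written by write_to_training_log() above
    # (the numbers are invented for illustration):
    line = (
        "Training completed. Background: 120. Benign: 300. "
        "Malicious: 45. Total labels: 465.0. Score: 0.93"
    )
    # The same pattern plot_train_score.py uses to extract the fields
    pattern = (
        r"Background: (\d+). Benign: (\d+). Malicious: (\d+). "
        r"Total labels: (\d+\.\d+). Score: (\d+\.\d+)"
    )
    print(re.findall(pattern, line))
    # -> [('120', '300', '45', '465.0', '0.93')]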
From f4dd77bff3cdb4428269ab005fb0c4b451efc9f8 Mon Sep 17 00:00:00 2001
From: Seba Garcia
Date: Sat, 3 May 2025 16:50:53 +0000
Subject: [PATCH 455/498] better comments

---
 modules/flowmldetection/flowmldetection.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 28e8e7eca..676907a6d 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -59,10 +59,9 @@ def init(self):
         self.minimum_labels_to_start_train = 50
         # Minum amount of new labels needed to retrain
         self.minimum_labels_to_retrain = 50
-        # The number of flows when last trained
+        # The number of flows when last trained. Used internally only to know
+        # when to retrain
         self.last_number_of_flows_when_trained = 0
-        # To plot the scores of training
-        # self.scores = []
         # The scaler trained during training and to use during testing
         self.scaler = StandardScaler()

From 7e72af1c156068ff3e4b91217d53830c9a4f6262 Mon Sep 17 00:00:00 2001
From: Seba Garcia
Date: Sat, 3 May 2025 16:51:30 +0000
Subject: [PATCH 456/498] Fix issue not dropping detailed labels

---
 modules/flowmldetection/flowmldetection.py | 16 ++++++----------
 1 file changed, 6 insertions(+), 10 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 676907a6d..483c6a1d6 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -94,23 +94,19 @@ def train(self, sum_labeled_flows):
         Train a model based on the flows we receive and the labels
         """
         try:
-            # Get the flows from the DB
-            # self.flows = self.db.get_all_flows_in_profileid_twid(self.profileid, self.twid)
-            # Convert to pandas df
-            # self.flows = pd.DataFrame(self.flows)
-            # Process the features
-            # X_flow = self.process_features(self.flows)
-
             # Create X_flow with the current flows minus the label
-            X_flow = self.flows.drop("label", axis=1)
-            # Create y_flow with the label
-            y_flow = numpy.full(X_flow.shape[0], self.label)
+            X_flow = self.flows.drop("ground_truth_label", axis=1)
+            # Drop the detailed labels
+            X_flow = X_flow.drop("detailed_ground_truth_label", axis=1)
             # Drop the module_labels
             X_flow = X_flow.drop("module_labels", axis=1)
+            # Create y_flow with the label
+            y_flow = numpy.full(X_flow.shape[0], self.flows.ground_truth_label)

             # Normalize this batch of data so far. This can get progressivle slow
             X_flow = self.scaler.fit_transform(X_flow)

From beaf213d6167832d8c3f1e98eb6bc98d2e40d29d Mon Sep 17 00:00:00 2001
From: Seba Garcia
Date: Sat, 3 May 2025 16:51:53 +0000
Subject: [PATCH 457/498] Fix issue that not all labels were given to the partial fit

---
 modules/flowmldetection/flowmldetection.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 483c6a1d6..b06c9a54e 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -109,8 +109,9 @@ def train(self, sum_labeled_flows):

             # Train
             try:
+                # Online incremental learning
                 self.clf.partial_fit(
-                    X_flow, y_flow, classes=["Malicious", "Benign"]
+                    X_flow, y_flow, classes=["Background", "Malicious", "Benign"]
                 )
             except Exception:
                 self.print("Error while calling clf.train()")
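Why the class list above matters: with scikit-learn's out-of-core API, the first call to partial_fit() must enumerate every label the stream can ever contain, because later batches may hold labels not yet seen. A minimal, self-contained sketch of this behaviour with invented toy data (not Slips code):

    import numpy as np
    from sklearn.linear_model import SGDClassifier

    clf = SGDClassifier(warm_start=True, loss="hinge", penalty="l1")
    X = np.array([[0.1, 0.2], [0.9, 0.8]])
    # First batch: only two of the three labels appear in y, yet all
    # three must be declared up front via classes=.
    clf.partial_fit(
        X,
        ["Benign", "Malicious"],
        classes=["Background", "Malicious", "Benign"],
    )
    # Later batches can then introduce "Background" without an error,
    # and classes= is no longer needed.
    clf.partial_fit(X, ["Background", "Benign"])
    print(clf.predict(X))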
From 5b290a7fc764e26766d3519bbafe54b43cdae603 Mon Sep 17 00:00:00 2001
From: Seba Garcia
Date: Sat, 3 May 2025 16:52:08 +0000
Subject: [PATCH 458/498] count partial labels in this epoch

---
 modules/flowmldetection/flowmldetection.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index b06c9a54e..184a6b345 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -106,6 +106,12 @@ def train(self, sum_labeled_flows):

             # Normalize this batch of data so far. This can get progressivle slow
             X_flow = self.scaler.fit_transform(X_flow)
+            # Count the number of labels of each type in this epoch
+            epoch_label_counts = {
+                "Background": (y_flow == "Background").sum(),
+                "Malicious": (y_flow == "Malicious").sum(),
+                "Benign": (y_flow == "Benign").sum(),
+            }

             # Train
             try:

From 1cb44821b4885c0a648bf5183dfdde83c4d71cc8 Mon Sep 17 00:00:00 2001
From: Seba Garcia
Date: Sat, 3 May 2025 16:55:09 +0000
Subject: [PATCH 459/498] Don't print training output to the screen

---
 modules/flowmldetection/flowmldetection.py | 11 ++---------
 1 file changed, 2 insertions(+), 9 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 184a6b345..4dd8191f8 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -126,15 +126,8 @@ def train(self, sum_labeled_flows):

             # See score so far in training
             score = self.clf.score(X_flow, y_flow)

-            # To debug the training score
-            # self.scores.append(score)
-
-            self.print(f"  Training Score: {score}", 0, 1)
-            # self.print(f'  Model Parameters: {self.clf.coef_}')
-
-            # Debug code to store a plot in a png of the scores
-            # plt.plot(self.scores)
-            # plt.savefig('train-scores.png')
+            #self.print(f"  Training Score: {score}", 1, 0)
+            #self.print(f'  Model Parameters: {self.clf.coef_}', 1, 0)

             # Store the models on disk
             self.store_model()

From a38524eada2e31b202392335cf470a1b08bbd25f Mon Sep 17 00:00:00 2001
From: Seba Garcia
Date: Sat, 3 May 2025 16:55:28 +0000
Subject: [PATCH 460/498] Add function to write to train log

---
 modules/flowmldetection/flowmldetection.py | 34 ++++++++++++----------
 1 file changed, 18 insertions(+), 16 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 4dd8191f8..679e7c0cc 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -247,28 +247,28 @@ def process_features(self, dataset):
             self.print("Error in process_features()")
             self.print(traceback.format_exc(), 0, 1)

-    def process_training_flows(self):
+    def process_training_flows(self, last_number_of_flows_when_trained):
         """
-        Process all the flows in the DB
+        Process only the new flows in the DB since the last training.
         Store the pandas df in self.flows
         """
         try:
+            # Ensure the index is an integer
+            if last_number_of_flows_when_trained is None:
+                last_number_of_flows_when_trained = 0
+            else:
+                last_number_of_flows_when_trained = int(last_number_of_flows_when_trained)
+
             # We get all the flows so far
-            # because this retraining happens in batches
             flows = self.db.get_all_flows()
-            # Check how many different labels are in the DB
-            # We need both normal and malware
+            # Only process new flows since last training
+            new_flows = flows[last_number_of_flows_when_trained:]
+
+            # Check how many **different** labels are in the DB
             labels = self.db.get_labels()
             if len(labels) == 1:
-                # Only 1 label has flows
-                # There are not enough different labels, so insert two flows
-                # that are fake but representative of a normal and malware flow
-                # they are only for the training process
-                # At least 1 flow of each label is required
-
-            # These flows should be in the same format as the ones in the DB.
-            # Which means the satate is still SF, S0, etc. 
- flows.append( + # Insert fake flows for both classes if needed + new_flows.append( { "starttime": 1594417039.029793, "dur": "1.9424750804901123", @@ -358,6 +358,8 @@ def detect(self, x_flow) -> Optional[numpy.ndarray]: "dir_", "endtime", "flow_source", + "ground_truth_label", # todo now we can use them + "detailed_ground_truth_label", ] # For argus binetflows this fails because ther is a field calle bytes that was not in other flows. It should be called allbytes. # Error @@ -502,11 +504,11 @@ def main(self): labels = self.db.get_labels() sum_labeled_flows = sum(i[1] for i in labels) - # The min labels to retrain is the min number of flows + # The min labels to retrain is the min number of flows # we should have seen so far in this capture to start training # This is so we dont _start_ training with only 1 flow - # Once we are over the start minimum, the second condition is + # Once we are over the start minimum, the second condition is # to force to retrain every a minimum_labels_to_retrain number # of flows. So we dont retrain every 1 flow. if ( From 9a888b7055b804316775159042255e84a191869c Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 16:57:27 +0000 Subject: [PATCH 461/498] Fix label in dummy flow --- modules/flowmldetection/flowmldetection.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 679e7c0cc..95c9b82a7 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -283,13 +283,13 @@ def process_training_flows(self, last_number_of_flows_when_trained): "sbytes": 25517, "dbytes": 17247, "appproto": "ssl", - "label": "Malicious", + "ground_truth_label": "Malicious", "module_labels": { "flowalerts-long-connection": "Malicious" }, } ) - flows.append( + new_flows.append( { "starttime": 1382355032.706468, "dur": "10.896695", From 8f8a5443834244a4522f80ef17cdb073d3976bc4 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 16:57:39 +0000 Subject: [PATCH 462/498] Fix dummy flow --- modules/flowmldetection/flowmldetection.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 95c9b82a7..5ea48fbc4 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -304,7 +304,7 @@ def process_training_flows(self, last_number_of_flows_when_trained): "sbytes": 100, "dbytes": 67596, "appproto": "http", - "label": "Benign", + "ground_truth_label": "Benign", "module_labels": { "flowalerts-long-connection": "Benign" }, From d27350f5678356eda2dfdea7722c4a2567a3a93f Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 16:58:28 +0000 Subject: [PATCH 463/498] Rename variable --- modules/flowmldetection/flowmldetection.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 5ea48fbc4..ff68b8a27 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -310,10 +310,9 @@ def process_training_flows(self, last_number_of_flows_when_trained): }, } ) - # If there are enough flows, we dont insert them anymore # Convert to pandas df - df_flows = pd.DataFrame(flows) + df_flows = pd.DataFrame(new_flows) # Process features df_flows = self.process_features(df_flows) @@ -321,7 +320,6 @@ def 
process_training_flows(self, last_number_of_flows_when_trained): # Update the flow to the processed version self.flows = df_flows except Exception: - # Stop the timer self.print("Error in process_flows()") self.print(traceback.format_exc(), 0, 1) From 4242689cf0a9b71ba877668080c5f7907d944d45 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 17:00:32 +0000 Subject: [PATCH 464/498] Fix dummy flow label --- modules/flowmldetection/flowmldetection.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index ff68b8a27..6b41b4029 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -356,7 +356,7 @@ def detect(self, x_flow) -> Optional[numpy.ndarray]: "dir_", "endtime", "flow_source", - "ground_truth_label", # todo now we can use them + "ground_truth_label", "detailed_ground_truth_label", ] # For argus binetflows this fails because ther is a field calle bytes that was not in other flows. It should be called allbytes. From 6d561e03770607761204e82b027fc8f167c0887e Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 17:00:47 +0000 Subject: [PATCH 465/498] Pass values to train function --- modules/flowmldetection/flowmldetection.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 6b41b4029..4d66aab85 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -521,9 +521,9 @@ def main(self): ) # Process all flows in the DB and make them ready # for pandas - self.process_training_flows() + self.process_training_flows(self.last_number_of_flows_when_trained) # Train an algorithm - self.train() + self.train(sum_labeled_flows) self.last_number_of_flows_when_trained = sum_labeled_flows elif self.mode == "test": From 50d892127da4c1bbaf150997363c3cc9b1d41f9a Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 17:01:47 +0000 Subject: [PATCH 466/498] import os --- modules/flowmldetection/flowmldetection.py | 1 + 1 file changed, 1 insertion(+) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 4d66aab85..766178e12 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -10,6 +10,7 @@ import json import traceback import warnings +import os from slips_files.common.parsers.config_parser import ConfigParser from slips_files.common.slips_utils import utils From a7cf82be948b4ff673f189d62d89276b1b385471 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 17:03:53 +0000 Subject: [PATCH 467/498] Delete old comments --- modules/flowmldetection/flowmldetection.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 766178e12..6c3bfc127 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -28,10 +28,6 @@ Method, ) -# Only for debbuging -# from matplotlib import pyplot as plt - - # This horrible hack is only to stop sklearn from printing those warnings def warn(*args, **kwargs): pass From 06add4106a0c833a368dad445a094a0a76f11f3d Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 17:13:22 +0000 Subject: [PATCH 468/498] Fix plots --- modules/flowmldetection/plot_train_score.py | 48 
++++++++++++++++----- 1 file changed, 38 insertions(+), 10 deletions(-) diff --git a/modules/flowmldetection/plot_train_score.py b/modules/flowmldetection/plot_train_score.py index 0b5b5b72b..359df04ef 100644 --- a/modules/flowmldetection/plot_train_score.py +++ b/modules/flowmldetection/plot_train_score.py @@ -2,6 +2,8 @@ import matplotlib.pyplot as plt import re import sys +import argparse +import os def plot_log_data(file_path): # Read the log data from the file @@ -24,33 +26,59 @@ def plot_log_data(file_path): "Score": float }) + # Get the directory of the log file to store the plot in the same folder + dir_name = os.path.dirname(file_path) + plot_file = os.path.join(dir_name, 'log_data_plot_with_two_scales.png') + # Plotting the values fig, ax1 = plt.subplots(figsize=(10, 6)) - # Plotting Score on the left y-axis + # Plotting Score on the left y-axis (with proper scaling from 0 to 1) ax1.plot(df.index, df["Score"], label="Score", color='tab:blue') ax1.set_xlabel('Index') ax1.set_ylabel('Score', color='tab:blue') + ax1.set_ylim(0, 1) # Set y-axis for Score from 0 to 1 ax1.tick_params(axis='y', labelcolor='tab:blue') - # Create the second y-axis for the Total labels + # Create the second y-axis for the Background, Benign, Malicious, Total labels ax2 = ax1.twinx() + ax2.plot(df.index, df["Background"], label="Background", color='tab:green', linestyle='--') + ax2.plot(df.index, df["Benign"], label="Benign", color='tab:orange', linestyle='--') + ax2.plot(df.index, df["Malicious"], label="Malicious", color='tab:pink', linestyle='--') ax2.plot(df.index, df["Total labels"], label="Total labels", color='tab:red') - ax2.set_ylabel('Total labels', color='tab:red') + ax2.set_ylabel('Background, Benign, Malicious, Total labels', color='tab:red') + + # Set appropriate scale for right y-axis based on the data + ax2.set_ylim(0, df[["Background", "Benign", "Malicious", "Total labels"]].max().max()) ax2.tick_params(axis='y', labelcolor='tab:red') # Adding title and legend plt.title('Log Data Visualization') fig.tight_layout() - # Save plot to a PNG file - plt.savefig('log_data_plot_with_two_scales.png') + # Adding the legend with increased space for readability + ax1.legend(loc='upper left', bbox_to_anchor=(1, 1), fontsize='small') + ax2.legend(loc='upper left', bbox_to_anchor=(1, 0.7), fontsize='small') + + # Increase right margin for better readability of legend + plt.subplots_adjust(right=0.75) + + # Save plot to the same folder as the log file + plt.savefig(plot_file) # Display the plot plt.show() -# Make sure the file path is passed as an argument -if len(sys.argv) < 2: - print("Please provide the path to the log file as a parameter.") -else: - plot_log_data(sys.argv[1]) +def main(): + # Parse command-line arguments + parser = argparse.ArgumentParser(description="Process a log file and plot the data with two y-axes.") + parser.add_argument('log_file', metavar='log_file', type=str, help="Path to the log file") + + # Handle -h / --help + args = parser.parse_args() + + # Call the function to process the log file + plot_log_data(args.log_file) + +if __name__ == "__main__": + main() From f5160524451637eb0ad20db0b277395d0683f368 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 17:14:58 +0000 Subject: [PATCH 469/498] Fix plot --- modules/flowmldetection/plot_train_score.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/modules/flowmldetection/plot_train_score.py b/modules/flowmldetection/plot_train_score.py index 359df04ef..c7f374a7f 100644 --- 
a/modules/flowmldetection/plot_train_score.py +++ b/modules/flowmldetection/plot_train_score.py @@ -40,18 +40,21 @@ def plot_log_data(file_path): ax1.set_ylim(0, 1) # Set y-axis for Score from 0 to 1 ax1.tick_params(axis='y', labelcolor='tab:blue') - # Create the second y-axis for the Background, Benign, Malicious, Total labels + # Create the second y-axis for the Background, Benign, Malicious ax2 = ax1.twinx() ax2.plot(df.index, df["Background"], label="Background", color='tab:green', linestyle='--') ax2.plot(df.index, df["Benign"], label="Benign", color='tab:orange', linestyle='--') ax2.plot(df.index, df["Malicious"], label="Malicious", color='tab:pink', linestyle='--') - ax2.plot(df.index, df["Total labels"], label="Total labels", color='tab:red') - ax2.set_ylabel('Background, Benign, Malicious, Total labels', color='tab:red') + ax2.set_ylabel('Background, Benign, Malicious', color='tab:red') # Set appropriate scale for right y-axis based on the data - ax2.set_ylim(0, df[["Background", "Benign", "Malicious", "Total labels"]].max().max()) + ax2.set_ylim(0, df[["Background", "Benign", "Malicious"]].max().max()) ax2.tick_params(axis='y', labelcolor='tab:red') + # Annotating Total labels as text on the plot + for i, value in enumerate(df["Total labels"]): + ax1.text(i, value, f'{value:.1f}', color='tab:gray', fontsize=8, ha='center', va='bottom') + # Adding title and legend plt.title('Log Data Visualization') fig.tight_layout() From d1b2bd882e7718d8923436b5485fe0e5398b4383 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 17:16:23 +0000 Subject: [PATCH 470/498] Fix plot --- modules/flowmldetection/plot_train_score.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/modules/flowmldetection/plot_train_score.py b/modules/flowmldetection/plot_train_score.py index c7f374a7f..4099c47c1 100644 --- a/modules/flowmldetection/plot_train_score.py +++ b/modules/flowmldetection/plot_train_score.py @@ -42,10 +42,10 @@ def plot_log_data(file_path): # Create the second y-axis for the Background, Benign, Malicious ax2 = ax1.twinx() - ax2.plot(df.index, df["Background"], label="Background", color='tab:green', linestyle='--') - ax2.plot(df.index, df["Benign"], label="Benign", color='tab:orange', linestyle='--') - ax2.plot(df.index, df["Malicious"], label="Malicious", color='tab:pink', linestyle='--') - ax2.set_ylabel('Background, Benign, Malicious', color='tab:red') + ax2.plot(df.index, df["Background"], label="Background Labels", color='tab:green', linestyle='--') + ax2.plot(df.index, df["Benign"], label="Benign Labels", color='tab:orange', linestyle='--') + ax2.plot(df.index, df["Malicious"], label="Malicious Labels", color='tab:pink', linestyle='--') + ax2.set_ylabel('Background, Benign, Malicious Labels', color='tab:red') # Set appropriate scale for right y-axis based on the data ax2.set_ylim(0, df[["Background", "Benign", "Malicious"]].max().max()) @@ -56,7 +56,7 @@ def plot_log_data(file_path): ax1.text(i, value, f'{value:.1f}', color='tab:gray', fontsize=8, ha='center', va='bottom') # Adding title and legend - plt.title('Log Data Visualization') + plt.title('Training performance') fig.tight_layout() # Adding the legend with increased space for readability From ba0e9f1a8cc05c044b76810c1e9fa164492732a5 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 17:24:43 +0000 Subject: [PATCH 471/498] Fix plot --- modules/flowmldetection/plot_train_score.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git 
a/modules/flowmldetection/plot_train_score.py b/modules/flowmldetection/plot_train_score.py index 4099c47c1..8437e968a 100644 --- a/modules/flowmldetection/plot_train_score.py +++ b/modules/flowmldetection/plot_train_score.py @@ -59,12 +59,12 @@ def plot_log_data(file_path): plt.title('Training performance') fig.tight_layout() - # Adding the legend with increased space for readability - ax1.legend(loc='upper left', bbox_to_anchor=(1, 1), fontsize='small') - ax2.legend(loc='upper left', bbox_to_anchor=(1, 0.7), fontsize='small') + # Move both legends further to the right + ax1.legend(loc='upper right', bbox_to_anchor=(1.26, 1), fontsize='small', ncol=1) + ax2.legend(loc='upper right', bbox_to_anchor=(1.4, 0.95), fontsize='small', ncol=1) # Increase right margin for better readability of legend - plt.subplots_adjust(right=0.75) + plt.subplots_adjust(right=0.7) # Save plot to the same folder as the log file plt.savefig(plot_file) From e089bec8ae86ab1fb938a03b08430b6eace488e2 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 18:02:34 +0000 Subject: [PATCH 472/498] Plot testing performance from a log --- .../plot_testing_performance.py | 116 ++++-------------- 1 file changed, 24 insertions(+), 92 deletions(-) diff --git a/modules/flowmldetection/plot_testing_performance.py b/modules/flowmldetection/plot_testing_performance.py index 6865415cd..a38c7f059 100644 --- a/modules/flowmldetection/plot_testing_performance.py +++ b/modules/flowmldetection/plot_testing_performance.py @@ -1,7 +1,6 @@ import matplotlib.pyplot as plt import sys import numpy as np -import argparse def process_file(file_path): # Initialize the counters for the values @@ -50,108 +49,41 @@ def process_file(file_path): return FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values -def plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values, experiment_number): - # Separate the values into two groups based on their proximity to 0 or 1 - close_to_0 = { - 'FPR': [], 'FNR': [] - } - close_to_1 = { - 'TNR': [], 'TPR': [], 'F1': [], 'accuracy': [], 'precision': [], 'MCC': [], 'recall': [] - } - - # Categorize the metrics into two groups - for i in range(len(FPR_values)): - close_to_0['FPR'].append(FPR_values[i]) - close_to_0['FNR'].append(FNR_values[i]) - - close_to_1['TNR'].append(TNR_values[i]) - close_to_1['TPR'].append(TPR_values[i]) - close_to_1['F1'].append(F1_values[i]) - close_to_1['accuracy'].append(accuracy_values[i]) - close_to_1['precision'].append(precision_values[i]) - close_to_1['MCC'].append(MCC_values[i]) - close_to_1['recall'].append(recall_values[i]) - - # Plot metrics for values close to 0 (linear scale) - plot_single_group(close_to_0, f'performance_metrics_testing_close_to_0_experiment_{experiment_number}.png', experiment_number, is_close_to_0=True) - - # Plot metrics for values close to 1 (log scale) - plot_single_group(close_to_1, f'performance_metrics_testing_close_to_1_experiment_{experiment_number}.png', experiment_number, is_close_to_0=False) - - # Print the final values - print("\nFinal Metric Values for Experiment", experiment_number) - print(f"Final FPR: {FPR_values[-1]:.4f}") - print(f"Final FNR: {FNR_values[-1]:.4f}") - print(f"Final TNR: {TNR_values[-1]:.4f}") - print(f"Final TPR: {TPR_values[-1]:.4f}") - print(f"Final F1 Score: {F1_values[-1]:.4f}") - print(f"Final Accuracy: {accuracy_values[-1]:.4f}") - print(f"Final Precision: 
{precision_values[-1]:.4f}") - print(f"Final MCC: {MCC_values[-1]:.4f}") - print(f"Final Recall: {recall_values[-1]:.4f}") - -def plot_single_group(metrics_dict, output_filename, experiment_number, is_close_to_0=False): +def plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values): + # Create the plot plt.figure(figsize=(12, 8)) - # Only plot the metrics that exist in the dictionary - if 'FPR' in metrics_dict: - plt.plot(metrics_dict['FPR'], label='False Positive Rate (FPR)', marker='o') - if 'FNR' in metrics_dict: - plt.plot(metrics_dict['FNR'], label='False Negative Rate (FNR)', marker='o') - if 'TNR' in metrics_dict: - plt.plot(metrics_dict['TNR'], label='True Negative Rate (TNR)', marker='o') - if 'TPR' in metrics_dict: - plt.plot(metrics_dict['TPR'], label='True Positive Rate (TPR)', marker='o') - if 'F1' in metrics_dict: - plt.plot(metrics_dict['F1'], label='F1 Score', marker='o') - if 'accuracy' in metrics_dict: - plt.plot(metrics_dict['accuracy'], label='Accuracy', marker='o') - if 'precision' in metrics_dict: - plt.plot(metrics_dict['precision'], label='Precision', marker='o') - if 'MCC' in metrics_dict: - plt.plot(metrics_dict['MCC'], label='Matthews Correlation Coefficient (MCC)', marker='o') - if 'recall' in metrics_dict: - plt.plot(metrics_dict['recall'], label='Recall (TPR)', marker='o') - - # If the plot is close to 1, apply log scale - if not is_close_to_0: - plt.yscale('log') - - # If the plot is close to 0, set dynamic Y-ticks based on the min/max values of the series - if is_close_to_0: - min_val = min(min(metrics_dict['FPR']), min(metrics_dict['FNR'])) - max_val = max(max(metrics_dict['FPR']), max(metrics_dict['FNR'])) - - # Avoid log(0), so set the minimum limit a little higher than zero - if min_val == 0: - min_val = 1e-4 # Avoid zero values on the logarithmic scale - - plt.ylim(min_val, max_val) # Set Y-axis limits based on the data range - plt.yticks(np.logspace(np.log10(min_val), np.log10(max_val), num=60)) # Set ticks logarithmically - - # Add the experiment number to the plot title + # Plot each metric + plt.plot(FPR_values, label='False Positive Rate (FPR)', marker='o') + plt.plot(FNR_values, label='False Negative Rate (FNR)', marker='o') + plt.plot(TNR_values, label='True Negative Rate (TNR)', marker='o') + plt.plot(TPR_values, label='True Positive Rate (TPR)', marker='o') + plt.plot(F1_values, label='F1 Score', marker='o') + plt.plot(accuracy_values, label='Accuracy', marker='o') + plt.plot(precision_values, label='Precision', marker='o') + plt.plot(MCC_values, label='Matthews Correlation Coefficient (MCC)', marker='o') + plt.plot(recall_values, label='Recall (TPR)', marker='o') + + # Add labels and title plt.xlabel('Index') plt.ylabel('Metric Value') - plt.title(f'Experiment {experiment_number} - Evaluation Metrics Over Time') + plt.title('Evaluation Metrics Over Time') + + # Add a legend plt.legend() - # Save the plot - plt.savefig(output_filename) + # Save the plot as a PNG file + plt.savefig('metrics_plot.png') plt.close() def main(): - # Set up argument parsing - parser = argparse.ArgumentParser(description='Plot testing performance metrics.') - parser.add_argument('-f', '--file', type=str, required=True, help='Path to the testing performance log file') - parser.add_argument('-e', '--experiment', type=str, required=True, help='Experiment number') - - args = parser.parse_args() - - file_path = args.file - experiment_number = args.experiment + if len(sys.argv) != 2: + print("Usage: 
python script.py ") + sys.exit(1) + file_path = sys.argv[1] FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values = process_file(file_path) - plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values, experiment_number) + plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values) if __name__ == "__main__": main() From 499f08bdbda9d16604b33df6e0b60c54cdec709d Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 18:04:32 +0000 Subject: [PATCH 473/498] Fix the plot --- modules/flowmldetection/plot_testing_performance.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/modules/flowmldetection/plot_testing_performance.py b/modules/flowmldetection/plot_testing_performance.py index a38c7f059..fac0acd64 100644 --- a/modules/flowmldetection/plot_testing_performance.py +++ b/modules/flowmldetection/plot_testing_performance.py @@ -64,16 +64,19 @@ def plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accu plt.plot(MCC_values, label='Matthews Correlation Coefficient (MCC)', marker='o') plt.plot(recall_values, label='Recall (TPR)', marker='o') + # Set logarithmic scale on the y-axis + plt.yscale('log') + # Add labels and title plt.xlabel('Index') - plt.ylabel('Metric Value') - plt.title('Evaluation Metrics Over Time') + plt.ylabel('Metric Value (Log Scale)') + plt.title('Evaluation Metrics Over Time (Log Scale)') # Add a legend plt.legend() # Save the plot as a PNG file - plt.savefig('metrics_plot.png') + plt.savefig('metrics_plot_log_scale.png') plt.close() def main(): From 9007dfbdaccdaaa852e6c1e30e93746fb6052478 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 18:12:40 +0000 Subject: [PATCH 474/498] Fix the plots --- .../plot_testing_performance.py | 76 ++++++++++++++----- 1 file changed, 55 insertions(+), 21 deletions(-) diff --git a/modules/flowmldetection/plot_testing_performance.py b/modules/flowmldetection/plot_testing_performance.py index fac0acd64..5581c72cd 100644 --- a/modules/flowmldetection/plot_testing_performance.py +++ b/modules/flowmldetection/plot_testing_performance.py @@ -50,33 +50,66 @@ def process_file(file_path): return FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values def plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values): - # Create the plot - plt.figure(figsize=(12, 8)) + # Separate the values into two groups based on their proximity to 0 or 1 + close_to_0 = { + 'FPR': [], 'FNR': [] + } + close_to_1 = { + 'TNR': [], 'TPR': [], 'F1': [], 'accuracy': [], 'precision': [], 'MCC': [], 'recall': [] + } - # Plot each metric - plt.plot(FPR_values, label='False Positive Rate (FPR)', marker='o') - plt.plot(FNR_values, label='False Negative Rate (FNR)', marker='o') - plt.plot(TNR_values, label='True Negative Rate (TNR)', marker='o') - plt.plot(TPR_values, label='True Positive Rate (TPR)', marker='o') - plt.plot(F1_values, label='F1 Score', marker='o') - plt.plot(accuracy_values, label='Accuracy', marker='o') - plt.plot(precision_values, label='Precision', marker='o') - plt.plot(MCC_values, label='Matthews Correlation Coefficient (MCC)', marker='o') - plt.plot(recall_values, label='Recall (TPR)', marker='o') + # Categorize the metrics into two groups + 
for i in range(len(FPR_values)): + close_to_0['FPR'].append(FPR_values[i]) + close_to_0['FNR'].append(FNR_values[i]) + + close_to_1['TNR'].append(TNR_values[i]) + close_to_1['TPR'].append(TPR_values[i]) + close_to_1['F1'].append(F1_values[i]) + close_to_1['accuracy'].append(accuracy_values[i]) + close_to_1['precision'].append(precision_values[i]) + close_to_1['MCC'].append(MCC_values[i]) + close_to_1['recall'].append(recall_values[i]) + + # Plot metrics for values close to 0 + plot_single_group(close_to_0, 'metrics_plot_close_to_0.png') - # Set logarithmic scale on the y-axis - plt.yscale('log') + # Plot metrics for values close to 1 + plot_single_group(close_to_1, 'metrics_plot_close_to_1.png') + +def plot_single_group(metrics_dict, output_filename): + plt.figure(figsize=(12, 8)) - # Add labels and title + # Only plot the metrics that exist in the dictionary + if 'FPR' in metrics_dict: + plt.plot(metrics_dict['FPR'], label='False Positive Rate (FPR)', marker='o') + if 'FNR' in metrics_dict: + plt.plot(metrics_dict['FNR'], label='False Negative Rate (FNR)', marker='o') + if 'TNR' in metrics_dict: + plt.plot(metrics_dict['TNR'], label='True Negative Rate (TNR)', marker='o') + if 'TPR' in metrics_dict: + plt.plot(metrics_dict['TPR'], label='True Positive Rate (TPR)', marker='o') + if 'F1' in metrics_dict: + plt.plot(metrics_dict['F1'], label='F1 Score', marker='o') + if 'accuracy' in metrics_dict: + plt.plot(metrics_dict['accuracy'], label='Accuracy', marker='o') + if 'precision' in metrics_dict: + plt.plot(metrics_dict['precision'], label='Precision', marker='o') + if 'MCC' in metrics_dict: + plt.plot(metrics_dict['MCC'], label='Matthews Correlation Coefficient (MCC)', marker='o') + if 'recall' in metrics_dict: + plt.plot(metrics_dict['recall'], label='Recall (TPR)', marker='o') + + # Apply log scale by default + plt.yscale('log') + plt.xlabel('Index') - plt.ylabel('Metric Value (Log Scale)') - plt.title('Evaluation Metrics Over Time (Log Scale)') - - # Add a legend + plt.ylabel('Metric Value') + plt.title(f'Evaluation Metrics Over Time ({output_filename.split("_")[2].replace(".png", "")})') plt.legend() - # Save the plot as a PNG file - plt.savefig('metrics_plot_log_scale.png') + # Save the plot + plt.savefig(output_filename) plt.close() def main(): @@ -85,6 +118,7 @@ def main(): sys.exit(1) file_path = sys.argv[1] + FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values = process_file(file_path) plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values) From fb2e163811d92a22203ad14e5462c74c8514c6cf Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 18:16:50 +0000 Subject: [PATCH 475/498] Fix plot --- .../plot_testing_performance.py | 22 +++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/modules/flowmldetection/plot_testing_performance.py b/modules/flowmldetection/plot_testing_performance.py index 5581c72cd..8f9e12cd8 100644 --- a/modules/flowmldetection/plot_testing_performance.py +++ b/modules/flowmldetection/plot_testing_performance.py @@ -72,12 +72,24 @@ def plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accu close_to_1['recall'].append(recall_values[i]) # Plot metrics for values close to 0 - plot_single_group(close_to_0, 'metrics_plot_close_to_0.png') + plot_single_group(close_to_0, 'metrics_plot_close_to_0.png', is_close_to_0=True) # Plot metrics for values close to 1 
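The split above relies on FPR and FNR living near 0 while the remaining metrics live near 1. The hand-rolled formulas can be sanity-checked against sklearn with hypothetical confusion counts (illustration only, not values from any experiment):

import numpy as np
from sklearn.metrics import f1_score, matthews_corrcoef

# Hypothetical counts: TP=7, FN=1, FP=2, TN=90.
y_true = np.array([1] * 8 + [0] * 92)
y_pred = np.array([1] * 7 + [0] * 1 + [1] * 2 + [0] * 90)
TP, FN, FP, TN = 7, 1, 2, 90
print(FP / (FP + TN))                     # FPR ~= 0.0217, near 0
print(TP / (TP + FN))                     # TPR  = 0.875, near 1
print(f1_score(y_true, y_pred))           # matches 2PR/(P+R)
print(matthews_corrcoef(y_true, y_pred))  # matches the MCC formula above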
plot_single_group(close_to_1, 'metrics_plot_close_to_1.png') -def plot_single_group(metrics_dict, output_filename): + # Print the final values + print("\nFinal Metric Values:") + print(f"Final FPR: {FPR_values[-1]:.4f}") + print(f"Final FNR: {FNR_values[-1]:.4f}") + print(f"Final TNR: {TNR_values[-1]:.4f}") + print(f"Final TPR: {TPR_values[-1]:.4f}") + print(f"Final F1 Score: {F1_values[-1]:.4f}") + print(f"Final Accuracy: {accuracy_values[-1]:.4f}") + print(f"Final Precision: {precision_values[-1]:.4f}") + print(f"Final MCC: {MCC_values[-1]:.4f}") + print(f"Final Recall: {recall_values[-1]:.4f}") + +def plot_single_group(metrics_dict, output_filename, is_close_to_0=False): plt.figure(figsize=(12, 8)) # Only plot the metrics that exist in the dictionary @@ -103,6 +115,12 @@ def plot_single_group(metrics_dict, output_filename): # Apply log scale by default plt.yscale('log') + # If the plot is close to 0, set custom ticks + if is_close_to_0: + # Manually set more Y-ticks for better visibility + plt.ylim(0.0001, 1) # Set Y-axis limits between 0.0001 and 1 + plt.yticks([0.0001, 0.001, 0.01, 0.1, 1], ['0.0001', '0.001', '0.01', '0.1', '1']) # Adjust Y-ticks + plt.xlabel('Index') plt.ylabel('Metric Value') plt.title(f'Evaluation Metrics Over Time ({output_filename.split("_")[2].replace(".png", "")})') From acac48b8feccf08958d19f68d0375bb4bb7e6df1 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 18:20:22 +0000 Subject: [PATCH 476/498] Fix plots --- modules/flowmldetection/flowmldetection.py | 709 +++++---------------- 1 file changed, 143 insertions(+), 566 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 6c3bfc127..37f076110 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -1,566 +1,143 @@ -# SPDX-FileCopyrightText: 2021 Sebastian Garcia -from typing import Optional - -# SPDX-License-Identifier: GPL-2.0-only -import numpy -from sklearn.linear_model import SGDClassifier -from sklearn.preprocessing import StandardScaler -import pickle -import pandas as pd -import json -import traceback -import warnings -import os - -from slips_files.common.parsers.config_parser import ConfigParser -from slips_files.common.slips_utils import utils -from slips_files.common.abstracts.module import IModule -from slips_files.core.structures.evidence import ( - Evidence, - ProfileID, - TimeWindow, - Attacker, - ThreatLevel, - EvidenceType, - IoCType, - Direction, - Victim, - Method, -) - -# This horrible hack is only to stop sklearn from printing those warnings -def warn(*args, **kwargs): - pass - - -warnings.warn = warn - - -class FlowMLDetection(IModule): - # Name: short name of the module. Do not use spaces - name = "Flow ML Detection" - description = ( - "Train or test a Machine Learning model to detect malicious flows" - ) - authors = ["Sebastian Garcia"] - - def init(self): - # Subscribe to the channel - self.c1 = self.db.subscribe("new_flow") - self.channels = {"new_flow": self.c1} - self.fieldseparator = self.db.get_field_separator() - # Set the output queue of our database instance - # Read the configuration - self.read_configuration() - # Minum amount of new labels needed to start the train - self.minimum_labels_to_start_train = 50 - # Minum amount of new labels needed to retrain - self.minimum_labels_to_retrain = 50 - # The number of flows when last trained. 
Used internally only to know - # when to retrain - self.last_number_of_flows_when_trained = 0 - # The scaler trained during training and to use during testing - self.scaler = StandardScaler() - self.model_path = "./modules/flowmldetection/model.bin" - self.scaler_path = "./modules/flowmldetection/scaler.bin" - - # Initialize the training log file - self.training_log_path = "./modules/flowmldetection/training.log" - with open(self.training_log_path, "w") as log_file: - log_file.write("Training Log Initialized\n") - - def read_configuration(self): - conf = ConfigParser() - self.mode = conf.get_ml_mode() - # This is the global label in the configuration, - # in case the flows do not have a label themselves - self.label = conf.label() - - def write_to_training_log(self, message: str): - """ - Write a message to the training log file. - """ - try: - with open(self.training_log_path, "a") as log_file: - log_file.write(message + "\n") - except Exception as e: - self.print(f"Error writing to training log: {e}", 0, 1) - - def train(self, sum_labeled_flows): - """ - Train a model based on the flows we receive and the labels - """ - try: - # Create X_flow with the current flows minus the label - X_flow = self.flows.drop("ground_truth_label", axis=1) - # Drop the detailed labels - X_flow = X_flow.drop("detailed_ground_truth_label", axis=1) - # Drop the module_labels - X_flow = X_flow.drop("module_labels", axis=1) - # Create y_flow with the label - y_flow = numpy.full(X_flow.shape[0], self.flows.ground_truth_label) - - # Normalize this batch of data so far. This can get progressivle slow - X_flow = self.scaler.fit_transform(X_flow) - - # Count the number of labels of each type in this epoc - epoch_label_counts = { - "Background": (y_flow == "Background").sum(), - "Malicious": (y_flow == "Malicious").sum(), - "Benign": (y_flow == "Benign").sum(), - } - - # Train - try: - # Online incremental learning - self.clf.partial_fit( - X_flow, y_flow, classes=["Background", "Malicious", "Benign"] - ) - except Exception: - self.print("Error while calling clf.train()") - self.print(traceback.format_exc(), 0, 1) - - # See score so far in training - score = self.clf.score(X_flow, y_flow) - - #self.print(f" Training Score: {score}", 1, 0) - #self.print(f' Model Parameters: {self.clf.coef_}', 1, 0) - - # Store the models on disk - self.store_model() - - # Log training information - self.write_to_training_log(f"Training completed. Background: {epoch_label_counts['Background']}. Benign: {epoch_label_counts['Benign']}. Malicious: {epoch_label_counts['Malicious']}. Total labels: {sum_labeled_flows}. Score: {score}") - #self.write_to_training_log(f"Model parameters: {self.clf.coef_}") - except Exception: - self.print("Error in train().", 0, 1) - self.print(traceback.format_exc(), 0, 1) - self.write_to_training_log("Error occurred during training.") - - def process_features(self, dataset): - """ - Discards some features of the dataset and can create new. 
- Clean the dataset - """ - try: - # Discard some type of flows that dont have ports - to_discard = ["arp", "ARP", "icmp", "igmp", "ipv6-icmp", ""] - for proto in to_discard: - dataset = dataset[dataset.proto != proto] - - # If te proto is in the list to delete and there is only one flow, then the dataset will be empty - if dataset.empty: - # DataFrame is empty now, so return empty - return dataset - - # For now, discard these - to_drop = [ - "appproto", - "daddr", - "saddr", - "starttime", - "type_", - "smac", - "dmac", - "history", - "uid", - "dir_", - "endtime", - "flow_source", - ] - for field in to_drop: - try: - dataset = dataset.drop(field, axis=1) - except (ValueError, KeyError): - pass - - # When flows are read from Slips sqlite, - # the state is not transformed to 'Established' or - # 'Not Established', it is still 'S0' and others - # So transform here - dataset["state"] = dataset.apply( - lambda row: self.db.get_final_state_from_flags( - row["state"], (row["spkts"] + row["dpkts"]) - ), - axis=1, - ) - - # Convert state to categorical - dataset.state = dataset.state.str.replace( - r"(^.*Not Established.*$)", "0", regex=True - ) - dataset.state = dataset.state.str.replace( - r"(^.*Established.*$)", "1", regex=True - ) - - # Convert categories to floats - dataset.state = dataset.state.astype("float64") - - # Convert proto to categorical. For now we only have few states, so we can hardcode... - # We dont use the data to create categories because in testing mode - # we dont see all the protocols - # Also we dont store the Categorizer because the user can retrain - # with its own data. - dataset.proto = dataset.proto.str.lower() - dataset.proto = dataset.proto.str.replace( - r"(^.*tcp.*$)", "0", regex=True - ) - dataset.proto = dataset.proto.str.replace( - r"(^.*udp.*$)", "1", regex=True - ) - dataset.proto = dataset.proto.str.replace( - r"(^.*icmp.*$)", "2", regex=True - ) - dataset.proto = dataset.proto.str.replace( - r"(^.*icmp-ipv6.*$)", "3", regex=True - ) - dataset.proto = dataset.proto.str.replace( - r"(^.*arp.*$)", "4", regex=True - ) - - dataset["allbytes"] = dataset["sbytes"] + dataset["dbytes"] - dataset["pkts"] = dataset["spkts"] + dataset["dpkts"] - - fields_to_convert_to_float = [ - dataset.proto, - dataset.dport, - dataset.sport, - dataset.dur, - dataset.pkts, - dataset.spkts, - dataset.allbytes, - dataset.sbytes, - dataset.state, - ] - for field in fields_to_convert_to_float: - try: - field = field.astype("float64") - except (ValueError, AttributeError): - pass - - return dataset - except Exception: - # Stop the timer - self.print("Error in process_features()") - self.print(traceback.format_exc(), 0, 1) - - def process_training_flows(self, last_number_of_flows_when_trained): - """ - Process only the new flows in the DB since the last training. 
- Store the pandas df in self.flows - """ - try: - # Ensure the index is an integer - if last_number_of_flows_when_trained is None: - last_number_of_flows_when_trained = 0 - else: - last_number_of_flows_when_trained = int(last_number_of_flows_when_trained) - - # We get all the flows so far - flows = self.db.get_all_flows() - # Only process new flows since last training - new_flows = flows[last_number_of_flows_when_trained:] - - # Check how many **different** labels are in the DB - labels = self.db.get_labels() - if len(labels) == 1: - # Insert fake flows for both classes if needed - new_flows.append( - { - "starttime": 1594417039.029793, - "dur": "1.9424750804901123", - "saddr": "10.7.10.101", - "sport": "49733", - "daddr": "40.70.224.145", - "dport": "443", - "proto": "tcp", - "state": "SF", - "spkts": 17, - "dpkts": 27, - "sbytes": 25517, - "dbytes": 17247, - "appproto": "ssl", - "ground_truth_label": "Malicious", - "module_labels": { - "flowalerts-long-connection": "Malicious" - }, - } - ) - new_flows.append( - { - "starttime": 1382355032.706468, - "dur": "10.896695", - "saddr": "147.32.83.52", - "sport": "47956", - "daddr": "80.242.138.72", - "dport": "80", - "proto": "tcp", - "state": "SF", - "spkts": 1, - "dpkts": 0, - "sbytes": 100, - "dbytes": 67596, - "appproto": "http", - "ground_truth_label": "Benign", - "module_labels": { - "flowalerts-long-connection": "Benign" - }, - } - ) - - # Convert to pandas df - df_flows = pd.DataFrame(new_flows) - - # Process features - df_flows = self.process_features(df_flows) - - # Update the flow to the processed version - self.flows = df_flows - except Exception: - self.print("Error in process_flows()") - self.print(traceback.format_exc(), 0, 1) - - def process_flow(self, flow_to_process: dict): - """ - Process one flow. Only used during detection in testing - returns the pandas df with the processed flow - """ - try: - # Convert the flow to a pandas dataframe - raw_flow = pd.DataFrame(flow_to_process, index=[0]) - dflow = self.process_features(raw_flow) - if dflow.empty: - return None - # Update the flow to the processed version - return dflow - except Exception: - # Stop the timer - self.print("Error in process_flow()") - self.print(traceback.format_exc(), 0, 1) - - def detect(self, x_flow) -> Optional[numpy.ndarray]: - """ - Detects the given flow with the current model stored - and returns the predection array - """ - try: - # clean the flow - fields_to_drop = [ - "label", - "module_labels", - "uid", - "history", - "dir_", - "endtime", - "flow_source", - "ground_truth_label", - "detailed_ground_truth_label", - ] - # For argus binetflows this fails because ther is a field calle bytes that was not in other flows. It should be called allbytes. - # Error - ''' [Flow ML Detection] Error in detect() while processing - dur proto sport dport state pkts spkts dpkts bytes sbytes dbytes allbytes - 0 63.822830 0 56119 981 0.0 15 15 0 8764 1887 0 1887 - The feature names should match those that were passed during fit. - Feature names unseen at fit time: - - bytes - ''' - - # IF we delete here the filed bytes the error is - # [Flow ML Detection] Error in detect() while processing - # dur proto sport dport state pkts spkts dpkts sbytes dbytes allbytes - # 0 63.822830 0 56120 980 0.0 15 15 0 1887 0 1887 - # The feature names should match those that were passed during fit. - # Feature names must be in the same order as they were in fit. 
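Both errors quoted above come from handing the scaler and classifier a column set that differs from the one seen during fit. One way out (an illustrative workaround, not what the module does) is to persist the training column order and reindex each test flow to it:

import pandas as pd

# Hypothetical: the column order remembered from training.
train_columns = ["dur", "proto", "sport", "dport", "state",
                 "pkts", "spkts", "dpkts", "sbytes", "dbytes", "allbytes"]

def align_features(x_flow: pd.DataFrame) -> pd.DataFrame:
    # Drops unseen columns (e.g. a stray 'bytes' from argus binetflows)
    # and fills missing ones with 0, keeping the exact order from fit time.
    return x_flow.reindex(columns=train_columns, fill_value=0)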
- - for field in fields_to_drop: - try: - x_flow = x_flow.drop(field, axis=1) - except (KeyError, ValueError): - pass - # Scale the flow - x_flow: numpy.ndarray = self.scaler.transform(x_flow) - pred: numpy.ndarray = self.clf.predict(x_flow) - return pred - except Exception as e: - self.print( - f"Error in detect() while processing " f"\n{x_flow}\n{e}" - ) - self.print(traceback.format_exc(), 0, 1) - - def store_model(self): - """ - Store the trained model on disk - """ - self.print("Storing the trained model and scaler on disk.", 0, 2) - with open(self.model_path, "wb") as f: - data = pickle.dumps(self.clf) - f.write(data) - with open(self.scaler_path, "wb") as g: - data = pickle.dumps(self.scaler) - g.write(data) - - def read_model(self): - """ - Read the trained model from disk - """ - try: - self.print("Reading the trained model from disk.", 0, 2) - with open(self.model_path, "rb") as f: - self.clf = pickle.load(f) - self.print("Reading the trained scaler from disk.", 0, 2) - with open(self.scaler_path, "rb") as g: - self.scaler = pickle.load(g) - except FileNotFoundError: - # If there is no model, create one empty - self.print( - "There was no model. " "Creating a new empty model.", 0, 2 - ) - self.clf = SGDClassifier( - warm_start=True, loss="hinge", penalty="l1" - ) - except EOFError: - self.print( - "Error reading model from disk. " - "Creating a new empty model.", - 0, - 2, - ) - self.clf = SGDClassifier( - warm_start=True, loss="hinge", penalty="l1" - ) - - def set_evidence_malicious_flow(self, flow: dict, twid: str): - confidence: float = 0.1 - description = ( - f"Flow with malicious characteristics by ML. Src IP" - f" {flow['saddr']}:{flow['sport']} to " - f"{flow['daddr']}:{flow['dport']}" - ) - twid_number = int(twid.replace("timewindow", "")) - evidence: Evidence = Evidence( - evidence_type=EvidenceType.MALICIOUS_FLOW, - attacker=Attacker( - direction=Direction.SRC, - ioc_type=IoCType.IP, - value=flow["saddr"], - ), - victim=Victim( - direction=Direction.DST, - ioc_type=IoCType.IP, - value=flow["daddr"], - ), - threat_level=ThreatLevel.LOW, - confidence=confidence, - description=description, - profile=ProfileID(ip=flow["saddr"]), - timewindow=TimeWindow(twid_number), - uid=[flow["uid"]], - timestamp=flow["starttime"], - method=Method.AI, - src_port=flow["sport"], - dst_port=flow["dport"], - ) - - self.db.set_evidence(evidence) - - def shutdown_gracefully(self): - # Confirm that the module is done processing - if self.mode == "train": - self.store_model() - - def pre_main(self): - utils.drop_root_privs() - # Load the model - self.read_model() - - def main(self): - if msg := self.get_msg("new_flow"): - # When a new flow arrives - msg = json.loads(msg["data"]) - self.twid = msg["twid"] - self.profileid = msg["profileid"] - self.flow = msg["flow"] - # These following extra fields are expected in testing. update the original - # flow dict to have them - self.flow.update( - { - "state": msg["interpreted_state"], - "label": msg["label"], - "module_labels": msg["module_labels"], - } - ) - - if self.mode == "train": - # We are training - - # Is the amount in the DB of labels enough to retrain? 
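The gating asked about here reduces to two thresholds; a condensed sketch of the same logic with standalone names mirroring the module's attributes:

# Condensed sketch of the retrain gating used in main(): start only
# after a minimum number of labeled flows, then retrain every fixed
# increment of new labels rather than on every flow.
MIN_LABELS_TO_START_TRAIN = 50
MIN_LABELS_TO_RETRAIN = 50

def should_retrain(sum_labeled_flows: int, last_trained: int) -> bool:
    if sum_labeled_flows < MIN_LABELS_TO_START_TRAIN:
        return False
    return sum_labeled_flows - last_trained >= MIN_LABELS_TO_RETRAIN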
- # Use labeled flows - labels = self.db.get_labels() - sum_labeled_flows = sum(i[1] for i in labels) - - # The min labels to retrain is the min number of flows - # we should have seen so far in this capture to start training - # This is so we dont _start_ training with only 1 flow - - # Once we are over the start minimum, the second condition is - # to force to retrain every a minimum_labels_to_retrain number - # of flows. So we dont retrain every 1 flow. - if ( - sum_labeled_flows >= self.minimum_labels_to_start_train - ): - if (sum_labeled_flows - self.last_number_of_flows_when_trained >= self.minimum_labels_to_retrain): - # So for example we retrain every 50 labels and only when - # we have at least 50 labels - self.print( - f"Training the model with the last group of " - f"flows and labels. Total flows: {sum_labeled_flows}." - ) - # Process all flows in the DB and make them ready - # for pandas - self.process_training_flows(self.last_number_of_flows_when_trained) - # Train an algorithm - self.train(sum_labeled_flows) - self.last_number_of_flows_when_trained = sum_labeled_flows - - elif self.mode == "test": - # We are testing, which means using the model to detect - processed_flow = self.process_flow(self.flow) - - # After processing the flow, it may happen that we - # delete icmp/arp/etc so the dataframe can be empty - if processed_flow is not None and not processed_flow.empty: - # Predict - pred: numpy.ndarray = self.detect(processed_flow) - if not pred: - # an error occurred - return - - label = self.flow["label"] - if label and label != "unknown" and label != pred[0]: - # If the user specified a label in test mode, - # and the label is diff from the prediction, - # print in debug mode - self.print( - f"Predicted {pred[0]} for ground-truth label" - f' {label}. 
Flow {self.flow["saddr"]}:' - f'{self.flow["sport"]} ->' - f' {self.flow["daddr"]}:' - f'{self.flow["dport"]}/' - f'{self.flow["proto"]}', - 0, - 3, - ) - if pred[0] == "Malicious": - # Generate an alert - self.set_evidence_malicious_flow(self.flow, self.twid) - self.print( - f"Prediction {pred[0]} for label {label}" - f' flow {self.flow["saddr"]}:' - f'{self.flow["sport"]} -> ' - f'{self.flow["daddr"]}:' - f'{self.flow["dport"]}/' - f'{self.flow["proto"]}', - 0, - 2, - ) +import matplotlib.pyplot as plt +import sys +import numpy as np + +def process_file(file_path): + # Initialize the counters for the values + FPR_values = [] + FNR_values = [] + TNR_values = [] + TPR_values = [] + F1_values = [] + accuracy_values = [] + precision_values = [] + MCC_values = [] + recall_values = [] + + # Read the file and extract the data + with open(file_path, 'r') as file: + for line in file: + if "TP:" in line: + # Extract the values from the line + parts = line.split(',') + TP = int(parts[0].split(':')[1].strip()) + TN = int(parts[1].split(':')[1].strip()) + FP = int(parts[2].split(':')[1].strip()) + FN = int(parts[3].split(':')[1].strip()) + + # Calculate metrics + FPR = FP / (FP + TN) if (FP + TN) != 0 else 0 + FNR = FN / (FN + TP) if (FN + TP) != 0 else 0 + TNR = TN / (TN + FP) if (TN + FP) != 0 else 0 + TPR = TP / (TP + FN) if (TP + FN) != 0 else 0 + Precision = TP / (TP + FP) if (TP + FP) != 0 else 0 + Recall = TPR # Recall is the same as TPR + F1 = 2 * (Precision * Recall) / (Precision + Recall) if (Precision + Recall) != 0 else 0 + Accuracy = (TP + TN) / (TP + TN + FP + FN) + MCC = ((TP * TN) - (FP * FN)) / np.sqrt((TP + FP) * (TP + FN) * (TN + FP) * (TN + FN)) if ((TP + FP) * (TP + FN) * (TN + FP) * (TN + FN)) != 0 else 0 + + # Append the values to the respective lists + FPR_values.append(FPR) + FNR_values.append(FNR) + TNR_values.append(TNR) + TPR_values.append(TPR) + F1_values.append(F1) + accuracy_values.append(Accuracy) + precision_values.append(Precision) + MCC_values.append(MCC) + recall_values.append(Recall) + + return FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values + +def plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values): + # Separate the values into two groups based on their proximity to 0 or 1 + close_to_0 = { + 'FPR': [], 'FNR': [] + } + close_to_1 = { + 'TNR': [], 'TPR': [], 'F1': [], 'accuracy': [], 'precision': [], 'MCC': [], 'recall': [] + } + + # Categorize the metrics into two groups + for i in range(len(FPR_values)): + close_to_0['FPR'].append(FPR_values[i]) + close_to_0['FNR'].append(FNR_values[i]) + + close_to_1['TNR'].append(TNR_values[i]) + close_to_1['TPR'].append(TPR_values[i]) + close_to_1['F1'].append(F1_values[i]) + close_to_1['accuracy'].append(accuracy_values[i]) + close_to_1['precision'].append(precision_values[i]) + close_to_1['MCC'].append(MCC_values[i]) + close_to_1['recall'].append(recall_values[i]) + + # Plot metrics for values close to 0 + plot_single_group(close_to_0, 'metrics_plot_close_to_0.png', is_close_to_0=True) + + # Plot metrics for values close to 1 + plot_single_group(close_to_1, 'metrics_plot_close_to_1.png') + + # Print the final values + print("\nFinal Metric Values:") + print(f"Final FPR: {FPR_values[-1]:.4f}") + print(f"Final FNR: {FNR_values[-1]:.4f}") + print(f"Final TNR: {TNR_values[-1]:.4f}") + print(f"Final TPR: {TPR_values[-1]:.4f}") + print(f"Final F1 Score: {F1_values[-1]:.4f}") + 
print(f"Final Accuracy: {accuracy_values[-1]:.4f}") + print(f"Final Precision: {precision_values[-1]:.4f}") + print(f"Final MCC: {MCC_values[-1]:.4f}") + print(f"Final Recall: {recall_values[-1]:.4f}") + +def plot_single_group(metrics_dict, output_filename, is_close_to_0=False): + plt.figure(figsize=(12, 8)) + + # Only plot the metrics that exist in the dictionary + if 'FPR' in metrics_dict: + plt.plot(metrics_dict['FPR'], label='False Positive Rate (FPR)', marker='o') + if 'FNR' in metrics_dict: + plt.plot(metrics_dict['FNR'], label='False Negative Rate (FNR)', marker='o') + if 'TNR' in metrics_dict: + plt.plot(metrics_dict['TNR'], label='True Negative Rate (TNR)', marker='o') + if 'TPR' in metrics_dict: + plt.plot(metrics_dict['TPR'], label='True Positive Rate (TPR)', marker='o') + if 'F1' in metrics_dict: + plt.plot(metrics_dict['F1'], label='F1 Score', marker='o') + if 'accuracy' in metrics_dict: + plt.plot(metrics_dict['accuracy'], label='Accuracy', marker='o') + if 'precision' in metrics_dict: + plt.plot(metrics_dict['precision'], label='Precision', marker='o') + if 'MCC' in metrics_dict: + plt.plot(metrics_dict['MCC'], label='Matthews Correlation Coefficient (MCC)', marker='o') + if 'recall' in metrics_dict: + plt.plot(metrics_dict['recall'], label='Recall (TPR)', marker='o') + + # Apply log scale by default + plt.yscale('log') + + # If the plot is close to 0, set custom ticks + if is_close_to_0: + # Add more ticks between 0 and 1 (using a logarithmic scale) + plt.yticks([0.01, 0.1, 1, 10, 100], ['0.01', '0.1', '1', '10', '100']) + + plt.xlabel('Index') + plt.ylabel('Metric Value') + plt.title(f'Evaluation Metrics Over Time ({output_filename.split("_")[2].replace(".png", "")})') + plt.legend() + + # Save the plot + plt.savefig(output_filename) + plt.close() + +def main(): + if len(sys.argv) != 2: + print("Usage: python script.py ") + sys.exit(1) + + file_path = sys.argv[1] + + FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values = process_file(file_path) + plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values) + +if __name__ == "__main__": + main() From 41961660beaf2d95a10273bdebceae4388fafd95 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 18:20:52 +0000 Subject: [PATCH 477/498] Fix plots --- .../plot_testing_performance.py | 25 ++++++++++++------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/modules/flowmldetection/plot_testing_performance.py b/modules/flowmldetection/plot_testing_performance.py index 8f9e12cd8..69b8c96a8 100644 --- a/modules/flowmldetection/plot_testing_performance.py +++ b/modules/flowmldetection/plot_testing_performance.py @@ -71,11 +71,11 @@ def plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accu close_to_1['MCC'].append(MCC_values[i]) close_to_1['recall'].append(recall_values[i]) - # Plot metrics for values close to 0 + # Plot metrics for values close to 0 (linear scale) plot_single_group(close_to_0, 'metrics_plot_close_to_0.png', is_close_to_0=True) - # Plot metrics for values close to 1 - plot_single_group(close_to_1, 'metrics_plot_close_to_1.png') + # Plot metrics for values close to 1 (log scale) + plot_single_group(close_to_1, 'metrics_plot_close_to_1.png', is_close_to_0=False) # Print the final values print("\nFinal Metric Values:") @@ -112,14 +112,21 @@ def plot_single_group(metrics_dict, output_filename, is_close_to_0=False): if 'recall' in 
metrics_dict:
        plt.plot(metrics_dict['recall'], label='Recall (TPR)', marker='o')
 
-    # Apply log scale by default
-    plt.yscale('log')
+    # If the plot is close to 1, apply log scale
+    if not is_close_to_0:
+        plt.yscale('log')
 
-    # If the plot is close to 0, set custom ticks
+    # If the plot is close to 0, set dynamic Y-ticks based on the min/max values of the series
     if is_close_to_0:
-        # Manually set more Y-ticks for better visibility
-        plt.ylim(0.0001, 1)  # Set Y-axis limits between 0.0001 and 1
-        plt.yticks([0.0001, 0.001, 0.01, 0.1, 1], ['0.0001', '0.001', '0.01', '0.1', '1'])  # Adjust Y-ticks
+        min_val = min(min(metrics_dict['FPR']), min(metrics_dict['FNR']))
+        max_val = max(max(metrics_dict['FPR']), max(metrics_dict['FNR']))
+
+        # Avoid log(0), so set the minimum limit a little higher than zero
+        if min_val == 0:
+            min_val = 1e-4  # Avoid zero values on the logarithmic scale
+
+        plt.ylim(min_val, max_val)  # Set Y-axis limits based on the data range
+        plt.yticks(np.logspace(np.log10(min_val), np.log10(max_val), num=6))  # Set ticks logarithmically
 
     plt.xlabel('Index')
     plt.ylabel('Metric Value')

From dcd73e24811c9ebd2e4aadfea719b851736d72ab Mon Sep 17 00:00:00 2001
From: Seba Garcia
Date: Sat, 3 May 2025 18:23:02 +0000
Subject: [PATCH 478/498] Fix plots

---
 modules/flowmldetection/plot_testing_performance.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/modules/flowmldetection/plot_testing_performance.py b/modules/flowmldetection/plot_testing_performance.py
index 69b8c96a8..de4ada38b 100644
--- a/modules/flowmldetection/plot_testing_performance.py
+++ b/modules/flowmldetection/plot_testing_performance.py
@@ -123,10 +123,10 @@ def plot_single_group(metrics_dict, output_filename, is_close_to_0=False):
 
         # Avoid log(0), so set the minimum limit a little higher than zero
         if min_val == 0:
-            min_val = 1e-4  # Avoid zero values on the logarithmic scale
+            min_val = 1e-8  # Avoid zero values on the logarithmic scale
 
         plt.ylim(min_val, max_val)  # Set Y-axis limits based on the data range
-        plt.yticks(np.logspace(np.log10(min_val), np.log10(max_val), num=6))  # Set ticks logarithmically
+        plt.yticks(np.logspace(np.log10(min_val), np.log10(max_val), num=60))  # Set ticks logarithmically
 
     plt.xlabel('Index')
     plt.ylabel('Metric Value')

From 499fe19c08b34469a0f7826d614ceababc9d0849 Mon Sep 17 00:00:00 2001
From: Seba Garcia
Date: Sat, 3 May 2025 18:25:58 +0000
Subject: [PATCH 479/498] Change plot names

---
 modules/flowmldetection/plot_testing_performance.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/modules/flowmldetection/plot_testing_performance.py b/modules/flowmldetection/plot_testing_performance.py
index de4ada38b..1b4152c6e 100644
--- a/modules/flowmldetection/plot_testing_performance.py
+++ b/modules/flowmldetection/plot_testing_performance.py
@@ -72,10 +72,10 @@ def plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accu
         close_to_1['recall'].append(recall_values[i])
 
     # Plot metrics for values close to 0 (linear scale)
-    plot_single_group(close_to_0, 'metrics_plot_close_to_0.png', is_close_to_0=True)
+    plot_single_group(close_to_0, 'performance_metrics_testing_close_to_0.png', is_close_to_0=True)
 
     # Plot metrics for values close to 1 (log scale)
-    plot_single_group(close_to_1, 'metrics_plot_close_to_1.png', is_close_to_0=False)
+    plot_single_group(close_to_1, 'performance_metrics_testing_close_to_1.png', is_close_to_0=False)

From 8735210db117c14006ef382bf21051b90cd6c01c Mon Sep 17
00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 18:26:09 +0000 Subject: [PATCH 480/498] Rename file --- .../flowmldetection/plot_train_performance.py | 130 +++++++----------- modules/flowmldetection/plot_train_score.py | 87 ------------ 2 files changed, 53 insertions(+), 164 deletions(-) delete mode 100644 modules/flowmldetection/plot_train_score.py diff --git a/modules/flowmldetection/plot_train_performance.py b/modules/flowmldetection/plot_train_performance.py index 304f0f4ea..80e13e951 100644 --- a/modules/flowmldetection/plot_train_performance.py +++ b/modules/flowmldetection/plot_train_performance.py @@ -4,108 +4,84 @@ import sys import argparse import os -import matplotlib.ticker as ticker -def plot_log_data(file_path, experiment_number): +def plot_log_data(file_path): # Read the log data from the file with open(file_path, 'r') as file: log_data = file.read() - # Regex pattern for the new log format - pattern = ( - r"Total labels: ([\d\.]+), Background: (\d+). Benign: (\d+). Malicious: (\d+). Metrics: " - r"FPR=([\d\.]+), TNR=([\d\.]+), TPR=([\d\.]+), FNR=([\d\.]+), " - r"F1=([\d\.]+), Precision=([\d\.]+), Accuracy=([\d\.]+), MCC=([\d\.]+), Recall=([\d\.]+)\." - ) + # Define regex pattern to extract relevant data from each line + pattern = r"Background: (\d+). Benign: (\d+). Malicious: (\d+). Total labels: (\d+\.\d+). Score: (\d+\.\d+)" # Parse the log file data = re.findall(pattern, log_data) # Convert data to a DataFrame - columns = [ - "Total labels", "Background", "Benign", "Malicious", - "FPR", "TNR", "TPR", "FNR", "F1", "Precision", "Accuracy", "MCC", "Recall" - ] - df = pd.DataFrame(data, columns=columns) + df = pd.DataFrame(data, columns=["Background", "Benign", "Malicious", "Total labels", "Score"]) df = df.astype({ - "Total labels": float, "Background": int, "Benign": int, "Malicious": int, - "FPR": float, - "TNR": float, - "TPR": float, - "FNR": float, - "F1": float, - "Precision": float, - "Accuracy": float, - "MCC": float, - "Recall": float, + "Total labels": float, + "Score": float }) + # Get the directory of the log file to store the plot in the same folder dir_name = os.path.dirname(file_path) + plot_file = os.path.join(dir_name, 'performance_metrics_training.png') + + # Plotting the values + fig, ax1 = plt.subplots(figsize=(10, 6)) - # --- Plot 1: Number of labels (linear scale, no total labels) --- - fig1, ax1 = plt.subplots(figsize=(10, 6)) - ax1.plot(df.index, df["Background"], label="Background", color='black') - ax1.plot(df.index, df["Benign"], label="Benign", color='cyan') - ax1.plot(df.index, df["Malicious"], label="Malicious", color='magenta') + # Plotting Score on the left y-axis (with proper scaling from 0 to 1) + ax1.plot(df.index, df["Score"], label="Score", color='tab:blue') ax1.set_xlabel('Index') - ax1.set_ylabel('Label Counts') - ax1.set_title(f'Label Counts - Experiment {experiment_number}') - ax1.legend() - ax1.yaxis.set_major_locator(ticker.MaxNLocator(70)) - ax1.xaxis.set_major_locator(ticker.MaxNLocator(50)) - plt.tight_layout() - plt.savefig(os.path.join(dir_name, f'performance_metrics_training_{experiment_number}_labels.png')) - - # --- Plot 2: FNR and FPR (log scale) --- - fig2, ax2 = plt.subplots(figsize=(10, 6)) - ax2.plot(df.index, df["FNR"], label="FNR", color='red') - ax2.plot(df.index, df["FPR"], label="FPR", color='blue') - ax2.set_xlabel('Index') - ax2.set_ylabel('Rate') - ax2.set_yscale('log') - ax2.set_title(f'FNR and FPR - Experiment {experiment_number}') - ax2.legend() - ax2.yaxis.set_major_locator(ticker.MaxNLocator(100)) - 
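As an aside on the MaxNLocator calls here: on a log-scaled axis, matplotlib's LogLocator usually yields cleaner ticks than forcing a large MaxNLocator count; a small sketch on synthetic rates (hypothetical data and output filename):

import numpy as np
import matplotlib.pyplot as plt
from matplotlib import ticker

# Synthetic rates spanning several decades, like the FPR/FNR curves.
fig, ax = plt.subplots()
ax.plot(np.logspace(-4, 0, 20), marker="o")
ax.set_yscale("log")
ax.yaxis.set_major_locator(ticker.LogLocator(base=10, numticks=6))
fig.savefig("log_ticks_example.png")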
ax2.xaxis.set_major_locator(ticker.MaxNLocator(50)) - plt.tight_layout() - plt.savefig(os.path.join(dir_name, f'performance_metrics_training_{experiment_number}_fnr_fpr.png')) - - # --- Plot 3: Other metrics (log scale) --- - fig3, ax3 = plt.subplots(figsize=(12, 7)) - metrics_rest = ["TNR", "TPR", "F1", "Precision", "Accuracy", "MCC", "Recall"] - colors_rest = [ - 'tab:blue', 'tab:green', 'tab:purple', 'tab:brown', - 'tab:gray', 'tab:pink', 'tab:olive' - ] - for metric, color in zip(metrics_rest, colors_rest): - ax3.plot(df.index, df[metric], label=metric, color=color) - ax3.set_xlabel('Index') - ax3.set_ylabel('Metric Value') - ax3.set_yscale('log') - ax3.set_title(f'Performance Metrics (except FNR/FPR) - Experiment {experiment_number}') - ax3.legend() - ax3.yaxis.set_major_locator(ticker.MaxNLocator(50)) - ax3.xaxis.set_major_locator(ticker.MaxNLocator(50)) - plt.tight_layout() - plt.savefig(os.path.join(dir_name, f'performance_metrics_training_{experiment_number}_other_metrics.png')) + ax1.set_ylabel('Score', color='tab:blue') + ax1.set_ylim(0, 1) # Set y-axis for Score from 0 to 1 + ax1.tick_params(axis='y', labelcolor='tab:blue') - plt.show() + # Create the second y-axis for the Background, Benign, Malicious + ax2 = ax1.twinx() + ax2.plot(df.index, df["Background"], label="Background Labels", color='tab:green', linestyle='--') + ax2.plot(df.index, df["Benign"], label="Benign Labels", color='tab:orange', linestyle='--') + ax2.plot(df.index, df["Malicious"], label="Malicious Labels", color='tab:pink', linestyle='--') + ax2.set_ylabel('Background, Benign, Malicious Labels', color='tab:red') + + # Set appropriate scale for right y-axis based on the data + ax2.set_ylim(0, df[["Background", "Benign", "Malicious"]].max().max()) + ax2.tick_params(axis='y', labelcolor='tab:red') + + # Annotating Total labels as text on the plot + for i, value in enumerate(df["Total labels"]): + ax1.text(i, value, f'{value:.1f}', color='tab:gray', fontsize=8, ha='center', va='bottom') + + # Adding title and legend + plt.title('Training performance') + fig.tight_layout() - # --- Print final values in terminal --- - print("\nFinal values at last training step:") - for col in ["Total labels", "Background", "Benign", "Malicious", - "FPR", "TNR", "TPR", "FNR", "F1", "Precision", "Accuracy", "MCC", "Recall"]: - print(f"{col}: {df[col].iloc[-1]}") + # Move both legends further to the right + ax1.legend(loc='upper right', bbox_to_anchor=(1.26, 1), fontsize='small', ncol=1) + ax2.legend(loc='upper right', bbox_to_anchor=(1.4, 0.95), fontsize='small', ncol=1) + + # Increase right margin for better readability of legend + plt.subplots_adjust(right=0.7) + + # Save plot to the same folder as the log file + plt.savefig(plot_file) + + # Display the plot + plt.show() def main(): + # Parse command-line arguments parser = argparse.ArgumentParser(description="Process a log file and plot the data with two y-axes.") - parser.add_argument('-f', '--file', metavar='log_file', type=str, required=True, help="Path to the log file") - parser.add_argument('-e', '--experiment', metavar='experiment_number', type=str, required=True, help="Experiment number to add to the filename") + parser.add_argument('log_file', metavar='log_file', type=str, help="Path to the log file") + + # Handle -h / --help args = parser.parse_args() - plot_log_data(args.file, args.experiment) + + # Call the function to process the log file + plot_log_data(args.log_file) if __name__ == "__main__": main() diff --git a/modules/flowmldetection/plot_train_score.py 
b/modules/flowmldetection/plot_train_score.py deleted file mode 100644 index 8437e968a..000000000 --- a/modules/flowmldetection/plot_train_score.py +++ /dev/null @@ -1,87 +0,0 @@ -import pandas as pd -import matplotlib.pyplot as plt -import re -import sys -import argparse -import os - -def plot_log_data(file_path): - # Read the log data from the file - with open(file_path, 'r') as file: - log_data = file.read() - - # Define regex pattern to extract relevant data from each line - pattern = r"Background: (\d+). Benign: (\d+). Malicious: (\d+). Total labels: (\d+\.\d+). Score: (\d+\.\d+)" - - # Parse the log file - data = re.findall(pattern, log_data) - - # Convert data to a DataFrame - df = pd.DataFrame(data, columns=["Background", "Benign", "Malicious", "Total labels", "Score"]) - df = df.astype({ - "Background": int, - "Benign": int, - "Malicious": int, - "Total labels": float, - "Score": float - }) - - # Get the directory of the log file to store the plot in the same folder - dir_name = os.path.dirname(file_path) - plot_file = os.path.join(dir_name, 'log_data_plot_with_two_scales.png') - - # Plotting the values - fig, ax1 = plt.subplots(figsize=(10, 6)) - - # Plotting Score on the left y-axis (with proper scaling from 0 to 1) - ax1.plot(df.index, df["Score"], label="Score", color='tab:blue') - ax1.set_xlabel('Index') - ax1.set_ylabel('Score', color='tab:blue') - ax1.set_ylim(0, 1) # Set y-axis for Score from 0 to 1 - ax1.tick_params(axis='y', labelcolor='tab:blue') - - # Create the second y-axis for the Background, Benign, Malicious - ax2 = ax1.twinx() - ax2.plot(df.index, df["Background"], label="Background Labels", color='tab:green', linestyle='--') - ax2.plot(df.index, df["Benign"], label="Benign Labels", color='tab:orange', linestyle='--') - ax2.plot(df.index, df["Malicious"], label="Malicious Labels", color='tab:pink', linestyle='--') - ax2.set_ylabel('Background, Benign, Malicious Labels', color='tab:red') - - # Set appropriate scale for right y-axis based on the data - ax2.set_ylim(0, df[["Background", "Benign", "Malicious"]].max().max()) - ax2.tick_params(axis='y', labelcolor='tab:red') - - # Annotating Total labels as text on the plot - for i, value in enumerate(df["Total labels"]): - ax1.text(i, value, f'{value:.1f}', color='tab:gray', fontsize=8, ha='center', va='bottom') - - # Adding title and legend - plt.title('Training performance') - fig.tight_layout() - - # Move both legends further to the right - ax1.legend(loc='upper right', bbox_to_anchor=(1.26, 1), fontsize='small', ncol=1) - ax2.legend(loc='upper right', bbox_to_anchor=(1.4, 0.95), fontsize='small', ncol=1) - - # Increase right margin for better readability of legend - plt.subplots_adjust(right=0.7) - - # Save plot to the same folder as the log file - plt.savefig(plot_file) - - # Display the plot - plt.show() - -def main(): - # Parse command-line arguments - parser = argparse.ArgumentParser(description="Process a log file and plot the data with two y-axes.") - parser.add_argument('log_file', metavar='log_file', type=str, help="Path to the log file") - - # Handle -h / --help - args = parser.parse_args() - - # Call the function to process the log file - plot_log_data(args.log_file) - -if __name__ == "__main__": - main() From a454bd7b3fca49d80a02d05783b2637b57101d9c Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 18:31:32 +0000 Subject: [PATCH 481/498] Recover good flowmldetection deleted by mistake --- modules/flowmldetection/flowmldetection.py | 709 ++++++++++++++++----- 1 file changed, 566 
insertions(+), 143 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 37f076110..5e4e9aa46 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -1,143 +1,566 @@ -import matplotlib.pyplot as plt -import sys -import numpy as np - -def process_file(file_path): - # Initialize the counters for the values - FPR_values = [] - FNR_values = [] - TNR_values = [] - TPR_values = [] - F1_values = [] - accuracy_values = [] - precision_values = [] - MCC_values = [] - recall_values = [] - - # Read the file and extract the data - with open(file_path, 'r') as file: - for line in file: - if "TP:" in line: - # Extract the values from the line - parts = line.split(',') - TP = int(parts[0].split(':')[1].strip()) - TN = int(parts[1].split(':')[1].strip()) - FP = int(parts[2].split(':')[1].strip()) - FN = int(parts[3].split(':')[1].strip()) - - # Calculate metrics - FPR = FP / (FP + TN) if (FP + TN) != 0 else 0 - FNR = FN / (FN + TP) if (FN + TP) != 0 else 0 - TNR = TN / (TN + FP) if (TN + FP) != 0 else 0 - TPR = TP / (TP + FN) if (TP + FN) != 0 else 0 - Precision = TP / (TP + FP) if (TP + FP) != 0 else 0 - Recall = TPR # Recall is the same as TPR - F1 = 2 * (Precision * Recall) / (Precision + Recall) if (Precision + Recall) != 0 else 0 - Accuracy = (TP + TN) / (TP + TN + FP + FN) - MCC = ((TP * TN) - (FP * FN)) / np.sqrt((TP + FP) * (TP + FN) * (TN + FP) * (TN + FN)) if ((TP + FP) * (TP + FN) * (TN + FP) * (TN + FN)) != 0 else 0 - - # Append the values to the respective lists - FPR_values.append(FPR) - FNR_values.append(FNR) - TNR_values.append(TNR) - TPR_values.append(TPR) - F1_values.append(F1) - accuracy_values.append(Accuracy) - precision_values.append(Precision) - MCC_values.append(MCC) - recall_values.append(Recall) - - return FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values - -def plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values): - # Separate the values into two groups based on their proximity to 0 or 1 - close_to_0 = { - 'FPR': [], 'FNR': [] - } - close_to_1 = { - 'TNR': [], 'TPR': [], 'F1': [], 'accuracy': [], 'precision': [], 'MCC': [], 'recall': [] - } - - # Categorize the metrics into two groups - for i in range(len(FPR_values)): - close_to_0['FPR'].append(FPR_values[i]) - close_to_0['FNR'].append(FNR_values[i]) - - close_to_1['TNR'].append(TNR_values[i]) - close_to_1['TPR'].append(TPR_values[i]) - close_to_1['F1'].append(F1_values[i]) - close_to_1['accuracy'].append(accuracy_values[i]) - close_to_1['precision'].append(precision_values[i]) - close_to_1['MCC'].append(MCC_values[i]) - close_to_1['recall'].append(recall_values[i]) - - # Plot metrics for values close to 0 - plot_single_group(close_to_0, 'metrics_plot_close_to_0.png', is_close_to_0=True) - - # Plot metrics for values close to 1 - plot_single_group(close_to_1, 'metrics_plot_close_to_1.png') - - # Print the final values - print("\nFinal Metric Values:") - print(f"Final FPR: {FPR_values[-1]:.4f}") - print(f"Final FNR: {FNR_values[-1]:.4f}") - print(f"Final TNR: {TNR_values[-1]:.4f}") - print(f"Final TPR: {TPR_values[-1]:.4f}") - print(f"Final F1 Score: {F1_values[-1]:.4f}") - print(f"Final Accuracy: {accuracy_values[-1]:.4f}") - print(f"Final Precision: {precision_values[-1]:.4f}") - print(f"Final MCC: {MCC_values[-1]:.4f}") - print(f"Final Recall: 
{recall_values[-1]:.4f}") - -def plot_single_group(metrics_dict, output_filename, is_close_to_0=False): - plt.figure(figsize=(12, 8)) - - # Only plot the metrics that exist in the dictionary - if 'FPR' in metrics_dict: - plt.plot(metrics_dict['FPR'], label='False Positive Rate (FPR)', marker='o') - if 'FNR' in metrics_dict: - plt.plot(metrics_dict['FNR'], label='False Negative Rate (FNR)', marker='o') - if 'TNR' in metrics_dict: - plt.plot(metrics_dict['TNR'], label='True Negative Rate (TNR)', marker='o') - if 'TPR' in metrics_dict: - plt.plot(metrics_dict['TPR'], label='True Positive Rate (TPR)', marker='o') - if 'F1' in metrics_dict: - plt.plot(metrics_dict['F1'], label='F1 Score', marker='o') - if 'accuracy' in metrics_dict: - plt.plot(metrics_dict['accuracy'], label='Accuracy', marker='o') - if 'precision' in metrics_dict: - plt.plot(metrics_dict['precision'], label='Precision', marker='o') - if 'MCC' in metrics_dict: - plt.plot(metrics_dict['MCC'], label='Matthews Correlation Coefficient (MCC)', marker='o') - if 'recall' in metrics_dict: - plt.plot(metrics_dict['recall'], label='Recall (TPR)', marker='o') - - # Apply log scale by default - plt.yscale('log') - - # If the plot is close to 0, set custom ticks - if is_close_to_0: - # Add more ticks between 0 and 1 (using a logarithmic scale) - plt.yticks([0.01, 0.1, 1, 10, 100], ['0.01', '0.1', '1', '10', '100']) - - plt.xlabel('Index') - plt.ylabel('Metric Value') - plt.title(f'Evaluation Metrics Over Time ({output_filename.split("_")[2].replace(".png", "")})') - plt.legend() - - # Save the plot - plt.savefig(output_filename) - plt.close() - -def main(): - if len(sys.argv) != 2: - print("Usage: python script.py ") - sys.exit(1) - - file_path = sys.argv[1] - - FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values = process_file(file_path) - plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values) - -if __name__ == "__main__": - main() +# SPDX-FileCopyrightText: 2021 Sebastian Garcia +from typing import Optional + +# SPDX-License-Identifier: GPL-2.0-only +import numpy +from sklearn.linear_model import SGDClassifier +from sklearn.preprocessing import StandardScaler +import pickle +import pandas as pd +import json +import traceback +import warnings +import os + +from slips_files.common.parsers.config_parser import ConfigParser +from slips_files.common.slips_utils import utils +from slips_files.common.abstracts.module import IModule +from slips_files.core.structures.evidence import ( + Evidence, + ProfileID, + TimeWindow, + Attacker, + ThreatLevel, + EvidenceType, + IoCType, + Direction, + Victim, + Method, +) + +# This horrible hack is only to stop sklearn from printing those warnings +def warn(*args, **kwargs): + pass + + +warnings.warn = warn + + +class FlowMLDetection(IModule): + # Name: short name of the module. 
Do not use spaces
+    name = "Flow ML Detection"
+    description = (
+        "Train or test a Machine Learning model to detect malicious flows"
+    )
+    authors = ["Sebastian Garcia"]
+
+    def init(self):
+        # Subscribe to the channel
+        self.c1 = self.db.subscribe("new_flow")
+        self.channels = {"new_flow": self.c1}
+        self.fieldseparator = self.db.get_field_separator()
+        # Set the output queue of our database instance
+        # Read the configuration
+        self.read_configuration()
+        # Minimum amount of new labels needed to start the training
+        self.minimum_labels_to_start_train = 50
+        # Minimum amount of new labels needed to retrain
+        self.minimum_labels_to_retrain = 50
+        # The number of flows when last trained. Used internally only to know
+        # when to retrain
+        self.last_number_of_flows_when_trained = 0
+        # The scaler trained during training and to use during testing
+        self.scaler = StandardScaler()
+        self.model_path = "./modules/flowmldetection/model.bin"
+        self.scaler_path = "./modules/flowmldetection/scaler.bin"
+
+        # Initialize the training log file
+        self.training_log_path = "./modules/flowmldetection/training.log"
+        with open(self.training_log_path, "w") as log_file:
+            log_file.write("Training Log Initialized\n")
+
+    def read_configuration(self):
+        conf = ConfigParser()
+        self.mode = conf.get_ml_mode()
+        # This is the global label in the configuration,
+        # in case the flows do not have a label themselves
+        self.label = conf.label()
+
+    def write_to_training_log(self, message: str):
+        """
+        Write a message to the training log file.
+        """
+        try:
+            with open(self.training_log_path, "a") as log_file:
+                log_file.write(message + "\n")
+        except Exception as e:
+            self.print(f"Error writing to training log: {e}", 0, 1)
+
+    def train(self, sum_labeled_flows):
+        """
+        Train a model based on the flows we receive and the labels
+        """
+        try:
+            # Create X_flow with the current flows minus the label
+            X_flow = self.flows.drop("ground_truth_label", axis=1)
+            # Drop the detailed labels
+            X_flow = X_flow.drop("detailed_ground_truth_label", axis=1)
+            # Drop the module_labels
+            X_flow = X_flow.drop("module_labels", axis=1)
+            # Create y_flow with the label
+            y_flow = numpy.full(X_flow.shape[0], self.flows.ground_truth_label)
+
+            # Normalize this batch of data so far. This can get progressively slow
+            X_flow = self.scaler.fit_transform(X_flow)
+
+            # Count the number of labels of each type in this epoch
+            epoch_label_counts = {
+                "Background": (y_flow == "Background").sum(),
+                "Malicious": (y_flow == "Malicious").sum(),
+                "Benign": (y_flow == "Benign").sum(),
+            }
+
+            # Train
+            try:
+                # Online incremental learning
+                self.clf.partial_fit(
+                    X_flow, y_flow, classes=["Background", "Malicious", "Benign"]
+                )
+            except Exception:
+                self.print("Error while calling clf.train()")
+                self.print(traceback.format_exc(), 0, 1)
+
+            # See the score so far in training
+            score = self.clf.score(X_flow, y_flow)
+
+            #self.print(f"  Training Score: {score}", 1, 0)
+            #self.print(f'    Model Parameters: {self.clf.coef_}', 1, 0)
+
+            # Store the models on disk
+            self.store_model()
+
+            # Log training information
+            self.write_to_training_log(f"Training completed. Background: {epoch_label_counts['Background']}. Benign: {epoch_label_counts['Benign']}. Malicious: {epoch_label_counts['Malicious']}. Total labels: {sum_labeled_flows}. Score: {score}")
+            #self.write_to_training_log(f"Model parameters: {self.clf.coef_}")
+        except Exception:
+            self.print("Error in train().", 0, 1)
+            self.print(traceback.format_exc(), 0, 1)
+            self.write_to_training_log("Error occurred during training.")
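# A self-contained sketch (toy data, not Slips flows, and not part of this
# patch) of the incremental-learning pattern train() relies on: the classes
# must be declared on the first partial_fit() call, and every later call
# updates the same model one batch at a time.
#
#   import numpy as np
#   from sklearn.linear_model import SGDClassifier
#
#   clf = SGDClassifier(warm_start=True, loss="hinge", penalty="l1")
#   X1 = np.array([[0.1, 10.0], [0.9, 200.0]])
#   clf.partial_fit(X1, ["Benign", "Malicious"],
#                   classes=["Background", "Malicious", "Benign"])
#   X2 = np.array([[0.2, 15.0]])
#   clf.partial_fit(X2, ["Benign"])  # no classes= needed after the first call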
+
+    def process_features(self, dataset):
+        """
+        Discards some features of the dataset and can create new.
+        Clean the dataset
+        """
+        try:
+            # Discard some types of flows that don't have ports
+            to_discard = ["arp", "ARP", "icmp", "igmp", "ipv6-icmp", ""]
+            for proto in to_discard:
+                dataset = dataset[dataset.proto != proto]
+
+            # If the proto is in the list to delete and there is only one flow, then the dataset will be empty
+            if dataset.empty:
+                # DataFrame is empty now, so return empty
+                return dataset
+
+            # For now, discard these
+            to_drop = [
+                "appproto",
+                "daddr",
+                "saddr",
+                "starttime",
+                "type_",
+                "smac",
+                "dmac",
+                "history",
+                "uid",
+                "dir_",
+                "endtime",
+                "flow_source",
+            ]
+            for field in to_drop:
+                try:
+                    dataset = dataset.drop(field, axis=1)
+                except (ValueError, KeyError):
+                    pass
+
+            # When flows are read from Slips sqlite,
+            # the state is not transformed to 'Established' or
+            # 'Not Established', it is still 'S0' and others
+            # So transform here
+            dataset["state"] = dataset.apply(
+                lambda row: self.db.get_final_state_from_flags(
+                    row["state"], (row["spkts"] + row["dpkts"])
+                ),
+                axis=1,
+            )
+
+            # Convert state to categorical
+            dataset.state = dataset.state.str.replace(
+                r"(^.*Not Established.*$)", "0", regex=True
+            )
+            dataset.state = dataset.state.str.replace(
+                r"(^.*Established.*$)", "1", regex=True
+            )
+
+            # Convert categories to floats
+            dataset.state = dataset.state.astype("float64")
+
+            # Convert proto to categorical. For now we only have a few protocols, so we can hardcode...
+            # We don't use the data to create categories because in testing mode
+            # we don't see all the protocols
+            # Also we don't store the Categorizer because the user can retrain
+            # with their own data.
+            dataset.proto = dataset.proto.str.lower()
+            dataset.proto = dataset.proto.str.replace(
+                r"(^.*tcp.*$)", "0", regex=True
+            )
+            dataset.proto = dataset.proto.str.replace(
+                r"(^.*udp.*$)", "1", regex=True
+            )
+            dataset.proto = dataset.proto.str.replace(
+                r"(^.*icmp.*$)", "2", regex=True
+            )
+            dataset.proto = dataset.proto.str.replace(
+                r"(^.*icmp-ipv6.*$)", "3", regex=True
+            )
+            dataset.proto = dataset.proto.str.replace(
+                r"(^.*arp.*$)", "4", regex=True
+            )
+
+            dataset["allbytes"] = dataset["sbytes"] + dataset["dbytes"]
+            dataset["pkts"] = dataset["spkts"] + dataset["dpkts"]
+
+            fields_to_convert_to_float = [
+                dataset.proto,
+                dataset.dport,
+                dataset.sport,
+                dataset.dur,
+                dataset.pkts,
+                dataset.spkts,
+                dataset.allbytes,
+                dataset.sbytes,
+                dataset.state,
+            ]
+            # Note: Series.astype() returns a new Series, so rebinding the
+            # loop variable here does not change the columns inside `dataset`
+            for field in fields_to_convert_to_float:
+                try:
+                    field = field.astype("float64")
+                except (ValueError, AttributeError):
+                    pass
+
+            return dataset
+        except Exception:
+            # Stop the timer
+            self.print("Error in process_features()")
+            self.print(traceback.format_exc(), 0, 1)
+
+    def process_training_flows(self, last_number_of_flows_when_trained):
+        """
+        Process only the new flows in the DB since the last training.
+ Store the pandas df in self.flows + """ + try: + # Ensure the index is an integer + if last_number_of_flows_when_trained is None: + last_number_of_flows_when_trained = 0 + else: + last_number_of_flows_when_trained = int(last_number_of_flows_when_trained) + + # We get all the flows so far + flows = self.db.get_all_flows() + # Only process new flows since last training + new_flows = flows[last_number_of_flows_when_trained:] + + # Check how many **different** labels are in the DB + labels = self.db.get_labels() + if len(labels) == 1: + # Insert fake flows for both classes if needed + new_flows.append( + { + "starttime": 1594417039.029793, + "dur": "1.9424750804901123", + "saddr": "10.7.10.101", + "sport": "49733", + "daddr": "40.70.224.145", + "dport": "443", + "proto": "tcp", + "state": "SF", + "spkts": 17, + "dpkts": 27, + "sbytes": 25517, + "dbytes": 17247, + "appproto": "ssl", + "ground_truth_label": "Malicious", + "module_labels": { + "flowalerts-long-connection": "Malicious" + }, + } + ) + new_flows.append( + { + "starttime": 1382355032.706468, + "dur": "10.896695", + "saddr": "147.32.83.52", + "sport": "47956", + "daddr": "80.242.138.72", + "dport": "80", + "proto": "tcp", + "state": "SF", + "spkts": 1, + "dpkts": 0, + "sbytes": 100, + "dbytes": 67596, + "appproto": "http", + "ground_truth_label": "Benign", + "module_labels": { + "flowalerts-long-connection": "Benign" + }, + } + ) + + # Convert to pandas df + df_flows = pd.DataFrame(new_flows) + + # Process features + df_flows = self.process_features(df_flows) + + # Update the flow to the processed version + self.flows = df_flows + except Exception: + self.print("Error in process_flows()") + self.print(traceback.format_exc(), 0, 1) + + def process_flow(self, flow_to_process: dict): + """ + Process one flow. Only used during detection in testing + returns the pandas df with the processed flow + """ + try: + # Convert the flow to a pandas dataframe + raw_flow = pd.DataFrame(flow_to_process, index=[0]) + dflow = self.process_features(raw_flow) + if dflow.empty: + return None + # Update the flow to the processed version + return dflow + except Exception: + # Stop the timer + self.print("Error in process_flow()") + self.print(traceback.format_exc(), 0, 1) + + def detect(self, x_flow) -> Optional[numpy.ndarray]: + """ + Detects the given flow with the current model stored + and returns the predection array + """ + try: + # clean the flow + fields_to_drop = [ + "label", + "module_labels", + "uid", + "history", + "dir_", + "endtime", + "flow_source", + "ground_truth_label", + "detailed_ground_truth_label", + ] + # For argus binetflows this fails because ther is a field calle bytes that was not in other flows. It should be called allbytes. + # Error + ''' [Flow ML Detection] Error in detect() while processing + dur proto sport dport state pkts spkts dpkts bytes sbytes dbytes allbytes + 0 63.822830 0 56119 981 0.0 15 15 0 8764 1887 0 1887 + The feature names should match those that were passed during fit. + Feature names unseen at fit time: + - bytes + ''' + + # IF we delete here the filed bytes the error is + # [Flow ML Detection] Error in detect() while processing + # dur proto sport dport state pkts spkts dpkts sbytes dbytes allbytes + # 0 63.822830 0 56120 980 0.0 15 15 0 1887 0 1887 + # The feature names should match those that were passed during fit. + # Feature names must be in the same order as they were in fit. 
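            # A minimal sketch (not this patch's fix) of one way out of the
            # feature-name mismatch described above: save the exact column
            # order used at fit time and force every test row into it.
            # `train_columns` is hypothetical here - it would have to be
            # stored next to the model during training:
            #
            #   x_flow = x_flow.reindex(columns=train_columns, fill_value=0)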
+ + for field in fields_to_drop: + try: + x_flow = x_flow.drop(field, axis=1) + except (KeyError, ValueError): + pass + # Scale the flow + x_flow: numpy.ndarray = self.scaler.transform(x_flow) + pred: numpy.ndarray = self.clf.predict(x_flow) + return pred + except Exception as e: + self.print( + f"Error in detect() while processing " f"\n{x_flow}\n{e}" + ) + self.print(traceback.format_exc(), 0, 1) + + def store_model(self): + """ + Store the trained model on disk + """ + self.print("Storing the trained model and scaler on disk.", 0, 2) + with open(self.model_path, "wb") as f: + data = pickle.dumps(self.clf) + f.write(data) + with open(self.scaler_path, "wb") as g: + data = pickle.dumps(self.scaler) + g.write(data) + + def read_model(self): + """ + Read the trained model from disk + """ + try: + self.print("Reading the trained model from disk.", 0, 2) + with open(self.model_path, "rb") as f: + self.clf = pickle.load(f) + self.print("Reading the trained scaler from disk.", 0, 2) + with open(self.scaler_path, "rb") as g: + self.scaler = pickle.load(g) + except FileNotFoundError: + # If there is no model, create one empty + self.print( + "There was no model. " "Creating a new empty model.", 0, 2 + ) + self.clf = SGDClassifier( + warm_start=True, loss="hinge", penalty="l1" + ) + except EOFError: + self.print( + "Error reading model from disk. " + "Creating a new empty model.", + 0, + 2, + ) + self.clf = SGDClassifier( + warm_start=True, loss="hinge", penalty="l1" + ) + + def set_evidence_malicious_flow(self, flow: dict, twid: str): + confidence: float = 0.1 + description = ( + f"Flow with malicious characteristics by ML. Src IP" + f" {flow['saddr']}:{flow['sport']} to " + f"{flow['daddr']}:{flow['dport']}" + ) + twid_number = int(twid.replace("timewindow", "")) + evidence: Evidence = Evidence( + evidence_type=EvidenceType.MALICIOUS_FLOW, + attacker=Attacker( + direction=Direction.SRC, + ioc_type=IoCType.IP, + value=flow["saddr"], + ), + victim=Victim( + direction=Direction.DST, + ioc_type=IoCType.IP, + value=flow["daddr"], + ), + threat_level=ThreatLevel.LOW, + confidence=confidence, + description=description, + profile=ProfileID(ip=flow["saddr"]), + timewindow=TimeWindow(twid_number), + uid=[flow["uid"]], + timestamp=flow["starttime"], + method=Method.AI, + src_port=flow["sport"], + dst_port=flow["dport"], + ) + + self.db.set_evidence(evidence) + + def shutdown_gracefully(self): + # Confirm that the module is done processing + if self.mode == "train": + self.store_model() + + def pre_main(self): + utils.drop_root_privs() + # Load the model + self.read_model() + + def main(self): + if msg := self.get_msg("new_flow"): + # When a new flow arrives + msg = json.loads(msg["data"]) + self.twid = msg["twid"] + self.profileid = msg["profileid"] + self.flow = msg["flow"] + # These following extra fields are expected in testing. update the original + # flow dict to have them + self.flow.update( + { + "state": msg["interpreted_state"], + "label": msg["label"], + "module_labels": msg["module_labels"], + } + ) + + if self.mode == "train": + # We are training + + # Is the amount in the DB of labels enough to retrain? 
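                # The gating implemented below, written out as a tiny
                # standalone sketch with this module's defaults (start at 50
                # labels, retrain every further 50); should_retrain is a
                # hypothetical helper, not part of this patch:
                #
                #   def should_retrain(total, last_trained, start_min=50, step=50):
                #       return total >= start_min and (total - last_trained) >= step
                #
                #   assert should_retrain(50, 0)        # enough labels to start
                #   assert not should_retrain(80, 50)   # only 30 new labels
                #   assert should_retrain(100, 50)      # 50 new labels, retrain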
+ # Use labeled flows + labels = self.db.get_labels() + sum_labeled_flows = sum(i[1] for i in labels) + + # The min labels to retrain is the min number of flows + # we should have seen so far in this capture to start training + # This is so we dont _start_ training with only 1 flow + + # Once we are over the start minimum, the second condition is + # to force to retrain every a minimum_labels_to_retrain number + # of flows. So we dont retrain every 1 flow. + if ( + sum_labeled_flows >= self.minimum_labels_to_start_train + ): + if (sum_labeled_flows - self.last_number_of_flows_when_trained >= self.minimum_labels_to_retrain): + # So for example we retrain every 50 labels and only when + # we have at least 50 labels + self.print( + f"Training the model with the last group of " + f"flows and labels. Total flows: {sum_labeled_flows}." + ) + # Process all flows in the DB and make them ready + # for pandas + self.process_training_flows(self.last_number_of_flows_when_trained) + # Train an algorithm + self.train(sum_labeled_flows) + self.last_number_of_flows_when_trained = sum_labeled_flows + + elif self.mode == "test": + # We are testing, which means using the model to detect + processed_flow = self.process_flow(self.flow) + + # After processing the flow, it may happen that we + # delete icmp/arp/etc so the dataframe can be empty + if processed_flow is not None and not processed_flow.empty: + # Predict + pred: numpy.ndarray = self.detect(processed_flow) + if not pred: + # an error occurred + return + + label = self.flow["label"] + if label and label != "unknown" and label != pred[0]: + # If the user specified a label in test mode, + # and the label is diff from the prediction, + # print in debug mode + self.print( + f"Predicted {pred[0]} for ground-truth label" + f' {label}. 
Flow {self.flow["saddr"]}:'
+                        f'{self.flow["sport"]} ->'
+                        f' {self.flow["daddr"]}:'
+                        f'{self.flow["dport"]}/'
+                        f'{self.flow["proto"]}',
+                        0,
+                        3,
+                    )
+                if pred[0] == "Malicious":
+                    # Generate an alert
+                    self.set_evidence_malicious_flow(self.flow, self.twid)
+                    self.print(
+                        f"Prediction {pred[0]} for label {label}"
+                        f' flow {self.flow["saddr"]}:'
+                        f'{self.flow["sport"]} -> '
+                        f'{self.flow["daddr"]}:'
+                        f'{self.flow["dport"]}/'
+                        f'{self.flow["proto"]}',
+                        0,
+                        2,
+                    )
\ No newline at end of file

From 3da80024964515c0df1aee115d68a9c73cba1c7e Mon Sep 17 00:00:00 2001
From: Seba Garcia
Date: Sat, 3 May 2025 18:31:43 +0000
Subject: [PATCH 482/498] Fix plot test

---
 modules/flowmldetection/plot_testing_performance.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/modules/flowmldetection/plot_testing_performance.py b/modules/flowmldetection/plot_testing_performance.py
index 1b4152c6e..977a68b2d 100644
--- a/modules/flowmldetection/plot_testing_performance.py
+++ b/modules/flowmldetection/plot_testing_performance.py
@@ -75,7 +75,7 @@ def plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accu
     plot_single_group(close_to_0, 'performance_metrics_testing_close_to_0.png', is_close_to_0=True)
 
     # Plot metrics for values close to 1 (log scale)
-    plot_single_group(close_to_1, 'performnace_metrics_teting_close_to_1.png', is_close_to_0=False)
+    plot_single_group(close_to_1, 'performance_metrics_testing_close_to_1.png', is_close_to_0=False)
 
     # Print the final values
     print("\nFinal Metric Values:")
@@ -123,10 +123,10 @@ def plot_single_group(metrics_dict, output_filename, is_close_to_0=False):
 
     # Avoid log(0), so set the minimum limit a little higher than zero
     if min_val == 0:
-        min_val = 1e-8  # Avoid zero values on the logarithmic scale
+        min_val = 1e-4  # Avoid zero values on the logarithmic scale
     plt.ylim(min_val, max_val)  # Set Y-axis limits based on the data range
-    plt.yticks(np.logspace(np.log10(min_val), np.log10(max_val), num=60))  # Set ticks logarithmically
+    plt.yticks(np.logspace(np.log10(min_val), np.log10(max_val), num=6))  # Set ticks logarithmically
 
     plt.xlabel('Index')
     plt.ylabel('Metric Value')

From d4e2666af9c2454ebbffd2dbc7f338c99bfc63a5 Mon Sep 17 00:00:00 2001
From: Seba Garcia
Date: Sat, 3 May 2025 18:50:33 +0000
Subject: [PATCH 483/498] Add testing code to evaluate performance. It is
 optional with a variable

---
 modules/flowmldetection/flowmldetection.py | 60 +++++++++++++-------
 1 file changed, 42 insertions(+), 18 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 5e4e9aa46..b17a1baaf 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -526,36 +526,21 @@ def main(self):
         elif self.mode == "test":
             # We are testing, which means using the model to detect
             processed_flow = self.process_flow(self.flow)
-
             # After processing the flow, it may happen that we
             # delete icmp/arp/etc so the dataframe can be empty
             if processed_flow is not None and not processed_flow.empty:
+                original_label = processed_flow["ground_truth_label"].iloc[0]
                 # Predict
                 pred: numpy.ndarray = self.detect(processed_flow)
                 if not pred:
                     # an error occurred
                     return
 
-                label = self.flow["label"]
-                if label and label != "unknown" and label != pred[0]:
-                    # If the user specified a label in test mode,
-                    # and the label is diff from the prediction,
-                    # print in debug mode
-                    self.print(
-                        f"Predicted {pred[0]} for ground-truth label"
-                        f' {label}. Flow {self.flow["saddr"]}:'
-                        f'{self.flow["sport"]} ->'
-                        f' {self.flow["daddr"]}:'
-                        f'{self.flow["dport"]}/'
-                        f'{self.flow["proto"]}',
-                        0,
-                        3,
-                    )
                 if pred[0] == "Malicious":
                     # Generate an alert
                     self.set_evidence_malicious_flow(self.flow, self.twid)
                     self.print(
-                        f"Prediction {pred[0]} for label {label}"
+                        f"Prediction {pred[0]} for label {original_label}"
                         f' flow {self.flow["saddr"]}:'
                         f'{self.flow["sport"]} -> '
                         f'{self.flow["daddr"]}:'
@@ -563,4 +548,43 @@ def main(self):
                         f'{self.flow["proto"]}',
                         0,
                         2,
-                    )
\ No newline at end of file
+                    )
+
+                    # So you can disable this code easily, since it is used only for evaluating the testing
+                    log_testing_data = True
+                    if log_testing_data:
+                        # Initialize counters if not already done
+                        if not hasattr(self, 'tp'):
+                            self.tp = 0
+                        if not hasattr(self, 'tn'):
+                            self.tn = 0
+                        if not hasattr(self, 'fp'):
+                            self.fp = 0
+                        if not hasattr(self, 'fn'):
+                            self.fn = 0
+
+                        # Update counters based on predictions and labels
+                        if pred[0] == "Malicious" and original_label == "Malicious":
+                            self.tp += 1
+                        elif pred[0] == "Benign" and original_label == "Benign":
+                            self.tn += 1
+                        elif pred[0] == "Malicious" and original_label == "Benign":
+                            self.fp += 1
+                        elif pred[0] == "Benign" and original_label == "Malicious":
+                            self.fn += 1
+
+                        testing_log_path = "./modules/flowmldetection/testing_performance.log"
+                        try:
+                            with open(testing_log_path, "a") as log_file:
+                                log_file.write("Testing Performance Log Initialized\n")
+                                # Log the testing performance metrics
+                                log_file.write(f"TP: {self.tp}, TN: {self.tn}, FP: {self.fp}, FN: {self.fn}\n")
+
+                                # Log the original flow for false positives and false negatives
+                                if pred[0] == "Malicious" and original_label == "Benign":
+                                    log_file.write(f"False Positive Flow: {self.flow}\n")
+                                elif pred[0] == "Benign" and original_label == "Malicious":
+                                    log_file.write(f"False Negative Flow: {self.flow}\n")
+                        except Exception as e:
+                            self.print(f"Error initializing testing performance log: {e}", 0, 1)
\ No newline at end of file
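An aside between these two patches: the TP/TN/FP/FN bookkeeping that patch 483
introduces above reduces to a small pure function. This is an illustrative
sketch only (the update_counts helper is hypothetical, not part of the
module), assuming the labels are exactly the "Malicious"/"Benign" strings the
module uses:

    def update_counts(counts: dict, pred: str, truth: str) -> dict:
        # Map (prediction, ground truth) to the confusion-matrix cell it hits
        key = {
            ("Malicious", "Malicious"): "tp",
            ("Benign", "Benign"): "tn",
            ("Malicious", "Benign"): "fp",
            ("Benign", "Malicious"): "fn",
        }.get((pred, truth))
        if key is not None:
            counts[key] += 1
        return counts

    counts = {"tp": 0, "tn": 0, "fp": 0, "fn": 0}
    update_counts(counts, "Malicious", "Benign")  # a false positive
    assert counts["fp"] == 1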
From 5d2d84a80cf2a77f160bc5cb16a46ae9700ff9a0 Mon Sep 17 00:00:00 2001
From: Seba Garcia
Date: Sat, 3 May 2025 19:04:00 +0000
Subject: [PATCH 484/498] Fix plots

---
 .../plot_testing_performance.py | 30 +++++++++++--------
 1 file changed, 18 insertions(+), 12 deletions(-)

diff --git a/modules/flowmldetection/plot_testing_performance.py b/modules/flowmldetection/plot_testing_performance.py
index 977a68b2d..6865415cd 100644
--- a/modules/flowmldetection/plot_testing_performance.py
+++ b/modules/flowmldetection/plot_testing_performance.py
@@ -1,6 +1,7 @@
 import matplotlib.pyplot as plt
 import sys
 import numpy as np
+import argparse
 
 def process_file(file_path):
     # Initialize the counters for the values
@@ -49,7 +50,7 @@ def process_file(file_path):
 
     return FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values
 
-def plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values):
+def plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values, experiment_number):
     # Separate the values into two groups based on their proximity to 0 or 1
     close_to_0 = {
         'FPR': [], 'FNR': []
@@ -72,13 +73,13 @@ def plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accu
         close_to_1['recall'].append(recall_values[i])
 
     # Plot metrics for values close to 0 (linear scale)
-    plot_single_group(close_to_0, 
'performance_metrics_testing_close_to_0.png', is_close_to_0=True) + plot_single_group(close_to_0, f'performance_metrics_testing_close_to_0_experiment_{experiment_number}.png', experiment_number, is_close_to_0=True) # Plot metrics for values close to 1 (log scale) - plot_single_group(close_to_1, 'performance_metrics_testing_close_to_1.png', is_close_to_0=False) + plot_single_group(close_to_1, f'performance_metrics_testing_close_to_1_experiment_{experiment_number}.png', experiment_number, is_close_to_0=False) # Print the final values - print("\nFinal Metric Values:") + print("\nFinal Metric Values for Experiment", experiment_number) print(f"Final FPR: {FPR_values[-1]:.4f}") print(f"Final FNR: {FNR_values[-1]:.4f}") print(f"Final TNR: {TNR_values[-1]:.4f}") @@ -89,7 +90,7 @@ def plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accu print(f"Final MCC: {MCC_values[-1]:.4f}") print(f"Final Recall: {recall_values[-1]:.4f}") -def plot_single_group(metrics_dict, output_filename, is_close_to_0=False): +def plot_single_group(metrics_dict, output_filename, experiment_number, is_close_to_0=False): plt.figure(figsize=(12, 8)) # Only plot the metrics that exist in the dictionary @@ -126,11 +127,12 @@ def plot_single_group(metrics_dict, output_filename, is_close_to_0=False): min_val = 1e-4 # Avoid zero values on the logarithmic scale plt.ylim(min_val, max_val) # Set Y-axis limits based on the data range - plt.yticks(np.logspace(np.log10(min_val), np.log10(max_val), num=6)) # Set ticks logarithmically + plt.yticks(np.logspace(np.log10(min_val), np.log10(max_val), num=60)) # Set ticks logarithmically + # Add the experiment number to the plot title plt.xlabel('Index') plt.ylabel('Metric Value') - plt.title(f'Evaluation Metrics Over Time ({output_filename.split("_")[2].replace(".png", "")})') + plt.title(f'Experiment {experiment_number} - Evaluation Metrics Over Time') plt.legend() # Save the plot @@ -138,14 +140,18 @@ def plot_single_group(metrics_dict, output_filename, is_close_to_0=False): plt.close() def main(): - if len(sys.argv) != 2: - print("Usage: python script.py ") - sys.exit(1) + # Set up argument parsing + parser = argparse.ArgumentParser(description='Plot testing performance metrics.') + parser.add_argument('-f', '--file', type=str, required=True, help='Path to the testing performance log file') + parser.add_argument('-e', '--experiment', type=str, required=True, help='Experiment number') + + args = parser.parse_args() - file_path = sys.argv[1] + file_path = args.file + experiment_number = args.experiment FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values = process_file(file_path) - plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values) + plot_metrics(FPR_values, FNR_values, TNR_values, TPR_values, F1_values, accuracy_values, precision_values, MCC_values, recall_values, experiment_number) if __name__ == "__main__": main() From e400c0354f3c7ce82739100a48e394c026b02514 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 19:14:51 +0000 Subject: [PATCH 485/498] Fix train plot --- .../flowmldetection/plot_train_performance.py | 22 ++++++++++--------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/modules/flowmldetection/plot_train_performance.py b/modules/flowmldetection/plot_train_performance.py index 80e13e951..244df13d2 100644 --- a/modules/flowmldetection/plot_train_performance.py +++ 
b/modules/flowmldetection/plot_train_performance.py @@ -5,7 +5,7 @@ import argparse import os -def plot_log_data(file_path): +def plot_log_data(file_path, experiment_number): # Read the log data from the file with open(file_path, 'r') as file: log_data = file.read() @@ -28,7 +28,8 @@ def plot_log_data(file_path): # Get the directory of the log file to store the plot in the same folder dir_name = os.path.dirname(file_path) - plot_file = os.path.join(dir_name, 'performance_metrics_training.png') + # Append experiment number to the filename + plot_file = os.path.join(dir_name, f'performance_metrics_training_{experiment_number}.png') # Plotting the values fig, ax1 = plt.subplots(figsize=(10, 6)) @@ -55,18 +56,18 @@ def plot_log_data(file_path): for i, value in enumerate(df["Total labels"]): ax1.text(i, value, f'{value:.1f}', color='tab:gray', fontsize=8, ha='center', va='bottom') - # Adding title and legend - plt.title('Training performance') + # Adding title and legend with experiment number in title + plt.title(f'Training performance - Experiment {experiment_number}') fig.tight_layout() # Move both legends further to the right - ax1.legend(loc='upper right', bbox_to_anchor=(1.26, 1), fontsize='small', ncol=1) - ax2.legend(loc='upper right', bbox_to_anchor=(1.4, 0.95), fontsize='small', ncol=1) + ax1.legend(loc='upper right', bbox_to_anchor=(1.3, 1), fontsize='small', ncol=1) + ax2.legend(loc='upper right', bbox_to_anchor=(1.3, 0.85), fontsize='small', ncol=1) # Increase right margin for better readability of legend - plt.subplots_adjust(right=0.7) + plt.subplots_adjust(right=0.75) - # Save plot to the same folder as the log file + # Save plot to the same folder as the log file with experiment number in filename plt.savefig(plot_file) # Display the plot @@ -75,13 +76,14 @@ def plot_log_data(file_path): def main(): # Parse command-line arguments parser = argparse.ArgumentParser(description="Process a log file and plot the data with two y-axes.") - parser.add_argument('log_file', metavar='log_file', type=str, help="Path to the log file") + parser.add_argument('-f', '--file', metavar='log_file', type=str, required=True, help="Path to the log file") + parser.add_argument('-e', '--experiment', metavar='experiment_number', type=str, required=True, help="Experiment number to add to the filename") # Handle -h / --help args = parser.parse_args() # Call the function to process the log file - plot_log_data(args.log_file) + plot_log_data(args.file, args.experiment) if __name__ == "__main__": main() From 8983a7f529e987e11dc915513179f0b1620e3f64 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 21:14:48 +0000 Subject: [PATCH 486/498] Fix plots --- .../flowmldetection/plot_train_performance.py | 122 ++++++++++-------- 1 file changed, 71 insertions(+), 51 deletions(-) diff --git a/modules/flowmldetection/plot_train_performance.py b/modules/flowmldetection/plot_train_performance.py index 244df13d2..5212dfeea 100644 --- a/modules/flowmldetection/plot_train_performance.py +++ b/modules/flowmldetection/plot_train_performance.py @@ -4,85 +4,105 @@ import sys import argparse import os +import matplotlib.ticker as ticker def plot_log_data(file_path, experiment_number): # Read the log data from the file with open(file_path, 'r') as file: log_data = file.read() - # Define regex pattern to extract relevant data from each line - pattern = r"Background: (\d+). Benign: (\d+). Malicious: (\d+). Total labels: (\d+\.\d+). 
Score: (\d+\.\d+)" + # Regex pattern for the new log format + pattern = ( + r"Total labels: ([\d\.]+), Background: (\d+). Benign: (\d+). Malicious: (\d+). Metrics: " + r"FPR=([\d\.]+), TNR=([\d\.]+), TPR=([\d\.]+), FNR=([\d\.]+), " + r"F1=([\d\.]+), Precision=([\d\.]+), Accuracy=([\d\.]+), MCC=([\d\.]+), Recall=([\d\.]+)\." + ) # Parse the log file data = re.findall(pattern, log_data) # Convert data to a DataFrame - df = pd.DataFrame(data, columns=["Background", "Benign", "Malicious", "Total labels", "Score"]) + columns = [ + "Total labels", "Background", "Benign", "Malicious", + "FPR", "TNR", "TPR", "FNR", "F1", "Precision", "Accuracy", "MCC", "Recall" + ] + df = pd.DataFrame(data, columns=columns) df = df.astype({ + "Total labels": float, "Background": int, "Benign": int, "Malicious": int, - "Total labels": float, - "Score": float + "FPR": float, + "TNR": float, + "TPR": float, + "FNR": float, + "F1": float, + "Precision": float, + "Accuracy": float, + "MCC": float, + "Recall": float, }) - # Get the directory of the log file to store the plot in the same folder dir_name = os.path.dirname(file_path) - # Append experiment number to the filename - plot_file = os.path.join(dir_name, f'performance_metrics_training_{experiment_number}.png') - - # Plotting the values - fig, ax1 = plt.subplots(figsize=(10, 6)) - # Plotting Score on the left y-axis (with proper scaling from 0 to 1) - ax1.plot(df.index, df["Score"], label="Score", color='tab:blue') + # --- Plot 1: Number of labels (linear scale, no total labels) --- + fig1, ax1 = plt.subplots(figsize=(10, 6)) + ax1.plot(df.index, df["Background"], label="Background", color='black') + ax1.plot(df.index, df["Benign"], label="Benign", color='cyan') + ax1.plot(df.index, df["Malicious"], label="Malicious", color='magenta') ax1.set_xlabel('Index') - ax1.set_ylabel('Score', color='tab:blue') - ax1.set_ylim(0, 1) # Set y-axis for Score from 0 to 1 - ax1.tick_params(axis='y', labelcolor='tab:blue') - - # Create the second y-axis for the Background, Benign, Malicious - ax2 = ax1.twinx() - ax2.plot(df.index, df["Background"], label="Background Labels", color='tab:green', linestyle='--') - ax2.plot(df.index, df["Benign"], label="Benign Labels", color='tab:orange', linestyle='--') - ax2.plot(df.index, df["Malicious"], label="Malicious Labels", color='tab:pink', linestyle='--') - ax2.set_ylabel('Background, Benign, Malicious Labels', color='tab:red') - - # Set appropriate scale for right y-axis based on the data - ax2.set_ylim(0, df[["Background", "Benign", "Malicious"]].max().max()) - ax2.tick_params(axis='y', labelcolor='tab:red') - - # Annotating Total labels as text on the plot - for i, value in enumerate(df["Total labels"]): - ax1.text(i, value, f'{value:.1f}', color='tab:gray', fontsize=8, ha='center', va='bottom') - - # Adding title and legend with experiment number in title - plt.title(f'Training performance - Experiment {experiment_number}') - fig.tight_layout() + ax1.set_ylabel('Label Counts') + # No log scale here + ax1.set_title(f'Label Counts - Experiment {experiment_number}') + ax1.legend() + ax1.yaxis.set_major_locator(ticker.MaxNLocator(70)) + plt.tight_layout() + plt.savefig(os.path.join(dir_name, f'performance_metrics_training_0_labels.png')) + + # --- Plot 2: FNR and FPR (log scale) --- + fig2, ax2 = plt.subplots(figsize=(10, 6)) + ax2.plot(df.index, df["FNR"], label="FNR", color='red') + ax2.plot(df.index, df["FPR"], label="FPR", color='blue') + ax2.set_xlabel('Index') + ax2.set_ylabel('Rate') + ax2.set_yscale('log') + ax2.set_title(f'FNR 
and FPR - Experiment {experiment_number}') + ax2.legend() + ax2.yaxis.set_major_locator(ticker.MaxNLocator(100)) + plt.tight_layout() + plt.savefig(os.path.join(dir_name, f'performance_metrics_training_0_fnr_fpr.png')) + + # --- Plot 3: Other metrics (log scale) --- + fig3, ax3 = plt.subplots(figsize=(12, 7)) + metrics_rest = ["TNR", "TPR", "F1", "Precision", "Accuracy", "MCC", "Recall"] + colors_rest = [ + 'tab:blue', 'tab:green', 'tab:purple', 'tab:brown', + 'tab:gray', 'tab:pink', 'tab:olive' + ] + for metric, color in zip(metrics_rest, colors_rest): + ax3.plot(df.index, df[metric], label=metric, color=color) + ax3.set_xlabel('Index') + ax3.set_ylabel('Metric Value') + ax3.set_yscale('log') + ax3.set_title(f'Performance Metrics (except FNR/FPR) - Experiment {experiment_number}') + ax3.legend() + ax3.yaxis.set_major_locator(ticker.MaxNLocator(50)) + plt.tight_layout() + plt.savefig(os.path.join(dir_name, f'performance_metrics_training_0_other_metrics.png')) - # Move both legends further to the right - ax1.legend(loc='upper right', bbox_to_anchor=(1.3, 1), fontsize='small', ncol=1) - ax2.legend(loc='upper right', bbox_to_anchor=(1.3, 0.85), fontsize='small', ncol=1) - - # Increase right margin for better readability of legend - plt.subplots_adjust(right=0.75) - - # Save plot to the same folder as the log file with experiment number in filename - plt.savefig(plot_file) - - # Display the plot plt.show() + # --- Print final values in terminal --- + print("\nFinal values at last training step:") + for col in ["Total labels", "Background", "Benign", "Malicious", + "FPR", "TNR", "TPR", "FNR", "F1", "Precision", "Accuracy", "MCC", "Recall"]: + print(f"{col}: {df[col].iloc[-1]}") + def main(): - # Parse command-line arguments parser = argparse.ArgumentParser(description="Process a log file and plot the data with two y-axes.") parser.add_argument('-f', '--file', metavar='log_file', type=str, required=True, help="Path to the log file") parser.add_argument('-e', '--experiment', metavar='experiment_number', type=str, required=True, help="Experiment number to add to the filename") - - # Handle -h / --help args = parser.parse_args() - - # Call the function to process the log file plot_log_data(args.file, args.experiment) if __name__ == "__main__": From 4cca7685112dc012940248c7e647a56806fb5b83 Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sat, 3 May 2025 21:16:01 +0000 Subject: [PATCH 487/498] Add performance metrics to the training evaluation --- modules/flowmldetection/flowmldetection.py | 58 +++++++++++++++++----- 1 file changed, 46 insertions(+), 12 deletions(-) diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index b17a1baaf..2c60cd403 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -10,7 +10,16 @@ import json import traceback import warnings -import os +from sklearn.metrics import classification_report, confusion_matrix +from sklearn.metrics import ( + confusion_matrix, + f1_score, + precision_score, + accuracy_score, + matthews_corrcoef, + recall_score, +) + from slips_files.common.parsers.config_parser import ConfigParser from slips_files.common.slips_utils import utils @@ -86,21 +95,21 @@ def write_to_training_log(self, message: str): except Exception as e: self.print(f"Error writing to training log: {e}", 0, 1) - def train(self, sum_labeled_flows): + def train(self, sum_labeled_flows, last_number_of_flows_when_trained): """ Train a model based on the flows we receive and the 
labels """ try: + # Create y_flow with the label + y_flow = numpy.full(self.flows.shape[0], self.flows.ground_truth_label) # Create X_flow with the current flows minus the label X_flow = self.flows.drop("ground_truth_label", axis=1) # Drop the detailed labels X_flow = X_flow.drop("detailed_ground_truth_label", axis=1) # Drop the module_labels X_flow = X_flow.drop("module_labels", axis=1) - # Create y_flow with the label - y_flow = numpy.full(X_flow.shape[0], self.flows.ground_truth_label) - # Normalize this batch of data so far. This can get progressivle slow + # Normalize this batch of data so far. This can get progressively slow X_flow = self.scaler.fit_transform(X_flow) # Count the number of labels of each type in this epoc @@ -120,18 +129,43 @@ def train(self, sum_labeled_flows): self.print("Error while calling clf.train()") self.print(traceback.format_exc(), 0, 1) - # See score so far in training - score = self.clf.score(X_flow, y_flow) + # Predict on the training data + y_pred = self.clf.predict(X_flow) - #self.print(f" Training Score: {score}", 1, 0) - #self.print(f' Model Parameters: {self.clf.coef_}', 1, 0) + # For metrics, let's focus on Malicious vs Benign (ignore Background) + mask = (y_flow == "Malicious") | (y_flow == "Benign") + y_true_bin = y_flow[mask] + y_pred_bin = y_pred[mask] + + # Map to binary: Malicious=1, Benign=0 + y_true_bin = numpy.where(y_true_bin == "Malicious", 1, 0) + y_pred_bin = numpy.where(y_pred_bin == "Malicious", 1, 0) + + # Compute confusion matrix: tn, fp, fn, tp + tn, fp, fn, tp = confusion_matrix(y_true_bin, y_pred_bin, labels=[0,1]).ravel() if len(set(y_true_bin)) > 1 else (0,0,0,0) + + # Compute metrics + FPR = fp / (fp + tn) if (fp + tn) > 0 else 0 + TNR = tn / (tn + fp) if (tn + fp) > 0 else 0 + TPR = tp / (tp + fn) if (tp + fn) > 0 else 0 + FNR = fn / (fn + tp) if (fn + tp) > 0 else 0 + F1 = f1_score(y_true_bin, y_pred_bin, zero_division=0) + PREC = precision_score(y_true_bin, y_pred_bin, zero_division=0) + ACCU = accuracy_score(y_true_bin, y_pred_bin) + MCC = matthews_corrcoef(y_true_bin, y_pred_bin) if len(set(y_true_bin)) > 1 else 0 + RECALL = recall_score(y_true_bin, y_pred_bin, zero_division=0) # Store the models on disk self.store_model() # Log training information - self.write_to_training_log(f"Training completed. Background: {epoch_label_counts['Background']}. Benign: {epoch_label_counts['Benign']}. Malicious: {epoch_label_counts['Malicious']}. Total labels: {sum_labeled_flows}. Score: {score}") - #self.write_to_training_log(f"Model parameters: {self.clf.coef_}") + self.write_to_training_log( + f"Total labels: {sum_labeled_flows}, " + f"Background: {epoch_label_counts['Background']}. " + f"Benign: {epoch_label_counts['Benign']}. Malicious: {epoch_label_counts['Malicious']}. " + f"Metrics: FPR={FPR:.4f}, TNR={TNR:.4f}, TPR={TPR:.4f}, FNR={FNR:.4f}, " + f"F1={F1:.4f}, Precision={PREC:.4f}, Accuracy={ACCU:.4f}, MCC={MCC:.4f}, Recall={RECALL:.4f}." 
+ ) except Exception: self.print("Error in train().", 0, 1) self.print(traceback.format_exc(), 0, 1) @@ -520,7 +554,7 @@ def main(self): # for pandas self.process_training_flows(self.last_number_of_flows_when_trained) # Train an algorithm - self.train(sum_labeled_flows) + self.train(sum_labeled_flows, self.last_number_of_flows_when_trained) self.last_number_of_flows_when_trained = sum_labeled_flows elif self.mode == "test": From addd26bc0cf43e5426fd63b5dd73962c78b898dd Mon Sep 17 00:00:00 2001 From: Seba Garcia Date: Sun, 4 May 2025 12:50:46 +0000 Subject: [PATCH 488/498] Fix experiment names --- modules/flowmldetection/plot_train_performance.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/modules/flowmldetection/plot_train_performance.py b/modules/flowmldetection/plot_train_performance.py index 5212dfeea..304f0f4ea 100644 --- a/modules/flowmldetection/plot_train_performance.py +++ b/modules/flowmldetection/plot_train_performance.py @@ -52,12 +52,12 @@ def plot_log_data(file_path, experiment_number): ax1.plot(df.index, df["Malicious"], label="Malicious", color='magenta') ax1.set_xlabel('Index') ax1.set_ylabel('Label Counts') - # No log scale here ax1.set_title(f'Label Counts - Experiment {experiment_number}') ax1.legend() ax1.yaxis.set_major_locator(ticker.MaxNLocator(70)) + ax1.xaxis.set_major_locator(ticker.MaxNLocator(50)) plt.tight_layout() - plt.savefig(os.path.join(dir_name, f'performance_metrics_training_0_labels.png')) + plt.savefig(os.path.join(dir_name, f'performance_metrics_training_{experiment_number}_labels.png')) # --- Plot 2: FNR and FPR (log scale) --- fig2, ax2 = plt.subplots(figsize=(10, 6)) @@ -69,8 +69,9 @@ def plot_log_data(file_path, experiment_number): ax2.set_title(f'FNR and FPR - Experiment {experiment_number}') ax2.legend() ax2.yaxis.set_major_locator(ticker.MaxNLocator(100)) + ax2.xaxis.set_major_locator(ticker.MaxNLocator(50)) plt.tight_layout() - plt.savefig(os.path.join(dir_name, f'performance_metrics_training_0_fnr_fpr.png')) + plt.savefig(os.path.join(dir_name, f'performance_metrics_training_{experiment_number}_fnr_fpr.png')) # --- Plot 3: Other metrics (log scale) --- fig3, ax3 = plt.subplots(figsize=(12, 7)) @@ -87,8 +88,9 @@ def plot_log_data(file_path, experiment_number): ax3.set_title(f'Performance Metrics (except FNR/FPR) - Experiment {experiment_number}') ax3.legend() ax3.yaxis.set_major_locator(ticker.MaxNLocator(50)) + ax3.xaxis.set_major_locator(ticker.MaxNLocator(50)) plt.tight_layout() - plt.savefig(os.path.join(dir_name, f'performance_metrics_training_0_other_metrics.png')) + plt.savefig(os.path.join(dir_name, f'performance_metrics_training_{experiment_number}_other_metrics.png')) plt.show() From 01a6450fcf21b60387711cf5d2dc55800aabd5dc Mon Sep 17 00:00:00 2001 From: alya Date: Mon, 5 May 2025 15:24:12 +0300 Subject: [PATCH 489/498] test_profiler: update unit tests --- tests/test_profiler.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/test_profiler.py b/tests/test_profiler.py index 36733d2b8..465bc5922 100644 --- a/tests/test_profiler.py +++ b/tests/test_profiler.py @@ -467,7 +467,6 @@ def test_read_configuration( mock_conf.local_whitelist_path.return_value = "path/to/whitelist" mock_conf.ts_format.return_value = "unixtimestamp" mock_conf.analysis_direction.return_value = "all" - mock_conf.label.return_value = "malicious" mock_conf.get_tw_width_as_float.return_value = 1.0 mock_conf.client_ips.return_value = ["192.168.1.1", "10.0.0.1"] @@ -476,7 +475,6 @@ def test_read_configuration( assert 
profiler.local_whitelist_path == "path/to/whitelist"
     assert profiler.timeformat == "unixtimestamp"
     assert profiler.analysis_direction == "all"
-    assert profiler.label == "malicious"
     assert profiler.width == 1.0
     assert profiler.client_ips == ["192.168.1.1", "10.0.0.1"]

From 99a276f9caae1a8621146209d2bfdefa756a0297 Mon Sep 17 00:00:00 2001
From: Seba Garcia
Date: Mon, 5 May 2025 16:43:05 +0000
Subject: [PATCH 490/498] Fix that the training and testing log files were
 appended instead of rewritten

---
 modules/flowmldetection/flowmldetection.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 2c60cd403..9a920b4e2 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -90,7 +90,7 @@ def write_to_training_log(self, message: str):
         Write a message to the training log file.
         """
         try:
-            with open(self.training_log_path, "a") as log_file:
+            with open(self.training_log_path, "w") as log_file:
                 log_file.write(message + "\n")
         except Exception as e:
             self.print(f"Error writing to training log: {e}", 0, 1)
@@ -610,8 +610,7 @@ def main(self):
 
             testing_log_path = "./modules/flowmldetection/testing_performance.log"
             try:
-                with open(testing_log_path, "a") as log_file:
-                    log_file.write("Testing Performance Log Initialized\n")
+                with open(testing_log_path, "w") as log_file:
                     # Log the testing performance metrics
                     log_file.write(f"TP: {self.tp}, TN: {self.tn}, FP: {self.fp}, FN: {self.fn}\n")

From cb22b3103a300fce293bf9ab34355d774f6a2b5d Mon Sep 17 00:00:00 2001
From: Seba Garcia
Date: Mon, 5 May 2025 22:45:16 +0000
Subject: [PATCH 491/498] Fix an issue of storing the new log files

---
 modules/flowmldetection/flowmldetection.py | 49 ++++++++++------------
 1 file changed, 23 insertions(+), 26 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 9a920b4e2..9139066f0 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -72,11 +72,19 @@ def init(self):
         self.scaler = StandardScaler()
         self.model_path = "./modules/flowmldetection/model.bin"
         self.scaler_path = "./modules/flowmldetection/scaler.bin"
-
-        # Initialize the training log file
-        self.training_log_path = "./modules/flowmldetection/training.log"
-        with open(self.training_log_path, "w") as log_file:
-            log_file.write("Training Log Initialized\n")
+        self.init_log_file()
+
+    def init_log_file(self):
+        """
+        Init the log file for training or testing
+        """
+        if self.mode == "train":
+            # Initialize the training log file
+            self.log_path = "./modules/flowmldetection/training.log"
+        elif self.mode == "test":
+            # Initialize the testing log file
+            self.log_path = "./modules/flowmldetection/testing.log"
+        self.log_file = open(self.log_path, "w")
 
     def read_configuration(self):
         conf = ConfigParser()
@@ -85,15 +93,14 @@ def read_configuration(self):
         # in case the flows do not have a label themselves
         self.label = conf.label()
 
-    def write_to_training_log(self, message: str):
+    def write_to_log(self, message: str):
         """
-        Write a message to the training log file.
+        Write a message to the local log file.
""" try: - with open(self.training_log_path, "w") as log_file: - log_file.write(message + "\n") + self.log_file.write(message + "\n") except Exception as e: - self.print(f"Error writing to training log: {e}", 0, 1) + self.print(f"Error writing to log: {e}", 0, 1) def train(self, sum_labeled_flows, last_number_of_flows_when_trained): """ @@ -159,7 +166,7 @@ def train(self, sum_labeled_flows, last_number_of_flows_when_trained): self.store_model() # Log training information - self.write_to_training_log( + self.write_to_log( f"Total labels: {sum_labeled_flows}, " f"Background: {epoch_label_counts['Background']}. " f"Benign: {epoch_label_counts['Benign']}. Malicious: {epoch_label_counts['Malicious']}. " @@ -169,7 +176,7 @@ def train(self, sum_labeled_flows, last_number_of_flows_when_trained): except Exception: self.print("Error in train().", 0, 1) self.print(traceback.format_exc(), 0, 1) - self.write_to_training_log("Error occurred during training.") + self.write_to_log("Error occurred during training.") def process_features(self, dataset): """ @@ -597,7 +604,6 @@ def main(self): if not hasattr(self, 'fn'): self.fn = 0 - # Update counters based on predictions and labels if pred[0] == "Malicious" and original_label == "Malicious": self.tp += 1 @@ -605,19 +611,10 @@ def main(self): self.tn += 1 elif pred[0] == "Malicious" and original_label == "Benign": self.fp += 1 + self.write_to_log(f"False Positive Flow: {self.flow}") elif pred[0] == "Benign" and original_label == "Malicious": self.fn += 1 + self.write_to_log(f"False Negative Flow: {self.flow}") - testing_log_path = "./modules/flowmldetection/testing_performance.log" - try: - with open(testing_log_path, "w") as log_file: - # Log the testing performance metrics - log_file.write(f"TP: {self.tp}, TN: {self.tn}, FP: {self.fp}, FN: {self.fn}\n") - - # Log the original flow for false positives and false negatives - if pred[0] == "Malicious" and original_label == "Benign": - log_file.write(f"False Positive Flow: {self.flow}\n") - elif pred[0] == "Benign" and original_label == "Malicious": - log_file.write(f"False Negative Flow: {self.flow}\n") - except Exception as e: - self.print(f"Error initializing testing performance log: {e}", 0, 1) \ No newline at end of file + # Log the testing performance metrics + self.write_to_log(f"TP: {self.tp}, TN: {self.tn}, FP: {self.fp}, FN: {self.fn}") \ No newline at end of file From e0cc7c2f946a8fb4db664bbbc42422e6c54458a7 Mon Sep 17 00:00:00 2001 From: alya Date: Sat, 10 May 2025 16:21:08 +0300 Subject: [PATCH 492/498] enable/ disable training and testing.log with a param in the config file --- .secrets.baseline | 6 +- config/slips.yaml | 3 + modules/flowmldetection/flowmldetection.py | 140 +++++++++++++------- modules/riskiq/riskiq.py | 2 +- modules/update_manager/update_manager.py | 2 +- slips_files/common/parsers/config_parser.py | 7 +- 6 files changed, 109 insertions(+), 51 deletions(-) diff --git a/.secrets.baseline b/.secrets.baseline index fc1ac4872..aa5615109 100644 --- a/.secrets.baseline +++ b/.secrets.baseline @@ -149,14 +149,14 @@ "filename": "config/slips.yaml", "hashed_secret": "4cac50cee3ad8e462728e711eac3e670753d5016", "is_verified": false, - "line_number": 223 + "line_number": 226 }, { "type": "Secret Keyword", "filename": "config/slips.yaml", "hashed_secret": "d033e22ae348aeb5660fc2140aec35850c4da997", "is_verified": false, - "line_number": 393 + "line_number": 396 } ], "dataset/test14-malicious-zeek-dir/http.log": [ @@ -7192,5 +7192,5 @@ } ] }, - "generated_at": "2025-05-08T14:51:28Z" + 
"generated_at": "2025-05-10T13:18:46Z" } diff --git a/config/slips.yaml b/config/slips.yaml index 1b73e7b54..ac2010e6b 100644 --- a/config/slips.yaml +++ b/config/slips.yaml @@ -213,6 +213,9 @@ flowmldetection: # You should have trained at least once with 'Normal' data and once with # 'Malicious' data in order for the test to work. mode: test + # creates an extra log file called training.log/testing.log in the + # ouptput dir with performance metrics depending on the mode. + create_performance_metrics_log_files: False ############################# virustotal: diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py index 9139066f0..2a515d0cf 100644 --- a/modules/flowmldetection/flowmldetection.py +++ b/modules/flowmldetection/flowmldetection.py @@ -10,9 +10,8 @@ import json import traceback import warnings -from sklearn.metrics import classification_report, confusion_matrix +from sklearn.metrics import confusion_matrix from sklearn.metrics import ( - confusion_matrix, f1_score, precision_score, accuracy_score, @@ -37,6 +36,7 @@ Method, ) + # This horrible hack is only to stop sklearn from printing those warnings def warn(*args, **kwargs): pass @@ -73,7 +73,7 @@ def init(self): self.model_path = "./modules/flowmldetection/model.bin" self.scaler_path = "./modules/flowmldetection/scaler.bin" self.init_log_file() - + def init_log_file(self): """ Init the log file for training or testing @@ -92,11 +92,16 @@ def read_configuration(self): # This is the global label in the configuration, # in case the flows do not have a label themselves self.label = conf.label() + self.enable_logs: bool = conf.create_performance_metrics_log_files() def write_to_log(self, message: str): """ - Write a message to the local log file. 
+        Write a message to the local log file if
+        create_performance_metrics_log_files is enabled in slips.yaml
         """
+        if not self.enable_logs:
+            return
+
         try:
             self.log_file.write(message + "\n")
         except Exception as e:
@@ -108,7 +113,9 @@ def train(self, sum_labeled_flows, last_number_of_flows_when_trained):
         """
         try:
             # Create y_flow with the label
-            y_flow = numpy.full(self.flows.shape[0], self.flows.ground_truth_label)
+            y_flow = numpy.full(
+                self.flows.shape[0], self.flows.ground_truth_label
+            )
             # Create X_flow with the current flows minus the label
             X_flow = self.flows.drop("ground_truth_label", axis=1)
             # Drop the detailed labels
@@ -130,7 +137,9 @@
             try:
                 # Online incremental learning
                 self.clf.partial_fit(
-                    X_flow, y_flow, classes=["Background", "Malicious", "Benign"]
+                    X_flow,
+                    y_flow,
+                    classes=["Background", "Malicious", "Benign"],
                 )
             except Exception:
                 self.print("Error while calling clf.train()")
@@ -149,7 +158,11 @@
             y_pred_bin = numpy.where(y_pred_bin == "Malicious", 1, 0)

             # Compute confusion matrix: tn, fp, fn, tp
-            tn, fp, fn, tp = confusion_matrix(y_true_bin, y_pred_bin, labels=[0,1]).ravel() if len(set(y_true_bin)) > 1 else (0,0,0,0)
+            tn, fp, fn, tp = (
+                confusion_matrix(y_true_bin, y_pred_bin, labels=[0, 1]).ravel()
+                if len(set(y_true_bin)) > 1
+                else (0, 0, 0, 0)
+            )

             # Compute metrics
             FPR = fp / (fp + tn) if (fp + tn) > 0 else 0
@@ -159,7 +172,11 @@
             F1 = f1_score(y_true_bin, y_pred_bin, zero_division=0)
             PREC = precision_score(y_true_bin, y_pred_bin, zero_division=0)
             ACCU = accuracy_score(y_true_bin, y_pred_bin)
-            MCC = matthews_corrcoef(y_true_bin, y_pred_bin) if len(set(y_true_bin)) > 1 else 0
+            MCC = (
+                matthews_corrcoef(y_true_bin, y_pred_bin)
+                if len(set(y_true_bin)) > 1
+                else 0
+            )
             RECALL = recall_score(y_true_bin, y_pred_bin, zero_division=0)

             # Store the models on disk
@@ -189,7 +206,8 @@ def process_features(self, dataset):
             for proto in to_discard:
                 dataset = dataset[dataset.proto != proto]

-            # If the proto is in the list to delete and there is only one flow, then the dataset will be empty
+            # If the proto is in the list to delete and there is only one flow,
+            # then the dataset will be empty
             if dataset.empty:
                 # DataFrame is empty now, so return empty
                 return dataset
@@ -295,7 +313,9 @@ def process_training_flows(self, last_number_of_flows_when_trained):
             if last_number_of_flows_when_trained is None:
                 last_number_of_flows_when_trained = 0
             else:
-                last_number_of_flows_when_trained = int(last_number_of_flows_when_trained)
+                last_number_of_flows_when_trained = int(
+                    last_number_of_flows_when_trained
+                )

             # We get all the flows so far
             flows = self.db.get_all_flows()
@@ -399,21 +419,21 @@ def detect(self, x_flow) -> Optional[numpy.ndarray]:
         ]
         # For argus binetflows this fails because there is a field called bytes that was not in other flows. It should be called allbytes.
         # Error
-        ''' [Flow ML Detection] Error in detect() while processing
-                dur  proto  sport  dport  state  pkts  spkts  dpkts  bytes  sbytes  dbytes  allbytes
-        0  63.822830      0  56119    981    0.0    15     15      0   8764    1887       0      1887
-        The feature names should match those that were passed during fit.
-        Feature names unseen at fit time:
-        - bytes
-        '''
+        """ [Flow ML Detection] Error in detect() while processing
+                dur  proto  sport  dport  state  pkts  spkts  dpkts  bytes  sbytes  dbytes  allbytes
+        0  63.822830      0  56119    981    0.0    15     15      0   8764    1887       0      1887
+        The feature names should match those that were passed during fit.
+        Feature names unseen at fit time:
+        - bytes
+        """

-        # If we delete here the field bytes the error is
-        # [Flow ML Detection] Error in detect() while processing
-        #    dur  proto  sport  dport  state  pkts  spkts  dpkts  sbytes  dbytes  allbytes
-        # 0  63.822830      0  56120    980    0.0    15     15      0    1887       0      1887
-        # The feature names should match those that were passed during fit.
-        # Feature names must be in the same order as they were in fit.
-
+        # If we delete here the field bytes the error is
+        # [Flow ML Detection] Error in detect() while processing
+        #    dur  proto  sport  dport  state  pkts  spkts  dpkts  sbytes  dbytes  allbytes
+        # 0  63.822830      0  56120    980    0.0    15     15      0    1887       0      1887
+        # The feature names should match those that were passed during fit.
+        # Feature names must be in the same order as they were in fit.
+
         for field in fields_to_drop:
             try:
                 x_flow = x_flow.drop(field, axis=1)
@@ -540,17 +560,19 @@ def main(self):
                 labels = self.db.get_labels()
                 sum_labeled_flows = sum(i[1] for i in labels)

-                # The min labels to retrain is the min number of flows
+                # The min labels to retrain is the min number of flows
                 # we should have seen so far in this capture to start training
                 # This is so we don't _start_ training with only 1 flow
-                # Once we are over the start minimum, the second condition is
+                # Once we are over the start minimum, the second condition is
                 # to force a retrain every minimum_labels_to_retrain number
                 # of flows. So we don't retrain on every single flow.
-                if (
-                    sum_labeled_flows >= self.minimum_labels_to_start_train
-                ):
-                    if (sum_labeled_flows - self.last_number_of_flows_when_trained >= self.minimum_labels_to_retrain):
+                if sum_labeled_flows >= self.minimum_labels_to_start_train:
+                    if (
+                        sum_labeled_flows
+                        - self.last_number_of_flows_when_trained
+                        >= self.minimum_labels_to_retrain
+                    ):
                         # So for example we retrain every 50 labels and only when
                         # we have at least 50 labels
                         self.print(
@@ -559,10 +581,17 @@
                         )
                         # Process all flows in the DB and make them ready
                         # for pandas
-                        self.process_training_flows(self.last_number_of_flows_when_trained)
+                        self.process_training_flows(
+                            self.last_number_of_flows_when_trained
+                        )
                         # Train an algorithm
-                        self.train(sum_labeled_flows, self.last_number_of_flows_when_trained)
-                        self.last_number_of_flows_when_trained = sum_labeled_flows
+                        self.train(
+                            sum_labeled_flows,
+                            self.last_number_of_flows_when_trained,
+                        )
+                        self.last_number_of_flows_when_trained = (
+                            sum_labeled_flows
+                        )

             elif self.mode == "test":
                 # We are testing, which means using the model to detect
@@ -570,7 +599,9 @@
                 # After processing the flow, it may happen that we
                 # delete icmp/arp/etc so the dataframe can be empty
                 if processed_flow is not None and not processed_flow.empty:
-                    original_label = processed_flow["ground_truth_label"].iloc[0]
+                    original_label = processed_flow["ground_truth_label"].iloc[
+                        0
+                    ]
                     # Predict
                     pred: numpy.ndarray = self.detect(processed_flow)
                     if not pred:
@@ -591,30 +622,49 @@
                             2,
                         )
-                    # So you can disable this code easily. Since it is used only for evaluating a testing
+                    # So you can disable this code easily. Since it is used
+                    # only for evaluating testing performance
                     log_testing_data = True
                     if log_testing_data:
                         # Initialize counters if not already done
-                    if not hasattr(self, "tp"):
+                        if not hasattr(self, "tp"):
                             self.tp = 0
-                    if not hasattr(self, "tn"):
+                        if not hasattr(self, "tn"):
                             self.tn = 0
-                    if not hasattr(self, "fp"):
+                        if not hasattr(self, "fp"):
                             self.fp = 0
-                    if not hasattr(self, "fn"):
+                        if not hasattr(self, "fn"):
                             self.fn = 0

                         # Update counters based on predictions and labels
-                        if pred[0] == "Malicious" and original_label == "Malicious":
+                        if (
+                            pred[0] == "Malicious"
+                            and original_label == "Malicious"
+                        ):
                             self.tp += 1
-                        elif pred[0] == "Benign" and original_label == "Benign":
+                        elif (
+                            pred[0] == "Benign" and original_label == "Benign"
+                        ):
                             self.tn += 1
-                        elif pred[0] == "Malicious" and original_label == "Benign":
+                        elif (
+                            pred[0] == "Malicious"
+                            and original_label == "Benign"
+                        ):
                             self.fp += 1
-                            self.write_to_log(f"False Positive Flow: {self.flow}")
+                            self.write_to_log(
+                                f"False Positive Flow: {self.flow}"
+                            )
-                        elif pred[0] == "Benign" and original_label == "Malicious":
+                        elif (
+                            pred[0] == "Benign"
+                            and original_label == "Malicious"
+                        ):
                             self.fn += 1
-                            self.write_to_log(f"False Negative Flow: {self.flow}")
+                            self.write_to_log(
+                                f"False Negative Flow: {self.flow}"
+                            )

                         # Log the testing performance metrics
-                        self.write_to_log(f"TP: {self.tp}, TN: {self.tn}, FP: {self.fp}, FN: {self.fn}")
\ No newline at end of file
+                        self.write_to_log(
+                            f"TP: {self.tp}, TN: {self.tn},"
+                            f" FP: {self.fp}, FN: {self.fn}"
+                        )

diff --git a/modules/riskiq/riskiq.py b/modules/riskiq/riskiq.py
index 5abf2ddb1..7b5653997 100644
--- a/modules/riskiq/riskiq.py
+++ b/modules/riskiq/riskiq.py
@@ -25,7 +25,7 @@ def init(self):

     def read_configuration(self):
         conf = ConfigParser()
-        risk_iq_credentials_path = conf.RiskIQ_credentials_path()
+        risk_iq_credentials_path = conf.risk_iq_credentials_path()
         try:
             with open(risk_iq_credentials_path, "r") as f:
                 self.riskiq_email = f.readline().replace("\n", "")

diff --git a/modules/update_manager/update_manager.py b/modules/update_manager/update_manager.py
index ba8106aa5..b791bfc13 100644
--- a/modules/update_manager/update_manager.py
+++ b/modules/update_manager/update_manager.py
@@ -119,7 +119,7 @@ def read_riskiq_creds(risk_iq_credentials_path):
         self.ssl_feeds_path = conf.ssl_feeds()
         self.ssl_feeds = self.get_feed_details(self.ssl_feeds_path)

-        risk_iq_credentials_path = conf.RiskIQ_credentials_path()
+        risk_iq_credentials_path = conf.risk_iq_credentials_path()
         read_riskiq_creds(risk_iq_credentials_path)

         self.riskiq_update_period = conf.riskiq_update_period()

diff --git a/slips_files/common/parsers/config_parser.py b/slips_files/common/parsers/config_parser.py
index 40f1b044b..e208f7881 100644
--- a/slips_files/common/parsers/config_parser.py
+++ b/slips_files/common/parsers/config_parser.py
@@ -418,7 +418,12 @@ def data_exfiltration_threshold(self):
     def get_ml_mode(self):
         return self.read_configuration("flowmldetection", "mode", "test")

-    def RiskIQ_credentials_path(self):
+    def create_performance_metrics_log_files(self) -> bool:
+        return self.read_configuration(
+            "flowmldetection", "create_performance_metrics_log_files", False
+        )
+
+    def risk_iq_credentials_path(self):
         return self.read_configuration(
             "threatintelligence", "RiskIQ_credentials_path", ""
         )
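For reference, the metric block that PATCH 492 adds to train() first reduces the three-class labels to a binary Malicious-vs-Benign problem and only then scores it. A minimal standalone sketch of that computation (the label arrays below are made up for illustration; they are not Slips data):

    import numpy
    from sklearn.metrics import confusion_matrix, f1_score

    # Illustrative ground truth and predictions; "Background" flows are
    # excluded from scoring, exactly as the mask in train() does.
    y_flow = numpy.array(["Malicious", "Benign", "Background", "Benign"])
    y_pred = numpy.array(["Malicious", "Malicious", "Benign", "Benign"])

    mask = (y_flow == "Malicious") | (y_flow == "Benign")
    y_true_bin = numpy.where(y_flow[mask] == "Malicious", 1, 0)
    y_pred_bin = numpy.where(y_pred[mask] == "Malicious", 1, 0)

    # labels=[0, 1] forces a 2x2 matrix so ravel() always yields 4 values.
    tn, fp, fn, tp = confusion_matrix(y_true_bin, y_pred_bin, labels=[0, 1]).ravel()
    fpr = fp / (fp + tn) if (fp + tn) > 0 else 0
    f1 = f1_score(y_true_bin, y_pred_bin, zero_division=0)
    print(f"TN={tn} FP={fp} FN={fn} TP={tp} FPR={fpr:.4f} F1={f1:.4f}")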
From adcbafd997d538cf7d8041f6317dd48f3cef0f54 Mon Sep 17 00:00:00 2001
From: alya
Date: Sat, 10 May 2025 16:23:58 +0300
Subject: [PATCH 493/498] don't create an empty log file when
 create_performance_metrics_log_files is set to false

---
 modules/flowmldetection/flowmldetection.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 2a515d0cf..9305197d3 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -78,6 +78,9 @@ def init_log_file(self):
         """
         Init the log file for training or testing
         """
+        if not self.enable_logs:
+            return
+
         if self.mode == "train":
             # Initialize the training log file
             self.log_path = "./modules/flowmldetection/training.log"

From c45e77594002748fdd1e2c5ddd559c92416eb3f5 Mon Sep 17 00:00:00 2001
From: alya
Date: Sat, 10 May 2025 16:29:30 +0300
Subject: [PATCH 494/498] when enabled, create testing.log or training.log in
 the current output dir

---
 modules/flowmldetection/flowmldetection.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index 9305197d3..f618195bc 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -3,6 +3,7 @@
 # SPDX-License-Identifier: GPL-2.0-only
 import numpy
+import os
 from sklearn.linear_model import SGDClassifier
 from sklearn.preprocessing import StandardScaler
 import pickle
@@ -83,10 +84,10 @@ def init_log_file(self):

         if self.mode == "train":
             # Initialize the training log file
-            self.log_path = "./modules/flowmldetection/training.log"
+            self.log_path = os.path.join(self.output_dir, "training.log")
         elif self.mode == "test":
             # Initialize the testing log file
-            self.log_path = "./modules/flowmldetection/testing.log"
+            self.log_path = os.path.join(self.output_dir, "testing.log")
         self.log_file = open(self.log_path, "w")

From b2452494a0d32f394b5ddc15e5cb6afc47df2855 Mon Sep 17 00:00:00 2001
From: alya
Date: Sat, 10 May 2025 16:43:32 +0300
Subject: [PATCH 495/498] Add an enum called Label with Benign, Malicious, and
 Background so the labels are unified.

---
 modules/flowmldetection/flowmldetection.py | 65 +++++++++++-----------
 slips_files/core/structures/labels.py      | 11 ++++
 2 files changed, 43 insertions(+), 33 deletions(-)
 create mode 100644 slips_files/core/structures/labels.py

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index f618195bc..e828058ee 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -19,11 +19,10 @@
     matthews_corrcoef,
     recall_score,
 )
-
-
 from slips_files.common.parsers.config_parser import ConfigParser
 from slips_files.common.slips_utils import utils
 from slips_files.common.abstracts.module import IModule
+from slips_files.core.structures.labels import Label
 from slips_files.core.structures.evidence import (
     Evidence,
     ProfileID,
@@ -45,6 +44,10 @@ def warn(*args, **kwargs):
 warnings.warn = warn

+BACKGROUND = Label.BACKGROUND.name
+BENIGN = Label.BENIGN.name
+MALICIOUS = Label.MALICIOUS.name
+

 class FlowMLDetection(IModule):
     # Name: short name of the module. Do not use spaces
@@ -132,9 +135,9 @@ def train(self, sum_labeled_flows, last_number_of_flows_when_trained):
             # Count the number of labels of each type in this epoch
             epoch_label_counts = {
-                "Background": (y_flow == "Background").sum(),
-                "Malicious": (y_flow == "Malicious").sum(),
-                "Benign": (y_flow == "Benign").sum(),
+                BACKGROUND: (y_flow == BACKGROUND).sum(),
+                MALICIOUS: (y_flow == MALICIOUS).sum(),
+                BENIGN: (y_flow == BENIGN).sum(),
             }

             # Train
@@ -143,7 +146,7 @@
                 self.clf.partial_fit(
                     X_flow,
                     y_flow,
-                    classes=["Background", "Malicious", "Benign"],
+                    classes=[BACKGROUND, MALICIOUS, BENIGN],
                 )
             except Exception:
                 self.print("Error while calling clf.train()")
@@ -153,13 +156,13 @@
             y_pred = self.clf.predict(X_flow)

             # For metrics, let's focus on Malicious vs Benign (ignore Background)
-            mask = (y_flow == "Malicious") | (y_flow == "Benign")
+            mask = (y_flow == MALICIOUS) | (y_flow == BENIGN)
             y_true_bin = y_flow[mask]
             y_pred_bin = y_pred[mask]

             # Map to binary: Malicious=1, Benign=0
-            y_true_bin = numpy.where(y_true_bin == "Malicious", 1, 0)
-            y_pred_bin = numpy.where(y_pred_bin == "Malicious", 1, 0)
+            y_true_bin = numpy.where(y_true_bin == MALICIOUS, 1, 0)
+            y_pred_bin = numpy.where(y_pred_bin == MALICIOUS, 1, 0)

             # Compute confusion matrix: tn, fp, fn, tp
             tn, fp, fn, tp = (
@@ -190,9 +193,12 @@
             self.write_to_log(
                 f"Total labels: {sum_labeled_flows}, "
                 f"Background: {epoch_label_counts['Background']}. "
-                f"Benign: {epoch_label_counts['Benign']}. Malicious: {epoch_label_counts['Malicious']}. "
-                f"Metrics: FPR={FPR:.4f}, TNR={TNR:.4f}, TPR={TPR:.4f}, FNR={FNR:.4f}, "
-                f"F1={F1:.4f}, Precision={PREC:.4f}, Accuracy={ACCU:.4f}, MCC={MCC:.4f}, Recall={RECALL:.4f}."
+                f"Benign: {epoch_label_counts['Benign']}. "
+                f"Malicious: {epoch_label_counts[MALICIOUS]}. "
+                f"Metrics: FPR={FPR:.4f}, TNR={TNR:.4f}, "
+                f"TPR={TPR:.4f}, FNR={FNR:.4f}, "
+                f"F1={F1:.4f}, Precision={PREC:.4f}, "
+                f"Accuracy={ACCU:.4f}, MCC={MCC:.4f}, Recall={RECALL:.4f}."
             )
         except Exception:
             self.print("Error in train().", 0, 1)
@@ -345,9 +351,9 @@
                         "sbytes": 25517,
                         "dbytes": 17247,
                         "appproto": "ssl",
-                        "ground_truth_label": "Malicious",
+                        "ground_truth_label": MALICIOUS,
                         "module_labels": {
-                            "flowalerts-long-connection": "Malicious"
+                            "flowalerts-long-connection": MALICIOUS
                         },
                     }
                 )
@@ -366,9 +372,9 @@
                         "sbytes": 100,
                         "dbytes": 67596,
                         "appproto": "http",
-                        "ground_truth_label": "Benign",
+                        "ground_truth_label": BENIGN,
                         "module_labels": {
-                            "flowalerts-long-connection": "Benign"
+                            "flowalerts-long-connection": BENIGN
                         },
                     }
                 )
@@ -421,7 +427,8 @@
             "ground_truth_label",
             "detailed_ground_truth_label",
         ]
-        # For argus binetflows this fails because there is a field called bytes that was not in other flows. It should be called allbytes.
+        # For argus binetflows this fails because there is a field called
+        # bytes that was not in other flows. It should be called allbytes.
         # Error
         """ [Flow ML Detection] Error in detect() while processing
                 dur  proto  sport  dport  state  pkts  spkts  dpkts  bytes  sbytes  dbytes  allbytes
         0  63.822830      0  56119    981    0.0    15     15      0   8764    1887       0      1887
         The feature names should match those that were passed during fit.
         Feature names unseen at fit time:
         - bytes
         """

         # If we delete here the field bytes the error is
         # [Flow ML Detection] Error in detect() while processing
         #    dur  proto  sport  dport  state  pkts  spkts  dpkts  sbytes  dbytes  allbytes
         # 0  63.822830      0  56120    980    0.0    15     15      0    1887       0      1887
         # The feature names should match those that were passed during fit.
         # Feature names must be in the same order as they were in fit.

         for field in fields_to_drop:
             try:
                 x_flow = x_flow.drop(field, axis=1)
@@ -546,8 +553,8 @@ def main(self):
                 self.twid = msg["twid"]
                 self.profileid = msg["profileid"]
                 self.flow = msg["flow"]
-                # These following extra fields are expected in testing. update the original
-                # flow dict to have them
+                # The following extra fields are expected in testing.
+                # Update the original flow dict to have them
                 self.flow.update(
                     {
                         "state": msg["interpreted_state"],
@@ -612,7 +619,7 @@
-                    if pred[0] == "Malicious":
+                    if pred[0] == MALICIOUS:
                         # Generate an alert
                         self.set_evidence_malicious_flow(self.flow, self.twid)
                         self.print(
@@ -642,26 +649,18 @@
                         # Update counters based on predictions and labels
                         if (
-                            pred[0] == "Malicious"
-                            and original_label == "Malicious"
+                            pred[0] == MALICIOUS
+                            and original_label == MALICIOUS
                         ):
                             self.tp += 1
-                        elif (
-                            pred[0] == "Benign" and original_label == "Benign"
-                        ):
+                        elif pred[0] == BENIGN and original_label == BENIGN:
                             self.tn += 1
-                        elif (
-                            pred[0] == "Malicious"
-                            and original_label == "Benign"
-                        ):
+                        elif pred[0] == MALICIOUS and original_label == BENIGN:
                             self.fp += 1
                             self.write_to_log(
                                 f"False Positive Flow: {self.flow}"
                             )
-                        elif (
-                            pred[0] == "Benign"
-                            and original_label == "Malicious"
-                        ):
+                        elif pred[0] == BENIGN and original_label == MALICIOUS:
                             self.fn += 1
                             self.write_to_log(
                                 f"False Negative Flow: {self.flow}"
                             )

diff --git a/slips_files/core/structures/labels.py b/slips_files/core/structures/labels.py
new file mode 100644
index 000000000..b1dc64234
--- /dev/null
+++ b/slips_files/core/structures/labels.py
@@ -0,0 +1,11 @@
+from enum import Enum
+
+
+class Label(Enum):
+    """
+    label of flows should be one of the following
+    """
+
+    MALICIOUS = "Malicious"
+    BENIGN = "Benign"
+    BACKGROUND = "Background"

From 31a49bdefd4bbb1cfe7834b59dbfc9e137a66418 Mon Sep 17 00:00:00 2001
From: alya
Date: Mon, 12 May 2025 20:57:25 +0300
Subject: [PATCH 496/498] set the config label as the GT label if not found in
 the given file

---
 modules/flowmldetection/flowmldetection.py | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index e828058ee..c2b184cb1 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -98,7 +98,7 @@ def read_configuration(self):
         self.mode = conf.get_ml_mode()
         # This is the global label in the configuration,
         # in case the flows do not have a label themselves
-        self.label = conf.label()
+        self.ground_truth_config_label = conf.label()
         self.enable_logs: bool = conf.create_performance_metrics_log_files()
@@ -610,9 +610,15 @@ def main(self):
                 # After processing the flow, it may happen that we
                 # delete icmp/arp/etc so the dataframe can be empty
                 if processed_flow is not None and not processed_flow.empty:
-                    original_label = processed_flow[
-                        "ground_truth_label"
-                    ].iloc[0]
+                    try:
+                        original_label = processed_flow[
+                            "ground_truth_label"
+                        ].iloc[0]
+                    except KeyError:
+                        # If there are no labels in the flows, the default
+                        # label should be the one in the config file.
+                        original_label = self.ground_truth_config_label
+
                     # Predict
                     pred: numpy.ndarray = self.detect(processed_flow)
                     if not pred:
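A side note on the Label enum introduced in PATCH 495 above: the module constants are bound with Label.X.name, which evaluates to the member identifier ("BACKGROUND"), not the stored string ("Background") that flows actually carry; the last patch in this series switches to .value. A small standalone illustration of the difference (editorial example, not part of the patches):

    from enum import Enum

    class Label(Enum):
        MALICIOUS = "Malicious"
        BENIGN = "Benign"
        BACKGROUND = "Background"

    print(Label.BACKGROUND.name)   # -> "BACKGROUND" (the member identifier)
    print(Label.BACKGROUND.value)  # -> "Background" (the stored string)
    # Flows are labeled "Background"/"Benign"/"Malicious", so comparisons
    # against flow labels must use .value to match.
    assert Label.BACKGROUND.value == "Background"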
From a6ad940c2b134dd220e07f4f2d16419d545f08 Mon Sep 17 00:00:00 2001
From: Seba Garcia
Date: Tue, 20 May 2025 11:13:27 +0000
Subject: [PATCH 497/498] By default train and store logs

---
 config/slips.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/config/slips.yaml b/config/slips.yaml
index ac2010e6b..635df7f91 100644
--- a/config/slips.yaml
+++ b/config/slips.yaml
@@ -212,10 +212,10 @@ flowmldetection:
   # training the models, to test in unknown data.
   # You should have trained at least once with 'Normal' data and once with
   # 'Malicious' data in order for the test to work.
-  mode: test
+  mode: train
   # creates an extra log file called training.log/testing.log in the
   # output dir with performance metrics depending on the mode.
-  create_performance_metrics_log_files: False
+  create_performance_metrics_log_files: True

 #############################
 virustotal:

From c7ab0a2c2ee14ddc5b009bc4011095b0ae2044f4 Mon Sep 17 00:00:00 2001
From: Seba Garcia
Date: Tue, 20 May 2025 11:13:56 +0000
Subject: [PATCH 498/498] Fix the labels to use .value

---
 modules/flowmldetection/flowmldetection.py | 28 ++++------------------
 1 file changed, 5 insertions(+), 23 deletions(-)

diff --git a/modules/flowmldetection/flowmldetection.py b/modules/flowmldetection/flowmldetection.py
index c2b184cb1..4ef661146 100644
--- a/modules/flowmldetection/flowmldetection.py
+++ b/modules/flowmldetection/flowmldetection.py
@@ -44,9 +44,9 @@ def warn(*args, **kwargs):
 warnings.warn = warn

-BACKGROUND = Label.BACKGROUND.name
-BENIGN = Label.BENIGN.name
-MALICIOUS = Label.MALICIOUS.name
+BACKGROUND = Label.BACKGROUND.value
+BENIGN = Label.BENIGN.value
+MALICIOUS = Label.MALICIOUS.value

 class FlowMLDetection(IModule):
@@ -287,7 +287,7 @@ def process_features(self, dataset):
                 r"(^.*arp.*$)", "4", regex=True
             )

-        dataset["allbytes"] = dataset["sbytes"] + dataset["dbytes"]
+        dataset["bytes"] = dataset["sbytes"] + dataset["dbytes"]
         dataset["pkts"] = dataset["spkts"] + dataset["dpkts"]

         fields_to_convert_to_float = [
@@ -297,7 +297,7 @@
             dataset.dur,
             dataset.pkts,
             dataset.spkts,
-            dataset.allbytes,
+            dataset.bytes,
             dataset.sbytes,
             dataset.state,
         ]
@@ -427,24 +427,6 @@ def detect(self, x_flow) -> Optional[numpy.ndarray]:
             "ground_truth_label",
             "detailed_ground_truth_label",
         ]
-        # For argus binetflows this fails because there is a field called
-        # bytes that was not in other flows. It should be called allbytes.
-        # Error
-        """ [Flow ML Detection] Error in detect() while processing
-                dur  proto  sport  dport  state  pkts  spkts  dpkts  bytes  sbytes  dbytes  allbytes
-        0  63.822830      0  56119    981    0.0    15     15      0   8764    1887       0      1887
-        The feature names should match those that were passed during fit.
-        Feature names unseen at fit time:
-        - bytes
-        """
-
-        # If we delete here the field bytes the error is
-        # [Flow ML Detection] Error in detect() while processing
-        #    dur  proto  sport  dport  state  pkts  spkts  dpkts  sbytes  dbytes  allbytes
-        # 0  63.822830      0  56120    980    0.0    15     15      0    1887       0      1887
-        # The feature names should match those that were passed during fit.
-        # Feature names must be in the same order as they were in fit.
-
         for field in fields_to_drop:
             try:
                 x_flow = x_flow.drop(field, axis=1)
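The rename in PATCH 498 (allbytes -> bytes) matters because recent scikit-learn versions (>= 1.0) record the column names seen at fit time and refuse to predict on a DataFrame whose columns differ, which is exactly the error quoted in the deleted comment block. A minimal standalone reproduction of that failure mode (toy data, not Slips code; assumes a scikit-learn that validates feature names):

    import pandas as pd
    from sklearn.linear_model import SGDClassifier

    # Fit on a DataFrame so sklearn records feature_names_in_.
    train = pd.DataFrame({"bytes": [100, 2000], "pkts": [2, 30]})
    clf = SGDClassifier(random_state=0).fit(train, [0, 1])

    # Renaming a column at test time breaks the feature-name check.
    test = train.rename(columns={"bytes": "allbytes"})
    try:
        clf.predict(test)
    except ValueError as e:
        # "The feature names should match those that were passed during fit."
        print(e)

Keeping a single canonical engineered column name ("bytes") on both the training and testing paths is what makes the model and the live flows agree.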