-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpreprocess.py
82 lines (60 loc) · 2.28 KB
/
preprocess.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
import os
import pandas as pd
from sklearn.model_selection import train_test_split
import lib
def _read_trace(filepath: str, label: str) -> dict:
    """Read one trace file and return its row for the dataset table."""
    with open(filepath, "r", encoding="utf-8") as trace_file:
        sequence = trace_file.read().strip()
    return {
        "file_name": os.path.basename(filepath),
        "sequence": sequence,
        "label": label,
    }


def preprocess(
    training_data_path: str = "data/ADFA-LD/Training_Data_Master/",
    attack_data_path: str = "data/ADFA-LD/Attack_Data_Master/",
    output_dir: str = "data",
    test_size: float = 0.2,
    random_state: int = 42,
) -> None:
    """Build a labelled train/test split from the raw trace files.

    Every regular file directly inside ``training_data_path`` becomes a row
    labelled ``"normal"``; every file found anywhere below
    ``attack_data_path`` becomes a row labelled ``"abnormal"``. Each row
    holds the file name, the whitespace-stripped file content as
    ``sequence`` and the label. The rows are split with stratification on
    the label and written to ``train_data.csv`` and ``test_data.csv`` inside
    ``output_dir``.

    Args:
        training_data_path: Directory holding the normal trace files.
        attack_data_path: Root directory of the attack traces (searched
            recursively).
        output_dir: Directory the two CSV files are written to; created if
            missing. The default matches the ``data/`` location that the
            ``check_*`` helpers in this module read from.
        test_size: Fraction of rows placed in the test set.
        random_state: Seed passed to ``train_test_split``.

    Raises:
        FileNotFoundError: If either input directory does not exist.
    """
    # os.walk silently yields nothing for a missing directory, which would
    # produce a dataset without any "abnormal" rows; fail loudly instead.
    if not os.path.isdir(attack_data_path):
        raise FileNotFoundError(
            f"Attack data directory not found: {attack_data_path}"
        )

    records = []

    # Normal traces. Directory listings come back in arbitrary order, so sort
    # them: otherwise the seeded split differs between machines/filesystems.
    for filename in sorted(os.listdir(training_data_path)):
        filepath = os.path.join(training_data_path, filename)
        if not os.path.isfile(filepath):
            continue  # skip stray sub-directories instead of crashing in open()
        records.append(_read_trace(filepath, "normal"))

    # Attack traces, nested in sub-directories.
    for root, dirs, files in os.walk(attack_data_path):
        dirs.sort()  # in-place sort makes the traversal order deterministic
        for filename in sorted(files):
            records.append(_read_trace(os.path.join(root, filename), "abnormal"))

    df = pd.DataFrame(records, columns=["file_name", "sequence", "label"])

    # Stratify so both splits keep the same normal/abnormal ratio.
    train_data, test_data = train_test_split(
        df,
        test_size=test_size,
        stratify=df["label"],
        random_state=random_state,
    )

    os.makedirs(output_dir, exist_ok=True)
    train_data.to_csv(os.path.join(output_dir, "train_data.csv"), index=False)
    test_data.to_csv(os.path.join(output_dir, "test_data.csv"), index=False)

    print("Training and testing data successfully split and saved.")
def check_unique_vals():
    """Print the set of distinct values found in the train and test sequences.

    Reads ``data/train_data.csv`` and ``data/test_data.csv``, converts each
    ``sequence`` column with ``lib.get_seq`` and prints every distinct
    element seen in either split.
    """
    train_frame = pd.read_csv("data/train_data.csv")
    test_frame = pd.read_csv("data/test_data.csv")

    train_seqs = lib.get_seq(train_frame['sequence'])
    test_seqs = lib.get_seq(test_frame['sequence'])

    # NOTE(review): `+` assumes lib.get_seq returns list-like objects that
    # concatenate (presumably a list of sequences) -- confirm against lib.
    distinct = {value for seq in train_seqs + test_seqs for value in seq}

    print(distinct)
def check_syscalls():
    """Print the total element count of the train and of the test sequences.

    Reads ``data/train_data.csv`` and ``data/test_data.csv``, converts each
    ``sequence`` column with ``lib.get_seq`` and prints, per split, the sum
    of the lengths of all its sequences (presumably the number of system
    calls -- confirm against ``lib.get_seq``).
    """
    train_frame = pd.read_csv("data/train_data.csv")
    test_frame = pd.read_csv("data/test_data.csv")

    train_seqs = lib.get_seq(train_frame['sequence'])
    test_seqs = lib.get_seq(test_frame['sequence'])

    # Total length over all training sequences.
    cnt = sum(map(len, train_seqs))
    print(f"train_sequences={cnt}")

    # Same total for the test split.
    cnt = sum(map(len, test_seqs))
    print(f"test_sequences={cnt}")
if __name__ == "__main__":
    # Script entry point. Only one step is enabled at a time; uncomment
    # another call below to run it instead:
    #   preprocess()        -- build the train/test CSV split from raw traces
    #   check_unique_vals() -- print the distinct values in the sequences
    # preprocess()
    # check_unique_vals()
    check_syscalls()