-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmalware_classifier.py
53 lines (43 loc) · 1.39 KB
/
malware_classifier.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
# Path of the dataset file that main() opens and parses; it is a relative
# path, so it is resolved against the current working directory.
DATA_FILE = 'data/arff_backlog.arff.txt'
def _load_dataset(path):
    """Parse the comma-separated data rows of the file at *path*.

    Only lines whose first non-blank character is a digit are treated as
    records; every other line (blank lines, header/metadata lines) is
    skipped.  In each record the last field is the class name and all
    preceding fields are integer feature values.

    Returns:
        A ``(features, labels)`` pair of parallel lists.  ``features`` holds
        one list of ints per record; ``labels`` holds 0 when the class field
        contains ``'good'`` and 1 otherwise.

    Raises:
        OSError: if the file cannot be opened.
        ValueError: if a feature field is not a valid integer.
    """
    features = []
    labels = []
    with open(path, 'r') as file_:
        # Stream the file line by line instead of loading it all at once.
        for line in file_:
            line = line.strip()
            if not line or not line[0].isdigit():
                continue
            *values, class_ = line.split(',')
            features.append([int(value) for value in values])
            labels.append(0 if 'good' in class_ else 1)
    return features, labels


def main():
    """Grid-search a random forest on DATA_FILE and print the results.

    Loads the dataset, holds out a test split, runs a 5-fold
    cross-validated grid search over ``n_estimators`` and ``max_features``
    on the training split, then prints the best parameter combination and
    the score of the refitted best model on the held-out test split.

    Raises:
        ValueError: if DATA_FILE contains no data rows.
    """
    data_array, label_array = _load_dataset(DATA_FILE)
    if not data_array:
        # Fail with a clear message instead of an obscure error from
        # train_test_split further down.
        raise ValueError('No data rows found in {}'.format(DATA_FILE))

    X_train, X_test, y_train, y_test = train_test_split(
        data_array, label_array, random_state=0
    )
    param_grid = {
        'n_estimators': [1, 10, 100, 1000, 10000],
        'max_features': [1, 2, 3, 4, 5]
    }
    # The forest is seeded like the split so that repeated runs select the
    # same parameters and report the same score.
    grid = GridSearchCV(
        RandomForestClassifier(random_state=0), param_grid,
        cv=5, n_jobs=-1
    )
    # Note from an earlier run: this grid search found best performance
    # with max_features = 4 and n_estimators = 10,000.
    grid.fit(X_train, y_train)
    print('Best params: {}'.format(grid.best_params_))
    print('Test score: {:.4f}'.format(
        grid.score(X_test, y_test)
    ))
    # Note from an earlier run: accuracy is around .90.
# Run the experiment only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == '__main__':
    main()