339 changes: 16 additions & 323 deletions experiment/main.py
@@ -1,306 +1,12 @@
import hashlib
import os
import pathlib
import pickle
import random
import subprocess
import sys
from argparse import ArgumentParser, BooleanOptionalAction
from datetime import datetime
from typing import List
from argparse import ArgumentParser

import keras_tuner as kt
import numpy as np
import pandas as pd
import tensorflow as tf
from keras import Model # type: ignore
from numpy import ndarray
from sklearn.metrics import f1_score, precision_score, recall_score, log_loss, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.utils import compute_class_weight
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

from experiment.plot import save_plot
from experiment.src.data_loader import read_detected_data, read_metadata, join_label, get_y_labels
from experiment.src.features import prepare_data
from experiment.src.log_callback import LogCallback
from experiment.src.ml_model import MlModel
from experiment.src.model_config_preprocess import model_config_preprocess
from experiment.src.prepare_data import prepare_train_data


def evaluate_model(thresholds: dict, keras_model: Model, x_data: List[np.ndarray], y_label: np.ndarray):
"""Evaluate Keras model with printing scores

Args:
thresholds: dict of credsweeper thresholds
keras_model: fitted keras model
x_data: List of np.ndarray; their number and shapes depend on the model
y_label: expected result

"""
predictions_proba = keras_model.predict(x_data, verbose=2).ravel()
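# apply each configured threshold to the same predicted probabilities and report the usual binary metrics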
for name, threshold in thresholds.items():
predictions = (predictions_proba > threshold)
accuracy = accuracy_score(y_label, predictions)
precision = precision_score(y_label, predictions)
recall = recall_score(y_label, predictions)
loss = log_loss(y_label, predictions)
f1 = f1_score(y_label, predictions)
print(f"{name}: {threshold:0.6f}, "
f"accuracy: {accuracy:0.6f}, "
f"precision:{precision:0.6f}, "
f"recall: {recall:0.6f}, "
f"loss: {loss:0.6f}, "
f"F1:{f1:0.6f}")


def main(
cred_data_location: str,
jobs: int,
epochs: int,
batch_size: int,
patience: int,
doc_target: bool,
use_tuner: bool,
eval_test: bool,
eval_train: bool,
eval_full: bool,
) -> str:
print(f"Memory at start: {LogCallback.get_memory_info()}")

current_time = datetime.now().strftime("%Y%m%d_%H%M%S")

dir_path = pathlib.Path("results")
os.makedirs(dir_path, exist_ok=True)

print(f"Train model on data from {cred_data_location}")
meta_checksum, data_checksum = prepare_train_data(cred_data_location, jobs, doc_target)

df_all_file = dir_path / f"{meta_checksum}-{data_checksum}.pkl"
if df_all_file.exists():
df_all = pd.read_pickle(df_all_file)
print(f"Read from {df_all_file}")
else:
# detected data: candidates that CredSweeper passes to the ML validator after rule filters, with RuleName attached
detected_data = read_detected_data(f"results/detected_data.{data_checksum}.json")
print(f"CredSweeper detected {len(detected_data)} credentials without ML")
# all markup data
meta_data = read_metadata(f"{cred_data_location}/meta")
print(f"Metadata markup: {len(meta_data)} items")
df_all = join_label(detected_data, meta_data, cred_data_location)
# np.save(df_all_file, df_all)
df_all.to_pickle(df_all_file)
print(f"Stored to {df_all_file}")
# to prevent extra memory consumption - delete unnecessary objects
del detected_data
del meta_data

# workaround for CI step
for i in range(3):
# up to two attempts may fail when the ml config has just been updated
try:
thresholds = model_config_preprocess(df_all, doc_target)
break
except RuntimeError as exc:
if "RESTART:" in str(exc):
continue
else:
raise
else:
raise RuntimeError("Something went wrong")

print(f"Common dataset: {len(df_all)} items")
df_all = df_all.drop_duplicates(subset=["line", "variable", "value", "path", "ext"])
print(f"Common dataset: {len(df_all)} items after drop duplicates")

# random split
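# printing the chosen seed lets the exact train/test split be reproduced later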
lucky_number = random.randint(1, 1 << 32)
print(f"Lucky number: {lucky_number}")
df_train, df_test = train_test_split(df_all, test_size=0.15, random_state=lucky_number)
len_df_train = len(df_train)
print(f"Train size: {len_df_train}")
len_df_test = len(df_test)
print(f"Test size: {len_df_test}")

print(f"Prepare full data")
x_full_line, x_full_variable, x_full_value, x_full_features = prepare_data(df_all)
y_full: ndarray = get_y_labels(df_all)
del df_all

print(f"Prepare train data")
x_train_line, x_train_variable, x_train_value, x_train_features = prepare_data(df_train)
print("x_train_value dtype ", x_train_value.dtype) # dbg
print("x_train_features dtype", x_train_features.dtype) # dbg
y_train = get_y_labels(df_train)
print("y_train dtype", y_train.dtype) # dbg
del df_train

print(f"Class-1 prop on train: {np.mean(y_train):.4f}")

classes = np.unique(y_train)
class_weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_train)
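# scale the weights so the largest one is 1.0; the relative balance between classes is preserved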
max_weight = max(class_weights)
class_weights = [weight / max_weight for weight in class_weights]
print(f"y_train size:{len(y_train)}, 0: {np.count_nonzero(y_train == 0)}, 1: {np.count_nonzero(y_train == 1)}")
class_weight = dict(zip(classes, class_weights))
print(f"class_weight: {class_weight}") # information about class weights

print(f"Prepare test data")
x_test_line, x_test_variable, x_test_value, x_test_features = prepare_data(df_test)
y_test = get_y_labels(df_test)
print(f"Class-1 prop on test: {np.mean(y_test):.4f}")
del df_test

print(f"Memory before search / compile: {LogCallback.get_memory_info()}")

hp_dict = {
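# each value: (search range handed to the tuner hypermodel, default used when the tuner is disabled);
# the range tuple is presumably (min, max, step) for the corresponding dropout rate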
"value_lstm_dropout_rate": ((0.1, 0.5, 0.01), 0.41),
"line_lstm_dropout_rate": ((0.1, 0.5, 0.01), 0.41),
"variable_lstm_dropout_rate": ((0.1, 0.5, 0.01), 0.46),
"dense_a_lstm_dropout_rate": ((0.1, 0.5, 0.01), 0.2),
"dense_b_lstm_dropout_rate": ((0.1, 0.5, 0.01), 0.18),
}
log_callback = LogCallback()
if use_tuner:
print(f"Tuner initial dict:{hp_dict}")
tuner_kwargs = {k: v[0] for k, v in hp_dict.items()}
print(f"Tuner kwargs:{tuner_kwargs}")

tuner = kt.BayesianOptimization(
hypermodel=MlModel(x_full_line.shape, x_full_variable.shape, x_full_value.shape, x_full_features.shape,
**tuner_kwargs),
objective='val_loss',
directory=str(dir_path / f"{current_time}.tuner"),
project_name='ml_tuning',
)
search_early_stopping = EarlyStopping(monitor="val_loss",
patience=patience,
mode="min",
restore_best_weights=True,
verbose=1)
tuner.search(
x=[x_train_line, x_train_variable, x_train_value, x_train_features],
y=y_train,
epochs=epochs,
batch_size=batch_size,
callbacks=[search_early_stopping, log_callback],
validation_data=([x_test_line, x_test_variable, x_test_value, x_test_features], y_test),
verbose=2,
)
print("Best Hyperparameters:")
for k, v in tuner.get_best_hyperparameters()[0].values.items():
print(f"{k}: {v}")
param_kwargs = {k: float(v) for k, v in tuner.get_best_hyperparameters()[0].values.items() if k in hp_dict}
del tuner
else:
print(f"Model is trained with params from dict:{hp_dict}")
param_kwargs = {k: v[1] for k, v in hp_dict.items()}

print(f"Model hyper parameters: {param_kwargs}")

# repeat train step to obtain actual history chart
keras_model = MlModel(x_full_line.shape, x_full_variable.shape, x_full_value.shape, x_full_features.shape,
**param_kwargs).build()
if not eval_full:
# the data are not necessary
del x_full_line
del x_full_variable
del x_full_value
del x_full_features
del y_full

early_stopping = EarlyStopping(monitor="val_loss",
patience=patience,
mode="min",
restore_best_weights=True,
verbose=1)
model_checkpoint = ModelCheckpoint(filepath=str(dir_path / f"{current_time}.best_model"),
monitor="val_loss",
save_best_only=True,
mode="min",
verbose=1)

print(f"Memory before train: {LogCallback.get_memory_info()}")

fit_history = keras_model.fit(x=[x_train_line, x_train_variable, x_train_value, x_train_features],
y=y_train,
batch_size=batch_size,
epochs=epochs,
verbose=2,
validation_data=([x_test_line, x_test_variable, x_test_value,
x_test_features], y_test),
class_weight=class_weight,
callbacks=[early_stopping, model_checkpoint, log_callback],
use_multiprocessing=True)

# if best_val_loss is not None and best_val_loss + 0.00001 < early_stopping.best:
# print(f"CHECK BEST TUNER EARLY STOP : {best_val_loss} vs CURRENT: {early_stopping.best}")

print(f"Memory after train: {LogCallback.get_memory_info()}")

with open(dir_path / f"{current_time}.history.pickle", "wb") as f:
pickle.dump(fit_history, f)

model_file_name = dir_path / f"ml_model_at-{current_time}"
keras_model.save(model_file_name, include_optimizer=False)

if eval_test:
print(f"Validate results on the test subset. Size: {len(y_test)} {np.mean(y_test):.4f}")
evaluate_model(thresholds, keras_model, [x_test_line, x_test_variable, x_test_value, x_test_features], y_test)
# drop small test set first to free a bit more memory for next evaluation
del x_test_line
del x_test_variable
del x_test_value
del x_test_features
del y_test

if eval_train:
print(f"Validate results on the train subset. Size: {len(y_train)} {np.mean(y_train):.4f}")
evaluate_model(thresholds, keras_model, [x_train_line, x_train_variable, x_train_value, x_train_features],
y_train)
del x_train_line
del x_train_variable
del x_train_value
del x_train_features
del y_train

if eval_full:
print(f"Validate results on the full set. Size: {len(y_full)} {np.mean(y_full):.4f}")
evaluate_model(thresholds, keras_model, [x_full_line, x_full_variable, x_full_value, x_full_features], y_full)
del x_full_line
del x_full_variable
del x_full_value
del x_full_features
del y_full

onnx_model_file = pathlib.Path(__file__).parent.parent / "credsweeper" / "ml_model" / "ml_model.onnx"
# convert the model to onnx right now
convert_args = f"{sys.executable} -m tf2onnx.convert --saved-model {model_file_name.absolute()}" \
f" --output {str(onnx_model_file)} --verbose"
subprocess.check_call(convert_args, shell=True, cwd=pathlib.Path(__file__).parent)
with open(onnx_model_file, "rb") as f:
onnx_md5 = hashlib.md5(f.read()).hexdigest()
print(f"ml_model.onnx:{onnx_md5}")

with open(pathlib.Path(__file__).parent.parent / "credsweeper" / "ml_model" / "ml_config.json", "rb") as f:
config_md5 = hashlib.md5(f.read()).hexdigest()
print(f"ml_config.json:{config_md5}")

best_epoch = 1 + np.argmin(np.array(fit_history.history['val_loss']))
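# val_loss history is 0-indexed, hence the +1 to report a 1-based epoch number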

# ml history analysis
save_plot(
stamp=current_time,
title=f"batch:{batch_size} train:{len_df_train} test:{len_df_test} weights:{class_weights}",
history=fit_history,
dir_path=dir_path,
best_epoch=int(best_epoch),
info=f"ml_config.json:{config_md5} ml_model.onnx:{onnx_md5} best_epoch:{best_epoch}",
)

return str(model_file_name.absolute())
import torch

from train import train

if __name__ == "__main__":
parser = ArgumentParser()
@@ -329,59 +35,46 @@ def main(
default=256,
dest="batch_size",
metavar="POSITIVE_INT")
parser.add_argument("--device",
help="The device(CPU or GPU) that will be used to train the model",
default="cpu",
type=str,
choices=["cpu", "cuda"],
dest="device")
parser.add_argument("-p",
"--patience",
help="early stopping patience (default: 5)",
default=5,
dest="patience",
metavar="POSITIVE_INT")
parser.add_argument("--doc", help="use doc target", dest="doc_target", action=BooleanOptionalAction, default=False)
parser.add_argument("--tuner",
help="use keras tuner",
dest="use_tuner",
action=BooleanOptionalAction,
default=False)
parser.add_argument("--eval-test",
help="evaluate model for test dataset",
dest="eval_test",
action=BooleanOptionalAction,
default=False)
parser.add_argument("--eval-train",
help="evaluate model for train dataset",
dest="eval_train",
action=BooleanOptionalAction,
default=False)
parser.add_argument("--doc", help="use doc target", dest="doc_target", action="store_true")
parser.add_argument("--tuner", help="use parameter tuner", dest="use_tuner", action="store_true")
parser.add_argument("--eval-full",
help="evaluate model for full dataset after train",
dest="eval_full",
action=BooleanOptionalAction,
default=False)
action="store_true")
args = parser.parse_args()

fixed_seed = 20250721
fixed_seed = 20250124
print(f"Fixed seed:{fixed_seed}")
tf.random.set_seed(fixed_seed)
np.random.seed(fixed_seed)
random.seed(fixed_seed)
torch.manual_seed(fixed_seed)

# keep the current hashes in the log so they can be verified later
command = f"md5sum {pathlib.Path(__file__).parent.parent}/credsweeper/ml_model/ml_config.json"
subprocess.check_call(command, shell=True, cwd=pathlib.Path(__file__).parent)
command = f"md5sum {pathlib.Path(__file__).parent.parent}/credsweeper/ml_model/ml_model.onnx"
subprocess.check_call(command, shell=True, cwd=pathlib.Path(__file__).parent)

print(args) # dbg
_model_file_name = main(
_model_file_name = train(
cred_data_location=args.cred_data_location,
jobs=int(args.jobs),
epochs=int(args.epochs),
device=str(args.device),
batch_size=int(args.batch_size),
patience=int(args.patience),
doc_target=bool(args.doc_target),
use_tuner=bool(args.use_tuner),
eval_test=bool(args.eval_test),
eval_train=bool(args.eval_train),
eval_full=bool(args.eval_full),
)
# print the model path as the last line
print(f"\nYou can find your model in:\n{_model_file_name}")