model.py
import joblib
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE
import seaborn as sns
import matplotlib.pyplot as plt

# Step 1: Load the dataset from JSON
def load_data(filename='augmented_fraud_dataset.json'):
    """
    Load the dataset from a JSON Lines file (one JSON object per line).
    """
    return pd.read_json(filename, lines=True)
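
# Since read_json is called with lines=True, each line of the input file is
# expected to be a standalone JSON object. Based on the fields used below,
# a record presumably looks something like:
#   {"amount": 120.5, "distance_from_home": 8.2, "high_risk_country": 0,
#    "average_monthly_spending": 1500.0, "transaction_hour": 14,
#    "label": 0, "predicted_fraud": 0}
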
# Step 2: Preprocess the data
def preprocess_data(df):
    """
    Split the data into training/testing sets and scale the features.
    The scaler is fitted on the training split only, so no test-set
    statistics leak into training.
    """
    # Separate features and target
    X = df.drop(['label', 'predicted_fraud'], axis=1)
    y = df['label']

    # Split into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Scale the features (fit on the training data only)
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    return X_train, X_test, y_train, y_test, scaler
# Step 3: Enhanced Data Augmentation
def augment_data(df, fraud_cases=100, noise_level=0.1):
    """
    Enhance the dataset by adding noise to continuous features and
    generating additional synthetic fraud cases with more variability.
    """
    # Add Gaussian noise to continuous features
    for feature in ['amount', 'distance_from_home', 'average_monthly_spending']:
        noise = np.random.normal(0, noise_level * df[feature].std(), df[feature].shape)
        df[feature] = np.clip(df[feature] + noise, a_min=0, a_max=None)  # Ensure no negative values

    # Generate diverse synthetic fraud cases
    fraud_data = {
        'amount': np.random.uniform(5000, 20000, fraud_cases),  # Wider range for fraud amounts
        'distance_from_home': np.random.uniform(1000, 10000, fraud_cases),  # Longer distances for fraud
        'high_risk_country': np.random.choice([0, 1], fraud_cases, p=[0.3, 0.7]),  # More weight to high-risk countries
        'average_monthly_spending': np.random.uniform(2000, 10000, fraud_cases),
        'transaction_hour': np.random.choice(range(24), fraud_cases),  # Random distribution across all hours
        'label': [1] * fraud_cases,
        'predicted_fraud': [1] * fraud_cases
    }
    fraud_df = pd.DataFrame(fraud_data)

    # Concatenate the original data with the synthetic fraud cases
    augmented_df = pd.concat([df, fraud_df], ignore_index=True)

    # Shuffle the dataset to mix synthetic and real data
    augmented_df = augmented_df.sample(frac=1, random_state=42).reset_index(drop=True)
    return augmented_df
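
# Note: fraud_data enumerates every feature column explicitly. If the input
# dataset gains extra columns, the synthetic rows would hold NaN for them,
# so this dictionary needs to stay in sync with the schema.
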
# Step 4: Resample the data using SMOTE
def resample_data(X_train, y_train):
    """
    Apply SMOTE to handle class imbalance in the training data.
    """
    smote = SMOTE(random_state=42)
    X_resampled, y_resampled = smote.fit_resample(X_train, y_train)
    return X_resampled, y_resampled
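
# SMOTE synthesizes new minority-class rows by interpolating between a fraud
# sample and its nearest minority-class neighbours in feature space. It is
# applied to the training split only, so the test set stays untouched.
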
# Step 5: Train the model
def train_model(X_train, y_train):
    """
    Train a RandomForestClassifier with class weights to handle class imbalance.
    """
    model = RandomForestClassifier(n_estimators=100, class_weight={0: 1, 1: 10}, random_state=42)
    model.fit(X_train, y_train)
    return model
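
# class_weight={0: 1, 1: 10} makes each fraud example count ten times as
# heavily as a normal one when the trees are grown. On top of SMOTE this
# biases the model strongly toward fraud recall; the weights are a tuning
# knob, not a fixed requirement.
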
# Step 6: Evaluate the model
def evaluate_model(model, X_test, y_test):
    """
    Evaluate the model with a classification report and a confusion matrix.
    """
    y_pred = model.predict(X_test)

    # Print classification report
    print("Classification Report:")
    print(classification_report(y_test, y_pred))

    # Plot confusion matrix
    cm = confusion_matrix(y_test, y_pred)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=["Normal", "Fraud"], yticklabels=["Normal", "Fraud"])
    plt.title('Confusion Matrix')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.show()
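
# In the heatmap, rows are actual classes and columns are predicted classes,
# following the sklearn confusion_matrix convention.
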
# Step 7: Save the model and scaler
def save_model(model, scaler, model_filename='fraud_detection_model.pkl', scaler_filename='scaler.pkl'):
    """
    Save the trained model and scaler to disk.
    """
    joblib.dump(model, model_filename)
    joblib.dump(scaler, scaler_filename)
    print(f"Model saved to {model_filename} and scaler saved to {scaler_filename}")
# Main function
def main():
    # Load the dataset
    df = load_data('augmented_fraud_dataset.json')

    # Augment the dataset
    df = augment_data(df, fraud_cases=200, noise_level=0.15)  # Increased fraud cases and noise level

    # Preprocess the data
    X_train, X_test, y_train, y_test, scaler = preprocess_data(df)

    # Resample the training data
    X_resampled, y_resampled = resample_data(X_train, y_train)

    # Train the model
    model = train_model(X_resampled, y_resampled)

    # Evaluate the model
    evaluate_model(model, X_test, y_test)

    # Save the model and scaler
    save_model(model, scaler)


if __name__ == "__main__":
    main()