Skip to content
Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
283 changes: 283 additions & 0 deletions Tools/911-models.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,283 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 33,
"id": "1b3edd9c-127a-4f3a-9082-31776f2be0da",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Requirement already satisfied: imbalanced-learn in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (0.12.4)\n",
"Requirement already satisfied: numpy>=1.17.3 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from imbalanced-learn) (2.0.0)\n",
"Requirement already satisfied: scipy>=1.5.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from imbalanced-learn) (1.14.0)\n",
"Requirement already satisfied: scikit-learn>=1.0.2 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from imbalanced-learn) (1.5.1)\n",
"Requirement already satisfied: joblib>=1.1.1 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from imbalanced-learn) (1.4.2)\n",
"Requirement already satisfied: threadpoolctl>=2.0.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from imbalanced-learn) (3.5.0)\n"
]
}
],
"source": [
"# %pip installs into the environment of the running kernel; !pip uses whichever pip is first on PATH\n",
"%pip install imbalanced-learn"
]
},
{
"cell_type": "code",
"execution_count": 34,
"id": "93251236-2f47-466f-9d5a-fa8a42a07cc5",
"metadata": {},
"outputs": [],
"source": [
"import xml.etree.ElementTree as ET\n",
"import pandas as pd\n",
"import numpy as np\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.preprocessing import LabelEncoder\n",
"from sklearn.ensemble import RandomForestClassifier\n",
"from sklearn.metrics import accuracy_score, classification_report\n",
"from imblearn.over_sampling import SMOTE\n",
"from imblearn.over_sampling import RandomOverSampler\n",
"from imblearn.under_sampling import RandomUnderSampler\n",
"from collections import Counter"
]
},
{
"cell_type": "code",
"execution_count": 35,
"id": "cbae9878-d5c3-4db4-8950-13f549c6dae7",
"metadata": {},
"outputs": [],
"source": [
"# Parse the XML input file and keep a handle on its root element.\n",
"# NOTE(review): the next code cell parses this same file again - confirm whether this cell is still needed.\n",
"tree = ET.parse('research-summer-24/data-generation/cluster_point/SYNTH_OUTPUT2_cluster_point_process.xml')\n",
"root = tree.getroot()"
]
},
{
"cell_type": "code",
"execution_count": 36,
"id": "ba059b95-f435-4e7f-8715-9ca77de8ce9e",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Number of calls in training data: 1055\n",
"Number of calls in test data: 264\n",
"Class distribution after undersampling: Counter({0: 159, 1: 159, 2: 159})\n",
"Test set accuracy: 0.5303\n",
"{0: 'EMS', 1: 'Fire', 2: 'Law'}\n",
"\n",
"Classification Report:\n",
" precision recall f1-score support\n",
"\n",
" EMS 0.40 0.59 0.48 63\n",
" Fire 0.35 0.60 0.44 42\n",
" Law 0.77 0.49 0.60 159\n",
"\n",
" accuracy 0.53 264\n",
" macro avg 0.51 0.56 0.51 264\n",
"weighted avg 0.62 0.53 0.55 264\n",
"\n",
"\n",
"Confusion Matrix:\n",
"[[37 12 14]\n",
" [ 8 25 9]\n",
" [47 34 78]]\n",
"Predicted type for the next call: 1\n",
"Counter({2: 635, 0: 261, 1: 159})\n"
]
}
],
"source": [
"import pandas as pd\n",
"import xml.etree.ElementTree as ET\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.ensemble import RandomForestClassifier\n",
"from sklearn.metrics import accuracy_score\n",
"from imblearn.under_sampling import RandomUnderSampler\n",
"from collections import Counter\n",
"from sklearn.model_selection import GridSearchCV\n",
"from sklearn.metrics import classification_report, confusion_matrix, accuracy_score\n",
"\n",
"\n",
"# Step 1: Read the XML file and collect every event element into a DataFrame\n",
"# Reviewer note: the training set should hold well over 300 elements (low thousands)\n",
"tree = ET.parse(\"research-summer-24/data-generation/cluster_point/SYNTH_OUTPUT2_cluster_point_process.xml\")\n",
"root = tree.getroot()\n",
"\n",
"# One record per event: integer time, float x and y, and the type string (target variable)\n",
"events = [\n",
"    {\n",
"        'time': int(node.get('time')),\n",
"        'x': float(node.get('x')),\n",
"        'y': float(node.get('y')),\n",
"        'type': node.get('type'),\n",
"    }\n",
"    for node in root.findall(\".//event\")\n",
"]\n",
"\n",
"# Tabulate the records, one row per event\n",
"df = pd.DataFrame(events)\n",
"\n",
"# Step 2: Gap between each event's time and the one before it (the first row gets 0)\n",
"df['time_diff'] = df['time'].diff().fillna(0)\n",
"\n",
"# Step 3: Slide a window of n calls over the data; the first n-1 calls become features\n",
"n = 5  # Window size: 4 previous calls plus the call being predicted\n",
"lag_fields = ('time_diff', 'x', 'y', 'type')\n",
"processed_data = []\n",
"\n",
"for target_pos in range(n - 1, len(df)):\n",
"    features = {}\n",
"\n",
"    # Slot 1 is the oldest call in the window, slot n-1 the most recent one\n",
"    for slot, row_pos in enumerate(range(target_pos - (n - 1), target_pos), start=1):\n",
"        call = df.iloc[row_pos]\n",
"        for field in lag_fields:\n",
"            features[f'{field}_{slot}'] = call[field]\n",
"\n",
"    # The call that follows the window supplies the label\n",
"    features['type'] = df.iloc[target_pos]['type']\n",
"    processed_data.append(features)\n",
"\n",
"# Step 4: Tabulate the windows and one-hot encode the previous-call type columns\n",
"df_transformed = pd.DataFrame(processed_data)\n",
"type_columns = [f'type_{slot}' for slot in range(1, n)]\n",
"df_transformed = pd.get_dummies(df_transformed, columns=type_columns)\n",
"\n",
"# Step 5: Separate features and target variable, convert target to categorical\n",
"X = df_transformed.drop(columns=['type'])\n",
"y = df_transformed['type'].astype('category').cat.codes  # Encodes target as integer category codes\n",
"\n",
"# Hold out 20% of the windows for testing; the fixed seed keeps the split repeatable.\n",
"# NOTE(review): neighbouring windows share calls, so a shuffled split lets test windows\n",
"# overlap training windows - consider a chronological split instead.\n",
"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n",
"print(f\"Number of calls in training data: {len(X_train)}\")\n",
"print(f\"Number of calls in test data: {len(X_test)}\")\n",
"\n",
"# Undersample the training split only, so every class ends up as large as the rarest one.\n",
"# The sampler is seeded like the split and the model; unseeded, it drew a different\n",
"# subset on every run and the reported metrics could not be reproduced.\n",
"undersample = RandomUnderSampler(sampling_strategy='auto', random_state=42)\n",
"X_train_resampled, y_train_resampled = undersample.fit_resample(X_train, y_train)\n",
"\n",
"# Disabled experiment: the grid search below sits inside a bare triple-quoted string,\n",
"# so Python evaluates the literal and discards it without running any of the code.\n",
"\"\"\"param_grid = {\n",
" 'n_estimators': [100, 200, 300],\n",
" 'max_depth': [None, 10, 20],\n",
" 'min_samples_split': [2, 5],\n",
" 'min_samples_leaf': [1, 2],\n",
" 'max_features': ['sqrt', 'log2']\n",
"}\n",
"\n",
"# Perform grid search\n",
"grid_search = GridSearchCV(RandomForestClassifier(random_state=42, class_weight='balanced'), \n",
" param_grid, cv=3, scoring='accuracy', n_jobs=-1)\n",
"grid_search.fit(X_train_resampled, y_train_resampled)\n",
"\n",
"# Use the best model\n",
"rf_model = grid_search.best_estimator_\n",
"print(f\"Best Parameters: {grid_search.best_params_}\")\"\"\"\n",
"\n",
"# Check the class distribution after undersampling\n",
"print(f\"Class distribution after undersampling: {Counter(y_train_resampled)}\")\n",
"\n",
"# Step 6: Train the RandomForestClassifier on the balanced training data\n",
"rf_model = RandomForestClassifier(n_estimators=500, random_state=42)\n",
"rf_model.fit(X_train_resampled, y_train_resampled)\n",
"\n",
"# Evaluate the model on the test split, which was not resampled\n",
"y_pred = rf_model.predict(X_test)\n",
"score = accuracy_score(y_test, y_pred)\n",
"# Reviewer notes: compare against the known class frequencies; balance the classes; keep an\n",
"# experimenter's notebook; after balancing, the training set must be much larger than the\n",
"# number of parameters.\n",
"print(f\"Test set accuracy: {score:.4f}\")\n",
"\n",
"# Map integer codes back to type names using the same column that produced y, so the\n",
"# names cannot drift from the encoding (the raw df also holds the first n-1 calls,\n",
"# which never appear as targets).\n",
"type_mapping = dict(enumerate(df_transformed['type'].astype('category').cat.categories))\n",
"print(type_mapping)\n",
"class_codes = list(type_mapping)\n",
"class_names = list(type_mapping.values())\n",
"\n",
"# Passing labels explicitly keeps the rows aligned with target_names even when a class\n",
"# is missing from the test split or from the predictions.\n",
"print(\"\\nClassification Report:\")\n",
"print(classification_report(y_test, y_pred, labels=class_codes, target_names=class_names))\n",
"\n",
"# Print confusion matrix (rows are true classes, columns are predicted classes)\n",
"print(\"\\nConfusion Matrix:\")\n",
"conf_matrix = confusion_matrix(y_test, y_pred, labels=class_codes)\n",
"print(conf_matrix)\n",
"\n",
"# Prediction function for the next call type\n",
"def predict_next_call(df, model, n=5):\n",
"    \"\"\"Predict the type code of the call that would follow the last row of df.\n",
"\n",
"    Builds one feature row from the last n-1 rows of df, using the same column\n",
"    naming as the training windows (time_diff_k, x_k, y_k and one-hot type_k_* for\n",
"    k = 1..n-1, oldest call first), and passes it to model.predict.\n",
"\n",
"    Parameters:\n",
"        df: DataFrame with time_diff, x, y and type columns, one row per call.\n",
"        model: fitted estimator exposing predict(); when it carries\n",
"            feature_names_in_, that defines the expected columns, otherwise the\n",
"            notebook-level training frame X is used as before.\n",
"        n: window size used during training (n-1 previous calls are features).\n",
"\n",
"    Returns:\n",
"        The first element of model.predict's output (the encoded class).\n",
"\n",
"    Raises:\n",
"        IndexError: if df holds fewer than n-1 rows.\n",
"    \"\"\"\n",
"    history = n - 1\n",
"    if history > len(df):\n",
"        raise IndexError(f\"need at least {history} previous calls, got {len(df)}\")\n",
"\n",
"    # Slot 1 is the oldest of the last n-1 calls, slot n-1 the most recent one\n",
"    features = {}\n",
"    for slot, offset in enumerate(range(history, 0, -1), start=1):\n",
"        call = df.iloc[-offset]\n",
"        for field in ('time_diff', 'x', 'y', 'type'):\n",
"            features[f'{field}_{slot}'] = call[field]\n",
"\n",
"    # One-hot encode the single row; only the types present in it get a column here\n",
"    next_call_features = pd.get_dummies(\n",
"        pd.DataFrame([features]),\n",
"        columns=[f'type_{slot}' for slot in range(1, n)],\n",
"    )\n",
"\n",
"    # Align with the columns the model was fitted on instead of reaching for whatever\n",
"    # X happens to be in the kernel; missing one-hot columns are filled with 0\n",
"    expected_columns = getattr(model, 'feature_names_in_', None)\n",
"    if expected_columns is None:\n",
"        expected_columns = X.columns\n",
"    next_call_features = next_call_features.reindex(columns=list(expected_columns), fill_value=0)\n",
"\n",
"    # Predict the next call type\n",
"    prediction = model.predict(next_call_features)\n",
"    return prediction[0]\n",
"\n",
"# Example usage: predict the type of the call that would follow the last recorded one.\n",
"# The value printed is the integer class code, not the type name.\n",
"predicted_type = predict_next_call(df, rf_model, n)\n",
"print(f\"Predicted type for the next call: {predicted_type}\")\n",
"\n",
"# Class counts in the training split before undersampling, keyed by integer class code.\n",
"class_frequencies = Counter(y_train)\n",
"print(class_frequencies) # TODO: understand what these numbers mean and interpret the confusion matrix\n",
"\n",
"# TODO: keep the grid search commented out, use 200 estimators, and increase the training set significantly\n",
"# TODO: seed the random number generator for the test split so that the test set is not a subset of the training set"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ecbe6bf3-758e-412e-ada8-a62313f40c33",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "03a59dc5-13be-4feb-8ce5-e6eaf1d86e4e",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "5a40d247-cf52-4993-96f7-fc20ad1d52fd",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.4"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Loading