# %% [install]
# Notebook magic, shown commented so this listing stays valid Python; uncomment
# when running inside Jupyter.  `%pip` (rather than `!pip`) installs into the
# running kernel's environment, and the pin matches the version reported by the
# recorded cell output.
# %pip install -q imbalanced-learn==0.12.4

# %% [imports]
# Single import cell: every name either original import cell provided is kept,
# duplicates are merged.
import xml.etree.ElementTree as ET
from collections import Counter

import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import LabelEncoder

# %% [configuration]
# Everything a reader might tune lives here instead of being repeated inline.
XML_PATH = "research-summer-24/data-generation/cluster_point/SYNTH_OUTPUT2_cluster_point_process.xml"
SEED = 42            # shared by the split, the undersampler and the forest
TEST_FRACTION = 0.2  # share of windows held out for testing
N_ESTIMATORS = 500   # number of trees in the random forest
n = 5                # Number of calls to consider (4 previous, 1 target)

# %% [Step 1: parse the XML file and load the events into a DataFrame]
# NOTE: at least more than 300 elements in the training set (lows 1000s)
tree = ET.parse(XML_PATH)
root = tree.getroot()

# One record per <event> element; 'type' is the target variable.
events = [
    {
        'time': int(event.get('time')),
        'x': float(event.get('x')),
        'y': float(event.get('y')),
        'type': event.get('type'),
    }
    for event in root.findall(".//event")
]
if not events:
    raise ValueError(f"No <event> elements found in {XML_PATH!r}")

df = pd.DataFrame(events)

# %% [Step 2: time differences between consecutive events]
# The first event has no predecessor, so its difference is filled with 0.
# NOTE(review): this presumes the events appear in chronological order in the
# XML file -- confirm against the data generator.
df['time_diff'] = df['time'].diff().fillna(0)

# %% [Step 3: sliding windows -- the previous n-1 calls become the features]
if len(df) < n:
    raise ValueError(f"Need at least {n} events to build one window, found {len(df)}")

# Plain dict records avoid building a pandas Series for every (row, lag) pair.
records = df[['time_diff', 'x', 'y', 'type']].to_dict('records')
processed_data = []
for i in range(n - 1, len(records)):
    features = {}
    # Oldest call in the window gets suffix _1, the most recent one _{n-1}.
    for j, call in enumerate(records[i - (n - 1):i], start=1):
        features[f'time_diff_{j}'] = call['time_diff']
        features[f'x_{j}'] = call['x']
        features[f'y_{j}'] = call['y']
        features[f'type_{j}'] = call['type']

    # The current call's type is the target.
    features['type'] = records[i]['type']
    processed_data.append(features)

# %% [Step 4: one-hot encode the types of the previous calls]
df_transformed = pd.DataFrame(processed_data)
df_transformed = pd.get_dummies(df_transformed, columns=[f'type_{j+1}' for j in range(n - 1)])

# %% [Step 5: features / target, train-test split, class balancing]
X = df_transformed.drop(columns=['type'])
y = df_transformed['type'].astype('category').cat.codes  # Encodes target as numeric

# NOTE(review): consecutive windows share n-2 of their calls, so a shuffled
# split places near-duplicate rows on both sides of the train/test boundary.
# Consider a chronological split (shuffle=False) when interpreting the scores.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_FRACTION, random_state=SEED
)
print(f"Number of calls in training data: {len(X_train)}")
print(f"Number of calls in test data: {len(X_test)}")

# Undersample down to the rarest class.  Seeded so that the balanced training
# set -- and therefore every metric reported below -- is reproducible.
undersample = RandomUnderSampler(sampling_strategy='auto', random_state=SEED)
X_train_resampled, y_train_resampled = undersample.fit_resample(X_train, y_train)

# Grid search, currently disabled (kept for reference):
# param_grid = {
#     'n_estimators': [100, 200, 300],
#     'max_depth': [None, 10, 20],
#     'min_samples_split': [2, 5],
#     'min_samples_leaf': [1, 2],
#     'max_features': ['sqrt', 'log2']
# }
#
# # Perform grid search
# grid_search = GridSearchCV(RandomForestClassifier(random_state=42, class_weight='balanced'),
#                            param_grid, cv=3, scoring='accuracy', n_jobs=-1)
# grid_search.fit(X_train_resampled, y_train_resampled)
#
# # Use the best model
# rf_model = grid_search.best_estimator_
# print(f"Best Parameters: {grid_search.best_params_}")

# Check the class distribution after undersampling
print(f"Class distribution after undersampling: {Counter(y_train_resampled)}")

# %% [Step 6: train the RandomForestClassifier and score it on the test set]
rf_model = RandomForestClassifier(n_estimators=N_ESTIMATORS, random_state=SEED)
rf_model.fit(X_train_resampled, y_train_resampled)

y_pred = rf_model.predict(X_test)
score = accuracy_score(y_test, y_pred)
print(f"Test set accuracy: {score:.4f}")
# NOTE: known frequency of different classes. balance, experimenter's notebook.
# NOTE: size of the training set after balancing has to be much larger than number of parameters

# %% [class labels, classification report, confusion matrix]
# Build the code -> label mapping from the same column the target codes were
# derived from, so labels cannot drift if a type only occurs in the first
# n-1 calls (which never become targets).
type_mapping = dict(enumerate(df_transformed['type'].astype('category').cat.categories))
print(type_mapping)

# Passing the labels explicitly keeps the report and the matrix aligned with
# type_mapping even when a class is absent from the test split.
class_labels = list(type_mapping.keys())
class_names = list(type_mapping.values())

print("\nClassification Report:")
print(classification_report(y_test, y_pred, labels=class_labels, target_names=class_names))

# Print confusion matrix (rows: true class, columns: predicted class)
print("\nConfusion Matrix:")
conf_matrix = confusion_matrix(y_test, y_pred, labels=class_labels)
print(conf_matrix)


# %% [prediction helper]
def predict_next_call(df, model, n=5, feature_columns=None):
    """Predict the type code of the call that follows the last row of ``df``.

    The last ``n - 1`` rows of ``df`` are flattened into a single feature row
    (``time_diff_k``, ``x_k``, ``y_k`` plus one-hot ``type_k`` columns, with
    ``k = 1`` the oldest call), aligned to the training column layout and
    passed to ``model.predict``.

    Parameters
    ----------
    df : pandas.DataFrame
        Call history with ``time_diff``, ``x``, ``y`` and ``type`` columns.
    model : fitted estimator
        Object exposing ``predict``.
    n : int, default 5
        Window size used at training time (``n - 1`` previous calls).
    feature_columns : sequence of str, optional
        Column layout the model was trained on.  When omitted, the model's
        ``feature_names_in_`` attribute is used instead of relying on a
        notebook-global variable.

    Returns
    -------
    The first element of ``model.predict(...)`` -- the numeric class code.

    Raises
    ------
    ValueError
        If ``n < 2``, if ``df`` holds fewer than ``n - 1`` rows, or if the
        training column layout cannot be determined.
    """
    history = n - 1
    if history < 1:
        raise ValueError("n must be at least 2 so that one previous call is available")
    if len(df) < history:
        raise ValueError(f"Need at least {history} previous calls, got {len(df)}")

    if feature_columns is None:
        feature_columns = getattr(model, "feature_names_in_", None)
        if feature_columns is None:
            raise ValueError(
                "feature_columns was not given and the model does not expose feature_names_in_"
            )

    features = {}

    # Extract the last n-1 calls to construct features for prediction
    for j in range(history):
        call = df.iloc[-(history - j)]
        features[f'time_diff_{j+1}'] = call['time_diff']
        features[f'x_{j+1}'] = call['x']
        features[f'y_{j+1}'] = call['y']
        features[f'type_{j+1}'] = call['type']

    # Convert to DataFrame and one-hot encode
    next_call_features = pd.DataFrame([features])
    next_call_features = pd.get_dummies(
        next_call_features, columns=[f'type_{j+1}' for j in range(history)]
    )
    # A single row only yields the dummy columns for the types it contains;
    # reindex adds the missing training columns as 0 and fixes the order.
    next_call_features = next_call_features.reindex(columns=feature_columns, fill_value=0)

    # Predict the next call type
    prediction = model.predict(next_call_features)
    return prediction[0]


# %% [example usage: predict the next call type]
predicted_type = predict_next_call(df, rf_model, n, feature_columns=X.columns)
print(f"Predicted type for the next call: {predicted_type}")
# The code alone is hard to read; also show the label it stands for.
print(f"Predicted label for the next call: {type_mapping.get(int(predicted_type), predicted_type)}")

# Class frequencies of the training split before undersampling.
class_frequencies = Counter(y_train)
print(class_frequencies)
# NOTE: understand what those numbers mean, interpret the confusion matrix

# NOTE: comment out grid search and use 200 estimators and increase training set significantly
# NOTE: seed the random number generator for the testing set so that testing set is not subset of training set
"cell_type": "code", + "execution_count": null, + "id": "ecbe6bf3-758e-412e-ada8-a62313f40c33", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "03a59dc5-13be-4feb-8ce5-e6eaf1d86e4e", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5a40d247-cf52-4993-96f7-fc20ad1d52fd", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.4" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}