UWB-Biocomputing · rimjhimsudhesh · Jan 16, 2025
diff --git a/Tools/911-models.ipynb b/Tools/911-models.ipynb
@@ -0,0 +1,283 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 33,
+   "id": "1b3edd9c-127a-4f3a-9082-31776f2be0da",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Requirement already satisfied: imbalanced-learn in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (0.12.4)\n",
+      "Requirement already satisfied: numpy>=1.17.3 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from imbalanced-learn) (2.0.0)\n",
+      "Requirement already satisfied: scipy>=1.5.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from imbalanced-learn) (1.14.0)\n",
+      "Requirement already satisfied: scikit-learn>=1.0.2 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from imbalanced-learn) (1.5.1)\n",
+      "Requirement already satisfied: joblib>=1.1.1 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from imbalanced-learn) (1.4.2)\n",
+      "Requirement already satisfied: threadpoolctl>=2.0.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from imbalanced-learn) (3.5.0)\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Use the %pip magic (not !pip) so the package is installed into the environment of the running kernel;\n",
+    "# the version is pinned to the one recorded in this notebook's saved output.\n",
+    "%pip install imbalanced-learn==0.12.4"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 34,
+   "id": "93251236-2f47-466f-9d5a-fa8a42a07cc5",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import xml.etree.ElementTree as ET\n",
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "from sklearn.preprocessing import LabelEncoder\n",
+    "from sklearn.ensemble import RandomForestClassifier\n",
+    "from sklearn.metrics import accuracy_score, classification_report\n",
+    "from imblearn.over_sampling import SMOTE\n",
+    "from imblearn.over_sampling import RandomOverSampler\n",
+    "from imblearn.under_sampling import RandomUnderSampler\n",
+    "from collections import Counter"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 35,
+   "id": "cbae9878-d5c3-4db4-8950-13f549c6dae7",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "tree = ET.parse('research-summer-24/data-generation/cluster_point/SYNTH_OUTPUT2_cluster_point_process.xml')\n",
+    "root = tree.getroot()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 36,
+   "id": "ba059b95-f435-4e7f-8715-9ca77de8ce9e",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Number of calls in training data: 1055\n",
+      "Number of calls in test data: 264\n",
+      "Class distribution after undersampling: Counter({0: 159, 1: 159, 2: 159})\n",
+      "Test set accuracy: 0.5303\n",
+      "{0: 'EMS', 1: 'Fire', 2: 'Law'}\n",
+      "\n",
+      "Classification Report:\n",
+      "              precision    recall  f1-score   support\n",
+      "\n",
+      "         EMS       0.40      0.59      0.48        63\n",
+      "        Fire       0.35      0.60      0.44        42\n",
+      "         Law       0.77      0.49      0.60       159\n",
+      "\n",
+      "    accuracy                           0.53       264\n",
+      "   macro avg       0.51      0.56      0.51       264\n",
+      "weighted avg       0.62      0.53      0.55       264\n",
+      "\n",
+      "\n",
+      "Confusion Matrix:\n",
+      "[[37 12 14]\n",
+      " [ 8 25  9]\n",
+      " [47 34 78]]\n",
+      "Predicted type for the next call: 1\n",
+      "Counter({2: 635, 0: 261, 1: 159})\n"
+     ]
+    }
+   ],
+   "source": [
+    "import pandas as pd\n",
+    "import xml.etree.ElementTree as ET\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "from sklearn.ensemble import RandomForestClassifier\n",
+    "from sklearn.metrics import accuracy_score\n",
+    "from imblearn.under_sampling import RandomUnderSampler\n",
+    "from collections import Counter\n",
+    "from sklearn.model_selection import GridSearchCV\n",
+    "from sklearn.metrics import classification_report, confusion_matrix, accuracy_score\n",
+    "\n",
+    "\n",
+    "# Step 1: Read the XML file and turn every <event> element into one DataFrame row\n",
+    "tree = ET.parse(\"research-summer-24/data-generation/cluster_point/SYNTH_OUTPUT2_cluster_point_process.xml\") ## the training set should hold well over 300 elements (low 1000s)\n",
+    "root = tree.getroot()\n",
+    "\n",
+    "# One record per event: its time, its x/y coordinates, and its type (the target variable)\n",
+    "events = [\n",
+    "    {\n",
+    "        'time': int(node.get('time')),\n",
+    "        'x': float(node.get('x')),\n",
+    "        'y': float(node.get('y')),\n",
+    "        'type': node.get('type'),\n",
+    "    }\n",
+    "    for node in root.findall(\".//event\")\n",
+    "]\n",
+    "\n",
+    "# Assemble the records into a single frame\n",
+    "df = pd.DataFrame(events)\n",
+    "\n",
+    "# Step 2: Gap between each event and the one before it (the first event has no predecessor, so it gets 0)\n",
+    "df['time_diff'] = df['time'].diff().fillna(0)\n",
+    "\n",
+    "# Step 3: Slide a window of n calls over the data (n - 1 previous calls as predictors, 1 target)\n",
+    "n = 5\n",
+    "history = n - 1\n",
+    "processed_data = []\n",
+    "\n",
+    "for target_pos in range(history, len(df)):\n",
+    "    features = {}\n",
+    "\n",
+    "    # Slots are numbered 1..n-1, from the oldest previous call to the most recent one.\n",
+    "    # Keys are inserted in the same order for every row, which fixes the column order of the frame.\n",
+    "    for slot in range(1, n):\n",
+    "        call = df.iloc[target_pos - n + slot]\n",
+    "        features.update({\n",
+    "            f'time_diff_{slot}': call['time_diff'],\n",
+    "            f'x_{slot}': call['x'],\n",
+    "            f'y_{slot}': call['y'],\n",
+    "            f'type_{slot}': call['type'],\n",
+    "        })\n",
+    "\n",
+    "    # The call at the end of the window supplies the label\n",
+    "    features['type'] = df.iloc[target_pos]['type']\n",
+    "    processed_data.append(features)\n",
+    "\n",
+    "# Step 4: Build the windowed frame and one-hot encode the types of the previous calls\n",
+    "history_type_cols = [f'type_{slot}' for slot in range(1, n)]\n",
+    "df_transformed = pd.get_dummies(pd.DataFrame(processed_data), columns=history_type_cols)\n",
+    "\n",
+    "# Step 5: Split into predictors and target; the target is stored as integer category codes\n",
+    "X = df_transformed.drop(columns=['type'])\n",
+    "y = df_transformed['type'].astype('category').cat.codes  # Encodes target as numeric\n",
+    "\n",
+    "# Split data into training and test sets (seeded, so the split is the same on every run)\n",
+    "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n",
+    "print(f\"Number of calls in training data: {len(X_train)}\")\n",
+    "print(f\"Number of calls in test data: {len(X_test)}\")\n",
+    "\n",
+    "# Apply undersampling to balance classes in the training split only; the test split is left as-is.\n",
+    "# The sampler is seeded as well: without random_state it drops a different random subset on every run,\n",
+    "# so the reported metrics would not be reproducible even though the split and the forest are seeded.\n",
+    "undersample = RandomUnderSampler(sampling_strategy='auto', random_state=42)\n",
+    "X_train_resampled, y_train_resampled = undersample.fit_resample(X_train, y_train)\n",
+    "\n",
+    "\"\"\"param_grid = {\n",
+    "    'n_estimators': [100, 200, 300],\n",
+    "    'max_depth': [None, 10, 20],\n",
+    "    'min_samples_split': [2, 5],\n",
+    "    'min_samples_leaf': [1, 2],\n",
+    "    'max_features': ['sqrt', 'log2']\n",
+    "}\n",
+    "\n",
+    "# Perform grid search\n",
+    "grid_search = GridSearchCV(RandomForestClassifier(random_state=42, class_weight='balanced'), \n",
+    "                           param_grid, cv=3, scoring='accuracy', n_jobs=-1)\n",
+    "grid_search.fit(X_train_resampled, y_train_resampled)\n",
+    "\n",
+    "# Use the best model\n",
+    "rf_model = grid_search.best_estimator_\n",
+    "print(f\"Best Parameters: {grid_search.best_params_}\")\"\"\"\n",
+    "\n",
+    "# Check the class distribution after undersampling\n",
+    "print(f\"Class distribution after undersampling: {Counter(y_train_resampled)}\")\n",
+    " \n",
+    "# Step 6: Train the RandomForestClassifier on the balanced training data\n",
+    "rf_model = RandomForestClassifier(n_estimators=500, random_state=42)\n",
+    "rf_model.fit(X_train_resampled, y_train_resampled)\n",
+    "\n",
+    "# Evaluate the model on the test set (which was not resampled)\n",
+    "y_pred = rf_model.predict(X_test)\n",
+    "score = accuracy_score(y_test, y_pred)\n",
+    "print(f\"Test set accuracy: {score:.4f}\") ### known frequency of different classes. balance, experimenter's notebook. size of the training set after balancing has to be much larger than number of parameters\n",
+    "\n",
+    "# Build the code -> name mapping from the same column that produced y (df_transformed['type']).\n",
+    "# Deriving it from the raw df instead could shift the names if the two columns had different category sets.\n",
+    "type_mapping = dict(enumerate(df_transformed['type'].astype('category').cat.categories))\n",
+    "print(type_mapping)\n",
+    "class_codes = list(type_mapping.keys())\n",
+    "class_names = list(type_mapping.values())\n",
+    "\n",
+    "# Pass labels explicitly so every name is paired with its own code, even if a class is absent from the test split\n",
+    "print(\"\\nClassification Report:\")\n",
+    "print(classification_report(y_test, y_pred, labels=class_codes, target_names=class_names))\n",
+    "\n",
+    "# Print confusion matrix (rows = true class, columns = predicted class, both in class_codes order)\n",
+    "print(\"\\nConfusion Matrix:\")\n",
+    "conf_matrix = confusion_matrix(y_test, y_pred, labels=class_codes)\n",
+    "print(conf_matrix)\n",
+    "\n",
+    "# Prediction function for the next call type\n",
+    "def predict_next_call(df, model, n=5):\n",
+    "    features = {}\n",
+    "\n",
+    "    # Extract the last n-1 calls to construct features for prediction\n",
+    "    for j in range(n - 1):\n",
+    "        call = df.iloc[-(n - 1 - j)]\n",
+    "        features[f'time_diff_{j+1}'] = call['time_diff']\n",
+    "        features[f'x_{j+1}'] = call['x']\n",
+    "        features[f'y_{j+1}'] = call['y']\n",
+    "        features[f'type_{j+1}'] = call['type']\n",
+    "    \n",
+    "    # Convert to DataFrame and one-hot encode\n",
+    "    next_call_features = pd.DataFrame([features])\n",
+    "    next_call_features = pd.get_dummies(next_call_features, columns=[f'type_{j+1}' for j in range(n - 1)])\n",
+    "    next_call_features = next_call_features.reindex(columns=X.columns, fill_value=0)\n",
+    "    \n",
+    "    # Predict the next call type\n",
+    "    prediction = model.predict(next_call_features)\n",
+    "    return prediction[0]\n",
+    "\n",
+    "# Example usage to predict the next call type\n",
+    "predicted_type = predict_next_call(df, rf_model, n)\n",
+    "print(f\"Predicted type for the next call: {predicted_type}\")\n",
+    "\n",
+    "class_frequencies = Counter(y_train)\n",
+    "print(class_frequencies) ## understand what those numbers mean, interpret the confusion matrix\n",
+    "\n",
+    "## comment out grid search and use 200 estimators and increase training set significantly\n",
+    "## seed the random number generator for the testing set so that testing set is not subset of training set"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ecbe6bf3-758e-412e-ada8-a62313f40c33",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "03a59dc5-13be-4feb-8ce5-e6eaf1d86e4e",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5a40d247-cf52-4993-96f7-fc20ad1d52fd",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.4"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}