Skip to content

Commit 4afc303

Browse files
committed
feat: SageMaker notebook for tire model training + evaluation with visualizations
1 parent e83febf commit 4afc303

File tree

1 file changed

+345
-0
lines changed

1 file changed

+345
-0
lines changed
Lines changed: 345 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,345 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "markdown",
5+
"metadata": {},
6+
"source": [
7+
"# Tire Anomaly Detection — Model Training & Evaluation\n",
8+
"\n",
9+
"This notebook trains a SageMaker Random Cut Forest model for tire pressure anomaly detection.\n",
10+
"\n",
11+
"**What it does:**\n",
12+
"1. Loads the synthetic training dataset (721K records, 50 vehicles, 6 months)\n",
13+
"2. Prepares and normalizes features (pressure, temperature, delta_pressure, delta_temp)\n",
14+
"3. Trains an RCF model on normal data only (unsupervised anomaly detection)\n",
15+
"4. Evaluates on labeled test data (slow leaks, punctures, valve failures)\n",
16+
"5. Deploys a real-time inference endpoint\n",
17+
"\n",
18+
"**Prerequisites:**\n",
19+
"- Run `python3 scripts/generate_training_data.py` first to create the dataset\n",
20+
"- SageMaker execution role with S3 and SSM access\n",
21+
"- S3 bucket for training artifacts"
22+
]
23+
},
24+
{
25+
"cell_type": "code",
26+
"execution_count": null,
27+
"metadata": {},
28+
"outputs": [],
29+
"source": [
30+
# --- Imports & configuration -------------------------------------------------
# All imports live in this one cell so a Restart & Run All never hits a
# missing name further down the notebook.
import io
import json
import time
from datetime import datetime

import boto3
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Deployment configuration — update the placeholders before running.
REGION = 'us-east-2'
BUCKET = 'cms-tire-prediction-ACCOUNT-REGION'  # Update with your bucket
ROLE_ARN = 'arn:aws:iam::ACCOUNT:role/cms-sagemaker-execution-role'  # Update
STAGE = 'prod'
44+
]
45+
},
46+
{
47+
"cell_type": "markdown",
48+
"metadata": {},
49+
"source": [
50+
"## 1. Load and Explore Training Data"
51+
]
52+
},
53+
{
54+
"cell_type": "code",
55+
"execution_count": null,
56+
"metadata": {},
57+
"outputs": [],
58+
"source": [
59+
# Load the synthetic telemetry dataset written by generate_training_data.py.
df = pd.read_parquet('../data/training/tire_telemetry_full.parquet')

print(f'Dataset: {len(df):,} records')
print(f"Vehicles: {df['vehicle_id'].nunique()}")
print(f"Date range: {df['timestamp'].min()} → {df['timestamp'].max()}")
print('\nLabel distribution:')
print(df['label'].value_counts())
65+
]
66+
},
67+
{
68+
"cell_type": "code",
69+
"execution_count": null,
70+
"metadata": {},
71+
"outputs": [],
72+
"source": [
73+
# Visualize pressure distribution by label, plus one slow-leak time series.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Left panel: pressure histogram per label, overlapped with transparency.
for label in df.label.unique():
    subset = df[df.label == label]
    axes[0].hist(subset.pressure, bins=50, alpha=0.5, label=label)
# BUG FIX: draw the threshold line BEFORE legend() — the original drew it
# after, so the 'Alert threshold' entry never appeared in the legend.
axes[0].axvline(x=28, color='red', linestyle='--', label='Alert threshold')
axes[0].set_xlabel('Pressure (PSI)')
axes[0].set_ylabel('Count')
axes[0].set_title('Pressure Distribution by Label')
axes[0].legend()

# Right panel: the full pressure history of one front-left tire that has a
# slow leak, so the gradual decline toward the threshold is visible.
leak = df[(df.label == 'slow_leak') & (df.tire_id == 'FL')].sort_values('timestamp').head(500)
if len(leak) > 0:
    vid = leak.vehicle_id.iloc[0]
    vehicle_leak = df[(df.vehicle_id == vid) & (df.tire_id == 'FL')].sort_values('timestamp')
    axes[1].plot(range(len(vehicle_leak)), vehicle_leak.pressure.values, linewidth=0.5)
    axes[1].axhline(y=28, color='red', linestyle='--', label='Alert threshold')
    axes[1].set_xlabel('Reading #')
    axes[1].set_ylabel('Pressure (PSI)')
    axes[1].set_title(f'Slow Leak Example ({vid} FL)')
    axes[1].legend()

plt.tight_layout()
plt.show()
99+
]
100+
},
101+
{
102+
"cell_type": "markdown",
103+
"metadata": {},
104+
"source": [
105+
"## 2. Prepare Features"
106+
]
107+
},
108+
{
109+
"cell_type": "code",
110+
"execution_count": null,
111+
"metadata": {},
112+
"outputs": [],
113+
"source": [
114+
# Feature set used for training AND inference. Order matters: the training
# CSV columns and the endpoint request payload must match this order.
features = ['pressure', 'temperature', 'delta_pressure', 'delta_temp']

# Train on normal data only — RCF is unsupervised, so the model learns the
# "normal" manifold and scores deviations from it.
normal = df[df.label == 'normal'][features].dropna()
test = df[features + ['label']].dropna()

# Z-score stats computed from NORMAL rows only, so anomalies don't skew the
# scale. ROBUSTNESS FIX: guard std == 0 (constant feature) to avoid a
# division by zero flooding the normalized columns with inf/NaN.
stats = {}
for col in features:
    std = float(normal[col].std())
    stats[col] = {'mean': float(normal[col].mean()), 'std': std if std > 0 else 1.0}

train_norm = normal.copy()
test_norm = test.copy()
for col in features:
    train_norm[col] = (train_norm[col] - stats[col]['mean']) / stats[col]['std']
    test_norm[col] = (test_norm[col] - stats[col]['mean']) / stats[col]['std']

print(f'Training: {len(train_norm):,} (normal only)')
print(f'Test: {len(test_norm):,} (all labels)')
print('\nNormalization stats:')
for k, v in stats.items():
    print(f'  {k}: mean={v["mean"]:.3f}, std={v["std"]:.3f}')
136+
]
137+
},
138+
{
139+
"cell_type": "markdown",
140+
"metadata": {},
141+
"source": [
142+
"## 3. Train Random Cut Forest Model"
143+
]
144+
},
145+
{
146+
"cell_type": "code",
147+
"execution_count": null,
148+
"metadata": {},
149+
"outputs": [],
150+
"source": [
151+
# --- Train a Random Cut Forest model ------------------------------------------
sm = boto3.client('sagemaker', region_name=REGION)
s3 = boto3.client('s3', region_name=REGION)

train_array = train_norm[features].values.astype('float32')
job_name = f'tire-rcf-{datetime.now().strftime("%Y%m%d-%H%M%S")}'
prefix = f'tire-prediction/training/{job_name}'

# Upload training data as header-less CSV — the layout RCF File mode expects.
buf = io.StringIO()
pd.DataFrame(train_array).to_csv(buf, header=False, index=False)
s3.put_object(Bucket=BUCKET, Key=f'{prefix}/train/train.csv', Body=buf.getvalue())
print(f'Uploaded {len(train_array):,} training samples to s3://{BUCKET}/{prefix}/train/')

# First-party RCF algorithm image; the ECR account id varies by region.
acct_map = {'us-east-1': '382416733822', 'us-east-2': '404615174143', 'us-west-2': '174872318107'}
image = f'{acct_map.get(REGION, "404615174143")}.dkr.ecr.{REGION}.amazonaws.com/randomcutforest:latest'

sm.create_training_job(
    TrainingJobName=job_name,
    AlgorithmSpecification={'TrainingImage': image, 'TrainingInputMode': 'File'},
    RoleArn=ROLE_ARN,
    InputDataConfig=[{'ChannelName': 'train', 'DataSource': {'S3DataSource': {
        'S3DataType': 'S3Prefix', 'S3Uri': f's3://{BUCKET}/{prefix}/train',
        'S3DataDistributionType': 'ShardedByS3Key'}}, 'ContentType': 'text/csv;label_size=0'}],
    OutputDataConfig={'S3OutputPath': f's3://{BUCKET}/{prefix}/output'},
    ResourceConfig={'InstanceType': 'ml.m5.large', 'InstanceCount': 1, 'VolumeSizeInGB': 10},
    StoppingCondition={'MaxRuntimeInSeconds': 600},
    # CONSISTENCY FIX: derive feature_dim from the feature list instead of a
    # hard-coded '4', so the two cannot drift apart.
    HyperParameters={'num_samples_per_tree': '256', 'num_trees': '100',
                     'feature_dim': str(len(features))},
)
print(f'Training job started: {job_name}')

# Poll until the job reaches a terminal state.
while True:
    desc = sm.describe_training_job(TrainingJobName=job_name)
    status = desc['TrainingJobStatus']
    print(f'  {status}')
    if status in ('Completed', 'Failed', 'Stopped'):
        break
    time.sleep(30)

# BUG FIX: the original read ModelArtifacts unconditionally, so a Failed or
# Stopped job died with an opaque KeyError. Fail loudly with the reason.
if status != 'Completed':
    raise RuntimeError(f'Training job ended in {status}: {desc.get("FailureReason", "unknown")}')

model_data = desc['ModelArtifacts']['S3ModelArtifacts']
print(f'\n✅ Model: {model_data}')
191+
]
192+
},
193+
{
194+
"cell_type": "markdown",
195+
"metadata": {},
196+
"source": [
197+
"## 4. Deploy Endpoint"
198+
]
199+
},
200+
{
201+
"cell_type": "code",
202+
"execution_count": null,
203+
"metadata": {},
204+
"outputs": [],
205+
"source": [
206+
# --- Deploy a real-time inference endpoint ------------------------------------
ts = datetime.now().strftime('%Y%m%d-%H%M%S')
# Endpoint name is date-only (stable across same-day retrains); model and
# config names carry the full timestamp so each deploy is unique.
endpoint_name = f'tire-anomaly-{datetime.now().strftime("%Y%m%d")}'
model_name = f'tire-rcf-{ts}'
config_name = f'tire-rcf-cfg-{ts}'

sm.create_model(ModelName=model_name, ExecutionRoleArn=ROLE_ARN,
                PrimaryContainer={'Image': image, 'ModelDataUrl': model_data})

sm.create_endpoint_config(EndpointConfigName=config_name, ProductionVariants=[{
    'VariantName': 'default', 'ModelName': model_name,
    'InstanceType': 'ml.m5.large', 'InitialInstanceCount': 1}])

sm.create_endpoint(EndpointName=endpoint_name, EndpointConfigName=config_name)
print(f'Creating endpoint: {endpoint_name}')

# Poll until terminal state.
while True:
    ep = sm.describe_endpoint(EndpointName=endpoint_name)
    status = ep['EndpointStatus']
    print(f'  {status}')
    if status in ('InService', 'Failed'):
        break
    time.sleep(30)

# BUG FIX: the original printed "✅ Endpoint ready" even when the endpoint
# ended in Failed. Surface the failure reason instead.
if status == 'Failed':
    raise RuntimeError(f'Endpoint creation failed: {ep.get("FailureReason", "unknown")}')

print(f'\n✅ Endpoint ready: {endpoint_name}')
228+
]
229+
},
230+
{
231+
"cell_type": "markdown",
232+
"metadata": {},
233+
"source": [
234+
"## 5. Evaluate Model"
235+
]
236+
},
237+
{
238+
"cell_type": "code",
239+
"execution_count": null,
240+
"metadata": {},
241+
"outputs": [],
242+
"source": [
243+
# --- Evaluate on labeled test data --------------------------------------------
sm_runtime = boto3.client('sagemaker-runtime', region_name=REGION)

test_array = test_norm[features].values.astype('float32')
labels = test_norm['label'].values

# Score a sample in batches — the full test set would be slow over HTTP.
# NOTE(review): this takes the FIRST rows, which are likely ordered by
# vehicle/time and may bias the sample; a seeded random sample would be
# fairer. Kept as-is for determinism — confirm with data owner.
EVAL_LIMIT = 10000   # was a magic number inline
n_eval = min(len(test_array), EVAL_LIMIT)
scores = []
batch_size = 500
for i in range(0, n_eval, batch_size):
    batch = test_array[i:i + batch_size]
    # RCF accepts text/csv rows; response is {'scores': [{'score': f}, ...]}.
    body = '\n'.join(','.join(str(v) for v in row) for row in batch)
    resp = sm_runtime.invoke_endpoint(
        EndpointName=endpoint_name, ContentType='text/csv', Body=body)
    result = json.loads(resp['Body'].read().decode())
    scores.extend([r['score'] for r in result['scores']])
    if i % 2000 == 0:
        print(f'  {i}/{n_eval}')

scores = np.array(scores)
sample_labels = labels[:len(scores)]

# Threshold = 95th percentile of the NORMAL score distribution, i.e. a ~5%
# false-positive rate on normal traffic by construction.
normal_scores = scores[sample_labels == 'normal']
anomaly_scores = scores[sample_labels != 'normal']
# ROBUSTNESS FIX: percentile of an empty array raises an obscure error;
# fail with an actionable message instead.
if len(normal_scores) == 0:
    raise ValueError('Evaluation sample contains no normal records; cannot set a threshold')
threshold = float(np.percentile(normal_scores, 95))

print(f'Threshold: {threshold:.4f}')
print(f'Normal scores: mean={normal_scores.mean():.4f}, p95={np.percentile(normal_scores, 95):.4f}')
# ROBUSTNESS FIX: .mean() on an empty slice warns and prints nan.
if len(anomaly_scores) > 0:
    print(f'Anomaly scores: mean={anomaly_scores.mean():.4f}, p95={np.percentile(anomaly_scores, 95):.4f}')

# Precision / recall / F1, treating every non-normal label as a positive.
predictions = scores > threshold
true_anomalies = sample_labels != 'normal'
tp = np.sum(predictions & true_anomalies)
fp = np.sum(predictions & ~true_anomalies)
fn = np.sum(~predictions & true_anomalies)
precision = tp / (tp + fp) if (tp + fp) > 0 else 0
recall = tp / (tp + fn) if (tp + fn) > 0 else 0
f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0

print(f'\nPrecision: {precision:.3f}')
print(f'Recall: {recall:.3f}')
print(f'F1 Score: {f1:.3f}')
285+
]
286+
},
287+
{
288+
"cell_type": "code",
289+
"execution_count": null,
290+
"metadata": {},
291+
"outputs": [],
292+
"source": [
293+
# Compare score distributions for normal vs anomalous readings, with the
# chosen alert threshold overlaid — separation here is what makes the
# detector usable.
fig, ax = plt.subplots(figsize=(10, 5))
for data, name, colour in ((normal_scores, 'Normal', 'green'),
                           (anomaly_scores, 'Anomaly', 'red')):
    ax.hist(data, bins=50, alpha=0.5, label=name, color=colour)
ax.axvline(x=threshold, color='black', linestyle='--', label=f'Threshold ({threshold:.2f})')
ax.set_xlabel('Anomaly Score')
ax.set_ylabel('Count')
ax.set_title('RCF Anomaly Score Distribution')
ax.legend()
plt.show()
303+
]
304+
},
305+
{
306+
"cell_type": "markdown",
307+
"metadata": {},
308+
"source": [
309+
"## 6. Save Configuration to SSM"
310+
]
311+
},
312+
{
313+
"cell_type": "code",
314+
"execution_count": null,
315+
"metadata": {},
316+
"outputs": [],
317+
"source": [
318+
# --- Persist inference configuration to SSM Parameter Store -------------------
# Downstream Lambdas read these parameters, so normalization stats and the
# threshold must match exactly what this notebook trained with.
ssm = boto3.client('ssm', region_name=REGION)
# NAMING FIX: was `prefix`, which silently shadowed the S3 training prefix
# defined in the training cell — a hidden-state hazard on partial re-runs.
ssm_prefix = f'/tire-prediction/{STAGE}'

ssm.put_parameter(Name=f'{ssm_prefix}/normalization-stats', Value=json.dumps(stats), Type='String', Overwrite=True)
ssm.put_parameter(Name=f'{ssm_prefix}/anomaly-threshold', Value=json.dumps({'threshold': threshold}), Type='String', Overwrite=True)
ssm.put_parameter(Name=f'{ssm_prefix}/endpoint-name', Value=endpoint_name, Type='String', Overwrite=True)

print(f'✅ Config saved to SSM ({ssm_prefix}/*)')
print(f'  Normalization stats: {json.dumps(stats, indent=2)}')
print(f'  Threshold: {threshold}')
print(f'  Endpoint: {endpoint_name}')
329+
]
330+
}
331+
],
332+
"metadata": {
333+
"kernelspec": {
334+
"display_name": "Python 3",
335+
"language": "python",
336+
"name": "python3"
337+
},
338+
"language_info": {
339+
"name": "python",
340+
"version": "3.13.0"
341+
}
342+
},
343+
"nbformat": 4,
344+
"nbformat_minor": 4
345+
}

0 commit comments

Comments
 (0)