Iss26 #44

Open
wants to merge 2 commits into master

30 changes: 28 additions & 2 deletions SageMaker-ModelMonitoring.ipynb
@@ -32,7 +32,6 @@
"execution_count": 1,
"source": [
"%%time\n",
"# cell 01\n",
"\n",
"# Handful of configuration\n",
"\n",
@@ -98,6 +97,7 @@
"execution_count": 2,
"source": [
"# cell 02\n",
"\n",
"# Upload some test files\n",
"boto3.Session().resource('s3').Bucket(bucket).Object(\"test_upload/test.txt\").upload_file('test_data/upload-test-file.txt')\n",
"print(\"Success! You are all set to proceed.\")"
@@ -129,6 +129,7 @@
"execution_count": 3,
"source": [
"# cell 03\n",
"\n",
"model_file = open(\"model/xgb-churn-prediction-model.tar.gz\", 'rb')\n",
"s3_key = os.path.join(prefix, 'xgb-churn-prediction-model.tar.gz')\n",
"boto3.Session().resource('s3').Bucket(bucket).Object(s3_key).upload_fileobj(model_file)"
@@ -149,6 +150,7 @@
"execution_count": 4,
"source": [
"# cell 04\n",
"\n",
"from time import gmtime, strftime\n",
"from sagemaker.model import Model\n",
"from sagemaker.image_uris import retrieve\n",
@@ -174,6 +176,7 @@
"execution_count": 5,
"source": [
"# cell 05\n",
"\n",
"from sagemaker.model_monitor import DataCaptureConfig\n",
"\n",
"endpoint_name = 'DEMO-xgb-churn-pred-model-monitor-' + strftime(\"%Y-%m-%d-%H-%M-%S\", gmtime())\n",
@@ -222,6 +225,7 @@
"execution_count": 6,
"source": [
"# cell 06\n",
"\n",
"from sagemaker.predictor import Predictor\n",
"import sagemaker\n",
"import time\n",
@@ -269,6 +273,7 @@
"execution_count": 7,
"source": [
"# cell 07\n",
"\n",
"s3_client = boto3.Session().client('s3')\n",
"current_endpoint_capture_prefix = '{}/{}'.format(data_capture_prefix, endpoint_name)\n",
"result = s3_client.list_objects(Bucket=bucket, Prefix=current_endpoint_capture_prefix)\n",
@@ -303,6 +308,7 @@
"execution_count": 8,
"source": [
"# cell 08\n",
"\n",
"def get_obj_body(obj_key):\n",
" return s3_client.get_object(Bucket=bucket, Key=obj_key).get('Body').read().decode(\"utf-8\")\n",
"\n",
@@ -335,6 +341,7 @@
"execution_count": 9,
"source": [
"# cell 09\n",
"\n",
"import json\n",
"print(json.dumps(json.loads(capture_file.split('\\n')[0]), indent=2))"
],
@@ -417,6 +424,7 @@
"execution_count": 10,
"source": [
"# cell 10\n",
"\n",
"# copy over the training dataset to Amazon S3 (if you already have it in Amazon S3, you could reuse it)\n",
"baseline_prefix = prefix + '/baselining'\n",
"baseline_data_prefix = baseline_prefix + '/data'\n",
@@ -444,6 +452,7 @@
"execution_count": 11,
"source": [
"# cell 11\n",
"\n",
"training_data_file = open(\"test_data/training-dataset-with-header.csv\", 'rb')\n",
"s3_key = os.path.join(baseline_prefix, 'data', 'training-dataset-with-header.csv')\n",
"boto3.Session().resource('s3').Bucket(bucket).Object(s3_key).upload_fileobj(training_data_file)"
@@ -470,6 +479,7 @@
"execution_count": 12,
"source": [
"# cell 12\n",
"\n",
"from sagemaker.model_monitor import DefaultModelMonitor\n",
"from sagemaker.model_monitor.dataset_format import DatasetFormat\n",
"\n",
@@ -3961,6 +3971,7 @@
"execution_count": 13,
"source": [
"# cell 13\n",
"\n",
"s3_client = boto3.Session().client('s3')\n",
"result = s3_client.list_objects(Bucket=bucket, Prefix=baseline_results_prefix)\n",
"report_files = [report_file.get(\"Key\") for report_file in result.get('Contents')]\n",
@@ -3985,6 +3996,7 @@
"execution_count": 14,
"source": [
"# cell 14\n",
"\n",
"import pandas as pd\n",
"\n",
"baseline_job = my_default_monitor.latest_baselining_job\n",
@@ -4303,6 +4315,7 @@
"execution_count": 15,
"source": [
"# cell 15\n",
"\n",
"constraints_df = pd.json_normalize(baseline_job.suggested_constraints().body_dict[\"features\"])\n",
"constraints_df.head(10)"
],
@@ -4451,6 +4464,7 @@
"execution_count": 16,
"source": [
"# cell 16\n",
"\n",
"# First, copy over some test scripts to the S3 bucket so that they can be used for pre and post processing\n",
"boto3.Session().resource('s3').Bucket(bucket).Object(code_prefix+\"/preprocessor.py\").upload_file('preprocessor.py')\n",
"boto3.Session().resource('s3').Bucket(bucket).Object(code_prefix+\"/postprocessor.py\").upload_file('postprocessor.py')"
@@ -4470,6 +4484,7 @@
"execution_count": 17,
"source": [
"# cell 17\n",
"\n",
"from sagemaker.model_monitor import CronExpressionGenerator\n",
"from time import gmtime, strftime\n",
"\n",
@@ -4503,6 +4518,7 @@
"execution_count": 18,
"source": [
"# cell 18\n",
"\n",
"from threading import Thread\n",
"from time import sleep\n",
"import time\n",
@@ -4546,6 +4562,7 @@
"execution_count": 19,
"source": [
"# cell 19\n",
"\n",
"desc_schedule_result = my_default_monitor.describe_schedule()\n",
"print('Schedule status: {}'.format(desc_schedule_result['MonitoringScheduleStatus']))"
],
@@ -4575,6 +4592,7 @@
"execution_count": 20,
"source": [
"# cell 20\n",
"\n",
"mon_executions = my_default_monitor.list_executions()\n",
"print(\"We created a hourly schedule above and it will kick off executions ON the hour (plus 0 - 20 min buffer.\\nWe will have to wait till we hit the hour...\")\n",
"\n",
@@ -4722,6 +4740,7 @@
"execution_count": 21,
"source": [
"# cell 21\n",
"\n",
"latest_execution = mon_executions[-1] # latest execution's index is -1, second to last is -2 and so on..\n",
"time.sleep(60)\n",
"latest_execution.wait(logs=False)\n",
@@ -4750,6 +4769,7 @@
"execution_count": 22,
"source": [
"# cell 22\n",
"\n",
"report_uri=latest_execution.output.destination\n",
"print('Report Uri: {}'.format(report_uri))"
],
@@ -4776,6 +4796,7 @@
"execution_count": 23,
"source": [
"# cell 23\n",
"\n",
"from urllib.parse import urlparse\n",
"s3uri = urlparse(report_uri)\n",
"report_bucket = s3uri.netloc\n",
@@ -4824,6 +4845,7 @@
"execution_count": 24,
"source": [
"# cell 24\n",
"\n",
"violations = my_default_monitor.latest_monitoring_constraint_violations()\n",
"pd.set_option('display.max_colwidth', None)\n",
"constraints_df = pd.json_normalize(violations.body_dict[\"violations\"])\n",
@@ -4967,6 +4989,7 @@
"execution_count": null,
"source": [
"# cell 25\n",
"\n",
"#my_default_monitor.stop_monitoring_schedule()\n",
"#my_default_monitor.start_monitoring_schedule()"
],
@@ -4989,6 +5012,7 @@
"execution_count": null,
"source": [
"# cell 26\n",
"\n",
"my_default_monitor.delete_monitoring_schedule()\n",
"time.sleep(60) # actually wait for the deletion"
],
@@ -5000,6 +5024,7 @@
"execution_count": null,
"source": [
"# cell 27\n",
"\n",
"predictor.delete_endpoint()"
],
"outputs": [],
@@ -5012,6 +5037,7 @@
"execution_count": null,
"source": [
"# cell 28\n",
"\n",
"predictor.delete_model()"
],
"outputs": [],
@@ -5042,4 +5068,4 @@
},
"nbformat": 4,
"nbformat_minor": 4
}
}
22 changes: 17 additions & 5 deletions TaxiFare_Predict_Liner_Learner.ipynb
@@ -40,6 +40,7 @@
"outputs": [],
"source": [
"# cell 01\n",
"\n",
"!conda update pandas -y"
]
},
@@ -50,6 +51,7 @@
"outputs": [],
"source": [
"# cell 02\n",
"\n",
"import os\n",
"import boto3\n",
"import re\n",
@@ -64,6 +66,7 @@
"outputs": [],
"source": [
"# cell 03\n",
"\n",
"role = sagemaker.get_execution_role()\n",
"sess = sagemaker.Session()\n",
"region = boto3.Session().region_name\n",
@@ -128,6 +131,7 @@
],
"source": [
"# cell 04\n",
"\n",
"import boto3\n",
"FILE_TRAIN = \"nyc-taxi.csv\"\n",
"# s3 = boto3.client(\"s3\")\n",
@@ -179,6 +183,7 @@
],
"source": [
"# cell 05\n",
"\n",
"df.info()"
]
},
@@ -781,6 +786,7 @@
],
"source": [
"# cell 06\n",
"\n",
"# Frequency tables for each categorical feature\n",
"for column in df.select_dtypes(include=['object']).columns:\n",
" display(pd.crosstab(index=df[column], columns='% observations', normalize='columns'))\n",
@@ -835,6 +841,7 @@
],
"source": [
"# cell 07\n",
"\n",
"df = df.drop(['payment_type', 'store_and_fwd_flag'], axis=1)\n",
"df.info()"
]
@@ -875,6 +882,7 @@
],
"source": [
"# cell 08\n",
"\n",
"df['dropoff_datetime']= pd.to_datetime(df['dropoff_datetime'])\n",
"df['pickup_datetime']= pd.to_datetime(df['pickup_datetime'])\n",
"df['journey_time'] = (df['dropoff_datetime'] - df['pickup_datetime'])\n",
@@ -925,6 +933,7 @@
],
"source": [
"# cell 09\n",
"\n",
"df = df.drop(['dropoff_datetime', 'pickup_datetime'], axis=1)\n",
"df.info()"
]
@@ -973,6 +982,7 @@
],
"source": [
"# cell 10\n",
"\n",
"df = pd.get_dummies(df, dtype=float)\n",
"df.info()"
]
@@ -991,6 +1001,7 @@
"outputs": [],
"source": [
"# cell 11\n",
"\n",
"import numpy as np\n",
"\n",
"train_data, validation_data, test_data = np.split(df.sample(frac=1, random_state=1729), [int(0.7 * len(df)), int(0.9 * len(df))])\n",
@@ -1006,6 +1017,7 @@
"outputs": [],
"source": [
"# cell 12\n",
"\n",
"boto3.Session().resource('s3').Bucket(data_bucket).Object(os.path.join(data_prefix, 'train/train.csv')).upload_file('train.csv')\n",
"boto3.Session().resource('s3').Bucket(data_bucket).Object(os.path.join(data_prefix, 'validation/validation.csv')).upload_file('validation.csv')\n",
"boto3.Session().resource('s3').Bucket(data_bucket).Object(os.path.join(data_prefix, 'test/test.csv')).upload_file('test.csv')"
@@ -1038,6 +1050,7 @@
],
"source": [
"# cell 13\n",
"\n",
"# creating the inputs for the fit() function with the training and validation location\n",
"s3_train_data = f\"s3://{data_bucket}/{data_prefix}/train\"\n",
"print(f\"training files will be taken from: {s3_train_data}\")\n",
@@ -1096,6 +1109,7 @@
],
"source": [
"# cell 14\n",
"\n",
"# getting the linear learner image according to the region\n",
"from sagemaker.image_uris import retrieve\n",
"\n",
@@ -1119,7 +1133,6 @@
}
],
"source": [
"# cell 15\n",
"%%time\n",
"import boto3\n",
"import sagemaker\n",
@@ -3435,7 +3448,6 @@
}
],
"source": [
"# cell 16\n",
"%%time\n",
"linear.fit(inputs={\"train\": train_data, \"validation\": validation_data}, job_name=job_name)"
]
@@ -3466,7 +3478,6 @@
}
],
"source": [
"# cell 17\n",
"%%time\n",
"# creating the endpoint out of the trained model\n",
"linear_predictor = linear.deploy(initial_instance_count=1, instance_type=\"ml.c4.xlarge\")\n",
@@ -3496,6 +3507,7 @@
"outputs": [],
"source": [
"# cell 18\n",
"\n",
"# configure the predictor to accept to serialize csv input and parse the reposne as json\n",
"from sagemaker.serializers import CSVSerializer\n",
"from sagemaker.deserializers import JSONDeserializer\n",
@@ -3533,7 +3545,6 @@
}
],
"source": [
"# cell 19\n",
"%%time\n",
"import json\n",
"from itertools import islice\n",
@@ -3571,7 +3582,8 @@
"metadata": {},
"outputs": [],
"source": [
"# cell 20\n"
"# cell 20\n",
"\n"
]
}
],