diff --git a/.dockerignore b/.dockerignore
new file mode 100644
index 0000000..c853bc5
--- /dev/null
+++ b/.dockerignore
@@ -0,0 +1,18 @@
+exercise_notebooks/*
+*/env*
+*/venv*
+.circleci*
+packages/gradient_boosting_model
+*.env
+*.log
+.git
+.gitignore
+.dockerignore
+*.mypy_cache
+*.pytest_cache
+
+### Python ###
+
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
diff --git a/.gitignore b/.gitignore
index 894a44c..c5c8946 100644
--- a/.gitignore
+++ b/.gitignore
@@ -89,6 +89,7 @@ venv/
ENV/
env.bak/
venv.bak/
+.tox/
# Spyder project settings
.spyderproject
@@ -102,3 +103,25 @@ venv.bak/
# mypy
.mypy_cache/
+
+# pycharm
+.idea/
+
+# OSX
+.DS_Store
+
+# all logs
+logs/
+
+# training data
+packages/gradient_boosting_model/gradient_boosting_model/datasets/*.csv
+packages/gradient_boosting_model/gradient_boosting_model/datasets/*.txt
+packages/gradient_boosting_model/gradient_boosting_model/datasets/*.zip
+
+# trained models
+packages/gradient_boosting_model/gradient_boosting_model/trained_models/*.pkl
+*.h5
+
+# differential test artifacts
+packages/ml_api/differential_tests/expected_results/
+packages/ml_api/differential_tests/actual_results/
diff --git a/exercise_notebooks/.gitkeep b/exercise_notebooks/.gitkeep
new file mode 100644
index 0000000..e69de29
diff --git a/exercise_notebooks/assessing_model_results.ipynb b/exercise_notebooks/assessing_model_results.ipynb
new file mode 100644
index 0000000..5dc782d
--- /dev/null
+++ b/exercise_notebooks/assessing_model_results.ipynb
@@ -0,0 +1,262 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Setup\n",
+ "\n",
+    "Make sure your virtualenv (use the same one as for ml_api) is active, and that all of the packages imported below are installed.\n",
+ "\n",
+ "Make sure your database docker container is running."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import pandas as pd\n",
+ "import sys\n",
+ "from sqlalchemy import create_engine"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+    "# Add the project root to the sys.path list so that `packages` can be imported\n",
+    "# WINDOWS USERS: you may need to adjust the path separators\n",
+ "sys.path.append('../../')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "A reminder that SQLAlchemy DB URIs look like this:\n",
+    "`postgresql+psycopg2://myuser:mypassword@hackersdb.example.com:5432/mydatabase`\n",
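+    "\n",
+    "For example, the development configuration used later in this notebook resolves to `postgresql+psycopg2://user:password@0.0.0.0:6609/ml_api_dev`."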
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "['/Users/christophersamiullah/repos/testing-and-monitoring-ml-deployments/exercise_notebooks', '/Users/christophersamiullah/repos/advanced_ml_deployment_draft/packages/ml_api/differential_tests', '/Users/christophersamiullah/repos/advanced_ml_deployment_draft/packages/ml_api', '/usr/local/bin/python3.7', '/usr/local/Cellar/python/3.7.4/Frameworks/Python.framework/Versions/3.7/lib/python37.zip', '/usr/local/Cellar/python/3.7.4/Frameworks/Python.framework/Versions/3.7/lib/python3.7', '/usr/local/Cellar/python/3.7.4/Frameworks/Python.framework/Versions/3.7/lib/python3.7/lib-dynload', '', '/Users/christophersamiullah/repos/testing-and-monitoring-ml-deployments/packages/ml_api/env/lib/python3.7/site-packages', '/Users/christophersamiullah/repos/testing-and-monitoring-ml-deployments/packages/ml_api/env/lib/python3.7/site-packages/IPython/extensions', '/Users/christophersamiullah/.ipython', '../', '..', '../../']\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(sys.path)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from packages.ml_api.api.config import DevelopmentConfig"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "postgresql+psycopg2://user:password@0.0.0.0:6609/ml_api_dev\n"
+ ]
+ }
+ ],
+ "source": [
+ "db_uri = DevelopmentConfig.SQLALCHEMY_DATABASE_URI\n",
+ "print(db_uri)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "engine = create_engine(db_uri)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "lasso_model_df = pd.read_sql_table(\"regression_model_predictions\", con=engine)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 21,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "gradient_model_df = pd.read_sql_table(\"gradient_boosting_model_predictions\", con=engine)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 26,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "combined_df = lasso_model_df.merge(gradient_model_df, how='outer')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 27,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ " id user_id datetime_captured model_version \\\n",
+ "0 1 007 2019-12-21 10:15:22.064246+00:00 0.0.4444 \n",
+ "1 34 007 2019-12-21 10:23:00.036615+00:00 0.0.4444 \n",
+ "2 35 007 2019-12-21 11:07:31.731324+00:00 0.0.4444 \n",
+ "3 36 007 2019-12-21 11:10:33.730662+00:00 0.0.4444 \n",
+ "4 37 007 2019-12-21 11:10:34.408394+00:00 0.0.4444 \n",
+ "\n",
+ " inputs outputs \n",
+ "0 [{\"1stFlrSF\": 896, \"2ndFlrSF\": 0, \"3SsnPorch\":... [105437.16948684008] \n",
+ "1 [{\"1stFlrSF\": 896, \"2ndFlrSF\": 0, \"3SsnPorch\":... [105437.16948684008] \n",
+ "2 [{\"1stFlrSF\": 896, \"2ndFlrSF\": 0, \"3SsnPorch\":... [105437.16948684008] \n",
+ "3 [{\"1stFlrSF\": 896, \"2ndFlrSF\": 0, \"3SsnPorch\":... [105437.16948684008] \n",
+ "4 [{\"1stFlrSF\": 896, \"2ndFlrSF\": 0, \"3SsnPorch\":... [105437.16948684008] "
+ ]
+ },
+ "execution_count": 27,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "combined_df.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.7.4"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/exercise_notebooks/docker_exercise/Dockerfile b/exercise_notebooks/docker_exercise/Dockerfile
new file mode 100644
index 0000000..e5f5365
--- /dev/null
+++ b/exercise_notebooks/docker_exercise/Dockerfile
@@ -0,0 +1,26 @@
+FROM python:3.7-alpine
+WORKDIR /code
+
+# Set env vars required by Flask
+ENV FLASK_APP app.py
+ENV FLASK_RUN_HOST 0.0.0.0
+
+# Install gcc so Python packages such as MarkupSafe
+# and SQLAlchemy can compile speedups.
+RUN apk add --no-cache gcc musl-dev linux-headers
+
+# copy local requirements.txt into container
+# doing this separately from the main copy
+# operation makes more efficient use of docker
+# layer caching.
+COPY requirements.txt requirements.txt
+
+# install requirements inside the container
+RUN pip install -r requirements.txt
+
+# Copy the current directory (.) on the host
+# into the working directory (.) in the image
+COPY . .
+
+# Set the default command for the container to flask run
+CMD ["flask", "run"]
diff --git a/exercise_notebooks/docker_exercise/app.py b/exercise_notebooks/docker_exercise/app.py
new file mode 100644
index 0000000..fadd836
--- /dev/null
+++ b/exercise_notebooks/docker_exercise/app.py
@@ -0,0 +1,25 @@
+import time
+
+import redis
+from flask import Flask
+
+app = Flask(__name__)
+cache = redis.Redis(host='redis', port=6379)
+
+
+def get_hit_count():
+ retries = 5
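+    # Retry the increment a handful of times so this service tolerates
+    # the redis container starting up slightly later than the web app.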
+ while True:
+ try:
+ return cache.incr('hits')
+ except redis.exceptions.ConnectionError as exc:
+ if retries == 0:
+                raise
+ retries -= 1
+ time.sleep(0.5)
+
+
+@app.route('/')
+def hello():
+ count = get_hit_count()
+ return f'Hello World! I have been seen {count} times.\n'
diff --git a/exercise_notebooks/docker_exercise/docker-compose.yml b/exercise_notebooks/docker_exercise/docker-compose.yml
new file mode 100644
index 0000000..00d2c37
--- /dev/null
+++ b/exercise_notebooks/docker_exercise/docker-compose.yml
@@ -0,0 +1,8 @@
+version: '3'
+services:
+ web:
+ build: .
+ ports:
+ - "5000:5000"
+ redis:
+ image: "redis:alpine"
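+    # Compose places both services on a shared default network, so the
+    # hostname 'redis' used in app.py (redis.Redis(host='redis'))
+    # resolves to this service.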
diff --git a/exercise_notebooks/docker_exercise/requirements.txt b/exercise_notebooks/docker_exercise/requirements.txt
new file mode 100644
index 0000000..c1f684c
--- /dev/null
+++ b/exercise_notebooks/docker_exercise/requirements.txt
@@ -0,0 +1,2 @@
+flask>=1.1.1,<1.2.0
+redis>=3.3.11,<3.4
diff --git a/exercise_notebooks/elk_exercise/Dockerfile b/exercise_notebooks/elk_exercise/Dockerfile
new file mode 100644
index 0000000..86647f9
--- /dev/null
+++ b/exercise_notebooks/elk_exercise/Dockerfile
@@ -0,0 +1,23 @@
+FROM python:3.7-alpine
+WORKDIR /application
+
+COPY ./requirements.txt requirements.txt
+RUN apk add --no-cache \
+ gcc \
+ libc-dev \
+ linux-headers \
+ bash; \
+ pip install -r requirements.txt;
+
+COPY . /application
+
+
+EXPOSE 5000
+VOLUME /application
+CMD gunicorn --bind 0.0.0.0:5000 \
+ --workers=1 \
+ --log-config gunicorn_logging.conf \
+ --log-level=DEBUG \
+ --access-logfile=- \
+ --error-logfile=- \
+ application:application
diff --git a/exercise_notebooks/elk_exercise/app/__init__.py b/exercise_notebooks/elk_exercise/app/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/exercise_notebooks/elk_exercise/app/flask_app.py b/exercise_notebooks/elk_exercise/app/flask_app.py
new file mode 100644
index 0000000..251a2f1
--- /dev/null
+++ b/exercise_notebooks/elk_exercise/app/flask_app.py
@@ -0,0 +1,18 @@
+import logging
+
+from flask import Flask
+
+gunicorn_error_logger = logging.getLogger('gunicorn.error')
+gunicorn_error_logger.setLevel(logging.DEBUG)
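+# Handlers for the 'gunicorn.error' logger are attached in
+# gunicorn_logging.conf, which ships its records as JSON to Logstash
+# over TCP on port 5001.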
+
+
+def index():
+ gunicorn_error_logger.info('hello')
+ return 'home'
+
+
+def create_app():
+ main_app = Flask(__name__)
+ main_app.add_url_rule('/', 'index', index)
+
+ return main_app
diff --git a/exercise_notebooks/elk_exercise/application.py b/exercise_notebooks/elk_exercise/application.py
new file mode 100644
index 0000000..e03e2a0
--- /dev/null
+++ b/exercise_notebooks/elk_exercise/application.py
@@ -0,0 +1,7 @@
+from app.flask_app import create_app
+
+
+application = create_app()
+
+if __name__ == '__main__':
+ application.run()
diff --git a/exercise_notebooks/elk_exercise/docker-compose.yml b/exercise_notebooks/elk_exercise/docker-compose.yml
new file mode 100644
index 0000000..c110a75
--- /dev/null
+++ b/exercise_notebooks/elk_exercise/docker-compose.yml
@@ -0,0 +1,91 @@
+version: '3.2'
+
+services:
+ # The environment variable "ELK_VERSION" is used throughout this file to
+ # specify the version of the images to run. The default is set in the
+ # '.env' file in this folder. It can be overridden with any normal
+ # technique for setting environment variables, for example:
+ #
+ # ELK_VERSION=6.0.0-beta1 docker-compose up
+ #
+ # REF: https://docs.docker.com/compose/compose-file/#variable-substitution
+ webapp:
+ build: .
+ container_name: webapp
+ expose:
+ - 5000
+ ports:
+ - 5000:5000
+ links:
+ - logstash
+ networks:
+ - elk
+ depends_on:
+ - logstash
+ - kibana
+ - elasticsearch
+ volumes:
+ - ./:/application
+ elasticsearch:
+ image: docker.elastic.co/elasticsearch/elasticsearch:${ELK_VERSION}
+ volumes:
+ - type: bind
+ source: ./elasticsearch/config/elasticsearch.yml
+ target: /usr/share/elasticsearch/config/elasticsearch.yml
+ read_only: true
+ - type: volume
+ source: elasticsearch
+ target: /usr/share/elasticsearch/data
+ ports:
+ - "9200:9200"
+ - "9300:9300"
+ environment:
+ ES_JAVA_OPTS: "-Xmx256m -Xms256m"
+ ELASTIC_PASSWORD: changeme
+ # Use single node discovery in order to disable production mode and avoid bootstrap checks
+ # see https://www.elastic.co/guide/en/elasticsearch/reference/current/bootstrap-checks.html
+ discovery.type: single-node
+ networks:
+ - elk
+
+ logstash:
+ image: docker.elastic.co/logstash/logstash:${ELK_VERSION}
+ volumes:
+ - type: bind
+ source: ./logstash/config/logstash.yml
+ target: /usr/share/logstash/config/logstash.yml
+ read_only: true
+ - type: bind
+ source: ./logstash/pipeline
+ target: /usr/share/logstash/pipeline
+ read_only: true
+ ports:
+ - "5001:5001"
+ - "9600:9600"
+ environment:
+ LS_JAVA_OPTS: "-Xmx256m -Xms256m"
+ networks:
+ - elk
+ depends_on:
+ - elasticsearch
+
+ kibana:
+ image: docker.elastic.co/kibana/kibana:${ELK_VERSION}
+ volumes:
+ - type: bind
+ source: ./kibana/config/kibana.yml
+ target: /usr/share/kibana/config/kibana.yml
+ read_only: true
+ ports:
+ - "5601:5601"
+ networks:
+ - elk
+ depends_on:
+ - elasticsearch
+
+networks:
+ elk:
+ driver: bridge
+
+volumes:
+ elasticsearch:
\ No newline at end of file
diff --git a/exercise_notebooks/elk_exercise/elasticsearch/config/elasticsearch.yml b/exercise_notebooks/elk_exercise/elasticsearch/config/elasticsearch.yml
new file mode 100644
index 0000000..cbed5c3
--- /dev/null
+++ b/exercise_notebooks/elk_exercise/elasticsearch/config/elasticsearch.yml
@@ -0,0 +1,11 @@
+---
+## Default Elasticsearch configuration from Elasticsearch base image.
+## https://github.com/elastic/elasticsearch/blob/master/distribution/docker/src/docker/config/elasticsearch.yml
+cluster.name: "docker-cluster"
+network.host: 0.0.0.0
+
+## X-Pack settings
+## see https://www.elastic.co/guide/en/elasticsearch/reference/current/setup-xpack.html
+xpack.license.self_generated.type: basic
+xpack.security.enabled: true
+xpack.monitoring.collection.enabled: true
diff --git a/exercise_notebooks/elk_exercise/gunicorn_logging.conf b/exercise_notebooks/elk_exercise/gunicorn_logging.conf
new file mode 100644
index 0000000..7ec8e8c
--- /dev/null
+++ b/exercise_notebooks/elk_exercise/gunicorn_logging.conf
@@ -0,0 +1,46 @@
+[loggers]
+keys=root, logstash.error, logstash.access
+
+[handlers]
+keys=console, logstash
+
+[formatters]
+keys=generic, access, json
+
+[logger_root]
+level=INFO
+handlers=console
+
+[logger_logstash.error]
+level=INFO
+handlers=logstash
+propagate=1
+qualname=gunicorn.error
+
+[logger_logstash.access]
+level=INFO
+handlers=logstash
+propagate=0
+qualname=gunicorn.access
+
+[handler_console]
+class=StreamHandler
+formatter=generic
+args=(sys.stdout, )
+
+[handler_logstash]
+class=logstash.TCPLogstashHandler
+formatter=json
+args=('logstash', 5001)
+
+[formatter_generic]
+format=%(asctime)s [%(process)d] [%(levelname)s] %(message)s
+datefmt=%Y-%m-%d %H:%M:%S
+class=logging.Formatter
+
+[formatter_access]
+format=%(message)s
+class=logging.Formatter
+
+[formatter_json]
+class=pythonjsonlogger.jsonlogger.JsonFormatter
\ No newline at end of file
diff --git a/exercise_notebooks/elk_exercise/kibana/config/kibana.yml b/exercise_notebooks/elk_exercise/kibana/config/kibana.yml
new file mode 100644
index 0000000..93380e9
--- /dev/null
+++ b/exercise_notebooks/elk_exercise/kibana/config/kibana.yml
@@ -0,0 +1,13 @@
+---
+## Default Kibana configuration from Kibana base image.
+## https://github.com/elastic/kibana/blob/master/src/dev/build/tasks/os_packages/docker_generator/templates/kibana_yml.template.js
+#
+server.name: kibana
+server.host: "0"
+elasticsearch.hosts: [ "http://elasticsearch:9200" ]
+xpack.monitoring.ui.container.elasticsearch.enabled: true
+
+## X-Pack security credentials
+#
+elasticsearch.username: elastic
+elasticsearch.password: changeme
\ No newline at end of file
diff --git a/exercise_notebooks/elk_exercise/logstash/config/logstash.yml b/exercise_notebooks/elk_exercise/logstash/config/logstash.yml
new file mode 100644
index 0000000..a48c35f
--- /dev/null
+++ b/exercise_notebooks/elk_exercise/logstash/config/logstash.yml
@@ -0,0 +1,12 @@
+---
+## Default Logstash configuration from Logstash base image.
+## https://github.com/elastic/logstash/blob/master/docker/data/logstash/config/logstash-full.yml
+#
+http.host: "0.0.0.0"
+xpack.monitoring.elasticsearch.hosts: [ "http://elasticsearch:9200" ]
+
+## X-Pack security credentials
+#
+xpack.monitoring.enabled: true
+xpack.monitoring.elasticsearch.username: elastic
+xpack.monitoring.elasticsearch.password: changeme
diff --git a/exercise_notebooks/elk_exercise/logstash/pipeline/logstash.conf b/exercise_notebooks/elk_exercise/logstash/pipeline/logstash.conf
new file mode 100644
index 0000000..7c273f0
--- /dev/null
+++ b/exercise_notebooks/elk_exercise/logstash/pipeline/logstash.conf
@@ -0,0 +1,17 @@
+input {
+ tcp {
+ port => 5001
+ tags => ["webapp_logs"]
+ type => "webapp_logs"
+ codec => json
+ }
+}
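+# The TCP input above receives the JSON log records emitted by the
+# logstash.TCPLogstashHandler configured in gunicorn_logging.conf.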
+
+output {
+ elasticsearch {
+ hosts => "elasticsearch:9200"
+ user => "elastic"
+ password => "changeme"
+ index => "webapp_logs-%{+YYYY.MM.dd}"
+ }
+}
\ No newline at end of file
diff --git a/exercise_notebooks/elk_exercise/requirements.txt b/exercise_notebooks/elk_exercise/requirements.txt
new file mode 100644
index 0000000..6607dd0
--- /dev/null
+++ b/exercise_notebooks/elk_exercise/requirements.txt
@@ -0,0 +1,5 @@
+Flask>=1.1.1,<1.2.0
+python3-logstash>=0.4.80,<0.5.0
+python-json-logger>=0.1.11,<0.2.0
+gunicorn>=20.0.4,<20.1.0
+
diff --git a/exercise_notebooks/prometheus_exercise/Dockerfile b/exercise_notebooks/prometheus_exercise/Dockerfile
new file mode 100644
index 0000000..4fc5705
--- /dev/null
+++ b/exercise_notebooks/prometheus_exercise/Dockerfile
@@ -0,0 +1,17 @@
+FROM python:3.7-alpine
+WORKDIR /application
+
+COPY ./requirements.txt requirements.txt
+RUN apk add --no-cache \
+ gcc \
+ libc-dev \
+ linux-headers \
+ bash; \
+ pip install -r requirements.txt;
+
+COPY . /application
+
+
+EXPOSE 5000
+VOLUME /application
+CMD gunicorn --workers=1 --bind 0.0.0.0:5000 application:application
diff --git a/exercise_notebooks/prometheus_exercise/app/__init__.py b/exercise_notebooks/prometheus_exercise/app/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/exercise_notebooks/prometheus_exercise/app/flask_app.py b/exercise_notebooks/prometheus_exercise/app/flask_app.py
new file mode 100644
index 0000000..9d2357d
--- /dev/null
+++ b/exercise_notebooks/prometheus_exercise/app/flask_app.py
@@ -0,0 +1,45 @@
+import prometheus_client
+from flask import Flask
+from werkzeug.middleware.dispatcher import DispatcherMiddleware
+from app.helpers.middleware import setup_metrics
+
+
+def index():
+ return 'home'
+
+
+def cpu():
+ # For older machines, you may want to lower
+ # this range to prevent timeouts.
+ for i in range(10000):
+ i**i
+
+ return 'cpu intensive operation complete'
+
+
+def memory():
+ d = {}
+ # For older machines, you may want to lower
+ # this range to prevent timeouts.
+ for i in range(10000000):
+ i = str(i)
+ i += "xyz"
+ d[i] = i
+
+ return 'memory intensive operation complete'
+
+
+def create_app():
+ main_app = Flask(__name__)
+ main_app.add_url_rule('/', 'index', index)
+ main_app.add_url_rule('/cpu', 'cpu', cpu)
+ main_app.add_url_rule('/memory', 'memory', memory)
+ setup_metrics(main_app)
+
+ # Add prometheus wsgi middleware to route /metrics requests
+ app = DispatcherMiddleware(
+ app=main_app.wsgi_app,
+ mounts={'/metrics': prometheus_client.make_wsgi_app()}
+ )
+
+ return app
diff --git a/exercise_notebooks/prometheus_exercise/app/helpers/__init__.py b/exercise_notebooks/prometheus_exercise/app/helpers/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/exercise_notebooks/prometheus_exercise/app/helpers/middleware.py b/exercise_notebooks/prometheus_exercise/app/helpers/middleware.py
new file mode 100644
index 0000000..f547ee3
--- /dev/null
+++ b/exercise_notebooks/prometheus_exercise/app/helpers/middleware.py
@@ -0,0 +1,58 @@
+from flask import request, Flask
+from flask.wrappers import Response
+from prometheus_client import Counter, Histogram
+import time
+
+
+# Counter and Histogram are examples of default metrics
+# available from the prometheus Python client.
+REQUEST_COUNT = Counter(
+ name='http_request_count',
+ documentation='App Request Count',
+ labelnames=['app_name', 'method', 'endpoint', 'http_status']
+)
+REQUEST_LATENCY = Histogram(
+ name='http_request_latency_seconds',
+ documentation='Request latency',
+ labelnames=['app_name', 'endpoint']
+)
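+# Both metrics are exposed on /metrics by the prometheus_client WSGI app
+# mounted in app/flask_app.py, and scraped by the 'webapp' job defined in
+# config/prometheus/prometheus.yml.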
+
+
+def start_timer() -> None:
+ """Get start time of a request."""
+ request._prometheus_metrics_request_start_time = time.time()
+
+
+def stop_timer(response: Response) -> Response:
+    """Stop the request timer and record the request latency."""
+ request_latency = time.time() - request._prometheus_metrics_request_start_time
+ REQUEST_LATENCY.labels(
+ app_name='webapp',
+ endpoint=request.path).observe(request_latency)
+ return response
+
+
+def record_request_data(response: Response) -> Response:
+ """Capture request data.
+
+ Uses the flask request object to extract information such as
+ the HTTP request method, endpoint and HTTP status.
+ """
+ REQUEST_COUNT.labels(
+ app_name='webapp',
+ method=request.method,
+ endpoint=request.path,
+ http_status=response.status_code).inc()
+ return response
+
+
+def setup_metrics(app: Flask) -> None:
+    """Set up Prometheus metrics.
+
+ This function uses the flask before_request
+ and after_request hooks to capture metrics
+ with each HTTP request to the application.
+ """
+ app.before_request(start_timer)
+ app.after_request(record_request_data)
+ app.after_request(stop_timer)
diff --git a/exercise_notebooks/prometheus_exercise/application.py b/exercise_notebooks/prometheus_exercise/application.py
new file mode 100644
index 0000000..e03e2a0
--- /dev/null
+++ b/exercise_notebooks/prometheus_exercise/application.py
@@ -0,0 +1,7 @@
+from app.flask_app import create_app
+
+
+application = create_app()
+
+if __name__ == '__main__':
+ application.run()
diff --git a/exercise_notebooks/prometheus_exercise/config/grafana/basic_cadvisor_dashboard.json b/exercise_notebooks/prometheus_exercise/config/grafana/basic_cadvisor_dashboard.json
new file mode 100644
index 0000000..b621f02
--- /dev/null
+++ b/exercise_notebooks/prometheus_exercise/config/grafana/basic_cadvisor_dashboard.json
@@ -0,0 +1,605 @@
+{
+ "annotations": {
+ "list": [
+ {
+ "builtIn": 1,
+ "datasource": "-- Grafana --",
+ "enable": true,
+ "hide": true,
+ "iconColor": "rgba(0, 211, 255, 1)",
+ "name": "Annotations & Alerts",
+ "type": "dashboard"
+ }
+ ]
+ },
+ "description": "Docker monitoring with Prometheus and cAdvisor with node selection",
+ "editable": true,
+ "gnetId": 8321,
+ "graphTooltip": 1,
+ "id": 6,
+ "iteration": 1578215128428,
+ "links": [],
+ "panels": [
+ {
+ "cacheTimeout": null,
+ "colorBackground": false,
+ "colorValue": false,
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "datasource": "Prometheus",
+ "editable": true,
+ "error": false,
+ "format": "none",
+ "gauge": {
+ "maxValue": 100,
+ "minValue": 0,
+ "show": false,
+ "thresholdLabels": false,
+ "thresholdMarkers": true
+ },
+ "gridPos": {
+ "h": 3,
+ "w": 8,
+ "x": 0,
+ "y": 0
+ },
+ "height": "20",
+ "id": 7,
+ "interval": null,
+ "isNew": true,
+ "links": [],
+ "mappingType": 1,
+ "mappingTypes": [
+ {
+ "name": "value to text",
+ "value": 1
+ },
+ {
+ "name": "range to text",
+ "value": 2
+ }
+ ],
+ "maxDataPoints": 100,
+ "nullPointMode": "connected",
+ "nullText": null,
+ "options": {},
+ "postfix": "",
+ "postfixFontSize": "50%",
+ "prefix": "",
+ "prefixFontSize": "50%",
+ "rangeMaps": [
+ {
+ "from": "null",
+ "text": "N/A",
+ "to": "null"
+ }
+ ],
+ "sparkline": {
+ "fillColor": "rgba(31, 118, 189, 0.18)",
+ "full": false,
+ "lineColor": "rgb(31, 120, 193)",
+ "show": false
+ },
+ "tableColumn": "",
+ "targets": [
+ {
+ "expr": "count(container_last_seen{instance=~\"$node:$port\",job=~\"$job\",image!=\"\"})",
+ "intervalFactor": 2,
+ "legendFormat": "",
+ "metric": "container_last_seen",
+ "refId": "A",
+ "step": 240
+ }
+ ],
+ "thresholds": "",
+ "title": "Running containers",
+ "transparent": true,
+ "type": "singlestat",
+ "valueFontSize": "80%",
+ "valueMaps": [
+ {
+ "op": "=",
+ "text": "N/A",
+ "value": "null"
+ }
+ ],
+ "valueName": "avg"
+ },
+ {
+ "cacheTimeout": null,
+ "colorBackground": false,
+ "colorValue": false,
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "datasource": "Prometheus",
+ "editable": true,
+ "error": false,
+ "format": "mbytes",
+ "gauge": {
+ "maxValue": 100,
+ "minValue": 0,
+ "show": false,
+ "thresholdLabels": false,
+ "thresholdMarkers": true
+ },
+ "gridPos": {
+ "h": 3,
+ "w": 8,
+ "x": 8,
+ "y": 0
+ },
+ "height": "20",
+ "id": 5,
+ "interval": null,
+ "isNew": true,
+ "links": [],
+ "mappingType": 1,
+ "mappingTypes": [
+ {
+ "name": "value to text",
+ "value": 1
+ },
+ {
+ "name": "range to text",
+ "value": 2
+ }
+ ],
+ "maxDataPoints": 100,
+ "nullPointMode": "connected",
+ "nullText": null,
+ "options": {},
+ "postfix": "",
+ "postfixFontSize": "50%",
+ "prefix": "",
+ "prefixFontSize": "50%",
+ "rangeMaps": [
+ {
+ "from": "null",
+ "text": "N/A",
+ "to": "null"
+ }
+ ],
+ "sparkline": {
+ "fillColor": "rgba(31, 118, 189, 0.18)",
+ "full": false,
+ "lineColor": "rgb(31, 120, 193)",
+ "show": false
+ },
+ "tableColumn": "",
+ "targets": [
+ {
+ "expr": "sum(container_memory_usage_bytes{instance=~\"$node:$port\",job=~\"$job\",image!=\"\"})/1024/1024",
+ "intervalFactor": 2,
+ "legendFormat": "",
+ "metric": "container_memory_usage_bytes",
+ "refId": "A",
+ "step": 240
+ }
+ ],
+ "thresholds": "",
+ "title": "Total Memory Usage",
+ "transparent": true,
+ "type": "singlestat",
+ "valueFontSize": "80%",
+ "valueMaps": [
+ {
+ "op": "=",
+ "text": "N/A",
+ "value": "null"
+ }
+ ],
+ "valueName": "current"
+ },
+ {
+ "cacheTimeout": null,
+ "colorBackground": false,
+ "colorValue": false,
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "datasource": "Prometheus",
+ "editable": true,
+ "error": false,
+ "format": "percent",
+ "gauge": {
+ "maxValue": 100,
+ "minValue": 0,
+ "show": false,
+ "thresholdLabels": false,
+ "thresholdMarkers": true
+ },
+ "gridPos": {
+ "h": 3,
+ "w": 8,
+ "x": 16,
+ "y": 0
+ },
+ "height": "20",
+ "id": 6,
+ "interval": null,
+ "isNew": true,
+ "links": [],
+ "mappingType": 1,
+ "mappingTypes": [
+ {
+ "name": "value to text",
+ "value": 1
+ },
+ {
+ "name": "range to text",
+ "value": 2
+ }
+ ],
+ "maxDataPoints": 100,
+ "nullPointMode": "connected",
+ "nullText": null,
+ "options": {},
+ "postfix": "",
+ "postfixFontSize": "50%",
+ "prefix": "",
+ "prefixFontSize": "50%",
+ "rangeMaps": [
+ {
+ "from": "null",
+ "text": "N/A",
+ "to": "null"
+ }
+ ],
+ "sparkline": {
+ "fillColor": "rgba(31, 118, 189, 0.18)",
+ "full": false,
+ "lineColor": "rgb(31, 120, 193)",
+ "show": false
+ },
+ "tableColumn": "",
+ "targets": [
+ {
+ "expr": "sum(rate(container_cpu_user_seconds_total{instance=~\"$node:$port\",job=~\"$job\",image!=\"\"}[5m]) * 100)",
+ "intervalFactor": 2,
+ "legendFormat": "",
+ "metric": "container_memory_usage_bytes",
+ "refId": "A",
+ "step": 240
+ }
+ ],
+ "thresholds": "",
+ "title": "Total CPU Usage",
+ "transparent": true,
+ "type": "singlestat",
+ "valueFontSize": "80%",
+ "valueMaps": [
+ {
+ "op": "=",
+ "text": "N/A",
+ "value": "null"
+ }
+ ],
+ "valueName": "current"
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "Prometheus",
+ "decimals": 2,
+ "editable": true,
+ "error": false,
+ "fill": 1,
+ "fillGradient": 0,
+ "grid": {},
+ "gridPos": {
+ "h": 7,
+ "w": 24,
+ "x": 0,
+ "y": 3
+ },
+ "hiddenSeries": false,
+ "id": 2,
+ "isNew": true,
+ "legend": {
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "max": false,
+ "min": false,
+ "rightSide": true,
+ "show": true,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 2,
+ "links": [],
+ "nullPointMode": "connected",
+ "options": {
+ "dataLinks": []
+ },
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "rate(container_cpu_user_seconds_total{instance=~\"$node:$port\",job=~\"$job\",image!=\"\"}[5m]) * 100",
+ "intervalFactor": 2,
+ "legendFormat": "{{name}}",
+ "metric": "cpu",
+ "refId": "A",
+ "step": 10
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "CPU Usage",
+ "tooltip": {
+ "msResolution": false,
+ "shared": true,
+ "sort": 0,
+ "value_type": "cumulative"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "percent",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "Prometheus",
+ "decimals": 2,
+ "editable": true,
+ "error": false,
+ "fill": 1,
+ "fillGradient": 0,
+ "grid": {},
+ "gridPos": {
+ "h": 7,
+ "w": 24,
+ "x": 0,
+ "y": 10
+ },
+ "hiddenSeries": false,
+ "id": 1,
+ "isNew": true,
+ "legend": {
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "max": false,
+ "min": false,
+ "rightSide": true,
+ "show": true,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 2,
+ "links": [],
+ "nullPointMode": "connected",
+ "options": {
+ "dataLinks": []
+ },
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "container_memory_usage_bytes{instance=~\"$node:$port\",job=~\"$job\",image!=\"\"}",
+ "hide": false,
+ "intervalFactor": 2,
+ "legendFormat": "{{name}}",
+ "metric": "container_memory_usage_bytes",
+ "refId": "A",
+ "step": 10
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "Memory Usage",
+ "tooltip": {
+ "msResolution": false,
+ "shared": true,
+ "sort": 0,
+ "value_type": "cumulative"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "bytes",
+ "label": "",
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ }
+ ],
+ "refresh": "5s",
+ "schemaVersion": 21,
+ "style": "dark",
+ "tags": [
+ "docker"
+ ],
+ "templating": {
+ "list": [
+ {
+ "allValue": null,
+ "current": {
+ "text": "cadvisor",
+ "value": "cadvisor"
+ },
+ "datasource": "Prometheus",
+ "definition": "",
+ "hide": 0,
+ "includeAll": false,
+ "label": "Job",
+ "multi": false,
+ "name": "job",
+ "options": [],
+ "query": "label_values(container_cpu_user_seconds_total, job)",
+ "refresh": 1,
+ "regex": "",
+ "skipUrlSync": false,
+ "sort": 1,
+ "tagValuesQuery": "",
+ "tags": [],
+ "tagsQuery": "",
+ "type": "query",
+ "useTags": false
+ },
+ {
+ "allValue": null,
+ "current": {
+ "text": "cadvisor",
+ "value": "cadvisor"
+ },
+ "datasource": "Prometheus",
+ "definition": "",
+ "hide": 0,
+ "includeAll": false,
+ "label": "Host:",
+ "multi": false,
+ "name": "node",
+ "options": [],
+ "query": "label_values(container_cpu_user_seconds_total{job=~\"$job\"}, instance)",
+ "refresh": 1,
+ "regex": "/([^:]+):.*/",
+ "skipUrlSync": false,
+ "sort": 1,
+ "tagValuesQuery": "",
+ "tags": [],
+ "tagsQuery": "",
+ "type": "query",
+ "useTags": false
+ },
+ {
+ "allValue": null,
+ "current": {
+ "text": "8080",
+ "value": "8080"
+ },
+ "datasource": "Prometheus",
+ "definition": "",
+ "hide": 0,
+ "includeAll": false,
+ "label": "Port",
+ "multi": false,
+ "name": "port",
+ "options": [],
+ "query": "label_values(container_cpu_user_seconds_total{instance=~\"$node:(.*)\"}, instance)",
+ "refresh": 1,
+ "regex": "/[^:]+:(.*)/",
+ "skipUrlSync": false,
+ "sort": 3,
+ "tagValuesQuery": "",
+ "tags": [],
+ "tagsQuery": "",
+ "type": "query",
+ "useTags": false
+ }
+ ]
+ },
+ "time": {
+ "from": "now-5m",
+ "to": "now"
+ },
+ "timepicker": {
+ "refresh_intervals": [
+ "5s",
+ "10s",
+ "30s",
+ "1m",
+ "5m",
+ "15m",
+ "30m",
+ "1h",
+ "2h",
+ "1d"
+ ],
+ "time_options": [
+ "5m",
+ "15m",
+ "1h",
+ "6h",
+ "12h",
+ "24h",
+ "2d",
+ "7d",
+ "30d"
+ ]
+ },
+ "timezone": "browser",
+ "title": "Docker monitoring with node selection",
+ "uid": "pHUTSjLZk",
+ "version": 2
+}
\ No newline at end of file
diff --git a/exercise_notebooks/prometheus_exercise/config/grafana/grafana_flask_basic_dashboard.json b/exercise_notebooks/prometheus_exercise/config/grafana/grafana_flask_basic_dashboard.json
new file mode 100644
index 0000000..b5a52ca
--- /dev/null
+++ b/exercise_notebooks/prometheus_exercise/config/grafana/grafana_flask_basic_dashboard.json
@@ -0,0 +1,224 @@
+{
+ "annotations": {
+ "list": [
+ {
+ "builtIn": 1,
+ "datasource": "-- Grafana --",
+ "enable": true,
+ "hide": true,
+ "iconColor": "rgba(0, 211, 255, 1)",
+ "name": "Annotations & Alerts",
+ "type": "dashboard"
+ }
+ ]
+ },
+ "editable": true,
+ "gnetId": null,
+ "graphTooltip": 0,
+ "id": 1,
+ "links": [],
+ "panels": [
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "Prometheus",
+ "fill": 1,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 9,
+ "w": 12,
+ "x": 0,
+ "y": 0
+ },
+ "hiddenSeries": false,
+ "id": 2,
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "nullPointMode": "null",
+ "options": {
+ "dataLinks": []
+ },
+ "percentage": false,
+ "pointradius": 2,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "rate(http_request_count_total{job=\"webapp\"}[1m])",
+ "legendFormat": "{{app_name}} {{endpoint}} {{http_status}}",
+ "refId": "A"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "Requests Rate",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "Prometheus",
+ "fill": 1,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 9,
+ "w": 12,
+ "x": 12,
+ "y": 0
+ },
+ "hiddenSeries": false,
+ "id": 3,
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "nullPointMode": "null",
+ "options": {
+ "dataLinks": []
+ },
+ "percentage": false,
+ "pointradius": 2,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "rate(http_request_latency_seconds_sum{job=\"webapp\"}[1m]) / rate(http_request_latency_seconds_count{job=\"webapp\"}[1m])",
+ "legendFormat": "{{endpoint}} (seconds)",
+ "refId": "B"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "Latency",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ }
+ ],
+ "schemaVersion": 21,
+ "style": "dark",
+ "tags": [],
+ "templating": {
+ "list": []
+ },
+ "time": {
+ "from": "now-5m",
+ "to": "now"
+ },
+ "timepicker": {
+ "refresh_intervals": [
+ "5s",
+ "10s",
+ "30s",
+ "1m",
+ "5m",
+ "15m",
+ "30m",
+ "1h",
+ "2h",
+ "1d"
+ ]
+ },
+ "timezone": "",
+ "title": "Really Simple Flask Dashboard",
+ "uid": "q8vgEpLZk",
+ "version": 4
+}
\ No newline at end of file
diff --git a/exercise_notebooks/prometheus_exercise/config/prometheus/prometheus.yml b/exercise_notebooks/prometheus_exercise/config/prometheus/prometheus.yml
new file mode 100644
index 0000000..19d7bd8
--- /dev/null
+++ b/exercise_notebooks/prometheus_exercise/config/prometheus/prometheus.yml
@@ -0,0 +1,42 @@
+# my global config
+global:
+ scrape_interval: 15s # By default, scrape targets every 15 seconds.
+  evaluation_interval: 15s # Evaluate rules every 15 seconds.
+ # scrape_timeout is set to the global default (10s).
+
+ # Attach these labels to any time series or alerts when communicating with
+ # external systems (federation, remote storage, Alertmanager).
+ external_labels:
+ monitor: 'my-project'
+
+# A scrape configuration containing exactly one endpoint to scrape:
+# Here it's Prometheus itself.
+scrape_configs:
+  # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
+ - job_name: 'prometheus'
+
+ # Override the global default and scrape targets from this job every 5 seconds.
+ scrape_interval: 5s
+
+ # metrics_path defaults to '/metrics'
+ # scheme defaults to 'http'.
+
+ static_configs:
+ - targets: ['prometheus:9090']
+ - job_name: 'webapp'
+
+ # Override the global default and scrape targets from this job every 5 seconds.
+ scrape_interval: 5s
+
+ # metrics_path defaults to '/metrics'
+ # scheme defaults to 'http'.
+ static_configs:
+ - targets: ['webapp:5000']
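+    # The webapp serves /metrics via the prometheus_client WSGI app
+    # mounted in app/flask_app.py (see DispatcherMiddleware there).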
+
+ - job_name: 'cadvisor'
+
+ # Override the global default and scrape targets from this job every 5 seconds.
+ scrape_interval: 5s
+
+ static_configs:
+ - targets: ['cadvisor:8080']
diff --git a/exercise_notebooks/prometheus_exercise/docker-compose.yml b/exercise_notebooks/prometheus_exercise/docker-compose.yml
new file mode 100644
index 0000000..522e59b
--- /dev/null
+++ b/exercise_notebooks/prometheus_exercise/docker-compose.yml
@@ -0,0 +1,51 @@
+version: '3'
+
+volumes:
+ prometheus_data: {}
+ grafana_data: {}
+
+services:
+ webapp:
+ build: .
+ container_name: webapp
+ expose:
+ - 5000
+ ports:
+ - 5000:5000
+ volumes:
+ - ./:/application
+ prometheus:
+ image: prom/prometheus
+ container_name: prometheus
+ volumes:
+ - ./config/prometheus/:/etc/prometheus/
+ - prometheus_data:/prometheus
+ command:
+ - '--config.file=/etc/prometheus/prometheus.yml'
+ expose:
+ - 9090
+ ports:
+ - 9090:9090
+ depends_on:
+ - cadvisor
+ grafana:
+ image: grafana/grafana
+ depends_on:
+ - prometheus
+ ports:
+ - 3000:3000
+ volumes:
+ - grafana_data:/var/lib/grafana
+ environment:
+ - GF_SECURITY_ADMIN_PASSWORD=foobar
+ - GF_USERS_ALLOW_SIGN_UP=false
+
+ cadvisor:
+ image: google/cadvisor
+ volumes:
+ - /:/rootfs:ro
+ - /var/run:/var/run:rw
+ - /sys:/sys:ro
+ - /var/lib/docker/:/var/lib/docker:ro
+ ports:
+ - 8080:8080
diff --git a/exercise_notebooks/prometheus_exercise/requirements.txt b/exercise_notebooks/prometheus_exercise/requirements.txt
new file mode 100644
index 0000000..0fbe48f
--- /dev/null
+++ b/exercise_notebooks/prometheus_exercise/requirements.txt
@@ -0,0 +1,4 @@
+Flask>=1.1.1,<1.2.0
+prometheus_client>=0.7.1,<0.8.0
+gunicorn>=20.0.4,<20.1.0
+
diff --git a/exercise_notebooks/unit_testing_exercise/requirements.txt b/exercise_notebooks/unit_testing_exercise/requirements.txt
new file mode 100644
index 0000000..a0c27fa
--- /dev/null
+++ b/exercise_notebooks/unit_testing_exercise/requirements.txt
@@ -0,0 +1,4 @@
+numpy>=1.18.1,<1.19.0
+scikit-learn>=0.22.1,<0.23.0
+pandas>=0.25.3,<0.26.0
+jupyter>=1.0.0,<1.1.0
\ No newline at end of file
diff --git a/exercise_notebooks/unit_testing_exercise/unit_testing_data_engineering.ipynb b/exercise_notebooks/unit_testing_exercise/unit_testing_data_engineering.ipynb
new file mode 100644
index 0000000..22447ce
--- /dev/null
+++ b/exercise_notebooks/unit_testing_exercise/unit_testing_data_engineering.ipynb
@@ -0,0 +1,333 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Unit Testing ML Code: Hands-on Exercise (Data Engineering)\n",
+ "\n",
+ "## In this notebook we will explore unit tests for data engineering\n",
+ "\n",
+ "#### We will use a classic toy dataset: the Iris plants dataset, which comes included with scikit-learn\n",
+ "Dataset details: https://scikit-learn.org/stable/datasets/index.html#iris-plants-dataset\n",
+ "\n",
+ "As we progress through the course, the complexity of examples will increase, but we will start with something basic. This notebook is designed so that it can be run in isolation, once the setup steps described below are complete.\n",
+ "\n",
+ "### Setup\n",
+ "\n",
+    "Let's begin by importing the dataset and the libraries we are going to use. Make sure you have run `pip install -r requirements.txt` using the requirements file located in the same directory as this notebook. We recommend doing this in a separate virtual environment (see the dedicated setup lecture).\n",
+ "\n",
+ "If you need a refresher on jupyter, pandas or numpy, there are some links to resources in the section notes."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "metadata": {
+ "pycharm": {
+ "is_executing": false,
+ "name": "#%%\n"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "from sklearn import datasets\n",
+ "import pandas as pd\n",
+ "import numpy as np\n",
+ "\n",
+ "# Access the iris dataset from sklearn\n",
+ "iris = datasets.load_iris()\n",
+ "\n",
+ "# Load the iris data into a pandas dataframe. The `data` and `feature_names`\n",
+ "# attributes of the dataset are added by default by sklearn. We use them to\n",
+ "# specify the columns of our dataframes.\n",
+ "iris_frame = pd.DataFrame(iris.data, columns=iris.feature_names)\n",
+ "\n",
+ "# Create a \"target\" column in our dataframe, and set the values to the correct\n",
+ "# classifications from the dataset.\n",
+ "iris_frame['target'] = iris.target"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Add the `SimplePipeline` from the Test Input Values notebook (same as previous lecture, no changes here)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from sklearn.linear_model import LogisticRegression\n",
+ "from sklearn.model_selection import train_test_split\n",
+ "\n",
+ "\n",
+ "class SimplePipeline:\n",
+ " def __init__(self):\n",
+ " self.frame = None\n",
+ " # Shorthand to specify that each value should start out as\n",
+ " # None when the class is instantiated.\n",
+    "        self.X_train, self.X_test, self.y_train, self.y_test = None, None, None, None\n",
+ " self.model = None\n",
+ " self.load_dataset()\n",
+ " \n",
+ " def load_dataset(self):\n",
+ " \"\"\"Load the dataset and perform train test split.\"\"\"\n",
+ " # fetch from sklearn\n",
+ " dataset = datasets.load_iris()\n",
+ " \n",
+ " # remove units ' (cm)' from variable names\n",
+ " self.feature_names = [fn[:-5] for fn in dataset.feature_names]\n",
+ " self.frame = pd.DataFrame(dataset.data, columns=self.feature_names)\n",
+ " self.frame['target'] = dataset.target\n",
+ " \n",
+ " # we divide the data set using the train_test_split function from sklearn, \n",
+ " # which takes as parameters, the dataframe with the predictor variables, \n",
+ " # then the target, then the percentage of data to assign to the test set, \n",
+ " # and finally the random_state to ensure reproducibility.\n",
+ " self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(\n",
+ " self.frame[self.feature_names], self.frame.target, test_size=0.65, random_state=42)\n",
+ " \n",
+ " def train(self, algorithm=LogisticRegression):\n",
+ " \n",
+ " # we set up a LogisticRegression classifier with default parameters\n",
+ " self.model = algorithm(solver='lbfgs', multi_class='auto')\n",
+ " self.model.fit(self.X_train, self.y_train)\n",
+ " \n",
+ " def predict(self, input_data):\n",
+ " return self.model.predict(input_data)\n",
+ " \n",
+ " def get_accuracy(self):\n",
+ " \n",
+ " # use our X_test and y_test values generated when we used\n",
+ " # `train_test_split` to test accuracy.\n",
+    "        # score is a method on the LogisticRegression model that\n",
+ " # returns the accuracy by default, but can be changed to other metrics, see: \n",
+ " # https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html#sklearn.linear_model.LogisticRegression.score\n",
+ " return self.model.score(X=self.X_test, y=self.y_test)\n",
+ " \n",
+ " def run_pipeline(self):\n",
+ " \"\"\"Helper method to run multiple pipeline methods with one call.\"\"\"\n",
+ " self.load_dataset()\n",
+ " self.train()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Test Engineered Data (preprocessing)\n",
+ "\n",
+    "Below we create an updated pipeline which inherits from the SimplePipeline but adds a preprocessing step that applies a scaler. Linear models are sensitive to the scale of the features: for example, features with larger magnitudes tend to dominate the fit if we do not apply a scaler.\n",
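+    "\n",
+    "`StandardScaler` standardizes each feature to $z = (x - \\mu) / \\sigma$ using statistics computed from the training data, which is why the unit tests below expect the transformed training set to have a mean close to 0 and a standard deviation close to 1."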
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from sklearn.preprocessing import StandardScaler\n",
+ "\n",
+ "\n",
+ "class PipelineWithDataEngineering(SimplePipeline):\n",
+ " def __init__(self):\n",
+ " # Call the inherited SimplePipeline __init__ method first.\n",
+ " super().__init__()\n",
+ " \n",
+ " # scaler to standardize the variables in the dataset\n",
+ " self.scaler = StandardScaler()\n",
+ " # Train the scaler once upon pipeline instantiation:\n",
+ " # Compute the mean and standard deviation based on the training data\n",
+ " self.scaler.fit(self.X_train)\n",
+ " \n",
+ " def apply_scaler(self):\n",
+ " # Scale the test and training data to be of mean 0 and of unit variance\n",
+ " self.X_train = self.scaler.transform(self.X_train)\n",
+ " self.X_test = self.scaler.transform(self.X_test)\n",
+ " \n",
+ " def predict(self, input_data):\n",
+ " # apply scaler transform on inputs before predictions\n",
+ " scaled_input_data = self.scaler.transform(input_data)\n",
+ " return self.model.predict(scaled_input_data)\n",
+ " \n",
+ " def run_pipeline(self):\n",
+ " \"\"\"Helper method to run multiple pipeline methods with one call.\"\"\"\n",
+ " self.load_dataset()\n",
+    "        self.apply_scaler()  # updated in this class\n",
+ " self.train()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "current model accuracy is: 0.9591836734693877\n"
+ ]
+ }
+ ],
+ "source": [
+ "pipeline = PipelineWithDataEngineering()\n",
+ "pipeline.run_pipeline()\n",
+ "accuracy_score = pipeline.get_accuracy()\n",
+ "print(f'current model accuracy is: {accuracy_score}')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Now we Unit Test\n",
+    "We focus specifically on the feature engineering step."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import unittest\n",
+ "\n",
+ "\n",
+ "class TestIrisDataEngineering(unittest.TestCase):\n",
+ " def setUp(self):\n",
+ " self.pipeline = PipelineWithDataEngineering()\n",
+ " self.pipeline.load_dataset()\n",
+ " \n",
+ " def test_scaler_preprocessing_brings_x_train_mean_near_zero(self):\n",
+ " # Given\n",
+ " # convert the dataframe to be a single column with pandas stack\n",
+ " original_mean = self.pipeline.X_train.stack().mean()\n",
+ " \n",
+ " # When\n",
+ " self.pipeline.apply_scaler()\n",
+ " \n",
+ " # Then\n",
+ " # The idea behind StandardScaler is that it will transform your data \n",
+ " # to center the distribution at 0 and scale the variance at 1.\n",
+ " # Therefore we test that the mean has shifted to be less than the original\n",
+ " # and close to 0 using assertAlmostEqual to check to 3 decimal places:\n",
+ " # https://docs.python.org/3/library/unittest.html#unittest.TestCase.assertAlmostEqual\n",
+ " self.assertTrue(original_mean > self.pipeline.X_train.mean()) # X_train is a numpy array at this point.\n",
+ " self.assertAlmostEqual(self.pipeline.X_train.mean(), 0.0, places=3)\n",
+ " print(f'Original X train mean: {original_mean}')\n",
+ " print(f'Transformed X train mean: {self.pipeline.X_train.mean()}')\n",
+ " \n",
+ " def test_scaler_preprocessing_brings_x_train_std_near_one(self):\n",
+ " # When\n",
+ " self.pipeline.apply_scaler()\n",
+ " \n",
+ " # Then\n",
+ " # We also check that the standard deviation is close to 1\n",
+ " self.assertAlmostEqual(self.pipeline.X_train.std(), 1.0, places=3)\n",
+ " print(f'Transformed X train standard deviation : {self.pipeline.X_train.std()}')\n",
+ " "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ ".."
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Original X train mean: 3.5889423076923075\n",
+ "Transformed X train mean: -5.978123978750843e-17\n",
+ "Transformed X train standard deviation : 1.0\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "----------------------------------------------------------------------\n",
+ "Ran 2 tests in 0.029s\n",
+ "\n",
+ "OK\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+       "<unittest.runner.TextTestResult run=2 errors=0 failures=0>"
+ ]
+ },
+ "execution_count": 20,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "import sys\n",
+ "\n",
+ "\n",
+ "suite = unittest.TestLoader().loadTestsFromTestCase(TestIrisDataEngineering)\n",
+ "unittest.TextTestRunner(verbosity=1, stream=sys.stderr).run(suite)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Data Engineering Test: Hands-on Exercise\n",
+ "Change the pipeline class preprocessing so that the test fails. Do you understand why the test is failing?"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.7.6"
+ },
+ "pycharm": {
+ "stem_cell": {
+ "cell_type": "raw",
+ "metadata": {
+ "collapsed": false
+ },
+ "source": []
+ }
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 1
+}
diff --git a/exercise_notebooks/unit_testing_exercise/unit_testing_input_data.ipynb b/exercise_notebooks/unit_testing_exercise/unit_testing_input_data.ipynb
new file mode 100644
index 0000000..8d658ce
--- /dev/null
+++ b/exercise_notebooks/unit_testing_exercise/unit_testing_input_data.ipynb
@@ -0,0 +1,558 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Unit Testing ML Code: Hands-on Exercise (Input Values)\n",
+ "\n",
+ "## In this notebook we will explore unit tests to validate input data using a basic schema\n",
+ "\n",
+ "#### We will use a classic toy dataset: the Iris plants dataset, which comes included with scikit-learn\n",
+ "Dataset details: https://scikit-learn.org/stable/datasets/index.html#iris-plants-dataset\n",
+ "\n",
+ "As we progress through the course, the complexity of examples will increase, but we will start with something basic. This notebook is designed so that it can be run in isolation, once the setup steps described below are complete.\n",
+ "\n",
+ "### Setup\n",
+ "\n",
+    "Let's begin by importing the dataset and the libraries we are going to use. Make sure you have run `pip install -r requirements.txt` using the requirements file located in the same directory as this notebook. We recommend doing this in a separate virtual environment (see the dedicated setup lecture).\n",
+ "\n",
+ "If you need a refresher on jupyter, pandas or numpy, there are some links to resources in the section notes."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {
+ "pycharm": {
+ "is_executing": false,
+ "name": "#%%\n"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "from sklearn import datasets\n",
+ "import pandas as pd\n",
+ "import numpy as np\n",
+ "\n",
+ "# Access the iris dataset from sklearn\n",
+ "iris = datasets.load_iris()\n",
+ "\n",
+ "# Load the iris data into a pandas dataframe. The `data` and `feature_names`\n",
+ "# attributes of the dataset are added by default by sklearn. We use them to\n",
+ "# specify the columns of our dataframes.\n",
+ "iris_frame = pd.DataFrame(iris.data, columns=iris.feature_names)\n",
+ "\n",
+ "# Create a \"target\" column in our dataframe, and set the values to the correct\n",
+ "# classifications from the dataset.\n",
+ "iris_frame['target'] = iris.target"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {
+ "pycharm": {
+ "is_executing": false,
+ "name": "#%%\n"
+ }
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ " sepal length (cm) sepal width (cm) petal length (cm) petal width (cm) \\\n",
+ "0 5.1 3.5 1.4 0.2 \n",
+ "1 4.9 3.0 1.4 0.2 \n",
+ "2 4.7 3.2 1.3 0.2 \n",
+ "3 4.6 3.1 1.5 0.2 \n",
+ "4 5.0 3.6 1.4 0.2 \n",
+ "\n",
+ " target \n",
+ "0 0 \n",
+ "1 0 \n",
+ "2 0 \n",
+ "3 0 \n",
+ "4 0 "
+ ]
+ },
+ "execution_count": 2,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# View the first 5 rows of our dataframe.\n",
+ "iris_frame.head(5)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ " sepal length (cm) sepal width (cm) petal length (cm) \\\n",
+ "count 150.000000 150.000000 150.000000 \n",
+ "mean 5.843333 3.057333 3.758000 \n",
+ "std 0.828066 0.435866 1.765298 \n",
+ "min 4.300000 2.000000 1.000000 \n",
+ "25% 5.100000 2.800000 1.600000 \n",
+ "50% 5.800000 3.000000 4.350000 \n",
+ "75% 6.400000 3.300000 5.100000 \n",
+ "max 7.900000 4.400000 6.900000 \n",
+ "\n",
+ " petal width (cm) target \n",
+ "count 150.000000 150.000000 \n",
+ "mean 1.199333 1.000000 \n",
+ "std 0.762238 0.819232 \n",
+ "min 0.100000 0.000000 \n",
+ "25% 0.300000 0.000000 \n",
+ "50% 1.300000 1.000000 \n",
+ "75% 1.800000 2.000000 \n",
+ "max 2.500000 2.000000 "
+ ]
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# View summary statistics for our dataframe.\n",
+ "iris_frame.describe()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Now that we have our data loaded, we will create a simplified pipeline.\n",
+ "\n",
+ "This pipeline is a class for encapsulating all the related functionality for our model. As the course unfolds, we will work with more complex pipelines, including those provided by third party libraries.\n",
+ "\n",
+ "We train a logistic regression model to classify the flowers from the Iris dataset."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from sklearn.linear_model import LogisticRegression\n",
+ "from sklearn.model_selection import train_test_split\n",
+ "\n",
+ "\n",
+ "class SimplePipeline:\n",
+ " def __init__(self):\n",
+ " self.frame = None\n",
+ " # Shorthand to specify that each value should start out as\n",
+ " # None when the class is instantiated.\n",
+ " self.X_train, self.X_test, self.y_train, self.Y_test = None, None, None, None\n",
+ " self.model = None\n",
+ " self.load_dataset()\n",
+ " \n",
+ " def load_dataset(self):\n",
+ " \"\"\"Load the dataset and perform train test split.\"\"\"\n",
+ " # fetch from sklearn\n",
+ " dataset = datasets.load_iris()\n",
+ " \n",
+ " # remove units ' (cm)' from variable names\n",
+ " self.feature_names = [fn[:-5] for fn in dataset.feature_names]\n",
+ " self.frame = pd.DataFrame(dataset.data, columns=self.feature_names)\n",
+ " self.frame['target'] = dataset.target\n",
+ " \n",
+ " # we divide the data set using the train_test_split function from sklearn, \n",
+ " # which takes as parameters, the dataframe with the predictor variables, \n",
+ " # then the target, then the percentage of data to assign to the test set, \n",
+ " # and finally the random_state to ensure reproducibility.\n",
+ " self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(\n",
+ " self.frame[self.feature_names], self.frame.target, test_size=0.65, random_state=42)\n",
+ " \n",
+ " def train(self, algorithm=LogisticRegression):\n",
+ " \n",
+ " # we set up a LogisticRegression classifier with default parameters\n",
+ " self.model = algorithm(solver='lbfgs', multi_class='auto')\n",
+ " self.model.fit(self.X_train, self.y_train)\n",
+ " \n",
+ " def predict(self, input_data):\n",
+ " return self.model.predict(input_data)\n",
+ " \n",
+ " def get_accuracy(self):\n",
+ " \n",
+ " # use our X_test and y_test values generated when we used\n",
+ " # `train_test_split` to test accuracy.\n",
+ " # score is a method on the Logisitic Regression that \n",
+ " # returns the accuracy by default, but can be changed to other metrics, see: \n",
+ " # https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html#sklearn.linear_model.LogisticRegression.score\n",
+ " return self.model.score(X=self.X_test, y=self.y_test)\n",
+ " \n",
+ " def run_pipeline(self):\n",
+ " \"\"\"Helper method to run multiple pipeline methods with one call.\"\"\"\n",
+ " self.load_dataset()\n",
+ " self.train()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "current model accuracy is: 0.9693877551020408\n"
+ ]
+ }
+ ],
+ "source": [
+ "pipeline = SimplePipeline()\n",
+ "pipeline.run_pipeline()\n",
+ "accuracy_score = pipeline.get_accuracy()\n",
+ "\n",
+ "# note that f' string interpolation syntax requires python 3.6\n",
+ "# https://www.python.org/dev/peps/pep-0498/\n",
+ "print(f'current model accuracy is: {accuracy_score}')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Test Inputs\n",
+ "\n",
+ "Now that we have our basic pipeline, we are in a position to test the input data.\n",
+ "\n",
+ "Best practice is to use a schema. A schema is a collection of rules which specify the expected values for a set of fields. Below we show a simple schema (just using a nested dictionary) for the Iris dataset. Later in the course we will look at more complex schemas, using some of the common Python libraries for data validation.\n",
+ "\n",
+ "The schema specifies the maximum and minimum values that can be taken by each variable. We can learn these values from the data, as we have done for this demo, or these values may come from specific domain knowledge of the subject."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "iris_schema = {\n",
+ " 'sepal length': {\n",
+ " 'range': {\n",
+ " 'min': 4.0, # determined by looking at the dataframe .describe() method\n",
+ " 'max': 8.0\n",
+ " },\n",
+ " 'dtype': float,\n",
+ " },\n",
+ " 'sepal width': {\n",
+ " 'range': {\n",
+ " 'min': 1.0,\n",
+ " 'max': 5.0\n",
+ " },\n",
+ " 'dtype': float,\n",
+ " },\n",
+ " 'petal length': {\n",
+ " 'range': {\n",
+ " 'min': 1.0,\n",
+ " 'max': 7.0\n",
+ " },\n",
+ " 'dtype': float,\n",
+ " },\n",
+ " 'petal width': {\n",
+ " 'range': {\n",
+ " 'min': 0.1,\n",
+ " 'max': 3.0\n",
+ " },\n",
+ " 'dtype': float,\n",
+ " }\n",
+ "}"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import unittest\n",
+ "import sys\n",
+ "\n",
+ "class TestIrisInputData(unittest.TestCase):\n",
+ " def setUp(self):\n",
+ " \n",
+ " # `setUp` will be run before each test, ensuring that you\n",
+ " # have a new pipeline to access in your tests. See the \n",
+ " # unittest docs if you are unfamiliar with unittest.\n",
+ " # https://docs.python.org/3/library/unittest.html#unittest.TestCase.setUp\n",
+ " self.pipeline = SimplePipeline()\n",
+ " self.pipeline.run_pipeline()\n",
+ " \n",
+ " def test_input_data_ranges(self):\n",
+ " # get df max and min values for each column\n",
+ " max_values = self.pipeline.frame.max()\n",
+ " min_values = self.pipeline.frame.min()\n",
+ " \n",
+ " # loop over each feature (i.e. all 4 column names)\n",
+ " for feature in self.pipeline.feature_names:\n",
+ " \n",
+ " # use unittest assertions to ensure the max/min values found in the dataset\n",
+ " # are less than/greater than those expected by the schema max/min.\n",
+ " self.assertTrue(max_values[feature] <= iris_schema[feature]['range']['max'])\n",
+ " self.assertTrue(min_values[feature] >= iris_schema[feature]['range']['min'])\n",
+ " \n",
+ " def test_input_data_types(self):\n",
+ " data_types = self.pipeline.frame.dtypes # pandas dtypes method\n",
+ " \n",
+ " for feature in self.pipeline.feature_names:\n",
+ " self.assertEqual(data_types[feature], iris_schema[feature]['dtype'])\n",
+ " "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "..\n",
+ "----------------------------------------------------------------------\n",
+ "Ran 2 tests in 0.076s\n",
+ "\n",
+ "OK\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 9,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# setup code to allow unittest to run the above tests inside the jupyter notebook.\n",
+ "suite = unittest.TestLoader().loadTestsFromTestCase(TestIrisInputData)\n",
+ "unittest.TextTestRunner(verbosity=1, stream=sys.stderr).run(suite)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Data Input Test: Hands-on Exercise\n",
+ "Change either the schema or the input data (not the model config) so that the test fails. Do you understand why the test is failing?"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.7.6"
+ },
+ "pycharm": {
+ "stem_cell": {
+ "cell_type": "raw",
+ "metadata": {
+ "collapsed": false
+ },
+ "source": []
+ }
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 1
+}
diff --git a/exercise_notebooks/unit_testing_exercise/unit_testing_model_configuration.ipynb b/exercise_notebooks/unit_testing_exercise/unit_testing_model_configuration.ipynb
new file mode 100644
index 0000000..13f5a30
--- /dev/null
+++ b/exercise_notebooks/unit_testing_exercise/unit_testing_model_configuration.ipynb
@@ -0,0 +1,266 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Unit Testing ML Code: Hands-on Exercise (Configuration)\n",
+ "\n",
+ "## In this notebook we will explore unit tests for *model configuration*\n",
+ "\n",
+ "#### We will use a classic toy dataset: the Iris plants dataset, which comes included with scikit-learn\n",
+ "Dataset details: https://scikit-learn.org/stable/datasets/index.html#iris-plants-dataset\n",
+ "\n",
+ "As we progress through the course, the complexity of examples will increase, but we will start with something basic. This notebook is designed so that it can be run in isolation, once the setup steps described below are complete. Cells should be run one after the other without skipping any.\n",
+ "\n",
+ "### Setup\n",
+ "\n",
+ "Let's begin by importing the dataset and the libraries we are going to use. Make sure you have run `pip install -r requirements.txt` on requirements file located in the same directory as this notebook. We recommend doing this in a separate virtual environment (see dedicated setup lecture).\n",
+ "\n",
+ "If you need a refresher on jupyter, pandas or numpy, there are some links to resources in the section notes."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {
+ "pycharm": {
+ "is_executing": false,
+ "name": "#%%\n"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "from sklearn import datasets\n",
+ "import pandas as pd\n",
+ "import numpy as np\n",
+ "\n",
+ "# Access the iris dataset from sklearn\n",
+ "iris = datasets.load_iris()\n",
+ "\n",
+ "# Load the iris data into a pandas dataframe. The `data` and `feature_names`\n",
+ "# attributes of the dataset are added by default by sklearn. We use them to\n",
+ "# specify the columns of our dataframes.\n",
+ "iris_frame = pd.DataFrame(iris.data, columns=iris.feature_names)\n",
+ "\n",
+ "# Create a \"target\" column in our dataframe, and set the values to the correct\n",
+ "# classifications from the dataset.\n",
+ "iris_frame['target'] = iris.target"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Add the `SimplePipeline` from the Test Input Values notebook (same as first exercise, no changes here)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from sklearn.linear_model import LogisticRegression\n",
+ "from sklearn.model_selection import train_test_split\n",
+ "\n",
+ "\n",
+ "class SimplePipeline:\n",
+ " def __init__(self):\n",
+ " self.frame = None\n",
+ " # Shorthand to specify that each value should start out as\n",
+ " # None when the class is instantiated.\n",
+ " self.X_train, self.X_test, self.y_train, self.Y_test = None, None, None, None\n",
+ " self.model = None\n",
+ " self.load_dataset()\n",
+ " \n",
+ " def load_dataset(self):\n",
+ " \"\"\"Load the dataset and perform train test split.\"\"\"\n",
+ " # fetch from sklearn\n",
+ " dataset = datasets.load_iris()\n",
+ " \n",
+ " # remove units ' (cm)' from variable names\n",
+ " self.feature_names = [fn[:-5] for fn in dataset.feature_names]\n",
+ " self.frame = pd.DataFrame(dataset.data, columns=self.feature_names)\n",
+ " self.frame['target'] = dataset.target\n",
+ " \n",
+ " # we divide the data set using the train_test_split function from sklearn, \n",
+ " # which takes as parameters, the dataframe with the predictor variables, \n",
+ " # then the target, then the percentage of data to assign to the test set, \n",
+ " # and finally the random_state to ensure reproducibility.\n",
+ " self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(\n",
+ " self.frame[self.feature_names], self.frame.target, test_size=0.65, random_state=42)\n",
+ " \n",
+ " def train(self, algorithm=LogisticRegression):\n",
+ " \n",
+ " # we set up a LogisticRegression classifier with default parameters\n",
+ " self.model = algorithm(solver='lbfgs', multi_class='auto')\n",
+ " self.model.fit(self.X_train, self.y_train)\n",
+ " \n",
+ " def predict(self, input_data):\n",
+ " return self.model.predict(input_data)\n",
+ " \n",
+ " def get_accuracy(self):\n",
+ " \n",
+ " # use our X_test and y_test values generated when we used\n",
+ " # `train_test_split` to test accuracy.\n",
+ " # score is a method on the Logisitic Regression that \n",
+ " # returns the accuracy by default, but can be changed to other metrics, see: \n",
+ " # https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html#sklearn.linear_model.LogisticRegression.score\n",
+ " return self.model.score(X=self.X_test, y=self.y_test)\n",
+ " \n",
+ " def run_pipeline(self):\n",
+ " \"\"\"Helper method to run multiple pipeline methods with one call.\"\"\"\n",
+ " self.load_dataset()\n",
+ " self.train()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Update the Pipeline\n",
+ "\n",
+ "We now create a new pipeline class which inherits from the `SimplePipeline` with one important modification: The configuration for the model is passed in as an argument when the pipeline object is instantiated. This means that configuration can be set via an external object or file."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "class PipelineWithConfig(SimplePipeline):\n",
+ " def __init__(self, config):\n",
+ " # Call the inherited SimplePipeline __init__ method first.\n",
+ " super().__init__()\n",
+ " # Pass in a config object which we use during the train method.\n",
+ " self.config = config\n",
+ " \n",
+ " def train(self, algorithm=LogisticRegression):\n",
+ " # note that we instantiate the LogisticRegression classifier \n",
+ " # with params from the pipeline config\n",
+ " self.model = algorithm(solver=self.config.get('solver'),\n",
+ " multi_class=self.config.get('multi_class'))\n",
+ " self.model.fit(self.X_train, self.y_train)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Now we Unit Test\n",
+ "\n",
+ "We will employ a simple unit test to check the configuration values.\n",
+ "\n",
+ "Let's say that after extensive testing in the research environment, we deduce that certain types of configuration (parameters passed to the model, preprocessing settings, GPU configurations etc.) are optimal, or that certain configurations tend to be a bad idea. We should then test our configuration is validated against this understanding."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import unittest\n",
+ "\n",
+ "\n",
+ "# arbitrarily selected for demonstration purposes. In a real\n",
+ "# system you would define this in config and import into your\n",
+ "# tests so you didn't have to update config and tests when\n",
+ "# the values changed.\n",
+ "ENABLED_MODEL_SOLVERS = {'lbfgs', 'newton-cg'}\n",
+ "\n",
+ "\n",
+ "class TestIrisConfig(unittest.TestCase):\n",
+ " def setUp(self):\n",
+ " # We prepare the pipeline for use in the tests\n",
+ " config = {'solver': 'lbfgs', 'multi_class': 'auto'}\n",
+ " self.pipeline = PipelineWithConfig(config=config)\n",
+ " self.pipeline.run_pipeline()\n",
+ " \n",
+ " def test_pipeline_config(self):\n",
+ " # Given\n",
+ " # fetch model config using sklearn get_params()\n",
+ " # https://scikit-learn.org/stable/modules/generated/sklearn.base.BaseEstimator.html#sklearn.base.BaseEstimator.get_params\n",
+ " model_params = self.pipeline.model.get_params()\n",
+ " \n",
+ " # Then\n",
+ " self.assertTrue(model_params['solver'] in ENABLED_MODEL_SOLVERS)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ ".\n",
+ "----------------------------------------------------------------------\n",
+ "Ran 1 test in 0.034s\n",
+ "\n",
+ "OK\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "import sys\n",
+ "\n",
+ "\n",
+ "suite = unittest.TestLoader().loadTestsFromTestCase(TestIrisConfig)\n",
+ "unittest.TextTestRunner(verbosity=1, stream=sys.stderr).run(suite)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Model Configuration Testing: Hands-on Exercise\n",
+ "Change the model config so that the test fails. Do you understand why the test is failing?"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.7.6"
+ },
+ "pycharm": {
+ "stem_cell": {
+ "cell_type": "raw",
+ "metadata": {
+ "collapsed": false
+ },
+ "source": []
+ }
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 1
+}
diff --git a/exercise_notebooks/unit_testing_exercise/unit_testing_model_predictions_quality.ipynb b/exercise_notebooks/unit_testing_exercise/unit_testing_model_predictions_quality.ipynb
new file mode 100644
index 0000000..b72e986
--- /dev/null
+++ b/exercise_notebooks/unit_testing_exercise/unit_testing_model_predictions_quality.ipynb
@@ -0,0 +1,365 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Unit Testing ML Code: Hands-on Exercise (Model Quality)\n",
+ "\n",
+ "## In this notebook we will explore unit tests for *model prediction quality*\n",
+ "\n",
+ "#### We will use a classic toy dataset: the Iris plants dataset, which comes included with scikit-learn\n",
+ "Dataset details: https://scikit-learn.org/stable/datasets/index.html#iris-plants-dataset\n",
+ "\n",
+ "As we progress through the course, the complexity of examples will increase, but we will start with something basic. This notebook is designed so that it can be run in isolation, once the setup steps described below are complete. Cells should be run one after the other without skipping any.\n",
+ "\n",
+ "### Setup\n",
+ "\n",
+ "Let's begin by importing the dataset and the libraries we are going to use. Make sure you have run `pip install -r requirements.txt` on requirements file located in the same directory as this notebook. We recommend doing this in a separate virtual environment (see dedicated setup lecture).\n",
+ "\n",
+ "If you need a refresher on jupyter, pandas or numpy, there are some links to resources in the section notes."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {
+ "pycharm": {
+ "is_executing": false,
+ "name": "#%%\n"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "from sklearn import datasets\n",
+ "import pandas as pd\n",
+ "import numpy as np\n",
+ "\n",
+ "# Access the iris dataset from sklearn\n",
+ "iris = datasets.load_iris()\n",
+ "\n",
+ "# Load the iris data into a pandas dataframe. The `data` and `feature_names`\n",
+ "# attributes of the dataset are added by default by sklearn. We use them to\n",
+ "# specify the columns of our dataframes.\n",
+ "iris_frame = pd.DataFrame(iris.data, columns=iris.feature_names)\n",
+ "\n",
+ "# Create a \"target\" column in our dataframe, and set the values to the correct\n",
+ "# classifications from the dataset.\n",
+ "iris_frame['target'] = iris.target"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Create the Pipelines\n",
+ "\n",
+ "Below we use both pipelines from the previous exercises:\n",
+ "\n",
+ "- `SimplePipeline` from the testing inputs lecture\n",
+ "- `PipelineWithFeatureEngineering` from the testing data engineering lecture\n",
+ "\n",
+ "The pipelines have not been changed. We use both so that we can compare predictions between them in our tests."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from sklearn.linear_model import LogisticRegression\n",
+ "from sklearn.model_selection import train_test_split\n",
+ "\n",
+ "\n",
+ "class SimplePipeline:\n",
+ " def __init__(self):\n",
+ " self.frame = None\n",
+ " # Shorthand to specify that each value should start out as\n",
+ " # None when the class is instantiated.\n",
+ " self.X_train, self.X_test, self.y_train, self.Y_test = None, None, None, None\n",
+ " self.model = None\n",
+ " self.load_dataset()\n",
+ " \n",
+ " def load_dataset(self):\n",
+ " \"\"\"Load the dataset and perform train test split.\"\"\"\n",
+ " # fetch from sklearn\n",
+ " dataset = datasets.load_iris()\n",
+ " \n",
+ " # remove units ' (cm)' from variable names\n",
+ " self.feature_names = [fn[:-5] for fn in dataset.feature_names]\n",
+ " self.frame = pd.DataFrame(dataset.data, columns=self.feature_names)\n",
+ " self.frame['target'] = dataset.target\n",
+ " \n",
+ " # we divide the data set using the train_test_split function from sklearn, \n",
+ " # which takes as parameters, the dataframe with the predictor variables, \n",
+ " # then the target, then the percentage of data to assign to the test set, \n",
+ " # and finally the random_state to ensure reproducibility.\n",
+ " self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(\n",
+ " self.frame[self.feature_names], self.frame.target, test_size=0.65, random_state=42)\n",
+ " \n",
+ " def train(self, algorithm=LogisticRegression):\n",
+ " \n",
+ " # we set up a LogisticRegression classifier with default parameters\n",
+ " self.model = algorithm(solver='lbfgs', multi_class='auto')\n",
+ " self.model.fit(self.X_train, self.y_train)\n",
+ " \n",
+ " def predict(self, input_data):\n",
+ " return self.model.predict(input_data)\n",
+ " \n",
+ " def get_accuracy(self):\n",
+ " \n",
+ " # use our X_test and y_test values generated when we used\n",
+ " # `train_test_split` to test accuracy.\n",
+ " # score is a method on the Logisitic Regression that \n",
+ " # returns the accuracy by default, but can be changed to other metrics, see: \n",
+ " # https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html#sklearn.linear_model.LogisticRegression.score\n",
+ " return self.model.score(X=self.X_test, y=self.y_test)\n",
+ " \n",
+ " def run_pipeline(self):\n",
+ " \"\"\"Helper method to run multiple pipeline methods with one call.\"\"\"\n",
+ " self.load_dataset()\n",
+ " self.train()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from sklearn.preprocessing import StandardScaler\n",
+ "\n",
+ "\n",
+ "class PipelineWithDataEngineering(SimplePipeline):\n",
+ " def __init__(self):\n",
+ " # Call the inherited SimplePipeline __init__ method first.\n",
+ " super().__init__()\n",
+ " \n",
+ " # scaler to standardize the variables in the dataset\n",
+ " self.scaler = StandardScaler()\n",
+ " # Train the scaler once upon pipeline instantiation:\n",
+ " # Compute the mean and standard deviation based on the training data\n",
+ " self.scaler.fit(self.X_train)\n",
+ " \n",
+ " def apply_scaler(self):\n",
+ " # Scale the test and training data to be of mean 0 and of unit variance\n",
+ " self.X_train = self.scaler.transform(self.X_train)\n",
+ " self.X_test = self.scaler.transform(self.X_test)\n",
+ " \n",
+ " def predict(self, input_data):\n",
+ " # apply scaler transform on inputs before predictions\n",
+ " scaled_input_data = self.scaler.transform(input_data)\n",
+ " return self.model.predict(scaled_input_data)\n",
+ " \n",
+ " def run_pipeline(self):\n",
+ " \"\"\"Helper method to run multiple pipeline methods with one call.\"\"\"\n",
+ " self.load_dataset()\n",
+ " self.apply_scaler() # updated in the this class\n",
+ " self.train()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Now we Unit Test\n",
+ "\n",
+ "We will employ a few different tests for model prediction quality:\n",
+ "\n",
+ "1. A benchmark test: checking model accuracy against a simple benchmark\n",
+ "2. A differential test: checking model accuracy from one version to the next\n",
+ "\n",
+ "To begin, let's establish a base line. The simplest baseline is predicting the most common class. If we run: "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "2 50\n",
+ "1 50\n",
+ "0 50\n",
+ "Name: target, dtype: int64"
+ ]
+ },
+ "execution_count": 4,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "iris_frame['target'].value_counts()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "We can see that there an equal number of classifications for the 3 flower types. Let's check the accuracy when always predicting classification 1. Obviously this is a very low benchmark (circa 33% accuracy on the dataset), but it serves to illustrate the sort of checks you should be running with your models. If this test fails, then our model accuracy is terrible and we have probably introduced a severe bug into our code."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import unittest\n",
+ "from sklearn.metrics import mean_squared_error, accuracy_score\n",
+ "\n",
+ "\n",
+ "class TestIrisPredictions(unittest.TestCase):\n",
+ " def setUp(self):\n",
+ " # We prepare both pipelines for use in the tests\n",
+ " self.pipeline_v1 = SimplePipeline()\n",
+ " self.pipeline_v2 = PipelineWithDataEngineering()\n",
+ " self.pipeline_v1.run_pipeline()\n",
+ " self.pipeline_v2.run_pipeline()\n",
+ " \n",
+ " # the benchmark is simply the same classification value for \n",
+ " # for every test entry\n",
+ " self.benchmark_predictions = [1.0] * len(self.pipeline_v1.y_test)\n",
+ " \n",
+ " def test_accuracy_higher_than_benchmark(self):\n",
+ " # Given\n",
+ " benchmark_accuracy = accuracy_score(\n",
+ " y_true=self.pipeline_v1.y_test,\n",
+ " y_pred=self.benchmark_predictions)\n",
+ " \n",
+ " predictions = self.pipeline_v1.predict(self.pipeline_v1.X_test)\n",
+ " \n",
+ " # When\n",
+ " actual_accuracy = accuracy_score(\n",
+ " y_true=self.pipeline_v1.y_test,\n",
+ " y_pred=predictions)\n",
+ " \n",
+ " # Then\n",
+ " print(f'model accuracy: {actual_accuracy}, benchmark accuracy: {benchmark_accuracy}')\n",
+ " self.assertTrue(actual_accuracy > benchmark_accuracy)\n",
+ " \n",
+ " def test_accuracy_compared_to_previous_version(self):\n",
+ " # When\n",
+ " v1_accuracy = self.pipeline_v1.get_accuracy()\n",
+ " v2_accuracy = self.pipeline_v2.get_accuracy()\n",
+ " print(f'pipeline v1 accuracy: {v1_accuracy}')\n",
+ " print(f'pipeline v2 accuracy: {v2_accuracy}')\n",
+ " \n",
+ " # Then\n",
+ " self.assertTrue(v2_accuracy >= v1_accuracy)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "F."
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "pipeline v1 accuracy: 0.9693877551020408\n",
+ "pipeline v2 accuracy: 0.9591836734693877\n",
+ "model accuracy: 0.9693877551020408, benchmark accuracy: 0.32653061224489793\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "======================================================================\n",
+ "FAIL: test_accuracy_compared_to_previous_version (__main__.TestIrisPredictions)\n",
+ "----------------------------------------------------------------------\n",
+ "Traceback (most recent call last):\n",
+ " File \"\", line 42, in test_accuracy_compared_to_previous_version\n",
+ " self.assertTrue(v2_accuracy >= v1_accuracy)\n",
+ "AssertionError: False is not true\n",
+ "\n",
+ "----------------------------------------------------------------------\n",
+ "Ran 2 tests in 0.115s\n",
+ "\n",
+ "FAILED (failures=1)\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "import sys\n",
+ "\n",
+ "\n",
+ "suite = unittest.TestLoader().loadTestsFromTestCase(TestIrisPredictions)\n",
+ "unittest.TextTestRunner(verbosity=1, stream=sys.stderr).run(suite)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Model Quality Testing: Hands-on Exercise\n",
+ "1. Change the SimplePipeline class so that the benchmark test fails. Do you understand why the test is failing?\n",
+ "\n",
+ "2. Change either the SimplePipeline or the PipelineWithDataEngineering classes so that `test_accuracy_compared_to_previous_version` **passes**. \n",
+ "\n",
+ "These tests are a little more open ended than others we have looked at, don't worry if you find them tricky!"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.7.6"
+ },
+ "pycharm": {
+ "stem_cell": {
+ "cell_type": "raw",
+ "metadata": {
+ "collapsed": false
+ },
+ "source": []
+ }
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 1
+}
diff --git a/exercise_notebooks/utility_scripts/MapPortsForDocker.cmd b/exercise_notebooks/utility_scripts/MapPortsForDocker.cmd
new file mode 100644
index 0000000..6b912d6
--- /dev/null
+++ b/exercise_notebooks/utility_scripts/MapPortsForDocker.cmd
@@ -0,0 +1,23 @@
+@echo off
+SETLOCAL ENABLEEXTENSIONS ENABLEDELAYEDEXPANSION
+::%1 means the first script parameter or switch
+if "%~1"=="" (
+ echo You need to pass a port as a parameter. Example %~n0 port
+ pause
+ goto :EOF
+)
+
+set hostFilePath="c:\Windows\System32\drivers\etc\hosts"
+set ContainerPort=%1
+
+for /f "USEBACKQ" %%a in (`docker-machine ip`) do set DockerIP=%%a
+for /f "tokens=3 delims=: USEBACKQ" %%b in (`find /c "%DockerIP%" %hostFilePath%`) do (
+ if /I "%%b"==" 0" (echo %DockerIP% localhost >> %hostFilePath%)
+)
+netsh interface portproxy add v4tov4 listenport=%ContainerPort% listenaddress=127.0.0.1 connectaddress=%DockerIP% connectport=%ContainerPort%
+netsh interface portproxy add v6tov4 listenport=%ContainerPort% listenaddress=::1 connectaddress=%DockerIP% connectport=%ContainerPort%
+netsh interface portproxy show v4tov4
+netsh interface portproxy show v6tov4
+::"netsh interface portproxy show v4tov4" allows you to view current port redirection
+::"netsh interface portproxy delete v4tov4 listenport=%ContainerPort% listenaddress=127.0.0.1" allows you to remove port redirection
+ping -n 10 127.0.0.1 > nul
\ No newline at end of file
diff --git a/mypy.ini b/mypy.ini
new file mode 100644
index 0000000..97e52a5
--- /dev/null
+++ b/mypy.ini
@@ -0,0 +1,11 @@
+[mypy]
+warn_unused_ignores = True
+follow_imports = skip
+show_error_context = True
+warn_incomplete_stub = True
+ignore_missing_imports = True
+check_untyped_defs = True
+cache_dir = /dev/null
+warn_redundant_casts = True
+warn_unused_configs = True
+strict_optional = True
diff --git a/packages/gradient_boosting_model/MANIFEST.in b/packages/gradient_boosting_model/MANIFEST.in
new file mode 100644
index 0000000..1e6401a
--- /dev/null
+++ b/packages/gradient_boosting_model/MANIFEST.in
@@ -0,0 +1,17 @@
+include *.txt
+include *.md
+include *.pkl
+recursive-include gradient_boosting_model *
+
+include gradient_boosting_model/datasets/houseprice.csv
+include gradient_boosting_model/datasets/test.csv
+include gradient_boosting_model/trained_models/*.pkl
+include gradient_boosting_model/VERSION
+include gradient_boosting_model/config.yml
+
+include requirements.txt
+exclude *.log
+exclude *.cfg
+
+recursive-exclude * __pycache__
+recursive-exclude * *.py[co]
diff --git a/packages/gradient_boosting_model/gradient_boosting_model/VERSION b/packages/gradient_boosting_model/gradient_boosting_model/VERSION
new file mode 100644
index 0000000..f8bc4c6
--- /dev/null
+++ b/packages/gradient_boosting_model/gradient_boosting_model/VERSION
@@ -0,0 +1 @@
+0.1.18
diff --git a/packages/gradient_boosting_model/gradient_boosting_model/__init__.py b/packages/gradient_boosting_model/gradient_boosting_model/__init__.py
new file mode 100644
index 0000000..5270f37
--- /dev/null
+++ b/packages/gradient_boosting_model/gradient_boosting_model/__init__.py
@@ -0,0 +1,17 @@
+import logging
+
+from gradient_boosting_model.config.core import config, PACKAGE_ROOT
+
+# It is strongly advised that you do not add any handlers other than
+# NullHandler to your library’s loggers. This is because the configuration
+# of handlers is the prerogative of the application developer who uses your
+# library. The application developer knows their target audience and what
+# handlers are most appropriate for their application: if you add handlers
+# ‘under the hood’, you might well interfere with their ability to carry out
+# unit tests and deliver logs which suit their requirements.
+# https://docs.python.org/3/howto/logging.html#configuring-logging-for-a-library
+logging.getLogger(config.app_config.package_name).addHandler(logging.NullHandler())
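+#
+# As a minimal sketch (illustrative, not part of this package): an application
+# consuming this library would configure its own handlers at startup, e.g.
+#     import logging
+#     logging.basicConfig(level=logging.INFO)
+# and log records from this package would then propagate to those handlers.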
+
+
+with open(PACKAGE_ROOT / "VERSION") as version_file:
+ __version__ = version_file.read().strip()
diff --git a/packages/gradient_boosting_model/gradient_boosting_model/config.yml b/packages/gradient_boosting_model/gradient_boosting_model/config.yml
new file mode 100644
index 0000000..96a603e
--- /dev/null
+++ b/packages/gradient_boosting_model/gradient_boosting_model/config.yml
@@ -0,0 +1,94 @@
+# Package Overview
+package_name: gradient_boosting_model
+
+# Data Files
+training_data_file: houseprice.csv
+test_data_file: test.csv
+
+# this variable is used to calculate the temporal variable,
+# but is dropped prior to model training.
+drop_features: YrSold
+
+pipeline_name: gb_regression
+pipeline_save_file: gb_regression_output_v
+
+# Variables
+# The variable we are attempting to predict (sale price)
+target: SalePrice
+
+# Variables beginning with numbers would cause syntax errors, so we rename them
+variables_to_rename:
+ 1stFlrSF: FirstFlrSF
+ 2ndFlrSF: SecondFlrSF
+ 3SsnPorch: ThreeSsnPortch
+
+features:
+ - LotArea
+ - OverallQual
+ - YearRemodAdd
+ - BsmtQual
+ - BsmtFinSF1
+ - TotalBsmtSF
+ - FirstFlrSF
+ - SecondFlrSF
+ - GrLivArea
+ - GarageCars
+ # this one is only to calculate temporal variable:
+ - YrSold
+
+numerical_vars:
+ - LotArea
+ - OverallQual
+ - YearRemodAdd
+ - BsmtQual
+ - BsmtFinSF1
+ - TotalBsmtSF
+ - FirstFlrSF
+ - SecondFlrSF
+ - GrLivArea
+ - GarageCars
+
+categorical_vars:
+ - BsmtQual
+
+temporal_vars: YearRemodAdd
+
+# Validation
+# numerical variables with NA in train set
+numerical_vars_with_na:
+ - LotFrontage
+
+numerical_na_not_allowed:
+ - LotArea
+ - OverallQual
+ - YearRemodAdd
+ - BsmtFinSF1
+ - TotalBsmtSF
+ - FirstFlrSF
+ - SecondFlrSF
+ - GrLivArea
+ - GarageCars
+ - YrSold
+
+# set train/test split
+test_size: 0.1
+
+# to set the random seed
+random_state: 0
+
+# The number of boosting stages to perform
+n_estimators: 50
+
+# the minimum frequency a label should have to be considered frequent
+# and not be removed.
+rare_label_tol: 0.01
+
+# the minimum number of categories a variable should have in order for
+# the encoder to find frequent labels
+rare_label_n_categories: 5
+
+# loss function to be optimized
+loss: ls
+allowed_loss_functions:
+ - ls
+ - huber
diff --git a/packages/gradient_boosting_model/gradient_boosting_model/config/__init__.py b/packages/gradient_boosting_model/gradient_boosting_model/config/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/packages/gradient_boosting_model/gradient_boosting_model/config/core.py b/packages/gradient_boosting_model/gradient_boosting_model/config/core.py
new file mode 100644
index 0000000..a13b4bd
--- /dev/null
+++ b/packages/gradient_boosting_model/gradient_boosting_model/config/core.py
@@ -0,0 +1,118 @@
+from pathlib import Path
+import typing as t
+
+from pydantic import BaseModel, validator
+from strictyaml import load, YAML
+
+import gradient_boosting_model
+
+# Project Directories
+PACKAGE_ROOT = Path(gradient_boosting_model.__file__).resolve().parent
+ROOT = PACKAGE_ROOT.parent
+CONFIG_FILE_PATH = PACKAGE_ROOT / "config.yml"
+TRAINED_MODEL_DIR = PACKAGE_ROOT / "trained_models"
+DATASET_DIR = PACKAGE_ROOT / "datasets"
+
+
+class AppConfig(BaseModel):
+ """
+ Application-level config.
+ """
+
+ package_name: str
+ pipeline_name: str
+ pipeline_save_file: str
+ training_data_file: str
+ test_data_file: str
+
+
+class ModelConfig(BaseModel):
+ """
+ All configuration relevant to model
+ training and feature engineering.
+ """
+
+ drop_features: str
+ target: str
+ variables_to_rename: t.Dict
+ features: t.Sequence[str]
+ numerical_vars: t.Sequence[str]
+ categorical_vars: t.Sequence[str]
+ temporal_vars: str
+ numerical_vars_with_na: t.Sequence[str]
+ numerical_na_not_allowed: t.Sequence[str]
+ test_size: float
+ random_state: int
+ n_estimators: int
+ rare_label_n_categories: int
+ rare_label_tol: float
+
+ # the order is necessary for validation
+ allowed_loss_functions: t.Tuple[str, ...]
+ loss: str
+
+ @validator("loss")
+ def allowed_loss_function(cls, value, values):
+ """
+ Loss function to be optimized.
+
+ `ls` refers to least squares regression.
+ `lad` (least absolute deviation)
+ `huber` is a combination of the two.
+ `quantile` allows quantile regression.
+
+ Following the research phase, loss is restricted to
+ `ls` and `huber` for this model.
+ """
+
+ allowed_loss_functions = values.get("allowed_loss_functions")
+ if value in allowed_loss_functions:
+ return value
+ raise ValueError(
+ f"the loss parameter specified: {value}, "
+ f"is not in the allowed set: {allowed_loss_functions}"
+ )
+
+
+class Config(BaseModel):
+ """Master config object."""
+
+ app_config: AppConfig
+ model_config: ModelConfig
+
+
+def find_config_file() -> Path:
+ """Locate the configuration file."""
+ if CONFIG_FILE_PATH.is_file():
+ return CONFIG_FILE_PATH
+ raise Exception(f"Config not found at {CONFIG_FILE_PATH!r}")
+
+
+def fetch_config_from_yaml(cfg_path: t.Optional[Path] = None) -> YAML:
+ """Parse YAML containing the package configuration."""
+
+ if not cfg_path:
+ cfg_path = find_config_file()
+
+ if cfg_path:
+ with open(cfg_path, "r") as conf_file:
+ parsed_config = load(conf_file.read())
+ return parsed_config
+ raise OSError(f"Did not find config file at path: {cfg_path}")
+
+
+def create_and_validate_config(parsed_config: t.Optional[YAML] = None) -> Config:
+ """Run validation on config values."""
+ if parsed_config is None:
+ parsed_config = fetch_config_from_yaml()
+
+ # specify the data attribute from the strictyaml YAML type.
+ _config = Config(
+ app_config=AppConfig(**parsed_config.data),
+ model_config=ModelConfig(**parsed_config.data),
+ )
+
+ return _config
+
+
+config = create_and_validate_config()
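+
+# Downstream modules import this `config` object and read validated values as
+# attributes, e.g. config.app_config.package_name (used for the package logger)
+# or config.model_config.loss (used when building the model pipeline).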
diff --git a/packages/gradient_boosting_model/gradient_boosting_model/datasets/__init__.py b/packages/gradient_boosting_model/gradient_boosting_model/datasets/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/packages/gradient_boosting_model/gradient_boosting_model/pipeline.py b/packages/gradient_boosting_model/gradient_boosting_model/pipeline.py
new file mode 100644
index 0000000..fa2851a
--- /dev/null
+++ b/packages/gradient_boosting_model/gradient_boosting_model/pipeline.py
@@ -0,0 +1,118 @@
+from sklearn.impute import SimpleImputer
+from sklearn.preprocessing import OrdinalEncoder
+from sklearn.ensemble import GradientBoostingRegressor
+from sklearn.pipeline import Pipeline
+from sklearn.compose import ColumnTransformer
+from feature_engine.categorical_encoders import RareLabelCategoricalEncoder
+
+from gradient_boosting_model.processing import preprocessors as pp
+from gradient_boosting_model.config.core import config
+
+import logging
+
+
+_logger = logging.getLogger(__name__)
+
+numeric_features = [
+ 'LotArea',
+ 'OverallQual',
+ 'YearRemodAdd',
+ 'BsmtFinSF1',
+ 'TotalBsmtSF',
+ 'FirstFlrSF',
+ 'SecondFlrSF',
+ 'GrLivArea',
+ 'GarageCars',
+ # this one is only to calculate temporal variable:
+ 'YrSold',
+]
+categorical_features = ['BsmtQual']
+
+
+numeric_transformer = Pipeline(steps=[
+ ('numerical_imputer', SimpleImputer(strategy='most_frequent')),
+])
+
+categorical_transformer = Pipeline(steps=[
+ ('categorical_encoder', OrdinalEncoder()),
+])
+
+preprocessor = ColumnTransformer(
+ transformers=[
+ ('num_input', numeric_transformer, numeric_features),
+ ('cat_input', categorical_transformer, categorical_features)
+ ])
+
+classifier = GradientBoostingRegressor(
+ loss=config.model_config.loss,
+ random_state=config.model_config.random_state,
+ n_estimators=config.model_config.n_estimators)
+
+price_pipe = Pipeline(steps=[
+ ('preprocessor', preprocessor),
+ ('classifier', classifier)
+])
+
+
+# price_pipe = Pipeline(
+# [
+# # SimpleImputer included in sklearn-onnx
+# (
+# "numerical_imputer",
+# pp.SklearnTransformerWrapper(
+# variables=config.model_config.numerical_vars,
+# transformer=SimpleImputer(strategy="most_frequent"),
+# ),
+# ),
+# # SimpleImputer included in sklearn-onnx
+# # (
+# # "categorical_imputer",
+# # pp.SklearnTransformerWrapper(
+# # variables=config.model_config.categorical_vars,
+# # transformer=SimpleImputer(strategy="constant", fill_value="missing"),
+# # ),
+# # ),
+# # TODO: TemporalVariableEstimator not supported
+# # (
+# # "temporal_variable",
+# # pp.TemporalVariableEstimator(
+# # variables=config.model_config.temporal_vars,
+# # reference_variable=config.model_config.drop_features,
+# # ),
+# # ),
+# # TODO: RareLabelCategoricalEncoder not supported
+# # (
+# # "rare_label_encoder",
+# # RareLabelCategoricalEncoder(
+# # tol=config.model_config.rare_label_tol,
+# # n_categories=config.model_config.rare_label_n_categories,
+# # variables=config.model_config.categorical_vars,
+# # ),
+# # ),
+# # OrdinalEncoder is supported
+# (
+# "categorical_encoder",
+# pp.SklearnTransformerWrapper(
+# variables=config.model_config.categorical_vars,
+# transformer=OrdinalEncoder(),
+# ),
+# ),
+# # TODO: RareLabelCategoricalEncoder not supported
+# (
+# "drop_features",
+# pp.DropUnecessaryFeatures(
+# variables_to_drop=config.model_config.drop_features,
+# ),
+# ),
+# # GradientBoostingRegressor is supported
+# (
+# "gb_model",
+# GradientBoostingRegressor(
+# loss=config.model_config.loss,
+# random_state=config.model_config.random_state,
+# n_estimators=config.model_config.n_estimators,
+# ),
+# ),
+# ]
+# )
diff --git a/packages/gradient_boosting_model/gradient_boosting_model/predict.py b/packages/gradient_boosting_model/gradient_boosting_model/predict.py
new file mode 100644
index 0000000..48331a5
--- /dev/null
+++ b/packages/gradient_boosting_model/gradient_boosting_model/predict.py
@@ -0,0 +1,37 @@
+import logging
+import typing as t
+
+import pandas as pd
+import numpy as np
+
+from gradient_boosting_model import __version__ as _version
+from gradient_boosting_model.config.core import config
+from gradient_boosting_model.processing.data_management import load_pipeline
+from gradient_boosting_model.processing.validation import validate_inputs
+
+_logger = logging.getLogger(__name__)
+
+pipeline_file_name = f"{config.app_config.pipeline_save_file}{_version}.pkl"
+_price_session = load_pipeline(file_name=pipeline_file_name)
+
+
+def make_prediction(*, input_data: t.Union[pd.DataFrame, dict],) -> dict:
+ """Make a prediction using a saved model pipeline."""
+
+ data = pd.DataFrame(input_data)
+ validated_data, errors = validate_inputs(input_data=data)
+ results = {"predictions": None, "version": _version, "errors": errors}
+
+ if not errors:
+
+ # The dict keys passed to run need to match what we pass to
+ # convert_sklearn
+ predictions = _price_session.run(None, {
+ "input": validated_data[config.model_config.features].astype(np.float32)})[0]
+ _logger.info(
+ f"Making predictions with model version: {_version} "
+ f"Predictions: {predictions}"
+ )
+ results = {"predictions": predictions, "version": _version, "errors": errors}
+
+ return results
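+
+
+# A minimal usage sketch (the field values below are hypothetical and shown
+# for illustration only; a real payload must contain every field required by
+# HouseDataInputSchema):
+#
+#     result = make_prediction(
+#         input_data={"LotArea": [8450], "OverallQual": [7], "BsmtQual": ["Gd"]}
+#     )
+#     result["predictions"]  # None if validation errors were reported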
diff --git a/packages/gradient_boosting_model/gradient_boosting_model/processing/__init__.py b/packages/gradient_boosting_model/gradient_boosting_model/processing/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/packages/gradient_boosting_model/gradient_boosting_model/processing/data_management.py b/packages/gradient_boosting_model/gradient_boosting_model/processing/data_management.py
new file mode 100644
index 0000000..3745222
--- /dev/null
+++ b/packages/gradient_boosting_model/gradient_boosting_model/processing/data_management.py
@@ -0,0 +1,99 @@
+import pandas as pd
+import joblib
+from sklearn.pipeline import Pipeline
+from skl2onnx import convert_sklearn
+from skl2onnx.common.data_types import FloatTensorType, Int64TensorType, StringTensorType
+import onnxruntime as rt
+
+from gradient_boosting_model.config.core import config, DATASET_DIR, TRAINED_MODEL_DIR
+from gradient_boosting_model import __version__ as _version
+
+import logging
+import typing as t
+
+
+_logger = logging.getLogger(__name__)
+
+
+def load_dataset(*, file_name: str) -> pd.DataFrame:
+ dataframe = pd.read_csv(f"{DATASET_DIR}/{file_name}")
+
+ # rename variables beginning with numbers to avoid syntax errors later
+ transformed = dataframe.rename(columns=config.model_config.variables_to_rename)
+ return transformed
+
+
+def convert_dataframe_schema(df, drop=None):
+ inputs = []
+ for k, v in zip(df.columns, df.dtypes):
+ if drop is not None and k in drop:
+ continue
+ if v == "int64":
+ t = Int64TensorType([1, 1])
+ elif v == "float64":
+ t = FloatTensorType([1, 1])
+ else:
+ t = StringTensorType([1, 1])
+ inputs.append((k, t))
+ return inputs
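+
+# For illustration, a dataframe with an int64 column "LotArea", a float64
+# column "GarageCars" and an object column "BsmtQual" would map to:
+#     [("LotArea", Int64TensorType([1, 1])),
+#      ("GarageCars", FloatTensorType([1, 1])),
+#      ("BsmtQual", StringTensorType([1, 1]))]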
+
+
+def save_pipeline(*, pipeline_to_persist: Pipeline, inputs) -> None:
+ """Persist the pipeline.
+
+ Saves the versioned model, and overwrites any previous
+ saved models. This ensures that when the package is
+ published, there is only one trained model that can be
+ called, and we know exactly how it was built.
+ """
+
+ # Prepare versioned save file name
+ save_file_name = f"{config.app_config.pipeline_save_file}{_version}.onnx"
+ save_path = TRAINED_MODEL_DIR / save_file_name
+ remove_old_pipelines(files_to_keep=[save_file_name])
+
+ # Convert into ONNX format
+
+ # This is going to need to match features
+ # - LotArea - Integer
+ # - OverallQual - Integer
+ # - YearRemodAdd - Integer
+ # - BsmtQual - Str
+ # - BsmtFinSF1 - Float
+ # - TotalBsmtSF - Float
+ # - FirstFlrSF - Integer
+ # - SecondFlrSF - Integer
+ # - GrLivArea - Integer
+ # - GarageCars - Float
+ # # this one is only to calculate temporal variable:
+ # - YrSold - Integer
+
+ onx = convert_sklearn(pipeline_to_persist, initial_types=inputs)
+ with open(save_path, "wb") as f:
+ f.write(onx.SerializeToString())
+
+ #joblib.dump(pipeline_to_persist, save_path)
+ _logger.info(f"saved pipeline: {save_file_name}")
+
+
+def load_pipeline(*, file_name: str) -> rt.InferenceSession:
+ """Load a persisted pipeline as an ONNX inference session."""
+
+ file_path = TRAINED_MODEL_DIR / file_name
+ session = rt.InferenceSession(str(file_path))
+ return session
+
+
+def remove_old_pipelines(*, files_to_keep: t.List[str]) -> None:
+ """
+ Remove old model pipelines.
+
+ This is to ensure there is a simple one-to-one
+ mapping between the package version and the model
+ version to be imported and used by other applications.
+ """
+ do_not_delete = files_to_keep + ["__init__.py"]
+ for model_file in TRAINED_MODEL_DIR.iterdir():
+ if model_file.name not in do_not_delete:
+ model_file.unlink()
diff --git a/packages/gradient_boosting_model/gradient_boosting_model/processing/errors.py b/packages/gradient_boosting_model/gradient_boosting_model/processing/errors.py
new file mode 100644
index 0000000..b924254
--- /dev/null
+++ b/packages/gradient_boosting_model/gradient_boosting_model/processing/errors.py
@@ -0,0 +1,6 @@
+class BaseError(Exception):
+ """Base package error."""
+
+
+class InvalidModelInputError(BaseError):
+ """Model input contains an error."""
diff --git a/packages/gradient_boosting_model/gradient_boosting_model/processing/preprocessors.py b/packages/gradient_boosting_model/gradient_boosting_model/processing/preprocessors.py
new file mode 100644
index 0000000..f7f103b
--- /dev/null
+++ b/packages/gradient_boosting_model/gradient_boosting_model/processing/preprocessors.py
@@ -0,0 +1,81 @@
+import pandas as pd
+from sklearn.base import BaseEstimator, TransformerMixin
+
+
+class SklearnTransformerWrapper(BaseEstimator, TransformerMixin):
+ """
+ Wrapper for Scikit-learn pre-processing transformers,
+ like the SimpleImputer() or OrdinalEncoder(), to allow
+ the use of the transformer on a selected group of variables.
+ """
+
+ def __init__(self, variables=None, transformer=None):
+
+ if not isinstance(variables, list):
+ self.variables = [variables]
+ else:
+ self.variables = variables
+
+ self.transformer = transformer
+
+ def fit(self, X: pd.DataFrame, y: pd.Series = None):
+ """
+ The `fit` method allows scikit-learn transformers to
+ learn the required parameters from the training data set.
+ """
+
+ self.transformer.fit(X[self.variables])
+ return self
+
+ def transform(self, X: pd.DataFrame) -> pd.DataFrame:
+ """Apply the transforms to the dataframe."""
+ X = X.copy()
+ X[self.variables] = self.transformer.transform(X[self.variables])
+ return X
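+
+# Illustrative example: wrap a SimpleImputer so that it learns from, and
+# transforms, only the listed columns while the rest of the dataframe
+# passes through unchanged:
+#     SklearnTransformerWrapper(
+#         variables=["LotFrontage", "GarageYrBlt"],
+#         transformer=SimpleImputer(strategy="most_frequent"),
+#     )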
+
+
+class TemporalVariableEstimator(BaseEstimator, TransformerMixin):
+ """Calculates the time difference between 2 temporal variables."""
+
+ def __init__(self, variables=None, reference_variable=None):
+ if not isinstance(variables, list):
+ self.variables = [variables]
+ else:
+ self.variables = variables
+
+ self.reference_variables = reference_variable
+
+ def fit(self, X, y=None):
+ """
+ The `fit` method is necessary to accommodate the
+ scikit-learn pipeline functionality.
+ """
+
+ return self
+
+ def transform(self, X):
+ X = X.copy()
+ for feature in self.variables:
+ X[feature] = X[self.reference_variables] - X[feature]
+
+ return X
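+
+# Illustrative example: with variables=["YearRemodAdd"] and
+# reference_variable="YrSold" (the configuration used elsewhere in this
+# package), transform replaces YearRemodAdd with YrSold - YearRemodAdd,
+# i.e. the number of years since the house was remodelled.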
+
+
+class DropUnecessaryFeatures(BaseEstimator, TransformerMixin):
+ def __init__(self, variables_to_drop=None):
+ self.variables = variables_to_drop
+
+ def fit(self, X, y=None):
+ """
+ The `fit` method is necessary to accommodate the
+ scikit-learn pipeline functionality.
+ """
+
+ return self
+
+ def transform(self, X):
+ # drop unnecesary / unused features from the data set
+ X = X.copy()
+ X = X.drop(self.variables, axis=1)
+
+ return X
diff --git a/packages/gradient_boosting_model/gradient_boosting_model/processing/validation.py b/packages/gradient_boosting_model/gradient_boosting_model/processing/validation.py
new file mode 100644
index 0000000..40339ee
--- /dev/null
+++ b/packages/gradient_boosting_model/gradient_boosting_model/processing/validation.py
@@ -0,0 +1,123 @@
+import typing as t
+
+from gradient_boosting_model.config.core import config
+
+import numpy as np
+import pandas as pd
+from marshmallow import fields, Schema, ValidationError
+
+
+class HouseDataInputSchema(Schema):
+ Alley = fields.Str(allow_none=True)
+ BedroomAbvGr = fields.Integer()
+ BldgType = fields.Str()
+ BsmtCond = fields.Str(allow_none=True)
+ BsmtExposure = fields.Str(allow_none=True)
+ BsmtFinSF1 = fields.Float(allow_none=True)
+ BsmtFinSF2 = fields.Float(allow_none=True)
+ BsmtFinType1 = fields.Str(allow_none=True)
+ BsmtFinType2 = fields.Str(allow_none=True)
+ BsmtFullBath = fields.Float(allow_none=True)
+ BsmtHalfBath = fields.Float(allow_none=True)
+ BsmtQual = fields.Str(allow_none=True)
+ BsmtUnfSF = fields.Float()
+ CentralAir = fields.Str()
+ Condition1 = fields.Str()
+ Condition2 = fields.Str()
+ Electrical = fields.Str(allow_none=True)
+ EnclosedPorch = fields.Integer()
+ ExterCond = fields.Str()
+ ExterQual = fields.Str()
+ Exterior1st = fields.Str(allow_none=True)
+ Exterior2nd = fields.Str(allow_none=True)
+ Fence = fields.Str(allow_none=True)
+ FireplaceQu = fields.Str(allow_none=True)
+ Fireplaces = fields.Integer()
+ Foundation = fields.Str()
+ FullBath = fields.Integer()
+ Functional = fields.Str(allow_none=True)
+ GarageArea = fields.Float()
+ GarageCars = fields.Float()
+ GarageCond = fields.Str(allow_none=True)
+ GarageFinish = fields.Str(allow_none=True)
+ GarageQual = fields.Str(allow_none=True)
+ GarageType = fields.Str(allow_none=True)
+ GarageYrBlt = fields.Float(allow_none=True)
+ GrLivArea = fields.Integer()
+ HalfBath = fields.Integer()
+ Heating = fields.Str()
+ HeatingQC = fields.Str()
+ HouseStyle = fields.Str()
+ Id = fields.Integer()
+ KitchenAbvGr = fields.Integer()
+ KitchenQual = fields.Str(allow_none=True)
+ LandContour = fields.Str()
+ LandSlope = fields.Str()
+ LotArea = fields.Integer()
+ LotConfig = fields.Str()
+ LotFrontage = fields.Float(allow_none=True)
+ LotShape = fields.Str()
+ LowQualFinSF = fields.Integer()
+ MSSubClass = fields.Integer()
+ MSZoning = fields.Str(allow_none=True)
+ MasVnrArea = fields.Float(allow_none=True)
+ MasVnrType = fields.Str(allow_none=True)
+ MiscFeature = fields.Str(allow_none=True)
+ MiscVal = fields.Integer()
+ MoSold = fields.Integer()
+ Neighborhood = fields.Str()
+ OpenPorchSF = fields.Integer()
+ OverallCond = fields.Integer()
+ OverallQual = fields.Integer()
+ PavedDrive = fields.Str()
+ PoolArea = fields.Integer()
+ PoolQC = fields.Str(allow_none=True)
+ RoofMatl = fields.Str()
+ RoofStyle = fields.Str()
+ SaleCondition = fields.Str()
+ SaleType = fields.Str(allow_none=True)
+ ScreenPorch = fields.Integer()
+ Street = fields.Str()
+ TotRmsAbvGrd = fields.Integer()
+ TotalBsmtSF = fields.Float()
+ Utilities = fields.Str(allow_none=True)
+ WoodDeckSF = fields.Integer()
+ YearBuilt = fields.Integer()
+ YearRemodAdd = fields.Integer()
+ YrSold = fields.Integer()
+ FirstFlrSF = fields.Integer()
+ SecondFlrSF = fields.Integer()
+ ThreeSsnPortch = fields.Integer()
+
+
+def drop_na_inputs(*, input_data: pd.DataFrame) -> pd.DataFrame:
+ """Check model inputs for na values and filter."""
+ validated_data = input_data.copy()
+ if input_data[config.model_config.numerical_na_not_allowed].isnull().any().any():
+ validated_data = validated_data.dropna(
+ axis=0, subset=config.model_config.numerical_na_not_allowed
+ )
+
+ return validated_data
+
+
+def validate_inputs(
+ *, input_data: pd.DataFrame
+) -> t.Tuple[pd.DataFrame, t.Optional[dict]]:
+ """Check model inputs for unprocessable values."""
+
+ # rename fields whose names begin with numbers and would otherwise cause syntax errors
+ input_data.rename(columns=config.model_config.variables_to_rename, inplace=True)
+ validated_data = drop_na_inputs(input_data=input_data)
+
+ # set many=True to allow passing in a list
+ schema = HouseDataInputSchema(many=True)
+ errors = None
+
+ try:
+ # replace numpy nans so that Marshmallow can validate
+ schema.load(validated_data.replace({np.nan: None}).to_dict(orient="records"))
+ except ValidationError as exc:
+ errors = exc.messages
+
+ return validated_data, errors
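+
+
+# Example usage (illustrative; a real call must supply every field
+# declared in HouseDataInputSchema):
+#
+#     df = pd.DataFrame(house_records)  # hypothetical list of dicts
+#     validated, errors = validate_inputs(input_data=df)
+#     if errors:
+#         raise ValueError(f"unprocessable inputs: {errors}")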
diff --git a/packages/gradient_boosting_model/gradient_boosting_model/train_pipeline.py b/packages/gradient_boosting_model/gradient_boosting_model/train_pipeline.py
new file mode 100755
index 0000000..704e6f3
--- /dev/null
+++ b/packages/gradient_boosting_model/gradient_boosting_model/train_pipeline.py
@@ -0,0 +1,48 @@
+from sklearn.model_selection import train_test_split
+
+from gradient_boosting_model import pipeline
+from gradient_boosting_model.processing.data_management import (
+ load_dataset,
+ save_pipeline,
+ convert_dataframe_schema
+)
+from gradient_boosting_model.config.core import config
+from gradient_boosting_model import __version__ as _version
+
+import logging
+
+
+_logger = logging.getLogger(__name__)
+
+
+def run_training() -> None:
+ """Train the model."""
+
+ # read training data
+ data = load_dataset(file_name=config.app_config.training_data_file)
+
+ # SimpleImputer is not available for string features
+ # in the ONNX-ML specification, so we impute the
+ # categorical variables beforehand.
+ for cat in config.model_config.categorical_vars:
+ data[cat].fillna("missing", inplace=True)
+
+ # divide train and test
+ X_train, X_test, y_train, y_test = train_test_split(
+ data[config.model_config.features], # predictors
+ data[config.model_config.target],
+ test_size=config.model_config.test_size,
+ # we are setting the random seed here
+ # for reproducibility
+ random_state=config.model_config.random_state,
+ )
+
+ pipeline.price_pipe.fit(X_train, y_train)
+ inputs = convert_dataframe_schema(X_train)
+
+ _logger.warning(f"saving model version: {_version}")
+ save_pipeline(pipeline_to_persist=pipeline.price_pipe, inputs=inputs)
+
+
+if __name__ == "__main__":
+ run_training()
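+
+# The tox `unit_tests` environment also invokes this module directly:
+#   python gradient_boosting_model/train_pipeline.py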
diff --git a/packages/gradient_boosting_model/gradient_boosting_model/trained_models/__init__.py b/packages/gradient_boosting_model/gradient_boosting_model/trained_models/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/packages/gradient_boosting_model/gradient_boosting_model/trained_models/gb_regression_output_v0.1.18.onnx b/packages/gradient_boosting_model/gradient_boosting_model/trained_models/gb_regression_output_v0.1.18.onnx
new file mode 100644
index 0000000..b41f986
Binary files /dev/null and b/packages/gradient_boosting_model/gradient_boosting_model/trained_models/gb_regression_output_v0.1.18.onnx differ
diff --git a/packages/gradient_boosting_model/requirements.txt b/packages/gradient_boosting_model/requirements.txt
new file mode 100644
index 0000000..43f94fe
--- /dev/null
+++ b/packages/gradient_boosting_model/requirements.txt
@@ -0,0 +1,20 @@
+# ML requirements
+numpy>=1.18.1,<1.19.0
+scikit-learn>=0.22.1,<0.23.0
+pandas>=0.25.3,<0.26.0
+feature_engine>=0.3.1,<0.4.0
+joblib>=0.14.1,<0.15.0
+
+# config parsing
+strictyaml>=1.0.5,<1.1.0
+pydantic>=1.1,<1.2
+
+# validation
+marshmallow>=3.2.2,<4.0
+
+# packaging
+setuptools>=41.4.0,<42.0.0
+wheel>=0.33.6,<0.34.0
+
+# ONNX model serialization
+skl2onnx>=1.6.0,<1.7.0
+onnxruntime>=1.1.1,<1.2.0
+onnxmltools>=1.6.0,<1.7.0
diff --git a/packages/gradient_boosting_model/setup.py b/packages/gradient_boosting_model/setup.py
new file mode 100644
index 0000000..4fffce0
--- /dev/null
+++ b/packages/gradient_boosting_model/setup.py
@@ -0,0 +1,68 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+from pathlib import Path
+
+from setuptools import find_packages, setup
+
+# Package meta-data.
+NAME = 'tid-gradient-boosting-model'
+DESCRIPTION = "Gradient boosting regression model from Train In Data."
+URL = "https://github.com/trainindata/testing-and-monitoring-ml-deployments"
+EMAIL = "christopher.samiullah@protonmail.com"
+AUTHOR = "ChristopherGS"
+REQUIRES_PYTHON = ">=3.6.0"
+
+
+# What packages are required for this module to be executed?
+def list_reqs(fname="requirements.txt"):
+ with open(fname) as fd:
+ return fd.read().splitlines()
+
+
+# The rest you shouldn't have to touch too much :)
+# ------------------------------------------------
+# Except, perhaps the License and Trove Classifiers!
+# If you do change the License, remember to change the
+# Trove Classifier for that!
+long_description = DESCRIPTION
+
+# Load the package's VERSION file and expose it via the `about` dict.
+about = {}
+ROOT_DIR = Path(__file__).resolve().parent
+PACKAGE_DIR = ROOT_DIR / 'gradient_boosting_model'
+with open(PACKAGE_DIR / "VERSION") as f:
+ _version = f.read().strip()
+ about["__version__"] = _version
+
+
+# Where the magic happens:
+setup(
+ name=NAME,
+ version=about["__version__"],
+ description=DESCRIPTION,
+ long_description=long_description,
+ long_description_content_type="text/markdown",
+ author=AUTHOR,
+ author_email=EMAIL,
+ python_requires=REQUIRES_PYTHON,
+ url=URL,
+ packages=find_packages(exclude=("tests",)),
+ package_data={"gradient_boosting_model": ["VERSION"]},
+ install_requires=list_reqs(),
+ extras_require={},
+ include_package_data=True,
+ license="BSD-3",
+ classifiers=[
+ # Trove classifiers
+ # Full list: https://pypi.python.org/pypi?%3Aaction=list_classifiers
+ "License :: OSI Approved :: BSD License",
+ "Programming Language :: Python",
+ "Programming Language :: Python :: 3",
+ "Programming Language :: Python :: 3.6",
+ "Programming Language :: Python :: 3.7",
+ "Programming Language :: Python :: 3.8",
+ "Programming Language :: Python :: Implementation :: CPython",
+ "Programming Language :: Python :: Implementation :: PyPy",
+ ],
+)
diff --git a/packages/gradient_boosting_model/test_requirements.txt b/packages/gradient_boosting_model/test_requirements.txt
new file mode 100644
index 0000000..993846e
--- /dev/null
+++ b/packages/gradient_boosting_model/test_requirements.txt
@@ -0,0 +1,13 @@
+-r requirements.txt
+
+# testing requirements
+pytest>=5.3.2,<6.0.0
+
+# old model for testing purposes
+# source code: https://github.com/trainindata/deploying-machine-learning-models/tree/master/packages/regression_model
+tid-regression-model>=2.0.20,<2.1.0
+
+# repo maintenance tooling
+black>=19.10b0,<20.0
+flake8>=3.7.9,<4.0
+mypy>=0.740
diff --git a/packages/gradient_boosting_model/tests/__init__.py b/packages/gradient_boosting_model/tests/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/packages/gradient_boosting_model/tests/conftest.py b/packages/gradient_boosting_model/tests/conftest.py
new file mode 100644
index 0000000..c896d72
--- /dev/null
+++ b/packages/gradient_boosting_model/tests/conftest.py
@@ -0,0 +1,34 @@
+import pytest
+from sklearn.model_selection import train_test_split
+
+from gradient_boosting_model.config.core import config
+from gradient_boosting_model.processing.data_management import load_dataset
+
+
+@pytest.fixture(scope="session")
+def pipeline_inputs():
+ # For larger datasets, here we would use a testing sub-sample.
+ data = load_dataset(file_name=config.app_config.training_data_file)
+
+ # Divide train and test
+ X_train, X_test, y_train, y_test = train_test_split(
+ data[config.model_config.features], # predictors
+ data[config.model_config.target],
+ test_size=config.model_config.test_size,
+ # we are setting the random seed here
+ # for reproducibility
+ random_state=config.model_config.random_state,
+ )
+
+ return X_train, X_test, y_train, y_test
+
+
+@pytest.fixture()
+def raw_training_data():
+ # For larger datasets, here we would use a testing sub-sample.
+ return load_dataset(file_name=config.app_config.training_data_file)
+
+
+@pytest.fixture()
+def sample_input_data():
+ return load_dataset(file_name=config.app_config.test_data_file)
diff --git a/packages/gradient_boosting_model/tests/test_config.py b/packages/gradient_boosting_model/tests/test_config.py
new file mode 100644
index 0000000..5a82241
--- /dev/null
+++ b/packages/gradient_boosting_model/tests/test_config.py
@@ -0,0 +1,124 @@
+from pathlib import Path
+
+from gradient_boosting_model.config.core import (
+ create_and_validate_config,
+ fetch_config_from_yaml,
+)
+
+import pytest
+from pydantic import ValidationError
+
+
+TEST_CONFIG_TEXT = """
+package_name: gradient_boosting_model
+training_data_file: houseprice.csv
+test_data_file: test.csv
+drop_features: YrSold
+pipeline_name: gb_regression
+pipeline_save_file: gb_regression_output_v
+target: SalePrice
+variables_to_rename:
+ foo: bar
+test_size: 0.1
+features:
+ - LotArea
+numerical_vars:
+ - LotArea
+categorical_vars:
+ - BsmtQual
+temporal_vars: YearRemodAdd
+numerical_vars_with_na:
+ - LotFrontage
+numerical_na_not_allowed:
+ - LotArea
+random_state: 0
+n_estimators: 50
+rare_label_tol: 0.01
+rare_label_n_categories: 5
+loss: ls
+allowed_loss_functions:
+ - ls
+ - huber
+"""
+
+INVALID_TEST_CONFIG_TEXT = """
+package_name: gradient_boosting_model
+training_data_file: houseprice.csv
+test_data_file: test.csv
+drop_features: YrSold
+pipeline_name: gb_regression
+pipeline_save_file: gb_regression_output_v
+target: SalePrice
+features:
+ - LotArea
+numerical_vars:
+ - LotArea
+categorical_vars:
+ - BsmtQual
+temporal_vars: YearRemodAdd
+numerical_vars_with_na:
+ - LotFrontage
+numerical_na_not_allowed:
+ - LotArea
+random_state: 0
+n_estimators: 50
+rare_label_tol: 0.01
+rare_label_n_categories: 5
+loss: ls
+allowed_loss_functions:
+ - huber
+"""
+
+
+def test_fetch_config_structure(tmpdir):
+ # Given
+ # We make use of the pytest built-in tmpdir fixture
+ configs_dir = Path(tmpdir)
+ config_1 = configs_dir / "sample_config.yml"
+ config_1.write_text(TEST_CONFIG_TEXT)
+ parsed_config = fetch_config_from_yaml(cfg_path=config_1)
+
+ # When
+ config = create_and_validate_config(parsed_config=parsed_config)
+
+ # Then
+ assert config.model_config
+ assert config.app_config
+
+
+def test_config_validation_raises_error_for_invalid_config(tmpdir):
+ # Given
+ # We make use of the pytest built-in tmpdir fixture
+ configs_dir = Path(tmpdir)
+ config_1 = configs_dir / "sample_config.yml"
+
+ # invalid config attempts to set a prohibited loss
+ # function which we validate against an allowed set of
+ # loss function parameters.
+ config_1.write_text(INVALID_TEST_CONFIG_TEXT)
+ parsed_config = fetch_config_from_yaml(cfg_path=config_1)
+
+ # When
+ with pytest.raises(ValidationError) as excinfo:
+ create_and_validate_config(parsed_config=parsed_config)
+
+ # Then
+ assert "not in the allowed set" in str(excinfo.value)
+
+
+def test_missing_config_field_raises_validation_error(tmpdir):
+ # Given
+ # We make use of the pytest built-in tmpdir fixture
+ configs_dir = Path(tmpdir)
+ config_1 = configs_dir / "sample_config.yml"
+ TEST_CONFIG_TEXT = """package_name: gradient_boosting_model"""
+ config_1.write_text(TEST_CONFIG_TEXT)
+ parsed_config = fetch_config_from_yaml(cfg_path=config_1)
+
+ # When
+ with pytest.raises(ValidationError) as excinfo:
+ create_and_validate_config(parsed_config=parsed_config)
+
+ # Then
+ assert "field required" in str(excinfo.value)
+ assert "pipeline_name" in str(excinfo.value)
diff --git a/packages/gradient_boosting_model/tests/test_data_management.py b/packages/gradient_boosting_model/tests/test_data_management.py
new file mode 100644
index 0000000..6bd3181
--- /dev/null
+++ b/packages/gradient_boosting_model/tests/test_data_management.py
@@ -0,0 +1,25 @@
+from gradient_boosting_model.train_pipeline import run_training
+from gradient_boosting_model.processing.data_management import load_pipeline
+from gradient_boosting_model.config.core import config
+from gradient_boosting_model import __version__ as _version
+
+from sklearn.pipeline import Pipeline
+from skl2onnx import convert_sklearn, supported_converters
+
+
+def test_can_load_onnx_format():
+ # Given
+ from skl2onnx.common.data_types import onnx_built_with_ml
+
+ # confirm skl2onnx was built with the ONNX-ML operators available
+ assert onnx_built_with_ml()
+
+ pipeline_file_name = f"{config.app_config.pipeline_save_file}{_version}.onnx"
+ print(f"supported converters: {supported_converters()}")
+ run_training()
+
+ # When
+ pipe = load_pipeline(file_name=pipeline_file_name)
+
+ # Then
+ assert isinstance(pipe, Pipeline)
diff --git a/packages/gradient_boosting_model/tests/test_pipeline.py b/packages/gradient_boosting_model/tests/test_pipeline.py
new file mode 100644
index 0000000..3820995
--- /dev/null
+++ b/packages/gradient_boosting_model/tests/test_pipeline.py
@@ -0,0 +1,54 @@
+from gradient_boosting_model import pipeline
+from gradient_boosting_model.config.core import config
+from gradient_boosting_model.processing.validation import validate_inputs
+
+
+def test_pipeline_drops_unnecessary_features(pipeline_inputs):
+ # Given
+ X_train, X_test, y_train, y_test = pipeline_inputs
+ assert config.model_config.drop_features in X_train.columns
+
+ # When
+ # We use the scikit-learn Pipeline private method `_fit` which is called
+ # by the `fit` method, since this allows us to access the transformed
+ # dataframe. For other models we could use the `transform` method, but
+ # the GradientBoostingRegressor does not have a `transform` method.
+ X_transformed, _ = pipeline.price_pipe._fit(X_train, y_train)
+
+ # Then
+ assert config.model_config.drop_features in X_train.columns
+ assert config.model_config.drop_features not in X_transformed.columns
+
+
+def test_pipeline_transforms_temporal_features(pipeline_inputs):
+ # Given
+ X_train, X_test, y_train, y_test = pipeline_inputs
+
+ # When
+ # We use the scikit-learn Pipeline private method `_fit` which is called
+ # by the `fit` method, since this allows us to access the transformed
+ # dataframe. For other models we could use the `transform` method, but
+ # the GradientBoostingRegressor does not have a `transform` method.
+ X_transformed, _ = pipeline.price_pipe._fit(X_train, y_train)
+
+ # Then
+ assert (
+ X_transformed.iloc[0]["YearRemodAdd"]
+ == X_train.iloc[0]["YrSold"] - X_train.iloc[0]["YearRemodAdd"]
+ )
+
+
+def test_pipeline_predict_takes_validated_input(pipeline_inputs, sample_input_data):
+ # Given
+ X_train, X_test, y_train, y_test = pipeline_inputs
+ pipeline.price_pipe.fit(X_train, y_train)
+
+ # When
+ validated_inputs, errors = validate_inputs(input_data=sample_input_data)
+ predictions = pipeline.price_pipe.predict(
+ validated_inputs[config.model_config.features]
+ )
+
+ # Then
+ assert predictions is not None
+ assert errors is None
diff --git a/packages/gradient_boosting_model/tests/test_predict.py b/packages/gradient_boosting_model/tests/test_predict.py
new file mode 100644
index 0000000..5be393e
--- /dev/null
+++ b/packages/gradient_boosting_model/tests/test_predict.py
@@ -0,0 +1,62 @@
+from gradient_boosting_model.predict import make_prediction
+from gradient_boosting_model.config.core import config
+
+from sklearn.metrics import mean_squared_error
+
+from regression_model.predict import make_prediction as alt_make_prediction
+
+
+def test_prediction_quality_against_benchmark(raw_training_data):
+ # Given
+ input_df = raw_training_data.drop(config.model_config.target, axis=1)
+ output_df = raw_training_data[config.model_config.target]
+
+ # Generate rough benchmarks (you would tweak depending on your model)
+ benchmark_flexibility = 50000
+ # setting ndigits to -4 will round the value to the nearest 10,000 i.e. 210,000
+ benchmark_lower_boundary = (
+ round(output_df.iloc[0], ndigits=-4) - benchmark_flexibility
+ ) # 210,000 - 50000 = 160000
+ benchmark_upper_boundary = (
+ round(output_df.iloc[0], ndigits=-4) + benchmark_flexibility
+ ) # 210000 + 50000 = 260000
+
+ # When
+ subject = make_prediction(input_data=input_df[0:1])
+
+ # Then
+ assert subject is not None
+ prediction = subject.get("predictions")[0]
+ assert isinstance(prediction, float)
+ assert prediction > benchmark_lower_boundary
+ assert prediction < benchmark_upper_boundary
+
+
+def test_prediction_quality_against_another_model(raw_training_data):
+ # Given
+ input_df = raw_training_data.drop(config.model_config.target, axis=1)
+ output_df = raw_training_data[config.model_config.target]
+ current_predictions = make_prediction(input_data=input_df)
+
+ # the older model expects the original field names, which begin
+ # with digits, so we rename the columns back before predicting
+ input_df.rename(
+ columns={
+ "FirstFlrSF": "1stFlrSF",
+ "SecondFlrSF": "2ndFlrSF",
+ "ThreeSsnPortch": "3SsnPorch",
+ },
+ inplace=True,
+ )
+ alternative_predictions = alt_make_prediction(input_data=input_df)
+
+ # When
+ current_mse = mean_squared_error(
+ y_true=output_df.values, y_pred=current_predictions["predictions"]
+ )
+
+ alternative_mse = mean_squared_error(
+ y_true=output_df.values, y_pred=alternative_predictions["predictions"]
+ )
+
+ # Then
+ assert current_mse < alternative_mse
diff --git a/packages/gradient_boosting_model/tests/test_preprocessors.py b/packages/gradient_boosting_model/tests/test_preprocessors.py
new file mode 100644
index 0000000..11a4900
--- /dev/null
+++ b/packages/gradient_boosting_model/tests/test_preprocessors.py
@@ -0,0 +1,37 @@
+from gradient_boosting_model.config.core import config
+from gradient_boosting_model.processing import preprocessors as pp
+
+
+def test_drop_unnecessary_features_transformer(pipeline_inputs):
+ # Given
+ X_train, X_test, y_train, y_test = pipeline_inputs
+ assert config.model_config.drop_features in X_train.columns
+
+ transformer = pp.DropUnecessaryFeatures(
+ variables_to_drop=config.model_config.drop_features,
+ )
+
+ # When
+ X_transformed = transformer.transform(X_train)
+
+ # Then
+ assert config.model_config.drop_features not in X_transformed.columns
+
+
+def test_temporal_variable_estimator(pipeline_inputs):
+ # Given
+ X_train, X_test, y_train, y_test = pipeline_inputs
+
+ transformer = pp.TemporalVariableEstimator(
+ variables=config.model_config.temporal_vars,
+ reference_variable=config.model_config.drop_features,
+ )
+
+ # When
+ X_transformed = transformer.transform(X_train)
+
+ # Then
+ assert (
+ X_transformed.iloc[0]["YearRemodAdd"]
+ == X_train.iloc[0]["YrSold"] - X_train.iloc[0]["YearRemodAdd"]
+ )
diff --git a/packages/gradient_boosting_model/tests/test_validation.py b/packages/gradient_boosting_model/tests/test_validation.py
new file mode 100644
index 0000000..b636674
--- /dev/null
+++ b/packages/gradient_boosting_model/tests/test_validation.py
@@ -0,0 +1,30 @@
+from gradient_boosting_model.processing.validation import validate_inputs
+
+
+def test_validate_inputs(sample_input_data):
+ # When
+ validated_inputs, errors = validate_inputs(input_data=sample_input_data)
+
+ # Then
+ assert not errors
+
+ # we expect that 2 rows are removed due to missing vars
+ # 1459 is the total number of rows in the test data set (test.csv)
+ # and 1457 number returned after 2 rows are filtered out.
+ assert len(sample_input_data) == 1459
+ assert len(validated_inputs) == 1457
+
+
+def test_validate_inputs_identifies_errors(sample_input_data):
+ # Given
+ test_inputs = sample_input_data.copy()
+
+ # introduce errors
+ test_inputs.at[1, "BldgType"] = 50 # we expect a string
+
+ # When
+ validated_inputs, errors = validate_inputs(input_data=test_inputs)
+
+ # Then
+ assert errors
+ assert errors[1] == {"BldgType": ["Not a valid string."]}
diff --git a/packages/gradient_boosting_model/tox.ini b/packages/gradient_boosting_model/tox.ini
new file mode 100644
index 0000000..e898072
--- /dev/null
+++ b/packages/gradient_boosting_model/tox.ini
@@ -0,0 +1,51 @@
+[tox]
+envlist = unit_tests,typechecks,stylechecks
+skipsdist = True
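+
+# Usage (assuming tox is installed): `tox` runs every environment in
+# envlist; `tox -e unit_tests` runs a single environment.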
+
+
+[testenv]
+install_command = pip install {opts} {packages}
+deps =
+ -rtest_requirements.txt
+
+commands=
+ py.test
+
+
+[testenv:unit_tests]
+envdir = {toxworkdir}/unit_tests
+deps =
+ {[testenv]deps}
+
+setenv =
+ PYTHONPATH=.
+
+commands =
+ python gradient_boosting_model/train_pipeline.py
+ pytest \
+ -s \
+ -vv \
+ {posargs:tests/}
+
+
+[testenv:typechecks]
+envdir = {toxworkdir}/unit_tests
+
+deps =
+ {[testenv:unit_tests]deps}
+
+commands = {posargs:mypy gradient_boosting_model}
+
+
+[testenv:stylechecks]
+envdir = {toxworkdir}/unit_tests
+
+deps =
+ {[testenv:unit_tests]deps}
+
+commands = {posargs:flake8 gradient_boosting_model tests}
+
+
+[flake8]
+exclude = .git,env
+max-line-length = 90
\ No newline at end of file
diff --git a/packages/ml_api/.dockerignore b/packages/ml_api/.dockerignore
new file mode 100644
index 0000000..26ab026
--- /dev/null
+++ b/packages/ml_api/.dockerignore
@@ -0,0 +1,20 @@
+exercise_notebooks/*
+*env*
+*venv*
+.circleci*
+packages/gradient_boosting_model
+*.env
+*.log
+.git
+.gitignore
+.dockerignore
+*.mypy_cache
+*.pytest_cache
+*.tox
+
+# alembic: re-include env.py (otherwise excluded by the *env* pattern above)
+!alembic/env.py
+
+# Byte-compiled / optimized / DLL files
+*__pycache__*
+*.py[cod]
\ No newline at end of file
diff --git a/packages/ml_api/Makefile b/packages/ml_api/Makefile
new file mode 100644
index 0000000..e82ee68
--- /dev/null
+++ b/packages/ml_api/Makefile
@@ -0,0 +1,34 @@
+# For details on Makefiles, see the section notes.
+NAME=ml_api
+VERSION=$(shell git rev-parse HEAD)
+REPO=UPDATEME
+PASSWORD=UPDATEME
+
+# Specify phony list to ensure make recipes do not conflict with real file names
+.PHONY: run-service-development run-service-wsgi tag-push-master tag-push-local db-migrations
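+
+# Example invocations (from packages/ml_api, with dependencies installed):
+#   make run-service-wsgi    # serve the API with gunicorn on port 5000
+#   make db-migrations       # apply the alembic migrations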
+
+
+tag-push-local:
+ @echo "+ $@"
+ docker login --username $(REPO) --password $(PASSWORD)
+ env TARGET=$(VERSION) docker-compose -f docker/docker-compose-ci-candidate.yml build
+ docker push $(REPO)/$(NAME):$(VERSION)
+
+tag-push-master:
+ @echo "+ $@"
+ docker login --username $(REPO) --password $(PASSWORD)
+ env TARGET=master docker-compose -f docker/docker-compose-ci-master.yml build
+ docker push $(REPO)/$(NAME):master
+
+# start up Flask API service
+run-service-development:
+ @echo "+ $@"
+ python run.py
+
+run-service-wsgi:
+ @echo "+ $@"
+ gunicorn --workers=1 --bind 0.0.0.0:5000 run:application
+
+db-migrations:
+ @echo "+ $@"
+ PYTHONPATH=. alembic -c alembic.ini upgrade head
diff --git a/packages/ml_api/__init__.py b/packages/ml_api/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/packages/ml_api/alembic.ini b/packages/ml_api/alembic.ini
new file mode 100644
index 0000000..604e701
--- /dev/null
+++ b/packages/ml_api/alembic.ini
@@ -0,0 +1,49 @@
+# A generic, single database configuration.
+
+[alembic]
+# path to migration scripts
+script_location = alembic
+
+# timezone to use when rendering the date
+# within the migration file as well as the filename.
+# string value is passed to dateutil.tz.gettz()
+# leave blank for localtime
+timezone = UTC
+
+sqlalchemy.url = VALUE_IS_SET_AT_RUNTIME
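+# (alembic/env.py overrides this value at runtime, via the ALEMBIC_DB_URI
+# environment variable or the application config)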
+
+
+# Logging configuration
+[loggers]
+keys = root,sqlalchemy,alembic
+
+[handlers]
+keys = console
+
+[formatters]
+keys = generic
+
+[logger_root]
+level = WARN
+handlers = console
+qualname =
+
+[logger_sqlalchemy]
+level = WARN
+handlers =
+qualname = sqlalchemy.engine
+
+[logger_alembic]
+level = INFO
+handlers =
+qualname = alembic
+
+[handler_console]
+class = StreamHandler
+args = (sys.stderr,)
+level = NOTSET
+formatter = generic
+
+[formatter_generic]
+format = %(levelname)-5.5s [%(name)s] %(message)s
+datefmt = %H:%M:%S
diff --git a/packages/ml_api/alembic/env.py b/packages/ml_api/alembic/env.py
new file mode 100644
index 0000000..377cb2e
--- /dev/null
+++ b/packages/ml_api/alembic/env.py
@@ -0,0 +1,63 @@
+import os
+
+from alembic import context
+from sqlalchemy import engine_from_config, pool
+
+# Import the models so the changes in them are automatically reflected in the
+# generated migrations.
+from api.persistence import models # noqa
+from api.config import DevelopmentConfig as user_config
+from api.persistence.core import Base
+
+# this is the Alembic Config object, which provides
+# access to the values within the .ini file in use.
+config = context.config
+database_url = os.environ.get("ALEMBIC_DB_URI", user_config.SQLALCHEMY_DATABASE_URI)
+config.set_main_option("sqlalchemy.url", database_url)
+
+# add your model's MetaData object here
+# for 'autogenerate' support
+target_metadata = Base.metadata
+
+
+def run_migrations_offline():
+ """Run migrations in 'offline' mode.
+ This configures the context with just a URL
+ and not an Engine, though an Engine is acceptable
+ here as well. By skipping the Engine creation
+ we don't even need a DBAPI to be available.
+ Calls to context.execute() here emit the given string to the
+ script output.
+ """
+ url = config.get_main_option("sqlalchemy.url")
+ context.configure(
+ url=url, target_metadata=target_metadata, literal_binds=True,
+ )
+
+ with context.begin_transaction():
+ context.run_migrations()
+
+
+def run_migrations_online():
+ """Run migrations in 'online' mode.
+ In this scenario we need to create an Engine
+ and associate a connection with the context.
+ """
+ alembic_config = config.get_section(config.config_ini_section)
+ connectable = engine_from_config(
+ alembic_config, prefix="sqlalchemy.", poolclass=pool.NullPool,
+ )
+
+ with connectable.connect() as connection:
+ context.configure(
+ connection=connection, target_metadata=target_metadata,
+ )
+
+ with context.begin_transaction():
+ context.run_migrations()
+
+
+if context.is_offline_mode():
+ run_migrations_offline()
+else:
+ run_migrations_online()
diff --git a/packages/ml_api/alembic/script.py.mako b/packages/ml_api/alembic/script.py.mako
new file mode 100644
index 0000000..2c01563
--- /dev/null
+++ b/packages/ml_api/alembic/script.py.mako
@@ -0,0 +1,24 @@
+"""${message}
+
+Revision ID: ${up_revision}
+Revises: ${down_revision | comma,n}
+Create Date: ${create_date}
+
+"""
+from alembic import op
+import sqlalchemy as sa
+${imports if imports else ""}
+
+# revision identifiers, used by Alembic.
+revision = ${repr(up_revision)}
+down_revision = ${repr(down_revision)}
+branch_labels = ${repr(branch_labels)}
+depends_on = ${repr(depends_on)}
+
+
+def upgrade():
+ ${upgrades if upgrades else "pass"}
+
+
+def downgrade():
+ ${downgrades if downgrades else "pass"}
diff --git a/packages/ml_api/alembic/versions/cf4abb13368d_create_prediction_tables.py b/packages/ml_api/alembic/versions/cf4abb13368d_create_prediction_tables.py
new file mode 100644
index 0000000..a26fb19
--- /dev/null
+++ b/packages/ml_api/alembic/versions/cf4abb13368d_create_prediction_tables.py
@@ -0,0 +1,78 @@
+"""create prediction tables
+
+Revision ID: cf4abb13368d
+Revises:
+Create Date: 2019-12-15 14:54:07.857500+00:00
+
+"""
+from alembic import op
+import sqlalchemy as sa
+from sqlalchemy.dialects import postgresql
+
+# revision identifiers, used by Alembic.
+revision = "cf4abb13368d"
+down_revision = None
+branch_labels = None
+depends_on = None
+
+
+def upgrade():
+ # ### commands auto generated by Alembic - please adjust! ###
+ op.create_table(
+ "gradient_boosting_model_predictions",
+ sa.Column("id", sa.Integer(), nullable=False),
+ sa.Column("user_id", sa.String(length=36), nullable=False),
+ sa.Column(
+ "datetime_captured",
+ sa.DateTime(timezone=True),
+ server_default=sa.text("now()"),
+ nullable=True,
+ ),
+ sa.Column("model_version", sa.String(length=36), nullable=False),
+ sa.Column("inputs", postgresql.JSONB(astext_type=sa.Text()), nullable=True),
+ sa.Column("outputs", postgresql.JSONB(astext_type=sa.Text()), nullable=True),
+ sa.PrimaryKeyConstraint("id"),
+ )
+ op.create_index(
+ op.f("ix_gradient_boosting_model_predictions_datetime_captured"),
+ "gradient_boosting_model_predictions",
+ ["datetime_captured"],
+ unique=False,
+ )
+ op.create_table(
+ "regression_model_predictions",
+ sa.Column("id", sa.Integer(), nullable=False),
+ sa.Column("user_id", sa.String(length=36), nullable=False),
+ sa.Column(
+ "datetime_captured",
+ sa.DateTime(timezone=True),
+ server_default=sa.text("now()"),
+ nullable=True,
+ ),
+ sa.Column("model_version", sa.String(length=36), nullable=False),
+ sa.Column("inputs", postgresql.JSONB(astext_type=sa.Text()), nullable=True),
+ sa.Column("outputs", postgresql.JSONB(astext_type=sa.Text()), nullable=True),
+ sa.PrimaryKeyConstraint("id"),
+ )
+ op.create_index(
+ op.f("ix_regression_model_predictions_datetime_captured"),
+ "regression_model_predictions",
+ ["datetime_captured"],
+ unique=False,
+ )
+ # ### end Alembic commands ###
+
+
+def downgrade():
+ # ### commands auto generated by Alembic - please adjust! ###
+ op.drop_index(
+ op.f("ix_regression_model_predictions_datetime_captured"),
+ table_name="regression_model_predictions",
+ )
+ op.drop_table("regression_model_predictions")
+ op.drop_index(
+ op.f("ix_gradient_boosting_model_predictions_datetime_captured"),
+ table_name="gradient_boosting_model_predictions",
+ )
+ op.drop_table("gradient_boosting_model_predictions")
+ # ### end Alembic commands ###
diff --git a/packages/ml_api/api/__init__.py b/packages/ml_api/api/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/packages/ml_api/api/app.py b/packages/ml_api/api/app.py
new file mode 100644
index 0000000..fd625f2
--- /dev/null
+++ b/packages/ml_api/api/app.py
@@ -0,0 +1,33 @@
+import logging
+
+import connexion
+from sqlalchemy.orm import scoped_session
+
+from api.config import Config
+from api.monitoring.middleware import setup_metrics
+from api.persistence.core import init_database
+
+_logger = logging.getLogger(__name__)
+
+
+def create_app(
+ *, config_object: Config, db_session: scoped_session = None
+) -> connexion.App:
+ """Create app instance."""
+
+ connexion_app = connexion.App(
+ __name__, debug=config_object.DEBUG, specification_dir="spec/"
+ )
+ flask_app = connexion_app.app
+ flask_app.config.from_object(config_object)
+
+ # Set up the database
+ init_database(flask_app, config=config_object, db_session=db_session)
+
+ # Set up Prometheus monitoring
+ setup_metrics(flask_app)
+
+ connexion_app.add_api("api.yaml")
+ _logger.info("Application instance created")
+
+ return connexion_app
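+
+
+# Illustrative wiring for a run.py entry point (a sketch matching the
+# Makefile's `gunicorn run:application` target; exact wiring may differ):
+#
+#     from api.app import create_app
+#     from api.config import DevelopmentConfig
+#
+#     application = create_app(config_object=DevelopmentConfig()).app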
diff --git a/packages/ml_api/api/config.py b/packages/ml_api/api/config.py
new file mode 100644
index 0000000..e593629
--- /dev/null
+++ b/packages/ml_api/api/config.py
@@ -0,0 +1,102 @@
+import logging
+import os
+import pathlib
+import sys
+
+import api
+
+
+# logging format
+FORMATTER = logging.Formatter(
+ "%(asctime)s — %(name)s — %(levelname)s — %(funcName)s:%(lineno)d — %(message)s"
+)
+
+# Project Directories
+ROOT = pathlib.Path(api.__file__).resolve().parent.parent
+
+APP_NAME = 'ml_api'
+
+
+class Config:
+ DEBUG = False
+ TESTING = False
+ ENV = os.getenv("FLASK_ENV", "production")
+ SERVER_PORT = int(os.getenv("SERVER_PORT", 5000))
+ SERVER_HOST = os.getenv("SERVER_HOST", "0.0.0.0")
+ LOGGING_LEVEL = os.getenv("LOGGING_LEVEL", logging.INFO)
+ # note: os.getenv returns strings, so parse the flag rather than
+ # relying on the truthiness of the raw value
+ SHADOW_MODE_ACTIVE = os.getenv("SHADOW_MODE_ACTIVE", "true").lower() in ("true", "1")
+ SQLALCHEMY_DATABASE_URI = (
+ f"postgresql+psycopg2://{os.getenv('DB_USER')}:"
+ f"{os.getenv('DB_PASSWORD')}@{os.getenv('DB_HOST')}:{os.getenv('DB_PORT', 5432)}/{os.getenv('DB_NAME')}"
+ )
+ # DB config matches docker container
+ DB_USER = os.getenv("DB_USER", "user")
+ DB_PASSWORD = os.getenv("DB_PASSWORD", "password")
+ DB_PORT = os.getenv("DB_PORT", 6609)
+ DB_HOST = os.getenv("DB_HOST", "0.0.0.0")
+ DB_NAME = os.getenv("DB_NAME", "ml_api_dev")
+
+
+class DevelopmentConfig(Config):
+ DEBUG = True
+ ENV = "development" # do not use in production!
+ LOGGING_LEVEL = logging.DEBUG
+
+
+class TestingConfig(Config):
+ DEBUG = True
+ TESTING = True
+ LOGGING_LEVEL = logging.DEBUG
+
+ # DB config matches test docker container
+ DB_USER = os.getenv("DB_USER", "test_user")
+ DB_PASSWORD = os.getenv("DB_PASSWORD", "password")
+ DB_PORT = os.getenv("DB_PORT", 6608)
+ DB_HOST = os.getenv("DB_HOST", "0.0.0.0")
+ DB_NAME = "ml_api_test"
+ SQLALCHEMY_DATABASE_URI = (
+ f"postgresql+psycopg2://{DB_USER}:"
+ f"{DB_PASSWORD}@{DB_HOST}:{DB_PORT}/{DB_NAME}"
+ )
+
+
+class ProductionConfig(Config):
+ DB_USER = os.getenv("DB_USER", "user")
+ DB_PASSWORD = os.getenv("DB_PASSWORD", "password")
+ DB_PORT = os.getenv("DB_PORT", 6609)
+ DB_HOST = os.getenv("DB_HOST", "database")
+ DB_NAME = os.getenv("DB_NAME", "ml_api")
+ SQLALCHEMY_DATABASE_URI = (
+ f"postgresql+psycopg2://{DB_USER}:"
+ f"{DB_PASSWORD}@{DB_HOST}:{DB_PORT}/{DB_NAME}"
+ )
+
+
+def get_console_handler():
+ """Set up a console logging handler."""
+ console_handler = logging.StreamHandler(sys.stdout)
+ console_handler.setFormatter(FORMATTER)
+ return console_handler
+
+
+def setup_app_logging(config: Config) -> None:
+ """Prepare custom logging for our application."""
+ _disable_irrelevant_loggers()
+ root = logging.getLogger()
+ root.setLevel(config.LOGGING_LEVEL)
+ root.addHandler(get_console_handler())
+ root.propagate = False
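+
+
+# For example, an entry point might call (sketch; exact call site may differ):
+#     setup_app_logging(config=DevelopmentConfig())
+# once at startup, so handlers are attached exactly once.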
+
+
+def _disable_irrelevant_loggers() -> None:
+ """Disable loggers created by packages which create a lot of noise."""
+ for logger_name in (
+ "connexion.apis.flask_api",
+ "connexion.apis.abstract",
+ "connexion.decorators",
+ "connexion.operation",
+ "connexion.operations",
+ "connexion.app",
+ "openapi_spec_validator",
+ ):
+ logging.getLogger(logger_name).level = logging.WARNING
diff --git a/packages/ml_api/api/controller.py b/packages/ml_api/api/controller.py
new file mode 100644
index 0000000..1cee9e3
--- /dev/null
+++ b/packages/ml_api/api/controller.py
@@ -0,0 +1,123 @@
+import json
+import logging
+import threading
+
+from flask import request, jsonify, Response, current_app
+
+from gradient_boosting_model import __version__ as shadow_version
+from regression_model import __version__ as live_version
+from prometheus_client import Histogram, Gauge, Info
+from gradient_boosting_model.predict import make_prediction
+from api.persistence.data_access import PredictionPersistence, ModelType
+from api.config import APP_NAME
+
+
+_logger = logging.getLogger(__name__)
+
+PREDICTION_TRACKER = Histogram(
+ name='house_price_prediction_dollars',
+ documentation='ML Model Prediction on House Price',
+ labelnames=['app_name', 'model_name', 'model_version']
+)
+
+PREDICTION_GAUGE = Gauge(
+ name='house_price_gauge_dollars',
+ documentation='ML Model Prediction on House Price for min max calcs',
+ labelnames=['app_name', 'model_name', 'model_version']
+)
+
+PREDICTION_GAUGE.labels(
+ app_name=APP_NAME,
+ model_name=ModelType.LASSO.name,
+ model_version=live_version)
+
+MODEL_VERSIONS = Info(
+ 'model_version_details',
+ 'Capture model version information',
+)
+
+MODEL_VERSIONS.info({
+ 'live_model': ModelType.LASSO.name,
+ 'live_version': live_version,
+ 'shadow_model': ModelType.GRADIENT_BOOSTING.name,
+ 'shadow_version': shadow_version})
+
+
+def health():
+ if request.method == "GET":
+ return jsonify({"status": "ok"})
+
+
+def predict():
+ if request.method == "POST":
+ # Step 1: Extract POST data from request body as JSON
+ json_data = request.get_json()
+
+ # Step 2a: Get and save live model predictions
+ persistence = PredictionPersistence(db_session=current_app.db_session)
+ result = persistence.make_save_predictions(
+ db_model=ModelType.LASSO, input_data=json_data
+ )
+
+ # Step 2b: Get and save shadow predictions asynchronously
+ if current_app.config.get("SHADOW_MODE_ACTIVE"):
+ _logger.debug(
+ f"Calling shadow model asynchronously: "
+ f"{ModelType.GRADIENT_BOOSTING.value}"
+ )
+ thread = threading.Thread(
+ target=persistence.make_save_predictions,
+ kwargs={
+ "db_model": ModelType.GRADIENT_BOOSTING,
+ "input_data": json_data,
+ },
+ )
+ thread.start()
+
+ # Step 3: Handle errors
+ if result.errors:
+ _logger.warning(f"errors during prediction: {result.errors}")
+ return Response(json.dumps(result.errors), status=400)
+
+ # Step 4: Monitoring
+ for _prediction in result.predictions:
+ PREDICTION_TRACKER.labels(
+ app_name=APP_NAME,
+ model_name=ModelType.LASSO.name,
+ model_version=live_version).observe(_prediction)
+ PREDICTION_GAUGE.labels(
+ app_name=APP_NAME,
+ model_name=ModelType.LASSO.name,
+ model_version=live_version).set(_prediction)
+
+ # Step 5: Prepare prediction response
+ return jsonify(
+ {
+ "predictions": result.predictions,
+ "version": result.model_version,
+ "errors": result.errors,
+ }
+ )
+
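+# Example request against the live endpoint (hypothetical host/port; a
+# complete JSON body can be found in
+# differential_tests/sample_payloads/sample_input1.json):
+#
+#   curl -X POST http://0.0.0.0:5000/v1/predictions/regression \
+#        -H "Content-Type: application/json" \
+#        -d @differential_tests/sample_payloads/sample_input1.json
+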
+
+def predict_previous():
+ if request.method == "POST":
+ # Step 1: Extract POST data from request body as JSON
+ json_data = request.get_json()
+
+ # Step 2: Access the model prediction function (also validates data)
+ result = make_prediction(input_data=json_data)
+
+ # Step 3: Handle errors
+ errors = result.get("errors")
+ if errors:
+ return Response(json.dumps(errors), status=400)
+
+ # Step 4: Split out results
+ predictions = result.get("predictions").tolist()
+ version = result.get("version")
+
+ # Step 5: Prepare prediction response
+ return jsonify(
+ {"predictions": predictions, "version": version, "errors": errors}
+ )
diff --git a/packages/ml_api/api/monitoring/__init__.py b/packages/ml_api/api/monitoring/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/packages/ml_api/api/monitoring/middleware.py b/packages/ml_api/api/monitoring/middleware.py
new file mode 100644
index 0000000..31712ef
--- /dev/null
+++ b/packages/ml_api/api/monitoring/middleware.py
@@ -0,0 +1,60 @@
+from flask import request, Flask
+from flask.wrappers import Response
+from prometheus_client import Counter, Histogram
+import time
+
+from api.config import APP_NAME
+
+
+# Counter and Histogram are examples of default metrics
+# available from the prometheus Python client.
+REQUEST_COUNT = Counter(
+ name='http_request_count',
+ documentation='App Request Count',
+ labelnames=['app_name', 'method', 'endpoint', 'http_status']
+)
+REQUEST_LATENCY = Histogram(
+ name='http_request_latency_seconds',
+ documentation='Request latency',
+ labelnames=['app_name', 'endpoint']
+)
+
+
+def start_timer() -> None:
+ """Get start time of a request."""
+ request._prometheus_metrics_request_start_time = time.time()
+
+
+def stop_timer(response: Response) -> Response:
+ """Get stop time of a request and observe its latency."""
+ request_latency = time.time() - request._prometheus_metrics_request_start_time
+ REQUEST_LATENCY.labels(
+ app_name=APP_NAME,
+ endpoint=request.path).observe(request_latency)
+ return response
+
+
+def record_request_data(response: Response) -> Response:
+ """Capture request data.
+
+ Uses the flask request object to extract information such as
+ the HTTP request method, endpoint and HTTP status.
+ """
+ REQUEST_COUNT.labels(
+ app_name=APP_NAME,
+ method=request.method,
+ endpoint=request.path,
+ http_status=response.status_code).inc()
+ return response
+
+
+def setup_metrics(app: Flask) -> None:
+ """Set up Prometheus metrics.
+
+ This function uses the flask before_request
+ and after_request hooks to capture metrics
+ with each HTTP request to the application.
+ """
+ app.before_request(start_timer)
+ app.after_request(record_request_data)
+ app.after_request(stop_timer)
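+
+
+# Note: exposing the collected metrics for scraping happens outside this
+# module; a common pattern is prometheus_client.start_http_server(port)
+# or a dedicated /metrics endpoint.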
diff --git a/packages/ml_api/api/persistence/__init__.py b/packages/ml_api/api/persistence/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/packages/ml_api/api/persistence/core.py b/packages/ml_api/api/persistence/core.py
new file mode 100644
index 0000000..7fd231a
--- /dev/null
+++ b/packages/ml_api/api/persistence/core.py
@@ -0,0 +1,67 @@
+import logging
+import os
+
+import alembic.config
+from flask import Flask
+from sqlalchemy import create_engine
+from sqlalchemy.engine import Engine
+from sqlalchemy.ext.declarative import declarative_base
+from sqlalchemy.orm import scoped_session, sessionmaker
+from sqlalchemy_utils import database_exists, create_database
+
+from api.config import Config, ROOT
+
+_logger = logging.getLogger(__name__)
+
+# Base class for SQLAlchemy models
+Base = declarative_base()
+
+
+def create_db_engine_from_config(*, config: Config) -> Engine:
+ """The Engine is the starting point for any SQLAlchemy application.
+
+ It’s “home base” for the actual database and its DBAPI, delivered to the SQLAlchemy
+ application through a connection pool and a Dialect, which describes how to talk to
+ a specific kind of database / DBAPI combination.
+ """
+
+ db_url = config.SQLALCHEMY_DATABASE_URI
+ if not database_exists(db_url):
+ create_database(db_url)
+ engine = create_engine(db_url)
+
+ _logger.info(f"creating DB conn with URI: {db_url}")
+ return engine
+
+
+def create_db_session(*, engine: Engine) -> scoped_session:
+ """Broadly speaking, the Session establishes all conversations with the database.
+
+ It represents a “holding zone” for all the objects which you’ve loaded or
+ associated with it during its lifespan.
+ """
+ return scoped_session(sessionmaker(autocommit=False, autoflush=False, bind=engine))
+
+
+def init_database(app: Flask, config: Config, db_session=None) -> None:
+ """Connect to the database and attach DB session to the app."""
+
+ if not db_session:
+ engine = create_db_engine_from_config(config=config)
+ db_session = create_db_session(engine=engine)
+
+ app.db_session = db_session
+
+ @app.teardown_appcontext
+ def shutdown_session(exception=None):
+ db_session.remove()
+
+
+def run_migrations():
+ """Run the DB migrations prior to the tests."""
+
+ # alembic looks for the migrations in the current
+ # directory so we change to the correct directory.
+ os.chdir(str(ROOT))
+ alembic_args = ["--raiseerr", "upgrade", "head"]
+ alembic.config.main(argv=alembic_args)
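+
+
+# run_migrations is intended for test setup (e.g. invoked from a pytest
+# fixture) so the schema exists before the integration tests execute.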
diff --git a/packages/ml_api/api/persistence/data_access.py b/packages/ml_api/api/persistence/data_access.py
new file mode 100644
index 0000000..4feb247
--- /dev/null
+++ b/packages/ml_api/api/persistence/data_access.py
@@ -0,0 +1,112 @@
+import enum
+import json
+import logging
+import typing as t
+
+import numpy as np
+import pandas as pd
+from gradient_boosting_model.predict import make_prediction as make_shadow_prediction
+from regression_model.predict import make_prediction as make_live_prediction
+from sqlalchemy.orm.session import Session
+
+from api.persistence.models import (
+ LassoModelPredictions,
+ GradientBoostingModelPredictions,
+)
+
+SECONDARY_VARIABLES_TO_RENAME = {
+ "FirstFlrSF": "1stFlrSF",
+ "SecondFlrSF": "2ndFlrSF",
+ "ThreeSsnPortch": "3SsnPorch",
+}
+
+_logger = logging.getLogger(__name__)
+
+
+class ModelType(enum.Enum):
+ LASSO = "lasso"
+ GRADIENT_BOOSTING = "gradient_boosting"
+
+
+class PredictionResult(t.NamedTuple):
+ errors: t.Any
+ predictions: np.ndarray
+ model_version: str
+
+
+MODEL_PREDICTION_MAP = {
+ ModelType.GRADIENT_BOOSTING: make_shadow_prediction,
+ ModelType.LASSO: make_live_prediction,
+}
+
+
+class PredictionPersistence:
+ def __init__(self, *, db_session: Session, user_id: str = None) -> None:
+ self.db_session = db_session
+ if not user_id:
+ # in reality, here we would use something like a UUID for anonymous users
+ # and if we had user logins, we would record the user ID.
+ user_id = "007"
+ self.user_id = user_id
+
+ def make_save_predictions(
+ self, *, db_model: ModelType, input_data: t.List
+ ) -> PredictionResult:
+ """Get the prediction from a given model and persist it."""
+ # Access the model prediction function via mapping
+ if db_model == ModelType.LASSO:
+ # we have to rename a few of the columns for backwards
+ # compatibility with the regression model package.
+ live_frame = pd.DataFrame(input_data)
+ input_data = live_frame.rename(
+ columns=SECONDARY_VARIABLES_TO_RENAME
+ ).to_dict(orient="records")
+
+ result = MODEL_PREDICTION_MAP[db_model](input_data=input_data)
+ errors = None
+ try:
+ errors = result["errors"]
+ except KeyError:
+ # regression model `make_prediction` does not include errors
+ pass
+
+ prediction_result = PredictionResult(
+ errors=errors,
+ predictions=result.get("predictions").tolist() if not errors else None,
+ model_version=result.get("version"),
+ )
+
+ if prediction_result.errors:
+ return prediction_result
+
+ self.save_predictions(
+ inputs=input_data, prediction_result=prediction_result, db_model=db_model
+ )
+
+ return prediction_result
+
+ def save_predictions(
+ self,
+ *,
+ inputs: t.List,
+ prediction_result: PredictionResult,
+ db_model: ModelType,
+ ) -> None:
+ """Persist model predictions to storage."""
+ if db_model == ModelType.LASSO:
+ prediction_data = LassoModelPredictions(
+ user_id=self.user_id,
+ model_version=prediction_result.model_version,
+ inputs=json.dumps(inputs),
+ outputs=json.dumps(prediction_result.predictions),
+ )
+ else:
+ prediction_data = GradientBoostingModelPredictions(
+ user_id=self.user_id,
+ model_version=prediction_result.model_version,
+ inputs=json.dumps(inputs),
+ outputs=json.dumps(prediction_result.predictions),
+ )
+
+ self.db_session.add(prediction_data)
+ self.db_session.commit()
+ _logger.debug(f"saved data for model: {db_model}")
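+
+
+# Example usage, mirroring api/controller.py (sketch):
+#
+#     persistence = PredictionPersistence(db_session=current_app.db_session)
+#     result = persistence.make_save_predictions(
+#         db_model=ModelType.LASSO, input_data=json_data,
+#     )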
diff --git a/packages/ml_api/api/persistence/models.py b/packages/ml_api/api/persistence/models.py
new file mode 100644
index 0000000..65da0b8
--- /dev/null
+++ b/packages/ml_api/api/persistence/models.py
@@ -0,0 +1,29 @@
+from sqlalchemy import Column, String, DateTime, Integer
+from sqlalchemy.dialects.postgresql import JSONB
+from sqlalchemy.sql import func
+
+from api.persistence.core import Base
+
+
+class LassoModelPredictions(Base):
+ __tablename__ = "regression_model_predictions"
+ id = Column(Integer, primary_key=True)
+ user_id = Column(String(36), nullable=False)
+ datetime_captured = Column(
+ DateTime(timezone=True), server_default=func.now(), index=True
+ )
+ model_version = Column(String(36), nullable=False)
+ inputs = Column(JSONB)
+ outputs = Column(JSONB)
+
+
+class GradientBoostingModelPredictions(Base):
+ __tablename__ = "gradient_boosting_model_predictions"
+ id = Column(Integer, primary_key=True)
+ user_id = Column(String(36), nullable=False)
+ datetime_captured = Column(
+ DateTime(timezone=True), server_default=func.now(), index=True
+ )
+ model_version = Column(String(36), nullable=False)
+ inputs = Column(JSONB)
+ outputs = Column(JSONB)
diff --git a/packages/ml_api/api/spec/__init__.py b/packages/ml_api/api/spec/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/packages/ml_api/api/spec/api.yaml b/packages/ml_api/api/spec/api.yaml
new file mode 100644
index 0000000..84c3075
--- /dev/null
+++ b/packages/ml_api/api/spec/api.yaml
@@ -0,0 +1,147 @@
+openapi: 3.0.0
+
+info:
+ title: Spec for House Price Prediction API
+ version: '1'
+
+servers:
+- url: http://{base}:5000/
+ description: API for performing house price predictions.
+ variables:
+ base:
+ default: 0.0.0.0
+
+paths:
+ /:
+ get:
+ operationId: api.controller.health
+ responses:
+ '200':
+ description: API Health Status
+
+ /v1/predictions/regression:
+ post:
+ operationId: api.controller.predict
+ requestBody:
+ description: House details used to make price prediction
+ required: true
+ content:
+ application/json:
+ schema:
+ type: array
+ items:
+ $ref: '#/components/schemas/HouseDetails'
+ responses:
+ '200':
+ description: House Price Predictions
+ '400':
+ description: Bad request, house data validation failed
+ '5XX':
+ description: Unexpected error
+
+ /v1/predictions/gradient:
+ post:
+ operationId: api.controller.predict_previous
+ requestBody:
+ description: House details used to make price prediction
+ required: true
+ content:
+ application/json:
+ schema:
+ type: array
+ items:
+ $ref: '#/components/schemas/HouseDetails'
+ responses:
+ '200':
+ description: House Price Predictions
+ '400':
+ description: Bad request, house data validation failed
+ '5XX':
+ description: Unexpected error
+
+components:
+ schemas:
+ HouseDetails:
+ type: object
+ description: "List of the houses to get predictions for."
+ example:
+ Id: 1461
+ MSSubClass: 20
+ MSZoning: RH
+ LotFrontage: 80.0
+ LotArea: 11622
+ Street: Pave
+ Alley: null
+ LotShape: Reg
+ LandContour: Lvl
+ Utilities: AllPub
+ LotConfig: Inside
+ LandSlope: Gtl
+ Neighborhood: NAmes
+ Condition1: Feedr
+ Condition2: Norm
+ BldgType: 1Fam
+ HouseStyle: 1Story
+ OverallQual: 5
+ OverallCond: 6
+ YearBuilt: 1961
+ YearRemodAdd: 1961
+ RoofStyle: Gable
+ RoofMatl: CompShg
+ Exterior1st: VinylSd
+ Exterior2nd: VinylSd
+ MasVnrType: None
+ MasVnrArea: 0.0
+ ExterQual: TA
+ ExterCond: TA
+ Foundation: CBlock
+ BsmtQual: TA
+ BsmtCond: TA
+ BsmtExposure: null
+ BsmtFinType1: Rec
+ BsmtFinSF1: 468.0
+ BsmtFinType2: LwQ
+ BsmtFinSF2: 144.0
+ BsmtUnfSF: 270.0
+ TotalBsmtSF: 882.0
+ Heating: GasA
+ HeatingQC: TA
+ CentralAir: Y
+ Electrical: SBrkr
+ 1stFlrSF: 896
+ 2ndFlrSF: 0
+ LowQualFinSF: 0
+ GrLivArea: 896
+ BsmtFullBath: 0.0
+ BsmtHalfBath: 0.0
+ FullBath: 1
+ HalfBath: 0
+ BedroomAbvGr: 2
+ KitchenAbvGr: 1
+ KitchenQual: TA
+ TotRmsAbvGrd: 5
+ Functional: Typ
+ Fireplaces: 0
+ FireplaceQu: null
+ GarageType: Attchd
+ GarageYrBlt: 1961.0
+ GarageFinish: Unf
+ GarageCars: 1.0
+ GarageArea: 730.0
+ GarageQual: TA
+ GarageCond: TA
+ PavedDrive: Y
+ WoodDeckSF: 140
+ OpenPorchSF: 0
+ EnclosedPorch: 0
+ 3SsnPorch: 0
+ ScreenPorch: 120
+ PoolArea: 0
+ PoolQC: null
+ Fence: MnPrv
+ MiscFeature: null
+ MiscVal: 0
+ MoSold: 6
+ YrSold: 2010
+ SaleType: WD
+ SaleCondition: Normal
diff --git a/packages/ml_api/differential_tests/__init__.py b/packages/ml_api/differential_tests/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/packages/ml_api/differential_tests/__main__.py b/packages/ml_api/differential_tests/__main__.py
new file mode 100644
index 0000000..cf86222
--- /dev/null
+++ b/packages/ml_api/differential_tests/__main__.py
@@ -0,0 +1,100 @@
+import json
+from argparse import ArgumentParser, Namespace
+from pathlib import Path
+from typing import Mapping
+
+from differential_tests.compare import compare_predictions
+from api.config import ROOT
+
+from termcolor import cprint
+from yarl import URL
+import requests
+
+Marginals = Mapping[str, Mapping[str, float]]
+
+
+def parse_args() -> Namespace:
+ parser = ArgumentParser()
+
+ subparsers = parser.add_subparsers(dest="command")
+
+ compute_parser = subparsers.add_parser(
+ "compute", help="Compute the predictions for a test set"
+ )
+ compute_parser.add_argument(
+ "--base-url",
+ default=URL("http://0.0.0.0:5000"),
+ type=URL,
+ help="Base URL of the service to test",
+ )
+ compute_parser.add_argument(
+ "tests_dir", type=Path, help="Directory containing the test set to use"
+ )
+ compute_parser.add_argument(
+ "results_dir", type=Path, help="Directory to save the prediction results to"
+ )
+
+ compare_parser = subparsers.add_parser(
+ "compare", help="Compare the actual results with the expected results"
+ )
+ compare_parser.add_argument(
+ "--absolute-tolerance",
+ dest="abs_tol",
+ metavar="X",
+ type=float,
+ help="math.isclose(a, b, abs_tol=X)",
+ default=1e-5,
+ )
+ compare_parser.add_argument(
+ "--relative-tolerance",
+ dest="rel_tol",
+ metavar="X",
+ type=float,
+ default=1e-5,
+ help="math.isclose(a, b, rel_tol=X)",
+ )
+ compare_parser.add_argument(
+ "expected_results_dir",
+ type=Path,
+ help="Directory containing the expected results",
+ )
+ compare_parser.add_argument(
+ "actual_results_dir", type=Path, help="Directory containing the actual results"
+ )
+
+ return parser.parse_args()
+
+
+def main(args: Namespace) -> None:
+ if args.command == "compute":
+ compute_predictions(args)
+ elif args.command == "compare":
+ compare_predictions(args)
+
+
+def compute_predictions(args: Namespace) -> None:
+ print("computing")
+
+ diff_test_dir = ROOT / "differential_tests"
+ results_dir = args.results_dir
+ results_dir.mkdir(parents=True, exist_ok=True)
+ prepared_test_dir = diff_test_dir / Path(args.tests_dir)
+
+ for test_filename in sorted(prepared_test_dir.glob("*.json")):
+ results_filename = results_dir / test_filename.name
+ print(f"Computing {results_filename} from {test_filename} ... ", end="")
+
+ with test_filename.open() as f:
+ test = json.load(f)
+
+ results = requests.post(f"{args.base_url}/v1/predictions/regression", json=test)
+
+ with results_filename.open("w") as f:
+ json.dump(results.json(), f, indent=2, sort_keys=True)
+
+ cprint("OK", "green")
+
+
+if __name__ == "__main__":
+ args = parse_args()
+ main(args)
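+
+# Example invocations (run from packages/ml_api; the results directory
+# names are assumptions based on this workflow):
+#   python -m differential_tests compute sample_payloads expected_results
+#   python -m differential_tests compare expected_results actual_results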
diff --git a/packages/ml_api/differential_tests/compare.py b/packages/ml_api/differential_tests/compare.py
new file mode 100644
index 0000000..012dd16
--- /dev/null
+++ b/packages/ml_api/differential_tests/compare.py
@@ -0,0 +1,95 @@
+import json
+import math
+import sys
+import typing as t
+from argparse import Namespace
+
+from termcolor import cprint
+
+from api.config import ROOT
+
+
+def compare_differences(
+ *,
+ expected_predictions: t.List,
+ actual_predictions: t.List,
+ rel_tol: t.Optional[float] = None,
+ abs_tol: t.Optional[float] = None,
+) -> None:
+ """
+ :param rel_tol: the relative tolerance – the maximum allowed difference
+ between a and b, relative to the larger absolute value of a or b.
+ For example, to set a tolerance of 5%, pass rel_tol=0.05. The default
+ tolerance is 1e-09, which assures that the two values are the same within
+ about 9 decimal digits. rel_tol must be greater than zero.
+
+ :param abs_tol: the minimum absolute tolerance – useful for comparisons
+ near zero. abs_tol must be at least zero.
+ """
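+ # e.g. math.isclose(100_000, 100_001, rel_tol=1e-5) is True, while
+ # math.isclose(100_000, 100_001, abs_tol=0.5) is False (rel_tol then
+ # defaults to 1e-09, far tighter than the difference of 1).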
+ only_in_expected = len(expected_predictions) - len(actual_predictions)
+
+ if only_in_expected > 0:
+ raise ValueError(f"Missing {only_in_expected} predictions")
+
+ only_in_actual = len(actual_predictions) - len(expected_predictions)
+
+ if only_in_actual > 0:
+ raise ValueError(f"Found {only_in_actual} unexpected predictions")
+
+ thresholds = {}
+
+ if abs_tol is not None:
+ thresholds["abs_tol"] = abs_tol
+
+ if rel_tol is not None:
+ thresholds["rel_tol"] = rel_tol
+
+ for index, (actual_prediction, expected_prediction) in enumerate(
+ zip(actual_predictions, expected_predictions)
+ ):
+ if not math.isclose(expected_prediction, actual_prediction, **thresholds):
+ raise ValueError(
+ f"Price prediction {index} has changed by more "
+ f"than the thresholds: {thresholds}: "
+ f"{expected_prediction} (expected) vs "
+ f"{actual_prediction} (actual)"
+ )
+
+
+def compare_predictions(args: Namespace) -> None:
+ expected_results_dir = ROOT / args.expected_results_dir
+ actual_results_dir = ROOT / args.actual_results_dir
+
+ expected_results_filenames = list(expected_results_dir.glob("*.json"))
+
+ if not expected_results_filenames:
+ print("No results found!")
+ sys.exit(1)
+
+ for expected_results_filename in sorted(expected_results_filenames):
+ name = expected_results_filename.name
+ actual_results_filename = actual_results_dir / name
+
+ print(
+ f"Comparing {expected_results_filename} with {actual_results_filename} ... ",
+ end="",
+ )
+
+ with expected_results_filename.open() as f:
+ expected_results = json.load(f)
+
+ with actual_results_filename.open() as f:
+ actual_results = json.load(f)
+
+ try:
+ compare_differences(
+ expected_predictions=expected_results["predictions"],
+ actual_predictions=actual_results["predictions"],
+ rel_tol=args.rel_tol,
+ abs_tol=args.abs_tol,
+ )
+ except ValueError as exc:
+ cprint("ERROR", "red")
+ cprint(f" • {exc}", "red")
+ else:
+ cprint("OK", "green")
diff --git a/packages/ml_api/differential_tests/sample_payloads/sample_input1.json b/packages/ml_api/differential_tests/sample_payloads/sample_input1.json
new file mode 100644
index 0000000..61f96e6
--- /dev/null
+++ b/packages/ml_api/differential_tests/sample_payloads/sample_input1.json
@@ -0,0 +1,488 @@
+[{
+ "Id": 1461,
+ "MSSubClass": 20,
+ "MSZoning": "RH",
+ "LotFrontage": 80.0,
+ "LotArea": 11622,
+ "Street": "Pave",
+ "Alley": null,
+ "LotShape": "Reg",
+ "LandContour": "Lvl",
+ "Utilities": "AllPub",
+ "LotConfig": "Inside",
+ "LandSlope": "Gtl",
+ "Neighborhood": "NAmes",
+ "Condition1": "Feedr",
+ "Condition2": "Norm",
+ "BldgType": "1Fam",
+ "HouseStyle": "1Story",
+ "OverallQual": 5,
+ "OverallCond": 6,
+ "YearBuilt": 1961,
+ "YearRemodAdd": 1961,
+ "RoofStyle": "Gable",
+ "RoofMatl": "CompShg",
+ "Exterior1st": "VinylSd",
+ "Exterior2nd": "VinylSd",
+ "MasVnrType": "None",
+ "MasVnrArea": 0.0,
+ "ExterQual": "TA",
+ "ExterCond": "TA",
+ "Foundation": "CBlock",
+ "BsmtQual": "TA",
+ "BsmtCond": "TA",
+ "BsmtExposure": "No",
+ "BsmtFinType1": "Rec",
+ "BsmtFinSF1": 468.0,
+ "BsmtFinType2": "LwQ",
+ "BsmtFinSF2": 144.0,
+ "BsmtUnfSF": 270.0,
+ "TotalBsmtSF": 882.0,
+ "Heating": "GasA",
+ "HeatingQC": "TA",
+ "CentralAir": "Y",
+ "Electrical": "SBrkr",
+ "1stFlrSF": 896,
+ "2ndFlrSF": 0,
+ "LowQualFinSF": 0,
+ "GrLivArea": 896,
+ "BsmtFullBath": 0.0,
+ "BsmtHalfBath": 0.0,
+ "FullBath": 1,
+ "HalfBath": 0,
+ "BedroomAbvGr": 2,
+ "KitchenAbvGr": 1,
+ "KitchenQual": "TA",
+ "TotRmsAbvGrd": 5,
+ "Functional": "Typ",
+ "Fireplaces": 0,
+ "FireplaceQu": null,
+ "GarageType": "Attchd",
+ "GarageYrBlt": 1961.0,
+ "GarageFinish": "Unf",
+ "GarageCars": 1.0,
+ "GarageArea": 730.0,
+ "GarageQual": "TA",
+ "GarageCond": "TA",
+ "PavedDrive": "Y",
+ "WoodDeckSF": 140,
+ "OpenPorchSF": 0,
+ "EnclosedPorch": 0,
+ "3SsnPorch": 0,
+ "ScreenPorch": 120,
+ "PoolArea": 0,
+ "PoolQC": null,
+ "Fence": "MnPrv",
+ "MiscFeature": null,
+ "MiscVal": 0,
+ "MoSold": 6,
+ "YrSold": 2010,
+ "SaleType": "WD",
+ "SaleCondition": "Normal"
+}, {
+ "Id": 1461,
+ "MSSubClass": 20,
+ "MSZoning": "RH",
+ "LotFrontage": 80.0,
+ "LotArea": 11689,
+ "Street": "Pave",
+ "Alley": null,
+ "LotShape": "Reg",
+ "LandContour": "Lvl",
+ "Utilities": "AllPub",
+ "LotConfig": "Inside",
+ "LandSlope": "Gtl",
+ "Neighborhood": "NAmes",
+ "Condition1": "Feedr",
+ "Condition2": "Norm",
+ "BldgType": "1Fam",
+ "HouseStyle": "1Story",
+ "OverallQual": 5,
+ "OverallCond": 6,
+ "YearBuilt": 1969,
+ "YearRemodAdd": 1961,
+ "RoofStyle": "Gable",
+ "RoofMatl": "CompShg",
+ "Exterior1st": "VinylSd",
+ "Exterior2nd": "VinylSd",
+ "MasVnrType": "None",
+ "MasVnrArea": 0.0,
+ "ExterQual": "TA",
+ "ExterCond": "TA",
+ "Foundation": "CBlock",
+ "BsmtQual": "TA",
+ "BsmtCond": "TA",
+ "BsmtExposure": "No",
+ "BsmtFinType1": "Rec",
+ "BsmtFinSF1": 468.0,
+ "BsmtFinType2": "LwQ",
+ "BsmtFinSF2": 144.0,
+ "BsmtUnfSF": 270.0,
+ "TotalBsmtSF": 882.0,
+ "Heating": "GasA",
+ "HeatingQC": "TA",
+ "CentralAir": "Y",
+ "Electrical": "SBrkr",
+ "1stFlrSF": 752,
+ "2ndFlrSF": 0,
+ "LowQualFinSF": 0,
+ "GrLivArea": 896,
+ "BsmtFullBath": 0.0,
+ "BsmtHalfBath": 0.0,
+ "FullBath": 1,
+ "HalfBath": 0,
+ "BedroomAbvGr": 2,
+ "KitchenAbvGr": 1,
+ "KitchenQual": "TA",
+ "TotRmsAbvGrd": 5,
+ "Functional": "Typ",
+ "Fireplaces": 0,
+ "FireplaceQu": null,
+ "GarageType": "Attchd",
+ "GarageYrBlt": 1961.0,
+ "GarageFinish": "Unf",
+ "GarageCars": 1.0,
+ "GarageArea": 730.0,
+ "GarageQual": "TA",
+ "GarageCond": "TA",
+ "PavedDrive": "Y",
+ "WoodDeckSF": 140,
+ "OpenPorchSF": 0,
+ "EnclosedPorch": 0,
+ "3SsnPorch": 0,
+ "ScreenPorch": 120,
+ "PoolArea": 0,
+ "PoolQC": null,
+ "Fence": "MnPrv",
+ "MiscFeature": null,
+ "MiscVal": 0,
+ "MoSold": 6,
+ "YrSold": 2010,
+ "SaleType": "WD",
+ "SaleCondition": "Normal"
+}, {
+ "Id": 1461,
+ "MSSubClass": 20,
+ "MSZoning": "RH",
+ "LotFrontage": 80.0,
+ "LotArea": 22689,
+ "Street": "Pave",
+ "Alley": null,
+ "LotShape": "Reg",
+ "LandContour": "Lvl",
+ "Utilities": "AllPub",
+ "LotConfig": "Inside",
+ "LandSlope": "Gtl",
+ "Neighborhood": "NAmes",
+ "Condition1": "Feedr",
+ "Condition2": "Norm",
+ "BldgType": "1Fam",
+ "HouseStyle": "1Story",
+ "OverallQual": 5,
+ "OverallCond": 6,
+ "YearBuilt": 1969,
+ "YearRemodAdd": 1961,
+ "RoofStyle": "Gable",
+ "RoofMatl": "CompShg",
+ "Exterior1st": "VinylSd",
+ "Exterior2nd": "VinylSd",
+ "MasVnrType": "None",
+ "MasVnrArea": 0.0,
+ "ExterQual": "TA",
+ "ExterCond": "TA",
+ "Foundation": "CBlock",
+ "BsmtQual": "TA",
+ "BsmtCond": "TA",
+ "BsmtExposure": "No",
+ "BsmtFinType1": "Rec",
+ "BsmtFinSF1": 468.0,
+ "BsmtFinType2": "LwQ",
+ "BsmtFinSF2": 144.0,
+ "BsmtUnfSF": 270.0,
+ "TotalBsmtSF": 882.0,
+ "Heating": "GasA",
+ "HeatingQC": "TA",
+ "CentralAir": "Y",
+ "Electrical": "SBrkr",
+ "1stFlrSF": 752,
+ "2ndFlrSF": 0,
+ "LowQualFinSF": 0,
+ "GrLivArea": 896,
+ "BsmtFullBath": 0.0,
+ "BsmtHalfBath": 0.0,
+ "FullBath": 1,
+ "HalfBath": 0,
+ "BedroomAbvGr": 2,
+ "KitchenAbvGr": 1,
+ "KitchenQual": "TA",
+ "TotRmsAbvGrd": 5,
+ "Functional": "Typ",
+ "Fireplaces": 0,
+ "FireplaceQu": null,
+ "GarageType": "Attchd",
+ "GarageYrBlt": 1961.0,
+ "GarageFinish": "Unf",
+ "GarageCars": 1.0,
+ "GarageArea": 730.0,
+ "GarageQual": "TA",
+ "GarageCond": "TA",
+ "PavedDrive": "Y",
+ "WoodDeckSF": 140,
+ "OpenPorchSF": 0,
+ "EnclosedPorch": 0,
+ "3SsnPorch": 0,
+ "ScreenPorch": 120,
+ "PoolArea": 0,
+ "PoolQC": null,
+ "Fence": "MnPrv",
+ "MiscFeature": null,
+ "MiscVal": 0,
+ "MoSold": 6,
+ "YrSold": 2010,
+ "SaleType": "WD",
+ "SaleCondition": "Normal"
+}, {
+ "Id": 1461,
+ "MSSubClass": 20,
+ "MSZoning": "RH",
+ "LotFrontage": 80.0,
+ "LotArea": 11689,
+ "Street": "Pave",
+ "Alley": null,
+ "LotShape": "Reg",
+ "LandContour": "Lvl",
+ "Utilities": "AllPub",
+ "LotConfig": "Inside",
+ "LandSlope": "Gtl",
+ "Neighborhood": "NAmes",
+ "Condition1": "Feedr",
+ "Condition2": "Norm",
+ "BldgType": "1Fam",
+ "HouseStyle": "1Story",
+ "OverallQual": 5,
+ "OverallCond": 6,
+ "YearBuilt": 1969,
+ "YearRemodAdd": 1961,
+ "RoofStyle": "Gable",
+ "RoofMatl": "CompShg",
+ "Exterior1st": "VinylSd",
+ "Exterior2nd": "VinylSd",
+ "MasVnrType": "None",
+ "MasVnrArea": 0.0,
+ "ExterQual": "TA",
+ "ExterCond": "TA",
+ "Foundation": "CBlock",
+ "BsmtQual": "TA",
+ "BsmtCond": "TA",
+ "BsmtExposure": "No",
+ "BsmtFinType1": "Rec",
+ "BsmtFinSF1": 468.0,
+ "BsmtFinType2": "LwQ",
+ "BsmtFinSF2": 144.0,
+ "BsmtUnfSF": 270.0,
+ "TotalBsmtSF": 882.0,
+ "Heating": "GasA",
+ "HeatingQC": "TA",
+ "CentralAir": "Y",
+ "Electrical": "SBrkr",
+ "1stFlrSF": 988,
+ "2ndFlrSF": 0,
+ "LowQualFinSF": 0,
+ "GrLivArea": 896,
+ "BsmtFullBath": 0.0,
+ "BsmtHalfBath": 0.0,
+ "FullBath": 1,
+ "HalfBath": 0,
+ "BedroomAbvGr": 2,
+ "KitchenAbvGr": 1,
+ "KitchenQual": "TA",
+ "TotRmsAbvGrd": 5,
+ "Functional": "Typ",
+ "Fireplaces": 0,
+ "FireplaceQu": null,
+ "GarageType": "Attchd",
+ "GarageYrBlt": 1961.0,
+ "GarageFinish": "Unf",
+ "GarageCars": 1.0,
+ "GarageArea": 730.0,
+ "GarageQual": "TA",
+ "GarageCond": "TA",
+ "PavedDrive": "Y",
+ "WoodDeckSF": 140,
+ "OpenPorchSF": 0,
+ "EnclosedPorch": 0,
+ "3SsnPorch": 0,
+ "ScreenPorch": 120,
+ "PoolArea": 0,
+ "PoolQC": null,
+ "Fence": "MnPrv",
+ "MiscFeature": null,
+ "MiscVal": 0,
+ "MoSold": 6,
+ "YrSold": 2010,
+ "SaleType": "WD",
+ "SaleCondition": "Normal"
+}, {
+ "Id": 1461,
+ "MSSubClass": 20,
+ "MSZoning": "RH",
+ "LotFrontage": 80.0,
+ "LotArea": 11689,
+ "Street": "Pave",
+ "Alley": null,
+ "LotShape": "Reg",
+ "LandContour": "Lvl",
+ "Utilities": "AllPub",
+ "LotConfig": "Inside",
+ "LandSlope": "Gtl",
+ "Neighborhood": "NAmes",
+ "Condition1": "Feedr",
+ "Condition2": "Norm",
+ "BldgType": "1Fam",
+ "HouseStyle": "1Story",
+ "OverallQual": 5,
+ "OverallCond": 6,
+ "YearBuilt": 1969,
+ "YearRemodAdd": 1961,
+ "RoofStyle": "Gable",
+ "RoofMatl": "CompShg",
+ "Exterior1st": "VinylSd",
+ "Exterior2nd": "VinylSd",
+ "MasVnrType": "None",
+ "MasVnrArea": 0.0,
+ "ExterQual": "TA",
+ "ExterCond": "TA",
+ "Foundation": "CBlock",
+ "BsmtQual": "TA",
+ "BsmtCond": "TA",
+ "BsmtExposure": "No",
+ "BsmtFinType1": "Rec",
+ "BsmtFinSF1": 468.0,
+ "BsmtFinType2": "LwQ",
+ "BsmtFinSF2": 144.0,
+ "BsmtUnfSF": 270.0,
+ "TotalBsmtSF": 882.0,
+ "Heating": "GasA",
+ "HeatingQC": "TA",
+ "CentralAir": "Y",
+ "Electrical": "SBrkr",
+ "1stFlrSF": 752,
+ "2ndFlrSF": 0,
+ "LowQualFinSF": 0,
+ "GrLivArea": 896,
+ "BsmtFullBath": 0.0,
+ "BsmtHalfBath": 0.0,
+ "FullBath": 1,
+ "HalfBath": 0,
+ "BedroomAbvGr": 2,
+ "KitchenAbvGr": 1,
+ "KitchenQual": "TA",
+ "TotRmsAbvGrd": 5,
+ "Functional": "Typ",
+ "Fireplaces": 0,
+ "FireplaceQu": null,
+ "GarageType": "Attchd",
+ "GarageYrBlt": 1961.0,
+ "GarageFinish": "Unf",
+ "GarageCars": 1.0,
+ "GarageArea": 730.0,
+ "GarageQual": "TA",
+ "GarageCond": "TA",
+ "PavedDrive": "Y",
+ "WoodDeckSF": 140,
+ "OpenPorchSF": 0,
+ "EnclosedPorch": 0,
+ "3SsnPorch": 0,
+ "ScreenPorch": 120,
+ "PoolArea": 0,
+ "PoolQC": null,
+ "Fence": "MnPrv",
+ "MiscFeature": null,
+ "MiscVal": 0,
+ "MoSold": 6,
+ "YrSold": 2008,
+ "SaleType": "WD",
+ "SaleCondition": "Normal"
+}, {
+ "Id": 1461,
+ "MSSubClass": 20,
+ "MSZoning": "RH",
+ "LotFrontage": 80.0,
+ "LotArea": 25000,
+ "Street": "Pave",
+ "Alley": null,
+ "LotShape": "Reg",
+ "LandContour": "Lvl",
+ "Utilities": "AllPub",
+ "LotConfig": "Inside",
+ "LandSlope": "Gtl",
+ "Neighborhood": "NAmes",
+ "Condition1": "Feedr",
+ "Condition2": "Norm",
+ "BldgType": "1Fam",
+ "HouseStyle": "1Story",
+ "OverallQual": 5,
+ "OverallCond": 6,
+ "YearBuilt": 1969,
+ "YearRemodAdd": 1961,
+ "RoofStyle": "Gable",
+ "RoofMatl": "CompShg",
+ "Exterior1st": "VinylSd",
+ "Exterior2nd": "VinylSd",
+ "MasVnrType": "None",
+ "MasVnrArea": 0.0,
+ "ExterQual": "TA",
+ "ExterCond": "TA",
+ "Foundation": "CBlock",
+ "BsmtQual": "TA",
+ "BsmtCond": "TA",
+ "BsmtExposure": "No",
+ "BsmtFinType1": "Rec",
+ "BsmtFinSF1": 468.0,
+ "BsmtFinType2": "LwQ",
+ "BsmtFinSF2": 144.0,
+ "BsmtUnfSF": 270.0,
+ "TotalBsmtSF": 882.0,
+ "Heating": "GasA",
+ "HeatingQC": "TA",
+ "CentralAir": "Y",
+ "Electrical": "SBrkr",
+ "1stFlrSF": 752,
+ "2ndFlrSF": 0,
+ "LowQualFinSF": 0,
+ "GrLivArea": 896,
+ "BsmtFullBath": 0.0,
+ "BsmtHalfBath": 0.0,
+ "FullBath": 1,
+ "HalfBath": 0,
+ "BedroomAbvGr": 2,
+ "KitchenAbvGr": 1,
+ "KitchenQual": "TA",
+ "TotRmsAbvGrd": 5,
+ "Functional": "Typ",
+ "Fireplaces": 0,
+ "FireplaceQu": null,
+ "GarageType": "Attchd",
+ "GarageYrBlt": 1961.0,
+ "GarageFinish": "Unf",
+ "GarageCars": 1.0,
+ "GarageArea": 730.0,
+ "GarageQual": "TA",
+ "GarageCond": "TA",
+ "PavedDrive": "Y",
+ "WoodDeckSF": 140,
+ "OpenPorchSF": 0,
+ "EnclosedPorch": 0,
+ "3SsnPorch": 0,
+ "ScreenPorch": 120,
+ "PoolArea": 0,
+ "PoolQC": null,
+ "Fence": "MnPrv",
+ "MiscFeature": null,
+ "MiscVal": 0,
+ "MoSold": 6,
+ "YrSold": 2010,
+ "SaleType": "WD",
+ "SaleCondition": "Normal"
+}]
\ No newline at end of file
diff --git a/packages/ml_api/docker/Dockerfile b/packages/ml_api/docker/Dockerfile
new file mode 100644
index 0000000..9c948fc
--- /dev/null
+++ b/packages/ml_api/docker/Dockerfile
@@ -0,0 +1,20 @@
+FROM python:3.7.5-slim-buster
+
+RUN mkdir -p /opt/app
+COPY requirements /opt/app/requirements
+RUN pip install --upgrade pip
+
+# ensure we can run the make commands
+RUN apt-get update -y && \
+ apt-get install -y make && \
+ apt-get install -y libffi-dev gcc && \
+ # for swagger
+ apt-get install -y curl && \
+ # for postgres driver
+ apt-get install -y libpq-dev
+
+RUN pip install -r /opt/app/requirements/requirements.txt
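+# make the application code importable from anywhere inside the container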
+ENV PYTHONPATH "${PYTHONPATH}:/opt/app/"
+
+COPY . /opt/app
+WORKDIR /opt/app
diff --git a/packages/ml_api/docker/Dockerfile.test b/packages/ml_api/docker/Dockerfile.test
new file mode 100644
index 0000000..46c29ac
--- /dev/null
+++ b/packages/ml_api/docker/Dockerfile.test
@@ -0,0 +1,18 @@
+FROM python:3.7.5-slim-buster
+
+RUN mkdir -p /opt/app
+COPY requirements /opt/app/requirements
+RUN pip install --upgrade pip
+
+# ensure we can run the make commands
+RUN apt-get update -y && \
+ apt-get install -y make && \
+ apt-get install -y libffi-dev gcc && \
+ # for swagger
+ apt-get install -y curl
+
+ENV PYTHONPATH "${PYTHONPATH}:/opt/app"
+RUN pip install -r /opt/app/requirements/test_requirements.txt
+
+COPY . /opt/app
+WORKDIR /opt/app
diff --git a/packages/ml_api/docker/config/grafana/basic_cadvisor_dashboard_ml_api.json b/packages/ml_api/docker/config/grafana/basic_cadvisor_dashboard_ml_api.json
new file mode 100644
index 0000000..58b24a1
--- /dev/null
+++ b/packages/ml_api/docker/config/grafana/basic_cadvisor_dashboard_ml_api.json
@@ -0,0 +1,605 @@
+{
+ "annotations": {
+ "list": [
+ {
+ "builtIn": 1,
+ "datasource": "-- Grafana --",
+ "enable": true,
+ "hide": true,
+ "iconColor": "rgba(0, 211, 255, 1)",
+ "name": "Annotations & Alerts",
+ "type": "dashboard"
+ }
+ ]
+ },
+ "description": "Docker monitoring with Prometheus and cAdvisor with node selection",
+ "editable": true,
+ "gnetId": 8321,
+ "graphTooltip": 1,
+ "id": 1,
+ "iteration": 1578230538273,
+ "links": [],
+ "panels": [
+ {
+ "cacheTimeout": null,
+ "colorBackground": false,
+ "colorValue": false,
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "datasource": "Prometheus",
+ "editable": true,
+ "error": false,
+ "format": "none",
+ "gauge": {
+ "maxValue": 100,
+ "minValue": 0,
+ "show": false,
+ "thresholdLabels": false,
+ "thresholdMarkers": true
+ },
+ "gridPos": {
+ "h": 3,
+ "w": 8,
+ "x": 0,
+ "y": 0
+ },
+ "height": "20",
+ "id": 7,
+ "interval": null,
+ "isNew": true,
+ "links": [],
+ "mappingType": 1,
+ "mappingTypes": [
+ {
+ "name": "value to text",
+ "value": 1
+ },
+ {
+ "name": "range to text",
+ "value": 2
+ }
+ ],
+ "maxDataPoints": 100,
+ "nullPointMode": "connected",
+ "nullText": null,
+ "options": {},
+ "postfix": "",
+ "postfixFontSize": "50%",
+ "prefix": "",
+ "prefixFontSize": "50%",
+ "rangeMaps": [
+ {
+ "from": "null",
+ "text": "N/A",
+ "to": "null"
+ }
+ ],
+ "sparkline": {
+ "fillColor": "rgba(31, 118, 189, 0.18)",
+ "full": false,
+ "lineColor": "rgb(31, 120, 193)",
+ "show": false
+ },
+ "tableColumn": "",
+ "targets": [
+ {
+ "expr": "count(container_last_seen{instance=~\"$node:$port\",job=~\"$job\",image!=\"\"})",
+ "intervalFactor": 2,
+ "legendFormat": "",
+ "metric": "container_last_seen",
+ "refId": "A",
+ "step": 240
+ }
+ ],
+ "thresholds": "",
+ "title": "Running containers",
+ "transparent": true,
+ "type": "singlestat",
+ "valueFontSize": "80%",
+ "valueMaps": [
+ {
+ "op": "=",
+ "text": "N/A",
+ "value": "null"
+ }
+ ],
+ "valueName": "avg"
+ },
+ {
+ "cacheTimeout": null,
+ "colorBackground": false,
+ "colorValue": false,
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "datasource": "Prometheus",
+ "editable": true,
+ "error": false,
+ "format": "mbytes",
+ "gauge": {
+ "maxValue": 100,
+ "minValue": 0,
+ "show": false,
+ "thresholdLabels": false,
+ "thresholdMarkers": true
+ },
+ "gridPos": {
+ "h": 3,
+ "w": 8,
+ "x": 8,
+ "y": 0
+ },
+ "height": "20",
+ "id": 5,
+ "interval": null,
+ "isNew": true,
+ "links": [],
+ "mappingType": 1,
+ "mappingTypes": [
+ {
+ "name": "value to text",
+ "value": 1
+ },
+ {
+ "name": "range to text",
+ "value": 2
+ }
+ ],
+ "maxDataPoints": 100,
+ "nullPointMode": "connected",
+ "nullText": null,
+ "options": {},
+ "postfix": "",
+ "postfixFontSize": "50%",
+ "prefix": "",
+ "prefixFontSize": "50%",
+ "rangeMaps": [
+ {
+ "from": "null",
+ "text": "N/A",
+ "to": "null"
+ }
+ ],
+ "sparkline": {
+ "fillColor": "rgba(31, 118, 189, 0.18)",
+ "full": false,
+ "lineColor": "rgb(31, 120, 193)",
+ "show": false
+ },
+ "tableColumn": "",
+ "targets": [
+ {
+ "expr": "sum(container_memory_usage_bytes{instance=~\"$node:$port\",job=~\"$job\",image!=\"\"})/1024/1024",
+ "intervalFactor": 2,
+ "legendFormat": "",
+ "metric": "container_memory_usage_bytes",
+ "refId": "A",
+ "step": 240
+ }
+ ],
+ "thresholds": "",
+ "title": "Total Memory Usage",
+ "transparent": true,
+ "type": "singlestat",
+ "valueFontSize": "80%",
+ "valueMaps": [
+ {
+ "op": "=",
+ "text": "N/A",
+ "value": "null"
+ }
+ ],
+ "valueName": "current"
+ },
+ {
+ "cacheTimeout": null,
+ "colorBackground": false,
+ "colorValue": false,
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "datasource": "Prometheus",
+ "editable": true,
+ "error": false,
+ "format": "percent",
+ "gauge": {
+ "maxValue": 100,
+ "minValue": 0,
+ "show": false,
+ "thresholdLabels": false,
+ "thresholdMarkers": true
+ },
+ "gridPos": {
+ "h": 3,
+ "w": 8,
+ "x": 16,
+ "y": 0
+ },
+ "height": "20",
+ "id": 6,
+ "interval": null,
+ "isNew": true,
+ "links": [],
+ "mappingType": 1,
+ "mappingTypes": [
+ {
+ "name": "value to text",
+ "value": 1
+ },
+ {
+ "name": "range to text",
+ "value": 2
+ }
+ ],
+ "maxDataPoints": 100,
+ "nullPointMode": "connected",
+ "nullText": null,
+ "options": {},
+ "postfix": "",
+ "postfixFontSize": "50%",
+ "prefix": "",
+ "prefixFontSize": "50%",
+ "rangeMaps": [
+ {
+ "from": "null",
+ "text": "N/A",
+ "to": "null"
+ }
+ ],
+ "sparkline": {
+ "fillColor": "rgba(31, 118, 189, 0.18)",
+ "full": false,
+ "lineColor": "rgb(31, 120, 193)",
+ "show": false
+ },
+ "tableColumn": "",
+ "targets": [
+ {
+ "expr": "sum(rate(container_cpu_user_seconds_total{instance=~\"$node:$port\",job=~\"$job\",image!=\"\"}[5m]) * 100)",
+ "intervalFactor": 2,
+ "legendFormat": "",
+ "metric": "container_memory_usage_bytes",
+ "refId": "A",
+ "step": 240
+ }
+ ],
+ "thresholds": "",
+ "title": "Total CPU Usage",
+ "transparent": true,
+ "type": "singlestat",
+ "valueFontSize": "80%",
+ "valueMaps": [
+ {
+ "op": "=",
+ "text": "N/A",
+ "value": "null"
+ }
+ ],
+ "valueName": "current"
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "Prometheus",
+ "decimals": 2,
+ "editable": true,
+ "error": false,
+ "fill": 1,
+ "fillGradient": 0,
+ "grid": {},
+ "gridPos": {
+ "h": 7,
+ "w": 24,
+ "x": 0,
+ "y": 3
+ },
+ "hiddenSeries": false,
+ "id": 2,
+ "isNew": true,
+ "legend": {
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "max": false,
+ "min": false,
+ "rightSide": true,
+ "show": true,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 2,
+ "links": [],
+ "nullPointMode": "connected",
+ "options": {
+ "dataLinks": []
+ },
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "rate(container_cpu_user_seconds_total{instance=~\"$node:$port\",job=~\"$job\",image!=\"\"}[5m]) * 100",
+ "intervalFactor": 2,
+ "legendFormat": "{{name}}",
+ "metric": "cpu",
+ "refId": "A",
+ "step": 10
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "CPU Usage",
+ "tooltip": {
+ "msResolution": false,
+ "shared": true,
+ "sort": 0,
+ "value_type": "cumulative"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "percent",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "Prometheus",
+ "decimals": 2,
+ "editable": true,
+ "error": false,
+ "fill": 1,
+ "fillGradient": 0,
+ "grid": {},
+ "gridPos": {
+ "h": 7,
+ "w": 24,
+ "x": 0,
+ "y": 10
+ },
+ "hiddenSeries": false,
+ "id": 1,
+ "isNew": true,
+ "legend": {
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "max": false,
+ "min": false,
+ "rightSide": true,
+ "show": true,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 2,
+ "links": [],
+ "nullPointMode": "connected",
+ "options": {
+ "dataLinks": []
+ },
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "container_memory_usage_bytes{instance=~\"$node:$port\",job=~\"$job\",image!=\"\"}",
+ "hide": false,
+ "intervalFactor": 2,
+ "legendFormat": "{{name}}",
+ "metric": "container_memory_usage_bytes",
+ "refId": "A",
+ "step": 10
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "Memory Usage",
+ "tooltip": {
+ "msResolution": false,
+ "shared": true,
+ "sort": 0,
+ "value_type": "cumulative"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "bytes",
+ "label": "",
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ }
+ ],
+ "refresh": "5s",
+ "schemaVersion": 21,
+ "style": "dark",
+ "tags": [
+ "docker"
+ ],
+ "templating": {
+ "list": [
+ {
+ "allValue": null,
+ "current": {
+ "text": "cadvisor",
+ "value": "cadvisor"
+ },
+ "datasource": "Prometheus",
+ "definition": "",
+ "hide": 0,
+ "includeAll": false,
+ "label": "Job",
+ "multi": false,
+ "name": "job",
+ "options": [],
+ "query": "label_values(container_cpu_user_seconds_total, job)",
+ "refresh": 1,
+ "regex": "",
+ "skipUrlSync": false,
+ "sort": 1,
+ "tagValuesQuery": "",
+ "tags": [],
+ "tagsQuery": "",
+ "type": "query",
+ "useTags": false
+ },
+ {
+ "allValue": null,
+ "current": {
+ "text": "cadvisor",
+ "value": "cadvisor"
+ },
+ "datasource": "Prometheus",
+ "definition": "",
+ "hide": 0,
+ "includeAll": false,
+ "label": "Host:",
+ "multi": false,
+ "name": "node",
+ "options": [],
+ "query": "label_values(container_cpu_user_seconds_total{job=~\"$job\"}, instance)",
+ "refresh": 1,
+ "regex": "/([^:]+):.*/",
+ "skipUrlSync": false,
+ "sort": 1,
+ "tagValuesQuery": "",
+ "tags": [],
+ "tagsQuery": "",
+ "type": "query",
+ "useTags": false
+ },
+ {
+ "allValue": null,
+ "current": {
+ "text": "8080",
+ "value": "8080"
+ },
+ "datasource": "Prometheus",
+ "definition": "",
+ "hide": 0,
+ "includeAll": false,
+ "label": "Port",
+ "multi": false,
+ "name": "port",
+ "options": [],
+ "query": "label_values(container_cpu_user_seconds_total{instance=~\"$node:(.*)\"}, instance)",
+ "refresh": 1,
+ "regex": "/[^:]+:(.*)/",
+ "skipUrlSync": false,
+ "sort": 3,
+ "tagValuesQuery": "",
+ "tags": [],
+ "tagsQuery": "",
+ "type": "query",
+ "useTags": false
+ }
+ ]
+ },
+ "time": {
+ "from": "now-5m",
+ "to": "now"
+ },
+ "timepicker": {
+ "refresh_intervals": [
+ "5s",
+ "10s",
+ "30s",
+ "1m",
+ "5m",
+ "15m",
+ "30m",
+ "1h",
+ "2h",
+ "1d"
+ ],
+ "time_options": [
+ "5m",
+ "15m",
+ "1h",
+ "6h",
+ "12h",
+ "24h",
+ "2d",
+ "7d",
+ "30d"
+ ]
+ },
+ "timezone": "browser",
+ "title": "Docker monitoring with node selection",
+ "uid": "pHUTSjLZk",
+ "version": 2
+}
\ No newline at end of file
diff --git a/packages/ml_api/docker/config/grafana/grafana_flask_basic_dashboard_ml_api.json b/packages/ml_api/docker/config/grafana/grafana_flask_basic_dashboard_ml_api.json
new file mode 100644
index 0000000..39224a7
--- /dev/null
+++ b/packages/ml_api/docker/config/grafana/grafana_flask_basic_dashboard_ml_api.json
@@ -0,0 +1,224 @@
+{
+ "annotations": {
+ "list": [
+ {
+ "builtIn": 1,
+ "datasource": "-- Grafana --",
+ "enable": true,
+ "hide": true,
+ "iconColor": "rgba(0, 211, 255, 1)",
+ "name": "Annotations & Alerts",
+ "type": "dashboard"
+ }
+ ]
+ },
+ "editable": true,
+ "gnetId": null,
+ "graphTooltip": 0,
+ "id": 3,
+ "links": [],
+ "panels": [
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "Prometheus",
+ "fill": 1,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 9,
+ "w": 12,
+ "x": 0,
+ "y": 0
+ },
+ "hiddenSeries": false,
+ "id": 2,
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "nullPointMode": "null",
+ "options": {
+ "dataLinks": []
+ },
+ "percentage": false,
+ "pointradius": 2,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "rate(http_request_count_total{job=\"ml_api\"}[5m])",
+ "legendFormat": "{{app_name}} {{method}} {{endpoint}} {{http_status}}",
+ "refId": "A"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "Requests Rate",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "Prometheus",
+ "fill": 1,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 9,
+ "w": 12,
+ "x": 12,
+ "y": 0
+ },
+ "hiddenSeries": false,
+ "id": 3,
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "nullPointMode": "null",
+ "options": {
+ "dataLinks": []
+ },
+ "percentage": false,
+ "pointradius": 2,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "sum (rate(http_request_latency_seconds_sum{job=\"ml_api\"}[5m])) / sum (rate(http_request_latency_seconds_count{job=\"ml_api\"}[5m]))",
+ "legendFormat": "Average (seconds)",
+ "refId": "A"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "Latency",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ }
+ ],
+ "schemaVersion": 21,
+ "style": "dark",
+ "tags": [],
+ "templating": {
+ "list": []
+ },
+ "time": {
+ "from": "now-1h",
+ "to": "now"
+ },
+ "timepicker": {
+ "refresh_intervals": [
+ "5s",
+ "10s",
+ "30s",
+ "1m",
+ "5m",
+ "15m",
+ "30m",
+ "1h",
+ "2h",
+ "1d"
+ ]
+ },
+ "timezone": "",
+ "title": "Really Simple Flask Dashboard",
+ "uid": "q8vgEpLZl",
+ "version": 3
+}
\ No newline at end of file
diff --git a/packages/ml_api/docker/config/grafana/ml_api_dashboard.json b/packages/ml_api/docker/config/grafana/ml_api_dashboard.json
new file mode 100644
index 0000000..9c25e2d
--- /dev/null
+++ b/packages/ml_api/docker/config/grafana/ml_api_dashboard.json
@@ -0,0 +1,569 @@
+{
+ "annotations": {
+ "list": [
+ {
+ "builtIn": 1,
+ "datasource": "-- Grafana --",
+ "enable": true,
+ "hide": true,
+ "iconColor": "rgba(0, 211, 255, 1)",
+ "name": "Annotations & Alerts",
+ "type": "dashboard"
+ }
+ ]
+ },
+ "description": "Machine learning-specific metrics",
+ "editable": true,
+ "gnetId": null,
+ "graphTooltip": 0,
+ "id": 7,
+ "links": [],
+ "panels": [
+ {
+ "cacheTimeout": null,
+ "columns": [],
+ "datasource": "Prometheus",
+ "fontSize": "100%",
+ "gridPos": {
+ "h": 4,
+ "w": 24,
+ "x": 0,
+ "y": 0
+ },
+ "id": 7,
+ "links": [],
+ "options": {},
+ "pageSize": 1,
+ "pluginVersion": "6.5.2",
+ "showHeader": true,
+ "sort": {
+ "col": null,
+ "desc": false
+ },
+ "styles": [
+ {
+ "alias": "Time",
+ "dateFormat": "YYYY-MM-DD HH:mm:ss",
+ "pattern": "Time",
+ "type": "date"
+ },
+ {
+ "alias": "",
+ "colorMode": null,
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "decimals": 2,
+ "pattern": "/.*/",
+ "thresholds": [],
+ "type": "number",
+ "unit": "short"
+ }
+ ],
+ "targets": [
+ {
+ "expr": "count by(live_model, live_version, shadow_model, shadow_version, version)(model_version_details_info\n* on (instance, job) group_left(version)\npython_info)",
+ "format": "table",
+ "legendFormat": "{{model_version}}",
+ "refId": "A"
+ }
+ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Model Versions",
+ "transform": "table",
+ "transparent": true,
+ "type": "table"
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "Prometheus",
+ "fill": 1,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 5,
+ "w": 12,
+ "x": 0,
+ "y": 4
+ },
+ "hiddenSeries": false,
+ "id": 3,
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "nullPointMode": "null",
+ "options": {
+ "dataLinks": []
+ },
+ "percentage": false,
+ "pointradius": 2,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "sum (rate(house_price_prediction_dollars_sum{job=\"ml_api\"}[5m])) / sum (rate(house_price_prediction_dollars_count{job=\"ml_api\"}[5m]))",
+ "legendFormat": "Average Prediction Amount ($)",
+ "refId": "A"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "Average House Price Prediction Amount (USD)",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "Prometheus",
+ "fill": 1,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 5,
+ "w": 12,
+ "x": 12,
+ "y": 4
+ },
+ "hiddenSeries": false,
+ "id": 9,
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "nullPointMode": "null",
+ "options": {
+ "dataLinks": []
+ },
+ "percentage": false,
+ "pointradius": 2,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "rate(house_price_prediction_dollars_count{job=\"ml_api\"}[1m])",
+ "legendFormat": "Average Prediction Amount ($)",
+ "refId": "A"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "Average House Price Prediction Rate (/second)",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "Prometheus",
+ "fill": 1,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 5,
+ "w": 12,
+ "x": 0,
+ "y": 9
+ },
+ "hiddenSeries": false,
+ "id": 6,
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "nullPointMode": "null",
+ "options": {
+ "dataLinks": []
+ },
+ "percentage": false,
+ "pointradius": 2,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "avg_over_time(house_price_gauge_dollars[1h])",
+ "legendFormat": "AVG",
+ "refId": "A"
+ },
+ {
+ "expr": "stddev_over_time(house_price_gauge_dollars[1h])",
+ "legendFormat": "STD",
+ "refId": "B"
+ },
+ {
+ "expr": "stddev_over_time(house_price_gauge_dollars[1h]) / (sqrt(count_over_time(house_price_prediction_dollars_count[1h])))",
+ "legendFormat": "SEM",
+ "refId": "C"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "Standard Error of the Mean (SEM)",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "Prometheus",
+ "fill": 1,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 5,
+ "w": 12,
+ "x": 12,
+ "y": 9
+ },
+ "hiddenSeries": false,
+ "id": 10,
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "nullPointMode": "null",
+ "options": {
+ "dataLinks": []
+ },
+ "percentage": false,
+ "pointradius": 2,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "(avg_over_time(house_price_gauge_dollars[1m]) - avg_over_time(house_price_gauge_dollars[1w])) / (stddev_over_time(house_price_gauge_dollars[1w]))",
+ "legendFormat": "Z-Score",
+ "refId": "A"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "Standard Score (Z-Score)",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "datasource": "Prometheus",
+ "gridPos": {
+ "h": 4,
+ "w": 12,
+ "x": 0,
+ "y": 14
+ },
+ "id": 5,
+ "options": {
+ "fieldOptions": {
+ "calcs": [
+ "logmin"
+ ],
+ "defaults": {
+ "decimals": 2,
+ "mappings": [],
+ "max": 100,
+ "min": 0,
+ "thresholds": [
+ {
+ "color": "green",
+ "value": null
+ },
+ {
+ "color": "red",
+ "value": 80
+ }
+ ],
+ "unit": "currencyUSD"
+ },
+ "override": {},
+ "values": false
+ },
+ "orientation": "auto",
+ "showThresholdLabels": false,
+ "showThresholdMarkers": true
+ },
+ "pluginVersion": "6.5.2",
+ "targets": [
+ {
+ "expr": "house_price_gauge_dollars",
+ "legendFormat": "Average Prediction Amount ($)",
+ "refId": "A"
+ }
+ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Min Prediction",
+ "type": "gauge"
+ },
+ {
+ "datasource": "Prometheus",
+ "gridPos": {
+ "h": 4,
+ "w": 12,
+ "x": 12,
+ "y": 14
+ },
+ "id": 8,
+ "options": {
+ "fieldOptions": {
+ "calcs": [
+ "max"
+ ],
+ "defaults": {
+ "decimals": 2,
+ "mappings": [],
+ "max": 100,
+ "min": 0,
+ "thresholds": [
+ {
+ "color": "green",
+ "value": null
+ },
+ {
+ "color": "red",
+ "value": 80
+ }
+ ],
+ "unit": "currencyUSD"
+ },
+ "override": {},
+ "values": false
+ },
+ "orientation": "auto",
+ "showThresholdLabels": false,
+ "showThresholdMarkers": true
+ },
+ "pluginVersion": "6.5.2",
+ "targets": [
+ {
+ "expr": "house_price_gauge_dollars",
+ "legendFormat": "Average Prediction Amount ($)",
+ "refId": "A"
+ }
+ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Max Prediction",
+ "type": "gauge"
+ }
+ ],
+ "schemaVersion": 21,
+ "style": "dark",
+ "tags": [],
+ "templating": {
+ "list": []
+ },
+ "time": {
+ "from": "now-1h",
+ "to": "now"
+ },
+ "timepicker": {
+ "refresh_intervals": [
+ "5s",
+ "10s",
+ "30s",
+ "1m",
+ "5m",
+ "15m",
+ "30m",
+ "1h",
+ "2h",
+ "1d"
+ ]
+ },
+ "timezone": "",
+ "title": "ML API Dashboard",
+ "uid": "q8vgEpLZk",
+ "version": 19
+}
\ No newline at end of file
diff --git a/packages/ml_api/docker/config/prometheus/prometheus.yml b/packages/ml_api/docker/config/prometheus/prometheus.yml
new file mode 100644
index 0000000..1e9fa32
--- /dev/null
+++ b/packages/ml_api/docker/config/prometheus/prometheus.yml
@@ -0,0 +1,42 @@
+# my global config
+global:
+ scrape_interval: 15s # By default, scrape targets every 15 seconds.
+  evaluation_interval: 15s # Evaluate rules every 15 seconds.
+ # scrape_timeout is set to the global default (10s).
+
+ # Attach these labels to any time series or alerts when communicating with
+ # external systems (federation, remote storage, Alertmanager).
+ external_labels:
+ monitor: 'my-project'
+
+# A scrape configuration containing exactly one endpoint to scrape:
+# Here it's Prometheus itself.
+scrape_configs:
+ # The job name is added as a label `job=` to any timeseries scraped from this config.
+ - job_name: 'prometheus'
+
+ # Override the global default and scrape targets from this job every 5 seconds.
+ scrape_interval: 5s
+
+ # metrics_path defaults to '/metrics'
+ # scheme defaults to 'http'.
+
+ static_configs:
+ - targets: ['prometheus:9090']
+
+  - job_name: 'ml_api'
+
+ # Override the global default and scrape targets from this job every 5 seconds.
+ scrape_interval: 5s
+
+ # metrics_path defaults to '/metrics'
+ # scheme defaults to 'http'.
+ static_configs:
+ - targets: ['ml_api:5000']
+
+ - job_name: 'cadvisor'
+
+ # Override the global default and scrape targets from this job every 5 seconds.
+ scrape_interval: 5s
+
+ static_configs:
+ - targets: ['cadvisor:8080']
diff --git a/packages/ml_api/docker/docker-compose-ci-candidate.yml b/packages/ml_api/docker/docker-compose-ci-candidate.yml
new file mode 100644
index 0000000..47bbc16
--- /dev/null
+++ b/packages/ml_api/docker/docker-compose-ci-candidate.yml
@@ -0,0 +1,20 @@
+version: '3'
+services:
+
+ ml_api:
+ image: christophergs/ml_api:${TARGET}
+ environment:
+ SERVER_PORT: ${SERVER_PORT:-5001}
+ build:
+ context: ../
+ dockerfile: docker/Dockerfile.test
+ ports:
+ - "5001:5001"
+ tty: true
+ command: bash -c "make run-service-development"
+
+ differential-tests:
+ image: christophergs/ml_api:${TARGET}
+ command: ["true"]
+ depends_on:
+ - ml_api
\ No newline at end of file
diff --git a/packages/ml_api/docker/docker-compose-ci-master.yml b/packages/ml_api/docker/docker-compose-ci-master.yml
new file mode 100644
index 0000000..c0844e3
--- /dev/null
+++ b/packages/ml_api/docker/docker-compose-ci-master.yml
@@ -0,0 +1,20 @@
+version: '3'
+services:
+
+ ml_api:
+ image: christophergs/ml_api:${TARGET}
+ environment:
+ SERVER_PORT: ${SERVER_PORT:-5000}
+ build:
+ context: ../
+ dockerfile: docker/Dockerfile.test
+ ports:
+ - "5000:5000"
+ tty: true
+ command: bash -c "make run-service-development"
+
+ differential-tests:
+ image: christophergs/ml_api:${TARGET}
+ command: ["true"]
+ depends_on:
+ - ml_api
\ No newline at end of file
diff --git a/packages/ml_api/docker/docker-compose.test.yml b/packages/ml_api/docker/docker-compose.test.yml
new file mode 100644
index 0000000..44109b6
--- /dev/null
+++ b/packages/ml_api/docker/docker-compose.test.yml
@@ -0,0 +1,33 @@
+version: '3'
+services:
+ ml_api_test:
+ image: christophergs/ml_api:master
+ build:
+ context: ../
+ dockerfile: docker/Dockerfile.test
+ environment:
+ DB_HOST: test_database
+ DB_PORT: 5432
+ DB_USER: test_user
+ DB_PASSWORD: ${DB_PASSWORD:-password}
+ DB_NAME: ml_api_test
+ depends_on:
+ - test_database
+ ports:
+ - "5000:5000" # expose webserver to localhost host:container
+ command: bash -c "make db-migrations && make run-service-development"
+
+ test_database:
+ image: postgres:latest
+ environment:
+ POSTGRES_USER: test_user
+ POSTGRES_PASSWORD: password
+ POSTGRES_DB: ml_api_test
+ ports:
+ # expose postgres container on different host port to default (host:container)
+ - "6608:5432"
+ volumes:
+ - my_dbdata_test:/var/lib/postgresql/test_data
+
+volumes:
+ my_dbdata_test:
diff --git a/packages/ml_api/docker/docker-compose.yml b/packages/ml_api/docker/docker-compose.yml
new file mode 100644
index 0000000..798c35b
--- /dev/null
+++ b/packages/ml_api/docker/docker-compose.yml
@@ -0,0 +1,73 @@
+version: '3'
+services:
+ ml_api:
+ image: christophergs/ml_api:master
+ build:
+ context: ../
+ dockerfile: docker/Dockerfile
+ environment:
+ DB_HOST: database
+ DB_PORT: 5432
+ DB_USER: user
+ DB_PASSWORD: ${DB_PASSWORD:-password}
+ DB_NAME: ml_api_dev
+ depends_on:
+ - database
+ - cadvisor
+ ports:
+ - "5000:5000" # expose webserver to localhost host:container
+ command: bash -c "make db-migrations && make run-service-wsgi"
+
+ database:
+ image: postgres:latest
+ environment:
+ POSTGRES_USER: user
+ POSTGRES_PASSWORD: password
+ POSTGRES_DB: ml_api_dev
+ ports:
+ # expose postgres container on different host port to default (host:container)
+ - "6609:5432"
+ volumes:
+ - my_dbdata:/var/lib/postgresql/data
+
+ prometheus:
+ image: prom/prometheus
+ container_name: prometheus
+ volumes:
+ - ./config/prometheus/:/etc/prometheus/
+ - prometheus_data:/prometheus
+ command:
+ - '--config.file=/etc/prometheus/prometheus.yml'
+ expose:
+ - 9090
+ ports:
+ - 9090:9090
+ depends_on:
+ - cadvisor
+
+ grafana:
+ image: grafana/grafana
+ depends_on:
+ - prometheus
+ ports:
+ - 3000:3000
+ volumes:
+ - grafana_data:/var/lib/grafana
+ environment:
+ - GF_SECURITY_ADMIN_PASSWORD=foobar
+ - GF_USERS_ALLOW_SIGN_UP=false
+
+ cadvisor:
+ image: google/cadvisor
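+    # cAdvisor needs (mostly read-only) access to the host filesystem, /sys
+    # and the Docker state directory in order to collect container metrics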
+ volumes:
+ - /:/rootfs:ro
+ - /var/run:/var/run:rw
+ - /sys:/sys:ro
+ - /var/lib/docker/:/var/lib/docker:ro
+ ports:
+ - 8080:8080
+
+volumes:
+ my_dbdata: {}
+ prometheus_data: {}
+ grafana_data: {}
diff --git a/packages/ml_api/mypy.ini b/packages/ml_api/mypy.ini
new file mode 100644
index 0000000..97e52a5
--- /dev/null
+++ b/packages/ml_api/mypy.ini
@@ -0,0 +1,11 @@
+[mypy]
+warn_unused_ignores = True
+follow_imports = skip
+show_error_context = True
+warn_incomplete_stub = True
+ignore_missing_imports = True
+check_untyped_defs = True
+cache_dir = /dev/null
+warn_redundant_casts = True
+warn_unused_configs = True
+strict_optional = True
diff --git a/packages/ml_api/requirements/requirements.txt b/packages/ml_api/requirements/requirements.txt
new file mode 100644
index 0000000..4ef804c
--- /dev/null
+++ b/packages/ml_api/requirements/requirements.txt
@@ -0,0 +1,26 @@
+# ML Model
+tid-gradient-boosting-model>=0.1.18,<0.2.0
+
+# Old model
+tid-regression-model>=2.0.20,<2.1.0
+
+# Web microframework for the API
+flask>=1.1.1,<1.2.0
+connexion[swagger-ui]>=2.5.1,<2.6.0
+
+# repo maintenance tooling
+black>=19.10b0,<20.0
+flake8>=3.7.9,<4.0
+mypy>=0.740
+
+# Persistence
+sqlalchemy>=1.3.11,<1.4.0 # ORM
+psycopg2>=2.8.4,<2.9.0 # DB Driver
+alembic>=1.3.1,<1.4.0 # DB Migrations
+sqlalchemy_utils>=0.36.0,<0.37.0 # DB Utils
+
+# Monitoring
+prometheus_client>=0.7.1,<0.8.0
+
+# Deployment
+gunicorn>=20.0.4,<20.1.0
diff --git a/packages/ml_api/requirements/test_requirements.txt b/packages/ml_api/requirements/test_requirements.txt
new file mode 100644
index 0000000..c909f64
--- /dev/null
+++ b/packages/ml_api/requirements/test_requirements.txt
@@ -0,0 +1,14 @@
+-r requirements.txt
+
+# testing requirements
+pytest>=5.3.2,<6.0.0
+requests>=2.22.0,<2.23.0
+
+# diff test tooling
+termcolor==1.1.0
+yarl==1.3.0
\ No newline at end of file
diff --git a/packages/ml_api/run.py b/packages/ml_api/run.py
new file mode 100644
index 0000000..3e90817
--- /dev/null
+++ b/packages/ml_api/run.py
@@ -0,0 +1,21 @@
+import prometheus_client
+from werkzeug.middleware.dispatcher import DispatcherMiddleware
+
+from api.app import create_app
+from api.config import DevelopmentConfig, setup_app_logging
+
+
+_config = DevelopmentConfig()
+
+# setup logging as early as possible
+setup_app_logging(config=_config)
+
+main_app = create_app(config_object=_config).app
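+# mount the prometheus client's WSGI app at /metrics so that the API and its
+# metrics endpoint are served by the same WSGI server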
+application = DispatcherMiddleware(
+ app=main_app.wsgi_app,
+ mounts={'/metrics': prometheus_client.make_wsgi_app()}
+)
+
+
+if __name__ == "__main__":
+ main_app.run(port=_config.SERVER_PORT, host=_config.SERVER_HOST)
diff --git a/packages/ml_api/scripts/differential_tests.sh b/packages/ml_api/scripts/differential_tests.sh
new file mode 100755
index 0000000..98c8a91
--- /dev/null
+++ b/packages/ml_api/scripts/differential_tests.sh
@@ -0,0 +1,58 @@
+#!/bin/bash
+
+set -euox pipefail
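+# -e: exit on first error; -u: error on unset variables; -x: trace commands;
+# -o pipefail: fail a pipeline if any command within it fails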
+
+MODEL_VERSION="master"
+MODEL_VARIANT="candidate"
+NUMBER_OF_TESTS="50"
+
+CANDIDATE_MODEL_SHA="$(git rev-parse HEAD)"
+
+# required once only (or whenever you make local changes):
+# otherwise, comment out this command and `make tag-push-master` below,
+# as they can take some time.
+make tag-push-local
+
+# should only be run once a model version has been finalized
+# best practice is to run as part of a CI pipeline on merge to master branch.
+make tag-push-master
+
+## Pull latest published image
+env TARGET=master docker-compose --file docker/docker-compose.yml pull
+
+# start latest (master) image and local image
+env TARGET=master SERVER_PORT=5000 docker-compose --project-name master --file docker/docker-compose-ci-master.yml up --no-recreate -d ml_api
+env TARGET=$CANDIDATE_MODEL_SHA SERVER_PORT=5001 docker-compose --project-name head --file docker/docker-compose-ci-candidate.yml up --no-recreate -d ml_api
+
+## Start the test runner containers
+env TARGET=master docker-compose --project-name master --file docker/docker-compose-ci-master.yml run -d --name differential-tests-expected differential-tests sleep infinity
+env TARGET=$CANDIDATE_MODEL_SHA docker-compose --project-name head --file docker/docker-compose-ci-candidate.yml run -d --name differential-tests-actual differential-tests sleep infinity
+
+docker ps --all
+
+echo "===== Running $CANDIDATE_MODEL_SHA ... ====="
+
+## Compute the actual predictions (i.e. candidate model)
+docker exec --user root differential-tests-actual \
+ python3 differential_tests compute sample_payloads differential_tests/actual_results --base-url http://head_ml_api_1:5001
+
+## Copy the actual predictions
+docker cp differential-tests-actual:/opt/app/differential_tests/actual_results/. differential_tests/actual_results
+
+echo "===== Running master ... ====="
+## Compute the expected predictions (i.e. existing model)
+docker exec --user root differential-tests-expected \
+ python3 differential_tests compute sample_payloads differential_tests/expected_results --base-url http://master_ml_api_1:5000
+
+## Copy the expected predictions
+docker cp differential-tests-expected:/opt/app/differential_tests/expected_results/. differential_tests/expected_results
+
+# then copy all results into the differential-tests-actual container for comparison
+docker cp differential_tests/expected_results/. differential-tests-actual:/opt/app/differential_tests/expected_results
+
+echo "===== Comparing $CANDIDATE_MODEL_SHA vs. master ... ====="
+## Compare the expected and actual predictions
+docker exec differential-tests-actual \
+ python3 -m differential_tests compare differential_tests/expected_results differential_tests/actual_results
+
+# clear any docker containers (will stop the script if no containers found)
+docker rm $(docker ps -a -q) -f
diff --git a/packages/ml_api/scripts/populate_database.py b/packages/ml_api/scripts/populate_database.py
new file mode 100644
index 0000000..fa12d88
--- /dev/null
+++ b/packages/ml_api/scripts/populate_database.py
@@ -0,0 +1,109 @@
+import argparse
+import os
+import time
+import typing as t
+from random import randint
+
+import pandas as pd
+import requests
+from gradient_boosting_model.config.core import config
+from gradient_boosting_model.processing.data_management import load_dataset
+
+LOCAL_URL = f'http://{os.getenv("DB_HOST", "localhost")}:5000'
+
+HEADERS = {"Accept": "application/json", "Content-Type": "application/json"}
+
+LOT_AREA_MAP = {"min": 1470, "max": 56600}
+
+FIRST_FLR_SF_MAP = {"min": 407, "max": 5095}
+
+SECOND_FLR_SF_MAP = {"min": 0, "max": 1862}
+
+
+def _generate_random_int(value: int, value_ranges: t.Mapping) -> int:
+ """Generate random integer within a min and max range."""
+ random_value = randint(value_ranges["min"], value_ranges["max"])
+
+ return int(random_value)
+
+
+def _prepare_inputs(dataframe: pd.DataFrame) -> pd.DataFrame:
+ """Prepare input data by removing key rows with NA values."""
+ clean_inputs_df = dataframe.dropna(
+ subset=config.model_config.features + ["KitchenQual", "LotFrontage"]
+ ).copy()
+
+ clean_inputs_df.loc[:, "FirstFlrSF"] = clean_inputs_df["FirstFlrSF"].apply(
+ _generate_random_int, value_ranges=FIRST_FLR_SF_MAP
+ )
+ clean_inputs_df.loc[:, "SecondFlrSF"] = clean_inputs_df["SecondFlrSF"].apply(
+ _generate_random_int, value_ranges=SECOND_FLR_SF_MAP
+ )
+ clean_inputs_df.loc[:, "LotArea"] = clean_inputs_df["LotArea"].apply(
+ _generate_random_int, value_ranges=LOT_AREA_MAP
+ )
+
+ return clean_inputs_df
+
+
+def populate_database(n_predictions: int = 500, anomaly: bool = False) -> None:
+ """
+ Manipulate the test data to generate random
+ predictions and save them to the database.
+ Before running this script, ensure that the
+ API and Database docker containers are running.
+ """
+
+ print(f"Preparing to generate: {n_predictions} predictions.")
+
+ # Load the gradient boosting test dataset which
+ # is included in the model package
+ test_inputs_df = load_dataset(file_name="test.csv")
+ clean_inputs_df = _prepare_inputs(dataframe=test_inputs_df)
+ if len(clean_inputs_df) < n_predictions:
+ print(
+ f"If you want {n_predictions} predictions, you need to"
+ "extend the script to handle more predictions."
+ )
+
+ if anomaly:
+ # set extremely low values to generate an outlier
+ n_predictions = 1
+ clean_inputs_df.loc[:, "FirstFlrSF"] = 1
+ clean_inputs_df.loc[:, "LotArea"] = 1
+ clean_inputs_df.loc[:, "OverallQual"] = 1
+ clean_inputs_df.loc[:, "GrLivArea"] = 1
+
+ for index, data in clean_inputs_df.iterrows():
+ if index > n_predictions:
+ if anomaly:
+ print('Created 1 anomaly')
+ break
+
+ response = requests.post(
+ f"{LOCAL_URL}/v1/predictions/regression",
+ headers=HEADERS,
+ json=[data.to_dict()],
+ )
+ response.raise_for_status()
+
+ if index % 50 == 0:
+ print(f"{index} predictions complete")
+
+ # prevent overloading the server
+ time.sleep(0.5)
+
+ print("Prediction generation complete.")
+
+
+if __name__ == "__main__":
+ anomaly = False
+ parser = argparse.ArgumentParser(
+ description='Send random requests to House Price API.')
+    parser.add_argument('--anomaly', action='store_true', help="generate unusual inputs")
+ args = parser.parse_args()
+ if args.anomaly:
+ print("Generating unusual inputs")
+ anomaly = True
+
+ populate_database(n_predictions=500, anomaly=anomaly)
diff --git a/packages/ml_api/tests/__init__.py b/packages/ml_api/tests/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/packages/ml_api/tests/conftest.py b/packages/ml_api/tests/conftest.py
new file mode 100644
index 0000000..8939e04
--- /dev/null
+++ b/packages/ml_api/tests/conftest.py
@@ -0,0 +1,54 @@
+import os
+
+from unittest import mock
+import pytest
+from gradient_boosting_model.processing.data_management import load_dataset
+from sqlalchemy_utils import create_database, database_exists
+
+from api.app import create_app
+from api.config import TestingConfig
+from api.persistence import core
+
+
+@pytest.fixture(scope='session')
+def _db():
+ db_url = TestingConfig.SQLALCHEMY_DATABASE_URI
+ if not database_exists(db_url):
+ create_database(db_url)
+ # alembic can be configured through the configuration file. For testing
+ # purposes 'env.py' also checks the 'ALEMBIC_DB_URI' variable first.
+ engine = core.create_db_engine_from_config(config=TestingConfig())
+ evars = {"ALEMBIC_DB_URI": db_url}
+ with mock.patch.dict(os.environ, evars):
+ core.run_migrations()
+
+ yield engine
+
+
+@pytest.fixture(scope='session')
+def _db_session(_db):
+ """ Create DB session for testing.
+ """
+ session = core.create_db_session(engine=_db)
+ yield session
+
+
+@pytest.fixture(scope='session')
+def app(_db_session):
+ app = create_app(config_object=TestingConfig(), db_session=_db_session).app
+ with app.app_context():
+ yield app
+
+
+@pytest.fixture
+def client(app):
+ with app.test_client() as client:
+ yield client # Has to be yielded to access session cookies
+
+
+@pytest.fixture
+def test_inputs_df():
+ # Load the gradient boosting test dataset which
+ # is included in the model package
+ test_inputs_df = load_dataset(file_name="test.csv")
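+    # return a deep copy so that tests mutating the dataframe do not
+    # affect one another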
+ return test_inputs_df.copy(deep=True)
diff --git a/packages/ml_api/tests/test_api.py b/packages/ml_api/tests/test_api.py
new file mode 100644
index 0000000..3d9fb5b
--- /dev/null
+++ b/packages/ml_api/tests/test_api.py
@@ -0,0 +1,135 @@
+import json
+import time
+
+import numpy as np
+import pytest
+
+from api.persistence.data_access import SECONDARY_VARIABLES_TO_RENAME
+from api.persistence.models import (
+ GradientBoostingModelPredictions,
+ LassoModelPredictions,
+)
+
+
+@pytest.mark.integration
+def test_health_endpoint(client):
+ # When
+ response = client.get("/")
+
+ # Then
+ assert response.status_code == 200
+ assert json.loads(response.data) == {"status": "ok"}
+
+
+@pytest.mark.integration
+@pytest.mark.parametrize(
+ "api_endpoint, expected_no_predictions",
+ (
+ (
+ "v1/predictions/regression",
+ # test csv contains 1459 rows
+ # we expect 2 rows to be filtered
+ 1451,
+ ),
+ (
+ "v1/predictions/gradient",
+ # we expect 8 rows to be filtered
+ 1457,
+ ),
+ ),
+)
+def test_prediction_endpoint(
+ api_endpoint, expected_no_predictions, client, test_inputs_df
+):
+ # Given
+    # the test dataset (included in the model package) is supplied by the
+    # test_inputs_df fixture in conftest.py
+ if api_endpoint == "v1/predictions/regression":
+ # adjust column names to those expected by the secondary model
+ test_inputs_df.rename(columns=SECONDARY_VARIABLES_TO_RENAME, inplace=True)
+
+ # When
+ response = client.post(api_endpoint, json=test_inputs_df.to_dict(orient="records"))
+
+ # Then
+ assert response.status_code == 200
+ data = json.loads(response.data)
+ assert data["errors"] is None
+ assert len(data["predictions"]) == expected_no_predictions
+
+
+# parameterization allows us to try many combinations of data
+# within the same test; see the pytest docs for details:
+# https://docs.pytest.org/en/latest/parametrize.html
+@pytest.mark.parametrize(
+ "field, field_value, index, expected_error",
+ (
+ (
+ "BldgType",
+ 1, # expected str
+ 33,
+ {"33": {"BldgType": ["Not a valid string."]}},
+ ),
+ (
+ "GarageArea", # model feature
+ "abc", # expected float
+ 45,
+ {"45": {"GarageArea": ["Not a valid number."]}},
+ ),
+ (
+ "CentralAir",
+ np.nan, # nan not allowed
+ 34,
+ {"34": {"CentralAir": ["Field may not be null."]}},
+ ),
+ ("LotArea", "", 2, {"2": {"LotArea": ["Not a valid integer."]}}),
+ ),
+)
+@pytest.mark.integration
+def test_prediction_validation(
+ field, field_value, index, expected_error, client, test_inputs_df
+):
+ # Given
+    # See HouseDataInputSchema in gradient_boosting_model.processing.validation
+    # for the expected values of the inputs to the house price prediction model.
+    # In this test, inputs are changed to incorrect values to check the validation.
+ test_inputs_df.loc[index, field] = field_value
+
+ # When
+ response = client.post(
+ "/v1/predictions/gradient", json=test_inputs_df.to_dict(orient="records")
+ )
+
+ # Then
+ assert response.status_code == 400
+ data = json.loads(response.data)
+ assert data == expected_error
+
+
+@pytest.mark.integration
+def test_prediction_data_saved(client, app, test_inputs_df):
+ # Given
+ initial_gradient_count = app.db_session.query(
+ GradientBoostingModelPredictions
+ ).count()
+ initial_lasso_count = app.db_session.query(LassoModelPredictions).count()
+
+ # When
+ response = client.post(
+ "/v1/predictions/regression", json=test_inputs_df.to_dict(orient="records")
+ )
+
+ # Then
+ assert response.status_code == 200
+ assert (
+ app.db_session.query(LassoModelPredictions).count() == initial_lasso_count + 1
+ )
+
+ # The gradient prediction save occurs on a separate async thread which can take
+ # time to complete. We pause the test briefly to allow the save operation to finish.
+ time.sleep(2)
+ assert (
+ app.db_session.query(GradientBoostingModelPredictions).count()
+ == initial_gradient_count + 1
+ )
diff --git a/packages/ml_api/tests/test_back_to_back_models.py b/packages/ml_api/tests/test_back_to_back_models.py
new file mode 100644
index 0000000..af98797
--- /dev/null
+++ b/packages/ml_api/tests/test_back_to_back_models.py
@@ -0,0 +1,37 @@
+import json
+
+import pytest
+from gradient_boosting_model.processing.data_management import load_dataset
+
+from api.persistence.data_access import SECONDARY_VARIABLES_TO_RENAME
+from differential_tests.compare import compare_differences
+
+
+@pytest.mark.differential
+def test_model_prediction_differentials(client):
+ test_inputs_df = load_dataset(file_name="test.csv")
+ old_model_inputs_df = test_inputs_df.rename(
+ columns=SECONDARY_VARIABLES_TO_RENAME
+ )
+
+ new_model_response = client.post(
+ "v1/predictions/gradient", json=test_inputs_df.to_dict(orient="records")
+ )
+ new_model_predictions = json.loads(new_model_response.data)["predictions"]
+
+ old_model_response = client.post(
+ "v1/predictions/regression",
+ json=old_model_inputs_df.to_dict(orient="records"),
+ )
+ old_model_predictions = json.loads(old_model_response.data)["predictions"]
+
+    # We only compare the first 10 rows: the two models' validation differs,
+    # so they filter out slightly different numbers of rows, which would
+    # otherwise cause the differential tests to fail.
+ compare_differences(
+ expected_predictions=new_model_predictions[:10],
+ actual_predictions=old_model_predictions[:10],
+        # You would tune the rel_tol parameter for your own model;
+        # right now it is extremely permissive of variation.
+ rel_tol=0.2,
+ )
diff --git a/packages/ml_api/tests/test_persistence.py b/packages/ml_api/tests/test_persistence.py
new file mode 100644
index 0000000..8172157
--- /dev/null
+++ b/packages/ml_api/tests/test_persistence.py
@@ -0,0 +1,36 @@
+from unittest import mock
+import pytest
+
+from api.persistence.data_access import PredictionPersistence, ModelType
+
+from api.persistence.models import (
+ GradientBoostingModelPredictions,
+ LassoModelPredictions,
+)
+
+
+# parametrization allows us to try many combinations of data
+# within the same test; see the pytest docs for details:
+# https://docs.pytest.org/en/latest/parametrize.html
+@pytest.mark.parametrize(
+ "model_type, model,",
+ (
+ (ModelType.GRADIENT_BOOSTING, GradientBoostingModelPredictions),
+ (ModelType.LASSO, LassoModelPredictions),
+ ),
+)
+def test_data_access(model_type, model, test_inputs_df):
+ # Given
+ # We mock the database session
+ mock_session = mock.MagicMock()
+ _persistence = PredictionPersistence(db_session=mock_session)
+
+ # When
+ _persistence.make_save_predictions(
+ db_model=model_type, input_data=test_inputs_df.to_dict(orient="records")
+ )
+
+ # Then
+ assert mock_session.commit.call_count == 1
+ assert mock_session.add.call_count == 1
+ assert isinstance(mock_session.add.call_args[0][0], model)
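The assertions above rely on the fact that a MagicMock records every call made on it, so the test can verify the persistence layer's interaction with the session without a real database. A self-contained illustration of that pattern, independent of this codebase:

from unittest import mock

session = mock.MagicMock()
session.add("row-1")  # recorded but performs no real work
session.commit()

assert session.add.call_count == 1
assert session.commit.call_count == 1
# call_args[0] is the tuple of positional arguments from the last call
assert session.add.call_args[0][0] == "row-1"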
diff --git a/packages/ml_api/tox.ini b/packages/ml_api/tox.ini
new file mode 100644
index 0000000..ef439aa
--- /dev/null
+++ b/packages/ml_api/tox.ini
@@ -0,0 +1,138 @@
+[tox]
+envlist = integration_tests,unit_tests,differential_tests,typechecks,stylechecks
+skipsdist = True
+
+
+[testenv]
+install_command = pip install {opts} {packages}
+
+deps =
+ -rrequirements/test_requirements.txt
+
+passenv =
+# A list of wildcard environment variable names which shall be copied from
+# the tox invocation environment to the test environment when executing test commands
+ DB_*
+ SHADOW_MODE_ACTIVE
+
+commands=
+ py.test
+
+
+[testenv:integration_tests]
+envdir = {toxworkdir}/integration_tests
+deps =
+ {[testenv]deps}
+
+passenv =
+ {[testenv]passenv}
+
+setenv =
+ PYTHONPATH=.
+ DB_USER={env:DB_USER:test_user}
+ DB_PASSWORD={env:DB_PASSWORD:password}
+ DB_PORT={env:DB_PORT:6608}
+ DB_HOST={env:DB_HOST:0.0.0.0}
+ DB_NAME={env:DB_NAME:ml_api_test}
+ SHADOW_MODE_ACTIVE={env:SHADOW_MODE_ACTIVE:true}
+
+commands =
+ pytest \
+ -s \
+ -vv \
+ -m integration \
+ {posargs:tests/}
+
+
+[testenv:unit_tests]
+envdir = {toxworkdir}/integration_tests
+deps =
+ {[testenv]deps}
+
+passenv =
+ {[testenv]passenv}
+
+setenv =
+ PYTHONPATH=.
+
+commands =
+ pytest \
+ -s \
+ -vv \
+ -m "not integration and not differential" \
+ {posargs:tests/}
+
+
+[testenv:differential_tests]
+envdir = {toxworkdir}/integration_tests
+deps =
+ {[testenv]deps}
+
+passenv =
+ {[testenv]passenv}
+
+setenv =
+ PYTHONPATH=.
+ DB_USER={env:DB_USER:test_user}
+ DB_PASSWORD={env:DB_PASSWORD:password}
+ DB_PORT={env:DB_PORT:6608}
+ DB_HOST={env:DB_HOST:0.0.0.0}
+ DB_NAME={env:DB_NAME:ml_api_test}
+ SHADOW_MODE_ACTIVE={env:SHADOW_MODE_ACTIVE:true}
+
+commands =
+ pytest \
+ -s \
+ -vv \
+ -m differential \
+ {posargs:tests/}
+
+
+[testenv:generate_predictions]
+envdir = {toxworkdir}/generate_predictions
+deps =
+ {[testenv]deps}
+
+passenv =
+ {[testenv]passenv}
+
+setenv =
+ PYTHONPATH=.
+ DB_HOST={env:DB_HOST:localhost}
+
+commands = python scripts/populate_database.py
+
+
+[testenv:typechecks]
+envdir = {toxworkdir}/integration_tests
+
+deps =
+ {[testenv:integration_tests]deps}
+
+commands = {posargs:mypy api}
+
+
+[testenv:stylechecks]
+envdir = {toxworkdir}/integration_tests
+
+deps =
+ {[testenv:integration_tests]deps}
+
+commands = {posargs:flake8 api tests}
+
+
+[flake8]
+exclude = .git,env
+max-line-length = 90
+
+
+[pytest]
+markers =
+ integration: mark a test as an integration test.
+ differential: mark a test as a differential test.
+
+filterwarnings =
+ ignore::DeprecationWarning
+ ignore::RuntimeWarning
+ ignore::UserWarning
+ ignore::FutureWarning
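With this tox.ini in place, each environment can be invoked on its own from packages/ml_api; the commands below are illustrative, and the DB_* variables fall back to the defaults declared above when not set in the shell:

    tox -e unit_tests
    tox -e integration_tests
    tox -e differential_tests -- tests/test_back_to_back_models.py
    tox -e typechecks,stylechecks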
diff --git a/research_phase/gradient_boosting_model.ipynb b/research_phase/gradient_boosting_model.ipynb
new file mode 100755
index 0000000..9640fb3
--- /dev/null
+++ b/research_phase/gradient_boosting_model.ipynb
@@ -0,0 +1,2230 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from math import sqrt\n",
+ "import pandas as pd\n",
+ "import numpy as np\n",
+ "import matplotlib.pyplot as plt\n",
+ "\n",
+ "# for the ML pipeline\n",
+ "from sklearn.model_selection import train_test_split\n",
+ "from sklearn.impute import SimpleImputer\n",
+ "from sklearn.preprocessing import OrdinalEncoder\n",
+ "from sklearn.ensemble import GradientBoostingRegressor\n",
+ "from sklearn.feature_selection import SelectFromModel\n",
+ "from sklearn.metrics import mean_squared_error, r2_score\n",
+ "\n",
+ "# from sklearn.pipeline import Pipeline\n",
+ "# from sklearn.compose import ColumnTransformer\n",
+ "\n",
+ "from feature_engine.categorical_encoders import RareLabelCategoricalEncoder\n",
+ "\n",
+ "# to visualise al the columns in the dataframe\n",
+ "pd.pandas.set_option('display.max_columns', None)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "(1460, 81)\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Id | \n",
+ " MSSubClass | \n",
+ " MSZoning | \n",
+ " LotFrontage | \n",
+ " LotArea | \n",
+ " Street | \n",
+ " Alley | \n",
+ " LotShape | \n",
+ " LandContour | \n",
+ " Utilities | \n",
+ " LotConfig | \n",
+ " LandSlope | \n",
+ " Neighborhood | \n",
+ " Condition1 | \n",
+ " Condition2 | \n",
+ " BldgType | \n",
+ " HouseStyle | \n",
+ " OverallQual | \n",
+ " OverallCond | \n",
+ " YearBuilt | \n",
+ " YearRemodAdd | \n",
+ " RoofStyle | \n",
+ " RoofMatl | \n",
+ " Exterior1st | \n",
+ " Exterior2nd | \n",
+ " MasVnrType | \n",
+ " MasVnrArea | \n",
+ " ExterQual | \n",
+ " ExterCond | \n",
+ " Foundation | \n",
+ " BsmtQual | \n",
+ " BsmtCond | \n",
+ " BsmtExposure | \n",
+ " BsmtFinType1 | \n",
+ " BsmtFinSF1 | \n",
+ " BsmtFinType2 | \n",
+ " BsmtFinSF2 | \n",
+ " BsmtUnfSF | \n",
+ " TotalBsmtSF | \n",
+ " Heating | \n",
+ " HeatingQC | \n",
+ " CentralAir | \n",
+ " Electrical | \n",
+ " 1stFlrSF | \n",
+ " 2ndFlrSF | \n",
+ " LowQualFinSF | \n",
+ " GrLivArea | \n",
+ " BsmtFullBath | \n",
+ " BsmtHalfBath | \n",
+ " FullBath | \n",
+ " HalfBath | \n",
+ " BedroomAbvGr | \n",
+ " KitchenAbvGr | \n",
+ " KitchenQual | \n",
+ " TotRmsAbvGrd | \n",
+ " Functional | \n",
+ " Fireplaces | \n",
+ " FireplaceQu | \n",
+ " GarageType | \n",
+ " GarageYrBlt | \n",
+ " GarageFinish | \n",
+ " GarageCars | \n",
+ " GarageArea | \n",
+ " GarageQual | \n",
+ " GarageCond | \n",
+ " PavedDrive | \n",
+ " WoodDeckSF | \n",
+ " OpenPorchSF | \n",
+ " EnclosedPorch | \n",
+ " 3SsnPorch | \n",
+ " ScreenPorch | \n",
+ " PoolArea | \n",
+ " PoolQC | \n",
+ " Fence | \n",
+ " MiscFeature | \n",
+ " MiscVal | \n",
+ " MoSold | \n",
+ " YrSold | \n",
+ " SaleType | \n",
+ " SaleCondition | \n",
+ " SalePrice | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 60 | \n",
+ " RL | \n",
+ " 65.0 | \n",
+ " 8450 | \n",
+ " Pave | \n",
+ " NaN | \n",
+ " Reg | \n",
+ " Lvl | \n",
+ " AllPub | \n",
+ " Inside | \n",
+ " Gtl | \n",
+ " CollgCr | \n",
+ " Norm | \n",
+ " Norm | \n",
+ " 1Fam | \n",
+ " 2Story | \n",
+ " 7 | \n",
+ " 5 | \n",
+ " 2003 | \n",
+ " 2003 | \n",
+ " Gable | \n",
+ " CompShg | \n",
+ " VinylSd | \n",
+ " VinylSd | \n",
+ " BrkFace | \n",
+ " 196.0 | \n",
+ " Gd | \n",
+ " TA | \n",
+ " PConc | \n",
+ " Gd | \n",
+ " TA | \n",
+ " No | \n",
+ " GLQ | \n",
+ " 706 | \n",
+ " Unf | \n",
+ " 0 | \n",
+ " 150 | \n",
+ " 856 | \n",
+ " GasA | \n",
+ " Ex | \n",
+ " Y | \n",
+ " SBrkr | \n",
+ " 856 | \n",
+ " 854 | \n",
+ " 0 | \n",
+ " 1710 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 3 | \n",
+ " 1 | \n",
+ " Gd | \n",
+ " 8 | \n",
+ " Typ | \n",
+ " 0 | \n",
+ " NaN | \n",
+ " Attchd | \n",
+ " 2003.0 | \n",
+ " RFn | \n",
+ " 2 | \n",
+ " 548 | \n",
+ " TA | \n",
+ " TA | \n",
+ " Y | \n",
+ " 0 | \n",
+ " 61 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 0 | \n",
+ " 2 | \n",
+ " 2008 | \n",
+ " WD | \n",
+ " Normal | \n",
+ " 208500 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 20 | \n",
+ " RL | \n",
+ " 80.0 | \n",
+ " 9600 | \n",
+ " Pave | \n",
+ " NaN | \n",
+ " Reg | \n",
+ " Lvl | \n",
+ " AllPub | \n",
+ " FR2 | \n",
+ " Gtl | \n",
+ " Veenker | \n",
+ " Feedr | \n",
+ " Norm | \n",
+ " 1Fam | \n",
+ " 1Story | \n",
+ " 6 | \n",
+ " 8 | \n",
+ " 1976 | \n",
+ " 1976 | \n",
+ " Gable | \n",
+ " CompShg | \n",
+ " MetalSd | \n",
+ " MetalSd | \n",
+ " None | \n",
+ " 0.0 | \n",
+ " TA | \n",
+ " TA | \n",
+ " CBlock | \n",
+ " Gd | \n",
+ " TA | \n",
+ " Gd | \n",
+ " ALQ | \n",
+ " 978 | \n",
+ " Unf | \n",
+ " 0 | \n",
+ " 284 | \n",
+ " 1262 | \n",
+ " GasA | \n",
+ " Ex | \n",
+ " Y | \n",
+ " SBrkr | \n",
+ " 1262 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1262 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 0 | \n",
+ " 3 | \n",
+ " 1 | \n",
+ " TA | \n",
+ " 6 | \n",
+ " Typ | \n",
+ " 1 | \n",
+ " TA | \n",
+ " Attchd | \n",
+ " 1976.0 | \n",
+ " RFn | \n",
+ " 2 | \n",
+ " 460 | \n",
+ " TA | \n",
+ " TA | \n",
+ " Y | \n",
+ " 298 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 0 | \n",
+ " 5 | \n",
+ " 2007 | \n",
+ " WD | \n",
+ " Normal | \n",
+ " 181500 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 3 | \n",
+ " 60 | \n",
+ " RL | \n",
+ " 68.0 | \n",
+ " 11250 | \n",
+ " Pave | \n",
+ " NaN | \n",
+ " IR1 | \n",
+ " Lvl | \n",
+ " AllPub | \n",
+ " Inside | \n",
+ " Gtl | \n",
+ " CollgCr | \n",
+ " Norm | \n",
+ " Norm | \n",
+ " 1Fam | \n",
+ " 2Story | \n",
+ " 7 | \n",
+ " 5 | \n",
+ " 2001 | \n",
+ " 2002 | \n",
+ " Gable | \n",
+ " CompShg | \n",
+ " VinylSd | \n",
+ " VinylSd | \n",
+ " BrkFace | \n",
+ " 162.0 | \n",
+ " Gd | \n",
+ " TA | \n",
+ " PConc | \n",
+ " Gd | \n",
+ " TA | \n",
+ " Mn | \n",
+ " GLQ | \n",
+ " 486 | \n",
+ " Unf | \n",
+ " 0 | \n",
+ " 434 | \n",
+ " 920 | \n",
+ " GasA | \n",
+ " Ex | \n",
+ " Y | \n",
+ " SBrkr | \n",
+ " 920 | \n",
+ " 866 | \n",
+ " 0 | \n",
+ " 1786 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 3 | \n",
+ " 1 | \n",
+ " Gd | \n",
+ " 6 | \n",
+ " Typ | \n",
+ " 1 | \n",
+ " TA | \n",
+ " Attchd | \n",
+ " 2001.0 | \n",
+ " RFn | \n",
+ " 2 | \n",
+ " 608 | \n",
+ " TA | \n",
+ " TA | \n",
+ " Y | \n",
+ " 0 | \n",
+ " 42 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 0 | \n",
+ " 9 | \n",
+ " 2008 | \n",
+ " WD | \n",
+ " Normal | \n",
+ " 223500 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 4 | \n",
+ " 70 | \n",
+ " RL | \n",
+ " 60.0 | \n",
+ " 9550 | \n",
+ " Pave | \n",
+ " NaN | \n",
+ " IR1 | \n",
+ " Lvl | \n",
+ " AllPub | \n",
+ " Corner | \n",
+ " Gtl | \n",
+ " Crawfor | \n",
+ " Norm | \n",
+ " Norm | \n",
+ " 1Fam | \n",
+ " 2Story | \n",
+ " 7 | \n",
+ " 5 | \n",
+ " 1915 | \n",
+ " 1970 | \n",
+ " Gable | \n",
+ " CompShg | \n",
+ " Wd Sdng | \n",
+ " Wd Shng | \n",
+ " None | \n",
+ " 0.0 | \n",
+ " TA | \n",
+ " TA | \n",
+ " BrkTil | \n",
+ " TA | \n",
+ " Gd | \n",
+ " No | \n",
+ " ALQ | \n",
+ " 216 | \n",
+ " Unf | \n",
+ " 0 | \n",
+ " 540 | \n",
+ " 756 | \n",
+ " GasA | \n",
+ " Gd | \n",
+ " Y | \n",
+ " SBrkr | \n",
+ " 961 | \n",
+ " 756 | \n",
+ " 0 | \n",
+ " 1717 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 3 | \n",
+ " 1 | \n",
+ " Gd | \n",
+ " 7 | \n",
+ " Typ | \n",
+ " 1 | \n",
+ " Gd | \n",
+ " Detchd | \n",
+ " 1998.0 | \n",
+ " Unf | \n",
+ " 3 | \n",
+ " 642 | \n",
+ " TA | \n",
+ " TA | \n",
+ " Y | \n",
+ " 0 | \n",
+ " 35 | \n",
+ " 272 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 0 | \n",
+ " 2 | \n",
+ " 2006 | \n",
+ " WD | \n",
+ " Abnorml | \n",
+ " 140000 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 5 | \n",
+ " 60 | \n",
+ " RL | \n",
+ " 84.0 | \n",
+ " 14260 | \n",
+ " Pave | \n",
+ " NaN | \n",
+ " IR1 | \n",
+ " Lvl | \n",
+ " AllPub | \n",
+ " FR2 | \n",
+ " Gtl | \n",
+ " NoRidge | \n",
+ " Norm | \n",
+ " Norm | \n",
+ " 1Fam | \n",
+ " 2Story | \n",
+ " 8 | \n",
+ " 5 | \n",
+ " 2000 | \n",
+ " 2000 | \n",
+ " Gable | \n",
+ " CompShg | \n",
+ " VinylSd | \n",
+ " VinylSd | \n",
+ " BrkFace | \n",
+ " 350.0 | \n",
+ " Gd | \n",
+ " TA | \n",
+ " PConc | \n",
+ " Gd | \n",
+ " TA | \n",
+ " Av | \n",
+ " GLQ | \n",
+ " 655 | \n",
+ " Unf | \n",
+ " 0 | \n",
+ " 490 | \n",
+ " 1145 | \n",
+ " GasA | \n",
+ " Ex | \n",
+ " Y | \n",
+ " SBrkr | \n",
+ " 1145 | \n",
+ " 1053 | \n",
+ " 0 | \n",
+ " 2198 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 4 | \n",
+ " 1 | \n",
+ " Gd | \n",
+ " 9 | \n",
+ " Typ | \n",
+ " 1 | \n",
+ " TA | \n",
+ " Attchd | \n",
+ " 2000.0 | \n",
+ " RFn | \n",
+ " 3 | \n",
+ " 836 | \n",
+ " TA | \n",
+ " TA | \n",
+ " Y | \n",
+ " 192 | \n",
+ " 84 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 0 | \n",
+ " 12 | \n",
+ " 2008 | \n",
+ " WD | \n",
+ " Normal | \n",
+ " 250000 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape \\\n",
+ "0 1 60 RL 65.0 8450 Pave NaN Reg \n",
+ "1 2 20 RL 80.0 9600 Pave NaN Reg \n",
+ "2 3 60 RL 68.0 11250 Pave NaN IR1 \n",
+ "3 4 70 RL 60.0 9550 Pave NaN IR1 \n",
+ "4 5 60 RL 84.0 14260 Pave NaN IR1 \n",
+ "\n",
+ " LandContour Utilities LotConfig LandSlope Neighborhood Condition1 \\\n",
+ "0 Lvl AllPub Inside Gtl CollgCr Norm \n",
+ "1 Lvl AllPub FR2 Gtl Veenker Feedr \n",
+ "2 Lvl AllPub Inside Gtl CollgCr Norm \n",
+ "3 Lvl AllPub Corner Gtl Crawfor Norm \n",
+ "4 Lvl AllPub FR2 Gtl NoRidge Norm \n",
+ "\n",
+ " Condition2 BldgType HouseStyle OverallQual OverallCond YearBuilt \\\n",
+ "0 Norm 1Fam 2Story 7 5 2003 \n",
+ "1 Norm 1Fam 1Story 6 8 1976 \n",
+ "2 Norm 1Fam 2Story 7 5 2001 \n",
+ "3 Norm 1Fam 2Story 7 5 1915 \n",
+ "4 Norm 1Fam 2Story 8 5 2000 \n",
+ "\n",
+ " YearRemodAdd RoofStyle RoofMatl Exterior1st Exterior2nd MasVnrType \\\n",
+ "0 2003 Gable CompShg VinylSd VinylSd BrkFace \n",
+ "1 1976 Gable CompShg MetalSd MetalSd None \n",
+ "2 2002 Gable CompShg VinylSd VinylSd BrkFace \n",
+ "3 1970 Gable CompShg Wd Sdng Wd Shng None \n",
+ "4 2000 Gable CompShg VinylSd VinylSd BrkFace \n",
+ "\n",
+ " MasVnrArea ExterQual ExterCond Foundation BsmtQual BsmtCond BsmtExposure \\\n",
+ "0 196.0 Gd TA PConc Gd TA No \n",
+ "1 0.0 TA TA CBlock Gd TA Gd \n",
+ "2 162.0 Gd TA PConc Gd TA Mn \n",
+ "3 0.0 TA TA BrkTil TA Gd No \n",
+ "4 350.0 Gd TA PConc Gd TA Av \n",
+ "\n",
+ " BsmtFinType1 BsmtFinSF1 BsmtFinType2 BsmtFinSF2 BsmtUnfSF TotalBsmtSF \\\n",
+ "0 GLQ 706 Unf 0 150 856 \n",
+ "1 ALQ 978 Unf 0 284 1262 \n",
+ "2 GLQ 486 Unf 0 434 920 \n",
+ "3 ALQ 216 Unf 0 540 756 \n",
+ "4 GLQ 655 Unf 0 490 1145 \n",
+ "\n",
+ " Heating HeatingQC CentralAir Electrical 1stFlrSF 2ndFlrSF LowQualFinSF \\\n",
+ "0 GasA Ex Y SBrkr 856 854 0 \n",
+ "1 GasA Ex Y SBrkr 1262 0 0 \n",
+ "2 GasA Ex Y SBrkr 920 866 0 \n",
+ "3 GasA Gd Y SBrkr 961 756 0 \n",
+ "4 GasA Ex Y SBrkr 1145 1053 0 \n",
+ "\n",
+ " GrLivArea BsmtFullBath BsmtHalfBath FullBath HalfBath BedroomAbvGr \\\n",
+ "0 1710 1 0 2 1 3 \n",
+ "1 1262 0 1 2 0 3 \n",
+ "2 1786 1 0 2 1 3 \n",
+ "3 1717 1 0 1 0 3 \n",
+ "4 2198 1 0 2 1 4 \n",
+ "\n",
+ " KitchenAbvGr KitchenQual TotRmsAbvGrd Functional Fireplaces FireplaceQu \\\n",
+ "0 1 Gd 8 Typ 0 NaN \n",
+ "1 1 TA 6 Typ 1 TA \n",
+ "2 1 Gd 6 Typ 1 TA \n",
+ "3 1 Gd 7 Typ 1 Gd \n",
+ "4 1 Gd 9 Typ 1 TA \n",
+ "\n",
+ " GarageType GarageYrBlt GarageFinish GarageCars GarageArea GarageQual \\\n",
+ "0 Attchd 2003.0 RFn 2 548 TA \n",
+ "1 Attchd 1976.0 RFn 2 460 TA \n",
+ "2 Attchd 2001.0 RFn 2 608 TA \n",
+ "3 Detchd 1998.0 Unf 3 642 TA \n",
+ "4 Attchd 2000.0 RFn 3 836 TA \n",
+ "\n",
+ " GarageCond PavedDrive WoodDeckSF OpenPorchSF EnclosedPorch 3SsnPorch \\\n",
+ "0 TA Y 0 61 0 0 \n",
+ "1 TA Y 298 0 0 0 \n",
+ "2 TA Y 0 42 0 0 \n",
+ "3 TA Y 0 35 272 0 \n",
+ "4 TA Y 192 84 0 0 \n",
+ "\n",
+ " ScreenPorch PoolArea PoolQC Fence MiscFeature MiscVal MoSold YrSold \\\n",
+ "0 0 0 NaN NaN NaN 0 2 2008 \n",
+ "1 0 0 NaN NaN NaN 0 5 2007 \n",
+ "2 0 0 NaN NaN NaN 0 9 2008 \n",
+ "3 0 0 NaN NaN NaN 0 2 2006 \n",
+ "4 0 0 NaN NaN NaN 0 12 2008 \n",
+ "\n",
+ " SaleType SaleCondition SalePrice \n",
+ "0 WD Normal 208500 \n",
+ "1 WD Normal 181500 \n",
+ "2 WD Normal 223500 \n",
+ "3 WD Abnorml 140000 \n",
+ "4 WD Normal 250000 "
+ ]
+ },
+ "execution_count": 2,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# load dataset\n",
+ "data = pd.read_csv('houseprice.csv')\n",
+ "\n",
+ "print(data.shape)\n",
+ "data.head()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Separate dataset into train and test\n",
+ "\n",
+ "Before beginning to engineer our features, it is important to separate our data intro training and testing set. This is to avoid over-fitting. This step involves randomness, therefore, we need to set the seed."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "((1314, 80), (146, 80), (1314,), (146,))"
+ ]
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Let's separate into train and test set\n",
+ "\n",
+ "X_train, X_test, y_train, y_test = train_test_split(\n",
+ " data.drop('SalePrice', axis=1), # predictors\n",
+ " data.SalePrice, # target\n",
+ " test_size=0.1,\n",
+ " random_state=0) # for reproducibility\n",
+ "\n",
+ "X_train.shape, X_test.shape, y_train.shape, y_test.shape"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Missing values"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [],
+ "source": [
+ "# make a list of the categorical variables that contain missing values\n",
+ "vars_dates = ['YearBuilt', 'YearRemodAdd', 'GarageYrBlt']\n",
+ "vars_cat = [var for var in X_train.columns if X_train[var].dtypes=='O']\n",
+ "vars_num = [var for var in X_train.columns if X_train[var].dtypes!='O' and var not in ['Id']]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "GarageYrBlt 0.056317\n",
+ "YearRemodAdd 0.000000\n",
+ "YearBuilt 0.000000\n",
+ "dtype: float64"
+ ]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "X_train[vars_dates].isnull().mean().sort_values(ascending=False)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "LotFrontage 0.177321\n",
+ "GarageYrBlt 0.056317\n",
+ "MasVnrArea 0.004566\n",
+ "YrSold 0.000000\n",
+ "BsmtFinSF2 0.000000\n",
+ "LowQualFinSF 0.000000\n",
+ "2ndFlrSF 0.000000\n",
+ "1stFlrSF 0.000000\n",
+ "TotalBsmtSF 0.000000\n",
+ "BsmtUnfSF 0.000000\n",
+ "BsmtFinSF1 0.000000\n",
+ "BsmtFullBath 0.000000\n",
+ "YearRemodAdd 0.000000\n",
+ "YearBuilt 0.000000\n",
+ "OverallCond 0.000000\n",
+ "OverallQual 0.000000\n",
+ "LotArea 0.000000\n",
+ "GrLivArea 0.000000\n",
+ "BsmtHalfBath 0.000000\n",
+ "MoSold 0.000000\n",
+ "WoodDeckSF 0.000000\n",
+ "MiscVal 0.000000\n",
+ "PoolArea 0.000000\n",
+ "ScreenPorch 0.000000\n",
+ "3SsnPorch 0.000000\n",
+ "EnclosedPorch 0.000000\n",
+ "OpenPorchSF 0.000000\n",
+ "GarageArea 0.000000\n",
+ "FullBath 0.000000\n",
+ "GarageCars 0.000000\n",
+ "Fireplaces 0.000000\n",
+ "TotRmsAbvGrd 0.000000\n",
+ "KitchenAbvGr 0.000000\n",
+ "BedroomAbvGr 0.000000\n",
+ "HalfBath 0.000000\n",
+ "MSSubClass 0.000000\n",
+ "dtype: float64"
+ ]
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "X_train[vars_num].isnull().mean().sort_values(ascending=False)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "PoolQC 0.995434\n",
+ "MiscFeature 0.961187\n",
+ "Alley 0.938356\n",
+ "Fence 0.814307\n",
+ "FireplaceQu 0.472603\n",
+ "GarageCond 0.056317\n",
+ "GarageQual 0.056317\n",
+ "GarageFinish 0.056317\n",
+ "GarageType 0.056317\n",
+ "BsmtFinType2 0.025114\n",
+ "BsmtExposure 0.025114\n",
+ "BsmtFinType1 0.024353\n",
+ "BsmtQual 0.024353\n",
+ "BsmtCond 0.024353\n",
+ "MasVnrType 0.004566\n",
+ "Electrical 0.000761\n",
+ "Condition2 0.000000\n",
+ "Condition1 0.000000\n",
+ "Neighborhood 0.000000\n",
+ "LandSlope 0.000000\n",
+ "BldgType 0.000000\n",
+ "LandContour 0.000000\n",
+ "LotConfig 0.000000\n",
+ "Utilities 0.000000\n",
+ "RoofStyle 0.000000\n",
+ "LotShape 0.000000\n",
+ "Street 0.000000\n",
+ "HouseStyle 0.000000\n",
+ "SaleCondition 0.000000\n",
+ "RoofMatl 0.000000\n",
+ "Exterior1st 0.000000\n",
+ "Exterior2nd 0.000000\n",
+ "ExterQual 0.000000\n",
+ "ExterCond 0.000000\n",
+ "Foundation 0.000000\n",
+ "SaleType 0.000000\n",
+ "Heating 0.000000\n",
+ "HeatingQC 0.000000\n",
+ "CentralAir 0.000000\n",
+ "KitchenQual 0.000000\n",
+ "Functional 0.000000\n",
+ "PavedDrive 0.000000\n",
+ "MSZoning 0.000000\n",
+ "dtype: float64"
+ ]
+ },
+ "execution_count": 7,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "X_train[vars_cat].isnull().mean().sort_values(ascending=False)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [],
+ "source": [
+ "# imputation numerical variables\n",
+ "imputer = SimpleImputer(strategy='constant', fill_value=-1)\n",
+ "X_train['LotFrontage'] = imputer.fit_transform(X_train['LotFrontage'].to_frame())\n",
+ "X_test['LotFrontage'] = imputer.transform(X_test['LotFrontage'].to_frame())\n",
+ "\n",
+ "imputer = SimpleImputer(strategy='most_frequent')\n",
+ "X_train[vars_num] = imputer.fit_transform(X_train[vars_num])\n",
+ "X_test[vars_num] = imputer.fit_transform(X_test[vars_num])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# imputation categorical variables\n",
+ "imputer = SimpleImputer(strategy='constant', fill_value='missing')\n",
+ "X_train[vars_cat] = imputer.fit_transform(X_train[vars_cat])\n",
+ "X_test[vars_cat] = imputer.fit_transform(X_test[vars_cat])"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Temporal variables"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# let's explore the relationship between the year variables and the house price in a bit of more details\n",
+ "\n",
+ "def elapsed_years(df, var):\n",
+ " # capture difference between year variable and year the house was sold\n",
+ " df[var] = df['YrSold'] - df[var]\n",
+ " return df"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "for var in ['YearBuilt', 'YearRemodAdd', 'GarageYrBlt']:\n",
+ " X_train = elapsed_years(X_train, var)\n",
+ " X_test = elapsed_years(X_test, var)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "[]"
+ ]
+ },
+ "execution_count": 12,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# check that test set does not contain null values in the engineered variables\n",
+ "[vr for var in ['YearBuilt', 'YearRemodAdd', 'GarageYrBlt'] if X_test[var].isnull().sum()>0]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Categorical variable encoding"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "[]"
+ ]
+ },
+ "execution_count": 13,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "[var for var in X_train.columns if X_train[var].isnull().sum()>0]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "[]"
+ ]
+ },
+ "execution_count": 14,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "[var for var in X_train.columns if X_test[var].isnull().sum()>0]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# remove rare caregories\n",
+ "\n",
+ "rare_enc = RareLabelCategoricalEncoder(tol=0.01, n_categories=5, variables = vars_cat)\n",
+ "rare_enc.fit(X_train)\n",
+ "X_train = rare_enc.transform(X_train)\n",
+ "X_test = rare_enc.transform(X_test)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# encode with labels\n",
+ "\n",
+ "ordinal_enc = OrdinalEncoder()\n",
+ "X_train[vars_cat] = ordinal_enc.fit_transform(X_train[vars_cat])\n",
+ "X_test[vars_cat] = ordinal_enc.transform(X_test[vars_cat])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "[]"
+ ]
+ },
+ "execution_count": 17,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "[var for var in X_train.columns if X_test[var].isnull().sum()>0]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Id | \n",
+ " MSSubClass | \n",
+ " MSZoning | \n",
+ " LotFrontage | \n",
+ " LotArea | \n",
+ " Street | \n",
+ " Alley | \n",
+ " LotShape | \n",
+ " LandContour | \n",
+ " Utilities | \n",
+ " LotConfig | \n",
+ " LandSlope | \n",
+ " Neighborhood | \n",
+ " Condition1 | \n",
+ " Condition2 | \n",
+ " BldgType | \n",
+ " HouseStyle | \n",
+ " OverallQual | \n",
+ " OverallCond | \n",
+ " YearBuilt | \n",
+ " YearRemodAdd | \n",
+ " RoofStyle | \n",
+ " RoofMatl | \n",
+ " Exterior1st | \n",
+ " Exterior2nd | \n",
+ " MasVnrType | \n",
+ " MasVnrArea | \n",
+ " ExterQual | \n",
+ " ExterCond | \n",
+ " Foundation | \n",
+ " BsmtQual | \n",
+ " BsmtCond | \n",
+ " BsmtExposure | \n",
+ " BsmtFinType1 | \n",
+ " BsmtFinSF1 | \n",
+ " BsmtFinType2 | \n",
+ " BsmtFinSF2 | \n",
+ " BsmtUnfSF | \n",
+ " TotalBsmtSF | \n",
+ " Heating | \n",
+ " HeatingQC | \n",
+ " CentralAir | \n",
+ " Electrical | \n",
+ " 1stFlrSF | \n",
+ " 2ndFlrSF | \n",
+ " LowQualFinSF | \n",
+ " GrLivArea | \n",
+ " BsmtFullBath | \n",
+ " BsmtHalfBath | \n",
+ " FullBath | \n",
+ " HalfBath | \n",
+ " BedroomAbvGr | \n",
+ " KitchenAbvGr | \n",
+ " KitchenQual | \n",
+ " TotRmsAbvGrd | \n",
+ " Functional | \n",
+ " Fireplaces | \n",
+ " FireplaceQu | \n",
+ " GarageType | \n",
+ " GarageYrBlt | \n",
+ " GarageFinish | \n",
+ " GarageCars | \n",
+ " GarageArea | \n",
+ " GarageQual | \n",
+ " GarageCond | \n",
+ " PavedDrive | \n",
+ " WoodDeckSF | \n",
+ " OpenPorchSF | \n",
+ " EnclosedPorch | \n",
+ " 3SsnPorch | \n",
+ " ScreenPorch | \n",
+ " PoolArea | \n",
+ " PoolQC | \n",
+ " Fence | \n",
+ " MiscFeature | \n",
+ " MiscVal | \n",
+ " MoSold | \n",
+ " YrSold | \n",
+ " SaleType | \n",
+ " SaleCondition | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 930 | \n",
+ " 931 | \n",
+ " 20.0 | \n",
+ " 3.0 | \n",
+ " 73.0 | \n",
+ " 8925.0 | \n",
+ " 1.0 | \n",
+ " 2.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 4.0 | \n",
+ " 0.0 | \n",
+ " 22.0 | \n",
+ " 2.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 8.0 | \n",
+ " 5.0 | \n",
+ " 2.0 | \n",
+ " 2.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 8.0 | \n",
+ " 8.0 | \n",
+ " 2.0 | \n",
+ " 0.0 | \n",
+ " 2.0 | \n",
+ " 4.0 | \n",
+ " 2.0 | \n",
+ " 2.0 | \n",
+ " 3.0 | \n",
+ " 0.0 | \n",
+ " 2.0 | \n",
+ " 16.0 | \n",
+ " 5.0 | \n",
+ " 0.0 | \n",
+ " 1450.0 | \n",
+ " 1466.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 3.0 | \n",
+ " 1466.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1466.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 2.0 | \n",
+ " 0.0 | \n",
+ " 3.0 | \n",
+ " 1.0 | \n",
+ " 2.0 | \n",
+ " 7.0 | \n",
+ " 4.0 | \n",
+ " 0.0 | \n",
+ " 5.0 | \n",
+ " 0.0 | \n",
+ " 2.0 | \n",
+ " 0.0 | \n",
+ " 3.0 | \n",
+ " 610.0 | \n",
+ " 2.0 | \n",
+ " 2.0 | \n",
+ " 2.0 | \n",
+ " 100.0 | \n",
+ " 18.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 3.0 | \n",
+ " 4.0 | \n",
+ " 4.0 | \n",
+ " 0.0 | \n",
+ " 7.0 | \n",
+ " 2009.0 | \n",
+ " 3.0 | \n",
+ " 2.0 | \n",
+ "
\n",
+ " \n",
+ " 656 | \n",
+ " 657 | \n",
+ " 20.0 | \n",
+ " 3.0 | \n",
+ " 72.0 | \n",
+ " 10007.0 | \n",
+ " 1.0 | \n",
+ " 2.0 | \n",
+ " 0.0 | \n",
+ " 3.0 | \n",
+ " 0.0 | \n",
+ " 4.0 | \n",
+ " 0.0 | \n",
+ " 11.0 | \n",
+ " 2.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 5.0 | \n",
+ " 7.0 | \n",
+ " 49.0 | \n",
+ " 2.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 3.0 | \n",
+ " 3.0 | \n",
+ " 1.0 | \n",
+ " 54.0 | \n",
+ " 2.0 | \n",
+ " 4.0 | \n",
+ " 1.0 | \n",
+ " 3.0 | \n",
+ " 3.0 | \n",
+ " 3.0 | \n",
+ " 0.0 | \n",
+ " 806.0 | \n",
+ " 5.0 | \n",
+ " 0.0 | \n",
+ " 247.0 | \n",
+ " 1053.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 3.0 | \n",
+ " 1053.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1053.0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 1.0 | \n",
+ " 3.0 | \n",
+ " 1.0 | \n",
+ " 2.0 | \n",
+ " 5.0 | \n",
+ " 4.0 | \n",
+ " 0.0 | \n",
+ " 5.0 | \n",
+ " 0.0 | \n",
+ " 49.0 | \n",
+ " 1.0 | \n",
+ " 1.0 | \n",
+ " 312.0 | \n",
+ " 2.0 | \n",
+ " 2.0 | \n",
+ " 2.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 3.0 | \n",
+ " 2.0 | \n",
+ " 4.0 | \n",
+ " 0.0 | \n",
+ " 8.0 | \n",
+ " 2008.0 | \n",
+ " 3.0 | \n",
+ " 2.0 | \n",
+ "
\n",
+ " \n",
+ " 45 | \n",
+ " 46 | \n",
+ " 120.0 | \n",
+ " 3.0 | \n",
+ " 61.0 | \n",
+ " 7658.0 | \n",
+ " 1.0 | \n",
+ " 2.0 | \n",
+ " 3.0 | \n",
+ " 3.0 | \n",
+ " 0.0 | \n",
+ " 4.0 | \n",
+ " 0.0 | \n",
+ " 14.0 | \n",
+ " 2.0 | \n",
+ " 0.0 | \n",
+ " 4.0 | \n",
+ " 1.0 | \n",
+ " 9.0 | \n",
+ " 5.0 | \n",
+ " 5.0 | \n",
+ " 5.0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 4.0 | \n",
+ " 4.0 | \n",
+ " 1.0 | \n",
+ " 412.0 | \n",
+ " 0.0 | \n",
+ " 4.0 | \n",
+ " 2.0 | \n",
+ " 0.0 | \n",
+ " 3.0 | \n",
+ " 3.0 | \n",
+ " 2.0 | \n",
+ " 456.0 | \n",
+ " 5.0 | \n",
+ " 0.0 | \n",
+ " 1296.0 | \n",
+ " 1752.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 3.0 | \n",
+ " 1752.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1752.0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 2.0 | \n",
+ " 0.0 | \n",
+ " 2.0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 6.0 | \n",
+ " 4.0 | \n",
+ " 1.0 | \n",
+ " 2.0 | \n",
+ " 0.0 | \n",
+ " 5.0 | \n",
+ " 1.0 | \n",
+ " 2.0 | \n",
+ " 576.0 | \n",
+ " 2.0 | \n",
+ " 2.0 | \n",
+ " 2.0 | \n",
+ " 196.0 | \n",
+ " 82.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 3.0 | \n",
+ " 4.0 | \n",
+ " 4.0 | \n",
+ " 0.0 | \n",
+ " 2.0 | \n",
+ " 2010.0 | \n",
+ " 3.0 | \n",
+ " 2.0 | \n",
+ "
\n",
+ " \n",
+ " 1348 | \n",
+ " 1349 | \n",
+ " 20.0 | \n",
+ " 3.0 | \n",
+ " -1.0 | \n",
+ " 16196.0 | \n",
+ " 1.0 | \n",
+ " 2.0 | \n",
+ " 2.0 | \n",
+ " 2.0 | \n",
+ " 0.0 | \n",
+ " 4.0 | \n",
+ " 0.0 | \n",
+ " 19.0 | \n",
+ " 2.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 7.0 | \n",
+ " 5.0 | \n",
+ " 9.0 | \n",
+ " 9.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 8.0 | \n",
+ " 8.0 | \n",
+ " 2.0 | \n",
+ " 0.0 | \n",
+ " 2.0 | \n",
+ " 4.0 | \n",
+ " 2.0 | \n",
+ " 2.0 | \n",
+ " 3.0 | \n",
+ " 1.0 | \n",
+ " 2.0 | \n",
+ " 1443.0 | \n",
+ " 5.0 | \n",
+ " 0.0 | \n",
+ " 39.0 | \n",
+ " 1482.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 3.0 | \n",
+ " 1494.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1494.0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 2.0 | \n",
+ " 0.0 | \n",
+ " 3.0 | \n",
+ " 1.0 | \n",
+ " 2.0 | \n",
+ " 5.0 | \n",
+ " 4.0 | \n",
+ " 1.0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 9.0 | \n",
+ " 1.0 | \n",
+ " 2.0 | \n",
+ " 514.0 | \n",
+ " 2.0 | \n",
+ " 2.0 | \n",
+ " 2.0 | \n",
+ " 402.0 | \n",
+ " 25.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 3.0 | \n",
+ " 4.0 | \n",
+ " 4.0 | \n",
+ " 0.0 | \n",
+ " 8.0 | \n",
+ " 2007.0 | \n",
+ " 3.0 | \n",
+ " 2.0 | \n",
+ "
\n",
+ " \n",
+ " 55 | \n",
+ " 56 | \n",
+ " 20.0 | \n",
+ " 3.0 | \n",
+ " 100.0 | \n",
+ " 10175.0 | \n",
+ " 1.0 | \n",
+ " 2.0 | \n",
+ " 0.0 | \n",
+ " 3.0 | \n",
+ " 0.0 | \n",
+ " 4.0 | \n",
+ " 0.0 | \n",
+ " 11.0 | \n",
+ " 2.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 6.0 | \n",
+ " 5.0 | \n",
+ " 44.0 | \n",
+ " 44.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 3.0 | \n",
+ " 5.0 | \n",
+ " 1.0 | \n",
+ " 272.0 | \n",
+ " 3.0 | \n",
+ " 4.0 | \n",
+ " 1.0 | \n",
+ " 3.0 | \n",
+ " 3.0 | \n",
+ " 3.0 | \n",
+ " 1.0 | \n",
+ " 490.0 | \n",
+ " 5.0 | \n",
+ " 0.0 | \n",
+ " 935.0 | \n",
+ " 1425.0 | \n",
+ " 0.0 | \n",
+ " 2.0 | \n",
+ " 1.0 | \n",
+ " 3.0 | \n",
+ " 1425.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1425.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 2.0 | \n",
+ " 0.0 | \n",
+ " 3.0 | \n",
+ " 1.0 | \n",
+ " 3.0 | \n",
+ " 7.0 | \n",
+ " 4.0 | \n",
+ " 1.0 | \n",
+ " 2.0 | \n",
+ " 0.0 | \n",
+ " 44.0 | \n",
+ " 1.0 | \n",
+ " 2.0 | \n",
+ " 576.0 | \n",
+ " 2.0 | \n",
+ " 2.0 | \n",
+ " 2.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 407.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 3.0 | \n",
+ " 4.0 | \n",
+ " 4.0 | \n",
+ " 0.0 | \n",
+ " 7.0 | \n",
+ " 2008.0 | \n",
+ " 3.0 | \n",
+ " 2.0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Id MSSubClass MSZoning LotFrontage LotArea Street Alley \\\n",
+ "930 931 20.0 3.0 73.0 8925.0 1.0 2.0 \n",
+ "656 657 20.0 3.0 72.0 10007.0 1.0 2.0 \n",
+ "45 46 120.0 3.0 61.0 7658.0 1.0 2.0 \n",
+ "1348 1349 20.0 3.0 -1.0 16196.0 1.0 2.0 \n",
+ "55 56 20.0 3.0 100.0 10175.0 1.0 2.0 \n",
+ "\n",
+ " LotShape LandContour Utilities LotConfig LandSlope Neighborhood \\\n",
+ "930 0.0 1.0 0.0 4.0 0.0 22.0 \n",
+ "656 0.0 3.0 0.0 4.0 0.0 11.0 \n",
+ "45 3.0 3.0 0.0 4.0 0.0 14.0 \n",
+ "1348 2.0 2.0 0.0 4.0 0.0 19.0 \n",
+ "55 0.0 3.0 0.0 4.0 0.0 11.0 \n",
+ "\n",
+ " Condition1 Condition2 BldgType HouseStyle OverallQual OverallCond \\\n",
+ "930 2.0 0.0 0.0 1.0 8.0 5.0 \n",
+ "656 2.0 0.0 0.0 1.0 5.0 7.0 \n",
+ "45 2.0 0.0 4.0 1.0 9.0 5.0 \n",
+ "1348 2.0 0.0 0.0 1.0 7.0 5.0 \n",
+ "55 2.0 0.0 0.0 1.0 6.0 5.0 \n",
+ "\n",
+ " YearBuilt YearRemodAdd RoofStyle RoofMatl Exterior1st Exterior2nd \\\n",
+ "930 2.0 2.0 0.0 0.0 8.0 8.0 \n",
+ "656 49.0 2.0 0.0 0.0 3.0 3.0 \n",
+ "45 5.0 5.0 1.0 0.0 4.0 4.0 \n",
+ "1348 9.0 9.0 0.0 0.0 8.0 8.0 \n",
+ "55 44.0 44.0 0.0 0.0 3.0 5.0 \n",
+ "\n",
+ " MasVnrType MasVnrArea ExterQual ExterCond Foundation BsmtQual \\\n",
+ "930 2.0 0.0 2.0 4.0 2.0 2.0 \n",
+ "656 1.0 54.0 2.0 4.0 1.0 3.0 \n",
+ "45 1.0 412.0 0.0 4.0 2.0 0.0 \n",
+ "1348 2.0 0.0 2.0 4.0 2.0 2.0 \n",
+ "55 1.0 272.0 3.0 4.0 1.0 3.0 \n",
+ "\n",
+ " BsmtCond BsmtExposure BsmtFinType1 BsmtFinSF1 BsmtFinType2 \\\n",
+ "930 3.0 0.0 2.0 16.0 5.0 \n",
+ "656 3.0 3.0 0.0 806.0 5.0 \n",
+ "45 3.0 3.0 2.0 456.0 5.0 \n",
+ "1348 3.0 1.0 2.0 1443.0 5.0 \n",
+ "55 3.0 3.0 1.0 490.0 5.0 \n",
+ "\n",
+ " BsmtFinSF2 BsmtUnfSF TotalBsmtSF Heating HeatingQC CentralAir \\\n",
+ "930 0.0 1450.0 1466.0 0.0 0.0 1.0 \n",
+ "656 0.0 247.0 1053.0 0.0 0.0 1.0 \n",
+ "45 0.0 1296.0 1752.0 0.0 0.0 1.0 \n",
+ "1348 0.0 39.0 1482.0 0.0 0.0 1.0 \n",
+ "55 0.0 935.0 1425.0 0.0 2.0 1.0 \n",
+ "\n",
+ " Electrical 1stFlrSF 2ndFlrSF LowQualFinSF GrLivArea BsmtFullBath \\\n",
+ "930 3.0 1466.0 0.0 0.0 1466.0 0.0 \n",
+ "656 3.0 1053.0 0.0 0.0 1053.0 1.0 \n",
+ "45 3.0 1752.0 0.0 0.0 1752.0 1.0 \n",
+ "1348 3.0 1494.0 0.0 0.0 1494.0 1.0 \n",
+ "55 3.0 1425.0 0.0 0.0 1425.0 0.0 \n",
+ "\n",
+ " BsmtHalfBath FullBath HalfBath BedroomAbvGr KitchenAbvGr \\\n",
+ "930 0.0 2.0 0.0 3.0 1.0 \n",
+ "656 0.0 1.0 1.0 3.0 1.0 \n",
+ "45 0.0 2.0 0.0 2.0 1.0 \n",
+ "1348 0.0 2.0 0.0 3.0 1.0 \n",
+ "55 0.0 2.0 0.0 3.0 1.0 \n",
+ "\n",
+ " KitchenQual TotRmsAbvGrd Functional Fireplaces FireplaceQu \\\n",
+ "930 2.0 7.0 4.0 0.0 5.0 \n",
+ "656 2.0 5.0 4.0 0.0 5.0 \n",
+ "45 0.0 6.0 4.0 1.0 2.0 \n",
+ "1348 2.0 5.0 4.0 1.0 1.0 \n",
+ "55 3.0 7.0 4.0 1.0 2.0 \n",
+ "\n",
+ " GarageType GarageYrBlt GarageFinish GarageCars GarageArea \\\n",
+ "930 0.0 2.0 0.0 3.0 610.0 \n",
+ "656 0.0 49.0 1.0 1.0 312.0 \n",
+ "45 0.0 5.0 1.0 2.0 576.0 \n",
+ "1348 0.0 9.0 1.0 2.0 514.0 \n",
+ "55 0.0 44.0 1.0 2.0 576.0 \n",
+ "\n",
+ " GarageQual GarageCond PavedDrive WoodDeckSF OpenPorchSF \\\n",
+ "930 2.0 2.0 2.0 100.0 18.0 \n",
+ "656 2.0 2.0 2.0 0.0 0.0 \n",
+ "45 2.0 2.0 2.0 196.0 82.0 \n",
+ "1348 2.0 2.0 2.0 402.0 25.0 \n",
+ "55 2.0 2.0 2.0 0.0 0.0 \n",
+ "\n",
+ " EnclosedPorch 3SsnPorch ScreenPorch PoolArea PoolQC Fence \\\n",
+ "930 0.0 0.0 0.0 0.0 3.0 4.0 \n",
+ "656 0.0 0.0 0.0 0.0 3.0 2.0 \n",
+ "45 0.0 0.0 0.0 0.0 3.0 4.0 \n",
+ "1348 0.0 0.0 0.0 0.0 3.0 4.0 \n",
+ "55 0.0 407.0 0.0 0.0 3.0 4.0 \n",
+ "\n",
+ " MiscFeature MiscVal MoSold YrSold SaleType SaleCondition \n",
+ "930 4.0 0.0 7.0 2009.0 3.0 2.0 \n",
+ "656 4.0 0.0 8.0 2008.0 3.0 2.0 \n",
+ "45 4.0 0.0 2.0 2010.0 3.0 2.0 \n",
+ "1348 4.0 0.0 8.0 2007.0 3.0 2.0 \n",
+ "55 4.0 0.0 7.0 2008.0 3.0 2.0 "
+ ]
+ },
+ "execution_count": 18,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "X_train.head()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Gradient boosting regressor"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,\n",
+ " learning_rate=0.1, loss='ls', max_depth=3,\n",
+ " max_features=None, max_leaf_nodes=None,\n",
+ " min_impurity_decrease=0.0, min_impurity_split=None,\n",
+ " min_samples_leaf=1, min_samples_split=2,\n",
+ " min_weight_fraction_leaf=0.0, n_estimators=50,\n",
+ " n_iter_no_change=None, presort='auto', random_state=0,\n",
+ " subsample=1.0, tol=0.0001, validation_fraction=0.1,\n",
+ " verbose=0, warm_start=False)"
+ ]
+ },
+ "execution_count": 19,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "tree_reg = GradientBoostingRegressor(random_state=0, n_estimators=50)\n",
+ "tree_reg.fit(X_train, y_train)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "linear train mse: 319335295.90204245\n",
+ "linear train rmse: 17869.955117516172\n",
+ "\n",
+ "linear test mse: 956807676.3923079\n",
+ "linear test rmse: 30932.307970668917\n"
+ ]
+ }
+ ],
+ "source": [
+ "# evaluate the model:\n",
+ "# remember that we log transformed the output (SalePrice) in our feature engineering notebook / lecture.\n",
+ "\n",
+ "# In order to get the true performance of the Lasso\n",
+ "# we need to transform both the target and the predictions\n",
+ "# back to the original house prices values.\n",
+ "\n",
+ "# We will evaluate performance using the mean squared error and the\n",
+ "# root of the mean squared error\n",
+ "\n",
+ "pred = tree_reg.predict(X_train)\n",
+ "print('linear train mse: {}'.format(mean_squared_error(y_train, pred)))\n",
+ "print('linear train rmse: {}'.format(sqrt(mean_squared_error(y_train, pred))))\n",
+ "print()\n",
+ "pred = tree_reg.predict(X_test)\n",
+ "print('linear test mse: {}'.format(mean_squared_error(y_test, pred)))\n",
+ "print('linear test rmse: {}'.format(sqrt(mean_squared_error(y_test, pred))))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 21,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "Prediction analysis from old Lasso Regression:\n",
+ "---------------------------------------------\n",
+ "linear train mse: 1087435415.4414542\n",
+ "linear train rmse: 32976.28565259366\n",
+ "\n",
+ "linear test mse: 1405259552.2596064\n",
+ "linear test rmse: 37486.79170400698\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "print('''\n",
+ "Prediction analysis from old Lasso Regression:\n",
+ "---------------------------------------------\n",
+ "linear train mse: 1087435415.4414542\n",
+ "linear train rmse: 32976.28565259366\n",
+ "\n",
+ "linear test mse: 1405259552.2596064\n",
+ "linear test rmse: 37486.79170400698\n",
+ "''')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 22,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "Text(0.5, 1.0, 'Evaluation of Lasso Predictions')"
+ ]
+ },
+ "execution_count": 22,
+ "metadata": {},
+ "output_type": "execute_result"
+ },
+ {
+ "data": {
+ "image/png": "iVBORw0KGgoAAAANSUhEUgAAAZcAAAEWCAYAAACqitpwAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjEsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy8QZhcZAAAgAElEQVR4nO3de5ycVZ3n8c+XpoEOtw6KLEkIQWSCOChgC3FQV9EhgCAZ1x1hdWFYRmZHHWUuGZMZVvA2oDiizCgjggoDioxgjIBGBBxndQN0DCQGyBAuEZqrJM0tDekkv/3jOZU8qVRVP9VdVV3V+b5fr3r185znck71pX59Ls85igjMzMwaaYfxLoCZmU08Di5mZtZwDi5mZtZwDi5mZtZwDi5mZtZwDi5mZtZwDi7WNiT9XNKfNunefyfpsmbce4R8/0jSI5JekHR4q/NvR5LeLunR3P4KSW8fxX3eKmllQwtnDePgYnWT9LCkofSBWXr983iXq6T8wwsgIv4hIpoSuEbwReCjEbFbRCwtPygpJL1mHMpVUyrXi+lnOyDpS5K6mpFXRLwuIn5esEybv1cR8R8RMbMZZbKx23G8C2Ad66SI+Nl4F6ID7A+sGO9CjNIbImKVpIOBnwP/CfxL/gRJO0bEhvEonLU311ysYSTtLGlQ0u/n0vZOtZxXSZos6QZJT0tam7anVbnXeZKuyu3PSP+57pj2z5B0r6TnJT0o6c9S+q7Aj4EpuVrVlAr3e09qjhlMzXGvzR17WNLfSFom6VlJ35O0S5Vy7iDpHEmrJT0l6UpJe6bvxQtAF3C3pAfq/F4eKOlWSc9I+p2kqyX15o5/ItUonpe0UtI7U/qRkvolPSfpSUlfKvKea4mI+4D/AH4/9/35hKRlwIuSdkzf4+vSz/YhSR/L5dsj6dvpZ34P8Kay9/qwpHel7a7UhPlAem9LJO0n6Rfp9LvTz/T9FZrXXpve12B6n+/JHfu2pK9KujHd93ZJB6ZjknRR+vk9J2l5/nfYRsfBxRomIl4GrgdOzSX/MfDvEfEU2e/bt8j+m58ODAGjbU57CjgR2AM4A7hI0hER8SJwPPBYaoraLSIey18o6feA7wJnA3sDNwE/krRTWbmPAw4AXg/8SZVy/El6vQN4NbAb8M8R8XJE7JbOeUNEHFjn+xNwPjAFeC2wH3BeKv9M4KPAmyJid2A28HC67ivAVyJiD+BA4No63nPlgkiHAG8F8s16pwLvBnqBTcCPgLuBqcA7gbMlzU7nnpvKcmAq6+k1svurdO8TyH62/wtYFxFvS8ffkH6m3ysrY3cqw0+BVwF/AVydvlclpwCfAiYDq4DPpfRjgbcBvwfsSfazf6bmN8VG5OBio7Ug/YdYen0opX+H7I+45H+kNCLimYi4LiLWRcTzZH/c/3U0mUfEjRHxQGT+nexD5a0FL38/cGNE3BwRw2T9Ij3AH+TOuTgiHouINWQfWodVudcHgC9FxIMR8QIwHzilVMMarYhYlcr3ckQ8DXyJLd+rjcDOwCGSuiPi4Ygo1YyGgddIemVEvBARi+t4z+V+LWkt2fu/jOwfg5KLI+KRiBgiq4nsHRGfjoj1EfEg8A22/B78MfC5iFgTEY8AF9fI80+BcyJiZfrZ3h0RRT7oZ5EF9gtSGW4FbmDrf3R+EBF3pGa8q9nyMx0GdgcOBhQR90bE4wXytBocXGy05kREb+71jZR+GzBJ0lGSZpD9Af8AQNIkSV9PTUjPAb8AejWKjmJJx0taLGmNpEGy/3RfWfDyKcDq0k5EbAIeIfuvu+SJ3PY6sg+uEe+VtncE9ilYlook7SPpmtT09RxwFen9RcQqshrIecBT6bwp6dIzyf4Dv0/SnZJOrFTOKu+53BERMTkiDoyIc9I1JY/ktvcna4bc/M8G8Hds+R5MKTs///0qtx9QVxNiPo+yMq6mwM80BaJ/Br5K9v28VNIeoyiD5Ti4WENFxEaypphT0+uGVEsB+GtgJnBUarYpNXWowq1eBCbl9v9LaUPSzsB1ZP997xMRvWTNPKX7jDTV92NkH4il+4nsQ21gpPc30r3Imvs2AE+O4l55/0D2Pg5N36sPkvs+RcR3IuItKe8APp/S74+IU8mahj4PfF9ZP1Qj3zNs/T1+BHio7J+N3SPihHT88ZRXyfQa932ErPmsXo8B+0nKf6ZNp+D7i4iLI+KNwCFkwXnuKMpgOQ4u1gzfIWuG+UDaLtmdrJ9lUNJeZG3x1dwFvE3SdEl7kjU3lexE1iz0NLBB0vFk7eYlTwKvSNdVci3wbknvTG31fw28DPyq6BvM+S7wl5IOkLQbWVD4Xp0jqHaStEvu1UX2vXoBeFbSVHIfdpJmSjomBdmXyL6nm9KxD0raO/0HP5gu2dTg91zuDuD51Mnfkzrlf19SqeP+WmC+sgEd08j6Q6q5DPiMpINSR/vrJb0iHXuSrF+rktvJaiN/K6lb2XMzJwHXjFR4SW9KNe1usn9qXiJ9P230HFxstH6krZ9z+UHpQETcTvZHOoVs5FbJl8na+X8HLAZ+Uu3mEXEz8D1gGbCErP28dOx54GNkH1pryfp1FuaO30f2of9gaqaZkrs1EbGSrCbwT6ksJ5ENrV5f7zcB+Cbwr2RNfA+RfTDV+vCsZAVZgCi9ziDreD4CeBa4kWygRMnOwAWp7E+Q1VJKwfc4YIWykWpfAU6JiKEGv+etpNrqiWRNoA+l+19G1jlOei+r07Gfkn2/qvkS2c/1p8BzwOVkvzOQNQNekX6mf1xWhvXpPR2f8v8acFr6XRjJHmR9RGtTOZ8BLixwndUgLxZmZmaN5pqLmZk1nIOLmZk1nIOLmZk1nIOLmZk1nCeuTF75ylfGjBkzxrsYZmYdZcmSJb+LiL3L0x1ckhkzZtDf3z/exTAz6yiSKs644GYxMzNrOAcXMzNrOAcXMzNrOAcXMzNrOAcXMzNrOI8WMzPbDi1YOsCFi1by2OAQU3p7mDt7JnMOr7W8T30cXMzMtjMLlg4w//rlDA1vBGBgcIj51y8HaFiAcbOYmdl25sJFKzcHlpKh4Y1cuGhlw/JwcDEz2848NjhUV/poOLiYmW1npvT21JU+Gg4uZmbbmbmzZ9LT3bVVWk93F3Nnz2xYHu7QNzPbzpQ67T1azMzMGmrO4VMbGkzKNbVZTNLDkpZLuktSf0rbS9LNku5PXyendEm6WNIqScskHZG7z+np/PslnZ5Lf2O6/6p0rWrlYWZmrdGKPpd3RMRhEdGX9ucBt0TEQcAtaR/geOCg9DoLuASyQAGcCxwFHAmcmwsWlwAfyl133Ah5mJlZC4xHh/7JwBVp+wpgTi79ysgsBnol7QvMBm6OiDURsRa4GTguHdsjIhZHRABXlt2rUh5mZtYCzQ4uAfxU0hJJZ6W0fSLi8bT9BLBP2p4KPJK79tGUViv90QrptfLYiqSzJPVL6n/66afrfnNmZlZZszv03xIRA5JeBdws6b78wYgISdHMAtTKIyIuBS4F6Ovra2o5zMy2J02tuUTEQPr
6FPADsj6TJ1OTFunrU+n0AWC/3OXTUlqt9GkV0qmRh5mZtUDTgoukXSXtXtoGjgV+AywESiO+Tgd+mLYXAqelUWOzgGdT09Yi4FhJk1NH/rHAonTsOUmz0iix08ruVSkPMzNrgWY2i+0D/CCNDt4R+E5E/ETSncC1ks4EVgN/nM6/CTgBWAWsA84AiIg1kj4D3JnO+3RErEnbHwa+DfQAP04vgAuq5GFmZi2gbKCV9fX1RX9//3gXw8yso0haknvUZDPPLWZmZg3n4GJmZg3n4GJmZg3n4GJmZg3n4GJmZg3n4GJmZg3n4GJmZg3n4GJmZg3n4GJmZg3n4GJmZg3n4GJmZg3n4GJmZg3n4GJmZg3n4GJmZg3n4GJmZg3n4GJmZg3XzJUozSxnwdIBLly0kscGh5jS28Pc2TOZc/jU8S6WWVM4uJi1wIKlA8y/fjlDwxsBGBgcYv71ywEcYGxCcrOYWQtcuGjl5sBSMjS8kQsXrRynEpk1l4OLWQs8NjhUV7pZp3NwMWuBKb09daWbdToHF7MWmDt7Jj3dXVul9XR3MXf2zHEqkVlzuUPfrAVKnfYeLWbbCwcXsxaZc/jUuoKJhy5bJ3NwMWtDHrpsnc59LmZtyEOXrdM5uJi1IQ9dtk7n4GLWhjx02Tqdg4tZG/LQZet07tA3a0MeumydzsHFrE3VO3TZrJ24WczMzBrOwcXMzBpuxOAi6fck3SLpN2n/9ZLOaX7RzMysUxWpuXwDmA8MA0TEMuCUZhbKzMw6W5HgMiki7ihL29CMwpiZ2cRQJLj8TtKBQABIeh/weNEMJHVJWirphrR/gKTbJa2S9D1JO6X0ndP+qnR8Ru4e81P6Skmzc+nHpbRVkubl0ivmYWZmrVEkuHwE+DpwsKQB4Gzgz+vI4+PAvbn9zwMXRcRrgLXAmSn9TGBtSr8onYekQ8ia4V4HHAd8LQWsLuCrwPHAIcCp6dxaeZiZWQuMGFwi4sGIeBewN3BwRLwlIh4ucnNJ04B3A5elfQHHAN9Pp1wBzEnbJ6d90vF3pvNPBq6JiJcj4iFgFXBkeq1K5VsPXAOcPEIeZmbWAkVGi/2DpN6IeDEinpc0WdJnC97/y8DfApvS/iuAwYgo9dk8CpSeEpsKPAKQjj+bzt+cXnZNtfRaeZiZWQsUaRY7PiIGSzsRsRY4YaSLJJ0IPBURS8ZQvqaSdJakfkn9Tz/99HgXx8xswigSXLok7VzakdQD7Fzj/JKjgfdIepisyeoY4CtAr6TStDPTgIG0PQDsl/LYEdgTeCafXnZNtfRnauSxlYi4NCL6IqJv7733LvCWzMysiCLB5WrgFklnSjoTuJktfSNVRcT8iJgWETPIOuRvjYgPALcB70unnQ78MG0vTPuk47dGRKT0U9JosgOAg4A7gDuBg9LIsJ1SHgvTNdXyMDOzFhhx4sqI+LykZcA7U9JnImLRGPL8BHBN6rdZClye0i8H/lXSKmAN6UHNiFgh6VrgHrLnaz4SERsBJH0UWAR0Ad+MiBUj5GFmZi2g7B996+vri/7+/vEuhplZR5G0JCL6ytOr1lwk/d+IeIuk50kPUJYOARERezShnGZmNgFUDS4R8Zb0dffWFcfMzCaCmh366Un4+1pVGDMzmxhqduhHxMY0d9f0iPhtqwplZlssWDrg5Y6t4xRZ5ngysELSHcCLpcSIeE/TSmVmQBZY5l+/nKHhjQAMDA4x//rlAA4w1taKBJf/0/RSmFlFFy5auTmwlAwNb+TCRSsdXKyt1QwukuYArwGWj/HZFrO20UnNTI8NDtWVbtYuqnboS/oa8JdkE0F+RpJrMNbxSs1MA4NDBFuamRYsrThD0Lib0ttTV7pZu6g1WuxtwDERMR94O5623iaAWs1M7Wju7Jn0dHdtldbT3cXc2TPHqURmxdRqFltfmmYlItaldVLMOlqnNTOVmus6pRnPrKRWcDk4zSkG2VP5B6b90hP6r2966cwabEpvDwMVAkk7NzPNOXyqg4l1nFrB5bUtK4VZi8ydPXOrob3gZiazZqg1/cvqVhbErBXczGTWGkWeczGbUNzMZNZ8RRYLMzMzq0uh4CKpR5Ibpc3MrJARg4ukk4C7gJ+k/cMkLWx2wczMrHMVqbmcBxwJDAJExF3AAU0sk5mZdbgiwWU4Ip4tS/PayGZmVlWR0WIrJP0PoEvSQcDHgF81t1hmZtbJitRc/gJ4HfAy8F3gOeDsZhbKzMw624g1l4hYB/w98PeSuoBdI+KlppfMzMw6VpHRYt+RtIekXYHlwD2S5ja/aGZm1qmKNIsdEhHPkU25/2OykWL/s6mlMjOzjlYkuHRL6iYLLgsjYhiPFjMzsxqKjBb7OvAwcDfwC0n7k3Xqm1kDddLyy2YjKdKhfzFwcS5ptaR3NK9IZtuf0vLLpaUASssvAw4w1pFGDC6SPlnl0KcbXBazlmqnmkKt5ZcdXKwTFWkWezG3vQtwInBvc4pj1hrNrCmMJmh12vLLZiMp0iz2j/l9SV8EFjWtRGYt0KyawmiDVicuv2xWy2jWc5kETGt0QcxaqVk1hVpBq5a5s2fS0921VZqXX7ZOVqTPZTlbhh53AXvj/hbrcM2qKYw2aHn5ZZtoivS5nJjb3gA8GREbmlQes5aYO3vmVs1X0JiawliClpdftolkxGaxiFgN9AInAX8EHNLsQpk125zDp3L+ew9lam8PAqb29nD+ew8d84e7m7fMMkWaxT4OfAi4PiVdLenSiPinppbMLKcZw4abUVNw85ZZRhG1Z3KRtAx4c0S8mPZ3Bf5fRLy+BeVrmb6+vujv7x/vYlgF5SOwIKsNNKKmYWZjI2lJRPSVpxcZLSYgP/xlY0oza4nRjsAys/FTJLh8C7hd0nmSzgMWA5ePdJGkXSTdIeluSSskfSqlHyDpdkmrJH1P0k4pfee0vyodn5G71/yUvlLS7Fz6cSltlaR5ufSKeVhn8gOGZp2nSIf+l4AzgDXpdUZEfLnAvV8GjomINwCHAcdJmgV8HrgoIl4DrAXOTOefCaxN6Rel85B0CHAK2WqYxwFfk9SVFi77KnA82SCDU9O51MjDOlC1kVZ+wNCsfVUNLpL2Kr3IZkW+Kr1Wp7SaIvNC2u1OrwCOAb6f0q8gm8of4OS0Tzr+TklK6ddExMsR8RCwCjgyvVZFxIMRsR64Bjg5XVMtD2sDC5YOcPQFt3LAvBs5+oJbWbB0oOb5HoFl1nlqjRZbQhYMBOwLPJbSldJfPdLNU+1iCfAaslrGA8Bg7jmZR4FSj+xU4BGAiNgg6VngFSl9ce62+WseKUs/Kl1TLY/y8p0FnAUwffr0kd6ONcBopkcppZ+3cAWDQ8MA7NI9msklzKxVqgaXiDigtC1paUQcXu/NI2IjcJikXuAHwMGjKmWTRMSlwKWQjRYb5+JsF6p1zp+3cMWIw3df3rBp8/badcOekt6sjRX9929MH7wRMQjcBrwZ6JVUCmrTgFKbyACwH0A6vifwTD697Jpq6c/UyMPGWbVO+MGhYQYGhwi21GbyzWUeMWbWWZrWtiBp71RjQVIP8I
dkU/XfBrwvnXY68MO0vTDtk47fGtlDOAuBU9JosgOAg4A7gDuBg9LIsJ3IOv0Xpmuq5WHjrGgnfHng8Igxs85StVlM0l/ldl9Vtl8aRVbLvsAVqd9lB+DaiLhB0j3ANZI+Cyxly7Dmy4F/lbSKbFTaKSmfFZKuBe4hm9vsI6m5DUkfJZv+vwv4ZkSsSPf6RJU8bJxVmtOrmnzg8JT0Zp2lVof+7rntb5TtjygilgHb9NNExINkI73K018C/nuVe30O+FyF9JuAm4rmYeOv0vQo69ZvYO264W3OzQeOZk00aWbNUatD/1OtLIhtP8rn9Ko2vUs+cHjOLrPOUmTKfbOmKho4PCW9WedwcLG24MBhNrH4STQzM2u4oqPFtlFgtJiZmW2niowWmwm8iex5E8hWpLyjmYUyM7PONuJoMUm/AI6IiOfT/nnAjS0pnZmZdaQifS77AOtz++tTmpmZWUVFRotdCdwh6Qdpfw5bpsY3MzPbxojBJSI+J+nHwFtT0hkRsbS5xTIzs05W9DmXScBzEfGtNCHlAWnhLrOmWLB0oOJDldXSzay9jBhcJJ0L9JGNGvsW2YqSVwFHN7dotr2qtqBY/+o1XLdkoK6FxsxsfBSpufwR2QSUvwaIiMck1TWJpVneSLWPamu3fPf2R9gYsU36hYtWOriYtZkiwWV9RISkAJC0a5PLZBNYkWWOq63RUh5YSrymi1n7KTIU+VpJXydb3fFDwM+Ay5pbLJuoiqwoWW2Nli6pYrrXdDFrPyMGl4j4IvB94DqyfpdPRsTFzS6YTUxFVpScO3smPd1dWx3v6e7i1KP2q5juNV3M2k+RDv3PR8QngJsrpJnVpciKkrWm4O/bfy+PFjPrAIoq7dibT5B+HRFHlKUti4jXN7VkLdbX1xf9/f3jXYwJr9rCYOe/99CWBQkPZzZrHElLIqKvPL3WrMh/DnwYOFDSstyh3YFfNb6I1k6a9QE83itKFhlQYGZjV6tZ7DvAj4HzgXm59OcjYk1TS2XjqpEfwNWCVPl9WlWbqDWgwMHFrHGqduhHxLMR8TDwFWBNRKyOiNXABklHtaqA1npFRnQVUQpSA4NDBFuC1IKlA6M6rxGKDCgws7ErMhT5EuCF3P4LKc0mqEZ9ABcNUo0KZkVUG7bc6uHMC5YOcPQFt3LAvBs5+oJbmxJIzcZTkYcoFble/4jYJKnonGTWgYqM6CpXqVmraJCqdV6jm8vmzp5ZcUBBK4czu9/HtgdFai4PSvqYpO70+jjwYLMLZuOn2nMm1T6AqzVr9U7qrnh+eZCqFrR6J3U3vLlszuFTOf+9hzK1twcBU3t7WjpSDVpbUzMbL0VqIP8buBg4BwjgFuCsZhbKxle9I7qqfVjuvOMO9HR3jVhLqFabiKApne+VBhS0kvt9bHtQZD2Xp4BTWlAWayO1PoDLm6oqNaEBPDs0zEXvP2zEIFXa/9SPVrB23TAAO++4A4NDwxXv2+kfwqNpdjTrNLWec/nbiPiCpH8iq7FsJSI+1tSS2bjKB5A9e7qRYHDdML2TunnhpQ0Mb8p+JaoFFsg+LOupJbw0vGnz9uDQMKLCLx6d/yHcDv0+Zs1Wq+Zyb/rqx9a3M+UdzvkaRKlmMZJ6PywrNa0FbBNgJsKH8Hg/SGrWClWDS0T8KH29onXFsXZQ6YO+Hl1S3Z3k1Zq6gqzTfaJ9CI93v49Zs9VqFvsRlVslAIiI9zSlRDYuzlmwvOJiXKOxKaLuD85q/RBTe3v45bxjxlwmM2utWkORvwj8I/AQMAR8I71eAB5oftGsVc5ZsJyrFv+2IYEFqDoEuZZ6hz+bWXur1Sz27wCS/rFsxssfSXI/zATy3dsfaej9XnhpAwuWDtRVe3E/hNnEUuQ5l10lvToiHgSQdADgpY47zIKlA5y3cMXmzvnJk7o596TXMefwqSPWWHq6d2CX7i7Wrqs+gitveFOM6lkU90OYTRxFgstfAj+X9CDZ4J39gT9raqmsYcqDSsnadcPM/f7dQNYBXzvAaHMgKvqMS3kHvddQMdu+FFnm+CfAQcDHgY8BMyNiUbMLZmNXGlJc7WHE4Y1ZDePUo/areZ/81CRzDp/KL+cdw0XvP6zmNflnUVo567GZtYcRg4ukScBc4KMRcTcwXdKJTS+ZjVmRIcWPDQ7x2TmH8sFZ09EI55Xkg0Ul5R3xnkvLbPtTZOLKbwHrgTen/QHgsyNdJGk/SbdJukfSijThJZL2knSzpPvT18kpXZIulrRK0jJJR+TudXo6/35Jp+fS3yhpebrmYkmqlcf2psg0KaUaRt/+e7HjDtXDS74mUitoVZoI0nNpmW1/igSXAyPiC8AwQESsg5r/5JZsAP46Ig4BZgEfkXQI2aqWt0TEQWSTYJZWuTyerPntILKJMS+BLFAA5wJHAUcC5+aCxSXAh3LXHZfSq+WxXRlpmpTuLm2uYZy3cMXmKV3KlddEqgUFAb+cd8w2fSntsoaKmbVOkeCyXlIPaZCQpAOBl0e6KCIej4hfp+3nyaaTmQqcDJSe+r8CmJO2TwaujMxioFfSvsBs4OaIWBMRa4GbgePSsT0iYnFab+bKsntVymPCqrT4VKVnR0omT+rmwve9YXMgqNYvA7BL9w70r17D0Rfcyox5N1YdLVYtWPgZFrPtT5HRYucCPwH2k3Q1cDTwJ/VkImkGcDhwO7BPRDyeDj0B7JO2pwL5By4eTWm10h+tkE6NPMrLdRZp+YDp06fX85baSrXFp85/76Gc/95DRxyldc6C5TXvv3bdMFct/m3Nc2oFCz/DYrb9qRlcUh/GfcB7yZq2BHw8In5XNANJuwHXAWdHxHOpWwSAiAhJjXksvIpaeUTEpcClAH19fU0tRzNV6zD/62vvZlMEU3p7uOj9h23zYV5tmHK9dhAjziXmZ1jMti81g0v6YL4pIg4Fbqz35pK6yQLL1RFxfUp+UtK+EfF4atp6KqUPAPkxsdNS2gDw9rL0n6f0aRXOr5XHhFStD6T07EqpJtO/eg233ff05mn0X1y/geGNjZhLLPt69AW3umZiZkCxPpdfS3pTvTdOtZ7LgXsj4ku5QwuB0oiv04Ef5tJPS6PGZgHPpqatRcCxkianjvxjgUXp2HOSZqW8Tiu7V6U8JqQiHeNDwxu5evFvNz9rMjg03JDAUuLnWMwsr0hwOQpYLOmBNER4uaRlBa47GvifwDGS7kqvE4ALgD+UdD/wrrQPcBPwILCKbILMDwNExBrgM8Cd6fXplEY657J0zQPAj1N6tTwmpFod93nNavcT1ZcjNrPtk2KEeaUk7V8pPSJWN6VE46Svry/6+zt3Ps789CrQvEBSrnsHVR3CLOChC97dopKY2XiQtKRscmOgRs1F0i6SziZ7Ov84YCAiVpdeTSyrjUJpWpaHLnj3qKa8r8fU3h6Uvl7439/AVD/HYmZlanXoX0H24OR/kD3geAjZ/GLW5gYLLkU8GtUW7/Ka8GaWVyu4HJJGiSHpcuCO1hTJxqrabMW77tTFS8ObxrQoWKWA4edYzKxcreCy+d/fiNiQfz7F2tvc2TO3q
Ul0d4n1G8YWWHp7uqsGDD/HYmZ5tYLLGyQ9l7YF9KR9kT0Cs0fTS2ejUqkm8eLLGwo/LLnrTl2s37Bpq476nu4uznvP65pSXjObeGotczzy2FZrW+U1iQPmFXsGVsCKTx/nxb3MbEyKzC1mbWIsH/i9k7pZW6Cjf8+ebKSZm7nMbCwcXDpEtckpga2CQKUABPDCSxsK5eOuNTNrBAeXDlFrNcdScKkWgHbecYeqDzqWK1K7MTMbSZHpX6wNFFnNsVoAqmfWY4HnBDOzMXNwaXOlRcBGWqBrwdKBqmva1yPAc4KZ2Zi5WayNlTdzVfLYs0PMmHdjoXWni/La9mY2Vq65tLFKzVzlSs9ENnKiSs8JZmZj5eDSxsajBuE5wcysERxc2lizahD5JrRdd+qit6d78yzHIy1XbGZWhPtcxkHRhyHnzp7J3H+7u/Aw4iK8xoqZtYJrLi1W6qQvsiTwnIx84HMAAA2TSURBVMOn0t3V2Kca3Z9iZq3g4NJitR6GLLdg6QDrhjc1LG/3p5hZqzi4tFi1Z1EqpTfyeZMuyf0pZtYyDi4t1lVl8q5K6Y0cLbYpwoHFzFrGwaXFqi3WtTGCGfNu5MD5N3HOgmxCykb2j7ivxcxaycGlxaaO8CG/MYKrFv+WcxYsb1j/iPtazKzVHFxa7B0H711oqpbv3v4I/avXNCRP97WYWas5uLTQgqUDXLdkoNBULRsjuHrxbwvfe4cqEWtqb48Di5m1nINLCxWZKyyvnkcnN0XW/JXn5jAzGy8OLi3UzLnCSlO3TO3t8VQuZjbuPP1LC03p7WnImivlSjUUr3tvZu3CNZcWmjt75jZNV2MlucPezNqPay4tVAoAFy5aycDgEF1S1edeioqgamApOkGmmVmjObi0UOnDfmBwCFH9gcpG5ZVfxbI0QSZUD0ZmZo3iZrEWyc+GDI1bOXLypO6K6fVMkGlm1miuuTTISE1Q9Q5DLqJrB3HuSa+reKzayLTxWN3SzLY/Di5jUN7MVaqNVGqCasSHej6PyZO6Ofek11Vt4qo2Ms1zjJlZKzi4jFJ5n0Z5M1epCar04T+WYcjdXeLC972hrr6SubNnblU+8EOVZtY67nMZpSLNXPnaytzZM+muNkdLDZMnddcdWCCrMfmhSjMbL665jFKRZq4pvT1b9cXU68vvP2xMwcAPVZrZeGlazUXSNyU9Jek3ubS9JN0s6f70dXJKl6SLJa2StEzSEblrTk/n3y/p9Fz6GyUtT9dcLGWrbVXLo9GK9F0MrlvP3H+7m4HBIYL6Roh5wkkz62TNbBb7NnBcWdo84JaIOAi4Je0DHA8clF5nAZdAFiiAc4GjgCOBc3PB4hLgQ7nrjhshj4Yq8rT9i+s3Mryp/kHHSvc3M+tUTQsuEfELoHxBkpOBK9L2FcCcXPqVkVkM9EraF5gN3BwRayJiLXAzcFw6tkdELI6IAK4su1elPBoq36fRaB+YNd21FjPraK3u0N8nIh5P208A+6TtqcAjufMeTWm10h+tkF4rj4abc/hUfjnvmIYFmMmTuvny+w/js3MObcj9zMzGy7h16EdESGre/CcF8pB0FlkzHNOnTx91PmN5hmXypG6WfvLYUV9vZtaOWl1zeTI1aZG+PpXSB4D9cudNS2m10qdVSK+VxzYi4tKI6IuIvr333nvUb2qkzv3enspTtAAMrhsedb5mZu2q1cFlIVAa8XU68MNc+mlp1Ngs4NnUtLUIOFbS5NSRfyywKB17TtKsNErstLJ7Vcqjad5xcPXA9MFZ07nr3GOrNp35iXkzm4iaORT5u8D/A2ZKelTSmcAFwB9Kuh94V9oHuAl4EFgFfAP4MEBErAE+A9yZXp9OaaRzLkvXPAD8OKVXy6MpFiwd4LolA9uk93TvsFX/SaXRZSKbKuboC25lwdJt72Fm1qkUTZz2vZP09fVFf39/3dcdfcGtFad1mdrbwy/nHbNVWrW5yCCbmsVP0JtZp5G0JCL6ytM9/csYVZsvrFInf350WbW5yMzMJgIHlzFYsHSAarOF1epL8XT4ZjbRObiMwYWLVlac0mWkJ+yrBR537pvZROHgMgbVahpB7aWEK3Xuezp8M5tIHFzGoFpNY6Qn9j0dvplNdJ5yfwzGsiCXp8M3s4nMwWUMSsGhtF7LlN4e5s6e6aBhZts9B5cxcg3EzGxb7nMxM7OGc3AxM7OGc3AxM7OGc3AxM7OGc3AxM7OG86zIiaSngdVNzOKVwO+aeP9mcblbqxPL3YllBpe7UfaPiG0WtXJwaRFJ/ZWmpW53LndrdWK5O7HM4HI3m5vFzMys4RxczMys4RxcWufS8S7AKLncrdWJ5e7EMoPL3VTuczEzs4ZzzcXMzBrOwcXMzBrOwaUASd+U9JSk3+TS9pJ0s6T709fJKV2SLpa0StIySUfkrjk9nX+/pNNz6W+UtDxdc7Ek1cqjYJn3k3SbpHskrZD08Q4p9y6S7pB0dyr3p1L6AZJuT3l9T9JOKX3ntL8qHZ+Ru9f8lL5S0uxc+nEpbZWkebn0innUUfYuSUsl3dBBZX44/QzvktSf0tr6dyRd3yvp+5Luk3SvpDe3e7klzUzf59LrOUlnt3u5Ry0i/BrhBbwNOAL4TS7tC8C8tD0P+HzaPgH4MSBgFnB7St8LeDB9nZy2J6djd6Rzla49vlYeBcu8L3BE2t4d+E/gkA4ot4Dd0nY3cHvK41rglJT+L8Cfp+0PA/+Stk8Bvpe2DwHuBnYGDgAeALrS6wHg1cBO6ZxD0jUV86ij7H8FfAe4odb92qzMDwOvLEtr69+RdM0VwJ+m7Z2A3k4od678XcATwP6dVO663mOzM5goL2AGWweXlcC+aXtfYGXa/jpwavl5wKnA13PpX09p+wL35dI3n1ctj1GW/4fAH3ZSuYFJwK+Bo8ieSN4xpb8ZWJS2FwFvTts7pvMEzAfm5+61KF23+dqUPj+9VC2PgmWdBtwCHAPcUOt+7VLmdM3DbBtc2vp3BNgTeIg0IKlTyl1W1mOBX3Zauet5uVls9PaJiMfT9hPAPml7KvBI7rxHU1qt9EcrpNfKoy6p2eVwslpA25c7NS/dBTwF3Ez2X/tgRGyokNfm8qXjzwKvGMX7eUWNPIr4MvC3wKa0X+t+7VJmgAB+KmmJpLNSWrv/jhwAPA18S1kz5GWSdu2AcuedAnx3hHu2Y7kLc3BpgMj+HWjqmO7R5iFpN+A64OyIeK4R96zHaPKIiI0RcRhZbeBI4OBmlK1RJJ0IPBURS8a7LKPwlog4Ajge+Iikt+UPtunvyI5kzdSXRMThwItkTT1juWfdxvA3uRPwHuDfGnXPerQiD3BwGYsnJe0LkL4+ldIHgP1y501LabXSp1VIr5VHIZK6yQLL1RFxfaeUuyQiBoHbyJp7eiWVluXO57W5fOn4nsAzo3g/z9TIYyRHA++R9DBwDVnT2FfavMwARMRA+voU8AOyYN7uvyOPAo9GxO1p//tkwabdy11yPPDriHhyhHu2W7nr4uAyeguB09P26WR9GqX009JIj1nAs6k6ugg4VtLkNFLjWLL2
8ceB5yTNSiM7Tiu7V6U8RpTudTlwb0R8qYPKvbek3rTdQ9ZPdC9ZkHlflXKX8nofcGv6z2whcIqykVkHAAeRdXbeCRykbJTVTmTNEwvTNdXyqCki5kfEtIiYke53a0R8oJ3LDCBpV0m7l7bJfra/oc1/RyLiCeARSTNT0juBe9q93DmnsqVJrNY9263c9Wl2p85EeJH9IjwODJP913QmWXv3LcD9wM+AvdK5Ar5K1k+wHOjL3ed/AavS64xceh/ZH/UDwD+zZeaEinkULPNbyKq+y4C70uuEDij364Glqdy/AT6Z0l9N9kG7iqw5YeeUvkvaX5WOvzp3r79PZVtJGjWT0k8gGz33APD3ufSKedT5u/J2towWa+syp2vvTq8Vpfu2++9Iuv4woD/9niwgGzXVCeXelazGuWcure3LPZqXp38xM7OGc7OYmZk1nIOLmZk1nIOLmZk1nIOLmZk1nIOLmZk1nIOLbbckvUJbZqh9QtJAbr+u2YVHyOddkhaUpV0laU6j8qijLI8qmzV3maSfSHpVlfO+lXuOxKxuO458itnEFBHPkD0vgaTzgBci4ov5c9LDaIqITdveoWO9NSIGJX2BbNqUv8oflNQVEWeMT9FsonDNxayMpNcoWwfnarKHC/eTNJg7foqky9L2PpKul9SvbB2aWaPI79hUW1ou6Rvasu7Lo7nZCmZJ+lnaPkbZejd3Sfp1eroeSfNSGZZJ+mSBrH8BvEbSjpIGJX1Z0jLgSEn/V1Ip8L475XO3pJ+mtN0kfTvlt1TSSfW+b5vYXHMxq+xg4LSI6NeWubsquRj4QkQsVjb79A3A71c47x3KZnoumQ58X9Ik4JvAf42IB1JAO4vs6epq5gJnRcTtyiYmfUnSCemeR5E92X2TpD+IiF9VukGqkZ1I9uQ3ZPOb/SIizk7HS+f9F+ASstrOakl7pfM/CfwkIv4kTUFyu6SbI+KlGuW27YiDi1llD0REf4Hz3gXMLH0YA5Ml9UTEUNl5t0XE5j4WSVelzdcC/xkRD6T9K8mmF6oVXH4JfCUFousi4gVJx5JNiLg0nbMb8HtApeDyH2RLA9wFfD6lrSebuLLcm1PZVwNExJqUfixwvLasirkLWXD7zxrltu2Ig4tZZS/mtjeR1QZKdsltCzgyItY3oQwb2NJ0vTnPiPispIXAu4HFkt6ZyvHZiLi8wH3fGtmM08DmmZmHor65oATMyQVFs624z8VsBKkzf62kgyTtAPxR7vDPgI+Udkr9FHW4l2zG41en/Q8C/562HwbemLb/Wy6PAyNiWUScT7ZS50yymXLPzPW/TJP0yjrLUsmvyJr09k/3LTWLLQL+IlemwxuQl00gDi5mxXyC7AP1V2y92t9HgKNTJ/o9wIfquWlErCNrBrte0nLgZeAb6fB5wNck3UnWbFXyN5J+kzrfXwB+GhE3ka1rsjjd51qyprExiWzNkT8HfijpbuDqdOhTwK5pEMKKVFazzTwrspmZNZxrLmZm1nAOLmZm1nAOLmZm1nAOLmZm1nAOLmZm1nAOLmZm1nAOLmZm1nD/H5uwWLkZH5X2AAAAAElFTkSuQmCC\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# let's evaluate our predictions respect to the original price\n",
+ "plt.scatter(y_test, tree_reg.predict(X_test))\n",
+ "# plt.xlim(100000, 200000)\n",
+ "# plt.ylim(100000, 200000)\n",
+ "plt.xlabel('True House Price')\n",
+ "plt.ylabel('Predicted House Price')\n",
+ "plt.title('Evaluation of Lasso Predictions')"
+ ]
+ },
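+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "As a quick follow-up (a sketch, not part of the original analysis): overlaying the ideal y = x line on the same scatter makes it easier to judge whether the model systematically over- or under-predicts."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# sketch: redraw the scatter with the ideal y = x reference line\n",
+ "preds = tree_reg.predict(X_test)\n",
+ "lims = [y_test.min(), y_test.max()]\n",
+ "plt.scatter(y_test, preds)\n",
+ "plt.plot(lims, lims, color='red')  # perfect predictions would sit on this line\n",
+ "plt.xlabel('True House Price')\n",
+ "plt.ylabel('Predicted House Price')\n",
+ "plt.title('Predictions vs. the ideal y = x line')"
+ ]
+ },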
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Feature Selection"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 23,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "SelectFromModel(estimator=GradientBoostingRegressor(alpha=0.9,\n",
+ " criterion='friedman_mse',\n",
+ " init=None,\n",
+ " learning_rate=0.1,\n",
+ " loss='ls', max_depth=3,\n",
+ " max_features=None,\n",
+ " max_leaf_nodes=None,\n",
+ " min_impurity_decrease=0.0,\n",
+ " min_impurity_split=None,\n",
+ " min_samples_leaf=1,\n",
+ " min_samples_split=2,\n",
+ " min_weight_fraction_leaf=0.0,\n",
+ " n_estimators=50,\n",
+ " n_iter_no_change=None,\n",
+ " presort='auto',\n",
+ " random_state=0,\n",
+ " subsample=1.0, tol=0.0001,\n",
+ " validation_fraction=0.1,\n",
+ " verbose=0,\n",
+ " warm_start=False),\n",
+ " max_features=None, norm_order=1, prefit=False, threshold=None)"
+ ]
+ },
+ "execution_count": 23,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# here I will do the model fitting and feature selection\n",
+ "# altogether in one line of code\n",
+ "\n",
+ "# first, I specify the Lasso Regression model, and I\n",
+ "# select a suitable alpha (equivalent of penalty).\n",
+ "# The bigger the alpha the less features that will be selected.\n",
+ "\n",
+ "# Then I use the selectFromModel object from sklearn, which\n",
+ "# will select the features which coefficients are non-zero\n",
+ "\n",
+ "sel_ = SelectFromModel(GradientBoostingRegressor(random_state=0, n_estimators=50)) # remember to set the seed, the random state in this function\n",
+ "sel_.fit(X_train, y_train)"
+ ]
+ },
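+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "A quick sanity check (a sketch): for estimators exposing `feature_importances_`, SelectFromModel's default threshold is the mean importance, so we can reproduce the selection by hand and compare against `get_support()`."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# sketch: reproduce SelectFromModel's default threshold manually\n",
+ "importances = sel_.estimator_.feature_importances_\n",
+ "threshold = importances.mean()  # default threshold for importance-based estimators\n",
+ "print('manual selection count:', (importances >= threshold).sum())\n",
+ "print('get_support() count:  ', sel_.get_support().sum())"
+ ]
+ },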
+ {
+ "cell_type": "code",
+ "execution_count": 24,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "total features: 80\n",
+ "selected features: 10\n"
+ ]
+ }
+ ],
+ "source": [
+ "# let's print the number of total and selected features\n",
+ "\n",
+ "# this is how we can make a list of the selected features\n",
+ "selected_feat = X_train.columns[(sel_.get_support())]\n",
+ "\n",
+ "# let's print some stats\n",
+ "print('total features: {}'.format((X_train.shape[1])))\n",
+ "print('selected features: {}'.format(len(selected_feat)))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 25,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "Index(['LotArea', 'OverallQual', 'YearRemodAdd', 'BsmtQual', 'BsmtFinSF1',\n",
+ " 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'GrLivArea', 'GarageCars'],\n",
+ " dtype='object')"
+ ]
+ },
+ "execution_count": 25,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "selected_feat"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Re-build model with selected features"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 26,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,\n",
+ " learning_rate=0.1, loss='ls', max_depth=3,\n",
+ " max_features=None, max_leaf_nodes=None,\n",
+ " min_impurity_decrease=0.0, min_impurity_split=None,\n",
+ " min_samples_leaf=1, min_samples_split=2,\n",
+ " min_weight_fraction_leaf=0.0, n_estimators=50,\n",
+ " n_iter_no_change=None, presort='auto', random_state=0,\n",
+ " subsample=1.0, tol=0.0001, validation_fraction=0.1,\n",
+ " verbose=0, warm_start=False)"
+ ]
+ },
+ "execution_count": 26,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "tree_reg = GradientBoostingRegressor(random_state=0, n_estimators=50)\n",
+ "tree_reg.fit(X_train[selected_feat], y_train)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 27,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "linear train mse: 418644251.8706319\n",
+ "linear train rmse: 20460.797928493204\n",
+ "\n",
+ "linear test mse: 615157343.7654308\n",
+ "linear test rmse: 24802.36568889006\n"
+ ]
+ }
+ ],
+ "source": [
+ "pred = tree_reg.predict(X_train[selected_feat])\n",
+ "print('linear train mse: {}'.format(mean_squared_error(y_train, pred)))\n",
+ "print('linear train rmse: {}'.format(sqrt(mean_squared_error(y_train, pred))))\n",
+ "print()\n",
+ "pred = tree_reg.predict(X_test[selected_feat])\n",
+ "print('linear test mse: {}'.format(mean_squared_error(y_test, pred)))\n",
+ "print('linear test rmse: {}'.format(sqrt(mean_squared_error(y_test, pred))))"
+ ]
+ },
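+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "To put these errors in context (a sketch; assumes `y_test` is a pandas Series of sale prices), we can express the test RMSE as a fraction of the average sale price."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# sketch: test RMSE relative to the mean sale price\n",
+ "test_rmse = sqrt(mean_squared_error(y_test, tree_reg.predict(X_test[selected_feat])))\n",
+ "print('test rmse as a share of the mean price: {:.2%}'.format(test_rmse / y_test.mean()))"
+ ]
+ },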
+ {
+ "cell_type": "code",
+ "execution_count": 28,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " LotArea | \n",
+ " OverallQual | \n",
+ " YearRemodAdd | \n",
+ " BsmtQual | \n",
+ " BsmtFinSF1 | \n",
+ " TotalBsmtSF | \n",
+ " 1stFlrSF | \n",
+ " 2ndFlrSF | \n",
+ " GrLivArea | \n",
+ " GarageCars | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 8450 | \n",
+ " 7 | \n",
+ " 2003 | \n",
+ " Gd | \n",
+ " 706 | \n",
+ " 856 | \n",
+ " 856 | \n",
+ " 854 | \n",
+ " 1710 | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 9600 | \n",
+ " 6 | \n",
+ " 1976 | \n",
+ " Gd | \n",
+ " 978 | \n",
+ " 1262 | \n",
+ " 1262 | \n",
+ " 0 | \n",
+ " 1262 | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 11250 | \n",
+ " 7 | \n",
+ " 2002 | \n",
+ " Gd | \n",
+ " 486 | \n",
+ " 920 | \n",
+ " 920 | \n",
+ " 866 | \n",
+ " 1786 | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 9550 | \n",
+ " 7 | \n",
+ " 1970 | \n",
+ " TA | \n",
+ " 216 | \n",
+ " 756 | \n",
+ " 961 | \n",
+ " 756 | \n",
+ " 1717 | \n",
+ " 3 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 14260 | \n",
+ " 8 | \n",
+ " 2000 | \n",
+ " Gd | \n",
+ " 655 | \n",
+ " 1145 | \n",
+ " 1145 | \n",
+ " 1053 | \n",
+ " 2198 | \n",
+ " 3 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " LotArea OverallQual YearRemodAdd BsmtQual BsmtFinSF1 TotalBsmtSF \\\n",
+ "0 8450 7 2003 Gd 706 856 \n",
+ "1 9600 6 1976 Gd 978 1262 \n",
+ "2 11250 7 2002 Gd 486 920 \n",
+ "3 9550 7 1970 TA 216 756 \n",
+ "4 14260 8 2000 Gd 655 1145 \n",
+ "\n",
+ " 1stFlrSF 2ndFlrSF GrLivArea GarageCars \n",
+ "0 856 854 1710 2 \n",
+ "1 1262 0 1262 2 \n",
+ "2 920 866 1786 2 \n",
+ "3 961 756 1717 3 \n",
+ "4 1145 1053 2198 3 "
+ ]
+ },
+ "execution_count": 28,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "data[selected_feat].head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 29,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# make a list of the categorical variables that contain missing values\n",
+ "\n",
+ "vars_dates = ['YearRemodAdd']\n",
+ "vars_cat = ['BsmtQual']\n",
+ "vars_num = ['LotArea', 'OverallQual', 'YearRemodAdd', 'BsmtQual', 'BsmtFinSF1',\n",
+ " 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'GrLivArea', 'GarageCars']"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 30,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "LotArea 0.000000\n",
+ "OverallQual 0.000000\n",
+ "YearRemodAdd 0.000000\n",
+ "BsmtQual 0.025342\n",
+ "BsmtFinSF1 0.000000\n",
+ "TotalBsmtSF 0.000000\n",
+ "1stFlrSF 0.000000\n",
+ "2ndFlrSF 0.000000\n",
+ "GrLivArea 0.000000\n",
+ "GarageCars 0.000000\n",
+ "dtype: float64"
+ ]
+ },
+ "execution_count": 30,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "data[vars_num].isnull().mean()"
+ ]
+ },
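+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "`BsmtQual` is the only selected feature with missing values (~2.5%). A minimal sketch of one way to handle it, assuming we would rather keep those rows than drop them: replace the missing entries with an explicit 'Missing' category before any encoding step."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# sketch: flag missing basement quality explicitly rather than dropping rows\n",
+ "for df in (X_train, X_test):\n",
+ "    df['BsmtQual'] = df['BsmtQual'].fillna('Missing')\n",
+ "\n",
+ "X_train['BsmtQual'].value_counts()"
+ ]
+ },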
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.7.4"
+ },
+ "toc": {
+ "nav_menu": {},
+ "number_sections": true,
+ "sideBar": true,
+ "skip_h1_title": false,
+ "toc_cell": false,
+ "toc_position": {
+ "height": "583px",
+ "left": "0px",
+ "right": "1324px",
+ "top": "107px",
+ "width": "212px"
+ },
+ "toc_section_display": "block",
+ "toc_window_display": true
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}