Skip to content

Commit 5e6560c

Browse files
Section5 assignment (#789)
1 parent 08bfc7a commit 5e6560c

27 files changed

+798
-0
lines changed

.gitignore

+1
Original file line numberDiff line numberDiff line change
@@ -112,6 +112,7 @@ packages/regression_model/regression_model/datasets/*.zip
112112
packages/regression_model/regression_model/datasets/*.txt
113113
train.csv
114114
test.csv
115+
raw.csv
115116
data_description.txt
116117
house-prices-advanced-regression-techniques.zip
117118
sample_submission.csv

assignment-section-05/MANIFEST.in

+18
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
# Top-level text, markdown and pickle files
include *.txt
include *.md
include *.pkl

# recursive-include takes a directory followed by one or more patterns;
# the previous single-argument form (./classification_model/*) is malformed.
recursive-include classification_model *

# Package data that must ship with the distribution
include classification_model/datasets/train.csv
include classification_model/datasets/test.csv
include classification_model/trained_models/*.pkl
include classification_model/VERSION
include classification_model/config.yml

# Requirements files
include ./requirements/requirements.txt
include ./requirements/test_requirements.txt

# Files that must never be packaged
exclude *.log
exclude *.cfg

recursive-exclude * __pycache__
recursive-exclude * *.py[co]

assignment-section-05/README.md

+16
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
# Productionized Titanic Classification Model Package
2+
3+
## Run With Tox (Recommended)
4+
- Download the data from: https://www.openml.org/data/get_csv/16826755/phpMYEkMl
5+
- Save the file as `raw.csv` in the classification_model/datasets directory
6+
- `pip install tox`
7+
- Make sure you are in the assignment-section-05 directory (where the tox.ini file is) then run the command: `tox` (this runs the tests and typechecks, trains the model under the hood). The first time you run this it creates a virtual env and installs
8+
dependencies, so takes a few minutes.
9+
10+
## Run Without Tox
11+
- Download the data from: https://www.openml.org/data/get_csv/16826755/phpMYEkMl
12+
- Save the file as `raw.csv` in the classification_model/datasets directory
13+
- Add assignment-section-05 *and* classification_model paths to your system PYTHONPATH
14+
- `pip install -r requirements/test_requirements.txt`
15+
- Train the model: `python classification_model/train_pipeline.py`
16+
- Run the tests: `pytest tests`
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
0.0.1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
import logging
2+
3+
from classification_model.config.core import PACKAGE_ROOT, config
4+
5+
# It is strongly advised that you do not add any handlers other than
6+
# NullHandler to your library’s loggers. This is because the configuration
7+
# of handlers is the prerogative of the application developer who uses your
8+
# library. The application developer knows their target audience and what
9+
# handlers are most appropriate for their application: if you add handlers
10+
# ‘under the hood’, you might well interfere with their ability to carry out
11+
# unit tests and deliver logs which suit their requirements.
12+
# https://docs.python.org/3/howto/logging.html#configuring-logging-for-a-library
13+
# Register only a no-op handler on the package logger; real handlers are left
# to the application that uses this library.
logging.getLogger(
    config.app_config.package_name,
).addHandler(
    logging.NullHandler(),
)

# The package version is the stripped content of the VERSION file that sits
# in the package root.
__version__ = (PACKAGE_ROOT / "VERSION").read_text().strip()
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
# Package Overview
2+
package_name: regression_model
3+
4+
# Data Files
5+
raw_data_file: raw.csv
6+
training_data_file: train.csv
7+
test_data_file: test.csv
8+
9+
# Variables
10+
# The variable we are attempting to predict (passenger survival)
11+
target: survived
12+
13+
pipeline_name: titanic_classification_model
14+
pipeline_save_file: titanic_classification_model_output_v
15+
16+
features:
17+
- pclass
18+
- sex
19+
- age
20+
- sibsp
21+
- parch
22+
- fare
23+
- cabin
24+
- embarked
25+
- title # generated from name
26+
27+
# set train/test split
28+
test_size: 0.1
29+
30+
# to set the random seed
31+
random_state: 0
32+
33+
unused_fields:
34+
- name
35+
- ticket
36+
- boat
37+
- body
38+
- home.dest
39+
40+
numerical_vars:
41+
- age
42+
- fare
43+
44+
categorical_vars:
45+
- sex
46+
- cabin
47+
- embarked
48+
- title
49+
50+
cabin_vars:
51+
- cabin

assignment-section-05/classification_model/config/__init__.py

Whitespace-only changes.
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,84 @@
1+
from pathlib import Path
2+
from typing import Sequence
3+
4+
from pydantic import BaseModel
5+
from strictyaml import YAML, load
6+
7+
import classification_model
8+
9+
# Project Directories
10+
# Directory that contains the installed ``classification_model`` package.
PACKAGE_ROOT = Path(classification_model.__file__).resolve().parent
# Parent directory of the package.
ROOT = PACKAGE_ROOT.parent
# Locations derived from the package directory.
CONFIG_FILE_PATH = PACKAGE_ROOT.joinpath("config.yml")
DATASET_DIR = PACKAGE_ROOT.joinpath("datasets")
TRAINED_MODEL_DIR = PACKAGE_ROOT.joinpath("trained_models")
15+
16+
17+
class AppConfig(BaseModel):
    """Application-level settings (package identity and file names)."""

    # Name passed to ``logging.getLogger`` for the package logger.
    package_name: str
    # File name of the raw dataset.
    raw_data_file: str
    # Prefix of the persisted pipeline file; the version and ".pkl" are
    # appended by the code that builds the file name.
    pipeline_save_file: str
25+
26+
27+
class ModelConfig(BaseModel):
    """Settings that drive feature engineering and model training."""

    # Name of the column to predict.
    target: str
    # Columns listed as unused in the config file.
    unused_fields: Sequence[str]
    # Columns selected from the validated data before calling the pipeline.
    features: Sequence[str]
    # Train/test split proportion and random seed, as given in the config file.
    test_size: float
    random_state: int
    # Column groups handed to the individual pipeline steps.
    numerical_vars: Sequence[str]
    categorical_vars: Sequence[str]
    cabin_vars: Sequence[str]
41+
42+
43+
class Config(BaseModel):
    """Top-level configuration object bundling both config sections."""

    # Application-level section.
    app_config: AppConfig
    # Modelling section.
    # NOTE(review): pydantic v2 reserves the attribute name ``model_config``
    # for its own settings; this field name presumably relies on pydantic v1
    # -- confirm the pinned version before upgrading.
    model_config: ModelConfig
48+
49+
50+
def find_config_file() -> Path:
    """Locate the configuration file.

    Returns:
        ``CONFIG_FILE_PATH`` when it points at an existing regular file.

    Raises:
        FileNotFoundError: If no file exists at ``CONFIG_FILE_PATH``. This is
            a subclass of ``Exception`` (and ``OSError``), so callers that
            caught the previous generic ``Exception`` keep working.
    """
    if not CONFIG_FILE_PATH.is_file():
        raise FileNotFoundError(f"Config not found at {CONFIG_FILE_PATH!r}")
    return CONFIG_FILE_PATH
55+
56+
57+
def fetch_config_from_yaml(cfg_path: Path = None) -> YAML:
    """Parse YAML containing the package configuration.

    Args:
        cfg_path: Location of the YAML file. When omitted (or falsy), the
            path returned by ``find_config_file()`` is used instead.

    Returns:
        The strictyaml ``YAML`` object produced by ``load``.

    Raises:
        OSError: If the file at ``cfg_path`` cannot be opened.
        Whatever ``find_config_file()`` raises when the default config file
        is absent.
    """
    # find_config_file() either returns an existing path or raises, so no
    # further "path is missing" branch is needed after this line (the old
    # trailing ``raise OSError`` could never be reached).
    cfg_path = cfg_path or find_config_file()

    with open(cfg_path, "r") as conf_file:
        return load(conf_file.read())
68+
69+
70+
def create_and_validate_config(parsed_config: YAML = None) -> Config:
    """Build the validated ``Config`` object from parsed YAML.

    Args:
        parsed_config: An already parsed strictyaml document. When ``None``,
            the document is obtained from ``fetch_config_from_yaml()``.

    Returns:
        A ``Config`` whose ``app_config`` and ``model_config`` sections are
        both constructed from the same parsed values.
    """
    yaml_doc = fetch_config_from_yaml() if parsed_config is None else parsed_config

    # ``.data`` is the attribute of the strictyaml YAML type that holds the
    # parsed values; the same mapping feeds both sections.
    # NOTE(review): presumably each model ignores keys it does not declare
    # (pydantic's default handling of extra fields) -- confirm.
    raw_values = yaml_doc.data
    return Config(
        app_config=AppConfig(**raw_values),
        model_config=ModelConfig(**raw_values),
    )
82+
83+
84+
# Validated configuration, built once when this module is imported.
config = create_and_validate_config()

assignment-section-05/classification_model/datasets/__init__.py

Whitespace-only changes.
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
# for encoding categorical variables
2+
from feature_engine.encoding import OneHotEncoder, RareLabelEncoder
3+
4+
# for imputation
5+
from feature_engine.imputation import (
6+
AddMissingIndicator,
7+
CategoricalImputer,
8+
MeanMedianImputer,
9+
)
10+
from sklearn.linear_model import LogisticRegression
11+
from sklearn.pipeline import Pipeline
12+
from sklearn.preprocessing import StandardScaler
13+
14+
from classification_model.config.core import config
15+
from classification_model.processing.features import ExtractLetterTransformer
16+
17+
# Full preprocessing + model pipeline; column groups come from the config.
titanic_pipe = Pipeline(
    steps=[
        # --- imputation ---------------------------------------------------
        # categorical columns: fill gaps using the imputer's "missing" method
        (
            "categorical_imputation",
            CategoricalImputer(
                variables=config.model_config.categorical_vars,
                imputation_method="missing",
            ),
        ),
        # numerical columns: add a missing-value indicator first ...
        (
            "missing_indicator",
            AddMissingIndicator(
                variables=config.model_config.numerical_vars,
            ),
        ),
        # ... then fill the gaps with the median
        (
            "median_imputation",
            MeanMedianImputer(
                variables=config.model_config.numerical_vars,
                imputation_method="median",
            ),
        ),
        # --- feature extraction -------------------------------------------
        # keep only the letter of the cabin value
        (
            "extract_letter",
            ExtractLetterTransformer(
                variables=config.model_config.cabin_vars,
            ),
        ),
        # --- categorical encoding -----------------------------------------
        # categories seen in fewer than 5% of the observations (tol=0.05)
        # are grouped into a single 'Rare' category
        (
            "rare_label_encoder",
            RareLabelEncoder(
                variables=config.model_config.categorical_vars,
                tol=0.05,
                n_categories=1,
            ),
        ),
        # one hot encoding into k-1 variables (last category dropped)
        (
            "categorical_encoder",
            OneHotEncoder(
                variables=config.model_config.categorical_vars,
                drop_last=True,
            ),
        ),
        # --- scaling and estimator ----------------------------------------
        ("scaler", StandardScaler()),
        ("Logit", LogisticRegression(C=0.0005, random_state=0)),
    ]
)
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
import typing as t
2+
3+
import pandas as pd
4+
5+
from classification_model import __version__ as _version
6+
from classification_model.config.core import config
7+
from classification_model.processing.data_manager import load_pipeline
8+
from classification_model.processing.validation import validate_inputs
9+
10+
# Persisted pipeline file name: configured prefix + package version + ".pkl".
pipeline_file_name = f"{config.app_config.pipeline_save_file}{_version}.pkl"

# Loaded once at import time and reused by every make_prediction() call.
_titanic_pipe = load_pipeline(file_name=pipeline_file_name)
12+
13+
14+
def make_prediction(
    *,
    input_data: t.Union[pd.DataFrame, dict],
) -> dict:
    """Make a prediction using a saved model pipeline.

    Args:
        input_data: Records to score, as a DataFrame or anything
            ``pd.DataFrame`` accepts as a dict.

    Returns:
        A dict with the keys ``"predictions"``, ``"version"`` and
        ``"errors"``. ``"predictions"`` is ``None`` when validation
        reported errors.
    """
    frame = pd.DataFrame(input_data)
    validated_data, errors = validate_inputs(input_data=frame)

    # Validation problems: report them and skip the model entirely.
    if errors:
        return {"predictions": None, "version": _version, "errors": errors}

    # Only the configured feature columns are passed to the pipeline.
    model_input = validated_data[config.model_config.features]
    predictions = _titanic_pipe.predict(X=model_input)
    return {
        "predictions": predictions,
        "version": _version,
        "errors": errors,
    }

assignment-section-05/classification_model/processing/__init__.py

Whitespace-only changes.

0 commit comments

Comments
 (0)