Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Prova #948

Open
wants to merge 11 commits into
base: master
Choose a base branch
from
Open

Prova #948

Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .circleci/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ orbs:

defaults: &defaults
docker:
- image: cimg/python:3.11.1
- image: cimg/python:3.9.18
working_directory: ~/project

prepare_venv: &prepare_venv
Expand Down Expand Up @@ -82,7 +82,7 @@ jobs:
steps:
- setup_remote_docker:
# Supported versions: https://circleci.com/docs/2.0/building-docker-images/#docker-version
version: 20.10.18
version: default
- checkout:
path: ~/project/
- node/install:
Expand Down
4 changes: 3 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
# Deployment of Machine Learning Models
Accompanying repo for the online course Deployment of Machine Learning Models.

For the documentation, visit the [course on Udemy](https://www.udemy.com/deployment-of-machine-learning-models/?couponCode=TIDREPO).
For the documentation, visit the [course on Udemy](https://www.udemy.com/deployment-of-machine-learning-models/?couponCode=TIDREPO)


9 changes: 5 additions & 4 deletions assignment-section-05/requirements/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
# We use compatible release functionality (see PEP 440 here: https://www.python.org/dev/peps/pep-0440/#compatible-release)
# to specify acceptable version ranges of our project dependencies. This gives us the flexibility to keep up with small
# updates/fixes, whilst ensuring we don't install a major update which could introduce backwards incompatible changes.
numpy>=1.21.0,<2.0.0
numpy>=1.21.0,<1.25.0
pandas>=1.3.5,<2.0.0
pydantic>=1.8.1,<2.0.0
scikit-learn>=1.1.3,<2.0.0
scikit-learn>=1.0.2,<1.1.0
strictyaml>=1.3.2,<2.0.0
ruamel.yaml>=0.16.12,<1.0.0
feature-engine>=1.0.2,<2.0.0
joblib>=1.0.1,<2.0.0
feature-engine>=1.0.2,<1.6.0 # breaking change in v1.6.0
joblib>=1.0.1,<2.0.0
setuptools<60
1 change: 1 addition & 0 deletions assignment-section-05/tests/test_prediction.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ def test_make_prediction(sample_input_data):

# Then
predictions = result.get("predictions")
print(predictions)
assert isinstance(predictions, np.ndarray)
assert isinstance(predictions[0], np.int64)
assert result.get("errors") is None
Expand Down
1 change: 1 addition & 0 deletions assignment-section-05/tox.ini
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ envlist = test_package, checks
skipsdist = True

[testenv]
basepython = python3.9
install_command = pip install {opts} {packages}

[testenv:test_package]
Expand Down
1 change: 1 addition & 0 deletions my-assignement-section-05/classification_model/VERSION
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
0.0.1
17 changes: 17 additions & 0 deletions my-assignement-section-05/classification_model/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
import logging

from classification_model.config.core import PACKAGE_ROOT, config

# It is strongly advised that you do not add any handlers other than
# NullHandler to your library’s loggers. This is because the configuration
# of handlers is the prerogative of the application developer who uses your
# library. The application developer knows their target audience and what
# handlers are most appropriate for their application: if you add handlers
# ‘under the hood’, you might well interfere with their ability to carry out
# unit tests and deliver logs which suit their requirements.
# https://docs.python.org/3/howto/logging.html#configuring-logging-for-a-library
# Libraries should not install real log handlers; a NullHandler leaves the
# consuming application in full control of logging configuration.
logging.getLogger(config.app_config.package_name).addHandler(logging.NullHandler())


# The VERSION file shipped inside the package is the single source of truth
# for the package version.
with open(PACKAGE_ROOT / "VERSION") as version_file:
    __version__ = version_file.read().strip()
50 changes: 50 additions & 0 deletions my-assignement-section-05/classification_model/config.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
# Package Overview
package_name: classification_model

# Data Files
data_file: raw.csv

# Variables
# The variable we are attempting to predict (did the passenger survive?)
target: survived

pipeline_name: classification_model
# Filename prefix for the persisted pipeline; the package version and ".pkl"
# are appended at save/load time. (Fixed: the value previously contained a
# stray space, producing artifact names like
# "classification _model_output_v0.0.1.pkl".)
pipeline_save_file: classification_model_output_v

# Renamed because a "." in a column name is not a valid Python identifier
variables_to_rename:
  home.dest: home_dest

# NOTE(review): "survived" appears here and is also the target — likely
# target leakage; confirm the training script drops it before fitting.
features:
  - pclass
  - survived
  - sex
  - age
  - sibsp
  - parch
  - fare
  - cabin
  - embarked
  - title


# set train/test split
test_size: 0.1

# to set the random seed
random_state: 0

alpha: 0.001

numerical_vars:
  - age
  - fare

cabin:
  - cabin

categorical_vars:
  - sex
  - cabin
  - embarked
  - title
Empty file.
85 changes: 85 additions & 0 deletions my-assignement-section-05/classification_model/config/core.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
from pathlib import Path
from typing import Dict, List, Optional

from pydantic import BaseModel
from strictyaml import YAML, load

import classification_model

# Project Directories
# PACKAGE_ROOT resolves to the installed classification_model package folder,
# so the config file, datasets and trained models are located correctly
# regardless of the caller's working directory.
PACKAGE_ROOT = Path(classification_model.__file__).resolve().parent
ROOT = PACKAGE_ROOT.parent  # directory containing the package
CONFIG_FILE_PATH = PACKAGE_ROOT / "config.yml"  # packaged YAML configuration
DATASET_DIR = PACKAGE_ROOT / "datasets"  # location for dataset files
TRAINED_MODEL_DIR = PACKAGE_ROOT / "trained_models"  # persisted pipeline artifacts


class AppConfig(BaseModel):
    """
    Application-level config.

    Fields are populated from config.yml by create_and_validate_config().
    """

    package_name: str  # importable package name; also used as the logger name
    data_file: str  # raw dataset file name
    pipeline_save_file: str  # filename prefix for the persisted pipeline artifact


class ModelConfig(BaseModel):
    """
    All configuration relevant to model
    training and feature engineering.

    Fields are populated from config.yml by create_and_validate_config().
    """

    target: str  # name of the column being predicted ("survived" in config.yml)
    variables_to_rename: Dict  # raw column name -> valid Python identifier
    features: List[str]  # input columns selected for the pipeline
    test_size: float  # fraction of data held out for the test split
    random_state: int  # seed for reproducibility
    alpha: float  # regularisation parameter declared in config.yml (usage not shown in this module)
    categorical_vars: List[str]  # variables imputed with "missing" and encoded
    numerical_vars: List[str]  # variables imputed with the median + missing indicator
    cabin: List[str]  # variables passed to ExtractLetterTransformer in the pipeline


class Config(BaseModel):
    """Master config object combining application and model settings."""

    # Application-level settings (package name, file names).
    app_config: AppConfig
    # Model training / feature-engineering settings.
    model_config: ModelConfig
    # NOTE(review): ``model_config`` is a reserved attribute name in pydantic v2;
    # this field only works because the project pins pydantic<2.0.0 — confirm
    # before any pydantic upgrade.


def find_config_file() -> Path:
    """Locate the configuration file.

    Returns:
        Path to the packaged ``config.yml``.

    Raises:
        FileNotFoundError: if ``config.yml`` is not present at
            ``CONFIG_FILE_PATH``.
    """
    if CONFIG_FILE_PATH.is_file():
        return CONFIG_FILE_PATH
    # Raise a specific, idiomatic exception instead of a bare ``Exception``;
    # FileNotFoundError is still caught by any caller handling Exception/OSError.
    raise FileNotFoundError(f"Config not found at {CONFIG_FILE_PATH!r}")


def fetch_config_from_yaml(cfg_path: Optional[Path] = None) -> YAML:
    """Parse the YAML file containing the package configuration.

    Args:
        cfg_path: explicit path to a config file; when None, the packaged
            ``config.yml`` is located via ``find_config_file()``.

    Returns:
        The parsed strictyaml ``YAML`` object.
    """
    if cfg_path is None:
        # find_config_file() either returns an existing path or raises, so
        # cfg_path is always usable past this point. (The original trailing
        # ``raise OSError`` branch was unreachable: Path objects are always
        # truthy, so the ``if cfg_path`` guard could never fail.)
        cfg_path = find_config_file()

    with open(cfg_path, "r") as conf_file:
        return load(conf_file.read())


def create_and_validate_config(parsed_config: Optional[YAML] = None) -> Config:
    """Run validation on config values.

    Args:
        parsed_config: a pre-parsed strictyaml object; when None (the default
            — the annotation is Optional to match), the packaged config.yml
            is loaded.

    Returns:
        A validated ``Config`` instance.
    """
    if parsed_config is None:
        parsed_config = fetch_config_from_yaml()

    # ``.data`` unwraps the strictyaml YAML object into plain Python types.
    # Both sub-configs receive the full mapping; pydantic v1 ignores extra
    # keys by default, so each model keeps only its declared fields.
    _config = Config(
        app_config=AppConfig(**parsed_config.data),
        model_config=ModelConfig(**parsed_config.data),
    )

    return _config


# Module-level singleton: parsed and validated once at import time, then
# shared by the other modules in the package.
config = create_and_validate_config()
Empty file.
63 changes: 63 additions & 0 deletions my-assignement-section-05/classification_model/pipeline.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
# for encoding categorical variables
from feature_engine.encoding import OneHotEncoder, RareLabelEncoder
from feature_engine.imputation import (
AddMissingIndicator,
CategoricalImputer,
MeanMedianImputer,
)
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

from classification_model.config.core import config
from classification_model.processing import features as pp

# End-to-end Titanic survival pipeline: imputation -> cabin-letter extraction
# -> categorical encoding -> scaling -> logistic regression. Step order
# matters: the encoders must run after imputation so they never see NaNs.
titanic_pipe = Pipeline(
    [
        # ===== IMPUTATION =====
        # impute categorical variables with the string "missing"
        (
            "categorical_imputation",
            CategoricalImputer(
                imputation_method="missing",
                variables=config.model_config.categorical_vars,
            ),
        ),
        # add a missing indicator to numerical variables so the model can
        # learn from the missingness pattern itself
        (
            "missing_indicator",
            AddMissingIndicator(variables=config.model_config.numerical_vars),
        ),
        # impute numerical variables with the median
        (
            "median_imputation",
            MeanMedianImputer(
                imputation_method="median",
                variables=config.model_config.numerical_vars,
            ),
        ),
        # Extract the deck letter from the cabin variable
        (
            "extract_letter",
            pp.ExtractLetterTransformer(variables=config.model_config.cabin),
        ),
        # == CATEGORICAL ENCODING ======
        # group categories present in less than 5% of the observations (0.05)
        # into one category called 'Rare'
        (
            "rare_label_encoder",
            RareLabelEncoder(
                tol=0.05,
                n_categories=1,
                variables=config.model_config.categorical_vars,
            ),
        ),
        # encode categorical variables using one-hot encoding into k-1 columns
        (
            "categorical_encoder",
            OneHotEncoder(
                drop_last=True, variables=config.model_config.categorical_vars
            ),
        ),
        # scale features before the linear model
        ("scaler", StandardScaler()),
        # Use the configured seed instead of a hard-coded 0 so the pipeline is
        # fully driven by config.yml (config random_state is 0 today, so
        # behaviour is unchanged).
        (
            "Logit",
            LogisticRegression(
                C=0.0005, random_state=config.model_config.random_state
            ),
        ),
    ]
)
34 changes: 34 additions & 0 deletions my-assignement-section-05/classification_model/predict.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
import typing as t

import pandas as pd

from classification_model import __version__ as _version
from classification_model.config.core import config
from classification_model.processing.data_manager import load_pipeline
from classification_model.processing.validation import validate_inputs

# Load the persisted pipeline once at import time; the file name embeds the
# package version so predictions are always tied to a specific release.
pipeline_file_name = f"{config.app_config.pipeline_save_file}{_version}.pkl"
_titanic_pipe = load_pipeline(file_name=pipeline_file_name)


def make_prediction(
    *,
    input_data: t.Union[pd.DataFrame, dict],
) -> dict:
    """Make a prediction using a saved model pipeline.

    Args:
        input_data: raw input rows as a DataFrame or a dict convertible to one.

    Returns:
        A dict with keys ``predictions`` (None when validation fails),
        ``version`` (the package version) and ``errors`` (validation errors,
        or None).
    """
    frame = pd.DataFrame(input_data)
    validated_data, errors = validate_inputs(input_data=frame)

    # Only run the pipeline on inputs that passed validation.
    predictions = None
    if not errors:
        predictions = _titanic_pipe.predict(
            X=validated_data[config.model_config.features]
        )

    return {
        "predictions": predictions,
        "version": _version,
        "errors": errors,
    }
Empty file.
Loading