Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Prova #948

Open
wants to merge 11 commits into
base: master
Choose a base branch
from
Open

Prova #948

Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .circleci/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ orbs:

defaults: &defaults
docker:
- image: cimg/python:3.11.1
- image: cimg/python:3.9.18
working_directory: ~/project

prepare_venv: &prepare_venv
Expand Down Expand Up @@ -82,7 +82,7 @@ jobs:
steps:
- setup_remote_docker:
# Supported versions: https://circleci.com/docs/2.0/building-docker-images/#docker-version
version: 20.10.18
version: default
- checkout:
path: ~/project/
- node/install:
Expand Down
4 changes: 3 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
# Deployment of Machine Learning Models
Accompanying repo for the online course Deployment of Machine Learning Models.

For the documentation, visit the [course on Udemy](https://www.udemy.com/deployment-of-machine-learning-models/?couponCode=TIDREPO).
For the documentation, visit the [course on Udemy](https://www.udemy.com/deployment-of-machine-learning-models/?couponCode=TIDREPO)


9 changes: 5 additions & 4 deletions assignment-section-05/requirements/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
# We use compatible release functionality (see PEP 440 here: https://www.python.org/dev/peps/pep-0440/#compatible-release)
# to specify acceptable version ranges of our project dependencies. This gives us the flexibility to keep up with small
# updates/fixes, whilst ensuring we don't install a major update which could introduce backwards incompatible changes.
numpy>=1.21.0,<2.0.0
numpy>=1.21.0,<1.25.0
pandas>=1.3.5,<2.0.0
pydantic>=1.8.1,<2.0.0
scikit-learn>=1.1.3,<2.0.0
scikit-learn>=1.0.2,<1.1.0
strictyaml>=1.3.2,<2.0.0
ruamel.yaml>=0.16.12,<1.0.0
feature-engine>=1.0.2,<2.0.0
joblib>=1.0.1,<2.0.0
feature-engine>=1.0.2,<1.6.0 # breaking change in v1.6.0
joblib>=1.0.1,<2.0.0
setuptools<60
1 change: 1 addition & 0 deletions assignment-section-05/tests/test_prediction.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ def test_make_prediction(sample_input_data):

# Then
predictions = result.get("predictions")
print(predictions)
assert isinstance(predictions, np.ndarray)
assert isinstance(predictions[0], np.int64)
assert result.get("errors") is None
Expand Down
1 change: 1 addition & 0 deletions assignment-section-05/tox.ini
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ envlist = test_package, checks
skipsdist = True

[testenv]
basepython = python3.9
install_command = pip install {opts} {packages}

[testenv:test_package]
Expand Down
1 change: 1 addition & 0 deletions my-assignement-section-05/classification_model/VERSION
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
0.0.1
17 changes: 17 additions & 0 deletions my-assignement-section-05/classification_model/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
import logging

from classification_model.config.core import PACKAGE_ROOT, config

# It is strongly advised that you do not add any handlers other than
# NullHandler to your library’s loggers. This is because the configuration
# of handlers is the prerogative of the application developer who uses your
# library. The application developer knows their target audience and what
# handlers are most appropriate for their application: if you add handlers
# ‘under the hood’, you might well interfere with their ability to carry out
# unit tests and deliver logs which suit their requirements.
# https://docs.python.org/3/howto/logging.html#configuring-logging-for-a-library
# Libraries should not install real log handlers; a NullHandler leaves the
# consuming application in full control of logging configuration.
logging.getLogger(config.app_config.package_name).addHandler(logging.NullHandler())


# The VERSION file shipped inside the package is the single source of truth
# for the package version.
with open(PACKAGE_ROOT / "VERSION") as version_file:
    __version__ = version_file.read().strip()
50 changes: 50 additions & 0 deletions my-assignement-section-05/classification_model/config.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
# Package Overview
package_name: classification_model

# Data Files
data_file: raw.csv

# Variables
# The variable we are attempting to predict (did the passenger survive?)
target: survived

pipeline_name: classification_model
# Filename prefix for the persisted pipeline; the package version and ".pkl"
# are appended at save/load time. (Fixed: the value previously contained a
# stray space, producing artifact names like
# "classification _model_output_v0.0.1.pkl".)
pipeline_save_file: classification_model_output_v

# Renamed because a "." in a column name is not a valid Python identifier
variables_to_rename:
  home.dest: home_dest

# NOTE(review): "survived" appears here and is also the target — likely
# target leakage; confirm the training script drops it before fitting.
features:
  - pclass
  - survived
  - sex
  - age
  - sibsp
  - parch
  - fare
  - cabin
  - embarked
  - title


# set train/test split
test_size: 0.1

# to set the random seed
random_state: 0

alpha: 0.001

numerical_vars:
  - age
  - fare

cabin:
  - cabin

categorical_vars:
  - sex
  - cabin
  - embarked
  - title
Empty file.
85 changes: 85 additions & 0 deletions my-assignement-section-05/classification_model/config/core.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
from pathlib import Path
from typing import Dict, List, Optional

from pydantic import BaseModel
from strictyaml import YAML, load

import classification_model

# Project Directories
# PACKAGE_ROOT resolves to the installed classification_model package folder,
# so the config file, datasets and trained models are located correctly
# regardless of the caller's working directory.
PACKAGE_ROOT = Path(classification_model.__file__).resolve().parent
ROOT = PACKAGE_ROOT.parent  # directory containing the package
CONFIG_FILE_PATH = PACKAGE_ROOT / "config.yml"  # packaged YAML configuration
DATASET_DIR = PACKAGE_ROOT / "datasets"  # location for dataset files
TRAINED_MODEL_DIR = PACKAGE_ROOT / "trained_models"  # persisted pipeline artifacts


class AppConfig(BaseModel):
    """
    Application-level config.

    Fields are populated from config.yml by create_and_validate_config().
    """

    package_name: str  # importable package name; also used as the logger name
    data_file: str  # raw dataset file name
    pipeline_save_file: str  # filename prefix for the persisted pipeline artifact


class ModelConfig(BaseModel):
    """
    All configuration relevant to model
    training and feature engineering.

    Fields are populated from config.yml by create_and_validate_config().
    """

    target: str  # name of the column being predicted ("survived" in config.yml)
    variables_to_rename: Dict  # raw column name -> valid Python identifier
    features: List[str]  # input columns selected for the pipeline
    test_size: float  # fraction of data held out for the test split
    random_state: int  # seed for reproducibility
    alpha: float  # regularisation parameter declared in config.yml (usage not shown in this module)
    categorical_vars: List[str]  # variables imputed with "missing" and encoded
    numerical_vars: List[str]  # variables imputed with the median + missing indicator
    cabin: List[str]  # variables passed to ExtractLetterTransformer in the pipeline


class Config(BaseModel):
    """Master config object combining application and model settings."""

    # Application-level settings (package name, file names).
    app_config: AppConfig
    # Model training / feature-engineering settings.
    model_config: ModelConfig
    # NOTE(review): ``model_config`` is a reserved attribute name in pydantic v2;
    # this field only works because the project pins pydantic<2.0.0 — confirm
    # before any pydantic upgrade.


def find_config_file() -> Path:
    """Locate the configuration file.

    Returns:
        Path to the packaged ``config.yml``.

    Raises:
        FileNotFoundError: if ``config.yml`` is not present at
            ``CONFIG_FILE_PATH``.
    """
    if CONFIG_FILE_PATH.is_file():
        return CONFIG_FILE_PATH
    # Raise a specific, idiomatic exception instead of a bare ``Exception``;
    # FileNotFoundError is still caught by any caller handling Exception/OSError.
    raise FileNotFoundError(f"Config not found at {CONFIG_FILE_PATH!r}")


def fetch_config_from_yaml(cfg_path: Optional[Path] = None) -> YAML:
    """Parse the YAML file containing the package configuration.

    Args:
        cfg_path: explicit path to a config file; when None, the packaged
            ``config.yml`` is located via ``find_config_file()``.

    Returns:
        The parsed strictyaml ``YAML`` object.
    """
    if cfg_path is None:
        # find_config_file() either returns an existing path or raises, so
        # cfg_path is always usable past this point. (The original trailing
        # ``raise OSError`` branch was unreachable: Path objects are always
        # truthy, so the ``if cfg_path`` guard could never fail.)
        cfg_path = find_config_file()

    with open(cfg_path, "r") as conf_file:
        return load(conf_file.read())


def create_and_validate_config(parsed_config: Optional[YAML] = None) -> Config:
    """Run validation on config values.

    Args:
        parsed_config: a pre-parsed strictyaml object; when None (the default
            — the annotation is Optional to match), the packaged config.yml
            is loaded.

    Returns:
        A validated ``Config`` instance.
    """
    if parsed_config is None:
        parsed_config = fetch_config_from_yaml()

    # ``.data`` unwraps the strictyaml YAML object into plain Python types.
    # Both sub-configs receive the full mapping; pydantic v1 ignores extra
    # keys by default, so each model keeps only its declared fields.
    _config = Config(
        app_config=AppConfig(**parsed_config.data),
        model_config=ModelConfig(**parsed_config.data),
    )

    return _config


# Module-level singleton: parsed and validated once at import time, then
# shared by the other modules in the package.
config = create_and_validate_config()
Empty file.
63 changes: 63 additions & 0 deletions my-assignement-section-05/classification_model/pipeline.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
# for encoding categorical variables
from feature_engine.encoding import OneHotEncoder, RareLabelEncoder
from feature_engine.imputation import (
AddMissingIndicator,
CategoricalImputer,
MeanMedianImputer,
)
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

from classification_model.config.core import config
from classification_model.processing import features as pp

# End-to-end Titanic survival pipeline: imputation -> cabin-letter extraction
# -> categorical encoding -> scaling -> logistic regression. Step order
# matters: the encoders must run after imputation so they never see NaNs.
titanic_pipe = Pipeline(
    [
        # ===== IMPUTATION =====
        # impute categorical variables with the string "missing"
        (
            "categorical_imputation",
            CategoricalImputer(
                imputation_method="missing",
                variables=config.model_config.categorical_vars,
            ),
        ),
        # add a missing indicator to numerical variables so the model can
        # learn from the missingness pattern itself
        (
            "missing_indicator",
            AddMissingIndicator(variables=config.model_config.numerical_vars),
        ),
        # impute numerical variables with the median
        (
            "median_imputation",
            MeanMedianImputer(
                imputation_method="median",
                variables=config.model_config.numerical_vars,
            ),
        ),
        # Extract the deck letter from the cabin variable
        (
            "extract_letter",
            pp.ExtractLetterTransformer(variables=config.model_config.cabin),
        ),
        # == CATEGORICAL ENCODING ======
        # group categories present in less than 5% of the observations (0.05)
        # into one category called 'Rare'
        (
            "rare_label_encoder",
            RareLabelEncoder(
                tol=0.05,
                n_categories=1,
                variables=config.model_config.categorical_vars,
            ),
        ),
        # encode categorical variables using one-hot encoding into k-1 columns
        (
            "categorical_encoder",
            OneHotEncoder(
                drop_last=True, variables=config.model_config.categorical_vars
            ),
        ),
        # scale features before the linear model
        ("scaler", StandardScaler()),
        # Use the configured seed instead of a hard-coded 0 so the pipeline is
        # fully driven by config.yml (config random_state is 0 today, so
        # behaviour is unchanged).
        (
            "Logit",
            LogisticRegression(
                C=0.0005, random_state=config.model_config.random_state
            ),
        ),
    ]
)
34 changes: 34 additions & 0 deletions my-assignement-section-05/classification_model/predict.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
import typing as t

import pandas as pd

from classification_model import __version__ as _version
from classification_model.config.core import config
from classification_model.processing.data_manager import load_pipeline
from classification_model.processing.validation import validate_inputs

# Load the persisted pipeline once at import time; the file name embeds the
# package version so predictions are always tied to a specific release.
pipeline_file_name = f"{config.app_config.pipeline_save_file}{_version}.pkl"
_titanic_pipe = load_pipeline(file_name=pipeline_file_name)


def make_prediction(
    *,
    input_data: t.Union[pd.DataFrame, dict],
) -> dict:
    """Make a prediction using a saved model pipeline.

    Args:
        input_data: raw input rows as a DataFrame or a dict convertible to one.

    Returns:
        A dict with keys ``predictions`` (None when validation fails),
        ``version`` (the package version) and ``errors`` (validation errors,
        or None).
    """
    frame = pd.DataFrame(input_data)
    validated_data, errors = validate_inputs(input_data=frame)

    # Only run the pipeline on inputs that passed validation.
    predictions = None
    if not errors:
        predictions = _titanic_pipe.predict(
            X=validated_data[config.model_config.features]
        )

    return {
        "predictions": predictions,
        "version": _version,
        "errors": errors,
    }
Empty file.
Loading