diff --git a/.circleci/config.yml b/.circleci/config.yml
index 037645ab2..f3cbb7e17 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -4,7 +4,7 @@ orbs:
defaults: &defaults
docker:
- - image: cimg/python:3.11.1
+ - image: cimg/python:3.9.18
working_directory: ~/project
prepare_venv: &prepare_venv
@@ -82,7 +82,7 @@ jobs:
steps:
- setup_remote_docker:
# Supported versions: https://circleci.com/docs/2.0/building-docker-images/#docker-version
- version: 20.10.18
+ version: default
- checkout:
path: ~/project/
- node/install:
diff --git a/README.md b/README.md
index 7fbf80b75..568b30a89 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,6 @@
# Deployment of Machine Learning Models
Accompanying repo for the online course Deployment of Machine Learning Models.
-For the documentation, visit the [course on Udemy](https://www.udemy.com/deployment-of-machine-learning-models/?couponCode=TIDREPO).
+For the documentation, visit the [course on Udemy](https://www.udemy.com/deployment-of-machine-learning-models/?couponCode=TIDREPO).
+
+
diff --git a/assignment-section-05/requirements/requirements.txt b/assignment-section-05/requirements/requirements.txt
index f3783b618..a0800e668 100644
--- a/assignment-section-05/requirements/requirements.txt
+++ b/assignment-section-05/requirements/requirements.txt
@@ -1,11 +1,12 @@
# We use compatible release functionality (see PEP 440 here: https://www.python.org/dev/peps/pep-0440/#compatible-release)
# to specify acceptable version ranges of our project dependencies. This gives us the flexibility to keep up with small
# updates/fixes, whilst ensuring we don't install a major update which could introduce backwards incompatible changes.
-numpy>=1.21.0,<2.0.0
+numpy>=1.21.0,<1.25.0
pandas>=1.3.5,<2.0.0
pydantic>=1.8.1,<2.0.0
-scikit-learn>=1.1.3,<2.0.0
+scikit-learn>=1.0.2,<1.1.0
strictyaml>=1.3.2,<2.0.0
ruamel.yaml>=0.16.12,<1.0.0
-feature-engine>=1.0.2,<2.0.0
-joblib>=1.0.1,<2.0.0
\ No newline at end of file
+feature-engine>=1.0.2,<1.6.0 # breaking change in v1.6.0
+joblib>=1.0.1,<2.0.0
+setuptools<60
\ No newline at end of file
diff --git a/assignment-section-05/tests/test_prediction.py b/assignment-section-05/tests/test_prediction.py
index 76965698a..e0d4af892 100644
--- a/assignment-section-05/tests/test_prediction.py
+++ b/assignment-section-05/tests/test_prediction.py
@@ -17,6 +17,7 @@ def test_make_prediction(sample_input_data):
# Then
predictions = result.get("predictions")
+ print(predictions)
assert isinstance(predictions, np.ndarray)
assert isinstance(predictions[0], np.int64)
assert result.get("errors") is None
diff --git a/assignment-section-05/tox.ini b/assignment-section-05/tox.ini
index 37829355f..76484c454 100644
--- a/assignment-section-05/tox.ini
+++ b/assignment-section-05/tox.ini
@@ -12,6 +12,7 @@ envlist = test_package, checks
skipsdist = True
[testenv]
+basepython = python3.9
install_command = pip install {opts} {packages}
[testenv:test_package]
diff --git a/my-assignement-section-05/classification_model/VERSION b/my-assignement-section-05/classification_model/VERSION
new file mode 100644
index 000000000..8acdd82b7
--- /dev/null
+++ b/my-assignement-section-05/classification_model/VERSION
@@ -0,0 +1 @@
+0.0.1
diff --git a/my-assignement-section-05/classification_model/__init__.py b/my-assignement-section-05/classification_model/__init__.py
new file mode 100644
index 000000000..8cea86752
--- /dev/null
+++ b/my-assignement-section-05/classification_model/__init__.py
@@ -0,0 +1,17 @@
+import logging
+
+from classification_model.config.core import PACKAGE_ROOT, config
+
+# It is strongly advised that you do not add any handlers other than
+# NullHandler to your library’s loggers. This is because the configuration
+# of handlers is the prerogative of the application developer who uses your
+# library. The application developer knows their target audience and what
+# handlers are most appropriate for their application: if you add handlers
+# ‘under the hood’, you might well interfere with their ability to carry out
+# unit tests and deliver logs which suit their requirements.
+# https://docs.python.org/3/howto/logging.html#configuring-logging-for-a-library
+logging.getLogger(config.app_config.package_name).addHandler(logging.NullHandler())
+
+
+# Expose the package version as stored in the VERSION file inside the package directory.
+__version__ = (PACKAGE_ROOT / "VERSION").read_text().strip()
diff --git a/my-assignement-section-05/classification_model/config.yml b/my-assignement-section-05/classification_model/config.yml
new file mode 100644
index 000000000..56f5dc65a
--- /dev/null
+++ b/my-assignement-section-05/classification_model/config.yml
@@ -0,0 +1,50 @@
+# Package Overview
+package_name: classification_model
+
+# Data Files
+data_file: raw.csv
+
+# Variables
+# The variable we are attempting to predict (passenger survival)
+target: survived
+
+pipeline_name: classification_model
+# Prefix of the persisted pipeline file; the package version and ".pkl" are appended.
+pipeline_save_file: classification_model_output_v
+
+# Column names that are not valid Python identifiers (e.g. they contain a dot)
+variables_to_rename:
+ home.dest: home_dest
+
+# Model inputs. The target (`survived`) must not be listed here: otherwise the
+# model is trained on its own label and callers would have to supply it at
+# prediction time.
+features:
+  - pclass
+  - sex
+  - age
+  - sibsp
+  - parch
+  - fare
+  - cabin
+  - embarked
+  - title
+
+
+# set train/test split
+test_size: 0.1
+
+# to set the random seed
+random_state: 0
+
+alpha: 0.001
+
+numerical_vars:
+ - age
+ - fare
+
+cabin:
+ - cabin
+
+categorical_vars:
+ - sex
+ - cabin
+ - embarked
+ - title
\ No newline at end of file
diff --git a/my-assignement-section-05/classification_model/config/__init__.py b/my-assignement-section-05/classification_model/config/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/my-assignement-section-05/classification_model/config/core.py b/my-assignement-section-05/classification_model/config/core.py
new file mode 100644
index 000000000..75cdf5518
--- /dev/null
+++ b/my-assignement-section-05/classification_model/config/core.py
@@ -0,0 +1,85 @@
+from pathlib import Path
+from typing import Dict, List, Optional
+
+from pydantic import BaseModel
+from strictyaml import YAML, load
+
+import classification_model
+
+# Project Directories
+# Directory that contains the package's __init__.py.
+PACKAGE_ROOT = Path(classification_model.__file__).resolve().parent
+# Parent directory of the package directory.
+ROOT = PACKAGE_ROOT.parent
+# YAML configuration file located inside the package directory.
+CONFIG_FILE_PATH = PACKAGE_ROOT / "config.yml"
+# Package sub-directories used for datasets and for persisted pipelines.
+DATASET_DIR = PACKAGE_ROOT / "datasets"
+TRAINED_MODEL_DIR = PACKAGE_ROOT / "trained_models"
+
+
+class AppConfig(BaseModel):
+ """
+ Application-level config.
+
+ Packaging-related settings validated from the parsed YAML config.
+ """
+
+ # Package name; the package __init__ uses it as the logger name.
+ package_name: str
+ # File name of the dataset, resolved against DATASET_DIR when loading.
+ data_file: str
+ # Prefix of the persisted pipeline file; version and ".pkl" are appended.
+ pipeline_save_file: str
+
+
+class ModelConfig(BaseModel):
+ """
+ All configuration relevant to model
+ training and feature engineering.
+ """
+
+ # Name of the column the model predicts.
+ target: str
+ # Mapping of raw column names to replacement names.
+ # NOTE(review): not referenced by any code visible in this change - confirm it is still needed.
+ variables_to_rename: Dict
+ # Columns selected as model inputs for training and prediction.
+ features: List[str]
+ # Fraction of the data held out by train_test_split.
+ test_size: float
+ # Seed passed to train_test_split for reproducibility.
+ random_state: int
+ # NOTE(review): not referenced by any code visible in this change - confirm its purpose.
+ alpha: float
+ # Columns handled by the categorical imputer, rare-label encoder and one-hot encoder.
+ categorical_vars: List[str]
+ # Columns handled by the missing indicator and the median imputer.
+ numerical_vars: List[str]
+ # Columns reduced to their first character by ExtractLetterTransformer.
+ cabin: List[str]
+
+
+class Config(BaseModel):
+ """Master config object."""
+
+ # Packaging-related settings.
+ app_config: AppConfig
+ # Training and feature-engineering settings.
+ model_config: ModelConfig
+
+
+def find_config_file() -> Path:
+    """Return the path of the package configuration file.
+
+    Raises:
+        Exception: if nothing exists at ``CONFIG_FILE_PATH``.
+    """
+    if not CONFIG_FILE_PATH.is_file():
+        raise Exception(f"Config not found at {CONFIG_FILE_PATH!r}")
+    return CONFIG_FILE_PATH
+
+
+def fetch_config_from_yaml(cfg_path: Optional[Path] = None) -> YAML:
+    """Parse the YAML file containing the package configuration.
+
+    Args:
+        cfg_path: location of the config file. When omitted (or falsy) the
+            default location returned by ``find_config_file`` is used.
+
+    Returns:
+        The strictyaml document produced by ``load``.
+
+    Raises:
+        Exception: propagated from ``find_config_file`` when the default
+            config file does not exist.
+        OSError: raised by ``open`` when an explicitly given path cannot be read.
+    """
+    if not cfg_path:
+        cfg_path = find_config_file()
+
+    # cfg_path is always set at this point, so no further guard is needed;
+    # an unreadable path surfaces as the OSError raised by open().
+    with open(cfg_path, "r") as conf_file:
+        return load(conf_file.read())
+
+
+def create_and_validate_config(parsed_config: Optional[YAML] = None) -> Config:
+    """Run validation on config values.
+
+    Args:
+        parsed_config: an already parsed strictyaml document. When ``None``
+            the package config file is read via ``fetch_config_from_yaml``.
+
+    Returns:
+        A ``Config`` whose ``app_config`` and ``model_config`` sections are
+        both built from the same flat mapping of YAML keys.
+    """
+    if parsed_config is None:
+        parsed_config = fetch_config_from_yaml()
+
+    # specify the data attribute from the strictyaml YAML type.
+    _config = Config(
+        app_config=AppConfig(**parsed_config.data),
+        model_config=ModelConfig(**parsed_config.data),
+    )
+
+    return _config
+
+
+# Validated configuration, built once when this module is imported.
+config = create_and_validate_config()
diff --git a/my-assignement-section-05/classification_model/datasets/__init__.py b/my-assignement-section-05/classification_model/datasets/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/my-assignement-section-05/classification_model/pipeline.py b/my-assignement-section-05/classification_model/pipeline.py
new file mode 100644
index 000000000..11fa84c71
--- /dev/null
+++ b/my-assignement-section-05/classification_model/pipeline.py
@@ -0,0 +1,63 @@
+# for encoding categorical variables
+from feature_engine.encoding import OneHotEncoder, RareLabelEncoder
+from feature_engine.imputation import (
+ AddMissingIndicator,
+ CategoricalImputer,
+ MeanMedianImputer,
+)
+from sklearn.linear_model import LogisticRegression
+from sklearn.pipeline import Pipeline
+from sklearn.preprocessing import StandardScaler
+
+from classification_model.config.core import config
+from classification_model.processing import features as pp
+
+# Column groups taken from the package config, named once for readability.
+_categorical_vars = config.model_config.categorical_vars
+_numerical_vars = config.model_config.numerical_vars
+_cabin_vars = config.model_config.cabin
+
+# Ordered (name, transformer) steps; they run in the order listed.
+_pipeline_steps = [
+    # ===== IMPUTATION =====
+    # fill missing categorical values with a "missing" label
+    (
+        "categorical_imputation",
+        CategoricalImputer(imputation_method="missing", variables=_categorical_vars),
+    ),
+    # flag which numerical values were missing before they get imputed
+    ("missing_indicator", AddMissingIndicator(variables=_numerical_vars)),
+    # fill missing numerical values with the median
+    (
+        "median_imputation",
+        MeanMedianImputer(imputation_method="median", variables=_numerical_vars),
+    ),
+    # keep only the first character of the cabin value
+    ("extract_letter", pp.ExtractLetterTransformer(variables=_cabin_vars)),
+    # == CATEGORICAL ENCODING ======
+    # group categories seen in less than 5% of the observations (0.05)
+    # into a single 'Rare' category
+    (
+        "rare_label_encoder",
+        RareLabelEncoder(tol=0.05, n_categories=1, variables=_categorical_vars),
+    ),
+    # one hot encode the categorical variables into k-1 columns
+    (
+        "categorical_encoder",
+        OneHotEncoder(drop_last=True, variables=_categorical_vars),
+    ),
+    # scale
+    ("scaler", StandardScaler()),
+    # NOTE(review): C and random_state are hard-coded here rather than read
+    # from the config - confirm this is intended.
+    ("Logit", LogisticRegression(C=0.0005, random_state=0)),
+]
+
+titanic_pipe = Pipeline(_pipeline_steps)
diff --git a/my-assignement-section-05/classification_model/predict.py b/my-assignement-section-05/classification_model/predict.py
new file mode 100644
index 000000000..eb2990bb3
--- /dev/null
+++ b/my-assignement-section-05/classification_model/predict.py
@@ -0,0 +1,34 @@
+import typing as t
+
+import pandas as pd
+
+from classification_model import __version__ as _version
+from classification_model.config.core import config
+from classification_model.processing.data_manager import load_pipeline
+from classification_model.processing.validation import validate_inputs
+
+# File name of the persisted pipeline: configured prefix + package version + ".pkl".
+pipeline_file_name = f"{config.app_config.pipeline_save_file}{_version}.pkl"
+# Loaded at import time, so importing this module needs that file in TRAINED_MODEL_DIR.
+_titanic_pipe = load_pipeline(file_name=pipeline_file_name)
+
+
+def make_prediction(
+    *,
+    input_data: t.Union[pd.DataFrame, dict],
+) -> dict:
+    """Make a prediction using a saved model pipeline.
+
+    Args:
+        input_data: the observations to score, as a DataFrame or anything
+            ``pd.DataFrame`` accepts.
+
+    Returns:
+        A dict with keys ``predictions`` (``None`` when validation reported
+        errors), ``version`` (the package version) and ``errors`` (the
+        validation errors, if any).
+    """
+    frame = pd.DataFrame(input_data)
+    validated_data, errors = validate_inputs(input_data=frame)
+
+    # Only score the data when validation reported nothing.
+    predictions = None
+    if not errors:
+        predictions = _titanic_pipe.predict(
+            X=validated_data[config.model_config.features]
+        )
+
+    return {"predictions": predictions, "version": _version, "errors": errors}
diff --git a/my-assignement-section-05/classification_model/processing/__init__.py b/my-assignement-section-05/classification_model/processing/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/my-assignement-section-05/classification_model/processing/data_manager.py b/my-assignement-section-05/classification_model/processing/data_manager.py
new file mode 100644
index 000000000..032282501
--- /dev/null
+++ b/my-assignement-section-05/classification_model/processing/data_manager.py
@@ -0,0 +1,102 @@
+import re
+import typing as t
+from pathlib import Path
+
+import joblib
+import numpy as np
+import pandas as pd
+from sklearn.pipeline import Pipeline
+
+from classification_model import __version__ as _version
+from classification_model.config.core import DATASET_DIR, TRAINED_MODEL_DIR, config
+
+
+def load_dataset(*, file_name: str) -> pd.DataFrame:
+    """Read ``file_name`` from ``DATASET_DIR`` and apply the preliminary transformations."""
+    csv_path = Path(f"{DATASET_DIR}/{file_name}")
+    raw_frame = pd.read_csv(csv_path)
+    return preliminary_transformations(raw_frame)
+
+
+def preliminary_transformations(dataframe: pd.DataFrame) -> pd.DataFrame:
+    """Clean the raw data before it reaches the pipeline.
+
+    Replaces "?" with NaN, keeps the first cabin entry, derives ``title``
+    from ``name``, casts ``fare`` and ``age`` to float and drops every column
+    that is neither a configured feature nor the target.
+    """
+    cleaned = dataframe.replace("?", np.nan)
+    cleaned["cabin"] = cleaned["cabin"].apply(get_first_cabin)
+    cleaned["title"] = cleaned["name"].apply(get_title)
+    for numeric_column in ("fare", "age"):
+        cleaned[numeric_column] = cleaned[numeric_column].astype("float")
+    return drop_unused_variables(cleaned)
+
+
+def get_title(passenger):
+    """Return the first of "Mrs", "Mr", "Miss", "Master" found in the name, else "Other"."""
+    # "Mrs" has to be tried before "Mr", which it contains.
+    for title in ("Mrs", "Mr", "Miss", "Master"):
+        if re.search(title, passenger):
+            return title
+    return "Other"
+
+
+def get_first_cabin(row):
+    """Return the first whitespace-separated token of ``row``.
+
+    Returns NaN when ``row`` has no ``split`` method (e.g. a float NaN) or
+    contains no tokens.
+    """
+    try:
+        tokens = row.split()
+        first_cabin = tokens[0]
+    except (AttributeError, IndexError):
+        return np.nan
+    return first_cabin
+
+
+def drop_unused_variables(dataframe: pd.DataFrame) -> pd.DataFrame:
+    """Drop, in place, every column that is neither a configured feature nor the target.
+
+    The same (mutated) frame is returned for convenience.
+    """
+    columns_to_keep = {*config.model_config.features, config.model_config.target}
+    columns_to_drop = [
+        column for column in dataframe.columns if column not in columns_to_keep
+    ]
+    dataframe.drop(columns=columns_to_drop, inplace=True)
+    return dataframe
+
+
+def save_pipeline(*, pipeline_to_persist: Pipeline) -> None:
+    """Persist the pipeline under a versioned file name.
+
+    Any previously saved model is removed first, so a published package
+    ships exactly one trained model whose version matches the package
+    version.
+    """
+    versioned_name = f"{config.app_config.pipeline_save_file}{_version}.pkl"
+
+    # Clear out the other artefacts before writing the current one.
+    remove_old_pipelines(files_to_keep=[versioned_name])
+    joblib.dump(pipeline_to_persist, TRAINED_MODEL_DIR / versioned_name)
+
+
+def load_pipeline(*, file_name: str) -> Pipeline:
+    """Load the persisted pipeline stored as ``file_name`` in ``TRAINED_MODEL_DIR``."""
+    return joblib.load(filename=TRAINED_MODEL_DIR / file_name)
+
+
+def remove_old_pipelines(*, files_to_keep: t.List[str]) -> None:
+    """
+    Remove old model pipelines.
+    This is to ensure there is a simple one-to-one
+    mapping between the package version and the model
+    version to be imported and used by other applications.
+
+    Args:
+        files_to_keep: file names inside ``TRAINED_MODEL_DIR`` that must
+            survive; ``__init__.py`` is always kept as well.
+    """
+    do_not_delete = set(files_to_keep) | {"__init__.py"}
+    for model_file in TRAINED_MODEL_DIR.iterdir():
+        # Only plain files are candidates: unlink() fails on sub-directories
+        # such as __pycache__, and those are not model artefacts.
+        if model_file.is_file() and model_file.name not in do_not_delete:
+            model_file.unlink()
+
+
+def save_dataset(*, dataset: pd.DataFrame, dataset_name: str) -> None:
+    """Write ``dataset`` to ``<DATASET_DIR>/<dataset_name>.csv`` without the index."""
+    destination = DATASET_DIR / f"{dataset_name}.csv"
+    dataset.to_csv(destination, index=False)
diff --git a/my-assignement-section-05/classification_model/processing/features.py b/my-assignement-section-05/classification_model/processing/features.py
new file mode 100644
index 000000000..6e4fc8d3e
--- /dev/null
+++ b/my-assignement-section-05/classification_model/processing/features.py
@@ -0,0 +1,26 @@
+from sklearn.base import BaseEstimator, TransformerMixin
+
+
+class ExtractLetterTransformer(BaseEstimator, TransformerMixin):
+    """Replace each configured column with the first character of its values."""
+
+    def __init__(self, variables):
+        """Store the column names to transform.
+
+        Raises:
+            ValueError: if ``variables`` is not a list.
+        """
+        if isinstance(variables, list):
+            self.variables = variables
+        else:
+            raise ValueError("variables should be a list")
+
+    def fit(self, X, y=None):
+        """Nothing is learned; the method exists so the step fits a sklearn pipeline."""
+        return self
+
+    def transform(self, X):
+        """Return a copy of ``X`` with each configured column cut to its first character."""
+        # work on a copy so the caller's dataframe is left untouched
+        transformed = X.copy()
+
+        for column in self.variables:
+            transformed[column] = transformed[column].str[0]
+
+        return transformed
diff --git a/my-assignement-section-05/classification_model/processing/validation.py b/my-assignement-section-05/classification_model/processing/validation.py
new file mode 100644
index 000000000..d8c34cfa9
--- /dev/null
+++ b/my-assignement-section-05/classification_model/processing/validation.py
@@ -0,0 +1,63 @@
+from typing import List, Optional, Tuple
+
+import numpy as np
+import pandas as pd
+from pydantic import BaseModel, ValidationError
+
+from classification_model.config.core import config
+
+
+def drop_na_inputs(*, input_data: pd.DataFrame) -> pd.DataFrame:
+    """Check model inputs for na values and filter.
+
+    Rows are dropped when they have a missing value in a feature the
+    pipeline does not impute, i.e. a feature that is in neither
+    ``categorical_vars`` nor ``numerical_vars``.
+
+    Args:
+        input_data: frame containing at least the configured features.
+
+    Returns:
+        A filtered copy of ``input_data``; the input itself is not modified.
+    """
+    validated_data = input_data.copy()
+    # These are the only variable groups defined on ModelConfig; the pipeline
+    # imputes both, so missing values in them are acceptable.
+    imputed_vars = set(config.model_config.categorical_vars) | set(
+        config.model_config.numerical_vars
+    )
+    new_vars_with_na = [
+        var
+        for var in config.model_config.features
+        if var not in imputed_vars and validated_data[var].isnull().any()
+    ]
+    validated_data.dropna(subset=new_vars_with_na, inplace=True)
+
+    return validated_data
+
+
+def validate_inputs(*, input_data: pd.DataFrame) -> Tuple[pd.DataFrame, Optional[str]]:
+    """Check model inputs for unprocessable values.
+
+    Args:
+        input_data: frame containing at least the configured features.
+
+    Returns:
+        A tuple ``(relevant_data, errors)``: ``relevant_data`` is a copy of the
+        feature columns, and ``errors`` is ``None`` when validation passed or
+        the JSON string produced by pydantic's ``ValidationError.json()``.
+    """
+
+    # keep only the model's input features, on a copy of the caller's frame
+    relevant_data = input_data[config.model_config.features].copy()
+    errors = None
+
+    try:
+        # replace numpy nans so that pydantic can validate
+        MultipleTitanicDataInputs(
+            inputs=relevant_data.replace({np.nan: None}).to_dict(orient="records")
+        )
+    except ValidationError as error:
+        errors = error.json()
+
+    return relevant_data, errors
+
+
+class TitanicDataInputSchema(BaseModel):
+    """Schema for one input record; every field is optional."""
+
+    pclass: Optional[int]
+    survived: Optional[int]
+    name: Optional[str]
+    sex: Optional[str]
+    age: Optional[float]
+    sibsp: Optional[int]
+    parch: Optional[int]
+    # NOTE(review): widened from int on the assumption that the raw values
+    # are not always numeric - confirm against the dataset.
+    ticket: Optional[str]
+    fare: Optional[float]
+    cabin: Optional[str]
+    embarked: Optional[str]
+    # NOTE(review): widened from int for the same reason as `ticket`.
+    boat: Optional[str]
+    body: Optional[int]
+    home_dest: Optional[str]
+    # `title` is a configured model feature, so it is validated like the others.
+    title: Optional[str]
+
+
+class MultipleTitanicDataInputs(BaseModel):
+ # Batch wrapper: one TitanicDataInputSchema entry per input record.
+ inputs: List[TitanicDataInputSchema]
diff --git a/my-assignement-section-05/classification_model/train_pipeline.py b/my-assignement-section-05/classification_model/train_pipeline.py
new file mode 100644
index 000000000..34a5efb54
--- /dev/null
+++ b/my-assignement-section-05/classification_model/train_pipeline.py
@@ -0,0 +1,31 @@
+from config.core import config
+from pipeline import titanic_pipe
+from processing.data_manager import load_dataset, save_pipeline
+from sklearn.model_selection import train_test_split
+
+
+def run_training() -> None:
+    """Fit the pipeline on the training split of the configured dataset and persist it."""
+
+    # read training data
+    data = load_dataset(file_name=config.app_config.data_file)
+    predictors = data[config.model_config.features]
+    labels = data[config.model_config.target]
+
+    # divide train and test; only the training part is used here
+    X_train, _X_test, y_train, _y_test = train_test_split(
+        predictors,
+        labels,
+        test_size=config.model_config.test_size,
+        # fixed seed so the split is reproducible
+        random_state=config.model_config.random_state,
+    )
+
+    # fit model
+    titanic_pipe.fit(X_train, y_train)
+
+    # persist trained model
+    save_pipeline(pipeline_to_persist=titanic_pipe)
+
+
+if __name__ == "__main__":
+ run_training()
diff --git a/my-assignement-section-05/classification_model/trained_models/__init__.py b/my-assignement-section-05/classification_model/trained_models/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/my-assignement-section-05/mypy.ini b/my-assignement-section-05/mypy.ini
new file mode 100644
index 000000000..9f1b46b12
--- /dev/null
+++ b/my-assignement-section-05/mypy.ini
@@ -0,0 +1,14 @@
+[mypy]
+# warn_unreachable = True
+warn_unused_ignores = True
+follow_imports = skip
+show_error_context = True
+warn_incomplete_stub = True
+ignore_missing_imports = True
+check_untyped_defs = True
+cache_dir = /dev/null
+# Cannot enable this one as we still allow defining functions without any types.
+# disallow_untyped_defs = True
+warn_redundant_casts = True
+warn_unused_configs = True
+strict_optional = True
\ No newline at end of file
diff --git a/my-assignement-section-05/pyproject.toml b/my-assignement-section-05/pyproject.toml
new file mode 100644
index 000000000..31a46cadd
--- /dev/null
+++ b/my-assignement-section-05/pyproject.toml
@@ -0,0 +1,48 @@
+[build-system]
+requires = [
+ "setuptools>=42",
+ "wheel"
+]
+build-backend = "setuptools.build_meta"
+
+[tool.pytest.ini_options]
+minversion = "2.0"
+addopts = "-rfEX -p pytester --strict-markers"
+python_files = ["test_*.py", "*_test.py"]
+python_classes = ["Test", "Acceptance"]
+python_functions = ["test"]
+# NOTE: "doc" is not included here, but gets tested explicitly via "doctesting".
+testpaths = ["tests"]
+xfail_strict = true
+filterwarnings = [
+ "error",
+ "default:Using or importing the ABCs:DeprecationWarning:unittest2.*",
+ # produced by older pyparsing<=2.2.0.
+ "default:Using or importing the ABCs:DeprecationWarning:pyparsing.*",
+ "default:the imp module is deprecated in favour of importlib:DeprecationWarning:nose.*",
+ # distutils is deprecated in 3.10, scheduled for removal in 3.12
+ "ignore:The distutils package is deprecated:DeprecationWarning",
+ # produced by python3.6/site.py itself (3.6.7 on Travis, could not trigger it with 3.6.8)."
+ "ignore:.*U.*mode is deprecated:DeprecationWarning:(?!(pytest|_pytest))",
+ # produced by pytest-xdist
+ "ignore:.*type argument to addoption.*:DeprecationWarning",
+ # produced on execnet (pytest-xdist)
+ "ignore:.*inspect.getargspec.*deprecated, use inspect.signature.*:DeprecationWarning",
+ # pytest's own futurewarnings
+ "ignore::pytest.PytestExperimentalApiWarning",
+ # Do not cause SyntaxError for invalid escape sequences in py37.
+ # Those are caught/handled by pyupgrade, and not easy to filter with the
+ # module being the filename (with .py removed).
+ "default:invalid escape sequence:DeprecationWarning",
+ # ignore use of unregistered marks, because we use many to test the implementation
+ "ignore::_pytest.warning_types.PytestUnknownMarkWarning",
+]
+
+[tool.black]
+# Keep in sync with the interpreter used by tox (basepython) and CI.
+target-version = ['py39']
+
+[tool.isort]
+profile = "black"
+line_length = 100
+lines_between_sections = 1
+skip = "migrations"
diff --git a/my-assignement-section-05/requirements/requirements.txt b/my-assignement-section-05/requirements/requirements.txt
new file mode 100644
index 000000000..a0800e668
--- /dev/null
+++ b/my-assignement-section-05/requirements/requirements.txt
@@ -0,0 +1,12 @@
+# We use compatible release functionality (see PEP 440 here: https://www.python.org/dev/peps/pep-0440/#compatible-release)
+# to specify acceptable version ranges of our project dependencies. This gives us the flexibility to keep up with small
+# updates/fixes, whilst ensuring we don't install a major update which could introduce backwards incompatible changes.
+numpy>=1.21.0,<1.25.0
+pandas>=1.3.5,<2.0.0
+pydantic>=1.8.1,<2.0.0
+scikit-learn>=1.0.2,<1.1.0
+strictyaml>=1.3.2,<2.0.0
+ruamel.yaml>=0.16.12,<1.0.0
+feature-engine>=1.0.2,<1.6.0 # breaking change in v1.6.0
+joblib>=1.0.1,<2.0.0
+setuptools<60
\ No newline at end of file
diff --git a/my-assignement-section-05/requirements/test_requirements.txt b/my-assignement-section-05/requirements/test_requirements.txt
new file mode 100644
index 000000000..e69019391
--- /dev/null
+++ b/my-assignement-section-05/requirements/test_requirements.txt
@@ -0,0 +1,4 @@
+-r requirements.txt
+
+# testing requirements
+pytest>=7.2.0,<8.0.0
diff --git a/my-assignement-section-05/requirements/typing_requirements.txt b/my-assignement-section-05/requirements/typing_requirements.txt
new file mode 100644
index 000000000..667cc2e4d
--- /dev/null
+++ b/my-assignement-section-05/requirements/typing_requirements.txt
@@ -0,0 +1,5 @@
+# repo maintenance tooling
+black>=22.12.0,<23.0.0
+flake8>=6.0.0,<7.0.0
+mypy>=0.991,<1.0.0
+isort>=5.11.4,<6.0.0
\ No newline at end of file
diff --git a/my-assignement-section-05/setup.py b/my-assignement-section-05/setup.py
new file mode 100644
index 000000000..20329cd0d
--- /dev/null
+++ b/my-assignement-section-05/setup.py
@@ -0,0 +1,63 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+from pathlib import Path
+
+from setuptools import find_packages, setup
+
+# Package meta-data.
+NAME = 'titanic-classification-model'
+DESCRIPTION = "Example classification model package from Train In Data."
+URL = "https://github.com/trainindata/testing-and-monitoring-ml-deployments"
+EMAIL = "cesa@circletouch.eu"
+AUTHOR = "Alessandro Cesa"
+# Must be a valid PEP 440 specifier set: a bare "=" is not an operator, and an
+# exact pin would reject the other 3.9.x interpreters used by tox and CI.
+REQUIRES_PYTHON = ">=3.9.0"
+
+
+# The rest you shouldn't have to touch too much :)
+# ------------------------------------------------
+# Except, perhaps the License and Trove Classifiers!
+# If you do change the License, remember to change the
+# Trove Classifier for that!
+long_description = DESCRIPTION
+
+# Load the package's VERSION file as a dictionary.
+about = {}
+ROOT_DIR = Path(__file__).resolve().parent
+REQUIREMENTS_DIR = ROOT_DIR / 'requirements'
+PACKAGE_DIR = ROOT_DIR / 'classification_model'
+with open(PACKAGE_DIR / "VERSION") as f:
+ _version = f.read().strip()
+ about["__version__"] = _version
+
+
+# What packages are required for this module to be executed?
+def list_reqs(fname="requirements.txt"):
+    """Return the lines of ``fname`` from the requirements directory as a list."""
+    requirements_file = REQUIREMENTS_DIR / fname
+    return requirements_file.read_text().splitlines()
+
+# Where the magic happens:
+setup(
+ name=NAME,
+ version=about["__version__"],
+ description=DESCRIPTION,
+ long_description=long_description,
+ long_description_content_type="text/markdown",
+ author=AUTHOR,
+ author_email=EMAIL,
+ python_requires=REQUIRES_PYTHON,
+ url=URL,
+ packages=find_packages(exclude=("tests",)),
+ package_data={"classification_model": ["VERSION"]},
+ install_requires=list_reqs(),
+ extras_require={},
+ include_package_data=True,
+ # NOTE(review): the license given here ("BSD-3") disagrees with the MIT
+ # trove classifier below - confirm which one applies and align them.
+ license="BSD-3",
+ classifiers=[
+ # Trove classifiers
+ # Full list: https://pypi.python.org/pypi?%3Aaction=list_classifiers
+ "License :: OSI Approved :: MIT License",
+ "Programming Language :: Python :: 3.9",
+ "Programming Language :: Python :: Implementation :: CPython",
+ ],
+)
\ No newline at end of file
diff --git a/my-assignement-section-05/tests/__init__.py b/my-assignement-section-05/tests/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/my-assignement-section-05/tests/conftest.py b/my-assignement-section-05/tests/conftest.py
new file mode 100644
index 000000000..8b09b0996
--- /dev/null
+++ b/my-assignement-section-05/tests/conftest.py
@@ -0,0 +1,9 @@
+import pytest
+
+from classification_model.config.core import config
+from classification_model.processing.data_manager import load_dataset
+
+
+@pytest.fixture()
+def sample_input_data():
+    """Provide the configured dataset as returned by ``load_dataset``."""
+    dataset = load_dataset(file_name=config.app_config.data_file)
+    return dataset
diff --git a/my-assignement-section-05/tests/test_features.py b/my-assignement-section-05/tests/test_features.py
new file mode 100644
index 000000000..051980ea7
--- /dev/null
+++ b/my-assignement-section-05/tests/test_features.py
@@ -0,0 +1,14 @@
+from classification_model.config.core import config
+from classification_model.processing.features import ExtractLetterTransformer
+
+
+def test_name_transformer(sample_input_data):
+    """ExtractLetterTransformer cuts the cabin value down to its first character."""
+    # Given
+    cabin_transformer = ExtractLetterTransformer(variables=config.model_config.cabin)
+    assert sample_input_data["cabin"].iat[0] == "B5"
+
+    # When
+    transformed = cabin_transformer.fit_transform(sample_input_data)
+
+    # Then
+    assert transformed["cabin"].iat[0] == "B"
diff --git a/my-assignement-section-05/tests/test_prediction.py b/my-assignement-section-05/tests/test_prediction.py
new file mode 100644
index 000000000..3950222d2
--- /dev/null
+++ b/my-assignement-section-05/tests/test_prediction.py
@@ -0,0 +1,29 @@
+import numpy as np
+from sklearn.metrics import accuracy_score
+from sklearn.model_selection import train_test_split
+
+from classification_model.config.core import config
+from classification_model.predict import make_prediction
+
+
+def test_make_prediction(sample_input_data):
+    """Predictions on the held-out split have the expected type, count and accuracy."""
+    # Given: the same split parameters as used for training
+    expected_no_predictions = 131
+    predictors = sample_input_data[config.model_config.features]
+    labels = sample_input_data[config.model_config.target]
+    _X_train, X_test, _y_train, y_test = train_test_split(
+        predictors,
+        labels,
+        test_size=config.model_config.test_size,
+        # fixed seed so the split is reproducible
+        random_state=config.model_config.random_state,
+    )
+
+    # When
+    result = make_prediction(input_data=X_test)
+
+    # Then
+    predictions = result.get("predictions")
+    assert isinstance(predictions, np.ndarray)
+    assert isinstance(predictions[0], np.int64)
+    assert result.get("errors") is None
+    assert len(predictions) == expected_no_predictions
+    accuracy = accuracy_score(y_test, list(predictions))
+    assert accuracy > 0.7
diff --git a/my-assignement-section-05/tox.ini b/my-assignement-section-05/tox.ini
new file mode 100644
index 000000000..2581112c2
--- /dev/null
+++ b/my-assignement-section-05/tox.ini
@@ -0,0 +1,56 @@
+# Tox is a generic virtualenv management and test command line tool. Its goal is to
+# standardize testing in Python. We will be using it extensively in this course.
+
+# Using Tox we can (on multiple operating systems):
+# + Eliminate PYTHONPATH challenges when running scripts/tests
+# + Eliminate virtualenv setup confusion
+# + Streamline steps such as model training, model publishing
+
+
+[tox]
+min_version = 4
+envlist = test_package, checks
+skipsdist = True
+
+[testenv]
+basepython = python3.9
+install_command = pip install {opts} {packages}
+allowlist_externals = train
+
+setenv =
+ PYTHONPATH=.
+ PYTHONHASHSEED=0
+
+[testenv:test_package]
+envdir = {toxworkdir}/test_package
+deps =
+ -r{toxinidir}/requirements/test_requirements.txt
+commands=
+ python classification_model/train_pipeline.py
+ pytest \
+ -s \
+ -vv \
+ {posargs:tests/}
+
+[testenv:train]
+envdir = {toxworkdir}/test_package
+deps =
+ {[testenv:test_package]deps}
+commands=
+ python classification_model/train_pipeline.py
+
+
+[testenv:checks]
+envdir = {toxworkdir}/checks
+deps =
+ -r{toxinidir}/requirements/typing_requirements.txt
+commands =
+ flake8 classification_model tests
+ isort classification_model tests
+ black classification_model tests
+ {posargs:mypy classification_model}
+
+
+[flake8]
+exclude = .git,env
+max-line-length = 100
\ No newline at end of file
diff --git a/section-04-research-and-development/01-machine-learning-pipeline-data-analysis.ipynb b/section-04-research-and-development/01-machine-learning-pipeline-data-analysis.ipynb
index df3c3c9f1..26972b81c 100644
--- a/section-04-research-and-development/01-machine-learning-pipeline-data-analysis.ipynb
+++ b/section-04-research-and-development/01-machine-learning-pipeline-data-analysis.ipynb
@@ -72,9 +72,12 @@
},
{
"cell_type": "code",
- "execution_count": 1,
- "metadata": {},
- "outputs": [],
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2025-02-27T08:51:51.941397Z",
+ "start_time": "2025-02-27T08:51:51.927942Z"
+ }
+ },
"source": [
"# to handle datasets\n",
"import pandas as pd\n",
@@ -89,12 +92,28 @@
"\n",
"# to display all the columns of the dataframe in the notebook\n",
"pd.pandas.set_option('display.max_columns', None)"
- ]
+ ],
+ "outputs": [],
+ "execution_count": 1
},
{
"cell_type": "code",
- "execution_count": 2,
- "metadata": {},
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2025-02-27T08:51:52.080786Z",
+ "start_time": "2025-02-27T08:51:51.965777Z"
+ }
+ },
+ "source": [
+ "# load dataset\n",
+ "data = pd.read_csv('train.csv')\n",
+ "\n",
+ "# rows and columns of the data\n",
+ "print(data.shape)\n",
+ "\n",
+ "# visualise the dataset\n",
+ "data.head()"
+ ],
"outputs": [
{
"name": "stdout",
@@ -105,6 +124,98 @@
},
{
"data": {
+ "text/plain": [
+ " Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape \\\n",
+ "0 1 60 RL 65.0 8450 Pave NaN Reg \n",
+ "1 2 20 RL 80.0 9600 Pave NaN Reg \n",
+ "2 3 60 RL 68.0 11250 Pave NaN IR1 \n",
+ "3 4 70 RL 60.0 9550 Pave NaN IR1 \n",
+ "4 5 60 RL 84.0 14260 Pave NaN IR1 \n",
+ "\n",
+ " LandContour Utilities LotConfig LandSlope Neighborhood Condition1 \\\n",
+ "0 Lvl AllPub Inside Gtl CollgCr Norm \n",
+ "1 Lvl AllPub FR2 Gtl Veenker Feedr \n",
+ "2 Lvl AllPub Inside Gtl CollgCr Norm \n",
+ "3 Lvl AllPub Corner Gtl Crawfor Norm \n",
+ "4 Lvl AllPub FR2 Gtl NoRidge Norm \n",
+ "\n",
+ " Condition2 BldgType HouseStyle OverallQual OverallCond YearBuilt \\\n",
+ "0 Norm 1Fam 2Story 7 5 2003 \n",
+ "1 Norm 1Fam 1Story 6 8 1976 \n",
+ "2 Norm 1Fam 2Story 7 5 2001 \n",
+ "3 Norm 1Fam 2Story 7 5 1915 \n",
+ "4 Norm 1Fam 2Story 8 5 2000 \n",
+ "\n",
+ " YearRemodAdd RoofStyle RoofMatl Exterior1st Exterior2nd MasVnrType \\\n",
+ "0 2003 Gable CompShg VinylSd VinylSd BrkFace \n",
+ "1 1976 Gable CompShg MetalSd MetalSd None \n",
+ "2 2002 Gable CompShg VinylSd VinylSd BrkFace \n",
+ "3 1970 Gable CompShg Wd Sdng Wd Shng None \n",
+ "4 2000 Gable CompShg VinylSd VinylSd BrkFace \n",
+ "\n",
+ " MasVnrArea ExterQual ExterCond Foundation BsmtQual BsmtCond BsmtExposure \\\n",
+ "0 196.0 Gd TA PConc Gd TA No \n",
+ "1 0.0 TA TA CBlock Gd TA Gd \n",
+ "2 162.0 Gd TA PConc Gd TA Mn \n",
+ "3 0.0 TA TA BrkTil TA Gd No \n",
+ "4 350.0 Gd TA PConc Gd TA Av \n",
+ "\n",
+ " BsmtFinType1 BsmtFinSF1 BsmtFinType2 BsmtFinSF2 BsmtUnfSF TotalBsmtSF \\\n",
+ "0 GLQ 706 Unf 0 150 856 \n",
+ "1 ALQ 978 Unf 0 284 1262 \n",
+ "2 GLQ 486 Unf 0 434 920 \n",
+ "3 ALQ 216 Unf 0 540 756 \n",
+ "4 GLQ 655 Unf 0 490 1145 \n",
+ "\n",
+ " Heating HeatingQC CentralAir Electrical 1stFlrSF 2ndFlrSF LowQualFinSF \\\n",
+ "0 GasA Ex Y SBrkr 856 854 0 \n",
+ "1 GasA Ex Y SBrkr 1262 0 0 \n",
+ "2 GasA Ex Y SBrkr 920 866 0 \n",
+ "3 GasA Gd Y SBrkr 961 756 0 \n",
+ "4 GasA Ex Y SBrkr 1145 1053 0 \n",
+ "\n",
+ " GrLivArea BsmtFullBath BsmtHalfBath FullBath HalfBath BedroomAbvGr \\\n",
+ "0 1710 1 0 2 1 3 \n",
+ "1 1262 0 1 2 0 3 \n",
+ "2 1786 1 0 2 1 3 \n",
+ "3 1717 1 0 1 0 3 \n",
+ "4 2198 1 0 2 1 4 \n",
+ "\n",
+ " KitchenAbvGr KitchenQual TotRmsAbvGrd Functional Fireplaces FireplaceQu \\\n",
+ "0 1 Gd 8 Typ 0 NaN \n",
+ "1 1 TA 6 Typ 1 TA \n",
+ "2 1 Gd 6 Typ 1 TA \n",
+ "3 1 Gd 7 Typ 1 Gd \n",
+ "4 1 Gd 9 Typ 1 TA \n",
+ "\n",
+ " GarageType GarageYrBlt GarageFinish GarageCars GarageArea GarageQual \\\n",
+ "0 Attchd 2003.0 RFn 2 548 TA \n",
+ "1 Attchd 1976.0 RFn 2 460 TA \n",
+ "2 Attchd 2001.0 RFn 2 608 TA \n",
+ "3 Detchd 1998.0 Unf 3 642 TA \n",
+ "4 Attchd 2000.0 RFn 3 836 TA \n",
+ "\n",
+ " GarageCond PavedDrive WoodDeckSF OpenPorchSF EnclosedPorch 3SsnPorch \\\n",
+ "0 TA Y 0 61 0 0 \n",
+ "1 TA Y 298 0 0 0 \n",
+ "2 TA Y 0 42 0 0 \n",
+ "3 TA Y 0 35 272 0 \n",
+ "4 TA Y 192 84 0 0 \n",
+ "\n",
+ " ScreenPorch PoolArea PoolQC Fence MiscFeature MiscVal MoSold YrSold \\\n",
+ "0 0 0 NaN NaN NaN 0 2 2008 \n",
+ "1 0 0 NaN NaN NaN 0 5 2007 \n",
+ "2 0 0 NaN NaN NaN 0 9 2008 \n",
+ "3 0 0 NaN NaN NaN 0 2 2006 \n",
+ "4 0 0 NaN NaN NaN 0 12 2008 \n",
+ "\n",
+ " SaleType SaleCondition SalePrice \n",
+ "0 WD Normal 208500 \n",
+ "1 WD Normal 181500 \n",
+ "2 WD Normal 223500 \n",
+ "3 WD Abnorml 140000 \n",
+ "4 WD Normal 250000 "
+ ],
"text/html": [
"
\n",
"\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Id | \n",
+ " MSSubClass | \n",
+ " MSZoning | \n",
+ " LotFrontage | \n",
+ " LotArea | \n",
+ " Street | \n",
+ " Alley | \n",
+ " LotShape | \n",
+ " LandContour | \n",
+ " Utilities | \n",
+ " ... | \n",
+ " ScreenPorch | \n",
+ " PoolArea | \n",
+ " PoolQC | \n",
+ " Fence | \n",
+ " MiscFeature | \n",
+ " MiscVal | \n",
+ " MoSold | \n",
+ " YrSold | \n",
+ " SaleType | \n",
+ " SaleCondition | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 1461 | \n",
+ " 20 | \n",
+ " RH | \n",
+ " 80.0 | \n",
+ " 11622 | \n",
+ " Pave | \n",
+ " NaN | \n",
+ " Reg | \n",
+ " Lvl | \n",
+ " AllPub | \n",
+ " ... | \n",
+ " 120 | \n",
+ " 0 | \n",
+ " NaN | \n",
+ " MnPrv | \n",
+ " NaN | \n",
+ " 0 | \n",
+ " 6 | \n",
+ " 2010 | \n",
+ " WD | \n",
+ " Normal | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 1462 | \n",
+ " 20 | \n",
+ " RL | \n",
+ " 81.0 | \n",
+ " 14267 | \n",
+ " Pave | \n",
+ " NaN | \n",
+ " IR1 | \n",
+ " Lvl | \n",
+ " AllPub | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " Gar2 | \n",
+ " 12500 | \n",
+ " 6 | \n",
+ " 2010 | \n",
+ " WD | \n",
+ " Normal | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 1463 | \n",
+ " 60 | \n",
+ " RL | \n",
+ " 74.0 | \n",
+ " 13830 | \n",
+ " Pave | \n",
+ " NaN | \n",
+ " IR1 | \n",
+ " Lvl | \n",
+ " AllPub | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " NaN | \n",
+ " MnPrv | \n",
+ " NaN | \n",
+ " 0 | \n",
+ " 3 | \n",
+ " 2010 | \n",
+ " WD | \n",
+ " Normal | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 1464 | \n",
+ " 60 | \n",
+ " RL | \n",
+ " 78.0 | \n",
+ " 9978 | \n",
+ " Pave | \n",
+ " NaN | \n",
+ " IR1 | \n",
+ " Lvl | \n",
+ " AllPub | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 0 | \n",
+ " 6 | \n",
+ " 2010 | \n",
+ " WD | \n",
+ " Normal | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 1465 | \n",
+ " 120 | \n",
+ " RL | \n",
+ " 43.0 | \n",
+ " 5005 | \n",
+ " Pave | \n",
+ " NaN | \n",
+ " IR1 | \n",
+ " HLS | \n",
+ " AllPub | \n",
+ " ... | \n",
+ " 144 | \n",
+ " 0 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 2010 | \n",
+ " WD | \n",
+ " Normal | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
5 rows × 80 columns
\n",
+ "
"
+ ]
+ },
+ "execution_count": 4,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "execution_count": 4
+ },
+ {
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2025-02-28T09:34:37.804587Z",
+ "start_time": "2025-02-28T09:34:37.788758Z"
+ }
+ },
+ "cell_type": "code",
+ "source": [
+ "original_columns = ['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',\n",
+ " 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',\n",
+ " 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',\n",
+ " 'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',\n",
+ " 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',\n",
+ " 'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',\n",
+ " 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',\n",
+ " 'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',\n",
+ " 'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',\n",
+ " 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',\n",
+ " 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',\n",
+ " 'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',\n",
+ " 'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',\n",
+ " 'GarageCond', 'PavedDrive', 'WoodDeckSF', 'OpenPorchSF',\n",
+ " 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'PoolQC',\n",
+ " 'Fence', 'MiscFeature', 'MiscVal', 'MoSold', 'YrSold', 'SaleType',\n",
+ " 'SaleCondition']\n",
+ "\n",
+ "test_columns = test_set.columns\n",
+ "\n",
+ "for column in original_columns:\n",
+ " if column not in test_columns:\n",
+ " print(f\" {column} is in original but not in test \")\n",
+ " \n",
+ "for column in test_columns:\n",
+ " if column not in original_columns:\n",
+ " print(f\" {column} is in test but not in original\")\n",
+ "\n",
+ " "
+ ],
+ "outputs": [],
+ "execution_count": 5
+ },
+ {
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2025-02-28T09:34:37.947902Z",
+ "start_time": "2025-02-28T09:34:37.916150Z"
+ }
+ },
+ "cell_type": "code",
+ "source": [
+ "selected_features = pd.read_csv('selected_features.csv')\n",
+ "selected_features = selected_features.iloc[:,0].to_list()\n",
+ "test_set.drop(['Id'],axis=1, inplace=True)\n",
+ "cat_vars = [var for var in test_set.columns if test_set[var].dtype == 'O']\n",
+ "cat_vars = cat_vars + ['MSSubClass']\n",
+ "num_vars = [\n",
+ " var for var in test_set.columns if var not in cat_vars and var != 'SalePrice'\n",
+ "]\n",
+ "test_set[cat_vars] = test_set[cat_vars].astype('O')"
+ ],
+ "outputs": [],
+ "execution_count": 6
+ },
+ {
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2025-02-28T09:34:38.168587Z",
+ "start_time": "2025-02-28T09:34:38.147555Z"
+ }
+ },
+ "cell_type": "code",
+ "source": "test_set['MSSubClass'].dtype\n",
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "dtype('O')"
+ ]
+ },
+ "execution_count": 7,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "execution_count": 7
+ },
+ {
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2025-02-28T09:34:38.422377Z",
+ "start_time": "2025-02-28T09:34:38.406632Z"
+ }
+ },
+ "cell_type": "code",
+ "source": "selected_features",
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "['MSSubClass',\n",
+ " 'MSZoning',\n",
+ " 'LotFrontage',\n",
+ " 'LotShape',\n",
+ " 'LandContour',\n",
+ " 'LotConfig',\n",
+ " 'Neighborhood',\n",
+ " 'OverallQual',\n",
+ " 'OverallCond',\n",
+ " 'YearRemodAdd',\n",
+ " 'RoofStyle',\n",
+ " 'Exterior1st',\n",
+ " 'ExterQual',\n",
+ " 'Foundation',\n",
+ " 'BsmtQual',\n",
+ " 'BsmtExposure',\n",
+ " 'BsmtFinType1',\n",
+ " 'HeatingQC',\n",
+ " 'CentralAir',\n",
+ " '1stFlrSF',\n",
+ " '2ndFlrSF',\n",
+ " 'GrLivArea',\n",
+ " 'BsmtFullBath',\n",
+ " 'HalfBath',\n",
+ " 'KitchenQual',\n",
+ " 'TotRmsAbvGrd',\n",
+ " 'Functional',\n",
+ " 'Fireplaces',\n",
+ " 'FireplaceQu',\n",
+ " 'GarageFinish',\n",
+ " 'GarageCars',\n",
+ " 'GarageArea',\n",
+ " 'PavedDrive',\n",
+ " 'WoodDeckSF',\n",
+ " 'ScreenPorch',\n",
+ " 'SaleCondition']"
+ ]
+ },
+ "execution_count": 8,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "execution_count": 8
+ },
+ {
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2025-02-28T09:34:38.616887Z",
+ "start_time": "2025-02-28T09:34:38.585226Z"
+ }
+ },
+ "cell_type": "code",
+ "source": "test_set.head()",
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ " MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape LandContour \\\n",
+ "0 20 RH 80.0 11622 Pave NaN Reg Lvl \n",
+ "1 20 RL 81.0 14267 Pave NaN IR1 Lvl \n",
+ "2 60 RL 74.0 13830 Pave NaN IR1 Lvl \n",
+ "3 60 RL 78.0 9978 Pave NaN IR1 Lvl \n",
+ "4 120 RL 43.0 5005 Pave NaN IR1 HLS \n",
+ "\n",
+ " Utilities LotConfig ... ScreenPorch PoolArea PoolQC Fence MiscFeature \\\n",
+ "0 AllPub Inside ... 120 0 NaN MnPrv NaN \n",
+ "1 AllPub Corner ... 0 0 NaN NaN Gar2 \n",
+ "2 AllPub Inside ... 0 0 NaN MnPrv NaN \n",
+ "3 AllPub Inside ... 0 0 NaN NaN NaN \n",
+ "4 AllPub Inside ... 144 0 NaN NaN NaN \n",
+ "\n",
+ " MiscVal MoSold YrSold SaleType SaleCondition \n",
+ "0 0 6 2010 WD Normal \n",
+ "1 12500 6 2010 WD Normal \n",
+ "2 0 3 2010 WD Normal \n",
+ "3 0 6 2010 WD Normal \n",
+ "4 0 1 2010 WD Normal \n",
+ "\n",
+ "[5 rows x 79 columns]"
+ ],
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " MSSubClass | \n",
+ " MSZoning | \n",
+ " LotFrontage | \n",
+ " LotArea | \n",
+ " Street | \n",
+ " Alley | \n",
+ " LotShape | \n",
+ " LandContour | \n",
+ " Utilities | \n",
+ " LotConfig | \n",
+ " ... | \n",
+ " ScreenPorch | \n",
+ " PoolArea | \n",
+ " PoolQC | \n",
+ " Fence | \n",
+ " MiscFeature | \n",
+ " MiscVal | \n",
+ " MoSold | \n",
+ " YrSold | \n",
+ " SaleType | \n",
+ " SaleCondition | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 20 | \n",
+ " RH | \n",
+ " 80.0 | \n",
+ " 11622 | \n",
+ " Pave | \n",
+ " NaN | \n",
+ " Reg | \n",
+ " Lvl | \n",
+ " AllPub | \n",
+ " Inside | \n",
+ " ... | \n",
+ " 120 | \n",
+ " 0 | \n",
+ " NaN | \n",
+ " MnPrv | \n",
+ " NaN | \n",
+ " 0 | \n",
+ " 6 | \n",
+ " 2010 | \n",
+ " WD | \n",
+ " Normal | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 20 | \n",
+ " RL | \n",
+ " 81.0 | \n",
+ " 14267 | \n",
+ " Pave | \n",
+ " NaN | \n",
+ " IR1 | \n",
+ " Lvl | \n",
+ " AllPub | \n",
+ " Corner | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " Gar2 | \n",
+ " 12500 | \n",
+ " 6 | \n",
+ " 2010 | \n",
+ " WD | \n",
+ " Normal | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 60 | \n",
+ " RL | \n",
+ " 74.0 | \n",
+ " 13830 | \n",
+ " Pave | \n",
+ " NaN | \n",
+ " IR1 | \n",
+ " Lvl | \n",
+ " AllPub | \n",
+ " Inside | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " NaN | \n",
+ " MnPrv | \n",
+ " NaN | \n",
+ " 0 | \n",
+ " 3 | \n",
+ " 2010 | \n",
+ " WD | \n",
+ " Normal | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 60 | \n",
+ " RL | \n",
+ " 78.0 | \n",
+ " 9978 | \n",
+ " Pave | \n",
+ " NaN | \n",
+ " IR1 | \n",
+ " Lvl | \n",
+ " AllPub | \n",
+ " Inside | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 0 | \n",
+ " 6 | \n",
+ " 2010 | \n",
+ " WD | \n",
+ " Normal | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 120 | \n",
+ " RL | \n",
+ " 43.0 | \n",
+ " 5005 | \n",
+ " Pave | \n",
+ " NaN | \n",
+ " IR1 | \n",
+ " HLS | \n",
+ " AllPub | \n",
+ " Inside | \n",
+ " ... | \n",
+ " 144 | \n",
+ " 0 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 2010 | \n",
+ " WD | \n",
+ " Normal | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
5 rows × 79 columns
\n",
+ "
"
+ ]
+ },
+ "execution_count": 9,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "execution_count": 9
+ },
+ {
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2025-02-28T09:34:38.784140Z",
+ "start_time": "2025-02-28T09:34:38.755700Z"
+ }
+ },
+ "cell_type": "code",
+ "source": "test_set.isna().sum()",
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "MSSubClass 0\n",
+ "MSZoning 4\n",
+ "LotFrontage 227\n",
+ "LotArea 0\n",
+ "Street 0\n",
+ " ... \n",
+ "MiscVal 0\n",
+ "MoSold 0\n",
+ "YrSold 0\n",
+ "SaleType 1\n",
+ "SaleCondition 0\n",
+ "Length: 79, dtype: int64"
+ ]
+ },
+ "execution_count": 10,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "execution_count": 10
+ },
+ {
+ "metadata": {},
+ "cell_type": "markdown",
+ "source": "Categorical Variables"
+ },
+ {
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2025-02-28T09:34:39.080868Z",
+ "start_time": "2025-02-28T09:34:39.043818Z"
+ }
+ },
+ "cell_type": "code",
+ "source": [
+ "cat_vars_with_na = [\n",
+ " var for var in cat_vars\n",
+ " if test_set[var].isnull().sum() > 0\n",
+ "]\n",
+ "\n",
+ "# print percentage of missing values per variable\n",
+ "test_set[cat_vars_with_na].isnull().mean().sort_values(ascending=False)\n"
+ ],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "PoolQC 0.997944\n",
+ "MiscFeature 0.965045\n",
+ "Alley 0.926662\n",
+ "Fence 0.801234\n",
+ "FireplaceQu 0.500343\n",
+ "GarageCond 0.053461\n",
+ "GarageQual 0.053461\n",
+ "GarageFinish 0.053461\n",
+ "GarageType 0.052090\n",
+ "BsmtCond 0.030843\n",
+ "BsmtQual 0.030158\n",
+ "BsmtExposure 0.030158\n",
+ "BsmtFinType1 0.028787\n",
+ "BsmtFinType2 0.028787\n",
+ "MasVnrType 0.010966\n",
+ "MSZoning 0.002742\n",
+ "Functional 0.001371\n",
+ "Utilities 0.001371\n",
+ "KitchenQual 0.000685\n",
+ "Exterior2nd 0.000685\n",
+ "Exterior1st 0.000685\n",
+ "SaleType 0.000685\n",
+ "dtype: float64"
+ ]
+ },
+ "execution_count": 11,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "execution_count": 11
+ },
+ {
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2025-02-28T09:34:39.700752Z",
+ "start_time": "2025-02-28T09:34:39.686933Z"
+ }
+ },
+ "cell_type": "code",
+ "source": [
+ "new_vars_with_nan = [var for var in cat_vars_with_na if var not in ['Alley',\n",
+ " 'MasVnrType',\n",
+ " 'BsmtQual',\n",
+ " 'BsmtCond',\n",
+ " 'BsmtExposure',\n",
+ " 'BsmtFinType1',\n",
+ " 'BsmtFinType2',\n",
+ " 'Electrical',\n",
+ " 'FireplaceQu',\n",
+ " 'GarageType',\n",
+ " 'GarageFinish',\n",
+ " 'GarageQual',\n",
+ " 'GarageCond',\n",
+ " 'PoolQC',\n",
+ " 'Fence',\n",
+ " 'MiscFeature']]\n",
+ "print(new_vars_with_nan)"
+ ],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "['MSZoning', 'Utilities', 'Exterior1st', 'Exterior2nd', 'KitchenQual', 'Functional', 'SaleType']\n"
+ ]
+ }
+ ],
+ "execution_count": 12
+ },
+ {
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2025-02-28T09:34:40.325490Z",
+ "start_time": "2025-02-28T09:34:40.303894Z"
+ }
+ },
+ "cell_type": "code",
+ "source": [
+ "new_with_string_missing = [var for var in new_vars_with_nan if test_set[var].isnull().mean()>0.1]\n",
+ "new_with_frequent_category = [var for var in new_vars_with_nan if test_set[var].isnull().mean()<=0.1]\n",
+ "old_with_string_missing = [var for var in cat_vars_with_na if var in ['Alley', 'FireplaceQu', 'PoolQC', 'Fence', 'MiscFeature']]\n",
+ "old_with_frequent_category = [var for var in cat_vars_with_na if var in ['MasVnrType',\n",
+ " 'BsmtQual',\n",
+ " 'BsmtCond',\n",
+ " 'BsmtExposure',\n",
+ " 'BsmtFinType1',\n",
+ " 'BsmtFinType2',\n",
+ " 'Electrical',\n",
+ " 'GarageType',\n",
+ " 'GarageFinish',\n",
+ " 'GarageQual',\n",
+ " 'GarageCond']]\n",
+ "with_string_missing = old_with_string_missing + new_with_string_missing\n",
+ "with_frequent_category = old_with_frequent_category + new_with_frequent_category\n",
+ "\n",
+ "print(with_string_missing)\n",
+ "print(with_frequent_category)"
+ ],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "['Alley', 'FireplaceQu', 'PoolQC', 'Fence', 'MiscFeature']\n",
+ "['MasVnrType', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'MSZoning', 'Utilities', 'Exterior1st', 'Exterior2nd', 'KitchenQual', 'Functional', 'SaleType']\n"
+ ]
+ }
+ ],
+ "execution_count": 13
+ },
+ {
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2025-02-28T09:34:40.647242Z",
+ "start_time": "2025-02-28T09:34:40.623237Z"
+ }
+ },
+ "cell_type": "code",
+ "source": [
+ "# per-column fill values, looked up by exact column name (a stray space in a key silently disables it)\n",
+ "most_frequent_category = {'MasVnrType': 'None', 'BsmtCond': 'TA', 'BsmtQual': 'TA', 'BsmtExposure': 'No', 'BsmtFinType1': 'Unf', 'BsmtFinType2': 'Unf', 'Electrical': 'SBrkr', 'GarageType': 'Attchd', 'GarageFinish': 'Unf', 'GarageQual': 'TA', 'GarageCond': 'TA'}"
+ ],
+ "outputs": [],
+ "execution_count": 14
+ },
+ {
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2025-02-28T09:34:40.809136Z",
+ "start_time": "2025-02-28T09:34:40.777251Z"
+ }
+ },
+ "cell_type": "code",
+ "source": [
+ "# label heavy-missingness categoricals explicitly; fill the rest with a frequent category\n",
+ "test_set[with_string_missing] = test_set[with_string_missing].fillna('Missing')\n",
+ "for var in with_frequent_category:\n",
+ "    fill_value = most_frequent_category.get(var, test_set[var].mode()[0])\n",
+ "    test_set[var] = test_set[var].fillna(fill_value)  # assign back: chained inplace fillna may hit a copy"
+ ],
+ "outputs": [],
+ "execution_count": 15
+ },
+ {
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2025-02-28T09:34:40.882047Z",
+ "start_time": "2025-02-28T09:34:40.856586Z"
+ }
+ },
+ "cell_type": "code",
+ "source": "test_set[cat_vars_with_na].isnull().sum()",
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "MSZoning 0\n",
+ "Alley 0\n",
+ "Utilities 0\n",
+ "Exterior1st 0\n",
+ "Exterior2nd 0\n",
+ "MasVnrType 0\n",
+ "BsmtQual 0\n",
+ "BsmtCond 0\n",
+ "BsmtExposure 0\n",
+ "BsmtFinType1 0\n",
+ "BsmtFinType2 0\n",
+ "KitchenQual 0\n",
+ "Functional 0\n",
+ "FireplaceQu 0\n",
+ "GarageType 0\n",
+ "GarageFinish 0\n",
+ "GarageQual 0\n",
+ "GarageCond 0\n",
+ "PoolQC 0\n",
+ "Fence 0\n",
+ "MiscFeature 0\n",
+ "SaleType 0\n",
+ "dtype: int64"
+ ]
+ },
+ "execution_count": 16,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "execution_count": 16
+ },
+ {
+ "metadata": {},
+ "cell_type": "markdown",
+ "source": "Numerical variables"
+ },
+ {
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2025-02-28T09:34:41.132263Z",
+ "start_time": "2025-02-28T09:34:41.095241Z"
+ }
+ },
+ "cell_type": "code",
+ "source": [
+ "num_vars_with_na = [\n",
+ " var for var in num_vars\n",
+ " if test_set[var].isnull().sum() > 0\n",
+ "]\n",
+ "test_set[num_vars_with_na].isnull().mean()"
+ ],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "LotFrontage 0.155586\n",
+ "MasVnrArea 0.010281\n",
+ "BsmtFinSF1 0.000685\n",
+ "BsmtFinSF2 0.000685\n",
+ "BsmtUnfSF 0.000685\n",
+ "TotalBsmtSF 0.000685\n",
+ "BsmtFullBath 0.001371\n",
+ "BsmtHalfBath 0.001371\n",
+ "GarageYrBlt 0.053461\n",
+ "GarageCars 0.000685\n",
+ "GarageArea 0.000685\n",
+ "dtype: float64"
+ ]
+ },
+ "execution_count": 17,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "execution_count": 17
+ },
+ {
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2025-02-28T09:34:41.311744Z",
+ "start_time": "2025-02-28T09:34:41.280353Z"
+ }
+ },
+ "cell_type": "code",
+ "source": "means = {'LotFrontage': 69.87974098057354, 'MasVnrArea':103.7974006116208,'GarageYrBlt':1978.2959677419356 }",
+ "outputs": [],
+ "execution_count": 18
+ },
+ {
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2025-02-28T09:34:41.421801Z",
+ "start_time": "2025-02-28T09:34:41.390407Z"
+ }
+ },
+ "cell_type": "code",
+ "source": [
+ "for var in num_vars_with_na:\n",
+ "    mean_val = means.get(var, test_set[var].mean())\n",
+ "    test_set[var + '_na'] = np.where(test_set[var].isnull(), 1, 0)  # missing indicator, taken before filling\n",
+ "    test_set[var] = test_set[var].fillna(mean_val)  # assign back: chained inplace fillna may hit a copy\n",
+ "\n",
+ "# check that we have no more missing values in the engineered variables\n",
+ "test_set[num_vars_with_na].isnull().sum()"
+ ],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "LotFrontage 0\n",
+ "MasVnrArea 0\n",
+ "BsmtFinSF1 0\n",
+ "BsmtFinSF2 0\n",
+ "BsmtUnfSF 0\n",
+ "TotalBsmtSF 0\n",
+ "BsmtFullBath 0\n",
+ "BsmtHalfBath 0\n",
+ "GarageYrBlt 0\n",
+ "GarageCars 0\n",
+ "GarageArea 0\n",
+ "dtype: int64"
+ ]
+ },
+ "execution_count": 19,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "execution_count": 19
+ },
+ {
+ "metadata": {},
+ "cell_type": "markdown",
+ "source": "Temporal variables"
+ },
+ {
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2025-02-28T09:34:41.612086Z",
+ "start_time": "2025-02-28T09:34:41.600130Z"
+ }
+ },
+ "cell_type": "code",
+ "source": [
+ "def elapsed_years(df, var):\n",
+ "    # replace df[var] with the number of years between that year and the sale year;\n",
+ "    # YrSold is read from df itself (not the global test_set) so any frame can be passed\n",
+ "    df[var] = df['YrSold'] - df[var]\n",
+ "    return df"
+ ],
+ "outputs": [],
+ "execution_count": 20
+ },
+ {
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2025-02-28T09:34:41.722730Z",
+ "start_time": "2025-02-28T09:34:41.691017Z"
+ }
+ },
+ "cell_type": "code",
+ "source": [
+ "for var in ['YearBuilt', 'YearRemodAdd', 'GarageYrBlt']:\n",
+ " test_set = elapsed_years(test_set, var)\n",
+ " \n",
+ "# now we drop YrSold\n",
+ "test_set.drop(['YrSold'], axis=1, inplace=True)\n"
+ ],
+ "outputs": [],
+ "execution_count": 21
+ },
+ {
+ "metadata": {},
+ "cell_type": "markdown",
+ "source": "Logarithmic Transformation"
+ },
+ {
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2025-02-28T09:34:41.801939Z",
+ "start_time": "2025-02-28T09:34:41.783926Z"
+ }
+ },
+ "cell_type": "code",
+ "source": [
+ "for var in [\"LotFrontage\", \"1stFlrSF\", \"GrLivArea\"]:\n",
+ " test_set[var] = np.log(test_set[var])"
+ ],
+ "outputs": [],
+ "execution_count": 22
+ },
+ {
+ "metadata": {},
+ "cell_type": "markdown",
+ "source": "Yeo-Johnson transformation"
+ },
+ {
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2025-02-28T09:34:41.926695Z",
+ "start_time": "2025-02-28T09:34:41.910836Z"
+ }
+ },
+ "cell_type": "code",
+ "source": [
+ "param = -12.55283001172003\n",
+ "test_set['LotArea'] = stats.yeojohnson(test_set['LotArea'], lmbda=param)\n"
+ ],
+ "outputs": [],
+ "execution_count": 23
+ },
+ {
+ "metadata": {},
+ "cell_type": "markdown",
+ "source": "Binarize skewed variables"
+ },
+ {
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2025-02-28T09:34:42.053448Z",
+ "start_time": "2025-02-28T09:34:42.035614Z"
+ }
+ },
+ "cell_type": "code",
+ "source": [
+ "skewed = [\n",
+ " 'BsmtFinSF2', 'LowQualFinSF', 'EnclosedPorch',\n",
+ " '3SsnPorch', 'ScreenPorch', 'MiscVal'\n",
+ "]\n",
+ "\n",
+ "for var in skewed:\n",
+ " # map the variable values into 0 and 1\n",
+ " test_set[var] = np.where(test_set[var]==0, 0, 1)\n"
+ ],
+ "outputs": [],
+ "execution_count": 24
+ },
+ {
+ "metadata": {},
+ "cell_type": "markdown",
+ "source": "Categorical mappings"
+ },
+ {
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2025-02-28T09:34:42.147008Z",
+ "start_time": "2025-02-28T09:34:42.118477Z"
+ }
+ },
+ "cell_type": "code",
+ "source": [
+ "qual_mappings = {'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5, 'Missing': 0, 'NA': 0}\n",
+ "\n",
+ "qual_vars = ['ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond',\n",
+ " 'HeatingQC', 'KitchenQual', 'FireplaceQu',\n",
+ " 'GarageQual', 'GarageCond',\n",
+ " ]\n",
+ "for var in qual_vars:\n",
+ " test_set[var] = test_set[var].map(qual_mappings)\n"
+ ],
+ "outputs": [],
+ "execution_count": 25
+ },
+ {
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2025-02-28T09:34:42.180209Z",
+ "start_time": "2025-02-28T09:34:42.164752Z"
+ }
+ },
+ "cell_type": "code",
+ "source": [
+ "exposure_mappings = {'No': 1, 'Mn': 2, 'Av': 3, 'Gd': 4}\n",
+ "\n",
+ "var = 'BsmtExposure'\n",
+ "test_set[var] = test_set[var].map(exposure_mappings)\n"
+ ],
+ "outputs": [],
+ "execution_count": 26
+ },
+ {
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2025-02-28T09:34:42.227803Z",
+ "start_time": "2025-02-28T09:34:42.211972Z"
+ }
+ },
+ "cell_type": "code",
+ "source": [
+ "finish_mappings = {'Missing': 0, 'NA': 0, 'Unf': 1, 'LwQ': 2, 'Rec': 3, 'BLQ': 4, 'ALQ': 5, 'GLQ': 6}\n",
+ "\n",
+ "finish_vars = ['BsmtFinType1', 'BsmtFinType2']\n",
+ "for var in finish_vars:\n",
+ " test_set[var] = test_set[var].map(finish_mappings)\n"
+ ],
+ "outputs": [],
+ "execution_count": 27
+ },
+ {
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2025-02-28T09:34:42.279548Z",
+ "start_time": "2025-02-28T09:34:42.263551Z"
+ }
+ },
+ "cell_type": "code",
+ "source": [
+ "garage_mappings = {'Missing': 0, 'NA': 0, 'Unf': 1, 'RFn': 2, 'Fin': 3}\n",
+ "\n",
+ "var = 'GarageFinish'\n",
+ "test_set[var] = test_set[var].map(garage_mappings)"
+ ],
+ "outputs": [],
+ "execution_count": 28
+ },
+ {
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2025-02-28T09:34:42.336038Z",
+ "start_time": "2025-02-28T09:34:42.311330Z"
+ }
+ },
+ "cell_type": "code",
+ "source": [
+ "fence_mappings = {'Missing': 0, 'NA': 0, 'MnWw': 1, 'GdWo': 2, 'MnPrv': 3, 'GdPrv': 4}\n",
+ "\n",
+ "var = 'Fence'\n",
+ "test_set[var] = test_set[var].map(fence_mappings)"
+ ],
+ "outputs": [],
+ "execution_count": 29
+ },
+ {
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2025-02-28T09:34:42.415518Z",
+ "start_time": "2025-02-28T09:34:42.367763Z"
+ }
+ },
+ "cell_type": "code",
+ "source": "[var for var in test_set.columns if test_set[var].isnull().sum() > 0]",
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "[]"
+ ]
+ },
+ "execution_count": 30,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "execution_count": 30
+ },
+ {
+ "metadata": {},
+ "cell_type": "markdown",
+ "source": "Removing rare labels"
+ },
+ {
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2025-02-28T09:34:42.478975Z",
+ "start_time": "2025-02-28T09:34:42.451553Z"
+ }
+ },
+ "cell_type": "code",
+ "source": [
+ "qual_vars = qual_vars + finish_vars + ['BsmtExposure', 'GarageFinish', 'Fence']\n",
+ "\n",
+ "# capture the remaining categorical variables\n",
+ "# (those that we did not re-map)\n",
+ "\n",
+ "cat_others = [\n",
+ " var for var in cat_vars if var not in qual_vars\n",
+ "]\n",
+ "\n",
+ "len(cat_others)\n"
+ ],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "30"
+ ]
+ },
+ "execution_count": 31,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "execution_count": 31
+ },
+ {
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2025-02-28T09:34:42.542368Z",
+ "start_time": "2025-02-28T09:34:42.526578Z"
+ }
+ },
+ "cell_type": "code",
+ "source": "print(cat_others)",
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'Foundation', 'Heating', 'CentralAir', 'Electrical', 'Functional', 'GarageType', 'PavedDrive', 'PoolQC', 'MiscFeature', 'SaleType', 'SaleCondition', 'MSSubClass']\n"
+ ]
+ }
+ ],
+ "execution_count": 32
+ },
+ {
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2025-02-28T09:34:42.652908Z",
+ "start_time": "2025-02-28T09:34:42.605618Z"
+ }
+ },
+ "cell_type": "code",
+ "source": "test_set.head()",
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ " MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape \\\n",
+ "0 20 RH 4.382027 0.079663 Pave Missing Reg \n",
+ "1 20 RL 4.394449 0.079663 Pave Missing IR1 \n",
+ "2 60 RL 4.304065 0.079663 Pave Missing IR1 \n",
+ "3 60 RL 4.356709 0.079663 Pave Missing IR1 \n",
+ "4 120 RL 3.761200 0.079663 Pave Missing IR1 \n",
+ "\n",
+ " LandContour Utilities LotConfig ... MasVnrArea_na BsmtFinSF1_na \\\n",
+ "0 Lvl AllPub Inside ... 0 0 \n",
+ "1 Lvl AllPub Corner ... 0 0 \n",
+ "2 Lvl AllPub Inside ... 0 0 \n",
+ "3 Lvl AllPub Inside ... 0 0 \n",
+ "4 HLS AllPub Inside ... 0 0 \n",
+ "\n",
+ " BsmtFinSF2_na BsmtUnfSF_na TotalBsmtSF_na BsmtFullBath_na BsmtHalfBath_na \\\n",
+ "0 0 0 0 0 0 \n",
+ "1 0 0 0 0 0 \n",
+ "2 0 0 0 0 0 \n",
+ "3 0 0 0 0 0 \n",
+ "4 0 0 0 0 0 \n",
+ "\n",
+ " GarageYrBlt_na GarageCars_na GarageArea_na \n",
+ "0 0 0 0 \n",
+ "1 0 0 0 \n",
+ "2 0 0 0 \n",
+ "3 0 0 0 \n",
+ "4 0 0 0 \n",
+ "\n",
+ "[5 rows x 89 columns]"
+ ],
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " MSSubClass | \n",
+ " MSZoning | \n",
+ " LotFrontage | \n",
+ " LotArea | \n",
+ " Street | \n",
+ " Alley | \n",
+ " LotShape | \n",
+ " LandContour | \n",
+ " Utilities | \n",
+ " LotConfig | \n",
+ " ... | \n",
+ " MasVnrArea_na | \n",
+ " BsmtFinSF1_na | \n",
+ " BsmtFinSF2_na | \n",
+ " BsmtUnfSF_na | \n",
+ " TotalBsmtSF_na | \n",
+ " BsmtFullBath_na | \n",
+ " BsmtHalfBath_na | \n",
+ " GarageYrBlt_na | \n",
+ " GarageCars_na | \n",
+ " GarageArea_na | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 20 | \n",
+ " RH | \n",
+ " 4.382027 | \n",
+ " 0.079663 | \n",
+ " Pave | \n",
+ " Missing | \n",
+ " Reg | \n",
+ " Lvl | \n",
+ " AllPub | \n",
+ " Inside | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 20 | \n",
+ " RL | \n",
+ " 4.394449 | \n",
+ " 0.079663 | \n",
+ " Pave | \n",
+ " Missing | \n",
+ " IR1 | \n",
+ " Lvl | \n",
+ " AllPub | \n",
+ " Corner | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 60 | \n",
+ " RL | \n",
+ " 4.304065 | \n",
+ " 0.079663 | \n",
+ " Pave | \n",
+ " Missing | \n",
+ " IR1 | \n",
+ " Lvl | \n",
+ " AllPub | \n",
+ " Inside | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 60 | \n",
+ " RL | \n",
+ " 4.356709 | \n",
+ " 0.079663 | \n",
+ " Pave | \n",
+ " Missing | \n",
+ " IR1 | \n",
+ " Lvl | \n",
+ " AllPub | \n",
+ " Inside | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 120 | \n",
+ " RL | \n",
+ " 3.761200 | \n",
+ " 0.079663 | \n",
+ " Pave | \n",
+ " Missing | \n",
+ " IR1 | \n",
+ " HLS | \n",
+ " AllPub | \n",
+ " Inside | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
5 rows × 89 columns
\n",
+ "
"
+ ]
+ },
+ "execution_count": 33,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "execution_count": 33
+ },
+ {
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2025-02-28T09:34:42.795123Z",
+ "start_time": "2025-02-28T09:34:42.779108Z"
+ }
+ },
+ "cell_type": "code",
+ "source": "type(test_set['MSSubClass'][0])",
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "int"
+ ]
+ },
+ "execution_count": 34,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "execution_count": 34
+ },
+ {
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2025-02-28T09:34:42.983422Z",
+ "start_time": "2025-02-28T09:34:42.954128Z"
+ }
+ },
+ "cell_type": "code",
+ "source": "# MSSubClass labels are ints, matching the int values held in test_set['MSSubClass'] and the int keys of encodings['MSSubClass'];\n# string labels such as '20' never match in isin() and would turn every row into 'Rare'.\nfrequent_labels = {'MSZoning': ['FV', 'RH', 'RL', 'RM'], 'Street': ['Pave'], 'Alley': ['Grvl', 'Missing', 'Pave'], 'LotShape': ['IR1', 'IR2', 'Reg'], 'LandContour': ['Bnk', 'HLS', 'Low', 'Lvl'], 'Utilities': ['AllPub'], 'LotConfig': ['Corner', 'CulDSac', 'FR2', 'Inside'], 'LandSlope': ['Gtl', 'Mod'], 'Neighborhood': ['Blmngtn', 'BrDale', 'BrkSide', 'ClearCr', 'CollgCr', 'Crawfor', 'Edwards', 'Gilbert', 'IDOTRR', 'MeadowV', 'Mitchel', 'NAmes', 'NWAmes', 'NoRidge', 'NridgHt', 'OldTown', 'SWISU', 'Sawyer', 'SawyerW', 'Somerst', 'StoneBr', 'Timber'], 'Condition1': ['Artery', 'Feedr', 'Norm', 'PosN', 'RRAn'], 'Condition2': ['Norm'], 'BldgType': ['1Fam', '2fmCon', 'Duplex', 'Twnhs', 'TwnhsE'], 'HouseStyle': ['1.5Fin', '1Story', '2Story', 'SFoyer', 'SLvl'], 'RoofStyle': ['Gable', 'Hip'], 'RoofMatl': ['CompShg'], 'Exterior1st': ['AsbShng', 'BrkFace', 'CemntBd', 'HdBoard', 'MetalSd', 'Plywood', 'Stucco', 'VinylSd', 'Wd Sdng', 'WdShing'], 'Exterior2nd': ['AsbShng', 'BrkFace', 'CmentBd', 'HdBoard', 'MetalSd', 'Plywood', 'Stucco', 'VinylSd', 'Wd Sdng', 'Wd Shng'], 'MasVnrType': ['BrkFace', 'None', 'Stone'], 'Foundation': ['BrkTil', 'CBlock', 'PConc', 'Slab'], 'Heating': ['GasA', 'GasW'], 'CentralAir': ['N', 'Y'], 'Electrical': ['FuseA', 'FuseF', 'SBrkr'], 'Functional': ['Min1', 'Min2', 'Mod', 'Typ'], 'GarageType': ['Attchd', 'Basment', 'BuiltIn', 'Detchd'], 'PavedDrive': ['N', 'P', 'Y'], 'PoolQC': ['Missing'], 'MiscFeature': ['Missing', 'Shed'], 'SaleType': ['COD', 'New', 'WD'], 'SaleCondition': ['Abnorml', 'Family', 'Normal', 'Partial'], 'MSSubClass': [20, 30, 50, 60, 70, 75, 80, 85, 90, 120, 160, 190]}",
+ "outputs": [],
+ "execution_count": 35
+ },
+ {
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2025-02-28T09:34:43.157454Z",
+ "start_time": "2025-02-28T09:34:43.125685Z"
+ }
+ },
+ "cell_type": "code",
+ "source": [
+ "for var in cat_others:\n",
+ "    # labels listed for this variable in frequent_labels are kept unchanged\n",
+ "    keep = frequent_labels[var]\n",
+ "\n",
+ "    # every other label is replaced by the string \"Rare\"\n",
+ "    is_frequent = test_set[var].isin(keep)\n",
+ "    test_set[var] = np.where(is_frequent, test_set[var], 'Rare')"
+ ],
+ "outputs": [],
+ "execution_count": 36
+ },
+ {
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2025-02-28T09:34:43.426357Z",
+ "start_time": "2025-02-28T09:34:43.405722Z"
+ }
+ },
+ "cell_type": "code",
+ "source": "test_set['MSSubClass'].dtype\n",
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "dtype('O')"
+ ]
+ },
+ "execution_count": 37,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "execution_count": 37
+ },
+ {
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2025-02-28T09:34:43.535647Z",
+ "start_time": "2025-02-28T09:34:43.504526Z"
+ }
+ },
+ "cell_type": "code",
+ "source": "test_set.head()",
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ " MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape \\\n",
+ "0 Rare RH 4.382027 0.079663 Pave Missing Reg \n",
+ "1 Rare RL 4.394449 0.079663 Pave Missing IR1 \n",
+ "2 Rare RL 4.304065 0.079663 Pave Missing IR1 \n",
+ "3 Rare RL 4.356709 0.079663 Pave Missing IR1 \n",
+ "4 Rare RL 3.761200 0.079663 Pave Missing IR1 \n",
+ "\n",
+ " LandContour Utilities LotConfig ... MasVnrArea_na BsmtFinSF1_na \\\n",
+ "0 Lvl AllPub Inside ... 0 0 \n",
+ "1 Lvl AllPub Corner ... 0 0 \n",
+ "2 Lvl AllPub Inside ... 0 0 \n",
+ "3 Lvl AllPub Inside ... 0 0 \n",
+ "4 HLS AllPub Inside ... 0 0 \n",
+ "\n",
+ " BsmtFinSF2_na BsmtUnfSF_na TotalBsmtSF_na BsmtFullBath_na BsmtHalfBath_na \\\n",
+ "0 0 0 0 0 0 \n",
+ "1 0 0 0 0 0 \n",
+ "2 0 0 0 0 0 \n",
+ "3 0 0 0 0 0 \n",
+ "4 0 0 0 0 0 \n",
+ "\n",
+ " GarageYrBlt_na GarageCars_na GarageArea_na \n",
+ "0 0 0 0 \n",
+ "1 0 0 0 \n",
+ "2 0 0 0 \n",
+ "3 0 0 0 \n",
+ "4 0 0 0 \n",
+ "\n",
+ "[5 rows x 89 columns]"
+ ],
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " MSSubClass | \n",
+ " MSZoning | \n",
+ " LotFrontage | \n",
+ " LotArea | \n",
+ " Street | \n",
+ " Alley | \n",
+ " LotShape | \n",
+ " LandContour | \n",
+ " Utilities | \n",
+ " LotConfig | \n",
+ " ... | \n",
+ " MasVnrArea_na | \n",
+ " BsmtFinSF1_na | \n",
+ " BsmtFinSF2_na | \n",
+ " BsmtUnfSF_na | \n",
+ " TotalBsmtSF_na | \n",
+ " BsmtFullBath_na | \n",
+ " BsmtHalfBath_na | \n",
+ " GarageYrBlt_na | \n",
+ " GarageCars_na | \n",
+ " GarageArea_na | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " Rare | \n",
+ " RH | \n",
+ " 4.382027 | \n",
+ " 0.079663 | \n",
+ " Pave | \n",
+ " Missing | \n",
+ " Reg | \n",
+ " Lvl | \n",
+ " AllPub | \n",
+ " Inside | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " Rare | \n",
+ " RL | \n",
+ " 4.394449 | \n",
+ " 0.079663 | \n",
+ " Pave | \n",
+ " Missing | \n",
+ " IR1 | \n",
+ " Lvl | \n",
+ " AllPub | \n",
+ " Corner | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " Rare | \n",
+ " RL | \n",
+ " 4.304065 | \n",
+ " 0.079663 | \n",
+ " Pave | \n",
+ " Missing | \n",
+ " IR1 | \n",
+ " Lvl | \n",
+ " AllPub | \n",
+ " Inside | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " Rare | \n",
+ " RL | \n",
+ " 4.356709 | \n",
+ " 0.079663 | \n",
+ " Pave | \n",
+ " Missing | \n",
+ " IR1 | \n",
+ " Lvl | \n",
+ " AllPub | \n",
+ " Inside | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " Rare | \n",
+ " RL | \n",
+ " 3.761200 | \n",
+ " 0.079663 | \n",
+ " Pave | \n",
+ " Missing | \n",
+ " IR1 | \n",
+ " HLS | \n",
+ " AllPub | \n",
+ " Inside | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
5 rows × 89 columns
\n",
+ "
"
+ ]
+ },
+ "execution_count": 38,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "execution_count": 38
+ },
+ {
+ "metadata": {},
+ "cell_type": "markdown",
+ "source": "Encoding of categorical variables"
+ },
+ {
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2025-02-28T09:34:43.709922Z",
+ "start_time": "2025-02-28T09:34:43.682445Z"
+ }
+ },
+ "cell_type": "code",
+ "source": "encodings = {'MSZoning': {'Rare': 0, 'RM': 1, 'RH': 2, 'RL': 3, 'FV': 4}, 'Street': {'Rare': 0, 'Pave': 1}, 'Alley': {'Grvl': 0, 'Pave': 1, 'Missing': 2}, 'LotShape': {'Reg': 0, 'IR1': 1, 'Rare': 2, 'IR2': 3}, 'LandContour': {'Bnk': 0, 'Lvl': 1, 'Low': 2, 'HLS': 3}, 'Utilities': {'Rare': 0, 'AllPub': 1}, 'LotConfig': {'Inside': 0, 'FR2': 1, 'Corner': 2, 'Rare': 3, 'CulDSac': 4}, 'LandSlope': {'Gtl': 0, 'Mod': 1, 'Rare': 2}, 'Neighborhood': {'IDOTRR': 0, 'MeadowV': 1, 'BrDale': 2, 'Edwards': 3, 'BrkSide': 4, 'OldTown': 5, 'Sawyer': 6, 'SWISU': 7, 'NAmes': 8, 'Mitchel': 9, 'SawyerW': 10, 'Rare': 11, 'NWAmes': 12, 'Gilbert': 13, 'Blmngtn': 14, 'CollgCr': 15, 'Crawfor': 16, 'ClearCr': 17, 'Somerst': 18, 'Timber': 19, 'StoneBr': 20, 'NridgHt': 21, 'NoRidge': 22}, 'Condition1': {'Artery': 0, 'Feedr': 1, 'Norm': 2, 'RRAn': 3, 'Rare': 4, 'PosN': 5}, 'Condition2': {'Rare': 0, 'Norm': 1}, 'BldgType': {'2fmCon': 0, 'Duplex': 1, 'Twnhs': 2, '1Fam': 3, 'TwnhsE': 4}, 'HouseStyle': {'SFoyer': 0, '1.5Fin': 1, 'Rare': 2, '1Story': 3, 'SLvl': 4, '2Story': 5}, 'RoofStyle': {'Gable': 0, 'Rare': 1, 'Hip': 2}, 'RoofMatl': {'CompShg': 0, 'Rare': 1}, 'Exterior1st': {'AsbShng': 0, 'Wd Sdng': 1, 'WdShing': 2, 'MetalSd': 3, 'Stucco': 4, 'Rare': 5, 'HdBoard': 6, 'Plywood': 7, 'BrkFace': 8, 'CemntBd': 9, 'VinylSd': 10}, 'Exterior2nd': {'AsbShng': 0, 'Wd Sdng': 1, 'MetalSd': 2, 'Wd Shng': 3, 'Stucco': 4, 'Rare': 5, 'HdBoard': 6, 'Plywood': 7, 'BrkFace': 8, 'CmentBd': 9, 'VinylSd': 10}, 'MasVnrType': {'Rare': 0, 'None': 1, 'BrkFace': 2, 'Stone': 3}, 'Foundation': {'Slab': 0, 'BrkTil': 1, 'CBlock': 2, 'Rare': 3, 'PConc': 4}, 'Heating': {'Rare': 0, 'GasW': 1, 'GasA': 2}, 'CentralAir': {'N': 0, 'Y': 1}, 'Electrical': {'Rare': 0, 'FuseF': 1, 'FuseA': 2, 'SBrkr': 3}, 'Functional': {'Rare': 0, 'Min2': 1, 'Mod': 2, 'Min1': 3, 'Typ': 4}, 'GarageType': {'Rare': 0, 'Detchd': 1, 'Basment': 2, 'Attchd': 3, 'BuiltIn': 4}, 'PavedDrive': {'N': 0, 'P': 1, 'Y': 2}, 'PoolQC': {'Missing': 0, 'Rare': 
1}, 'MiscFeature': {'Rare': 0, 'Shed': 1, 'Missing': 2}, 'SaleType': {'COD': 0, 'Rare': 1, 'WD': 2, 'New': 3}, 'SaleCondition': {'Rare': 0, 'Abnorml': 1, 'Family': 2, 'Normal': 3, 'Partial': 4}, 'MSSubClass': {30: 0, 'Rare': 1, 190: 2, 90: 3, 160: 4, 50: 5, 85: 6, 70: 7, 80: 8, 20: 9, 75: 10, 120: 11, 60: 12}}",
+ "outputs": [],
+ "execution_count": 39
+ },
+ {
+ "metadata": {},
+ "cell_type": "markdown",
+ "source": ""
+ },
+ {
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2025-02-28T09:34:43.852628Z",
+ "start_time": "2025-02-28T09:34:43.809211Z"
+ }
+ },
+ "cell_type": "code",
+ "source": [
+ "for var, labels in test_set[cat_others].items():\n",
+ "    test_set[var] = labels.map(encodings[var])  # labels missing from the mapping become NaN"
+ ],
+ "outputs": [],
+ "execution_count": 40
+ },
+ {
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2025-02-28T09:34:44.199531Z",
+ "start_time": "2025-02-28T09:34:44.167859Z"
+ }
+ },
+ "cell_type": "code",
+ "source": "[var for var in test_set.columns if test_set[var].isnull().sum() > 0]\n",
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "[]"
+ ]
+ },
+ "execution_count": 41,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "execution_count": 41
+ },
+ {
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2025-02-28T09:34:44.500292Z",
+ "start_time": "2025-02-28T09:34:44.338393Z"
+ }
+ },
+ "cell_type": "code",
+ "source": "scaler = joblib.load('minmax_scaler.joblib')",
+ "outputs": [],
+ "execution_count": 42
+ },
+ {
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2025-02-28T09:34:44.550680Z",
+ "start_time": "2025-02-28T09:34:44.536642Z"
+ }
+ },
+ "cell_type": "code",
+ "source": "test_set.shape",
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(1459, 89)"
+ ]
+ },
+ "execution_count": 43,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "execution_count": 43
+ },
+ {
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2025-02-28T09:34:44.625725Z",
+ "start_time": "2025-02-28T09:34:44.601228Z"
+ }
+ },
+ "cell_type": "code",
+ "source": [
+ "original_columns = ['MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street', 'Alley',\n",
+ " 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope',\n",
+ " 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle',\n",
+ " 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'RoofStyle',\n",
+ " 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'MasVnrArea',\n",
+ " 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond',\n",
+ " 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2',\n",
+ " 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating', 'HeatingQC',\n",
+ " 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',\n",
+ " 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',\n",
+ " 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd',\n",
+ " 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType', 'GarageYrBlt',\n",
+ " 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond',\n",
+ " 'PavedDrive', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch',\n",
+ " 'ScreenPorch', 'PoolArea', 'PoolQC', 'Fence', 'MiscFeature', 'MiscVal',\n",
+ " 'MoSold', 'SaleType', 'SaleCondition', 'LotFrontage_na',\n",
+ " 'MasVnrArea_na', 'GarageYrBlt_na']\n",
+ "test_columns = test_set.columns\n",
+ "\n",
+ "# names listed in original_columns that the test set lacks\n",
+ "in_original_not_in_test = [\n",
+ "    name for name in original_columns if name not in test_columns\n",
+ "]\n",
+ "\n",
+ "# names present in the test set but not listed in original_columns\n",
+ "in_test_not_in_original = [\n",
+ "    name for name in test_columns if name not in original_columns\n",
+ "]\n",
+ "\n",
+ "# show both lists so any mismatch is visible in the cell output\n",
+ "print(in_original_not_in_test)\n",
+ "print(in_test_not_in_original)\n"
+ ],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[]\n",
+ "['BsmtFinSF1_na', 'BsmtFinSF2_na', 'BsmtUnfSF_na', 'TotalBsmtSF_na', 'BsmtFullBath_na', 'BsmtHalfBath_na', 'GarageCars_na', 'GarageArea_na']\n"
+ ]
+ }
+ ],
+ "execution_count": 44
+ },
+ {
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2025-02-28T09:34:44.829451Z",
+ "start_time": "2025-02-28T09:34:44.744867Z"
+ }
+ },
+ "cell_type": "code",
+ "source": "test_set.head()",
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ " MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape \\\n",
+ "0 1 2 4.382027 0.079663 1 2 0 \n",
+ "1 1 3 4.394449 0.079663 1 2 1 \n",
+ "2 1 3 4.304065 0.079663 1 2 1 \n",
+ "3 1 3 4.356709 0.079663 1 2 1 \n",
+ "4 1 3 3.761200 0.079663 1 2 1 \n",
+ "\n",
+ " LandContour Utilities LotConfig ... MasVnrArea_na BsmtFinSF1_na \\\n",
+ "0 1 1 0 ... 0 0 \n",
+ "1 1 1 2 ... 0 0 \n",
+ "2 1 1 0 ... 0 0 \n",
+ "3 1 1 0 ... 0 0 \n",
+ "4 3 1 0 ... 0 0 \n",
+ "\n",
+ " BsmtFinSF2_na BsmtUnfSF_na TotalBsmtSF_na BsmtFullBath_na \\\n",
+ "0 0 0 0 0 \n",
+ "1 0 0 0 0 \n",
+ "2 0 0 0 0 \n",
+ "3 0 0 0 0 \n",
+ "4 0 0 0 0 \n",
+ "\n",
+ " BsmtHalfBath_na GarageYrBlt_na GarageCars_na GarageArea_na \n",
+ "0 0 0 0 0 \n",
+ "1 0 0 0 0 \n",
+ "2 0 0 0 0 \n",
+ "3 0 0 0 0 \n",
+ "4 0 0 0 0 \n",
+ "\n",
+ "[5 rows x 89 columns]"
+ ],
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " MSSubClass | \n",
+ " MSZoning | \n",
+ " LotFrontage | \n",
+ " LotArea | \n",
+ " Street | \n",
+ " Alley | \n",
+ " LotShape | \n",
+ " LandContour | \n",
+ " Utilities | \n",
+ " LotConfig | \n",
+ " ... | \n",
+ " MasVnrArea_na | \n",
+ " BsmtFinSF1_na | \n",
+ " BsmtFinSF2_na | \n",
+ " BsmtUnfSF_na | \n",
+ " TotalBsmtSF_na | \n",
+ " BsmtFullBath_na | \n",
+ " BsmtHalfBath_na | \n",
+ " GarageYrBlt_na | \n",
+ " GarageCars_na | \n",
+ " GarageArea_na | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 4.382027 | \n",
+ " 0.079663 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 3 | \n",
+ " 4.394449 | \n",
+ " 0.079663 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 3 | \n",
+ " 4.304065 | \n",
+ " 0.079663 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 1 | \n",
+ " 3 | \n",
+ " 4.356709 | \n",
+ " 0.079663 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 1 | \n",
+ " 3 | \n",
+ " 3.761200 | \n",
+ " 0.079663 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 3 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
5 rows × 89 columns
\n",
+ "
"
+ ]
+ },
+ "execution_count": 45,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "execution_count": 45
+ },
+ {
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2025-02-28T09:34:45.157808Z",
+ "start_time": "2025-02-28T09:34:45.110692Z"
+ }
+ },
+ "cell_type": "code",
+ "source": [
+ "kept = test_set.drop(in_test_not_in_original, axis=1)  # discard the columns not listed in original_columns\n",
+ "test_set = pd.DataFrame(scaler.transform(kept), columns=kept.columns).loc[:, selected_features]"
+ ],
+ "outputs": [],
+ "execution_count": 46
+ },
+ {
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2025-02-28T09:34:45.363330Z",
+ "start_time": "2025-02-28T09:34:45.284334Z"
+ }
+ },
+ "cell_type": "code",
+ "source": "test_set.head()",
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ " MSSubClass MSZoning LotFrontage LotShape LandContour LotConfig \\\n",
+ "0 0.083333 0.50 0.495064 0.000000 0.333333 0.0 \n",
+ "1 0.083333 0.75 0.499662 0.333333 0.333333 0.5 \n",
+ "2 0.083333 0.75 0.466207 0.333333 0.333333 0.0 \n",
+ "3 0.083333 0.75 0.485693 0.333333 0.333333 0.0 \n",
+ "4 0.083333 0.75 0.265271 0.333333 1.000000 0.0 \n",
+ "\n",
+ " Neighborhood OverallQual OverallCond YearRemodAdd ... Functional \\\n",
+ "0 0.363636 0.444444 0.625 0.819672 ... 1.0 \n",
+ "1 0.363636 0.555556 0.625 0.868852 ... 1.0 \n",
+ "2 0.590909 0.444444 0.500 0.213115 ... 1.0 \n",
+ "3 0.590909 0.555556 0.625 0.213115 ... 1.0 \n",
+ "4 0.909091 0.777778 0.500 0.311475 ... 1.0 \n",
+ "\n",
+ " Fireplaces FireplaceQu GarageFinish GarageCars GarageArea PavedDrive \\\n",
+ "0 0.000000 0.0 0.0 0.25 0.514810 1.0 \n",
+ "1 0.000000 0.0 0.0 0.25 0.220028 1.0 \n",
+ "2 0.333333 0.6 1.0 0.50 0.339915 1.0 \n",
+ "3 0.333333 0.8 1.0 0.50 0.331453 1.0 \n",
+ "4 0.000000 0.0 0.5 0.50 0.356841 1.0 \n",
+ "\n",
+ " WoodDeckSF ScreenPorch SaleCondition \n",
+ "0 0.163361 1.0 0.75 \n",
+ "1 0.458576 0.0 0.75 \n",
+ "2 0.247375 0.0 0.75 \n",
+ "3 0.420070 0.0 0.75 \n",
+ "4 0.000000 1.0 0.75 \n",
+ "\n",
+ "[5 rows x 36 columns]"
+ ],
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " MSSubClass | \n",
+ " MSZoning | \n",
+ " LotFrontage | \n",
+ " LotShape | \n",
+ " LandContour | \n",
+ " LotConfig | \n",
+ " Neighborhood | \n",
+ " OverallQual | \n",
+ " OverallCond | \n",
+ " YearRemodAdd | \n",
+ " ... | \n",
+ " Functional | \n",
+ " Fireplaces | \n",
+ " FireplaceQu | \n",
+ " GarageFinish | \n",
+ " GarageCars | \n",
+ " GarageArea | \n",
+ " PavedDrive | \n",
+ " WoodDeckSF | \n",
+ " ScreenPorch | \n",
+ " SaleCondition | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 0.083333 | \n",
+ " 0.50 | \n",
+ " 0.495064 | \n",
+ " 0.000000 | \n",
+ " 0.333333 | \n",
+ " 0.0 | \n",
+ " 0.363636 | \n",
+ " 0.444444 | \n",
+ " 0.625 | \n",
+ " 0.819672 | \n",
+ " ... | \n",
+ " 1.0 | \n",
+ " 0.000000 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.25 | \n",
+ " 0.514810 | \n",
+ " 1.0 | \n",
+ " 0.163361 | \n",
+ " 1.0 | \n",
+ " 0.75 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 0.083333 | \n",
+ " 0.75 | \n",
+ " 0.499662 | \n",
+ " 0.333333 | \n",
+ " 0.333333 | \n",
+ " 0.5 | \n",
+ " 0.363636 | \n",
+ " 0.555556 | \n",
+ " 0.625 | \n",
+ " 0.868852 | \n",
+ " ... | \n",
+ " 1.0 | \n",
+ " 0.000000 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.25 | \n",
+ " 0.220028 | \n",
+ " 1.0 | \n",
+ " 0.458576 | \n",
+ " 0.0 | \n",
+ " 0.75 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 0.083333 | \n",
+ " 0.75 | \n",
+ " 0.466207 | \n",
+ " 0.333333 | \n",
+ " 0.333333 | \n",
+ " 0.0 | \n",
+ " 0.590909 | \n",
+ " 0.444444 | \n",
+ " 0.500 | \n",
+ " 0.213115 | \n",
+ " ... | \n",
+ " 1.0 | \n",
+ " 0.333333 | \n",
+ " 0.6 | \n",
+ " 1.0 | \n",
+ " 0.50 | \n",
+ " 0.339915 | \n",
+ " 1.0 | \n",
+ " 0.247375 | \n",
+ " 0.0 | \n",
+ " 0.75 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 0.083333 | \n",
+ " 0.75 | \n",
+ " 0.485693 | \n",
+ " 0.333333 | \n",
+ " 0.333333 | \n",
+ " 0.0 | \n",
+ " 0.590909 | \n",
+ " 0.555556 | \n",
+ " 0.625 | \n",
+ " 0.213115 | \n",
+ " ... | \n",
+ " 1.0 | \n",
+ " 0.333333 | \n",
+ " 0.8 | \n",
+ " 1.0 | \n",
+ " 0.50 | \n",
+ " 0.331453 | \n",
+ " 1.0 | \n",
+ " 0.420070 | \n",
+ " 0.0 | \n",
+ " 0.75 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 0.083333 | \n",
+ " 0.75 | \n",
+ " 0.265271 | \n",
+ " 0.333333 | \n",
+ " 1.000000 | \n",
+ " 0.0 | \n",
+ " 0.909091 | \n",
+ " 0.777778 | \n",
+ " 0.500 | \n",
+ " 0.311475 | \n",
+ " ... | \n",
+ " 1.0 | \n",
+ " 0.000000 | \n",
+ " 0.0 | \n",
+ " 0.5 | \n",
+ " 0.50 | \n",
+ " 0.356841 | \n",
+ " 1.0 | \n",
+ " 0.000000 | \n",
+ " 1.0 | \n",
+ " 0.75 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
5 rows × 36 columns
\n",
+ "
"
+ ]
+ },
+ "execution_count": 47,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "execution_count": 47
+ },
+ {
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2025-02-28T09:34:45.971970Z",
+ "start_time": "2025-02-28T09:34:45.502223Z"
+ }
+ },
+ "cell_type": "code",
+ "source": "lin_model = joblib.load('linear_regression.joblib')",
+ "outputs": [],
+ "execution_count": 48
+ },
+ {
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2025-02-28T09:34:46.130650Z",
+ "start_time": "2025-02-28T09:34:46.114639Z"
+ }
+ },
+ "cell_type": "code",
+ "source": "predictions = lin_model.predict(test_set)",
+ "outputs": [],
+ "execution_count": 49
+ },
+ {
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2025-02-28T09:34:47.709356Z",
+ "start_time": "2025-02-28T09:34:46.360427Z"
+ }
+ },
+ "cell_type": "code",
+ "source": "pd.Series(np.exp(predictions)).hist(bins=50)",
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 50,
+ "metadata": {},
+ "output_type": "execute_result"
+ },
+ {
+ "data": {
+ "text/plain": [
+ ""
+ ],
+ "image/png": "iVBORw0KGgoAAAANSUhEUgAAAjAAAAGdCAYAAAAMm0nCAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuNCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8QVMy6AAAACXBIWXMAAA9hAAAPYQGoP6dpAAAk4UlEQVR4nO3dfXBU1f3H8c8m2WwSYBMeJA8aJFYUBZUKEqP2QVmIigrKqGjaoehIrcGK6WihP0FA2yC1SKEI2iqMMyJqW9ARjGSCQtUYIIKC0IgtFqc2oYrJApF1Sc7vD8sd10RJ4G52z/J+zTCy5569+937XZKP5+7d9RhjjAAAACySFOsCAAAAOosAAwAArEOAAQAA1iHAAAAA6xBgAACAdQgwAADAOgQYAABgHQIMAACwTkqsCzgWra2t+vjjj9WjRw95PJ5YlwMAADrAGKP9+/crLy9PSUnHt4ZiZYD5+OOPlZ+fH+syAADAMfjoo490yimnHNc+rAwwPXr0kPTlAfD7/TGuJj6Fw2GtXbtWo0aNktfrjXU5+B/6En/oSfyhJ/HHrZ4Eg0Hl5+c7v8ePh5UB5shpI7/fT4D5BuFwWBkZGfL7/fwAiCP0Jf7Qk/hDT+KP2z1x4+0fvIkXAABYhwADAACsQ4ABAADWIcAAAADrEGAAAIB1CDAAAMA6BBgAAGAdAgwAALAOAQYAAFiHAAMAAKxDgAEAANYhwAAAAOsQYAAAgHUIMAAAwDopsS4AJ67+U1cfdc6Hc0Z3QSUAANuwAgMAAKxDgAEAANYhwAAAAOsQYAAAgHUIMAAAwDoEGAAAYB0uo0ancfkzACDWWIEBAADWIcAAAADrEGAAAIB1CDAAAMA6BBgAAGAdAgwAALAOAQYAAFiHAAMAAKxDgAEAANYhwAAAAOsQYAAAgHUIMAAAwDoEGAAAYB0CDAAAsA4BBgAAWIcAAwAArEOAAQAA1iHAAAAA6xBgAACAdQgwAADAOgQYAABgnZRYF4DE1H/q6liXAABIYKzAAAAA6xBgAACAdTiFhAic+gEA2IAVGAAAYJ1OB5gNGzbo6quvVl5enjwej1atWhWx3RijGTNmKDc3V+np6QoEAtq1a1fEnH379qmkpER+v19ZWVm69dZbdeDAgeN6IgAA4MTR6QBz8OBBnXfeeVq0aFG72+fOnasFCxZoyZIlqqmpUbdu3VRcXKxDhw45c0pKSvTee++psrJSL730kjZs2KBJkyYd+7MAAAAnlE6/B+aKK67QFVdc0e42Y4zmz5+v++67T2PGjJEkPfXUU8rOztaqVas0fvx47dy5UxUVFdq0aZOGDRsmSVq4cKGuvPJKPfzww8rLyzuOpwMAAE4Err6Jd/fu3aqvr1cgEHDGMjMzVVhYqOrqao0fP17V1dXKyspywoskBQIBJSUlqaamRtdee22b/YZCIYVCIed2MBiUJIXDYYXDYTefQsI4clw6e3x8ySYa5RyzROvvsfYF0UNP4g89iT9u9cTNnroaYOrr6yVJ2dnZEePZ2dnOtvr6evXt2zeyiJQU9erVy5nzdeXl5Zo1a1ab8bVr1yojI8ON0hNWZWVlp+bPHR6lQo7RmjVrYl1CVHS2L4g+ehJ/6En8Od6eNDc3u1SJJZdRT5s2TWVlZc7tYDCo/Px8jRo1Sn6/P4aVxa9wOKzKykqNHDlSXq+3w/cbPPOVKFbVedtnFse6BFcda18QPfQk/tCT+ONWT46cQXGDqwEmJydHktTQ0KDc3FxnvKGhQUOGDHHm7N27N+J+hw8f1r59+5z7f53P55PP52sz7vV6eXEfRWePUajFE8VqOi9R+8trN/7Qk/hDT+LP8fbEzX66+jkwBQUFysnJUVVVlTMWDAZVU1OjoqIiSVJRUZEaGxtVW1vrzFm3bp1aW1tVWFjoZjkAACBBdXoF5sCBA/rggw+c27t379bWrVvVq1cv9evXT1OmTNGDDz6oAQMGqKCgQNOnT1de
Xp7Gjh0rSTrrrLN0+eWX67bbbtOSJUsUDoc1efJkjR8/niuQAABAh3Q6wGzevFmXXnqpc/vIe1MmTJigZcuW6d5779XBgwc1adIkNTY26pJLLlFFRYXS0tKc+zz99NOaPHmyRowYoaSkJI0bN04LFixw4ekAAIATQacDzA9/+EMZ882X2no8Hs2ePVuzZ8/+xjm9evXS8uXLO/vQAAAAkvguJAAAYCECDAAAsA4BBgAAWIcAAwAArEOAAQAA1iHAAAAA6xBgAACAdQgwAADAOgQYAABgHQIMAACwDgEGAABYhwADAACsQ4ABAADWIcAAAADrEGAAAIB1CDAAAMA6BBgAAGAdAgwAALAOAQYAAFiHAAMAAKxDgAEAANYhwAAAAOsQYAAAgHUIMAAAwDoEGAAAYB0CDAAAsA4BBgAAWIcAAwAArEOAAQAA1iHAAAAA6xBgAACAdQgwAADAOgQYAABgHQIMAACwDgEGAABYhwADAACsQ4ABAADWIcAAAADrEGAAAIB1CDAAAMA6BBgAAGAdAgwAALAOAQYAAFiHAAMAAKxDgAEAANYhwAAAAOsQYAAAgHUIMAAAwDoEGAAAYB0CDAAAsA4BBgAAWIcAAwAArON6gGlpadH06dNVUFCg9PR0fec739EDDzwgY4wzxxijGTNmKDc3V+np6QoEAtq1a5fbpQAAgATleoB56KGHtHjxYv3hD3/Qzp079dBDD2nu3LlauHChM2fu3LlasGCBlixZopqaGnXr1k3FxcU6dOiQ2+UAAIAElOL2Dt98802NGTNGo0ePliT1799fzzzzjDZu3Cjpy9WX+fPn67777tOYMWMkSU899ZSys7O1atUqjR8/3u2SAABAgnE9wFx00UV6/PHH9f777+uMM87QO++8o9dff13z5s2TJO3evVv19fUKBALOfTIzM1VYWKjq6up2A0woFFIoFHJuB4NBSVI4HFY4HHb7KSSEI8els8fHl2yOPqkLJVp/j7UviB56En/oSfxxqydu9tT1ADN16lQFg0ENHDhQycnJamlp0a9//WuVlJRIkurr6yVJ2dnZEffLzs52tn1deXm5Zs2a1WZ87dq1ysjIcPkZJJbKyspOzZ87PEqFHKM1a9bEuoSo6GxfEH30JP7Qk/hzvD1pbm52qZIoBJjnnntOTz/9tJYvX65BgwZp69atmjJlivLy8jRhwoRj2ue0adNUVlbm3A4Gg8rPz9eoUaPk9/vdKj2hhMNhVVZWauTIkfJ6vR2+3+CZr0Sxqs7bPrM41iW46lj7guihJ/GHnsQft3py5AyKG1wPMPfcc4+mTp3qnAo655xz9K9//Uvl5eWaMGGCcnJyJEkNDQ3Kzc117tfQ0KAhQ4a0u0+fzyefz9dm3Ov18uI+is4eo1CLJ4rVdF6i9pfXbvyhJ/GHnsSf4+2Jm/10PcA0NzcrKSny4qbk5GS1trZKkgoKCpSTk6OqqionsASDQdXU1OhnP/uZ2+UAkqT+U1cfdc6Hc0Z3QSUAADe4HmCuvvpq/frXv1a/fv00aNAgbdmyRfPmzdMtt9wiSfJ4PJoyZYoefPBBDRgwQAUFBZo+fbry8vI0duxYt8sBAAAJyPUAs3DhQk2fPl133HGH9u7dq7y8PP30pz/VjBkznDn33nuvDh48qEmTJqmxsVGXXHKJKioqlJaW5nY5AAAgAbkeYHr06KH58+dr/vz53zjH4/Fo9uzZmj17ttsPj2/RkdMoAADYgO9CAgAA1iHAAAAA67h+CglwE1cPAQDawwoMAACwDgEGAABYhwADAACsQ4ABAADWIcAAAADrEGAAAIB1CDAAAMA6BBgAAGAdAgwAALAOAQYAAFiHAAMAAKxDgAEAANbhyxyBTuDLJQEgPrACAwAArEOAAQAA1uEUEvA/HTk9BACID6zAAAAA6xBgAACAdTiFBOtx6gcATjyswAAAAOsQYAAAgHUIMAAAwDoEGAAAYB0CDAAAsA4BBgAAWIcAAwAArEOA
AQAA1iHAAAAA6xBgAACAdQgwAADAOgQYAABgHQIMAACwDgEGAABYhwADAACsQ4ABAADWIcAAAADrEGAAAIB1CDAAAMA6BBgAAGAdAgwAALAOAQYAAFiHAAMAAKxDgAEAANYhwAAAAOsQYAAAgHUIMAAAwDoEGAAAYB0CDAAAsA4BBgAAWCcqAebf//63fvSjH6l3795KT0/XOeeco82bNzvbjTGaMWOGcnNzlZ6erkAgoF27dkWjFAAAkIBcDzCfffaZLr74Ynm9Xr388svasWOHfve736lnz57OnLlz52rBggVasmSJampq1K1bNxUXF+vQoUNulwMAABJQits7fOihh5Sfn6+lS5c6YwUFBc7fjTGaP3++7rvvPo0ZM0aS9NRTTyk7O1urVq3S+PHj3S4JAAAkGNcDzIsvvqji4mJdf/31Wr9+vU4++WTdcccduu222yRJu3fvVn19vQKBgHOfzMxMFRYWqrq6ut0AEwqFFAqFnNvBYFCSFA6HFQ6H3X4KCeHIcfnq8fElm1iVc0L5ttdke31BbNGT+ENP4o9bPXGzpx5jjKu/1dLS0iRJZWVluv7667Vp0ybdddddWrJkiSZMmKA333xTF198sT7++GPl5uY697vhhhvk8Xj07LPPttnnzJkzNWvWrDbjy5cvV0ZGhpvlAwCAKGlubtbNN9+spqYm+f3+49qX6wEmNTVVw4YN05tvvumM/fznP9emTZtUXV19TAGmvRWY/Px8ffLJJ8d9ABLF4JmvRNz2JRk9MKxV0zcnKdTqiVFVJ6btM4u/cVs4HFZlZaVGjhwpr9fbhVXhm9CT+ENP4o9bPQkGg+rTp48rAcb1U0i5ubk6++yzI8bOOuss/eUvf5Ek5eTkSJIaGhoiAkxDQ4OGDBnS7j59Pp98Pl+bca/Xy4v7f0It7YeUUKvnG7chOjrymuS1G3/oSfyhJ/HneHviZj9dvwrp4osvVl1dXcTY+++/r1NPPVXSl2/ozcnJUVVVlbM9GAyqpqZGRUVFbpcDAAASkOsrMHfffbcuuugi/eY3v9ENN9ygjRs36vHHH9fjjz8uSfJ4PJoyZYoefPBBDRgwQAUFBZo+fbry8vI0duxYt8sBAAAJyPUAc8EFF2jlypWaNm2aZs+erYKCAs2fP18lJSXOnHvvvVcHDx7UpEmT1NjYqEsuuUQVFRXOG4ABAAC+jesBRpKuuuoqXXXVVd+43ePxaPbs2Zo9e3Y0Hh4AACQ4vgsJAABYhwADAACsQ4ABAADWIcAAAADrEGAAAIB1CDAAAMA6BBgAAGAdAgwAALAOAQYAAFiHAAMAAKxDgAEAANYhwAAAAOsQYAAAgHUIMAAAwDoEGAAAYB0CDAAAsA4BBgAAWIcAAwAArEOAAQAA1iHAAAAA6xBgAACAdQgwAADAOgQYAABgHQIMAACwDgEGAABYhwADAACsQ4ABAADWIcAAAADrEGAAAIB1CDAAAMA6BBgAAGAdAgwAALAOAQYAAFiHAAMAAKxDgAEAANYhwAAAAOsQYAAAgHUIMAAAwDoEGAAAYB0CDAAAsE5KrAsATkSDZ76iUIvnW+d8OGd0F1UDAPZhBQYAAFiHAAMAAKzDKSQgTvWfuvqoczjNBOBExQoMAACwDgEGAABYhwADAACsQ4ABAADWIcAAAADrEGAAAIB1CDAAAMA6UQ8wc+bMkcfj0ZQpU5yxQ4cOqbS0VL1791b37t01btw4NTQ0RLsUAACQIKL6QXabNm3SY489pnPPPTdi/O6779bq1av1/PPPKzMzU5MnT9Z1112nN954I5rlWKsjH2iG+PFt/fIlG80d3oXFAECCitoKzIEDB1RSUqI//vGP6tmzpzPe1NSkJ554QvPmzdNll12moUOHaunSpXrzzTf11ltvRascAACQQKIWYEpLSzV69GgFAoGI8draWoXD4YjxgQMHql+/fqquro5WOQAAIIFE5RTSihUr9Pbbb2vTpk1tttXX1ys1NVVZWVkR
49nZ2aqvr293f6FQSKFQyLkdDAYlSeFwWOFw2L3C45Qv2XT+Pkkm4r+ID2735UR4/UfbkWPIsYwf9CT+uNUTN3vqeoD56KOPdNddd6myslJpaWmu7LO8vFyzZs1qM7527VplZGS48hjx7HjeM/HAsFb3CoFr3OrLmjVrXNkPpMrKyliXgK+hJ/HneHvS3NzsUiWSxxjj6v+ir1q1Stdee62Sk5OdsZaWFnk8HiUlJemVV15RIBDQZ599FrEKc+qpp2rKlCm6++672+yzvRWY/Px8ffLJJ/L7/W6WH5cGz3yl0/fxJRk9MKxV0zcnKdTqiUJVOBZu92X7zGIXqjqxhcNhVVZWauTIkfJ6vbEuB6In8citngSDQfXp00dNTU3H/fvb9RWYESNGaNu2bRFjEydO1MCBA/XLX/5S+fn58nq9qqqq0rhx4yRJdXV12rNnj4qKitrdp8/nk8/nazPu9XpPiBd3qOXYf9GFWj3HdX9Eh1t9ORFe/13lRPl5YhN6En+Otydu9tP1ANOjRw8NHjw4Yqxbt27q3bu3M37rrbeqrKxMvXr1kt/v15133qmioiJdeOGFbpcDAAASUFQ/B+abPPLII0pKStK4ceMUCoVUXFysRx99NBalAAAAC3VJgHnttdcibqelpWnRokVatGhRVzw8AABIMHwXEgAAsA4BBgAAWIcAAwAArEOAAQAA1iHAAAAA6xBgAACAdQgwAADAOgQYAABgHQIMAACwTky+SgCAO/pPXX3UOR/OGd0FlQBA12IFBgAAWIcAAwAArEOAAQAA1iHAAAAA6xBgAACAdQgwAADAOgQYAABgHQIMAACwDgEGAABYh0/iBRIcn9YLIBGxAgMAAKxDgAEAANYhwAAAAOsQYAAAgHUIMAAAwDoEGAAAYB0CDAAAsA4BBgAAWIcPsgPAh90BsA4rMAAAwDoEGAAAYB0CDAAAsA4BBgAAWIcAAwAArEOAAQAA1iHAAAAA6xBgAACAdQgwAADAOgQYAABgHQIMAACwDgEGAABYhwADAACsQ4ABAADWIcAAAADrpMS6gBNd/6mrY10C4JqOvJ4/nDO6CyoBkOhYgQEAANYhwAAAAOtwCglAl+I0EwA3sAIDAACsQ4ABAADWIcAAAADrEGAAAIB1XA8w5eXluuCCC9SjRw/17dtXY8eOVV1dXcScQ4cOqbS0VL1791b37t01btw4NTQ0uF0KAABIUK5fhbR+/XqVlpbqggsu0OHDh/WrX/1Ko0aN0o4dO9StWzdJ0t13363Vq1fr+eefV2ZmpiZPnqzrrrtOb7zxhtvlAHAJH7oIIJ64HmAqKioibi9btkx9+/ZVbW2tvv/976upqUlPPPGEli9frssuu0yStHTpUp111ll66623dOGFF7pdEgAASDBR/xyYpqYmSVKvXr0kSbW1tQqHwwoEAs6cgQMHql+/fqqurm43wIRCIYVCIed2MBiUJIXDYYXD4WiWH3W+ZBOd/SaZiP8iPtCXjunKf9dHHsv2nyWJhJ7EH7d64mZPPcaYqP0kbW1t1TXXXKPGxka9/vrrkqTly5dr4sSJEYFEkoYPH65LL71UDz30UJv9zJw5U7NmzWozvnz5cmVkZESneAAA4Krm5mbdfPPNampqkt/vP659RXUFprS0VNu3b3fCy7GaNm2aysrKnNvBYFD5+fkaNWrUcR+AWBs885Wo7NeXZPTAsFZN35ykUKsnKo+BzqMvHbN9ZnGXPVY4HFZlZaVGjhwpr9fbZY+Lb0ZP4o9bPTlyBsUNUQswkydP1ksvvaQNGzbolFNOccZzcnL0xRdfqLGxUVlZWc54Q0ODcnJy2t2Xz+eTz+drM+71eq1/cYdaovtLLNTqifpjoPPoy7eLxb/rRPh5kmjoSfw53p642U/XL6M2xmjy5MlauXKl1q1bp4KCgojtQ4cOldfrVVVVlTNWV1enPXv2qKioyO1yAABAAnJ9Baa0tFTLly/XCy+8oB49eqi+vl6SlJmZ
qfT0dGVmZurWW29VWVmZevXqJb/frzvvvFNFRUVcgQQAADrE9QCzePFiSdIPf/jDiPGlS5fqJz/5iSTpkUceUVJSksaNG6dQKKTi4mI9+uijbpcCAAASlOsBpiMXNaWlpWnRokVatGiR2w8PAABOAFH/HBgA6KyOfOrvh3NGd0ElAOIVX+YIAACsQ4ABAADWIcAAAADrEGAAAIB1CDAAAMA6BBgAAGAdAgwAALAOAQYAAFiHD7IDkLA68oF4ux4Y1QWVAHAbKzAAAMA6BBgAAGAdTiEBgAv4/iaga7ECAwAArEOAAQAA1uEUEgArdeSUDYDExQoMAACwDgEGAABYhwADAACsw3tgjhGXTAIAEDuswAAAAOsQYAAAgHU4hQTghDZ45iuaO/zL/4ZaPLEuB0AHsQIDAACsQ4ABAADWIcAAAADrEGAAAIB1CDAAAMA6XIUEAF3ErS+g5EMyAVZgAACAhQgwAADAOpxCaodby7xu7QcAAERiBQYAAFiHAAMAAKxDgAEAANYhwAAAAOsQYAAAgHW4CgkALNORKxz5sDskOlZgAACAdQgwAADAOgQYAABgHd4DAwAnKL5cEjZjBQYAAFiHAAMAAKzDKSQAQNRx6TfcxgoMAACwDgEGAABYh1NIAJCA3LrCKFFxSst+rMAAAADrEGAAAIB1OIUEAIgLR07r+JKN5g6XBs98RaEWT6f349apH04zxTdWYAAAgHViGmAWLVqk/v37Ky0tTYWFhdq4cWMsywEAAJaI2SmkZ599VmVlZVqyZIkKCws1f/58FRcXq66uTn379o1VWQCAToq3K57irZ54lAinx2K2AjNv3jzddtttmjhxos4++2wtWbJEGRkZevLJJ2NVEgAAsERMVmC++OIL1dbWatq0ac5YUlKSAoGAqqur28wPhUIKhULO7aamJknSvn37FA6HXa8v5fBB1/fZ1VJajZqbW5USTlJLa+ffBIfooC/xh57EH5t68umnn8a6hGPSkd9zX31u4XBYzc3N+vTTT+X1eo/5cffv3y9JMsYc8z6OiEmA+eSTT9TS0qLs7OyI8ezsbP39739vM7+8vFyzZs1qM15QUBC1GhPBzbEuAO2iL/GHnsQfW3rS53exriB6ovnc9u/fr8zMzOPahxWXUU+bNk1lZWXO7dbWVu3bt0+9e/eWxxPf6TxWgsGg8vPz9dFHH8nv98e6HPwPfYk/9CT+0JP441ZPjDHav3+/8vLyjrummASYPn36KDk5WQ0NDRHjDQ0NysnJaTPf5/PJ5/NFjGVlZUWzxITh9/v5ARCH6Ev8oSfxh57EHzd6crwrL0fE5E28qampGjp0qKqqqpyx1tZWVVVVqaioKBYlAQAAi8TsFFJZWZkmTJigYcOGafjw4Zo/f74OHjyoiRMnxqokAABgiZgFmBtvvFH//e9/NWPGDNXX12vIkCGqqKho88ZeHBufz6f777+/zak3xBZ9iT/0JP7Qk/gTjz3xGDeuZQIAAOhCfBcSAACwDgEGAABYhwADAACsQ4ABAADWIcDE0IYNG3T11VcrLy9PHo9Hq1atithujNGMGTOUm5ur9PR0BQIB7dq1K2LOvn37VFJSIr/fr6ysLN166606cOBAxJx3331X3/ve95SWlqb8/HzNnTu3TS3PP/+8Bg4cqLS0NJ1zzjlas2ZNp2uxXXl5uS644AL16NFDffv21dixY1VXVxcx59ChQyotLVXv3r3VvXt3jRs3rs0HMu7Zs0ejR49WRkaG+vbtq3vuuUeHDx+OmPPaa6/p/PPPl8/n0+mnn65ly5a1qWfRokXq37+/0tLSVFhYqI0bN3a6lkSwePFinXvuuc4HaBUVFenll192ttOT2JozZ448Ho+mTJnijNGTrjdz5kx5PJ6IPwMHDnS2J2RPDGJmzZo15v/+7//MX//6VyPJrFy5MmL7nDlzTGZmplm1apV55513zDXXXGMKCgrM559/7sy5/PLLzXnnnWfeeust87e//c2c
fvrp5qabbnK2NzU1mezsbFNSUmK2b99unnnmGZOenm4ee+wxZ84bb7xhkpOTzdy5c82OHTvMfffdZ7xer9m2bVunarFdcXGxWbp0qdm+fbvZunWrufLKK02/fv3MgQMHnDm33367yc/PN1VVVWbz5s3mwgsvNBdddJGz/fDhw2bw4MEmEAiYLVu2mDVr1pg+ffqYadOmOXP++c9/moyMDFNWVmZ27NhhFi5caJKTk01FRYUzZ8WKFSY1NdU8+eST5r333jO33XabycrKMg0NDR2uJVG8+OKLZvXq1eb99983dXV15le/+pXxer1m+/btxhh6EksbN240/fv3N+eee6656667nHF60vXuv/9+M2jQIPOf//zH+fPf//7X2Z6IPSHAxImvB5jW1laTk5Njfvvb3zpjjY2NxufzmWeeecYYY8yOHTuMJLNp0yZnzssvv2w8Ho/597//bYwx5tFHHzU9e/Y0oVDImfPLX/7SnHnmmc7tG264wYwePTqinsLCQvPTn/60w7Ukor179xpJZv369caYL5+z1+s1zz//vDNn586dRpKprq42xnwZSpOSkkx9fb0zZ/Hixcbv9zs9uPfee82gQYMiHuvGG280xcXFzu3hw4eb0tJS53ZLS4vJy8sz5eXlHa4lkfXs2dP86U9/oicxtH//fjNgwABTWVlpfvCDHzgBhp7Exv3332/OO++8drclak84hRSndu/erfr6egUCAWcsMzNThYWFqq6uliRVV1crKytLw4YNc+YEAgElJSWppqbGmfP9739fqampzpzi4mLV1dXps88+c+Z89XGOzDnyOB2pJRE1NTVJknr16iVJqq2tVTgcjjgOAwcOVL9+/SJ6cs4550R8IGNxcbGCwaDee+89Z863He8vvvhCtbW1EXOSkpIUCAScOR2pJRG1tLRoxYoVOnjwoIqKiuhJDJWWlmr06NFtjhs9iZ1du3YpLy9Pp512mkpKSrRnzx5JidsTAkycqq+vl6Q2n0ycnZ3tbKuvr1ffvn0jtqekpKhXr14Rc9rbx1cf45vmfHX70WpJNK2trZoyZYouvvhiDR48WNKXxyE1NbXNF4l+/Vgd6/EOBoP6/PPP9cknn6ilpeWoPTlaLYlk27Zt6t69u3w+n26//XatXLlSZ599Nj2JkRUrVujtt99WeXl5m230JDYKCwu1bNkyVVRUaPHixdq9e7e+973vaf/+/Qnbk5h9lQAQz0pLS7V9+3a9/vrrsS4Fks4880xt3bpVTU1N+vOf/6wJEyZo/fr1sS7rhPTRRx/prrvuUmVlpdLS0mJdDv7niiuucP5+7rnnqrCwUKeeeqqee+45paenx7Cy6GEFJk7l5ORIUpt3Zjc0NDjbcnJytHfv3ojthw8f1r59+yLmtLePrz7GN8356vaj1ZJIJk+erJdeekmvvvqqTjnlFGc8JydHX3zxhRobGyPmf/1YHevx9vv9Sk9PV58+fZScnHzUnhytlkSSmpqq008/XUOHDlV5ebnOO+88/f73v6cnMVBbW6u9e/fq/PPPV0pKilJSUrR+/XotWLBAKSkpys7OpidxICsrS2eccYY++OCDhP13QoCJUwUFBcrJyVFVVZUzFgwGVVNTo6KiIklSUVGRGhsbVVtb68xZt26dWltbVVhY6MzZsGGDwuGwM6eyslJnnnmmevbs6cz56uMcmXPkcTpSSyIwxmjy5MlauXKl1q1bp4KCgojtQ4cOldfrjTgOdXV12rNnT0RPtm3bFhEsKysr5ff7dfbZZztzvu14p6amaujQoRFzWltbVVVV5czpSC2JrLW1VaFQiJ7EwIgRI7Rt2zZt3brV+TNs2DCVlJQ4f6cnsXfgwAH94x//UG5ubuL+O+nUW37hqv3795stW7aYLVu2GElm3rx5ZsuWLeZf//qXMebLS5ezsrLMCy+8YN59910zZsyYdi+j/u53v2tqamrM66+/bgYMGBBxGXVjY6PJzs42P/7xj8327dvNihUr
TEZGRpvLqFNSUszDDz9sdu7cae6///52L6M+Wi22+9nPfmYyMzPNa6+9FnEpYnNzszPn9ttvN/369TPr1q0zmzdvNkVFRaaoqMjZfuRSxFGjRpmtW7eaiooKc9JJJ7V7KeI999xjdu7caRYtWtTupYg+n88sW7bM7Nixw0yaNMlkZWVFXCFwtFoSxdSpU8369evN7t27zbvvvmumTp1qPB6PWbt2rTGGnsSDr16FZAw9iYVf/OIX5rXXXjO7d+82b7zxhgkEAqZPnz5m7969xpjE7AkBJoZeffVVI6nNnwkTJhhjvrx8efr06SY7O9v4fD4zYsQIU1dXF7GPTz/91Nx0002me/fuxu/3m4kTJ5r9+/dHzHnnnXfMJZdcYnw+nzn55JPNnDlz2tTy3HPPmTPOOMOkpqaaQYMGmdWrV0ds70gttmuvF5LM0qVLnTmff/65ueOOO0zPnj1NRkaGufbaa81//vOfiP18+OGH5oorrjDp6emmT58+5he/+IUJh8MRc1599VUzZMgQk5qaak477bSIxzhi4cKFpl+/fiY1NdUMHz7cvPXWWxHbO1JLIrjlllvMqaeealJTU81JJ51kRowY4YQXY+hJPPh6gKEnXe/GG280ubm5JjU11Zx88snmxhtvNB988IGzPRF74jHGmM6t2QAAAMQW74EBAADWIcAAAADrEGAAAIB1CDAAAMA6BBgAAGAdAgwAALAOAQYAAFiHAAMAAKxDgAEAANYhwAAAAOsQYAAAgHUIMAAAwDr/Dxg7FwrJ17igAAAAAElFTkSuQmCC\n"
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "execution_count": 50
+ },
+ {
+ "metadata": {},
+ "cell_type": "markdown",
+ "source": ""
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.8.2"
+ },
+ "toc": {
+ "base_numbering": 1,
+ "nav_menu": {},
+ "number_sections": true,
+ "sideBar": true,
+ "skip_h1_title": false,
+ "title_cell": "Table of Contents",
+ "title_sidebar": "Contents",
+ "toc_cell": false,
+ "toc_position": {
+ "height": "583px",
+ "left": "0px",
+ "right": "1324px",
+ "top": "107px",
+ "width": "212px"
+ },
+ "toc_section_display": "block",
+ "toc_window_display": true
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/section-04-research-and-development/06-feature-engineering-with-open-source.ipynb b/section-04-research-and-development/06-feature-engineering-with-open-source.ipynb
index 2d25751b3..232e2ae3b 100644
--- a/section-04-research-and-development/06-feature-engineering-with-open-source.ipynb
+++ b/section-04-research-and-development/06-feature-engineering-with-open-source.ipynb
@@ -20,9 +20,12 @@
},
{
"cell_type": "code",
- "execution_count": 1,
- "metadata": {},
- "outputs": [],
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2025-02-28T11:19:36.355906Z",
+ "start_time": "2025-02-28T11:19:29.833665Z"
+ }
+ },
"source": [
"# data manipulation and plotting\n",
"import pandas as pd\n",
@@ -58,12 +61,28 @@
"\n",
"# to visualise al the columns in the dataframe\n",
"pd.pandas.set_option('display.max_columns', None)"
- ]
+ ],
+ "outputs": [],
+ "execution_count": 1
},
{
"cell_type": "code",
- "execution_count": 2,
- "metadata": {},
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2025-02-28T11:19:36.760614Z",
+ "start_time": "2025-02-28T11:19:36.461852Z"
+ }
+ },
+ "source": [
+ "# load dataset\n",
+ "data = pd.read_csv('train.csv')\n",
+ "\n",
+ "# rows and columns of the data\n",
+ "print(data.shape)\n",
+ "\n",
+ "# visualise the dataset\n",
+ "data.head()"
+ ],
"outputs": [
{
"name": "stdout",
@@ -74,6 +93,98 @@
},
{
"data": {
+ "text/plain": [
+ " Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape \\\n",
+ "0 1 60 RL 65.0 8450 Pave NaN Reg \n",
+ "1 2 20 RL 80.0 9600 Pave NaN Reg \n",
+ "2 3 60 RL 68.0 11250 Pave NaN IR1 \n",
+ "3 4 70 RL 60.0 9550 Pave NaN IR1 \n",
+ "4 5 60 RL 84.0 14260 Pave NaN IR1 \n",
+ "\n",
+ " LandContour Utilities LotConfig LandSlope Neighborhood Condition1 \\\n",
+ "0 Lvl AllPub Inside Gtl CollgCr Norm \n",
+ "1 Lvl AllPub FR2 Gtl Veenker Feedr \n",
+ "2 Lvl AllPub Inside Gtl CollgCr Norm \n",
+ "3 Lvl AllPub Corner Gtl Crawfor Norm \n",
+ "4 Lvl AllPub FR2 Gtl NoRidge Norm \n",
+ "\n",
+ " Condition2 BldgType HouseStyle OverallQual OverallCond YearBuilt \\\n",
+ "0 Norm 1Fam 2Story 7 5 2003 \n",
+ "1 Norm 1Fam 1Story 6 8 1976 \n",
+ "2 Norm 1Fam 2Story 7 5 2001 \n",
+ "3 Norm 1Fam 2Story 7 5 1915 \n",
+ "4 Norm 1Fam 2Story 8 5 2000 \n",
+ "\n",
+ " YearRemodAdd RoofStyle RoofMatl Exterior1st Exterior2nd MasVnrType \\\n",
+ "0 2003 Gable CompShg VinylSd VinylSd BrkFace \n",
+ "1 1976 Gable CompShg MetalSd MetalSd None \n",
+ "2 2002 Gable CompShg VinylSd VinylSd BrkFace \n",
+ "3 1970 Gable CompShg Wd Sdng Wd Shng None \n",
+ "4 2000 Gable CompShg VinylSd VinylSd BrkFace \n",
+ "\n",
+ " MasVnrArea ExterQual ExterCond Foundation BsmtQual BsmtCond BsmtExposure \\\n",
+ "0 196.0 Gd TA PConc Gd TA No \n",
+ "1 0.0 TA TA CBlock Gd TA Gd \n",
+ "2 162.0 Gd TA PConc Gd TA Mn \n",
+ "3 0.0 TA TA BrkTil TA Gd No \n",
+ "4 350.0 Gd TA PConc Gd TA Av \n",
+ "\n",
+ " BsmtFinType1 BsmtFinSF1 BsmtFinType2 BsmtFinSF2 BsmtUnfSF TotalBsmtSF \\\n",
+ "0 GLQ 706 Unf 0 150 856 \n",
+ "1 ALQ 978 Unf 0 284 1262 \n",
+ "2 GLQ 486 Unf 0 434 920 \n",
+ "3 ALQ 216 Unf 0 540 756 \n",
+ "4 GLQ 655 Unf 0 490 1145 \n",
+ "\n",
+ " Heating HeatingQC CentralAir Electrical 1stFlrSF 2ndFlrSF LowQualFinSF \\\n",
+ "0 GasA Ex Y SBrkr 856 854 0 \n",
+ "1 GasA Ex Y SBrkr 1262 0 0 \n",
+ "2 GasA Ex Y SBrkr 920 866 0 \n",
+ "3 GasA Gd Y SBrkr 961 756 0 \n",
+ "4 GasA Ex Y SBrkr 1145 1053 0 \n",
+ "\n",
+ " GrLivArea BsmtFullBath BsmtHalfBath FullBath HalfBath BedroomAbvGr \\\n",
+ "0 1710 1 0 2 1 3 \n",
+ "1 1262 0 1 2 0 3 \n",
+ "2 1786 1 0 2 1 3 \n",
+ "3 1717 1 0 1 0 3 \n",
+ "4 2198 1 0 2 1 4 \n",
+ "\n",
+ " KitchenAbvGr KitchenQual TotRmsAbvGrd Functional Fireplaces FireplaceQu \\\n",
+ "0 1 Gd 8 Typ 0 NaN \n",
+ "1 1 TA 6 Typ 1 TA \n",
+ "2 1 Gd 6 Typ 1 TA \n",
+ "3 1 Gd 7 Typ 1 Gd \n",
+ "4 1 Gd 9 Typ 1 TA \n",
+ "\n",
+ " GarageType GarageYrBlt GarageFinish GarageCars GarageArea GarageQual \\\n",
+ "0 Attchd 2003.0 RFn 2 548 TA \n",
+ "1 Attchd 1976.0 RFn 2 460 TA \n",
+ "2 Attchd 2001.0 RFn 2 608 TA \n",
+ "3 Detchd 1998.0 Unf 3 642 TA \n",
+ "4 Attchd 2000.0 RFn 3 836 TA \n",
+ "\n",
+ " GarageCond PavedDrive WoodDeckSF OpenPorchSF EnclosedPorch 3SsnPorch \\\n",
+ "0 TA Y 0 61 0 0 \n",
+ "1 TA Y 298 0 0 0 \n",
+ "2 TA Y 0 42 0 0 \n",
+ "3 TA Y 0 35 272 0 \n",
+ "4 TA Y 192 84 0 0 \n",
+ "\n",
+ " ScreenPorch PoolArea PoolQC Fence MiscFeature MiscVal MoSold YrSold \\\n",
+ "0 0 0 NaN NaN NaN 0 2 2008 \n",
+ "1 0 0 NaN NaN NaN 0 5 2007 \n",
+ "2 0 0 NaN NaN NaN 0 9 2008 \n",
+ "3 0 0 NaN NaN NaN 0 2 2006 \n",
+ "4 0 0 NaN NaN NaN 0 12 2008 \n",
+ "\n",
+ " SaleType SaleCondition SalePrice \n",
+ "0 WD Normal 208500 \n",
+ "1 WD Normal 181500 \n",
+ "2 WD Normal 223500 \n",
+ "3 WD Abnorml 140000 \n",
+ "4 WD Normal 250000 "
+ ],
"text/html": [
"\n",
"