From df35da7811f2a02b864cd7b97cf55064fa37bbf2 Mon Sep 17 00:00:00 2001 From: Alessandro Cesa Date: Fri, 7 Mar 2025 12:53:25 +0100 Subject: [PATCH 01/11] committing my fork, with change on docker version --- .circleci/config.yml | 2 +- .../requirements/requirements.txt | 9 +- .../tests/test_prediction.py | 1 + assignment-section-05/tox.ini | 1 + .../classification_model/VERSION | 1 + .../classification_model/__init__.py | 17 + .../classification_model/config.yml | 50 + .../classification_model/config/__init__.py | 0 .../classification_model/config/core.py | 85 + .../classification_model/datasets/__init__.py | 0 .../classification_model/pipeline.py | 63 + .../classification_model/predict.py | 34 + .../processing/__init__.py | 0 .../processing/data_manager.py | 102 + .../processing/features.py | 26 + .../processing/validation.py | 63 + .../classification_model/train_pipeline.py | 31 + .../trained_models/__init__.py | 0 my-assignement-section-05/mypy.ini | 14 + my-assignement-section-05/pyproject.toml | 48 + .../requirements/requirements.txt | 12 + .../requirements/test_requirements.txt | 4 + .../requirements/typing_requirements.txt | 5 + my-assignement-section-05/setup.py | 63 + my-assignement-section-05/tests/__init__.py | 0 my-assignement-section-05/tests/conftest.py | 9 + .../tests/test_features.py | 14 + .../tests/test_prediction.py | 29 + my-assignement-section-05/tox.ini | 56 + ...hine-learning-pipeline-data-analysis.ipynb | 2741 +++++++++-------- ...earning-pipeline-feature-engineering.ipynb | 1857 ++++++----- ...-learning-pipeline-feature-selection.ipynb | 429 +-- ...ine-learning-pipeline-model-training.ipynb | 638 ++-- ...e-learning-pipeline-scoring-new-data.ipynb | 710 +++-- ...pipeline-scoring-new-data_my_version.ipynb | 2492 +++++++++++++++ ...feature-engineering-with-open-source.ipynb | 1779 ++++++----- .../07-feature-engineering-pipeline.ipynb | 657 ++-- .../08-final-machine-learning-pipeline.ipynb | 517 ++-- .../linear_regression.joblib | Bin 0 -> 1013 bytes .../minmax_scaler.joblib | Bin 0 -> 3854 bytes .../preprocessors.py | 4 +- .../price_pipe.joblib | Bin 0 -> 11509 bytes .../selected_features.csv | 37 + ...predicting-survival-titanic-solution.ipynb | 267 +- ...titanic-survival-pipeline-assignment.ipynb | 407 ++- ...4-titanic-survival-pipeline-solution.ipynb | 336 +- .../titanic-assignment/titanic.csv | 1310 ++++++++ section-04-research-and-development/xtest.csv | 147 + .../xtrain.csv | 1315 ++++++++ section-04-research-and-development/ytest.csv | 147 + .../ytrain.csv | 1315 ++++++++ .../regression_model/processing/validation.py | 8 +- .../requirements/requirements.txt | 7 +- section-05-production-model-package/tox.ini | 2 +- .../house-prices-api/.requirements.txt.un~ | Bin 0 -> 1452 bytes .../house-prices-api/requirements.txt | 27 +- .../house-prices-api/runtime.txt | 1 + .../house-prices-api/test_requirements.txt | 9 +- .../house-prices-api/tox.ini | 3 + .../house-prices-api/app/api.py | 2 +- .../house-prices-api/app/api.py | 2 +- 61 files changed, 13311 insertions(+), 4594 deletions(-) create mode 100644 my-assignement-section-05/classification_model/VERSION create mode 100644 my-assignement-section-05/classification_model/__init__.py create mode 100644 my-assignement-section-05/classification_model/config.yml create mode 100644 my-assignement-section-05/classification_model/config/__init__.py create mode 100644 my-assignement-section-05/classification_model/config/core.py create mode 100644 my-assignement-section-05/classification_model/datasets/__init__.py create mode 100644 my-assignement-section-05/classification_model/pipeline.py create mode 100644 my-assignement-section-05/classification_model/predict.py create mode 100644 my-assignement-section-05/classification_model/processing/__init__.py create mode 100644 my-assignement-section-05/classification_model/processing/data_manager.py create mode 100644 my-assignement-section-05/classification_model/processing/features.py create mode 100644 my-assignement-section-05/classification_model/processing/validation.py create mode 100644 my-assignement-section-05/classification_model/train_pipeline.py create mode 100644 my-assignement-section-05/classification_model/trained_models/__init__.py create mode 100644 my-assignement-section-05/mypy.ini create mode 100644 my-assignement-section-05/pyproject.toml create mode 100644 my-assignement-section-05/requirements/requirements.txt create mode 100644 my-assignement-section-05/requirements/test_requirements.txt create mode 100644 my-assignement-section-05/requirements/typing_requirements.txt create mode 100644 my-assignement-section-05/setup.py create mode 100644 my-assignement-section-05/tests/__init__.py create mode 100644 my-assignement-section-05/tests/conftest.py create mode 100644 my-assignement-section-05/tests/test_features.py create mode 100644 my-assignement-section-05/tests/test_prediction.py create mode 100644 my-assignement-section-05/tox.ini create mode 100644 section-04-research-and-development/05-machine-learning-pipeline-scoring-new-data_my_version.ipynb create mode 100644 section-04-research-and-development/linear_regression.joblib create mode 100644 section-04-research-and-development/minmax_scaler.joblib create mode 100644 section-04-research-and-development/price_pipe.joblib create mode 100644 section-04-research-and-development/selected_features.csv create mode 100644 section-04-research-and-development/titanic-assignment/titanic.csv create mode 100644 section-04-research-and-development/xtest.csv create mode 100644 section-04-research-and-development/xtrain.csv create mode 100644 section-04-research-and-development/ytest.csv create mode 100644 section-04-research-and-development/ytrain.csv create mode 100644 section-06-model-serving-api/house-prices-api/.requirements.txt.un~ create mode 100644 section-06-model-serving-api/house-prices-api/runtime.txt diff --git a/.circleci/config.yml b/.circleci/config.yml index 037645ab2..493640f1b 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -82,7 +82,7 @@ jobs: steps: - setup_remote_docker: # Supported versions: https://circleci.com/docs/2.0/building-docker-images/#docker-version - version: 20.10.18 + version: default - checkout: path: ~/project/ - node/install: diff --git a/assignment-section-05/requirements/requirements.txt b/assignment-section-05/requirements/requirements.txt index f3783b618..a0800e668 100644 --- a/assignment-section-05/requirements/requirements.txt +++ b/assignment-section-05/requirements/requirements.txt @@ -1,11 +1,12 @@ # We use compatible release functionality (see PEP 440 here: https://www.python.org/dev/peps/pep-0440/#compatible-release) # to specify acceptable version ranges of our project dependencies. This gives us the flexibility to keep up with small # updates/fixes, whilst ensuring we don't install a major update which could introduce backwards incompatible changes. -numpy>=1.21.0,<2.0.0 +numpy>=1.21.0,<1.25.0 pandas>=1.3.5,<2.0.0 pydantic>=1.8.1,<2.0.0 -scikit-learn>=1.1.3,<2.0.0 +scikit-learn>=1.0.2,<1.1.0 strictyaml>=1.3.2,<2.0.0 ruamel.yaml>=0.16.12,<1.0.0 -feature-engine>=1.0.2,<2.0.0 -joblib>=1.0.1,<2.0.0 \ No newline at end of file +feature-engine>=1.0.2,<1.6.0 # breaking change in v1.6.0 +joblib>=1.0.1,<2.0.0 +setuptools<60 \ No newline at end of file diff --git a/assignment-section-05/tests/test_prediction.py b/assignment-section-05/tests/test_prediction.py index 76965698a..e0d4af892 100644 --- a/assignment-section-05/tests/test_prediction.py +++ b/assignment-section-05/tests/test_prediction.py @@ -17,6 +17,7 @@ def test_make_prediction(sample_input_data): # Then predictions = result.get("predictions") + print(predictions) assert isinstance(predictions, np.ndarray) assert isinstance(predictions[0], np.int64) assert result.get("errors") is None diff --git a/assignment-section-05/tox.ini b/assignment-section-05/tox.ini index 37829355f..76484c454 100644 --- a/assignment-section-05/tox.ini +++ b/assignment-section-05/tox.ini @@ -12,6 +12,7 @@ envlist = test_package, checks skipsdist = True [testenv] +basepython = python3.9 install_command = pip install {opts} {packages} [testenv:test_package] diff --git a/my-assignement-section-05/classification_model/VERSION b/my-assignement-section-05/classification_model/VERSION new file mode 100644 index 000000000..8acdd82b7 --- /dev/null +++ b/my-assignement-section-05/classification_model/VERSION @@ -0,0 +1 @@ +0.0.1 diff --git a/my-assignement-section-05/classification_model/__init__.py b/my-assignement-section-05/classification_model/__init__.py new file mode 100644 index 000000000..8cea86752 --- /dev/null +++ b/my-assignement-section-05/classification_model/__init__.py @@ -0,0 +1,17 @@ +import logging + +from classification_model.config.core import PACKAGE_ROOT, config + +# It is strongly advised that you do not add any handlers other than +# NullHandler to your library’s loggers. This is because the configuration +# of handlers is the prerogative of the application developer who uses your +# library. The application developer knows their target audience and what +# handlers are most appropriate for their application: if you add handlers +# ‘under the hood’, you might well interfere with their ability to carry out +# unit tests and deliver logs which suit their requirements. +# https://docs.python.org/3/howto/logging.html#configuring-logging-for-a-library +logging.getLogger(config.app_config.package_name).addHandler(logging.NullHandler()) + + +with open(PACKAGE_ROOT / "VERSION") as version_file: + __version__ = version_file.read().strip() diff --git a/my-assignement-section-05/classification_model/config.yml b/my-assignement-section-05/classification_model/config.yml new file mode 100644 index 000000000..56f5dc65a --- /dev/null +++ b/my-assignement-section-05/classification_model/config.yml @@ -0,0 +1,50 @@ +# Package Overview +package_name: classification_model + +# Data Files +data_file: raw.csv + +# Variables +# The variable we are attempting to predict (sale price) +target: survived + +pipeline_name: classification_model +pipeline_save_file: classification _model_output_v + +# Will cause syntax errors since they begin with numbers +variables_to_rename: + home.dest: home_dest + +features: + - pclass + - survived + - sex + - age + - sibsp + - parch + - fare + - cabin + - embarked + - title + + +# set train/test split +test_size: 0.1 + +# to set the random seed +random_state: 0 + +alpha: 0.001 + +numerical_vars: + - age + - fare + +cabin: + - cabin + +categorical_vars: + - sex + - cabin + - embarked + - title \ No newline at end of file diff --git a/my-assignement-section-05/classification_model/config/__init__.py b/my-assignement-section-05/classification_model/config/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/my-assignement-section-05/classification_model/config/core.py b/my-assignement-section-05/classification_model/config/core.py new file mode 100644 index 000000000..75cdf5518 --- /dev/null +++ b/my-assignement-section-05/classification_model/config/core.py @@ -0,0 +1,85 @@ +from pathlib import Path +from typing import Dict, List, Optional + +from pydantic import BaseModel +from strictyaml import YAML, load + +import classification_model + +# Project Directories +PACKAGE_ROOT = Path(classification_model.__file__).resolve().parent +ROOT = PACKAGE_ROOT.parent +CONFIG_FILE_PATH = PACKAGE_ROOT / "config.yml" +DATASET_DIR = PACKAGE_ROOT / "datasets" +TRAINED_MODEL_DIR = PACKAGE_ROOT / "trained_models" + + +class AppConfig(BaseModel): + """ + Application-level config. + """ + + package_name: str + data_file: str + pipeline_save_file: str + + +class ModelConfig(BaseModel): + """ + All configuration relevant to model + training and feature engineering. + """ + + target: str + variables_to_rename: Dict + features: List[str] + test_size: float + random_state: int + alpha: float + categorical_vars: List[str] + numerical_vars: List[str] + cabin: List[str] + + +class Config(BaseModel): + """Master config object.""" + + app_config: AppConfig + model_config: ModelConfig + + +def find_config_file() -> Path: + """Locate the configuration file.""" + if CONFIG_FILE_PATH.is_file(): + return CONFIG_FILE_PATH + raise Exception(f"Config not found at {CONFIG_FILE_PATH!r}") + + +def fetch_config_from_yaml(cfg_path: Optional[Path] = None) -> YAML: + """Parse YAML containing the package configuration.""" + + if not cfg_path: + cfg_path = find_config_file() + + if cfg_path: + with open(cfg_path, "r") as conf_file: + parsed_config = load(conf_file.read()) + return parsed_config + raise OSError(f"Did not find config file at path: {cfg_path}") + + +def create_and_validate_config(parsed_config: YAML = None) -> Config: + """Run validation on config values.""" + if parsed_config is None: + parsed_config = fetch_config_from_yaml() + + # specify the data attribute from the strictyaml YAML type. + _config = Config( + app_config=AppConfig(**parsed_config.data), + model_config=ModelConfig(**parsed_config.data), + ) + + return _config + + +config = create_and_validate_config() diff --git a/my-assignement-section-05/classification_model/datasets/__init__.py b/my-assignement-section-05/classification_model/datasets/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/my-assignement-section-05/classification_model/pipeline.py b/my-assignement-section-05/classification_model/pipeline.py new file mode 100644 index 000000000..11fa84c71 --- /dev/null +++ b/my-assignement-section-05/classification_model/pipeline.py @@ -0,0 +1,63 @@ +# for encoding categorical variables +from feature_engine.encoding import OneHotEncoder, RareLabelEncoder +from feature_engine.imputation import ( + AddMissingIndicator, + CategoricalImputer, + MeanMedianImputer, +) +from sklearn.linear_model import LogisticRegression +from sklearn.pipeline import Pipeline +from sklearn.preprocessing import StandardScaler + +from classification_model.config.core import config +from classification_model.processing import features as pp + +titanic_pipe = Pipeline( + [ + # ===== IMPUTATION ===== + # impute categorical variables with string missing + ( + "categorical_imputation", + CategoricalImputer( + imputation_method="missing", + variables=config.model_config.categorical_vars, + ), + ), + # add missing indicator to numerical variables + ( + "missing_indicator", + AddMissingIndicator(variables=config.model_config.numerical_vars), + ), + # impute numerical variables with the median + ( + "median_imputation", + MeanMedianImputer( + imputation_method="median", variables=config.model_config.numerical_vars + ), + ), + # Extract letter from cabin + ( + "extract_letter", + pp.ExtractLetterTransformer(variables=config.model_config.cabin), + ), + # == CATEGORICAL ENCODING ====== + # remove categories present in less than 5% of the observations (0.05) + # group them in one category called 'Rare' + ( + "rare_label_encoder", + RareLabelEncoder( + tol=0.05, n_categories=1, variables=config.model_config.categorical_vars + ), + ), + # encode categorical variables using one hot encoding into k-1 variables + ( + "categorical_encoder", + OneHotEncoder( + drop_last=True, variables=config.model_config.categorical_vars + ), + ), + # scale + ("scaler", StandardScaler()), + ("Logit", LogisticRegression(C=0.0005, random_state=0)), + ] +) diff --git a/my-assignement-section-05/classification_model/predict.py b/my-assignement-section-05/classification_model/predict.py new file mode 100644 index 000000000..eb2990bb3 --- /dev/null +++ b/my-assignement-section-05/classification_model/predict.py @@ -0,0 +1,34 @@ +import typing as t + +import pandas as pd + +from classification_model import __version__ as _version +from classification_model.config.core import config +from classification_model.processing.data_manager import load_pipeline +from classification_model.processing.validation import validate_inputs + +pipeline_file_name = f"{config.app_config.pipeline_save_file}{_version}.pkl" +_titanic_pipe = load_pipeline(file_name=pipeline_file_name) + + +def make_prediction( + *, + input_data: t.Union[pd.DataFrame, dict], +) -> dict: + """Make a prediction using a saved model pipeline.""" + + data = pd.DataFrame(input_data) + validated_data, errors = validate_inputs(input_data=data) + results = {"predictions": None, "version": _version, "errors": errors} + + if not errors: + predictions = _titanic_pipe.predict( + X=validated_data[config.model_config.features] + ) + results = { + "predictions": predictions, + "version": _version, + "errors": errors, + } + + return results diff --git a/my-assignement-section-05/classification_model/processing/__init__.py b/my-assignement-section-05/classification_model/processing/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/my-assignement-section-05/classification_model/processing/data_manager.py b/my-assignement-section-05/classification_model/processing/data_manager.py new file mode 100644 index 000000000..032282501 --- /dev/null +++ b/my-assignement-section-05/classification_model/processing/data_manager.py @@ -0,0 +1,102 @@ +import re +import typing as t +from pathlib import Path + +import joblib +import numpy as np +import pandas as pd +from sklearn.pipeline import Pipeline + +from classification_model import __version__ as _version +from classification_model.config.core import DATASET_DIR, TRAINED_MODEL_DIR, config + + +def load_dataset(*, file_name: str) -> pd.DataFrame: + dataframe = pd.read_csv(Path(f"{DATASET_DIR}/{file_name}")) + dataframe = preliminary_transformations(dataframe) + + return dataframe + + +def preliminary_transformations(dataframe: pd.DataFrame) -> pd.DataFrame: + dataframe = dataframe.replace("?", np.nan) + dataframe["cabin"] = dataframe["cabin"].apply(get_first_cabin) + dataframe["title"] = dataframe["name"].apply(get_title) + for label in ["fare", "age"]: + dataframe[label] = dataframe[label].astype("float") + dataframe = drop_unused_variables(dataframe) + return dataframe + + +def get_title(passenger): + line = passenger + if re.search("Mrs", line): + return "Mrs" + elif re.search("Mr", line): + return "Mr" + elif re.search("Miss", line): + return "Miss" + elif re.search("Master", line): + return "Master" + else: + return "Other" + + +def get_first_cabin(row): + try: + return row.split()[0] + except (AttributeError, IndexError): + return np.nan + + +def drop_unused_variables(dataframe: pd.DataFrame) -> pd.DataFrame: + used_variables = set(config.model_config.features) + used_variables.add(config.model_config.target) + unused_variables = list(set(dataframe.columns) - used_variables) + dataframe.drop(columns=unused_variables, inplace=True) + return dataframe + + +def save_pipeline(*, pipeline_to_persist: Pipeline) -> None: + """Persist the pipeline. + Saves the versioned model, and overwrites any previous + saved models. This ensures that when the package is + published, there is only one trained model that can be + called, and we know exactly how it was built. + """ + + # Prepare versioned save file name + save_file_name = f"{config.app_config.pipeline_save_file}{_version}.pkl" + save_path = TRAINED_MODEL_DIR / save_file_name + + remove_old_pipelines(files_to_keep=[save_file_name]) + joblib.dump(pipeline_to_persist, save_path) + + +def load_pipeline(*, file_name: str) -> Pipeline: + """Load a persisted pipeline.""" + + file_path = TRAINED_MODEL_DIR / file_name + trained_model = joblib.load(filename=file_path) + return trained_model + + +def remove_old_pipelines(*, files_to_keep: t.List[str]) -> None: + """ + Remove old model pipelines. + This is to ensure there is a simple one-to-one + mapping between the package version and the model + version to be imported and used by other applications. + """ + do_not_delete = files_to_keep + ["__init__.py"] + for model_file in TRAINED_MODEL_DIR.iterdir(): + if model_file.name not in do_not_delete: + model_file.unlink() + + +def save_dataset(*, dataset: pd.DataFrame, dataset_name: str) -> None: + + save_file_name = f"{dataset_name}.csv" + save_path = DATASET_DIR / save_file_name + + dataset.to_csv(save_path, index=False) diff --git a/my-assignement-section-05/classification_model/processing/features.py b/my-assignement-section-05/classification_model/processing/features.py new file mode 100644 index 000000000..6e4fc8d3e --- /dev/null +++ b/my-assignement-section-05/classification_model/processing/features.py @@ -0,0 +1,26 @@ +from sklearn.base import BaseEstimator, TransformerMixin + + +class ExtractLetterTransformer(BaseEstimator, TransformerMixin): + # Extract fist letter of variable + + def __init__(self, variables): + + if not isinstance(variables, list): + raise ValueError("variables should be a list") + + self.variables = variables + + def fit(self, X, y=None): + # we need this step to fit the sklearn pipeline + return self + + def transform(self, X): + + # so that we do not over-write the original dataframe + X = X.copy() + + for feature in self.variables: + X[feature] = X[feature].str[0] + + return X diff --git a/my-assignement-section-05/classification_model/processing/validation.py b/my-assignement-section-05/classification_model/processing/validation.py new file mode 100644 index 000000000..d8c34cfa9 --- /dev/null +++ b/my-assignement-section-05/classification_model/processing/validation.py @@ -0,0 +1,63 @@ +from typing import List, Optional, Tuple + +import numpy as np +import pandas as pd +from pydantic import BaseModel, ValidationError + +from classification_model.config.core import config + + +def drop_na_inputs(*, input_data: pd.DataFrame) -> pd.DataFrame: + """Check model inputs for na values and filter.""" + validated_data = input_data.copy() + new_vars_with_na = [ + var + for var in config.model_config.features + if var + not in config.model_config.categorical_vars_with_na_frequent + + config.model_config.categorical_vars_with_na_missing + + config.model_config.numerical_vars_with_na + and validated_data[var].isnull().sum() > 0 + ] + validated_data.dropna(subset=new_vars_with_na, inplace=True) + + return validated_data + + +def validate_inputs(*, input_data: pd.DataFrame) -> Tuple[pd.DataFrame, Optional[dict]]: + """Check model inputs for unprocessable values.""" + + # convert syntax error field names (beginning with numbers) + relevant_data = input_data[config.model_config.features].copy() + errors = None + + try: + # replace numpy nans so that pydantic can validate + MultipleTitanicDataInputs( + inputs=relevant_data.replace({np.nan: None}).to_dict(orient="records") + ) + except ValidationError as error: + errors = error.json() + + return relevant_data, errors + + +class TitanicDataInputSchema(BaseModel): + pclass: Optional[int] + survived: Optional[int] + name: Optional[str] + sex: Optional[str] + age: Optional[float] + sibsp: Optional[int] + parch: Optional[int] + ticket: Optional[int] + fare: Optional[float] + cabin: Optional[str] + embarked: Optional[str] + boat: Optional[int] + body: Optional[int] + home_dest: Optional[str] + + +class MultipleTitanicDataInputs(BaseModel): + inputs: List[TitanicDataInputSchema] diff --git a/my-assignement-section-05/classification_model/train_pipeline.py b/my-assignement-section-05/classification_model/train_pipeline.py new file mode 100644 index 000000000..34a5efb54 --- /dev/null +++ b/my-assignement-section-05/classification_model/train_pipeline.py @@ -0,0 +1,31 @@ +from config.core import config +from pipeline import titanic_pipe +from processing.data_manager import load_dataset, save_pipeline +from sklearn.model_selection import train_test_split + + +def run_training() -> None: + """Train the model.""" + + # read training data + data = load_dataset(file_name=config.app_config.data_file) + + # divide train and test + X_train, X_test, y_train, y_test = train_test_split( + data[config.model_config.features], # predictors + data[config.model_config.target], + test_size=config.model_config.test_size, + # we are setting the random seed here + # for reproducibility + random_state=config.model_config.random_state, + ) + + # fit model + titanic_pipe.fit(X_train, y_train) + + # persist trained model + save_pipeline(pipeline_to_persist=titanic_pipe) + + +if __name__ == "__main__": + run_training() diff --git a/my-assignement-section-05/classification_model/trained_models/__init__.py b/my-assignement-section-05/classification_model/trained_models/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/my-assignement-section-05/mypy.ini b/my-assignement-section-05/mypy.ini new file mode 100644 index 000000000..9f1b46b12 --- /dev/null +++ b/my-assignement-section-05/mypy.ini @@ -0,0 +1,14 @@ +[mypy] +# warn_unreachable = True +warn_unused_ignores = True +follow_imports = skip +show_error_context = True +warn_incomplete_stub = True +ignore_missing_imports = True +check_untyped_defs = True +cache_dir = /dev/null +# Cannot enable this one as we still allow defining functions without any types. +# disallow_untyped_defs = True +warn_redundant_casts = True +warn_unused_configs = True +strict_optional = True \ No newline at end of file diff --git a/my-assignement-section-05/pyproject.toml b/my-assignement-section-05/pyproject.toml new file mode 100644 index 000000000..31a46cadd --- /dev/null +++ b/my-assignement-section-05/pyproject.toml @@ -0,0 +1,48 @@ +[build-system] +requires = [ + "setuptools>=42", + "wheel" +] +build-backend = "setuptools.build_meta" + +[tool.pytest.ini_options] +minversion = "2.0" +addopts = "-rfEX -p pytester --strict-markers" +python_files = ["test_*.py", "*_test.py"] +python_classes = ["Test", "Acceptance"] +python_functions = ["test"] +# NOTE: "doc" is not included here, but gets tested explicitly via "doctesting". +testpaths = ["tests"] +xfail_strict = true +filterwarnings = [ + "error", + "default:Using or importing the ABCs:DeprecationWarning:unittest2.*", + # produced by older pyparsing<=2.2.0. + "default:Using or importing the ABCs:DeprecationWarning:pyparsing.*", + "default:the imp module is deprecated in favour of importlib:DeprecationWarning:nose.*", + # distutils is deprecated in 3.10, scheduled for removal in 3.12 + "ignore:The distutils package is deprecated:DeprecationWarning", + # produced by python3.6/site.py itself (3.6.7 on Travis, could not trigger it with 3.6.8)." + "ignore:.*U.*mode is deprecated:DeprecationWarning:(?!(pytest|_pytest))", + # produced by pytest-xdist + "ignore:.*type argument to addoption.*:DeprecationWarning", + # produced on execnet (pytest-xdist) + "ignore:.*inspect.getargspec.*deprecated, use inspect.signature.*:DeprecationWarning", + # pytest's own futurewarnings + "ignore::pytest.PytestExperimentalApiWarning", + # Do not cause SyntaxError for invalid escape sequences in py37. + # Those are caught/handled by pyupgrade, and not easy to filter with the + # module being the filename (with .py removed). + "default:invalid escape sequence:DeprecationWarning", + # ignore use of unregistered marks, because we use many to test the implementation + "ignore::_pytest.warning_types.PytestUnknownMarkWarning", +] + +[tool.black] +target-version = ['py311'] + +[tool.isort] +profile = "black" +line_length = 100 +lines_between_sections = 1 +skip = "migrations" diff --git a/my-assignement-section-05/requirements/requirements.txt b/my-assignement-section-05/requirements/requirements.txt new file mode 100644 index 000000000..a0800e668 --- /dev/null +++ b/my-assignement-section-05/requirements/requirements.txt @@ -0,0 +1,12 @@ +# We use compatible release functionality (see PEP 440 here: https://www.python.org/dev/peps/pep-0440/#compatible-release) +# to specify acceptable version ranges of our project dependencies. This gives us the flexibility to keep up with small +# updates/fixes, whilst ensuring we don't install a major update which could introduce backwards incompatible changes. +numpy>=1.21.0,<1.25.0 +pandas>=1.3.5,<2.0.0 +pydantic>=1.8.1,<2.0.0 +scikit-learn>=1.0.2,<1.1.0 +strictyaml>=1.3.2,<2.0.0 +ruamel.yaml>=0.16.12,<1.0.0 +feature-engine>=1.0.2,<1.6.0 # breaking change in v1.6.0 +joblib>=1.0.1,<2.0.0 +setuptools<60 \ No newline at end of file diff --git a/my-assignement-section-05/requirements/test_requirements.txt b/my-assignement-section-05/requirements/test_requirements.txt new file mode 100644 index 000000000..e69019391 --- /dev/null +++ b/my-assignement-section-05/requirements/test_requirements.txt @@ -0,0 +1,4 @@ +-r requirements.txt + +# testing requirements +pytest>=7.2.0,<8.0.0 diff --git a/my-assignement-section-05/requirements/typing_requirements.txt b/my-assignement-section-05/requirements/typing_requirements.txt new file mode 100644 index 000000000..667cc2e4d --- /dev/null +++ b/my-assignement-section-05/requirements/typing_requirements.txt @@ -0,0 +1,5 @@ +# repo maintenance tooling +black>=22.12.0,<23.0.0 +flake8>=6.0.0,<7.0.0 +mypy>=0.991,<1.0.0 +isort>=5.11.4,<6.0.0 \ No newline at end of file diff --git a/my-assignement-section-05/setup.py b/my-assignement-section-05/setup.py new file mode 100644 index 000000000..20329cd0d --- /dev/null +++ b/my-assignement-section-05/setup.py @@ -0,0 +1,63 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +from pathlib import Path + +from setuptools import find_packages, setup + +# Package meta-data. +NAME = 'titanic-classification-model' +DESCRIPTION = "Example classification model package from Train In Data." +URL = "https://github.com/trainindata/testing-and-monitoring-ml-deployments" +EMAIL = "cesa@circletouch.eu" +AUTHOR = "Alessandro Cesa" +REQUIRES_PYTHON = "=3.9.1" + + +# The rest you shouldn't have to touch too much :) +# ------------------------------------------------ +# Except, perhaps the License and Trove Classifiers! +# If you do change the License, remember to change the +# Trove Classifier for that! +long_description = DESCRIPTION + +# Load the package's VERSION file as a dictionary. +about = {} +ROOT_DIR = Path(__file__).resolve().parent +REQUIREMENTS_DIR = ROOT_DIR / 'requirements' +PACKAGE_DIR = ROOT_DIR / 'classification_model' +with open(PACKAGE_DIR / "VERSION") as f: + _version = f.read().strip() + about["__version__"] = _version + + +# What packages are required for this module to be executed? +def list_reqs(fname="requirements.txt"): + with open(REQUIREMENTS_DIR / fname) as fd: + return fd.read().splitlines() + +# Where the magic happens: +setup( + name=NAME, + version=about["__version__"], + description=DESCRIPTION, + long_description=long_description, + long_description_content_type="text/markdown", + author=AUTHOR, + author_email=EMAIL, + python_requires=REQUIRES_PYTHON, + url=URL, + packages=find_packages(exclude=("tests",)), + package_data={"classification_model": ["VERSION"]}, + install_requires=list_reqs(), + extras_require={}, + include_package_data=True, + license="BSD-3", + classifiers=[ + # Trove classifiers + # Full list: https://pypi.python.org/pypi?%3Aaction=list_classifiers + "License :: OSI Approved :: MIT License", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: Implementation :: CPython", + ], +) \ No newline at end of file diff --git a/my-assignement-section-05/tests/__init__.py b/my-assignement-section-05/tests/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/my-assignement-section-05/tests/conftest.py b/my-assignement-section-05/tests/conftest.py new file mode 100644 index 000000000..8b09b0996 --- /dev/null +++ b/my-assignement-section-05/tests/conftest.py @@ -0,0 +1,9 @@ +import pytest + +from classification_model.config.core import config +from classification_model.processing.data_manager import load_dataset + + +@pytest.fixture() +def sample_input_data(): + return load_dataset(file_name=config.app_config.data_file) diff --git a/my-assignement-section-05/tests/test_features.py b/my-assignement-section-05/tests/test_features.py new file mode 100644 index 000000000..051980ea7 --- /dev/null +++ b/my-assignement-section-05/tests/test_features.py @@ -0,0 +1,14 @@ +from classification_model.config.core import config +from classification_model.processing.features import ExtractLetterTransformer + + +def test_name_transformer(sample_input_data): + # Given + transformer = ExtractLetterTransformer(variables=config.model_config.cabin) + assert sample_input_data["cabin"].iat[0] == "B5" + + # When + subject = transformer.fit_transform(sample_input_data) + + # Then + assert subject["cabin"].iat[0] == "B" diff --git a/my-assignement-section-05/tests/test_prediction.py b/my-assignement-section-05/tests/test_prediction.py new file mode 100644 index 000000000..3950222d2 --- /dev/null +++ b/my-assignement-section-05/tests/test_prediction.py @@ -0,0 +1,29 @@ +import numpy as np +from sklearn.metrics import accuracy_score +from sklearn.model_selection import train_test_split + +from classification_model.config.core import config +from classification_model.predict import make_prediction + + +def test_make_prediction(sample_input_data): + expected_no_predictions = 131 + + X_train, X_test, y_train, y_test = train_test_split( + sample_input_data[config.model_config.features], # predictors + sample_input_data[config.model_config.target], + test_size=config.model_config.test_size, + # we are setting the random seed here + # for reproducibility + random_state=config.model_config.random_state, + ) + + result = make_prediction(input_data=X_test) + predictions = result.get("predictions") + assert isinstance(predictions, np.ndarray) + assert isinstance(predictions[0], np.int64) + assert result.get("errors") is None + assert len(predictions) == expected_no_predictions + _predictions = list(predictions) + accuracy = accuracy_score(y_test, _predictions) + assert accuracy > 0.7 diff --git a/my-assignement-section-05/tox.ini b/my-assignement-section-05/tox.ini new file mode 100644 index 000000000..2581112c2 --- /dev/null +++ b/my-assignement-section-05/tox.ini @@ -0,0 +1,56 @@ +# Tox is a generic virtualenv management and test command line tool. Its goal is to +# standardize testing in Python. We will be using it extensively in this course. + +# Using Tox we can (on multiple operating systems): +# + Eliminate PYTHONPATH challenges when running scripts/tests +# + Eliminate virtualenv setup confusion +# + Streamline steps such as model training, model publishing + + +[tox] +min_version = 4 +envlist = test_package, checks +skipsdist = True + +[testenv] +basepython = python3.9 +install_command = pip install {opts} {packages} +allowlist_externals = train + +setenv = + PYTHONPATH=. + PYTHONHASHSEED=0 + +[testenv:test_package] +envdir = {toxworkdir}/test_package +deps = + -r{toxinidir}/requirements/test_requirements.txt +commands= + python classification_model/train_pipeline.py + pytest \ + -s \ + -vv \ + {posargs:tests/} + +[testenv:train] +envdir = {toxworkdir}/test_package +deps = + {[testenv:test_package]deps} +commands= + python classification_model/train_pipeline.py + + +[testenv:checks] +envdir = {toxworkdir}/checks +deps = + -r{toxinidir}/requirements/typing_requirements.txt +commands = + flake8 classification_model tests + isort classification_model tests + black classification_model tests + {posargs:mypy classification_model} + + +[flake8] +exclude = .git,env +max-line-length = 100 \ No newline at end of file diff --git a/section-04-research-and-development/01-machine-learning-pipeline-data-analysis.ipynb b/section-04-research-and-development/01-machine-learning-pipeline-data-analysis.ipynb index df3c3c9f1..26972b81c 100644 --- a/section-04-research-and-development/01-machine-learning-pipeline-data-analysis.ipynb +++ b/section-04-research-and-development/01-machine-learning-pipeline-data-analysis.ipynb @@ -72,9 +72,12 @@ }, { "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], + "metadata": { + "ExecuteTime": { + "end_time": "2025-02-27T08:51:51.941397Z", + "start_time": "2025-02-27T08:51:51.927942Z" + } + }, "source": [ "# to handle datasets\n", "import pandas as pd\n", @@ -89,12 +92,28 @@ "\n", "# to display all the columns of the dataframe in the notebook\n", "pd.pandas.set_option('display.max_columns', None)" - ] + ], + "outputs": [], + "execution_count": 1 }, { "cell_type": "code", - "execution_count": 2, - "metadata": {}, + "metadata": { + "ExecuteTime": { + "end_time": "2025-02-27T08:51:52.080786Z", + "start_time": "2025-02-27T08:51:51.965777Z" + } + }, + "source": [ + "# load dataset\n", + "data = pd.read_csv('train.csv')\n", + "\n", + "# rows and columns of the data\n", + "print(data.shape)\n", + "\n", + "# visualise the dataset\n", + "data.head()" + ], "outputs": [ { "name": "stdout", @@ -105,6 +124,98 @@ }, { "data": { + "text/plain": [ + " Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape \\\n", + "0 1 60 RL 65.0 8450 Pave NaN Reg \n", + "1 2 20 RL 80.0 9600 Pave NaN Reg \n", + "2 3 60 RL 68.0 11250 Pave NaN IR1 \n", + "3 4 70 RL 60.0 9550 Pave NaN IR1 \n", + "4 5 60 RL 84.0 14260 Pave NaN IR1 \n", + "\n", + " LandContour Utilities LotConfig LandSlope Neighborhood Condition1 \\\n", + "0 Lvl AllPub Inside Gtl CollgCr Norm \n", + "1 Lvl AllPub FR2 Gtl Veenker Feedr \n", + "2 Lvl AllPub Inside Gtl CollgCr Norm \n", + "3 Lvl AllPub Corner Gtl Crawfor Norm \n", + "4 Lvl AllPub FR2 Gtl NoRidge Norm \n", + "\n", + " Condition2 BldgType HouseStyle OverallQual OverallCond YearBuilt \\\n", + "0 Norm 1Fam 2Story 7 5 2003 \n", + "1 Norm 1Fam 1Story 6 8 1976 \n", + "2 Norm 1Fam 2Story 7 5 2001 \n", + "3 Norm 1Fam 2Story 7 5 1915 \n", + "4 Norm 1Fam 2Story 8 5 2000 \n", + "\n", + " YearRemodAdd RoofStyle RoofMatl Exterior1st Exterior2nd MasVnrType \\\n", + "0 2003 Gable CompShg VinylSd VinylSd BrkFace \n", + "1 1976 Gable CompShg MetalSd MetalSd None \n", + "2 2002 Gable CompShg VinylSd VinylSd BrkFace \n", + "3 1970 Gable CompShg Wd Sdng Wd Shng None \n", + "4 2000 Gable CompShg VinylSd VinylSd BrkFace \n", + "\n", + " MasVnrArea ExterQual ExterCond Foundation BsmtQual BsmtCond BsmtExposure \\\n", + "0 196.0 Gd TA PConc Gd TA No \n", + "1 0.0 TA TA CBlock Gd TA Gd \n", + "2 162.0 Gd TA PConc Gd TA Mn \n", + "3 0.0 TA TA BrkTil TA Gd No \n", + "4 350.0 Gd TA PConc Gd TA Av \n", + "\n", + " BsmtFinType1 BsmtFinSF1 BsmtFinType2 BsmtFinSF2 BsmtUnfSF TotalBsmtSF \\\n", + "0 GLQ 706 Unf 0 150 856 \n", + "1 ALQ 978 Unf 0 284 1262 \n", + "2 GLQ 486 Unf 0 434 920 \n", + "3 ALQ 216 Unf 0 540 756 \n", + "4 GLQ 655 Unf 0 490 1145 \n", + "\n", + " Heating HeatingQC CentralAir Electrical 1stFlrSF 2ndFlrSF LowQualFinSF \\\n", + "0 GasA Ex Y SBrkr 856 854 0 \n", + "1 GasA Ex Y SBrkr 1262 0 0 \n", + "2 GasA Ex Y SBrkr 920 866 0 \n", + "3 GasA Gd Y SBrkr 961 756 0 \n", + "4 GasA Ex Y SBrkr 1145 1053 0 \n", + "\n", + " GrLivArea BsmtFullBath BsmtHalfBath FullBath HalfBath BedroomAbvGr \\\n", + "0 1710 1 0 2 1 3 \n", + "1 1262 0 1 2 0 3 \n", + "2 1786 1 0 2 1 3 \n", + "3 1717 1 0 1 0 3 \n", + "4 2198 1 0 2 1 4 \n", + "\n", + " KitchenAbvGr KitchenQual TotRmsAbvGrd Functional Fireplaces FireplaceQu \\\n", + "0 1 Gd 8 Typ 0 NaN \n", + "1 1 TA 6 Typ 1 TA \n", + "2 1 Gd 6 Typ 1 TA \n", + "3 1 Gd 7 Typ 1 Gd \n", + "4 1 Gd 9 Typ 1 TA \n", + "\n", + " GarageType GarageYrBlt GarageFinish GarageCars GarageArea GarageQual \\\n", + "0 Attchd 2003.0 RFn 2 548 TA \n", + "1 Attchd 1976.0 RFn 2 460 TA \n", + "2 Attchd 2001.0 RFn 2 608 TA \n", + "3 Detchd 1998.0 Unf 3 642 TA \n", + "4 Attchd 2000.0 RFn 3 836 TA \n", + "\n", + " GarageCond PavedDrive WoodDeckSF OpenPorchSF EnclosedPorch 3SsnPorch \\\n", + "0 TA Y 0 61 0 0 \n", + "1 TA Y 298 0 0 0 \n", + "2 TA Y 0 42 0 0 \n", + "3 TA Y 0 35 272 0 \n", + "4 TA Y 192 84 0 0 \n", + "\n", + " ScreenPorch PoolArea PoolQC Fence MiscFeature MiscVal MoSold YrSold \\\n", + "0 0 0 NaN NaN NaN 0 2 2008 \n", + "1 0 0 NaN NaN NaN 0 5 2007 \n", + "2 0 0 NaN NaN NaN 0 9 2008 \n", + "3 0 0 NaN NaN NaN 0 2 2006 \n", + "4 0 0 NaN NaN NaN 0 12 2008 \n", + "\n", + " SaleType SaleCondition SalePrice \n", + "0 WD Normal 208500 \n", + "1 WD Normal 181500 \n", + "2 WD Normal 223500 \n", + "3 WD Abnorml 140000 \n", + "4 WD Normal 250000 " + ], "text/html": [ "
\n", "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IdMSSubClassMSZoningLotFrontageLotAreaStreetAlleyLotShapeLandContourUtilities...ScreenPorchPoolAreaPoolQCFenceMiscFeatureMiscValMoSoldYrSoldSaleTypeSaleCondition
0146120RH80.011622PaveNaNRegLvlAllPub...1200NaNMnPrvNaN062010WDNormal
1146220RL81.014267PaveNaNIR1LvlAllPub...00NaNNaNGar21250062010WDNormal
2146360RL74.013830PaveNaNIR1LvlAllPub...00NaNMnPrvNaN032010WDNormal
3146460RL78.09978PaveNaNIR1LvlAllPub...00NaNNaNNaN062010WDNormal
41465120RL43.05005PaveNaNIR1HLSAllPub...1440NaNNaNNaN012010WDNormal
\n", + "

5 rows × 80 columns

\n", + "
" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "execution_count": 4 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2025-02-28T09:34:37.804587Z", + "start_time": "2025-02-28T09:34:37.788758Z" + } + }, + "cell_type": "code", + "source": [ + "original_columns = ['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',\n", + " 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',\n", + " 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',\n", + " 'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',\n", + " 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',\n", + " 'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',\n", + " 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',\n", + " 'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',\n", + " 'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',\n", + " 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',\n", + " 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',\n", + " 'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',\n", + " 'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',\n", + " 'GarageCond', 'PavedDrive', 'WoodDeckSF', 'OpenPorchSF',\n", + " 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'PoolQC',\n", + " 'Fence', 'MiscFeature', 'MiscVal', 'MoSold', 'YrSold', 'SaleType',\n", + " 'SaleCondition']\n", + "\n", + "test_columns = test_set.columns\n", + "\n", + "for column in original_columns:\n", + " if column not in test_columns:\n", + " print(f\" {column} is in original but not in test \")\n", + " \n", + "for column in test_columns:\n", + " if column not in original_columns:\n", + " print(f\" {column} is in test but not in original\")\n", + "\n", + " " + ], + "outputs": [], + "execution_count": 5 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2025-02-28T09:34:37.947902Z", + "start_time": "2025-02-28T09:34:37.916150Z" + } + }, + "cell_type": "code", + "source": [ + "selected_features = pd.read_csv('selected_features.csv')\n", + "selected_features = selected_features.iloc[:,0].to_list()\n", + "test_set.drop(['Id'],axis=1, inplace=True)\n", + "cat_vars = [var for var in test_set.columns if test_set[var].dtype == 'O']\n", + "cat_vars = cat_vars + ['MSSubClass']\n", + "num_vars = [\n", + " var for var in test_set.columns if var not in cat_vars and var != 'SalePrice'\n", + "]\n", + "test_set[cat_vars] = test_set[cat_vars].astype('O')" + ], + "outputs": [], + "execution_count": 6 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2025-02-28T09:34:38.168587Z", + "start_time": "2025-02-28T09:34:38.147555Z" + } + }, + "cell_type": "code", + "source": "test_set['MSSubClass'].dtype\n", + "outputs": [ + { + "data": { + "text/plain": [ + "dtype('O')" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "execution_count": 7 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2025-02-28T09:34:38.422377Z", + "start_time": "2025-02-28T09:34:38.406632Z" + } + }, + "cell_type": "code", + "source": "selected_features", + "outputs": [ + { + "data": { + "text/plain": [ + "['MSSubClass',\n", + " 'MSZoning',\n", + " 'LotFrontage',\n", + " 'LotShape',\n", + " 'LandContour',\n", + " 'LotConfig',\n", + " 'Neighborhood',\n", + " 'OverallQual',\n", + " 'OverallCond',\n", + " 'YearRemodAdd',\n", + " 'RoofStyle',\n", + " 'Exterior1st',\n", + " 'ExterQual',\n", + " 'Foundation',\n", + " 'BsmtQual',\n", + " 'BsmtExposure',\n", + " 'BsmtFinType1',\n", + " 'HeatingQC',\n", + " 'CentralAir',\n", + " '1stFlrSF',\n", + " '2ndFlrSF',\n", + " 'GrLivArea',\n", + " 'BsmtFullBath',\n", + " 'HalfBath',\n", + " 'KitchenQual',\n", + " 'TotRmsAbvGrd',\n", + " 'Functional',\n", + " 'Fireplaces',\n", + " 'FireplaceQu',\n", + " 'GarageFinish',\n", + " 'GarageCars',\n", + " 'GarageArea',\n", + " 'PavedDrive',\n", + " 'WoodDeckSF',\n", + " 'ScreenPorch',\n", + " 'SaleCondition']" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "execution_count": 8 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2025-02-28T09:34:38.616887Z", + "start_time": "2025-02-28T09:34:38.585226Z" + } + }, + "cell_type": "code", + "source": "test_set.head()", + "outputs": [ + { + "data": { + "text/plain": [ + " MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape LandContour \\\n", + "0 20 RH 80.0 11622 Pave NaN Reg Lvl \n", + "1 20 RL 81.0 14267 Pave NaN IR1 Lvl \n", + "2 60 RL 74.0 13830 Pave NaN IR1 Lvl \n", + "3 60 RL 78.0 9978 Pave NaN IR1 Lvl \n", + "4 120 RL 43.0 5005 Pave NaN IR1 HLS \n", + "\n", + " Utilities LotConfig ... ScreenPorch PoolArea PoolQC Fence MiscFeature \\\n", + "0 AllPub Inside ... 120 0 NaN MnPrv NaN \n", + "1 AllPub Corner ... 0 0 NaN NaN Gar2 \n", + "2 AllPub Inside ... 0 0 NaN MnPrv NaN \n", + "3 AllPub Inside ... 0 0 NaN NaN NaN \n", + "4 AllPub Inside ... 144 0 NaN NaN NaN \n", + "\n", + " MiscVal MoSold YrSold SaleType SaleCondition \n", + "0 0 6 2010 WD Normal \n", + "1 12500 6 2010 WD Normal \n", + "2 0 3 2010 WD Normal \n", + "3 0 6 2010 WD Normal \n", + "4 0 1 2010 WD Normal \n", + "\n", + "[5 rows x 79 columns]" + ], + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
MSSubClassMSZoningLotFrontageLotAreaStreetAlleyLotShapeLandContourUtilitiesLotConfig...ScreenPorchPoolAreaPoolQCFenceMiscFeatureMiscValMoSoldYrSoldSaleTypeSaleCondition
020RH80.011622PaveNaNRegLvlAllPubInside...1200NaNMnPrvNaN062010WDNormal
120RL81.014267PaveNaNIR1LvlAllPubCorner...00NaNNaNGar21250062010WDNormal
260RL74.013830PaveNaNIR1LvlAllPubInside...00NaNMnPrvNaN032010WDNormal
360RL78.09978PaveNaNIR1LvlAllPubInside...00NaNNaNNaN062010WDNormal
4120RL43.05005PaveNaNIR1HLSAllPubInside...1440NaNNaNNaN012010WDNormal
\n", + "

5 rows × 79 columns

\n", + "
" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "execution_count": 9 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2025-02-28T09:34:38.784140Z", + "start_time": "2025-02-28T09:34:38.755700Z" + } + }, + "cell_type": "code", + "source": "test_set.isna().sum()", + "outputs": [ + { + "data": { + "text/plain": [ + "MSSubClass 0\n", + "MSZoning 4\n", + "LotFrontage 227\n", + "LotArea 0\n", + "Street 0\n", + " ... \n", + "MiscVal 0\n", + "MoSold 0\n", + "YrSold 0\n", + "SaleType 1\n", + "SaleCondition 0\n", + "Length: 79, dtype: int64" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "execution_count": 10 + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": "Categorical Variables" + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2025-02-28T09:34:39.080868Z", + "start_time": "2025-02-28T09:34:39.043818Z" + } + }, + "cell_type": "code", + "source": [ + "cat_vars_with_na = [\n", + " var for var in cat_vars\n", + " if test_set[var].isnull().sum() > 0\n", + "]\n", + "\n", + "# print percentage of missing values per variable\n", + "test_set[cat_vars_with_na].isnull().mean().sort_values(ascending=False)\n" + ], + "outputs": [ + { + "data": { + "text/plain": [ + "PoolQC 0.997944\n", + "MiscFeature 0.965045\n", + "Alley 0.926662\n", + "Fence 0.801234\n", + "FireplaceQu 0.500343\n", + "GarageCond 0.053461\n", + "GarageQual 0.053461\n", + "GarageFinish 0.053461\n", + "GarageType 0.052090\n", + "BsmtCond 0.030843\n", + "BsmtQual 0.030158\n", + "BsmtExposure 0.030158\n", + "BsmtFinType1 0.028787\n", + "BsmtFinType2 0.028787\n", + "MasVnrType 0.010966\n", + "MSZoning 0.002742\n", + "Functional 0.001371\n", + "Utilities 0.001371\n", + "KitchenQual 0.000685\n", + "Exterior2nd 0.000685\n", + "Exterior1st 0.000685\n", + "SaleType 0.000685\n", + "dtype: float64" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "execution_count": 11 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2025-02-28T09:34:39.700752Z", + "start_time": "2025-02-28T09:34:39.686933Z" + } + }, + "cell_type": "code", + "source": [ + "new_vars_with_nan = [var for var in cat_vars_with_na if var not in ['Alley',\n", + " 'MasVnrType',\n", + " 'BsmtQual',\n", + " 'BsmtCond',\n", + " 'BsmtExposure',\n", + " 'BsmtFinType1',\n", + " 'BsmtFinType2',\n", + " 'Electrical',\n", + " 'FireplaceQu',\n", + " 'GarageType',\n", + " 'GarageFinish',\n", + " 'GarageQual',\n", + " 'GarageCond',\n", + " 'PoolQC',\n", + " 'Fence',\n", + " 'MiscFeature']]\n", + "print(new_vars_with_nan)" + ], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['MSZoning', 'Utilities', 'Exterior1st', 'Exterior2nd', 'KitchenQual', 'Functional', 'SaleType']\n" + ] + } + ], + "execution_count": 12 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2025-02-28T09:34:40.325490Z", + "start_time": "2025-02-28T09:34:40.303894Z" + } + }, + "cell_type": "code", + "source": [ + "new_with_string_missing = [var for var in new_vars_with_nan if test_set[var].isnull().mean()>0.1]\n", + "new_with_frequent_category = [var for var in new_vars_with_nan if test_set[var].isnull().mean()<=0.1]\n", + "old_with_string_missing = [var for var in cat_vars_with_na if var in ['Alley', 'FireplaceQu', 'PoolQC', 'Fence', 'MiscFeature']]\n", + "old_with_frequent_category = [var for var in cat_vars_with_na if var in ['MasVnrType',\n", + " 'BsmtQual',\n", + " 'BsmtCond',\n", + " 'BsmtExposure',\n", + " 'BsmtFinType1',\n", + " 'BsmtFinType2',\n", + " 'Electrical',\n", + " 'GarageType',\n", + " 'GarageFinish',\n", + " 'GarageQual',\n", + " 'GarageCond']]\n", + "with_string_missing = old_with_string_missing + new_with_string_missing\n", + "with_frequent_category = old_with_frequent_category + new_with_frequent_category\n", + "\n", + "print(with_string_missing)\n", + "print(with_frequent_category)" + ], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['Alley', 'FireplaceQu', 'PoolQC', 'Fence', 'MiscFeature']\n", + "['MasVnrType', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'MSZoning', 'Utilities', 'Exterior1st', 'Exterior2nd', 'KitchenQual', 'Functional', 'SaleType']\n" + ] + } + ], + "execution_count": 13 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2025-02-28T09:34:40.647242Z", + "start_time": "2025-02-28T09:34:40.623237Z" + } + }, + "cell_type": "code", + "source": [ + "most_frequent_category = {'MasVnrType': 'None',' BsmtCond': 'TA' ,'BsmtQual':'TA', 'BsmtExposure':'No', 'BsmtFinType1':'Unf','BsmtFinType2':'Unf', 'Electrical': 'SBrkr',\n", + " 'GarageType': 'Attchd','GarageFinish':'Unf', 'GarageQual': 'TA', 'GarageCond': 'TA'}" + ], + "outputs": [], + "execution_count": 14 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2025-02-28T09:34:40.809136Z", + "start_time": "2025-02-28T09:34:40.777251Z" + } + }, + "cell_type": "code", + "source": [ + "test_set[with_string_missing] = test_set[with_string_missing].fillna('Missing')\n", + "for var in with_frequent_category:\n", + " mode = most_frequent_category.get(var,test_set[var].mode()[0])\n", + " test_set[var].fillna(mode, inplace=True)\n", + " " + ], + "outputs": [], + "execution_count": 15 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2025-02-28T09:34:40.882047Z", + "start_time": "2025-02-28T09:34:40.856586Z" + } + }, + "cell_type": "code", + "source": "test_set[cat_vars_with_na].isnull().sum()", + "outputs": [ + { + "data": { + "text/plain": [ + "MSZoning 0\n", + "Alley 0\n", + "Utilities 0\n", + "Exterior1st 0\n", + "Exterior2nd 0\n", + "MasVnrType 0\n", + "BsmtQual 0\n", + "BsmtCond 0\n", + "BsmtExposure 0\n", + "BsmtFinType1 0\n", + "BsmtFinType2 0\n", + "KitchenQual 0\n", + "Functional 0\n", + "FireplaceQu 0\n", + "GarageType 0\n", + "GarageFinish 0\n", + "GarageQual 0\n", + "GarageCond 0\n", + "PoolQC 0\n", + "Fence 0\n", + "MiscFeature 0\n", + "SaleType 0\n", + "dtype: int64" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "execution_count": 16 + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": "Numerical variables" + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2025-02-28T09:34:41.132263Z", + "start_time": "2025-02-28T09:34:41.095241Z" + } + }, + "cell_type": "code", + "source": [ + "num_vars_with_na = [\n", + " var for var in num_vars\n", + " if test_set[var].isnull().sum() > 0\n", + "]\n", + "test_set[num_vars_with_na].isnull().mean()" + ], + "outputs": [ + { + "data": { + "text/plain": [ + "LotFrontage 0.155586\n", + "MasVnrArea 0.010281\n", + "BsmtFinSF1 0.000685\n", + "BsmtFinSF2 0.000685\n", + "BsmtUnfSF 0.000685\n", + "TotalBsmtSF 0.000685\n", + "BsmtFullBath 0.001371\n", + "BsmtHalfBath 0.001371\n", + "GarageYrBlt 0.053461\n", + "GarageCars 0.000685\n", + "GarageArea 0.000685\n", + "dtype: float64" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "execution_count": 17 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2025-02-28T09:34:41.311744Z", + "start_time": "2025-02-28T09:34:41.280353Z" + } + }, + "cell_type": "code", + "source": "means = {'LotFrontage': 69.87974098057354, 'MasVnrArea':103.7974006116208,'GarageYrBlt':1978.2959677419356 }", + "outputs": [], + "execution_count": 18 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2025-02-28T09:34:41.421801Z", + "start_time": "2025-02-28T09:34:41.390407Z" + } + }, + "cell_type": "code", + "source": [ + "for var in num_vars_with_na:\n", + " mean_val = means.get(var, test_set[var].mean())\n", + " test_set[var + '_na'] = np.where(test_set[var].isnull(), 1, 0)\n", + " test_set[var].fillna(mean_val, inplace=True)\n", + " \n", + "# check that we have no more missing values in the engineered variables\n", + "test_set[num_vars_with_na].isnull().sum()" + ], + "outputs": [ + { + "data": { + "text/plain": [ + "LotFrontage 0\n", + "MasVnrArea 0\n", + "BsmtFinSF1 0\n", + "BsmtFinSF2 0\n", + "BsmtUnfSF 0\n", + "TotalBsmtSF 0\n", + "BsmtFullBath 0\n", + "BsmtHalfBath 0\n", + "GarageYrBlt 0\n", + "GarageCars 0\n", + "GarageArea 0\n", + "dtype: int64" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "execution_count": 19 + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": "Temporal variables" + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2025-02-28T09:34:41.612086Z", + "start_time": "2025-02-28T09:34:41.600130Z" + } + }, + "cell_type": "code", + "source": [ + "def elapsed_years(df, var):\n", + " # capture difference between the year variable\n", + " # and the year in which the house was sold\n", + " df[var] = test_set['YrSold'] - df[var]\n", + " return df" + ], + "outputs": [], + "execution_count": 20 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2025-02-28T09:34:41.722730Z", + "start_time": "2025-02-28T09:34:41.691017Z" + } + }, + "cell_type": "code", + "source": [ + "for var in ['YearBuilt', 'YearRemodAdd', 'GarageYrBlt']:\n", + " test_set = elapsed_years(test_set, var)\n", + " \n", + "# now we drop YrSold\n", + "test_set.drop(['YrSold'], axis=1, inplace=True)\n" + ], + "outputs": [], + "execution_count": 21 + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": "Logarithmic Transformation" + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2025-02-28T09:34:41.801939Z", + "start_time": "2025-02-28T09:34:41.783926Z" + } + }, + "cell_type": "code", + "source": [ + "for var in [\"LotFrontage\", \"1stFlrSF\", \"GrLivArea\"]:\n", + " test_set[var] = np.log(test_set[var])" + ], + "outputs": [], + "execution_count": 22 + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": "Yeo johnson transformation" + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2025-02-28T09:34:41.926695Z", + "start_time": "2025-02-28T09:34:41.910836Z" + } + }, + "cell_type": "code", + "source": [ + "param = -12.55283001172003\n", + "test_set['LotArea'] = stats.yeojohnson(test_set['LotArea'], lmbda=param)\n" + ], + "outputs": [], + "execution_count": 23 + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": "Binarize skewed variables" + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2025-02-28T09:34:42.053448Z", + "start_time": "2025-02-28T09:34:42.035614Z" + } + }, + "cell_type": "code", + "source": [ + "skewed = [\n", + " 'BsmtFinSF2', 'LowQualFinSF', 'EnclosedPorch',\n", + " '3SsnPorch', 'ScreenPorch', 'MiscVal'\n", + "]\n", + "\n", + "for var in skewed:\n", + " # map the variable values into 0 and 1\n", + " test_set[var] = np.where(test_set[var]==0, 0, 1)\n" + ], + "outputs": [], + "execution_count": 24 + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": "Categorical mappings" + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2025-02-28T09:34:42.147008Z", + "start_time": "2025-02-28T09:34:42.118477Z" + } + }, + "cell_type": "code", + "source": [ + "qual_mappings = {'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5, 'Missing': 0, 'NA': 0}\n", + "\n", + "qual_vars = ['ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond',\n", + " 'HeatingQC', 'KitchenQual', 'FireplaceQu',\n", + " 'GarageQual', 'GarageCond',\n", + " ]\n", + "for var in qual_vars:\n", + " test_set[var] = test_set[var].map(qual_mappings)\n" + ], + "outputs": [], + "execution_count": 25 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2025-02-28T09:34:42.180209Z", + "start_time": "2025-02-28T09:34:42.164752Z" + } + }, + "cell_type": "code", + "source": [ + "exposure_mappings = {'No': 1, 'Mn': 2, 'Av': 3, 'Gd': 4}\n", + "\n", + "var = 'BsmtExposure'\n", + "test_set[var] = test_set[var].map(exposure_mappings)\n" + ], + "outputs": [], + "execution_count": 26 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2025-02-28T09:34:42.227803Z", + "start_time": "2025-02-28T09:34:42.211972Z" + } + }, + "cell_type": "code", + "source": [ + "finish_mappings = {'Missing': 0, 'NA': 0, 'Unf': 1, 'LwQ': 2, 'Rec': 3, 'BLQ': 4, 'ALQ': 5, 'GLQ': 6}\n", + "\n", + "finish_vars = ['BsmtFinType1', 'BsmtFinType2']\n", + "for var in finish_vars:\n", + " test_set[var] = test_set[var].map(finish_mappings)\n" + ], + "outputs": [], + "execution_count": 27 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2025-02-28T09:34:42.279548Z", + "start_time": "2025-02-28T09:34:42.263551Z" + } + }, + "cell_type": "code", + "source": [ + "garage_mappings = {'Missing': 0, 'NA': 0, 'Unf': 1, 'RFn': 2, 'Fin': 3}\n", + "\n", + "var = 'GarageFinish'\n", + "test_set[var] = test_set[var].map(garage_mappings)" + ], + "outputs": [], + "execution_count": 28 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2025-02-28T09:34:42.336038Z", + "start_time": "2025-02-28T09:34:42.311330Z" + } + }, + "cell_type": "code", + "source": [ + "fence_mappings = {'Missing': 0, 'NA': 0, 'MnWw': 1, 'GdWo': 2, 'MnPrv': 3, 'GdPrv': 4}\n", + "\n", + "var = 'Fence'\n", + "test_set[var] = test_set[var].map(fence_mappings)" + ], + "outputs": [], + "execution_count": 29 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2025-02-28T09:34:42.415518Z", + "start_time": "2025-02-28T09:34:42.367763Z" + } + }, + "cell_type": "code", + "source": "[var for var in test_set.columns if test_set[var].isnull().sum() > 0]", + "outputs": [ + { + "data": { + "text/plain": [ + "[]" + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], + "execution_count": 30 + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": "Removing rare labels" + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2025-02-28T09:34:42.478975Z", + "start_time": "2025-02-28T09:34:42.451553Z" + } + }, + "cell_type": "code", + "source": [ + "qual_vars = qual_vars + finish_vars + ['BsmtExposure', 'GarageFinish', 'Fence']\n", + "\n", + "# capture the remaining categorical variables\n", + "# (those that we did not re-map)\n", + "\n", + "cat_others = [\n", + " var for var in cat_vars if var not in qual_vars\n", + "]\n", + "\n", + "len(cat_others)\n" + ], + "outputs": [ + { + "data": { + "text/plain": [ + "30" + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "execution_count": 31 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2025-02-28T09:34:42.542368Z", + "start_time": "2025-02-28T09:34:42.526578Z" + } + }, + "cell_type": "code", + "source": "print(cat_others)", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'Foundation', 'Heating', 'CentralAir', 'Electrical', 'Functional', 'GarageType', 'PavedDrive', 'PoolQC', 'MiscFeature', 'SaleType', 'SaleCondition', 'MSSubClass']\n" + ] + } + ], + "execution_count": 32 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2025-02-28T09:34:42.652908Z", + "start_time": "2025-02-28T09:34:42.605618Z" + } + }, + "cell_type": "code", + "source": "test_set.head()", + "outputs": [ + { + "data": { + "text/plain": [ + " MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape \\\n", + "0 20 RH 4.382027 0.079663 Pave Missing Reg \n", + "1 20 RL 4.394449 0.079663 Pave Missing IR1 \n", + "2 60 RL 4.304065 0.079663 Pave Missing IR1 \n", + "3 60 RL 4.356709 0.079663 Pave Missing IR1 \n", + "4 120 RL 3.761200 0.079663 Pave Missing IR1 \n", + "\n", + " LandContour Utilities LotConfig ... MasVnrArea_na BsmtFinSF1_na \\\n", + "0 Lvl AllPub Inside ... 0 0 \n", + "1 Lvl AllPub Corner ... 0 0 \n", + "2 Lvl AllPub Inside ... 0 0 \n", + "3 Lvl AllPub Inside ... 0 0 \n", + "4 HLS AllPub Inside ... 0 0 \n", + "\n", + " BsmtFinSF2_na BsmtUnfSF_na TotalBsmtSF_na BsmtFullBath_na BsmtHalfBath_na \\\n", + "0 0 0 0 0 0 \n", + "1 0 0 0 0 0 \n", + "2 0 0 0 0 0 \n", + "3 0 0 0 0 0 \n", + "4 0 0 0 0 0 \n", + "\n", + " GarageYrBlt_na GarageCars_na GarageArea_na \n", + "0 0 0 0 \n", + "1 0 0 0 \n", + "2 0 0 0 \n", + "3 0 0 0 \n", + "4 0 0 0 \n", + "\n", + "[5 rows x 89 columns]" + ], + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
MSSubClassMSZoningLotFrontageLotAreaStreetAlleyLotShapeLandContourUtilitiesLotConfig...MasVnrArea_naBsmtFinSF1_naBsmtFinSF2_naBsmtUnfSF_naTotalBsmtSF_naBsmtFullBath_naBsmtHalfBath_naGarageYrBlt_naGarageCars_naGarageArea_na
020RH4.3820270.079663PaveMissingRegLvlAllPubInside...0000000000
120RL4.3944490.079663PaveMissingIR1LvlAllPubCorner...0000000000
260RL4.3040650.079663PaveMissingIR1LvlAllPubInside...0000000000
360RL4.3567090.079663PaveMissingIR1LvlAllPubInside...0000000000
4120RL3.7612000.079663PaveMissingIR1HLSAllPubInside...0000000000
\n", + "

5 rows × 89 columns

\n", + "
" + ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + } + ], + "execution_count": 33 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2025-02-28T09:34:42.795123Z", + "start_time": "2025-02-28T09:34:42.779108Z" + } + }, + "cell_type": "code", + "source": "type(test_set['MSSubClass'][0])", + "outputs": [ + { + "data": { + "text/plain": [ + "int" + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + } + ], + "execution_count": 34 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2025-02-28T09:34:42.983422Z", + "start_time": "2025-02-28T09:34:42.954128Z" + } + }, + "cell_type": "code", + "source": "frequent_labels ={'MSZoning': ['FV', 'RH', 'RL', 'RM'], 'Street': ['Pave'], 'Alley': ['Grvl', 'Missing', 'Pave'], 'LotShape': ['IR1', 'IR2', 'Reg'], 'LandContour': ['Bnk', 'HLS', 'Low', 'Lvl'], 'Utilities': ['AllPub'], 'LotConfig': ['Corner', 'CulDSac', 'FR2', 'Inside'], 'LandSlope': ['Gtl', 'Mod'], 'Neighborhood': ['Blmngtn', 'BrDale', 'BrkSide', 'ClearCr', 'CollgCr', 'Crawfor', 'Edwards', 'Gilbert', 'IDOTRR', 'MeadowV', 'Mitchel', 'NAmes', 'NWAmes', 'NoRidge', 'NridgHt', 'OldTown', 'SWISU', 'Sawyer', 'SawyerW', 'Somerst', 'StoneBr', 'Timber'], 'Condition1': ['Artery', 'Feedr', 'Norm', 'PosN', 'RRAn'], 'Condition2': ['Norm'], 'BldgType': ['1Fam', '2fmCon', 'Duplex', 'Twnhs', 'TwnhsE'], 'HouseStyle': ['1.5Fin', '1Story', '2Story', 'SFoyer', 'SLvl'], 'RoofStyle': ['Gable', 'Hip'], 'RoofMatl': ['CompShg'], 'Exterior1st': ['AsbShng', 'BrkFace', 'CemntBd', 'HdBoard', 'MetalSd', 'Plywood', 'Stucco', 'VinylSd', 'Wd Sdng', 'WdShing'], 'Exterior2nd': ['AsbShng', 'BrkFace', 'CmentBd', 'HdBoard', 'MetalSd', 'Plywood', 'Stucco', 'VinylSd', 'Wd Sdng', 'Wd Shng'], 'MasVnrType': ['BrkFace', 'None', 'Stone'], 'Foundation': ['BrkTil', 'CBlock', 'PConc', 'Slab'], 'Heating': ['GasA', 'GasW'], 'CentralAir': ['N', 'Y'], 'Electrical': ['FuseA', 'FuseF', 'SBrkr'], 'Functional': ['Min1', 'Min2', 'Mod', 'Typ'], 'GarageType': ['Attchd', 'Basment', 'BuiltIn', 'Detchd'], 'PavedDrive': ['N', 'P', 'Y'], 'PoolQC': ['Missing'], 'MiscFeature': ['Missing', 'Shed'], 'SaleType': ['COD', 'New', 'WD'], 'SaleCondition': ['Abnorml', 'Family', 'Normal', 'Partial'], 'MSSubClass': ['20', '30', '50', '60', '70', '75', '80', '85', '90', '120', '160', '190']}", + "outputs": [], + "execution_count": 35 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2025-02-28T09:34:43.157454Z", + "start_time": "2025-02-28T09:34:43.125685Z" + } + }, + "cell_type": "code", + "source": [ + "for var in cat_others:\n", + " # find the frequent categories\n", + " frequent_ls = frequent_labels[var]\n", + "\n", + " # replace rare categories by the string \"Rare\"\n", + " test_set[var] = np.where(test_set[var].isin(\n", + " frequent_ls), test_set[var], 'Rare')" + ], + "outputs": [], + "execution_count": 36 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2025-02-28T09:34:43.426357Z", + "start_time": "2025-02-28T09:34:43.405722Z" + } + }, + "cell_type": "code", + "source": "test_set['MSSubClass'].dtype\n", + "outputs": [ + { + "data": { + "text/plain": [ + "dtype('O')" + ] + }, + "execution_count": 37, + "metadata": {}, + "output_type": "execute_result" + } + ], + "execution_count": 37 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2025-02-28T09:34:43.535647Z", + "start_time": "2025-02-28T09:34:43.504526Z" + } + }, + "cell_type": "code", + "source": "test_set.head()", + "outputs": [ + { + "data": { + "text/plain": [ + " MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape \\\n", + "0 Rare RH 4.382027 0.079663 Pave Missing Reg \n", + "1 Rare RL 4.394449 0.079663 Pave Missing IR1 \n", + "2 Rare RL 4.304065 0.079663 Pave Missing IR1 \n", + "3 Rare RL 4.356709 0.079663 Pave Missing IR1 \n", + "4 Rare RL 3.761200 0.079663 Pave Missing IR1 \n", + "\n", + " LandContour Utilities LotConfig ... MasVnrArea_na BsmtFinSF1_na \\\n", + "0 Lvl AllPub Inside ... 0 0 \n", + "1 Lvl AllPub Corner ... 0 0 \n", + "2 Lvl AllPub Inside ... 0 0 \n", + "3 Lvl AllPub Inside ... 0 0 \n", + "4 HLS AllPub Inside ... 0 0 \n", + "\n", + " BsmtFinSF2_na BsmtUnfSF_na TotalBsmtSF_na BsmtFullBath_na BsmtHalfBath_na \\\n", + "0 0 0 0 0 0 \n", + "1 0 0 0 0 0 \n", + "2 0 0 0 0 0 \n", + "3 0 0 0 0 0 \n", + "4 0 0 0 0 0 \n", + "\n", + " GarageYrBlt_na GarageCars_na GarageArea_na \n", + "0 0 0 0 \n", + "1 0 0 0 \n", + "2 0 0 0 \n", + "3 0 0 0 \n", + "4 0 0 0 \n", + "\n", + "[5 rows x 89 columns]" + ], + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
MSSubClassMSZoningLotFrontageLotAreaStreetAlleyLotShapeLandContourUtilitiesLotConfig...MasVnrArea_naBsmtFinSF1_naBsmtFinSF2_naBsmtUnfSF_naTotalBsmtSF_naBsmtFullBath_naBsmtHalfBath_naGarageYrBlt_naGarageCars_naGarageArea_na
0RareRH4.3820270.079663PaveMissingRegLvlAllPubInside...0000000000
1RareRL4.3944490.079663PaveMissingIR1LvlAllPubCorner...0000000000
2RareRL4.3040650.079663PaveMissingIR1LvlAllPubInside...0000000000
3RareRL4.3567090.079663PaveMissingIR1LvlAllPubInside...0000000000
4RareRL3.7612000.079663PaveMissingIR1HLSAllPubInside...0000000000
\n", + "

5 rows × 89 columns

\n", + "
" + ] + }, + "execution_count": 38, + "metadata": {}, + "output_type": "execute_result" + } + ], + "execution_count": 38 + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": "encoding of categorical variables" + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2025-02-28T09:34:43.709922Z", + "start_time": "2025-02-28T09:34:43.682445Z" + } + }, + "cell_type": "code", + "source": "encodings = {'MSZoning': {'Rare': 0, 'RM': 1, 'RH': 2, 'RL': 3, 'FV': 4}, 'Street': {'Rare': 0, 'Pave': 1}, 'Alley': {'Grvl': 0, 'Pave': 1, 'Missing': 2}, 'LotShape': {'Reg': 0, 'IR1': 1, 'Rare': 2, 'IR2': 3}, 'LandContour': {'Bnk': 0, 'Lvl': 1, 'Low': 2, 'HLS': 3}, 'Utilities': {'Rare': 0, 'AllPub': 1}, 'LotConfig': {'Inside': 0, 'FR2': 1, 'Corner': 2, 'Rare': 3, 'CulDSac': 4}, 'LandSlope': {'Gtl': 0, 'Mod': 1, 'Rare': 2}, 'Neighborhood': {'IDOTRR': 0, 'MeadowV': 1, 'BrDale': 2, 'Edwards': 3, 'BrkSide': 4, 'OldTown': 5, 'Sawyer': 6, 'SWISU': 7, 'NAmes': 8, 'Mitchel': 9, 'SawyerW': 10, 'Rare': 11, 'NWAmes': 12, 'Gilbert': 13, 'Blmngtn': 14, 'CollgCr': 15, 'Crawfor': 16, 'ClearCr': 17, 'Somerst': 18, 'Timber': 19, 'StoneBr': 20, 'NridgHt': 21, 'NoRidge': 22}, 'Condition1': {'Artery': 0, 'Feedr': 1, 'Norm': 2, 'RRAn': 3, 'Rare': 4, 'PosN': 5}, 'Condition2': {'Rare': 0, 'Norm': 1}, 'BldgType': {'2fmCon': 0, 'Duplex': 1, 'Twnhs': 2, '1Fam': 3, 'TwnhsE': 4}, 'HouseStyle': {'SFoyer': 0, '1.5Fin': 1, 'Rare': 2, '1Story': 3, 'SLvl': 4, '2Story': 5}, 'RoofStyle': {'Gable': 0, 'Rare': 1, 'Hip': 2}, 'RoofMatl': {'CompShg': 0, 'Rare': 1}, 'Exterior1st': {'AsbShng': 0, 'Wd Sdng': 1, 'WdShing': 2, 'MetalSd': 3, 'Stucco': 4, 'Rare': 5, 'HdBoard': 6, 'Plywood': 7, 'BrkFace': 8, 'CemntBd': 9, 'VinylSd': 10}, 'Exterior2nd': {'AsbShng': 0, 'Wd Sdng': 1, 'MetalSd': 2, 'Wd Shng': 3, 'Stucco': 4, 'Rare': 5, 'HdBoard': 6, 'Plywood': 7, 'BrkFace': 8, 'CmentBd': 9, 'VinylSd': 10}, 'MasVnrType': {'Rare': 0, 'None': 1, 'BrkFace': 2, 'Stone': 3}, 'Foundation': {'Slab': 0, 'BrkTil': 1, 'CBlock': 2, 'Rare': 3, 'PConc': 4}, 'Heating': {'Rare': 0, 'GasW': 1, 'GasA': 2}, 'CentralAir': {'N': 0, 'Y': 1}, 'Electrical': {'Rare': 0, 'FuseF': 1, 'FuseA': 2, 'SBrkr': 3}, 'Functional': {'Rare': 0, 'Min2': 1, 'Mod': 2, 'Min1': 3, 'Typ': 4}, 'GarageType': {'Rare': 0, 'Detchd': 1, 'Basment': 2, 'Attchd': 3, 'BuiltIn': 4}, 'PavedDrive': {'N': 0, 'P': 1, 'Y': 2}, 'PoolQC': {'Missing': 0, 'Rare': 1}, 'MiscFeature': {'Rare': 0, 'Shed': 1, 'Missing': 2}, 'SaleType': {'COD': 0, 'Rare': 1, 'WD': 2, 'New': 3}, 'SaleCondition': {'Rare': 0, 'Abnorml': 1, 'Family': 2, 'Normal': 3, 'Partial': 4}, 'MSSubClass': {30: 0, 'Rare': 1, 190: 2, 90: 3, 160: 4, 50: 5, 85: 6, 70: 7, 80: 8, 20: 9, 75: 10, 120: 11, 60: 12}}", + "outputs": [], + "execution_count": 39 + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": "" + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2025-02-28T09:34:43.852628Z", + "start_time": "2025-02-28T09:34:43.809211Z" + } + }, + "cell_type": "code", + "source": [ + "for var in cat_others:\n", + " test_set[var] = test_set[var].map(encodings[var])" + ], + "outputs": [], + "execution_count": 40 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2025-02-28T09:34:44.199531Z", + "start_time": "2025-02-28T09:34:44.167859Z" + } + }, + "cell_type": "code", + "source": "[var for var in test_set.columns if test_set[var].isnull().sum() > 0]\n", + "outputs": [ + { + "data": { + "text/plain": [ + "[]" + ] + }, + "execution_count": 41, + "metadata": {}, + "output_type": "execute_result" + } + ], + "execution_count": 41 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2025-02-28T09:34:44.500292Z", + "start_time": "2025-02-28T09:34:44.338393Z" + } + }, + "cell_type": "code", + "source": "scaler = joblib.load('minmax_scaler.joblib')", + "outputs": [], + "execution_count": 42 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2025-02-28T09:34:44.550680Z", + "start_time": "2025-02-28T09:34:44.536642Z" + } + }, + "cell_type": "code", + "source": "test_set.shape", + "outputs": [ + { + "data": { + "text/plain": [ + "(1459, 89)" + ] + }, + "execution_count": 43, + "metadata": {}, + "output_type": "execute_result" + } + ], + "execution_count": 43 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2025-02-28T09:34:44.625725Z", + "start_time": "2025-02-28T09:34:44.601228Z" + } + }, + "cell_type": "code", + "source": [ + "original_columns = ['MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street', 'Alley',\n", + " 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope',\n", + " 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle',\n", + " 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'RoofStyle',\n", + " 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'MasVnrArea',\n", + " 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond',\n", + " 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2',\n", + " 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating', 'HeatingQC',\n", + " 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',\n", + " 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',\n", + " 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd',\n", + " 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType', 'GarageYrBlt',\n", + " 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond',\n", + " 'PavedDrive', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch',\n", + " 'ScreenPorch', 'PoolArea', 'PoolQC', 'Fence', 'MiscFeature', 'MiscVal',\n", + " 'MoSold', 'SaleType', 'SaleCondition', 'LotFrontage_na',\n", + " 'MasVnrArea_na', 'GarageYrBlt_na']\n", + "test_columns = test_set.columns\n", + "\n", + "in_original_not_in_test = []\n", + "in_test_not_in_original = []\n", + "\n", + "for column in original_columns:\n", + " if column not in test_columns:\n", + " in_original_not_in_test.append(column)\n", + " \n", + "for column in test_columns:\n", + " if column not in original_columns:\n", + " in_test_not_in_original.append(column)\n", + " \n", + "print(in_original_not_in_test)\n", + "print(in_test_not_in_original)\n" + ], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[]\n", + "['BsmtFinSF1_na', 'BsmtFinSF2_na', 'BsmtUnfSF_na', 'TotalBsmtSF_na', 'BsmtFullBath_na', 'BsmtHalfBath_na', 'GarageCars_na', 'GarageArea_na']\n" + ] + } + ], + "execution_count": 44 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2025-02-28T09:34:44.829451Z", + "start_time": "2025-02-28T09:34:44.744867Z" + } + }, + "cell_type": "code", + "source": "test_set.head()", + "outputs": [ + { + "data": { + "text/plain": [ + " MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape \\\n", + "0 1 2 4.382027 0.079663 1 2 0 \n", + "1 1 3 4.394449 0.079663 1 2 1 \n", + "2 1 3 4.304065 0.079663 1 2 1 \n", + "3 1 3 4.356709 0.079663 1 2 1 \n", + "4 1 3 3.761200 0.079663 1 2 1 \n", + "\n", + " LandContour Utilities LotConfig ... MasVnrArea_na BsmtFinSF1_na \\\n", + "0 1 1 0 ... 0 0 \n", + "1 1 1 2 ... 0 0 \n", + "2 1 1 0 ... 0 0 \n", + "3 1 1 0 ... 0 0 \n", + "4 3 1 0 ... 0 0 \n", + "\n", + " BsmtFinSF2_na BsmtUnfSF_na TotalBsmtSF_na BsmtFullBath_na \\\n", + "0 0 0 0 0 \n", + "1 0 0 0 0 \n", + "2 0 0 0 0 \n", + "3 0 0 0 0 \n", + "4 0 0 0 0 \n", + "\n", + " BsmtHalfBath_na GarageYrBlt_na GarageCars_na GarageArea_na \n", + "0 0 0 0 0 \n", + "1 0 0 0 0 \n", + "2 0 0 0 0 \n", + "3 0 0 0 0 \n", + "4 0 0 0 0 \n", + "\n", + "[5 rows x 89 columns]" + ], + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
MSSubClassMSZoningLotFrontageLotAreaStreetAlleyLotShapeLandContourUtilitiesLotConfig...MasVnrArea_naBsmtFinSF1_naBsmtFinSF2_naBsmtUnfSF_naTotalBsmtSF_naBsmtFullBath_naBsmtHalfBath_naGarageYrBlt_naGarageCars_naGarageArea_na
0124.3820270.079663120110...0000000000
1134.3944490.079663121112...0000000000
2134.3040650.079663121110...0000000000
3134.3567090.079663121110...0000000000
4133.7612000.079663121310...0000000000
\n", + "

5 rows × 89 columns

\n", + "
" + ] + }, + "execution_count": 45, + "metadata": {}, + "output_type": "execute_result" + } + ], + "execution_count": 45 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2025-02-28T09:34:45.157808Z", + "start_time": "2025-02-28T09:34:45.110692Z" + } + }, + "cell_type": "code", + "source": [ + "test_set = test_set.drop(columns = in_test_not_in_original)\n", + "test_set =pd.DataFrame(scaler.transform(test_set), columns = test_set.columns)[selected_features]" + ], + "outputs": [], + "execution_count": 46 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2025-02-28T09:34:45.363330Z", + "start_time": "2025-02-28T09:34:45.284334Z" + } + }, + "cell_type": "code", + "source": "test_set.head()", + "outputs": [ + { + "data": { + "text/plain": [ + " MSSubClass MSZoning LotFrontage LotShape LandContour LotConfig \\\n", + "0 0.083333 0.50 0.495064 0.000000 0.333333 0.0 \n", + "1 0.083333 0.75 0.499662 0.333333 0.333333 0.5 \n", + "2 0.083333 0.75 0.466207 0.333333 0.333333 0.0 \n", + "3 0.083333 0.75 0.485693 0.333333 0.333333 0.0 \n", + "4 0.083333 0.75 0.265271 0.333333 1.000000 0.0 \n", + "\n", + " Neighborhood OverallQual OverallCond YearRemodAdd ... Functional \\\n", + "0 0.363636 0.444444 0.625 0.819672 ... 1.0 \n", + "1 0.363636 0.555556 0.625 0.868852 ... 1.0 \n", + "2 0.590909 0.444444 0.500 0.213115 ... 1.0 \n", + "3 0.590909 0.555556 0.625 0.213115 ... 1.0 \n", + "4 0.909091 0.777778 0.500 0.311475 ... 1.0 \n", + "\n", + " Fireplaces FireplaceQu GarageFinish GarageCars GarageArea PavedDrive \\\n", + "0 0.000000 0.0 0.0 0.25 0.514810 1.0 \n", + "1 0.000000 0.0 0.0 0.25 0.220028 1.0 \n", + "2 0.333333 0.6 1.0 0.50 0.339915 1.0 \n", + "3 0.333333 0.8 1.0 0.50 0.331453 1.0 \n", + "4 0.000000 0.0 0.5 0.50 0.356841 1.0 \n", + "\n", + " WoodDeckSF ScreenPorch SaleCondition \n", + "0 0.163361 1.0 0.75 \n", + "1 0.458576 0.0 0.75 \n", + "2 0.247375 0.0 0.75 \n", + "3 0.420070 0.0 0.75 \n", + "4 0.000000 1.0 0.75 \n", + "\n", + "[5 rows x 36 columns]" + ], + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
MSSubClassMSZoningLotFrontageLotShapeLandContourLotConfigNeighborhoodOverallQualOverallCondYearRemodAdd...FunctionalFireplacesFireplaceQuGarageFinishGarageCarsGarageAreaPavedDriveWoodDeckSFScreenPorchSaleCondition
00.0833330.500.4950640.0000000.3333330.00.3636360.4444440.6250.819672...1.00.0000000.00.00.250.5148101.00.1633611.00.75
10.0833330.750.4996620.3333330.3333330.50.3636360.5555560.6250.868852...1.00.0000000.00.00.250.2200281.00.4585760.00.75
20.0833330.750.4662070.3333330.3333330.00.5909090.4444440.5000.213115...1.00.3333330.61.00.500.3399151.00.2473750.00.75
30.0833330.750.4856930.3333330.3333330.00.5909090.5555560.6250.213115...1.00.3333330.81.00.500.3314531.00.4200700.00.75
40.0833330.750.2652710.3333331.0000000.00.9090910.7777780.5000.311475...1.00.0000000.00.50.500.3568411.00.0000001.00.75
\n", + "

5 rows × 36 columns

\n", + "
" + ] + }, + "execution_count": 47, + "metadata": {}, + "output_type": "execute_result" + } + ], + "execution_count": 47 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2025-02-28T09:34:45.971970Z", + "start_time": "2025-02-28T09:34:45.502223Z" + } + }, + "cell_type": "code", + "source": "lin_model = joblib.load('linear_regression.joblib')", + "outputs": [], + "execution_count": 48 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2025-02-28T09:34:46.130650Z", + "start_time": "2025-02-28T09:34:46.114639Z" + } + }, + "cell_type": "code", + "source": "predictions = lin_model.predict(test_set)", + "outputs": [], + "execution_count": 49 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2025-02-28T09:34:47.709356Z", + "start_time": "2025-02-28T09:34:46.360427Z" + } + }, + "cell_type": "code", + "source": "pd.Series(np.exp(predictions)).hist(bins=50)", + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 50, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "text/plain": [ + "
" + ], + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAjAAAAGdCAYAAAAMm0nCAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuNCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8QVMy6AAAACXBIWXMAAA9hAAAPYQGoP6dpAAAk4UlEQVR4nO3dfXBU1f3H8c8m2WwSYBMeJA8aJFYUBZUKEqP2QVmIigrKqGjaoehIrcGK6WihP0FA2yC1SKEI2iqMMyJqW9ARjGSCQtUYIIKC0IgtFqc2oYrJApF1Sc7vD8sd10RJ4G52z/J+zTCy5569+937XZKP5+7d9RhjjAAAACySFOsCAAAAOosAAwAArEOAAQAA1iHAAAAA6xBgAACAdQgwAADAOgQYAABgHQIMAACwTkqsCzgWra2t+vjjj9WjRw95PJ5YlwMAADrAGKP9+/crLy9PSUnHt4ZiZYD5+OOPlZ+fH+syAADAMfjoo490yimnHNc+rAwwPXr0kPTlAfD7/TGuJj6Fw2GtXbtWo0aNktfrjXU5+B/6En/oSfyhJ/HHrZ4Eg0Hl5+c7v8ePh5UB5shpI7/fT4D5BuFwWBkZGfL7/fwAiCP0Jf7Qk/hDT+KP2z1x4+0fvIkXAABYhwADAACsQ4ABAADWIcAAAADrEGAAAIB1CDAAAMA6BBgAAGAdAgwAALAOAQYAAFiHAAMAAKxDgAEAANYhwAAAAOsQYAAAgHUIMAAAwDopsS4AJ67+U1cfdc6Hc0Z3QSUAANuwAgMAAKxDgAEAANYhwAAAAOsQYAAAgHUIMAAAwDoEGAAAYB0uo0ancfkzACDWWIEBAADWIcAAAADrEGAAAIB1CDAAAMA6BBgAAGAdAgwAALAOAQYAAFiHAAMAAKxDgAEAANYhwAAAAOsQYAAAgHUIMAAAwDoEGAAAYB0CDAAAsA4BBgAAWIcAAwAArEOAAQAA1iHAAAAA6xBgAACAdQgwAADAOgQYAABgnZRYF4DE1H/q6liXAABIYKzAAAAA6xBgAACAdTiFhAic+gEA2IAVGAAAYJ1OB5gNGzbo6quvVl5enjwej1atWhWx3RijGTNmKDc3V+np6QoEAtq1a1fEnH379qmkpER+v19ZWVm69dZbdeDAgeN6IgAA4MTR6QBz8OBBnXfeeVq0aFG72+fOnasFCxZoyZIlqqmpUbdu3VRcXKxDhw45c0pKSvTee++psrJSL730kjZs2KBJkyYd+7MAAAAnlE6/B+aKK67QFVdc0e42Y4zmz5+v++67T2PGjJEkPfXUU8rOztaqVas0fvx47dy5UxUVFdq0aZOGDRsmSVq4cKGuvPJKPfzww8rLyzuOpwMAAE4Err6Jd/fu3aqvr1cgEHDGMjMzVVhYqOrqao0fP17V1dXKyspywoskBQIBJSUlqaamRtdee22b/YZCIYVCIed2MBiUJIXDYYXDYTefQsI4clw6e3x8ySYa5RyzROvvsfYF0UNP4g89iT9u9cTNnroaYOrr6yVJ2dnZEePZ2dnOtvr6evXt2zeyiJQU9erVy5nzdeXl5Zo1a1ab8bVr1yojI8ON0hNWZWVlp+bPHR6lQo7RmjVrYl1CVHS2L4g+ehJ/6En8Od6eNDc3u1SJJZdRT5s2TWVlZc7tYDCo/Px8jRo1Sn6/P4aVxa9wOKzKykqNHDlSXq+3w/cbPPOVKFbVedtnFse6BFcda18QPfQk/tCT+ONWT46cQXGDqwEmJydHktTQ0KDc3FxnvKGhQUOGDHHm7N27N+J+hw8f1r59+5z7f53P55PP52sz7vV6eXEfRWePUajFE8VqOi9R+8trN/7Qk/hDT+LP8fbEzX66+jkwBQUFysnJUVVVlTMWDAZVU1OjoqIiSVJRUZEaGxtVW1vrzFm3bp1aW1tVWFjoZjkAACBBdXoF5sCBA/rggw+c27t379bWrVvVq1cv9evXT1OmTNGDDz6oAQMGqKCgQNOnT1deXp7Gjh0rSTrrrLN0+eWX67bbbtOSJUsUDoc1efJkjR8/niuQAABAh3Q6wGzevFmXXnqpc/vIe1MmTJigZcuW6d5779XBgwc1adIkNTY26pJLLlFFRYXS0tKc+zz99NOaPHmyRowYoaSkJI0bN04LFixw4ekAAIATQacDzA9/+EMZ882X2no8Hs2ePVuzZ8/+xjm9evXS8uXLO/vQAAAAkvguJAAAYCECDAAAsA4BBgAAWIcAAwAArEOAAQAA1iHAAAAA6xBgAACAdQgwAADAOgQYAABgHQIMAACwDgEGAABYhwADAACsQ4ABAADWIcAAAADrEGAAAIB1CDAAAMA6BBgAAGAdAgwAALAOAQYAAFiHAAMAAKxDgAEAANYhwAAAAOsQYAAAgHUIMAAAwDoEGAAAYB0CDAAAsA4BBgAAWIcAAwAArEOAAQAA1iHAAAAA6xBgAACAdQgwAADAOgQYAABgHQIMAACwDgEGAABYhwADAACsQ4ABAADWIcAAAADrEGAAAIB1CDAAAMA6BBgAAGAdAgwAALAOAQYAAFiHAAMAAKxDgAEAANYhwAAAAOsQYAAAgHUIMAAAwDoEGAAAYB0CDAAAsA4BBgAAWIcAAwAArON6gGlpadH06dNVUFCg9PR0fec739EDDzwgY4wzxxijGTNmKDc3V+np6QoEAtq1a5fbpQAAgATleoB56KGHtHjxYv3hD3/Qzp079dBDD2nu3LlauHChM2fu3LlasGCBlixZopqaGnXr1k3FxcU6dOiQ2+UAAIAElOL2Dt98802NGTNGo0ePliT1799fzzzzjDZu3Cjpy9WX+fPn67777tOYMWMkSU899ZSys7O1atUqjR8/3u2SAABAgnE9wFx00UV6/PHH9f777+uMM87QO++8o9dff13z5s2TJO3evVv19fUKBALOfTIzM1VYWKjq6up2A0woFFIoFHJuB4NBSVI4HFY4HHb7KSSEI8els8fHl2yOPqkLJVp/j7UviB56En/oSfxxqydu9tT1ADN16lQFg0ENHDhQycnJamlp0a9//WuVlJRIkurr6yVJ2dnZEffLzs52tn1deXm5Zs2a1WZ87dq1ysjIcPkZJJbKyspOzZ87PEqFHKM1a9bEuoSo6GxfEH30JP7Qk/hzvD1pbm52qZIoBJjnnntOTz/9tJYvX65BgwZp69atmjJlivLy8jRhwoRj2ue0adNUVlbm3A4Gg8rPz9eoUaPk9/vdKj2hhMNhVVZWauTIkfJ6vR2+3+CZr0Sxqs7bPrM41iW46lj7guihJ/GHnsQft3py5AyKG1wPMPfcc4+mTp3qnAo655xz9K9//Uvl5eWaMGGCcnJyJEkNDQ3Kzc117tfQ0KAhQ4a0u0+fzyefz9dm3Ov18uI+is4eo1CLJ4rVdF6i9pfXbvyhJ/GHnsSf4+2Jm/10PcA0NzcrKSny4qbk5GS1trZKkgoKCpSTk6OqqionsASDQdXU1OhnP/uZ2+UAkqT+U1cfdc6Hc0Z3QSUAADe4HmCuvvpq/frXv1a/fv00aNAgbdmyRfPmzdMtt9wiSfJ4PJoyZYoefPBBDRgwQAUFBZo+fbry8vI0duxYt8sBAAAJyPUAs3DhQk2fPl133HGH9u7dq7y8PP30pz/VjBkznDn33nuvDh48qEmTJqmxsVGXXHKJKioqlJaW5nY5AAAgAbkeYHr06KH58+dr/vz53zjH4/Fo9uzZmj17ttsPj2/RkdMoAADYgO9CAgAA1iHAAAAA67h+CglwE1cPAQDawwoMAACwDgEGAABYhwADAACsQ4ABAADWIcAAAADrEGAAAIB1CDAAAMA6BBgAAGAdAgwAALAOAQYAAFiHAAMAAKxDgAEAANbhyxyBTuDLJQEgPrACAwAArEOAAQAA1uEUEvA/HTk9BACID6zAAAAA6xBgAACAdTiFBOtx6gcATjyswAAAAOsQYAAAgHUIMAAAwDoEGAAAYB0CDAAAsA4BBgAAWIcAAwAArEOAAQAA1iHAAAAA6xBgAACAdQgwAADAOgQYAABgHQIMAACwDgEGAABYhwADAACsQ4ABAADWIcAAAADrEGAAAIB1CDAAAMA6BBgAAGAdAgwAALAOAQYAAFiHAAMAAKxDgAEAANYhwAAAAOsQYAAAgHUIMAAAwDoEGAAAYB0CDAAAsA4BBgAAWCcqAebf//63fvSjH6l3795KT0/XOeeco82bNzvbjTGaMWOGcnNzlZ6erkAgoF27dkWjFAAAkIBcDzCfffaZLr74Ynm9Xr388svasWOHfve736lnz57OnLlz52rBggVasmSJampq1K1bNxUXF+vQoUNulwMAABJQits7fOihh5Sfn6+lS5c6YwUFBc7fjTGaP3++7rvvPo0ZM0aS9NRTTyk7O1urVq3S+PHj3S4JAAAkGNcDzIsvvqji4mJdf/31Wr9+vU4++WTdcccduu222yRJu3fvVn19vQKBgHOfzMxMFRYWqrq6ut0AEwqFFAqFnNvBYFCSFA6HFQ6H3X4KCeHIcfnq8fElm1iVc0L5ttdke31BbNGT+ENP4o9bPXGzpx5jjKu/1dLS0iRJZWVluv7667Vp0ybdddddWrJkiSZMmKA333xTF198sT7++GPl5uY697vhhhvk8Xj07LPPttnnzJkzNWvWrDbjy5cvV0ZGhpvlAwCAKGlubtbNN9+spqYm+f3+49qX6wEmNTVVw4YN05tvvumM/fznP9emTZtUXV19TAGmvRWY/Px8ffLJJ8d9ABLF4JmvRNz2JRk9MKxV0zcnKdTqiVFVJ6btM4u/cVs4HFZlZaVGjhwpr9fbhVXhm9CT+ENP4o9bPQkGg+rTp48rAcb1U0i5ubk6++yzI8bOOuss/eUvf5Ek5eTkSJIaGhoiAkxDQ4OGDBnS7j59Pp98Pl+bca/Xy4v7f0It7YeUUKvnG7chOjrymuS1G3/oSfyhJ/HneHviZj9dvwrp4osvVl1dXcTY+++/r1NPPVXSl2/ozcnJUVVVlbM9GAyqpqZGRUVFbpcDAAASkOsrMHfffbcuuugi/eY3v9ENN9ygjRs36vHHH9fjjz8uSfJ4PJoyZYoefPBBDRgwQAUFBZo+fbry8vI0duxYt8sBAAAJyPUAc8EFF2jlypWaNm2aZs+erYKCAs2fP18lJSXOnHvvvVcHDx7UpEmT1NjYqEsuuUQVFRXOG4ABAAC+jesBRpKuuuoqXXXVVd+43ePxaPbs2Zo9e3Y0Hh4AACQ4vgsJAABYhwADAACsQ4ABAADWIcAAAADrEGAAAIB1CDAAAMA6BBgAAGAdAgwAALAOAQYAAFiHAAMAAKxDgAEAANYhwAAAAOsQYAAAgHUIMAAAwDoEGAAAYB0CDAAAsA4BBgAAWIcAAwAArEOAAQAA1iHAAAAA6xBgAACAdQgwAADAOgQYAABgHQIMAACwDgEGAABYhwADAACsQ4ABAADWIcAAAADrEGAAAIB1CDAAAMA6BBgAAGAdAgwAALAOAQYAAFiHAAMAAKxDgAEAANYhwAAAAOsQYAAAgHUIMAAAwDoEGAAAYB0CDAAAsE5KrAsATkSDZ76iUIvnW+d8OGd0F1UDAPZhBQYAAFiHAAMAAKzDKSQgTvWfuvqoczjNBOBExQoMAACwDgEGAABYhwADAACsQ4ABAADWIcAAAADrEGAAAIB1CDAAAMA6UQ8wc+bMkcfj0ZQpU5yxQ4cOqbS0VL1791b37t01btw4NTQ0RLsUAACQIKL6QXabNm3SY489pnPPPTdi/O6779bq1av1/PPPKzMzU5MnT9Z1112nN954I5rlWKsjH2iG+PFt/fIlG80d3oXFAECCitoKzIEDB1RSUqI//vGP6tmzpzPe1NSkJ554QvPmzdNll12moUOHaunSpXrzzTf11ltvRascAACQQKIWYEpLSzV69GgFAoGI8draWoXD4YjxgQMHql+/fqquro5WOQAAIIFE5RTSihUr9Pbbb2vTpk1tttXX1ys1NVVZWVkR49nZ2aqvr293f6FQSKFQyLkdDAYlSeFwWOFw2L3C45Qv2XT+Pkkm4r+ID2735UR4/UfbkWPIsYwf9CT+uNUTN3vqeoD56KOPdNddd6myslJpaWmu7LO8vFyzZs1qM7527VplZGS48hjx7HjeM/HAsFb3CoFr3OrLmjVrXNkPpMrKyliXgK+hJ/HneHvS3NzsUiWSxxjj6v+ir1q1Stdee62Sk5OdsZaWFnk8HiUlJemVV15RIBDQZ599FrEKc+qpp2rKlCm6++672+yzvRWY/Px8ffLJJ/L7/W6WH5cGz3yl0/fxJRk9MKxV0zcnKdTqiUJVOBZu92X7zGIXqjqxhcNhVVZWauTIkfJ6vbEuB6In8citngSDQfXp00dNTU3H/fvb9RWYESNGaNu2bRFjEydO1MCBA/XLX/5S+fn58nq9qqqq0rhx4yRJdXV12rNnj4qKitrdp8/nk8/nazPu9XpPiBd3qOXYf9GFWj3HdX9Eh1t9ORFe/13lRPl5YhN6En+Otydu9tP1ANOjRw8NHjw4Yqxbt27q3bu3M37rrbeqrKxMvXr1kt/v15133qmioiJdeOGFbpcDAAASUFQ/B+abPPLII0pKStK4ceMUCoVUXFysRx99NBalAAAAC3VJgHnttdcibqelpWnRokVatGhRVzw8AABIMHwXEgAAsA4BBgAAWIcAAwAArEOAAQAA1iHAAAAA6xBgAACAdQgwAADAOgQYAABgHQIMAACwTky+SgCAO/pPXX3UOR/OGd0FlQBA12IFBgAAWIcAAwAArEOAAQAA1iHAAAAA6xBgAACAdQgwAADAOgQYAABgHQIMAACwDgEGAABYh0/iBRIcn9YLIBGxAgMAAKxDgAEAANYhwAAAAOsQYAAAgHUIMAAAwDoEGAAAYB0CDAAAsA4BBgAAWIcPsgPAh90BsA4rMAAAwDoEGAAAYB0CDAAAsA4BBgAAWIcAAwAArEOAAQAA1iHAAAAA6xBgAACAdQgwAADAOgQYAABgHQIMAACwDgEGAABYhwADAACsQ4ABAADWIcAAAADrpMS6gBNd/6mrY10C4JqOvJ4/nDO6CyoBkOhYgQEAANYhwAAAAOtwCglAl+I0EwA3sAIDAACsQ4ABAADWIcAAAADrEGAAAIB1XA8w5eXluuCCC9SjRw/17dtXY8eOVV1dXcScQ4cOqbS0VL1791b37t01btw4NTQ0uF0KAABIUK5fhbR+/XqVlpbqggsu0OHDh/WrX/1Ko0aN0o4dO9StWzdJ0t13363Vq1fr+eefV2ZmpiZPnqzrrrtOb7zxhtvlAHAJH7oIIJ64HmAqKioibi9btkx9+/ZVbW2tvv/976upqUlPPPGEli9frssuu0yStHTpUp111ll66623dOGFF7pdEgAASDBR/xyYpqYmSVKvXr0kSbW1tQqHwwoEAs6cgQMHql+/fqqurm43wIRCIYVCIed2MBiUJIXDYYXD4WiWH3W+ZBOd/SaZiP8iPtCXjunKf9dHHsv2nyWJhJ7EH7d64mZPPcaYqP0kbW1t1TXXXKPGxka9/vrrkqTly5dr4sSJEYFEkoYPH65LL71UDz30UJv9zJw5U7NmzWozvnz5cmVkZESneAAA4Krm5mbdfPPNampqkt/vP659RXUFprS0VNu3b3fCy7GaNm2aysrKnNvBYFD5+fkaNWrUcR+AWBs885Wo7NeXZPTAsFZN35ykUKsnKo+BzqMvHbN9ZnGXPVY4HFZlZaVGjhwpr9fbZY+Lb0ZP4o9bPTlyBsUNUQswkydP1ksvvaQNGzbolFNOccZzcnL0xRdfqLGxUVlZWc54Q0ODcnJy2t2Xz+eTz+drM+71eq1/cYdaovtLLNTqifpjoPPoy7eLxb/rRPh5kmjoSfw53p642U/XL6M2xmjy5MlauXKl1q1bp4KCgojtQ4cOldfrVVVVlTNWV1enPXv2qKioyO1yAABAAnJ9Baa0tFTLly/XCy+8oB49eqi+vl6SlJmZqfT0dGVmZurWW29VWVmZevXqJb/frzvvvFNFRUVcgQQAADrE9QCzePFiSdIPf/jDiPGlS5fqJz/5iSTpkUceUVJSksaNG6dQKKTi4mI9+uijbpcCAAASlOsBpiMXNaWlpWnRokVatGiR2w8PAABOAFH/HBgA6KyOfOrvh3NGd0ElAOIVX+YIAACsQ4ABAADWIcAAAADrEGAAAIB1CDAAAMA6BBgAAGAdAgwAALAOAQYAAFiHD7IDkLA68oF4ux4Y1QWVAHAbKzAAAMA6BBgAAGAdTiEBgAv4/iaga7ECAwAArEOAAQAA1uEUEgArdeSUDYDExQoMAACwDgEGAABYhwADAACsw3tgjhGXTAIAEDuswAAAAOsQYAAAgHU4hQTghDZ45iuaO/zL/4ZaPLEuB0AHsQIDAACsQ4ABAADWIcAAAADrEGAAAIB1CDAAAMA6XIUEAF3ErS+g5EMyAVZgAACAhQgwAADAOpxCaodby7xu7QcAAERiBQYAAFiHAAMAAKxDgAEAANYhwAAAAOsQYAAAgHW4CgkALNORKxz5sDskOlZgAACAdQgwAADAOgQYAABgHd4DAwAnKL5cEjZjBQYAAFiHAAMAAKzDKSQAQNRx6TfcxgoMAACwDgEGAABYh1NIAJCA3LrCKFFxSst+rMAAAADrEGAAAIB1OIUEAIgLR07r+JKN5g6XBs98RaEWT6f349apH04zxTdWYAAAgHViGmAWLVqk/v37Ky0tTYWFhdq4cWMsywEAAJaI2SmkZ599VmVlZVqyZIkKCws1f/58FRcXq66uTn379o1VWQCAToq3K57irZ54lAinx2K2AjNv3jzddtttmjhxos4++2wtWbJEGRkZevLJJ2NVEgAAsERMVmC++OIL1dbWatq0ac5YUlKSAoGAqqur28wPhUIKhULO7aamJknSvn37FA6HXa8v5fBB1/fZ1VJajZqbW5USTlJLa+ffBIfooC/xh57EH5t68umnn8a6hGPSkd9zX31u4XBYzc3N+vTTT+X1eo/5cffv3y9JMsYc8z6OiEmA+eSTT9TS0qLs7OyI8ezsbP39739vM7+8vFyzZs1qM15QUBC1GhPBzbEuAO2iL/GHnsQfW3rS53exriB6ovnc9u/fr8zMzOPahxWXUU+bNk1lZWXO7dbWVu3bt0+9e/eWxxPf6TxWgsGg8vPz9dFHH8nv98e6HPwPfYk/9CT+0JP441ZPjDHav3+/8vLyjrummASYPn36KDk5WQ0NDRHjDQ0NysnJaTPf5/PJ5/NFjGVlZUWzxITh9/v5ARCH6Ev8oSfxh57EHzd6crwrL0fE5E28qampGjp0qKqqqpyx1tZWVVVVqaioKBYlAQAAi8TsFFJZWZkmTJigYcOGafjw4Zo/f74OHjyoiRMnxqokAABgiZgFmBtvvFH//e9/NWPGDNXX12vIkCGqqKho88ZeHBufz6f777+/zak3xBZ9iT/0JP7Qk/gTjz3xGDeuZQIAAOhCfBcSAACwDgEGAABYhwADAACsQ4ABAADWIcDE0IYNG3T11VcrLy9PHo9Hq1atithujNGMGTOUm5ur9PR0BQIB7dq1K2LOvn37VFJSIr/fr6ysLN166606cOBAxJx3331X3/ve95SWlqb8/HzNnTu3TS3PP/+8Bg4cqLS0NJ1zzjlas2ZNp2uxXXl5uS644AL16NFDffv21dixY1VXVxcx59ChQyotLVXv3r3VvXt3jRs3rs0HMu7Zs0ejR49WRkaG+vbtq3vuuUeHDx+OmPPaa6/p/PPPl8/n0+mnn65ly5a1qWfRokXq37+/0tLSVFhYqI0bN3a6lkSwePFinXvuuc4HaBUVFenll192ttOT2JozZ448Ho+mTJnijNGTrjdz5kx5PJ6IPwMHDnS2J2RPDGJmzZo15v/+7//MX//6VyPJrFy5MmL7nDlzTGZmplm1apV55513zDXXXGMKCgrM559/7sy5/PLLzXnnnWfeeust87e//c2cfvrp5qabbnK2NzU1mezsbFNSUmK2b99unnnmGZOenm4ee+wxZ84bb7xhkpOTzdy5c82OHTvMfffdZ7xer9m2bVunarFdcXGxWbp0qdm+fbvZunWrufLKK02/fv3MgQMHnDm33367yc/PN1VVVWbz5s3mwgsvNBdddJGz/fDhw2bw4MEmEAiYLVu2mDVr1pg+ffqYadOmOXP++c9/moyMDFNWVmZ27NhhFi5caJKTk01FRYUzZ8WKFSY1NdU8+eST5r333jO33XabycrKMg0NDR2uJVG8+OKLZvXq1eb99983dXV15le/+pXxer1m+/btxhh6EksbN240/fv3N+eee6656667nHF60vXuv/9+M2jQIPOf//zH+fPf//7X2Z6IPSHAxImvB5jW1laTk5Njfvvb3zpjjY2NxufzmWeeecYYY8yOHTuMJLNp0yZnzssvv2w8Ho/597//bYwx5tFHHzU9e/Y0oVDImfPLX/7SnHnmmc7tG264wYwePTqinsLCQvPTn/60w7Ukor179xpJZv369caYL5+z1+s1zz//vDNn586dRpKprq42xnwZSpOSkkx9fb0zZ/Hixcbv9zs9uPfee82gQYMiHuvGG280xcXFzu3hw4eb0tJS53ZLS4vJy8sz5eXlHa4lkfXs2dP86U9/oicxtH//fjNgwABTWVlpfvCDHzgBhp7Exv3332/OO++8drclak84hRSndu/erfr6egUCAWcsMzNThYWFqq6uliRVV1crKytLw4YNc+YEAgElJSWppqbGmfP9739fqampzpzi4mLV1dXps88+c+Z89XGOzDnyOB2pJRE1NTVJknr16iVJqq2tVTgcjjgOAwcOVL9+/SJ6cs4550R8IGNxcbGCwaDee+89Z863He8vvvhCtbW1EXOSkpIUCAScOR2pJRG1tLRoxYoVOnjwoIqKiuhJDJWWlmr06NFtjhs9iZ1du3YpLy9Pp512mkpKSrRnzx5JidsTAkycqq+vl6Q2n0ycnZ3tbKuvr1ffvn0jtqekpKhXr14Rc9rbx1cf45vmfHX70WpJNK2trZoyZYouvvhiDR48WNKXxyE1NbXNF4l+/Vgd6/EOBoP6/PPP9cknn6ilpeWoPTlaLYlk27Zt6t69u3w+n26//XatXLlSZ599Nj2JkRUrVujtt99WeXl5m230JDYKCwu1bNkyVVRUaPHixdq9e7e+973vaf/+/Qnbk5h9lQAQz0pLS7V9+3a9/vrrsS4Fks4880xt3bpVTU1N+vOf/6wJEyZo/fr1sS7rhPTRRx/prrvuUmVlpdLS0mJdDv7niiuucP5+7rnnqrCwUKeeeqqee+45paenx7Cy6GEFJk7l5ORIUpt3Zjc0NDjbcnJytHfv3ojthw8f1r59+yLmtLePrz7GN8356vaj1ZJIJk+erJdeekmvvvqqTjnlFGc8JydHX3zxhRobGyPmf/1YHevx9vv9Sk9PV58+fZScnHzUnhytlkSSmpqq008/XUOHDlV5ebnOO+88/f73v6cnMVBbW6u9e/fq/PPPV0pKilJSUrR+/XotWLBAKSkpys7OpidxICsrS2eccYY++OCDhP13QoCJUwUFBcrJyVFVVZUzFgwGVVNTo6KiIklSUVGRGhsbVVtb68xZt26dWltbVVhY6MzZsGGDwuGwM6eyslJnnnmmevbs6cz56uMcmXPkcTpSSyIwxmjy5MlauXKl1q1bp4KCgojtQ4cOldfrjTgOdXV12rNnT0RPtm3bFhEsKysr5ff7dfbZZztzvu14p6amaujQoRFzWltbVVVV5czpSC2JrLW1VaFQiJ7EwIgRI7Rt2zZt3brV+TNs2DCVlJQ4f6cnsXfgwAH94x//UG5ubuL+O+nUW37hqv3795stW7aYLVu2GElm3rx5ZsuWLeZf//qXMebLS5ezsrLMCy+8YN59910zZsyYdi+j/u53v2tqamrM66+/bgYMGBBxGXVjY6PJzs42P/7xj8327dvNihUrTEZGRpvLqFNSUszDDz9sdu7cae6///52L6M+Wi22+9nPfmYyMzPNa6+9FnEpYnNzszPn9ttvN/369TPr1q0zmzdvNkVFRaaoqMjZfuRSxFGjRpmtW7eaiooKc9JJJ7V7KeI999xjdu7caRYtWtTupYg+n88sW7bM7Nixw0yaNMlkZWVFXCFwtFoSxdSpU8369evN7t27zbvvvmumTp1qPB6PWbt2rTGGnsSDr16FZAw9iYVf/OIX5rXXXjO7d+82b7zxhgkEAqZPnz5m7969xpjE7AkBJoZeffVVI6nNnwkTJhhjvrx8efr06SY7O9v4fD4zYsQIU1dXF7GPTz/91Nx0002me/fuxu/3m4kTJ5r9+/dHzHnnnXfMJZdcYnw+nzn55JPNnDlz2tTy3HPPmTPOOMOkpqaaQYMGmdWrV0ds70gttmuvF5LM0qVLnTmff/65ueOOO0zPnj1NRkaGufbaa81//vOfiP18+OGH5oorrjDp6emmT58+5he/+IUJh8MRc1599VUzZMgQk5qaak477bSIxzhi4cKFpl+/fiY1NdUMHz7cvPXWWxHbO1JLIrjlllvMqaeealJTU81JJ51kRowY4YQXY+hJPPh6gKEnXe/GG280ubm5JjU11Zx88snmxhtvNB988IGzPRF74jHGmM6t2QAAAMQW74EBAADWIcAAAADrEGAAAIB1CDAAAMA6BBgAAGAdAgwAALAOAQYAAFiHAAMAAKxDgAEAANYhwAAAAOsQYAAAgHUIMAAAwDr/Dxg7FwrJ17igAAAAAElFTkSuQmCC\n" + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "execution_count": 50 + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": "" + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.2" + }, + "toc": { + "base_numbering": 1, + "nav_menu": {}, + "number_sections": true, + "sideBar": true, + "skip_h1_title": false, + "title_cell": "Table of Contents", + "title_sidebar": "Contents", + "toc_cell": false, + "toc_position": { + "height": "583px", + "left": "0px", + "right": "1324px", + "top": "107px", + "width": "212px" + }, + "toc_section_display": "block", + "toc_window_display": true + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/section-04-research-and-development/06-feature-engineering-with-open-source.ipynb b/section-04-research-and-development/06-feature-engineering-with-open-source.ipynb index 2d25751b3..232e2ae3b 100644 --- a/section-04-research-and-development/06-feature-engineering-with-open-source.ipynb +++ b/section-04-research-and-development/06-feature-engineering-with-open-source.ipynb @@ -20,9 +20,12 @@ }, { "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], + "metadata": { + "ExecuteTime": { + "end_time": "2025-02-28T11:19:36.355906Z", + "start_time": "2025-02-28T11:19:29.833665Z" + } + }, "source": [ "# data manipulation and plotting\n", "import pandas as pd\n", @@ -58,12 +61,28 @@ "\n", "# to visualise al the columns in the dataframe\n", "pd.pandas.set_option('display.max_columns', None)" - ] + ], + "outputs": [], + "execution_count": 1 }, { "cell_type": "code", - "execution_count": 2, - "metadata": {}, + "metadata": { + "ExecuteTime": { + "end_time": "2025-02-28T11:19:36.760614Z", + "start_time": "2025-02-28T11:19:36.461852Z" + } + }, + "source": [ + "# load dataset\n", + "data = pd.read_csv('train.csv')\n", + "\n", + "# rows and columns of the data\n", + "print(data.shape)\n", + "\n", + "# visualise the dataset\n", + "data.head()" + ], "outputs": [ { "name": "stdout", @@ -74,6 +93,98 @@ }, { "data": { + "text/plain": [ + " Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape \\\n", + "0 1 60 RL 65.0 8450 Pave NaN Reg \n", + "1 2 20 RL 80.0 9600 Pave NaN Reg \n", + "2 3 60 RL 68.0 11250 Pave NaN IR1 \n", + "3 4 70 RL 60.0 9550 Pave NaN IR1 \n", + "4 5 60 RL 84.0 14260 Pave NaN IR1 \n", + "\n", + " LandContour Utilities LotConfig LandSlope Neighborhood Condition1 \\\n", + "0 Lvl AllPub Inside Gtl CollgCr Norm \n", + "1 Lvl AllPub FR2 Gtl Veenker Feedr \n", + "2 Lvl AllPub Inside Gtl CollgCr Norm \n", + "3 Lvl AllPub Corner Gtl Crawfor Norm \n", + "4 Lvl AllPub FR2 Gtl NoRidge Norm \n", + "\n", + " Condition2 BldgType HouseStyle OverallQual OverallCond YearBuilt \\\n", + "0 Norm 1Fam 2Story 7 5 2003 \n", + "1 Norm 1Fam 1Story 6 8 1976 \n", + "2 Norm 1Fam 2Story 7 5 2001 \n", + "3 Norm 1Fam 2Story 7 5 1915 \n", + "4 Norm 1Fam 2Story 8 5 2000 \n", + "\n", + " YearRemodAdd RoofStyle RoofMatl Exterior1st Exterior2nd MasVnrType \\\n", + "0 2003 Gable CompShg VinylSd VinylSd BrkFace \n", + "1 1976 Gable CompShg MetalSd MetalSd None \n", + "2 2002 Gable CompShg VinylSd VinylSd BrkFace \n", + "3 1970 Gable CompShg Wd Sdng Wd Shng None \n", + "4 2000 Gable CompShg VinylSd VinylSd BrkFace \n", + "\n", + " MasVnrArea ExterQual ExterCond Foundation BsmtQual BsmtCond BsmtExposure \\\n", + "0 196.0 Gd TA PConc Gd TA No \n", + "1 0.0 TA TA CBlock Gd TA Gd \n", + "2 162.0 Gd TA PConc Gd TA Mn \n", + "3 0.0 TA TA BrkTil TA Gd No \n", + "4 350.0 Gd TA PConc Gd TA Av \n", + "\n", + " BsmtFinType1 BsmtFinSF1 BsmtFinType2 BsmtFinSF2 BsmtUnfSF TotalBsmtSF \\\n", + "0 GLQ 706 Unf 0 150 856 \n", + "1 ALQ 978 Unf 0 284 1262 \n", + "2 GLQ 486 Unf 0 434 920 \n", + "3 ALQ 216 Unf 0 540 756 \n", + "4 GLQ 655 Unf 0 490 1145 \n", + "\n", + " Heating HeatingQC CentralAir Electrical 1stFlrSF 2ndFlrSF LowQualFinSF \\\n", + "0 GasA Ex Y SBrkr 856 854 0 \n", + "1 GasA Ex Y SBrkr 1262 0 0 \n", + "2 GasA Ex Y SBrkr 920 866 0 \n", + "3 GasA Gd Y SBrkr 961 756 0 \n", + "4 GasA Ex Y SBrkr 1145 1053 0 \n", + "\n", + " GrLivArea BsmtFullBath BsmtHalfBath FullBath HalfBath BedroomAbvGr \\\n", + "0 1710 1 0 2 1 3 \n", + "1 1262 0 1 2 0 3 \n", + "2 1786 1 0 2 1 3 \n", + "3 1717 1 0 1 0 3 \n", + "4 2198 1 0 2 1 4 \n", + "\n", + " KitchenAbvGr KitchenQual TotRmsAbvGrd Functional Fireplaces FireplaceQu \\\n", + "0 1 Gd 8 Typ 0 NaN \n", + "1 1 TA 6 Typ 1 TA \n", + "2 1 Gd 6 Typ 1 TA \n", + "3 1 Gd 7 Typ 1 Gd \n", + "4 1 Gd 9 Typ 1 TA \n", + "\n", + " GarageType GarageYrBlt GarageFinish GarageCars GarageArea GarageQual \\\n", + "0 Attchd 2003.0 RFn 2 548 TA \n", + "1 Attchd 1976.0 RFn 2 460 TA \n", + "2 Attchd 2001.0 RFn 2 608 TA \n", + "3 Detchd 1998.0 Unf 3 642 TA \n", + "4 Attchd 2000.0 RFn 3 836 TA \n", + "\n", + " GarageCond PavedDrive WoodDeckSF OpenPorchSF EnclosedPorch 3SsnPorch \\\n", + "0 TA Y 0 61 0 0 \n", + "1 TA Y 298 0 0 0 \n", + "2 TA Y 0 42 0 0 \n", + "3 TA Y 0 35 272 0 \n", + "4 TA Y 192 84 0 0 \n", + "\n", + " ScreenPorch PoolArea PoolQC Fence MiscFeature MiscVal MoSold YrSold \\\n", + "0 0 0 NaN NaN NaN 0 2 2008 \n", + "1 0 0 NaN NaN NaN 0 5 2007 \n", + "2 0 0 NaN NaN NaN 0 9 2008 \n", + "3 0 0 NaN NaN NaN 0 2 2006 \n", + "4 0 0 NaN NaN NaN 0 12 2008 \n", + "\n", + " SaleType SaleCondition SalePrice \n", + "0 WD Normal 208500 \n", + "1 WD Normal 181500 \n", + "2 WD Normal 223500 \n", + "3 WD Abnorml 140000 \n", + "4 WD Normal 250000 " + ], "text/html": [ "
\n", "