intelygenz · christianint · Jul 26, 2022 · Aug 1, 2022 · Aug 1, 2022 · Aug 1, 2022
diff --git a/.env b/.env
@@ -0,0 +1 @@
+PYTHONPATH=lab
diff --git a/.gitignore b/.gitignore
@@ -1,3 +1,18 @@
 Pipfile
 
-.vscode/
+.vscode/
+.ipynb_checkpoints
+__pycache__
+.idea/
+*.bundle
+*.csv
+*.joblib
+*.kvmodel
+*.npy
+*.pt
+*.png
+*.tgz
+.mypy_cache
+.ropeproject
+.coverage
+*.log
diff --git a/Pipfile b/Pipfile
@@ -0,0 +1,19 @@
+[[source]]
+url = "https://pypi.python.org/simple"
+verify_ssl = true
+name = "pypi"
+
+[packages]
+pandas = "==1.4.3"
+matplotlib = "==3.5.2"
+numpy = "==1.23.1"
+scikit-learn = "==1.1.1"
+
+[dev-packages]
+ipykernel = "==6.15.1"
+seaborn = "==0.11.2"
+pytest = "==7.1.2"
+pylint = "==2.14.5"
+
+[requires]
+python_version = "3.9"
diff --git a/Pipfile.lock b/Pipfile.lock
diff --git a/lab/processes/config.py b/lab/processes/config.py
@@ -0,0 +1,126 @@
+"""
+This file contains the classes that encapsulate the column names and the
+config for the preprocess and train processes
+"""
+
+import numpy as np
+
+
+class DataRawColumns:
+    """
+    Names of the columns read from the raw dataframe
+    """
+
+    # Single column names
+    ID = "id"
+    BATHROOMS = "bathrooms"
+    BATHROOMS_TEXT = "bathrooms_text"
+    NEIGHBOURHOOD_GROUP_CLEANSED = "neighbourhood_group_cleansed"
+    PROPERTY_TYPE = "property_type"
+    ROOM_TYPE = "room_type"
+    LATITUDE = "latitude"
+    LONGITUDE = "longitude"
+    ACCOMMODATES = "accommodates"
+    BEDROOMS = "bedrooms"
+    BEDS = "beds"
+    AMENITIES = "amenities"
+    PRICE = "price"
+
+    # Columns kept for training (every column above except BATHROOMS_TEXT)
+    SUBSET_TRAINING = [
+        ID,
+        BATHROOMS,
+        NEIGHBOURHOOD_GROUP_CLEANSED,
+        PROPERTY_TYPE,
+        ROOM_TYPE,
+        LATITUDE,
+        LONGITUDE,
+        ACCOMMODATES,
+        BEDROOMS,
+        BEDS,
+        AMENITIES,
+        PRICE,
+    ]
+
+
+class DataPreprocessColumns:
+    """
+    Names of the columns of the preprocess dataframe
+    """
+
+    # Listing description columns
+    ID = "id"
+    NEIGHBOURHOOD = "neighbourhood"
+    PROPERTY_TYPE = "property_type"
+    ROOM_TYPE = "room_type"
+    LATITUDE = "latitude"
+    LONGITUDE = "longitude"
+    ACCOMMODATES = "accommodates"
+    BATHROOMS = "bathrooms"
+    BEDROOMS = "bedrooms"
+    BEDS = "beds"
+
+    # Price and the category derived from it
+    PRICE = "price"
+    CATEGORY = "category"
+
+    # Amenity columns
+    TV = "TV"
+    INTERNET = "Internet"
+    AIR_CONDITIONING = "Air_conditioning"
+    KITCHEN = "Kitchen"
+    HEATING = "Heating"
+    WIFI = "Wifi"
+    ELEVATOR = "Elevator"
+    BREAKFAST = "Breakfast"
+
+
+class ConfigPreprocess:
+    """
+    Settings used by the preprocess
+    """
+
+    # Input and output files
+    RAW_FILE = "data/raw/listings.csv"
+    PREPROCESS_FILE = "data/processed/new_processed_listings.csv"
+
+    # Lowest price kept in the dataset
+    MIN_PRICE = 10
+
+    # Edges of the price ranges and the label given to each range
+    BINS_PRICE = [10, 90, 180, 400, np.inf]
+    LABELS_PRICE = [0, 1, 2, 3]
+
+    # Text value -> number, for every column that is mapped
+    MAPING_COLUMNS = {
+        DataPreprocessColumns.ROOM_TYPE: {
+            "Shared room": 1,
+            "Private room": 2,
+            "Entire home/apt": 3,
+            "Hotel room": 4,
+        },
+        DataPreprocessColumns.NEIGHBOURHOOD: {
+            "Bronx": 1,
+            "Queens": 2,
+            "Staten Island": 3,
+            "Brooklyn": 4,
+            "Manhattan": 5,
+        },
+    }
+
+
+class ConfigTrain:
+    """
+    Settings used by the train process
+    """
+
+    # Columns used as features and column used as target category
+    FEATURE_NAMES = [
+        DataPreprocessColumns.NEIGHBOURHOOD,
+        DataPreprocessColumns.ROOM_TYPE,
+        DataPreprocessColumns.ACCOMMODATES,
+        DataPreprocessColumns.BATHROOMS,
+        DataPreprocessColumns.BEDROOMS,
+    ]
+    FEATURE_CATEGORY = DataPreprocessColumns.CATEGORY
+
+    # Train/test split
+    TEST_SIZE = 0.15
+    RANDOM_STATE_SPLIT = 1
+
+    # Model training
+    N_ESTIMATORS = 500
+    RANDOM_STATE_TRAIN = 0
+    CLASS_WEIGHT = "balanced"
+    N_JOBS = 4
+
+    # Output folder
+    FOLDER_PATH = "models/"
diff --git a/lab/processes/preprocess/main.py b/lab/processes/preprocess/main.py
@@ -0,0 +1,28 @@
+"""
+This file contains the code to launch the preprocess
+"""
+import logging
+
+import pandas as pd
+
+from processes.config import ConfigPreprocess
+from processes.preprocess.preprocess import preprocess
+
+# Log format: timestamp, level, source file and line, message
+logging.basicConfig(
+    format="%(asctime)s,%(msecs)d %(levelname)-8s [%(filename)s:%(lineno)d] %(message)s",
+    datefmt="%Y-%m-%d:%H:%M:%S",
+    level=logging.INFO,
+)
+logger = logging.getLogger(__name__)
+
+if __name__ == "__main__":
+    # Only needed when the file is run as a script
+    from pathlib import Path
+
+    # Load dataset
+    logger.info("Preprocessing %s", ConfigPreprocess.RAW_FILE)
+    df_raw = pd.read_csv(ConfigPreprocess.RAW_FILE)
+
+    # Preprocess dataset
+    df_result = preprocess(df=df_raw)
+
+    # Create the output folder when missing, to_csv does not create it
+    Path(ConfigPreprocess.PREPROCESS_FILE).parent.mkdir(parents=True, exist_ok=True)
+
+    # Save the preprocess dataframe
+    df_result.to_csv(ConfigPreprocess.PREPROCESS_FILE)
+    logger.info("Saved %d rows in %s", len(df_result), ConfigPreprocess.PREPROCESS_FILE)
diff --git a/lab/processes/preprocess/preprocess.py b/lab/processes/preprocess/preprocess.py
@@ -0,0 +1,180 @@
+"""
+This file contains all functions for preprocessing the dataset
+"""
+import logging
+
+import numpy as np
+import pandas as pd
+
+from processes.config import ConfigPreprocess, DataPreprocessColumns, DataRawColumns
+
+logger = logging.getLogger(__name__)
+
+
+def prepare_bathrooms_column(text: str) -> float:
+    """
+    Extract number of bathrooms from text
+
+    The number is read from the first whitespace-separated word of the text,
+    for example "1.5 shared baths" gives 1.5.
+
+    Args:
+        text (str): text that describes the bathrooms. Values that are not
+            a string (for example a missing value) are accepted
+
+    Returns:
+        float: the number at the start of the text, or NaN when the value is
+            not a string, is blank or does not start with a number
+    """
+    if not isinstance(text, str):
+        return np.nan
+
+    # split() without arguments ignores leading and repeated whitespace
+    words = text.split()
+    if not words:
+        return np.nan
+
+    try:
+        return float(words[0])
+    except ValueError:
+        # The first word is not a number
+        return np.nan
+
+
+def rename_columns(df: pd.DataFrame) -> pd.DataFrame:
+    """
+    Give the raw neighbourhood column its preprocess name
+
+    Args:
+        df (pd.DataFrame): preprocess dataframe
+
+    Returns:
+        pd.DataFrame: a dataframe with the column renamed
+    """
+    new_names = {DataRawColumns.NEIGHBOURHOOD_GROUP_CLEANSED: DataPreprocessColumns.NEIGHBOURHOOD}
+    renamed = df.rename(columns=new_names)
+    return renamed
+
+
+def preprocess_nan(df: pd.DataFrame) -> pd.DataFrame:
+    """
+    This function deals with nan values in the dataframe
+
+    Every row that has at least one nan value is removed. The dataframe
+    received is not modified.
+
+    Args:
+        df (pd.DataFrame): preprocess dataframe
+
+    Returns:
+        pd.DataFrame: a new dataframe without the rows that had nan values
+    """
+    return df.dropna(axis=0)
+
+
+def preprocess_categorical_column(df: pd.DataFrame) -> pd.DataFrame:
+    """
+    Prepare the categorical column
+
+    The price text is converted to a whole number, the rows with a price that
+    cannot be read or that is below the configured minimum are removed, and
+    the price is classified in the configured ranges. The dataframe received
+    is not modified.
+
+    Args:
+        df (pd.DataFrame): preprocess dataframe
+
+    Returns:
+        pd.DataFrame: a new dataframe with the price as integer and the
+            category column added
+    """
+    # Convert price to value. The currency symbol and the thousands separator
+    # are removed first, so "$1,200.00" is read as 1200 and not as 1
+    price_text = df[DataPreprocessColumns.PRICE].astype(str).str.replace(r"[$,\s]", "", regex=True)
+    price = pd.to_numeric(price_text, errors="coerce")
+
+    # Remove rows without a readable price, they cannot be converted to int
+    has_price = price.notna()
+    df = df[has_price].copy()
+    df[DataPreprocessColumns.PRICE] = price[has_price].astype(int).to_numpy()
+
+    # Remove values below configured value
+    df = df[df[DataPreprocessColumns.PRICE] >= ConfigPreprocess.MIN_PRICE].copy()
+
+    # Categorize values. include_lowest keeps a price equal to the first bin
+    # edge inside the first range instead of leaving it without category
+    df[DataPreprocessColumns.CATEGORY] = pd.cut(
+        df[DataPreprocessColumns.PRICE],
+        bins=ConfigPreprocess.BINS_PRICE,
+        labels=ConfigPreprocess.LABELS_PRICE,
+        include_lowest=True,
+    )
+
+    return df
+
+
+def create_new_column(df: pd.DataFrame, column_search: str, new_column_name: str) -> pd.DataFrame:
+    """
+    Create a new column that says if the text contains a specific text
+
+    The new column has 1 when the text is found and 0 when it is not found
+    or the value searched is missing. The text is searched literally, and an
+    underscore in the name also matches a space.
+
+    Args:
+        df (pd.DataFrame): dataframe for search and create new column, it is
+            updated in place
+        column_search (str): column where search the text
+        new_column_name (str): new column name and text to search in original column
+
+    Returns:
+        pd.DataFrame: the dataframe updated
+    """
+    searched = df[column_search]
+
+    # na=False counts a missing text as "not found" so the cast to int works
+    found = searched.str.contains(new_column_name, regex=False, na=False)
+
+    # NOTE(review): names such as "Air_conditioning" presumably appear with a
+    # space in the searched text, confirm against the dataset
+    text_with_spaces = new_column_name.replace("_", " ")
+    if text_with_spaces != new_column_name:
+        found = found | searched.str.contains(text_with_spaces, regex=False, na=False)
+
+    df[new_column_name] = found.astype(int)
+    return df
+
+
+def preprocess_amenities_column(df: pd.DataFrame) -> pd.DataFrame:
+    """
+    Replace the amenities column with one column per amenity of interest
+
+    Args:
+        df (pd.DataFrame): preprocess dataframe
+
+    Returns:
+        pd.DataFrame: the dataframe with the new columns and without the
+            amenities column
+    """
+    amenity_columns = (
+        DataPreprocessColumns.TV,
+        DataPreprocessColumns.INTERNET,
+        DataPreprocessColumns.AIR_CONDITIONING,
+        DataPreprocessColumns.KITCHEN,
+        DataPreprocessColumns.HEATING,
+        DataPreprocessColumns.WIFI,
+        DataPreprocessColumns.ELEVATOR,
+        DataPreprocessColumns.BREAKFAST,
+    )
+
+    # The name of each new column is also the text searched in the amenities
+    for amenity in amenity_columns:
+        df = create_new_column(df=df, column_search=DataRawColumns.AMENITIES, new_column_name=amenity)
+
+    # The original text column is not needed any more
+    return df.drop(columns=DataRawColumns.AMENITIES)
+
+
+def preprocess_mapping_columns(df: pd.DataFrame) -> pd.DataFrame:
+    """
+    Replace the text values of the configured columns with their number
+
+    Args:
+        df (pd.DataFrame): dataframe to transform, it is updated in place
+
+    Returns:
+        pd.DataFrame: dataframe updated
+    """
+    mappings = ConfigPreprocess.MAPING_COLUMNS
+
+    for column_name in mappings:
+        values_to_numbers = mappings[column_name]
+        # A value that is not in the mapping becomes nan
+        df[column_name] = df[column_name].map(values_to_numbers)
+
+    return df
+
+
+def preprocess(df: pd.DataFrame) -> pd.DataFrame:
+    """
+    Run all the preprocess steps on the original dataframe
+
+    Args:
+        df (pd.DataFrame): dataframe to preprocess, it is not modified
+
+    Returns:
+        pd.DataFrame: the preprocess dataframe
+    """
+    # Work on a copy so the original dataframe stays untouched
+    result = df.copy()
+
+    # The numeric bathrooms column comes from the bathrooms text
+    bathrooms_text = result[DataRawColumns.BATHROOMS_TEXT]
+    result[DataRawColumns.BATHROOMS] = bathrooms_text.apply(prepare_bathrooms_column)
+
+    # Keep only the columns of interest
+    result = result[DataRawColumns.SUBSET_TRAINING]
+
+    # Steps applied in order: rename columns, prepare the categorical column,
+    # create the amenity columns, map text columns and remove nan values
+    steps = (
+        rename_columns,
+        preprocess_categorical_column,
+        preprocess_amenities_column,
+        preprocess_mapping_columns,
+        preprocess_nan,
+    )
+    for step in steps:
+        result = step(result)
+
+    return result