Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .env
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
PYTHONPATH=lab
17 changes: 16 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,18 @@
Pipfile

.vscode/
.vscode/
.ipynb_checkpoints
__pycache__
.idea/
*.bundle
*.csv
*.joblib
*.kvmodel
*.npy
*.pt
*.png
*.tgz
.mypy_cache
.ropeproject
.coverage
*.log
19 changes: 19 additions & 0 deletions Pipfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
[[source]]
url = "https://pypi.python.org/simple"
verify_ssl = true
name = "pypi"

[packages]
pandas = "==1.4.3"
matplotlib = "==3.5.2"
numpy = "==1.23.1"
scikit-learn = "==1.1.1"

[dev-packages]
ipykernel = "==6.15.1"
seaborn = "==0.11.2"
pytest = "==7.1.2"
pylint = "==2.14.5"

[requires]
python_version = "3.9"
1,198 changes: 1,198 additions & 0 deletions Pipfile.lock

Large diffs are not rendered by default.

126 changes: 126 additions & 0 deletions lab/processes/config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
"""
This file contains the classes that encapsulate the column names and the configuration for the preprocess and train processes
"""

import numpy as np


class DataRawColumns:
    """
    Names of the columns read from the raw dataframe.

    ``SUBSET_TRAINING`` is the ordered list of raw columns kept by the
    preprocess; ``BATHROOMS_TEXT`` is deliberately not part of it.
    """

    # Listing identifier
    ID = "id"

    # Location
    NEIGHBOURHOOD_GROUP_CLEANSED = "neighbourhood_group_cleansed"
    LATITUDE = "latitude"
    LONGITUDE = "longitude"

    # Kind of listing
    PROPERTY_TYPE = "property_type"
    ROOM_TYPE = "room_type"

    # Capacity
    ACCOMMODATES = "accommodates"
    BATHROOMS = "bathrooms"
    BATHROOMS_TEXT = "bathrooms_text"
    BEDROOMS = "bedrooms"
    BEDS = "beds"

    # Amenities and price
    AMENITIES = "amenities"
    PRICE = "price"

    # Order matters: it becomes the column order of the selected dataframe
    SUBSET_TRAINING = [
        ID,
        BATHROOMS,
        NEIGHBOURHOOD_GROUP_CLEANSED,
        PROPERTY_TYPE,
        ROOM_TYPE,
        LATITUDE,
        LONGITUDE,
        ACCOMMODATES,
        BEDROOMS,
        BEDS,
        AMENITIES,
        PRICE,
    ]


class DataPreprocessColumns:
    """
    Names of the columns of the dataframe produced by the preprocess.
    """

    # Listing identifier
    ID = "id"

    # Location
    NEIGHBOURHOOD = "neighbourhood"
    LATITUDE = "latitude"
    LONGITUDE = "longitude"

    # Kind of listing
    PROPERTY_TYPE = "property_type"
    ROOM_TYPE = "room_type"

    # Capacity
    ACCOMMODATES = "accommodates"
    BATHROOMS = "bathrooms"
    BEDROOMS = "bedrooms"
    BEDS = "beds"

    # Price and the category computed from it
    PRICE = "price"
    CATEGORY = "category"

    # Amenity flag columns
    TV = "TV"
    INTERNET = "Internet"
    AIR_CONDITIONING = "Air_conditioning"
    KITCHEN = "Kitchen"
    HEATING = "Heating"
    WIFI = "Wifi"
    ELEVATOR = "Elevator"
    BREAKFAST = "Breakfast"


class ConfigPreprocess:
    """
    Settings consumed by the preprocess: file locations, price
    categorisation and the value-to-code mappings of some columns.
    """

    # Input (raw listings) and output (preprocessed listings) files
    RAW_FILE = "data/raw/listings.csv"
    PREPROCESS_FILE = "data/processed/new_processed_listings.csv"

    # Listings priced below this value are removed
    MIN_PRICE = 10

    # Price bin edges and one integer label per bin
    BINS_PRICE = [10, 90, 180, 400, np.inf]
    LABELS_PRICE = list(range(len(BINS_PRICE) - 1))

    # For each column, the integer code assigned to every known value
    MAPING_COLUMNS = {
        DataPreprocessColumns.ROOM_TYPE: {
            "Shared room": 1,
            "Private room": 2,
            "Entire home/apt": 3,
            "Hotel room": 4,
        },
        DataPreprocessColumns.NEIGHBOURHOOD: {
            "Bronx": 1,
            "Queens": 2,
            "Staten Island": 3,
            "Brooklyn": 4,
            "Manhattan": 5,
        },
    }


class ConfigTrain:
    """
    Settings consumed by the train process: feature columns, split
    parameters, train parameters and output folder.
    """

    # Folder path used by the train process
    FOLDER_PATH = "models/"

    # Category column and the columns used as features
    FEATURE_CATEGORY = DataPreprocessColumns.CATEGORY
    FEATURE_NAMES = [
        DataPreprocessColumns.NEIGHBOURHOOD,
        DataPreprocessColumns.ROOM_TYPE,
        DataPreprocessColumns.ACCOMMODATES,
        DataPreprocessColumns.BATHROOMS,
        DataPreprocessColumns.BEDROOMS,
    ]

    # Split parameters
    RANDOM_STATE_SPLIT = 1
    TEST_SIZE = 0.15

    # Train parameters
    RANDOM_STATE_TRAIN = 0
    N_ESTIMATORS = 500
    N_JOBS = 4
    CLASS_WEIGHT = "balanced"
28 changes: 28 additions & 0 deletions lab/processes/preprocess/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
"""
This file contains the code to launch the preprocess
"""
import logging

import pandas as pd

from processes.config import ConfigPreprocess
from processes.preprocess.preprocess import preprocess

# Root logger setup for the script: INFO level, each record shows the
# timestamp, the level, the emitting file/line and the message.
logging.basicConfig(
    level=logging.INFO,
    datefmt="%Y-%m-%d:%H:%M:%S",
    format="%(asctime)s,%(msecs)d %(levelname)-8s [%(filename)s:%(lineno)d] %(message)s",
)

# Module-level logger used by the entry point below
logger = logging.getLogger(__name__)

if __name__ == "__main__":
    source_path = ConfigPreprocess.RAW_FILE
    target_path = ConfigPreprocess.PREPROCESS_FILE

    # Read the raw listings
    logger.info("Preprocessing %s", source_path)
    raw_listings = pd.read_csv(source_path)

    # Run the preprocess and write its result (to_csv is called with its
    # defaults, so the dataframe index is written as well)
    preprocess(df=raw_listings).to_csv(target_path)
180 changes: 180 additions & 0 deletions lab/processes/preprocess/preprocess.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,180 @@
"""
This file contains all functions for preprocessing the dataset
"""
import logging

import numpy as np
import pandas as pd

from processes.config import ConfigPreprocess, DataPreprocessColumns, DataRawColumns

logger = logging.getLogger(__name__)


def prepare_bathrooms_column(text: str) -> float:
    """
    Extract the number of bathrooms from a text description

    The first whitespace-separated token of the text is parsed as a float,
    e.g. "1.5 shared baths" gives 1.5.

    Args:
        text (str): bathrooms description; non-string values (such as the
            NaN used for missing cells) are accepted and treated as missing

    Returns:
        float: the parsed number, or NaN when the value is not a string, is
            blank, or does not start with a number (e.g. "Half-bath")
    """
    # np.nan is used instead of the np.NaN alias, which NumPy 2.0 removed
    if not isinstance(text, str):
        return np.nan

    # split() without arguments ignores leading and repeated whitespace
    tokens = text.split()
    if not tokens:
        return np.nan

    try:
        return float(tokens[0])
    except ValueError:
        return np.nan


def rename_columns(df: pd.DataFrame) -> pd.DataFrame:
    """
    Give the raw neighbourhood column its preprocess name

    Args:
        df (pd.DataFrame): dataframe still using the raw column names

    Returns:
        pd.DataFrame: the dataframe returned by ``DataFrame.rename`` with
            the neighbourhood column renamed
    """
    new_names = {
        DataRawColumns.NEIGHBOURHOOD_GROUP_CLEANSED: DataPreprocessColumns.NEIGHBOURHOOD,
    }
    renamed = df.rename(columns=new_names)
    return renamed


def preprocess_nan(df: pd.DataFrame) -> pd.DataFrame:
    """
    Deal with nan values in the dataframe by dropping incomplete rows

    Args:
        df (pd.DataFrame): preprocess dataframe

    Returns:
        pd.DataFrame: a new dataframe without the rows that contain at
            least one missing value; the input dataframe is left untouched
    """
    # The return annotation used to be None although a dataframe is returned
    return df.dropna(axis=0)


def preprocess_categorical_column(df: pd.DataFrame) -> pd.DataFrame:
    """
    Convert the price to a number and derive the price category from it

    The price text (e.g. "$1,200.00") is converted to whole units, rows
    whose price is missing, unparsable or below
    ``ConfigPreprocess.MIN_PRICE`` are removed, and the category column is
    computed with ``ConfigPreprocess.BINS_PRICE`` / ``LABELS_PRICE``.

    Args:
        df (pd.DataFrame): preprocess dataframe; it is not modified

    Returns:
        pd.DataFrame: a filtered copy with an integer price column and the
            new category column
    """
    price_column = DataPreprocessColumns.PRICE

    # Remove the currency symbol and the thousands separators before parsing:
    # taking the first run of digits would turn "$1,200.00" into 1.
    price_text = df[price_column].astype(str).str.replace(r"[$,]", "", regex=True).str.strip()

    # Unparsable or missing prices become NaN instead of raising
    prices = pd.to_numeric(price_text, errors="coerce")

    # Remove values below configured value (NaN compares False, so rows
    # without a usable price are removed here too)
    keep = prices >= ConfigPreprocess.MIN_PRICE
    df = df.loc[keep].copy()

    # Only whole units are kept; the cast is safe now that NaN rows are gone
    df[price_column] = prices.loc[keep].astype(int)

    # Categorize values. Prices equal to the first bin edge pass the filter
    # above, so the first bin has to include its lower edge; otherwise those
    # rows would get a NaN category.
    df[DataPreprocessColumns.CATEGORY] = pd.cut(
        df[price_column],
        bins=ConfigPreprocess.BINS_PRICE,
        labels=ConfigPreprocess.LABELS_PRICE,
        include_lowest=True,
    )

    return df


def create_new_column(df: pd.DataFrame, column_search: str, new_column_name: str) -> pd.DataFrame:
    """
    Create a 0/1 column telling whether a text column contains a given text

    Args:
        df (pd.DataFrame): dataframe for search and create new column; the
            new column is added to it in place
        column_search (str): column where search the text
        new_column_name (str): new column name and text to search in original column

    Returns:
        pd.DataFrame: the same dataframe, with the new column set to 1 where
            the text is found and 0 otherwise (missing cells count as 0)
    """
    # regex=False: the name is plain text, not a pattern.
    # na=False: missing cells give False instead of NaN, which would make the
    # integer conversion below fail.
    found = df[column_search].str.contains(new_column_name, regex=False, na=False)
    df[new_column_name] = found.astype(int)
    return df


def preprocess_amenities_column(df: pd.DataFrame) -> pd.DataFrame:
    """
    Replace the amenities column with one 0/1 column per amenity of interest

    Args:
        df (pd.DataFrame): preprocess dataframe containing the amenities column

    Returns:
        pd.DataFrame: the dataframe with the amenity columns added and the
            original amenities column removed
    """
    # NOTE(review): each name below is used verbatim both as the new column
    # name and as the text searched in the amenities column, so
    # "Air_conditioning" only matches text spelled with the underscore -
    # confirm against the raw data.
    amenity_flags = (
        DataPreprocessColumns.TV,
        DataPreprocessColumns.INTERNET,
        DataPreprocessColumns.AIR_CONDITIONING,
        DataPreprocessColumns.KITCHEN,
        DataPreprocessColumns.HEATING,
        DataPreprocessColumns.WIFI,
        DataPreprocessColumns.ELEVATOR,
        DataPreprocessColumns.BREAKFAST,
    )

    source_column = DataRawColumns.AMENITIES
    for flag_name in amenity_flags:
        df = create_new_column(df=df, column_search=source_column, new_column_name=flag_name)

    # The raw text is no longer needed once the flags exist
    return df.drop(columns=source_column)


def preprocess_mapping_columns(df: pd.DataFrame) -> pd.DataFrame:
    """
    Replace the values of the configured columns with their mapped codes

    Every column listed in ``ConfigPreprocess.MAPING_COLUMNS`` is passed
    through ``Series.map`` with its mapping; values without an entry in the
    mapping become NaN.

    Args:
        df (pd.DataFrame): dataframe to transform; its columns are
            overwritten in place

    Returns:
        pd.DataFrame: the same dataframe with the mapped columns
    """
    column_maps = ConfigPreprocess.MAPING_COLUMNS

    for column_name in column_maps:
        encoded = df[column_name].map(column_maps[column_name])
        df[column_name] = encoded

    return df


def preprocess(df: pd.DataFrame) -> pd.DataFrame:
    """
    Run the whole preprocess on the raw dataframe

    Args:
        df (pd.DataFrame): raw dataframe; a copy is processed, the argument
            itself is not modified

    Returns:
        pd.DataFrame: the preprocessed dataframe
    """
    working = df.copy()

    # The numeric bathrooms column is rebuilt from the bathrooms text
    bathrooms_text = working[DataRawColumns.BATHROOMS_TEXT]
    working[DataRawColumns.BATHROOMS] = bathrooms_text.apply(prepare_bathrooms_column)

    # Only the columns of interest go through the remaining steps
    result = working[DataRawColumns.SUBSET_TRAINING]

    # Remaining steps, applied in this exact order
    steps = (
        rename_columns,  # raw names -> preprocess names
        preprocess_categorical_column,  # price and price category
        preprocess_amenities_column,  # amenity flag columns
        preprocess_mapping_columns,  # text values -> integer codes
        preprocess_nan,  # rows with missing values
    )
    for step in steps:
        result = step(result)

    return result
Loading