Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 40 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg

# Jupyter Notebook
.ipynb_checkpoints/

# Virtual environments
venv/
ENV/
env/

# IDE
.idea/
.vscode/
*.swp
*.swo

# OS files
.DS_Store
Thumbs.db
59 changes: 37 additions & 22 deletions code/train/data_preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
"""
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split


Expand All @@ -14,7 +14,7 @@ class DataPreprocessor:

def __init__(self):
"""Initialize the data preprocessor."""
self.label_encoders = {}
self.onehot_encoder = None
self.scaler = StandardScaler()
self.feature_names = None
self.categorical_features = []
Expand Down Expand Up @@ -74,44 +74,59 @@ def handle_missing_values(self, df):
if col in df.columns and df[col].isna().any():
mode_val = df[col].mode()
if len(mode_val) > 0:
df[col].fillna(mode_val[0], inplace=True)
df[col] = df[col].fillna(mode_val[0])

# Fill missing numerical values with median
for col in self.numerical_features:
if col in df.columns and df[col].isna().any():
df[col].fillna(df[col].median(), inplace=True)
df[col] = df[col].fillna(df[col].median())

return df

def encode_categorical_features(self, df, is_training=True):
    """
    Encode categorical features using one-hot encoding.

    The columns named in ``self.categorical_features`` that are present in
    ``df`` are cast to ``str``, one-hot encoded, and replaced in the result
    by the encoder's output columns. The input dataframe is not modified.

    Args:
        df (pd.DataFrame): Input dataframe
        is_training (bool): Whether this is training data. When True a new
            encoder is fitted and stored on ``self.onehot_encoder``; when
            False the previously fitted encoder is reused.

    Returns:
        pd.DataFrame: Dataframe with one-hot encoded categorical features.
            A plain copy of ``df`` is returned when none of the categorical
            features are present in it.

    Raises:
        ValueError: If ``is_training`` is False and no encoder has been
            fitted yet, or if a column the encoder was fitted on is missing
            from ``df``.
    """
    df = df.copy()

    # Only encode the categorical features that actually exist in this frame;
    # an empty feature list falls through to the same early return.
    cat_cols = [col for col in self.categorical_features if col in df.columns]
    if not cat_cols:
        return df

    # Everything is cast to str before encoding, at fit time and at
    # transform time alike.
    cat_data = df[cat_cols].astype(str)

    if is_training:
        # handle_unknown='ignore' makes categories unseen during fit encode
        # as all-zero rows at transform time instead of raising.
        self.onehot_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
        encoded = self.onehot_encoder.fit_transform(cat_data)
    else:
        if self.onehot_encoder is None:
            raise ValueError("One-hot encoder not fitted. Call encode_categorical_features with is_training=True first.")

        # Fail with a readable message when a fitted column is absent,
        # rather than surfacing a feature-mismatch error from scikit-learn.
        fitted_cols = getattr(self.onehot_encoder, "feature_names_in_", None)
        if fitted_cols is not None:
            absent = [col for col in fitted_cols if col not in df.columns]
            if absent:
                raise ValueError(
                    f"Categorical columns seen during fit are missing from the dataframe: {absent}"
                )

        encoded = self.onehot_encoder.transform(cat_data)

    # Column names for the one-hot encoded features
    encoded_columns = self.onehot_encoder.get_feature_names_out(cat_cols)

    # Reuse df's index so the encoded frame aligns row-for-row on concat
    encoded_df = pd.DataFrame(encoded, columns=encoded_columns, index=df.index)

    # Swap the original categorical columns for their encoded counterparts
    df = df.drop(columns=cat_cols)
    df = pd.concat([df, encoded_df], axis=1)

    return df

Expand Down
19 changes: 19 additions & 0 deletions training_notebook.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -515,6 +515,25 @@
"print(question_train[question_train > 0])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Handle missing values using DataPreprocessor\n",
"# First identify feature types (required before handling missing values)\n",
"preprocessor.identify_feature_types(train_df)\n",
"\n",
"# Handle missing values in both train and test data\n",
"train_df = preprocessor.handle_missing_values(train_df)\n",
"test_df = preprocessor.handle_missing_values(test_df)\n",
"\n",
"print(\"Missing values handled successfully!\")\n",
"print(f\"\\nRemaining missing values in training data: {train_df.isna().sum().sum()}\")\n",
"print(f\"Remaining '?' values in training data: {(train_df == '?').sum().sum()}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
Expand Down