Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
94 changes: 94 additions & 0 deletions Top-Coder-Challenege-YasmineScotland/01_project_setup.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
"""
Session 1 – Project setup and initial data split.

- Flattens the JSON structure
- Renames columns to friendly names
- Creates a 750 / 250 train–test split
"""
import json
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split


# Directory layout, anchored at this script's location.
ROOT = Path(__file__).parent
RAW = ROOT / "data" / "raw"  # input: public_cases.json lives here
PROC = ROOT / "data" / "processed"  # output: train/test CSVs
RESULTS = ROOT / "results"  # output: plots and reports

# Create output directories up front so later writes cannot fail.
PROC.mkdir(parents=True, exist_ok=True)
RESULTS.mkdir(parents=True, exist_ok=True)


def main() -> None:
    """Build the initial train/test split from the raw challenge JSON.

    Steps:
        1. Load ``data/raw/public_cases.json`` and flatten nested keys.
        2. Keep the three known input columns plus the reimbursement target.
        3. Write a 750/250 train/test split to ``data/processed/``.
        4. Save histograms of the training columns as a sanity check.

    Raises:
        FileNotFoundError: if the raw JSON file is missing.
        ValueError: if expected input columns or a numeric target are absent.
    """
    # ------------------------------------------------------------------
    # Load and flatten JSON
    # ------------------------------------------------------------------
    public_path = RAW / "public_cases.json"
    if not public_path.exists():
        raise FileNotFoundError(f"Expected file not found: {public_path}")

    with public_path.open() as f:
        data = json.load(f)

    # Flatten nested keys like input.trip_duration_days, then keep only the
    # leaf part of each dotted name.
    df = pd.json_normalize(data, sep=".")
    df.columns = [c.split(".")[-1] for c in df.columns]

    # Identify input and target columns
    feature_cols = ["trip_duration_days", "miles_traveled", "total_receipts_amount"]
    missing = [c for c in feature_cols if c not in df.columns]
    if missing:
        raise ValueError(f"Missing expected input columns: {missing}")

    # Anything that looks like the reimbursement output
    if "expected_output" in df.columns:
        target_col = "expected_output"
    else:
        # Fall back: take the last numeric column that is NOT one of the
        # inputs. (Previously this could silently select a feature column,
        # e.g. total_receipts_amount, as the target.)
        candidates = [
            c for c in df.select_dtypes("number").columns if c not in feature_cols
        ]
        if not candidates:
            raise ValueError("Could not find numeric target column.")
        target_col = candidates[-1]

    df = df[feature_cols + [target_col]].copy()
    df.rename(columns={target_col: "reimbursement"}, inplace=True)

    # Basic cleaning: drop incomplete rows.
    df = df.dropna().reset_index(drop=True)

    # ------------------------------------------------------------------
    # Train / test split (750 / 250)
    # ------------------------------------------------------------------
    # NOTE: the fixed 750/250 sizes require at least 1000 clean rows;
    # train_test_split raises a clear ValueError otherwise.
    train_df, test_df = train_test_split(
        df,
        train_size=750,
        test_size=250,
        random_state=42,
        shuffle=True,
    )

    train_df.to_csv(PROC / "train_data.csv", index=False)
    test_df.to_csv(PROC / "test_data.csv", index=False)

    print(f"Saved train_data.csv with {len(train_df)} rows")
    print(f"Saved test_data.csv with {len(test_df)} rows")

    # ------------------------------------------------------------------
    # Quick sanity-check histograms
    # ------------------------------------------------------------------
    fig, axes = plt.subplots(2, 2, figsize=(10, 8))
    cols = ["trip_duration_days", "miles_traveled", "total_receipts_amount", "reimbursement"]
    for ax, col in zip(axes.ravel(), cols):
        ax.hist(train_df[col], bins=30)
        ax.set_title(col)
    fig.tight_layout()
    fig.savefig(RESULTS / "01_initial_histograms.png", dpi=150)
    plt.close(fig)
    print("Saved initial histograms to results/01_initial_histograms.png")


if __name__ == "__main__":
    main()
59 changes: 59 additions & 0 deletions Top-Coder-Challenege-YasmineScotland/02_deep_eda.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
"""
Session 2 – Deeper EDA.

Reads `train_data.csv` and produces:
- Summary statistics
- Simple correlation matrix
- A few scatter plots to explore relationships
"""
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns


# Paths shared with the other session scripts.
ROOT = Path(__file__).parent
PROC = ROOT / "data" / "processed"  # CSVs produced by 01_project_setup.py
RESULTS = ROOT / "results"  # plots and tables written here

RESULTS.mkdir(exist_ok=True)  # idempotent; parent ROOT always exists


def main() -> None:
    """Produce summary stats, a correlation heatmap, and scatter plots.

    Expects ``train_data.csv`` from session 1; all outputs are written
    under ``results/``.
    """
    train_path = PROC / "train_data.csv"
    if not train_path.exists():
        raise FileNotFoundError("Run 01_project_setup.py first to create train_data.csv")

    train = pd.read_csv(train_path)
    print("Train shape:", train.shape)
    print(train.head())

    # Summary statistics for every column.
    train.describe().to_csv(RESULTS / "02_summary_stats.csv")
    print("Saved summary stats to results/02_summary_stats.csv")

    # Pairwise correlations rendered as an annotated heatmap.
    plt.figure(figsize=(6, 5))
    sns.heatmap(
        train.corr(numeric_only=True),
        annot=True,
        fmt=".2f",
        cmap="coolwarm",
        square=True,
    )
    plt.tight_layout()
    plt.savefig(RESULTS / "02_corr_heatmap.png", dpi=150)
    plt.close()
    print("Saved correlation heatmap to results/02_corr_heatmap.png")

    # One scatter plot of the target against each raw input.
    for feature in ("trip_duration_days", "miles_traveled", "total_receipts_amount"):
        plt.figure(figsize=(5, 4))
        sns.scatterplot(data=train, x=feature, y="reimbursement", alpha=0.6)
        plt.title(f"reimbursement vs {feature}")
        plt.tight_layout()
        plt.savefig(RESULTS / f"02_scatter_{feature}.png", dpi=150)
        plt.close()

    print("EDA plots saved in results/")


if __name__ == "__main__":
    main()
51 changes: 51 additions & 0 deletions Top-Coder-Challenege-YasmineScotland/03_feature_engineering.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
"""
Session 3 – Feature engineering.

Creates a few simple, business‑motivated features and saves:
- train_features.csv
- test_features.csv
"""
from pathlib import Path

import numpy as np
import pandas as pd


# Paths shared with the other session scripts.
ROOT = Path(__file__).parent
PROC = ROOT / "data" / "processed"  # reads *_data.csv, writes *_features.csv

def add_features(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *df* with simple derived columns appended.

    New columns:
        receipts_per_day, miles_per_day: per-day rates (zero-day trips are
            treated as one day to avoid division by zero).
        log_receipts, log_miles: log1p transforms of the raw amounts.
        is_week_plus: 1 when the trip lasts 7+ days, else 0.
        is_long_miles: 1 when more than 500 miles were traveled, else 0.
    """
    out = df.copy()

    duration = out["trip_duration_days"]
    miles = out["miles_traveled"]
    receipts = out["total_receipts_amount"]
    # Guard the per-day ratios against zero-day trips.
    safe_days = duration.replace(0, 1)

    out["receipts_per_day"] = receipts / safe_days
    out["miles_per_day"] = miles / safe_days
    out["log_receipts"] = np.log1p(receipts)
    out["log_miles"] = np.log1p(miles)
    out["is_week_plus"] = (duration >= 7).astype(int)
    out["is_long_miles"] = (miles > 500).astype(int)

    return out


def main() -> None:
    """Apply ``add_features`` to the train/test CSVs and save the results."""
    train_path = PROC / "train_data.csv"
    test_path = PROC / "test_data.csv"
    if not (train_path.exists() and test_path.exists()):
        raise FileNotFoundError("Run 01_project_setup.py first to create train/test CSVs")

    # Enrich both splits with the engineered columns.
    enriched = {
        name: add_features(pd.read_csv(PROC / f"{name}_data.csv"))
        for name in ("train", "test")
    }

    for name, frame in enriched.items():
        frame.to_csv(PROC / f"{name}_features.csv", index=False)

    print("Saved train_features.csv and test_features.csv in data/processed/")
    print("Feature columns:", [c for c in enriched["train"].columns if c != "reimbursement"])


if __name__ == "__main__":
    main()
72 changes: 72 additions & 0 deletions Top-Coder-Challenege-YasmineScotland/04_baseline_models.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
"""
Session 4 – Baseline models.

Implements a couple of simple baselines:
- Mean predictor
- Plain linear regression on engineered features
"""
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error


# Paths shared with the other session scripts.
ROOT = Path(__file__).parent
PROC = ROOT / "data" / "processed"  # feature CSVs from session 3
RESULTS = ROOT / "results"  # metrics tables written here

RESULTS.mkdir(exist_ok=True)


def project_metrics(y_true, y_pred):
    """Compute the challenge's evaluation metrics for one model.

    Args:
        y_true: ground-truth reimbursement values (array-like of floats).
        y_pred: predicted values, same length as *y_true*.

    Returns:
        dict with:
            mae: mean absolute error.
            rmse: root mean squared error.
            exact_pct: % of predictions within $0.01 of the truth.
            close_pct: % of predictions within $1.00 of the truth.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    diff = np.abs(y_true - y_pred)
    # MAE/RMSE computed directly with numpy: sklearn's `squared=False`
    # keyword was deprecated in 1.4 and removed in 1.6, so the old
    # mean_squared_error(..., squared=False) call breaks on current sklearn.
    mae = float(np.mean(diff))
    rmse = float(np.sqrt(np.mean(diff ** 2)))
    exact = float(np.mean(diff <= 0.01) * 100)
    close = float(np.mean(diff <= 1.00) * 100)
    return {
        "mae": mae,
        "rmse": rmse,
        "exact_pct": exact,
        "close_pct": close,
    }


def main() -> None:
    """Evaluate the mean and linear-regression baselines on the test split.

    Reads the engineered feature CSVs, scores both baselines with
    ``project_metrics``, and writes the results table to
    ``results/04_baseline_results.csv``.
    """
    train_path = PROC / "train_features.csv"
    test_path = PROC / "test_features.csv"
    if not (train_path.exists() and test_path.exists()):
        raise FileNotFoundError("Run 03_feature_engineering.py first to create feature CSVs")

    train_df = pd.read_csv(train_path)
    test_df = pd.read_csv(test_path)

    # Every column except the target is a model input.
    predictors = [c for c in train_df.columns if c != "reimbursement"]
    X_train, y_train = train_df[predictors], train_df["reimbursement"]
    X_test, y_test = test_df[predictors], test_df["reimbursement"]

    scores = []

    # Baseline 1 – predict the training-set mean for every row.
    constant_pred = np.full_like(y_test, fill_value=y_train.mean(), dtype=float)
    scores.append({"model": "mean", **project_metrics(y_test, constant_pred)})

    # Baseline 2 – ordinary least squares on the engineered features.
    linear = LinearRegression().fit(X_train, y_train)
    scores.append(
        {"model": "linear_regression", **project_metrics(y_test, linear.predict(X_test))}
    )

    results_df = pd.DataFrame(scores)
    results_df.to_csv(RESULTS / "04_baseline_results.csv", index=False)
    print(results_df)


if __name__ == "__main__":
    main()
86 changes: 86 additions & 0 deletions Top-Coder-Challenege-YasmineScotland/05_advanced_models.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
"""
Session 5 – A couple of stronger models.

Trains:
- RandomForestRegressor
- GradientBoostingRegressor

Saves their raw performance so we can decide what to tune later.
"""
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error


# Paths shared with the other session scripts.
ROOT = Path(__file__).parent
PROC = ROOT / "data" / "processed"  # feature CSVs from session 3
RESULTS = ROOT / "results"  # metrics tables written here
# NOTE(review): MODELS_DIR is created below but nothing in this file writes
# to it — confirm whether model persistence was intended.
MODELS_DIR = ROOT / "models" / "saved"

RESULTS.mkdir(exist_ok=True)
MODELS_DIR.mkdir(parents=True, exist_ok=True)


def project_metrics(y_true, y_pred):
    """Compute the challenge's evaluation metrics for one model.

    Args:
        y_true: ground-truth reimbursement values (array-like of floats).
        y_pred: predicted values, same length as *y_true*.

    Returns:
        dict with:
            mae: mean absolute error.
            rmse: root mean squared error.
            exact_pct: % of predictions within $0.01 of the truth.
            close_pct: % of predictions within $1.00 of the truth.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    diff = np.abs(y_true - y_pred)
    # MAE/RMSE computed directly with numpy: sklearn's `squared=False`
    # keyword was deprecated in 1.4 and removed in 1.6, so the old
    # mean_squared_error(..., squared=False) call breaks on current sklearn.
    mae = float(np.mean(diff))
    rmse = float(np.sqrt(np.mean(diff ** 2)))
    exact = float(np.mean(diff <= 0.01) * 100)
    close = float(np.mean(diff <= 1.00) * 100)
    return {
        "mae": mae,
        "rmse": rmse,
        "exact_pct": exact,
        "close_pct": close,
    }


def main() -> None:
    """Fit the random-forest and gradient-boosting models and record metrics.

    Reads the engineered feature CSVs, scores both models with
    ``project_metrics``, and writes the results table to
    ``results/05_advanced_results.csv``.

    NOTE(review): despite the module docstring, no model is persisted to
    ``models/saved`` here — confirm whether that step was intended.
    """
    train_path = PROC / "train_features.csv"
    test_path = PROC / "test_features.csv"
    if not (train_path.exists() and test_path.exists()):
        raise FileNotFoundError("Run 03_feature_engineering.py first to create feature CSVs")

    train_df = pd.read_csv(train_path)
    test_df = pd.read_csv(test_path)

    # Every column except the target is a model input.
    predictors = [c for c in train_df.columns if c != "reimbursement"]
    X_train, y_train = train_df[predictors], train_df["reimbursement"]
    X_test, y_test = test_df[predictors], test_df["reimbursement"]

    scores = []

    # Random Forest (moderate size); fixed seed for reproducibility.
    forest = RandomForestRegressor(
        n_estimators=200,
        max_depth=None,
        random_state=42,
        n_jobs=-1,
    )
    forest.fit(X_train, y_train)
    scores.append({"model": "random_forest", **project_metrics(y_test, forest.predict(X_test))})

    # Gradient Boosting with a small learning rate and shallow trees.
    booster = GradientBoostingRegressor(
        n_estimators=300,
        learning_rate=0.05,
        max_depth=3,
        random_state=42,
    )
    booster.fit(X_train, y_train)
    scores.append({"model": "gradient_boosting", **project_metrics(y_test, booster.predict(X_test))})

    results_df = pd.DataFrame(scores)
    results_df.to_csv(RESULTS / "05_advanced_results.csv", index=False)
    print(results_df)


if __name__ == "__main__":
    main()
Loading