Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
94 changes: 94 additions & 0 deletions Top-Coder-Challenege-YasmineScotland/01_project_setup.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
"""
Session 1 – Project setup and initial data split.

- Flattens the JSON structure
- Renames columns to friendly names
- Creates a 750 / 250 train–test split
"""
import json
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split


# Directory layout, anchored at this script's location.
ROOT = Path(__file__).parent
RAW = ROOT / "data" / "raw"  # input: public_cases.json lives here
PROC = ROOT / "data" / "processed"  # output: train/test CSVs
RESULTS = ROOT / "results"  # output: plots and reports

# Create output directories up front so later writes cannot fail.
PROC.mkdir(parents=True, exist_ok=True)
RESULTS.mkdir(parents=True, exist_ok=True)


def main() -> None:
    """Build the initial train/test split from the raw challenge JSON.

    Steps:
        1. Load ``data/raw/public_cases.json`` and flatten nested keys.
        2. Keep the three known input columns plus the reimbursement target.
        3. Write a 750/250 train/test split to ``data/processed/``.
        4. Save histograms of the training columns as a sanity check.

    Raises:
        FileNotFoundError: if the raw JSON file is missing.
        ValueError: if expected input columns or a numeric target are absent.
    """
    # ------------------------------------------------------------------
    # Load and flatten JSON
    # ------------------------------------------------------------------
    public_path = RAW / "public_cases.json"
    if not public_path.exists():
        raise FileNotFoundError(f"Expected file not found: {public_path}")

    with public_path.open() as f:
        data = json.load(f)

    # Flatten nested keys like input.trip_duration_days, then keep only the
    # leaf part of each dotted name.
    df = pd.json_normalize(data, sep=".")
    df.columns = [c.split(".")[-1] for c in df.columns]

    # Identify input and target columns
    feature_cols = ["trip_duration_days", "miles_traveled", "total_receipts_amount"]
    missing = [c for c in feature_cols if c not in df.columns]
    if missing:
        raise ValueError(f"Missing expected input columns: {missing}")

    # Anything that looks like the reimbursement output
    if "expected_output" in df.columns:
        target_col = "expected_output"
    else:
        # Fall back: take the last numeric column that is NOT one of the
        # inputs. (Previously this could silently select a feature column,
        # e.g. total_receipts_amount, as the target.)
        candidates = [
            c for c in df.select_dtypes("number").columns if c not in feature_cols
        ]
        if not candidates:
            raise ValueError("Could not find numeric target column.")
        target_col = candidates[-1]

    df = df[feature_cols + [target_col]].copy()
    df.rename(columns={target_col: "reimbursement"}, inplace=True)

    # Basic cleaning: drop incomplete rows.
    df = df.dropna().reset_index(drop=True)

    # ------------------------------------------------------------------
    # Train / test split (750 / 250)
    # ------------------------------------------------------------------
    # NOTE: the fixed 750/250 sizes require at least 1000 clean rows;
    # train_test_split raises a clear ValueError otherwise.
    train_df, test_df = train_test_split(
        df,
        train_size=750,
        test_size=250,
        random_state=42,
        shuffle=True,
    )

    train_df.to_csv(PROC / "train_data.csv", index=False)
    test_df.to_csv(PROC / "test_data.csv", index=False)

    print(f"Saved train_data.csv with {len(train_df)} rows")
    print(f"Saved test_data.csv with {len(test_df)} rows")

    # ------------------------------------------------------------------
    # Quick sanity-check histograms
    # ------------------------------------------------------------------
    fig, axes = plt.subplots(2, 2, figsize=(10, 8))
    cols = ["trip_duration_days", "miles_traveled", "total_receipts_amount", "reimbursement"]
    for ax, col in zip(axes.ravel(), cols):
        ax.hist(train_df[col], bins=30)
        ax.set_title(col)
    fig.tight_layout()
    fig.savefig(RESULTS / "01_initial_histograms.png", dpi=150)
    plt.close(fig)
    print("Saved initial histograms to results/01_initial_histograms.png")


if __name__ == "__main__":
    main()
59 changes: 59 additions & 0 deletions Top-Coder-Challenege-YasmineScotland/02_deep_eda.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
"""
Session 2 – Deeper EDA.

Reads `train_data.csv` and produces:
- Summary statistics
- Simple correlation matrix
- A few scatter plots to explore relationships
"""
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns


# Paths shared with the other session scripts.
ROOT = Path(__file__).parent
PROC = ROOT / "data" / "processed"  # CSVs produced by 01_project_setup.py
RESULTS = ROOT / "results"  # plots and tables written here

RESULTS.mkdir(exist_ok=True)  # idempotent; parent ROOT always exists


def main() -> None:
    """Produce summary stats, a correlation heatmap, and scatter plots.

    Expects ``train_data.csv`` from session 1; all outputs are written
    under ``results/``.
    """
    train_path = PROC / "train_data.csv"
    if not train_path.exists():
        raise FileNotFoundError("Run 01_project_setup.py first to create train_data.csv")

    train = pd.read_csv(train_path)
    print("Train shape:", train.shape)
    print(train.head())

    # Summary statistics for every column.
    train.describe().to_csv(RESULTS / "02_summary_stats.csv")
    print("Saved summary stats to results/02_summary_stats.csv")

    # Pairwise correlations rendered as an annotated heatmap.
    plt.figure(figsize=(6, 5))
    sns.heatmap(
        train.corr(numeric_only=True),
        annot=True,
        fmt=".2f",
        cmap="coolwarm",
        square=True,
    )
    plt.tight_layout()
    plt.savefig(RESULTS / "02_corr_heatmap.png", dpi=150)
    plt.close()
    print("Saved correlation heatmap to results/02_corr_heatmap.png")

    # One scatter plot of the target against each raw input.
    for feature in ("trip_duration_days", "miles_traveled", "total_receipts_amount"):
        plt.figure(figsize=(5, 4))
        sns.scatterplot(data=train, x=feature, y="reimbursement", alpha=0.6)
        plt.title(f"reimbursement vs {feature}")
        plt.tight_layout()
        plt.savefig(RESULTS / f"02_scatter_{feature}.png", dpi=150)
        plt.close()

    print("EDA plots saved in results/")


if __name__ == "__main__":
    main()
51 changes: 51 additions & 0 deletions Top-Coder-Challenege-YasmineScotland/03_feature_engineering.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
"""
Session 3 – Feature engineering.

Creates a few simple, business‑motivated features and saves:
- train_features.csv
- test_features.csv
"""
from pathlib import Path

import numpy as np
import pandas as pd


# Paths shared with the other session scripts.
ROOT = Path(__file__).parent
PROC = ROOT / "data" / "processed"  # reads *_data.csv, writes *_features.csv

def add_features(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *df* with simple derived columns appended.

    New columns:
        receipts_per_day, miles_per_day: per-day rates (zero-day trips are
            treated as one day to avoid division by zero).
        log_receipts, log_miles: log1p transforms of the raw amounts.
        is_week_plus: 1 when the trip lasts 7+ days, else 0.
        is_long_miles: 1 when more than 500 miles were traveled, else 0.
    """
    out = df.copy()

    duration = out["trip_duration_days"]
    miles = out["miles_traveled"]
    receipts = out["total_receipts_amount"]
    # Guard the per-day ratios against zero-day trips.
    safe_days = duration.replace(0, 1)

    out["receipts_per_day"] = receipts / safe_days
    out["miles_per_day"] = miles / safe_days
    out["log_receipts"] = np.log1p(receipts)
    out["log_miles"] = np.log1p(miles)
    out["is_week_plus"] = (duration >= 7).astype(int)
    out["is_long_miles"] = (miles > 500).astype(int)

    return out


def main() -> None:
    """Apply ``add_features`` to the train/test CSVs and save the results."""
    train_path = PROC / "train_data.csv"
    test_path = PROC / "test_data.csv"
    if not (train_path.exists() and test_path.exists()):
        raise FileNotFoundError("Run 01_project_setup.py first to create train/test CSVs")

    # Enrich both splits with the engineered columns.
    enriched = {
        name: add_features(pd.read_csv(PROC / f"{name}_data.csv"))
        for name in ("train", "test")
    }

    for name, frame in enriched.items():
        frame.to_csv(PROC / f"{name}_features.csv", index=False)

    print("Saved train_features.csv and test_features.csv in data/processed/")
    print("Feature columns:", [c for c in enriched["train"].columns if c != "reimbursement"])


if __name__ == "__main__":
    main()
72 changes: 72 additions & 0 deletions Top-Coder-Challenege-YasmineScotland/04_baseline_models.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
"""
Session 4 – Baseline models.

Implements a couple of simple baselines:
- Mean predictor
- Plain linear regression on engineered features
"""
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error


# Paths shared with the other session scripts.
ROOT = Path(__file__).parent
PROC = ROOT / "data" / "processed"  # feature CSVs from session 3
RESULTS = ROOT / "results"  # metrics tables written here

RESULTS.mkdir(exist_ok=True)


def project_metrics(y_true, y_pred):
    """Compute the challenge's evaluation metrics for one model.

    Args:
        y_true: ground-truth reimbursement values (array-like of floats).
        y_pred: predicted values, same length as *y_true*.

    Returns:
        dict with:
            mae: mean absolute error.
            rmse: root mean squared error.
            exact_pct: % of predictions within $0.01 of the truth.
            close_pct: % of predictions within $1.00 of the truth.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    diff = np.abs(y_true - y_pred)
    # MAE/RMSE computed directly with numpy: sklearn's `squared=False`
    # keyword was deprecated in 1.4 and removed in 1.6, so the old
    # mean_squared_error(..., squared=False) call breaks on current sklearn.
    mae = float(np.mean(diff))
    rmse = float(np.sqrt(np.mean(diff ** 2)))
    exact = float(np.mean(diff <= 0.01) * 100)
    close = float(np.mean(diff <= 1.00) * 100)
    return {
        "mae": mae,
        "rmse": rmse,
        "exact_pct": exact,
        "close_pct": close,
    }


def main() -> None:
    """Evaluate the mean and linear-regression baselines on the test split.

    Reads the engineered feature CSVs, scores both baselines with
    ``project_metrics``, and writes the results table to
    ``results/04_baseline_results.csv``.
    """
    train_path = PROC / "train_features.csv"
    test_path = PROC / "test_features.csv"
    if not (train_path.exists() and test_path.exists()):
        raise FileNotFoundError("Run 03_feature_engineering.py first to create feature CSVs")

    train_df = pd.read_csv(train_path)
    test_df = pd.read_csv(test_path)

    # Every column except the target is a model input.
    predictors = [c for c in train_df.columns if c != "reimbursement"]
    X_train, y_train = train_df[predictors], train_df["reimbursement"]
    X_test, y_test = test_df[predictors], test_df["reimbursement"]

    scores = []

    # Baseline 1 – predict the training-set mean for every row.
    constant_pred = np.full_like(y_test, fill_value=y_train.mean(), dtype=float)
    scores.append({"model": "mean", **project_metrics(y_test, constant_pred)})

    # Baseline 2 – ordinary least squares on the engineered features.
    linear = LinearRegression().fit(X_train, y_train)
    scores.append(
        {"model": "linear_regression", **project_metrics(y_test, linear.predict(X_test))}
    )

    results_df = pd.DataFrame(scores)
    results_df.to_csv(RESULTS / "04_baseline_results.csv", index=False)
    print(results_df)


if __name__ == "__main__":
    main()
86 changes: 86 additions & 0 deletions Top-Coder-Challenege-YasmineScotland/05_advanced_models.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
"""
Session 5 – A couple of stronger models.

Trains:
- RandomForestRegressor
- GradientBoostingRegressor

Saves their raw performance so we can decide what to tune later.
"""
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error


# Paths shared with the other session scripts.
ROOT = Path(__file__).parent
PROC = ROOT / "data" / "processed"  # feature CSVs from session 3
RESULTS = ROOT / "results"  # metrics tables written here
# NOTE(review): MODELS_DIR is created below but nothing in this file writes
# to it — confirm whether model persistence was intended.
MODELS_DIR = ROOT / "models" / "saved"

RESULTS.mkdir(exist_ok=True)
MODELS_DIR.mkdir(parents=True, exist_ok=True)


def project_metrics(y_true, y_pred):
    """Compute the challenge's evaluation metrics for one model.

    Args:
        y_true: ground-truth reimbursement values (array-like of floats).
        y_pred: predicted values, same length as *y_true*.

    Returns:
        dict with:
            mae: mean absolute error.
            rmse: root mean squared error.
            exact_pct: % of predictions within $0.01 of the truth.
            close_pct: % of predictions within $1.00 of the truth.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    diff = np.abs(y_true - y_pred)
    # MAE/RMSE computed directly with numpy: sklearn's `squared=False`
    # keyword was deprecated in 1.4 and removed in 1.6, so the old
    # mean_squared_error(..., squared=False) call breaks on current sklearn.
    mae = float(np.mean(diff))
    rmse = float(np.sqrt(np.mean(diff ** 2)))
    exact = float(np.mean(diff <= 0.01) * 100)
    close = float(np.mean(diff <= 1.00) * 100)
    return {
        "mae": mae,
        "rmse": rmse,
        "exact_pct": exact,
        "close_pct": close,
    }


def main() -> None:
    """Fit the random-forest and gradient-boosting models and record metrics.

    Reads the engineered feature CSVs, scores both models with
    ``project_metrics``, and writes the results table to
    ``results/05_advanced_results.csv``.

    NOTE(review): despite the module docstring, no model is persisted to
    ``models/saved`` here — confirm whether that step was intended.
    """
    train_path = PROC / "train_features.csv"
    test_path = PROC / "test_features.csv"
    if not (train_path.exists() and test_path.exists()):
        raise FileNotFoundError("Run 03_feature_engineering.py first to create feature CSVs")

    train_df = pd.read_csv(train_path)
    test_df = pd.read_csv(test_path)

    # Every column except the target is a model input.
    predictors = [c for c in train_df.columns if c != "reimbursement"]
    X_train, y_train = train_df[predictors], train_df["reimbursement"]
    X_test, y_test = test_df[predictors], test_df["reimbursement"]

    scores = []

    # Random Forest (moderate size); fixed seed for reproducibility.
    forest = RandomForestRegressor(
        n_estimators=200,
        max_depth=None,
        random_state=42,
        n_jobs=-1,
    )
    forest.fit(X_train, y_train)
    scores.append({"model": "random_forest", **project_metrics(y_test, forest.predict(X_test))})

    # Gradient Boosting with a small learning rate and shallow trees.
    booster = GradientBoostingRegressor(
        n_estimators=300,
        learning_rate=0.05,
        max_depth=3,
        random_state=42,
    )
    booster.fit(X_train, y_train)
    scores.append({"model": "gradient_boosting", **project_metrics(y_test, booster.predict(X_test))})

    results_df = pd.DataFrame(scores)
    results_df.to_csv(RESULTS / "05_advanced_results.csv", index=False)
    print(results_df)


if __name__ == "__main__":
    main()
Loading