Removed clutter and added MinMax scaling to the pipeline

anibalrojosan · anibalrojosan · commit c20a619f8696 · 2025-10-09T13:59:15.000-03:00
diff --git a/src/model/data_ingestion.py b/src/model/data_ingestion.py
@@ -3,4 +3,4 @@
 def load_raw_data(data_path='data/data.csv'):
     """Loads the raw dataset from the specified path."""
     df = pd.read_csv(data_path)
-    return df
+    return df
diff --git a/src/model/data_preprocessing.py b/src/model/data_preprocessing.py
@@ -13,4 +13,4 @@ def prepare_features_and_target(df):
     """Prepares features (X) and target (y) from the preprocessed DataFrame."""
     X = df.drop('diagnosis', axis=1)
     y = df['diagnosis']
-    return X, y
+    return X, y
diff --git a/src/model/model_inference.py b/src/model/model_inference.py
@@ -2,9 +2,6 @@
 import pandas as pd
 import os
 
-# Ensure the data_preprocessing module is accessible for FunctionTransformer if it was pickled
-# This might not be strictly necessary if FunctionTransformer only relies on the function definition itself
-# but good practice to have the context
 from .data_preprocessing import drop_unnecessary_columns
 
 def load_pipeline(model_path='models/model.joblib'):
@@ -38,4 +35,4 @@ def predict(raw_data, model_path='models/model.joblib'):
         prediction = predict(sample_new_data)
         print(f"Prediction for sample data: {prediction[0]} (0: Benign, 1: Malignant)")
     except FileNotFoundError as e:
-        print(e)
+        print(e)
diff --git a/src/model/model_training.py b/src/model/model_training.py
@@ -39,4 +39,4 @@ def train_and_save_pipeline(data_path='data/data.csv', model_path='models/model.
     print(f"Trained pipeline saved to {model_path}")
 
 if __name__ == "__main__":
-    train_and_save_pipeline()
+    train_and_save_pipeline()
diff --git a/src/model/pipeline_utils.py b/src/model/pipeline_utils.py
@@ -1,5 +1,6 @@
 from sklearn.pipeline import Pipeline
 from sklearn.preprocessing import FunctionTransformer
+from sklearn.preprocessing import MinMaxScaler
 from sklearn.ensemble import RandomForestClassifier
 import pandas as pd
 
@@ -11,12 +12,12 @@ def create_breast_cancer_pipeline():
     # Define preprocessing steps
     preprocessing_pipeline = Pipeline([
         ('drop_cols', FunctionTransformer(drop_unnecessary_columns, validate=False)),
-        # Add other preprocessing steps here if needed, e.g., scaling
+        ('scaler', MinMaxScaler()),
     ])
 
     # Combine preprocessing and model into a full pipeline
     full_pipeline = Pipeline([
         ('preprocessor', preprocessing_pipeline),
         ('classifier', RandomForestClassifier(random_state=42))
     ])
-    return full_pipeline
+    return full_pipeline