Refactor: Modularized train_model.py for separation of concerns

anibalrojosan · anibalrojosan · commit bb91e549f9bb · 2025-10-08T21:23:22.000-03:00
diff --git a/src/data_ingestion.py b/src/data_ingestion.py
@@ -0,0 +1,6 @@
+import pandas as pd
+
+def load_raw_data(data_path='data/data.csv'):
+    """Read the raw dataset CSV at ``data_path`` into a pandas DataFrame.
+
+    Args:
+        data_path: Location of the CSV file; defaults to 'data/data.csv'.
+
+    Returns:
+        The DataFrame produced by ``pd.read_csv`` with default parsing options.
+    """
+    return pd.read_csv(data_path)
diff --git a/src/data_preprocessing.py b/src/data_preprocessing.py
@@ -0,0 +1,16 @@
+import pandas as pd
+
+def drop_unnecessary_columns(df):
+    """Return a copy of ``df`` without the 'id' and 'Unnamed: 32' columns.
+
+    Columns that are absent are skipped silently (``errors='ignore'``), so the
+    function can safely be applied to data that lacks one or both of them.
+    """
+    unwanted = ['id', 'Unnamed: 32']
+    return df.drop(columns=unwanted, errors='ignore')
+
+def map_diagnosis_to_numerical(df):
+    """Convert the 'diagnosis' column to numerical labels (M=1, B=0).
+
+    The input DataFrame is left untouched; a new DataFrame carrying the mapped
+    column is returned, so callers do not need to pass a defensive copy.
+
+    Args:
+        df: DataFrame containing a 'diagnosis' column.
+
+    Returns:
+        A new DataFrame identical to ``df`` except that 'diagnosis' holds 1
+        for 'M' and 0 for 'B'. Any other value becomes NaN (``Series.map``
+        semantics).
+
+    Raises:
+        KeyError: If ``df`` has no 'diagnosis' column.
+    """
+    label_codes = {'M': 1, 'B': 0}
+    # assign() builds a new frame instead of writing into the caller's object.
+    return df.assign(diagnosis=df['diagnosis'].map(label_codes))
+
+def prepare_features_and_target(df):
+    """Split a preprocessed DataFrame into a feature matrix and target vector.
+
+    Returns:
+        A tuple ``(X, y)`` where ``y`` is the 'diagnosis' column and ``X`` is
+        every remaining column of ``df``.
+    """
+    target_name = 'diagnosis'
+    return df.drop(columns=[target_name]), df[target_name]
diff --git a/src/model.py b/src/model.py
diff --git a/src/model_inference.py b/src/model_inference.py
@@ -0,0 +1,41 @@
+import joblib
+import pandas as pd
+import os
+
+# The saved pipeline contains a FunctionTransformer wrapping drop_unnecessary_columns.
+# pickle/joblib store that function by reference (module path + name), not by value,
+# so the data_preprocessing module must be importable under the same path when the
+# pipeline is loaded; the import below makes that dependency explicit.
+from .data_preprocessing import drop_unnecessary_columns
+
+def load_pipeline(model_path='models/model.joblib'):
+    """Deserialize the trained scikit-learn pipeline stored at ``model_path``.
+
+    Raises:
+        FileNotFoundError: If nothing exists at ``model_path``.
+    """
+    if os.path.exists(model_path):
+        # NOTE: joblib.load unpickles arbitrary objects; only load model
+        # files that come from a trusted source.
+        return joblib.load(model_path)
+    raise FileNotFoundError(f"Model pipeline not found at {model_path}. Please train the model first.")
+
+def predict(raw_data, model_path='models/model.joblib'):
+    """Run the persisted pipeline on new raw data and return its predictions.
+
+    The pipeline is reloaded from ``model_path`` on every call.
+
+    Args:
+        raw_data: Input passed unchanged to the pipeline's ``predict`` method.
+        model_path: Location of the saved pipeline.
+
+    Returns:
+        Whatever the loaded pipeline's ``predict`` returns for ``raw_data``.
+    """
+    return load_pipeline(model_path).predict(raw_data)
+
+if __name__ == "__main__":
+    # Demo entry point: score one hand-written sample with the saved pipeline.
+    # NOTE: this module uses a relative import, so it has to be run as a module
+    # inside its package (python -m ...) rather than as a plain script path.
+    print("This module is for inference. Please run model_training.py to train the model.")
+
+    # One raw feature row (30 measurements), wrapped in a single-row DataFrame.
+    sample_features = {
+        'radius_mean': 17.99, 'texture_mean': 10.38,
+        'perimeter_mean': 122.8, 'area_mean': 1001.0, 'smoothness_mean': 0.1184,
+        'compactness_mean': 0.2776, 'concavity_mean': 0.3001, 'concave points_mean': 0.1471,
+        'symmetry_mean': 0.2419, 'fractal_dimension_mean': 0.07871,
+        'radius_se': 1.095, 'texture_se': 0.9053, 'perimeter_se': 8.589, 'area_se': 153.4,
+        'smoothness_se': 0.006399, 'compactness_se': 0.04904, 'concavity_se': 0.05373,
+        'concave points_se': 0.01587, 'symmetry_se': 0.03003, 'fractal_dimension_se': 0.006193,
+        'radius_worst': 25.38, 'texture_worst': 17.33, 'perimeter_worst': 184.6, 'area_worst': 2019.0,
+        'smoothness_worst': 0.1622, 'compactness_worst': 0.6656, 'concavity_worst': 0.7119,
+        'concave points_worst': 0.2654, 'symmetry_worst': 0.4601, 'fractal_dimension_worst': 0.1189
+    }
+    sample_new_data = pd.DataFrame([sample_features])
+
+    try:
+        prediction = predict(sample_new_data)
+    except FileNotFoundError as e:
+        # No trained model on disk yet: report the message instead of a traceback.
+        print(e)
+    else:
+        print(f"Prediction for sample data: {prediction[0]} (0: Benign, 1: Malignant)")
diff --git a/src/model_training.py b/src/model_training.py
@@ -0,0 +1,42 @@
+import pandas as pd
+from sklearn.model_selection import train_test_split
+from sklearn.metrics import accuracy_score
+import joblib
+import os
+
+from .data_ingestion import load_raw_data
+from .data_preprocessing import map_diagnosis_to_numerical, prepare_features_and_target
+from .pipeline_utils import create_breast_cancer_pipeline
+
+def train_and_save_pipeline(data_path='data/data.csv', model_path='models/model.joblib'):
+    """Train the breast cancer pipeline end to end and persist it to disk.
+
+    Steps: load the raw CSV, map the diagnosis labels to numbers, split into
+    features/target, hold out 20% for testing, fit the pipeline, print its
+    accuracy on the hold-out set, and save the fitted pipeline with joblib.
+
+    Args:
+        data_path: Location of the raw CSV dataset.
+        model_path: Destination file for the fitted pipeline. Its parent
+            directory is created when missing; a bare file name (no directory
+            part) is saved in the current working directory.
+    """
+    # Load the raw dataset
+    df_raw = load_raw_data(data_path)
+
+    # Map on a copy so df_raw stays pristine even if the mapper writes in place.
+    df_mapped = map_diagnosis_to_numerical(df_raw.copy())
+
+    # Prepare features (X) and target (y)
+    X, y = prepare_features_and_target(df_mapped)
+
+    # Fixed random_state keeps the train/test split reproducible between runs.
+    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
+
+    # Create and train the pipeline
+    pipeline = create_breast_cancer_pipeline()
+    pipeline.fit(X_train, y_train)
+
+    # Evaluate the pipeline on the held-out 20%
+    y_pred = pipeline.predict(X_test)
+    accuracy = accuracy_score(y_test, y_pred)
+    print(f"Pipeline Accuracy: {accuracy:.4f}")
+
+    # Ensure the target directory exists. os.path.dirname() returns '' for a
+    # bare file name, and os.makedirs('') raises FileNotFoundError, so only
+    # create the directory when the path actually has one.
+    model_dir = os.path.dirname(model_path)
+    if model_dir:
+        os.makedirs(model_dir, exist_ok=True)
+
+    # Save the trained pipeline using joblib
+    joblib.dump(pipeline, model_path)
+    print(f"Trained pipeline saved to {model_path}")
+
+# Script entry point: train and save the pipeline using the default data and model paths.
+if __name__ == "__main__":
+    train_and_save_pipeline()
diff --git a/src/pipeline_utils.py b/src/pipeline_utils.py
@@ -0,0 +1,22 @@
+from sklearn.pipeline import Pipeline
+from sklearn.preprocessing import FunctionTransformer
+from sklearn.ensemble import RandomForestClassifier
+import pandas as pd
+
+from .data_preprocessing import drop_unnecessary_columns
+
+def create_breast_cancer_pipeline():
+    """Build the unfitted scikit-learn pipeline used for breast cancer prediction.
+
+    Returns:
+        A ``Pipeline`` with two steps: 'preprocessor' (a nested pipeline whose
+        only step, 'drop_cols', applies ``drop_unnecessary_columns``) and
+        'classifier' (a ``RandomForestClassifier`` with ``random_state=42``).
+    """
+    # Preprocessing lives in its own sub-pipeline so further steps
+    # (e.g. scaling) can be added without touching the outer pipeline.
+    column_dropper = FunctionTransformer(drop_unnecessary_columns, validate=False)
+    preprocessor = Pipeline([('drop_cols', column_dropper)])
+
+    forest = RandomForestClassifier(random_state=42)
+    return Pipeline([
+        ('preprocessor', preprocessor),
+        ('classifier', forest),
+    ])