
Commit 8372c46

Merge pull request #2 from anibalrojosan/new-ci-quality-gates
What changed: Split CI into two jobs in .github/workflows/main.yml. The quality job installs dependencies plus QA tooling (ruff, mypy, coverage), then runs ruff check ., mypy src, and coverage run -m pytest, with the coverage report gated at --fail-under=80. The build-and-push job now depends on quality via needs: quality; it then trains the model, verifies the model artifact, builds and pushes the images, runs compose, and probes the endpoints. Errors these tools surfaced in the src/ and tests/ folders were fixed.

Why: Fail fast on lint/type/test errors to avoid building and pushing broken images; keep the runtime images clean (QA tools are not installed inside them); and provide a consistent, reproducible quality bar for all commits and PRs.

Validation: The CI "quality" job passed, and the subsequent "build-and-push" job completed successfully.
2 parents 4b7df93 + 303cb3c commit 8372c46

21 files changed: 816 additions, 238 deletions

.github/workflows/main.yml

Lines changed: 28 additions & 0 deletions
@@ -6,10 +6,38 @@ on:
     branches:
       - main
       - feature/uv-ci-test
+      - new-ci-quality-gates

 jobs:
+  quality:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Set up Python 3.12
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.12'
+
+      - name: Install Python dependencies
+        run: |
+          pip install --upgrade pip
+          pip install -r requirements.txt
+
+      - name: Install QA tooling
+        run: pip install ruff mypy coverage
+
+      - name: Lint, type-check, and test with coverage
+        run: |
+          ruff check .
+          mypy src
+          coverage run -m pytest -q
+          coverage report -m --fail-under=80
+
   build-and-push:
     runs-on: ubuntu-latest
+    needs: quality
     env:
       DOCKER_USERNAME: ${{ secrets.DOCKER_USERNAME }}

.gitignore

Lines changed: 7 additions & 0 deletions
@@ -4,6 +4,13 @@ __pycache__/
 *.log
 .venv/

+# Tools
+.mypy_cache/
+.pytest_cache/
+.ruff_cache/
+.coverage
+htmlcov/
+
 # Models
 models/

pyproject.toml

Lines changed: 16 additions & 0 deletions
@@ -16,3 +16,19 @@ dependencies = [
     "scikit-learn>=1.7.2",
     "streamlit>=1.50.0",
 ]
+
+[dependency-groups]
+dev = [
+    "coverage>=7.11.0",
+    "mypy>=1.18.2",
+    "pre-commit>=4.3.0",
+    "ruff>=0.14.1",
+]
+
+[tool.mypy]
+files = ["src"]
+explicit_package_bases = true
+
+[[tool.mypy.overrides]]
+module = ["pandas", "numpy", "sklearn.*", "joblib", "requests"]
+ignore_missing_imports = true
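
The [[tool.mypy.overrides]] block matters because these packages ship without inline type information. As a rough illustration (this function is hypothetical, not part of the repo), mypy src would otherwise stop at the import with a missing-stubs error, while the override types the package as Any and still checks the surrounding annotations:

import pandas as pd  # without the override, mypy flags this import for missing library stubs

def malignant_ratio(df: pd.DataFrame) -> float:
    # The annotations here are still type-checked; pandas values are simply treated as Any.
    return float((df["diagnosis"] == 1).mean())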

src/app.py

Lines changed: 32 additions & 21 deletions
@@ -2,26 +2,28 @@
 import joblib
 import pandas as pd
 import logging
-import sys
+import sys
 from pydantic import ValidationError
 from src.schemas import PredictRequest


 # Configure logging
-log_file_path = 'api_logs.log'
+log_file_path = "api_logs.log"
 logging.basicConfig(
     level=logging.INFO,
-    format='*** FLASK API LOG: %(asctime)s - %(levelname)s - %(message)s ***',
+    format="*** FLASK API LOG: %(asctime)s - %(levelname)s - %(message)s ***",
     handlers=[
         logging.FileHandler(log_file_path),
-        logging.StreamHandler(sys.stdout) # Explicitly use sys.stdout for terminal output
-    ]
+        logging.StreamHandler(
+            sys.stdout
+        ),  # Explicitly use sys.stdout for terminal output
+    ],
 )

 app = Flask(__name__)

 # Path to the trained model
-MODEL_PATH = 'models/model.joblib'
+MODEL_PATH = "models/model.joblib"

 try:
     model = joblib.load(MODEL_PATH)
@@ -33,62 +35,71 @@
     logging.error(f"Error loading model: {e}")
     model = None

+
 # Endpoints
-@app.route('/', methods=['GET'])
+@app.route("/", methods=["GET"])
 def health_check():
     """Health check endpoint."""
     logging.info("Health check requested.")
-    return jsonify({'status': 'healthy', 'model_loaded': model is not None}), 200
+    return jsonify({"status": "healthy", "model_loaded": model is not None}), 200
+

-@app.route('/predict', methods=['POST'])
+@app.route("/predict", methods=["POST"])
 def predict():
     logging.info("Prediction endpoint hit.")
     if model is None:
         logging.error("Prediction requested but model is not loaded.")
-        return jsonify({'error': 'Model not loaded. Please ensure the model is trained and available.'}), 500
+        return jsonify(
+            {
+                "error": "Model not loaded. Please ensure the model is trained and available."
+            }
+        ), 500

     # Parse JSON
     try:
         data = request.get_json(force=True)
     except Exception:
         logging.warning("Invalid JSON body.")
-        return jsonify({'error': 'Invalid JSON body.'}), 400
+        return jsonify({"error": "Invalid JSON body."}), 400

     # Validation using Pydantic
     try:
         payload = PredictRequest.model_validate(data)
     except ValidationError as e:
         logging.warning(f"Validation error: {e}")
-        return jsonify({'error': 'Invalid input', 'details': e.errors()}), 422
+        return jsonify({"error": "Invalid input", "details": e.errors()}), 422

     # Build DataFrame using aliases to match training feature names
     input_df = pd.DataFrame([payload.model_dump(by_alias=True)])

     # Feature alignment
-    if hasattr(model, 'feature_names_in_'):
-        expected_features = list(model.feature_names_in_) # scikit-learn feature names
+    if hasattr(model, "feature_names_in_"):
+        expected_features = list(model.feature_names_in_)  # scikit-learn feature names
         missing_features = set(expected_features) - set(input_df.columns)
         for feature in missing_features:
             input_df[feature] = 0
         input_df = input_df[expected_features]
     else:
-        logging.warning("model.feature_names_in_ not found. Relying on input JSON for feature order.")
+        logging.warning(
+            "model.feature_names_in_ not found. Relying on input JSON for feature order."
+        )

     # Inference
     try:
         prediction = model.predict(input_df)
         prediction_proba = model.predict_proba(input_df)
     except Exception as e:
         logging.error(f"Error during prediction: {e}", exc_info=True)
-        return jsonify({'error': f'An internal error occurred: {e}'}), 500
+        return jsonify({"error": f"An internal error occurred: {e}"}), 500

     result = {
-        'prediction': int(prediction[0]),
-        'probability_benign': float(prediction_proba[0][0]), # Class 0 is benign
-        'probability_malignant': float(prediction_proba[0][1]), # Class 1 is malignant
+        "prediction": int(prediction[0]),
+        "probability_benign": float(prediction_proba[0][0]),  # Class 0 is benign
+        "probability_malignant": float(prediction_proba[0][1]),  # Class 1 is malignant
     }
     logging.info(f"Prediction successful: {result}")
     return jsonify(result), 200

-if __name__ == '__main__':
-    app.run(debug=False, use_reloader=False, host='0.0.0.0', port=5000)
+
+if __name__ == "__main__":
+    app.run(debug=False, use_reloader=False, host="0.0.0.0", port=5000)
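
For reference, a minimal client call against the reworked /predict endpoint might look like the following sketch (it assumes the API is running locally on the port configured above; the full feature set is defined by PredictRequest in src/schemas.py):

import requests  # hypothetical client script, not part of this commit

payload = {
    "radius_mean": 17.99,
    "texture_mean": 10.38,
    # ... remaining PredictRequest fields omitted here; a real request likely needs
    # them all, otherwise the endpoint returns 422 with Pydantic's validation details.
}

response = requests.post("http://localhost:5000/predict", json=payload, timeout=10)
print(response.status_code)  # 200 on success, 422 on validation failure, 500 if the model is missing
print(response.json())       # e.g. {"prediction": ..., "probability_benign": ..., "probability_malignant": ...}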

src/model/data_ingestion.py

Lines changed: 3 additions & 2 deletions
@@ -1,9 +1,10 @@
 import pandas as pd

-def load_raw_data(data_path='data/data.csv'):
+
+def load_raw_data(data_path="data/data.csv"):
     """Loads the raw dataset from the specified path."""
     try:
         df = pd.read_csv(data_path)
     except pd.errors.EmptyDataError:
         return pd.DataFrame()
-    return df
+    return df
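
A quick sketch of the fallback this function provides (file paths are illustrative):

from src.model.data_ingestion import load_raw_data

df = load_raw_data()                        # reads data/data.csv by default
empty_df = load_raw_data("empty_file.csv")  # a zero-byte CSV raises EmptyDataError internally...
print(empty_df.empty)                       # ...so the caller gets an empty DataFrame (True) instead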

src/model/data_preprocessing.py

Lines changed: 12 additions & 9 deletions
@@ -1,21 +1,24 @@
 import pandas as pd

+
 def drop_unnecessary_columns(df):
     """Drops the 'id' and 'Unnamed: 32' columns from the DataFrame."""
-    return df.drop(['id', 'Unnamed: 32'], axis=1, errors='ignore')
+    return df.drop(["id", "Unnamed: 32"], axis=1, errors="ignore")
+

 def map_diagnosis_to_numerical(df):
     """Converts the 'diagnosis' column to numerical (M=1, B=0)."""
-    # Use .get() to retrieve the 'diagnosis' Series. If 'diagnosis' doesn't exist,
-    # .get() returns the default value, which we set to an empty Series with
-    # the correct index. This ensures .map() is always called on a Series,
+    # Use .get() to retrieve the 'diagnosis' Series. If 'diagnosis' doesn't exist,
+    # .get() returns the default value, which we set to an empty Series with
+    # the correct index. This ensures .map() is always called on a Series,
     # preventing KeyError.
-    diagnosis_series = df.get('diagnosis', pd.Series(index=df.index, dtype='object'))
-    df['diagnosis'] = diagnosis_series.map({'M': 1, 'B': 0})
+    diagnosis_series = df.get("diagnosis", pd.Series(index=df.index, dtype="object"))
+    df["diagnosis"] = diagnosis_series.map({"M": 1, "B": 0})
     return df

+
 def prepare_features_and_target(df):
     """Prepares features (X) and target (y) from the preprocessed DataFrame."""
-    X = df.drop('diagnosis', axis=1)
-    y = df['diagnosis']
-    return X, y
+    X = df.drop("diagnosis", axis=1)
+    y = df["diagnosis"]
+    return X, y
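
A small worked example of these helpers together (the data values are illustrative):

import pandas as pd
from src.model.data_preprocessing import (
    map_diagnosis_to_numerical,
    prepare_features_and_target,
)

df = pd.DataFrame({"diagnosis": ["M", "B", "B"], "radius_mean": [17.9, 12.1, 13.5]})
df = map_diagnosis_to_numerical(df)     # "diagnosis" becomes 1, 0, 0
X, y = prepare_features_and_target(df)  # X keeps radius_mean; y is the numeric target
print(y.tolist())                       # [1, 0, 0]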

src/model/model_inference.py

Lines changed: 47 additions & 17 deletions
@@ -3,35 +3,65 @@
 import os


-def load_pipeline(model_path='models/model.joblib'):
+def load_pipeline(model_path="models/model.joblib"):
     """Loads the trained scikit-learn pipeline."""
     if not os.path.exists(model_path):
-        raise FileNotFoundError(f"Model pipeline not found at {model_path}. Please train the model first.")
+        raise FileNotFoundError(
+            f"Model pipeline not found at {model_path}. Please train the model first."
+        )
     return joblib.load(model_path)

-def predict(raw_data, model_path='models/model.joblib'):
+
+def predict(raw_data, model_path="models/model.joblib"):
     """Loads the pipeline and makes a prediction on new raw data."""
     pipeline = load_pipeline(model_path)
     prediction = pipeline.predict(raw_data)
     return prediction

+
 if __name__ == "__main__":
-    print("This module is for inference. Please run model_training.py to train the model.")
+    print(
+        "This module is for inference. Please run model_training.py to train the model."
+    )
     try:
         # Example of new raw data (single row DataFrame)
-        sample_new_data = pd.DataFrame([{
-            'radius_mean': 17.99, 'texture_mean': 10.38,
-            'perimeter_mean': 122.8, 'area_mean': 1001.0, 'smoothness_mean': 0.1184,
-            'compactness_mean': 0.2776, 'concavity_mean': 0.3001, 'concave points_mean': 0.1471,
-            'symmetry_mean': 0.2419, 'fractal_dimension_mean': 0.07871,
-            'radius_se': 1.095, 'texture_se': 0.9053, 'perimeter_se': 8.589, 'area_se': 153.4,
-            'smoothness_se': 0.006399, 'compactness_se': 0.04904, 'concavity_se': 0.05373,
-            'concave points_se': 0.01587, 'symmetry_se': 0.03003, 'fractal_dimension_se': 0.006193,
-            'radius_worst': 25.38, 'texture_worst': 17.33, 'perimeter_worst': 184.6, 'area_worst': 2019.0,
-            'smoothness_worst': 0.1622, 'compactness_worst': 0.6656, 'concavity_worst': 0.7119,
-            'concave points_worst': 0.2654, 'symmetry_worst': 0.4601, 'fractal_dimension_worst': 0.1189
-        }])
+        sample_new_data = pd.DataFrame(
+            [
+                {
+                    "radius_mean": 17.99,
+                    "texture_mean": 10.38,
+                    "perimeter_mean": 122.8,
+                    "area_mean": 1001.0,
+                    "smoothness_mean": 0.1184,
+                    "compactness_mean": 0.2776,
+                    "concavity_mean": 0.3001,
+                    "concave points_mean": 0.1471,
+                    "symmetry_mean": 0.2419,
+                    "fractal_dimension_mean": 0.07871,
+                    "radius_se": 1.095,
+                    "texture_se": 0.9053,
+                    "perimeter_se": 8.589,
+                    "area_se": 153.4,
+                    "smoothness_se": 0.006399,
+                    "compactness_se": 0.04904,
+                    "concavity_se": 0.05373,
+                    "concave points_se": 0.01587,
+                    "symmetry_se": 0.03003,
+                    "fractal_dimension_se": 0.006193,
+                    "radius_worst": 25.38,
+                    "texture_worst": 17.33,
+                    "perimeter_worst": 184.6,
+                    "area_worst": 2019.0,
+                    "smoothness_worst": 0.1622,
+                    "compactness_worst": 0.6656,
+                    "concavity_worst": 0.7119,
+                    "concave points_worst": 0.2654,
+                    "symmetry_worst": 0.4601,
+                    "fractal_dimension_worst": 0.1189,
+                }
+            ]
+        )
         prediction = predict(sample_new_data)
         print(f"Prediction for sample data: {prediction[0]} (0: Benign, 1: Malignant)")
     except FileNotFoundError as e:
-        print(e)
+        print(e)

src/model/model_training.py

Lines changed: 12 additions & 5 deletions
@@ -1,4 +1,3 @@
-import pandas as pd
 from sklearn.model_selection import train_test_split
 from sklearn.metrics import accuracy_score
 import joblib
@@ -8,19 +7,26 @@
 from .data_preprocessing import map_diagnosis_to_numerical, prepare_features_and_target
 from .pipeline_utils import create_breast_cancer_pipeline

-def train_and_save_pipeline(data_path='data/data.csv', model_path='models/model.joblib'):
+
+def train_and_save_pipeline(
+    data_path="data/data.csv", model_path="models/model.joblib"
+):
     """Orchestrates the training process: loads data, preprocesses, trains, and saves the pipeline."""
     # Load the raw dataset
     df_raw = load_raw_data(data_path)

     # Apply diagnosis mapping before splitting features and target
-    df_mapped = map_diagnosis_to_numerical(df_raw.copy()) # Use a copy to avoid modifying original df_raw if it's used elsewhere
+    df_mapped = map_diagnosis_to_numerical(
+        df_raw.copy()
+    )  # Use a copy to avoid modifying original df_raw if it's used elsewhere

     # Prepare features (X) and target (y)
     X, y = prepare_features_and_target(df_mapped)

     # Split data into train/test sets
-    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
+    X_train, X_test, y_train, y_test = train_test_split(
+        X, y, test_size=0.2, random_state=42
+    )

     # Create and train the pipeline
     pipeline = create_breast_cancer_pipeline()
@@ -38,5 +44,6 @@ def train_and_save_pipeline(data_path='data/data.csv', model_path='models/model.
     joblib.dump(pipeline, model_path)
     print(f"Trained pipeline saved to {model_path}")

+
 if __name__ == "__main__":
-    train_and_save_pipeline()
+    train_and_save_pipeline()
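
With the quality gate enforcing coverage report --fail-under=80, modules like this one need direct test coverage. A hypothetical test along these lines (the test name and use of pytest's tmp_path fixture are assumptions, not part of this diff) would exercise the full training path:

# tests/test_model_training.py -- a sketch, not part of this commit
from pathlib import Path

from src.model.model_training import train_and_save_pipeline

def test_train_and_save_pipeline(tmp_path: Path) -> None:
    model_path = tmp_path / "model.joblib"
    train_and_save_pipeline(data_path="data/data.csv", model_path=str(model_path))
    assert model_path.exists()  # the trained pipeline artifact was written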
