
Commit 8372c46

Merge pull request #2 from anibalrojosan/new-ci-quality-gates
What changed: Split CI into two jobs in .github/workflows/main.yml. The quality job installs dependencies plus QA tooling (ruff, mypy, coverage), then runs ruff check ., mypy src, and coverage run -m pytest, with the coverage report gated at --fail-under=80. The build-and-push job now depends on quality via needs: quality; it then trains the model, verifies the model artifact, builds and pushes the images, runs compose, and probes the endpoints. Errors these tools surfaced in the src/ and tests/ folders were fixed.

Why: Fail fast on lint/type/test errors to avoid building and pushing broken images; keep the runtime images clean (QA tools are not installed inside them); and provide a consistent, reproducible quality bar for all commits and PRs.

Validation: The CI "quality" job passed, and the subsequent "build-and-push" job completed successfully.
2 parents 4b7df93 + 303cb3c commit 8372c46

21 files changed: 816 additions, 238 deletions

.github/workflows/main.yml

Lines changed: 28 additions & 0 deletions
@@ -6,10 +6,38 @@ on:
     branches:
       - main
       - feature/uv-ci-test
+      - new-ci-quality-gates

 jobs:
+  quality:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Set up Python 3.12
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.12'
+
+      - name: Install Python dependencies
+        run: |
+          pip install --upgrade pip
+          pip install -r requirements.txt
+
+      - name: Install QA tooling
+        run: pip install ruff mypy coverage
+
+      - name: Lint, type-check, and test with coverage
+        run: |
+          ruff check .
+          mypy src
+          coverage run -m pytest -q
+          coverage report -m --fail-under=80
+
   build-and-push:
     runs-on: ubuntu-latest
+    needs: quality
     env:
       DOCKER_USERNAME: ${{ secrets.DOCKER_USERNAME }}

.gitignore

Lines changed: 7 additions & 0 deletions
@@ -4,6 +4,13 @@ __pycache__/
 *.log
 .venv/

+# Tools
+.mypy_cache/
+.pytest_cache/
+.ruff_cache/
+.coverage
+htmlcov/
+
 # Models
 models/

pyproject.toml

Lines changed: 16 additions & 0 deletions
@@ -16,3 +16,19 @@ dependencies = [
     "scikit-learn>=1.7.2",
     "streamlit>=1.50.0",
 ]
+
+[dependency-groups]
+dev = [
+    "coverage>=7.11.0",
+    "mypy>=1.18.2",
+    "pre-commit>=4.3.0",
+    "ruff>=0.14.1",
+]
+
+[tool.mypy]
+files = ["src"]
+explicit_package_bases = true
+
+[[tool.mypy.overrides]]
+module = ["pandas", "numpy", "sklearn.*", "joblib", "requests"]
+ignore_missing_imports = true
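
The [[tool.mypy.overrides]] block matters because these packages ship without inline type information. As a rough illustration (this function is hypothetical, not part of the repo), mypy src would otherwise stop at the import with a missing-stubs error, while the override types the package as Any and still checks the surrounding annotations:

import pandas as pd  # without the override, mypy flags this import for missing library stubs

def malignant_ratio(df: pd.DataFrame) -> float:
    # The annotations here are still type-checked; pandas values are simply treated as Any.
    return float((df["diagnosis"] == 1).mean())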

src/app.py

Lines changed: 32 additions & 21 deletions
@@ -2,26 +2,28 @@
 import joblib
 import pandas as pd
 import logging
-import sys
+import sys
 from pydantic import ValidationError
 from src.schemas import PredictRequest


 # Configure logging
-log_file_path = 'api_logs.log'
+log_file_path = "api_logs.log"
 logging.basicConfig(
     level=logging.INFO,
-    format='*** FLASK API LOG: %(asctime)s - %(levelname)s - %(message)s ***',
+    format="*** FLASK API LOG: %(asctime)s - %(levelname)s - %(message)s ***",
     handlers=[
         logging.FileHandler(log_file_path),
-        logging.StreamHandler(sys.stdout) # Explicitly use sys.stdout for terminal output
-    ]
+        logging.StreamHandler(
+            sys.stdout
+        ),  # Explicitly use sys.stdout for terminal output
+    ],
 )

 app = Flask(__name__)

 # Path to the trained model
-MODEL_PATH = 'models/model.joblib'
+MODEL_PATH = "models/model.joblib"

 try:
     model = joblib.load(MODEL_PATH)
@@ -33,62 +35,71 @@
     logging.error(f"Error loading model: {e}")
     model = None

+
 # Endpoints
-@app.route('/', methods=['GET'])
+@app.route("/", methods=["GET"])
 def health_check():
     """Health check endpoint."""
     logging.info("Health check requested.")
-    return jsonify({'status': 'healthy', 'model_loaded': model is not None}), 200
+    return jsonify({"status": "healthy", "model_loaded": model is not None}), 200
+

-@app.route('/predict', methods=['POST'])
+@app.route("/predict", methods=["POST"])
 def predict():
     logging.info("Prediction endpoint hit.")
     if model is None:
         logging.error("Prediction requested but model is not loaded.")
-        return jsonify({'error': 'Model not loaded. Please ensure the model is trained and available.'}), 500
+        return jsonify(
+            {
+                "error": "Model not loaded. Please ensure the model is trained and available."
+            }
+        ), 500

     # Parse JSON
     try:
         data = request.get_json(force=True)
     except Exception:
         logging.warning("Invalid JSON body.")
-        return jsonify({'error': 'Invalid JSON body.'}), 400
+        return jsonify({"error": "Invalid JSON body."}), 400

     # Validation using Pydantic
     try:
         payload = PredictRequest.model_validate(data)
     except ValidationError as e:
         logging.warning(f"Validation error: {e}")
-        return jsonify({'error': 'Invalid input', 'details': e.errors()}), 422
+        return jsonify({"error": "Invalid input", "details": e.errors()}), 422

     # Build DataFrame using aliases to match training feature names
     input_df = pd.DataFrame([payload.model_dump(by_alias=True)])

     # Feature alignment
-    if hasattr(model, 'feature_names_in_'):
-        expected_features = list(model.feature_names_in_) # scikit-learn feature names
+    if hasattr(model, "feature_names_in_"):
+        expected_features = list(model.feature_names_in_)  # scikit-learn feature names
         missing_features = set(expected_features) - set(input_df.columns)
         for feature in missing_features:
             input_df[feature] = 0
         input_df = input_df[expected_features]
     else:
-        logging.warning("model.feature_names_in_ not found. Relying on input JSON for feature order.")
+        logging.warning(
+            "model.feature_names_in_ not found. Relying on input JSON for feature order."
+        )

     # Inference
     try:
         prediction = model.predict(input_df)
         prediction_proba = model.predict_proba(input_df)
     except Exception as e:
         logging.error(f"Error during prediction: {e}", exc_info=True)
-        return jsonify({'error': f'An internal error occurred: {e}'}), 500
+        return jsonify({"error": f"An internal error occurred: {e}"}), 500

     result = {
-        'prediction': int(prediction[0]),
-        'probability_benign': float(prediction_proba[0][0]), # Class 0 is benign
-        'probability_malignant': float(prediction_proba[0][1]), # Class 1 is malignant
+        "prediction": int(prediction[0]),
+        "probability_benign": float(prediction_proba[0][0]),  # Class 0 is benign
+        "probability_malignant": float(prediction_proba[0][1]),  # Class 1 is malignant
     }
     logging.info(f"Prediction successful: {result}")
     return jsonify(result), 200

-if __name__ == '__main__':
-    app.run(debug=False, use_reloader=False, host='0.0.0.0', port=5000)
+
+if __name__ == "__main__":
+    app.run(debug=False, use_reloader=False, host="0.0.0.0", port=5000)
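
For reference, a minimal client call against the reworked /predict endpoint might look like the following sketch (it assumes the API is running locally on the port configured above; the full feature set is defined by PredictRequest in src/schemas.py):

import requests  # hypothetical client script, not part of this commit

payload = {
    "radius_mean": 17.99,
    "texture_mean": 10.38,
    # ... remaining PredictRequest fields omitted here; a real request likely needs
    # them all, otherwise the endpoint returns 422 with Pydantic's validation details.
}

response = requests.post("http://localhost:5000/predict", json=payload, timeout=10)
print(response.status_code)  # 200 on success, 422 on validation failure, 500 if the model is missing
print(response.json())       # e.g. {"prediction": ..., "probability_benign": ..., "probability_malignant": ...}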

src/model/data_ingestion.py

Lines changed: 3 additions & 2 deletions
@@ -1,9 +1,10 @@
 import pandas as pd

-def load_raw_data(data_path='data/data.csv'):
+
+def load_raw_data(data_path="data/data.csv"):
     """Loads the raw dataset from the specified path."""
     try:
         df = pd.read_csv(data_path)
     except pd.errors.EmptyDataError:
         return pd.DataFrame()
-    return df
+    return df
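
A quick sketch of the fallback this function provides (file paths are illustrative):

from src.model.data_ingestion import load_raw_data

df = load_raw_data()                        # reads data/data.csv by default
empty_df = load_raw_data("empty_file.csv")  # a zero-byte CSV raises EmptyDataError internally...
print(empty_df.empty)                       # ...so the caller gets an empty DataFrame (True) instead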

src/model/data_preprocessing.py

Lines changed: 12 additions & 9 deletions
@@ -1,21 +1,24 @@
 import pandas as pd

+
 def drop_unnecessary_columns(df):
     """Drops the 'id' and 'Unnamed: 32' columns from the DataFrame."""
-    return df.drop(['id', 'Unnamed: 32'], axis=1, errors='ignore')
+    return df.drop(["id", "Unnamed: 32"], axis=1, errors="ignore")
+

 def map_diagnosis_to_numerical(df):
     """Converts the 'diagnosis' column to numerical (M=1, B=0)."""
-    # Use .get() to retrieve the 'diagnosis' Series. If 'diagnosis' doesn't exist,
-    # .get() returns the default value, which we set to an empty Series with
-    # the correct index. This ensures .map() is always called on a Series,
+    # Use .get() to retrieve the 'diagnosis' Series. If 'diagnosis' doesn't exist,
+    # .get() returns the default value, which we set to an empty Series with
+    # the correct index. This ensures .map() is always called on a Series,
     # preventing KeyError.
-    diagnosis_series = df.get('diagnosis', pd.Series(index=df.index, dtype='object'))
-    df['diagnosis'] = diagnosis_series.map({'M': 1, 'B': 0})
+    diagnosis_series = df.get("diagnosis", pd.Series(index=df.index, dtype="object"))
+    df["diagnosis"] = diagnosis_series.map({"M": 1, "B": 0})
     return df

+
 def prepare_features_and_target(df):
     """Prepares features (X) and target (y) from the preprocessed DataFrame."""
-    X = df.drop('diagnosis', axis=1)
-    y = df['diagnosis']
-    return X, y
+    X = df.drop("diagnosis", axis=1)
+    y = df["diagnosis"]
+    return X, y
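
A small worked example of these helpers together (the data values are illustrative):

import pandas as pd
from src.model.data_preprocessing import (
    map_diagnosis_to_numerical,
    prepare_features_and_target,
)

df = pd.DataFrame({"diagnosis": ["M", "B", "B"], "radius_mean": [17.9, 12.1, 13.5]})
df = map_diagnosis_to_numerical(df)     # "diagnosis" becomes 1, 0, 0
X, y = prepare_features_and_target(df)  # X keeps radius_mean; y is the numeric target
print(y.tolist())                       # [1, 0, 0]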

src/model/model_inference.py

Lines changed: 47 additions & 17 deletions
@@ -3,35 +3,65 @@
 import os


-def load_pipeline(model_path='models/model.joblib'):
+def load_pipeline(model_path="models/model.joblib"):
     """Loads the trained scikit-learn pipeline."""
     if not os.path.exists(model_path):
-        raise FileNotFoundError(f"Model pipeline not found at {model_path}. Please train the model first.")
+        raise FileNotFoundError(
+            f"Model pipeline not found at {model_path}. Please train the model first."
+        )
     return joblib.load(model_path)

-def predict(raw_data, model_path='models/model.joblib'):
+
+def predict(raw_data, model_path="models/model.joblib"):
     """Loads the pipeline and makes a prediction on new raw data."""
     pipeline = load_pipeline(model_path)
     prediction = pipeline.predict(raw_data)
     return prediction

+
 if __name__ == "__main__":
-    print("This module is for inference. Please run model_training.py to train the model.")
+    print(
+        "This module is for inference. Please run model_training.py to train the model."
+    )
     try:
         # Example of new raw data (single row DataFrame)
-        sample_new_data = pd.DataFrame([{
-            'radius_mean': 17.99, 'texture_mean': 10.38,
-            'perimeter_mean': 122.8, 'area_mean': 1001.0, 'smoothness_mean': 0.1184,
-            'compactness_mean': 0.2776, 'concavity_mean': 0.3001, 'concave points_mean': 0.1471,
-            'symmetry_mean': 0.2419, 'fractal_dimension_mean': 0.07871,
-            'radius_se': 1.095, 'texture_se': 0.9053, 'perimeter_se': 8.589, 'area_se': 153.4,
-            'smoothness_se': 0.006399, 'compactness_se': 0.04904, 'concavity_se': 0.05373,
-            'concave points_se': 0.01587, 'symmetry_se': 0.03003, 'fractal_dimension_se': 0.006193,
-            'radius_worst': 25.38, 'texture_worst': 17.33, 'perimeter_worst': 184.6, 'area_worst': 2019.0,
-            'smoothness_worst': 0.1622, 'compactness_worst': 0.6656, 'concavity_worst': 0.7119,
-            'concave points_worst': 0.2654, 'symmetry_worst': 0.4601, 'fractal_dimension_worst': 0.1189
-        }])
+        sample_new_data = pd.DataFrame(
+            [
+                {
+                    "radius_mean": 17.99,
+                    "texture_mean": 10.38,
+                    "perimeter_mean": 122.8,
+                    "area_mean": 1001.0,
+                    "smoothness_mean": 0.1184,
+                    "compactness_mean": 0.2776,
+                    "concavity_mean": 0.3001,
+                    "concave points_mean": 0.1471,
+                    "symmetry_mean": 0.2419,
+                    "fractal_dimension_mean": 0.07871,
+                    "radius_se": 1.095,
+                    "texture_se": 0.9053,
+                    "perimeter_se": 8.589,
+                    "area_se": 153.4,
+                    "smoothness_se": 0.006399,
+                    "compactness_se": 0.04904,
+                    "concavity_se": 0.05373,
+                    "concave points_se": 0.01587,
+                    "symmetry_se": 0.03003,
+                    "fractal_dimension_se": 0.006193,
+                    "radius_worst": 25.38,
+                    "texture_worst": 17.33,
+                    "perimeter_worst": 184.6,
+                    "area_worst": 2019.0,
+                    "smoothness_worst": 0.1622,
+                    "compactness_worst": 0.6656,
+                    "concavity_worst": 0.7119,
+                    "concave points_worst": 0.2654,
+                    "symmetry_worst": 0.4601,
+                    "fractal_dimension_worst": 0.1189,
+                }
+            ]
+        )
         prediction = predict(sample_new_data)
         print(f"Prediction for sample data: {prediction[0]} (0: Benign, 1: Malignant)")
     except FileNotFoundError as e:
-        print(e)
+        print(e)

src/model/model_training.py

Lines changed: 12 additions & 5 deletions
@@ -1,4 +1,3 @@
-import pandas as pd
 from sklearn.model_selection import train_test_split
 from sklearn.metrics import accuracy_score
 import joblib
@@ -8,19 +7,26 @@
 from .data_preprocessing import map_diagnosis_to_numerical, prepare_features_and_target
 from .pipeline_utils import create_breast_cancer_pipeline

-def train_and_save_pipeline(data_path='data/data.csv', model_path='models/model.joblib'):
+
+def train_and_save_pipeline(
+    data_path="data/data.csv", model_path="models/model.joblib"
+):
     """Orchestrates the training process: loads data, preprocesses, trains, and saves the pipeline."""
     # Load the raw dataset
     df_raw = load_raw_data(data_path)

     # Apply diagnosis mapping before splitting features and target
-    df_mapped = map_diagnosis_to_numerical(df_raw.copy()) # Use a copy to avoid modifying original df_raw if it's used elsewhere
+    df_mapped = map_diagnosis_to_numerical(
+        df_raw.copy()
+    )  # Use a copy to avoid modifying original df_raw if it's used elsewhere

     # Prepare features (X) and target (y)
     X, y = prepare_features_and_target(df_mapped)

     # Split data into train/test sets
-    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
+    X_train, X_test, y_train, y_test = train_test_split(
+        X, y, test_size=0.2, random_state=42
+    )

     # Create and train the pipeline
     pipeline = create_breast_cancer_pipeline()
@@ -38,5 +44,6 @@ def train_and_save_pipeline(data_path='data/data.csv', model_path='models/model.
     joblib.dump(pipeline, model_path)
     print(f"Trained pipeline saved to {model_path}")

+
 if __name__ == "__main__":
-    train_and_save_pipeline()
+    train_and_save_pipeline()
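
With the quality gate enforcing coverage report --fail-under=80, modules like this one need direct test coverage. A hypothetical test along these lines (the test name and use of pytest's tmp_path fixture are assumptions, not part of this diff) would exercise the full training path:

# tests/test_model_training.py -- a sketch, not part of this commit
from pathlib import Path

from src.model.model_training import train_and_save_pipeline

def test_train_and_save_pipeline(tmp_path: Path) -> None:
    model_path = tmp_path / "model.joblib"
    train_and_save_pipeline(data_path="data/data.csv", model_path=str(model_path))
    assert model_path.exists()  # the trained pipeline artifact was written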
