Skip to content

Commit 7fb4919

Browse files
committed
feat(data_preprocessing): Add unit tests and improve diagnosis mapping
- Implemented unit tests for the map_diagnosis_to_numerical function. - Added test cases covering successful mapping, handling of missing 'diagnosis' column, and empty DataFrames. - Fixed map_diagnosis_to_numerical to gracefully handle input DataFrames missing the 'diagnosis' column by using .get(), preventing KeyError. - Refactored pytest fixtures (sample_data_1, sample_data_2, empty_data) into tests/conftest.py for shared use across test files.
1 parent acf8cf3 commit 7fb4919

File tree

4 files changed

+34
-5
lines changed

4 files changed

+34
-5
lines changed

README.md

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -25,11 +25,18 @@ breast-cancer-ops/
2525
│ │ └── pipeline_utils.py # Defines the scikit-learn pipeline structure
2626
│ └── streamlit_app.py # Streamlit user interface for predictions
2727
├── tests/ # For unit and integration tests
28+
│ ├── __init__.py # Makes 'tests' a Python package
29+
│ ├── conftest.py # Shared pytest fixtures across test files
2830
│ ├── bash_test.sh
2931
│ ├── powershell_test.ps1
32+
│ ├── sample_payload.json # For API/integration tests
3033
│ ├── unit/
31-
│ │ └── test_data_ingestion.py
32-
│ └── sample_payload.json
34+
│ │ ├── __init__.py # Makes 'unit' a Python package
35+
│ │ ├── dat-ingestion/ # Tests for src/model/dat-ingestion.py
36+
│ │ │ └── test_dat-ingestion.py
37+
│ │ ├── data_preprocessing/ # Tests for src/model/data_preprocessing.py
38+
│ │ │ ├── test_drop_unnecessary_columns.py
39+
│ │ │ └── test_map_diagnosis_to_numerical.py
3340
├── pytest.ini # pytest configuration
3441
├── README.md # Project documentation
3542
└── requirements.txt # Python dependencies

src/model/data_preprocessing.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,12 @@ def drop_unnecessary_columns(df):
66

77
def map_diagnosis_to_numerical(df):
88
"""Converts the 'diagnosis' column to numerical (M=1, B=0)."""
9-
df['diagnosis'] = df['diagnosis'].map({'M': 1, 'B': 0})
9+
# Use .get() to retrieve the 'diagnosis' Series. If 'diagnosis' doesn't exist,
10+
# .get() returns the default value, which we set to an empty Series with
11+
# the correct index. This ensures .map() is always called on a Series,
12+
# preventing KeyError.
13+
diagnosis_series = df.get('diagnosis', pd.Series(index=df.index, dtype='object'))
14+
df['diagnosis'] = diagnosis_series.map({'M': 1, 'B': 0})
1015
return df
1116

1217
def prepare_features_and_target(df):

tests/conftest.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ def sample_data_1():
1818
def sample_data_2():
1919
'Provides a sample dataframe for testing preprocessing functions'
2020
data = {
21-
'diagnosis': ['M', 'B', 'M'],
21+
# 'diagnosis': ['M', 'B', 'M'],
2222
'feature_1': [0.1, 0.2, 0.3],
2323
'feature_2': [0.4, 0.5, 0.6],
2424
'feature_3': [0.7, 0.8, 0.9],

tests/unit/data_preprocessing/test_map_diagnosis_to_numerical.py

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,4 +18,21 @@ def test_map_diagnosis_to_numerical_success(sample_data_1):
1818
# Assert that the other columns remain the same
1919
assert result['feature_1'].equals(sample_data_1['feature_1'])
2020
assert result['feature_2'].equals(sample_data_1['feature_2'])
21-
assert result['feature_3'].equals(sample_data_1['feature_3'])
21+
assert result['feature_3'].equals(sample_data_1['feature_3'])
22+
23+
def test_map_diagnosis_to_numerical_missing(sample_data_2):
24+
'''Test the behavior when the diagnosis column is missing in the dataframe.
25+
The diagnosis column should be created and filled with NaN.'''
26+
df = sample_data_2.copy()
27+
result = map_diagnosis_to_numerical(df)
28+
29+
# Assert that the 'diagnosis' column is created
30+
assert 'diagnosis' in result.columns
31+
32+
# Assert that the 'diagnosis' column is filled with NaN
33+
assert result['diagnosis'].isnull().all()
34+
35+
# Assert that the other columns remain the same
36+
assert result['feature_1'].equals(sample_data_2['feature_1'])
37+
assert result['feature_2'].equals(sample_data_2['feature_2'])
38+
assert result['feature_3'].equals(sample_data_2['feature_3'])

0 commit comments

Comments
 (0)