本次Kaggle月赛是一个二分类任务,需要根据个人的健康指标和生活习惯预测其是否患有糖尿病。
- 训练集:data/train.csv(包含特征和标签)
- 测试集:data/test.csv(仅包含特征)
- 提交样例:data/sample_submission.csv
预测 diagnosed_diabetes 字段(0: 未患病, 1: 患病)
在kaggle环境下安装必备的包
conda activate kaggle # Linux/Mac
pip install pandas numpy scikit-learn matplotlib seaborn
pip install lightgbm xgboost catboost
pip install optuna  # 用于超参数优化

创建 EDA.ipynb 进行以下分析:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# 加载数据
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')
# 基本统计
print(train.info())
print(train.describe())
# 查看目标变量分布
print(train['diagnosed_diabetes'].value_counts(normalize=True))
# 特征类型分析
numeric_features = ['age', 'alcohol_consumption_per_week', 'physical_activity_minutes_per_week',
'diet_score', 'sleep_hours_per_day', 'screen_time_hours_per_day', 'bmi',
'waist_to_hip_ratio', 'systolic_bp', 'diastolic_bp', 'heart_rate',
'cholesterol_total', 'hdl_cholesterol', 'ldl_cholesterol', 'triglycerides']
categorical_features = ['gender', 'ethnicity', 'education_level', 'income_level',
'smoking_status', 'employment_status', 'family_history_diabetes',
'hypertension_history', 'cardiovascular_history']创建 feature_engineering.py:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
class FeatureEngineer:
def __init__(self):
self.scalers = {}
self.encoders = {}
def fit(self, train_df):
"""在训练集上拟合所有转换器"""
df = train_df.copy()
# 数值特征标准化
numeric_features = ['age', 'alcohol_consumption_per_week', 'physical_activity_minutes_per_week',
'diet_score', 'sleep_hours_per_day', 'screen_time_hours_per_day', 'bmi',
'waist_to_hip_ratio', 'systolic_bp', 'diastolic_bp', 'heart_rate',
'cholesterol_total', 'hdl_cholesterol', 'ldl_cholesterol', 'triglycerides']
scaler = StandardScaler()
scaler.fit(df[numeric_features])
self.scalers['numeric'] = scaler
# 分类特征编码
categorical_features = ['gender', 'ethnicity', 'education_level', 'income_level',
'smoking_status', 'employment_status']
for col in categorical_features:
le = LabelEncoder()
le.fit(df[col])
self.encoders[col] = le
return self
def transform(self, df):
"""应用特征转换"""
df = df.copy()
# 数值特征标准化
numeric_features = ['age', 'alcohol_consumption_per_week', 'physical_activity_minutes_per_week',
'diet_score', 'sleep_hours_per_day', 'screen_time_hours_per_day', 'bmi',
'waist_to_hip_ratio', 'systolic_bp', 'diastolic_bp', 'heart_rate',
'cholesterol_total', 'hdl_cholesterol', 'ldl_cholesterol', 'triglycerides']
df[numeric_features] = self.scalers['numeric'].transform(df[numeric_features])
# 分类特征编码
categorical_features = ['gender', 'ethnicity', 'education_level', 'income_level',
'smoking_status', 'employment_status']
for col in categorical_features:
if col in df.columns:
df[col] = self.encoders[col].transform(df[col])
return df
def create_interaction_features(self, df):
"""创建交互特征"""
df = df.copy()
# BMI类别
df['bmi_category'] = pd.cut(df['bmi'],
bins=[0, 18.5, 24.9, 29.9, 100],
labels=['underweight', 'normal', 'overweight', 'obese'])
# 年龄分组
df['age_group'] = pd.cut(df['age'],
bins=[0, 30, 45, 60, 100],
labels=['young', 'middle', 'senior', 'elderly'])
# 血压分类
df['bp_category'] = np.where(
(df['systolic_bp'] < 120) & (df['diastolic_bp'] < 80), 'normal',
np.where(
(df['systolic_bp'] < 140) & (df['diastolic_bp'] < 90), 'elevated', 'high'
)
)
# 胆固醇比率
df['cholesterol_ratio'] = df['ldl_cholesterol'] / df['hdl_cholesterol']
return df创建 model_training.py:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.metrics import roc_auc_score, classification_report
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostClassifier
import optuna
def train_models(X_train, y_train):
    """Instantiate the candidate boosting classifiers, keyed by name.

    Note: the returned models are unfitted; X_train/y_train are accepted
    for interface symmetry, and fitting happens in the caller.
    """
    shared_seed = 42
    return {
        'lightgbm': lgb.LGBMClassifier(
            n_estimators=1000,
            learning_rate=0.05,
            num_leaves=31,
            subsample=0.8,
            colsample_bytree=0.8,
            random_state=shared_seed,
        ),
        'xgboost': xgb.XGBClassifier(
            n_estimators=1000,
            learning_rate=0.05,
            max_depth=6,
            subsample=0.8,
            colsample_bytree=0.8,
            random_state=shared_seed,
        ),
        'catboost': CatBoostClassifier(
            iterations=1000,
            learning_rate=0.05,
            depth=6,
            random_seed=shared_seed,
            verbose=False,
        ),
    }
def cross_validate_model(model, X, y, cv=5):
    """Stratified k-fold cross-validation; returns (mean, std) of ROC-AUC."""
    folds = StratifiedKFold(n_splits=cv, shuffle=True, random_state=42)
    auc_scores = cross_val_score(model, X, y, cv=folds, scoring='roc_auc')
    return auc_scores.mean(), auc_scores.std()
def optimize_hyperparameters(X, y, n_trials=50):
"""使用Optuna优化超参数"""
def objective(trial):
params = {
'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
'num_leaves': trial.suggest_int('num_leaves', 20, 50),
'subsample': trial.suggest_float('subsample', 0.6, 1.0),
'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
}
model = lgb.LGBMClassifier(**params, random_state=42)
score, _ = cross_validate_model(model, X, y)
return score
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=n_trials)
return study.best_params创建 train.py:
import pandas as pd
import numpy as np
from feature_engineering import FeatureEngineer
from model_training import train_models, cross_validate_model
import warnings
warnings.filterwarnings('ignore')


def main():
    """End-to-end pipeline: load -> features -> CV -> fit -> submission file."""
    # Load data.
    train = pd.read_csv('data/train.csv')
    test = pd.read_csv('data/test.csv')

    # Fit encoders/scalers on the raw training data.
    fe = FeatureEngineer()
    fe.fit(train)

    # Add engineered features, then apply the fitted transforms.
    train = fe.create_interaction_features(train)
    test = fe.create_interaction_features(test)
    train_processed = fe.transform(train)
    test_processed = fe.transform(test)

    # Feature matrix / target; drop the id and the label columns.
    feature_cols = [col for col in train_processed.columns
                    if col not in ['id', 'diagnosed_diabetes']]
    X_train = train_processed[feature_cols]
    y_train = train_processed['diagnosed_diabetes']
    X_test = test_processed[feature_cols]

    # Instantiate candidate models and compare them by cross-validated AUC.
    models = train_models(X_train, y_train)
    print("模型交叉验证结果:")
    for name, model in models.items():
        mean_score, std_score = cross_validate_model(model, X_train, y_train)
        print(f"{name}: {mean_score:.4f} (+/- {std_score:.4f})")

    # Fit the chosen model (LightGBM) on the full training data.
    best_model = models['lightgbm']
    best_model.fit(X_train, y_train)

    # Positive-class probabilities for the test set.
    predictions = best_model.predict_proba(X_test)[:, 1]

    # BUG FIX: the competition metric is AUC-ROC, which scores the *ranking*
    # of probabilities. The original code thresholded predictions to 0/1 at
    # 0.5 before submitting, discarding that ranking information and hurting
    # the score — submit the raw probabilities instead.
    submission = pd.DataFrame({
        'id': test['id'],
        'diagnosed_diabetes': predictions
    })
    submission.to_csv('submission.csv', index=False)
    print("提交文件已创建:submission.csv")

    # Report the most influential features for inspection.
    feature_importance = pd.DataFrame({
        'feature': feature_cols,
        'importance': best_model.feature_importances_
    }).sort_values('importance', ascending=False)
    print("\n特征重要性前10:")
    print(feature_importance.head(10))


if __name__ == "__main__":
    main()

# 创建集成预测
def ensemble_predict(models, X_test):
    """Average the positive-class probabilities over all fitted models.

    models: dict of fitted classifiers exposing predict_proba.
    Returns a 1-D array of mean probabilities, one per row of X_test.
    """
    per_model_probs = []
    for model in models.values():
        per_model_probs.append(model.predict_proba(X_test)[:, 1])
    return np.mean(per_model_probs, axis=0)


def pseudo_labeling(model, X_train, y_train, X_test, threshold=0.9):
    """Augment the training data with high-confidence test-set predictions.

    Rows whose predicted positive probability is above `threshold`
    (confident 1s) or below 1 - threshold (confident 0s) are appended to
    the training data with their predicted label. Returns (X_aug, y_aug).

    BUG FIX: both concats now use ignore_index=True — previously the
    augmented frames carried duplicated/misaligned indices copied from
    X_train and X_test (positionally the data is unchanged).
    """
    # Fit on the labeled data, then score the unlabeled test rows.
    model.fit(X_train, y_train)
    test_probs = model.predict_proba(X_test)[:, 1]
    test_preds = (test_probs > threshold).astype(int)
    # Keep only high-confidence rows (very high or very low probability).
    high_conf_idx = np.where((test_probs > threshold) | (test_probs < 1 - threshold))[0]
    pseudo_X = X_test.iloc[high_conf_idx]
    pseudo_y = test_preds[high_conf_idx]
    # Merge labeled and pseudo-labeled data with a fresh, contiguous index.
    X_aug = pd.concat([X_train, pseudo_X], ignore_index=True)
    y_aug = pd.concat([y_train, pd.Series(pseudo_y)], ignore_index=True)
    return X_aug, y_aug

from sklearn.model_selection import GridSearchCV
param_grid = {
'n_estimators': [500, 1000, 1500],
'learning_rate': [0.01, 0.05, 0.1],
'max_depth': [4, 6, 8],
'subsample': [0.8, 0.9, 1.0]
}
grid_search = GridSearchCV(
estimator=lgb.LGBMClassifier(random_state=42),
param_grid=param_grid,
cv=5,
scoring='roc_auc',
n_jobs=-1
)diabetes_prediction/
├── data/
│ ├── train.csv
│ ├── test.csv
│ └── sample_submission.csv
├── src/
│ ├── __init__.py
│ ├── feature_engineering.py
│ ├── model_training.py
│ └── utils.py
├── notebooks/
│ ├── EDA.ipynb
│ └── model_experiments.ipynb
├── train.py
├── predict.py
├── requirements.txt
└── README.md
- 创建交互特征(如BMI × 年龄)
- 使用领域知识创建医学特征
- 处理异常值和缺失值
- 特征选择:删除不重要的特征
- 尝试不同的算法(Random Forest, Neural Networks)
- 使用Stacking/Blending集成多个模型
- 超参数优化(Optuna, Hyperopt)
- 使用交叉验证获得稳健的结果
- 过采样/欠采样处理类别不平衡
- SMOTE生成合成样本
- 伪标签技术利用未标记数据
- 调整分类阈值
- 使用校准(Calibration)优化概率输出
- 后处理规则(基于医学知识)
- 准备环境
pip install -r requirements.txt

- 运行训练
python train.py

- 生成预测
python predict.py

- 提交结果
将 submission.csv 提交到Kaggle
本项目使用以下指标:
- AUC-ROC: 主要评估指标
- 准确率 (Accuracy)
- 精确率 (Precision)
- 召回率 (Recall)
- F1分数
建议使用:
- Git进行版本控制
- MLflow或Weights & Biases跟踪实验
- 保存每次实验的配置和结果
- 验证数据加载
assert train.shape[0] > 0
assert test.shape[0] > 0- 检查特征一致性
assert set(train.columns) - {'diagnosed_diabetes'} == set(test.columns)- 监控过拟合
# 检查训练集和验证集的性能差异
train_score = model.score(X_train, y_train)
val_score = model.score(X_val, y_val)
print(f"训练集分数: {train_score:.4f}")
print(f"验证集分数: {val_score:.4f}")-
第一阶段:建立基线模型
- 简单的特征工程
- 使用LightGBM/XGBoost
- 获得基准分数
-
第二阶段:特征优化
- 深入的EDA
- 创建新特征
- 特征选择
-
第三阶段:模型优化
- 集成学习
- 超参数调优
- 高级技巧
-
最后阶段:微调
- 后处理优化
- 提交多次找最佳结果
祝你在比赛中取得好成绩!🎉