Skip to content

yry1997/diabetes_prediction

Repository files navigation

Diabetes Prediction Challenge - 完整实施指南

📋 比赛概述

本次Kaggle月赛是一个二分类任务,需要根据个人的健康指标和生活习惯预测其是否患有糖尿病。

数据集信息

  • 训练集: data/train.csv (包含特征和标签)
  • 测试集: data/test.csv (仅包含特征)
  • 提交样例: data/sample_submission.csv

任务目标

预测 diagnosed_diabetes 字段(0: 未患病, 1: 患病)


🎯 实施步骤

1. 环境准备

在kaggle环境下安装必备的包

conda activate kaggle  # Linux/Mac

pip install pandas numpy scikit-learn matplotlib seaborn
pip install lightgbm xgboost catboost
pip install optuna  # 用于超参数优化

2. 数据探索性分析 (EDA)

创建 EDA.ipynb 进行以下分析:

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load the competition splits
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

# Schema and summary statistics
print(train.info())
print(train.describe())

# Class balance of the prediction target
print(train['diagnosed_diabetes'].value_counts(normalize=True))

# Continuous health / lifestyle measurements
numeric_features = [
    'age', 'alcohol_consumption_per_week', 'physical_activity_minutes_per_week',
    'diet_score', 'sleep_hours_per_day', 'screen_time_hours_per_day',
    'bmi', 'waist_to_hip_ratio', 'systolic_bp', 'diastolic_bp',
    'heart_rate', 'cholesterol_total', 'hdl_cholesterol',
    'ldl_cholesterol', 'triglycerides',
]

# Discrete demographic / medical-history fields
categorical_features = [
    'gender', 'ethnicity', 'education_level', 'income_level',
    'smoking_status', 'employment_status', 'family_history_diabetes',
    'hypertension_history', 'cardiovascular_history',
]

3. 特征工程

创建 feature_engineering.py

import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler

class FeatureEngineer:
    """Fit scaling/encoding on the training split and apply it to any split.

    Usage: call ``fit(train)`` once, then ``transform(df)`` on both the
    training and the test frames. ``create_interaction_features`` derives
    bin/ratio features and should be applied to the raw (unscaled) data.
    """

    # Single source of truth for the feature lists. Previously these were
    # duplicated verbatim inside fit() and transform(), which risked the
    # two copies silently drifting apart.
    NUMERIC_FEATURES = [
        'age', 'alcohol_consumption_per_week', 'physical_activity_minutes_per_week',
        'diet_score', 'sleep_hours_per_day', 'screen_time_hours_per_day', 'bmi',
        'waist_to_hip_ratio', 'systolic_bp', 'diastolic_bp', 'heart_rate',
        'cholesterol_total', 'hdl_cholesterol', 'ldl_cholesterol', 'triglycerides']
    CATEGORICAL_FEATURES = [
        'gender', 'ethnicity', 'education_level', 'income_level',
        'smoking_status', 'employment_status']

    def __init__(self):
        self.scalers = {}   # name -> fitted scaler
        self.encoders = {}  # column -> fitted LabelEncoder

    def fit(self, train_df):
        """Fit all transformers on the training set. Returns self."""
        df = train_df.copy()

        # Standardise the continuous measurements
        scaler = StandardScaler()
        scaler.fit(df[self.NUMERIC_FEATURES])
        self.scalers['numeric'] = scaler

        # One LabelEncoder per categorical column
        for col in self.CATEGORICAL_FEATURES:
            le = LabelEncoder()
            le.fit(df[col])
            self.encoders[col] = le

        return self

    def transform(self, df):
        """Apply the fitted transformations; returns a new DataFrame."""
        df = df.copy()

        df[self.NUMERIC_FEATURES] = self.scalers['numeric'].transform(
            df[self.NUMERIC_FEATURES])

        for col in self.CATEGORICAL_FEATURES:
            if col in df.columns:
                le = self.encoders[col]
                # BUG FIX: LabelEncoder.transform raises ValueError on
                # categories never seen during fit (possible in the test
                # split). Map known classes to their encoder index and
                # unseen values to -1 instead. Known values get the same
                # codes LabelEncoder.transform would have produced.
                mapping = {cls: idx for idx, cls in enumerate(le.classes_)}
                df[col] = df[col].map(lambda v: mapping.get(v, -1))

        return df

    def create_interaction_features(self, df):
        """Derive bin/ratio features from the raw (unscaled) columns."""
        df = df.copy()

        # WHO-style BMI bands
        df['bmi_category'] = pd.cut(df['bmi'],
                                   bins=[0, 18.5, 24.9, 29.9, 100],
                                   labels=['underweight', 'normal', 'overweight', 'obese'])

        # Coarse age bands
        df['age_group'] = pd.cut(df['age'],
                                bins=[0, 30, 45, 60, 100],
                                labels=['young', 'middle', 'senior', 'elderly'])

        # Blood-pressure category from systolic/diastolic cut-offs
        df['bp_category'] = np.where(
            (df['systolic_bp'] < 120) & (df['diastolic_bp'] < 80), 'normal',
            np.where(
                (df['systolic_bp'] < 140) & (df['diastolic_bp'] < 90), 'elevated', 'high'
            )
        )

        # LDL/HDL ratio. NOTE(review): assumes hdl_cholesterol is never 0;
        # a zero would yield inf — confirm against the data.
        df['cholesterol_ratio'] = df['ldl_cholesterol'] / df['hdl_cholesterol']

        return df

4. 模型训练与验证

创建 model_training.py

import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.metrics import roc_auc_score, classification_report
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostClassifier
import optuna

def train_models(X_train, y_train):
    """Return a dict of configured (unfitted) gradient-boosting classifiers.

    The data arguments are kept for interface compatibility; the models are
    only constructed here, not trained.
    """
    return {
        'lightgbm': lgb.LGBMClassifier(
            n_estimators=1000,
            learning_rate=0.05,
            num_leaves=31,
            subsample=0.8,
            colsample_bytree=0.8,
            random_state=42,
        ),
        'xgboost': xgb.XGBClassifier(
            n_estimators=1000,
            learning_rate=0.05,
            max_depth=6,
            subsample=0.8,
            colsample_bytree=0.8,
            random_state=42,
        ),
        'catboost': CatBoostClassifier(
            iterations=1000,
            learning_rate=0.05,
            depth=6,
            random_seed=42,
            verbose=False,
        ),
    }

def cross_validate_model(model, X, y, cv=5):
    """Evaluate `model` with stratified k-fold CV; return (mean, std) ROC-AUC."""
    splitter = StratifiedKFold(n_splits=cv, shuffle=True, random_state=42)
    auc_scores = cross_val_score(model, X, y, cv=splitter, scoring='roc_auc')
    return auc_scores.mean(), auc_scores.std()

def optimize_hyperparameters(X, y, n_trials=50):
    """Tune LightGBM hyperparameters with Optuna, maximising CV ROC-AUC.

    Returns the best parameter dict found after `n_trials` trials.
    """
    def objective(trial):
        # Sample one candidate configuration from the search space
        candidate = {
            'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
            'num_leaves': trial.suggest_int('num_leaves', 20, 50),
            'subsample': trial.suggest_float('subsample', 0.6, 1.0),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        }
        mean_auc, _ = cross_validate_model(
            lgb.LGBMClassifier(**candidate, random_state=42), X, y)
        return mean_auc

    study = optuna.create_study(direction='maximize')
    study.optimize(objective, n_trials=n_trials)
    return study.best_params

5. 完整训练脚本

创建 train.py

import pandas as pd
import numpy as np
from feature_engineering import FeatureEngineer
from model_training import train_models, cross_validate_model
import warnings
warnings.filterwarnings('ignore')

def main():
    """End-to-end pipeline: load data, engineer features, cross-validate the
    candidate models, fit the chosen one and write the Kaggle submission."""
    # Load data
    train = pd.read_csv('data/train.csv')
    test = pd.read_csv('data/test.csv')

    # Fit scalers/encoders on the raw training data
    fe = FeatureEngineer()
    fe.fit(train)

    # Create interaction features before scaling, so the BMI/age/BP bins
    # are computed on raw values
    train = fe.create_interaction_features(train)
    test = fe.create_interaction_features(test)

    # Apply the fitted transformations
    train_processed = fe.transform(train)
    test_processed = fe.transform(test)

    # Split features and target
    feature_cols = [col for col in train_processed.columns
                   if col not in ['id', 'diagnosed_diabetes']]

    X_train = train_processed[feature_cols].copy()
    y_train = train_processed['diagnosed_diabetes']
    X_test = test_processed[feature_cols].copy()

    # BUG FIX: the interaction features ('bmi_category', 'age_group',
    # 'bp_category') are category/object dtype, which XGBoost and sklearn's
    # cross_val_score cannot consume. Encode them as integer codes, fixing
    # the category list from the training set so train/test codes agree.
    for col in feature_cols:
        if X_train[col].dtype == object or str(X_train[col].dtype) == 'category':
            cats = pd.CategoricalDtype(sorted(X_train[col].astype(str).unique()))
            X_train[col] = X_train[col].astype(str).astype(cats).cat.codes
            X_test[col] = X_test[col].astype(str).astype(cats).cat.codes

    # Build the candidate models
    models = train_models(X_train, y_train)

    # Cross-validated comparison
    print("模型交叉验证结果:")
    for name, model in models.items():
        mean_score, std_score = cross_validate_model(model, X_train, y_train)
        print(f"{name}: {mean_score:.4f} (+/- {std_score:.4f})")

    # Fit the selected model (LightGBM) on the full training set
    best_model = models['lightgbm']
    best_model.fit(X_train, y_train)

    # Predict positive-class probabilities for the test set
    predictions = best_model.predict_proba(X_test)[:, 1]

    # BUG FIX: the competition is scored on AUC-ROC, so submit the raw
    # probabilities. The previous `(p > 0.5).astype(int)` threshold collapsed
    # the ranking to hard labels and badly hurts the AUC score.
    submission = pd.DataFrame({
        'id': test['id'],
        'diagnosed_diabetes': predictions
    })
    submission.to_csv('submission.csv', index=False)
    print("提交文件已创建:submission.csv")

    # Report the most informative features
    feature_importance = pd.DataFrame({
        'feature': feature_cols,
        'importance': best_model.feature_importances_
    }).sort_values('importance', ascending=False)

    print("\n特征重要性前10:")
    print(feature_importance.head(10))

if __name__ == "__main__":
    main()

6. 高级技巧

6.1 集成学习

# 创建集成预测
def ensemble_predict(models, X_test):
    """Average the positive-class probabilities across all models in `models`."""
    per_model_probs = [m.predict_proba(X_test)[:, 1] for m in models.values()]
    return np.mean(per_model_probs, axis=0)

6.2 伪标签技术(半监督学习)

def pseudo_labeling(model, X_train, y_train, X_test, threshold=0.9):
    """Augment training data with high-confidence test predictions.

    Fits `model` on the labelled data, predicts the test set, and appends
    the rows the model is confident about — probability above `threshold`
    (labelled 1) or below `1 - threshold` (labelled 0).

    Returns the augmented (X, y) pair with a fresh, duplicate-free index.
    """
    # Fit on the labelled training data
    model.fit(X_train, y_train)

    # Positive-class probability for every test row
    test_probs = model.predict_proba(X_test)[:, 1]
    test_preds = (test_probs > threshold).astype(int)

    # Keep only the confident tails: clearly positive or clearly negative
    high_conf_idx = np.where((test_probs > threshold) | (test_probs < 1-threshold))[0]
    pseudo_X = X_test.iloc[high_conf_idx]
    pseudo_y = test_preds[high_conf_idx]

    # BUG FIX: concatenating without ignore_index produced duplicate index
    # labels (train's index collides with the pseudo-label Series' default
    # 0..k-1 index), which silently misaligns later .loc-based operations.
    X_aug = pd.concat([X_train, pseudo_X], ignore_index=True)
    y_aug = pd.concat([y_train, pd.Series(pseudo_y)], ignore_index=True)

    return X_aug, y_aug

6.3 模型调优

from sklearn.model_selection import GridSearchCV

# Exhaustive LightGBM hyperparameter grid: 3*3*3*3 = 81 combinations,
# each scored with 5-fold cross-validation below.
param_grid = {
    'n_estimators': [500, 1000, 1500],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [4, 6, 8],
    'subsample': [0.8, 0.9, 1.0]
}

# ROC-AUC-scored grid search using all CPU cores (n_jobs=-1).
# NOTE(review): this only configures the search — call
# grid_search.fit(X_train, y_train) to actually run it.
grid_search = GridSearchCV(
    estimator=lgb.LGBMClassifier(random_state=42),
    param_grid=param_grid,
    cv=5,
    scoring='roc_auc',
    n_jobs=-1
)

📁 项目结构

diabetes_prediction/
├── data/
│   ├── train.csv
│   ├── test.csv
│   └── sample_submission.csv
├── src/
│   ├── __init__.py
│   ├── feature_engineering.py
│   ├── model_training.py
│   └── utils.py
├── notebooks/
│   ├── EDA.ipynb
│   └── model_experiments.ipynb
├── train.py
├── predict.py
├── requirements.txt
└── README.md

💡 提高成绩的建议

1. 特征工程

  • 创建交互特征(如BMI × 年龄)
  • 使用领域知识创建医学特征
  • 处理异常值和缺失值
  • 特征选择:删除不重要的特征

2. 模型优化

  • 尝试不同的算法(Random Forest, Neural Networks)
  • 使用Stacking/Blending集成多个模型
  • 超参数优化(Optuna, Hyperopt)
  • 使用交叉验证获得稳健的结果

3. 数据增强

  • 过采样/欠采样处理类别不平衡
  • SMOTE生成合成样本
  • 伪标签技术利用未标记数据

4. 后处理

  • 调整分类阈值
  • 使用校准(Calibration)优化概率输出
  • 后处理规则(基于医学知识)

🚀 快速开始

  1. 准备环境
pip install -r requirements.txt
  2. 运行训练
python train.py
  3. 生成预测
python predict.py
  4. 提交结果:将 submission.csv 提交到Kaggle

📊 评估指标

本项目使用以下指标:

  • AUC-ROC: 主要评估指标
  • 准确率 (Accuracy)
  • 精确率 (Precision)
  • 召回率 (Recall)
  • F1分数

📝 日志和版本控制

建议使用:

  • Git进行版本控制
  • MLflow或Weights & Biases跟踪实验
  • 保存每次实验的配置和结果

🔍 调试技巧

  1. 验证数据加载
assert train.shape[0] > 0
assert test.shape[0] > 0
  2. 检查特征一致性
assert set(train.columns) - {'diagnosed_diabetes'} == set(test.columns)
  3. 监控过拟合
# 检查训练集和验证集的性能差异
train_score = model.score(X_train, y_train)
val_score = model.score(X_val, y_val)
print(f"训练集分数: {train_score:.4f}")
print(f"验证集分数: {val_score:.4f}")

🎯 比赛策略

  1. 第一阶段:建立基线模型

    • 简单的特征工程
    • 使用LightGBM/XGBoost
    • 获得基准分数
  2. 第二阶段:特征优化

    • 深入的EDA
    • 创建新特征
    • 特征选择
  3. 第三阶段:模型优化

    • 集成学习
    • 超参数调优
    • 高级技巧
  4. 最后阶段:微调

    • 后处理优化
    • 提交多次找最佳结果

祝你在比赛中取得好成绩!🎉

About

No description, website, or topics provided.

Resources

Stars

Watchers

Forks

Releases

No releases published

Packages

No packages published