# CASE_Customer/build_model.py at main · theaseven/CASE_Customer · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix
import warnings
warnings.filterwarnings('ignore')

# Banner announcing what the script does.
banner_rule = "=" * 60
print(banner_rule)
print("逻辑回归模型：预测客户未来3个月资产提升至100万+的概率")
print(banner_rule)

# Step 1: load the two input tables from the local `customer/` directory.
print("\n[1] 读取数据...")
df_base = pd.read_csv('customer/customer_base.csv')
df_behavior = pd.read_csv('customer/customer_behavior_assets.csv')

# Order the monthly records by customer, then by month, and renumber the
# rows from zero so positional access follows that order.
df_behavior = df_behavior.sort_values(by=['customer_id', 'stat_month'])
df_behavior = df_behavior.reset_index(drop=True)

print(f"    客户基础数据: {len(df_base)} 条")
print(f"    客户行为数据: {len(df_behavior)} 条")

# 2. 构造时序特征函数
def create_time_series_features(group, base_month, lookback=6, *,
                                horizon=3, target_threshold=1000000):
    """Build one training sample (features plus label) for a customer.

    The sample is anchored at ``base_month``: features come from the
    ``lookback`` most recent months the customer has on record up to and
    including the baseline, and the label looks at the ``horizon`` calendar
    months that follow it.

    Args:
        group: DataFrame with one customer's monthly records. It must contain
            ``customer_id``, ``stat_month`` (strings formatted ``YYYY-MM``),
            ``total_assets`` and the snapshot columns copied below.
        base_month: Baseline month as a ``YYYY-MM`` string.
        lookback: Number of monthly records used for the feature window.
        horizon: Number of calendar months after the baseline inspected for
            the label (keyword-only, default 3).
        target_threshold: Asset level that makes the label positive
            (keyword-only, default 1,000,000).

    Returns:
        A dict of feature values plus ``target`` (1 when the maximum
        ``total_assets`` over the future window is at least
        ``target_threshold``, else 0), ``base_month`` and ``customer_id``;
        or ``None`` when the baseline month is missing, there is not enough
        history, or no record exists in the future window.

    NOTE(review): the feature window is the last ``lookback`` months that are
    *present* for the customer, not necessarily consecutive calendar months.
    With gaps in the data the "1m/3m/6m" features span a longer period --
    confirm that the input has no missing months.
    """
    months_list = sorted(group['stat_month'].unique())
    if base_month not in months_list:
        return None

    base_idx = months_list.index(base_month)
    if base_idx < lookback - 1:
        return None

    # Feature window: the `lookback` recorded months ending at the baseline.
    feature_months = months_list[base_idx - lookback + 1: base_idx + 1]
    feature_data = group[group['stat_month'].isin(feature_months)]
    if len(feature_data) < lookback:
        return None

    # Observation window: the `horizon` calendar months after the baseline.
    # Months are counted on a single axis so the year rollover falls out of
    # divmod instead of a manual carry loop.
    date_parts = base_month.split('-')
    base_year, base_m = int(date_parts[0]), int(date_parts[1])
    future_months = []
    for offset in range(1, horizon + 1):
        year, month_index = divmod(base_year * 12 + (base_m - 1) + offset, 12)
        future_months.append(f"{year}-{month_index + 1:02d}")

    future_data = group[group['stat_month'].isin(future_months)]
    # Without any future record the label is undefined; bail out before
    # spending time on feature computation.
    if len(future_data) == 0:
        return None

    # Snapshot features: values of the baseline-month record, copied as-is.
    snapshot_columns = (
        'total_assets',
        'deposit_balance',
        'financial_balance',
        'fund_balance',
        'insurance_balance',
        'product_count',
        'app_login_count',
        'app_financial_view_time',
        'credit_card_monthly_expense',
        'investment_monthly_count',
        'financial_repurchase_count',
    )
    base_data = feature_data[feature_data['stat_month'] == base_month].iloc[0]
    features = {column: base_data[column] for column in snapshot_columns}

    # Asset history over the feature window, oldest first.
    assets = feature_data.sort_values('stat_month')['total_assets'].values
    asset_mean = np.mean(assets)
    asset_std = np.std(assets)

    # Relative growth versus 1, 3 and 5 records back; the +1 in each
    # denominator guards against division by zero.
    if len(assets) >= 2:
        features['asset_growth_rate_1m'] = (assets[-1] - assets[-2]) / (assets[-2] + 1)
    if len(assets) >= 4:
        features['asset_growth_rate_3m'] = (assets[-1] - assets[-4]) / (assets[-4] + 1)
    if len(assets) >= 6:
        features['asset_growth_rate_6m'] = (assets[-1] - assets[-6]) / (assets[-6] + 1)

    # Window statistics (the "_6m" suffix is kept even if `lookback` differs).
    features['asset_mean_6m'] = asset_mean
    features['asset_std_6m'] = asset_std
    features['asset_cv_6m'] = asset_std / (asset_mean + 1)

    # Latest value relative to the window average.
    features['current_vs_avg'] = (assets[-1] - asset_mean) / (asset_mean + 1)

    # Linear trend of assets over the window and how well a line fits it.
    if len(assets) >= 2:
        x = np.arange(len(assets))
        slope, intercept = np.polyfit(x, assets, 1)
        features['trend_slope'] = slope
        fitted = slope * x + intercept
        ss_res = np.sum((assets - fitted) ** 2)
        ss_tot = np.sum((assets - asset_mean) ** 2)
        # +1 keeps the ratio finite for a perfectly flat series.
        features['trend_r_squared'] = 1 - (ss_res / (ss_tot + 1))

    # Momentum: mean of the latest three records versus the three before.
    if len(assets) >= 6:
        recent_3_mean = np.mean(assets[-3:])
        earlier_3_mean = np.mean(assets[-6:-3])
        features['asset_momentum'] = (recent_3_mean - earlier_3_mean) / (earlier_3_mean + 1)

    # Label: did total assets reach the threshold at any point in the
    # observation window?
    max_assets_future = future_data['total_assets'].max()
    features['target'] = 1 if max_assets_future >= target_threshold else 0

    features['base_month'] = base_month
    features['customer_id'] = group['customer_id'].iloc[0]

    return features

# 3. Assemble the training samples
print("\n[2] 构造时序特征和目标变量...")

# Months present in the behaviour table; printed for reference only.
all_months = sorted(df_behavior['stat_month'].unique())
print(f"    可用月份: {all_months}")

# Candidate baseline months: a fixed list running from 2024-07 to 2025-03.
# Each baseline needs six months of history behind it and data in the three
# months after it. Per the original author's note the data ends at 2025-06,
# which makes 2025-03 the latest usable baseline.
base_months = (
    [f"2024-{month:02d}" for month in range(7, 13)]
    + [f"2025-{month:02d}" for month in range(1, 4)]
)

# Try every (customer, baseline month) pair and keep the pairs for which a
# sample could be built.
candidate_samples = (
    create_time_series_features(history, month, lookback=6)
    for _, history in df_behavior.groupby('customer_id')
    for month in base_months
)
all_samples = [sample for sample in candidate_samples if sample is not None]

df_samples = pd.DataFrame(all_samples)
print(f"    构造样本数: {len(df_samples)}")

# 4. Attach static customer attributes
print("\n[3] 合并静态特征...")

# Customer-level attributes taken from the base table.
static_features = [
    'age',
    'monthly_income',
    'gender',
    'occupation_type',
    'lifecycle_stage',
    'marriage_status',
    'city_level',
]

# Keep only the join key plus the static attributes.
df_static = df_base.loc[:, ['customer_id', *static_features]].copy()

# Left join so every constructed sample is kept even when the customer has
# no row in the base table.
df_train = pd.merge(df_samples, df_static, how='left', on='customer_id')
print(f"    合并后样本数: {len(df_train)}")

# 5. Preprocessing
print("\n[4] 数据预处理...")

# Separate the label from the predictors; the identifier and the baseline
# month are bookkeeping columns, not predictors.
target_col = 'target'
feature_cols = [col for col in df_train.columns if col not in ['target', 'customer_id', 'base_month']]

X = df_train[feature_cols].copy()
y = df_train[target_col].copy()

# Nominal attributes. They are one-hot encoded rather than label-encoded:
# integer codes would impose an arbitrary (alphabetical) ordering that a
# linear model treats as a real magnitude, which makes the reported
# coefficients for these columns meaningless.
categorical_cols = ['gender', 'occupation_type', 'lifecycle_stage', 'marriage_status', 'city_level']

# Cast to str first so a missing value becomes its own 'nan' level instead
# of an all-zero row.
X[categorical_cols] = X[categorical_cols].astype(str)
X = pd.get_dummies(X, columns=categorical_cols, dtype=float)

# The column set changed, so refresh the feature list that later steps use
# to label the model coefficients.
feature_cols = list(X.columns)

# Remaining missing numeric values are replaced by zero.
X = X.fillna(0)

print(f"    特征数: {len(feature_cols)}")
print(f"    正样本数: {y.sum()} ({y.mean()*100:.2f}%)")
print(f"    负样本数: {len(y) - y.sum()} ({(1-y.mean())*100:.2f}%)")

# 6. Train/test split
print("\n[5] 划分数据集...")

# Split by customer so that no customer contributes samples to both sets.
# Each customer yields one sample per baseline month with heavily
# overlapping windows, so a row-level split would leak near-duplicates of
# the training rows into the test set and inflate the reported metrics.
# NOTE: a grouped split cannot also stratify on the label, so the positive
# rate of the two sets may differ slightly; test_size is the share of
# customers, not of rows.
from sklearn.model_selection import GroupShuffleSplit

group_splitter = GroupShuffleSplit(n_splits=1, test_size=0.3, random_state=42)
train_idx, test_idx = next(
    group_splitter.split(X, y, groups=df_train['customer_id'])
)

# The splitter returns positional indices, hence .iloc.
X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

print(f"    训练集: {len(X_train)} 条")
print(f"    测试集: {len(X_test)} 条")

# Standardise using statistics from the training set only, then apply the
# same transform to the test set.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 7. Fit the logistic regression
print("\n[6] 训练逻辑回归模型...")

# class_weight='balanced' re-weights the classes to counter the label
# imbalance.
lr_params = dict(
    solver='lbfgs',
    class_weight='balanced',
    max_iter=1000,
    random_state=42,
)
model = LogisticRegression(**lr_params)
model.fit(X_train_scaled, y_train)

# 8. Evaluate on the held-out set
print("\n[7] 模型评估...")

y_pred = model.predict(X_test_scaled)
# Second column of predict_proba: probability of the positive class.
y_pred_proba = model.predict_proba(X_test_scaled)[:, 1]

print("\n分类报告:")
class_labels = ['未达100万', '达到100万']
print(classification_report(y_test, y_pred, target_names=class_labels))

auc_score = roc_auc_score(y_test, y_pred_proba)
print(f"AUC 分数: {auc_score:.4f}")

# 9. Report the fitted coefficients
print("\n[8] 逻辑回归系数:")
print("-" * 60)

# One row per feature, ranked by the magnitude of its coefficient.
coefficients = (
    pd.DataFrame({'feature': feature_cols, 'coefficient': model.coef_[0]})
    .assign(abs_coef=lambda table: table['coefficient'].abs())
    .sort_values(by='abs_coef', ascending=False)
)

print("\n特征系数（按重要性排序）:")
print(coefficients[['feature', 'coefficient']].to_string(index=False))

# Persist the full table, including the absolute-value column.
coefficients.to_csv('model_coefficients.csv', index=False)
print("\n系数已保存到 model_coefficients.csv")

# 10. Coefficient bar chart
print("\n[9] 生成可视化...")

import matplotlib
matplotlib.use('agg')  # file-only backend; no display is required
import matplotlib.pyplot as plt
import matplotlib as mpl
from matplotlib.patches import Patch

# Fonts able to render the Chinese labels, and a plain minus sign.
plt.rcParams['font.sans-serif'] = ['SimHei', 'Microsoft YaHei']
plt.rcParams['axes.unicode_minus'] = False

POSITIVE_COLOR = '#2ecc71'
NEGATIVE_COLOR = '#e74c3c'

fig, ax = plt.subplots(figsize=(12, 10))

# Bars are drawn in ascending order of the signed coefficient.
coef_sorted = coefficients.sort_values('coefficient')
bar_positions = range(len(coef_sorted))

# Green for non-negative coefficients, red for negative ones.
colors = [
    POSITIVE_COLOR if c >= 0 else NEGATIVE_COLOR
    for c in coef_sorted['coefficient']
]

bars = ax.barh(bar_positions, coef_sorted['coefficient'], color=colors)

# One tick per bar, labelled with the feature name.
ax.set_yticks(bar_positions)
ax.set_yticklabels(coef_sorted['feature'], fontsize=10)

# Print each value just beyond the end of its bar, on the side the bar
# points to.
for bar, coef in zip(bars, coef_sorted['coefficient']):
    width = bar.get_width()
    if width >= 0:
        label_x, label_align = width + 0.001, 'left'
    else:
        label_x, label_align = width - 0.001, 'right'
    ax.text(label_x,
            bar.get_y() + bar.get_height()/2,
            f'{coef:.4f}',
            va='center', ha=label_align,
            fontsize=9)

# Vertical reference line at zero.
ax.axvline(x=0, color='black', linewidth=0.8)

ax.set_xlabel('系数值 (Coefficient)', fontsize=12)
ax.set_title('逻辑回归系数可视化\n（绿色=正向影响，红色=负向影响）', fontsize=14, fontweight='bold')

# Legend explaining the two colours.
legend_elements = [Patch(facecolor=POSITIVE_COLOR, label='正系数（促进资产增长）'),
                   Patch(facecolor=NEGATIVE_COLOR, label='负系数（阻碍资产增长）')]
ax.legend(handles=legend_elements, loc='lower right')

plt.tight_layout()
plt.savefig('coefficient_visualization.png', dpi=150, bbox_inches='tight')
print("    可视化已保存到 coefficient_visualization.png")

# 11. Highlight the strongest positive and negative coefficients
print("\n[10] 关键发现:")
print("-" * 60)

report_columns = ['feature', 'coefficient']
top_positive = coefficients.nlargest(5, 'coefficient')[report_columns]
top_negative = coefficients.nsmallest(5, 'coefficient')[report_columns]

# Five largest coefficients.
print("\n正向影响最大的特征（Top 5）:")
for name, value in zip(top_positive['feature'], top_positive['coefficient']):
    print(f"    {name}: {value:.6f}")

# Five smallest (most negative) coefficients.
print("\n负向影响最大的特征（Top 5）:")
for name, value in zip(top_negative['feature'], top_negative['coefficient']):
    print(f"    {name}: {value:.6f}")

# Closing banner.
print("\n" + "=" * 60)
print("模型训练完成！")
print("=" * 60)