-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathbuild_clustering.py
More file actions
357 lines (290 loc) · 14.2 KB
/
build_clustering.py
File metadata and controls
357 lines (290 loc) · 14.2 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
import warnings
warnings.filterwarnings('ignore')
# Opening banner for the console report.
banner_rule = "=" * 70
print(banner_rule)
print("客户聚类分析:客户分群(高复购、中产家庭、年轻高消费等)")
print(banner_rule)

# 1. Load the two input tables; both paths are relative to the working directory.
print("\n[1] 读取数据...")
base_csv_path = 'customer/customer_base.csv'
behavior_csv_path = 'customer/customer_behavior_assets.csv'
df_base = pd.read_csv(base_csv_path)
df_behavior = pd.read_csv(behavior_csv_path)

# Report how many rows each table contributed.
print(f" 客户基础数据:{len(df_base)} 条")
print(f" 客户行为数据:{len(df_behavior)} 条")
# 2. Feature engineering: collapse the per-month behaviour rows into one
#    feature row per customer, then attach the base attributes.
print("\n[2] 特征工程:计算客户聚合特征...")

# Order every customer's rows by month BEFORE aggregating. The 'last'
# aggregation below returns the final row of each group in frame order, so it
# only means "most recent month" when the frame is sorted.
# (assumes stat_month sorts chronologically -- TODO confirm its format)
df_behavior_sorted = df_behavior.sort_values(['customer_id', 'stat_month'])

# Most recent monthly snapshot per customer. Nothing later in this script
# reads it; it is kept so the module-level name stays defined.
df_behavior_latest = df_behavior_sorted.groupby('customer_id').last().reset_index()

# Named aggregation ties each output column to its source column and function
# explicitly, instead of renaming flattened MultiIndex columns by position.
df_agg = df_behavior_sorted.groupby('customer_id').agg(
    assets_mean=('total_assets', 'mean'),
    assets_std=('total_assets', 'std'),
    assets_latest=('total_assets', 'last'),
    total_repurchase=('financial_repurchase_count', 'sum'),
    credit_expense_mean=('credit_card_monthly_expense', 'mean'),
    credit_expense_max=('credit_card_monthly_expense', 'max'),
    total_investment=('investment_monthly_count', 'sum'),
    app_login_mean=('app_login_count', 'mean'),
    app_login_total=('app_login_count', 'sum'),
    app_view_time_mean=('app_financial_view_time', 'mean'),
    product_count_mean=('product_count', 'mean'),
    deposit_ratio=('deposit_flag', 'mean'),
    financial_ratio=('financial_flag', 'mean'),
    fund_ratio=('fund_flag', 'mean'),
    insurance_ratio=('insurance_flag', 'mean'),
).reset_index()

# Left join keeps every customer that has behaviour data, even when the
# customer is missing from the base table.
df_features = df_agg.merge(df_base, on='customer_id', how='left')

# Convenience aliases. Despite the "_normalized" suffix these are raw copies;
# the actual scaling happens later via StandardScaler.
df_features['total_assets_normalized'] = df_features['assets_latest']
df_features['repurchase_normalized'] = df_features['total_repurchase']
df_features['credit_expense_normalized'] = df_features['credit_expense_mean']
df_features['engagement_normalized'] = (
    df_features['app_login_mean'] + df_features['app_view_time_mean'] / 100
)

# Zero-fill every remaining NaN (for example assets_std of a customer with a
# single month, or base attributes missing after the left join).
# NOTE(review): this also turns a missing age / monthly_income into 0, which
# may pull such customers into their own cluster -- confirm this is intended.
df_features = df_features.fillna(0)
print(f" 聚合后客户数:{len(df_features)} 条")
# 3. Pick the columns that feed the clustering model
print("\n[3] 选择聚类特征...")
cluster_features = [
    # demographics
    'age', 'monthly_income',
    # wealth and purchasing behaviour
    'assets_latest', 'total_repurchase', 'credit_expense_mean',
    # app engagement
    'app_login_total', 'app_view_time_mean',
    # product holdings
    'product_count_mean', 'financial_ratio', 'fund_ratio',
]
feature_listing = ', '.join(cluster_features)
print(f" 使用特征数:{len(cluster_features)}")
print(f" 特征列表:{', '.join(cluster_features)}")

# 4. Standardise the selected features (zero mean, unit variance per column)
print("\n[4] 数据标准化...")
X = df_features.loc[:, cluster_features].copy()
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)
print(" 数据标准化完成")
# 5. Explore candidate cluster counts (elbow method + silhouette score)
print("\n[5] 确定最佳聚类数(肘部法则 + 轮廓系数)...")
inertias = []
silhouette_scores = []
K_range = range(2, 8)
for k in K_range:
    candidate = KMeans(n_clusters=k, random_state=42, n_init=10)
    labels = candidate.fit_predict(X_scaled)
    inertias.append(candidate.inertia_)
    # K_range starts at 2, so a silhouette score is defined for every k and
    # both lists always end up the same length as K_range.
    silhouette_scores.append(silhouette_score(X_scaled, labels))
    print(f" K={k} - 惯性: {candidate.inertia_:.2f}")

# Fonts that can render the Chinese labels; keep the minus sign as ASCII.
plt.rcParams['font.sans-serif'] = ['SimHei', 'Microsoft YaHei']
plt.rcParams['axes.unicode_minus'] = False

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

# Left panel: inertia per K (elbow plot).
ax1.plot(K_range, inertias, 'bo-', linewidth=2, markersize=8)
ax1.set_xlabel('聚类数 K', fontsize=12)
ax1.set_ylabel('惯性 (Inertia)', fontsize=12)
ax1.set_title('肘部法则:确定最佳聚类数', fontsize=13, fontweight='bold')
ax1.grid(alpha=0.3)

# Right panel: silhouette score per K.
ax2.plot(K_range, silhouette_scores, 'ro-', linewidth=2, markersize=8)
ax2.set_xlabel('聚类数 K', fontsize=12)
ax2.set_ylabel('轮廓系数 (Silhouette Score)', fontsize=12)
ax2.set_title('轮廓系数:确定最佳聚类数', fontsize=13, fontweight='bold')
ax2.grid(alpha=0.3)

fig.tight_layout()
fig.savefig('cluster_elbow_silhouette.png', dpi=150, bbox_inches='tight')
# Release the figure once it is on disk so figures do not pile up in memory.
plt.close(fig)
print(" 肘部法则与轮廓系数图已保存到 cluster_elbow_silhouette.png")

# K is fixed by hand; the curves above are diagnostic output only and do not
# feed into this value.
best_k = 4
print(f" 选择最佳聚类数:K={best_k}")
# 6. Fit the final K-means model and store each customer's cluster label
print(f"\n[6] 使用 K-means 进行聚类(K={best_k})...")
kmeans = KMeans(n_clusters=best_k, random_state=42, n_init=10).fit(X_scaled)
# labels_ holds the assignment of every training row after fit().
df_features['cluster'] = kmeans.labels_
print(f" 聚类完成,{best_k} 个群组")
# 7. Project the standardised features onto two principal components and
#    draw one scatter series per cluster.
print("\n[7] PCA 降维可视化...")
pca = PCA(n_components=2, random_state=42)
X_pca = pca.fit_transform(X_scaled)
df_features['pca_x'] = X_pca[:, 0]
df_features['pca_y'] = X_pca[:, 1]

fig, ax = plt.subplots(figsize=(10, 8))
colors = ['#FF6B6B', '#4ECDC4', '#45B7D1', '#96CEB4', '#FFEAA7']
for cluster in range(best_k):
    cluster_data = df_features[df_features['cluster'] == cluster]
    ax.scatter(cluster_data['pca_x'], cluster_data['pca_y'],
               # Wrap around the palette so a best_k larger than the number
               # of colours reuses colours instead of raising IndexError.
               c=colors[cluster % len(colors)],
               label=f'群组 {cluster + 1}',
               alpha=0.6, s=50)

# Axis labels show how much variance each component explains.
ax.set_xlabel(f'PCA 1 (方差解释率: {pca.explained_variance_ratio_[0]:.2%})', fontsize=12)
ax.set_ylabel(f'PCA 2 (方差解释率: {pca.explained_variance_ratio_[1]:.2%})', fontsize=12)
ax.set_title('客户聚类可视化 (PCA降维)', fontsize=14, fontweight='bold')
ax.legend(fontsize=11)
ax.grid(alpha=0.3)

fig.tight_layout()
fig.savefig('cluster_pca_visualization.png', dpi=150, bbox_inches='tight')
# Release the figure once it is on disk so figures do not pile up in memory.
plt.close(fig)
print(" PCA 可视化图已保存到 cluster_pca_visualization.png")
# 8. Profile every cluster and derive a human-readable segment name
print("\n[8] 聚类分析和群组命名...")
profile_aggregations = {
    'customer_id': 'count',
    'age': 'mean',
    'monthly_income': 'mean',
    'assets_latest': 'mean',
    'total_repurchase': 'mean',
    'credit_expense_mean': 'mean',
    'app_login_total': 'mean',
    'product_count_mean': 'mean',
    'financial_ratio': 'mean',
}
cluster_analysis = df_features.groupby('cluster').agg(profile_aggregations).round(2)
# Display names, in the same order as the aggregation spec above.
cluster_analysis.columns = ['客户数量', '平均年龄', '平均月收入', '平均资产',
                            '平均复购次数', '平均月消费', '平均登录次数',
                            '平均产品数', '理财持有率']
cluster_analysis = cluster_analysis.sort_values('平均资产', ascending=False)

print("\n聚类分析结果:")
print("-" * 90)
print(cluster_analysis.to_string())


def _segment_name(profile):
    """Return the segment label for one row of the cluster profile table.

    The rules are checked top to bottom and the first match wins: asset
    level, then young high spenders, then mid-age high earners, then
    repurchase activity, then a lower asset threshold, else the mass segment.
    """
    if profile['平均资产'] > 1000000:
        return "高净值高复购客户" if profile['平均复购次数'] > 5 else "高净值稳健客户"
    if profile['平均年龄'] < 35 and profile['平均月消费'] > 8000:
        return "年轻活跃高消费客户" if profile['平均登录次数'] > 80 else "年轻高消费客户"
    if 35 <= profile['平均年龄'] <= 55 and profile['平均月收入'] > 25000:
        return "中产家庭客户"
    if profile['平均复购次数'] > 2:
        return "活跃复购客户"
    if profile['平均资产'] > 300000:
        return "潜力成长客户"
    return "大众客户"


# Name each cluster from its (rounded) profile row.
cluster_names = {
    cluster_id: _segment_name(cluster_analysis.loc[cluster_id])
    for cluster_id in cluster_analysis.index
}
df_features['cluster_name'] = df_features['cluster'].map(cluster_names)

print("\n" + "-" * 90)
print("群组命名:")
for cluster in sorted(cluster_names):
    # Count the members with a boolean mask rather than materialising a sub-frame.
    count = int((df_features['cluster'] == cluster).sum())
    print(f" 群组 {cluster + 1}: {cluster_names[cluster]} ({count} 人, {count/len(df_features)*100:.1f}%)")
# 9. Radar chart comparing the cluster centres on a common 0-1 scale
print("\n[9] 生成雷达图对比...")
radar_features = ['age', 'monthly_income', 'assets_latest', 'total_repurchase',
                  'credit_expense_mean', 'app_login_total', 'product_count_mean']
radar_labels = ['年龄', '月收入', '资产', '复购次数', '月消费', '活跃度', '产品数']

# Map the centres from standardised space back to the original units.
cluster_centers = pd.DataFrame(scaler.inverse_transform(kmeans.cluster_centers_),
                               columns=cluster_features)
radar_data = cluster_centers[radar_features].copy()

# Min-max scale every feature across the clusters. The divisor is the true
# span: adding a constant to it (the old "+ 1") squashed features whose raw
# span is small (a span of 1 would peak at 0.5), so the axes were not
# comparable. A feature with zero span is left at 0 by dividing by 1.
feature_min = radar_data.min()
feature_span = radar_data.max() - feature_min
radar_data = (radar_data - feature_min) / feature_span.where(feature_span > 0, 1.0)

fig = plt.figure(figsize=(10, 10))
ax = fig.add_subplot(111, polar=True)

# One spoke per feature; repeat the first angle to close the polygon.
angles = np.linspace(0, 2 * np.pi, len(radar_labels), endpoint=False).tolist()
angles.append(angles[0])

for cluster in range(best_k):
    values = radar_data.loc[cluster].tolist()
    values.append(values[0])
    # Wrap around the palette so a best_k larger than the number of colours
    # reuses colours instead of raising IndexError.
    series_color = colors[cluster % len(colors)]
    ax.plot(angles, values, 'o-', linewidth=2,
            label=cluster_names[cluster], color=series_color)
    ax.fill(angles, values, alpha=0.15, color=series_color)

ax.set_xticks(angles[:-1])
ax.set_xticklabels(radar_labels, fontsize=11)
ax.set_ylim(0, 1)
ax.set_title('各群组特征雷达对比图', fontsize=14, fontweight='bold', pad=20)
ax.legend(loc='upper right', bbox_to_anchor=(1.3, 1.1), fontsize=10)

fig.tight_layout()
fig.savefig('cluster_radar_comparison.png', dpi=150, bbox_inches='tight')
# Release the figure once it is on disk so figures do not pile up in memory.
plt.close(fig)
print(" 雷达对比图已保存到 cluster_radar_comparison.png")
# 10. Write the per-customer assignments and the per-cluster summary to CSV
print("\n[10] 保存聚类结果...")

# Identifier and cluster columns first, followed by the model's input features.
output_columns = ['customer_id', 'cluster', 'cluster_name', *cluster_features]
df_output = df_features.loc[:, output_columns].copy()
df_output.to_csv('customer_clustering_results.csv', index=False)
print(" 客户聚类结果已保存到 customer_clustering_results.csv")

# The summary keeps its index (the cluster id) as the first CSV column.
cluster_analysis.to_csv('cluster_analysis_summary.csv')
print(" 聚类分析摘要已保存到 cluster_analysis_summary.csv")
# 11. Print a statistical profile and a marketing playbook for every cluster
print("\n[11] 生成分群营销策略建议...")
print("\n" + "=" * 90)
print("客户分群营销建议")
print("=" * 90)

# Ordered (keyword, lines) pairs. The first keyword contained in the cluster
# name selects the playbook, so the order mirrors the original branch order.
marketing_playbooks = [
    ('高净值高复购', (
        " - 推荐:私人银行服务、私募基金、家族信托、高端保险",
        " - 触达:一对一专属理财经理、线下VIP活动、高净值圈层活动",
        " - 重点:资产配置、财富传承、税务规划、复购激励",
    )),
    ('高净值稳健', (
        " - 推荐:稳健型私募、固定收益产品、高端医疗险",
        " - 触达:专属理财经理、定期健康检查、财富管理沙龙",
        " - 重点:资产保值、风险控制、稳健增值",
    )),
    ('年轻活跃高消费', (
        " - 推荐:消费信贷、基金定投、智能投顾、网红理财产品",
        " - 触达:APP推送、社交媒体、直播带货、KOL合作",
        " - 重点:便捷性、收益可视化、社交分享、游戏化互动",
    )),
    ('年轻高消费', (
        " - 推荐:消费分期、指数基金、智能理财、联名信用卡",
        " - 触达:APP推送、短信提醒、新户礼活动",
        " - 重点:用户教育、体验优化、场景化推荐",
    )),
    ('中产家庭', (
        " - 推荐:子女教育金、养老规划、家庭保险套餐、稳健型理财",
        " - 触达:电话营销、社区活动、家庭理财讲座",
        " - 重点:长期规划、风险分散、家庭保障",
    )),
    ('活跃复购', (
        " - 推荐:爆款理财产品、会员积分体系、专属活动",
        " - 触达:APP推送、短信提醒、老客户专属优惠",
        " - 重点:复购激励、忠诚度计划、产品升级",
    )),
    ('潜力成长', (
        " - 推荐:混合型基金、定投计划、增值型保险",
        " - 触达:理财顾问、投资教育、成长型产品推荐",
        " - 重点:资产增值、投资教育、用户成长",
    )),
]
# Used when no keyword matches the cluster name.
default_playbook = (
    " - 推荐:基础存款产品、入门级理财、简易保险",
    " - 触达:APP引导、新手指南、小额投资活动",
    " - 重点:教育培养、逐步渗透、用户成长",
)

for cluster in sorted(cluster_names.keys()):
    name = cluster_names[cluster]
    cluster_data = df_features[df_features['cluster'] == cluster]

    # Headline statistics for this cluster.
    print(f"\n【{name}】(群组 {cluster + 1})")
    print("-" * 90)
    print(f" 客户规模:{len(cluster_data)} 人(占比 {len(cluster_data)/len(df_features)*100:.1f}%)")
    print(f" 平均年龄:{cluster_data['age'].mean():.1f} 岁")
    print(f" 平均月收入:{cluster_data['monthly_income'].mean():,.0f} 元")
    print(f" 平均资产:{cluster_data['assets_latest'].mean():,.0f} 元")
    print(f" 平均复购次数:{cluster_data['total_repurchase'].mean():.1f} 次")
    print(f" 平均月消费:{cluster_data['credit_expense_mean'].mean():,.0f} 元")
    print(f" 理财持有率:{cluster_data['financial_ratio'].mean()*100:.1f}%")
    print(f" 基金持有率:{cluster_data['fund_ratio'].mean()*100:.1f}%")

    # Pick the first playbook whose keyword occurs in the name, else the default.
    playbook = next(
        (lines for keyword, lines in marketing_playbooks if keyword in name),
        default_playbook,
    )
    print("\n 🎯 营销策略:")
    for playbook_line in playbook:
        print(playbook_line)

print("\n" + "=" * 90)
# 12. Closing summary: generated files, cluster sizes and suggested next steps
closing_rule = "=" * 90
print("\n" + "=" * 90)
print("客户聚类分析完成!")
print(closing_rule)

print("\n生成的文件:")
for generated_file_line in (
    " 1. cluster_elbow_silhouette.png - 肘部法则与轮廓系数图",
    " 2. cluster_pca_visualization.png - PCA降维可视化图",
    " 3. cluster_radar_comparison.png - 群组雷达对比图",
    " 4. customer_clustering_results.csv - 客户聚类结果",
    " 5. cluster_analysis_summary.csv - 聚类分析摘要",
):
    print(generated_file_line)

print("\n主要发现:")
print(f" - 成功将 {len(df_features)} 位客户分为 {best_k} 个群组")
for cluster in sorted(cluster_names):
    # Size of this cluster, counted with a boolean mask.
    count = int((df_features['cluster'] == cluster).sum())
    print(f" - {cluster_names[cluster]}:{count} 人({count/len(df_features)*100:.1f}%)")

print("\n下一步建议:")
for next_step_line in (
    " 1. 将聚类结果导入营销系统,开展精准营销活动",
    " 2. 定期(如每月)重新运行聚类,跟踪客户群组变化",
    " 3. 针对不同群组设计差异化的产品和服务",
    " 4. A/B测试不同营销策略的效果,持续优化",
):
    print(next_step_line)
print(closing_rule)