-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathbuild_apriori.py
More file actions
361 lines (309 loc) · 12.3 KB
/
build_apriori.py
File metadata and controls
361 lines (309 loc) · 12.3 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
# Suppress noisy library warnings in the console output.
warnings.filterwarnings('ignore')
print("=" * 80)
print("Apriori 关联分析:产品组合推荐(存款/理财/基金/保险)")
print("=" * 80)
# 1. Load data: customer base info plus monthly behavior/asset snapshots.
print("\n[1] 读取数据...")
df_base = pd.read_csv('customer/customer_base.csv')
df_behavior = pd.read_csv('customer/customer_behavior_assets.csv')
print(f" 客户基础数据:{len(df_base)} 条")
print(f" 客户行为数据:{len(df_behavior)} 条")
# 2. Preprocessing: keep only each customer's most recent record.
print("\n[2] 数据预处理...")
# Sort so the newest stat_month comes first within each customer;
# groupby().first() then picks that newest row per customer.
df_behavior_sorted = df_behavior.sort_values(['customer_id', 'stat_month'], ascending=[True, False])
df_latest = df_behavior_sorted.groupby('customer_id').first().reset_index()
print(f" 去重后客户数:{len(df_latest)} 条")
# 3. Build the transaction list: one "basket" of held products per customer.
print("\n[3] 构建交易数据...")
# Maps holding-flag columns to product labels
# (deposit / wealth management / fund / insurance).
product_map = {
    'deposit_flag': '存款',
    'financial_flag': '理财',
    'fund_flag': '基金',
    'insurance_flag': '保险'
}
transactions = []
for idx, row in df_latest.iterrows():
    items = []
    for flag_col, product_name in product_map.items():
        # flag == 1 means the customer holds this product
        # (assumes the flags are 0/1 ints — TODO confirm against the CSV schema).
        if row[flag_col] == 1:
            items.append(product_name)
    # Customers holding no products contribute no transaction.
    if items:
        transactions.append(items)
print(f" 有效交易数:{len(transactions)} 条")
# 4. Hand-rolled Apriori implementation (avoids a third-party mlxtend dependency).
print("\n[4] 运行 Apriori 算法...")
def create_c1(transactions):
    """Build the candidate 1-itemsets (C1) from the transaction list.

    Returns a sorted list of single-item frozensets, so each itemset is
    hashable and usable as a dict key during support counting.
    """
    # A set comprehension deduplicates in O(1) per item, replacing the
    # original O(n^2) `[item] in list` membership scan; sorting the items
    # reproduces the original `c1.sort()` ordering exactly.
    distinct_items = {item for transaction in transactions for item in transaction}
    return [frozenset([item]) for item in sorted(distinct_items)]
def scan_dataset(D, Ck, min_support):
    """Count each candidate itemset's occurrences over dataset D.

    Parameters:
        D: list of transactions, each a set of items.
        Ck: list of candidate frozensets of size k.
        min_support: minimum fraction of transactions a candidate must
            appear in to be kept as frequent.

    Returns:
        (frequent, supports) where `frequent` lists candidates meeting
        min_support and `supports` maps every counted candidate to its
        support value.
    """
    item_count = {}
    for transaction in D:
        for candidate in Ck:
            if candidate.issubset(transaction):
                # dict.get with a default replaces the original two-branch
                # if/else counting idiom.
                item_count[candidate] = item_count.get(candidate, 0) + 1
    num_items = float(len(D))
    ret_list = []
    support_data = {}
    for key, count in item_count.items():
        support = count / num_items
        if support >= min_support:
            # Prepend to preserve the original (reverse-insertion) ordering
            # of the frequent list.
            ret_list.insert(0, key)
        support_data[key] = support
    return ret_list, support_data
def apriori_gen(Lk, k):
    """Join frequent (k-1)-itemsets in Lk into candidate k-itemsets.

    Two (k-1)-itemsets are merged only when their first k-2 elements
    (after sorting) agree, so each k-candidate is produced exactly once.
    """
    candidates = []
    for i, left in enumerate(Lk):
        for right in Lk[i + 1:]:
            prefix_left = sorted(list(left)[:k - 2])
            prefix_right = sorted(list(right)[:k - 2])
            if prefix_left == prefix_right:
                candidates.append(left | right)
    return candidates
def apriori(transactions, min_support=0.1):
    """Run the Apriori algorithm over a list of transactions.

    Returns (L, support_data): L is a list of per-size frequent-itemset
    lists (L[0] holds 1-itemsets, L[1] 2-itemsets, ...), and support_data
    maps every counted itemset to its support.
    """
    dataset = list(map(set, transactions))
    candidates = create_c1(transactions)
    frequent, support_data = scan_dataset(dataset, candidates, min_support)
    levels = [frequent]
    k = 2
    # Keep joining and pruning until no frequent k-itemsets survive.
    while levels[k - 2]:
        candidates = apriori_gen(levels[k - 2], k)
        frequent, level_support = scan_dataset(dataset, candidates, min_support)
        support_data.update(level_support)
        levels.append(frequent)
        k += 1
    return levels, support_data
def generate_rules(L, support_data, min_confidence=0.5):
    """Derive association rules from the frequent itemsets in L.

    Returns a list of (antecedent, consequent, confidence, support)
    tuples whose confidence is at least min_confidence.
    """
    big_rule_list = []
    # Rules only exist for itemsets of size >= 2, so L[0] is skipped.
    for level in range(1, len(L)):
        for freq_set in L[level]:
            singletons = [frozenset([item]) for item in freq_set]
            if level == 1:
                # 2-itemsets: score single-item consequents directly.
                calc_conf(freq_set, singletons, support_data, big_rule_list, min_confidence)
            else:
                # Larger itemsets: recursively grow the consequents.
                rules_from_conseq(freq_set, singletons, support_data, big_rule_list, min_confidence)
    return big_rule_list
def calc_conf(freq_set, H, support_data, brl, min_confidence):
    """Score candidate consequents of freq_set against min_confidence.

    Appends each qualifying rule (antecedent, consequent, confidence,
    support) to brl, and returns the consequents that passed so the
    caller can grow them further.
    """
    pruned = []
    for consequent in H:
        antecedent = freq_set - consequent
        # confidence = support(itemset) / support(antecedent)
        confidence = support_data[freq_set] / support_data[antecedent]
        if confidence < min_confidence:
            continue
        brl.append((antecedent, consequent, confidence, support_data[freq_set]))
        pruned.append(consequent)
    return pruned
def rules_from_conseq(freq_set, H, support_data, brl, min_confidence):
    """Recursively generate rules from freq_set with growing consequents.

    Bug fix: the original (the well-known defect in the "Machine Learning
    in Action" ch.11 Apriori listing) jumped straight to consequents of
    size m+1, so rules with single-item consequents from itemsets of
    size >= 3 (e.g. {A, B} -> C) were never produced. The corrected
    version scores the current size-m consequents first, then grows only
    the survivors (standard confidence-based pruning).
    """
    m = len(H[0])
    if len(freq_set) > m:
        # Score the size-m consequents; keep those above the threshold.
        passing = calc_conf(freq_set, H, support_data, brl, min_confidence)
        if len(passing) > 1 and len(freq_set) > m + 1:
            # Merge surviving consequents into size-(m+1) candidates.
            grown = apriori_gen(passing, m + 1)
            if grown:
                rules_from_conseq(freq_set, grown, support_data, brl, min_confidence)
# Algorithm parameters: an itemset must appear in >= 5% of baskets, and a
# rule must hold with >= 30% confidence to be reported.
min_support = 0.05
min_confidence = 0.3
print(f" 最小支持度:{min_support}")
print(f" 最小置信度:{min_confidence}")
# Mine frequent itemsets, then derive association rules from them.
L, support_data = apriori(transactions, min_support=min_support)
rules = generate_rules(L, support_data, min_confidence=min_confidence)
print(f" 发现频繁项集:{sum([len(x) for x in L])} 个")
print(f" 发现关联规则:{len(rules)} 条")
# 5. Report frequent itemsets, highest support first.
print("\n[5] 频繁项集(按支持度排序):")
print("-" * 80)
freq_items = []
# Flatten the per-size lists in L into one table of (items, size, support).
for k in range(len(L)):
    for itemset in L[k]:
        freq_items.append({
            'items': list(itemset),
            'size': len(itemset),
            'support': support_data[itemset]
        })
freq_items_df = pd.DataFrame(freq_items)
freq_items_df = freq_items_df.sort_values('support', ascending=False)
for idx, row in freq_items_df.iterrows():
    items_str = ', '.join(row['items'])
    print(f" [{items_str}] 支持度:{row['support']:.2%}")
# utf-8-sig writes a BOM so Excel opens the Chinese text correctly.
freq_items_df.to_csv('apriori_frequent_itemsets.csv', index=False, encoding='utf-8-sig')
print("\n 频繁项集已保存到 apriori_frequent_itemsets.csv")
# 6. Report association rules, highest confidence first.
print("\n[6] 关联规则(按置信度排序):")
print("-" * 80)
rules_list = []
for rule in rules:
    # Each rule tuple is (antecedent, consequent, confidence, support).
    antecedent = list(rule[0])
    consequent = list(rule[1])
    confidence = rule[2]
    support = rule[3]
    # lift = confidence / P(consequent); > 1 indicates positive association.
    # (The > 0 guard is purely defensive — supports of frequent itemsets
    # are always positive here.)
    lift = confidence / (support_data[rule[1]] if support_data[rule[1]] > 0 else 1)
    rules_list.append({
        'antecedent': ', '.join(antecedent),
        'consequent': ', '.join(consequent),
        'confidence': confidence,
        'support': support,
        'lift': lift
    })
rules_df = pd.DataFrame(rules_list)
rules_df = rules_df.sort_values(['confidence', 'support'], ascending=[False, False])
# Show at most the top 20 rules on the console.
for idx, row in rules_df.head(20).iterrows():
    print(f" [{row['antecedent']}] → [{row['consequent']}]")
    print(f" 置信度:{row['confidence']:.2%}, 支持度:{row['support']:.2%}, 提升度:{row['lift']:.2f}")
# utf-8-sig BOM keeps the Chinese text readable in Excel.
rules_df.to_csv('apriori_association_rules.csv', index=False, encoding='utf-8-sig')
print("\n 关联规则已保存到 apriori_association_rules.csv")
# 7. Per-product holding statistics.
print("\n[7] 产品持有情况统计:")
print("-" * 80)
product_holdings = []
for product in product_map.values():
    # Count holders among the non-empty transactions, but express the
    # ratio against ALL deduplicated customers (including customers who
    # hold no product and thus produced no transaction).
    count = sum(1 for t in transactions if product in t)
    ratio = count / len(df_latest)
    product_holdings.append({
        'product': product,
        'count': count,
        'ratio': ratio
    })
product_df = pd.DataFrame(product_holdings)
product_df = product_df.sort_values('count', ascending=False)
for idx, row in product_df.iterrows():
    print(f" {row['product']}:{row['count']} 人({row['ratio']:.2%})")
# 8. Visualization: a 2x2 dashboard — holding rates, top itemsets,
#    co-occurrence heatmap, and top rules.
print("\n[8] 生成可视化...")
# Chinese-capable fonts; keep the minus sign rendering correctly.
plt.rcParams['font.sans-serif'] = ['SimHei', 'Microsoft YaHei']
plt.rcParams['axes.unicode_minus'] = False
fig, axes = plt.subplots(2, 2, figsize=(16, 12))
# 8.1 Product holding rates (bar chart, top-left).
ax1 = axes[0, 0]
products = [p['product'] for p in product_holdings]
ratios = [p['ratio'] * 100 for p in product_holdings]
colors = ['#FF6B6B', '#4ECDC4', '#45B7D1', '#96CEB4']
bars = ax1.bar(products, ratios, color=colors)
ax1.set_ylabel('持有率 (%)', fontsize=12)
ax1.set_title('各产品持有率分布', fontsize=14, fontweight='bold')
ax1.set_ylim(0, 100)
# Annotate each bar with its percentage.
for bar, ratio in zip(bars, ratios):
    height = bar.get_height()
    ax1.text(bar.get_x() + bar.get_width()/2., height + 1,
             f'{ratio:.1f}%', ha='center', va='bottom', fontsize=11)
# 8.2 Top-10 frequent itemsets by support (horizontal bars, top-right).
ax2 = axes[0, 1]
top_freq = freq_items_df.head(10)
item_labels = [' + '.join(items) for items in top_freq['items']]
supports = top_freq['support'] * 100
y_pos = np.arange(len(item_labels))
bars2 = ax2.barh(y_pos, supports, color='#667eea')
ax2.set_yticks(y_pos)
ax2.set_yticklabels(item_labels, fontsize=10)
ax2.set_xlabel('支持度 (%)', fontsize=12)
ax2.set_title('Top 10 频繁项集(按支持度)', fontsize=14, fontweight='bold')
ax2.invert_yaxis()  # highest support at the top
for i, (bar, supp) in enumerate(zip(bars2, supports)):
    width = bar.get_width()
    ax2.text(width + 0.5, bar.get_y() + bar.get_height()/2,
             f'{supp:.1f}%', va='center', fontsize=10)
# 8.3 Product co-occurrence heatmap (bottom-left).
ax3 = axes[1, 0]
product_names = list(product_map.values())
cooccurrence = np.zeros((4, 4))
# Symmetric count matrix; the diagonal holds single-product holder counts.
for i, p1 in enumerate(product_names):
    for j, p2 in enumerate(product_names):
        if i <= j:
            count = sum(1 for t in transactions if p1 in t and p2 in t)
            cooccurrence[i, j] = count
            cooccurrence[j, i] = count
im = ax3.imshow(cooccurrence, cmap='YlOrRd')
ax3.set_xticks(np.arange(4))
ax3.set_yticks(np.arange(4))
ax3.set_xticklabels(product_names, fontsize=11)
ax3.set_yticklabels(product_names, fontsize=11)
ax3.set_title('产品共现热力图', fontsize=14, fontweight='bold')
# Print the raw count inside every cell.
for i in range(4):
    for j in range(4):
        text = ax3.text(j, i, int(cooccurrence[i, j]),
                        ha="center", va="center", color="black", fontsize=12)
plt.colorbar(im, ax=ax3)
# 8.4 Top-10 association rules by confidence (bottom-right).
ax4 = axes[1, 1]
top_rules = rules_df.head(10)
rule_labels = [f"{row['antecedent']}→{row['consequent']}" for _, row in top_rules.iterrows()]
confidences = top_rules['confidence'] * 100
y_pos2 = np.arange(len(rule_labels))
bars4 = ax4.barh(y_pos2, confidences, color='#f093fb')
ax4.set_yticks(y_pos2)
ax4.set_yticklabels(rule_labels, fontsize=9)
ax4.set_xlabel('置信度 (%)', fontsize=12)
ax4.set_title('Top 10 关联规则(按置信度)', fontsize=14, fontweight='bold')
ax4.invert_yaxis()
ax4.set_xlim(0, 100)  # confidence is a percentage
for i, (bar, conf) in enumerate(zip(bars4, confidences)):
    width = bar.get_width()
    ax4.text(width + 1, bar.get_y() + bar.get_height()/2,
             f'{conf:.1f}%', va='center', fontsize=9)
plt.tight_layout()
plt.savefig('apriori_visualization.png', dpi=150, bbox_inches='tight')
print(" 可视化图已保存到 apriori_visualization.png")
# 9. Generate product-combination recommendation suggestions.
print("\n[9] 产品组合推荐建议:")
print("=" * 80)
# Bug fix: the original printed `idx + 1`, where idx is the rule's
# pre-sort DataFrame index label — so the "Top 5" rules came out with
# arbitrary numbers (e.g. 7, 3, 12, ...). Number by rank instead.
for rank, (_, row) in enumerate(rules_df.head(5).iterrows(), start=1):
    print(f"\n💡 推荐规则 {rank}:")
    print(f" 如果客户持有:[{row['antecedent']}]")
    print(f" 推荐持有:[{row['consequent']}]")
    print(f" 置信度:{row['confidence']:.2%},支持度:{row['support']:.2%},提升度:{row['lift']:.2f}")
    # Qualitative reading of lift: > 1 means holding the antecedent raises
    # the probability of holding the consequent.
    if row['lift'] > 1.2:
        print(f" 评价:强正相关,推荐效果显著!")
    elif row['lift'] > 1:
        print(f" 评价:正相关,可以推荐")
    else:
        print(f" 评价:相关性一般,谨慎推荐")
print("\n" + "=" * 80)
print("产品营销策略建议:")
print("=" * 80)
# Cross-sell matrix: for each ordered product pair, surface the strongest
# rule (first row after the confidence sort) mentioning both products.
for product in product_df['product']:
    print(f"\n📦 {product} 客户交叉推荐:")
    for other_product in product_df['product']:
        if product != other_product:
            # Substring match on the comma-joined item strings; safe here
            # because the four product names do not overlap.
            relevant_rules = rules_df[
                (rules_df['antecedent'].str.contains(product)) &
                (rules_df['consequent'].str.contains(other_product))
            ]
            if len(relevant_rules) > 0:
                best_rule = relevant_rules.iloc[0]
                print(f" → {other_product}:置信度 {best_rule['confidence']:.2%}")
print("\n" + "=" * 80)
# 10. Final summary of generated artifacts and key findings.
print("\n" + "=" * 80)
print("Apriori 关联分析完成!")
print("=" * 80)
print("\n生成的文件:")
print(" 1. apriori_frequent_itemsets.csv - 频繁项集")
print(" 2. apriori_association_rules.csv - 关联规则")
print(" 3. apriori_visualization.png - 可视化图表")
print("\n主要发现:")
print(f" - 分析了 {len(df_latest)} 位客户的产品持有情况")
for idx, row in product_df.iterrows():
    print(f" - {row['product']}:{row['count']} 人({row['ratio']:.2%})")
print(f"\n - 发现 {len(rules)} 条有效关联规则")
if len(rules) > 0:
    # rules_df is confidence-sorted, so row 0 is the strongest rule.
    best_rule = rules_df.iloc[0]
    print(f" - 最强规则:[{best_rule['antecedent']}] → [{best_rule['consequent']}]")
    print(f" 置信度:{best_rule['confidence']:.2%}")
print("\n下一步建议:")
print(" 1. 根据关联规则设计产品捆绑套餐")
print(" 2. 在客户购买产品时智能推荐相关产品")
print(" 3. 针对不同客户分群定制推荐策略")
print(" 4. A/B测试不同推荐方案的转化率")
print("=" * 80)