forked from SuiMingYang/ProductTypeClassify
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdataparse.py
50 lines (40 loc) · 1.09 KB
/
dataparse.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
import jieba
import pandas as pd
from keytitle import extractkey
import random
"""
Label the product catalogue with the word lexicon to generate the training set.
"""
# Input locations, named once so they are easy to find and change.
USER_DICT_PATH = "./data/expendword.txt"
PRODUCT_CSV_PATH = './data/product_2019-09-20.csv'

# Load the custom user dictionary into jieba.
jieba.load_userdict(USER_DICT_PATH)

# Product table to be labelled; the rest of the script reads its
# 'product_id' and 'product_title' columns.
data = pd.read_csv(PRODUCT_CSV_PATH)
def relation_dict(path='./data/relation.csv'):
    """Build a lookup from third-level category to second-level category.

    Reads the category relation table and maps every value of the
    ``三级类目`` (third-level category) column to the ``二级类目``
    (second-level category) value it is paired with.

    Args:
        path: CSV file containing the relation table. Defaults to the
            location this script has always read from, so existing
            zero-argument callers are unaffected.

    Returns:
        dict: ``{third_level: second_level}``. When the same third-level
        name is paired with several second-level names, the pair that
        comes last in groupby's (sorted) key order wins, because later
        entries overwrite earlier ones.
    """
    relation = pd.read_csv(path)
    # Grouping on both columns yields each distinct (second, third) pair
    # exactly once; only the group key is needed, not the grouped rows.
    pairs = relation.groupby(['二级类目', '三级类目'])
    return {third: second for (second, third), _ in pairs}
# Category lookup, built once and passed to every extraction call.
rela_dict = relation_dict()
extract_obj = extractkey()

# Per-title results, kept in the same order as the rows of `data`.
target = []
second = []

for title in data['product_title']:
    target_name, second_obj = extract_obj.progress(title, rela_dict)
    target.append(target_name)
    second.append(second_obj)
    # Running count of processed titles.
    # NOTE(review): assumed to belong inside the loop as a progress
    # indicator - confirm against the intended indentation.
    print(len(second))

# Assemble the labelled table: the original id/title columns plus the two
# values returned by the extractor for each title.
new_data = {
    'id': data['product_id'],
    'title': data['product_title'],
    'category': target,
    'list': second,
}
new = pd.DataFrame(new_data)
new.to_csv('./data/qingsheceshi.csv')