-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathsplit_words.py
More file actions
75 lines (62 loc) · 3.06 KB
/
split_words.py
File metadata and controls
75 lines (62 loc) · 3.06 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
import json
import time
from compute_TFIDF import *
def date_to_int(date):
    """Convert a ``YYYY-MM-DD`` date string into integer epoch seconds.

    The string is parsed with ``time.strptime`` and converted with
    ``time.mktime``, so it is interpreted as midnight in the local time
    zone. The float returned by ``mktime`` is truncated to ``int``.

    Raises:
        ValueError: If ``date`` does not match the ``%Y-%m-%d`` format.
    """
    return int(time.mktime(time.strptime(date, "%Y-%m-%d")))
def int_to_date(integer):
    """Format epoch seconds as a ``YYYY-MM-DD`` string in local time.

    Counterpart of ``date_to_int``: the timestamp is broken down with
    ``time.localtime`` and only the date part is rendered.
    """
    local_struct = time.localtime(integer)
    formatted = time.strftime("%Y-%m-%d", local_struct)
    return formatted
def save_word_list(start_date, days, root_path):
    """Segment the daily news files of a date range and save the word lists.

    For each of ``days`` consecutive calendar days starting at ``start_date``
    the file ``source/<root_path>/<date>.json`` is read. The ``text`` of every
    entry under its ``"news"`` key is passed to ``get_paticle_words``; entries
    whose word list is accepted by ``judge_relativeness`` are collected as
    ``{"label": "", "title": ..., "word_list": "<space-joined words>"}`` and
    written to ``word_list/<root_path>/<date>.json``. Progress is printed to
    stdout.

    Args:
        start_date: First day to process, formatted as ``YYYY-MM-DD``.
        days: Number of consecutive days to process.
        root_path: Sub-directory name used below both ``source/`` and
            ``word_list/``.

    Raises:
        ValueError: If ``start_date`` is not a valid ``YYYY-MM-DD`` date.
        OSError: If a source file cannot be opened or an output file cannot
            be written.
    """
    # Imported locally so these names cannot be shadowed by whatever the
    # star import at the top of the file happens to export.
    import os
    from datetime import datetime, timedelta

    def log(message):
        # Every progress line carries the same "[timestamp]:" prefix.
        print("[" + time.strftime("%Y-%m-%d %H:%M:%S") + "]:" + message)

    # Parse the start date once instead of on every iteration.
    first_day = datetime.strptime(start_date, "%Y-%m-%d").date()

    # Make sure the destination exists before doing any segmentation work.
    output_dir = 'word_list' + '/' + root_path
    os.makedirs(output_dir, exist_ok=True)

    for offset in range(days):
        # Step by calendar days. Adding 86400 s to a local-midnight timestamp
        # yields the same date twice on a 25-hour day (DST fall-back).
        date = (first_day + timedelta(days=offset)).isoformat()
        log("正在分词:" + date)

        source_file = 'source/' + root_path + '/' + date + ".json"
        with open(source_file, 'r', encoding='utf-8') as src:
            # A missing or null "news" entry is treated as "no articles".
            articles = json.load(src).get("news") or []

        corpus = []
        for article in articles:
            words = get_paticle_words(article.get('text'))
            # NOTE(review): the original indentation was lost; this assumes
            # only articles accepted by judge_relativeness are kept - confirm.
            if judge_relativeness(words):
                corpus.append({
                    "label": "",
                    "title": article.get('title'),
                    "word_list": " ".join(words),
                })

        target_file = output_dir + '/' + date + ".json"
        with open(target_file, 'w', encoding='utf-8') as dst:
            json.dump(corpus, dst, ensure_ascii=False, indent=4)
        log(date + ": 写入分词完成。")

    log("从" + str(start_date) + "开始,共" + str(days) + "天的分词完成")
def save_word_list_comment(start_date, days, root_path):
    """Segment the comments of each post in a date range and save the words.

    For each of ``days`` consecutive calendar days starting at ``start_date``
    the file ``source/<root_path>/<date>.json`` is read as a sequence of post
    records. For every post, each entry of its ``'评论'`` (comments) list is
    passed to ``get_paticle_words`` and the resulting words are concatenated.
    One record per post,
    ``{"label": "", "title": <'微博内容'>, "count": <'评论数量'>,
    "word_list": "<space-joined words>"}``, is written to
    ``word_list/<root_path>/<date>.json``. Progress is printed to stdout.

    Args:
        start_date: First day to process, formatted as ``YYYY-MM-DD``.
        days: Number of consecutive days to process.
        root_path: Sub-directory name used below both ``source/`` and
            ``word_list/``.

    Raises:
        ValueError: If ``start_date`` is not a valid ``YYYY-MM-DD`` date.
        OSError: If a source file cannot be opened or an output file cannot
            be written.
    """
    # Imported locally so these names cannot be shadowed by whatever the
    # star import at the top of the file happens to export.
    import os
    from datetime import datetime, timedelta

    def log(message):
        # Every progress line carries the same "[timestamp]:" prefix.
        print("[" + time.strftime("%Y-%m-%d %H:%M:%S") + "]:" + message)

    # Parse the start date once instead of on every iteration.
    first_day = datetime.strptime(start_date, "%Y-%m-%d").date()

    # Make sure the destination exists before doing any segmentation work.
    output_dir = 'word_list' + '/' + root_path
    os.makedirs(output_dir, exist_ok=True)

    for offset in range(days):
        # Step by calendar days. Adding 86400 s to a local-midnight timestamp
        # yields the same date twice on a 25-hour day (DST fall-back).
        date = (first_day + timedelta(days=offset)).isoformat()
        log("正在分词:" + date)

        source_file = 'source/' + root_path + '/' + date + ".json"
        # The with-statement closes the file; no explicit close() is needed.
        with open(source_file, 'r', encoding='utf-8') as src:
            posts = json.load(src)

        corpus = []
        for post in posts:
            words = []
            # A post without a comment list contributes an empty word list.
            for review in post.get('评论') or []:
                words.extend(get_paticle_words(review))
            corpus.append({
                "label": "",
                "title": post.get('微博内容'),
                "count": post.get('评论数量'),
                "word_list": " ".join(words),
            })

        target_file = output_dir + '/' + date + ".json"
        with open(target_file, 'w', encoding='utf-8') as dst:
            json.dump(corpus, dst, ensure_ascii=False, indent=4)
        log(date + ": 写入分词完成。")

    log("从" + str(start_date) + "开始,共" + str(days) + "天的分词完成")
if __name__ == '__main__':
    # start_date: first day to process, formatted as YYYY-MM-DD
    # days      : number of consecutive days to process (int)
    # root_path : sub-directory name used below source/ and word_list/
    run_config = {
        'start_date': '2019-12-08',
        'days': 206,
        'root_path': 'rmrb',
    }
    save_word_list_comment(**run_config)
    # May 2020: the next fetch starts at page 33