-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathtrend_data_clean.py
77 lines (66 loc) · 2.11 KB
/
trend_data_clean.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
from wedge import date_format1
import pandas as pd
from tqdm import tqdm
import argparse
import pickle
def italy_filter(data: str):
    """Return True when the last comma-separated field of *data* is not 'Italy'.

    The final field is stripped of surrounding whitespace before the
    case-sensitive comparison, so ``"Rome, Italy"`` yields False while
    ``"Paris, France"`` yields True.
    """
    last_field = data.split(',')[-1].strip()
    return last_field != 'Italy'
def mask_filter(content: str):
    """Return True when *content* mentions neither 'Trump' nor 'trump'.

    Callers treat a True result as "skip this row". Despite the function
    name, the active keywords are the two spellings of "Trump"; the
    'Mask' / 'Masks' keywords were once part of the check but are disabled.
    """
    return 'Trump' not in content and 'trump' not in content
def file_load(path: str):
    """Count negative / neutral / positive rows in one tab-separated file.

    The file is read with pandas, rows with a duplicated ``Title`` or a
    missing ``Content`` are dropped, and every remaining row for which
    ``mask_filter(content)`` is False is bucketed by its ``AvgTone`` value:
    below -2 is negative, -2 to 2 inclusive is normal, above 2 is positive.

    Args:
        path: Location of the TSV file to read.
            NOTE(review): characters ``path[17:19]`` and ``path[19:21]`` are
            used to build the returned label; presumably they are the month
            and day embedded by ``date_format1`` -- confirm against that
            helper, since a different path layout silently changes the label.

    Returns:
        A tuple ``(negtive, normal, positive, time)`` where the first three
        items are the bucket counts and ``time`` is the ``'XX-YY'`` label
        sliced from *path*.

    Raises:
        Whatever ``pandas.read_csv`` raises for an unreadable file, and
        ``KeyError`` if the ``Title``, ``Content`` or ``AvgTone`` column is
        missing.
    """
    cov_data = pd.read_csv(path, sep='\t', low_memory=False)
    data = cov_data.drop_duplicates(['Title'])
    data = data.dropna(subset=["Content"])

    positive, normal, negtive = 0, 0, 0
    # Only the two columns that are actually used are iterated; this avoids
    # building a Series per row and does not require unrelated columns to exist.
    rows = zip(data['AvgTone'], data['Content'])
    for trend, content in tqdm(rows, total=data.shape[0], ncols=80):
        if mask_filter(content):
            continue
        if trend < -2:
            negtive += 1
        elif trend <= 2:
            normal += 1
        elif trend > 2:
            # Explicit test (not a bare ``else``) so a NaN tone, for which
            # every comparison is False, is counted in no bucket.
            positive += 1

    time = '{}-{}'.format(path[17:19], path[19:21])
    print(negtive, normal, positive, time)
    return (negtive, normal, positive, time)
def the_trend(save_name: str):
    """Collect per-day tone counts for months 1-3 and pickle them.

    For every month in 1..3 and every candidate day (1-29 for month 2,
    1-31 otherwise) the path returned by ``date_format1(month, day)`` is
    passed to ``file_load``. Successful results are appended to a list;
    a failing day is reported on stdout and skipped. The list is printed
    and then written to ``./data/result/<save_name>.pkl``.

    Args:
        save_name: Base name (without extension) of the pickle file to write.

    Returns:
        None. The result is written to disk.
    """
    import os  # local import so this block does not depend on file-level edits

    trends = []
    for month in range(1, 4):
        # Exclusive upper bounds: days 1..29 for February, 1..31 otherwise.
        days = 30 if month == 2 else 32
        for day in range(1, days):
            path = date_format1(month, day)
            try:
                trends.append(file_load(path))
            except Exception:
                # Best effort: a day whose file is missing or unreadable is
                # skipped. ``Exception`` (not a bare ``except``) lets
                # KeyboardInterrupt / SystemExit stop the run.
                print("时间无效")
    print(trends)

    save_path = './data/result/{}.pkl'.format(save_name)
    # Create the output directory up front so a missing folder does not
    # discard the results after every file has already been processed.
    os.makedirs(os.path.dirname(save_path), exist_ok=True)
    with open(save_path, 'wb') as f:
        pickle.dump(trends, f)
def main():
    """Parse ``--path_name`` from the command line and run ``the_trend`` with it.

    ``--path_name`` defaults to ``"global_trend"`` when not supplied.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument('--path_name', default="global_trend", type=str)
    the_trend(cli.parse_args().path_name)
# Run the command-line entry point only when executed as a script.
if "__main__" == __name__:
    main()