-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathexportCSV.py
158 lines (142 loc) · 7.16 KB
/
exportCSV.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
# pylint: disable=I0011,C0111, C0103,C0326,C0301, C0304, W0401,W0614
from cassandra.cluster import Cluster
from cassandra.util import Date
import time
import datetime
import csv
import math
# from dateutil.relativedelta import relativedelta
import os
################################################################################################
## Select all valid stock with required factors identified by stock code & date, saved in CSV ##
################################################################################################
def export(fileName, beginDate, endDate=datetime.datetime.today().date(), factors = [], table = "factors_month"):
if len(factors) == 0 or beginDate > endDate or len(fileName) == 0:
return
# cassandra connection
#cluster = Cluster(['192.168.1.111'])
cluster = Cluster(['202.120.40.111'])
session = cluster.connect('factors') #connect to the keyspace 'factors'
# get valid stocks in A share, # IPO PREPARE
rows = session.execute(''' SELECT stock, ipo_date FROM stock_info WHERE trade_status = '1' ALLOW FILTERING ''')
stocks = {}
for row in rows:
stocks[row.stock] = row.ipo_date
# sorting factors since they're ordered in cassandra
# factors = sorted(factors)
# print("Sorted factors: ", factors)
#time list
rows = session.execute('''
select * from transaction_time
where type='M' and time >= %s and time <= %s ALLOW FILTERING;''', [beginDate, endDate])
dateList = []
for row in rows:
dateList.append(row.time)
# retrieve valid stock number which has been stored in DB at the sorting phase
countStmt = session.prepare('''
SELECT value from factors_month WHERE stock = 'VALID_STOCK_COUNT' and factor = ? and time = ? ''')
# prepare SQL
SQL = "SELECT * FROM "+table+" WHERE stock = ? AND factor IN ("
for factor in factors:
SQL = SQL + "'"+ factor + "',"
SQL = SQL[:-1]
SQL = SQL +") AND time = ?;"
print (time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()), " PREPARE QUERY SQL: \n"+SQL)
preparedStmt = session.prepare(SQL)
# select 'trade_status'
tradeStmt = session.prepare('''select * from factors_month WHERE stock = ?
and factor = 'trade_status' and time = ? ''')
# open CSV file & write first line: title
# NOTICE: [wb] mode won't result in problem of blank line
with open(fileName, 'w') as csvFile:
factors = factors + ['Yield_Rank_Class']
names = ['id'] + factors # column names
print (time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()), " ---- Starting to export ------ \r\n")
f = csv.writer(csvFile, delimiter=',',lineterminator='\n')
f.writerow(names)
# retrieve data
for day in dateList:
# real factor ranking on every trading day
factorSizeMap = {}
for factor in factors:
rows = session.execute(countStmt,(factor,day))
for row in rows:
factorSizeMap[factor] = row.value
break
for stock, ipo_date in stocks.items():
## Trade_Status Filtering
tradeRow = session.execute(tradeStmt,(stock, day))
valid = 0
for status in tradeRow:
valid = status.value
break
if valid != 1:
continue
## Calculating
line = []
dic = {} # paired K/V for ordering
rows = session.execute(preparedStmt, (stock,day))
# pass when no data
empty = True
line.append(stock+'_' + str(day))
for row in rows:
empty = False
# IPO Filtering
# filterDate = datetime.datetime.strptime(str(day), "%Y-%m-%d") + relativedelta(months=-3)
# ipoDate = datetime.datetime.strptime(str(ipo_date), "%Y-%m-%d")
# if (filterDate <= ipoDate):
if (day.date() - ipo_date.date()).days <= 92:
empty = True # the stock is invalid
break
if row.factor.find('rank') != -1:
rank = math.ceil(row.value / factorSizeMap[row.factor] * 1000) # normalize rank value
if row.factor.find('Yield') != -1:
# rank = int(row.value / totalStockNum * 1000)
##################################################
####### CODE Area for Yield Rank Classification ##
##################################################
# class 1: [1, 26]
if rank >= 3 * 10 and rank <= 18 * 10:
#line.append(1)
dic['Yield_Rank_Class'] = '1'
# class 0: [74, 99]
elif rank >= 84 * 10 and rank <= 97 * 10:
#line.append(0)
dic['Yield_Rank_Class'] = '0'
else:
#line.append('') #no class, fill in empty char to keep CSV well-formed
#dic['Yield_Rank_Class'] = ''
empty = True # only need stocks fall in above 2 class
break
# line.append(rank)
dic[row.factor] = rank
# elif row.factor.find('Yield') != -1:
# # line.append('') # empty for Yield Binary Class
# # line.append(str(row.value))
# dic['Yield_Rank_Class'] = ''
# dic['Yield'] = str(row.value)
else:
# line.append(str(row.value))
dic[row.factor] = row.value
if empty:
continue
# write row
# print (dic)
empty = False
for factor in factors:
try:
line.append(dic[factor])
except KeyError:
# print(" --- Empty Omitted %s 's factor: %s " % (row.stock, factor))
empty = True
break
if empty == False:
f.writerow(line)
print (time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()), " Writing at "+str(day))
# close connection with cassandra
cluster.shutdown()
print(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()), 'Writing to ',fileName,' complete!')
##############################################
################# USAGE EXAMPLE ##############
# export('D:\\rongshidata\\alldata_416_2.csv', datetime.date(2015,1,31),datetime.date(2017,3,31),factors=['mkt_freeshares_rank', 'mmt_rank', 'roa_growth_rank','Yield_rank'])
export('E:\\train-2016-10-31.csv', datetime.date(2016,10,1),datetime.date(2016,10,31),factors=['mkt_freeshares_rank', 'mmt_rank', 'mfd_buyamt_d1_rank', 'mfd_buyamt_d4_rank','Yield_rank'])