-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtargetVariables.R
More file actions
395 lines (300 loc) · 14.1 KB
/
targetVariables.R
File metadata and controls
395 lines (300 loc) · 14.1 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
library(dplyr)
library(ggplot2)
library(GGally)
library(caTools)
library(ROCR)
library(caret)
# Load historical tournament games and keep only the matchup identifiers
tourney = read.csv("./data/TourneyCompactResults.csv")
tourney = tourney %>% filter(Season >2002) %>% select(-Wscore, -Lscore, -Numot, -Wloc)
# Label part of the data 1 (win by team A) and part 0 (loss by team A).
# NOTE(review): the split is by season (<2010 vs >2009), not a random half -- confirm intended
data1 = tourney %>% filter(Season <2010) %>% mutate(Outcome = 1) %>% rename(team_A = Wteam, team_B = Lteam)
data2 = tourney %>% filter(Season >2009) %>% mutate(Outcome = 0) %>% select(Season, Daynum, Lteam, everything()) %>%
rename(team_A = Lteam, team_B = Wteam)
# rbind matches data.frame columns by name, so the reordered data2 lines up with data1
tourney = rbind(data1, data2)
ratings = read.csv("./data/TeamRatings.csv")
names(ratings)[2] = "TeamID"
stats = read.csv("./data/FinalStats.csv")
# Final stats of every team by season (nabeels + kaggles)
finalStats = left_join(stats, ratings, by = c("TeamID", "Season")) %>% select(-Pts, -W, -L, -X.y, -X.x)
head(finalStats)
# Join each team's season stats onto every game, once per side.
# First for team A: temporarily rename column 3 (team_A) so the join key matches
names(tourney)[3] = "TeamID"
tourney = left_join(tourney, finalStats, by = c("TeamID", "Season"))
names(tourney)[6:length(tourney)] = paste0(names(tourney)[6:length(tourney)], "_A")
# Then for team B: restore column 3's name and rename column 4 for the join
names(tourney)[3] = "Team_A"
names(tourney)[4] = "TeamID"
tourney = left_join(tourney, finalStats, by = c("TeamID", "Season"))
names(tourney)[27:length(tourney)] = paste0(names(tourney)[27:length(tourney)], "_B")
tourney = na.omit(tourney) # Drop rows with any missing stats
tourney = tourney %>% filter(Season != 2017) # Hold 2017 out for final predictions below
# Split data into train/test before computing any statistics on the splits.
# BUG FIX: the correlation summary below previously ran BEFORE tourney.train
# existed, which errors out on a fresh session; the split now comes first.
tourney$Outcome = as.factor(tourney$Outcome)
set.seed(123) #111
# Stratified 70/30 split on the outcome (caTools)
split = sample.split(tourney$Outcome, SplitRatio = 0.7)
tourney.train <- filter(tourney, split == TRUE)
tourney.test <- filter(tourney, split == FALSE)
######## Some basic Statistics
# Pairwise correlations among the team-A stat columns (cols 6:17)
round(cor(tourney.train[, 6:17]), digits = 3)
names(tourney.train)
# Predictor column-name vectors for the three stat ranges.
# BUG FIX (idiom): paste(x, sep = "") on a single vector is a no-op,
# so take the names directly.
xnam <- names(tourney)[6:17]
xnam2 <- names(tourney)[21:38]
xnam3 <- names(tourney)[42:47]
xnam4 <- c(xnam, xnam2, xnam3)
#Different attributes for consideration
# Full formula using every candidate predictor
fmla <- as.formula(paste("Outcome ~ ", paste(xnam4, collapse = "+")))
# Candidate formulas built programmatically from shared stat-name vectors;
# the "_A"/"_B" suffix attaches each per-team stat to one side of the matchup.
.box_stats  <- c("FGP", "TPP", "FTP", "ORPG", "DRPG", "APG", "SPG", "BPG", "PFPG")
.rank_stats <- c("MOV", "SOS", "SRS")
.suffixed   <- function(stats, side) paste0(stats, side)
# fmla2: full stat set (incl. PCT and Seed) for both teams
fmla2 <- reformulate(
  c(.suffixed(c(.box_stats, "PCT", .rank_stats, "Seed"), "_A"),
    .suffixed(c(.box_stats, "PCT", .rank_stats, "Seed"), "_B")),
  response = "Outcome")
# fmla3: same as fmla2 but without team A's seed
fmla3 <- reformulate(
  c(.suffixed(c(.box_stats, "PCT", .rank_stats), "_A"),
    .suffixed(c(.box_stats, "PCT", .rank_stats, "Seed"), "_B")),
  response = "Outcome")
# fmla4: no seeds; the two PCT terms appended at the end
fmla4 <- reformulate(
  c(.suffixed(c(.box_stats, .rank_stats), "_A"),
    .suffixed(c(.box_stats, .rank_stats), "_B"),
    "PCT_A", "PCT_B"),
  response = "Outcome")
# fmla5: no seeds and no PCT -- the leanest candidate set
fmla5 <- reformulate(
  c(.suffixed(c(.box_stats, .rank_stats), "_A"),
    .suffixed(c(.box_stats, .rank_stats), "_B")),
  response = "Outcome")
# LOGISTIC REGRESSION
# Fit a binomial GLM on the trimmed predictor set (fmla5), evaluate on the test split
mod <- glm(fmla5, data=tourney.train, family="binomial")
summary(mod)
pred = predict(mod, newdata = tourney.test, type = "response") # P(Outcome = 1)
hist(pred)
# Confusion counts at a 0.5 probability cutoff
table(tourney.test$Outcome, pred > 0.5)
## ROC CURVE ##
rocr.pred <- prediction(pred, tourney.test$Outcome)
ROC.performance <- performance(rocr.pred, "tpr", "fpr")
plot(ROC.performance, col='red')
abline(0, 1) # chance diagonal
### With regularization (ridge penalty via glmnet, alpha = 0)
library(glmnet)
# BUG FIX: as.matrix() on the whole data frame coerced every column to
# character (Outcome is a factor), so glmnet/cv.glmnet could not use it.
# Select the numeric predictor columns first, then convert -- the matrix
# stays numeric.  The response is taken from the factor column directly.
pred.cols = c(6:17, 21:38, 42:47)
tourney.train.mat = as.matrix(tourney.train[, pred.cols])
tourney.train.mat.y = tourney.train$Outcome # factor response for binomial glmnet
tourney.test.mat = as.matrix(tourney.test[, pred.cols])
tourney.test.mat.y = tourney.test$Outcome
# GLM with regularization; predict.glmnet accepts a dense numeric matrix directly
mod3 = glmnet(tourney.train.mat, tourney.train.mat.y, alpha = 0, family = "binomial")
pred3 = predict(mod3, newx = tourney.test.mat, type = "response", s = c(.05, .1, 2, 3))
table(tourney.test.mat.y, pred3[, 3] > .5)
plot(mod3, xvar = "dev", label = TRUE)
# Cross-validated lambda selection, scored on misclassification rate
mod3.cv = cv.glmnet(tourney.train.mat, tourney.train.mat.y, alpha = 0,
                    family = "binomial", type.measure = "class")
plot(mod3.cv)
mod3.cv$lambda.min
mod3.cv$lambda.1se
pred3 = predict(mod3.cv, newx = tourney.test.mat, s = "lambda.min", type = "response")
hist(pred3)
# Compare regularized vs plain logistic probabilities ('pred' from the glm above)
plot(pred3, pred)
abline(0, 1)
abline(v = .5)
abline(h = .5)
table(tourney.test.mat.y, pred3 > 0.5)
## ROC CURVE ##
rocr.pred <- prediction(pred3, tourney.test$Outcome)
ROC.performance <- performance(rocr.pred, "tpr", "fpr")
plot(ROC.performance, col = 'black', add = TRUE) # overlay on the glm ROC plot
abline(0, 1)
#### RANDOM FOREST
library(randomForest)
set.seed(100)
# Tune mtry (1..15) with 5-fold CV via caret, selecting on accuracy
train.rf <- train(fmla5,
data = tourney.train,
method = "rf",
tuneGrid = data.frame(mtry=1:15),
trControl = trainControl(method="cv", number = 5, verboseIter = F),
metric = "Accuracy")
train.rf$results
train.rf$bestTune
mod.rf = train.rf$finalModel
# CV accuracy vs mtry
ggplot(train.rf$results, aes(x = mtry, y = Accuracy)) + geom_point(size = 3) +
ylab("CV Accuracy") + theme_bw() + theme(axis.title=element_text(size=18), axis.text=element_text(size=18)) +
geom_line() + scale_x_continuous(breaks = (seq(1,34,2)))
pred.rf = predict(mod.rf, newdata = tourney.test, type = "prob") # Make Prediction and get probabilities
hist(pred.rf) # NOTE(review): hist() on the 2-column prob matrix pools both columns -- confirm intended
table(tourney.test$Outcome, pred.rf[,2]>.5) # column 2 = P(Outcome = 1)
varImpPlot(mod.rf)
# Add random forest ROC plot (overlaid on the existing device via par(new=T))
rocr.pred <- prediction(pred.rf[,2], tourney.test$Outcome)
ROC.performance <- performance(rocr.pred, "tpr", "fpr")
par(new=T)
plot(ROC.performance, col='blue')
abline(0, 1)
# Accuracy as a function of cutoff
acc.perf = performance(rocr.pred, measure = "acc")
plot(acc.perf)
# Optimal cutoff -- not necessary but good to know
ind = which.max( slot(acc.perf, "y.values")[[1]] )
acc = slot(acc.perf, "y.values")[[1]][ind]
cutoff = slot(acc.perf, "x.values")[[1]][ind]
print(c(accuracy= acc, cutoff = cutoff))
#### CART
library(rpart)
library(rpart.plot)
# Candidate complexity-parameter values for pruning
cpVals = data.frame(cp = seq(0, .5, by=.005))
# Perform Cross-Validation (10-fold, selecting cp on accuracy)
train.cart = train(fmla5,
data = tourney.train,
method = "rpart",
tuneGrid = cpVals,
trControl = trainControl(method = "cv", number = 10),
metric = "Accuracy")
train.cart$results
# CV accuracy vs cp; second plot zooms the x-axis to [0, 0.2]
ggplot(train.cart$results, aes(x = cp, y = Accuracy)) + geom_line() + geom_point(size = 2)
ggplot(train.cart$results, aes(x = cp, y = Accuracy)) + geom_line() + geom_point(size = 2) + xlim(0,.2)
train.cart$bestTune
mod.cart = train.cart$finalModel
rpart.plot(mod.cart)
prp(mod.cart)
mod.cart$variable.importance
# NOTE(review): predicting from caret's finalModel bypasses caret's preprocessing;
# fine here since no preProcess was used, but predict(train.cart, ...) is safer
pred.cart = predict(mod.cart, newdata = tourney.test, type = "prob")
table(tourney.test$Outcome, pred.cart[,2]>.5) # column 2 = P(Outcome = 1)
hist(pred.cart[,2])
## ROC CURVE ##
rocr.pred <- prediction(pred.cart[,2], tourney.test$Outcome)
ROC.performance <- performance(rocr.pred, "tpr", "fpr")
par(new=T)
plot(ROC.performance, col='green')
abline(0, 1)
# SUPPORT VECTOR MACHINE
library(e1071)
# Default SVM: linear kernel, fixed cost, with probability estimates enabled
mod.svm <- svm(fmla, data = tourney.train, cost = .5, probability = TRUE, kernel= "linear")
pred.svm = predict(mod.svm, newdata = tourney.test, probability = TRUE)
table(tourney.test$Outcome, pred.svm)
attributes(mod.svm)
summary(mod.svm)
# With cross-validation over the cost parameter
set.seed(99)
mod.svm = tune(svm, fmla, data = tourney.train, kernel = "linear", ranges = list(cost = seq(.001,10,1))) #within the e1071 package and does 10-fold by default
mod.svm$performances
# CV error vs cost
ggplot(mod.svm$performances, aes(cost, error))+geom_point(size =2) +geom_line() + scale_x_continuous(breaks = seq(0,8,1))
mod.svm = mod.svm$best.model # keep only the best-tuned model
pred.svm = predict(mod.svm, newdata = tourney.test)
table(tourney.test$Outcome, pred.svm)
# Radial kernel with fixed hyperparameters
mod.svm.radial = svm(fmla, data = tourney.train, cost = 1, gamma = .01, kernel = "radial")
pred.svm.radial = predict(mod.svm.radial, newdata = tourney.test)
table(tourney.test$Outcome, pred.svm.radial)
# Radial kernel with CV over both cost and gamma
mod.svm.radial = tune(svm, fmla, data = tourney.train, kernel = "radial",
ranges = list(cost = seq(.001,10,1), gamma = seq(.001,.01,.001)))
summary(mod.svm.radial)
# CV error vs cost, one line per gamma
ggplot(mod.svm.radial$performances, aes(cost, error)) + geom_point() + geom_line(aes(color = as.factor(gamma)), lwd = 1)
mod.svm.radial$best.performance
mod.svm.radial = mod.svm.radial$best.model
pred.svm.radial = predict(mod.svm.radial, newdata=tourney.test)
table(tourney.test$Outcome, pred.svm.radial)
########################### MAKING PREDICTIONS FOR 2017 ##########################################
# Recover the set of teams in the 2017 bracket from the early games
# (Daynum < 138 -- presumably the pre-Sweet-16 rounds; TODO confirm cutoff)
teams_2017 = read.csv("./data/TourneyCompactResults.csv")
teams_2017 = teams_2017 %>% filter(Season == 2017, Daynum < 138) %>% select(Wteam, Lteam)
teams_2017 = c(teams_2017$Wteam, teams_2017$Lteam) #the teams that made bracket
# All pairwise matchups among the bracket teams
games_comb = as.data.frame(t(combn(teams_2017, 2)))
colnames(games_comb) = c("Team_A", "Team_B")
dim(games_comb)
finalStats_2017 = finalStats %>% filter(Season == 2017)
# Join each team's 2017 season stats onto every hypothetical matchup
# First for team A: rename column 1 so the join key matches
names(games_comb)[1] = "TeamID"
games_comb = left_join(games_comb, finalStats_2017, by = "TeamID")
names(games_comb)[4:length(games_comb)] = paste0(names(games_comb)[4:length(games_comb)], "_A")
# Then for team B: restore column 1's name, rename column 2 for the join
names(games_comb)[1] = "Team_A"
names(games_comb)[2] = "TeamID"
games_comb = left_join(games_comb, finalStats_2017, by = "TeamID")
names(games_comb)[25:length(games_comb)] = paste0(names(games_comb)[25:length(games_comb)], "_B")
head(games_comb)
# Some teams have NA for seeds so patch them manually
# (column 2 is still named TeamID here, hence the Seed_B fixes key on TeamID)
games_comb = games_comb %>% mutate(Seed_A = ifelse(Team_A==1307,14,Seed_A)) #fix New Mexico
games_comb = games_comb %>% mutate(Seed_A = ifelse(Team_A==1377,16,Seed_A)) #fix South Dakota
games_comb = games_comb %>% mutate(Seed_B = ifelse(TeamID==1307,14,Seed_B)) #fix New Mexico
games_comb = games_comb %>% mutate(Seed_B = ifelse(TeamID==1377,16,Seed_B)) #fix South Dakota
# Ensure seeds are numeric for the models
games_comb$Seed_A = as.numeric(games_comb$Seed_A)
games_comb$Seed_B = as.numeric(games_comb$Seed_B)
write.csv(games_comb, './data/dataSetfor2017.csv')
unique(games_comb$Seed_B)
# BUG FIX: unresolved git merge-conflict markers (<<<<<<< / ======= / >>>>>>>)
# made the file unparseable; resolved keeping both sides' content.
############################# PREDICTIONS ##############################
write.csv(games_comb, './data/DataSetfor2017.csv')
unique(games_comb$Seed_A)
# Predict using logistic regression
probs = predict(mod, newdata = games_comb, type = "response")
hist(probs)
probs = cbind(games_comb[,1:2], logModel = probs) # keep team IDs alongside probabilities
# Predict using logistic with regularization
# NOTE(review): as.matrix() coerces to character if games_comb has any
# non-numeric column -- verify all selected columns are numeric here
games_comb.mat = as.matrix(games_comb)
games_comb.mat = games_comb.mat[,c(4:15, 19:24,26:37,41:46)]
hold = predict(mod3.cv, newx = as(games_comb.mat, "dgCMatrix"), s = "lambda.min", type = "response")
hist(hold)
probs = cbind(probs, hold) # glmnet probs become column 4 (renamed below)
# Predict using random forest
hold = predict(mod.rf, newdata = games_comb, type = "prob")
hist(hold[,2])
probs = cbind(probs, rfModel = hold[,2]) # column 2 = P(Outcome = 1)
names(probs)[4] = "logRegModel" # name the glmnet column added above
# Agreement between logistic and RF: points split into quadrants at 0.5/0.5
ggplot(probs, aes(x = logModel, y = rfModel))+geom_point()+
geom_abline(slope = 1, lwd=1, color="blue")+
geom_hline(yintercept = .5, lwd =1, color = "red") +
geom_vline(xintercept = .5, lwd=1, color="red")
probs %>% filter(logModel > .5, rfModel < .5) %>% summarise(count=n()) #bottom right
probs %>% filter(logModel <= .5, rfModel <= .5) %>% summarise(count=n()) #bottom left
probs %>% filter(logModel < .5, rfModel > .5) %>% summarise(count=n()) #top left
probs %>% filter(logModel >= .5, rfModel >= .5) %>% summarise(count=n()) #top right
# 84.2% consistent classification
# 15.8%
# Agreement between plain and regularized logistic
ggplot(probs, aes(x = logModel, y = logRegModel))+geom_point()+
geom_abline(slope = 1, lwd=1, color="blue")+
geom_hline(yintercept = .5, lwd =1, color = "red") +
geom_vline(xintercept = .5, lwd=1, color="red")
probs %>% filter(logModel > .5, logRegModel < .5) %>% summarise(count=n()) #bottom right 20%
probs %>% filter(logModel <= .5, logRegModel <= .5) %>% summarise(count=n()) #bottom left
probs %>% filter(logModel < .5, logRegModel > .5) %>% summarise(count=n()) #top left
probs %>% filter(logModel >= .5, logRegModel >= .5) %>% summarise(count=n()) #top right
# 78.2% consistent classification
# 21.8%
# Agreement between regularized logistic and RF
ggplot(probs, aes(x = rfModel, y = logRegModel))+geom_point()+
geom_abline(slope = 1, lwd=1, color="blue")+
geom_hline(yintercept = .5, lwd =1, color = "red") +
geom_vline(xintercept = .5, lwd=1, color="red")
write.csv(probs, "./data/SampleSubmission.csv")
# SVM predictions (class labels, not probabilities)
svm.pred.2017=predict(mod.svm.radial, newdata = games_comb)
svm.pred.2017 = cbind(games_comb[,1:2], svm.pred.2017)
################################## BOOSTING #############################################
library(gbm)
# BUG FIX: gbm with distribution = "bernoulli" requires a *numeric 0/1*
# response; a character (or factor) outcome makes gbm error out.  Convert
# the factor to numeric 0/1 instead of to character.
tourney.train.boost = tourney.train
tourney.train.boost$Outcome = as.numeric(as.character(tourney.train.boost$Outcome))
tourney.test.boost = tourney.test
tourney.test.boost$Outcome = as.numeric(as.character(tourney.test.boost$Outcome))
mod.boost = gbm(fmla,
                data = tourney.train.boost,
                distribution = "bernoulli",
                n.trees = 1000,
                interaction.depth = 9)
# NOTE: gbm predictions require an explicit n.trees
pred.boost = predict(mod.boost, newdata = tourney.test, n.trees = 1000, type = "response")
table(tourney.test$Outcome, pred.boost > .5)
# Cross-validation via caret.  BUG FIX: caret classification needs a *factor*
# outcome, so train on tourney.train (factor Outcome), not the numeric copy --
# this is the likely cause of the earlier "NOT WORKING" note.
tGrid = expand.grid(n.trees = seq(100, 1000, 50), interaction.depth = seq(1, 20, 5),
                    shrinkage = seq(.01, .02, .005), n.minobsinnode = 10)
set.seed(232)
train.boost = train(fmla,
                    data = tourney.train,
                    method = "gbm",
                    tuneGrid = tGrid,
                    trControl = trainControl(method = "cv", number = 5, verboseIter = FALSE),
                    metric = "Accuracy",
                    distribution = "bernoulli")
train.boost$results
# CV accuracy vs number of trees, one line per interaction depth
ggplot(train.boost$results, aes(x = n.trees, y = Accuracy, colour = as.factor(interaction.depth))) + geom_line(lwd = 1) +
  ylab("CV Accuracy") + theme(axis.title = element_text(size = 13), axis.text = element_text(size = 13)) +
  scale_color_discrete(name = "interaction.depth")
train.boost$bestTune
mod.boost = train.boost$finalModel
# Predict with the CV-chosen number of trees instead of a hard-coded 500
pred.boost = predict(mod.boost, newdata = tourney.test.boost,
                     n.trees = train.boost$bestTune$n.trees, type = "response")