-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdescriptiveAnalysis.Rmd
More file actions
249 lines (193 loc) · 8.79 KB
/
descriptiveAnalysis.Rmd
File metadata and controls
249 lines (193 loc) · 8.79 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
---
title: "Descriptive Analysis"
date: "4/22/2017"
output: pdf_document
---
# Teams Winning the Tournament
Which 5 teams have won the most NCAA titles?
```{r}
library(plyr)
library(dplyr)
# Get the champion for all years
tourney <- read.csv('./data/TourneyCompactResults.csv')
teams <- read.csv('./data/Teams.csv')
titleTeams <- merge(tourney, teams, by.x="Wteam", by.y="Team_Id")
titleTeams <- titleTeams %>% filter(Daynum == 154)
titleTeams <- titleTeams[, c('Team_Name', 'Season')] %>% arrange(desc(Season))
# Count the frequency of winning teams from 1985-2017
titleCount <- count(titleTeams, 'Team_Name') %>% arrange(desc(freq))
```
| Team | Titles |
|----------------|--------|
| Duke | 5 |
| Connecticut | 4 |
| North Carolina | 4 |
| Kentucky | 3 |
| Florida | 2 |
```{r, echo=FALSE}
library(ggplot2)
p <- ggplot(head(titleCount, 5), aes(Team_Name, freq))
p + labs(title="Most Titles (1985-2017)", x="Team", y="Count") +
geom_bar(stat = "identity")
```
# Teams in the Finals
Which teams have played in the championship game the most times?
```{r}
# We have the teams that have won the title, now we want teams that have lost in the finals.
finalsLosingTeams <- merge(tourney, teams, by.x="Lteam", by.y="Team_Id")
finalsLosingTeams <- finalsLosingTeams %>% filter(Daynum == 154)
finalsLosingTeams <- finalsLosingTeams[, c('Team_Name', 'Season')] %>% arrange(desc(Season))
# Combine all the winning teams with all the losing teams in the finals
finalsAppearances <- rbind(titleTeams, finalsLosingTeams)
# Count the frequency of winning teams from 1985-2017
finalsAppearancesCount <- count(finalsAppearances, 'Team_Name') %>% arrange(desc(freq))
```
| Team | Finals Appearances |
|----------------|--------------------|
| Duke | 9 |
| North Carolina | 5 |
| Kansas | 5 |
| Kentucky | 5 |
| Connecticut | 4 |
```{r, echo=FALSE}
library(ggplot2)
p <- ggplot(head(finalsAppearancesCount, 5), aes(Team_Name, freq))
p + labs(title="Most Finals Appearances (1985-2017)", x="Team", y="Count") + geom_bar(stat = "identity")
```
# Teams in the Final Four
How about the 5 teams that have appeared in the Final Four the most times?
```{r}
finalFourTeams <- tourney %>% filter(Daynum == 145 | Daynum == 146)
finalFourWinningTeams <- merge(finalFourTeams, teams, by.x="Wteam", by.y="Team_Id")
finalFourLosingTeams <- merge(finalFourTeams, teams, by.x="Lteam", by.y="Team_Id")
finalFourTeams <- rbind(finalFourWinningTeams, finalFourLosingTeams)
finalFourTeams <- finalFourTeams[, c('Team_Name', 'Season')] %>% arrange(desc(Season))
# Count the frequency
finalFourCount <- count(finalFourTeams, 'Team_Name') %>% arrange(desc(freq))
```
| Team | Appearances |
|----------------|-------------|
| North Carolina | 17 |
| Kentucky | 16 |
| Kansas | 14 |
| Duke | 14 |
| Arizona | 10 |
```{r, echo=FALSE}
p <- ggplot(head(finalFourCount, 5), aes(Team_Name, freq))
p + labs(title="Most Final Four Appearances (1985-2017)", x="Team", y="Count") + geom_bar(stat = "identity")
```
# Significance of Statistics over Time
How have TOPG and PCT for the winning team from 2003-2017 changed over time?
```{r}
# All team season stats from 1985-2017
seasonStats <- read.csv('./data/FinalStats.csv')[,2:15]
# All championship teams from 1985-2017 with Team_Id column
titleTeams <- tourney %>% filter(Daynum == 154)
titleTeams <- titleTeams[, c('Wteam', 'Season')] %>% arrange(desc(Season))
# Function to return the season stats for each title team
getTeamStats <- function(seasonStats, teams) {
datalist = list()
for (i in 1:nrow(teams)) {
datalist[[i]] <- subset(seasonStats, Season == teams[i, 'Season'] & TeamID == teams[i, 'Wteam'])
}
result <- bind_rows(datalist)
result <- merge(result, teams, by.x="TeamID", by.y="Team_Id")
result <- result[, !(names(result) %in% c('TeamID'))] %>% arrange(desc(Season))
return(result)
}
# Title team season stats for years 2003-2017
titleTeamStats <- getTeamStats(seasonStats, titleTeams)
```
```{r, echo=FALSE}
p1 <- ggplot(data=titleTeamStats, aes(titleTeamStats['Season'])) +
labs(title="Title Team Season Stats (2003-2017)", x="Season", y="Stat") +
geom_line(aes(y=titleTeamStats['TOPG'], color='Turnovers per game')) +
geom_line(aes(y=titleTeamStats['ORPG'] + titleTeamStats['DRPG'], color='Rebounds per game')) +
geom_line(aes(y=titleTeamStats['ORPG'] + titleTeamStats['APG'], color='Assists per game')) +
geom_line(aes(y=titleTeamStats['ORPG'] + titleTeamStats['PFPG'], color='Personal fouls per game')) +
geom_line(aes(y=titleTeamStats['ORPG'] + titleTeamStats['BPG'], color='Blocks per game')) +
geom_line(aes(y=titleTeamStats['SPG'] + titleTeamStats['SPG'], color='Steals per game')) +
geom_line(aes(y=titleTeamStats['PPG'], color='Points per game')) +
scale_x_continuous(breaks=seq(2003, 2017, 2)) +
scale_y_continuous(limits=c(0, 90))
p1
library(scales)
p2 <- ggplot(data=titleTeamStats, aes(titleTeamStats['Season'])) +
labs(title="Title Team Season Percentage-based Stats (2003-2017)", x="Season", y="Stat") +
geom_line(aes(y=titleTeamStats['FGP'], color='Field goal %')) +
geom_line(aes(y=titleTeamStats['TPP'], color='3 pt. %')) +
geom_line(aes(y=titleTeamStats['PCT'], color='Win %')) +
scale_x_continuous(breaks=seq(2003, 2017, 2)) +
scale_y_continuous(labels = percent_format(), limits=c(0,1))
p2
```
How about TOPG for all teams from 2003-2017?
```{r}
# All final four teams from 1985-2017 with Team_Id column
tourneyTeams <- tourney %>% filter(Daynum > 132)
tourneyTeams <- tourneyTeams[, c('Wteam', 'Season')] %>% arrange(desc(Season))
# Tournament team season stats for years 2003-2017
tourneyTeamStats <- getTeamStats(seasonStats, tourneyTeams)
# Plot with regression line
p3 <- ggplot(data=tourneyTeamStats, aes(x=tourneyTeamStats['Season'], y=tourneyTeamStats['TOPG'])) +
labs(title="Tournament Team Reg. Season Turnovers per Game by Year (2003-2017)", x="Season", y="%") +
geom_point(shape=1) + scale_x_continuous(breaks=seq(2003, 2017, 2)) +
scale_y_continuous() + geom_smooth(method=lm)
p3
```
What about margin of victory over the years? Could the tournament be getting more competitive?
```{r}
scores <- tourney[, c('Wscore', 'Lscore', 'Season', 'Daynum')] %>% arrange(desc(Season))
scores <- scores %>% filter(Daynum > 144 & Daynum < 152)
scores <- mutate(scores, Margin = Wscore - Lscore)
avgScores <- group_by(scores, Season) %>% summarize(m = mean(Margin))
# Plot with regression line
p4 <- ggplot(data=avgScores, aes(x=avgScores['Season'], y=avgScores['m'])) +
labs(title="Averaged Final Four Game Margins of Victory per Season (1985-2017)", x="Season", y="Margin of Victory (Points)") +
geom_point(shape=0) + scale_x_continuous(breaks=seq(1985, 2017, 2)) +
scale_y_continuous() + geom_smooth(method=lm)
p4
```
What are the seed records in the first round?
```{r, echo=FALSE}
seeds <- read.csv('./data/TourneySeeds.csv')
seeds$Seed <- substring(seeds$Seed, 2, 3)
# Need to replace tourney$Wteam and tourney$Lteam with tourney$Seed
getTeamSeeds <- function(tournamentDataset, seedDataset) {
tourneyCopy <- tournamentDataset
tourneyCopy$Wseed <- 0
tourneyCopy$Lseed <- 0
# 1st round
# tourneyCopy <- tourneyCopy %>% filter(Daynum > 135 & Daynum < 138)
# 2nd round
# tourneyCopy <- tourneyCopy %>% filter(Daynum > 137 & Daynum < 141)
# Sweet 16
# tourneyCopy <- tourneyCopy %>% filter(Daynum > 140 & Daynum < 145)
# Elite 8
# tourneyCopy <- tourneyCopy %>% filter(Daynum > 144 & Daynum < 147)
# Final 4
# tourneyCopy <- tourneyCopy %>% filter(Daynum > 146 & Daynum < 153)
# Title Game
tourneyCopy <- tourneyCopy %>% filter(Daynum == 154)
for (i in 1:nrow(tourneyCopy)) {
tourneySeason <- tourneyCopy[i, 1] # Season
wteam <- tourneyCopy[i, 3] # Wteam
lteam <- tourneyCopy[i, 5] # Lteam
# Update Wteam with seed
teamSeed <- as.numeric(subset(seeds, Season == tourneySeason & Team == wteam)$Seed)
if (length(teamSeed) == 0) next
tourneyCopy[i, grep("Wseed", colnames(tourneyCopy))] <- as.numeric(teamSeed)
# Update Lteam with seed
teamSeed <- as.numeric(subset(seeds, Season == tourneySeason & Team == lteam)$Seed)
if (length(teamSeed) == 0) next
tourneyCopy[i, grep("Lseed", colnames(tourneyCopy))] <- as.numeric(teamSeed)
}
return(tourneyCopy)
}
tourneyWithSeeds <- getTeamSeeds(tourney, seeds)
seedWinCount <- count(tourneyWithSeeds, 'Wseed') %>% arrange(Wseed)
p <- ggplot(seedWinCount)
p + labs(title="Title Game Wins by Seed (1985-2017)", x="Seed", y="Wins") +
geom_bar(aes(x=Wseed, y=freq), stat = "identity", lwd=0.1) +
scale_x_continuous(breaks=seq(1, 16, 1)) + geom_text(aes(x=Wseed, y=freq, label=freq), position=position_dodge(width=1.0), vjust=-0.0)
```