FinalProject/dataviz.qmd at main · mgh2xx/FinalProject · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
---
title: "Data Visualization"
format: html
---

```{python}
#| echo: false
#| warning: false
#| message: false

# Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import matplotlib.patches as mpatches
from scipy import stats

# Reading the cleaned dataset and parsing its "Date" column into datetimes in one step,
# so later chunks can sort by date and do date arithmetic
data = pd.read_csv('./data/cleaned_data.csv').assign(
    Date = lambda frame: pd.to_datetime(frame["Date"])
)
```

## Making Visualization 1

### Getting Rolling Averages for Visualization 1
```{python}
# This chunk replaces each player's per-game low/med/high band jump proportions
# with their n-game (7-game) rolling averages, then drops players with too few games

# Number of games covered by each rolling average
window = 7

# Proportion columns that get overwritten by their rolling means
band_columns = ["Prop_Jump_High", "Prop_Jump_Med", "Prop_Jump_Low"]

# Collects one smoothed frame per player
rolling_averages = []

for player, group in data.groupby("About"):
    # Rolling windows follow row order, so the player's games must be in date order first
    group = group.sort_values(by = "Date")

    # min_periods = 1 means the earliest games average over however many games exist so far
    for band in band_columns:
        group[band] = group[band].rolling(window, min_periods = 1).mean()

    rolling_averages.append(group)

# Stacking the per-player frames back into a single dataframe
# (rows end up ordered by player, then by date within each player)
rolling_df = pd.concat(rolling_averages)

# Keeping only players with at least 15 games, as our analysis requires a pretty large sample size
player_games = rolling_df["About"].value_counts()
players_met_min_games = player_games[player_games >= 15].index
rolling_df = rolling_df[rolling_df["About"].isin(players_met_min_games)]
```

### Creating Visualization 1 Color Palette
```{python}
# Counting the distinct seasons each player appears in
players_seasons = rolling_df.groupby('About')['Season'].nunique()

# Players with more than one distinct season are the two-season players
players_in_both = [name for name, n_seasons in players_seasons.items() if n_seasons > 1]

# Every player remaining in rolling_df
players = rolling_df['About'].unique()

# Colour lookup for the line plot: navy for two-season players, orange for one-season players
palette = {}
for player in players:
    palette[player] = 'navy' if player in players_in_both else 'orange'
```

### Generating and Saving Visualization 1

```{python}
# Two side-by-side panels (one per season) that share a y-axis
fig, axs = plt.subplots(1, 2, figsize = (30, 10), sharey = True)
axs = axs.flatten()

# Percentile positions (min, quartiles, max) used to choose the x-tick dates
tick_percentiles = [0, 25, 50, 75, 100]

# The first panel shows season 1, the second panel shows season 2
for season, ax in enumerate(axs, start = 1):
    season_data = rolling_df[rolling_df["Season"] == season]

    # Percentiles are taken over the dates viewed as int64 values, then converted back to datetimes
    dates_as_int = season_data["Date"].values.astype('int64')
    date_ticks = pd.to_datetime(np.percentile(dates_as_int, tick_percentiles))

    # One line per player, coloured by tenure; Athlete C is redrawn afterwards in light blue as the highlight
    plot = sns.lineplot(data = season_data, x = 'Date', y = "Prop_Jump_High", hue = "About", palette = palette, ax = ax)
    athlete_c_rows = season_data[season_data["About"] == "Athlete C"]
    plot_c = sns.lineplot(data = athlete_c_rows, x = "Date", y = "Prop_Jump_High", color = 'lightblue', ax = ax)

    # Panel title and axis labels (the y label is drawn once for the whole figure further down)
    ax.set_title(f"Season {season}", weight = 'bold')
    ax.set_xlabel("Date", weight = "bold")
    ax.set_ylabel('')

    # Dropping the per-player legend, the top/right spines (despine's defaults) and the tick marks
    ax.get_legend().remove()
    sns.despine(ax = ax)
    ax.tick_params(axis = 'both', which = 'both', length = 0, width = 0)

    # Ticks sit on the percentile dates and are labelled as abbreviated month + year
    ax.set_xticks(date_ticks)
    ax.set_xticklabels([tick.strftime('%b %Y') for tick in date_ticks])

# The figure title is assembled from separately positioned fragments so that
# "One-Season" and "Two-Season" can be drawn in their palette colours
title_fragments = [
    (0.125, 'High-Band Jump Trends by Player: Comparing', {}),
    (0.365, 'One-Season', {'color': 'orange'}),
    (0.429, 'vs', {}),
    (0.443, 'Two-Season', {'color': 'navy'}),
    (0.506, 'Tenure', {}),
]
for x_position, fragment, extra_style in title_fragments:
    fig.text(x_position, 1, fragment, fontsize = 20, rotation = 'horizontal', weight = 'bold', **extra_style)

# Subtitle, with the highlighted athlete's name in the highlight colour
fig.text(0.125, 0.965, "No clear trend appears amongst one-season players, whereas two-season players show growth in their second season, as highlighted by", fontsize = 12, ha = "left")
fig.text(0.51, 0.965, "Athlete C", color = "lightblue", fontsize = 12)

# Horizontal y-axis label placed above the left panel
fig.text(0.125, 0.9, 'High-Band Proportion', fontsize = 10, rotation = 'horizontal', weight = 'bold')

# Narrowing the gap between the two panels
plt.subplots_adjust(wspace = 0.05)

# Saving and Showing Plot
plt.savefig("./images/viz_1.png", bbox_inches = 'tight')
plt.show()
```

## Making Visualization 2

### Creating "Adjusted_Day" Variable

```{python}
# Labelling every row by whether its player appears in both seasons or in just one
# (vectorized membership test instead of a per-element lambda)
rolling_df['Player_Group'] = rolling_df['About'].isin(players_in_both).map(
    {True: 'both_seasons', False: 'one_season'}
)

# Earliest date in each season, keyed by season number
season_starts = rolling_df.groupby('Season')['Date'].min().to_dict()

# 1-based number of days since the start of the row's own season.
# Mapping each row's season to its start date and subtracting whole columns
# avoids a slow row-by-row DataFrame.apply(axis=1)
rolling_df['Day_of_Season'] = (rolling_df['Date'] - rolling_df['Season'].map(season_starts)).dt.days + 1

# Length of season 1 in days, counting both its first and last date
season1_length = (rolling_df.loc[rolling_df['Season'] == 1, 'Date'].max() - season_starts[1]).days + 1

def adjust_day(row):
    """Return the row's day number on a timeline that spans both seasons.

    For a 'both_seasons' player's season-2 row, the season-1 length
    (module-level ``season1_length``) is added to ``Day_of_Season``, so that
    e.g. the 5th day of season 2 is not treated as the same day as the 5th
    day of season 1. Every other row keeps its ``Day_of_Season`` unchanged.

    ``row`` must support lookup of 'Player_Group', 'Season' and
    'Day_of_Season' (e.g. a DataFrame row passed by ``apply(axis=1)``).
    """
    # Guard clause: one-season players and season-1 rows need no offset
    if row['Player_Group'] != 'both_seasons' or row['Season'] != 2:
        return row['Day_of_Season']

    return row['Day_of_Season'] + season1_length

# Running adjust_day over every row: season-2 rows of two-season players are shifted
# by the season-1 length, all other rows keep their Day_of_Season
adjusted_days = rolling_df.apply(adjust_day, axis = 1)
rolling_df['Adjusted_Day'] = adjusted_days
```

### Generating and Saving Visualization 2
```{python}
# Rows belonging to Athlete C only
Athlete_C = rolling_df.loc[rolling_df["About"] == "Athlete C"]

# One figure with a single set of axes; both seasons share one continuous Adjusted_Day timeline
plt.figure(figsize = (14, 6))
viz2_ax = plt.gca()

# High-band and low-band rolling proportions, one line each
band_lines = [
    ('Prop_Jump_High', 'navy', 'High'),
    ('Prop_Jump_Low', 'orange', 'Low'),
]
for band_column, line_colour, line_label in band_lines:
    sns.lineplot(data = Athlete_C, x = 'Adjusted_Day', y = band_column, marker = 'o', color = line_colour, label = line_label, ax = viz2_ax)

# Dashed vertical line at the length of season 1 (in days), marking where season 1 ends
season1_dates = rolling_df.loc[rolling_df['Season'] == 1, 'Date']
season1_len = (season1_dates.max() - season1_dates.min()).days + 1
viz2_ax.axvline(season1_len, linestyle = '--', color = 'lightgray', label = 'Season Break')

# Main title (figure-level)
plt.gcf().suptitle('Athlete C Sees Significant Change in Jump-Band Proportions Over the Course of 2 Seasons', x = 0.33, y = 0.965, weight = "bold")

# Subtitle (axes-level title, left-aligned under the main title)
viz2_ax.set_title("The two seasons Athlete C spends in the program results in notable growth in high-band jump proportion and decrease in the low-band jump proportion", ha = "left", fontsize = 10, x = 0, y = 1.075)

# Direct labels on the plot, used instead of a legend: (x, y, text, colour) in data coordinates
direct_labels = [
    (276, 0.33, "High Band", "navy"),
    (276, 0.3875, "Low Band", "orange"),
    (60, 0.55, "Season 1", "gray"),
    (180, 0.55, "Season 2", "gray"),
]
for label_x, label_y, label_text, label_colour in direct_labels:
    viz2_ax.text(label_x, label_y, label_text, color = label_colour, weight = "bold")

# Axis labels: the y label is drawn as horizontal text above the axis, so the built-in one is blanked
viz2_ax.set_xlabel("Adjusted Day", weight = "bold")
viz2_ax.text(-15, 0.625, "Proportion of Jumps", weight = "bold")
viz2_ax.set_ylabel("")

# Formatting: tighten the layout, hide tick marks, drop the top/right spines and the legend
plt.gcf().tight_layout()
viz2_ax.tick_params(axis = 'both', length = 0)
sns.despine(ax = viz2_ax)
viz2_ax.get_legend().remove()

# Saving and Showing Plot
plt.savefig("./images/viz_2.png", bbox_inches = 'tight')
plt.show()
```

## Making Visualization 3

### Getting Daily Averages
```{python}
# Mean Prop_Jump_High for every (Adjusted_Day, Player_Group) pair;
# as_index = False keeps the two grouping keys as ordinary columns
daily_means = rolling_df.groupby(['Adjusted_Day', 'Player_Group'], as_index = False)['Prop_Jump_High'].mean()
```

### Getting Regression Lines for 1 and 2 Season Players
```{python}
# Fits one linear regression of Prop_Jump_High on Adjusted_Day per player group
# and tabulates the slope, intercept and r-value of each fit.

# One record of regression results per player group. The records are collected first
# and turned into a dataframe once, rather than growing an empty dataframe row by row,
# which would leave every column with object dtype
linreg_rows = []

# sort = False visits the groups in order of first appearance in daily_means
for group, group_data in daily_means.groupby("Player_Group", sort = False):
    # Running linear regression on Adjusted_Day vs Prop_Jump_High
    fit = stats.linregress(group_data['Adjusted_Day'], group_data['Prop_Jump_High'])

    linreg_rows.append({
        "Player_Group": group,
        "Slope": fit.slope,
        "Intercept": fit.intercept,
        "R-value": fit.rvalue,
    })

# Passing columns explicitly fixes the column order (and keeps the columns even with no groups)
linreg_df = pd.DataFrame(linreg_rows, columns = ["Player_Group", "Slope", "Intercept", "R-value"])

# Showing the table with linear regression results
linreg_df.head()
```

### Generating and Saving Visualization 3
```{python}
# Colour per player group for the regression plot (replaces the per-player palette used earlier)
palette = dict(both_seasons = "navy", one_season = "orange")

# Scatter of the daily mean high-band proportion with one fitted regression line per player group
plot = sns.lmplot(
    data = daily_means, x = 'Adjusted_Day', y = 'Prop_Jump_High',
    hue = 'Player_Group', palette = palette, markers = ["o", "s"],
    scatter_kws = {"s": 50, "alpha": 0.5},
    height = 6, aspect = 2,
)

# The figure and axes that lmplot just created are the current ones
viz3_fig = plt.gcf()
viz3_ax = plt.gca()

# Main title (figure-level) and subtitle (axes-level title, left-aligned)
viz3_fig.suptitle('Two-Season Players See Positive Growth in High-Band Jump Proportions', x = 0.305, y = 1.125, weight = "bold")
viz3_ax.set_title("On average, players with 2 seasons of data increased their proportion of jumps in the high band, while 1-season players\nsaw an average decrease (likely due to fatigue, not regression).", x = 0, y = 1.055, ha = "left", fontsize = 10)

# Axis labels: the y label is drawn as horizontal text above the axis, so the built-in one is blanked
viz3_ax.set_xlabel('Adjusted Day', weight = 'bold')
viz3_ax.text(-15, 0.32, "Mean Proportion of Jumps in High Band", weight = "bold")
viz3_ax.set_ylabel('')

# Direct labels in the group colours take the place of the legend, which is removed below
viz3_ax.text(275, 0.259, "Two Seasons", color = "navy", weight = 'bold')
viz3_ax.text(150, 0.195, "One Season", color = "orange", weight = 'bold')

# Hiding tick marks and removing the legend lmplot added
viz3_ax.tick_params(axis = 'both', length = 0)
plot._legend.remove()

# Saving and Showing Plot
plt.savefig("./images/viz_3.png", bbox_inches = 'tight')
plt.show()
```