-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathget_clean_data.py
More file actions
88 lines (70 loc) · 2.38 KB
/
get_clean_data.py
File metadata and controls
88 lines (70 loc) · 2.38 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
from nba_api.stats.endpoints import LeagueDashPlayerStats
import time
from unidecode import unidecode
import pandas as pd
import os
# Fetch base and advanced per-game player stats for one season, merge them
# into a single table and save it as DATA/player_stats.csv next to this script.

SEASON = '2024-25'

# Anchor the DATA folder to the script's own directory so that the folder that
# gets created is the same one the CSV is written into, regardless of the
# directory the script is launched from.
project_dir = os.path.dirname(os.path.abspath(__file__))
data_dir = os.path.join(project_dir, 'DATA')
os.makedirs(data_dir, exist_ok=True)

# Base per-game stats
stats_base = LeagueDashPlayerStats(
    season=SEASON,
    per_mode_detailed='PerGame',
).get_data_frames()[0]
time.sleep(1)  # pause between consecutive API requests

# Advanced stats
stats_adv = LeagueDashPlayerStats(
    season=SEASON,
    per_mode_detailed='PerGame',
    measure_type_detailed_defense='Advanced',
).get_data_frames()[0]
time.sleep(1)

# Transliterate player names to plain ASCII in both tables.
stats_base['PLAYER_NAME'] = stats_base['PLAYER_NAME'].apply(unidecode)
stats_adv['PLAYER_NAME'] = stats_adv['PLAYER_NAME'].apply(unidecode)

# Merge on the player id; columns present in both tables receive the
# '_base' / '_adv' suffixes.
nba_df = stats_base.merge(
    stats_adv,
    on='PLAYER_ID',
    suffixes=('_base', '_adv'),
)

# Keep a single, unsuffixed PLAYER_NAME column. Both calls are no-ops when the
# suffixed columns are absent; other overlapping columns keep their suffixes.
nba_df = nba_df.drop(columns=['PLAYER_NAME_adv'], errors='ignore')
nba_df = nba_df.rename(columns={'PLAYER_NAME_base': 'PLAYER_NAME'})

# Confirm the data
print(nba_df.head())

output_path = os.path.join(data_dir, 'player_stats.csv')
nba_df.to_csv(output_path, index=False)
print(f"Saved data to: {output_path}")
###############################################################
# DATA CLEANING AND MERGING WITH SALARIES


def _name_key(names):
    """Return a normalized join key for a Series of player names.

    Each name is lower-cased and stripped of periods, apostrophes and spaces,
    e.g. "P.J. Tucker" becomes "pjtucker". Missing names stay missing.
    """
    return (
        names.str.lower()
        .str.replace('.', '', regex=False)
        .str.replace("'", '', regex=False)
        .str.replace(' ', '', regex=False)
    )


# Resolve every file against the DATA folder that sits next to this script,
# which is where player_stats.csv is written, instead of relying on the
# current working directory.
data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'DATA')

# Load Data
nba = pd.read_csv(os.path.join(data_dir, 'player_stats.csv'))
salaries = pd.read_csv(os.path.join(data_dir, '24-25_salaries.csv'))

nba['helper'] = _name_key(nba['PLAYER_NAME'])
salaries['helper'] = _name_key(salaries['PLAYER_NAME'])

# Outer merge with an indicator column; validate='one_to_one' makes pandas
# raise if a normalized name occurs more than once on either side.
merged = pd.merge(
    nba,
    salaries[['helper', 'SALARY']],
    on='helper',
    how='outer',
    validate='one_to_one',
    indicator=True,
)

# Keep only players found in both sources, then drop the bookkeeping columns.
merged = merged[merged['_merge'] == 'both'].drop(columns=['helper', '_merge'])

# Strip currency formatting ("$" and thousands separators) and convert to a
# number; anything that still fails to parse becomes NaN.
salary_text = (
    merged['SALARY']
    .astype(str)
    .str.replace('$', '', regex=False)
    .str.replace(',', '', regex=False)
)
merged['SALARY'] = pd.to_numeric(salary_text, errors='coerce')

merged.to_csv(os.path.join(data_dir, 'nba_data_with_salaries.csv'), index=False)
print('\n\nFinalized Data Cleaning!')