-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathinjury_scrapper.py
72 lines (55 loc) · 2.21 KB
/
injury_scrapper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
"""
Description:
This script is a web scraper for injury data from the Pro Sports Transactions website
Written by:
G-R-H
"""
import os
import re

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
def replace_all(text, dic):
    """Replace every occurrence of each key of ``dic`` in ``text``.

    All keys are matched literally (regex metacharacters are escaped) and
    the whole substitution happens in a single pass, so text produced by one
    replacement is never re-scanned for further matches.

    Args:
        text: The string to search.
        dic: Mapping of substring to search for -> replacement string.

    Returns:
        A new string with every matched key replaced by its mapped value.
        ``text`` is returned unchanged when ``dic`` is empty.
    """
    if not dic:
        # An empty alternation compiles to a pattern that matches the empty
        # string at every position, and looking '' up in ``dic`` would raise
        # KeyError.
        return text

    # Regex alternation picks the first alternative that matches, so try the
    # longest keys first; otherwise a key that is a prefix of another key
    # (e.g. 'a' vs 'ab') would shadow it depending on dict order.
    keys = sorted(dic, key=len, reverse=True)
    pattern = re.compile('|'.join(map(re.escape, keys)))

    def translate(match):
        return dic[match.group(0)]

    return pattern.sub(translate, text)
# Dictionary of characters to remove from text
char_replace = {' • ': ''}
# Accumulates one list of cell strings per table row scraped from the site
list_of_rows = []
# Seconds to wait for the server before giving up on a request; without a
# timeout a single stalled connection would hang the whole scrape.
request_timeout = 30
# Search-results URL; the trailing ``start`` parameter is the row offset.
search_url = 'https://www.prosportstransactions.com/basketball/Search/SearchResults.php?Player=&Team=&BeginDate=2010-10-01&EndDate=&ILChkBx=yes&InjuriesChkBx=yes&Submit=Search&start={}'

# Walk the paginated results, advancing the offset by 25 each request
# (presumably the site's page size -- confirm), and collect every data row.
# A single session reuses the underlying connection across all requests.
with requests.Session() as session:
    for i in range(0, 27125, 25):
        url = search_url.format(i)
        response = session.get(url, timeout=request_timeout)
        # Fail loudly on HTTP errors instead of parsing an error page.
        response.raise_for_status()
        html = response.content
        soup = BeautifulSoup(html, 'lxml')
        table = soup.find('table', attrs={'class': 'datatable center'})
        if table is None:
            # Without this guard the failure surfaces as an opaque
            # AttributeError on None further down.
            raise RuntimeError(
                'No results table found at {}'.format(url))
        for row in table.find_all('tr', attrs={'align': 'left'}):
            list_of_cells = [
                replace_all(cell.text, char_replace).strip()
                for cell in row.find_all('td')
            ]
            list_of_rows.append(list_of_cells)
# Store data in a dataframe for manipulation
injuries_df = pd.DataFrame(list_of_rows, columns=[
    'Date', 'Team', 'Acquired', 'Relinquished', 'Notes'])
acq = injuries_df['Acquired']
rel = injuries_df['Relinquished']

# Where a value looks like "Name 1/ Name 2", keep the part after the first
# "/ ". The test uses the same "/ " delimiter as the split so that a value
# with a bare "/" is left untouched instead of being turned into NaN, and
# na=False keeps missing cells out of the mask.
injuries_df['Acquired'] = np.where(
    acq.str.contains('/ ', regex=False, na=False),
    acq.str.split('/ ').str[1],
    acq)
injuries_df['Relinquished'] = np.where(
    rel.str.contains('/ ', regex=False, na=False),
    rel.str.split('/ ').str[1],
    rel)

# Remove bracketed fragments like "(some text)" or "[some text]".
# regex=True is required: pandas >= 2.0 treats the pattern as a literal
# string by default, which would silently remove nothing. The trailing
# strip drops the whitespace left behind where a fragment was removed.
injuries_df['Acquired'] = injuries_df.Acquired.str.replace(
    r"[\(\[].*?[\)\]]", "", regex=True).str.strip()
injuries_df['Relinquished'] = injuries_df.Relinquished.str.replace(
    r"[\(\[].*?[\)\]]", "", regex=True).str.strip()

# Write contents to csv file, creating the output directory first so the
# scraped data is not lost to a missing-directory error.
output_path = 'data/injuries_2010-2020.csv'
os.makedirs(os.path.dirname(output_path), exist_ok=True)
injuries_df.to_csv(output_path, index=False)