Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 17 additions & 8 deletions basketballCrawler/basketballCrawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,9 @@
import string
import pandas as pd
import logging
from bs4 import Comment, BeautifulSoup
from difflib import SequenceMatcher
from player import Player,getSoupFromURL
from player import Player, getSoupFromURL


__all__ = ['getSoupFromURL', 'getCurrentPlayerNamesAndURLS',
Expand All @@ -24,7 +25,7 @@ def getCurrentPlayerNamesAndURLS(suppressOutput=True):

names = []

for letter in string.ascii_lowercase:
for letter in string.ascii_lowercase[0]:
letter_page = getSoupFromURL('http://www.basketball-reference.com/players/%s/' % (letter), suppressOutput)

# we know that all the currently active players have <strong> tags, so we'll limit our names to those
Expand All @@ -51,7 +52,7 @@ def buildPlayerDictionary(suppressOutput=True):

players={}
for name, url in playerNamesAndURLS.items():
players[name] = Player(name,url,scrape_data=True)
players[name] = Player(name, url, scrape_data=True)
time.sleep(1) # sleep to be kind.

logging.debug("buildPlayerDictionary complete")
Expand Down Expand Up @@ -118,13 +119,21 @@ def dfFromGameLogURL(url):
glsoup = getSoupFromURL(url)

reg_season_table = glsoup.findAll('table', attrs={'id': 'pgl_basic'}) # id for reg season table
playoff_table = glsoup.findAll('table', attrs={'id': 'pgl_basic_playoffs'}) # id for playoff table
playoff_table = glsoup.find_all(string = lambda text: isinstance(text, Comment))
try:
playoff_table = BeautifulSoup(filter(lambda x: 'pgl_basic_playoffs' in x, playoff_table)[0])
playoff_table = playoff_table.findAll('table', attrs={'id': 'pgl_basic_playoffs'}) # id for playoff table
except:
playoff_table = []

# parse the table header. we'll use this for the creation of the DataFrame
header = []
for th in reg_season_table[0].findAll('th'):
if not th.getText() in header:
header.append(th.getText())
if not th.getText() in header :
try:
int(th.getText())
except:
header.append(th.getText())

# add in headers for home/away and w/l columns. a must to get the DataFrame to parse correctly

Expand Down Expand Up @@ -155,8 +164,8 @@ def soupTableToDF(table_soup, header):
# remove blank rows
rows = [r for r in rows if len(r.findAll('td')) > 0]

parsed_table = [[col.getText() for col in row.findAll('td')] for row in rows] # build 2d list of table values
return pd.io.parsers.TextParser(parsed_table, names=header, index_col=2, parse_dates=True).get_chunk()
parsed_table = [[col.getText() for col in row.findAll(['td', 'th'])] for row in rows] # build 2d list of table values
return pd.io.parsers.TextParser(parsed_table, names = header, index_col=2, parse_dates=True).get_chunk()


def gameLogs(playerDictionary, name):
Expand Down
18 changes: 9 additions & 9 deletions basketballCrawler/player.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ class Player(object):
gamelog_data = None
gamelog_url_list = []

def __init__(self,_name,_overview_url,scrape_data=True):
def __init__(self, _name, _overview_url, scrape_data=True):
self.name = _name
self.overview_url = _overview_url

Expand All @@ -37,19 +37,18 @@ def __init__(self,_name,_overview_url,scrape_data=True):
self.scrape_data()

def scrape_data(self):
print self.name,self.overview_url
print self.name, self.overview_url
if self.overview_url_content is not None:
raise Exception("Can't populate this!")

overview_soup = getSoupFromURL(self.overview_url)
self.overview_url_content = overview_soup.text

try:
player_infotext = overview_soup.findAll('p',attrs={'class':'padding_bottom_half'})[0].text.split('\n')[0]

self.positions = re.findall(self.POSN_PATTERN,player_infotext)[0].strip().encode("utf8").split(" and ")
self.height = re.findall(self.HEIGHT_PATTERN,player_infotext)[0].strip().encode("utf8")
self.weight = re.findall(self.WEIGHT_PATTERN,player_infotext)[0].strip().encode("utf8")
pos = filter(lambda x: 'Position:' in x, [p.text for p in overview_soup.findAll('p')])[0].strip().replace('\n','')
self.positions = re.findall(self.POSN_PATTERN, pos)[0].strip().encode("utf8").split(" and ")
self.height = overview_soup.find('span', {'itemprop':'height'}).text
self.weight = overview_soup.find('span', {'itemprop':'weight'}).text[:-2]

except Exception as ex:
logging.error(ex.message)
Expand All @@ -65,7 +64,8 @@ def scrape_data(self):
game_log_links = li.findAll('a')

for game_log_link in game_log_links:
self.gamelog_url_list.append('http://www.basketball-reference.com' + game_log_link.get('href'))
if 'gamelog' in game_log_link.get('href'):
self.gamelog_url_list.append('http://www.basketball-reference.com' + game_log_link.get('href'))

def to_json(self):
return json.dumps(self.__dict__)
return json.dumps(self.__dict__)