Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion basketballCrawler/basketballCrawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,11 @@ def loadPlayerDictionary(pathToFile):
json_dict = json.loads(f.read())
for player_name in json_dict:
parsed_player = Player(None,None,False)
parsed_player.__dict__ = json_dict[player_name]
json_dict_player = json_dict[player_name]
if isinstance(json_dict_player, unicode):
parsed_player.__dict__ = json.loads(json_dict_player)
else:
parsed_player.__dict__ = json_dict_player
result[player_name] = parsed_player
return result

Expand Down
22 changes: 21 additions & 1 deletion basketballCrawler/player.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
import re
import logging
import json
from bs4 import Comment
from bs4 import BeautifulSoup

class Player(object):
# Regex patterns for player info
Expand All @@ -14,6 +16,7 @@ class Player(object):
positions = []
height = None
weight = None
salaries = []

overview_url = None
overview_url_content = None
Expand All @@ -29,6 +32,8 @@ def __init__(self,_name,_overview_url,scrape_data=True):
self.positions = []
self.height = None
self.weight = None
self.salaries = []

self.overview_url_content = None
self.gamelog_data = None
self.gamelog_url_list = []
Expand All @@ -52,6 +57,7 @@ def scrape_data(self):
self.weight = re.findall(self.WEIGHT_PATTERN,player_weight_text)[0].strip().encode("utf8")
tempPositions = re.findall(self.POSN_PATTERN,player_position_text)
self.positions = [position.strip().encode("utf8") for position in tempPositions]
self.salaries = self.findSalaries(overview_soup)

except Exception as ex:
logging.error(ex.message)
Expand All @@ -69,5 +75,19 @@ def scrape_data(self):
for game_log_link in game_log_links:
self.gamelog_url_list.append('http://www.basketball-reference.com' + game_log_link.get('href'))

def findSalaries(self, soupped):
    """Extract the player's season salaries from a soup of the overview page.

    basketball-reference hides the salaries table inside an HTML comment
    within the ``div#all_all_salaries`` element, so the first comment node
    is re-parsed as its own document before reading rows.

    Args:
        soupped: BeautifulSoup of the player's overview page.
            Assumes a ``div#all_all_salaries`` containing a commented-out
            table with a <tbody> — TODO confirm against the live page.

    Returns:
        list of (season, salary) tuples, e.g. ("2016_17", 1000000.0);
        the season string is utf-8 encoded to match the other scraped
        fields, and the salary is taken from the third <td> of each row.
    """
    salary_div = soupped.find("div", {"id": "all_all_salaries"})
    # The real table markup lives inside an HTML comment node.
    embedded = salary_div.find_all(string=lambda node: isinstance(node, Comment))
    table_body = BeautifulSoup(embedded[0], "lxml").find("tbody")
    season_salaries = []
    for row in table_body.find_all("tr"):
        # Normalise "2016-17" -> "2016_17" so it is a safe dict/JSON key.
        season = row.find("th").text.replace("-", "_").encode("utf8")
        amount = self.salaryTextToFloat(row.find_all("td")[2].text)
        season_salaries.append((season, amount))
    return season_salaries

def salaryTextToFloat(self, text):
    """Convert a salary string such as '$1,234,567' to a float.

    Drops the leading currency symbol (first character) and removes
    thousands separators before parsing.
    """
    digits = text[1:].replace(",", "")
    return float(digits)

def to_json(self):
return json.dumps(self.__dict__)
return json.dumps(self.__dict__)