11"""Scraper for understat.com."""
22
33import itertools
4+ import io
45import json
6+ import re
57from collections .abc import Iterable
68from html import unescape
79from pathlib import Path
8- from typing import Any , Callable , Optional , Union
10+ from typing import Any , Callable , IO , Optional , Union
911
1012import pandas as pd
1113
1416
# Directory under DATA_DIR used for Understat files.
UNDERSTAT_DATADIR = DATA_DIR / "Understat"
# Base URL from which every Understat endpoint in this module is built.
UNDERSTAT_URL = "https://understat.com"
# Extra headers sent with API requests; flags them as XMLHttpRequest calls.
UNDERSTAT_HEADERS = {"X-Requested-With": "XMLHttpRequest"}
1720
1821SHOT_SITUATIONS = {
1922 "OpenPlay" : "Open Play" ,
@@ -82,6 +85,13 @@ def __init__(
8285 data_dir = data_dir ,
8386 )
8487 self .seasons = seasons
88+ self ._cookies_initialized = False
89+
def _ensure_cookies(self) -> None:
    """Fetch the Understat homepage once so the session holds its cookies.

    The first call issues a GET for ``UNDERSTAT_URL`` through
    ``self._session`` and then sets ``self._cookies_initialized``.
    Subsequent calls return immediately.

    The flag is only set after the request returns, so a request that
    raises is attempted again on the next call. The response status is
    not inspected.
    """
    if self._cookies_initialized:
        return
    self._session.get(UNDERSTAT_URL)
    self._cookies_initialized = True
8595
8696 def read_leagues (self ) -> pd .DataFrame :
8797 """Retrieve the selected leagues from the datasource.
@@ -637,32 +647,88 @@ def _select_matches(
637647 return df
638648
def _read_leagues(self, no_cache: bool = False) -> dict:
    """Load the league overview from the ``/getStatData`` endpoint.

    Parameters
    ----------
    no_cache : bool
        If True, ignore an existing ``leagues.json`` cache file and
        request the data again.

    Returns
    -------
    dict
        A dict with the single key ``"statData"`` holding the value of
        the response's ``"stat"`` entry.
    """
    self._ensure_cookies()
    url = UNDERSTAT_URL + "/getStatData"
    filepath = self.data_dir / "leagues.json"
    # The reader may be an open cache file, so close it once it is parsed.
    with self._request_api(url, filepath, no_cache=no_cache) as reader:
        data = json.load(reader)
    return {"statData": data["stat"]}
644656
def _read_league_season(
    self, url: str, league_id: int, season_id: int, no_cache: bool = False
) -> dict:
    """Load the data of one league season from ``/getLeagueData``.

    Parameters
    ----------
    url : str
        URL of the league-season HTML page, in the form
        ``https://understat.com/league/{league_slug}/{season_id}``.
        Only its last two path segments are used.
    league_id : int
        League identifier; used only in the cache file name.
    season_id : int
        Season identifier; used only in the cache file name.
    no_cache : bool
        If True, ignore an existing cache file and request the data again.

    Returns
    -------
    dict
        A dict with the keys ``"datesData"``, ``"playersData"`` and
        ``"teamsData"``, taken from the response's ``"dates"``,
        ``"players"`` and ``"teams"`` entries.
    """
    self._ensure_cookies()
    # The API endpoint takes the same slug and season as the HTML page URL.
    parts = url.rstrip("/").split("/")
    league_slug, season = parts[-2], parts[-1]
    api_url = f"{UNDERSTAT_URL}/getLeagueData/{league_slug}/{season}"
    filepath = self.data_dir / f"league_{league_id}_season_{season_id}.json"
    # The reader may be an open cache file, so close it once it is parsed.
    with self._request_api(api_url, filepath, no_cache=no_cache) as reader:
        data = json.load(reader)
    return {
        "datesData": data["dates"],
        "playersData": data["players"],
        "teamsData": data["teams"],
    }
656675
def _read_match(self, url: str, match_id: int) -> Optional[dict]:
    """Load the data of a single match from ``/getMatchData``.

    Parameters
    ----------
    url : str
        Not used; kept so existing callers keep working.
    match_id : int
        Match identifier; used in the endpoint URL and cache file name.

    Returns
    -------
    dict or None
        A dict with the keys ``"match_info"``, ``"rostersData"`` and
        ``"shotsData"``, or None if fetching the data raised
        ``ConnectionError``.
    """
    self._ensure_cookies()
    api_url = f"{UNDERSTAT_URL}/getMatchData/{match_id}"
    filepath = self.data_dir / f"match_{match_id}.json"
    # NOTE(review): confirm that the errors raised by the session in
    # _request_api are (subclasses of) the ConnectionError in scope here.
    try:
        # The reader may be an open cache file, so close it once parsed.
        with self._request_api(api_url, filepath) as reader:
            data = json.load(reader)
    except ConnectionError:
        return None

    # The response has no ready-made match_info, so rebuild it: team names
    # come from the "tmpl" HTML fragments, team ids from the rosters.
    rosters = data["rosters"]
    # Assumes each side has at least one roster entry; an empty roster
    # makes next() raise StopIteration.
    home_team_id = next(iter(rosters["h"].values()))["team_id"]
    away_team_id = next(iter(rosters["a"].values()))["team_id"]
    match_info = {
        "h": home_team_id,
        "a": away_team_id,
        "team_h": self._extract_team_name(data["tmpl"]["home"]),
        "team_a": self._extract_team_name(data["tmpl"]["away"]),
    }

    return {
        "match_info": match_info,
        "rostersData": rosters,
        "shotsData": data["shots"],
    }
706+ def _request_api (
707+ self , url : str , filepath : Optional [Path ] = None , no_cache : bool = False
708+ ) -> IO [bytes ]:
709+ """Make an API request with proper headers and caching."""
710+ is_cached = filepath is not None and filepath .exists () and not no_cache and not self .no_cache
711+ if is_cached and filepath is not None :
712+ return filepath .open (mode = "rb" )
713+
714+ response = self ._session .get (url , headers = UNDERSTAT_HEADERS )
715+ response .raise_for_status ()
716+ payload = response .content
717+
718+ if not self .no_store and filepath is not None :
719+ filepath .parent .mkdir (parents = True , exist_ok = True )
720+ with filepath .open (mode = "wb" ) as fh :
721+ fh .write (payload )
722+
723+ return io .BytesIO (payload )
724+
725+ @staticmethod
726+ def _extract_team_name (html : str ) -> str :
727+ """Extract team name from tmpl HTML."""
728+ match = re .search (r'<h3><a[^>]*>([^<]+)</a></h3>' , html )
729+ if match :
730+ return match .group (1 )
731+ return ""
666732
667733
668734def _as_bool (value : Any ) -> Optional [bool ]:
@@ -690,4 +756,4 @@ def _as_str(value: Any) -> Optional[str]:
690756 try :
691757 return unescape (value )
692758 except (TypeError , ValueError ):
693- return None
759+ return None
0 commit comments