Skip to content

Commit 61c071b

Browse files
feat: fixed Understat scraper to work with the new JSON API endpoints (#907)
1 parent 1a0ea9d commit 61c071b

File tree

1 file changed

+82
-16
lines changed

1 file changed

+82
-16
lines changed

soccerdata/understat.py

Lines changed: 82 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,13 @@
11
"""Scraper for understat.com."""
22

33
import itertools
4+
import io
45
import json
6+
import re
57
from collections.abc import Iterable
68
from html import unescape
79
from pathlib import Path
8-
from typing import Any, Callable, Optional, Union
10+
from typing import Any, Callable, IO, Optional, Union
911

1012
import pandas as pd
1113

@@ -14,6 +16,7 @@
1416

1517
UNDERSTAT_DATADIR = DATA_DIR / "Understat"
1618
UNDERSTAT_URL = "https://understat.com"
19+
UNDERSTAT_HEADERS = {"X-Requested-With": "XMLHttpRequest"}
1720

1821
SHOT_SITUATIONS = {
1922
"OpenPlay": "Open Play",
@@ -82,6 +85,13 @@ def __init__(
8285
data_dir=data_dir,
8386
)
8487
self.seasons = seasons
88+
self._cookies_initialized = False
89+
90+
def _ensure_cookies(self) -> None:
    """Request the Understat homepage once so the session holds its cookies.

    Later calls return immediately; ``_cookies_initialized`` records that
    the homepage request has already been made on this instance.
    """
    if self._cookies_initialized:
        return

    # First call only: a plain GET of the homepage populates the session.
    self._session.get(UNDERSTAT_URL)
    self._cookies_initialized = True
8595

8696
def read_leagues(self) -> pd.DataFrame:
8797
"""Retrieve the selected leagues from the datasource.
@@ -637,32 +647,88 @@ def _select_matches(
637647
return df
638648

639649
def _read_leagues(self, no_cache: bool = False) -> dict:
640-
url = UNDERSTAT_URL
650+
self._ensure_cookies()
651+
url = UNDERSTAT_URL + "/getStatData"
641652
filepath = self.data_dir / "leagues.json"
642-
response = self.get(url, filepath, no_cache=no_cache, var="statData")
643-
return json.load(response)
653+
reader = self._request_api(url, filepath, no_cache=no_cache)
654+
data = json.load(reader)
655+
return {"statData": data["stat"]}
644656

645657
def _read_league_season(
646658
self, url: str, league_id: int, season_id: int, no_cache: bool = False
647659
) -> dict:
660+
self._ensure_cookies()
661+
# Extract league slug and season from the HTML page URL
662+
# URL format: https://understat.com/league/{league_slug}/{season_id}
663+
parts = url.rstrip("/").split("/")
664+
league_slug = parts[-2]
665+
season = parts[-1]
666+
api_url = UNDERSTAT_URL + f"/getLeagueData/{league_slug}/{season}"
648667
filepath = self.data_dir / f"league_{league_id}_season_{season_id}.json"
649-
response = self.get(
650-
url,
651-
filepath,
652-
no_cache=no_cache,
653-
var=["datesData", "playersData", "teamsData"],
654-
)
655-
return json.load(response)
668+
reader = self._request_api(api_url, filepath, no_cache=no_cache)
669+
data = json.load(reader)
670+
return {
671+
"datesData": data["dates"],
672+
"playersData": data["players"],
673+
"teamsData": data["teams"],
674+
}
656675

657676
def _read_match(self, url: str, match_id: int) -> Optional[dict]:
677+
self._ensure_cookies()
658678
try:
679+
api_url = UNDERSTAT_URL + f"/getMatchData/{match_id}"
659680
filepath = self.data_dir / f"match_{match_id}.json"
660-
response = self.get(url, filepath, var=["match_info", "rostersData", "shotsData"])
661-
data = json.load(response)
681+
reader = self._request_api(api_url, filepath)
682+
data = json.load(reader)
683+
684+
# Construct match_info from tmpl and rosters
685+
home_team_name = self._extract_team_name(data["tmpl"]["home"])
686+
away_team_name = self._extract_team_name(data["tmpl"]["away"])
687+
rosters = data["rosters"]
688+
home_team_id = next(iter(rosters["h"].values()))["team_id"]
689+
away_team_id = next(iter(rosters["a"].values()))["team_id"]
690+
691+
match_info = {
692+
"h": home_team_id,
693+
"a": away_team_id,
694+
"team_h": home_team_name,
695+
"team_a": away_team_name,
696+
}
697+
698+
return {
699+
"match_info": match_info,
700+
"rostersData": rosters,
701+
"shotsData": data["shots"],
702+
}
662703
except ConnectionError:
663-
data = None
704+
return None
664705

665-
return data
706+
def _request_api(
    self, url: str, filepath: Optional[Path] = None, no_cache: bool = False
) -> IO[bytes]:
    """Request an API endpoint with the XHR headers, using a file cache.

    Parameters
    ----------
    url : str
        Endpoint to request.
    filepath : Path, optional
        Location of the cached payload. When omitted, nothing is read from
        or written to disk.
    no_cache : bool
        If True, ignore an existing cached file and download again.

    Returns
    -------
    IO[bytes]
        A binary stream over the payload: the opened cache file on a cache
        hit, otherwise an in-memory buffer with the downloaded bytes. On a
        cache hit the caller owns the file handle and should close it.

    Notes
    -----
    Errors raised by the session's ``get`` or by
    ``response.raise_for_status()`` are not handled here and propagate to
    the caller.
    """
    cache_allowed = not (no_cache or self.no_cache)
    if filepath is not None and cache_allowed and filepath.exists():
        return filepath.open(mode="rb")

    response = self._session.get(url, headers=UNDERSTAT_HEADERS)
    response.raise_for_status()
    payload = response.content

    if filepath is not None and not self.no_store:
        filepath.parent.mkdir(parents=True, exist_ok=True)
        # Write to a sibling file and rename it into place, so an
        # interrupted write can never leave a truncated file that later
        # calls would serve as a valid cache hit.
        partial = filepath.with_name(filepath.name + ".tmp")
        partial.write_bytes(payload)
        partial.replace(filepath)

    return io.BytesIO(payload)
724+
725+
@staticmethod
def _extract_team_name(html: str) -> str:
    """Extract the team name from a ``tmpl`` HTML fragment.

    The name is the text of the first ``<h3><a ...>name</a></h3>`` element
    found in ``html``. Whitespace between the tags is tolerated.

    Parameters
    ----------
    html : str
        HTML fragment to search.

    Returns
    -------
    str
        The team name with HTML character references decoded and
        surrounding whitespace removed, or an empty string when the
        fragment contains no matching element.
    """
    found = re.search(r"<h3>\s*<a[^>]*>([^<]+)</a>\s*</h3>", html)
    if found is None:
        return ""
    # The captured text is raw HTML, so references such as "&amp;" must be
    # decoded to get the displayed name.
    return unescape(found.group(1)).strip()
666732

667733

668734
def _as_bool(value: Any) -> Optional[bool]:
@@ -690,4 +756,4 @@ def _as_str(value: Any) -> Optional[str]:
690756
try:
691757
return unescape(value)
692758
except (TypeError, ValueError):
693-
return None
759+
return None

0 commit comments

Comments
 (0)