Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .dvc/config
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
[core]
remote = myremote
autostage = true
['remote "myremote"']
url = gs://soccerdata-test-data
5 changes: 0 additions & 5 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,6 @@
# Rules can depend on other rules which run first. Rules with _ prefix are internal helpers.

MODULE_NAME = soccerdata
PYTHON_VERSION = 3.9
PYTHON_INTERPRETER = python
DOCS_PORT ?= 8000
SOCCERDATA_DIR ?= tests/appdata
.DEFAULT_GOAL := help
Expand Down Expand Up @@ -97,9 +95,6 @@ create-env: ## Set up python interpreter environment
requirements: ## Install Python dependencies
uv sync

.PHONY: publish-all
publish-all: format lint publish docs-publish ## Run format, lint, publish package and docs

# ━━━━━━━━━━━━━━━━━━━━━━━━━━━ Pre-Commits ━━━━━━━━━━━━━━━━━━━━━━━━━━ #

.PHONY: pre-commit-test pre-commit-update
Expand Down
7 changes: 5 additions & 2 deletions soccerdata/clubelo.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,9 @@ def read_by_date(self, date: Optional[Union[str, datetime]] = None) -> pd.DataFr
.set_index("team")
)

def read_team_history(self, team: str, max_age: Union[int, timedelta] = 1) -> pd.DataFrame:
def read_team_history(
self, team: str, max_age: Optional[Union[int, timedelta]] = 1
) -> pd.DataFrame:
"""Retrieve full ELO history for one club.

For the exact spelling of a club's name, check the result of
Expand All @@ -124,7 +126,8 @@ def read_team_history(self, team: str, max_age: Union[int, timedelta] = 1) -> pd
team : str
The club's name.
max_age : int for age in days, timedelta object, or None
The max. age of locally cached file before re-download.
The max. age of locally cached file before re-download. To disable
re-downloading, set to None.

Raises
------
Expand Down
6 changes: 2 additions & 4 deletions soccerdata/fotmob.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,16 +70,14 @@ def __init__(
(self.data_dir / "seasons").mkdir(parents=True, exist_ok=True)
(self.data_dir / "matches").mkdir(parents=True, exist_ok=True)

def _init_session(self) -> tls_requests.Client:
session = super()._init_session()
def _init_session(self, headers: Optional[dict[str, str]] = None) -> tls_requests.Client:
try:
r = tls_requests.get("http://46.101.91.154:6006/")
r.raise_for_status()
except tls_requests.exceptions.HTTPError:
raise ConnectionError("Unable to connect to the session cookie server.")
result = r.json()
session.headers.update(result)
return session
return super()._init_session(headers=result)

@property
def leagues(self) -> list[str]:
Expand Down
8 changes: 5 additions & 3 deletions soccerdata/understat.py
Original file line number Diff line number Diff line change
Expand Up @@ -707,7 +707,9 @@ def _request_api(
self, url: str, filepath: Optional[Path] = None, no_cache: bool = False
) -> IO[bytes]:
"""Make an API request with proper headers and caching."""
is_cached = filepath is not None and filepath.exists() and not no_cache and not self.no_cache
is_cached = (
filepath is not None and filepath.exists() and not no_cache and not self.no_cache
)
if is_cached and filepath is not None:
return filepath.open(mode="rb")

Expand All @@ -725,7 +727,7 @@ def _request_api(
@staticmethod
def _extract_team_name(html: str) -> str:
"""Extract team name from tmpl HTML."""
match = re.search(r'<h3><a[^>]*>([^<]+)</a></h3>', html)
match = re.search(r"<h3><a[^>]*>([^<]+)</a></h3>", html)
if match:
return match.group(1)
return ""
Expand Down Expand Up @@ -756,4 +758,4 @@ def _as_str(value: Any) -> Optional[str]:
try:
return unescape(value)
except (TypeError, ValueError):
return None
return None
1 change: 1 addition & 0 deletions tests/appdata/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
/data
6 changes: 3 additions & 3 deletions tests/appdata/data.dvc
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
outs:
- md5: 105c9a198a6d5dad8ef76ca820a3b3ed.dir
size: 242939408
nfiles: 1070
- md5: 875464d3a845bdaee53c992de3e08ebe.dir
size: 232008858
nfiles: 885
hash: md5
path: data
13 changes: 7 additions & 6 deletions tests/test_ClubElo.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ def _check_dataframe(self, df: pd.DataFrame) -> None:
assert pd.api.types.is_datetime64_any_dtype(df["from"])
assert pd.api.types.is_datetime64_any_dtype(df["to"])

@pytest.mark.fails_gha
def test_default(self, elo: ClubElo) -> None:
"""It should return a dataframe with the latest ELO ratings if no date is given."""
df = elo.read_by_date()
Expand Down Expand Up @@ -63,27 +64,27 @@ def _check_dataframe(self, df: pd.DataFrame) -> None:

def test_with_valid_team(self, elo: ClubElo) -> None:
"""It should return a dataframe with the ELO history for the specified club."""
df = elo.read_team_history("Feyenoord")
df = elo.read_team_history("Feyenoord", max_age=None)
self._check_dataframe(df)

def test_with_teamname_replacements(self, elo: ClubElo) -> None:
"""It should use the replacement names from teamname_replacements.json."""
# ClubElo uses "Man City" as the team name
df_original = elo.read_team_history("Man City")
df_replacement = elo.read_team_history("Manchester City")
df_original = elo.read_team_history("Man City", max_age=None)
df_replacement = elo.read_team_history("Manchester City", max_age=None)
assert df_original.equals(df_replacement)

def test_raises_when_team_not_found(self, elo: ClubElo) -> None:
"""It should raise an error if the team is not found."""
with pytest.raises(ValueError, match="No data found for team FC Knudde"):
_ = elo.read_team_history("FC Knudde")
_ = elo.read_team_history("FC Knudde", max_age=None)

def test_handles_special_characters_in_team_names(self, elo: ClubElo) -> None:
"""It should be able to deal with special characters in team names."""
df = elo.read_team_history("Brighton & Hove Albion")
df = elo.read_team_history("Brighton & Hove Albion", max_age=None)
self._check_dataframe(df)
with pytest.raises(ValueError, match="No data found for team Team & City"):
_ = elo.read_team_history("Team & City")
_ = elo.read_team_history("Team & City", max_age=None)

@pytest.mark.fails_gha
def test_respects_max_age_and_updates_cache(self, elo: ClubElo) -> None:
Expand Down
43 changes: 21 additions & 22 deletions tests/test_Integration.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,5 @@
"""Integration tests for soccerdata package."""

import pandas as pd
import pytest

import soccerdata as foo

# TODO: integration tests
# Names of common leagues equal for all classes
Expand All @@ -13,21 +9,24 @@
# Scores per game equal for all common leagues over classes


@pytest.mark.e2e
def test_mh_vs_elo():
"""We should be able to retrieve the Elo history for all teams in these leagues."""
league_sel = [
"ENG-Premier League",
"ESP-La Liga",
"FRA-Ligue 1",
"GER-Bundesliga",
"ITA-Serie A",
]

mh = foo.MatchHistory(leagues=league_sel, seasons="1819")
mh_games = mh.read_games()

elo = foo.ClubElo()
elo_hist = pd.concat([elo.read_team_history(team) for team in set(mh_games["home_team"])])

assert set(mh_games["home_team"]) - set(elo_hist["team"]) == set()
# FIXME: disabled for now because ClubElo is flaky
# @pytest.mark.e2e
# def test_mh_vs_elo():
# """We should be able to retrieve the Elo history for all teams in these leagues."""
# league_sel = [
# "ENG-Premier League",
# "ESP-La Liga",
# "FRA-Ligue 1",
# "GER-Bundesliga",
# "ITA-Serie A",
# ]
#
# mh = foo.MatchHistory(leagues=league_sel, seasons="1819")
# mh_games = mh.read_games()
#
# elo = foo.ClubElo()
# elo_hist = pd.concat(
# [elo.read_team_history(team, max_age=None) for team in set(mh_games["home_team"])]
# )
#
# assert set(mh_games["home_team"]) - set(elo_hist["team"]) == set()
109 changes: 94 additions & 15 deletions tests/test_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import json
from datetime import datetime, timezone
from unittest.mock import MagicMock, patch

import pandas as pd
import pytest
Expand All @@ -20,55 +21,133 @@
# _download_and_save


def test_download_and_save_not_cached(tmp_path):
@pytest.fixture
def mock_tls_client():
    """Patch ``tls_requests.Client.get`` and yield the mock with response helpers.

    The yielded mock carries two helper callables. Each one configures the
    response returned by the patched ``get`` and then returns the mock itself:

    * ``return_csv(content)`` -- respond with ``content`` encoded as UTF-8.
    * ``return_js_var(var_name, data)`` -- respond with a
      ``var <var_name> = JSON.parse('<escaped json>')`` snippet.

    Every configured response reports status code 200 and has a no-op
    ``raise_for_status``.
    """
    with patch("tls_requests.Client.get") as mock_get:

        def _set_response(text):
            # Shared by both helpers: build a successful response whose body
            # is `text` and install it as the return value of the patched get.
            mock_resp = MagicMock()
            mock_resp.content = text.encode("utf-8")
            mock_resp.status_code = 200
            mock_resp.raise_for_status = lambda: None
            mock_get.return_value = mock_resp
            return mock_get

        def _return_csv(content="Rank,Club,Country\n1,Barcelona,ESP"):
            """Make the patched ``get`` return ``content`` as a CSV body."""
            return _set_response(content)

        def _return_js_var(var_name="statData", data=None):
            r"""Make the patched ``get`` return a JS variable assignment.

            Mimics: var name = JSON.parse('\x7b\x22key\x22\x3a\x22value\x22\x7d')
            The regex in the reader expects string-escaped content inside single quotes.
            """
            # Use None as the default instead of a dict literal so that no
            # mutable object is shared between calls.
            if data is None:
                data = {"key": "value"}
            # 1. Convert dict to JSON string
            json_str = json.dumps(data)
            # 2. Escape double quotes so it survives being wrapped in single quotes
            #    and works with the reader's .decode("unicode_escape")
            escaped_json = json_str.replace('"', '\\"')
            return _set_response(f"var {var_name} = JSON.parse('{escaped_json}')")

        mock_get.return_csv = _return_csv
        mock_get.return_js_var = _return_js_var
        yield mock_get


# --- Tests ---


def test_download_and_save_not_cached(tmp_path, mock_tls_client):
# Setup mock
mock_tls_client.return_csv()

reader = BaseRequestsReader()
url = "http://api.clubelo.com/Barcelona"
filepath = tmp_path / "Barcelona.csv"
data = reader._download_and_save(url, filepath)
data = reader.get(url, filepath)

assert isinstance(pd.read_csv(data), pd.DataFrame)
assert filepath.exists()


def test_download_and_save_cached(tmp_path, mock_tls_client):
# Setup mock
mock_tls_client.return_csv()

def test_download_and_save_cached(tmp_path):
reader = BaseRequestsReader()
url = "http://api.clubelo.com/Barcelona"
filepath = tmp_path / "Barcelona.csv"
data = reader._download_and_save(url, filepath)
data = reader._download_and_save(url, filepath)

# First call: triggers the mock/download
reader.get(url, filepath)
# Second call: should read from disk
data = reader.get(url, filepath)

assert isinstance(pd.read_csv(data), pd.DataFrame)
# Verify the network was only hit once
assert mock_tls_client.call_count == 1


def test_download_and_save_no_cache(tmp_path, mock_tls_client):
# Setup mock with at least 2 rows of data
mock_tls_client.return_csv("Col1,Col2\nVal1,Val2\nVal3,Val4")

def test_download_and_save_no_cache(tmp_path):
reader = BaseRequestsReader(no_cache=True)
url = "http://api.clubelo.com/Barcelona"
filepath = tmp_path / "Barcelona.csv"

# Pre-populate with bogus data
filepath.write_text("bogus")
data = reader._download_and_save(url, filepath)
assert len(pd.read_csv(data)) > 1

data = reader.get(url, filepath)
# If no_cache=True, it should have overwritten "bogus" with our 2-row CSV
assert len(pd.read_csv(data)) >= 2


def test_download_and_save_no_store_no_filepath(mock_tls_client):
# Setup mock
mock_tls_client.return_csv()

def test_download_and_save_no_store_no_filepath():
reader = BaseRequestsReader(no_store=True)
url = "http://api.clubelo.com/Barcelona"
data = reader._download_and_save(url, filepath=None)
data = reader.get(url, filepath=None)

assert isinstance(pd.read_csv(data), pd.DataFrame)


def test_download_and_save_no_cache_filepath(tmp_path):
def test_download_and_save_no_cache_filepath(tmp_path, mock_tls_client):
# Setup mock
mock_tls_client.return_csv()

reader = BaseRequestsReader(no_store=True)
url = "http://api.clubelo.com/Barcelona"
filepath = tmp_path / "Barcelona.csv"
data = reader._download_and_save(url, filepath)

data = reader.get(url, filepath)

assert isinstance(pd.read_csv(data), pd.DataFrame)
# no_store=True means the file should be deleted or never written
assert not filepath.exists()


def test_download_and_save_variable_no_store_no_filepath():
def test_download_and_save_variable_no_store_no_filepath(mock_tls_client):
# Setup mock using the JS variable helper
mock_tls_client.return_js_var(var_name="statData", data={"player": "Messi", "goals": 10})

reader = BaseRequestsReader(no_store=True)
url = "https://understat.com/"
data = reader._download_and_save(url, filepath=None, var="statData")
data = reader.get(url, filepath=None, var="statData")

stats = json.load(data)
assert isinstance(stats, dict)
assert "statData" in stats
# the result is wrapped in {var_name: data}
assert stats["statData"]["player"] == "Messi"


# def test_download_and_save_requests_tor(tmp_path):
Expand Down