Skip to content

Commit bf56bc6

Browse files
committed
test: Add comprehensive extraction test script
Created test_extraction.py to verify that the complete data extraction pipeline works correctly after fixing import issues. The script tests three phases:

Phase 1: Test soccerdata library directly
- Verifies all 9 data source classes can be instantiated
- Checks available read_* methods
- Tests: FBref, FotMob, Understat, WhoScored, Sofascore, ESPN, ClubElo, MatchHistory, SoFIFA

Phase 2: Test our custom extractor classes
- Imports all 9 extractor classes
- Verifies they're importable (may fail if psycopg2 is not installed)

Phase 3: Test basic data extraction (optional)
- Makes a real API call to FBref
- Fetches the Premier League 2023-24 schedule
- Verifies data is returned correctly
- Demonstrates that the extraction pipeline works end-to-end

Usage: python test_extraction.py

This script helps diagnose any remaining issues before running the full historical loader.
1 parent 183d729 commit bf56bc6

File tree

1 file changed

+234
-0
lines changed

1 file changed

+234
-0
lines changed

test_extraction.py

Lines changed: 234 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,234 @@
1+
#!/usr/bin/env python3
2+
"""
3+
Test script to verify each data source can extract data successfully.
4+
Tests with a single league/season to minimize API calls and time.
5+
"""
6+
7+
import sys
8+
from pathlib import Path
9+
10+
# Put this script's directory at the front of sys.path so that the
# project-local `scripts.*` imports used below resolve when run directly.
project_root = Path(__file__).parent
sys.path.insert(0, str(project_root))
13+
14+
def test_soccerdata_direct():
    """Check that every soccerdata scraper class can be instantiated.

    Imports ``soccerdata``, then for each known data source looks the class
    up on the module, instantiates it (passing the test league/season where
    the class is flagged as taking them) and lists its ``read_*`` methods.
    No ``read_*`` method is called, so no data is fetched here.

    Scraper classes are resolved by name inside the per-source ``try`` so
    that a class missing from the installed soccerdata version is reported
    as a failure for that one source instead of aborting the whole phase
    with an ``AttributeError``.

    Returns:
        bool: True if soccerdata imported and every source instantiated,
        False otherwise.
    """
    print("="*80)
    print("PHASE 1: Testing soccerdata library directly")
    print("="*80)

    try:
        import soccerdata as sd
    except ImportError as e:
        print(f"\n❌ Failed to import soccerdata: {e}")
        return False

    # Do not let a missing version attribute abort the diagnostics.
    print(f"\n✅ soccerdata v{getattr(sd, '__version__', 'unknown')} imported")

    # Test parameters
    test_league = "ENG-Premier League"
    test_season = "2324"

    # Class name on the soccerdata module -> whether it takes league/season.
    # NOTE(review): some soccerdata releases may construct SoFIFA with
    # `versions` instead of `seasons` - confirm against the installed version.
    sources = {
        'FBref': True,
        'FotMob': True,
        'Understat': True,
        'Sofascore': True,
        'ESPN': True,
        'WhoScored': True,
        'MatchHistory': True,
        'ClubElo': False,  # ClubElo doesn't take league/season
        'SoFIFA': True,
    }

    results = {}

    for name, needs_params in sources.items():
        print(f"\n{name}:")
        print("-" * 40)

        try:
            # Looked up here (not when building `sources`) so an absent
            # class only fails this source.
            cls = getattr(sd, name)

            # Instantiate
            if needs_params:
                scraper = cls(leagues=test_league, seasons=test_season)
            else:
                scraper = cls()

            print(" ✅ Instantiated successfully")

            # Only list the reader methods; nothing is fetched yet
            methods = [m for m in dir(scraper) if m.startswith('read_')]
            print(f" 📚 Available methods: {len(methods)}")
            if methods:
                print(f" Examples: {', '.join(methods[:3])}")

            results[name] = 'AVAILABLE'

        except Exception as e:
            # Diagnostic boundary: record the failure and keep probing.
            print(f" ❌ Failed: {type(e).__name__}: {str(e)[:100]}")
            results[name] = f'FAILED: {type(e).__name__}'

    # Summary
    print("\n" + "="*80)
    print("PHASE 1 SUMMARY")
    print("="*80)

    available = [k for k, v in results.items() if v == 'AVAILABLE']
    failed = [k for k, v in results.items() if v != 'AVAILABLE']

    print(f"\n✅ Available: {len(available)}/{len(sources)}")
    for source in available:
        print(f" • {source}")

    if failed:
        print(f"\n❌ Failed: {len(failed)}/{len(sources)}")
        for source in failed:
            print(f" • {source}: {results[source]}")

    return not failed
89+
90+
91+
def test_our_extractors():
    """Check that the project's extractor classes can be imported.

    Imports the helpers from ``scripts.utils`` and the nine extractor
    classes from ``scripts.extractors``. The imported names are not used;
    the import itself is the test.

    The psycopg2 hint is printed only when psycopg2 is the module that
    failed to import, so a different missing module is not misdiagnosed.
    Any other exception raised while importing is reported as a failure
    rather than aborting the run.

    Returns:
        bool: True if every import succeeded, False otherwise.
    """
    print("\n" + "="*80)
    print("PHASE 2: Testing our extractor classes")
    print("="*80)

    try:
        from scripts.utils import get_config_loader, DatabaseManager  # noqa: F401
        from scripts.extractors import (  # noqa: F401
            FBrefExtractor,
            FotMobExtractor,
            UnderstatExtractor,
            WhoScoredExtractor,
            SofascoreExtractor,
            ESPNExtractor,
            ClubEloExtractor,
            MatchHistoryExtractor,
            SoFIFAExtractor,
        )

    except ImportError as e:
        print(f"\n❌ Failed to import extractors: {e}")
        # ImportError.name is the module that could not be imported, if known
        missing = getattr(e, 'name', None) or ''
        if missing.split('.')[0] == 'psycopg2':
            print("\n💡 This is expected if database dependencies (psycopg2) aren't installed")
            print(" The extractors require DatabaseManager which needs psycopg2")
        else:
            print(f"\n💡 Module that failed to import: {missing or 'unknown'}")
            print(" Check that it is installed or present under the project root")
        return False

    except Exception as e:
        # Import-time errors other than ImportError are reported too, so the
        # remaining phases and the final summary still run.
        print(f"\n❌ Failed to import extractors: {type(e).__name__}: {e}")
        return False

    print("\n✅ All extractor imports successful")
    print("\n✅ Extractor classes are importable")
    print(" (Full testing requires database connection)")

    return True
123+
124+
125+
def test_basic_extraction():
    """Fetch one FBref schedule to prove extraction works without a database.

    Builds an ``sd.FBref`` scraper for the 2023-24 Premier League, calls
    ``read_schedule()`` and prints the row count, the first few column
    names and one sample fixture.

    The sample fixture is read from the ``home_team`` / ``away_team``
    columns, falling back to ``home`` / ``away``. Indexing a missing column
    directly raised ``KeyError``, which the handler below reported as a
    failed extraction even though the fetch had succeeded.

    Returns:
        bool: True if the schedule was fetched, False if anything raised.
    """
    print("\n" + "="*80)
    print("PHASE 3: Testing basic data extraction (no database)")
    print("="*80)

    try:
        import soccerdata as sd

        print("\nAttempting to fetch FBref schedule for testing...")
        print("(This will make a real API call - may take 10-20 seconds)")

        # Use a recent season that should have data
        fbref = sd.FBref(leagues='ENG-Premier League', seasons='2324')

        print(" Fetching schedule...")
        df = fbref.read_schedule()

        print(f" ✅ Successfully fetched {len(df)} matches")
        print(f" 📊 Columns: {list(df.columns)[:5]}...")

        if len(df) > 0:
            first_match = df.iloc[0]
            # Series.get returns the default instead of raising on a
            # missing column, so the sample line can never fail the phase.
            home = first_match.get('home_team', first_match.get('home'))
            away = first_match.get('away_team', first_match.get('away'))
            print(f" 📝 Sample match: {home} vs {away}")

        return True

    except Exception as e:
        # Diagnostic boundary: report whatever went wrong and carry on.
        print(f" ❌ Failed: {type(e).__name__}: {e}")
        print("\n💡 This might be due to:")
        print(" • Network issues")
        print(" • Rate limiting")
        print(" • Website changes")
        print(" • Invalid league/season combination")
        return False
160+
161+
162+
def main():
    """Run all test phases interactively and return a process exit code.

    Runs Phase 1 (soccerdata classes) and Phase 2 (extractor imports), then
    asks whether to run the optional Phase 3 (one real FBref fetch).

    A Phase 3 run that fails now counts against the verdict; previously it
    was printed in the summary but the script still reported success. Both
    prompts tolerate a closed stdin (``EOFError``): the first continues and
    the second defaults to "no".

    Returns:
        int: 0 if Phases 1 and 2 passed and Phase 3 did not fail,
        1 if only the extractor imports failed,
        2 for any other combination of failures.
    """
    print("\n" + "="*80)
    print("SOCCERDATA EXTRACTION TEST SUITE")
    print("="*80)
    print("\nThis script will test:")
    print(" 1. soccerdata library classes")
    print(" 2. Our custom extractor imports")
    print(" 3. Basic data extraction (1 API call)")

    try:
        input("\nPress Enter to continue...")
    except EOFError:
        # No interactive stdin (CI, piped run): proceed without waiting.
        print()

    # Phase 1: Test soccerdata directly
    phase1_ok = test_soccerdata_direct()

    # Phase 2: Test our extractors
    phase2_ok = test_our_extractors()

    # Phase 3: Test basic extraction (optional)
    print("\n" + "="*80)
    print("OPTIONAL: Test actual data fetching?")
    print("="*80)
    print("\nThis will make a real API call to FBref.")
    print("It's safe and respectful (uses caching), but may take 10-20 seconds.")

    try:
        response = input("\nRun extraction test? [y/N]: ").strip().lower()
    except EOFError:
        # Same default as pressing Enter at the [y/N] prompt.
        response = ''

    if response == 'y':
        phase3_ok = test_basic_extraction()
    else:
        print("\n⏭️ Skipping extraction test")
        phase3_ok = None  # None means "not run", distinct from a failure

    # Final summary
    print("\n" + "="*80)
    print("FINAL SUMMARY")
    print("="*80)

    print(f"\n{'✅' if phase1_ok else '❌'} Phase 1: soccerdata library classes")
    print(f"{'✅' if phase2_ok else '⚠️ '} Phase 2: Our extractor imports")
    if phase3_ok is not None:
        print(f"{'✅' if phase3_ok else '❌'} Phase 3: Basic data extraction")

    # A skipped Phase 3 (None) is neutral; only an actual failure counts.
    phase3_failed = phase3_ok is False

    if phase1_ok and phase2_ok and not phase3_failed:
        print("\n" + "="*80)
        print("🎉 SUCCESS! Ready to run historical loader")
        print("="*80)
        print("\nNext steps:")
        print(" 1. Configure your database in .env")
        print(" 2. Run: python -m scripts.historical_loader")
        print(" 3. Monitor logs in logs/ directory")
        return 0

    if phase1_ok and not phase2_ok and not phase3_failed:
        print("\n" + "="*80)
        print("⚠️ PARTIAL SUCCESS")
        print("="*80)
        print("\nsoccerdata works, but extractor imports failed.")
        print("This is expected if psycopg2-binary is not installed.")
        print("\nTo fix:")
        print(" pip install -r requirements-database.txt")
        return 1

    print("\n" + "="*80)
    print("❌ ISSUES DETECTED")
    print("="*80)
    print("\nPlease share this output for diagnosis.")
    return 2
231+
232+
233+
# Script entry point: use main()'s return value as the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())

0 commit comments

Comments
 (0)