Skip to content

Commit 73277bf

Browse files
authored
Add also second and third tier leagues (#94)
* Add also second and third tier leagues * Improve settings * Remove unused JSON files and clean up settings, correctly set the needed pages to get all competitions till third league * Fix the contract for competitions * Update competitions spider documentation and clean up settings file * Remove unused request_count variable from CompetitionsSpider * remove update of python and not needed changes
1 parent 900ae0e commit 73277bf

File tree

1 file changed

+31
-4
lines changed

1 file changed

+31
-4
lines changed

tfmkt/spiders/competitions.py

Lines changed: 31 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -14,9 +14,9 @@ class CompetitionsSpider(BaseSpider):
1414
def parse(self, response, parent):
1515
"""Parse confederations page. From this page we collect all
1616
confederation's competitions urls
17-
17+
This contract will scrape /europa, /europa?page=2, etc., until it reaches ?page=6
1818
@url https://www.transfermarkt.co.uk/wettbewerbe/europa
19-
@returns requests 25 25
19+
@returns requests 30 30
2020
@cb_kwargs {"parent": {}}
2121
"""
2222
# uncommenting the two lines below will open a scrapy shell with the context of this request
@@ -25,6 +25,31 @@ def parse(self, response, parent):
2525
# inspect_response(response, self)
2626
# exit(1)
2727

28+
# Making use of the ?page attribute to render more than just the first page of the confederation
29+
current_url = response.url
30+
if '?page=' not in current_url:
31+
# Number of pages to scrape for each confederation so that all competitions down to the third tier are found
32+
confederation_pages = {
33+
'/wettbewerbe/europa': 6,
34+
'/wettbewerbe/amerika': 3,
35+
'/wettbewerbe/asien': 3,
36+
'/wettbewerbe/afrika': 1
37+
}
38+
39+
# Find the confederation path
40+
confederation_path = None
41+
for path in confederation_pages.keys():
42+
if path in current_url:
43+
confederation_path = path
44+
break
45+
46+
if confederation_path:
47+
total_pages = confederation_pages[confederation_path]
48+
# Generate requests for pages 2 onwards (page 1 is handled below)
49+
for page_num in range(2, total_pages + 1):
50+
page_url = f"{confederation_path}?page={page_num}"
51+
yield response.follow(page_url, self.parse, cb_kwargs={'parent': parent})
52+
2853
table_rows = response.css('table.items tbody tr.odd, table.items tbody tr.even')
2954

3055
for row in table_rows[0:]:
@@ -65,12 +90,12 @@ def parse(self, response, parent):
6590
}
6691

6792
yield response.follow(self.base_url + href, self.parse_competitions, cb_kwargs=cb_kwargs)
68-
93+
6994
def parse_competitions(self, response, base):
7095
"""Parse competitions from the country competitions page.
7196
7297
@url https://www.transfermarkt.co.uk/wettbewerbe/national/wettbewerbe/157
73-
@returns items 3 3
98+
@returns items 5 5
7499
@cb_kwargs {"base": {"href": "some_href/3", "type": "competition", "parent": {}, "country_id": 1, "country_name": "n", "country_code": "CC"}}
75100
@scrapes type href parent country_id country_name country_code competition_type
76101
"""
@@ -115,6 +140,8 @@ def parse_competitions(self, response, base):
115140
tier = row.xpath('td/text()').get()
116141
if tier in [
117142
'First Tier',
143+
'Second Tier',
144+
'Third Tier',
118145
'Domestic Cup',
119146
'Domestic Super Cup'
120147
]:

0 commit comments

Comments
 (0)