-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcollect_data.py
37 lines (28 loc) · 1018 Bytes
/
collect_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
from pprint import pprint
import requests
import bs4
# Scrape every product tile from a Tesco promotion listing, one results page
# at a time, and pretty-print the collected items keyed by each tile's
# ``data-auto-id`` attribute.
url = "https://www.tesco.com/groceries/en-GB/promotions/A32671682"
items = {}
page_no = 1


def _parse_tile(tile):
    """Return the title/image/price dict for one product tile, or None.

    None is returned when the title, image or price element is missing from
    the tile, so the caller can skip incomplete tiles instead of crashing.
    """
    title = tile.find(class_="sc-htoDjs")
    image = tile.find(class_="product-image")
    price_wrapper = tile.find(class_="price-control-wrapper")
    price = None
    if price_wrapper is not None:
        price = price_wrapper.find(class_="value")
    if title is None or image is None or price is None:
        return None
    return {
        "title": title.string,
        "image": image.get("src"),
        "price": price.string,
    }


# The session is closed when the ``with`` block exits, even on error.
with requests.Session() as connection:
    while True:
        # A timeout stops a stalled connection from hanging the script.
        page = connection.get(
            url,
            params={"page": str(page_no)},
            timeout=30,
        )
        if page.status_code == 404:
            # This page doesn't exist - we have reached the end of the products
            break
        # Any other error status (403, 500, ...) would otherwise be requested
        # again and again, because only a 404 ends the loop.
        page.raise_for_status()

        # Name the parser explicitly so results do not depend on which
        # optional parsers happen to be installed.
        soup = bs4.BeautifulSoup(page.text, "html.parser")
        divs = soup.find_all(class_="tile-content")
        if not divs:
            # A page without any product tiles also marks the end of the
            # listing; without this check the loop would never terminate.
            break

        for i in divs:
            item = _parse_tile(i)
            if item is None:
                # The item does not have all of the required attributes -
                # maybe it's out of stock?
                continue
            items[str(i.get("data-auto-id"))] = item

        page_no += 1

pprint(items)