-
Notifications
You must be signed in to change notification settings - Fork 783
Add examples for new quotes.toscrape.com endpoints (fixes #15) #16
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
cad2832
bded20a
88e5f88
0c85efc
c5628ef
044096b
d540815
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,22 +1,22 @@ | ||
| # -*- coding: utf-8 -*- | ||
| import scrapy | ||
| from quotesbot.items import QuotesbotItem | ||
|
|
||
|
|
||
class ToScrapeCSSSpider(scrapy.Spider):
    """Scrape quotes from quotes.toscrape.com using CSS selectors.

    Yields one ``QuotesbotItem`` per quote on each page and follows the
    "Next" pagination link until the last page.
    """

    name = "toscrape-css"
    start_urls = [
        "http://quotes.toscrape.com/",
    ]

    def parse(self, response):
        """Extract every quote on the page, then schedule the next page.

        :param response: the downloaded listing page.
        :returns: iterator of ``QuotesbotItem`` and ``scrapy.Request``.
        """
        for quote in response.css("div.quote"):
            yield QuotesbotItem(
                text=quote.css("span.text::text").get(),
                author=quote.css("small.author::text").get(),
                tags=quote.css("div.tags > a.tag::text").getall(),
            )

        # The last page has no li.next element, so .get() returns None
        # and the crawl stops naturally.
        next_page_url = response.css("li.next > a::attr(href)").get()
        if next_page_url is not None:
            yield scrapy.Request(response.urljoin(next_page_url))
|
|
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,40 @@ | ||
| import json | ||
| import re | ||
| import scrapy | ||
| from quotesbot.items import QuotesbotItem | ||
|
|
||
|
|
||
class ToScrapeJSSpider(scrapy.Spider):
    """Scrape quotes from the JavaScript-rendered /js/ endpoint.

    The page body is empty HTML; the quote data lives in an inline
    ``<script>`` tag as a ``var data = [...]`` JSON literal, so we pull
    the script text and parse the JSON directly — no browser needed.
    """

    name = "toscrape-js"
    start_urls = ["http://quotes.toscrape.com/js/"]

    def parse(self, response):
        """Parse the inline JSON payload and follow pagination.

        :param response: the downloaded /js/ page.
        :returns: iterator of ``QuotesbotItem`` and ``scrapy.Request``.
        """
        script_data = response.xpath(
            '//script[contains(text(), "var data =")]/text()'
        ).get()
        if script_data:
            # Extract the JSON list from the script text.  re.DOTALL is
            # required because the array spans multiple lines; the lazy
            # quantifier stops at the first "];".
            match = re.search(
                r"var data = (\[.*?\]);", script_data, re.DOTALL
            )
            if match:
                json_str = match.group(1)
                data = json.loads(json_str)
                for quote in data:
                    yield QuotesbotItem(
                        text=quote["text"],
                        author=quote["author"]["name"],
                        tags=quote["tags"],
                    )

        # Although the quote content is JS-generated, the pagination links
        # are plain <a> tags in the static HTML, so the usual selector works.
        next_page = response.css("li.next > a::attr(href)").get()
        if next_page:
            yield scrapy.Request(response.urljoin(next_page))
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,37 @@ | ||
| import scrapy | ||
| from quotesbot.items import QuotesbotItem | ||
|
|
||
|
|
||
class ToScrapeLoginSpider(scrapy.Spider):
    """Demonstrate form-based login on quotes.toscrape.com/login.

    The site accepts any username/password pair; the point of the example
    is the CSRF-protected form submission, which
    ``FormRequest.from_response`` handles by carrying over hidden fields.

    NOTE(review): there is no login-gated content on this site, so the
    quotes scraped after login are identical to the anonymous ones — the
    scraping below mainly proves the session cookie is being reused.
    """

    name = "toscrape-login"
    login_url = "http://quotes.toscrape.com/login"
    start_urls = [login_url]

    def parse(self, response):
        """Submit the login form.

        The CSRF token is a hidden input, so ``from_response`` picks it
        up automatically along with the form action/method.
        """
        return scrapy.FormRequest.from_response(
            response,
            formdata={"username": "myuser", "password": "mypassword"},
            callback=self.after_login,
        )

    def after_login(self, response):
        """Verify the login and scrape quotes using the logged-in session.

        A successful login swaps the "Login" link for "Logout", which is
        the cheapest success signal available in the HTML.
        """
        if "Logout" in response.text:
            self.logger.info("Login successful!")
            for quote in response.css("div.quote"):
                yield QuotesbotItem(
                    text=quote.css("span.text::text").get(),
                    author=quote.css("small.author::text").get(),
                    tags=quote.css("div.tags > a.tag::text").getall(),
                )

            # Keep routing follow-ups through after_login so the Logout
            # check runs on every page of the session.
            next_page = response.css("li.next > a::attr(href)").get()
            if next_page:
                yield scrapy.Request(
                    response.urljoin(next_page), callback=self.after_login
                )
        else:
            self.logger.error("Login failed")
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,18 @@ | ||
| import scrapy | ||
| from quotesbot.items import QuotesbotItem | ||
|
|
||
|
|
||
class ToScrapeRandomSpider(scrapy.Spider):
    """Scrape the /random endpoint, which serves one random quote per hit.

    By default a single quote is fetched (one request, then stop).  Pass
    ``-a max_quotes=N`` to collect N random quotes; duplicates are
    possible since the endpoint picks quotes at random.
    """

    name = "toscrape-random"
    start_urls = ["http://quotes.toscrape.com/random"]
    # Default keeps the original single-request behaviour; spider
    # arguments (-a) arrive as strings, hence the int() below.
    max_quotes = 1

    def parse(self, response):
        """Yield the quote on the page and optionally re-request the URL."""
        yield QuotesbotItem(
            text=response.css("span.text::text").get(),
            author=response.css("small.author::text").get(),
            tags=response.css("div.tags > a.tag::text").getall(),
        )

        # Counter lives on the instance so it survives across callbacks.
        self._fetched = getattr(self, "_fetched", 0) + 1
        if self._fetched < int(self.max_quotes):
            # Same URL every time: dont_filter bypasses the dupe filter,
            # and the counter above guarantees termination.
            yield scrapy.Request(response.url, dont_filter=True)
|
Comment on lines
+16
to
+18
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What about removing this comment, and instead define a |
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,22 @@ | ||
| import json | ||
| import scrapy | ||
| from quotesbot.items import QuotesbotItem | ||
|
|
||
|
|
||
class ToScrapeScrollSpider(scrapy.Spider):
    """Scrape the infinite-scroll page via its JSON API.

    The /scroll page loads quotes with XHR calls to
    ``/api/quotes?page=N``; hitting that API directly is simpler and
    cheaper than rendering the JavaScript page.
    """

    name = "toscrape-scroll"
    start_urls = ["http://quotes.toscrape.com/api/quotes?page=1"]

    def parse(self, response):
        """Parse one API page and request the next while has_next is true.

        :param response: JSON response with ``quotes``, ``page`` and
            ``has_next`` keys.
        """
        data = json.loads(response.text)
        for quote in data["quotes"]:
            yield QuotesbotItem(
                text=quote["text"], author=quote["author"]["name"], tags=quote["tags"]
            )

        # The API tells us explicitly when to stop, so no selector-based
        # pagination is needed.
        if data["has_next"]:
            next_page = data["page"] + 1
            yield scrapy.Request(
                url=f"http://quotes.toscrape.com/api/quotes?page={next_page}",
                callback=self.parse,
            )
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,24 @@ | ||
| import scrapy | ||
| from quotesbot.items import QuotesbotItem | ||
|
|
||
|
|
||
class ToScrapeTableSpider(scrapy.Spider):
    """Scrape quotes from the /tableful endpoint.

    Unlike the main site, /tableful renders everything inside a plain
    ``<table>`` with no ``div.quote`` containers: each quote occupies two
    consecutive rows (text + author in the first, tags in the second) and
    the final row holds the pagination links.  Selecting ``div.quote``
    here matches nothing, so we walk the rows instead.
    """

    name = "toscrape-table"
    start_urls = ["http://quotes.toscrape.com/tableful"]

    def parse(self, response):
        """Pair up quote/tag rows, emit items, and follow the Next link."""
        rows = response.xpath("//table//tr[td]")
        # zip() pairs row 0 with 1, 2 with 3, ... and silently drops the
        # trailing unpaired pagination row when the row count is odd.
        for quote_row, tag_row in zip(rows[::2], rows[1::2]):
            # Cell text looks like: “quote text” Author: Author Name
            # — TODO(review): confirm this layout against a live page.
            raw = quote_row.xpath("string(.)").get("").strip()
            text, _, author = raw.partition("Author:")
            yield QuotesbotItem(
                text=text.strip(),
                author=author.strip(),
                tags=tag_row.css("a::text").getall(),
            )

        # No li.next here either; the Next link sits in the last table row.
        next_page = response.xpath(
            '//a[contains(text(), "Next")]/@href'
        ).get()
        if next_page:
            yield scrapy.Request(response.urljoin(next_page))
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,37 @@ | ||
| import scrapy | ||
| from quotesbot.items import QuotesbotItem | ||
|
|
||
|
|
||
class ToScrapeViewStateSpider(scrapy.Spider):
    """Demonstrate ASP.NET ViewState form handling on /search.aspx.

    The search page is an ASP.NET WebForms page whose filter form carries
    hidden ``__VIEWSTATE``-style fields; ``FormRequest.from_response``
    copies those over automatically, which is the whole trick.
    """

    name = "toscrape-viewstate"
    start_urls = ["http://quotes.toscrape.com/search.aspx"]

    def parse(self, response):
        """Scrape any quotes on the landing page, then submit a tag filter."""
        for quote in response.css("div.quote"):
            yield QuotesbotItem(
                text=quote.css("span.text::text").get(),
                author=quote.css("small.author::text").get(),
                tags=quote.css("div.tags > a.tag::text").getall(),
            )

        # Only submit if the tag filter control is actually present;
        # guards against the page layout changing out from under us.
        if response.css('form select[name="tag"], form input[name="tag"]'):
            yield scrapy.FormRequest.from_response(
                response,
                formdata={"tag": "love"},
                callback=self.parse_filtered_results,
                # Submit the form data as-is instead of simulating a
                # button click (there may be multiple submit buttons).
                dont_click=True,
            )

    def parse_filtered_results(self, response):
        """Extract quotes from the ViewState-filtered result page."""
        for quote in response.css("div.quote"):
            yield QuotesbotItem(
                text=quote.css("span.text::text").get(),
                author=quote.css("small.author::text").get(),
                tags=quote.css("div.tags > a.tag::text").getall(),
            )
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
There seems to be some artifacts here.