-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpaulien.py
More file actions
33 lines (27 loc) · 1.6 KB
/
paulien.py
File metadata and controls
33 lines (27 loc) · 1.6 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider
from scrapy.spiders import Rule
import re
class PoemsSpider(CrawlSpider):
    """Crawl poetryfoundation.org and save each poem page as an HTML file.

    The crawl starts at the poem browse page and follows every link whose
    URL matches ``/poems``. Each response is handed to :meth:`parse_item`,
    which writes the poem's markup to ``<last URL segment>.html`` in the
    current working directory.
    """

    name = "poems"
    allowed_domains = ['www.poetryfoundation.org']
    start_urls = ['https://www.poetryfoundation.org/poems/browse']
    rules = (
        Rule(LinkExtractor(allow='/poems'), callback="parse_item", follow=True),
    )

    def parse_item(self, response):
        """Write the poem found in *response* to a UTF-8 encoded HTML file.

        Responses whose URL does not look like an individual poem page, or
        whose markup does not contain the expected poem container, are
        skipped without writing anything.

        Args:
            response: The Scrapy response for a crawled page.

        Returns:
            None. The only effect is the file written to disk.
        """
        # Only individual poem pages have a further path segment after
        # "/poems/"; listing pages such as "/poems/browse" do not match.
        if re.match('.*/poems/.*/', response.url) is None:
            return

        # This container holds the title, the author and the poem text.
        poem = response.xpath('//*[@id="mainContent"]/div/div[1]/article/div/div/div/div/div[1]/div').get()
        if poem is None:
            # Skip before opening the file so a page with an unexpected
            # layout does not leave an empty file behind.
            self.logger.warning("No poem markup found at %s", response.url)
            return

        # Title and author are extracted separately in case they are needed
        # later. Both XPaths may match nothing, so a missing title falls back
        # to an empty string instead of raising on ``None.strip()``.
        title = (response.xpath('//*[@id="mainContent"]/div/div[1]/article/div/div[1]/div/div/div[1]/div/div[1]/h1/text()').get() or '').strip()
        author = response.xpath('//*[@id="mainContent"]/div/div[1]/article/div/div[1]/div/div/div[1]/div/div[2]/div/span/a/text()').get()

        # Strip a trailing slash first; otherwise the last segment is empty
        # and every such page would be written to the same ".html" file.
        filename = response.url.rstrip("/").split("/")[-1] + '.html'

        # Always write UTF-8: the platform default encoding cannot represent
        # every character on some systems, and falling back only after a
        # failure produced files in mixed encodings.
        with open(filename, "w", encoding="utf-8") as f:
            f.write(poem)
        self.logger.debug("Saved poem %r by %r to %s", title, author, filename)

        # TODO: consider also saving the category tags the poem belongs to.