-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscrape.rake
105 lines (93 loc) · 3.43 KB
/
scrape.rake
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
# Execute this rake file by issuing rake scrape:fl_parks
namespace :scrape do
  # Scrapes every page of the park listing, then each park's detail page, and
  # finally creates one Park record per listing entry.
  #
  # Every listing entry is collected into a single hash. A park that lacks an
  # address, a phone number or a description therefore only leaves its own
  # fields empty; it can no longer shift the data of the parks that follow it,
  # which is what happened when each field lived in its own parallel array.
  desc "Scrape list of Florida parks from the Florida State Parks website"
  task fl_parks: :environment do
    base_url = "https://www.floridastateparks.org"
    list_root = "//*[@id='block-system-main']/div"

    # Downloads +url+ and returns the parsed HTML document. Any response other
    # than 200 ends the task through +abort+, so the failure is written to
    # STDERR and the process reports a non-zero exit status.
    fetch_dom = lambda do |url|
      response = HTTParty.get(url)
      abort "Error; response code #{response.code}" unless response.code == 200
      Nokogiri::HTML(response.body)
    end

    # Splits a "City, ST 12345" style line into [city, state, zip].
    # City is the item before the last comma; state and zip are the first and
    # last whitespace-separated tokens after it. Parts that are not present
    # (nil line, no comma, empty tail) come back as nil instead of raising.
    split_locality = lambda do |line|
      parts = line.to_s.split(",")
      city = parts.size > 1 ? parts[-2].strip : nil
      tokens = parts.last.to_s.split(" ")
      [city, tokens.first, tokens.last]
    end

    parks = []
    url = base_url + "/parks-and-trails?type[0]=park"
    while url
      dom = fetch_dom.call(url)

      # Walk the listing rows in order (XPath positions start at 1). The first
      # row without a park link marks the end of the parks on this page.
      row_number = 1
      loop do
        row = "#{list_root}/div[3]/div[#{row_number}]"
        link = dom.at_xpath("#{row}/div/div/div/a")
        break unless link

        # First child of <address> is the street line, the second one holds
        # city, state and zip. Either may be absent.
        address = dom.at_xpath("#{row}//div[@class='park-list-info']/address")
        address_lines = address ? address.element_children.map(&:content) : []
        phone = dom.at_xpath("#{row}//div[@class='park-list-info']/span[@class='park-list-phone']")

        parks << {
          name: link.content,
          path: link["href"],
          street: address_lines[0],
          locality: address_lines[1],
          phone: phone && phone.content
        }
        print "."
        row_number += 1
      end

      # Follow the pager until there is no "next page" link left.
      next_link = dom.at_xpath("#{list_root}/div[4]/ul//a[@title='Go to next page']")
      url = next_link && (base_url + next_link["href"])
    end

    puts "\nObtaining park details."
    parks.each do |park|
      dom = fetch_dom.call(base_url + park[:path])
      description = dom.at_xpath("//*[@id='block-system-main']//div[@class='views-field views-field-field-description']")
      # A page without a description only leaves this park's description
      # empty; the remaining parks are still visited.
      next unless description

      park[:description] = description.content.strip
      print "."
    end

    puts "\nPopulating database."
    parks.each do |park|
      city, state, zip = split_locality.call(park[:locality])
      # NOTE(review): running the task twice inserts every park twice, and
      # `create` presumably does not raise when a record is invalid (assuming
      # Park is an ActiveRecord model) - confirm whether that is intended.
      Park.create(
        name: park[:name],
        address: park[:street],
        city: city,
        state: state,
        zip: zip,
        phone_number: park[:phone],
        park_url: base_url + park[:path],
        description: park[:description]
      )
    end
  end
end