forked from alltheplaces/alltheplaces
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathstore_locator_plus_self.py
More file actions
137 lines (126 loc) · 6.2 KB
/
store_locator_plus_self.py
File metadata and controls
137 lines (126 loc) · 6.2 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
from typing import Iterable
from scrapy import Spider
from scrapy.http import FormRequest, Response
from locations.dict_parser import DictParser
from locations.geo import country_iseadgg_centroids, point_locations
from locations.items import Feature
# This store finder is a self-hosted WordPress plugin with a website
# of https://wordpress.org/plugins/store-locator-le/ and source code
# at https://github.com/wp-plugins/store-locator-le
#
# This store finder is not to be confused with the
# software-as-a-service store locator of the same name, from the
# same company, that is documented at https://storelocatorplus.com/
#
# To use this spider, specify a value for allowed_domains and the
# default path for the StoreLocatorPlus API endpoint will be used.
# In the event the endpoint is at a different path, you can
# alternatively specify its full URL in start_urls. Then specify either:
#
# PREFERRED
# 1. A list of ISO-3166 alpha-2 country codes as the
# iseadgg_countries_list parameter and a suitable non-zero
# search_radius value in kilometres.
#
# Example:
# iseadgg_countries_list = ["US", "CA"]
# search_radius = 100
#
# In this example, a 94km ISEADGG centroid grid will
# automatically be selected as the most appropriate to use
# against a 100km search radius accepted by the API endpoint.
#
# NONPREFERRED
# 2. A list of searchable_points_files = [x, y..] suitable for use
# with the point_locations function of locations.geo and a
# suitable non-zero search_radius value in kilometres.
#
# A non-zero value is also required for the max_results attribute.
# This number is set by the server and cannot be changed. To obtain
# this max_results value, observe an API call and check the JSON
# response: data_queries["standard_location_load"]["query"] to
# obtain the number from the SQL LIMIT clause.
#
# An exception will be raised if max_results (or more) locations
# are returned in any given radius search, as this indicates some
# locations have been truncated. If this occurs, search_radius
# needs to be reduced to ensure that max_results (or more) locations
# are never returned for any radius search.
#
# If clean-ups or additional field extraction are required from the
# source data, override the parse_item function. Two parameters are
# passed: item (an ATP "Feature" instance) and location (the dict
# returned in the store locator JSON response for a particular
# location).
class StoreLocatorPlusSelfSpider(Spider):
    """
    Base spider for the self-hosted Store Locator Plus WordPress plugin.

    One POST form request (action "csl_ajax_onload") is sent to the plugin's
    AJAX endpoint for every search point, and each entry of the "response"
    list in the returned JSON is converted into a Feature.

    Subclasses configure the crawl with class attributes:

    - allowed_domains: the first entry is used to build the default
      endpoint URL (https://<domain>/wp-admin/admin-ajax.php). If it is
      missing or empty, the first entry of start_urls is used as the
      endpoint URL instead.
    - iseadgg_countries_list (preferred) or searchable_points_files: the
      source of the search points.
    - search_radius: non-zero search radius in kilometres sent with every
      request.
    - max_results: non-zero number of results at which a single search is
      considered truncated.

    Override parse_item to clean up or extend the extracted features.
    """

    # Country codes handed to country_iseadgg_centroids (preferred method).
    iseadgg_countries_list: list[str] = []
    # Files handed one by one to point_locations (non-preferred method).
    searchable_points_files: list[str] = []
    # Radius (kilometres) sent as the "radius" form field; 0 means "not set".
    search_radius: int = 0
    # Result count at (or above) which parse() treats a search as truncated;
    # 0 means "not set".
    max_results: int = 0

    # Centroid grid radii (kilometres) this spider may request from
    # country_iseadgg_centroids, ordered largest first so that the first
    # value not exceeding search_radius is the coarsest usable grid.
    _ISEADGG_RADII = (458, 315, 158, 94, 79, 48, 24)

    def start_requests(self) -> Iterable[FormRequest]:
        """
        Yield one radius search request per search point.

        If the configuration is incomplete (no endpoint URL, a zero
        search_radius or max_results, or no source of search points) an
        error is logged and no requests are generated.

        Raises:
            RuntimeError: if the ISEADGG method is selected with a
                search_radius smaller than 24.
        """
        url = self._endpoint_url()
        if not url:
            self.logger.error("No API endpoint URL available: specify allowed_domains or start_urls.")
            return
        if self.search_radius == 0 or self.max_results == 0:
            self.logger.error("Both search_radius and max_results must be set to non-zero values.")
            return
        if not self.iseadgg_countries_list and not self.searchable_points_files:
            self.logger.error("Specify either iseadgg_countries_list or searchable_points_files.")
            return

        radius = str(self.search_radius)
        for lat, lon in self._search_points():
            formdata = {
                "action": "csl_ajax_onload",
                "lat": str(lat),
                "lng": str(lon),
                "radius": radius,
            }
            yield FormRequest(url=url, formdata=formdata, method="POST")

    def _endpoint_url(self) -> str:
        """Return the API endpoint URL, or an empty string if none is configured."""
        # getattr + truthiness guards against the attribute being absent,
        # None or an empty list, all of which would otherwise fail on [0].
        allowed_domains = getattr(self, "allowed_domains", None)
        if allowed_domains:
            return f"https://{allowed_domains[0]}/wp-admin/admin-ajax.php"
        start_urls = getattr(self, "start_urls", None)
        if start_urls:
            return start_urls[0]
        return ""

    def _search_points(self) -> Iterable[tuple]:
        """Yield the (lat, lon) pairs to search around, using the configured method."""
        if self.iseadgg_countries_list:
            # PREFERRED geographic radius search method using ISEADGG
            # centroids for a supplied list of ISO-3166 alpha-2 country codes.
            iseadgg_radius = next(
                (candidate for candidate in self._ISEADGG_RADII if self.search_radius >= candidate),
                None,
            )
            if iseadgg_radius is None:
                raise RuntimeError(
                    "A minimum search_radius of 24 (kilometres) is required to be used for the ISEADGG geographic radius search method."
                )
            yield from country_iseadgg_centroids(self.iseadgg_countries_list, iseadgg_radius)
            return

        # NONPREFERRED geographic radius search method using a manually
        # specified list of searchable_points_files containing centroids.
        for searchable_points_file in self.searchable_points_files:
            yield from point_locations(searchable_points_file)

    def parse(self, response: Response, **kwargs) -> Iterable[Feature]:
        """
        Convert the locations of one radius search response into features.

        Hit/miss counters and the largest result count seen are recorded
        in the crawler stats.

        Raises:
            RuntimeError: if max_results (or more) locations are returned,
                as the result list has then probably been truncated.
        """
        locations = response.json()["response"]
        location_count = len(locations)

        if self.max_results > 0 and location_count >= self.max_results:
            raise RuntimeError(
                "Locations have probably been truncated due to max_results (or more) locations being returned by a single geographic radius search. Use a smaller search_radius."
            )

        stats = self.crawler.stats
        if location_count > 0:
            stats.inc_value("atp/geo_search/hits")
        else:
            stats.inc_value("atp/geo_search/misses")
        stats.max_value("atp/geo_search/max_features_returned", location_count)

        allowed_domains = getattr(self, "allowed_domains", None)
        for location in locations:
            item = DictParser.parse(location)
            # The address parts are rebuilt as street_address below, so any
            # addr_full picked up by DictParser is discarded.
            item.pop("addr_full", None)
            item["street_address"] = ", ".join(filter(None, [location.get("address"), location.get("address2")]))
            # .get() tolerates a feature on which no website field was set.
            website = item.get("website")
            if website and website.startswith("/") and allowed_domains:
                # Site-relative link: make it absolute using the first domain.
                item["website"] = f"https://{allowed_domains[0]}{website}"
            # "or []" tolerates a parse_item override that returns None.
            yield from self.parse_item(item, location) or []

    def parse_item(self, item: Feature, location: dict) -> Iterable[Feature]:
        """
        Hook for subclasses to clean up or extend a feature.

        Args:
            item: the feature built from the location.
            location: the raw location dict from the JSON response.

        The default implementation yields the item unchanged.
        """
        yield item