Skip to content

Commit e8640ef

Browse files
authored
Merge pull request #125 from opencivicdata/hcg/tweaks
Don't retry 410, move event processing into reusable method
2 parents 851bb7a + 1ed635e commit e8640ef

File tree

3 files changed

+55
-33
lines changed

3 files changed

+55
-33
lines changed

.github/workflows/pythonapp.yml

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -8,11 +8,11 @@ jobs:
88
runs-on: ubuntu-latest
99

1010
steps:
11-
- uses: actions/checkout@v1
12-
- name: Set up Python 3.7
13-
uses: actions/setup-python@v1
11+
- uses: actions/checkout@v2
12+
- name: Set up Python
13+
uses: actions/setup-python@v2
1414
with:
15-
python-version: 3.7
15+
python-version: 3.x
1616
- name: Install dependencies
1717
run: |
1818
python -m pip install --upgrade pip

legistar/base.py

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -268,6 +268,11 @@ def sessionSecrets(self, page):
268268

269269
return(payload)
270270

271+
def accept_response(self, response, **kwargs):
272+
if response.status_code == 410:
273+
return True
274+
return super().accept_response(response, **kwargs)
275+
271276

272277
def fieldKey(x):
273278
field_id = x.attrib['id']
@@ -336,7 +341,8 @@ def search(self, route, item_key, search_conditions):
336341
except requests.HTTPError as e:
337342
if e.response.status_code == 400:
338343
raise ValueError(e.response.json()['Message'])
339-
raise
344+
if not self.accept_response(e.response):
345+
raise
340346

341347
def pages(self, url, params=None, item_key=None):
342348
if params is None:
@@ -359,9 +365,10 @@ def pages(self, url, params=None, item_key=None):
359365
page_num += 1
360366

361367
def accept_response(self, response, **kwargs):
362-
'''
368+
"""
363369
This overrides a method that controls whether
364370
the scraper should retry on an error. We don't
365-
want to retry if the API returns a 400
366-
'''
367-
return response.status_code < 401
371+
want to retry if the API returns a 400, except for
372+
410, which means the record no longer exists.
373+
"""
374+
return response.status_code < 401 or response.status_code == 410

legistar/events.py

Lines changed: 39 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -240,36 +240,44 @@ def api_events(self, since_datetime=None):
240240

241241
def events(self, since_datetime=None):
242242
for api_event in self.api_events(since_datetime=since_datetime):
243+
if event := self.event(api_event):
244+
yield event
243245

244-
time_str = api_event['EventTime']
245-
if not time_str: # If we don't have an event time, skip it
246-
continue
247-
248-
try:
249-
# Start times are entered manually. Sometimes, they don't
250-
# conform to this format. Log events with invalid start times,
251-
# but don't interrupt the scrape for them.
252-
start_time = time.strptime(time_str, self.time_string_format)
253-
except ValueError:
254-
event_url = '{0}/events/{1}'.format(self.BASE_URL, api_event['EventId'])
255-
self.logger.error('API event has invalid start time "{0}": {1}'.format(time_str, event_url))
256-
continue
246+
def event(self, api_event):
247+
time_str = api_event["EventTime"]
248+
if not time_str: # If we don't have an event time, skip it
249+
return
250+
try:
251+
# Start times are entered manually. Sometimes, they don't
252+
# conform to this format. Log events with invalid start times,
253+
# but don't interrupt the scrape for them.
254+
start_time = time.strptime(time_str, self.time_string_format)
255+
except ValueError:
256+
event_url = "{0}/events/{1}".format(self.BASE_URL, api_event["EventId"])
257+
self.logger.error(
258+
'API event has invalid start time "{0}": {1}'.format(
259+
time_str, event_url
260+
)
261+
)
262+
return
257263

258-
start = self.toTime(api_event['EventDate'])
259-
api_event['start'] = start.replace(hour=start_time.tm_hour,
260-
minute=start_time.tm_min)
264+
start = self.toTime(api_event["EventDate"])
265+
api_event["start"] = start.replace(
266+
hour=start_time.tm_hour, minute=start_time.tm_min
267+
)
261268

262-
api_event['status'] = self._event_status(api_event)
269+
api_event["status"] = self._event_status(api_event)
263270

264-
web_event = self._get_web_event(api_event)
271+
web_event = self._get_web_event(api_event)
265272

266-
if web_event:
267-
yield api_event, web_event
273+
if web_event:
274+
return api_event, web_event
268275

269-
else:
270-
event_url = '{0}/events/{1}'.format(self.BASE_URL, api_event['EventId'])
271-
self.warning('API event could not be found in web interface: {0}'.format(event_url))
272-
continue
276+
else:
277+
event_url = "{0}/events/{1}".format(self.BASE_URL, api_event["EventId"])
278+
self.warning(
279+
"API event could not be found in web interface: {0}".format(event_url)
280+
)
273281

274282
def agenda(self, event):
275283
agenda_url = (self.BASE_URL +
@@ -378,6 +386,13 @@ def web_detail(self, event):
378386
except scrapelib.HTTPError as e:
379387
if e.response.status_code == 410:
380388
return None
389+
elif e.response.status_code == 503:
390+
# Events with draft agendas sometimes have an EventInSiteURL
391+
# that resolves to a 503 status code
392+
self.logger.error(
393+
f"Error while fetching event detail at {insite_url}: {e}"
394+
)
395+
return None
381396
else:
382397
raise
383398

0 commit comments

Comments (0)