Skip to content

Commit 816cbb7

Browse files
authored
Census vendoring (#1690)
* remove censusgeocode requirement * add test_geocoder init file * exclude vendor folders from linting * update test_census_geocoder * update expected response to test_coordinates live test * vendor censusgeocoder * don't run bandit on vendored code * improved glob for skipping vendored code with ruff * use import censusgeocode from vendored code * add FutureWarning when importing CensusGeocode * update import to resolve pdi import test failure (seems totally unrelated?) * vendor censusgeocode code directly since we already require requests
1 parent f247c4e commit 816cbb7

File tree

14 files changed

+1249
-98
lines changed

14 files changed

+1249
-98
lines changed

.github/workflows/python-checks.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -195,7 +195,7 @@ jobs:
195195

196196
- id: run-bandit-sarif
197197
run: |
198-
bandit --confidence-level 'medium' --severity-level 'medium' --recursive 'parsons' --format 'sarif' --output 'results.sarif'
198+
bandit --confidence-level 'medium' --severity-level 'medium' --recursive 'parsons' --exclude '**/vendor/*' --format 'sarif' --output 'results.sarif'
199199
200200
- uses: github/codeql-action/upload-sarif@cdefb33c0f6224e58673d9004f47f7cb3e328b89
201201
if: ( success() || failure() ) && contains('["success", "failure"]', steps.run-bandit-sarif.outcome)
@@ -205,7 +205,7 @@ jobs:
205205
- id: run-bandit
206206
if: failure() && contains('["failure"]', steps.run-bandit-sarif.outcome)
207207
run: |
208-
bandit --confidence-level 'medium' --severity-level 'medium' --recursive 'parsons'
208+
bandit --confidence-level 'medium' --severity-level 'medium' --recursive 'parsons' --exclude '**/vendor/*'
209209
210210
coverage:
211211
runs-on: ubuntu-latest

.pre-commit-config.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,5 +9,5 @@ repos:
99
rev: 1.8.6
1010
hooks:
1111
- id: bandit
12-
args: ['--confidence-level', 'medium', '--severity-level', 'medium']
12+
args: ['--confidence-level', 'medium', '--severity-level', 'medium', '--exclude', '**/vendor/*']
1313
files: '^parsons'

parsons/geocode/census_geocoder.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,11 @@
11
import logging
22

3-
import censusgeocode
43
import petl
54

65
from parsons.etl import Table
76

7+
from .vendor import censusgeocode
8+
89
logger = logging.getLogger(__name__)
910

1011

parsons/geocode/vendor/__init__.py

Whitespace-only changes.

parsons/geocode/vendor/censusgeocode/LICENSE

Lines changed: 621 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
# -*- coding: utf-8 -*-
2+
3+
# This file is part of censusgeocode.
4+
# https://github.com/fitnr/censusgeocode
5+
6+
# Licensed under the General Public License (version 3)
7+
# http://opensource.org/licenses/LGPL-3.0
8+
# Copyright (c) 2015-9, Neil Freeman <contact@fakeisthenewreal.org>
9+
10+
from .censusgeocode import CensusGeocode
11+
12+
__version__ = '0.5.3'
13+
14+
cg = CensusGeocode()
15+
16+
coordinates = cg.coordinates
17+
address = cg.address
18+
onelineaddress = cg.onelineaddress
19+
addressbatch = cg.addressbatch
Lines changed: 310 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,310 @@
1+
"""
2+
Census Geocoder wrapper
3+
For details on the API, see:
4+
http://geocoding.geo.census.gov/geocoder/Geocoding_Services_API.pdf
5+
"""
6+
7+
# Copyright (C) 2015-9 Neil Freeman
8+
9+
# This program is free software: you can redistribute it and/or modify
10+
# it under the terms of the GNU General Public License as published by
11+
# the Free Software Foundation, either version 3 of the License, or
12+
# (at your option) any later version.
13+
14+
# This program is distributed in the hope that it will be useful,
15+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
16+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17+
# GNU General Public License for more details.
18+
19+
# You should have received a copy of the GNU General Public License
20+
# along with this program. If not, see <http://www.gnu.org/licenses/>.
21+
22+
# noqa
23+
24+
import csv
25+
import io
26+
import warnings
27+
28+
import requests
29+
from requests.exceptions import RequestException
30+
from requests_toolbelt.multipart.encoder import MultipartEncoder
31+
32+
33+
DEFAULT_BENCHMARK = "Public_AR_Current"
34+
DEFAULT_VINTAGE = "Current_Current"
35+
36+
37+
class CensusGeocode:
    """Fetch results from the Census Geocoder.

    Wraps the single-record endpoints (``coordinates``, ``address``,
    ``onelineaddress``) and the bulk ``addressbatch`` endpoint. Every request
    is sent with the instance's benchmark and vintage.
    """

    # URL template; ``returntype`` is one of ``returntypes`` and ``searchtype``
    # is the endpoint name (e.g. "address", "addressbatch").
    _url = "https://geocoding.geo.census.gov/geocoder/{returntype}/{searchtype}"
    returntypes = ["geographies", "locations"]

    # Column names applied to the header-less CSV returned by the batch
    # endpoint, keyed by return type.
    batchfields = {
        "locations": [
            "id",
            "address",
            "match",
            "matchtype",
            "parsed",
            "coordinate",
            "tigerlineid",
            "side",
        ],
        "geographies": [
            "id",
            "address",
            "match",
            "matchtype",
            "parsed",
            "coordinate",
            "tigerlineid",
            "side",
            "statefp",
            "countyfp",
            "tract",
            "block",
        ],
    }

    def __init__(self, benchmark=None, vintage=None):
        """
        Arguments:
            benchmark (str): A name that references the version of the locator to use.
                See https://geocoding.geo.census.gov/geocoder/benchmarks
            vintage (str): The geography part of the desired vintage.
                See: https://geocoding.geo.census.gov/geocoder/vintages?form

        Falsy values (including the default ``None``) fall back to the
        module-level ``DEFAULT_BENCHMARK`` / ``DEFAULT_VINTAGE``.

        >>> CensusGeocode(benchmark='Public_AR_Current', vintage='Current_Current')
        """
        self._benchmark = benchmark or DEFAULT_BENCHMARK
        self._vintage = vintage or DEFAULT_VINTAGE

    def _geturl(self, searchtype, returntype=None):
        """Construct an URL for the geocoder.

        Arguments:
            searchtype (str): Endpoint name substituted into the URL template.
            returntype (str): Return type substituted into the URL template;
                a falsy value selects the first entry of ``returntypes``.

        Returns:
            str: The formatted endpoint URL.
        """
        returntype = returntype or self.returntypes[0]
        return self._url.format(returntype=returntype, searchtype=searchtype)

    def _fetch(self, searchtype, fields, **kwargs):
        """Fetch a response from the Geocoding API.

        Arguments:
            searchtype (str): Endpoint name, passed to ``_geturl``.
            fields (dict): Query parameters specific to the search type.
            **kwargs: ``returntype`` (default "geographies"), ``layers`` and
                ``timeout`` are honoured; other keys are ignored.

        Returns:
            AddressResult when the response's "result" has "addressMatches",
            otherwise GeographyResult when it has "geographies".

        Raises:
            ValueError: if the response body cannot be decoded or contains
                neither expected key.
            requests.exceptions.RequestException: propagated unchanged for
                request failures that are not also ValueErrors.
        """
        # Work on a copy so the caller's dict is not modified.
        params = dict(fields)
        params["vintage"] = self.vintage
        params["benchmark"] = self.benchmark
        params["format"] = "json"

        if "layers" in kwargs:
            params["layers"] = kwargs["layers"]

        returntype = kwargs.get("returntype", "geographies")
        url = self._geturl(searchtype, returntype)

        try:
            with requests.get(url, params=params, timeout=kwargs.get("timeout")) as r:
                content = r.json()
                result = content.get("result", {})

                if "addressMatches" in result:
                    return AddressResult(content)

                if "geographies" in result:
                    return GeographyResult(content)

        except (ValueError, KeyError) as err:
            # Keep the public message, but chain the cause for debugging.
            raise ValueError("Unable to parse response from Census") from err

        # Valid JSON that carries neither kind of result.
        raise ValueError("Unable to parse response from Census")

    def coordinates(self, x, y, **kwargs):
        """Geocode a (lon, lat) coordinate.

        The return type is always forced to "geographies", overriding any
        ``returntype`` supplied in ``kwargs``.
        """
        kwargs["returntype"] = "geographies"
        fields = {"x": x, "y": y}

        return self._fetch("coordinates", fields, **kwargs)

    def address(self, street, city=None, state=None, **kwargs):
        """Geocode an address.

        The ZIP code is read from the ``zip`` keyword argument, falling back
        to ``zipcode`` when ``zip`` is missing or falsy.
        """
        fields = {
            "street": street,
            "city": city,
            "state": state,
            "zip": kwargs.get("zip") or kwargs.get("zipcode"),
        }

        return self._fetch("address", fields, **kwargs)

    def onelineaddress(self, address, **kwargs):
        """
        Geocode an address passed as one string.

        e.g. "4600 Silver Hill Rd, Suitland, MD 20746"
        """
        fields = {
            "address": address,
        }

        return self._fetch("onelineaddress", fields, **kwargs)

    def set_benchmark(self, benchmark):
        """
        Set the Census Geocoding API benchmark the class will use.

        See: https://geocoding.geo.census.gov/geocoder/benchmarks
        """
        self._benchmark = benchmark

    @property
    def benchmark(self):
        """
        Give the Census Geocoding API benchmark the class is using.

        See: https://geocoding.geo.census.gov/geocoder/benchmarks
        """
        return self._benchmark

    def set_vintage(self, vintage):
        """
        Set the Census Geocoding API vintage the class will use.

        See: https://geocoding.geo.census.gov/geocoder/vintages?form
        """
        self._vintage = vintage

    @property
    def vintage(self):
        """
        Give the Census Geocoding API vintage the class is using.

        See: https://geocoding.geo.census.gov/geocoder/vintages?form
        """
        return self._vintage

    def _parse_batch_result(self, data, returntype):
        """Parse the batch address results returned from the Census Geocoding API.

        Arguments:
            data (str): Header-less CSV text.
            returntype (str): Key into ``batchfields`` selecting column names.

        Returns:
            list[dict]: One dict per CSV row. ``coordinate`` is replaced by
            float ``lon``/``lat`` (``None`` when absent or unparseable) and
            ``match`` becomes a bool.

        Raises:
            ValueError: if ``returntype`` is not a key of ``batchfields``.
        """
        try:
            fieldnames = self.batchfields[returntype]
        except KeyError as err:
            raise ValueError(f"unknown returntype: {returntype}") from err

        def parse(row):
            row["lat"], row["lon"] = None, None

            if row["coordinate"]:
                try:
                    # The API gives "lon,lat". A wrong element count or a
                    # non-numeric part raises ValueError and leaves both None.
                    row["lon"], row["lat"] = tuple(float(a) for a in row["coordinate"].split(","))
                except ValueError:
                    pass

            del row["coordinate"]
            row["match"] = row["match"] == "Match"
            return row

        # return as list of dicts
        with io.StringIO(data) as f:
            reader = csv.DictReader(f, fieldnames=fieldnames)
            return [parse(row) for row in reader]

    def _post_batch(self, data=None, f=None, **kwargs):
        """Send batch address file to the Census Geocoding API.

        Arguments:
            data: Iterable of dicts with keys id, street, city, state, zip.
                A missing ``id`` is filled in on the row itself with the
                1-based row position.
            f: File-like object holding an already-built CSV; used only when
                ``data`` is falsy. It is closed before this method returns.
            **kwargs: ``returntype`` (default "geographies") and ``timeout``.

        Returns:
            list[dict]: Parsed rows, see ``_parse_batch_result``.

        Raises:
            ValueError: if ``data`` is falsy and ``f`` is None.
            requests.exceptions.RequestException: propagated unchanged.
        """
        returntype = kwargs.get("returntype", "geographies")
        url = self._geturl("addressbatch", returntype)

        if data:
            # Serialise the rows into an in-memory CSV for upload.
            f = io.StringIO()
            writer = csv.DictWriter(f, fieldnames=["id", "street", "city", "state", "zip"])
            for i, row in enumerate(data, 1):
                row.setdefault("id", i)
                writer.writerow(row)
                if i == 10001:
                    warnings.warn("Sending more than 10,000 records, the upper limit for the Census Geocoder. Request will likely fail")

            f.seek(0)

        elif f is None:
            raise ValueError("Need either data or a file for CensusGeocode.addressbatch")

        try:
            form = MultipartEncoder(
                fields={
                    "vintage": self.vintage,
                    "benchmark": self.benchmark,
                    "addressFile": ("batch.csv", f, "text/plain"),
                }
            )
            headers = {"Content-Type": form.content_type}

            with requests.post(url, data=form, timeout=kwargs.get("timeout"), headers=headers) as r:
                # return as list of dicts
                return self._parse_batch_result(r.text, returntype)

        finally:
            # Closes the upload buffer, including a caller-supplied handle.
            f.close()

    def addressbatch(self, data, **kwargs):
        """
        Send either a CSV file or data to the addressbatch API.

        According to the Census, "there is currently an upper limit of 10,000 records per batch file."

        If a file, can either be a file-like with a `read()` method, or a `str` that's a path to the
        file. Either way, it must have no header and have fields id,street,city,state,zip

        If data, should be an iterable of dicts with the above fields (although ID is optional).
        """
        # Does data quack like a file handle?
        if hasattr(data, "read"):
            return self._post_batch(f=data, **kwargs)

        # If it is a string, assume it's a filename
        if isinstance(data, str):
            with open(data, "rb") as f:
                return self._post_batch(f=f, **kwargs)

        # Otherwise, assume an iterable of dicts
        return self._post_batch(data=data, **kwargs)
284+
class GeographyResult(dict):
    """Wrapper for geography objects returned by the Census Geocoding API.

    Behaves as the ``result["geographies"]`` mapping of the response. The
    echoed request is available as ``.input`` (``{}`` when the response has
    none). Every geography entry gains two derived keys, ``CENT`` and
    ``INTPT``, each a ``(lon, lat)`` tuple of floats, or ``()`` when the
    source values are missing or not numeric.
    """

    def __init__(self, data):
        """
        Arguments:
            data (dict): Decoded API response; must contain
                ``data["result"]["geographies"]``.

        Raises:
            KeyError: if "result" or "geographies" is absent.
        """
        self.input = data["result"].get("input", {})
        super().__init__(data["result"]["geographies"])

        # create float coordinate tuples
        for geolist in self.values():
            for geo in geolist:
                geo["CENT"] = self._point(geo, "CENTLON", "CENTLAT")
                geo["INTPT"] = self._point(geo, "INTPTLON", "INTPTLAT")

    @staticmethod
    def _point(geo, lon_key, lat_key):
        """Return ``(lon, lat)`` as floats from ``geo``, or ``()`` if unavailable."""
        try:
            return float(geo[lon_key]), float(geo[lat_key])
        except (KeyError, TypeError, ValueError):
            # A missing key or null/non-numeric value should not invalidate
            # the whole response; report "no point" for this entry instead.
            return ()
303+
304+
305+
class AddressResult(list):
    """List of address matches taken from a Census Geocoding API response.

    The list items are the entries of ``result["addressMatches"]``; the
    request echoed back by the API is kept on ``.input`` (``{}`` if absent).
    """

    def __init__(self, data):
        result = data["result"]
        self.input = result.get("input", {})
        matches = result["addressMatches"]
        super().__init__(matches)

pyproject.toml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,7 @@ exclude = [
7474
"node_modules",
7575
"site-packages",
7676
"venv",
77+
"**/vendor/*",
7778
]
7879

7980
# Default line length is 88
@@ -174,6 +175,9 @@ src_paths = [
174175
[tool.coverage.run]
175176
branch = true
176177
relative_files = true
178+
omit = [
179+
"*/vendor/*",
180+
]
177181

178182
[tool.coverage.paths]
179183
source = [

requirements.txt

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@ boto3>=1.17.98
33
boxsdk==4.1.0
44
braintree==4.41.0
55
bs4==0.0.2
6-
censusgeocode@git+https://github.com/fitnr/censusgeocode.git@1824f5d558ff6378dc4359b44c9cf535a2ba205f
76
civis==1.16.1;python_version<"3.10" # later Civis versions do not support Python 3.9
87
civis==2.4.3;python_version>="3.10"
98
curlify==3.0.0

0 commit comments

Comments
 (0)