Skip to content

Commit 4b3a3e8

Browse files
authored
Merge pull request #173 from ehinman/add-samples
Add a `get_usgs_samples()` function to dataretrieval-python.
2 parents 3ba0c83 + 3dfff7b commit 4b3a3e8

File tree

7 files changed

+536
-0
lines changed

7 files changed

+536
-0
lines changed

dataretrieval/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
from dataretrieval.nadp import *
44
from dataretrieval.nwis import *
5+
from dataretrieval.samples import *
56
from dataretrieval.streamstats import *
67
from dataretrieval.utils import *
78
from dataretrieval.waterwatch import *

dataretrieval/samples.py

Lines changed: 351 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,351 @@
1+
"""Functions for downloading data from the USGS Aquarius Samples database
2+
(https://waterdata.usgs.gov/download-samples/).
3+
4+
See https://api.waterdata.usgs.gov/samples-data/docs#/ for API reference
5+
"""
6+
7+
from __future__ import annotations
8+
9+
import json
10+
from io import StringIO
11+
from typing import TYPE_CHECKING, Literal, get_args
12+
13+
import pandas as pd
14+
import requests
15+
from requests.models import PreparedRequest
16+
17+
from dataretrieval.utils import BaseMetadata, to_str
18+
19+
if TYPE_CHECKING:
20+
from typing import Optional, Tuple, Union
21+
22+
from pandas import DataFrame
23+
24+
25+
# Root URL of the USGS Samples web services; every request URL built in this
# module starts with it.
_BASE_URL = "https://api.waterdata.usgs.gov/samples-data"

# Code-service names accepted by ``get_codes``.
_CODE_SERVICES = Literal[
    "characteristicgroup",
    "characteristics",
    "counties",
    "countries",
    "observedproperty",
    "samplemedia",
    "sitetype",
    "states",
]


# Service names accepted by ``get_usgs_samples``.
_SERVICES = Literal["activities", "locations", "organizations", "projects", "results"]

# Every profile name, regardless of the service it belongs to.
_PROFILES = Literal[
    "actgroup",
    "actmetric",
    "basicbio",
    "basicphyschem",
    "count",
    "fullbio",
    "fullphyschem",
    "labsampleprep",
    "narrow",
    "organization",
    "project",
    "projectmonitoringlocationweight",
    "resultdetectionquantitationlimit",
    "sampact",
    "site",
]

# Profiles that are valid for each service; consulted by ``_check_profiles``.
_PROFILE_LOOKUP = {
    "activities": ["sampact", "actmetric", "actgroup", "count"],
    "locations": ["site", "count"],
    "organizations": ["organization", "count"],
    "projects": ["project", "projectmonitoringlocationweight"],
    "results": [
        "fullphyschem",
        "basicphyschem",
        "fullbio",
        "basicbio",
        "narrow",
        "resultdetectionquantitationlimit",
        "labsampleprep",
        "count",
    ],
}
75+
76+
77+
def get_codes(code_service: _CODE_SERVICES) -> DataFrame:
    """Fetch the reference codes published by one Samples code service.

    Parameters
    ----------
    code_service : string
        Name of the code service to query. One of "states", "counties",
        "countries", "sitetype", "samplemedia", "characteristicgroup",
        "characteristics", or "observedproperty".

    Returns
    -------
    df : ``pandas.DataFrame``
        Table built from the ``data`` entry of the service's JSON response.

    Raises
    ------
    ValueError
        If ``code_service`` is not one of the supported code services.
    requests.HTTPError
        If the service answers with an error status code.
    """
    allowed = get_args(_CODE_SERVICES)
    if code_service not in allowed:
        raise ValueError(
            f"Invalid code service: '{code_service}'. "
            f"Valid options are: {allowed}."
        )

    reply = requests.get(
        f"{_BASE_URL}/codeservice/{code_service}?mimeType=application%2Fjson"
    )
    reply.raise_for_status()

    # The payload is a JSON object whose "data" entry holds the code records.
    payload = json.loads(reply.text)
    return pd.DataFrame(payload["data"])
106+
107+
def get_usgs_samples(
    ssl_check: bool = True,
    service: _SERVICES = "results",
    profile: _PROFILES = "fullphyschem",
    activityMediaName: Optional[Union[str, list[str]]] = None,
    activityStartDateLower: Optional[str] = None,
    activityStartDateUpper: Optional[str] = None,
    activityTypeCode: Optional[Union[str, list[str]]] = None,
    characteristicGroup: Optional[Union[str, list[str]]] = None,
    characteristic: Optional[Union[str, list[str]]] = None,
    characteristicUserSupplied: Optional[Union[str, list[str]]] = None,
    boundingBox: Optional[list[float]] = None,
    countryFips: Optional[Union[str, list[str]]] = None,
    stateFips: Optional[Union[str, list[str]]] = None,
    countyFips: Optional[Union[str, list[str]]] = None,
    siteTypeCode: Optional[Union[str, list[str]]] = None,
    siteTypeName: Optional[Union[str, list[str]]] = None,
    usgsPCode: Optional[Union[str, list[str]]] = None,
    hydrologicUnit: Optional[Union[str, list[str]]] = None,
    monitoringLocationIdentifier: Optional[Union[str, list[str]]] = None,
    organizationIdentifier: Optional[Union[str, list[str]]] = None,
    pointLocationLatitude: Optional[float] = None,
    pointLocationLongitude: Optional[float] = None,
    pointLocationWithinMiles: Optional[float] = None,
    projectIdentifier: Optional[Union[str, list[str]]] = None,
    recordIdentifierUserSupplied: Optional[Union[str, list[str]]] = None,
) -> Tuple[DataFrame, BaseMetadata]:
    """Search Samples database for USGS water quality data.

    This is a wrapper function for the Samples database API. All potential
    filters are provided as arguments to the function, but please do not
    populate all possible filters; leave as many as feasible with their default
    value (None). This is important because overcomplicated web service queries
    can bog down the database's ability to return an applicable dataset before
    it times out.

    The web GUI for the Samples database can be found here:
    https://waterdata.usgs.gov/download-samples/#dataProfile=site

    If you would like more details on feasible query parameters (complete with
    examples), please visit the Samples database swagger docs, here:
    https://api.waterdata.usgs.gov/samples-data/docs#/

    Parameters
    ----------
    ssl_check : bool, optional
        Check the SSL certificate. Defaults to True.
    service : string
        One of the available Samples services: "results", "locations",
        "activities", "projects", or "organizations". Defaults to "results".
    profile : string
        One of the available profiles associated with a service. Defaults to
        "fullphyschem". Options for each service are:
        results - "fullphyschem", "basicphyschem",
                    "fullbio", "basicbio", "narrow",
                    "resultdetectionquantitationlimit",
                    "labsampleprep", "count"
        locations - "site", "count"
        activities - "sampact", "actmetric",
                        "actgroup", "count"
        projects - "project", "projectmonitoringlocationweight"
        organizations - "organization", "count"
    activityMediaName : string or list of strings, optional
        Name or code indicating environmental medium in which sample was taken.
        Call ``get_codes("samplemedia")`` in this module to list possible
        inputs.
        Example: "Water".
    activityStartDateLower : string, optional
        The start date if using a date range. Takes the format YYYY-MM-DD.
        The logic is inclusive, i.e. it will also return results that
        match the date. If left as None, will pull all data on or before
        activityStartDateUpper, if populated.
    activityStartDateUpper : string, optional
        The end date if using a date range. Takes the format YYYY-MM-DD.
        The logic is inclusive, i.e. it will also return results that
        match the date. If left as None, will pull all data on or after
        activityStartDateLower up to the most recent available results.
    activityTypeCode : string or list of strings, optional
        Text code that describes type of field activity performed.
        Example: "Sample-Routine, regular".
    characteristicGroup : string or list of strings, optional
        Characteristic group is a broad category of characteristics
        describing one or more results. Call
        ``get_codes("characteristicgroup")`` in this module to list possible
        inputs.
        Example: "Organics, PFAS"
    characteristic : string or list of strings, optional
        Characteristic is a specific category describing one or more results.
        Call ``get_codes("characteristics")`` in this module to list possible
        inputs.
        Example: "Suspended Sediment Discharge"
    characteristicUserSupplied : string or list of strings, optional
        A user supplied characteristic name describing one or more results.
    boundingBox : list of four floats, optional
        Filters on the associated monitoring location's point location
        by checking if it is located within the specified geographic area.
        The logic is inclusive, i.e. it will include locations that overlap
        with the edge of the bounding box. Values are expressed in decimal
        degrees, NAD83, and longitudes west of Greenwich are negative.
        The list is joined into a comma-separated string before it is sent.
        The list consists of, in order:
        - Western-most longitude
        - Southern-most latitude
        - Eastern-most longitude
        - Northern-most latitude
        Example: [-92.8,44.2,-88.9,46.0]
    countryFips : string or list of strings, optional
        Call ``get_codes("countries")`` in this module to list possible
        inputs.
        Example: "US" (United States)
    stateFips : string or list of strings, optional
        Call ``get_codes("states")`` in this module to list possible inputs.
        Example: "US:15" (United States: Hawaii)
    countyFips : string or list of strings, optional
        Call ``get_codes("counties")`` in this module to list possible inputs.
        Example: "US:15:001" (United States: Hawaii, Hawaii County)
    siteTypeCode : string or list of strings, optional
        An abbreviation for a certain site type. Call
        ``get_codes("sitetype")`` in this module to list possible inputs.
        Example: "GW" (Groundwater site)
    siteTypeName : string or list of strings, optional
        A full name for a certain site type. Call ``get_codes("sitetype")``
        in this module to list possible inputs.
        Example: "Well"
    usgsPCode : string or list of strings, optional
        5-digit number used in the US Geological Survey computerized
        data system, National Water Information System (NWIS), to
        uniquely identify a specific constituent. Call
        ``get_codes("characteristics")`` in this module to list possible
        inputs.
        Example: "00060" (Discharge, cubic feet per second)
    hydrologicUnit : string or list of strings, optional
        Max 12-digit number used to describe a hydrologic unit.
        Example: "070900020502"
    monitoringLocationIdentifier : string or list of strings, optional
        A monitoring location identifier has two parts: the agency code
        and the location number, separated by a dash (-).
        Example: "USGS-040851385"
    organizationIdentifier : string or list of strings, optional
        Designator used to uniquely identify a specific organization.
        Currently only accepting the organization "USGS".
    pointLocationLatitude : float, optional
        Latitude for a point/radius query (decimal degrees). Must be used
        with pointLocationLongitude and pointLocationWithinMiles.
    pointLocationLongitude : float, optional
        Longitude for a point/radius query (decimal degrees). Must be used
        with pointLocationLatitude and pointLocationWithinMiles.
    pointLocationWithinMiles : float, optional
        Radius for a point/radius query. Must be used with
        pointLocationLatitude and pointLocationLongitude.
    projectIdentifier : string or list of strings, optional
        Designator used to uniquely identify a data collection project. Project
        identifiers are specific to an organization (e.g. USGS).
        Example: "ZH003QW03"
    recordIdentifierUserSupplied : string or list of strings, optional
        Internal AQS record identifier that returns 1 entry. Only available
        for the "results" service.

    Returns
    -------
    df : ``pandas.DataFrame``
        Formatted data returned from the API query.
    md : :obj:`dataretrieval.utils.BaseMetadata`
        Custom ``dataretrieval`` metadata object pertaining to the query.

    Raises
    ------
    ValueError
        If ``service`` is not a valid service, or ``profile`` is not valid
        for the chosen service.
    requests.HTTPError
        If the API answers with an error status code.

    Examples
    --------
    .. code::

        >>> # Get PFAS results within a bounding box
        >>> df, md = dataretrieval.samples.get_usgs_samples(
        ...     boundingBox=[-90.2,42.6,-88.7,43.2],
        ...     characteristicGroup="Organics, PFAS"
        ... )

        >>> # Get all activities for the Commonwealth of Virginia over a date range
        >>> df, md = dataretrieval.samples.get_usgs_samples(
        ...     service="activities",
        ...     profile="sampact",
        ...     activityStartDateLower="2023-10-01",
        ...     activityStartDateUpper="2024-01-01",
        ...     stateFips="US:51")

        >>> # Get all pH samples for two sites in Utah
        >>> df, md = dataretrieval.samples.get_usgs_samples(
        ...     monitoringLocationIdentifier=[
        ...         'USGS-393147111462301',
        ...         'USGS-393343111454101',
        ...     ],
        ...     usgsPCode='00400')

    """

    _check_profiles(service, profile)

    # Spell the filters out explicitly instead of harvesting ``locals()``:
    # the query no longer depends on which local names happen to exist at
    # this point in the function. Order follows the signature so the
    # generated URL is unchanged.
    filters = {
        "activityMediaName": activityMediaName,
        "activityStartDateLower": activityStartDateLower,
        "activityStartDateUpper": activityStartDateUpper,
        "activityTypeCode": activityTypeCode,
        "characteristicGroup": characteristicGroup,
        "characteristic": characteristic,
        "characteristicUserSupplied": characteristicUserSupplied,
        "boundingBox": boundingBox,
        "countryFips": countryFips,
        "stateFips": stateFips,
        "countyFips": countyFips,
        "siteTypeCode": siteTypeCode,
        "siteTypeName": siteTypeName,
        "usgsPCode": usgsPCode,
        "hydrologicUnit": hydrologicUnit,
        "monitoringLocationIdentifier": monitoringLocationIdentifier,
        "organizationIdentifier": organizationIdentifier,
        "pointLocationLatitude": pointLocationLatitude,
        "pointLocationLongitude": pointLocationLongitude,
        "pointLocationWithinMiles": pointLocationWithinMiles,
        "projectIdentifier": projectIdentifier,
        "recordIdentifierUserSupplied": recordIdentifierUserSupplied,
    }
    # Only filters the caller actually set are sent to the API.
    params = {name: value for name, value in filters.items() if value is not None}

    # The bounding box travels as a single comma-separated string.
    if "boundingBox" in params:
        params["boundingBox"] = to_str(params["boundingBox"])

    # Always request CSV; the response is parsed with ``pandas.read_csv``.
    params["mimeType"] = "text/csv"

    url = f"{_BASE_URL}/{service}/{profile}"

    # Echo the fully encoded request URL so the query can be inspected or
    # reproduced outside of Python.
    req = PreparedRequest()
    req.prepare_url(url, params=params)
    print(f"Request: {req.url}")

    response = requests.get(url, params=params, verify=ssl_check)
    response.raise_for_status()

    df = pd.read_csv(StringIO(response.text), delimiter=",")

    return df, BaseMetadata(response)
322+
323+
def _check_profiles(
    service: _SERVICES,
    profile: _PROFILES,
) -> None:
    """Validate a service name and the profile requested for it.

    Parameters
    ----------
    service : string
        One of the service names listed in ``_SERVICES``.
    profile : string
        One of the profile names that ``_PROFILE_LOOKUP`` lists for
        ``service``.

    Raises
    ------
    ValueError
        If ``service`` is not a known service, or if ``profile`` is not
        among the profiles available for that service.
    """
    known_services = get_args(_SERVICES)
    if service not in known_services:
        raise ValueError(
            f"Invalid service: '{service}'. "
            f"Valid options are: {known_services}."
        )

    # The service is known at this point, so the lookup cannot miss.
    allowed_profiles = _PROFILE_LOOKUP[service]
    if profile in allowed_profiles:
        return

    raise ValueError(
        f"Invalid profile: '{profile}' for service '{service}'. "
        f"Valid options are: {allowed_profiles}."
    )
351+

docs/source/reference/index.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ API reference
99

1010
nadp
1111
nwis
12+
samples
1213
streamstats
1314
utils
1415
wqp

docs/source/reference/samples.rst

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
.. _samples:
2+
3+
dataretrieval.samples
4+
-------------------------
5+
6+
.. automodule:: dataretrieval.samples
7+
:members:
8+
:special-members:

docs/source/userguide/dataportals.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,8 @@ provided below.
1717
+-----------------------------------+---------------------------------------------------------------+
1818
| Mercury Deposition Network | https://nadp.slh.wisc.edu/networks/mercury-deposition-network |
1919
+-----------------------------------+---------------------------------------------------------------+
20+
| USGS Samples | https://waterdata.usgs.gov/download-samples/ |
21+
+-----------------------------------+---------------------------------------------------------------+
2022
| Streamstats | https://streamstats.usgs.gov |
2123
+-----------------------------------+---------------------------------------------------------------+
2224
| Water Quality Portal | https://waterqualitydata.us |

0 commit comments

Comments
 (0)