Skip to content

Commit 4eba0a2

Browse files
authored
129 data downloading issue with a specific gauging station (#131)
* Improve JSON parsing #129
1 parent fb6521a commit 4eba0a2

File tree

2 files changed

+79
-101
lines changed

2 files changed

+79
-101
lines changed

dataretrieval/nwis.py

Lines changed: 79 additions & 57 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919
import pandas as pd
2020
import requests
2121

22-
from dataretrieval.utils import BaseMetadata, format_datetime, to_str, update_merge
22+
from dataretrieval.utils import BaseMetadata, format_datetime, to_str
2323

2424
from .utils import query
2525

@@ -834,6 +834,7 @@ def get_iv(
834834
response = query_waterservices(
835835
service='iv', format='json', ssl_check=ssl_check, **kwargs
836836
)
837+
837838
df = _read_json(response.json())
838839
return format_response(df, **kwargs), NWIS_Metadata(response, **kwargs)
839840

@@ -1304,67 +1305,88 @@ def _read_json(json):
13041305
A custom metadata object
13051306
13061307
"""
1307-
merged_df = pd.DataFrame()
1308-
1309-
for timeseries in json['value']['timeSeries']:
1310-
site_no = timeseries['sourceInfo']['siteCode'][0]['value']
1311-
param_cd = timeseries['variable']['variableCode'][0]['value']
1312-
# check whether min, max, mean record XXX
1313-
option = timeseries['variable']['options']['option'][0].get('value')
1314-
1315-
# loop through each parameter in timeseries.
1316-
for parameter in timeseries['values']:
1317-
col_name = param_cd
1318-
method = parameter['method'][0]['methodDescription']
1319-
1320-
# if len(timeseries['values']) > 1 and method:
1321-
if method:
1322-
# get method, format it, and append to column name
1323-
method = method.strip('[]()').lower()
1324-
col_name = f'{col_name}_{method}'
1325-
1326-
if option:
1327-
col_name = f'{col_name}_{option}'
1328-
1329-
record_json = parameter['value']
1330-
1331-
if not record_json:
1332-
# no data in record
1333-
continue
1334-
# should be able to avoid this by dumping
1335-
record_json = str(record_json).replace("'", '"')
1336-
1337-
# read json, converting all values to float64 and all qualifiers
1338-
# Lists can't be hashed, thus we cannot df.merge on a list column
1339-
record_df = pd.read_json(
1340-
StringIO(record_json),
1341-
orient='records',
1342-
dtype={'value': 'float64', 'qualifiers': 'unicode'},
1343-
convert_dates=False,
1344-
)
1308+
merged_df = pd.DataFrame(columns=['site_no', 'datetime'])
13451309

1346-
record_df['qualifiers'] = (
1347-
record_df['qualifiers'].str.strip('[]').str.replace("'", '')
1348-
)
1349-
record_df['site_no'] = site_no
1350-
1351-
record_df.rename(
1352-
columns={
1353-
'value': col_name,
1354-
'dateTime': 'datetime',
1355-
'qualifiers': col_name + '_cd',
1356-
},
1357-
inplace=True,
1358-
)
1310+
site_list = [
1311+
ts['sourceInfo']['siteCode'][0]['value'] for ts in json['value']['timeSeries']
1312+
]
13591313

1360-
if merged_df.empty:
1361-
merged_df = record_df
1314+
# create a list of indexes for each change in site no
1315+
# for example, [0, 21, 22] would be the first and last indices
1316+
index_list = [0]
1317+
index_list.extend(
1318+
[i + 1 for i, (a, b) in enumerate(zip(site_list[:-1], site_list[1:])) if a != b]
1319+
)
1320+
index_list.append(len(site_list))
1321+
1322+
for i in range(len(index_list) - 1):
1323+
start = index_list[i] # [0]
1324+
end = index_list[i + 1] # [21]
1325+
1326+
# grab a block containing timeseries 0:21,
1327+
# which are all from the same site
1328+
site_block = json['value']['timeSeries'][start:end]
1329+
if not site_block:
1330+
continue
1331+
1332+
site_no = site_block[0]['sourceInfo']['siteCode'][0]['value']
1333+
site_df = pd.DataFrame(columns=['datetime'])
1334+
1335+
for timeseries in site_block:
1336+
param_cd = timeseries['variable']['variableCode'][0]['value']
1337+
# check whether min, max, mean record XXX
1338+
option = timeseries['variable']['options']['option'][0].get('value')
1339+
1340+
# loop through each parameter in timeseries, then concat to the merged_df
1341+
for parameter in timeseries['values']:
1342+
col_name = param_cd
1343+
method = parameter['method'][0]['methodDescription']
1344+
1345+
# if len(timeseries['values']) > 1 and method:
1346+
if method:
1347+
# get method, format it, and append to column name
1348+
method = method.strip('[]()').lower()
1349+
col_name = f'{col_name}_{method}'
1350+
1351+
if option:
1352+
col_name = f'{col_name}_{option}'
1353+
1354+
record_json = parameter['value']
1355+
1356+
if not record_json:
1357+
# no data in record
1358+
continue
1359+
# should be able to avoid this by dumping
1360+
record_json = str(record_json).replace("'", '"')
1361+
1362+
# read json, converting all values to float64 and all qualifiers
1363+
# Lists can't be hashed, thus we cannot df.merge on a list column
1364+
record_df = pd.read_json(
1365+
StringIO(record_json),
1366+
orient='records',
1367+
dtype={'value': 'float64', 'qualifiers': 'unicode'},
1368+
convert_dates=False,
1369+
)
1370+
1371+
record_df['qualifiers'] = (
1372+
record_df['qualifiers'].str.strip('[]').str.replace("'", '')
1373+
)
13621374

1363-
else:
1364-
merged_df = update_merge(
1365-
merged_df, record_df, na_only=True, on=['site_no', 'datetime']
1375+
record_df.rename(
1376+
columns={
1377+
'value': col_name,
1378+
'dateTime': 'datetime',
1379+
'qualifiers': col_name + '_cd',
1380+
},
1381+
inplace=True,
13661382
)
13671383

1384+
site_df = site_df.merge(record_df, how='outer', on='datetime')
1385+
1386+
# end of site loop
1387+
site_df['site_no'] = site_no
1388+
merged_df = pd.concat([merged_df, site_df])
1389+
13681390
# convert to datetime, normalizing the timezone to UTC when doing so
13691391
if 'datetime' in merged_df.columns:
13701392
merged_df['datetime'] = pd.to_datetime(merged_df['datetime'], utc=True)

dataretrieval/utils.py

Lines changed: 0 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -95,50 +95,6 @@ def format_datetime(df, date_field, time_field, tz_field):
9595
return df
9696

9797

98-
# This function may be deprecated once pandas.update supports joins other than left.
99-
def update_merge(left, right, na_only=False, on=None, **kwargs):
100-
"""Performs a combination update and merge.
101-
102-
Parameters
103-
----------
104-
left: ``pandas.DataFrame``
105-
Original data
106-
right: ``pandas.DataFrame``
107-
Updated data
108-
na_only: bool
109-
If True, only update na values
110-
111-
Returns
112-
-------
113-
df: ``pandas.DataFrame``
114-
Updated data frame
115-
116-
.. todo::
117-
118-
add na_only parameter support
119-
120-
"""
121-
# df = left.merge(right, how='outer',
122-
# left_index=True, right_index=True)
123-
df = left.merge(right, how='outer', on=on, **kwargs)
124-
125-
# check for column overlap and resolve update
126-
for column in df.columns:
127-
# if duplicated column, use the value from right
128-
if column[-2:] == '_x':
129-
name = column[:-2] # find column name
130-
131-
if na_only:
132-
df[name] = df[name + '_x'].fillna(df[name + '_y'])
133-
134-
else:
135-
df[name] = df[name + '_x'].update(df[name + '_y'])
136-
137-
df.drop([name + '_x', name + '_y'], axis=1, inplace=True)
138-
139-
return df
140-
141-
14298
class BaseMetadata:
14399
"""Base class for metadata.
144100

0 commit comments

Comments
 (0)