properly handle forecast with empty catalogs (#111)

wsavran · web-flow · commit 6e07a43502d7 · 2021-04-08T11:58:18.000-07:00
* properly handle forecast with empty catalogs - fixes #107 - added ability for empty events to be specified explicitly - implement storage optimization by allowing for empty catalogs to be omitted so long as final catalog_id = (n_cat - 1) - added unit tests for catalog reading - documentation updates for catalog ascii format
diff --git a/csep/core/catalogs.py b/csep/core/catalogs.py
@@ -905,10 +905,10 @@ def __init__(self, **kwargs):
 
     @classmethod
     def load_ascii_catalogs(cls, filename, **kwargs):
-        """ Loads multiple CSEP catalogs in ASCII format.
+        """ Loads multiple catalogs in csep-ascii format.
 
-        This function can load multiple catalogs stored in a single file or directories. This typically called to
-        load a catalog-based forecast.
+        This function can load multiple catalogs stored in a single file. This typically called to
+        load a catalog-based forecast, but could also load a collection of catalogs stored in the same file
 
         Args:
             filename (str): filepath or directory of catalog files
@@ -926,15 +926,46 @@ def parse_filename(filename):
             start_time = strptime_to_utc_datetime(split_fname[1], format="%Y-%m-%dT%H-%M-%S-%f")
             return (name, start_time)
 
+        def read_float(val):
+            """Returns val as float or None if unable"""
+            try:
+                val = float(val)
+            except:
+                val = None
+            return val
+
         def is_header_line(line):
-            if line[0] == 'lon':
+            if line[0].lower() == 'lon':
                 return True
             else:
                 return False
 
-        name_from_file, start_time = parse_filename(filename)
+        def read_catalog_line(line):
+            # convert to correct types
+            lon = read_float(line[0])
+            lat = read_float(line[1])
+            magnitude = read_float(line[2])
+            # maybe fractional seconds are not included
+            origin_time = line[3]
+            if origin_time:
+                try:
+                    origin_time = strptime_to_utc_epoch(line[3], format='%Y-%m-%dT%H:%M:%S.%f')
+                except ValueError:
+                    origin_time = strptime_to_utc_epoch(line[3], format='%Y-%m-%dT%H:%M:%S')
+            depth = read_float(line[4])
+            catalog_id = int(line[5])
+            event_id = line[6]
+            # temporary event
+            temp_event = (event_id, origin_time, lat, lon, depth, magnitude)
+            return temp_event, catalog_id
+
         # overwrite filename, if user specifies
-        kwargs.setdefault('name', name_from_file)
+        try:
+            name_from_file, start_time = parse_filename(filename)
+            kwargs.setdefault('name', name_from_file)
+        except:
+            pass
+
         # handle all catalogs in single file
         if os.path.isfile(filename):
             with open(filename, 'r', newline='') as input_file:
@@ -948,52 +979,53 @@ def is_header_line(line):
                     if prev_id is None:
                         if is_header_line(line):
                             continue
-                    # convert to correct types
-                    lon = float(line[0])
-                    lat = float(line[1])
-                    magnitude = float(line[2])
-                    # maybe fractional seconds are not included
-                    try:
-                        origin_time = strptime_to_utc_epoch(line[3], format='%Y-%m-%dT%H:%M:%S.%f')
-                    except ValueError:
-                        origin_time = strptime_to_utc_epoch(line[3], format='%Y-%m-%dT%H:%M:%S')
-                    depth = float(line[4])
-                    catalog_id = int(line[5])
-                    event_id = line[6]
+                    # read line and return catalog id
+                    temp_event, catalog_id = read_catalog_line(line)
+                    empty = False
+                    # OK if event_id is empty
+                    if all([val in (None, '') for val in temp_event[1:]]):
+                        empty = True
                     # first event is when prev_id is none, catalog_id should always start at zero
                     if prev_id is None:
                         prev_id = 0
                         # if the first catalog doesn't start at zero
                         if catalog_id != prev_id:
-                            prev_id = catalog_id
-                            # store this event for next time
-                            events = [(event_id, origin_time, lat, lon, depth, magnitude)]
+                            if not empty:
+                                events = [temp_event]
+                            else:
+                                events = []
                             for id in range(catalog_id):
                                 yield cls(data=[], catalog_id=id, **kwargs)
-                    # deal with cases of events
+                            prev_id = catalog_id
+                            continue
+                    # accumulate event if catalog_id is the same as previous event
                     if catalog_id == prev_id:
+                        if not all([val in (None, '') for val in temp_event]):
+                            events.append(temp_event)
                         prev_id = catalog_id
-                        events.append((event_id, origin_time, lat, lon, depth, magnitude))
                     # create and yield class if the events are from different catalogs
                     elif catalog_id == prev_id + 1:
-                        catalog = cls(data=events, catalog_id=prev_id, **kwargs)
+                        yield cls(data=events, catalog_id=prev_id, **kwargs)
+                        # add event to new event list
+                        if not empty:
+                            events = [temp_event]
+                        else:
+                            events = []
                         prev_id = catalog_id
-                        # add first event to new event list
-                        events = [(event_id, origin_time, lat, lon, depth, magnitude)]
-                        yield catalog
                     # this implies there are empty catalogs, because they are not listed in the ascii file
                     elif catalog_id > prev_id + 1:
-                        catalog = cls(data=events, catalog_id=prev_id, **kwargs)
-                        # add event to new event list
-                        events = [(event_id, origin_time, lat, lon, depth, magnitude)]
+                        yield cls(data=events, catalog_id=prev_id, **kwargs)
                         # if prev_id = 0 and catalog_id = 2, then we skipped one catalog. thus, we skip catalog_id - prev_id - 1 catalogs
                         num_empty_catalogs = catalog_id - prev_id - 1
-                        # create empty catalog classes
+                        # first yield empty catalog classes
                         for id in range(num_empty_catalogs):
                             yield cls(data=[], catalog_id=catalog_id - num_empty_catalogs + id, **kwargs)
-                        # finally we want to yield the buffered catalog to preserve order
                         prev_id = catalog_id
-                        yield catalog
+                        # add event to new event list
+                        if not empty:
+                            events = [temp_event]
+                        else:
+                            events = []
                     else:
                         raise ValueError(
                             "catalog_id should be monotonically increasing and events should be ordered by catalog_id")
diff --git a/docs/concepts/catalogs.rst b/docs/concepts/catalogs.rst
@@ -72,8 +72,6 @@ PyCSEP data model.
     Going between a DataFrame and CSEPCatalog is a lossy transformation. It essentially only retains the essential event
     attributes that are defined by the ``dtype`` of the class.
 
-
-
 ****************
 Loading catalogs
 ****************
diff --git a/docs/concepts/evaluations.rst b/docs/concepts/evaluations.rst
@@ -109,11 +109,11 @@ Consistency tests
 Publication reference
 =====================
 
-1. Number test (:ref:`Savran et al., 2020`<savran-2020>`)
-2. Spatial test (:ref:`Savran et al., 2020`<savran-2020>`)
-3. Magnitude test (:ref:`Savran et al., 2020`<savran-2020>`)
-4. Pseudolikelihood test (:ref:`Savran et al., 2020`<savran-2020>`)
-5. Calibration test (:ref:`Savran et al., 2020`<savran-2020>`)
+1. Number test (:ref:`Savran et al., 2020<savran-2020>`)
+2. Spatial test (:ref:`Savran et al., 2020<savran-2020>`)
+3. Magnitude test (:ref:`Savran et al., 2020<savran-2020>`)
+4. Pseudolikelihood test (:ref:`Savran et al., 2020<savran-2020>`)
+5. Calibration test (:ref:`Savran et al., 2020<savran-2020>`)
 
 ****************************
 Preparing evaluation catalog
diff --git a/docs/concepts/forecasts.rst b/docs/concepts/forecasts.rst
@@ -87,14 +87,70 @@ they are trying to model. A grid-based forecast can be easily computed from a ca
 space-magnitude region and counting events within each bin from each catalog in the forecast. There can be issues with
 under sampling, especially for larger magnitude events.
 
-
 Working with catalog-based forecasts
 ####################################
 
 .. autosummary:: csep.core.forecasts.CatalogForecast
 
 Please see visit :ref:`this<catalog-forecast-evaluation>` example for an end-to-end tutorial on how to evaluate a catalog-based
 earthquake forecast. An example of a catalog-based forecast stored in the default PyCSEP format can be found
-`here<https://github.com/SCECcode/csep2/blob/dev/csep/artifacts/ExampleForecasts/CatalogForecasts/ucerf3-landers_1992-06-28T11-57-34-14.csv>_`.
+`here<https://github.com/SCECcode/pycsep/blob/dev/csep/artifacts/ExampleForecasts/CatalogForecasts/ucerf3-landers_1992-06-28T11-57-34-14.csv>`_.
+
+
+The standard format for catalog-based forecasts a comma separated value ASCII format. This format was chosen to be
+human-readable and easy to implement in all programming languages. Information about the format is shown below.
+
+.. note::
+    Custom formats can be supported by writing a custom function or sub-classing the
+    :ref:`AbstractBaseCatalog<csep.core.forecasts.AbstractBaseCatalog>`.
+
+The event format matches the follow specfication: ::
+
+    LON, LAT, MAG, ORIGIN_TIME, DEPTH, CATALOG_ID, EVENT_ID
+    -125.4, 40.1, 3.96, 1992-01-05T0:40:3.1, 8, 0, 0
+
+Each row in the catalog corresponds to an event. The catalogs are expected to be placed into the same file and are
+differentiated through their `catalog_id`. Catalogs with no events can be handled in a couple different ways intended to
+save storage.
+
+The events within a catalog should be sorted in time, and the *catalog_id* should be increasing sequentially. Breaks in
+the *catalog_id* are interpreted as missing catalogs.
+
+The following two examples show how you represent a forecast with 5 catalogs each containing zero events.
+
+**1. Including all events (verbose)** ::
+
+    LON, LAT, MAG, ORIGIN_TIME, DEPTH, CATALOG_ID, EVENT_ID
+    ,,,,,0,
+    ,,,,,1,
+    ,,,,,2,
+    ,,,,,3,
+    ,,,,,4,
+
+**2. Short-hand** ::
+
+    LON, LAT, MAG, ORIGIN_TIME, DEPTH, CATALOG_ID, EVENT_ID
+    ,,,,,4,
+
+The following three example show how you could represent a forecast with 5 catalogs. Four of the catalogs contain zero events
+and one catalog contains one event.
+
+**3. Including all events (verbose)** ::
+
+    LON, LAT, MAG, ORIGIN_TIME, DEPTH, CATALOG_ID, EVENT_ID
+    ,,,,,0,
+    ,,,,,1,
+    ,,,,,2,
+    ,,,,,3,
+    -125.4, 40.1, 3.96, 1992-01-05T0:40:3.1, 8, 4, 0
+
+**4. Short-hand** ::
+
+    LON, LAT, MAG, ORIGIN_TIME, DEPTH, CATALOG_ID, EVENT_ID
+    -125.4, 40.1, 3.96, 1992-01-05T0:40:3.1, 8, 4, 0
+
+The simplest way to orient the file follow (3) in the case where some catalogs contain zero events. The zero oriented
+catalog_id should be assigned to correspond with the total number of catalogs in the forecast. In the case where every catalog
+contains zero forecasted events, you would specify the forecasting using (2). The *catalog_id* should be assigned to
+correspond with the total number of catalogs in the forecast.
 
-We will be adding more to these documentation pages, so stay tuned for updated.
diff --git a/docs/conf.py b/docs/conf.py
@@ -28,9 +28,9 @@
 html_show_sphinx = False
 
 # The short X.Y version
-version = 'v0.2'
+version = 'v0.4'
 # The full version, including alpha/beta/rc tags
-release = 'v0.2.0'
+release = 'v0.4.0'
 
 
 # -- General configuration ---------------------------------------------------
diff --git a/tests/artifacts/test_ascii_catalogs/all_empty.csv b/tests/artifacts/test_ascii_catalogs/all_empty.csv
@@ -0,0 +1 @@
+,,,,,9,
diff --git a/tests/artifacts/test_ascii_catalogs/all_empty_verbose.csv b/tests/artifacts/test_ascii_catalogs/all_empty_verbose.csv
@@ -0,0 +1,10 @@
+,,,,,0,
+,,,,,1,
+,,,,,2,
+,,,,,3,
+,,,,,4,
+,,,,,5,
+,,,,,6,
+,,,,,7,
+,,,,,8,
+,,,,,9,
diff --git a/tests/artifacts/test_ascii_catalogs/all_present.csv b/tests/artifacts/test_ascii_catalogs/all_present.csv
@@ -0,0 +1,10 @@
+1,1,1,1992-01-01T0:0:0.0,1,0,1
+1,1,1,1992-01-01T0:0:0.0,1,1,1
+1,1,1,1992-01-01T0:0:0.0,1,2,1
+1,1,1,1992-01-01T0:0:0.0,1,3,1
+1,1,1,1992-01-01T0:0:0.0,1,4,1
+1,1,1,1992-01-01T0:0:0.0,1,5,1
+1,1,1,1992-01-01T0:0:0.0,1,6,1
+1,1,1,1992-01-01T0:0:0.0,1,7,1
+1,1,1,1992-01-01T0:0:0.0,1,8,1
+1,1,1,1992-01-01T0:0:0.0,1,9,1
diff --git a/tests/artifacts/test_ascii_catalogs/last_empty.csv b/tests/artifacts/test_ascii_catalogs/last_empty.csv
@@ -0,0 +1,2 @@
+1,1,1,1992-01-01T0:0:0.0,1,0,1
+,,,,,9,
diff --git a/tests/artifacts/test_ascii_catalogs/some_empty.csv b/tests/artifacts/test_ascii_catalogs/some_empty.csv
@@ -0,0 +1,4 @@
+1,1,1,1992-01-01T0:0:0.0,1,3,1
+,,,,,5,
+1,1,1,1992-01-01T0:0:0.0,1,7,1
+,,,,,9,
diff --git a/tests/artifacts/test_ascii_catalogs/some_missing_verbose.csv b/tests/artifacts/test_ascii_catalogs/some_missing_verbose.csv
@@ -0,0 +1,10 @@
+,,,,,0,
+,,,,,1,
+,,,,,2,
+1,1,1,1992-01-01T0:0:0.0,1,3,1
+,,,,,4,
+,,,,,5,
+,,,,,6,
+1,1,1,1992-01-01T0:0:0.0,1,7,1
+,,,,,8,
+,,,,,9,
diff --git a/tests/test_catalog.py b/tests/test_catalog.py
@@ -92,7 +92,6 @@ def test_filter_spatial(self):
         filtered_test_cat = test_cat.filter_spatial(region=self.cart_grid)
         numpy.testing.assert_equal(numpy.array([b'2', b'3'], dtype='S256').T, filtered_test_cat.get_event_ids())
 
-
     def test_filter_spatial_in_place_return(self):
         test_cat = copy.deepcopy(self.test_cat1)
         filtered_test_cat = test_cat.filter_spatial(region=self.cart_grid, in_place=False)
diff --git a/tests/test_forecast.py b/tests/test_forecast.py
@@ -0,0 +1,58 @@
+import os, unittest
+import numpy
+from csep import load_catalog_forecast
+
+def get_test_catalog_root():
+    root_dir = os.path.dirname(os.path.abspath(__file__))
+    data_dir = os.path.join(root_dir, 'artifacts', 'test_ascii_catalogs')
+    return data_dir
+
+class TestCatalogForecastCreation(unittest.TestCase):
+
+    def test_ascii_load_all_empty(self):
+        fname = os.path.join(get_test_catalog_root(), 'all_empty.csv')
+        test_fore = load_catalog_forecast(fname)
+        total_event_count = numpy.array([cat.event_count for cat in test_fore]).sum()
+        self.assertEqual(0, total_event_count)
+        self.assertEqual(10, test_fore.n_cat)
+        numpy.testing.assert_array_equal([cat.catalog_id for cat in test_fore], numpy.arange(10))
+
+    def test_ascii_load_all_empty_verbose(self):
+        fname = os.path.join(get_test_catalog_root(), 'all_empty_verbose.csv')
+        test_fore = load_catalog_forecast(fname)
+        total_event_count = numpy.array([cat.event_count for cat in test_fore]).sum()
+        self.assertEqual(0, total_event_count)
+        self.assertEqual(10, test_fore.n_cat)
+        numpy.testing.assert_array_equal([cat.catalog_id for cat in test_fore], numpy.arange(10))
+
+    def test_ascii_load_last_empty(self):
+        fname = os.path.join(get_test_catalog_root(), 'last_empty.csv')
+        test_fore = load_catalog_forecast(fname)
+        total_event_count = numpy.array([cat.event_count for cat in test_fore]).sum()
+        self.assertEqual(1, total_event_count)
+        self.assertEqual(10, test_fore.n_cat)
+        numpy.testing.assert_array_equal([cat.catalog_id for cat in test_fore], numpy.arange(10))
+
+    def test_ascii_some_missing(self):
+        fname = os.path.join(get_test_catalog_root(), 'some_empty.csv')
+        test_fore = load_catalog_forecast(fname)
+        total_event_count = numpy.array([cat.event_count for cat in test_fore]).sum()
+        print([(cat.event_count, cat.catalog_id) for cat in test_fore])
+        self.assertEqual(2, total_event_count)
+        self.assertEqual(10, test_fore.n_cat)
+        numpy.testing.assert_array_equal([cat.catalog_id for cat in test_fore], numpy.arange(10))
+
+    def test_ascii_some_missing_verbose(self):
+        fname = os.path.join(get_test_catalog_root(), 'all_present.csv')
+        test_fore = load_catalog_forecast(fname)
+        total_event_count = numpy.array([cat.event_count for cat in test_fore]).sum()
+        print([(cat.event_count, cat.catalog_id) for cat in test_fore])
+        self.assertEqual(10, total_event_count)
+        self.assertEqual(10, test_fore.n_cat)
+        numpy.testing.assert_array_equal([cat.catalog_id for cat in test_fore], numpy.arange(10))
+
+    def test_ascii_all_present(self):
+        pass
+
+if __name__ == '__main__':
+    unittest.main()

-Original file line number
+Diff line change
@@ @@ -0,0 +1,10 @@ @@
 +,,,,,0,
 +,,,,,1,
 +,,,,,2,
 +,,,,,3,
 +,,,,,4,
 +,,,,,5,
 +,,,,,6,
 +,,,,,7,
 +,,,,,8,
 +,,,,,9,
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,2 @@`
	`1`	`+1,1,1,1992-01-01T0:0:0.0,1,0,1`
	`2`	`+,,,,,9,`