Skip to content

Commit 6e07a43

Browse files
authored
properly handle forecast with empty catalogs (#111)
* properly handle forecast with empty catalogs - fixes #107 - added ability for empty events to be specified explicitly - implement storage optimization by allowing for empty catalogs to be omitted so long as final catalog_id = (n_cat - 1) - added unit tests for catalog reading - documentation updates for catalog ascii format
1 parent a860d08 commit 6e07a43

13 files changed

+226
-46
lines changed

csep/core/catalogs.py

+65-33
Original file line numberDiff line numberDiff line change
@@ -905,10 +905,10 @@ def __init__(self, **kwargs):
905905

906906
@classmethod
907907
def load_ascii_catalogs(cls, filename, **kwargs):
908-
""" Loads multiple CSEP catalogs in ASCII format.
908+
""" Loads multiple catalogs in csep-ascii format.
909909
910-
This function can load multiple catalogs stored in a single file or directories. This typically called to
911-
load a catalog-based forecast.
910+
This function can load multiple catalogs stored in a single file. This typically called to
911+
load a catalog-based forecast, but could also load a collection of catalogs stored in the same file
912912
913913
Args:
914914
filename (str): filepath or directory of catalog files
@@ -926,15 +926,46 @@ def parse_filename(filename):
926926
start_time = strptime_to_utc_datetime(split_fname[1], format="%Y-%m-%dT%H-%M-%S-%f")
927927
return (name, start_time)
928928

929+
def read_float(val):
930+
"""Returns val as float or None if unable"""
931+
try:
932+
val = float(val)
933+
except:
934+
val = None
935+
return val
936+
929937
def is_header_line(line):
930-
if line[0] == 'lon':
938+
if line[0].lower() == 'lon':
931939
return True
932940
else:
933941
return False
934942

935-
name_from_file, start_time = parse_filename(filename)
943+
def read_catalog_line(line):
944+
# convert to correct types
945+
lon = read_float(line[0])
946+
lat = read_float(line[1])
947+
magnitude = read_float(line[2])
948+
# maybe fractional seconds are not included
949+
origin_time = line[3]
950+
if origin_time:
951+
try:
952+
origin_time = strptime_to_utc_epoch(line[3], format='%Y-%m-%dT%H:%M:%S.%f')
953+
except ValueError:
954+
origin_time = strptime_to_utc_epoch(line[3], format='%Y-%m-%dT%H:%M:%S')
955+
depth = read_float(line[4])
956+
catalog_id = int(line[5])
957+
event_id = line[6]
958+
# temporary event
959+
temp_event = (event_id, origin_time, lat, lon, depth, magnitude)
960+
return temp_event, catalog_id
961+
936962
# overwrite filename, if user specifies
937-
kwargs.setdefault('name', name_from_file)
963+
try:
964+
name_from_file, start_time = parse_filename(filename)
965+
kwargs.setdefault('name', name_from_file)
966+
except:
967+
pass
968+
938969
# handle all catalogs in single file
939970
if os.path.isfile(filename):
940971
with open(filename, 'r', newline='') as input_file:
@@ -948,52 +979,53 @@ def is_header_line(line):
948979
if prev_id is None:
949980
if is_header_line(line):
950981
continue
951-
# convert to correct types
952-
lon = float(line[0])
953-
lat = float(line[1])
954-
magnitude = float(line[2])
955-
# maybe fractional seconds are not included
956-
try:
957-
origin_time = strptime_to_utc_epoch(line[3], format='%Y-%m-%dT%H:%M:%S.%f')
958-
except ValueError:
959-
origin_time = strptime_to_utc_epoch(line[3], format='%Y-%m-%dT%H:%M:%S')
960-
depth = float(line[4])
961-
catalog_id = int(line[5])
962-
event_id = line[6]
982+
# read line and return catalog id
983+
temp_event, catalog_id = read_catalog_line(line)
984+
empty = False
985+
# OK if event_id is empty
986+
if all([val in (None, '') for val in temp_event[1:]]):
987+
empty = True
963988
# first event is when prev_id is none, catalog_id should always start at zero
964989
if prev_id is None:
965990
prev_id = 0
966991
# if the first catalog doesn't start at zero
967992
if catalog_id != prev_id:
968-
prev_id = catalog_id
969-
# store this event for next time
970-
events = [(event_id, origin_time, lat, lon, depth, magnitude)]
993+
if not empty:
994+
events = [temp_event]
995+
else:
996+
events = []
971997
for id in range(catalog_id):
972998
yield cls(data=[], catalog_id=id, **kwargs)
973-
# deal with cases of events
999+
prev_id = catalog_id
1000+
continue
1001+
# accumulate event if catalog_id is the same as previous event
9741002
if catalog_id == prev_id:
1003+
if not all([val in (None, '') for val in temp_event]):
1004+
events.append(temp_event)
9751005
prev_id = catalog_id
976-
events.append((event_id, origin_time, lat, lon, depth, magnitude))
9771006
# create and yield class if the events are from different catalogs
9781007
elif catalog_id == prev_id + 1:
979-
catalog = cls(data=events, catalog_id=prev_id, **kwargs)
1008+
yield cls(data=events, catalog_id=prev_id, **kwargs)
1009+
# add event to new event list
1010+
if not empty:
1011+
events = [temp_event]
1012+
else:
1013+
events = []
9801014
prev_id = catalog_id
981-
# add first event to new event list
982-
events = [(event_id, origin_time, lat, lon, depth, magnitude)]
983-
yield catalog
9841015
# this implies there are empty catalogs, because they are not listed in the ascii file
9851016
elif catalog_id > prev_id + 1:
986-
catalog = cls(data=events, catalog_id=prev_id, **kwargs)
987-
# add event to new event list
988-
events = [(event_id, origin_time, lat, lon, depth, magnitude)]
1017+
yield cls(data=events, catalog_id=prev_id, **kwargs)
9891018
# if prev_id = 0 and catalog_id = 2, then we skipped one catalog. thus, we skip catalog_id - prev_id - 1 catalogs
9901019
num_empty_catalogs = catalog_id - prev_id - 1
991-
# create empty catalog classes
1020+
# first yield empty catalog classes
9921021
for id in range(num_empty_catalogs):
9931022
yield cls(data=[], catalog_id=catalog_id - num_empty_catalogs + id, **kwargs)
994-
# finally we want to yield the buffered catalog to preserve order
9951023
prev_id = catalog_id
996-
yield catalog
1024+
# add event to new event list
1025+
if not empty:
1026+
events = [temp_event]
1027+
else:
1028+
events = []
9971029
else:
9981030
raise ValueError(
9991031
"catalog_id should be monotonically increasing and events should be ordered by catalog_id")

docs/concepts/catalogs.rst

-2
Original file line numberDiff line numberDiff line change
@@ -72,8 +72,6 @@ PyCSEP data model.
7272
Going between a DataFrame and CSEPCatalog is a lossy transformation. It essentially only retains the essential event
7373
attributes that are defined by the ``dtype`` of the class.
7474

75-
76-
7775
****************
7876
Loading catalogs
7977
****************

docs/concepts/evaluations.rst

+5-5
Original file line numberDiff line numberDiff line change
@@ -109,11 +109,11 @@ Consistency tests
109109
Publication reference
110110
=====================
111111

112-
1. Number test (:ref:`Savran et al., 2020`<savran-2020>`)
113-
2. Spatial test (:ref:`Savran et al., 2020`<savran-2020>`)
114-
3. Magnitude test (:ref:`Savran et al., 2020`<savran-2020>`)
115-
4. Pseudolikelihood test (:ref:`Savran et al., 2020`<savran-2020>`)
116-
5. Calibration test (:ref:`Savran et al., 2020`<savran-2020>`)
112+
1. Number test (:ref:`Savran et al., 2020<savran-2020>`)
113+
2. Spatial test (:ref:`Savran et al., 2020<savran-2020>`)
114+
3. Magnitude test (:ref:`Savran et al., 2020<savran-2020>`)
115+
4. Pseudolikelihood test (:ref:`Savran et al., 2020<savran-2020>`)
116+
5. Calibration test (:ref:`Savran et al., 2020<savran-2020>`)
117117

118118
****************************
119119
Preparing evaluation catalog

docs/concepts/forecasts.rst

+59-3
Original file line numberDiff line numberDiff line change
@@ -87,14 +87,70 @@ they are trying to model. A grid-based forecast can be easily computed from a ca
8787
space-magnitude region and counting events within each bin from each catalog in the forecast. There can be issues with
8888
under sampling, especially for larger magnitude events.
8989

90-
9190
Working with catalog-based forecasts
9291
####################################
9392

9493
.. autosummary:: csep.core.forecasts.CatalogForecast
9594

9695
Please see visit :ref:`this<catalog-forecast-evaluation>` example for an end-to-end tutorial on how to evaluate a catalog-based
9796
earthquake forecast. An example of a catalog-based forecast stored in the default PyCSEP format can be found
98-
`here<https://github.com/SCECcode/csep2/blob/dev/csep/artifacts/ExampleForecasts/CatalogForecasts/ucerf3-landers_1992-06-28T11-57-34-14.csv>_`.
97+
`here<https://github.com/SCECcode/pycsep/blob/dev/csep/artifacts/ExampleForecasts/CatalogForecasts/ucerf3-landers_1992-06-28T11-57-34-14.csv>`_.
98+
99+
100+
The standard format for catalog-based forecasts a comma separated value ASCII format. This format was chosen to be
101+
human-readable and easy to implement in all programming languages. Information about the format is shown below.
102+
103+
.. note::
104+
Custom formats can be supported by writing a custom function or sub-classing the
105+
:ref:`AbstractBaseCatalog<csep.core.forecasts.AbstractBaseCatalog>`.
106+
107+
The event format matches the follow specfication: ::
108+
109+
LON, LAT, MAG, ORIGIN_TIME, DEPTH, CATALOG_ID, EVENT_ID
110+
-125.4, 40.1, 3.96, 1992-01-05T0:40:3.1, 8, 0, 0
111+
112+
Each row in the catalog corresponds to an event. The catalogs are expected to be placed into the same file and are
113+
differentiated through their `catalog_id`. Catalogs with no events can be handled in a couple different ways intended to
114+
save storage.
115+
116+
The events within a catalog should be sorted in time, and the *catalog_id* should be increasing sequentially. Breaks in
117+
the *catalog_id* are interpreted as missing catalogs.
118+
119+
The following two examples show how you represent a forecast with 5 catalogs each containing zero events.
120+
121+
**1. Including all events (verbose)** ::
122+
123+
LON, LAT, MAG, ORIGIN_TIME, DEPTH, CATALOG_ID, EVENT_ID
124+
,,,,,0,
125+
,,,,,1,
126+
,,,,,2,
127+
,,,,,3,
128+
,,,,,4,
129+
130+
**2. Short-hand** ::
131+
132+
LON, LAT, MAG, ORIGIN_TIME, DEPTH, CATALOG_ID, EVENT_ID
133+
,,,,,4,
134+
135+
The following three example show how you could represent a forecast with 5 catalogs. Four of the catalogs contain zero events
136+
and one catalog contains one event.
137+
138+
**3. Including all events (verbose)** ::
139+
140+
LON, LAT, MAG, ORIGIN_TIME, DEPTH, CATALOG_ID, EVENT_ID
141+
,,,,,0,
142+
,,,,,1,
143+
,,,,,2,
144+
,,,,,3,
145+
-125.4, 40.1, 3.96, 1992-01-05T0:40:3.1, 8, 4, 0
146+
147+
**4. Short-hand** ::
148+
149+
LON, LAT, MAG, ORIGIN_TIME, DEPTH, CATALOG_ID, EVENT_ID
150+
-125.4, 40.1, 3.96, 1992-01-05T0:40:3.1, 8, 4, 0
151+
152+
The simplest way to orient the file follow (3) in the case where some catalogs contain zero events. The zero oriented
153+
catalog_id should be assigned to correspond with the total number of catalogs in the forecast. In the case where every catalog
154+
contains zero forecasted events, you would specify the forecasting using (2). The *catalog_id* should be assigned to
155+
correspond with the total number of catalogs in the forecast.
99156

100-
We will be adding more to these documentation pages, so stay tuned for updated.

docs/conf.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -28,9 +28,9 @@
2828
html_show_sphinx = False
2929

3030
# The short X.Y version
31-
version = 'v0.2'
31+
version = 'v0.4'
3232
# The full version, including alpha/beta/rc tags
33-
release = 'v0.2.0'
33+
release = 'v0.4.0'
3434

3535

3636
# -- General configuration ---------------------------------------------------
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
,,,,,9,
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
,,,,,0,
2+
,,,,,1,
3+
,,,,,2,
4+
,,,,,3,
5+
,,,,,4,
6+
,,,,,5,
7+
,,,,,6,
8+
,,,,,7,
9+
,,,,,8,
10+
,,,,,9,
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
1,1,1,1992-01-01T0:0:0.0,1,0,1
2+
1,1,1,1992-01-01T0:0:0.0,1,1,1
3+
1,1,1,1992-01-01T0:0:0.0,1,2,1
4+
1,1,1,1992-01-01T0:0:0.0,1,3,1
5+
1,1,1,1992-01-01T0:0:0.0,1,4,1
6+
1,1,1,1992-01-01T0:0:0.0,1,5,1
7+
1,1,1,1992-01-01T0:0:0.0,1,6,1
8+
1,1,1,1992-01-01T0:0:0.0,1,7,1
9+
1,1,1,1992-01-01T0:0:0.0,1,8,1
10+
1,1,1,1992-01-01T0:0:0.0,1,9,1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
1,1,1,1992-01-01T0:0:0.0,1,0,1
2+
,,,,,9,
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
1,1,1,1992-01-01T0:0:0.0,1,3,1
2+
,,,,,5,
3+
1,1,1,1992-01-01T0:0:0.0,1,7,1
4+
,,,,,9,
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
,,,,,0,
2+
,,,,,1,
3+
,,,,,2,
4+
1,1,1,1992-01-01T0:0:0.0,1,3,1
5+
,,,,,4,
6+
,,,,,5,
7+
,,,,,6,
8+
1,1,1,1992-01-01T0:0:0.0,1,7,1
9+
,,,,,8,
10+
,,,,,9,

tests/test_catalog.py

-1
Original file line numberDiff line numberDiff line change
@@ -92,7 +92,6 @@ def test_filter_spatial(self):
9292
filtered_test_cat = test_cat.filter_spatial(region=self.cart_grid)
9393
numpy.testing.assert_equal(numpy.array([b'2', b'3'], dtype='S256').T, filtered_test_cat.get_event_ids())
9494

95-
9695
def test_filter_spatial_in_place_return(self):
9796
test_cat = copy.deepcopy(self.test_cat1)
9897
filtered_test_cat = test_cat.filter_spatial(region=self.cart_grid, in_place=False)

tests/test_forecast.py

+58
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
import os, unittest
2+
import numpy
3+
from csep import load_catalog_forecast
4+
5+
def get_test_catalog_root():
6+
root_dir = os.path.dirname(os.path.abspath(__file__))
7+
data_dir = os.path.join(root_dir, 'artifacts', 'test_ascii_catalogs')
8+
return data_dir
9+
10+
class TestCatalogForecastCreation(unittest.TestCase):
11+
12+
def test_ascii_load_all_empty(self):
13+
fname = os.path.join(get_test_catalog_root(), 'all_empty.csv')
14+
test_fore = load_catalog_forecast(fname)
15+
total_event_count = numpy.array([cat.event_count for cat in test_fore]).sum()
16+
self.assertEqual(0, total_event_count)
17+
self.assertEqual(10, test_fore.n_cat)
18+
numpy.testing.assert_array_equal([cat.catalog_id for cat in test_fore], numpy.arange(10))
19+
20+
def test_ascii_load_all_empty_verbose(self):
21+
fname = os.path.join(get_test_catalog_root(), 'all_empty_verbose.csv')
22+
test_fore = load_catalog_forecast(fname)
23+
total_event_count = numpy.array([cat.event_count for cat in test_fore]).sum()
24+
self.assertEqual(0, total_event_count)
25+
self.assertEqual(10, test_fore.n_cat)
26+
numpy.testing.assert_array_equal([cat.catalog_id for cat in test_fore], numpy.arange(10))
27+
28+
def test_ascii_load_last_empty(self):
29+
fname = os.path.join(get_test_catalog_root(), 'last_empty.csv')
30+
test_fore = load_catalog_forecast(fname)
31+
total_event_count = numpy.array([cat.event_count for cat in test_fore]).sum()
32+
self.assertEqual(1, total_event_count)
33+
self.assertEqual(10, test_fore.n_cat)
34+
numpy.testing.assert_array_equal([cat.catalog_id for cat in test_fore], numpy.arange(10))
35+
36+
def test_ascii_some_missing(self):
37+
fname = os.path.join(get_test_catalog_root(), 'some_empty.csv')
38+
test_fore = load_catalog_forecast(fname)
39+
total_event_count = numpy.array([cat.event_count for cat in test_fore]).sum()
40+
print([(cat.event_count, cat.catalog_id) for cat in test_fore])
41+
self.assertEqual(2, total_event_count)
42+
self.assertEqual(10, test_fore.n_cat)
43+
numpy.testing.assert_array_equal([cat.catalog_id for cat in test_fore], numpy.arange(10))
44+
45+
def test_ascii_some_missing_verbose(self):
46+
fname = os.path.join(get_test_catalog_root(), 'all_present.csv')
47+
test_fore = load_catalog_forecast(fname)
48+
total_event_count = numpy.array([cat.event_count for cat in test_fore]).sum()
49+
print([(cat.event_count, cat.catalog_id) for cat in test_fore])
50+
self.assertEqual(10, total_event_count)
51+
self.assertEqual(10, test_fore.n_cat)
52+
numpy.testing.assert_array_equal([cat.catalog_id for cat in test_fore], numpy.arange(10))
53+
54+
def test_ascii_all_present(self):
55+
pass
56+
57+
if __name__ == '__main__':
58+
unittest.main()

0 commit comments

Comments
 (0)