From f5e6a80b40b2a824935703f046c05cc1efa54722 Mon Sep 17 00:00:00 2001 From: Kirsty Bayliss Date: Mon, 8 May 2023 10:46:46 +0200 Subject: [PATCH 01/10] Add declustering options with Reasenberg and Zaliapin declustering --- .../seismicity/declusterer/dec_reasenberg.py | 425 ++++++++++++++++++ .../seismicity/declusterer/dec_zaliapin.py | 292 ++++++++++++ 2 files changed, 717 insertions(+) create mode 100644 openquake/hmtk/seismicity/declusterer/dec_reasenberg.py create mode 100644 openquake/hmtk/seismicity/declusterer/dec_zaliapin.py diff --git a/openquake/hmtk/seismicity/declusterer/dec_reasenberg.py b/openquake/hmtk/seismicity/declusterer/dec_reasenberg.py new file mode 100644 index 000000000000..8cfd088aeda9 --- /dev/null +++ b/openquake/hmtk/seismicity/declusterer/dec_reasenberg.py @@ -0,0 +1,425 @@ +# -*- coding: utf-8 -*- +# vim: tabstop=4 shiftwidth=4 softtabstop=4 + +""" +Module :mod:`openquake.hmtk.seismicity.declusterer.dec_reasenberg` +defines the Reasenberg declustering algorithm +""" + +from bisect import bisect_left +from typing import Callable, Tuple, Dict, Optional +from openquake.hazardlib.geo.geodetic import geodetic_distance, distance, _prepare_coords +import numpy as np + +# import datetime +from openquake.hmtk.seismicity.declusterer.base import (BaseCatalogueDecluster, DECLUSTERER_METHODS) +from openquake.hmtk.seismicity.utils import haversine + + + +@DECLUSTERER_METHODS.add( + "decluster", + taumin=1.0, # look ahead time for not clustered events, days + taumax=10.0, # maximum look ahead time for clustered events, days + P=0.95, # confidence level that this is next event in sequence + xk=0.5, # factor used with xmeff to modify magnitude cutoff during clusters + xmeff=1.5, # magnitude effective, used with xk to define magnitude cutoff + rfact=10, # factor for interaction radius for dependent events + horiz_error=1.5, # epicenter error, km. if unspecified or None, it is pulled from the catalogue + depth_error=2.0, # depth error, km. if unspecified or None, it is pulled from the catalogue + interaction_formula='Reasenberg1985', # either `Reasenberg1985` or `WellsCoppersmith1994` + max_interaction_dist=np.inf # km, some studies limit to crustal thickness (ex. 30) +) +class Reasenberg(BaseCatalogueDecluster): + """ + This class implements the Reasenberg algorithm as described in + this paper: + + Reasenberg, P., 1985. Second‐order moment of central California + seismicity, 1969–1982. Journal of Geophysical Research: Solid Earth, + 90(B7), pp.5479-5495. + + Declustering code originally converted to MATLAB by A. Allman. + Then, highly modified and converted to Python by CG Reyes. Further modified by K Bayliss. 
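+
+    A minimal usage sketch (the parameter values below are illustrative only,
+    and ``catalogue`` is assumed to be an hmtk catalogue sorted in ascending
+    date order)::
+
+        config = {'taumin': 1.0, 'taumax': 10.0, 'P': 0.95,
+                  'xk': 0.5, 'xmeff': 1.5, 'rfact': 10,
+                  'horiz_error': 1.5, 'depth_error': 2.0,
+                  'interaction_formula': 'Reasenberg1985',
+                  'max_interaction_dist': 30}
+        vcl, msi = Reasenberg().decluster(catalogue, config)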
+ + + # default_config = dict(taumin=1.0, # tau(t==0) + # taumax=10., # tau(t -> inf), computational simplification should be scaled to local bg rate + # P=0.95, + # xk=0.5, + # xmeff=1.5, + # rfact=10., + # horiz_error=1.5, + # depth_error=2., + # dmethod='gc', ## REMOVED + # interaction_formula='Reasenberg1985', + # max_interaction_dist=30 # km, limit to crustal distance + # ) + + """ + + def __init__(self): + self.verbose = False + self.interaction_formulas = {'Reasenberg1985': lambda m: 0.011 * 10 ** (0.4 * m), + 'WellsCoppersmith1994': lambda m: 0.01 * 10 ** (0.5 * m)} # Helmstetter (SRL) 2007 + + @staticmethod + def clust_look_ahead_time(mag_big: float, dt_big: np.ndarray, xk: float, xmeff: float, p: float) -> np.ndarray: + """ CLUSTLOOKAHEAD calculate look ahead time for clustered events + + :param mag_big: + biggest magnitude in the cluster + :param dt_big: + days difference between biggest event and this one + :param xk: + factor used with xmeff to define magnitude cutoff - increases effective magnitude during clusters + :param xmeff: + magnitude effective, used with xk to define magnitude cutoff + :param p: + confidence level that this is next event in sequence + :returns: + tau: look-ahead time for clustered events (days) + """ + + deltam = (1 - xk) * mag_big - xmeff # delta in magnitude + + if deltam < 0: + deltam = 0 + + denom = 10.0 ** ((deltam - 1) * 2 / 3) # expected rate of aftershocks + top = -np.log(1 - p) * dt_big # natural log, verified in Reasenberg + tau = top / denom # equation from Reasenberg paper + return tau + + + def decluster(self, catalogue, config) -> Tuple[np.ndarray, np.ndarray]: + """ + decluster the catalog using the provided configuration + + :param catalogue: Catalogue of earthquakes + :param config: Configuration parameters + :returns: + **vcl vector** indicating cluster number, + **ms vector** indicating which events should be considered mainshock events + + if catalogue contains additional data keys data['depthError'] and/or + data['horizError'], containing the location error for each event, then + these will be used to bring the decluster distance closer together + [additively]. To override the inbuilt catalog depth and horiz errors, + set the value of config['depth_error'] and config['horiz_error'] respectively + + """ + + self.verbose = config.get('verbose', self.verbose) + # Get relevant parameters + neq = catalogue.get_number_events() # Number of earthquakes + + min_lookahead_days = config['taumin'] + max_lookahead_days = config['taumax'] + + # Get elapsed days + elapsed = days_from_first_event(catalogue) + + assert np.all(elapsed[1:] >= elapsed[:-1]), "catalogue needs to be in ascending date order" + + # easy-access variables + ### What even is dmethod? This is literally never explained... + #if config.get('dmethod', 'gc') == 'gc': + # surf_pos = (catalogue['latitude'], catalogue['longitude']) + # event_distance = event_gc_distance + + #elif config['dmethod'] == 'p2p': + # surf_pos = geodetic_to_ecef(catalogue['latitude'], catalogue['longitude']) # assumes at surface + # event_distance = event_p2p_distance + #else: + # raise ValueError("unknown configuration dmethod. it should be 'gc' or 'p2p'") + + mags = catalogue['magnitude'] + depth = catalogue['depth'] + + # Errors are determined 1st by the config. If this value doesn't exist or is None, then get the + # error values from the catalog. If errors do not exist within the catalog, then set the errors to 0. 
+ if config.get('horiz_error', None) is None: + horiz_error = catalogue.data.get('horizError', np.zeros(1)) + else: + horiz_error = config['horiz_error'] + + if config.get('depth_error', None) is None: + depth_error = catalogue.data.get('depthError', np.zeros(1)) + else: + depth_error = config['depth_error'] + + # Pre-allocate cluster index vectors + vcl = np.zeros(neq, dtype=int).flatten() + msi = np.zeros(neq, dtype=int).flatten() + ev_id = np.zeros(neq, dtype=int).flatten() + # set the interaction zones, in km + # Reasenberg 1987 or alternate version: Wells & Coppersmith 1994 / Helmstetter (SRL) 2007 + zone_noclust, zone_clust = self.get_zone_distances_per_mag( + mags=mags, + rfact=config['rfact'], + formula=self.interaction_formulas[config['interaction_formula']], + max_interact_km=config.get('max_interaction_dist', np.inf)) + + k = 0 # clusterindex + + # variable to store information whether earthquake is already clustered + clusmaxmag = np.array([-np.inf] * neq) + clus_biggest_idx = np.zeros(neq, dtype=int) + + # for every earthquake in catalog, main loop + for i in range(neq - 1): + ev_id[i] = i + my_mag = mags[i] + + # variable needed for distance and timediff + my_cluster = vcl[i] + not_classified = my_cluster == 0 + + # attach interaction time + + if not_classified: + # this event is not associated with a cluster, yet + # self.debug_print(i, ' is not in a cluster') + look_ahead_days = min_lookahead_days + + elif my_mag >= clusmaxmag[my_cluster]: + # this is the biggest event in this cluster, so far (or equal to it). + # note, if this is now the biggest, then the cluster range collapses into its radius + # self.debug_print(f'{i} is the biggest event of cluster M={my_mag}') + clusmaxmag[my_cluster] = my_mag + clus_biggest_idx[my_cluster] = i + look_ahead_days = min_lookahead_days + # time between largest event in cluster and this event is 0, so use min_lookahead_days (rather than 0). + else: + # this event is already tied to a cluster, but is not the largest event + #self.debug_print(i, ' is already in cluster, but not biggest') + idx_biggest = clus_biggest_idx[my_cluster] + days_since_biggest = elapsed[i] - elapsed[idx_biggest] + # set new look_ahead_days (function of time from largest event in cluster) + look_ahead_days = self.clust_look_ahead_time(clusmaxmag[my_cluster], + days_since_biggest, + config['xk'], + config['xmeff'], + config['P']) + + # look_ahead_days should be > min and < max to prevent this running forever... + look_ahead_days = np.clip(look_ahead_days, min_lookahead_days, max_lookahead_days) + + # extract eqs that fit interaction time window -------------- + + max_elapsed = elapsed[i] + look_ahead_days + next_event = i + 1 + # find location of last event between elapsed and max_elapsed + last_event = bisect_left(elapsed, max_elapsed, lo=next_event) + #print("last event", last_event, "i", i) + # range from next event (i+1) to last event (end of the lookahead time) + temporal_evs = np.arange(next_event, last_event) + #print("temp_ev ", temporal_evs) + if my_cluster != 0: + # If we are in a cluster, consider only events that are not already in the cluster + temporal_evs = temporal_evs[vcl[temporal_evs] != my_cluster] + + if len(temporal_evs) == 0: + continue + + # ------------------------------------ + # one or more events have now passed the time window test. 
Now compare + # this subcatalog in space to A) most recent and B) largest event in cluster + # ------------------------------------ + + my_biggest_idx = clus_biggest_idx[my_cluster] + bg_ev_for_dist = i if not_classified else my_biggest_idx + + dist_to_recent = distance(catalogue.data['latitude'][i], catalogue.data['longitude'][i], depth[i], catalogue.data['latitude'][temporal_evs], catalogue.data['longitude'][temporal_evs], depth[temporal_evs]) + dist_to_biggest = distance(catalogue.data['latitude'][bg_ev_for_dist], catalogue.data['longitude'][bg_ev_for_dist], depth[bg_ev_for_dist], catalogue.data['latitude'][temporal_evs], catalogue.data['longitude'][temporal_evs], depth[temporal_evs]) + + if look_ahead_days == min_lookahead_days: + l_big = dist_to_biggest == 0 # all false + l_recent = dist_to_recent <= zone_noclust[my_mag] + + else: + l_big = dist_to_biggest <= zone_noclust[clusmaxmag[my_cluster]] + l_recent = dist_to_recent <= zone_clust[my_mag] + + spatial_evs = np.logical_or(l_recent, l_big) + + if not any(spatial_evs): + # self.debug_print() + continue + + # ------------------------------------ + # one or more events have now passed both spatial and temporal window tests + # + # if there are events in this cluster that are already related to another + # cluster, figure out the smallest cluster number. Then, assign all events + # (both previously clustered and unclustered) to this new cluster number. + # ------------------------------------ + + # spatial events only include events AFTER i, not i itself + # so vcl[events_in_any_cluster] is independent from vcl[i] + + candidates = temporal_evs[spatial_evs] # eqs that fit spatial and temporal criterion + events_in_any_cluster = candidates[vcl[candidates] != 0] # eqs which are already related with a cluster + events_in_no_cluster = candidates[vcl[candidates] == 0] # eqs that are not already in a cluster + + # if this cluster overlaps with any other cluster, then merge them + # assign every event in all related clusters to the same (lowest) cluster number + # set this cluster's maximum magnitude "clusmaxmag" to the largest magnitude of all combined events + # set this cluster's clus_biggest_idx to the (most recent ??) 
largest event of all combined events + # Flag largest event in each cluster + + if len(events_in_any_cluster) > 0: + if not_classified: + related_clust_nums = np.unique(vcl[events_in_any_cluster]) + else: + # include this cluster number in the reckoning + related_clust_nums = np.unique(np.hstack((vcl[events_in_any_cluster], my_cluster,))) + + # associate all related events with my cluster + my_cluster = related_clust_nums[0] + vcl[i] = my_cluster + vcl[candidates] = my_cluster + + for clustnum in related_clust_nums: + vcl[vcl == clustnum] = my_cluster + + events_in_my_cluster = vcl == my_cluster + biggest_mag = np.max(mags[events_in_my_cluster]) + + biggest_mag_idx = np.flatnonzero(np.logical_and(mags == biggest_mag, events_in_my_cluster))[-1] + + # reset values for other clusters + clusmaxmag[related_clust_nums] = -np.inf + clus_biggest_idx[related_clust_nums] = 0 + + # assign values for this cluster + clusmaxmag[my_cluster] = biggest_mag + clus_biggest_idx[my_cluster] = biggest_mag_idx + + elif my_cluster == 0: + k += 1 + vcl[i] = my_cluster = k + clusmaxmag[my_cluster] = my_mag + clus_biggest_idx[my_cluster] = i + else: + pass # no events found, and attached to existing cluster + + # attach clustnumber to catalogue yet unrelated to a cluster + vcl[events_in_no_cluster] = my_cluster + + # set mainshock index for all events not in a cluster to be 1 also + # i.e. an independent event counts as a mainshock + msi[events_in_no_cluster] = 1 + + # for each cluster, identify mainshock + clusters = np.unique(vcl) + for cluster_no in clusters: + cluster_ids = ev_id[vcl == cluster_no] + biggest_mag_idx = np.where(np.max(mags[vcl == cluster_no])) + ms_id = cluster_ids[biggest_mag_idx] + #ms_id = np.where(mags == biggest_mag, cluster_ids))[-1] + #ms_id = np.asarray(np.max(mags[vcl = cluster_no]).nonzero() + msi[ms_id] = 1 + + return vcl, msi + + @staticmethod + def get_zone_distances_per_mag(mags: np.ndarray, rfact: float, formula: Callable, + max_interact_km: float = np.inf) -> Tuple[Dict, Dict]: + """ + :param mags: + list of magnitudes + :param rfact: + interaction distance scaling value for clusters + :param formula: + formula that takes magnitudes and returns interaction distances + :param max_interact_km: + if used, may refer to crustal thickness + :returns: + dictionaries keyed on sorted unique magnitudes. 
+ zone_noclust : + dictionary of interaction distances when not in a cluster + so, zone_noclust[mag] -> out-of-cluster interaction distance for mag + zone_clust : + dictionary of interaction distances when IN a cluster + so, zone_noclust[mag] -> in-cluster interaction distance for mag + """ + + unique_mags = np.unique(mags) + noclust_km = formula(unique_mags) + clust_km = noclust_km * rfact + + # limit interaction distance [generally to one crustal thickness], in km + noclust_km[noclust_km > max_interact_km] = max_interact_km + clust_km[clust_km > max_interact_km] = max_interact_km + + zone_noclust = dict(zip(unique_mags, noclust_km)) + zone_clust = dict(zip(unique_mags, clust_km)) + return zone_noclust, zone_clust + + def debug_print(self, *args, **kwargs): + if self.verbose: + print(*args, **kwargs) + +def relative_time(years, months, days, hours=None, minutes=None, seconds=None, datetime_unit ='D', reference_date=None): + """ Get elapsed days since first event in catalog + + :param reference_date + use this array of [Y, M, D, h, m, s] instead of first event in catalog + :type: Array + :returns: + **elapsed** number of days since reference_date (which defaults to the first event in catalog) + :rtype: numpy.timedelta64 array + """ + + import datetime + if hours is None and minutes is None and seconds is None: + dates = [datetime.datetime( + *[years[n], months[n], days[n]]) + for n in range(len(years))] + else: + hours = hours if minutes is not None else np.zeros_like(years) + minutes = minutes if minutes is not None else np.zeros_like(years) + seconds = seconds if seconds is not None else np.zeros_like(years) + + dates = [datetime.datetime( + *[years[n], months[n], days[n],hours[n], minutes[n], seconds[n].astype(int)]) + for n in range(len(years))] + + dt64 = np.array([np.datetime64(v) for v in dates]) + if reference_date: + from_day = np.datetime64(datetime.datetime(*reference_date)) + else: + from_day = min(dt64) + + return (dt64 - from_day) / np.timedelta64(1, datetime_unit) + + +def days_from_first_event(catalog) -> relative_time: + return relative_time(catalog['year'], catalog['month'], catalog['day'], + catalog['hour'], catalog['minute'], + catalog['second'].astype(int), + datetime_unit='D') + + +def get_distance_errors(directional_error, src_idx, targ_idxs) -> Tuple[float, np.ndarray]: + """ + + :param directional_error: + :param src_idx: + :param targ_idxs: + :return: + """ + if directional_error is None: + directional_error = 0. 
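+
+    # A scalar error is applied uniformly to the source and target events,
+    # while an array of errors is indexed per event below.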
+ + if isinstance(directional_error, (np.ndarray,)): + src_err = directional_error[src_idx] + targ_err = directional_error[targ_idxs] + else: + src_err = directional_error + targ_err = directional_error + return src_err, targ_err + + diff --git a/openquake/hmtk/seismicity/declusterer/dec_zaliapin.py b/openquake/hmtk/seismicity/declusterer/dec_zaliapin.py new file mode 100644 index 000000000000..f16d4c0074d9 --- /dev/null +++ b/openquake/hmtk/seismicity/declusterer/dec_zaliapin.py @@ -0,0 +1,292 @@ +# -*- coding: utf-8 -*- +# vim: tabstop=4 shiftwidth=4 softtabstop=4 + +""" +Module :mod:`openquake.hmtk.seismicity.declusterer.dec_zaliapin` +defines the Zaliapin declustering algorithm + + +""" +import numpy as np +import datetime +import matplotlib.dates as mdates + +from openquake.hmtk.seismicity.declusterer.base import ( + BaseCatalogueDecluster, DECLUSTERER_METHODS) + +from openquake.hazardlib.geo.geodetic import geodetic_distance, distance, _prepare_coords + +#from openquake.hmtk.seismicity.utils import relative_time + +from sklearn import mixture +#from pymap3d import geodetic2ned + + +@DECLUSTERER_METHODS.add( + "decluster", + fractal_dim=np.float, # spatial weighting factor + b_value=np.float, # magnitude weighting factor +) +class Zaliapin(BaseCatalogueDecluster): + """ + Implements the declustering method of Zaliapin and Ben-Zion (2013), based on a nearest-neighbour distance metric. + + Implemented in Python by CG Reyes. + Modified to return mainshocks (largest event per cluster) and to give the option for + stochastic declustering (defaults to this when no threshold is specified). + + """ + + def __init__(self): + pass + + def decluster(self, catalogue, config): + """Zaliapin's declustering code + + :param catalogue: + Earthquake catalog to examine + :param config: + Dictionary containing + 1. values for fractal dimension `frac_dim` and Gutenberg-Richter `b-value` + 2. Method used to choose which events to keep. Use `Threshold = ` to specify a float value for probability at which to keep an event. + If `Stochastic = True` (or no threshold is provided) a stochastic declustering will be implemented. + A seed for the stochastic method can be specified with the `stoch_seed` parameter (will default to 5 if not included) + 3. Optional flag to use depths when calculating event distances. Note that the fractal dimension should correspond to the number of dimensions in the model. + (Zaliapin and Ben-Zion defaults are 1.6 for 2D and 2.4 for 3D distances and these values are generally used in the literature). + + :return: + probability [0..1] that each event is a member of a cluster, and optionally (if + config['nearest_neighbor_dist'] is true) return the nearest neighbor distances, mainshock flag, root (first) event in cluster and leaf depth of cluster. 
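+
+        A minimal illustrative configuration (values are examples only; the
+        catalogue is assumed to be an hmtk catalogue in date order)::
+
+            config = {'fractal_dim': 1.6, 'b_value': 1.0, 'threshold': 0.5,
+                      'output_nearest_neighbor_distances': False}
+            root, ms_flag = Zaliapin().decluster(catalogue, config)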
+ """ + + if 'depth' in config and 'depth' == True: + nnd, nni = self.getnnd(catalogue, config['fractal_dim'], config['b_value'], depth = True) + + else: + nnd, nni = self.getnnd(catalogue, config['fractal_dim'], config['b_value'], depth = False) + + probability_of_independence = self.gaussianmixturefit(nnd) + + if 'threshold' in config and isinstance(s, (int,float)): + threshold = config['threshold'] + root, ld, ms_flag = cluster_number(catalogue, nni, probability_of_independence, threshold = 0.5, stochastic = False) + + else: + root, ld, ms_flag = cluster_number(catalogue, nni, probability_of_independence, stochastic = True) + + + if 'output_nearest_neighbor_distances' in config and config['output_nearest_neighbor_distances']: + return probability_of_independence, nnd, ms_flag, root, ld + + return root, ms_flag + + + @staticmethod + def getnnd(catalogue, d_f=1.6, b=1.0, depth = False): + """ + Identifies nearest neighbour parent and distance for each event. + + :param catalogue: + an earthquake catalogue + :param d_f: + fractal dimension (default 1.6 for 2D, recommend ~2.4 for 3D) + :param b: + b-value for the catalogue + :param depth: + flag to include depths + + :return: + nearest neighbor distances, index of nearest neighbour as numpy arrays + """ + + # set up variables for later + nearest_distance=[0]*len(catalogue.data['latitude']) + nearest_index=[0]*len(catalogue.data['latitude']) + + if (depth == True): + depth = catalogue.data['depth'] + # Set nan depths to 0 + depth[np.isnan(depth)] = 0 + else: depth = np.zeros(len(catalogue)) + + # Is there a faster/better way to do this? Probably yes + # TODO: Figure it out! + time=[0]*len(catalogue) + # Change date and time data into one list of datetimes in years (ie 1980.25) + for i in range(len(time)): + time[i]=datetime.datetime(catalogue.data['year'][i], catalogue.data['month'][i], catalogue.data['day'][i],catalogue.data['hour'][i], catalogue.data['minute'][i],np.int(catalogue.data['second'][i])) + time[i]=mdates.date2num(time[i]) + # date2num gives days, change to years + time[i]= (time[i] -1)/365.25 + + + for j in range(1, len(catalogue)): + # Calculate spatial distance between events + dist = distance(catalogue.data['latitude'][j], catalogue.data['longitude'][j], depth[j], catalogue.data['latitude'][:j], catalogue.data['longitude'][:j], depth[:j]) + + time_diff= time[j]-time[:j] + # If time difference is zero, set to very small value + time_diff[time_diff == 0] = 0.0000001 + # Similarly with spatial distances = 0 + dist[dist < 0.1] = 0.1 + # ZBZ interevent distance is product of time_diff*(10^(b*Mi))*spat_dist^(d_f) + interevent_distance = time_diff*(10**(-b*catalogue.data['magnitude'][:j]))*(dist**d_f) + + # Record index of nearest neighbour (needed to reconstruct the clusters) and the nnd (smallest nnd) + nearest_index[j] = np.argmin(interevent_distance) + nearest_distance[j] = np.min(interevent_distance) + + + ## Set zeroth event to mean NND (doesn't really matter what we do with this) + nearest_distance[0]= np.mean(nearest_distance) + + return(nearest_distance, nearest_index) + + @staticmethod + def gaussianmixturefit(nnd): + """ + fit Gaussian mixture distribution to nearest neighbour distances (Zaliapin and Ben-Zion 2013) and calculate probability of independence from mixture fit. 
+ + :param nnd: + nearest neighbour distances for each event + + :return: + probability of independence of event based on mixture fit + """ + log_nearest_neighbor_dist = np.log10(nnd) + samples = log_nearest_neighbor_dist[np.isfinite(log_nearest_neighbor_dist)] + samples = samples.reshape(-1,1) + + # find two gaussians that fit the nearest-neighbor-distances + clf = mixture.GaussianMixture(n_components=2, covariance_type='full') + clf.fit(samples) + + + probability_of_independence = np.zeros_like(log_nearest_neighbor_dist) + + # calculate the probabilities that it belongs to each of the gaussian curves (2 columns of probabilities) + prediction = clf.predict_proba(samples) + + # keep only the chance that this is a background event (the column matching the gaussian model's largest mean) + probability_of_independence[np.isfinite(log_nearest_neighbor_dist)] = prediction[:, np.argmax(clf.means_)] + + probability_of_independence[0] = 1 # first event, by definition, cannot depend upon prior events + probability_of_independence[np.isnan(probability_of_independence)] = 0 + + return probability_of_independence + + +# TODO: Option to skip the reconstruction phase - then this should be directly comparable to the original version + +## function to reconstruct clusters - need this to identify mainshocks rather than earliest events + +def cluster_number(catalogue, nearest_index, prob_ind, threshold = 0.5, stochastic = True, stoch_seed = 5): + """ + Identifies head node (first event in cluster) and number of iterations of while loop to get to it + (ie how deep in cluster event is; 1st, 2nd, 3rd generation etc). + + :param catalogue: + an earthquake catalogue + + :param nearest_index: + index of event's nearest neighbour (output of getnnd) + + :param prob_ind: + Probability event is independent (output of gaussianmixturefit) + + :param threshold: + Threshold of independence probability at which to consider an event independent. Specified in config file or default to 0.5. + + :param stochastic: + Instead of using a fixed threshold, use a stochastic method to thin clustered events + + :param stoch_seed: + Seed value for stochastic thinning to assure reproducibility. Set by default if not specified. 
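+
+    An event is treated as independent when its independence probability
+    exceeds either a uniform random draw (stochastic thinning) or the fixed
+    threshold; dependent events are then linked to their nearest-neighbour
+    parent and followed back to the first (root) event of their cluster, and
+    the largest event in each cluster is flagged as its mainshock.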
+ + :return: + arrays of root, event depth, mainshock flag + + """ + + n_id = range(len(catalogue)+1) + root = np.zeros(len(catalogue)) + ld = np.zeros(len(catalogue)) + indep_flag = np.zeros(len(catalogue)) + + + if stochastic == True: + # set seed for reproducible results when using stochastic method + np.random.seed(stoch_seed) + indep_flag = 1*prob_ind > np.random.uniform(0,1) + + else: + indep_flag = 1*(prob_ind > threshold) + + parent = nearest_index*(1-indep_flag) + + for i in range(len(catalogue)): + + p = parent[i] + if(p == 0): + root[i] = i + ld[i] = 0 + else: + while (p != 0): + root[i] = p + ld[i] = ld[i] +1 + p = parent[p] + + clust_heads = np.unique(root) + MS = clust_heads.astype(int) + + for j in range(len(clust_heads)): + # locate all events that are part of this cluster + idx = np.argwhere(np.isin(root, clust_heads[j])).ravel() + # collect all magnitudes + mags = catalogue.data['magnitude'][idx] + if len(mags) > 0: + # find mainshock and add to MS + MS_rel = np.argmax(mags) + MS[j] = idx[MS_rel] + + ms_flag = np.zeros(len(catalogue)) + ms_flag[MS] = 1 + + return root, ld, ms_flag + + +#if __name__ == "__main__": + +# import os +# import numpy as np + +# from openquake.hmtk.parsers.catalogue import CsvCatalogueParser +# import pytest + +# BASE_DATA_PATH = os.path.join(os.path.dirname(__file__), '../../tests/seismicity/declusterer/data') +# BASE_TEST_PATH = os.path.join(os.path.dirname(__file__), '../../tests/seismicity/declusterer/dec_zaliapin_test.py') + +# flnme = 'zaliapin_test_catalogue.csv' +# filename = os.path.join(BASE_DATA_PATH, flnme) +# print('Testing with:', filename) +# parser = CsvCatalogueParser(filename) +# cat = parser.read_file() +# comment = cat['comment'] # comment is: IndependentProbability,log10nearestDistances + +# indep_prob, log10_nearestdists = np.array([s.split(',') for s in comment], dtype=float).T + # nearest_dists = [10. 
** np.float(s.split(',')[1]) for s in comment] +# nearest_dists = 10 ** log10_nearestdists + + # config = {'fractal_dim': 2., 'b_value': 1.0, 'output_nearest_neighbor_distances': True} + + # dec = Zaliapin() + # _, nearest_neighbor_dists = dec.decluster(cat, config) + # for n1, n2 in zip(np.round(nearest_neighbor_dists), np.round(nearest_dists)): + # if np.isnan(n1): + # assert np.isnan(n2) + # else: + # assert n1 == n2, f"{n1:.4f} vs {n2:.4f}" + +# pytest.main(['-v', BASE_TEST_PATH]) + + From ecc702ede07d40c740ca86a5be284799fd36f3f3 Mon Sep 17 00:00:00 2001 From: Kirsty Bayliss Date: Mon, 8 May 2023 14:33:24 +0200 Subject: [PATCH 02/10] tests for Zaliapin declustering --- .../seismicity/declusterer/dec_zaliapin.py | 78 +++++------------ .../data/zaliapin_test_catalogue.csv | 21 +++++ .../declusterer/dec_zaliapin_test.py | 87 +++++++++++++++++++ 3 files changed, 128 insertions(+), 58 deletions(-) create mode 100644 openquake/hmtk/tests/seismicity/declusterer/data/zaliapin_test_catalogue.csv create mode 100644 openquake/hmtk/tests/seismicity/declusterer/dec_zaliapin_test.py diff --git a/openquake/hmtk/seismicity/declusterer/dec_zaliapin.py b/openquake/hmtk/seismicity/declusterer/dec_zaliapin.py index f16d4c0074d9..f1ec7fb6bcd8 100644 --- a/openquake/hmtk/seismicity/declusterer/dec_zaliapin.py +++ b/openquake/hmtk/seismicity/declusterer/dec_zaliapin.py @@ -24,8 +24,8 @@ @DECLUSTERER_METHODS.add( "decluster", - fractal_dim=np.float, # spatial weighting factor - b_value=np.float, # magnitude weighting factor + fractal_dim=float, # spatial weighting factor + b_value=float, # magnitude weighting factor ) class Zaliapin(BaseCatalogueDecluster): """ @@ -47,7 +47,7 @@ def decluster(self, catalogue, config): Earthquake catalog to examine :param config: Dictionary containing - 1. values for fractal dimension `frac_dim` and Gutenberg-Richter `b-value` + 1. values for fractal dimension `fractal_dim` and Gutenberg-Richter `b-value` 2. Method used to choose which events to keep. Use `Threshold = ` to specify a float value for probability at which to keep an event. If `Stochastic = True` (or no threshold is provided) a stochastic declustering will be implemented. A seed for the stochastic method can be specified with the `stoch_seed` parameter (will default to 5 if not included) @@ -64,10 +64,10 @@ def decluster(self, catalogue, config): else: nnd, nni = self.getnnd(catalogue, config['fractal_dim'], config['b_value'], depth = False) - + probability_of_independence = self.gaussianmixturefit(nnd) - if 'threshold' in config and isinstance(s, (int,float)): + if 'threshold' in config and isinstance(config['threshold'], (int,float)): threshold = config['threshold'] root, ld, ms_flag = cluster_number(catalogue, nni, probability_of_independence, threshold = 0.5, stochastic = False) @@ -77,8 +77,9 @@ def decluster(self, catalogue, config): if 'output_nearest_neighbor_distances' in config and config['output_nearest_neighbor_distances']: return probability_of_independence, nnd, ms_flag, root, ld - - return root, ms_flag +ì + cluster_index = 1 - ms_flag + return root, cluster_index @staticmethod @@ -107,23 +108,22 @@ def getnnd(catalogue, d_f=1.6, b=1.0, depth = False): depth = catalogue.data['depth'] # Set nan depths to 0 depth[np.isnan(depth)] = 0 - else: depth = np.zeros(len(catalogue)) + else: depth = np.zeros(len(catalogue.data['latitude'])) # Is there a faster/better way to do this? Probably yes # TODO: Figure it out! 
- time=[0]*len(catalogue) + time=[0]*len(catalogue.data['latitude']) # Change date and time data into one list of datetimes in years (ie 1980.25) for i in range(len(time)): - time[i]=datetime.datetime(catalogue.data['year'][i], catalogue.data['month'][i], catalogue.data['day'][i],catalogue.data['hour'][i], catalogue.data['minute'][i],np.int(catalogue.data['second'][i])) + time[i]=datetime.datetime(catalogue.data['year'][i], catalogue.data['month'][i], catalogue.data['day'][i],catalogue.data['hour'][i], catalogue.data['minute'][i], int(catalogue.data['second'][i])) time[i]=mdates.date2num(time[i]) # date2num gives days, change to years time[i]= (time[i] -1)/365.25 + - - for j in range(1, len(catalogue)): + for j in range(1, len(catalogue.data['latitude'])): # Calculate spatial distance between events - dist = distance(catalogue.data['latitude'][j], catalogue.data['longitude'][j], depth[j], catalogue.data['latitude'][:j], catalogue.data['longitude'][:j], depth[:j]) - + dist = distance(catalogue.data['latitude'][j], catalogue.data['longitude'][j], depth[j], catalogue.data['latitude'][:j], catalogue.data['longitude'][:j], depth[:j]) time_diff= time[j]-time[:j] # If time difference is zero, set to very small value time_diff[time_diff == 0] = 0.0000001 @@ -131,7 +131,6 @@ def getnnd(catalogue, d_f=1.6, b=1.0, depth = False): dist[dist < 0.1] = 0.1 # ZBZ interevent distance is product of time_diff*(10^(b*Mi))*spat_dist^(d_f) interevent_distance = time_diff*(10**(-b*catalogue.data['magnitude'][:j]))*(dist**d_f) - # Record index of nearest neighbour (needed to reconstruct the clusters) and the nnd (smallest nnd) nearest_index[j] = np.argmin(interevent_distance) nearest_distance[j] = np.min(interevent_distance) @@ -208,10 +207,10 @@ def cluster_number(catalogue, nearest_index, prob_ind, threshold = 0.5, stochast """ - n_id = range(len(catalogue)+1) - root = np.zeros(len(catalogue)) - ld = np.zeros(len(catalogue)) - indep_flag = np.zeros(len(catalogue)) + n_id = range(len(catalogue.data['latitude'])) + root = np.zeros(len(catalogue.data['latitude'])) + ld = np.zeros(len(catalogue.data['latitude'])) + indep_flag = np.zeros(len(catalogue.data['latitude'])) if stochastic == True: @@ -224,7 +223,7 @@ def cluster_number(catalogue, nearest_index, prob_ind, threshold = 0.5, stochast parent = nearest_index*(1-indep_flag) - for i in range(len(catalogue)): + for i in range(len(catalogue.data['latitude'])): p = parent[i] if(p == 0): @@ -249,44 +248,7 @@ def cluster_number(catalogue, nearest_index, prob_ind, threshold = 0.5, stochast MS_rel = np.argmax(mags) MS[j] = idx[MS_rel] - ms_flag = np.zeros(len(catalogue)) + ms_flag = np.zeros(len(catalogue.data['latitude'])) ms_flag[MS] = 1 return root, ld, ms_flag - - -#if __name__ == "__main__": - -# import os -# import numpy as np - -# from openquake.hmtk.parsers.catalogue import CsvCatalogueParser -# import pytest - -# BASE_DATA_PATH = os.path.join(os.path.dirname(__file__), '../../tests/seismicity/declusterer/data') -# BASE_TEST_PATH = os.path.join(os.path.dirname(__file__), '../../tests/seismicity/declusterer/dec_zaliapin_test.py') - -# flnme = 'zaliapin_test_catalogue.csv' -# filename = os.path.join(BASE_DATA_PATH, flnme) -# print('Testing with:', filename) -# parser = CsvCatalogueParser(filename) -# cat = parser.read_file() -# comment = cat['comment'] # comment is: IndependentProbability,log10nearestDistances - -# indep_prob, log10_nearestdists = np.array([s.split(',') for s in comment], dtype=float).T - # nearest_dists = [10. 
** np.float(s.split(',')[1]) for s in comment] -# nearest_dists = 10 ** log10_nearestdists - - # config = {'fractal_dim': 2., 'b_value': 1.0, 'output_nearest_neighbor_distances': True} - - # dec = Zaliapin() - # _, nearest_neighbor_dists = dec.decluster(cat, config) - # for n1, n2 in zip(np.round(nearest_neighbor_dists), np.round(nearest_dists)): - # if np.isnan(n1): - # assert np.isnan(n2) - # else: - # assert n1 == n2, f"{n1:.4f} vs {n2:.4f}" - -# pytest.main(['-v', BASE_TEST_PATH]) - - diff --git a/openquake/hmtk/tests/seismicity/declusterer/data/zaliapin_test_catalogue.csv b/openquake/hmtk/tests/seismicity/declusterer/data/zaliapin_test_catalogue.csv new file mode 100644 index 000000000000..3ddd1c342b0e --- /dev/null +++ b/openquake/hmtk/tests/seismicity/declusterer/data/zaliapin_test_catalogue.csv @@ -0,0 +1,21 @@ +"year","month","day","hour","minute","second","longitude","latitude","magnitude","flag","comment" +1977,2,5,0,0,0,1,0.5,5,0, +1977,4,1,0,0,0,0,1,5,0, +1977,5,6,0,0,0,0,0.1,5,0, +1977,8,19,0,0,0,0.6,1,5,0, +1977,12,13,0,0,0,0,0.5,6,0, +1978,1,1,0,0,0,0.8,0,5,0, +1978,9,27,0,0,0,0.2,0.8,5,0, +1979,12,29,0,0,0,0.8,0.1,5,0," Smaller mainshock >> d" +1979,12,31,0,0,0,0.6,0.1,5,0," Foreshock of M1" +1980,1,1,0,0,0,0,0,8,0," Large Mainshock1 (M1)" +1980,1,31,0,0,0,0.1,0,7,1," Aftershock of M1" +1980,2,2,0,0,0,0.5,0.1,6,1, +1980,2,3,0,0,0,0.1,0.1,6,1, +1980,2,5,0,0,0,0.5,0.1,6,1, +1980,2,8,0,0,0,0.5,0.1,6,1, +1980,2,12,0,0,0,0,0,7,1," Aftershock of M1 " +1980,2,14,0,0,0,0,0,6,1, +1980,2,16,0,0,0,0,0,6,1, +1982,7,30,0,0,0,0.3,0.1,5,0, +1982,12,21,0,0,0,0.6,0.8,5,0, diff --git a/openquake/hmtk/tests/seismicity/declusterer/dec_zaliapin_test.py b/openquake/hmtk/tests/seismicity/declusterer/dec_zaliapin_test.py new file mode 100644 index 000000000000..b3f069062aaf --- /dev/null +++ b/openquake/hmtk/tests/seismicity/declusterer/dec_zaliapin_test.py @@ -0,0 +1,87 @@ +# -*- coding: utf-8 -*- +# vim: tabstop=4 shiftwidth=4 softtabstop=4 + +# +# LICENSE +# +# Copyright (C) 2010-2023 GEM Foundation, G. Weatherill, M. Pagani, +# D. Monelli. +# +# The Hazard Modeller's Toolkit is free software: you can redistribute +# it and/or modify it under the terms of the GNU Affero General Public +# License as published by the Free Software Foundation, either version +# 3 of the License, or (at your option) any later version. +# +# You should have received a copy of the GNU Affero General Public License +# along with OpenQuake. If not, see +# +# DISCLAIMER +# +# The software Hazard Modeller's Toolkit (openquake.hmtk) provided herein +# is released as a prototype implementation on behalf of +# scientists and engineers working within the GEM Foundation (Global +# Earthquake Model). +# +# It is distributed for the purpose of open collaboration and in the +# hope that it will be useful to the scientific, engineering, disaster +# risk and software design communities. +# +# The software is NOT distributed as part of GEM’s OpenQuake suite +# (https://www.globalquakemodel.org/tools-products) and must be considered as a +# separate entity. The software provided herein is designed and implemented +# by scientific staff. It is not developed to the design standards, nor +# subject to same level of critical review by professional software +# developers, as GEM’s OpenQuake software suite. +# +# Feedback and contribution to the software is welcome, and can be +# directed to the hazard scientific staff of the GEM Model Facility +# (hazard@globalquakemodel.org). 
+# +# The Hazard Modeller's Toolkit (openquake.hmtk) is therefore distributed WITHOUT +# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +# for more details. +# +# The GEM Foundation, and the authors of the software, assume no +# liability for use of the software. + +""" +""" + +import unittest +import os +import numpy as np + +from openquake.hmtk.seismicity.declusterer.dec_zaliapin import Zaliapin +from openquake.hmtk.parsers.catalogue import CsvCatalogueParser + + +class ZaliapinTestCase(unittest.TestCase): + """ + Unit tests for the Zaliapin declustering algorithm class. + """ + + BASE_DATA_PATH = os.path.join(os.path.dirname(__file__), 'data') + + def setUp(self): + """ + Read the sample catalogue + """ + flnme = 'zaliapin_test_catalogue.csv' + filename = os.path.join(self.BASE_DATA_PATH, flnme) + parser = CsvCatalogueParser(filename) + self.cat = parser.read_file() + + def test_dec_zaliapin(self): + # Testing the Zaliapin algorithm + config = {'fractal_dim': 1.6, + 'b_value': 1.0, + 'threshold': 0.5 } + print(config) + # Instantiate the declusterer and process the sample catalogue + dec = Zaliapin() + vcl, flagvector = dec.decluster(self.cat, config) + print('vcl:', vcl) + print('flagvector:', flagvector, self.cat.data['flag']) + np.testing.assert_allclose(flagvector, self.cat.data['flag']) + From 43a94df1ae792f7ceb2c72d27af585ad86322aa6 Mon Sep 17 00:00:00 2001 From: Kirsty Bayliss Date: Wed, 10 May 2023 12:08:23 +0200 Subject: [PATCH 03/10] Reasenberg test - something is not right here! --- .../seismicity/declusterer/dec_reasenberg.py | 37 ++++++-- .../declusterer/dec_reasenberg_test.py | 94 +++++++++++++++++++ .../declusterer/dec_zaliapin_test.py | 1 - 3 files changed, 123 insertions(+), 9 deletions(-) create mode 100644 openquake/hmtk/tests/seismicity/declusterer/dec_reasenberg_test.py diff --git a/openquake/hmtk/seismicity/declusterer/dec_reasenberg.py b/openquake/hmtk/seismicity/declusterer/dec_reasenberg.py index 8cfd088aeda9..4348de3cd8a7 100644 --- a/openquake/hmtk/seismicity/declusterer/dec_reasenberg.py +++ b/openquake/hmtk/seismicity/declusterer/dec_reasenberg.py @@ -112,7 +112,10 @@ def decluster(self, catalogue, config) -> Tuple[np.ndarray, np.ndarray]: self.verbose = config.get('verbose', self.verbose) # Get relevant parameters - neq = catalogue.get_number_events() # Number of earthquakes + #breakpoint() + #neq = catalogue.get_number_events() # Number of earthquakes + neq = len(catalogue.data['latitude']) + print(neq) min_lookahead_days = config['taumin'] max_lookahead_days = config['taumax'] @@ -182,6 +185,7 @@ def decluster(self, catalogue, config) -> Tuple[np.ndarray, np.ndarray]: # this event is not associated with a cluster, yet # self.debug_print(i, ' is not in a cluster') look_ahead_days = min_lookahead_days + print("not classified... looking ahead ", look_ahead_days, " days") elif my_mag >= clusmaxmag[my_cluster]: # this is the biggest event in this cluster, so far (or equal to it). @@ -190,6 +194,7 @@ def decluster(self, catalogue, config) -> Tuple[np.ndarray, np.ndarray]: clusmaxmag[my_cluster] = my_mag clus_biggest_idx[my_cluster] = i look_ahead_days = min_lookahead_days + print("new largest event ... resetting to min lookahead time - looking ahead ", look_ahead_days, " days") # time between largest event in cluster and this event is 0, so use min_lookahead_days (rather than 0). 
else: # this event is already tied to a cluster, but is not the largest event @@ -205,22 +210,31 @@ def decluster(self, catalogue, config) -> Tuple[np.ndarray, np.ndarray]: # look_ahead_days should be > min and < max to prevent this running forever... look_ahead_days = np.clip(look_ahead_days, min_lookahead_days, max_lookahead_days) + print("event in a cluster... looking ahead ", look_ahead_days, " days") # extract eqs that fit interaction time window -------------- + ## OK so it's not classifying any events as clustered :/ max_elapsed = elapsed[i] + look_ahead_days + print("max elapsed time ", max_elapsed) + # i + 1 is returning i.... next_event = i + 1 # find location of last event between elapsed and max_elapsed - last_event = bisect_left(elapsed, max_elapsed, lo=next_event) - #print("last event", last_event, "i", i) + # This is just returning the last event though. No wonder my code is confused. + #last_event = bisect_left(elapsed, max_elapsed, lo=next_event) + events_before_max = elapsed[elapsed < max_elapsed] + last_event = events_before_max[i:] + print("last event", last_event, "next event", next_event) + print("elapsed ", elapsed, "max_elapsed", max_elapsed) # range from next event (i+1) to last event (end of the lookahead time) temporal_evs = np.arange(next_event, last_event) - #print("temp_ev ", temporal_evs) + print("temp_ev ", temporal_evs) if my_cluster != 0: # If we are in a cluster, consider only events that are not already in the cluster temporal_evs = temporal_evs[vcl[temporal_evs] != my_cluster] if len(temporal_evs) == 0: + print("no events in time window :(") continue # ------------------------------------ @@ -229,6 +243,7 @@ def decluster(self, catalogue, config) -> Tuple[np.ndarray, np.ndarray]: # ------------------------------------ my_biggest_idx = clus_biggest_idx[my_cluster] + print("biggest idx:", my_biggest_idx) bg_ev_for_dist = i if not_classified else my_biggest_idx dist_to_recent = distance(catalogue.data['latitude'][i], catalogue.data['longitude'][i], depth[i], catalogue.data['latitude'][temporal_evs], catalogue.data['longitude'][temporal_evs], depth[temporal_evs]) @@ -245,6 +260,9 @@ def decluster(self, catalogue, config) -> Tuple[np.ndarray, np.ndarray]: spatial_evs = np.logical_or(l_recent, l_big) if not any(spatial_evs): + print("no spatial_evs, something's gone horribly wrong!") + print("recent ", l_recent) + print("big ", l_big) # self.debug_print() continue @@ -260,6 +278,7 @@ def decluster(self, catalogue, config) -> Tuple[np.ndarray, np.ndarray]: # so vcl[events_in_any_cluster] is independent from vcl[i] candidates = temporal_evs[spatial_evs] # eqs that fit spatial and temporal criterion + print("candidate events:",candidates) events_in_any_cluster = candidates[vcl[candidates] != 0] # eqs which are already related with a cluster events_in_no_cluster = candidates[vcl[candidates] == 0] # eqs that are not already in a cluster @@ -321,9 +340,13 @@ def decluster(self, catalogue, config) -> Tuple[np.ndarray, np.ndarray]: #ms_id = np.where(mags == biggest_mag, cluster_ids))[-1] #ms_id = np.asarray(np.max(mags[vcl = cluster_no]).nonzero() msi[ms_id] = 1 + + print(vcl) return vcl, msi + + @staticmethod def get_zone_distances_per_mag(mags: np.ndarray, rfact: float, formula: Callable, max_interact_km: float = np.inf) -> Tuple[Dict, Dict]: @@ -353,14 +376,12 @@ def get_zone_distances_per_mag(mags: np.ndarray, rfact: float, formula: Callable # limit interaction distance [generally to one crustal thickness], in km noclust_km[noclust_km > max_interact_km] = 
max_interact_km clust_km[clust_km > max_interact_km] = max_interact_km - zone_noclust = dict(zip(unique_mags, noclust_km)) zone_clust = dict(zip(unique_mags, clust_km)) + print("zone_noclust ", zone_noclust) + print("zone_clust ", zone_clust) return zone_noclust, zone_clust - def debug_print(self, *args, **kwargs): - if self.verbose: - print(*args, **kwargs) def relative_time(years, months, days, hours=None, minutes=None, seconds=None, datetime_unit ='D', reference_date=None): """ Get elapsed days since first event in catalog diff --git a/openquake/hmtk/tests/seismicity/declusterer/dec_reasenberg_test.py b/openquake/hmtk/tests/seismicity/declusterer/dec_reasenberg_test.py new file mode 100644 index 000000000000..2be9a02b4894 --- /dev/null +++ b/openquake/hmtk/tests/seismicity/declusterer/dec_reasenberg_test.py @@ -0,0 +1,94 @@ +# -*- coding: utf-8 -*- +# vim: tabstop=4 shiftwidth=4 softtabstop=4 + +# +# LICENSE +# +# Copyright (C) 2010-2023 GEM Foundation, G. Weatherill, M. Pagani, +# D. Monelli. +# +# The Hazard Modeller's Toolkit is free software: you can redistribute +# it and/or modify it under the terms of the GNU Affero General Public +# License as published by the Free Software Foundation, either version +# 3 of the License, or (at your option) any later version. +# +# You should have received a copy of the GNU Affero General Public License +# along with OpenQuake. If not, see +# +# DISCLAIMER +# +# The software Hazard Modeller's Toolkit (openquake.hmtk) provided herein +# is released as a prototype implementation on behalf of +# scientists and engineers working within the GEM Foundation (Global +# Earthquake Model). +# +# It is distributed for the purpose of open collaboration and in the +# hope that it will be useful to the scientific, engineering, disaster +# risk and software design communities. +# +# The software is NOT distributed as part of GEM’s OpenQuake suite +# (https://www.globalquakemodel.org/tools-products) and must be considered as a +# separate entity. The software provided herein is designed and implemented +# by scientific staff. It is not developed to the design standards, nor +# subject to same level of critical review by professional software +# developers, as GEM’s OpenQuake software suite. +# +# Feedback and contribution to the software is welcome, and can be +# directed to the hazard scientific staff of the GEM Model Facility +# (hazard@globalquakemodel.org). +# +# The Hazard Modeller's Toolkit (openquake.hmtk) is therefore distributed WITHOUT +# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +# for more details. +# +# The GEM Foundation, and the authors of the software, assume no +# liability for use of the software. + +""" +""" + +import unittest +import os +import numpy as np + +from openquake.hmtk.seismicity.declusterer.dec_reasenberg import Reasenberg +from openquake.hmtk.parsers.catalogue import CsvCatalogueParser + + +class ReasenbergTestCase(unittest.TestCase): + """ + Unit tests for the Reasenberg declustering algorithm class. 
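+    The test reuses the synthetic Zaliapin test catalogue, with expected
+    cluster flags taken from that file's 'flag' column.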
+ """ + + BASE_DATA_PATH = os.path.join(os.path.dirname(__file__), 'data') + + def setUp(self): + """ + Read the sample catalogue + """ + flnme = 'zaliapin_test_catalogue.csv' + filename = os.path.join(self.BASE_DATA_PATH, flnme) + parser = CsvCatalogueParser(filename) + self.cat = parser.read_file() + + def test_dec_reasenberg(self): + # Testing the Reasenberg algorithm + config = { 'taumin' : 1.0, # look ahead time for not clustered events, days + 'taumax' : 20.0, # maximum look ahead time for clustered events, days + 'P' : 0.95, # confidence level that this is next event in sequence + 'xk' : 0.5, # factor used with xmeff to define magnitude cutoff + 'xmeff' : 1.5, # magnitude effective, used with xk to define magnitude cutoff + 'rfact' : 20, # factor for interaction radius for dependent events + 'horiz_error' : .5, # epicenter error, km. if unspecified or None, it is pulled from the catalogue + 'depth_error' : 2.0, # depth error, km. if unspecified or None, it is pulled from the catalogue + 'interaction_formula' : 'Reasenberg1985', # either `Reasenberg1985` or `WellsCoppersmith1994` + 'max_interaction_dist' : 100 } + + # Instantiate the declusterer and process the sample catalogue + dec = Reasenberg() + vcl, flagvector = dec.decluster(self.cat, config) + print('vcl:', vcl) + print('flagvector:', flagvector, self.cat.data['flag']) + np.testing.assert_allclose(flagvector, self.cat.data['flag']) + diff --git a/openquake/hmtk/tests/seismicity/declusterer/dec_zaliapin_test.py b/openquake/hmtk/tests/seismicity/declusterer/dec_zaliapin_test.py index b3f069062aaf..ddacf8043231 100644 --- a/openquake/hmtk/tests/seismicity/declusterer/dec_zaliapin_test.py +++ b/openquake/hmtk/tests/seismicity/declusterer/dec_zaliapin_test.py @@ -77,7 +77,6 @@ def test_dec_zaliapin(self): config = {'fractal_dim': 1.6, 'b_value': 1.0, 'threshold': 0.5 } - print(config) # Instantiate the declusterer and process the sample catalogue dec = Zaliapin() vcl, flagvector = dec.decluster(self.cat, config) From 941fbd76529e136a49bf2d43530a607362ec7910 Mon Sep 17 00:00:00 2001 From: Kirsty Bayliss Date: Mon, 12 Jun 2023 11:21:08 +0200 Subject: [PATCH 04/10] Fixes to Reasenberg algorithm to use default distance and test for Reasenberg --- .../seismicity/declusterer/dec_reasenberg.py | 68 +++++++------------ .../declusterer/dec_reasenberg_test.py | 12 ++-- 2 files changed, 29 insertions(+), 51 deletions(-) diff --git a/openquake/hmtk/seismicity/declusterer/dec_reasenberg.py b/openquake/hmtk/seismicity/declusterer/dec_reasenberg.py index 4348de3cd8a7..720fe253aeca 100644 --- a/openquake/hmtk/seismicity/declusterer/dec_reasenberg.py +++ b/openquake/hmtk/seismicity/declusterer/dec_reasenberg.py @@ -112,11 +112,8 @@ def decluster(self, catalogue, config) -> Tuple[np.ndarray, np.ndarray]: self.verbose = config.get('verbose', self.verbose) # Get relevant parameters - #breakpoint() #neq = catalogue.get_number_events() # Number of earthquakes neq = len(catalogue.data['latitude']) - print(neq) - min_lookahead_days = config['taumin'] max_lookahead_days = config['taumax'] @@ -138,7 +135,10 @@ def decluster(self, catalogue, config) -> Tuple[np.ndarray, np.ndarray]: # raise ValueError("unknown configuration dmethod. it should be 'gc' or 'p2p'") mags = catalogue['magnitude'] - depth = catalogue['depth'] + if catalogue['depth'].size > 0: + depth = catalogue.data.get('depth', np.zeros(1)) + else: + depth = np.array([0]*neq) # Errors are determined 1st by the config. 
If this value doesn't exist or is None, then get the # error values from the catalog. If errors do not exist within the catalog, then set the errors to 0. @@ -155,9 +155,10 @@ def decluster(self, catalogue, config) -> Tuple[np.ndarray, np.ndarray]: # Pre-allocate cluster index vectors vcl = np.zeros(neq, dtype=int).flatten() msi = np.zeros(neq, dtype=int).flatten() - ev_id = np.zeros(neq, dtype=int).flatten() + #ev_id = np.zeros(neq, dtype=int).flatten() + ev_id = np.arange(0, neq) # set the interaction zones, in km - # Reasenberg 1987 or alternate version: Wells & Coppersmith 1994 / Helmstetter (SRL) 2007 + # Reasenberg 1987 or alternate version: Wells & Coppersmith 1994 / Helmstetter (SRL) 2007q zone_noclust, zone_clust = self.get_zone_distances_per_mag( mags=mags, rfact=config['rfact'], @@ -166,13 +167,12 @@ def decluster(self, catalogue, config) -> Tuple[np.ndarray, np.ndarray]: k = 0 # clusterindex - # variable to store information whether earthquake is already clustered + # variable to store whether earthquake is already clustered clusmaxmag = np.array([-np.inf] * neq) clus_biggest_idx = np.zeros(neq, dtype=int) # for every earthquake in catalog, main loop for i in range(neq - 1): - ev_id[i] = i my_mag = mags[i] # variable needed for distance and timediff @@ -185,7 +185,7 @@ def decluster(self, catalogue, config) -> Tuple[np.ndarray, np.ndarray]: # this event is not associated with a cluster, yet # self.debug_print(i, ' is not in a cluster') look_ahead_days = min_lookahead_days - print("not classified... looking ahead ", look_ahead_days, " days") + elif my_mag >= clusmaxmag[my_cluster]: # this is the biggest event in this cluster, so far (or equal to it). @@ -194,7 +194,7 @@ def decluster(self, catalogue, config) -> Tuple[np.ndarray, np.ndarray]: clusmaxmag[my_cluster] = my_mag clus_biggest_idx[my_cluster] = i look_ahead_days = min_lookahead_days - print("new largest event ... resetting to min lookahead time - looking ahead ", look_ahead_days, " days") + # time between largest event in cluster and this event is 0, so use min_lookahead_days (rather than 0). else: # this event is already tied to a cluster, but is not the largest event @@ -210,31 +210,24 @@ def decluster(self, catalogue, config) -> Tuple[np.ndarray, np.ndarray]: # look_ahead_days should be > min and < max to prevent this running forever... look_ahead_days = np.clip(look_ahead_days, min_lookahead_days, max_lookahead_days) - print("event in a cluster... looking ahead ", look_ahead_days, " days") + # extract eqs that fit interaction time window -------------- - ## OK so it's not classifying any events as clustered :/ max_elapsed = elapsed[i] + look_ahead_days - print("max elapsed time ", max_elapsed) # i + 1 is returning i.... next_event = i + 1 # find location of last event between elapsed and max_elapsed # This is just returning the last event though. No wonder my code is confused. 
- #last_event = bisect_left(elapsed, max_elapsed, lo=next_event) - events_before_max = elapsed[elapsed < max_elapsed] - last_event = events_before_max[i:] - print("last event", last_event, "next event", next_event) - print("elapsed ", elapsed, "max_elapsed", max_elapsed) + last_event = bisect_left(elapsed, max_elapsed, lo=next_event) + # range from next event (i+1) to last event (end of the lookahead time) temporal_evs = np.arange(next_event, last_event) - print("temp_ev ", temporal_evs) if my_cluster != 0: # If we are in a cluster, consider only events that are not already in the cluster temporal_evs = temporal_evs[vcl[temporal_evs] != my_cluster] if len(temporal_evs) == 0: - print("no events in time window :(") continue # ------------------------------------ @@ -243,12 +236,11 @@ def decluster(self, catalogue, config) -> Tuple[np.ndarray, np.ndarray]: # ------------------------------------ my_biggest_idx = clus_biggest_idx[my_cluster] - print("biggest idx:", my_biggest_idx) bg_ev_for_dist = i if not_classified else my_biggest_idx dist_to_recent = distance(catalogue.data['latitude'][i], catalogue.data['longitude'][i], depth[i], catalogue.data['latitude'][temporal_evs], catalogue.data['longitude'][temporal_evs], depth[temporal_evs]) dist_to_biggest = distance(catalogue.data['latitude'][bg_ev_for_dist], catalogue.data['longitude'][bg_ev_for_dist], depth[bg_ev_for_dist], catalogue.data['latitude'][temporal_evs], catalogue.data['longitude'][temporal_evs], depth[temporal_evs]) - + if look_ahead_days == min_lookahead_days: l_big = dist_to_biggest == 0 # all false l_recent = dist_to_recent <= zone_noclust[my_mag] @@ -258,12 +250,7 @@ def decluster(self, catalogue, config) -> Tuple[np.ndarray, np.ndarray]: l_recent = dist_to_recent <= zone_clust[my_mag] spatial_evs = np.logical_or(l_recent, l_big) - if not any(spatial_evs): - print("no spatial_evs, something's gone horribly wrong!") - print("recent ", l_recent) - print("big ", l_big) - # self.debug_print() continue # ------------------------------------ @@ -278,7 +265,6 @@ def decluster(self, catalogue, config) -> Tuple[np.ndarray, np.ndarray]: # so vcl[events_in_any_cluster] is independent from vcl[i] candidates = temporal_evs[spatial_evs] # eqs that fit spatial and temporal criterion - print("candidate events:",candidates) events_in_any_cluster = candidates[vcl[candidates] != 0] # eqs which are already related with a cluster events_in_no_cluster = candidates[vcl[candidates] == 0] # eqs that are not already in a cluster @@ -327,21 +313,17 @@ def decluster(self, catalogue, config) -> Tuple[np.ndarray, np.ndarray]: # attach clustnumber to catalogue yet unrelated to a cluster vcl[events_in_no_cluster] = my_cluster - # set mainshock index for all events not in a cluster to be 1 also - # i.e. an independent event counts as a mainshock - msi[events_in_no_cluster] = 1 + # set mainshock index for all events not in a cluster to be 1 also + # i.e. 
an independent event counts as a mainshock + msi[vcl == 0] = 1 - # for each cluster, identify mainshock - clusters = np.unique(vcl) - for cluster_no in clusters: - cluster_ids = ev_id[vcl == cluster_no] - biggest_mag_idx = np.where(np.max(mags[vcl == cluster_no])) - ms_id = cluster_ids[biggest_mag_idx] - #ms_id = np.where(mags == biggest_mag, cluster_ids))[-1] - #ms_id = np.asarray(np.max(mags[vcl = cluster_no]).nonzero() - msi[ms_id] = 1 - - print(vcl) + # for each cluster, identify mainshock + clusters = np.unique(vcl[vcl != 0]) + for cluster_no in clusters: + cluster_ids = ev_id[vcl == cluster_no] + biggest_mag_idx = np.where(np.max(mags[vcl == cluster_no])) + ms_id = cluster_ids[biggest_mag_idx] + msi[ms_id] = 1 return vcl, msi @@ -378,8 +360,6 @@ def get_zone_distances_per_mag(mags: np.ndarray, rfact: float, formula: Callable clust_km[clust_km > max_interact_km] = max_interact_km zone_noclust = dict(zip(unique_mags, noclust_km)) zone_clust = dict(zip(unique_mags, clust_km)) - print("zone_noclust ", zone_noclust) - print("zone_clust ", zone_clust) return zone_noclust, zone_clust diff --git a/openquake/hmtk/tests/seismicity/declusterer/dec_reasenberg_test.py b/openquake/hmtk/tests/seismicity/declusterer/dec_reasenberg_test.py index 2be9a02b4894..9a8bc254014a 100644 --- a/openquake/hmtk/tests/seismicity/declusterer/dec_reasenberg_test.py +++ b/openquake/hmtk/tests/seismicity/declusterer/dec_reasenberg_test.py @@ -74,21 +74,19 @@ def setUp(self): def test_dec_reasenberg(self): # Testing the Reasenberg algorithm - config = { 'taumin' : 1.0, # look ahead time for not clustered events, days - 'taumax' : 20.0, # maximum look ahead time for clustered events, days + config = { 'taumin' : 40.0, # look ahead time for not clustered events, days + 'taumax' : 50.0, # maximum look ahead time for clustered events, days 'P' : 0.95, # confidence level that this is next event in sequence 'xk' : 0.5, # factor used with xmeff to define magnitude cutoff 'xmeff' : 1.5, # magnitude effective, used with xk to define magnitude cutoff - 'rfact' : 20, # factor for interaction radius for dependent events + 'rfact' : 10, # factor for interaction radius for dependent events 'horiz_error' : .5, # epicenter error, km. if unspecified or None, it is pulled from the catalogue 'depth_error' : 2.0, # depth error, km. 
if unspecified or None, it is pulled from the catalogue - 'interaction_formula' : 'Reasenberg1985', # either `Reasenberg1985` or `WellsCoppersmith1994` + 'interaction_formula' : 'WellsCoppersmith1994', # either `Reasenberg1985` or `WellsCoppersmith1994` 'max_interaction_dist' : 100 } # Instantiate the declusterer and process the sample catalogue dec = Reasenberg() vcl, flagvector = dec.decluster(self.cat, config) - print('vcl:', vcl) - print('flagvector:', flagvector, self.cat.data['flag']) - np.testing.assert_allclose(flagvector, self.cat.data['flag']) + np.testing.assert_allclose(1-flagvector, self.cat.data['flag']) From f32b8bc90c47d3ada7b4141881486e3930e631e0 Mon Sep 17 00:00:00 2001 From: Kirsty Bayliss Date: Mon, 12 Jun 2023 11:33:37 +0200 Subject: [PATCH 05/10] Fixed so that mainshock flag returns 0 for mainshocks (previously returned 1, now consistent with other methods) --- openquake/hmtk/seismicity/declusterer/dec_reasenberg.py | 1 + .../hmtk/tests/seismicity/declusterer/dec_reasenberg_test.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/openquake/hmtk/seismicity/declusterer/dec_reasenberg.py b/openquake/hmtk/seismicity/declusterer/dec_reasenberg.py index 720fe253aeca..fcad8cdd1653 100644 --- a/openquake/hmtk/seismicity/declusterer/dec_reasenberg.py +++ b/openquake/hmtk/seismicity/declusterer/dec_reasenberg.py @@ -325,6 +325,7 @@ def decluster(self, catalogue, config) -> Tuple[np.ndarray, np.ndarray]: ms_id = cluster_ids[biggest_mag_idx] msi[ms_id] = 1 + msi = 1 - msi return vcl, msi diff --git a/openquake/hmtk/tests/seismicity/declusterer/dec_reasenberg_test.py b/openquake/hmtk/tests/seismicity/declusterer/dec_reasenberg_test.py index 9a8bc254014a..ff97e6a6b33f 100644 --- a/openquake/hmtk/tests/seismicity/declusterer/dec_reasenberg_test.py +++ b/openquake/hmtk/tests/seismicity/declusterer/dec_reasenberg_test.py @@ -88,5 +88,5 @@ def test_dec_reasenberg(self): # Instantiate the declusterer and process the sample catalogue dec = Reasenberg() vcl, flagvector = dec.decluster(self.cat, config) - np.testing.assert_allclose(1-flagvector, self.cat.data['flag']) + np.testing.assert_allclose(flagvector, self.cat.data['flag']) From c2f6d29c1cf8333ac4591c1547299e8a1b34197e Mon Sep 17 00:00:00 2001 From: Kirsty Bayliss Date: Mon, 12 Jun 2023 13:14:28 +0200 Subject: [PATCH 06/10] tidying --- .../seismicity/declusterer/dec_reasenberg.py | 23 +++---------------- 1 file changed, 3 insertions(+), 20 deletions(-) diff --git a/openquake/hmtk/seismicity/declusterer/dec_reasenberg.py b/openquake/hmtk/seismicity/declusterer/dec_reasenberg.py index fcad8cdd1653..5c317b962989 100644 --- a/openquake/hmtk/seismicity/declusterer/dec_reasenberg.py +++ b/openquake/hmtk/seismicity/declusterer/dec_reasenberg.py @@ -40,7 +40,7 @@ class Reasenberg(BaseCatalogueDecluster): 90(B7), pp.5479-5495. Declustering code originally converted to MATLAB by A. Allman. - Then, highly modified and converted to Python by CG Reyes. Further modified by K Bayliss. + Then, highly modified and converted to Python by CG Reyes. # default_config = dict(taumin=1.0, # tau(t==0) @@ -122,18 +122,6 @@ def decluster(self, catalogue, config) -> Tuple[np.ndarray, np.ndarray]: assert np.all(elapsed[1:] >= elapsed[:-1]), "catalogue needs to be in ascending date order" - # easy-access variables - ### What even is dmethod? This is literally never explained... 
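# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative, not part of this patch). The declusterer
# is driven entirely by a config dict, as in the test and decorator defaults
# above; assuming `cat` is an already-loaded openquake.hmtk Catalogue, a call
# might look like:
from openquake.hmtk.seismicity.declusterer.dec_reasenberg import Reasenberg

example_config = {'taumin': 1.0, 'taumax': 10.0, 'P': 0.95, 'xk': 0.5,
                  'xmeff': 1.5, 'rfact': 10, 'horiz_error': 1.5,
                  'depth_error': 2.0,
                  'interaction_formula': 'Reasenberg1985',
                  'max_interaction_dist': 100}
vcl, flagvector = Reasenberg().decluster(cat, example_config)
# vcl[i] gives the cluster number of event i (0 = not clustered);
# flagvector flags mainshocks (see PATCH 05 for the 0/1 convention).
# ---------------------------------------------------------------------------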
- #if config.get('dmethod', 'gc') == 'gc': - # surf_pos = (catalogue['latitude'], catalogue['longitude']) - # event_distance = event_gc_distance - - #elif config['dmethod'] == 'p2p': - # surf_pos = geodetic_to_ecef(catalogue['latitude'], catalogue['longitude']) # assumes at surface - # event_distance = event_p2p_distance - #else: - # raise ValueError("unknown configuration dmethod. it should be 'gc' or 'p2p'") - mags = catalogue['magnitude'] if catalogue['depth'].size > 0: depth = catalogue.data.get('depth', np.zeros(1)) @@ -183,22 +171,18 @@ def decluster(self, catalogue, config) -> Tuple[np.ndarray, np.ndarray]: if not_classified: # this event is not associated with a cluster, yet - # self.debug_print(i, ' is not in a cluster') look_ahead_days = min_lookahead_days elif my_mag >= clusmaxmag[my_cluster]: # this is the biggest event in this cluster, so far (or equal to it). # note, if this is now the biggest, then the cluster range collapses into its radius - # self.debug_print(f'{i} is the biggest event of cluster M={my_mag}') clusmaxmag[my_cluster] = my_mag clus_biggest_idx[my_cluster] = i look_ahead_days = min_lookahead_days - # time between largest event in cluster and this event is 0, so use min_lookahead_days (rather than 0). else: # this event is already tied to a cluster, but is not the largest event - #self.debug_print(i, ' is already in cluster, but not biggest') idx_biggest = clus_biggest_idx[my_cluster] days_since_biggest = elapsed[i] - elapsed[idx_biggest] # set new look_ahead_days (function of time from largest event in cluster) @@ -208,7 +192,7 @@ def decluster(self, catalogue, config) -> Tuple[np.ndarray, np.ndarray]: config['xmeff'], config['P']) - # look_ahead_days should be > min and < max to prevent this running forever... + # look_ahead_days should be > min and < max to prevent this running forever. look_ahead_days = np.clip(look_ahead_days, min_lookahead_days, max_lookahead_days) @@ -218,7 +202,6 @@ def decluster(self, catalogue, config) -> Tuple[np.ndarray, np.ndarray]: # i + 1 is returning i.... next_event = i + 1 # find location of last event between elapsed and max_elapsed - # This is just returning the last event though. No wonder my code is confused. last_event = bisect_left(elapsed, max_elapsed, lo=next_event) # range from next event (i+1) to last event (end of the lookahead time) @@ -271,7 +254,7 @@ def decluster(self, catalogue, config) -> Tuple[np.ndarray, np.ndarray]: # if this cluster overlaps with any other cluster, then merge them # assign every event in all related clusters to the same (lowest) cluster number # set this cluster's maximum magnitude "clusmaxmag" to the largest magnitude of all combined events - # set this cluster's clus_biggest_idx to the (most recent ??) 
largest event of all combined events + # set this cluster's clus_biggest_idx to the largest event of all combined events # Flag largest event in each cluster if len(events_in_any_cluster) > 0: From 2d6289901e2cba054908df3e69963dd7b3a600e6 Mon Sep 17 00:00:00 2001 From: Kirsty Bayliss Date: Tue, 1 Aug 2023 16:39:16 +0200 Subject: [PATCH 07/10] add nni to enhanced outputs of Zaliapin declustering --- openquake/hmtk/seismicity/declusterer/dec_zaliapin.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/openquake/hmtk/seismicity/declusterer/dec_zaliapin.py b/openquake/hmtk/seismicity/declusterer/dec_zaliapin.py index f1ec7fb6bcd8..6e881bf3e4d0 100644 --- a/openquake/hmtk/seismicity/declusterer/dec_zaliapin.py +++ b/openquake/hmtk/seismicity/declusterer/dec_zaliapin.py @@ -15,12 +15,7 @@ BaseCatalogueDecluster, DECLUSTERER_METHODS) from openquake.hazardlib.geo.geodetic import geodetic_distance, distance, _prepare_coords - -#from openquake.hmtk.seismicity.utils import relative_time - from sklearn import mixture -#from pymap3d import geodetic2ned - @DECLUSTERER_METHODS.add( "decluster", @@ -76,8 +71,7 @@ def decluster(self, catalogue, config): if 'output_nearest_neighbor_distances' in config and config['output_nearest_neighbor_distances']: - return probability_of_independence, nnd, ms_flag, root, ld -ì + return probability_of_independence, nnd, nni, ms_flag, root, ld cluster_index = 1 - ms_flag return root, cluster_index From f1a9abec0b7926b4439cc0fc2b1b7573afaa5a6a Mon Sep 17 00:00:00 2001 From: Kirsty Bayliss Date: Tue, 7 Nov 2023 12:03:19 +0100 Subject: [PATCH 08/10] small updates --- .../parsers/catalogue/csv_catalogue_parser.py | 10 ++++---- .../hmtk/parsers/catalogue/gcmt_ndk_parser.py | 1 + openquake/hmtk/seismicity/catalogue.py | 11 ++++---- .../seismicity/declusterer/dec_zaliapin.py | 14 +++++++---- openquake/hmtk/seismicity/gcmt_catalogue.py | 13 +++++++--- openquake/hmtk/seismicity/occurrence/utils.py | 25 +++++++++++++------ 6 files changed, 48 insertions(+), 26 deletions(-) diff --git a/openquake/hmtk/parsers/catalogue/csv_catalogue_parser.py b/openquake/hmtk/parsers/catalogue/csv_catalogue_parser.py index 346d361c3f0f..a53615eebf4e 100644 --- a/openquake/hmtk/parsers/catalogue/csv_catalogue_parser.py +++ b/openquake/hmtk/parsers/catalogue/csv_catalogue_parser.py @@ -162,9 +162,9 @@ class CsvCatalogueWriter(BaseCatalogueWriter): # the preferred output order is given as a list here OUTPUT_LIST = ['eventID', 'Agency', 'year', 'month', 'day', 'hour', 'minute', 'second', 'timeError', 'longitude', - 'latitude', 'SemiMajor90', 'SemiMinor90', 'ErrorStrike', - 'depth', 'depthError', 'magnitude', 'sigmaMagnitude', - 'magnitudeType'] + 'latitude', 'semimajor90', 'semiminor90', 'ErrorStrike', + 'depth', 'depth_error', 'magnitude', 'sigmaMagnitude', + 'magnitudeType', 'str1', 'dip1', 'rake1', 'str2', 'dip2', 'rake2' ] def write_file(self, catalogue, flag_vector=None, magnitude_table=None): ''' @@ -241,8 +241,8 @@ class CsvGCMTCatalogueWriter(CsvCatalogueWriter): OUTPUT_LIST = ['eventID', 'centroidID', 'Agency', 'year', 'month', 'day', 'hour', 'minute', 'second', 'timeError', 'longitude', 'latitude', 'depth', 'depthError', 'magnitude', - 'sigmaMagnitude', 'magnitudeType', 'moment', 'SemiMajor90', - 'SemiMinor90', 'ErrorStrike', 'strike1', 'rake1', 'dip1', + 'sigmaMagnitude', 'magnitudeType', 'moment', 'semimajor90', + 'semiminor90', 'ErrorStrike', 'strike1', 'rake1', 'dip1', 'strike2', 'rake2', 'dip2', 'f_clvd', 'e_rel', 'eigenvalue_b', 'azimuth_b', 'plunge_b', 
'eigenvalue_p', 'azimuth_p', 'plunge_p', diff --git a/openquake/hmtk/parsers/catalogue/gcmt_ndk_parser.py b/openquake/hmtk/parsers/catalogue/gcmt_ndk_parser.py index fc18019a0e7f..5a54f98ec22c 100644 --- a/openquake/hmtk/parsers/catalogue/gcmt_ndk_parser.py +++ b/openquake/hmtk/parsers/catalogue/gcmt_ndk_parser.py @@ -260,6 +260,7 @@ def to_hmtk(self, use_centroid=True): gcmt.hypocentre.latitude self.catalogue.data['depth'][iloc] = \ gcmt.hypocentre.depth + # Moment, magnitude and relative errors self.catalogue.data['moment'][iloc] = gcmt.moment self.catalogue.data['magnitude'][iloc] = gcmt.magnitude diff --git a/openquake/hmtk/seismicity/catalogue.py b/openquake/hmtk/seismicity/catalogue.py index 4a3132bfb627..ffa7c1daff87 100644 --- a/openquake/hmtk/seismicity/catalogue.py +++ b/openquake/hmtk/seismicity/catalogue.py @@ -65,8 +65,9 @@ class Catalogue(object): FLOAT_ATTRIBUTE_LIST = [ 'second', 'timeError', 'longitude', 'latitude', - 'SemiMajor90', 'SemiMinor90', 'ErrorStrike', 'depth', - 'depthError', 'magnitude', 'sigmaMagnitude'] + 'semimajor90', 'semiminor90', 'ErrorStrike', 'depth', + 'depth_error', 'magnitude', 'sigmaMagnitude', 'str1', + 'dip1', 'rake1', 'str2', 'dip2', 'rake2'] INT_ATTRIBUTE_LIST = ['year', 'month', 'day', 'hour', 'minute', 'flag'] @@ -81,9 +82,9 @@ class Catalogue(object): SORTED_ATTRIBUTE_LIST = [ 'eventID', 'Agency', 'year', 'month', 'day', 'hour', 'minute', 'second', 'timeError', 'longitude', 'latitude', - 'SemiMajor90', 'SemiMinor90', 'ErrorStrike', - 'depth', 'depthError', 'magnitude', 'sigmaMagnitude', - 'magnitudeType'] + 'semimajor90', 'semiminor90', 'ErrorStrike', + 'depth', 'depth_error', 'magnitude', 'sigmaMagnitude', + 'magnitudeType', 'str1', 'dip1', 'rake1', 'str2', 'dip2', 'rake2'] def __init__(self): """ diff --git a/openquake/hmtk/seismicity/declusterer/dec_zaliapin.py b/openquake/hmtk/seismicity/declusterer/dec_zaliapin.py index 6e881bf3e4d0..4caffa7ba174 100644 --- a/openquake/hmtk/seismicity/declusterer/dec_zaliapin.py +++ b/openquake/hmtk/seismicity/declusterer/dec_zaliapin.py @@ -62,12 +62,16 @@ def decluster(self, catalogue, config): probability_of_independence = self.gaussianmixturefit(nnd) - if 'threshold' in config and isinstance(config['threshold'], (int,float)): - threshold = config['threshold'] - root, ld, ms_flag = cluster_number(catalogue, nni, probability_of_independence, threshold = 0.5, stochastic = False) - - else: + if 'threshold' in config and config['threshold'] == 'stochastic': root, ld, ms_flag = cluster_number(catalogue, nni, probability_of_independence, stochastic = True) + + else: + if 'threshold' in config: + threshold = config['threshold'] + else: + threshold = 0.5 + root, ld, ms_flag = cluster_number(catalogue, nni, probability_of_independence, threshold = threshold, stochastic = False) + if 'output_nearest_neighbor_distances' in config and config['output_nearest_neighbor_distances']: diff --git a/openquake/hmtk/seismicity/gcmt_catalogue.py b/openquake/hmtk/seismicity/gcmt_catalogue.py index fd74977a7e81..647b7d762ee9 100644 --- a/openquake/hmtk/seismicity/gcmt_catalogue.py +++ b/openquake/hmtk/seismicity/gcmt_catalogue.py @@ -455,7 +455,8 @@ def serialise_to_hmtk_csv(self, filename, centroid_location=True): header_list = ['eventID', 'Agency', 'year', 'month', 'day', 'hour', 'minute', 'second', 'timeError', 'longitude', 'latitude', 'SemiMajor90', 'SemiMinor90', 'ErrorStrike', - 'depth', 'depthError', 'magnitude', 'sigmaMagnitude'] + 'depth', 'depthError', 'magnitude', 'sigmaMagnitude', 'magnitudeType', + 'str1', 
'dip1', 'rake1', 'str2', 'dip2', 'rake2'] with open(filename, 'wt') as fid: writer = csv.DictWriter(fid, fieldnames=header_list) headers = dict((header, header) for header in header_list) @@ -470,8 +471,13 @@ def serialise_to_hmtk_csv(self, filename, centroid_location=True): 'ErrorStrike': None, 'magnitude': tensor.magnitude, 'sigmaMagnitude': None, - 'depth': None, - 'depthError': None} + 'magnitudeType': 'Mw', + 'str1': tensor.nodal_planes.nodal_plane_1['strike'], + 'dip1': tensor.nodal_planes.nodal_plane_1['dip'], + 'rake1': tensor.nodal_planes.nodal_plane_1['rake'], + 'str2': tensor.nodal_planes.nodal_plane_2['strike'], + 'dip2': tensor.nodal_planes.nodal_plane_2['dip'], + 'rake2': tensor.nodal_planes.nodal_plane_2['rake']} if centroid_location: # Time and location come from centroid @@ -503,6 +509,7 @@ def serialise_to_hmtk_csv(self, filename, centroid_location=True): cmt_dict['latitude'] = tensor.hypocentre.latitude cmt_dict['depth'] = tensor.hypocentre.depth cmt_dict['depthError'] = None + writer.writerow(cmt_dict) print('done!') diff --git a/openquake/hmtk/seismicity/occurrence/utils.py b/openquake/hmtk/seismicity/occurrence/utils.py index e422bd8878ff..50da61b54cab 100644 --- a/openquake/hmtk/seismicity/occurrence/utils.py +++ b/openquake/hmtk/seismicity/occurrence/utils.py @@ -217,14 +217,15 @@ def get_completeness_counts(catalogue, completeness, d_m): * n_obs - number of events in completeness period """ mmax_obs = np.max(catalogue.data["magnitude"]) - # thw line below was added by Nick Ackerley but it breaks the tests + # thw line below was added by Nick Ackerley but it breaks the tests # catalogue.data["dtime"] = catalogue.get_decimal_time() if mmax_obs > np.max(completeness[:, 1]): cmag = np.hstack([completeness[:, 1], mmax_obs]) else: cmag = completeness[:, 1] + #print(cmag) cyear = np.hstack([catalogue.end_year + 1, completeness[:, 0]]) - + #print(cyear) # When the magnitude value is on the bin edge numpy's histogram function # may assign randomly to one side or the other based on the floating # point value. As catalogues are rounded to the nearest 0.1 this occurs @@ -247,10 +248,18 @@ def get_completeness_counts(catalogue, completeness, d_m): sel_mags, bins=m_bins)[0].astype(float) count_years[m_idx[:-1]] += float(nyrs) - # Removes any zero rates greater than - last_loc = np.where(count_rates > 0)[0][-1] - n_obs = count_rates[:(last_loc + 1)] - t_per = count_years[:(last_loc + 1)] - cent_mag = (master_bins[:-1] + master_bins[1:]) / 2. - cent_mag = np.around(cent_mag[:(last_loc + 1)], 3) + # Check in case all rates are 0. + if np.count_nonzero(count_rates) == 0: + #print("all rates are zero") + cent_mag = (master_bins[:-1] + master_bins[1:]) / 2. + n_obs = [0]*len(cent_mag) + t_per = [count_years[-1]]*len(cent_mag) + + else: + # Removes any zero rates greater than + last_loc = np.where(count_rates > 0)[0][-1] + n_obs = count_rates[:(last_loc + 1)] + t_per = count_years[:(last_loc + 1)] + cent_mag = (master_bins[:-1] + master_bins[1:]) / 2. 
+ cent_mag = np.around(cent_mag[:(last_loc + 1)], 3) return cent_mag, t_per, n_obs From d555f23b6b4b7d9b5b8b149df1d4059bffeac7c3 Mon Sep 17 00:00:00 2001 From: Kirsty Bayliss Date: Tue, 13 Feb 2024 17:19:21 +0100 Subject: [PATCH 09/10] tidying declustering functions --- .../seismicity/declusterer/dec_reasenberg.py | 162 +++++++++++------- .../seismicity/declusterer/dec_zaliapin.py | 158 ++++++++++++----- 2 files changed, 221 insertions(+), 99 deletions(-) diff --git a/openquake/hmtk/seismicity/declusterer/dec_reasenberg.py b/openquake/hmtk/seismicity/declusterer/dec_reasenberg.py index 5c317b962989..e646ba60ad72 100644 --- a/openquake/hmtk/seismicity/declusterer/dec_reasenberg.py +++ b/openquake/hmtk/seismicity/declusterer/dec_reasenberg.py @@ -1,6 +1,54 @@ # -*- coding: utf-8 -*- # vim: tabstop=4 shiftwidth=4 softtabstop=4 +# -*- coding: utf-8 -*- +# vim: tabstop=4 shiftwidth=4 softtabstop=4 + +# +# LICENSE +# +# Copyright (C) 2010-2024 GEM Foundation, G. Weatherill, M. Pagani, +# D. Monelli. +# +# The Hazard Modeller's Toolkit is free software: you can redistribute +# it and/or modify it under the terms of the GNU Affero General Public +# License as published by the Free Software Foundation, either version +# 3 of the License, or (at your option) any later version. +# +# You should have received a copy of the GNU Affero General Public License +# along with OpenQuake. If not, see +# +# DISCLAIMER +# +# The software Hazard Modeller's Toolkit (openquake.hmtk) provided herein +# is released as a prototype implementation on behalf of +# scientists and engineers working within the GEM Foundation (Global +# Earthquake Model). +# +# It is distributed for the purpose of open collaboration and in the +# hope that it will be useful to the scientific, engineering, disaster +# risk and software design communities. +# +# The software is NOT distributed as part of GEM’s OpenQuake suite +# (https://www.globalquakemodel.org/tools-products) and must be considered as a +# separate entity. The software provided herein is designed and implemented +# by scientific staff. It is not developed to the design standards, nor +# subject to same level of critical review by professional software +# developers, as GEM’s OpenQuake software suite. +# +# Feedback and contribution to the software is welcome, and can be +# directed to the hazard scientific staff of the GEM Model Facility +# (hazard@globalquakemodel.org). +# +# The Hazard Modeller's Toolkit (openquake.hmtk) is therefore distributed +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +# for more details. +# +# The GEM Foundation, and the authors of the software, assume no +# liability for use of the software. + + """ Module :mod:`openquake.hmtk.seismicity.declusterer.dec_reasenberg` defines the Reasenberg declustering algorithm @@ -16,7 +64,6 @@ from openquake.hmtk.seismicity.utils import haversine - @DECLUSTERER_METHODS.add( "decluster", taumin=1.0, # look ahead time for not clustered events, days @@ -30,6 +77,7 @@ interaction_formula='Reasenberg1985', # either `Reasenberg1985` or `WellsCoppersmith1994` max_interaction_dist=np.inf # km, some studies limit to crustal thickness (ex. 30) ) + class Reasenberg(BaseCatalogueDecluster): """ This class implements the Reasenberg algorithm as described in @@ -40,31 +88,24 @@ class Reasenberg(BaseCatalogueDecluster): 90(B7), pp.5479-5495. Declustering code originally converted to MATLAB by A. Allman. 
- Then, highly modified and converted to Python by CG Reyes. - - - # default_config = dict(taumin=1.0, # tau(t==0) - # taumax=10., # tau(t -> inf), computational simplification should be scaled to local bg rate - # P=0.95, - # xk=0.5, - # xmeff=1.5, - # rfact=10., - # horiz_error=1.5, - # depth_error=2., - # dmethod='gc', ## REMOVED - # interaction_formula='Reasenberg1985', - # max_interaction_dist=30 # km, limit to crustal distance - # ) + Then, highly modified and converted to Python by CG Reyes. + This implementation is similar to that in Zmap (https://github.com/CelsoReyes/zmap7) """ def __init__(self): self.verbose = False self.interaction_formulas = {'Reasenberg1985': lambda m: 0.011 * 10 ** (0.4 * m), - 'WellsCoppersmith1994': lambda m: 0.01 * 10 ** (0.5 * m)} # Helmstetter (SRL) 2007 + # Helmstetter (SRL) 2007: + 'WellsCoppersmith1994': lambda m: 0.01 * 10 ** (0.5 * m)} @staticmethod - def clust_look_ahead_time(mag_big: float, dt_big: np.ndarray, xk: float, xmeff: float, p: float) -> np.ndarray: + def clust_look_ahead_time( + mag_big: float, + dt_big: np.ndarray, + xk: float, + xmeff: float, + p: float) -> np.ndarray: """ CLUSTLOOKAHEAD calculate look ahead time for clustered events :param mag_big: @@ -72,7 +113,8 @@ def clust_look_ahead_time(mag_big: float, dt_big: np.ndarray, xk: float, xmeff: :param dt_big: days difference between biggest event and this one :param xk: - factor used with xmeff to define magnitude cutoff - increases effective magnitude during clusters + factor used with xmeff to define magnitude cutoff + - increases effective magnitude during clusters :param xmeff: magnitude effective, used with xk to define magnitude cutoff :param p: @@ -112,7 +154,6 @@ def decluster(self, catalogue, config) -> Tuple[np.ndarray, np.ndarray]: self.verbose = config.get('verbose', self.verbose) # Get relevant parameters - #neq = catalogue.get_number_events() # Number of earthquakes neq = len(catalogue.data['latitude']) min_lookahead_days = config['taumin'] max_lookahead_days = config['taumax'] @@ -128,8 +169,10 @@ def decluster(self, catalogue, config) -> Tuple[np.ndarray, np.ndarray]: else: depth = np.array([0]*neq) - # Errors are determined 1st by the config. If this value doesn't exist or is None, then get the - # error values from the catalog. If errors do not exist within the catalog, then set the errors to 0. + # Errors are determined 1st by the config. + # If this value doesn't exist or is None, then get the error values from the catalog. + # If errors do not exist within the catalog, then set the errors to 0. + if config.get('horiz_error', None) is None: horiz_error = catalogue.data.get('horizError', np.zeros(1)) else: @@ -143,10 +186,9 @@ def decluster(self, catalogue, config) -> Tuple[np.ndarray, np.ndarray]: # Pre-allocate cluster index vectors vcl = np.zeros(neq, dtype=int).flatten() msi = np.zeros(neq, dtype=int).flatten() - #ev_id = np.zeros(neq, dtype=int).flatten() ev_id = np.arange(0, neq) # set the interaction zones, in km - # Reasenberg 1987 or alternate version: Wells & Coppersmith 1994 / Helmstetter (SRL) 2007q + # Reasenberg 1987 or alternate version: Wells & Coppersmith 1994 / Helmstetter (SRL) 2007 zone_noclust, zone_clust = self.get_zone_distances_per_mag( mags=mags, rfact=config['rfact'], @@ -176,11 +218,12 @@ def decluster(self, catalogue, config) -> Tuple[np.ndarray, np.ndarray]: elif my_mag >= clusmaxmag[my_cluster]: # this is the biggest event in this cluster, so far (or equal to it). 
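# ---------------------------------------------------------------------------
# Worked example (illustrative only) of the interaction distances given by the
# two formulas registered in `interaction_formulas` above, for an M 5.0 event:
r_reasenberg1985 = 0.011 * 10 ** (0.4 * 5.0)   # ~1.10 km
r_wellscs1994 = 0.01 * 10 ** (0.5 * 5.0)       # ~3.16 km
# rfact (e.g. 10) then scales the radius used for dependent events, and
# get_zone_distances_per_mag() caps the result at max_interaction_dist.
# ---------------------------------------------------------------------------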
- # note, if this is now the biggest, then the cluster range collapses into its radius + # note, if this is now the biggest event, then the cluster range collapses to its radius clusmaxmag[my_cluster] = my_mag clus_biggest_idx[my_cluster] = i look_ahead_days = min_lookahead_days - # time between largest event in cluster and this event is 0, so use min_lookahead_days (rather than 0). + # time between largest event in cluster and this event is 0, + # so use min_lookahead_days (rather than 0). else: # this event is already tied to a cluster, but is not the largest event idx_biggest = clus_biggest_idx[my_cluster] @@ -199,7 +242,6 @@ def decluster(self, catalogue, config) -> Tuple[np.ndarray, np.ndarray]: # extract eqs that fit interaction time window -------------- max_elapsed = elapsed[i] + look_ahead_days - # i + 1 is returning i.... next_event = i + 1 # find location of last event between elapsed and max_elapsed last_event = bisect_left(elapsed, max_elapsed, lo=next_event) @@ -221,8 +263,18 @@ def decluster(self, catalogue, config) -> Tuple[np.ndarray, np.ndarray]: my_biggest_idx = clus_biggest_idx[my_cluster] bg_ev_for_dist = i if not_classified else my_biggest_idx - dist_to_recent = distance(catalogue.data['latitude'][i], catalogue.data['longitude'][i], depth[i], catalogue.data['latitude'][temporal_evs], catalogue.data['longitude'][temporal_evs], depth[temporal_evs]) - dist_to_biggest = distance(catalogue.data['latitude'][bg_ev_for_dist], catalogue.data['longitude'][bg_ev_for_dist], depth[bg_ev_for_dist], catalogue.data['latitude'][temporal_evs], catalogue.data['longitude'][temporal_evs], depth[temporal_evs]) + dist_to_recent = distance(catalogue.data['latitude'][i], + catalogue.data['longitude'][i], + depth[i], + catalogue.data['latitude'][temporal_evs], + catalogue.data['longitude'][temporal_evs], + depth[temporal_evs]) + dist_to_biggest = distance(catalogue.data['latitude'][bg_ev_for_dist], + catalogue.data['longitude'][bg_ev_for_dist], + depth[bg_ev_for_dist], + catalogue.data['latitude'][temporal_evs], + catalogue.data['longitude'][temporal_evs], + depth[temporal_evs]) if look_ahead_days == min_lookahead_days: l_big = dist_to_biggest == 0 # all false @@ -247,14 +299,18 @@ def decluster(self, catalogue, config) -> Tuple[np.ndarray, np.ndarray]: # spatial events only include events AFTER i, not i itself # so vcl[events_in_any_cluster] is independent from vcl[i] - candidates = temporal_evs[spatial_evs] # eqs that fit spatial and temporal criterion - events_in_any_cluster = candidates[vcl[candidates] != 0] # eqs which are already related with a cluster - events_in_no_cluster = candidates[vcl[candidates] == 0] # eqs that are not already in a cluster + # eqs that fit spatial and temporal criterion + candidates = temporal_evs[spatial_evs] + # eqs which are already related with a cluster + events_in_any_cluster = candidates[vcl[candidates] != 0] + # eqs that are not already in a cluster + events_in_no_cluster = candidates[vcl[candidates] == 0] # if this cluster overlaps with any other cluster, then merge them # assign every event in all related clusters to the same (lowest) cluster number - # set this cluster's maximum magnitude "clusmaxmag" to the largest magnitude of all combined events - # set this cluster's clus_biggest_idx to the largest event of all combined events + # set this cluster's maximum magnitude "clusmaxmag" to the largest magnitude of + # all combined events set this cluster's clus_biggest_idx to the largest event + # of all combined events # Flag largest event in each 
cluster if len(events_in_any_cluster) > 0: @@ -275,7 +331,8 @@ def decluster(self, catalogue, config) -> Tuple[np.ndarray, np.ndarray]: events_in_my_cluster = vcl == my_cluster biggest_mag = np.max(mags[events_in_my_cluster]) - biggest_mag_idx = np.flatnonzero(np.logical_and(mags == biggest_mag, events_in_my_cluster))[-1] + biggest_mag_idx = np.flatnonzero( + np.logical_and(mags == biggest_mag, events_in_my_cluster))[-1] # reset values for other clusters clusmaxmag[related_clust_nums] = -np.inf @@ -300,15 +357,14 @@ def decluster(self, catalogue, config) -> Tuple[np.ndarray, np.ndarray]: # i.e. an independent event counts as a mainshock msi[vcl == 0] = 1 - # for each cluster, identify mainshock + # for each cluster, identify mainshock (largest event) clusters = np.unique(vcl[vcl != 0]) for cluster_no in clusters: cluster_ids = ev_id[vcl == cluster_no] biggest_mag_idx = np.where(np.max(mags[vcl == cluster_no])) ms_id = cluster_ids[biggest_mag_idx] msi[ms_id] = 1 - - msi = 1 - msi + return vcl, msi @@ -347,7 +403,15 @@ def get_zone_distances_per_mag(mags: np.ndarray, rfact: float, formula: Callable return zone_noclust, zone_clust -def relative_time(years, months, days, hours=None, minutes=None, seconds=None, datetime_unit ='D', reference_date=None): +def relative_time( + years, + months, + days, + hours=None, + minutes=None, + seconds=None, + datetime_unit ='D', + reference_date=None): """ Get elapsed days since first event in catalog :param reference_date @@ -386,25 +450,5 @@ def days_from_first_event(catalog) -> relative_time: catalog['hour'], catalog['minute'], catalog['second'].astype(int), datetime_unit='D') - - -def get_distance_errors(directional_error, src_idx, targ_idxs) -> Tuple[float, np.ndarray]: - """ - - :param directional_error: - :param src_idx: - :param targ_idxs: - :return: - """ - if directional_error is None: - directional_error = 0. - - if isinstance(directional_error, (np.ndarray,)): - src_err = directional_error[src_idx] - targ_err = directional_error[targ_idxs] - else: - src_err = directional_error - targ_err = directional_error - return src_err, targ_err diff --git a/openquake/hmtk/seismicity/declusterer/dec_zaliapin.py b/openquake/hmtk/seismicity/declusterer/dec_zaliapin.py index 4caffa7ba174..dacefd85b1db 100644 --- a/openquake/hmtk/seismicity/declusterer/dec_zaliapin.py +++ b/openquake/hmtk/seismicity/declusterer/dec_zaliapin.py @@ -1,5 +1,48 @@ # -*- coding: utf-8 -*- # vim: tabstop=4 shiftwidth=4 softtabstop=4 +# +# LICENSE +# +# Copyright (C) 2010-2024 GEM Foundation, G. Weatherill, M. Pagani, +# D. Monelli. +# +# The Hazard Modeller's Toolkit is free software: you can redistribute +# it and/or modify it under the terms of the GNU Affero General Public +# License as published by the Free Software Foundation, either version +# 3 of the License, or (at your option) any later version. +# +# You should have received a copy of the GNU Affero General Public License +# along with OpenQuake. If not, see +# +# DISCLAIMER +# +# The software Hazard Modeller's Toolkit (openquake.hmtk) provided herein +# is released as a prototype implementation on behalf of +# scientists and engineers working within the GEM Foundation (Global +# Earthquake Model). +# +# It is distributed for the purpose of open collaboration and in the +# hope that it will be useful to the scientific, engineering, disaster +# risk and software design communities. 
+# +# The software is NOT distributed as part of GEM’s OpenQuake suite +# (https://www.globalquakemodel.org/tools-products) and must be considered as a +# separate entity. The software provided herein is designed and implemented +# by scientific staff. It is not developed to the design standards, nor +# subject to same level of critical review by professional software +# developers, as GEM’s OpenQuake software suite. +# +# Feedback and contribution to the software is welcome, and can be +# directed to the hazard scientific staff of the GEM Model Facility +# (hazard@globalquakemodel.org). +# +# The Hazard Modeller's Toolkit (openquake.hmtk) is therefore distributed +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +# for more details. +# +# The GEM Foundation, and the authors of the software, assume no +# liability for use of the software. """ Module :mod:`openquake.hmtk.seismicity.declusterer.dec_zaliapin` @@ -22,13 +65,19 @@ fractal_dim=float, # spatial weighting factor b_value=float, # magnitude weighting factor ) + + class Zaliapin(BaseCatalogueDecluster): """ - Implements the declustering method of Zaliapin and Ben-Zion (2013), based on a nearest-neighbour distance metric. + Implements the declustering method of Zaliapin and Ben-Zion (2013), + based on a nearest-neighbour (space-time-magnitude) distance metric. Implemented in Python by CG Reyes. - Modified to return mainshocks (largest event per cluster) and to give the option for - stochastic declustering (defaults to this when no threshold is specified). + Modified to: + - return mainshocks (largest event per cluster) rather than first event + (NB: this increases computational time because we need to reconstruct clusters + but is more useful for PSHA and more comparable to alternative methods) + - give the option for stochastic declustering. """ @@ -39,42 +88,61 @@ def decluster(self, catalogue, config): """Zaliapin's declustering code :param catalogue: - Earthquake catalog to examine + Earthquake catalogue :param config: - Dictionary containing + Dictionary containing: 1. values for fractal dimension `fractal_dim` and Gutenberg-Richter `b-value` - 2. Method used to choose which events to keep. Use `Threshold = ` to specify a float value for probability at which to keep an event. - If `Stochastic = True` (or no threshold is provided) a stochastic declustering will be implemented. - A seed for the stochastic method can be specified with the `stoch_seed` parameter (will default to 5 if not included) - 3. Optional flag to use depths when calculating event distances. Note that the fractal dimension should correspond to the number of dimensions in the model. - (Zaliapin and Ben-Zion defaults are 1.6 for 2D and 2.4 for 3D distances and these values are generally used in the literature). + 2. Method used to choose which events to keep. + Use `Threshold = ` to specify a float value for probability + at which to keep an event. + If `Stochastic = True` (or no threshold is provided) + a stochastic declustering will be implemented. + A seed for the stochastic method can be specified with the `stoch_seed` parameter + (will default to 5 if not included) + 3. Optional flag to use depths when calculating event distances. + Note that the fractal dimension should correspond to the number of dimensions + in the model. (Zaliapin and Ben-Zion defaults are 1.6 for 2D and 2.4 for 3D + distances and these values are generally used in the literature). 
:return: - probability [0..1] that each event is a member of a cluster, and optionally (if - config['nearest_neighbor_dist'] is true) return the nearest neighbor distances, mainshock flag, root (first) event in cluster and leaf depth of cluster. + probability [0..1] that each event is a member of a cluster, + optionally if config['nearest_neighbor_dist'] is true: + - return the nearest neighbor distances, mainshock flag, + root (first) event in cluster and leaf depth of cluster. """ if 'depth' in config and 'depth' == True: - nnd, nni = self.getnnd(catalogue, config['fractal_dim'], config['b_value'], depth = True) + nnd, nni = self.getnnd(catalogue, + config['fractal_dim'], + config['b_value'], + depth = True) else: - nnd, nni = self.getnnd(catalogue, config['fractal_dim'], config['b_value'], depth = False) + nnd, nni = self.getnnd(catalogue, + config['fractal_dim'], + config['b_value'], + depth = False) probability_of_independence = self.gaussianmixturefit(nnd) if 'threshold' in config and config['threshold'] == 'stochastic': - root, ld, ms_flag = cluster_number(catalogue, nni, probability_of_independence, stochastic = True) + root, ld, ms_flag = cluster_number(catalogue, + nni, + probability_of_independence, + stochastic = True) else: if 'threshold' in config: threshold = config['threshold'] else: threshold = 0.5 - root, ld, ms_flag = cluster_number(catalogue, nni, probability_of_independence, threshold = threshold, stochastic = False) - - + root, ld, ms_flag = cluster_number(catalogue, + nni, + probability_of_independence, + threshold = threshold, + stochastic = False) - if 'output_nearest_neighbor_distances' in config and config['output_nearest_neighbor_distances']: + if 'output_nearest_neighbor_distances' in config and config['output_nearest_neighbor_distances'] == True: return probability_of_independence, nnd, nni, ms_flag, root, ld cluster_index = 1 - ms_flag return root, cluster_index @@ -108,32 +176,40 @@ def getnnd(catalogue, d_f=1.6, b=1.0, depth = False): depth[np.isnan(depth)] = 0 else: depth = np.zeros(len(catalogue.data['latitude'])) - # Is there a faster/better way to do this? Probably yes - # TODO: Figure it out! 
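# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative, not part of this patch) of the decluster()
# configuration described above, assuming `cat` is an already-loaded
# openquake.hmtk Catalogue:
from openquake.hmtk.seismicity.declusterer.dec_zaliapin import Zaliapin

example_config = {'fractal_dim': 1.6,   # 2-D distances; ~2.4 when depths are used
                  'b_value': 1.0,
                  'threshold': 0.5}     # or 'stochastic' for stochastic thinning
root, cluster_index = Zaliapin().decluster(cat, example_config)
# root[i] is the head (first) event of the cluster containing event i;
# cluster_index = 1 - ms_flag, i.e. 0 for mainshocks/independent events.
# ---------------------------------------------------------------------------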
time=[0]*len(catalogue.data['latitude']) # Change date and time data into one list of datetimes in years (ie 1980.25) for i in range(len(time)): - time[i]=datetime.datetime(catalogue.data['year'][i], catalogue.data['month'][i], catalogue.data['day'][i],catalogue.data['hour'][i], catalogue.data['minute'][i], int(catalogue.data['second'][i])) + time[i]=datetime.datetime(catalogue.data['year'][i], + catalogue.data['month'][i], + catalogue.data['day'][i], + catalogue.data['hour'][i], + catalogue.data['minute'][i], + int(catalogue.data['second'][i])) time[i]=mdates.date2num(time[i]) # date2num gives days, change to years time[i]= (time[i] -1)/365.25 - for j in range(1, len(catalogue.data['latitude'])): # Calculate spatial distance between events - dist = distance(catalogue.data['latitude'][j], catalogue.data['longitude'][j], depth[j], catalogue.data['latitude'][:j], catalogue.data['longitude'][:j], depth[:j]) + dist = distance(catalogue.data['latitude'][j], + catalogue.data['longitude'][j], + depth[j], + catalogue.data['latitude'][:j], + catalogue.data['longitude'][:j], + depth[:j]) time_diff= time[j]-time[:j] # If time difference is zero, set to very small value time_diff[time_diff == 0] = 0.0000001 # Similarly with spatial distances = 0 dist[dist < 0.1] = 0.1 - # ZBZ interevent distance is product of time_diff*(10^(b*Mi))*spat_dist^(d_f) - interevent_distance = time_diff*(10**(-b*catalogue.data['magnitude'][:j]))*(dist**d_f) - # Record index of nearest neighbour (needed to reconstruct the clusters) and the nnd (smallest nnd) + # ZBZ interevent distance = time_diff*(10^(b*Mi))*spat_dist^(d_f) + interevent_distance = time_diff*( + 10**(-b*catalogue.data['magnitude'][:j]))*(dist**d_f) + # Record index of nearest neighbour (needed to reconstruct the clusters) + # and the distance to closest neighbour (smallest nnd) nearest_index[j] = np.argmin(interevent_distance) nearest_distance[j] = np.min(interevent_distance) - ## Set zeroth event to mean NND (doesn't really matter what we do with this) nearest_distance[0]= np.mean(nearest_distance) @@ -142,7 +218,9 @@ def getnnd(catalogue, d_f=1.6, b=1.0, depth = False): @staticmethod def gaussianmixturefit(nnd): """ - fit Gaussian mixture distribution to nearest neighbour distances (Zaliapin and Ben-Zion 2013) and calculate probability of independence from mixture fit. + fit Gaussian mixture distribution to nearest neighbour distances + (Zaliapin and Ben-Zion 2013) and calculate probability of + independence from mixture fit. 
:param nnd: nearest neighbour distances for each event @@ -158,13 +236,14 @@ def gaussianmixturefit(nnd): clf = mixture.GaussianMixture(n_components=2, covariance_type='full') clf.fit(samples) - probability_of_independence = np.zeros_like(log_nearest_neighbor_dist) - # calculate the probabilities that it belongs to each of the gaussian curves (2 columns of probabilities) + # calculate the probabilities that it belongs to each of the gaussian curves + # (2 columns of probabilities) prediction = clf.predict_proba(samples) - # keep only the chance that this is a background event (the column matching the gaussian model's largest mean) + # keep only the chance that this is a background event + #(the column matching the gaussian model's largest mean) probability_of_independence[np.isfinite(log_nearest_neighbor_dist)] = prediction[:, np.argmax(clf.means_)] probability_of_independence[0] = 1 # first event, by definition, cannot depend upon prior events @@ -172,13 +251,10 @@ def gaussianmixturefit(nnd): return probability_of_independence - -# TODO: Option to skip the reconstruction phase - then this should be directly comparable to the original version - -## function to reconstruct clusters - need this to identify mainshocks rather than earliest events - def cluster_number(catalogue, nearest_index, prob_ind, threshold = 0.5, stochastic = True, stoch_seed = 5): """ + function to reconstruct clusters + - need this to identify mainshocks rather than earliest independent events Identifies head node (first event in cluster) and number of iterations of while loop to get to it (ie how deep in cluster event is; 1st, 2nd, 3rd generation etc). @@ -192,13 +268,15 @@ def cluster_number(catalogue, nearest_index, prob_ind, threshold = 0.5, stochast Probability event is independent (output of gaussianmixturefit) :param threshold: - Threshold of independence probability at which to consider an event independent. Specified in config file or default to 0.5. + Threshold of independence probability at which to consider an event independent. + Specified in config file or default to 0.5. :param stochastic: Instead of using a fixed threshold, use a stochastic method to thin clustered events :param stoch_seed: - Seed value for stochastic thinning to assure reproducibility. Set by default if not specified. + Seed value for stochastic thinning to assure reproducibility. + Set by default if not specified. 
:return: arrays of root, event depth, mainshock flag @@ -222,7 +300,6 @@ def cluster_number(catalogue, nearest_index, prob_ind, threshold = 0.5, stochast parent = nearest_index*(1-indep_flag) for i in range(len(catalogue.data['latitude'])): - p = parent[i] if(p == 0): root[i] = i @@ -233,6 +310,7 @@ def cluster_number(catalogue, nearest_index, prob_ind, threshold = 0.5, stochast ld[i] = ld[i] +1 p = parent[p] + # Collect all unique cluster roots clust_heads = np.unique(root) MS = clust_heads.astype(int) From 190294971fcdb2c21162d49e4273331ba24c0134 Mon Sep 17 00:00:00 2001 From: Kirsty Bayliss Date: Mon, 19 Feb 2024 10:26:03 +0100 Subject: [PATCH 10/10] removing typo --- openquake/hmtk/seismicity/gcmt_catalogue.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/openquake/hmtk/seismicity/gcmt_catalogue.py b/openquake/hmtk/seismicity/gcmt_catalogue.py index 89017165b6e4..f64abd0438ed 100644 --- a/openquake/hmtk/seismicity/gcmt_catalogue.py +++ b/openquake/hmtk/seismicity/gcmt_catalogue.py @@ -558,7 +558,7 @@ def serialise_to_hmtk_csv(self, filename, centroid_location=True): 'rake1': tensor.nodal_planes.nodal_plane_1['rake'], 'str2': tensor.nodal_planes.nodal_plane_2['strike'], 'dip2': tensor.nodal_planes.nodal_plane_2['dip'], - 'rake2': tensor.nodal_planes.nodal_plane_2['rake']} + 'rake2': tensor.nodal_planes.nodal_plane_2['rake'] } if centroid_location: