Merge pull request #962 from MouseLand/jacob/max_sub_update

jacobpennington · web-flow · commit aa4c8796d211 · 2025-07-17T18:12:21.000-07:00
Jacob/max sub update
diff --git a/kilosort/clustering_qr.py b/kilosort/clustering_qr.py
@@ -17,22 +17,25 @@
 logger = logging.getLogger(__name__)
 
 
-def neigh_mat(Xd, nskip=20, n_neigh=10, max_sub=None):
+def neigh_mat(Xd, nskip=1, n_neigh=10, max_sub=25000):
     # Xd is spikes by PCA features in a local neighborhood
     # finding n_neigh neighbors of each spike to a subset of every nskip spike
 
     # n_samples is the number of spikes, dim is number of features
     n_samples, dim = Xd.shape
 
-    # subsampling the feature matrix
-    if max_sub is not None:
-        # NOTE: Rather than selecting a fixed-size subset, we adjust nskip.
-        #       This is much faster than the alternatives we've tried since it's
-        #       more-or-less constant speed for arbitrarily large tensors, and it
-        #       keeps the logic simple elsewhere in the code.
-        new_nskip = int(np.ceil((n_samples-1)/(max_sub-1)))
-        if new_nskip > nskip: nskip = new_nskip
+    # Downsample feature matrix by selecting every `nskip`-th spike
     Xsub = Xd[::nskip]
+    n1 = Xsub.shape[0]
+    # If the downsampled matrix is still larger than max_sub,
+    # downsample it further by selecting `max_sub` evenly distributed spikes.
+    if (max_sub is not None) and (n1 > max_sub):
+        n2 = n1 - max_sub
+        idx, rev_idx = subsample_idx(n1, n2)
+        Xsub = Xsub[idx]
+    else:
+        rev_idx = None
+
     # n_nodes are the # subsampled spikes
     n_nodes = Xsub.shape[0]
 
@@ -55,7 +58,10 @@ def neigh_mat(Xd, nskip=20, n_neigh=10, max_sub=None):
                      (kn.shape[0], n_nodes))                  # (shape)
 
     # self connections are set to 0
-    M[np.arange(0,n_samples,nskip), np.arange(n_nodes)] = 0
+    skip_idx = np.arange(0, n_samples, nskip)
+    if rev_idx is not None:
+        skip_idx = skip_idx[rev_idx]
+    M[skip_idx, np.arange(n_nodes)] = 0
 
     return kn, M
 
@@ -112,7 +118,7 @@ def Mstats(M, device=torch.device('cuda')):
     return m, ki, kj
 
 
-def cluster(Xd, iclust=None, kn=None, nskip=20, n_neigh=10, max_sub=np.inf,
+def cluster(Xd, iclust=None, kn=None, nskip=1, n_neigh=10, max_sub=25000,
             nclust=200, seed=1, niter=200, lam=0, device=torch.device('cuda'),
             verbose=False):    
 
diff --git a/kilosort/parameters.py b/kilosort/parameters.py
@@ -384,17 +384,21 @@
 
     'cluster_downsampling': {
         'gui_name': 'cluster downsampling', 'type': int, 'min': 1, 'max': np.inf,
-        'exclude': [], 'default': 20, 'step': 'clustering',
+        'exclude': [], 'default': 1, 'step': 'clustering',
         'description':
             """
-            Inverse fraction of nodes used as landmarks during clustering
-            (can be 1, but that slows down the optimization). 
+            Inverse fraction of spikes used as landmarks during clustering. By
+            default, all spikes are used up to a maximum of
+            `max_cluster_subset=25000`.
+
+            The old default behavior (version < 4.1.0) is
+            equivalent to `max_cluster_subset=None, cluster_downsampling=20`.
             """
     },
 
     'max_cluster_subset': {
         'gui_name': 'max cluster subset', 'type': int, 'min': 1, 'max': np.inf,
-        'exclude': [], 'default': None, 'step': 'clustering',
+        'exclude': [np.inf], 'default': 25000, 'step': 'clustering',
         'description':
             """
             Maximum number of spikes to use when searching for nearest neighbors
@@ -405,13 +409,16 @@
             bound for very long recordings. Using a very large number of spikes
             is not necessary and causes performance bottlenecks.
 
+            Use `max_cluster_subset = None` if you do not want a limit on
+            the subset size. The old default behavior (version < 4.1.0) is
+            equivalent to `max_cluster_subset=None, cluster_downsampling=20`.
+
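-            Note: In practice, the actual number of spikes used may increase or
-            decrease slightly while staying under the maximum. This happens
-            because the maximum is set by adjusting `cluster_downsampling` on the
-            fly so that it results in a set no larger than the given size.
+            Note: `cluster_downsampling` is applied first. If the resulting
+            subset is still larger than this maximum, it is downsampled further
+            by selecting evenly distributed spikes so that it fits the limit;
+            `cluster_downsampling` itself is no longer adjusted on the fly.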
             """
     },
-    # TODO: Add suggested values after more testing on different datasets.
 
     'x_centers': {
         'gui_name': 'x centers', 'type': int, 'min': 1,
diff --git a/tests/test_full_pipeline.py b/tests/test_full_pipeline.py
@@ -17,10 +17,14 @@ def test_pipeline(data_directory, results_directory, saved_ops, torch_device, ca
             )
 
     with capture_mgr.global_and_fixture_disabled():
+        # NOTE: 'cluster_downsampling' and 'max_cluster_subset' are pinned to
+        #       their pre-4.1.0 defaults (20 and None), because the expected
+        #       results for this test were generated with those settings.
         print('\nStarting run_kilosort test...')
         ops, st, clu, _, _, _, _, _, kept_spikes = run_kilosort(
             filename=bin_file, device=torch_device,
-            settings={'n_chan_bin': 385},
+            settings={'n_chan_bin': 385, 'cluster_downsampling': 20,
+                      'max_cluster_subset': None},
             probe_name='NeuroPix1_default.mat',
             verbose_console=True
             )