Fix handling of length tolerance: default to 'same_length_only', expanded to np.ones(len(selection))

fspoendlin · fspoendlin · commit 1c98bd1feb2b · 2025-07-14T21:49:20.000+01:00
diff --git a/SPACE2/agglomerative_clustering.py b/SPACE2/agglomerative_clustering.py
@@ -1,10 +1,10 @@
 from sklearn.cluster import AgglomerativeClustering
 from SPACE2.exhaustive_clustering import cluster_with_algorithm
-from SPACE2.util import reg_def, same_length_only
+from SPACE2.util import reg_def
 
 
 def agglomerative_clustering(files, selection=reg_def["CDR_all"], anchors=reg_def["fw_all"], cutoff=1.25, 
-                             d_metric='rmsd', length_clustering='bins', length_tolerance=same_length_only,
+                             d_metric='rmsd', length_clustering='bins', length_tolerance='same_length_only',
                              n_jobs=-1):
     """ Sort a list of antibody pdb files into clusters using agglomerative algorithm.
     Antibodies are first clustered by CDR length and then by structural similarity
@@ -22,13 +22,14 @@ def agglomerative_clustering(files, selection=reg_def["CDR_all"], anchors=reg_de
     :param length_clustering: str, method for clustering antibodies by CDR length. Options are 'bins' or 'greedy'. (default is 'bins')
                               bins: CDRs are grouped into precalculated and equally spaced length bins
                               greedy: stochastic selection of cluster centers
-    :param length_tolerance: np.array, binwidth for length clustering per CDR. (default clustering into bins of identical length)
+    :param length_tolerance: str or np.array, binwidth for length clustering per CDR. (default clustering into bins of identical length)
                              array is required to have the same length as selection, with each element corresponding to the length 
                              tolerance of an individual CDR region.
     :param n_jobs: int, number of cpus to use for parallelising. (default is all)
 
     :return final_clustering: pd.DataFrame, containing the cluster assignments
     """
+
     clustering_algorithm = AgglomerativeClustering(n_clusters=None, metric='precomputed', distance_threshold=cutoff, linkage='complete')
     final_clustering = cluster_with_algorithm(
         clustering_algorithm, files, selection=selection, anchors=anchors, d_metric=d_metric,
diff --git a/SPACE2/exhaustive_clustering.py b/SPACE2/exhaustive_clustering.py
@@ -4,7 +4,7 @@
 from joblib import Parallel, delayed
 from SPACE2.util import (
     cluster_antibodies_by_CDR_length, rmsd, dtw, parse_antibodies, possible_combinations, check_param,
-    reg_def, reg_def_CDR_all, reg_def_fw_all, same_length_only
+    reg_def, reg_def_CDR_all, reg_def_fw_all, same_length_all_cdrs
 )
 
 
@@ -53,7 +53,7 @@ def get_distance_matrix(cluster, ids, d_funct, selection=reg_def_CDR_all, anchor
 
 
 def get_distance_matrices(files, selection=reg_def["CDR_all"], anchors=reg_def["fw_all"], d_metric='rmsd',
-                          length_clustering='bins', length_tolerance=same_length_only, n_jobs=-1):
+                          length_clustering='bins', length_tolerance=same_length_all_cdrs, n_jobs=-1):
     """ Calculate CDR distance matrices between antibody pdb files.
     Antibodies are first clustered by CDR length and then a distance matrix is calculated for each cluster.
 
@@ -168,7 +168,7 @@ def get_clustering(df, clustering):
 
 
 def cluster_with_algorithm(method, files, selection=reg_def["CDR_all"], anchors=reg_def["fw_all"], 
-                           d_metric='rmsd', length_clustering='bins', length_tolerance=same_length_only, n_jobs=-1):
+                           d_metric='rmsd', length_clustering='bins', length_tolerance='same_length_only', n_jobs=-1):
     """ Sort a list of antibody pdb files into clusters.
     Antibodies are first clustered by CDR length and the by structural similarity
 
@@ -178,10 +178,15 @@ def cluster_with_algorithm(method, files, selection=reg_def["CDR_all"], anchors=
     :param anchors: list of np.arrays, indices of residues used for structural alignment of antibodies.
     :param d_metric: str, metric for structural distance calculation. Options are 'rmsd' or 'dtw'.
     :param length_clustering: str, method for clustering antibodies by CDR length. Options are 'bins' or 'greedy'.
-    :param length_tolerance: np.array, binwidth for length clustering per CDR.
+    :param length_tolerance: str or np.array, binwidth for length clustering per CDR.
     :param n_jobs: int, number of cpus to use for parallelising.
     :return: pd.DataFrame, clustering output with columns ID, cluster_by_length, cluster_by_rmsd, matrix_index
     """
+    if isinstance(length_tolerance, str):
+        if length_tolerance == 'same_length_only':
+            n = len(selection)
+            length_tolerance = np.ones(n)
+
     matrices_dict = get_distance_matrices(
         files, selection=selection, anchors=anchors, d_metric=d_metric, length_clustering=length_clustering,
         length_tolerance=length_tolerance, n_jobs=n_jobs
diff --git a/SPACE2/greedy_clustering.py b/SPACE2/greedy_clustering.py
@@ -2,7 +2,7 @@
 from joblib import Parallel, delayed
 from SPACE2.util import (
     rmsd, dtw, parse_antibodies, cluster_antibodies_by_CDR_length, output_to_pandas, check_param,
-    reg_def, reg_def_CDR_all, reg_def_fw_all, same_length_only
+    reg_def, reg_def_CDR_all, reg_def_fw_all
 )
 
 def greedy_cluster(cluster, d_funct, selection=reg_def_CDR_all, anchors=reg_def_fw_all, cutoff=1.25):
@@ -55,7 +55,7 @@ def greedy_cluster_ids(cluster, ids, d_funct, selection=reg_def_CDR_all, anchors
 
 
 def greedy_clustering(files, selection=reg_def["CDR_all"], anchors=reg_def["fw_all"], cutoff=1.25,  d_metric='rmsd',
-                      length_clustering='bins', length_tolerance=same_length_only, n_jobs=-1):
+                      length_clustering='bins', length_tolerance='same_length_only', n_jobs=-1):
     """ Sort a list of antibody pdb files into clusters using greedy algorithm.
     Antibodies are first clustered by CDR length and then by structural similarity
 
@@ -72,13 +72,18 @@ def greedy_clustering(files, selection=reg_def["CDR_all"], anchors=reg_def["fw_a
     :param length_clustering: str, method for clustering antibodies by CDR length. Options are 'bins' or 'greedy'. (default is 'bins')
                               bins: CDRs are grouped into precalculated and equally spaced length bins
                               greedy: stochastic selection of cluster centers
-    :param length_tolerance: np.array, binwidth for length clustering per CDR. (default clustering into bins of identical length)
+    :param length_tolerance: str or np.array, binwidth for length clustering per CDR. (default clustering into bins of identical length)
                              array is required to have the same length as selection, with each element corresponding to the length 
                              tolerance of an individual CDR region.
     :param n_jobs: int, number of cpus to use for parallelising. (default is all)
 
     :return final_clustering: pd.DataFrame, containing the cluster assignments
     """
+    if isinstance(length_tolerance, str):
+        if length_tolerance == 'same_length_only':
+            n = len(selection)
+            length_tolerance = np.ones(n)
+
     check_param(length_tolerance, d_metric)
 
     antibodies = parse_antibodies(files, n_jobs=n_jobs)
diff --git a/SPACE2/util.py b/SPACE2/util.py
@@ -34,8 +34,7 @@
 reg_def_CDR_all = np.concatenate(reg_def["CDR_all"])
 reg_def_fw_all = np.concatenate(reg_def["fw_all"])
 
-same_length_only = np.array([1, 1, 1, 1, 1, 1])
-
+same_length_all_cdrs = np.ones(6)
 
 def random_rot():
     """ Just a random rotation
@@ -221,7 +220,7 @@ def dtw(ab1, ab2, selection=reg_def_CDR_all, anchors=reg_def_fw_all):
     return np.sqrt(normalisation * np.sqrt(dtw_matrix[-1][-1])**2)
 
 
-def cluster_antibodies_by_CDR_length(antibodies, ids, selection=reg_def['CDR_all'], clustering='bins', tolerance=same_length_only):
+def cluster_antibodies_by_CDR_length(antibodies, ids, selection=reg_def['CDR_all'], clustering='bins', tolerance=same_length_all_cdrs):
     """ Sort a list of antibody tuples into groups with the same CDR lengths
 
     :param cluster: list of tuples, antibodies
@@ -300,6 +299,8 @@ def output_to_pandas(output):
 
 def check_param(tolerance, d_metric):
     ''' Check if the input parameters are valid'''
+    if tolerance is None:
+        raise ValueError("Length tolerance must be specified as a np.array")
     if any(tolerance < 1):
         raise ValueError("All entries of length tolerance must be >= 1")
     if d_metric == 'rmsd' and not all(tolerance == 1):
diff --git a/notebooks/example.ipynb b/notebooks/example.ipynb
@@ -39,6 +39,7 @@
    "outputs": [],
    "source": [
     "from SPACE2 import reg_def\n",
+    "import numpy as np\n",
     "\n",
     "# residues for structural comparison\n",
     "cdr_selection = [reg_def['CDRH1'], reg_def['CDRH2'], reg_def['CDRH3']]\n",
@@ -49,7 +50,7 @@
     "# these correspond to the imgt residue number of residues to select\n",
     "\n",
     "clustered_dataframe = SPACE2.agglomerative_clustering(\n",
-    "    antibody_models, selection=cdr_selection, anchors=fw_selection, cutoff=1.25\n",
+    "    antibody_models, selection=cdr_selection, anchors=fw_selection, cutoff=1.25,\n",
     "    )"
    ]
   },