Skip to content

Commit 1c98bd1

Browse files
committed
fixed bugs in handling of length tolerance
1 parent 4dffcac commit 1c98bd1

File tree

5 files changed

+27
-14
lines changed

5 files changed

+27
-14
lines changed

SPACE2/agglomerative_clustering.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,10 @@
11
from sklearn.cluster import AgglomerativeClustering
22
from SPACE2.exhaustive_clustering import cluster_with_algorithm
3-
from SPACE2.util import reg_def, same_length_only
3+
from SPACE2.util import reg_def
44

55

66
def agglomerative_clustering(files, selection=reg_def["CDR_all"], anchors=reg_def["fw_all"], cutoff=1.25,
7-
d_metric='rmsd', length_clustering='bins', length_tolerance=same_length_only,
7+
d_metric='rmsd', length_clustering='bins', length_tolerance='same_length_only',
88
n_jobs=-1):
99
""" Sort a list of antibody pdb files into clusters using agglomerative algorithm.
1010
Antibodies are first clustered by CDR length and then by structural similarity
@@ -22,13 +22,14 @@ def agglomerative_clustering(files, selection=reg_def["CDR_all"], anchors=reg_de
2222
:param length_clustering: str, method for clustering antibodies by CDR length. Options are 'bins' or 'greedy'. (default is 'bins')
2323
bins: CDRs are grouped into precalculated and equally spaced length bins
2424
greedy: stochastic selection of cluster centers
25-
:param length_tolerance: np.array, binwidth for length clustering per CDR. (default clustering into bins of identical length)
25+
:param length_tolerance: str or np.array, binwidth for length clustering per CDR. (default clustering into bins of identical length)
2626
array is required to have the same length as selection, with each element corresponding to the length
2727
tolerance of an individual CDR region.
2828
:param n_jobs: int, number of cpus to use for parallelising. (default is all)
2929
3030
:return final_clustering: pd.DataFrame, containing the cluster assignments
3131
"""
32+
3233
clustering_algorithm = AgglomerativeClustering(n_clusters=None, metric='precomputed', distance_threshold=cutoff, linkage='complete')
3334
final_clustering = cluster_with_algorithm(
3435
clustering_algorithm, files, selection=selection, anchors=anchors, d_metric=d_metric,

SPACE2/exhaustive_clustering.py

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
from joblib import Parallel, delayed
55
from SPACE2.util import (
66
cluster_antibodies_by_CDR_length, rmsd, dtw, parse_antibodies, possible_combinations, check_param,
7-
reg_def, reg_def_CDR_all, reg_def_fw_all, same_length_only
7+
reg_def, reg_def_CDR_all, reg_def_fw_all, same_length_all_cdrs
88
)
99

1010

@@ -53,7 +53,7 @@ def get_distance_matrix(cluster, ids, d_funct, selection=reg_def_CDR_all, anchor
5353

5454

5555
def get_distance_matrices(files, selection=reg_def["CDR_all"], anchors=reg_def["fw_all"], d_metric='rmsd',
56-
length_clustering='bins', length_tolerance=same_length_only, n_jobs=-1):
56+
length_clustering='bins', length_tolerance=same_length_all_cdrs, n_jobs=-1):
5757
""" Calculate CDR distance matrices between antibody pdb files.
5858
Antibodies are first clustered by CDR length and then a distance matrix is calculated for each cluster.
5959
@@ -168,7 +168,7 @@ def get_clustering(df, clustering):
168168

169169

170170
def cluster_with_algorithm(method, files, selection=reg_def["CDR_all"], anchors=reg_def["fw_all"],
171-
d_metric='rmsd', length_clustering='bins', length_tolerance=same_length_only, n_jobs=-1):
171+
d_metric='rmsd', length_clustering='bins', length_tolerance='same_length_only', n_jobs=-1):
172172
""" Sort a list of antibody pdb files into clusters.
173173
Antibodies are first clustered by CDR length and then by structural similarity
174174
@@ -178,10 +178,15 @@ def cluster_with_algorithm(method, files, selection=reg_def["CDR_all"], anchors=
178178
:param anchors: list of np.arrays, indices of residues used for structural alignment of antibodies.
179179
:param d_metric: str, metric for structural distance calculation. Options are 'rmsd' or 'dtw'.
180180
:param length_clustering: str, method for clustering antibodies by CDR length. Options are 'bins' or 'greedy'.
181-
:param length_tolerance: np.array, binwidth for length clustering per CDR.
181+
:param length_tolerance: str or np.array, binwidth for length clustering per CDR.
182182
:param n_jobs: int, number of cpus to use for parallelising.
183183
:return: pd.DataFrame, clustering output with columns ID, cluster_by_length, cluster_by_rmsd, matrix_index
184184
"""
185+
if isinstance(length_tolerance, str):
186+
if length_tolerance == 'same_length_only':
187+
n = len(selection)
188+
length_tolerance = np.ones(n)
189+
185190
matrices_dict = get_distance_matrices(
186191
files, selection=selection, anchors=anchors, d_metric=d_metric, length_clustering=length_clustering,
187192
length_tolerance=length_tolerance, n_jobs=n_jobs

SPACE2/greedy_clustering.py

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
from joblib import Parallel, delayed
33
from SPACE2.util import (
44
rmsd, dtw, parse_antibodies, cluster_antibodies_by_CDR_length, output_to_pandas, check_param,
5-
reg_def, reg_def_CDR_all, reg_def_fw_all, same_length_only
5+
reg_def, reg_def_CDR_all, reg_def_fw_all
66
)
77

88
def greedy_cluster(cluster, d_funct, selection=reg_def_CDR_all, anchors=reg_def_fw_all, cutoff=1.25):
@@ -55,7 +55,7 @@ def greedy_cluster_ids(cluster, ids, d_funct, selection=reg_def_CDR_all, anchors
5555

5656

5757
def greedy_clustering(files, selection=reg_def["CDR_all"], anchors=reg_def["fw_all"], cutoff=1.25, d_metric='rmsd',
58-
length_clustering='bins', length_tolerance=same_length_only, n_jobs=-1):
58+
length_clustering='bins', length_tolerance='same_length_only', n_jobs=-1):
5959
""" Sort a list of antibody pdb files into clusters using greedy algorithm.
6060
Antibodies are first clustered by CDR length and then by structural similarity
6161
@@ -72,13 +72,18 @@ def greedy_clustering(files, selection=reg_def["CDR_all"], anchors=reg_def["fw_a
7272
:param length_clustering: str, method for clustering antibodies by CDR length. Options are 'bins' or 'greedy'. (default is 'bins')
7373
bins: CDRs are grouped into precalculated and equally spaced length bins
7474
greedy: stochastic selection of cluster centers
75-
:param length_tolerance: np.array, binwidth for length clustering per CDR. (default clustering into bins of identical length)
75+
:param length_tolerance: str or np.array, binwidth for length clustering per CDR. (default clustering into bins of identical length)
7676
array is required to have the same length as selection, with each element corresponding to the length
7777
tolerance of an individual CDR region.
7878
:param n_jobs: int, number of cpus to use for parallelising. (default is all)
7979
8080
:return final_clustering: pd.DataFrame, containing the cluster assignments
8181
"""
82+
if isinstance(length_tolerance, str):
83+
if length_tolerance == 'same_length_only':
84+
n = len(selection)
85+
length_tolerance = np.ones(n)
86+
8287
check_param(length_tolerance, d_metric)
8388

8489
antibodies = parse_antibodies(files, n_jobs=n_jobs)

SPACE2/util.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -34,8 +34,7 @@
3434
reg_def_CDR_all = np.concatenate(reg_def["CDR_all"])
3535
reg_def_fw_all = np.concatenate(reg_def["fw_all"])
3636

37-
same_length_only = np.array([1, 1, 1, 1, 1, 1])
38-
37+
same_length_all_cdrs = np.ones(6)
3938

4039
def random_rot():
4140
""" Just a random rotation
@@ -221,7 +220,7 @@ def dtw(ab1, ab2, selection=reg_def_CDR_all, anchors=reg_def_fw_all):
221220
return np.sqrt(normalisation * np.sqrt(dtw_matrix[-1][-1])**2)
222221

223222

224-
def cluster_antibodies_by_CDR_length(antibodies, ids, selection=reg_def['CDR_all'], clustering='bins', tolerance=same_length_only):
223+
def cluster_antibodies_by_CDR_length(antibodies, ids, selection=reg_def['CDR_all'], clustering='bins', tolerance=same_length_all_cdrs):
225224
""" Sort a list of antibody tuples into groups with the same CDR lengths
226225
227226
:param cluster: list of tuples, antibodies
@@ -300,6 +299,8 @@ def output_to_pandas(output):
300299

301300
def check_param(tolerance, d_metric):
302301
''' Check if the input parameters are valid'''
302+
if tolerance is None:
303+
raise ValueError("Length tolerance must be specified as a np.array")
303304
if any(tolerance < 1):
304305
raise ValueError("All entries of length tolerance must be >= 1")
305306
if d_metric == 'rmsd' and not all(tolerance == 1):

notebooks/example.ipynb

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@
3939
"outputs": [],
4040
"source": [
4141
"from SPACE2 import reg_def\n",
42+
"import numpy as np\n",
4243
"\n",
4344
"# residues for structural comparison\n",
4445
"cdr_selection = [reg_def['CDRH1'], reg_def['CDRH2'], reg_def['CDRH3']]\n",
@@ -49,7 +50,7 @@
4950
"# these correspond to the imgt residue number of residues to select\n",
5051
"\n",
5152
"clustered_dataframe = SPACE2.agglomerative_clustering(\n",
52-
" antibody_models, selection=cdr_selection, anchors=fw_selection, cutoff=1.25\n",
53+
" antibody_models, selection=cdr_selection, anchors=fw_selection, cutoff=1.25,\n",
5354
" )"
5455
]
5556
},

0 commit comments

Comments
 (0)