Skip to content

Commit dbe7b08

Browse files
committed
add clustering method
1 parent 2789691 commit dbe7b08

File tree

1 file changed

+34
-1
lines changed

1 file changed

+34
-1
lines changed

ms2query/spectral_processing/spectra_merging.py

Lines changed: 34 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
11
import numpy as np
2+
from scipy.sparse import csr_matrix
3+
from scipy.sparse.csgraph import connected_components
24
from matchms import Spectrum
35

46

@@ -167,7 +169,7 @@ def _merge_cluster_to_consensus(cluster_spectra, mz_tol=0.01, min_frac=0.25):
167169
return Spectrum(mz=consensus_mz, intensities=consensus_int, metadata=md)
168170

169171

170-
# --------------------- Main function ---------------------
172+
# --------------------- Main functions ---------------------
171173
def get_merged_spectra(spectra, clusters, mz_tol=0.01, min_frac=0.25):
172174
"""Given a list of spectra and clusters (lists of indices into spectra),
173175
return a list of merged consensus spectra.
@@ -193,3 +195,34 @@ def get_merged_spectra(spectra, clusters, mz_tol=0.01, min_frac=0.25):
193195
# singletons: normalize to sum=1 for consistency
194196
spectra_new.append(_normalize_spectrum_sum(spectra[cluster[0]]))
195197
return spectra_new
198+
199+
200+
def cluster_block(spectra, sim_score, threshold=0.95):
    """Find clusters of highly similar spectra (e.g. according to Cosine).

    All-vs-all similarities are computed, every pair of spectra scoring
    ``>= threshold`` is linked by an edge, and the connected components of
    the resulting graph are returned as clusters. Clustering is therefore
    transitive: two spectra can end up in the same cluster through a chain
    of intermediate spectra even if their direct score is below
    ``threshold``.

    Hint: Use lower intensity_power to emphasize smaller peaks.

    Parameters
    ----------
    spectra:
        List of matchms Spectrum objects.
    sim_score:
        Matchms scoring method, e.g. CosineGreedy(). It must provide
        ``matrix(references, queries, is_symmetric=True)`` whose result can
        be indexed with ``"score"`` to obtain a square 2D score array.
    threshold: float
        Spectra with similarity >= threshold will be linked (and thereby
        placed in the same cluster).

    Returns
    -------
    clusters:
        List with one numpy array of indices (into ``spectra``) per cluster.
        Spectra without any link above the threshold form singleton
        clusters.
    S:
        The all-vs-all score matrix taken from ``sim["score"]``.
    """
    # All-vs-all similarity scores
    sim = sim_score.matrix(spectra, spectra, is_symmetric=True)
    S = sim["score"]
    n = S.shape[0]

    # Edges: pairs (i < j) in the strict upper triangle that reach the threshold.
    upper_rows, upper_cols = np.triu_indices_from(S, k=1)
    is_edge = S[upper_rows, upper_cols] >= threshold
    rows = upper_rows[is_edge]
    cols = upper_cols[is_edge]

    # Sparse adjacency holding only the upper-triangle edges. No explicit
    # symmetrization or self-loops are needed: with directed=False the graph
    # is treated as undirected, and unlinked nodes form their own component.
    # This avoids building a dense n x n identity matrix.
    G = csr_matrix(
        (np.ones(rows.size, dtype=np.int8), (rows, cols)), shape=(n, n)
    )

    # Connected components = clusters
    n_comp, labels = connected_components(G, directed=False)
    clusters = [np.where(labels == k)[0] for k in range(n_comp)]
    return clusters, S

0 commit comments

Comments
 (0)