11import numpy as np
2+ from scipy .sparse import csr_matrix
3+ from scipy .sparse .csgraph import connected_components
24from matchms import Spectrum
35
46
@@ -167,7 +169,7 @@ def _merge_cluster_to_consensus(cluster_spectra, mz_tol=0.01, min_frac=0.25):
167169 return Spectrum (mz = consensus_mz , intensities = consensus_int , metadata = md )
168170
169171
170- # --------------------- Main function ---------------------
172+ # --------------------- Main functions ---------------------
171173def get_merged_spectra (spectra , clusters , mz_tol = 0.01 , min_frac = 0.25 ):
172174 """Given a list of spectra and clusters (lists of indices into spectra),
173175 return a list of merged consensus spectra.
@@ -193,3 +195,34 @@ def get_merged_spectra(spectra, clusters, mz_tol=0.01, min_frac=0.25):
193195 # singletons: normalize to sum=1 for consistency
194196 spectra_new .append (_normalize_spectrum_sum (spectra [cluster [0 ]]))
195197 return spectra_new
198+
199+
def cluster_block(spectra, sim_score, threshold=0.95):
    """Find clusters of highly similar spectra (e.g. according to Cosine).

    All pairwise similarities are computed, every pair scoring at least
    ``threshold`` is linked, and each connected component of the resulting
    graph is reported as one cluster. Linking is transitive: two spectra can
    share a cluster without being directly similar to each other.

    Hint: Use lower intensity_power to emphasize smaller peaks.

    Parameters
    ----------
    spectra:
        List of matchms Spectrum objects.
    sim_score:
        Matchms scoring method, e.g. CosineGreedy(). Only its
        ``matrix(spectra, spectra, is_symmetric=True)`` method is used.
    threshold: float
        Spectra with similarity >= threshold will be merged.

    Returns
    -------
    clusters:
        List with one integer index array per cluster (indices into
        ``spectra``). Singletons are returned as clusters of size one.
    S:
        Square array of the pairwise scores used for the clustering.
    """
    # Nothing to compare: skip the similarity computation entirely.
    if len(spectra) == 0:
        return [], np.zeros((0, 0))

    # similarity
    sim = sim_score.matrix(spectra, spectra, is_symmetric=True)
    dtype = getattr(sim, "dtype", None)
    if dtype is not None and dtype.names is None:
        # Plain numeric array: the values are the scores themselves.
        S = sim
    else:
        # Structured result (e.g. "score" + "matches" fields).
        S = sim["score"]

    # Graph by threshold on the strict upper triangle: each pair is looked at
    # once and the diagonal (self-similarity) is ignored.
    n = S.shape[0]
    rows, cols = np.nonzero(np.triu(S >= threshold, k=1))
    # One-directional edges suffice because the graph is evaluated as
    # undirected below; nodes without edges form their own component, so no
    # self-loops (and no dense identity matrix) are needed.
    G = csr_matrix(
        (np.ones(rows.size, dtype=np.int8), (rows, cols)), shape=(n, n)
    )

    # Connected components = clusters
    n_comp, labels = connected_components(G, directed=False)
    clusters = [np.flatnonzero(labels == k) for k in range(n_comp)]
    return clusters, S
0 commit comments