-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpredict_with_integrated_similarity_flow.py
More file actions
111 lines (94 loc) · 4.63 KB
/
predict_with_integrated_similarity_flow.py
File metadata and controls
111 lines (94 loc) · 4.63 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
from typing import List, Tuple
import numpy as np
from matchms.similarity.vector_similarity_functions import jaccard_similarity_matrix
from tqdm import tqdm
from ms2query.benchmarking.Fingerprints import Fingerprints
from ms2query.benchmarking.reference_methods.PredictMS2DeepScoreSimilarity import predict_top_ms2deepscores
from ms2query.benchmarking.AnnotatedSpectrumSet import AnnotatedSpectrumSet
def predict_with_integrated_similarity_flow(
    library_spectra: AnnotatedSpectrumSet,
    query_spectra: AnnotatedSpectrumSet,
    fingerprints: Fingerprints,
    number_of_analogues_to_consider=50,
) -> Tuple[List[str], List[float]]:
    """Select, for every query spectrum, the library inchikey with the highest ISF score.

    The top `number_of_analogues_to_consider` library hits per query are requested from
    `predict_top_ms2deepscores`; each query's hits are then re-ranked by `get_highest_isf`.

    Args:
        library_spectra: Library spectra; their `embeddings` are compared against the queries.
        query_spectra: Query spectra; one result is produced per entry in `query_spectra.spectra`.
        fingerprints: Fingerprint provider that is passed on to `get_highest_isf`.
        number_of_analogues_to_consider: Passed as `k` to `predict_top_ms2deepscores`.

    Returns:
        Two lists aligned with the query spectra: the inchikey of the best match and
        the corresponding highest ISF score.
    """
    # Row i of both results is used for query spectrum i.
    top_library_indexes, top_predicted_scores = predict_top_ms2deepscores(
        library_spectra.embeddings, query_spectra.embeddings, k=number_of_analogues_to_consider
    )

    nr_of_queries = len(query_spectra.spectra)
    # Each entry is the (highest_isf_score, inchikey) pair returned by get_highest_isf.
    best_hit_per_query = [
        get_highest_isf(
            library_spectra,
            top_library_indexes[query_index],
            fingerprints,
            top_predicted_scores[query_index],
        )
        for query_index in tqdm(range(nr_of_queries), "Calculating ISF score")
    ]

    inchikeys_of_best_matches = [inchikey for _, inchikey in best_hit_per_query]
    highest_isf_scores = [isf_score for isf_score, _ in best_hit_per_query]
    return inchikeys_of_best_matches, highest_isf_scores
def get_highest_isf(
    library_spectra: AnnotatedSpectrumSet,
    indexes_of_library_spectra_with_highest_score: np.ndarray,
    fingerprints: Fingerprints,
    predicted_scores: List[float],
) -> Tuple[float, str]:
    """Return the highest ISF score and its inchikey among the candidate hits of one query.

    Args:
        library_spectra: Library spectra; `library_spectra.spectra[index]` is looked up for every hit.
        indexes_of_library_spectra_with_highest_score: Indexes into `library_spectra.spectra`
            of the candidate hits for this query.
        fingerprints: Provider of fingerprints via `get_fingerprints(inchikeys)`.
        predicted_scores: Predicted score of each candidate hit, aligned with the indexes.

    Returns:
        A tuple `(highest_isf_score, inchikey_of_highest_isf_score)`.
    """
    # Candidates are identified by the first 14 characters of their inchikey.
    inchikeys_with_highest_ms2deepscore = [
        library_spectra.spectra[index].get("inchikey")[:14] for index in indexes_of_library_spectra_with_highest_score
    ]
    # Several hits can share an inchikey; collapse them to one averaged candidate each.
    unique_inchikeys, average_scores, nr_of_spectra_per_inchikey = average_scores_per_inchikeys(
        predicted_scores, inchikeys_with_highest_ms2deepscore
    )

    # Build the Tanimoto matrix from the candidates only, so that row/column i belongs to
    # unique_inchikeys[i], which is how integrated_similarity_flow indexes it. Using the
    # fingerprints of the entire library here would misalign the matrix with the candidates
    # and recompute a library x library matrix for every query.
    # NOTE(review): assumes get_fingerprints returns fingerprints in the order of the
    # inchikeys it is given - confirm against Fingerprints.
    candidate_fingerprints = fingerprints.get_fingerprints(unique_inchikeys)
    tanimoto_scores = jaccard_similarity_matrix(candidate_fingerprints, candidate_fingerprints)

    isf_scores = integrated_similarity_flow(average_scores, tanimoto_scores, nr_of_spectra_per_inchikey)

    index_of_highest_score = np.argmax(isf_scores)
    highest_isf_score = isf_scores[index_of_highest_score]
    inchikey_of_highest_isf_score = unique_inchikeys[index_of_highest_score]
    return highest_isf_score, inchikey_of_highest_isf_score
def average_scores_per_inchikeys(predicted_scores, inchikeys):
    """Average the predicted scores of entries that share the same inchikey.

    Collapsing duplicate inchikeys reduces the number of candidates that have to be
    compared later on.

    Args:
        predicted_scores: One score per entry.
        inchikeys: The inchikey of each entry, aligned with `predicted_scores`.

    Returns:
        Three aligned lists, ordered by first occurrence of each inchikey: the unique
        inchikeys, their average score, and the number of entries per inchikey.

    Raises:
        ValueError: If `predicted_scores` and `inchikeys` differ in length.
    """
    if len(predicted_scores) != len(inchikeys):
        raise ValueError

    # Group the scores per inchikey; dict insertion order keeps first-occurrence order.
    grouped_scores = {}
    for inchikey, score in zip(inchikeys, predicted_scores):
        grouped_scores.setdefault(inchikey, []).append(score)

    unique_inchikeys = list(grouped_scores)
    average_scores = [sum(scores) / len(scores) for scores in grouped_scores.values()]
    nr_of_spectra_per_inchikey = [len(scores) for scores in grouped_scores.values()]
    return unique_inchikeys, average_scores, nr_of_spectra_per_inchikey
def integrated_similarity_flow(
    predicted_scores: List[float], similarities: np.ndarray, nr_of_spectra_per_inchikey: List[int]
) -> List[float]:
    """Compute the integrated similarity flow (ISF) score of every candidate.

    The ISF score of candidate i is the weighted sum of its similarity to every candidate j
    (weight: predicted score of j times the number of spectra of j), divided by the sum of
    all weights.

    Args:
        predicted_scores: Predicted score of each candidate with respect to the query.
        similarities: Square matrix (or list of lists) where `similarities[i][j]` is the
            similarity between candidate i and candidate j.
        nr_of_spectra_per_inchikey: Number of spectra each candidate represents, aligned
            with `predicted_scores`.

    Returns:
        The ISF score of each candidate, in the order of `predicted_scores`. If the
        weights sum to zero there is no similarity to distribute and every candidate
        gets a score of 0.0.
    """
    num_hits = len(predicted_scores)

    # Normalisation term: the total weight over all candidates.
    total_similarity = sum(predicted_scores[i] * nr_of_spectra_per_inchikey[i] for i in range(num_hits))
    if total_similarity == 0:
        # Avoid a division by zero (or NaN scores for numpy floats).
        return [0.0] * num_hits

    isf_scores = []
    for i in range(num_hits):
        similarities_to_candidate = similarities[i]
        weighted_similarity = sum(
            predicted_scores[j] * similarities_to_candidate[j] * nr_of_spectra_per_inchikey[j]
            for j in range(num_hits)
        )
        isf_scores.append(weighted_similarity / total_similarity)
    return isf_scores