ms2query_2/ms2query/benchmarking/reference_methods/predict_with_integrated_similarity_flow.py at 510d4a8d0c23262d72d289b17011df3e9660bcc6 · matchms/ms2query_2 · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
from typing import List, Tuple
import numpy as np
from matchms.similarity.vector_similarity_functions import jaccard_similarity_matrix
from tqdm import tqdm

from ms2query.benchmarking.Fingerprints import Fingerprints
from ms2query.benchmarking.reference_methods.PredictMS2DeepScoreSimilarity import predict_top_ms2deepscores
from ms2query.benchmarking.AnnotatedSpectrumSet import AnnotatedSpectrumSet


def predict_with_integrated_similarity_flow(
    library_spectra: AnnotatedSpectrumSet,
    query_spectra: AnnotatedSpectrumSet,
    fingerprints: Fingerprints,
    number_of_analogues_to_consider=50,
) -> Tuple[List[str], List[float]]:
    """Pick, for every query spectrum, the library inchikey with the highest ISF score.

    The top ``number_of_analogues_to_consider`` library hits per query are first
    retrieved with ``predict_top_ms2deepscores`` (called on the embeddings of both
    sets). Each query's hits are then re-ranked by ``get_highest_isf``.

    Args:
        library_spectra: Library whose embeddings and spectra are searched.
        query_spectra: Queries; one result is produced per entry in ``query_spectra.spectra``.
        fingerprints: Fingerprint provider handed through to ``get_highest_isf``.
        number_of_analogues_to_consider: Passed as ``k`` to ``predict_top_ms2deepscores``.

    Returns:
        Two parallel lists ordered like the query spectra: the inchikey selected for
        each query and the ISF score of that inchikey.
    """
    # NOTE(review): both results are indexed by query position below, so they are
    # assumed to hold one row of k library indexes / k scores per query - confirm.
    top_library_indexes, top_ms2deepscores = predict_top_ms2deepscores(
        library_spectra.embeddings, query_spectra.embeddings, k=number_of_analogues_to_consider
    )

    nr_of_queries = len(query_spectra.spectra)
    best_inchikeys = []
    best_isf_scores = []
    for query_nr in tqdm(range(nr_of_queries), "Calculating ISF score"):
        best_isf_score, best_inchikey = get_highest_isf(
            library_spectra,
            top_library_indexes[query_nr],
            fingerprints,
            top_ms2deepscores[query_nr],
        )
        best_inchikeys.append(best_inchikey)
        best_isf_scores.append(best_isf_score)
    return best_inchikeys, best_isf_scores


def get_highest_isf(
    library_spectra: AnnotatedSpectrumSet,
    indexes_of_library_spectra_with_highest_score: np.ndarray,
    fingerprints: Fingerprints,
    predicted_scores: List[float],
) -> Tuple[float, str]:
    """Return the best ISF score and its inchikey among the top hits of one query.

    The hits are collapsed to one averaged score per unique inchikey (first 14
    characters), pairwise Tanimoto (Jaccard) similarities between those candidate
    inchikeys are computed, and the integrated similarity flow is evaluated for
    every candidate.

    Args:
        library_spectra: Library the hit indexes refer to (via ``library_spectra.spectra``).
        indexes_of_library_spectra_with_highest_score: Indexes into
            ``library_spectra.spectra`` of the top hits for this query.
        fingerprints: Provider of fingerprints, queried with the candidate inchikeys.
        predicted_scores: Predicted score of each hit, aligned with the indexes.

    Returns:
        ``(highest_isf_score, inchikey_of_highest_isf_score)``. On ties the first
        candidate wins, because ``np.argmax`` returns the first maximum.

    Raises:
        ValueError: If the number of scores differs from the number of hit indexes
            (raised by ``average_scores_per_inchikeys``).
    """
    # Get the corresponding inchikeys (only the first 14 characters are used).
    inchikeys_with_highest_ms2deepscore = [
        library_spectra.spectra[index].get("inchikey")[:14] for index in indexes_of_library_spectra_with_highest_score
    ]
    unique_inchikeys, average_scores, nr_of_spectra_per_inchikey = average_scores_per_inchikeys(
        predicted_scores, inchikeys_with_highest_ms2deepscore
    )

    # Calculate tanimoto scores between the candidates only. The similarity matrix
    # is indexed by candidate position in integrated_similarity_flow, so its rows
    # and columns must follow unique_inchikeys; using the fingerprints of the whole
    # library would pair candidates with unrelated library entries (and recompute a
    # library-sized matrix for every query).
    # NOTE(review): assumes fingerprints.get_fingerprints accepts these 14-character
    # inchikeys, in the same form as library_spectra.inchikeys - confirm.
    candidate_fingerprints = fingerprints.get_fingerprints(unique_inchikeys)
    tanimoto_scores = jaccard_similarity_matrix(candidate_fingerprints, candidate_fingerprints)

    isf_scores = integrated_similarity_flow(average_scores, tanimoto_scores, nr_of_spectra_per_inchikey)
    index_of_highest_score = np.argmax(isf_scores)
    highest_isf_score = isf_scores[index_of_highest_score]
    inchikey_of_highest_isf_score = unique_inchikeys[index_of_highest_score]
    return highest_isf_score, inchikey_of_highest_isf_score


def average_scores_per_inchikeys(predicted_scores, inchikeys):
    """Collapse per-spectrum scores into one mean score per inchikey.

    Fewer candidates means fewer pairwise comparisons further down the line.

    Args:
        predicted_scores: One score per spectrum.
        inchikeys: The inchikey of each spectrum, aligned with ``predicted_scores``.

    Returns:
        Three parallel lists, ordered by first appearance of each inchikey:
        the unique inchikeys, their mean score, and how many scores were averaged.

    Raises:
        ValueError: If both inputs do not have the same length.
    """
    if len(predicted_scores) != len(inchikeys):
        raise ValueError

    # Group the scores by inchikey; dict insertion order keeps first-seen order.
    grouped_scores = {}
    for key, value in zip(inchikeys, predicted_scores):
        grouped_scores.setdefault(key, []).append(value)

    unique_inchikeys = list(grouped_scores)
    nr_of_spectra_per_inchikey = [len(values) for values in grouped_scores.values()]
    average_scores = [
        sum(values) / count for values, count in zip(grouped_scores.values(), nr_of_spectra_per_inchikey)
    ]
    return unique_inchikeys, average_scores, nr_of_spectra_per_inchikey


def integrated_similarity_flow(
    predicted_scores: List[float], similarities: np.ndarray, nr_of_spectra_per_inchikey: List[float]
) -> List[float]:
    """Compute the integrated similarity flow (ISF) score of every candidate.

    The ISF of candidate ``i`` is the sum over all candidates ``j`` of
    ``predicted_scores[j] * similarities[i][j] * nr_of_spectra_per_inchikey[j]``,
    divided by the sum over ``j`` of
    ``predicted_scores[j] * nr_of_spectra_per_inchikey[j]``.

    Args:
        predicted_scores: Predicted score of each candidate.
        similarities: Pairwise similarities between candidates, indexed as
            ``similarities[i][j]`` by candidate position.
        nr_of_spectra_per_inchikey: Weight of each candidate (the number of spectra
            it represents).

    Returns:
        The ISF score of each candidate, in the order of ``predicted_scores``.
    """
    candidate_positions = range(len(predicted_scores))

    # Normalisation term: the total weighted predicted score of all candidates.
    normaliser = sum([predicted_scores[k] * nr_of_spectra_per_inchikey[k] for k in candidate_positions])

    def flow_towards(candidate):
        """Weighted similarity of all candidates to `candidate`, normalised."""
        row = similarities[candidate]
        weighted_similarity = sum(
            predicted_scores[other] * row[other] * nr_of_spectra_per_inchikey[other] for other in candidate_positions
        )
        return weighted_similarity / normaliser

    return [flow_towards(candidate) for candidate in candidate_positions]