Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions ms2query/benchmarking/Embeddings.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ def combine_embeddings(cls, embeddings_1: "Embeddings", embeddings_2: "Embedding
if embeddings_1.model_settings != embeddings_2.model_settings:
raise ValueError("Model settings of merged embeddings do not match")
if not set(embeddings_1.index_to_spectrum_hash).isdisjoint(embeddings_2.index_to_spectrum_hash):
# TODO: allow overlapping spectra by dropping the duplicates, after checking that the duplicated entries are identical.
raise ValueError("There are repeated spectra in the embeddings that are added together")
combined_embeddings = np.vstack([embeddings_1.embeddings, embeddings_2.embeddings])
index_to_spectrum_hash = embeddings_1.index_to_spectrum_hash + embeddings_2.index_to_spectrum_hash
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,16 @@
import numpy as np
from matchms.similarity.vector_similarity_functions import jaccard_similarity_matrix
from tqdm import tqdm

from ms2query.benchmarking.Fingerprints import Fingerprints
from ms2query.benchmarking.reference_methods.PredictMS2DeepScoreSimilarity import predict_top_ms2deepscores
from ms2query.benchmarking.AnnotatedSpectrumSet import AnnotatedSpectrumSet


def predict_with_integrated_similarity_flow(
library_spectra: AnnotatedSpectrumSet,
query_spectra: AnnotatedSpectrumSet,
fingerprints: Fingerprints,
number_of_analogues_to_consider=50,
) -> Tuple[List[str], List[float]]:

Expand All @@ -22,6 +25,7 @@ def predict_with_integrated_similarity_flow(
highest_isf_score, inchikey_of_highest_isf_score = get_highest_isf(
library_spectra,
all_indexes_of_library_spectra_with_highest_score[query_index],
fingerprints,
all_predicted_scores[query_index],
)
inchikeys_of_best_matches.append(inchikey_of_highest_isf_score)
Expand All @@ -32,7 +36,8 @@ def predict_with_integrated_similarity_flow(
def get_highest_isf(
library_spectra: AnnotatedSpectrumSet,
indexes_of_library_spectra_with_highest_score: np.ndarray,
predicted_scores: [List[float]],
fingerprints: Fingerprints,
predicted_scores: [List[float]],
):

# Get the corresponding inchikeys
Expand All @@ -43,7 +48,8 @@ def get_highest_isf(
predicted_scores, inchikeys_with_highest_ms2deepscore
)
# calculate tanimoto scores
tanimoto_scores = jaccard_similarity_matrix(library_spectra.fingerprints.fingerprints, library_spectra.fingerprints.fingerprints)
library_fingerprints = fingerprints.get_fingerprints(library_spectra.inchikeys)
tanimoto_scores = jaccard_similarity_matrix(library_fingerprints, library_fingerprints)

isf_scores = integrated_similarity_flow(average_scores, tanimoto_scores, nr_of_spectra_per_inchikey)
index_of_highest_score = np.argmax(isf_scores)
Expand Down
35 changes: 22 additions & 13 deletions tests/test_methods.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import numpy as np
import pytest
from matchms.similarity.vector_similarity_functions import jaccard_similarity_matrix

from ms2query.benchmarking.Fingerprints import Fingerprints
from ms2query.benchmarking.reference_methods.predict_best_possible_match import predict_best_possible_match
from ms2query.benchmarking.reference_methods.predict_highest_cosine import predict_highest_cosine
from ms2query.benchmarking.reference_methods.predict_highest_ms2deepscore import predict_highest_ms2deepscore
Expand All @@ -11,36 +13,43 @@
from ms2query.benchmarking.AnnotatedSpectrumSet import AnnotatedSpectrumSet
from tests.conftest import create_test_spectra, ms2deepscore_model

def get_library_and_test_spectra():
    """Build the library and test spectrum sets shared by the tests.

    Both sets are created from ``create_test_spectra`` and get embeddings
    added with the same MS2DeepScore test model.

    Returns:
        A ``(library_spectra, test_spectra)`` tuple of AnnotatedSpectrumSet.
    """
    model = ms2deepscore_model()
    spectrum_sets = (
        AnnotatedSpectrumSet.create_spectrum_set(create_test_spectra()),
        AnnotatedSpectrumSet.create_spectrum_set(create_test_spectra(1)),
    )
    # Embeddings are added in place, library set first and test set second.
    for spectrum_set in spectrum_sets:
        spectrum_set.add_embeddings(model)
    return spectrum_sets

@pytest.mark.parametrize(
    "prediction_function",
    [
        # predict_best_possible_match is not listed here: it is called with an
        # extra Fingerprints argument in test_predict_best_possible_match.
        predict_highest_cosine,
        predict_highest_ms2deepscore,
    ],
)
def test_all_methods(prediction_function):
    """Check that a prediction method returns the expected inchikey and score.

    Each parametrized function is called as
    ``prediction_function(library_spectra, test_spectra)``. For every test
    spectrum the predicted inchikey must equal the first 14 characters of the
    spectrum's own inchikey, and the score must be close to 1.0.
    """
    # The shared helper builds both spectrum sets and adds the embeddings, so
    # no separate model/spectrum-set setup is needed here.
    library_spectra, test_spectra = get_library_and_test_spectra()
    predicted_inchikeys, scores = prediction_function(library_spectra, test_spectra)
    for i, spectrum in enumerate(test_spectra.spectra):
        inchikey = spectrum.get("inchikey")[:14]
        assert predicted_inchikeys[i] == inchikey
        assert np.allclose(scores[i], np.array(1.0), atol=1e-5)


def test_predict_best_possible_match():
    """Check predict_best_possible_match against the test spectra's own inchikeys.

    Fingerprints are built from the combined library and test spectrum sets.
    Every prediction must equal the first 14 characters of the test
    spectrum's inchikey, with a score close to 1.0.
    """
    library_spectra, test_spectra = get_library_and_test_spectra()
    combined_spectra = library_spectra + test_spectra
    fingerprints = Fingerprints.from_spectrum_set(combined_spectra, "daylight", 2048)
    predicted_inchikeys, scores = predict_best_possible_match(
        library_spectra,
        test_spectra,
        fingerprints,
    )
    for position, query_spectrum in enumerate(test_spectra.spectra):
        expected_inchikey = query_spectrum.get("inchikey")[:14]
        assert predicted_inchikeys[position] == expected_inchikey
        assert np.allclose(scores[position], np.array(1.0), atol=1e-5)

def test_predict_with_integrated_similarity_flow():
model = ms2deepscore_model()
library_spectra = AnnotatedSpectrumSet.create_spectrum_set(create_test_spectra())
test_spectra = AnnotatedSpectrumSet.create_spectrum_set(create_test_spectra(1))
library_spectra.add_embeddings(model)
test_spectra.add_embeddings(model)
predicted_inchikeys, scores = predict_with_integrated_similarity_flow(library_spectra, test_spectra)
library_spectra, test_spectra = get_library_and_test_spectra()
fingerprints = Fingerprints.from_spectrum_set(library_spectra, "daylight", 4096)
predicted_inchikeys, scores = predict_with_integrated_similarity_flow(library_spectra, test_spectra, fingerprints)

assert predicted_inchikeys == ["RYYVLZVUVIJVGH", "ZPUCINDJVBIVPJ", "ZPUCINDJVBIVPJ"]
assert np.allclose(np.array([0.38829751082577607, 0.3919729335980483, 0.38774130710967564]), np.array(scores))
Expand Down
11 changes: 8 additions & 3 deletions tests/test_predict_using_closest_tanimoto.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import numpy as np
import pytest

from ms2query.benchmarking.Fingerprints import Fingerprints
from ms2query.benchmarking.reference_methods.predict_using_closest_tanimoto import (
get_average_predictions_for_closely_related_metabolites,
get_inchikey_and_tanimoto_scores_for_top_k,
Expand All @@ -18,7 +20,8 @@ def test_predict_using_closest_tanimoto():
test_spectra = AnnotatedSpectrumSet.create_spectrum_set(create_test_spectra(1, nr_of_inchikeys=3))
library_spectra.add_embeddings(model)
test_spectra.add_embeddings(model)
predicted_inchikeys, scores = predict_using_closest_tanimoto(library_spectra, test_spectra, 3, 3)
fingerprints = Fingerprints.from_spectrum_set(library_spectra, "daylight", 2048)
predicted_inchikeys, scores = predict_using_closest_tanimoto(library_spectra, test_spectra, fingerprints, 3, 3)

assert isinstance(predicted_inchikeys, list)
assert len(predicted_inchikeys) == 3
Expand All @@ -32,7 +35,9 @@ def test_predict_using_closest_tanimoto_single_spectrum():
test_spectra = AnnotatedSpectrumSet.create_spectrum_set(create_test_spectra(1, nr_of_inchikeys=3))
library_spectra.add_embeddings(model)
test_spectra.add_embeddings(model)
predicted_inchikey, score = predict_using_closest_tanimoto_single_spectrum(library_spectra, test_spectra, 3, 3)
fingerprints = Fingerprints.from_spectrum_set(library_spectra, "daylight", 2048)

predicted_inchikey, score = predict_using_closest_tanimoto_single_spectrum(library_spectra, test_spectra, 3, 3, fingerprints)

assert isinstance(predicted_inchikey, str)
assert len(predicted_inchikey) ==14
Expand All @@ -57,7 +62,7 @@ def test_get_average_predictions_for_closely_related_metabolites():
test_spectra = test_spectra.copy()[2:]
spectra = AnnotatedSpectrumSet.create_spectrum_set(test_spectra)

inchikeys = list(spectra.inchikey_fingerprint_pairs.keys())[:3]
inchikeys = spectra.inchikeys[:3]
ms2deepscores = np.zeros(len(spectra.spectra))
ms2deepscores[0] = 0.8
ms2deepscores[[1,2,3]] = 0.6
Expand Down
Loading