Skip to content

Commit 7b4e7f0

Browse files
authored
Merge pull request #5 from matchms/backup
backup because of laptop crash
2 parents 5bea2c3 + 510d4a8 commit 7b4e7f0

File tree

4 files changed

+39
-18
lines changed

4 files changed

+39
-18
lines changed

ms2query/benchmarking/Embeddings.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ def combine_embeddings(cls, embeddings_1: "Embeddings", embeddings_2: "Embedding
3131
if embeddings_1.model_settings != embeddings_2.model_settings:
3232
raise ValueError("Model settings of merged embeddings do not match")
3333
if not set(embeddings_1.index_to_spectrum_hash).isdisjoint(embeddings_2.index_to_spectrum_hash):
34+
# TODO: allow overlapping spectra here, but drop the repeated ones after checking that the duplicates are identical.
3435
raise ValueError("There are repeated spectra in the embeddings that are added together")
3536
combined_embeddings = np.vstack([embeddings_1.embeddings, embeddings_2.embeddings])
3637
index_to_spectrum_hash = embeddings_1.index_to_spectrum_hash + embeddings_2.index_to_spectrum_hash

ms2query/benchmarking/reference_methods/predict_with_integrated_similarity_flow.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,13 +2,16 @@
22
import numpy as np
33
from matchms.similarity.vector_similarity_functions import jaccard_similarity_matrix
44
from tqdm import tqdm
5+
6+
from ms2query.benchmarking.Fingerprints import Fingerprints
57
from ms2query.benchmarking.reference_methods.PredictMS2DeepScoreSimilarity import predict_top_ms2deepscores
68
from ms2query.benchmarking.AnnotatedSpectrumSet import AnnotatedSpectrumSet
79

810

911
def predict_with_integrated_similarity_flow(
1012
library_spectra: AnnotatedSpectrumSet,
1113
query_spectra: AnnotatedSpectrumSet,
14+
fingerprints: Fingerprints,
1215
number_of_analogues_to_consider=50,
1316
) -> Tuple[List[str], List[float]]:
1417

@@ -22,6 +25,7 @@ def predict_with_integrated_similarity_flow(
2225
highest_isf_score, inchikey_of_highest_isf_score = get_highest_isf(
2326
library_spectra,
2427
all_indexes_of_library_spectra_with_highest_score[query_index],
28+
fingerprints,
2529
all_predicted_scores[query_index],
2630
)
2731
inchikeys_of_best_matches.append(inchikey_of_highest_isf_score)
@@ -32,7 +36,8 @@ def predict_with_integrated_similarity_flow(
3236
def get_highest_isf(
3337
library_spectra: AnnotatedSpectrumSet,
3438
indexes_of_library_spectra_with_highest_score: np.ndarray,
35-
predicted_scores: [List[float]],
39+
fingerprints: Fingerprints,
40+
predicted_scores: [List[float]],
3641
):
3742

3843
# Get the corresponding inchikeys
@@ -43,7 +48,8 @@ def get_highest_isf(
4348
predicted_scores, inchikeys_with_highest_ms2deepscore
4449
)
4550
# calculate tanimoto scores
46-
tanimoto_scores = jaccard_similarity_matrix(library_spectra.fingerprints.fingerprints, library_spectra.fingerprints.fingerprints)
51+
library_fingerprints = fingerprints.get_fingerprints(library_spectra.inchikeys)
52+
tanimoto_scores = jaccard_similarity_matrix(library_fingerprints, library_fingerprints)
4753

4854
isf_scores = integrated_similarity_flow(average_scores, tanimoto_scores, nr_of_spectra_per_inchikey)
4955
index_of_highest_score = np.argmax(isf_scores)

tests/test_methods.py

Lines changed: 22 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
import numpy as np
22
import pytest
33
from matchms.similarity.vector_similarity_functions import jaccard_similarity_matrix
4+
5+
from ms2query.benchmarking.Fingerprints import Fingerprints
46
from ms2query.benchmarking.reference_methods.predict_best_possible_match import predict_best_possible_match
57
from ms2query.benchmarking.reference_methods.predict_highest_cosine import predict_highest_cosine
68
from ms2query.benchmarking.reference_methods.predict_highest_ms2deepscore import predict_highest_ms2deepscore
@@ -11,36 +13,43 @@
1113
from ms2query.benchmarking.AnnotatedSpectrumSet import AnnotatedSpectrumSet
1214
from tests.conftest import create_test_spectra, ms2deepscore_model
1315

16+
def get_library_and_test_spectra():
17+
model = ms2deepscore_model()
18+
library_spectra = AnnotatedSpectrumSet.create_spectrum_set(create_test_spectra())
19+
test_spectra = AnnotatedSpectrumSet.create_spectrum_set(create_test_spectra(1))
20+
library_spectra.add_embeddings(model)
21+
test_spectra.add_embeddings(model)
22+
return library_spectra, test_spectra
1423

1524
@pytest.mark.parametrize(
1625
"prediction_function",
1726
[
1827
predict_highest_cosine,
1928
predict_highest_ms2deepscore,
20-
predict_best_possible_match,
2129
],
2230
)
2331
def test_all_methods(prediction_function):
24-
model = ms2deepscore_model()
25-
26-
library_spectra = AnnotatedSpectrumSet.create_spectrum_set(create_test_spectra())
27-
test_spectra = AnnotatedSpectrumSet.create_spectrum_set(create_test_spectra(1))
28-
library_spectra.add_embeddings(model)
29-
test_spectra.add_embeddings(model)
32+
library_spectra, test_spectra = get_library_and_test_spectra()
3033
predicted_inchikeys, scores = prediction_function(library_spectra, test_spectra)
3134
for i, spectrum in enumerate(test_spectra.spectra):
3235
inchikey = spectrum.get("inchikey")[:14]
3336
assert predicted_inchikeys[i] == inchikey
3437
assert np.allclose(scores[i], np.array(1.0), atol=1e-5)
3538

3639

40+
def test_predict_best_possible_match():
41+
library_spectra, test_spectra = get_library_and_test_spectra()
42+
fingerprints = Fingerprints.from_spectrum_set(library_spectra + test_spectra, "daylight", 2048)
43+
predicted_inchikeys, scores = predict_best_possible_match(library_spectra, test_spectra, fingerprints)
44+
for i, spectrum in enumerate(test_spectra.spectra):
45+
inchikey = spectrum.get("inchikey")[:14]
46+
assert predicted_inchikeys[i] == inchikey
47+
assert np.allclose(scores[i], np.array(1.0), atol=1e-5)
48+
3749
def test_predict_with_integrated_similarity_flow():
38-
model = ms2deepscore_model()
39-
library_spectra = AnnotatedSpectrumSet.create_spectrum_set(create_test_spectra())
40-
test_spectra = AnnotatedSpectrumSet.create_spectrum_set(create_test_spectra(1))
41-
library_spectra.add_embeddings(model)
42-
test_spectra.add_embeddings(model)
43-
predicted_inchikeys, scores = predict_with_integrated_similarity_flow(library_spectra, test_spectra)
50+
library_spectra, test_spectra = get_library_and_test_spectra()
51+
fingerprints = Fingerprints.from_spectrum_set(library_spectra, "daylight", 4096)
52+
predicted_inchikeys, scores = predict_with_integrated_similarity_flow(library_spectra, test_spectra, fingerprints)
4453

4554
assert predicted_inchikeys == ["RYYVLZVUVIJVGH", "ZPUCINDJVBIVPJ", "ZPUCINDJVBIVPJ"]
4655
assert np.allclose(np.array([0.38829751082577607, 0.3919729335980483, 0.38774130710967564]), np.array(scores))

tests/test_predict_using_closest_tanimoto.py

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
import numpy as np
22
import pytest
3+
4+
from ms2query.benchmarking.Fingerprints import Fingerprints
35
from ms2query.benchmarking.reference_methods.predict_using_closest_tanimoto import (
46
get_average_predictions_for_closely_related_metabolites,
57
get_inchikey_and_tanimoto_scores_for_top_k,
@@ -18,7 +20,8 @@ def test_predict_using_closest_tanimoto():
1820
test_spectra = AnnotatedSpectrumSet.create_spectrum_set(create_test_spectra(1, nr_of_inchikeys=3))
1921
library_spectra.add_embeddings(model)
2022
test_spectra.add_embeddings(model)
21-
predicted_inchikeys, scores = predict_using_closest_tanimoto(library_spectra, test_spectra, 3, 3)
23+
fingerprints = Fingerprints.from_spectrum_set(library_spectra, "daylight", 2048)
24+
predicted_inchikeys, scores = predict_using_closest_tanimoto(library_spectra, test_spectra, fingerprints, 3, 3)
2225

2326
assert isinstance(predicted_inchikeys, list)
2427
assert len(predicted_inchikeys) == 3
@@ -32,7 +35,9 @@ def test_predict_using_closest_tanimoto_single_spectrum():
3235
test_spectra = AnnotatedSpectrumSet.create_spectrum_set(create_test_spectra(1, nr_of_inchikeys=3))
3336
library_spectra.add_embeddings(model)
3437
test_spectra.add_embeddings(model)
35-
predicted_inchikey, score = predict_using_closest_tanimoto_single_spectrum(library_spectra, test_spectra, 3, 3)
38+
fingerprints = Fingerprints.from_spectrum_set(library_spectra, "daylight", 2048)
39+
40+
predicted_inchikey, score = predict_using_closest_tanimoto_single_spectrum(library_spectra, test_spectra, 3, 3, fingerprints)
3641

3742
assert isinstance(predicted_inchikey, str)
3843
assert len(predicted_inchikey) ==14
@@ -57,7 +62,7 @@ def test_get_average_predictions_for_closely_related_metabolites():
5762
test_spectra = test_spectra.copy()[2:]
5863
spectra = AnnotatedSpectrumSet.create_spectrum_set(test_spectra)
5964

60-
inchikeys = list(spectra.inchikey_fingerprint_pairs.keys())[:3]
65+
inchikeys = spectra.inchikeys[:3]
6166
ms2deepscores = np.zeros(len(spectra.spectra))
6267
ms2deepscores[0] = 0.8
6368
ms2deepscores[[1,2,3]] = 0.6

0 commit comments

Comments
 (0)