Merge pull request #198 from iomega/fix_zenodo_version

niekdejonge · web-flow · commit 34ebedef5233 · 2023-08-07T17:30:49.000+02:00
Change the default additional metadata to retention_time
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -5,6 +5,14 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## 1.2.2
+### fixed
+- Set version of matchmsextras to 0.4.0, to fix dependency issue
+- Fix test with wrong sklearn version.
+
+### Changed
+- Set default additional metadata from rtinseconds to retention_time
+
 ## 1.2.1
 ### fixed
 - Fix bug in downloading models from command line
diff --git a/README.md b/README.md
@@ -114,7 +114,7 @@ optional arguments:
   --download            This will download the most up to date model and library.The model will be stored in the folder given as the second argumentThe model will be downloaded in the in the ionization mode specified under --mode
   --results RESULTS     The folder in which the results should be stored. The default is a new results folder in the folder with the spectra
   --filter_ionmode      Filter out all spectra that are not in the specified ion-mode. The ion mode can be specified by using --ionmode
-  --addional_metadata   Return additional metadata columns in the results, for example --additional_metadata rtinseconds feature_id
+  --additional_metadata Return additional metadata columns in the results, for example --additional_metadata retention_time feature_id
 ```
 
 ## Build MS2Query into other tools
diff --git a/ms2query/__init__.py b/ms2query/__init__.py
@@ -51,10 +51,10 @@ def command_line():
                         help="Filter out all spectra that are not in the specified ion-mode. "
                              "The ion mode can be specified by using --ionmode")
     parser.add_argument("--additional_metadata", action="store",
-                        default=("rtinseconds", "feature_id",),
+                        default=("retention_time", "feature_id",),
                         nargs="+",
                         type=str,
-                        help="Return additional metadata columns in the results, for example --additional_metadata rtinseconds feature_id")
+                        help="Return additional metadata columns in the results, for example --additional_metadata retention_time feature_id")
     args = parser.parse_args()
     ms2query_library_files_directory = args.library
     ms2_spectra_location = args.spectra
diff --git a/ms2query/__version__.py b/ms2query/__version__.py
@@ -1 +1 @@
-__version__ = '1.2.1'
+__version__ = '1.2.2'
diff --git a/ms2query/create_new_library/train_models.py b/ms2query/create_new_library/train_models.py
@@ -6,9 +6,9 @@
 import os
 from spec2vec.model_building import train_new_word2vec_model
 from ms2query.create_new_library.train_ms2deepscore import train_ms2deepscore_wrapper
-from ms2query.create_new_library.train_ms2query_model import train_ms2query_model
+from ms2query.create_new_library.train_ms2query_model import train_ms2query_model, convert_to_onnx_model
 from ms2query.create_new_library.library_files_creator import LibraryFilesCreator
-from ms2query.utils import load_matchms_spectrum_objects_from_file, convert_to_onnx_model
+from ms2query.utils import load_matchms_spectrum_objects_from_file
 from ms2query.clean_and_filter_spectra import create_spectrum_documents, clean_normalize_and_split_annotated_spectra
 
 
diff --git a/ms2query/create_new_library/train_ms2query_model.py b/ms2query/create_new_library/train_ms2query_model.py
@@ -6,6 +6,8 @@
 import os
 from typing import List
 import pandas as pd
+from onnxconverter_common import FloatTensorType
+from skl2onnx import convert_sklearn
 from tqdm import tqdm
 from matchms import Spectrum
 from sklearn.ensemble import RandomForestRegressor
@@ -15,7 +17,7 @@
 from ms2query.create_new_library.library_files_creator import LibraryFilesCreator
 from ms2query.create_new_library.split_data_for_training import split_spectra_on_inchikeys, split_training_and_validation_spectra
 from ms2query.create_new_library.calculate_tanimoto_scores import calculate_tanimoto_scores_from_smiles
-from ms2query.utils import save_pickled_file
+from ms2query.utils import save_pickled_file, return_non_existing_file_name
 
 
 class DataCollectorForTraining():
@@ -142,3 +144,17 @@ def train_ms2query_model(training_spectra,
     # Train MS2Query model
     ms2query_model = train_random_forest(training_scores, training_labels)
     return ms2query_model
+
+
+def convert_to_onnx_model(random_forest_model, file_name = None):
+    """The random forest model is stored as an onnx model for backwards compatibility"""
+    FloatTensorType([None, 5])
+    onnx = convert_sklearn(random_forest_model, initial_types=[("input",
+                                                        FloatTensorType([None, random_forest_model.n_features_in_]))],
+                   target_opset=12)
+    if file_name is not None:
+        file_name = return_non_existing_file_name(file_name)
+
+        with open(file_name, "wb") as file:
+            file.write(onnx.SerializeToString())
+    return onnx
diff --git a/ms2query/utils.py b/ms2query/utils.py
@@ -5,8 +5,6 @@
 import numpy as np
 from matchms import importing
 from spec2vec.Spec2Vec import Spectrum
-from skl2onnx import convert_sklearn
-from skl2onnx.common.data_types import FloatTensorType
 from onnxruntime import InferenceSession
 
 
@@ -216,20 +214,6 @@ def __init__(self,
         self.filter_on_ion_mode = filter_on_ion_mode
 
 
-def convert_to_onnx_model(random_forest_model, file_name = None):
-    """The randomforest model is stored as an onnx model for backwards compatability"""
-    FloatTensorType([None, 5])
-    onnx = convert_sklearn(random_forest_model, initial_types=[("input",
-                                                        FloatTensorType([None, random_forest_model.n_features_in_]))],
-                   target_opset=12)
-    if file_name is not None:
-        file_name = return_non_existing_file_name(file_name)
-
-        with open(file_name, "wb") as file:
-            file.write(onnx.SerializeToString())
-    return onnx
-
-
 def predict_onnx_model(random_forest_onnx_model: InferenceSession, input_values):
     """Makes predictions for an onnx model"""
     # input_name = random_forest_onnx_model.get_inputs()[0].name
diff --git a/setup.py b/setup.py
@@ -40,7 +40,7 @@
         "ms2deepscore",
         "gensim>=4.0.0",
         "pandas>=1.2.5,<2.0.0",
-        "matchmsextras>=0.4.0",
+        "matchmsextras==0.4.0",
         "pubchempy", #This is a dependency for matchmsextras, which is missing in setup
         "tqdm",
         "matplotlib",
diff --git a/tests/test_files/general_test_files/test_ms2q_rf_model.pickle b/tests/test_files/general_test_files/test_ms2q_rf_model.pickle
diff --git a/tests/test_train_ms2query_model.py b/tests/test_train_ms2query_model.py
@@ -1,10 +1,14 @@
 import os
+
+import numpy as np
 import pytest
 import sys
 import pandas as pd
 from ms2query.create_new_library.train_ms2query_model import \
-    DataCollectorForTraining, calculate_tanimoto_scores_with_library, train_random_forest, train_ms2query_model
-from ms2query.utils import load_pickled_file, load_matchms_spectrum_objects_from_file, convert_to_onnx_model
+    DataCollectorForTraining, calculate_tanimoto_scores_with_library, train_random_forest, train_ms2query_model, \
+    convert_to_onnx_model
+from ms2query.utils import load_pickled_file, load_matchms_spectrum_objects_from_file, load_ms2query_model, \
+    predict_onnx_model
 from onnxruntime import InferenceSession
 from ms2query.utils import predict_onnx_model
 from ms2query.ms2library import MS2Library
@@ -76,15 +80,19 @@ def test_calculate_all_tanimoto_scores(tmp_path, ms2library, query_spectra):
     pd.testing.assert_frame_equal(result, expected_result, check_dtype=False)
 
 
-def test_train_random_forest():
+def test_train_and_save_random_forest():
     training_scores, training_labels = load_pickled_file(os.path.join(
         os.path.split(os.path.dirname(__file__))[0],
         "tests/test_files/test_files_train_ms2query_nn",
         "expected_train_and_val_data.pickle"))[:2]
     ms2query_model = train_random_forest(training_scores, training_labels)
     onnx_model = convert_to_onnx_model(ms2query_model)
     onnx_model_session = InferenceSession(onnx_model.SerializeToString())
-    predictions = predict_onnx_model(onnx_model_session, training_scores.values)
+    predictions_onnx_model = predict_onnx_model(onnx_model_session, training_scores.values)
+
+    # check if saving onnx model works
+    predictions_sklearn_model = ms2query_model.predict(training_scores.values.astype(np.float32))
+    assert np.allclose(predictions_onnx_model, predictions_sklearn_model)
 
 
 @pytest.mark.integration
diff --git a/tests/test_utils.py b/tests/test_utils.py
@@ -2,14 +2,11 @@
 from io import StringIO
 from typing import List
 
-import numpy as np
 import pandas as pd
 import pytest
 from matchms import Spectrum
 from ms2query.utils import (add_unknown_charges_to_spectra,
-                            load_matchms_spectrum_objects_from_file,
-                            load_pickled_file,
-                            convert_to_onnx_model, load_ms2query_model, predict_onnx_model)
+                            load_matchms_spectrum_objects_from_file)
 
 
 def test_convert_files_to_matchms_spectrum_objects_unknown_file(tmp_path):
@@ -51,24 +48,6 @@ def test_add_unknown_charges_to_spectra(hundred_test_spectra):
         assert spectrum.get("charge") == 2, "The charge is expected to be 2"
 
 
-def test_save_as_onnx_model(tmp_path):
-    path_to_test_dir = os.path.join(
-        os.path.split(os.path.dirname(__file__))[0],
-        'tests/test_files/')
-    rf_model_file = os.path.join(path_to_test_dir, 'general_test_files', "test_ms2q_rf_model.pickle")
-    rf_model = load_pickled_file(rf_model_file)
-    expected_result = load_pickled_file(os.path.join(
-        os.path.split(os.path.dirname(__file__))[0],
-        "tests/test_files/test_files_train_ms2query_nn",
-        "expected_train_and_val_data.pickle"))[0]
-    new_model = os.path.join(tmp_path, "rf_model.onnx")
-    convert_to_onnx_model(rf_model, new_model)
-    ms2query_model = load_ms2query_model(new_model)
-    result = predict_onnx_model(ms2query_model, expected_result.values)
-    original_result = rf_model.predict(expected_result.values.astype(np.float32))
-    assert np.allclose(result, original_result)
-
-
 def check_correct_results_csv_file(dataframe_found: pd.DataFrame,
                                    expected_headers: List[str],
                                    nr_of_rows_to_check=2):

Original file line number	Diff line number	Diff line change
`@@ -1 +1 @@`
`1`		`-__version__ = '1.2.1'`
	`1`	`+__version__ = '1.2.2'`