Skip to content

Make library creation more robust #202

@florian-huber

Description

@florian-huber

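I'm not sure whether this is intended behavior, but I just had a workflow break because a single SMILES string could not be converted to a fingerprint.
As the traceback below shows, `Chem.MolFromSmiles` returned `None` for that SMILES, and the `None` was passed straight to `Chem.RDKFingerprint`, which raised an `ArgumentError` and aborted the whole library creation.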

ArgumentError                             Traceback (most recent call last)
Cell In[6], line 6
      1 library_creator = LibraryFilesCreator(spectrums,
      2                                       output_directory=r"D:\summer_school_copenhagen_2023\ms2query_library2023", 
      3                                       #ms2ds_model_file_name=ms2ds_model_file_name, 
      4                                       #s2v_model_file_name=s2v_model_file_name, 
      5                                      ) 
----> 6 library_creator.create_all_library_files()

File d:\ms2query\ms2query\create_new_library\library_files_creator.py:114, in LibraryFilesCreator.create_all_library_files(self)
    111 def create_all_library_files(self):
    112     """Creates files with embeddings and a sqlite file with spectra data
    113     """
--> 114     self.create_sqlite_file()
    115     self.store_s2v_embeddings()
    116     self.store_ms2ds_embeddings()

File d:\ms2query\ms2query\create_new_library\library_files_creator.py:124, in LibraryFilesCreator.create_sqlite_file(self)
    122 else:
    123     compound_classes_df = None
--> 124 make_sqlfile_wrapper(
    125     self.sqlite_file_name,
    126     self.list_of_spectra,
    127     columns_dict={"precursor_mz": "REAL"},
    128     compound_classes=compound_classes_df,
    129     progress_bars=self.progress_bars,
    130 )

File d:\ms2query\ms2query\create_new_library\create_sqlite_database.py:53, in make_sqlfile_wrapper(sqlite_file_name, list_of_spectra, columns_dict, compound_classes, progress_bars)
     49 initialize_tables(sqlite_file_name, additional_metadata_columns_dict=columns_dict,
     50                   additional_inchikey_columns=additional_inchikey_columns)
     51 fill_spectrum_data_table(sqlite_file_name, list_of_spectra, progress_bar=progress_bars)
---> 53 fill_inchikeys_table(sqlite_file_name, list_of_spectra,
     54                      compound_classes=compound_classes,
     55                      progress_bars=progress_bars)

File d:\ms2query\ms2query\create_new_library\create_sqlite_database.py:204, in fill_inchikeys_table(sqlite_file_name, list_of_spectra, compound_classes, progress_bars)
    201 conn = sqlite3.connect(sqlite_file_name)
    202 cur = conn.cursor()
--> 204 closest_related_inchikey14s = calculate_highest_tanimoto_score(list_of_spectra, list_of_spectra, 10)
    206 # Fill table
    207 for inchikey14 in tqdm(spectra_belonging_to_inchikey14,
    208                        desc="Adding inchikey14s to sqlite table",
    209                        disable=not progress_bars):

File d:\ms2query\ms2query\create_new_library\calculate_tanimoto_scores.py:92, in calculate_highest_tanimoto_score(query_spectra, library_spectra, nr_of_top_inchikeys)
     88 def calculate_highest_tanimoto_score(query_spectra,
     89                                      library_spectra,
     90                                      nr_of_top_inchikeys):
     91     """Returns the highest scoring library spectra in """
---> 92     tanimoto_scores_df = calculate_tanimoto_scores_unique_inchikey(query_spectra, library_spectra)
     93     unique_query_inchikeys = list(tanimoto_scores_df.index)
     94     highest_score_dict = {}

File d:\ms2query\ms2query\create_new_library\calculate_tanimoto_scores.py:54, in calculate_tanimoto_scores_unique_inchikey(list_of_spectra_1, list_of_spectra_2)
     51 list_of_smiles_1 = [spectrum.get("smiles") for spectrum in spectra_with_most_frequent_inchi_per_inchikey_1]
     52 list_of_smiles_2 = [spectrum.get("smiles") for spectrum in spectra_with_most_frequent_inchi_per_inchikey_2]
---> 54 tanimoto_scores = calculate_tanimoto_scores_from_smiles(list_of_smiles_1, list_of_smiles_2)
     55 tanimoto_df = pd.DataFrame(tanimoto_scores, index=unique_inchikeys_1, columns=unique_inchikeys_2)
     56 return tanimoto_df

File d:\ms2query\ms2query\create_new_library\calculate_tanimoto_scores.py:27, in calculate_tanimoto_scores_from_smiles(list_of_smiles_1, list_of_smiles_2)
     24 def calculate_tanimoto_scores_from_smiles(list_of_smiles_1: List[str],
     25                                           list_of_smiles_2: List[str]) -> np.ndarray:
     26     """Returns a 2d ndarray containing the tanimoto scores between the smiles"""
---> 27     fingerprints_1 = np.array([get_fingerprint(spectrum) for spectrum in tqdm(list_of_smiles_1,
     28                                                                               desc="Calculating fingerprints")])
     29     fingerprints_2 = np.array([get_fingerprint(spectrum) for spectrum in tqdm(list_of_smiles_2,
     30                                                                               desc="Calculating fingerprints")])
     31     print("Calculating tanimoto scores")

File d:\ms2query\ms2query\create_new_library\calculate_tanimoto_scores.py:27, in <listcomp>(.0)
     24 def calculate_tanimoto_scores_from_smiles(list_of_smiles_1: List[str],
     25                                           list_of_smiles_2: List[str]) -> np.ndarray:
     26     """Returns a 2d ndarray containing the tanimoto scores between the smiles"""
---> 27     fingerprints_1 = np.array([get_fingerprint(spectrum) for spectrum in tqdm(list_of_smiles_1,
     28                                                                               desc="Calculating fingerprints")])
     29     fingerprints_2 = np.array([get_fingerprint(spectrum) for spectrum in tqdm(list_of_smiles_2,
     30                                                                               desc="Calculating fingerprints")])
     31     print("Calculating tanimoto scores")

File d:\ms2query\ms2query\create_new_library\calculate_tanimoto_scores.py:18, in get_fingerprint(smiles)
     17 def get_fingerprint(smiles: str):
---> 18     fingerprint = np.array(Chem.RDKFingerprint(Chem.MolFromSmiles(smiles), fpSize=2048))
     19     assert isinstance(fingerprint, np.ndarray), \
     20         f"Fingerprint for 1 spectrum could not be set smiles is {smiles}"
     21     return fingerprint

ArgumentError: Python argument types in
    rdkit.Chem.rdmolops.RDKFingerprint(NoneType)
did not match C++ signature:
    RDKFingerprint(class RDKit::ROMol mol, unsigned int minPath=1, unsigned int maxPath=7, unsigned int fpSize=2048, unsigned int nBitsPerHash=2, bool useHs=True, double tgtDensity=0.0, unsigned int minSize=128, bool branchedPaths=True, bool useBondOrder=True, class boost::python::api::object atomInvariants=0, class boost::python::api::object fromAtoms=0, class boost::python::api::object atomBits=None, class boost::python::api::object bitInfo=None)

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions