Skip to content

Commit af8795d

Browse files
Replaces NaNs with 'missing' (#119)
* replace NaNs with NA
* add NA to unclassified strings
* add 'missing' and NA to missing values
* update tests to allow for NA SMILES
* change test data
* replace NA with 'missing'
* remove redundant string
* remove trailing whitespace
* replace NA with 'missing'
1 parent e58d9ba commit af8795d

9 files changed

+42
-47
lines changed

q2_qemistree/_classyfire.py

+5-3
Original file line numberDiff line numberDiff line change
@@ -48,10 +48,10 @@ def get_classyfire_taxonomy(feature_data: pd.DataFrame) -> pd.DataFrame:
4848
for idx in feature_data.index:
4949
ms2_smiles = feature_data.loc[idx, 'ms2_smiles']
5050
csi_smiles = feature_data.loc[idx, 'csi_smiles']
51-
if pd.notna(ms2_smiles) and not ms2_smiles.isspace():
51+
if ms2_smiles != 'missing':
5252
feature_data.loc[idx, 'smiles'] = ms2_smiles
5353
feature_data.loc[idx, 'structure_source'] = 'MS2'
54-
elif pd.notna(csi_smiles):
54+
elif csi_smiles != 'missing':
5555
feature_data.loc[idx, 'smiles'] = csi_smiles
5656
feature_data.loc[idx, 'structure_source'] = 'CSIFingerID'
5757
else:
@@ -60,12 +60,14 @@ def get_classyfire_taxonomy(feature_data: pd.DataFrame) -> pd.DataFrame:
6060
if feature_data['smiles'].notna().sum() == 0:
6161
raise ValueError("The feature data table should have at least "
6262
"one structural annotation to run Classyfire")
63+
feature_data = feature_data.fillna('missing')
64+
6365
classyfire = {}
6466
no_inchikey = []
6567
unexpected = []
6668
for idx in feature_data.index:
6769
smiles = feature_data.loc[idx, 'smiles']
68-
if pd.notna(smiles):
70+
if smiles != 'missing':
6971
to_inchikey = 'https://gnps-structure.ucsd.edu/inchikey?smiles='
7072
urlencoded_smiles = urllib.parse.quote(smiles)
7173
response = requests.get(to_inchikey+urlencoded_smiles)

q2_qemistree/_plot.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,7 @@ def format_labels(feature_metadata, category, ms2_label, parent_mz):
6565
labels = []
6666

6767
missing_values = {'unclassified', 'unexpected server response',
68-
'SMILE parse error', np.nan}
68+
'SMILE parse error', np.nan, 'NA', 'missing'}
6969

7070
labels.append('LABELS')
7171
labels.append('SEPARATOR TAB')
@@ -74,7 +74,7 @@ def format_labels(feature_metadata, category, ms2_label, parent_mz):
7474
if ms2_label:
7575
for idx in feature_metadata.index:
7676
ms2_compound = feature_metadata.loc[idx, 'ms2_library_match']
77-
if pd.notna(ms2_compound) and not ms2_compound.isspace():
77+
if ms2_compound not in missing_values:
7878
label = ms2_compound
7979
else:
8080
label = feature_metadata.loc[idx, category]

q2_qemistree/_process_fingerprint.py

+6-3
Original file line numberDiff line numberDiff line change
@@ -67,16 +67,19 @@ def get_feature_smiles(csi_result: CSIDirFmt, collated_fps: pd.DataFrame,
6767
csi_summary = pd.read_csv(csi_summary, dtype=str,
6868
sep='\t').set_index('experimentName')
6969
smiles = pd.DataFrame(index=collated_fps.index)
70-
smiles['csi_smiles'] = csi_summary.loc[smiles.index, 'smiles']
70+
smiles['csi_smiles'] = csi_summary.loc[smiles.index, 'smiles'].str.strip()
7171
smiles['ms2_smiles'] = np.nan
7272
smiles['ms2_library_match'] = np.nan
7373
smiles['ms2_adduct'] = np.nan
7474
if ms2_match is not None:
7575
ms2_match.index = ms2_match.index.astype(str)
7676
ms2_ids = ms2_match.index.intersection(smiles.index)
77-
smiles['ms2_smiles'] = ms2_match.loc[ms2_ids, 'Smiles']
78-
smiles['ms2_library_match'] = ms2_match.loc[ms2_ids, 'Compound_Name']
77+
smiles['ms2_smiles'] = ms2_match.loc[ms2_ids, 'Smiles'].str.strip()
78+
smiles['ms2_library_match'] = ms2_match.loc[
79+
ms2_ids, 'Compound_Name']
7980
smiles['ms2_adduct'] = ms2_match.loc[ms2_ids, 'Adduct']
81+
smiles = smiles.fillna('missing').apply(
82+
lambda x: x.replace({' ': 'missing', '': 'missing'}))
8083
return smiles
8184

8285

q2_qemistree/_prune_hierarchy.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ def prune_hierarchy(feature_data: pd.DataFrame, tree: TreeNode,
4242
column)
4343
if column:
4444
failed = {'unclassified', 'unexpected server response',
45-
'SMILE parse error'}
45+
'SMILE parse error', 'missing'}
4646
# remove all NA values or missing values
4747
feature_data = feature_data[~(feature_data[column].isin(failed) |
4848
feature_data[column].isna())]
+12-12
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,12 @@
1-
label #featureID csi_smiles ms2_smiles table_number important not_so_important structure_source ms2_library_match
2-
featurehash1 3 1 a x CSIFingerID Spectral Match to Bleepbloop
3-
featurehash2 7 CCCCCCCCCCCCCCCC(=O)OCC(COP(=O)([O-])OCC[N+](C)(C)C)O 1 a x MS2 Caffeine
4-
featurehash3 2 1 b x MS2 Fakeiine
5-
featurehash4 5 CC(=NC(=O)CC(=NC(=O)C)OOC(=O)C)O 1 c x MS2
6-
featurehash5 6 CC(=NC(=O)CC(=NC(=O)C)OOC(=O)C)O 1 d x MS2
7-
featurehash7 7 CC(=NC(=O)CC(=NC(=O)C)OOC(=O)C)O 1 e y Spectral Match to Glu-Val from NIST14
8-
featurehash8 8 CC(=NC(=O)CC(=NC(=O)C)OOC(=O)C)O 1 f y CSIFingerID Spectral Match to Glu-Val from NIST14
9-
featurehash10 10 CC(=NC(=O)CC(=NC(=O)C)OOC(=O)C)O 1 g y CSIFingerID
10-
featurehash22 22 CC(=NC(=O)CC(=NC(=O)C)OOC(=O)C)O 1 h y
11-
featurehash42 23 CC(=NC(=O)CC(=NC(=O)C)OOC(=O)C)O 1 i y CSIFingerID
12-
featurehash43 24 CC(=NC(=O)CC(=NC(=O)C)OOC(=O)C)O 1 j y
1+
label #featureID csi_smiles ms2_smiles table_number important not_so_important structure_source ms2_library_match
2+
featurehash1 3 missing missing 1 a x CSIFingerID Spectral Match to Bleepbloop
3+
featurehash2 7 CCCCCCCCCCCCCCCC(=O)OCC(COP(=O)([O-])OCC[N+](C)(C)C)O 1 a x MS2 Caffeine
4+
featurehash3 2 missing missing 1 b x MS2 Fakeiine
5+
featurehash4 5 CC(=NC(=O)CC(=NC(=O)C)OOC(=O)C)O 1 c x MS2 missing
6+
featurehash5 6 CC(=NC(=O)CC(=NC(=O)C)OOC(=O)C)O 1 d x MS2 missing
7+
featurehash7 7 CC(=NC(=O)CC(=NC(=O)C)OOC(=O)C)O 1 e y missing Spectral Match to Glu-Val from NIST14
8+
featurehash8 8 CC(=NC(=O)CC(=NC(=O)C)OOC(=O)C)O 1 f y CSIFingerID Spectral Match to Glu-Val from NIST14
9+
featurehash10 10 CC(=NC(=O)CC(=NC(=O)C)OOC(=O)C)O 1 g y CSIFingerID missing
10+
featurehash22 22 CC(=NC(=O)CC(=NC(=O)C)OOC(=O)C)O 1 h y missing missing
11+
featurehash42 23 CC(=NC(=O)CC(=NC(=O)C)OOC(=O)C)O 1 i y CSIFingerID missing
12+
featurehash43 24 CC(=NC(=O)CC(=NC(=O)C)OOC(=O)C)O 1 j y missing missing

q2_qemistree/tests/data/feature_data_no_smiles.txt

-4
This file was deleted.

q2_qemistree/tests/data/feature_data_smiles.txt

-5
This file was deleted.

q2_qemistree/tests/test_classyfire.py

+14-15
Original file line numberDiff line numberDiff line change
@@ -9,30 +9,29 @@
99
from unittest import TestCase, main
1010
import os
1111
import pandas as pd
12-
import numpy as np
1312
from q2_qemistree import get_classyfire_taxonomy
1413

1514

1615
class TestClassyfire(TestCase):
1716
def setUp(self):
18-
THIS_DIR = os.path.dirname(os.path.abspath(__file__))
19-
no_smiles = os.path.join(THIS_DIR, 'data/feature_data_no_smiles.txt')
20-
self.no_smiles = pd.read_csv(no_smiles, sep='\t')
21-
self.no_smiles.set_index('label')
22-
smiles = os.path.join(THIS_DIR, 'data/feature_data_smiles.txt')
23-
self.smiles = pd.read_csv(smiles, sep='\t')
24-
self.smiles.set_index('label')
17+
self.no_smiles = pd.DataFrame(index=['a', 'b', 'c'], data=[1, 2, 3],
18+
columns=['#featureID'])
19+
self.smiles = pd.DataFrame(index=['a', 'b', 'c'], data=[
20+
['missing', 'missing'],
21+
[' O=C(O)[C@@H](N)Cc1ccccc1', 'missing'],
22+
['missing', 'CC(=NC(=O)CC(=NC(=O)C)OOC(=O)C)O']],
23+
columns=['csi_smiles', 'ms2_smiles'])
2524
self.nan_smiles = pd.DataFrame(index=['a', 'b', 'c'],
26-
data=[[np.nan, np.nan],
27-
[np.nan, np.nan],
28-
[np.nan, np.nan]],
25+
data=[['missing', 'missing'],
26+
['missing', 'missing'],
27+
['missing', 'missing']],
2928
columns=['csi_smiles', 'ms2_smiles'])
3029
self.mal_smiles = pd.DataFrame(index=['a', 'b'],
31-
data=[[np.nan, 'foo'],
32-
['bar', np.nan]],
30+
data=[['missing', 'foo'],
31+
['bar', 'missing']],
3332
columns=['csi_smiles', 'ms2_smiles'])
3433
self.levels = set(['kingdom', 'superclass', 'class', 'subclass',
35-
'direct_parent', 'structure_source'])
34+
'direct_parent', 'structure_source'])
3635

3736
def test_no_smiles(self):
3837
msg = ('Feature data table must contain the columns `csi_smiles` '
@@ -55,7 +54,7 @@ def test_classyfire_output(self):
5554
classified = get_classyfire_taxonomy(self.smiles)
5655
classified_mols = classified[classified['kingdom'] != 'unclassified']
5756
self.assertTrue(pd.isna(classified_mols).shape, 0)
58-
self.assertTrue(classified_mols.loc[1,
57+
self.assertTrue(classified_mols.loc['b',
5958
'kingdom'] == 'Organic compounds')
6059
self.assertTrue((self.levels.issubset(set(classified.columns))))
6160

q2_qemistree/tests/test_make_hierarchy.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -71,8 +71,8 @@ def test_mergeFeatureDataSingle(self):
7171
fdata_featrs = sorted(list(merged_fdata.index))
7272
self.assertEqual('csi_smiles' in merged_fdata.columns, True)
7373
self.assertEqual('ms2_smiles' in merged_fdata.columns, True)
74-
self.assertEqual(len(merged_fdata[pd.notna(
75-
merged_fdata.ms2_smiles)]), 1)
74+
self.assertEqual(len(
75+
merged_fdata[merged_fdata['ms2_smiles'] != 'missing']), 1)
7676
self.assertEqual(len(featrs) == 3, True)
7777
self.assertEqual(fdata_featrs, featrs)
7878

0 commit comments

Comments
 (0)