Replace NaNs with 'missing' (#119)

anupriyatripathi · web-flow · commit af8795deadd0 · 2020-02-21T12:14:40.000-08:00
* replace NaNs with NA

* add NA to unclassified strings

* add 'missing' and 'NA' to the set of missing values

* update tests to allow for NA smiles

* change test data

* replace NA with missing

* remove redundant string

* remove trailing whitespace

* replace NA with missing
diff --git a/q2_qemistree/_classyfire.py b/q2_qemistree/_classyfire.py
@@ -48,10 +48,10 @@ def get_classyfire_taxonomy(feature_data: pd.DataFrame) -> pd.DataFrame:
     for idx in feature_data.index:
         ms2_smiles = feature_data.loc[idx, 'ms2_smiles']
         csi_smiles = feature_data.loc[idx, 'csi_smiles']
-        if pd.notna(ms2_smiles) and not ms2_smiles.isspace():
+        if ms2_smiles != 'missing':
             feature_data.loc[idx, 'smiles'] = ms2_smiles
             feature_data.loc[idx, 'structure_source'] = 'MS2'
-        elif pd.notna(csi_smiles):
+        elif csi_smiles != 'missing':
             feature_data.loc[idx, 'smiles'] = csi_smiles
             feature_data.loc[idx, 'structure_source'] = 'CSIFingerID'
         else:
@@ -60,12 +60,14 @@ def get_classyfire_taxonomy(feature_data: pd.DataFrame) -> pd.DataFrame:
     if feature_data['smiles'].notna().sum() == 0:
         raise ValueError("The feature data table should have at least "
                          "one structural annotation to run Classyfire")
+    feature_data = feature_data.fillna('missing')
+
     classyfire = {}
     no_inchikey = []
     unexpected = []
     for idx in feature_data.index:
         smiles = feature_data.loc[idx, 'smiles']
-        if pd.notna(smiles):
+        if smiles != 'missing':
             to_inchikey = 'https://gnps-structure.ucsd.edu/inchikey?smiles='
             urlencoded_smiles = urllib.parse.quote(smiles)
             response = requests.get(to_inchikey+urlencoded_smiles)
diff --git a/q2_qemistree/_plot.py b/q2_qemistree/_plot.py
@@ -65,7 +65,7 @@ def format_labels(feature_metadata, category, ms2_label, parent_mz):
     labels = []
 
     missing_values = {'unclassified', 'unexpected server response',
-                      'SMILE parse error', np.nan}
+                      'SMILE parse error', np.nan, 'NA', 'missing'}
 
     labels.append('LABELS')
     labels.append('SEPARATOR TAB')
@@ -74,7 +74,7 @@ def format_labels(feature_metadata, category, ms2_label, parent_mz):
     if ms2_label:
         for idx in feature_metadata.index:
             ms2_compound = feature_metadata.loc[idx, 'ms2_library_match']
-            if pd.notna(ms2_compound) and not ms2_compound.isspace():
+            if ms2_compound not in missing_values:
                 label = ms2_compound
             else:
                 label = feature_metadata.loc[idx, category]
diff --git a/q2_qemistree/_process_fingerprint.py b/q2_qemistree/_process_fingerprint.py
@@ -67,16 +67,19 @@ def get_feature_smiles(csi_result: CSIDirFmt, collated_fps: pd.DataFrame,
     csi_summary = pd.read_csv(csi_summary, dtype=str,
                               sep='\t').set_index('experimentName')
     smiles = pd.DataFrame(index=collated_fps.index)
-    smiles['csi_smiles'] = csi_summary.loc[smiles.index, 'smiles']
+    smiles['csi_smiles'] = csi_summary.loc[smiles.index, 'smiles'].str.strip()
     smiles['ms2_smiles'] = np.nan
     smiles['ms2_library_match'] = np.nan
     smiles['ms2_adduct'] = np.nan
     if ms2_match is not None:
         ms2_match.index = ms2_match.index.astype(str)
         ms2_ids = ms2_match.index.intersection(smiles.index)
-        smiles['ms2_smiles'] = ms2_match.loc[ms2_ids, 'Smiles']
-        smiles['ms2_library_match'] = ms2_match.loc[ms2_ids, 'Compound_Name']
+        smiles['ms2_smiles'] = ms2_match.loc[ms2_ids, 'Smiles'].str.strip()
+        smiles['ms2_library_match'] = ms2_match.loc[
+            ms2_ids, 'Compound_Name']
         smiles['ms2_adduct'] = ms2_match.loc[ms2_ids, 'Adduct']
+    smiles = smiles.fillna('missing').apply(
+        lambda x: x.replace({' ': 'missing', '': 'missing'}))
     return smiles
 
 
diff --git a/q2_qemistree/_prune_hierarchy.py b/q2_qemistree/_prune_hierarchy.py
@@ -42,7 +42,7 @@ def prune_hierarchy(feature_data: pd.DataFrame, tree: TreeNode,
                          column)
     if column:
         failed = {'unclassified', 'unexpected server response',
-                  'SMILE parse error'}
+                  'SMILE parse error', 'missing'}
         # remove all NA values or missing values
         feature_data = feature_data[~(feature_data[column].isin(failed) |
                                     feature_data[column].isna())]
diff --git a/q2_qemistree/tests/data/feature_data_itol.txt b/q2_qemistree/tests/data/feature_data_itol.txt
@@ -1,12 +1,12 @@
-label	#featureID	csi_smiles	ms2_smiles	table_number	important	not_so_important	structure_source	ms2_library_match
-featurehash1	3			1	a	x	CSIFingerID	Spectral Match to Bleepbloop
-featurehash2	7	CCCCCCCCCCCCCCCC(=O)OCC(COP(=O)([O-])OCC[N+](C)(C)C)O		1	a	x	MS2	Caffeine
-featurehash3	2			1	b	x	MS2	Fakeiine
-featurehash4	5	CC(=NC(=O)CC(=NC(=O)C)OOC(=O)C)O		1	c	x	MS2	
-featurehash5	6	CC(=NC(=O)CC(=NC(=O)C)OOC(=O)C)O		1	d	x	MS2	
-featurehash7	7	CC(=NC(=O)CC(=NC(=O)C)OOC(=O)C)O		1	e	y		Spectral Match to Glu-Val from NIST14
-featurehash8	8	CC(=NC(=O)CC(=NC(=O)C)OOC(=O)C)O		1	f	y	CSIFingerID	Spectral Match to Glu-Val from NIST14
-featurehash10	10	CC(=NC(=O)CC(=NC(=O)C)OOC(=O)C)O		1	g	y	CSIFingerID	
-featurehash22	22	CC(=NC(=O)CC(=NC(=O)C)OOC(=O)C)O		1	h	y		
-featurehash42	23	CC(=NC(=O)CC(=NC(=O)C)OOC(=O)C)O		1	i	y	CSIFingerID	
-featurehash43	24	CC(=NC(=O)CC(=NC(=O)C)OOC(=O)C)O		1	j	y		
+label	#featureID	csi_smiles	ms2_smiles	table_number	important	not_so_important	structure_source	ms2_library_match	
+featurehash1	3	missing	missing	1	a	x	CSIFingerID	Spectral Match to Bleepbloop	
+featurehash2	7	CCCCCCCCCCCCCCCC(=O)OCC(COP(=O)([O-])OCC[N+](C)(C)C)O		1	a	x	MS2	Caffeine	
+featurehash3	2	missing	missing	1	b	x	MS2	Fakeiine	
+featurehash4	5	CC(=NC(=O)CC(=NC(=O)C)OOC(=O)C)O		1	c	x	MS2	missing	
+featurehash5	6	CC(=NC(=O)CC(=NC(=O)C)OOC(=O)C)O		1	d	x	MS2	missing	
+featurehash7	7	CC(=NC(=O)CC(=NC(=O)C)OOC(=O)C)O		1	e	y	missing	Spectral Match to Glu-Val from NIST14	
+featurehash8	8	CC(=NC(=O)CC(=NC(=O)C)OOC(=O)C)O		1	f	y	CSIFingerID	Spectral Match to Glu-Val from NIST14	
+featurehash10	10	CC(=NC(=O)CC(=NC(=O)C)OOC(=O)C)O		1	g	y	CSIFingerID	missing	
+featurehash22	22	CC(=NC(=O)CC(=NC(=O)C)OOC(=O)C)O		1	h	y	missing	missing	
+featurehash42	23	CC(=NC(=O)CC(=NC(=O)C)OOC(=O)C)O		1	i	y	CSIFingerID	missing	
+featurehash43	24	CC(=NC(=O)CC(=NC(=O)C)OOC(=O)C)O		1	j	y	missing	missing	
diff --git a/q2_qemistree/tests/data/feature_data_no_smiles.txt b/q2_qemistree/tests/data/feature_data_no_smiles.txt
diff --git a/q2_qemistree/tests/data/feature_data_smiles.txt b/q2_qemistree/tests/data/feature_data_smiles.txt
diff --git a/q2_qemistree/tests/test_classyfire.py b/q2_qemistree/tests/test_classyfire.py
@@ -9,30 +9,29 @@
 from unittest import TestCase, main
 import os
 import pandas as pd
-import numpy as np
 from q2_qemistree import get_classyfire_taxonomy
 
 
 class TestClassyfire(TestCase):
     def setUp(self):
-        THIS_DIR = os.path.dirname(os.path.abspath(__file__))
-        no_smiles = os.path.join(THIS_DIR, 'data/feature_data_no_smiles.txt')
-        self.no_smiles = pd.read_csv(no_smiles, sep='\t')
-        self.no_smiles.set_index('label')
-        smiles = os.path.join(THIS_DIR, 'data/feature_data_smiles.txt')
-        self.smiles = pd.read_csv(smiles, sep='\t')
-        self.smiles.set_index('label')
+        self.no_smiles = pd.DataFrame(index=['a', 'b', 'c'], data=[1, 2, 3],
+                                      columns=['#featureID'])
+        self.smiles = pd.DataFrame(index=['a', 'b', 'c'], data=[
+            ['missing', 'missing'],
+            [' O=C(O)[C@@H](N)Cc1ccccc1', 'missing'],
+            ['missing', 'CC(=NC(=O)CC(=NC(=O)C)OOC(=O)C)O']],
+            columns=['csi_smiles', 'ms2_smiles'])
         self.nan_smiles = pd.DataFrame(index=['a', 'b', 'c'],
-                                       data=[[np.nan, np.nan],
-                                             [np.nan, np.nan],
-                                             [np.nan, np.nan]],
+                                       data=[['missing', 'missing'],
+                                             ['missing', 'missing'],
+                                             ['missing', 'missing']],
                                        columns=['csi_smiles', 'ms2_smiles'])
         self.mal_smiles = pd.DataFrame(index=['a', 'b'],
-                                       data=[[np.nan, 'foo'],
-                                             ['bar', np.nan]],
+                                       data=[['missing', 'foo'],
+                                             ['bar', 'missing']],
                                        columns=['csi_smiles', 'ms2_smiles'])
         self.levels = set(['kingdom', 'superclass', 'class', 'subclass',
-                          'direct_parent', 'structure_source'])
+                           'direct_parent', 'structure_source'])
 
     def test_no_smiles(self):
         msg = ('Feature data table must contain the columns `csi_smiles` '
@@ -55,7 +54,7 @@ def test_classyfire_output(self):
         classified = get_classyfire_taxonomy(self.smiles)
         classified_mols = classified[classified['kingdom'] != 'unclassified']
         self.assertTrue(pd.isna(classified_mols).shape, 0)
-        self.assertTrue(classified_mols.loc[1,
+        self.assertTrue(classified_mols.loc['b',
                         'kingdom'] == 'Organic compounds')
         self.assertTrue((self.levels.issubset(set(classified.columns))))
 
diff --git a/q2_qemistree/tests/test_make_hierarchy.py b/q2_qemistree/tests/test_make_hierarchy.py
@@ -71,8 +71,8 @@ def test_mergeFeatureDataSingle(self):
         fdata_featrs = sorted(list(merged_fdata.index))
         self.assertEqual('csi_smiles' in merged_fdata.columns, True)
         self.assertEqual('ms2_smiles' in merged_fdata.columns, True)
-        self.assertEqual(len(merged_fdata[pd.notna(
-            merged_fdata.ms2_smiles)]), 1)
+        self.assertEqual(len(
+            merged_fdata[merged_fdata['ms2_smiles'] != 'missing']), 1)
         self.assertEqual(len(featrs) == 3, True)
         self.assertEqual(fdata_featrs, featrs)