Skip to content

Commit 5287a90

Browse files
Merge pull request #599 from biocore/csymons_metadata_freetext
Drop Free-text Columns in Metadata Repo
2 parents 210e5c3 + e91fc91 commit 5287a90

File tree

2 files changed

+78
-2
lines changed

2 files changed

+78
-2
lines changed

microsetta_private_api/repo/metadata_repo/_repo.py

+24-1
Original file line numberDiff line numberDiff line change
@@ -74,7 +74,9 @@ def drop_private_columns(df):
7474
# sensitive in nature
7575
pm_remove = {c.lower() for c in df.columns if c.lower().startswith('pm_')}
7676

77-
remove = pm_remove | {c.lower() for c in EBI_REMOVE}
77+
freetext_fields = {c.lower() for c in _get_freetext_fields()}
78+
79+
remove = pm_remove | {c.lower() for c in EBI_REMOVE} | freetext_fields
7880
to_drop = [c for c in df.columns if c.lower() in remove]
7981

8082
return df.drop(columns=to_drop, inplace=False)
@@ -634,3 +636,24 @@ def _find_duplicates(barcodes):
634636
}
635637

636638
return dups, error
639+
640+
641+
def _get_freetext_fields():
    """Fetch the short names of all free-text survey questions.

    A question is selected when the response type joined to it is either
    'TEXT' or 'STRING'.

    Returns
    -------
    list of str
        The question_shortname of every matching row, in the order the
        database returned them.
    """
    query = (
        "SELECT sq.question_shortname "
        "FROM ag.survey_question sq "
        "INNER JOIN ag.survey_question_response_type sqrtype "
        "ON sq.survey_question_id = sqrtype.survey_question_id "
        "WHERE survey_response_type IN ('TEXT', 'STRING')"
    )

    with Transaction() as txn, txn.cursor() as cursor:
        cursor.execute(query)
        # Each row carries a single column; keep only its value
        return [record[0] for record in cursor.fetchall()]

microsetta_private_api/repo/metadata_repo/tests/test_repo.py

+54-1
Original file line numberDiff line numberDiff line change
@@ -17,10 +17,13 @@
1717
_fetch_observed_survey_templates,
1818
_construct_multiselect_map,
1919
_find_best_answers,
20-
drop_private_columns)
20+
drop_private_columns,
21+
_get_freetext_fields,
22+
EBI_REMOVE)
2123
from microsetta_private_api.repo.survey_template_repo import SurveyTemplateRepo
2224
from microsetta_private_api.model.account import Account
2325
from microsetta_private_api.model.address import Address
26+
from microsetta_private_api.repo.transaction import Transaction
2427

2528

2629
class MM:
@@ -329,6 +332,32 @@ def test_drop_private_columns(self):
329332
obs = drop_private_columns(df)
330333
pdt.assert_frame_equal(obs, exp)
331334

335+
def test_drop_private_columns_freetext(self):
    """Free-text columns are dropped even when absent from EBI_REMOVE."""
    # ALL_ROOMMATES must not be listed in EBI_REMOVE, otherwise this test
    # would pass without exercising the free-text lookup at all.
    # assertNotIn/assertIn report the offending member and container on
    # failure, unlike assertTrue/assertFalse on an `in` expression.
    self.assertNotIn("ALL_ROOMMATES", EBI_REMOVE)

    # ALL_ROOMMATES must be reported as a free-text field
    freetext_fields = _get_freetext_fields()
    self.assertIn("ALL_ROOMMATES", freetext_fields)

    # Test dataframe based on the one in test_drop_private_columns, with
    # the ALL_ROOMMATES column added
    df = pd.DataFrame([[1, 2, 3, 4], [5, 6, 7, 8]],
                      columns=[
                          'pM_foo',
                          'okay',
                          'ABOUT_yourSELF_TEXT',
                          'ALL_ROOMMATES'])

    # Only the "okay" column is expected to remain
    exp = pd.DataFrame([[2, ], [6, ]], columns=['okay'])
    obs = drop_private_columns(df)
    pdt.assert_frame_equal(obs, exp)
360+
332361
def test_build_col_name(self):
333362
tests_and_expected = [('foo', 'bar', 'foo_bar'),
334363
('foo', 'bar baz', 'foo_bar_baz')]
@@ -512,6 +541,30 @@ def test_find_best_answers(self):
512541
with self.assertRaises(KeyError):
513542
_ = obs[0]['response']['111']
514543

544+
def test_get_freetext_fields(self):
    """_get_freetext_fields returns one entry per free-text response type."""
    with Transaction() as t:
        with t.cursor() as cur:
            # Count the free-text response-type rows that exist in the
            # database, independently of the function under test
            cur.execute(
                "SELECT COUNT(*) "
                "FROM ag.survey_question_response_type "
                "WHERE survey_response_type IN ('TEXT', 'STRING')"
            )
            row = cur.fetchone()
            freetext_count = row[0]

    # Use the _get_freetext_fields() function to pull the actual list
    freetext_fields = _get_freetext_fields()

    # The number of fields returned must match the database count
    self.assertEqual(len(freetext_fields), freetext_count)

    # A few known free-text fields must be present; assertIn names the
    # missing field on failure, unlike assertTrue on an `in` expression
    for known_field in ("ABOUT_YOURSELF_TEXT",
                        "ALL_ROOMMATES",
                        "DIET_RESTRICTIONS"):
        self.assertIn(known_field, freetext_fields)
567+
515568

516569
if __name__ == '__main__':
517570
unittest.main()

0 commit comments

Comments
 (0)