Add check_values_in_column to check both the SAMPLE_CLASS fieldName and cfDNA samples

danlu1 · danlu1 · commit 30a486a8dda1 · 2024-11-26T00:30:36.000Z
diff --git a/genie/consortium_to_public.py b/genie/consortium_to_public.py
@@ -5,13 +5,8 @@
 
 import pandas as pd
 import synapseutils
-from genie import (
-    create_case_lists,
-    database_to_staging,
-    extract,
-    load,
-    process_functions,
-)
+from genie import (create_case_lists, database_to_staging, extract, load,
+                   process_functions)
 
 logger = logging.getLogger(__name__)
 
@@ -145,13 +140,15 @@ def consortiumToPublic(
     )
 
     # check if SAMPLE_CLASS is present
-    if not process_functions.checkColExist(publicRelease, "SAMPLE_CLASS"):
+    if not process_functions.check_values_in_column(
+        publicRelease, "fieldName", "SAMPLE_CLASS"
+    ):
         logger.error("Must have SAMPLE_CLASS column in the public release scope.")
 
     allClin = clinicalDf[clinicalDf["SAMPLE_ID"].isin(publicReleaseSamples)]
     # check if cfDNA samples are present
-    if not process_functions.has_cfDNA_samples(allClin):
-        logger.error("cfDNA samples should not be filtered out.")
+    if not process_functions.check_values_in_column(allClin, "SAMPLE_CLASS", "cfDNA"):
+        logger.error("cfDNA samples should not be filtered out in the clinical dataframe.")
 
     allClin.to_csv(clinical_path, sep="\t", index=False)
 
diff --git a/genie/process_functions.py b/genie/process_functions.py
@@ -982,15 +982,22 @@ def create_missing_columns(dataset: pd.DataFrame, schema: dict) -> pd.Series:
     return dataset[list(schema.keys())]
 
 
-def has_cfDNA_samples(df: pd.DataFrame) -> bool:
-    """Check if cfDNA exist in SAMPLE_CLASS column of the clinical dataframe.
+def check_values_in_column(
+    df: pd.DataFrame, col: str, values: Union[str, list]
+) -> bool:
+    """Check if a column in a dataframe contains any of the given values
     Args:
         df (pd.DataFrame): The clinical dataframe
+        col (str): The column name
+        values (Union[str, list]): A single value or a list of values to look for
     Returns:
-        bool: True if cfDNA samples exist(s)
+        bool: True if the column contains any of the specified values
     """
-    if not checkColExist(df, "SAMPLE_CLASS"):
-        logger.error("Must have SAMPLE_CLASS column in the dataframe.")
+    if not checkColExist(df, col):
+        logger.error(f"Must have {col} column in the dataframe.")
     else:
-        result = df.SAMPLE_CLASS.isin(["cfDNA"]).any()
+        # Wrap a single string in a list so that isin() receives a list-like
+        if isinstance(values, str):
+            values = [values]
+        result = df[col].isin(values).any()
         return result
diff --git a/tests/test_process_functions.py b/tests/test_process_functions.py
@@ -717,39 +717,71 @@ def test_that_create_missing_columns_returns_expected_output_with_multi_col_df()
 
 
 @pytest.mark.parametrize(
-    "input_df",
-    [
-        pd.DataFrame({"some_col": ["Val1", "Val1", "Val2"]}),
-    ],
-    ids=["missing_SAMPLE_CLASS_column"],
+    "input_df,col,values",
+    [(pd.DataFrame({"some_col": ["Val1", "Val1", "Val2"]}), "test_col", "test_value")],
+    ids=["missing_the_column"],
 )
-def test_has_cfDNA_samples_no_SAMPLE_CLASS_column(input_df):
+def test_check_values_in_column_no_column(input_df, col, values):
     with patch.object(process_functions, "logger") as mock_logger:
-        results = process_functions.has_cfDNA_samples(input_df)
+        results = process_functions.check_values_in_column(input_df, col, values)
     mock_logger.error.assert_called_once_with(
-        "Must have SAMPLE_CLASS column in the dataframe."
+        "Must have test_col column in the dataframe."
     )
 
 
 @pytest.mark.parametrize(
-    "input_df, expected_results",
+    "input_df,col,values,expected_results",
     [
         (
             pd.DataFrame(
                 {"SAMPLE_ID": [1, 2, 3], "SAMPLE_CLASS": ["Val1", "Val1", "Val2"]}
             ),
+            "SAMPLE_CLASS",
+            "cfDNA",
+            False,
+        ),
+        (
+            pd.DataFrame(
+                {"SAMPLE_ID": [1, 2, 3], "SAMPLE_CLASS": ["Val1", "Val1", "Val2"]}
+            ),
+            "SAMPLE_CLASS",
+            ["test_value", "cfDNA"],
             False,
         ),
         (
             pd.DataFrame(
                 {"SAMPLE_ID": [1, 2, 3], "SAMPLE_CLASS": ["cfDNA", "Val1", "Val2"]}
             ),
+            "SAMPLE_CLASS",
+            "cfDNA",
             True,
         ),
+        (
+            pd.DataFrame(
+                {"SAMPLE_ID": [1, 2, 3], "SAMPLE_CLASS": ["cfDNA", "Tumor", "Val2"]}
+            ),
+            "SAMPLE_CLASS",
+            ["cfDNA", "Tumor"],
+            True,
+        ),
+        (
+            pd.DataFrame(
+                {"SAMPLE_ID": [1, 2, 3], "SAMPLE_CLASS": ["cfDNA", "Tumor", "Val2"]}
+            ),
+            "SAMPLE_CLASS",
+            ["cfDNA", "Tumor", "test_value"],
+            True,
+        ),
+    ],
+    ids=[
+        "no_expected_single_value",
+        "no_expected_value_list",
+        "have_expected_single_value",
+        "have_expected_value_list",
+        "have_partial_expected_value_list",
     ],
-    ids=["no_cfDNA_sampless", "have_cfDNA_samples"],
 )
-def test_has_cfDNA_samples_has_SAMPLE_CLASS_column(input_df, expected_results):
-    results = process_functions.has_cfDNA_samples(input_df)
+def test_check_values_in_column_has_column(input_df, col, values, expected_results):
+    results = process_functions.check_values_in_column(input_df, col, values)
 
     assert results == expected_results