Skip to content

Commit 30a486a

Browse files
committed
add function to check both SAMPLE_CLASS fieldName and cfDNA sample
1 parent ec60286 commit 30a486a

File tree

3 files changed

+64
-28
lines changed

3 files changed

+64
-28
lines changed

genie/consortium_to_public.py

Lines changed: 7 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -5,13 +5,8 @@
55

66
import pandas as pd
77
import synapseutils
8-
from genie import (
9-
create_case_lists,
10-
database_to_staging,
11-
extract,
12-
load,
13-
process_functions,
14-
)
8+
from genie import (create_case_lists, database_to_staging, extract, load,
9+
process_functions)
1510

1611
logger = logging.getLogger(__name__)
1712

@@ -145,13 +140,15 @@ def consortiumToPublic(
145140
)
146141

147142
# check if SAMPLE_CLASS is present
148-
if not process_functions.checkColExist(publicRelease, "SAMPLE_CLASS"):
143+
if not process_functions.check_values_in_column(
144+
publicRelease, "fieldName", "SAMPLE_CLASS"
145+
):
149146
logger.error("Must have SAMPLE_CLASS column in the public release scope.")
150147

151148
allClin = clinicalDf[clinicalDf["SAMPLE_ID"].isin(publicReleaseSamples)]
152149
# check if cfDNA samples are present
153-
if not process_functions.has_cfDNA_samples(allClin):
154-
logger.error("cfDNA samples should not be filtered out.")
150+
if not process_functions.check_values_in_column(allClin, "SAMPLE_CLASS", "cfDNA"):
151+
logger.error("cfDNA samples should not be filtered out in the clinical dataframe.")
155152

156153
allClin.to_csv(clinical_path, sep="\t", index=False)
157154

genie/process_functions.py

Lines changed: 13 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -982,15 +982,22 @@ def create_missing_columns(dataset: pd.DataFrame, schema: dict) -> pd.Series:
982982
return dataset[list(schema.keys())]
983983

984984

985-
def has_cfDNA_samples(df: pd.DataFrame) -> bool:
986-
"""Check if cfDNA exist in SAMPLE_CLASS column of the clinical dataframe.
985+
def check_values_in_column(
986+
df: pd.DataFrame, col: str, values: Union[str, list]
987+
) -> bool:
988+
"""Check if a column in a dataframe contains specific values
987989
Args:
988990
df (pd.DataFrame): The dataframe to check
991+
col (str): The column name
992+
values (Union[str, list]): Expected value(s) in the column; a single string is treated as a one-element list
989993
Returns:
990-
bool: True if cfDNA samples exist(s)
994+
bool: True if the column contains at least one of the specified values
991995
"""
992-
if not checkColExist(df, "SAMPLE_CLASS"):
993-
logger.error("Must have SAMPLE_CLASS column in the dataframe.")
996+
if not checkColExist(df, col):
997+
logger.error(f"Must have {col} column in the dataframe.")
994998
else:
995-
result = df.SAMPLE_CLASS.isin(["cfDNA"]).any()
999+
# Ensure values is always a list for next step
1000+
if isinstance(values, str):
1001+
values = [values]
1002+
result = df[col].isin(values).any()
9961003
return result

tests/test_process_functions.py

Lines changed: 44 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -717,39 +717,71 @@ def test_that_create_missing_columns_returns_expected_output_with_multi_col_df()
717717

718718

719719
@pytest.mark.parametrize(
720-
"input_df",
721-
[
722-
pd.DataFrame({"some_col": ["Val1", "Val1", "Val2"]}),
723-
],
724-
ids=["missing_SAMPLE_CLASS_column"],
720+
"input_df,col,values",
721+
[(pd.DataFrame({"some_col": ["Val1", "Val1", "Val2"]}), "test_col", "test_value")],
722+
ids=["missing_the_column"],
725723
)
726-
def test_has_cfDNA_samples_no_SAMPLE_CLASS_column(input_df):
724+
def test_check_values_in_column_no_column(input_df, col, values):
727725
with patch.object(process_functions, "logger") as mock_logger:
728-
results = process_functions.has_cfDNA_samples(input_df)
726+
results = process_functions.check_values_in_column(input_df, col, values)
729727
mock_logger.error.assert_called_once_with(
730-
"Must have SAMPLE_CLASS column in the dataframe."
728+
"Must have test_col column in the dataframe."
731729
)
732730

733731

734732
@pytest.mark.parametrize(
735-
"input_df, expected_results",
733+
"input_df,col,values,expected_results",
736734
[
737735
(
738736
pd.DataFrame(
739737
{"SAMPLE_ID": [1, 2, 3], "SAMPLE_CLASS": ["Val1", "Val1", "Val2"]}
740738
),
739+
"SAMPLE_CLASS",
740+
"cfDNA",
741+
False,
742+
),
743+
(
744+
pd.DataFrame(
745+
{"SAMPLE_ID": [1, 2, 3], "SAMPLE_CLASS": ["Val1", "Val1", "Val2"]}
746+
),
747+
"SAMPLE_CLASS",
748+
["test_value", "cfDNA"],
741749
False,
742750
),
743751
(
744752
pd.DataFrame(
745753
{"SAMPLE_ID": [1, 2, 3], "SAMPLE_CLASS": ["cfDNA", "Val1", "Val2"]}
746754
),
755+
"SAMPLE_CLASS",
756+
"cfDNA",
747757
True,
748758
),
759+
(
760+
pd.DataFrame(
761+
{"SAMPLE_ID": [1, 2, 3], "SAMPLE_CLASS": ["cfDNA", "Tumor", "Val2"]}
762+
),
763+
"SAMPLE_CLASS",
764+
["cfDNA", "Tumor"],
765+
True,
766+
),
767+
(
768+
pd.DataFrame(
769+
{"SAMPLE_ID": [1, 2, 3], "SAMPLE_CLASS": ["cfDNA", "Tumor", "Val2"]}
770+
),
771+
"SAMPLE_CLASS",
772+
["cfDNA", "Tumor", "test_value"],
773+
True,
774+
),
775+
],
776+
ids=[
777+
"no_expected_single_value",
778+
"no_expected_value_list",
779+
"have_expected_single_value",
780+
"have_expected_value_list",
781+
"have_partial_expected_value_list",
749782
],
750-
ids=["no_cfDNA_sampless", "have_cfDNA_samples"],
751783
)
752-
def test_has_cfDNA_samples_has_SAMPLE_CLASS_column(input_df, expected_results):
753-
results = process_functions.has_cfDNA_samples(input_df)
784+
def test_check_values_in_column_has_column(input_df, col, values, expected_results):
785+
results = process_functions.check_values_in_column(input_df, col, values)
754786

755787
assert results == expected_results

0 commit comments

Comments
 (0)