Skip to content

Commit 567f3f0

Browse files
authored
[GEN-1622] remove sample class filter (#581)
* remove sample_class_filter function * add function to check both SAMPLE_CLASS fieldName and cfDNA sample * add function to check SAMPLE_CLASS and cfDNA existence
1 parent 28c33ee commit 567f3f0

File tree

4 files changed

+120
-27
lines changed

4 files changed

+120
-27
lines changed

genie/consortium_to_public.py

Lines changed: 17 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,10 @@
22

33
import logging
44
import os
5+
import sys
56

6-
import synapseutils
77
import pandas as pd
8-
8+
import synapseutils
99
from genie import (
1010
create_case_lists,
1111
database_to_staging,
@@ -15,6 +15,9 @@
1515
)
1616

1717
logger = logging.getLogger(__name__)
18+
stdout_handler = logging.StreamHandler(stream=sys.stdout)
19+
stdout_handler.setLevel(logging.INFO)
20+
logger.addHandler(stdout_handler)
1821

1922

2023
# TODO: Add to transform.py
@@ -119,8 +122,6 @@ def consortiumToPublic(
119122
clinicalDf, processingDate, publicReleaseCutOff
120123
)
121124
logger.info("SAMPLE CLASS FILTER")
122-
remove_sc_samples = database_to_staging.sample_class_filter(clinical_df=clinicalDf)
123-
removeForPublicSamples = list(set(removeForPublicSamples).union(remove_sc_samples))
124125
# comment back in when public release filter back on
125126
# publicReleaseSamples = publicReleaseSamples.append(keepForPublicSamples)
126127
# Make sure all null oncotree codes are removed
@@ -147,7 +148,19 @@ def consortiumToPublic(
147148
query_string=f"SELECT * FROM {clinical_tier_release_scope_synid} where releaseScope = 'public'",
148149
)
149150

151+
# check if SAMPLE_CLASS is present
152+
if not process_functions.check_values_in_column(
153+
publicRelease, "fieldName", "SAMPLE_CLASS"
154+
):
155+
logger.error("Must have SAMPLE_CLASS column in the public release scope.")
156+
150157
allClin = clinicalDf[clinicalDf["SAMPLE_ID"].isin(publicReleaseSamples)]
158+
# check if cfDNA samples are present
159+
if not process_functions.check_values_in_column(allClin, "SAMPLE_CLASS", "cfDNA"):
160+
logger.error(
161+
"cfDNA samples should not be filtered out in the clinical dataframe."
162+
)
163+
151164
allClin.to_csv(clinical_path, sep="\t", index=False)
152165

153166
gene_matrixdf = gene_matrixdf[gene_matrixdf["SAMPLE_ID"].isin(publicReleaseSamples)]

genie/database_to_staging.py

Lines changed: 0 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -501,24 +501,6 @@ def seq_date_filter(clinicalDf, processingDate, consortiumReleaseCutOff):
501501
return removeSeqDateSamples
502502

503503

504-
def sample_class_filter(clinical_df: pd.DataFrame) -> list:
    """Collect the SAMPLE_IDs of cfDNA samples so they can be filtered out.

    Note:
        Commit 567f3f0 (GEN-1622) removes this function from
        genie/database_to_staging.py; it is shown here as it existed
        before the removal.

    Args:
        clinical_df (pd.DataFrame): Clinical dataframe

    Returns:
        list: SAMPLE_ID values of the rows whose SAMPLE_CLASS is "cfDNA".
            Empty when the dataframe has no SAMPLE_CLASS column.
    """
    # Without a SAMPLE_CLASS column there is nothing to classify, so no
    # sample is flagged for removal.
    if "SAMPLE_CLASS" not in clinical_df:
        return []
    is_cfdna = clinical_df["SAMPLE_CLASS"] == "cfDNA"
    return clinical_df.loc[is_cfdna, "SAMPLE_ID"].tolist()
520-
521-
522504
# TODO: Add to transform.py
523505
def mutation_in_cis_filter(
524506
syn,

genie/process_functions.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -980,3 +980,24 @@ def create_missing_columns(dataset: pd.DataFrame, schema: dict) -> pd.Series:
980980
elif data_type == "boolean":
981981
dataset[column] = dataset[column].astype(pd.BooleanDtype())
982982
return dataset[list(schema.keys())]
983+
984+
985+
def check_values_in_column(
986+
df: pd.DataFrame, col: str, values: Union[str, list]
987+
) -> bool:
988+
"""Check if a column in a dataframe contains specific values
989+
Args:
990+
df (pd.DataFrame): The clinical dataframe
991+
col (str): The column name
992+
values (list): Expected values in the column
993+
Returns:
994+
bool: True if the column contains the specified values
995+
"""
996+
if not checkColExist(df, col):
997+
logger.error(f"Must have {col} column in the dataframe.")
998+
else:
999+
# Ensure values is always a list for next step
1000+
if isinstance(values, str):
1001+
values = [values]
1002+
result = df[col].isin(values).any()
1003+
return result

tests/test_process_functions.py

Lines changed: 82 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,17 @@
1-
from unittest.mock import Mock, patch
21
import uuid
2+
from unittest.mock import Mock, patch
33

44
import pandas as pd
5+
import pytest
6+
import synapseclient
7+
from genie import process_functions
58
from pandas.api.types import (
69
is_bool_dtype,
710
is_float_dtype,
811
is_integer_dtype,
912
is_string_dtype,
1013
)
1114
from pandas.testing import assert_frame_equal
12-
import pytest
13-
import synapseclient
14-
15-
from genie import process_functions
1615

1716
DATABASE_DF = pd.DataFrame(
1817
{
@@ -715,3 +714,81 @@ def test_that_create_missing_columns_returns_expected_output_with_multi_col_df()
715714
assert result.isna().sum().sum() == 11
716715

717716
assert_frame_equal(result, expected_output, check_exact=True)
717+
718+
719+
@pytest.mark.parametrize(
    "input_df,col,values",
    [(pd.DataFrame({"some_col": ["Val1", "Val1", "Val2"]}), "test_col", "test_value")],
    ids=["missing_the_column"],
)
def test_check_values_in_column_no_column(input_df, col, values):
    """A missing column is reported through the logger and yields no match."""
    with patch.object(process_functions, "logger") as mock_logger:
        results = process_functions.check_values_in_column(input_df, col, values)
        mock_logger.error.assert_called_once_with(
            "Must have test_col column in the dataframe."
        )
    # The call must not report a match when the column does not exist.
    assert not results
730+
731+
732+
@pytest.mark.parametrize(
    "input_df,col,values,expected_results",
    [
        pytest.param(
            pd.DataFrame(
                {"SAMPLE_ID": [1, 2, 3], "SAMPLE_CLASS": ["Val1", "Val1", "Val2"]}
            ),
            "SAMPLE_CLASS",
            "cfDNA",
            False,
            id="no_expected_single_value",
        ),
        pytest.param(
            pd.DataFrame(
                {"SAMPLE_ID": [1, 2, 3], "SAMPLE_CLASS": ["Val1", "Val1", "Val2"]}
            ),
            "SAMPLE_CLASS",
            ["test_value", "cfDNA"],
            False,
            id="no_expected_value_list",
        ),
        pytest.param(
            pd.DataFrame(
                {"SAMPLE_ID": [1, 2, 3], "SAMPLE_CLASS": ["cfDNA", "Val1", "Val2"]}
            ),
            "SAMPLE_CLASS",
            "cfDNA",
            True,
            id="have_expected_single_value",
        ),
        pytest.param(
            pd.DataFrame(
                {"SAMPLE_ID": [1, 2, 3], "SAMPLE_CLASS": ["cfDNA", "Tumor", "Val2"]}
            ),
            "SAMPLE_CLASS",
            ["cfDNA", "Tumor"],
            True,
            id="have_expected_value_list",
        ),
        pytest.param(
            pd.DataFrame(
                {"SAMPLE_ID": [1, 2, 3], "SAMPLE_CLASS": ["cfDNA", "Tumor", "Val2"]}
            ),
            "SAMPLE_CLASS",
            ["cfDNA", "Tumor", "test_value"],
            True,
            id="have_partial_expected_value_list",
        ),
        pytest.param(
            pd.DataFrame({"SAMPLE_ID": [], "SAMPLE_CLASS": []}),
            "SAMPLE_CLASS",
            ["cfDNA", "Tumor", "test_value"],
            False,
            id="empty_dataframe_with_required_column",
        ),
    ],
)
def test_check_values_in_column_has_column(input_df, col, values, expected_results):
    """When the column exists, the result says whether any requested value occurs."""
    found = process_functions.check_values_in_column(input_df, col, values)

    assert found == expected_results

0 commit comments

Comments
 (0)