Skip to content

Commit 458376b

Browse files
committed
add casing support for status col
1 parent 0cd8f8c commit 458376b

File tree

2 files changed

+21
-6
lines changed

2 files changed

+21
-6
lines changed

genie/database_to_staging.py

Lines changed: 11 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -683,18 +683,24 @@ def store_gene_panel_files(
683683

684684

685685
def filter_out_germline_variants(
686-
input_data: pd.DataFrame, status_col: str
686+
input_data: pd.DataFrame, status_col_str: str
687687
) -> pd.DataFrame:
688-
"""Filters out germline variants given a status col. Genie pipeline
689-
cannot have any of these variants.
688+
"""Filters out germline variants given a status col str. Genie pipeline
689+
cannot have any of these variants. NOTE: We have to search for the
690+
status column because there's no column name validation in the release
691+
steps so the status column may have different casing.
690692
691693
Args:
692694
input_data (pd.DataFrame): input data with germline variants to filter out
693-
status_col (str): status column for the data
695+
status_col_str (str): search string for the status column for the data
694696
695697
Returns:
696698
pd.DataFrame: filtered out germline variant data
697699
"""
700+
# find status col SV_Status
701+
status_col = [
702+
col for col in input_data.columns if col.lower() == status_col_str.lower()
703+
][0]
698704
return input_data[input_data[status_col] != "GERMLINE"].reset_index(drop=True)
699705

700706

@@ -750,7 +756,7 @@ def store_sv_files(
750756
)
751757

752758
sv_df = sv_df[sv_df["SAMPLE_ID"].isin(keep_for_merged_consortium_samples)]
753-
sv_df = filter_out_germline_variants(input_data=sv_df, status_col="SV_STATUS")
759+
sv_df = filter_out_germline_variants(input_data=sv_df)
754760
sv_df.rename(columns=transform._col_name_to_titlecase, inplace=True)
755761
sv_text = process_functions.removePandasDfFloat(sv_df)
756762
sv_path = os.path.join(GENIE_RELEASE_DIR, "data_sv.txt")

tests/test_database_to_staging.py

Lines changed: 10 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -140,8 +140,17 @@ def test_store_assay_info_files(syn):
140140
dict(SV_STATUS=["SOMATIC", "SOMATIC"], Sample_ID=["GENIE-1", "GENIE-2"])
141141
),
142142
),
143+
(
144+
pd.DataFrame(
145+
dict(SV_Status=["GERMLINE", "SOMATIC"], Sample_ID=["GENIE-1", "GENIE-2"])
146+
),
147+
"SV_STATUS",
148+
pd.DataFrame(
149+
dict(SV_Status=["SOMATIC"], Sample_ID=["GENIE-2"])
150+
),
151+
),
143152
],
144-
ids=["all_germline", "some_germline", "no_germline"],
153+
ids=["all_germline", "some_germline", "no_germline", "diff_status_col_case"],
145154
)
146155
def test_that_filter_out_germline_variants_returns_expected(
147156
input_data, filter_col, expected_result

0 commit comments

Comments
 (0)