Skip to content

Commit de2909b

Browse files
authored
[GEN-1704] Filter out germline variants from sv files (#583)
* initial filtering * correct col name * add casing support for status col * add missing param
1 parent 567f3f0 commit de2909b

File tree

2 files changed

+74
-1
lines changed

2 files changed

+74
-1
lines changed

genie/database_to_staging.py

Lines changed: 23 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -682,6 +682,28 @@ def store_gene_panel_files(
682682
return genePanelEntities
683683

684684

685+
def filter_out_germline_variants(
    input_data: pd.DataFrame, status_col_str: str
) -> pd.DataFrame:
    """Remove germline variants from the data.

    The Genie pipeline cannot have any germline variants. NOTE: We have to
    search for the status column case-insensitively because there's no column
    name validation in the release steps, so the status column may have
    different casing (e.g. ``SV_STATUS`` vs ``SV_Status``).

    Args:
        input_data (pd.DataFrame): input data with germline variants to
            filter out
        status_col_str (str): name of the status column to search for;
            matched against the column names ignoring case

    Returns:
        pd.DataFrame: copy of the input data without the rows whose status
            is exactly "GERMLINE", with the index reset to 0..n-1

    Raises:
        ValueError: if no column of ``input_data`` matches ``status_col_str``
    """
    # Lowercase the search string once rather than once per column.
    target = status_col_str.lower()
    # str(col) guards against non-string column labels (e.g. integers).
    matching_cols = [
        col for col in input_data.columns if str(col).lower() == target
    ]
    if not matching_cols:
        # Fail loudly: silently skipping the filter could leak germline
        # variants into a release.
        raise ValueError(
            f"Could not find status column '{status_col_str}' "
            f"(case-insensitive) in columns: {list(input_data.columns)}"
        )
    # If several columns differ only by case, the first one wins.
    status_col = matching_cols[0]
    return input_data[input_data[status_col] != "GERMLINE"].reset_index(drop=True)
705+
706+
685707
# TODO: add to load.py
686708
def store_sv_files(
687709
syn: synapseclient.Synapse,
@@ -716,7 +738,6 @@ def store_sv_files(
716738
# sv_df["ENTREZ_GENE_ID"].mask(
717739
# sv_df["ENTREZ_GENE_ID"] == 0, float("nan"), inplace=True
718740
# )
719-
720741
if not current_release_staging:
721742
sv_staging_df = sv_df[
722743
sv_df["SAMPLE_ID"].isin(keep_for_center_consortium_samples)
@@ -735,6 +756,7 @@ def store_sv_files(
735756
)
736757

737758
sv_df = sv_df[sv_df["SAMPLE_ID"].isin(keep_for_merged_consortium_samples)]
759+
sv_df = filter_out_germline_variants(input_data=sv_df, status_col_str="SV_STATUS")
738760
sv_df.rename(columns=transform._col_name_to_titlecase, inplace=True)
739761
sv_text = process_functions.removePandasDfFloat(sv_df)
740762
sv_path = os.path.join(GENIE_RELEASE_DIR, "data_sv.txt")

tests/test_database_to_staging.py

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,10 @@
33
import os
44
from unittest import mock
55
from unittest.mock import patch
6+
import pytest
67

78
import pandas as pd
9+
from pandas.testing import assert_frame_equal
810
import synapseclient
911

1012
from genie import database_to_staging, extract, load
@@ -106,3 +108,52 @@ def test_store_assay_info_files(syn):
106108
used=f"{FILEVIEW_SYNID}.2",
107109
)
108110
assert wes_ids == ["A"]
111+
112+
113+
@pytest.mark.parametrize(
    "input_data, filter_col, expected_result",
    [
        pytest.param(
            pd.DataFrame(
                {
                    "SV_STATUS": ["GERMLINE", "GERMLINE"],
                    "Sample_ID": ["GENIE-1", "GENIE-2"],
                }
            ),
            "SV_STATUS",
            pd.DataFrame(columns=["SV_STATUS", "Sample_ID"]),
            id="all_germline",
        ),
        pytest.param(
            pd.DataFrame(
                {
                    "SV_STATUS": ["GERMLINE", "SOMATIC"],
                    "Sample_ID": ["GENIE-1", "GENIE-2"],
                }
            ),
            "SV_STATUS",
            pd.DataFrame({"SV_STATUS": ["SOMATIC"], "Sample_ID": ["GENIE-2"]}),
            id="some_germline",
        ),
        pytest.param(
            pd.DataFrame(
                {
                    "SV_STATUS": ["SOMATIC", "SOMATIC"],
                    "Sample_ID": ["GENIE-1", "GENIE-2"],
                }
            ),
            "SV_STATUS",
            pd.DataFrame(
                {
                    "SV_STATUS": ["SOMATIC", "SOMATIC"],
                    "Sample_ID": ["GENIE-1", "GENIE-2"],
                }
            ),
            id="no_germline",
        ),
        pytest.param(
            pd.DataFrame(
                {
                    "SV_Status": ["GERMLINE", "SOMATIC"],
                    "Sample_ID": ["GENIE-1", "GENIE-2"],
                }
            ),
            "SV_STATUS",
            pd.DataFrame({"SV_Status": ["SOMATIC"], "Sample_ID": ["GENIE-2"]}),
            id="diff_status_col_case",
        ),
    ],
)
def test_that_filter_out_germline_variants_returns_expected(
    input_data, filter_col, expected_result
):
    """Check that rows with a GERMLINE status are dropped for each case.

    Covers all-germline, partially-germline and germline-free inputs, plus a
    status column whose casing differs from the search string.
    """
    filtered = database_to_staging.filter_out_germline_variants(
        input_data, filter_col
    )
    # Index type is not compared: the all-germline expectation is built from
    # column names only.
    assert_frame_equal(filtered, expected_result, check_index_type=False)

0 commit comments

Comments
 (0)