Skip to content

Commit 799d2aa

Browse files
committed
initial filtering
1 parent 567f3f0 commit 799d2aa

File tree

2 files changed

+57
-0
lines changed

2 files changed

+57
-0
lines changed

genie/database_to_staging.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -682,6 +682,20 @@ def store_gene_panel_files(
682682
return genePanelEntities
683683

684684

685+
def filter_out_germline_variants(
    input_data: pd.DataFrame, status_col: str
) -> pd.DataFrame:
    """Filter out germline variants given a status column.

    The Genie pipeline cannot have any of these variants.

    Args:
        input_data (pd.DataFrame): input data with germline variants to
            filter out
        status_col (str): name of the status column in ``input_data``

    Returns:
        pd.DataFrame: new dataframe holding only the rows whose status is
            not exactly "GERMLINE", with the index renumbered from 0
    """
    # The comparison is an exact, case-sensitive match on "GERMLINE".
    is_not_germline = input_data[status_col] != "GERMLINE"
    # drop=True discards the old row labels instead of keeping them as a
    # column, so the result has a clean 0..n-1 index.
    return input_data[is_not_germline].reset_index(drop=True)
697+
698+
685699
# TODO: add to load.py
686700
def store_sv_files(
687701
syn: synapseclient.Synapse,
@@ -735,6 +749,7 @@ def store_sv_files(
735749
)
736750

737751
sv_df = sv_df[sv_df["SAMPLE_ID"].isin(keep_for_merged_consortium_samples)]
752+
sv_df = filter_out_germline_variants(input_data = sv_df, status_col = "SV_Status")
738753
sv_df.rename(columns=transform._col_name_to_titlecase, inplace=True)
739754
sv_text = process_functions.removePandasDfFloat(sv_df)
740755
sv_path = os.path.join(GENIE_RELEASE_DIR, "data_sv.txt")

tests/test_database_to_staging.py

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,10 @@
33
import os
44
from unittest import mock
55
from unittest.mock import patch
6+
import pytest
67

78
import pandas as pd
9+
from pandas.testing import assert_frame_equal
810
import synapseclient
911

1012
from genie import database_to_staging, extract, load
@@ -106,3 +108,43 @@ def test_store_assay_info_files(syn):
106108
used=f"{FILEVIEW_SYNID}.2",
107109
)
108110
assert wes_ids == ["A"]
111+
112+
113+
@pytest.mark.parametrize(
    "input_data, filter_col, expected_result",
    [
        (
            pd.DataFrame(
                dict(
                    SV_Status=["GERMLINE", "GERMLINE"], Sample_ID=["GENIE-1", "GENIE-2"]
                )
            ),
            "SV_Status",
            pd.DataFrame(columns=["SV_Status", "Sample_ID"]),
        ),
        (
            pd.DataFrame(
                dict(
                    SV_Status=["GERMLINE", "SOMATIC"], Sample_ID=["GENIE-1", "GENIE-2"]
                )
            ),
            "SV_Status",
            pd.DataFrame(dict(SV_Status=["SOMATIC"], Sample_ID=["GENIE-2"])),
        ),
        (
            pd.DataFrame(
                dict(SV_Status=["SOMATIC", "SOMATIC"], Sample_ID=["GENIE-1", "GENIE-2"])
            ),
            "SV_Status",
            pd.DataFrame(
                dict(SV_Status=["SOMATIC", "SOMATIC"], Sample_ID=["GENIE-1", "GENIE-2"])
            ),
        ),
    ],
    ids=["all_germline", "some_germline", "no_germline"],
)
def test_that_filter_out_germline_variants_returns_expected(
    input_data, filter_col, expected_result
):
    """Check germline filtering when all, some, or no rows are GERMLINE."""
    result = database_to_staging.filter_out_germline_variants(input_data, filter_col)
    # The all-germline case expects an empty frame built from column names
    # only, so the index type is not compared.
    assert_frame_equal(result, expected_result, check_index_type=False)

0 commit comments

Comments
 (0)