Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 23 additions & 1 deletion genie/database_to_staging.py
Original file line number Diff line number Diff line change
Expand Up @@ -682,6 +682,28 @@ def store_gene_panel_files(
return genePanelEntities


def filter_out_germline_variants(
    input_data: pd.DataFrame, status_col_str: str
) -> pd.DataFrame:
    """Filter out germline variants given a status column search string.

    The Genie pipeline cannot have any germline variants. NOTE: the status
    column is searched for case-insensitively because there's no column name
    validation in the release steps, so the status column may have different
    casing (e.g. ``SV_Status`` vs ``SV_STATUS``).

    Args:
        input_data (pd.DataFrame): input data with germline variants to
            filter out
        status_col_str (str): search string for the status column of the
            data; compared to the column names ignoring case

    Returns:
        pd.DataFrame: new dataframe without the rows whose status value is
            exactly "GERMLINE", with the index reset to start at 0

    Raises:
        ValueError: if no column of ``input_data`` matches ``status_col_str``
    """
    wanted = status_col_str.lower()
    # Take the first column whose name matches ignoring case. Non-string
    # labels can never match, so skip them instead of failing on `.lower()`.
    status_col = next(
        (
            col
            for col in input_data.columns
            if isinstance(col, str) and col.lower() == wanted
        ),
        None,
    )
    if status_col is None:
        # Fail loudly: silently returning the data unfiltered could let
        # germline variants through.
        raise ValueError(
            f"No column matching '{status_col_str}' (case-insensitive) found "
            f"in input data. Available columns: {list(input_data.columns)}"
        )
    # Rows with a missing status compare as not equal to "GERMLINE" and are kept.
    is_not_germline = input_data[status_col] != "GERMLINE"
    return input_data[is_not_germline].reset_index(drop=True)


# TODO: add to load.py
def store_sv_files(
syn: synapseclient.Synapse,
Expand Down Expand Up @@ -716,7 +738,6 @@ def store_sv_files(
# sv_df["ENTREZ_GENE_ID"].mask(
# sv_df["ENTREZ_GENE_ID"] == 0, float("nan"), inplace=True
# )

if not current_release_staging:
sv_staging_df = sv_df[
sv_df["SAMPLE_ID"].isin(keep_for_center_consortium_samples)
Expand All @@ -735,6 +756,7 @@ def store_sv_files(
)

sv_df = sv_df[sv_df["SAMPLE_ID"].isin(keep_for_merged_consortium_samples)]
sv_df = filter_out_germline_variants(input_data=sv_df, status_col_str="SV_STATUS")
sv_df.rename(columns=transform._col_name_to_titlecase, inplace=True)
sv_text = process_functions.removePandasDfFloat(sv_df)
sv_path = os.path.join(GENIE_RELEASE_DIR, "data_sv.txt")
Expand Down
51 changes: 51 additions & 0 deletions tests/test_database_to_staging.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,10 @@
import os
from unittest import mock
from unittest.mock import patch
import pytest

import pandas as pd
from pandas.testing import assert_frame_equal
import synapseclient

from genie import database_to_staging, extract, load
Expand Down Expand Up @@ -106,3 +108,52 @@ def test_store_assay_info_files(syn):
used=f"{FILEVIEW_SYNID}.2",
)
assert wes_ids == ["A"]


@pytest.mark.parametrize(
    "input_data, filter_col, expected_result",
    [
        pytest.param(
            pd.DataFrame(
                {
                    "SV_STATUS": ["GERMLINE", "GERMLINE"],
                    "Sample_ID": ["GENIE-1", "GENIE-2"],
                }
            ),
            "SV_STATUS",
            pd.DataFrame(columns=["SV_STATUS", "Sample_ID"]),
            id="all_germline",
        ),
        pytest.param(
            pd.DataFrame(
                {
                    "SV_STATUS": ["GERMLINE", "SOMATIC"],
                    "Sample_ID": ["GENIE-1", "GENIE-2"],
                }
            ),
            "SV_STATUS",
            pd.DataFrame({"SV_STATUS": ["SOMATIC"], "Sample_ID": ["GENIE-2"]}),
            id="some_germline",
        ),
        pytest.param(
            pd.DataFrame(
                {
                    "SV_STATUS": ["SOMATIC", "SOMATIC"],
                    "Sample_ID": ["GENIE-1", "GENIE-2"],
                }
            ),
            "SV_STATUS",
            pd.DataFrame(
                {
                    "SV_STATUS": ["SOMATIC", "SOMATIC"],
                    "Sample_ID": ["GENIE-1", "GENIE-2"],
                }
            ),
            id="no_germline",
        ),
        pytest.param(
            pd.DataFrame(
                {
                    "SV_Status": ["GERMLINE", "SOMATIC"],
                    "Sample_ID": ["GENIE-1", "GENIE-2"],
                }
            ),
            "SV_STATUS",
            pd.DataFrame({"SV_Status": ["SOMATIC"], "Sample_ID": ["GENIE-2"]}),
            id="diff_status_col_case",
        ),
    ],
)
def test_that_filter_out_germline_variants_returns_expected(
    input_data, filter_col, expected_result
):
    """Check the germline filter output for each parametrized input frame.

    The cases cover frames where all, some or none of the rows are
    "GERMLINE", plus one where the status column name is cased differently
    from the search string.
    """
    filtered = database_to_staging.filter_out_germline_variants(
        input_data=input_data, status_col_str=filter_col
    )
    # Index dtype is not compared -- presumably because the empty expected
    # frame in the all-germline case is built without an explicit index.
    assert_frame_equal(filtered, expected_result, check_index_type=False)
Loading