@@ -683,18 +683,24 @@ def store_gene_panel_files(
683683
684684
def filter_out_germline_variants(
    input_data: pd.DataFrame, status_col_str: str = "SV_STATUS"
) -> pd.DataFrame:
    """Filters out germline variants given a status col str. Genie pipeline
    cannot have any of these variants. NOTE: We have to search for the
    status column because there's no column name validation in the release
    steps so the status column may have different casing.

    Args:
        input_data (pd.DataFrame): input data with germline variants to filter out
        status_col_str (str): search string for the status column for the data,
            matched case-insensitively against the column names. Defaults to
            "SV_STATUS" so callers that omit it keep filtering on the SV
            status column.

    Returns:
        pd.DataFrame: copy of the input without the rows whose status is
            exactly "GERMLINE", re-indexed from 0

    Raises:
        KeyError: if no column name matches ``status_col_str``
    """
    # Column casing is not validated upstream (e.g. SV_Status vs SV_STATUS),
    # so resolve the real column name with a case-insensitive comparison.
    # str() keeps the lookup from crashing on non-string column labels.
    wanted = status_col_str.lower()
    status_col = next(
        (col for col in input_data.columns if str(col).lower() == wanted),
        None,
    )
    if status_col is None:
        # Fail with an actionable message instead of an opaque
        # "list index out of range" from indexing an empty match list.
        raise KeyError(
            f"No column matching '{status_col_str}' (case-insensitive) found "
            f"in input data. Available columns: {list(input_data.columns)}"
        )
    # Only an exact "GERMLINE" value is dropped; missing statuses are kept.
    return input_data[input_data[status_col] != "GERMLINE"].reset_index(drop=True)
699705
700706
@@ -750,7 +756,7 @@ def store_sv_files(
750756 )
751757
752758 sv_df = sv_df [sv_df ["SAMPLE_ID" ].isin (keep_for_merged_consortium_samples )]
753- sv_df = filter_out_germline_variants (input_data = sv_df , status_col = "SV_STATUS" )
759+ sv_df = filter_out_germline_variants (input_data = sv_df )
754760 sv_df .rename (columns = transform ._col_name_to_titlecase , inplace = True )
755761 sv_text = process_functions .removePandasDfFloat (sv_df )
756762 sv_path = os .path .join (GENIE_RELEASE_DIR , "data_sv.txt" )
0 commit comments