7070 "meta_clinical_sample.txt" ,
7171]
7272
73+
def filter_out_non_analyses_samples(
    input_df: pd.DataFrame,
    *,
    dataset: str = "Anders_JITC_2022",
    non_analyses_pattern: str = r"-(?:nd|ad|nr)-",
) -> pd.DataFrame:
    """Filter out the non analyses samples.

    This is on a dataset by dataset basis: only rows whose ``Dataset``
    equals ``dataset`` are candidates for removal, every other row is
    always kept.

    Here non-analyses samples are defined as DNA only tumor samples and
    RNA samples not used in the analyses. They are recognised by
    ``non_analyses_pattern`` matching anywhere in ``SAMPLE_ID``.

    Args:
        input_df (pd.DataFrame): input clinical data. Must contain the
            ``SAMPLE_ID`` and ``Dataset`` columns.
        dataset (str): value of the ``Dataset`` column the filter applies
            to. Defaults to "Anders_JITC_2022".
        non_analyses_pattern (str): regular expression identifying the
            non analyses sample ids within ``dataset``. Defaults to
            sample ids containing "-nd-", "-ad-" or "-nr-".

    Returns:
        pd.DataFrame: output clinical data with the non analyses samples
            filtered out
    """
    # na=False: a missing SAMPLE_ID is treated as "does not match", so the
    # row is kept rather than propagating NaN into the boolean mask.
    is_non_analyses_sample = input_df["SAMPLE_ID"].str.contains(
        non_analyses_pattern, regex=True, na=False
    )
    # Rows from other datasets (including a missing Dataset value) are
    # always retained; rows from the target dataset are retained only when
    # their sample id does not match the non-analyses pattern.
    keep_row = (input_df["Dataset"] != dataset) | ~is_non_analyses_sample
    return input_df[keep_row]
9296
@@ -244,7 +248,9 @@ def add_lens_id_as_sample_display_name(
244248 columns = {"lens_id" : "SAMPLE_DISPLAY_NAME" , "study_sample_name" : "SAMPLE_ID" }
245249 )
246250 # convert lens sample_id to string
247- lens_id_mapping_renamed ["SAMPLE_ID" ] = lens_id_mapping_renamed ["SAMPLE_ID" ].astype (str )
251+ lens_id_mapping_renamed ["SAMPLE_ID" ] = lens_id_mapping_renamed ["SAMPLE_ID" ].astype (
252+ str
253+ )
248254 input_df_mapped = input_df .merge (
249255 lens_id_mapping_renamed , on = ["SAMPLE_ID" ], how = "left"
250256 )
@@ -257,9 +263,8 @@ def add_lens_id_as_sample_display_name(
257263
258264
259265def merge_in_neoantigen_study_data (
260- input_df : pd .DataFrame ,
261- neoantigen_data_synid : str , ** kwargs
262- ) -> pd .DataFrame :
266+ input_df : pd .DataFrame , neoantigen_data_synid : str , ** kwargs
267+ ) -> pd .DataFrame :
263268 """Adds in the new neoantigen summaries study data for the specific
264269 dataset to the overall clinical dataset (which contains all datasets)
265270
@@ -271,14 +276,10 @@ def merge_in_neoantigen_study_data(
271276 pd.DataFrame: clinical data with neoantigen data added in
272277 """
273278 logger = kwargs .get ("logger" , logging .getLogger (__name__ ))
274- neoantigen_data = pd .read_csv (syn .get (neoantigen_data_synid ).path , sep = "\t " )
275- neoantigen_data = neoantigen_data .rename (columns = {"Sample_ID" :"SAMPLE_ID" })
276- neoantigen_data ['SAMPLE_ID' ] = neoantigen_data ['SAMPLE_ID' ].astype (str )
277- df_with_neoantigen = input_df .merge (
278- neoantigen_data ,
279- how = "outer" ,
280- on = "SAMPLE_ID"
281- )
279+ neoantigen_data = pd .read_csv (syn .get (neoantigen_data_synid ).path , sep = "\t " )
280+ neoantigen_data = neoantigen_data .rename (columns = {"Sample_ID" : "SAMPLE_ID" })
281+ neoantigen_data ["SAMPLE_ID" ] = neoantigen_data ["SAMPLE_ID" ].astype (str )
282+ df_with_neoantigen = input_df .merge (neoantigen_data , how = "outer" , on = "SAMPLE_ID" )
282283 if len (df_with_neoantigen ) > len (input_df ):
283284 logger .error (
284285 "There are more rows in the clinical data after merging in the neoantigen data."
@@ -290,7 +291,7 @@ def preprocessing(
290291 input_df_synid : str ,
291292 cli_to_cbio_mapping : pd .DataFrame ,
292293 cli_to_oncotree_mapping_synid : str ,
293- neoantigen_data_synid : str ,
294+ neoantigen_data_synid : str ,
294295 datahub_tools_path : str ,
295296 ** kwargs ,
296297) -> pd .DataFrame :
@@ -326,9 +327,9 @@ def preprocessing(
326327 )
327328 cli_remapped = remap_clinical_ids_to_paper_ids (input_df = cli_with_oncotree )
328329 cli_with_neoantigen = merge_in_neoantigen_study_data (
329- input_df = cli_remapped ,
330- neoantigen_data_synid = neoantigen_data_synid ,
331- logger = logger
330+ input_df = cli_remapped ,
331+ neoantigen_data_synid = neoantigen_data_synid ,
332+ logger = logger ,
332333 )
333334 cli_to_cbio_mapping_dict = dict (
334335 zip (
@@ -339,8 +340,12 @@ def preprocessing(
339340 cli_remapped = cli_with_neoantigen .rename (columns = cli_to_cbio_mapping_dict )
340341 cli_remapped = filter_out_non_analyses_samples (cli_remapped )
341342 cli_remapped = remap_column_values (input_df = cli_remapped )
342- cli_remapped = convert_days_to_months (input_df = cli_remapped , col = "OS_MONTHS" )
343- cli_remapped = convert_days_to_months (input_df = cli_remapped , col = "PFS_MONTHS" )
343+ cli_remapped = convert_days_to_months (
344+ input_df = cli_remapped , col = "OS_MONTHS" , logger = logger
345+ )
346+ cli_remapped = convert_days_to_months (
347+ input_df = cli_remapped , col = "PFS_MONTHS" , logger = logger
348+ )
344349 cli_remapped_cleaned = remove_suffix_from_column_values (input_df = cli_remapped )
345350 cli_remapped_cleaned = update_case_of_column_values (
346351 input_df = cli_remapped_cleaned , cli_to_cbio_mapping = cli_to_cbio_mapping
@@ -445,7 +450,7 @@ def get_updated_cli_attributes(
445450 """
446451 cli_attr = pd .read_csv (
447452 f"{ datahub_tools_path } /add-clinical-header/clinical_attributes_metadata.txt" ,
448- sep = "\t "
453+ sep = "\t " ,
449454 )
450455 cli_to_cbio_mapping_to_append = cli_to_cbio_mapping .rename (
451456 columns = {
@@ -491,13 +496,13 @@ def convert_oncotree_codes(datahub_tools_path: str) -> pd.DataFrame:
491496 return cli_w_cancer_types
492497
493498
494- def rename_files_on_disk (filepath : str ) -> None :
499+ def rename_files_on_disk (filepath : str ) -> None :
495500 """Renames files on disk by removing the .metadata ext from filenames.
496501 NOTE: This will overwrite previous files with the same name.
497-
502+
498503 This is needed because the insert_clinical_metadata script from
499504 datahub-curation-tools saves the sample and patient files with
500- ".metadata" ext but the cbioportal validation tool expects them to be
505+ ".metadata" ext but the cbioportal validation tool expects them to be
501506 without the ".metadata"
502507
503508 Args:
@@ -507,23 +512,29 @@ def rename_files_on_disk(filepath : str) -> None:
507512 os .replace (filepath , filepath_new )
508513
509514
def convert_days_to_months(input_df: pd.DataFrame, col: str, **kwargs) -> pd.DataFrame:
    """Convert the column that's in days into months
    using the conversion rate 1 month = 30.44 days,
    rounding to two decimal places.

    The input data is never modified; a copy is always returned, whether
    or not the column was present.

    Args:
        input_df (pd.DataFrame): input data
        col (str): the column to convert from days to months
        **kwargs: optional ``logger`` (logging.Logger) used to report a
            missing column. Defaults to this module's logger.

    Returns:
        pd.DataFrame: Output data with data transformed
            from days to months, or an unchanged copy of the input
            when ``col`` is not one of its columns
    """
    logger = kwargs.get("logger", logging.getLogger(__name__))
    days_per_month = 30.44
    converted_df = input_df.copy()
    if col not in converted_df.columns:
        # Missing column is not an error: not every dataset carries
        # every survival column, so just report it and move on.
        logger.info("Nothing to convert. %s doesn't exist in the data.", col)
        return converted_df
    converted_df[col] = (converted_df[col] / days_per_month).round(decimals=2)
    return converted_df
536+
525537
526-
527538def get_all_non_na_columns (input_df : pd .DataFrame ) -> List [str ]:
528539 """Gets all the columns in input data without all (100%) NAs
529540 Args:
@@ -589,13 +600,13 @@ def add_clinical_header(
589600 python3 { datahub_tools_path } /add-clinical-header/insert_clinical_metadata.py \
590601 -d { dataset_dir }
591602 """
592- time .sleep (2 ) # give subprocess some time before checking
603+ time .sleep (2 ) # give subprocess some time before checking
593604 subprocess .run (cmd , shell = True , executable = "/bin/bash" )
594- time .sleep (2 ) # give subprocess some time before checking
595-
605+ time .sleep (2 ) # give subprocess some time before checking
606+
596607 # remove .metadata from files
597- rename_files_on_disk (filepath = f"{ dataset_dir } /data_clinical_patient.txt.metadata" )
598- rename_files_on_disk (filepath = f"{ dataset_dir } /data_clinical_sample.txt.metadata" )
608+ rename_files_on_disk (filepath = f"{ dataset_dir } /data_clinical_patient.txt.metadata" )
609+ rename_files_on_disk (filepath = f"{ dataset_dir } /data_clinical_sample.txt.metadata" )
599610
600611 # saved merged for case lists
601612 merged_df_subset = input_dfs ["merged" ][
@@ -855,7 +866,7 @@ def main():
855866 "--lens_id_mapping_synid" ,
856867 type = str ,
857868 help = "Synapse id for the study_sample_name (paper ids) to lens id mapping file. Optional. Defaults to None, then adding lens id mapping is skipped" ,
858- default = None
869+ default = None ,
859870 )
860871 parser .add_argument (
861872 "--neoantigen_data_synid" ,
@@ -893,7 +904,7 @@ def main():
893904 cli_to_oncotree_mapping_synid = args .cli_to_oncotree_mapping_synid ,
894905 neoantigen_data_synid = args .neoantigen_data_synid ,
895906 datahub_tools_path = args .datahub_tools_path ,
896- logger = main_logger ,
907+ logger = main_logger ,
897908 )
898909 cli_dfs = split_into_patient_and_sample_data (
899910 input_data = cli_df , cli_to_cbio_mapping = cli_to_cbio_mapping
@@ -914,7 +925,7 @@ def main():
914925 dataset_name = dataset ,
915926 datahub_tools_path = args .datahub_tools_path ,
916927 log_file_name = "iatlas_cli_validation_log.txt" ,
917- flagger = dataset_flagger
928+ flagger = dataset_flagger ,
918929 )
919930 add_clinical_header (
920931 input_dfs = cli_dfs ,
0 commit comments