Skip to content

Commit a1ccd9f

Browse files
committed
add test case for missing column and lint
1 parent 1d6c941 commit a1ccd9f

File tree

2 files changed

+59
-42
lines changed

2 files changed

+59
-42
lines changed

src/iatlascbioportalexport/clinical.py

Lines changed: 53 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -70,23 +70,27 @@
7070
"meta_clinical_sample.txt",
7171
]
7272

73+
7374
def filter_out_non_analyses_samples(input_df: pd.DataFrame) -> pd.DataFrame:
74-
"""Filter out the non analyses samples.
75+
"""Filter out the non analyses samples.
7576
This is on a dataset by dataset basis.
76-
77-
Here non-analyses samples are defined as DNA only tumor samples and
77+
78+
Here non-analyses samples are defined as DNA only tumor samples and
7879
RNA samples not used in the analyses.
7980
8081
Args:
8182
input_df (pd.DataFrame): input clinical data
8283
8384
Returns:
84-
pd.DataFrame: output clinical data with the non analyses samples
85+
pd.DataFrame: output clinical data with the non analyses samples
8586
filtered out
8687
"""
8788
filtered_df = input_df[
88-
(~(input_df["SAMPLE_ID"].str.contains(r'-(?:nd|ad|nr)-', na=False)) &
89-
(input_df["Dataset"]=="Anders_JITC_2022")) | (input_df["Dataset"]!="Anders_JITC_2022")
89+
(
90+
~(input_df["SAMPLE_ID"].str.contains(r"-(?:nd|ad|nr)-", na=False))
91+
& (input_df["Dataset"] == "Anders_JITC_2022")
92+
)
93+
| (input_df["Dataset"] != "Anders_JITC_2022")
9094
]
9195
return filtered_df
9296

@@ -244,7 +248,9 @@ def add_lens_id_as_sample_display_name(
244248
columns={"lens_id": "SAMPLE_DISPLAY_NAME", "study_sample_name": "SAMPLE_ID"}
245249
)
246250
# convert lens sample_id to string
247-
lens_id_mapping_renamed["SAMPLE_ID"] = lens_id_mapping_renamed["SAMPLE_ID"].astype(str)
251+
lens_id_mapping_renamed["SAMPLE_ID"] = lens_id_mapping_renamed["SAMPLE_ID"].astype(
252+
str
253+
)
248254
input_df_mapped = input_df.merge(
249255
lens_id_mapping_renamed, on=["SAMPLE_ID"], how="left"
250256
)
@@ -257,9 +263,8 @@ def add_lens_id_as_sample_display_name(
257263

258264

259265
def merge_in_neoantigen_study_data(
260-
input_df : pd.DataFrame,
261-
neoantigen_data_synid : str, **kwargs
262-
) -> pd.DataFrame:
266+
input_df: pd.DataFrame, neoantigen_data_synid: str, **kwargs
267+
) -> pd.DataFrame:
263268
"""Adds in the new neoantigen summaries study data for the specific
264269
dataset to the overall clinical dataset (which contains all datasets)
265270
@@ -271,14 +276,10 @@ def merge_in_neoantigen_study_data(
271276
pd.DataFrame: clinical data with neoantigen data added in
272277
"""
273278
logger = kwargs.get("logger", logging.getLogger(__name__))
274-
neoantigen_data = pd.read_csv(syn.get(neoantigen_data_synid).path, sep = "\t")
275-
neoantigen_data = neoantigen_data.rename(columns = {"Sample_ID":"SAMPLE_ID"})
276-
neoantigen_data['SAMPLE_ID'] = neoantigen_data['SAMPLE_ID'].astype(str)
277-
df_with_neoantigen = input_df.merge(
278-
neoantigen_data,
279-
how = "outer",
280-
on = "SAMPLE_ID"
281-
)
279+
neoantigen_data = pd.read_csv(syn.get(neoantigen_data_synid).path, sep="\t")
280+
neoantigen_data = neoantigen_data.rename(columns={"Sample_ID": "SAMPLE_ID"})
281+
neoantigen_data["SAMPLE_ID"] = neoantigen_data["SAMPLE_ID"].astype(str)
282+
df_with_neoantigen = input_df.merge(neoantigen_data, how="outer", on="SAMPLE_ID")
282283
if len(df_with_neoantigen) > len(input_df):
283284
logger.error(
284285
"There are more rows in the clinical data after merging in the neoantigen data."
@@ -290,7 +291,7 @@ def preprocessing(
290291
input_df_synid: str,
291292
cli_to_cbio_mapping: pd.DataFrame,
292293
cli_to_oncotree_mapping_synid: str,
293-
neoantigen_data_synid : str,
294+
neoantigen_data_synid: str,
294295
datahub_tools_path: str,
295296
**kwargs,
296297
) -> pd.DataFrame:
@@ -326,9 +327,9 @@ def preprocessing(
326327
)
327328
cli_remapped = remap_clinical_ids_to_paper_ids(input_df=cli_with_oncotree)
328329
cli_with_neoantigen = merge_in_neoantigen_study_data(
329-
input_df = cli_remapped,
330-
neoantigen_data_synid = neoantigen_data_synid,
331-
logger = logger
330+
input_df=cli_remapped,
331+
neoantigen_data_synid=neoantigen_data_synid,
332+
logger=logger,
332333
)
333334
cli_to_cbio_mapping_dict = dict(
334335
zip(
@@ -339,8 +340,12 @@ def preprocessing(
339340
cli_remapped = cli_with_neoantigen.rename(columns=cli_to_cbio_mapping_dict)
340341
cli_remapped = filter_out_non_analyses_samples(cli_remapped)
341342
cli_remapped = remap_column_values(input_df=cli_remapped)
342-
cli_remapped = convert_days_to_months(input_df=cli_remapped, col="OS_MONTHS")
343-
cli_remapped = convert_days_to_months(input_df=cli_remapped, col="PFS_MONTHS")
343+
cli_remapped = convert_days_to_months(
344+
input_df=cli_remapped, col="OS_MONTHS", logger=logger
345+
)
346+
cli_remapped = convert_days_to_months(
347+
input_df=cli_remapped, col="PFS_MONTHS", logger=logger
348+
)
344349
cli_remapped_cleaned = remove_suffix_from_column_values(input_df=cli_remapped)
345350
cli_remapped_cleaned = update_case_of_column_values(
346351
input_df=cli_remapped_cleaned, cli_to_cbio_mapping=cli_to_cbio_mapping
@@ -445,7 +450,7 @@ def get_updated_cli_attributes(
445450
"""
446451
cli_attr = pd.read_csv(
447452
f"{datahub_tools_path}/add-clinical-header/clinical_attributes_metadata.txt",
448-
sep="\t"
453+
sep="\t",
449454
)
450455
cli_to_cbio_mapping_to_append = cli_to_cbio_mapping.rename(
451456
columns={
@@ -491,13 +496,13 @@ def convert_oncotree_codes(datahub_tools_path: str) -> pd.DataFrame:
491496
return cli_w_cancer_types
492497

493498

494-
def rename_files_on_disk(filepath : str) -> None:
499+
def rename_files_on_disk(filepath: str) -> None:
495500
"""Renames files on disk by removing the .metadata ext from filenames.
496501
NOTE: This will overwrite previous files with the same name.
497-
502+
498503
This is needed because the insert_clinical_metadata script from
499504
datahub-curation-tools saves the sample and patient files with
500-
".metadata" ext but the cbioportal validation tool expects them to be
505+
".metadata" ext but the cbioportal validation tool expects them to be
501506
without the ".metadata"
502507
503508
Args:
@@ -507,23 +512,29 @@ def rename_files_on_disk(filepath : str) -> None:
507512
os.replace(filepath, filepath_new)
508513

509514

510-
def convert_days_to_months(input_df: pd.DataFrame, col : str) -> pd.DataFrame:
511-
"""Convert the column that's in days into months
515+
def convert_days_to_months(input_df: pd.DataFrame, col: str, **kwargs) -> pd.DataFrame:
516+
"""Convert the column that's in days into months
512517
using the conversion rate 1 month = 30.44 days,
513518
rounding to two decimal places
514519
515520
Args:
516521
input_df (pd.DataFrame): input data
522+
col (str): the column to convert from days to months
517523
518524
Returns:
519525
pd.DataFrame: Output data with data transformed
520526
from days to months
521527
"""
522-
converted_df = input_df.copy()
523-
converted_df[col] = (converted_df[col] / 30.44).round(decimals = 2)
524-
return converted_df
528+
logger = kwargs.get("logger", logging.getLogger(__name__))
529+
if col in input_df.columns:
530+
converted_df = input_df.copy()
531+
converted_df[col] = (converted_df[col] / 30.44).round(decimals=2)
532+
return converted_df
533+
else:
534+
logger.info(f"Nothing to convert. {col} doesn't exist in the data.")
535+
return input_df
536+
525537

526-
527538
def get_all_non_na_columns(input_df: pd.DataFrame) -> List[str]:
528539
"""Gets all the columns in input data without all (100%) NAs
529540
Args:
@@ -589,13 +600,13 @@ def add_clinical_header(
589600
python3 {datahub_tools_path}/add-clinical-header/insert_clinical_metadata.py \
590601
-d {dataset_dir}
591602
"""
592-
time.sleep(2) # give subprocess some time before checking
603+
time.sleep(2) # give subprocess some time before checking
593604
subprocess.run(cmd, shell=True, executable="/bin/bash")
594-
time.sleep(2) # give subprocess some time before checking
595-
605+
time.sleep(2) # give subprocess some time before checking
606+
596607
# remove .metadata from files
597-
rename_files_on_disk(filepath = f"{dataset_dir}/data_clinical_patient.txt.metadata")
598-
rename_files_on_disk(filepath = f"{dataset_dir}/data_clinical_sample.txt.metadata")
608+
rename_files_on_disk(filepath=f"{dataset_dir}/data_clinical_patient.txt.metadata")
609+
rename_files_on_disk(filepath=f"{dataset_dir}/data_clinical_sample.txt.metadata")
599610

600611
# saved merged for case lists
601612
merged_df_subset = input_dfs["merged"][
@@ -855,7 +866,7 @@ def main():
855866
"--lens_id_mapping_synid",
856867
type=str,
857868
help="Synapse id for the study_sample_name (paper ids) to lens id mapping file. Optional. Defaults to None, then adding lens id mapping is skipped",
858-
default=None
869+
default=None,
859870
)
860871
parser.add_argument(
861872
"--neoantigen_data_synid",
@@ -893,7 +904,7 @@ def main():
893904
cli_to_oncotree_mapping_synid=args.cli_to_oncotree_mapping_synid,
894905
neoantigen_data_synid=args.neoantigen_data_synid,
895906
datahub_tools_path=args.datahub_tools_path,
896-
logger = main_logger,
907+
logger=main_logger,
897908
)
898909
cli_dfs = split_into_patient_and_sample_data(
899910
input_data=cli_df, cli_to_cbio_mapping=cli_to_cbio_mapping
@@ -914,7 +925,7 @@ def main():
914925
dataset_name=dataset,
915926
datahub_tools_path=args.datahub_tools_path,
916927
log_file_name="iatlas_cli_validation_log.txt",
917-
flagger=dataset_flagger
928+
flagger=dataset_flagger,
918929
)
919930
add_clinical_header(
920931
input_dfs=cli_dfs,

tests/test_clinical.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,13 @@ def syn_mock():
3030
{"OS_TIME": [7.59, 32.85, 11.33, 23.2], "EXTRA_COL": [1, 0, 0, 2]}
3131
),
3232
),
33+
(
34+
pd.DataFrame({"EXTRA_COL": [1, 0, 0, 2]}),
35+
"OS_TIME",
36+
pd.DataFrame({"EXTRA_COL": [1, 0, 0, 2]}),
37+
),
3338
],
39+
ids=["converted_mdf", "nothing_to_convert"],
3440
)
3541
def test_that_convert_days_to_months_converts_correctly(input, col, expected):
3642
result = cli_to_cbio.convert_days_to_months(input, col)

0 commit comments

Comments
 (0)