From f49dca79810d19126df398bfea7e141b88f12eb6 Mon Sep 17 00:00:00 2001 From: PlushZ Date: Fri, 4 Apr 2025 16:38:55 +0200 Subject: [PATCH 1/5] add flexynesis_cbioportal_import --- tools/flexynesis/fetch_cbioportal_data.py | 60 +++++++++ .../flexynesis_cbioportal_import.xml | 121 ++++++++++++++++++ 2 files changed, 181 insertions(+) create mode 100644 tools/flexynesis/fetch_cbioportal_data.py create mode 100644 tools/flexynesis/flexynesis_cbioportal_import.xml diff --git a/tools/flexynesis/fetch_cbioportal_data.py b/tools/flexynesis/fetch_cbioportal_data.py new file mode 100644 index 0000000000..cab8f49129 --- /dev/null +++ b/tools/flexynesis/fetch_cbioportal_data.py @@ -0,0 +1,60 @@ +#!/usr/bin/env python + +import argparse +import os +from flexynesis.utils import CBioPortalData + +def main(): + parser = argparse.ArgumentParser(description="Fetch and prepare cBioPortal data for Flexynesis.") + parser.add_argument("--study_id", required=True, help="cBioPortal study ID (e.g., 'brca_tcga')") + parser.add_argument("--data_types", required=True, help="Comma-separated list of data types (e.g., 'clin,mut,omics')") + parser.add_argument("--mapped_files", default=None, help="Comma-separated list of .txt files to map to data_types (optional)") + parser.add_argument("--split_ratio", type=float, default=0.7, help="Training/test split ratio (0.0 to 1.0)") + parser.add_argument("--output_dir", required=True, help="Output directory for datasets") + + args = parser.parse_args() + + data_types = args.data_types.split(",") + if "clin" not in data_types: + raise ValueError("Clinical data ('clin') is required for splitting the dataset.") + + file_mapping = { + "clin": "data_clinical_patient.txt", # can be any with 'clinical' in file name + "mut": "data_mutations.txt", # any with 'mutations' in file name + "omics": "data_cna.txt", + "other": None + } + + if args.mapped_files: + mapped_files = args.mapped_files.split(",") + if len(mapped_files) != len(data_types): + raise ValueError(f"Number of mapped files ({len(mapped_files)}) must match number of data types ({len(data_types)}).") + files_to_fetch = {dt: mf for dt, mf in zip(data_types, mapped_files)} + for mf in mapped_files: + if not mf.endswith(".txt"): + raise ValueError(f"Mapped file '{mf}' must end with '.txt'.") + else: + files_to_fetch = {dt: file_mapping[dt] for dt in data_types if dt in file_mapping} + + invalid_types = set(data_types) - set(file_mapping.keys()) + if invalid_types: + raise ValueError(f"Invalid data types: {invalid_types}. Supported types: {list(file_mapping.keys())}") + + cbioportal = CBioPortalData(study_id=args.study_id) + cbioportal.get_cbioportal_data(study_id=args.study_id, files=files_to_fetch) + dataset = cbioportal.split_data(ratio=args.split_ratio) + + os.makedirs(args.output_dir, exist_ok=True) + + for data_type in data_types: + if data_type in dataset['train']: + train_file = os.path.join(args.output_dir, f"{data_type}_train.csv") + dataset['train'][data_type].to_csv(train_file, index=True) + print(f"Wrote training data to {train_file}") + if data_type in dataset['test']: + test_file = os.path.join(args.output_dir, f"{data_type}_test.csv") + dataset['test'][data_type].to_csv(test_file, index=True) + print(f"Wrote test data to {test_file}") + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/tools/flexynesis/flexynesis_cbioportal_import.xml b/tools/flexynesis/flexynesis_cbioportal_import.xml new file mode 100644 index 0000000000..644b459159 --- /dev/null +++ b/tools/flexynesis/flexynesis_cbioportal_import.xml @@ -0,0 +1,121 @@ + + and prepare cBioPortal data for Flexynesis analysis + + macros.xml + + + + + + value == True + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 10.1101/2024.07.16.603606 + + \ No newline at end of file From f3386f44ea2c84b3a7ff810ce7385f5212c88868 Mon Sep 17 00:00:00 2001 From: PlushZ Date: Mon, 7 Apr 2025 09:47:07 +0200 Subject: [PATCH 2/5] fix linting --- tools/flexynesis/fetch_cbioportal_data.py | 15 ++++++++------- tools/flexynesis/flexynesis_cbioportal_import.xml | 12 ++++++------ 2 files changed, 14 insertions(+), 13 deletions(-) diff --git a/tools/flexynesis/fetch_cbioportal_data.py b/tools/flexynesis/fetch_cbioportal_data.py index cab8f49129..68bfde3390 100644 --- a/tools/flexynesis/fetch_cbioportal_data.py +++ b/tools/flexynesis/fetch_cbioportal_data.py @@ -2,8 +2,10 @@ import argparse import os + from flexynesis.utils import CBioPortalData + def main(): parser = argparse.ArgumentParser(description="Fetch and prepare cBioPortal data for Flexynesis.") parser.add_argument("--study_id", required=True, help="cBioPortal study ID (e.g., 'brca_tcga')") @@ -19,8 +21,8 @@ def main(): raise ValueError("Clinical data ('clin') is required for splitting the dataset.") file_mapping = { - "clin": "data_clinical_patient.txt", # can be any with 'clinical' in file name - "mut": "data_mutations.txt", # any with 'mutations' in file name + "clin": "data_clinical_patient.txt", # can be any with 'clinical' in file name + "mut": "data_mutations.txt", # any with 'mutations' in file name "omics": "data_cna.txt", "other": None } @@ -49,12 +51,11 @@ def main(): for data_type in data_types: if data_type in dataset['train']: train_file = os.path.join(args.output_dir, f"{data_type}_train.csv") - dataset['train'][data_type].to_csv(train_file, index=True) - print(f"Wrote training data to {train_file}") + dataset['train'][data_type].to_csv(train_file, index=True) if data_type in dataset['test']: test_file = os.path.join(args.output_dir, f"{data_type}_test.csv") - dataset['test'][data_type].to_csv(test_file, index=True) - print(f"Wrote test data to {test_file}") + dataset['test'][data_type].to_csv(test_file, index=True) + if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/tools/flexynesis/flexynesis_cbioportal_import.xml b/tools/flexynesis/flexynesis_cbioportal_import.xml index 644b459159..862e97cd4e 100644 --- a/tools/flexynesis/flexynesis_cbioportal_import.xml +++ b/tools/flexynesis/flexynesis_cbioportal_import.xml @@ -41,22 +41,22 @@ - + - + - + - + - + - + From 8b5f9b41dce187e8bc42a7f1ab415b8bb70541a6 Mon Sep 17 00:00:00 2001 From: PlushZ Date: Mon, 7 Apr 2025 10:30:57 +0200 Subject: [PATCH 3/5] fix linting py script --- tools/flexynesis/fetch_cbioportal_data.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/flexynesis/fetch_cbioportal_data.py b/tools/flexynesis/fetch_cbioportal_data.py index 68bfde3390..c1121d0b05 100644 --- a/tools/flexynesis/fetch_cbioportal_data.py +++ b/tools/flexynesis/fetch_cbioportal_data.py @@ -13,13 +13,13 @@ def main(): parser.add_argument("--mapped_files", default=None, help="Comma-separated list of .txt files to map to data_types (optional)") parser.add_argument("--split_ratio", type=float, default=0.7, help="Training/test split ratio (0.0 to 1.0)") parser.add_argument("--output_dir", required=True, help="Output directory for datasets") - + args = parser.parse_args() - + data_types = args.data_types.split(",") if "clin" not in data_types: raise ValueError("Clinical data ('clin') is required for splitting the dataset.") - + file_mapping = { "clin": "data_clinical_patient.txt", # can be any with 'clinical' in file name "mut": "data_mutations.txt", # any with 'mutations' in file name @@ -37,7 +37,7 @@ def main(): raise ValueError(f"Mapped file '{mf}' must end with '.txt'.") else: files_to_fetch = {dt: file_mapping[dt] for dt in data_types if dt in file_mapping} - + invalid_types = set(data_types) - set(file_mapping.keys()) if invalid_types: raise ValueError(f"Invalid data types: {invalid_types}. Supported types: {list(file_mapping.keys())}") From 5f5973472c6158f271c956d5a747aa23a1c4ee4e Mon Sep 17 00:00:00 2001 From: Polina Polunina <55543056+PlushZ@users.noreply.github.com> Date: Mon, 7 Apr 2025 11:36:11 +0200 Subject: [PATCH 4/5] Apply suggestions from code review Co-authored-by: Saim Momin <64724322+SaimMomin12@users.noreply.github.com> --- tools/flexynesis/flexynesis_cbioportal_import.xml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/flexynesis/flexynesis_cbioportal_import.xml b/tools/flexynesis/flexynesis_cbioportal_import.xml index 862e97cd4e..21d45fc53b 100644 --- a/tools/flexynesis/flexynesis_cbioportal_import.xml +++ b/tools/flexynesis/flexynesis_cbioportal_import.xml @@ -35,7 +35,7 @@ - + @@ -63,7 +63,7 @@ - + From 7323508a86f6d76a3b77a5fe6419d8e365ef996e Mon Sep 17 00:00:00 2001 From: Polina Polunina <55543056+PlushZ@users.noreply.github.com> Date: Mon, 7 Apr 2025 11:38:17 +0200 Subject: [PATCH 5/5] bump tool version --- tools/flexynesis/macros.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/flexynesis/macros.xml b/tools/flexynesis/macros.xml index 09196f4624..7243db49bb 100644 --- a/tools/flexynesis/macros.xml +++ b/tools/flexynesis/macros.xml @@ -1,6 +1,6 @@ 0.2.17 - 0 + 1 24.1