From f49dca79810d19126df398bfea7e141b88f12eb6 Mon Sep 17 00:00:00 2001
From: PlushZ <polpolunina@gmail.com>
Date: Fri, 4 Apr 2025 16:38:55 +0200
Subject: [PATCH 1/5] add flexynesis_cbioportal_import

---
 tools/flexynesis/fetch_cbioportal_data.py     |  60 +++++++++
 .../flexynesis_cbioportal_import.xml          | 121 ++++++++++++++++++
 2 files changed, 181 insertions(+)
 create mode 100644 tools/flexynesis/fetch_cbioportal_data.py
 create mode 100644 tools/flexynesis/flexynesis_cbioportal_import.xml
diff --git a/tools/flexynesis/fetch_cbioportal_data.py b/tools/flexynesis/fetch_cbioportal_data.py
new file mode 100644
index 0000000000..cab8f49129
--- /dev/null
+++ b/tools/flexynesis/fetch_cbioportal_data.py
@@ -0,0 +1,60 @@
+#!/usr/bin/env python
+
+import argparse
+import os
+from flexynesis.utils import CBioPortalData
+
+def main():
+    parser = argparse.ArgumentParser(description="Fetch and prepare cBioPortal data for Flexynesis.")
+    parser.add_argument("--study_id", required=True, help="cBioPortal study ID (e.g., 'brca_tcga')")
+    parser.add_argument("--data_types", required=True, help="Comma-separated list of data types (e.g., 'clin,mut,omics')")
+    parser.add_argument("--mapped_files", default=None, help="Comma-separated list of .txt files to map to data_types (optional)")
+    parser.add_argument("--split_ratio", type=float, default=0.7, help="Training/test split ratio (0.0 to 1.0)")
+    parser.add_argument("--output_dir", required=True, help="Output directory for datasets")
+    
+    args = parser.parse_args()
+    
+    data_types = args.data_types.split(",")
+    if "clin" not in data_types:
+        raise ValueError("Clinical data ('clin') is required for splitting the dataset.")
+    
+    file_mapping = {
+        "clin": "data_clinical_patient.txt", # can be any with 'clinical' in file name
+        "mut": "data_mutations.txt", # any with 'mutations' in file name
+        "omics": "data_cna.txt",
+        "other": None
+    }
+
+    if args.mapped_files:
+        mapped_files = args.mapped_files.split(",")
+        if len(mapped_files) != len(data_types):
+            raise ValueError(f"Number of mapped files ({len(mapped_files)}) must match number of data types ({len(data_types)}).")
+        files_to_fetch = {dt: mf for dt, mf in zip(data_types, mapped_files)}
+        for mf in mapped_files:
+            if not mf.endswith(".txt"):
+                raise ValueError(f"Mapped file '{mf}' must end with '.txt'.")
+    else:
+        files_to_fetch = {dt: file_mapping[dt] for dt in data_types if dt in file_mapping}
+    
+    invalid_types = set(data_types) - set(file_mapping.keys())
+    if invalid_types:
+        raise ValueError(f"Invalid data types: {invalid_types}. Supported types: {list(file_mapping.keys())}")
+
+    cbioportal = CBioPortalData(study_id=args.study_id)
+    cbioportal.get_cbioportal_data(study_id=args.study_id, files=files_to_fetch)
+    dataset = cbioportal.split_data(ratio=args.split_ratio)
+
+    os.makedirs(args.output_dir, exist_ok=True)
+
+    for data_type in data_types:
+        if data_type in dataset['train']:
+            train_file = os.path.join(args.output_dir, f"{data_type}_train.csv")
+            dataset['train'][data_type].to_csv(train_file, index=True)  
+            print(f"Wrote training data to {train_file}")
+        if data_type in dataset['test']:
+            test_file = os.path.join(args.output_dir, f"{data_type}_test.csv")
+            dataset['test'][data_type].to_csv(test_file, index=True) 
+            print(f"Wrote test data to {test_file}")
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/tools/flexynesis/flexynesis_cbioportal_import.xml b/tools/flexynesis/flexynesis_cbioportal_import.xml
new file mode 100644
index 0000000000..644b459159
--- /dev/null
+++ b/tools/flexynesis/flexynesis_cbioportal_import.xml
@@ -0,0 +1,121 @@
+<tool id="flexynesis_cbioportal_import" name="Flexynesis cBioPortal import" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@">
+    <description>and prepare cBioPortal data for Flexynesis analysis</description>
+    <macros>
+        <import>macros.xml</import>
+    </macros>
+    <expand macro="requirements"/>
+    <command detect_errors="exit_code"><![CDATA[
+        @CHECK_NON_COMMERCIAL_USE@
+        python '$__tool_directory__/fetch_cbioportal_data.py'
+            --study_id '$study_id'
+            --data_types '$data_types'
+            #if $mapped_files:
+                --mapped_files '$mapped_files'
+            #end if
+            --split_ratio '$split_ratio'
+            --output_dir 'output'
+    ]]></command>
+    <inputs>
+        <param name="non_commercial_use" label="I certify that I am not using this tool for commercial purposes." type="boolean" truevalue="NON_COMMERCIAL_USE" falsevalue="COMMERCIAL_USE" checked="False">
+            <validator type="expression" message="This tool is only available for non-commercial use.">value == True</validator>
+        </param>
+        <param name="study_id" type="text" label="cBioPortal study ID" help="The ID of the study to fetch from cBioPortal (e.g., 'brca_tcga')." />
+        <param name="data_types" type="select" multiple="true" label="Data types to fetch" help="Select the types of data to retrieve from cBioPortal.">
+            <option value="clin" selected="true">Clinical data (default: data_clinical_patient.txt)</option>
+            <option value="mut">Mutations (default: data_mutations.txt)</option>
+            <option value="omics">Omics data (default: data_cna.txt)</option>
+            <option value="other">Other custom data</option>
+        </param>
+        <param name="mapped_files" type="text" optional="true" label="Mapped files" help="Comma-separated list of .txt files to map to the selected data types (e.g., 'data_clinical_sample.txt,data_mutations.txt,data_custom.txt'). Must match the number and order of data types." />
+        <param name="split_ratio" type="float" value="0.7" min="0.0" max="1.0" label="Training/Test split ratio" help="Proportion of data to use for training (e.g., 0.7 means 70% train, 30% test)." />
+    </inputs>
+    <outputs>
+        <collection name="datasets" type="list" label="${tool.name} on ${study_id}: datasets">
+            <discover_datasets pattern="(?P&lt;name&gt;.+_(train|test))\.csv$" format="csv" directory="output" />
+        </collection>
+    </outputs>
+    <tests>
+        <test>
+            <param name="non_commercial_use" value="True"/>
+            <param name="study_id" value="lgg_tcga" />
+            <param name="data_types" value="clin,mut" />
+            <param name="split_ratio" value="0.7" />
+            <output_collection name="datasets" type="list">
+                <element name="clin_train">
+                    <assert_contents>
+                        <has_text_matching expression="PATIENT_ID"/>
+                    </assert_contents>
+                </element>
+                <element name="mut_train">
+                    <assert_contents>
+                        <has_text_matching expression="Hugo_Symbol"/>
+                    </assert_contents>
+                </element>
+                <element name="clin_test">
+                    <assert_contents>
+                        <has_text_matching expression="PATIENT_ID"/>
+                    </assert_contents>
+                </element>
+                <element name="mut_test">
+                    <assert_contents>
+                        <has_text_matching expression="Hugo_Symbol"/>
+                    </assert_contents>
+                </element>
+            </output_collection>
+        </test>
+        <test>
+            <param name="non_commercial_use" value="True"/>
+            <param name="study_id" value="lgg_tcga" />
+            <param name="data_types" value="clin,mut,other" />
+            <param name="mapped_files" value="data_clinical_patient.txt,data_mutations.txt,data_cna.txt" />
+            <param name="split_ratio" value="0.8" />
+            <output_collection name="datasets" type="list">
+                <element name="clin_train">
+                    <assert_contents>
+                        <has_text_matching expression="PATIENT_ID"/>
+                    </assert_contents>
+                </element>
+                <element name="mut_train">
+                    <assert_contents>
+                        <has_text_matching expression="Hugo_Symbol"/>
+                    </assert_contents>
+                </element>
+                <element name="other_test">
+                    <assert_contents>
+                        <has_text_matching expression="Hugo_Symbol"/>
+                    </assert_contents>
+                </element>
+            </output_collection>
+        </test>
+    </tests><help><![CDATA[
+**Flexynesis cBioPortal import**
+
+This tool fetches data from cBioPortal using the Flexynesis `CBioPortalData` class and prepares it for use with the Flexynesis Galaxy tool. It downloads a specified study, extracts the requested data types, splits them into training and test sets, and outputs them as CSV files compatible with Flexynesis (e.g., `clin_train.csv`, `mut_test.csv`).
+
+**Inputs**
+
+- **cBioPortal study ID**: The identifier of the study to fetch (e.g., `brca_tcga`, `lgg_tcga`). Find study IDs on the cBioPortal.
+- **Data types to fetch**: Select one or more data types to retrieve. 'Clinical data' (`clin`) is required for splitting. Options:
+    - `clin`: Clinical data (default: `data_clinical_patient.txt`)
+    - `mut`: Mutation data (default: `data_mutations.txt`)
+    - `omics`: Omics data (default: `data_cna.txt`)
+    - `other`: Custom data type (requires a file name to be given in the **Mapped files** input)
+- **Mapped files (optional)**: A comma-separated list of `.txt` files to override default filenames. Must match the number and order of selected data types (e.g., `data_clinical_sample.txt,data_mutations.txt,data_custom.txt`).
+- **Training/Test split ratio**: The proportion of data for training (e.g., 0.7 means 70% train, 30% test).
+
+**Outputs**
+
+A collection of datasets including:
+    - `clin_train.csv` and `clin_test.csv`: Training and test clinical data (always included).
+    - `mut_train.csv` and `mut_test.csv`: Training and test mutation data (if selected).
+    - `omics_train.csv` and `omics_test.csv`: Training and test omics/CNA data (if selected).
+    - `other_train.csv` and `other_test.csv`: Training and test custom data (if `other` is selected with a mapped file).
+
+These datasets can be used as inputs to the Flexynesis Galaxy tool for multi-omics analysis.
+
+**Note**: Ensure the study ID is valid and the selected data types (or mapped files) are available in the study archive. Clinical data (`clin`) is mandatory for splitting.
+]]></help>
+    <citations>
+        <citation type="doi">10.1101/2024.07.16.603606</citation>
+    </citations>
+</tool>
\ No newline at end of file

From f3386f44ea2c84b3a7ff810ce7385f5212c88868 Mon Sep 17 00:00:00 2001
From: PlushZ <polpolunina@gmail.com>
Date: Mon, 7 Apr 2025 09:47:07 +0200
Subject: [PATCH 2/5] fix linting

---
 tools/flexynesis/fetch_cbioportal_data.py         | 15 ++++++++-------
 tools/flexynesis/flexynesis_cbioportal_import.xml | 12 ++++++------
 2 files changed, 14 insertions(+), 13 deletions(-)

diff --git a/tools/flexynesis/fetch_cbioportal_data.py b/tools/flexynesis/fetch_cbioportal_data.py
index cab8f49129..68bfde3390 100644
--- a/tools/flexynesis/fetch_cbioportal_data.py
+++ b/tools/flexynesis/fetch_cbioportal_data.py
@@ -2,8 +2,10 @@
 
 import argparse
 import os
+
 from flexynesis.utils import CBioPortalData
 
+
 def main():
     parser = argparse.ArgumentParser(description="Fetch and prepare cBioPortal data for Flexynesis.")
     parser.add_argument("--study_id", required=True, help="cBioPortal study ID (e.g., 'brca_tcga')")
@@ -19,8 +21,8 @@ def main():
         raise ValueError("Clinical data ('clin') is required for splitting the dataset.")
     
     file_mapping = {
-        "clin": "data_clinical_patient.txt", # can be any with 'clinical' in file name
-        "mut": "data_mutations.txt", # any with 'mutations' in file name
+        "clin": "data_clinical_patient.txt",  # can be any with 'clinical' in file name
+        "mut": "data_mutations.txt",  # any with 'mutations' in file name
         "omics": "data_cna.txt",
         "other": None
     }
@@ -49,12 +51,11 @@ def main():
     for data_type in data_types:
         if data_type in dataset['train']:
             train_file = os.path.join(args.output_dir, f"{data_type}_train.csv")
-            dataset['train'][data_type].to_csv(train_file, index=True)  
-            print(f"Wrote training data to {train_file}")
+            dataset['train'][data_type].to_csv(train_file, index=True)
         if data_type in dataset['test']:
             test_file = os.path.join(args.output_dir, f"{data_type}_test.csv")
-            dataset['test'][data_type].to_csv(test_file, index=True) 
-            print(f"Wrote test data to {test_file}")
+            dataset['test'][data_type].to_csv(test_file, index=True)
+
 
 if __name__ == "__main__":
-    main()
\ No newline at end of file
+    main()
diff --git a/tools/flexynesis/flexynesis_cbioportal_import.xml b/tools/flexynesis/flexynesis_cbioportal_import.xml
index 644b459159..862e97cd4e 100644
--- a/tools/flexynesis/flexynesis_cbioportal_import.xml
+++ b/tools/flexynesis/flexynesis_cbioportal_import.xml
@@ -41,22 +41,22 @@
             <param name="data_types" value="clin,mut" />
             <param name="split_ratio" value="0.7" />
             <output_collection name="datasets" type="list">
-                <element name="clin_train">
+                <element name="clin_test">
                     <assert_contents>
                         <has_text_matching expression="PATIENT_ID"/>
                     </assert_contents>
                 </element>
-                <element name="mut_train">
+                <element name="clin_train">
                     <assert_contents>
-                        <has_text_matching expression="Hugo_Symbol"/>
+                        <has_text_matching expression="PATIENT_ID"/>
                     </assert_contents>
                 </element>
-                <element name="clin_test">
+                <element name="mut_test">
                     <assert_contents>
-                        <has_text_matching expression="PATIENT_ID"/>
+                        <has_text_matching expression="Hugo_Symbol"/>
                     </assert_contents>
                 </element>
-                <element name="mut_test">
+                <element name="mut_train">
                     <assert_contents>
                         <has_text_matching expression="Hugo_Symbol"/>
                     </assert_contents>

From 8b5f9b41dce187e8bc42a7f1ab415b8bb70541a6 Mon Sep 17 00:00:00 2001
From: PlushZ <polpolunina@gmail.com>
Date: Mon, 7 Apr 2025 10:30:57 +0200
Subject: [PATCH 3/5] fix linting py script

---
 tools/flexynesis/fetch_cbioportal_data.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tools/flexynesis/fetch_cbioportal_data.py b/tools/flexynesis/fetch_cbioportal_data.py
index 68bfde3390..c1121d0b05 100644
--- a/tools/flexynesis/fetch_cbioportal_data.py
+++ b/tools/flexynesis/fetch_cbioportal_data.py
@@ -13,13 +13,13 @@ def main():
     parser.add_argument("--mapped_files", default=None, help="Comma-separated list of .txt files to map to data_types (optional)")
     parser.add_argument("--split_ratio", type=float, default=0.7, help="Training/test split ratio (0.0 to 1.0)")
     parser.add_argument("--output_dir", required=True, help="Output directory for datasets")
-    
+
     args = parser.parse_args()
-    
+
     data_types = args.data_types.split(",")
     if "clin" not in data_types:
         raise ValueError("Clinical data ('clin') is required for splitting the dataset.")
-    
+
     file_mapping = {
         "clin": "data_clinical_patient.txt",  # can be any with 'clinical' in file name
         "mut": "data_mutations.txt",  # any with 'mutations' in file name
@@ -37,7 +37,7 @@ def main():
                 raise ValueError(f"Mapped file '{mf}' must end with '.txt'.")
     else:
         files_to_fetch = {dt: file_mapping[dt] for dt in data_types if dt in file_mapping}
-    
+
     invalid_types = set(data_types) - set(file_mapping.keys())
     if invalid_types:
         raise ValueError(f"Invalid data types: {invalid_types}. Supported types: {list(file_mapping.keys())}")

From 5f5973472c6158f271c956d5a747aa23a1c4ee4e Mon Sep 17 00:00:00 2001
From: Polina Polunina <55543056+PlushZ@users.noreply.github.com>
Date: Mon, 7 Apr 2025 11:36:11 +0200
Subject: [PATCH 4/5] Apply suggestions from code review

Co-authored-by: Saim Momin <64724322+SaimMomin12@users.noreply.github.com>
---
 tools/flexynesis/flexynesis_cbioportal_import.xml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tools/flexynesis/flexynesis_cbioportal_import.xml b/tools/flexynesis/flexynesis_cbioportal_import.xml
index 862e97cd4e..21d45fc53b 100644
--- a/tools/flexynesis/flexynesis_cbioportal_import.xml
+++ b/tools/flexynesis/flexynesis_cbioportal_import.xml
@@ -35,7 +35,7 @@
         </collection>
     </outputs>
     <tests>
-        <test>
+        <test expect_num_outputs="1">
             <param name="non_commercial_use" value="True"/>
             <param name="study_id" value="lgg_tcga" />
             <param name="data_types" value="clin,mut" />
@@ -63,7 +63,7 @@
                 </element>
             </output_collection>
         </test>
-        <test>
+        <test expect_num_outputs="1">
             <param name="non_commercial_use" value="True"/>
             <param name="study_id" value="lgg_tcga" />
             <param name="data_types" value="clin,mut,other" />

From 7323508a86f6d76a3b77a5fe6419d8e365ef996e Mon Sep 17 00:00:00 2001
From: Polina Polunina <55543056+PlushZ@users.noreply.github.com>
Date: Mon, 7 Apr 2025 11:38:17 +0200
Subject: [PATCH 5/5] bump tool version

---
 tools/flexynesis/macros.xml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/flexynesis/macros.xml b/tools/flexynesis/macros.xml
index 09196f4624..7243db49bb 100644
--- a/tools/flexynesis/macros.xml
+++ b/tools/flexynesis/macros.xml
@@ -1,6 +1,6 @@
 <macros>
     <token name="@TOOL_VERSION@">0.2.17</token>
-    <token name="@VERSION_SUFFIX@">0</token>
+    <token name="@VERSION_SUFFIX@">1</token>
     <token name="@PROFILE@">24.1</token>
     <xml name="requirements">
         <requirements>