Skip to content

Commit f49dca7

Browse files
committed
add flexynesis_cbioportal_import
1 parent cefdfdc commit f49dca7

File tree

2 files changed

+181
-0
lines changed

2 files changed

+181
-0
lines changed
Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
#!/usr/bin/env python
2+
3+
import argparse
4+
import os
5+
from flexynesis.utils import CBioPortalData
6+
7+
def main(argv=None):
    """Fetch a cBioPortal study, split it into train/test sets and write CSVs.

    All command-line input is validated before any data is fetched, so a bad
    invocation fails without touching the network or the output directory.

    Args:
        argv: Optional list of command-line arguments. When ``None``
            (the default) argparse reads ``sys.argv[1:]``.

    Raises:
        ValueError: If 'clin' is not among the requested data types, the
            split ratio is outside [0.0, 1.0], an unsupported data type is
            requested, the mapped files do not line up with the data types
            or do not end in '.txt', or a data type without a default file
            (e.g. 'other') is requested without ``--mapped_files``.
    """
    parser = argparse.ArgumentParser(description="Fetch and prepare cBioPortal data for Flexynesis.")
    parser.add_argument("--study_id", required=True, help="cBioPortal study ID (e.g., 'brca_tcga')")
    parser.add_argument("--data_types", required=True, help="Comma-separated list of data types (e.g., 'clin,mut,omics')")
    parser.add_argument("--mapped_files", default=None, help="Comma-separated list of .txt files to map to data_types (optional)")
    parser.add_argument("--split_ratio", type=float, default=0.7, help="Training/test split ratio (0.0 to 1.0)")
    parser.add_argument("--output_dir", required=True, help="Output directory for datasets")

    args = parser.parse_args(argv)

    # Tolerate whitespace around commas and stray empty entries ("clin, mut,").
    data_types = [dt.strip() for dt in args.data_types.split(",") if dt.strip()]
    if "clin" not in data_types:
        raise ValueError("Clinical data ('clin') is required for splitting the dataset.")

    # The comparison is also False for NaN, so NaN is rejected here as well.
    if not 0.0 <= args.split_ratio <= 1.0:
        raise ValueError(f"Split ratio must be between 0.0 and 1.0, got {args.split_ratio}.")

    # Default archive file for each supported data type; None means there is
    # no default and the file name must be supplied through --mapped_files.
    file_mapping = {
        "clin": "data_clinical_patient.txt",  # can be any with 'clinical' in file name
        "mut": "data_mutations.txt",  # any with 'mutations' in file name
        "omics": "data_cna.txt",
        "other": None,
    }

    invalid_types = set(data_types) - set(file_mapping)
    if invalid_types:
        raise ValueError(f"Invalid data types: {invalid_types}. Supported types: {list(file_mapping.keys())}")

    if args.mapped_files:
        mapped_files = [mf.strip() for mf in args.mapped_files.split(",")]
        if len(mapped_files) != len(data_types):
            raise ValueError(f"Number of mapped files ({len(mapped_files)}) must match number of data types ({len(data_types)}).")
        for mf in mapped_files:
            if not mf.endswith(".txt"):
                raise ValueError(f"Mapped file '{mf}' must end with '.txt'.")
        # Files are paired with data types purely by position.
        files_to_fetch = dict(zip(data_types, mapped_files))
    else:
        files_to_fetch = {dt: file_mapping[dt] for dt in data_types}
        without_default = [dt for dt, name in files_to_fetch.items() if name is None]
        if without_default:
            # Fail early instead of handing a None file name to the fetcher.
            raise ValueError(
                f"Data type(s) {without_default} have no default file; "
                "provide file names via --mapped_files."
            )

    cbioportal = CBioPortalData(study_id=args.study_id)
    cbioportal.get_cbioportal_data(study_id=args.study_id, files=files_to_fetch)
    dataset = cbioportal.split_data(ratio=args.split_ratio)

    os.makedirs(args.output_dir, exist_ok=True)

    # One CSV per data type and split: <data_type>_train.csv / <data_type>_test.csv.
    for data_type in data_types:
        for split, label in (("train", "training"), ("test", "test")):
            if data_type in dataset[split]:
                out_file = os.path.join(args.output_dir, f"{data_type}_{split}.csv")
                dataset[split][data_type].to_csv(out_file, index=True)
                print(f"Wrote {label} data to {out_file}")


if __name__ == "__main__":
    main()
Lines changed: 121 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,121 @@
1+
<tool id="flexynesis_cbioportal_import" name="Flexynesis cBioPortal import" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@">
2+
<description>and prepare cBioPortal data for Flexynesis analysis</description>
3+
<macros>
4+
<import>macros.xml</import>
5+
</macros>
6+
<expand macro="requirements"/>
7+
<command detect_errors="exit_code"><![CDATA[
8+
@CHECK_NON_COMMERCIAL_USE@
9+
python '$__tool_directory__/fetch_cbioportal_data.py'
10+
--study_id '$study_id'
11+
--data_types '$data_types'
12+
#if $mapped_files:
13+
--mapped_files '$mapped_files'
14+
#end if
15+
--split_ratio '$split_ratio'
16+
--output_dir 'output'
17+
]]></command>
18+
<inputs>
19+
<param name="non_commercial_use" label="I certify that I am not using this tool for commercial purposes." type="boolean" truevalue="NON_COMMERCIAL_USE" falsevalue="COMMERCIAL_USE" checked="False">
20+
<validator type="expression" message="This tool is only available for non-commercial use.">value == True</validator>
21+
</param>
22+
<param name="study_id" type="text" label="cBioPortal study ID" help="The ID of the study to fetch from cBioPortal (e.g., 'brca_tcga')." />
23+
<param name="data_types" type="select" multiple="true" label="Data types to fetch" help="Select the types of data to retrieve from cBioPortal.">
24+
<option value="clin" selected="true">Clinical data (default: data_clinical_patient.txt)</option>
25+
<option value="mut">Mutations (default: data_mutations.txt)</option>
26+
<option value="omics">Omics data (default: data_cna.txt)</option>
27+
<option value="other">Other custom data</option>
28+
</param>
29+
<param name="mapped_files" type="text" optional="true" label="Mapped files" help="Comma-separated list of .txt files to map to the selected data types (e.g., 'data_clinical_sample.txt,data_mutations.txt,data_custom.txt'). Must match the number and order of data types." />
30+
<param name="split_ratio" type="float" value="0.7" min="0.0" max="1.0" label="Training/Test split ratio" help="Proportion of data to use for training (e.g., 0.7 means 70% train, 30% test)." />
31+
</inputs>
32+
<outputs>
33+
<collection name="datasets" type="list" label="${tool.name} on ${study_id}: datasets">
34+
<discover_datasets pattern="(?P&lt;name&gt;.+_(train|test))\.csv$" format="csv" directory="output" />
35+
</collection>
36+
</outputs>
37+
<tests>
38+
<test>
39+
<param name="non_commercial_use" value="True"/>
40+
<param name="study_id" value="lgg_tcga" />
41+
<param name="data_types" value="clin,mut" />
42+
<param name="split_ratio" value="0.7" />
43+
<output_collection name="datasets" type="list">
44+
<element name="clin_train">
45+
<assert_contents>
46+
<has_text_matching expression="PATIENT_ID"/>
47+
</assert_contents>
48+
</element>
49+
<element name="mut_train">
50+
<assert_contents>
51+
<has_text_matching expression="Hugo_Symbol"/>
52+
</assert_contents>
53+
</element>
54+
<element name="clin_test">
55+
<assert_contents>
56+
<has_text_matching expression="PATIENT_ID"/>
57+
</assert_contents>
58+
</element>
59+
<element name="mut_test">
60+
<assert_contents>
61+
<has_text_matching expression="Hugo_Symbol"/>
62+
</assert_contents>
63+
</element>
64+
</output_collection>
65+
</test>
66+
<test>
67+
<param name="non_commercial_use" value="True"/>
68+
<param name="study_id" value="lgg_tcga" />
69+
<param name="data_types" value="clin,mut,other" />
70+
<param name="mapped_files" value="data_clinical_patient.txt,data_mutations.txt,data_cna.txt" />
71+
<param name="split_ratio" value="0.8" />
72+
<output_collection name="datasets" type="list">
73+
<element name="clin_train">
74+
<assert_contents>
75+
<has_text_matching expression="PATIENT_ID"/>
76+
</assert_contents>
77+
</element>
78+
<element name="mut_train">
79+
<assert_contents>
80+
<has_text_matching expression="Hugo_Symbol"/>
81+
</assert_contents>
82+
</element>
83+
<element name="other_test">
84+
<assert_contents>
85+
<has_text_matching expression="Hugo_Symbol"/>
86+
</assert_contents>
87+
</element>
88+
</output_collection>
89+
</test>
90+
</tests><help><![CDATA[
91+
**Flexynesis cBioPortal import**
92+
93+
This tool fetches data from cBioPortal using the Flexynesis `CBioPortalData` class and prepares it for use with the Flexynesis Galaxy tool. It downloads a specified study, extracts the requested data types, splits them into training and test sets, and outputs them as CSV files compatible with Flexynesis (e.g., `clin_train.csv`, `mut_test.csv`).
94+
95+
**Inputs**
96+
97+
- **cBioPortal study ID**: The identifier of the study to fetch (e.g., `brca_tcga`, `lgg_tcga`). Find study IDs on the cBioPortal.
98+
- **Data types to fetch**: Select one or more data types to retrieve. 'Clinical data' (`clin`) is required for splitting. Options:
99+
- `clin`: Clinical data (default: `data_clinical_patient.txt`)
100+
- `mut`: Mutation data (default: `data_mutations.txt`)
101+
- `omics`: Omics data (default: `data_cna.txt`)
102+
- `other`: Custom data type (has no default file; requires a file name in **Mapped files**)
103+
- **Mapped files (optional)**: A comma-separated list of `.txt` files to override default filenames. Must match the number and order of selected data types (e.g., `data_clinical_sample.txt,data_mutations.txt,data_custom.txt`).
104+
- **Training/Test split ratio**: The proportion of data for training (e.g., 0.7 means 70% train, 30% test).
105+
106+
**Outputs**
107+
108+
A collection of datasets including:
109+
- `clin_train.csv` and `clin_test.csv`: Training and test clinical data (always included).
110+
- `mut_train.csv` and `mut_test.csv`: Training and test mutation data (if selected).
111+
- `omics_train.csv` and `omics_test.csv`: Training and test omics/CNA data (if selected).
112+
- `other_train.csv` and `other_test.csv`: Training and test custom data (if `other` is selected with a mapped file).
113+
114+
These datasets can be used as inputs to the Flexynesis Galaxy tool for multi-omics analysis.
115+
116+
**Note**: Ensure the study ID is valid and the selected data types (or mapped files) are available in the study archive. Clinical data (`clin`) is mandatory for splitting.
117+
]]></help>
118+
<citations>
119+
<citation type="doi">10.1101/2024.07.16.603606</citation>
120+
</citations>
121+
</tool>

0 commit comments

Comments
 (0)