Skip to content

Commit 89ea985

Browse files
authored
Merge pull request #176 from mwang87/workbench-native
Workbench native support
2 parents 7e8d5ca + 547b0d3 commit 89ea985

File tree

6 files changed

+93
-72
lines changed

6 files changed

+93
-72
lines changed

download.py

Lines changed: 3 additions & 64 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,9 @@
1111
from tqdm import tqdm
1212
from time import sleep
1313

14+
from download_msv import _resolve_msv_usi
15+
from download_workbench import _resolve_metabolomicsworkbench_usi
16+
1417
def _get_usi_display_filename(usi):
1518
usi_splits = usi.split(":")
1619

@@ -65,54 +68,6 @@ def _usi_to_local_filename(usi):
6568

6669

6770

68-
69-
def _resolve_msv_usi(usi, force_massive=False):
70-
"""
71-
72-
73-
Args:
74-
usi ([type]): [description]
75-
force_massive (bool, optional): [description]. Defaults to False, we try to create the url given the USI, usually for non mzML/RAW files, e.g. CDF files
76-
77-
Returns:
78-
[type]: [description]
79-
"""
80-
81-
usi_splits = usi.split(':')
82-
83-
msv_usi = usi
84-
if len(usi.split(":")) == 3:
85-
msv_usi = "{}:scan:1".format(usi)
86-
87-
lookup_url = f'https://massive.ucsd.edu/ProteoSAFe/QuerySpectrum?id={msv_usi}'
88-
lookup_request = requests.get(lookup_url)
89-
90-
try:
91-
resolution_json = lookup_request.json()
92-
93-
remote_path = None
94-
95-
mzML_resolutions = [resolution for resolution in resolution_json["row_data"] if os.path.splitext(resolution["file_descriptor"])[1] == ".mzML"]
96-
mzXML_resolutions = [resolution for resolution in resolution_json["row_data"] if os.path.splitext(resolution["file_descriptor"])[1] == ".mzXML"]
97-
raw_resolutions = [resolution for resolution in resolution_json["row_data"] if os.path.splitext(resolution["file_descriptor"])[1].lower() == ".raw"]
98-
99-
if len(mzML_resolutions) > 0:
100-
remote_path = mzML_resolutions[0]["file_descriptor"]
101-
elif len(mzXML_resolutions) > 0:
102-
remote_path = mzXML_resolutions[0]["file_descriptor"]
103-
elif len(raw_resolutions) > 0:
104-
remote_path = raw_resolutions[0]["file_descriptor"]
105-
106-
# Format into FTP link
107-
remote_link = f"ftp://massive.ucsd.edu/{remote_path[2:]}"
108-
except:
109-
# We did not successfully look it up, this is the fallback try
110-
if force_massive:
111-
return f"ftp://massive.ucsd.edu/{usi_splits[1]}/{usi_splits[2]}"
112-
raise
113-
114-
return remote_link
115-
11671
def _resolve_gnps_usi(usi):
11772
usi_splits = usi.split(':')
11873

@@ -148,23 +103,7 @@ def _resolve_mtbls_usi(usi):
148103

149104
return remote_link
150105

151-
def _resolve_metabolomicsworkbench_usi(usi):
152-
usi_splits = usi.split(':')
153-
154-
# First looking
155-
dataset_accession = usi_splits[1]
156-
filename = usi_splits[2]
157-
158-
# Query Accession
159-
url = "https://massive.ucsd.edu/ProteoSAFe/QueryDatasets?task=N%2FA&file=&pageSize=30&offset=0&query=%257B%2522full_search_input%2522%253A%2522%2522%252C%2522table_sort_history%2522%253A%2522createdMillis_dsc%2522%252C%2522query%2522%253A%257B%257D%252C%2522title_input%2522%253A%2522{}%2522%257D&target=&_=1606254845533".format(dataset_accession)
160-
r = requests.get(url)
161-
data_json = r.json()
162-
163-
msv_accession = data_json["row_data"][0]["dataset"]
164-
165-
msv_usi = "mzspec:{}:{}:scan:1".format(msv_accession, filename)
166106

167-
return _resolve_msv_usi(msv_usi)
168107

169108
def _resolve_pxd_usi(usi):
170109
usi_splits = usi.split(':')

download_msv.py

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
import requests
2+
import os
3+
import sys
4+
5+
6+
def _resolve_msv_usi(usi, force_massive=False):
    """Resolve a MassIVE USI to an FTP download link.

    The MassIVE ``QuerySpectrum`` endpoint is asked which files back the USI.
    The first mzML file is preferred, then the first mzXML file, then the
    first raw file (raw is matched case-insensitively, the others exactly).

    Args:
        usi (str): USI of the form ``mzspec:<accession>:<filename>[:scan:<n>...]``.
        force_massive (bool, optional): Defaults to False. When True and the
            lookup response cannot be turned into a link, fall back to an FTP
            URL built directly from the USI's accession and filename parts
            instead of raising (mostly the case for CDF files).

    Returns:
        str: An ``ftp://massive.ucsd.edu/...`` link.

    Raises:
        Exception: Whatever went wrong while interpreting the lookup response
            is re-raised unchanged when ``force_massive`` is False (for
            example a JSON decoding error, a ``KeyError`` for a missing field,
            or a ``TypeError`` when no mzML/mzXML/raw file was listed).
    """
    usi_splits = usi.split(':')

    # A three-part USI carries no scan information; the lookup is queried
    # with a scan suffix, so a placeholder one is appended.
    msv_usi = usi
    if len(usi_splits) == 3:
        msv_usi = "{}:scan:1".format(usi)

    lookup_url = f'https://massive.ucsd.edu/ProteoSAFe/QuerySpectrum?id={msv_usi}'
    lookup_request = requests.get(lookup_url)

    try:
        resolution_json = lookup_request.json()

        # Remember the first file descriptor seen for each extension. Only
        # ".raw" is normalised to lower case; mzML/mzXML must match exactly.
        first_by_extension = {}
        for resolution in resolution_json["row_data"]:
            descriptor = resolution["file_descriptor"]
            extension = os.path.splitext(descriptor)[1]
            if extension.lower() == ".raw":
                extension = ".raw"
            first_by_extension.setdefault(extension, descriptor)

        # Preference order: mzML, then mzXML, then raw.
        remote_path = None
        for extension in (".mzML", ".mzXML", ".raw"):
            if extension in first_by_extension:
                remote_path = first_by_extension[extension]
                break

        # Format into FTP link. The first two characters of the descriptor
        # are dropped (presumably a short prefix such as "f." -- TODO confirm).
        # If nothing matched, remote_path is None and the slice raises
        # TypeError, which is handled below like any other failed lookup.
        remote_link = f"ftp://massive.ucsd.edu/{remote_path[2:]}"
    except Exception:
        # Catch Exception rather than using a bare except so that
        # KeyboardInterrupt/SystemExit are never converted into a URL.
        # We did not successfully look it up, this is the fallback try
        if force_massive:
            return f"ftp://massive.ucsd.edu/{usi_splits[1]}/{usi_splits[2]}"
        raise

    return remote_link

download_workbench.py

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
import requests
2+
from download_msv import _resolve_msv_usi
3+
4+
def _resolve_metabolomicsworkbench_usi(usi):
    """Resolve a Metabolomics Workbench USI to a remote download link.

    The study's archive listing on Metabolomics Workbench is consulted first.
    If that lookup fails, or lists no entry whose ``FILENAME`` contains the
    requested filename, the study accession is searched for among MassIVE
    dataset titles and the file is resolved as a MassIVE USI instead.

    Args:
        usi (str): USI of the form ``mzspec:<study accession>:<filename>[...]``.

    Returns:
        The ``URL`` value of the matching Workbench archive entry, or the
        result of ``_resolve_msv_usi`` for the MassIVE copy of the dataset.

    Raises:
        IndexError: If the USI has fewer than three ``:``-separated parts, or
            the MassIVE dataset search returns no rows.
    """
    usi_splits = usi.split(':')

    # Second and third USI fields: study accession and file name
    dataset_accession = usi_splits[1]
    filename = usi_splits[2]

    try:
        # Checking if Data is in Metabolomics Workbench
        dataset_list_url = "https://www.metabolomicsworkbench.org/data/show_archive_contents_json.php?STUDY_ID={}".format(dataset_accession)
        mw_file_list = requests.get(dataset_list_url).json()
        for file_obj in mw_file_list:
            # Substring match: the first entry whose FILENAME contains the
            # requested name wins.
            if filename in file_obj["FILENAME"]:
                return file_obj["URL"]
    except Exception:
        # Deliberate best-effort: any failure of the Workbench lookup (network
        # error, non-JSON reply, unexpected structure) falls through to the
        # MassIVE lookup below. Exception is caught rather than using a bare
        # except so KeyboardInterrupt/SystemExit still propagate.
        pass

    # Checking if Data is in MSV
    url = "https://massive.ucsd.edu/ProteoSAFe/QueryDatasets?task=N%2FA&file=&pageSize=30&offset=0&query=%257B%2522full_search_input%2522%253A%2522%2522%252C%2522table_sort_history%2522%253A%2522createdMillis_dsc%2522%252C%2522query%2522%253A%257B%257D%252C%2522title_input%2522%253A%2522{}%2522%257D&target=&_=1606254845533".format(dataset_accession)
    r = requests.get(url)
    data_json = r.json()

    # The first search hit is taken to be the MassIVE copy of the study.
    msv_accession = data_json["row_data"][0]["dataset"]
    msv_usi = "mzspec:{}:{}:scan:1".format(msv_accession, filename)

    return _resolve_msv_usi(msv_usi)

layout_misc.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -77,7 +77,9 @@
7777

7878
html.A("LCMS from Metabolights", href="/?usi=mzspec:MTBLS1124:QC07.mzML"),
7979
html.Br(),
80-
html.A("LCMS from Metabolomics Workbench that include MS/MS", href="/?usi=mzspec:ST000763:20160411_MB_CS00000074-1_P.mzXML"),
80+
html.A("LCMS from Metabolomics Workbench Native", href="/?usi=mzspec:ST001709:Sample_01___neg.mzXML"),
81+
html.Br(),
82+
html.A("LCMS from Metabolomics Workbench Imported into GNPS", href="/?usi=mzspec:ST000763:20160411_MB_CS00000074-1_P.mzXML"),
8183
html.Br(),
8284
html.A("Thermo LCMS from GNPS Analysis Classical Molecular Networking Task", href="/?usi=mzspec:GNPS:TASK-5ecfcf81cb3c471698995b194d8246a0-f.MSV000085444/ccms_peak/peak/Hui_N1_fe.mzML#%7B%7D"),
8385
html.Br(),

test/test_usi_resolution.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ def test_resolve_remote_url():
1010
for record in df.to_dict(orient="records"):
1111
print(record["usi"])
1212
remote_link = download._resolve_usi_remotelink(record["usi"])
13-
13+
print(remote_link)
1414
assert(len(remote_link) > 0)
1515

1616
# These might not work on a standard system because they include raw files
@@ -23,11 +23,11 @@ def test_resolve_remote_url():
2323
# assert(os.path.exists(local_filename))
2424

2525

26-
# def test_resolve_filename():
27-
# df = pd.read_csv("usi_list.tsv", sep='\t')
28-
# for record in df.to_dict(orient="records"):
29-
# converted_filename = download._usi_to_local_filename(record["usi"])
30-
# print(record["usi"], converted_filename)
26+
def test_resolve_filename():
    """Smoke-test local filename conversion for every USI in usi_list.tsv.

    Each USI is printed next to its converted filename; the test fails only
    if reading the list or converting a USI raises.
    """
    usi_table = pd.read_csv("usi_list.tsv", sep='\t')
    entries = usi_table.to_dict(orient="records")
    for entry in entries:
        local_name = download._usi_to_local_filename(entry["usi"])
        print(entry["usi"], local_name)
3131

3232
# def test_raw_filename():
3333
# converted_filename = download._usi_to_local_filename("mzspec:PXD007600:20150416_41_F1_S28_ZT_1_4.raw") # Should be in PRIDE

test/usi_list.tsv

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,4 +30,5 @@ mzspec:MSV000086729:ccms_peak/raw/Identification/UP_Fusion_AcX_SAT_ob_pool_1_1.m
3030
mzspec:PXD002854:20150414_QEp1_LC7_GaPI_SA_Serum_DT_03_150416181741.mzXML:scan:2308:[+314.188]-QQKPGQAPR/2
3131
mzspec:PXD002854:20150414_QEp1_LC7_GaPI_SA_Serum_DT_03_150416181741:scan:2308:[+314.188]-QQKPGQAPR/2
3232
mzspec:PXD002854:20150414_QEp1_LC7_GaPI_SA_Serum_DT_03_150416181741.raw:scan:2308:[+314.188]-QQKPGQAPR/2
33-
mzspec:PXD023650:03552_GA1_P_041575_P00_A00_30min_R1.raw
33+
mzspec:PXD023650:03552_GA1_P_041575_P00_A00_30min_R1.raw
34+
mzspec:ST001709:Sample_01___neg.mzXML

0 commit comments

Comments
 (0)