Skip to content

Commit 82e0820

Browse files
committed
Add support for tarball archives in TransformerModel
1 parent 91cb88a commit 82e0820

1 file changed

Lines changed: 97 additions & 2 deletions

File tree

chem_spectra/model/transformer.py

Lines changed: 97 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,7 @@
11
import json
2+
import io
23
import zipfile
4+
import tarfile
35
import tempfile
46
import glob # noqa: F401
57
import os
@@ -98,6 +100,29 @@ def __init__(self, file, molfile=None, params=False, multiple_files=False):
98100
self.params = params
99101
self.multiple_files = multiple_files
100102

103+
@staticmethod
def _is_tarball(name: str) -> bool:
    """Return True when *name* ends with a recognised tarball extension.

    The comparison is case-insensitive. A falsy *name* (``None`` or ``""``)
    is treated as an empty string and therefore yields ``False``.

    Besides plain ``.tar``, the gzip, xz and bzip2 spellings are accepted,
    matching the compressions that ``tarfile.open(..., 'r:*')`` can read.
    """
    suffixes = (
        ".tar",
        ".tar.gz", ".tgz",
        ".tar.xz", ".txz",
        ".tar.bz2", ".tbz2", ".tbz",
    )
    # str.endswith accepts a tuple, so a single call covers every suffix.
    return (name or "").lower().endswith(suffixes)
107+
108+
def _detect_archive_type(self) -> str | None:
109+
if not getattr(self.file, "bcore", None):
110+
return None
111+
try:
112+
if zipfile.is_zipfile(io.BytesIO(self.file.bcore)):
113+
return "zip"
114+
except Exception:
115+
pass
116+
try:
117+
with tempfile.NamedTemporaryFile(suffix=".tar") as tf:
118+
tf.write(self.file.bcore)
119+
tf.flush()
120+
if tarfile.is_tarfile(tf.name):
121+
return "tar"
122+
except Exception:
123+
pass
124+
return None
125+
101126
def convert2jcamp(self):
102127
cmpsr, _ = self.to_composer()
103128
if isinstance(cmpsr, BagItBaseConverter):
@@ -119,17 +144,23 @@ def convert2jcamp_img(self):
119144
return cmpsr.tf_jcamp(), cmpsr.tf_img(), cmpsr.tf_csv()
120145

121146
def to_composer(self):
147+
archive_type = self._detect_archive_type()
122148
is_raw_mzml = self.file.name.split('.')[-1].lower() in ['raw', 'mzml', 'mzxml'] # noqa: E501
123149
is_cdf = self.file.name.split('.')[-1].lower() in ['cdf']
124-
is_zip = self.file.name.split('.')[-1].lower() in ['zip']
150+
is_zip = self.file.name.split('.')[-1].lower() in ['zip'] or archive_type == "zip"
151+
is_tar = self._is_tarball(self.file.name) or archive_type == "tar"
125152
is_raw_mzml_by_params = self.params['ext'] in ['raw', 'mzml', 'mzxml']
126153
is_cdf_by_params = self.params['ext'] in ['cdf']
127154
is_zip_by_params = self.params['ext'] in ['zip']
155+
is_tar_by_params = self.params['ext'] in ['tar', 'tar.gz', 'tgz', 'tar.xz']
128156
if is_raw_mzml or is_raw_mzml_by_params:
129157
return self.ms2composer(), False
130158
if is_cdf or is_cdf_by_params:
131159
_, cp = self.cdf2cvp()
132160
return cp, False
161+
if is_tar or is_tar_by_params:
162+
_, cp, invalid_molfile = self.tar2cvp()
163+
return cp, invalid_molfile
133164
if is_zip or is_zip_by_params:
134165
_, cp, invalid_molfile = self.zip2cvp()
135166
return cp, invalid_molfile
@@ -138,17 +169,23 @@ def to_composer(self):
138169
return cp, invalid_molfile
139170

140171
def to_converter(self):
172+
archive_type = self._detect_archive_type()
141173
is_raw_mzml = self.file.name.split('.')[-1].lower() in ['raw', 'mzml', 'mzxml'] # noqa: E501
142174
is_cdf = self.file.name.split('.')[-1].lower() in ['cdf']
143-
is_zip = self.file.name.split('.')[-1].lower() in ['zip']
175+
is_zip = self.file.name.split('.')[-1].lower() in ['zip'] or archive_type == "zip"
176+
is_tar = self._is_tarball(self.file.name) or archive_type == "tar"
144177
is_raw_mzml_by_params = self.params['ext'] in ['raw', 'mzml', 'mzxml']
145178
is_cdf_by_params = self.params['ext'] in ['cdf']
146179
is_zip_by_params = self.params['ext'] in ['zip']
180+
is_tar_by_params = self.params['ext'] in ['tar', 'tar.gz', 'tgz', 'tar.xz']
147181
if is_raw_mzml or is_raw_mzml_by_params:
148182
return self.ms2composer()
149183
if is_cdf or is_cdf_by_params:
150184
cv, _ = self.cdf2cvp()
151185
return cv
186+
if is_tar or is_tar_by_params:
187+
cv, _, _ = self.tar2cvp()
188+
return cv
152189
if is_zip or is_zip_by_params:
153190
cv, _, _ = self.zip2cvp()
154191
return cv
@@ -271,6 +308,64 @@ def zip2cvp(self):
271308

272309
return False, False, False
273310

311+
def tar2cvp(self):
    """Build an LC/MS converter and composer from an uploaded tarball.

    The archive held in ``self.file.bcore`` is unpacked into a temporary
    directory. If ``_find_dir_with_cdf`` locates a data directory there,
    the LC and MS tables are loaded (from the converter app output when
    available, otherwise through the OpenLab readers), validated, written
    as normalized CSV files and handed to ``LCMSConverter`` /
    ``LCMSComposer``.

    Returns:
        ``(LCMSConverter, LCMSComposer, False)`` on success, or
        ``(False, False, False)`` when no data directory is found or the
        OpenLab readers are unavailable. ``to_composer`` unpacks the third
        element as its invalid-molfile flag.

    Raises:
        RuntimeError: if the LC or MS tables lack a required column.
        tarfile.TarError: if the archive cannot be read or contains a
            member that would be written outside the extraction directory.
    """
    with tempfile.TemporaryDirectory() as td:
        # Read the archive straight from memory: 'r:*' sniffs the
        # compression from the content, so no suffixed temp file (which was
        # previously created and never closed) is needed.
        with tarfile.open(fileobj=io.BytesIO(self.file.bcore), mode='r:*') as t:
            # The tarball is user-supplied, so never extract it blindly:
            # members such as '../../x' or absolute paths could otherwise
            # overwrite files outside the temporary directory.
            if hasattr(tarfile, 'data_filter'):
                t.extractall(td, filter='data')
            else:
                # Fallback for interpreters without extraction filters:
                # reject escaping paths, extract only regular files/dirs.
                root = os.path.realpath(td)
                safe_members = []
                for member in t.getmembers():
                    dest = os.path.realpath(os.path.join(root, member.name))
                    if dest != root and not dest.startswith(root + os.sep):
                        raise tarfile.TarError(
                            f'Refusing to extract {member.name!r} outside of target directory'
                        )
                    if member.isfile() or member.isdir():
                        safe_members.append(member)
                t.extractall(td, members=safe_members)

        openlab_dir = _find_dir_with_cdf(td)
        if not openlab_dir:
            return False, False, False

        normalized_dir = os.path.join(td, 'normalized')
        os.makedirs(normalized_dir, exist_ok=True)

        converter_frames = lcms_frames_from_converter_app(openlab_dir)
        if converter_frames is not None:
            lc_df, minus_df, plus_df = converter_frames
        else:
            read_lc, read_ms = get_openlab_readers()
            if read_lc is None or read_ms is None:
                return False, False, False
            lc_df = read_lc(openlab_dir)
            minus_df, plus_df = read_ms(openlab_dir)

        # Validate the LC table and keep only the columns written out.
        required_lc_cols = ['RetentionTime', 'DetectorSignal', 'wavelength']
        for col in required_lc_cols:
            if col not in lc_df.columns:
                raise RuntimeError(f'LC output missing required column {col}')
        lc_df = lc_df[required_lc_cols]

        # Both MS polarities must expose the same spectral columns.
        for label, df in (('MINUS', minus_df), ('PLUS', plus_df)):
            for col in ('mz', 'intensities', 'time'):
                if col not in df.columns:
                    raise RuntimeError(f'MS {label} missing column {col}')

        tic_minus = compute_tic(minus_df)
        tic_plus = compute_tic(plus_df)

        # Write the normalized CSV set that LCMSConverter is pointed at.
        lc_df.to_csv(os.path.join(normalized_dir, 'LCMS.csv'), index=False)
        tic_minus.to_csv(os.path.join(normalized_dir, 'TIC_MINUS.csv'), index=False)
        tic_plus.to_csv(os.path.join(normalized_dir, 'TIC_PLUS.csv'), index=False)
        minus_df[['time', 'mz', 'intensities']].to_csv(
            os.path.join(normalized_dir, 'MZ_MINUS_Spectra.csv'), index=False
        )
        plus_df[['time', 'mz', 'intensities']].to_csv(
            os.path.join(normalized_dir, 'MZ_PLUS_Spectra.csv'), index=False
        )

        lcms_cv = LCMSConverter(normalized_dir, self.params, os.path.basename(self.file.name))

        # Optional user-supplied peaks arrive as a JSON string in params;
        # malformed input is ignored rather than failing the conversion.
        lcms_peaks = None
        if self.params and 'lcms_peaks' in self.params:
            try:
                lcms_peaks = json.loads(self.params['lcms_peaks'])
            except (json.JSONDecodeError, TypeError):
                lcms_peaks = None
        lcms_np = LCMSComposer(lcms_cv, lcms_peaks)
        return lcms_cv, lcms_np, False
368+
274369
def zip2cv_with_processed_file(self, target_dir, params, file_name):
275370
fid_brucker = FidHasBruckerProcessed(target_dir, params, file_name)
276371
if not fid_brucker:

0 commit comments

Comments
 (0)