Skip to content

Commit b80b58a

Browse files
committed
refactor(htsget): use htslurp as htsget client
1 parent a3bb23e commit b80b58a

1 file changed

Lines changed: 36 additions & 16 deletions

File tree

src/modos/genomics/htsget.py

Lines changed: 36 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -41,18 +41,20 @@
4141
import io
4242
from pathlib import Path
4343
import re
44-
import tempfile
4544
from typing import Any
4645
from urllib.parse import urlparse, parse_qs
4746

47+
from pysam.libcalignedsegment import AlignedSegment
48+
49+
import htslurp
4850
from pydantic import HttpUrl, validate_call
4951
from pydantic.dataclasses import dataclass
5052
import pysam
5153
import requests
5254

5355
from modos.remote import get_session
5456
from modos.genomics.region import Region
55-
from modos.genomics.formats import GenomicFileSuffix, read_pysam
57+
from modos.genomics.formats import GenomicFileSuffix
5658

5759

5860
@validate_call
@@ -255,39 +257,57 @@ def to_file(self, path: Path):
255257
for block in source:
256258
sink.write(block)
257259

260+
@property
def format(self) -> str:
    """Name of the ``GenomicFileSuffix`` member matching ``self.path``."""
    suffix = GenomicFileSuffix.from_path(self.path)
    return suffix.name
263+
258264
@classmethod
def from_url(cls, url: str):
    """Build an instance directly from a full htsget URL.

    The URL is split by ``parse_htsget_url`` into host, path and region,
    which are then forwarded to the class constructor.
    """
    endpoint, target, interval = parse_htsget_url(url)
    return cls(endpoint, target, region=interval)
263269

270+
def records(self, reference: Path | None = None) -> htslurp.RecordIter:
    """Stream the records of this htsget resource through ``htslurp``.

    Args:
        reference: Optional path forwarded unchanged as the ``reference``
            argument of ``htslurp.stream_records``.
            NOTE(review): presumably the reference sequence needed to
            decode CRAM -- confirm against the htslurp documentation.

    Returns:
        The record iterator produced by ``htslurp.stream_records`` for
        this object's url, path, format and region.
    """
    return htslurp.stream_records(
        base_url=self.url,
        id=str(self.path),
        format=self.format,
        region=self.region,
        reference=reference,
    )
279+
264280
def to_pysam(
    self, reference_filename: Path | None = None
) -> Iterator[pysam.AlignedSegment | pysam.VariantRecord]:
    """Yield the streamed htsget records as pysam objects.

    Only alignment formats (CRAM, BAM, SAM) can be converted: each raw
    record is parsed into an ``AlignedSegment`` using the stream header.

    Args:
        reference_filename: Optional path forwarded to ``self.records``.

    Yields:
        One parsed ``AlignedSegment`` per record. When ``self.region`` is
        set, records that do not overlap it are skipped.

    Raises:
        ValueError: If ``self.format`` is not CRAM, BAM or SAM. The check
            runs when iteration starts, before the remote stream is
            opened.
    """

    # NOTE: we use a dedicated client because pysam does not support bytestreams
    # ref: https://github.com/pysam-developers/pysam/blob/0787ca9da997b5911c00fd12584dad9741c82fb4/pysam/libcalignmentfile.pyx#L855
    # TODO: if above addressed, drop the dedicated client and let pysam
    # read from self.open() to stream directly from in-memory buffer.

    # The format does not change between records: validate it once, and
    # fail before any request is made for an unsupported format.
    file_format = self.format
    if file_format not in ("CRAM", "BAM", "SAM"):
        # NOTE: pysam does not support instantiating VariantRecord on the fly.
        raise ValueError(f"Cannot convert {file_format} records to pysam.")

    stream = self.records(reference_filename)

    for record in stream:
        # The header is read per record on purpose: whether it is
        # populated before iteration starts is not visible from here.
        parsed = AlignedSegment.fromstring(record.decode(), stream.header)

        if self.region is not None:
            # htsget includes all returns in the bgzf block
            # we filter out records outside requested region.
            # The region must be derived from the parsed pysam object,
            # not from the raw record payload.
            record_region = Region.from_pysam(parsed)
            if not record_region.overlaps(self.region):
                continue

        yield parsed

0 commit comments

Comments
 (0)