Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 32 additions & 15 deletions src/decoupler/op/_hallmark.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from concurrent.futures import Future, ThreadPoolExecutor

import pandas as pd

from decoupler._docs import docs
Expand All @@ -11,7 +13,8 @@ def hallmark(
organism: str = "human",
license: str = "academic",
verbose: bool = False,
) -> pd.DataFrame:
as_future: bool = False,
) -> pd.DataFrame | Future:
"""
Hallmark gene sets :cite:p:`msigdb`.

Expand All @@ -23,10 +26,13 @@ def hallmark(
%(organism)s
%(license)s
%(verbose)s
as_future : bool
If True, returns a `Future` to allow asynchronous execution.

Returns
-------
Dataframe in long format containing the hallmark gene sets.
Dataframe in long format containing the hallmark gene sets
or a Future that resolves to it.

Example
-------
Expand All @@ -36,17 +42,28 @@ def hallmark(

hm = dc.op.hallmark()
hm

# Asynchronous
future = dc.op.hallmark(as_future=True)
hm = future.result()
"""
# Location of the gzipped, tab-separated hallmark table.
url = "https://static.omnipathdb.org/tables/msigdb-hallmark.tsv.gz"
# Fetch the raw payload, then parse it as a gzip-compressed TSV.
hm = _download(url, verbose=verbose)
hm = _bytes_to_pandas(hm, sep="\t", compression="gzip")
# Only the gene set name and the gene symbol columns are used from here on.
hm = hm[["geneset", "genesymbol"]]
# Remove the "HALLMARK_" text from the gene set names.
hm["geneset"] = hm["geneset"].str.replace("HALLMARK_", "")
# Remove the "COMPLEX:" text and split on "_" so that each entry becomes a
# list of symbols; explode() then emits one row per symbol.
hm["genesymbol"] = hm["genesymbol"].str.replace("COMPLEX:", "").str.split("_")
hm = hm.explode("genesymbol")
# NOTE(review): presumably re-infers column dtypes after explode() -- confirm
# against the helper's definition.
hm = _infer_dtypes(hm)
# Symbols are only passed through translate() when a non-human organism is
# requested.
if organism != "human":
hm = translate(hm, columns=["genesymbol"], target_organism=organism, verbose=verbose)
# Expose the result with "source"/"target" column names and make every
# (source, target) pair unique, with a fresh 0..n-1 index.
hm = hm.rename(columns={"geneset": "source", "genesymbol": "target"})
hm = hm.drop_duplicates(["source", "target"]).reset_index(drop=True)
return hm

def _task():
    """Download the hallmark table and return it as a long source/target dataframe."""
    url = "https://static.omnipathdb.org/tables/msigdb-hallmark.tsv.gz"
    payload = _download(url, verbose=verbose)
    table = _bytes_to_pandas(payload, sep="\t", compression="gzip")
    table = table[["geneset", "genesymbol"]]

    # Remove the "HALLMARK_" text from the gene set names.
    set_names = table["geneset"].str.replace("HALLMARK_", "")
    table["geneset"] = set_names

    # Remove the "COMPLEX:" text and split on "_" so that explode() yields
    # one gene symbol per row.
    symbols = table["genesymbol"].str.replace("COMPLEX:", "")
    table["genesymbol"] = symbols.str.split("_")
    table = _infer_dtypes(table.explode("genesymbol"))

    # Symbols are only translated when a non-human organism is requested.
    if organism != "human":
        table = translate(table, columns=["genesymbol"], target_organism=organism, verbose=verbose)

    renamed = table.rename(columns={"geneset": "source", "genesymbol": "target"})
    return renamed.drop_duplicates(["source", "target"]).reset_index(drop=True)

if as_future:
with ThreadPoolExecutor(max_workers=1) as executor:
return executor.submit(_task)
return _task()
9 changes: 9 additions & 0 deletions tests/op/test_hallmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,3 +9,12 @@ def test_hallmark():
cols = {"source", "target"}
assert cols.issubset(hm.columns)
assert not hm.duplicated(["source", "target"]).any()


def test_hallmark_as_future():
    """With ``as_future=True`` the returned future resolves to a unique source/target dataframe."""
    pending = dc.op.hallmark(as_future=True)
    result = pending.result()
    assert isinstance(result, pd.DataFrame)
    # Both network columns must be present.
    assert {"source", "target"} <= set(result.columns)
    # No (source, target) pair may appear twice.
    assert not result.duplicated(["source", "target"]).any()
Loading