snakemake · nacnoriko · Jul 20, 2023 · Jul 28, 2023 · May 13, 2025 · dlaehnemann
diff --git a/bio/reference/inphared-db/environment.yaml b/bio/reference/inphared-db/environment.yaml
@@ -0,0 +1,5 @@
+channels:
+  - conda-forge
+  - nodefaults
+dependencies:
+  - curl
diff --git a/bio/reference/inphared-db/meta.yaml b/bio/reference/inphared-db/meta.yaml
@@ -0,0 +1,4 @@
+name: inphared-db
+description: Download a sequence file from the [inphared database](https://github.com/RyanCook94/inphared/blob/main/README.md) and store it as a single .fasta file. Please check the above link for the available database versions and adjust the config file accordingly.
+authors:
+  - Noriko A. Cassman
diff --git a/bio/reference/inphared-db/old_wrapper.py b/bio/reference/inphared-db/old_wrapper.py
@@ -0,0 +1,80 @@
+__author__ = "Johannes Köster"
+__copyright__ = "Copyright 2019, Johannes Köster"
+__email__ = "[email protected]"
+__license__ = "MIT"
+
+import subprocess as sp
+import sys
+from itertools import product
+from snakemake.shell import shell
+
+# Wrapper parameters supplied by the calling rule's `params:` section.
+species = snakemake.params.species.lower()
+release = int(snakemake.params.release)
+build = snakemake.params.build
+
+branch = ""
+if release >= 81 and build == "GRCh37":
+    # use the special grch37 branch for new releases
+    branch = "grch37/"
+elif snakemake.params.get("branch"):
+    branch = snakemake.params.branch + "/"
+
+# Only stderr is redirected to the rule's log file (stdout=False, stderr=True).
+log = snakemake.log_fmt_shell(stdout=False, stderr=True)
+
+# Releases up to 75 carry the release number in the file name, later ones do not.
+spec = f"{build}" if release > 75 else f"{build}.{release}"
+
+datatype = snakemake.params.get("datatype", "")
+chromosome = snakemake.params.get("chromosome", "")
+# Candidate file suffixes per datatype, tried in the listed order.
+suffixes_by_datatype = {
+    "dna": ["dna.primary_assembly.fa.gz", "dna.toplevel.fa.gz"],
+    "cdna": ["cdna.all.fa.gz"],
+    "cds": ["cds.all.fa.gz"],
+    "ncrna": ["ncrna.fa.gz"],
+    "pep": ["pep.all.fa.gz"],
+}
+
+if datatype not in suffixes_by_datatype:
+    raise ValueError("invalid datatype, must be one of dna, cdna, cds, ncrna, pep")
+if chromosome and datatype != "dna":
+    raise ValueError(
+        "invalid datatype, to select a single chromosome the datatype must be dna"
+    )
+# A single-chromosome request replaces the whole-genome candidates.
+suffixes = suffixes_by_datatype[datatype]
+if chromosome:
+    suffixes = [f"dna.chromosome.{chromosome}.fa.gz"]
+
+# Common URL stem shared by all candidate files.
+url_prefix = f"ftp://ftp.ensembl.org/pub/{branch}release-{release}/fasta/{species}/{datatype}/{species.capitalize()}.{spec}"
+
+success = False
+for suffix in suffixes:
+    url = f"{url_prefix}.{suffix}"
+
+    try:
+        # Probe the URL first; a candidate whose probe fails is skipped.
+        shell("curl -sSf {url} > /dev/null 2> /dev/null")
+    except sp.CalledProcessError:
+        continue
+
+    shell("(curl -L {url} | gzip -d > {snakemake.output[0]}) {log}")
+    success = True
+    break
+
+# No candidate could be fetched: report the attempted URL(s) and fail the job.
+if not success:
+    if len(suffixes) > 1:
+        url = f"{url_prefix}.[{'|'.join(suffixes)}]"
+    else:
+        url = f"{url_prefix}.{suffixes[0]}"
+    print(
+        f"Unable to download requested sequence data from Ensembl ({url}). "
+        "Please check whether above URL is currently available (might be a temporary server issue). "
+        "Apart from that, did you check that this combination of species, build, and release is actually provided?",
+        file=sys.stderr,
+    )
+    sys.exit(1)
diff --git a/bio/reference/inphared-db/test/Snakefile b/bio/reference/inphared-db/test/Snakefile
@@ -0,0 +1,12 @@
+configfile: "config.yaml"
+
+rule get_inphareddb:
+    output:
+        "resources/inphared.fasta",  # fixed target path the wrapper passes to `curl -o`
+    params:
+        url=config["url"],  # base URL without trailing slash; the wrapper inserts the "/"
+        date=config["date"],  # release date tag, first part of the remote file name
+        suffix=config["suffix"],  # remaining part of the remote file name
+    wrapper:
+        "master/bio/reference/inphared-db"
+
diff --git a/bio/reference/inphared-db/test/config.yaml b/bio/reference/inphared-db/test/config.yaml
@@ -0,0 +1,9 @@
+date:
+    "2Jul2023"
+
+suffix:
+    "_refseq_genomes.fa"
+    #"_genomes_excluding_refseq.fa"
+
+url:  # base download URL, no trailing slash (the wrapper adds the "/" separator)
+    "https://millardlab-inphared.s3.climb.ac.uk"
diff --git a/bio/reference/inphared-db/test/old_release.smk b/bio/reference/inphared-db/test/old_release.smk
@@ -0,0 +1,29 @@
+rule get_genome:
+    output:
+        "refs/genome.fasta",
+    params:
+        species="saccharomyces_cerevisiae",
+        datatype="dna",  # no `chromosome` param is set for this rule
+        build="R64-1-1",
+        release="75",  # NOTE(review): presumably chosen to cover the release <= 75 "{build}.{release}" file naming — confirm
+    log:
+        "logs/get_genome.log",
+    cache: "omit-software"  # save space and time with between workflow caching (see docs)
+    wrapper:
+        "master/bio/reference/ensembl-sequence"
+
+
+rule get_chromosome:
+    output:
+        "refs/old_release.chr1.fasta",
+    params:
+        species="saccharomyces_cerevisiae",
+        datatype="dna",
+        build="R64-1-1",
+        release="75",
+        chromosome="I",  # restrict the download to this chromosome
+    log:
+        "logs/get_chromosome.log",  # own log file, distinct from get_genome's log
+    cache: "omit-software"  # save space and time with between workflow caching (see docs)
+    wrapper:
+        "master/bio/reference/ensembl-sequence"
diff --git a/bio/reference/inphared-db/test/old_snakefile.smk b/bio/reference/inphared-db/test/old_snakefile.smk
@@ -0,0 +1,30 @@
+rule get_genome:
+    output:
+        "refs/genome.fasta",
+    params:
+        species="saccharomyces_cerevisiae",
+        datatype="dna",  # no `chromosome` param is set for this rule
+        build="R64-1-1",
+        release="98",
+    log:
+        "logs/get_genome.log",
+    cache: "omit-software"  # save space and time with between workflow caching (see docs)
+    wrapper:
+        "master/bio/reference/ensembl-sequence"
+
+
+rule get_chromosome:
+    output:
+        "refs/chr1.fasta",
+    params:
+        species="saccharomyces_cerevisiae",
+        datatype="dna",
+        build="R64-1-1",
+        release="101",
+        chromosome="I",  # optional: restrict to chromosome
+        # branch="plants",  # optional: specify branch
+    log:
+        "logs/get_chromosome.log",  # own log file, distinct from get_genome's log
+    cache: "omit-software"  # save space and time with between workflow caching (see docs)
+    wrapper:
+        "master/bio/reference/ensembl-sequence"
diff --git a/bio/reference/inphared-db/wrapper.py b/bio/reference/inphared-db/wrapper.py
@@ -0,0 +1,9 @@
+__author__ = "Noriko A. Cassman"
+__copyright__ = "Copyright 2023, Noriko A. Cassman"
+__email__ = "[email protected]"
+__license__ = "MIT"
+
+from snakemake.shell import shell
+
+# --fail/--location: stop on HTTP errors and follow redirects rather than saving an error page as FASTA.
+shell("curl --fail --location {snakemake.params.url}/{snakemake.params.date}{snakemake.params.suffix} -o {snakemake.output[0]}")