Skip to content
Open
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3,434 changes: 969 additions & 2,465 deletions pixi.lock

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions pixi.toml
Original file line number Diff line number Diff line change
Expand Up @@ -65,5 +65,6 @@ snakefmt = "*"
ruff = "*"
awscli = "2.22.*"
taplo = "*"
pysam = "*"

[pypi-dependencies]
80 changes: 71 additions & 9 deletions workflow/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -32,22 +32,30 @@ if config.get("full-version", False):
MAX_THREADS = config.get("max_threads", 4)
SORT_THREADS = config.get("sort_threads", 8)

# sample, haplotype, and chromosome wildcard building
#FAI_DF = get_fai_df()
DEFAULT_ENV = config.get("env", "../envs/env.yaml")
MANIFEST = get_manifest()
MIN_FIRE_FDR = config.get("min_fire_fdr", 0.10)

# reference genome and reference regions
REF = get_ref()
FAI = get_fai()
REF_NAME = config["ref_name"]
EXCLUDES = get_excludes()
#REF = get_ref()
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

delete completely instead of commenting

#FAI = get_fai()
#REF_NAME = config["ref_name"]
#REF_NAME = get_ref_name()
#EXCLUDES = get_excludes()
KEEP_CHRS = config.get("keep_chromosomes", ".*")
#FAI_DF = get_fai_df()

# coverage requirements
MIN_COVERAGE = config.get("min_coverage", 4)
COVERAGE_WITHIN_N_SD = config.get("coverage_within_n_sd", 5)

# sample, haplotype, and chromosome wildcard building
FAI_DF = get_fai_df()
DEFAULT_ENV = config.get("env", "../envs/env.yaml")
MANIFEST = get_manifest()
MIN_FIRE_FDR = config.get("min_fire_fdr", 0.10)
#FAI_DF = get_fai_df()
#DEFAULT_ENV = config.get("env", "../envs/env.yaml")
#MANIFEST = get_manifest()
#MIN_FIRE_FDR = config.get("min_fire_fdr", 0.10)

# FDR / peak calling thresholds
MAX_PEAK_FDR = config.get("max_peak_fdr", 0.05)
Expand Down Expand Up @@ -109,8 +117,62 @@ if DSA:
include: "rules/levio.smk"


## mod manifest to include reference
if "ref" not in MANIFEST.columns:
if "ref" not in config or str(config.get("ref")) == "None":
raise ValueError("FIRE: ref parameter is missing in config.yaml")
else:
temp_ref = config.get("ref")
MANIFEST["ref"] = temp_ref
if "ref_name" not in config or str(config.get("ref_name")) == "None":
raise ValueError("FIRE: ref_name parameter is missing in config.yaml")
else:
temp_ref_name = config.get("ref_name")
MANIFEST["ref_name"] = temp_ref_name

# make list of all chrs for
chroms_list_joined = ""

for index, row in MANIFEST.iterrows():
first_report_var = True
min_contig_length = config.get("min_contig_length", 0)
fai = row['ref'] + ".fai"
fai_df = pd.read_csv(fai, sep="\t", names=["chr", "length", "x", "y", "z"])

skipped_contigs = fai_df["chr"][fai_df["length"] < min_contig_length]
if len(skipped_contigs) > 0 and first_report_var:
print(
f"WARNING: Skipping contigs with length < {min_contig_length:,}: {skipped_contigs}",
file=sys.stderr,
)

chroms = fai_df["chr"][fai_df["length"] >= min_contig_length]
chroms = sorted([chrom for chrom in chroms if "chrUn_" not in chrom])
chroms = [chrom for chrom in chroms if "_random" not in chrom]
chroms = [chrom for chrom in chroms if re.fullmatch(KEEP_CHRS, chrom)]

if first_report_var:
first_report_var = False
print(f"INFO: Using N chromosomes: {len(chroms)}", file=sys.stderr)

if len(chroms) == 0:
raise ValueError(
f"No chromosomes left after filtering. Check your keep_chromosomes parameter in config.yaml. "
f"Your fai file contains the following chromosomes: {fai_df['chr']}"
)
temp_chroms_list="|".join(chroms)
if chroms_list_joined == "":
chroms_list_joined = temp_chroms_list
else:
chroms_list_joined = chroms_list_joined + "|" + temp_chroms_list

unique_chroms_list_joined = "|".join(set(chroms_list_joined.split("|")))

#print(unique_chroms_list_joined)

wildcard_constraints:
chrom="|".join(get_chroms()),
#chrom="|".join(get_chroms()),
chrom=unique_chroms_list_joined,
call="|".join(["msp", "m6a"]),
sm="|".join(MANIFEST.index),
types="|".join(types),
Expand Down
1 change: 1 addition & 0 deletions workflow/envs/env.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,3 +14,4 @@ dependencies:
- csvtk
- mosdepth==0.3.7
- bioconda::bigtools==0.5.5
#- pysam==0.22
2 changes: 1 addition & 1 deletion workflow/envs/python.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,6 @@ dependencies:
- pip
- pip:
- polars-lts-cpu[pyarrow]==1.7.1
#- pysam==0.22
#- tqdm
#- numba::numba==0.60
#- pysam==0.21
2 changes: 1 addition & 1 deletion workflow/envs/runner.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,4 +8,4 @@ dependencies:
#- tqdm
#- pip
#- pip:
#- pysam==0.22.1
- pysam==0.22.1
5 changes: 3 additions & 2 deletions workflow/rules/apply-model.smk
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@
rule fire:
input:
bam=ancient(get_input_bam),
ref=ancient(REF),
ref=ancient(get_ref)
#ref=ancient(REF),
output:
cram="results/{sm}/{sm}-fire-{v}-filtered.cram",
crai="results/{sm}/{sm}-fire-{v}-filtered.cram.crai",
Expand Down Expand Up @@ -75,7 +76,7 @@ rule fire_sites_chrom:
rule fire_sites:
input:
beds=expand(
rules.fire_sites_chrom.output.bed, chrom=get_chroms(), allow_missing=True
rules.fire_sites_chrom.output.bed, chrom=get_chroms, allow_missing=True
),
output:
bed="results/{sm}/additional-outputs-{v}/fire-peaks/{sm}-{v}-fire-elements.bed.gz",
Expand Down
177 changes: 166 additions & 11 deletions workflow/rules/common.smk
Original file line number Diff line number Diff line change
@@ -1,29 +1,73 @@
import re
import logging
import sys
import pysam

FIRST_REPORT = True


def get_ref():
def get_ref_orig():
if "ref" not in config:
raise ValueError("FIRE: ref parameter is missing in config.yaml")
ref = config["ref"]
if not os.path.exists(ref):
raise ValueError(f"FIRE: reference file {ref} does not exist")
return os.path.abspath(ref)

def get_ref_old(wc):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think the functions you have added with _old or _orig are not used. if so they should be removed.

if "ref" in config:
ref = config["ref"]
if "ref" not in config:
if "ref" in MANIFEST.columns:
ref = get_input_ref()
else:
raise ValueError("FIRE: ref parameter is missing in config.yaml and no ref column in manifest")
#ref = config["ref"]
if not os.path.exists(ref):
raise ValueError(f"FIRE: reference file {ref} does not exist")
return os.path.abspath(ref)

def get_fai():
fai = f"{get_ref()}.fai"
def get_ref(wc):
ref = MANIFEST.loc[wc.sm, "ref"]
if not os.path.exists(ref):
raise ValueError(f"FIRE: reference file {ref} does not exist")
return os.path.abspath(ref)

def get_ref_name_old(wc):
if "ref_name" in config:
ref_name=config["ref_name"]
else:
if "ref_name" in MANIFEST.columns:
ref_name= get_input_ref_name()
else:
raise ValueError("FIRE: ref_name parameter is missing in config.yaml and no ref_name column in manifest")
#ref=get_ref()
#temp_split=ref.strip().split('/')[-1]
#fa_split=temp_split.split('.fa')
#ref_name = '.fa'.join(fa_split[:-1])
return(ref_name)

def get_ref_name(wc):
return MANIFEST.loc[wc.sm, "ref_name"]

def get_fai_orig(wc):
fai = f"{get_ref(wc)}.fai"
if not os.path.exists(fai):
raise ValueError(f"FIRE: reference index file {fai} does not exist")
return fai

def get_fai(wc):
ref = MANIFEST.loc[wc.sm, "ref"]
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

to get ref you should call get_ref. And then generally it is good to use f strings in situations like this:

ref = get_ref(wc)
fai = f"{ref}.fai"
...

#fai= ref + ".fai"
fai = str(ref) + ".fai"
if not os.path.exists(fai):
raise ValueError(f"FIRE: reference index file {fai} does not exist")
return fai

def get_excludes():
def get_excludes(wc):
excludes = config.get("excludes", [])
if REF_NAME == "hg38" or REF_NAME == "GRCh38":
ref_name = get_ref_name(wc)
if ref_name == "hg38" or ref_name == "GRCh38":
files = [
"../annotations/hg38.gap.bed.gz",
"../annotations/hg38.blacklist.ENCFF356LFX.bed.gz",
Expand All @@ -33,22 +77,23 @@ def get_excludes():
return excludes


def get_fai_df():
fai = get_fai()
def get_fai_df(wc):
fai = get_fai(wc)
return pd.read_csv(fai, sep="\t", names=["chr", "length", "x", "y", "z"])


def get_chroms():
def get_chroms_orig(wc):
global FIRST_REPORT
min_contig_length = config.get("min_contig_length", 0)
skipped_contigs = FAI_DF["chr"][FAI_DF["length"] < min_contig_length]
fai_df=get_fai_df(wc)
skipped_contigs = fai_df["chr"][fai_df["length"] < min_contig_length]
if len(skipped_contigs) > 0 and FIRST_REPORT:
print(
f"WARNING: Skipping contigs with length < {min_contig_length:,}: {skipped_contigs}",
file=sys.stderr,
)

chroms = FAI_DF["chr"][FAI_DF["length"] >= min_contig_length]
chroms = fai_df["chr"][fai_df["length"] >= min_contig_length]
chroms = sorted([chrom for chrom in chroms if "chrUn_" not in chrom])
chroms = [chrom for chrom in chroms if "_random" not in chrom]
chroms = [chrom for chrom in chroms if re.fullmatch(KEEP_CHRS, chrom)]
Expand All @@ -60,10 +105,115 @@ def get_chroms():
if len(chroms) == 0:
raise ValueError(
f"No chromosomes left after filtering. Check your keep_chromosomes parameter in config.yaml. "
f"Your fai file contains the following chromosomes: {FAI_DF['chr']}"
f"Your fai file contains the following chromosomes: {fai_df['chr']}"
)
return chroms

def get_chroms(wc):
global FIRST_REPORT
min_contig_length = config.get("min_contig_length", 0)
fai_df=get_fai_df(wc)
skipped_contigs = fai_df["chr"][fai_df["length"] < min_contig_length]
if len(skipped_contigs) > 0 and FIRST_REPORT:
print(
f"WARNING: Skipping contigs with length < {min_contig_length:,}: {skipped_contigs}",
file=sys.stderr,
)

bam_chr_list=[]
input_bam_path=get_input_bam(wc)
input_bam = pysam.AlignmentFile(input_bam_path, "rc", threads=MAX_THREADS)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

you don't need extra threads here, I would drop that. Also the "rc" part can be infered and not all inputs will be cram so we dont want it. basically delete the last two args.

bam_header_dict = input_bam.header.to_dict()

for line in bam_header_dict['SQ']:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The length of each chromosome is also available in the bam header. So we should use the bam header only instead of mixing fai_df and this method.

chr_name=line['SN']
bam_chr_list.append(chr_name)

input_bam.close()

chroms = fai_df["chr"][fai_df["length"] >= min_contig_length]
chroms = sorted([chrom for chrom in chroms if "chrUn_" not in chrom])
chroms = [chrom for chrom in chroms if "_random" not in chrom]
chroms = [chrom for chrom in chroms if re.fullmatch(KEEP_CHRS, chrom)]
chroms = [chrom for chrom in chroms if chrom in bam_chr_list]

if FIRST_REPORT:
FIRST_REPORT = False
print(f"INFO: Using N chromosomes: {len(chroms)}", file=sys.stderr)

if len(chroms) == 0:
raise ValueError(
f"No chromosomes left after filtering. Check your keep_chromosomes parameter in config.yaml. "
f"Your fai file contains the following chromosomes: {fai_df['chr']}"
)
return chroms

def get_chroms_first_element(wc):
global FIRST_REPORT
min_contig_length = config.get("min_contig_length", 0)
fai_df=get_fai_df(wc)
skipped_contigs = fai_df["chr"][fai_df["length"] < min_contig_length]
if len(skipped_contigs) > 0 and FIRST_REPORT:
print(
f"WARNING: Skipping contigs with length < {min_contig_length:,}: {skipped_contigs}",
file=sys.stderr,
)

bam_chr_list=[]
input_bam_path=get_input_bam(wc)
input_bam = pysam.AlignmentFile(input_bam_path, "rc", threads=MAX_THREADS)
bam_header_dict = input_bam.header.to_dict()

for line in bam_header_dict['SQ']:
chr_name=line['SN']
bam_chr_list.append(chr_name)

input_bam.close()

chroms = fai_df["chr"][fai_df["length"] >= min_contig_length]
chroms = sorted([chrom for chrom in chroms if "chrUn_" not in chrom])
chroms = [chrom for chrom in chroms if "_random" not in chrom]
chroms = [chrom for chrom in chroms if re.fullmatch(KEEP_CHRS, chrom)]
chroms = [chrom for chrom in chroms if chrom in bam_chr_list]

if FIRST_REPORT:
FIRST_REPORT = False
print(f"INFO: Using N chromosomes: {len(chroms)}", file=sys.stderr)

if len(chroms) == 0:
raise ValueError(
f"No chromosomes left after filtering. Check your keep_chromosomes parameter in config.yaml. "
f"Your fai file contains the following chromosomes: {fai_df['chr']}"
)
return chroms[0]

def get_chroms_first_element_orig(wc):
global FIRST_REPORT
min_contig_length = config.get("min_contig_length", 0)
fai_df=get_fai_df(wc)
skipped_contigs = fai_df["chr"][fai_df["length"] < min_contig_length]
if len(skipped_contigs) > 0 and FIRST_REPORT:
print(
f"WARNING: Skipping contigs with length < {min_contig_length:,}: {skipped_contigs}",
file=sys.stderr,
)

chroms = fai_df["chr"][fai_df["length"] >= min_contig_length]
chroms = sorted([chrom for chrom in chroms if "chrUn_" not in chrom])
chroms = [chrom for chrom in chroms if "_random" not in chrom]
chroms = [chrom for chrom in chroms if re.fullmatch(KEEP_CHRS, chrom)]

if FIRST_REPORT:
FIRST_REPORT = False
print(f"INFO: Using N chromosomes: {len(chroms)}", file=sys.stderr)

if len(chroms) == 0:
raise ValueError(
f"No chromosomes left after filtering. Check your keep_chromosomes parameter in config.yaml. "
f"Your fai file contains the following chromosomes: {fai_df['chr']}"
)
return chroms[0]


def get_manifest():
manifest = config.get("manifest")
Expand All @@ -80,6 +230,11 @@ def get_manifest():
def get_input_bam(wc):
return MANIFEST.loc[wc.sm, "bam"]

def get_input_ref(wc):
return MANIFEST.loc[wc.sm, "ref"]

def get_input_ref_name(wc):
return MANIFEST.loc[wc.sm, "ref_name"]

def get_mem_mb(wildcards, attempt):
if attempt < 3:
Expand Down
Loading
Loading