fiberseq · minkinaa · Jun 18, 2025 · Oct 28, 2025 · Oct 29, 2025 · Oct 29, 2025
diff --git a/pixi.lock b/pixi.lock
diff --git a/pixi.toml b/pixi.toml
@@ -65,5 +65,6 @@ snakefmt = "*"
 ruff = "*"
 awscli = "2.22.*"
 taplo = "*"
+pysam = "*"
 
 [pypi-dependencies]
diff --git a/workflow/Snakefile b/workflow/Snakefile
@@ -32,22 +32,30 @@ if config.get("full-version", False):
 MAX_THREADS = config.get("max_threads", 4)
 SORT_THREADS = config.get("sort_threads", 8)
 
+# sample, haplotype, and chromosome wildcard building
+#FAI_DF = get_fai_df()
+DEFAULT_ENV = config.get("env", "../envs/env.yaml")
+MANIFEST = get_manifest()
+MIN_FIRE_FDR = config.get("min_fire_fdr", 0.10)
+
 # reference genome and reference regions
-REF = get_ref()
-FAI = get_fai()
-REF_NAME = config["ref_name"]
-EXCLUDES = get_excludes()
+#REF = get_ref()
+#FAI = get_fai()
+#REF_NAME = config["ref_name"]
+#REF_NAME = get_ref_name()
+#EXCLUDES = get_excludes()
 KEEP_CHRS = config.get("keep_chromosomes", ".*")
+#FAI_DF = get_fai_df()
 
 # coverage requirements
 MIN_COVERAGE = config.get("min_coverage", 4)
 COVERAGE_WITHIN_N_SD = config.get("coverage_within_n_sd", 5)
 
 # sample, haplotype, and chromosome wildcard building
-FAI_DF = get_fai_df()
-DEFAULT_ENV = config.get("env", "../envs/env.yaml")
-MANIFEST = get_manifest()
-MIN_FIRE_FDR = config.get("min_fire_fdr", 0.10)
+#FAI_DF = get_fai_df()
+#DEFAULT_ENV = config.get("env", "../envs/env.yaml")
+#MANIFEST = get_manifest()
+#MIN_FIRE_FDR = config.get("min_fire_fdr", 0.10)
 
 # FDR / peak calling thresholds
 MAX_PEAK_FDR = config.get("max_peak_fdr", 0.05)
@@ -109,8 +117,62 @@ if DSA:
     include: "rules/levio.smk"
 
 
+## mod manifest to include reference
+if "ref" not in MANIFEST.columns:
+    if "ref" not in config or str(config.get("ref")) == "None":
+        raise ValueError("FIRE: ref parameter is missing in config.yaml")
+    else:
+        temp_ref = config.get("ref")
+        MANIFEST["ref"] = temp_ref
+    if "ref_name" not in config or str(config.get("ref_name")) == "None":
+        raise ValueError("FIRE: ref_name parameter is missing in config.yaml")
+    else:
+        temp_ref_name = config.get("ref_name")
+        MANIFEST["ref_name"] = temp_ref_name
+
+# make list of all chrs for 
+chroms_list_joined = ""
+
+for index, row in MANIFEST.iterrows():
+    first_report_var = True
+    min_contig_length = config.get("min_contig_length", 0)
+    fai = row['ref'] + ".fai"
+    fai_df = pd.read_csv(fai, sep="\t", names=["chr", "length", "x", "y", "z"])
+
+    skipped_contigs = fai_df["chr"][fai_df["length"] < min_contig_length]
+    if len(skipped_contigs) > 0 and first_report_var:
+        print(
+            f"WARNING: Skipping contigs with length < {min_contig_length:,}: {skipped_contigs}",
+            file=sys.stderr,
+        )
+
+    chroms = fai_df["chr"][fai_df["length"] >= min_contig_length]
+    chroms = sorted([chrom for chrom in chroms if "chrUn_" not in chrom])
+    chroms = [chrom for chrom in chroms if "_random" not in chrom]
+    chroms = [chrom for chrom in chroms if re.fullmatch(KEEP_CHRS, chrom)]
+
+    if first_report_var:
+        first_report_var = False
+        print(f"INFO: Using N chromosomes: {len(chroms)}", file=sys.stderr)
+
+    if len(chroms) == 0:
+        raise ValueError(
+            f"No chromosomes left after filtering. Check your keep_chromosomes parameter in config.yaml. "
+            f"Your fai file contains the following chromosomes: {fai_df['chr']}"
+        )
+    temp_chroms_list="|".join(chroms)
+    if chroms_list_joined == "":
+        chroms_list_joined = temp_chroms_list
+    else:
+        chroms_list_joined = chroms_list_joined + "|" + temp_chroms_list
+
+unique_chroms_list_joined = "|".join(set(chroms_list_joined.split("|")))
+
+#print(unique_chroms_list_joined)
+
 wildcard_constraints:
-    chrom="|".join(get_chroms()),
+    #chrom="|".join(get_chroms()),
+    chrom=unique_chroms_list_joined,
     call="|".join(["msp", "m6a"]),
     sm="|".join(MANIFEST.index),
     types="|".join(types),

diff --git a/workflow/envs/env.yaml b/workflow/envs/env.yaml
@@ -14,3 +14,4 @@ dependencies:
   - csvtk
   - mosdepth==0.3.7
   - bioconda::bigtools==0.5.5
+ #- pysam==0.22
diff --git a/workflow/envs/python.yaml b/workflow/envs/python.yaml
@@ -12,6 +12,6 @@ dependencies:
   - pip
   - pip:
       - polars-lts-cpu[pyarrow]==1.7.1
+#- pysam==0.22
 #- tqdm
 #- numba::numba==0.60
-#- pysam==0.21
diff --git a/workflow/envs/runner.yaml b/workflow/envs/runner.yaml
@@ -8,4 +8,4 @@ dependencies:
 #- tqdm
 #- pip
 #- pip:
-#- pysam==0.22.1
+  - pysam==0.22.1
diff --git a/workflow/rules/apply-model.smk b/workflow/rules/apply-model.smk
@@ -4,7 +4,8 @@
 rule fire:
     input:
         bam=ancient(get_input_bam),
-        ref=ancient(REF),
+        ref=ancient(get_ref)
+        #ref=ancient(REF),
     output:
         cram="results/{sm}/{sm}-fire-{v}-filtered.cram",
         crai="results/{sm}/{sm}-fire-{v}-filtered.cram.crai",
@@ -75,7 +76,7 @@ rule fire_sites_chrom:
 rule fire_sites:
     input:
         beds=expand(
-            rules.fire_sites_chrom.output.bed, chrom=get_chroms(), allow_missing=True
+            rules.fire_sites_chrom.output.bed, chrom=get_chroms, allow_missing=True
         ),
     output:
         bed="results/{sm}/additional-outputs-{v}/fire-peaks/{sm}-{v}-fire-elements.bed.gz",

diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk
@@ -1,29 +1,73 @@
 import re
 import logging
 import sys
+import pysam
 
 FIRST_REPORT = True
 
 
-def get_ref():
+def get_ref_orig():
     if "ref" not in config:
         raise ValueError("FIRE: ref parameter is missing in config.yaml")
     ref = config["ref"]
     if not os.path.exists(ref):
         raise ValueError(f"FIRE: reference file {ref} does not exist")
     return os.path.abspath(ref)
 
+def get_ref_old(wc):
+    if "ref" in config:
+        ref = config["ref"]
+    if "ref" not in config:
+        if "ref" in MANIFEST.columns:
+            ref = get_input_ref()
+        else: 
+            raise ValueError("FIRE: ref parameter is missing in config.yaml and no ref column in manifest")
+    #ref = config["ref"]
+    if not os.path.exists(ref):
+        raise ValueError(f"FIRE: reference file {ref} does not exist")
+    return os.path.abspath(ref)
 
-def get_fai():
-    fai = f"{get_ref()}.fai"
+def get_ref(wc):
+    ref = MANIFEST.loc[wc.sm, "ref"]
+    if not os.path.exists(ref):
+        raise ValueError(f"FIRE: reference file {ref} does not exist")
+    return os.path.abspath(ref)
+
+def get_ref_name_old(wc):
+    if "ref_name" in config:
+        ref_name=config["ref_name"]
+    else:
+        if "ref_name" in MANIFEST.columns:
+            ref_name= get_input_ref_name()
+        else:
+            raise ValueError("FIRE: ref_name parameter is missing in config.yaml and no ref_name column in manifest")
+    #ref=get_ref()
+    #temp_split=ref.strip().split('/')[-1]
+    #fa_split=temp_split.split('.fa')
+    #ref_name = '.fa'.join(fa_split[:-1])
+    return(ref_name)
+
+def get_ref_name(wc):
+    return MANIFEST.loc[wc.sm, "ref_name"]
+
+def get_fai_orig(wc):
+    fai = f"{get_ref(wc)}.fai"
     if not os.path.exists(fai):
         raise ValueError(f"FIRE: reference index file {fai} does not exist")
     return fai
 
+def get_fai(wc):
+    ref = MANIFEST.loc[wc.sm, "ref"]
+    #fai= ref + ".fai"
+    fai = str(ref) + ".fai"
+    if not os.path.exists(fai):
+        raise ValueError(f"FIRE: reference index file {fai} does not exist")
+    return fai
 
-def get_excludes():
+def get_excludes(wc):
     excludes = config.get("excludes", [])
-    if REF_NAME == "hg38" or REF_NAME == "GRCh38":
+    ref_name = get_ref_name(wc)
+    if ref_name == "hg38" or ref_name == "GRCh38":
         files = [
             "../annotations/hg38.gap.bed.gz",
             "../annotations/hg38.blacklist.ENCFF356LFX.bed.gz",
@@ -33,22 +77,23 @@ def get_excludes():
     return excludes
 
 
-def get_fai_df():
-    fai = get_fai()
+def get_fai_df(wc):
+    fai = get_fai(wc)
     return pd.read_csv(fai, sep="\t", names=["chr", "length", "x", "y", "z"])
 
 
-def get_chroms():
+def get_chroms_orig(wc):
     global FIRST_REPORT
     min_contig_length = config.get("min_contig_length", 0)
-    skipped_contigs = FAI_DF["chr"][FAI_DF["length"] < min_contig_length]
+    fai_df=get_fai_df(wc)
+    skipped_contigs = fai_df["chr"][fai_df["length"] < min_contig_length]
     if len(skipped_contigs) > 0 and FIRST_REPORT:
         print(
             f"WARNING: Skipping contigs with length < {min_contig_length:,}: {skipped_contigs}",
             file=sys.stderr,
         )
 
-    chroms = FAI_DF["chr"][FAI_DF["length"] >= min_contig_length]
+    chroms = fai_df["chr"][fai_df["length"] >= min_contig_length]
     chroms = sorted([chrom for chrom in chroms if "chrUn_" not in chrom])
     chroms = [chrom for chrom in chroms if "_random" not in chrom]
     chroms = [chrom for chrom in chroms if re.fullmatch(KEEP_CHRS, chrom)]
@@ -60,10 +105,115 @@ def get_chroms():
     if len(chroms) == 0:
         raise ValueError(
             f"No chromosomes left after filtering. Check your keep_chromosomes parameter in config.yaml. "
-            f"Your fai file contains the following chromosomes: {FAI_DF['chr']}"
+            f"Your fai file contains the following chromosomes: {fai_df['chr']}"
         )
     return chroms
 
+def get_chroms(wc):
+    global FIRST_REPORT
+    min_contig_length = config.get("min_contig_length", 0)
+    fai_df=get_fai_df(wc)
+    skipped_contigs = fai_df["chr"][fai_df["length"] < min_contig_length]
+    if len(skipped_contigs) > 0 and FIRST_REPORT:
+        print(
+            f"WARNING: Skipping contigs with length < {min_contig_length:,}: {skipped_contigs}",
+            file=sys.stderr,
+        )
+
+    bam_chr_list=[]
+    input_bam_path=get_input_bam(wc)
+    input_bam = pysam.AlignmentFile(input_bam_path, "rc", threads=MAX_THREADS)
+    bam_header_dict = input_bam.header.to_dict()
+
+    for line in bam_header_dict['SQ']:
+        chr_name=line['SN']
+        bam_chr_list.append(chr_name)
+
+    input_bam.close()
+
+    chroms = fai_df["chr"][fai_df["length"] >= min_contig_length]
+    chroms = sorted([chrom for chrom in chroms if "chrUn_" not in chrom])
+    chroms = [chrom for chrom in chroms if "_random" not in chrom]
+    chroms = [chrom for chrom in chroms if re.fullmatch(KEEP_CHRS, chrom)]
+    chroms = [chrom for chrom in chroms if chrom in bam_chr_list]
+
+    if FIRST_REPORT:
+        FIRST_REPORT = False
+        print(f"INFO: Using N chromosomes: {len(chroms)}", file=sys.stderr)
+
+    if len(chroms) == 0:
+        raise ValueError(
+            f"No chromosomes left after filtering. Check your keep_chromosomes parameter in config.yaml. "
+            f"Your fai file contains the following chromosomes: {fai_df['chr']}"
+        )
+    return chroms
+
+def get_chroms_first_element(wc):
+    global FIRST_REPORT
+    min_contig_length = config.get("min_contig_length", 0)
+    fai_df=get_fai_df(wc)
+    skipped_contigs = fai_df["chr"][fai_df["length"] < min_contig_length]
+    if len(skipped_contigs) > 0 and FIRST_REPORT:
+        print(
+            f"WARNING: Skipping contigs with length < {min_contig_length:,}: {skipped_contigs}",
+            file=sys.stderr,
+        )
+
+    bam_chr_list=[]
+    input_bam_path=get_input_bam(wc)
+    input_bam = pysam.AlignmentFile(input_bam_path, "rc", threads=MAX_THREADS)
+    bam_header_dict = input_bam.header.to_dict()
+
+    for line in bam_header_dict['SQ']:
+        chr_name=line['SN']
+        bam_chr_list.append(chr_name)
+
+    input_bam.close()
+
+    chroms = fai_df["chr"][fai_df["length"] >= min_contig_length]
+    chroms = sorted([chrom for chrom in chroms if "chrUn_" not in chrom])
+    chroms = [chrom for chrom in chroms if "_random" not in chrom]
+    chroms = [chrom for chrom in chroms if re.fullmatch(KEEP_CHRS, chrom)]
+    chroms = [chrom for chrom in chroms if chrom in bam_chr_list]
+
+    if FIRST_REPORT:
+        FIRST_REPORT = False
+        print(f"INFO: Using N chromosomes: {len(chroms)}", file=sys.stderr)
+
+    if len(chroms) == 0:
+        raise ValueError(
+            f"No chromosomes left after filtering. Check your keep_chromosomes parameter in config.yaml. "
+            f"Your fai file contains the following chromosomes: {fai_df['chr']}"
+        )
+    return chroms[0]
+
+def get_chroms_first_element_orig(wc):
+    global FIRST_REPORT
+    min_contig_length = config.get("min_contig_length", 0)
+    fai_df=get_fai_df(wc)
+    skipped_contigs = fai_df["chr"][fai_df["length"] < min_contig_length]
+    if len(skipped_contigs) > 0 and FIRST_REPORT:
+        print(
+            f"WARNING: Skipping contigs with length < {min_contig_length:,}: {skipped_contigs}",
+            file=sys.stderr,
+        )
+
+    chroms = fai_df["chr"][fai_df["length"] >= min_contig_length]
+    chroms = sorted([chrom for chrom in chroms if "chrUn_" not in chrom])
+    chroms = [chrom for chrom in chroms if "_random" not in chrom]
+    chroms = [chrom for chrom in chroms if re.fullmatch(KEEP_CHRS, chrom)]
+
+    if FIRST_REPORT:
+        FIRST_REPORT = False
+        print(f"INFO: Using N chromosomes: {len(chroms)}", file=sys.stderr)
+
+    if len(chroms) == 0:
+        raise ValueError(
+            f"No chromosomes left after filtering. Check your keep_chromosomes parameter in config.yaml. "
+            f"Your fai file contains the following chromosomes: {fai_df['chr']}"
+        )
+    return chroms[0]
+
 
 def get_manifest():
     manifest = config.get("manifest")
@@ -80,6 +230,11 @@ def get_manifest():
 def get_input_bam(wc):
     return MANIFEST.loc[wc.sm, "bam"]
 
+def get_input_ref(wc):
+    return MANIFEST.loc[wc.sm, "ref"]
+
+def get_input_ref_name(wc):
+    return MANIFEST.loc[wc.sm, "ref_name"]
 
 def get_mem_mb(wildcards, attempt):
     if attempt < 3:
-Original file line number
+Diff line change
@@ Expand Up / @@ -8,4 +8,4 @@ dependencies: @@
     #- tqdm
     #- pip
     #- pip:
-    #- pysam==0.22.1
+      - pysam==0.22.1