yttria-aniseia · yttria-aniseia · Feb 4, 2026 · Feb 4, 2026 · Feb 4, 2026 · Feb 4, 2026
diff --git a/.gitignore b/.gitignore
@@ -7,6 +7,7 @@ glnexus-test
 6tf_ipsc_wt
 all_Ngn2-KOLF
 all_KOLF21J
+26apr10_Ngn2-KOLF
 
 hifisomatic_resources
 hifi-wdl-resources-v*
@@ -18,6 +19,7 @@ wgs_backlog
 *.bam
 *.tbi
 *.bai
+*.fai
 *.gz
 *.bak
 *.gz

diff --git a/genbank_to_crispr_json.py b/genbank_to_crispr_json.py
@@ -223,6 +223,15 @@ def parse_genbank_to_crispr_json(genbank_path: str, edit_id: str,
     right_ha_seq = get_feature_sequence(right_ha_feature, record)
     payload_seq = get_feature_sequence(payload_feature, record)
 
+    # Extract donor context (e.g. plasmid backbone) flanking the HDR cassette.
+    # Taken linearly from the GenBank record: everything before left_ha and
+    # after right_ha. For circular plasmids exported with backbone at the
+    # record boundaries (the common case), this captures the full backbone.
+    lha_start = int(left_ha_feature.location.start)
+    rha_end = int(right_ha_feature.location.end)
+    donor_context_before = str(record.seq[:lha_start])
+    donor_context_after = str(record.seq[rha_end:])
+
     # Find payload components
     payload_start = int(payload_feature.location.start)
     payload_end = int(payload_feature.location.end)
@@ -263,6 +272,11 @@ def parse_genbank_to_crispr_json(genbank_path: str, edit_id: str,
         ]
     }
 
+    if donor_context_before:
+        crispr_json['edits'][0]['edit']['donor_context_before'] = donor_context_before
+    if donor_context_after:
+        crispr_json['edits'][0]['edit']['donor_context_after'] = donor_context_after
+
     # Add optional symbol if available
     if gene_info.get('symbol'):
         crispr_json['edits'][0]['target']['symbol'] = gene_info['symbol']
@@ -328,6 +342,11 @@ def main():
         print(f"  Gene: {crispr_json['edits'][0]['target']['symbol']}", file=sys.stderr)
     print(f"  Payload components: {len(crispr_json['edits'][0]['edit'].get('payload_components', []))}",
           file=sys.stderr)
+    edit_out = crispr_json['edits'][0]['edit']
+    if 'donor_context_before' in edit_out or 'donor_context_after' in edit_out:
+        print(f"  Donor context: {len(edit_out.get('donor_context_before', ''))} bp before, "
+              f"{len(edit_out.get('donor_context_after', ''))} bp after",
+              file=sys.stderr)
 
 
 if __name__ == '__main__':

diff --git a/scripts/process_input_config.py b/scripts/process_input_config.py
@@ -72,30 +72,38 @@ def main():
             all_exist = all(os.path.exists(hifi_read_path) for hifi_read_path in sample["hifi_reads"])
 
             if all_exist and should_copy_file(dest_path):
-                with tempfile.TemporaryDirectory() as tmpdirname:
-                    tmpdir_path = Path(tmpdirname)
-                    temp_path = tmpdir_path / f"{sample['sample_id']}_hifi_reads_temp.bam"
-                    print(f"Merging {len(sample['hifi_reads'])} HiFi read files -> {dest_path} (stripping tags: fi,ri,fp,rp,ip,pw,HP,PS,PC)")
-
-                    # Step 1: Concatenate BAM files
-                    print(f"  Step 1/2: Concatenating BAM files...")
-                    subprocess.run([
-                        "samtools", "cat",
-                        "-o", str(temp_path)
-                    ] + sample["hifi_reads"], check=True)
-
-                    # Step 2: Strip tags using samtools reset
-                    print(f"  Step 2/2: Stripping tags...")
-                    subprocess.run([
+                print(f"Merging {len(sample['hifi_reads'])} HiFi read files -> {dest_path} (stripping tags: fi,ri,fp,rp,ip,pw,HP,PS,PC)")
+
+                # Write to a local /tmp file first to avoid slow NFS streaming writes,
+                # then copy to NFS destination in one sequential transfer.
+                # samtools cat stdout is piped directly into samtools reset (no temp BAM on disk).
+                tmp_fd, tmp_out_str = tempfile.mkstemp(dir="/tmp", suffix=".bam")
+                os.close(tmp_fd)
+                tmp_out = Path(tmp_out_str)
+                try:
+                    # Step 1 + 2 (pipelined): cat BAMs from NFS | reset/strip tags -> local /tmp
+                    print(f"  Step 1/2: Streaming cat | reset to local /tmp...")
+                    cat_proc = subprocess.Popen(
+                        ["samtools", "cat"] + sample["hifi_reads"],
+                        stdout=subprocess.PIPE
+                    )
+                    reset_proc = subprocess.run([
                         "samtools", "reset",
                         "--thread", "15",
                         "--remove-tag", "fi,ri,fp,rp,ip,pw,HP,PS,PC",
-                        "-o", str(dest_path),
-                        str(temp_path)
-                    ], check=True)
-
-                    # Clean up temporary files
-                    temp_path.unlink()
+                        "-o", str(tmp_out),
+                        "-"
+                    ], stdin=cat_proc.stdout, check=True)
+                    cat_proc.stdout.close()
+                    cat_proc.wait()
+                    if cat_proc.returncode != 0:
+                        raise subprocess.CalledProcessError(cat_proc.returncode, "samtools cat")
+
+                    # Step 2/2: Copy finished BAM from local /tmp to NFS destination
+                    print(f"  Step 2/2: Copying local /tmp result to NFS destination...")
+                    shutil.copy2(str(tmp_out), str(dest_path))
+                finally:
+                    tmp_out.unlink(missing_ok=True)
 
                 new_sample["hifi_reads"] = [str(dest_path.absolute())]
             elif all_exist:

diff --git a/workflows/assembly/assembly_tasks.wdl b/workflows/assembly/assembly_tasks.wdl
@@ -223,8 +223,8 @@ task pav {
   >>>
 
   output {
-    File? pav_vcf = "pav_outputs_~{sub(sample_name, "\\.", "_")}/~{sub(sample_name, "\\.", "_")}.vcf.gz"
-    File? pav_vcf_index = "pav_outputs_~{sub(sample_name, "\\.", "_")}/~{sub(sample_name, "\\.", "_")}.vcf.gz.tbi"
+    File? pav_vcf = "pav_outputs_~{sample_name}/~{sub(sample_name, "\\.", "_")}.vcf.gz"
+    File? pav_vcf_index = "pav_outputs_~{sample_name}/~{sub(sample_name, "\\.", "_")}.vcf.gz.tbi"
   }
 
   runtime {

diff --git a/workflows/edit-qc/bcftools_aux.wdl b/workflows/edit-qc/bcftools_aux.wdl
@@ -116,8 +116,8 @@ task count_vcf_variants {
 
   runtime {
     docker: "~{runtime_attributes.container_registry}/pb_wdl_base@sha256:4b889a1f21a6a7fecf18820613cf610103966a93218de772caba126ab70a8e87"
-    cpu: 1
-    memory: "2 GB"
+    cpu: 2
+    memory: "8 GB"
     disk: disk_size + " GB"
     disks: "local-disk " + disk_size + " HDD"
     preemptible: runtime_attributes.preemptible_tries
@@ -162,8 +162,83 @@ task count_tsv_rows {
 
   runtime {
     docker: "~{runtime_attributes.container_registry}/pb_wdl_base@sha256:4b889a1f21a6a7fecf18820613cf610103966a93218de772caba126ab70a8e87"
-    cpu: 1
-    memory: "2 GB"
+    cpu: 2
+    memory: "8 GB"
+    disk: disk_size + " GB"
+    disks: "local-disk " + disk_size + " HDD"
+    preemptible: runtime_attributes.preemptible_tries
+    maxRetries: runtime_attributes.max_retries
+    awsBatchRetryAttempts: runtime_attributes.max_retries  # !UnknownRuntimeKey
+    zones: runtime_attributes.zones
+  }
+}
+
+task bcftools_qual_filter {
+  meta {
+    description: "Filter variants to FILTER=PASS. Small variants and sawfish SVs additionally require GQ>=20 and, if AD is present, VAF>0.1. PAV assembly SVs (identified by presence of INFO/COV_PROP) skip the GQ filter and instead require all haplotype COV_PROP values >0.1."
+  }
+
+  parameter_meta {
+    vcf: { name: "Input VCF" }
+    out_prefix: { name: "Output file prefix" }
+    runtime_attributes: { name: "Runtime attribute structure" }
+    filtered_vcf: { name: "Filtered VCF" }
+    filtered_vcf_index: { name: "Filtered VCF index" }
+  }
+
+  input {
+    File vcf
+    String out_prefix
+    RuntimeAttributes runtime_attributes
+  }
+
+  Int disk_size = ceil(size(vcf, "GB") * 2 + 10)
+
+  command <<<
+    set -euo pipefail
+
+    bcftools --version
+
+    # PAV assembly SVs carry INFO/COV_PROP typed as String in their VCF header.
+    # Reheader to Float (or add the tag as Float if absent) so bcftools can
+    # evaluate MIN(INFO/COV_PROP) numerically.  Records without the tag remain
+    # absent (compared as ".") in either case.
+    if bcftools view -h ~{vcf} | grep -q '##INFO=<ID=COV_PROP'; then
+      bcftools view -h ~{vcf} | \
+        sed 's|##INFO=<ID=COV_PROP,Number=\.,Type=String|##INFO=<ID=COV_PROP,Number=.,Type=Float|' \
+        > fixed_header.txt
+    else
+      bcftools view -h ~{vcf} | grep -v '^#CHROM' > fixed_header.txt
+      echo '##INFO=<ID=COV_PROP,Number=.,Type=Float,Description="Assembly haplotype coverage proportion (PAV)">' \
+        >> fixed_header.txt
+      bcftools view -h ~{vcf} | grep '^#CHROM' >> fixed_header.txt
+    fi
+    bcftools reheader -h fixed_header.txt -o reheaded.vcf.gz ~{vcf}
+    bcftools index --tbi reheaded.vcf.gz
+
+    # Unified filter:
+    #   Non-PAV (INFO/COV_PROP absent): require GQ>=20; if AD present, VAF>0.1.
+    #   PAV (INFO/COV_PROP present):    require MIN(COV_PROP)>0.1; no GQ check.
+    bcftools view \
+      --output-type z \
+      -i '(FILTER="PASS" || FILTER=".") && (
+        (INFO/COV_PROP="." && GQ>=20 && (AD[0:0]="." || (AD[0:0]+AD[0:1])=0 || AD[0:1]/(AD[0:0]+AD[0:1]) > 0.1)) ||
+        (INFO/COV_PROP!="." && MIN(INFO/COV_PROP) > 0.1)
+      )' \
+      reheaded.vcf.gz \
+      -o ~{out_prefix}.vcf.gz
+    bcftools index --tbi ~{out_prefix}.vcf.gz
+  >>>
+
+  output {
+    File filtered_vcf       = "~{out_prefix}.vcf.gz"
+    File filtered_vcf_index = "~{out_prefix}.vcf.gz.tbi"
+  }
+
+  runtime {
+    docker: "~{runtime_attributes.container_registry}/pb_wdl_base@sha256:4b889a1f21a6a7fecf18820613cf610103966a93218de772caba126ab70a8e87"
+    cpu: 4
+    memory: "8 GB"
     disk: disk_size + " GB"
     disks: "local-disk " + disk_size + " HDD"
     preemptible: runtime_attributes.preemptible_tries
@@ -182,6 +257,9 @@ task bcftools_merge_assembly_align {
     vcfs: {
       name: "VCFs"
     }
+    sample_id: {
+      name: "Sample ID to normalize VCF sample names to before merging"
+    }
     out_prefix: {
       name: "Output VCF name"
     }
@@ -190,14 +268,15 @@ task bcftools_merge_assembly_align {
     }
     merged_vcf: {
       name: "Merged VCF"
-    }  
+    }
     merged_vcf_index: {
       name: "Merged VCF index"
     }
   }
 
   input {
     Array[File] vcfs
+    String sample_id
 
     String out_prefix
 
@@ -210,9 +289,11 @@ task bcftools_merge_assembly_align {
 
   command <<<
     set -euo pipefail
+    echo "~{sample_id}" > sample_name.txt
     for vcf in ~{sep=' ' vcfs}; do
-        cp "$vcf" ./
-        bcftools index --tbi "$(basename $vcf)"
+        base="$(basename $vcf)"
+        bcftools reheader -s sample_name.txt "$vcf" -o "$base"
+        bcftools index --tbi "$base"
     done
 
     bcftools --version
@@ -247,4 +328,4 @@ task bcftools_merge_assembly_align {
     awsBatchRetryAttempts: runtime_attributes.max_retries  # !UnknownRuntimeKey
     zones: runtime_attributes.zones
   }
-}
+}