diff --git a/.gitignore b/.gitignore index 0233c583..ae94401e 100644 --- a/.gitignore +++ b/.gitignore @@ -7,6 +7,7 @@ glnexus-test 6tf_ipsc_wt all_Ngn2-KOLF all_KOLF21J +26apr10_Ngn2-KOLF hifisomatic_resources hifi-wdl-resources-v* @@ -18,6 +19,7 @@ wgs_backlog *.bam *.tbi *.bai +*.fai *.gz *.bak *.gz diff --git a/genbank_to_crispr_json.py b/genbank_to_crispr_json.py index 2c6ec7a3..b4006108 100755 --- a/genbank_to_crispr_json.py +++ b/genbank_to_crispr_json.py @@ -223,6 +223,15 @@ def parse_genbank_to_crispr_json(genbank_path: str, edit_id: str, right_ha_seq = get_feature_sequence(right_ha_feature, record) payload_seq = get_feature_sequence(payload_feature, record) + # Extract donor context (e.g. plasmid backbone) flanking the HDR cassette. + # Taken linearly from the GenBank record: everything before left_ha and + # after right_ha. For circular plasmids exported with backbone at the + # record boundaries (the common case), this captures the full backbone. + lha_start = int(left_ha_feature.location.start) + rha_end = int(right_ha_feature.location.end) + donor_context_before = str(record.seq[:lha_start]) + donor_context_after = str(record.seq[rha_end:]) + # Find payload components payload_start = int(payload_feature.location.start) payload_end = int(payload_feature.location.end) @@ -263,6 +272,11 @@ def parse_genbank_to_crispr_json(genbank_path: str, edit_id: str, ] } + if donor_context_before: + crispr_json['edits'][0]['edit']['donor_context_before'] = donor_context_before + if donor_context_after: + crispr_json['edits'][0]['edit']['donor_context_after'] = donor_context_after + # Add optional symbol if available if gene_info.get('symbol'): crispr_json['edits'][0]['target']['symbol'] = gene_info['symbol'] @@ -328,6 +342,11 @@ def main(): print(f" Gene: {crispr_json['edits'][0]['target']['symbol']}", file=sys.stderr) print(f" Payload components: {len(crispr_json['edits'][0]['edit'].get('payload_components', []))}", file=sys.stderr) + edit_out = crispr_json['edits'][0]['edit'] + if 'donor_context_before' in edit_out or 'donor_context_after' in edit_out: + print(f" Donor context: {len(edit_out.get('donor_context_before', ''))} bp before, " + f"{len(edit_out.get('donor_context_after', ''))} bp after", + file=sys.stderr) if __name__ == '__main__': diff --git a/scripts/process_input_config.py b/scripts/process_input_config.py index da1ef136..cb906457 100755 --- a/scripts/process_input_config.py +++ b/scripts/process_input_config.py @@ -72,30 +72,38 @@ def main(): all_exist = all(os.path.exists(hifi_read_path) for hifi_read_path in sample["hifi_reads"]) if all_exist and should_copy_file(dest_path): - with tempfile.TemporaryDirectory() as tmpdirname: - tmpdir_path = Path(tmpdirname) - temp_path = tmpdir_path / f"{sample['sample_id']}_hifi_reads_temp.bam" - print(f"Merging {len(sample['hifi_reads'])} HiFi read files -> {dest_path} (stripping tags: fi,ri,fp,rp,ip,pw,HP,PS,PC)") - - # Step 1: Concatenate BAM files - print(f" Step 1/2: Concatenating BAM files...") - subprocess.run([ - "samtools", "cat", - "-o", str(temp_path) - ] + sample["hifi_reads"], check=True) - - # Step 2: Strip tags using samtools reset - print(f" Step 2/2: Stripping tags...") - subprocess.run([ + print(f"Merging {len(sample['hifi_reads'])} HiFi read files -> {dest_path} (stripping tags: fi,ri,fp,rp,ip,pw,HP,PS,PC)") + + # Write to a local /tmp file first to avoid slow NFS streaming writes, + # then copy to NFS destination in one sequential transfer. 
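# Annotation (not part of the patch): a minimal, standalone sketch of the /tmp
# staging pattern described in this comment, assuming the same samtools tools
# and tag list used below; the helper name merge_strip_stage is hypothetical.
import shutil, subprocess, tempfile
from pathlib import Path

def merge_strip_stage(hifi_reads, dest_path, tags="fi,ri,fp,rp,ip,pw,HP,PS,PC"):
    with tempfile.NamedTemporaryFile(dir="/tmp", suffix=".bam", delete=False) as tmp:
        local_bam = Path(tmp.name)
    try:
        # Stream the concatenated BAMs straight into samtools reset; the
        # stripped output lands on fast local disk, not NFS.
        cat = subprocess.Popen(["samtools", "cat"] + list(hifi_reads),
                               stdout=subprocess.PIPE)
        subprocess.run(["samtools", "reset", "--remove-tag", tags,
                        "-o", str(local_bam), "-"],
                       stdin=cat.stdout, check=True)
        cat.stdout.close()
        if cat.wait() != 0:
            raise subprocess.CalledProcessError(cat.returncode, "samtools cat")
        shutil.copy2(local_bam, dest_path)   # one sequential write to NFS
    finally:
        local_bam.unlink(missing_ok=True)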
+ # samtools cat stdout is piped directly into samtools reset (no temp BAM on disk). + tmp_fd, tmp_out_str = tempfile.mkstemp(dir="/tmp", suffix=".bam") + os.close(tmp_fd) + tmp_out = Path(tmp_out_str) + try: + # Step 1 + 2 (pipelined): cat BAMs from NFS | reset/strip tags -> local /tmp + print(f" Step 1/2: Streaming cat | reset to local /tmp...") + cat_proc = subprocess.Popen( + ["samtools", "cat"] + sample["hifi_reads"], + stdout=subprocess.PIPE + ) + reset_proc = subprocess.run([ "samtools", "reset", "--thread", "15", "--remove-tag", "fi,ri,fp,rp,ip,pw,HP,PS,PC", - "-o", str(dest_path), - str(temp_path) - ], check=True) - - # Clean up temporary files - temp_path.unlink() + "-o", str(tmp_out), + "-" + ], stdin=cat_proc.stdout, check=True) + cat_proc.stdout.close() + cat_proc.wait() + if cat_proc.returncode != 0: + raise subprocess.CalledProcessError(cat_proc.returncode, "samtools cat") + + # Step 2/2: Copy finished BAM from local /tmp to NFS destination + print(f" Step 2/2: Copying local /tmp result to NFS destination...") + shutil.copy2(str(tmp_out), str(dest_path)) + finally: + tmp_out.unlink(missing_ok=True) new_sample["hifi_reads"] = [str(dest_path.absolute())] elif all_exist: diff --git a/workflows/assembly/assembly_tasks.wdl b/workflows/assembly/assembly_tasks.wdl index 2e5fcb52..e6fde86b 100755 --- a/workflows/assembly/assembly_tasks.wdl +++ b/workflows/assembly/assembly_tasks.wdl @@ -223,8 +223,8 @@ task pav { >>> output { - File? pav_vcf = "pav_outputs_~{sub(sample_name, "\\.", "_")}/~{sub(sample_name, "\\.", "_")}.vcf.gz" - File? pav_vcf_index = "pav_outputs_~{sub(sample_name, "\\.", "_")}/~{sub(sample_name, "\\.", "_")}.vcf.gz.tbi" + File? pav_vcf = "pav_outputs_~{sample_name}/~{sub(sample_name, "\\.", "_")}.vcf.gz" + File? pav_vcf_index = "pav_outputs_~{sample_name}/~{sub(sample_name, "\\.", "_")}.vcf.gz.tbi" } runtime { diff --git a/workflows/edit-qc/bcftools_aux.wdl b/workflows/edit-qc/bcftools_aux.wdl index 07911cbc..a1ad2fa9 100644 --- a/workflows/edit-qc/bcftools_aux.wdl +++ b/workflows/edit-qc/bcftools_aux.wdl @@ -116,8 +116,8 @@ task count_vcf_variants { runtime { docker: "~{runtime_attributes.container_registry}/pb_wdl_base@sha256:4b889a1f21a6a7fecf18820613cf610103966a93218de772caba126ab70a8e87" - cpu: 1 - memory: "2 GB" + cpu: 2 + memory: "8 GB" disk: disk_size + " GB" disks: "local-disk " + disk_size + " HDD" preemptible: runtime_attributes.preemptible_tries @@ -162,8 +162,83 @@ task count_tsv_rows { runtime { docker: "~{runtime_attributes.container_registry}/pb_wdl_base@sha256:4b889a1f21a6a7fecf18820613cf610103966a93218de772caba126ab70a8e87" - cpu: 1 - memory: "2 GB" + cpu: 2 + memory: "8 GB" + disk: disk_size + " GB" + disks: "local-disk " + disk_size + " HDD" + preemptible: runtime_attributes.preemptible_tries + maxRetries: runtime_attributes.max_retries + awsBatchRetryAttempts: runtime_attributes.max_retries # !UnknownRuntimeKey + zones: runtime_attributes.zones + } +} + +task bcftools_qual_filter { + meta { + description: "Filter variants to FILTER=PASS. Small variants and sawfish SVs additionally require GQ>=20 and, if AD is present, VAF>0.1. PAV assembly SVs (identified by presence of INFO/COV_PROP) skip the GQ filter and instead require all haplotype COV_PROP values >0.1." 
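# Annotation (not part of the task): a rough pysam restatement of the filter
# rule stated in the description above, for readability only. The task itself
# applies the equivalent bcftools -i expression shown below; this sketch
# assumes a single-sample VCF and the usual GQ/AD FORMAT fields.
import pysam

def keep_variant(rec: pysam.VariantRecord,
                 min_gq=20, min_vaf=0.1, min_cov_prop=0.1) -> bool:
    if list(rec.filter.keys()) not in ([], ["PASS"]):   # FILTER must be PASS or "."
        return False
    cov_prop = rec.info.get("COV_PROP")                  # present only on PAV assembly SVs
    if cov_prop is not None:
        vals = cov_prop if isinstance(cov_prop, tuple) else (cov_prop,)
        return min(float(v) for v in vals) > min_cov_prop   # all haplotypes > 0.1
    smp = rec.samples[0]
    gq = smp.get("GQ")
    if gq is None or gq < min_gq:                         # small variants / sawfish SVs
        return False
    ad = smp.get("AD")
    if ad and ad[0] is not None and sum(ad[:2]) > 0:
        return ad[1] / sum(ad[:2]) > min_vaf              # VAF check only if AD usable
    return True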
+ } + + parameter_meta { + vcf: { name: "Input VCF" } + out_prefix: { name: "Output file prefix" } + runtime_attributes: { name: "Runtime attribute structure" } + filtered_vcf: { name: "Filtered VCF" } + filtered_vcf_index: { name: "Filtered VCF index" } + } + + input { + File vcf + String out_prefix + RuntimeAttributes runtime_attributes + } + + Int disk_size = ceil(size(vcf, "GB") * 2 + 10) + + command <<< + set -euo pipefail + + bcftools --version + + # PAV assembly SVs carry INFO/COV_PROP typed as String in their VCF header. + # Reheader to Float (or add the tag as Float if absent) so bcftools can + # evaluate MIN(INFO/COV_PROP) numerically. Records without the tag remain + # absent (compared as ".") in either case. + if bcftools view -h ~{vcf} | grep -q '##INFO= fixed_header.txt + else + bcftools view -h ~{vcf} | grep -v '^#CHROM' > fixed_header.txt + echo '##INFO=' \ + >> fixed_header.txt + bcftools view -h ~{vcf} | grep '^#CHROM' >> fixed_header.txt + fi + bcftools reheader -h fixed_header.txt -o reheaded.vcf.gz ~{vcf} + bcftools index --tbi reheaded.vcf.gz + + # Unified filter: + # Non-PAV (INFO/COV_PROP absent): require GQ>=20; if AD present, VAF>0.1. + # PAV (INFO/COV_PROP present): require MIN(COV_PROP)>0.1; no GQ check. + bcftools view \ + --output-type z \ + -i '(FILTER="PASS" || FILTER=".") && ( + (INFO/COV_PROP="." && GQ>=20 && (AD[0:0]="." || (AD[0:0]+AD[0:1])=0 || AD[0:1]/(AD[0:0]+AD[0:1]) > 0.1)) || + (INFO/COV_PROP!="." && MIN(INFO/COV_PROP) > 0.1) + )' \ + reheaded.vcf.gz \ + -o ~{out_prefix}.vcf.gz + bcftools index --tbi ~{out_prefix}.vcf.gz + >>> + + output { + File filtered_vcf = "~{out_prefix}.vcf.gz" + File filtered_vcf_index = "~{out_prefix}.vcf.gz.tbi" + } + + runtime { + docker: "~{runtime_attributes.container_registry}/pb_wdl_base@sha256:4b889a1f21a6a7fecf18820613cf610103966a93218de772caba126ab70a8e87" + cpu: 4 + memory: "8 GB" disk: disk_size + " GB" disks: "local-disk " + disk_size + " HDD" preemptible: runtime_attributes.preemptible_tries @@ -182,6 +257,9 @@ task bcftools_merge_assembly_align { vcfs: { name: "VCFs" } + sample_id: { + name: "Sample ID to normalize VCF sample names to before merging" + } out_prefix: { name: "Output VCF name" } @@ -190,7 +268,7 @@ task bcftools_merge_assembly_align { } merged_vcf: { name: "Merged VCF" - } + } merged_vcf_index: { name: "Merged VCF index" } @@ -198,6 +276,7 @@ task bcftools_merge_assembly_align { input { Array[File] vcfs + String sample_id String out_prefix @@ -210,9 +289,11 @@ task bcftools_merge_assembly_align { command <<< set -euo pipefail + echo "~{sample_id}" > sample_name.txt for vcf in ~{sep=' ' vcfs}; do - cp "$vcf" ./ - bcftools index --tbi "$(basename $vcf)" + base="$(basename $vcf)" + bcftools reheader -s sample_name.txt "$vcf" -o "$base" + bcftools index --tbi "$base" done bcftools --version @@ -247,4 +328,4 @@ task bcftools_merge_assembly_align { awsBatchRetryAttempts: runtime_attributes.max_retries # !UnknownRuntimeKey zones: runtime_attributes.zones } -} \ No newline at end of file +} diff --git a/workflows/edit-qc/crispr_consensus_tasks.wdl b/workflows/edit-qc/crispr_consensus_tasks.wdl index c7b06662..e45c58b3 100644 --- a/workflows/edit-qc/crispr_consensus_tasks.wdl +++ b/workflows/edit-qc/crispr_consensus_tasks.wdl @@ -2,531 +2,20 @@ version 1.1 import "../wdl-common/wdl/structs.wdl" -task create_sample_reference_consensus { - meta { - description: "Create sample-specific reference consensus for edit region using bcftools consensus with sample variants" - } - - parameter_meta { - ref_fasta: 
"Reference genome FASTA" - ref_fasta_index: "Reference genome FASTA index" - small_variant_vcf: "Sample small variant VCF (phased)" - small_variant_vcf_index: "Index for small variant VCF" - sv_vcf: "Sample structural variant VCF (phased)" - sv_vcf_index: "Index for structural variant VCF" - crispr_edit_json: "JSON file describing CRISPR edit with target region" - } - - input { - File ref_fasta - File ref_fasta_index - File small_variant_vcf - File small_variant_vcf_index - File sv_vcf - File sv_vcf_index - File crispr_edit_json - String sample_id - RuntimeAttributes runtime_attributes - } - - String out_prefix = "~{sample_id}_sample_ref" - Int padding = 5000 # Add padding around edit region for better alignment context - - command <<< - set -euxo pipefail - - python3 <<'EOF' -import json - -# Read CRISPR edit JSON to get target region -with open("~{crispr_edit_json}", 'r') as f: - crispr_data = json.load(f) - -# Get first edit's target region -edit = crispr_data['edits'][0] -chr = edit['target']['chr'] -start = max(1, edit['target']['start'] - ~{padding}) # Add padding but don't go below 1 -end = edit['target']['end'] + ~{padding} -symbol = edit['target']['symbol'] - -# Write region info for bash -with open('region.txt', 'w') as f: - f.write(f"{chr}:{start}-{end}\n") -with open('symbol.txt', 'w') as f: - f.write(f"{symbol}\n") -with open('chr.txt', 'w') as f: - f.write(f"{chr}\n") -EOF - - REGION=$(cat region.txt) - SYMBOL=$(cat symbol.txt) - CHR=$(cat chr.txt) - - echo "Creating sample-specific reference for region: ${REGION}" - - # Extract region from reference genome - samtools faidx ~{ref_fasta} "${REGION}" > region_ref.fasta - - # Index the small variant VCF if needed and extract region - tabix -p vcf ~{small_variant_vcf} || true - bcftools view -r "${REGION}" ~{small_variant_vcf} -O z -o small_variants_region.vcf.gz - tabix -p vcf small_variants_region.vcf.gz - - # Index the SV VCF if needed and extract region - tabix -p vcf ~{sv_vcf} || true - bcftools view -r "${REGION}" ~{sv_vcf} -O z -o sv_region.vcf.gz - tabix -p vcf sv_region.vcf.gz - - # Merge small variants and SVs - bcftools concat -a -D small_variants_region.vcf.gz sv_region.vcf.gz -O z -o merged_variants.vcf.gz - tabix -p vcf merged_variants.vcf.gz - - # Filter out symbolic alleles (except , <*>, which are supported) - # This removes alleles like , , , , etc. 
- bcftools view -e 'ALT~"" || ALT~"" || ALT~"" || ALT~""' \ - merged_variants.vcf.gz -O z -o merged_variants_filtered.vcf.gz - tabix -p vcf merged_variants_filtered.vcf.gz - - echo "Filtered VCF stats:" - echo " Total variants: $(bcftools view -H merged_variants.vcf.gz | wc -l)" - echo " After filtering: $(bcftools view -H merged_variants_filtered.vcf.gz | wc -l)" - - # Apply variants to reference using bcftools consensus - # This creates a sample-specific reference that includes the variants - cat region_ref.fasta | bcftools consensus merged_variants_filtered.vcf.gz > sample_ref_consensus.fasta - - # Rename header to include sample info - python3 <<'EOF' -symbol = open('symbol.txt').read().strip() -region = open('region.txt').read().strip() - -with open('sample_ref_consensus.fasta', 'r') as f: - lines = f.readlines() - -# Replace header -with open('~{out_prefix}_consensus.fasta', 'w') as out: - if lines: - out.write(f">{symbol}_sample_ref_{region}\n") - for line in lines[1:]: - out.write(line) -EOF - - # Index the consensus - samtools faidx ~{out_prefix}_consensus.fasta - - echo "Sample-specific reference consensus created for ${SYMBOL}" - >>> - - output { - File sample_ref_consensus_fasta = "~{out_prefix}_consensus.fasta" - File sample_ref_consensus_fasta_index = "~{out_prefix}_consensus.fasta.fai" - } - - runtime { - docker: "~{runtime_attributes.container_registry}/pb_wdl_base@sha256:4b889a1f21a6a7fecf18820613cf610103966a93218de772caba126ab70a8e87" - cpu: 2 - memory: "8 GB" - disk: "20 GB" - preemptible: runtime_attributes.preemptible_tries - maxRetries: runtime_attributes.max_retries - awsBatchRetryAttempts: runtime_attributes.max_retries # !UnknownRuntimeKey - zones: runtime_attributes.zones - } -} - -task extract_region_reads { - meta { - description: "Extract aligned reads from edit region preserving haplotype tags" - } - - parameter_meta { - haplotagged_bam: "Haplotagged BAM file from phasing" - haplotagged_bam_index: "Index for haplotagged BAM" - crispr_edit_json: "JSON file describing CRISPR edit with target region" - } - - input { - File haplotagged_bam - File haplotagged_bam_index - File crispr_edit_json - String sample_id - Int threads = 4 - RuntimeAttributes runtime_attributes - } - - String out_prefix = "~{sample_id}_region_reads" - Int padding = 5000 # Must match padding in create_sample_reference_consensus - - command <<< - set -euxo pipefail - - python3 <<'EOF' -import json - -# Read CRISPR edit JSON to get target region -with open("~{crispr_edit_json}", 'r') as f: - crispr_data = json.load(f) - -# Get first edit's target region -edit = crispr_data['edits'][0] -chr = edit['target']['chr'] -start = max(1, edit['target']['start'] - ~{padding}) -end = edit['target']['end'] + ~{padding} -symbol = edit['target']['symbol'] - -# Write region info for bash -with open('region.txt', 'w') as f: - f.write(f"{chr}:{start}-{end}\n") -with open('symbol.txt', 'w') as f: - f.write(f"{symbol}\n") -EOF - - REGION=$(cat region.txt) - SYMBOL=$(cat symbol.txt) - - echo "Extracting reads from region: ${REGION}" - - # Extract reads from the edit region (with haplotype tags preserved) - samtools view -h -@ ~{threads} ~{haplotagged_bam} "${REGION}" | \ - samtools fastq -T HP,PS - | \ - gzip > ~{out_prefix}.fastq.gz - - # Count extracted reads - READ_COUNT=$(zcat ~{out_prefix}.fastq.gz | grep -c "^@" || echo 0) - echo "Extracted ${READ_COUNT} reads from ${REGION}" - >>> - - output { - File region_reads_fastq = "~{out_prefix}.fastq.gz" - } - - runtime { - docker: 
"~{runtime_attributes.container_registry}/pb_wdl_base@sha256:4b889a1f21a6a7fecf18820613cf610103966a93218de772caba126ab70a8e87" - cpu: threads - memory: "8 GB" - disk: "50 GB" - preemptible: runtime_attributes.preemptible_tries - maxRetries: runtime_attributes.max_retries - awsBatchRetryAttempts: runtime_attributes.max_retries # !UnknownRuntimeKey - zones: runtime_attributes.zones - } -} - -task minimap2_align_to_sample_reference { - meta { - description: "Align extracted reads to sample-specific reference using minimap2" - } - - parameter_meta { - region_reads_fastq: "FASTQ reads extracted from edit region" - sample_ref_consensus_fasta: "Sample-specific reference consensus FASTA" - } - - input { - File region_reads_fastq - File sample_ref_consensus_fasta - String sample_id - Int threads = 4 - RuntimeAttributes runtime_attributes - } - - String out_prefix = "~{sample_id}_realigned" - - command <<< - set -euxo pipefail - - # Align reads to sample-specific reference using minimap2 - minimap2 -ax map-hifi -t ~{threads} -Y \ - ~{sample_ref_consensus_fasta} \ - ~{region_reads_fastq} > ~{out_prefix}.sam - - echo "Alignment complete" - >>> - - output { - File realigned_sam = "~{out_prefix}.sam" - } - - runtime { - docker: "quay.io/biocontainers/minimap2@sha256:fdc9ef8bfbd31bab59a61b2e90dd226647deed971556175a4cd004f0bcdc7608" - cpu: threads - memory: "16 GB" - disk: "50 GB" - preemptible: runtime_attributes.preemptible_tries - maxRetries: runtime_attributes.max_retries - awsBatchRetryAttempts: runtime_attributes.max_retries # !UnknownRuntimeKey - zones: runtime_attributes.zones - } -} - -task sam_to_sorted_tagged_bam { - meta { - description: "Convert SAM to sorted BAM, generate stats, and transfer haplotype tags from original BAM" - } - - parameter_meta { - sam_file: "SAM alignment file" - original_bam: "Original haplotagged BAM (source of HP/PS tags)" - original_bam_index: "Index for original BAM" - crispr_edit_json: "JSON file with region info" - } - - input { - File sam_file - File original_bam - File original_bam_index - File crispr_edit_json - String sample_id - Int threads = 4 - RuntimeAttributes runtime_attributes - } - - String out_prefix = "~{sample_id}_realigned" - String tagged_prefix = "~{sample_id}_realigned_tagged" - Int padding = 5000 - - command <<< - set -euxo pipefail - - # Step 1: Convert to BAM, sort, and index - samtools view -b -@ ~{threads} ~{sam_file} | \ - samtools sort -@ ~{threads} -o ~{out_prefix}.bam - - samtools index -@ ~{threads} ~{out_prefix}.bam - - # Count aligned reads and generate stats - ALIGNED_COUNT=$(samtools view -c ~{out_prefix}.bam) - echo "Sorted ${ALIGNED_COUNT} reads" - - samtools flagstat ~{out_prefix}.bam > ~{out_prefix}_flagstat.txt - - # Step 2: Transfer haplotype tags from original BAM - python3 <<'EOF' -import json -import pysam - -# Read CRISPR edit JSON to get target region -with open("~{crispr_edit_json}", 'r') as f: - crispr_data = json.load(f) - -edit = crispr_data['edits'][0] -chr = edit['target']['chr'] -start = max(1, edit['target']['start'] - ~{padding}) -end = edit['target']['end'] + ~{padding} - -# Open BAMs -original = pysam.AlignmentFile("~{original_bam}", "rb") -realigned = pysam.AlignmentFile("~{out_prefix}.bam", "rb") - -# Create output BAM with same header as realigned -output = pysam.AlignmentFile("~{tagged_prefix}.bam", "wb", template=realigned) - -# Build dict of read_name -> (HP, PS) from original BAM in the region -tags_by_read = {} -for read in original.fetch(chr, start, end): - hp_tag = read.get_tag("HP") if 
read.has_tag("HP") else None - ps_tag = read.get_tag("PS") if read.has_tag("PS") else None - if hp_tag is not None or ps_tag is not None: - tags_by_read[read.query_name] = (hp_tag, ps_tag) - -print(f"Extracted tags from {len(tags_by_read)} reads in original BAM") - -# Transfer tags to realigned reads -transferred = 0 -for read in realigned.fetch(): - if read.query_name in tags_by_read: - hp_tag, ps_tag = tags_by_read[read.query_name] - if hp_tag is not None: - read.set_tag("HP", hp_tag, value_type='i') - if ps_tag is not None: - read.set_tag("PS", ps_tag, value_type='i') - transferred += 1 - output.write(read) - -print(f"Transferred tags to {transferred} reads in realigned BAM") - -original.close() -realigned.close() -output.close() -EOF - - # Index the tagged BAM - samtools index ~{tagged_prefix}.bam - - echo "Combined SAM conversion, sorting, and tag transfer complete" - >>> - - output { - File sorted_bam = "~{out_prefix}.bam" - File sorted_bam_index = "~{out_prefix}.bam.bai" - File alignment_stats = "~{out_prefix}_flagstat.txt" - File tagged_bam = "~{tagged_prefix}.bam" - File tagged_bam_index = "~{tagged_prefix}.bam.bai" - } - - runtime { - docker: "~{runtime_attributes.container_registry}/pb_wdl_base@sha256:4b889a1f21a6a7fecf18820613cf610103966a93218de772caba126ab70a8e87" - cpu: threads - memory: "8 GB" - disk: "50 GB" - preemptible: runtime_attributes.preemptible_tries - maxRetries: runtime_attributes.max_retries - awsBatchRetryAttempts: runtime_attributes.max_retries # !UnknownRuntimeKey - zones: runtime_attributes.zones - } -} - -task create_haplotype_consensus_from_realigned { - meta { - description: "Create haplotype consensus sequences from reads realigned to sample-specific reference" - } - - parameter_meta { - realigned_bam: "BAM file with reads realigned to sample-specific reference" - realigned_bam_index: "Index for realigned BAM" - sample_ref_consensus_fasta: "Sample-specific reference consensus FASTA (used as coordinate system)" - crispr_edit_json: "JSON file describing CRISPR edit with target region" - } - - input { - File realigned_bam - File realigned_bam_index - File sample_ref_consensus_fasta - File crispr_edit_json - String sample_id - RuntimeAttributes runtime_attributes - } - - String out_prefix = "~{sample_id}_haplotype_consensus" - - command <<< - set -euxo pipefail - - python3 <<'EOF' -import json - -# Read CRISPR edit JSON to get target info -with open("~{crispr_edit_json}", 'r') as f: - crispr_data = json.load(f) - -# Get first edit's metadata -edit = crispr_data['edits'][0] -symbol = edit['target']['symbol'] - -# Write symbol for bash -with open('symbol.txt', 'w') as f: - f.write(f"{symbol}\n") -EOF - - SYMBOL=$(cat symbol.txt) - - echo "Creating haplotype consensus sequences for ${SYMBOL}" - - # Get the sequence name from the sample reference (should be first/only sequence) - REF_NAME=$(head -n 1 ~{sample_ref_consensus_fasta} | cut -d' ' -f1 | sed 's/>//') - echo "Reference sequence name: ${REF_NAME}" - - # Split BAM by haplotype tag (HP) - samtools split -d HP --output-fmt BAM --write-index -f "hap%#.%." -u "unphased.bam" ~{realigned_bam} - - # Check if haplotype files were created and generate consensus for each - if [ -f "hap0.bam" ]; then - echo "Generating consensus for haplotype 1..." 
- samtools consensus -a -m simple --min-MQ 10 --min-BQ 10 -f fasta -X hifi hap0.bam > hap1_consensus.fasta || echo ">hap1_no_consensus" > hap1_consensus.fasta - - # Get depth info - samtools depth hap0.bam | awk '{sum+=$3; count++} END {if(count>0) print "Haplotype 1 mean depth:", sum/count; else print "Haplotype 1 mean depth: 0"}' > hap1_depth.txt - cat hap1_depth.txt - else - echo "No haplotype 1 reads found" - echo ">hap1_no_consensus" > hap1_consensus.fasta - echo "Haplotype 1 mean depth: 0" > hap1_depth.txt - fi - - if [ -f "hap1.bam" ]; then - echo "Generating consensus for haplotype 2..." - samtools consensus -a -m simple --min-MQ 10 --min-BQ 10 -f fasta -X hifi hap1.bam > hap2_consensus.fasta || echo ">hap2_no_consensus" > hap2_consensus.fasta - - # Get depth info - samtools depth hap1.bam | awk '{sum+=$3; count++} END {if(count>0) print "Haplotype 2 mean depth:", sum/count; else print "Haplotype 2 mean depth: 0"}' > hap2_depth.txt - cat hap2_depth.txt - else - echo "No haplotype 2 reads found" - echo ">hap2_no_consensus" > hap2_consensus.fasta - echo "Haplotype 2 mean depth: 0" > hap2_depth.txt - fi - - # Combine consensus sequences with informative headers - python3 <<'EOF' -import sys - -symbol = open('symbol.txt').read().strip() - -# Combine haplotype consensus files -with open('~{out_prefix}_combined.fasta', 'w') as out: - # Haplotype 1 - with open('hap1_consensus.fasta', 'r') as f: - lines = f.readlines() - if len(lines) > 1: - seq = ''.join(line.strip() for line in lines[1:]) - out.write(f">{symbol}_hap1_realigned\n{seq}\n") - else: - out.write(f">{symbol}_hap1_no_consensus\nN\n") - - # Haplotype 2 - with open('hap2_consensus.fasta', 'r') as f: - lines = f.readlines() - if len(lines) > 1: - seq = ''.join(line.strip() for line in lines[1:]) - out.write(f">{symbol}_hap2_realigned\n{seq}\n") - else: - out.write(f">{symbol}_hap2_no_consensus\nN\n") - -print("Combined haplotype consensus sequences created") -EOF - - # Create summary file - echo "Haplotype Consensus Summary for ${SYMBOL}" > ~{out_prefix}_summary.txt - echo "==========================================" >> ~{out_prefix}_summary.txt - echo "" >> ~{out_prefix}_summary.txt - cat hap1_depth.txt >> ~{out_prefix}_summary.txt - cat hap2_depth.txt >> ~{out_prefix}_summary.txt - echo "" >> ~{out_prefix}_summary.txt - echo "Consensus sequences:" >> ~{out_prefix}_summary.txt - grep -c "^>" ~{out_prefix}_combined.fasta >> ~{out_prefix}_summary.txt || echo "0" >> ~{out_prefix}_summary.txt - >>> - - output { - File haplotype_consensus_fasta = "~{out_prefix}_combined.fasta" - File consensus_summary = "~{out_prefix}_summary.txt" - } - - runtime { - docker: "~{runtime_attributes.container_registry}/pb_wdl_base@sha256:4b889a1f21a6a7fecf18820613cf610103966a93218de772caba126ab70a8e87" - cpu: 2 - memory: "8 GB" - disk: "20 GB" - preemptible: runtime_attributes.preemptible_tries - maxRetries: runtime_attributes.max_retries - awsBatchRetryAttempts: runtime_attributes.max_retries # !UnknownRuntimeKey - zones: runtime_attributes.zones - } -} - task annotate_consensus_genbank { meta { description: "Create GenBank files annotating edit parts on haplotype consensus sequences" } parameter_meta { - parts_alignment_paf: "PAF file from minimap2 alignment of edit parts to consensus sequences" + parts_alignment_tsv: "Blastn tabular output from blastn_remap_parts (edit parts aligned to consensus sequences)" consensus_fasta: "FASTA file with haplotype consensus sequences" crispr_edit_json: "JSON file describing CRISPR edit structure" 
coverage_threshold: "Minimum coverage threshold for faithful alignments" } input { - File parts_alignment_paf + File parts_alignment_tsv File consensus_fasta File crispr_edit_json Float coverage_threshold = 0.8 @@ -562,37 +51,42 @@ from Bio.SeqFeature import SeqFeature, FeatureLocation # Type aliases Hit = Tuple[int, int, int, int, str, int] # q_start, q_end, t_start, t_end, strand, n_match -def parse_paf_line(line: str) -> Dict: - """Parse a single PAF format line""" +def parse_blastn_line(line: str) -> Dict: + """Parse a single blastn tabular line (outfmt 6 qseqid qlen qstart qend sseqid slen sstart send nident mismatch gapopen gaps length sstrand bitscore)""" f = line.rstrip("\n").split("\t") - if len(f) < 12: - raise ValueError("PAF line has fewer than 12 mandatory columns") + if len(f) < 15: + raise ValueError(f"Expected >=15 columns, got {len(f)}") + sstart, send = int(f[6]), int(f[7]) return dict( query_name=f[0], query_len=int(f[1]), - q_start=int(f[2]), q_end=int(f[3]), strand=f[4], - target_name=f[5], target_len=int(f[6]), - t_start=int(f[7]), t_end=int(f[8]), - n_match=int(f[9]), n_align=int(f[10]), mapq=int(f[11]), + q_start=int(f[2]) - 1, q_end=int(f[3]), + strand="+" if f[13] == "plus" else "-", + target_name=f[4], target_len=int(f[5]), + t_start=min(sstart, send) - 1, t_end=max(sstart, send), + n_match=int(f[8]), n_align=int(f[12]), ) -def load_faithful_alignments(paf_path: str, cov_thr: float) -> Tuple[Dict[str, Dict[str, List[Hit]]], Dict[str, int]]: +def load_faithful_alignments(tsv_path: str, cov_thr: float) -> Tuple[Dict[str, Dict[str, List[Hit]]], Dict[str, int]]: """ - Load only faithful alignments (high coverage, low gaps) from PAF. + Load only faithful alignments (high coverage, low gaps) from blastn tabular output. Returns: ({target_consensus_name: {query_part_name: [Hit, ...]}}, {query_name: length}) """ faithful = defaultdict(lambda: defaultdict(list)) qlen = {} - with open(paf_path) as fh: + with open(tsv_path) as fh: for ln in fh: - if not ln.strip(): + if not ln.strip() or ln.startswith('#'): + continue + try: + r = parse_blastn_line(ln) + except ValueError: continue - r = parse_paf_line(ln) qlen.setdefault(r["query_name"], r["query_len"]) # Calculate coverage and gap metrics cov_hit = r["n_match"] / r["query_len"] - gap_ok = abs(r["n_match"] - r["n_align"]) <= 0.05 * r["n_match"] + gap_ok = r["n_align"] - r["n_match"] <= 0.05 * r["n_match"] # Only keep faithful alignments if cov_hit >= cov_thr and gap_ok: @@ -620,21 +114,33 @@ def load_edit_metadata(json_path: str) -> Dict: 'symbol': edit['target']['symbol'] } -def create_source_and_gene_features(seq_len: int, metadata: Dict, haplotype: str) -> List[SeqFeature]: +def parse_group_label(seq_id: str, description: str) -> str: + """ + Extract a human-readable group label from MSA consensus FASTA headers. + Header format: '>group_{N} n_reads={K}' or '>group_{N}_singleton n_reads=1' + or '>group_{N}_hap_{L} n_reads={K}' for split haplotypes. + Falls back gracefully for anything else. + """ + import re + m = re.match(r'(group_\d+(?:_(?:singleton|hap_\w+))?)', seq_id) + if m: + nr = '' + for token in description.split(): + if token.startswith('n_reads='): + nr = token[8:] + parts = [m.group(1)] + if nr: + parts.append(f"n_reads={nr}") + return ' '.join(parts) + # Fallback + return seq_id + +def create_source_and_gene_features(seq_len: int, metadata: Dict, group_label: str) -> List[SeqFeature]: """ Create source and gene features for the genomic context of the consensus sequence. 
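# Annotation (not part of the script): worked example of the faithfulness test
# in load_faithful_alignments above, with made-up numbers. A left_HA part of
# qlen=800 aligned with nident=700 over alignment length 720:
cov_hit = 700 / 800                      # 0.875 >= 0.8 coverage threshold -> pass
gap_ok = (720 - 700) <= 0.05 * 700       # 20 bp of gaps <= 35 -> pass, hit is kept
# The same part with nident=600 would give coverage 0.75 and be discarded.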
- - Args: - seq_len: Length of the consensus sequence - metadata: Edit metadata with genomic coordinates - haplotype: 'hap1' or 'hap2' - - Returns: - List containing source and gene SeqFeature objects """ features = [] - # Source feature covering entire sequence source_feature = SeqFeature( location=FeatureLocation(0, seq_len, strand=+1), type="source", @@ -644,12 +150,11 @@ def create_source_and_gene_features(seq_len: int, metadata: Dict, haplotype: str "db_xref": ["taxon:9606"], "chromosome": [metadata['chromosome'].replace('chr', '')], "map": [f"{metadata['start']}-{metadata['end']}"], - "note": [f"Consensus sequence from ~{sample_id} {haplotype}"] + "note": [f"Consensus from ~{sample_id} {group_label}"] } ) features.append(source_feature) - # Gene feature covering entire sequence gene_feature = SeqFeature( location=FeatureLocation(0, seq_len, strand=+1), type="gene", @@ -756,17 +261,11 @@ def annotate_consensus_sequences(faithful_alignments: Dict[str, Dict[str, List[H # Get consensus sequence original_rec = consensus_seqs[seq_id] - # Determine haplotype from sequence ID - if '_hap1' in seq_id: - haplotype = 'hap1' - elif '_hap2' in seq_id: - haplotype = 'hap2' - else: - haplotype = 'unknown' + group_label = parse_group_label(seq_id, original_rec.description) # Create source and gene features source_gene_features = create_source_and_gene_features( - len(original_rec.seq), edit_metadata, haplotype + len(original_rec.seq), edit_metadata, group_label ) # Create edit part features @@ -780,7 +279,7 @@ def annotate_consensus_sequences(faithful_alignments: Dict[str, Dict[str, List[H seq=original_rec.seq, id=original_rec.id, name=original_rec.id[:16], # GenBank name max 16 chars - description=f"~{sample_id} {edit_metadata['symbol']} {haplotype} consensus {edit_metadata['chromosome']}:{edit_metadata['start']}-{edit_metadata['end']}", + description=f"~{sample_id} {edit_metadata['symbol']} {group_label} consensus {edit_metadata['chromosome']}:{edit_metadata['start']}-{edit_metadata['end']}", features=all_features, annotations={ "source": "~{sample_id}", @@ -804,7 +303,7 @@ def main(): parser = argparse.ArgumentParser( description="Create GenBank annotations for CRISPR edit parts on haplotype consensus sequences" ) - parser.add_argument("paf", help="PAF alignment file") + parser.add_argument("tsv", help="Blastn tabular alignment file") parser.add_argument("fasta", help="FASTA file with consensus sequences") parser.add_argument("json", help="CRISPR edit JSON file") parser.add_argument("-c", "--coverage_threshold", type=float, default=0.8, @@ -814,8 +313,8 @@ def main(): args = parser.parse_args() - print("Loading faithful alignments from PAF...") - faithful_alignments, query_lens = load_faithful_alignments(args.paf, args.coverage_threshold) + print("Loading faithful alignments from blastn TSV...") + faithful_alignments, query_lens = load_faithful_alignments(args.tsv, args.coverage_threshold) print(f"Found {len(faithful_alignments)} consensus sequences with faithful alignments") print("Loading edit metadata...") @@ -856,7 +355,7 @@ EOF # Run the annotation script python3 create_genbank_annotations.py \ - ~{parts_alignment_paf} \ + ~{parts_alignment_tsv} \ ~{consensus_fasta} \ ~{crispr_edit_json} \ -c ~{coverage_threshold} \ diff --git a/workflows/edit-qc/crispr_edit.schema.json b/workflows/edit-qc/crispr_edit.schema.json index 8b307efd..1e46159a 100644 --- a/workflows/edit-qc/crispr_edit.schema.json +++ b/workflows/edit-qc/crispr_edit.schema.json @@ -79,35 +79,43 @@ "type": "string", "description": 
"right homology arm sequence" }, + "donor_context_before": { + "type": "string", + "description": "optional donor sequence upstream of left_ha (e.g. plasmid backbone 5' of the HDR cassette); not used for edit validation but included when constructing the full donor sequence for knock-knock" + }, + "donor_context_after": { + "type": "string", + "description": "optional donor sequence downstream of right_ha (e.g. plasmid backbone 3' of the HDR cassette); not used for edit validation but included when constructing the full donor sequence for knock-knock" + }, "payload_components": { "type": "array", "items": { "$ref": "#/$defs/NamedSequence" } - }, - "required": [ - "type", - "strand", - "guide", - "left_ha", - "payload", - "right_ha" - ] - } - }, - "required": [ - "id", - "target", - "edit" - ] - } + } + }, + "required": [ + "type", + "strand", + "guide", + "left_ha", + "payload", + "right_ha" + ] + } + }, + "required": [ + "id", + "target", + "edit" + ] } - }, - "required": [ - "ref", - "edits" - ] + } }, + "required": [ + "ref", + "edits" + ], "$defs": { "NamedSequence": { "type": "object", diff --git a/workflows/edit-qc/crispr_edit_consensus.wdl b/workflows/edit-qc/crispr_edit_consensus.wdl index f1816a8e..2854871f 100644 --- a/workflows/edit-qc/crispr_edit_consensus.wdl +++ b/workflows/edit-qc/crispr_edit_consensus.wdl @@ -6,131 +6,55 @@ import "crispr_consensus_tasks.wdl" as CrisprConsensusTasks workflow crispr_edit_consensus { meta { - description: "Generate haplotype consensus sequences and annotate CRISPR edit parts in GenBank format using sample-specific reference" + description: "Annotate MSA-based group consensus sequences with CRISPR edit parts in GenBank format" } parameter_meta { sample_id: "Sample identifier" - haplotagged_bam: "Haplotagged BAM file from phasing" - haplotagged_bam_index: "Index for haplotagged BAM" - ref_fasta: "Reference genome FASTA" - ref_fasta_index: "Reference genome FASTA index" - small_variant_vcf: "Sample small variant VCF (phased)" - small_variant_vcf_index: "Index for small variant VCF" - sv_vcf: "Sample structural variant VCF (phased)" - sv_vcf_index: "Index for structural variant VCF" - crispr_edit_json: "JSON file describing expected CRISPR edit with target region" + consensus_fasta: "Per-group consensus FASTA from build_group_consensus (crispr_edit_qc output)" + crispr_edit_json: "JSON file describing expected CRISPR edit" coverage_threshold: "Minimum coverage threshold for faithful alignments (0-1)" } input { String sample_id - File haplotagged_bam - File haplotagged_bam_index - File ref_fasta - File ref_fasta_index - File small_variant_vcf - File small_variant_vcf_index - File sv_vcf - File sv_vcf_index + File consensus_fasta File crispr_edit_json Float coverage_threshold = 0.8 RuntimeAttributes runtime_attributes } - # Step 1: Create sample-specific reference consensus using bcftools consensus - call CrisprConsensusTasks.create_sample_reference_consensus { - input: - ref_fasta = ref_fasta, - ref_fasta_index = ref_fasta_index, - small_variant_vcf = small_variant_vcf, - small_variant_vcf_index = small_variant_vcf_index, - sv_vcf = sv_vcf, - sv_vcf_index = sv_vcf_index, - crispr_edit_json = crispr_edit_json, - sample_id = sample_id, - runtime_attributes = runtime_attributes - } - - # Step 2a: Extract reads from edit region - call CrisprConsensusTasks.extract_region_reads { - input: - haplotagged_bam = haplotagged_bam, - haplotagged_bam_index = haplotagged_bam_index, - crispr_edit_json = crispr_edit_json, - sample_id = sample_id, - 
runtime_attributes = runtime_attributes - } - - # Step 2b: Align reads to sample-specific reference using minimap2 - call CrisprConsensusTasks.minimap2_align_to_sample_reference { - input: - region_reads_fastq = extract_region_reads.region_reads_fastq, - sample_ref_consensus_fasta = create_sample_reference_consensus.sample_ref_consensus_fasta, - sample_id = sample_id, - runtime_attributes = runtime_attributes - } - - # Step 2c: Convert SAM to sorted BAM and transfer haplotype tags - call CrisprConsensusTasks.sam_to_sorted_tagged_bam { - input: - sam_file = minimap2_align_to_sample_reference.realigned_sam, - original_bam = haplotagged_bam, - original_bam_index = haplotagged_bam_index, - crispr_edit_json = crispr_edit_json, - sample_id = sample_id, - runtime_attributes = runtime_attributes - } - - # Step 3: Create haplotype consensus sequences from realigned reads - call CrisprConsensusTasks.create_haplotype_consensus_from_realigned { - input: - realigned_bam = sam_to_sorted_tagged_bam.tagged_bam, - realigned_bam_index = sam_to_sorted_tagged_bam.tagged_bam_index, - sample_ref_consensus_fasta = create_sample_reference_consensus.sample_ref_consensus_fasta, - crispr_edit_json = crispr_edit_json, - sample_id = sample_id, - runtime_attributes = runtime_attributes - } - - # Step 4: Create FASTA with edit parts for alignment (including payload_components for detailed annotation) + # Build parts query FASTA (including payload components for detailed annotation) call CrisprEditTasks.create_parts_query_fasta { input: - crispr_edit_json = crispr_edit_json, + crispr_edit_json = crispr_edit_json, include_payload_components = true, - runtime_attributes = runtime_attributes + runtime_attributes = runtime_attributes } - # Step 5: Align edit parts to haplotype consensus sequences - call CrisprEditTasks.minimap2_remap_parts { + # Align edit parts to group consensus sequences + call CrisprEditTasks.blastn_remap_parts as blastn_remap_consensus_parts { input: - filtered_reads_fasta = create_haplotype_consensus_from_realigned.haplotype_consensus_fasta, - parts_query_fasta = create_parts_query_fasta.parts_query_fasta, - sample_id = sample_id, - runtime_attributes = runtime_attributes + filtered_reads_fasta = consensus_fasta, + parts_query_fasta = create_parts_query_fasta.parts_query_fasta, + sample_id = sample_id, + runtime_attributes = runtime_attributes } - # Step 6: Create annotated GenBank files + # Annotate consensus sequences and emit GenBank call CrisprConsensusTasks.annotate_consensus_genbank { input: - parts_alignment_paf = minimap2_remap_parts.parts_alignment_paf, - consensus_fasta = create_haplotype_consensus_from_realigned.haplotype_consensus_fasta, - crispr_edit_json = crispr_edit_json, - coverage_threshold = coverage_threshold, - sample_id = sample_id, - runtime_attributes = runtime_attributes + parts_alignment_tsv = blastn_remap_consensus_parts.parts_alignment_tsv, + consensus_fasta = consensus_fasta, + crispr_edit_json = crispr_edit_json, + coverage_threshold = coverage_threshold, + sample_id = sample_id, + runtime_attributes = runtime_attributes } output { - File sample_ref_consensus_fasta = create_sample_reference_consensus.sample_ref_consensus_fasta - File realigned_bam = sam_to_sorted_tagged_bam.tagged_bam - File realigned_bam_index = sam_to_sorted_tagged_bam.tagged_bam_index - File alignment_stats = sam_to_sorted_tagged_bam.alignment_stats - File haplotype_consensus_fasta = create_haplotype_consensus_from_realigned.haplotype_consensus_fasta - File consensus_summary = 
create_haplotype_consensus_from_realigned.consensus_summary - File parts_alignment_paf = minimap2_remap_parts.parts_alignment_paf + File parts_alignment_tsv = blastn_remap_consensus_parts.parts_alignment_tsv File genbank_annotations = annotate_consensus_genbank.genbank_annotations - File annotation_summary = annotate_consensus_genbank.annotation_summary - File region_reads_fastq = extract_region_reads.region_reads_fastq + File annotation_summary = annotate_consensus_genbank.annotation_summary } } diff --git a/workflows/edit-qc/crispr_edit_qc.wdl b/workflows/edit-qc/crispr_edit_qc.wdl index 82c1a652..a7c26b8f 100644 --- a/workflows/edit-qc/crispr_edit_qc.wdl +++ b/workflows/edit-qc/crispr_edit_qc.wdl @@ -10,70 +10,187 @@ workflow crispr_edit_qc { parameter_meta { sample_id: "Sample identifier" - fasta_reads: "FASTA converted reads from upstream workflow" + fasta_reads: "FASTA converted reads from upstream workflow (whole genome; mutually exclusive with pre_filtered_reads_fasta)" + pre_filtered_reads_fasta: "Pre-filtered FASTA already restricted to the edit region (skips minimap2 filtering when provided)" + haplotagged_bam: "Optional whole-genome aligned BAM. When provided, reads with high-confidence primary alignments >offtarget_window_bp from the edit site are excluded before edit-part search; unmapped and low-MAPQ reads are retained." + haplotagged_bam_index: "Index for haplotagged_bam" + offtarget_window_bp: "Distance (bp) from edit target interval within which on-target primary alignments are kept" + offtarget_mapq_threshold: "Primary alignments with MAPQ <= this value are kept as low-confidence (not excluded as off-target)" crispr_edit_json: "JSON file describing expected CRISPR edit" min_identity: "Minimum alignment identity for initial filtering (0-1)" coverage_threshold: "Minimum coverage threshold for faithful alignments (0-1)" - threads: "Number of threads for minimap2" + threads: "Number of threads for minimap2 (initial read filtering step)" + clip_padding: "Flanking bases to retain around part-hit span when clipping reads for MSA" + gap_merge_threshold: "Max gap difference (bp) between two reads to merge into the same structural group" + wt_gap_threshold: "Max gap (bp) between left_HA and right_HA to call WT_candidate (covers cut-site sequence)" + min_flank_bp: "Minimum flanking bases beyond part hits required to call HDR or WT (vs inconclusive)" + min_part_hit_bp: "Minimum query-span (bp) for a blastn hit to be counted; filters repetitive short matches" } input { String sample_id - File fasta_reads + File? fasta_reads + File? pre_filtered_reads_fasta + File? haplotagged_bam + File? 
haplotagged_bam_index File crispr_edit_json Float min_identity = 0.80 Float coverage_threshold = 0.8 + Int offtarget_window_bp = 500000 + Int offtarget_mapq_threshold = 20 Int threads = 8 + Int clip_padding = 100 + Int gap_merge_threshold = 5 + Int wt_gap_threshold = 20 + Int min_flank_bp = 50 + Int min_part_hit_bp = 100 RuntimeAttributes runtime_attributes } + # ── Build parts query FASTA from edit JSON ──────────────────────────────── call CrisprEditTasks.create_parts_query_fasta { input: crispr_edit_json = crispr_edit_json, runtime_attributes = runtime_attributes } - call CrisprEditTasks.minimap2_align_reads { + # ── Optional off-target read exclusion using whole-genome BAM ───────────── + if (!defined(pre_filtered_reads_fasta) && defined(haplotagged_bam)) { + call CrisprEditTasks.filter_offtarget_reads { + input: + haplotagged_bam = select_first([haplotagged_bam]), + haplotagged_bam_index = select_first([haplotagged_bam_index]), + crispr_edit_json = crispr_edit_json, + sample_id = sample_id, + window_bp = offtarget_window_bp, + mapq_threshold = offtarget_mapq_threshold, + runtime_attributes = runtime_attributes + } + } + + # ── Initial read filter: minimap2 (skipped when pre_filtered_reads_fasta is supplied) ─ + if (!defined(pre_filtered_reads_fasta)) { + File minimap2_input_fasta = select_first([filter_offtarget_reads.filtered_reads_fasta, + fasta_reads]) + call CrisprEditTasks.minimap2_align_reads { + input: + fasta_reads = minimap2_input_fasta, + parts_query_fasta = create_parts_query_fasta.parts_query_fasta, + min_identity = min_identity, + threads = threads, + sample_id = sample_id, + runtime_attributes = runtime_attributes + } + + call CrisprEditTasks.extract_reads { + input: + fasta_reads = minimap2_input_fasta, + matched_read_names = minimap2_align_reads.matched_read_names, + sample_id = sample_id, + runtime_attributes = runtime_attributes + } + } + + File actual_filtered_fasta = select_first([pre_filtered_reads_fasta, + extract_reads.filtered_reads_fasta]) + + # ── Pass-1 blastn: map parts to filtered reads ──────────────────────────── + call CrisprEditTasks.blastn_remap_parts as blastn_pass1 { input: - fasta_reads = fasta_reads, + filtered_reads_fasta = actual_filtered_fasta, parts_query_fasta = create_parts_query_fasta.parts_query_fasta, - min_identity = min_identity, - threads = threads, + sample_id = sample_id + "_pass1", + runtime_attributes = runtime_attributes + } + + # ── Clip reads to union of part-hit spans ───────────────────────────────── + call CrisprEditTasks.clip_reads_to_parts { + input: + parts_alignment_tsv = blastn_pass1.parts_alignment_tsv, + filtered_reads_fasta = actual_filtered_fasta, sample_id = sample_id, + clip_padding = clip_padding, + min_part_hit_bp = min_part_hit_bp, runtime_attributes = runtime_attributes } - call CrisprEditTasks.extract_reads { + # ── Structural grouping: signature + prototype clustering ───────────────── + call CrisprEditTasks.structural_grouping { input: - fasta_reads = fasta_reads, - matched_read_names = minimap2_align_reads.matched_read_names, + parts_alignment_tsv = blastn_pass1.parts_alignment_tsv, sample_id = sample_id, + gap_merge_threshold = gap_merge_threshold, + min_part_hit_bp = min_part_hit_bp, runtime_attributes = runtime_attributes } - call CrisprEditTasks.minimap2_remap_parts { + # ── Per-group MSA and error-corrected consensus (three steps) ──────────── + call CrisprEditTasks.prepare_group_fastas { input: - filtered_reads_fasta = extract_reads.filtered_reads_fasta, - parts_query_fasta = 
create_parts_query_fasta.parts_query_fasta, + clipped_reads_fasta = clip_reads_to_parts.clipped_reads_fasta, + groups_tsv = structural_grouping.groups_tsv, sample_id = sample_id, runtime_attributes = runtime_attributes } - call CrisprEditTasks.parse_edit_parts { + call CrisprEditTasks.run_group_msa { input: - parts_alignment_paf = minimap2_remap_parts.parts_alignment_paf, + group_inputs_tarball = prepare_group_fastas.group_inputs_tarball, + sample_id = sample_id, + runtime_attributes = runtime_attributes + } + + call CrisprEditTasks.build_group_consensus { + input: + msa_tarball = run_group_msa.msa_tarball, + singletons_fasta = prepare_group_fastas.singletons_fasta, + groups_tsv = structural_grouping.groups_tsv, + sample_id = sample_id, + runtime_attributes = runtime_attributes + } + + # ── Wide per-part coverage table (backwards-compatible) ────────────────── + call CrisprEditTasks.make_wide_coverage_table { + input: + parts_alignment_tsv = blastn_pass1.parts_alignment_tsv, coverage_threshold = coverage_threshold, + min_part_hit_bp = min_part_hit_bp, + sample_id = sample_id, + runtime_attributes = runtime_attributes + } + + # ── Final categorization ────────────────────────────────────────────────── + call CrisprEditTasks.categorize_reads { + input: + filtered_reads_fasta = actual_filtered_fasta, + groups_tsv = structural_grouping.groups_tsv, + subhaps_tsv = build_group_consensus.subhaps_tsv, + pass2_alignment_tsv = blastn_pass1.parts_alignment_tsv, sample_id = sample_id, + gap_merge_threshold = gap_merge_threshold, + wt_gap_threshold = wt_gap_threshold, + min_flank_bp = min_flank_bp, + min_part_hit_bp = min_part_hit_bp, runtime_attributes = runtime_attributes } output { - File? parts_query_fasta = create_parts_query_fasta.parts_query_fasta - File? filtered_reads_fasta = extract_reads.filtered_reads_fasta - File? parts_alignment_paf = minimap2_remap_parts.parts_alignment_paf - File? summary_table = parse_edit_parts.summary_table - File? faithful_table = parse_edit_parts.faithful_table - File? wide_table = parse_edit_parts.wide_table - File? wide_faithful_table = parse_edit_parts.wide_faithful_table + # Intermediate / diagnostic + File? parts_query_fasta = create_parts_query_fasta.parts_query_fasta + File? filtered_reads_fasta = extract_reads.filtered_reads_fasta + File? filtered_reads_fastq = extract_reads.filtered_reads_fastq + File? offtarget_filter_counts_tsv = filter_offtarget_reads.counts_tsv + File? parts_alignment_tsv = blastn_pass1.parts_alignment_tsv + File? clipped_reads_fasta = clip_reads_to_parts.clipped_reads_fasta + + # Per-part coverage table (wide faithful format, backwards-compatible) + File? wide_faithful_table = make_wide_coverage_table.wide_faithful_table + + # MSA outputs + File? msa_tarball = build_group_consensus.all_msa_tarball + File? consensus_fasta = build_group_consensus.consensus_fasta + + # Primary result + File? 
categories_tsv = categorize_reads.categories_tsv } } diff --git a/workflows/edit-qc/crispr_edit_tasks.wdl b/workflows/edit-qc/crispr_edit_tasks.wdl index cbb41216..a374173d 100644 --- a/workflows/edit-qc/crispr_edit_tasks.wdl +++ b/workflows/edit-qc/crispr_edit_tasks.wdl @@ -24,38 +24,32 @@ task create_parts_query_fasta { command <<< set -euxo pipefail - python3 <{edit_id}_left_HA\n{left_ha}\n") fasta_out.write(f">{edit_id}_payload\n{payload}\n") fasta_out.write(f">{edit_id}_right_HA\n{right_ha}\n") fasta_out.write(f">{edit_id}_HDR_template\n{left_ha}{payload}{right_ha}\n") fasta_out.write(f">{edit_id}_WT_template\n{left_ha}{right_ha}\n") - # Optionally include payload components for detailed annotation if include_components and 'payload_components' in edit['edit']: for component in edit['edit']['payload_components']: comp_name = component['name'].replace(" ", "_") - comp_seq = component['seq'] - fasta_out.write(f">{comp_name}\n{comp_seq}\n") -EOF + fasta_out.write(f">{comp_name}\n{component['seq']}\n") +PYEOF >>> output { @@ -74,6 +68,99 @@ EOF } } +task filter_offtarget_reads { + meta { + description: "Exclude reads with high-confidence primary alignments far from the edit site (off-target integrations). Keeps on-target (within window_bp of cut site), low-confidence (primary MAPQ<=threshold), and unmapped reads. Emits a gzipped FASTA and a counts report." + } + + parameter_meta { + haplotagged_bam: "Whole-genome aligned BAM (expected to include unmapped reads)" + haplotagged_bam_index: "Index for aligned BAM" + crispr_edit_json: "JSON file describing CRISPR edit with target region" + window_bp: "Keep high-confidence reads whose primary alignment start is within this many bp of the edit target interval (same chromosome)" + mapq_threshold: "Primary alignments with MAPQ <= this value are treated as low-confidence and kept" + } + + input { + File haplotagged_bam + File haplotagged_bam_index + File crispr_edit_json + String sample_id + Int window_bp = 500000 + Int mapq_threshold = 20 + RuntimeAttributes runtime_attributes + } + + String out_prefix = "~{sample_id}_ontarget_filter" + Float file_size = ceil(size(haplotagged_bam, "GB") * 2) + + command <<< + set -euxo pipefail + + python3 <<'PYEOF' +import json +with open("~{crispr_edit_json}") as fh: + data = json.load(fh) +edit = data['edits'][0] +chrom = edit['target']['chr'] +lo = max(1, edit['target']['start'] - ~{window_bp}) +hi = edit['target']['end'] + ~{window_bp} +with open('region.env', 'w') as fh: + fh.write(f"CHR={chrom}\nLO={lo}\nHI={hi}\n") +PYEOF + source region.env + echo "On-target window: ${CHR}:${LO}-${HI}" + + # Single pass over primary records: classify and emit keep list + counts. + # Flag bit 4 (unmapped) detected via integer arithmetic for portable awk. 
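# Annotation (not part of the task): the same keep/exclude decision expressed
# with pysam, illustrative only; the task streams samtools view output through
# the awk one-pass below instead. The function name classify is hypothetical.
import pysam

def classify(aln: pysam.AlignedSegment, chrom: str, lo: int, hi: int, mapq_max: int) -> str:
    if aln.is_secondary or aln.is_supplementary:       # -F 2304: primary records only
        return "ignore"
    if aln.is_unmapped:
        return "unmapped"                               # kept
    if aln.mapping_quality <= mapq_max:
        return "low_confidence"                         # kept
    # reference_start is 0-based; the awk below compares 1-based POS ($4)
    if aln.reference_name == chrom and lo <= aln.reference_start + 1 <= hi:
        return "on_target"                              # kept
    return "off_target_excluded"                        # dropped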
+ samtools view -F 2304 ~{haplotagged_bam} \ + | awk -F'\t' -v OFS='\t' \ + -v chr="${CHR}" -v lo="${LO}" -v hi="${HI}" -v q=~{mapq_threshold} ' + { + flag = $2 + 0 + unmapped = (int(flag/4) % 2) + if (unmapped) { u++; print $1 > "keep.txt"; next } + if (($5 + 0) <= q) { l++; print $1 > "keep.txt"; next } + if ($3 == chr && ($4+0) >= lo && ($4+0) <= hi) { o++; print $1 > "keep.txt"; next } + x++ + } + END { + print "on_target", o+0 > "counts.tsv" + print "low_confidence", l+0 > "counts.tsv" + print "unmapped", u+0 > "counts.tsv" + print "off_target_excluded", x+0 > "counts.tsv" + }' + + { echo -e "category\tread_count"; cat counts.tsv; } > ~{out_prefix}_counts.tsv + cat ~{out_prefix}_counts.tsv + + # Deduplicate keep list (a read may appear on multiple primary records across references + # only when split — still a single primary, but guard anyway) + LC_ALL=C sort -u keep.txt > keep.uniq.txt + echo "Kept unique read names: $(wc -l < keep.uniq.txt)" + + samtools view -h -N keep.uniq.txt -F 2304 ~{haplotagged_bam} \ + | samtools fasta - \ + | gzip -c > ~{out_prefix}.fasta.gz + >>> + + output { + File filtered_reads_fasta = "~{out_prefix}.fasta.gz" + File counts_tsv = "~{out_prefix}_counts.tsv" + } + + runtime { + docker: "~{runtime_attributes.container_registry}/pb_wdl_base@sha256:4b889a1f21a6a7fecf18820613cf610103966a93218de772caba126ab70a8e87" + cpu: 2 + memory: "8 GB" + disk: file_size + " GB" + preemptible: runtime_attributes.preemptible_tries + maxRetries: runtime_attributes.max_retries + awsBatchRetryAttempts: runtime_attributes.max_retries # !UnknownRuntimeKey + zones: runtime_attributes.zones + } +} + task minimap2_align_reads { meta { description: "Use minimap2 to identify ALL reads containing edit parts" @@ -155,10 +242,31 @@ task extract_reads { # Count reads MATCHED_READS=$(grep -c '^>' ~{out_prefix}_filtered_reads.fasta || echo 0) echo "Reads matching edit parts: $MATCHED_READS" + + # Convert FASTA to gzip FASTQ with synthetic Q40 quality scores (for tools requiring FASTQ) + python3 <<'PYEOF' +import gzip +name, buf = None, [] +with open("~{out_prefix}_filtered_reads.fasta") as fh, \ + gzip.open("~{out_prefix}_filtered_reads.fastq.gz", 'wt') as out: + for line in fh: + line = line.rstrip() + if line.startswith('>'): + if name is not None: + seq = ''.join(buf) + out.write(f"@{name}\n{seq}\n+\n{'I' * len(seq)}\n") + name, buf = line[1:], [] + else: + buf.append(line) + if name is not None: + seq = ''.join(buf) + out.write(f"@{name}\n{seq}\n+\n{'I' * len(seq)}\n") +PYEOF >>> output { File filtered_reads_fasta = "~{out_prefix}_filtered_reads.fasta" + File filtered_reads_fastq = "~{out_prefix}_filtered_reads.fastq.gz" } runtime { @@ -174,14 +282,101 @@ task extract_reads { } -task minimap2_remap_parts { +task extract_region_reads_fasta { meta { - description: "Remap edit parts to filtered reads with relaxed parameters" + description: "Extract reads from a haplotagged BAM overlapping the edit target region; output FASTA" } parameter_meta { - filtered_reads_fasta: "FASTA with reads containing edit parts" - parts_query_fasta: "FASTA file with edit parts sequences" + haplotagged_bam: "Haplotagged BAM file from phasing" + haplotagged_bam_index: "Index for haplotagged BAM" + crispr_edit_json: "JSON file describing CRISPR edit with target region" + padding: "Bases to add on each side of the target region" + } + + input { + File haplotagged_bam + File haplotagged_bam_index + File crispr_edit_json + String sample_id + Int padding = 10000 + RuntimeAttributes runtime_attributes + } + + String 
out_prefix = "~{sample_id}_region_reads" + + command <<< + set -euxo pipefail + + python3 <<'PYEOF' +import json +with open("~{crispr_edit_json}") as fh: + data = json.load(fh) +edit = data['edits'][0] +chrom = edit['target']['chr'] +start = max(1, edit['target']['start'] - ~{padding}) +end = edit['target']['end'] + ~{padding} +with open('region.txt', 'w') as fh: + fh.write(f"{chrom}:{start}-{end}\n") +PYEOF + + REGION=$(cat region.txt) + echo "Extracting reads from ${REGION}" + + samtools view -h ~{haplotagged_bam} "${REGION}" | \ + samtools fasta -F 2304 - \ + > ~{out_prefix}.fasta + + READ_COUNT=$(grep -c '^>' ~{out_prefix}.fasta || echo 0) + echo "Extracted ${READ_COUNT} reads" + + # Synthetic Q40 FASTQ for tools that require FASTQ + python3 <<'PYEOF' +import gzip +name, buf = None, [] +with open("~{out_prefix}.fasta") as fh, \ + gzip.open("~{out_prefix}.fastq.gz", 'wt') as out: + for line in fh: + line = line.rstrip() + if line.startswith('>'): + if name is not None: + seq = ''.join(buf) + out.write(f"@{name}\n{seq}\n+\n{'I' * len(seq)}\n") + name, buf = line[1:], [] + else: + buf.append(line) + if name is not None: + seq = ''.join(buf) + out.write(f"@{name}\n{seq}\n+\n{'I' * len(seq)}\n") +PYEOF + >>> + + output { + File region_reads_fasta = "~{out_prefix}.fasta" + File region_reads_fastq = "~{out_prefix}.fastq.gz" + } + + runtime { + docker: "~{runtime_attributes.container_registry}/pb_wdl_base@sha256:4b889a1f21a6a7fecf18820613cf610103966a93218de772caba126ab70a8e87" + cpu: 2 + memory: "4 GiB" + disk: "20 GB" + preemptible: runtime_attributes.preemptible_tries + maxRetries: runtime_attributes.max_retries + awsBatchRetryAttempts: runtime_attributes.max_retries # !UnknownRuntimeKey + zones: runtime_attributes.zones + } +} + + +task blastn_remap_parts { + meta { + description: "Remap edit parts to filtered reads using blastn" + } + + parameter_meta { + filtered_reads_fasta: "FASTA with reads containing edit parts (subject)" + parts_query_fasta: "FASTA file with edit parts sequences (query)" } input { @@ -193,25 +388,155 @@ task minimap2_remap_parts { String out_prefix = "~{sample_id}_edit_parts" + # Tabular columns: query(part) coords, subject(read) coords, alignment stats, strand + # Coordinates are 1-based; strand is "plus"/"minus". + # word_size=32 prevents spurious 31 bp hits from XTEN-family repeat sequences + # that share k-mers with the human genome (validated on czML1116 EEA1 dataset). + # r=1 p=-2 go=2 ge=1: best CIGAR accuracy in ground-truth evaluation. 
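# Annotation (not part of the task): how downstream parsers read one row of the
# tabular output declared just below (columns qseqid qlen qstart qend sseqid
# slen sstart send nident mismatch gapopen gaps length sstrand bitscore).
# The example values are made up; subject coordinates are swapped and converted
# to 0-based half-open, as in annotate_consensus_genbank and clip_reads_to_parts.
row = "edit1_left_HA\t800\t1\t800\tread42\t15000\t5200\t4401\t792\t6\t1\t2\t800\tminus\t1400"
f = row.split("\t")
q_start, q_end = int(f[2]) - 1, int(f[3])                  # part coords, 0-based half-open
sstart, send = int(f[6]), int(f[7])                        # blastn reports sstart > send on minus
t_start, t_end = min(sstart, send) - 1, max(sstart, send)  # read coords, orientation-free
strand = "+" if f[13] == "plus" else "-"
identity = int(f[8]) / int(f[1])                           # nident / qlen = 792/800 = 0.99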
+ String blastn_outfmt = "6 qseqid qlen qstart qend sseqid slen sstart send nident mismatch gapopen gaps length sstrand bitscore" + command <<< set -euxo pipefail - minimap2 -cx asm10 -N 1000 -p 0.5 \ - ~{filtered_reads_fasta} ~{parts_query_fasta} \ - > ~{out_prefix}_minimap2_crispr_parts.paf + blastn \ + -query ~{parts_query_fasta} \ + -subject ~{filtered_reads_fasta} \ + -outfmt "~{blastn_outfmt}" \ + -task blastn \ + -word_size 32 \ + -reward 1 \ + -penalty -2 \ + -gapopen 2 \ + -gapextend 1 \ + -evalue 1 \ + -perc_identity 80 \ + -max_target_seqs 10000 \ + -parse_deflines \ + > ~{out_prefix}_blastn_crispr_parts.tsv + + echo "Total alignments: $(wc -l < ~{out_prefix}_blastn_crispr_parts.tsv)" + >>> - # Count alignments - echo "Total alignments: $(wc -l < ~{out_prefix}_minimap2_crispr_parts.paf)" + output { + File parts_alignment_tsv = "~{out_prefix}_blastn_crispr_parts.tsv" + } + + runtime { + docker: "quay.io/biocontainers/blast@sha256:9dfc69f990c0aeb936276ee591ed32919a79f46dfa34060055e05a050a17959c" + cpu: 2 + memory: "8 GiB" + disk: "20 GB" + preemptible: runtime_attributes.preemptible_tries + maxRetries: runtime_attributes.max_retries + awsBatchRetryAttempts: runtime_attributes.max_retries # !UnknownRuntimeKey + zones: runtime_attributes.zones + } +} + + + +task clip_reads_to_parts { + meta { + description: "Clip filtered reads to the union of all blastn part-hit spans plus padding" + } + + parameter_meta { + parts_alignment_tsv: "Pass-1 blastn tabular output (from blastn_remap_parts)" + filtered_reads_fasta: "Uncompressed FASTA with reads overlapping edit parts" + clip_padding: "Bases to retain on each side of the union hit span (default 100)" + min_part_hit_bp: "Minimum query-span (bp) for a blastn hit to be counted; filters repetitive short matches" + } + + input { + File parts_alignment_tsv + File filtered_reads_fasta + String sample_id + Int clip_padding = 100 + Int min_part_hit_bp = 100 + RuntimeAttributes runtime_attributes + } + + String out_prefix = "~{sample_id}_clipped" + + command <<< + set -euxo pipefail + + python3 <<'PYEOF' +import sys +from collections import defaultdict + +# Parse blastn TSV: collect hit t-spans per read (all parts, all hits) +# Columns: qseqid qlen qstart qend sseqid slen sstart send nident mismatch gapopen gaps length sstrand bitscore +hit_spans = defaultdict(list) # read_name -> [(t_start, t_end), ...] 
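+# Worked example of what ends up in hit_spans (hypothetical read): hits spanning
+# subject positions 1200-1700 and 2900-3400 on a 15,000 bp read reduce to the
+# union span 1200-3400; with the default clip_padding of 100 the read is clipped
+# to 1100-3500 below.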
+read_lens = {} + +MIN_HIT = ~{min_part_hit_bp} + +with open("~{parts_alignment_tsv}") as fh: + for ln in fh: + if not ln.strip() or ln.startswith('#'): + continue + f = ln.rstrip('\n').split('\t') + if len(f) < 15: + continue + if abs(int(f[3]) - int(f[2])) < MIN_HIT: + continue + read_name = f[4] + sstart, send = int(f[6]), int(f[7]) + t_start = min(sstart, send) - 1 # 0-based half-open + t_end = max(sstart, send) + hit_spans[read_name].append((t_start, t_end)) + read_lens[read_name] = int(f[5]) + +padding = ~{clip_padding} + +# Compute clip window per read +clip_regions = {} +for rname, spans in hit_spans.items(): + rlen = read_lens[rname] + cs = max(0, min(s[0] for s in spans) - padding) + ce = min(rlen, max(s[1] for s in spans) + padding) + clip_regions[rname] = (cs, ce) + +# Stream FASTA and clip +n_written = 0 +name = None +buf = [] + +with open("~{filtered_reads_fasta}") as fh, \ + open("~{out_prefix}.fasta", 'w') as out: + def flush(name, buf): + global n_written + if name not in clip_regions: + return + seq = ''.join(buf) + cs, ce = clip_regions[name] + out.write(f">{name} clip={cs}-{ce} orig_len={len(seq)}\n{seq[cs:ce]}\n") + n_written += 1 + for ln in fh: + ln = ln.rstrip('\n') + if ln.startswith('>'): + if name is not None: + flush(name, buf) + name = ln[1:].split()[0] + buf = [] + else: + buf.append(ln.upper()) + if name is not None: + flush(name, buf) + +print(f"Clipped {n_written} reads (padding={padding} bp)") +PYEOF >>> output { - File parts_alignment_paf = "~{out_prefix}_minimap2_crispr_parts.paf" + File clipped_reads_fasta = "~{out_prefix}.fasta" } runtime { - docker: "quay.io/biocontainers/minimap2@sha256:fdc9ef8bfbd31bab59a61b2e90dd226647deed971556175a4cd004f0bcdc7608" - cpu: 4 - memory: "8 GB" + docker: "~{runtime_attributes.container_registry}/pb_wdl_base@sha256:4b889a1f21a6a7fecf18820613cf610103966a93218de772caba126ab70a8e87" + cpu: 1 + memory: "8 GiB" disk: "20 GB" preemptible: runtime_attributes.preemptible_tries maxRetries: runtime_attributes.max_retries @@ -220,170 +545,1038 @@ task minimap2_remap_parts { } } -task parse_edit_parts { + +task structural_grouping { meta { - description: "Parse PAF alignments to create summary tables with coverage metrics" + description: "Derive structural signatures from blastn part hits and cluster reads into groups" } parameter_meta { - parts_alignment_paf: "PAF file from minimap2 alignment of edit parts to reads" - coverage_threshold: "Minimum coverage threshold for faithful alignments" + parts_alignment_tsv: "Pass-1 blastn tabular output" + gap_merge_threshold: "Max gap difference (bp) between two reads to merge into the same structural group" + min_part_hit_bp: "Minimum query-span (bp) for a blastn hit to be counted; filters repetitive short matches" } input { - File parts_alignment_paf - Float coverage_threshold = 0.8 + File parts_alignment_tsv String sample_id + Int gap_merge_threshold = 5 + Int min_part_hit_bp = 100 RuntimeAttributes runtime_attributes } - String out_prefix = "~{sample_id}_edit_parts_summary" + String out_prefix = "~{sample_id}_structural" command <<< set -euxo pipefail - cat > parse_paf_to_table.py <<'EOF' -#!/usr/bin/env python3 -""" -parse_paf_to_table.py – 2025-07-03 + python3 <<'PYEOF' +from collections import defaultdict, Counter + +# ── Parse blastn TSV ────────────────────────────────────────────────────────── +# Collect hits for individual parts only (left_HA, payload, right_HA). 
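+# Parts are recognized by query-name suffix (_left_HA, _payload, _right_HA), as
+# in the suffix checks below.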
+# Skip HDR_template and WT_template — they are composite sequences and their +# hits are redundant with (and noisier than) the per-part hits. + +hits_by_read = defaultdict(list) # read_name -> [(part_base, strand, t_start, t_end)] + +MIN_HIT = ~{min_part_hit_bp} + +with open("~{parts_alignment_tsv}") as fh: + for ln in fh: + if not ln.strip() or ln.startswith('#'): + continue + f = ln.rstrip('\n').split('\t') + if len(f) < 15: + continue + qname = f[0] + read_name = f[4] + if abs(int(f[3]) - int(f[2])) < MIN_HIT: + continue + sstart, send = int(f[6]), int(f[7]) + t_start = min(sstart, send) - 1 + t_end = max(sstart, send) + strand = '+' if f[13] == 'plus' else '-' + + if qname.endswith('_left_HA'): part_base = 'left_HA' + elif qname.endswith('_right_HA'): part_base = 'right_HA' + elif qname.endswith('_payload'): part_base = 'payload' + else: continue # composite or unknown + + hits_by_read[read_name].append((part_base, strand, t_start, t_end)) + +# ── Build structural signature for each read ───────────────────────────────── +# Signature: list of (part_base, strand) in t_start order; gaps between them. + +def build_sig(hits): + hits_s = sorted(hits, key=lambda h: h[2]) # sort by t_start + parts_strands = [(h[0], h[1]) for h in hits_s] + lengths = [h[3] - h[2] for h in hits_s] + gaps = [hits_s[i+1][2] - hits_s[i][3] for i in range(len(hits_s)-1)] + return parts_strands, gaps, lengths + +def sig_str(ps, gs, ls): + tokens = [] + for i, (p, s) in enumerate(ps): + tokens.append(f"{p}:{s}:{ls[i]}") + if i < len(gs): + tokens.append(f"gap:{gs[i]}") + return '|'.join(tokens) if tokens else 'empty' + +# ── Cluster reads into structural groups ────────────────────────────────────── +# Prototype-based greedy clustering. +# Two reads join the same group iff: +# 1. Identical ordered (part, strand) sequence +# 2. 
|gap_i(A) - gap_i(B)| <= gap_merge_threshold for every gap position i + +GAP_THR = ~{gap_merge_threshold} + +prototypes = defaultdict(list) # pattern_key -> [(prototype_gaps, group_id)] +group_id_ctr = 0 +read_groups = {} # read_name -> (gid, sig_string) + +for read_name, hits in sorted(hits_by_read.items()): + ps, gs, ls = build_sig(hits) + key = tuple(ps) + + matched_gid = None + for proto_gs, gid in prototypes[key]: + if len(proto_gs) == len(gs) and all(abs(pg - g) <= GAP_THR + for pg, g in zip(proto_gs, gs)): + matched_gid = gid + break + + if matched_gid is None: + group_id_ctr += 1 + matched_gid = group_id_ctr + prototypes[key].append((gs, matched_gid)) + + read_groups[read_name] = (matched_gid, sig_str(ps, gs, ls)) + +# ── Write output ────────────────────────────────────────────────────────────── +group_sizes = Counter(gid for gid, _ in read_groups.values()) + +with open("~{out_prefix}_groups.tsv", 'w') as out: + out.write("read_name\tgroup_id\tsignature\tgroup_size\n") + for rname, (gid, sig) in sorted(read_groups.items()): + out.write(f"{rname}\t{gid}\t{sig}\t{group_sizes[gid]}\n") + +with open("~{out_prefix}_hits.tsv", 'w') as out: + out.write("read_name\tpart\tstrand\tt_start\tt_end\tmatch_len\n") + for rname, hits in sorted(hits_by_read.items()): + for part, strand, t_start, t_end in sorted(hits, key=lambda h: h[2]): + out.write(f"{rname}\t{part}\t{strand}\t{t_start}\t{t_end}\t{t_end - t_start}\n") + +print(f"Reads grouped : {len(read_groups)}") +print(f"Groups formed : {group_id_ctr}") +PYEOF + >>> -Adds per-query coverage columns computed as ∑n_match / query_len ----------------------------------------------------------------- + output { + File groups_tsv = "~{out_prefix}_groups.tsv" + File hits_tsv = "~{out_prefix}_hits.tsv" + } + + runtime { + docker: "~{runtime_attributes.container_registry}/pb_wdl_base@sha256:4b889a1f21a6a7fecf18820613cf610103966a93218de772caba126ab70a8e87" + cpu: 1 + memory: "4 GiB" + disk: "10 GB" + preemptible: runtime_attributes.preemptible_tries + maxRetries: runtime_attributes.max_retries + awsBatchRetryAttempts: runtime_attributes.max_retries # !UnknownRuntimeKey + zones: runtime_attributes.zones + } +} -Long table header: - target_name target_len query_name query_len coverage num_hits hits -Wide table header: - target_name target_len all_one_hit - _len _cov _num_hits _hit_coordinates - _len _cov _num_hits _hit_coordinates ... 
-""" +task prepare_group_fastas { + meta { + description: "Partition clipped reads into per-group input FASTAs ready for MAFFT; emit singletons separately" + } -import argparse, csv + input { + File clipped_reads_fasta + File groups_tsv + String sample_id + RuntimeAttributes runtime_attributes + } + + String out_prefix = "~{sample_id}_msa" + + command <<< + set -euxo pipefail + + python3 <<'PYEOF' +import os, tarfile from collections import defaultdict -from typing import Dict, List, Tuple -# ──────────── type aliases ──────────── # -Hit = Tuple[int, int, int, int, str, int] # … + n_match -HitDict = Dict[str, Dict[str, List[Hit]]] # hits[target][query] = [Hit, …] -LenDict = Dict[str, int] - -# ──────────── parsing ──────────── # -def parse_paf_line(line: str) -> Dict: - f = line.rstrip("\n").split("\t") - if len(f) < 12: - raise ValueError("PAF line has fewer than 12 mandatory columns") - return dict( - query_name=f[0], query_len=int(f[1]), - q_start=int(f[2]), q_end=int(f[3]), strand=f[4], - target_name=f[5], target_len=int(f[6]), - t_start=int(f[7]), t_end=int(f[8]), - n_match=int(f[9]), n_align=int(f[10]), mapq=int(f[11]), - ) - -def load_hits(paf_path: str, cov_thr: float): - hits, faithful = defaultdict(lambda: defaultdict(list)), defaultdict(lambda: defaultdict(list)) - tlen, qlen = {}, {} - with open(paf_path) as fh: +groups = defaultdict(list) + +with open("~{groups_tsv}") as fh: + fh.readline() + for ln in fh: + f = ln.rstrip('\n').split('\t') + if len(f) < 4: + continue + rname, gid = f[0], int(f[1]) + groups[gid].append(rname) + +reads = {} +name = None +buf = [] +with open("~{clipped_reads_fasta}") as fh: + for ln in fh: + ln = ln.rstrip('\n') + if ln.startswith('>'): + if name is not None: + reads[name] = ''.join(buf) + name = ln[1:].split()[0] + buf = [] + else: + buf.append(ln.upper()) + if name is not None: + reads[name] = ''.join(buf) + +os.makedirs("msa_inputs", exist_ok=True) +singletons_out = open("~{out_prefix}_singletons.fasta", 'w') + +for gid in sorted(groups): + members = [r for r in groups[gid] if r in reads] + if not members: + continue + if len(members) == 1: + seq = reads[members[0]] + singletons_out.write(f">group_{gid}_singleton:{members[0]}\n{seq}\n") + continue + with open(f"msa_inputs/group_{gid}_input.fasta", 'w') as gf: + for r in members: + gf.write(f">{r}\n{reads[r]}\n") + +singletons_out.close() + +with tarfile.open("~{out_prefix}_inputs.tar.gz", "w:gz") as tar: + tar.add("msa_inputs", arcname="msa_inputs") + +print(f"Groups written: {sum(1 for gid in groups if len([r for r in groups[gid] if r in reads]) > 1)} multi-read, " + f"{sum(1 for gid in groups if len([r for r in groups[gid] if r in reads]) == 1)} singleton") +PYEOF + >>> + + output { + File group_inputs_tarball = "~{out_prefix}_inputs.tar.gz" + File singletons_fasta = "~{out_prefix}_singletons.fasta" + } + + runtime { + docker: "~{runtime_attributes.container_registry}/pb_wdl_base@sha256:4b889a1f21a6a7fecf18820613cf610103966a93218de772caba126ab70a8e87" + cpu: 1 + memory: "8 GiB" + disk: "20 GB" + preemptible: runtime_attributes.preemptible_tries + maxRetries: runtime_attributes.max_retries + awsBatchRetryAttempts: runtime_attributes.max_retries # !UnknownRuntimeKey + zones: runtime_attributes.zones + } +} + + +task run_group_msa { + meta { + description: "Run MAFFT --auto on each per-group input FASTA; pure bash, no Python" + } + + input { + File group_inputs_tarball + String sample_id + RuntimeAttributes runtime_attributes + } + + String out_prefix = "~{sample_id}_msa" + + command <<< + set 
-euxo pipefail + + mkdir msa_inputs msa_outputs + tar -xzf ~{group_inputs_tarball} -C msa_inputs --strip-components=1 + + shopt -s nullglob + for input_fasta in msa_inputs/group_*_input.fasta; do + gid=$(basename "$input_fasta" _input.fasta | sed 's/^group_//') + mafft --auto --quiet "$input_fasta" > "msa_outputs/group_${gid}.fasta" + done + + tar -czf ~{out_prefix}_msa.tar.gz -C msa_outputs . + echo "MAFFT done: $(ls msa_outputs/*.fasta 2>/dev/null | wc -l) groups aligned" + >>> + + output { + File msa_tarball = "~{out_prefix}_msa.tar.gz" + } + + runtime { + docker: "quay.io/biocontainers/mafft@sha256:b8ccc402a9156868fefda5caa8693f847b47dcc69aae848557f7115850e60790" + cpu: 4 + memory: "16 GiB" + disk: "50 GB" + preemptible: runtime_attributes.preemptible_tries + maxRetries: runtime_attributes.max_retries + awsBatchRetryAttempts: runtime_attributes.max_retries # !UnknownRuntimeKey + zones: runtime_attributes.zones + } +} + + +task build_group_consensus { + meta { + description: "Parse per-group MSA FASTAs; build error-corrected consensus; emit group metadata" + } + + parameter_meta { + mask_min_support: "Mask consensus positions where plurality support <= this AND coverage >= mask_min_coverage" + mask_min_coverage: "Coverage threshold for masking (see mask_min_support)" + } + + input { + File msa_tarball + File singletons_fasta + File groups_tsv + String sample_id + Int mask_min_support = 2 + Int mask_min_coverage = 5 + RuntimeAttributes runtime_attributes + } + + String out_prefix = "~{sample_id}_msa" + + command <<< + set -euxo pipefail + + python3 <<'PYEOF' +import os, re, tarfile +from collections import defaultdict + +# ── Load group metadata ─────────────────────────────────────────────────────── +groups = defaultdict(list) +group_strand = {} # gid -> '+' or '-' +group_category = {} # gid -> 'HDR'|'WT'|'partial'|'misintegration'|'other' + +_FWD_HDR_PARTS = [('left_HA', '+'), ('payload', '+'), ('right_HA', '+')] +_REV_HDR_PARTS = [('right_HA', '-'), ('payload', '-'), ('left_HA', '-')] + +def _classify_sig(sig): + """Classify a group signature string into HDR / WT / partial / misintegration. + Signature format: '{part}:{strand}:{len}|gap:{bp}|...'. 
Structural only — does + not check gap sizes or flanking (partial consensus may represent a real + haplotype for inclusion decisions even if reads are truncated).""" + pairs = [] + for tok in sig.split('|'): + if tok.startswith('gap:') or tok == 'empty': + continue + parts = tok.split(':') + if len(parts) >= 2: + pairs.append((parts[0], parts[1])) + if not pairs: + return 'other' + strands = {s for _, s in pairs} + if len(strands) != 1: + return 'misintegration' + parts_only = [p for p, _ in pairs] + strand = next(iter(strands)) + if strand == '+': + if parts_only == ['left_HA', 'payload', 'right_HA']: + return 'HDR' + if parts_only == ['left_HA', 'right_HA']: + return 'WT' + if parts_only in (['left_HA', 'payload'], ['payload', 'right_HA'], + ['left_HA'], ['payload'], ['right_HA']): + return 'partial' + else: + if parts_only == ['right_HA', 'payload', 'left_HA']: + return 'HDR' + if parts_only == ['right_HA', 'left_HA']: + return 'WT' + if parts_only in (['payload', 'left_HA'], ['right_HA', 'payload'], + ['left_HA'], ['payload'], ['right_HA']): + return 'partial' + return 'misintegration' + +with open("~{groups_tsv}") as fh: + fh.readline() + for ln in fh: + f = ln.rstrip('\n').split('\t') + if len(f) < 4: + continue + rname, gid, sig = f[0], int(f[1]), f[2] + groups[gid].append(rname) + if gid not in group_strand: + m = re.search(r':([+-]):', sig) + if m: + group_strand[gid] = m.group(1) + if gid not in group_category: + group_category[gid] = _classify_sig(sig) + +def header_tags(gid): + tags = [] + if gid in group_strand: + tags.append(f"strand={group_strand[gid]}") + if gid in group_category: + tags.append(f"category={group_category[gid]}") + return (' ' + ' '.join(tags)) if tags else '' + +MASK_SUPP = ~{mask_min_support} +MASK_COV = ~{mask_min_coverage} + +os.makedirs("msa_groups", exist_ok=True) + +# ── Helpers ─────────────────────────────────────────────────────────────────── +def plurality_consensus(seqs_dict): + """Plurality-vote consensus with N-masking over {name: msa_seq}. + Gaps are treated as votes for deletion: if gaps are the plurality at a + column, that position is omitted from the consensus (correct for a + haplotype where most reads carry a deletion or the column is an insertion + in a minority of reads). + """ + width = len(next(iter(seqs_dict.values()))) + bases = [] + for col in range(width): + col_bases = [s[col] for s in seqs_dict.values()] + gap_count = col_bases.count('-') + non_gap = [b for b in col_bases if b != '-'] + if not non_gap or gap_count >= len(non_gap): + continue # all-gap or gap-plurality column → omit from consensus + freq = {} + for b in non_gap: + freq[b] = freq.get(b, 0) + 1 + best_base, best_count = max(freq.items(), key=lambda x: x[1]) + bases.append('N' if best_count <= MASK_SUPP and len(non_gap) >= MASK_COV + else best_base) + return ''.join(bases) + +def hamming(v1, v2): + # Gap vs base counts as a mismatch; only N positions are skipped + return sum(1 for a, b in zip(v1, v2) + if 'N' not in (a, b) and a != b) + +def retained_columns(seqs_dict): + """Return MSA column indices that plurality_consensus keeps (non-gap-plurality). 
+ Used to restrict haplotype clustering to error-corrected positions only.""" + width = len(next(iter(seqs_dict.values()))) + cols = [] + for col in range(width): + col_bases = [s[col] for s in seqs_dict.values()] + gap_count = col_bases.count('-') + non_gap = [b for b in col_bases if b != '-'] + if non_gap and gap_count < len(non_gap): + cols.append(col) + return cols + +def cluster_haplotypes(msa_seqs, cols=None): + """ + Detect variable sites (>= 2 distinct non-N alleles — including '-' for + indels — each in >= 2 reads) and cluster reads by identical allele vector + (Hamming = 0, skipping N positions only). Returns {read_name: hap_label} + where label is 'A', 'B', etc. Returns all 'A' if no variable sites exist + or any resulting cluster has fewer than 2 reads. + + cols: restrict variable-site search to these MSA column indices (pass-1 + error-corrected columns). Defaults to all columns. + """ + rnames = list(msa_seqs.keys()) + width = len(next(iter(msa_seqs.values()))) + if cols is None: + cols = range(width) + + var_cols = [] + for col in cols: + freq = {} + for r in rnames: + b = msa_seqs[r][col] + if b != 'N': # gaps count as a real allele + freq[b] = freq.get(b, 0) + 1 + if sum(1 for cnt in freq.values() if cnt >= 2) >= 2: + var_cols.append(col) + + if not var_cols: + return {r: 'A' for r in rnames} + + vecs = {r: tuple(msa_seqs[r][col] for col in var_cols) for r in rnames} + clusters = [] + for r in rnames: + for cluster in clusters: + if hamming(vecs[r], vecs[next(iter(cluster))]) == 0: + cluster.add(r) + break + else: + clusters.append({r}) + + # Require every cluster to have >= 2 reads; otherwise the split is unreliable + if any(len(c) < 2 for c in clusters): + return {r: 'A' for r in rnames} + + labels = [chr(65 + i) for i in range(len(clusters))] + return {r: labels[i] for i, cluster in enumerate(clusters) for r in cluster} + +# ── Sub-haplotype assignments (written at end) ──────────────────────────────── +subhap_assignments = {} # read_name -> "group_{gid}_hap_{label}" + +# ── Singletons pass-through ─────────────────────────────────────────────────── +name = None +buf = [] + +with open("~{out_prefix}_consensus.fasta", 'w') as consensus_out, \ + open("~{out_prefix}_group_meta.tsv", 'w') as group_meta_out: + group_meta_out.write("group_id\tn_reads\thas_consensus\n") + + def flush_singleton(header, buf): + tag = header.split()[0] # group_{gid}_singleton:{read_name} + try: + gid = int(tag.split('_')[1]) + except ValueError: + return + seq = ''.join(buf) + read_name = tag.split(':')[1] if ':' in tag else tag + consensus_out.write(f">group_{gid}_singleton n_reads=1{header_tags(gid)}\n{seq}\n") + with open(f"msa_groups/group_{gid}.fasta", 'w') as mf: + mf.write(f">{read_name}\n{seq}\n") + group_meta_out.write(f"{gid}\t1\tFALSE\n") + subhap_assignments[read_name] = f"group_{gid}_hap_A" + + with open("~{singletons_fasta}") as fh: for ln in fh: - if not ln.strip(): continue - r = parse_paf_line(ln) - tlen.setdefault(r["target_name"], r["target_len"]) - qlen.setdefault(r["query_name"], r["query_len"]) - hit: Hit = ( - r["q_start"], r["q_end"], - r["t_start"], r["t_end"], - r["strand"], r["n_match"] - ) - hits[r["target_name"]][r["query_name"]].append(hit) - - cov_hit = r["n_match"] / r["query_len"] - gap_ok = abs(r["n_match"] - r["n_align"]) <= 0.05 * r["n_match"] - if cov_hit >= cov_thr and gap_ok: - faithful[r["target_name"]][r["query_name"]].append(hit) - return hits, faithful, tlen, qlen - -# ──────────── writers ──────────── # -def _aggregate_coverage(hit_list: List[Hit], 
q_len: int) -> float: - total_nm = sum(h[5] for h in hit_list) - return min(1.0, total_nm / q_len) if q_len else 0.0 - -def write_long(h: HitDict, tlen: LenDict, qlen: LenDict, path: str): - with open(path,"w",newline="") as f: - w = csv.writer(f, delimiter="\t") - w.writerow(["target_name","target_len", - "query_name","query_len","coverage", - "num_hits","hits"]) - for tgt in sorted(h): - for qry in sorted(h[tgt]): - hl = h[tgt][qry] - cov = _aggregate_coverage(hl, qlen[qry]) - hits_str = ";".join(f"{qs}-{qe}:{ts}-{te}:{s}" - for qs,qe,ts,te,s,_ in hl) - w.writerow([tgt, tlen[tgt], - qry, qlen[qry], f"{cov:.3f}", - len(hl), hits_str]) - -def write_wide(h: HitDict, tlen: LenDict, qlen: LenDict, path: str): - all_q = sorted({q for t in h.values() for q in t}) - header=["target_name","target_len","all_one_hit"] - for q in all_q: - header.extend([f"{q}_len", f"{q}_cov", f"{q}_num_hits", f"{q}_hit_coordinates"]) - with open(path,"w",newline="") as f: - w=csv.writer(f,delimiter="\t"); w.writerow(header) - for tgt in sorted(h): - qdict=h[tgt] - num_hits=[len(qdict[q]) if q in qdict else 0 for q in all_q] - all_one=all(n==1 for n in num_hits) - row=[tgt, tlen[tgt], "TRUE" if all_one else "FALSE"] - for q,n in zip(all_q,num_hits): - row.append(qlen[q]) # _len - if n: - cov=_aggregate_coverage(qdict[q], qlen[q]) - hits_str=";".join(f"{qs}-{qe}:{ts}-{te}:{s}" - for qs,qe,ts,te,s,_ in qdict[q]) - row.extend([f"{cov:.3f}", str(n), hits_str]) + ln = ln.rstrip('\n') + if ln.startswith('>'): + if name is not None: + flush_singleton(name, buf) + name = ln[1:] + buf = [] + else: + buf.append(ln.upper()) + if name is not None: + flush_singleton(name, buf) + + # ── Multi-read groups: parse MSA, sub-haplotype, build consensus ───────── + with tarfile.open("~{msa_tarball}") as tar: + for member in tar.getmembers(): + bn = os.path.basename(member.name) + if not bn.endswith('.fasta'): + continue + try: + gid = int(bn.replace('group_', '').replace('.fasta', '')) + except ValueError: + continue + + fobj = tar.extractfile(member) + if not fobj: + continue + + msa_seqs = {} + mname, mbuf = None, [] + for ln in fobj.read().decode('utf-8').splitlines(): + if ln.startswith('>'): + if mname is not None: + msa_seqs[mname] = ''.join(mbuf) + mname = ln[1:] + mbuf = [] else: - row.extend(["", "", ""]) # cov, num_hits, coords blank - w.writerow(row) - -# ──────────── CLI ──────────── # -def main(): - ap=argparse.ArgumentParser(description="PAF summary tables with coverage") - ap.add_argument("paf"); ap.add_argument("-o","--output",default="paf_target_summary.tsv") - ap.add_argument("-c","--coverage_threshold",type=float,default=0.8) - a=ap.parse_args() - hits,faithful,tlen,qlen=load_hits(a.paf,a.coverage_threshold) - write_long(hits,tlen,qlen,a.output) - write_long(faithful,tlen,qlen,a.output.replace(".tsv","_faithful.tsv")) - write_wide(hits,tlen,qlen,a.output.replace(".tsv","_wide.tsv")) - write_wide(faithful,tlen,qlen,a.output.replace(".tsv","_wide_faithful.tsv")) - print("✓ tables with coverage written") - -if __name__=="__main__": - main() -EOF - - chmod +x parse_paf_to_table.py - - # Run the parser - python3 parse_paf_to_table.py \ - ~{parts_alignment_paf} \ - -o ~{out_prefix}.tsv \ - -c ~{coverage_threshold} + mbuf.append(ln.upper()) + if mname is not None: + msa_seqs[mname] = ''.join(mbuf) + + if not msa_seqs: + group_meta_out.write(f"{gid}\t{len(groups[gid])}\tFALSE\n") + continue + + with open(f"msa_groups/group_{gid}.fasta", 'w') as mf: + for rname, seq in msa_seqs.items(): + mf.write(f">{rname}\n{seq}\n") + + # 
Pass 1: identify error-corrected columns (indel noise filtered by + # plurality thresholds), then cluster haplotypes only on those columns. + kept_cols = retained_columns(msa_seqs) + hap_map = cluster_haplotypes(msa_seqs, kept_cols) # read -> 'A', 'B', ... + hap_labels = sorted(set(hap_map.values())) + for r, lbl in hap_map.items(): + subhap_assignments[r] = f"group_{gid}_hap_{lbl}" + + if len(hap_labels) > 1: + # Emit one consensus per haplotype + for lbl in hap_labels: + hap_seqs = {r: s for r, s in msa_seqs.items() + if hap_map[r] == lbl} + consensus = plurality_consensus(hap_seqs) + consensus_out.write( + f">group_{gid}_hap_{lbl} n_reads={len(hap_seqs)}{header_tags(gid)}\n" + f"{consensus}\n") + group_meta_out.write(f"{gid}\t{len(msa_seqs)}\tTRUE\n") + continue + + consensus = plurality_consensus(msa_seqs) + consensus_out.write( + f">group_{gid} n_reads={len(msa_seqs)}{header_tags(gid)}\n{consensus}\n") + group_meta_out.write(f"{gid}\t{len(msa_seqs)}\tTRUE\n") + +# ── Write sub-haplotype assignments ────────────────────────────────────────── +read_to_group = {r: gid for gid, rlist in groups.items() for r in rlist} +with open("~{out_prefix}_subhaps.tsv", 'w') as out: + out.write("read_name\tgroup_id\tsub_haplotype\n") + for r, hap in sorted(subhap_assignments.items()): + out.write(f"{r}\t{read_to_group.get(r, 'NA')}\t{hap}\n") + +# Tarball per-group MSA files (for diagnostic download) +with tarfile.open("~{out_prefix}_all_msa.tar.gz", "w:gz") as tar: + tar.add("msa_groups", arcname="msa_groups") + +n_multi = sum(1 for gid in set(read_to_group[r] for r in subhap_assignments) + if len({subhap_assignments[r] + for r in subhap_assignments if read_to_group.get(r) == gid}) > 1) +print(f"Consensus build complete; {len(subhap_assignments)} reads assigned sub-haplotypes, " + f"{n_multi} group(s) split by haplotype") +PYEOF >>> output { - File summary_table = "~{out_prefix}.tsv" - File faithful_table = "~{out_prefix}_faithful.tsv" - File wide_table = "~{out_prefix}_wide.tsv" - File wide_faithful_table = "~{out_prefix}_wide_faithful.tsv" + File all_msa_tarball = "~{out_prefix}_all_msa.tar.gz" + File consensus_fasta = "~{out_prefix}_consensus.fasta" + File group_meta_tsv = "~{out_prefix}_group_meta.tsv" + File subhaps_tsv = "~{out_prefix}_subhaps.tsv" + } + + runtime { + docker: "~{runtime_attributes.container_registry}/pb_wdl_base@sha256:4b889a1f21a6a7fecf18820613cf610103966a93218de772caba126ab70a8e87" + cpu: 1 + memory: "8 GiB" + disk: "50 GB" + preemptible: runtime_attributes.preemptible_tries + maxRetries: runtime_attributes.max_retries + awsBatchRetryAttempts: runtime_attributes.max_retries # !UnknownRuntimeKey + zones: runtime_attributes.zones + } +} + + +task categorize_reads { + meta { + description: "Assign final edit-outcome category to each read using structural groups, sub-haplotypes, and Pass-2 blastn evidence" + } + + parameter_meta { + filtered_reads_fasta: "Original (unclipped) filtered reads FASTA — used to enumerate all reads" + groups_tsv: "Structural group assignments (from structural_grouping)" + subhaps_tsv: "Sub-haplotype assignments (from build_group_consensus)" + pass2_alignment_tsv: "Pass-2 blastn tabular output run on unclipped filtered reads" + gap_merge_threshold: "Max gap (bp) between abutting HDR parts to call HDR_candidate" + wt_gap_threshold: "Max gap (bp) between left_HA and right_HA to call WT_candidate" + min_flank_bp: "Minimum bases flanking all part hits required to call HDR or WT (vs inconclusive)" + min_part_hit_bp: "Minimum query-span (bp) for a blastn hit 
to be counted; filters repetitive short matches" + } + + input { + File filtered_reads_fasta + File groups_tsv + File subhaps_tsv + File pass2_alignment_tsv + String sample_id + Int gap_merge_threshold = 5 + Int wt_gap_threshold = 20 + Int min_flank_bp = 50 + Int min_part_hit_bp = 100 + RuntimeAttributes runtime_attributes + } + + String out_prefix = "~{sample_id}_edit_categories" + + command <<< + set -euxo pipefail + + python3 <<'PYEOF' +from collections import defaultdict, Counter + +HDR_THR = ~{gap_merge_threshold} +WT_THR = ~{wt_gap_threshold} +MIN_HIT = ~{min_part_hit_bp} + +# HDR part order constants (used for truncated-read detection) +_FWD_HDR = [('left_HA', '+'), ('payload', '+'), ('right_HA', '+')] +_REV_HDR = [('right_HA', '-'), ('payload', '-'), ('left_HA', '-')] + +def is_partial_hdr(pairs, ordered): + """Return True iff pairs is a non-empty strict contiguous proper subset of ordered.""" + n = len(pairs) + if n == 0 or n >= len(ordered): + return False + for start in range(len(ordered) - n + 1): + if list(ordered[start:start + n]) == pairs: + return True + return False + +# ── Structural category from ordered part hits ──────────────────────────────── +# HiFi reads can be in either orientation. A reverse-strand HDR read has all +# three parts on '-' in t_start order: right_HA:- → payload:- → left_HA:-. +# WT on '-': right_HA:- → left_HA:-. +# Mixed strands → misintegration. +def structural_category(ps, gs): + """ps = [(part, strand), ...] sorted by t_start; gs = gaps between them.""" + if not ps: + return 'no_hit' + parts_list = [p for p, s in ps] + strands_list = [s for p, s in ps] + + if len(set(strands_list)) != 1: + return 'misintegration' + + strand = strands_list[0] + if strand == '+': + if parts_list == ['left_HA', 'payload', 'right_HA'] and all(abs(g) <= HDR_THR for g in gs): + return 'HDR_candidate' + if parts_list == ['left_HA', 'right_HA'] and len(gs) == 1 and abs(gs[0]) <= WT_THR: + return 'WT_candidate' + if parts_list in (['left_HA', 'payload'], ['payload', 'right_HA']) and all(abs(g) <= HDR_THR for g in gs): + return 'partial' + else: + if parts_list == ['right_HA', 'payload', 'left_HA'] and all(abs(g) <= HDR_THR for g in gs): + return 'HDR_candidate' + if parts_list == ['right_HA', 'left_HA'] and len(gs) == 1 and abs(gs[0]) <= WT_THR: + return 'WT_candidate' + if parts_list in (['payload', 'left_HA'], ['right_HA', 'payload']) and all(abs(g) <= HDR_THR for g in gs): + return 'partial' + + return 'misintegration' + +# ── Load structural groups ──────────────────────────────────────────────────── +group_info = {} # read_name -> {gid, sig, size} +with open("~{groups_tsv}") as fh: + fh.readline() + for ln in fh: + f = ln.rstrip('\n').split('\t') + if len(f) < 4: + continue + group_info[f[0]] = {'gid': int(f[1]), 'sig': f[2], 'size': int(f[3])} + +# ── Load sub-haplotype assignments ──────────────────────────────────────────── +subhap_info = {} +with open("~{subhaps_tsv}") as fh: + fh.readline() + for ln in fh: + f = ln.rstrip('\n').split('\t') + if len(f) >= 3: + subhap_info[f[0]] = f[2] + +# ── Load Pass-2 blastn hits ─────────────────────────────────────────────────── +# Collect both span tuples (for flanking) and ordered (part, strand) lists (for category). +# Two-pass: parse all hits first, then apply proximity-aware size filter. 
+# A hit is filtered out only if ALL of: +# (a) hit query-span < max(MIN_HIT, 0.40 * part_len) +# (b) not within WT_THR bp of any other hit on the same read +pass2_hits = defaultdict(list) # read_name -> [(t_start, t_end)] +hits_by_read = defaultdict(list) # read_name -> [(part, strand, t_start, t_end)] +read_lens = {} + +_raw_hits = defaultdict(list) # read_name -> [(part, strand, t_start, t_end, hit_size, qlen)] +_raw_slen = {} # read_name -> slen + +with open("~{pass2_alignment_tsv}") as fh: + for ln in fh: + if not ln.strip() or ln.startswith('#'): + continue + f = ln.rstrip('\n').split('\t') + if len(f) < 15: + continue + qname = f[0] + if not (qname.endswith('_left_HA') or qname.endswith('_right_HA') + or qname.endswith('_payload')): + continue + read_name = f[4] + qlen = int(f[1]) + hit_size = abs(int(f[3]) - int(f[2])) + sstart, send = int(f[6]), int(f[7]) + t_start = min(sstart, send) - 1 + t_end = max(sstart, send) + strand = '+' if f[13] == 'plus' else '-' + if qname.endswith('_left_HA'): part = 'left_HA' + elif qname.endswith('_right_HA'): part = 'right_HA' + else: part = 'payload' + _raw_hits[read_name].append((part, strand, t_start, t_end, hit_size, qlen)) + _raw_slen[read_name] = int(f[5]) + +def _keep_hit(hit, others): + """Return True if this hit should be kept (passes size OR is near a size-passing hit).""" + part, strand, t_start, t_end, hit_size, qlen = hit + min_size = max(MIN_HIT, int(0.40 * qlen)) + if hit_size >= min_size: + return True + # Keep if within WT_THR bp of another hit that itself passes the size threshold + for o in others: + o_hit_size, o_qlen = o[4], o[5] + o_min_size = max(MIN_HIT, int(0.40 * o_qlen)) + if o_hit_size < o_min_size: + continue + o_start, o_end = o[2], o[3] + gap = max(t_start, o_start) - min(t_end, o_end) # <=0 means overlap + if gap <= WT_THR: + return True + return False + +for read_name, raw in _raw_hits.items(): + others_by_idx = [raw[:i] + raw[i+1:] for i in range(len(raw))] + for hit, others in zip(raw, others_by_idx): + if not _keep_hit(hit, others): + continue + part, strand, t_start, t_end, hit_size, qlen = hit + pass2_hits[read_name].append((t_start, t_end)) + hits_by_read[read_name].append((part, strand, t_start, t_end)) + read_lens[read_name] = _raw_slen[read_name] + +# Merge overlapping hits to the same (part, strand) per read. +# The proximity-aware keep filter retains short sub-hits that overlap a +# full-length hit of the same part (they pass the "near a size-passing hit" +# condition). Without merging they appear as duplicate part entries in ps, +# turning a valid HDR signature into an apparent misintegration. 
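+# Illustrative (hypothetical spans): left_HA:+ hits at 0-500 and 120-380 on one
+# read merge to a single 0-500 entry, while disjoint left_HA:+ hits at 0-500 and
+# 5200-5700 stay separate (a genuine duplicated part).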
+def _merge_part_hits(hits): + by_key = defaultdict(list) + for part, strand, t_start, t_end in hits: + by_key[(part, strand)].append((t_start, t_end)) + merged = [] + for (part, strand), spans in by_key.items(): + spans.sort() + cs, ce = spans[0] + for s, e in spans[1:]: + if s <= ce: + ce = max(ce, e) + else: + merged.append((part, strand, cs, ce)) + cs, ce = s, e + merged.append((part, strand, cs, ce)) + return merged + +hits_by_read = {r: _merge_part_hits(h) for r, h in hits_by_read.items()} + +def get_ps_gs(read_name): + hits = sorted(hits_by_read[read_name], key=lambda h: h[2]) + ps = [(h[0], h[1]) for h in hits] + gs = [hits[i+1][2] - hits[i][3] for i in range(len(hits)-1)] + return ps, gs + +# ── Enumerate all reads from FASTA, collecting lengths ─────────────────────── +all_reads = [] +fasta_lens = {} # read_name -> sequence length +with open("~{filtered_reads_fasta}") as fh: + cur_name = None + cur_len = 0 + for ln in fh: + if ln.startswith('>'): + if cur_name is not None: + fasta_lens[cur_name] = cur_len + cur_name = ln[1:].split()[0] + cur_len = 0 + all_reads.append(cur_name) + else: + cur_len += len(ln.rstrip('\n')) + if cur_name is not None: + fasta_lens[cur_name] = cur_len + +MIN_FLANK = ~{min_flank_bp} + +def flanking(read_name): + hits = pass2_hits.get(read_name, []) + rlen = read_lens.get(read_name, 0) + if not hits: + return 0, 0 + return min(h[0] for h in hits), rlen - max(h[1] for h in hits) + +def categorize(read_name): + info = group_info.get(read_name) + lf, rf = flanking(read_name) + has_flank = lf >= MIN_FLANK or rf >= MIN_FLANK + + if not hits_by_read.get(read_name): + return 'inconclusive', 'no_parts_hit', '' + + ps, gs = get_ps_gs(read_name) + cat = structural_category(ps, gs) + sig = info['sig'] if info else '|'.join(f"{p}:{s}" for p, s in ps) + gid = info['gid'] if info else 'NA' + size = info['size'] if info else 1 + + if cat == 'no_hit': + return 'inconclusive', 'no_parts_hit', '' + + if cat == 'HDR_candidate': + if not has_flank: + return 'inconclusive', f'HDR_sig_no_flank|{sig}', '' + subhap = subhap_info.get(read_name, f'group_{gid}_hap_A') + evidence = f'singleton|{sig}' if size == 1 else sig + return 'HDR', evidence, subhap + + if cat == 'WT_candidate': + if not has_flank: + return 'inconclusive', f'WT_sig_no_flank|{sig}', '' + subhap = subhap_info.get(read_name, f'group_{gid}_hap_A') + return 'WT', sig, subhap + + # partial or misintegration + # A partial signature where the read is truncated on the missing-part side + # is inconclusive — the read simply ends before the edit is complete. 
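+    # (Hypothetical example: a read with hits left_HA:+ then payload:+ and only
+    # 12 bp of sequence past the payload hit — below the 50 bp min_flank_bp
+    # default — most likely ends inside the cassette, so it is reported as
+    # inconclusive/truncated_read rather than misintegration.)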
+ if lf < MIN_FLANK or rf < MIN_FLANK: + if is_partial_hdr(ps, _FWD_HDR) or is_partial_hdr(ps, _REV_HDR): + return 'inconclusive', f'truncated_read|{sig}', '' + return 'misintegration', sig, '' + +# ── Write output ────────────────────────────────────────────────────────────── +with open("~{out_prefix}.tsv", 'w') as out: + out.write("read_name\tcategory\tevidence\tsub_haplotype\t" + "group_id\tgroup_size\tread_length\tleft_flank_bp\tright_flank_bp\n") + cats = [] + for rname in all_reads: + cat, evidence, subhap = categorize(rname) + info = group_info.get(rname) + gid = info['gid'] if info else 'NA' + gsize = info['size'] if info else 0 + rlen = fasta_lens.get(rname, 0) + lf, rf = flanking(rname) + out.write(f"{rname}\t{cat}\t{evidence}\t{subhap}\t{gid}\t{gsize}\t{rlen}\t{lf}\t{rf}\n") + cats.append(cat) + +counts = Counter(cats) +print(f"Total reads : {len(all_reads)}") +for cat, n in sorted(counts.items()): + print(f" {cat}: {n}") +PYEOF + >>> + + output { + File categories_tsv = "~{out_prefix}.tsv" } runtime { docker: "~{runtime_attributes.container_registry}/pb_wdl_base@sha256:4b889a1f21a6a7fecf18820613cf610103966a93218de772caba126ab70a8e87" cpu: 2 - memory: "4 GB" + memory: "4 GiB" + disk: "10 GB" + preemptible: runtime_attributes.preemptible_tries + maxRetries: runtime_attributes.max_retries + awsBatchRetryAttempts: runtime_attributes.max_retries # !UnknownRuntimeKey + zones: runtime_attributes.zones + } +} + + +task make_wide_coverage_table { + meta { + description: "Build wide per-part coverage table from blastn part alignments (backwards-compatible replacement for parse_edit_parts)" + } + + parameter_meta { + parts_alignment_tsv: "Blastn tabular output from blastn_remap_parts" + coverage_threshold: "Minimum per-hit coverage (n_match / query_len) for faithful alignments" + min_part_hit_bp: "Minimum query-span (bp) for a blastn hit to be counted; filters repetitive short matches" + } + + input { + File parts_alignment_tsv + Float coverage_threshold = 0.8 + Int min_part_hit_bp = 100 + String sample_id + RuntimeAttributes runtime_attributes + } + + String out_prefix = "~{sample_id}_edit_parts_coverage" + + command <<< + set -euxo pipefail + + python3 <<'PYEOF' +import csv +from collections import defaultdict +from typing import Dict, List, Tuple + +# Hit tuple: (q_start, q_end, t_start, t_end, strand, n_match) +Hit = Tuple[int, int, int, int, str, int] +HitDict = Dict[str, Dict[str, List[Hit]]] +LenDict = Dict[str, int] + +# ── blastn tabular columns ──────────────────────────────────────────────────── +# 0 qseqid 1 qlen 2 qstart 3 qend 4 sseqid 5 slen 6 sstart 7 send +# 8 nident 9 mismatch 10 gapopen 11 gaps 12 length 13 sstrand 14 bitscore + +PART_SUFFIXES = ('_left_HA', '_payload', '_right_HA') +MIN_HIT = ~{min_part_hit_bp} +COV_THR = ~{coverage_threshold} + +faithful: HitDict = defaultdict(lambda: defaultdict(list)) +tlen: LenDict = {} +qlen: LenDict = {} + +with open("~{parts_alignment_tsv}") as fh: + for ln in fh: + ln = ln.rstrip("\n") + if not ln or ln.startswith("#"): + continue + f = ln.split("\t") + if len(f) < 15: + continue + qname = f[0] + if not any(qname.endswith(s) for s in PART_SUFFIXES): + continue + if abs(int(f[3]) - int(f[2])) < MIN_HIT: + continue + n_match = int(f[8]) + n_gaps = int(f[11]) + q_len = int(f[1]) + cov_hit = n_match / q_len + gap_ok = n_gaps <= 0.05 * n_match + if cov_hit < COV_THR or not gap_ok: + continue + sstart, send = int(f[6]), int(f[7]) + tname = f[4] + tlen.setdefault(tname, int(f[5])) + qlen.setdefault(qname, q_len) + hit: Hit = ( + 
int(f[2]) - 1, int(f[3]), + min(sstart, send) - 1, max(sstart, send), + "+" if f[13] == "plus" else "-", + n_match, + ) + faithful[tname][qname].append(hit) + +def _agg_cov(hit_list: List[Hit], q_len: int) -> float: + return min(1.0, sum(h[5] for h in hit_list) / q_len) if q_len else 0.0 + +all_q = sorted({q for t in faithful.values() for q in t}) +header = ["target_name", "target_len", "all_one_hit"] +for q in all_q: + header.extend([f"{q}_len", f"{q}_cov", f"{q}_num_hits", f"{q}_hit_coordinates"]) + +with open("~{out_prefix}_wide_faithful.tsv", "w", newline="") as f: + w = csv.writer(f, delimiter="\t") + w.writerow(header) + for tgt in sorted(faithful): + qdict = faithful[tgt] + num_hits = [len(qdict[q]) if q in qdict else 0 for q in all_q] + row = [tgt, tlen[tgt], "TRUE" if all(n == 1 for n in num_hits) else "FALSE"] + for q, n in zip(all_q, num_hits): + row.append(qlen[q]) + if n: + cov = _agg_cov(qdict[q], qlen[q]) + hits_str = ";".join(f"{qs}-{qe}:{ts}-{te}:{s}" + for qs, qe, ts, te, s, _ in qdict[q]) + row.extend([f"{cov:.3f}", str(n), hits_str]) + else: + row.extend(["", "", ""]) + w.writerow(row) + +print(f"Wide faithful table: {len(faithful)} reads, {len(all_q)} parts") +PYEOF + >>> + + output { + File wide_faithful_table = "~{out_prefix}_wide_faithful.tsv" + } + + runtime { + docker: "~{runtime_attributes.container_registry}/pb_wdl_base@sha256:4b889a1f21a6a7fecf18820613cf610103966a93218de772caba126ab70a8e87" + cpu: 1 + memory: "4 GiB" disk: "10 GB" preemptible: runtime_attributes.preemptible_tries maxRetries: runtime_attributes.max_retries @@ -392,3 +1585,164 @@ EOF } } +task align_consensus_msa { + meta { + description: "Align child + parent consensus sequences plus the expected HDR template into a single MAFFT alignment. Reverse-strand consensus sequences are reverse-complemented so all inputs share a common orientation. Group consensus records tagged category=partial are omitted (partial-part signatures do not represent a complete observation of an allele)." + } + + parameter_meta { + child_consensus_fasta: "Consensus FASTA from child sample (build_group_consensus); headers must contain strand=+/- and category=HDR|WT|partial|misintegration" + parent_consensus_fasta: "Consensus FASTA from parent sample (build_group_consensus); headers must contain strand=+/- and category=..." 
+ child_sample_id: "Sample ID used to prefix child sequence names in the MSA" + parent_sample_id: "Sample ID used to prefix parent sequence names in the MSA" + crispr_edit_json: "CRISPR edit JSON; left_ha + payload + right_ha are concatenated and included in the MSA as the expected HDR template" + } + + input { + File child_consensus_fasta + File parent_consensus_fasta + String child_sample_id + String parent_sample_id + File crispr_edit_json + RuntimeAttributes runtime_attributes + } + + String out_prefix = "~{child_sample_id}_vs_~{parent_sample_id}_consensus" + + command <<< + set -euxo pipefail + + OUT="~{out_prefix}" + INPUT="${OUT}_input.fasta" + + # Extract left_ha/payload/right_ha from crispr_edit_json (DNA strings, no embedded quotes) + extract_field() { + grep -oE "\"$1\"[[:space:]]*:[[:space:]]*\"[^\"]*\"" "~{crispr_edit_json}" \ + | head -1 \ + | sed -E 's/^[^:]*:[[:space:]]*"([^"]*)"$/\1/' + } + LEFT_HA=$(extract_field left_ha) + PAYLOAD=$(extract_field payload) + RIGHT_HA=$(extract_field right_ha) + HDR_TEMPLATE=$(printf '%s%s%s' "$LEFT_HA" "$PAYLOAD" "$RIGHT_HA" | tr 'acgtn' 'ACGTN') + + : > "$INPUT" + if [[ -n "$HDR_TEMPLATE" ]]; then + printf ">HDR_expected\n%s\n" "$HDR_TEMPLATE" >> "$INPUT" + fi + echo "HDR template: ${#HDR_TEMPLATE} bp" >&2 + + # Parse a consensus FASTA: skip category=partial, RC strand=-, prefix names with label. + parse_consensus() { + local fasta=$1 label=$2 + # Use first token of label (e.g. czML910 from czML910_Parental_Ngn2-KOLF) as + # name prefix so CLUSTAL 30-char name column fits group/reads/category info. + local prefix + prefix=$(echo "$label" | cut -d_ -f1) + awk -v label="$label" -v prefix="$prefix" ' + function rc_char(c) { + if (c=="A") return "T"; if (c=="T") return "A" + if (c=="C") return "G"; if (c=="G") return "C" + return "N" + } + function rc(s, i,r) { + r="" + for (i=length(s); i>=1; i--) r = r rc_char(substr(s,i,1)) + return r + } + function flush( out, short_name) { + if (name == "" || seq == "") return + if (cat == "partial") { + print " skip partial: " label "_" name > "/dev/stderr" + return + } + out = toupper(seq) + if (strand == "-") out = rc(out) + # Name: {prefix}_{cat}{groupnum}[{hap}] [n] + # e.g. czML1098_HDR1A [3], czML910_WT2 [17] + # misintegration abbreviated to mis; singleton appended as s + short_name = prefix "_" cat_short groupnum hap_suffix singleton_suffix "x" nreads "" + print ">" short_name + print out + kept++ + } + /^>/ { + flush() + name=""; strand=""; cat=""; cat_short=""; nreads="?"; seq="" + groupnum=""; hap_suffix=""; singleton_suffix="" + header=substr($0, 2) + if (match(header, /group_[0-9]+(_hap_[A-Za-z0-9_]+)?(_singleton)?/)) + name = substr(header, RSTART, RLENGTH) + if (match(header, /strand=[+-]/)) + strand = substr(header, RSTART+7, 1) + if (match(header, /category=[A-Za-z_]+/)) + cat = substr(header, RSTART+9, RLENGTH-9) + if (match(header, /n_reads=[0-9]+/)) + nreads = substr(header, RSTART+8, RLENGTH-8) + # Extract group number + groupnum = name + gsub(/^.*group_/, "", groupnum) + gsub(/[^0-9].*$/, "", groupnum) + # Extract hap letter(s): _hap_A -> A, absent -> "" + hap_suffix = "" + if (match(name, /_hap_[A-Za-z0-9]+/)) + hap_suffix = substr(name, RSTART+5, RLENGTH-5) + # Singleton marker + singleton_suffix = (name ~ /_singleton/) ? 
"s" : "" + # Category abbreviation + cat_short = cat + if (cat == "misintegration") cat_short = "m" + if (cat == "partial") cat_short = "p" + if (cat == "HDR") cat_short = "h" + if (cat == "WT") cat_short = "w" + if (name == "" || strand == "") name = "" + next + } + { seq = seq $0 } + END { + flush() + print label ": " kept+0 " non-partial records" > "/dev/stderr" + } + ' "$fasta" + } + + parse_consensus "~{parent_consensus_fasta}" "~{parent_sample_id}" >> "$INPUT" + parse_consensus "~{child_consensus_fasta}" "~{child_sample_id}" >> "$INPUT" + + OUTPUT_ALN="${OUT}.aln" + + nseq=$(grep -c '^>' "$INPUT" || true) + + if [[ "$nseq" -eq 0 ]]; then + echo "No sequences to align" >&2 + printf "CLUSTAL W\n\n" > "$OUTPUT_ALN" + elif [[ "$nseq" -eq 1 ]]; then + name=$(grep '^>' "$INPUT" | head -1 | sed 's/^>//') + seq=$(grep -v '^>' "$INPUT" | tr -d '\n') + stars=$(printf '%*s' "${#seq}" '' | tr ' ' '*') + printf "CLUSTAL W\n\n%-30s %s\n%-30s %s\n" \ + "${name:0:30}" "$seq" "" "$stars" > "$OUTPUT_ALN" + else + # E-INS-i: generalized affine gap cost (--genafpair) with --ep 0 allows + # single large internal gaps cleanly, which is needed when parent WT + # sequences lack the ~1kb payload and must gap over it as one block. + mafft --ep 0 --genafpair --maxiterate 1000 --quiet --nuc --clustalout "$INPUT" > "$OUTPUT_ALN" + fi + echo "${nseq} sequences → ${OUTPUT_ALN}" + >>> + + output { + File? consensus_msa_clustal = "~{out_prefix}.aln" + } + + runtime { + docker: "quay.io/biocontainers/mafft@sha256:b8ccc402a9156868fefda5caa8693f847b47dcc69aae848557f7115850e60790" + cpu: 4 + memory: "16 GiB" + disk: "20 GB" + preemptible: runtime_attributes.preemptible_tries + maxRetries: runtime_attributes.max_retries + awsBatchRetryAttempts: runtime_attributes.max_retries # !UnknownRuntimeKey + zones: runtime_attributes.zones + } +} diff --git a/workflows/edit-qc/knock_knock.wdl b/workflows/edit-qc/knock_knock.wdl index b9134091..b40c4697 100644 --- a/workflows/edit-qc/knock_knock.wdl +++ b/workflows/edit-qc/knock_knock.wdl @@ -56,7 +56,9 @@ guide_seq = edit_info['guide'] left_ha = edit_info.get('left_ha', '') payload = edit_info.get('payload', '') right_ha = edit_info.get('right_ha', '') -donor_seq = left_ha + payload + right_ha +ctx_before = edit_info.get('donor_context_before', '') +ctx_after = edit_info.get('donor_context_after', '') +donor_seq = ctx_before + left_ha + payload + right_ha + ctx_after donor_name = f"{symbol}_donor" for fname, val in [ @@ -80,7 +82,7 @@ with open('donor.fasta', 'w') as f: print(f"Target: {symbol} at {chrom}:{t_start}-{t_end}", file=sys.stderr) print(f"Guide: {guide_seq}", file=sys.stderr) -print(f"Donor: {len(donor_seq)} bp ({len(left_ha)} HA_L + {len(payload)} payload + {len(right_ha)} HA_R)", file=sys.stderr) +print(f"Donor: {len(donor_seq)} bp ({len(ctx_before)} ctx_5p + {len(left_ha)} HA_L + {len(payload)} payload + {len(right_ha)} HA_R + {len(ctx_after)} ctx_3p)", file=sys.stderr) PYEOF SYMBOL=$(cat symbol.txt) @@ -146,6 +148,88 @@ CSVEOF cat "${BASE}/data/${BATCH}/sample_sheet.csv" >&2 + # ── 3b. Patch knock-knock: guard against reads with zero alignments ──────── + # Upstream bug: ReadDiagram.draw_reference() in visualize/architecture.py + # indexes alignment_coordinates[0] in the 'centered on primers' branch + # without a length check, crashing with IndexError on unaligned reads. + # The conda site-packages dir is read-only in the container, so patch at + # runtime via a sitecustomize.py on PYTHONPATH that monkey-patches the + # method when the module is imported. 
+ KK_PATCH_DIR="$(pwd)/kk_patch" + mkdir -p "${KK_PATCH_DIR}" + cat > "${KK_PATCH_DIR}/sitecustomize.py" <<'PYEOF' +# Runtime monkey-patch for knock_knock.visualize.architecture.ReadDiagram. +# Wraps draw_reference so that an IndexError from alignment_coordinates[0] +# on a read with no alignments is caught and that ref_name is simply skipped. +import sys + +def _install(): + try: + from knock_knock.visualize import architecture as _arch + except Exception: + return + RD = getattr(_arch, "ReadDiagram", None) + if RD is None or getattr(RD.draw_reference, "_kk_patched", False): + return + _orig = RD.draw_reference + def draw_reference(self, ref_name, ref_y, *args, **kwargs): + try: + return _orig(self, ref_name, ref_y, *args, **kwargs) + except IndexError as e: + print(f"[kk_patch] skipping draw_reference for {ref_name!r}: {e}", + file=sys.stderr) + return None + draw_reference._kk_patched = True + RD.draw_reference = draw_reference + +def _install_num_examples(): + # Raise default num_examples for generate_all_outcome_example_figures + # from 10 → 100 so more per-category example diagrams are emitted. + try: + from knock_knock import experiment as _exp + except Exception: + return + Exp = getattr(_exp, "Experiment", None) + if Exp is None: + return + fn = getattr(Exp, "generate_all_outcome_example_figures", None) + if fn is None or getattr(fn, "_kk_patched", False): + return + def generate_all_outcome_example_figures(self, num_examples=100, **kwargs): + return fn(self, num_examples=num_examples, **kwargs) + generate_all_outcome_example_figures._kk_patched = True + Exp.generate_all_outcome_example_figures = generate_all_outcome_example_figures + +def _hook(modname, installer): + if modname in sys.modules: + installer() + return + import importlib.abc, importlib.util + class _Hook(importlib.abc.MetaPathFinder): + def find_spec(self, name, path, target=None): + if name == modname: + sys.meta_path.remove(self) + spec = importlib.util.find_spec(name) + if spec is not None: + loader = spec.loader + _exec = loader.exec_module + def exec_module(module): + _exec(module) + installer() + loader.exec_module = exec_module + return None + return None + sys.meta_path.insert(0, _Hook()) + +# Install now if already imported; otherwise hook import. +_install() +_hook("knock_knock.visualize.architecture", _install) +_install_num_examples() +_hook("knock_knock.experiment", _install_num_examples) +PYEOF + + export PYTHONPATH="${KK_PATCH_DIR}${PYTHONPATH:+:${PYTHONPATH}}" + # ── 4. Build strategies ──────────────────────────────────────────────────── # Reads sample sheet, aligns protospacer into ctx.fa, extracts the target # window, and writes strategies/{genome}_{symbol}/target.gb + parameters.yaml. diff --git a/workflows/edit-qc/loh.wdl b/workflows/edit-qc/loh.wdl index 75ab3ccf..13c37e54 100644 --- a/workflows/edit-qc/loh.wdl +++ b/workflows/edit-qc/loh.wdl @@ -17,6 +17,7 @@ task check_loh { expected_cut_site: "Expected cut site in chr:pos format (e.g. 
chr15:65869582)" cut_strand: "Strand of the guide at the cut site (+ or -)" window_size: "Window size in bp upstream and downstream of the cut site (default 200000)" + chrom_window_size: "Window size in bp for chromosome-wide het bedgraph (default 1000000)" } input { @@ -28,6 +29,7 @@ task check_loh { String expected_cut_site String cut_strand Int window_size = 200000 + Int chrom_window_size = 100000 Int mem_gb = 8 RuntimeAttributes runtime_attributes } @@ -136,6 +138,67 @@ with open(f"{sample_id}_loh_summary.tsv", "w") as fh: fh.write(f"downstream_both_het\t{dn_num}\n") fh.write(f"downstream_het_intact_ratio\t{dn_ratio_str}\n") fh.write(f"downstream_het_intact_pct\t{dn_pct:.6f}\n") + +# --------------------------------------------------------------------------- +# Chromosome-wide windowed het analysis → bedgraph +# --------------------------------------------------------------------------- +chrom_window_size = ~{chrom_window_size} + +# Get chromosome length from VCF header (for clean window ends) +import re +header_result = subprocess.run( + ["bcftools", "view", "-h", joint_vcf], + capture_output=True, check=True, +) +chrom_len = None +for line in header_result.stdout.decode().splitlines(): + if line.startswith("##contig=") and f"ID={chrom}," in line: + m = re.search(r"length=(\d+)", line) + if m: + chrom_len = int(m.group(1)) + break + +print(f"Chromosome {chrom} length: {chrom_len}", file=sys.stderr) + +# Dump all variant positions + GTs on the chromosome in one bcftools pass +query_result = subprocess.run( + ["bcftools", "query", "-r", chrom, "-f", "%POS[\t%GT]\n", joint_vcf], + capture_output=True, check=True, +) + +def is_het(gt): + alleles = gt.replace("|", "/").split("/") + return len(set(alleles)) > 1 and "." not in alleles + +from collections import defaultdict +window_parent_het = defaultdict(int) +window_both_het = defaultdict(int) + +for line in query_result.stdout.decode().splitlines(): + parts = line.strip().split("\t") + if len(parts) < 2: + continue + pos = int(parts[0]) + gts = parts[1:] + if parent_sample_idx >= len(gts) or sample_idx >= len(gts): + continue + if is_het(gts[parent_sample_idx]): + win = pos // chrom_window_size + window_parent_het[win] += 1 + if is_het(gts[sample_idx]): + window_both_het[win] += 1 + +bedgraph_file = f"{sample_id}_loh_chrom_het.bedgraph" +with open(bedgraph_file, "w") as fh: + for win in sorted(window_parent_het): + start = win * chrom_window_size + end = min((win + 1) * chrom_window_size, chrom_len) if chrom_len else (win + 1) * chrom_window_size + phet = window_parent_het[win] + bhet = window_both_het[win] + frac = bhet / phet + fh.write(f"{chrom}\t{start}\t{end}\t{frac:.6f}\n") + +print(f"Wrote bedgraph: {bedgraph_file} ({len(window_parent_het)} windows)", file=sys.stderr) PYEOF >>> @@ -145,6 +208,7 @@ PYEOF Float het_intact_upstream_pct = read_float("~{sample_id}_loh_het_intact_upstream_pct.txt") Float het_intact_downstream_pct = read_float("~{sample_id}_loh_het_intact_downstream_pct.txt") File loh_summary = "~{sample_id}_loh_summary.tsv" + File loh_chrom_bedgraph = "~{sample_id}_loh_chrom_het.bedgraph" } runtime { diff --git a/workflows/edit-qc/truvari_parent_filter.wdl b/workflows/edit-qc/truvari_parent_filter.wdl index 7f83d374..4d645ff6 100644 --- a/workflows/edit-qc/truvari_parent_filter.wdl +++ b/workflows/edit-qc/truvari_parent_filter.wdl @@ -104,7 +104,7 @@ task bcftools_split_for_truvari { bcftools +split -W \ -Oz \ -o truvari_merge_split \ - -e 'GT=".|." || GT="./." || GT="." || GT="ref"' + -e 'GT=".|." || GT="./." || GT="." 
|| GT="./0" || GT="0/." || GT="ref"' for file in truvari_merge_split/*.vcf.gz; do mv "$file" "${file%.vcf.gz}~{out_suffix}.vcf.gz" @@ -234,6 +234,12 @@ task filter_parent_variants { parent_index: { name: "Parent index in consistency table" } + family_merged_vcf: { + name: "Full multi-sample merged VCF (pre-split); when provided along with parent_sample_id, parent genotype fields are added as INFO annotations" + } + parent_sample_id: { + name: "Parent sample ID to extract from family_merged_vcf for parent genotype annotations" + } runtime_attributes: { name: "Runtime attribute structure" } @@ -251,12 +257,14 @@ task filter_parent_variants { File truvari_consistency_tsv Int sample_index Int parent_index = 999 + File? family_merged_vcf + String? parent_sample_id RuntimeAttributes runtime_attributes } Int threads = 4 Int mem_gb = 8 - Int disk_size = ceil(size(sample_vcf, "GB") * 2 + size(truvari_consistency_tsv, "GB") + 20) + Int disk_size = ceil(size(sample_vcf, "GB") * 2 + size(truvari_consistency_tsv, "GB") + size(family_merged_vcf, "GB") + 20) String filtered_consistency_tsv = sub(basename(truvari_consistency_tsv), "\\.tsv$", "") + ".parent_filtered.tsv" String filtered_vcf_file = sub(basename(sample_vcf), "\\.vcf.gz$", "") + ".parent_filtered.vcf.gz" String filtered_vcf_index_file = "~{filtered_vcf_file}.tbi" @@ -291,6 +299,32 @@ EOF -h truvari_consistency.hdr \ -Ob \ ~{sample_vcf} | bcftools filter -i "FLAG>0" -Ob | bcftools sort -Oz -Wtbi -o ~{filtered_vcf_file} + + # Annotate with parent genotype fields if family merged VCF and parent sample ID are available + if [[ -n "~{default="" parent_sample_id}" && -n "~{default="" family_merged_vcf}" ]]; then + cat > parent_info.hdr <<-'EOF' +##INFO= +##INFO= +##INFO= +##INFO= +EOF + + bcftools view -s "~{default="" parent_sample_id}" "~{default="" family_merged_vcf}" | \ + bcftools +fill-tags -- -t AN,AC | \ + bcftools sort --max-mem "~{mem_gb/2}G" | \ + bcftools query -f '%CHROM\t%POS\t%REF\t%ALT\t[%GT]\t[%AD]\t%AN\t%AC\n' | \ + bgzip > parent_annotations.tsv.gz + tabix -s1 -b2 -e2 parent_annotations.tsv.gz + + bcftools annotate \ + -a parent_annotations.tsv.gz \ + -c 'CHROM,POS,REF,ALT,INFO/parent_GT,INFO/parent_AD,INFO/parent_AN,INFO/parent_AC' \ + -h parent_info.hdr \ + ~{filtered_vcf_file} \ + -Oz -Wtbi -o tmp_parent_annotated.vcf.gz + mv tmp_parent_annotated.vcf.gz ~{filtered_vcf_file} + mv tmp_parent_annotated.vcf.gz.tbi ~{filtered_vcf_index_file} + fi else # No parent specified, copy original VCF cp ~{sample_vcf} ~{filtered_vcf_file} diff --git a/workflows/family.wdl b/workflows/family.wdl index e3b7d6fc..94b8ed9f 100755 --- a/workflows/family.wdl +++ b/workflows/family.wdl @@ -13,6 +13,7 @@ import "edit-qc/bcftools_aux.wdl" as Bcftools_aux import "edit-qc/bcftools_norm.wdl" as Bcftools_norm import "edit-qc/truvari_parent_filter.wdl" as TruvariParentFilter import "edit-qc/crispr_edit_qc.wdl" as CrisprEditQC +import "edit-qc/crispr_edit_tasks.wdl" as CrisprEditTasks import "edit-qc/crispr_edit_consensus.wdl" as CrisprEditConsensus import "edit-qc/plot_cnv.wdl" as PlotCNV import "edit-qc/offtarget.wdl" as OffTarget @@ -255,7 +256,8 @@ workflow humanwgs_family { scatter (sample_index in range(length(family.samples))) { call Bcftools_aux.bcftools_merge_assembly_align as merge_sv_vcfs_align_assembly { input: - vcfs = select_all([upstream.large_sv_filtered_vcf[sample_index], select_all(select_first([joint.split_joint_structural_variant_vcfs,upstream.sv_vcf]))[sample_index]]), + vcfs = select_all([upstream.large_sv_filtered_vcf[sample_index], 
select_all(select_first([joint.split_joint_structural_variant_vcfs,upstream.sv_vcf]))[sample_index]]), + sample_id = sample_id[sample_index], out_prefix = "~{sample_id[sample_index]}.merged_structural_variants", runtime_attributes = default_runtime_attributes } @@ -323,8 +325,6 @@ workflow humanwgs_family { ############################################################# # Lookup maps for downstream BAM access by sample ID - Map[String, File] bam_files_by_id = as_map(zip(sample_id, downstream.merged_haplotagged_bam)) - Map[String, File] bam_index_by_id = as_map(zip(sample_id, downstream.merged_haplotagged_bam_index)) Map[String, Int] sidx_by_id = as_map(zip(sample_id, range(length(sample_id)))) scatter (sample_index in range(length(family.samples))) { @@ -353,48 +353,64 @@ workflow humanwgs_family { scatter (sample_index in range(length(family.samples))) { # Consensus-based CRISPR edit annotation - runs only if expected_edit is provided if (defined(family.samples[sample_index].expected_edit)) { + # Annotate MSA group consensus sequences with edit parts call CrisprEditConsensus.crispr_edit_consensus { input: - sample_id = sample_id[sample_index], - haplotagged_bam = downstream.merged_haplotagged_bam[sample_index], - haplotagged_bam_index = downstream.merged_haplotagged_bam_index[sample_index], - ref_fasta = ref_map["fasta"], # !FileCoercion - ref_fasta_index = ref_map["fasta_index"], # !FileCoercion - small_variant_vcf = downstream.phased_small_variant_vcf[sample_index], - small_variant_vcf_index = downstream.phased_small_variant_vcf_index[sample_index], - sv_vcf = downstream.phased_sv_vcf[sample_index], - sv_vcf_index = downstream.phased_sv_vcf_index[sample_index], - crispr_edit_json = select_first([family.samples[sample_index].expected_edit]), - runtime_attributes = default_runtime_attributes + sample_id = sample_id[sample_index], + consensus_fasta = select_first([crispr_edit_qc.consensus_fasta[sample_index]]), + crispr_edit_json = select_first([family.samples[sample_index].expected_edit]), + runtime_attributes = default_runtime_attributes } - # knock-knock HDR outcome classification + # knock-knock HDR outcome classification (uses edit-QC filtered reads, no BAM dependency) call KnockKnock.knock_knock { input: sample_id = sample_id[sample_index], crispr_edit_json = select_first([family.samples[sample_index].expected_edit]), - region_reads_fastq = crispr_edit_consensus.region_reads_fastq, + region_reads_fastq = select_first([crispr_edit_qc.filtered_reads_fastq[sample_index]]), ref_fasta = ref_map["fasta"], # !FileCoercion ref_fasta_index = ref_map["fasta_index"], # !FileCoercion runtime_attributes = default_runtime_attributes } - # Parent consensus annotation - for wild type comparison at same locus + # Parent: run edit QC at same locus (child's edit JSON) then annotate consensus if (defined(resolved_parent_id[sample_index])) { Int parent_idx_for_consensus = sidx_by_id[select_first([resolved_parent_id[sample_index]])] + + # Extract parent reads overlapping the edit region from the haplotagged BAM — + # much faster than running the full minimap2 filter on the whole-genome FASTA. 
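+            # (Illustrative only, not the task's actual command; conceptually the
+            # extraction is a simple indexed region fetch, e.g. in Python:
+            #     import pysam
+            #     bam = pysam.AlignmentFile("parent_haplotagged.bam")  # hypothetical path
+            #     region_reads = bam.fetch(chrom, start, end)          # region = edit-JSON cut site +/- padding
+            # See crispr_edit_tasks.wdl for the real implementation.)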
+            call CrisprEditTasks.extract_region_reads_fasta as parent_region_reads {
+                input:
+                    haplotagged_bam = downstream.merged_haplotagged_bam[parent_idx_for_consensus],
+                    haplotagged_bam_index = downstream.merged_haplotagged_bam_index[parent_idx_for_consensus],
+                    crispr_edit_json = select_first([family.samples[sample_index].expected_edit]),
+                    sample_id = select_first([resolved_parent_id[sample_index]]),
+                    runtime_attributes = default_runtime_attributes
+            }
+
+            call CrisprEditQC.crispr_edit_qc as parent_edit_qc {
+                input:
+                    sample_id = select_first([resolved_parent_id[sample_index]]),
+                    pre_filtered_reads_fasta = parent_region_reads.region_reads_fasta,
+                    crispr_edit_json = select_first([family.samples[sample_index].expected_edit]),
+                    runtime_attributes = default_runtime_attributes
+            }

            call CrisprEditConsensus.crispr_edit_consensus as parent_consensus {
                input:
-                    sample_id = select_first([resolved_parent_id[sample_index]]),
-                    haplotagged_bam = bam_files_by_id[select_first([resolved_parent_id[sample_index]])],
-                    haplotagged_bam_index = bam_index_by_id[select_first([resolved_parent_id[sample_index]])],
-                    ref_fasta = ref_map["fasta"], # !FileCoercion
-                    ref_fasta_index = ref_map["fasta_index"], # !FileCoercion
-                    small_variant_vcf = downstream.phased_small_variant_vcf[parent_idx_for_consensus],
-                    small_variant_vcf_index = downstream.phased_small_variant_vcf_index[parent_idx_for_consensus],
-                    sv_vcf = downstream.phased_sv_vcf[parent_idx_for_consensus],
-                    sv_vcf_index = downstream.phased_sv_vcf_index[parent_idx_for_consensus],
-                    crispr_edit_json = select_first([family.samples[sample_index].expected_edit]),
-                    runtime_attributes = default_runtime_attributes
+                    sample_id = select_first([resolved_parent_id[sample_index]]),
+                    consensus_fasta = select_first([parent_edit_qc.consensus_fasta]),
+                    crispr_edit_json = select_first([family.samples[sample_index].expected_edit]),
+                    runtime_attributes = default_runtime_attributes
+            }
+
+            call CrisprEditTasks.align_consensus_msa {
+                input:
+                    child_consensus_fasta = select_first([crispr_edit_qc.consensus_fasta[sample_index]]),
+                    parent_consensus_fasta = select_first([parent_edit_qc.consensus_fasta]),
+                    child_sample_id = sample_id[sample_index],
+                    parent_sample_id = select_first([resolved_parent_id[sample_index]]),
+                    crispr_edit_json = select_first([family.samples[sample_index].expected_edit]),
+                    runtime_attributes = default_runtime_attributes
+            }
        }
    }
@@ -555,6 +571,8 @@ workflow humanwgs_family {
                truvari_consistency_tsv = truvari_consistency_small_variants.truvari_consistency_tsv,
                sample_index = sample_index,
                parent_index = parent_index,
+                family_merged_vcf = if parent_index != 999 then truvari_collapse_small_variants.truvari_merge_vcf else None,
+                parent_sample_id = if parent_index != 999 then resolved_parent_id[sample_index] else None,
                runtime_attributes = default_runtime_attributes
        }
@@ -565,21 +583,38 @@
                truvari_consistency_tsv = truvari_consistency_sv.truvari_consistency_tsv,
                sample_index = sample_index,
                parent_index = parent_index,
+                family_merged_vcf = if parent_index != 999 then truvari_collapse_sv.truvari_merge_vcf else None,
+                parent_sample_id = if parent_index != 999 then resolved_parent_id[sample_index] else None,
                runtime_attributes = default_runtime_attributes
        }

+        # Quality filtering: FILTER=PASS plus GQ>=20 and VAF>0.1 (PAV assembly SVs use COV_PROP instead), applied per-sample after parent filtering
+        call Bcftools_aux.bcftools_qual_filter as qual_filter_small_variants {
+            input:
+                vcf = parent_filter_small_variants.filtered_vcf,
+                out_prefix = 
"~{sample_id[sample_index]}.small_variants.qual_filtered", + runtime_attributes = default_runtime_attributes + } + + call Bcftools_aux.bcftools_qual_filter as qual_filter_sv { + input: + vcf = parent_filter_sv.filtered_vcf, + out_prefix = "~{sample_id[sample_index]}.sv.qual_filtered", + runtime_attributes = default_runtime_attributes + } + # AnnotSV after split because we need its tsv conversion call Somatic_annotation.annotsv as annotate_parent_filter_sv { input: - sv_vcf = parent_filter_sv.filtered_vcf, - sv_vcf_index = parent_filter_sv.filtered_vcf_index, + sv_vcf = qual_filter_sv.filtered_vcf, + sv_vcf_index = qual_filter_sv.filtered_vcf_index, annotsv_cache = somatic_map["annotsv_cache"], # !FileCoercion threads = 2 } call Somatic_annotation.prioritize_small_variants as prioritizeSomatic { input: - vep_annotated_vcf = parent_filter_small_variants.filtered_vcf, + vep_annotated_vcf = qual_filter_small_variants.filtered_vcf, threads = 2, sample = family.samples[sample_index].sample_id } @@ -601,6 +636,18 @@ workflow humanwgs_family { runtime_attributes = default_runtime_attributes } + # Count variants after GQ/VAF filtering + call Bcftools_aux.count_vcf_variants as count_qual_filtered_small_variants { + input: + vcf = qual_filter_small_variants.filtered_vcf, + runtime_attributes = default_runtime_attributes + } + call Bcftools_aux.count_vcf_variants as count_qual_filtered_sv { + input: + vcf = qual_filter_sv.filtered_vcf, + runtime_attributes = default_runtime_attributes + } + # Count variants after concern annotation call Bcftools_aux.count_tsv_rows as count_concerning_snv { input: @@ -653,6 +700,8 @@ workflow humanwgs_family { flatten([['quality_filtered_SV_count'], select_first([count_tertiary_sv.variant_count, []])]), flatten([['parent_filtered_SNV_count'], select_first([count_parent_filtered_snv.variant_count, []])]), flatten([['parent_filtered_SV_count'], select_first([count_parent_filtered_sv.variant_count, []])]), + flatten([['qual_filtered_small_variant_count'], select_first([count_qual_filtered_small_variants.variant_count, []])]), + flatten([['qual_filtered_SV_count'], select_first([count_qual_filtered_sv.variant_count, []])]), flatten([['concerning_SNV_count'], select_first([count_concerning_snv.row_count, []])]), flatten([['concerning_SV_count'], select_first([count_concerning_sv.row_count, []])]) ] @@ -835,13 +884,20 @@ workflow humanwgs_family { Array[File]? parent_filtered_sv_vcf = parent_filter_sv.filtered_vcf Array[File]? parent_filtered_sv_vcf_index = parent_filter_sv.filtered_vcf_index + Array[File]? qual_filtered_small_variant_vcf = qual_filter_small_variants.filtered_vcf + Array[File]? qual_filtered_small_variant_vcf_index = qual_filter_small_variants.filtered_vcf_index + Array[File]? qual_filtered_sv_vcf = qual_filter_sv.filtered_vcf + Array[File]? qual_filtered_sv_vcf_index = qual_filter_sv.filtered_vcf_index + # Variant count summary stats at each filtering stage - Array[String]? stat_quality_filtered_SNV_count = count_tertiary_snv.variant_count - Array[String]? stat_quality_filtered_SV_count = count_tertiary_sv.variant_count - Array[String]? stat_parent_filtered_SNV_count = count_parent_filtered_snv.variant_count - Array[String]? stat_parent_filtered_SV_count = count_parent_filtered_sv.variant_count - Array[String]? stat_concerning_SNV_count = count_concerning_snv.row_count - Array[String]? stat_concerning_SV_count = count_concerning_sv.row_count + Array[String]? stat_quality_filtered_SNV_count = count_tertiary_snv.variant_count + Array[String]? 
stat_quality_filtered_SV_count = count_tertiary_sv.variant_count + Array[String]? stat_parent_filtered_SNV_count = count_parent_filtered_snv.variant_count + Array[String]? stat_parent_filtered_SV_count = count_parent_filtered_sv.variant_count + Array[String]? stat_qual_filtered_small_variant_count = count_qual_filtered_small_variants.variant_count + Array[String]? stat_qual_filtered_SV_count = count_qual_filtered_sv.variant_count + Array[String]? stat_concerning_SNV_count = count_concerning_snv.row_count + Array[String]? stat_concerning_SV_count = count_concerning_sv.row_count # Somatic SV calling # Array[File] Severus_somatic_vcf = select_all(select_first([phased_severus.output_vcf])) @@ -885,14 +941,21 @@ workflow humanwgs_family { # CRISPR edit QC outputs (read-based) Array[File?] edit_qc_filtered_reads_fasta = crispr_edit_qc.filtered_reads_fasta - Array[File?] edit_qc_parts_alignment_paf = crispr_edit_qc.parts_alignment_paf + Array[File?] edit_qc_parts_alignment_tsv = crispr_edit_qc.parts_alignment_tsv Array[File?] edit_qc_wide_faithful_tsv = crispr_edit_qc.wide_faithful_table + Array[File?] edit_qc_consensus_fasta = crispr_edit_qc.consensus_fasta + Array[File?] edit_qc_categories_tsv = crispr_edit_qc.categories_tsv # CRISPR edit consensus annotation outputs Array[File?] edit_consensus_genbank = crispr_edit_consensus.genbank_annotations Array[File?] edit_consensus_summary = crispr_edit_consensus.annotation_summary - Array[File?] parent_consensus_genbank = parent_consensus.genbank_annotations - Array[File?] parent_consensus_summary = parent_consensus.annotation_summary + + # Parent edit QC outputs (WT at same locus, using child's edit JSON) + Array[File?] parent_edit_qc_categories_tsv = parent_edit_qc.categories_tsv + Array[File?] parent_edit_qc_consensus_fasta = parent_edit_qc.consensus_fasta + Array[File?] edit_consensus_msa_clustal = align_consensus_msa.consensus_msa_clustal + Array[File?] parent_consensus_genbank = parent_consensus.genbank_annotations + Array[File?] parent_consensus_summary = parent_consensus.annotation_summary # knock-knock HDR outcome classification outputs Array[File?] knock_knock_outcome_list_txt = knock_knock.outcome_list_txt @@ -907,6 +970,7 @@ workflow humanwgs_family { Array[Float?] loh_het_intact_upstream_pct = check_loh.het_intact_upstream_pct Array[Float?] loh_het_intact_downstream_pct = check_loh.het_intact_downstream_pct Array[File?] loh_summary = check_loh.loh_summary + Array[File?] loh_chrom_bedgraph = check_loh.loh_chrom_bedgraph # Off-target analysis outputs Array[String?] predicted_cut_site = convert_offtarget_to_bed.expected_cut_site
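Note on the check_loh chromosome-wide bedgraph added above: each window's value is
the fraction of sites that are het in the parent and remain het in the edited
sample (both_het / parent_het), so runs of windows near 0.0 flag candidate LOH
tracts; windows with no parent-het sites are omitted entirely. A minimal
standalone sketch of that windowing logic (illustrative only; the toy GT records,
chromosome name, and window size are assumptions, not workflow code):

    from collections import defaultdict

    WINDOW = 100_000  # bp, the task's chrom_window_size default

    # (pos, parent_GT, child_GT) toy records standing in for bcftools query output
    records = [(1_210_000, "0/1", "0/1"), (1_250_000, "0/1", "1/1"),
               (2_510_000, "0|1", "0/0"), (2_550_000, "1/1", "1/1")]

    def is_het(gt):
        alleles = gt.replace("|", "/").split("/")
        return len(set(alleles)) > 1 and "." not in alleles

    parent_het = defaultdict(int)  # windows with >=1 parent-het site
    both_het = defaultdict(int)    # how many of those sites stayed het in the child

    for pos, parent_gt, child_gt in records:
        if is_het(parent_gt):
            win = pos // WINDOW
            parent_het[win] += 1
            if is_het(child_gt):
                both_het[win] += 1

    for win in sorted(parent_het):
        frac = both_het[win] / parent_het[win]  # parent_het[win] >= 1 by construction
        print(f"chr15\t{win * WINDOW}\t{(win + 1) * WINDOW}\t{frac:.6f}")
    # -> 0.5 for the window at 1.2 Mb (one of two parent hets lost)
    #    0.0 for the window at 2.5 Mb (candidate LOH)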