Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ glnexus-test
6tf_ipsc_wt
all_Ngn2-KOLF
all_KOLF21J
26apr10_Ngn2-KOLF

hifisomatic_resources
hifi-wdl-resources-v*
Expand All @@ -18,6 +19,7 @@ wgs_backlog
*.bam
*.tbi
*.bai
*.fai
*.gz
*.bak
*.gz
Expand Down
19 changes: 19 additions & 0 deletions genbank_to_crispr_json.py
Original file line number Diff line number Diff line change
Expand Up @@ -223,6 +223,15 @@ def parse_genbank_to_crispr_json(genbank_path: str, edit_id: str,
right_ha_seq = get_feature_sequence(right_ha_feature, record)
payload_seq = get_feature_sequence(payload_feature, record)

# Extract donor context (e.g. plasmid backbone) flanking the HDR cassette.
# Taken linearly from the GenBank record: everything before left_ha and
# after right_ha. For circular plasmids exported with backbone at the
# record boundaries (the common case), this captures the full backbone.
lha_start = int(left_ha_feature.location.start)
rha_end = int(right_ha_feature.location.end)
donor_context_before = str(record.seq[:lha_start])
donor_context_after = str(record.seq[rha_end:])

# Find payload components
payload_start = int(payload_feature.location.start)
payload_end = int(payload_feature.location.end)
Expand Down Expand Up @@ -263,6 +272,11 @@ def parse_genbank_to_crispr_json(genbank_path: str, edit_id: str,
]
}

if donor_context_before:
crispr_json['edits'][0]['edit']['donor_context_before'] = donor_context_before
if donor_context_after:
crispr_json['edits'][0]['edit']['donor_context_after'] = donor_context_after

# Add optional symbol if available
if gene_info.get('symbol'):
crispr_json['edits'][0]['target']['symbol'] = gene_info['symbol']
Expand Down Expand Up @@ -328,6 +342,11 @@ def main():
print(f" Gene: {crispr_json['edits'][0]['target']['symbol']}", file=sys.stderr)
print(f" Payload components: {len(crispr_json['edits'][0]['edit'].get('payload_components', []))}",
file=sys.stderr)
edit_out = crispr_json['edits'][0]['edit']
if 'donor_context_before' in edit_out or 'donor_context_after' in edit_out:
print(f" Donor context: {len(edit_out.get('donor_context_before', ''))} bp before, "
f"{len(edit_out.get('donor_context_after', ''))} bp after",
file=sys.stderr)


if __name__ == '__main__':
Expand Down
50 changes: 29 additions & 21 deletions scripts/process_input_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,30 +72,38 @@ def main():
all_exist = all(os.path.exists(hifi_read_path) for hifi_read_path in sample["hifi_reads"])

if all_exist and should_copy_file(dest_path):
with tempfile.TemporaryDirectory() as tmpdirname:
tmpdir_path = Path(tmpdirname)
temp_path = tmpdir_path / f"{sample['sample_id']}_hifi_reads_temp.bam"
print(f"Merging {len(sample['hifi_reads'])} HiFi read files -> {dest_path} (stripping tags: fi,ri,fp,rp,ip,pw,HP,PS,PC)")

# Step 1: Concatenate BAM files
print(f" Step 1/2: Concatenating BAM files...")
subprocess.run([
"samtools", "cat",
"-o", str(temp_path)
] + sample["hifi_reads"], check=True)

# Step 2: Strip tags using samtools reset
print(f" Step 2/2: Stripping tags...")
subprocess.run([
print(f"Merging {len(sample['hifi_reads'])} HiFi read files -> {dest_path} (stripping tags: fi,ri,fp,rp,ip,pw,HP,PS,PC)")

# Write to a local /tmp file first to avoid slow NFS streaming writes,
# then copy to NFS destination in one sequential transfer.
# samtools cat stdout is piped directly into samtools reset (no temp BAM on disk).
tmp_fd, tmp_out_str = tempfile.mkstemp(dir="/tmp", suffix=".bam")
os.close(tmp_fd)
tmp_out = Path(tmp_out_str)
try:
# Step 1 + 2 (pipelined): cat BAMs from NFS | reset/strip tags -> local /tmp
print(f" Step 1/2: Streaming cat | reset to local /tmp...")
cat_proc = subprocess.Popen(
["samtools", "cat"] + sample["hifi_reads"],
stdout=subprocess.PIPE
)
reset_proc = subprocess.run([
"samtools", "reset",
"--thread", "15",
"--remove-tag", "fi,ri,fp,rp,ip,pw,HP,PS,PC",
"-o", str(dest_path),
str(temp_path)
], check=True)

# Clean up temporary files
temp_path.unlink()
"-o", str(tmp_out),
"-"
], stdin=cat_proc.stdout, check=True)
cat_proc.stdout.close()
cat_proc.wait()
if cat_proc.returncode != 0:
raise subprocess.CalledProcessError(cat_proc.returncode, "samtools cat")

# Step 2/2: Copy finished BAM from local /tmp to NFS destination
print(f" Step 2/2: Copying local /tmp result to NFS destination...")
shutil.copy2(str(tmp_out), str(dest_path))
finally:
tmp_out.unlink(missing_ok=True)

new_sample["hifi_reads"] = [str(dest_path.absolute())]
elif all_exist:
Expand Down
4 changes: 2 additions & 2 deletions workflows/assembly/assembly_tasks.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -223,8 +223,8 @@ task pav {
>>>

output {
File? pav_vcf = "pav_outputs_~{sub(sample_name, "\\.", "_")}/~{sub(sample_name, "\\.", "_")}.vcf.gz"
File? pav_vcf_index = "pav_outputs_~{sub(sample_name, "\\.", "_")}/~{sub(sample_name, "\\.", "_")}.vcf.gz.tbi"
File? pav_vcf = "pav_outputs_~{sample_name}/~{sub(sample_name, "\\.", "_")}.vcf.gz"
File? pav_vcf_index = "pav_outputs_~{sample_name}/~{sub(sample_name, "\\.", "_")}.vcf.gz.tbi"
}

runtime {
Expand Down
97 changes: 89 additions & 8 deletions workflows/edit-qc/bcftools_aux.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -116,8 +116,8 @@ task count_vcf_variants {

runtime {
docker: "~{runtime_attributes.container_registry}/pb_wdl_base@sha256:4b889a1f21a6a7fecf18820613cf610103966a93218de772caba126ab70a8e87"
cpu: 1
memory: "2 GB"
cpu: 2
memory: "8 GB"
disk: disk_size + " GB"
disks: "local-disk " + disk_size + " HDD"
preemptible: runtime_attributes.preemptible_tries
Expand Down Expand Up @@ -162,8 +162,83 @@ task count_tsv_rows {

runtime {
docker: "~{runtime_attributes.container_registry}/pb_wdl_base@sha256:4b889a1f21a6a7fecf18820613cf610103966a93218de772caba126ab70a8e87"
cpu: 1
memory: "2 GB"
cpu: 2
memory: "8 GB"
disk: disk_size + " GB"
disks: "local-disk " + disk_size + " HDD"
preemptible: runtime_attributes.preemptible_tries
maxRetries: runtime_attributes.max_retries
awsBatchRetryAttempts: runtime_attributes.max_retries # !UnknownRuntimeKey
zones: runtime_attributes.zones
}
}

task bcftools_qual_filter {
meta {
description: "Filter variants to FILTER=PASS. Small variants and sawfish SVs additionally require GQ>=20 and, if AD is present, VAF>0.1. PAV assembly SVs (identified by presence of INFO/COV_PROP) skip the GQ filter and instead require all haplotype COV_PROP values >0.1."
}

parameter_meta {
vcf: { name: "Input VCF" }
out_prefix: { name: "Output file prefix" }
runtime_attributes: { name: "Runtime attribute structure" }
filtered_vcf: { name: "Filtered VCF" }
filtered_vcf_index: { name: "Filtered VCF index" }
}

input {
File vcf
String out_prefix
RuntimeAttributes runtime_attributes
}

Int disk_size = ceil(size(vcf, "GB") * 2 + 10)

command <<<
set -euo pipefail

bcftools --version

# PAV assembly SVs carry INFO/COV_PROP typed as String in their VCF header.
# Reheader to Float (or add the tag as Float if absent) so bcftools can
# evaluate MIN(INFO/COV_PROP) numerically. Records without the tag remain
# absent (compared as ".") in either case.
if bcftools view -h ~{vcf} | grep -q '##INFO=<ID=COV_PROP'; then
bcftools view -h ~{vcf} | \
sed 's|##INFO=<ID=COV_PROP,Number=\.,Type=String|##INFO=<ID=COV_PROP,Number=.,Type=Float|' \
> fixed_header.txt
else
bcftools view -h ~{vcf} | grep -v '^#CHROM' > fixed_header.txt
echo '##INFO=<ID=COV_PROP,Number=.,Type=Float,Description="Assembly haplotype coverage proportion (PAV)">' \
>> fixed_header.txt
bcftools view -h ~{vcf} | grep '^#CHROM' >> fixed_header.txt
fi
bcftools reheader -h fixed_header.txt -o reheaded.vcf.gz ~{vcf}
bcftools index --tbi reheaded.vcf.gz

# Unified filter:
# Non-PAV (INFO/COV_PROP absent): require GQ>=20; if AD present, VAF>0.1.
# PAV (INFO/COV_PROP present): require MIN(COV_PROP)>0.1; no GQ check.
bcftools view \
--output-type z \
-i '(FILTER="PASS" || FILTER=".") && (
(INFO/COV_PROP="." && GQ>=20 && (AD[0:0]="." || (AD[0:0]+AD[0:1])=0 || AD[0:1]/(AD[0:0]+AD[0:1]) > 0.1)) ||
(INFO/COV_PROP!="." && MIN(INFO/COV_PROP) > 0.1)
)' \
reheaded.vcf.gz \
-o ~{out_prefix}.vcf.gz
bcftools index --tbi ~{out_prefix}.vcf.gz
>>>

output {
File filtered_vcf = "~{out_prefix}.vcf.gz"
File filtered_vcf_index = "~{out_prefix}.vcf.gz.tbi"
}

runtime {
docker: "~{runtime_attributes.container_registry}/pb_wdl_base@sha256:4b889a1f21a6a7fecf18820613cf610103966a93218de772caba126ab70a8e87"
cpu: 4
memory: "8 GB"
disk: disk_size + " GB"
disks: "local-disk " + disk_size + " HDD"
preemptible: runtime_attributes.preemptible_tries
Expand All @@ -182,6 +257,9 @@ task bcftools_merge_assembly_align {
vcfs: {
name: "VCFs"
}
sample_id: {
name: "Sample ID to normalize VCF sample names to before merging"
}
out_prefix: {
name: "Output VCF name"
}
Expand All @@ -190,14 +268,15 @@ task bcftools_merge_assembly_align {
}
merged_vcf: {
name: "Merged VCF"
}
}
merged_vcf_index: {
name: "Merged VCF index"
}
}

input {
Array[File] vcfs
String sample_id

String out_prefix

Expand All @@ -210,9 +289,11 @@ task bcftools_merge_assembly_align {

command <<<
set -euo pipefail
echo "~{sample_id}" > sample_name.txt
for vcf in ~{sep=' ' vcfs}; do
cp "$vcf" ./
bcftools index --tbi "$(basename $vcf)"
base="$(basename $vcf)"
bcftools reheader -s sample_name.txt "$vcf" -o "$base"
bcftools index --tbi "$base"
done

bcftools --version
Expand Down Expand Up @@ -247,4 +328,4 @@ task bcftools_merge_assembly_align {
awsBatchRetryAttempts: runtime_attributes.max_retries # !UnknownRuntimeKey
zones: runtime_attributes.zones
}
}
}
Loading