Skip to content

Commit 3f2e759

Browse files
authored
Merge pull request #588 from broadinstitute/ct-parameterize-annot-transfer-aln-mem
align_and_annot_transfer_single: parameterize memory via machine_mem_gb (default raised from 15 GB to 30 GB) and increase the CPU count from 4 to 8; genbank_single: also strip the '.fsa' and '.fa' FASTA extensions when deriving the default assembly_id, and impose a regex pattern on assembly_id; biosample_to_genbank: map 'sus scrofa [domesticus]' to 'swine'
2 parents 6486c78 + 049be59 commit 3f2e759

File tree

2 files changed

+18
-5
lines changed

2 files changed

+18
-5
lines changed

pipes/WDL/tasks/tasks_ncbi.wdl

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -181,6 +181,7 @@ task align_and_annot_transfer_single {
181181
Array[File]+ reference_feature_tables
182182

183183
String out_basename = basename(genome_fasta, '.fasta')
184+
Int machine_mem_gb = 30
184185
String docker = "quay.io/broadinstitute/viral-phylo:2.4.1.0"
185186
}
186187

@@ -222,8 +223,8 @@ task align_and_annot_transfer_single {
222223

223224
runtime {
224225
docker: docker
225-
memory: "15 GB"
226-
cpu: 4
226+
memory: machine_mem_gb + " GB"
227+
cpu: 8
227228
dx_instance_type: "mem2_ssd1_v2_x4"
228229
preemptible: 1
229230
maxRetries: 2
@@ -954,8 +955,15 @@ task biosample_to_genbank {
954955
year = outrow['collection_date'].split('-')[0]
955956
country = outrow['geo_loc_name'].split(':')[0]
956957
host = outrow['host'].lower()
957-
if host == 'homo sapiens':
958-
host = 'human'
958+
959+
# host name mapping to the informal names used by NCBI in sequence titles for certain species
960+
host_to_informal_name_map = {
961+
'homo sapiens': 'human',
962+
'sus scrofa domesticus': 'swine',
963+
'sus scrofa': 'swine'
964+
}
965+
host = host_to_informal_name_map.get(host, host)
966+
959967
if len(outrow['isolate'].split('/')) >= 2:
960968
state_inst_labid = outrow['isolate'].split('/')[-2]
961969
else:

pipes/WDL/workflows/genbank_single.wdl

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,8 @@ workflow genbank_single {
1515

1616
input {
1717
File assembly_fasta
18-
String assembly_id = basename(assembly_fasta, ".fasta")
18+
String assembly_id = basename(basename(basename(assembly_fasta, ".fasta"), ".fsa") , ".fa")
19+
1920
File? aligned_bam
2021

2122
String ref_accessions_colon_delim
@@ -37,6 +38,10 @@ workflow genbank_single {
3738
description: "Genome to prepare for Genbank submission. All segments/chromosomes included in one file. Must contain exactly the same number of sequences as reference_accessions.",
3839
patterns: ["*.fasta"]
3940
}
41+
assembly_id: {
42+
description: "Unique identifier for this assembly. Defaults to the basename of assembly_fasta. table2asn requires this value to be <=50 characters; see: https://www.ncbi.nlm.nih.gov/genbank/table2asn/#fsa",
43+
patterns: ["^[A-Za-z0-9\-_\.:\*#]{1,50}$"]
44+
}
4045
ref_accessions_colon_delim: {
4146
description: "Reference genome Genbank accessions, each segment/chromosome in the exact same count and order as the segments/chromosomes described in assemblies_fasta. List of accessions should be colon delimited.",
4247
patterns: ["*.fasta"]

0 commit comments

Comments
 (0)