@@ -37,96 +37,118 @@ workflow InputQC {
3737
3838 # convert cram manifest to arrays of crams, cram indices, and sample ids if manifest is provided
3939 if (defined (cram_manifest ) && !defined (crams )) {
40- call ConvertCramManifestToCramArrays {
40+ call ConvertCramManifestToInputArrays {
4141 input :
4242 cram_manifest = select_first ([cram_manifest ])
4343 }
4444 }
4545
46- Boolean do_cram_qc = select_first ([ConvertCramManifestToCramArrays .passes_qc , defined (crams ) && !defined (cram_manifest ), false ])
46+ # Boolean do_cram_qc = select_first([ConvertCramManifestToInputArrays .passes_qc, defined(crams) && !defined(cram_manifest), false])
4747
4848 # validations for array crams, cram indices, and sample ids (whether supplied directly or via manifest)
4949 if (do_cram_qc ) {
50- Array [String ] cram_array = select_first ([crams , ConvertCramManifestToCramArrays .crams ])
51-
52- if ((!defined (cram_indices ) && !defined (ConvertCramManifestToCramArrays .cram_indices )) || (!defined (sample_ids ) && !defined (ConvertCramManifestToCramArrays .sample_ids ))) {
53- Boolean no_cram_index_or_sample_id_passes_qc = false
54- String no_cram_index_or_sample_id_message = "CRAM indices and sample IDs are required when CRAM files are provided. Please provide both CRAM index files and a corresponding list of sample IDs."
55- }
50+ # Array[String] cram_array = select_first([crams, ConvertCramManifestToInputArrays.crams])
51+
52+ # Array[String] evaluated_cram_indices = select_first([cram_indices, ConvertCramManifestToInputArrays.cram_indices, []])
53+ # Array[String] evaluated_sample_ids = select_first([sample_ids, ConvertCramManifestToInputArrays.sample_ids, []])
5654
57- # really wish wdl had else statements
58- if ((defined (cram_indices ) || defined (ConvertCramManifestToCramArrays .cram_indices )) && (defined (sample_ids ) || defined (ConvertCramManifestToCramArrays .sample_ids ))) {
59- Array [String ] cram_index_array = select_first ([cram_indices , ConvertCramManifestToCramArrays .cram_indices ])
60- Array [String ] sample_id_array = select_first ([sample_ids , ConvertCramManifestToCramArrays .sample_ids ])
55+ # Boolean cram_indices_and_sample_ids_provided = (length(evaluated_cram_indices) > 0) && (length(evaluated_sample_ids) > 0)
56+
57+ # if (!cram_indices_and_sample_ids_provided) {
58+ # Boolean no_cram_index_or_sample_id_passes_qc = false
59+ # String no_cram_index_or_sample_id_message = "CRAM indices and sample IDs are required when CRAM files are provided. Please provide both CRAM index files and a corresponding list of sample IDs."
60+ # }
6161
62- call ValidateCramsAndIndices {
62+ if (cram_indices_and_sample_ids_provided ) {
63+ call ValidateCramsAndIndicesAndSampleIds {
6364 input :
6465 crams = cram_array ,
65- cram_indices = cram_index_array ,
66- sample_ids = sample_id_array ,
66+ cram_indices = evaluated_cram_indices ,
67+ sample_ids = evaluated_sample_ids ,
6768 billing_project_for_rp = billing_project_for_rp
6869 }
6970 }
7071 }
7172
7273 output {
73- Boolean passes_qc = select_first ([ValidateCramsAndIndices .passes_qc , ConvertCramManifestToCramArrays .passes_qc , no_data_passes_qc , multiple_data_types_passes_qc , no_cram_index_or_sample_id_passes_qc ])
74- String qc_messages = select_first ([ValidateCramsAndIndices .qc_messages , ConvertCramManifestToCramArrays .qc_messages , no_data_message , multiple_data_types_message , no_cram_index_or_sample_id_message ])
74+ Boolean passes_qc = select_first ([ValidateCramsAndIndicesAndSampleIds .passes_qc , ConvertCramManifestToInputArrays .passes_qc , no_data_passes_qc , multiple_data_types_passes_qc , no_cram_index_or_sample_id_passes_qc ])
75+ String qc_messages = select_first ([ValidateCramsAndIndicesAndSampleIds .qc_messages , ConvertCramManifestToInputArrays .qc_messages , no_data_message , multiple_data_types_message , no_cram_index_or_sample_id_message ])
7576 }
7677}
7778
7879
79- task ConvertCramManifestToCramArrays {
80+ task ConvertCramManifestToInputArrays {
8081 input {
8182 File cram_manifest
82-
83- String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.6.1.0"
84- Int cpu = 1
85- Int memory_mb = 4000
86- Int disk_size_gb = 10
8783 }
8884
8985 command <<<
90- # create empty qc messages file
91- touch qc_messages.txt
92-
93- # convert the cram manifest into arrays of crams, cram indices, and sample ids
94- head -n 1 ~{cram_manifest } > header.txt
95-
96- sample_id_col = $(cat header.txt | tr '\t' '\n' | grep -n "^sample_id$" | cut -d : -f1 )
97- cram_path_col = $(cat header.txt | tr '\t' '\n' | grep -n "^cram_path$" | cut -d : -f1 )
98- cram_index_col = $(cat header.txt | tr '\t' '\n' | grep -n "^cram_index_path$" | cut -d : -f1 )
99-
100- if [ -z "${sample_id_col} " ] || [ -z "${cram_path_col} " ] || [ -z "${cram_index_col} " ]; then
101- echo "Unable to determine column positions for sample_id, cram_path, or cram_index_path in the CRAM manifest." >> qc_messages.txt
102- echo "false" > passes_qc.txt
103- # create empty output files to ensure the task succeeds
104- touch crams.txt cram_indices.txt sample_ids.txt
105- exit 0
106- fi
107-
108- tail -n +2 ~{cram_manifest } | cut -f ${sample_id_col } > sample_ids.txt
109- tail -n +2 ~{cram_manifest } | cut -f ${cram_path_col } > crams.txt
110- tail -n +2 ~{cram_manifest } | cut -f ${cram_index_col } > cram_indices.txt
111-
112- # passes_qc is true if qc_messages is empty
113- if [ ! -s qc_messages.txt ]; then
114- echo "true" > passes_qc.txt
115- else
116- echo "false" > passes_qc.txt
117- fi
86+ cat << EOF > script.py
87+ import pandas as pd
88+
89+ qc_messages = []
90+
91+ qc_messages_filename = "qc_messages.txt"
92+ passes_qc_filename = "passes_qc.txt"
93+ crams_filename = "crams.txt"
94+ cram_indices_filename = "cram_indices.txt"
95+ sample_ids_filename = "sample_ids.txt"
96+
97+ # Read the manifest
98+ try:
99+ df = pd.read_csv("~{cram_manifest }", sep='\t')
100+
101+ # Check for required columns
102+ required_cols = ['sample_id', 'cram_path', 'cram_index_path']
103+ missing_cols = [col for col in required_cols if col not in df.columns]
104+
105+ if missing_cols:
106+ with open(qc_messages_filename, 'w') as qc_file:
107+ qc_file.write(f"Missing required columns in the CRAM manifest: {', '.join(missing_cols)}")
108+ with open(passes_qc_filename, 'w') as f:
109+ f.write("false")
110+
111+ # Create empty output files
112+ open(crams_filename, 'w').close()
113+ open(cram_indices_filename, 'w').close()
114+ open(sample_ids_filename, 'w').close()
115+ else:
116+ # Write to output files
117+ df['sample_id'].to_csv(sample_ids_filename, index=False, header=False)
118+ df['cram_path'].to_csv(crams_filename, index=False, header=False)
119+ df['cram_index_path'].to_csv(cram_indices_filename, index=False, header=False)
120+
121+ # Write QC results
122+ with open(qc_messages_filename, 'w') as f:
123+ f.write('\n'.join(qc_messages) if qc_messages else '')
124+
125+ with open(passes_qc_filename, 'w') as f:
126+ f.write("true" if not qc_messages else "false")
127+
128+ except Exception as e:
129+ with open(qc_messages_filename, 'w') as qc_file:
130+ qc_file.write(f"Error reading CRAM manifest: {str(e)}")
131+ with open(passes_qc_filename, 'w') as f:
132+ f.write("false")
133+
134+ # Create empty output files
135+ open(crams_filename, 'w').close()
136+ open(cram_indices_filename, 'w').close()
137+ open(sample_ids_filename, 'w').close()
138+
139+ EOF
140+ python3 script.py
118141
119142 # This task should always succeed
120143 exit 0
121144 >>>
122145
123146 runtime {
124- docker : gatk_docker
125- disks : "local-disk ${ disk_size_gb } SSD"
126- memory : "${ memory_mb } MiB "
127- cpu : cpu
147+ docker : "us.gcr.io/broad-dsde-methods/python-data-slim:1.0"
148+ cpu : 1
149+ disks : "local-disk 10 HDD "
150+ memory : "1 GiB"
128151 preemptible : 3
129- maxRetries : 1
130152 noAddress : true
131153 }
132154
@@ -139,7 +161,8 @@ task ConvertCramManifestToCramArrays {
139161 }
140162}
141163
142- task ValidateCramsAndIndices {
164+
165+ task ValidateCramsAndIndicesAndSampleIds {
143166 input {
144167 Array [String ] crams
145168 Array [String ] cram_indices
@@ -161,79 +184,93 @@ task ValidateCramsAndIndices {
161184 String gcloud_requester_pays_flag = if defined (billing_project_for_rp ) then "--billing-project ${billing_project_for_rp }" else ""
162185
163186 command <<<
164- # create empty qc messages file
165- touch qc_messages.txt
166-
167- # write WDL arrays to files
168- printf "%s\n" ~{sep =' ' sample_ids } > sample_ids_list.txt
169- printf "%s\n" ~{sep =' ' crams } > crams_list.txt
170- printf "%s\n" ~{sep =' ' cram_indices } > cram_indices_list.txt
171-
172- # validate that the number of CRAMs, CRAIs, and sample IDs match
173- if [ ~{num_crams } -ne ~{num_cram_indices } ] || [ ~{num_crams } -ne ~{num_sample_ids } ]; then
174- echo "Found different numbers of CRAMs (~{num_crams}), CRAIs (~{num_cram_indices}), and sample IDs (~{num_sample_ids})." >> qc_messages.txt
175- else
176- echo "Number of CRAMs, CRAIs, and sample IDs match: found ~{num_crams} of each."
177- fi
178-
179- # validate that sample IDs are unique
180- unique_sample_ids = $(cat sample_ids_list.txt | sort -u | wc -l )
181- if [ $unique_sample_ids -ne ~{num_sample_ids } ]; then
182- # find duplicate sample IDs
183- duplicate_sample_ids = $(cat sample_ids_list.txt | sort | uniq -d | paste -sd , | sed 's/,/, /g' )
184- echo "Duplicate sample IDs found: ${duplicate_sample_ids} " >> qc_messages.txt
185- else
186- echo "Sample IDs are unique."
187- fi
188-
189- # ensure all crams end with .cram and all cram indices end with .crai
190- crams_with_wrong_extension = $(cat crams_list.txt | grep -vE "\.cram$" || true )
191- if [ ! -z "${crams_with_wrong_extension} " ]; then
192- echo "The following CRAM files do not have a .cram extension: ${crams_with_wrong_extension} " >> qc_messages.txt
193- else
194- echo "All CRAM files have the correct .cram extension."
195- fi
196- cram_indices_with_wrong_extension = $(cat cram_indices_list.txt | grep -vE "\.crai$" || true )
197- if [ ! -z "${cram_indices_with_wrong_extension} " ]; then
198- echo "The following CRAM index files do not have a .crai extension: ${cram_indices_with_wrong_extension} " >> qc_messages.txt
199- else
200- echo "All CRAM index files have the correct .crai extension."
201- fi
202-
203- # validate that cram paths are unique
204- unique_crams = $(cat crams_list.txt | sort -u | wc -l )
205- if [ $unique_crams -ne ~{num_crams } ]; then
206- # find duplicate CRAM paths
207- duplicate_crams = $(cat crams_list.txt | sort | uniq -d | paste -sd , | sed 's/,/, /g' )
208- echo "Duplicate CRAM paths found: ${duplicate_crams} " >> qc_messages.txt
209- else
210- echo "CRAM paths are unique."
211- fi
212-
213- # ensure that all CRAM files are less than the maximum file size allowed by the service (currently 10GB)
214- # this also serves as an access check, which should already have been performed by the service
215- crams_exceeding_max_size = $(cat crams_list.txt | while read cram ; do
216- file_size_bytes = $(gcloud storage ls -L "$cram " ~{gcloud_requester_pays_flag } | grep "Content-Length:" | awk '{print $2}' )
217- file_size_gb = $((file_size_bytes / 1024 / 1024 / 1024 ))
218- if [ $file_size_gb -gt ~{max_cram_file_size_gb } ]; then
219- echo "$cram (${file_size_gb} GB)"
220- fi
221- done )
222- if [ ! -z "${crams_exceeding_max_size} " ]; then
223- echo "The following CRAM files exceed the maximum allowed file size of ~{max_cram_file_size_gb}GB: ${crams_exceeding_max_size} " >> qc_messages.txt
224- else
225- echo "All CRAM files are within the maximum allowed file size of ~{max_cram_file_size_gb}GB."
226- fi
227-
228- # passes_qc is true if qc_messages is empty
229- if [ ! -s qc_messages.txt ]; then
230- echo "true" > passes_qc.txt
231- else
232- echo "false" > passes_qc.txt
233- fi
234-
235- # This task should always succeed
236- exit 0
187+ cat <<'EOF' > script.py
188+ import subprocess
189+ import re
190+
191+ qc_messages = []
192+
193+ # Parse WDL arrays from space-separated strings
194+ sample_ids = """~{sep=' ' sample_ids}""" .split ()
195+ crams = """~{sep=' ' crams}""" .split ()
196+ cram_indices = """~{sep=' ' cram_indices}""" .split ()
197+
198+ num_crams = ~{num_crams }
199+ num_cram_indices = ~{num_cram_indices }
200+ num_sample_ids = ~{num_sample_ids }
201+ max_cram_file_size_gb = ~{max_cram_file_size_gb }
202+ billing_project_flag = "~{gcloud_requester_pays_flag}"
203+
204+ # Validate that the number of CRAMs, CRAIs, and sample IDs match
205+ if num_crams != num_cram_indices or num_crams != num_sample_ids :
206+ qc_messages.append (f "Found different numbers of CRAMs ({num_crams}), CRAIs ({num_cram_indices}), and sample IDs ({num_sample_ids})." )
207+ else :
208+ print (f "Number of CRAMs, CRAIs, and sample IDs match: found {num_crams} of each." )
209+
210+ # Validate that sample IDs are unique
211+ unique_sample_ids = set (sample_ids )
212+ if len (unique_sample_ids ) != num_sample_ids :
213+ duplicates = [sid for sid in unique_sample_ids if sample_ids.count (sid ) > 1 ]
214+ qc_messages.append (f "Duplicate sample IDs found: {', '.join(sorted(duplicates))}" )
215+ else :
216+ print ("Sample IDs are unique." )
217+
218+ # Ensure all crams end with .cram and all cram indices end with .crai
219+ crams_with_wrong_extension = [c for c in crams if not c.endswith ('.cram' )]
220+ if crams_with_wrong_extension :
221+ qc_messages.append (f "The following CRAM files do not have a .cram extension: {', '.join(crams_with_wrong_extension)}" )
222+ else :
223+ print ("All CRAM files have the correct .cram extension." )
224+
225+ cram_indices_with_wrong_extension = [c for c in cram_indices if not c.endswith ('.crai' )]
226+ if cram_indices_with_wrong_extension :
227+ qc_messages.append (f "The following CRAM index files do not have a .crai extension: {', '.join(cram_indices_with_wrong_extension)}" )
228+ else :
229+ print ("All CRAM index files have the correct .crai extension." )
230+
231+ # Validate that cram paths are unique
232+ unique_crams = set (crams )
233+ if len (unique_crams ) != num_crams :
234+ duplicates = [c for c in unique_crams if crams.count (c ) > 1 ]
235+ qc_messages.append (f "Duplicate CRAM paths found: {', '.join(sorted(duplicates))}" )
236+ else :
237+ print ("CRAM paths are unique." )
238+
239+ # Ensure that all CRAM files are less than the maximum file size allowed
240+ crams_exceeding_max_size = []
241+ for cram in crams :
242+ try :
243+ cmd = ['gcloud' , 'storage' , 'ls' , '-L' , cram ]
244+ if billing_project_flag :
245+ cmd.extend (billing_project_flag.split ())
246+
247+ result = subprocess.run (cmd , capture_output = True , text = True , check = True )
248+
249+ # Parse Content-Length from output
250+ match = re.search (r 'Content-Length:\s+(\d+)' , result.stdout )
251+ if match :
252+ file_size_bytes = int (match.group (1 ))
253+ file_size_gb = file_size_bytes // (1024 ** 3 )
254+
255+ if file_size_gb > max_cram_file_size_gb :
256+ crams_exceeding_max_size.append (f "{cram} ({file_size_gb}GB)" )
257+ except subprocess.CalledProcessError as e :
258+ qc_messages.append (f "Error checking file size for {cram}: {e.stderr}" )
259+
260+ if crams_exceeding_max_size :
261+ qc_messages.append (f "The following CRAM files exceed the maximum allowed file size of {max_cram_file_size_gb}GB: {', '.join(crams_exceeding_max_size)}" )
262+ else :
263+ print (f "All CRAM files are within the maximum allowed file size of {max_cram_file_size_gb}GB." )
264+
265+ # Write output files
266+ with open ("qc_messages.txt" , 'w' ) as f :
267+ f.write ('\n' .join (qc_messages ) if qc_messages else '' )
268+
269+ with open ("passes_qc.txt" , 'w' ) as f :
270+ f.write ("true" if not qc_messages else "false" )
271+
272+ EOF
273+ python3 script.py
237274 >>>
238275
239276 runtime {
0 commit comments