Skip to content

Commit 33c7a2b

Browse files
committed
wip
1 parent 96d6013 commit 33c7a2b

File tree

2 files changed

+172
-132
lines changed

2 files changed

+172
-132
lines changed
Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,9 @@
11
# 0.0.1
22
2026-03-20 (Date of Last Commit)
33

4-
* Initial release of pipeline to perform QC checks on inputs of the Low Pass Imputation pipeline using GLIMPSE2.
4+
* Initial release of pipeline to perform QC checks on inputs to the Low Pass Imputation pipeline using GLIMPSE2.
55
* Checks include:
6-
- TBD
6+
- If manifest input, all required columns are present
7+
- Same number of CRAMs, CRAIs, and sample IDs provided
8+
- Unique CRAM paths and sample IDs provided
9+
- Each CRAM file size is less than 10GB

pipelines/wdl/glimpse/low_pass_imputation/input_qc/LowPassImputationQC.wdl

Lines changed: 167 additions & 130 deletions
Original file line numberDiff line numberDiff line change
@@ -37,96 +37,118 @@ workflow InputQC {
3737

3838
# convert cram manifest to arrays of crams, cram indices, and sample ids if manifest is provided
3939
if (defined(cram_manifest) && !defined(crams)) {
40-
call ConvertCramManifestToCramArrays {
40+
call ConvertCramManifestToInputArrays {
4141
input:
4242
cram_manifest = select_first([cram_manifest])
4343
}
4444
}
4545
46-
Boolean do_cram_qc = select_first([ConvertCramManifestToCramArrays.passes_qc, defined(crams) && !defined(cram_manifest), false])
46+
# Boolean do_cram_qc = select_first([ConvertCramManifestToInputArrays.passes_qc, defined(crams) && !defined(cram_manifest), false])
4747
4848
# validations for array crams, cram indices, and sample ids (whether supplied directly or via manifest)
4949
if (do_cram_qc) {
50-
Array[String] cram_array = select_first([crams, ConvertCramManifestToCramArrays.crams])
51-
52-
if ((!defined(cram_indices) && !defined(ConvertCramManifestToCramArrays.cram_indices)) || (!defined(sample_ids) && !defined(ConvertCramManifestToCramArrays.sample_ids))) {
53-
Boolean no_cram_index_or_sample_id_passes_qc = false
54-
String no_cram_index_or_sample_id_message = "CRAM indices and sample IDs are required when CRAM files are provided. Please provide both CRAM index files and a corresponding list of sample IDs."
55-
}
50+
# Array[String] cram_array = select_first([crams, ConvertCramManifestToInputArrays.crams])
51+
52+
# Array[String] evaluated_cram_indices = select_first([cram_indices, ConvertCramManifestToInputArrays.cram_indices, []])
53+
# Array[String] evaluated_sample_ids = select_first([sample_ids, ConvertCramManifestToInputArrays.sample_ids, []])
5654
57-
# really wish wdl had else statements
58-
if ((defined(cram_indices) || defined(ConvertCramManifestToCramArrays.cram_indices)) && (defined(sample_ids) || defined(ConvertCramManifestToCramArrays.sample_ids))) {
59-
Array[String] cram_index_array = select_first([cram_indices, ConvertCramManifestToCramArrays.cram_indices])
60-
Array[String] sample_id_array = select_first([sample_ids, ConvertCramManifestToCramArrays.sample_ids])
55+
# Boolean cram_indices_and_sample_ids_provided = (length(evaluated_cram_indices) > 0) && (length(evaluated_sample_ids) > 0)
56+
57+
# if (!cram_indices_and_sample_ids_provided) {
58+
# Boolean no_cram_index_or_sample_id_passes_qc = false
59+
# String no_cram_index_or_sample_id_message = "CRAM indices and sample IDs are required when CRAM files are provided. Please provide both CRAM index files and a corresponding list of sample IDs."
60+
# }
6161
62-
call ValidateCramsAndIndices {
62+
if (cram_indices_and_sample_ids_provided) {
63+
call ValidateCramsAndIndicesAndSampleIds {
6364
input:
6465
crams = cram_array,
65-
cram_indices = cram_index_array,
66-
sample_ids = sample_id_array,
66+
cram_indices = evaluated_cram_indices,
67+
sample_ids = evaluated_sample_ids,
6768
billing_project_for_rp = billing_project_for_rp
6869
}
6970
}
7071
}
7172
7273
output {
73-
Boolean passes_qc = select_first([ValidateCramsAndIndices.passes_qc, ConvertCramManifestToCramArrays.passes_qc, no_data_passes_qc, multiple_data_types_passes_qc, no_cram_index_or_sample_id_passes_qc])
74-
String qc_messages = select_first([ValidateCramsAndIndices.qc_messages, ConvertCramManifestToCramArrays.qc_messages, no_data_message, multiple_data_types_message, no_cram_index_or_sample_id_message])
74+
Boolean passes_qc = select_first([ValidateCramsAndIndicesAndSampleIds.passes_qc, ConvertCramManifestToInputArrays.passes_qc, no_data_passes_qc, multiple_data_types_passes_qc, no_cram_index_or_sample_id_passes_qc])
75+
String qc_messages = select_first([ValidateCramsAndIndicesAndSampleIds.qc_messages, ConvertCramManifestToInputArrays.qc_messages, no_data_message, multiple_data_types_message, no_cram_index_or_sample_id_message])
7576
}
7677
}
7778

7879

79-
task ConvertCramManifestToCramArrays {
80+
task ConvertCramManifestToInputArrays {
8081
input {
8182
File cram_manifest
82-
83-
String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.6.1.0"
84-
Int cpu = 1
85-
Int memory_mb = 4000
86-
Int disk_size_gb = 10
8783
}
8884

8985
command <<<
90-
# create empty qc messages file
91-
touch qc_messages.txt
92-
93-
# convert the cram manifest into arrays of crams, cram indices, and sample ids
94-
head -n 1 ~{cram_manifest} > header.txt
95-
96-
sample_id_col=$(cat header.txt | tr '\t' '\n' | grep -n "^sample_id$" | cut -d: -f1)
97-
cram_path_col=$(cat header.txt | tr '\t' '\n' | grep -n "^cram_path$" | cut -d: -f1)
98-
cram_index_col=$(cat header.txt | tr '\t' '\n' | grep -n "^cram_index_path$" | cut -d: -f1)
99-
100-
if [ -z "${sample_id_col}" ] || [ -z "${cram_path_col}" ] || [ -z "${cram_index_col}" ]; then
101-
echo "Unable to determine column positions for sample_id, cram_path, or cram_index_path in the CRAM manifest." >> qc_messages.txt
102-
echo "false" > passes_qc.txt
103-
# create empty output files to ensure the task succeeds
104-
touch crams.txt cram_indices.txt sample_ids.txt
105-
exit 0
106-
fi
107-
108-
tail -n +2 ~{cram_manifest} | cut -f${sample_id_col} > sample_ids.txt
109-
tail -n +2 ~{cram_manifest} | cut -f${cram_path_col} > crams.txt
110-
tail -n +2 ~{cram_manifest} | cut -f${cram_index_col} > cram_indices.txt
111-
112-
# passes_qc is true if qc_messages is empty
113-
if [ ! -s qc_messages.txt ]; then
114-
echo "true" > passes_qc.txt
115-
else
116-
echo "false" > passes_qc.txt
117-
fi
86+
cat <<EOF > script.py
87+
import pandas as pd
88+
89+
qc_messages = []
90+
91+
qc_messages_filename = "qc_messages.txt"
92+
passes_qc_filename = "passes_qc.txt"
93+
crams_filename = "crams.txt"
94+
cram_indices_filename = "cram_indices.txt"
95+
sample_ids_filename = "sample_ids.txt"
96+
97+
# Read the manifest
98+
try:
99+
df = pd.read_csv("~{cram_manifest}", sep='\t')
100+
101+
# Check for required columns
102+
required_cols = ['sample_id', 'cram_path', 'cram_index_path']
103+
missing_cols = [col for col in required_cols if col not in df.columns]
104+
105+
if missing_cols:
106+
with open(qc_messages_filename, 'w') as qc_file:
107+
qc_file.write(f"Missing required columns in the CRAM manifest: {', '.join(missing_cols)}")
108+
with open(passes_qc_filename, 'w') as f:
109+
f.write("false")
110+
111+
# Create empty output files
112+
open(crams_filename, 'w').close()
113+
open(cram_indices_filename, 'w').close()
114+
open(sample_ids_filename, 'w').close()
115+
else:
116+
# Write to output files
117+
df['sample_id'].to_csv(sample_ids_filename, index=False, header=False)
118+
df['cram_path'].to_csv(crams_filename, index=False, header=False)
119+
df['cram_index_path'].to_csv(cram_indices_filename, index=False, header=False)
120+
121+
# Write QC results
122+
with open(qc_messages_filename, 'w') as f:
123+
f.write('\n'.join(qc_messages) if qc_messages else '')
124+
125+
with open(passes_qc_filename, 'w') as f:
126+
f.write("true" if not qc_messages else "false")
127+
128+
except Exception as e:
129+
with open(qc_messages_filename, 'w') as qc_file:
130+
qc_file.write(f"Error reading CRAM manifest: {str(e)}")
131+
with open(passes_qc_filename, 'w') as f:
132+
f.write("false")
133+
134+
# Create empty output files
135+
open(crams_filename, 'w').close()
136+
open(cram_indices_filename, 'w').close()
137+
open(sample_ids_filename, 'w').close()
138+
139+
EOF
140+
python3 script.py
118141
119142
# This task should always succeed
120143
exit 0
121144
>>>
122145

123146
runtime {
124-
docker: gatk_docker
125-
disks: "local-disk ${disk_size_gb} SSD"
126-
memory: "${memory_mb} MiB"
127-
cpu: cpu
147+
docker: "us.gcr.io/broad-dsde-methods/python-data-slim:1.0"
148+
cpu: 1
149+
disks: "local-disk 10 HDD"
150+
memory: "1 GiB"
128151
preemptible: 3
129-
maxRetries: 1
130152
noAddress: true
131153
}
132154

@@ -139,7 +161,8 @@ task ConvertCramManifestToCramArrays {
139161
}
140162
}
141163

142-
task ValidateCramsAndIndices {
164+
165+
task ValidateCramsAndIndicesAndSampleIds {
143166
input {
144167
Array[String] crams
145168
Array[String] cram_indices
@@ -161,79 +184,93 @@ task ValidateCramsAndIndices {
161184
String gcloud_requester_pays_flag = if defined(billing_project_for_rp) then "--billing-project ${billing_project_for_rp}" else ""
162185

163186
command <<<
164-
# create empty qc messages file
165-
touch qc_messages.txt
166-
167-
# write WDL arrays to files
168-
printf "%s\n" ~{sep=' ' sample_ids} > sample_ids_list.txt
169-
printf "%s\n" ~{sep=' ' crams} > crams_list.txt
170-
printf "%s\n" ~{sep=' ' cram_indices} > cram_indices_list.txt
171-
172-
# validate that the number of CRAMs, CRAIs, and sample IDs match
173-
if [ ~{num_crams} -ne ~{num_cram_indices} ] || [ ~{num_crams} -ne ~{num_sample_ids} ]; then
174-
echo "Found different numbers of CRAMs (~{num_crams}), CRAIs (~{num_cram_indices}), and sample IDs (~{num_sample_ids})." >> qc_messages.txt
175-
else
176-
echo "Number of CRAMs, CRAIs, and sample IDs match: found ~{num_crams} of each."
177-
fi
178-
179-
# validate that sample IDs are unique
180-
unique_sample_ids=$(cat sample_ids_list.txt | sort -u | wc -l)
181-
if [ $unique_sample_ids -ne ~{num_sample_ids} ]; then
182-
# find duplicate sample IDs
183-
duplicate_sample_ids=$(cat sample_ids_list.txt | sort | uniq -d | paste -sd, | sed 's/,/, /g')
184-
echo "Duplicate sample IDs found: ${duplicate_sample_ids}" >> qc_messages.txt
185-
else
186-
echo "Sample IDs are unique."
187-
fi
188-
189-
# ensure all crams end with .cram and all cram indices end with .crai
190-
crams_with_wrong_extension=$(cat crams_list.txt | grep -vE "\.cram$" || true)
191-
if [ ! -z "${crams_with_wrong_extension}" ]; then
192-
echo "The following CRAM files do not have a .cram extension: ${crams_with_wrong_extension}" >> qc_messages.txt
193-
else
194-
echo "All CRAM files have the correct .cram extension."
195-
fi
196-
cram_indices_with_wrong_extension=$(cat cram_indices_list.txt | grep -vE "\.crai$" || true)
197-
if [ ! -z "${cram_indices_with_wrong_extension}" ]; then
198-
echo "The following CRAM index files do not have a .crai extension: ${cram_indices_with_wrong_extension}" >> qc_messages.txt
199-
else
200-
echo "All CRAM index files have the correct .crai extension."
201-
fi
202-
203-
# validate that cram paths are unique
204-
unique_crams=$(cat crams_list.txt | sort -u | wc -l)
205-
if [ $unique_crams -ne ~{num_crams} ]; then
206-
# find duplicate CRAM paths
207-
duplicate_crams=$(cat crams_list.txt | sort | uniq -d | paste -sd, | sed 's/,/, /g')
208-
echo "Duplicate CRAM paths found: ${duplicate_crams}" >> qc_messages.txt
209-
else
210-
echo "CRAM paths are unique."
211-
fi
212-
213-
# ensure that all CRAM files are less than the maximum file size allowed by the service (currently 10GB)
214-
# this also serves as an access check, which should already have been performed by the service
215-
crams_exceeding_max_size=$(cat crams_list.txt | while read cram; do
216-
file_size_bytes=$(gcloud storage ls -L "$cram" ~{gcloud_requester_pays_flag} | grep "Content-Length:" | awk '{print $2}')
217-
file_size_gb=$((file_size_bytes / 1024 / 1024 / 1024))
218-
if [ $file_size_gb -gt ~{max_cram_file_size_gb} ]; then
219-
echo "$cram (${file_size_gb}GB)"
220-
fi
221-
done)
222-
if [ ! -z "${crams_exceeding_max_size}" ]; then
223-
echo "The following CRAM files exceed the maximum allowed file size of ~{max_cram_file_size_gb}GB: ${crams_exceeding_max_size}" >> qc_messages.txt
224-
else
225-
echo "All CRAM files are within the maximum allowed file size of ~{max_cram_file_size_gb}GB."
226-
fi
227-
228-
# passes_qc is true if qc_messages is empty
229-
if [ ! -s qc_messages.txt ]; then
230-
echo "true" > passes_qc.txt
231-
else
232-
echo "false" > passes_qc.txt
233-
fi
234-
235-
# This task should always succeed
236-
exit 0
187+
cat <<'EOF' > script.py
188+
import subprocess
189+
import re
190+
191+
qc_messages = []
192+
193+
# Parse WDL arrays from space-separated strings
194+
sample_ids = """~{sep=' ' sample_ids}""".split()
195+
crams = """~{sep=' ' crams}""".split()
196+
cram_indices = """~{sep=' ' cram_indices}""".split()
197+
198+
num_crams = ~{num_crams}
199+
num_cram_indices = ~{num_cram_indices}
200+
num_sample_ids = ~{num_sample_ids}
201+
max_cram_file_size_gb = ~{max_cram_file_size_gb}
202+
billing_project_flag = "~{gcloud_requester_pays_flag}"
203+
204+
# Validate that the number of CRAMs, CRAIs, and sample IDs match
205+
if num_crams != num_cram_indices or num_crams != num_sample_ids:
206+
qc_messages.append(f"Found different numbers of CRAMs ({num_crams}), CRAIs ({num_cram_indices}), and sample IDs ({num_sample_ids}).")
207+
else:
208+
print(f"Number of CRAMs, CRAIs, and sample IDs match: found {num_crams} of each.")
209+
210+
# Validate that sample IDs are unique
211+
unique_sample_ids = set(sample_ids)
212+
if len(unique_sample_ids) != num_sample_ids:
213+
duplicates = [sid for sid in unique_sample_ids if sample_ids.count(sid) > 1]
214+
qc_messages.append(f"Duplicate sample IDs found: {', '.join(sorted(duplicates))}")
215+
else:
216+
print("Sample IDs are unique.")
217+
218+
# Ensure all crams end with .cram and all cram indices end with .crai
219+
crams_with_wrong_extension = [c for c in crams if not c.endswith('.cram')]
220+
if crams_with_wrong_extension:
221+
qc_messages.append(f"The following CRAM files do not have a .cram extension: {', '.join(crams_with_wrong_extension)}")
222+
else:
223+
print("All CRAM files have the correct .cram extension.")
224+
225+
cram_indices_with_wrong_extension = [c for c in cram_indices if not c.endswith('.crai')]
226+
if cram_indices_with_wrong_extension:
227+
qc_messages.append(f"The following CRAM index files do not have a .crai extension: {', '.join(cram_indices_with_wrong_extension)}")
228+
else:
229+
print("All CRAM index files have the correct .crai extension.")
230+
231+
# Validate that cram paths are unique
232+
unique_crams = set(crams)
233+
if len(unique_crams) != num_crams:
234+
duplicates = [c for c in unique_crams if crams.count(c) > 1]
235+
qc_messages.append(f"Duplicate CRAM paths found: {', '.join(sorted(duplicates))}")
236+
else:
237+
print("CRAM paths are unique.")
238+
239+
# Ensure that all CRAM files are less than the maximum file size allowed
240+
crams_exceeding_max_size = []
241+
for cram in crams:
242+
try:
243+
cmd = ['gcloud', 'storage', 'ls', '-L', cram]
244+
if billing_project_flag:
245+
cmd.extend(billing_project_flag.split())
246+
247+
result = subprocess.run(cmd, capture_output=True, text=True, check=True)
248+
249+
# Parse Content-Length from output
250+
match = re.search(r'Content-Length:\s+(\d+)', result.stdout)
251+
if match:
252+
file_size_bytes = int(match.group(1))
253+
file_size_gb = file_size_bytes // (1024 ** 3)
254+
255+
if file_size_gb > max_cram_file_size_gb:
256+
crams_exceeding_max_size.append(f"{cram} ({file_size_gb}GB)")
257+
except subprocess.CalledProcessError as e:
258+
qc_messages.append(f"Error checking file size for {cram}: {e.stderr}")
259+
260+
if crams_exceeding_max_size:
261+
qc_messages.append(f"The following CRAM files exceed the maximum allowed file size of {max_cram_file_size_gb}GB: {', '.join(crams_exceeding_max_size)}")
262+
else:
263+
print(f"All CRAM files are within the maximum allowed file size of {max_cram_file_size_gb}GB.")
264+
265+
# Write output files
266+
with open("qc_messages.txt", 'w') as f:
267+
f.write('\n'.join(qc_messages) if qc_messages else '')
268+
269+
with open("passes_qc.txt", 'w') as f:
270+
f.write("true" if not qc_messages else "false")
271+
272+
EOF
273+
python3 script.py
237274
>>>
238275

239276
runtime {

0 commit comments

Comments
 (0)