Skip to content

Commit 64798a7

Browse files
authored
Run cnmops on chromosome arms separately (#925)
1 parent 689eb83 commit 64798a7

File tree

6 files changed

+136
-29
lines changed

6 files changed

+136
-29
lines changed
Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
import gzip
2+
import argparse
3+
import re
4+
from collections import defaultdict
5+
from dataclasses import dataclass
6+
7+
8+
@dataclass
9+
class ArmData:
10+
p_end: int = 0
11+
q_start: int = None
12+
q_end: int = 0
13+
14+
15+
def natural_sort_key(s):
16+
return [int(text) if text.isdigit() else text.lower() for text in re.split("([0-9]+)", s)]
17+
18+
19+
def main(input_file, output_file):
20+
arms = defaultdict(ArmData)
21+
22+
with gzip.open(input_file, "rt") as f:
23+
for line in f:
24+
if line.startswith("#") or not line.strip():
25+
continue
26+
27+
parts = line.strip().split("\t")
28+
chrom, start, end, band = parts[0], int(parts[1]), int(parts[2]), parts[3]
29+
arm_type = band[0].lower()
30+
31+
if arm_type == "p":
32+
if end > arms[chrom].p_end:
33+
arms[chrom].p_end = end
34+
elif arm_type == "q":
35+
# Convert 0-based BED start to 1-based GATK start
36+
one_based_start = start + 1
37+
if arms[chrom].q_start is None:
38+
arms[chrom].q_start = one_based_start
39+
arms[chrom].q_end = end
40+
41+
with open(output_file, "w") as out:
42+
for chrom in sorted(arms.keys(), key=natural_sort_key):
43+
a = arms[chrom]
44+
45+
# P-arm starts at 1, Q-arm starts at the first q-band + 1
46+
p_range = f"1-{a.p_end}" if a.p_end > 0 else "0"
47+
q_range = f"{a.q_start}-{a.q_end}" if a.q_start is not None else "0"
48+
49+
# Output format: chrom \t p-interval \t q-interval
50+
out.write(f"{chrom}\t{p_range}\t{q_range}\n")
51+
52+
53+
if __name__ == "__main__":
54+
parser = argparse.ArgumentParser()
55+
parser.add_argument("-i", "--input", required=True)
56+
parser.add_argument("-o", "--output", required=True)
57+
args = parser.parse_args()
58+
59+
main(args.input, args.output)

src/sv_shell/cnmops.sh

Lines changed: 63 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -19,21 +19,23 @@ JVM_MAX_MEM=$(getJavaMem MemTotal)
1919
echo "JVM memory: $JVM_MAX_MEM"
2020

2121

22-
function CNSampleNormal() {
22+
function CNSampleNormalArm() {
2323
local _chr=$1
2424
local _mode=$2
2525
local _r=$3
26+
local _interval=${4:-${_chr}}
2627

2728
echo "----------- Starting CN Sample Normal -------------"
2829
echo "chr: ${_chr}"
30+
echo "interval: ${_interval}"
2931
echo "mode: ${_mode}"
3032
echo "r: ${_r}"
3133
echo "---------------------------------------------------"
3234

3335
java "-Xmx${JVM_MAX_MEM}" -jar /opt/gatk.jar PrintSVEvidence \
3436
--sequence-dictionary "${ref_dict}" \
3537
--evidence-file "${bincov_matrix}" \
36-
-L "${_chr}" \
38+
-L "${_interval}" \
3739
-O "${_chr}.RD.txt"
3840

3941
if [ "${_mode}" == "normal" ]; then
@@ -50,23 +52,55 @@ function CNSampleNormal() {
5052
EMPTY_OUTPUT_ERROR="No CNV regions in result object. Rerun cn.mops with different parameters!"
5153
set +e
5254
echo "Starting to run cnMOPS_workflow"
53-
bash /opt/WGD/bin/cnMOPS_workflow.sh -S "${exclude_list}" -x "${exclude_list}" -r "${_r}" -o . -M "${_chr}.${_mode}.RD.txt" </dev/null wor2>&1 | tee cnmops.out
55+
bash /opt/WGD/bin/cnMOPS_workflow.sh -S "${exclude_list}" -x "${exclude_list}" -r "${_r}" -o . -M "${_chr}.${_mode}.RD.txt" </dev/null 2>&1 | tee cnmops.out
5456
echo "Finished running cnMOPS_workflow"
5557
RC=$?
5658
set -e
57-
if [ ! $RC -eq 0 ]; then
58-
if grep -q "$EMPTY_OUTPUT_ERROR" "cnmops.out"; then
59-
touch calls/cnMOPS.cnMOPS.gff
60-
else
61-
echo "cnMOPS_workflow.sh returned a non-zero code that was not due to an empty call file."
62-
exit $RC
63-
fi
59+
if grep -q "$EMPTY_OUTPUT_ERROR" "cnmops.out"; then
60+
echo "No CNV regions detected, creating empty GFF."
61+
touch calls/cnMOPS.cnMOPS.gff
62+
elif [ ! $RC -eq 0 ]; then
63+
echo "cnMOPS_workflow.sh returned a non-zero code that was not due to an empty call file."
64+
exit $RC
65+
elif [ ! -f calls/cnMOPS.cnMOPS.gff ]; then
66+
echo "cnMOPS_workflow.sh succeeded but did not produce calls/cnMOPS.cnMOPS.gff. Creating empty file."
67+
touch calls/cnMOPS.cnMOPS.gff
6468
fi
6569

6670
echo "----------- Finished CN Sample Normal -------------"
6771
}
6872

6973

74+
function CNSampleNormal() {
75+
local _chr=$1
76+
local _mode=$2
77+
local _r=$3
78+
local _parent_dir
79+
_parent_dir=$(pwd)
80+
local _arm_gffs=()
81+
82+
local _arm_idx=0
83+
for _arm_range in $(awk -v chr="${_chr}" '$1 == chr {for(i=2;i<=NF;i++) print $i}' "${chr_arms_file}"); do
84+
_arm_idx=$((_arm_idx + 1))
85+
local _arm_dir="${_parent_dir}/arm_${_arm_idx}"
86+
mkdir -p "${_arm_dir}"
87+
cd "${_arm_dir}"
88+
89+
CNSampleNormalArm "${_chr}_arm${_arm_idx}" "${_mode}" "${_r}" "${_chr}:${_arm_range}"
90+
_arm_gffs+=("${_arm_dir}/calls/cnMOPS.cnMOPS.gff")
91+
92+
cd "${_parent_dir}"
93+
done
94+
95+
mkdir -p calls
96+
if [ ${#_arm_gffs[@]} -gt 0 ]; then
97+
cat "${_arm_gffs[@]}" > calls/cnMOPS.cnMOPS.gff
98+
else
99+
touch calls/cnMOPS.cnMOPS.gff
100+
fi
101+
}
102+
103+
70104
function CleanCNMops() {
71105
local _sample_list=$1
72106
local _cnmops_gff=$2
@@ -149,25 +183,32 @@ working_dir="$(realpath ${working_dir})"
149183
cd "${working_dir}"
150184
echo "cnMOPS Working directory: ${working_dir}"
151185

152-
batch=($(jq -r '.batch' "$input_json"))
153-
allo_file=($(jq -r '.allo_file' "$input_json"))
154-
chrom_file=($(jq -r '.chrom_file' "$input_json"))
155-
exclude_list=($(jq -r '.exclude_list' "$input_json"))
156-
ped_file=($(jq -r '.ped_file' "$input_json"))
157-
r1=($(jq -r '.r1' "$input_json"))
158-
r2=($(jq -r '.r2' "$input_json"))
159-
ref_dict=($(jq -r '.ref_dict' "$input_json"))
160-
bincov_matrix=($(jq -r '.bincov_matrix' "$input_json"))
161-
stitch_and_clean_large_events=($(jq -r '.stitch_and_clean_large_events' "$input_json"))
162-
min_size=($(jq -r '.min_size' "$input_json"))
163-
prefix=($(jq -r '.prefix' "$input_json"))
186+
batch=$(jq -r '.batch' "$input_json")
187+
allo_file=$(jq -r '.allo_file' "$input_json")
188+
chrom_file=$(jq -r '.chrom_file' "$input_json")
189+
exclude_list=$(jq -r '.exclude_list' "$input_json")
190+
ped_file=$(jq -r '.ped_file' "$input_json")
191+
r1=$(jq -r '.r1' "$input_json")
192+
r2=$(jq -r '.r2' "$input_json")
193+
ref_dict=$(jq -r '.ref_dict' "$input_json")
194+
bincov_matrix=$(jq -r '.bincov_matrix' "$input_json")
195+
stitch_and_clean_large_events=$(jq -r '.stitch_and_clean_large_events' "$input_json")
196+
min_size=$(jq -r '.min_size' "$input_json")
197+
prefix=$(jq -r '.prefix' "$input_json")
198+
cytobands=$(jq -r '.cytobands' "$input_json")
164199

165200

166201
# -------------------------------------------------------
167202
# ======================= Command =======================
168203
# -------------------------------------------------------
169204

170205

206+
# Get chr arms from cytobands
207+
# ---------------------------
208+
chr_arms_file=$(realpath "chr_arms.bed")
209+
python /opt/sv-pipeline/scripts/cytoband_to_arms.py -i "${cytobands}" -o "${chr_arms_file}"
210+
211+
171212
male_r1_gff=()
172213
male_r2_gff=()
173214
allos=($(awk '{print $1}' "${allo_file}"))

src/sv_shell/gather_batch_evidence.sh

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -186,6 +186,7 @@ cnmops_inputs_json="$(realpath "${output_dir}/cnmops_inputs.json")"
186186
cnmops_outputs_json="$(realpath "${output_dir}/cnmops_outputs.json")"
187187

188188
jq -n \
189+
--slurpfile inputs "${input_json}" \
189190
--arg r1 "3" \
190191
--arg r2 "10" \
191192
--arg batch "${batch}" \
@@ -213,7 +214,8 @@ jq -n \
213214
"allo_file": $allo_file,
214215
"ref_dict": $ref_dict,
215216
"prefix": $prefix,
216-
"stitch_and_clean_large_events": $stitch_and_clean_large_events
217+
"stitch_and_clean_large_events": $stitch_and_clean_large_events,
218+
"cytobands": $inputs[0].cytobands
217219
}' > "${cnmops_inputs_json}"
218220

219221
bash /opt/sv_shell/cnmops.sh "${cnmops_inputs_json}" "${cnmops_outputs_json}"
@@ -229,6 +231,7 @@ cnmops_large_inputs_json="$(realpath "${output_dir}/cnmops_large_inputs.json")"
229231
cnmops_large_outputs_json="$(realpath "${output_dir}/cnmops_large_outputs.json")"
230232

231233
jq -n \
234+
--slurpfile inputs "${input_json}" \
232235
--arg r1 "1000" \
233236
--arg r2 "100" \
234237
--arg batch "${batch}" \
@@ -256,7 +259,8 @@ jq -n \
256259
"allo_file": $allo_file,
257260
"ref_dict": $ref_dict,
258261
"prefix": $prefix,
259-
"stitch_and_clean_large_events": $stitch_and_clean_large_events
262+
"stitch_and_clean_large_events": $stitch_and_clean_large_events,
263+
"cytobands": $inputs[0].cytobands
260264
}' > "${cnmops_large_inputs_json}"
261265

262266
bash /opt/sv_shell/cnmops.sh "${cnmops_large_inputs_json}" "${cnmops_large_outputs_json}"

src/sv_shell/sample_inputs/cnmops.json

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,14 +2,15 @@
22
"batch": "NA12878",
33
"r1": "3",
44
"r2": "10",
5-
"bincov_matrix": "/inputs/NA12878.RD.txt.gz",
5+
"bincov_matrix": "/intermediate/NA12878.RD.txt.gz",
66
"chrom_file": "/inputs/autosome.fai",
7-
"ped_file": "/inputs/combined_ped_file.ped",
7+
"ped_file": "/intermediate/combined_ped_file.ped",
88
"exclude_list": "/inputs/GRCh38_Nmask.bed",
99
"allo_file": "/inputs/allosome.fai",
1010
"ref_dict": "/inputs/Homo_sapiens_assembly38.dict",
1111
"samples": [ "NA12878", "HG00096", "HG00129", "HG00140", "HG00150", "HG00187", "HG00239", "HG00277", "HG00288", "HG00337", "HG00349", "HG00375", "HG00410", "HG00457", "HG00557", "HG00599", "HG00625", "HG00701", "HG00740", "HG00844", "HG01060", "HG01085", "HG01112", "HG01275", "HG01325", "HG01344", "HG01356", "HG01384", "HG01393", "HG01396", "HG01474", "HG01507", "HG01572", "HG01607", "HG01709", "HG01747", "HG01790", "HG01794", "HG01799", "HG01861", "HG01874", "HG01880", "HG01885", "HG01958", "HG01982", "HG02002", "HG02010", "HG02019", "HG02020", "HG02069", "HG02085", "HG02186", "HG02221", "HG02235", "HG02272", "HG02275", "HG02299", "HG02332", "HG02367", "HG02374", "HG02489", "HG02490", "HG02491", "HG02586", "HG02588", "HG02611", "HG02620", "HG02642", "HG02648", "HG02658", "HG02855", "HG02953", "HG03007", "HG03009", "HG03085", "HG03099", "HG03100", "HG03111", "HG03369", "HG03370", "HG03436", "HG03449", "HG03472", "HG03476", "HG03556", "HG03604", "HG03649", "HG03684", "HG03694", "HG03709", "HG03722", "HG03727", "HG03744", "HG03756", "HG03789", "HG03850", "HG03864", "HG03872", "HG03888", "HG04118", "HG04158", "HG04161", "HG04183", "NA06984", "NA10847", "NA11894", "NA12340", "NA12489", "NA12872", "NA18499", "NA18507", "NA18530", "NA18539", "NA18549", "NA18553", "NA18560", "NA18638", "NA18923", "NA18941", "NA18945", "NA18956", "NA18995", "NA19001", "NA19035", "NA19062", "NA19102", "NA19143", "NA19184", "NA19350", "NA19351", "NA19377", "NA19443", "NA19449", "NA19661", "NA19678", "NA19679", "NA19684", "NA19746", "NA19795", "NA19818", "NA19913", "NA20126", "NA20320", "NA20321", "NA20346", "NA20509", "NA20510", "NA20522", "NA20752", "NA20764", "NA20802", "NA20845", "NA20869", "NA20895", "NA21102", "NA21122", "NA21133" ],
1212
"prefix": "header",
1313
"stitch_and_clean_large_events": false,
14-
"min_size": 1000000
14+
"min_size": 1000000,
15+
"cytobands": "/inputs/cytobands_hg38.bed.gz"
1516
}

src/sv_shell/sample_inputs/gather_batch_evidence.json

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -520,5 +520,6 @@
520520
"wham_vcfs": [
521521
"/inputs/NA12878.wham.vcf.gz"
522522
],
523-
"min_svsize": 50
523+
"min_svsize": 50,
524+
"cytobands": "/inputs/cytobands_hg38.bed.gz"
524525
}

src/sv_shell/single_sample_pipeline.sh

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -319,7 +319,8 @@ jq -n \
319319
run_module_metrics: $inputs[0].run_batchevidence_metrics,
320320
median_cov_mem_gb_per_sample: $inputs[0].median_cov_mem_gb_per_sample,
321321
ref_panel_median_cov: $inputs[0].ref_panel_median_cov,
322-
sample_median_cov: $eqc_outputs[0].bincov_median
322+
sample_median_cov: $eqc_outputs[0].bincov_median,
323+
"cytobands": $inputs[0].cytobands,
323324
}' > "${gather_batch_evidence_inputs_json_filename}"
324325

325326
bash /opt/sv_shell/gather_batch_evidence.sh \

0 commit comments

Comments
 (0)