Skip to content

Commit 3e34b6b

Browse files
committed
v0.1.9: Fix issues dealing with complex input samples. Handle inputs with identical file names. Thanks to Severine Catreux. Resort multisample inputs that have inconsistent sample orders. Fixes bcbio/bcbio-nextgen#653
1 parent d54089e commit 3e34b6b

File tree

10 files changed

+135
-12
lines changed

10 files changed

+135
-12
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ test/data/*-filter*vcf
2727
test/data/*-nomnp*vcf
2828
test/data/*-prep*vcf
2929
test/data/*-preclean*vcf
30+
test/data/*-evalsafe*vcf
3031
test/data/*-nofilter*vcf
3132
test/data/*-*subset*vcf
3233
test/data/*-*haploid.vcf

HISTORY.md

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,10 @@
1+
## 0.1.9 (6 November 2014)
2+
3+
- Handle file inputs with identical final file names (but in different
4+
directories) by uniquifying the names. Thanks to Severine Catreux.
5+
- Resort multisample inputs to ensemble calling with inconsistent sample orders.
6+
Thanks to Shangqian.
7+
18
## 0.1.8 (29 October 2014)
29

310
- Fix variant-prep problem for multiple sample input where headers would get out

README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ associated with different variant representations.
3333

3434
### Download
3535

36-
The latest release is 0.1.8 (29 October 2014): [bcbio.variation-0.1.8-standalone.jar][dl].
36+
The latest release is 0.1.9 (6 November 2014): [bcbio.variation-0.1.9-standalone.jar][dl].
3737
Run from the command line:
3838

3939
$ java -jar bcbio.variation-VERSION-standalone.jar [arguments]
@@ -44,7 +44,7 @@ the library for variant comparison, normalization and ensemble calling. Note
4444
that bcbio.variation requires Java 1.7 since the underlying GATK libraries are
4545
not compatible with earlier versions.
4646

47-
[dl]: https://github.com/chapmanb/bcbio.variation/releases/download/v0.1.8/bcbio.variation-0.1.8-standalone.jar
47+
[dl]: https://github.com/chapmanb/bcbio.variation/releases/download/v0.1.9/bcbio.variation-0.1.9-standalone.jar
4848

4949
### As a library
5050

project.clj

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,12 @@
1-
(defproject bcbio.variation "0.1.8"
1+
(defproject bcbio.variation "0.1.9"
22
:description "Toolkit to analyze genomic variation data, built on the GATK with Clojure"
33
:license {:name "MIT" :url "http://www.opensource.org/licenses/mit-license.html"}
44
:dependencies [[org.clojure/clojure "1.5.1"]
55
[org.clojure/math.combinatorics "0.0.3" :exclusions [org.clojure/clojure]]
66
[org.clojure/data.csv "0.1.2" :exclusions [org.clojure/clojure]]
77
[org.clojure/tools.cli "0.2.2"]
88
[clj-stacktrace "0.2.5"]
9-
[bcbio.run "0.0.1-SNAPSHOT"]
9+
[bcbio.run "0.0.1"]
1010
;; GATK requirements
1111
[org.clojars.chapmanb/gatk-engine "3.2"]
1212
[org.clojars.chapmanb/gatk-tools-public "3.2"]

src/bcbio/variation/combine.clj

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -177,7 +177,7 @@
177177
out-fname (str (get-out-basename exp call in-files) ".vcf")
178178
_ (transition :clean (str "Cleaning input VCF: " (:name call)))
179179
clean-files (vec (map #(if-not (:preclean call) %
180-
(clean-problem-vcf % (:ref exp) (:sample exp) call :out-dir out-dir))
180+
(clean-problem-vcf % (:ref exp) (:sample exp) call exp :out-dir out-dir))
181181
in-files))
182182
_ (transition :merge (str "Merging multiple input files: " (:name call)))
183183
merge-file (if (> (count clean-files) 1)

src/bcbio/variation/ensemble.clj

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
[bcbio.run.fsp :as fsp]
1313
[bcbio.run.itx :as itx]
1414
[bcbio.variation.compare :as compare]
15+
[bcbio.variation.vcfsample :as vcfsample]
1516
[bcbio.variation.variantcontext :as gvc]))
1617

1718
(defn- setup-work-dir
@@ -102,10 +103,11 @@
102103
Handles cleaning up and normalizing input files, generating consensus
103104
calls and returns ensemble output."
104105
[vrn-files ref-file out-file in-config]
105-
(let [vrn-files (map fsp/abspath vrn-files)
106-
out-file (fsp/abspath out-file)
106+
(let [out-file (fsp/abspath out-file)
107107
ref-file (fsp/abspath ref-file)
108108
dirs (setup-work-dir out-file)
109+
vrn-files (vcfsample/consistent-order (map fsp/abspath vrn-files)
110+
(str (fs/file (:prep dirs) "sort")))
109111
config-file (create-ready-config vrn-files ref-file in-config dirs)]
110112
(check-vcf-headers vrn-files)
111113
(compare/variant-comparison-from-config config-file)

src/bcbio/variation/normalize.clj

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
[lonocloud.synthread :as ->]
1818
[bcbio.run.fsp :as fsp]
1919
[bcbio.run.itx :as itx]
20+
[bcbio.variation.multisample :as multisample]
2021
[bcbio.variation.structural :as structural]
2122
[bcbio.variation.variantcontext :as gvc]
2223
[taoensso.timbre :as timbre]))
@@ -566,9 +567,10 @@
566567
- Gap characters (-) found in REF or ALT indels.
567568
- Fixes indels without reference padding or N padding.
568569
- Removes spaces in INFO fields."
569-
[in-vcf-file ref-file sample call & {:keys [out-dir]}]
570+
[in-vcf-file ref-file sample call exp & {:keys [out-dir]}]
570571
(let [get-ref-base (ref-base-getter ref-file)
571-
out-file (string/replace (fsp/add-file-part in-vcf-file "preclean" out-dir) ".vcf.gz" ".vcf")]
572+
out-file (str (file out-dir
573+
(str (multisample/get-out-basename exp call [in-vcf-file]) "-preclean.vcf")))]
572574
(letfn [(remove-gap [n xs]
573575
(assoc xs n
574576
(-> (nth xs n)

src/bcbio/variation/variantcontext.clj

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -147,7 +147,7 @@
147147
(defn get-samples
148148
"Retrieve samples from VCF header"
149149
[vcf-file]
150-
(.getGenotypeSamples (get-vcf-header vcf-file)))
150+
(map str (.getGenotypeSamples (get-vcf-header vcf-file))))
151151

152152
;; ## Writing VCF files
153153

src/bcbio/variation/vcfsample.clj

Lines changed: 110 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,110 @@
1+
(ns bcbio.variation.vcfsample
2+
"Sort VCF sample columns to have a consistent order between multiple inputs.
3+
Variant callers order called outputs differently and this ensures they are
4+
consistent to feed into ensemble calling."
5+
(:import [htsjdk.samtools.util BlockCompressedInputStream BlockCompressedOutputStream])
6+
(:require [bcbio.run.fsp :as fsp]
7+
[bcbio.run.itx :as itx]
8+
[bcbio.variation.variantcontext :as vc]
9+
[clojure.core.strint :refer [<<]]
10+
[clojure.java.io :as io]
11+
[clojure.string :as string]
12+
[me.raynes.fs :as fs]))
13+
14+
;; ## bgzipped and indexed files
15+
16+
(defn- tabix-index-vcf
17+
"Tabix index input VCF inside a transactional directory."
18+
[bgzip-file]
19+
(let [tabix-file (str bgzip-file ".tbi")]
20+
(when (or (itx/needs-run? tabix-file) (not (itx/up-to-date? tabix-file bgzip-file)))
21+
(itx/with-tx-file [tx-tabix-file tabix-file]
22+
(let [tx-bgzip-file (fsp/file-root tx-tabix-file)
23+
full-bgzip-file (str (fs/file bgzip-file))
24+
tmp-dir (str (fs/parent tx-bgzip-file))]
25+
(itx/check-run (<< "ln -s ~{full-bgzip-file} ~{tx-bgzip-file}"))
26+
(itx/check-run (<< "bcftools tabix -p vcf ~{tx-bgzip-file}")))))
27+
tabix-file))
28+
29+
(defn bgzip-index-vcf
30+
"Prepare a VCF file for positional query with bgzip and tabix indexing."
31+
[vcf-file & {:keys [remove-orig? remove-nopass? dir]}]
32+
(let [out-orig (str (fsp/file-root vcf-file)
33+
(if remove-nopass? "-passonly" "")
34+
".vcf.gz")
35+
out-file (if dir (str (io/file dir (fs/base-name out-orig))) out-orig)]
36+
(if remove-nopass?
37+
(itx/run-cmd out-file "bcftools view -f 'PASS,.' ~{vcf-file} | bgzip -c > ~{out-file}")
38+
(itx/run-cmd out-file "bgzip -c ~{vcf-file} > ~{out-file}"))
39+
(when (and (not (.endsWith vcf-file ".gz")) remove-orig?)
40+
(fsp/remove-path vcf-file))
41+
(tabix-index-vcf out-file)
42+
out-file))
43+
44+
;; ## Unique file names
45+
46+
(defn unique-work-file
47+
"Create a work file with unique name in case of shared base names."
48+
[orig-file ext all-files work-dir]
49+
(let [cmp-files (remove #(= % orig-file) all-files)
50+
parts (reverse (string/split orig-file #"/"))
51+
unique-file (loop [i 1]
52+
(let [cur-name (string/join "-" (reverse (take i parts)))]
53+
(if (not-any? #(.endsWith % cur-name) cmp-files)
54+
cur-name
55+
(recur (inc i)))))]
56+
(fsp/add-file-part unique-file ext work-dir)))
57+
58+
;; ## Consistent order sorting
59+
60+
(defn- sort-sample-line
61+
"Sort samples in a VCF line using reordered indexes from calculate-reorder."
62+
[line reorder]
63+
(let [[keep samples] (split-at 9 (string/split line #"\t"))]
64+
(string/join "\t"
65+
(concat keep
66+
(->> samples
67+
(map-indexed vector)
68+
(sort-by (fn [[i x]] (get reorder i)))
69+
(map second))))))
70+
71+
(defn- calculate-reorder
72+
"Create a dictionary of sample indexes in the original VCF to those desired in the output."
73+
[orig-order want-order]
74+
(let [want-indexes (reduce (fn [coll [i x]]
75+
(assoc coll x i))
76+
{} (map-indexed vector want-order))]
77+
(reduce (fn [coll [i x]]
78+
(assoc coll i (get want-indexes x)))
79+
{} (map-indexed vector orig-order))))
80+
81+
(defn- sort-samples
82+
"Sort samples in a VCF file, moving from orig-order to want-order."
83+
[vcf-file orig-order want-order all-vcfs work-dir]
84+
(let [out-file (unique-work-file vcf-file "ssort" all-vcfs work-dir)
85+
sample-reorder (calculate-reorder orig-order want-order)]
86+
(with-open [rdr (io/reader (BlockCompressedInputStream. (io/file vcf-file)))
87+
wtr (io/writer (BlockCompressedOutputStream. (io/file out-file)))]
88+
(doseq [line (line-seq rdr)]
89+
(.write wtr (str (if (.startsWith line "##")
90+
line
91+
(sort-sample-line line sample-reorder))
92+
"\n"))))
93+
(bgzip-index-vcf out-file)
94+
out-file))
95+
96+
(defn- maybe-sort-names
97+
"Extract sample names for the current file and do sorting if needed."
98+
[vcf-file sorder all-vcfs work-dir]
99+
(let [cur-sorder (vc/get-samples vcf-file)]
100+
(if (not= cur-sorder sorder)
101+
(sort-samples vcf-file cur-sorder sorder all-vcfs work-dir)
102+
vcf-file)))
103+
104+
(defn consistent-order
105+
"Ensure the set of VCF files have a consistent sample order relative to the first."
106+
[vcf-files work-dir]
107+
(fsp/safe-mkdir work-dir)
108+
(let [sorder (vc/get-samples (first vcf-files))]
109+
(cons (first vcf-files)
110+
(map #(maybe-sort-names % sorder vcf-files work-dir) (rest vcf-files)))))

test/bcbio/variation/test/compare.clj

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -177,13 +177,14 @@
177177
vcf (str (fs/file data-dir "cg-normalize.vcf"))
178178
out-vcf (fsp/add-file-part vcf "prep")
179179
prevcf (str (fs/file data-dir "illumina-needclean.vcf"))
180-
out-prevcf (fsp/add-file-part prevcf "preclean")]
180+
out-prevcf (str (fs/file data-dir "NA12878-tester-preclean.vcf"))]
181181
(against-background [(before :facts (vec (map fsp/remove-path [out-vcf out-prevcf
182182
(str vcf ".idx")])))]
183183
(facts "Normalize variant representation of chromosomes, order, genotypes and samples."
184184
(prep-vcf vcf ref "Test1" :config {:prep-sort-pos true}) => out-vcf)
185185
(facts "Pre-cleaning of problematic VCF input files"
186-
(clean-problem-vcf prevcf ref "NA12878" {}) => out-prevcf)))
186+
(clean-problem-vcf prevcf ref "NA12878" {:name "tester"} {:sample "NA12878"}
187+
:out-dir data-dir) => out-prevcf)))
187188

188189
(facts "Choose a reference genome based on VCF contig"
189190
(pick-best-ref vcf1 [ref]) => ref)

0 commit comments

Comments
 (0)