Skip to content

Commit 3b0bc03

Browse files
authored
Preprocessing step for RSEM (#7752)
Implement PostProcessReadsForRSEM
1 parent 8bce5ed commit 3b0bc03

File tree

5 files changed

+516
-0
lines changed

5 files changed

+516
-0
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,258 @@
1+
package org.broadinstitute.hellbender.tools.walkers.qc;
2+
3+
import htsjdk.samtools.CigarOperator;
4+
import htsjdk.samtools.SAMFileHeader;
5+
import org.apache.commons.lang3.tuple.ImmutablePair;
6+
import org.apache.commons.lang3.tuple.Pair;
7+
import org.apache.http.annotation.Experimental;
8+
import org.broadinstitute.barclay.argparser.Argument;
9+
import org.broadinstitute.barclay.argparser.BetaFeature;
10+
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
11+
import org.broadinstitute.barclay.help.DocumentedFeature;
12+
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
13+
import org.broadinstitute.hellbender.engine.*;
14+
import org.broadinstitute.hellbender.engine.filters.CountingReadFilter;
15+
import org.broadinstitute.hellbender.engine.filters.ReadFilter;
16+
import org.broadinstitute.hellbender.engine.filters.ReadFilterLibrary;
17+
import org.broadinstitute.hellbender.exceptions.UserException;
18+
import org.broadinstitute.hellbender.utils.Utils;
19+
import org.broadinstitute.hellbender.utils.read.GATKRead;
20+
import org.broadinstitute.hellbender.utils.read.SAMFileGATKReadWriter;
21+
import picard.cmdline.programgroups.ReadDataManipulationProgramGroup;
22+
23+
import java.util.*;
24+
import java.util.stream.Collectors;
25+
26+
/**
27+
* Performs post-processing steps to get a bam aligned to a transcriptome ready for RSEM (https://github.com/deweylab/RSEM)
28+
*
29+
*
30+
* Suppose the read name "Q1" aligns to multiple loci in the transcriptome.
31+
* STAR aligner outputs the reads in the following order:
32+
*
33+
* Q1: First-of-pair (Chr 1:1000)
34+
* Q1: Second-of-pair (Chr 1:2000)
35+
* Q1: First-of-pair (Chr 20:5000)
36+
* Q1: Second-of-pair (Chr 20:6000)
37+
*
38+
* This is the format required by RSEM. After query-name sorting the reads for duplicate marking,
39+
* the reads will be ordered as follows;
40+
*
41+
* Q1: First-of-pair (Chr 1:1000)
42+
* Q1: First-of-pair (Chr 20:5000)
43+
* Q1: Second-of-pair (Chr 1:2000)
44+
* Q1: Second-of-pair (Chr 20:6000)
45+
*
46+
* That is, all the read1's come first, then read2's.
47+
*
48+
* This tool reorders such that the alignment pair appears together as in the first example.
49+
*
50+
* Caveat: It may be desirable to remove duplicate reads before running RSEM.
51+
* This tool does not remove duplicate reads; it assumes they have been removed upstream e.g.
52+
* MarkDuplicates with REMOVE_DUPLICATES=true
53+
*
54+
*
55+
* Usage Example:
56+
*
57+
* gatk PostProcessReadsForRSEM \
58+
* -I input.bam \
59+
* -O output.bam
60+
*/
61+
@CommandLineProgramProperties(
62+
summary = "Reorder reads before running RSEM",
63+
oneLineSummary = "Reorder reads before running RSEM",
64+
programGroup = ReadDataManipulationProgramGroup.class
65+
)
66+
67+
@BetaFeature
68+
@DocumentedFeature
69+
public class PostProcessReadsForRSEM extends GATKTool {
70+
@Argument(fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME, shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME)
71+
public GATKPath outSam;
72+
73+
SAMFileGATKReadWriter writer;
74+
75+
// Should use CountingFilter or another existing framework for counting and printing filtered reads
76+
int totalOutputPair = 0;
77+
int notBothMapped = 0;
78+
int chimera = 0;
79+
int unsupportedCigar = 0;
80+
81+
@Override
82+
public boolean requiresReads(){
83+
return true;
84+
}
85+
86+
@Override
87+
public void onTraversalStart(){
88+
Utils.nonNull(getHeaderForReads(), "Input bam must have a header");
89+
if (getHeaderForReads().getSortOrder() != SAMFileHeader.SortOrder.queryname){
90+
throw new UserException("Input must be query-name sorted.");
91+
}
92+
93+
writer = createSAMWriter(outSam, true);
94+
}
95+
96+
@Override
97+
public List<ReadFilter> getDefaultReadFilters() {
98+
return Collections.singletonList(ReadFilterLibrary.NOT_SUPPLEMENTARY_ALIGNMENT);
99+
}
100+
101+
@Override
102+
public void traverse() {
103+
final CountingReadFilter countingReadFilter = makeReadFilter();
104+
final Iterator<GATKRead> readIterator = getTransformedReadStream(countingReadFilter).iterator();
105+
ReadPair currentReadPair = null;
106+
107+
// Initialize the first ReadPair object
108+
if (readIterator.hasNext()) {
109+
currentReadPair = new ReadPair(readIterator.next());
110+
totalOutputPair++;
111+
}
112+
113+
while (readIterator.hasNext()){
114+
final GATKRead read = readIterator.next();
115+
if (!currentReadPair.getName().equals(read.getName())){
116+
// We have gathered all the reads with the same query name (primary, secondary, and supplementary alignments)
117+
// Write the reads to output file, reordering the reads as needed.
118+
writeReads(currentReadPair);
119+
currentReadPair = new ReadPair(read);
120+
} else {
121+
currentReadPair.add(read);
122+
}
123+
progressMeter.update(read);
124+
}
125+
126+
if (currentReadPair != null){
127+
writeReads(currentReadPair);
128+
}
129+
130+
}
131+
132+
/**
133+
* For the list of conditions required by RSEM, see: https://github.com/deweylab/RSEM/blob/master/samValidator.cpp
134+
*/
135+
public boolean passesRSEMFilter(final GATKRead read1, final GATKRead read2) {
136+
if (read1 == null || read2 == null){
137+
logger.warn("read1 or read2 is null. This read will not be output. " + read1.getName());
138+
return false;
139+
}
140+
// If either of the pair is unmapped, throw out.
141+
// With the STAR argument --quantTranscriptomeBan IndelSoftclipSingleend this should not occur,
142+
// but we check just to be thorough.
143+
if (read1.isUnmapped() || read2.isUnmapped()){
144+
notBothMapped++;
145+
return false;
146+
}
147+
148+
// Chimeric reads are not allowed. Chimeric alignments should be just as suspect (or potentially interesting)
149+
// in the case of transcriptome as in the genome.
150+
if (!read1.contigsMatch(read2)){
151+
chimera++;
152+
return false;
153+
}
154+
155+
// Cigar must have a single operator M.
156+
if (read1.numCigarElements() != 1 || read2.numCigarElements() != 1){
157+
unsupportedCigar++;
158+
return false;
159+
}
160+
161+
if (read1.getCigarElement(0).getOperator() != CigarOperator.M ||
162+
read2.getCigarElement(0).getOperator() != CigarOperator.M){
163+
unsupportedCigar++;
164+
return false;
165+
} else {
166+
return true;
167+
}
168+
}
169+
170+
/** Write reads in this order
171+
* 1. Primary. First of pair, second of pair
172+
* 2. For each secondary. First of pair, second of pair.
173+
* **/
174+
private void writeReads(final ReadPair readPair){
175+
// Update read by side effect
176+
final GATKRead firstOfPair = readPair.getFirstOfPair();
177+
final GATKRead secondOfPair = readPair.getSecondOfPair();
178+
179+
// Write Primary Reads. If either fails, we discard both, and we don't bother with the secondary alignments.
180+
if (passesRSEMFilter(firstOfPair, secondOfPair)){
181+
writer.addRead(firstOfPair);
182+
writer.addRead(secondOfPair);
183+
totalOutputPair += 1;
184+
185+
// Now handle secondary alignments.
186+
final List<Pair<GATKRead, GATKRead>> secondaryAlignmentPairs = groupSecondaryReads(readPair.getSecondaryAlignments());
187+
for (Pair<GATKRead, GATKRead> mates : secondaryAlignmentPairs){
188+
// The pair is either both written or both not written.
189+
if (passesRSEMFilter(mates.getLeft(), mates.getRight())) {
190+
writer.addRead(mates.getLeft());
191+
writer.addRead(mates.getRight());
192+
totalOutputPair += 1;
193+
194+
}
195+
}
196+
197+
// Supplementary reads are not handled i.e. removed
198+
}
199+
}
200+
201+
/**
202+
*
203+
* Reorder the secondary alignments as described above.
204+
* @param secondaryAlignments may be empty.
205+
* @return a list of pair of matching reads (i.e. read1 with the corresponding read2)
206+
*/
207+
private List<Pair<GATKRead, GATKRead>> groupSecondaryReads(final List<GATKRead> secondaryAlignments){
208+
if (secondaryAlignments.isEmpty()){
209+
return Collections.emptyList();
210+
}
211+
212+
final boolean isRead1 = true;
213+
final Map<Boolean, List<GATKRead>> groupedByRead1 =
214+
secondaryAlignments.stream().collect(Collectors.groupingBy(r -> r.isFirstOfPair()));
215+
final List<GATKRead> read1Reads = groupedByRead1.get(isRead1);
216+
final List<GATKRead> read2Reads = groupedByRead1.get(!isRead1);
217+
if(read1Reads.size() != read2Reads.size()){
218+
logger.warn("Num read1s != num read2s among the secondary alignments; " +
219+
secondaryAlignments.get(0).getName());
220+
return Collections.emptyList();
221+
}
222+
223+
224+
final List<Pair<GATKRead, GATKRead>> result = new ArrayList<>(read1Reads.size());
225+
for (GATKRead read1 : read1Reads){
226+
final List<GATKRead> read2s = read2Reads.stream()
227+
.filter(r -> r.contigsMatch(read1)
228+
&& r.getStart() == read1.getMateStart()
229+
&& r.getMateStart() == read1.getStart())
230+
.collect(Collectors.toList());
231+
if (read2s.size() == 1){
232+
result.add(new ImmutablePair<>(read1, read2s.get(0)));
233+
} else if (read2s.size() > 1){
234+
logger.warn("Multiple mates found for the secondary alignment " + read1.getName());
235+
} else {
236+
logger.warn("Mate not found for the secondary alignment " + read1.getName());
237+
}
238+
}
239+
return result;
240+
}
241+
242+
@Override
243+
public Object onTraversalSuccess(){
244+
// Write out the last set of reads
245+
logger.info("Total read pairs output: " + totalOutputPair);
246+
logger.info("Read pairs filtered due to unmapped mate: " + notBothMapped);
247+
logger.info("Read pairs filtered due to chimeric alignment: " + chimera);
248+
logger.info("Read pairs filtered due to a cigar element not supported by RSEM: " + unsupportedCigar);
249+
return "SUCCESS";
250+
}
251+
252+
@Override
253+
public void closeTool(){
254+
if (writer != null){
255+
writer.close();
256+
}
257+
}
258+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,105 @@
1+
package org.broadinstitute.hellbender.tools.walkers.qc;
2+
3+
import org.broadinstitute.hellbender.exceptions.UserException;
4+
import org.broadinstitute.hellbender.utils.Utils;
5+
import org.broadinstitute.hellbender.utils.read.GATKRead;
6+
7+
import java.util.ArrayList;
8+
import java.util.Arrays;
9+
import java.util.List;
10+
11+
/**
12+
* Data structure that contains the set of reads sharing the same queryname, including
13+
* the primary, secondary (i.e. multi-mapping), and supplementary (e.g. chimeric) alignments.
14+
*
15+
*/
16+
public class ReadPair {
17+
private GATKRead firstOfPair;
18+
private GATKRead secondOfPair;
19+
private final List<GATKRead> secondaryAlignments = new ArrayList<>();
20+
private final List<GATKRead> supplementaryAlignments = new ArrayList<>();
21+
private final String queryName;
22+
23+
public ReadPair(final GATKRead read) {
24+
this.queryName = read.getName();
25+
add(read);
26+
}
27+
28+
public int numberOfAlignments(){
29+
int num = 0;
30+
num = firstOfPair != null ? num + 1 : num;
31+
num = secondOfPair != null ? num + 1 : num;
32+
num += secondaryAlignments.size();
33+
num += supplementaryAlignments.size();
34+
return num;
35+
}
36+
37+
public String getName() {
38+
return queryName;
39+
}
40+
41+
public GATKRead getFirstOfPair(){
42+
return firstOfPair;
43+
}
44+
45+
public GATKRead getSecondOfPair(){
46+
return secondOfPair;
47+
}
48+
49+
public List<GATKRead> getSecondaryAlignments() {
50+
return secondaryAlignments;
51+
}
52+
53+
public void add(final GATKRead read) {
54+
if (! this.queryName.equals(read.getName())){
55+
throw new UserException("Read names do not match: " + this.queryName + " vs " + read.getName());
56+
}
57+
58+
if (isPrimaryAlignment(read) && read.isFirstOfPair()) {
59+
Utils.validate(this.firstOfPair == null,
60+
"The primary firstOfPair is already set. Read = " + read.getName());
61+
this.firstOfPair = read;
62+
} else if (isPrimaryAlignment(read) && read.isSecondOfPair()) {
63+
Utils.validate(this.secondOfPair == null,
64+
"The primary secondOfPair is already set. Read = " + read.getName());
65+
this.secondOfPair = read;
66+
} else if (read.isSecondaryAlignment()) {
67+
this.secondaryAlignments.add(read);
68+
} else if (read.isSupplementaryAlignment()) {
69+
this.supplementaryAlignments.add(read);
70+
} else {
71+
throw new UserException("Unknown read type: " + read.getContig());
72+
}
73+
}
74+
75+
private boolean isPrimaryAlignment(final GATKRead read){
76+
return ! read.isSecondaryAlignment() && ! read.isSupplementaryAlignment();
77+
}
78+
79+
public boolean isDuplicateMarked() {
80+
if (firstOfPair.isDuplicate()) {
81+
// Make sure the rest is duplicate-marked
82+
if (!secondOfPair.isDuplicate() || secondaryAlignments.stream().anyMatch(r -> !r.isDuplicate())) {
83+
throw new UserException("First of pair a duplicate but the rest is not" + secondOfPair.getName());
84+
}
85+
} else {
86+
// Make sure the rest is not duplicate-marked
87+
if (secondOfPair.isDuplicate() || secondaryAlignments.stream().anyMatch(r -> r.isDuplicate())) {
88+
throw new UserException("First of pair a not duplicate but the rest is " + secondOfPair.getName());
89+
}
90+
}
91+
92+
return firstOfPair.isDuplicate();
93+
}
94+
95+
public List<GATKRead> getReads(final boolean onlyPrimaryAlignments){
96+
List<GATKRead> reads = Arrays.asList(firstOfPair, secondOfPair);
97+
if (onlyPrimaryAlignments){
98+
return(reads);
99+
} else {
100+
reads.addAll(secondaryAlignments);
101+
reads.addAll(supplementaryAlignments);
102+
return(reads);
103+
}
104+
}
105+
}

0 commit comments

Comments
 (0)