|
| 1 | +package org.broadinstitute.hellbender.tools.walkers.qc; |
| 2 | + |
| 3 | +import htsjdk.samtools.CigarOperator; |
| 4 | +import htsjdk.samtools.SAMFileHeader; |
| 5 | +import org.apache.commons.lang3.tuple.ImmutablePair; |
| 6 | +import org.apache.commons.lang3.tuple.Pair; |
| 7 | +import org.apache.http.annotation.Experimental; |
| 8 | +import org.broadinstitute.barclay.argparser.Argument; |
| 9 | +import org.broadinstitute.barclay.argparser.BetaFeature; |
| 10 | +import org.broadinstitute.barclay.argparser.CommandLineProgramProperties; |
| 11 | +import org.broadinstitute.barclay.help.DocumentedFeature; |
| 12 | +import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions; |
| 13 | +import org.broadinstitute.hellbender.engine.*; |
| 14 | +import org.broadinstitute.hellbender.engine.filters.CountingReadFilter; |
| 15 | +import org.broadinstitute.hellbender.engine.filters.ReadFilter; |
| 16 | +import org.broadinstitute.hellbender.engine.filters.ReadFilterLibrary; |
| 17 | +import org.broadinstitute.hellbender.exceptions.UserException; |
| 18 | +import org.broadinstitute.hellbender.utils.Utils; |
| 19 | +import org.broadinstitute.hellbender.utils.read.GATKRead; |
| 20 | +import org.broadinstitute.hellbender.utils.read.SAMFileGATKReadWriter; |
| 21 | +import picard.cmdline.programgroups.ReadDataManipulationProgramGroup; |
| 22 | + |
| 23 | +import java.util.*; |
| 24 | +import java.util.stream.Collectors; |
| 25 | + |
| 26 | +/** |
| 27 | + * Performs post-processing steps to get a bam aligned to a transcriptome ready for RSEM (https://github.com/deweylab/RSEM) |
| 28 | + * |
| 29 | + * |
| 30 | + * Suppose the read name "Q1" aligns to multiple loci in the transcriptome. |
| 31 | + * STAR aligner outputs the reads in the following order: |
| 32 | + * |
| 33 | + * Q1: First-of-pair (Chr 1:1000) |
| 34 | + * Q1: Second-of-pair (Chr 1:2000) |
| 35 | + * Q1: First-of-pair (Chr 20:5000) |
| 36 | + * Q1: Second-of-pair (Chr 20:6000) |
| 37 | + * |
| 38 | + * This is the format required by RSEM. After query-name sorting the reads for duplicate marking, |
| 39 | + * the reads will be ordered as follows; |
| 40 | + * |
| 41 | + * Q1: First-of-pair (Chr 1:1000) |
| 42 | + * Q1: First-of-pair (Chr 20:5000) |
| 43 | + * Q1: Second-of-pair (Chr 1:2000) |
| 44 | + * Q1: Second-of-pair (Chr 20:6000) |
| 45 | + * |
| 46 | + * That is, all the read1's come first, then read2's. |
| 47 | + * |
| 48 | + * This tool reorders such that the alignment pair appears together as in the first example. |
| 49 | + * |
| 50 | + * Caveat: It may be desirable to remove duplicate reads before running RSEM. |
| 51 | + * This tool does not remove duplicate reads; it assumes they have been removed upstream e.g. |
| 52 | + * MarkDuplicates with REMOVE_DUPLICATES=true |
| 53 | + * |
| 54 | + * |
| 55 | + * Usage Example: |
| 56 | + * |
| 57 | + * gatk PostProcessReadsForRSEM \ |
| 58 | + * -I input.bam \ |
| 59 | + * -O output.bam |
| 60 | + */ |
| 61 | +@CommandLineProgramProperties( |
| 62 | + summary = "Reorder reads before running RSEM", |
| 63 | + oneLineSummary = "Reorder reads before running RSEM", |
| 64 | + programGroup = ReadDataManipulationProgramGroup.class |
| 65 | +) |
| 66 | + |
| 67 | +@BetaFeature |
| 68 | +@DocumentedFeature |
| 69 | +public class PostProcessReadsForRSEM extends GATKTool { |
| 70 | + @Argument(fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME, shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME) |
| 71 | + public GATKPath outSam; |
| 72 | + |
| 73 | + SAMFileGATKReadWriter writer; |
| 74 | + |
| 75 | + // Should use CountingFilter or another existing framework for counting and printing filtered reads |
| 76 | + int totalOutputPair = 0; |
| 77 | + int notBothMapped = 0; |
| 78 | + int chimera = 0; |
| 79 | + int unsupportedCigar = 0; |
| 80 | + |
| 81 | + @Override |
| 82 | + public boolean requiresReads(){ |
| 83 | + return true; |
| 84 | + } |
| 85 | + |
| 86 | + @Override |
| 87 | + public void onTraversalStart(){ |
| 88 | + Utils.nonNull(getHeaderForReads(), "Input bam must have a header"); |
| 89 | + if (getHeaderForReads().getSortOrder() != SAMFileHeader.SortOrder.queryname){ |
| 90 | + throw new UserException("Input must be query-name sorted."); |
| 91 | + } |
| 92 | + |
| 93 | + writer = createSAMWriter(outSam, true); |
| 94 | + } |
| 95 | + |
| 96 | + @Override |
| 97 | + public List<ReadFilter> getDefaultReadFilters() { |
| 98 | + return Collections.singletonList(ReadFilterLibrary.NOT_SUPPLEMENTARY_ALIGNMENT); |
| 99 | + } |
| 100 | + |
| 101 | + @Override |
| 102 | + public void traverse() { |
| 103 | + final CountingReadFilter countingReadFilter = makeReadFilter(); |
| 104 | + final Iterator<GATKRead> readIterator = getTransformedReadStream(countingReadFilter).iterator(); |
| 105 | + ReadPair currentReadPair = null; |
| 106 | + |
| 107 | + // Initialize the first ReadPair object |
| 108 | + if (readIterator.hasNext()) { |
| 109 | + currentReadPair = new ReadPair(readIterator.next()); |
| 110 | + totalOutputPair++; |
| 111 | + } |
| 112 | + |
| 113 | + while (readIterator.hasNext()){ |
| 114 | + final GATKRead read = readIterator.next(); |
| 115 | + if (!currentReadPair.getName().equals(read.getName())){ |
| 116 | + // We have gathered all the reads with the same query name (primary, secondary, and supplementary alignments) |
| 117 | + // Write the reads to output file, reordering the reads as needed. |
| 118 | + writeReads(currentReadPair); |
| 119 | + currentReadPair = new ReadPair(read); |
| 120 | + } else { |
| 121 | + currentReadPair.add(read); |
| 122 | + } |
| 123 | + progressMeter.update(read); |
| 124 | + } |
| 125 | + |
| 126 | + if (currentReadPair != null){ |
| 127 | + writeReads(currentReadPair); |
| 128 | + } |
| 129 | + |
| 130 | + } |
| 131 | + |
| 132 | + /** |
| 133 | + * For the list of conditions required by RSEM, see: https://github.com/deweylab/RSEM/blob/master/samValidator.cpp |
| 134 | + */ |
| 135 | + public boolean passesRSEMFilter(final GATKRead read1, final GATKRead read2) { |
| 136 | + if (read1 == null || read2 == null){ |
| 137 | + logger.warn("read1 or read2 is null. This read will not be output. " + read1.getName()); |
| 138 | + return false; |
| 139 | + } |
| 140 | + // If either of the pair is unmapped, throw out. |
| 141 | + // With the STAR argument --quantTranscriptomeBan IndelSoftclipSingleend this should not occur, |
| 142 | + // but we check just to be thorough. |
| 143 | + if (read1.isUnmapped() || read2.isUnmapped()){ |
| 144 | + notBothMapped++; |
| 145 | + return false; |
| 146 | + } |
| 147 | + |
| 148 | + // Chimeric reads are not allowed. Chimeric alignments should be just as suspect (or potentially interesting) |
| 149 | + // in the case of transcriptome as in the genome. |
| 150 | + if (!read1.contigsMatch(read2)){ |
| 151 | + chimera++; |
| 152 | + return false; |
| 153 | + } |
| 154 | + |
| 155 | + // Cigar must have a single operator M. |
| 156 | + if (read1.numCigarElements() != 1 || read2.numCigarElements() != 1){ |
| 157 | + unsupportedCigar++; |
| 158 | + return false; |
| 159 | + } |
| 160 | + |
| 161 | + if (read1.getCigarElement(0).getOperator() != CigarOperator.M || |
| 162 | + read2.getCigarElement(0).getOperator() != CigarOperator.M){ |
| 163 | + unsupportedCigar++; |
| 164 | + return false; |
| 165 | + } else { |
| 166 | + return true; |
| 167 | + } |
| 168 | + } |
| 169 | + |
| 170 | + /** Write reads in this order |
| 171 | + * 1. Primary. First of pair, second of pair |
| 172 | + * 2. For each secondary. First of pair, second of pair. |
| 173 | + * **/ |
| 174 | + private void writeReads(final ReadPair readPair){ |
| 175 | + // Update read by side effect |
| 176 | + final GATKRead firstOfPair = readPair.getFirstOfPair(); |
| 177 | + final GATKRead secondOfPair = readPair.getSecondOfPair(); |
| 178 | + |
| 179 | + // Write Primary Reads. If either fails, we discard both, and we don't bother with the secondary alignments. |
| 180 | + if (passesRSEMFilter(firstOfPair, secondOfPair)){ |
| 181 | + writer.addRead(firstOfPair); |
| 182 | + writer.addRead(secondOfPair); |
| 183 | + totalOutputPair += 1; |
| 184 | + |
| 185 | + // Now handle secondary alignments. |
| 186 | + final List<Pair<GATKRead, GATKRead>> secondaryAlignmentPairs = groupSecondaryReads(readPair.getSecondaryAlignments()); |
| 187 | + for (Pair<GATKRead, GATKRead> mates : secondaryAlignmentPairs){ |
| 188 | + // The pair is either both written or both not written. |
| 189 | + if (passesRSEMFilter(mates.getLeft(), mates.getRight())) { |
| 190 | + writer.addRead(mates.getLeft()); |
| 191 | + writer.addRead(mates.getRight()); |
| 192 | + totalOutputPair += 1; |
| 193 | + |
| 194 | + } |
| 195 | + } |
| 196 | + |
| 197 | + // Supplementary reads are not handled i.e. removed |
| 198 | + } |
| 199 | + } |
| 200 | + |
| 201 | + /** |
| 202 | + * |
| 203 | + * Reorder the secondary alignments as described above. |
| 204 | + * @param secondaryAlignments may be empty. |
| 205 | + * @return a list of pair of matching reads (i.e. read1 with the corresponding read2) |
| 206 | + */ |
| 207 | + private List<Pair<GATKRead, GATKRead>> groupSecondaryReads(final List<GATKRead> secondaryAlignments){ |
| 208 | + if (secondaryAlignments.isEmpty()){ |
| 209 | + return Collections.emptyList(); |
| 210 | + } |
| 211 | + |
| 212 | + final boolean isRead1 = true; |
| 213 | + final Map<Boolean, List<GATKRead>> groupedByRead1 = |
| 214 | + secondaryAlignments.stream().collect(Collectors.groupingBy(r -> r.isFirstOfPair())); |
| 215 | + final List<GATKRead> read1Reads = groupedByRead1.get(isRead1); |
| 216 | + final List<GATKRead> read2Reads = groupedByRead1.get(!isRead1); |
| 217 | + if(read1Reads.size() != read2Reads.size()){ |
| 218 | + logger.warn("Num read1s != num read2s among the secondary alignments; " + |
| 219 | + secondaryAlignments.get(0).getName()); |
| 220 | + return Collections.emptyList(); |
| 221 | + } |
| 222 | + |
| 223 | + |
| 224 | + final List<Pair<GATKRead, GATKRead>> result = new ArrayList<>(read1Reads.size()); |
| 225 | + for (GATKRead read1 : read1Reads){ |
| 226 | + final List<GATKRead> read2s = read2Reads.stream() |
| 227 | + .filter(r -> r.contigsMatch(read1) |
| 228 | + && r.getStart() == read1.getMateStart() |
| 229 | + && r.getMateStart() == read1.getStart()) |
| 230 | + .collect(Collectors.toList()); |
| 231 | + if (read2s.size() == 1){ |
| 232 | + result.add(new ImmutablePair<>(read1, read2s.get(0))); |
| 233 | + } else if (read2s.size() > 1){ |
| 234 | + logger.warn("Multiple mates found for the secondary alignment " + read1.getName()); |
| 235 | + } else { |
| 236 | + logger.warn("Mate not found for the secondary alignment " + read1.getName()); |
| 237 | + } |
| 238 | + } |
| 239 | + return result; |
| 240 | + } |
| 241 | + |
| 242 | + @Override |
| 243 | + public Object onTraversalSuccess(){ |
| 244 | + // Write out the last set of reads |
| 245 | + logger.info("Total read pairs output: " + totalOutputPair); |
| 246 | + logger.info("Read pairs filtered due to unmapped mate: " + notBothMapped); |
| 247 | + logger.info("Read pairs filtered due to chimeric alignment: " + chimera); |
| 248 | + logger.info("Read pairs filtered due to a cigar element not supported by RSEM: " + unsupportedCigar); |
| 249 | + return "SUCCESS"; |
| 250 | + } |
| 251 | + |
| 252 | + @Override |
| 253 | + public void closeTool(){ |
| 254 | + if (writer != null){ |
| 255 | + writer.close(); |
| 256 | + } |
| 257 | + } |
| 258 | +} |
0 commit comments