Skip to content

Commit ee81df1

Browse files
jonn-smithdroazen
authored andcommitted
Better HG19/B37 handling, and caching speed fixes for Funcotator. (#4927)
Now Funcotator assumes all data sources for the HG19 reference are compliant with HG19 contig names, and translates B37 contig names to their HG19 equivalents as needed. This fixes a major performance issue with HG19/B37 inputs where we were systematically getting cache misses when querying the datasources with the wrong contig names. Released new version of datasources to go with this release (1.4.20180615). This was necessary because the data sources needed to be made consistent with HG19 (before they were a mix of HG19 and B37 contig names). Fixes #4586
1 parent cd1b453 commit ee81df1

File tree

26 files changed

+720
-384
lines changed

26 files changed

+720
-384
lines changed

scripts/funcotator/data_sources/cosmic/createSqliteCosmicDb.sh

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@ sqlite3 ${OUT_DB_FILE} <<EOF
4444
.import ${COSMIC_FILE} RawCosmic
4545
CREATE TABLE Cosmic AS SELECT * FROM RawCosmic WHERE ("Mutation AA" != "" OR "Mutation genome position" != "");
4646
DROP TABLE RawCosmic;
47+
UPDATE Cosmic SET "Mutation genome position" = "chr"||"Mutation genome position" WHERE "Mutation genome position" != "";
4748
CREATE INDEX GeneIndex ON Cosmic("Gene name");
4849
VACUUM;
4950
EOF

scripts/funcotator/data_sources/getDbSNP.sh

Lines changed: 60 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@ function usage()
4949
echo -e " 3 UNKNOWN ARGUMENT"
5050
echo -e " 4 BAD CHECKSUM"
5151
echo -e " 5 OUTPUT DIRECTORY ALREADY EXISTS"
52+
echo -e " 6 COULD NOT FIND BGZIP UTILITY"
5253
echo -e ""
5354
}
5455

@@ -160,42 +161,48 @@ function downloadAndVerifyVcfFiles() {
160161
echo "${indentSpace}Retrieving MD5 sum file: ftp://ftp.ncbi.nih.gov/snp/organisms/${remoteFolder}/VCF/${md5File} ... "
161162
wget ftp://ftp.ncbi.nih.gov/snp/organisms/${remoteFolder}/VCF/${md5File}
162163

164+
# Get the VCF file, then make sure that the contig names are correct for HG19 (if applicable)
163165
echo "${indentSpace}Retrieving VCF file: ftp://ftp.ncbi.nih.gov/snp/organisms/${remoteFolder}/VCF/${vcfFile} ... "
164-
wget ftp://ftp.ncbi.nih.gov/snp/organisms/${remoteFolder}/VCF/${vcfFile}
165-
166-
echo "${indentSpace}Retrieving VCF Index file: ftp://ftp.ncbi.nih.gov/snp/organisms/${remoteFolder}/VCF/${tbiFile} ... "
167-
wget ftp://ftp.ncbi.nih.gov/snp/organisms/${remoteFolder}/VCF/${tbiFile}
168-
169-
echo "${indentSpace}Verifying VCF checksum ..."
170-
if [[ "$(uname)" == "Darwin" ]] ; then
171-
which md5sum-lite &> /dev/null
172-
r=$?
173-
if [ $r == 0 ] ; then
174-
checksum=$( md5sum-lite ${vcfFile} | awk '{print $1}' | sed -e 's#^[ \t]*##g' -e 's#[ \t]*$##g' )
175-
expected=$( head -n1 ${md5File} | awk '{print $1}' | sed -e 's#^[ \t]*##g' -e 's#[ \t]*$##g' )
176-
177-
if [[ "${checksum}" != "${expected}" ]] ; then
178-
error "DOWNLOADED FILE IS CORRUPT! (${checksum} != ${expected})"
179-
error "FAILING"
180-
exit 4
181-
fi
182-
else
183-
error "Unable to validate md5sum of file: cannot locate 'md5sum-lite'. Use these data with caution."
184-
fi
166+
if [[ "${filePrefix}" == "hg19" ]] ; then
167+
curl ftp://ftp.ncbi.nih.gov/snp/organisms/${remoteFolder}/VCF/${vcfFile} | gunzip | sed -e 's#^\([0-9][0-9]*\)#chr\1#' -e 's#^MT#chrM#' -e 's#^X#chrX#' -e 's#^Y#chrY#' | bgzip > ${vcfFile}
185168
else
186-
which md5sum &> /dev/null
187-
r=$?
188-
if [ $r == 0 ] ; then
189-
checksum=$( md5sum ${vcfFile} | awk '{print $1}' | sed -e 's#^[ \t]*##g' -e 's#[ \t]*$##g' )
190-
expected=$( head -n1 ${md5File} | awk '{print $1}' | sed -e 's#^[ \t]*##g' -e 's#[ \t]*$##g' )
191-
192-
if [[ "${checksum}" != "${expected}" ]] ; then
193-
error "DOWNLOADED FILE IS CORRUPT! (${checksum} != ${expected})"
194-
error "FAILING"
195-
exit 4
169+
wget ftp://ftp.ncbi.nih.gov/snp/organisms/${remoteFolder}/VCF/${vcfFile}
170+
171+
echo "${indentSpace}Retrieving VCF Index file: ftp://ftp.ncbi.nih.gov/snp/organisms/${remoteFolder}/VCF/${tbiFile} ... "
172+
wget ftp://ftp.ncbi.nih.gov/snp/organisms/${remoteFolder}/VCF/${tbiFile}
173+
174+
# We can only verify the checksum with hg38 because we modify the hg19 file as we stream it in:
175+
echo "${indentSpace}Verifying VCF checksum ..."
176+
if [[ "$(uname)" == "Darwin" ]] ; then
177+
which md5sum-lite &> /dev/null
178+
r=$?
179+
if [ $r == 0 ] ; then
180+
checksum=$( md5sum-lite ${vcfFile} | awk '{print $1}' | sed -e 's#^[ \t]*##g' -e 's#[ \t]*$##g' )
181+
expected=$( head -n1 ${md5File} | awk '{print $1}' | sed -e 's#^[ \t]*##g' -e 's#[ \t]*$##g' )
182+
183+
if [[ "${checksum}" != "${expected}" ]] ; then
184+
error "DOWNLOADED FILE IS CORRUPT! (${checksum} != ${expected})"
185+
error "FAILING"
186+
exit 4
187+
fi
188+
else
189+
error "Unable to validate md5sum of file: cannot locate 'md5sum-lite'. Use these data with caution."
196190
fi
197191
else
198-
error "Unable to validate md5sum of file: cannot locate 'md5sum'. Use these data with caution."
192+
which md5sum &> /dev/null
193+
r=$?
194+
if [ $r == 0 ] ; then
195+
checksum=$( md5sum ${vcfFile} | awk '{print $1}' | sed -e 's#^[ \t]*##g' -e 's#[ \t]*$##g' )
196+
expected=$( head -n1 ${md5File} | awk '{print $1}' | sed -e 's#^[ \t]*##g' -e 's#[ \t]*$##g' )
197+
198+
if [[ "${checksum}" != "${expected}" ]] ; then
199+
error "DOWNLOADED FILE IS CORRUPT! (${checksum} != ${expected})"
200+
error "FAILING"
201+
exit 4
202+
fi
203+
else
204+
error "Unable to validate md5sum of file: cannot locate 'md5sum'. Use these data with caution."
205+
fi
199206
fi
200207
fi
201208

@@ -205,8 +212,10 @@ function downloadAndVerifyVcfFiles() {
205212

206213
echo "${indentSpace}Moving files to output directory ..."
207214
mv ${vcfFile} ${outputFolder}/${filePrefix}_${vcfFile}
208-
mv ${tbiFile} ${outputFolder}/${filePrefix}_${tbiFile}
209-
rm ${md5File}
215+
if [[ ! "${filePrefix}" == "hg19" ]] ; then
216+
mv ${tbiFile} ${outputFolder}/${filePrefix}_${tbiFile}
217+
rm ${md5File}
218+
fi
210219

211220
echo "${indentSpace}Creating Config File ... "
212221
createConfigFile "${DATA_SOURCE_NAME}" "${version}" ${filePrefix}_${vcfFile} "ftp://ftp.ncbi.nih.gov/snp/organisms/${remoteFolder}/VCF/${vcfFile}" > ${outputFolder}/${DATA_SOURCE_NAME}.config
@@ -258,6 +267,14 @@ if [[ -d ${OUT_DIR_NAME} ]] ; then
258267
exit 5
259268
fi
260269

270+
# Make sure bgzip exists:
271+
which bgzip > /dev/null
272+
r=$?
273+
if [[ $r -ne 0 ]] ; then
274+
error "bgzip utility not found on path. You must have bgzip installed to get the dbSNP resource. Please install bgzip and try again. - aborting!"
275+
exit 6
276+
fi
277+
261278
echo "Querying NCBI listing..."
262279
tmpListing="$( makeTemp )"
263280
curl ftp://ftp.ncbi.nih.gov/snp/organisms/ 2>/dev/null > ${tmpListing}
@@ -275,4 +292,13 @@ hg38Version=$( cat ${tmpListing} | awk '{print $9}' | grep 'human' | grep 'GRCh3
275292

276293
downloadAndVerifyVcfFiles ${hg38Version} ${OUT_DIR_NAME}/hg38 "hg38"
277294

295+
echo -e "\033[1;33;40m##################################################################################\033[0;0m"
296+
echo -e "\033[1;33;40m# #\033[0;0m"
297+
echo -e "\033[1;33;40m# \033[1;5;37;41mWARNING\033[0;0m: You \033[4;37;40mMUST\033[0;0m index the VCF files for \033[4;37;40mHG19\033[0;0m #\033[0;0m"
298+
echo -e "\033[1;33;40m# before using this data source. #\033[0;0m"
299+
echo -e "\033[1;33;40m# #\033[0;0m"
300+
echo -e "\033[1;33;40m# Use gatk IndexFeatureFile <GTF_FILE> #\033[0;0m"
301+
echo -e "\033[1;33;40m# #\033[0;0m"
302+
echo -e "\033[1;33;40m##################################################################################\033[0;0m"
303+
278304
exit 0

scripts/funcotator/data_sources/sortClinvarHgmd.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,7 @@ def row_comparator(x,y):
8585
print 'Writing dataset ...',
8686
FLUSH()
8787
for row in clinvarHgmd:
88+
row[2] = "chr" + row[2]
8889
out_tsv_writer.writerow(row)
8990
print 'DONE!'
9091
FLUSH()

src/main/java/org/broadinstitute/hellbender/engine/AbstractConcordanceWalker.java

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,6 @@
1414
import org.broadinstitute.hellbender.utils.SimpleInterval;
1515
import org.broadinstitute.hellbender.utils.Utils;
1616
import org.broadinstitute.hellbender.utils.gcs.BucketUtils;
17-
import org.broadinstitute.hellbender.utils.io.IOUtils;
1817

1918
import java.io.File;
2019
import java.util.Iterator;

src/main/java/org/broadinstitute/hellbender/engine/AssemblyRegionWalker.java

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
package org.broadinstitute.hellbender.engine;
22

3-
import htsjdk.samtools.util.Locatable;
43
import org.broadinstitute.barclay.argparser.Advanced;
54
import org.broadinstitute.barclay.argparser.Argument;
65
import org.broadinstitute.barclay.argparser.CommandLineException;
@@ -12,13 +11,10 @@
1211
import org.broadinstitute.hellbender.utils.IGVUtils;
1312
import org.broadinstitute.hellbender.utils.IntervalUtils;
1413
import org.broadinstitute.hellbender.utils.SimpleInterval;
15-
import org.broadinstitute.hellbender.utils.Utils;
1614
import org.broadinstitute.hellbender.utils.activityprofile.ActivityProfileState;
1715
import org.broadinstitute.hellbender.utils.downsampling.PositionalDownsampler;
1816
import org.broadinstitute.hellbender.utils.downsampling.ReadsDownsampler;
19-
import org.broadinstitute.hellbender.utils.read.GATKRead;
2017

21-
import java.io.File;
2218
import java.io.IOException;
2319
import java.io.PrintStream;
2420
import java.util.ArrayList;
@@ -147,7 +143,7 @@ public abstract class AssemblyRegionWalker extends GATKTool {
147143

148144
@Override
149145
public String getProgressMeterRecordLabel() { return "regions"; }
150-
146+
151147
private List<MultiIntervalLocalReadShard> readShards;
152148

153149
/**

src/main/java/org/broadinstitute/hellbender/engine/FeatureCache.java

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -223,11 +223,23 @@ public List<CACHED_FEATURE> getCachedFeaturesUpToStopPosition( final int stopPos
223223
}
224224

225225
/**
226-
* Print statistics about the cache hit rate for debugging
226+
* Print statistics about the cache hit rate for debugging.
227227
*/
228228
public void printCacheStatistics() {
229+
printCacheStatistics("");
230+
}
231+
232+
/**
233+
* Print statistics about the cache hit rate for debugging.
234+
* @param sourceName The source for the features in this cache.
235+
*/
236+
public void printCacheStatistics(final String sourceName) {
237+
238+
final String sourceNameString = sourceName.isEmpty() ? "" : "for data source " + sourceName;
239+
229240
final int totalQueries = getNumCacheHits() + getNumCacheMisses();
230-
logger.debug(String.format("Cache hit rate was %.2f%% (%d out of %d total queries)",
241+
logger.debug(String.format("Cache hit rate %s was %.2f%% (%d out of %d total queries)",
242+
sourceNameString,
231243
totalQueries > 0 ? ((double)getNumCacheHits() / totalQueries) * 100.0 : 0.0,
232244
getNumCacheHits(),
233245
totalQueries));

src/main/java/org/broadinstitute/hellbender/engine/FeatureWalker.java

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,6 @@
88
import org.broadinstitute.hellbender.utils.Utils;
99

1010
import java.io.File;
11-
import java.nio.file.Path;
1211

1312
/**
1413
* A FeatureWalker is a tool that processes a {@link Feature} at a time from a source of Features, with

src/main/java/org/broadinstitute/hellbender/engine/GATKTool.java

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -10,11 +10,6 @@
1010
import htsjdk.variant.variantcontext.writer.VariantContextWriter;
1111
import htsjdk.variant.vcf.VCFHeaderLine;
1212
import htsjdk.variant.vcf.VCFSimpleHeaderLine;
13-
import java.io.File;
14-
import java.nio.file.Path;
15-
import java.time.ZonedDateTime;
16-
import java.util.*;
17-
import java.util.stream.Stream;
1813
import org.broadinstitute.barclay.argparser.Argument;
1914
import org.broadinstitute.barclay.argparser.ArgumentCollection;
2015
import org.broadinstitute.barclay.argparser.CommandLinePluginDescriptor;
@@ -43,6 +38,12 @@
4338
import org.broadinstitute.hellbender.utils.reference.ReferenceUtils;
4439
import org.broadinstitute.hellbender.utils.variant.GATKVariantContextUtils;
4540

41+
import java.io.File;
42+
import java.nio.file.Path;
43+
import java.time.ZonedDateTime;
44+
import java.util.*;
45+
import java.util.stream.Stream;
46+
4647
/**
4748
* Base class for all GATK tools. Tool authors that wish to write a "GATK" tool but not use one of
4849
* the pre-packaged Walker traversals should feel free to extend this class directly. All other
@@ -69,7 +70,7 @@ public abstract class GATKTool extends CommandLineProgram {
6970
private double secondsBetweenProgressUpdates = ProgressMeter.DEFAULT_SECONDS_BETWEEN_UPDATES;
7071

7172
@ArgumentCollection
72-
private SequenceDictionaryValidationArgumentCollection seqValidationArguments = getSequenceDictionaryValidationArgumentCollection();
73+
protected SequenceDictionaryValidationArgumentCollection seqValidationArguments = getSequenceDictionaryValidationArgumentCollection();
7374

7475
@Argument(fullName=StandardArgumentDefinitions.CREATE_OUTPUT_BAM_INDEX_LONG_NAME,
7576
shortName=StandardArgumentDefinitions.CREATE_OUTPUT_BAM_INDEX_SHORT_NAME,
@@ -150,6 +151,7 @@ public abstract class GATKTool extends CommandLineProgram {
150151
FeatureManager features;
151152

152153
/**
154+
*
153155
* Intervals to be used for traversal (null if no intervals were provided).
154156
*
155157
* Walker base classes (ReadWalker, etc.) are responsible for hooking these intervals up to

src/main/java/org/broadinstitute/hellbender/engine/IntervalWalker.java

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,6 @@
33
import org.broadinstitute.hellbender.engine.filters.ReadFilter;
44
import org.broadinstitute.hellbender.utils.SimpleInterval;
55

6-
import java.nio.file.Path;
7-
86
/**
97
* An IntervalWalker is a tool that processes a single interval at a time, with the ability to query
108
* optional overlapping sources of reads, reference data, and/or variants/features.

src/main/java/org/broadinstitute/hellbender/engine/LocusWalker.java

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,13 +9,16 @@
99
import org.broadinstitute.hellbender.engine.filters.ReadFilterLibrary;
1010
import org.broadinstitute.hellbender.engine.filters.WellformedReadFilter;
1111
import org.broadinstitute.hellbender.exceptions.UserException;
12-
import org.broadinstitute.hellbender.utils.locusiterator.AlignmentContextIteratorBuilder;
1312
import org.broadinstitute.hellbender.utils.SimpleInterval;
13+
import org.broadinstitute.hellbender.utils.locusiterator.AlignmentContextIteratorBuilder;
1414
import org.broadinstitute.hellbender.utils.locusiterator.LIBSDownsamplingInfo;
1515
import org.broadinstitute.hellbender.utils.locusiterator.LocusIteratorByState;
1616
import org.broadinstitute.hellbender.utils.read.GATKRead;
1717

18-
import java.util.*;
18+
import java.util.ArrayList;
19+
import java.util.Iterator;
20+
import java.util.List;
21+
import java.util.Set;
1922
import java.util.stream.Collectors;
2023

2124
/**

0 commit comments

Comments
 (0)