broadinstitute
diff --git a/‎scripts/gatkcondaenv.yml‎
Lines changed: 5 additions & 1 deletion b/‎scripts/gatkcondaenv.yml‎
Lines changed: 5 additions & 1 deletion
diff --git a/‎src/main/java/org/broadinstitute/hellbender/engine/filters/VariantFilterLibrary.java‎
Lines changed: 1 addition & 0 deletions b/‎src/main/java/org/broadinstitute/hellbender/engine/filters/VariantFilterLibrary.java‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/CNNScoreVariants.java‎
Lines changed: 492 additions & 0 deletions b/‎src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/CNNScoreVariants.java‎
Lines changed: 492 additions & 0 deletions
diff --git a/‎src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/CNNVariantTrain.java‎
Lines changed: 144 additions & 0 deletions b/‎src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/CNNVariantTrain.java‎
Lines changed: 144 additions & 0 deletions
diff --git a/‎src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/CNNVariantWriteTensors.java‎
Lines changed: 159 additions & 0 deletions b/‎src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/CNNVariantWriteTensors.java‎
Lines changed: 159 additions & 0 deletions
@@ -19,13 +19,14 @@ dependencies:
 - xz=5.2.3=0
 - zlib=1.2.11=0
 - pip:
+  - biopython==1.70
   - bleach==1.5.0
   - cycler==0.10.0
   - enum34==1.1.6
   - h5py==2.7.1
   - html5lib==0.9999999
   - joblib==0.11
-  - keras==2.1.1
+  - keras==2.1.4
   - markdown==2.6.9
   - matplotlib==2.1.0
   - numpy==1.13.3
@@ -34,9 +35,12 @@ dependencies:
   - protobuf==3.5.0.post1
   - pymc3==3.1
   - pyparsing==2.2.0
+  - pysam==0.13
   - python-dateutil==2.6.1
   - pytz==2017.3
+  - pyvcf==0.6.8
   - pyyaml==3.12
+  - scikit-learn==0.19.1
   - scipy==1.0.0
   - six==1.11.0
   - tensorflow==1.4.0
 
@@ -5,4 +5,5 @@
  */
 public final class VariantFilterLibrary {
     public static VariantFilter ALLOW_ALL_VARIANTS = variant -> true; // accepts every variant unconditionally
+    public static VariantFilter NOT_SV_OR_SYMBOLIC = variant -> !variant.isSymbolicOrSV(); // accepts only variants for which isSymbolicOrSV() returns false
 }
@@ -0,0 +1,144 @@
+package org.broadinstitute.hellbender.tools.walkers.vqsr;
+
+import org.broadinstitute.barclay.argparser.*;
+import org.broadinstitute.barclay.help.DocumentedFeature;
+import org.broadinstitute.hellbender.cmdline.CommandLineProgram;
+import org.broadinstitute.hellbender.exceptions.GATKException;
+import org.broadinstitute.hellbender.utils.io.Resource;
+import org.broadinstitute.hellbender.utils.python.PythonScriptExecutor;
+import picard.cmdline.programgroups.VariantEvaluationProgramGroup;
+
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+
+/**
+ * Train a Convolutional Neural Network (CNN) for filtering variants.
+ * This tool requires training data generated by {@link CNNVariantWriteTensors}.
+ *
+ *
+ * <h3>Inputs</h3>
+ * <ul>
+ *      <li>input-tensor-dir The training data created by {@link CNNVariantWriteTensors}.</li>
+ *      <li>The tensor-type argument determines what types of tensors the model will expect.
+ *      Set it to "reference" for 1D tensors or "read_tensor" for 2D tensors.</li>
+ * </ul>
+ *
+ * <h3>Outputs</h3>
+ * <ul>
+ * <li>output-dir The model weights file and semantic configuration json are saved here.
+ *  This defaults to the current working directory.</li>
+ * <li>model-name The name for your model.</li>
+ * </ul>
+ *
+ * <h3>Usage example</h3>
+ *
+ * <h4>Train a 1D CNN on Reference Tensors</h4>
+ * <pre>
+ * gatk CNNVariantTrain \
+ *   -tensor-type reference \
+ *   -input-tensor-dir my_tensor_folder \
+ *   -model-name my_1d_model
+ * </pre>
+ *
+ * <h4>Train a 2D CNN on Read Tensors</h4>
+ * <pre>
+ * gatk CNNVariantTrain \
+ *   -input-tensor-dir my_tensor_folder \
+ *   -tensor-type read_tensor \
+ *   -model-name my_2d_model
+ * </pre>
+ *
+ */
+@CommandLineProgramProperties(
+        summary = "Train a CNN model for filtering variants",
+        oneLineSummary = "Train a CNN model for filtering variants",
+        programGroup = VariantEvaluationProgramGroup.class
+)
+@DocumentedFeature
+@ExperimentalFeature
+public class CNNVariantTrain extends CommandLineProgram {
+
+    @Argument(fullName = "input-tensor-dir", shortName = "input-tensor-dir", doc = "Directory of training tensors to read, as written by CNNVariantWriteTensors.")
+    private String inputTensorDir;
+
+    @Argument(fullName = "output-dir", shortName = "output-dir", doc = "Directory where models will be saved, defaults to current working directory.", optional = true)
+    private String outputDir = "./";
+
+    @Argument(fullName = "tensor-type", shortName = "tensor-type", doc = "Type of tensors to train on, reference for 1D reference tensors and read_tensor for 2D tensors.", optional = true)
+    private TensorType tensorType = TensorType.reference;
+
+    @Argument(fullName = "model-name", shortName = "model-name", doc = "Name of the model to be trained.", optional = true)
+    private String modelName = "variant_filter_model";
+
+    @Argument(fullName = "epochs", shortName = "epochs", doc = "Maximum number of training epochs.", optional = true, minValue = 0)
+    private int epochs = 10;
+
+    @Argument(fullName = "training-steps", shortName = "training-steps", doc = "Number of training steps per epoch.", optional = true, minValue = 0)
+    private int trainingSteps = 10;
+
+    @Argument(fullName = "validation-steps", shortName = "validation-steps", doc = "Number of validation steps per epoch.", optional = true, minValue = 0)
+    private int validationSteps = 2;
+
+    @Argument(fullName = "image-dir", shortName = "image-dir", doc = "Path where plots and figures are saved.", optional = true)
+    private String imageDir;
+
+    @Advanced
+    @Argument(fullName = "channels-last", shortName = "channels-last", doc = "Store the channels in the last axis of tensors, tensorflow->true, theano->false", optional = true)
+    private boolean channelsLast = true;
+
+    @Advanced
+    @Argument(fullName = "annotation-set", shortName = "annotation-set", doc = "Which set of annotations to use.", optional = true)
+    private String annotationSet = "best_practices";
+
+    // Start the Python executor. This does not actually start the Python process, but fails if python can't be located
+    final PythonScriptExecutor pythonExecutor = new PythonScriptExecutor(true);
+
+
+    /**
+     * Asks {@link PythonScriptExecutor} to check the Python environment for the
+     * {@code vqsr_cnn} package before any work is attempted.
+     */
+    @Override
+    protected void onStartup() {
+        PythonScriptExecutor.checkPythonEnvironmentForPackage("vqsr_cnn");
+    }
+
+    /**
+     * Translates the tool's arguments into the command line of the {@code training.py}
+     * script and runs that script through the Python executor.
+     *
+     * @return the boolean reported by {@code PythonScriptExecutor.executeScript}
+     * @throws GATKException if the tensor type is neither {@code reference} nor {@code read_tensor}
+     */
+    @Override
+    protected Object doWork() {
+        final Resource pythonScriptResource = new Resource("training.py", FilterVariantTranches.class);
+        final List<String> arguments = new ArrayList<>(Arrays.asList(
+                "--data_dir", inputTensorDir,
+                "--output_dir", outputDir,
+                "--tensor_name", tensorType.name(),
+                "--annotation_set", annotationSet,
+                "--epochs", Integer.toString(epochs),
+                "--training_steps", Integer.toString(trainingSteps),
+                "--validation_steps", Integer.toString(validationSteps),
+                "--id", modelName));
+
+        // The script takes the channel ordering as one of two mutually exclusive flags.
+        if (channelsLast) {
+            arguments.add("--channels_last");
+        } else {
+            arguments.add("--channels_first");
+        }
+
+        // image-dir has no default, so it is only forwarded when the user supplied it.
+        if (imageDir != null) {
+            arguments.addAll(Arrays.asList("--image_dir", imageDir));
+        }
+
+        // The training mode handed to the script is selected by the tensor type.
+        if (tensorType == TensorType.reference) {
+            arguments.addAll(Arrays.asList("--mode", "train_on_reference_tensors_and_annotations"));
+        } else if (tensorType == TensorType.read_tensor) {
+            arguments.addAll(Arrays.asList("--mode", "train_small_model_on_read_tensors_and_annotations"));
+        } else {
+            throw new GATKException("Unknown tensor mapping mode:"+ tensorType.name());
+        }
+
+        logger.info("Args are:"+ Arrays.toString(arguments.toArray()));
+        final boolean pythonReturnCode = pythonExecutor.executeScript(
+                pythonScriptResource,
+                null,
+                arguments
+        );
+        return pythonReturnCode;
+    }
+
+}
@@ -0,0 +1,159 @@
+package org.broadinstitute.hellbender.tools.walkers.vqsr;
+
+import org.broadinstitute.barclay.argparser.*;
+import org.broadinstitute.barclay.help.DocumentedFeature;
+import org.broadinstitute.hellbender.cmdline.CommandLineProgram;
+import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
+import org.broadinstitute.hellbender.exceptions.GATKException;
+import org.broadinstitute.hellbender.utils.io.Resource;
+import org.broadinstitute.hellbender.utils.python.PythonScriptExecutor;
+import picard.cmdline.programgroups.VariantEvaluationProgramGroup;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+
+/**
+ * Write variant tensors for training a Convolutional Neural Network (CNN) for filtering variants.
+ * After running this tool, a model can be trained with the {@link CNNVariantTrain} tool.
+ *
+ *
+ * <h3>Inputs</h3>
+ * <ul>
+ *      <li>The input variants to make into tensors.
+ *      These variant calls must be annotated with the standard best practices annotations.</li>
+ *      <li>The truth VCF has validated variant calls, like those in the genomes in a bottle,
+ *      platinum genomes, or CHM VCFs.  Variants in both the input VCF and the truth VCF
+ *      will be used as positive training data.</li>
+ *      <li>The truth BED is a bed file define the confident region for the validated calls.
+ *      Variants from the input VCF inside this region, but not included in the truth VCF
+ *      will be used as negative training data.</li>
+ *      <li>The tensor-name argument determines what types of tensors will be written.
+ *      Set it to "reference" to write 1D tensors or "read_tensor" to write 2D tensors.</li>
+ *      <li>The bam-file argument is necessary to write 2D tensors which incorporate read data.</li>
+ * </ul>
+ *
+ * <h3>Outputs</h3>
+ * <ul>
+ * <li>data-dir This directory is created and populated with variant tensors.
+ *  it will be divided into training, validation and test sets and each set will be further divided into
+ *  positive and negative SNPs and INDELs.</li>
+ * </ul>
+ *
+ * <h3>Usage example</h3>
+ *
+ * <h4>Write Reference Tensors</h4>
+ * <pre>
+ * gatk CNNVariantWriteTensors \
+ *   -R reference.fasta \
+ *   -V input.vcf.gz \
+ *   -truth-vcf platinum-genomes.vcf \
+ *   -truth-bed platinum-confident-region.bed \
+ *   -tensor-name reference \
+ *   -output-tensor-dir my-tensor-folder
+ * </pre>
+ *
+ * <h4>Write Read Tensors</h4>
+ * <pre>
+ * gatk CNNVariantWriteTensors \
+ *   -R reference.fasta \
+ *   -V input.vcf.gz \
+ *   -truth-vcf platinum-genomes.vcf \
+ *   -truth-bed platinum-confident-region.bed \
+ *   -tensor-name read_tensor \
+ *   -bam-file input.bam \
+ *   -output-tensor-dir my-tensor-folder
+ * </pre>
+ *
+ */
+@CommandLineProgramProperties(
+        summary = "Write variant tensors for training a CNN to filter variants",
+        oneLineSummary = "Write variant tensors for training a CNN to filter variants",
+        programGroup = VariantEvaluationProgramGroup.class
+)
+@DocumentedFeature
+@ExperimentalFeature
+public class CNNVariantWriteTensors extends CommandLineProgram {
+
+    @Argument(fullName = StandardArgumentDefinitions.REFERENCE_LONG_NAME,
+            shortName = StandardArgumentDefinitions.REFERENCE_SHORT_NAME,
+            doc = "Reference fasta file.")
+    private String reference;
+
+    @Argument(fullName = StandardArgumentDefinitions.VARIANT_LONG_NAME,
+            shortName = StandardArgumentDefinitions.VARIANT_SHORT_NAME,
+            doc = "Input VCF file")
+    private String inputVcf;
+
+    @Argument(fullName = "output-tensor-dir", shortName = "output-tensor-dir", doc = "Directory of training tensors. Subdivided into train, valid and test sets.")
+    private String outputTensorsDir;
+
+    @Argument(fullName = "truth-vcf", shortName = "truth-vcf", doc = "Validated VCF file.")
+    private String truthVcf;
+
+    @Argument(fullName = "truth-bed", shortName = "truth-bed", doc = "Confident region of the validated VCF file.")
+    private String truthBed;
+
+    @Argument(fullName = "bam-file", shortName = "bam-file", doc = "BAM or BAMout file to use for read data when generating 2D tensors.", optional = true)
+    private String bamFile = "";
+
+    @Argument(fullName = "tensor-type", shortName = "tensor-type", doc = "Name of the tensors to generate.")
+    private TensorType tensorType = TensorType.reference;
+
+    @Advanced
+    @Argument(fullName = "channels-last", shortName = "channels-last", doc = "Store the channels in the last axis of tensors, tensorflow->true, theano->false", optional = true)
+    private boolean channelsLast = true;
+
+    @Advanced
+    @Argument(fullName = "annotation-set", shortName = "annotation-set", doc = "Which set of annotations to use.", optional = true)
+    private String annotationSet = "best_practices";
+
+    @Argument(fullName = "max-tensors", shortName = "max-tensors", doc = "Maximum number of tensors to write.", optional = true, minValue = 0)
+    private int maxTensors = 1000000;
+
+    // Start the Python executor. This does not actually start the Python process, but fails if python can't be located
+    final PythonScriptExecutor pythonExecutor = new PythonScriptExecutor(true);
+
+    @Override
+    protected void onStartup() {
+        PythonScriptExecutor.checkPythonEnvironmentForPackage("vqsr_cnn");
+    }
+
+    @Override
+    protected Object doWork() {
+        final Resource pythonScriptResource = new Resource("training.py", FilterVariantTranches.class);
+        List<String> arguments = new ArrayList<>(Arrays.asList(
+                "--reference_fasta", reference,
+                "--input_vcf", inputVcf,
+                "--bam_file", bamFile,
+                "--train_vcf", truthVcf,
+                "--bed_file", truthBed,
+                "--tensor_name", tensorType.name(),
+                "--annotation_set", annotationSet,
+                "--samples", Integer.toString(maxTensors),
+                "--data_dir", outputTensorsDir));
+
+        if(channelsLast){
+            arguments.add("--channels_last");
+        } else{
+            arguments.add("--channels_first");
+        }
+
+        if (tensorType == TensorType.reference) {
+            arguments.addAll(Arrays.asList("--mode", "write_reference_and_annotation_tensors"));
+        } else if (tensorType == TensorType.read_tensor) {
+            arguments.addAll(Arrays.asList("--mode", "write_read_and_annotation_tensors"));
+        } else {
+            throw new GATKException("Unknown tensor mapping mode:"+ tensorType.name());
+        }
+
+        logger.info("Args are:"+ Arrays.toString(arguments.toArray()));
+        final boolean pythonReturnCode = pythonExecutor.executeScript(
+                pythonScriptResource,
+                null,
+                arguments
+        );
+        return pythonReturnCode;
+    }
+
+}
Original file line number	Diff line number	Diff line change
`@@ -5,4 +5,5 @@`
`5`	`5`	`*/`
`6`	`6`	`public final class VariantFilterLibrary {`
`7`	`7`	`public static VariantFilter ALLOW_ALL_VARIANTS = variant -> true;`
	`8`	`+ public static VariantFilter NOT_SV_OR_SYMBOLIC = variant -> !variant.isSymbolicOrSV();`
`8`	`9`	`}`