Skip to content

Commit

Permalink
Allele-specific VQSR convergence fix (#6262)
Browse files Browse the repository at this point in the history
Increase jitter for AS_MQ to solve production pipeline convergence problems for exomes with AS annotations
Add VQSR debug arg
  • Loading branch information
ldgauthier authored Nov 26, 2019
1 parent 8a365d1 commit c61cb50
Show file tree
Hide file tree
Showing 13 changed files with 208 additions and 49 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -457,4 +457,16 @@ public UnimplementedFeature(String message){
super(message);
}
}

/**
 * Signals that the VQSR positive (good-variant) Gaussian mixture model failed to converge.
 * Thrown so callers can distinguish positive-model failures from negative-model failures.
 */
public static final class VQSRPositiveModelFailure extends UserException {
    private static final long serialVersionUID = 0L;

    /**
     * @param message description of the convergence failure, surfaced to the user
     */
    public VQSRPositiveModelFailure(final String message) {
        super(message);
    }
}

/**
 * Signals that the VQSR negative (bad-variant) Gaussian mixture model failed to converge.
 * Thrown so callers can distinguish negative-model failures from positive-model failures.
 */
public static final class VQSRNegativeModelFailure extends UserException {
    private static final long serialVersionUID = 0L;

    /**
     * @param message description of the convergence failure, surfaced to the user
     */
    public VQSRNegativeModelFailure(final String message) {
        super(message);
    }
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import org.apache.commons.math3.special.Gamma;

import org.broadinstitute.hellbender.exceptions.GATKException;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.utils.MathUtils;

Expand Down Expand Up @@ -89,8 +90,7 @@ public void divideEqualsMu( final double x ) {
private void precomputeInverse() {
try {
cachedSigmaInverse = sigma.inverse();
} catch( Exception e ) {
//TODO: there must be something narrower than Exception to catch here
} catch( RuntimeException e ) {
throw new UserException(
"Error during clustering. Most likely there are too few variants used during Gaussian mixture " +
"modeling. Please consider raising the number of variants used to train the negative "+
Expand All @@ -104,6 +104,9 @@ private void precomputeInverse() {
/**
 * Precomputes the log10 normalization constant (denominator) of this multivariate Gaussian,
 * caching it for later likelihood evaluation.
 *
 * @throws GATKException if the cached denominator comes out NaN — typically a degenerate
 *         covariance matrix caused by an annotation (often MQ) with insufficient variance
 */
public void precomputeDenominatorForEvaluation() {
    precomputeInverse();
    // log10[(2*pi)^(-k/2)] + log10[det(sigma)^(-1/2)], where k is the model dimensionality;
    // terms are computed and summed in the same order as before to keep results bit-identical
    final double dimensionTermLog10 = Math.log10(Math.pow(2.0 * Math.PI, -1.0 * ((double) mu.length) / 2.0));
    final double determinantTermLog10 = Math.log10(Math.pow(sigma.det(), -0.5));
    cachedDenomLog10 = dimensionTermLog10 + determinantTermLog10;
    if (Double.isNaN(cachedDenomLog10)) {
        throw new GATKException("Denominator for gaussian evaluation cannot be computed. One or more annotations (usually MQ) may have insufficient variance.");
    }
}

public void precomputeDenominatorForVariationalBayes( final double sumHyperParameterLambda ) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import htsjdk.variant.variantcontext.VariantContextBuilder;
import htsjdk.variant.vcf.VCFConstants;
import org.apache.commons.lang.ArrayUtils;
import org.apache.commons.lang3.AnnotationUtils;
import org.apache.logging.log4j.Logger;
import org.apache.logging.log4j.LogManager;

Expand Down Expand Up @@ -224,6 +225,8 @@ public List<VariantDatum> getTrainingData() {
for( final VariantDatum datum : data ) {
if( datum.atTrainingSite && !datum.failingSTDThreshold ) {
trainingData.add( datum );
} else if (datum.failingSTDThreshold && VRAC.debugStdevThresholding) {
logger.warn("Datum at " + datum.loc + " with ref " + datum.referenceAllele + " and alt " + datum.alternateAllele + " failing std thresholding: " + Arrays.toString(datum.annotations));
}
}
logger.info( "Training with " + trainingData.size() + " variants after standard deviation thresholding." );
Expand Down Expand Up @@ -352,7 +355,7 @@ private static double decodeAnnotation( final String annotationKey, final Varian
if( jitter && (annotationKey.equalsIgnoreCase(GATKVCFConstants.FISHER_STRAND_KEY) || annotationKey.equalsIgnoreCase(GATKVCFConstants.AS_FILTER_STATUS_KEY)) && MathUtils.compareDoubles(value, 0.0, PRECISION) == 0 ) { value += 0.01 * Utils.getRandomGenerator().nextGaussian(); }
if( jitter && annotationKey.equalsIgnoreCase(GATKVCFConstants.INBREEDING_COEFFICIENT_KEY) && MathUtils.compareDoubles(value, 0.0, PRECISION) == 0 ) { value += 0.01 * Utils.getRandomGenerator().nextGaussian(); }
if( jitter && (annotationKey.equalsIgnoreCase(GATKVCFConstants.STRAND_ODDS_RATIO_KEY) || annotationKey.equalsIgnoreCase(GATKVCFConstants.AS_STRAND_ODDS_RATIO_KEY)) && MathUtils.compareDoubles(value, LOG_OF_TWO, PRECISION) == 0 ) { value += 0.01 * Utils.getRandomGenerator().nextGaussian(); } //min SOR is 2.0, then we take ln
if( jitter && (annotationKey.equalsIgnoreCase(VCFConstants.RMS_MAPPING_QUALITY_KEY) || annotationKey.equalsIgnoreCase(GATKVCFConstants.AS_RMS_MAPPING_QUALITY_KEY))) {
if( jitter && (annotationKey.equalsIgnoreCase(VCFConstants.RMS_MAPPING_QUALITY_KEY))) {
if( vrac.MQ_CAP > 0) {
value = logitTransform(value, -SAFETY_OFFSET, vrac.MQ_CAP + SAFETY_OFFSET);
if (MathUtils.compareDoubles(value, logitTransform(vrac.MQ_CAP, -SAFETY_OFFSET, vrac.MQ_CAP + SAFETY_OFFSET), PRECISION) == 0 ) {
Expand All @@ -362,9 +365,11 @@ private static double decodeAnnotation( final String annotationKey, final Varian
value += vrac.MQ_JITTER * Utils.getRandomGenerator().nextGaussian();
}
}
} catch( Exception e ) {
//TODO: what exception is this handling ? it seems overly broad
value = Double.NaN; // The VQSR works with missing data by marginalizing over the missing dimension when evaluating the Gaussian mixture model
if( jitter && (annotationKey.equalsIgnoreCase(GATKVCFConstants.AS_RMS_MAPPING_QUALITY_KEY))){
value += vrac.MQ_JITTER * Utils.getRandomGenerator().nextGaussian();
}
} catch( NumberFormatException e ) {
value = Double.NaN; // VQSR works with missing data by marginalizing over the missing dimension when evaluating the Gaussian mixture model
}

return value;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -649,14 +649,20 @@ public Object onTraversalSuccess() {
// Generate the positive model using the training data and evaluate each variant
goodModel = engine.generateModel(positiveTrainingData, VRAC.MAX_GAUSSIANS);
engine.evaluateData(dataManager.getData(), goodModel, false);
if (goodModel.failedToConverge) {
throw new UserException.VQSRPositiveModelFailure("Positive training model failed to converge. One or more annotations " +
"(usually MQ) may have insufficient variance. Please consider lowering the maximum number" +
" of Gaussians allowed for use in the model (via --max-gaussians 4, for example).");
}
// Generate the negative model using the worst performing data and evaluate each variant contrastively
negativeTrainingData = dataManager.selectWorstVariants();
badModel = engine.generateModel(negativeTrainingData,
Math.min(VRAC.MAX_GAUSSIANS_FOR_NEGATIVE_MODEL, VRAC.MAX_GAUSSIANS));

if (badModel.failedToConverge || goodModel.failedToConverge) {
throw new UserException(
"NaN LOD value assigned. Clustering with this few variants and these annotations is unsafe. Please consider " + (badModel.failedToConverge ? "raising the number of variants used to train the negative model (via --minimum-bad-variants 5000, for example)." : "lowering the maximum number of Gaussians allowed for use in the model (via --max-gaussians 4, for example)."));
if (badModel.failedToConverge) {
throw new UserException.VQSRNegativeModelFailure(
"NaN LOD value assigned. Clustering with this few variants and these annotations is unsafe." +
" Please consider raising the number of variants used to train the negative model " +
"(via --minimum-bad-variants 5000, for example).");
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -117,29 +117,22 @@ public enum Mode {
* logit on [0,X] (+ epsilon to avoid division by zero) to make the blob more Gaussian-like and (2) the transformed
* MQ=X are jittered to break the peak into a narrow Gaussian.
*
* Beware that IndelRealigner, if used, adds 10 to MQ for successfully realigned indels. We recommend to either use
* --read-filter ReassignOriginalMQAfterIndelRealignment with HaplotypeCaller or use a MQCap=max+10 to take that
* into account.
*
* If this option is not used, or if MQCap is set to 0, MQ will not be transformed.
* Note that AS_MQ uses --mq-jitter instead to control artificial variance
*/
@Advanced
@Argument(fullName="mq-cap-for-logit-jitter-transform", shortName = "mq-cap", doc="Apply logit transform and jitter to MQ values", optional=true)
public int MQ_CAP = 0;

/**
* The following 2 arguments are hidden because they are only for testing different jitter amounts with and without logit transform.
* Once this will have been tested, and the correct jitter amount chosen (perhaps as a function of the logit range [0,max]) they can be removed.
* Amount of jitter (as a multiplier to a Normal(0,1) distribution) to add to the AS_MQ and transformed MQ values
*/

@Hidden
@Advanced
@Argument(fullName = "no-mq-logit", doc="MQ is by default transformed to log[(MQ_cap + epsilon - MQ)/(MQ + epsilon)] to make it more Gaussian-like. Use this flag to not do that.", optional = true)
public boolean NO_MQ_LOGIT = false;
@Argument(fullName="mq-jitter", doc="Amount of jitter (as a multiplier to a Normal(0,1) distribution) to add to the AS_MQ and transformed MQ values", optional = true)
public double MQ_JITTER = 0.05;

@Hidden
@Advanced
@Argument(fullName="mq-jitter", doc="Amount of jitter (as a factor to a Normal(0,1) noise) to add to the MQ capped values", optional = true)
public double MQ_JITTER = 0.05;
@Argument(fullName = "debug-stdev-thresholding", doc="Output variants that fail standard deviation thresholding to the log for debugging purposes. Redirection of stdout to a file is recommended.", optional = true)
public boolean debugStdevThresholding = false;

}
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import org.apache.logging.log4j.Logger;
import org.apache.logging.log4j.LogManager;

import org.broadinstitute.hellbender.exceptions.GATKException;
import org.broadinstitute.hellbender.utils.Utils;

import java.util.List;
Expand Down Expand Up @@ -53,7 +54,7 @@ public void evaluateData( final List<VariantDatum> data, final GaussianMixtureMo
try {
model.precomputeDenominatorForEvaluation();
} catch( Exception e ) {
logger.warn("Model could not pre-compute denominators."); //this happened when we were reading in VQSR models that didn't have enough precision
logger.warn("Model could not pre-compute denominators. " + e.getMessage()); //this happened when we were reading in VQSR models that didn't have enough precision
model.failedToConverge = true;
return;
}
Expand All @@ -63,7 +64,6 @@ public void evaluateData( final List<VariantDatum> data, final GaussianMixtureMo
for( final VariantDatum datum : data ) {
final double thisLod = evaluateDatum( datum, model );
if( Double.isNaN(thisLod) ) {
logger.warn("Evaluate datum returned a NaN.");
model.failedToConverge = true;
return;
}
Expand Down
Loading

0 comments on commit c61cb50

Please sign in to comment.