Skip to content

Commit

Permalink
Allele-specific VQSR convergence fix (#6262)
Browse files Browse the repository at this point in the history
Increase jitter for AS_MQ to solve production pipeline convergence problems for exomes with AS annotations
Add VQSR debug arg
  • Loading branch information
ldgauthier authored Nov 26, 2019
1 parent 8a365d1 commit c61cb50
Show file tree
Hide file tree
Showing 13 changed files with 208 additions and 49 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -457,4 +457,16 @@ public UnimplementedFeature(String message){
super(message);
}
}

/**
 * Signals that the VQSR positive (good-variant) Gaussian mixture model failed to converge.
 * Thrown so callers can distinguish positive-model failures from negative-model failures.
 */
public static final class VQSRPositiveModelFailure extends UserException {
    private static final long serialVersionUID = 0L;

    /**
     * @param message description of the convergence failure, surfaced to the user
     */
    public VQSRPositiveModelFailure(final String message) {
        super(message);
    }
}

/**
 * Signals that the VQSR negative (bad-variant) Gaussian mixture model failed to converge.
 * Thrown so callers can distinguish negative-model failures from positive-model failures.
 */
public static final class VQSRNegativeModelFailure extends UserException {
    private static final long serialVersionUID = 0L;

    /**
     * @param message description of the convergence failure, surfaced to the user
     */
    public VQSRNegativeModelFailure(final String message) {
        super(message);
    }
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import org.apache.commons.math3.special.Gamma;

import org.broadinstitute.hellbender.exceptions.GATKException;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.utils.MathUtils;

Expand Down Expand Up @@ -89,8 +90,7 @@ public void divideEqualsMu( final double x ) {
private void precomputeInverse() {
try {
cachedSigmaInverse = sigma.inverse();
} catch( Exception e ) {
//TODO: there must be something narrower than Exception to catch here
} catch( RuntimeException e ) {
throw new UserException(
"Error during clustering. Most likely there are too few variants used during Gaussian mixture " +
"modeling. Please consider raising the number of variants used to train the negative "+
Expand All @@ -104,6 +104,9 @@ private void precomputeInverse() {
/**
 * Precomputes the log10 normalization constant (denominator) of this multivariate Gaussian,
 * caching it for later likelihood evaluation.
 *
 * @throws GATKException if the cached denominator comes out NaN — typically a degenerate
 *         covariance matrix caused by an annotation (often MQ) with insufficient variance
 */
public void precomputeDenominatorForEvaluation() {
    precomputeInverse();
    // log10[(2*pi)^(-k/2)] + log10[det(sigma)^(-1/2)], where k is the model dimensionality;
    // terms are computed and summed in the same order as before to keep results bit-identical
    final double dimensionTermLog10 = Math.log10(Math.pow(2.0 * Math.PI, -1.0 * ((double) mu.length) / 2.0));
    final double determinantTermLog10 = Math.log10(Math.pow(sigma.det(), -0.5));
    cachedDenomLog10 = dimensionTermLog10 + determinantTermLog10;
    if (Double.isNaN(cachedDenomLog10)) {
        throw new GATKException("Denominator for gaussian evaluation cannot be computed. One or more annotations (usually MQ) may have insufficient variance.");
    }
}

public void precomputeDenominatorForVariationalBayes( final double sumHyperParameterLambda ) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import htsjdk.variant.variantcontext.VariantContextBuilder;
import htsjdk.variant.vcf.VCFConstants;
import org.apache.commons.lang.ArrayUtils;
import org.apache.commons.lang3.AnnotationUtils;
import org.apache.logging.log4j.Logger;
import org.apache.logging.log4j.LogManager;

Expand Down Expand Up @@ -224,6 +225,8 @@ public List<VariantDatum> getTrainingData() {
for( final VariantDatum datum : data ) {
if( datum.atTrainingSite && !datum.failingSTDThreshold ) {
trainingData.add( datum );
} else if (datum.failingSTDThreshold && VRAC.debugStdevThresholding) {
logger.warn("Datum at " + datum.loc + " with ref " + datum.referenceAllele + " and alt " + datum.alternateAllele + " failing std thresholding: " + Arrays.toString(datum.annotations));
}
}
logger.info( "Training with " + trainingData.size() + " variants after standard deviation thresholding." );
Expand Down Expand Up @@ -352,7 +355,7 @@ private static double decodeAnnotation( final String annotationKey, final Varian
if( jitter && (annotationKey.equalsIgnoreCase(GATKVCFConstants.FISHER_STRAND_KEY) || annotationKey.equalsIgnoreCase(GATKVCFConstants.AS_FILTER_STATUS_KEY)) && MathUtils.compareDoubles(value, 0.0, PRECISION) == 0 ) { value += 0.01 * Utils.getRandomGenerator().nextGaussian(); }
if( jitter && annotationKey.equalsIgnoreCase(GATKVCFConstants.INBREEDING_COEFFICIENT_KEY) && MathUtils.compareDoubles(value, 0.0, PRECISION) == 0 ) { value += 0.01 * Utils.getRandomGenerator().nextGaussian(); }
if( jitter && (annotationKey.equalsIgnoreCase(GATKVCFConstants.STRAND_ODDS_RATIO_KEY) || annotationKey.equalsIgnoreCase(GATKVCFConstants.AS_STRAND_ODDS_RATIO_KEY)) && MathUtils.compareDoubles(value, LOG_OF_TWO, PRECISION) == 0 ) { value += 0.01 * Utils.getRandomGenerator().nextGaussian(); } //min SOR is 2.0, then we take ln
if( jitter && (annotationKey.equalsIgnoreCase(VCFConstants.RMS_MAPPING_QUALITY_KEY) || annotationKey.equalsIgnoreCase(GATKVCFConstants.AS_RMS_MAPPING_QUALITY_KEY))) {
if( jitter && (annotationKey.equalsIgnoreCase(VCFConstants.RMS_MAPPING_QUALITY_KEY))) {
if( vrac.MQ_CAP > 0) {
value = logitTransform(value, -SAFETY_OFFSET, vrac.MQ_CAP + SAFETY_OFFSET);
if (MathUtils.compareDoubles(value, logitTransform(vrac.MQ_CAP, -SAFETY_OFFSET, vrac.MQ_CAP + SAFETY_OFFSET), PRECISION) == 0 ) {
Expand All @@ -362,9 +365,11 @@ private static double decodeAnnotation( final String annotationKey, final Varian
value += vrac.MQ_JITTER * Utils.getRandomGenerator().nextGaussian();
}
}
} catch( Exception e ) {
//TODO: what exception is this handling ? it seems overly broad
value = Double.NaN; // The VQSR works with missing data by marginalizing over the missing dimension when evaluating the Gaussian mixture model
if( jitter && (annotationKey.equalsIgnoreCase(GATKVCFConstants.AS_RMS_MAPPING_QUALITY_KEY))){
value += vrac.MQ_JITTER * Utils.getRandomGenerator().nextGaussian();
}
} catch( NumberFormatException e ) {
value = Double.NaN; // VQSR works with missing data by marginalizing over the missing dimension when evaluating the Gaussian mixture model
}

return value;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -649,14 +649,20 @@ public Object onTraversalSuccess() {
// Generate the positive model using the training data and evaluate each variant
goodModel = engine.generateModel(positiveTrainingData, VRAC.MAX_GAUSSIANS);
engine.evaluateData(dataManager.getData(), goodModel, false);
if (goodModel.failedToConverge) {
throw new UserException.VQSRPositiveModelFailure("Positive training model failed to converge. One or more annotations " +
"(usually MQ) may have insufficient variance. Please consider lowering the maximum number" +
" of Gaussians allowed for use in the model (via --max-gaussians 4, for example).");
}
// Generate the negative model using the worst performing data and evaluate each variant contrastively
negativeTrainingData = dataManager.selectWorstVariants();
badModel = engine.generateModel(negativeTrainingData,
Math.min(VRAC.MAX_GAUSSIANS_FOR_NEGATIVE_MODEL, VRAC.MAX_GAUSSIANS));

if (badModel.failedToConverge || goodModel.failedToConverge) {
throw new UserException(
"NaN LOD value assigned. Clustering with this few variants and these annotations is unsafe. Please consider " + (badModel.failedToConverge ? "raising the number of variants used to train the negative model (via --minimum-bad-variants 5000, for example)." : "lowering the maximum number of Gaussians allowed for use in the model (via --max-gaussians 4, for example)."));
if (badModel.failedToConverge) {
throw new UserException.VQSRNegativeModelFailure(
"NaN LOD value assigned. Clustering with this few variants and these annotations is unsafe." +
" Please consider raising the number of variants used to train the negative model " +
"(via --minimum-bad-variants 5000, for example).");
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -117,29 +117,22 @@ public enum Mode {
* logit on [0,X] (+ epsilon to avoid division by zero) to make the blob more Gaussian-like and (2) the transformed
* MQ=X are jittered to break the peak into a narrow Gaussian.
*
* Beware that IndelRealigner, if used, adds 10 to MQ for successfully realigned indels. We recommend to either use
* --read-filter ReassignOriginalMQAfterIndelRealignment with HaplotypeCaller or use a MQCap=max+10 to take that
* into account.
*
* If this option is not used, or if MQCap is set to 0, MQ will not be transformed.
* Note that AS_MQ uses --mq-jitter instead to control artificial variance
*/
@Advanced
@Argument(fullName="mq-cap-for-logit-jitter-transform", shortName = "mq-cap", doc="Apply logit transform and jitter to MQ values", optional=true)
public int MQ_CAP = 0;

/**
* The following 2 arguments are hidden because they are only for testing different jitter amounts with and without logit transform.
* Once this will have been tested, and the correct jitter amount chosen (perhaps as a function of the logit range [0,max]) they can be removed.
* Amount of jitter (as a multiplier to a Normal(0,1) distribution) to add to the AS_MQ and transformed MQ values
*/

@Hidden
@Advanced
@Argument(fullName = "no-mq-logit", doc="MQ is by default transformed to log[(MQ_cap + epsilon - MQ)/(MQ + epsilon)] to make it more Gaussian-like. Use this flag to not do that.", optional = true)
public boolean NO_MQ_LOGIT = false;
@Argument(fullName="mq-jitter", doc="Amount of jitter (as a multiplier to a Normal(0,1) distribution) to add to the AS_MQ and transformed MQ values", optional = true)
public double MQ_JITTER = 0.05;

@Hidden
@Advanced
@Argument(fullName="mq-jitter", doc="Amount of jitter (as a factor to a Normal(0,1) noise) to add to the MQ capped values", optional = true)
public double MQ_JITTER = 0.05;
@Argument(fullName = "debug-stdev-thresholding", doc="Output variants that fail standard deviation thresholding to the log for debugging purposes. Redirection of stdout to a file is recommended.", optional = true)
public boolean debugStdevThresholding = false;

}
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import org.apache.logging.log4j.Logger;
import org.apache.logging.log4j.LogManager;

import org.broadinstitute.hellbender.exceptions.GATKException;
import org.broadinstitute.hellbender.utils.Utils;

import java.util.List;
Expand Down Expand Up @@ -53,7 +54,7 @@ public void evaluateData( final List<VariantDatum> data, final GaussianMixtureMo
try {
model.precomputeDenominatorForEvaluation();
} catch( Exception e ) {
logger.warn("Model could not pre-compute denominators."); //this happened when we were reading in VQSR models that didn't have enough precision
logger.warn("Model could not pre-compute denominators. " + e.getMessage()); //this happened when we were reading in VQSR models that didn't have enough precision
model.failedToConverge = true;
return;
}
Expand All @@ -63,7 +64,6 @@ public void evaluateData( final List<VariantDatum> data, final GaussianMixtureMo
for( final VariantDatum datum : data ) {
final double thisLod = evaluateDatum( datum, model );
if( Double.isNaN(thisLod) ) {
logger.warn("Evaluate datum returned a NaN.");
model.failedToConverge = true;
return;
}
Expand Down
Loading

0 comments on commit c61cb50

Please sign in to comment.