Skip to content

Strand based annotations will use both reads in an overlapping read pair #5286

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Oct 11, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -9,13 +9,15 @@
import org.apache.commons.math3.distribution.BetaDistribution;
import org.broadinstitute.barclay.help.DocumentedFeature;
import org.broadinstitute.hellbender.engine.ReferenceContext;
import org.broadinstitute.hellbender.tools.walkers.mutect.SomaticGenotypingEngine;
import org.broadinstitute.hellbender.utils.*;
import org.broadinstitute.hellbender.utils.genotyper.ReadLikelihoods;
import org.broadinstitute.hellbender.utils.help.HelpConstants;
import org.broadinstitute.hellbender.utils.logging.OneShotLogger;
import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants;

import java.util.*;
import java.util.stream.Collectors;

import static org.broadinstitute.hellbender.tools.walkers.annotator.StrandArtifact.ArtifactState.*;

Expand Down Expand Up @@ -88,13 +90,22 @@ public void annotate(final ReferenceContext ref,
return;
}
final int indexOfMaxTumorLod = MathUtils.maxElementIndex(tumorLods);
final Allele altAlelle = vc.getAlternateAllele(indexOfMaxTumorLod);

final Collection<ReadLikelihoods<Allele>.BestAllele> bestAlleles = likelihoods.bestAllelesBreakingTies(g.getSampleName());
final int numFwdAltReads = (int) bestAlleles.stream().filter(ba -> !ba.read.isReverseStrand() && ba.isInformative() && ba.allele.equals(altAlelle)).count();
final int numRevAltReads = (int) bestAlleles.stream().filter(ba -> ba.read.isReverseStrand() && ba.isInformative() && ba.allele.equals(altAlelle)).count();
final int numFwdReads = (int) bestAlleles.stream().filter(ba -> !ba.read.isReverseStrand() && ba.isInformative()).count();
final int numRevReads = (int) bestAlleles.stream().filter(ba -> ba.read.isReverseStrand() && ba.isInformative()).count();
final Allele altAllele = vc.getAlternateAllele(indexOfMaxTumorLod);

final Collection<ReadLikelihoods<Allele>.BestAllele> informativeBestAlleles = likelihoods.bestAllelesBreakingTies(g.getSampleName()).stream().
filter(ba -> ba.isInformative()).collect(Collectors.toList());
final Map<Strand, List<ReadLikelihoods<Allele>.BestAllele>> altReads = informativeBestAlleles.stream().filter(ba -> ba.allele.equals(altAllele))
.collect(Collectors.groupingBy(ba -> ba.read.isReverseStrand() ? Strand.REV : Strand.FWD));
final Map<Strand, List<ReadLikelihoods<Allele>.BestAllele>> refReads = informativeBestAlleles.stream().filter(ba -> ba.allele.equals(vc.getReference()))
.collect(Collectors.groupingBy(ba -> ba.read.isReverseStrand() ? Strand.REV : Strand.FWD));

final int numFwdAltReads = countReads(altReads, Strand.FWD);
final int numRevAltReads = countReads(altReads, Strand.REV);
final int numFwdRefReads = countReads(refReads, Strand.FWD);
final int numRevRefReads = countReads(refReads, Strand.REV);

final int numFwdReads = numFwdRefReads + numFwdAltReads;
final int numRevReads = numRevRefReads + numRevAltReads;
final int numAltReads = numFwdAltReads + numRevAltReads;
final int numReads = numFwdReads + numRevReads;

Expand Down Expand Up @@ -162,6 +173,10 @@ public void annotate(final ReferenceContext ref,
gb.attribute(MAP_ALLELE_FRACTIONS_KEY, estimatedAlleleFractions.values().stream().mapToDouble(Double::doubleValue).toArray());
}

private enum Strand {
FWD, REV
}

@Override
public List<VCFFormatHeaderLine> getDescriptions() {
return Arrays.asList(new VCFFormatHeaderLine(POSTERIOR_PROBABILITIES_KEY, 3, VCFHeaderLineType.Float, "posterior probabilities of the presence of strand artifact"),
Expand All @@ -177,5 +192,19 @@ private double getIntegrandGivenArtifact(final double f, final double epsilon, f
MathUtils.binomialProbability(nNoArtifact, xNoArtifact, f);
}

/**
*
* Count the number of reads in {@param reads}, including the overlapping mates discarded by {@link SomaticGenotypingEngine}.
* @param strand is the strand of the reads we're counting. For example, if strand = Strand.FWD, the we look among {@param reads}
* for reverse reads with {@link SomaticGenotypingEngine.DISCARDED_MATE_READ_TAG} set to 1
*/
private int countReads(final Map<Strand, List<ReadLikelihoods<Allele>.BestAllele>> reads, final Strand strand){
final Strand strandOfKeptRead = strand == Strand.FWD ? Strand.REV : Strand.FWD;

// check of the key in case there are no forward or reverse in {@link reads}
final int numDiscardedReads = ! reads.containsKey(strandOfKeptRead) ? 0 : (int) reads.get(strandOfKeptRead).stream().filter(ba -> ba.read.hasAttribute(SomaticGenotypingEngine.DISCARDED_MATE_READ_TAG)).count();
final int numReads = ! reads.containsKey(strand) ? 0 : reads.get(strand).size();
return numReads + numDiscardedReads;

}
}
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import htsjdk.variant.variantcontext.VariantContext;
import org.broadinstitute.hellbender.engine.ReferenceContext;
import org.broadinstitute.hellbender.exceptions.GATKException;
import org.broadinstitute.hellbender.tools.walkers.mutect.SomaticGenotypingEngine;
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.genotyper.ReadLikelihoods;
import org.broadinstitute.hellbender.utils.pileup.PileupElement;
Expand Down Expand Up @@ -170,8 +171,14 @@ private static void updateTable(final int[] table, final Allele allele, final GA
final int offset = matchesRef ? 0 : ARRAY_DIM;

// a normal read with an actual strand
final boolean isFW = !read.isReverseStrand();
table[offset + (isFW ? 0 : 1)]++;
final boolean isForward = !read.isReverseStrand();
table[offset + (isForward ? 0 : 1)]++;

// This read's mate got discarded by SomaticGenotypingEngine::clipOverlappingReads()
if (read.hasAttribute(SomaticGenotypingEngine.DISCARDED_MATE_READ_TAG)){
// Note that if this read is forward, then we increment its mate which we assume is reverse
table[offset + (isForward ? 1 : 0)]++;
}
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ public abstract class AssemblyBasedCallerArgumentCollection extends StandardCall

public static final String MIN_BASE_QUALITY_SCORE_LONG_NAME = "min-base-quality-score";
public static final String SMITH_WATERMAN_LONG_NAME = "smith-waterman";
public static final String CORRECT_OVERLAPPING_BASE_QUALITIES_LONG_NAME = "correct-overlapping-quality";

@ArgumentCollection
public AssemblyRegionTrimmerArgumentCollection assemblyRegionTrimmerArgs = new AssemblyRegionTrimmerArgumentCollection();
Expand Down Expand Up @@ -123,4 +124,7 @@ public abstract class AssemblyBasedCallerArgumentCollection extends StandardCall
@Argument(fullName = SMITH_WATERMAN_LONG_NAME, doc = "Which Smith-Waterman implementation to use, generally FASTEST_AVAILABLE is the right choice", optional = true)
public SmithWatermanAligner.Implementation smithWatermanImplementation = SmithWatermanAligner.Implementation.JAVA;

@Argument(fullName = CORRECT_OVERLAPPING_BASE_QUALITIES_LONG_NAME)
public boolean doNotCorrectOverlappingBaseQualities = false;

}
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,8 @@ public static void finalizeRegion(final AssemblyRegion region,
final boolean dontUseSoftClippedBases,
final byte minTailQuality,
final SAMFileHeader readsHeader,
final SampleList samplesList) {
final SampleList samplesList,
final boolean correctOverlappingBaseQualities) {
if ( region.isFinalized() ) {
return;
}
Expand Down Expand Up @@ -98,7 +99,7 @@ public static void finalizeRegion(final AssemblyRegion region,
Collections.sort(readsToUse, new ReadCoordinateComparator(readsHeader)); // TODO: sort may be unnecessary here

// handle overlapping read pairs from the same fragment
cleanOverlappingReadPairs(readsToUse, samplesList, readsHeader);
cleanOverlappingReadPairs(readsToUse, samplesList, readsHeader, correctOverlappingBaseQualities);

region.clearReads();
region.addAll(readsToUse);
Expand All @@ -109,12 +110,14 @@ public static void finalizeRegion(final AssemblyRegion region,
* Clean up reads/bases that overlap within read pairs
*
* @param reads the list of reads to consider
* @param correctOverlappingBaseQualities
*/
private static void cleanOverlappingReadPairs(final List<GATKRead> reads, final SampleList samplesList, final SAMFileHeader readsHeader) {
private static void cleanOverlappingReadPairs(final List<GATKRead> reads, final SampleList samplesList, final SAMFileHeader readsHeader,
final boolean correctOverlappingBaseQualities) {
for ( final List<GATKRead> perSampleReadList : splitReadsBySample(samplesList, readsHeader, reads).values() ) {
final FragmentCollection<GATKRead> fragmentCollection = FragmentCollection.create(perSampleReadList);
for ( final List<GATKRead> overlappingPair : fragmentCollection.getOverlappingPairs() ) {
FragmentUtils.adjustQualsOfOverlappingPairedFragments(overlappingPair);
FragmentUtils.adjustQualsOfOverlappingPairedFragments(overlappingPair, correctOverlappingBaseQualities);
}
}
}
Expand Down Expand Up @@ -238,7 +241,7 @@ public static AssemblyResultSet assembleReads(final AssemblyRegion region,
final ReferenceSequenceFile referenceReader,
final ReadThreadingAssembler assemblyEngine,
final SmithWatermanAligner aligner){
finalizeRegion(region, argumentCollection.errorCorrectReads, argumentCollection.dontUseSoftClippedBases, (byte)(argumentCollection.minBaseQualityScore - 1), header, sampleList);
finalizeRegion(region, argumentCollection.errorCorrectReads, argumentCollection.dontUseSoftClippedBases, (byte)(argumentCollection.minBaseQualityScore - 1), header, sampleList, ! argumentCollection.doNotCorrectOverlappingBaseQualities);
if( argumentCollection.debug) {
logger.info("Assembling " + region.getSpan() + " with " + region.size() + " reads: (with overlap region = " + region.getExtendedSpan() + ")");
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -673,7 +673,7 @@ private List<VariantContext> referenceModelForNoVariation(final AssemblyRegion r
//TODO - why the activeRegion cannot manage its own one-time finalization and filtering?
//TODO - perhaps we can remove the last parameter of this method and the three lines bellow?
if ( needsToBeFinalized ) {
finalizeRegion(region);
AssemblyBasedCallerUtils.finalizeRegion(region, hcArgs.errorCorrectReads, hcArgs.dontUseSoftClippedBases, minTailQuality, readsHeader, samplesList, ! hcArgs.doNotCorrectOverlappingBaseQualities);
}
filterNonPassingReads(region);

Expand Down Expand Up @@ -723,10 +723,6 @@ public void shutdown() {

}

private void finalizeRegion(final AssemblyRegion region) {
AssemblyBasedCallerUtils.finalizeRegion(region, hcArgs.errorCorrectReads, hcArgs.dontUseSoftClippedBases, minTailQuality, readsHeader, samplesList);
}

private Set<GATKRead> filterNonPassingReads( final AssemblyRegion activeRegion ) {
// TODO: can we unify this additional filtering with makeStandardHCReadFilter()?

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ public class M2ArgumentCollection extends AssemblyBasedCallerArgumentCollection
public static final String IGNORE_ITR_ARTIFACTS_LONG_NAME = "ignore-itr-artifacts";
public static final String ARTIFACT_PRIOR_TABLE_NAME = "orientation-bias-artifact-priors";
public static final String GET_AF_FROM_AD_LONG_NAME = "get-af-from-ad";
public static final String ANNOTATE_BASED_ON_READS_LONG_NAME = "annotate-fragments";

public static final double DEFAULT_AF_FOR_TUMOR_ONLY_CALLING = 5e-8;
public static final double DEFAULT_AF_FOR_TUMOR_NORMAL_CALLING = 1e-6;
Expand Down Expand Up @@ -169,4 +170,12 @@ public double getDefaultAlleleFrequency() {
@Argument(fullName = GET_AF_FROM_AD_LONG_NAME, doc="Use allelic depth to calculate tumor allele fraction; recommended for mitochondrial applications", optional = true)
public boolean calculateAFfromAD = false;

/**
* If set to true, count an overlapping read pair as two separate reads instead of one for {@link StrandArtifact} and {@link StrandBiasBySample} annotations,
* which is the correct behavior for these annotations. Note that doing so would break the independence assumption of reads and over-count the alt depth in these annotations.
* On the other hand it could prevent spurious false negatives that could arise if by chance one strand in overlapping pairs is dropped disproportionately
*/
@Argument(fullName = ANNOTATE_BASED_ON_READS_LONG_NAME, doc = "If true, strand-based annotations use the number of reads, not fragments")
public boolean annotateBasedOnReads = false;

}
Original file line number Diff line number Diff line change
Expand Up @@ -190,7 +190,7 @@ public List<VariantContext> callRegion(final AssemblyRegion originalAssemblyRegi
final AssemblyRegion assemblyActiveRegion = AssemblyBasedCallerUtils.assemblyRegionWithWellMappedReads(originalAssemblyRegion, READ_QUALITY_FILTER_THRESHOLD, header);
final AssemblyResultSet untrimmedAssemblyResult = AssemblyBasedCallerUtils.assembleReads(assemblyActiveRegion, givenAlleles, MTAC, header, samplesList, logger, referenceReader, assemblyEngine, aligner);
final SortedSet<VariantContext> allVariationEvents = untrimmedAssemblyResult.getVariationEvents(MTAC.maxMnpDistance);
final AssemblyRegionTrimmer.Result trimmingResult = trimmer.trim(originalAssemblyRegion,allVariationEvents);
final AssemblyRegionTrimmer.Result trimmingResult = trimmer.trim(originalAssemblyRegion, allVariationEvents);
if (!trimmingResult.isVariationPresent()) {
return NO_CALLS;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
import org.apache.commons.math3.util.FastMath;
import org.broadinstitute.hellbender.engine.FeatureContext;
import org.broadinstitute.hellbender.engine.ReferenceContext;
import org.broadinstitute.hellbender.tools.walkers.annotator.StrandBiasTest;
import org.broadinstitute.hellbender.tools.walkers.genotyper.afcalc.AFCalculator;
import org.broadinstitute.hellbender.tools.walkers.genotyper.afcalc.AFCalculatorProvider;
import org.broadinstitute.hellbender.tools.walkers.haplotypecaller.AssemblyBasedCallerGenotypingEngine;
Expand All @@ -37,6 +38,7 @@ public class SomaticGenotypingEngine extends AssemblyBasedCallerGenotypingEngine
public final String tumorSample;
private final String normalSample;
final boolean hasNormal;
public static final String DISCARDED_MATE_READ_TAG = "DM";

// {@link GenotypingEngine} requires a non-null {@link AFCalculatorProvider} but this class doesn't need it. Thus we make a dummy
private static final AFCalculatorProvider DUMMY_AF_CALCULATOR_PROVIDER = new AFCalculatorProvider() {
Expand Down Expand Up @@ -226,8 +228,8 @@ private PerAlleleCollection<Double> somaticLog10Odds(final LikelihoodMatrix<Alle
}

private void addGenotypes(final LikelihoodMatrix<Allele> tumorLog10Matrix,
final Optional<LikelihoodMatrix<Allele>> normalLog10Matrix,
final VariantContextBuilder callVcb) {
final Optional<LikelihoodMatrix<Allele>> normalLog10Matrix,
final VariantContextBuilder callVcb) {
final double[] tumorAlleleCounts = getEffectiveCounts(tumorLog10Matrix);
final int[] adArray = Arrays.stream(tumorAlleleCounts).mapToInt(x -> (int) FastMath.round(x)).toArray();
final int dp = (int) MathUtils.sum(adArray);
Expand Down Expand Up @@ -325,6 +327,13 @@ private void filterOverlappingReads(final ReadLikelihoods<Allele> likelihoods, f
if (read.allele.equals(mate.allele)) {
// keep the higher-quality read
readsToDiscard.add(read.likelihood < mate.likelihood ? read.read : mate.read);

// mark the read to indicate that its mate was dropped - so that we can account for it in {@link StrandArtifact}
// and {@link StrandBiasBySample}
if (MTAC.annotateBasedOnReads){
final GATKRead readToKeep = read.likelihood >= mate.likelihood ? read.read : mate.read;
readToKeep.setAttribute(DISCARDED_MATE_READ_TAG, 1);
}
} else if (retainMismatches) {
// keep the alt read
readsToDiscard.add(read.allele.equals(ref) ? read.read : mate.read);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,12 +25,13 @@ private FragmentUtils() {}
* Assumes that firstRead starts before secondRead (according to their soft clipped starts)
* Sets the qualities of clippedFirstRead and clippedSecondRead to mimic a merged read or
* nothing if the algorithm cannot create a meaningful one
*
* @param clippedFirstRead the left most read
* @param clippedFirstRead the left most read
* @param clippedSecondRead the right most read
* @param correctOverlappingBaseQualities
*
*/
public static void adjustQualsOfOverlappingPairedFragments(final GATKRead clippedFirstRead, final GATKRead clippedSecondRead) {
public static void adjustQualsOfOverlappingPairedFragments(final GATKRead clippedFirstRead, final GATKRead clippedSecondRead,
final boolean correctOverlappingBaseQualities) {
Utils.nonNull(clippedFirstRead);
Utils.nonNull(clippedSecondRead);
Utils.validateArg(clippedFirstRead.getName().equals(clippedSecondRead.getName()), () ->
Expand All @@ -51,6 +52,10 @@ public static void adjustQualsOfOverlappingPairedFragments(final GATKRead clippe
final byte[] secondReadQuals = clippedSecondRead.getBaseQualities();

for ( int i = 0; i < numOverlappingBases; i++ ) {
if (! correctOverlappingBaseQualities) {
break;
}

final int firstReadIndex = firstReadStop + i;
final byte firstReadBase = firstReadBases[firstReadIndex];
final byte secondReadBase = secondReadBases[i];
Expand All @@ -69,17 +74,17 @@ public static void adjustQualsOfOverlappingPairedFragments(final GATKRead clippe
clippedSecondRead.setBaseQualities(secondReadQuals);
}

public static void adjustQualsOfOverlappingPairedFragments( final List<GATKRead> overlappingPair ) {
public static void adjustQualsOfOverlappingPairedFragments(final List<GATKRead> overlappingPair, final boolean correctOverlappingBaseQualities) {
Utils.validateArg( overlappingPair.size() == 2, () -> "Found overlapping pair with " + overlappingPair.size() + " reads, but expecting exactly 2.");

final GATKRead firstRead = overlappingPair.get(0);
final GATKRead secondRead = overlappingPair.get(1);

if ( secondRead.getSoftStart() < firstRead.getSoftStart() ) {
adjustQualsOfOverlappingPairedFragments(secondRead, firstRead);
adjustQualsOfOverlappingPairedFragments(secondRead, firstRead, correctOverlappingBaseQualities);
}
else {
adjustQualsOfOverlappingPairedFragments(firstRead, secondRead);
adjustQualsOfOverlappingPairedFragments(firstRead, secondRead, correctOverlappingBaseQualities);
}
}
}
Loading