Skip to content

Commit 90bf668

Browse files
authored
Adding argument to GenotypeGVCFs to keep specified raw annotations (#7996)
1 parent 277bf00 commit 90bf668

25 files changed

+128
-27
lines changed

src/main/java/org/broadinstitute/hellbender/cmdline/GATKPlugin/DefaultGATKVariantAnnotationArgumentCollection.java

-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
package org.broadinstitute.hellbender.cmdline.GATKPlugin;
22

3-
import com.google.common.collect.Lists;
43
import org.broadinstitute.barclay.argparser.Advanced;
54
import org.broadinstitute.barclay.argparser.Argument;
65
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;

src/main/java/org/broadinstitute/hellbender/cmdline/GATKPlugin/GATKAnnotationPluginDescriptor.java

+10-2
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,6 @@
1616
import org.broadinstitute.hellbender.utils.config.ConfigFactory;
1717
import org.broadinstitute.hellbender.utils.config.GATKConfig;
1818

19-
import java.io.File;
2019
import java.lang.reflect.Modifier;
2120
import java.util.*;
2221
import java.util.stream.Collectors;
@@ -485,6 +484,16 @@ public List<Annotation> getResolvedInstances() {
485484
return resolvedInstances;
486485
}
487486

487+
/**
488+
* Returns a map of the String to Annotations only in the resolved instances.
489+
*
490+
* @return a Map of Strings to Annotations of resolved instances
491+
*/
492+
public Map<String, Annotation> getResolvedInstancesMap() {
493+
return allDiscoveredAnnotations.entrySet().stream()
494+
.filter(e -> getResolvedInstances().contains(e.getValue()))
495+
.collect(Collectors.toMap(e -> e.getKey(), e -> e.getValue()));
496+
}
488497

489498
/**
490499
* Return the class representing the instance of the plugin specified by {@code pluginName}
@@ -496,5 +505,4 @@ public List<Annotation> getResolvedInstances() {
496505
public Class<?> getClassForPluginHelp(final String pluginName) {
497506
return allDiscoveredAnnotations.containsKey(pluginName) ? allDiscoveredAnnotations.get(pluginName).getClass() : null;
498507
}
499-
500508
}

src/main/java/org/broadinstitute/hellbender/tools/walkers/GenotypeGVCFs.java

+34-5
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,8 @@
99
import htsjdk.variant.vcf.VCFHeaderLine;
1010
import org.broadinstitute.barclay.argparser.*;
1111
import org.broadinstitute.barclay.help.DocumentedFeature;
12+
import org.broadinstitute.hellbender.cmdline.GATKPlugin.GATKAnnotationPluginDescriptor;
13+
import org.broadinstitute.hellbender.cmdline.GATKPlugin.GATKReadFilterPluginDescriptor;
1214
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
1315
import org.broadinstitute.hellbender.cmdline.argumentcollections.DbsnpArgumentCollection;
1416
import org.broadinstitute.hellbender.cmdline.programgroups.ShortVariantDiscoveryProgramGroup;
@@ -17,6 +19,7 @@
1719
import org.broadinstitute.hellbender.engine.ReadsContext;
1820
import org.broadinstitute.hellbender.engine.ReferenceContext;
1921
import org.broadinstitute.hellbender.engine.VariantLocusWalker;
22+
import org.broadinstitute.hellbender.exceptions.UserException;
2023
import org.broadinstitute.hellbender.tools.genomicsdb.GenomicsDBArgumentCollection;
2124
import org.broadinstitute.hellbender.tools.genomicsdb.GenomicsDBImport;
2225
import org.broadinstitute.hellbender.tools.genomicsdb.GenomicsDBOptions;
@@ -28,6 +31,7 @@
2831
import org.broadinstitute.hellbender.tools.walkers.mutect.M2ArgumentCollection;
2932
import org.broadinstitute.hellbender.utils.*;
3033
import org.broadinstitute.hellbender.utils.variant.GATKVariantContextUtils;
34+
import org.broadinstitute.hellbender.tools.walkers.annotator.allelespecific.ReducibleAnnotation;
3135

3236
import java.util.*;
3337
import java.util.stream.Collectors;
@@ -132,17 +136,17 @@ public final class GenotypeGVCFs extends VariantLocusWalker {
132136
doc = "LOD threshold to emit variant to VCF.")
133137
protected double tlodThreshold = 3.5; //allow for some lower quality variants
134138

135-
136139
/**
137140
* Margin of error in allele fraction to consider a somatic variant homoplasmic, i.e. if there is less than a 0.1% reference allele fraction, those reads are likely errors
138141
*/
139142
@Argument(fullName=CombineGVCFs.ALLELE_FRACTION_DELTA_LONG_NAME, doc = "Margin of error in allele fraction to consider a somatic variant homoplasmic")
140143
protected double afTolerance = 1e-3; //based on Q30 as a "good" base quality score
141144

142145
/**
143-
* If specified, keep the combined raw annotations (e.g. AS_SB_TABLE) after genotyping. This is applicable to Allele-Specific annotations
146+
* If specified, keep all the combined raw annotations (e.g. AS_SB_TABLE) after genotyping. This is applicable to Allele-Specific annotations. See {@link ReducibleAnnotation}
144147
*/
145-
@Argument(fullName=KEEP_COMBINED_LONG_NAME, shortName = KEEP_COMBINED_SHORT_NAME, doc = "If specified, keep the combined raw annotations")
148+
@Argument(fullName=KEEP_COMBINED_LONG_NAME, shortName = KEEP_COMBINED_SHORT_NAME, doc = "If specified, keep the combined raw annotations",
149+
mutex = {GenotypeGVCFsAnnotationArgumentCollection.KEEP_SPECIFIED_RAW_COMBINED_ANNOTATION_LONG_NAME})
146150
protected boolean keepCombined = false;
147151

148152
@ArgumentCollection
@@ -172,6 +176,9 @@ public final class GenotypeGVCFs extends VariantLocusWalker {
172176
@ArgumentCollection
173177
private final DbsnpArgumentCollection dbsnp = new DbsnpArgumentCollection();
174178

179+
// @ArgumentCollection deliberately omitted since this is passed to the annotation plugin
180+
final GenotypeGVCFsAnnotationArgumentCollection genotypeGVCFsAnnotationArgs = new GenotypeGVCFsAnnotationArgumentCollection();
181+
175182
// the annotation engine
176183
private VariantAnnotatorEngine annotationEngine;
177184

@@ -221,6 +228,16 @@ protected GenomicsDBOptions getGenomicsDBOptions() {
221228
@Override
222229
public boolean useVariantAnnotations() { return true;}
223230

231+
@Override
232+
public List<? extends CommandLinePluginDescriptor<?>> getPluginDescriptors() {
233+
GATKReadFilterPluginDescriptor readFilterDescriptor = new GATKReadFilterPluginDescriptor(getDefaultReadFilters());
234+
return useVariantAnnotations()?
235+
Arrays.asList(readFilterDescriptor, new GATKAnnotationPluginDescriptor(
236+
genotypeGVCFsAnnotationArgs,
237+
getDefaultVariantAnnotations(), getDefaultVariantAnnotationGroups())):
238+
Collections.singletonList(readFilterDescriptor);
239+
}
240+
224241
@Override
225242
public List<Class<? extends Annotation>> getDefaultVariantAnnotationGroups() {
226243
return Arrays.asList(StandardAnnotation.class);
@@ -261,8 +278,9 @@ public void onTraversalStart() {
261278
intervals = hasUserSuppliedIntervals() ? intervalArgumentCollection.getIntervals(getBestAvailableSequenceDictionary()) :
262279
Collections.emptyList();
263280

264-
Collection<Annotation> variantAnnotations = makeVariantAnnotations();
265-
annotationEngine = new VariantAnnotatorEngine(variantAnnotations, dbsnp.dbsnp, Collections.emptyList(), false, keepCombined);
281+
final Collection<Annotation> variantAnnotations = makeVariantAnnotations();
282+
final Set<Annotation> annotationsToKeep = getAnnotationsToKeep();
283+
annotationEngine = new VariantAnnotatorEngine(variantAnnotations, dbsnp.dbsnp, Collections.emptyList(), false, keepCombined, annotationsToKeep);
266284

267285
merger = new ReferenceConfidenceVariantContextMerger(annotationEngine, getHeaderForVariants(), somaticInput, false, true);
268286

@@ -279,6 +297,17 @@ public void onTraversalStart() {
279297

280298
}
281299

300+
private Set<Annotation> getAnnotationsToKeep() {
301+
final GATKAnnotationPluginDescriptor pluginDescriptor = getCommandLineParser().getPluginDescriptor(GATKAnnotationPluginDescriptor.class);
302+
final List<String> annotationStringsToKeep = genotypeGVCFsAnnotationArgs.getKeepSpecifiedCombinedAnnotationNames();
303+
final Map<String, Annotation> resolvedInstancesMap = pluginDescriptor.getResolvedInstancesMap();
304+
return annotationStringsToKeep.stream()
305+
.peek(s -> {Annotation a = resolvedInstancesMap.get(s); if (a == null)
306+
throw new UserException("Requested --" + GenotypeGVCFsAnnotationArgumentCollection.KEEP_SPECIFIED_RAW_COMBINED_ANNOTATION_LONG_NAME + ": " + s + " was not found in annotation list. Was it excluded with --" + StandardArgumentDefinitions.ANNOTATIONS_TO_EXCLUDE_LONG_NAME + " or not provided with --" + StandardArgumentDefinitions.ANNOTATION_LONG_NAME + "?"); })
307+
.map(resolvedInstancesMap::get)
308+
.collect(Collectors.toSet());
309+
}
310+
282311
@Override
283312
public void apply(final Locatable loc, List<VariantContext> variants, ReadsContext reads, ReferenceContext ref, FeatureContext features) {
284313

Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
package org.broadinstitute.hellbender.tools.walkers;
2+
3+
import org.broadinstitute.barclay.argparser.Argument;
4+
import org.broadinstitute.hellbender.cmdline.GATKPlugin.DefaultGATKVariantAnnotationArgumentCollection;
5+
import org.broadinstitute.hellbender.tools.walkers.annotator.allelespecific.ReducibleAnnotation;
6+
7+
import java.util.ArrayList;
8+
import java.util.Collections;
9+
import java.util.List;
10+
11+
public class GenotypeGVCFsAnnotationArgumentCollection extends DefaultGATKVariantAnnotationArgumentCollection {
12+
private static final long serialVersionUID = 1L;
13+
14+
public static final String KEEP_SPECIFIED_RAW_COMBINED_ANNOTATION_LONG_NAME = "keep-specific-combined-raw-annotation";
15+
public static final String KEEP_SPECIFIED_RAW_COMBINED_ANNOTATION_SHORT_NAME = "keep-specific-combined";
16+
17+
/**
18+
* Keep only the specific combined raw annotations specified. Cannot be used with --keep-combined-raw-annotations which saves all raw annotations.
19+
* Duplicate values will be ignored. See {@link ReducibleAnnotation} for more information on raw annotations.
20+
*/
21+
@Argument(fullName= KEEP_SPECIFIED_RAW_COMBINED_ANNOTATION_LONG_NAME, shortName = KEEP_SPECIFIED_RAW_COMBINED_ANNOTATION_SHORT_NAME, optional = true,
22+
mutex = {GenotypeGVCFs.KEEP_COMBINED_LONG_NAME},
23+
doc="Keep only the specific combined raw annotations specified (removing the other raw annotations). Duplicate values will be ignored.")
24+
protected List<String> keepSpecifiedCombined = new ArrayList<>();
25+
26+
public List<String> getKeepSpecifiedCombinedAnnotationNames() {return Collections.unmodifiableList(keepSpecifiedCombined);}
27+
}

src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/VariantAnnotatorEngine.java

+28-1
Original file line numberDiff line numberDiff line change
@@ -4,12 +4,14 @@
44
import htsjdk.variant.vcf.*;
55
import org.apache.logging.log4j.LogManager;
66
import org.apache.logging.log4j.Logger;
7+
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
78
import org.broadinstitute.hellbender.engine.FeatureContext;
89
import org.broadinstitute.hellbender.engine.FeatureDataSource;
910
import org.broadinstitute.hellbender.engine.FeatureInput;
1011
import org.broadinstitute.hellbender.engine.ReferenceContext;
1112
import org.broadinstitute.hellbender.exceptions.GATKException;
1213
import org.broadinstitute.hellbender.exceptions.UserException;
14+
import org.broadinstitute.hellbender.tools.walkers.GenotypeGVCFsAnnotationArgumentCollection;
1315
import org.broadinstitute.hellbender.tools.walkers.annotator.allelespecific.ReducibleAnnotation;
1416
import org.broadinstitute.hellbender.tools.walkers.annotator.allelespecific.ReducibleAnnotationData;
1517
import org.broadinstitute.hellbender.utils.Utils;
@@ -43,6 +45,7 @@ public final class VariantAnnotatorEngine {
4345
private boolean expressionAlleleConcordance;
4446
private final boolean useRawAnnotations;
4547
private final boolean keepRawCombinedAnnotations;
48+
private final List<String> rawAnnotationsToKeep;
4649

4750
private final static Logger logger = LogManager.getLogger(VariantAnnotatorEngine.class);
4851
private final static OneShotLogger jumboAnnotationsLogger = new OneShotLogger(VariantAnnotatorEngine.class);
@@ -59,17 +62,20 @@ public final class VariantAnnotatorEngine {
5962
* @param useRaw When this is set to true, the annotation engine will call {@link ReducibleAnnotation#annotateRawData(ReferenceContext, VariantContext, AlleleLikelihoods)}
6063
* on annotations that extend {@link ReducibleAnnotation}, instead of {@link InfoFieldAnnotation#annotate(ReferenceContext, VariantContext, AlleleLikelihoods)},
6164
* @param keepCombined If true, retain the combined raw annotation values instead of removing them after finalizing
65+
* @param rawAnnotationsToKeep List of raw annotations to keep even when others are removed
6266
*/
6367
public VariantAnnotatorEngine(final Collection<Annotation> annotationList,
6468
final FeatureInput<VariantContext> dbSNPInput,
6569
final List<FeatureInput<VariantContext>> featureInputs,
6670
final boolean useRaw,
67-
boolean keepCombined){
71+
final boolean keepCombined,
72+
final Collection<Annotation> rawAnnotationsToKeep){
6873
Utils.nonNull(featureInputs, "comparisonFeatureInputs is null");
6974
infoAnnotations = new ArrayList<>();
7075
genotypeAnnotations = new ArrayList<>();
7176
jumboInfoAnnotations = new ArrayList<>();
7277
jumboGenotypeAnnotations = new ArrayList<>();
78+
this.rawAnnotationsToKeep = new ArrayList<>();
7379
for (Annotation annot : annotationList) {
7480
if (annot instanceof InfoFieldAnnotation) {
7581
infoAnnotations.add((InfoFieldAnnotation) annot);
@@ -87,6 +93,9 @@ public VariantAnnotatorEngine(final Collection<Annotation> annotationList,
8793
reducibleKeys = new LinkedHashSet<>();
8894
useRawAnnotations = useRaw;
8995
keepRawCombinedAnnotations = keepCombined;
96+
for (final Annotation rawAnnot : rawAnnotationsToKeep) {
97+
this.rawAnnotationsToKeep.addAll(((VariantAnnotation) rawAnnot).getKeyNames());
98+
}
9099
for (InfoFieldAnnotation annot : infoAnnotations) {
91100
if (annot instanceof ReducibleAnnotation) {
92101
for (final String rawKey : ((ReducibleAnnotation) annot).getRawKeyNames()) {
@@ -96,6 +105,14 @@ public VariantAnnotatorEngine(final Collection<Annotation> annotationList,
96105
}
97106
}
98107

108+
/**
 * Convenience constructor for callers that do not name any raw annotations to keep: delegates to the
 * six-argument constructor, passing an empty collection as {@code rawAnnotationsToKeep}.
 *
 * @param annotationList annotations handed to the engine
 * @param dbSNPInput     forwarded unchanged to the six-argument constructor
 * @param featureInputs  forwarded unchanged to the six-argument constructor
 * @param useRaw         forwarded unchanged to the six-argument constructor
 * @param keepCombined   If true, retain the combined raw annotation values instead of removing them after finalizing
 */
public VariantAnnotatorEngine(final Collection<Annotation> annotationList,
                              final FeatureInput<VariantContext> dbSNPInput,
                              final List<FeatureInput<VariantContext>> featureInputs,
                              final boolean useRaw,
                              final boolean keepCombined) {
    this(annotationList, dbSNPInput, featureInputs, useRaw, keepCombined, Collections.<Annotation>emptyList());
}
115+
99116
private VariantOverlapAnnotator initializeOverlapAnnotator(final FeatureInput<VariantContext> dbSNPInput, final List<FeatureInput<VariantContext>> featureInputs) {
100117
final Map<FeatureInput<VariantContext>, String> overlaps = new LinkedHashMap<>();
101118
for ( final FeatureInput<VariantContext> fi : featureInputs) {
@@ -253,6 +270,14 @@ public Map<String, Object> combineAnnotations(final List<Allele> allelesList, Ma
253270
public VariantContext finalizeAnnotations(VariantContext vc, VariantContext originalVC) {
254271
final Map<String, Object> variantAnnotations = new LinkedHashMap<>(vc.getAttributes());
255272

273+
//save annotations that have been requested to be kept
274+
final Map<String, Object> savedRawAnnotations = new LinkedHashMap<>();
275+
for(final String rawAnnot : rawAnnotationsToKeep) {
276+
if (variantAnnotations.containsKey(rawAnnot)) {
277+
savedRawAnnotations.put(rawAnnot, variantAnnotations.get(rawAnnot));
278+
}
279+
}
280+
256281
// go through all the requested info annotationTypes
257282
for (final InfoFieldAnnotation annotationType : infoAnnotations) {
258283
if (annotationType instanceof ReducibleAnnotation) {
@@ -280,6 +305,8 @@ public VariantContext finalizeAnnotations(VariantContext vc, VariantContext orig
280305
variantAnnotations.remove(GATKVCFConstants.VARIANT_DEPTH_KEY);
281306
variantAnnotations.remove(GATKVCFConstants.RAW_GENOTYPE_COUNT_KEY);
282307
}
308+
//add back raw annotations that have specifically been requested to keep
309+
variantAnnotations.putAll(savedRawAnnotations);
283310

284311
// generate a new annotated VC
285312
final VariantContextBuilder builder = new VariantContextBuilder(vc).attributes(variantAnnotations);

src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/allelespecific/ReducibleAnnotation.java

+3-3
Original file line numberDiff line numberDiff line change
@@ -3,21 +3,21 @@
33
import htsjdk.variant.variantcontext.Allele;
44
import htsjdk.variant.variantcontext.VariantContext;
55
import htsjdk.variant.vcf.VCFCompoundHeaderLine;
6-
import htsjdk.variant.vcf.VCFHeaderLine;
7-
import htsjdk.variant.vcf.VCFInfoHeaderLine;
86
import org.broadinstitute.hellbender.engine.ReferenceContext;
97
import org.broadinstitute.hellbender.tools.walkers.annotator.Annotation;
108
import org.broadinstitute.hellbender.utils.genotyper.AlleleLikelihoods;
119
import org.broadinstitute.hellbender.utils.read.GATKRead;
1210
import org.broadinstitute.hellbender.utils.variant.GATKVCFHeaderLines;
1311

1412
import java.util.ArrayList;
15-
import java.util.Arrays;
1613
import java.util.List;
1714
import java.util.Map;
1815

1916
/**
2017
* An interface for annotations that are calculated using raw data across samples, rather than the median (or median of medians) of sample values.
18+
* The Raw annotation keeps some summary (one example might be a histogram of the raw values for each sample) of the individual sample (or allele)
19+
* level annotation. As the annotations are combined across multiple samples, the raw annotation continues to contain individual values while
20+
* the final reduced annotation will typically be a summary statistic from these raw values.
2121
*
2222
*/
2323
public interface ReducibleAnnotation extends Annotation {

src/main/java/org/broadinstitute/hellbender/utils/variant/GATKVCFHeaderLines.java

+1-1
Original file line numberDiff line numberDiff line change
@@ -159,7 +159,7 @@ public static VCFFormatHeaderLine getEquivalentFormatHeaderLine(final String inf
159159
addInfoLine(new VCFInfoHeaderLine(INBREEDING_COEFFICIENT_KEY, 1, VCFHeaderLineType.Float, "Inbreeding coefficient as estimated from the genotype likelihoods per-sample when compared against the Hardy-Weinberg expectation"));
160160
addInfoLine(new VCFInfoHeaderLine(AS_INBREEDING_COEFFICIENT_KEY, VCFHeaderLineCount.A, VCFHeaderLineType.Float, "Allele-specific inbreeding coefficient as estimated from the genotype likelihoods per-sample when compared against the Hardy-Weinberg expectation"));
161161
addInfoLine(new VCFInfoHeaderLine(EXCESS_HET_KEY, 1, VCFHeaderLineType.Float, "Phred-scaled p-value for exact test of excess heterozygosity"));
162-
addInfoLine(new VCFInfoHeaderLine(RAW_GENOTYPE_COUNT_KEY, 3, VCFHeaderLineType.Integer, "Counts of genotypes w.r.t. the reference allele: 0/0, 0/*, */*, i.e. all alts lumped together; for use in calculating excess heterozygosity"));
162+
addInfoLine(new VCFInfoHeaderLine(RAW_GENOTYPE_COUNT_KEY, 3, VCFHeaderLineType.Integer, "Counts of genotypes w.r.t. the reference allele in the following order: 0/0, 0/*, */*, i.e. all alts lumped together; for use in calculating excess heterozygosity"));
163163
addInfoLine(new VCFInfoHeaderLine(LIKELIHOOD_RANK_SUM_KEY, 1, VCFHeaderLineType.Float, "Z-score from Wilcoxon rank sum test of Alt Vs. Ref haplotype likelihoods"));
164164
addInfoLine(new VCFInfoHeaderLine(MAP_QUAL_RANK_SUM_KEY, 1, VCFHeaderLineType.Float, "Z-score From Wilcoxon rank sum test of Alt vs. Ref read mapping qualities"));
165165
addInfoLine(new VCFInfoHeaderLine(AS_MAP_QUAL_RANK_SUM_KEY, VCFHeaderLineCount.A, VCFHeaderLineType.Float, "allele specific Z-score From Wilcoxon rank sum test of each Alt vs. Ref read mapping qualities"));

0 commit comments

Comments
 (0)