Skip to content

Commit b41d8af

Browse files
authored
VCF Data Sources now preserve FILTER field (#5598)
* Added in code to include FILTER values in VCF annotations. * Updated test data for updates to VCF data sources. * Adding in a specific test that the FILTER field exists.
1 parent 52c1b34 commit b41d8af

26 files changed

+4348
-3811
lines changed

src/main/java/org/broadinstitute/hellbender/tools/funcotator/dataSources/vcf/VcfFuncotationFactory.java

+22
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,12 @@ public class VcfFuncotationFactory extends DataSourceFuncotationFactory {
8787
*/
8888
private static final String ID_FIELD_NAME = "ID";
8989

90+
/**
91+
* The name of the additional FILTER status field to add to VCF annotations to preserve the FILTER status of the original (data source)
92+
* variant.
93+
*/
94+
private static final String FILTER_FIELD_NAME = "FILTER";
95+
9096
@VisibleForTesting
9197
int cacheHits = 0;
9298
@VisibleForTesting
@@ -186,6 +192,15 @@ List<VCFInfoHeaderLine> createFuncotationVcfInfoHeaderLines(final VCFHeader vcfH
186192
);
187193
supportedVcfInfoHeaderLines.add( idHeaderLine );
188194

195+
// Add in the FILTER field to the meta data:
196+
final VCFInfoHeaderLine filterHeaderLine = new VCFInfoHeaderLine(
197+
createFinalFieldName(name, FILTER_FIELD_NAME),
198+
1,
199+
VCFHeaderLineType.String,
200+
"FILTER status of the variant from the data source creating this annotation."
201+
);
202+
supportedVcfInfoHeaderLines.add( filterHeaderLine );
203+
189204
// Make sure to rename the input VCF field names to the output funcotation field names for this funcotation factory.
190205
return supportedVcfInfoHeaderLines;
191206
}
@@ -288,6 +303,9 @@ protected List<Funcotation> createFuncotationsOnVariant(final VariantContext var
288303
// Add the ID of the variant:
289304
annotations.put(createFinalFieldName(name, ID_FIELD_NAME), featureVariant.getID());
290305

306+
// Add the FILTER status of the variant:
307+
annotations.put(createFinalFieldName(name, FILTER_FIELD_NAME), featureVariant.getFilters().stream().collect(Collectors.joining(";")));
308+
291309
final TableFuncotation newFuncotation = TableFuncotation.create(annotations, queryAltAllele, name, supportedFieldMetadata);
292310
outputOrderedMap.merge(queryAltAllele, newFuncotation, VcfFuncotationFactory::mergeDuplicateFuncotationFactoryVariant);
293311
}
@@ -481,6 +499,10 @@ private void populateSupportedFieldNamesFromVcfFile() {
481499
// Add our ID to the supported fields:
482500
supportedFieldNamesAndDefaults.put(createFinalFieldName(name, ID_FIELD_NAME), "" );
483501
supportedFieldNames.add(createFinalFieldName(name, ID_FIELD_NAME));
502+
503+
// Add our FILTER status to the supported fields:
504+
supportedFieldNamesAndDefaults.put(createFinalFieldName(name, FILTER_FIELD_NAME), "" );
505+
supportedFieldNames.add(createFinalFieldName(name, FILTER_FIELD_NAME));
484506
}
485507

486508
@VisibleForTesting

src/test/java/org/broadinstitute/hellbender/tools/funcotator/FuncotatorIntegrationTest.java

+38
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,10 @@ public class FuncotatorIntegrationTest extends CommandLineProgramTest {
6060
private static final String GERMLINE_DATASOURCES_FOLDER = "funcotator_dataSources_germline_latest";
6161

6262
private static final String XSV_CLINVAR_MULTIHIT_TEST_VCF = toolsTestDir + "funcotator" + File.separator + "clinvar_hg19_multihit_test.vcf";
63+
private static final String FILTER_TEST_VCF = toolsTestDir + "funcotator" + File.separator + "FILTER_test.vcf";
6364
private static final String DS_XSV_CLINVAR_TESTS = largeFileTestDir + "funcotator" + File.separator + "small_ds_clinvar_hg19" + File.separator;
65+
private static final String DS_FILTER_PARSE_TESTS = largeFileTestDir + "funcotator" + File.separator + "small_ds_FILTER_test" + File.separator;
66+
6467
private static final String NOT_M2_TEST_HG19 = toolsTestDir + "funcotator/NotM2_test_custom_maf_fields.vcf";
6568
private static final String M2_TEST_HG19 = toolsTestDir + "funcotator/M2_test_custom_maf_fields.vcf";
6669
private static final String NOT_M2_TEST_HG19_TUMOR_ONLY = toolsTestDir + "funcotator/NotM2_test_custom_maf_fields_tumor_only.vcf";
@@ -903,6 +906,41 @@ public void testCanAnnotateHg38ClinvarAndGencodeV28() {
903906
.count(), NUM_CLINVAR_HITS);
904907
}
905908

909+
@Test
910+
public void testFilterParsing() {
911+
912+
final File outputFile = createTempFile("tmpTestFilterParsing", "vcf");
913+
914+
final ArgumentsBuilder arguments = createBaselineArgumentsForFuncotator(
915+
FILTER_TEST_VCF,
916+
outputFile,
917+
b37Reference,
918+
DS_FILTER_PARSE_TESTS,
919+
FuncotatorTestConstants.REFERENCE_VERSION_HG19,
920+
FuncotatorArgumentDefinitions.OutputFormatType.VCF,
921+
false);
922+
923+
arguments.addBooleanArgument(FuncotatorArgumentDefinitions.FORCE_B37_TO_HG19_REFERENCE_CONTIG_CONVERSION, true);
924+
925+
runCommandLine(arguments);
926+
927+
final Pair<VCFHeader, List<VariantContext>> tempVcf = VariantContextTestUtils.readEntireVCFIntoMemory(outputFile.getAbsolutePath());
928+
Assert.assertEquals( tempVcf.getRight().size(), 1 );
929+
930+
final String[] funcotatorKeys = FuncotatorUtils.extractFuncotatorKeysFromHeaderDescription(tempVcf.getLeft().getInfoHeaderLine(VcfOutputRenderer.FUNCOTATOR_VCF_FIELD_NAME).getDescription());
931+
932+
final VariantContext variantContext = tempVcf.getRight().get(0);
933+
final Map<Allele, FuncotationMap> funcs = FuncotatorUtils.createAlleleToFuncotationMapFromFuncotationVcfAttribute(
934+
funcotatorKeys, variantContext, "Gencode_19_annotationTranscript", "FAKE_SOURCE");
935+
936+
final String txId = funcs.get(variantContext.getAlternateAllele(0)).getTranscriptList().get(0);
937+
Assert.assertEquals( funcs.get(variantContext.getAlternateAllele(0)).get(txId).size(), 1 );
938+
939+
final Funcotation funcotation = funcs.get(variantContext.getAlternateAllele(0)).get(txId).get(0);
940+
941+
Assert.assertEquals(funcotation.getField("dbSnp_FILTER"), "FILTER_8");
942+
}
943+
906944
@Test
907945
public void testExclusionFromDatasourceVcfToVcf() {
908946
// Clinvar datasource did go through one round of preprocessing to make contig names "1" --> "chr1" (for example). This is an issue with ClinVar, not GATK.

src/test/java/org/broadinstitute/hellbender/tools/funcotator/FuncotatorTestConstants.java

+1
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,7 @@ public class FuncotatorTestConstants {
7171
public static final String COSMIC_TEST_DB = FUNCOTATOR_DATA_SOURCES_MAIN_FOLDER + "cosmic" + File.separator + "hg19" + File.separator + "CosmicTest.db";
7272

7373
public static final String DBSNP_HG19_SNIPPET_FILE_PATH = FUNCOTATOR_TEST_DIR + "dbSNP_hg19_snippet.vcf";
74+
public static final String DBSNP_HG19_SNIPPET_WITH_FILTERS_FILE_PATH = FUNCOTATOR_TEST_DIR + "dbSNP_hg19_snippet_with_filters.vcf";
7475

7576
public static final String GENCODE_TRANSCRIPT_FASTA_FILE_NAME = FUNCOTATOR_LARGE_FILES_DIR + "gencode.v19.pc_transcripts.fasta";
7677

src/test/java/org/broadinstitute/hellbender/tools/funcotator/dataSources/vcf/VcfFuncotationFactoryUnitTest.java

+33-20
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,6 @@
2828
import org.testng.annotations.DataProvider;
2929
import org.testng.annotations.Test;
3030

31-
import java.io.File;
3231
import java.nio.file.Path;
3332
import java.nio.file.Paths;
3433
import java.util.*;
@@ -48,11 +47,10 @@ public class VcfFuncotationFactoryUnitTest extends GATKBaseTest {
4847
private static final String FACTORY_VERSION = "TEST_VERSION";
4948
private static final String EXAC_SNIPPET = toolsTestDir + "funcotator/test_exac.vcf";
5049

50+
private static final String DEFAULT_FILTER_STRING = "TODAY;A;Variant;Was;FILTERED";
51+
5152
//==================================================================================================================
5253
// Private Members:
53-
54-
private static final ReferenceDataSource CHR3_REF_DATA_SOURCE = ReferenceDataSource.of(new File(FuncotatorReferenceTestUtils.retrieveHg19Chr3Ref()).toPath());
55-
5654
private static final LinkedHashMap<String, Object> FIELD_DEFAULT_MAP = new LinkedHashMap<>();
5755

5856
static {
@@ -105,22 +103,25 @@ public class VcfFuncotationFactoryUnitTest extends GATKBaseTest {
105103
FIELD_DEFAULT_MAP.put("WTD", "false");
106104
FIELD_DEFAULT_MAP.put("dbSNPBuildID", "");
107105
FIELD_DEFAULT_MAP.put("ID", "");
106+
FIELD_DEFAULT_MAP.put("FILTER", "");
108107
}
109108

110109
//==================================================================================================================
111110
// Helper Methods:
112111

113112

114113

115-
private Object[] helpProvideForTestCreateFuncotations(final String contig,
114+
private Object[] helpProvideForTestCreateFuncotations(final String variantFeatureFileName,
115+
final String contig,
116116
final int start,
117117
final int end,
118118
final String refAlleleString,
119119
final String altAlleleString,
120120
final List<Funcotation> expected) {
121121
return new Object[]{
122+
variantFeatureFileName,
122123
FuncotatorTestUtils.createSimpleVariantContext(FuncotatorReferenceTestUtils.retrieveHg19Chr3Ref(), contig, start, end, refAlleleString, altAlleleString),
123-
new ReferenceContext(CHR3_REF_DATA_SOURCE, new SimpleInterval(contig, start, end)),
124+
new ReferenceContext(ReferenceDataSource.of(IOUtils.getPath(b37Reference)), new SimpleInterval(contig, start, end)),
124125
expected
125126
};
126127
}
@@ -145,27 +146,39 @@ private Object[][] provideForTestCreateFuncotationsOnVariant() {
145146

146147
return new Object[][]{
147148
// Trivial Case: No overlapping features:
148-
helpProvideForTestCreateFuncotations("3", 61650, 61650, "T", "C",
149+
helpProvideForTestCreateFuncotations(FuncotatorTestConstants.DBSNP_HG19_SNIPPET_FILE_PATH,
150+
"3", 61650, 61650, "T", "C",
149151
Collections.singletonList(
150152
TableFuncotation.create(FIELD_DEFAULT_MAP.keySet().stream().map(s -> FACTORY_NAME + "_" + s).collect(Collectors.toList()),
151153
FIELD_DEFAULT_MAP.values().stream().map(Object::toString).collect(Collectors.toList()),
152154
Allele.create("C"), FACTORY_NAME, null)
153155
)
154156
),
155157
// One overlapping VCF feature:
156-
helpProvideForTestCreateFuncotations("3", 61662, 61662, "T", "C",
158+
helpProvideForTestCreateFuncotations(FuncotatorTestConstants.DBSNP_HG19_SNIPPET_FILE_PATH,
159+
"3", 61662, 61662, "T", "C",
157160
Collections.singletonList(
158161
TableFuncotation.create(FIELD_DEFAULT_MAP.keySet().stream().map(s -> FACTORY_NAME + "_" + s).collect(Collectors.toList()),
159-
Arrays.asList("true", "false", "0.9744,0.02556", "false", "false", "1", "false", "true", "false", "", "false", "true", "false", "true", "true", "false", "false", "false", "false", "false", "false", "false", "false", "false", "false", "false", "false", "false", "false", "false", "73009205", "61662", "false", "false", "0", "true", "0", "false", "0.954392,0.0456075", "false", "false", "false", "SNV", "true", "0x05010000000515043e000100", "1", "false", "130", "rs73009205"),
162+
Arrays.asList("true", "false", "0.9744,0.02556", "false", "false", "1", "false", "true", "false", "", "false", "true", "false", "true", "true", "false", "false", "false", "false", "false", "false", "false", "false", "false", "false", "false", "false", "false", "false", "false", "73009205", "61662", "false", "false", "0", "true", "0", "false", "0.954392,0.0456075", "false", "false", "false", "SNV", "true", "0x05010000000515043e000100", "1", "false", "130", "rs73009205", ""),
160163
Allele.create("C"), FACTORY_NAME, null)
161164
)
162165
),
163166
// No matching VCF features (three overlap by position only), since there are no indels in dbSNP (the test datasource), so the ground truth should be a default entry, which was constructed here manually:
164-
helpProvideForTestCreateFuncotations("3", 64157, 64166, "AGAAAGGTCA", "TCTTTCCAGT",
167+
helpProvideForTestCreateFuncotations(FuncotatorTestConstants.DBSNP_HG19_SNIPPET_FILE_PATH,
168+
"3", 64157, 64166, "AGAAAGGTCA", "TCTTTCCAGT",
165169
Collections.singletonList(TableFuncotation.create(FIELD_DEFAULT_MAP.keySet().stream().map(s -> FACTORY_NAME + "_" + s).collect(Collectors.toList()),
166-
Arrays.asList("false", "false", "", "false", "false", "", "false", "false", "false", "", "false", "false", "false", "false", "false", "false", "false", "false", "false", "false", "false", "false", "false", "false", "false", "false", "false", "false", "false", "false", "", "", "false", "false", "", "false", "", "false", "", "false", "false", "false", "", "false", "", "", "false", "", ""),
170+
Arrays.asList("false", "false", "", "false", "false", "", "false", "false", "false", "", "false", "false", "false", "false", "false", "false", "false", "false", "false", "false", "false", "false", "false", "false", "false", "false", "false", "false", "false", "false", "", "", "false", "false", "", "false", "", "false", "", "false", "false", "false", "", "false", "", "", "false", "", "", ""),
167171
Allele.create("TCTTTCCAGT"), FACTORY_NAME, null))
168172
),
173+
// One overlapping VCF feature, non-empty FILTER field:
174+
helpProvideForTestCreateFuncotations(FuncotatorTestConstants.DBSNP_HG19_SNIPPET_WITH_FILTERS_FILE_PATH,
175+
"3", 61662, 61662, "T", "C",
176+
Collections.singletonList(
177+
TableFuncotation.create(FIELD_DEFAULT_MAP.keySet().stream().map(s -> FACTORY_NAME + "_" + s).collect(Collectors.toList()),
178+
Arrays.asList("true", "false", "0.9744,0.02556", "false", "false", "1", "false", "true", "false", "", "false", "true", "false", "true", "true", "false", "false", "false", "false", "false", "false", "false", "false", "false", "false", "false", "false", "false", "false", "false", "73009205", "61662", "false", "false", "0", "true", "0", "false", "0.954392,0.0456075", "false", "false", "false", "SNV", "true", "0x05010000000515043e000100", "1", "false", "130", "rs73009205", "FILTER_73"),
179+
Allele.create("C"), FACTORY_NAME, null)
180+
)
181+
),
169182
};
170183
}
171184

@@ -205,17 +218,17 @@ public void testGetSupportedFuncotationFields() {
205218
}
206219

207220
@Test(dataProvider = "provideForTestCreateFuncotationsOnVariant")
208-
public void testCreateFuncotationsOnVariant(final VariantContext variant,
221+
public void testCreateFuncotationsOnVariant(final String variantFeatureDataFileName,
222+
final VariantContext variant,
209223
final ReferenceContext referenceContext,
210224
final List<Funcotation> expected) {
211-
212225
// Make our factory:
213226
final VcfFuncotationFactory vcfFuncotationFactory =
214-
createVcfFuncotationFactory(FACTORY_NAME, FACTORY_VERSION, IOUtils.getPath(FuncotatorTestConstants.DBSNP_HG19_SNIPPET_FILE_PATH));
227+
createVcfFuncotationFactory(FACTORY_NAME, FACTORY_VERSION, IOUtils.getPath(variantFeatureDataFileName));
215228

216229
// Create features from the file:
217230
final List<Feature> vcfFeatures;
218-
try (final VCFFileReader vcfReader = new VCFFileReader(IOUtils.getPath(FuncotatorTestConstants.DBSNP_HG19_SNIPPET_FILE_PATH))) {
231+
try (final VCFFileReader vcfReader = new VCFFileReader(IOUtils.getPath(variantFeatureDataFileName))) {
219232
vcfFeatures = vcfReader.query(variant.getContig(), variant.getStart(), variant.getEnd()).stream().collect(Collectors.toList());
220233
}
221234

@@ -238,21 +251,21 @@ public void testCreateFuncotationsOnVariant(final VariantContext variant,
238251
),
239252
expected
240253
);
241-
242254
}
243255

244256
@Test(dataProvider = "provideForTestCreateFuncotationsOnVariant")
245-
public void testCreateFuncotationMetadata(final VariantContext variant,
257+
public void testCreateFuncotationMetadata(final String variantFeatureDataFileName,
258+
final VariantContext variant,
246259
final ReferenceContext referenceContext,
247260
final List<Funcotation> expected) {
248261
// Don't need the expected gt for this test, but useful to reuse the data provider.
249262
// Make our factory:
250263
final VcfFuncotationFactory vcfFuncotationFactory =
251-
createVcfFuncotationFactory(FACTORY_NAME, FACTORY_VERSION, IOUtils.getPath(FuncotatorTestConstants.DBSNP_HG19_SNIPPET_FILE_PATH));
264+
createVcfFuncotationFactory(FACTORY_NAME, FACTORY_VERSION, IOUtils.getPath(variantFeatureDataFileName));
252265

253266
// Create features from the file:
254267
final List<Feature> vcfFeatures;
255-
try (final VCFFileReader vcfReader = new VCFFileReader(IOUtils.getPath(FuncotatorTestConstants.DBSNP_HG19_SNIPPET_FILE_PATH))) {
268+
try (final VCFFileReader vcfReader = new VCFFileReader(IOUtils.getPath(variantFeatureDataFileName))) {
256269
vcfFeatures = vcfReader.query(variant.getContig(), variant.getStart(), variant.getEnd()).stream().collect(Collectors.toList());
257270
}
258271

@@ -265,7 +278,7 @@ public void testCreateFuncotationMetadata(final VariantContext variant,
265278
);
266279

267280
Assert.assertEquals(funcotations.stream().map(f -> f.getMetadata().retrieveAllHeaderInfo()).collect(Collectors.toSet()).size(), 1);
268-
final Pair<VCFHeader, List<VariantContext>> vcfInfo = VariantContextTestUtils.readEntireVCFIntoMemory(FuncotatorTestConstants.DBSNP_HG19_SNIPPET_FILE_PATH);
281+
final Pair<VCFHeader, List<VariantContext>> vcfInfo = VariantContextTestUtils.readEntireVCFIntoMemory(variantFeatureDataFileName);
269282
final List<VCFInfoHeaderLine> gtOutputVcfInfoHeaderLines = vcfFuncotationFactory.createFuncotationVcfInfoHeaderLines(vcfInfo.getLeft());
270283

271284
// Get the info headers that are in the VCF and make sure that these are also present in the metadata

0 commit comments

Comments
 (0)