Skip to content

VCF Data Sources now preserve FILTER field #5598

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Jan 24, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,12 @@ public class VcfFuncotationFactory extends DataSourceFuncotationFactory {
*/
private static final String ID_FIELD_NAME = "ID";

/**
* The name of the additional FILTER status field to add to VCF annotations to preserve the FILTER status of the original (data source)
* variant.
*/
private static final String FILTER_FIELD_NAME = "FILTER";
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Isn't there a VCFConstants constant for this?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not that I can see. There's one that's close, but it contains the leading "##" header-comment characters, which won't work here:

public static final String FILTER_HEADER_START = "##FILTER";


@VisibleForTesting
int cacheHits = 0;
@VisibleForTesting
Expand Down Expand Up @@ -186,6 +192,15 @@ List<VCFInfoHeaderLine> createFuncotationVcfInfoHeaderLines(final VCFHeader vcfH
);
supportedVcfInfoHeaderLines.add( idHeaderLine );

// Add in the FILTER field to the meta data:
final VCFInfoHeaderLine filterHeaderLine = new VCFInfoHeaderLine(
createFinalFieldName(name, FILTER_FIELD_NAME),
1,
VCFHeaderLineType.String,
"FILTER status of the variant from the data source creating this annotation."
);
supportedVcfInfoHeaderLines.add( filterHeaderLine );

// Make sure to rename the input VCF field names to the output funcotation field names for this funcotation factory.
return supportedVcfInfoHeaderLines;
}
Expand Down Expand Up @@ -288,6 +303,9 @@ protected List<Funcotation> createFuncotationsOnVariant(final VariantContext var
// Add the ID of the variant:
annotations.put(createFinalFieldName(name, ID_FIELD_NAME), featureVariant.getID());

// Add the FILTER status of the variant:
annotations.put(createFinalFieldName(name, FILTER_FIELD_NAME), featureVariant.getFilters().stream().collect(Collectors.joining(";")));

final TableFuncotation newFuncotation = TableFuncotation.create(annotations, queryAltAllele, name, supportedFieldMetadata);
outputOrderedMap.merge(queryAltAllele, newFuncotation, VcfFuncotationFactory::mergeDuplicateFuncotationFactoryVariant);
}
Expand Down Expand Up @@ -481,6 +499,10 @@ private void populateSupportedFieldNamesFromVcfFile() {
// Add our ID to the supported fields:
supportedFieldNamesAndDefaults.put(createFinalFieldName(name, ID_FIELD_NAME), "" );
supportedFieldNames.add(createFinalFieldName(name, ID_FIELD_NAME));

// Add our FILTER status to the supported fields:
supportedFieldNamesAndDefaults.put(createFinalFieldName(name, FILTER_FIELD_NAME), "" );
supportedFieldNames.add(createFinalFieldName(name, FILTER_FIELD_NAME));
}

@VisibleForTesting
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,10 @@ public class FuncotatorIntegrationTest extends CommandLineProgramTest {
private static final String GERMLINE_DATASOURCES_FOLDER = "funcotator_dataSources_germline_latest";

private static final String XSV_CLINVAR_MULTIHIT_TEST_VCF = toolsTestDir + "funcotator" + File.separator + "clinvar_hg19_multihit_test.vcf";
private static final String FILTER_TEST_VCF = toolsTestDir + "funcotator" + File.separator + "FILTER_test.vcf";
private static final String DS_XSV_CLINVAR_TESTS = largeFileTestDir + "funcotator" + File.separator + "small_ds_clinvar_hg19" + File.separator;
private static final String DS_FILTER_PARSE_TESTS = largeFileTestDir + "funcotator" + File.separator + "small_ds_FILTER_test" + File.separator;

private static final String NOT_M2_TEST_HG19 = toolsTestDir + "funcotator/NotM2_test_custom_maf_fields.vcf";
private static final String M2_TEST_HG19 = toolsTestDir + "funcotator/M2_test_custom_maf_fields.vcf";
private static final String NOT_M2_TEST_HG19_TUMOR_ONLY = toolsTestDir + "funcotator/NotM2_test_custom_maf_fields_tumor_only.vcf";
Expand Down Expand Up @@ -903,6 +906,41 @@ public void testCanAnnotateHg38ClinvarAndGencodeV28() {
.count(), NUM_CLINVAR_HITS);
}

/**
 * Runs Funcotator on {@code FILTER_TEST_VCF} against the {@code DS_FILTER_PARSE_TESTS} data sources
 * (VCF output, b37 reference with forced b37 -> hg19 contig conversion) and checks that the single
 * annotated variant carries the expected {@code dbSnp_FILTER} funcotation value.
 */
@Test
public void testFilterParsing() {

    // Destination for the annotated output:
    final File annotatedVcfFile = createTempFile("tmpTestFilterParsing", "vcf");

    // Build and run the Funcotator command line:
    final ArgumentsBuilder funcotatorArgs = createBaselineArgumentsForFuncotator(
            FILTER_TEST_VCF,
            annotatedVcfFile,
            b37Reference,
            DS_FILTER_PARSE_TESTS,
            FuncotatorTestConstants.REFERENCE_VERSION_HG19,
            FuncotatorArgumentDefinitions.OutputFormatType.VCF,
            false);
    funcotatorArgs.addBooleanArgument(FuncotatorArgumentDefinitions.FORCE_B37_TO_HG19_REFERENCE_CONTIG_CONVERSION, true);
    runCommandLine(funcotatorArgs);

    // Read the annotated output back in; exactly one variant is expected:
    final Pair<VCFHeader, List<VariantContext>> annotatedVcf =
            VariantContextTestUtils.readEntireVCFIntoMemory(annotatedVcfFile.getAbsolutePath());
    Assert.assertEquals( annotatedVcf.getRight().size(), 1 );

    // The funcotation field names are encoded in the description of the Funcotator INFO header line:
    final String funcotatorHeaderDescription =
            annotatedVcf.getLeft().getInfoHeaderLine(VcfOutputRenderer.FUNCOTATOR_VCF_FIELD_NAME).getDescription();
    final String[] funcotationFieldNames =
            FuncotatorUtils.extractFuncotatorKeysFromHeaderDescription(funcotatorHeaderDescription);

    // Rebuild the per-allele funcotation maps from the annotated variant:
    final VariantContext annotatedVariant = annotatedVcf.getRight().get(0);
    final Map<Allele, FuncotationMap> funcotationsByAllele =
            FuncotatorUtils.createAlleleToFuncotationMapFromFuncotationVcfAttribute(
                    funcotationFieldNames, annotatedVariant, "Gencode_19_annotationTranscript", "FAKE_SOURCE");

    // Only the first alternate allele and its first transcript are inspected:
    final Allele firstAltAllele = annotatedVariant.getAlternateAllele(0);
    final FuncotationMap altFuncotationMap = funcotationsByAllele.get(firstAltAllele);
    final String firstTranscriptId = altFuncotationMap.getTranscriptList().get(0);
    Assert.assertEquals( altFuncotationMap.get(firstTranscriptId).size(), 1 );

    // The data source's FILTER status must have been carried through to the funcotation:
    final Funcotation soleFuncotation = altFuncotationMap.get(firstTranscriptId).get(0);
    Assert.assertEquals(soleFuncotation.getField("dbSnp_FILTER"), "FILTER_8");
}

@Test
public void testExclusionFromDatasourceVcfToVcf() {
// Clinvar datasource did go through one round of preprocessing to make contig names "1" --> "chr1" (for example). This is an issue with ClinVar, not GATK.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@ public class FuncotatorTestConstants {
public static final String COSMIC_TEST_DB = FUNCOTATOR_DATA_SOURCES_MAIN_FOLDER + "cosmic" + File.separator + "hg19" + File.separator + "CosmicTest.db";

public static final String DBSNP_HG19_SNIPPET_FILE_PATH = FUNCOTATOR_TEST_DIR + "dbSNP_hg19_snippet.vcf";
public static final String DBSNP_HG19_SNIPPET_WITH_FILTERS_FILE_PATH = FUNCOTATOR_TEST_DIR + "dbSNP_hg19_snippet_with_filters.vcf";

public static final String GENCODE_TRANSCRIPT_FASTA_FILE_NAME = FUNCOTATOR_LARGE_FILES_DIR + "gencode.v19.pc_transcripts.fasta";

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,6 @@
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;

import java.io.File;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.*;
Expand All @@ -48,11 +47,10 @@ public class VcfFuncotationFactoryUnitTest extends GATKBaseTest {
private static final String FACTORY_VERSION = "TEST_VERSION";
private static final String EXAC_SNIPPET = toolsTestDir + "funcotator/test_exac.vcf";

private static final String DEFAULT_FILTER_STRING = "TODAY;A;Variant;Was;FILTERED";

//==================================================================================================================
// Private Members:

private static final ReferenceDataSource CHR3_REF_DATA_SOURCE = ReferenceDataSource.of(new File(FuncotatorReferenceTestUtils.retrieveHg19Chr3Ref()).toPath());

private static final LinkedHashMap<String, Object> FIELD_DEFAULT_MAP = new LinkedHashMap<>();

static {
Expand Down Expand Up @@ -105,22 +103,25 @@ public class VcfFuncotationFactoryUnitTest extends GATKBaseTest {
FIELD_DEFAULT_MAP.put("WTD", "false");
FIELD_DEFAULT_MAP.put("dbSNPBuildID", "");
FIELD_DEFAULT_MAP.put("ID", "");
FIELD_DEFAULT_MAP.put("FILTER", "");
}

//==================================================================================================================
// Helper Methods:



private Object[] helpProvideForTestCreateFuncotations(final String contig,
private Object[] helpProvideForTestCreateFuncotations(final String variantFeatureFileName,
final String contig,
final int start,
final int end,
final String refAlleleString,
final String altAlleleString,
final List<Funcotation> expected) {
return new Object[]{
variantFeatureFileName,
FuncotatorTestUtils.createSimpleVariantContext(FuncotatorReferenceTestUtils.retrieveHg19Chr3Ref(), contig, start, end, refAlleleString, altAlleleString),
new ReferenceContext(CHR3_REF_DATA_SOURCE, new SimpleInterval(contig, start, end)),
new ReferenceContext(ReferenceDataSource.of(IOUtils.getPath(b37Reference)), new SimpleInterval(contig, start, end)),
expected
};
}
Expand All @@ -145,27 +146,39 @@ private Object[][] provideForTestCreateFuncotationsOnVariant() {

return new Object[][]{
// Trivial Case: No overlapping features:
helpProvideForTestCreateFuncotations("3", 61650, 61650, "T", "C",
helpProvideForTestCreateFuncotations(FuncotatorTestConstants.DBSNP_HG19_SNIPPET_FILE_PATH,
"3", 61650, 61650, "T", "C",
Collections.singletonList(
TableFuncotation.create(FIELD_DEFAULT_MAP.keySet().stream().map(s -> FACTORY_NAME + "_" + s).collect(Collectors.toList()),
FIELD_DEFAULT_MAP.values().stream().map(Object::toString).collect(Collectors.toList()),
Allele.create("C"), FACTORY_NAME, null)
)
),
// One overlapping VCF feature:
helpProvideForTestCreateFuncotations("3", 61662, 61662, "T", "C",
helpProvideForTestCreateFuncotations(FuncotatorTestConstants.DBSNP_HG19_SNIPPET_FILE_PATH,
"3", 61662, 61662, "T", "C",
Collections.singletonList(
TableFuncotation.create(FIELD_DEFAULT_MAP.keySet().stream().map(s -> FACTORY_NAME + "_" + s).collect(Collectors.toList()),
Arrays.asList("true", "false", "0.9744,0.02556", "false", "false", "1", "false", "true", "false", "", "false", "true", "false", "true", "true", "false", "false", "false", "false", "false", "false", "false", "false", "false", "false", "false", "false", "false", "false", "false", "73009205", "61662", "false", "false", "0", "true", "0", "false", "0.954392,0.0456075", "false", "false", "false", "SNV", "true", "0x05010000000515043e000100", "1", "false", "130", "rs73009205"),
Arrays.asList("true", "false", "0.9744,0.02556", "false", "false", "1", "false", "true", "false", "", "false", "true", "false", "true", "true", "false", "false", "false", "false", "false", "false", "false", "false", "false", "false", "false", "false", "false", "false", "false", "73009205", "61662", "false", "false", "0", "true", "0", "false", "0.954392,0.0456075", "false", "false", "false", "SNV", "true", "0x05010000000515043e000100", "1", "false", "130", "rs73009205", ""),
Allele.create("C"), FACTORY_NAME, null)
)
),
// No matching VCF features (three overlap by position only), since there are no indels in dbSNP (the test datasource), so the ground truth should be a default entry, which was constructed here manually:
helpProvideForTestCreateFuncotations("3", 64157, 64166, "AGAAAGGTCA", "TCTTTCCAGT",
helpProvideForTestCreateFuncotations(FuncotatorTestConstants.DBSNP_HG19_SNIPPET_FILE_PATH,
"3", 64157, 64166, "AGAAAGGTCA", "TCTTTCCAGT",
Collections.singletonList(TableFuncotation.create(FIELD_DEFAULT_MAP.keySet().stream().map(s -> FACTORY_NAME + "_" + s).collect(Collectors.toList()),
Arrays.asList("false", "false", "", "false", "false", "", "false", "false", "false", "", "false", "false", "false", "false", "false", "false", "false", "false", "false", "false", "false", "false", "false", "false", "false", "false", "false", "false", "false", "false", "", "", "false", "false", "", "false", "", "false", "", "false", "false", "false", "", "false", "", "", "false", "", ""),
Arrays.asList("false", "false", "", "false", "false", "", "false", "false", "false", "", "false", "false", "false", "false", "false", "false", "false", "false", "false", "false", "false", "false", "false", "false", "false", "false", "false", "false", "false", "false", "", "", "false", "false", "", "false", "", "false", "", "false", "false", "false", "", "false", "", "", "false", "", "", ""),
Allele.create("TCTTTCCAGT"), FACTORY_NAME, null))
),
// One overlapping VCF feature, non-empty FILTER field:
helpProvideForTestCreateFuncotations(FuncotatorTestConstants.DBSNP_HG19_SNIPPET_WITH_FILTERS_FILE_PATH,
"3", 61662, 61662, "T", "C",
Collections.singletonList(
TableFuncotation.create(FIELD_DEFAULT_MAP.keySet().stream().map(s -> FACTORY_NAME + "_" + s).collect(Collectors.toList()),
Arrays.asList("true", "false", "0.9744,0.02556", "false", "false", "1", "false", "true", "false", "", "false", "true", "false", "true", "true", "false", "false", "false", "false", "false", "false", "false", "false", "false", "false", "false", "false", "false", "false", "false", "73009205", "61662", "false", "false", "0", "true", "0", "false", "0.954392,0.0456075", "false", "false", "false", "SNV", "true", "0x05010000000515043e000100", "1", "false", "130", "rs73009205", "FILTER_73"),
Allele.create("C"), FACTORY_NAME, null)
)
),
};
}

Expand Down Expand Up @@ -205,17 +218,17 @@ public void testGetSupportedFuncotationFields() {
}

@Test(dataProvider = "provideForTestCreateFuncotationsOnVariant")
public void testCreateFuncotationsOnVariant(final VariantContext variant,
public void testCreateFuncotationsOnVariant(final String variantFeatureDataFileName,
final VariantContext variant,
final ReferenceContext referenceContext,
final List<Funcotation> expected) {

// Make our factory:
final VcfFuncotationFactory vcfFuncotationFactory =
createVcfFuncotationFactory(FACTORY_NAME, FACTORY_VERSION, IOUtils.getPath(FuncotatorTestConstants.DBSNP_HG19_SNIPPET_FILE_PATH));
createVcfFuncotationFactory(FACTORY_NAME, FACTORY_VERSION, IOUtils.getPath(variantFeatureDataFileName));

// Create features from the file:
final List<Feature> vcfFeatures;
try (final VCFFileReader vcfReader = new VCFFileReader(IOUtils.getPath(FuncotatorTestConstants.DBSNP_HG19_SNIPPET_FILE_PATH))) {
try (final VCFFileReader vcfReader = new VCFFileReader(IOUtils.getPath(variantFeatureDataFileName))) {
vcfFeatures = vcfReader.query(variant.getContig(), variant.getStart(), variant.getEnd()).stream().collect(Collectors.toList());
}

Expand All @@ -238,21 +251,21 @@ public void testCreateFuncotationsOnVariant(final VariantContext variant,
),
expected
);

}

@Test(dataProvider = "provideForTestCreateFuncotationsOnVariant")
public void testCreateFuncotationMetadata(final VariantContext variant,
public void testCreateFuncotationMetadata(final String variantFeatureDataFileName,
final VariantContext variant,
final ReferenceContext referenceContext,
final List<Funcotation> expected) {
// The expected ground truth is not needed for this test, but it is useful to reuse the data provider.
// Make our factory:
final VcfFuncotationFactory vcfFuncotationFactory =
createVcfFuncotationFactory(FACTORY_NAME, FACTORY_VERSION, IOUtils.getPath(FuncotatorTestConstants.DBSNP_HG19_SNIPPET_FILE_PATH));
createVcfFuncotationFactory(FACTORY_NAME, FACTORY_VERSION, IOUtils.getPath(variantFeatureDataFileName));

// Create features from the file:
final List<Feature> vcfFeatures;
try (final VCFFileReader vcfReader = new VCFFileReader(IOUtils.getPath(FuncotatorTestConstants.DBSNP_HG19_SNIPPET_FILE_PATH))) {
try (final VCFFileReader vcfReader = new VCFFileReader(IOUtils.getPath(variantFeatureDataFileName))) {
vcfFeatures = vcfReader.query(variant.getContig(), variant.getStart(), variant.getEnd()).stream().collect(Collectors.toList());
}

Expand All @@ -265,7 +278,7 @@ public void testCreateFuncotationMetadata(final VariantContext variant,
);

Assert.assertEquals(funcotations.stream().map(f -> f.getMetadata().retrieveAllHeaderInfo()).collect(Collectors.toSet()).size(), 1);
final Pair<VCFHeader, List<VariantContext>> vcfInfo = VariantContextTestUtils.readEntireVCFIntoMemory(FuncotatorTestConstants.DBSNP_HG19_SNIPPET_FILE_PATH);
final Pair<VCFHeader, List<VariantContext>> vcfInfo = VariantContextTestUtils.readEntireVCFIntoMemory(variantFeatureDataFileName);
final List<VCFInfoHeaderLine> gtOutputVcfInfoHeaderLines = vcfFuncotationFactory.createFuncotationVcfInfoHeaderLines(vcfInfo.getLeft());

// Get the info headers that are in the VCF and make sure that these are also present in the metadata
Expand Down
Loading