-
Notifications
You must be signed in to change notification settings - Fork 602
Make sure SelectVariants outputs variants in correct order (assuming input vcf is correctly sorted) #6444
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Make sure SelectVariants outputs variants in correct order (assuming input vcf is correctly sorted) #6444
Changes from 5 commits
b0b9116
7f10680
623a02e
0724904
99f912b
15555d4
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -451,6 +451,8 @@ private enum NumberAlleleRestriction { | |
|
||
private final Map<Integer, Integer> ploidyToNumberOfAlleles = new LinkedHashMap<Integer, Integer>(); | ||
|
||
private PriorityQueue<VariantContext> pendingVariants; | ||
|
||
/** | ||
* Set up the VCF writer, the sample expressions and regexs, filters inputs, and the JEXL matcher | ||
* | ||
|
@@ -525,6 +527,8 @@ public void onTraversalStart() { | |
} | ||
} | ||
|
||
pendingVariants = new PriorityQueue<>(Comparator.comparingInt(VariantContext::getStart)); | ||
|
||
final Path outPath = vcfOutput.toPath(); | ||
vcfWriter = createVCFWriter(outPath); | ||
vcfWriter.writeHeader(new VCFHeader(actualLines, samples)); | ||
|
@@ -533,6 +537,13 @@ public void onTraversalStart() { | |
@Override | ||
public void apply(VariantContext vc, ReadsContext readsContext, ReferenceContext ref, FeatureContext featureContext) { | ||
|
||
/*check for pending variants to write out | ||
since variant starts will only be moved further right, we can write out a pending variant if the current variant start is after the pending variant start | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Moving variants to the right may seem like a non sequitur to the average reader so specify that it can occur if There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. done |
||
*/ | ||
while (!pendingVariants.isEmpty() && (pendingVariants.peek().getStart()<=vc.getStart() || !(pendingVariants.peek().getContig().equals(vc.getContig())))) { | ||
vcfWriter.add(pendingVariants.poll()); | ||
} | ||
|
||
if (fullyDecode) { | ||
vc = vc.fullyDecode(getHeaderForVariants(), lenientVCFProcessing); | ||
} | ||
|
@@ -580,11 +591,15 @@ public void apply(VariantContext vc, ReadsContext readsContext, ReferenceContext | |
initalizeAlleleAnyploidIndicesCache(vc); | ||
|
||
final VariantContext sub = subsetRecord(vc, preserveAlleles, removeUnusedAlternates); | ||
final VariantContextBuilder builder = new VariantContextBuilder(vc); | ||
final VariantContext filteredGenotypeToNocall; | ||
|
||
if ( setFilteredGenotypesToNocall ) { | ||
final VariantContextBuilder builder = new VariantContextBuilder(sub); | ||
GATKVariantContextUtils.setFilteredGenotypeToNocall(builder, sub, setFilteredGenotypesToNocall, this::getGenotypeFilters); | ||
filteredGenotypeToNocall = builder.make(); | ||
} else { | ||
filteredGenotypeToNocall = sub; | ||
} | ||
final VariantContext filteredGenotypeToNocall = setFilteredGenotypesToNocall ? builder.make(): sub; | ||
|
||
// Not excluding non-variants OR (subsetted polymorphic variants AND not spanning deletion) AND (including filtered loci OR subsetted variant) is not filtered | ||
// If exclude non-variants argument is not called, filtering will NOT occur. | ||
|
@@ -617,11 +632,28 @@ public void apply(VariantContext vc, ReadsContext readsContext, ReferenceContext | |
(!selectRandomFraction || Utils.getRandomGenerator().nextDouble() < fractionRandom)) { | ||
//remove annotations being dropped and write variantcontext | ||
final VariantContext variantContextToWrite = buildVariantContextWithDroppedAnnotationsRemoved(filteredGenotypeToNocall); | ||
vcfWriter.add(variantContextToWrite); | ||
if (variantContextToWrite.getStart() != vc.getStart()) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I would just add every variant to There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yeah you're right, I was trying to avoid doing too many comparisons in the priority queue, but since it is nearly always empty this is an essentially useless optimization. |
||
//if variant has shifted, need to add to priority queue it is now after a variant to follow | ||
pendingVariants.add(variantContextToWrite); | ||
} else { | ||
vcfWriter.add(variantContextToWrite); | ||
} | ||
} | ||
} | ||
} | ||
|
||
/** | ||
* write out all remaining pending variants | ||
* @return | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Don't need the return javadoc |
||
*/ | ||
@Override | ||
public Object onTraversalSuccess() { | ||
while(!pendingVariants.isEmpty()) { | ||
vcfWriter.add(pendingVariants.poll()); | ||
} | ||
return null; | ||
} | ||
|
||
private VariantContext buildVariantContextWithDroppedAnnotationsRemoved(final VariantContext vc) { | ||
if (infoAnnotationsToDrop.isEmpty() && genotypeAnnotationsToDrop.isEmpty()) { | ||
return vc; | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -85,6 +85,30 @@ public void testComplexSelection() throws IOException { | |
spec.executeTest("testComplexSelection--" + testFile, this); | ||
} | ||
|
||
@Test | ||
public void testUntrimmedVariants() throws IOException { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This test needs some javadoc explaining the potential output ordering pitfall of the input untrimmed variants. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. done |
||
final String testFile = getToolTestDataDir() + "untrimmed.vcf"; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm going to be opinionated here and argue against using
The virtues I see in this approach are 1) Only one test file needs to be added/maintained. 2) Doesn't require exact match and therefore less brittle. 3) Intent is explicit. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Well argued, I'm convinced! |
||
|
||
final IntegrationTestSpec spec = new IntegrationTestSpec( | ||
baseTestString(" -sn SAMPLE_01", testFile), | ||
Collections.singletonList(getToolTestDataDir() + "expected/" + "untrimmed.vcf") | ||
); | ||
|
||
spec.executeTest("testUntrimmedVariants--" + testFile, this); | ||
} | ||
|
||
@Test | ||
public void testUntrimmedVariantsWithSetFilteredGtToNocall() throws IOException { | ||
final String testFile = getToolTestDataDir() + "untrimmed.vcf"; | ||
|
||
final IntegrationTestSpec spec = new IntegrationTestSpec( | ||
baseTestString(" -sn SAMPLE_01 --set-filtered-gt-to-nocall", testFile), | ||
Collections.singletonList(getToolTestDataDir() + "expected/" + "untrimmed.vcf") | ||
); | ||
|
||
spec.executeTest("testUntrimmedVariants--" + testFile, this); | ||
} | ||
|
||
@Test | ||
public void testComplexSelectionWithNonExistingSamples() throws IOException { | ||
final String testFile = getToolTestDataDir() + "vcfexample2.vcf"; | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
##fileformat=VCFv4.2 | ||
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype"> | ||
##INFO=<ID=AC,Number=A,Type=Integer,Description="Allele count in genotypes, for each ALT allele, in the same order as listed"> | ||
##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency, for each ALT allele, in the same order as listed"> | ||
##INFO=<ID=AN,Number=1,Type=Integer,Description="Total number of alleles in called genotypes"> | ||
##INFO=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth; some reads may have been filtered"> | ||
##contig=<ID=chr1,length=248956422> | ||
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT SAMPLE_01 | ||
chr1 17148447 . GAAACAAAACA G . . AC=0;AF=0.00;AN=2 GT 0|0 | ||
chr1 17148448 . A G . . AC=0;AF=0.00;AN=2 GT 0|0 | ||
chr1 17148456 . C T . . AC=0;AF=0.00;AN=2 GT 0|0 |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
##fileformat=VCFv4.2 | ||
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype"> | ||
##contig=<ID=chr1,length=248956422,assembly=38> | ||
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT SAMPLE_01 SAMPLE_02 | ||
chr1 17148447 . GAAACAAAACA GAAACAAAATA . . . GT 0|0 0|0 | ||
chr1 17148447 . GAAACAAAACA G . . . GT 0|0 0|0 | ||
chr1 17148448 . A G . . . GT 0|0 0|0 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
You can make this
final
by moving its constructor up to the declaration.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
done