-
Notifications
You must be signed in to change notification settings - Fork 602
Fix non-determinism in KbestHaplotypeFinding/SeqGraphZippingCode #6195
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 7 commits
6cb3e69
03aa830
662e224
c31edb7
6430504
20ee649
57f2427
9be44aa
2131b15
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -130,6 +130,12 @@ protected ReadThreadingAssemblerArgumentCollection getReadThreadingAssemblerArgu | |
@Argument(fullName = "just-determine-active-regions", doc = "Just determine ActiveRegions, don't perform assembly or calling", optional = true) | ||
public boolean justDetermineActiveRegions = false; | ||
|
||
|
||
@Hidden | ||
@Advanced | ||
@Argument(fullName="debug-assembly-region-state", doc="Write output files for assembled regions with read summaries and called haplotypes", optional = true) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The doc should explain what the the value is supposed to be, it sounds like it should be a boolean. |
||
public String assemblyStateOutput = null; | ||
|
||
/** | ||
* This argument is intended for benchmarking and scalability testing. | ||
*/ | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -45,6 +45,8 @@ | |
import org.broadinstitute.hellbender.utils.variant.writers.GVCFWriter; | ||
|
||
import java.io.IOException; | ||
import java.io.PrintStream; | ||
import java.nio.file.Files; | ||
import java.util.*; | ||
import java.util.stream.Collectors; | ||
|
||
|
@@ -71,6 +73,8 @@ public final class HaplotypeCallerEngine implements AssemblyRegionEvaluator { | |
|
||
private AssemblyRegionTrimmer trimmer = new AssemblyRegionTrimmer(); | ||
|
||
private PrintStream assemblyDebugOutStream; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. final? |
||
|
||
// the genotyping engine for the isActive() determination | ||
private MinimalGenotypingEngine activeRegionEvaluationGenotyperEngine = null; | ||
|
||
|
@@ -120,7 +124,6 @@ public final class HaplotypeCallerEngine implements AssemblyRegionEvaluator { | |
*/ | ||
private static final int MINIMUM_PUTATIVE_PLOIDY_FOR_ACTIVE_REGION_DISCOVERY = 2; | ||
|
||
|
||
/** | ||
* Reads with length lower than this number, after clipping off overhands outside the active region, | ||
* won't be considered for genotyping. | ||
|
@@ -159,6 +162,13 @@ public HaplotypeCallerEngine( final HaplotypeCallerArgumentCollection hcArgs, bo | |
this.aligner = SmithWatermanAligner.getAligner(hcArgs.smithWatermanImplementation); | ||
forceCallingAllelesPresent = hcArgs.alleles != null; | ||
initialize(createBamOutIndex, createBamOutMD5); | ||
if (hcArgs.assemblyStateOutput != null) { | ||
try { | ||
assemblyDebugOutStream = new PrintStream(Files.newOutputStream(IOUtils.getPath(hcArgs.assemblyStateOutput))); | ||
} catch (IOException e) { | ||
e.printStackTrace(); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Throw a |
||
} | ||
} | ||
} | ||
|
||
/** | ||
|
@@ -524,9 +534,23 @@ public List<VariantContext> callRegion(final AssemblyRegion region, final Featur | |
return referenceModelForNoVariation(region, true, VCpriors); | ||
} | ||
|
||
if (assemblyDebugOutStream != null) { | ||
assemblyDebugOutStream.println("\n\n\n\n"+region.getSpan()+"\nNumber of reads in region: " + region.getReads().size() + " they are:"); | ||
for (GATKRead read : region.getReads()) { | ||
assemblyDebugOutStream.println(read.getName() + " " + read.convertToSAMRecord(region.getHeader()).getFlags()); | ||
} | ||
} | ||
|
||
// run the local assembler, getting back a collection of information on how we should proceed | ||
final AssemblyResultSet untrimmedAssemblyResult = AssemblyBasedCallerUtils.assembleReads(region, givenAlleles, hcArgs, readsHeader, samplesList, logger, referenceReader, assemblyEngine, aligner, !hcArgs.doNotCorrectOverlappingBaseQualities); | ||
|
||
|
||
if (assemblyDebugOutStream != null) { | ||
assemblyDebugOutStream.println("\nThere were " + untrimmedAssemblyResult.getHaplotypeList().size() + " haplotypes found. Here they are:"); | ||
for (String haplotype : untrimmedAssemblyResult.getHaplotypeList().stream().map(haplotype -> haplotype.toString()).sorted().collect(Collectors.toList())) { | ||
assemblyDebugOutStream.println(haplotype); | ||
} | ||
} | ||
|
||
final SortedSet<VariantContext> allVariationEvents = untrimmedAssemblyResult.getVariationEvents(hcArgs.maxMnpDistance); | ||
|
||
final AssemblyRegionTrimmer.Result trimmingResult = trimmer.trim(region, allVariationEvents); | ||
|
@@ -697,6 +721,9 @@ public void shutdown() { | |
} | ||
} | ||
|
||
if (assemblyDebugOutStream != null) { | ||
assemblyDebugOutStream.close(); | ||
} | ||
// Write assembly region debug output if present | ||
assemblyEngine.printDebugHistograms(); | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -76,18 +76,22 @@ public final boolean isSink( final V v ) { | |
|
||
/** | ||
* Get the set of source vertices of this graph | ||
* NOTE: We return a LinkedHashSet here in order to preserve the determinism in the output order of VertexSet(), | ||
* which is deterministic in output due to the underlying sets all being LinkedHashSets. | ||
* @return a non-null set | ||
*/ | ||
public final Set<V> getSources() { | ||
return vertexSet().stream().filter(v -> isSource(v)).collect(Collectors.toSet()); | ||
public final LinkedHashSet<V> getSources() { | ||
return vertexSet().stream().filter(v -> isSource(v)).collect(Collectors.toCollection(LinkedHashSet::new)); | ||
} | ||
|
||
/** | ||
* Get the set of sink vertices of this graph | ||
* NOTE: We return a LinkedHashSet here in order to preserve the determinism in the output order of VertexSet(), | ||
* which is deterministic in output due to the underlying sets all being LinkedHashSets. | ||
* @return a non-null set | ||
*/ | ||
public final Set<V> getSinks() { | ||
return vertexSet().stream().filter(v -> isSink(v)).collect(Collectors.toSet()); | ||
public final LinkedHashSet<V> getSinks() { | ||
return vertexSet().stream().filter(v -> isSink(v)).collect(Collectors.toCollection(LinkedHashSet::new)); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Add a comment here (and at similar lines below) about why the |
||
} | ||
|
||
/** | ||
|
@@ -326,22 +330,26 @@ public final void addEdges(final Supplier<E> template, final V start, final V... | |
|
||
/** | ||
* Get the set of vertices connected by outgoing edges of V | ||
* NOTE: We return a LinkedHashSet here in order to preserve the determinism in the output order of VertexSet(), | ||
* which is deterministic in output due to the underlying sets all being LinkedHashSets. | ||
* @param v a non-null vertex | ||
* @return a set of vertices connected by outgoing edges from v | ||
*/ | ||
public final Set<V> outgoingVerticesOf(final V v) { | ||
Utils.nonNull(v); | ||
return outgoingEdgesOf(v).stream().map(e -> getEdgeTarget(e)).collect(Collectors.toSet()); | ||
return outgoingEdgesOf(v).stream().map(e -> getEdgeTarget(e)).collect(Collectors.toCollection(LinkedHashSet::new)); | ||
} | ||
|
||
/** | ||
* Get the set of vertices connected to v by incoming edges | ||
* NOTE: We return a LinkedHashSet here in order to preserve the determinism in the output order of VertexSet(), | ||
* which is deterministic in output due to the underlying sets all being LinkedHashSets. | ||
* @param v a non-null vertex | ||
* @return a set of vertices {X} connected X -> v | ||
*/ | ||
public final Set<V> incomingVerticesOf(final V v) { | ||
Utils.nonNull(v); | ||
return incomingEdgesOf(v).stream().map(e -> getEdgeSource(e)).collect(Collectors.toSet()); | ||
return incomingEdgesOf(v).stream().map(e -> getEdgeSource(e)).collect(Collectors.toCollection(LinkedHashSet::new)); | ||
} | ||
|
||
/** | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -258,29 +258,35 @@ private static boolean pathIsTooDivergentFromReference(final Cigar c) { | |
*/ | ||
private void printDebugGraphTransform(final BaseGraph<?,?> graph, final String fileName) { | ||
File outputFile = new File(debugGraphOutputPath, fileName); | ||
if ( debugGraphTransformations ) { | ||
if ( PRINT_FULL_GRAPH_FOR_DEBUGGING ) { | ||
graph.printGraph(outputFile, pruneFactor); | ||
} else { | ||
graph.subsetToRefSource(10).printGraph(outputFile, pruneFactor); | ||
} | ||
if ( PRINT_FULL_GRAPH_FOR_DEBUGGING ) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Since you removed the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes. Every usage. I will add a comment to be doubly sure that its someone elses fault if that causes problems in the future. |
||
graph.printGraph(outputFile, pruneFactor); | ||
} else { | ||
graph.subsetToRefSource(10).printGraph(outputFile, pruneFactor); | ||
} | ||
} | ||
|
||
private AssemblyResult cleanupSeqGraph(final SeqGraph seqGraph) { | ||
printDebugGraphTransform(seqGraph, "sequenceGraph.1.dot"); | ||
private AssemblyResult cleanupSeqGraph(final SeqGraph seqGraph, final Haplotype refHaplotype) { | ||
if (debugGraphTransformations) { | ||
printDebugGraphTransform(seqGraph, refHaplotype.getLocation() + "-sequenceGraph."+seqGraph.getKmerSize()+".1.0.non_ref_removed.dot"); | ||
} | ||
|
||
// the very first thing we need to do is zip up the graph, or pruneGraph will be too aggressive | ||
seqGraph.zipLinearChains(); | ||
printDebugGraphTransform(seqGraph, "sequenceGraph.2.zipped.dot"); | ||
if (debugGraphTransformations) { | ||
printDebugGraphTransform(seqGraph, refHaplotype.getLocation() + "-sequenceGraph."+seqGraph.getKmerSize()+".1.1.zipped.dot"); | ||
} | ||
|
||
// now go through and prune the graph, removing vertices no longer connected to the reference chain | ||
seqGraph.removeSingletonOrphanVertices(); | ||
seqGraph.removeVerticesNotConnectedToRefRegardlessOfEdgeDirection(); | ||
|
||
printDebugGraphTransform(seqGraph, "sequenceGraph.3.pruned.dot"); | ||
if (debugGraphTransformations) { | ||
printDebugGraphTransform(seqGraph, refHaplotype.getLocation() + "-sequenceGraph."+seqGraph.getKmerSize()+".1.2.pruned.dot"); | ||
} | ||
seqGraph.simplifyGraph(); | ||
printDebugGraphTransform(seqGraph, "sequenceGraph.4.merged.dot"); | ||
if (debugGraphTransformations) { | ||
printDebugGraphTransform(seqGraph, refHaplotype.getLocation() + "-sequenceGraph."+seqGraph.getKmerSize()+".1.3.merged.dot"); | ||
} | ||
|
||
// The graph has degenerated in some way, so the reference source and/or sink cannot be id'd. Can | ||
// happen in cases where for example the reference somehow manages to acquire a cycle, or | ||
|
@@ -300,7 +306,9 @@ private AssemblyResult cleanupSeqGraph(final SeqGraph seqGraph) { | |
seqGraph.addVertex(dummy); | ||
seqGraph.addEdge(complete, dummy, new BaseEdge(true, 0)); | ||
} | ||
printDebugGraphTransform(seqGraph, "sequenceGraph.5.final.dot"); | ||
if (debugGraphTransformations) { | ||
printDebugGraphTransform(seqGraph, refHaplotype.getLocation() + "-sequenceGraph."+seqGraph.getKmerSize()+".1.4.final.dot"); | ||
} | ||
return new AssemblyResult(AssemblyResult.Status.ASSEMBLED_SOME_VARIATION, seqGraph, null); | ||
} | ||
|
||
|
@@ -453,7 +461,9 @@ private AssemblyResult createGraph(final Iterable<GATKRead> reads, | |
} | ||
|
||
private AssemblyResult getAssemblyResult(final Haplotype refHaplotype, final int kmerSize, final ReadThreadingGraph rtgraph, final SmithWatermanAligner aligner) { | ||
printDebugGraphTransform(rtgraph, refHaplotype.getLocation() + "-sequenceGraph." + kmerSize + ".0.0.raw_readthreading_graph.dot"); | ||
if (debugGraphTransformations) { | ||
printDebugGraphTransform(rtgraph, refHaplotype.getLocation() + "-sequenceGraph." + kmerSize + ".0.0.raw_readthreading_graph.dot"); | ||
} | ||
|
||
// look at all chains in the graph that terminate in a non-ref node (dangling sources and sinks) and see if | ||
// we can recover them by merging some N bases from the chain back into the reference | ||
|
@@ -467,7 +477,9 @@ private AssemblyResult getAssemblyResult(final Haplotype refHaplotype, final int | |
rtgraph.removePathsNotConnectedToRef(); | ||
} | ||
|
||
printDebugGraphTransform(rtgraph, refHaplotype.getLocation() + "-sequenceGraph." + kmerSize + ".0.1.cleaned_readthreading_graph.dot"); | ||
if (debugGraphTransformations) { | ||
printDebugGraphTransform(rtgraph, refHaplotype.getLocation() + "-sequenceGraph." + kmerSize + ".0.1.cleaned_readthreading_graph.dot"); | ||
} | ||
|
||
final SeqGraph initialSeqGraph = rtgraph.toSequenceGraph(); | ||
if (debugGraphTransformations) { | ||
|
@@ -482,10 +494,12 @@ private AssemblyResult getAssemblyResult(final Haplotype refHaplotype, final int | |
if ( debug ) { | ||
logger.info("Using kmer size of " + rtgraph.getKmerSize() + " in read threading assembler"); | ||
} | ||
printDebugGraphTransform(initialSeqGraph, refHaplotype.getLocation() + "-sequenceGraph." + kmerSize + ".0.2.initial_seqgraph.dot"); | ||
if (debugGraphTransformations) { | ||
printDebugGraphTransform(initialSeqGraph, refHaplotype.getLocation() + "-sequenceGraph." + kmerSize + ".0.2.initial_seqgraph.dot"); | ||
} | ||
initialSeqGraph.cleanNonRefPaths(); // TODO -- I don't this is possible by construction | ||
|
||
final AssemblyResult cleaned = cleanupSeqGraph(initialSeqGraph); | ||
final AssemblyResult cleaned = cleanupSeqGraph(initialSeqGraph, refHaplotype); | ||
final AssemblyResult.Status status = cleaned.getStatus(); | ||
return new AssemblyResult(status, cleaned.getGraph(), rtgraph); | ||
} | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I dislike hidden options, but /shrug