-
Notifications
You must be signed in to change notification settings - Fork 603
Fix non-determinism in KbestHaplotypeFinding/SeqGraphZippingCode #6195
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
6cb3e69
03aa830
662e224
c31edb7
6430504
20ee649
57f2427
9be44aa
2131b15
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -76,18 +76,22 @@ public final boolean isSink( final V v ) { | |
|
||
/** | ||
* Get the set of source vertices of this graph | ||
* NOTE: We return a LinkedHashSet here in order to preserve the determinism in the output order of VertexSet(), | ||
* which is deterministic in output due to the underlying sets all being LinkedHashSets. | ||
* @return a non-null set | ||
*/ | ||
public final Set<V> getSources() { | ||
return vertexSet().stream().filter(v -> isSource(v)).collect(Collectors.toSet()); | ||
public final LinkedHashSet<V> getSources() { | ||
return vertexSet().stream().filter(v -> isSource(v)).collect(Collectors.toCollection(LinkedHashSet::new)); | ||
} | ||
|
||
/** | ||
* Get the set of sink vertices of this graph | ||
* NOTE: We return a LinkedHashSet here in order to preserve the determinism in the output order of VertexSet(), | ||
* which is deterministic in output due to the underlying sets all being LinkedHashSets. | ||
* @return a non-null set | ||
*/ | ||
public final Set<V> getSinks() { | ||
return vertexSet().stream().filter(v -> isSink(v)).collect(Collectors.toSet()); | ||
public final LinkedHashSet<V> getSinks() { | ||
return vertexSet().stream().filter(v -> isSink(v)).collect(Collectors.toCollection(LinkedHashSet::new)); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Add a comment here (and at similar lines below) about why the |
||
} | ||
|
||
/** | ||
|
@@ -326,22 +330,26 @@ public final void addEdges(final Supplier<E> template, final V start, final V... | |
|
||
/** | ||
* Get the set of vertices connected by outgoing edges of V | ||
* NOTE: We return a LinkedHashSet here in order to preserve the determinism in the output order of VertexSet(), | ||
* which is deterministic in output due to the underlying sets all being LinkedHashSets. | ||
* @param v a non-null vertex | ||
* @return a set of vertices connected by outgoing edges from v | ||
*/ | ||
public final Set<V> outgoingVerticesOf(final V v) { | ||
Utils.nonNull(v); | ||
return outgoingEdgesOf(v).stream().map(e -> getEdgeTarget(e)).collect(Collectors.toSet()); | ||
return outgoingEdgesOf(v).stream().map(e -> getEdgeTarget(e)).collect(Collectors.toCollection(LinkedHashSet::new)); | ||
} | ||
|
||
/** | ||
* Get the set of vertices connected to v by incoming edges | ||
* NOTE: We return a LinkedHashSet here in order to preserve the determinism in the output order of VertexSet(), | ||
* which is deterministic in output due to the underlying sets all being LinkedHashSets. | ||
* @param v a non-null vertex | ||
* @return a set of vertices {X} connected X -> v | ||
*/ | ||
public final Set<V> incomingVerticesOf(final V v) { | ||
Utils.nonNull(v); | ||
return incomingEdgesOf(v).stream().map(e -> getEdgeSource(e)).collect(Collectors.toSet()); | ||
return incomingEdgesOf(v).stream().map(e -> getEdgeSource(e)).collect(Collectors.toCollection(LinkedHashSet::new)); | ||
} | ||
|
||
/** | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -252,35 +252,42 @@ private static boolean pathIsTooDivergentFromReference(final Cigar c) { | |
} | ||
|
||
/** | ||
* Print graph to file if debugGraphTransformations is enabled | ||
* Print graph to file NOTE this requires that debugGraphTransformations be enabled. | ||
* | ||
* @param graph the graph to print | ||
* @param fileName the name to give the graph file | ||
*/ | ||
private void printDebugGraphTransform(final BaseGraph<?,?> graph, final String fileName) { | ||
File outputFile = new File(debugGraphOutputPath, fileName); | ||
if ( debugGraphTransformations ) { | ||
if ( PRINT_FULL_GRAPH_FOR_DEBUGGING ) { | ||
graph.printGraph(outputFile, pruneFactor); | ||
} else { | ||
graph.subsetToRefSource(10).printGraph(outputFile, pruneFactor); | ||
} | ||
if ( PRINT_FULL_GRAPH_FOR_DEBUGGING ) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Since you removed the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes. Every usage. I will add a comment to be doubly sure that its someone elses fault if that causes problems in the future. |
||
graph.printGraph(outputFile, pruneFactor); | ||
} else { | ||
graph.subsetToRefSource(10).printGraph(outputFile, pruneFactor); | ||
} | ||
} | ||
|
||
private AssemblyResult cleanupSeqGraph(final SeqGraph seqGraph) { | ||
printDebugGraphTransform(seqGraph, "sequenceGraph.1.dot"); | ||
private AssemblyResult cleanupSeqGraph(final SeqGraph seqGraph, final Haplotype refHaplotype) { | ||
if (debugGraphTransformations) { | ||
printDebugGraphTransform(seqGraph, refHaplotype.getLocation() + "-sequenceGraph."+seqGraph.getKmerSize()+".1.0.non_ref_removed.dot"); | ||
} | ||
|
||
// the very first thing we need to do is zip up the graph, or pruneGraph will be too aggressive | ||
seqGraph.zipLinearChains(); | ||
printDebugGraphTransform(seqGraph, "sequenceGraph.2.zipped.dot"); | ||
if (debugGraphTransformations) { | ||
printDebugGraphTransform(seqGraph, refHaplotype.getLocation() + "-sequenceGraph."+seqGraph.getKmerSize()+".1.1.zipped.dot"); | ||
} | ||
|
||
// now go through and prune the graph, removing vertices no longer connected to the reference chain | ||
seqGraph.removeSingletonOrphanVertices(); | ||
seqGraph.removeVerticesNotConnectedToRefRegardlessOfEdgeDirection(); | ||
|
||
printDebugGraphTransform(seqGraph, "sequenceGraph.3.pruned.dot"); | ||
if (debugGraphTransformations) { | ||
printDebugGraphTransform(seqGraph, refHaplotype.getLocation() + "-sequenceGraph."+seqGraph.getKmerSize()+".1.2.pruned.dot"); | ||
} | ||
seqGraph.simplifyGraph(); | ||
printDebugGraphTransform(seqGraph, "sequenceGraph.4.merged.dot"); | ||
if (debugGraphTransformations) { | ||
printDebugGraphTransform(seqGraph, refHaplotype.getLocation() + "-sequenceGraph."+seqGraph.getKmerSize()+".1.3.merged.dot"); | ||
} | ||
|
||
// The graph has degenerated in some way, so the reference source and/or sink cannot be id'd. Can | ||
// happen in cases where for example the reference somehow manages to acquire a cycle, or | ||
|
@@ -300,7 +307,9 @@ private AssemblyResult cleanupSeqGraph(final SeqGraph seqGraph) { | |
seqGraph.addVertex(dummy); | ||
seqGraph.addEdge(complete, dummy, new BaseEdge(true, 0)); | ||
} | ||
printDebugGraphTransform(seqGraph, "sequenceGraph.5.final.dot"); | ||
if (debugGraphTransformations) { | ||
printDebugGraphTransform(seqGraph, refHaplotype.getLocation() + "-sequenceGraph."+seqGraph.getKmerSize()+".1.4.final.dot"); | ||
} | ||
return new AssemblyResult(AssemblyResult.Status.ASSEMBLED_SOME_VARIATION, seqGraph, null); | ||
} | ||
|
||
|
@@ -453,7 +462,9 @@ private AssemblyResult createGraph(final Iterable<GATKRead> reads, | |
} | ||
|
||
private AssemblyResult getAssemblyResult(final Haplotype refHaplotype, final int kmerSize, final ReadThreadingGraph rtgraph, final SmithWatermanAligner aligner) { | ||
printDebugGraphTransform(rtgraph, refHaplotype.getLocation() + "-sequenceGraph." + kmerSize + ".0.0.raw_readthreading_graph.dot"); | ||
if (debugGraphTransformations) { | ||
printDebugGraphTransform(rtgraph, refHaplotype.getLocation() + "-sequenceGraph." + kmerSize + ".0.0.raw_readthreading_graph.dot"); | ||
} | ||
|
||
// look at all chains in the graph that terminate in a non-ref node (dangling sources and sinks) and see if | ||
// we can recover them by merging some N bases from the chain back into the reference | ||
|
@@ -467,7 +478,9 @@ private AssemblyResult getAssemblyResult(final Haplotype refHaplotype, final int | |
rtgraph.removePathsNotConnectedToRef(); | ||
} | ||
|
||
printDebugGraphTransform(rtgraph, refHaplotype.getLocation() + "-sequenceGraph." + kmerSize + ".0.1.cleaned_readthreading_graph.dot"); | ||
if (debugGraphTransformations) { | ||
printDebugGraphTransform(rtgraph, refHaplotype.getLocation() + "-sequenceGraph." + kmerSize + ".0.1.cleaned_readthreading_graph.dot"); | ||
} | ||
|
||
final SeqGraph initialSeqGraph = rtgraph.toSequenceGraph(); | ||
if (debugGraphTransformations) { | ||
|
@@ -482,10 +495,12 @@ private AssemblyResult getAssemblyResult(final Haplotype refHaplotype, final int | |
if ( debug ) { | ||
logger.info("Using kmer size of " + rtgraph.getKmerSize() + " in read threading assembler"); | ||
} | ||
printDebugGraphTransform(initialSeqGraph, refHaplotype.getLocation() + "-sequenceGraph." + kmerSize + ".0.2.initial_seqgraph.dot"); | ||
if (debugGraphTransformations) { | ||
printDebugGraphTransform(initialSeqGraph, refHaplotype.getLocation() + "-sequenceGraph." + kmerSize + ".0.2.initial_seqgraph.dot"); | ||
} | ||
initialSeqGraph.cleanNonRefPaths(); // TODO -- I don't this is possible by construction | ||
|
||
final AssemblyResult cleaned = cleanupSeqGraph(initialSeqGraph); | ||
final AssemblyResult cleaned = cleanupSeqGraph(initialSeqGraph, refHaplotype); | ||
final AssemblyResult.Status status = cleaned.getStatus(); | ||
return new AssemblyResult(status, cleaned.getGraph(), rtgraph); | ||
} | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I dislike hidden options, but /shrug