Skip to content

Fix non-determinism in KbestHaplotypeFinding/SeqGraphZippingCode #6195

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 9 commits into from
Oct 7, 2019
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,12 @@ protected ReadThreadingAssemblerArgumentCollection getReadThreadingAssemblerArgu
@Argument(fullName = "just-determine-active-regions", doc = "Just determine ActiveRegions, don't perform assembly or calling", optional = true)
public boolean justDetermineActiveRegions = false;


@Hidden
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I dislike hidden options, but /shrug

@Advanced
@Argument(fullName="debug-assembly-region-state", doc="Write output files for assembled regions with read summaries and called haplotypes to the specified path", optional = true)
public String assemblyStateOutput = null;

/**
* This argument is intended for benchmarking and scalability testing.
*/
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,8 @@
import org.broadinstitute.hellbender.utils.variant.writers.GVCFWriter;

import java.io.IOException;
import java.io.PrintStream;
import java.nio.file.Files;
import java.util.*;
import java.util.stream.Collectors;

Expand All @@ -71,6 +73,8 @@ public final class HaplotypeCallerEngine implements AssemblyRegionEvaluator {

private AssemblyRegionTrimmer trimmer = new AssemblyRegionTrimmer();

private final PrintStream assemblyDebugOutStream;

// the genotyping engine for the isActive() determination
private MinimalGenotypingEngine activeRegionEvaluationGenotyperEngine = null;

Expand Down Expand Up @@ -120,7 +124,6 @@ public final class HaplotypeCallerEngine implements AssemblyRegionEvaluator {
*/
private static final int MINIMUM_PUTATIVE_PLOIDY_FOR_ACTIVE_REGION_DISCOVERY = 2;


/**
* Reads with length lower than this number, after clipping off overhands outside the active region,
* won't be considered for genotyping.
Expand Down Expand Up @@ -159,6 +162,15 @@ public HaplotypeCallerEngine( final HaplotypeCallerArgumentCollection hcArgs, bo
this.aligner = SmithWatermanAligner.getAligner(hcArgs.smithWatermanImplementation);
forceCallingAllelesPresent = hcArgs.alleles != null;
initialize(createBamOutIndex, createBamOutMD5);
if (hcArgs.assemblyStateOutput != null) {
try {
assemblyDebugOutStream = new PrintStream(Files.newOutputStream(IOUtils.getPath(hcArgs.assemblyStateOutput)));
} catch (IOException e) {
throw new UserException.CouldNotCreateOutputFile(hcArgs.assemblyStateOutput, "Provided argument for assembly debug graph location could not be created");
}
} else {
assemblyDebugOutStream = null;
}
}

/**
Expand Down Expand Up @@ -524,9 +536,23 @@ public List<VariantContext> callRegion(final AssemblyRegion region, final Featur
return referenceModelForNoVariation(region, true, VCpriors);
}

if (assemblyDebugOutStream != null) {
assemblyDebugOutStream.println("\n\n\n\n"+region.getSpan()+"\nNumber of reads in region: " + region.getReads().size() + " they are:");
for (GATKRead read : region.getReads()) {
assemblyDebugOutStream.println(read.getName() + " " + read.convertToSAMRecord(region.getHeader()).getFlags());
}
}

// run the local assembler, getting back a collection of information on how we should proceed
final AssemblyResultSet untrimmedAssemblyResult = AssemblyBasedCallerUtils.assembleReads(region, givenAlleles, hcArgs, readsHeader, samplesList, logger, referenceReader, assemblyEngine, aligner, !hcArgs.doNotCorrectOverlappingBaseQualities);


if (assemblyDebugOutStream != null) {
assemblyDebugOutStream.println("\nThere were " + untrimmedAssemblyResult.getHaplotypeList().size() + " haplotypes found. Here they are:");
for (String haplotype : untrimmedAssemblyResult.getHaplotypeList().stream().map(haplotype -> haplotype.toString()).sorted().collect(Collectors.toList())) {
assemblyDebugOutStream.println(haplotype);
}
}

final SortedSet<VariantContext> allVariationEvents = untrimmedAssemblyResult.getVariationEvents(hcArgs.maxMnpDistance);

final AssemblyRegionTrimmer.Result trimmingResult = trimmer.trim(region, allVariationEvents);
Expand Down Expand Up @@ -697,6 +723,9 @@ public void shutdown() {
}
}

if (assemblyDebugOutStream != null) {
assemblyDebugOutStream.close();
}
// Write assembly region debug output if present
assemblyEngine.printDebugHistograms();

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -76,18 +76,22 @@ public final boolean isSink( final V v ) {

/**
* Get the set of source vertices of this graph
* NOTE: We return a LinkedHashSet here in order to preserve the determinism in the output order of VertexSet(),
* which is deterministic in output due to the underlying sets all being LinkedHashSets.
* @return a non-null set
*/
public final Set<V> getSources() {
return vertexSet().stream().filter(v -> isSource(v)).collect(Collectors.toSet());
public final LinkedHashSet<V> getSources() {
return vertexSet().stream().filter(v -> isSource(v)).collect(Collectors.toCollection(LinkedHashSet::new));
}

/**
* Get the set of sink vertices of this graph
* NOTE: We return a LinkedHashSet here in order to preserve the determinism in the output order of VertexSet(),
* which is deterministic in output due to the underlying sets all being LinkedHashSets.
* @return a non-null set
*/
public final Set<V> getSinks() {
return vertexSet().stream().filter(v -> isSink(v)).collect(Collectors.toSet());
public final LinkedHashSet<V> getSinks() {
return vertexSet().stream().filter(v -> isSink(v)).collect(Collectors.toCollection(LinkedHashSet::new));
Copy link
Contributor

@droazen droazen Oct 3, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Add a comment here (and at similar lines below) about why the LinkedHashSet is important here.

}

/**
Expand Down Expand Up @@ -326,22 +330,26 @@ public final void addEdges(final Supplier<E> template, final V start, final V...

/**
* Get the set of vertices connected by outgoing edges of V
* NOTE: We return a LinkedHashSet here in order to preserve the determinism in the output order of VertexSet(),
* which is deterministic in output due to the underlying sets all being LinkedHashSets.
* @param v a non-null vertex
* @return a set of vertices connected by outgoing edges from v
*/
public final Set<V> outgoingVerticesOf(final V v) {
Utils.nonNull(v);
return outgoingEdgesOf(v).stream().map(e -> getEdgeTarget(e)).collect(Collectors.toSet());
return outgoingEdgesOf(v).stream().map(e -> getEdgeTarget(e)).collect(Collectors.toCollection(LinkedHashSet::new));
}

/**
* Get the set of vertices connected to v by incoming edges
* NOTE: We return a LinkedHashSet here in order to preserve the determinism in the output order of VertexSet(),
* which is deterministic in output due to the underlying sets all being LinkedHashSets.
* @param v a non-null vertex
* @return a set of vertices {X} connected X -> v
*/
public final Set<V> incomingVerticesOf(final V v) {
Utils.nonNull(v);
return incomingEdgesOf(v).stream().map(e -> getEdgeSource(e)).collect(Collectors.toSet());
return incomingEdgesOf(v).stream().map(e -> getEdgeSource(e)).collect(Collectors.toCollection(LinkedHashSet::new));
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -252,35 +252,42 @@ private static boolean pathIsTooDivergentFromReference(final Cigar c) {
}

/**
* Print graph to file if debugGraphTransformations is enabled
* Print graph to file NOTE this requires that debugGraphTransformations be enabled.
*
* @param graph the graph to print
* @param fileName the name to give the graph file
*/
private void printDebugGraphTransform(final BaseGraph<?,?> graph, final String fileName) {
File outputFile = new File(debugGraphOutputPath, fileName);
if ( debugGraphTransformations ) {
if ( PRINT_FULL_GRAPH_FOR_DEBUGGING ) {
graph.printGraph(outputFile, pruneFactor);
} else {
graph.subsetToRefSource(10).printGraph(outputFile, pruneFactor);
}
if ( PRINT_FULL_GRAPH_FOR_DEBUGGING ) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Since you removed the if ( debugGraphTransformations ) { guard from this method, did you check all usages of it to ensure that the callsites are all now checking the debugGraphTransformations flag?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes. Every usage. I will add a comment to be doubly sure that its someone elses fault if that causes problems in the future.

graph.printGraph(outputFile, pruneFactor);
} else {
graph.subsetToRefSource(10).printGraph(outputFile, pruneFactor);
}
}

private AssemblyResult cleanupSeqGraph(final SeqGraph seqGraph) {
printDebugGraphTransform(seqGraph, "sequenceGraph.1.dot");
private AssemblyResult cleanupSeqGraph(final SeqGraph seqGraph, final Haplotype refHaplotype) {
if (debugGraphTransformations) {
printDebugGraphTransform(seqGraph, refHaplotype.getLocation() + "-sequenceGraph."+seqGraph.getKmerSize()+".1.0.non_ref_removed.dot");
}

// the very first thing we need to do is zip up the graph, or pruneGraph will be too aggressive
seqGraph.zipLinearChains();
printDebugGraphTransform(seqGraph, "sequenceGraph.2.zipped.dot");
if (debugGraphTransformations) {
printDebugGraphTransform(seqGraph, refHaplotype.getLocation() + "-sequenceGraph."+seqGraph.getKmerSize()+".1.1.zipped.dot");
}

// now go through and prune the graph, removing vertices no longer connected to the reference chain
seqGraph.removeSingletonOrphanVertices();
seqGraph.removeVerticesNotConnectedToRefRegardlessOfEdgeDirection();

printDebugGraphTransform(seqGraph, "sequenceGraph.3.pruned.dot");
if (debugGraphTransformations) {
printDebugGraphTransform(seqGraph, refHaplotype.getLocation() + "-sequenceGraph."+seqGraph.getKmerSize()+".1.2.pruned.dot");
}
seqGraph.simplifyGraph();
printDebugGraphTransform(seqGraph, "sequenceGraph.4.merged.dot");
if (debugGraphTransformations) {
printDebugGraphTransform(seqGraph, refHaplotype.getLocation() + "-sequenceGraph."+seqGraph.getKmerSize()+".1.3.merged.dot");
}

// The graph has degenerated in some way, so the reference source and/or sink cannot be id'd. Can
// happen in cases where for example the reference somehow manages to acquire a cycle, or
Expand All @@ -300,7 +307,9 @@ private AssemblyResult cleanupSeqGraph(final SeqGraph seqGraph) {
seqGraph.addVertex(dummy);
seqGraph.addEdge(complete, dummy, new BaseEdge(true, 0));
}
printDebugGraphTransform(seqGraph, "sequenceGraph.5.final.dot");
if (debugGraphTransformations) {
printDebugGraphTransform(seqGraph, refHaplotype.getLocation() + "-sequenceGraph."+seqGraph.getKmerSize()+".1.4.final.dot");
}
return new AssemblyResult(AssemblyResult.Status.ASSEMBLED_SOME_VARIATION, seqGraph, null);
}

Expand Down Expand Up @@ -453,7 +462,9 @@ private AssemblyResult createGraph(final Iterable<GATKRead> reads,
}

private AssemblyResult getAssemblyResult(final Haplotype refHaplotype, final int kmerSize, final ReadThreadingGraph rtgraph, final SmithWatermanAligner aligner) {
printDebugGraphTransform(rtgraph, refHaplotype.getLocation() + "-sequenceGraph." + kmerSize + ".0.0.raw_readthreading_graph.dot");
if (debugGraphTransformations) {
printDebugGraphTransform(rtgraph, refHaplotype.getLocation() + "-sequenceGraph." + kmerSize + ".0.0.raw_readthreading_graph.dot");
}

// look at all chains in the graph that terminate in a non-ref node (dangling sources and sinks) and see if
// we can recover them by merging some N bases from the chain back into the reference
Expand All @@ -467,7 +478,9 @@ private AssemblyResult getAssemblyResult(final Haplotype refHaplotype, final int
rtgraph.removePathsNotConnectedToRef();
}

printDebugGraphTransform(rtgraph, refHaplotype.getLocation() + "-sequenceGraph." + kmerSize + ".0.1.cleaned_readthreading_graph.dot");
if (debugGraphTransformations) {
printDebugGraphTransform(rtgraph, refHaplotype.getLocation() + "-sequenceGraph." + kmerSize + ".0.1.cleaned_readthreading_graph.dot");
}

final SeqGraph initialSeqGraph = rtgraph.toSequenceGraph();
if (debugGraphTransformations) {
Expand All @@ -482,10 +495,12 @@ private AssemblyResult getAssemblyResult(final Haplotype refHaplotype, final int
if ( debug ) {
logger.info("Using kmer size of " + rtgraph.getKmerSize() + " in read threading assembler");
}
printDebugGraphTransform(initialSeqGraph, refHaplotype.getLocation() + "-sequenceGraph." + kmerSize + ".0.2.initial_seqgraph.dot");
if (debugGraphTransformations) {
printDebugGraphTransform(initialSeqGraph, refHaplotype.getLocation() + "-sequenceGraph." + kmerSize + ".0.2.initial_seqgraph.dot");
}
initialSeqGraph.cleanNonRefPaths(); // TODO -- I don't this is possible by construction

final AssemblyResult cleaned = cleanupSeqGraph(initialSeqGraph);
final AssemblyResult cleaned = cleanupSeqGraph(initialSeqGraph, refHaplotype);
final AssemblyResult.Status status = cleaned.getStatus();
return new AssemblyResult(status, cleaned.getGraph(), rtgraph);
}
Expand Down