Skip to content

Commit e3aa4ee

Browse files
ldgauthier authored and EdwardDixon committed
Improve MQ calculation accuracy (broadinstitute#4969)
Change raw MQ to a tuple of (sumSquaredMQs, totalDepth) for better accuracy where there are lots of uninformative reads or called single-sample variants with homRef genotypes. Note that incorporating this change into a pipeline requires updating both GenomicsDBImport and GenotypeGVCFs to this version at the same time.
1 parent ce5f699 commit e3aa4ee

File tree

39 files changed

+50129
-816
lines changed

39 files changed

+50129
-816
lines changed

src/main/java/org/broadinstitute/hellbender/engine/FeatureDataSource.java

+6-168
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,11 @@
11
package org.broadinstitute.hellbender.engine;
22

3+
import com.intel.genomicsdb.model.GenomicsDBExportConfiguration;
4+
import com.intel.genomicsdb.reader.GenomicsDBFeatureReader;
35
import htsjdk.samtools.SAMSequenceDictionary;
46
import htsjdk.samtools.util.IOUtil;
57
import htsjdk.tribble.*;
68
import htsjdk.variant.bcf2.BCF2Codec;
7-
import htsjdk.variant.variantcontext.GenotypeLikelihoods;
89
import htsjdk.variant.variantcontext.VariantContext;
910
import htsjdk.variant.vcf.VCFHeader;
1011
import org.apache.logging.log4j.LogManager;
@@ -19,25 +20,18 @@
1920
import org.broadinstitute.hellbender.utils.gcs.BucketUtils;
2021
import org.broadinstitute.hellbender.utils.io.IOUtils;
2122
import org.broadinstitute.hellbender.utils.nio.SeekableByteChannelPrefetcher;
22-
23-
import com.intel.genomicsdb.model.GenomicsDBExportConfiguration;
24-
import com.intel.genomicsdb.reader.GenomicsDBFeatureReader;
25-
import com.googlecode.protobuf.format.JsonFormat;
26-
import com.intel.genomicsdb.model.GenomicsDBVidMapProto;
23+
import static org.broadinstitute.hellbender.tools.genomicsdb.GenomicsDBUtils.*;
2724

2825
import java.io.File;
2926
import java.io.IOException;
3027
import java.nio.channels.SeekableByteChannel;
31-
import java.nio.file.Files;
3228
import java.nio.file.Path;
33-
import java.nio.file.Paths;
3429
import java.util.Iterator;
3530
import java.util.List;
3631
import java.util.Optional;
3732
import java.util.function.Function;
38-
import java.io.FileReader;
39-
import java.util.HashMap;
40-
import java.util.Map;
33+
34+
4135

4236
/**
4337
* Enables traversals and queries over sources of Features, which are metadata associated with a location
@@ -68,11 +62,6 @@
6862
public final class FeatureDataSource<T extends Feature> implements GATKDataSource<T>, AutoCloseable {
6963
private static final Logger logger = LogManager.getLogger(FeatureDataSource.class);
7064

71-
/**
72-
* identifies a path as a GenomicsDB URI
73-
*/
74-
public static final String GENOMIC_DB_URI_SCHEME = "gendb://";
75-
7665
/**
7766
* Feature reader used to retrieve records from our file
7867
*/
@@ -288,14 +277,6 @@ public FeatureDataSource(final FeatureInput<T> featureInput, final int queryLook
288277
this.queryLookaheadBases = queryLookaheadBases;
289278
}
290279

291-
/**
292-
* @param path String containing the path to test
293-
* @return true if path represents a GenomicsDB URI, otherwise false
294-
*/
295-
public static boolean isGenomicsDBPath(final String path) {
296-
return path != null && path.startsWith(GENOMIC_DB_URI_SCHEME);
297-
}
298-
299280
@SuppressWarnings("unchecked")
300281
private static <T extends Feature> FeatureReader<T> getFeatureReader(final FeatureInput<T> featureInput, final Class<? extends Feature> targetFeatureType,
301282
final Function<SeekableByteChannel, SeekableByteChannel> cloudWrapper,
@@ -368,7 +349,7 @@ private static <T extends Feature> FeatureReader<T> getFeatureReader(final Featu
368349
}
369350
}
370351

371-
private static FeatureReader<VariantContext> getGenomicsDBFeatureReader(final String path, final File reference) {
352+
protected static FeatureReader<VariantContext> getGenomicsDBFeatureReader(final String path, final File reference) {
372353
if (!isGenomicsDBPath(path)) {
373354
throw new IllegalArgumentException("Trying to create a GenomicsDBReader from a non-GenomicsDB input");
374355
}
@@ -404,149 +385,6 @@ private static FeatureReader<VariantContext> getGenomicsDBFeatureReader(final St
404385
}
405386
}
406387

407-
private static GenomicsDBExportConfiguration.ExportConfiguration createExportConfiguration(final File reference, final File workspace,
408-
final File callsetJson, final File vidmapJson,
409-
final File vcfHeader) {
410-
final GenomicsDBExportConfiguration.ExportConfiguration.Builder exportConfigurationBuilder =
411-
GenomicsDBExportConfiguration.ExportConfiguration.newBuilder()
412-
.setWorkspace(workspace.getAbsolutePath())
413-
.setReferenceGenome(reference.getAbsolutePath())
414-
.setVidMappingFile(vidmapJson.getAbsolutePath())
415-
.setCallsetMappingFile(callsetJson.getAbsolutePath())
416-
.setVcfHeaderFilename(vcfHeader.getAbsolutePath())
417-
.setProduceGTField(false)
418-
.setProduceGTWithMinPLValueForSpanningDeletions(false)
419-
.setSitesOnlyQuery(false)
420-
.setMaxDiploidAltAllelesThatCanBeGenotyped(GenotypeLikelihoods.MAX_DIPLOID_ALT_ALLELES_THAT_CAN_BE_GENOTYPED);
421-
final Path arrayFolder = Paths.get(workspace.getAbsolutePath(), GenomicsDBConstants.DEFAULT_ARRAY_NAME).toAbsolutePath();
422-
423-
// For the multi-interval support, we create multiple arrays (directories) in a single workspace -
424-
// one per interval. So, if you wish to import intervals ("chr1", [ 1, 100M ]) and ("chr2", [ 1, 100M ]),
425-
// you end up with 2 directories named chr1$1$100M and chr2$1$100M. So, the array names depend on the
426-
// partition bounds.
427-
428-
// During the read phase, the user only supplies the workspace. The array names are obtained by scanning
429-
// the entries in the workspace and reading the right arrays. For example, if you wish to read ("chr2",
430-
// 50, 50M), then only the second array is queried.
431-
432-
// In the previous version of the tool, the array name was a constant - genomicsdb_array. The new version
433-
// will be backward compatible with respect to reads. Hence, if a directory named genomicsdb_array is found,
434-
// the array name is passed to the GenomicsDBFeatureReader otherwise the array names are generated from the
435-
// directory entries.
436-
if (Files.exists(arrayFolder)) {
437-
exportConfigurationBuilder.setArrayName(GenomicsDBConstants.DEFAULT_ARRAY_NAME);
438-
} else {
439-
exportConfigurationBuilder.setGenerateArrayNameFromPartitionBounds(true);
440-
}
441-
442-
//Sample code snippet to show how combine operations for INFO fields can be specified using the Protobuf
443-
//API
444-
//
445-
//References
446-
//GenomicsDB Protobuf structs: https://github.com/Intel-HLS/GenomicsDB/blob/master/src/resources/genomicsdb_vid_mapping.proto
447-
//Protobuf generated Java code guide:
448-
//https://developers.google.com/protocol-buffers/docs/javatutorial#the-protocol-buffer-api
449-
//https://developers.google.com/protocol-buffers/docs/reference/java-generated
450-
451-
//Parse the vid json and create an in-memory Protobuf structure representing the
452-
//information in the JSON file
453-
GenomicsDBVidMapProto.VidMappingPB vidMapPB;
454-
try {
455-
vidMapPB = getProtobufVidMappingFromJsonFile(vidmapJson);
456-
} catch (final IOException e) {
457-
throw new UserException("Could not open vid json file " + vidmapJson, e);
458-
}
459-
460-
//In vidMapPB, fields is a list of GenomicsDBVidMapProto.GenomicsDBFieldInfo objects
461-
//Each GenomicsDBFieldInfo object contains information about a specific field in the TileDB/GenomicsDB store
462-
//We iterate over the list and create a field name to list index map
463-
final HashMap<String, Integer> fieldNameToIndexInVidFieldsList =
464-
getFieldNameToListIndexInProtobufVidMappingObject(vidMapPB);
465-
466-
//Example: set MQ combine operation to median (default is also median, but this is just an example)
467-
vidMapPB = updateINFOFieldCombineOperation(vidMapPB, fieldNameToIndexInVidFieldsList,
468-
"MQ", "median");
469-
if (vidMapPB != null) {
470-
//Use rebuilt vidMap in exportConfiguration
471-
//NOTE: this does NOT update the JSON file, the vidMapPB is a temporary structure that's passed to
472-
//C++ modules of GenomicsDB for this specific query. Other queries will continue to use the information
473-
//in the JSON file
474-
exportConfigurationBuilder.setVidMapping(vidMapPB);
475-
}
476-
477-
return exportConfigurationBuilder.build();
478-
}
479-
480-
/**
481-
* Parse the vid json and create an in-memory Protobuf structure representing the
482-
* information in the JSON file
483-
*
484-
* @param vidmapJson vid JSON file
485-
* @return Protobuf object
486-
*/
487-
public static GenomicsDBVidMapProto.VidMappingPB getProtobufVidMappingFromJsonFile(final File vidmapJson)
488-
throws IOException {
489-
final GenomicsDBVidMapProto.VidMappingPB.Builder vidMapBuilder = GenomicsDBVidMapProto.VidMappingPB.newBuilder();
490-
try (final FileReader reader = new FileReader(vidmapJson)) {
491-
JsonFormat.merge(reader, vidMapBuilder);
492-
}
493-
return vidMapBuilder.build();
494-
}
495-
496-
/**
497-
* In vidMapPB, fields is a list of GenomicsDBVidMapProto.GenomicsDBFieldInfo objects
498-
* Each GenomicsDBFieldInfo object contains information about a specific field in the TileDB/GenomicsDB store
499-
* We iterate over the list and create a field name to list index map
500-
*
501-
* @param vidMapPB Protobuf vid mapping object
502-
* @return map from field name to index in vidMapPB.fields list
503-
*/
504-
public static HashMap<String, Integer> getFieldNameToListIndexInProtobufVidMappingObject(
505-
final GenomicsDBVidMapProto.VidMappingPB vidMapPB) {
506-
final HashMap<String, Integer> fieldNameToIndexInVidFieldsList = new HashMap<>();
507-
for (int fieldIdx = 0; fieldIdx < vidMapPB.getFieldsCount(); ++fieldIdx) {
508-
fieldNameToIndexInVidFieldsList.put(vidMapPB.getFields(fieldIdx).getName(), fieldIdx);
509-
}
510-
return fieldNameToIndexInVidFieldsList;
511-
}
512-
513-
/**
514-
* Update vid Protobuf object with new combine operation for field
515-
*
516-
* @param vidMapPB input vid object
517-
* @param fieldNameToIndexInVidFieldsList name to index in list
518-
* @param fieldName INFO field name
519-
* @param newCombineOperation combine op ("sum", "median")
520-
* @return updated vid Protobuf object if field exists, else null
521-
*/
522-
public static GenomicsDBVidMapProto.VidMappingPB updateINFOFieldCombineOperation(
523-
final GenomicsDBVidMapProto.VidMappingPB vidMapPB,
524-
final Map<String, Integer> fieldNameToIndexInVidFieldsList,
525-
final String fieldName,
526-
final String newCombineOperation) {
527-
final int fieldIdx = fieldNameToIndexInVidFieldsList.containsKey(fieldName)
528-
? fieldNameToIndexInVidFieldsList.get(fieldName) : -1;
529-
if (fieldIdx >= 0) {
530-
//Would need to rebuild vidMapPB - so get top level builder first
531-
final GenomicsDBVidMapProto.VidMappingPB.Builder updatedVidMapBuilder = vidMapPB.toBuilder();
532-
//To update the list element corresponding to fieldName, we get the builder for that specific list element
533-
final GenomicsDBVidMapProto.GenomicsDBFieldInfo.Builder fieldBuilder =
534-
updatedVidMapBuilder.getFieldsBuilder(fieldIdx);
535-
//And update its combine operation
536-
fieldBuilder.setVCFFieldCombineOperation(newCombineOperation);
537-
538-
//Shorter way of writing the same operation
539-
/*
540-
updatedVidMapBuilder.getFieldsBuilder(fieldIdx)
541-
.setVCFFieldCombineOperation(newCombineOperation);
542-
*/
543-
544-
//Rebuild full vidMap
545-
return updatedVidMapBuilder.build();
546-
}
547-
return null;
548-
}
549-
550388
/**
551389
* Returns the sequence dictionary for this source of Features.
552390
* Uses the dictionary from the VCF header (if present) for variant inputs,

src/main/java/org/broadinstitute/hellbender/engine/FeatureInput.java

+3-2
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
import htsjdk.tribble.Feature;
55
import htsjdk.tribble.FeatureCodec;
66
import org.broadinstitute.barclay.argparser.CommandLineException;
7+
import org.broadinstitute.hellbender.tools.genomicsdb.GenomicsDBUtils;
78
import org.broadinstitute.hellbender.utils.Utils;
89
import org.broadinstitute.hellbender.utils.io.IOUtils;
910

@@ -244,8 +245,8 @@ public void setFeatureCodecClass(final Class<FeatureCodec<T, ?>> featureCodecCla
244245
* creates a name from the given filePath by finding the absolute path of the given input
245246
*/
246247
private static String makeIntoAbsolutePath(final String filePath){
247-
if(FeatureDataSource.isGenomicsDBPath(filePath)){
248-
return FeatureDataSource.GENOMIC_DB_URI_SCHEME + new File(filePath.replace(FeatureDataSource.GENOMIC_DB_URI_SCHEME,"")).getAbsolutePath();
248+
if(GenomicsDBUtils.isGenomicsDBPath(filePath)){
249+
return GenomicsDBUtils.GENOMIC_DB_URI_SCHEME + new File(filePath.replace(GenomicsDBUtils.GENOMIC_DB_URI_SCHEME,"")).getAbsolutePath();
249250
} else if (URI.create(filePath).getScheme() != null) {
250251
return IOUtils.getPath(filePath).toAbsolutePath().toUri().toString();
251252
} else {

0 commit comments

Comments
 (0)