Skip to content

Commit 840bb25

Browse files
committed
Cleanup new GDB code
1 parent 13b47a5 commit 840bb25

File tree

2 files changed

+186
-172
lines changed

2 files changed

+186
-172
lines changed

src/main/java/org/broadinstitute/hellbender/engine/FeatureDataSource.java

+6-172
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,11 @@
11
package org.broadinstitute.hellbender.engine;
22

3-
import com.netflix.servo.util.VisibleForTesting;
3+
import com.intel.genomicsdb.model.GenomicsDBExportConfiguration;
4+
import com.intel.genomicsdb.reader.GenomicsDBFeatureReader;
45
import htsjdk.samtools.SAMSequenceDictionary;
56
import htsjdk.samtools.util.IOUtil;
67
import htsjdk.tribble.*;
78
import htsjdk.variant.bcf2.BCF2Codec;
8-
import htsjdk.variant.variantcontext.GenotypeLikelihoods;
99
import htsjdk.variant.variantcontext.VariantContext;
1010
import htsjdk.variant.vcf.VCFHeader;
1111
import org.apache.logging.log4j.LogManager;
@@ -20,26 +20,18 @@
2020
import org.broadinstitute.hellbender.utils.gcs.BucketUtils;
2121
import org.broadinstitute.hellbender.utils.io.IOUtils;
2222
import org.broadinstitute.hellbender.utils.nio.SeekableByteChannelPrefetcher;
23-
24-
import com.intel.genomicsdb.model.GenomicsDBExportConfiguration;
25-
import com.intel.genomicsdb.reader.GenomicsDBFeatureReader;
26-
import com.googlecode.protobuf.format.JsonFormat;
27-
import com.intel.genomicsdb.model.GenomicsDBVidMapProto;
28-
import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants;
23+
import static org.broadinstitute.hellbender.tools.genomicsdb.GenomicsDBUtils.*;
2924

3025
import java.io.File;
3126
import java.io.IOException;
3227
import java.nio.channels.SeekableByteChannel;
33-
import java.nio.file.Files;
3428
import java.nio.file.Path;
35-
import java.nio.file.Paths;
3629
import java.util.Iterator;
3730
import java.util.List;
3831
import java.util.Optional;
3932
import java.util.function.Function;
40-
import java.io.FileReader;
41-
import java.util.HashMap;
42-
import java.util.Map;
33+
34+
4335

4436
/**
4537
* Enables traversals and queries over sources of Features, which are metadata associated with a location
@@ -70,11 +62,6 @@
7062
public final class FeatureDataSource<T extends Feature> implements GATKDataSource<T>, AutoCloseable {
7163
private static final Logger logger = LogManager.getLogger(FeatureDataSource.class);
7264

73-
/**
74-
* identifies a path as a GenomicsDB URI
75-
*/
76-
public static final String GENOMIC_DB_URI_SCHEME = "gendb://";
77-
7865
/**
7966
* Feature reader used to retrieve records from our file
8067
*/
@@ -290,14 +277,6 @@ public FeatureDataSource(final FeatureInput<T> featureInput, final int queryLook
290277
this.queryLookaheadBases = queryLookaheadBases;
291278
}
292279

293-
/**
294-
* @param path String containing the path to test
295-
* @return true if path represents a GenomicsDB URI, otherwise false
296-
*/
297-
public static boolean isGenomicsDBPath(final String path) {
298-
return path != null && path.startsWith(GENOMIC_DB_URI_SCHEME);
299-
}
300-
301280
@SuppressWarnings("unchecked")
302281
private static <T extends Feature> FeatureReader<T> getFeatureReader(final FeatureInput<T> featureInput, final Class<? extends Feature> targetFeatureType,
303282
final Function<SeekableByteChannel, SeekableByteChannel> cloudWrapper,
@@ -370,8 +349,7 @@ private static <T extends Feature> FeatureReader<T> getFeatureReader(final Featu
370349
}
371350
}
372351

373-
@VisibleForTesting
374-
public static FeatureReader<VariantContext> getGenomicsDBFeatureReader(final String path, final File reference) {
352+
protected static FeatureReader<VariantContext> getGenomicsDBFeatureReader(final String path, final File reference) {
375353
if (!isGenomicsDBPath(path)) {
376354
throw new IllegalArgumentException("Trying to create a GenomicsDBReader from a non-GenomicsDB input");
377355
}
@@ -407,150 +385,6 @@ public static FeatureReader<VariantContext> getGenomicsDBFeatureReader(final Str
407385
}
408386
}
409387

410-
private static GenomicsDBExportConfiguration.ExportConfiguration createExportConfiguration(final File reference, final File workspace,
411-
final File callsetJson, final File vidmapJson,
412-
final File vcfHeader) {
413-
final GenomicsDBExportConfiguration.ExportConfiguration.Builder exportConfigurationBuilder =
414-
GenomicsDBExportConfiguration.ExportConfiguration.newBuilder()
415-
.setWorkspace(workspace.getAbsolutePath())
416-
.setReferenceGenome(reference.getAbsolutePath())
417-
.setVidMappingFile(vidmapJson.getAbsolutePath())
418-
.setCallsetMappingFile(callsetJson.getAbsolutePath())
419-
.setVcfHeaderFilename(vcfHeader.getAbsolutePath())
420-
.setProduceGTField(false)
421-
.setProduceGTWithMinPLValueForSpanningDeletions(false)
422-
.setSitesOnlyQuery(false)
423-
.setMaxDiploidAltAllelesThatCanBeGenotyped(GenotypeLikelihoods.MAX_DIPLOID_ALT_ALLELES_THAT_CAN_BE_GENOTYPED);
424-
final Path arrayFolder = Paths.get(workspace.getAbsolutePath(), GenomicsDBConstants.DEFAULT_ARRAY_NAME).toAbsolutePath();
425-
426-
// For the multi-interval support, we create multiple arrays (directories) in a single workspace -
427-
// one per interval. So, if you wish to import intervals ("chr1", [ 1, 100M ]) and ("chr2", [ 1, 100M ]),
428-
// you end up with 2 directories named chr1$1$100M and chr2$1$100M. So, the array names depend on the
429-
// partition bounds.
430-
431-
// During the read phase, the user only supplies the workspace. The array names are obtained by scanning
432-
// the entries in the workspace and reading the right arrays. For example, if you wish to read ("chr2",
433-
// 50, 50M), then only the second array is queried.
434-
435-
// In the previous version of the tool, the array name was a constant - genomicsdb_array. The new version
436-
// will be backward compatible with respect to reads. Hence, if a directory named genomicsdb_array is found,
437-
// the array name is passed to the GenomicsDBFeatureReader otherwise the array names are generated from the
438-
// directory entries.
439-
if (Files.exists(arrayFolder)) {
440-
exportConfigurationBuilder.setArrayName(GenomicsDBConstants.DEFAULT_ARRAY_NAME);
441-
} else {
442-
exportConfigurationBuilder.setGenerateArrayNameFromPartitionBounds(true);
443-
}
444-
445-
//Sample code snippet to show how combine operations for INFO fields can be specified using the Protobuf
446-
//API
447-
//
448-
//References
449-
//GenomicsDB Protobuf structs: https://github.com/Intel-HLS/GenomicsDB/blob/master/src/resources/genomicsdb_vid_mapping.proto
450-
//Protobuf generated Java code guide:
451-
//https://developers.google.com/protocol-buffers/docs/javatutorial#the-protocol-buffer-api
452-
//https://developers.google.com/protocol-buffers/docs/reference/java-generated
453-
454-
//Parse the vid json and create an in-memory Protobuf structure representing the
455-
//information in the JSON file
456-
GenomicsDBVidMapProto.VidMappingPB vidMapPB;
457-
try {
458-
vidMapPB = getProtobufVidMappingFromJsonFile(vidmapJson);
459-
} catch (final IOException e) {
460-
throw new UserException("Could not open vid json file " + vidmapJson, e);
461-
}
462-
463-
//In vidMapPB, fields is a list of GenomicsDBVidMapProto.GenomicsDBFieldInfo objects
464-
//Each GenomicsDBFieldInfo object contains information about a specific field in the TileDB/GenomicsDB store
465-
//We iterate over the list and create a field name to list index map
466-
final HashMap<String, Integer> fieldNameToIndexInVidFieldsList =
467-
getFieldNameToListIndexInProtobufVidMappingObject(vidMapPB);
468-
469-
vidMapPB = updateINFOFieldCombineOperation(vidMapPB, fieldNameToIndexInVidFieldsList,
470-
GATKVCFConstants.RAW_MAPPING_QUALITY_WITH_DEPTH_KEY, "element_wise_sum");
471-
472-
473-
if (vidMapPB != null) {
474-
//Use rebuilt vidMap in exportConfiguration
475-
//NOTE: this does NOT update the JSON file, the vidMapPB is a temporary structure that's passed to
476-
//C++ modules of GenomicsDB for this specific query. Other queries will continue to use the information
477-
//in the JSON file
478-
exportConfigurationBuilder.setVidMapping(vidMapPB);
479-
}
480-
481-
return exportConfigurationBuilder.build();
482-
}
483-
484-
/**
485-
* Parse the vid json and create an in-memory Protobuf structure representing the
486-
* information in the JSON file
487-
*
488-
* @param vidmapJson vid JSON file
489-
* @return Protobuf object
490-
*/
491-
public static GenomicsDBVidMapProto.VidMappingPB getProtobufVidMappingFromJsonFile(final File vidmapJson)
492-
throws IOException {
493-
final GenomicsDBVidMapProto.VidMappingPB.Builder vidMapBuilder = GenomicsDBVidMapProto.VidMappingPB.newBuilder();
494-
try (final FileReader reader = new FileReader(vidmapJson)) {
495-
JsonFormat.merge(reader, vidMapBuilder);
496-
}
497-
return vidMapBuilder.build();
498-
}
499-
500-
/**
501-
* In vidMapPB, fields is a list of GenomicsDBVidMapProto.GenomicsDBFieldInfo objects
502-
* Each GenomicsDBFieldInfo object contains information about a specific field in the TileDB/GenomicsDB store
503-
* We iterate over the list and create a field name to list index map
504-
*
505-
* @param vidMapPB Protobuf vid mapping object
506-
* @return map from field name to index in vidMapPB.fields list
507-
*/
508-
public static HashMap<String, Integer> getFieldNameToListIndexInProtobufVidMappingObject(
509-
final GenomicsDBVidMapProto.VidMappingPB vidMapPB) {
510-
final HashMap<String, Integer> fieldNameToIndexInVidFieldsList = new HashMap<>();
511-
for (int fieldIdx = 0; fieldIdx < vidMapPB.getFieldsCount(); ++fieldIdx) {
512-
fieldNameToIndexInVidFieldsList.put(vidMapPB.getFields(fieldIdx).getName(), fieldIdx);
513-
}
514-
return fieldNameToIndexInVidFieldsList;
515-
}
516-
517-
/**
518-
* Update vid Protobuf object with new combine operation for field
519-
*
520-
* @param vidMapPB input vid object
521-
* @param fieldNameToIndexInVidFieldsList name to index in list
522-
* @param fieldName INFO field name
523-
* @param newCombineOperation name of the combine operation to set (e.g. "sum", "median", "element_wise_sum")
524-
* @return updated vid Protobuf object if field exists, else null
525-
*/
526-
public static GenomicsDBVidMapProto.VidMappingPB updateINFOFieldCombineOperation(
527-
final GenomicsDBVidMapProto.VidMappingPB vidMapPB,
528-
final Map<String, Integer> fieldNameToIndexInVidFieldsList,
529-
final String fieldName,
530-
final String newCombineOperation) {
531-
final int fieldIdx = fieldNameToIndexInVidFieldsList.containsKey(fieldName)
532-
? fieldNameToIndexInVidFieldsList.get(fieldName) : -1;
533-
if (fieldIdx >= 0) {
534-
//Would need to rebuild vidMapPB - so get top level builder first
535-
final GenomicsDBVidMapProto.VidMappingPB.Builder updatedVidMapBuilder = vidMapPB.toBuilder();
536-
//To update the list element corresponding to fieldName, we get the builder for that specific list element
537-
final GenomicsDBVidMapProto.GenomicsDBFieldInfo.Builder fieldBuilder =
538-
updatedVidMapBuilder.getFieldsBuilder(fieldIdx);
539-
//And update its combine operation
540-
fieldBuilder.setVCFFieldCombineOperation(newCombineOperation);
541-
542-
//Shorter way of writing the same operation
543-
/*
544-
updatedVidMapBuilder.getFieldsBuilder(fieldIdx)
545-
.setVCFFieldCombineOperation(newCombineOperation);
546-
*/
547-
548-
//Rebuild full vidMap
549-
return updatedVidMapBuilder.build();
550-
}
551-
return null;
552-
}
553-
554388
/**
555389
* Returns the sequence dictionary for this source of Features.
556390
* Uses the dictionary from the VCF header (if present) for variant inputs,

0 commit comments

Comments
 (0)