|
1 | 1 | package org.broadinstitute.hellbender.engine;
|
2 | 2 |
|
3 |
| -import com.netflix.servo.util.VisibleForTesting; |
| 3 | +import com.intel.genomicsdb.model.GenomicsDBExportConfiguration; |
| 4 | +import com.intel.genomicsdb.reader.GenomicsDBFeatureReader; |
4 | 5 | import htsjdk.samtools.SAMSequenceDictionary;
|
5 | 6 | import htsjdk.samtools.util.IOUtil;
|
6 | 7 | import htsjdk.tribble.*;
|
7 | 8 | import htsjdk.variant.bcf2.BCF2Codec;
|
8 |
| -import htsjdk.variant.variantcontext.GenotypeLikelihoods; |
9 | 9 | import htsjdk.variant.variantcontext.VariantContext;
|
10 | 10 | import htsjdk.variant.vcf.VCFHeader;
|
11 | 11 | import org.apache.logging.log4j.LogManager;
|
|
20 | 20 | import org.broadinstitute.hellbender.utils.gcs.BucketUtils;
|
21 | 21 | import org.broadinstitute.hellbender.utils.io.IOUtils;
|
22 | 22 | import org.broadinstitute.hellbender.utils.nio.SeekableByteChannelPrefetcher;
|
23 |
| - |
24 |
| -import com.intel.genomicsdb.model.GenomicsDBExportConfiguration; |
25 |
| -import com.intel.genomicsdb.reader.GenomicsDBFeatureReader; |
26 |
| -import com.googlecode.protobuf.format.JsonFormat; |
27 |
| -import com.intel.genomicsdb.model.GenomicsDBVidMapProto; |
28 |
| -import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants; |
| 23 | +import static org.broadinstitute.hellbender.tools.genomicsdb.GenomicsDBUtils.*; |
29 | 24 |
|
30 | 25 | import java.io.File;
|
31 | 26 | import java.io.IOException;
|
32 | 27 | import java.nio.channels.SeekableByteChannel;
|
33 |
| -import java.nio.file.Files; |
34 | 28 | import java.nio.file.Path;
|
35 |
| -import java.nio.file.Paths; |
36 | 29 | import java.util.Iterator;
|
37 | 30 | import java.util.List;
|
38 | 31 | import java.util.Optional;
|
39 | 32 | import java.util.function.Function;
|
40 |
| -import java.io.FileReader; |
41 |
| -import java.util.HashMap; |
42 |
| -import java.util.Map; |
| 33 | + |
| 34 | + |
43 | 35 |
|
44 | 36 | /**
|
45 | 37 | * Enables traversals and queries over sources of Features, which are metadata associated with a location
|
|
70 | 62 | public final class FeatureDataSource<T extends Feature> implements GATKDataSource<T>, AutoCloseable {
|
71 | 63 | private static final Logger logger = LogManager.getLogger(FeatureDataSource.class);
|
72 | 64 |
|
73 |
| - /** |
74 |
| - * identifies a path as a GenomicsDB URI |
75 |
| - */ |
76 |
| - public static final String GENOMIC_DB_URI_SCHEME = "gendb://"; |
77 |
| - |
78 | 65 | /**
|
79 | 66 | * Feature reader used to retrieve records from our file
|
80 | 67 | */
|
@@ -290,14 +277,6 @@ public FeatureDataSource(final FeatureInput<T> featureInput, final int queryLook
|
290 | 277 | this.queryLookaheadBases = queryLookaheadBases;
|
291 | 278 | }
|
292 | 279 |
|
293 |
| - /** |
294 |
| - * @param path String containing the path to test |
295 |
| - * @return true if path represent a GenomicsDB URI, otherwise false |
296 |
| - */ |
297 |
| - public static boolean isGenomicsDBPath(final String path) { |
298 |
| - return path != null && path.startsWith(GENOMIC_DB_URI_SCHEME); |
299 |
| - } |
300 |
| - |
301 | 280 | @SuppressWarnings("unchecked")
|
302 | 281 | private static <T extends Feature> FeatureReader<T> getFeatureReader(final FeatureInput<T> featureInput, final Class<? extends Feature> targetFeatureType,
|
303 | 282 | final Function<SeekableByteChannel, SeekableByteChannel> cloudWrapper,
|
@@ -370,8 +349,7 @@ private static <T extends Feature> FeatureReader<T> getFeatureReader(final Featu
|
370 | 349 | }
|
371 | 350 | }
|
372 | 351 |
|
373 |
| - @VisibleForTesting |
374 |
| - public static FeatureReader<VariantContext> getGenomicsDBFeatureReader(final String path, final File reference) { |
| 352 | + protected static FeatureReader<VariantContext> getGenomicsDBFeatureReader(final String path, final File reference) { |
375 | 353 | if (!isGenomicsDBPath(path)) {
|
376 | 354 | throw new IllegalArgumentException("Trying to create a GenomicsDBReader from a non-GenomicsDB input");
|
377 | 355 | }
|
@@ -407,150 +385,6 @@ public static FeatureReader<VariantContext> getGenomicsDBFeatureReader(final Str
|
407 | 385 | }
|
408 | 386 | }
|
409 | 387 |
|
410 |
| - private static GenomicsDBExportConfiguration.ExportConfiguration createExportConfiguration(final File reference, final File workspace, |
411 |
| - final File callsetJson, final File vidmapJson, |
412 |
| - final File vcfHeader) { |
413 |
| - final GenomicsDBExportConfiguration.ExportConfiguration.Builder exportConfigurationBuilder = |
414 |
| - GenomicsDBExportConfiguration.ExportConfiguration.newBuilder() |
415 |
| - .setWorkspace(workspace.getAbsolutePath()) |
416 |
| - .setReferenceGenome(reference.getAbsolutePath()) |
417 |
| - .setVidMappingFile(vidmapJson.getAbsolutePath()) |
418 |
| - .setCallsetMappingFile(callsetJson.getAbsolutePath()) |
419 |
| - .setVcfHeaderFilename(vcfHeader.getAbsolutePath()) |
420 |
| - .setProduceGTField(false) |
421 |
| - .setProduceGTWithMinPLValueForSpanningDeletions(false) |
422 |
| - .setSitesOnlyQuery(false) |
423 |
| - .setMaxDiploidAltAllelesThatCanBeGenotyped(GenotypeLikelihoods.MAX_DIPLOID_ALT_ALLELES_THAT_CAN_BE_GENOTYPED); |
424 |
| - final Path arrayFolder = Paths.get(workspace.getAbsolutePath(), GenomicsDBConstants.DEFAULT_ARRAY_NAME).toAbsolutePath(); |
425 |
| - |
426 |
| - // For the multi-interval support, we create multiple arrays (directories) in a single workspace - |
427 |
| - // one per interval. So, if you wish to import intervals ("chr1", [ 1, 100M ]) and ("chr2", [ 1, 100M ]), |
428 |
| - // you end up with 2 directories named chr1$1$100M and chr2$1$100M. So, the array names depend on the |
429 |
| - // partition bounds. |
430 |
| - |
431 |
| - // During the read phase, the user only supplies the workspace. The array names are obtained by scanning |
432 |
| - // the entries in the workspace and reading the right arrays. For example, if you wish to read ("chr2", |
433 |
| - // 50, 50M), then only the second array is queried. |
434 |
| - |
435 |
| - // In the previous version of the tool, the array name was a constant - genomicsdb_array. The new version |
436 |
| - // will be backward compatible with respect to reads. Hence, if a directory named genomicsdb_array is found, |
437 |
| - // the array name is passed to the GenomicsDBFeatureReader otherwise the array names are generated from the |
438 |
| - // directory entries. |
439 |
| - if (Files.exists(arrayFolder)) { |
440 |
| - exportConfigurationBuilder.setArrayName(GenomicsDBConstants.DEFAULT_ARRAY_NAME); |
441 |
| - } else { |
442 |
| - exportConfigurationBuilder.setGenerateArrayNameFromPartitionBounds(true); |
443 |
| - } |
444 |
| - |
445 |
| - //Sample code snippet to show how combine operations for INFO fields can be specified using the Protobuf |
446 |
| - //API |
447 |
| - // |
448 |
| - //References |
449 |
| - //GenomicsDB Protobuf structs: https://github.com/Intel-HLS/GenomicsDB/blob/master/src/resources/genomicsdb_vid_mapping.proto |
450 |
| - //Protobuf generated Java code guide: |
451 |
| - //https://developers.google.com/protocol-buffers/docs/javatutorial#the-protocol-buffer-api |
452 |
| - //https://developers.google.com/protocol-buffers/docs/reference/java-generated |
453 |
| - |
454 |
| - //Parse the vid json and create an in-memory Protobuf structure representing the |
455 |
| - //information in the JSON file |
456 |
| - GenomicsDBVidMapProto.VidMappingPB vidMapPB; |
457 |
| - try { |
458 |
| - vidMapPB = getProtobufVidMappingFromJsonFile(vidmapJson); |
459 |
| - } catch (final IOException e) { |
460 |
| - throw new UserException("Could not open vid json file " + vidmapJson, e); |
461 |
| - } |
462 |
| - |
463 |
| - //In vidMapPB, fields is a list of GenomicsDBVidMapProto.GenomicsDBFieldInfo objects |
464 |
| - //Each GenomicsDBFieldInfo object contains information about a specific field in the TileDB/GenomicsDB store |
465 |
| - //We iterate over the list and create a field name to list index map |
466 |
| - final HashMap<String, Integer> fieldNameToIndexInVidFieldsList = |
467 |
| - getFieldNameToListIndexInProtobufVidMappingObject(vidMapPB); |
468 |
| - |
469 |
| - vidMapPB = updateINFOFieldCombineOperation(vidMapPB, fieldNameToIndexInVidFieldsList, |
470 |
| - GATKVCFConstants.RAW_MAPPING_QUALITY_WITH_DEPTH_KEY, "element_wise_sum"); |
471 |
| - |
472 |
| - |
473 |
| - if (vidMapPB != null) { |
474 |
| - //Use rebuilt vidMap in exportConfiguration |
475 |
| - //NOTE: this does NOT update the JSON file, the vidMapPB is a temporary structure that's passed to |
476 |
| - //C++ modules of GenomicsDB for this specific query. Other queries will continue to use the information |
477 |
| - //in the JSON file |
478 |
| - exportConfigurationBuilder.setVidMapping(vidMapPB); |
479 |
| - } |
480 |
| - |
481 |
| - return exportConfigurationBuilder.build(); |
482 |
| - } |
483 |
| - |
484 |
| - /** |
485 |
| - * Parse the vid json and create an in-memory Protobuf structure representing the |
486 |
| - * information in the JSON file |
487 |
| - * |
488 |
| - * @param vidmapJson vid JSON file |
489 |
| - * @return Protobuf object |
490 |
| - */ |
491 |
| - public static GenomicsDBVidMapProto.VidMappingPB getProtobufVidMappingFromJsonFile(final File vidmapJson) |
492 |
| - throws IOException { |
493 |
| - final GenomicsDBVidMapProto.VidMappingPB.Builder vidMapBuilder = GenomicsDBVidMapProto.VidMappingPB.newBuilder(); |
494 |
| - try (final FileReader reader = new FileReader(vidmapJson)) { |
495 |
| - JsonFormat.merge(reader, vidMapBuilder); |
496 |
| - } |
497 |
| - return vidMapBuilder.build(); |
498 |
| - } |
499 |
| - |
500 |
| - /** |
501 |
| - * In vidMapPB, fields is a list of GenomicsDBVidMapProto.GenomicsDBFieldInfo objects |
502 |
| - * Each GenomicsDBFieldInfo object contains information about a specific field in the TileDB/GenomicsDB store |
503 |
| - * We iterate over the list and create a field name to list index map |
504 |
| - * |
505 |
| - * @param vidMapPB Protobuf vid mapping object |
506 |
| - * @return map from field name to index in vidMapPB.fields list |
507 |
| - */ |
508 |
| - public static HashMap<String, Integer> getFieldNameToListIndexInProtobufVidMappingObject( |
509 |
| - final GenomicsDBVidMapProto.VidMappingPB vidMapPB) { |
510 |
| - final HashMap<String, Integer> fieldNameToIndexInVidFieldsList = new HashMap<>(); |
511 |
| - for (int fieldIdx = 0; fieldIdx < vidMapPB.getFieldsCount(); ++fieldIdx) { |
512 |
| - fieldNameToIndexInVidFieldsList.put(vidMapPB.getFields(fieldIdx).getName(), fieldIdx); |
513 |
| - } |
514 |
| - return fieldNameToIndexInVidFieldsList; |
515 |
| - } |
516 |
| - |
517 |
| - /** |
518 |
| - * Update vid Protobuf object with new combine operation for field |
519 |
| - * |
520 |
| - * @param vidMapPB input vid object |
521 |
| - * @param fieldNameToIndexInVidFieldsList name to index in list |
522 |
| - * @param fieldName INFO field name |
523 |
| - * @param newCombineOperation combine op ("sum", "median") |
524 |
| - * @return updated vid Protobuf object if field exists, else null |
525 |
| - */ |
526 |
| - public static GenomicsDBVidMapProto.VidMappingPB updateINFOFieldCombineOperation( |
527 |
| - final GenomicsDBVidMapProto.VidMappingPB vidMapPB, |
528 |
| - final Map<String, Integer> fieldNameToIndexInVidFieldsList, |
529 |
| - final String fieldName, |
530 |
| - final String newCombineOperation) { |
531 |
| - final int fieldIdx = fieldNameToIndexInVidFieldsList.containsKey(fieldName) |
532 |
| - ? fieldNameToIndexInVidFieldsList.get(fieldName) : -1; |
533 |
| - if (fieldIdx >= 0) { |
534 |
| - //Would need to rebuild vidMapPB - so get top level builder first |
535 |
| - final GenomicsDBVidMapProto.VidMappingPB.Builder updatedVidMapBuilder = vidMapPB.toBuilder(); |
536 |
| - //To update the list element corresponding to fieldName, we get the builder for that specific list element |
537 |
| - final GenomicsDBVidMapProto.GenomicsDBFieldInfo.Builder fieldBuilder = |
538 |
| - updatedVidMapBuilder.getFieldsBuilder(fieldIdx); |
539 |
| - //And update its combine operation |
540 |
| - fieldBuilder.setVCFFieldCombineOperation(newCombineOperation); |
541 |
| - |
542 |
| - //Shorter way of writing the same operation |
543 |
| - /* |
544 |
| - updatedVidMapBuilder.getFieldsBuilder(fieldIdx) |
545 |
| - .setVCFFieldCombineOperation(newCombineOperation); |
546 |
| - */ |
547 |
| - |
548 |
| - //Rebuild full vidMap |
549 |
| - return updatedVidMapBuilder.build(); |
550 |
| - } |
551 |
| - return null; |
552 |
| - } |
553 |
| - |
554 | 388 | /**
|
555 | 389 | * Returns the sequence dictionary for this source of Features.
|
556 | 390 | * Uses the dictionary from the VCF header (if present) for variant inputs,
|
|
0 commit comments