|
1 | 1 | package org.broadinstitute.hellbender.engine;
|
2 | 2 |
|
| 3 | +import com.intel.genomicsdb.model.GenomicsDBExportConfiguration; |
| 4 | +import com.intel.genomicsdb.reader.GenomicsDBFeatureReader; |
3 | 5 | import htsjdk.samtools.SAMSequenceDictionary;
|
4 | 6 | import htsjdk.samtools.util.IOUtil;
|
5 | 7 | import htsjdk.tribble.*;
|
6 | 8 | import htsjdk.variant.bcf2.BCF2Codec;
|
7 |
| -import htsjdk.variant.variantcontext.GenotypeLikelihoods; |
8 | 9 | import htsjdk.variant.variantcontext.VariantContext;
|
9 | 10 | import htsjdk.variant.vcf.VCFHeader;
|
10 | 11 | import org.apache.logging.log4j.LogManager;
|
|
19 | 20 | import org.broadinstitute.hellbender.utils.gcs.BucketUtils;
|
20 | 21 | import org.broadinstitute.hellbender.utils.io.IOUtils;
|
21 | 22 | import org.broadinstitute.hellbender.utils.nio.SeekableByteChannelPrefetcher;
|
22 |
| - |
23 |
| -import com.intel.genomicsdb.model.GenomicsDBExportConfiguration; |
24 |
| -import com.intel.genomicsdb.reader.GenomicsDBFeatureReader; |
25 |
| -import com.googlecode.protobuf.format.JsonFormat; |
26 |
| -import com.intel.genomicsdb.model.GenomicsDBVidMapProto; |
| 23 | +import static org.broadinstitute.hellbender.tools.genomicsdb.GenomicsDBUtils.*; |
27 | 24 |
|
28 | 25 | import java.io.File;
|
29 | 26 | import java.io.IOException;
|
30 | 27 | import java.nio.channels.SeekableByteChannel;
|
31 |
| -import java.nio.file.Files; |
32 | 28 | import java.nio.file.Path;
|
33 |
| -import java.nio.file.Paths; |
34 | 29 | import java.util.Iterator;
|
35 | 30 | import java.util.List;
|
36 | 31 | import java.util.Optional;
|
37 | 32 | import java.util.function.Function;
|
38 |
| -import java.io.FileReader; |
39 |
| -import java.util.HashMap; |
40 |
| -import java.util.Map; |
| 33 | + |
| 34 | + |
41 | 35 |
|
42 | 36 | /**
|
43 | 37 | * Enables traversals and queries over sources of Features, which are metadata associated with a location
|
|
68 | 62 | public final class FeatureDataSource<T extends Feature> implements GATKDataSource<T>, AutoCloseable {
|
69 | 63 | private static final Logger logger = LogManager.getLogger(FeatureDataSource.class);
|
70 | 64 |
|
71 |
| - /** |
72 |
| - * identifies a path as a GenomicsDB URI |
73 |
| - */ |
74 |
| - public static final String GENOMIC_DB_URI_SCHEME = "gendb://"; |
75 |
| - |
76 | 65 | /**
|
77 | 66 | * Feature reader used to retrieve records from our file
|
78 | 67 | */
|
@@ -288,14 +277,6 @@ public FeatureDataSource(final FeatureInput<T> featureInput, final int queryLook
|
288 | 277 | this.queryLookaheadBases = queryLookaheadBases;
|
289 | 278 | }
|
290 | 279 |
|
291 |
| - /** |
292 |
| - * @param path String containing the path to test |
293 |
| - * @return true if path represent a GenomicsDB URI, otherwise false |
294 |
| - */ |
295 |
| - public static boolean isGenomicsDBPath(final String path) { |
296 |
| - return path != null && path.startsWith(GENOMIC_DB_URI_SCHEME); |
297 |
| - } |
298 |
| - |
299 | 280 | @SuppressWarnings("unchecked")
|
300 | 281 | private static <T extends Feature> FeatureReader<T> getFeatureReader(final FeatureInput<T> featureInput, final Class<? extends Feature> targetFeatureType,
|
301 | 282 | final Function<SeekableByteChannel, SeekableByteChannel> cloudWrapper,
|
@@ -368,7 +349,7 @@ private static <T extends Feature> FeatureReader<T> getFeatureReader(final Featu
|
368 | 349 | }
|
369 | 350 | }
|
370 | 351 |
|
371 |
| - private static FeatureReader<VariantContext> getGenomicsDBFeatureReader(final String path, final File reference) { |
| 352 | + protected static FeatureReader<VariantContext> getGenomicsDBFeatureReader(final String path, final File reference) { |
372 | 353 | if (!isGenomicsDBPath(path)) {
|
373 | 354 | throw new IllegalArgumentException("Trying to create a GenomicsDBReader from a non-GenomicsDB input");
|
374 | 355 | }
|
@@ -404,149 +385,6 @@ private static FeatureReader<VariantContext> getGenomicsDBFeatureReader(final St
|
404 | 385 | }
|
405 | 386 | }
|
406 | 387 |
|
407 |
| - private static GenomicsDBExportConfiguration.ExportConfiguration createExportConfiguration(final File reference, final File workspace, |
408 |
| - final File callsetJson, final File vidmapJson, |
409 |
| - final File vcfHeader) { |
410 |
| - final GenomicsDBExportConfiguration.ExportConfiguration.Builder exportConfigurationBuilder = |
411 |
| - GenomicsDBExportConfiguration.ExportConfiguration.newBuilder() |
412 |
| - .setWorkspace(workspace.getAbsolutePath()) |
413 |
| - .setReferenceGenome(reference.getAbsolutePath()) |
414 |
| - .setVidMappingFile(vidmapJson.getAbsolutePath()) |
415 |
| - .setCallsetMappingFile(callsetJson.getAbsolutePath()) |
416 |
| - .setVcfHeaderFilename(vcfHeader.getAbsolutePath()) |
417 |
| - .setProduceGTField(false) |
418 |
| - .setProduceGTWithMinPLValueForSpanningDeletions(false) |
419 |
| - .setSitesOnlyQuery(false) |
420 |
| - .setMaxDiploidAltAllelesThatCanBeGenotyped(GenotypeLikelihoods.MAX_DIPLOID_ALT_ALLELES_THAT_CAN_BE_GENOTYPED); |
421 |
| - final Path arrayFolder = Paths.get(workspace.getAbsolutePath(), GenomicsDBConstants.DEFAULT_ARRAY_NAME).toAbsolutePath(); |
422 |
| - |
423 |
| - // For the multi-interval support, we create multiple arrays (directories) in a single workspace - |
424 |
| - // one per interval. So, if you wish to import intervals ("chr1", [ 1, 100M ]) and ("chr2", [ 1, 100M ]), |
425 |
| - // you end up with 2 directories named chr1$1$100M and chr2$1$100M. So, the array names depend on the |
426 |
| - // partition bounds. |
427 |
| - |
428 |
| - // During the read phase, the user only supplies the workspace. The array names are obtained by scanning |
429 |
| - // the entries in the workspace and reading the right arrays. For example, if you wish to read ("chr2", |
430 |
| - // 50, 50M), then only the second array is queried. |
431 |
| - |
432 |
| - // In the previous version of the tool, the array name was a constant - genomicsdb_array. The new version |
433 |
| - // will be backward compatible with respect to reads. Hence, if a directory named genomicsdb_array is found, |
434 |
| - // the array name is passed to the GenomicsDBFeatureReader otherwise the array names are generated from the |
435 |
| - // directory entries. |
436 |
| - if (Files.exists(arrayFolder)) { |
437 |
| - exportConfigurationBuilder.setArrayName(GenomicsDBConstants.DEFAULT_ARRAY_NAME); |
438 |
| - } else { |
439 |
| - exportConfigurationBuilder.setGenerateArrayNameFromPartitionBounds(true); |
440 |
| - } |
441 |
| - |
442 |
| - //Sample code snippet to show how combine operations for INFO fields can be specified using the Protobuf |
443 |
| - //API |
444 |
| - // |
445 |
| - //References |
446 |
| - //GenomicsDB Protobuf structs: https://github.com/Intel-HLS/GenomicsDB/blob/master/src/resources/genomicsdb_vid_mapping.proto |
447 |
| - //Protobuf generated Java code guide: |
448 |
| - //https://developers.google.com/protocol-buffers/docs/javatutorial#the-protocol-buffer-api |
449 |
| - //https://developers.google.com/protocol-buffers/docs/reference/java-generated |
450 |
| - |
451 |
| - //Parse the vid json and create an in-memory Protobuf structure representing the |
452 |
| - //information in the JSON file |
453 |
| - GenomicsDBVidMapProto.VidMappingPB vidMapPB; |
454 |
| - try { |
455 |
| - vidMapPB = getProtobufVidMappingFromJsonFile(vidmapJson); |
456 |
| - } catch (final IOException e) { |
457 |
| - throw new UserException("Could not open vid json file " + vidmapJson, e); |
458 |
| - } |
459 |
| - |
460 |
| - //In vidMapPB, fields is a list of GenomicsDBVidMapProto.GenomicsDBFieldInfo objects |
461 |
| - //Each GenomicsDBFieldInfo object contains information about a specific field in the TileDB/GenomicsDB store |
462 |
| - //We iterate over the list and create a field name to list index map |
463 |
| - final HashMap<String, Integer> fieldNameToIndexInVidFieldsList = |
464 |
| - getFieldNameToListIndexInProtobufVidMappingObject(vidMapPB); |
465 |
| - |
466 |
| - //Example: set MQ combine operation to median (default is also median, but this is just an example) |
467 |
| - vidMapPB = updateINFOFieldCombineOperation(vidMapPB, fieldNameToIndexInVidFieldsList, |
468 |
| - "MQ", "median"); |
469 |
| - if (vidMapPB != null) { |
470 |
| - //Use rebuilt vidMap in exportConfiguration |
471 |
| - //NOTE: this does NOT update the JSON file, the vidMapPB is a temporary structure that's passed to |
472 |
| - //C++ modules of GenomicsDB for this specific query. Other queries will continue to use the information |
473 |
| - //in the JSON file |
474 |
| - exportConfigurationBuilder.setVidMapping(vidMapPB); |
475 |
| - } |
476 |
| - |
477 |
| - return exportConfigurationBuilder.build(); |
478 |
| - } |
479 |
| - |
480 |
| - /** |
481 |
| - * Parse the vid json and create an in-memory Protobuf structure representing the |
482 |
| - * information in the JSON file |
483 |
| - * |
484 |
| - * @param vidmapJson vid JSON file |
485 |
| - * @return Protobuf object |
486 |
| - */ |
487 |
| - public static GenomicsDBVidMapProto.VidMappingPB getProtobufVidMappingFromJsonFile(final File vidmapJson) |
488 |
| - throws IOException { |
489 |
| - final GenomicsDBVidMapProto.VidMappingPB.Builder vidMapBuilder = GenomicsDBVidMapProto.VidMappingPB.newBuilder(); |
490 |
| - try (final FileReader reader = new FileReader(vidmapJson)) { |
491 |
| - JsonFormat.merge(reader, vidMapBuilder); |
492 |
| - } |
493 |
| - return vidMapBuilder.build(); |
494 |
| - } |
495 |
| - |
496 |
| - /** |
497 |
| - * In vidMapPB, fields is a list of GenomicsDBVidMapProto.GenomicsDBFieldInfo objects |
498 |
| - * Each GenomicsDBFieldInfo object contains information about a specific field in the TileDB/GenomicsDB store |
499 |
| - * We iterate over the list and create a field name to list index map |
500 |
| - * |
501 |
| - * @param vidMapPB Protobuf vid mapping object |
502 |
| - * @return map from field name to index in vidMapPB.fields list |
503 |
| - */ |
504 |
| - public static HashMap<String, Integer> getFieldNameToListIndexInProtobufVidMappingObject( |
505 |
| - final GenomicsDBVidMapProto.VidMappingPB vidMapPB) { |
506 |
| - final HashMap<String, Integer> fieldNameToIndexInVidFieldsList = new HashMap<>(); |
507 |
| - for (int fieldIdx = 0; fieldIdx < vidMapPB.getFieldsCount(); ++fieldIdx) { |
508 |
| - fieldNameToIndexInVidFieldsList.put(vidMapPB.getFields(fieldIdx).getName(), fieldIdx); |
509 |
| - } |
510 |
| - return fieldNameToIndexInVidFieldsList; |
511 |
| - } |
512 |
| - |
513 |
| - /** |
514 |
| - * Update vid Protobuf object with new combine operation for field |
515 |
| - * |
516 |
| - * @param vidMapPB input vid object |
517 |
| - * @param fieldNameToIndexInVidFieldsList name to index in list |
518 |
| - * @param fieldName INFO field name |
519 |
| - * @param newCombineOperation combine op ("sum", "median") |
520 |
| - * @return updated vid Protobuf object if field exists, else null |
521 |
| - */ |
522 |
| - public static GenomicsDBVidMapProto.VidMappingPB updateINFOFieldCombineOperation( |
523 |
| - final GenomicsDBVidMapProto.VidMappingPB vidMapPB, |
524 |
| - final Map<String, Integer> fieldNameToIndexInVidFieldsList, |
525 |
| - final String fieldName, |
526 |
| - final String newCombineOperation) { |
527 |
| - final int fieldIdx = fieldNameToIndexInVidFieldsList.containsKey(fieldName) |
528 |
| - ? fieldNameToIndexInVidFieldsList.get(fieldName) : -1; |
529 |
| - if (fieldIdx >= 0) { |
530 |
| - //Would need to rebuild vidMapPB - so get top level builder first |
531 |
| - final GenomicsDBVidMapProto.VidMappingPB.Builder updatedVidMapBuilder = vidMapPB.toBuilder(); |
532 |
| - //To update the list element corresponding to fieldName, we get the builder for that specific list element |
533 |
| - final GenomicsDBVidMapProto.GenomicsDBFieldInfo.Builder fieldBuilder = |
534 |
| - updatedVidMapBuilder.getFieldsBuilder(fieldIdx); |
535 |
| - //And update its combine operation |
536 |
| - fieldBuilder.setVCFFieldCombineOperation(newCombineOperation); |
537 |
| - |
538 |
| - //Shorter way of writing the same operation |
539 |
| - /* |
540 |
| - updatedVidMapBuilder.getFieldsBuilder(fieldIdx) |
541 |
| - .setVCFFieldCombineOperation(newCombineOperation); |
542 |
| - */ |
543 |
| - |
544 |
| - //Rebuild full vidMap |
545 |
| - return updatedVidMapBuilder.build(); |
546 |
| - } |
547 |
| - return null; |
548 |
| - } |
549 |
| - |
550 | 388 | /**
|
551 | 389 | * Returns the sequence dictionary for this source of Features.
|
552 | 390 | * Uses the dictionary from the VCF header (if present) for variant inputs,
|
|
0 commit comments