-
Notifications
You must be signed in to change notification settings - Fork 602
PrintBGZFBlockInformation: a tool to dump information about blocks in a BGZF file #4239
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 3 commits
4b90d90
c57a265
35ce7cd
f9e3288
bef06f9
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,252 @@ | ||
package org.broadinstitute.hellbender.tools.diagnostics; | ||
|
||
import htsjdk.samtools.util.BlockCompressedStreamConstants; | ||
import htsjdk.samtools.util.IOUtil; | ||
import org.apache.commons.lang3.StringUtils; | ||
import org.apache.logging.log4j.LogManager; | ||
import org.apache.logging.log4j.Logger; | ||
import org.broadinstitute.barclay.argparser.Argument; | ||
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties; | ||
import org.broadinstitute.barclay.argparser.ExperimentalFeature; | ||
import org.broadinstitute.hellbender.cmdline.CommandLineProgram; | ||
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions; | ||
import org.broadinstitute.hellbender.exceptions.UserException; | ||
import org.broadinstitute.hellbender.utils.io.IOUtils; | ||
import picard.cmdline.programgroups.OtherProgramGroup; | ||
|
||
import java.io.*; | ||
import java.nio.file.Files; | ||
import java.nio.file.Path; | ||
import java.util.ArrayList; | ||
import java.util.List; | ||
|
||
/** | ||
* A diagnostic tool that prints information about the compressed blocks in a BGZF format file, | ||
* such as a .vcf.gz file. This tool can detect various kinds of BGZF file corruption such as | ||
* premature BGZF terminator blocks, truncated files, and files that were regular-GZIPPED by | ||
* accident. | ||
* <p> | ||
* The output looks like this: | ||
* </p> | ||
* <pre> | ||
* Block at file offset 0 | ||
* - compressed size: 12932 | ||
* - uncompressed size: 65280 | ||
* | ||
* Block at file offset 12932 | ||
* - compressed size: 9978 | ||
* - uncompressed size: 65280 | ||
* ... | ||
* etc. | ||
* </pre> | ||
* <p> | ||
* The output can be redirected to a file using the -O option. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Also There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Done |
||
* </p> | ||
*/ | ||
@ExperimentalFeature | ||
@CommandLineProgramProperties( | ||
summary = "Print information about the compressed blocks in a BGZF format file", | ||
oneLineSummary = "Print information about the compressed blocks in a BGZF format file", | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Maybe expand this comment to more explicitly summarize the compressed and uncompressed size. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Not necessary I think -- the tool-level docs have that information. |
||
programGroup = OtherProgramGroup.class | ||
) | ||
public class PrintBGZFBlockInformation extends CommandLineProgram { | ||
|
||
@Argument(fullName = "bgzf-file", doc = "The BGZF-format file for which to print block information", optional = false) | ||
private String bgzfPathString; | ||
|
||
@Argument(fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME, shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME, doc = "File to which to write block information (if not specified, prints to standard output)", optional = true) | ||
private String output; | ||
|
||
private Path bgzfPath; | ||
|
||
private long streamOffset = 0l; | ||
|
||
private PrintStream outStream; | ||
|
||
@Override | ||
protected void onStartup() { | ||
super.onStartup(); | ||
|
||
bgzfPath = IOUtils.getPath(bgzfPathString); | ||
|
||
if ( ! Files.exists(bgzfPath) ) { | ||
throw new UserException.CouldNotReadInputFile("File " + bgzfPathString + " does not exist"); | ||
} | ||
|
||
if ( ! IOUtil.hasBlockCompressedExtension(bgzfPathString) ) { | ||
throw new UserException.CouldNotReadInputFile("File " + bgzfPathString + " does not end in a recognized BGZF file extension (" + | ||
StringUtils.join(IOUtil.BLOCK_COMPRESSED_EXTENSIONS, ",") + ")"); | ||
} | ||
|
||
try { | ||
// Check that the file is in BGZF format. This catches the "regular GZIP" case as well: | ||
if ( ! IOUtil.isBlockCompressed(bgzfPath) ) { | ||
throw new UserException.CouldNotReadInputFile(bgzfPath, "File is not a valid BGZF file. Could possibly be a regular GZIP file?"); | ||
} | ||
} | ||
catch ( IOException e ) { | ||
throw new UserException.CouldNotReadInputFile(bgzfPath, "Unable to determine whether file is a valid BGZF file", e); | ||
} | ||
|
||
if ( output != null ) { | ||
try { | ||
outStream = new PrintStream(output); | ||
} catch (FileNotFoundException e) { | ||
throw new UserException.CouldNotCreateOutputFile(output, "Unable to open output file", e); | ||
} | ||
} else { | ||
outStream = System.out; | ||
} | ||
} | ||
|
||
@Override | ||
protected Object doWork() { | ||
BGZFBlockMetadata previousBlockInfo = null; | ||
int blockNumber = 0; | ||
final List<Integer> nonFinalTerminatorBlockIndices = new ArrayList<>(); | ||
|
||
try ( InputStream bgzfInputStream = Files.newInputStream(bgzfPath) ) { | ||
outStream.printf("BGZF block information for file: %s\n\n", bgzfPath.getFileName()); | ||
|
||
BGZFBlockMetadata blockInfo; | ||
|
||
while ( (blockInfo = processNextBlock(bgzfInputStream, bgzfPathString)) != null ) { | ||
++blockNumber; | ||
|
||
// If we saw a 0-byte terminator block that was not the final block in the file, | ||
// emit an error message | ||
if ( previousBlockInfo != null && previousBlockInfo.uncompressedSize == 0 ) { | ||
nonFinalTerminatorBlockIndices.add(blockNumber - 1); | ||
|
||
outStream.println("*******************************************************"); | ||
outStream.println("ERROR: Premature BGZF 0-byte terminator block was found"); | ||
outStream.println("at block number: " + (blockNumber - 1)); | ||
outStream.println("*******************************************************"); | ||
outStream.println(); | ||
} | ||
|
||
outStream.printf("Block #%d at file offset %d\n", blockNumber, blockInfo.blockOffset); | ||
outStream.printf("\t- compressed size: %d\n", blockInfo.compressedSize); | ||
outStream.printf("\t- uncompressed size: %d\n", blockInfo.uncompressedSize); | ||
outStream.println(); | ||
|
||
previousBlockInfo = blockInfo; | ||
} | ||
} catch ( IOException e ) { | ||
throw new UserException.CouldNotReadInputFile("Error while parsing BGZF file.", e); | ||
} | ||
|
||
// Check whether the last block in the file was a 0-byte BGZF terminator block | ||
if ( previousBlockInfo == null || previousBlockInfo.uncompressedSize != 0 ) { | ||
outStream.println("******************************************************"); | ||
outStream.println("ERROR: Final BGZF 0-byte terminator block was MISSING!"); | ||
outStream.println("******************************************************"); | ||
outStream.println(); | ||
} else { | ||
outStream.println("***************************************************************************"); | ||
outStream.println("Final BGZF 0-byte terminator block FOUND as expected at block number " + blockNumber); | ||
outStream.println("***************************************************************************"); | ||
outStream.println(); | ||
} | ||
|
||
// Emit an error message at the end if we encountered any terminator blocks before the final block: | ||
if ( ! nonFinalTerminatorBlockIndices.isEmpty() ) { | ||
outStream.println("***********************************************************"); | ||
outStream.println("ERROR: Premature BGZF 0-byte terminator block(s) were found"); | ||
outStream.println("at block number(s): " + StringUtils.join(nonFinalTerminatorBlockIndices, ",")); | ||
outStream.println("***********************************************************"); | ||
outStream.println(); | ||
} | ||
|
||
return 0; | ||
} | ||
|
||
@Override | ||
protected void onShutdown() { | ||
if ( outStream != null && outStream != System.out ) { | ||
outStream.close(); | ||
} | ||
} | ||
|
||
// Code adapted from HTSJDK's BlockCompressedInputStream class | ||
private BGZFBlockMetadata processNextBlock(InputStream stream, String streamSource) throws IOException { | ||
final byte[] buffer = new byte[BlockCompressedStreamConstants.MAX_COMPRESSED_BLOCK_SIZE]; | ||
long blockAddress = streamOffset; | ||
|
||
final int headerByteCount = readBytes(stream, buffer, 0, BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH); | ||
|
||
// Return null when we hit EOF | ||
if ( headerByteCount <= 0 ) { | ||
return null; | ||
} | ||
if (headerByteCount != BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH) { | ||
throw new IOException("Incorrect header size for file: " + streamSource); | ||
} | ||
streamOffset += headerByteCount; | ||
|
||
final int blockLength = unpackInt16(buffer, BlockCompressedStreamConstants.BLOCK_LENGTH_OFFSET) + 1; | ||
|
||
if (blockLength < BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH || blockLength > buffer.length) { | ||
throw new IOException("Unexpected compressed block length: " + blockLength + " for " + streamSource); | ||
} | ||
|
||
final int remaining = blockLength - BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH; | ||
final int dataByteCount = readBytes(stream, buffer, BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH, | ||
remaining); | ||
|
||
if (dataByteCount != remaining) { | ||
throw new IOException("Premature end of file: " + streamSource); | ||
} | ||
streamOffset += dataByteCount; | ||
|
||
final int uncompressedLength = unpackInt32(buffer, blockLength - 4); | ||
|
||
if (uncompressedLength < 0) { | ||
throw new IOException(streamSource + " has invalid uncompressed length: " + uncompressedLength); | ||
} | ||
|
||
return new BGZFBlockMetadata(blockAddress, blockLength, uncompressedLength); | ||
} | ||
|
||
private static int unpackInt16(final byte[] buffer, final int offset) { | ||
return ((buffer[offset] & 0xFF) | | ||
((buffer[offset+1] & 0xFF) << 8)); | ||
} | ||
|
||
private static int unpackInt32(final byte[] buffer, final int offset) { | ||
return ((buffer[offset] & 0xFF) | | ||
((buffer[offset+1] & 0xFF) << 8) | | ||
((buffer[offset+2] & 0xFF) << 16) | | ||
((buffer[offset+3] & 0xFF) << 24)); | ||
} | ||
|
||
private static int readBytes(final InputStream stream, final byte[] buffer, final int offset, final int length) throws IOException { | ||
int bytesRead = 0; | ||
while (bytesRead < length) { | ||
final int count = stream.read(buffer, offset + bytesRead, length - bytesRead); | ||
|
||
// Return EOF if we get EOF from read() and we haven't read any bytes | ||
if ( count < 0 && bytesRead == 0 ) { | ||
return count; | ||
// Otherwise if we hit EOF and we have read something, return the bytes read | ||
} else if (count <= 0) { | ||
break; | ||
} | ||
|
||
bytesRead += count; | ||
} | ||
return bytesRead; | ||
} | ||
|
||
private static final class BGZFBlockMetadata { | ||
private final long blockOffset; | ||
private final int compressedSize; | ||
private final int uncompressedSize; | ||
|
||
public BGZFBlockMetadata(final long blockOffset, final int compressedSize, final int uncompressedSize) { | ||
this.blockOffset = blockOffset; | ||
this.compressedSize = compressedSize; | ||
this.uncompressedSize = uncompressedSize; | ||
} | ||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,103 @@ | ||
package org.broadinstitute.hellbender.tools; | ||
|
||
import org.broadinstitute.hellbender.CommandLineProgramTest; | ||
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions; | ||
import org.broadinstitute.hellbender.exceptions.UserException; | ||
import org.broadinstitute.hellbender.testutils.IntegrationTestSpec; | ||
import org.testng.annotations.Test; | ||
|
||
import java.io.File; | ||
import java.io.IOException; | ||
|
||
public class PrintBGZFBlockInformationIntegrationTest extends CommandLineProgramTest { | ||
|
||
/* Well-formed large BGZF file */ | ||
@Test | ||
public void testNormalLargeInput() throws IOException { | ||
final File input = new File(largeFileTestDir, "gvcfs/HG00096.g.vcf.gz"); | ||
final File actualOutput = createTempFile("PrintBGZFBlockInformationIntegrationTest_testNormalLargeInput", ".out"); | ||
final File expectedOutput = new File(toolsTestDir + "PrintBGZFBlockInformation/expected_PrintBGZFBlockInformationIntegrationTest_testNormalLargeInput.out"); | ||
|
||
final String[] args = { | ||
"--bgzf-file", input.getAbsolutePath(), | ||
"--" + StandardArgumentDefinitions.OUTPUT_LONG_NAME, actualOutput.getAbsolutePath() | ||
}; | ||
runCommandLine(args); | ||
|
||
IntegrationTestSpec.assertEqualTextFiles(actualOutput, expectedOutput); | ||
} | ||
|
||
/* Well-formed small BGZF file */ | ||
@Test | ||
public void testNormalSmallInput() throws IOException { | ||
final File input = new File(toolsTestDir, "PrintBGZFBlockInformation/4featuresHG38Header.vcf.gz"); | ||
final File actualOutput = createTempFile("PrintBGZFBlockInformationIntegrationTest_testNormalSmallInput", ".out"); | ||
final File expectedOutput = new File(toolsTestDir + "PrintBGZFBlockInformation/expected_PrintBGZFBlockInformationIntegrationTest_testNormalSmallInput.out"); | ||
|
||
final String[] args = { | ||
"--bgzf-file", input.getAbsolutePath(), | ||
"--" + StandardArgumentDefinitions.OUTPUT_LONG_NAME, actualOutput.getAbsolutePath() | ||
}; | ||
runCommandLine(args); | ||
|
||
IntegrationTestSpec.assertEqualTextFiles(actualOutput, expectedOutput); | ||
} | ||
|
||
/* Malformed BGZF file missing the final 0-byte terminator block */ | ||
@Test | ||
public void testMissingBGZFTerminatorBlock() throws IOException { | ||
final File input = new File(toolsTestDir, "PrintBGZFBlockInformation/4featuresHG38Header.NO_BGZF_TERMINATOR_BLOCK.vcf.gz"); | ||
final File actualOutput = createTempFile("PrintBGZFBlockInformationIntegrationTest_testMissingBGZFTerminatorBlock", ".out"); | ||
final File expectedOutput = new File(toolsTestDir + "PrintBGZFBlockInformation/expected_PrintBGZFBlockInformationIntegrationTest_testMissingBGZFTerminatorBlock.out"); | ||
|
||
final String[] args = { | ||
"--bgzf-file", input.getAbsolutePath(), | ||
"--" + StandardArgumentDefinitions.OUTPUT_LONG_NAME, actualOutput.getAbsolutePath() | ||
}; | ||
runCommandLine(args); | ||
|
||
IntegrationTestSpec.assertEqualTextFiles(actualOutput, expectedOutput); | ||
} | ||
|
||
/* Malformed BGZF file with an incomplete (truncated) final block */ | ||
@Test(expectedExceptions= UserException.CouldNotReadInputFile.class) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Add a brief comment explaining what this test means. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Added explanatory comments to all tests |
||
public void testTruncatedFinalBlock() throws IOException { | ||
final File input = new File(toolsTestDir, "PrintBGZFBlockInformation/4featuresHG38Header.TRUNCATED_FINAL_BLOCK.vcf.gz"); | ||
final File actualOutput = createTempFile("PrintBGZFBlockInformationIntegrationTest_testTruncatedFinalBlock", ".out"); | ||
|
||
final String[] args = { | ||
"--bgzf-file", input.getAbsolutePath(), | ||
"--" + StandardArgumentDefinitions.OUTPUT_LONG_NAME, actualOutput.getAbsolutePath() | ||
}; | ||
runCommandLine(args); | ||
} | ||
|
||
/* Malformed BGZF file with an extra 0-byte terminator block in the middle */ | ||
@Test | ||
public void testExtraTerminatorBlockInMiddle() throws IOException { | ||
final File input = new File(toolsTestDir, "PrintBGZFBlockInformation/4featuresHG38Header.EXTRA_TERMINATOR_BLOCK_IN_MIDDLE.vcf.gz"); | ||
final File actualOutput = createTempFile("PrintBGZFBlockInformationIntegrationTest_testExtraTerminatorBlockInMiddle", ".out"); | ||
final File expectedOutput = new File(toolsTestDir + "PrintBGZFBlockInformation/expected_PrintBGZFBlockInformationIntegrationTest_testExtraTerminatorBlockInMiddle.out"); | ||
|
||
final String[] args = { | ||
"--bgzf-file", input.getAbsolutePath(), | ||
"--" + StandardArgumentDefinitions.OUTPUT_LONG_NAME, actualOutput.getAbsolutePath() | ||
}; | ||
runCommandLine(args); | ||
|
||
IntegrationTestSpec.assertEqualTextFiles(actualOutput, expectedOutput); | ||
} | ||
|
||
/* Regular GZIP file masquerading as a BGZF file */ | ||
@Test(expectedExceptions= UserException.CouldNotReadInputFile.class) | ||
public void testRegularGzipFile() throws IOException { | ||
final File input = new File(toolsTestDir, "PrintBGZFBlockInformation/4featuresHG38Header.REGULAR_GZIP.vcf.gz"); | ||
final File actualOutput = createTempFile("PrintBGZFBlockInformationIntegrationTest_testRegularGzipFile", ".out"); | ||
|
||
final String[] args = { | ||
"--bgzf-file", input.getAbsolutePath(), | ||
"--" + StandardArgumentDefinitions.OUTPUT_LONG_NAME, actualOutput.getAbsolutePath() | ||
}; | ||
runCommandLine(args); | ||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,36 @@ | ||
BGZF block information for file: 4featuresHG38Header.EXTRA_TERMINATOR_BLOCK_IN_MIDDLE.vcf.gz | ||
|
||
Block #1 at file offset 0 | ||
- compressed size: 12409 | ||
- uncompressed size: 65498 | ||
|
||
Block #2 at file offset 12409 | ||
- compressed size: 28 | ||
- uncompressed size: 0 | ||
|
||
******************************************************* | ||
ERROR: Premature BGZF 0-byte terminator block was found | ||
at block number: 2 | ||
******************************************************* | ||
|
||
Block #3 at file offset 12437 | ||
- compressed size: 6497 | ||
- uncompressed size: 65498 | ||
|
||
Block #4 at file offset 18934 | ||
- compressed size: 6229 | ||
- uncompressed size: 46819 | ||
|
||
Block #5 at file offset 25163 | ||
- compressed size: 28 | ||
- uncompressed size: 0 | ||
|
||
*************************************************************************** | ||
Final BGZF 0-byte terminator block FOUND as expected at block number 5 | ||
*************************************************************************** | ||
|
||
*********************************************************** | ||
ERROR: Premature BGZF 0-byte terminator block(s) were found | ||
at block number(s): 2 | ||
*********************************************************** | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
For proper formatting in the docgen, this requires the
<p>
HTML tag (I guess). There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done