Skip to content

Commit 8096cf4

Browse files
committed
bring in more zstd updates from @yesimon PR broadinstitute/viral-ngs#937 and re-add all the compressors as conda dependencies (instead of relying on apt packages in the docker image) so as to facilitate the re-enabling of the conda package builds
1 parent e3859eb commit 8096cf4

File tree

3 files changed

+52
-34
lines changed

3 files changed

+52
-34
lines changed

file_utils.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -28,15 +28,15 @@ def merge_tarballs(out_tarball, in_tarballs, threads=None, extract_to_disk_path=
2828
def parser_merge_tarballs(parser=argparse.ArgumentParser()):
2929
parser.add_argument(
3030
'out_tarball',
31-
help='''output tarball (*.tar.gz|*.tar.lz4|*.tar.bz2|-);
31+
help='''output tarball (*.tar.gz|*.tar.lz4|*.tar.bz2|*.tar.zst|-);
3232
compression is inferred by the file extension.
3333
Note: if "-" is used, output will be written to stdout and
3434
--pipeOutHint must be provided to indicate compression type
3535
when compression type is not gzip (gzip is used by default).
3636
''')
3737
parser.add_argument(
3838
'in_tarballs', nargs='+',
39-
help=('input tarballs (*.tar.gz|*.tar.lz4|*.tar.bz2)')
39+
help=('input tarballs (*.tar.gz|*.tar.lz4|*.tar.bz2|*.tar.zst)')
4040
)
4141
parser.add_argument('--extractToDiskPath',
4242
dest="extract_to_disk_path",

requirements-conda.txt

+7-1
Original file line numberDiff line numberDiff line change
@@ -4,17 +4,23 @@ cd-hit=4.6.8
44
cd-hit-auxtools=4.6.8
55
fastqc=0.11.7
66
gatk=3.8
7+
lbzip2=2.5
8+
lz4-c=1.9.1
79
mvicuna=1.0
810
novoalign=3.07.00
11+
parallel=20190922
912
picard-slim=2.21.1
13+
pigz=2.4
1014
prinseq=0.20.4
11-
#r-base=3.5.1
1215
samtools=1.9
1316
trimmomatic=0.38
17+
unzip=6.0
18+
zstd=1.3.8
1419
# Python packages below
1520
arrow=0.12.1
1621
bedtools=2.28.0
1722
biopython=1.72
1823
matplotlib=2.2.4
1924
pysam=0.15.0
2025
pybedtools=0.7.10
26+
zstandard=0.11.0

util/file.py

+43-31
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,8 @@
99
import contextlib
1010
import os
1111
import gzip
12+
import bz2
13+
import zstd
1214
import io
1315
import tempfile
1416
import subprocess
@@ -328,40 +330,50 @@ def touch_p(path, times=None):
328330
touch(path, times=times)
329331

330332

331-
def open_or_gzopen(fname, *opts, **kwargs):
332-
mode = 'r'
333-
open_opts = list(opts)
333+
@contextlib.contextmanager
def zstd_open(fname, mode='r', **kwargs):
    '''Context manager that opens a zstd-compressed file for reading or writing.

    Args:
        fname: path of the .zst file.
        mode: open mode. If it contains 'r' the file is decompressed;
            any other mode is treated as a (truncating) write. If it
            does not contain 'b', the stream is wrapped so that it
            reads/writes text instead of bytes.
        **kwargs: optional settings.
            level: compression level for writing (default 10).
            threads: compressor thread count for writing (default 1).
            encoding: text encoding for text mode (default 'utf-8').
            errors, newline: passed to io.TextIOWrapper in text mode.

    Yields:
        A binary zstd stream, or an io.TextIOWrapper around it in text mode.
    '''
    text_mode = 'b' not in mode
    text_opts = {
        'encoding': kwargs.get('encoding', 'utf-8'),
        'errors': kwargs.get('errors'),
        'newline': kwargs.get('newline'),
    }

    if 'r' in mode:
        with open(fname, 'rb') as fh:
            dctx = zstd.ZstdDecompressor()
            with dctx.stream_reader(fh) as stream_reader:
                if not text_mode:
                    yield stream_reader
                    return
                text_stream = io.TextIOWrapper(stream_reader, **text_opts)
                try:
                    yield text_stream
                finally:
                    # Detach so that the wrapper does not close the
                    # underlying reader a second time when collected.
                    if not text_stream.closed:
                        text_stream.detach()
    else:
        with open(fname, 'wb') as fh:
            cctx = zstd.ZstdCompressor(level=kwargs.get('level', 10),
                                       threads=kwargs.get('threads', 1))
            # Using the writer as a context manager guarantees that the
            # zstd frame is finalised before the file handle is closed.
            with cctx.stream_writer(fh) as stream_writer:
                if not text_mode:
                    yield stream_writer
                    return
                text_stream = io.TextIOWrapper(stream_writer, **text_opts)
                try:
                    yield text_stream
                finally:
                    # detach() flushes buffered text into the compressor
                    # and leaves closing to the enclosing 'with' blocks.
                    if not text_stream.closed:
                        text_stream.detach()
355+
356+
def open_or_gzopen(fname, mode='r', **kwargs):
    '''Open a file, choosing the opener from the file extension.

    '.gz' uses gzip.open, '.bz2' uses bz2.open, '.zst' uses zstd_open
    (which returns a context manager), and anything else uses the
    builtin open().

    Args:
        fname: path of the file to open.
        mode: open mode string. A 'U' flag is translated to
            newline=None. Read modes without an explicit 'b' or 't'
            are opened as text for every file type, matching open().
            Write modes are passed through unchanged.
        **kwargs: forwarded to the selected opener. 'level' is accepted
            as an alias of 'compresslevel' for gzip and bzip2. 'level'
            and 'threads' are dropped for openers that do not use them.

    Returns:
        The object returned by the selected opener.
    '''
    assert isinstance(mode, str), "open mode must be of type str"

    # 'U' mode is deprecated in py3 and may be unsupported in future versions,
    # so use newline=None when 'U' is specified
    if 'U' in mode:
        if 'newline' not in kwargs:
            kwargs['newline'] = None
        mode = mode.replace("U", "")
        # A bare 'U' implied reading; an empty mode string is invalid.
        if not any(flag in mode for flag in 'rwax'):
            mode = 'r' + mode

    # gzip.open and bz2.open default to binary for 'r', unlike open() and
    # zstd_open; make text the default for reads so callers get the same
    # type regardless of compression (and so that newline=None applies).
    if 'r' in mode and 'b' not in mode and 't' not in mode:
        mode += 't'

    if fname.endswith('.zst'):
        return zstd_open(fname, mode=mode, **kwargs)

    # 'threads' is only meaningful to the zstd compressor.
    kwargs.pop('threads', None)

    if fname.endswith(('.gz', '.bz2')):
        # Allow using 'level' kwarg as an alias for gzip and bzip2 files.
        if 'level' in kwargs:
            kwargs['compresslevel'] = kwargs.pop('level')
        opener = gzip.open if fname.endswith('.gz') else bz2.open
        return opener(fname, mode=mode, **kwargs)

    # Plain files have no compression level.
    kwargs.pop('level', None)
    return open(fname, mode=mode, **kwargs)
365377

366378

367379
def read_tabfile_dict(inFile, header_prefix="#", skip_prefix=None, rowcount_limit=None):
@@ -986,8 +998,8 @@ def choose_compressor(filepath, threads=8):
986998
return_obj["compress_cmd"] = compressor + ["-c"]
987999
elif re.search(r'\.?zst$', filepath):
9881000
compressor = ['zstd']
989-
return_obj["decompress_cmd"] = compressor + ["-d"]
990-
return_obj["compress_cmd"] = compressor + ["-19"]
1001+
return_obj["decompress_cmd"] = compressor + ["-dc"]
1002+
return_obj["compress_cmd"] = compressor + ["-c19"]
9911003
elif re.search(r'\.?tar$', filepath):
9921004
compressor = ['cat']
9931005
return_obj["decompress_cmd"] = compressor
@@ -1031,7 +1043,7 @@ def read(self, size):
10311043
compressor = choose_compressor(pipe_hint_out)["compress_cmd"]
10321044
outfile = None
10331045
else:
1034-
compressor =choose_compressor(out_compressed_tarball)["compress_cmd"]
1046+
compressor = choose_compressor(out_compressed_tarball)["compress_cmd"]
10351047
outfile = open(out_compressed_tarball, "w")
10361048

10371049
out_compress_ps = subprocess.Popen(compressor, stdout=sys.stdout if out_compressed_tarball == "-" else outfile, stdin=subprocess.PIPE)

0 commit comments

Comments
 (0)