
add clumpify-based dedup #970

Open
wants to merge 56 commits into base: master from ct-add-clumpify

Changes from 34 commits

Commits (56)
a5d58b5
add bbmap.BBMapTool().dedup_clumpify()
tomkinsc Jul 2, 2019
595764e
pass JVMmemory; add read_utils.rmdup_clumpify_bam; dedup_bam WDL task
tomkinsc Jul 2, 2019
09901d3
switch from mvicuna to clumpify-based dedup in taxon_filter.py deplete
tomkinsc Jul 2, 2019
df208ea
replace unicode apostrophe
tomkinsc Jul 2, 2019
98ac4fc
reduce clumpify max_mismatches 5->3
tomkinsc Jul 2, 2019
784877a
dump dx-toolkit version and update URL to reflect new source
tomkinsc Jul 2, 2019
232f9cd
dedup prior to metagenomics classification in WDL workflows
tomkinsc Jul 2, 2019
c01bb5b
add missing import
tomkinsc Jul 3, 2019
e25ef52
rename read_utils.wdl -> downsample.wdl, dedup.wdl
tomkinsc Jul 3, 2019
6ba96d4
rename dedup_bam wdl workflow to "dedup"
tomkinsc Jul 3, 2019
e8a4081
increase dx instance size for dedup and memory spec
tomkinsc Jul 3, 2019
8280063
correct argparse parser attachement for rmdup_clumpify_bam
tomkinsc Jul 3, 2019
b86b1c9
wrap WDL variable in dedup command block for var interpolation
tomkinsc Jul 3, 2019
8afe18f
avoid collision
tomkinsc Jul 4, 2019
7d2f45a
Merge branch 'master' into ct-add-clumpify
tomkinsc Jul 10, 2019
f48038e
Merge branch 'master' into ct-add-clumpify
tomkinsc Jul 10, 2019
c78f246
Merge branch 'master' into ct-add-clumpify
tomkinsc Jul 11, 2019
72fb4cd
add sambamba since bbtools looks for it?
tomkinsc Jul 11, 2019
d97f773
Merge branch 'master' into ct-add-clumpify
tomkinsc Jul 13, 2019
a685a8a
remove sambamba
tomkinsc Jul 13, 2019
a2ce0f1
specify containment=t for bbmap clumpify
tomkinsc Aug 1, 2019
6f26717
Merge branch 'master' into ct-add-clumpify
tomkinsc Aug 26, 2019
b73950e
Merge branch 'master' into ct-add-clumpify
tomkinsc Sep 11, 2019
a3010ea
Merge branch 'master' into ct-add-clumpify
tomkinsc Sep 11, 2019
8199b12
enforce containment=False; more tolerant bbmap unit test
tomkinsc Sep 11, 2019
6bb3f6b
update miniconda ssl certs
tomkinsc Sep 12, 2019
97eff11
increase debug info emitted by build-conda.sh
tomkinsc Sep 20, 2019
f6f9b85
Merge branch 'master' into ct-add-clumpify
tomkinsc Sep 20, 2019
1674c44
Merge branch 'master' into ct-add-clumpify
tomkinsc Oct 3, 2019
4ca4693
Merge branch 'master' into ct-add-clumpify
tomkinsc Nov 7, 2019
8f8aaae
bump bbmap to 38.71; set containment=True for clumpify
tomkinsc Nov 7, 2019
218a12b
Merge branch 'ct-add-clumpify' of ssh://github.com/broadinstitute/vir…
tomkinsc Nov 7, 2019
fa5e01e
update stage number
tomkinsc Nov 7, 2019
a0735c7
set bbmap jvmMemDefault='2g'; 1g for clumpify test
tomkinsc Nov 8, 2019
663deba
no longer skip demux_metag from validation/compilation
tomkinsc Nov 20, 2019
5a7ed3b
demux_plus/demux_metag: merge linear parts of scatters, run spike-in …
tomkinsc Nov 20, 2019
2be4a85
add DNAnexus defaults for demux_metag, set inputs in demux_metag
tomkinsc Nov 20, 2019
1d691b2
rmdup_clumpify_bam: preserve sortorder value of input bam
tomkinsc Nov 20, 2019
1ba7415
bump bbmap version 38.71 -> 38.73
tomkinsc Nov 20, 2019
bb589a1
fix bug in conda command quiet calling
tomkinsc Nov 20, 2019
472703b
maintain RG info in clumpify dedup; move processing to bbmap.py
tomkinsc Nov 20, 2019
ca726d0
demux_plus/demux_metag: update dx defaults and pass explicitly in wor…
tomkinsc Nov 20, 2019
3f9f188
remove redundant defaults from dx wdl test inputs
tomkinsc Nov 20, 2019
995cf0d
move krakenuniq back outside scatter
tomkinsc Nov 20, 2019
d54eff3
respecify kaiju deps
tomkinsc Nov 20, 2019
21a6ac4
WDL dedup_bam: report read count before & after dedup
tomkinsc Nov 20, 2019
13f5172
switch to clumpify for downsample dedup
tomkinsc Nov 21, 2019
c1d18be
change to clumpify for pre-depletion dedup
tomkinsc Nov 21, 2019
7c45da6
--JVMmemory=1g for TestDepleteHuman
tomkinsc Nov 21, 2019
d91eca5
remove rmdup from depletion call
tomkinsc Nov 21, 2019
12f73cb
expand arguments exposed for clumpify dedup
tomkinsc Nov 21, 2019
16a2b50
update expected depletion output now that we're not running dedup on it
tomkinsc Nov 21, 2019
f1f9a40
scatter/gather clumpify dedup across libraries
tomkinsc Dec 2, 2019
49bffcb
Merge branch 'master' into ct-add-clumpify
tomkinsc Dec 3, 2019
362d0f3
pass through single-end IDs for bbmap dedup
tomkinsc Mar 27, 2020
f816f6b
Merge branch 'master' into ct-add-clumpify
tomkinsc Mar 27, 2020
13 changes: 12 additions & 1 deletion pipes/WDL/workflows/classify_kaiju.wdl
@@ -1,5 +1,16 @@
import "tasks_metagenomics.wdl" as metagenomics
import "tasks_read_utils.wdl" as reads

workflow classify_kaiju {
call metagenomics.kaiju
Array[File] unclassified_bams
scatter(reads_bam in unclassified_bams) {
call reads.dedup_bam as dedup {
input:
in_bam = reads_bam
}
}
call metagenomics.kaiju {
input:
reads_unmapped_bam = dedup.dedup_bam
}
}
13 changes: 12 additions & 1 deletion pipes/WDL/workflows/classify_krakenuniq.wdl
@@ -1,8 +1,19 @@
import "tasks_metagenomics.wdl" as metagenomics
import "tasks_reports.wdl" as reports
import "tasks_read_utils.wdl" as reads

workflow classify_krakenuniq {
call metagenomics.krakenuniq
Array[File] unclassified_bams
scatter(reads_bam in unclassified_bams) {
call reads.dedup_bam as dedup {
input:
in_bam = reads_bam
}
}
call metagenomics.krakenuniq {
input:
reads_unmapped_bam = dedup.dedup_bam
}

call reports.aggregate_metagenomics_reports as metag_summary_report {
input:
5 changes: 5 additions & 0 deletions pipes/WDL/workflows/dedup.wdl
@@ -0,0 +1,5 @@
import "tasks_read_utils.wdl" as reads

workflow dedup {
call reads.dedup_bam
}
16 changes: 12 additions & 4 deletions pipes/WDL/workflows/demux_metag.wdl
@@ -5,18 +5,26 @@ import "tasks_metagenomics.wdl" as metagenomics
import "tasks_taxon_filter.wdl" as taxon_filter

Member:
If you think this is ready and want to try it out, shouldn't you remove #DX_SKIP_WORKFLOW?

import "tasks_assembly.wdl" as assembly
import "tasks_reports.wdl" as reports
import "tasks_read_utils.wdl" as reads

workflow demux_metag {
call demux.illumina_demux as illumina_demux

scatter(raw_reads in illumina_demux.raw_reads_unaligned_bams) {
call reads.dedup_bam as dedup {
input:
in_bam = raw_reads
}
}

scatter(reads_bam in dedup.dedup_bam) {

Member:
I wouldn't double-scatter. You can just keep this as a single scatter block on the raw_reads and put all the task calls together in that single scatter. WDL interpreters/compilers are smart enough to figure out the DAG and parallelization opportunities within the scatter based on the dependencies between their inputs and outputs.

Member:
So specifically:

  scatter(raw_reads in illumina_demux.raw_reads_unaligned_bams) {
    call reads.dedup_bam as dedup {
        input:
            in_bam = raw_reads        
    }
    call reports.spikein_report as spikein {
      input:
        reads_bam = dedup.dedup_bam
    }
    call taxon_filter.deplete_taxa as deplete {
      input:
        raw_reads_unmapped_bam = dedup.dedup_bam
    }
    call assembly.assemble as spades {
      input:
        assembler = "spades",
        reads_unmapped_bam = deplete.cleaned_bam
    }
  }

Member:
Two more thoughts on this topic:

  1. Importantly, merging them would allow the execution platform to keep going on the linear DAG portions of each sample as they become ready without waiting for all samples to complete dedup before proceeding to the next steps.
  2. I wonder if we should consider running the spike-in counting step on raw / non-deduplicated reads... ERCCs are so short that we might quickly hit an artificial upper bound on the counts if we run it on dedup output (a sketch of this is below).
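
For point 2, a minimal sketch of what that might look like, reusing the merged single-scatter layout from the previous comment (only the spike-in input changes; the task and variable names are the ones already used in this PR):

  scatter(raw_reads in illumina_demux.raw_reads_unaligned_bams) {
    call reads.dedup_bam as dedup {
      input:
        in_bam = raw_reads
    }
    call reports.spikein_report as spikein {
      input:
        reads_bam = raw_reads    # count spike-ins on raw reads, before dedup
    }
    call taxon_filter.deplete_taxa as deplete {
      input:
        raw_reads_unmapped_bam = dedup.dedup_bam
    }
  }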

call reports.spikein_report as spikein {
input:
reads_bam = raw_reads
reads_bam = reads_bam
}
call taxon_filter.deplete_taxa as deplete {
input:
raw_reads_unmapped_bam = raw_reads
raw_reads_unmapped_bam = reads_bam
}
call assembly.assemble as spades {
input:
@@ -27,7 +35,7 @@ workflow demux_metag {

call metagenomics.krakenuniq as kraken {
input:
reads_unmapped_bam = illumina_demux.raw_reads_unaligned_bams,
reads_unmapped_bam = dedup.dedup_bam
}
call reports.aggregate_metagenomics_reports as metag_summary_report {

Member:
Can we call aggregate_metagenomics_reports a second time on the kaiju outputs as well?

Member Author:
metagenomics.py::taxlevel_summary() hasn't been adapted/tested to read kaiju summary files yet. I'd like that to be a separate PR (this one is already way beyond its initial scope).

input:
@@ -39,6 +47,6 @@ workflow demux_metag {
}
call metagenomics.kaiju as kaiju {
input:

Member:
We haven't really used kaiju regularly via WDL yet, but I'm betting we'll want to move it to a scatter-on-single-sample execution mode (like everything else in our WDLs except kraken). Its database is about 4x smaller (I'm guessing localization takes just a few minutes) and its runtime is much longer, so the cost efficiency (algorithmic compute time vs. VM wall-clock time) of kaiju on a single sample is much better than kraken's... so we might as well move it into the same scatter block as well (a rough sketch follows).
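
A rough sketch of what that could look like, assuming the kaiju task were adapted to accept a single BAM (today it takes the full array), inside the same per-sample scatter used above:

  scatter(raw_reads in illumina_demux.raw_reads_unaligned_bams) {
    call reads.dedup_bam as dedup {
      input:
        in_bam = raw_reads
    }
    # hypothetical single-sample variant of the kaiju task
    call metagenomics.kaiju as kaiju {
      input:
        reads_unmapped_bam = dedup.dedup_bam
    }
  }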

reads_unmapped_bam = illumina_demux.raw_reads_unaligned_bams,
reads_unmapped_bam = dedup.dedup_bam
}
}
15 changes: 12 additions & 3 deletions pipes/WDL/workflows/demux_plus.wdl
@@ -3,6 +3,7 @@ import "tasks_metagenomics.wdl" as metagenomics
import "tasks_taxon_filter.wdl" as taxon_filter
import "tasks_assembly.wdl" as assembly
import "tasks_reports.wdl" as reports
import "tasks_read_utils.wdl" as reads

workflow demux_plus {

@@ -13,15 +14,23 @@ workflow demux_plus {
Array[File]? bmtaggerDbs # .tar.gz, .tgz, .tar.bz2, .tar.lz4, .fasta, or .fasta.gz
Array[File]? blastDbs # .tar.gz, .tgz, .tar.bz2, .tar.lz4, .fasta, or .fasta.gz
Array[File]? bwaDbs

scatter(raw_reads in illumina_demux.raw_reads_unaligned_bams) {
call reads.dedup_bam as dedup {
input:
in_bam = raw_reads
}
}

scatter(reads_bam in dedup.dedup_bam) {

Member:
See my comment in demux_metag about combining the scatter blocks.

call reports.spikein_report as spikein {
input:
reads_bam = raw_reads,
reads_bam = reads_bam,
spikein_db = spikein_db
}
call taxon_filter.deplete_taxa as deplete {
input:
raw_reads_unmapped_bam = raw_reads,
raw_reads_unmapped_bam = reads_bam,
bmtaggerDbs = bmtaggerDbs,
blastDbs = blastDbs,
bwaDbs = bwaDbs
@@ -37,7 +46,7 @@

call metagenomics.krakenuniq as krakenuniq {
input:
reads_unmapped_bam = illumina_demux.raw_reads_unaligned_bams
reads_unmapped_bam = dedup.dedup_bam
}

call reports.spikein_summary as spike_summary {
File renamed without changes.
28 changes: 28 additions & 0 deletions pipes/WDL/workflows/tasks/tasks_read_utils.wdl
@@ -37,4 +37,32 @@ task downsample_bams {
}
}

task dedup_bam {
File in_bam
Int? max_mismatches=3

String sample_name = basename(in_bam, ".bam")

command {
read_utils.py rmdup_clumpify_bam \
${in_bam} \
${sample_name}.dedup.bam \
${'--maxMismatches=' + max_mismatches} \
--JVMmemory "8g"

reports.py fastqc ${sample_name}.dedup.bam ${sample_name}.deduped_fastqc.html
}

output {
File dedup_bam = "${sample_name}.dedup.bam"
File dedup_only_reads_fastqc = "${sample_name}.deduped_fastqc.html"
String viralngs_version = "viral-ngs_version_unknown"
}
runtime {
docker: "quay.io/broadinstitute/viral-ngs"
memory: "52 GB"
cpu: 8
dx_instance_type: "mem1_ssd1_x32"
preemptible: 0
}
}
46 changes: 45 additions & 1 deletion read_utils.py
@@ -26,6 +26,7 @@
import util.file
import util.misc
from util.file import mkstempfname
import tools.bbmap
import tools.bwa
import tools.cdhit
import tools.picard
@@ -902,9 +903,52 @@ def parser_rmdup_cdhit_bam(parser=argparse.ArgumentParser()):
util.cmd.common_args(parser, (('loglevel', None), ('version', None), ('tmp_dir', None)))
util.cmd.attach_main(parser, rmdup_cdhit_bam, split_args=True)
return parser
__commands__.append(('rmdup_cdhit_bam', parser_rmdup_cdhit_bam))


__commands__.append(('rmdup_cdhit_bam', parser_rmdup_cdhit_bam))
def rmdup_clumpify_bam(in_bam, out_bam, max_mismatches=3, JVMmemory=None):
''' Remove duplicate reads from BAM file using bbmap's clumpify tool.
'''
tmp_dir = tempfile.mkdtemp()

tools.picard.SplitSamByLibraryTool().execute(in_bam, tmp_dir)

bbmap = tools.bbmap.BBMapTool()
out_bams = []
for f in os.listdir(tmp_dir):
out_lb_bam = mkstempfname('.bam')
out_bams.append(out_lb_bam)
library_sam = os.path.join(tmp_dir, f)

log.info("executing BBMap clumpify on library " + library_sam)
bbmap.dedup_clumpify(library_sam, out_lb_bam, subs=max_mismatches, JVMmemory=JVMmemory)

with util.file.fifo(name='merged.sam') as merged_bam:
merge_opts = ['SORT_ORDER=queryname']
tools.picard.MergeSamFilesTool().execute(out_bams, merged_bam, picardOptions=merge_opts, JVMmemory=JVMmemory, background=True)
tools.picard.ReplaceSamHeaderTool().execute(merged_bam, in_bam, out_bam, JVMmemory=JVMmemory)


def parser_rmdup_clumpify_bam(parser=argparse.ArgumentParser()):
parser.add_argument('in_bam', help='Input reads, BAM format.')
parser.add_argument('out_bam', help='Output reads, BAM format.')
parser.add_argument(
'--maxMismatches',
dest="max_mismatches",
type=int,
default=3,
help='The max number of base mismatches to allow when identifying duplicate reads. (default: %(default)s)')
parser.add_argument(
'--JVMmemory',
default=tools.picard.FilterSamReadsTool.jvmMemDefault,
help='JVM virtual memory size (default: %(default)s)',
dest='JVMmemory'
)
util.cmd.common_args(parser, (('loglevel', None), ('version', None), ('tmp_dir', None)))
util.cmd.attach_main(parser, rmdup_clumpify_bam, split_args=True)
return parser
__commands__.append(('rmdup_clumpify_bam', parser_rmdup_clumpify_bam))
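
Note: the tools/bbmap.py side of this change (BBMapTool().dedup_clumpify) isn't included in this diff. Below is a minimal sketch of what such a wrapper could do, assuming the conda-provided picard and clumpify.sh entry points; it is illustrative only and not the actual viral-ngs implementation (paired-end handling, read-group preservation, and temp-file cleanup are elided):

# Illustrative sketch of a clumpify dedup wrapper (assumed names; not the real tools/bbmap.py)
import os
import subprocess
import tempfile

def dedup_clumpify_sketch(in_bam, out_bam, subs=3, containment=True, JVMmemory='2g'):
    '''Deduplicate an unaligned BAM by round-tripping through interleaved FASTQ
    and running clumpify.sh with dedupe=t.'''
    tmp_dir = tempfile.mkdtemp()
    fq_in = os.path.join(tmp_dir, 'reads.fastq')
    fq_out = os.path.join(tmp_dir, 'reads.dedup.fastq')

    # unaligned BAM -> interleaved FASTQ
    subprocess.check_call([
        'picard', 'SamToFastq',
        'I=' + in_bam, 'FASTQ=' + fq_in, 'INTERLEAVE=true'])

    # clumpify.sh: dedupe=t removes duplicates, subs= is the mismatch tolerance
    # (exposed upstream as --maxMismatches), containment=t also collapses reads
    # fully contained within longer reads
    subprocess.check_call([
        'clumpify.sh', '-Xmx' + JVMmemory,
        'in=' + fq_in, 'out=' + fq_out,
        'dedupe=t', 'subs=%d' % subs,
        'containment=' + ('t' if containment else 'f')])

    # FASTQ -> unaligned BAM; rmdup_clumpify_bam above restores the original
    # header and read groups afterwards via Picard ReplaceSamHeader
    subprocess.check_call([
        'picard', 'FastqToSam',
        'FASTQ=' + fq_out, 'OUTPUT=' + out_bam,
        'SAMPLE_NAME=' + os.path.splitext(os.path.basename(in_bam))[0]])

From the command line, the new subcommand defined above is invoked as read_utils.py rmdup_clumpify_bam in.bam out.dedup.bam --maxMismatches 3 --JVMmemory 2g, with the flags exactly as declared in parser_rmdup_clumpify_bam.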


def _merge_fastqs_and_mvicuna(lb, files):
readList = mkstempfname('.keep_reads.txt')
2 changes: 1 addition & 1 deletion requirements-conda.txt
@@ -1,5 +1,5 @@
blast=2.7.1
bbmap=38.56
bbmap=38.71
bmtagger=3.101
bwa=0.7.17
cd-hit=4.6.8
14 changes: 6 additions & 8 deletions taxon_filter.py
@@ -109,9 +109,12 @@ def main_deplete(args):
tags_to_clear = args.tags_to_clear,
picardOptions = ['MAX_DISCARD_FRACTION=0.5'],
JVMmemory = args.JVMmemory,
sanitize = not args.do_not_sanitize) as bamToDeplete:
sanitize = not args.do_not_sanitize) as bam_to_dedup:

read_utils.rmdup_mvicuna_bam(bam_to_dedup, args.rmdupBam, JVMmemory=args.JVMmemory)

Member:
In this new world, should we consider:

  1. dropping deduplication entirely from taxon_filter.deplete -- since you now include it in all the pipelines prior to depletion anyway, and since it never really fit the scope of the command's name; it was historically embedded in a funny place between the depletion steps primarily because of its performance profile: it was slower than bmtagger (so we ran it after that) but faster than blastn (so we ran it before that)
  2. dropping mvicuna altogether if we think bbmap is better


multi_db_deplete_bam(
bamToDeplete,
args.rmdupBam,
args.bwaDbs,
deplete_bwa_bam,
args.bwaBam,
Expand All @@ -129,13 +132,8 @@ def bmtagger_wrapper(inBam, db, outBam, JVMmemory=None):
JVMmemory=args.JVMmemory
)

# if the user has not specified saving a revertBam, we used a temp file and can remove it
if not args.revertBam:
os.unlink(revertBamOut)

read_utils.rmdup_mvicuna_bam(args.bmtaggerBam, args.rmdupBam, JVMmemory=args.JVMmemory)
multi_db_deplete_bam(
args.rmdupBam,
args.bmtaggerBam,
args.blastDbs,
deplete_blastn_bam,
args.blastnBam,
Binary file modified test/input/TestDepleteHuman/expected/test-reads.blastn.bam
Binary file not shown.
Binary file modified test/input/TestDepleteHuman/expected/test-reads.bmtagger.bam
Binary file not shown.
Binary file modified test/input/TestDepleteHuman/expected/test-reads.bwa.bam
Binary file not shown.
Binary file modified test/input/TestDepleteHuman/expected/test-reads.rmdup.bam
Binary file not shown.
Binary file modified test/input/TestDepleteHuman/test-reads.bam
Binary file not shown.
4 changes: 2 additions & 2 deletions test/input/WDL/test_inputs-demux_plus-dnanexus.dx.json
@@ -11,6 +11,6 @@
"stage-0.maxMismatches": 0,
"stage-0.minimumQuality": 25,

"stage-5.krakenuniq_db_tar_lz4": { "$dnanexus_link": { "project": "project-F8PQ6380xf5bK0Qk0YPjB17P", "id": "file-FVYQqP006zFF064QBGf022X1" } },
"stage-5.krona_taxonomy_db_tgz": { "$dnanexus_link": { "project": "project-F8PQ6380xf5bK0Qk0YPjB17P", "id": "file-F4z0fgj07FZ8jg8yP7yz0Qzb" } }
"stage-6.krakenuniq_db_tar_lz4": { "$dnanexus_link": { "project": "project-F8PQ6380xf5bK0Qk0YPjB17P", "id": "file-FVYQqP006zFF064QBGf022X1" } },
"stage-6.krona_taxonomy_db_tgz": { "$dnanexus_link": { "project": "project-F8PQ6380xf5bK0Qk0YPjB17P", "id": "file-F4z0fgj07FZ8jg8yP7yz0Qzb" } }
}
32 changes: 32 additions & 0 deletions test/unit/test_read_utils.py
@@ -14,6 +14,7 @@
import tools
import tools.bwa
import tools.samtools
import tools.bbmap
import util
import util.file
from test import TestCaseWithTmp, assert_equal_bam_reads
@@ -188,6 +189,37 @@ def test_cdhit_empty_input(self):
)
self.assertEqual(samtools.count(output_bam), 0)

def test_bbmap_canned_input(self):
samtools = tools.samtools.SamtoolsTool()

input_bam = os.path.join(util.file.get_test_input_path(self), 'input.bam')
expected_bam = os.path.join(util.file.get_test_input_path(self), 'expected_clumpify.bam')
output_bam = util.file.mkstempfname("output.bam")
read_utils.rmdup_clumpify_bam(
input_bam,
output_bam,
JVMmemory='1g'
)

starting_count = samtools.count(input_bam)
target_count = samtools.count(expected_bam)
output_count = samtools.count(output_bam)

# check that the target count is within 3% of the expected count
self.assertAlmostEqual(output_count, target_count, delta=target_count*0.03, msg="{} not deduplicated to the target size of {} (observed: {}->{})".format(os.path.basename(output_bam),target_count,starting_count,output_count))

def test_bbmap_empty_input(self):
samtools = tools.samtools.SamtoolsTool()

empty_bam = os.path.join(util.file.get_test_input_path(), 'empty.bam')
output_bam = util.file.mkstempfname("output.bam")
read_utils.rmdup_clumpify_bam(
empty_bam,
output_bam,
JVMmemory='1g'
)
self.assertEqual(samtools.count(output_bam), 0)


class TestMvicuna(TestCaseWithTmp):
"""
25 changes: 19 additions & 6 deletions test/unit/test_tools_bbmap.py
@@ -22,10 +22,23 @@ def setUp(self):

def test_align(self):
orig_ref = os.path.join(util.file.get_test_input_path(), 'ebola.fasta')
inRef = util.file.mkstempfname('.fasta')
shutil.copyfile(orig_ref, inRef)
in_ref = util.file.mkstempfname('.fasta')
shutil.copyfile(orig_ref, in_ref)
reads = os.path.join(util.file.get_test_input_path(self), 'ebov_reads.bam')
outBam = util.file.mkstempfname('.bam')
self.bbmap.align(inBam=reads, refFasta=inRef, outBam=outBam)
self.assertTrue(os.path.isfile(outBam))
self.assertTrue(os.path.getsize(outBam))
out_bam = util.file.mkstempfname('.bam')
self.bbmap.align(in_bam=reads, ref_fasta=in_ref, out_bam=out_bam)
self.assertTrue(os.path.isfile(out_bam))
self.assertTrue(os.path.getsize(out_bam))

def test_dedup_clumpify(self):
reads = os.path.join(util.file.get_test_input_path(self), 'ebov_reads.bam')
expected_bam = os.path.join(util.file.get_test_input_path(self), 'ebov_reads_clumpify_dedup_expected.bam')
out_bam = util.file.mkstempfname('.bam')
self.bbmap.dedup_clumpify(in_bam=reads, out_bam=out_bam)

target_count = self.samtools.count(expected_bam)

self.assertTrue(os.path.isfile(out_bam))
self.assertTrue(os.path.getsize(out_bam))
# check that the target count is within 1% of the expected count
self.assertAlmostEqual(self.samtools.count(out_bam), target_count, delta=target_count*0.01, msg="{} not deduplicated to the target size: {}".format(os.path.basename(out_bam),target_count))