
add clumpify-based dedup #970

Open
wants to merge 56 commits into base: master from ct-add-clumpify

Changes from 34 commits

Commits (56)
a5d58b5
add bbmap.BBMapTool().dedup_clumpify()
tomkinsc Jul 2, 2019
595764e
pass JVMmemory; add read_utils.rmdup_clumpify_bam; dedup_bam WDL task
tomkinsc Jul 2, 2019
09901d3
switch from mvicuna to clumpify-based dedup in taxon_filter.py deplete
tomkinsc Jul 2, 2019
df208ea
replace unicode apostrophe
tomkinsc Jul 2, 2019
98ac4fc
reduce clumpify max_mismatches 5->3
tomkinsc Jul 2, 2019
784877a
dump dx-toolkit version and update URL to reflect new source
tomkinsc Jul 2, 2019
232f9cd
dedup prior to metagenomics classification in WDL workflows
tomkinsc Jul 2, 2019
c01bb5b
add missing import
tomkinsc Jul 3, 2019
e25ef52
rename read_utils.wdl -> downsample.wdl, dedup.wdl
tomkinsc Jul 3, 2019
6ba96d4
rename dedup_bam wdl workflow to "dedup"
tomkinsc Jul 3, 2019
e8a4081
increase dx instance size for dedup and memory spec
tomkinsc Jul 3, 2019
8280063
correct argparse parser attachement for rmdup_clumpify_bam
tomkinsc Jul 3, 2019
b86b1c9
wrap WDL variable in dedup command block for var interpolation
tomkinsc Jul 3, 2019
8afe18f
avoid collision
tomkinsc Jul 4, 2019
7d2f45a
Merge branch 'master' into ct-add-clumpify
tomkinsc Jul 10, 2019
f48038e
Merge branch 'master' into ct-add-clumpify
tomkinsc Jul 10, 2019
c78f246
Merge branch 'master' into ct-add-clumpify
tomkinsc Jul 11, 2019
72fb4cd
add sambamba since bbtools looks for it?
tomkinsc Jul 11, 2019
d97f773
Merge branch 'master' into ct-add-clumpify
tomkinsc Jul 13, 2019
a685a8a
remove sambamba
tomkinsc Jul 13, 2019
a2ce0f1
specify containment=t for bbmap clumpify
tomkinsc Aug 1, 2019
6f26717
Merge branch 'master' into ct-add-clumpify
tomkinsc Aug 26, 2019
b73950e
Merge branch 'master' into ct-add-clumpify
tomkinsc Sep 11, 2019
a3010ea
Merge branch 'master' into ct-add-clumpify
tomkinsc Sep 11, 2019
8199b12
enforce containment=False; more tolerant bbmap unit test
tomkinsc Sep 11, 2019
6bb3f6b
update miniconda ssl certs
tomkinsc Sep 12, 2019
97eff11
increase debug info emitted by build-conda.sh
tomkinsc Sep 20, 2019
f6f9b85
Merge branch 'master' into ct-add-clumpify
tomkinsc Sep 20, 2019
1674c44
Merge branch 'master' into ct-add-clumpify
tomkinsc Oct 3, 2019
4ca4693
Merge branch 'master' into ct-add-clumpify
tomkinsc Nov 7, 2019
8f8aaae
bump bbmap to 38.71; set containment=True for clumpify
tomkinsc Nov 7, 2019
218a12b
Merge branch 'ct-add-clumpify' of ssh://github.com/broadinstitute/vir…
tomkinsc Nov 7, 2019
fa5e01e
update stage number
tomkinsc Nov 7, 2019
a0735c7
set bbmap jvmMemDefault='2g'; 1g for clumpify test
tomkinsc Nov 8, 2019
663deba
no longer skip demux_metag from validation/compilation
tomkinsc Nov 20, 2019
5a7ed3b
demux_plus/demux_metag: merge linear parts of scatters, run spike-in …
tomkinsc Nov 20, 2019
2be4a85
add DNAnexus defaults for demux_metag, set inputs in demux_metag
tomkinsc Nov 20, 2019
1d691b2
rmdup_clumpify_bam: preserve sortorder value of input bam
tomkinsc Nov 20, 2019
1ba7415
bump bbmap version 38.71 -> 38.73
tomkinsc Nov 20, 2019
bb589a1
fix bug in conda command quiet calling
tomkinsc Nov 20, 2019
472703b
maintain RG info in clumpify dedup; move processing to bbmap.py
tomkinsc Nov 20, 2019
ca726d0
demux_plus/demux_metag: update dx defaults and pass explicitly in wor…
tomkinsc Nov 20, 2019
3f9f188
remove redundant defaults from dx wdl test inputs
tomkinsc Nov 20, 2019
995cf0d
move krakenuniq back outside scatter
tomkinsc Nov 20, 2019
d54eff3
respecify kaiju deps
tomkinsc Nov 20, 2019
21a6ac4
WDL dedup_bam: report read count before & after dedup
tomkinsc Nov 20, 2019
13f5172
switch to clumpify for downsample dedup
tomkinsc Nov 21, 2019
c1d18be
change to clumpify for pre-depletion dedup
tomkinsc Nov 21, 2019
7c45da6
--JVMmemory=1g for TestDepleteHuman
tomkinsc Nov 21, 2019
d91eca5
remove rmdup from depletion call
tomkinsc Nov 21, 2019
12f73cb
expand arguments exposed for clumpify dedup
tomkinsc Nov 21, 2019
16a2b50
update expected depletion output now that we're not running dedup on it
tomkinsc Nov 21, 2019
f1f9a40
scatter/gather clumpify dedup across libraries
tomkinsc Dec 2, 2019
49bffcb
Merge branch 'master' into ct-add-clumpify
tomkinsc Dec 3, 2019
362d0f3
pass through single-end IDs for bbmap dedup
tomkinsc Mar 27, 2020
f816f6b
Merge branch 'master' into ct-add-clumpify
tomkinsc Mar 27, 2020
13 changes: 12 additions & 1 deletion pipes/WDL/workflows/classify_kaiju.wdl
@@ -1,5 +1,16 @@
import "tasks_metagenomics.wdl" as metagenomics
import "tasks_read_utils.wdl" as reads

workflow classify_kaiju {
call metagenomics.kaiju
Array[File] unclassified_bams
scatter(reads_bam in unclassified_bams) {
call reads.dedup_bam as dedup {
input:
in_bam = reads_bam
}
}
call metagenomics.kaiju {
input:
reads_unmapped_bam = dedup.dedup_bam
}
}
13 changes: 12 additions & 1 deletion pipes/WDL/workflows/classify_krakenuniq.wdl
@@ -1,8 +1,19 @@
import "tasks_metagenomics.wdl" as metagenomics
import "tasks_reports.wdl" as reports
import "tasks_read_utils.wdl" as reads

workflow classify_krakenuniq {
call metagenomics.krakenuniq
Array[File] unclassified_bams
scatter(reads_bam in unclassified_bams) {
call reads.dedup_bam as dedup {
input:
in_bam = reads_bam
}
}
call metagenomics.krakenuniq {
input:
reads_unmapped_bam = dedup.dedup_bam
}

call reports.aggregate_metagenomics_reports as metag_summary_report {
input:
5 changes: 5 additions & 0 deletions pipes/WDL/workflows/dedup.wdl
@@ -0,0 +1,5 @@
import "tasks_read_utils.wdl" as reads

workflow dedup {
call reads.dedup_bam
}
16 changes: 12 additions & 4 deletions pipes/WDL/workflows/demux_metag.wdl
@@ -5,18 +5,26 @@ import "tasks_metagenomics.wdl" as metagenomics
import "tasks_taxon_filter.wdl" as taxon_filter

Member:
If you think this is ready and want to try it out, shouldn't you remove #DX_SKIP_WORKFLOW?

import "tasks_assembly.wdl" as assembly
import "tasks_reports.wdl" as reports
import "tasks_read_utils.wdl" as reads

workflow demux_metag {
call demux.illumina_demux as illumina_demux

scatter(raw_reads in illumina_demux.raw_reads_unaligned_bams) {
call reads.dedup_bam as dedup {
input:
in_bam = raw_reads
}
}

scatter(reads_bam in dedup.dedup_bam) {

Member:
I wouldn't double-scatter. You can just keep this as a single scatter block on the raw_reads and put all the task calls together in that single scatter. WDL interpreters/compilers are smart enough to figure out the DAG and parallelization opportunities within the scatter based on the dependencies between their inputs and outputs.

Member:
So specifically:

  scatter(raw_reads in illumina_demux.raw_reads_unaligned_bams) {
    call reads.dedup_bam as dedup {
        input:
            in_bam = raw_reads        
    }
    call reports.spikein_report as spikein {
      input:
        reads_bam = dedup.dedup_bam
    }
    call taxon_filter.deplete_taxa as deplete {
      input:
        raw_reads_unmapped_bam = dedup.dedup_bam
    }
    call assembly.assemble as spades {
      input:
        assembler = "spades",
        reads_unmapped_bam = deplete.cleaned_bam
    }
  }

Member:
Two more thoughts on this topic:

  1. Importantly, merging them would allow the execution platform to keep going on the linear DAG portions of each sample as they become ready without waiting for all samples to complete dedup before proceeding to the next steps.
  2. I wonder if we should consider running the spike-in counting step on raw / non-deduplicated reads... ERCCs are so short that we might quickly hit an artificial upper bound on the counts if we run it on dedup output (a sketch of this is below).
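
For point 2, a minimal sketch of what that might look like, reusing the merged single-scatter layout from the previous comment (only the spike-in input changes; the task and variable names are the ones already used in this PR):

  scatter(raw_reads in illumina_demux.raw_reads_unaligned_bams) {
    call reads.dedup_bam as dedup {
      input:
        in_bam = raw_reads
    }
    call reports.spikein_report as spikein {
      input:
        reads_bam = raw_reads    # count spike-ins on raw reads, before dedup
    }
    call taxon_filter.deplete_taxa as deplete {
      input:
        raw_reads_unmapped_bam = dedup.dedup_bam
    }
  }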

call reports.spikein_report as spikein {
input:
reads_bam = raw_reads
reads_bam = reads_bam
}
call taxon_filter.deplete_taxa as deplete {
input:
raw_reads_unmapped_bam = raw_reads
raw_reads_unmapped_bam = reads_bam
}
call assembly.assemble as spades {
input:
@@ -27,7 +35,7 @@ workflow demux_metag {

call metagenomics.krakenuniq as kraken {
input:
reads_unmapped_bam = illumina_demux.raw_reads_unaligned_bams,
reads_unmapped_bam = dedup.dedup_bam
}
call reports.aggregate_metagenomics_reports as metag_summary_report {

Member:
Can we call aggregate_metagenomics_reports a second time on the kaiju outputs as well?

Member Author:
metagenomics.py::taxlevel_summary() hasn't been adapted/tested to read kaiju summary files yet. I'd like that to be a separate PR (this one is already way beyond its initial scope).

input:
@@ -39,6 +47,6 @@ workflow demux_metag {
}
call metagenomics.kaiju as kaiju {
input:

Member:
We haven't really used kaiju regularly via WDL yet, but I'm betting we'll want to move it to a scatter-on-single-sample execution mode (like everything else in our WDLs except kraken). Its database is about 4x smaller (I'm guessing localization takes just a few minutes) and its runtime is much longer, so the cost efficiency (algorithmic compute time vs. VM wall-clock time) of kaiju on a single sample is much better than kraken's... so we might as well move it into the same scatter block as well (a rough sketch follows).
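
A rough sketch of what that could look like, assuming the kaiju task were adapted to accept a single BAM (today it takes the full array), inside the same per-sample scatter used above:

  scatter(raw_reads in illumina_demux.raw_reads_unaligned_bams) {
    call reads.dedup_bam as dedup {
      input:
        in_bam = raw_reads
    }
    # hypothetical single-sample variant of the kaiju task
    call metagenomics.kaiju as kaiju {
      input:
        reads_unmapped_bam = dedup.dedup_bam
    }
  }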

reads_unmapped_bam = illumina_demux.raw_reads_unaligned_bams,
reads_unmapped_bam = dedup.dedup_bam
}
}
15 changes: 12 additions & 3 deletions pipes/WDL/workflows/demux_plus.wdl
@@ -3,6 +3,7 @@ import "tasks_metagenomics.wdl" as metagenomics
import "tasks_taxon_filter.wdl" as taxon_filter
import "tasks_assembly.wdl" as assembly
import "tasks_reports.wdl" as reports
import "tasks_read_utils.wdl" as reads

workflow demux_plus {

@@ -13,15 +14,23 @@ workflow demux_plus {
Array[File]? bmtaggerDbs # .tar.gz, .tgz, .tar.bz2, .tar.lz4, .fasta, or .fasta.gz
Array[File]? blastDbs # .tar.gz, .tgz, .tar.bz2, .tar.lz4, .fasta, or .fasta.gz
Array[File]? bwaDbs

scatter(raw_reads in illumina_demux.raw_reads_unaligned_bams) {
call reads.dedup_bam as dedup {
input:
in_bam = raw_reads
}
}

scatter(reads_bam in dedup.dedup_bam) {

Member:
See my comment in demux_metag about combining the scatter blocks.

call reports.spikein_report as spikein {
input:
reads_bam = raw_reads,
reads_bam = reads_bam,
spikein_db = spikein_db
}
call taxon_filter.deplete_taxa as deplete {
input:
raw_reads_unmapped_bam = raw_reads,
raw_reads_unmapped_bam = reads_bam,
bmtaggerDbs = bmtaggerDbs,
blastDbs = blastDbs,
bwaDbs = bwaDbs
@@ -37,7 +46,7 @@

call metagenomics.krakenuniq as krakenuniq {
input:
reads_unmapped_bam = illumina_demux.raw_reads_unaligned_bams
reads_unmapped_bam = dedup.dedup_bam
}

call reports.spikein_summary as spike_summary {
File renamed without changes.
28 changes: 28 additions & 0 deletions pipes/WDL/workflows/tasks/tasks_read_utils.wdl
@@ -37,4 +37,32 @@ task downsample_bams {
}
}

task dedup_bam {
File in_bam
Int? max_mismatches=3

String sample_name = basename(in_bam, ".bam")

command {
read_utils.py rmdup_clumpify_bam \
${in_bam} \
${sample_name}.dedup.bam \
${'--maxMismatches=' + max_mismatches} \
--JVMmemory "8g"

reports.py fastqc ${sample_name}.dedup.bam ${sample_name}.deduped_fastqc.html
}

output {
File dedup_bam = "${sample_name}.dedup.bam"
File dedup_only_reads_fastqc = "${sample_name}.deduped_fastqc.html"
String viralngs_version = "viral-ngs_version_unknown"
}
runtime {
docker: "quay.io/broadinstitute/viral-ngs"
memory: "52 GB"
cpu: 8
dx_instance_type: "mem1_ssd1_x32"
preemptible: 0
}
}
46 changes: 45 additions & 1 deletion read_utils.py
@@ -26,6 +26,7 @@
import util.file
import util.misc
from util.file import mkstempfname
import tools.bbmap
import tools.bwa
import tools.cdhit
import tools.picard
@@ -902,9 +903,52 @@ def parser_rmdup_cdhit_bam(parser=argparse.ArgumentParser()):
util.cmd.common_args(parser, (('loglevel', None), ('version', None), ('tmp_dir', None)))
util.cmd.attach_main(parser, rmdup_cdhit_bam, split_args=True)
return parser
__commands__.append(('rmdup_cdhit_bam', parser_rmdup_cdhit_bam))


__commands__.append(('rmdup_cdhit_bam', parser_rmdup_cdhit_bam))
def rmdup_clumpify_bam(in_bam, out_bam, max_mismatches=3, JVMmemory=None):
''' Remove duplicate reads from BAM file using bbmap's clumpify tool.
'''
tmp_dir = tempfile.mkdtemp()

tools.picard.SplitSamByLibraryTool().execute(in_bam, tmp_dir)

bbmap = tools.bbmap.BBMapTool()
out_bams = []
for f in os.listdir(tmp_dir):
out_lb_bam = mkstempfname('.bam')
out_bams.append(out_lb_bam)
library_sam = os.path.join(tmp_dir, f)

log.info("executing BBMap clumpify on library " + library_sam)
bbmap.dedup_clumpify(library_sam, out_lb_bam, subs=max_mismatches, JVMmemory=JVMmemory)

with util.file.fifo(name='merged.sam') as merged_bam:
merge_opts = ['SORT_ORDER=queryname']
tools.picard.MergeSamFilesTool().execute(out_bams, merged_bam, picardOptions=merge_opts, JVMmemory=JVMmemory, background=True)
tools.picard.ReplaceSamHeaderTool().execute(merged_bam, in_bam, out_bam, JVMmemory=JVMmemory)


def parser_rmdup_clumpify_bam(parser=argparse.ArgumentParser()):
parser.add_argument('in_bam', help='Input reads, BAM format.')
parser.add_argument('out_bam', help='Output reads, BAM format.')
parser.add_argument(
'--maxMismatches',
dest="max_mismatches",
type=int,
default=3,
help='The max number of base mismatches to allow when identifying duplicate reads. (default: %(default)s)')
parser.add_argument(
'--JVMmemory',
default=tools.picard.FilterSamReadsTool.jvmMemDefault,
help='JVM virtual memory size (default: %(default)s)',
dest='JVMmemory'
)
util.cmd.common_args(parser, (('loglevel', None), ('version', None), ('tmp_dir', None)))
util.cmd.attach_main(parser, rmdup_clumpify_bam, split_args=True)
return parser
__commands__.append(('rmdup_clumpify_bam', parser_rmdup_clumpify_bam))
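
Note: the tools/bbmap.py side of this change (BBMapTool().dedup_clumpify) isn't included in this diff. Below is a minimal sketch of what such a wrapper could do, assuming the conda-provided picard and clumpify.sh entry points; it is illustrative only and not the actual viral-ngs implementation (paired-end handling, read-group preservation, and temp-file cleanup are elided):

# Illustrative sketch of a clumpify dedup wrapper (assumed names; not the real tools/bbmap.py)
import os
import subprocess
import tempfile

def dedup_clumpify_sketch(in_bam, out_bam, subs=3, containment=True, JVMmemory='2g'):
    '''Deduplicate an unaligned BAM by round-tripping through interleaved FASTQ
    and running clumpify.sh with dedupe=t.'''
    tmp_dir = tempfile.mkdtemp()
    fq_in = os.path.join(tmp_dir, 'reads.fastq')
    fq_out = os.path.join(tmp_dir, 'reads.dedup.fastq')

    # unaligned BAM -> interleaved FASTQ
    subprocess.check_call([
        'picard', 'SamToFastq',
        'I=' + in_bam, 'FASTQ=' + fq_in, 'INTERLEAVE=true'])

    # clumpify.sh: dedupe=t removes duplicates, subs= is the mismatch tolerance
    # (exposed upstream as --maxMismatches), containment=t also collapses reads
    # fully contained within longer reads
    subprocess.check_call([
        'clumpify.sh', '-Xmx' + JVMmemory,
        'in=' + fq_in, 'out=' + fq_out,
        'dedupe=t', 'subs=%d' % subs,
        'containment=' + ('t' if containment else 'f')])

    # FASTQ -> unaligned BAM; rmdup_clumpify_bam above restores the original
    # header and read groups afterwards via Picard ReplaceSamHeader
    subprocess.check_call([
        'picard', 'FastqToSam',
        'FASTQ=' + fq_out, 'OUTPUT=' + out_bam,
        'SAMPLE_NAME=' + os.path.splitext(os.path.basename(in_bam))[0]])

From the command line, the new subcommand defined above is invoked as read_utils.py rmdup_clumpify_bam in.bam out.dedup.bam --maxMismatches 3 --JVMmemory 2g, with the flags exactly as declared in parser_rmdup_clumpify_bam.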


def _merge_fastqs_and_mvicuna(lb, files):
readList = mkstempfname('.keep_reads.txt')
2 changes: 1 addition & 1 deletion requirements-conda.txt
@@ -1,5 +1,5 @@
blast=2.7.1
bbmap=38.56
bbmap=38.71
bmtagger=3.101
bwa=0.7.17
cd-hit=4.6.8
14 changes: 6 additions & 8 deletions taxon_filter.py
@@ -109,9 +109,12 @@ def main_deplete(args):
tags_to_clear = args.tags_to_clear,
picardOptions = ['MAX_DISCARD_FRACTION=0.5'],
JVMmemory = args.JVMmemory,
sanitize = not args.do_not_sanitize) as bamToDeplete:
sanitize = not args.do_not_sanitize) as bam_to_dedup:

read_utils.rmdup_mvicuna_bam(bam_to_dedup, args.rmdupBam, JVMmemory=args.JVMmemory)

Member:
In this new world, should we consider:

  1. dropping deduplication entirely from taxon_filter.deplete -- since you now include it in all the pipelines prior to depletion anyway, and since it never really fit the scope of the command's name; it was historically embedded in a funny place between the depletion steps primarily because of its performance profile: it was slower than bmtagger (so we ran it after that) but faster than blastn (so we ran it before that)
  2. dropping mvicuna altogether if we think bbmap is better


multi_db_deplete_bam(
bamToDeplete,
args.rmdupBam,
args.bwaDbs,
deplete_bwa_bam,
args.bwaBam,
Expand All @@ -129,13 +132,8 @@ def bmtagger_wrapper(inBam, db, outBam, JVMmemory=None):
JVMmemory=args.JVMmemory
)

# if the user has not specified saving a revertBam, we used a temp file and can remove it
if not args.revertBam:
os.unlink(revertBamOut)

read_utils.rmdup_mvicuna_bam(args.bmtaggerBam, args.rmdupBam, JVMmemory=args.JVMmemory)
multi_db_deplete_bam(
args.rmdupBam,
args.bmtaggerBam,
args.blastDbs,
deplete_blastn_bam,
args.blastnBam,
Binary file modified test/input/TestDepleteHuman/expected/test-reads.blastn.bam
Binary file not shown.
Binary file modified test/input/TestDepleteHuman/expected/test-reads.bmtagger.bam
Binary file not shown.
Binary file modified test/input/TestDepleteHuman/expected/test-reads.bwa.bam
Binary file not shown.
Binary file modified test/input/TestDepleteHuman/expected/test-reads.rmdup.bam
Binary file not shown.
Binary file modified test/input/TestDepleteHuman/test-reads.bam
Binary file not shown.
4 changes: 2 additions & 2 deletions test/input/WDL/test_inputs-demux_plus-dnanexus.dx.json
@@ -11,6 +11,6 @@
"stage-0.maxMismatches": 0,
"stage-0.minimumQuality": 25,

"stage-5.krakenuniq_db_tar_lz4": { "$dnanexus_link": { "project": "project-F8PQ6380xf5bK0Qk0YPjB17P", "id": "file-FVYQqP006zFF064QBGf022X1" } },
"stage-5.krona_taxonomy_db_tgz": { "$dnanexus_link": { "project": "project-F8PQ6380xf5bK0Qk0YPjB17P", "id": "file-F4z0fgj07FZ8jg8yP7yz0Qzb" } }
"stage-6.krakenuniq_db_tar_lz4": { "$dnanexus_link": { "project": "project-F8PQ6380xf5bK0Qk0YPjB17P", "id": "file-FVYQqP006zFF064QBGf022X1" } },
"stage-6.krona_taxonomy_db_tgz": { "$dnanexus_link": { "project": "project-F8PQ6380xf5bK0Qk0YPjB17P", "id": "file-F4z0fgj07FZ8jg8yP7yz0Qzb" } }
}
32 changes: 32 additions & 0 deletions test/unit/test_read_utils.py
@@ -14,6 +14,7 @@
import tools
import tools.bwa
import tools.samtools
import tools.bbmap
import util
import util.file
from test import TestCaseWithTmp, assert_equal_bam_reads
@@ -188,6 +189,37 @@ def test_cdhit_empty_input(self):
)
self.assertEqual(samtools.count(output_bam), 0)

def test_bbmap_canned_input(self):
samtools = tools.samtools.SamtoolsTool()

input_bam = os.path.join(util.file.get_test_input_path(self), 'input.bam')
expected_bam = os.path.join(util.file.get_test_input_path(self), 'expected_clumpify.bam')
output_bam = util.file.mkstempfname("output.bam")
read_utils.rmdup_clumpify_bam(
input_bam,
output_bam,
JVMmemory='1g'
)

starting_count = samtools.count(input_bam)
target_count = samtools.count(expected_bam)
output_count = samtools.count(output_bam)

# check that the target count is within 3% of the expected count
self.assertAlmostEqual(output_count, target_count, delta=target_count*0.03, msg="{} not deduplicated to the target size of {} (observed: {}->{})".format(os.path.basename(output_bam),target_count,starting_count,output_count))

def test_bbmap_empty_input(self):
samtools = tools.samtools.SamtoolsTool()

empty_bam = os.path.join(util.file.get_test_input_path(), 'empty.bam')
output_bam = util.file.mkstempfname("output.bam")
read_utils.rmdup_clumpify_bam(
empty_bam,
output_bam,
JVMmemory='1g'
)
self.assertEqual(samtools.count(output_bam), 0)


class TestMvicuna(TestCaseWithTmp):
"""
25 changes: 19 additions & 6 deletions test/unit/test_tools_bbmap.py
@@ -22,10 +22,23 @@ def setUp(self):

def test_align(self):
orig_ref = os.path.join(util.file.get_test_input_path(), 'ebola.fasta')
inRef = util.file.mkstempfname('.fasta')
shutil.copyfile(orig_ref, inRef)
in_ref = util.file.mkstempfname('.fasta')
shutil.copyfile(orig_ref, in_ref)
reads = os.path.join(util.file.get_test_input_path(self), 'ebov_reads.bam')
outBam = util.file.mkstempfname('.bam')
self.bbmap.align(inBam=reads, refFasta=inRef, outBam=outBam)
self.assertTrue(os.path.isfile(outBam))
self.assertTrue(os.path.getsize(outBam))
out_bam = util.file.mkstempfname('.bam')
self.bbmap.align(in_bam=reads, ref_fasta=in_ref, out_bam=out_bam)
self.assertTrue(os.path.isfile(out_bam))
self.assertTrue(os.path.getsize(out_bam))

def test_dedup_clumpify(self):
reads = os.path.join(util.file.get_test_input_path(self), 'ebov_reads.bam')
expected_bam = os.path.join(util.file.get_test_input_path(self), 'ebov_reads_clumpify_dedup_expected.bam')
out_bam = util.file.mkstempfname('.bam')
self.bbmap.dedup_clumpify(in_bam=reads, out_bam=out_bam)

target_count = self.samtools.count(expected_bam)

self.assertTrue(os.path.isfile(out_bam))
self.assertTrue(os.path.getsize(out_bam))
# check that the target count is within 1% of the expected count
self.assertAlmostEqual(self.samtools.count(out_bam), target_count, delta=target_count*0.01, msg="{} not deduplicated to the target size: {}".format(os.path.basename(out_bam),target_count))