-
Notifications
You must be signed in to change notification settings - Fork 67
add clumpify-based dedup #970
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from 34 commits
a5d58b5
595764e
09901d3
df208ea
98ac4fc
784877a
232f9cd
c01bb5b
e25ef52
6ba96d4
e8a4081
8280063
b86b1c9
8afe18f
7d2f45a
f48038e
c78f246
72fb4cd
d97f773
a685a8a
a2ce0f1
6f26717
b73950e
a3010ea
8199b12
6bb3f6b
97eff11
f6f9b85
1674c44
4ca4693
8f8aaae
218a12b
fa5e01e
a0735c7
663deba
5a7ed3b
2be4a85
1d691b2
1ba7415
bb589a1
472703b
ca726d0
3f9f188
995cf0d
d54eff3
21a6ac4
13f5172
c1d18be
7c45da6
d91eca5
12f73cb
16a2b50
f1f9a40
49bffcb
362d0f3
f816f6b
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,5 +1,16 @@ | ||
import "tasks_metagenomics.wdl" as metagenomics | ||
import "tasks_read_utils.wdl" as reads | ||
|
||
workflow classify_kaiju { | ||
call metagenomics.kaiju | ||
Array[File] unclassified_bams | ||
scatter(reads_bam in unclassified_bams) { | ||
call reads.dedup_bam as dedup { | ||
input: | ||
in_bam = reads_bam | ||
} | ||
} | ||
call metagenomics.kaiju { | ||
input: | ||
reads_unmapped_bam = dedup.dedup_bam | ||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
import "tasks_read_utils.wdl" as reads | ||
|
||
workflow dedup { | ||
call reads.dedup_bam | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -5,18 +5,26 @@ import "tasks_metagenomics.wdl" as metagenomics | |
import "tasks_taxon_filter.wdl" as taxon_filter | ||
import "tasks_assembly.wdl" as assembly | ||
import "tasks_reports.wdl" as reports | ||
import "tasks_read_utils.wdl" as reads | ||
|
||
workflow demux_metag { | ||
call demux.illumina_demux as illumina_demux | ||
|
||
scatter(raw_reads in illumina_demux.raw_reads_unaligned_bams) { | ||
call reads.dedup_bam as dedup { | ||
input: | ||
in_bam = raw_reads | ||
} | ||
} | ||
|
||
scatter(reads_bam in dedup.dedup_bam) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I wouldn't double-scatter. You can just keep this as a single scatter block on the raw_reads and put all the task calls together in that single scatter. WDL interpreters/compilers are smart enough to figure out the DAG and parallelization opportunities within the scatter based on the dependencies between their inputs and outputs. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. So specifically:
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Two more thoughts on this topic:
|
||
call reports.spikein_report as spikein { | ||
input: | ||
reads_bam = raw_reads | ||
reads_bam = reads_bam | ||
} | ||
call taxon_filter.deplete_taxa as deplete { | ||
input: | ||
raw_reads_unmapped_bam = raw_reads | ||
raw_reads_unmapped_bam = reads_bam | ||
} | ||
call assembly.assemble as spades { | ||
input: | ||
|
@@ -27,7 +35,7 @@ workflow demux_metag { | |
|
||
call metagenomics.krakenuniq as kraken { | ||
input: | ||
reads_unmapped_bam = illumina_demux.raw_reads_unaligned_bams, | ||
reads_unmapped_bam = dedup.dedup_bam | ||
} | ||
call reports.aggregate_metagenomics_reports as metag_summary_report { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can we call aggregate_metagenomics_reports a second time on the kaiju outputs as well? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||
input: | ||
|
@@ -39,6 +47,6 @@ workflow demux_metag { | |
} | ||
call metagenomics.kaiju as kaiju { | ||
input: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We haven't really used kaiju regularly via WDL yet, but I'm betting that we may want to consider moving it to a scatter-on-single-sample execution mode (like everything else in our WDLs except kraken). Its database is about 4x smaller (I'm guessing the localization time is just a few minutes) and the execution time of the algorithm is much slower, so the cost efficiency (algorithmic compute time vs VM wall clock time) of kaiju on a single sample is much better than kraken... so we might as well move it within the same scatter block as well. |
||
reads_unmapped_bam = illumina_demux.raw_reads_unaligned_bams, | ||
reads_unmapped_bam = dedup.dedup_bam | ||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -3,6 +3,7 @@ import "tasks_metagenomics.wdl" as metagenomics | |
import "tasks_taxon_filter.wdl" as taxon_filter | ||
import "tasks_assembly.wdl" as assembly | ||
import "tasks_reports.wdl" as reports | ||
import "tasks_read_utils.wdl" as reads | ||
|
||
workflow demux_plus { | ||
|
||
|
@@ -13,15 +14,23 @@ workflow demux_plus { | |
Array[File]? bmtaggerDbs # .tar.gz, .tgz, .tar.bz2, .tar.lz4, .fasta, or .fasta.gz | ||
Array[File]? blastDbs # .tar.gz, .tgz, .tar.bz2, .tar.lz4, .fasta, or .fasta.gz | ||
Array[File]? bwaDbs | ||
|
||
scatter(raw_reads in illumina_demux.raw_reads_unaligned_bams) { | ||
call reads.dedup_bam as dedup { | ||
input: | ||
in_bam = raw_reads | ||
} | ||
} | ||
|
||
scatter(reads_bam in dedup.dedup_bam) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. See my comment in demux_metag about combining the scatter blocks. |
||
call reports.spikein_report as spikein { | ||
input: | ||
reads_bam = raw_reads, | ||
reads_bam = reads_bam, | ||
spikein_db = spikein_db | ||
} | ||
call taxon_filter.deplete_taxa as deplete { | ||
input: | ||
raw_reads_unmapped_bam = raw_reads, | ||
raw_reads_unmapped_bam = reads_bam, | ||
bmtaggerDbs = bmtaggerDbs, | ||
blastDbs = blastDbs, | ||
bwaDbs = bwaDbs | ||
|
@@ -37,7 +46,7 @@ workflow demux_plus { | |
|
||
call metagenomics.krakenuniq as krakenuniq { | ||
input: | ||
reads_unmapped_bam = illumina_demux.raw_reads_unaligned_bams | ||
reads_unmapped_bam = dedup.dedup_bam | ||
} | ||
|
||
call reports.spikein_summary as spike_summary { | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,5 +1,5 @@ | ||
blast=2.7.1 | ||
bbmap=38.56 | ||
bbmap=38.71 | ||
bmtagger=3.101 | ||
bwa=0.7.17 | ||
cd-hit=4.6.8 | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -109,9 +109,12 @@ def main_deplete(args): | |
tags_to_clear = args.tags_to_clear, | ||
picardOptions = ['MAX_DISCARD_FRACTION=0.5'], | ||
JVMmemory = args.JVMmemory, | ||
sanitize = not args.do_not_sanitize) as bamToDeplete: | ||
sanitize = not args.do_not_sanitize) as bam_to_dedup: | ||
|
||
read_utils.rmdup_mvicuna_bam(bam_to_dedup, args.rmdupBam, JVMmemory=args.JVMmemory) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. In this new world, should we consider:
|
||
|
||
multi_db_deplete_bam( | ||
bamToDeplete, | ||
args.rmdupBam, | ||
args.bwaDbs, | ||
deplete_bwa_bam, | ||
args.bwaBam, | ||
|
@@ -129,13 +132,8 @@ def bmtagger_wrapper(inBam, db, outBam, JVMmemory=None): | |
JVMmemory=args.JVMmemory | ||
) | ||
|
||
# if the user has not specified saving a revertBam, we used a temp file and can remove it | ||
if not args.revertBam: | ||
os.unlink(revertBamOut) | ||
|
||
read_utils.rmdup_mvicuna_bam(args.bmtaggerBam, args.rmdupBam, JVMmemory=args.JVMmemory) | ||
multi_db_deplete_bam( | ||
args.rmdupBam, | ||
args.bmtaggerBam, | ||
args.blastDbs, | ||
deplete_blastn_bam, | ||
args.blastnBam, | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If you think this is ready and want to try it out, shouldn't you remove #DX_SKIP_WORKFLOW?