Remove NuMTs from MT pipeline and updates wdl to GATK4.1.1.0 #5847
Changes from 2 commits
@@ -1,5 +1,4 @@
import "AlignmentPipeline.wdl" as AlignAndMarkDuplicates
import "MitochondriaCalling.wdl" as MutectAndFilter

workflow AlignAndCall {
meta {

@@ -10,7 +9,7 @@ workflow AlignAndCall {
}

File unmapped_bam
Int? autosomal_coverage
Float? autosomal_coverage

File mt_dict
File mt_fasta

@@ -40,10 +39,10 @@ workflow AlignAndCall {

File? gatk_override
String? m2_extra_args
String? m2_filter_extra_args
Float? vaf_filter_threshold
Float? f_score_beta

# Using an older version of the default Mutect LOD cutoff. This value can be changed and is only useful at low depths
# to catch sites that would not get caught by the LOD divided by depth filter.
Float lod_cutoff = 6.3
# Read length used for optimization only. If this is too small CollectWgsMetrics might fail, but the results are not
# affected by this number. Default is 151.
Int? max_read_length

@@ -99,64 +98,85 @@ workflow AlignAndCall {
preemptible_tries = preemptible_tries
}

call MutectAndFilter.MitochondriaCalling as CallAndFilterMt {
Int? M2_mem = if CollectWgsMetrics.mean_coverage > 25000 then 14 else 7

call M2 as CallMt {
input:
input_bam = AlignToMt.mt_aligned_bam,
input_bam_index = AlignToMt.mt_aligned_bai,
input_bai = AlignToMt.mt_aligned_bai,
ref_fasta = mt_fasta,
ref_fasta_index = mt_fasta_index,
ref_fai = mt_fasta_index,
ref_dict = mt_dict,
lod_cutoff = lod_cutoff,
compress = false,
gatk_override = gatk_override,
# Everything is called except the control region.
m2_extra_args = select_first([m2_extra_args, ""]) + " -L chrM:576-16024 ",
blacklisted_sites = blacklisted_sites,
blacklisted_sites_index = blacklisted_sites_index,
mean_coverage = CollectWgsMetrics.mean_coverage,
autosomal_coverage = autosomal_coverage,
contamination = GetContamination.minor_level,
max_read_length = max_read_length,
mem = M2_mem,
preemptible_tries = preemptible_tries
}

call MutectAndFilter.MitochondriaCalling as CallAndFilterShiftedMt {
call M2 as CallShiftedMt {
input:
input_bam = AlignToShiftedMt.mt_aligned_bam,
input_bam_index = AlignToShiftedMt.mt_aligned_bai,
input_bai = AlignToShiftedMt.mt_aligned_bai,
ref_fasta = mt_shifted_fasta,
ref_fasta_index = mt_shifted_fasta_index,
ref_fai = mt_shifted_fasta_index,
ref_dict = mt_shifted_dict,
lod_cutoff = lod_cutoff,
compress = false,
gatk_override = gatk_override,
# Interval corresponds to the control region in the shifted reference
# Everything is called except the control region.
m2_extra_args = select_first([m2_extra_args, ""]) + " -L chrM:8025-9144 ",
blacklisted_sites = blacklisted_sites_shifted,
blacklisted_sites_index = blacklisted_sites_shifted_index,
mean_coverage = CollectWgsMetrics.mean_coverage,
autosomal_coverage = autosomal_coverage,
contamination = GetContamination.minor_level,
max_read_length = max_read_length,
mem = M2_mem,
preemptible_tries = preemptible_tries
}
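# Note on the shifted interval: assuming the shifted reference is chrM rotated by 8000 bases (the shift
# size is not stated in this diff), control-region positions 16025-16569 map to 8025-8569 and positions
# 1-575 map to 8570-9144, so "-L chrM:8025-9144" here covers exactly the region excluded by
# "-L chrM:576-16024" in the unshifted call above.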

call LiftoverAndCombineVcfs {
input:
shifted_vcf = CallAndFilterShiftedMt.vcf,
vcf = CallAndFilterMt.vcf,
shifted_vcf = CallShiftedMt.raw_vcf,
vcf = CallMt.raw_vcf,
ref_fasta = mt_fasta,
ref_fasta_index = mt_fasta_index,
ref_dict = mt_dict,
shift_back_chain = shift_back_chain,
preemptible_tries = preemptible_tries
}

call MergeStats {
input:
shifted_stats = CallShiftedMt.stats,
non_shifted_stats = CallMt.stats,
gatk_override = gatk_override,
preemptible_tries = preemptible_tries
}

call Filter {
input:
raw_vcf = LiftoverAndCombineVcfs.final_vcf,
raw_vcf_index = LiftoverAndCombineVcfs.final_vcf_index,
raw_vcf_stats = MergeStats.stats,
ref_fasta = mt_fasta,
ref_fai = mt_fasta_index,
ref_dict = mt_dict,
compress = false,
gatk_override = gatk_override,
m2_extra_filtering_args = m2_filter_extra_args,
max_alt_allele_count = 4,
contamination = GetContamination.minor_level,
autosomal_coverage = autosomal_coverage,
vaf_filter_threshold = vaf_filter_threshold,
blacklisted_sites = blacklisted_sites,
blacklisted_sites_index = blacklisted_sites_index,
f_score_beta = f_score_beta,
preemptible_tries = preemptible_tries
}

output {
File mt_aligned_bam = AlignToMt.mt_aligned_bam
File mt_aligned_bai = AlignToMt.mt_aligned_bai
File mt_aligned_shifted_bam = AlignToShiftedMt.mt_aligned_bam
File mt_aligned_shifted_bai = AlignToShiftedMt.mt_aligned_bai
File out_vcf = LiftoverAndCombineVcfs.final_vcf
File out_vcf_index = LiftoverAndCombineVcfs.final_vcf_index
File out_vcf = Filter.filtered_vcf
File out_vcf_index = Filter.filtered_vcf_idx
File duplicate_metrics = AlignToMt.duplicate_metrics
File coverage_metrics = CollectWgsMetrics.metrics
File theoretical_sensitivity_metrics = CollectWgsMetrics.theoretical_sensitivity

@@ -335,3 +355,173 @@ task LiftoverAndCombineVcfs {
File final_vcf_index = "${basename}.final.vcf.idx"
}
}

task M2 {
File ref_fasta
File ref_fai
File ref_dict
File input_bam
File input_bai
String? m2_extra_args
Boolean? make_bamout
Boolean compress
File? gga_vcf
File? gga_vcf_idx

String output_vcf = "raw" + if compress then ".vcf.gz" else ".vcf"
String output_vcf_index = output_vcf + if compress then ".tbi" else ".idx"

File? gatk_override

# runtime
Int? mem
Int? preemptible_tries
Float ref_size = size(ref_fasta, "GB") + size(ref_fai, "GB")
Int disk_size = ceil(size(input_bam, "GB") + ref_size) + 20

# Mem is in units of GB but our command and memory runtime values are in MB
Int machine_mem = if defined(mem) then mem * 1000 else 3500
Int command_mem = machine_mem - 500
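# For example: with mean coverage above 25000 the workflow passes mem = 14, so machine_mem = 14 * 1000 = 14000 MB,
# command_mem = 13500 MB, and the task runs with -Xmx13500m.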

meta {
description: "Mutect2 for calling Snps and Indels"
}
parameter_meta {
input_bam: "Aligned Bam"
gga_vcf: "VCF for genotype given alleles mode"
}
command <<<
set -e

export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override}

# We need to create these files regardless, even if they stay empty
touch bamout.bam

gatk --java-options "-Xmx${command_mem}m" Mutect2 \
-R ${ref_fasta} \
-I ${input_bam} \
${"--genotyping-mode GENOTYPE_GIVEN_ALLELES --alleles " + gga_vcf} \
-O ${output_vcf} \
${true='--bam-output bamout.bam' false='' make_bamout} \
${m2_extra_args} \
--annotation StrandBiasBySample \
--mitochondria-mode \
--max-reads-per-alignment-start 75 \
--max-mnp-distance 0
>>>
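# Brief notes on the extra flags above (summaries, not authoritative documentation):
# --mitochondria-mode switches Mutect2 to defaults intended for high-depth mitochondrial calling,
# --max-reads-per-alignment-start 75 downsamples reads that share an alignment start position, and
# --max-mnp-distance 0 emits nearby substitutions as separate SNPs rather than merging them into MNPs.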
runtime {
docker: "us.gcr.io/broad-gatk/gatk:4.1.1.0"
memory: machine_mem + " MB"
disks: "local-disk " + disk_size + " HDD"
preemptible: select_first([preemptible_tries, 5])
cpu: 2
}
output {
File raw_vcf = "${output_vcf}"
File raw_vcf_idx = "${output_vcf_index}"
File stats = "${output_vcf}.stats"
File output_bamOut = "bamout.bam"
}
}

task Filter {
File ref_fasta
File ref_fai
File ref_dict
File raw_vcf
File raw_vcf_index
File raw_vcf_stats
Boolean compress
Float? vaf_cutoff

String output_vcf = "output" + if compress then ".vcf.gz" else ".vcf"
Review comment: do we really need to allow the pipeline to output uncompressed vcfs?

Reply: I only look at uncompressed vcfs (the files are already tiny and it makes looking at things in Terra/Notebooks smoother). In fact I think I hardcoded

Reply: Ehhh, it just irks me when I see uncompressed output (even if I know it's tiny). Tools you use in Terra/Notebooks should be able to read compressed vcfs :P. I would lean towards wiring it through so people have an option to change it.
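A minimal, self-contained sketch of the "wire it through" suggestion, in the same draft-2 WDL style as this file. The workflow name, task name, and the compress_output_vcf input are hypothetical and not part of this PR; the point is only that a single workflow-level Boolean can be passed down to the tasks that currently hardcode compress = false.

workflow CompressOption {
  # Hypothetical workflow-level switch; name and default are assumptions for illustration.
  Boolean compress_output_vcf = false

  call WriteVcf { input: compress = compress_output_vcf }

  output {
    File vcf = WriteVcf.vcf
  }
}

task WriteVcf {
  Boolean compress
  # Same naming pattern used by the M2 and Filter tasks in this diff.
  String output_vcf = "output" + if compress then ".vcf.gz" else ".vcf"

  command {
    # Stand-in for the real GATK invocation; it only creates the file so the sketch runs end to end.
    touch ${output_vcf}
  }
  output {
    File vcf = "${output_vcf}"
  }
}

Keeping the default at false would preserve the current uncompressed behavior while letting users opt in to .vcf.gz output.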
String output_vcf_index = output_vcf + if compress then ".tbi" else ".idx"

String? m2_extra_filtering_args
Int max_alt_allele_count
Float contamination
Float? autosomal_coverage
Float? vaf_filter_threshold
Float? f_score_beta

File blacklisted_sites
File blacklisted_sites_index

File? gatk_override

# runtime
Int? preemptible_tries
Float ref_size = size(ref_fasta, "GB") + size(ref_fai, "GB")
Int disk_size = ceil(size(raw_vcf, "GB") + ref_size) + 20

meta {
description: "Mutect2 Filtering for calling Snps and Indels"
}
parameter_meta {
autosomal_coverage: "Median coverage of the autosomes for filtering potential polymorphic NuMT variants"
vaf_filter_threshold: "Hard cutoff for minimum allele fraction. All sites with VAF less than this cutoff will be filtered."
f_score_beta: "F-Score beta balances the filtering strategy between recall and precision. The relative weight of recall to precision."
}
command <<<
set -e

export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override}

# We need to create these files regardless, even if they stay empty
touch bamout.bam

gatk --java-options "-Xmx2500m" FilterMutectCalls -V ${raw_vcf} \
-R ${ref_fasta} \
-O filtered.vcf \
--stats ${raw_vcf_stats} \
${m2_extra_filtering_args} \
--max-alt-allele-count ${max_alt_allele_count} \
--mitochondria-mode \
${"--autosomal-coverage " + autosomal_coverage} \
Review comment: are we ok with not having an autosomal coverage in some cases?

Reply: My thought was that if a user doesn't have this information it should be optional so they can still run the pipeline. But if you don't provide an input here you don't get the polymorphic NuMT filter, so I'm open to making it required. Do you have a strong opinion?

Reply: gotcha that makes sense to me, also your
${"--min-allele-fraction " + vaf_filter_threshold} \
${"--f-score-beta " + f_score_beta} \
--contamination-estimate ${contamination}

gatk VariantFiltration -V filtered.vcf \
-O ${output_vcf} \
--mask ${blacklisted_sites} \
--mask-name "blacklisted_site"

>>>
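# Regarding the autosomal_coverage thread above: in draft-2 WDL an interpolation that concatenates a
# string with an undefined optional renders as an empty string, so when autosomal_coverage is not
# supplied the --autosomal-coverage flag is omitted entirely and the polymorphic NuMT filter is skipped;
# supplying a Float turns the filter on with that value. The same pattern handles the optional
# --min-allele-fraction and --f-score-beta flags.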
runtime {
docker: "us.gcr.io/broad-gatk/gatk:4.1.1.0"
memory: "4 MB"
disks: "local-disk " + disk_size + " HDD"
preemptible: select_first([preemptible_tries, 5])
cpu: 2
}
output {
File filtered_vcf = "${output_vcf}"
File filtered_vcf_idx = "${output_vcf_index}"
}
}

task MergeStats {
File shifted_stats
File non_shifted_stats
Int? preemptible_tries
File? gatk_override

command{
set -e

export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override}

gatk MergeMutectStats --stats ${shifted_stats} --stats ${non_shifted_stats} -O raw.combined.stats
}
output {
File stats = "raw.combined.stats"
}
runtime {
docker: "us.gcr.io/broad-gatk/gatk:4.1.1.0"
memory: "3 MB"
disks: "local-disk 20 HDD"
preemptible: select_first([preemptible_tries, 5])
}
}
Review comment: too many spaces before input: