Skip to content

Remove NuMTs from MT pipeline and updates wdl to GATK4.1.1.0 #5847

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 7 commits into from
Apr 2, 2019
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion scripts/m2_cromwell_tests/run_m2_wdl.sh
Original file line number Diff line number Diff line change
Expand Up @@ -52,5 +52,4 @@ sudo java -jar $CROMWELL_JAR run $WORKING_DIR/gatk/scripts/mutect2_wdl/mutect2_m
echo "Running Mitochondria M2 WDL through cromwell"
ln -fs $WORKING_DIR/gatk/scripts/mitochondria_m2_wdl/AlignAndCall.wdl
ln -fs $WORKING_DIR/gatk/scripts/mitochondria_m2_wdl/AlignmentPipeline.wdl
ln -fs $WORKING_DIR/gatk/scripts/mitochondria_m2_wdl/MitochondriaCalling.wdl
sudo java -jar $CROMWELL_JAR run $WORKING_DIR/gatk/scripts/mitochondria_m2_wdl/MitochondriaPipeline.wdl -i $WORKING_DIR/gatk/scripts/m2_cromwell_tests/test_mitochondria_m2_wdl.json -m $WORKING_DIR/test_mitochondria_m2_wdl.metadata
1 change: 0 additions & 1 deletion scripts/m2_cromwell_tests/test_mitochondria_m2_wdl.json
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
{
"MitochondriaPipeline.wgs_aligned_input_bam_or_cram": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/NA12878.alignedHg38.duplicateMarked.baseRealigned.bam",
"MitochondriaPipeline.autosomal_coverage": 30,
"MitochondriaPipeline.MT_with_numts_intervals": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/mitochondria_references/chrMWithFinalNuMTs.hg38.interval_list",
"MitochondriaPipeline.mt_dict": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/mitochondria_references/Homo_sapiens_assembly38.chrM.dict",
"MitochondriaPipeline.mt_fasta": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/mitochondria_references/Homo_sapiens_assembly38.chrM.fasta",
"MitochondriaPipeline.mt_fasta_index": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/mitochondria_references/Homo_sapiens_assembly38.chrM.fasta.fai",
Expand Down
250 changes: 220 additions & 30 deletions scripts/mitochondria_m2_wdl/AlignAndCall.wdl
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
import "AlignmentPipeline.wdl" as AlignAndMarkDuplicates
import "MitochondriaCalling.wdl" as MutectAndFilter

workflow AlignAndCall {
meta {
Expand All @@ -10,7 +9,7 @@ workflow AlignAndCall {
}

File unmapped_bam
Int? autosomal_coverage
Float? autosomal_coverage

File mt_dict
File mt_fasta
Expand Down Expand Up @@ -40,10 +39,10 @@ workflow AlignAndCall {

File? gatk_override
String? m2_extra_args
String? m2_filter_extra_args
Float? vaf_filter_threshold
Float? f_score_beta

# Using an older version of the default Mutect LOD cutoff. This value can be changed and is only useful at low depths
# to catch sites that would not get caught by the LOD divided by depth filter.
Float lod_cutoff = 6.3
# Read length used for optimization only. If this is too small CollectWgsMetrics might fail, but the results are not
# affected by this number. Default is 151.
Int? max_read_length
Expand Down Expand Up @@ -99,64 +98,85 @@ workflow AlignAndCall {
preemptible_tries = preemptible_tries
}

call MutectAndFilter.MitochondriaCalling as CallAndFilterMt {
Int? M2_mem = if CollectWgsMetrics.mean_coverage > 25000 then 14 else 7

call M2 as CallMt {
input:
input_bam = AlignToMt.mt_aligned_bam,
input_bam_index = AlignToMt.mt_aligned_bai,
input_bai = AlignToMt.mt_aligned_bai,
ref_fasta = mt_fasta,
ref_fasta_index = mt_fasta_index,
ref_fai = mt_fasta_index,
ref_dict = mt_dict,
lod_cutoff = lod_cutoff,
compress = false,
gatk_override = gatk_override,
# Everything is called except the control region.
m2_extra_args = select_first([m2_extra_args, ""]) + " -L chrM:576-16024 ",
blacklisted_sites = blacklisted_sites,
blacklisted_sites_index = blacklisted_sites_index,
mean_coverage = CollectWgsMetrics.mean_coverage,
autosomal_coverage = autosomal_coverage,
contamination = GetContamination.minor_level,
max_read_length = max_read_length,
mem = M2_mem,
preemptible_tries = preemptible_tries
}

call MutectAndFilter.MitochondriaCalling as CallAndFilterShiftedMt {
call M2 as CallShiftedMt {
input:
input_bam = AlignToShiftedMt.mt_aligned_bam,
input_bam_index = AlignToShiftedMt.mt_aligned_bai,
input_bai = AlignToShiftedMt.mt_aligned_bai,
ref_fasta = mt_shifted_fasta,
ref_fasta_index = mt_shifted_fasta_index,
ref_fai = mt_shifted_fasta_index,
ref_dict = mt_shifted_dict,
lod_cutoff = lod_cutoff,
compress = false,
gatk_override = gatk_override,
# Interval correspondes to control region in the shifted reference
# Everything is called except the control region.
m2_extra_args = select_first([m2_extra_args, ""]) + " -L chrM:8025-9144 ",
blacklisted_sites = blacklisted_sites_shifted,
blacklisted_sites_index = blacklisted_sites_shifted_index,
mean_coverage = CollectWgsMetrics.mean_coverage,
autosomal_coverage = autosomal_coverage,
contamination = GetContamination.minor_level,
max_read_length = max_read_length,
mem = M2_mem,
preemptible_tries = preemptible_tries
}

call LiftoverAndCombineVcfs {
input:
shifted_vcf = CallAndFilterShiftedMt.vcf,
vcf = CallAndFilterMt.vcf,
shifted_vcf = CallShiftedMt.raw_vcf,
vcf = CallMt.raw_vcf,
ref_fasta = mt_fasta,
ref_fasta_index = mt_fasta_index,
ref_dict = mt_dict,
shift_back_chain = shift_back_chain,
preemptible_tries = preemptible_tries
}

call MergeStats {
input:
shifted_stats = CallShiftedMt.stats,
non_shifted_stats = CallMt.stats,
gatk_override = gatk_override,
preemptible_tries = preemptible_tries
}

call Filter {
input:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

too many spaces before input:

raw_vcf = LiftoverAndCombineVcfs.final_vcf,
raw_vcf_index = LiftoverAndCombineVcfs.final_vcf_index,
raw_vcf_stats = MergeStats.stats,
ref_fasta = mt_fasta,
ref_fai = mt_fasta_index,
ref_dict = mt_dict,
compress = false,
gatk_override = gatk_override,
m2_extra_filtering_args = m2_filter_extra_args,
max_alt_allele_count = 4,
contamination = GetContamination.minor_level,
autosomal_coverage = autosomal_coverage,
vaf_filter_threshold = vaf_filter_threshold,
blacklisted_sites = blacklisted_sites,
blacklisted_sites_index = blacklisted_sites_index,
f_score_beta = f_score_beta,
preemptible_tries = preemptible_tries
}

output {
File mt_aligned_bam = AlignToMt.mt_aligned_bam
File mt_aligned_bai = AlignToMt.mt_aligned_bai
File mt_aligned_shifted_bam = AlignToShiftedMt.mt_aligned_bam
File mt_aligned_shifted_bai = AlignToShiftedMt.mt_aligned_bai
File out_vcf = LiftoverAndCombineVcfs.final_vcf
File out_vcf_index = LiftoverAndCombineVcfs.final_vcf_index
File out_vcf = Filter.filtered_vcf
File out_vcf_index = Filter.filtered_vcf_idx
File duplicate_metrics = AlignToMt.duplicate_metrics
File coverage_metrics = CollectWgsMetrics.metrics
File theoretical_sensitivity_metrics = CollectWgsMetrics.theoretical_sensitivity
Expand Down Expand Up @@ -335,3 +355,173 @@ task LiftoverAndCombineVcfs {
File final_vcf_index = "${basename}.final.vcf.idx"
}
}

task M2 {
File ref_fasta
File ref_fai
File ref_dict
File input_bam
File input_bai
String? m2_extra_args
Boolean? make_bamout
Boolean compress
File? gga_vcf
File? gga_vcf_idx

String output_vcf = "raw" + if compress then ".vcf.gz" else ".vcf"
String output_vcf_index = output_vcf + if compress then ".tbi" else ".idx"

File? gatk_override

# runtime
Int? mem
Int? preemptible_tries
Float ref_size = size(ref_fasta, "GB") + size(ref_fai, "GB")
Int disk_size = ceil(size(input_bam, "GB") + ref_size) + 20

# Mem is in units of GB but our command and memory runtime values are in MB
Int machine_mem = if defined(mem) then mem * 1000 else 3500
Int command_mem = machine_mem - 500

meta {
description: "Mutect2 for calling Snps and Indels"
}
parameter_meta {
input_bam: "Aligned Bam"
gga_vcf: "VCF for genotype given alleles mode"
}
command <<<
set -e

export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override}

# We need to create these files regardless, even if they stay empty
touch bamout.bam

gatk --java-options "-Xmx${command_mem}m" Mutect2 \
-R ${ref_fasta} \
-I ${input_bam} \
${"--genotyping-mode GENOTYPE_GIVEN_ALLELES --alleles " + gga_vcf} \
-O ${output_vcf} \
${true='--bam-output bamout.bam' false='' make_bamout} \
${m2_extra_args} \
--annotation StrandBiasBySample \
--mitochondria-mode \
--max-reads-per-alignment-start 75 \
--max-mnp-distance 0
>>>
runtime {
docker: "us.gcr.io/broad-gatk/gatk:4.1.1.0"
memory: machine_mem + " MB"
disks: "local-disk " + disk_size + " HDD"
preemptible: select_first([preemptible_tries, 5])
cpu: 2
}
output {
File raw_vcf = "${output_vcf}"
File raw_vcf_idx = "${output_vcf_index}"
File stats = "${output_vcf}.stats"
File output_bamOut = "bamout.bam"
}
}

task Filter {
File ref_fasta
File ref_fai
File ref_dict
File raw_vcf
File raw_vcf_index
File raw_vcf_stats
Boolean compress
Float? vaf_cutoff

String output_vcf = "output" + if compress then ".vcf.gz" else ".vcf"
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

do we really need to allow the pipeline to output uncompressed vcfs?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I only look at uncompressed vcfs (the files are already tiny and it makes looking at things in Terra/Notebooks smoother). In fact I think I hardcoded compress = false when calling this task. Should I wire that argument through as an option?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ehhh it just irks me when I see uncompressed output (even if I know its tiny). Tools you use in Terra/Notebooks should be able to read compressed vcfs :P. I would lean towards wiring it through so people have an option to changing it.

String output_vcf_index = output_vcf + if compress then ".tbi" else ".idx"

String? m2_extra_filtering_args
Int max_alt_allele_count
Float contamination
Float? autosomal_coverage
Float? vaf_filter_threshold
Float? f_score_beta

File blacklisted_sites
File blacklisted_sites_index

File? gatk_override

# runtime
Int? preemptible_tries
Float ref_size = size(ref_fasta, "GB") + size(ref_fai, "GB")
Int disk_size = ceil(size(raw_vcf, "GB") + ref_size) + 20

meta {
description: "Mutect2 Filtering for calling Snps and Indels"
}
parameter_meta {
autosomal_coverage: "Median coverage of the autosomes for filtering potential polymorphic NuMT variants"
vaf_filter_thershold: "Hard cutoff for minimum allele fraction. All sites with VAF less than this cutoff will be filtered."
f_score_beta: "F-Score beta balances the filtering strategy between recall and precision. The relative weight of recall to precision."
}
command <<<
set -e

export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override}

# We need to create these files regardless, even if they stay empty
touch bamout.bam

gatk --java-options "-Xmx2500m" FilterMutectCalls -V ${raw_vcf} \
-R ${ref_fasta} \
-O filtered.vcf \
--stats ${raw_vcf_stats} \
${m2_extra_filtering_args} \
--max-alt-allele-count ${max_alt_allele_count} \
--mitochondria-mode \
${"--autosomal-coverage " + autosomal_coverage} \
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

are we ok with not having an autosomal coverage in some cases?

Copy link
Contributor Author

@meganshand meganshand Apr 1, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

My thought was that if a user doesn't have this information it should be optional so they can still run the pipeline. But if you don't provide an input here you don't get the polymorphic NuMT filter, so I'm open to making it required. Do you have a strong opinion?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

gotcha that makes sense to me, also your parameter_meta 👯 is great!

${"--min-allele-fraction " + vaf_filter_threshold} \
${"--f-score-beta " + f_score_beta} \
--contamination-estimate ${contamination}

gatk VariantFiltration -V filtered.vcf \
-O ${output_vcf} \
--mask ${blacklisted_sites} \
--mask-name "blacklisted_site"

>>>
runtime {
docker: "us.gcr.io/broad-gatk/gatk:4.1.1.0"
memory: "4 MB"
disks: "local-disk " + disk_size + " HDD"
preemptible: select_first([preemptible_tries, 5])
cpu: 2
}
output {
File filtered_vcf = "${output_vcf}"
File filtered_vcf_idx = "${output_vcf_index}"
}
}

task MergeStats {
File shifted_stats
File non_shifted_stats
Int? preemptible_tries
File? gatk_override

command{
set -e

export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override}

gatk MergeMutectStats --stats ${shifted_stats} --stats ${non_shifted_stats} -O raw.combined.stats
}
output {
File stats = "raw.combined.stats"
}
runtime {
docker: "us.gcr.io/broad-gatk/gatk:4.1.1.0"
memory: "3 MB"
disks: "local-disk 20 HDD"
preemptible: select_first([preemptible_tries, 5])
}
}
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
{
"MitochondriaPipeline.wgs_aligned_input_bam_or_cram": "input_bam_here",
"MitochondriaPipeline.autosomal_coverage": autosomal_median_coverage_here,
"MitochondriaPipeline.MT_with_numts_intervals": "gs://broad-references/hg38/v0/chrM/chrMWithFinalNuMTs.hg38.interval_list",
"MitochondriaPipeline.mt_dict": "gs://broad-references/hg38/v0/chrM/Homo_sapiens_assembly38.chrM.dict",
"MitochondriaPipeline.mt_fasta": "gs://broad-references/hg38/v0/chrM/Homo_sapiens_assembly38.chrM.fasta",
"MitochondriaPipeline.mt_fasta_index": "gs://broad-references/hg38/v0/chrM/Homo_sapiens_assembly38.chrM.fasta.fai",
Expand Down
Loading