Skip to content

Commit e4c90aa

Browse files
authored
Added liftover wdls and jsons for gnomAD 2.1 (#5604)
1 parent 1ea1036 commit e4c90aa

15 files changed

+452
-48
lines changed

scripts/funcotator/data_sources/gnomAD/liftoverVcf.wdl

+9-2
Original file line numberDiff line numberDiff line change
@@ -54,15 +54,22 @@ workflow LiftoverVcf {
5454

5555
# Run liftover on each input VCF:
5656
scatter ( vcf in variant_vcfs ) {
57+
58+
# Get the name of this run's VCF file:
59+
String vcf_extension = sub(vcf, "^.*.vcf", ".vcf" )
60+
String vcf_base_name = basename( vcf, vcf_extension )
61+
String scattered_vcf_name = vcf_base_name + ".LIFTOVER" + vcf_extension
62+
String scattered_vcf_rejects_name = vcf_base_name + ".LIFTOVER_REJECTS" + vcf_extension
63+
5764
call LiftoverVcfTask {
5865
input:
5966
input_vcf_file = vcf,
6067
chain_file = chain_file,
6168
target_reference_sequence_fasta_file = target_reference_sequence_fasta_file,
6269
target_reference_sequence_fasta_file_index = target_reference_sequence_fasta_file_index,
6370
target_reference_sequence_fasta_file_dict = target_reference_sequence_fasta_file_dict,
64-
lifted_over_vcf_name = lifted_over_vcf_name,
65-
lifted_over_rejects_vcf_name = lifted_over_rejects_vcf_name,
71+
lifted_over_vcf_name = scattered_vcf_name,
72+
lifted_over_rejects_vcf_name = scattered_vcf_rejects_name,
6673

6774
warn_on_missing_contig = warn_on_missing_contig,
6875
write_original_position = write_original_position,

scripts/funcotator/data_sources/gnomAD/liftoverVcf_Gnomad2.1.json

-19
This file was deleted.
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
{
2+
"LiftoverVcf.gatk_docker": "broadinstitute/gatk:4.0.11.0",
3+
4+
"LiftoverVcf.variant_vcfs": [ "gs://gnomad-public/release/2.1/vcf/exomes/gnomad.exomes.r2.1.sites.chr1.vcf.bgz", "gs://gnomad-public/release/2.1/vcf/exomes/gnomad.exomes.r2.1.sites.chr10.vcf.bgz", "gs://gnomad-public/release/2.1/vcf/exomes/gnomad.exomes.r2.1.sites.chr11.vcf.bgz", "gs://gnomad-public/release/2.1/vcf/exomes/gnomad.exomes.r2.1.sites.chr12.vcf.bgz", "gs://gnomad-public/release/2.1/vcf/exomes/gnomad.exomes.r2.1.sites.chr13.vcf.bgz", "gs://gnomad-public/release/2.1/vcf/exomes/gnomad.exomes.r2.1.sites.chr14.vcf.bgz", "gs://gnomad-public/release/2.1/vcf/exomes/gnomad.exomes.r2.1.sites.chr15.vcf.bgz", "gs://gnomad-public/release/2.1/vcf/exomes/gnomad.exomes.r2.1.sites.chr16.vcf.bgz", "gs://gnomad-public/release/2.1/vcf/exomes/gnomad.exomes.r2.1.sites.chr17.vcf.bgz", "gs://gnomad-public/release/2.1/vcf/exomes/gnomad.exomes.r2.1.sites.chr18.vcf.bgz", "gs://gnomad-public/release/2.1/vcf/exomes/gnomad.exomes.r2.1.sites.chr19.vcf.bgz", "gs://gnomad-public/release/2.1/vcf/exomes/gnomad.exomes.r2.1.sites.chr2.vcf.bgz", "gs://gnomad-public/release/2.1/vcf/exomes/gnomad.exomes.r2.1.sites.chr20.vcf.bgz", "gs://gnomad-public/release/2.1/vcf/exomes/gnomad.exomes.r2.1.sites.chr21.vcf.bgz", "gs://gnomad-public/release/2.1/vcf/exomes/gnomad.exomes.r2.1.sites.chr22.vcf.bgz", "gs://gnomad-public/release/2.1/vcf/exomes/gnomad.exomes.r2.1.sites.chr3.vcf.bgz", "gs://gnomad-public/release/2.1/vcf/exomes/gnomad.exomes.r2.1.sites.chr4.vcf.bgz", "gs://gnomad-public/release/2.1/vcf/exomes/gnomad.exomes.r2.1.sites.chr5.vcf.bgz", "gs://gnomad-public/release/2.1/vcf/exomes/gnomad.exomes.r2.1.sites.chr6.vcf.bgz", "gs://gnomad-public/release/2.1/vcf/exomes/gnomad.exomes.r2.1.sites.chr7.vcf.bgz", "gs://gnomad-public/release/2.1/vcf/exomes/gnomad.exomes.r2.1.sites.chr8.vcf.bgz", "gs://gnomad-public/release/2.1/vcf/exomes/gnomad.exomes.r2.1.sites.chr9.vcf.bgz", "gs://gnomad-public/release/2.1/vcf/exomes/gnomad.exomes.r2.1.sites.chrX.vcf.bgz", 
"gs://gnomad-public/release/2.1/vcf/exomes/gnomad.exomes.r2.1.sites.chrY.vcf.bgz" ],
5+
"LiftoverVcf.chain_file": "gs://broad-dsde-methods-jonn/gnomAD_2.1_Liftover_hg38/b37ToHg38.over.chain",
6+
"LiftoverVcf.target_reference_sequence_fasta_file": "gs://broad-references/hg38/v0/Homo_sapiens_assembly38.fasta",
7+
"LiftoverVcf.target_reference_sequence_fasta_file_index": "gs://broad-references/hg38/v0/Homo_sapiens_assembly38.fasta.fai",
8+
"LiftoverVcf.target_reference_sequence_fasta_file_dict": "gs://broad-references/hg38/v0/Homo_sapiens_assembly38.dict",
9+
10+
"LiftoverVcf.lifted_over_vcf_name": "gnomad.exomes.r2.1.sites.liftoverToHg38.vcf.gz",
11+
"LiftoverVcf.lifted_over_rejects_vcf_name": "gnomad.exomes.r2.1.sites.liftoverToHg38.REJECTS.vcf.gz",
12+
13+
"LiftoverVcf.write_original_position": "true",
14+
"LiftoverVcf.warn_on_missing_contig": "true",
15+
16+
"LiftoverVcf.mem_gb": "128",
17+
"LiftoverVcf.disk_space_gb": "16384",
18+
"LiftoverVcf.boot_disk_size_gb": "100"
19+
}

scripts/funcotator/data_sources/gnomAD/liftoverVcf_Gnomad2.1_parallel.json renamed to scripts/funcotator/data_sources/gnomAD/liftoverVcf_Gnomad2.1_genome.json

+1-1
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
"LiftoverVcf.gatk_docker": "broadinstitute/gatk:4.0.11.0",
33

44
"LiftoverVcf.variant_vcfs": [ "gs://gnomad-public/release/2.1/vcf/genomes/gnomad.genomes.r2.1.sites.chr1.vcf.bgz", "gs://gnomad-public/release/2.1/vcf/genomes/gnomad.genomes.r2.1.sites.chr10.vcf.bgz", "gs://gnomad-public/release/2.1/vcf/genomes/gnomad.genomes.r2.1.sites.chr11.vcf.bgz", "gs://gnomad-public/release/2.1/vcf/genomes/gnomad.genomes.r2.1.sites.chr12.vcf.bgz", "gs://gnomad-public/release/2.1/vcf/genomes/gnomad.genomes.r2.1.sites.chr13.vcf.bgz", "gs://gnomad-public/release/2.1/vcf/genomes/gnomad.genomes.r2.1.sites.chr14.vcf.bgz", "gs://gnomad-public/release/2.1/vcf/genomes/gnomad.genomes.r2.1.sites.chr15.vcf.bgz", "gs://gnomad-public/release/2.1/vcf/genomes/gnomad.genomes.r2.1.sites.chr16.vcf.bgz", "gs://gnomad-public/release/2.1/vcf/genomes/gnomad.genomes.r2.1.sites.chr17.vcf.bgz", "gs://gnomad-public/release/2.1/vcf/genomes/gnomad.genomes.r2.1.sites.chr18.vcf.bgz", "gs://gnomad-public/release/2.1/vcf/genomes/gnomad.genomes.r2.1.sites.chr19.vcf.bgz", "gs://gnomad-public/release/2.1/vcf/genomes/gnomad.genomes.r2.1.sites.chr2.vcf.bgz", "gs://gnomad-public/release/2.1/vcf/genomes/gnomad.genomes.r2.1.sites.chr20.vcf.bgz", "gs://gnomad-public/release/2.1/vcf/genomes/gnomad.genomes.r2.1.sites.chr21.vcf.bgz", "gs://gnomad-public/release/2.1/vcf/genomes/gnomad.genomes.r2.1.sites.chr22.vcf.bgz", "gs://gnomad-public/release/2.1/vcf/genomes/gnomad.genomes.r2.1.sites.chr3.vcf.bgz", "gs://gnomad-public/release/2.1/vcf/genomes/gnomad.genomes.r2.1.sites.chr4.vcf.bgz", "gs://gnomad-public/release/2.1/vcf/genomes/gnomad.genomes.r2.1.sites.chr5.vcf.bgz", "gs://gnomad-public/release/2.1/vcf/genomes/gnomad.genomes.r2.1.sites.chr6.vcf.bgz", "gs://gnomad-public/release/2.1/vcf/genomes/gnomad.genomes.r2.1.sites.chr7.vcf.bgz", "gs://gnomad-public/release/2.1/vcf/genomes/gnomad.genomes.r2.1.sites.chr8.vcf.bgz", "gs://gnomad-public/release/2.1/vcf/genomes/gnomad.genomes.r2.1.sites.chr9.vcf.bgz", "gs://gnomad-public/release/2.1/vcf/genomes/gnomad.genomes.r2.1.sites.chrX.vcf.bgz" ],
5-
"LiftoverVcf.chain_file": "gs://broad-dsde-methods-jonn/b37ToHg38.over.chain",
5+
"LiftoverVcf.chain_file": "gs://broad-dsde-methods-jonn/gnomAD_2.1_Liftover_hg38/b37ToHg38.over.chain",
66
"LiftoverVcf.target_reference_sequence_fasta_file": "gs://broad-references/hg38/v0/Homo_sapiens_assembly38.fasta",
77
"LiftoverVcf.target_reference_sequence_fasta_file_index": "gs://broad-references/hg38/v0/Homo_sapiens_assembly38.fasta.fai",
88
"LiftoverVcf.target_reference_sequence_fasta_file_dict": "gs://broad-references/hg38/v0/Homo_sapiens_assembly38.dict",

scripts/funcotator/data_sources/gnomAD/mergeVcfs.wdl

+23-2
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,6 @@
66
# gatk_docker - GATK Docker image in which to run
77
# variant_vcfs - Array of Variant Context Files (VCF) containing the variants.
88
# output_vcf_file_name - Desired name of the resulting VCF output file.
9-
# output_vcf_index_name - Desired name of the resulting VCF index output file.
109
#
1110
# Optional:
1211
# File gatk4_jar_override - Override Jar file containing GATK 4. Use this when overriding the docker JAR or when using a backend without docker.
@@ -69,6 +68,8 @@ task MergeVcfs {
6968
Int? cpu
7069
Int? boot_disk_size_gb
7170

71+
String dollar = "$"
72+
7273
# ------------------------------------------------
7374
# Process input args:
7475
String timing_output_file = basename(output_vcf_file) + ".timingInformation.txt"
@@ -94,12 +95,32 @@ task MergeVcfs {
9495
set -e
9596
export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override}
9697

98+
fileListArgs=""
99+
100+
# Ensure that the names of the files end in the correct suffixes:
101+
# (MergeVCFs requires compressed vcfs to end in '.vcf.gz')
102+
for f in ${sep=' ' input_vcfs} ; do
103+
base=$( basename $f )
104+
d=$( dirname $f )
105+
# Test the suffix inside the 'if' condition: under 'set -e' a bare non-matching
106+
# grep would abort the whole script before the 'else' branch could ever run.
107+
if echo "$base" | grep -q '[.]vcf[.]bgz$' ; then
108+
newName=$( echo $base | sed 's#.vcf.bgz$#.vcf.gz#g' )
109+
mv $f ${dollar}{d}/${dollar}{newName}
110+
fileListArgs="${dollar}{fileListArgs} -I ${dollar}{d}/${dollar}{newName}"
111+
else
112+
fileListArgs="${dollar}{fileListArgs} -I $f"
113+
fi
114+
done
115+
116+
echo "Using file list: ${dollar}{fileListArgs}"
117+
97118
startTime=`date +%s.%N`
98119
echo "StartTime: $startTime" > ${timing_output_file}
99120

100121
gatk --java-options "-Xmx${command_mem}m -DGATK_STACKTRACE_ON_USER_EXCEPTION=true" \
101122
MergeVcfs \
102-
-I ${sep=' -I ' input_vcfs} \
123+
${dollar}{fileListArgs} \
103124
-O ${output_vcf_file}
104125

105126
endTime=`date +%s.%N`
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
{
2+
"MergeVcfsWorkflow.gatk_docker": "broadinstitute/gatk:4.0.11.0",
3+
4+
"MergeVcfsWorkflow.variant_vcfs": [ "gs://gnomad-public/release/2.1/vcf/exomes/gnomad.exomes.r2.1.sites.vcf.bgz", "gs://gnomad-public/release/2.1/vcf/genomes/gnomad.genomes.r2.1.sites.vcf.bgz" ],
5+
"MergeVcfsWorkflow.output_vcf_file_name": "gnomad.genomes.r2.1.sites.exome_and_genome.vcf.bgz",
6+
7+
"MergeVcfsWorkflow.mem_gb": "128",
8+
"MergeVcfsWorkflow.disk_space_gb": "16384",
9+
"MergeVcfsWorkflow.boot_disk_size_gb": "100"
10+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,151 @@
1+
# Removes a given list of INFO field annotations from the given VCF files.
2+
#
3+
# Description of inputs:
4+
#
5+
# Required:
6+
# String gatk_docker - GATK Docker image in which to run
7+
#
8+
# Array[File] variant_vcfs - Array of Variant Context Files (VCFs) from which to remove INFO field annotations.
9+
# Assumes that index files are in the same folder as VCF files and of the correct extension.
10+
# Array[String] info_annotations_to_remove - Array of strings with each being the name of an annotation to remove from the INFO field.
11+
#
12+
# Optional:
13+
# File gatk4_jar_override - Override Jar file containing GATK 4. Use this when overriding the docker JAR or when using a backend without docker.
14+
# Int mem_gb - Amount of memory to give to the machine running each task in this workflow (in gb).
15+
# Int preemptible_attempts - Number of times to allow each task in this workflow to be preempted.
16+
# Int disk_space_gb - Amount of storage disk space (in Gb) to give to each machine running each task in this workflow.
17+
# Int cpu - Number of CPU cores to give to each machine running each task in this workflow.
18+
# Int boot_disk_size_gb - Amount of boot disk space (in Gb) to give to each machine running each task in this workflow.
19+
#
20+
# This WDL needs to decide whether to use the ``gatk_jar`` or ``gatk_jar_override`` for the jar location. As of cromwell-0.24,
21+
# this logic *must* go into each task. Therefore, there is a lot of duplicated code. This allows users to specify a jar file
22+
# independent of what is in the docker file. See the README.md for more info.
23+
24+
workflow RemoveInfoFieldAnnotationsFromVcf {
25+
String gatk_docker
26+
# Required inputs: the VCFs to clean and the INFO annotation names to drop from them.
27+
Array[File] variant_vcfs
28+
Array[String] info_annotations_to_remove
29+
# Optional overrides; each is passed through unchanged to SelectVariantsTask below.
30+
File? gatk4_jar_override
31+
Int? mem_gb
32+
Int? preemptible_attempts
33+
Int? disk_space_gb
34+
Int? cpu
35+
Int? boot_disk_size_gb
36+
37+
# Remove the requested INFO field annotations from each input VCF:
38+
scatter ( vcf_file in variant_vcfs ) {
39+
40+
# Get the name of this run's index file ("<vcf>.idx" when the name ends in "vcf", "<vcf>.tbi" otherwise):
41+
String index_format = if sub(vcf_file, ".*\\.", "") == "vcf" then "idx" else "tbi"
42+
File vcf_index = vcf_file + "." + index_format
43+
44+
# Build this run's output VCF name by inserting ".INFO_ANNOTATIONS_FIXED" between the base name and the extension:
45+
String vcf_extension = sub(vcf_file, "^.*.vcf", ".vcf" )
46+
String vcf_base_name = basename( vcf_file, vcf_extension )
47+
String output_vcf_file = vcf_base_name + ".INFO_ANNOTATIONS_FIXED" + vcf_extension
48+
49+
call SelectVariantsTask {
50+
input:
51+
input_vcf_file = vcf_file,
52+
input_vcf_file_index = vcf_index,
53+
output_vcf_file = output_vcf_file,
54+
info_annotations_to_remove = info_annotations_to_remove,
55+
56+
gatk_docker = gatk_docker,
57+
gatk_override = gatk4_jar_override,
58+
mem = mem_gb,
59+
preemptible_attempts = preemptible_attempts,
60+
disk_space_gb = disk_space_gb,
61+
cpu = cpu,
62+
boot_disk_size_gb = boot_disk_size_gb
63+
}
64+
}
65+
# Gathered scatter results: one cleaned VCF and one index entry per input VCF.
66+
output {
67+
Array[File] vcf_file_with_clean_info_fields = SelectVariantsTask.vcf_with_cleaned_info_field
68+
Array[File] vcf_file_with_clean_info_fields_index = SelectVariantsTask.vcf_with_cleaned_info_field_index
69+
}
70+
}
71+
72+
task SelectVariantsTask {
73+
# Runs GATK SelectVariants on one VCF, dropping the listed INFO annotations, and records timing information.
74+
# ------------------------------------------------
75+
# Input args:
76+
File input_vcf_file
77+
File input_vcf_file_index
78+
String output_vcf_file
79+
Array[String] info_annotations_to_remove
80+
81+
# Runtime Options:
82+
String gatk_docker
83+
File? gatk_override
84+
Int? mem
85+
Int? preemptible_attempts
86+
Int? disk_space_gb
87+
Int? cpu
88+
Int? boot_disk_size_gb
89+
# Index suffix expected next to the output VCF ("idx" when the input name ends in "vcf", otherwise "tbi"):
90+
String index_format = if sub(input_vcf_file, ".*\\.", "") == "vcf" then "idx" else "tbi"
91+
String timing_output_file = basename(input_vcf_file) + ".timingInformation.txt"
92+
93+
# ------------------------------------------------
94+
# Get machine settings:
95+
Boolean use_ssd = false
96+
97+
# You may have to change the following two parameter values depending on the task requirements
98+
Int default_ram_mb = 1024 * 3
99+
# WARNING: In the workflow, you should calculate the disk space as an input to this task (disk_space_gb). Please see [TODO: Link from Jose] for examples.
100+
Int default_disk_space_gb = 100
101+
102+
Int default_boot_disk_size_gb = 15
103+
104+
# Mem is in units of GB but our command and memory runtime values are in MB
105+
Int machine_mem = if defined(mem) then mem * 1024 else default_ram_mb
106+
Int command_mem = machine_mem - 1024
107+
108+
# ------------------------------------------------
109+
# Run our command:
110+
command <<<
111+
set -e
112+
export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override}
113+
114+
echo "Disk Space:"
115+
df -h
116+
117+
startTime=`date +%s.%N`
118+
echo "StartTime: $startTime" > ${timing_output_file}
119+
120+
gatk --java-options "-Xmx${command_mem}m -DGATK_STACKTRACE_ON_USER_EXCEPTION=true" \
121+
SelectVariants \
122+
-V ${input_vcf_file} \
123+
-O ${output_vcf_file} \
124+
--drop-info-annotation ${sep=" --drop-info-annotation " info_annotations_to_remove}
125+
126+
endTime=`date +%s.%N`
127+
echo "EndTime: $endTime" >> ${timing_output_file}
128+
elapsedTime=`python -c "print( $endTime - $startTime )"`
129+
echo "Elapsed Time: $elapsedTime" >> ${timing_output_file}
130+
>>>
131+
132+
# ------------------------------------------------
133+
# Runtime settings (preemptible honors the preemptible_attempts input and falls back to 0 when it is not set):
134+
runtime {
135+
docker: gatk_docker
136+
memory: machine_mem + " MB"
137+
disks: "local-disk " + select_first([disk_space_gb, default_disk_space_gb]) + if use_ssd then " SSD" else " HDD"
138+
bootDiskSizeGb: select_first([boot_disk_size_gb, default_boot_disk_size_gb])
139+
preemptible: select_first([preemptible_attempts, 0])
140+
cpu: select_first([cpu, 1])
141+
}
142+
143+
# ------------------------------------------------
144+
# Outputs:
145+
output {
146+
File vcf_with_cleaned_info_field = "${output_vcf_file}"
147+
File vcf_with_cleaned_info_field_index = "${output_vcf_file}.${index_format}"
148+
File timing_info = timing_output_file
149+
}
150+
}
151+

scripts/funcotator/data_sources/gnomAD/removeInfoFieldAnnotationsFromVcf_gnomAD_hg19.json

+11
Large diffs are not rendered by default.

scripts/funcotator/data_sources/gnomAD/removeInfoFieldAnnotationsFromVcf_gnomAD_hg38.json

+11
Large diffs are not rendered by default.

scripts/funcotator/data_sources/gnomAD/removeInfoFieldAnnotationsFromVcf_gnomAD_hg38_exomes.json

+11
Large diffs are not rendered by default.
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
{
2+
"SubsetGenomicInfoFieldToAlleleFrequencies.gatk_docker": "broadinstitute/gatk:4.0.11.0",
3+
"SubsetGenomicInfoFieldToAlleleFrequencies.gnomad_vcf": "gs://gnomad-public/release/2.1/vcf/genomes/gnomad.genomes.r2.1.sites.vcf.bgz",
4+
"SubsetGenomicInfoFieldToAlleleFrequencies.gnomad_version": "2.1",
5+
6+
"SubsetGenomicInfoFieldToAlleleFrequencies.mem_gb": "128",
7+
"SubsetGenomicInfoFieldToAlleleFrequencies.disk_space_gb": "16384",
8+
"SubsetGenomicInfoFieldToAlleleFrequencies.boot_disk_size_gb": "100"
9+
}

0 commit comments

Comments
 (0)