|
| 1 | +# Removes a given list of INFO field annotations from the given VCF files. |
| 2 | +# |
| 3 | +# Description of inputs: |
| 4 | +# |
| 5 | +# Required: |
| 6 | +# String gatk_docker - GATK Docker image in which to run |
| 7 | +# |
| 8 | +# Array[File] variant_vcfs - Array of Variant Call Format (VCF) files from which to remove INFO field annotations. |
| 9 | +# Assumes that index files are in the same folder as VCF files and of the correct extension. |
| 10 | +# Array[String] info_annotations_to_remove - Array of strings with each being the name of an annotation to remove from the INFO field. |
| 11 | +# |
| 12 | +# Optional: |
| 13 | +# File gatk4_jar_override - Override Jar file containing GATK 4. Use this when overriding the docker JAR or when using a backend without docker. |
| 14 | +# Int mem_gb - Amount of memory to give to the machine running each task in this workflow (in gb). |
| 15 | +# Int preemptible_attempts - Number of times to allow each task in this workflow to be preempted. |
| 16 | +# Int disk_space_gb - Amount of storage disk space (in Gb) to give to each machine running each task in this workflow. |
| 17 | +# Int cpu - Number of CPU cores to give to each machine running each task in this workflow. |
| 18 | +# Int boot_disk_size_gb - Amount of boot disk space (in Gb) to give to each machine running each task in this workflow. |
| 19 | +# |
| 20 | +# This WDL needs to decide whether to use the GATK jar bundled in the docker image or the jar given by ``gatk4_jar_override``. As of cromwell-0.24, |
| 21 | +# this logic *must* go into each task. Therefore, there is a lot of duplicated code. This allows users to specify a jar file |
| 22 | +# independent of what is in the docker image. See the README.md for more info. |
| 23 | +
|
# Workflow: for every VCF in `variant_vcfs`, run SelectVariantsTask to drop the
# INFO annotations named in `info_annotations_to_remove`, and collect the
# resulting VCFs and their indices.
workflow RemoveInfoFieldAnnotationsFromVcf {
    # Required inputs:
    String gatk_docker

    Array[File] variant_vcfs
    Array[String] info_annotations_to_remove

    # Optional overrides; each is forwarded unchanged to every task call:
    File? gatk4_jar_override
    Int? mem_gb
    Int? preemptible_attempts
    Int? disk_space_gb
    Int? cpu
    Int? boot_disk_size_gb

    # Remove the requested INFO field annotations from each input VCF:
    scatter ( vcf_file in variant_vcfs ) {

        # Get the name of this run's index file.
        # A file whose last extension is "vcf" gets a ".idx" index; anything else
        # (e.g. ".vcf.gz") is given a ".tbi" index.
        String index_format = if sub(vcf_file, ".*\\.", "") == "vcf" then "idx" else "tbi"
        File vcf_index = vcf_file + "." + index_format

        # Get the name of this run's output VCF file.
        # Everything up to and including the last literal ".vcf" is replaced by ".vcf",
        # leaving ".vcf" plus any trailing suffix (e.g. ".vcf.gz").
        # The dot is escaped so it matches only a literal '.', not any character.
        String vcf_extension = sub(vcf_file, "^.*\\.vcf", ".vcf" )
        String vcf_base_name = basename( vcf_file, vcf_extension )
        String output_vcf_file = vcf_base_name + ".INFO_ANNOTATIONS_FIXED" + vcf_extension

        call SelectVariantsTask {
            input:
                input_vcf_file             = vcf_file,
                input_vcf_file_index       = vcf_index,
                output_vcf_file            = output_vcf_file,
                info_annotations_to_remove = info_annotations_to_remove,

                gatk_docker                = gatk_docker,
                gatk_override              = gatk4_jar_override,
                mem                        = mem_gb,
                preemptible_attempts       = preemptible_attempts,
                disk_space_gb              = disk_space_gb,
                cpu                        = cpu,
                boot_disk_size_gb          = boot_disk_size_gb
        }
    }

    # One entry per input VCF, gathered from the scatter above.
    output {
        Array[File] vcf_file_with_clean_info_fields       = SelectVariantsTask.vcf_with_cleaned_info_field
        Array[File] vcf_file_with_clean_info_fields_index = SelectVariantsTask.vcf_with_cleaned_info_field_index
    }
}
| 71 | + |
# Task: run GATK SelectVariants on a single VCF, dropping each INFO annotation
# listed in `info_annotations_to_remove`, and record start/end/elapsed time in a
# side file.
#
# Inputs:
#   input_vcf_file             - VCF to process.
#   input_vcf_file_index       - Index of the input VCF. Not referenced in the command;
#                                presumably declared so it is localized next to the VCF
#                                (verify against the backend in use).
#   output_vcf_file            - Name of the VCF to write.
#   info_annotations_to_remove - Names of the INFO annotations to drop.
#   gatk_docker                - Docker image in which to run.
#   gatk_override              - Optional GATK jar; falls back to /root/gatk.jar.
#   mem                        - Optional machine memory in GB.
#   preemptible_attempts       - Optional number of preemptible attempts (defaults to 0).
#   disk_space_gb, cpu, boot_disk_size_gb - Optional machine sizing overrides.
#
# Outputs:
#   vcf_with_cleaned_info_field       - The VCF written by SelectVariants.
#   vcf_with_cleaned_info_field_index - "<output_vcf_file>.<idx|tbi>".
#   timing_info                       - Text file with start, end and elapsed time.
task SelectVariantsTask {

    # ------------------------------------------------
    # Input args:
    File input_vcf_file
    File input_vcf_file_index
    String output_vcf_file
    Array[String] info_annotations_to_remove

    # Runtime Options:
    String gatk_docker
    File? gatk_override
    Int? mem
    Int? preemptible_attempts
    Int? disk_space_gb
    Int? cpu
    Int? boot_disk_size_gb

    # Index extension expected for the output: "idx" when the input's last
    # extension is "vcf", otherwise "tbi".
    String index_format = if sub(input_vcf_file, ".*\\.", "") == "vcf" then "idx" else "tbi"
    String timing_output_file = basename(input_vcf_file) + ".timingInformation.txt"

    # ------------------------------------------------
    # Get machine settings:
    Boolean use_ssd = false

    # You may have to change the following two parameter values depending on the task requirements
    Int default_ram_mb = 1024 * 3
    # WARNING: In the workflow, you should calculate the disk space as an input to this task (disk_space_gb). Please see [TODO: Link from Jose] for examples.
    Int default_disk_space_gb = 100

    Int default_boot_disk_size_gb = 15

    # Mem is in units of GB but our command and memory runtime values are in MB
    Int machine_mem = if defined(mem) then mem * 1024 else default_ram_mb
    # The JVM heap (-Xmx) is set 1024 MB below the machine memory.
    Int command_mem = machine_mem - 1024

    # ------------------------------------------------
    # Run our command:
    command <<<
        set -e
        export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override}

        echo "Disk Space:"
        df -h

        startTime=`date +%s.%N`
        echo "StartTime: $startTime" > ${timing_output_file}

        gatk --java-options "-Xmx${command_mem}m -DGATK_STACKTRACE_ON_USER_EXCEPTION=true" \
            SelectVariants \
                -V ${input_vcf_file} \
                -O ${output_vcf_file} \
                --drop-info-annotation ${sep=" --drop-info-annotation " info_annotations_to_remove}

        endTime=`date +%s.%N`
        echo "EndTime: $endTime" >> ${timing_output_file}
        elapsedTime=`python -c "print( $endTime - $startTime )"`
        echo "Elapsed Time: $elapsedTime" >> ${timing_output_file}
    >>>

    # ------------------------------------------------
    # Runtime settings:
    runtime {
        docker: gatk_docker
        memory: machine_mem + " MB"
        disks: "local-disk " + select_first([disk_space_gb, default_disk_space_gb]) + if use_ssd then " SSD" else " HDD"
        bootDiskSizeGb: select_first([boot_disk_size_gb, default_boot_disk_size_gb])
        # Honor the caller-supplied preemptible_attempts; fall back to 0 (the
        # previously hard-coded value) when it is not provided.
        preemptible: select_first([preemptible_attempts, 0])
        cpu: select_first([cpu, 1])
    }

    # ------------------------------------------------
    # Outputs:
    output {
        File vcf_with_cleaned_info_field = "${output_vcf_file}"
        File vcf_with_cleaned_info_field_index = "${output_vcf_file}.${index_format}"
        File timing_info = timing_output_file
    }
}
| 151 | + |
0 commit comments