Skip to content

Commit b69a644

Browse files
committed
Restore array output in gCNV WDLs for efficient postprocessing.
1 parent 079d34a commit b69a644

4 files changed

+66
-17
lines changed

scripts/cnv_wdl/cnv_common_tasks.wdl

+17-2
Original file line number | Diff line number | Diff line change
@@ -321,6 +321,10 @@ task PostprocessGermlineCNVCalls {
321321
String entity_id
322322
Array[File] gcnv_calls_tars
323323
Array[File] gcnv_model_tars
324+
Array[File] calling_configs
325+
Array[File] denoising_configs
326+
Array[File] gcnvkernel_version
327+
Array[File] sharded_interval_lists
324328
File contig_ploidy_calls_tar
325329
Array[String]? allosomal_contigs
326330
Int ref_copy_number_autosomal_contigs
@@ -349,13 +353,24 @@ task PostprocessGermlineCNVCalls {
349353
set -e
350354
export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk4_jar_override}
351355

356+
sharded_interval_lists_array=(${sep=" " sharded_interval_lists})
357+
352358
# untar calls to CALLS_0, CALLS_1, etc. directories and build the command line
359+
# also copy over shard config and interval files
353360
gcnv_calls_tar_array=(${sep=" " gcnv_calls_tars})
361+
calling_configs_array=(${sep=" " calling_configs})
362+
denoising_configs_array=(${sep=" " denoising_configs})
363+
gcnvkernel_version_array=(${sep=" " gcnvkernel_version})
364+
sharded_interval_lists_array=(${sep=" " sharded_interval_lists})
354365
calls_args=""
355366
for index in ${dollar}{!gcnv_calls_tar_array[@]}; do
356367
gcnv_calls_tar=${dollar}{gcnv_calls_tar_array[$index]}
357-
mkdir CALLS_$index
358-
tar xzf $gcnv_calls_tar -C CALLS_$index
368+
mkdir -p CALLS_$index/SAMPLE_${sample_index}
369+
tar xzf $gcnv_calls_tar -C CALLS_$index/SAMPLE_${sample_index}
370+
cp ${dollar}{calling_configs_array[$index]} CALLS_$index/
371+
cp ${dollar}{denoising_configs_array[$index]} CALLS_$index/
372+
cp ${dollar}{gcnvkernel_version_array[$index]} CALLS_$index/
373+
cp ${dollar}{sharded_interval_lists_array[$index]} CALLS_$index/
359374
calls_args="$calls_args --calls-shard-path CALLS_$index"
360375
done
361376

scripts/cnv_wdl/germline/cnv_germline_case_scattered_workflow.wdl

+1-1
Original file line number | Diff line number | Diff line change
@@ -187,7 +187,7 @@ workflow CNVGermlineCaseScatteredWorkflow {
187187
Array[Array[File]] read_counts_entity_id = CNVGermlineCaseWorkflow.read_counts_entity_id
188188
Array[Array[File]] read_counts = CNVGermlineCaseWorkflow.read_counts
189189
Array[File] contig_ploidy_calls_tars = CNVGermlineCaseWorkflow.contig_ploidy_calls_tar
190-
Array[Array[File]] gcnv_calls_tars = CNVGermlineCaseWorkflow.gcnv_calls_tars
190+
Array[Array[Array[File]]] gcnv_calls_tars = CNVGermlineCaseWorkflow.gcnv_calls_tars
191191
Array[Array[File]] gcnv_tracking_tars = CNVGermlineCaseWorkflow.gcnv_tracking_tars
192192
Array[Array[File]] genotyped_intervals_vcf = CNVGermlineCaseWorkflow.genotyped_intervals_vcf
193193
Array[Array[File]] genotyped_segments_vcf = CNVGermlineCaseWorkflow.genotyped_segments_vcf

scripts/cnv_wdl/germline/cnv_germline_case_workflow.wdl

+23-6
Original file line number | Diff line number | Diff line change
@@ -211,12 +211,18 @@ workflow CNVGermlineCaseWorkflow {
211211
}
212212
}
213213

214+
Array[Array[File]] call_tars_sample_by_shard = transpose(GermlineCNVCallerCaseMode.gcnv_call_tars)
215+
214216
scatter (sample_index in range(length(normal_bams))) {
215217
call CNVTasks.PostprocessGermlineCNVCalls {
216218
input:
217219
entity_id = CollectCounts.entity_id[sample_index],
218-
gcnv_calls_tars = GermlineCNVCallerCaseMode.gcnv_calls_tar,
220+
gcnv_calls_tars = call_tars_sample_by_shard[sample_index],
219221
gcnv_model_tars = gcnv_model_tars,
222+
calling_configs = GermlineCNVCallerCaseMode.calling_config_json,
223+
denoising_configs = GermlineCNVCallerCaseMode.denoising_config_json,
224+
gcnvkernel_version = GermlineCNVCallerCaseMode.gcnvkernel_version_json,
225+
sharded_interval_lists = GermlineCNVCallerCaseMode.sharded_interval_list,
220226
allosomal_contigs = allosomal_contigs,
221227
ref_copy_number_autosomal_contigs = ref_copy_number_autosomal_contigs,
222228
contig_ploidy_calls_tar = DetermineGermlineContigPloidyCaseMode.contig_ploidy_calls_tar,
@@ -232,7 +238,7 @@ workflow CNVGermlineCaseWorkflow {
232238
Array[File] read_counts_entity_id = CollectCounts.entity_id
233239
Array[File] read_counts = CollectCounts.counts
234240
File contig_ploidy_calls_tar = DetermineGermlineContigPloidyCaseMode.contig_ploidy_calls_tar
235-
Array[File] gcnv_calls_tars = GermlineCNVCallerCaseMode.gcnv_calls_tar
241+
Array[Array[File]] gcnv_calls_tars = GermlineCNVCallerCaseMode.gcnv_call_tars
236242
Array[File] gcnv_tracking_tars = GermlineCNVCallerCaseMode.gcnv_tracking_tar
237243
Array[File] genotyped_intervals_vcf = PostprocessGermlineCNVCalls.genotyped_intervals_vcf
238244
Array[File] genotyped_segments_vcf = PostprocessGermlineCNVCalls.genotyped_segments_vcf
@@ -355,6 +361,9 @@ task GermlineCNVCallerCaseMode {
355361

356362
# If optional output_dir not specified, use "out"
357363
String output_dir_ = select_first([output_dir, "out"])
364+
Int num_samples = length(read_count_files)
365+
366+
String dollar = "$" #WDL workaround, see https://github.com/broadinstitute/cromwell/issues/1819
358367

359368
command <<<
360369
set -e
@@ -406,8 +415,16 @@ task GermlineCNVCallerCaseMode {
406415
--caller-external-admixing-rate ${default="1.00" caller_external_admixing_rate} \
407416
--disable-annealing ${default="false" disable_annealing}
408417

409-
tar czf case-gcnv-calls-${scatter_index}.tar.gz -C ${output_dir_}/case-calls .
410-
tar czf case-gcnv-tracking-${scatter_index}.tar.gz -C ${output_dir_}/case-tracking .
418+
tar czf case-gcnv-tracking-shard-${scatter_index}.tar.gz -C ${output_dir_}/case-tracking .
419+
420+
CURRENT_SAMPLE=0
421+
NUM_SAMPLES=${num_samples}
422+
NUM_DIGITS=${dollar}{#NUM_SAMPLES}
423+
while [ $CURRENT_SAMPLE -lt $NUM_SAMPLES ]; do
424+
CURRENT_SAMPLE_WITH_LEADING_ZEROS=$(printf "%0${dollar}{NUM_DIGITS}d" $CURRENT_SAMPLE)
425+
tar czf case-gcnv-calls-shard-${scatter_index}-sample-$CURRENT_SAMPLE_WITH_LEADING_ZEROS.tar.gz -C ${output_dir_}/case-calls/SAMPLE_$CURRENT_SAMPLE .
426+
let CURRENT_SAMPLE=CURRENT_SAMPLE+1
427+
done
411428
>>>
412429

413430
runtime {
@@ -419,8 +436,8 @@ task GermlineCNVCallerCaseMode {
419436
}
420437

421438
output {
422-
File gcnv_calls_tar = "case-gcnv-calls-${scatter_index}.tar.gz"
423-
File gcnv_tracking_tar = "case-gcnv-tracking-${scatter_index}.tar.gz"
439+
Array[File] gcnv_call_tars = glob("case-gcnv-calls-shard-${scatter_index}-sample-*.tar.gz")
440+
File gcnv_tracking_tar = "case-gcnv-tracking-shard-${scatter_index}.tar.gz"
424441
File calling_config_json = "${output_dir_}/case-calls/calling_config.json"
425442
File denoising_config_json = "${output_dir_}/case-calls/denoising_config.json"
426443
File gcnvkernel_version_json = "${output_dir_}/case-calls/gcnvkernel_version.json"

scripts/cnv_wdl/germline/cnv_germline_cohort_workflow.wdl

+25-8
Original file line number | Diff line number | Diff line change
@@ -304,12 +304,18 @@ workflow CNVGermlineCohortWorkflow {
304304
}
305305
}
306306

307+
Array[Array[File]] call_tars_sample_by_shard = transpose(GermlineCNVCallerCohortMode.gcnv_call_tars)
308+
307309
scatter (sample_index in range(length(CollectCounts.entity_id))) {
308310
call CNVTasks.PostprocessGermlineCNVCalls {
309311
input:
310312
entity_id = CollectCounts.entity_id[sample_index],
311-
gcnv_calls_tars = GermlineCNVCallerCohortMode.gcnv_calls_tar,
313+
gcnv_calls_tars = call_tars_sample_by_shard[sample_index],
312314
gcnv_model_tars = GermlineCNVCallerCohortMode.gcnv_model_tar,
315+
calling_configs = GermlineCNVCallerCohortMode.calling_config_json,
316+
denoising_configs = GermlineCNVCallerCohortMode.denoising_config_json,
317+
gcnvkernel_version = GermlineCNVCallerCohortMode.gcnvkernel_version_json,
318+
sharded_interval_lists = GermlineCNVCallerCohortMode.sharded_interval_list,
313319
contig_ploidy_calls_tar = DetermineGermlineContigPloidyCohortMode.contig_ploidy_calls_tar,
314320
allosomal_contigs = allosomal_contigs,
315321
ref_copy_number_autosomal_contigs = ref_copy_number_autosomal_contigs,
@@ -329,7 +335,7 @@ workflow CNVGermlineCohortWorkflow {
329335
File contig_ploidy_model_tar = DetermineGermlineContigPloidyCohortMode.contig_ploidy_model_tar
330336
File contig_ploidy_calls_tar = DetermineGermlineContigPloidyCohortMode.contig_ploidy_calls_tar
331337
Array[File] gcnv_model_tars = GermlineCNVCallerCohortMode.gcnv_model_tar
332-
Array[File] gcnv_calls_tars = GermlineCNVCallerCohortMode.gcnv_calls_tar
338+
Array[Array[File]] gcnv_calls_tars = GermlineCNVCallerCohortMode.gcnv_call_tars
333339
Array[File] gcnv_tracking_tars = GermlineCNVCallerCohortMode.gcnv_tracking_tar
334340
Array[File] genotyped_intervals_vcfs = PostprocessGermlineCNVCalls.genotyped_intervals_vcf
335341
Array[File] genotyped_segments_vcfs = PostprocessGermlineCNVCalls.genotyped_segments_vcf
@@ -470,6 +476,9 @@ task GermlineCNVCallerCohortMode {
470476

471477
# If optional output_dir not specified, use "out"
472478
String output_dir_ = select_first([output_dir, "out"])
479+
Int num_samples = length(read_count_files)
480+
481+
String dollar = "$" #WDL workaround, see https://github.com/broadinstitute/cromwell/issues/1819
473482
474483
command <<<
475484
set -e
@@ -529,9 +538,17 @@ task GermlineCNVCallerCohortMode {
529538
--caller-external-admixing-rate ${default="1.00" caller_external_admixing_rate} \
530539
--disable-annealing ${default="false" disable_annealing}
531540

532-
tar czf ${cohort_entity_id}-gcnv-model-${scatter_index}.tar.gz -C ${output_dir_}/${cohort_entity_id}-model .
533-
tar czf ${cohort_entity_id}-gcnv-calls-${scatter_index}.tar.gz -C ${output_dir_}/${cohort_entity_id}-calls .
534-
tar czf ${cohort_entity_id}-gcnv-tracking-${scatter_index}.tar.gz -C ${output_dir_}/${cohort_entity_id}-tracking .
541+
tar czf ${cohort_entity_id}-gcnv-model-shard-${scatter_index}.tar.gz -C ${output_dir_}/${cohort_entity_id}-model .
542+
tar czf ${cohort_entity_id}-gcnv-tracking-shard-${scatter_index}.tar.gz -C ${output_dir_}/${cohort_entity_id}-tracking .
543+
544+
CURRENT_SAMPLE=0
545+
NUM_SAMPLES=${num_samples}
546+
NUM_DIGITS=${dollar}{#NUM_SAMPLES}
547+
while [ $CURRENT_SAMPLE -lt $NUM_SAMPLES ]; do
548+
CURRENT_SAMPLE_WITH_LEADING_ZEROS=$(printf "%0${dollar}{NUM_DIGITS}d" $CURRENT_SAMPLE)
549+
tar czf ${cohort_entity_id}-gcnv-calls-shard-${scatter_index}-sample-$CURRENT_SAMPLE_WITH_LEADING_ZEROS.tar.gz -C ${output_dir_}/${cohort_entity_id}-calls/SAMPLE_$CURRENT_SAMPLE .
550+
let CURRENT_SAMPLE=CURRENT_SAMPLE+1
551+
done
535552
>>>
536553

537554
runtime {
@@ -543,9 +560,9 @@ task GermlineCNVCallerCohortMode {
543560
}
544561

545562
output {
546-
File gcnv_model_tar = "${cohort_entity_id}-gcnv-model-${scatter_index}.tar.gz"
547-
File gcnv_calls_tar = "${cohort_entity_id}-gcnv-calls-${scatter_index}.tar.gz"
548-
File gcnv_tracking_tar = "${cohort_entity_id}-gcnv-tracking-${scatter_index}.tar.gz"
563+
File gcnv_model_tar = "${cohort_entity_id}-gcnv-model-shard-${scatter_index}.tar.gz"
564+
Array[File] gcnv_call_tars = glob("${cohort_entity_id}-gcnv-calls-shard-${scatter_index}-sample-*.tar.gz")
565+
File gcnv_tracking_tar = "${cohort_entity_id}-gcnv-tracking-shard-${scatter_index}.tar.gz"
549566
File calling_config_json = "${output_dir_}/${cohort_entity_id}-calls/calling_config.json"
550567
File denoising_config_json = "${output_dir_}/${cohort_entity_id}-calls/denoising_config.json"
551568
File gcnvkernel_version_json = "${output_dir_}/${cohort_entity_id}-calls/gcnvkernel_version.json"

0 commit comments

Comments
 (0)