Skip to content

Commit 6cf6b77

Browse files
authored
CA-403422: lengthen the timeout for xenopsd's serialized tasks (#6192)
Historically, parallel operations were run in tasks as part of serial operations, and serial tasks were not run as part of parallel ones. This changed recently, causing some timeouts that did not happen before. To mitigate this issue, now the timeouts for tasks are 20 minutes per single serialized operation, instead of 20 minutes per task. Passed one of the VDI / VM scalability tests that previously failed reliably: Job 4182581
2 parents 9eeb1f3 + a899232 commit 6cf6b77

File tree

1 file changed

+19
-12
lines changed

1 file changed

+19
-12
lines changed

ocaml/xenopsd/lib/xenops_server.ml

Lines changed: 19 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -278,6 +278,15 @@ let rec name_of_atomic = function
278278
| Best_effort atomic ->
279279
Printf.sprintf "Best_effort (%s)" (name_of_atomic atomic)
280280

281+
(** [atomic_expires_after op] is the timeout budget, in seconds, to allow for
    the atomic operation [op].

    - [Serial] groups: the budgets of the children are summed.
    - [Parallel] groups: the largest child budget is used.
    - [Best_effort] wrappers: the budget of the wrapped operation is used, so
      a wrapped [Serial]/[Parallel] group keeps its per-operation budget
      instead of being collapsed to a single flat timeout.
    - Any other operation: a flat 20 minutes.

    An empty [Serial] or [Parallel] group yields [0.]. *)
let rec atomic_expires_after = function
  | Serial (_, _, ops) ->
      (* One budget per serialized child, accumulated left to right. *)
      List.fold_left (fun total op -> total +. atomic_expires_after op) 0. ops
  | Parallel (_, _, ops) ->
      (* The group is bounded by its most expensive child. *)
      List.fold_left
        (fun longest op -> Float.max longest (atomic_expires_after op))
        0. ops
  | Best_effort op ->
      (* Transparent wrapper: defer to the operation it carries. *)
      atomic_expires_after op
  | _ ->
      (* 20 minutes, in seconds *)
      1200.
289+
281290
type vm_migrate_op = {
282291
vmm_id: Vm.id
283292
; vmm_vdi_map: (string * string) list
@@ -1848,7 +1857,7 @@ let with_tracing ~name ~task f =
18481857
warn "Failed to start tracing: %s" (Printexc.to_string e) ;
18491858
f ()
18501859

1851-
let rec perform_atomic ~progress_callback ?subtask:_ ?result (op : atomic)
1860+
let rec perform_atomic ~progress_callback ?result (op : atomic)
18521861
(t : Xenops_task.task_handle) : unit =
18531862
let module B = (val get_backend () : S) in
18541863
with_tracing ~name:(name_of_atomic op) ~task:t @@ fun () ->
@@ -2341,16 +2350,17 @@ and queue_atomics_and_wait ~progress_callback ~max_parallel_atoms dbg id ops =
23412350
let atom_id =
23422351
Printf.sprintf "%s.chunk=%d.atom=%d" id chunk_idx atom_idx
23432352
in
2344-
queue_atomic_int ~progress_callback dbg atom_id op
2353+
(queue_atomic_int ~progress_callback dbg atom_id op, op)
23452354
)
23462355
ops
23472356
in
23482357
let timeout_start = Unix.gettimeofday () in
23492358
List.map
2350-
(fun task ->
2359+
(fun (task, op) ->
23512360
let task_id = Xenops_task.id_of_handle task in
2361+
let expiration = atomic_expires_after op in
23522362
let completion =
2353-
event_wait updates task ~from ~timeout_start 1200.0
2363+
event_wait updates task ~from ~timeout_start expiration
23542364
(is_task task_id) task_ended
23552365
in
23562366
(task_id, task, completion)
@@ -2386,7 +2396,7 @@ let perform_atomics atomics t =
23862396
progress_callback progress (weight /. total_weight) t
23872397
in
23882398
debug "Performing: %s" (string_of_atomic x) ;
2389-
perform_atomic ~subtask:(string_of_atomic x) ~progress_callback x t ;
2399+
perform_atomic ~progress_callback x t ;
23902400
progress_callback 1. ;
23912401
progress +. (weight /. total_weight)
23922402
)
@@ -2520,8 +2530,7 @@ and trigger_cleanup_after_failure_atom op t =
25202530
| VM_import_metadata _ ->
25212531
()
25222532

2523-
and perform_exn ?subtask ?result (op : operation) (t : Xenops_task.task_handle)
2524-
: unit =
2533+
and perform_exn ?result (op : operation) (t : Xenops_task.task_handle) : unit =
25252534
let module B = (val get_backend () : S) in
25262535
with_tracing ~name:(name_of_operation op) ~task:t @@ fun () ->
25272536
match op with
@@ -2648,9 +2657,7 @@ and perform_exn ?subtask ?result (op : operation) (t : Xenops_task.task_handle)
26482657
(id, vm.Vm.memory_dynamic_min, vm.Vm.memory_dynamic_min)
26492658
in
26502659
let (_ : unit) =
2651-
perform_atomic ~subtask:(string_of_atomic atomic)
2652-
~progress_callback:(fun _ -> ())
2653-
atomic t
2660+
perform_atomic ~progress_callback:(fun _ -> ()) atomic t
26542661
in
26552662
(* Waiting here is not essential but adds a degree of safety and
26562663
reduces unnecessary memory copying. *)
@@ -3162,7 +3169,7 @@ and perform_exn ?subtask ?result (op : operation) (t : Xenops_task.task_handle)
31623169
VUSB_DB.signal id
31633170
| Atomic op ->
31643171
let progress_callback = progress_callback 0. 1. t in
3165-
perform_atomic ~progress_callback ?subtask ?result op t
3172+
perform_atomic ~progress_callback ?result op t
31663173

31673174
and verify_power_state op =
31683175
let module B = (val get_backend () : S) in
@@ -3191,7 +3198,7 @@ and perform ?subtask ?result (op : operation) (t : Xenops_task.task_handle) :
31913198
unit =
31923199
let one op =
31933200
verify_power_state op ;
3194-
try perform_exn ?subtask ?result op t
3201+
try perform_exn ?result op t
31953202
with e ->
31963203
Backtrace.is_important e ;
31973204
info "Caught %s executing %s: triggering cleanup actions"

0 commit comments

Comments
 (0)