Skip to content

Commit 5f6b500

Browse files
authored
CA-404062: Wrongly restart xapi when receiving HTTP errors (#6201)
The xapi on a supporter host would restart when it received an HTTP error from the xapi on the coordinator host. This breaks the pool.designate_new_master use case for a big pool, e.g. a 64-host pool: some supporters may restart unexpectedly during the phase of committing the new coordinator because of the logic above. Additionally, the rationale for this logic, as stated in its error message, is also incorrect: not all HTTP errors are caused by "our master address is wrong". On the other hand, if a use case requires restarting xapi, more explicit logic should ensure that, instead of relying on an implicit HTTP error code. Furthermore, if a supporter is indeed connecting to the wrong coordinator, this is a bug and can be recovered from manually. Based on the above arguments, restarting xapi after receiving an HTTP error is removed. This also follows the TODO concluded in CA-36936.
2 parents 0f1f45c + 9a44916 commit 5f6b500

File tree

1 file changed

+55
-58
lines changed

1 file changed

+55
-58
lines changed

ocaml/database/master_connection.ml

Lines changed: 55 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -221,6 +221,57 @@ let do_db_xml_rpc_persistent_with_reopen ~host:_ ~path (req : string) :
221221
else if !backoff_delay > 256.0 then
222222
backoff_delay := 256.0
223223
in
224+
let reconnect () =
225+
(* RPC failed - there's no way we can recover from this so try reopening connection every 2s + backoff delay *)
226+
( match !my_connection with
227+
| None ->
228+
()
229+
| Some st_proc -> (
230+
my_connection := None ;
231+
(* don't want to try closing multiple times *)
232+
try Stunnel.disconnect st_proc with _ -> ()
233+
)
234+
) ;
235+
let time_sofar = Unix.gettimeofday () -. time_call_started in
236+
if !connection_timeout < 0. then (
237+
if not !surpress_no_timeout_logs then (
238+
debug
239+
"Connection to master died. I will continue to retry indefinitely \
240+
(supressing future logging of this message)." ;
241+
error
242+
"Connection to master died. I will continue to retry indefinitely \
243+
(supressing future logging of this message)."
244+
) ;
245+
surpress_no_timeout_logs := true
246+
) else
247+
debug
248+
"Connection to master died: time taken so far in this call '%f'; will \
249+
%s"
250+
time_sofar
251+
( if !connection_timeout < 0. then
252+
"never timeout"
253+
else
254+
Printf.sprintf "timeout after '%f'" !connection_timeout
255+
) ;
256+
if time_sofar > !connection_timeout && !connection_timeout >= 0. then
257+
if !restart_on_connection_timeout then (
258+
debug "Exceeded timeout for retrying master connection: restarting xapi" ;
259+
!Db_globs.restart_fn ()
260+
) else (
261+
debug
262+
"Exceeded timeout for retrying master connection: raising \
263+
Cannot_connect_to_master" ;
264+
raise Cannot_connect_to_master
265+
) ;
266+
debug "Sleeping %f seconds before retrying master connection..."
267+
!backoff_delay ;
268+
let timed_out = Scheduler.PipeDelay.wait delay !backoff_delay in
269+
if not timed_out then
270+
debug "%s: Sleep interrupted, retrying master connection now" __FUNCTION__ ;
271+
update_backoff_delay () ;
272+
D.log_and_ignore_exn open_secure_connection
273+
in
274+
224275
while not !write_ok do
225276
try
226277
let req_string = req in
@@ -266,67 +317,13 @@ let do_db_xml_rpc_persistent_with_reopen ~host:_ ~path (req : string) :
266317
Db_globs.http_limit_max_rpc_size ;
267318
debug "Re-raising exception to caller." ;
268319
raise Http.Client_requested_size_over_limit
269-
(* TODO: This http exception handler caused CA-36936 and can probably be removed now that there's backoff delay in the generic handler _ below *)
270320
| Http_client.Http_error (http_code, err_msg) ->
271-
error
272-
"Received HTTP error %s (%s) from master. This suggests our master \
273-
address is wrong. Sleeping for %.0fs and then executing restart_fn."
274-
http_code err_msg
275-
!Db_globs.permanent_master_failure_retry_interval ;
276-
Thread.delay !Db_globs.permanent_master_failure_retry_interval ;
277-
!Db_globs.restart_fn ()
321+
error "Received HTTP error %s (%s) from the coordinator" http_code
322+
err_msg ;
323+
reconnect ()
278324
| e ->
279325
error "Caught %s" (Printexc.to_string e) ;
280-
(* RPC failed - there's no way we can recover from this so try reopening connection every 2s + backoff delay *)
281-
( match !my_connection with
282-
| None ->
283-
()
284-
| Some st_proc -> (
285-
my_connection := None ;
286-
(* don't want to try closing multiple times *)
287-
try Stunnel.disconnect st_proc with _ -> ()
288-
)
289-
) ;
290-
let time_sofar = Unix.gettimeofday () -. time_call_started in
291-
if !connection_timeout < 0. then (
292-
if not !surpress_no_timeout_logs then (
293-
debug
294-
"Connection to master died. I will continue to retry \
295-
indefinitely (supressing future logging of this message)." ;
296-
error
297-
"Connection to master died. I will continue to retry \
298-
indefinitely (supressing future logging of this message)."
299-
) ;
300-
surpress_no_timeout_logs := true
301-
) else
302-
debug
303-
"Connection to master died: time taken so far in this call '%f'; \
304-
will %s"
305-
time_sofar
306-
( if !connection_timeout < 0. then
307-
"never timeout"
308-
else
309-
Printf.sprintf "timeout after '%f'" !connection_timeout
310-
) ;
311-
if time_sofar > !connection_timeout && !connection_timeout >= 0. then
312-
if !restart_on_connection_timeout then (
313-
debug
314-
"Exceeded timeout for retrying master connection: restarting xapi" ;
315-
!Db_globs.restart_fn ()
316-
) else (
317-
debug
318-
"Exceeded timeout for retrying master connection: raising \
319-
Cannot_connect_to_master" ;
320-
raise Cannot_connect_to_master
321-
) ;
322-
debug "Sleeping %f seconds before retrying master connection..."
323-
!backoff_delay ;
324-
let timed_out = Scheduler.PipeDelay.wait delay !backoff_delay in
325-
if not timed_out then
326-
debug "%s: Sleep interrupted, retrying master connection now"
327-
__FUNCTION__ ;
328-
update_backoff_delay () ;
329-
D.log_and_ignore_exn open_secure_connection
326+
reconnect ()
330327
done ;
331328
!result
332329

0 commit comments

Comments
 (0)