Skip to content

Commit ddea018

Browse files
authored
Mux mirror failure check for SXM (#6439)
Continuation of #6434, more multiplexing for SXM, this time the mirror status checking logic. No functional change. More to come...
2 parents 1cfaab9 + bb994e0 commit ddea018

File tree

12 files changed

+213
-70
lines changed

12 files changed

+213
-70
lines changed

ocaml/xapi-idl/storage/storage_interface.ml

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1179,6 +1179,27 @@ module StorageAPI (R : RPC) = struct
11791179
let receive_cancel2 =
11801180
declare "DATA.MIRROR.receive_cancel2" []
11811181
(dbg_p @-> id_p @-> url_p @-> verify_dest_p @-> returning unit_p err)
1182+
1183+
let pre_deactivate_hook =
1184+
declare "DATA.MIRROR.pre_deactivate_hook" []
1185+
(dbg_p @-> dp_p @-> sr_p @-> vdi_p @-> returning unit_p err)
1186+
1187+
let has_mirror_failed =
1188+
let mirror_failed_p =
1189+
Param.mk ~name:"mirror_failed_p" ~description:[] Types.bool
1190+
in
1191+
declare "DATA.MIRROR.has_mirror_failed" []
1192+
(dbg_p @-> id_p @-> sr_p @-> returning mirror_failed_p err)
1193+
1194+
let list =
1195+
let result_p =
1196+
Param.mk ~name:"mirrors" TypeCombinators.(list (pair Mirror.(id, t)))
1197+
in
1198+
declare "DATA.MIRROR.list" [] (dbg_p @-> returning result_p err)
1199+
1200+
let stat =
1201+
let result_p = Param.mk ~name:"result" Mirror.t in
1202+
declare "DATA.MIRROR.stat" [] (dbg_p @-> id_p @-> returning result_p err)
11821203
end
11831204
end
11841205

@@ -1285,6 +1306,16 @@ module type MIRROR = sig
12851306
-> url:string
12861307
-> verify_dest:bool
12871308
-> unit
1309+
1310+
val pre_deactivate_hook :
1311+
context -> dbg:debug_info -> dp:dp -> sr:sr -> vdi:vdi -> unit
1312+
1313+
val has_mirror_failed :
1314+
context -> dbg:debug_info -> mirror_id:Mirror.id -> sr:Sr.t -> bool
1315+
1316+
val list : context -> dbg:debug_info -> (Mirror.id * Mirror.t) list
1317+
1318+
val stat : context -> dbg:debug_info -> id:Mirror.id -> Mirror.t
12881319
end
12891320

12901321
module type Server_impl = sig
@@ -1759,6 +1790,14 @@ module Server (Impl : Server_impl) () = struct
17591790
Impl.DATA.MIRROR.receive_finalize2 () ~dbg ~mirror_id ~sr ~url
17601791
~verify_dest
17611792
) ;
1793+
S.DATA.MIRROR.pre_deactivate_hook (fun dbg dp sr vdi ->
1794+
Impl.DATA.MIRROR.pre_deactivate_hook () ~dbg ~dp ~sr ~vdi
1795+
) ;
1796+
S.DATA.MIRROR.has_mirror_failed (fun dbg mirror_id sr ->
1797+
Impl.DATA.MIRROR.has_mirror_failed () ~dbg ~mirror_id ~sr
1798+
) ;
1799+
S.DATA.MIRROR.list (fun dbg -> Impl.DATA.MIRROR.list () ~dbg) ;
1800+
S.DATA.MIRROR.stat (fun dbg id -> Impl.DATA.MIRROR.stat () ~dbg ~id) ;
17621801
S.DATA.import_activate (fun dbg dp sr vdi vm ->
17631802
Impl.DATA.import_activate () ~dbg ~dp ~sr ~vdi ~vm
17641803
) ;

ocaml/xapi-idl/storage/storage_skeleton.ml

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -182,6 +182,16 @@ module DATA = struct
182182

183183
let receive_cancel2 ctx ~dbg ~mirror_id ~url ~verify_dest =
184184
u "DATA.MIRROR.receive_cancel2"
185+
186+
let pre_deactivate_hook ctx ~dbg ~dp ~sr ~vdi =
187+
u "DATA.MIRROR.pre_deactivate_hook"
188+
189+
let has_mirror_failed ctx ~dbg ~mirror_id ~sr =
190+
u "DATA.MIRROR.has_mirror_failed"
191+
192+
let list ctx ~dbg = u "DATA.MIRROR.list"
193+
194+
let stat ctx ~dbg ~id = u "DATA.MIRROR.stat"
185195
end
186196
end
187197

ocaml/xapi-storage-cli/main.ml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -149,7 +149,7 @@ let string_of_file filename =
149149

150150
let mirror_list common_opts =
151151
wrap common_opts (fun () ->
152-
let list = Storage_migrate.list ~dbg in
152+
let list = Client.DATA.MIRROR.list dbg in
153153
List.iter
154154
(fun (id, status) -> Printf.printf "%s" (string_of_mirror id status))
155155
list

ocaml/xapi-storage-script/main.ml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1927,6 +1927,10 @@ let bind ~volume_script_dir =
19271927
S.DATA.MIRROR.receive_finalize2 (u "DATA.MIRROR.receive_finalize2") ;
19281928
S.DATA.MIRROR.receive_cancel (u "DATA.MIRROR.receive_cancel") ;
19291929
S.DATA.MIRROR.receive_cancel2 (u "DATA.MIRROR.receive_cancel2") ;
1930+
S.DATA.MIRROR.pre_deactivate_hook (u "DATA.MIRROR.pre_deactivate_hook") ;
1931+
S.DATA.MIRROR.has_mirror_failed (u "DATA.MIRROR.has_mirror_failed") ;
1932+
S.DATA.MIRROR.list (u "DATA.MIRROR.list") ;
1933+
S.DATA.MIRROR.stat (u "DATA.MIRROR.stat") ;
19301934
S.DP.create (u "DP.create") ;
19311935
S.TASK.cancel (u "TASK.cancel") ;
19321936
S.TASK.list (u "TASK.list") ;

ocaml/xapi/storage_access.ml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -446,7 +446,7 @@ let update_task ~__context id =
446446
let update_mirror ~__context id =
447447
try
448448
let dbg = Context.string_of_task __context in
449-
let m = Storage_migrate.stat ~dbg ~id in
449+
let m = Client.DATA.MIRROR.stat dbg id in
450450
if m.Mirror.failed then
451451
debug "Mirror %s has failed" id ;
452452
let task = get_mirror_task id in

ocaml/xapi/storage_migrate.ml

Lines changed: 8 additions & 65 deletions
Original file line numberDiff line numberDiff line change
@@ -209,28 +209,20 @@ module MigrateLocal = struct
209209
stop ~dbg ~id:mirror_id ;
210210
raise e
211211

212-
let stat ~dbg:_ ~id =
212+
let stat ~dbg ~id =
213213
let recv_opt = State.find_active_receive_mirror id in
214214
let send_opt = State.find_active_local_mirror id in
215215
let copy_opt = State.find_active_copy id in
216+
let sr, _vdi = State.of_mirror_id id in
216217
let open State in
217218
let failed =
218219
match send_opt with
219220
| Some send_state ->
221+
let (module Migrate_Backend) = choose_backend dbg sr in
220222
let failed =
221-
match send_state.Send_state.tapdev with
222-
| Some tapdev -> (
223-
try
224-
let stats = Tapctl.stats (Tapctl.create ()) tapdev in
225-
stats.Tapctl.Stats.nbd_mirror_failed = 1
226-
with _ ->
227-
debug "Using cached copy of failure status" ;
228-
send_state.Send_state.failed
229-
)
230-
| None ->
231-
false
223+
Migrate_Backend.has_mirror_failed () ~dbg ~mirror_id:id ~sr
232224
in
233-
send_state.Send_state.failed <- failed ;
225+
send_state.failed <- failed ;
234226
failed
235227
| None ->
236228
false
@@ -325,58 +317,9 @@ module MigrateLocal = struct
325317
State.clear ()
326318
end
327319

328-
exception Timeout of Mtime.Span.t
329-
330-
let reqs_outstanding_timeout = Mtime.Span.(150 * s)
331-
332-
let pp_time () = Fmt.str "%a" Mtime.Span.pp
333-
334-
(* Tapdisk should time out after 2 mins. We can wait a little longer *)
335-
336-
let pre_deactivate_hook ~dbg:_ ~dp:_ ~sr ~vdi =
337-
let open State.Send_state in
338-
let id = State.mirror_id_of (sr, vdi) in
339-
let start = Mtime_clock.counter () in
340-
State.find_active_local_mirror id
341-
|> Option.iter (fun s ->
342-
(* We used to pause here and then check the nbd_mirror_failed key. Now, we poll
343-
until the number of outstanding requests has gone to zero, then check the
344-
status. This avoids confusing the backend (CA-128460) *)
345-
try
346-
match s.tapdev with
347-
| None ->
348-
()
349-
| Some tapdev ->
350-
let open Tapctl in
351-
let ctx = create () in
352-
let rec wait () =
353-
let elapsed = Mtime_clock.count start in
354-
if Mtime.Span.compare elapsed reqs_outstanding_timeout > 0 then
355-
raise (Timeout elapsed) ;
356-
let st = stats ctx tapdev in
357-
if st.Stats.reqs_outstanding > 0 then (
358-
Thread.delay 1.0 ; wait ()
359-
) else
360-
(st, elapsed)
361-
in
362-
let st, elapsed = wait () in
363-
debug "Got final stats after waiting %a" pp_time elapsed ;
364-
if st.Stats.nbd_mirror_failed = 1 then (
365-
error "tapdisk reports mirroring failed" ;
366-
s.failed <- true
367-
)
368-
with
369-
| Timeout elapsed ->
370-
error
371-
"Timeout out after %a waiting for tapdisk to complete all \
372-
outstanding requests"
373-
pp_time elapsed ;
374-
s.failed <- true
375-
| e ->
376-
error "Caught exception while finally checking mirror state: %s"
377-
(Printexc.to_string e) ;
378-
s.failed <- true
379-
)
320+
let pre_deactivate_hook ~dbg ~dp ~sr ~vdi =
321+
let (module Migrate_Backend) = choose_backend dbg sr in
322+
Migrate_Backend.pre_deactivate_hook () ~dbg ~dp ~sr ~vdi
380323

381324
let post_deactivate_hook ~sr ~vdi ~dp:_ =
382325
let open State.Send_state in

ocaml/xapi/storage_mux.ml

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -857,6 +857,22 @@ module Mux = struct
857857

858858
let receive_cancel2 () ~dbg:_ ~mirror_id:_ ~url:_ ~verify_dest:_ =
859859
u __FUNCTION__
860+
861+
let pre_deactivate_hook _ctx ~dbg:_ ~dp:_ ~sr:_ ~vdi:_ =
862+
u "DATA.MIRROR.pre_deactivate_hook"
863+
864+
let has_mirror_failed _ctx ~dbg:_ ~mirror_id:_ ~sr:_ =
865+
u "DATA.MIRROR.has_mirror_failed"
866+
867+
let list () ~dbg =
868+
with_dbg ~name:"DATA.MIRROR.list" ~dbg @@ fun di ->
869+
info "%s dbg: %s" __FUNCTION__ dbg ;
870+
Storage_migrate.list ~dbg:di.log
871+
872+
let stat () ~dbg ~id =
873+
with_dbg ~name:"DATA.MIRROR.stat" ~dbg @@ fun di ->
874+
info "%s dbg: %s mirror_id: %s" __FUNCTION__ di.log id ;
875+
Storage_migrate.stat ~dbg:di.log ~id
860876
end
861877
end
862878

ocaml/xapi/storage_smapiv1.ml

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1157,6 +1157,14 @@ module SMAPIv1 : Server_impl = struct
11571157

11581158
let receive_cancel2 _context ~dbg:_ ~mirror_id:_ ~url:_ ~verify_dest:_ =
11591159
assert false
1160+
1161+
let pre_deactivate_hook _context ~dbg:_ ~dp:_ ~sr:_ ~vdi:_ = assert false
1162+
1163+
let has_mirror_failed _context ~dbg:_ ~mirror_id:_ ~sr:_ = assert false
1164+
1165+
let list _context ~dbg:_ = assert false
1166+
1167+
let stat _context ~dbg:_ ~id:_ = assert false
11601168
end
11611169
end
11621170

0 commit comments

Comments
 (0)