Commit ce5abab

Update feature/perf from master (#6167)

And solve conflicts.

The conflict resolution can be reviewed locally with this command if you have a new enough version of `git`:

```
git log --remerge-diff -1 81146d223d3d5cd449105ecef713cfd57e2c1853
```

The conflicts are mostly due to:

* with_tracing being removed from the Http library
* the xapi_periodic_scheduler being moved to xapi-stdext-threads
* a slightly different version of the concurrent PR being merged between the 2 branches (`vm` instead of `vm'`)

For convenience here is the output of that command:

```diff
commit 81146d223d3d5cd449105ecef713cfd57e2c1853
Merge: 59da2e0 d8baca7
Author: Edwin Török <[email protected]>
Date:   Tue Dec 10 13:43:23 2024 +0000

    Merge master into feature/perf

    Fix conflicts in http.ml (with_tracing got moved), and periodic scheduler
    (got moved to xapi-stdext-threads).

    Signed-off-by: Edwin Török <[email protected]>

diff --git a/ocaml/libs/http-lib/http.ml b/ocaml/libs/http-lib/http.ml
remerge CONFLICT (content): Merge conflict in ocaml/libs/http-lib/http.ml
index ca635e2ba0..c979e1f 100644
--- a/ocaml/libs/http-lib/http.ml
+++ b/ocaml/libs/http-lib/http.ml
@@ -132,16 +132,8 @@ module Hdr = struct
 
   let location = "location"
 
-<<<<<<< 59da2e0 (CA-388564: move qemu-dm to vm.slice (#6150))
   let originator = "originator"
 
-  let traceparent = "traceparent"
-
-||||||| 77dd474
-  let traceparent = "traceparent"
-
-=======
->>>>>>> d8baca7 (CA-390025: do not override SR's client-set metadata on update (#6165))
   let hsts = "strict-transport-security"
 end
@@ -684,7 +676,6 @@ module Request = struct
     let headers, body = to_headers_and_body x in
     let frame_header = if x.frame then make_frame_header headers else "" in
     frame_header ^ headers ^ body
-<<<<<<< 59da2e0 (CA-388564: move qemu-dm to vm.slice (#6150))
 
   let with_originator_of req f =
     Option.iter
@@ -693,55 +684,6 @@ module Request = struct
         f originator
       )
      req
-
-  let traceparent_of req =
-    let open Tracing in
-    let ( let* ) = Option.bind in
-    let* traceparent = req.traceparent in
-    let* span_context = SpanContext.of_traceparent traceparent in
-    let span = Tracer.span_of_span_context span_context req.uri in
-    Some span
-
-  let with_tracing ?attributes ~name req f =
-    let open Tracing in
-    let parent = traceparent_of req in
-    with_child_trace ?attributes parent ~name (fun (span : Span.t option) ->
-        match span with
-        | Some span ->
-            let traceparent =
-              Some (span |> Span.get_context |> SpanContext.to_traceparent)
-            in
-            let req = {req with traceparent} in
-            f req
-        | None ->
-            f req
-    )
-||||||| 77dd474
-
-  let traceparent_of req =
-    let open Tracing in
-    let ( let* ) = Option.bind in
-    let* traceparent = req.traceparent in
-    let* span_context = SpanContext.of_traceparent traceparent in
-    let span = Tracer.span_of_span_context span_context req.uri in
-    Some span
-
-  let with_tracing ?attributes ~name req f =
-    let open Tracing in
-    let parent = traceparent_of req in
-    with_child_trace ?attributes parent ~name (fun (span : Span.t option) ->
-        match span with
-        | Some span ->
-            let traceparent =
-              Some (span |> Span.get_context |> SpanContext.to_traceparent)
-            in
-            let req = {req with traceparent} in
-            f req
-        | None ->
-            f req
-    )
-=======
->>>>>>> d8baca7 (CA-390025: do not override SR's client-set metadata on update (#6165))
 end
 
 module Response = struct
diff --git a/ocaml/libs/http-lib/http.mli b/ocaml/libs/http-lib/http.mli
remerge CONFLICT (content): Merge conflict in ocaml/libs/http-lib/http.mli
index e77a9ebd5a..114ddbc 100644
--- a/ocaml/libs/http-lib/http.mli
+++ b/ocaml/libs/http-lib/http.mli
@@ -126,22 +126,8 @@ module Request : sig
   val to_wire_string : t -> string
   (** [to_wire_string t] returns a string which could be sent to a server *)
 
-<<<<<<< 59da2e0 (CA-388564: move qemu-dm to vm.slice (#6150))
   val with_originator_of : t option -> (string option -> unit) -> unit
-
-  val traceparent_of : t -> Tracing.Span.t option
-
-  val with_tracing :
-    ?attributes:(string * string) list -> name:string -> t -> (t -> 'a) -> 'a
-||||||| 77dd474
-
-  val traceparent_of : t -> Tracing.Span.t option
-
-  val with_tracing :
-    ?attributes:(string * string) list -> name:string -> t -> (t -> 'a) -> 'a
-=======
->>>>>>> d8baca7 (CA-390025: do not override SR's client-set metadata on update (#6165))
 end
 
 (** Parsed form of the HTTP response *)
diff --git a/ocaml/tests/dune b/ocaml/tests/dune
index b51bbca..ce8fe96 100644
--- a/ocaml/tests/dune
+++ b/ocaml/tests/dune
@@ -118,6 +118,7 @@
   xapi-types
   xapi-stdext-date
   xapi-stdext-threads
+  xapi-stdext-threads.scheduler
   xapi-stdext-unix
   xml-light2
   yojson
diff --git a/ocaml/tests/test_event.ml b/ocaml/tests/test_event.ml
index d36dba9..821bb3b 100644
--- a/ocaml/tests/test_event.ml
+++ b/ocaml/tests/test_event.ml
@@ -287,7 +287,7 @@ let test_short_oneshot () =
     started := true ;
     Condition.broadcast cond ;
     Mutex.unlock m ;
-    Xapi_periodic_scheduler.loop ()
+    Xapi_stdext_threads_scheduler.Scheduler.loop ()
   in
   ignore (Thread.create scheduler ()) ;
   (* ensure scheduler sees an empty queue , by waiting for it to start *)
@@ -303,8 +303,8 @@ let test_short_oneshot () =
   let fired = Atomic.make false in
   let fire () = Atomic.set fired true in
   let task = "test_oneshot" in
-  Xapi_periodic_scheduler.add_to_queue task Xapi_periodic_scheduler.OneShot 1.
-    fire ;
+  Xapi_stdext_threads_scheduler.Scheduler.add_to_queue task
+    Xapi_stdext_threads_scheduler.Scheduler.OneShot 1. fire ;
   Thread.delay 2. ;
   assert (Atomic.get fired)
diff --git a/ocaml/xapi-storage-script/main.ml b/ocaml/xapi-storage-script/main.ml
remerge CONFLICT (content): Merge conflict in ocaml/xapi-storage-script/main.ml
index 1ec4d9dcb1..cba7ec8 100644
--- a/ocaml/xapi-storage-script/main.ml
+++ b/ocaml/xapi-storage-script/main.ml
@@ -66,7 +66,6 @@ let backend_backtrace_error name args backtrace =
 let missing_uri () =
   backend_error "MISSING_URI" ["Please include a URI in the device-config"]
 
-<<<<<<< 59da2e0 (CA-388564: move qemu-dm to vm.slice (#6150))
 (** return a unique 'domain' string for Dom0, so that we can plug disks
     multiple times (e.g. for copy).
 
@@ -84,26 +83,6 @@ let domain_of ~dp ~vm =
   | _ ->
       vm
 
-||||||| 77dd474
-=======
-(** return a unique 'domain' string for Dom0, so that we can plug disks
-    multiple times (e.g. for copy).
-
-    XAPI should give us a unique 'dp' (datapath) string, e.g. a UUID for storage migration,
-    or vbd/domid/device.
-    For regular guests keep the domain as passed by XAPI (an integer).
- *)
-let domain_of ~dp ~vm' =
-  let vm = Storage_interface.Vm.string_of vm' in
-  match vm with
-  | "0" ->
-      (* SM tries to use this in filesystem paths, so cannot have /,
-         and systemd might be a bit unhappy with - *)
-      "u0-" ^ dp |> String.map (function '/' | '-' -> '_' | c -> c)
-  | _ ->
-      vm
-
->>>>>>> d8baca7 (CA-390025: do not override SR's client-set metadata on update (#6165))
 (** Functions to wrap calls to the above client modules and convert their
     exceptions and errors into SMAPIv2 errors of type
     [Storage_interface.Exception.exnty]. The above client modules should only
@@ -1476,21 +1455,9 @@ let bind ~volume_script_dir =
     |> wrap
   in
   S.VDI.introduce vdi_introduce_impl ;
-<<<<<<< 59da2e0 (CA-388564: move qemu-dm to vm.slice (#6150))
   let vdi_attach3_impl dbg dp sr vdi' vm _readwrite =
-||||||| 77dd474
-  let vdi_attach3_impl dbg _dp sr vdi' vm' _readwrite =
-=======
-  let vdi_attach3_impl dbg dp sr vdi' vm' _readwrite =
->>>>>>> d8baca7 (CA-390025: do not override SR's client-set metadata on update (#6165))
     (let vdi = Storage_interface.Vdi.string_of vdi' in
-<<<<<<< 59da2e0 (CA-388564: move qemu-dm to vm.slice (#6150))
      let domain = domain_of ~dp ~vm in
-||||||| 77dd474
-     let domain = Storage_interface.Vm.string_of vm' in
-=======
-     let domain = domain_of ~dp ~vm' in
->>>>>>> d8baca7 (CA-390025: do not override SR's client-set metadata on update (#6165))
      vdi_attach_common dbg sr vdi domain >>>= fun response ->
      let convert_implementation = function
        | Xapi_storage.Data.XenDisk {params; extra; backend_type} ->
@@ -1512,21 +1479,9 @@ let bind ~volume_script_dir =
     |> wrap
   in
   S.VDI.attach3 vdi_attach3_impl ;
-<<<<<<< 59da2e0 (CA-388564: move qemu-dm to vm.slice (#6150))
   let vdi_activate_common dbg dp sr vdi' vm readonly =
-||||||| 77dd474
-  let vdi_activate_common dbg sr vdi' vm' readonly =
-=======
-  let vdi_activate_common dbg dp sr vdi' vm' readonly =
->>>>>>> d8baca7 (CA-390025: do not override SR's client-set metadata on update (#6165))
     (let vdi = Storage_interface.Vdi.string_of vdi' in
-<<<<<<< 59da2e0 (CA-388564: move qemu-dm to vm.slice (#6150))
      let domain = domain_of ~dp ~vm in
-||||||| 77dd474
-     let domain = Storage_interface.Vm.string_of vm' in
-=======
-     let domain = domain_of ~dp ~vm' in
->>>>>>> d8baca7 (CA-390025: do not override SR's client-set metadata on update (#6165))
      Attached_SRs.find sr >>>= fun sr ->
      (* Discover the URIs using Volume.stat *)
      stat ~dbg ~sr ~vdi >>>= fun response ->
@@ -1551,45 +1506,17 @@ let bind ~volume_script_dir =
       )
     |> wrap
   in
-<<<<<<< 59da2e0 (CA-388564: move qemu-dm to vm.slice (#6150))
   let vdi_activate3_impl dbg dp sr vdi' vm =
     vdi_activate_common dbg dp sr vdi' vm false
-||||||| 77dd474
-  let vdi_activate3_impl dbg _dp sr vdi' vm' =
-    vdi_activate_common dbg sr vdi' vm' false
-=======
-  let vdi_activate3_impl dbg dp sr vdi' vm' =
-    vdi_activate_common dbg dp sr vdi' vm' false
->>>>>>> d8baca7 (CA-390025: do not override SR's client-set metadata on update (#6165))
   in
   S.VDI.activate3 vdi_activate3_impl ;
-<<<<<<< 59da2e0 (CA-388564: move qemu-dm to vm.slice (#6150))
   let vdi_activate_readonly_impl dbg dp sr vdi' vm =
    vdi_activate_common dbg dp sr vdi' vm true
-||||||| 77dd474
-  let vdi_activate_readonly_impl dbg _dp sr vdi' vm' =
-    vdi_activate_common dbg sr vdi' vm' true
-=======
-  let vdi_activate_readonly_impl dbg dp sr vdi' vm' =
-    vdi_activate_common dbg dp sr vdi' vm' true
->>>>>>> d8baca7 (CA-390025: do not override SR's client-set metadata on update (#6165))
   in
   S.VDI.activate_readonly vdi_activate_readonly_impl ;
-<<<<<<< 59da2e0 (CA-388564: move qemu-dm to vm.slice (#6150))
   let vdi_deactivate_impl dbg dp sr vdi' vm =
-||||||| 77dd474
-  let vdi_deactivate_impl dbg _dp sr vdi' vm' =
-=======
-  let vdi_deactivate_impl dbg dp sr vdi' vm' =
->>>>>>> d8baca7 (CA-390025: do not override SR's client-set metadata on update (#6165))
     (let vdi = Storage_interface.Vdi.string_of vdi' in
-<<<<<<< 59da2e0 (CA-388564: move qemu-dm to vm.slice (#6150))
      let domain = domain_of ~dp ~vm in
-||||||| 77dd474
-     let domain = Storage_interface.Vm.string_of vm' in
-=======
-     let domain = domain_of ~dp ~vm' in
->>>>>>> d8baca7 (CA-390025: do not override SR's client-set metadata on update (#6165))
      Attached_SRs.find sr >>>= fun sr ->
      (* Discover the URIs using Volume.stat *)
      stat ~dbg ~sr ~vdi >>>= fun response ->
@@ -1610,21 +1537,9 @@ let bind ~volume_script_dir =
     |> wrap
   in
   S.VDI.deactivate vdi_deactivate_impl ;
-<<<<<<< 59da2e0 (CA-388564: move qemu-dm to vm.slice (#6150))
   let vdi_detach_impl dbg dp sr vdi' vm =
-||||||| 77dd474
-  let vdi_detach_impl dbg _dp sr vdi' vm' =
-=======
-  let vdi_detach_impl dbg dp sr vdi' vm' =
->>>>>>> d8baca7 (CA-390025: do not override SR's client-set metadata on update (#6165))
     (let vdi = Storage_interface.Vdi.string_of vdi' in
-<<<<<<< 59da2e0 (CA-388564: move qemu-dm to vm.slice (#6150))
      let domain = domain_of ~dp ~vm in
-||||||| 77dd474
-     let domain = Storage_interface.Vm.string_of vm' in
-=======
-     let domain = domain_of ~dp ~vm' in
->>>>>>> d8baca7 (CA-390025: do not override SR's client-set metadata on update (#6165))
      Attached_SRs.find sr >>>= fun sr ->
      (* Discover the URIs using Volume.stat *)
      stat ~dbg ~sr ~vdi >>>= fun response ->
@@ -1732,21 +1647,9 @@ let bind ~volume_script_dir =
   S.VDI.epoch_end vdi_epoch_end_impl ;
   let vdi_set_persistent_impl _dbg _sr _vdi _persistent = return () |> wrap in
   S.VDI.set_persistent vdi_set_persistent_impl ;
-<<<<<<< 59da2e0 (CA-388564: move qemu-dm to vm.slice (#6150))
   let dp_destroy2 dbg dp sr vdi' vm _allow_leak =
-||||||| 77dd474
-  let dp_destroy2 dbg _dp sr vdi' vm' _allow_leak =
-=======
-  let dp_destroy2 dbg dp sr vdi' vm' _allow_leak =
->>>>>>> d8baca7 (CA-390025: do not override SR's client-set metadata on update (#6165))
     (let vdi = Storage_interface.Vdi.string_of vdi' in
-<<<<<<< 59da2e0 (CA-388564: move qemu-dm to vm.slice (#6150))
      let domain = domain_of ~dp ~vm in
-||||||| 77dd474
-     let domain = Storage_interface.Vm.string_of vm' in
-=======
-     let domain = domain_of ~dp ~vm' in
->>>>>>> d8baca7 (CA-390025: do not override SR's client-set metadata on update (#6165))
      Attached_SRs.find sr >>>= fun sr ->
      (* Discover the URIs using Volume.stat *)
      stat ~dbg ~sr ~vdi >>>= fun response ->
@@ -1888,17 +1791,7 @@ let rec diff a b =
   | a :: aa ->
       if List.mem a b then diff aa b else a :: diff aa b
 
-<<<<<<< 59da2e0 (CA-388564: move qemu-dm to vm.slice (#6150))
 let concurrent = ref true
-||||||| 77dd474
-(* default false due to bugs in SMAPIv3 plugins,
-   once they are fixed this should be set to true *)
-let concurrent = ref false
-=======
-(* default false due to bugs in SMAPIv3 plugins,
-   once they are fixed this should be set to true *)
-let concurrent = ref true
->>>>>>> d8baca7 (CA-390025: do not override SR's client-set metadata on update (#6165))
 
 type reload = All | Files of string list | Nothing
 
diff --git a/ocaml/xapi/helpers.ml b/ocaml/xapi/helpers.ml
remerge CONFLICT (content): Merge conflict in ocaml/xapi/helpers.ml
index ca57afb8d8..1175b6a 100644
--- a/ocaml/xapi/helpers.ml
+++ b/ocaml/xapi/helpers.ml
@@ -410,20 +410,14 @@ let make_rpc ~__context rpc : Rpc.response =
   let subtask_of = Ref.string_of (Context.get_task_id __context) in
   let open Xmlrpc_client in
   let tracing = Context.set_client_span __context in
-<<<<<<< 59da2e0 (CA-388564: move qemu-dm to vm.slice (#6150))
   let dorpc, path =
     if !Xapi_globs.use_xmlrpc then
      (XMLRPC_protocol.rpc, "/")
    else
      (JSONRPC_protocol.rpc, "/jsonrpc")
  in
-  let http = xmlrpc ~subtask_of ~version:"1.1" path ~tracing in
-||||||| 77dd474
-  let http = xmlrpc ~subtask_of ~version:"1.1" "/" ~tracing in
-=======
-  let http = xmlrpc ~subtask_of ~version:"1.1" "/" in
+  let http = xmlrpc ~subtask_of ~version:"1.1" path in
   let http = TraceHelper.inject_span_into_req tracing http in
->>>>>>> d8baca7 (CA-390025: do not override SR's client-set metadata on update (#6165))
   let transport =
     if Pool_role.is_master () then
       Unix Xapi_globs.unix_domain_socket
diff --git a/ocaml/xapi/xapi_periodic_scheduler_init.ml b/ocaml/xapi/xapi_periodic_scheduler_init.ml
index 6300f89db2..1bd13d5 100644
--- a/ocaml/xapi/xapi_periodic_scheduler_init.ml
+++ b/ocaml/xapi/xapi_periodic_scheduler_init.ml
@@ -129,9 +129,10 @@ let register ~__context =
       )
     ) ;
   let stunnel_period = !Stunnel_cache.max_idle /. 2. in
-  Xapi_periodic_scheduler.add_to_queue "Check stunnel cache expiry"
-    (Xapi_periodic_scheduler.Periodic stunnel_period) stunnel_period
-    Stunnel_cache.gc ;
+  Xapi_stdext_threads_scheduler.Scheduler.add_to_queue
+    "Check stunnel cache expiry"
+    (Xapi_stdext_threads_scheduler.Scheduler.Periodic stunnel_period)
+    stunnel_period Stunnel_cache.gc ;
   if master && Db.Pool.get_update_sync_enabled ~__context
```
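
For reviewers hitting the same scheduler conflicts, here is a minimal OCaml sketch of how callers use the relocated scheduler API, following the shape of the resolved hunks above; the task names and the 5-second period are invented example values, not taken from this commit:

```ocaml
(* Sketch only: the periodic scheduler now lives in xapi-stdext-threads.
   "example GC", "one off" and the 5.0s period are invented placeholders. *)
let () =
  Xapi_stdext_threads_scheduler.Scheduler.add_to_queue "example GC"
    (Xapi_stdext_threads_scheduler.Scheduler.Periodic 5.0) 5.0 (fun () ->
      print_endline "periodic work ran"
  ) ;
  Xapi_stdext_threads_scheduler.Scheduler.add_to_queue "one off"
    Xapi_stdext_threads_scheduler.Scheduler.OneShot 1.0 (fun () ->
      print_endline "one-shot work ran"
  ) ;
  (* run the scheduler's main loop in the current thread *)
  Xapi_stdext_threads_scheduler.Scheduler.loop ()
```
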
2 parents: 59da2e0 + 51e9cf6 · commit ce5abab

122 files changed (+2622, -3769 lines)


.github/workflows/generate-and-build-sdks.yml

Lines changed: 32 additions & 0 deletions
@@ -24,6 +24,14 @@ jobs:
         shell: bash
         run: opam exec -- make sdk
 
+      # sdk-ci runs some Go unit tests.
+      # This setting ensures that SDK date time
+      # tests are run on a machine that
+      # isn't using UTC
+      - name: Set Timezone to Tokyo for datetime tests
+        run: |
+          sudo timedatectl set-timezone Asia/Tokyo
+
       - name: Run CI for SDKs
         uses: ./.github/workflows/sdk-ci
 
@@ -54,6 +62,7 @@
           path: |
             _build/install/default/share/go/*
             !_build/install/default/share/go/dune
+            !_build/install/default/share/go/**/*_test.go
 
       - name: Store Java SDK source
         uses: actions/upload-artifact@v4
@@ -110,6 +119,14 @@
           java-version: '17'
           distribution: 'temurin'
 
+      # Java Tests are run at compile time.
+      # This setting ensures that SDK date time
+      # tests are run on a machine that
+      # isn't using UTC
+      - name: Set Timezone to Tokyo for datetime tests
+        run: |
+          sudo timedatectl set-timezone Asia/Tokyo
+
       - name: Build Java SDK
         shell: bash
         run: |
@@ -138,6 +155,21 @@
           name: SDK_Source_CSharp
           path: source/
 
+      # All tests builds and pipelines should
+      # work on other timezones. This setting ensures that
+      # SDK date time tests are run on a machine that
+      # isn't using UTC
+      - name: Set Timezone to Tokyo for datetime tests
+        shell: pwsh
+        run: Set-TimeZone -Id "Tokyo Standard Time"
+
+      - name: Test C# SDK
+        shell: pwsh
+        run: |
+          dotnet test source/XenServerTest `
+            --disable-build-servers `
+            --verbosity=normal
+
       - name: Build C# SDK
         shell: pwsh
         run: |
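
The timezone steps above exist because date-time round-trips can silently pass on a UTC runner. As an illustration only (not part of this workflow or of the Go/Java/C# SDK test suites), a minimal OCaml sketch of the kind of local-time/UTC confusion that only a non-UTC machine exposes:

```ocaml
(* Sketch only: mktime interprets a broken-down time as *local* time, so
   feeding it gmtime's fields recovers the epoch only when the host's UTC
   offset is zero. On a UTC runner this class of bug is invisible. *)
let () =
  let now = Unix.time () in
  let reinterpreted, _ = Unix.mktime (Unix.gmtime now) in
  (* 0 on a UTC host; 32400 (9 hours) once the runner uses Asia/Tokyo *)
  Printf.printf "local/UTC confusion: off by %.0f seconds\n"
    (now -. reinterpreted)
```
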

.github/workflows/go-ci/action.yml

Lines changed: 5 additions & 0 deletions
@@ -14,6 +14,11 @@ runs:
       working-directory: ${{ github.workspace }}/_build/install/default/share/go/src
       args: --config=${{ github.workspace }}/.golangci.yml
 
+    - name: Run Go Tests
+      shell: bash
+      working-directory: ${{ github.workspace }}/_build/install/default/share/go/src
+      run: go test -v
+
     - name: Run CI for Go SDK
       shell: bash
       run: |

doc/content/design/plugin-protocol-v2.md

Lines changed: 3 additions & 6 deletions
@@ -20,7 +20,7 @@ DATASOURCES
 000001e4
 dba4bf7a84b6d11d565d19ef91f7906e
 {
-  "timestamp": 1339685573,
+  "timestamp": 1339685573.245,
   "data_sources": {
     "cpu-temp-cpu0": {
       "description": "Temperature of CPU 0",
@@ -62,7 +62,7 @@ reported datasources.
 ### Example
 ```
 {
-  "timestamp": 1339685573,
+  "timestamp": 1339685573.245,
   "data_sources": {
     "cpu-temp-cpu0": {
       "description": "Temperature of CPU 0",
@@ -96,7 +96,7 @@ Protocol V2
 |data checksum |32 |int32 |binary-encoded crc32 of the concatenation of the encoded timestamp and datasource values|
 |metadata checksum |32 |int32 |binary-encoded crc32 of the metadata string (see below) |
 |number of datasources|32 |int32 |only needed if the metadata has changed - otherwise RRDD can use a cached value |
-|timestamp |64 |int64 |Unix epoch |
+|timestamp |64 |double|Unix epoch |
 |datasource values |n * 64 |int64 \| double |n is the number of datasources exported by the plugin, type dependent on the setting in the metadata for value_type [int64\|float] |
 |metadata length |32 |int32 | |
 |metadata |(string length)*8|string| |
@@ -193,6 +193,3 @@ This means that for a normal update, RRDD will only have to read the header plus
 the first (16 + 16 + 4 + 8 + 8*n) bytes of data, where n is the number of
 datasources exported by the plugin. If the metadata changes RRDD will have to
 read all the data (and parse the metadata).
-
-n.b. the timestamp reported by plugins is not currently used by RRDD - it uses
-its own global timestamp.
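
As context for the timestamp change above (int64 to double), here is a minimal OCaml sketch of how a plugin could pack and unpack the 64-bit timestamp field. The helper names are hypothetical and big-endian byte order is an assumption here, not something this diff specifies:

```ocaml
(* Sketch only: the header's timestamp field as a 64-bit double.
   [encode_timestamp]/[decode_timestamp] are hypothetical helpers and
   big-endian byte order is assumed. *)
let encode_timestamp (t : float) : Bytes.t =
  let b = Bytes.create 8 in
  Bytes.set_int64_be b 0 (Int64.bits_of_float t) ;
  b

let decode_timestamp (b : Bytes.t) : float =
  Int64.float_of_bits (Bytes.get_int64_be b 0)

let () =
  let t = 1339685573.245 in
  assert (decode_timestamp (encode_timestamp t) = t)
```
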

doc/content/toolstack/features/NUMA/index.md

Lines changed: 46 additions & 26 deletions
@@ -49,7 +49,7 @@ There is also I/O NUMA where a cost is similarly associated to where a PCIe is p
 
 NUMA does have advantages though: if each node accesses only its local memory, then each node can independently achieve maximum throughput.
 
-For best performance we should:
+For best performance, we should:
 - minimize the amount of interconnect bandwidth we are using
 - run code that accesses memory allocated on the closest NUMA node
 - maximize the number of NUMA nodes that we use in the system as a whole
@@ -62,39 +62,59 @@ The Xen scheduler supports 2 kinds of constraints:
 * hard pinning: a vCPU may only run on the specified set of pCPUs and nowhere else
 * soft pinning: a vCPU is *preferably* run on the specified set of pCPUs, but if they are all busy then it may run elsewhere
 
-The former is useful if you want strict separation, but it can potentially leave part of the system idle while another part is bottlenecked with lots of vCPUs all competing for the same limited set of pCPUs.
+Hard pinning can be used to partition the system. But, it can potentially leave part of the system idle while another part is bottlenecked by many vCPUs competing for the same limited set of pCPUs.
 
-Xen does not migrate workloads between NUMA nodes on its own (the Linux kernel does), although it is possible to achieve a similar effect with explicit migration.
-However migration introduces additional delays and is best avoided for entire VMs.
+Xen does not migrate workloads between NUMA nodes on its own (the Linux kernel can). Although, it is possible to achieve a similar effect with explicit migration.
+However, migration introduces additional delays and is best avoided for entire VMs.
 
-The latter (soft pinning) is preferred: running a workload now, even on a potentially suboptimal pCPU (higher NUMA latency) is still better than not running it at all and waiting until a pCPU is freed up.
+Therefore, soft pinning is preferred: Running on a potentially suboptimal pCPU that uses remote memory could still be better than not running it at all until a pCPU is free to run it.
 
-Xen will also allocate memory for the VM according to the vCPU (soft) pinning: if the vCPUs are pinned only to NUMA nodes A and B, then it will allocate the VM's memory from NUMA nodes A and B (in a round-robin way, resulting in interleaving).
+Xen will also allocate memory for the VM according to the vCPU (soft) pinning: If the vCPUs are pinned to NUMA nodes A and B, Xen allocates memory from NUMA nodes A and B in a round-robin way, resulting in interleaving.
 
-By default (no pinning) it will interleave memory from all NUMA nodes, which provides average performance, but individual tasks' performance may be significantly higher or lower depending on which NUMA node the application may have "landed" on.
-Furthermore restarting processes will speed them up or slow them down as address space randomization picks different memory regions inside a VM.
+### Current default: No vCPU pinning
+
+By default, when no vCPU pinning is used, Xen interleaves memory from all NUMA nodes. This averages the memory performance, but individual tasks' performance may be significantly higher or lower depending on which NUMA node the application may have "landed" on.
+As a result, restarting processes will speed them up or slow them down as address space randomization picks different memory regions inside a VM.
+
+This uses the memory bandwidth of all memory controllers and distributes the load across all nodes.
+However, the memory latency is higher as the NUMA interconnects are used for most memory accesses and vCPU synchronization within the Domains.
 
 Note that this is not the worst case: the worst case would be for memory to be allocated on one NUMA node, but the vCPU always running on the furthest away NUMA node.
 
 ## Best effort NUMA-aware memory allocation for VMs
 
-By default Xen stripes the VM's memory accross all NUMA nodes of the host, which means that every VM has to go through all the interconnects.
+
+### Summary
+
+The best-effort mode attempts to fit Domains into NUMA nodes and to balance memory usage.
+It soft-pins Domains on the NUMA node with the most available memory when adding the Domain.
+Memory is currently allocated when booting the VM (or while constructing the resuming VM).
+
+Parallel boot issue: Memory is not pre-allocated on creation, but allocated during boot.
+The result is that parallel VM creation and boot can exhaust the memory of NUMA nodes.
+
+### Goals
+
+By default, Xen stripes the VM's memory across all NUMA nodes of the host, which means that every VM has to go through all the interconnects.
 The goal here is to find a better allocation than the default, not necessarily an optimal allocation.
-An optimal allocation would require knowing what VMs you would start/create in the future, and planning across hosts too.
+An optimal allocation would require knowing what VMs you would start/create in the future, and planning across hosts.
+This allows the host to use all NUMA nodes to take advantage of the full memory bandwidth available on the pool hosts.
 
-Overall we want to balance the VMs across NUMA nodes, such that we use all NUMA nodes to take advantage of the maximum memory bandwidth available on the system.
+Overall, we want to balance the VMs across NUMA nodes, such that we use all NUMA nodes to take advantage of the maximum memory bandwidth available on the system.
 For now this proposed balancing will be done only by balancing memory usage: always heuristically allocating VMs on the NUMA node that has the most available memory.
-Note that this allocation has a race condition for now when multiple VMs are booted in parallel, because we don't wait until Xen has constructed the domain for each one (that'd serialize domain construction, which is currently parallel).
+For now, this allocation has a race condition: This happens when multiple VMs are booted in parallel, because we don't wait until Xen has constructed the domain for each one (that'd serialize domain construction, which is currently parallel).
 This may be improved in the future by having an API to query Xen where it has allocated the memory, and to explicitly ask it to place memory on a given NUMA node (instead of best_effort).
 
 If a VM doesn't fit into a single node then it is not so clear what the best approach is.
 One criteria to consider is minimizing the NUMA distance between the nodes chosen for the VM.
-Large NUMA systems may not be fully connected in a mesh requiring multiple hops to each a node, or even have assymetric links, or links with different bitwidth.
-These tradeoff should be approximatively reflected in the ACPI SLIT tables, as a matrix of distances between nodes.
+Large NUMA systems may not be fully connected in a mesh requiring multiple hops to each a node, or even have asymmetric links, or links with different bandwidth.
+The specific NUMA topology is provided by the ACPI SLIT table as the matrix of distances between nodes.
 It is possible that 3 NUMA nodes have a smaller average/maximum distance than 2, so we need to consider all possibilities.
 
 For N nodes there would be 2^N possibilities, so [Topology.NUMA.candidates] limits the number of choices to 65520+N (full set of 2^N possibilities for 16 NUMA nodes, and a reduced set of choices for larger systems).
 
+### Implementation
+
 [Topology.NUMA.candidates] is a sorted sequence of node sets, in ascending order of maximum/average distances.
 Once we've eliminated the candidates not suitable for this VM (that do not have enough total memory/pCPUs) we are left with a monotonically increasing sequence of nodes.
 There are still multiple possibilities with same average distance.
@@ -110,19 +130,19 @@ See page 13 in [^AMD_numa] for a diagram of an AMD Opteron 6272 system.
 
 * Booting multiple VMs in parallel will result in potentially allocating both on the same NUMA node (race condition)
 * When we're about to run out of host memory we'll fall back to striping memory again, but the soft affinity mask won't reflect that (this needs an API to query Xen on where it has actually placed the VM, so we can fix up the mask accordingly)
-* XAPI is not aware of NUMA balancing across a pool, and choses hosts purely based on total amount of free memory, even if a better NUMA placement could be found on another host
+* XAPI is not aware of NUMA balancing across a pool. Xenopsd chooses NUMA nodes purely based on amount of free memory on the NUMA nodes of the host, even if a better NUMA placement could be found on another host
 * Very large (>16 NUMA nodes) systems may only explore a limited number of choices (fit into a single node vs fallback to full interleaving)
 * The exact VM placement is not yet controllable
 * Microbenchmarks with a single VM on a host show both performance improvements and regressions on memory bandwidth usage: previously a single VM may have been able to take advantage of the bandwidth of both NUMA nodes if it happened to allocate memory from the right places, whereas now it'll be forced to use just a single node.
   As soon as you have more than 1 VM that is busy on a system enabling NUMA balancing should almost always be an improvement though.
-* it is not supported to combine hard vCPU masks with soft affinity: if hard affinities are used then no NUMA scheduling is done by the toolstack and we obey exactly what the user has asked for with hard affinities.
+* It is not supported to combine hard vCPU masks with soft affinity: if hard affinities are used, then no NUMA scheduling is done by the toolstack, and we obey exactly what the user has asked for with hard affinities.
   This shouldn't affect other VMs since the memory used by hard-pinned VMs will still be reflected in overall less memory available on individual NUMA nodes.
 * Corner case: the ACPI standard allows certain NUMA nodes to be unreachable (distance `0xFF` = `-1` in the Xen bindings).
   This is not supported and will cause an exception to be raised.
   If this is an issue in practice the NUMA matrix could be pre-filtered to contain only reachable nodes.
-  NUMA nodes with 0 CPUs *are* accepted (it can result from hard affinity pinnings)
+  NUMA nodes with 0 CPUs *are* accepted (it can result from hard affinity pinning)
 * NUMA balancing is not considered during HA planning
-* Dom0 is a single VM that needs to communicate with all other VMs, so NUMA balancing is not applied to it (we'd need to expose NUMA topology to the Dom0 kernel so it can better allocate processes)
+* Dom0 is a single VM that needs to communicate with all other VMs, so NUMA balancing is not applied to it (we'd need to expose NUMA topology to the Dom0 kernel, so it can better allocate processes)
 * IO NUMA is out of scope for now
 
 ## XAPI datamodel design
@@ -139,7 +159,7 @@ Meaning of the policy:
 * `best_effort`: the algorithm described in this document, where soft pinning is used to achieve better balancing and lower latency
 * `default_policy`: when the admin hasn't expressed a preference
 
-* Currently `default_policy` is treated as `any`, but the admin can change it, and then the system will remember that change across upgrades.
+* Currently, `default_policy` is treated as `any`, but the admin can change it, and then the system will remember that change across upgrades.
   If we didn't have a `default_policy` then changing the "default" policy on an upgrade would be tricky: we either risk overriding an explicit choice of the admin, or existing installs cannot take advantage of the improved performance from `best_effort`
 * Future XAPI versions may change `default_policy` to mean `best_effort`.
   Admins can still override it to `any` if they wish on a host by host basis.
@@ -149,7 +169,7 @@ It is not expected that users would have to change `best_effort`, unless they ru
 There is also no separate feature flag: this host flag acts as a feature flag that can be set through the API without restarting the toolstack.
 Although obviously only new VMs will benefit.
 
-Debugging the allocator is done by running `xl vcpu-list` and investigating the soft pinning masks, and by analyzing xensource.log.
+Debugging the allocator is done by running `xl vcpu-list` and investigating the soft pinning masks, and by analyzing `xensource.log`.
 
 ### Xenopsd implementation
 
@@ -166,18 +186,18 @@ This avoids exponential state space explosion on very large systems (>16 NUMA no
 * [Topology.NUMA.choose] will choose one NUMA node deterministically, while trying to keep overall NUMA node usage balanced.
 * [Domain.numa_placement] builds a [NUMARequest] and uses the above [Topology] and [Softaffinity] functions to compute and apply a plan.
 
-We used to have a `xenopsd.conf` configuration option to enable numa placement, for backwards compatibility this is still supported, but only if the admin hasn't set an explicit policy on the Host.
+We used to have a `xenopsd.conf` configuration option to enable NUMA placement, for backwards compatibility this is still supported, but only if the admin hasn't set an explicit policy on the Host.
 It is best to remove the experimental `xenopsd.conf` entry though, a future version may completely drop it.
 
 Tests are in [test_topology.ml] which checks balancing properties and whether the plan has improved best/worst/average-case access times in a simulated test based on 2 predefined NUMA distance matrixes (one from Intel and one from an AMD system).
 
 ## Future work
 
-* enable 'best_effort' mode by default once more testing has been done
-* an API to query Xen where it has actually allocated the VM's memory.
-  Currently only an `xl debug-keys` interface exists which is not supported in production as it can result in killing the host via the watchdog, and is not a proper API, but a textual debug output with no stability guarantees.
-* more host policies (e.g. `strict`).
-  Requires the XAPI pool scheduler to be NUMA aware and consider it as part of chosing hosts.
+* Enable 'best_effort' mode by default once more testing has been done
+* Add an API to query Xen for the NUMA node memory placement (where it has actually allocated the VM's memory).
+  Currently, only the `xl debug-keys` interface exists which is not supported in production as it can result in killing the host via the watchdog, and is not a proper API, but a textual debug output with no stability guarantees.
+* More host policies, e.g. `strict`.
+  Requires the XAPI pool scheduler to be NUMA aware and consider it as part of choosing hosts.
 * VM level policy that can set a NUMA affinity index, mapped to a NUMA node modulo NUMA nodes available on the system (this is needed so that after migration we don't end up trying to allocate vCPUs to a non-existent NUMA node)
 * VM level anti-affinity rules for NUMA placement (can be achieved by setting unique NUMA affinity indexes)
 
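
To make the best-effort heuristic described in this document concrete, here is a minimal OCaml sketch of its core selection step (soft-pin on the candidate node with the most free memory). The `node` type and `choose_node` function are hypothetical illustrations; the real implementation lives in xenopsd's [Topology] and [Softaffinity] modules:

```ocaml
(* Sketch only: "place the VM on the candidate NUMA node with the most free
   memory". The [node] type and [choose_node] are hypothetical; the real
   logic lives in xenopsd's Topology.NUMA.candidates/choose. *)
type node = {id: int; free_mib: int64}

let choose_node (candidates : node list) (vm_mib : int64) : node option =
  candidates
  |> List.filter (fun n -> n.free_mib >= vm_mib)
  |> List.sort (fun a b -> Int64.compare b.free_mib a.free_mib)
  |> function
  | [] ->
      None (* the VM does not fit on a single node: fall back to striping *)
  | best :: _ ->
      Some best
```
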
