Skip to content

Commit e21647c

Browse files
authored
CP-53658: Claim memory on a single NUMA node (xapi-project#6369)
This PR allows xenopsd to reliably launch domains whose memory is allocated on a single NUMA node (for example, when restarting a VM on a host with most of its memory in use). Since this is only supported by Xen with this patch queue: https://lists.xen.org/archives/html/xen-devel/2025-03/msg01127.html, and it also needs support from xenguest and emu-manager, there are two commits that revert the changes so that memory pages are claimed without specifying a NUMA node. These two commits can easily be reverted by patches to enable this functionality in cutting-edge builds. Revertable patches: - xenctrl: Don't use numa_node in domain_claim_pages calls - xenopsd/xc: Do not try to allocate pages to a particular NUMA node
2 parents b5894f5 + 9e6fb15 commit e21647c

14 files changed

+430
-164
lines changed

ocaml/xapi-client/event_helper.ml

+66-30
Original file line numberDiff line numberDiff line change
@@ -43,96 +43,132 @@ type event_record =
4343
| VMPP of [`VMPP] Ref.t * API.vMPP_t option
4444
| VMSS of [`VMSS] Ref.t * API.vMSS_t option
4545

46-
let maybe f x = match x with Some x -> Some (f x) | None -> None
47-
4846
let record_of_event ev =
4947
let rpc = ev.Event_types.snapshot in
5048
match ev.Event_types.ty with
5149
| "session" ->
5250
Session
5351
( Ref.of_secret_string ev.Event_types.reference
54-
, maybe API.session_t_of_rpc rpc
52+
, Option.map API.session_t_of_rpc rpc
5553
)
5654
| "task" ->
57-
Task (Ref.of_string ev.Event_types.reference, maybe API.task_t_of_rpc rpc)
55+
Task
56+
( Ref.of_string ev.Event_types.reference
57+
, Option.map API.task_t_of_rpc rpc
58+
)
5859
| "event" ->
5960
Event
60-
(Ref.of_string ev.Event_types.reference, maybe API.event_t_of_rpc rpc)
61+
( Ref.of_string ev.Event_types.reference
62+
, Option.map API.event_t_of_rpc rpc
63+
)
6164
| "vm" ->
62-
VM (Ref.of_string ev.Event_types.reference, maybe API.vM_t_of_rpc rpc)
65+
VM (Ref.of_string ev.Event_types.reference, Option.map API.vM_t_of_rpc rpc)
6366
| "vm_metrics" ->
6467
VM_metrics
6568
( Ref.of_string ev.Event_types.reference
66-
, maybe API.vM_metrics_t_of_rpc rpc
69+
, Option.map API.vM_metrics_t_of_rpc rpc
6770
)
6871
| "vm_guest_metrics" ->
6972
VM_guest_metrics
7073
( Ref.of_string ev.Event_types.reference
71-
, maybe API.vM_guest_metrics_t_of_rpc rpc
74+
, Option.map API.vM_guest_metrics_t_of_rpc rpc
7275
)
7376
| "host" ->
74-
Host (Ref.of_string ev.Event_types.reference, maybe API.host_t_of_rpc rpc)
77+
Host
78+
( Ref.of_string ev.Event_types.reference
79+
, Option.map API.host_t_of_rpc rpc
80+
)
7581
| "host_metrics" ->
7682
Host_metrics
7783
( Ref.of_string ev.Event_types.reference
78-
, maybe API.host_metrics_t_of_rpc rpc
84+
, Option.map API.host_metrics_t_of_rpc rpc
7985
)
8086
| "host_cpu" ->
8187
Host_cpu
82-
(Ref.of_string ev.Event_types.reference, maybe API.host_cpu_t_of_rpc rpc)
88+
( Ref.of_string ev.Event_types.reference
89+
, Option.map API.host_cpu_t_of_rpc rpc
90+
)
8391
| "network" ->
8492
Network
85-
(Ref.of_string ev.Event_types.reference, maybe API.network_t_of_rpc rpc)
93+
( Ref.of_string ev.Event_types.reference
94+
, Option.map API.network_t_of_rpc rpc
95+
)
8696
| "vif" ->
87-
VIF (Ref.of_string ev.Event_types.reference, maybe API.vIF_t_of_rpc rpc)
97+
VIF
98+
(Ref.of_string ev.Event_types.reference, Option.map API.vIF_t_of_rpc rpc)
8899
| "vif_metrics" ->
89100
VIF_metrics
90101
( Ref.of_string ev.Event_types.reference
91-
, maybe API.vIF_metrics_t_of_rpc rpc
102+
, Option.map API.vIF_metrics_t_of_rpc rpc
92103
)
93104
| "pif" ->
94-
PIF (Ref.of_string ev.Event_types.reference, maybe API.pIF_t_of_rpc rpc)
105+
PIF
106+
(Ref.of_string ev.Event_types.reference, Option.map API.pIF_t_of_rpc rpc)
95107
| "pif_metrics" ->
96108
PIF_metrics
97109
( Ref.of_string ev.Event_types.reference
98-
, maybe API.pIF_metrics_t_of_rpc rpc
110+
, Option.map API.pIF_metrics_t_of_rpc rpc
99111
)
100112
| "sr" ->
101-
SR (Ref.of_string ev.Event_types.reference, maybe API.sR_t_of_rpc rpc)
113+
SR (Ref.of_string ev.Event_types.reference, Option.map API.sR_t_of_rpc rpc)
102114
| "vdi" ->
103-
VDI (Ref.of_string ev.Event_types.reference, maybe API.vDI_t_of_rpc rpc)
115+
VDI
116+
(Ref.of_string ev.Event_types.reference, Option.map API.vDI_t_of_rpc rpc)
104117
| "vbd" ->
105-
VBD (Ref.of_string ev.Event_types.reference, maybe API.vBD_t_of_rpc rpc)
118+
VBD
119+
(Ref.of_string ev.Event_types.reference, Option.map API.vBD_t_of_rpc rpc)
106120
| "vbd_metrics" ->
107121
VBD_metrics
108122
( Ref.of_string ev.Event_types.reference
109-
, maybe API.vBD_metrics_t_of_rpc rpc
123+
, Option.map API.vBD_metrics_t_of_rpc rpc
110124
)
111125
| "pbd" ->
112-
PBD (Ref.of_string ev.Event_types.reference, maybe API.pBD_t_of_rpc rpc)
126+
PBD
127+
(Ref.of_string ev.Event_types.reference, Option.map API.pBD_t_of_rpc rpc)
113128
| "crashdump" ->
114129
Crashdump
115130
( Ref.of_string ev.Event_types.reference
116-
, maybe API.crashdump_t_of_rpc rpc
131+
, Option.map API.crashdump_t_of_rpc rpc
117132
)
118133
| "vtpm" ->
119-
VTPM (Ref.of_string ev.Event_types.reference, maybe API.vTPM_t_of_rpc rpc)
134+
VTPM
135+
( Ref.of_string ev.Event_types.reference
136+
, Option.map API.vTPM_t_of_rpc rpc
137+
)
120138
| "console" ->
121139
Console
122-
(Ref.of_string ev.Event_types.reference, maybe API.console_t_of_rpc rpc)
140+
( Ref.of_string ev.Event_types.reference
141+
, Option.map API.console_t_of_rpc rpc
142+
)
123143
| "user" ->
124-
User (Ref.of_string ev.Event_types.reference, maybe API.user_t_of_rpc rpc)
144+
User
145+
( Ref.of_string ev.Event_types.reference
146+
, Option.map API.user_t_of_rpc rpc
147+
)
125148
| "pool" ->
126-
Pool (Ref.of_string ev.Event_types.reference, maybe API.pool_t_of_rpc rpc)
149+
Pool
150+
( Ref.of_string ev.Event_types.reference
151+
, Option.map API.pool_t_of_rpc rpc
152+
)
127153
| "message" ->
128154
Message
129-
(Ref.of_string ev.Event_types.reference, maybe API.message_t_of_rpc rpc)
155+
( Ref.of_string ev.Event_types.reference
156+
, Option.map API.message_t_of_rpc rpc
157+
)
130158
| "secret" ->
131159
Secret
132-
(Ref.of_string ev.Event_types.reference, maybe API.secret_t_of_rpc rpc)
160+
( Ref.of_string ev.Event_types.reference
161+
, Option.map API.secret_t_of_rpc rpc
162+
)
133163
| "vmpp" ->
134-
VMPP (Ref.of_string ev.Event_types.reference, maybe API.vMPP_t_of_rpc rpc)
164+
VMPP
165+
( Ref.of_string ev.Event_types.reference
166+
, Option.map API.vMPP_t_of_rpc rpc
167+
)
135168
| "vmss" ->
136-
VMSS (Ref.of_string ev.Event_types.reference, maybe API.vMSS_t_of_rpc rpc)
169+
VMSS
170+
( Ref.of_string ev.Event_types.reference
171+
, Option.map API.vMSS_t_of_rpc rpc
172+
)
137173
| _ ->
138174
failwith "unknown event type"

ocaml/xenopsd/c_stubs/xenctrlext_stubs.c

+4-3
Original file line numberDiff line numberDiff line change
@@ -672,16 +672,17 @@ CAMLprim value stub_xenforeignmemory_unmap(value fmem, value mapping)
672672
}
673673

674674
CAMLprim value stub_xenctrlext_domain_claim_pages(value xch_val, value domid_val,
675-
value nr_pages_val)
675+
value numa_node_val, value nr_pages_val)
676676
{
677-
CAMLparam3(xch_val, domid_val, nr_pages_val);
677+
CAMLparam4(xch_val, domid_val, numa_node_val, nr_pages_val);
678678
int retval, the_errno;
679679
xc_interface* xch = xch_of_val(xch_val);
680680
uint32_t domid = Int_val(domid_val);
681+
// unsigned int numa_node = Int_val(numa_node_val);
681682
unsigned long nr_pages = Long_val(nr_pages_val);
682683

683684
caml_release_runtime_system();
684-
retval = xc_domain_claim_pages(xch, domid, nr_pages);
685+
retval = xc_domain_claim_pages(xch, domid, /*numa_node,*/ nr_pages);
685686
the_errno = errno;
686687
caml_acquire_runtime_system();
687688

ocaml/xenopsd/lib/softaffinity.ml

+5-6
Original file line numberDiff line numberDiff line change
@@ -26,10 +26,9 @@ let plan host nodes ~vm =
2626
(Fmt.to_to_string NUMARequest.pp_dump requested)
2727
(Fmt.to_to_string NUMAResource.pp_dump allocated) ;
2828
let candidate = nodes.(nodeidx) in
29-
( NUMAResource.union allocated candidate
30-
, node :: picked
31-
, NUMARequest.shrink requested candidate
32-
)
29+
(* This is where the memory allocated to the node can be calculated *)
30+
let remaining_request = NUMARequest.shrink requested candidate in
31+
(NUMAResource.union allocated candidate, node :: picked, remaining_request)
3332
in
3433
let plan_valid (avg, nodes) =
3534
let allocated, picked, remaining =
@@ -72,8 +71,8 @@ let plan host nodes ~vm =
7271
| None ->
7372
debug "No allocations possible" ;
7473
None
75-
| Some allocated ->
74+
| Some (allocated, nodes) ->
7675
debug "Allocated resources: %s"
7776
(Fmt.to_to_string NUMAResource.pp_dump allocated) ;
7877
assert (NUMARequest.fits vm allocated) ;
79-
Some allocated.NUMAResource.affinity
78+
Some (allocated.NUMAResource.affinity, nodes)

ocaml/xenopsd/lib/softaffinity.mli

+5-1
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,11 @@
1414

1515
open Topology
1616

17-
val plan : NUMA.t -> NUMAResource.t array -> vm:NUMARequest.t -> CPUSet.t option
17+
val plan :
18+
NUMA.t
19+
-> NUMAResource.t array
20+
-> vm:NUMARequest.t
21+
-> (Topology.CPUSet.t * Topology.NUMA.node list) option
1822
(** [plan host nodes ~vm] returns the CPU soft affinity recommended for [vm], paired with the list of NUMA nodes chosen for it,
1923
Such that the memory latency between the NUMA nodes of the vCPUs is small,
2024
and usage of NUMA nodes is balanced.

ocaml/xenopsd/lib/topology.ml

+1-1
Original file line numberDiff line numberDiff line change
@@ -298,7 +298,7 @@ module NUMA = struct
298298
None
299299
else (
300300
List.iter (fun (Node n) -> t.node_usage.(n) <- t.node_usage.(n) + 1) nodes ;
301-
Some result
301+
Some (result, nodes)
302302
)
303303

304304
let pp_dump_node = Fmt.(using (fun (Node x) -> x) int)

ocaml/xenopsd/lib/topology.mli

+4-1
Original file line numberDiff line numberDiff line change
@@ -150,7 +150,10 @@ module NUMA : sig
150150
NUMA nodes > 16 it limits the length of the sequence to [n+65520], to
151151
avoid exponential blowup. *)
152152

153-
val choose : t -> (node list * NUMAResource.t) Seq.t -> NUMAResource.t option
153+
val choose :
154+
t
155+
-> (node list * NUMAResource.t) Seq.t
156+
-> (NUMAResource.t * node list) option
154157
(** [choose t resources] will choose one NUMA node deterministically, trying
155158
to keep the overall NUMA node usage balanced *)
156159

ocaml/xenopsd/test/test_topology.ml

+8-6
Original file line numberDiff line numberDiff line change
@@ -210,26 +210,28 @@ let test_allocate ?(mem = default_mem) (expected_cores, h) ~vms () =
210210
match Softaffinity.plan h nodes ~vm with
211211
| None ->
212212
Alcotest.fail "No NUMA plan"
213-
| Some plan ->
214-
D.debug "NUMA allocation succeeded for VM %d: %s" i
215-
(Fmt.to_to_string CPUSet.pp_dump plan) ;
213+
| Some (cpu_plan, mem_plan) ->
214+
D.debug
215+
"NUMA allocation succeeded for VM %d: [CPUS: %s]; [nodes: %s]" i
216+
(Fmt.to_to_string CPUSet.pp_dump cpu_plan)
217+
(Fmt.to_to_string Fmt.(Dump.list NUMA.pp_dump_node) mem_plan) ;
216218
let usednodes =
217-
plan
219+
cpu_plan
218220
|> CPUSet.elements
219221
|> List.map (NUMA.node_of_cpu h)
220222
|> List.sort_uniq compare
221223
|> List.to_seq
222224
in
223225
let costs_numa_aware =
224-
vm_access_costs h plans (vm_cores, usednodes, plan)
226+
vm_access_costs h plans (vm_cores, usednodes, cpu_plan)
225227
in
226228
let costs_default =
227229
vm_access_costs h plans (vm_cores, NUMA.nodes h, NUMA.all_cpus h)
228230
in
229231
cost_not_worse ~default:costs_default costs_numa_aware ;
230232
( costs_default :: costs_old
231233
, costs_numa_aware :: costs_new
232-
, ((vm_cores, List.of_seq usednodes), plan) :: plans
234+
, ((vm_cores, List.of_seq usednodes), cpu_plan) :: plans
233235
)
234236
)
235237
([], [], [])

0 commit comments

Comments
 (0)