Skip to content

Commit 4e4fe93

Browse files
guowangykawasaki
authored and committed
lib/group_cpus: make group CPU cluster aware
As CPU core counts increase, the number of NVMe IRQs may be smaller than the total number of CPUs. This forces multiple CPUs to share the same IRQ. If the IRQ affinity and the CPU's cluster do not align, a performance penalty can be observed on some platforms. This patch improves IRQ affinity by grouping CPUs by cluster within each NUMA domain, ensuring better locality between CPUs and their assigned NVMe IRQs.

Reviewed-by: Tianyou Li <[email protected]>
Reviewed-by: Tim Chen <[email protected]>
Tested-by: Dan Liang <[email protected]>
Signed-off-by: Wangyang Guo <[email protected]>
1 parent 6f43942 commit 4e4fe93

File tree

1 file changed

+204
-65
lines changed

1 file changed

+204
-65
lines changed

lib/group_cpus.c

Lines changed: 204 additions & 65 deletions
Original file line numberDiff line numberDiff line change
@@ -114,48 +114,15 @@ static int ncpus_cmp_func(const void *l, const void *r)
114114
return ln->ncpus - rn->ncpus;
115115
}
116116

117-
/*
118-
* Allocate group number for each node, so that for each node:
119-
*
120-
* 1) the allocated number is >= 1
121-
*
122-
* 2) the allocated number is <= active CPU number of this node
123-
*
124-
* The actual allocated total groups may be less than @numgrps when
125-
* active total CPU number is less than @numgrps.
126-
*
127-
* Active CPUs means the CPUs in '@cpu_mask AND @node_to_cpumask[]'
128-
* for each node.
129-
*/
130-
static void alloc_nodes_groups(unsigned int numgrps,
131-
cpumask_var_t *node_to_cpumask,
132-
const struct cpumask *cpu_mask,
133-
const nodemask_t nodemsk,
134-
struct cpumask *nmsk,
135-
struct node_groups *node_groups)
117+
static void alloc_groups_to_nodes(unsigned int numgrps,
118+
unsigned int numcpus,
119+
struct node_groups *node_groups,
120+
unsigned int num_nodes)
136121
{
137-
unsigned n, remaining_ncpus = 0;
138-
139-
for (n = 0; n < nr_node_ids; n++) {
140-
node_groups[n].id = n;
141-
node_groups[n].ncpus = UINT_MAX;
142-
}
143-
144-
for_each_node_mask(n, nodemsk) {
145-
unsigned ncpus;
146-
147-
cpumask_and(nmsk, cpu_mask, node_to_cpumask[n]);
148-
ncpus = cpumask_weight(nmsk);
122+
unsigned int n, remaining_ncpus = numcpus;
123+
unsigned int ngroups, ncpus;
149124

150-
if (!ncpus)
151-
continue;
152-
remaining_ncpus += ncpus;
153-
node_groups[n].ncpus = ncpus;
154-
}
155-
156-
numgrps = min_t(unsigned, remaining_ncpus, numgrps);
157-
158-
sort(node_groups, nr_node_ids, sizeof(node_groups[0]),
125+
sort(node_groups, num_nodes, sizeof(node_groups[0]),
159126
ncpus_cmp_func, NULL);
160127

161128
/*
@@ -226,9 +193,8 @@ static void alloc_nodes_groups(unsigned int numgrps,
226193
* finally for each node X: grps(X) <= ncpu(X).
227194
*
228195
*/
229-
for (n = 0; n < nr_node_ids; n++) {
230-
unsigned ngroups, ncpus;
231196

197+
for (n = 0; n < num_nodes; n++) {
232198
if (node_groups[n].ncpus == UINT_MAX)
233199
continue;
234200

@@ -246,12 +212,199 @@ static void alloc_nodes_groups(unsigned int numgrps,
246212
}
247213
}
248214

215+
/*
216+
* Allocate group number for each node, so that for each node:
217+
*
218+
* 1) the allocated number is >= 1
219+
*
220+
* 2) the allocated number is <= active CPU number of this node
221+
*
222+
* The actual allocated total groups may be less than @numgrps when
223+
* active total CPU number is less than @numgrps.
224+
*
225+
* Active CPUs means the CPUs in '@cpu_mask AND @node_to_cpumask[]'
226+
* for each node.
227+
*/
228+
static void alloc_nodes_groups(unsigned int numgrps,
229+
cpumask_var_t *node_to_cpumask,
230+
const struct cpumask *cpu_mask,
231+
const nodemask_t nodemsk,
232+
struct cpumask *nmsk,
233+
struct node_groups *node_groups)
234+
{
235+
unsigned int n, numcpus = 0;
236+
237+
for (n = 0; n < nr_node_ids; n++) {
238+
node_groups[n].id = n;
239+
node_groups[n].ncpus = UINT_MAX;
240+
}
241+
242+
for_each_node_mask(n, nodemsk) {
243+
unsigned int ncpus;
244+
245+
cpumask_and(nmsk, cpu_mask, node_to_cpumask[n]);
246+
ncpus = cpumask_weight(nmsk);
247+
248+
if (!ncpus)
249+
continue;
250+
numcpus += ncpus;
251+
node_groups[n].ncpus = ncpus;
252+
}
253+
254+
numgrps = min_t(unsigned int, numcpus, numgrps);
255+
alloc_groups_to_nodes(numgrps, numcpus, node_groups, nr_node_ids);
256+
}
257+
258+
static void assign_cpus_to_groups(unsigned int ncpus,
259+
struct cpumask *nmsk,
260+
struct node_groups *nv,
261+
struct cpumask *masks,
262+
unsigned int *curgrp,
263+
unsigned int last_grp)
264+
{
265+
unsigned int v, cpus_per_grp, extra_grps;
266+
/* Account for rounding errors */
267+
extra_grps = ncpus - nv->ngroups * (ncpus / nv->ngroups);
268+
269+
/* Spread allocated groups on CPUs of the current node */
270+
for (v = 0; v < nv->ngroups; v++, *curgrp += 1) {
271+
cpus_per_grp = ncpus / nv->ngroups;
272+
273+
/* Account for extra groups to compensate rounding errors */
274+
if (extra_grps) {
275+
cpus_per_grp++;
276+
--extra_grps;
277+
}
278+
279+
/*
280+
* wrapping has to be considered given 'startgrp'
281+
* may start anywhere
282+
*/
283+
if (*curgrp >= last_grp)
284+
*curgrp = 0;
285+
grp_spread_init_one(&masks[*curgrp], nmsk, cpus_per_grp);
286+
}
287+
}
288+
289+
static int alloc_cluster_groups(unsigned int ncpus,
290+
unsigned int ngroups,
291+
struct cpumask *node_cpumask,
292+
cpumask_var_t msk,
293+
const struct cpumask ***clusters_ptr,
294+
struct node_groups **cluster_groups_ptr)
295+
{
296+
unsigned int ncluster = 0;
297+
unsigned int cpu, nc, n;
298+
const struct cpumask *cluster_mask;
299+
const struct cpumask **clusters;
300+
struct node_groups *cluster_groups;
301+
302+
cpumask_copy(msk, node_cpumask);
303+
304+
/* Probe how many clusters in this node. */
305+
while (1) {
306+
cpu = cpumask_first(msk);
307+
if (cpu >= nr_cpu_ids)
308+
break;
309+
310+
cluster_mask = topology_cluster_cpumask(cpu);
311+
/* Clean out CPUs on the same cluster. */
312+
cpumask_andnot(msk, msk, cluster_mask);
313+
ncluster++;
314+
}
315+
316+
/* If ngroups < ncluster, cross cluster is inevitable, skip. */
317+
if (ncluster == 0 || ncluster > ngroups)
318+
goto no_cluster;
319+
320+
/* Allocate memory based on cluster number. */
321+
clusters = kcalloc(ncluster, sizeof(struct cpumask *), GFP_KERNEL);
322+
if (!clusters)
323+
goto no_cluster;
324+
cluster_groups = kcalloc(ncluster, sizeof(struct node_groups), GFP_KERNEL);
325+
if (!cluster_groups)
326+
goto fail_cluster_groups;
327+
328+
/* Filling cluster info for later process. */
329+
cpumask_copy(msk, node_cpumask);
330+
for (n = 0; n < ncluster; n++) {
331+
cpu = cpumask_first(msk);
332+
cluster_mask = topology_cluster_cpumask(cpu);
333+
nc = cpumask_weight_and(cluster_mask, node_cpumask);
334+
clusters[n] = cluster_mask;
335+
cluster_groups[n].id = n;
336+
cluster_groups[n].ncpus = nc;
337+
cpumask_andnot(msk, msk, cluster_mask);
338+
}
339+
340+
alloc_groups_to_nodes(ngroups, ncpus, cluster_groups, ncluster);
341+
342+
*clusters_ptr = clusters;
343+
*cluster_groups_ptr = cluster_groups;
344+
return ncluster;
345+
346+
fail_cluster_groups:
347+
kfree(clusters);
348+
no_cluster:
349+
return 0;
350+
}
351+
352+
/*
353+
* Try group CPUs evenly for cluster locality within a NUMA node.
354+
*
355+
* Return: true if success, false otherwise.
356+
*/
357+
static bool __try_group_cluster_cpus(unsigned int ncpus,
358+
unsigned int ngroups,
359+
struct cpumask *node_cpumask,
360+
struct cpumask *masks,
361+
unsigned int *curgrp,
362+
unsigned int last_grp)
363+
{
364+
struct node_groups *cluster_groups;
365+
const struct cpumask **clusters;
366+
unsigned int ncluster;
367+
bool ret = false;
368+
cpumask_var_t nmsk;
369+
unsigned int i, nc;
370+
371+
if (!zalloc_cpumask_var(&nmsk, GFP_KERNEL))
372+
goto fail_nmsk_alloc;
373+
374+
ncluster = alloc_cluster_groups(ncpus, ngroups, node_cpumask, nmsk,
375+
&clusters, &cluster_groups);
376+
377+
if (ncluster == 0)
378+
goto fail_no_clusters;
379+
380+
for (i = 0; i < ncluster; i++) {
381+
struct node_groups *nv = &cluster_groups[i];
382+
383+
/* Get the cpus on this cluster. */
384+
cpumask_and(nmsk, node_cpumask, clusters[nv->id]);
385+
nc = cpumask_weight(nmsk);
386+
if (!nc)
387+
continue;
388+
WARN_ON_ONCE(nv->ngroups > nc);
389+
390+
assign_cpus_to_groups(nc, nmsk, nv, masks, curgrp, last_grp);
391+
}
392+
393+
ret = true;
394+
kfree(cluster_groups);
395+
kfree(clusters);
396+
fail_no_clusters:
397+
free_cpumask_var(nmsk);
398+
fail_nmsk_alloc:
399+
return ret;
400+
}
401+
249402
static int __group_cpus_evenly(unsigned int startgrp, unsigned int numgrps,
250403
cpumask_var_t *node_to_cpumask,
251404
const struct cpumask *cpu_mask,
252405
struct cpumask *nmsk, struct cpumask *masks)
253406
{
254-
unsigned int i, n, nodes, cpus_per_grp, extra_grps, done = 0;
407+
unsigned int i, n, nodes, done = 0;
255408
unsigned int last_grp = numgrps;
256409
unsigned int curgrp = startgrp;
257410
nodemask_t nodemsk = NODE_MASK_NONE;
@@ -287,7 +440,7 @@ static int __group_cpus_evenly(unsigned int startgrp, unsigned int numgrps,
287440
alloc_nodes_groups(numgrps, node_to_cpumask, cpu_mask,
288441
nodemsk, nmsk, node_groups);
289442
for (i = 0; i < nr_node_ids; i++) {
290-
unsigned int ncpus, v;
443+
unsigned int ncpus;
291444
struct node_groups *nv = &node_groups[i];
292445

293446
if (nv->ngroups == UINT_MAX)
@@ -301,28 +454,14 @@ static int __group_cpus_evenly(unsigned int startgrp, unsigned int numgrps,
301454

302455
WARN_ON_ONCE(nv->ngroups > ncpus);
303456

304-
/* Account for rounding errors */
305-
extra_grps = ncpus - nv->ngroups * (ncpus / nv->ngroups);
306-
307-
/* Spread allocated groups on CPUs of the current node */
308-
for (v = 0; v < nv->ngroups; v++, curgrp++) {
309-
cpus_per_grp = ncpus / nv->ngroups;
310-
311-
/* Account for extra groups to compensate rounding errors */
312-
if (extra_grps) {
313-
cpus_per_grp++;
314-
--extra_grps;
315-
}
316-
317-
/*
318-
* wrapping has to be considered given 'startgrp'
319-
* may start anywhere
320-
*/
321-
if (curgrp >= last_grp)
322-
curgrp = 0;
323-
grp_spread_init_one(&masks[curgrp], nmsk,
324-
cpus_per_grp);
457+
if (__try_group_cluster_cpus(ncpus, nv->ngroups, nmsk,
458+
masks, &curgrp, last_grp)) {
459+
done += nv->ngroups;
460+
continue;
325461
}
462+
463+
assign_cpus_to_groups(ncpus, nmsk, nv, masks, &curgrp,
464+
last_grp);
326465
done += nv->ngroups;
327466
}
328467
kfree(node_groups);

0 commit comments

Comments
 (0)