@@ -114,48 +114,15 @@ static int ncpus_cmp_func(const void *l, const void *r)
114114 return ln -> ncpus - rn -> ncpus ;
115115}
116116
117- /*
118- * Allocate group number for each node, so that for each node:
119- *
120- * 1) the allocated number is >= 1
121- *
122- * 2) the allocated number is <= active CPU number of this node
123- *
124- * The actual allocated total groups may be less than @numgrps when
125- * active total CPU number is less than @numgrps.
126- *
127- * Active CPUs means the CPUs in '@cpu_mask AND @node_to_cpumask[]'
128- * for each node.
129- */
130- static void alloc_nodes_groups (unsigned int numgrps ,
131- cpumask_var_t * node_to_cpumask ,
132- const struct cpumask * cpu_mask ,
133- const nodemask_t nodemsk ,
134- struct cpumask * nmsk ,
135- struct node_groups * node_groups )
117+ static void alloc_groups_to_nodes (unsigned int numgrps ,
118+ unsigned int numcpus ,
119+ struct node_groups * node_groups ,
120+ unsigned int num_nodes )
136121{
137- unsigned n , remaining_ncpus = 0 ;
138-
139- for (n = 0 ; n < nr_node_ids ; n ++ ) {
140- node_groups [n ].id = n ;
141- node_groups [n ].ncpus = UINT_MAX ;
142- }
143-
144- for_each_node_mask (n , nodemsk ) {
145- unsigned ncpus ;
146-
147- cpumask_and (nmsk , cpu_mask , node_to_cpumask [n ]);
148- ncpus = cpumask_weight (nmsk );
122+ unsigned int n , remaining_ncpus = numcpus ;
123+ unsigned int ngroups , ncpus ;
149124
150- if (!ncpus )
151- continue ;
152- remaining_ncpus += ncpus ;
153- node_groups [n ].ncpus = ncpus ;
154- }
155-
156- numgrps = min_t (unsigned , remaining_ncpus , numgrps );
157-
158- sort (node_groups , nr_node_ids , sizeof (node_groups [0 ]),
125+ sort (node_groups , num_nodes , sizeof (node_groups [0 ]),
159126 ncpus_cmp_func , NULL );
160127
161128 /*
@@ -226,9 +193,8 @@ static void alloc_nodes_groups(unsigned int numgrps,
226193 * finally for each node X: grps(X) <= ncpu(X).
227194 *
228195 */
229- for (n = 0 ; n < nr_node_ids ; n ++ ) {
230- unsigned ngroups , ncpus ;
231196
197+ for (n = 0 ; n < num_nodes ; n ++ ) {
232198 if (node_groups [n ].ncpus == UINT_MAX )
233199 continue ;
234200
@@ -246,12 +212,199 @@ static void alloc_nodes_groups(unsigned int numgrps,
246212 }
247213}
248214
215+ /*
216+ * Allocate group number for each node, so that for each node:
217+ *
218+ * 1) the allocated number is >= 1
219+ *
220+ * 2) the allocated number is <= active CPU number of this node
221+ *
222+ * The actual allocated total groups may be less than @numgrps when
223+ * active total CPU number is less than @numgrps.
224+ *
225+ * Active CPUs means the CPUs in '@cpu_mask AND @node_to_cpumask[]'
226+ * for each node.
227+ */
228+ static void alloc_nodes_groups (unsigned int numgrps ,
229+ cpumask_var_t * node_to_cpumask ,
230+ const struct cpumask * cpu_mask ,
231+ const nodemask_t nodemsk ,
232+ struct cpumask * nmsk ,
233+ struct node_groups * node_groups )
234+ {
235+ unsigned int n , numcpus = 0 ;
236+
237+ for (n = 0 ; n < nr_node_ids ; n ++ ) {
238+ node_groups [n ].id = n ;
239+ node_groups [n ].ncpus = UINT_MAX ;
240+ }
241+
242+ for_each_node_mask (n , nodemsk ) {
243+ unsigned int ncpus ;
244+
245+ cpumask_and (nmsk , cpu_mask , node_to_cpumask [n ]);
246+ ncpus = cpumask_weight (nmsk );
247+
248+ if (!ncpus )
249+ continue ;
250+ numcpus += ncpus ;
251+ node_groups [n ].ncpus = ncpus ;
252+ }
253+
254+ numgrps = min_t (unsigned int , numcpus , numgrps );
255+ alloc_groups_to_nodes (numgrps , numcpus , node_groups , nr_node_ids );
256+ }
257+
258+ static void assign_cpus_to_groups (unsigned int ncpus ,
259+ struct cpumask * nmsk ,
260+ struct node_groups * nv ,
261+ struct cpumask * masks ,
262+ unsigned int * curgrp ,
263+ unsigned int last_grp )
264+ {
265+ unsigned int v , cpus_per_grp , extra_grps ;
266+ /* Account for rounding errors */
267+ extra_grps = ncpus - nv -> ngroups * (ncpus / nv -> ngroups );
268+
269+ /* Spread allocated groups on CPUs of the current node */
270+ for (v = 0 ; v < nv -> ngroups ; v ++ , * curgrp += 1 ) {
271+ cpus_per_grp = ncpus / nv -> ngroups ;
272+
273+ /* Account for extra groups to compensate rounding errors */
274+ if (extra_grps ) {
275+ cpus_per_grp ++ ;
276+ -- extra_grps ;
277+ }
278+
279+ /*
280+ * wrapping has to be considered given 'startgrp'
281+ * may start anywhere
282+ */
283+ if (* curgrp >= last_grp )
284+ * curgrp = 0 ;
285+ grp_spread_init_one (& masks [* curgrp ], nmsk , cpus_per_grp );
286+ }
287+ }
288+
289+ static int alloc_cluster_groups (unsigned int ncpus ,
290+ unsigned int ngroups ,
291+ struct cpumask * node_cpumask ,
292+ cpumask_var_t msk ,
293+ const struct cpumask * * * clusters_ptr ,
294+ struct node_groups * * cluster_groups_ptr )
295+ {
296+ unsigned int ncluster = 0 ;
297+ unsigned int cpu , nc , n ;
298+ const struct cpumask * cluster_mask ;
299+ const struct cpumask * * clusters ;
300+ struct node_groups * cluster_groups ;
301+
302+ cpumask_copy (msk , node_cpumask );
303+
304+ /* Probe how many clusters in this node. */
305+ while (1 ) {
306+ cpu = cpumask_first (msk );
307+ if (cpu >= nr_cpu_ids )
308+ break ;
309+
310+ cluster_mask = topology_cluster_cpumask (cpu );
311+ /* Clean out CPUs on the same cluster. */
312+ cpumask_andnot (msk , msk , cluster_mask );
313+ ncluster ++ ;
314+ }
315+
316+ /* If ngroups < ncluster, cross cluster is inevitable, skip. */
317+ if (ncluster == 0 || ncluster > ngroups )
318+ goto no_cluster ;
319+
320+ /* Allocate memory based on cluster number. */
321+ clusters = kcalloc (ncluster , sizeof (struct cpumask * ), GFP_KERNEL );
322+ if (!clusters )
323+ goto no_cluster ;
324+ cluster_groups = kcalloc (ncluster , sizeof (struct node_groups ), GFP_KERNEL );
325+ if (!cluster_groups )
326+ goto fail_cluster_groups ;
327+
328+ /* Filling cluster info for later process. */
329+ cpumask_copy (msk , node_cpumask );
330+ for (n = 0 ; n < ncluster ; n ++ ) {
331+ cpu = cpumask_first (msk );
332+ cluster_mask = topology_cluster_cpumask (cpu );
333+ nc = cpumask_weight_and (cluster_mask , node_cpumask );
334+ clusters [n ] = cluster_mask ;
335+ cluster_groups [n ].id = n ;
336+ cluster_groups [n ].ncpus = nc ;
337+ cpumask_andnot (msk , msk , cluster_mask );
338+ }
339+
340+ alloc_groups_to_nodes (ngroups , ncpus , cluster_groups , ncluster );
341+
342+ * clusters_ptr = clusters ;
343+ * cluster_groups_ptr = cluster_groups ;
344+ return ncluster ;
345+
346+ fail_cluster_groups :
347+ kfree (clusters );
348+ no_cluster :
349+ return 0 ;
350+ }
351+
/*
 * Try group CPUs evenly for cluster locality within a NUMA node.
 *
 * @ncpus:	number of active CPUs on this node
 * @ngroups:	groups allotted to this node
 * @node_cpumask: mask of this node's active CPUs
 * @masks:	output array of per-group CPU masks
 * @curgrp:	in/out index of the next group to fill; advanced by
 *		assign_cpus_to_groups(), wrapping at @last_grp
 * @last_grp:	one past the highest valid group index
 *
 * Return: true if success, false otherwise.
 */
static bool __try_group_cluster_cpus(unsigned int ncpus,
				     unsigned int ngroups,
				     struct cpumask *node_cpumask,
				     struct cpumask *masks,
				     unsigned int *curgrp,
				     unsigned int last_grp)
{
	struct node_groups *cluster_groups;
	const struct cpumask **clusters;
	unsigned int ncluster;
	bool ret = false;
	cpumask_var_t nmsk;
	unsigned int i, nc;

	if (!zalloc_cpumask_var(&nmsk, GFP_KERNEL))
		goto fail_nmsk_alloc;

	/*
	 * On success alloc_cluster_groups() hands back kcalloc()'ed
	 * 'clusters' and 'cluster_groups' arrays owned by us; both are
	 * left unset when it returns 0, so they must not be freed on
	 * the fail_no_clusters path.
	 */
	ncluster = alloc_cluster_groups(ncpus, ngroups, node_cpumask, nmsk,
					&clusters, &cluster_groups);

	if (ncluster == 0)
		goto fail_no_clusters;

	for (i = 0; i < ncluster; i++) {
		struct node_groups *nv = &cluster_groups[i];

		/* Get the cpus on this cluster. */
		cpumask_and(nmsk, node_cpumask, clusters[nv->id]);
		nc = cpumask_weight(nmsk);
		if (!nc)
			continue;
		/* A cluster must never get more groups than CPUs. */
		WARN_ON_ONCE(nv->ngroups > nc);

		assign_cpus_to_groups(nc, nmsk, nv, masks, curgrp, last_grp);
	}

	ret = true;
	kfree(cluster_groups);
	kfree(clusters);
fail_no_clusters:
	free_cpumask_var(nmsk);
fail_nmsk_alloc:
	return ret;
}
401+
249402static int __group_cpus_evenly (unsigned int startgrp , unsigned int numgrps ,
250403 cpumask_var_t * node_to_cpumask ,
251404 const struct cpumask * cpu_mask ,
252405 struct cpumask * nmsk , struct cpumask * masks )
253406{
254- unsigned int i , n , nodes , cpus_per_grp , extra_grps , done = 0 ;
407+ unsigned int i , n , nodes , done = 0 ;
255408 unsigned int last_grp = numgrps ;
256409 unsigned int curgrp = startgrp ;
257410 nodemask_t nodemsk = NODE_MASK_NONE ;
@@ -287,7 +440,7 @@ static int __group_cpus_evenly(unsigned int startgrp, unsigned int numgrps,
287440 alloc_nodes_groups (numgrps , node_to_cpumask , cpu_mask ,
288441 nodemsk , nmsk , node_groups );
289442 for (i = 0 ; i < nr_node_ids ; i ++ ) {
290- unsigned int ncpus , v ;
443+ unsigned int ncpus ;
291444 struct node_groups * nv = & node_groups [i ];
292445
293446 if (nv -> ngroups == UINT_MAX )
@@ -301,28 +454,14 @@ static int __group_cpus_evenly(unsigned int startgrp, unsigned int numgrps,
301454
302455 WARN_ON_ONCE (nv -> ngroups > ncpus );
303456
304- /* Account for rounding errors */
305- extra_grps = ncpus - nv -> ngroups * (ncpus / nv -> ngroups );
306-
307- /* Spread allocated groups on CPUs of the current node */
308- for (v = 0 ; v < nv -> ngroups ; v ++ , curgrp ++ ) {
309- cpus_per_grp = ncpus / nv -> ngroups ;
310-
311- /* Account for extra groups to compensate rounding errors */
312- if (extra_grps ) {
313- cpus_per_grp ++ ;
314- -- extra_grps ;
315- }
316-
317- /*
318- * wrapping has to be considered given 'startgrp'
319- * may start anywhere
320- */
321- if (curgrp >= last_grp )
322- curgrp = 0 ;
323- grp_spread_init_one (& masks [curgrp ], nmsk ,
324- cpus_per_grp );
457+ if (__try_group_cluster_cpus (ncpus , nv -> ngroups , nmsk ,
458+ masks , & curgrp , last_grp )) {
459+ done += nv -> ngroups ;
460+ continue ;
325461 }
462+
463+ assign_cpus_to_groups (ncpus , nmsk , nv , masks , & curgrp ,
464+ last_grp );
326465 done += nv -> ngroups ;
327466 }
328467 kfree (node_groups );
0 commit comments