Commit 6cbb834

mayastor-bors and tiagolobocastro committed
chore(bors): merge pull request #1038
1038: feat(affinity-group): support scale down to 1 replica (backport #1037) r=tiagolobocastro a=mergify[bot]

Scaling affinity groups down to 1 replica was previously blocked because we didn't want replicas to end up sharing the same node, which would defeat the purpose of Affinity Groups! We now loosen this restriction and allow the scale down whenever it cannot lead to that scenario :)

This is done by building a list of restricted nodes: the nodes where each 1-replica volume of the same affinity group places its replica. A scale down is refused if it would leave the volume's last remaining replica on one of those restricted nodes.

This isn't the full monty, as replicas may still share nodes when scaling back from 3 to 2, which would then block a further scale down to 1. Addressing that would require improving the candidate sorting logic. The limitation is fine for now: scaling up and then down again can be documented as a way of moving replicas around.

This is an automatic backport of pull request #1037 done by [Mergify](https://mergify.com).

Co-authored-by: Tiago Castro <[email protected]>
2 parents 656b9e5 + 68def69 commit 6cbb834
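To make the restricted-nodes rule described above concrete, here is a minimal standalone sketch of the idea in Rust. The names used here (Volume, restricted_nodes, can_scale_down_to_one, the node identifiers) are hypothetical simplifications for illustration only, not the control-plane's actual API; the real implementation lives in the scheduling code changed below.

use std::collections::HashSet;

// Hypothetical, simplified stand-in for a volume in an affinity group:
// it just records which nodes currently host one of its replicas.
struct Volume {
    name: &'static str,
    replica_nodes: Vec<&'static str>,
}

// Nodes hosting the replica of any single-replica volume in the group are "restricted":
// the last remaining replica of a volume being scaled down must not end up on one of them.
fn restricted_nodes(group: &[Volume]) -> HashSet<&'static str> {
    group
        .iter()
        .filter(|v| v.replica_nodes.len() == 1)
        .flat_map(|v| v.replica_nodes.iter().copied())
        .collect()
}

// Allow a 2 -> 1 scale down only if the replica that would remain sits on an
// unrestricted node, so single-replica volumes of the group never share a node.
fn can_scale_down_to_one(volume: &Volume, remove_node: &str, restricted: &HashSet<&'static str>) -> bool {
    volume
        .replica_nodes
        .iter()
        .any(|n| *n != remove_node && !restricted.contains(n))
}

fn main() {
    let group = vec![
        Volume { name: "vol-a", replica_nodes: vec!["node-1"] }, // already at 1 replica
        Volume { name: "vol-b", replica_nodes: vec!["node-1", "node-2"] },
    ];
    let restricted = restricted_nodes(&group);
    // Removing vol-b's replica on node-1 leaves it alone on node-2: allowed.
    println!("{} drop node-1: {}", group[1].name, can_scale_down_to_one(&group[1], "node-1", &restricted));
    // Removing the node-2 replica would strand vol-b on restricted node-1: blocked.
    println!("{} drop node-2: {}", group[1].name, can_scale_down_to_one(&group[1], "node-2", &restricted));
}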

11 files changed: +386 -53 lines changed

.github/mergify.yml

Lines changed: 13 additions & 0 deletions
@@ -0,0 +1,13 @@
+pull_request_rules:
+  - name: auto-label and approve backport PRs
+    conditions:
+      - author=mergify[bot]
+      - title~=\(backport \#[0-9]+\)$
+    actions:
+      label:
+        add:
+          - kind/backport
+      review:
+        type: APPROVE
+        message: "Automatically approved backport PR"
+        bot_account: openebs-ci
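As a worked example of the rule above: a pull request opened by mergify[bot] whose title ends in a suffix such as "(backport #1037)" (the PR merged by this commit) matches both conditions, so Mergify adds the kind/backport label and submits an approving review from the openebs-ci account.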

control-plane/agents/src/bin/core/controller/scheduling/affinity_group.rs

Lines changed: 22 additions & 1 deletion
@@ -1,5 +1,5 @@
 use crate::controller::{registry::Registry, resources::ResourceUid};
-use std::collections::HashMap;
+use std::collections::{HashMap, HashSet};
 use stor_port::types::v0::{
     store::volume::{AffinityGroupSpec, VolumeOperation, VolumeSpec},
     transport::{NodeId, PoolId},
@@ -61,6 +61,27 @@ pub(crate) async fn get_pool_ag_replica_count(
     pool_ag_replica_count
 }

+/// Get the set of nodes holding the replica of any single-replica volume in the affinity group.
+pub(crate) async fn ag_restricted_nodes(
+    affinity_group_spec: &AffinityGroupSpec,
+    registry: &Registry,
+) -> HashSet<NodeId> {
+    let mut nodes = HashSet::<NodeId>::new();
+    let specs = registry.specs();
+    for volume_id in affinity_group_spec.volumes() {
+        let Some(volume) = registry.specs().volume_rsc(volume_id) else {
+            // should not really happen
+            continue;
+        };
+        if volume.lock().num_replicas != 1 {
+            continue;
+        }
+
+        nodes.extend(specs.volume_replica_nodes(volume_id));
+    }
+    nodes
+}
+
 /// Get the map of node to the number of the Affinity Group nexuses on the node.
 pub(crate) async fn get_node_ag_nexus_count(
     affinity_group_spec: &AffinityGroupSpec,

control-plane/agents/src/bin/core/controller/scheduling/mod.rs

Lines changed: 12 additions & 1 deletion
@@ -332,13 +332,23 @@ impl ChildSorters {
     /// Sort replicas by their nexus child (state and rebuild progress)
     /// todo: should we use weights instead (like moac)?
     pub(crate) fn sort(
-        _request: &GetChildForRemovalContext,
+        request: &GetChildForRemovalContext,
         a: &ReplicaItem,
         b: &ReplicaItem,
     ) -> std::cmp::Ordering {
         match Self::sort_by_health(a, b) {
             Ordering::Equal => match Self::sort_by_child(a, b) {
                 Ordering::Equal => {
+                    if let Some(ag) = request.affinity_group() {
+                        match (
+                            a.ag_restricted_node(ag.restricted_nodes()),
+                            b.ag_restricted_node(ag.restricted_nodes()),
+                        ) {
+                            (Some(a), Some(b)) if a != b => return b.cmp(&a),
+                            _ => {}
+                        }
+                    }
+
                     // Removing replicas from cordoned nodes takes the highest priority.
                     // Remove mismatched topology replicas first.
                     if let (Some(a), Some(b)) = (a.valid_node_topology(), b.valid_node_topology()) {
@@ -377,6 +387,7 @@ impl ChildSorters {
                 let childb_is_local = !b.spec().share.shared();
                 match (childa_is_local, childb_is_local) {
                     (true, true) | (false, false) => {
+                        // todo: this should probably be done regardless of child locality
                         b.ag_replicas_on_pool().cmp(&a.ag_replicas_on_pool())
                     }
                     (true, false) => Ordering::Greater,
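The new branch in ChildSorters::sort settles ties between two otherwise-equal replicas purely by whether each sits on a restricted node, before the cordon and topology tie-breakers run. Below is a standalone sketch of just that comparison; Replica and restricted_node_tie_break are hypothetical stand-ins for ReplicaItem and the real sorter, used only to show how the boolean comparison orders the pair.

use std::cmp::Ordering;

// Hypothetical, stripped-down replica: only the flag the new branch looks at.
struct Replica {
    on_restricted_node: bool,
}

// Mirrors the added tie-breaker: when the two flags differ, the ordering is decided
// by `b.cmp(&a)` on the booleans alone; when they match, later tie-breakers decide.
fn restricted_node_tie_break(a: &Replica, b: &Replica) -> Ordering {
    match (a.on_restricted_node, b.on_restricted_node) {
        (x, y) if x != y => y.cmp(&x),
        _ => Ordering::Equal, // fall through to the other criteria
    }
}

fn main() {
    let restricted = Replica { on_restricted_node: true };
    let unrestricted = Replica { on_restricted_node: false };
    // false.cmp(&true) == Ordering::Less, so the restricted-node replica sorts ahead of the other.
    assert_eq!(restricted_node_tie_break(&restricted, &unrestricted), Ordering::Less);
    println!("{:?}", restricted_node_tie_break(&restricted, &unrestricted));
}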

control-plane/agents/src/bin/core/controller/scheduling/resources/mod.rs

Lines changed: 16 additions & 1 deletion
@@ -16,7 +16,11 @@ use stor_port::types::v0::{
     transport::{Child, ChildUri, NodeId, PoolId, Replica},
 };

-use std::{cmp::Ordering, collections::HashMap, ops::Deref};
+use std::{
+    cmp::Ordering,
+    collections::{HashMap, HashSet},
+    ops::Deref,
+};

 /// Item for pool scheduling logic.
 #[derive(Debug, Clone)]
@@ -293,6 +297,10 @@ impl ReplicaItem {
         self.valid_pool_topology = valid_pool_topology;
         self
     }
+    /// Check that all topological information is valid.
+    pub(crate) fn valid_topology(&self) -> bool {
+        self.valid_pool_topology() == &Some(true) && self.valid_node_topology() == Some(true)
+    }
     /// Get a reference to the node topology validity information.
     pub(crate) fn node_topology_info(&self) -> &Option<TopologyRmInfo> {
         &self.node_topology_info
@@ -338,6 +346,13 @@ impl ReplicaItem {
     pub(crate) fn ag_replicas_on_pool(&self) -> u64 {
         self.ag_replicas_on_pool.unwrap_or(u64::MIN)
     }
+    /// Check if this replica is on an Affinity Group restricted node.
+    /// A restricted node is a node where another single-replica volume (from the same AG)
+    /// already resides.
+    pub(crate) fn ag_restricted_node(&self, restricted_nodes: &HashSet<NodeId>) -> Option<bool> {
+        let node_spec = self.node_spec.as_ref()?;
+        Some(restricted_nodes.get(node_spec.id()).is_some())
+    }
 }

 /// Individual nexus child (replicas) which can be used for nexus creation.

control-plane/agents/src/bin/core/controller/scheduling/volume.rs

Lines changed: 102 additions & 12 deletions
@@ -22,11 +22,12 @@ use stor_port::types::v0::{
     transport::{NodeId, PoolId, VolumeState},
 };

-use crate::controller::scheduling::ResourceExcReason;
+use crate::controller::scheduling::{affinity_group::ag_restricted_nodes, ResourceExcReason};
 use std::{
-    collections::{BTreeMap, HashMap},
+    collections::{BTreeMap, HashMap, HashSet},
     ops::Deref,
 };
+use stor_port::types::v0::store::volume::AffinityGroupSpec;

 /// Move replica to another pool.
 #[derive(Default, Clone)]
@@ -273,6 +274,20 @@ impl GetChildForRemoval {
     }
 }

+/// Useful Affinity Group information when scaling down.
+#[derive(Clone)]
+pub(crate) struct AffinityGroupRmCtx {
+    _group: AffinityGroupSpec,
+    pool_repl_cnt: HashMap<PoolId, u64>,
+    restricted_nodes: HashSet<NodeId>,
+}
+impl AffinityGroupRmCtx {
+    /// Get the Affinity Group restricted nodes.
+    pub(crate) fn restricted_nodes(&self) -> &HashSet<NodeId> {
+        &self.restricted_nodes
+    }
+}
+
 /// Used to filter nexus children in order to choose the best candidates for removal
 /// when the volume's replica count is being reduced.
 #[derive(Clone)]
@@ -282,6 +297,7 @@ pub(crate) struct GetChildForRemovalContext {
     state: VolumeState,
     nexus_info: Option<NexusInfo>,
     unused_only: bool,
+    affinity_group: Option<AffinityGroupRmCtx>,
 }
 impl std::fmt::Debug for GetChildForRemovalContext {
     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -295,7 +311,11 @@ impl std::fmt::Debug for GetChildForRemovalContext {
 }

 impl GetChildForRemovalContext {
-    async fn new(registry: &Registry, request: &GetChildForRemoval) -> Result<Self, SvcError> {
+    async fn new(
+        registry: &Registry,
+        request: &GetChildForRemoval,
+        affinity_group: Option<AffinityGroupSpec>,
+    ) -> Result<Self, SvcError> {
         let nexus_info = registry
             .nexus_info(
                 Some(&request.spec.uuid),
@@ -311,10 +331,19 @@ impl GetChildForRemovalContext {
             state: request.state.clone(),
             nexus_info,
             unused_only: request.unused_only,
+            affinity_group: match affinity_group {
+                None => None,
+                Some(group) => Some(AffinityGroupRmCtx {
+                    pool_repl_cnt: get_pool_ag_replica_count(&group, registry).await,
+                    restricted_nodes: ag_restricted_nodes(&group, registry).await,
+                    _group: group,
+                }),
+            },
         })
     }

-    async fn list(&self, pool_ag_rep: &Option<HashMap<PoolId, u64>>) -> Vec<ReplicaItem> {
+    async fn list(&self) -> Vec<ReplicaItem> {
+        let pool_ag_rep = self.affinity_group.as_ref().map(|g| &g.pool_repl_cnt);
         let replicas = self.registry.specs().volume_replicas_cln(&self.spec.uuid);
         let nexus = self.registry.specs().volume_target_nexus_rsc(&self.spec);
         let used_node_spread =
@@ -386,19 +415,22 @@ impl GetChildForRemovalContext {
             })
             .collect::<Vec<_>>()
     }
+
+    /// Get the volume's affinity group.
+    pub(crate) fn affinity_group(&self) -> Option<&AffinityGroupRmCtx> {
+        self.affinity_group.as_ref()
+    }
 }

 impl DecreaseVolumeReplica {
     async fn builder(request: &GetChildForRemoval, registry: &Registry) -> Result<Self, SvcError> {
-        let mut pool_ag_replica_count_map: Option<HashMap<PoolId, u64>> = None;
-        if let Some(affinity_group) = &request.spec.affinity_group {
-            let affinity_group_spec = registry.specs().affinity_group_spec(affinity_group.id())?;
-            pool_ag_replica_count_map =
-                Some(get_pool_ag_replica_count(&affinity_group_spec, registry).await);
-        }
+        let affinity_group = request.spec.affinity_group.as_ref();
+        let affinity_group = affinity_group
+            .map(|g| registry.specs().affinity_group_spec(g.id()))
+            .transpose()?;

-        let context = GetChildForRemovalContext::new(registry, request).await?;
-        let list = context.list(&pool_ag_replica_count_map).await;
+        let context = GetChildForRemovalContext::new(registry, request, affinity_group).await?;
+        let list = context.list().await;
         Ok(Self {
             data: ResourceData::new(context, list),
         })
@@ -448,6 +480,24 @@ impl ReplicaRemovalCandidates {
             None
         }
     }
+    fn ag_restricted_nodes(&self) -> Option<&HashSet<NodeId>> {
+        let ag = self.context.affinity_group.as_ref()?;
+        if ag.restricted_nodes.is_empty() {
+            return None;
+        }
+        //   N
+        // R  1 2 3
+        // 1  x x
+        // 2  x x
+        // 3  x x x
+        // todo: In this case, shouldn't we block removal of r3n3?
+        if self.context.spec.num_replicas > 2 {
+            // for now, we only restrict from 2->1
+            return None;
+        }
+        Some(&ag.restricted_nodes)
+    }
+
     /// Get the next unhealthy candidates (any is a good fit).
     fn next_unhealthy(&mut self) -> Option<ReplicaItem> {
         self.unhealthy.pop()
@@ -457,6 +507,46 @@ impl ReplicaRemovalCandidates {
     pub(crate) fn next(&mut self) -> Option<ReplicaItem> {
         self.next_unhealthy().or_else(|| self.next_healthy())
     }
+    /// Get the next removal candidate.
+    /// Unhealthy replicas are removed before healthy replicas.
+    /// Use when scaling down the volume, where we need to guard against restricted node usage.
+    pub(crate) fn next_down(&mut self) -> Option<Option<ReplicaItem>> {
+        // We don't want to (potentially) leave 1-replica affinity volumes placed on the same
+        // node, so we should check whether we have other replicas available.
+        let mut replicas_in_unrestricted_nodes = false;
+
+        let candidate = self.next_unhealthy().or_else(|| self.next_healthy())?;
+
+        let Some(restricted_nodes) = self.ag_restricted_nodes() else {
+            // no affinity groups, just remove the best candidate
+            return Some(Some(candidate));
+        };
+        // todo: still need a better way of controlling the priority of candidates when
+        // scaling from X -> X-1, where X > 2
+        // also we can block scale down to 1, if we don't have sufficient distinct nodes
+        // to place all the other volume replicas

+        tracing::debug!("Restricted Affinity Group Nodes: {restricted_nodes:?}");
+        tracing::debug!("Candidate for removal: {}", candidate.spec().uuid);
+
+        for repl in &self.healthy {
+            let Some(node) = repl.node_spec() else {
+                continue;
+            };
+            if restricted_nodes.get(node.id()).is_none()
+                && (repl.valid_topology() || !candidate.valid_topology())
+            {
+                replicas_in_unrestricted_nodes = true;
+                break;
+            }
+        }
+
+        if replicas_in_unrestricted_nodes {
+            Some(Some(candidate))
+        } else {
+            Some(None)
+        }
+    }

     fn new(context: GetChildForRemovalContext, items: Vec<ReplicaItem>) -> Self {
         let has_info = context.nexus_info.is_some();
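A note on the return type of next_down, as read from the code above: Option<Option<ReplicaItem>> distinguishes three outcomes. None means there is no removal candidate at all; Some(None) means a candidate exists but removing it would leave the last remaining replica confined to restricted nodes, so the scale down is held back; Some(Some(item)) means the candidate can be removed safely.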
