
Commit f1f5a5b

mayastor-bors and tiagolobocastro committed
chore(bors): merge pull request #639
639: fix(nexus/replica): check with replica owner before destroying it r=tiagolobocastro a=tiagolobocastro

A user hit a very weird situation where there were 2 created nexuses containing the same replica. How can this happen? One potential cause is fixed here: we now collect the volume state AFTER we take the volume guard. It's a very tight race, though, so I suspect something else might still be at play. Regardless of how this can happen, we now plug the hole by ensuring we always check with the volume replica removal logic before attempting to disown and destroy a replica.

Co-authored-by: Tiago Castro <[email protected]>
2 parents c660b0e + b2ce7c0 commit f1f5a5b
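For readers skimming the diffs below: the race half of this fix is purely an ordering change. The reconciler used to collect the volume state first and take the operation guard second, leaving a window in which the collected state could go stale. A minimal sketch of the corrected ordering, using toy types rather than the control plane's real `VolumeSpec`/`PollContext` (all names below are illustrative):

```rust
use std::sync::{Arc, Mutex};

// Toy stand-in for the volume spec; illustrative only.
#[derive(Clone, Debug)]
struct Spec {
    self_heal: bool,
}

fn reconcile(spec: &Arc<Mutex<Spec>>) {
    // Racy ordering (conceptually what the old code did):
    //     let snapshot = spec.lock().unwrap().clone(); // read state first...
    //     let _guard = take_operation_guard();         // ...guard second: stale window!
    //
    // Fixed ordering: take the guard first, then snapshot the state, so nothing
    // can mutate the volume between the read and the reconcile step.
    let guard = spec.lock().unwrap(); // stands in for the operation guard
    let snapshot = guard.clone(); // state collected AFTER the guard is held
    if snapshot.self_heal {
        println!("reconciling with a consistent snapshot: {:?}", snapshot);
    }
}

fn main() {
    let spec = Arc::new(Mutex::new(Spec { self_heal: true }));
    reconcile(&spec);
}
```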

File tree

6 files changed: +200 −64 lines changed
Lines changed: 16 additions & 0 deletions
```diff
@@ -0,0 +1,16 @@
+name: Submodule Branch Check
+on:
+  pull_request:
+    types: ['opened', 'edited', 'reopened', 'synchronize']
+  push:
+    branches:
+      - 'release/**'
+      - staging
+jobs:
+  submodule-branch:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v3
+      - name: Check root submodules branch
+        run: echo "Compat pass"
+
```

control-plane/agents/core/src/core/reconciler/volume/hot_spare.rs

Lines changed: 9 additions & 6 deletions
```diff
@@ -48,21 +48,24 @@ async fn hot_spare_reconcile(
     volume_spec: &Arc<Mutex<VolumeSpec>>,
     context: &PollContext,
 ) -> PollResult {
-    let uuid = volume_spec.lock().uuid.clone();
-    let volume_state = context.registry().get_volume_state(&uuid).await?;
     let _guard = match volume_spec.operation_guard(OperationMode::ReconcileStart) {
         Ok(guard) => guard,
         Err(_) => return PollResult::Ok(PollerState::Busy),
     };
-    let mode = OperationMode::ReconcileStep;
-
-    if !volume_spec.lock().policy.self_heal {
+    let volume_spec_cln = volume_spec.lock().clone();
+    if !volume_spec_cln.policy.self_heal {
         return PollResult::Ok(PollerState::Idle);
     }
-    if !volume_spec.lock().status.created() {
+    if !volume_spec_cln.status.created() {
         return PollResult::Ok(PollerState::Idle);
     }
 
+    let volume_state = context
+        .registry()
+        .get_volume_spec_state(volume_spec_cln)
+        .await?;
+    let mode = OperationMode::ReconcileStep;
+
     match volume_state.status {
         VolumeStatus::Online => volume_replica_count_reconciler(volume_spec, context, mode).await,
         VolumeStatus::Unknown | VolumeStatus::Degraded => {
```

control-plane/agents/core/src/nexus/specs.rs

Lines changed: 16 additions & 30 deletions
```diff
@@ -5,7 +5,7 @@ use crate::core::{
 };
 use common::errors::{NexusNotFound, SvcError};
 use common_lib::{
-    mbus_api::{ErrorChain, ResourceKind},
+    mbus_api::ResourceKind,
     types::v0::{
         message_bus::{
             AddNexusChild, AddNexusReplica, Child, ChildUri, CreateNexus, DestroyNexus, Nexus,
@@ -16,7 +16,7 @@ use common_lib::{
             nexus::{NexusOperation, NexusSpec},
             nexus_child::NexusChild,
             replica::ReplicaSpec,
-            OperationMode, SpecStatus, SpecTransaction, TraceSpan,
+            OperationMode, SpecStatus, SpecTransaction,
         },
     },
 };
@@ -433,37 +433,23 @@ impl ResourceSpecsLocked {
             .ok_or(SvcError::ReplicaNotFound {
                 replica_id: replica.uuid().clone(),
             })?;
-        let pool_id = replica_spec.lock().pool.clone();
-        match Self::get_pool_node(registry, pool_id).await {
-            Some(node) => {
-                if let Err(error) = self
-                    .disown_and_destroy_replica(registry, &node, replica.uuid())
-                    .await
-                {
-                    nexus_spec.lock().clone().error_span(|| {
-                        tracing::error!(
-                            replica.uuid = %replica.uuid(),
-                            error = %error.full_string(),
-                            "Failed to disown and destroy the replica"
-                        )
-                    });
-                }
-            }
-            None => {
-                // The replica node was not found (perhaps because it is offline).
-                // The replica can't be destroyed because the node isn't there.
-                // Instead, disown the replica from the volume and let the garbage
-                // collector destroy it later.
-                nexus_spec.lock().clone().warn_span(|| {
-                    tracing::warn!(
-                        replica.uuid = %replica.uuid(),
-                        "Failed to find the node where the replica is hosted"
+        if !replica_spec.lock().owners.owned_by_a_nexus() {
+            let owner_volume = {
+                let replica_spec = replica_spec.lock();
+                replica_spec.owners.volume().cloned()
+            };
+            if let Some(owner) = owner_volume {
+                if let Some(volume) = self.get_locked_volume(&owner) {
+                    self.remove_unused_volume_replica(
+                        registry,
+                        &volume,
+                        replica.uuid(),
+                        mode,
                     )
-                });
-                let _ = self.disown_volume_replica(registry, &replica_spec).await;
+                    .await?;
+                }
             }
         }
-
         Ok(())
     }
     result => result,
```
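The hole-plug itself is the ownership check above: a replica is only handed to the volume's removal logic when no nexus still claims it, and only via its owning volume. A minimal sketch of that decision, with a toy ownership type standing in for the real `ReplicaOwners` (illustrative, not the control-plane API):

```rust
// Toy ownership model; the real one lives in common_lib.
#[derive(Debug)]
struct ReplicaOwners {
    volume: Option<String>, // owning volume uuid, if any
    nexuses: Vec<String>,   // nexuses that still reference the replica
}

impl ReplicaOwners {
    fn owned_by_a_nexus(&self) -> bool {
        !self.nexuses.is_empty()
    }
}

/// Decide whether a replica may be handed to the volume's removal logic.
fn may_remove(owners: &ReplicaOwners) -> Option<&String> {
    if owners.owned_by_a_nexus() {
        // Another nexus still uses this replica: never destroy it here.
        None
    } else {
        // Only the owning volume (if any) may decide to remove it.
        owners.volume.as_ref()
    }
}

fn main() {
    let shared = ReplicaOwners {
        volume: Some("vol-1".into()),
        nexuses: vec!["nexus-2".into()],
    };
    assert!(may_remove(&shared).is_none()); // shared by a nexus: keep it

    let exclusive = ReplicaOwners {
        volume: Some("vol-1".into()),
        nexuses: vec![],
    };
    assert_eq!(may_remove(&exclusive).map(String::as_str), Some("vol-1"));
    println!("ownership checks passed");
}
```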

control-plane/agents/core/src/volume/registry.rs

Lines changed: 11 additions & 0 deletions
```diff
@@ -22,6 +22,17 @@ impl Registry {
         .await
     }
 
+    /// Get the volume state for the specified volume spec.
+    pub(crate) async fn get_volume_spec_state(
+        &self,
+        volume_spec: VolumeSpec,
+    ) -> Result<VolumeState, SvcError> {
+        let replica_specs = self.specs().get_cloned_volume_replicas(&volume_spec.uuid);
+
+        self.get_volume_state_with_replicas(&volume_spec, &replica_specs)
+            .await
+    }
+
     /// Get the volume state for the specified volume
     #[tracing::instrument(level = "info", skip(self, volume_spec, replicas))]
     pub(crate) async fn get_volume_state_with_replicas(
```
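The new `get_volume_spec_state` helper derives the state from a spec snapshot the caller already holds instead of re-fetching the spec by uuid, which matters here because the caller cloned that snapshot while holding the operation guard. A toy sketch of the delegation pattern (types heavily simplified; not the real registry):

```rust
// Simplified stand-ins; the real types live in the control-plane crates.
#[derive(Clone)]
struct VolumeSpec {
    uuid: String,
}
struct ReplicaSpec;
struct VolumeState {
    online: bool,
}

struct Registry;

impl Registry {
    fn cloned_volume_replicas(&self, _uuid: &str) -> Vec<ReplicaSpec> {
        vec![ReplicaSpec]
    }

    // Derive state from a spec snapshot the caller already holds, rather
    // than re-reading the spec by uuid (which may have changed meanwhile).
    fn volume_spec_state(&self, spec: VolumeSpec) -> VolumeState {
        let replicas = self.cloned_volume_replicas(&spec.uuid);
        self.volume_state_with_replicas(&spec, &replicas)
    }

    fn volume_state_with_replicas(&self, _spec: &VolumeSpec, replicas: &[ReplicaSpec]) -> VolumeState {
        VolumeState {
            online: !replicas.is_empty(),
        }
    }
}

fn main() {
    let state = Registry.volume_spec_state(VolumeSpec { uuid: "vol-1".into() });
    println!("online: {}", state.online);
}
```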

control-plane/agents/core/src/volume/specs.rs

Lines changed: 13 additions & 20 deletions
```diff
@@ -655,6 +655,8 @@ impl ResourceSpecsLocked {
         {
             Ok(_) => Ok(()),
             Err(error) if !request.force() => Err(error),
+            Err(error @ SvcError::Conflict {}) => Err(error),
+            Err(error @ SvcError::StoreSave { .. }) => Err(error),
             Err(error) => {
                 let node_online = match registry.get_node_wrapper(&nexus_clone.node).await {
                     Ok(node) => {
@@ -1053,6 +1055,17 @@ impl ResourceSpecsLocked {
         .await
         {
             Ok(_) => Ok(()),
+            Err(SvcError::GrpcRequestError {
+                source,
+                request,
+                resource,
+            }) if source.code() == tonic::Code::DeadlineExceeded => {
+                Err(SvcError::GrpcRequestError {
+                    source,
+                    request,
+                    resource,
+                })
+            }
             Err(error) => {
                 if let Some(replica) = self.get_replica(&replica.uuid) {
                     let mut replica = replica.lock();
@@ -1333,26 +1346,6 @@ impl ResourceSpecsLocked {
         .await
     }
 
-    /// Disown and destroy the replica from its volume
-    pub(crate) async fn disown_and_destroy_replica(
-        &self,
-        registry: &Registry,
-        node: &NodeId,
-        replica_uuid: &ReplicaId,
-    ) -> Result<(), SvcError> {
-        if let Some(replica) = self.get_replica(replica_uuid) {
-            // disown it from the volume first, so at the very least it can be garbage collected
-            // at a later point if the node is not accessible
-            self.disown_volume_replica(registry, &replica).await?;
-            self.destroy_volume_replica(registry, Some(node), &replica)
-                .await
-        } else {
-            Err(SvcError::ReplicaNotFound {
-                replica_id: replica_uuid.to_owned(),
-            })
-        }
-    }
-
     /// Remove volume by its `id`
     pub(super) fn remove_volume(&self, id: &VolumeId) {
         let mut specs = self.write();
```
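The added match arms route certain errors straight through rather than into the local-recovery branch: a deadline-exceeded gRPC call is ambiguous, since the operation may still have completed on the node. A toy illustration of this "surface ambiguous errors, recover only from definite ones" routing (error variants simplified; not the real `SvcError`):

```rust
#[derive(Debug)]
enum SvcError {
    Conflict,
    StoreSave,
    Timeout,  // ambiguous: the remote operation may still have succeeded
    NodeDown, // definite failure with a known local recovery
}

fn handle(result: Result<(), SvcError>) -> Result<(), SvcError> {
    match result {
        Ok(()) => Ok(()),
        // Ambiguous or conflicting errors are surfaced untouched...
        Err(error @ (SvcError::Conflict | SvcError::StoreSave | SvcError::Timeout)) => Err(error),
        // ...only definite failures take the local-recovery path.
        Err(error) => {
            eprintln!("recovering locally from {:?}", error);
            Ok(())
        }
    }
}

fn main() {
    assert!(handle(Err(SvcError::Timeout)).is_err());
    assert!(handle(Err(SvcError::Conflict)).is_err());
    assert!(handle(Err(SvcError::StoreSave)).is_err());
    assert!(handle(Err(SvcError::NodeDown)).is_ok());
    println!("error routing behaved as expected");
}
```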

control-plane/agents/core/src/volume/tests.rs

Lines changed: 135 additions & 8 deletions
```diff
@@ -30,7 +30,10 @@ use common_lib::{
             ReplicaId, ReplicaOwners, VolumeId,
         },
         openapi::{models, models::NodeStatus, tower::client::Error},
-        store::{definitions::StorableObject, volume::VolumeSpec},
+        store::{
+            definitions::StorableObject, nexus::ReplicaUri, nexus_child::NexusChild,
+            volume::VolumeSpec,
+        },
     },
 };
 use std::{
@@ -903,26 +906,38 @@ async fn wait_till_volume(volume: &VolumeId, replicas: usize) {
     }
 }
 
-/// Wait for a volume to reach the provided status
-async fn wait_till_volume_status(cluster: &Cluster, volume: &Uuid, status: models::VolumeStatus) {
-    let timeout = Duration::from_secs(RECONCILE_TIMEOUT_SECS);
+/// Wait for a volume to reach the provided status with timeout.
+async fn wait_till_volume_status_timeout(
+    cluster: &Cluster,
+    volume: &Uuid,
+    status: models::VolumeStatus,
+    timeout: Duration,
+) -> Result<(), String> {
     let start = std::time::Instant::now();
     loop {
         let volume = cluster.rest_v00().volumes_api().get_volume(volume).await;
         if volume.as_ref().unwrap().state.status == status {
-            return;
+            return Ok(());
         }
 
         if std::time::Instant::now() > (start + timeout) {
-            panic!(
-                "Timeout waiting for the volume to reach the specified status ('{:?}'), current: '{:?}'",
-                status, volume
+            return Err(
+                format!("Timeout waiting for the volume to reach the specified status ('{:?}'), current: '{:?}'",
+                status, volume)
             );
         }
         tokio::time::sleep(Duration::from_millis(500)).await;
     }
 }
 
+/// Wait for a volume to reach the provided status.
+async fn wait_till_volume_status(cluster: &Cluster, volume: &Uuid, status: models::VolumeStatus) {
+    let timeout = Duration::from_secs(RECONCILE_TIMEOUT_SECS);
+    wait_till_volume_status_timeout(cluster, volume, status, timeout)
+        .await
+        .unwrap();
+}
+
 /// Either fault the local replica, the remote, or set the nexus as having an unclean shutdown
 #[derive(Debug)]
 enum FaultTest {
@@ -1499,3 +1514,115 @@ async fn smoke_test() {
     assert!(GetNexuses::default().request().await.unwrap().0.is_empty());
     assert!(GetReplicas::default().request().await.unwrap().0.is_empty());
 }
+
+/// When a second nexus with the same child is created for some reason, ensure that removing
+/// a replica doesn't cause the replica to be disowned from the volume and destroyed.
+/// This is something that shouldn't happen to begin with but this adds a safety net just in case.
+#[tokio::test]
+async fn duplicate_nexus_missing_children() {
+    let reconcile_period = Duration::from_millis(100);
+    let cluster = ClusterBuilder::builder()
+        .with_rest(true)
+        .with_agents(vec!["core"])
+        .with_mayastors(2)
+        .with_pool(1, "malloc:///d?size_mb=100")
+        .with_cache_period("100ms")
+        .with_reconcile_period(reconcile_period, reconcile_period)
+        .build()
+        .await
+        .unwrap();
+    let nodes = GetNodes::default().request().await.unwrap();
+    tracing::info!("Nodes: {:?}", nodes);
+
+    let volume = CreateVolume {
+        uuid: "1e3cf927-80c2-47a8-adf0-95c486bdd7b7".try_into().unwrap(),
+        size: 5242880,
+        replicas: 1,
+        ..Default::default()
+    }
+    .request()
+    .await
+    .unwrap();
+
+    let fake_volume = CreateVolume {
+        uuid: "2e3cf927-80c2-47a8-adf0-95c486bdd7b7".try_into().unwrap(),
+        size: 5242880,
+        replicas: 1,
+        ..Default::default()
+    }
+    .request()
+    .await
+    .unwrap();
+
+    let volume = PublishVolume::new(volume.spec().uuid.clone(), Some(cluster.node(0)), None)
+        .request()
+        .await
+        .unwrap();
+
+    tracing::info!("Volume: {:?}", volume);
+    let volume_state = volume.state();
+    let nexus = volume_state.target.unwrap().clone();
+
+    let child = nexus.children.first().cloned().unwrap();
+    let replica_uri = ReplicaUri::new(
+        &ReplicaId::try_from(child.uri.uuid_str().unwrap()).unwrap(),
+        &child.uri,
+    );
+
+    let local = "malloc:///local?size_mb=12&uuid=4a7b0566-8ec6-49e0-a8b2-1d9a292cf59b".into();
+
+    let bad_nexus = CreateNexus {
+        node: cluster.node(1),
+        uuid: NexusId::try_from("f086f12c-1728-449e-be32-9415051090d6").unwrap(),
+        size: 5242880,
+        children: vec![NexusChild::Replica(replica_uri), local],
+        managed: true,
+        // pretend this nexus is from another volume so it won't be deleted..
+        owner: Some(fake_volume.uuid().clone()),
+        ..Default::default()
+    }
+    .request()
+    .await
+    .unwrap();
+
+    let nexuses = GetNexuses::default().request().await.unwrap().0;
+    tracing::info!("Nexuses: {:?}", nexuses);
+
+    let mut rpc_handle = cluster
+        .composer()
+        .grpc_handle(cluster.node(1).as_str())
+        .await
+        .unwrap();
+
+    let children_before_fault = volume_children(volume.uuid()).await;
+    tracing::info!("volume children: {:?}", children_before_fault);
+
+    let missing_child = child.uri.to_string();
+    rpc_handle
+        .mayastor
+        .remove_child_nexus(rpc::mayastor::RemoveChildNexusRequest {
+            uuid: bad_nexus.uuid.to_string(),
+            uri: missing_child.clone(),
+        })
+        .await
+        .unwrap();
+
+    tracing::debug!(
+        "Nexus: {:?}",
+        rpc_handle
+            .mayastor
+            .list_nexus(rpc::mayastor::Null {})
+            .await
+            .unwrap()
+    );
+
+    // There's no easy way to check for a negative here, just wait for 2 garbage reconcilers.
+    wait_till_volume_status_timeout(
+        &cluster,
+        volume.uuid(),
+        models::VolumeStatus::Faulted,
+        reconcile_period * 10,
+    )
+    .await
+    .expect_err("Should not get faulted!");
+}
```
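A side note on the test scaffolding: `wait_till_volume_status_timeout` returns a `Result` instead of panicking, which is what lets the test assert a negative with `expect_err`, i.e. that the volume never becomes `Faulted` within the window. A self-contained sketch of the same poll-with-timeout pattern (tokio assumed as the runtime, as in the tests; names illustrative):

```rust
use std::time::{Duration, Instant};

/// Poll `check` until it returns true or `timeout` elapses. Returning Err on
/// timeout (instead of panicking) lets callers assert a negative via `expect_err`.
async fn wait_until<F: Fn() -> bool>(check: F, timeout: Duration) -> Result<(), String> {
    let start = Instant::now();
    loop {
        if check() {
            return Ok(());
        }
        if start.elapsed() > timeout {
            return Err("timed out waiting for the condition".into());
        }
        tokio::time::sleep(Duration::from_millis(50)).await;
    }
}

#[tokio::main]
async fn main() {
    // Positive wait: the condition holds immediately.
    wait_until(|| true, Duration::from_millis(200)).await.unwrap();

    // Negative wait: assert the condition does NOT become true within the window.
    wait_until(|| false, Duration::from_millis(200))
        .await
        .expect_err("condition should never hold");

    println!("wait helper behaved as expected");
}
```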
