Skip to content

Commit d13405e

Browse files
mayastor-bors and yugchaudhari committed
chore(bors): merge pull request #1103
1103: feat(core): add OfflineRebuildReconciler skeleton (detection-only) r=Abhinandan-Purkait a=yugchaudhari ## Description Starting on the offline volume rebuild (OEP 4208). This first PR adds the detection part: an `OfflineRebuildReconciler` that finds unpublished volumes in the Degraded state and logs them. No rebuild happens yet. Everything is behind `--offline-rebuild-enabled` (off by default), so this is a no-op unless you flip the flag. ## Motivation and Context Unpublished volumes with degraded replicas currently stay degraded forever; the existing HotSpareReconciler only kicks in for published volumes (it gates on `target.is_some()`). This means that if a node goes down while a volume is unpublished, its replicas stay faulted until someone manually publishes and repairs the volume. OEP: https://github.com/openebs/openebs/blob/develop/designs/replicated-pv/mayastor/offline-volume-rebuild.md Tracking: openebs/openebs#4208 Refs: openebs/mayastor#1977 ## Testing No BDD tests in this PR: there's no observable behavior to assert on yet (just a log line). Iteration 2 will add the actual rebuild, and its tests will cover this detection path as a prerequisite. Verified locally: - `cargo check` passes in nix-shell - Existing tests unaffected (feature is gated, reconciler is additive) ## Backward Compatibility Fully backward compatible. The flag defaults to off, so existing deployments see zero change in behavior. Co-authored-by: yugchaudhari <yug.chaudhari@kluisz.ai>
2 parents 0c2e5cc + 4bca4e6 commit d13405e

4 files changed

Lines changed: 91 additions & 1 deletion

File tree

control-plane/agents/src/bin/core/controller/reconciler/volume/mod.rs

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,13 @@
11
mod garbage_collector;
22
mod hot_spare;
33
mod nexus;
4+
mod offline_rebuild;
45

56
use crate::controller::task_poller::{PollContext, PollPeriods, PollResult, PollTimer, TaskPoller};
67

78
use crate::controller::reconciler::volume::{
89
garbage_collector::GarbageCollector, hot_spare::HotSpareReconciler,
9-
nexus::VolumeNexusReconciler,
10+
nexus::VolumeNexusReconciler, offline_rebuild::OfflineRebuildReconciler,
1011
};
1112

1213
/// Volume Reconciler loop which:
@@ -26,6 +27,7 @@ impl VolumeReconciler {
2627
Box::new(HotSpareReconciler::new()),
2728
Box::new(GarbageCollector::new()),
2829
Box::new(VolumeNexusReconciler::new()),
30+
Box::new(OfflineRebuildReconciler::new()),
2931
],
3032
}
3133
}
Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
use crate::controller::{
2+
reconciler::{PollContext, TaskPoller},
3+
resources::{operations_helper::OperationSequenceGuard, ResourceMutex},
4+
task_poller::{PollResult, PollerState},
5+
};
6+
7+
use stor_port::types::v0::{store::volume::VolumeSpec, transport::VolumeStatus};
8+
9+
/// Offline Volume Rebuild reconciler.
10+
///
11+
/// Detects unpublished volumes whose replicas have become unhealthy and would
12+
/// benefit from a rebuild. Iteration 1 only logs eligible candidates; the
13+
/// actual rebuild action is added in a follow-up iteration.
14+
///
15+
/// See `designs/replicated-pv/mayastor/offline-volume-rebuild.md` for the
16+
/// full design.
17+
#[derive(Debug)]
18+
pub(super) struct OfflineRebuildReconciler {}
19+
20+
impl OfflineRebuildReconciler {
21+
pub(super) fn new() -> Self {
22+
Self {}
23+
}
24+
}
25+
26+
#[async_trait::async_trait]
27+
impl TaskPoller for OfflineRebuildReconciler {
28+
async fn poll(&mut self, context: &PollContext) -> PollResult {
29+
if !context.registry().offline_rebuild_enabled() {
30+
return PollResult::Ok(PollerState::Idle);
31+
}
32+
33+
let mut results = vec![];
34+
let volumes = context.specs().volumes_rsc();
35+
for mut volume in volumes {
36+
results.push(offline_rebuild_reconcile(&mut volume, context).await);
37+
}
38+
Self::squash_results(results)
39+
}
40+
}
41+
42+
#[tracing::instrument(
43+
level = "debug",
44+
skip(context, volume_spec),
45+
fields(volume.uuid = %volume_spec.uuid(), request.reconcile = true)
46+
)]
47+
async fn offline_rebuild_reconcile(
48+
volume_spec: &mut ResourceMutex<VolumeSpec>,
49+
context: &PollContext,
50+
) -> PollResult {
51+
let volume = match volume_spec.operation_guard() {
52+
Ok(guard) => guard,
53+
Err(_) => return PollResult::Ok(PollerState::Busy),
54+
};
55+
56+
if !volume.as_ref().policy.self_heal
57+
|| !volume.as_ref().status.created()
58+
|| volume.as_ref().target().is_some()
59+
{
60+
return PollResult::Ok(PollerState::Idle);
61+
}
62+
63+
let volume_state = context.registry().volume_state(volume.uuid()).await?;
64+
if volume_state.status == VolumeStatus::Degraded {
65+
tracing::debug!(
66+
volume.uuid = %volume.uuid(),
67+
"Unpublished volume is Degraded; eligible for offline rebuild"
68+
);
69+
}
70+
71+
PollResult::Ok(PollerState::Idle)
72+
}

control-plane/agents/src/bin/core/controller/registry.rs

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -125,6 +125,9 @@ pub(crate) struct RegistryInner<S: Store> {
125125
deprecated_access_mode: bool,
126126
/// Running in simulation mode.
127127
sim_args: Option<SimArgs>,
128+
/// Enable the OfflineRebuildReconciler. Off by default until the
129+
/// feature is fully tested and ready for general use.
130+
offline_rebuild_enabled: bool,
128131
}
129132

130133
impl Registry {
@@ -156,6 +159,7 @@ impl Registry {
156159
pool_cluster_size: Option<u32>,
157160
deprecated_access_mode: bool,
158161
sim_args: Option<SimArgs>,
162+
offline_rebuild_enabled: bool,
159163
) -> Result<Self, SvcError> {
160164
let store_endpoint = Self::format_store_endpoint(&store_url);
161165
tracing::info!("Connecting to persistent store at {}", store_endpoint);
@@ -220,6 +224,7 @@ impl Registry {
220224
pool_cluster_size: pool_cluster_size.or(Some(POOL_BS_CLUSTER_SIZE_DEFAULT)),
221225
deprecated_access_mode,
222226
sim_args,
227+
offline_rebuild_enabled,
223228
}),
224229
};
225230
registry.init().await?;
@@ -355,6 +360,10 @@ impl Registry {
355360
pub(crate) fn faulted_child_wait_period(&self) -> Option<std::time::Duration> {
356361
self.faulted_child_wait_period
357362
}
363+
/// Whether the OfflineRebuildReconciler is enabled.
364+
pub(crate) fn offline_rebuild_enabled(&self) -> bool {
365+
self.offline_rebuild_enabled
366+
}
358367
/// Allow for this given time before assuming failure and allowing the pool to get deleted.
359368
pub(crate) fn pool_async_creat_tmo(&self) -> std::time::Duration {
360369
self.pool_async_creat_tmo

control-plane/agents/src/bin/core/main.rs

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,12 @@ pub(crate) struct CliArgs {
4747
#[clap(long)]
4848
pub(crate) faulted_child_wait_period: Option<humantime::Duration>,
4949

50+
/// Enable the OfflineRebuildReconciler, which detects unpublished
51+
/// volumes with degraded replicas and logs them as rebuild candidates.
52+
/// Disabled by default; rebuild action is not yet implemented.
53+
#[clap(long, env = "OFFLINE_REBUILD_ENABLED")]
54+
pub(crate) offline_rebuild_enabled: bool,
55+
5056
/// When the pool creation gRPC times out, the actual call in the io-engine
5157
/// may still progress.
5258
/// We wait up to this period before considering the operation a failure and
@@ -288,6 +294,7 @@ async fn server(cli_args: CliArgs) -> anyhow::Result<()> {
288294
cli_args.pool_cluster_size,
289295
cli_args.deprecated_access_mode,
290296
sim_args,
297+
cli_args.offline_rebuild_enabled,
291298
)
292299
.await?;
293300

0 commit comments

Comments
 (0)