Skip to content

Commit e17d933

Browse files
mayastor-bors and Abhinandan-Purkait
committed
chore(bors): merge pull request #962
962: feat(encryption): restrict different devlink of same device for pool creation r=Abhinandan-Purkait a=Abhinandan-Purkait ## Issue Since we can't read the metadata of an encrypted pool, there isn't any definitive way in the data-plane to stop an encrypted pool from being recreated, under a different pool name, using a different devlink of the same device. ## Solution Compare the devlinks across all pools on the specified node; if any pool shares a devlink with the requested disk's devlink/devpath/devname, then reject the creation. ## Test Adds a test that validates the scenario by creating an LVM LV on a loop device and using its devlinks across multiple creations. Co-authored-by: Abhinandan Purkait <[email protected]>
2 parents b04c16b + de05a36 commit e17d933

File tree

25 files changed

+575
-51
lines changed

25 files changed

+575
-51
lines changed

Cargo.lock

Lines changed: 3 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

control-plane/agents/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,7 @@ tracing = "0.1.40"
5858
nix = { version = "0.29.0", default-features = false }
5959
prost-types = "0.13.3"
6060
url = "2.5.4"
61+
regex = "1.11.1"
6162

6263
grpc = { path = "../grpc" }
6364
shutdown = { path = "../../utils/shutdown" }

control-plane/agents/src/bin/core/controller/io_engine/client.rs

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -85,10 +85,6 @@ impl GrpcContext {
8585
self.lock.clone().lock_owned().await
8686
}
8787
/// Use this context to connect a grpc client and return it.
88-
pub(crate) async fn connect(&self) -> Result<GrpcClient, SvcError> {
89-
GrpcClient::new(self).await
90-
}
91-
/// Use this context to connect a grpc client and return it.
9288
/// This client differs from `GrpcClient` from `Self::connect` in that it ensures only 1 request
9389
/// is sent at a time among all `GrpcClientLocked` clients.
9490
pub(crate) async fn connect_locked(

control-plane/agents/src/bin/core/controller/registry.rs

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -114,6 +114,8 @@ pub(crate) struct RegistryInner<S: Store> {
114114
ha_disabled: bool,
115115
/// Etcd max page size.
116116
etcd_max_page_size: i64,
117+
/// Allow pool creation using non-persistent devlinks.
118+
allow_non_persistent_devlinks: bool,
117119
}
118120

119121
impl Registry {
@@ -140,6 +142,7 @@ impl Registry {
140142
ha_enabled: bool,
141143
etcd_max_page_size: i64,
142144
no_volume_health: bool,
145+
allow_non_persistent_devlinks: bool,
143146
) -> Result<Self, SvcError> {
144147
let store_endpoint = Self::format_store_endpoint(&store_url);
145148
tracing::info!("Connecting to persistent store at {}", store_endpoint);
@@ -199,6 +202,7 @@ impl Registry {
199202
thin_args,
200203
ha_disabled: ha_enabled,
201204
etcd_max_page_size,
205+
allow_non_persistent_devlinks,
202206
}),
203207
};
204208
registry.init().await?;
@@ -230,6 +234,11 @@ impl Registry {
230234
self.ha_disabled
231235
}
232236

237+
/// Check if pool creation using non-persistent devlink is allowed.
238+
pub(crate) fn allow_non_persistent_devlinks(&self) -> bool {
239+
self.allow_non_persistent_devlinks
240+
}
241+
233242
/// Check if the partial rebuilds are disabled.
234243
pub(crate) fn partial_rebuild_disabled(&self) -> bool {
235244
self.disable_partial_rebuild

control-plane/agents/src/bin/core/main.rs

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -142,6 +142,10 @@ pub(crate) struct CliArgs {
142142
/// Disable volume health reporting which uses pstor watchers.
143143
#[clap(long)]
144144
no_volume_health: bool,
145+
146+
/// Allow pools to be created using non persistent devlinks.
147+
#[clap(long)]
148+
allow_non_persistent_devlink: bool,
145149
}
146150
impl CliArgs {
147151
fn args() -> Self {
@@ -219,6 +223,7 @@ async fn server(cli_args: CliArgs) -> anyhow::Result<()> {
219223
cli_args.disable_ha,
220224
cli_args.etcd_page_limit as i64,
221225
cli_args.no_volume_health,
226+
cli_args.allow_non_persistent_devlink,
222227
)
223228
.await?;
224229

control-plane/agents/src/bin/core/node/service.rs

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ use stor_port::types::v0::transport::{
1313
Deregister, Filter, Node, NodeId, NodeState, NodeStatus, Register,
1414
};
1515

16+
use crate::controller::io_engine::HostApi;
1617
use crate::controller::wrapper::InternalOps;
1718
use grpc::{
1819
context::Context,
@@ -367,9 +368,7 @@ impl Service {
367368
) -> Result<BlockDevices, SvcError> {
368369
let node = self.registry.node_wrapper(&request.node).await?;
369370

370-
let grpc = node.read().await.grpc_context()?;
371-
let client = grpc.connect().await?;
372-
client.list_blockdevices(request).await
371+
node.list_blockdevices(request).await
373372
}
374373

375374
/// Cordon the specified node.

control-plane/agents/src/bin/core/node/wrapper.rs

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
use crate::controller::io_engine::HostApi;
12
use crate::{
23
controller::{
34
io_engine::{
@@ -15,8 +16,11 @@ use crate::{
1516
NumRebuilds,
1617
};
1718
use agents::{errors::SvcError, eventing::EventWithMeta};
19+
use async_trait::async_trait;
1820
use events_api::event::{EventAction, EventCategory, EventMessage, EventMeta, EventSource};
1921
use grpc::operations::snapshot::SnapshotInfo;
22+
use stor_port::transport_api::v0::BlockDevices;
23+
use stor_port::types::v0::transport::GetBlockDevices;
2024
use stor_port::{
2125
transport_api::{Message, MessageId, ResourceKind},
2226
types::v0::{
@@ -36,7 +40,6 @@ use stor_port::{
3640
},
3741
};
3842

39-
use async_trait::async_trait;
4043
use parking_lot::RwLock;
4144
use std::{future::Future, ops::DerefMut, sync::Arc};
4245
use tracing::{debug, trace, warn};
@@ -1754,3 +1757,16 @@ impl From<&NodeWrapper> for NodeState {
17541757
node.node_state().clone()
17551758
}
17561759
}
1760+
1761+
#[async_trait]
1762+
impl HostApi for Arc<tokio::sync::RwLock<NodeWrapper>> {
1763+
async fn liveness_probe(&self) -> Result<Register, SvcError> {
1764+
let dataplane = self.read().await.grpc_client().await?;
1765+
dataplane.liveness_probe().await
1766+
}
1767+
1768+
async fn list_blockdevices(&self, request: &GetBlockDevices) -> Result<BlockDevices, SvcError> {
1769+
let dataplane = self.read().await.grpc_client().await?;
1770+
dataplane.list_blockdevices(request).await
1771+
}
1772+
}

control-plane/agents/src/bin/core/pool/operations_helper.rs

Lines changed: 111 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
use crate::controller::io_engine::HostApi;
2+
use crate::node::wrapper::NodeWrapper;
13
use crate::{
24
controller::{
35
io_engine::PoolApi,
@@ -11,8 +13,8 @@ use crate::{
1113
node::wrapper::GetterOps,
1214
};
1315
use agents::{errors, errors::SvcError};
14-
use snafu::OptionExt;
15-
use std::ops::Deref;
16+
use stor_port::transport_api::ResourceKind;
17+
use stor_port::types::v0::transport::{GetBlockDevices, PoolDeviceUri};
1618
use stor_port::types::v0::{
1719
store::{
1820
pool::PoolSpec,
@@ -21,6 +23,14 @@ use stor_port::types::v0::{
2123
transport::{CreatePool, DestroyReplica, NodeId, ReplicaOwners},
2224
};
2325

26+
use itertools::Itertools;
27+
use regex::Regex;
28+
use snafu::OptionExt;
29+
use std::collections::HashSet;
30+
use std::ops::Deref;
31+
use std::sync::Arc;
32+
use tokio::sync::RwLock;
33+
2434
impl OperationGuardArc<PoolSpec> {
2535
/// Retries the creation of the pool which is being done in the background by the io-engine.
2636
/// This may happen if the pool create gRPC times out, for very large pools.
@@ -130,3 +140,102 @@ impl OperationGuardArc<ReplicaSpec> {
130140
}
131141
}
132142
}
143+
144+
pub(crate) async fn devlink_preflight_checks(
145+
request: &CreatePool,
146+
node: Arc<RwLock<NodeWrapper>>,
147+
registry: &Registry,
148+
) -> Result<(), SvcError> {
149+
let request_disks: HashSet<String> = request
150+
.disks
151+
.iter()
152+
.map(|disk| utils::disk::normalize_disk(disk.as_str()))
153+
.collect();
154+
155+
if !registry.allow_non_persistent_devlinks() {
156+
fn is_persistent_devlink(pattern: &str) -> Result<bool, SvcError> {
157+
let re = Regex::new(utils::DEVLINK_REGEX).map_err(|_| SvcError::InvalidArguments {})?;
158+
Ok(re.is_match(pattern))
159+
}
160+
161+
if request_disks
162+
.iter()
163+
// Only attempt to validate if it starts with "/dev".
164+
.filter(|disk| disk.starts_with("/dev"))
165+
.any(|disk| !is_persistent_devlink(disk).is_ok_and(|val| val))
166+
{
167+
return Err(SvcError::InvalidDevlink {});
168+
}
169+
}
170+
171+
let node_pools = registry
172+
.get_node_opt_pools(Some(request.node.clone()))
173+
.await?;
174+
175+
if !node_pools.is_empty() {
176+
let node_pools_disks: Vec<PoolDeviceUri> = node_pools
177+
.into_iter()
178+
.filter_map(|pool| pool.spec().map(|spec| spec.disks))
179+
.flatten()
180+
.collect();
181+
182+
let node_pools_disks_normalized: HashSet<String> = node_pools_disks
183+
.iter()
184+
.map(|disk| utils::disk::normalize_disk(disk.as_str()))
185+
.collect();
186+
187+
let common_disks: HashSet<_> = request_disks
188+
.intersection(&node_pools_disks_normalized)
189+
.cloned()
190+
.collect();
191+
192+
// Same devpaths or devlinks should be rejected.
193+
if !common_disks.is_empty() {
194+
return Err(SvcError::InUse {
195+
kind: ResourceKind::Block,
196+
id: common_disks.iter().join(","),
197+
});
198+
}
199+
200+
let node_block_devices = node
201+
.list_blockdevices(&GetBlockDevices {
202+
node: request.node.clone(),
203+
all: true,
204+
})
205+
.await?
206+
.into_inner();
207+
208+
let matched_devices: Vec<_> = node_block_devices
209+
.iter()
210+
.filter(|device| {
211+
request_disks.contains(&device.devname)
212+
|| request_disks.contains(&device.devpath)
213+
|| device
214+
.devlinks
215+
.iter()
216+
.any(|link| request_disks.contains(link))
217+
})
218+
.collect();
219+
220+
// If the requested disk was not found, that could be because it could be a malloc or a file,
221+
// in that case ignore and move ahead to allow tests. If it is an actual disk that was not
222+
// detected by blockdevice api then the disk might not be visible to io-engine, so let it fail from io-engine
223+
// rather than bailing out from control-plane to keep the behaviour as before.
224+
if !matched_devices.is_empty() {
225+
if let Some(conflict) = matched_devices.iter().find(|bd| {
226+
node_pools_disks_normalized.contains(&bd.devname)
227+
|| node_pools_disks_normalized.contains(&bd.devpath)
228+
|| bd
229+
.devlinks
230+
.iter()
231+
.any(|link| node_pools_disks_normalized.contains(link))
232+
}) {
233+
return Err(SvcError::InUse {
234+
kind: ResourceKind::Block,
235+
id: conflict.devname.clone(),
236+
});
237+
}
238+
}
239+
}
240+
Ok(())
241+
}

control-plane/agents/src/bin/core/pool/pool_operations.rs

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,8 @@ use crate::controller::{
77
OperationGuardArc,
88
},
99
};
10+
use crate::pool::operations_helper::devlink_preflight_checks;
1011
use agents::errors::{SvcError, SvcError::CordonedNode};
11-
use std::collections::HashMap;
1212
use stor_port::{
1313
transport_api::ResourceKind,
1414
types::v0::{
@@ -18,6 +18,8 @@ use stor_port::{
1818
};
1919
use utils::dsp_created_by_key;
2020

21+
use std::collections::HashMap;
22+
2123
#[async_trait::async_trait]
2224
impl ResourceLifecycle for OperationGuardArc<PoolSpec> {
2325
type Create = CreatePool;
@@ -35,13 +37,15 @@ impl ResourceLifecycle for OperationGuardArc<PoolSpec> {
3537
node_id: request.node.to_string(),
3638
});
3739
}
40+
3841
if request.disks.len() != 1 {
3942
return Err(SvcError::InvalidPoolDeviceNum {
4043
disks: request.disks.clone(),
4144
});
4245
}
4346

44-
if let Ok(pool) = registry.specs().pool(&request.id) {
47+
let pool_get_result = registry.specs().pool(&request.id);
48+
if let Ok(pool) = &pool_get_result {
4549
if pool.status.created() {
4650
return Err(SvcError::AlreadyExists {
4751
kind: ResourceKind::Pool,
@@ -58,6 +62,11 @@ impl ResourceLifecycle for OperationGuardArc<PoolSpec> {
5862
});
5963
}
6064

65+
if pool_get_result.is_err() {
66+
// If the devlink for a device is used for multiple pools, reject new creation.
67+
devlink_preflight_checks(request, node.clone(), registry).await?
68+
}
69+
6170
let mut pool = specs
6271
.get_or_create_pool(request)
6372
.operation_guard_wait()

0 commit comments

Comments
 (0)