Skip to content

Commit 0d8ac9c

Browse files
mayastor-borsdsharma-dctiagolobocastro
committed
Merge #1775
1775: Backport fixes for release/2.7 r=tiagolobocastro a=tiagolobocastro test(bdd): make nvme controller usage more robust Caters for when the device is /dev/nvmeX but X not the same as the controller! Signed-off-by: Tiago Castro <[email protected]> --- Merge #1755 1755: Reuse Rebuild IO handles r=tiagolobocastro a=tiagolobocastro fix(rebuild): reuse rebuild IO handles Reuses the rebuild IO handles, rather than attempting to allocate them per rebuild task. The main issue with handle allocation on the fly is that the target may have not cleaned up a previous IO qpair connection, and so the connect may fail. We started seeing this more on CI because we forgot to cherry-pick a commit increasing the retry delay. However, after inspecting a bunch of user support bundles I see that we still have occasional connect errors. Rather than increasing the timeout, we attempt here to reuse the handles, thus avoid the problem almost entirely. Signed-off-by: Tiago Castro <[email protected]> --- refactor(rebuild): rebuild completion is not an error When the rebuild has been complete, if we wait for it this fails because the channels are not longer available. Instead, simply return the rebuild state, since this is what we want anyway. Signed-off-by: Tiago Castro <[email protected]> Co-authored-by: Tiago Castro <[email protected]> --- fix: check valid sct and sc combinations for pi error Signed-off-by: Diwakar Sharma <[email protected]> --- fix: use auto-detected sector size for blockdev This fixes the behaviour where we pass 512 as sector size if the disk uri doesn't contain blk_size parameter. This causes pool creation failure if the underlying disk has a different sector size e.g. 4096. Instead of passing 512, we now pass 0 which lets spdk detect the device's sector size and use that value. Signed-off-by: Diwakar Sharma <[email protected]> Co-authored-by: Diwakar Sharma <[email protected]> Co-authored-by: mayastor-bors <[email protected]> Co-authored-by: Tiago Castro <[email protected]>
2 parents 40572c9 + ece9dd0 commit 0d8ac9c

File tree

18 files changed

+321
-79
lines changed

18 files changed

+321
-79
lines changed

.gitmodules

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[submodule "spdk-rs"]
22
path = spdk-rs
3-
url = https://github.com/openebs/spdk-rs
3+
url = ../spdk-rs.git
44
branch = release/2.7
55
[submodule "utils/dependencies"]
66
path = utils/dependencies

Cargo.lock

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

io-engine-tests/src/lib.rs

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -188,6 +188,28 @@ pub fn truncate_file_bytes(path: &str, size: u64) {
188188
assert!(output.status.success());
189189
}
190190

191+
/// Automatically assign a loopdev to path
192+
pub fn setup_loopdev_file(path: &str, sector_size: Option<u64>) -> String {
193+
let log_sec = sector_size.unwrap_or(512);
194+
195+
let output = Command::new("losetup")
196+
.args(["-f", "--show", "-b", &format!("{log_sec}"), path])
197+
.output()
198+
.expect("failed exec losetup");
199+
assert!(output.status.success());
200+
// return the assigned loop device
201+
String::from_utf8(output.stdout).unwrap().trim().to_string()
202+
}
203+
204+
/// Detach the provided loop device.
205+
pub fn detach_loopdev(dev: &str) {
206+
let output = Command::new("losetup")
207+
.args(["-d", dev])
208+
.output()
209+
.expect("failed exec losetup");
210+
assert!(output.status.success());
211+
}
212+
191213
pub fn fscheck(device: &str) {
192214
let output = Command::new("fsck")
193215
.args([device, "-n"])

io-engine/Cargo.toml

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,7 @@ libc = "0.2.149"
7070
log = "0.4.20"
7171
md5 = "0.7.0"
7272
merge = "0.1.0"
73-
nix = { version = "0.27.1", default-features = false, features = [ "hostname", "net", "socket", "ioctl" ] }
73+
nix = { version = "0.27.1", default-features = false, features = ["hostname", "net", "socket", "ioctl"] }
7474
once_cell = "1.18.0"
7575
parking_lot = "0.12.1"
7676
pin-utils = "0.1.0"
@@ -98,9 +98,10 @@ async-process = { version = "1.8.1" }
9898
rstack = { version = "0.3.3" }
9999
tokio-stream = "0.1.14"
100100
rustls = "0.21.12"
101+
either = "1.9.0"
101102

102103
devinfo = { path = "../utils/dependencies/devinfo" }
103-
jsonrpc = { path = "../jsonrpc"}
104+
jsonrpc = { path = "../jsonrpc" }
104105
io-engine-api = { path = "../utils/dependencies/apis/io-engine" }
105106
spdk-rs = { path = "../spdk-rs" }
106107
sysfs = { path = "../sysfs" }

io-engine/src/bdev/aio.rs

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ use std::{
33
convert::TryFrom,
44
ffi::CString,
55
fmt::{Debug, Formatter},
6+
os::unix::fs::FileTypeExt,
67
};
78

89
use async_trait::async_trait;
@@ -29,7 +30,7 @@ pub(super) struct Aio {
2930

3031
impl Debug for Aio {
3132
fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
32-
write!(f, "Aio '{}'", self.name)
33+
write!(f, "Aio '{}', 'blk_size: {}'", self.name, self.blk_size)
3334
}
3435
}
3536

@@ -47,6 +48,10 @@ impl TryFrom<&Url> for Aio {
4748
});
4849
}
4950

51+
let path_is_blockdev = std::fs::metadata(url.path())
52+
.ok()
53+
.map_or(false, |meta| meta.file_type().is_block_device());
54+
5055
let mut parameters: HashMap<String, String> =
5156
url.query_pairs().into_owned().collect();
5257

@@ -58,9 +63,14 @@ impl TryFrom<&Url> for Aio {
5863
value: value.clone(),
5964
})?
6065
}
61-
None => 512,
66+
None => {
67+
if path_is_blockdev {
68+
0
69+
} else {
70+
512
71+
}
72+
}
6273
};
63-
6474
let uuid = uri::uuid(parameters.remove("uuid")).context(
6575
bdev_api::UuidParamParseFailed {
6676
uri: url.to_string(),

io-engine/src/bdev/nexus/nexus_bdev_children.rs

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1152,10 +1152,11 @@ impl<'n> Nexus<'n> {
11521152
// Cancel rebuild job for this child, if any.
11531153
if let Some(job) = child.rebuild_job() {
11541154
debug!("{self:?}: retire: stopping rebuild job...");
1155-
let terminated = job.force_fail();
1156-
Reactors::master().send_future(async move {
1157-
terminated.await.ok();
1158-
});
1155+
if let either::Either::Left(terminated) = job.force_fail() {
1156+
Reactors::master().send_future(async move {
1157+
terminated.await.ok();
1158+
});
1159+
}
11591160
}
11601161

11611162
debug!("{child:?}: retire: enqueuing device '{dev}' to retire");

io-engine/src/bdev/nexus/nexus_bdev_rebuild.rs

Lines changed: 14 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -247,15 +247,18 @@ impl<'n> Nexus<'n> {
247247
async fn terminate_rebuild(&self, child_uri: &str) {
248248
// If a rebuild job is not found that's ok
249249
// as we were just going to remove it anyway.
250-
if let Ok(rj) = self.rebuild_job_mut(child_uri) {
251-
let ch = rj.force_stop();
252-
if let Err(e) = ch.await {
253-
error!(
254-
"Failed to wait on rebuild job for child {child_uri} \
250+
let Ok(rj) = self.rebuild_job_mut(child_uri) else {
251+
return;
252+
};
253+
let either::Either::Left(ch) = rj.force_stop() else {
254+
return;
255+
};
256+
if let Err(e) = ch.await {
257+
error!(
258+
"Failed to wait on rebuild job for child {child_uri} \
255259
to terminate with error {}",
256-
e.verbose()
257-
);
258-
}
260+
e.verbose()
261+
);
259262
}
260263
}
261264

@@ -355,6 +358,9 @@ impl<'n> Nexus<'n> {
355358

356359
// wait for the jobs to complete terminating
357360
for job in terminated_jobs {
361+
let either::Either::Left(job) = job else {
362+
continue;
363+
};
358364
if let Err(e) = job.await {
359365
error!(
360366
"{:?}: error when waiting for the rebuild job \

io-engine/src/bdev/nvmx/utils.rs

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -40,8 +40,9 @@ pub enum NvmeAerInfoNvmCommandSet {
4040

4141
/// Check if the Completion Queue Entry indicates abnormal termination of
4242
/// request due to any of the following conditions:
43-
/// - Any media specific errors that occur in the NVM or data integrity type
44-
/// errors.
43+
/// - An Status Code Type(SCT) of media specific errors that occur in the NVM
44+
/// or data integrity type errors, AND a Status Code(SC) value pertaining to
45+
/// one of the below:
4546
/// - The command was aborted due to an end-to-end guard check failure.
4647
/// - The command was aborted due to an end-to-end application tag check
4748
/// failure.
@@ -59,9 +60,9 @@ pub(crate) fn nvme_cpl_is_pi_error(cpl: *const spdk_nvme_cpl) -> bool {
5960
}
6061

6162
sct == NvmeStatusCodeType::MediaError as u16
62-
|| sc == NvmeMediaErrorStatusCode::Guard as u16
63-
|| sc == NvmeMediaErrorStatusCode::ApplicationTag as u16
64-
|| sc == NvmeMediaErrorStatusCode::ReferenceTag as u16
63+
&& (sc == NvmeMediaErrorStatusCode::Guard as u16
64+
|| sc == NvmeMediaErrorStatusCode::ApplicationTag as u16
65+
|| sc == NvmeMediaErrorStatusCode::ReferenceTag as u16)
6566
}
6667

6768
#[inline]

io-engine/src/bdev/uring.rs

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,9 @@
1-
use std::{collections::HashMap, convert::TryFrom, ffi::CString};
1+
use std::{
2+
collections::HashMap,
3+
convert::TryFrom,
4+
ffi::CString,
5+
os::unix::fs::FileTypeExt,
6+
};
27

38
use async_trait::async_trait;
49
use futures::channel::oneshot;
@@ -36,6 +41,10 @@ impl TryFrom<&Url> for Uring {
3641
});
3742
}
3843

44+
let path_is_blockdev = std::fs::metadata(url.path())
45+
.ok()
46+
.map_or(false, |meta| meta.file_type().is_block_device());
47+
3948
let mut parameters: HashMap<String, String> =
4049
url.query_pairs().into_owned().collect();
4150

@@ -47,7 +56,13 @@ impl TryFrom<&Url> for Uring {
4756
value: value.clone(),
4857
})?
4958
}
50-
None => 512,
59+
None => {
60+
if path_is_blockdev {
61+
0
62+
} else {
63+
512
64+
}
65+
}
5166
};
5267

5368
let uuid = uri::uuid(parameters.remove("uuid")).context(

io-engine/src/grpc/v1/snapshot_rebuild.rs

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -110,7 +110,10 @@ impl SnapshotRebuildRpc for SnapshotRebuildService {
110110
let Ok(job) = SnapshotRebuildJob::lookup(&args.uuid) else {
111111
return Err(tonic::Status::not_found(""));
112112
};
113-
let rx = job.force_stop().await.ok();
113+
let rx = match job.force_stop() {
114+
either::Either::Left(chan) => chan.await,
115+
either::Either::Right(stopped) => Ok(stopped),
116+
};
114117
info!("Snapshot Rebuild stopped: {rx:?}");
115118
job.destroy();
116119
Ok(())

0 commit comments

Comments
 (0)