Skip to content

Commit fb60ebc

Browse files
fix(nvmx/retire): disconnect failed controllers
When we are pausing the nexus, all IO must get flushed before the subsystem pausing completes. If we can't flush the IO then pausing is stuck forever... The issue we have seen is that when IO's are stuck there's nothing which can fail them and allow pause to complete. One way this can happen is when the controller is failed as it seems in this case the io queues are not getting polled. A first fix that can be done is to piggy back on the adminq polling failure and use this to drive the removal of the failed child devices from the nexus per-core channels. A better approach might be needed in the future to be able to timeout the IOs even when no completions are processed in a given I/O qpair. Signed-off-by: Tiago Castro <tiagolobocastro@gmail.com>
1 parent 02526c6 commit fb60ebc

File tree

7 files changed

+110
-16
lines changed

7 files changed

+110
-16
lines changed

io-engine-tests/src/compose/mod.rs

+43-7
Original file line numberDiff line numberDiff line change
@@ -5,13 +5,21 @@ use std::future::Future;
55
use tokio::sync::oneshot::channel;
66

77
use crate::mayastor_test_init_ex;
8-
use io_engine::core::{
9-
mayastor_env_stop,
10-
MayastorCliArgs,
11-
MayastorEnvironment,
12-
Reactor,
13-
Reactors,
14-
GLOBAL_RC,
8+
use io_engine::{
9+
core::{
10+
device_monitor_loop,
11+
mayastor_env_stop,
12+
runtime,
13+
MayastorCliArgs,
14+
MayastorEnvironment,
15+
ProtectedSubsystems,
16+
Reactor,
17+
Reactors,
18+
ResourceLockManager,
19+
ResourceLockManagerConfig,
20+
GLOBAL_RC,
21+
},
22+
grpc,
1523
};
1624
use std::time::Duration;
1725

@@ -99,6 +107,34 @@ impl<'a> MayastorTest<'a> {
99107
tokio::time::sleep(Duration::from_millis(500)).await;
100108
}
101109
}
110+
111+
/// Starts the device monitor loop which is required to fully
112+
/// remove devices when they are not in use.
113+
pub fn start_device_monitor(&self) {
114+
runtime::spawn(device_monitor_loop());
115+
}
116+
117+
/// Start the gRPC server which can be useful to debug tests.
118+
pub fn start_grpc(&self) {
119+
let cfg = ResourceLockManagerConfig::default()
120+
.with_subsystem(ProtectedSubsystems::POOL, 32)
121+
.with_subsystem(ProtectedSubsystems::NEXUS, 512)
122+
.with_subsystem(ProtectedSubsystems::REPLICA, 1024);
123+
ResourceLockManager::initialize(cfg);
124+
125+
let env = MayastorEnvironment::global_or_default();
126+
runtime::spawn(async {
127+
grpc::MayastorGrpcServer::run(
128+
&env.node_name,
129+
&env.node_nqn,
130+
env.grpc_endpoint.unwrap(),
131+
env.rpc_addr,
132+
env.api_versions,
133+
)
134+
.await
135+
.ok();
136+
});
137+
}
102138
}
103139

104140
impl<'a> Drop for MayastorTest<'a> {

io-engine/src/bdev/nexus/nexus_bdev_children.rs

+32-2
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@ use super::{
4141
Nexus,
4242
NexusChild,
4343
NexusOperation,
44+
NexusPauseState,
4445
NexusState,
4546
NexusStatus,
4647
PersistOp,
@@ -787,6 +788,13 @@ impl<'n> DeviceEventListener for Nexus<'n> {
787788
false,
788789
);
789790
}
791+
DeviceEventType::AdminQNoticeCtrlFailed => {
792+
Reactors::master().send_future(Nexus::disconnect_failed_child(
793+
self.name.clone(),
794+
dev_name.to_owned(),
795+
));
796+
}
797+
790798
_ => {
791799
warn!(
792800
"{:?}: ignoring event '{:?}' for device '{}'",
@@ -917,6 +925,28 @@ impl<'n> Nexus<'n> {
917925
}
918926
}
919927

928+
/// Disconnect a failed child from the given nexus.
929+
async fn disconnect_failed_child(nexus_name: String, dev: String) {
930+
let Some(nex) = nexus_lookup_mut(&nexus_name) else {
931+
warn!(
932+
"Nexus '{nexus_name}': retiring failed device '{dev}': \
933+
nexus already gone"
934+
);
935+
return;
936+
};
937+
938+
info!("Nexus '{nexus_name}': disconnect handlers for controller failed device: '{dev}'");
939+
940+
if nex.io_subsystem_state() == Some(NexusPauseState::Pausing) {
941+
nex.traverse_io_channels_async((), |channel, _| {
942+
channel.disconnect_detached_devices(|h| {
943+
h.get_device().device_name() == dev && h.is_ctrlr_failed()
944+
});
945+
})
946+
.await;
947+
}
948+
}
949+
920950
/// Retires a child device for the given nexus.
921951
async fn child_retire_routine(
922952
nexus_name: String,
@@ -981,12 +1011,12 @@ impl<'n> Nexus<'n> {
9811011
// channels, and all I/Os failing due to this device will eventually
9821012
// resubmit and succeeded (if any healthy children are left).
9831013
//
984-
// Device disconnection is done in two steps (detach, than disconnect)
1014+
// Device disconnection is done in two steps (detach, then disconnect)
9851015
// in order to prevent an I/O race when retiring a device.
9861016
self.detach_device(&dev).await;
9871017

9881018
// Disconnect the devices with failed controllers _before_ pause,
989-
// otherwise pause would stuck. Keep all controoled that are _not_
1019+
// otherwise pause would get stuck. Keep all controllers which are _not_
9901020
// failed (e.g., in the case I/O failed due to ENOSPC).
9911021
self.traverse_io_channels_async((), |channel, _| {
9921022
channel.disconnect_detached_devices(|h| h.is_ctrlr_failed());

io-engine/src/bdev/nexus/nexus_channel.rs

+2-1
Original file line numberDiff line numberDiff line change
@@ -32,13 +32,14 @@ impl<'n> Debug for NexusChannel<'n> {
3232
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
3333
write!(
3434
f,
35-
"{io} chan '{nex}' core:{core}({cur}) [R:{r} W:{w} L:{l} C:{c}]",
35+
"{io} chan '{nex}' core:{core}({cur}) [R:{r} W:{w} D:{d} L:{l} C:{c}]",
3636
io = if self.is_io_chan { "I/O" } else { "Aux" },
3737
nex = self.nexus.nexus_name(),
3838
core = self.core,
3939
cur = Cores::current(),
4040
r = self.readers.len(),
4141
w = self.writers.len(),
42+
d = self.detached.len(),
4243
l = self.io_logs.len(),
4344
c = self.nexus.child_count(),
4445
)

io-engine/src/bdev/nvmx/controller.rs

+9-3
Original file line numberDiff line numberDiff line change
@@ -844,13 +844,14 @@ pub extern "C" fn nvme_poll_adminq(ctx: *mut c_void) -> i32 {
844844
if result < 0 {
845845
if context.start_device_destroy() {
846846
error!(
847-
"process adminq: {}: {}",
847+
"process adminq: {}: ctrl failed: {}, error: {}",
848848
context.name,
849+
context.is_failed(),
849850
Errno::from_i32(result.abs())
850851
);
851852
info!("dispatching nexus fault and retire: {}", context.name);
852-
let dev_name = context.name.to_string();
853-
let carc = NVME_CONTROLLERS.lookup_by_name(&dev_name).unwrap();
853+
let dev_name = context.name.as_str();
854+
let carc = NVME_CONTROLLERS.lookup_by_name(dev_name).unwrap();
854855
debug!(
855856
?dev_name,
856857
"notifying listeners of admin command completion failure"
@@ -864,6 +865,11 @@ pub extern "C" fn nvme_poll_adminq(ctx: *mut c_void) -> i32 {
864865
?num_listeners,
865866
"listeners notified of admin command completion failure"
866867
);
868+
} else if context.report_failed() {
869+
if let Some(carc) = NVME_CONTROLLERS.lookup_by_name(&context.name) {
870+
carc.lock()
871+
.notify_listeners(DeviceEventType::AdminQNoticeCtrlFailed);
872+
}
867873
}
868874
return 1;
869875
}

io-engine/src/bdev/nvmx/controller_inner.rs

+15
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,7 @@ pub(crate) struct TimeoutConfig {
7474
reset_attempts: u32,
7575
next_reset_time: Instant,
7676
destroy_in_progress: AtomicCell<bool>,
77+
report_failed: AtomicCell<bool>,
7778
}
7879

7980
impl Drop for TimeoutConfig {
@@ -94,6 +95,7 @@ impl TimeoutConfig {
9495
reset_attempts: MAX_RESET_ATTEMPTS,
9596
next_reset_time: Instant::now(),
9697
destroy_in_progress: AtomicCell::new(false),
98+
report_failed: AtomicCell::new(true),
9799
}
98100
}
99101

@@ -116,6 +118,19 @@ impl TimeoutConfig {
116118
}
117119
}
118120

121+
/// Check if the SPDK's nvme controller is failed.
122+
pub fn is_failed(&self) -> bool {
123+
self.ctrlr.is_failed
124+
}
125+
/// Check if we need to report the controller failure.
126+
/// We only report this failure once.
127+
pub fn report_failed(&mut self) -> bool {
128+
if !self.is_failed() {
129+
return false;
130+
}
131+
self.report_failed.compare_exchange(true, false).is_ok()
132+
}
133+
119134
fn reset_cb(success: bool, ctx: *mut c_void) {
120135
let timeout_ctx = TimeoutConfig::from_ptr(ctx as *mut TimeoutConfig);
121136

io-engine/src/core/device_events.rs

+7-1
Original file line numberDiff line numberDiff line change
@@ -19,8 +19,14 @@ pub enum DeviceEventType {
1919
DeviceResized,
2020
/// TODO
2121
MediaManagement,
22-
/// TODO
22+
/// Sent when admin q polling fails for the first time.
2323
AdminCommandCompletionFailed,
24+
/// When the adminq poll fails the first time, the controller may not yet
25+
/// be failed.
26+
/// Next time the admin q poll fails, if the controller is noticed as
27+
/// failed for the first time, this event is sent, allowing further
28+
/// clean up to be performed.
29+
AdminQNoticeCtrlFailed,
2430
}
2531

2632
/// TODO

io-engine/src/core/env.rs

+2-2
Original file line numberDiff line numberDiff line change
@@ -381,7 +381,7 @@ type Result<T, E = EnvError> = std::result::Result<T, E>;
381381
#[allow(dead_code)]
382382
pub struct MayastorEnvironment {
383383
pub node_name: String,
384-
node_nqn: Option<String>,
384+
pub node_nqn: Option<String>,
385385
pub grpc_endpoint: Option<std::net::SocketAddr>,
386386
pub registration_endpoint: Option<Uri>,
387387
ps_endpoint: Option<String>,
@@ -420,7 +420,7 @@ pub struct MayastorEnvironment {
420420
nvmf_tgt_interface: Option<String>,
421421
/// NVMF target Command Retry Delay in x100 ms.
422422
pub nvmf_tgt_crdt: [u16; TARGET_CRDT_LEN],
423-
api_versions: Vec<ApiVersion>,
423+
pub api_versions: Vec<ApiVersion>,
424424
skip_sig_handler: bool,
425425
enable_io_all_thrd_nexus_channels: bool,
426426
developer_delay: bool,

0 commit comments

Comments
 (0)