Skip to content

Commit

Permalink
Merge #1436
Browse files Browse the repository at this point in the history
1436: fix(nexus): fixing child retire during rebuild r=dsavitskiy a=dsavitskiy



Co-authored-by: Dmitry Savitskiy <[email protected]>
  • Loading branch information
mayastor-bors and dsavitskiy committed Jul 6, 2023
2 parents 122e96f + 040e206 commit ce677f7
Show file tree
Hide file tree
Showing 18 changed files with 497 additions and 400 deletions.
45 changes: 17 additions & 28 deletions mayastor/src/bdev/nexus/nexus_bdev.rs
Original file line number Diff line number Diff line change
Expand Up @@ -479,9 +479,6 @@ pub struct Nexus<'n> {
pub state: parking_lot::Mutex<NexusState>,
/// The offset in blocks where the data partition starts.
pub(crate) data_ent_offset: u64,
/// the handle to be used when sharing the nexus, this allows for the bdev
/// to be shared with vbdevs on top
pub(crate) share_handle: Option<String>,
/// enum containing the protocol-specific target used to publish the nexus
pub nexus_target: Option<NexusTarget>,
/// Indicates if the Nexus has an I/O device.
Expand Down Expand Up @@ -588,7 +585,6 @@ impl<'n> Nexus<'n> {
state: parking_lot::Mutex::new(NexusState::Init),
bdev: None,
data_ent_offset: 0,
share_handle: None,
req_size: size,
nexus_target: None,
nvme_params,
Expand Down Expand Up @@ -716,6 +712,11 @@ impl<'n> Nexus<'n> {
unsafe { self.bdev().required_alignment() }
}

/// TODO
pub fn children_iter(&self) -> std::slice::Iter<NexusChild<'n>> {
self.children.iter()
}

/// Reconfigures the child event handler.
pub(crate) async fn reconfigure(&self, event: DrEvent) {
info!(
Expand Down Expand Up @@ -879,27 +880,15 @@ impl<'n> Nexus<'n> {
pub async fn destroy(mut self: Pin<&mut Self>) -> Result<(), Error> {
info!("Destroying nexus {}", self.name);

self.as_mut().destroy_shares().await?;
self.as_mut().unshare_nexus().await?;

// wait for all rebuild jobs to be cancelled before proceeding with the
// destruction of the nexus
for child in self.children.iter() {
self.cancel_child_rebuild_jobs(child.get_name()).await;
}

unsafe {
for child in self.as_mut().get_unchecked_mut().children.iter_mut() {
info!("Destroying child bdev {}", child.get_name());
if let Err(e) = child.close().await {
// TODO: should an error be returned here?
error!(
"Failed to close child {} with error {}",
child.get_name(),
e.verbose()
);
}
}
}
self.close_children().await;

// Persist the fact that the nexus destruction has completed.
self.persist(PersistOp::Shutdown).await;
Expand Down Expand Up @@ -1287,16 +1276,9 @@ impl<'n> BdevOps for Nexus<'n> {
Reactor::block_on(async move {
let self_ref = unsafe { &mut *self_ptr };

for child in self_ref.children.iter_mut() {
if child.state() == ChildState::Open {
if let Err(e) = child.close().await {
error!(
"{}: child {} failed to close with error {}",
self_ref.name,
child.get_name(),
e.verbose()
);
}
for child in self_ref.children_iter() {
if child.is_healthy() {
child.close().await.ok();
}
}

Expand Down Expand Up @@ -1473,6 +1455,13 @@ async fn nexus_create_internal(
children: &[String],
nexus_info_key: Option<String>,
) -> Result<(), Error> {
info!(
"Creating new nexus '{}' ({} child(ren): {:?})...",
name,
children.len(),
children
);

if let Some(nexus) = nexus_lookup_name_uuid(name, nexus_uuid) {
// FIXME: Instead of error, we return Ok without checking
// that the children match, which seems wrong.
Expand Down
85 changes: 31 additions & 54 deletions mayastor/src/bdev/nexus/nexus_bdev_children.rs
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@

use std::{cmp::min, pin::Pin};

use futures::future::join_all;
use snafu::ResultExt;

use super::{
Expand Down Expand Up @@ -299,17 +298,12 @@ impl<'n> Nexus<'n> {
Some(val) => val,
};

unsafe {
if let Err(e) = self.as_mut().get_unchecked_mut().children[idx]
.close()
.await
{
return Err(Error::CloseChild {
name: self.name.clone(),
child: self.children[idx].get_name().to_string(),
source: e,
});
}
if let Err(e) = self.children[idx].close().await {
return Err(Error::CloseChild {
name: self.name.clone(),
child: self.children[idx].get_name().to_string(),
source: e,
});
}

let child_state = self.children[idx].state();
Expand Down Expand Up @@ -460,18 +454,25 @@ impl<'n> Nexus<'n> {
}
}

/// Close each child that belongs to this nexus.
pub(crate) async fn close_children(mut self: Pin<&mut Self>) {
let futures = unsafe {
self.as_mut()
.get_unchecked_mut()
.children
.iter_mut()
.map(|c| c.close())
};
let results = join_all(futures).await;
if results.iter().any(|c| c.is_err()) {
error!("{}: Failed to close children", self.name);
/// Unconditionally closes all children of this nexus.
pub(crate) async fn close_children(&self) {
info!(
"{:?}: closing {n} children...",
self,
n = self.children.len()
);

let mut failed = 0;
for child in self.children_iter() {
if child.close().await.is_err() {
failed += 1;
}
}

if failed == 0 {
info!("{:?}: all children closed", self);
} else {
error!("{:?}: failed to close some of the children", self);
}
}

Expand Down Expand Up @@ -520,20 +521,9 @@ impl<'n> Nexus<'n> {
// completed yet so we fail the registration all together for now.
if failed {
// Close any children that WERE succesfully opened.
unsafe {
for child in
self.as_mut().get_unchecked_mut().children.iter_mut()
{
if child.state() == ChildState::Open {
if let Err(error) = child.close().await {
error!(
"{}: failed to close child {}: {}",
name,
child.name,
error.verbose()
);
}
}
for child in self.children_iter() {
if child.is_healthy() {
child.close().await.ok();
}
}

Expand Down Expand Up @@ -563,20 +553,7 @@ impl<'n> Nexus<'n> {
}

if let Err(error) = write_ex_err {
unsafe {
for child in
self.as_mut().get_unchecked_mut().children.iter_mut()
{
if let Err(error) = child.close().await {
error!(
"{}: child {} failed to close with error {}",
name,
child.name,
error.verbose()
);
}
}
}
self.close_children().await;
return Err(error);
}

Expand Down Expand Up @@ -609,9 +586,9 @@ impl<'n> Nexus<'n> {
}

/// TODO
pub async fn destroy_child(&self, name: &str) -> Result<(), Error> {
pub async fn close_child(&self, name: &str) -> Result<(), Error> {
if let Some(child) = self.lookup_child(name) {
child.destroy().await.map_err(|source| Error::DestroyChild {
child.close().await.map_err(|source| Error::DestroyChild {
source,
child: name.to_string(),
name: self.name.to_string(),
Expand Down
18 changes: 14 additions & 4 deletions mayastor/src/bdev/nexus/nexus_channel.rs
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,12 @@ pub(crate) fn fault_nexus_child(nexus: Pin<&mut Nexus>, name: &str) -> bool {
nexus
.children
.iter()
.filter(|c| c.state() == ChildState::Open)
.filter(|c| {
matches!(
c.state(),
ChildState::Open | ChildState::Faulted(Reason::OutOfSync)
)
})
.filter(|c| {
// If there were previous retires, we do not have a reference
// to a BlockDevice. We do however, know it can't be the device
Expand All @@ -65,11 +70,16 @@ pub(crate) fn fault_nexus_child(nexus: Pin<&mut Nexus>, name: &str) -> bool {
}
})
.any(|c| {
Ok(ChildState::Open)
Ok(ChildState::Faulted(Reason::OutOfSync))
== c.state.compare_exchange(
ChildState::Open,
ChildState::Faulted(Reason::IoError),
ChildState::Faulted(Reason::OutOfSync),
ChildState::Faulted(Reason::RebuildFailed),
)
|| Ok(ChildState::Open)
== c.state.compare_exchange(
ChildState::Open,
ChildState::Faulted(Reason::IoError),
)
})
}

Expand Down
Loading

0 comments on commit ce677f7

Please sign in to comment.