Hi Greg,
We hit a panic in 4.14.43 with md raid1.
It's easy to reproduce with a test case, running Fio, and doing mdadm --grow bitmap=none /dev/mdx && mdadm --grow bitmap=internal /dev/mdx in a loop.
With following patches applied, we can no longer reproduce the problem.
Please consider to apply the patches to 4.14, they can be applied cleanly on 4.14.52.
Cheers, Jack
NeilBrown (6): md: always hold reconfig_mutex when calling mddev_suspend() md: don't call bitmap_create() while array is quiesced. md: move suspend_hi/lo handling into core md code md: use mddev_suspend/resume instead of ->quiesce() md: allow metadata update while suspending. md: remove special meaning of ->quiesce(.., 2)
drivers/md/dm-raid.c | 10 ++++-- drivers/md/md-cluster.c | 6 ++-- drivers/md/md.c | 90 ++++++++++++++++++++++++++++++------------------ drivers/md/md.h | 15 +++++--- drivers/md/raid0.c | 2 +- drivers/md/raid1.c | 27 +++++---------- drivers/md/raid10.c | 10 ++---- drivers/md/raid5-cache.c | 30 ++++++++++------ drivers/md/raid5-log.h | 2 +- drivers/md/raid5.c | 40 ++++----------------- 10 files changed, 116 insertions(+), 116 deletions(-)
From: NeilBrown neilb@suse.com
commit 4d5324f760aacaefeb721b172aa14bf66045c332 upstream
Most often mddev_suspend() is called with reconfig_mutex held. Make this a requirement in preparation a subsequent patch. Also require reconfig_mutex to be held for mddev_resume(), partly for symmetry and partly to guarantee no races with incr/decr of mddev->suspend.
Taking the mutex in r5c_disable_writeback_async() is a little tricky as this is called from a work queue via log->disable_writeback_work, and flush_work() is called on that while holding ->reconfig_mutex. If the work item hasn't run before flush_work() is called, the work function will not be able to get the mutex.
So we use mddev_trylock() inside the wait_event() call, and have that abort when conf->log is set to NULL, which happens before flush_work() is called. We wait in mddev->sb_wait and ensure this is woken when any of the conditions change. This requires waking mddev->sb_wait in mddev_unlock(). This is only like to trigger extra wake_ups of threads that needn't be woken when metadata is being written, and that doesn't happen often enough that the cost would be noticeable.
Signed-off-by: NeilBrown neilb@suse.com Signed-off-by: Shaohua Li shli@fb.com [jwang: cherry-pick to 4.14] Signed-off-by: Jack Wang jinpu.wang@profitbricks.com --- drivers/md/dm-raid.c | 10 ++++++++-- drivers/md/md.c | 3 +++ drivers/md/raid5-cache.c | 18 +++++++++++++----- 3 files changed, 24 insertions(+), 7 deletions(-)
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index 33834db..38a2ac2 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -3637,8 +3637,11 @@ static void raid_postsuspend(struct dm_target *ti) { struct raid_set *rs = ti->private;
- if (!test_and_set_bit(RT_FLAG_RS_SUSPENDED, &rs->runtime_flags)) + if (!test_and_set_bit(RT_FLAG_RS_SUSPENDED, &rs->runtime_flags)) { + mddev_lock_nointr(&rs->md); mddev_suspend(&rs->md); + mddev_unlock(&rs->md); + }
rs->md.ro = 1; } @@ -3898,8 +3901,11 @@ static void raid_resume(struct dm_target *ti) if (!(rs->ctr_flags & RESUME_STAY_FROZEN_FLAGS)) clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
- if (test_and_clear_bit(RT_FLAG_RS_SUSPENDED, &rs->runtime_flags)) + if (test_and_clear_bit(RT_FLAG_RS_SUSPENDED, &rs->runtime_flags)) { + mddev_lock_nointr(mddev); mddev_resume(mddev); + mddev_unlock(mddev); + } }
static struct target_type raid_target = { diff --git a/drivers/md/md.c b/drivers/md/md.c index 24e64b0..436066d 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -344,6 +344,7 @@ static blk_qc_t md_make_request(struct request_queue *q, struct bio *bio) void mddev_suspend(struct mddev *mddev) { WARN_ON_ONCE(mddev->thread && current == mddev->thread->tsk); + lockdep_assert_held(&mddev->reconfig_mutex); if (mddev->suspended++) return; synchronize_rcu(); @@ -357,6 +358,7 @@ EXPORT_SYMBOL_GPL(mddev_suspend);
void mddev_resume(struct mddev *mddev) { + lockdep_assert_held(&mddev->reconfig_mutex); if (--mddev->suspended) return; wake_up(&mddev->sb_wait); @@ -663,6 +665,7 @@ void mddev_unlock(struct mddev *mddev) */ spin_lock(&pers_lock); md_wakeup_thread(mddev->thread); + wake_up(&mddev->sb_wait); spin_unlock(&pers_lock); } EXPORT_SYMBOL_GPL(mddev_unlock); diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index 9a34072..79d8127 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c @@ -693,6 +693,8 @@ static void r5c_disable_writeback_async(struct work_struct *work) struct r5l_log *log = container_of(work, struct r5l_log, disable_writeback_work); struct mddev *mddev = log->rdev->mddev; + struct r5conf *conf = mddev->private; + int locked = 0;
if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) return; @@ -701,11 +703,15 @@ static void r5c_disable_writeback_async(struct work_struct *work)
/* wait superblock change before suspend */ wait_event(mddev->sb_wait, - !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)); - - mddev_suspend(mddev); - log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH; - mddev_resume(mddev); + conf->log == NULL || + (!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags) && + (locked = mddev_trylock(mddev)))); + if (locked) { + mddev_suspend(mddev); + log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH; + mddev_resume(mddev); + mddev_unlock(mddev); + } }
static void r5l_submit_current_io(struct r5l_log *log) @@ -3161,6 +3167,8 @@ void r5l_exit_log(struct r5conf *conf) conf->log = NULL; synchronize_rcu();
+ /* Ensure disable_writeback_work wakes up and exits */ + wake_up(&conf->mddev->sb_wait); flush_work(&log->disable_writeback_work); md_unregister_thread(&log->reclaim_thread); mempool_destroy(log->meta_pool);
From: NeilBrown neilb@suse.com
commit 52a0d49de3d592a3118e13f35985e3d99eaf43df upstream
bitmap_create() allocates memory with GFP_KERNEL and so can wait for IO. If called while the array is quiesced, it could wait indefinitely for write out to the array - deadlock. So call bitmap_create() before quiescing the array.
Signed-off-by: NeilBrown neilb@suse.com Signed-off-by: Shaohua Li shli@fb.com [jwang: cherry-pick to 4.14] Signed-off-by: Jack Wang jinpu.wang@profitbricks.com --- drivers/md/md.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-)
diff --git a/drivers/md/md.c b/drivers/md/md.c index 436066d..cf5bc31 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -6644,22 +6644,26 @@ static int set_bitmap_file(struct mddev *mddev, int fd) return -ENOENT; /* cannot remove what isn't there */ err = 0; if (mddev->pers) { - mddev->pers->quiesce(mddev, 1); if (fd >= 0) { struct bitmap *bitmap;
bitmap = bitmap_create(mddev, -1); + mddev->pers->quiesce(mddev, 1); if (!IS_ERR(bitmap)) { mddev->bitmap = bitmap; err = bitmap_load(mddev); } else err = PTR_ERR(bitmap); - } - if (fd < 0 || err) { + if (err) { + bitmap_destroy(mddev); + fd = -1; + } + mddev->pers->quiesce(mddev, 0); + } else if (fd < 0) { + mddev->pers->quiesce(mddev, 1); bitmap_destroy(mddev); - fd = -1; /* make sure to put the file */ + mddev->pers->quiesce(mddev, 0); } - mddev->pers->quiesce(mddev, 0); } if (fd < 0) { struct file *f = mddev->bitmap_info.file; @@ -6943,8 +6947,8 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info) mddev->bitmap_info.default_offset; mddev->bitmap_info.space = mddev->bitmap_info.default_space; - mddev->pers->quiesce(mddev, 1); bitmap = bitmap_create(mddev, -1); + mddev->pers->quiesce(mddev, 1); if (!IS_ERR(bitmap)) { mddev->bitmap = bitmap; rv = bitmap_load(mddev);
From: NeilBrown neilb@suse.com
commit b3143b9a38d5039bcd1f2d1c94039651bfba8043 upstream
responding to ->suspend_lo and ->suspend_hi is similar to responding to ->suspended. It is best to wait in the common core code without incrementing ->active_io. This allows mddev_suspend()/mddev_resume() to work while requests are waiting for suspend_lo/hi to change. This is will be important after a subsequent patch which uses mddev_suspend() to synchronize updating for suspend_lo/hi.
So move the code for testing suspend_lo/hi out of raid1.c and raid5.c, and place it in md.c
Signed-off-by: NeilBrown neilb@suse.com Signed-off-by: Shaohua Li shli@fb.com [jwang: cherry-pick to 4.14] Signed-off-by: Jack Wang jinpu.wang@profitbricks.com --- drivers/md/md.c | 29 +++++++++++++++++++++++------ drivers/md/raid1.c | 14 +++++--------- drivers/md/raid5.c | 22 ---------------------- 3 files changed, 28 insertions(+), 37 deletions(-)
diff --git a/drivers/md/md.c b/drivers/md/md.c index cf5bc31..8f7220a 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -266,16 +266,31 @@ static DEFINE_SPINLOCK(all_mddevs_lock); * call has finished, the bio has been linked into some internal structure * and so is visible to ->quiesce(), so we don't need the refcount any more. */ +static bool is_suspended(struct mddev *mddev, struct bio *bio) +{ + if (mddev->suspended) + return true; + if (bio_data_dir(bio) != WRITE) + return false; + if (mddev->suspend_lo >= mddev->suspend_hi) + return false; + if (bio->bi_iter.bi_sector >= mddev->suspend_hi) + return false; + if (bio_end_sector(bio) < mddev->suspend_lo) + return false; + return true; +} + void md_handle_request(struct mddev *mddev, struct bio *bio) { check_suspended: rcu_read_lock(); - if (mddev->suspended) { + if (is_suspended(mddev, bio)) { DEFINE_WAIT(__wait); for (;;) { prepare_to_wait(&mddev->sb_wait, &__wait, TASK_UNINTERRUPTIBLE); - if (!mddev->suspended) + if (!is_suspended(mddev, bio)) break; rcu_read_unlock(); schedule(); @@ -4848,10 +4863,11 @@ suspend_lo_store(struct mddev *mddev, const char *buf, size_t len) goto unlock; old = mddev->suspend_lo; mddev->suspend_lo = new; - if (new >= old) + if (new >= old) { /* Shrinking suspended region */ + wake_up(&mddev->sb_wait); mddev->pers->quiesce(mddev, 2); - else { + } else { /* Expanding suspended region - need to wait */ mddev->pers->quiesce(mddev, 1); mddev->pers->quiesce(mddev, 0); @@ -4891,10 +4907,11 @@ suspend_hi_store(struct mddev *mddev, const char *buf, size_t len) goto unlock; old = mddev->suspend_hi; mddev->suspend_hi = new; - if (new <= old) + if (new <= old) { /* Shrinking suspended region */ + wake_up(&mddev->sb_wait); mddev->pers->quiesce(mddev, 2); - else { + } else { /* Expanding suspended region - need to wait */ mddev->pers->quiesce(mddev, 1); mddev->pers->quiesce(mddev, 0); diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index e4e01d3..bd5976a 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1298,11 +1298,9 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, */
- if ((bio_end_sector(bio) > mddev->suspend_lo && - bio->bi_iter.bi_sector < mddev->suspend_hi) || - (mddev_is_clustered(mddev) && + if (mddev_is_clustered(mddev) && md_cluster_ops->area_resyncing(mddev, WRITE, - bio->bi_iter.bi_sector, bio_end_sector(bio)))) { + bio->bi_iter.bi_sector, bio_end_sector(bio))) {
/* * As the suspend_* range is controlled by userspace, we want @@ -1313,12 +1311,10 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, sigset_t full, old; prepare_to_wait(&conf->wait_barrier, &w, TASK_INTERRUPTIBLE); - if ((bio_end_sector(bio) <= mddev->suspend_lo || - bio->bi_iter.bi_sector >= mddev->suspend_hi) && - (!mddev_is_clustered(mddev) || - !md_cluster_ops->area_resyncing(mddev, WRITE, + if (!mddev_is_clustered(mddev) || + !md_cluster_ops->area_resyncing(mddev, WRITE, bio->bi_iter.bi_sector, - bio_end_sector(bio)))) + bio_end_sector(bio))) break; sigfillset(&full); sigprocmask(SIG_BLOCK, &full, &old); diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index de1ef62..30c1dc1 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -5686,28 +5686,6 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi) goto retry; }
- if (rw == WRITE && - logical_sector >= mddev->suspend_lo && - logical_sector < mddev->suspend_hi) { - raid5_release_stripe(sh); - /* As the suspend_* range is controlled by - * userspace, we want an interruptible - * wait. - */ - prepare_to_wait(&conf->wait_for_overlap, - &w, TASK_INTERRUPTIBLE); - if (logical_sector >= mddev->suspend_lo && - logical_sector < mddev->suspend_hi) { - sigset_t full, old; - sigfillset(&full); - sigprocmask(SIG_BLOCK, &full, &old); - schedule(); - sigprocmask(SIG_SETMASK, &old, NULL); - do_prepare = true; - } - goto retry; - } - if (test_bit(STRIPE_EXPANDING, &sh->state) || !add_stripe_bio(sh, bi, dd_idx, rw, previous)) { /* Stripe is busy expanding or
From: NeilBrown neilb@suse.com
commit 9e1cc0a54556a6c63dc0cfb7cd7d60d43337bba6 upstream
mddev_suspend() is a more general interface than calling ->quiesce() and is so more extensible. A future patch will make use of this.
Signed-off-by: NeilBrown neilb@suse.com Signed-off-by: Shaohua Li shli@fb.com [jwang: cherry-pick to 4.14] Signed-off-by: Jack Wang jinpu.wang@profitbricks.com --- drivers/md/md.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-)
diff --git a/drivers/md/md.c b/drivers/md/md.c index 8f7220a..56e180b 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -4869,8 +4869,8 @@ suspend_lo_store(struct mddev *mddev, const char *buf, size_t len) mddev->pers->quiesce(mddev, 2); } else { /* Expanding suspended region - need to wait */ - mddev->pers->quiesce(mddev, 1); - mddev->pers->quiesce(mddev, 0); + mddev_suspend(mddev); + mddev_resume(mddev); } err = 0; unlock: @@ -4913,8 +4913,8 @@ suspend_hi_store(struct mddev *mddev, const char *buf, size_t len) mddev->pers->quiesce(mddev, 2); } else { /* Expanding suspended region - need to wait */ - mddev->pers->quiesce(mddev, 1); - mddev->pers->quiesce(mddev, 0); + mddev_suspend(mddev); + mddev_resume(mddev); } err = 0; unlock: @@ -6665,7 +6665,7 @@ static int set_bitmap_file(struct mddev *mddev, int fd) struct bitmap *bitmap;
bitmap = bitmap_create(mddev, -1); - mddev->pers->quiesce(mddev, 1); + mddev_suspend(mddev); if (!IS_ERR(bitmap)) { mddev->bitmap = bitmap; err = bitmap_load(mddev); @@ -6675,11 +6675,11 @@ static int set_bitmap_file(struct mddev *mddev, int fd) bitmap_destroy(mddev); fd = -1; } - mddev->pers->quiesce(mddev, 0); + mddev_resume(mddev); } else if (fd < 0) { - mddev->pers->quiesce(mddev, 1); + mddev_suspend(mddev); bitmap_destroy(mddev); - mddev->pers->quiesce(mddev, 0); + mddev_resume(mddev); } } if (fd < 0) { @@ -6965,7 +6965,7 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info) mddev->bitmap_info.space = mddev->bitmap_info.default_space; bitmap = bitmap_create(mddev, -1); - mddev->pers->quiesce(mddev, 1); + mddev_suspend(mddev); if (!IS_ERR(bitmap)) { mddev->bitmap = bitmap; rv = bitmap_load(mddev); @@ -6973,7 +6973,7 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info) rv = PTR_ERR(bitmap); if (rv) bitmap_destroy(mddev); - mddev->pers->quiesce(mddev, 0); + mddev_resume(mddev); } else { /* remove the bitmap */ if (!mddev->bitmap) { @@ -6996,9 +6996,9 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info) mddev->bitmap_info.nodes = 0; md_cluster_ops->leave(mddev); } - mddev->pers->quiesce(mddev, 1); + mddev_suspend(mddev); bitmap_destroy(mddev); - mddev->pers->quiesce(mddev, 0); + mddev_resume(mddev); mddev->bitmap_info.offset = 0; } }
From: NeilBrown neilb@suse.com
commit 35bfc52187f6df8779d0f1cebdb52b7f797baf4e upstream
There are various deadlocks that can occur when a thread holds reconfig_mutex and calls ->quiesce(mddev, 1). As some write request block waiting for metadata to be updated (e.g. to record device failure), and as the md thread updates the metadata while the reconfig mutex is held, holding the mutex can stop write requests completing, and this prevents ->quiesce(mddev, 1) from completing.
->quiesce() is now usually called from mddev_suspend(), and it is always called with reconfig_mutex held. So at this time it is safe for the thread to update metadata without explicitly taking the lock.
So add 2 new flags, one which says the unlocked updates is allowed, and one which ways it is happening. Then allow it while the quiesce completes, and then wait for it to finish.
Reported-and-tested-by: Xiao Ni xni@redhat.com Signed-off-by: NeilBrown neilb@suse.com Signed-off-by: Shaohua Li shli@fb.com [jwang: cherry-pick to 4.14] Signed-off-by: Jack Wang jinpu.wang@profitbricks.com --- drivers/md/md.c | 14 ++++++++++++++ drivers/md/md.h | 6 ++++++ 2 files changed, 20 insertions(+)
diff --git a/drivers/md/md.c b/drivers/md/md.c index 56e180b..ceee5f5 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -364,8 +364,12 @@ void mddev_suspend(struct mddev *mddev) return; synchronize_rcu(); wake_up(&mddev->sb_wait); + set_bit(MD_ALLOW_SB_UPDATE, &mddev->flags); + smp_mb__after_atomic(); wait_event(mddev->sb_wait, atomic_read(&mddev->active_io) == 0); mddev->pers->quiesce(mddev, 1); + clear_bit_unlock(MD_ALLOW_SB_UPDATE, &mddev->flags); + wait_event(mddev->sb_wait, !test_bit(MD_UPDATING_SB, &mddev->flags));
del_timer_sync(&mddev->safemode_timer); } @@ -8880,6 +8884,16 @@ void md_check_recovery(struct mddev *mddev) unlock: wake_up(&mddev->sb_wait); mddev_unlock(mddev); + } else if (test_bit(MD_ALLOW_SB_UPDATE, &mddev->flags) && mddev->sb_flags) { + /* Write superblock - thread that called mddev_suspend() + * holds reconfig_mutex for us. + */ + set_bit(MD_UPDATING_SB, &mddev->flags); + smp_mb__after_atomic(); + if (test_bit(MD_ALLOW_SB_UPDATE, &mddev->flags)) + md_update_sb(mddev, 0); + clear_bit_unlock(MD_UPDATING_SB, &mddev->flags); + wake_up(&mddev->sb_wait); } } EXPORT_SYMBOL(md_check_recovery); diff --git a/drivers/md/md.h b/drivers/md/md.h index 9b0a896..37c19b7 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -237,6 +237,12 @@ enum mddev_flags { */ MD_HAS_PPL, /* The raid array has PPL feature set */ MD_HAS_MULTIPLE_PPLS, /* The raid array has multiple PPLs feature set */ + MD_ALLOW_SB_UPDATE, /* md_check_recovery is allowed to update + * the metadata without taking reconfig_mutex. + */ + MD_UPDATING_SB, /* md_check_recovery is updating the metadata + * without explicitly holding reconfig_mutex. + */ };
enum mddev_sb_flags {
From: NeilBrown neilb@suse.com
commit b03e0ccb5ab9df3efbe51c87843a1ffbecbafa1f upstream
The '2' argument means "wake up anything that is waiting". This is an inelegant part of the design and was added to help support management of suspend_lo/suspend_hi setting. Now that suspend_lo/hi is managed in mddev_suspend/resume, that need is gone. These is still a couple of places where we call 'quiesce' with an argument of '2', but they can safely be changed to call ->quiesce(.., 1); ->quiesce(.., 0) which achieve the same result at the small cost of pausing IO briefly.
This removes a small "optimization" from suspend_{hi,lo}_store, but it isn't clear that optimization served a useful purpose. The code now is a lot clearer.
Suggested-by: Shaohua Li shli@kernel.org Signed-off-by: NeilBrown neilb@suse.com Signed-off-by: Shaohua Li shli@fb.com [jwang: cherry-pick to 4.14] Signed-off-by: Jack Wang jinpu.wang@profitbricks.com --- drivers/md/md-cluster.c | 6 +++--- drivers/md/md.c | 34 ++++++++++------------------------ drivers/md/md.h | 9 ++++----- drivers/md/raid0.c | 2 +- drivers/md/raid1.c | 13 +++---------- drivers/md/raid10.c | 10 +++------- drivers/md/raid5-cache.c | 12 ++++++------ drivers/md/raid5-log.h | 2 +- drivers/md/raid5.c | 18 ++++++------------ 9 files changed, 37 insertions(+), 69 deletions(-)
diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c index 03082e1..72ce0bc 100644 --- a/drivers/md/md-cluster.c +++ b/drivers/md/md-cluster.c @@ -442,10 +442,11 @@ static void __remove_suspend_info(struct md_cluster_info *cinfo, int slot) static void remove_suspend_info(struct mddev *mddev, int slot) { struct md_cluster_info *cinfo = mddev->cluster_info; + mddev->pers->quiesce(mddev, 1); spin_lock_irq(&cinfo->suspend_lock); __remove_suspend_info(cinfo, slot); spin_unlock_irq(&cinfo->suspend_lock); - mddev->pers->quiesce(mddev, 2); + mddev->pers->quiesce(mddev, 0); }
@@ -492,13 +493,12 @@ static void process_suspend_info(struct mddev *mddev, s->lo = lo; s->hi = hi; mddev->pers->quiesce(mddev, 1); - mddev->pers->quiesce(mddev, 0); spin_lock_irq(&cinfo->suspend_lock); /* Remove existing entry (if exists) before adding */ __remove_suspend_info(cinfo, slot); list_add(&s->list, &cinfo->suspend_list); spin_unlock_irq(&cinfo->suspend_lock); - mddev->pers->quiesce(mddev, 2); + mddev->pers->quiesce(mddev, 0); }
static void process_add_new_disk(struct mddev *mddev, struct cluster_msg *cmsg) diff --git a/drivers/md/md.c b/drivers/md/md.c index ceee5f5..22c9024 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -4849,7 +4849,7 @@ suspend_lo_show(struct mddev *mddev, char *page) static ssize_t suspend_lo_store(struct mddev *mddev, const char *buf, size_t len) { - unsigned long long old, new; + unsigned long long new; int err;
err = kstrtoull(buf, 10, &new); @@ -4865,17 +4865,10 @@ suspend_lo_store(struct mddev *mddev, const char *buf, size_t len) if (mddev->pers == NULL || mddev->pers->quiesce == NULL) goto unlock; - old = mddev->suspend_lo; + mddev_suspend(mddev); mddev->suspend_lo = new; - if (new >= old) { - /* Shrinking suspended region */ - wake_up(&mddev->sb_wait); - mddev->pers->quiesce(mddev, 2); - } else { - /* Expanding suspended region - need to wait */ - mddev_suspend(mddev); - mddev_resume(mddev); - } + mddev_resume(mddev); + err = 0; unlock: mddev_unlock(mddev); @@ -4893,7 +4886,7 @@ suspend_hi_show(struct mddev *mddev, char *page) static ssize_t suspend_hi_store(struct mddev *mddev, const char *buf, size_t len) { - unsigned long long old, new; + unsigned long long new; int err;
err = kstrtoull(buf, 10, &new); @@ -4906,20 +4899,13 @@ suspend_hi_store(struct mddev *mddev, const char *buf, size_t len) if (err) return err; err = -EINVAL; - if (mddev->pers == NULL || - mddev->pers->quiesce == NULL) + if (mddev->pers == NULL) goto unlock; - old = mddev->suspend_hi; + + mddev_suspend(mddev); mddev->suspend_hi = new; - if (new <= old) { - /* Shrinking suspended region */ - wake_up(&mddev->sb_wait); - mddev->pers->quiesce(mddev, 2); - } else { - /* Expanding suspended region - need to wait */ - mddev_suspend(mddev); - mddev_resume(mddev); - } + mddev_resume(mddev); + err = 0; unlock: mddev_unlock(mddev); diff --git a/drivers/md/md.h b/drivers/md/md.h index 37c19b7..11696ab 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -546,12 +546,11 @@ struct md_personality int (*check_reshape) (struct mddev *mddev); int (*start_reshape) (struct mddev *mddev); void (*finish_reshape) (struct mddev *mddev); - /* quiesce moves between quiescence states - * 0 - fully active - * 1 - no new requests allowed - * others - reserved + /* quiesce suspends or resumes internal processing. + * 1 - stop new actions and wait for action io to complete + * 0 - return to normal behaviour */ - void (*quiesce) (struct mddev *mddev, int state); + void (*quiesce) (struct mddev *mddev, int quiesce); /* takeover is used to transition an array from one * personality to another. The new personality must be able * to handle the data in the current layout. diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index 5a00fc1..5ecba9e 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -768,7 +768,7 @@ static void *raid0_takeover(struct mddev *mddev) return ERR_PTR(-EINVAL); }
-static void raid0_quiesce(struct mddev *mddev, int state) +static void raid0_quiesce(struct mddev *mddev, int quiesce) { }
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index bd5976a..029ecba 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -3276,21 +3276,14 @@ static int raid1_reshape(struct mddev *mddev) return 0; }
-static void raid1_quiesce(struct mddev *mddev, int state) +static void raid1_quiesce(struct mddev *mddev, int quiesce) { struct r1conf *conf = mddev->private;
- switch(state) { - case 2: /* wake for suspend */ - wake_up(&conf->wait_barrier); - break; - case 1: + if (quiesce) freeze_array(conf, 0); - break; - case 0: + else unfreeze_array(conf); - break; - } }
static void *raid1_takeover(struct mddev *mddev) diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 5fb31ef..b20c23f 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -3838,18 +3838,14 @@ static void raid10_free(struct mddev *mddev, void *priv) kfree(conf); }
-static void raid10_quiesce(struct mddev *mddev, int state) +static void raid10_quiesce(struct mddev *mddev, int quiesce) { struct r10conf *conf = mddev->private;
- switch(state) { - case 1: + if (quiesce) raise_barrier(conf, 0); - break; - case 0: + else lower_barrier(conf); - break; - } }
static int raid10_resize(struct mddev *mddev, sector_t sectors) diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index 79d8127..0d535b4 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c @@ -1589,21 +1589,21 @@ void r5l_wake_reclaim(struct r5l_log *log, sector_t space) md_wakeup_thread(log->reclaim_thread); }
-void r5l_quiesce(struct r5l_log *log, int state) +void r5l_quiesce(struct r5l_log *log, int quiesce) { struct mddev *mddev; - if (!log || state == 2) + if (!log) return; - if (state == 0) - kthread_unpark(log->reclaim_thread->tsk); - else if (state == 1) { + + if (quiesce) { /* make sure r5l_write_super_and_discard_space exits */ mddev = log->rdev->mddev; wake_up(&mddev->sb_wait); kthread_park(log->reclaim_thread->tsk); r5l_wake_reclaim(log, MaxSector); r5l_do_reclaim(log); - } + } else + kthread_unpark(log->reclaim_thread->tsk); }
bool r5l_log_disk_error(struct r5conf *conf) diff --git a/drivers/md/raid5-log.h b/drivers/md/raid5-log.h index 7f9ad5f..284578b 100644 --- a/drivers/md/raid5-log.h +++ b/drivers/md/raid5-log.h @@ -9,7 +9,7 @@ extern void r5l_write_stripe_run(struct r5l_log *log); extern void r5l_flush_stripe_to_raid(struct r5l_log *log); extern void r5l_stripe_write_finished(struct stripe_head *sh); extern int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio); -extern void r5l_quiesce(struct r5l_log *log, int state); +extern void r5l_quiesce(struct r5l_log *log, int quiesce); extern bool r5l_log_disk_error(struct r5conf *conf); extern bool r5c_is_writeback(struct r5l_log *log); extern int diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 30c1dc1..07ca2fd 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -8003,16 +8003,12 @@ static void raid5_finish_reshape(struct mddev *mddev) } }
-static void raid5_quiesce(struct mddev *mddev, int state) +static void raid5_quiesce(struct mddev *mddev, int quiesce) { struct r5conf *conf = mddev->private;
- switch(state) { - case 2: /* resume for a suspend */ - wake_up(&conf->wait_for_overlap); - break; - - case 1: /* stop all writes */ + if (quiesce) { + /* stop all writes */ lock_all_device_hash_locks_irq(conf); /* '2' tells resync/reshape to pause so that all * active stripes can drain @@ -8028,17 +8024,15 @@ static void raid5_quiesce(struct mddev *mddev, int state) unlock_all_device_hash_locks_irq(conf); /* allow reshape to continue */ wake_up(&conf->wait_for_overlap); - break; - - case 0: /* re-enable writes */ + } else { + /* re-enable writes */ lock_all_device_hash_locks_irq(conf); conf->quiesce = 0; wake_up(&conf->wait_for_quiescent); wake_up(&conf->wait_for_overlap); unlock_all_device_hash_locks_irq(conf); - break; } - r5l_quiesce(conf->log, state); + r5l_quiesce(conf->log, quiesce); }
static void *raid45_takeover_raid0(struct mddev *mddev, int level)
On Mon, Jun 25, 2018 at 04:41:07PM +0200, Jack Wang wrote:
Hi Greg,
We hit a panic in 4.14.43 with md raid1.
It's easy to reproduce with a test case, running Fio, and doing mdadm --grow bitmap=none /dev/mdx && mdadm --grow bitmap=internal /dev/mdx in a loop.
With following patches applied, we can no longer reproduce the problem.
Please consider to apply the patches to 4.14, they can be applied cleanly on 4.14.52.
Now queued up, thanks!
greg k-h
Thanks! On Thu, Jul 5, 2018 at 7:00 PM Greg KH gregkh@linuxfoundation.org wrote:
On Mon, Jun 25, 2018 at 04:41:07PM +0200, Jack Wang wrote:
Hi Greg,
We hit a panic in 4.14.43 with md raid1.
It's easy to reproduce with a test case, running Fio, and doing mdadm --grow bitmap=none /dev/mdx && mdadm --grow bitmap=internal /dev/mdx in a loop.
With following patches applied, we can no longer reproduce the problem.
Please consider to apply the patches to 4.14, they can be applied cleanly on 4.14.52.
Now queued up, thanks!
greg k-h
linux-stable-mirror@lists.linaro.org