We use the async_delalloc_pages mechanism to make sure that we've
completed our async work before trying to continue our delalloc
flushing. The reason for this is we need to see any ordered extents
that were created by our delalloc flushing. However we're waking up
before we do the submit work, which is before we create the ordered
extents. This is a pretty wide race window where we could potentially
think there are no ordered extents and thus exit shrink_delalloc
prematurely. Fix this by waking us up after we've done the work to
create ordered extents.
cc: stable(a)vger.kernel.org
Signed-off-by: Josef Bacik <josef(a)toxicpanda.com>
---
fs/btrfs/inode.c | 10 +++++-----
1 file changed, 5 insertions(+), 5 deletions(-)
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index b1f02e3fea5d..e388153c4ae4 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1290,11 +1290,6 @@ static noinline void async_cow_submit(struct btrfs_work *work)
nr_pages = (async_chunk->end - async_chunk->start + PAGE_SIZE) >>
PAGE_SHIFT;
- /* atomic_sub_return implies a barrier */
- if (atomic_sub_return(nr_pages, &fs_info->async_delalloc_pages) <
- 5 * SZ_1M)
- cond_wake_up_nomb(&fs_info->async_submit_wait);
-
/*
* ->inode could be NULL if async_chunk_start has failed to compress,
* in which case we don't have anything to submit, yet we need to
@@ -1303,6 +1298,11 @@ static noinline void async_cow_submit(struct btrfs_work *work)
*/
if (async_chunk->inode)
submit_compressed_extents(async_chunk);
+
+ /* atomic_sub_return implies a barrier */
+ if (atomic_sub_return(nr_pages, &fs_info->async_delalloc_pages) <
+ 5 * SZ_1M)
+ cond_wake_up_nomb(&fs_info->async_submit_wait);
}
static noinline void async_cow_free(struct btrfs_work *work)
--
2.26.3
We have been hitting some early ENOSPC issues in production with more
recent kernels, and I tracked it down to us simply not flushing delalloc
as aggressively as we should be. With tracing I was seeing us failing
all tickets with all of the block rsvs at or around 0, with very little
pinned space, but still around 120MiB of outstanding bytes_may_used.
Upon further investigation I saw that we were flushing around 14 pages
per shrink call for delalloc, despite having around 2GiB of delalloc
outstanding.
Consider the example of a 8 way machine, all CPUs trying to create a
file in parallel, which at the time of this commit requires 5 items to
do. Assuming a 16k leaf size, we have 10MiB of total metadata reclaim
size waiting on reservations. Now assume we have 128MiB of delalloc
outstanding. With our current math we would set items to 20, and then
set to_reclaim to 20 * 256k, or 5MiB.
Assuming that we went through this loop all 3 times, for both
FLUSH_DELALLOC and FLUSH_DELALLOC_WAIT, and then did the full loop
twice, we'd only flush 60MiB of the 128MiB delalloc space. This could
leave a fair bit of delalloc reservations still hanging around by the
time we go to ENOSPC out all the remaining tickets.
Fix this two ways. First, change the calculations to be a fraction of
the total delalloc bytes on the system. Prior to this change we were
calculating based on dirty inodes so our math made more sense, now it's
just completely unrelated to what we're actually doing.
Second add a FLUSH_DELALLOC_FULL state, that we hold off until we've
gone through the flush states at least once. This will empty the system
of all delalloc so we're sure to be truly out of space when we start
failing tickets.
I'm tagging stable 5.10 and forward, because this is where we started
using the page stuff heavily again. This affects earlier kernel
versions as well, but would be a pain to backport to them as the
flushing mechanisms aren't the same.
CC: stable(a)vger.kernel.org # 5.10+
Signed-off-by: Josef Bacik <josef(a)toxicpanda.com>
---
fs/btrfs/ctree.h | 9 +++++----
fs/btrfs/space-info.c | 35 ++++++++++++++++++++++++++---------
include/trace/events/btrfs.h | 1 +
3 files changed, 32 insertions(+), 13 deletions(-)
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index d7ef4d7d2c1a..232ff1a49ca6 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -2783,10 +2783,11 @@ enum btrfs_flush_state {
FLUSH_DELAYED_REFS = 4,
FLUSH_DELALLOC = 5,
FLUSH_DELALLOC_WAIT = 6,
- ALLOC_CHUNK = 7,
- ALLOC_CHUNK_FORCE = 8,
- RUN_DELAYED_IPUTS = 9,
- COMMIT_TRANS = 10,
+ FLUSH_DELALLOC_FULL = 7,
+ ALLOC_CHUNK = 8,
+ ALLOC_CHUNK_FORCE = 9,
+ RUN_DELAYED_IPUTS = 10,
+ COMMIT_TRANS = 11,
};
int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c
index af161eb808a2..0c539a94c6d9 100644
--- a/fs/btrfs/space-info.c
+++ b/fs/btrfs/space-info.c
@@ -494,6 +494,9 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info,
long time_left;
int loops;
+ delalloc_bytes = percpu_counter_sum_positive(&fs_info->delalloc_bytes);
+ ordered_bytes = percpu_counter_sum_positive(&fs_info->ordered_bytes);
+
/* Calc the number of the pages we need flush for space reservation */
if (to_reclaim == U64_MAX) {
items = U64_MAX;
@@ -501,19 +504,21 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info,
/*
* to_reclaim is set to however much metadata we need to
* reclaim, but reclaiming that much data doesn't really track
- * exactly, so increase the amount to reclaim by 2x in order to
- * make sure we're flushing enough delalloc to hopefully reclaim
- * some metadata reservations.
+ * exactly. What we really want to do is reclaim full inode's
+ * worth of reservations, however that's not available to us
+ * here. We will take a fraction of the delalloc bytes for our
+ * flushing loops and hope for the best. Delalloc will expand
+ * the amount we write to cover an entire dirty extent, which
+ * will reclaim the metadata reservation for that range. If
+ * it's not enough subsequent flush stages will be more
+ * aggressive.
*/
+ to_reclaim = max(to_reclaim, delalloc_bytes >> 3);
items = calc_reclaim_items_nr(fs_info, to_reclaim) * 2;
- to_reclaim = items * EXTENT_SIZE_PER_ITEM;
}
trans = (struct btrfs_trans_handle *)current->journal_info;
- delalloc_bytes = percpu_counter_sum_positive(
- &fs_info->delalloc_bytes);
- ordered_bytes = percpu_counter_sum_positive(&fs_info->ordered_bytes);
if (delalloc_bytes == 0 && ordered_bytes == 0)
return;
@@ -596,8 +601,11 @@ static void flush_space(struct btrfs_fs_info *fs_info,
break;
case FLUSH_DELALLOC:
case FLUSH_DELALLOC_WAIT:
+ case FLUSH_DELALLOC_FULL:
+ if (state == FLUSH_DELALLOC_FULL)
+ num_bytes = U64_MAX;
shrink_delalloc(fs_info, space_info, num_bytes,
- state == FLUSH_DELALLOC_WAIT, for_preempt);
+ state != FLUSH_DELALLOC, for_preempt);
break;
case FLUSH_DELAYED_REFS_NR:
case FLUSH_DELAYED_REFS:
@@ -907,6 +915,14 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
commit_cycles--;
}
+ /*
+ * We do not want to empty the system of delalloc unless we're
+ * under heavy pressure, so allow one trip through the flushing
+ * logic before we start doing a FLUSH_DELALLOC_FULL.
+ */
+ if (flush_state == FLUSH_DELALLOC_FULL && !commit_cycles)
+ flush_state++;
+
/*
* We don't want to force a chunk allocation until we've tried
* pretty hard to reclaim space. Think of the case where we
@@ -1070,7 +1086,7 @@ static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work)
* so if we now have space to allocate do the force chunk allocation.
*/
static const enum btrfs_flush_state data_flush_states[] = {
- FLUSH_DELALLOC_WAIT,
+ FLUSH_DELALLOC_FULL,
RUN_DELAYED_IPUTS,
COMMIT_TRANS,
ALLOC_CHUNK_FORCE,
@@ -1159,6 +1175,7 @@ static const enum btrfs_flush_state evict_flush_states[] = {
FLUSH_DELAYED_REFS,
FLUSH_DELALLOC,
FLUSH_DELALLOC_WAIT,
+ FLUSH_DELALLOC_FULL,
ALLOC_CHUNK,
COMMIT_TRANS,
};
diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h
index 3d81ba8c37b9..ddf5c250726c 100644
--- a/include/trace/events/btrfs.h
+++ b/include/trace/events/btrfs.h
@@ -94,6 +94,7 @@ struct btrfs_space_info;
EM( FLUSH_DELAYED_ITEMS, "FLUSH_DELAYED_ITEMS") \
EM( FLUSH_DELALLOC, "FLUSH_DELALLOC") \
EM( FLUSH_DELALLOC_WAIT, "FLUSH_DELALLOC_WAIT") \
+ EM( FLUSH_DELALLOC_FULL, "FLUSH_DELALLOC_FULL") \
EM( FLUSH_DELAYED_REFS_NR, "FLUSH_DELAYED_REFS_NR") \
EM( FLUSH_DELAYED_REFS, "FLUSH_ELAYED_REFS") \
EM( ALLOC_CHUNK, "ALLOC_CHUNK") \
--
2.26.3
Function ceph_check_delayed_caps() is called from the mdsc->delayed_work
workqueue and it can be kept looping for quite some time if caps keep being
added back to the mdsc->cap_delay_list. This may result in the watchdog
tainting the kernel with the softlockup flag.
This patch re-arranges the loop through the caps list so that it initially
removes all the caps from list, adding them to a temporary list. And then, with
less locking contention, it will eventually call the ceph_check_caps() for each
inode. Any caps added to the list in the meantime will be handled in the next
run.
Cc: stable(a)vger.kernel.org
Signed-off-by: Luis Henriques <lhenriques(a)suse.de>
---
Hi Jeff!
So, I've not based this patch on top of your patchset that gets rid of
ceph_async_iput() so that it will make it easier to backport it for stable
kernels. Of course I'm not 100% this classifies as stable material.
Other than that, I've been testing this patch and I couldn't see anything
breaking. Let me know what you think.
(I *think* I've seen a tracker bug for this in the past but I couldn't
find it. I guess it could be added as a 'Link:' tag.)
Cheers,
--
Luis
fs/ceph/caps.c | 13 +++++++++----
1 file changed, 9 insertions(+), 4 deletions(-)
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index a5e93b185515..727e41e3b939 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -4229,6 +4229,7 @@ void ceph_check_delayed_caps(struct ceph_mds_client *mdsc)
{
struct inode *inode;
struct ceph_inode_info *ci;
+ LIST_HEAD(caps_list);
dout("check_delayed_caps\n");
spin_lock(&mdsc->cap_delay_lock);
@@ -4239,19 +4240,23 @@ void ceph_check_delayed_caps(struct ceph_mds_client *mdsc)
if ((ci->i_ceph_flags & CEPH_I_FLUSH) == 0 &&
time_before(jiffies, ci->i_hold_caps_max))
break;
- list_del_init(&ci->i_cap_delay_list);
+ list_move_tail(&ci->i_cap_delay_list, &caps_list);
+ }
+ spin_unlock(&mdsc->cap_delay_lock);
+ while (!list_empty(&caps_list)) {
+ ci = list_first_entry(&caps_list,
+ struct ceph_inode_info,
+ i_cap_delay_list);
+ list_del_init(&ci->i_cap_delay_list);
inode = igrab(&ci->vfs_inode);
if (inode) {
- spin_unlock(&mdsc->cap_delay_lock);
dout("check_delayed_caps on %p\n", inode);
ceph_check_caps(ci, 0, NULL);
/* avoid calling iput_final() in tick thread */
ceph_async_iput(inode);
- spin_lock(&mdsc->cap_delay_lock);
}
}
- spin_unlock(&mdsc->cap_delay_lock);
}
/*
On 4/9/21 1:00 AM, Marek Vasut wrote:
> The Microchip LAN8710Ai PHY requires XTAL1/CLKIN external clock to be
> enabled when the nRST is toggled according to datasheet Microchip
> LAN8710A/LAN8710Ai DS00002164B page 35 section 3.8.5.1 Hardware Reset:
[...]
> Fixes: 34e0c7847dcf ("ARM: dts: stm32: Add DH Electronics DHCOM STM32MP1 SoM and PDK2 board")
Adding stable to CC.
Patch is now part of Linux 5.13 as commit
1cebcf9932ab ("ARM: dts: stm32: Rework LAN8710Ai PHY reset on DHCOM SoM")
It would be nice to pick it into stable, since it fixes ethernet
stability issues on the device.
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From ea6d0630100b285f059d0a8d8e86f38a46407536 Mon Sep 17 00:00:00 2001
From: Naoya Horiguchi <naoya.horiguchi(a)nec.com>
Date: Thu, 24 Jun 2021 18:40:01 -0700
Subject: [PATCH] mm/hwpoison: do not lock page again when me_huge_page()
successfully recovers
Currently me_huge_page() temporary unlocks page to perform some actions
then locks it again later. My testcase (which calls hard-offline on
some tail page in a hugetlb, then accesses the address of the hugetlb
range) showed that page allocation code detects this page lock on buddy
page and printed out "BUG: Bad page state" message.
check_new_page_bad() does not consider a page with __PG_HWPOISON as bad
page, so this flag works as kind of filter, but this filtering doesn't
work in this case because the "bad page" is not the actual hwpoisoned
page. So stop locking page again. Actions to be taken depend on the
page type of the error, so page unlocking should be done in ->action()
callbacks. So let's make it assumed and change all existing callbacks
that way.
Link: https://lkml.kernel.org/r/20210609072029.74645-1-nao.horiguchi@gmail.com
Fixes: commit 78bb920344b8 ("mm: hwpoison: dissolve in-use hugepage in unrecoverable memory error")
Signed-off-by: Naoya Horiguchi <naoya.horiguchi(a)nec.com>
Cc: Oscar Salvador <osalvador(a)suse.de>
Cc: Michal Hocko <mhocko(a)suse.com>
Cc: Tony Luck <tony.luck(a)intel.com>
Cc: "Aneesh Kumar K.V" <aneesh.kumar(a)linux.vnet.ibm.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds(a)linux-foundation.org>
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index f24105db7081..6f5f78885ab4 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -658,6 +658,7 @@ static int truncate_error_page(struct page *p, unsigned long pfn,
*/
static int me_kernel(struct page *p, unsigned long pfn)
{
+ unlock_page(p);
return MF_IGNORED;
}
@@ -667,6 +668,7 @@ static int me_kernel(struct page *p, unsigned long pfn)
static int me_unknown(struct page *p, unsigned long pfn)
{
pr_err("Memory failure: %#lx: Unknown page state\n", pfn);
+ unlock_page(p);
return MF_FAILED;
}
@@ -675,6 +677,7 @@ static int me_unknown(struct page *p, unsigned long pfn)
*/
static int me_pagecache_clean(struct page *p, unsigned long pfn)
{
+ int ret;
struct address_space *mapping;
delete_from_lru_cache(p);
@@ -683,8 +686,10 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn)
* For anonymous pages we're done the only reference left
* should be the one m_f() holds.
*/
- if (PageAnon(p))
- return MF_RECOVERED;
+ if (PageAnon(p)) {
+ ret = MF_RECOVERED;
+ goto out;
+ }
/*
* Now truncate the page in the page cache. This is really
@@ -698,7 +703,8 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn)
/*
* Page has been teared down in the meanwhile
*/
- return MF_FAILED;
+ ret = MF_FAILED;
+ goto out;
}
/*
@@ -706,7 +712,10 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn)
*
* Open: to take i_mutex or not for this? Right now we don't.
*/
- return truncate_error_page(p, pfn, mapping);
+ ret = truncate_error_page(p, pfn, mapping);
+out:
+ unlock_page(p);
+ return ret;
}
/*
@@ -782,24 +791,26 @@ static int me_pagecache_dirty(struct page *p, unsigned long pfn)
*/
static int me_swapcache_dirty(struct page *p, unsigned long pfn)
{
+ int ret;
+
ClearPageDirty(p);
/* Trigger EIO in shmem: */
ClearPageUptodate(p);
- if (!delete_from_lru_cache(p))
- return MF_DELAYED;
- else
- return MF_FAILED;
+ ret = delete_from_lru_cache(p) ? MF_FAILED : MF_DELAYED;
+ unlock_page(p);
+ return ret;
}
static int me_swapcache_clean(struct page *p, unsigned long pfn)
{
+ int ret;
+
delete_from_swap_cache(p);
- if (!delete_from_lru_cache(p))
- return MF_RECOVERED;
- else
- return MF_FAILED;
+ ret = delete_from_lru_cache(p) ? MF_FAILED : MF_RECOVERED;
+ unlock_page(p);
+ return ret;
}
/*
@@ -820,6 +831,7 @@ static int me_huge_page(struct page *p, unsigned long pfn)
mapping = page_mapping(hpage);
if (mapping) {
res = truncate_error_page(hpage, pfn, mapping);
+ unlock_page(hpage);
} else {
res = MF_FAILED;
unlock_page(hpage);
@@ -834,7 +846,6 @@ static int me_huge_page(struct page *p, unsigned long pfn)
page_ref_inc(p);
res = MF_RECOVERED;
}
- lock_page(hpage);
}
return res;
@@ -866,6 +877,8 @@ static struct page_state {
unsigned long mask;
unsigned long res;
enum mf_action_page_type type;
+
+ /* Callback ->action() has to unlock the relevant page inside it. */
int (*action)(struct page *p, unsigned long pfn);
} error_states[] = {
{ reserved, reserved, MF_MSG_KERNEL, me_kernel },
@@ -929,6 +942,7 @@ static int page_action(struct page_state *ps, struct page *p,
int result;
int count;
+ /* page p should be unlocked after returning from ps->action(). */
result = ps->action(p, pfn);
count = page_count(p) - 1;
@@ -1313,7 +1327,7 @@ static int memory_failure_hugetlb(unsigned long pfn, int flags)
goto out;
}
- res = identify_page_state(pfn, p, page_flags);
+ return identify_page_state(pfn, p, page_flags);
out:
unlock_page(head);
return res;
@@ -1596,6 +1610,8 @@ int memory_failure(unsigned long pfn, int flags)
identify_page_state:
res = identify_page_state(pfn, p, page_flags);
+ mutex_unlock(&mf_mutex);
+ return res;
unlock_page:
unlock_page(p);
unlock_mutex:
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From ea6d0630100b285f059d0a8d8e86f38a46407536 Mon Sep 17 00:00:00 2001
From: Naoya Horiguchi <naoya.horiguchi(a)nec.com>
Date: Thu, 24 Jun 2021 18:40:01 -0700
Subject: [PATCH] mm/hwpoison: do not lock page again when me_huge_page()
successfully recovers
Currently me_huge_page() temporary unlocks page to perform some actions
then locks it again later. My testcase (which calls hard-offline on
some tail page in a hugetlb, then accesses the address of the hugetlb
range) showed that page allocation code detects this page lock on buddy
page and printed out "BUG: Bad page state" message.
check_new_page_bad() does not consider a page with __PG_HWPOISON as bad
page, so this flag works as kind of filter, but this filtering doesn't
work in this case because the "bad page" is not the actual hwpoisoned
page. So stop locking page again. Actions to be taken depend on the
page type of the error, so page unlocking should be done in ->action()
callbacks. So let's make it assumed and change all existing callbacks
that way.
Link: https://lkml.kernel.org/r/20210609072029.74645-1-nao.horiguchi@gmail.com
Fixes: commit 78bb920344b8 ("mm: hwpoison: dissolve in-use hugepage in unrecoverable memory error")
Signed-off-by: Naoya Horiguchi <naoya.horiguchi(a)nec.com>
Cc: Oscar Salvador <osalvador(a)suse.de>
Cc: Michal Hocko <mhocko(a)suse.com>
Cc: Tony Luck <tony.luck(a)intel.com>
Cc: "Aneesh Kumar K.V" <aneesh.kumar(a)linux.vnet.ibm.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds(a)linux-foundation.org>
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index f24105db7081..6f5f78885ab4 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -658,6 +658,7 @@ static int truncate_error_page(struct page *p, unsigned long pfn,
*/
static int me_kernel(struct page *p, unsigned long pfn)
{
+ unlock_page(p);
return MF_IGNORED;
}
@@ -667,6 +668,7 @@ static int me_kernel(struct page *p, unsigned long pfn)
static int me_unknown(struct page *p, unsigned long pfn)
{
pr_err("Memory failure: %#lx: Unknown page state\n", pfn);
+ unlock_page(p);
return MF_FAILED;
}
@@ -675,6 +677,7 @@ static int me_unknown(struct page *p, unsigned long pfn)
*/
static int me_pagecache_clean(struct page *p, unsigned long pfn)
{
+ int ret;
struct address_space *mapping;
delete_from_lru_cache(p);
@@ -683,8 +686,10 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn)
* For anonymous pages we're done the only reference left
* should be the one m_f() holds.
*/
- if (PageAnon(p))
- return MF_RECOVERED;
+ if (PageAnon(p)) {
+ ret = MF_RECOVERED;
+ goto out;
+ }
/*
* Now truncate the page in the page cache. This is really
@@ -698,7 +703,8 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn)
/*
* Page has been teared down in the meanwhile
*/
- return MF_FAILED;
+ ret = MF_FAILED;
+ goto out;
}
/*
@@ -706,7 +712,10 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn)
*
* Open: to take i_mutex or not for this? Right now we don't.
*/
- return truncate_error_page(p, pfn, mapping);
+ ret = truncate_error_page(p, pfn, mapping);
+out:
+ unlock_page(p);
+ return ret;
}
/*
@@ -782,24 +791,26 @@ static int me_pagecache_dirty(struct page *p, unsigned long pfn)
*/
static int me_swapcache_dirty(struct page *p, unsigned long pfn)
{
+ int ret;
+
ClearPageDirty(p);
/* Trigger EIO in shmem: */
ClearPageUptodate(p);
- if (!delete_from_lru_cache(p))
- return MF_DELAYED;
- else
- return MF_FAILED;
+ ret = delete_from_lru_cache(p) ? MF_FAILED : MF_DELAYED;
+ unlock_page(p);
+ return ret;
}
static int me_swapcache_clean(struct page *p, unsigned long pfn)
{
+ int ret;
+
delete_from_swap_cache(p);
- if (!delete_from_lru_cache(p))
- return MF_RECOVERED;
- else
- return MF_FAILED;
+ ret = delete_from_lru_cache(p) ? MF_FAILED : MF_RECOVERED;
+ unlock_page(p);
+ return ret;
}
/*
@@ -820,6 +831,7 @@ static int me_huge_page(struct page *p, unsigned long pfn)
mapping = page_mapping(hpage);
if (mapping) {
res = truncate_error_page(hpage, pfn, mapping);
+ unlock_page(hpage);
} else {
res = MF_FAILED;
unlock_page(hpage);
@@ -834,7 +846,6 @@ static int me_huge_page(struct page *p, unsigned long pfn)
page_ref_inc(p);
res = MF_RECOVERED;
}
- lock_page(hpage);
}
return res;
@@ -866,6 +877,8 @@ static struct page_state {
unsigned long mask;
unsigned long res;
enum mf_action_page_type type;
+
+ /* Callback ->action() has to unlock the relevant page inside it. */
int (*action)(struct page *p, unsigned long pfn);
} error_states[] = {
{ reserved, reserved, MF_MSG_KERNEL, me_kernel },
@@ -929,6 +942,7 @@ static int page_action(struct page_state *ps, struct page *p,
int result;
int count;
+ /* page p should be unlocked after returning from ps->action(). */
result = ps->action(p, pfn);
count = page_count(p) - 1;
@@ -1313,7 +1327,7 @@ static int memory_failure_hugetlb(unsigned long pfn, int flags)
goto out;
}
- res = identify_page_state(pfn, p, page_flags);
+ return identify_page_state(pfn, p, page_flags);
out:
unlock_page(head);
return res;
@@ -1596,6 +1610,8 @@ int memory_failure(unsigned long pfn, int flags)
identify_page_state:
res = identify_page_state(pfn, p, page_flags);
+ mutex_unlock(&mf_mutex);
+ return res;
unlock_page:
unlock_page(p);
unlock_mutex:
The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From ea6d0630100b285f059d0a8d8e86f38a46407536 Mon Sep 17 00:00:00 2001
From: Naoya Horiguchi <naoya.horiguchi(a)nec.com>
Date: Thu, 24 Jun 2021 18:40:01 -0700
Subject: [PATCH] mm/hwpoison: do not lock page again when me_huge_page()
successfully recovers
Currently me_huge_page() temporary unlocks page to perform some actions
then locks it again later. My testcase (which calls hard-offline on
some tail page in a hugetlb, then accesses the address of the hugetlb
range) showed that page allocation code detects this page lock on buddy
page and printed out "BUG: Bad page state" message.
check_new_page_bad() does not consider a page with __PG_HWPOISON as bad
page, so this flag works as kind of filter, but this filtering doesn't
work in this case because the "bad page" is not the actual hwpoisoned
page. So stop locking page again. Actions to be taken depend on the
page type of the error, so page unlocking should be done in ->action()
callbacks. So let's make it assumed and change all existing callbacks
that way.
Link: https://lkml.kernel.org/r/20210609072029.74645-1-nao.horiguchi@gmail.com
Fixes: commit 78bb920344b8 ("mm: hwpoison: dissolve in-use hugepage in unrecoverable memory error")
Signed-off-by: Naoya Horiguchi <naoya.horiguchi(a)nec.com>
Cc: Oscar Salvador <osalvador(a)suse.de>
Cc: Michal Hocko <mhocko(a)suse.com>
Cc: Tony Luck <tony.luck(a)intel.com>
Cc: "Aneesh Kumar K.V" <aneesh.kumar(a)linux.vnet.ibm.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds(a)linux-foundation.org>
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index f24105db7081..6f5f78885ab4 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -658,6 +658,7 @@ static int truncate_error_page(struct page *p, unsigned long pfn,
*/
static int me_kernel(struct page *p, unsigned long pfn)
{
+ unlock_page(p);
return MF_IGNORED;
}
@@ -667,6 +668,7 @@ static int me_kernel(struct page *p, unsigned long pfn)
static int me_unknown(struct page *p, unsigned long pfn)
{
pr_err("Memory failure: %#lx: Unknown page state\n", pfn);
+ unlock_page(p);
return MF_FAILED;
}
@@ -675,6 +677,7 @@ static int me_unknown(struct page *p, unsigned long pfn)
*/
static int me_pagecache_clean(struct page *p, unsigned long pfn)
{
+ int ret;
struct address_space *mapping;
delete_from_lru_cache(p);
@@ -683,8 +686,10 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn)
* For anonymous pages we're done the only reference left
* should be the one m_f() holds.
*/
- if (PageAnon(p))
- return MF_RECOVERED;
+ if (PageAnon(p)) {
+ ret = MF_RECOVERED;
+ goto out;
+ }
/*
* Now truncate the page in the page cache. This is really
@@ -698,7 +703,8 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn)
/*
* Page has been teared down in the meanwhile
*/
- return MF_FAILED;
+ ret = MF_FAILED;
+ goto out;
}
/*
@@ -706,7 +712,10 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn)
*
* Open: to take i_mutex or not for this? Right now we don't.
*/
- return truncate_error_page(p, pfn, mapping);
+ ret = truncate_error_page(p, pfn, mapping);
+out:
+ unlock_page(p);
+ return ret;
}
/*
@@ -782,24 +791,26 @@ static int me_pagecache_dirty(struct page *p, unsigned long pfn)
*/
static int me_swapcache_dirty(struct page *p, unsigned long pfn)
{
+ int ret;
+
ClearPageDirty(p);
/* Trigger EIO in shmem: */
ClearPageUptodate(p);
- if (!delete_from_lru_cache(p))
- return MF_DELAYED;
- else
- return MF_FAILED;
+ ret = delete_from_lru_cache(p) ? MF_FAILED : MF_DELAYED;
+ unlock_page(p);
+ return ret;
}
static int me_swapcache_clean(struct page *p, unsigned long pfn)
{
+ int ret;
+
delete_from_swap_cache(p);
- if (!delete_from_lru_cache(p))
- return MF_RECOVERED;
- else
- return MF_FAILED;
+ ret = delete_from_lru_cache(p) ? MF_FAILED : MF_RECOVERED;
+ unlock_page(p);
+ return ret;
}
/*
@@ -820,6 +831,7 @@ static int me_huge_page(struct page *p, unsigned long pfn)
mapping = page_mapping(hpage);
if (mapping) {
res = truncate_error_page(hpage, pfn, mapping);
+ unlock_page(hpage);
} else {
res = MF_FAILED;
unlock_page(hpage);
@@ -834,7 +846,6 @@ static int me_huge_page(struct page *p, unsigned long pfn)
page_ref_inc(p);
res = MF_RECOVERED;
}
- lock_page(hpage);
}
return res;
@@ -866,6 +877,8 @@ static struct page_state {
unsigned long mask;
unsigned long res;
enum mf_action_page_type type;
+
+ /* Callback ->action() has to unlock the relevant page inside it. */
int (*action)(struct page *p, unsigned long pfn);
} error_states[] = {
{ reserved, reserved, MF_MSG_KERNEL, me_kernel },
@@ -929,6 +942,7 @@ static int page_action(struct page_state *ps, struct page *p,
int result;
int count;
+ /* page p should be unlocked after returning from ps->action(). */
result = ps->action(p, pfn);
count = page_count(p) - 1;
@@ -1313,7 +1327,7 @@ static int memory_failure_hugetlb(unsigned long pfn, int flags)
goto out;
}
- res = identify_page_state(pfn, p, page_flags);
+ return identify_page_state(pfn, p, page_flags);
out:
unlock_page(head);
return res;
@@ -1596,6 +1610,8 @@ int memory_failure(unsigned long pfn, int flags)
identify_page_state:
res = identify_page_state(pfn, p, page_flags);
+ mutex_unlock(&mf_mutex);
+ return res;
unlock_page:
unlock_page(p);
unlock_mutex: