From: Al Viro <viro(a)zeniv.linux.org.uk>
[ Upstream commit f8d4f44df056c5b504b0d49683fb7279218fd207 ]
Signed-off-by: Al Viro <viro(a)zeniv.linux.org.uk>
Signed-off-by: Sasha Levin <sashal(a)kernel.org>
---
fs/eventpoll.c | 37 ++++++++++++++++++-------------------
1 file changed, 18 insertions(+), 19 deletions(-)
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index af9dfa494b1fa..a32df9cad519f 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -1461,6 +1461,22 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
RCU_INIT_POINTER(epi->ws, NULL);
}
+ /* Add the current item to the list of active epoll hook for this file */
+ spin_lock(&tfile->f_lock);
+ list_add_tail_rcu(&epi->fllink, &tfile->f_ep_links);
+ spin_unlock(&tfile->f_lock);
+
+ /*
+ * Add the current item to the RB tree. All RB tree operations are
+ * protected by "mtx", and ep_insert() is called with "mtx" held.
+ */
+ ep_rbtree_insert(ep, epi);
+
+ /* now check if we've created too many backpaths */
+ error = -EINVAL;
+ if (full_check && reverse_path_check())
+ goto error_remove_epi;
+
/* Initialize the poll table using the queue callback */
epq.epi = epi;
init_poll_funcptr(&epq.pt, ep_ptable_queue_proc);
@@ -1483,22 +1499,6 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
if (epi->nwait < 0)
goto error_unregister;
- /* Add the current item to the list of active epoll hook for this file */
- spin_lock(&tfile->f_lock);
- list_add_tail_rcu(&epi->fllink, &tfile->f_ep_links);
- spin_unlock(&tfile->f_lock);
-
- /*
- * Add the current item to the RB tree. All RB tree operations are
- * protected by "mtx", and ep_insert() is called with "mtx" held.
- */
- ep_rbtree_insert(ep, epi);
-
- /* now check if we've created too many backpaths */
- error = -EINVAL;
- if (full_check && reverse_path_check())
- goto error_remove_epi;
-
/* We have to drop the new item inside our item list to keep track of it */
spin_lock_irqsave(&ep->lock, flags);
@@ -1527,6 +1527,8 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
return 0;
+error_unregister:
+ ep_unregister_pollwait(ep, epi);
error_remove_epi:
spin_lock(&tfile->f_lock);
list_del_rcu(&epi->fllink);
@@ -1534,9 +1536,6 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
rb_erase_cached(&epi->rbn, &ep->rbr);
-error_unregister:
- ep_unregister_pollwait(ep, epi);
-
/*
* We need to do this because an event could have been arrived on some
* allocated wait queue. Note that we don't care about the ep->ovflist
--
2.25.1
From: Al Viro <viro(a)zeniv.linux.org.uk>
[ Upstream commit f8d4f44df056c5b504b0d49683fb7279218fd207 ]
Signed-off-by: Al Viro <viro(a)zeniv.linux.org.uk>
Signed-off-by: Sasha Levin <sashal(a)kernel.org>
---
fs/eventpoll.c | 37 ++++++++++++++++++-------------------
1 file changed, 18 insertions(+), 19 deletions(-)
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 61a52bb26d127..ed6c06dbb5369 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -1450,6 +1450,22 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
RCU_INIT_POINTER(epi->ws, NULL);
}
+ /* Add the current item to the list of active epoll hook for this file */
+ spin_lock(&tfile->f_lock);
+ list_add_tail_rcu(&epi->fllink, &tfile->f_ep_links);
+ spin_unlock(&tfile->f_lock);
+
+ /*
+ * Add the current item to the RB tree. All RB tree operations are
+ * protected by "mtx", and ep_insert() is called with "mtx" held.
+ */
+ ep_rbtree_insert(ep, epi);
+
+ /* now check if we've created too many backpaths */
+ error = -EINVAL;
+ if (full_check && reverse_path_check())
+ goto error_remove_epi;
+
/* Initialize the poll table using the queue callback */
epq.epi = epi;
init_poll_funcptr(&epq.pt, ep_ptable_queue_proc);
@@ -1472,22 +1488,6 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
if (epi->nwait < 0)
goto error_unregister;
- /* Add the current item to the list of active epoll hook for this file */
- spin_lock(&tfile->f_lock);
- list_add_tail_rcu(&epi->fllink, &tfile->f_ep_links);
- spin_unlock(&tfile->f_lock);
-
- /*
- * Add the current item to the RB tree. All RB tree operations are
- * protected by "mtx", and ep_insert() is called with "mtx" held.
- */
- ep_rbtree_insert(ep, epi);
-
- /* now check if we've created too many backpaths */
- error = -EINVAL;
- if (full_check && reverse_path_check())
- goto error_remove_epi;
-
/* We have to drop the new item inside our item list to keep track of it */
spin_lock_irq(&ep->wq.lock);
@@ -1516,6 +1516,8 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
return 0;
+error_unregister:
+ ep_unregister_pollwait(ep, epi);
error_remove_epi:
spin_lock(&tfile->f_lock);
list_del_rcu(&epi->fllink);
@@ -1523,9 +1525,6 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
rb_erase_cached(&epi->rbn, &ep->rbr);
-error_unregister:
- ep_unregister_pollwait(ep, epi);
-
/*
* We need to do this because an event could have been arrived on some
* allocated wait queue. Note that we don't care about the ep->ovflist
--
2.25.1
From: Al Viro <viro(a)zeniv.linux.org.uk>
[ Upstream commit f8d4f44df056c5b504b0d49683fb7279218fd207 ]
Signed-off-by: Al Viro <viro(a)zeniv.linux.org.uk>
Signed-off-by: Sasha Levin <sashal(a)kernel.org>
---
fs/eventpoll.c | 37 ++++++++++++++++++-------------------
1 file changed, 18 insertions(+), 19 deletions(-)
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index ae1d32344f7ac..f70df53666ed1 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -1527,6 +1527,22 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
RCU_INIT_POINTER(epi->ws, NULL);
}
+ /* Add the current item to the list of active epoll hook for this file */
+ spin_lock(&tfile->f_lock);
+ list_add_tail_rcu(&epi->fllink, &tfile->f_ep_links);
+ spin_unlock(&tfile->f_lock);
+
+ /*
+ * Add the current item to the RB tree. All RB tree operations are
+ * protected by "mtx", and ep_insert() is called with "mtx" held.
+ */
+ ep_rbtree_insert(ep, epi);
+
+ /* now check if we've created too many backpaths */
+ error = -EINVAL;
+ if (full_check && reverse_path_check())
+ goto error_remove_epi;
+
/* Initialize the poll table using the queue callback */
epq.epi = epi;
init_poll_funcptr(&epq.pt, ep_ptable_queue_proc);
@@ -1549,22 +1565,6 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
if (epi->nwait < 0)
goto error_unregister;
- /* Add the current item to the list of active epoll hook for this file */
- spin_lock(&tfile->f_lock);
- list_add_tail_rcu(&epi->fllink, &tfile->f_ep_links);
- spin_unlock(&tfile->f_lock);
-
- /*
- * Add the current item to the RB tree. All RB tree operations are
- * protected by "mtx", and ep_insert() is called with "mtx" held.
- */
- ep_rbtree_insert(ep, epi);
-
- /* now check if we've created too many backpaths */
- error = -EINVAL;
- if (full_check && reverse_path_check())
- goto error_remove_epi;
-
/* We have to drop the new item inside our item list to keep track of it */
write_lock_irq(&ep->lock);
@@ -1593,6 +1593,8 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
return 0;
+error_unregister:
+ ep_unregister_pollwait(ep, epi);
error_remove_epi:
spin_lock(&tfile->f_lock);
list_del_rcu(&epi->fllink);
@@ -1600,9 +1602,6 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
rb_erase_cached(&epi->rbn, &ep->rbr);
-error_unregister:
- ep_unregister_pollwait(ep, epi);
-
/*
* We need to do this because an event could have been arrived on some
* allocated wait queue. Note that we don't care about the ep->ovflist
--
2.25.1
I recently tracked down a problem I observed when booting a v5.4 kernel
on a sparsemem UMA arm platform which includes a no-map reserved-memory
region in the middle of its HighMem zone.
When memmap_init_zone() is invoked, the PFNs that correspond to the
no-map region fail the early_pfn_valid() check and their struct page
structures are not initialized, creating a "hole" in the memmap. Later
in my boot sequence the sock_init() initcall leads to a
bpf_prog_alloc(), which ends up stealing a page from the block
containing the no-map region; this in turn triggers a call to
move_freepages_block() to reclassify the migratetype of the entire
block.
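For illustration, here is a minimal sketch (my simplification, not the
verbatim code) of the v5.4-era boot-time loop in memmap_init_zone()
showing where the hole comes from:

	for (pfn = start_pfn; pfn < end_pfn; pfn++) {
		if (context == MEMMAP_EARLY && !early_pfn_valid(pfn))
			continue;	/* no-map PFNs are skipped, leaving
					 * their struct pages uninitialized */
		__init_single_page(pfn_to_page(pfn), pfn, zone, nid);
	}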
The function move_freepages() includes a pfn_valid_within() check for
each page in the range, but since the arm architecture doesn't select
HOLES_IN_ZONE this check is optimized out, and the uninitialized struct
page is accessed. Specifically, PageLRU() calls compound_head() on the
page, and if the page->compound_head value is odd that value is used as
a pointer to the head struct page. For uninitialized memory there is a
high chance that the random compound_head value will be odd and contain
an invalid pointer that causes the kernel to abort and panic.
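For reference, compound_head() boils down to roughly the following
(abridged from include/linux/page-flags.h):

	static inline struct page *compound_head(struct page *page)
	{
		unsigned long head = READ_ONCE(page->compound_head);

		/*
		 * Bit 0 set marks a tail page: the rest of the word is a
		 * pointer to the head page. Uninitialized garbage with
		 * bit 0 set is therefore dereferenced as a page pointer.
		 */
		if (unlikely(head & 1))
			return (struct page *)(head - 1);
		return page;
	}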
As you might imagine, specifying HOLES_IN_ZONE for the arm build allows
pfn_valid_within() to protect against accessing the uninitialized
struct page. However, the performance penalty this incurs seems
unnecessary.
Commit 35fd1eb1e821 ("mm/sparse: abstract sparse buffer allocations"),
part of the "sparse_init rewrite" series introduced in v4.19, changed
the way sparsemem memmaps are initialized. Prior to that patch the
sparsemem memmaps were initialized to all zeroes. I observed that on
older kernels the "uninitialized" struct page access also occurs, but a
zero page->compound_head is even, indicating no compound head, so the
page pointer is not corrupted. The other logic ends up causing the page
to be skipped and everything "happens to work".
While considering solutions to this issue I observed that the problem
does not occur in current upstream as a result of a combination of
other commits. The following commits provided the functionality to
initialize struct page structures for pages that are unavailable, like
the no-map region in my system:
commit a4a3ede2132a ("mm: zero reserved and unavailable struct pages")
commit 907ec5fca3dc ("mm: zero remaining unavailable struct pages")
commit ec393a0f014e ("mm: return zero_resv_unavail optimization")
commit e822969cab48 ("mm/page_alloc.c: fix uninitialized memmaps on a partially populated last section")
commit 4b094b7851bf ("mm/page_alloc.c: initialize memmap of unavailable memory directly")
However, those commits added the functionality to the free_area_init()
and free_area_init_nodes() functions, and the non-NUMA arm architecture
did not begin calling free_area_init() until the following commit in v5.8:
commit a32c1c61212d ("arm: simplify detection of memory zone boundaries")
Prior to that commit the non-NUMA arm architecture called
free_area_init_node() directly at the end of zone_sizes_init().
So while the problem appears to be fixed upstream by commit a32c1c61212d
("arm: simplify detection of memory zone boundaries"), it is still
present in stable branches between v4.19.y and v5.7.y inclusive, and
probably for other architectures that didn't call free_area_init() as
well. This upstream commit is not easily/safely backportable to stable
branches, but if we focus on the sliver of functionality that adds the
initialization code from free_area_init() to the zone_sizes_init()
function used by non-NUMA arm kernels, I believe a simple patch could
be developed for each relevant stable branch to resolve the issue I am
observing. Similar patches could also be applied for other
architectures that now call free_area_init() upstream but not in one of
these stable branches, though I am not in a position to test those
architectures.
For the linux-5.4.y branch such a patch might look like this:
From 671c341b5cdb8360349c33ade43115e28ca56a8a Mon Sep 17 00:00:00 2001
From: Doug Berger <opendmb(a)gmail.com>
Date: Tue, 25 Aug 2020 14:39:43 -0700
Subject: [PATCH] ARM: mm: sync zone_sizes_init with free_area_init
The arm architecture does not invoke the common function
free_area_init(). Instead, for non-NUMA builds it invokes
free_area_init_node() directly from zone_sizes_init().
As a result, recent changes in free_area_init() are not
picked up by arm architecture builds.
This commit adds the updates to the zone_sizes_init()
function to achieve parity with the free_area_init()
functionality.
Fixes: 35fd1eb1e821 ("mm/sparse: abstract sparse buffer allocations")
Signed-off-by: Doug Berger <opendmb(a)gmail.com>
Cc: stable(a)vger.kernel.org
---
arch/arm/mm/init.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/arch/arm/mm/init.c b/arch/arm/mm/init.c
index 6f19ba53fd1f..4f171d834c60 100644
--- a/arch/arm/mm/init.c
+++ b/arch/arm/mm/init.c
@@ -169,6 +169,7 @@ static void __init zone_sizes_init(unsigned long min, unsigned long max_low,
arm_dma_zone_size >> PAGE_SHIFT);
#endif
+ zero_resv_unavail();
free_area_init_node(0, zone_size, min, zhole_size);
}
--
2.7.4
I am unclear on the mechanics of submitting such a stable patch when it
represents a perhaps less-than-obvious sliver of the upstream commit
that fixes the issue, so I am soliciting guidance with this email.
Thank you for taking the time to read this far, and please let me know
how I can improve the situation,
Doug
[BUG]
There are quite a few bug reports of btrfs falling into an ENOSPC trap,
where btrfs can't even start a transaction to add new devices.
[CAUSE]
Most of the reports involve multi-device profiles like
RAID1/RAID10/RAID5/RAID6, and the involved disks have very unbalanced
sizes.
It turns out that the overcommit calculation in btrfs_can_overcommit()
is just a factor-based calculation, which can't check whether the
devices can really fulfill the requirements of the desired profile.
This makes btrfs_can_overcommit() always over-confident about the
usable space, and when we can't allocate any new metadata chunk but
still allow new metadata operations, we fall into the ENOSPC trap and
have no way to exit it.
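As a hypothetical illustration (numbers invented for this example): on
a RAID1 metadata filesystem with one 1 TiB device and one 10 GiB
device, every RAID1 chunk needs free space on two devices, so at most
roughly 10 GiB of metadata chunks can ever be allocated. A factor-based
estimate, however, simply divides the total unallocated space
(~1034 GiB) by the RAID1 factor of 2 and reports ~517 GiB as available
for over-commit.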
[WORKAROUND]
The proper fix needs a device-layout-aware available space calculation,
like the one the chunk allocator does.
Such a patchset was submitted to the mailing list, but the extra
failure mode is tricky to handle for chunk allocation, so that patchset
needs more time to mature.
Meanwhile, to prevent such problems from reaching more users, work
around the problem by:
- Halve the reported over-commit available space
  So that we won't always be that over-confident.
  But this won't really help if we have extremely unbalanced disk sizes.
- Don't over-commit if the space info is already full
  This may already be too late, but it's still better than doing nothing
  and believing the over-commit values.
CC: stable(a)vger.kernel.org # 4.4+
Signed-off-by: Qu Wenruo <wqu(a)suse.com>
---
fs/btrfs/space-info.c | 20 ++++++++++++++++++++
1 file changed, 20 insertions(+)
diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c
index 475968ccbd1d..e8133ec7e34a 100644
--- a/fs/btrfs/space-info.c
+++ b/fs/btrfs/space-info.c
@@ -339,6 +339,18 @@ static u64 calc_available_free_space(struct btrfs_fs_info *fs_info,
avail >>= 3;
else
avail >>= 1;
+ /*
+ * Since the current over-commit calculation is already doomed for
+ * RAID0/RAID1/RAID10/RAID5/6, we halve the available space to reduce
+ * over-commit amount.
+ *
+ * This is just a workaround before the device layout aware
+ * available space calculation arrives.
+ */
+ if ((BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1_MASK |
+ BTRFS_BLOCK_GROUP_RAID10 | BTRFS_BLOCK_GROUP_RAID56_MASK) &
+ profile)
+ avail >>= 1;
return avail;
}
@@ -353,6 +365,14 @@ int btrfs_can_overcommit(struct btrfs_fs_info *fs_info,
if (space_info->flags & BTRFS_BLOCK_GROUP_DATA)
return 0;
+ /*
+ * If we can't allocate new space already, no overcommit is allowed.
+ *
+ * This check may be already late, but still better than nothing.
+ */
+ if (space_info->full)
+ return 0;
+
used = btrfs_space_info_used(space_info, true);
avail = calc_available_free_space(fs_info, space_info, flush);
--
2.28.0
Backport version to the 4.19-stable tree of:
[ Upstream commit c1d0da83358a2316d9be7f229f26126dbaa07468 ]
Patch series "mm: fix memory to node bad links in sysfs", v3.
Sometimes, firmware may expose an interleaved memory layout like this:
Early memory node ranges
node 1: [mem 0x0000000000000000-0x000000011fffffff]
node 2: [mem 0x0000000120000000-0x000000014fffffff]
node 1: [mem 0x0000000150000000-0x00000001ffffffff]
node 0: [mem 0x0000000200000000-0x000000048fffffff]
node 2: [mem 0x0000000490000000-0x00000007ffffffff]
In that case, we can see memory blocks assigned to multiple nodes in
sysfs:
$ ls -l /sys/devices/system/memory/memory21
total 0
lrwxrwxrwx 1 root root 0 Aug 24 05:27 node1 -> ../../node/node1
lrwxrwxrwx 1 root root 0 Aug 24 05:27 node2 -> ../../node/node2
-rw-r--r-- 1 root root 65536 Aug 24 05:27 online
-r--r--r-- 1 root root 65536 Aug 24 05:27 phys_device
-r--r--r-- 1 root root 65536 Aug 24 05:27 phys_index
drwxr-xr-x 2 root root 0 Aug 24 05:27 power
-r--r--r-- 1 root root 65536 Aug 24 05:27 removable
-rw-r--r-- 1 root root 65536 Aug 24 05:27 state
lrwxrwxrwx 1 root root 0 Aug 24 05:25 subsystem -> ../../../../bus/memory
-rw-r--r-- 1 root root 65536 Aug 24 05:25 uevent
-r--r--r-- 1 root root 65536 Aug 24 05:27 valid_zones
The same applies in the node's directory with a memory21 link in both
the node1 and node2's directory.
This is wrong but doesn't prevent the system from running. However,
when one of these memory blocks is later hot-unplugged and then
hot-plugged, the system detects an inconsistency in the sysfs layout
and a BUG_ON() is raised:
kernel BUG at /Users/laurent/src/linux-ppc/mm/memory_hotplug.c:1084!
LE PAGE_SIZE=64K MMU=Hash SMP NR_CPUS=2048 NUMA pSeries
Modules linked in: rpadlpar_io rpaphp pseries_rng rng_core vmx_crypto gf128mul binfmt_misc ip_tables x_tables xfs libcrc32c crc32c_vpmsum autofs4
CPU: 8 PID: 10256 Comm: drmgr Not tainted 5.9.0-rc1+ #25
Call Trace:
add_memory_resource+0x23c/0x340 (unreliable)
__add_memory+0x5c/0xf0
dlpar_add_lmb+0x1b4/0x500
dlpar_memory+0x1f8/0xb80
handle_dlpar_errorlog+0xc0/0x190
dlpar_store+0x198/0x4a0
kobj_attr_store+0x30/0x50
sysfs_kf_write+0x64/0x90
kernfs_fop_write+0x1b0/0x290
vfs_write+0xe8/0x290
ksys_write+0xdc/0x130
system_call_exception+0x160/0x270
system_call_common+0xf0/0x27c
This has been seen on PowerPC LPAR.
The root cause of this issue is that when node's memory is registered,
the range used can overlap another node's range, thus the memory block
is registered to multiple nodes in sysfs.
There are two issues here:
(a) The sysfs memory and node's layouts are broken due to these
multiple links
(b) The link errors in link_mem_sections() should not lead to a system
panic.
To address (a), register_mem_sect_under_node() should not rely on the
system state to detect whether the link operation is triggered by a
hotplug operation. This is addressed by patches 1 and 2 of this
series.
Issue (b) will be addressed separately.
This patch (of 2):
The memmap_context enum is used to detect whether a memory operation is
due to a hot-add operation or is happening at boot time.
Make it general to the hotplug operation and rename it as
meminit_context.
There is no functional change introduced by this patch.
Suggested-by: David Hildenbrand <david(a)redhat.com>
Signed-off-by: Laurent Dufour <ldufour(a)linux.ibm.com>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
Reviewed-by: David Hildenbrand <david(a)redhat.com>
Reviewed-by: Oscar Salvador <osalvador(a)suse.de>
Acked-by: Michal Hocko <mhocko(a)suse.com>
Cc: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
Cc: "Rafael J . Wysocki" <rafael(a)kernel.org>
Cc: Nathan Lynch <nathanl(a)linux.ibm.com>
Cc: Scott Cheloha <cheloha(a)linux.ibm.com>
Cc: Tony Luck <tony.luck(a)intel.com>
Cc: Fenghua Yu <fenghua.yu(a)intel.com>
Cc: <stable(a)vger.kernel.org> # 4.19.y
Link: https://lkml.kernel.org/r/20200915094143.79181-1-ldufour@linux.ibm.com
Link: https://lkml.kernel.org/r/20200915132624.9723-1-ldufour@linux.ibm.com
Signed-off-by: Linus Torvalds <torvalds(a)linux-foundation.org>
---
arch/ia64/mm/init.c | 6 +++---
include/linux/mm.h | 2 +-
include/linux/mmzone.h | 11 ++++++++---
mm/memory_hotplug.c | 2 +-
mm/page_alloc.c | 11 ++++++-----
5 files changed, 19 insertions(+), 13 deletions(-)
diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c
index 79e5cc70f1fd..561e2573bd34 100644
--- a/arch/ia64/mm/init.c
+++ b/arch/ia64/mm/init.c
@@ -499,7 +499,7 @@ virtual_memmap_init(u64 start, u64 end, void *arg)
if (map_start < map_end)
memmap_init_zone((unsigned long)(map_end - map_start),
args->nid, args->zone, page_to_pfn(map_start),
- MEMMAP_EARLY, NULL);
+ MEMINIT_EARLY, NULL);
return 0;
}
@@ -508,8 +508,8 @@ memmap_init (unsigned long size, int nid, unsigned long zone,
unsigned long start_pfn)
{
if (!vmem_map) {
- memmap_init_zone(size, nid, zone, start_pfn, MEMMAP_EARLY,
- NULL);
+ memmap_init_zone(size, nid, zone, start_pfn,
+ MEMINIT_EARLY, NULL);
} else {
struct page *start;
struct memmap_init_callback_data args;
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 05bc5f25ab85..83828c118b6b 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2179,7 +2179,7 @@ static inline void zero_resv_unavail(void) {}
extern void set_dma_reserve(unsigned long new_dma_reserve);
extern void memmap_init_zone(unsigned long, int, unsigned long, unsigned long,
- enum memmap_context, struct vmem_altmap *);
+ enum meminit_context, struct vmem_altmap *);
extern void setup_per_zone_wmarks(void);
extern int __meminit init_per_zone_wmark_min(void);
extern void mem_init(void);
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index fdd93a39f1fa..fa02014eba8e 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -759,10 +759,15 @@ bool zone_watermark_ok(struct zone *z, unsigned int order,
unsigned int alloc_flags);
bool zone_watermark_ok_safe(struct zone *z, unsigned int order,
unsigned long mark, int classzone_idx);
-enum memmap_context {
- MEMMAP_EARLY,
- MEMMAP_HOTPLUG,
+/*
+ * Memory initialization context, used to differentiate memory added by
+ * the platform statically or via memory hotplug interface.
+ */
+enum meminit_context {
+ MEMINIT_EARLY,
+ MEMINIT_HOTPLUG,
};
+
extern void init_currently_empty_zone(struct zone *zone, unsigned long start_pfn,
unsigned long size);
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index aae7ff485671..c839c4ad4871 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -733,7 +733,7 @@ void __ref move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn,
* are reserved so nobody should be touching them so we should be safe
*/
memmap_init_zone(nr_pages, nid, zone_idx(zone), start_pfn,
- MEMMAP_HOTPLUG, altmap);
+ MEMINIT_HOTPLUG, altmap);
set_zone_contiguous(zone);
}
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 5717ee66c8b3..545800433dfb 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5480,7 +5480,7 @@ void __ref build_all_zonelists(pg_data_t *pgdat)
* done. Non-atomic initialization, single-pass.
*/
void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
- unsigned long start_pfn, enum memmap_context context,
+ unsigned long start_pfn, enum meminit_context context,
struct vmem_altmap *altmap)
{
unsigned long end_pfn = start_pfn + size;
@@ -5507,7 +5507,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
* There can be holes in boot-time mem_map[]s handed to this
* function. They do not exist on hotplugged memory.
*/
- if (context != MEMMAP_EARLY)
+ if (context != MEMINIT_EARLY)
goto not_early;
if (!early_pfn_valid(pfn))
@@ -5542,7 +5542,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
not_early:
page = pfn_to_page(pfn);
__init_single_page(page, pfn, zone, nid);
- if (context == MEMMAP_HOTPLUG)
+ if (context == MEMINIT_HOTPLUG)
SetPageReserved(page);
/*
@@ -5557,7 +5557,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
* check here not to call set_pageblock_migratetype() against
* pfn out of zone.
*
- * Please note that MEMMAP_HOTPLUG path doesn't clear memmap
+ * Please note that MEMINIT_HOTPLUG path doesn't clear memmap
* because this is done early in sparse_add_one_section
*/
if (!(pfn & (pageblock_nr_pages - 1))) {
@@ -5578,7 +5578,8 @@ static void __meminit zone_init_free_lists(struct zone *zone)
#ifndef __HAVE_ARCH_MEMMAP_INIT
#define memmap_init(size, nid, zone, start_pfn) \
- memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY, NULL)
+ memmap_init_zone((size), (nid), (zone), (start_pfn), \
+ MEMINIT_EARLY, NULL)
#endif
static int zone_batchsize(struct zone *zone)
--
2.28.0
commit: 2b8bd423614c595540eaadcfbc702afe8e155e50
Please apply to linux-5.4.y. Without this fix, disk utilization reporting is
unusable, especially on spinning disks.
Thanks,
Deb
The patch below does not apply to the 5.8-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 27ba3e8ff3ab86449e63d38a8d623053591e65fa Mon Sep 17 00:00:00 2001
From: Damien Le Moal <damien.lemoal(a)wdc.com>
Date: Tue, 15 Sep 2020 16:33:46 +0900
Subject: [PATCH] scsi: sd: sd_zbc: Fix handling of host-aware ZBC disks
When CONFIG_BLK_DEV_ZONED is disabled, allow using host-aware ZBC disks
as regular disks. In this case, ensure that command completion is
correctly executed by changing sd_zbc_complete() to return good_bytes
instead of 0, which caused a hang during device probe (endless retries).
When CONFIG_BLK_DEV_ZONED is enabled and a host-aware disk is detected
to have partitions, it will be used as a regular disk. In this case,
make sure not to do anything in sd_zbc_revalidate_zones(), as that
triggers warnings.
Since all these different cases result in subtle settings of the disk queue
zoned model, introduce the block layer helper function
blk_queue_set_zoned() to generically implement setting up the effective
zoned model according to the disk type, the presence of partitions on the
disk and CONFIG_BLK_DEV_ZONED configuration.
Link: https://lore.kernel.org/r/20200915073347.832424-2-damien.lemoal@wdc.com
Fixes: b72053072c0b ("block: allow partitions on host aware zone devices")
Cc: <stable(a)vger.kernel.org>
Reported-by: Borislav Petkov <bp(a)alien8.de>
Suggested-by: Christoph Hellwig <hch(a)infradead.org>
Reviewed-by: Christoph Hellwig <hch(a)lst.de>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn(a)wdc.com>
Signed-off-by: Damien Le Moal <damien.lemoal(a)wdc.com>
Signed-off-by: Martin K. Petersen <martin.petersen(a)oracle.com>
diff --git a/block/blk-settings.c b/block/blk-settings.c
index 76a7e03bcd6c..34b721a2743a 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -801,6 +801,52 @@ bool blk_queue_can_use_dma_map_merging(struct request_queue *q,
}
EXPORT_SYMBOL_GPL(blk_queue_can_use_dma_map_merging);
+/**
+ * blk_queue_set_zoned - configure a disk queue zoned model.
+ * @disk: the gendisk of the queue to configure
+ * @model: the zoned model to set
+ *
+ * Set the zoned model of the request queue of @disk according to @model.
+ * When @model is BLK_ZONED_HM (host managed), this should be called only
+ * if zoned block device support is enabled (CONFIG_BLK_DEV_ZONED option).
+ * If @model specifies BLK_ZONED_HA (host aware), the effective model used
+ * depends on CONFIG_BLK_DEV_ZONED settings and on the existence of partitions
+ * on the disk.
+ */
+void blk_queue_set_zoned(struct gendisk *disk, enum blk_zoned_model model)
+{
+ switch (model) {
+ case BLK_ZONED_HM:
+ /*
+ * Host managed devices are supported only if
+ * CONFIG_BLK_DEV_ZONED is enabled.
+ */
+ WARN_ON_ONCE(!IS_ENABLED(CONFIG_BLK_DEV_ZONED));
+ break;
+ case BLK_ZONED_HA:
+ /*
+ * Host aware devices can be treated either as regular block
+ * devices (similar to drive managed devices) or as zoned block
+ * devices to take advantage of the zone command set, similarly
+ * to host managed devices. We try the latter if there are no
+ * partitions and zoned block device support is enabled, else
+ * we do nothing special as far as the block layer is concerned.
+ */
+ if (!IS_ENABLED(CONFIG_BLK_DEV_ZONED) ||
+ disk_has_partitions(disk))
+ model = BLK_ZONED_NONE;
+ break;
+ case BLK_ZONED_NONE:
+ default:
+ if (WARN_ON_ONCE(model != BLK_ZONED_NONE))
+ model = BLK_ZONED_NONE;
+ break;
+ }
+
+ disk->queue->limits.zoned = model;
+}
+EXPORT_SYMBOL_GPL(blk_queue_set_zoned);
+
static int __init blk_settings_init(void)
{
blk_max_low_pfn = max_low_pfn - 1;
diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
index 95018e650f2d..06286b6aeaec 100644
--- a/drivers/scsi/sd.c
+++ b/drivers/scsi/sd.c
@@ -2964,26 +2964,32 @@ static void sd_read_block_characteristics(struct scsi_disk *sdkp)
if (sdkp->device->type == TYPE_ZBC) {
/* Host-managed */
- q->limits.zoned = BLK_ZONED_HM;
+ blk_queue_set_zoned(sdkp->disk, BLK_ZONED_HM);
} else {
sdkp->zoned = (buffer[8] >> 4) & 3;
- if (sdkp->zoned == 1 && !disk_has_partitions(sdkp->disk)) {
+ if (sdkp->zoned == 1) {
/* Host-aware */
- q->limits.zoned = BLK_ZONED_HA;
+ blk_queue_set_zoned(sdkp->disk, BLK_ZONED_HA);
} else {
- /*
- * Treat drive-managed devices and host-aware devices
- * with partitions as regular block devices.
- */
- q->limits.zoned = BLK_ZONED_NONE;
- if (sdkp->zoned == 2 && sdkp->first_scan)
- sd_printk(KERN_NOTICE, sdkp,
- "Drive-managed SMR disk\n");
+ /* Regular disk or drive managed disk */
+ blk_queue_set_zoned(sdkp->disk, BLK_ZONED_NONE);
}
}
- if (blk_queue_is_zoned(q) && sdkp->first_scan)
+
+ if (!sdkp->first_scan)
+ goto out;
+
+ if (blk_queue_is_zoned(q)) {
sd_printk(KERN_NOTICE, sdkp, "Host-%s zoned block device\n",
q->limits.zoned == BLK_ZONED_HM ? "managed" : "aware");
+ } else {
+ if (sdkp->zoned == 1)
+ sd_printk(KERN_NOTICE, sdkp,
+ "Host-aware SMR disk used as regular disk\n");
+ else if (sdkp->zoned == 2)
+ sd_printk(KERN_NOTICE, sdkp,
+ "Drive-managed SMR disk\n");
+ }
out:
kfree(buffer);
diff --git a/drivers/scsi/sd.h b/drivers/scsi/sd.h
index 4933e7daf17d..7251434100e6 100644
--- a/drivers/scsi/sd.h
+++ b/drivers/scsi/sd.h
@@ -259,7 +259,7 @@ static inline blk_status_t sd_zbc_setup_zone_mgmt_cmnd(struct scsi_cmnd *cmd,
static inline unsigned int sd_zbc_complete(struct scsi_cmnd *cmd,
unsigned int good_bytes, struct scsi_sense_hdr *sshdr)
{
- return 0;
+ return good_bytes;
}
static inline blk_status_t sd_zbc_prepare_zone_append(struct scsi_cmnd *cmd,
diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c
index 0e94ff056bff..a739456dea02 100644
--- a/drivers/scsi/sd_zbc.c
+++ b/drivers/scsi/sd_zbc.c
@@ -667,7 +667,11 @@ int sd_zbc_revalidate_zones(struct scsi_disk *sdkp)
u32 max_append;
int ret = 0;
- if (!sd_is_zoned(sdkp))
+ /*
+ * There is nothing to do for regular disks, including host-aware disks
+ * that have partitions.
+ */
+ if (!blk_queue_is_zoned(q))
return 0;
/*
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index bb5636cc17b9..868e11face00 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -352,6 +352,8 @@ struct queue_limits {
typedef int (*report_zones_cb)(struct blk_zone *zone, unsigned int idx,
void *data);
+void blk_queue_set_zoned(struct gendisk *disk, enum blk_zoned_model model);
+
#ifdef CONFIG_BLK_DEV_ZONED
#define BLK_ALL_ZONES ((unsigned int)-1)