This is a note to let you know that I've just added the patch titled
dm bufio: fix shrinker scans when (nr_to_scan < retain_target)
to the 4.14-stable tree which can be found at:
http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=sum…
The filename of the patch is:
dm-bufio-fix-shrinker-scans-when-nr_to_scan-retain_target.patch
and it can be found in the queue-4.14 subdirectory.
If you, or anyone else, feels it should not be added to the stable tree,
please let <stable(a)vger.kernel.org> know about it.
>From fbc7c07ec23c040179384a1f16b62b6030eb6bdd Mon Sep 17 00:00:00 2001
From: Suren Baghdasaryan <surenb(a)google.com>
Date: Wed, 6 Dec 2017 09:27:30 -0800
Subject: dm bufio: fix shrinker scans when (nr_to_scan < retain_target)
From: Suren Baghdasaryan <surenb(a)google.com>
commit fbc7c07ec23c040179384a1f16b62b6030eb6bdd upstream.
When system is under memory pressure it is observed that dm bufio
shrinker often reclaims only one buffer per scan. This change fixes
the following two issues in dm bufio shrinker that cause this behavior:
1. ((nr_to_scan - freed) <= retain_target) condition is used to
terminate slab scan process. This assumes that nr_to_scan is equal
to the LRU size, which might not be correct because do_shrink_slab()
in vmscan.c calculates nr_to_scan using multiple inputs.
As a result when nr_to_scan is less than retain_target (64) the scan
will terminate after the first iteration, effectively reclaiming one
buffer per scan and making scans very inefficient. This hurts vmscan
performance especially because mutex is acquired/released every time
dm_bufio_shrink_scan() is called.
New implementation uses ((LRU size - freed) <= retain_target)
condition for scan termination. LRU size can be safely determined
inside __scan() because this function is called after dm_bufio_lock().
2. do_shrink_slab() uses value returned by dm_bufio_shrink_count() to
determine number of freeable objects in the slab. However dm_bufio
always retains retain_target buffers in its LRU and will terminate
a scan when this mark is reached. Therefore returning the entire LRU size
from dm_bufio_shrink_count() is misleading because that does not
represent the number of freeable objects that slab will reclaim during
a scan. Returning (LRU size - retain_target) better represents the
number of freeable objects in the slab. This way do_shrink_slab()
returns 0 when (LRU size < retain_target) and vmscan will not try to
scan this shrinker avoiding scans that will not reclaim any memory.
Test: tested using Android device running
<AOSP>/system/extras/alloc-stress that generates memory pressure
and causes intensive shrinker scans
Signed-off-by: Suren Baghdasaryan <surenb(a)google.com>
Signed-off-by: Mike Snitzer <snitzer(a)redhat.com>
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
---
drivers/md/dm-bufio.c | 8 ++++++--
1 file changed, 6 insertions(+), 2 deletions(-)
--- a/drivers/md/dm-bufio.c
+++ b/drivers/md/dm-bufio.c
@@ -1611,7 +1611,8 @@ static unsigned long __scan(struct dm_bu
int l;
struct dm_buffer *b, *tmp;
unsigned long freed = 0;
- unsigned long count = nr_to_scan;
+ unsigned long count = c->n_buffers[LIST_CLEAN] +
+ c->n_buffers[LIST_DIRTY];
unsigned long retain_target = get_retain_buffers(c);
for (l = 0; l < LIST_SIZE; l++) {
@@ -1647,8 +1648,11 @@ static unsigned long
dm_bufio_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
{
struct dm_bufio_client *c = container_of(shrink, struct dm_bufio_client, shrinker);
+ unsigned long count = ACCESS_ONCE(c->n_buffers[LIST_CLEAN]) +
+ ACCESS_ONCE(c->n_buffers[LIST_DIRTY]);
+ unsigned long retain_target = get_retain_buffers(c);
- return ACCESS_ONCE(c->n_buffers[LIST_CLEAN]) + ACCESS_ONCE(c->n_buffers[LIST_DIRTY]);
+ return (count < retain_target) ? 0 : (count - retain_target);
}
/*
Patches currently in stable-queue which might be from surenb(a)google.com are
queue-4.14/dm-bufio-fix-shrinker-scans-when-nr_to_scan-retain_target.patch
Handling CD-ROM devices from libsas is decidedly odd, as libata
relies on SCSI EH to be started to figure out that no medium is
present.
So we cannot do asynchronous aborts for SATA devices.
Fixes: 909657615d9 ("scsi: libsas: allow async aborts")
Cc: <stable(a)vger.kernel.org> # 4.12+
Signed-off-by: Hannes Reinecke <hare(a)suse.com>
Reviewed-by: Christoph Hellwig <hch(a)lst.de>
Tested-by: Yves-Alexis Perez <corsac(a)debian.org>
---
drivers/scsi/libsas/sas_scsi_host.c | 17 +++++++++++++++--
1 file changed, 15 insertions(+), 2 deletions(-)
diff --git a/drivers/scsi/libsas/sas_scsi_host.c b/drivers/scsi/libsas/sas_scsi_host.c
index 6267272..6de9681 100644
--- a/drivers/scsi/libsas/sas_scsi_host.c
+++ b/drivers/scsi/libsas/sas_scsi_host.c
@@ -487,15 +487,28 @@ static int sas_queue_reset(struct domain_device *dev, int reset_type,
int sas_eh_abort_handler(struct scsi_cmnd *cmd)
{
- int res;
+ int res = TMF_RESP_FUNC_FAILED;
struct sas_task *task = TO_SAS_TASK(cmd);
struct Scsi_Host *host = cmd->device->host;
+ struct domain_device *dev = cmd_to_domain_dev(cmd);
struct sas_internal *i = to_sas_internal(host->transportt);
+ unsigned long flags;
if (!i->dft->lldd_abort_task)
return FAILED;
- res = i->dft->lldd_abort_task(task);
+ spin_lock_irqsave(host->host_lock, flags);
+ /* We cannot do async aborts for SATA devices */
+ if (dev_is_sata(dev) && !host->host_eh_scheduled) {
+ spin_unlock_irqrestore(host->host_lock, flags);
+ return FAILED;
+ }
+ spin_unlock_irqrestore(host->host_lock, flags);
+
+ if (task)
+ res = i->dft->lldd_abort_task(task);
+ else
+ SAS_DPRINTK("no task to abort\n");
if (res == TMF_RESP_FUNC_SUCC || res == TMF_RESP_FUNC_COMPLETE)
return SUCCESS;
--
1.8.5.6
From: Eric Biggers <ebiggers(a)google.com>
pipe-user-pages-hard and pipe-user-pages-soft are only supposed to apply
to unprivileged users, as documented in both Documentation/sysctl/fs.txt
and the pipe(7) man page.
However, the capabilities are actually only checked when increasing a
pipe's size using F_SETPIPE_SZ, not when creating a new pipe.
Therefore, if pipe-user-pages-hard has been set, the root user can run
into it and be unable to create pipes. Similarly, if
pipe-user-pages-soft has been set, the root user can run into it and
have their pipes limited to 1 page each.
Fix this by allowing the privileged override in both cases.
Fixes: 759c01142a5d ("pipe: limit the per-user amount of pages allocated in pipes")
Cc: stable(a)vger.kernel.org
Signed-off-by: Eric Biggers <ebiggers(a)google.com>
---
fs/pipe.c | 11 ++++++++---
1 file changed, 8 insertions(+), 3 deletions(-)
diff --git a/fs/pipe.c b/fs/pipe.c
index d0dec5e7ef33..847ecc388820 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -613,6 +613,11 @@ static bool too_many_pipe_buffers_hard(unsigned long user_bufs)
return pipe_user_pages_hard && user_bufs >= pipe_user_pages_hard;
}
+static bool is_unprivileged_user(void)
+{
+ return !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN);
+}
+
struct pipe_inode_info *alloc_pipe_info(void)
{
struct pipe_inode_info *pipe;
@@ -629,12 +634,12 @@ struct pipe_inode_info *alloc_pipe_info(void)
user_bufs = account_pipe_buffers(user, 0, pipe_bufs);
- if (too_many_pipe_buffers_soft(user_bufs)) {
+ if (too_many_pipe_buffers_soft(user_bufs) && is_unprivileged_user()) {
user_bufs = account_pipe_buffers(user, pipe_bufs, 1);
pipe_bufs = 1;
}
- if (too_many_pipe_buffers_hard(user_bufs))
+ if (too_many_pipe_buffers_hard(user_bufs) && is_unprivileged_user())
goto out_revert_acct;
pipe->bufs = kcalloc(pipe_bufs, sizeof(struct pipe_buffer),
@@ -1065,7 +1070,7 @@ static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg)
if (nr_pages > pipe->buffers &&
(too_many_pipe_buffers_hard(user_bufs) ||
too_many_pipe_buffers_soft(user_bufs)) &&
- !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN)) {
+ is_unprivileged_user()) {
ret = -EPERM;
goto out_revert_acct;
}
--
2.15.1
Hello Bjorn,
Again, reviving this very old thread :-)
On Thu, 5 Oct 2017 12:23:30 -0500, Bjorn Helgaas wrote:
> > - if (PCI_SLOT(devfn) != 0) {
> > + if ((bus->number == pcie->root_bus_nr) && (PCI_SLOT(devfn) != 0)) {
>
> I'm fine with this, but please take a look at these:
>
> 8e7ca8ca5fd8 PCI: xilinx: Relax device number checking to allow SR-IOV
> e18934b5e9c7 PCI: designware: Relax device number checking to allow SR-IOV
> d99e30b7936a PCI: altera: Relax device number checking to allow SR-IOV
>
> and make sure that reasoning doesn't apply here, too.
>
> http://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=8…
> http://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=e…
> http://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=d…
The original code for xilinx/designware/altera was doing:
if (bus->number == port->root_busno && devfn > 0)
return false;
if (bus->primary == port->root_busno && devfn > 0)
return false;
I.e, it was checking both if bus->number *and* bus->primary were equal
to port->root_busno.
The commit you points removed the check on bus->primary, keeping the
check on bus->number.
Your patch for the Aadvark driver only adds a check on bus->number, i.e
exactly what the xilinx/designware/altera code is still doing today:
Altera:
/* access only one slot on each root port */
if (bus->number == pcie->root_bus_nr && dev > 0)
return false;
Designware:
/* access only one slot on each root port */
if (bus->number == pp->root_bus_nr && dev > 0)
return 0;
Xilinx:
/* Only one device down on each root port */
if (bus->number == port->root_busno && devfn > 0)
return false;
Aardvark (with our patch):
if ((bus->number == pcie->root_bus_nr) && (PCI_SLOT(devfn) != 0)) {
*val = 0xffffffff;
return PCIBIOS_DEVICE_NOT_FOUND;
}
So we're doing exactly the same thing.
Do you agree ?
Best regards,
Thomas Petazzoni
--
Thomas Petazzoni, CTO, Free Electrons
Embedded Linux and Kernel engineering
http://free-electrons.com
The patch titled
Subject: lib/strscpy: remove word-at-a-time optimization.
has been added to the -mm tree. Its filename is
lib-strscpy-remove-word-at-a-time-optimization.patch
This patch should soon appear at
http://ozlabs.org/~akpm/mmots/broken-out/lib-strscpy-remove-word-at-a-time-…
and later at
http://ozlabs.org/~akpm/mmotm/broken-out/lib-strscpy-remove-word-at-a-time-…
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/SubmitChecklist when testing your code ***
The -mm tree is included into linux-next and is updated
there every 3-4 working days
------------------------------------------------------
From: Andrey Ryabinin <aryabinin(a)virtuozzo.com>
Subject: lib/strscpy: remove word-at-a-time optimization.
strscpy() performs the word-at-a-time optimistic reads. So it may may
access the memory past the end of the object, which is perfectly fine
since strscpy() doesn't use that (past-the-end) data and makes sure the
optimistic read won't cross a page boundary.
But KASAN doesn't know anything about that so it will complain. There are
several possible ways to address this issue, but none are perfect. See
https://lkml.kernel.org/r/9f0a9cf6-51f7-cd1f-5dc6-6d510a7b8ec4@virtuozzo.com
It seems the best solution is to simply disable word-at-a-time
optimization. My trivial testing shows that byte-at-a-time could be up to
x4.3 times slower than word-at-a-time. It may seems like a lot, but it's
actually ~1.2e-10 sec per symbol vs ~4.8e-10 sec per symbol on modern
hardware. And we don't use strscpy() in a performance critical paths to
copy large amounts of data, so it shouldn't matter anyway.
Link: http://lkml.kernel.org/r/20180109163745.3692-1-aryabinin@virtuozzo.com
Fixes: 30035e45753b7 ("string: provide strscpy()")
Signed-off-by: Andrey Ryabinin <aryabinin(a)virtuozzo.com>
Cc: Kees Cook <keescook(a)chromium.org>
Cc: Eryu Guan <eguan(a)redhat.com>
Cc: Alexander Potapenko <glider(a)google.com>
Cc: Chris Metcalf <metcalf(a)alum.mit.edu>
Cc: David Laight <David.Laight(a)ACULAB.COM>
Cc: Dmitry Vyukov <dvyukov(a)google.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
lib/string.c | 38 --------------------------------------
1 file changed, 38 deletions(-)
diff -puN lib/string.c~lib-strscpy-remove-word-at-a-time-optimization lib/string.c
--- a/lib/string.c~lib-strscpy-remove-word-at-a-time-optimization
+++ a/lib/string.c
@@ -29,7 +29,6 @@
#include <linux/errno.h>
#include <asm/byteorder.h>
-#include <asm/word-at-a-time.h>
#include <asm/page.h>
#ifndef __HAVE_ARCH_STRNCASECMP
@@ -177,45 +176,8 @@ EXPORT_SYMBOL(strlcpy);
*/
ssize_t strscpy(char *dest, const char *src, size_t count)
{
- const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS;
- size_t max = count;
long res = 0;
- if (count == 0)
- return -E2BIG;
-
-#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
- /*
- * If src is unaligned, don't cross a page boundary,
- * since we don't know if the next page is mapped.
- */
- if ((long)src & (sizeof(long) - 1)) {
- size_t limit = PAGE_SIZE - ((long)src & (PAGE_SIZE - 1));
- if (limit < max)
- max = limit;
- }
-#else
- /* If src or dest is unaligned, don't do word-at-a-time. */
- if (((long) dest | (long) src) & (sizeof(long) - 1))
- max = 0;
-#endif
-
- while (max >= sizeof(unsigned long)) {
- unsigned long c, data;
-
- c = *(unsigned long *)(src+res);
- if (has_zero(c, &data, &constants)) {
- data = prep_zero_mask(c, data, &constants);
- data = create_zero_mask(data);
- *(unsigned long *)(dest+res) = c & zero_bytemask(data);
- return res + find_zero(data);
- }
- *(unsigned long *)(dest+res) = c;
- res += sizeof(unsigned long);
- count -= sizeof(unsigned long);
- max -= sizeof(unsigned long);
- }
-
while (count) {
char c;
_
Patches currently in -mm which might be from aryabinin(a)virtuozzo.com are
kasan-makefile-support-llvm-style-asan-parameters.patch
lib-strscpy-remove-word-at-a-time-optimization.patch