This is a note to let you know that I've just added the patch titled
usb-storage: Add unusual-devs entry for VL817 USB-SATA bridge
to my usb git tree which can be found at
git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/usb.git
in the usb-linus branch.
The patch will show up in the next release of the linux-next tree
(usually sometime within the next 24 hours during the week.)
The patch will hopefully also be merged in Linus's tree for the
next -rc kernel release.
If you have any questions about this process, please let me know.
From 5b67b315037250a61861119683e7fcb509deea25 Mon Sep 17 00:00:00 2001
From: Alan Stern <stern(a)rowland.harvard.edu>
Date: Mon, 24 Jan 2022 15:14:40 -0500
Subject: usb-storage: Add unusual-devs entry for VL817 USB-SATA bridge
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Two people have reported (and mentioned numerous other reports on the
web) that VIA's VL817 USB-SATA bridge does not work with the uas
driver. Typical log messages are:
[ 3606.232149] sd 14:0:0:0: [sdg] tag#2 uas_zap_pending 0 uas-tag 1 inflight: CMD
[ 3606.232154] sd 14:0:0:0: [sdg] tag#2 CDB: Write(16) 8a 00 00 00 00 00 18 0c c9 80 00 00 00 80 00 00
[ 3606.306257] usb 4-4.4: reset SuperSpeed Plus Gen 2x1 USB device number 11 using xhci_hcd
[ 3606.328584] scsi host14: uas_eh_device_reset_handler success
Surprisingly, the devices do seem to work okay for some other people.
The cause of the differing behaviors is not known.
In the hope of getting the devices to work for the most users, even at
the possible cost of degraded performance for some, this patch adds an
unusual_devs entry for the VL817 to block it from binding to the uas
driver by default. Users will be able to override this entry by means
of a module parameter, if they want.
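As an illustrative note (not part of the patch itself): the override goes through usb-storage's "quirks" module parameter, which kernel-parameters.txt documents with a VID:PID:flags syntax. Assuming that syntax, a user who still wants uas for this device could boot with
    usb-storage.quirks=2109:0715:
which matches the VL817 and replaces the built-in entry's flags with none, clearing US_FL_IGNORE_UAS so the uas driver can bind again.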
CC: <stable(a)vger.kernel.org>
Reported-by: DocMAX <mail(a)vacharakis.de>
Reported-and-tested-by: Thomas Weißschuh <linux(a)weissschuh.net>
Signed-off-by: Alan Stern <stern(a)rowland.harvard.edu>
Link: https://lore.kernel.org/r/Ye8IsK2sjlEv1rqU@rowland.harvard.edu
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
---
drivers/usb/storage/unusual_devs.h | 10 ++++++++++
1 file changed, 10 insertions(+)
diff --git a/drivers/usb/storage/unusual_devs.h b/drivers/usb/storage/unusual_devs.h
index 29191d33c0e3..1a05e3dcfec8 100644
--- a/drivers/usb/storage/unusual_devs.h
+++ b/drivers/usb/storage/unusual_devs.h
@@ -2301,6 +2301,16 @@ UNUSUAL_DEV( 0x2027, 0xa001, 0x0000, 0x9999,
USB_SC_DEVICE, USB_PR_DEVICE, usb_stor_euscsi_init,
US_FL_SCM_MULT_TARG ),
+/*
+ * Reported by DocMAX <mail(a)vacharakis.de>
+ * and Thomas Weißschuh <linux(a)weissschuh.net>
+ */
+UNUSUAL_DEV( 0x2109, 0x0715, 0x9999, 0x9999,
+ "VIA Labs, Inc.",
+ "VL817 SATA Bridge",
+ USB_SC_DEVICE, USB_PR_DEVICE, NULL,
+ US_FL_IGNORE_UAS),
+
UNUSUAL_DEV( 0x2116, 0x0320, 0x0001, 0x0001,
"ST",
"2A",
--
2.35.0
This is a note to let you know that I've just added the patch titled
usb: typec: tcpci: don't touch CC line if it's Vconn source
to my usb git tree which can be found at
git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/usb.git
in the usb-linus branch.
The patch will show up in the next release of the linux-next tree
(usually sometime within the next 24 hours during the week.)
The patch will hopefully also be merged in Linus's tree for the
next -rc kernel release.
If you have any questions about this process, please let me know.
From 5638b0dfb6921f69943c705383ff40fb64b987f2 Mon Sep 17 00:00:00 2001
From: Xu Yang <xu.yang_2(a)nxp.com>
Date: Thu, 13 Jan 2022 17:29:43 +0800
Subject: usb: typec: tcpci: don't touch CC line if it's Vconn source
With the AMS and Collision Avoidance, tcpm often needs to change the CC's
termination. When one CC line is sourcing Vconn, changing its termination
anyway is likely to leave the voltage on the other CC line fluctuating and
unstable.
Therefore, we should verify whether a CC line is sourcing Vconn before
changing its termination and only change the termination of the line that
is not sourcing Vconn. This can be done by reading the Vconn Present bit of
the POWER_STATUS register. To determine the polarity, we can read the Plug
Orientation bit of the TCPC_CONTROL register, since Vconn can only be
sourced if Plug Orientation is set.
Fixes: 0908c5aca31e ("usb: typec: tcpm: AMS and Collision Avoidance")
cc: <stable(a)vger.kernel.org>
Reviewed-by: Guenter Roeck <linux(a)roeck-us.net>
Acked-by: Heikki Krogerus <heikki.krogerus(a)linux.intel.com>
Signed-off-by: Xu Yang <xu.yang_2(a)nxp.com>
Link: https://lore.kernel.org/r/20220113092943.752372-1-xu.yang_2@nxp.com
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
---
drivers/usb/typec/tcpm/tcpci.c | 26 ++++++++++++++++++++++++++
drivers/usb/typec/tcpm/tcpci.h | 1 +
2 files changed, 27 insertions(+)
diff --git a/drivers/usb/typec/tcpm/tcpci.c b/drivers/usb/typec/tcpm/tcpci.c
index 35a1307349a2..e07d26a3cd8e 100644
--- a/drivers/usb/typec/tcpm/tcpci.c
+++ b/drivers/usb/typec/tcpm/tcpci.c
@@ -75,9 +75,25 @@ static int tcpci_write16(struct tcpci *tcpci, unsigned int reg, u16 val)
static int tcpci_set_cc(struct tcpc_dev *tcpc, enum typec_cc_status cc)
{
struct tcpci *tcpci = tcpc_to_tcpci(tcpc);
+ bool vconn_pres;
+ enum typec_cc_polarity polarity = TYPEC_POLARITY_CC1;
unsigned int reg;
int ret;
+ ret = regmap_read(tcpci->regmap, TCPC_POWER_STATUS, &reg);
+ if (ret < 0)
+ return ret;
+
+ vconn_pres = !!(reg & TCPC_POWER_STATUS_VCONN_PRES);
+ if (vconn_pres) {
+ ret = regmap_read(tcpci->regmap, TCPC_TCPC_CTRL, &reg);
+ if (ret < 0)
+ return ret;
+
+ if (reg & TCPC_TCPC_CTRL_ORIENTATION)
+ polarity = TYPEC_POLARITY_CC2;
+ }
+
switch (cc) {
case TYPEC_CC_RA:
reg = (TCPC_ROLE_CTRL_CC_RA << TCPC_ROLE_CTRL_CC1_SHIFT) |
@@ -112,6 +128,16 @@ static int tcpci_set_cc(struct tcpc_dev *tcpc, enum typec_cc_status cc)
break;
}
+ if (vconn_pres) {
+ if (polarity == TYPEC_POLARITY_CC2) {
+ reg &= ~(TCPC_ROLE_CTRL_CC1_MASK << TCPC_ROLE_CTRL_CC1_SHIFT);
+ reg |= (TCPC_ROLE_CTRL_CC_OPEN << TCPC_ROLE_CTRL_CC1_SHIFT);
+ } else {
+ reg &= ~(TCPC_ROLE_CTRL_CC2_MASK << TCPC_ROLE_CTRL_CC2_SHIFT);
+ reg |= (TCPC_ROLE_CTRL_CC_OPEN << TCPC_ROLE_CTRL_CC2_SHIFT);
+ }
+ }
+
ret = regmap_write(tcpci->regmap, TCPC_ROLE_CTRL, reg);
if (ret < 0)
return ret;
diff --git a/drivers/usb/typec/tcpm/tcpci.h b/drivers/usb/typec/tcpm/tcpci.h
index 2be7a77d400e..b2edd45f13c6 100644
--- a/drivers/usb/typec/tcpm/tcpci.h
+++ b/drivers/usb/typec/tcpm/tcpci.h
@@ -98,6 +98,7 @@
#define TCPC_POWER_STATUS_SOURCING_VBUS BIT(4)
#define TCPC_POWER_STATUS_VBUS_DET BIT(3)
#define TCPC_POWER_STATUS_VBUS_PRES BIT(2)
+#define TCPC_POWER_STATUS_VCONN_PRES BIT(1)
#define TCPC_POWER_STATUS_SINKING_VBUS BIT(0)
#define TCPC_FAULT_STATUS 0x1f
--
2.35.0
Hi Greg,
Regression found on the stable-rc 5.4 queue: the riscv tinyconfig build failed.
Not sure which patch is causing build failures.
We will bisect and get back to you.
make --silent --keep-going --jobs=8
O=/home/tuxbuild/.cache/tuxmake/builds/current LLVM_IAS=1 ARCH=riscv
CROSS_COMPILE=riscv64-linux-gnu- HOSTCC=clang CC=clang
In file included from /builds/linux/kernel/dma/mapping.c:8:
In file included from /builds/linux/include/linux/memblock.h:13:
In file included from /builds/linux/include/linux/mm.h:10:
In file included from /builds/linux/include/linux/gfp.h:6:
In file included from /builds/linux/include/linux/mmzone.h:8:
In file included from /builds/linux/include/linux/spinlock.h:51:
In file included from /builds/linux/include/linux/preempt.h:78:
In file included from ./arch/riscv/include/generated/asm/preempt.h:1:
In file included from /builds/linux/include/asm-generic/preempt.h:5:
In file included from /builds/linux/include/linux/thread_info.h:22:
/builds/linux/arch/riscv/include/asm/current.h:30:9: warning: variable
'tp' is uninitialized when used here [-Wuninitialized]
return tp;
^~
/builds/linux/arch/riscv/include/asm/current.h:29:33: note: initialize
the variable 'tp' to silence this warning
register struct task_struct *tp __asm__("tp");
^
= NULL
clang: warning: argument unused during compilation: '-no-pie'
[-Wunused-command-line-argument]
In file included from /builds/linux/arch/riscv/kernel/cpu.c:7:
In file included from /builds/linux/include/linux/seq_file.h:8:
In file included from /builds/linux/include/linux/mutex.h:14:
/builds/linux/arch/riscv/include/asm/current.h:30:9: warning: variable
'tp' is uninitialized when used here [-Wuninitialized]
return tp;
^~
/builds/linux/arch/riscv/include/asm/current.h:29:33: note: initialize
the variable 'tp' to silence this warning
register struct task_struct *tp __asm__("tp");
^
= NULL
1 warning generated.
1 warning generated.
1 warning generated.
1 warning generated.
<instantiation>:1:1: error: unrecognized instruction mnemonic
LOCAL _restore_kernel_tpsp
^
/builds/linux/arch/riscv/kernel/entry.S:163:2: note: while in macro
instantiation
SAVE_ALL
^
<instantiation>:2:2: error: unrecognized instruction mnemonic
LOCAL _save_context
^
Reported-by: Linux Kernel Functional Testing <lkft(a)linaro.org>
--
Linaro LKFT
https://lkft.linaro.org
Hot-unplug all firmware-framebuffer devices as part of removing
them via remove_conflicting_framebuffers() et al. This releases all
memory regions so that they can be acquired by native drivers.
Firmware, such as EFI, installs a framebuffer while posting the
computer. After the firmware-framebuffer device has been removed from
fbdev, a native driver takes over the hardware and the firmware
framebuffer becomes invalid.
Firmware-framebuffer drivers, specifically simplefb, don't release
their device from Linux's device hierarchy. The device still owns the
firmware framebuffer and blocks the native drivers from loading. This
has been observed with the vmwgfx driver. [1]
Initiating a device removal (i.e., hot unplug) as part of
remove_conflicting_framebuffers() removes the underlying device and
returns the memory range to the system.
[1] https://lore.kernel.org/dri-devel/20220117180359.18114-1-zack@kde.org/
v2:
* rename variable 'dev' to 'device' (Javier)
Signed-off-by: Thomas Zimmermann <tzimmermann(a)suse.de>
Reported-by: Zack Rusin <zackr(a)vmware.com>
Reviewed-by: Javier Martinez Canillas <javierm(a)redhat.com>
Reviewed-by: Zack Rusin <zackr(a)vmware.com>
CC: stable(a)vger.kernel.org # v5.11+
---
drivers/video/fbdev/core/fbmem.c | 29 ++++++++++++++++++++++++++---
include/linux/fb.h | 1 +
2 files changed, 27 insertions(+), 3 deletions(-)
diff --git a/drivers/video/fbdev/core/fbmem.c b/drivers/video/fbdev/core/fbmem.c
index 0fa7ede94fa6..b585339509b0 100644
--- a/drivers/video/fbdev/core/fbmem.c
+++ b/drivers/video/fbdev/core/fbmem.c
@@ -25,6 +25,7 @@
#include <linux/init.h>
#include <linux/linux_logo.h>
#include <linux/proc_fs.h>
+#include <linux/platform_device.h>
#include <linux/seq_file.h>
#include <linux/console.h>
#include <linux/kmod.h>
@@ -1557,18 +1558,36 @@ static void do_remove_conflicting_framebuffers(struct apertures_struct *a,
/* check all firmware fbs and kick off if the base addr overlaps */
for_each_registered_fb(i) {
struct apertures_struct *gen_aper;
+ struct device *device;
if (!(registered_fb[i]->flags & FBINFO_MISC_FIRMWARE))
continue;
gen_aper = registered_fb[i]->apertures;
+ device = registered_fb[i]->device;
if (fb_do_apertures_overlap(gen_aper, a) ||
(primary && gen_aper && gen_aper->count &&
gen_aper->ranges[0].base == VGA_FB_PHYS)) {
printk(KERN_INFO "fb%d: switching to %s from %s\n",
i, name, registered_fb[i]->fix.id);
- do_unregister_framebuffer(registered_fb[i]);
+
+ /*
+ * If we kick-out a firmware driver, we also want to remove
+ * the underlying platform device, such as simple-framebuffer,
+ * VESA, EFI, etc. A native driver will then be able to
+ * allocate the memory range.
+ *
+ * If it's not a platform device, at least print a warning. A
+ * fix would add code to remove the device from the system.
+ */
+ if (dev_is_platform(device)) {
+ registered_fb[i]->forced_out = true;
+ platform_device_unregister(to_platform_device(device));
+ } else {
+ pr_warn("fb%d: cannot remove device\n", i);
+ do_unregister_framebuffer(registered_fb[i]);
+ }
}
}
}
@@ -1898,9 +1917,13 @@ EXPORT_SYMBOL(register_framebuffer);
void
unregister_framebuffer(struct fb_info *fb_info)
{
- mutex_lock(&registration_lock);
+ bool forced_out = fb_info->forced_out;
+
+ if (!forced_out)
+ mutex_lock(&registration_lock);
do_unregister_framebuffer(fb_info);
- mutex_unlock(&registration_lock);
+ if (!forced_out)
+ mutex_unlock(&registration_lock);
}
EXPORT_SYMBOL(unregister_framebuffer);
diff --git a/include/linux/fb.h b/include/linux/fb.h
index 3da95842b207..9a14f3f8a329 100644
--- a/include/linux/fb.h
+++ b/include/linux/fb.h
@@ -502,6 +502,7 @@ struct fb_info {
} *apertures;
bool skip_vt_switch; /* no VT switch on suspend/resume required */
+ bool forced_out; /* set when being removed by another driver */
};
static inline struct apertures_struct *alloc_apertures(unsigned int max_num) {
--
2.34.1
The patch titled
Subject: mm, kasan: use compare-exchange operation to set KASAN page tag
has been added to the -mm tree. Its filename is
mm-use-compare-exchange-operation-to-set-kasan-page-tag.patch
This patch should soon appear at
https://ozlabs.org/~akpm/mmots/broken-out/mm-use-compare-exchange-operation…
and later at
https://ozlabs.org/~akpm/mmotm/broken-out/mm-use-compare-exchange-operation…
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next and is updated
there every 3-4 working days
------------------------------------------------------
From: Peter Collingbourne <pcc(a)google.com>
Subject: mm, kasan: use compare-exchange operation to set KASAN page tag
It has been reported that the tag setting operation on newly-allocated
pages can cause the page flags to be corrupted when performed concurrently
with other flag updates as a result of the use of non-atomic operations.
Fix the problem by using a compare-exchange loop to update the tag.
Link: https://lkml.kernel.org/r/20220120020148.1632253-1-pcc@google.com
Link: https://linux-review.googlesource.com/id/I456b24a2b9067d93968d43b4bb3351c0c…
Fixes: 2813b9c02962 ("kasan, mm, arm64: tag non slab memory allocated via pagealloc")
Signed-off-by: Peter Collingbourne <pcc(a)google.com>
Cc: Andrey Konovalov <andreyknvl(a)gmail.com>
Cc: Peter Zijlstra <peterz(a)infradead.org>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
include/linux/mm.h | 17 ++++++++++++-----
1 file changed, 12 insertions(+), 5 deletions(-)
--- a/include/linux/mm.h~mm-use-compare-exchange-operation-to-set-kasan-page-tag
+++ a/include/linux/mm.h
@@ -1506,11 +1506,18 @@ static inline u8 page_kasan_tag(const st
static inline void page_kasan_tag_set(struct page *page, u8 tag)
{
- if (kasan_enabled()) {
- tag ^= 0xff;
- page->flags &= ~(KASAN_TAG_MASK << KASAN_TAG_PGSHIFT);
- page->flags |= (tag & KASAN_TAG_MASK) << KASAN_TAG_PGSHIFT;
- }
+ unsigned long old_flags, flags;
+
+ if (!kasan_enabled())
+ return;
+
+ tag ^= 0xff;
+ old_flags = READ_ONCE(page->flags);
+ do {
+ flags = old_flags;
+ flags &= ~(KASAN_TAG_MASK << KASAN_TAG_PGSHIFT);
+ flags |= (tag & KASAN_TAG_MASK) << KASAN_TAG_PGSHIFT;
+ } while (unlikely(!try_cmpxchg(&page->flags, &old_flags, flags)));
}
static inline void page_kasan_tag_reset(struct page *page)
_
Patches currently in -mm which might be from pcc(a)google.com are
mm-use-compare-exchange-operation-to-set-kasan-page-tag.patch
The patch titled
Subject: mm/gup.c: fix invalid page pointer returned with FOLL_PIN gups
has been added to the -mm tree. Its filename is
mm-fix-invalid-page-pointer-returned-with-foll_pin-gups.patch
This patch should soon appear at
https://ozlabs.org/~akpm/mmots/broken-out/mm-fix-invalid-page-pointer-retur…
and later at
https://ozlabs.org/~akpm/mmotm/broken-out/mm-fix-invalid-page-pointer-retur…
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next and is updated
there every 3-4 working days
------------------------------------------------------
From: Peter Xu <peterx(a)redhat.com>
Subject: mm/gup.c: fix invalid page pointer returned with FOLL_PIN gups
Alex reported invalid page pointer returned with pin_user_pages_remote()
from vfio after upstream commit 4b6c33b32296 ("vfio/type1: Prepare for
batched pinning with struct vfio_batch"). This problem breaks NVIDIA vfio
mdev.
It turns out that it's not the fault of the vfio commit; however, after
vfio switched to a full page buffer to store the page pointers it started
to expose the problem more easily.
The problem is that for VM_PFNMAP vmas we should normally fail with -EFAULT,
and vfio will then carry on to handle the MMIO regions. However, when the bug
triggers, follow_page_mask() returns -EEXIST for such a page, which makes the
code jump over the current page, leaving that entry in **pages untouched.
The caller is not aware of this, hence it will reference the page as usual
even though the pointer data can be anything.
We had that -EEXIST logic since commit 1027e4436b6a ("mm: make GUP handle
pfn mapping unless FOLL_GET is requested") which seems very reasonable.
It could be that when we reworked GUP with FOLL_PIN we could have
overlooked that special path in commit 3faa52c03f44 ("mm/gup: track
FOLL_PIN pages"), even if that commit rightfully touched up
follow_devmap_pud() on checking FOLL_PIN when it needs to return an
-EEXIST.
While at it, add another WARN_ON_ONCE() at the -EEXIST handling to make
sure **pages is not set when reaching there, because otherwise it means
the caller will try to read garbage right after __get_user_pages()
returns.
Attaching the Fixes to the FOLL_PIN rework commit, as it happened later
than 1027e4436b6a.
Link: https://lkml.kernel.org/r/20220125033700.69705-1-peterx@redhat.com
Fixes: 3faa52c03f44 ("mm/gup: track FOLL_PIN pages")
Signed-off-by: Peter Xu <peterx(a)redhat.com>
Reported-by: Alex Williamson <alex.williamson(a)redhat.com>
Debugged-by: Alex Williamson <alex.williamson(a)redhat.com>
Cc: Andrea Arcangeli <aarcange(a)redhat.com>
Cc: Andrew Morton <akpm(a)linux-foundation.org>
Cc: Jan Kara <jack(a)suse.cz>
Cc: Jérôme Glisse <jglisse(a)redhat.com>
Cc: John Hubbard <jhubbard(a)nvidia.com>
Cc: Kirill A. Shutemov <kirill.shutemov(a)linux.intel.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/gup.c | 8 +++++++-
1 file changed, 7 insertions(+), 1 deletion(-)
--- a/mm/gup.c~mm-fix-invalid-page-pointer-returned-with-foll_pin-gups
+++ a/mm/gup.c
@@ -440,7 +440,7 @@ static int follow_pfn_pte(struct vm_area
pte_t *pte, unsigned int flags)
{
/* No page to get reference */
- if (flags & FOLL_GET)
+ if (flags & (FOLL_GET | FOLL_PIN))
return -EFAULT;
if (flags & FOLL_TOUCH) {
@@ -1181,7 +1181,13 @@ retry:
/*
* Proper page table entry exists, but no corresponding
* struct page.
+ *
+ * Warn if we jumped over even with a valid **pages.
+ * It shouldn't trigger in practice, but when there's
+ * buggy returns on -EEXIST we'll warn before returning
+ * an invalid page pointer in the array.
*/
+ WARN_ON_ONCE(pages);
goto next_page;
} else if (IS_ERR(page)) {
ret = PTR_ERR(page);
_
Patches currently in -mm which might be from peterx(a)redhat.com are
mm-fix-invalid-page-pointer-returned-with-foll_pin-gups.patch
The patch titled
Subject: ocfs2: fix a deadlock when commit trans
has been added to the -mm tree. Its filename is
ocfs2-fix-a-deadlock-when-commit-trans.patch
This patch should soon appear at
https://ozlabs.org/~akpm/mmots/broken-out/ocfs2-fix-a-deadlock-when-commit-…
and later at
https://ozlabs.org/~akpm/mmotm/broken-out/ocfs2-fix-a-deadlock-when-commit-…
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next and is updated
there every 3-4 working days
------------------------------------------------------
From: Joseph Qi <joseph.qi(a)linux.alibaba.com>
Subject: ocfs2: fix a deadlock when commit trans
commit 6f1b228529ae introduces a regression which can deadlock as follows:
Task1:                                Task2:
jbd2_journal_commit_transaction       ocfs2_test_bg_bit_allocatable
  spin_lock(&jh->b_state_lock)          jbd_lock_bh_journal_head
  __jbd2_journal_remove_checkpoint      spin_lock(&jh->b_state_lock)
  jbd2_journal_put_journal_head
    jbd_lock_bh_journal_head
Task1 and Task2 lock bh->b_state and jh->b_state_lock in different
order, which finally results in a deadlock.
So use jbd2_journal_[grab|put]_journal_head instead in
ocfs2_test_bg_bit_allocatable() to fix it.
Link: https://lkml.kernel.org/r/20220121071205.100648-3-joseph.qi@linux.alibaba.c…
Fixes: 6f1b228529ae ("ocfs2: fix race between searching chunks and release journal_head from buffer_head")
Signed-off-by: Joseph Qi <joseph.qi(a)linux.alibaba.com>
Reported-by: Gautham Ananthakrishna <gautham.ananthakrishna(a)oracle.com>
Reported-by: Saeed Mirzamohammadi <saeed.mirzamohammadi(a)oracle.com>
Cc: "Theodore Ts'o" <tytso(a)mit.edu>
Cc: Andreas Dilger <adilger.kernel(a)dilger.ca>
Cc: <stable(a)vger.kernel.org>
Cc: Changwei Ge <gechangwei(a)live.cn>
Cc: Gang He <ghe(a)suse.com>
Cc: Joel Becker <jlbec(a)evilplan.org>
Cc: Jun Piao <piaojun(a)huawei.com>
Cc: Junxiao Bi <junxiao.bi(a)oracle.com>
Cc: Mark Fasheh <mark(a)fasheh.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
fs/ocfs2/suballoc.c | 25 +++++++++++--------------
1 file changed, 11 insertions(+), 14 deletions(-)
--- a/fs/ocfs2/suballoc.c~ocfs2-fix-a-deadlock-when-commit-trans
+++ a/fs/ocfs2/suballoc.c
@@ -1251,26 +1251,23 @@ static int ocfs2_test_bg_bit_allocatable
{
struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
struct journal_head *jh;
- int ret = 1;
+ int ret;
if (ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap))
return 0;
- if (!buffer_jbd(bg_bh))
+ jh = jbd2_journal_grab_journal_head(bg_bh);
+ if (!jh)
return 1;
- jbd_lock_bh_journal_head(bg_bh);
- if (buffer_jbd(bg_bh)) {
- jh = bh2jh(bg_bh);
- spin_lock(&jh->b_state_lock);
- bg = (struct ocfs2_group_desc *) jh->b_committed_data;
- if (bg)
- ret = !ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap);
- else
- ret = 1;
- spin_unlock(&jh->b_state_lock);
- }
- jbd_unlock_bh_journal_head(bg_bh);
+ spin_lock(&jh->b_state_lock);
+ bg = (struct ocfs2_group_desc *) jh->b_committed_data;
+ if (bg)
+ ret = !ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap);
+ else
+ ret = 1;
+ spin_unlock(&jh->b_state_lock);
+ jbd2_journal_put_journal_head(jh);
return ret;
}
_
Patches currently in -mm which might be from joseph.qi(a)linux.alibaba.com are
jbd2-export-jbd2_journal__journal_head.patch
ocfs2-fix-a-deadlock-when-commit-trans.patch
The patch titled
Subject: jbd2: export jbd2_journal_[grab|put]_journal_head
has been added to the -mm tree. Its filename is
jbd2-export-jbd2_journal__journal_head.patch
This patch should soon appear at
https://ozlabs.org/~akpm/mmots/broken-out/jbd2-export-jbd2_journal__journal…
and later at
https://ozlabs.org/~akpm/mmotm/broken-out/jbd2-export-jbd2_journal__journal…
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next and is updated
there every 3-4 working days
------------------------------------------------------
From: Joseph Qi <joseph.qi(a)linux.alibaba.com>
Subject: jbd2: export jbd2_journal_[grab|put]_journal_head
Patch series "ocfs2: fix a deadlock case".
This fixes a deadlock case in ocfs2. We first export the jbd2 symbols
jbd2_journal_[grab|put]_journal_head as preparation and later use them in
ocfs2 instead of jbd_[lock|unlock]_bh_journal_head to fix the deadlock.
This patch (of 2):
This exports the symbols jbd2_journal_[grab|put]_journal_head, which will be
used by other modules, e.g. ocfs2.
Link: https://lkml.kernel.org/r/20220121071205.100648-2-joseph.qi@linux.alibaba.c…
Signed-off-by: Joseph Qi <joseph.qi(a)linux.alibaba.com>
Cc: Mark Fasheh <mark(a)fasheh.com>
Cc: Joel Becker <jlbec(a)evilplan.org>
Cc: Junxiao Bi <junxiao.bi(a)oracle.com>
Cc: Changwei Ge <gechangwei(a)live.cn>
Cc: Gang He <ghe(a)suse.com>
Cc: Jun Piao <piaojun(a)huawei.com>
Cc: Andreas Dilger <adilger.kernel(a)dilger.ca>
Cc: Gautham Ananthakrishna <gautham.ananthakrishna(a)oracle.com>
Cc: Saeed Mirzamohammadi <saeed.mirzamohammadi(a)oracle.com>
Cc: "Theodore Ts'o" <tytso(a)mit.edu>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
fs/jbd2/journal.c | 2 ++
1 file changed, 2 insertions(+)
--- a/fs/jbd2/journal.c~jbd2-export-jbd2_journal__journal_head
+++ a/fs/jbd2/journal.c
@@ -2972,6 +2972,7 @@ struct journal_head *jbd2_journal_grab_j
jbd_unlock_bh_journal_head(bh);
return jh;
}
+EXPORT_SYMBOL(jbd2_journal_grab_journal_head);
static void __journal_remove_journal_head(struct buffer_head *bh)
{
@@ -3024,6 +3025,7 @@ void jbd2_journal_put_journal_head(struc
jbd_unlock_bh_journal_head(bh);
}
}
+EXPORT_SYMBOL(jbd2_journal_put_journal_head);
/*
* Initialize jbd inode head
_
Patches currently in -mm which might be from joseph.qi(a)linux.alibaba.com are
jbd2-export-jbd2_journal__journal_head.patch
ocfs2-fix-a-deadlock-when-commit-trans.patch
Hello Greg
5.16.3-rc1
compiles, boots and runs on my x86_64
(Intel i5-11400, Fedora 35)
Thanks
Tested-by: Ronald Warsow <rwarsow(a)gmx.de>
--
regards
Ronald
The patch below does not apply to the 5.16-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From d386f7ef9f410266bc1f364ad6a11cb28dae09a8 Mon Sep 17 00:00:00 2001
From: Mario Limonciello <mario.limonciello(a)amd.com>
Date: Fri, 10 Dec 2021 08:35:29 -0600
Subject: [PATCH] platform/x86: amd-pmc: only use callbacks for suspend
This driver is intended to be used exclusively for suspend-to-idle, so
callbacks that send OS_HINT during hibernate and S5 will set OS_HINT
at the wrong time, leading to undefined behavior.
Cc: stable(a)vger.kernel.org
Signed-off-by: Mario Limonciello <mario.limonciello(a)amd.com>
Link: https://lore.kernel.org/r/20211210143529.10594-1-mario.limonciello@amd.com
Reviewed-by: Hans de Goede <hdegoede(a)redhat.com>
Signed-off-by: Hans de Goede <hdegoede(a)redhat.com>
diff --git a/drivers/platform/x86/amd-pmc.c b/drivers/platform/x86/amd-pmc.c
index c709ff993e8b..f794343d6aaa 100644
--- a/drivers/platform/x86/amd-pmc.c
+++ b/drivers/platform/x86/amd-pmc.c
@@ -585,7 +585,8 @@ static int __maybe_unused amd_pmc_resume(struct device *dev)
}
static const struct dev_pm_ops amd_pmc_pm_ops = {
- SET_NOIRQ_SYSTEM_SLEEP_PM_OPS(amd_pmc_suspend, amd_pmc_resume)
+ .suspend_noirq = amd_pmc_suspend,
+ .resume_noirq = amd_pmc_resume,
};
static const struct pci_device_id pmc_pci_ids[] = {
In order to optimize FIFO access, especially on m_can cores attached
to slow busses like SPI, in patch
| e39381770ec9 ("can: m_can: Disable IRQs on FIFO bus errors")
bulk read/write support has been added to the m_can_fifo_{read,write}
functions.
That change causes the tcan driver to call
regmap_bulk_{read,write}() with a length of 0 (for CAN frames with 0
data length). regmap treats this as an error:
| tcan4x5x spi1.0 tcan4x5x0: FIFO write returned -22
This patch fixes the problem by not calling
cdev->ops->{read,write}_fifo() in case of a 0-length read/write.
Fixes: e39381770ec9 ("can: m_can: Disable IRQs on FIFO bus errors")
Link: https://lore.kernel.org/all/20220114155751.2651888-1-mkl@pengutronix.de
Cc: stable(a)vger.kernel.org
Cc: Matt Kline <matt(a)bitbashing.io>
Cc: Chandrasekar Ramakrishnan <rcsekar(a)samsung.com>
Reported-by: Michael Anochin <anochin(a)photo-meter.com>
Signed-off-by: Marc Kleine-Budde <mkl(a)pengutronix.de>
---
drivers/net/can/m_can/m_can.c | 6 ++++++
1 file changed, 6 insertions(+)
diff --git a/drivers/net/can/m_can/m_can.c b/drivers/net/can/m_can/m_can.c
index 5b47cd867783..1a4b56f6fa8c 100644
--- a/drivers/net/can/m_can/m_can.c
+++ b/drivers/net/can/m_can/m_can.c
@@ -336,6 +336,9 @@ m_can_fifo_read(struct m_can_classdev *cdev,
u32 addr_offset = cdev->mcfg[MRAM_RXF0].off + fgi * RXF0_ELEMENT_SIZE +
offset;
+ if (val_count == 0)
+ return 0;
+
return cdev->ops->read_fifo(cdev, addr_offset, val, val_count);
}
@@ -346,6 +349,9 @@ m_can_fifo_write(struct m_can_classdev *cdev,
u32 addr_offset = cdev->mcfg[MRAM_TXB].off + fpi * TXB_ELEMENT_SIZE +
offset;
+ if (val_count == 0)
+ return 0;
+
return cdev->ops->write_fifo(cdev, addr_offset, val, val_count);
}
--
2.34.1
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 7c8a4742c4abe205ec9daf416c9d42fd6b406e8e Mon Sep 17 00:00:00 2001
From: David Matlack <dmatlack(a)google.com>
Date: Thu, 13 Jan 2022 23:30:17 +0000
Subject: [PATCH] KVM: x86/mmu: Fix write-protection of PTs mapped by the TDP
MMU
When the TDP MMU is write-protecting GFNs for page table protection (as
opposed to for dirty logging, or due to the HVA not being writable), it
checks if the SPTE is already write-protected and if so skips modifying
the SPTE and the TLB flush.
This behavior is incorrect because it fails to check if the SPTE
is write-protected for page table protection, i.e. fails to check
that MMU-writable is '0'. If the SPTE was write-protected for dirty
logging but not page table protection, the SPTE could locklessly be made
writable, and vCPUs could still be running with writable mappings cached
in their TLB.
Fix this by only skipping setting the SPTE if the SPTE is already
write-protected *and* MMU-writable is already clear. Technically,
checking only MMU-writable would suffice; a SPTE cannot be writable
without MMU-writable being set. But check both to be paranoid and
because it arguably yields more readable code.
Fixes: 46044f72c382 ("kvm: x86/mmu: Support write protection for nesting in tdp MMU")
Cc: stable(a)vger.kernel.org
Signed-off-by: David Matlack <dmatlack(a)google.com>
Message-Id: <20220113233020.3986005-2-dmatlack(a)google.com>
Reviewed-by: Sean Christopherson <seanjc(a)google.com>
Signed-off-by: Paolo Bonzini <pbonzini(a)redhat.com>
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 7b1bc816b7c3..bc9e3553fba2 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -1442,12 +1442,12 @@ static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root,
!is_last_spte(iter.old_spte, iter.level))
continue;
- if (!is_writable_pte(iter.old_spte))
- break;
-
new_spte = iter.old_spte &
~(PT_WRITABLE_MASK | shadow_mmu_writable_mask);
+ if (new_spte == iter.old_spte)
+ break;
+
tdp_mmu_set_spte(kvm, &iter, new_spte);
spte_set = true;
}
It has been reported that the tag setting operation on newly-allocated
pages can cause the page flags to be corrupted when performed
concurrently with other flag updates as a result of the use of
non-atomic operations. Fix the problem by using a compare-exchange
loop to update the tag.
Signed-off-by: Peter Collingbourne <pcc(a)google.com>
Link: https://linux-review.googlesource.com/id/I456b24a2b9067d93968d43b4bb3351c0c…
Fixes: 2813b9c02962 ("kasan, mm, arm64: tag non slab memory allocated via pagealloc")
Cc: stable(a)vger.kernel.org
---
v3:
- use try_cmpxchg() as suggested by Peter Zijlstra on another
patch
v2:
- use READ_ONCE()
include/linux/mm.h | 17 ++++++++++++-----
1 file changed, 12 insertions(+), 5 deletions(-)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index c768a7c81b0b..87473fe52c3f 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1531,11 +1531,18 @@ static inline u8 page_kasan_tag(const struct page *page)
static inline void page_kasan_tag_set(struct page *page, u8 tag)
{
- if (kasan_enabled()) {
- tag ^= 0xff;
- page->flags &= ~(KASAN_TAG_MASK << KASAN_TAG_PGSHIFT);
- page->flags |= (tag & KASAN_TAG_MASK) << KASAN_TAG_PGSHIFT;
- }
+ unsigned long old_flags, flags;
+
+ if (!kasan_enabled())
+ return;
+
+ tag ^= 0xff;
+ old_flags = READ_ONCE(page->flags);
+ do {
+ flags = old_flags;
+ flags &= ~(KASAN_TAG_MASK << KASAN_TAG_PGSHIFT);
+ flags |= (tag & KASAN_TAG_MASK) << KASAN_TAG_PGSHIFT;
+ } while (unlikely(!try_cmpxchg(&page->flags, &old_flags, flags)));
}
static inline void page_kasan_tag_reset(struct page *page)
--
2.34.1.703.g22d0c6ccf7-goog
The membarrier command MEMBARRIER_CMD_QUERY allows querying the
available membarrier commands. When the membarrier-rseq fence commands
were added, a new MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ_BITMASK was
introduced with the intent to expose them with the MEMBARRIER_CMD_QUERY
command, but it was never added to MEMBARRIER_CMD_BITMASK.
The membarrier-rseq fence commands are therefore not wired up with the
query command.
Rename MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ_BITMASK to
MEMBARRIER_PRIVATE_EXPEDITED_RSEQ_BITMASK (the bitmask is not a command
per-se), and change the erroneous
MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ_BITMASK (which does not
actually exist) to MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ.
Wire up MEMBARRIER_PRIVATE_EXPEDITED_RSEQ_BITMASK in
MEMBARRIER_CMD_BITMASK. Fixing this allows discovering availability of
the membarrier-rseq fence feature.
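As a minimal userspace sketch (assuming the uapi constants from <linux/membarrier.h>; error handling and the MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ registration step are omitted), the query command can then be used to discover the rseq fence support once this fix is applied:
#include <linux/membarrier.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <stdio.h>

int main(void)
{
	/* MEMBARRIER_CMD_QUERY returns a bitmask of the supported commands. */
	long mask = syscall(__NR_membarrier, MEMBARRIER_CMD_QUERY, 0, 0);

	if (mask < 0)
		return 1;

	if (mask & MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ)
		printf("membarrier-rseq fence is advertised\n");

	return 0;
}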
Fixes: 2a36ab717e8f ("rseq/membarrier: Add MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ")
Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers(a)efficios.com>
Cc: Peter Oskolkov <posk(a)google.com>
Cc: Peter Zijlstra (Intel) <peterz(a)infradead.org>
Cc: <stable(a)vger.kernel.org> # 5.10+
---
kernel/sched/membarrier.c | 9 +++++----
1 file changed, 5 insertions(+), 4 deletions(-)
diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c
index b5add64d9698..3d2825408e3a 100644
--- a/kernel/sched/membarrier.c
+++ b/kernel/sched/membarrier.c
@@ -147,11 +147,11 @@
#endif
#ifdef CONFIG_RSEQ
-#define MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ_BITMASK \
+#define MEMBARRIER_PRIVATE_EXPEDITED_RSEQ_BITMASK \
(MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ \
- | MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ_BITMASK)
+ | MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ)
#else
-#define MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ_BITMASK 0
+#define MEMBARRIER_PRIVATE_EXPEDITED_RSEQ_BITMASK 0
#endif
#define MEMBARRIER_CMD_BITMASK \
@@ -159,7 +159,8 @@
| MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED \
| MEMBARRIER_CMD_PRIVATE_EXPEDITED \
| MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED \
- | MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK)
+ | MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK \
+ | MEMBARRIER_PRIVATE_EXPEDITED_RSEQ_BITMASK)
static void ipi_mb(void *info)
{
--
2.17.1
In Linux 4.14 and 4.19 these architectures still have their own
implementations of get_user_pages_fast(). These also need to force
the write flag on when taking the fast path.
Fixes: 407faed92b4a ("gup: document and work around "COW can break either way" issue")
Fixes: 5e24029791e8 ("gup: document and work around "COW can break either way" issue")
Signed-off-by: Ben Hutchings <ben(a)decadent.org.uk>
---
arch/mips/mm/gup.c | 9 ++++++++-
arch/s390/mm/gup.c | 9 ++++++++-
arch/sh/mm/gup.c | 9 ++++++++-
arch/sparc/mm/gup.c | 9 ++++++++-
4 files changed, 32 insertions(+), 4 deletions(-)
diff --git a/arch/mips/mm/gup.c b/arch/mips/mm/gup.c
index 5a4875cac1ec..2e7a0d201c09 100644
--- a/arch/mips/mm/gup.c
+++ b/arch/mips/mm/gup.c
@@ -274,7 +274,14 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write,
next = pgd_addr_end(addr, end);
if (pgd_none(pgd))
goto slow;
- if (!gup_pud_range(pgd, addr, next, write, pages, &nr))
+ /*
+ * The FAST_GUP case requires FOLL_WRITE even for pure reads,
+ * because get_user_pages() may need to cause an early COW in
+ * order to avoid confusing the normal COW routines. So only
+ * targets that are already writable are safe to do by just
+ * looking at the page tables.
+ */
+ if (!gup_pud_range(pgd, addr, next, 1, pages, &nr))
goto slow;
} while (pgdp++, addr = next, addr != end);
local_irq_enable();
diff --git a/arch/s390/mm/gup.c b/arch/s390/mm/gup.c
index 9b5b866d8adf..5389bf5bc828 100644
--- a/arch/s390/mm/gup.c
+++ b/arch/s390/mm/gup.c
@@ -287,7 +287,14 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write,
might_sleep();
start &= PAGE_MASK;
- nr = __get_user_pages_fast(start, nr_pages, write, pages);
+ /*
+ * The FAST_GUP case requires FOLL_WRITE even for pure reads,
+ * because get_user_pages() may need to cause an early COW in
+ * order to avoid confusing the normal COW routines. So only
+ * targets that are already writable are safe to do by just
+ * looking at the page tables.
+ */
+ nr = __get_user_pages_fast(start, nr_pages, 1, pages);
if (nr == nr_pages)
return nr;
diff --git a/arch/sh/mm/gup.c b/arch/sh/mm/gup.c
index 56c86ca98ecf..23fa2fc8aabc 100644
--- a/arch/sh/mm/gup.c
+++ b/arch/sh/mm/gup.c
@@ -242,7 +242,14 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write,
next = pgd_addr_end(addr, end);
if (pgd_none(pgd))
goto slow;
- if (!gup_pud_range(pgd, addr, next, write, pages, &nr))
+ /*
+ * The FAST_GUP case requires FOLL_WRITE even for pure reads,
+ * because get_user_pages() may need to cause an early COW in
+ * order to avoid confusing the normal COW routines. So only
+ * targets that are already writable are safe to do by just
+ * looking at the page tables.
+ */
+ if (!gup_pud_range(pgd, addr, next, 1, pages, &nr))
goto slow;
} while (pgdp++, addr = next, addr != end);
local_irq_enable();
diff --git a/arch/sparc/mm/gup.c b/arch/sparc/mm/gup.c
index aee6dba83d0e..f291d34a1cd5 100644
--- a/arch/sparc/mm/gup.c
+++ b/arch/sparc/mm/gup.c
@@ -303,7 +303,14 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write,
next = pgd_addr_end(addr, end);
if (pgd_none(pgd))
goto slow;
- if (!gup_pud_range(pgd, addr, next, write, pages, &nr))
+ /*
+ * The FAST_GUP case requires FOLL_WRITE even for pure reads,
+ * because get_user_pages() may need to cause an early COW in
+ * order to avoid confusing the normal COW routines. So only
+ * targets that are already writable are safe to do by just
+ * looking at the page tables.
+ */
+ if (!gup_pud_range(pgd, addr, next, 1, pages, &nr))
goto slow;
} while (pgdp++, addr = next, addr != end);
From: Ross Zwisler <ross.zwisler(a)linux.intel.com>
commit 097963959594c5eccaba42510f7033f703211bda upstream.
Patch series "Write protect DAX PMDs in *sync path".
Currently dax_mapping_entry_mkclean() fails to clean and write protect
the pmd_t of a DAX PMD entry during an *sync operation. This can result
in data loss, as detailed in patch 2.
This series is based on Dan's "libnvdimm-pending" branch, which is the
current home for Jan's "dax: Page invalidation fixes" series. You can
find a working tree here:
https://git.kernel.org/cgit/linux/kernel/git/zwisler/linux.git/log/?h=dax_p…
This patch (of 2):
Similar to follow_pte(), follow_pte_pmd() allows either a PTE leaf or a
huge page PMD leaf to be found and returned.
Link: http://lkml.kernel.org/r/1482272586-21177-2-git-send-email-ross.zwisler@lin…
Signed-off-by: Ross Zwisler <ross.zwisler(a)linux.intel.com>
Suggested-by: Dave Hansen <dave.hansen(a)intel.com>
Cc: Alexander Viro <viro(a)zeniv.linux.org.uk>
Cc: Christoph Hellwig <hch(a)lst.de>
Cc: Dan Williams <dan.j.williams(a)intel.com>
Cc: Dave Chinner <david(a)fromorbit.com>
Cc: Jan Kara <jack(a)suse.cz>
Cc: Matthew Wilcox <mawilcox(a)microsoft.com>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds(a)linux-foundation.org>
[bwh: Backported to 4.9: adjust context]
Signed-off-by: Ben Hutchings <ben(a)decadent.org.uk>
---
include/linux/mm.h | 2 ++
mm/memory.c | 37 ++++++++++++++++++++++++++++++-------
2 files changed, 32 insertions(+), 7 deletions(-)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 7a4c035b187f..81ee5d0b2642 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1269,6 +1269,8 @@ int copy_page_range(struct mm_struct *dst, struct mm_struct *src,
struct vm_area_struct *vma);
void unmap_mapping_range(struct address_space *mapping,
loff_t const holebegin, loff_t const holelen, int even_cows);
+int follow_pte_pmd(struct mm_struct *mm, unsigned long address,
+ pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp);
int follow_pfn(struct vm_area_struct *vma, unsigned long address,
unsigned long *pfn);
int follow_phys(struct vm_area_struct *vma, unsigned long address,
diff --git a/mm/memory.c b/mm/memory.c
index c2890dc104d9..2b2cc69ddcce 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3780,8 +3780,8 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
}
#endif /* __PAGETABLE_PMD_FOLDED */
-static int __follow_pte(struct mm_struct *mm, unsigned long address,
- pte_t **ptepp, spinlock_t **ptlp)
+static int __follow_pte_pmd(struct mm_struct *mm, unsigned long address,
+ pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp)
{
pgd_t *pgd;
pud_t *pud;
@@ -3798,11 +3798,20 @@ static int __follow_pte(struct mm_struct *mm, unsigned long address,
pmd = pmd_offset(pud, address);
VM_BUG_ON(pmd_trans_huge(*pmd));
- if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
- goto out;
- /* We cannot handle huge page PFN maps. Luckily they don't exist. */
- if (pmd_huge(*pmd))
+ if (pmd_huge(*pmd)) {
+ if (!pmdpp)
+ goto out;
+
+ *ptlp = pmd_lock(mm, pmd);
+ if (pmd_huge(*pmd)) {
+ *pmdpp = pmd;
+ return 0;
+ }
+ spin_unlock(*ptlp);
+ }
+
+ if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
goto out;
ptep = pte_offset_map_lock(mm, pmd, address, ptlp);
@@ -3825,9 +3834,23 @@ static inline int follow_pte(struct mm_struct *mm, unsigned long address,
/* (void) is needed to make gcc happy */
(void) __cond_lock(*ptlp,
- !(res = __follow_pte(mm, address, ptepp, ptlp)));
+ !(res = __follow_pte_pmd(mm, address, ptepp, NULL,
+ ptlp)));
+ return res;
+}
+
+int follow_pte_pmd(struct mm_struct *mm, unsigned long address,
+ pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp)
+{
+ int res;
+
+ /* (void) is needed to make gcc happy */
+ (void) __cond_lock(*ptlp,
+ !(res = __follow_pte_pmd(mm, address, ptepp, pmdpp,
+ ptlp)));
return res;
}
+EXPORT_SYMBOL(follow_pte_pmd);
/**
* follow_pfn - look up PFN at a user virtual address
From: Davidlohr Bueso <dave(a)stgolabs.net>
commit cd9e61ed1eebbcd5dfad59475d41ec58d9b64b6a upstream.
Patch series "rbtree: Cache leftmost node internally", v4.
A series to extending rbtrees to internally cache the leftmost node such
that we can have fast overlap check optimization for all interval tree
users[1]. The benefits of this series are that:
(i) Unify users that do internal leftmost node caching.
(ii) Optimize all interval tree users.
(iii) Convert at least two new users (epoll and procfs) to the new interface.
This patch (of 16):
Red-black tree semantics imply that nodes with smaller or greater (or
equal for duplicates) keys always be to the left and right,
respectively. For the kernel this is extremely evident when considering
our rb_first() semantics. Enabling lookups for the smallest node in the
tree in O(1) can save a good chunk of cycles in not having to walk down
the tree each time. To this end there are a few core users that
explicitly do this, such as the scheduler and rtmutexes. There is also
the desire for interval trees to have this optimization allowing faster
overlap checking.
This patch introduces a new 'struct rb_root_cached' which is just the
root with a cached pointer to the leftmost node. The reason why the
regular rb_root was not extended instead of adding a new structure was
that this allows the user to have the choice between memory footprint
and actual tree performance. The new wrappers on top of the regular
rb_root calls are:
- rb_first_cached(cached_root) -- which is a fast replacement
for rb_first.
- rb_insert_color_cached(node, cached_root, new)
- rb_erase_cached(node, cached_root)
In addition, augmented cached interfaces are also added for basic
insertion and deletion operations; which becomes important for the
interval tree changes.
With the exception of the inserts, which adds a bool for updating the
new leftmost, the interfaces are kept the same. To this end, porting rb
users to the cached version becomes really trivial, and keeping current
rbtree semantics for users that don't care about the optimization
requires zero overhead.
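As an illustrative usage sketch (not part of this patch; 'struct mytype' and its 'key' field are made up), an insert into a cached rbtree only has to track whether the new node became the new leftmost:
#include <linux/rbtree.h>

struct mytype {
	struct rb_node node;
	unsigned long key;
};

static struct rb_root_cached mytree = RB_ROOT_CACHED;

static void mytype_insert(struct mytype *new)
{
	struct rb_node **link = &mytree.rb_root.rb_node;
	struct rb_node *parent = NULL;
	bool leftmost = true;

	while (*link) {
		struct mytype *entry = rb_entry(*link, struct mytype, node);

		parent = *link;
		if (new->key < entry->key) {
			link = &(*link)->rb_left;
		} else {
			link = &(*link)->rb_right;
			/* Went right at least once: cannot be the new leftmost. */
			leftmost = false;
		}
	}

	rb_link_node(&new->node, parent, link);
	rb_insert_color_cached(&new->node, &mytree, leftmost);
	/* rb_first_cached(&mytree) is now an O(1) pointer fetch. */
}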
Link: http://lkml.kernel.org/r/20170719014603.19029-2-dave@stgolabs.net
Signed-off-by: Davidlohr Bueso <dbueso(a)suse.de>
Reviewed-by: Jan Kara <jack(a)suse.cz>
Acked-by: Peter Zijlstra (Intel) <peterz(a)infradead.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds(a)linux-foundation.org>
Signed-off-by: Ben Hutchings <ben(a)decadent.org.uk>
---
Documentation/rbtree.txt | 33 +++++++++++++++++++++++++++++++
include/linux/rbtree.h | 21 ++++++++++++++++++++
include/linux/rbtree_augmented.h | 33 ++++++++++++++++++++++++++++---
lib/rbtree.c | 34 +++++++++++++++++++++++++++-----
4 files changed, 113 insertions(+), 8 deletions(-)
diff --git a/Documentation/rbtree.txt b/Documentation/rbtree.txt
index b9d9cc57be18..9fedfedfd85f 100644
--- a/Documentation/rbtree.txt
+++ b/Documentation/rbtree.txt
@@ -190,6 +190,39 @@ rb_entry(node, type, member).
for (node = rb_first(&mytree); node; node = rb_next(node))
printk("key=%s\n", rb_entry(node, struct mytype, node)->keystring);
+Cached rbtrees
+--------------
+
+Computing the leftmost (smallest) node is quite a common task for binary
+search trees, such as for traversals or users relying on a particular
+order for their own logic. To this end, users can use 'struct rb_root_cached'
+to optimize O(logN) rb_first() calls to a simple pointer fetch avoiding
+potentially expensive tree iterations. This is done at negligible runtime
+overhead for maintanence; albeit larger memory footprint.
+
+Similar to the rb_root structure, cached rbtrees are initialized to be
+empty via:
+
+ struct rb_root_cached mytree = RB_ROOT_CACHED;
+
+Cached rbtree is simply a regular rb_root with an extra pointer to cache the
+leftmost node. This allows rb_root_cached to exist wherever rb_root does,
+which permits augmented trees to be supported as well as only a few extra
+interfaces:
+
+ struct rb_node *rb_first_cached(struct rb_root_cached *tree);
+ void rb_insert_color_cached(struct rb_node *, struct rb_root_cached *, bool);
+ void rb_erase_cached(struct rb_node *node, struct rb_root_cached *);
+
+Both insert and erase calls have their respective counterpart of augmented
+trees:
+
+ void rb_insert_augmented_cached(struct rb_node *node, struct rb_root_cached *,
+ bool, struct rb_augment_callbacks *);
+ void rb_erase_augmented_cached(struct rb_node *, struct rb_root_cached *,
+ struct rb_augment_callbacks *);
+
+
Support for Augmented rbtrees
-----------------------------
diff --git a/include/linux/rbtree.h b/include/linux/rbtree.h
index e585018498d5..d574361943ea 100644
--- a/include/linux/rbtree.h
+++ b/include/linux/rbtree.h
@@ -44,10 +44,25 @@ struct rb_root {
struct rb_node *rb_node;
};
+/*
+ * Leftmost-cached rbtrees.
+ *
+ * We do not cache the rightmost node based on footprint
+ * size vs number of potential users that could benefit
+ * from O(1) rb_last(). Just not worth it, users that want
+ * this feature can always implement the logic explicitly.
+ * Furthermore, users that want to cache both pointers may
+ * find it a bit asymmetric, but that's ok.
+ */
+struct rb_root_cached {
+ struct rb_root rb_root;
+ struct rb_node *rb_leftmost;
+};
#define rb_parent(r) ((struct rb_node *)((r)->__rb_parent_color & ~3))
#define RB_ROOT (struct rb_root) { NULL, }
+#define RB_ROOT_CACHED (struct rb_root_cached) { {NULL, }, NULL }
#define rb_entry(ptr, type, member) container_of(ptr, type, member)
#define RB_EMPTY_ROOT(root) (READ_ONCE((root)->rb_node) == NULL)
@@ -69,6 +84,12 @@ extern struct rb_node *rb_prev(const struct rb_node *);
extern struct rb_node *rb_first(const struct rb_root *);
extern struct rb_node *rb_last(const struct rb_root *);
+extern void rb_insert_color_cached(struct rb_node *,
+ struct rb_root_cached *, bool);
+extern void rb_erase_cached(struct rb_node *node, struct rb_root_cached *);
+/* Same as rb_first(), but O(1) */
+#define rb_first_cached(root) (root)->rb_leftmost
+
/* Postorder iteration - always visit the parent after its children */
extern struct rb_node *rb_first_postorder(const struct rb_root *);
extern struct rb_node *rb_next_postorder(const struct rb_node *);
diff --git a/include/linux/rbtree_augmented.h b/include/linux/rbtree_augmented.h
index d076183e49be..023d64657e95 100644
--- a/include/linux/rbtree_augmented.h
+++ b/include/linux/rbtree_augmented.h
@@ -41,7 +41,9 @@ struct rb_augment_callbacks {
void (*rotate)(struct rb_node *old, struct rb_node *new);
};
-extern void __rb_insert_augmented(struct rb_node *node, struct rb_root *root,
+extern void __rb_insert_augmented(struct rb_node *node,
+ struct rb_root *root,
+ bool newleft, struct rb_node **leftmost,
void (*augment_rotate)(struct rb_node *old, struct rb_node *new));
/*
* Fixup the rbtree and update the augmented information when rebalancing.
@@ -57,7 +59,16 @@ static inline void
rb_insert_augmented(struct rb_node *node, struct rb_root *root,
const struct rb_augment_callbacks *augment)
{
- __rb_insert_augmented(node, root, augment->rotate);
+ __rb_insert_augmented(node, root, false, NULL, augment->rotate);
+}
+
+static inline void
+rb_insert_augmented_cached(struct rb_node *node,
+ struct rb_root_cached *root, bool newleft,
+ const struct rb_augment_callbacks *augment)
+{
+ __rb_insert_augmented(node, &root->rb_root,
+ newleft, &root->rb_leftmost, augment->rotate);
}
#define RB_DECLARE_CALLBACKS(rbstatic, rbname, rbstruct, rbfield, \
@@ -148,6 +159,7 @@ extern void __rb_erase_color(struct rb_node *parent, struct rb_root *root,
static __always_inline struct rb_node *
__rb_erase_augmented(struct rb_node *node, struct rb_root *root,
+ struct rb_node **leftmost,
const struct rb_augment_callbacks *augment)
{
struct rb_node *child = node->rb_right;
@@ -155,6 +167,9 @@ __rb_erase_augmented(struct rb_node *node, struct rb_root *root,
struct rb_node *parent, *rebalance;
unsigned long pc;
+ if (leftmost && node == *leftmost)
+ *leftmost = rb_next(node);
+
if (!tmp) {
/*
* Case 1: node to erase has no more than 1 child (easy!)
@@ -254,9 +269,21 @@ static __always_inline void
rb_erase_augmented(struct rb_node *node, struct rb_root *root,
const struct rb_augment_callbacks *augment)
{
- struct rb_node *rebalance = __rb_erase_augmented(node, root, augment);
+ struct rb_node *rebalance = __rb_erase_augmented(node, root,
+ NULL, augment);
if (rebalance)
__rb_erase_color(rebalance, root, augment->rotate);
}
+static __always_inline void
+rb_erase_augmented_cached(struct rb_node *node, struct rb_root_cached *root,
+ const struct rb_augment_callbacks *augment)
+{
+ struct rb_node *rebalance = __rb_erase_augmented(node, &root->rb_root,
+ &root->rb_leftmost,
+ augment);
+ if (rebalance)
+ __rb_erase_color(rebalance, &root->rb_root, augment->rotate);
+}
+
#endif /* _LINUX_RBTREE_AUGMENTED_H */
diff --git a/lib/rbtree.c b/lib/rbtree.c
index eb8a19fee110..53746be42903 100644
--- a/lib/rbtree.c
+++ b/lib/rbtree.c
@@ -95,10 +95,14 @@ __rb_rotate_set_parents(struct rb_node *old, struct rb_node *new,
static __always_inline void
__rb_insert(struct rb_node *node, struct rb_root *root,
+ bool newleft, struct rb_node **leftmost,
void (*augment_rotate)(struct rb_node *old, struct rb_node *new))
{
struct rb_node *parent = rb_red_parent(node), *gparent, *tmp;
+ if (newleft)
+ *leftmost = node;
+
while (true) {
/*
* Loop invariant: node is red
@@ -417,19 +421,38 @@ static const struct rb_augment_callbacks dummy_callbacks = {
void rb_insert_color(struct rb_node *node, struct rb_root *root)
{
- __rb_insert(node, root, dummy_rotate);
+ __rb_insert(node, root, false, NULL, dummy_rotate);
}
EXPORT_SYMBOL(rb_insert_color);
void rb_erase(struct rb_node *node, struct rb_root *root)
{
struct rb_node *rebalance;
- rebalance = __rb_erase_augmented(node, root, &dummy_callbacks);
+ rebalance = __rb_erase_augmented(node, root,
+ NULL, &dummy_callbacks);
if (rebalance)
____rb_erase_color(rebalance, root, dummy_rotate);
}
EXPORT_SYMBOL(rb_erase);
+void rb_insert_color_cached(struct rb_node *node,
+ struct rb_root_cached *root, bool leftmost)
+{
+ __rb_insert(node, &root->rb_root, leftmost,
+ &root->rb_leftmost, dummy_rotate);
+}
+EXPORT_SYMBOL(rb_insert_color_cached);
+
+void rb_erase_cached(struct rb_node *node, struct rb_root_cached *root)
+{
+ struct rb_node *rebalance;
+ rebalance = __rb_erase_augmented(node, &root->rb_root,
+ &root->rb_leftmost, &dummy_callbacks);
+ if (rebalance)
+ ____rb_erase_color(rebalance, &root->rb_root, dummy_rotate);
+}
+EXPORT_SYMBOL(rb_erase_cached);
+
/*
* Augmented rbtree manipulation functions.
*
@@ -438,9 +461,10 @@ EXPORT_SYMBOL(rb_erase);
*/
void __rb_insert_augmented(struct rb_node *node, struct rb_root *root,
+ bool newleft, struct rb_node **leftmost,
void (*augment_rotate)(struct rb_node *old, struct rb_node *new))
{
- __rb_insert(node, root, augment_rotate);
+ __rb_insert(node, root, newleft, leftmost, augment_rotate);
}
EXPORT_SYMBOL(__rb_insert_augmented);
@@ -485,7 +509,7 @@ struct rb_node *rb_next(const struct rb_node *node)
* as we can.
*/
if (node->rb_right) {
- node = node->rb_right;
+ node = node->rb_right;
while (node->rb_left)
node=node->rb_left;
return (struct rb_node *)node;
@@ -517,7 +541,7 @@ struct rb_node *rb_prev(const struct rb_node *node)
* as we can.
*/
if (node->rb_left) {
- node = node->rb_left;
+ node = node->rb_left;
while (node->rb_right)
node=node->rb_right;
return (struct rb_node *)node;
From: Paul Moore <paul(a)paul-moore.com>
commit ad5d07f4a9cd671233ae20983848874731102c08 upstream.
The current CIPSO and CALIPSO refcounting scheme for the DOI
definitions is a bit flawed in that we:
1. Don't correctly match gets/puts in netlbl_cipsov4_list().
2. Decrement the refcount on each attempt to remove the DOI from the
DOI list, only removing it from the list once the refcount drops
to zero.
This patch fixes these problems by adding the missing "puts" to
netlbl_cipsov4_list() and by introducing a more conventional, i.e.
not-buggy, refcounting mechanism for the DOI definitions. Upon
addition to the DOI list, a DOI is initialized with a refcount of
one; removing it from the list drops the refcount by one. "Gets" and
"puts" behave as expected, increasing and decreasing the DOI's
refcount by one.
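As a rough sketch of that conventional scheme (structure, function and lock
names here are illustrative, not the actual NetLabel code, and the RCU
deferral used by the real code is omitted):

#include <linux/list.h>
#include <linux/refcount.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/types.h>

/* Illustrative stand-in for a DOI definition; not the real structure. */
struct doi_entry {
	u32 doi;
	refcount_t refcount;
	struct list_head list;
};

static LIST_HEAD(doi_list);
static DEFINE_SPINLOCK(doi_list_lock);

/* Adding to the list hands the list its one "owning" reference. */
static void doi_add(struct doi_entry *entry)
{
	refcount_set(&entry->refcount, 1);
	spin_lock(&doi_list_lock);
	list_add(&entry->list, &doi_list);
	spin_unlock(&doi_list_lock);
}

/* Every successful get must be paired with exactly one put. */
static void doi_put(struct doi_entry *entry)
{
	if (refcount_dec_and_test(&entry->refcount))
		kfree(entry);
}

/* Removal drops only the list's reference; outstanding gets keep it alive. */
static void doi_remove(struct doi_entry *entry)
{
	spin_lock(&doi_list_lock);
	list_del(&entry->list);
	spin_unlock(&doi_list_lock);
	doi_put(entry);
}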
Fixes: b1edeb102397 ("netlabel: Replace protocol/NetLabel linking with refrerence counts")
Fixes: d7cce01504a0 ("netlabel: Add support for removing a CALIPSO DOI.")
Reported-by: syzbot+9ec037722d2603a9f52e(a)syzkaller.appspotmail.com
Signed-off-by: Paul Moore <paul(a)paul-moore.com>
Signed-off-by: David S. Miller <davem(a)davemloft.net>
[bwh: Backported to 4.9: adjust context]
Signed-off-by: Ben Hutchings <ben(a)decadent.org.uk>
---
net/ipv4/cipso_ipv4.c | 11 +----------
net/ipv6/calipso.c | 14 +++++---------
net/netlabel/netlabel_cipso_v4.c | 3 +++
3 files changed, 9 insertions(+), 19 deletions(-)
diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c
index 553cda6f887a..b7dc20a65b64 100644
--- a/net/ipv4/cipso_ipv4.c
+++ b/net/ipv4/cipso_ipv4.c
@@ -534,16 +534,10 @@ int cipso_v4_doi_remove(u32 doi, struct netlbl_audit *audit_info)
ret_val = -ENOENT;
goto doi_remove_return;
}
- if (!atomic_dec_and_test(&doi_def->refcount)) {
- spin_unlock(&cipso_v4_doi_list_lock);
- ret_val = -EBUSY;
- goto doi_remove_return;
- }
list_del_rcu(&doi_def->list);
spin_unlock(&cipso_v4_doi_list_lock);
- cipso_v4_cache_invalidate();
- call_rcu(&doi_def->rcu, cipso_v4_doi_free_rcu);
+ cipso_v4_doi_putdef(doi_def);
ret_val = 0;
doi_remove_return:
@@ -600,9 +594,6 @@ void cipso_v4_doi_putdef(struct cipso_v4_doi *doi_def)
if (!atomic_dec_and_test(&doi_def->refcount))
return;
- spin_lock(&cipso_v4_doi_list_lock);
- list_del_rcu(&doi_def->list);
- spin_unlock(&cipso_v4_doi_list_lock);
cipso_v4_cache_invalidate();
call_rcu(&doi_def->rcu, cipso_v4_doi_free_rcu);
diff --git a/net/ipv6/calipso.c b/net/ipv6/calipso.c
index b206415bbde7..7628963ddacc 100644
--- a/net/ipv6/calipso.c
+++ b/net/ipv6/calipso.c
@@ -97,6 +97,9 @@ struct calipso_map_cache_entry {
static struct calipso_map_cache_bkt *calipso_cache;
+static void calipso_cache_invalidate(void);
+static void calipso_doi_putdef(struct calipso_doi *doi_def);
+
/* Label Mapping Cache Functions
*/
@@ -458,15 +461,10 @@ static int calipso_doi_remove(u32 doi, struct netlbl_audit *audit_info)
ret_val = -ENOENT;
goto doi_remove_return;
}
- if (!atomic_dec_and_test(&doi_def->refcount)) {
- spin_unlock(&calipso_doi_list_lock);
- ret_val = -EBUSY;
- goto doi_remove_return;
- }
list_del_rcu(&doi_def->list);
spin_unlock(&calipso_doi_list_lock);
- call_rcu(&doi_def->rcu, calipso_doi_free_rcu);
+ calipso_doi_putdef(doi_def);
ret_val = 0;
doi_remove_return:
@@ -522,10 +520,8 @@ static void calipso_doi_putdef(struct calipso_doi *doi_def)
if (!atomic_dec_and_test(&doi_def->refcount))
return;
- spin_lock(&calipso_doi_list_lock);
- list_del_rcu(&doi_def->list);
- spin_unlock(&calipso_doi_list_lock);
+ calipso_cache_invalidate();
call_rcu(&doi_def->rcu, calipso_doi_free_rcu);
}
diff --git a/net/netlabel/netlabel_cipso_v4.c b/net/netlabel/netlabel_cipso_v4.c
index 422fac2a4a3c..9a256d0fb957 100644
--- a/net/netlabel/netlabel_cipso_v4.c
+++ b/net/netlabel/netlabel_cipso_v4.c
@@ -587,6 +587,7 @@ static int netlbl_cipsov4_list(struct sk_buff *skb, struct genl_info *info)
break;
}
+ cipso_v4_doi_putdef(doi_def);
rcu_read_unlock();
genlmsg_end(ans_skb, data);
@@ -595,12 +596,14 @@ static int netlbl_cipsov4_list(struct sk_buff *skb, struct genl_info *info)
list_retry:
/* XXX - this limit is a guesstimate */
if (nlsze_mult < 4) {
+ cipso_v4_doi_putdef(doi_def);
rcu_read_unlock();
kfree_skb(ans_skb);
nlsze_mult *= 2;
goto list_start;
}
list_failure_lock:
+ cipso_v4_doi_putdef(doi_def);
rcu_read_unlock();
list_failure:
kfree_skb(ans_skb);
Please pick the following commits for 4.9 and 4.14. They should apply
cleanly.
commit d903ec77118c09f93a610b384d83a6df33a64fe6
Author: Andy Spencer <aspencer(a)spacex.com>
Date: Thu Feb 22 11:05:33 2018 -0800
gianfar: simplify FCS handling and fix memory leak
commit d8861bab48b6c1fc3cdbcab8ff9d1eaea43afe7f
Author: Michael Braun <michael-dev(a)fami-braun.de>
Date: Thu Mar 4 20:52:52 2021 +0100
gianfar: fix jumbo packets+napi+rx overrun crash
Ben.
--
Ben Hutchings
Teamwork is essential - it allows you to blame someone else.
Hot-unplug all firmware-framebuffer devices as part of removing
them via remove_conflicting_framebuffers() et al. This releases all
memory regions so that they can be acquired by native drivers.
Firmware, such as EFI, installs a framebuffer while posting the
computer. After the firmware-framebuffer device has been removed from
fbdev, a native driver takes over the hardware and the firmware
framebuffer becomes invalid.
Firmware-framebuffer drivers, specifically simplefb, don't release
their device from Linux's device hierarchy. The device still owns the
firmware framebuffer and blocks the native drivers from loading. This
has been observed with the vmwgfx driver. [1]
Initiating a device removal (i.e., hot unplug) as part of
remove_conflicting_framebuffers() removes the underlying device and
returns the memory range to the system.
[1] https://lore.kernel.org/dri-devel/20220117180359.18114-1-zack@kde.org/
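For reference, this is roughly how a native driver kicks out a conflicting
firmware framebuffer at probe time; the "mydrm" name and the base/size values
are placeholders, not code from any in-tree driver:

#include <linux/fb.h>
#include <linux/slab.h>

static int mydrm_kick_out_firmware_fb(resource_size_t base,
				      resource_size_t size)
{
	struct apertures_struct *ap;
	int ret;

	ap = alloc_apertures(1);
	if (!ap)
		return -ENOMEM;

	ap->ranges[0].base = base;
	ap->ranges[0].size = size;

	/* With this patch, this also hot-unplugs e.g. simple-framebuffer. */
	ret = remove_conflicting_framebuffers(ap, "mydrm", false);
	kfree(ap);

	return ret;
}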
Signed-off-by: Thomas Zimmermann <tzimmermann(a)suse.de>
CC: stable(a)vger.kernel.org # v5.11+
---
drivers/video/fbdev/core/fbmem.c | 29 ++++++++++++++++++++++++++---
include/linux/fb.h | 1 +
2 files changed, 27 insertions(+), 3 deletions(-)
diff --git a/drivers/video/fbdev/core/fbmem.c b/drivers/video/fbdev/core/fbmem.c
index 0fa7ede94fa6..f73f8415b8cb 100644
--- a/drivers/video/fbdev/core/fbmem.c
+++ b/drivers/video/fbdev/core/fbmem.c
@@ -25,6 +25,7 @@
#include <linux/init.h>
#include <linux/linux_logo.h>
#include <linux/proc_fs.h>
+#include <linux/platform_device.h>
#include <linux/seq_file.h>
#include <linux/console.h>
#include <linux/kmod.h>
@@ -1557,18 +1558,36 @@ static void do_remove_conflicting_framebuffers(struct apertures_struct *a,
/* check all firmware fbs and kick off if the base addr overlaps */
for_each_registered_fb(i) {
struct apertures_struct *gen_aper;
+ struct device *dev;
if (!(registered_fb[i]->flags & FBINFO_MISC_FIRMWARE))
continue;
gen_aper = registered_fb[i]->apertures;
+ dev = registered_fb[i]->device;
if (fb_do_apertures_overlap(gen_aper, a) ||
(primary && gen_aper && gen_aper->count &&
gen_aper->ranges[0].base == VGA_FB_PHYS)) {
printk(KERN_INFO "fb%d: switching to %s from %s\n",
i, name, registered_fb[i]->fix.id);
- do_unregister_framebuffer(registered_fb[i]);
+
+ /*
+ * If we kick-out a firmware driver, we also want to remove
+ * the underlying platform device, such as simple-framebuffer,
+ * VESA, EFI, etc. A native driver will then be able to
+ * allocate the memory range.
+ *
+ * If it's not a platform device, at least print a warning. A
+ * fix would add code to remove the device from the system.
+ */
+ if (dev_is_platform(dev)) {
+ registered_fb[i]->forced_out = true;
+ platform_device_unregister(to_platform_device(dev));
+ } else {
+ pr_warn("fb%d: cannot remove device\n", i);
+ do_unregister_framebuffer(registered_fb[i]);
+ }
}
}
}
@@ -1898,9 +1917,13 @@ EXPORT_SYMBOL(register_framebuffer);
void
unregister_framebuffer(struct fb_info *fb_info)
{
- mutex_lock(&registration_lock);
+ bool forced_out = fb_info->forced_out;
+
+ if (!forced_out)
+ mutex_lock(&registration_lock);
 do_unregister_framebuffer(fb_info);
- mutex_unlock(&registration_lock);
+ if (!forced_out)
+ mutex_unlock(&registration_lock);
}
EXPORT_SYMBOL(unregister_framebuffer);
diff --git a/include/linux/fb.h b/include/linux/fb.h
index 3da95842b207..9a14f3f8a329 100644
--- a/include/linux/fb.h
+++ b/include/linux/fb.h
@@ -502,6 +502,7 @@ struct fb_info {
} *apertures;
bool skip_vt_switch; /* no VT switch on suspend/resume required */
+ bool forced_out; /* set when being removed by another driver */
};
static inline struct apertures_struct *alloc_apertures(unsigned int max_num) {
--
2.34.1
From: Dave Airlie <airlied(a)redhat.com>
commit 5de5b6ecf97a021f29403aa272cb4e03318ef586 upstream.
This is confusing, and from my reading of all the drivers only
nouveau got this right.
Just make the API act under driver control of its own allocation
failing, and don't call destroy; if the page table fails to be
created, there is nothing to clean up here.
(I'm willing to believe I've missed something here, so please
review deeply).
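Under the changed contract a driver backend would look roughly like this (a
hypothetical driver sketch against the 4.14-era TTM API, not any in-tree
code; the backend function table is only declared, not shown):

#include <drm/ttm/ttm_bo_driver.h>
#include <linux/slab.h>

struct mydrv_ttm_tt {
	struct ttm_tt ttm;
	/* driver-private state would live here */
};

/* The driver's ttm_backend_func table; its contents are not shown here. */
static struct ttm_backend_func mydrv_backend_func;

static struct ttm_tt *mydrv_ttm_tt_create(struct ttm_bo_device *bdev,
					  unsigned long size,
					  uint32_t page_flags,
					  struct page *dummy_read_page)
{
	struct mydrv_ttm_tt *tt;

	tt = kzalloc(sizeof(*tt), GFP_KERNEL);
	if (!tt)
		return NULL;

	tt->ttm.func = &mydrv_backend_func;

	/*
	 * With this change ttm_tt_init() no longer calls ttm_tt_destroy()
	 * on failure, so the driver frees only its own allocation and
	 * nothing else needs cleaning up.
	 */
	if (ttm_tt_init(&tt->ttm, bdev, size, page_flags, dummy_read_page)) {
		kfree(tt);
		return NULL;
	}

	return &tt->ttm;
}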
Reviewed-by: Christian König <christian.koenig(a)amd.com>
Signed-off-by: Dave Airlie <airlied(a)redhat.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20200728041736.20689-1-airlie…
[bwh: Backported to 4.14:
- Drop change in ttm_sg_tt_init()
- Adjust context]
Signed-off-by: Ben Hutchings <ben(a)decadent.org.uk>
---
drivers/gpu/drm/nouveau/nouveau_sgdma.c | 9 +++------
drivers/gpu/drm/ttm/ttm_tt.c | 2 --
2 files changed, 3 insertions(+), 8 deletions(-)
diff --git a/drivers/gpu/drm/nouveau/nouveau_sgdma.c b/drivers/gpu/drm/nouveau/nouveau_sgdma.c
index fde11ce466e4..495c4043467e 100644
--- a/drivers/gpu/drm/nouveau/nouveau_sgdma.c
+++ b/drivers/gpu/drm/nouveau/nouveau_sgdma.c
@@ -106,12 +106,9 @@ nouveau_sgdma_create_ttm(struct ttm_bo_device *bdev,
else
nvbe->ttm.ttm.func = &nv50_sgdma_backend;
- if (ttm_dma_tt_init(&nvbe->ttm, bdev, size, page_flags, dummy_read_page))
- /*
- * A failing ttm_dma_tt_init() will call ttm_tt_destroy()
- * and thus our nouveau_sgdma_destroy() hook, so we don't need
- * to free nvbe here.
- */
+ if (ttm_dma_tt_init(&nvbe->ttm, bdev, size, page_flags, dummy_read_page)) {
+ kfree(nvbe);
return NULL;
+ }
return &nvbe->ttm.ttm;
}
diff --git a/drivers/gpu/drm/ttm/ttm_tt.c b/drivers/gpu/drm/ttm/ttm_tt.c
index 8ebc8d3560c3..fc8bdcc1541b 100644
--- a/drivers/gpu/drm/ttm/ttm_tt.c
+++ b/drivers/gpu/drm/ttm/ttm_tt.c
@@ -199,7 +199,6 @@ int ttm_tt_init(struct ttm_tt *ttm, struct ttm_bo_device *bdev,
ttm_tt_alloc_page_directory(ttm);
if (!ttm->pages) {
- ttm_tt_destroy(ttm);
pr_err("Failed allocating page table\n");
return -ENOMEM;
}
@@ -232,7 +231,6 @@ int ttm_dma_tt_init(struct ttm_dma_tt *ttm_dma, struct ttm_bo_device *bdev,
INIT_LIST_HEAD(&ttm_dma->pages_list);
ttm_dma_tt_alloc_page_directory(ttm_dma);
if (!ttm->pages) {
- ttm_tt_destroy(ttm);
pr_err("Failed allocating page table\n");
return -ENOMEM;
}
We don't currently validate that the values being set are within the
range we advertised to userspace as being valid; do so, and reject any
values that are out of range.
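The shape of the added check, sketched for a hypothetical put handler (the
min/max fields follow struct soc_mixer_control; this is not the actual
soc-ops code):

#include <sound/soc.h>

static int my_put_volsw(struct snd_kcontrol *kcontrol,
			struct snd_ctl_elem_value *ucontrol)
{
	struct soc_mixer_control *mc =
		(struct soc_mixer_control *)kcontrol->private_value;
	long val = ucontrol->value.integer.value[0];

	/* Reject anything outside the range advertised to userspace. */
	if (val < mc->min || val > mc->max)
		return -EINVAL;

	/* ... apply the now-validated value to the register ... */
	return 0;
}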
Signed-off-by: Mark Brown <broonie(a)kernel.org>
Cc: stable(a)vger.kernel.org
---
sound/soc/soc-ops.c | 2 ++
1 file changed, 2 insertions(+)
diff --git a/sound/soc/soc-ops.c b/sound/soc/soc-ops.c
index c31e63b27193..dc0e7c8d31f3 100644
--- a/sound/soc/soc-ops.c
+++ b/sound/soc/soc-ops.c
@@ -879,6 +879,8 @@ int snd_soc_put_xr_sx(struct snd_kcontrol *kcontrol,
long val = ucontrol->value.integer.value[0];
unsigned int i;
+ if (val < mc->min || val > mc->max)
+ return -EINVAL;
if (invert)
val = max - val;
val &= mask;
--
2.30.2
We don't currently validate that the values being set are within the
range we advertised to userspace as being valid; do so, and reject any
values that are out of range.
Signed-off-by: Mark Brown <broonie(a)kernel.org>
Cc: stable(a)vger.kernel.org
---
sound/soc/soc-ops.c | 9 ++++++++-
1 file changed, 8 insertions(+), 1 deletion(-)
diff --git a/sound/soc/soc-ops.c b/sound/soc/soc-ops.c
index fbe5d326b0f2..c31e63b27193 100644
--- a/sound/soc/soc-ops.c
+++ b/sound/soc/soc-ops.c
@@ -423,8 +423,15 @@ int snd_soc_put_volsw_sx(struct snd_kcontrol *kcontrol,
int err = 0;
unsigned int val, val_mask;
+ val = ucontrol->value.integer.value[0];
+ if (mc->platform_max && val > mc->platform_max)
+ return -EINVAL;
+ if (val > max - min)
+ return -EINVAL;
+ if (val < 0)
+ return -EINVAL;
val_mask = mask << shift;
- val = (ucontrol->value.integer.value[0] + min) & mask;
+ val = (val + min) & mask;
val = val << shift;
err = snd_soc_component_update_bits(component, reg, val_mask, val);
--
2.30.2
We don't currently validate that the values being set are within the
range we advertised to userspace as being valid; do so, and reject any
values that are out of range.
Signed-off-by: Mark Brown <broonie(a)kernel.org>
Cc: stable(a)vger.kernel.org
---
sound/soc/soc-ops.c | 18 ++++++++++++++++--
1 file changed, 16 insertions(+), 2 deletions(-)
diff --git a/sound/soc/soc-ops.c b/sound/soc/soc-ops.c
index 08eaa9ddf191..fbe5d326b0f2 100644
--- a/sound/soc/soc-ops.c
+++ b/sound/soc/soc-ops.c
@@ -316,13 +316,27 @@ int snd_soc_put_volsw(struct snd_kcontrol *kcontrol,
if (sign_bit)
mask = BIT(sign_bit + 1) - 1;
- val = ((ucontrol->value.integer.value[0] + min) & mask);
+ val = ucontrol->value.integer.value[0];
+ if (mc->platform_max && val > mc->platform_max)
+ return -EINVAL;
+ if (val > max - min)
+ return -EINVAL;
+ if (val < 0)
+ return -EINVAL;
+ val = (val + min) & mask;
if (invert)
val = max - val;
val_mask = mask << shift;
val = val << shift;
if (snd_soc_volsw_is_stereo(mc)) {
- val2 = ((ucontrol->value.integer.value[1] + min) & mask);
+ val2 = ucontrol->value.integer.value[1];
+ if (mc->platform_max && val2 > mc->platform_max)
+ return -EINVAL;
+ if (val2 > max - min)
+ return -EINVAL;
+ if (val2 < 0)
+ return -EINVAL;
+ val2 = (val2 + min) & mask;
if (invert)
val2 = max - val2;
if (reg == reg2) {
--
2.30.2
Mark start_backtrace() as notrace and NOKPROBE_SYMBOL because this
function is called from ftrace and lockdep to get the caller address
via return_address(). Since lockdep is used in kprobes, the function
should also be marked NOKPROBE_SYMBOL.
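For reference, the general pattern for keeping a helper out of both ftrace
and kprobes (the function name here is made up):

#include <linux/kprobes.h>

/* notrace keeps ftrace from instrumenting the function itself... */
static void notrace my_unwind_helper(unsigned long fp, unsigned long pc)
{
	/* ... frame walking would happen here ... */
}
/* ...and NOKPROBE_SYMBOL() adds it to the kprobes blacklist. */
NOKPROBE_SYMBOL(my_unwind_helper);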
Fixes: b07f3499661c ("arm64: stacktrace: Move start_backtrace() out of the header")
Cc: stable(a)vger.kernel.org
Signed-off-by: Masami Hiramatsu <mhiramat(a)kernel.org>
---
arch/arm64/kernel/stacktrace.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c
index 94f83cd44e50..b0f21677764d 100644
--- a/arch/arm64/kernel/stacktrace.c
+++ b/arch/arm64/kernel/stacktrace.c
@@ -33,7 +33,7 @@
*/
-void start_backtrace(struct stackframe *frame, unsigned long fp,
+void notrace start_backtrace(struct stackframe *frame, unsigned long fp,
unsigned long pc)
{
frame->fp = fp;
@@ -55,6 +55,7 @@ void start_backtrace(struct stackframe *frame, unsigned long fp,
frame->prev_fp = 0;
frame->prev_type = STACK_TYPE_UNKNOWN;
}
+NOKPROBE_SYMBOL(start_backtrace);
/*
* Unwind from one frame record (A) to the next frame record (B).
This reverts commit 77fa5e15c933a1ec812de61ad709c00aa51e96ae.
The upstream commit e792ff804f49720ce003b3e4c618b5d996256a18
depends on the generic kretprobe trampoline handler, which was
introduced by commit 66ada2ccae4e ("kprobes: Add generic kretprobe
trampoline handler"), but that commit is not ported to the stable
kernel because it is not part of a bugfix series.
So revert this commit to fix a build error.
NOTE: I keep commit a7fe2378454c ("ia64: kprobes: Fix to pass
correct trampoline address to the handler") in the tree; without the
original reverted commit it amounts to just a cleanup, but it is
still better to use the dereference_function_descriptor() macro
instead of accessing the descriptor's field directly.
Fixes: 77fa5e15c933 ("ia64: kprobes: Use generic kretprobe trampoline handler")
Reported-by: kernel test robot <lkp(a)intel.com>
Signed-off-by: Masami Hiramatsu <mhiramat(a)kernel.org>
---
Changes in v2:
- fix the lack of type casting for dereference_function_descriptor().
---
arch/ia64/kernel/kprobes.c | 78 ++++++++++++++++++++++++++++++++++++++++++--
1 file changed, 75 insertions(+), 3 deletions(-)
diff --git a/arch/ia64/kernel/kprobes.c b/arch/ia64/kernel/kprobes.c
index 8a223d0e4918..fa10d51f6217 100644
--- a/arch/ia64/kernel/kprobes.c
+++ b/arch/ia64/kernel/kprobes.c
@@ -396,10 +396,83 @@ static void kretprobe_trampoline(void)
{
}
+/*
+ * At this point the target function has been tricked into
+ * returning into our trampoline. Lookup the associated instance
+ * and then:
+ * - call the handler function
+ * - cleanup by marking the instance as unused
+ * - long jump back to the original return address
+ */
int __kprobes trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs)
{
- regs->cr_iip = __kretprobe_trampoline_handler(regs,
- dereference_function_descriptor(kretprobe_trampoline), NULL);
+ struct kretprobe_instance *ri = NULL;
+ struct hlist_head *head, empty_rp;
+ struct hlist_node *tmp;
+ unsigned long flags, orig_ret_address = 0;
+ unsigned long trampoline_address =
+ (unsigned long)dereference_function_descriptor(kretprobe_trampoline);
+
+ INIT_HLIST_HEAD(&empty_rp);
+ kretprobe_hash_lock(current, &head, &flags);
+
+ /*
+ * It is possible to have multiple instances associated with a given
+ * task either because an multiple functions in the call path
+ * have a return probe installed on them, and/or more than one return
+ * return probe was registered for a target function.
+ *
+ * We can handle this because:
+ * - instances are always inserted at the head of the list
+ * - when multiple return probes are registered for the same
+ * function, the first instance's ret_addr will point to the
+ * real return address, and all the rest will point to
+ * kretprobe_trampoline
+ */
+ hlist_for_each_entry_safe(ri, tmp, head, hlist) {
+ if (ri->task != current)
+ /* another task is sharing our hash bucket */
+ continue;
+
+ orig_ret_address = (unsigned long)ri->ret_addr;
+ if (orig_ret_address != trampoline_address)
+ /*
+ * This is the real return address. Any other
+ * instances associated with this task are for
+ * other calls deeper on the call stack
+ */
+ break;
+ }
+
+ regs->cr_iip = orig_ret_address;
+
+ hlist_for_each_entry_safe(ri, tmp, head, hlist) {
+ if (ri->task != current)
+ /* another task is sharing our hash bucket */
+ continue;
+
+ if (ri->rp && ri->rp->handler)
+ ri->rp->handler(ri, regs);
+
+ orig_ret_address = (unsigned long)ri->ret_addr;
+ recycle_rp_inst(ri, &empty_rp);
+
+ if (orig_ret_address != trampoline_address)
+ /*
+ * This is the real return address. Any other
+ * instances associated with this task are for
+ * other calls deeper on the call stack
+ */
+ break;
+ }
+ kretprobe_assert(ri, orig_ret_address, trampoline_address);
+
+ kretprobe_hash_unlock(current, &flags);
+
+ hlist_for_each_entry_safe(ri, tmp, &empty_rp, hlist) {
+ hlist_del(&ri->hlist);
+ kfree(ri);
+ }
/*
* By returning a non-zero value, we are telling
* kprobe_handler() that we don't want the post_handler
@@ -412,7 +485,6 @@ void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri,
struct pt_regs *regs)
{
ri->ret_addr = (kprobe_opcode_t *)regs->b0;
- ri->fp = NULL;
/* Replace the return addr with trampoline addr */
regs->b0 = (unsigned long)dereference_function_descriptor(kretprobe_trampoline);
The patch below does not apply to the 5.16-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 96da174024b9c63bd5d3358668d0bc12677be877 Mon Sep 17 00:00:00 2001
From: Ranjani Sridharan <ranjani.sridharan(a)linux.intel.com>
Date: Tue, 23 Nov 2021 19:16:06 +0200
Subject: [PATCH] ASoC: SOF: handle paused streams during system suspend
During system suspend, paused streams do not get suspended.
Therefore, we need to explicitly free these PCMs in the DSP
and free the associated DAPM widgets so that they can be set
up again during resume.
Fixes: 5fcdbb2d45df ("ASoC: SOF: Add support for dynamic pipelines")
Signed-off-by: Ranjani Sridharan <ranjani.sridharan(a)linux.intel.com>
Reviewed-by: Paul Olaru <paul.olaru(a)oss.nxp.com>
Reviewed-by: Bard Liao <bard.liao(a)intel.com>
Reviewed-by: Pierre-Louis Bossart <pierre-louis.bossart(a)linux.intel.com>
Signed-off-by: Kai Vehmanen <kai.vehmanen(a)linux.intel.com>
Link: https://lore.kernel.org/r/20211123171606.129350-3-kai.vehmanen@linux.intel.…
Signed-off-by: Mark Brown <broonie(a)kernel.org>
diff --git a/sound/soc/sof/pcm.c b/sound/soc/sof/pcm.c
index 31dd79b794f1..0ceb1a9cbf73 100644
--- a/sound/soc/sof/pcm.c
+++ b/sound/soc/sof/pcm.c
@@ -100,9 +100,8 @@ void snd_sof_pcm_period_elapsed(struct snd_pcm_substream *substream)
}
EXPORT_SYMBOL(snd_sof_pcm_period_elapsed);
-static int sof_pcm_dsp_pcm_free(struct snd_pcm_substream *substream,
- struct snd_sof_dev *sdev,
- struct snd_sof_pcm *spcm)
+int sof_pcm_dsp_pcm_free(struct snd_pcm_substream *substream, struct snd_sof_dev *sdev,
+ struct snd_sof_pcm *spcm)
{
struct sof_ipc_stream stream;
struct sof_ipc_reply reply;
diff --git a/sound/soc/sof/sof-audio.c b/sound/soc/sof/sof-audio.c
index f4e142ec0fbd..e00ce275052f 100644
--- a/sound/soc/sof/sof-audio.c
+++ b/sound/soc/sof/sof-audio.c
@@ -129,6 +129,14 @@ int sof_widget_free(struct snd_sof_dev *sdev, struct snd_sof_widget *swidget)
case snd_soc_dapm_buffer:
ipc_free.hdr.cmd |= SOF_IPC_TPLG_BUFFER_FREE;
break;
+ case snd_soc_dapm_dai_in:
+ case snd_soc_dapm_dai_out:
+ {
+ struct snd_sof_dai *dai = swidget->private;
+
+ dai->configured = false;
+ fallthrough;
+ }
default:
ipc_free.hdr.cmd |= SOF_IPC_TPLG_COMP_FREE;
break;
@@ -720,6 +728,55 @@ int sof_set_up_pipelines(struct snd_sof_dev *sdev, bool verify)
return 0;
}
+/*
+ * Free the PCM, its associated widgets and set the prepared flag to false for all PCMs that
+ * did not get suspended(ex: paused streams) so the widgets can be set up again during resume.
+ */
+static int sof_tear_down_left_over_pipelines(struct snd_sof_dev *sdev)
+{
+ struct snd_sof_widget *swidget;
+ struct snd_sof_pcm *spcm;
+ int dir, ret;
+
+ /*
+ * free all PCMs and their associated DAPM widgets if their connected DAPM widget
+ * list is not NULL. This should only be true for paused streams at this point.
+ * This is equivalent to the handling of FE DAI suspend trigger for running streams.
+ */
+ list_for_each_entry(spcm, &sdev->pcm_list, list)
+ for_each_pcm_streams(dir) {
+ struct snd_pcm_substream *substream = spcm->stream[dir].substream;
+
+ if (!substream || !substream->runtime)
+ continue;
+
+ if (spcm->stream[dir].list) {
+ ret = sof_pcm_dsp_pcm_free(substream, sdev, spcm);
+ if (ret < 0)
+ return ret;
+
+ ret = sof_widget_list_free(sdev, spcm, dir);
+ if (ret < 0) {
+ dev_err(sdev->dev, "failed to free widgets during suspend\n");
+ return ret;
+ }
+ }
+ }
+
+ /*
+ * free any left over DAI widgets. This is equivalent to the handling of suspend trigger
+ * for the BE DAI for running streams.
+ */
+ list_for_each_entry(swidget, &sdev->widget_list, list)
+ if (WIDGET_IS_DAI(swidget->id) && swidget->use_count == 1) {
+ ret = sof_widget_free(sdev, swidget);
+ if (ret < 0)
+ return ret;
+ }
+
+ return 0;
+}
+
/*
* For older firmware, this function doesn't free widgets for static pipelines during suspend.
* It only resets use_count for all widgets.
@@ -734,8 +791,8 @@ int sof_tear_down_pipelines(struct snd_sof_dev *sdev, bool verify)
/*
* This function is called during suspend and for one-time topology verification during
* first boot. In both cases, there is no need to protect swidget->use_count and
- * sroute->setup because during suspend all streams are suspended and during topology
- * loading the sound card unavailable to open PCMs.
+ * sroute->setup because during suspend all running streams are suspended and during
+ * topology loading the sound card unavailable to open PCMs.
*/
list_for_each_entry(swidget, &sdev->widget_list, list) {
if (swidget->dynamic_pipeline_widget)
@@ -754,6 +811,19 @@ int sof_tear_down_pipelines(struct snd_sof_dev *sdev, bool verify)
return ret;
}
+ /*
+ * Tear down all pipelines associated with PCMs that did not get suspended
+ * and unset the prepare flag so that they can be set up again during resume.
+ * Skip this step for older firmware.
+ */
+ if (!verify && v->abi_version >= SOF_ABI_VER(3, 19, 0)) {
+ ret = sof_tear_down_left_over_pipelines(sdev);
+ if (ret < 0) {
+ dev_err(sdev->dev, "failed to tear down paused pipelines\n");
+ return ret;
+ }
+ }
+
list_for_each_entry(sroute, &sdev->route_list, list)
sroute->setup = false;
diff --git a/sound/soc/sof/sof-audio.h b/sound/soc/sof/sof-audio.h
index 389d56ac3aba..1c4f59d34717 100644
--- a/sound/soc/sof/sof-audio.h
+++ b/sound/soc/sof/sof-audio.h
@@ -265,4 +265,6 @@ int sof_widget_free(struct snd_sof_dev *sdev, struct snd_sof_widget *swidget);
/* PCM */
int sof_widget_list_setup(struct snd_sof_dev *sdev, struct snd_sof_pcm *spcm, int dir);
int sof_widget_list_free(struct snd_sof_dev *sdev, struct snd_sof_pcm *spcm, int dir);
+int sof_pcm_dsp_pcm_free(struct snd_pcm_substream *substream, struct snd_sof_dev *sdev,
+ struct snd_sof_pcm *spcm);
#endif
The patch below does not apply to the 5.16-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 01429183f479c54c1b5d15453a8ce574ea43e525 Mon Sep 17 00:00:00 2001
From: Pierre-Louis Bossart <pierre-louis.bossart(a)linux.intel.com>
Date: Tue, 23 Nov 2021 19:16:04 +0200
Subject: [PATCH] ASoC: SOF: sof-audio: setup sched widgets during pipeline
complete step
Older firmware, prior to ABI 3.19, has a dependency where the scheduler
widgets need to be set up last. Moving the call to sof_widget_setup()
before the pipeline_complete() call also helps remove the need for the
'reverse' direction when walking through the widget list - this only
worked because of the topology macros, but the topology does not
require any particular order.
Fixes: 5fcdbb2d45df ("ASoC: SOF: Add support for dynamic pipelines")
Signed-off-by: Pierre-Louis Bossart <pierre-louis.bossart(a)linux.intel.com>
Reviewed-by: Ranjani Sridharan <ranjani.sridharan(a)linux.intel.com>
Signed-off-by: Kai Vehmanen <kai.vehmanen(a)linux.intel.com>
Link: https://lore.kernel.org/r/20211123171606.129350-1-kai.vehmanen@linux.intel.…
Signed-off-by: Mark Brown <broonie(a)kernel.org>
diff --git a/sound/soc/sof/sof-audio.c b/sound/soc/sof/sof-audio.c
index 0f2566f7c094..f4e142ec0fbd 100644
--- a/sound/soc/sof/sof-audio.c
+++ b/sound/soc/sof/sof-audio.c
@@ -637,16 +637,25 @@ const struct sof_ipc_pipe_new *snd_sof_pipeline_find(struct snd_sof_dev *sdev,
int sof_set_up_pipelines(struct snd_sof_dev *sdev, bool verify)
{
+ struct sof_ipc_fw_version *v = &sdev->fw_ready.version;
struct snd_sof_widget *swidget;
struct snd_sof_route *sroute;
int ret;
/* restore pipeline components */
- list_for_each_entry_reverse(swidget, &sdev->widget_list, list) {
+ list_for_each_entry(swidget, &sdev->widget_list, list) {
/* only set up the widgets belonging to static pipelines */
if (!verify && swidget->dynamic_pipeline_widget)
continue;
+ /*
+ * For older firmware, skip scheduler widgets in this loop,
+ * sof_widget_setup() will be called in the 'complete pipeline' loop
+ */
+ if (v->abi_version < SOF_ABI_VER(3, 19, 0) &&
+ swidget->id == snd_soc_dapm_scheduler)
+ continue;
+
/* update DAI config. The IPC will be sent in sof_widget_setup() */
if (WIDGET_IS_DAI(swidget->id)) {
struct snd_sof_dai *dai = swidget->private;
@@ -694,6 +703,12 @@ int sof_set_up_pipelines(struct snd_sof_dev *sdev, bool verify)
if (!verify && swidget->dynamic_pipeline_widget)
continue;
+ if (v->abi_version < SOF_ABI_VER(3, 19, 0)) {
+ ret = sof_widget_setup(sdev, swidget);
+ if (ret < 0)
+ return ret;
+ }
+
swidget->complete =
snd_sof_complete_pipeline(sdev, swidget);
break;
@@ -722,7 +737,7 @@ int sof_tear_down_pipelines(struct snd_sof_dev *sdev, bool verify)
* sroute->setup because during suspend all streams are suspended and during topology
* loading the sound card unavailable to open PCMs.
*/
- list_for_each_entry_reverse(swidget, &sdev->widget_list, list) {
+ list_for_each_entry(swidget, &sdev->widget_list, list) {
if (swidget->dynamic_pipeline_widget)
continue;
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From a674e48c5443d12a8a43c3ac42367aa39505d506 Mon Sep 17 00:00:00 2001
From: Baoquan He <bhe(a)redhat.com>
Date: Fri, 14 Jan 2022 14:07:41 -0800
Subject: [PATCH] dma/pool: create dma atomic pool only if dma zone has managed
pages
Currently three DMA atomic pools are initialized as long as the relevant
kernel code is built in. In an x86_64 kdump kernel, however, this is not
right when trying to create atomic_pool_dma, because there are no managed
pages in the DMA zone. In that case, the DMA zone only has the low 1M of
memory present, locked down by the memblock allocator, so no pages are
added to the buddy allocator for the DMA zone. Please check commit
f1d4d47c5851 ("x86/setup: Always reserve the first 1M of RAM").
The kdump kernel on x86_64 then always prints the failure message below:
DMA: preallocated 128 KiB GFP_KERNEL pool for atomic allocations
swapper/0: page allocation failure: order:5, mode:0xcc1(GFP_KERNEL|GFP_DMA), nodemask=(null),cpuset=/,mems_allowed=0
CPU: 0 PID: 1 Comm: swapper/0 Not tainted 5.13.0-0.rc5.20210611git929d931f2b40.42.fc35.x86_64 #1
Hardware name: Dell Inc. PowerEdge R910/0P658H, BIOS 2.12.0 06/04/2018
Call Trace:
dump_stack+0x7f/0xa1
warn_alloc.cold+0x72/0xd6
__alloc_pages_slowpath.constprop.0+0xf29/0xf50
__alloc_pages+0x24d/0x2c0
alloc_page_interleave+0x13/0xb0
atomic_pool_expand+0x118/0x210
__dma_atomic_pool_init+0x45/0x93
dma_atomic_pool_init+0xdb/0x176
do_one_initcall+0x67/0x320
kernel_init_freeable+0x290/0x2dc
kernel_init+0xa/0x111
ret_from_fork+0x22/0x30
Mem-Info:
......
DMA: failed to allocate 128 KiB GFP_KERNEL|GFP_DMA pool for atomic allocation
DMA: preallocated 128 KiB GFP_KERNEL|GFP_DMA32 pool for atomic allocations
Here, check whether the DMA zone has managed pages and create
atomic_pool_dma only if it does; otherwise just skip it.
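The check relies on a helper added earlier in the same series; a simplified
sketch of what such a helper looks like (modeled on the upstream
has_managed_dma(), with the CONFIG_ZONE_DMA guards omitted):

#include <linux/mmzone.h>

/* True if any online node has pages under buddy management in ZONE_DMA. */
bool has_managed_dma(void)
{
	struct pglist_data *pgdat;

	for_each_online_pgdat(pgdat) {
		struct zone *zone = &pgdat->node_zones[ZONE_DMA];

		if (managed_zone(zone))
			return true;
	}
	return false;
}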
Link: https://lkml.kernel.org/r/20211223094435.248523-3-bhe@redhat.com
Fixes: 6f599d84231f ("x86/kdump: Always reserve the low 1M when the crashkernel option is specified")
Signed-off-by: Baoquan He <bhe(a)redhat.com>
Reviewed-by: Christoph Hellwig <hch(a)lst.de>
Acked-by: John Donnelly <john.p.donnelly(a)oracle.com>
Reviewed-by: David Hildenbrand <david(a)redhat.com>
Cc: Marek Szyprowski <m.szyprowski(a)samsung.com>
Cc: Robin Murphy <robin.murphy(a)arm.com>
Cc: Borislav Petkov <bp(a)alien8.de>
Cc: Christoph Lameter <cl(a)linux.com>
Cc: David Laight <David.Laight(a)ACULAB.COM>
Cc: David Rientjes <rientjes(a)google.com>
Cc: Hyeonggon Yoo <42.hyeyoo(a)gmail.com>
Cc: Joonsoo Kim <iamjoonsoo.kim(a)lge.com>
Cc: Pekka Enberg <penberg(a)kernel.org>
Cc: Vlastimil Babka <vbabka(a)suse.cz>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds(a)linux-foundation.org>
diff --git a/kernel/dma/pool.c b/kernel/dma/pool.c
index 5f84e6cdb78e..4d40dcce7604 100644
--- a/kernel/dma/pool.c
+++ b/kernel/dma/pool.c
@@ -203,7 +203,7 @@ static int __init dma_atomic_pool_init(void)
GFP_KERNEL);
if (!atomic_pool_kernel)
ret = -ENOMEM;
- if (IS_ENABLED(CONFIG_ZONE_DMA)) {
+ if (has_managed_dma()) {
atomic_pool_dma = __dma_atomic_pool_init(atomic_pool_size,
GFP_KERNEL | GFP_DMA);
if (!atomic_pool_dma)
@@ -226,7 +226,7 @@ static inline struct gen_pool *dma_guess_pool(struct gen_pool *prev, gfp_t gfp)
if (prev == NULL) {
if (IS_ENABLED(CONFIG_ZONE_DMA32) && (gfp & GFP_DMA32))
return atomic_pool_dma32;
- if (IS_ENABLED(CONFIG_ZONE_DMA) && (gfp & GFP_DMA))
+ if (atomic_pool_dma && (gfp & GFP_DMA))
return atomic_pool_dma;
return atomic_pool_kernel;
}
This reverts commit 9bbd42e79720122334226afad9ddcac1c3e6d373, which
was commit 17839856fd588f4ab6b789f482ed3ffd7c403e1f upstream. The
backport was incorrect and incomplete:
* It forced the write flag on in the generic __get_user_pages_fast(),
whereas only get_user_pages_fast() was supposed to do that.
* It only fixed the generic RCU-based implementation used by arm,
arm64, and powerpc. Before Linux 4.13, several other architectures
had their own implementations: mips, s390, sparc, sh, and x86.
This will be followed by a (hopefully) correct backport.
Signed-off-by: Ben Hutchings <ben(a)decadent.org.uk>
Cc: Suren Baghdasaryan <surenb(a)google.com>
Cc: stable(a)vger.kernel.org
---
mm/gup.c | 48 ++++++++----------------------------------------
mm/huge_memory.c | 7 ++++---
2 files changed, 12 insertions(+), 43 deletions(-)
diff --git a/mm/gup.c b/mm/gup.c
index 301dd96ef176..6bb7a8eb7f82 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -61,22 +61,13 @@ static int follow_pfn_pte(struct vm_area_struct *vma, unsigned long address,
}
/*
- * FOLL_FORCE or a forced COW break can write even to unwritable pte's,
- * but only after we've gone through a COW cycle and they are dirty.
+ * FOLL_FORCE can write to even unwritable pte's, but only
+ * after we've gone through a COW cycle and they are dirty.
*/
static inline bool can_follow_write_pte(pte_t pte, unsigned int flags)
{
- return pte_write(pte) || ((flags & FOLL_COW) && pte_dirty(pte));
-}
-
-/*
- * A (separate) COW fault might break the page the other way and
- * get_user_pages() would return the page from what is now the wrong
- * VM. So we need to force a COW break at GUP time even for reads.
- */
-static inline bool should_force_cow_break(struct vm_area_struct *vma, unsigned int flags)
-{
- return is_cow_mapping(vma->vm_flags) && (flags & FOLL_GET);
+ return pte_write(pte) ||
+ ((flags & FOLL_FORCE) && (flags & FOLL_COW) && pte_dirty(pte));
}
static struct page *follow_page_pte(struct vm_area_struct *vma,
@@ -586,18 +577,12 @@ static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
if (!vma || check_vma_flags(vma, gup_flags))
return i ? : -EFAULT;
if (is_vm_hugetlb_page(vma)) {
- if (should_force_cow_break(vma, foll_flags))
- foll_flags |= FOLL_WRITE;
i = follow_hugetlb_page(mm, vma, pages, vmas,
&start, &nr_pages, i,
- foll_flags);
+ gup_flags);
continue;
}
}
-
- if (should_force_cow_break(vma, foll_flags))
- foll_flags |= FOLL_WRITE;
-
retry:
/*
* If we have a pending SIGKILL, don't keep faulting pages and
@@ -1518,10 +1503,6 @@ static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end,
/*
* Like get_user_pages_fast() except it's IRQ-safe in that it won't fall back to
* the regular GUP. It will only return non-negative values.
- *
- * Careful, careful! COW breaking can go either way, so a non-write
- * access can get ambiguous page results. If you call this function without
- * 'write' set, you'd better be sure that you're ok with that ambiguity.
*/
int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
struct page **pages)
@@ -1551,12 +1532,6 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
*
* We do not adopt an rcu_read_lock(.) here as we also want to
* block IPIs that come from THPs splitting.
- *
- * NOTE! We allow read-only gup_fast() here, but you'd better be
- * careful about possible COW pages. You'll get _a_ COW page, but
- * not necessarily the one you intended to get depending on what
- * COW event happens after this. COW may break the page copy in a
- * random direction.
*/
local_irq_save(flags);
@@ -1567,22 +1542,15 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
next = pgd_addr_end(addr, end);
if (pgd_none(pgd))
break;
- /*
- * The FAST_GUP case requires FOLL_WRITE even for pure reads,
- * because get_user_pages() may need to cause an early COW in
- * order to avoid confusing the normal COW routines. So only
- * targets that are already writable are safe to do by just
- * looking at the page tables.
- */
if (unlikely(pgd_huge(pgd))) {
- if (!gup_huge_pgd(pgd, pgdp, addr, next, 1,
+ if (!gup_huge_pgd(pgd, pgdp, addr, next, write,
pages, &nr))
break;
} else if (unlikely(is_hugepd(__hugepd(pgd_val(pgd))))) {
if (!gup_huge_pd(__hugepd(pgd_val(pgd)), addr,
- PGDIR_SHIFT, next, 1, pages, &nr))
+ PGDIR_SHIFT, next, write, pages, &nr))
break;
- } else if (!gup_pud_range(pgd, addr, next, 1, pages, &nr))
+ } else if (!gup_pud_range(pgd, addr, next, write, pages, &nr))
break;
} while (pgdp++, addr = next, addr != end);
local_irq_restore(flags);
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 3f3a86cc62b6..91f33bb43f17 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1135,12 +1135,13 @@ int do_huge_pmd_wp_page(struct fault_env *fe, pmd_t orig_pmd)
}
/*
- * FOLL_FORCE or a forced COW break can write even to unwritable pmd's,
- * but only after we've gone through a COW cycle and they are dirty.
+ * FOLL_FORCE can write to even unwritable pmd's, but only
+ * after we've gone through a COW cycle and they are dirty.
*/
static inline bool can_follow_write_pmd(pmd_t pmd, unsigned int flags)
{
- return pmd_write(pmd) || ((flags & FOLL_COW) && pmd_dirty(pmd));
+ return pmd_write(pmd) ||
+ ((flags & FOLL_FORCE) && (flags & FOLL_COW) && pmd_dirty(pmd));
}
struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
This is a backport of the recently merged "[PATCH v3 0/4] KVM: x86:
Partially allow KVM_SET_CPUID{,2} after KVM_RUN for CPU hotplug"
(https://lore.kernel.org/kvm/20220118141801.2219924-1-vkuznets@redhat.com/)
Original description:
Recently, KVM made it illegal to change CPUID after KVM_RUN, but
unfortunately this change is not fully compatible with existing VMMs.
In particular, QEMU reuses vCPU fds for CPU hotplug after unplug and
calls KVM_SET_CPUID2 on them. Relax the requirement by allowing
KVM_SET_CPUID{,2} with the exact same data.
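The heart of the relaxation is an entry-by-entry comparison of the new CPUID
data against what is already set; a simplified sketch of the upstream
kvm_cpuid_check_equal() helper (flags/padding handling omitted):

#include <linux/kvm_host.h>

static int cpuid_entries_equal(struct kvm_vcpu *vcpu,
			       struct kvm_cpuid_entry2 *e2, int nent)
{
	struct kvm_cpuid_entry2 *orig;
	int i;

	/* A different number of entries can never be "the same data". */
	if (nent != vcpu->arch.cpuid_nent)
		return -EINVAL;

	for (i = 0; i < nent; i++) {
		orig = &vcpu->arch.cpuid_entries[i];
		if (e2[i].function != orig->function ||
		    e2[i].index != orig->index ||
		    e2[i].eax != orig->eax || e2[i].ebx != orig->ebx ||
		    e2[i].ecx != orig->ecx || e2[i].edx != orig->edx)
			return -EINVAL;
	}

	return 0;
}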
Vitaly Kuznetsov (4):
KVM: x86: Do runtime CPUID update before updating
vcpu->arch.cpuid_entries
KVM: x86: Partially allow KVM_SET_CPUID{,2} after KVM_RUN
KVM: selftests: Rename 'get_cpuid_test' to 'cpuid_test'
KVM: selftests: Test KVM_SET_CPUID2 after KVM_RUN
arch/x86/kvm/cpuid.c | 90 ++++++++++++++-----
arch/x86/kvm/x86.c | 19 ----
tools/testing/selftests/kvm/.gitignore | 2 +-
tools/testing/selftests/kvm/Makefile | 4 +-
.../selftests/kvm/include/x86_64/processor.h | 7 ++
.../selftests/kvm/lib/x86_64/processor.c | 33 ++++++-
.../x86_64/{get_cpuid_test.c => cpuid_test.c} | 30 +++++++
7 files changed, 139 insertions(+), 46 deletions(-)
rename tools/testing/selftests/kvm/x86_64/{get_cpuid_test.c => cpuid_test.c} (83%)
--
2.34.1
Hi stable maintainers,
On 06.04.21 03:47, Yoshio Furuyama wrote:
> From: "Doyle, Patrick" <pdoyle(a)irobot.com>
>
> In the unlikely event that both blocks 10 and 11 are marked as bad (on a
> 32 bit machine), then the process of marking block 10 as bad stomps on
> cached entry for block 11. There are (of course) other examples.
>
> Signed-off-by: Patrick Doyle <pdoyle(a)irobot.com>
> Reviewed-by: Richard Weinberger <richard(a)nod.at>
We have systems on which this patch fixes real failures. Could you
please add the upstream patch fd0d8d85f723 ("mtd: nand: bbt: Fix corner
case in bad block table handling") to the stable queues for 4.19, 5.4, 5.10?
Thanks!
Cc: stable(a)vger.kernel.org
Fixes: 9c3736a3de21 ("mtd: nand: Add core infrastructure to deal with
NAND devices")
> ---
> drivers/mtd/nand/bbt.c | 2 +-
> 1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/drivers/mtd/nand/bbt.c b/drivers/mtd/nand/bbt.c
> index 044adf913854..64af6898131d 100644
> --- a/drivers/mtd/nand/bbt.c
> +++ b/drivers/mtd/nand/bbt.c
> @@ -123,7 +123,7 @@ int nanddev_bbt_set_block_status(struct nand_device *nand, unsigned int entry,
> unsigned int rbits = bits_per_block + offs - BITS_PER_LONG;
>
> pos[1] &= ~GENMASK(rbits - 1, 0);
> - pos[1] |= val >> rbits;
> + pos[1] |= val >> (bits_per_block - rbits);
> }
>
> return 0;
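To see the corner case with concrete numbers, here is a small userspace
sketch; it assumes the generic BBT's 3 status bits per block and 32-bit
longs, which is what puts blocks 10 and 11 across a word boundary, and the
status value used is only an example:

#include <stdio.h>

int main(void)
{
	/* Assumed layout: 3 status bits per block, packed into 32-bit words.
	 * Block 10 then starts at bit offset 30 and spills
	 * rbits = 3 + 30 - 32 = 1 bit into the next word. */
	unsigned int bits_per_block = 3, offs = 30, rbits;
	unsigned int val = 0x4;	/* e.g. a "factory bad" status value */

	rbits = bits_per_block + offs - 32;

	/* Old code: shifts by rbits, so two of val's bits land in word 1
	 * and stomp on block 11's entry, which starts at bit 1 there. */
	printf("buggy spill into word 1: 0x%x\n", val >> rbits);

	/* Fixed code: only the top rbits bits of val belong in word 1. */
	printf("fixed spill into word 1: 0x%x\n",
	       val >> (bits_per_block - rbits));

	return 0;
}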
The patch below does not apply to the 5.16-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 802d4d207e75d7208ff75adb712b556c1e91cf1c Mon Sep 17 00:00:00 2001
From: Manish Chopra <manishc(a)marvell.com>
Date: Fri, 17 Dec 2021 08:55:52 -0800
Subject: [PATCH] bnx2x: Invalidate fastpath HSI version for VFs
Commit 0a6890b9b4df ("bnx2x: Utilize FW 7.13.15.0.")
added validation of fastpath HSI versions for the different client
init paths. This was not meant for SR-IOV VF clients and resulted in
firmware asserts when running VF clients with a different fastpath
HSI version.
This patch, along with the new firmware support in patch #1, fixes
this behavior so that the fastpath HSI version is not validated for
the VFs.
Fixes: 0a6890b9b4df ("bnx2x: Utilize FW 7.13.15.0.")
Signed-off-by: Manish Chopra <manishc(a)marvell.com>
Signed-off-by: Prabhakar Kushwaha <pkushwaha(a)marvell.com>
Signed-off-by: Alok Prasad <palok(a)marvell.com>
Signed-off-by: Ariel Elior <aelior(a)marvell.com>
Signed-off-by: David S. Miller <davem(a)davemloft.net>
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sriov.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sriov.c
index 74a8931ce1d1..11d15cd03600 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sriov.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sriov.c
@@ -758,9 +758,18 @@ static void bnx2x_vf_igu_reset(struct bnx2x *bp, struct bnx2x_virtf *vf)
void bnx2x_vf_enable_access(struct bnx2x *bp, u8 abs_vfid)
{
+ u16 abs_fid;
+
+ abs_fid = FW_VF_HANDLE(abs_vfid);
+
/* set the VF-PF association in the FW */
- storm_memset_vf_to_pf(bp, FW_VF_HANDLE(abs_vfid), BP_FUNC(bp));
- storm_memset_func_en(bp, FW_VF_HANDLE(abs_vfid), 1);
+ storm_memset_vf_to_pf(bp, abs_fid, BP_FUNC(bp));
+ storm_memset_func_en(bp, abs_fid, 1);
+
+ /* Invalidate fp_hsi version for vfs */
+ if (bp->fw_cap & FW_CAP_INVALIDATE_VF_FP_HSI)
+ REG_WR8(bp, BAR_XSTRORM_INTMEM +
+ XSTORM_ETH_FUNCTION_INFO_FP_HSI_VALID_E2_OFFSET(abs_fid), 0);
/* clear vf errors*/
bnx2x_vf_semi_clear_err(bp, abs_vfid);
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 802d4d207e75d7208ff75adb712b556c1e91cf1c Mon Sep 17 00:00:00 2001
From: Manish Chopra <manishc(a)marvell.com>
Date: Fri, 17 Dec 2021 08:55:52 -0800
Subject: [PATCH] bnx2x: Invalidate fastpath HSI version for VFs
Commit 0a6890b9b4df ("bnx2x: Utilize FW 7.13.15.0.")
added validation of fastpath HSI versions for the different client
init paths. This was not meant for SR-IOV VF clients and resulted in
firmware asserts when running VF clients with a different fastpath
HSI version.
This patch, along with the new firmware support in patch #1, fixes
this behavior so that the fastpath HSI version is not validated for
the VFs.
Fixes: 0a6890b9b4df ("bnx2x: Utilize FW 7.13.15.0.")
Signed-off-by: Manish Chopra <manishc(a)marvell.com>
Signed-off-by: Prabhakar Kushwaha <pkushwaha(a)marvell.com>
Signed-off-by: Alok Prasad <palok(a)marvell.com>
Signed-off-by: Ariel Elior <aelior(a)marvell.com>
Signed-off-by: David S. Miller <davem(a)davemloft.net>
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sriov.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sriov.c
index 74a8931ce1d1..11d15cd03600 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sriov.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sriov.c
@@ -758,9 +758,18 @@ static void bnx2x_vf_igu_reset(struct bnx2x *bp, struct bnx2x_virtf *vf)
void bnx2x_vf_enable_access(struct bnx2x *bp, u8 abs_vfid)
{
+ u16 abs_fid;
+
+ abs_fid = FW_VF_HANDLE(abs_vfid);
+
/* set the VF-PF association in the FW */
- storm_memset_vf_to_pf(bp, FW_VF_HANDLE(abs_vfid), BP_FUNC(bp));
- storm_memset_func_en(bp, FW_VF_HANDLE(abs_vfid), 1);
+ storm_memset_vf_to_pf(bp, abs_fid, BP_FUNC(bp));
+ storm_memset_func_en(bp, abs_fid, 1);
+
+ /* Invalidate fp_hsi version for vfs */
+ if (bp->fw_cap & FW_CAP_INVALIDATE_VF_FP_HSI)
+ REG_WR8(bp, BAR_XSTRORM_INTMEM +
+ XSTORM_ETH_FUNCTION_INFO_FP_HSI_VALID_E2_OFFSET(abs_fid), 0);
/* clear vf errors*/
bnx2x_vf_semi_clear_err(bp, abs_vfid);
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 87c01d57fa23de82fff593a7d070933d08755801 Mon Sep 17 00:00:00 2001
From: Alistair Popple <apopple(a)nvidia.com>
Date: Fri, 14 Jan 2022 14:09:31 -0800
Subject: [PATCH] mm/hmm.c: allow VM_MIXEDMAP to work with hmm_range_fault
hmm_range_fault() can be used instead of get_user_pages() for devices
which allow faulting; however, unlike get_user_pages(), it will return
an error when used on a VM_MIXEDMAP range.
To make hmm_range_fault() more closely match get_user_pages(), remove
this restriction. This requires dealing with the !ARCH_HAS_PTE_SPECIAL
case in hmm_vma_handle_pte(). Rather than replicating the logic of
vm_normal_page(), call it directly and check for the zero pfn,
similar to what get_user_pages() currently does.
Also add a test to hmm selftest to verify functionality.
Link: https://lkml.kernel.org/r/20211104012001.2555676-1-apopple@nvidia.com
Fixes: da4c3c735ea4 ("mm/hmm/mirror: helper to snapshot CPU page table")
Signed-off-by: Alistair Popple <apopple(a)nvidia.com>
Reviewed-by: Jason Gunthorpe <jgg(a)nvidia.com>
Cc: Jerome Glisse <jglisse(a)redhat.com>
Cc: John Hubbard <jhubbard(a)nvidia.com>
Cc: Zi Yan <ziy(a)nvidia.com>
Cc: Ralph Campbell <rcampbell(a)nvidia.com>
Cc: Felix Kuehling <Felix.Kuehling(a)amd.com>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds(a)linux-foundation.org>
diff --git a/lib/test_hmm.c b/lib/test_hmm.c
index e2ce8f9b7605..767538089a62 100644
--- a/lib/test_hmm.c
+++ b/lib/test_hmm.c
@@ -1086,9 +1086,33 @@ static long dmirror_fops_unlocked_ioctl(struct file *filp,
return 0;
}
+static int dmirror_fops_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ unsigned long addr;
+
+ for (addr = vma->vm_start; addr < vma->vm_end; addr += PAGE_SIZE) {
+ struct page *page;
+ int ret;
+
+ page = alloc_page(GFP_KERNEL | __GFP_ZERO);
+ if (!page)
+ return -ENOMEM;
+
+ ret = vm_insert_page(vma, addr, page);
+ if (ret) {
+ __free_page(page);
+ return ret;
+ }
+ put_page(page);
+ }
+
+ return 0;
+}
+
static const struct file_operations dmirror_fops = {
.open = dmirror_fops_open,
.release = dmirror_fops_release,
+ .mmap = dmirror_fops_mmap,
.unlocked_ioctl = dmirror_fops_unlocked_ioctl,
.llseek = default_llseek,
.owner = THIS_MODULE,
diff --git a/mm/hmm.c b/mm/hmm.c
index 842e26599238..bd56641c79d4 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -300,7 +300,8 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
* Since each architecture defines a struct page for the zero page, just
* fall through and treat it like a normal page.
*/
- if (pte_special(pte) && !pte_devmap(pte) &&
+ if (!vm_normal_page(walk->vma, addr, pte) &&
+ !pte_devmap(pte) &&
!is_zero_pfn(pte_pfn(pte))) {
if (hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, 0)) {
pte_unmap(ptep);
@@ -518,7 +519,7 @@ static int hmm_vma_walk_test(unsigned long start, unsigned long end,
struct hmm_range *range = hmm_vma_walk->range;
struct vm_area_struct *vma = walk->vma;
- if (!(vma->vm_flags & (VM_IO | VM_PFNMAP | VM_MIXEDMAP)) &&
+ if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)) &&
vma->vm_flags & VM_READ)
return 0;
diff --git a/tools/testing/selftests/vm/hmm-tests.c b/tools/testing/selftests/vm/hmm-tests.c
index 864f126ffd78..203323967b50 100644
--- a/tools/testing/selftests/vm/hmm-tests.c
+++ b/tools/testing/selftests/vm/hmm-tests.c
@@ -1248,6 +1248,48 @@ TEST_F(hmm, anon_teardown)
}
}
+/*
+ * Test memory snapshot without faulting in pages accessed by the device.
+ */
+TEST_F(hmm, mixedmap)
+{
+ struct hmm_buffer *buffer;
+ unsigned long npages;
+ unsigned long size;
+ unsigned char *m;
+ int ret;
+
+ npages = 1;
+ size = npages << self->page_shift;
+
+ buffer = malloc(sizeof(*buffer));
+ ASSERT_NE(buffer, NULL);
+
+ buffer->fd = -1;
+ buffer->size = size;
+ buffer->mirror = malloc(npages);
+ ASSERT_NE(buffer->mirror, NULL);
+
+
+ /* Reserve a range of addresses. */
+ buffer->ptr = mmap(NULL, size,
+ PROT_READ | PROT_WRITE,
+ MAP_PRIVATE,
+ self->fd, 0);
+ ASSERT_NE(buffer->ptr, MAP_FAILED);
+
+ /* Simulate a device snapshotting CPU pagetables. */
+ ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_SNAPSHOT, buffer, npages);
+ ASSERT_EQ(ret, 0);
+ ASSERT_EQ(buffer->cpages, npages);
+
+ /* Check what the device saw. */
+ m = buffer->mirror;
+ ASSERT_EQ(m[0], HMM_DMIRROR_PROT_READ);
+
+ hmm_buffer_free(buffer);
+}
+
/*
* Test memory snapshot without faulting in pages accessed by the device.
*/
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 87c01d57fa23de82fff593a7d070933d08755801 Mon Sep 17 00:00:00 2001
From: Alistair Popple <apopple(a)nvidia.com>
Date: Fri, 14 Jan 2022 14:09:31 -0800
Subject: [PATCH] mm/hmm.c: allow VM_MIXEDMAP to work with hmm_range_fault
hmm_range_fault() can be used instead of get_user_pages() for devices
which allow faulting; however, unlike get_user_pages(), it will return
an error when used on a VM_MIXEDMAP range.
To make hmm_range_fault() more closely match get_user_pages(), remove
this restriction. This requires dealing with the !ARCH_HAS_PTE_SPECIAL
case in hmm_vma_handle_pte(). Rather than replicating the logic of
vm_normal_page(), call it directly and check for the zero pfn, similar
to what get_user_pages() currently does.
Also add a test to hmm selftest to verify functionality.
Link: https://lkml.kernel.org/r/20211104012001.2555676-1-apopple@nvidia.com
Fixes: da4c3c735ea4 ("mm/hmm/mirror: helper to snapshot CPU page table")
Signed-off-by: Alistair Popple <apopple(a)nvidia.com>
Reviewed-by: Jason Gunthorpe <jgg(a)nvidia.com>
Cc: Jerome Glisse <jglisse(a)redhat.com>
Cc: John Hubbard <jhubbard(a)nvidia.com>
Cc: Zi Yan <ziy(a)nvidia.com>
Cc: Ralph Campbell <rcampbell(a)nvidia.com>
Cc: Felix Kuehling <Felix.Kuehling(a)amd.com>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds(a)linux-foundation.org>
diff --git a/lib/test_hmm.c b/lib/test_hmm.c
index e2ce8f9b7605..767538089a62 100644
--- a/lib/test_hmm.c
+++ b/lib/test_hmm.c
@@ -1086,9 +1086,33 @@ static long dmirror_fops_unlocked_ioctl(struct file *filp,
return 0;
}
+static int dmirror_fops_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ unsigned long addr;
+
+ for (addr = vma->vm_start; addr < vma->vm_end; addr += PAGE_SIZE) {
+ struct page *page;
+ int ret;
+
+ page = alloc_page(GFP_KERNEL | __GFP_ZERO);
+ if (!page)
+ return -ENOMEM;
+
+ ret = vm_insert_page(vma, addr, page);
+ if (ret) {
+ __free_page(page);
+ return ret;
+ }
+ put_page(page);
+ }
+
+ return 0;
+}
+
static const struct file_operations dmirror_fops = {
.open = dmirror_fops_open,
.release = dmirror_fops_release,
+ .mmap = dmirror_fops_mmap,
.unlocked_ioctl = dmirror_fops_unlocked_ioctl,
.llseek = default_llseek,
.owner = THIS_MODULE,
diff --git a/mm/hmm.c b/mm/hmm.c
index 842e26599238..bd56641c79d4 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -300,7 +300,8 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
* Since each architecture defines a struct page for the zero page, just
* fall through and treat it like a normal page.
*/
- if (pte_special(pte) && !pte_devmap(pte) &&
+ if (!vm_normal_page(walk->vma, addr, pte) &&
+ !pte_devmap(pte) &&
!is_zero_pfn(pte_pfn(pte))) {
if (hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, 0)) {
pte_unmap(ptep);
@@ -518,7 +519,7 @@ static int hmm_vma_walk_test(unsigned long start, unsigned long end,
struct hmm_range *range = hmm_vma_walk->range;
struct vm_area_struct *vma = walk->vma;
- if (!(vma->vm_flags & (VM_IO | VM_PFNMAP | VM_MIXEDMAP)) &&
+ if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)) &&
vma->vm_flags & VM_READ)
return 0;
diff --git a/tools/testing/selftests/vm/hmm-tests.c b/tools/testing/selftests/vm/hmm-tests.c
index 864f126ffd78..203323967b50 100644
--- a/tools/testing/selftests/vm/hmm-tests.c
+++ b/tools/testing/selftests/vm/hmm-tests.c
@@ -1248,6 +1248,48 @@ TEST_F(hmm, anon_teardown)
}
}
+/*
+ * Test memory snapshot without faulting in pages accessed by the device.
+ */
+TEST_F(hmm, mixedmap)
+{
+ struct hmm_buffer *buffer;
+ unsigned long npages;
+ unsigned long size;
+ unsigned char *m;
+ int ret;
+
+ npages = 1;
+ size = npages << self->page_shift;
+
+ buffer = malloc(sizeof(*buffer));
+ ASSERT_NE(buffer, NULL);
+
+ buffer->fd = -1;
+ buffer->size = size;
+ buffer->mirror = malloc(npages);
+ ASSERT_NE(buffer->mirror, NULL);
+
+
+ /* Reserve a range of addresses. */
+ buffer->ptr = mmap(NULL, size,
+ PROT_READ | PROT_WRITE,
+ MAP_PRIVATE,
+ self->fd, 0);
+ ASSERT_NE(buffer->ptr, MAP_FAILED);
+
+ /* Simulate a device snapshotting CPU pagetables. */
+ ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_SNAPSHOT, buffer, npages);
+ ASSERT_EQ(ret, 0);
+ ASSERT_EQ(buffer->cpages, npages);
+
+ /* Check what the device saw. */
+ m = buffer->mirror;
+ ASSERT_EQ(m[0], HMM_DMIRROR_PROT_READ);
+
+ hmm_buffer_free(buffer);
+}
+
/*
* Test memory snapshot without faulting in pages accessed by the device.
*/
The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 87c01d57fa23de82fff593a7d070933d08755801 Mon Sep 17 00:00:00 2001
From: Alistair Popple <apopple(a)nvidia.com>
Date: Fri, 14 Jan 2022 14:09:31 -0800
Subject: [PATCH] mm/hmm.c: allow VM_MIXEDMAP to work with hmm_range_fault
hmm_range_fault() can be used instead of get_user_pages() for devices
which allow faulting; however, unlike get_user_pages(), it will return
an error when used on a VM_MIXEDMAP range.
To make hmm_range_fault() more closely match get_user_pages(), remove
this restriction. This requires dealing with the !ARCH_HAS_PTE_SPECIAL
case in hmm_vma_handle_pte(). Rather than replicating the logic of
vm_normal_page(), call it directly and check for the zero pfn, similar
to what get_user_pages() currently does.
Also add a test to hmm selftest to verify functionality.
Link: https://lkml.kernel.org/r/20211104012001.2555676-1-apopple@nvidia.com
Fixes: da4c3c735ea4 ("mm/hmm/mirror: helper to snapshot CPU page table")
Signed-off-by: Alistair Popple <apopple(a)nvidia.com>
Reviewed-by: Jason Gunthorpe <jgg(a)nvidia.com>
Cc: Jerome Glisse <jglisse(a)redhat.com>
Cc: John Hubbard <jhubbard(a)nvidia.com>
Cc: Zi Yan <ziy(a)nvidia.com>
Cc: Ralph Campbell <rcampbell(a)nvidia.com>
Cc: Felix Kuehling <Felix.Kuehling(a)amd.com>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds(a)linux-foundation.org>
diff --git a/lib/test_hmm.c b/lib/test_hmm.c
index e2ce8f9b7605..767538089a62 100644
--- a/lib/test_hmm.c
+++ b/lib/test_hmm.c
@@ -1086,9 +1086,33 @@ static long dmirror_fops_unlocked_ioctl(struct file *filp,
return 0;
}
+static int dmirror_fops_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ unsigned long addr;
+
+ for (addr = vma->vm_start; addr < vma->vm_end; addr += PAGE_SIZE) {
+ struct page *page;
+ int ret;
+
+ page = alloc_page(GFP_KERNEL | __GFP_ZERO);
+ if (!page)
+ return -ENOMEM;
+
+ ret = vm_insert_page(vma, addr, page);
+ if (ret) {
+ __free_page(page);
+ return ret;
+ }
+ put_page(page);
+ }
+
+ return 0;
+}
+
static const struct file_operations dmirror_fops = {
.open = dmirror_fops_open,
.release = dmirror_fops_release,
+ .mmap = dmirror_fops_mmap,
.unlocked_ioctl = dmirror_fops_unlocked_ioctl,
.llseek = default_llseek,
.owner = THIS_MODULE,
diff --git a/mm/hmm.c b/mm/hmm.c
index 842e26599238..bd56641c79d4 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -300,7 +300,8 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
* Since each architecture defines a struct page for the zero page, just
* fall through and treat it like a normal page.
*/
- if (pte_special(pte) && !pte_devmap(pte) &&
+ if (!vm_normal_page(walk->vma, addr, pte) &&
+ !pte_devmap(pte) &&
!is_zero_pfn(pte_pfn(pte))) {
if (hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, 0)) {
pte_unmap(ptep);
@@ -518,7 +519,7 @@ static int hmm_vma_walk_test(unsigned long start, unsigned long end,
struct hmm_range *range = hmm_vma_walk->range;
struct vm_area_struct *vma = walk->vma;
- if (!(vma->vm_flags & (VM_IO | VM_PFNMAP | VM_MIXEDMAP)) &&
+ if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)) &&
vma->vm_flags & VM_READ)
return 0;
diff --git a/tools/testing/selftests/vm/hmm-tests.c b/tools/testing/selftests/vm/hmm-tests.c
index 864f126ffd78..203323967b50 100644
--- a/tools/testing/selftests/vm/hmm-tests.c
+++ b/tools/testing/selftests/vm/hmm-tests.c
@@ -1248,6 +1248,48 @@ TEST_F(hmm, anon_teardown)
}
}
+/*
+ * Test memory snapshot without faulting in pages accessed by the device.
+ */
+TEST_F(hmm, mixedmap)
+{
+ struct hmm_buffer *buffer;
+ unsigned long npages;
+ unsigned long size;
+ unsigned char *m;
+ int ret;
+
+ npages = 1;
+ size = npages << self->page_shift;
+
+ buffer = malloc(sizeof(*buffer));
+ ASSERT_NE(buffer, NULL);
+
+ buffer->fd = -1;
+ buffer->size = size;
+ buffer->mirror = malloc(npages);
+ ASSERT_NE(buffer->mirror, NULL);
+
+
+ /* Reserve a range of addresses. */
+ buffer->ptr = mmap(NULL, size,
+ PROT_READ | PROT_WRITE,
+ MAP_PRIVATE,
+ self->fd, 0);
+ ASSERT_NE(buffer->ptr, MAP_FAILED);
+
+ /* Simulate a device snapshotting CPU pagetables. */
+ ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_SNAPSHOT, buffer, npages);
+ ASSERT_EQ(ret, 0);
+ ASSERT_EQ(buffer->cpages, npages);
+
+ /* Check what the device saw. */
+ m = buffer->mirror;
+ ASSERT_EQ(m[0], HMM_DMIRROR_PROT_READ);
+
+ hmm_buffer_free(buffer);
+}
+
/*
* Test memory snapshot without faulting in pages accessed by the device.
*/
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 180dccb0dba4f5e84a4a70c1be1d34cbb6528b32 Mon Sep 17 00:00:00 2001
From: Laibin Qiu <qiulaibin(a)huawei.com>
Date: Thu, 13 Jan 2022 10:55:36 +0800
Subject: [PATCH] blk-mq: fix tag_get wait task can't be awakened
In case of shared tags, there might be more than one hctx which
allocates from the same tags, and each hctx is limited to allocate at
most:
hctx_max_depth = max((bt->sb.depth + users - 1) / users, 4U);
Tag idle detection is lazy and may be delayed for 30 seconds, so there
could be just one really active hctx (queue) while all the others are
actually idle yet still accounted as active because of the lazy idle
detection. Then, if wake_batch is greater than hctx_max_depth, driver
tag allocation may wait forever on this really active hctx.
Fix this by recalculating wake_batch whenever active_queues is
incremented or decremented.
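A worked illustration with hypothetical numbers (not taken from the report):
with a shared tag depth of 256 spread across 128 accounted-active queues,
  hctx_max_depth = max((256 + 128 - 1) / 128, 4U) = 4
while the wake batch derived from the full depth stays clamped at
SBQ_WAKE_BATCH (8), so a waiter on the one really active hctx can never see
8 tag completions out of a 4-tag budget. With the recalculation,
  wake_batch = clamp_val((256 + 128 - 1) / 128, 4, SBQ_WAKE_BATCH) = 4
which matches the per-hctx budget and lets the waiter be woken.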
Fixes: 0d2602ca30e41 ("blk-mq: improve support for shared tags maps")
Suggested-by: Ming Lei <ming.lei(a)redhat.com>
Suggested-by: John Garry <john.garry(a)huawei.com>
Signed-off-by: Laibin Qiu <qiulaibin(a)huawei.com>
Reviewed-by: Andy Shevchenko <andriy.shevchenko(a)linux.intel.com>
Link: https://lore.kernel.org/r/20220113025536.1479653-1-qiulaibin@huawei.com
Signed-off-by: Jens Axboe <axboe(a)kernel.dk>
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index e55a6834c9a6..845f74e8dd7b 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -16,6 +16,21 @@
#include "blk-mq-sched.h"
#include "blk-mq-tag.h"
+/*
+ * Recalculate wakeup batch when tag is shared by hctx.
+ */
+static void blk_mq_update_wake_batch(struct blk_mq_tags *tags,
+ unsigned int users)
+{
+ if (!users)
+ return;
+
+ sbitmap_queue_recalculate_wake_batch(&tags->bitmap_tags,
+ users);
+ sbitmap_queue_recalculate_wake_batch(&tags->breserved_tags,
+ users);
+}
+
/*
* If a previously inactive queue goes active, bump the active user count.
* We need to do this before try to allocate driver tag, then even if fail
@@ -24,18 +39,26 @@
*/
bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx)
{
+ unsigned int users;
+
if (blk_mq_is_shared_tags(hctx->flags)) {
struct request_queue *q = hctx->queue;
- if (!test_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags) &&
- !test_and_set_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags))
- atomic_inc(&hctx->tags->active_queues);
+ if (test_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags) ||
+ test_and_set_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags)) {
+ return true;
+ }
} else {
- if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state) &&
- !test_and_set_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
- atomic_inc(&hctx->tags->active_queues);
+ if (test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state) ||
+ test_and_set_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state)) {
+ return true;
+ }
}
+ users = atomic_inc_return(&hctx->tags->active_queues);
+
+ blk_mq_update_wake_batch(hctx->tags, users);
+
return true;
}
@@ -56,6 +79,7 @@ void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool include_reserve)
void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx)
{
struct blk_mq_tags *tags = hctx->tags;
+ unsigned int users;
if (blk_mq_is_shared_tags(hctx->flags)) {
struct request_queue *q = hctx->queue;
@@ -68,7 +92,9 @@ void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx)
return;
}
- atomic_dec(&tags->active_queues);
+ users = atomic_dec_return(&tags->active_queues);
+
+ blk_mq_update_wake_batch(tags, users);
blk_mq_tag_wakeup_all(tags, false);
}
diff --git a/include/linux/sbitmap.h b/include/linux/sbitmap.h
index fc0357a6e19b..95df357ec009 100644
--- a/include/linux/sbitmap.h
+++ b/include/linux/sbitmap.h
@@ -415,6 +415,17 @@ static inline void sbitmap_queue_free(struct sbitmap_queue *sbq)
sbitmap_free(&sbq->sb);
}
+/**
+ * sbitmap_queue_recalculate_wake_batch() - Recalculate wake batch
+ * @sbq: Bitmap queue to recalculate wake batch.
+ * @users: Number of shares.
+ *
+ * Like sbitmap_queue_update_wake_batch(), this will calculate wake batch
+ * by depth. This interface is for HCTX shared tags or queue shared tags.
+ */
+void sbitmap_queue_recalculate_wake_batch(struct sbitmap_queue *sbq,
+ unsigned int users);
+
/**
* sbitmap_queue_resize() - Resize a &struct sbitmap_queue.
* @sbq: Bitmap queue to resize.
diff --git a/lib/sbitmap.c b/lib/sbitmap.c
index 2709ab825499..6220fa67fb7e 100644
--- a/lib/sbitmap.c
+++ b/lib/sbitmap.c
@@ -457,10 +457,9 @@ int sbitmap_queue_init_node(struct sbitmap_queue *sbq, unsigned int depth,
}
EXPORT_SYMBOL_GPL(sbitmap_queue_init_node);
-static void sbitmap_queue_update_wake_batch(struct sbitmap_queue *sbq,
- unsigned int depth)
+static inline void __sbitmap_queue_update_wake_batch(struct sbitmap_queue *sbq,
+ unsigned int wake_batch)
{
- unsigned int wake_batch = sbq_calc_wake_batch(sbq, depth);
int i;
if (sbq->wake_batch != wake_batch) {
@@ -476,6 +475,26 @@ static void sbitmap_queue_update_wake_batch(struct sbitmap_queue *sbq,
}
}
+static void sbitmap_queue_update_wake_batch(struct sbitmap_queue *sbq,
+ unsigned int depth)
+{
+ unsigned int wake_batch;
+
+ wake_batch = sbq_calc_wake_batch(sbq, depth);
+ __sbitmap_queue_update_wake_batch(sbq, wake_batch);
+}
+
+void sbitmap_queue_recalculate_wake_batch(struct sbitmap_queue *sbq,
+ unsigned int users)
+{
+ unsigned int wake_batch;
+
+ wake_batch = clamp_val((sbq->sb.depth + users - 1) /
+ users, 4, SBQ_WAKE_BATCH);
+ __sbitmap_queue_update_wake_batch(sbq, wake_batch);
+}
+EXPORT_SYMBOL_GPL(sbitmap_queue_recalculate_wake_batch);
+
void sbitmap_queue_resize(struct sbitmap_queue *sbq, unsigned int depth)
{
sbitmap_queue_update_wake_batch(sbq, depth);
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 180dccb0dba4f5e84a4a70c1be1d34cbb6528b32 Mon Sep 17 00:00:00 2001
From: Laibin Qiu <qiulaibin(a)huawei.com>
Date: Thu, 13 Jan 2022 10:55:36 +0800
Subject: [PATCH] blk-mq: fix tag_get wait task can't be awakened
In case of shared tags, there might be more than one hctx which
allocates from the same tags, and each hctx is limited to allocate at
most:
hctx_max_depth = max((bt->sb.depth + users - 1) / users, 4U);
Tag idle detection is lazy and may be delayed for 30 seconds, so there
could be just one really active hctx (queue) while all the others are
actually idle yet still accounted as active because of the lazy idle
detection. Then, if wake_batch is greater than hctx_max_depth, driver
tag allocation may wait forever on this really active hctx.
Fix this by recalculating wake_batch whenever active_queues is
incremented or decremented.
Fixes: 0d2602ca30e41 ("blk-mq: improve support for shared tags maps")
Suggested-by: Ming Lei <ming.lei(a)redhat.com>
Suggested-by: John Garry <john.garry(a)huawei.com>
Signed-off-by: Laibin Qiu <qiulaibin(a)huawei.com>
Reviewed-by: Andy Shevchenko <andriy.shevchenko(a)linux.intel.com>
Link: https://lore.kernel.org/r/20220113025536.1479653-1-qiulaibin@huawei.com
Signed-off-by: Jens Axboe <axboe(a)kernel.dk>
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index e55a6834c9a6..845f74e8dd7b 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -16,6 +16,21 @@
#include "blk-mq-sched.h"
#include "blk-mq-tag.h"
+/*
+ * Recalculate wakeup batch when tag is shared by hctx.
+ */
+static void blk_mq_update_wake_batch(struct blk_mq_tags *tags,
+ unsigned int users)
+{
+ if (!users)
+ return;
+
+ sbitmap_queue_recalculate_wake_batch(&tags->bitmap_tags,
+ users);
+ sbitmap_queue_recalculate_wake_batch(&tags->breserved_tags,
+ users);
+}
+
/*
* If a previously inactive queue goes active, bump the active user count.
* We need to do this before try to allocate driver tag, then even if fail
@@ -24,18 +39,26 @@
*/
bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx)
{
+ unsigned int users;
+
if (blk_mq_is_shared_tags(hctx->flags)) {
struct request_queue *q = hctx->queue;
- if (!test_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags) &&
- !test_and_set_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags))
- atomic_inc(&hctx->tags->active_queues);
+ if (test_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags) ||
+ test_and_set_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags)) {
+ return true;
+ }
} else {
- if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state) &&
- !test_and_set_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
- atomic_inc(&hctx->tags->active_queues);
+ if (test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state) ||
+ test_and_set_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state)) {
+ return true;
+ }
}
+ users = atomic_inc_return(&hctx->tags->active_queues);
+
+ blk_mq_update_wake_batch(hctx->tags, users);
+
return true;
}
@@ -56,6 +79,7 @@ void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool include_reserve)
void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx)
{
struct blk_mq_tags *tags = hctx->tags;
+ unsigned int users;
if (blk_mq_is_shared_tags(hctx->flags)) {
struct request_queue *q = hctx->queue;
@@ -68,7 +92,9 @@ void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx)
return;
}
- atomic_dec(&tags->active_queues);
+ users = atomic_dec_return(&tags->active_queues);
+
+ blk_mq_update_wake_batch(tags, users);
blk_mq_tag_wakeup_all(tags, false);
}
diff --git a/include/linux/sbitmap.h b/include/linux/sbitmap.h
index fc0357a6e19b..95df357ec009 100644
--- a/include/linux/sbitmap.h
+++ b/include/linux/sbitmap.h
@@ -415,6 +415,17 @@ static inline void sbitmap_queue_free(struct sbitmap_queue *sbq)
sbitmap_free(&sbq->sb);
}
+/**
+ * sbitmap_queue_recalculate_wake_batch() - Recalculate wake batch
+ * @sbq: Bitmap queue to recalculate wake batch.
+ * @users: Number of shares.
+ *
+ * Like sbitmap_queue_update_wake_batch(), this will calculate wake batch
+ * by depth. This interface is for HCTX shared tags or queue shared tags.
+ */
+void sbitmap_queue_recalculate_wake_batch(struct sbitmap_queue *sbq,
+ unsigned int users);
+
/**
* sbitmap_queue_resize() - Resize a &struct sbitmap_queue.
* @sbq: Bitmap queue to resize.
diff --git a/lib/sbitmap.c b/lib/sbitmap.c
index 2709ab825499..6220fa67fb7e 100644
--- a/lib/sbitmap.c
+++ b/lib/sbitmap.c
@@ -457,10 +457,9 @@ int sbitmap_queue_init_node(struct sbitmap_queue *sbq, unsigned int depth,
}
EXPORT_SYMBOL_GPL(sbitmap_queue_init_node);
-static void sbitmap_queue_update_wake_batch(struct sbitmap_queue *sbq,
- unsigned int depth)
+static inline void __sbitmap_queue_update_wake_batch(struct sbitmap_queue *sbq,
+ unsigned int wake_batch)
{
- unsigned int wake_batch = sbq_calc_wake_batch(sbq, depth);
int i;
if (sbq->wake_batch != wake_batch) {
@@ -476,6 +475,26 @@ static void sbitmap_queue_update_wake_batch(struct sbitmap_queue *sbq,
}
}
+static void sbitmap_queue_update_wake_batch(struct sbitmap_queue *sbq,
+ unsigned int depth)
+{
+ unsigned int wake_batch;
+
+ wake_batch = sbq_calc_wake_batch(sbq, depth);
+ __sbitmap_queue_update_wake_batch(sbq, wake_batch);
+}
+
+void sbitmap_queue_recalculate_wake_batch(struct sbitmap_queue *sbq,
+ unsigned int users)
+{
+ unsigned int wake_batch;
+
+ wake_batch = clamp_val((sbq->sb.depth + users - 1) /
+ users, 4, SBQ_WAKE_BATCH);
+ __sbitmap_queue_update_wake_batch(sbq, wake_batch);
+}
+EXPORT_SYMBOL_GPL(sbitmap_queue_recalculate_wake_batch);
+
void sbitmap_queue_resize(struct sbitmap_queue *sbq, unsigned int depth)
{
sbitmap_queue_update_wake_batch(sbq, depth);
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 180dccb0dba4f5e84a4a70c1be1d34cbb6528b32 Mon Sep 17 00:00:00 2001
From: Laibin Qiu <qiulaibin(a)huawei.com>
Date: Thu, 13 Jan 2022 10:55:36 +0800
Subject: [PATCH] blk-mq: fix tag_get wait task can't be awakened
In case of shared tags, there might be more than one hctx which
allocates from the same tags, and each hctx is limited to allocate at
most:
hctx_max_depth = max((bt->sb.depth + users - 1) / users, 4U);
Tag idle detection is lazy and may be delayed for 30 seconds, so there
could be just one really active hctx (queue) while all the others are
actually idle yet still accounted as active because of the lazy idle
detection. Then, if wake_batch is greater than hctx_max_depth, driver
tag allocation may wait forever on this really active hctx.
Fix this by recalculating wake_batch whenever active_queues is
incremented or decremented.
Fixes: 0d2602ca30e41 ("blk-mq: improve support for shared tags maps")
Suggested-by: Ming Lei <ming.lei(a)redhat.com>
Suggested-by: John Garry <john.garry(a)huawei.com>
Signed-off-by: Laibin Qiu <qiulaibin(a)huawei.com>
Reviewed-by: Andy Shevchenko <andriy.shevchenko(a)linux.intel.com>
Link: https://lore.kernel.org/r/20220113025536.1479653-1-qiulaibin@huawei.com
Signed-off-by: Jens Axboe <axboe(a)kernel.dk>
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index e55a6834c9a6..845f74e8dd7b 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -16,6 +16,21 @@
#include "blk-mq-sched.h"
#include "blk-mq-tag.h"
+/*
+ * Recalculate wakeup batch when tag is shared by hctx.
+ */
+static void blk_mq_update_wake_batch(struct blk_mq_tags *tags,
+ unsigned int users)
+{
+ if (!users)
+ return;
+
+ sbitmap_queue_recalculate_wake_batch(&tags->bitmap_tags,
+ users);
+ sbitmap_queue_recalculate_wake_batch(&tags->breserved_tags,
+ users);
+}
+
/*
* If a previously inactive queue goes active, bump the active user count.
* We need to do this before try to allocate driver tag, then even if fail
@@ -24,18 +39,26 @@
*/
bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx)
{
+ unsigned int users;
+
if (blk_mq_is_shared_tags(hctx->flags)) {
struct request_queue *q = hctx->queue;
- if (!test_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags) &&
- !test_and_set_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags))
- atomic_inc(&hctx->tags->active_queues);
+ if (test_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags) ||
+ test_and_set_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags)) {
+ return true;
+ }
} else {
- if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state) &&
- !test_and_set_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
- atomic_inc(&hctx->tags->active_queues);
+ if (test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state) ||
+ test_and_set_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state)) {
+ return true;
+ }
}
+ users = atomic_inc_return(&hctx->tags->active_queues);
+
+ blk_mq_update_wake_batch(hctx->tags, users);
+
return true;
}
@@ -56,6 +79,7 @@ void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool include_reserve)
void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx)
{
struct blk_mq_tags *tags = hctx->tags;
+ unsigned int users;
if (blk_mq_is_shared_tags(hctx->flags)) {
struct request_queue *q = hctx->queue;
@@ -68,7 +92,9 @@ void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx)
return;
}
- atomic_dec(&tags->active_queues);
+ users = atomic_dec_return(&tags->active_queues);
+
+ blk_mq_update_wake_batch(tags, users);
blk_mq_tag_wakeup_all(tags, false);
}
diff --git a/include/linux/sbitmap.h b/include/linux/sbitmap.h
index fc0357a6e19b..95df357ec009 100644
--- a/include/linux/sbitmap.h
+++ b/include/linux/sbitmap.h
@@ -415,6 +415,17 @@ static inline void sbitmap_queue_free(struct sbitmap_queue *sbq)
sbitmap_free(&sbq->sb);
}
+/**
+ * sbitmap_queue_recalculate_wake_batch() - Recalculate wake batch
+ * @sbq: Bitmap queue to recalculate wake batch.
+ * @users: Number of shares.
+ *
+ * Like sbitmap_queue_update_wake_batch(), this will calculate wake batch
+ * by depth. This interface is for HCTX shared tags or queue shared tags.
+ */
+void sbitmap_queue_recalculate_wake_batch(struct sbitmap_queue *sbq,
+ unsigned int users);
+
/**
* sbitmap_queue_resize() - Resize a &struct sbitmap_queue.
* @sbq: Bitmap queue to resize.
diff --git a/lib/sbitmap.c b/lib/sbitmap.c
index 2709ab825499..6220fa67fb7e 100644
--- a/lib/sbitmap.c
+++ b/lib/sbitmap.c
@@ -457,10 +457,9 @@ int sbitmap_queue_init_node(struct sbitmap_queue *sbq, unsigned int depth,
}
EXPORT_SYMBOL_GPL(sbitmap_queue_init_node);
-static void sbitmap_queue_update_wake_batch(struct sbitmap_queue *sbq,
- unsigned int depth)
+static inline void __sbitmap_queue_update_wake_batch(struct sbitmap_queue *sbq,
+ unsigned int wake_batch)
{
- unsigned int wake_batch = sbq_calc_wake_batch(sbq, depth);
int i;
if (sbq->wake_batch != wake_batch) {
@@ -476,6 +475,26 @@ static void sbitmap_queue_update_wake_batch(struct sbitmap_queue *sbq,
}
}
+static void sbitmap_queue_update_wake_batch(struct sbitmap_queue *sbq,
+ unsigned int depth)
+{
+ unsigned int wake_batch;
+
+ wake_batch = sbq_calc_wake_batch(sbq, depth);
+ __sbitmap_queue_update_wake_batch(sbq, wake_batch);
+}
+
+void sbitmap_queue_recalculate_wake_batch(struct sbitmap_queue *sbq,
+ unsigned int users)
+{
+ unsigned int wake_batch;
+
+ wake_batch = clamp_val((sbq->sb.depth + users - 1) /
+ users, 4, SBQ_WAKE_BATCH);
+ __sbitmap_queue_update_wake_batch(sbq, wake_batch);
+}
+EXPORT_SYMBOL_GPL(sbitmap_queue_recalculate_wake_batch);
+
void sbitmap_queue_resize(struct sbitmap_queue *sbq, unsigned int depth)
{
sbitmap_queue_update_wake_batch(sbq, depth);
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 180dccb0dba4f5e84a4a70c1be1d34cbb6528b32 Mon Sep 17 00:00:00 2001
From: Laibin Qiu <qiulaibin(a)huawei.com>
Date: Thu, 13 Jan 2022 10:55:36 +0800
Subject: [PATCH] blk-mq: fix tag_get wait task can't be awakened
In case of shared tags, there might be more than one hctx which
allocates from the same tags, and each hctx is limited to allocate at
most:
hctx_max_depth = max((bt->sb.depth + users - 1) / users, 4U);
Tag idle detection is lazy and may be delayed for 30 seconds, so there
could be just one really active hctx (queue) while all the others are
actually idle yet still accounted as active because of the lazy idle
detection. Then, if wake_batch is greater than hctx_max_depth, driver
tag allocation may wait forever on this really active hctx.
Fix this by recalculating wake_batch whenever active_queues is
incremented or decremented.
Fixes: 0d2602ca30e41 ("blk-mq: improve support for shared tags maps")
Suggested-by: Ming Lei <ming.lei(a)redhat.com>
Suggested-by: John Garry <john.garry(a)huawei.com>
Signed-off-by: Laibin Qiu <qiulaibin(a)huawei.com>
Reviewed-by: Andy Shevchenko <andriy.shevchenko(a)linux.intel.com>
Link: https://lore.kernel.org/r/20220113025536.1479653-1-qiulaibin@huawei.com
Signed-off-by: Jens Axboe <axboe(a)kernel.dk>
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index e55a6834c9a6..845f74e8dd7b 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -16,6 +16,21 @@
#include "blk-mq-sched.h"
#include "blk-mq-tag.h"
+/*
+ * Recalculate wakeup batch when tag is shared by hctx.
+ */
+static void blk_mq_update_wake_batch(struct blk_mq_tags *tags,
+ unsigned int users)
+{
+ if (!users)
+ return;
+
+ sbitmap_queue_recalculate_wake_batch(&tags->bitmap_tags,
+ users);
+ sbitmap_queue_recalculate_wake_batch(&tags->breserved_tags,
+ users);
+}
+
/*
* If a previously inactive queue goes active, bump the active user count.
* We need to do this before try to allocate driver tag, then even if fail
@@ -24,18 +39,26 @@
*/
bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx)
{
+ unsigned int users;
+
if (blk_mq_is_shared_tags(hctx->flags)) {
struct request_queue *q = hctx->queue;
- if (!test_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags) &&
- !test_and_set_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags))
- atomic_inc(&hctx->tags->active_queues);
+ if (test_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags) ||
+ test_and_set_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags)) {
+ return true;
+ }
} else {
- if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state) &&
- !test_and_set_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
- atomic_inc(&hctx->tags->active_queues);
+ if (test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state) ||
+ test_and_set_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state)) {
+ return true;
+ }
}
+ users = atomic_inc_return(&hctx->tags->active_queues);
+
+ blk_mq_update_wake_batch(hctx->tags, users);
+
return true;
}
@@ -56,6 +79,7 @@ void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool include_reserve)
void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx)
{
struct blk_mq_tags *tags = hctx->tags;
+ unsigned int users;
if (blk_mq_is_shared_tags(hctx->flags)) {
struct request_queue *q = hctx->queue;
@@ -68,7 +92,9 @@ void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx)
return;
}
- atomic_dec(&tags->active_queues);
+ users = atomic_dec_return(&tags->active_queues);
+
+ blk_mq_update_wake_batch(tags, users);
blk_mq_tag_wakeup_all(tags, false);
}
diff --git a/include/linux/sbitmap.h b/include/linux/sbitmap.h
index fc0357a6e19b..95df357ec009 100644
--- a/include/linux/sbitmap.h
+++ b/include/linux/sbitmap.h
@@ -415,6 +415,17 @@ static inline void sbitmap_queue_free(struct sbitmap_queue *sbq)
sbitmap_free(&sbq->sb);
}
+/**
+ * sbitmap_queue_recalculate_wake_batch() - Recalculate wake batch
+ * @sbq: Bitmap queue to recalculate wake batch.
+ * @users: Number of shares.
+ *
+ * Like sbitmap_queue_update_wake_batch(), this will calculate wake batch
+ * by depth. This interface is for HCTX shared tags or queue shared tags.
+ */
+void sbitmap_queue_recalculate_wake_batch(struct sbitmap_queue *sbq,
+ unsigned int users);
+
/**
* sbitmap_queue_resize() - Resize a &struct sbitmap_queue.
* @sbq: Bitmap queue to resize.
diff --git a/lib/sbitmap.c b/lib/sbitmap.c
index 2709ab825499..6220fa67fb7e 100644
--- a/lib/sbitmap.c
+++ b/lib/sbitmap.c
@@ -457,10 +457,9 @@ int sbitmap_queue_init_node(struct sbitmap_queue *sbq, unsigned int depth,
}
EXPORT_SYMBOL_GPL(sbitmap_queue_init_node);
-static void sbitmap_queue_update_wake_batch(struct sbitmap_queue *sbq,
- unsigned int depth)
+static inline void __sbitmap_queue_update_wake_batch(struct sbitmap_queue *sbq,
+ unsigned int wake_batch)
{
- unsigned int wake_batch = sbq_calc_wake_batch(sbq, depth);
int i;
if (sbq->wake_batch != wake_batch) {
@@ -476,6 +475,26 @@ static void sbitmap_queue_update_wake_batch(struct sbitmap_queue *sbq,
}
}
+static void sbitmap_queue_update_wake_batch(struct sbitmap_queue *sbq,
+ unsigned int depth)
+{
+ unsigned int wake_batch;
+
+ wake_batch = sbq_calc_wake_batch(sbq, depth);
+ __sbitmap_queue_update_wake_batch(sbq, wake_batch);
+}
+
+void sbitmap_queue_recalculate_wake_batch(struct sbitmap_queue *sbq,
+ unsigned int users)
+{
+ unsigned int wake_batch;
+
+ wake_batch = clamp_val((sbq->sb.depth + users - 1) /
+ users, 4, SBQ_WAKE_BATCH);
+ __sbitmap_queue_update_wake_batch(sbq, wake_batch);
+}
+EXPORT_SYMBOL_GPL(sbitmap_queue_recalculate_wake_batch);
+
void sbitmap_queue_resize(struct sbitmap_queue *sbq, unsigned int depth)
{
sbitmap_queue_update_wake_batch(sbq, depth);
The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 180dccb0dba4f5e84a4a70c1be1d34cbb6528b32 Mon Sep 17 00:00:00 2001
From: Laibin Qiu <qiulaibin(a)huawei.com>
Date: Thu, 13 Jan 2022 10:55:36 +0800
Subject: [PATCH] blk-mq: fix tag_get wait task can't be awakened
In case of shared tags, there might be more than one hctx which
allocates from the same tags, and each hctx is limited to allocate at
most:
hctx_max_depth = max((bt->sb.depth + users - 1) / users, 4U);
Tag idle detection is lazy and may be delayed for 30 seconds, so there
could be just one really active hctx (queue) while all the others are
actually idle yet still accounted as active because of the lazy idle
detection. Then, if wake_batch is greater than hctx_max_depth, driver
tag allocation may wait forever on this really active hctx.
Fix this by recalculating wake_batch whenever active_queues is
incremented or decremented.
Fixes: 0d2602ca30e41 ("blk-mq: improve support for shared tags maps")
Suggested-by: Ming Lei <ming.lei(a)redhat.com>
Suggested-by: John Garry <john.garry(a)huawei.com>
Signed-off-by: Laibin Qiu <qiulaibin(a)huawei.com>
Reviewed-by: Andy Shevchenko <andriy.shevchenko(a)linux.intel.com>
Link: https://lore.kernel.org/r/20220113025536.1479653-1-qiulaibin@huawei.com
Signed-off-by: Jens Axboe <axboe(a)kernel.dk>
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index e55a6834c9a6..845f74e8dd7b 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -16,6 +16,21 @@
#include "blk-mq-sched.h"
#include "blk-mq-tag.h"
+/*
+ * Recalculate wakeup batch when tag is shared by hctx.
+ */
+static void blk_mq_update_wake_batch(struct blk_mq_tags *tags,
+ unsigned int users)
+{
+ if (!users)
+ return;
+
+ sbitmap_queue_recalculate_wake_batch(&tags->bitmap_tags,
+ users);
+ sbitmap_queue_recalculate_wake_batch(&tags->breserved_tags,
+ users);
+}
+
/*
* If a previously inactive queue goes active, bump the active user count.
* We need to do this before try to allocate driver tag, then even if fail
@@ -24,18 +39,26 @@
*/
bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx)
{
+ unsigned int users;
+
if (blk_mq_is_shared_tags(hctx->flags)) {
struct request_queue *q = hctx->queue;
- if (!test_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags) &&
- !test_and_set_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags))
- atomic_inc(&hctx->tags->active_queues);
+ if (test_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags) ||
+ test_and_set_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags)) {
+ return true;
+ }
} else {
- if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state) &&
- !test_and_set_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
- atomic_inc(&hctx->tags->active_queues);
+ if (test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state) ||
+ test_and_set_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state)) {
+ return true;
+ }
}
+ users = atomic_inc_return(&hctx->tags->active_queues);
+
+ blk_mq_update_wake_batch(hctx->tags, users);
+
return true;
}
@@ -56,6 +79,7 @@ void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool include_reserve)
void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx)
{
struct blk_mq_tags *tags = hctx->tags;
+ unsigned int users;
if (blk_mq_is_shared_tags(hctx->flags)) {
struct request_queue *q = hctx->queue;
@@ -68,7 +92,9 @@ void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx)
return;
}
- atomic_dec(&tags->active_queues);
+ users = atomic_dec_return(&tags->active_queues);
+
+ blk_mq_update_wake_batch(tags, users);
blk_mq_tag_wakeup_all(tags, false);
}
diff --git a/include/linux/sbitmap.h b/include/linux/sbitmap.h
index fc0357a6e19b..95df357ec009 100644
--- a/include/linux/sbitmap.h
+++ b/include/linux/sbitmap.h
@@ -415,6 +415,17 @@ static inline void sbitmap_queue_free(struct sbitmap_queue *sbq)
sbitmap_free(&sbq->sb);
}
+/**
+ * sbitmap_queue_recalculate_wake_batch() - Recalculate wake batch
+ * @sbq: Bitmap queue to recalculate wake batch.
+ * @users: Number of shares.
+ *
+ * Like sbitmap_queue_update_wake_batch(), this will calculate wake batch
+ * by depth. This interface is for HCTX shared tags or queue shared tags.
+ */
+void sbitmap_queue_recalculate_wake_batch(struct sbitmap_queue *sbq,
+ unsigned int users);
+
/**
* sbitmap_queue_resize() - Resize a &struct sbitmap_queue.
* @sbq: Bitmap queue to resize.
diff --git a/lib/sbitmap.c b/lib/sbitmap.c
index 2709ab825499..6220fa67fb7e 100644
--- a/lib/sbitmap.c
+++ b/lib/sbitmap.c
@@ -457,10 +457,9 @@ int sbitmap_queue_init_node(struct sbitmap_queue *sbq, unsigned int depth,
}
EXPORT_SYMBOL_GPL(sbitmap_queue_init_node);
-static void sbitmap_queue_update_wake_batch(struct sbitmap_queue *sbq,
- unsigned int depth)
+static inline void __sbitmap_queue_update_wake_batch(struct sbitmap_queue *sbq,
+ unsigned int wake_batch)
{
- unsigned int wake_batch = sbq_calc_wake_batch(sbq, depth);
int i;
if (sbq->wake_batch != wake_batch) {
@@ -476,6 +475,26 @@ static void sbitmap_queue_update_wake_batch(struct sbitmap_queue *sbq,
}
}
+static void sbitmap_queue_update_wake_batch(struct sbitmap_queue *sbq,
+ unsigned int depth)
+{
+ unsigned int wake_batch;
+
+ wake_batch = sbq_calc_wake_batch(sbq, depth);
+ __sbitmap_queue_update_wake_batch(sbq, wake_batch);
+}
+
+void sbitmap_queue_recalculate_wake_batch(struct sbitmap_queue *sbq,
+ unsigned int users)
+{
+ unsigned int wake_batch;
+
+ wake_batch = clamp_val((sbq->sb.depth + users - 1) /
+ users, 4, SBQ_WAKE_BATCH);
+ __sbitmap_queue_update_wake_batch(sbq, wake_batch);
+}
+EXPORT_SYMBOL_GPL(sbitmap_queue_recalculate_wake_batch);
+
void sbitmap_queue_resize(struct sbitmap_queue *sbq, unsigned int depth)
{
sbitmap_queue_update_wake_batch(sbq, depth);
The patch below does not apply to the 4.9-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 180dccb0dba4f5e84a4a70c1be1d34cbb6528b32 Mon Sep 17 00:00:00 2001
From: Laibin Qiu <qiulaibin(a)huawei.com>
Date: Thu, 13 Jan 2022 10:55:36 +0800
Subject: [PATCH] blk-mq: fix tag_get wait task can't be awakened
In case of shared tags, there might be more than one hctx which
allocates from the same tags, and each hctx is limited to allocate at
most:
hctx_max_depth = max((bt->sb.depth + users - 1) / users, 4U);
Tag idle detection is lazy and may be delayed for 30 seconds, so there
could be just one really active hctx (queue) while all the others are
actually idle yet still accounted as active because of the lazy idle
detection. Then, if wake_batch is greater than hctx_max_depth, driver
tag allocation may wait forever on this really active hctx.
Fix this by recalculating wake_batch whenever active_queues is
incremented or decremented.
Fixes: 0d2602ca30e41 ("blk-mq: improve support for shared tags maps")
Suggested-by: Ming Lei <ming.lei(a)redhat.com>
Suggested-by: John Garry <john.garry(a)huawei.com>
Signed-off-by: Laibin Qiu <qiulaibin(a)huawei.com>
Reviewed-by: Andy Shevchenko <andriy.shevchenko(a)linux.intel.com>
Link: https://lore.kernel.org/r/20220113025536.1479653-1-qiulaibin@huawei.com
Signed-off-by: Jens Axboe <axboe(a)kernel.dk>
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index e55a6834c9a6..845f74e8dd7b 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -16,6 +16,21 @@
#include "blk-mq-sched.h"
#include "blk-mq-tag.h"
+/*
+ * Recalculate wakeup batch when tag is shared by hctx.
+ */
+static void blk_mq_update_wake_batch(struct blk_mq_tags *tags,
+ unsigned int users)
+{
+ if (!users)
+ return;
+
+ sbitmap_queue_recalculate_wake_batch(&tags->bitmap_tags,
+ users);
+ sbitmap_queue_recalculate_wake_batch(&tags->breserved_tags,
+ users);
+}
+
/*
* If a previously inactive queue goes active, bump the active user count.
* We need to do this before try to allocate driver tag, then even if fail
@@ -24,18 +39,26 @@
*/
bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx)
{
+ unsigned int users;
+
if (blk_mq_is_shared_tags(hctx->flags)) {
struct request_queue *q = hctx->queue;
- if (!test_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags) &&
- !test_and_set_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags))
- atomic_inc(&hctx->tags->active_queues);
+ if (test_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags) ||
+ test_and_set_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags)) {
+ return true;
+ }
} else {
- if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state) &&
- !test_and_set_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
- atomic_inc(&hctx->tags->active_queues);
+ if (test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state) ||
+ test_and_set_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state)) {
+ return true;
+ }
}
+ users = atomic_inc_return(&hctx->tags->active_queues);
+
+ blk_mq_update_wake_batch(hctx->tags, users);
+
return true;
}
@@ -56,6 +79,7 @@ void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool include_reserve)
void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx)
{
struct blk_mq_tags *tags = hctx->tags;
+ unsigned int users;
if (blk_mq_is_shared_tags(hctx->flags)) {
struct request_queue *q = hctx->queue;
@@ -68,7 +92,9 @@ void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx)
return;
}
- atomic_dec(&tags->active_queues);
+ users = atomic_dec_return(&tags->active_queues);
+
+ blk_mq_update_wake_batch(tags, users);
blk_mq_tag_wakeup_all(tags, false);
}
diff --git a/include/linux/sbitmap.h b/include/linux/sbitmap.h
index fc0357a6e19b..95df357ec009 100644
--- a/include/linux/sbitmap.h
+++ b/include/linux/sbitmap.h
@@ -415,6 +415,17 @@ static inline void sbitmap_queue_free(struct sbitmap_queue *sbq)
sbitmap_free(&sbq->sb);
}
+/**
+ * sbitmap_queue_recalculate_wake_batch() - Recalculate wake batch
+ * @sbq: Bitmap queue to recalculate wake batch.
+ * @users: Number of shares.
+ *
+ * Like sbitmap_queue_update_wake_batch(), this will calculate wake batch
+ * by depth. This interface is for HCTX shared tags or queue shared tags.
+ */
+void sbitmap_queue_recalculate_wake_batch(struct sbitmap_queue *sbq,
+ unsigned int users);
+
/**
* sbitmap_queue_resize() - Resize a &struct sbitmap_queue.
* @sbq: Bitmap queue to resize.
diff --git a/lib/sbitmap.c b/lib/sbitmap.c
index 2709ab825499..6220fa67fb7e 100644
--- a/lib/sbitmap.c
+++ b/lib/sbitmap.c
@@ -457,10 +457,9 @@ int sbitmap_queue_init_node(struct sbitmap_queue *sbq, unsigned int depth,
}
EXPORT_SYMBOL_GPL(sbitmap_queue_init_node);
-static void sbitmap_queue_update_wake_batch(struct sbitmap_queue *sbq,
- unsigned int depth)
+static inline void __sbitmap_queue_update_wake_batch(struct sbitmap_queue *sbq,
+ unsigned int wake_batch)
{
- unsigned int wake_batch = sbq_calc_wake_batch(sbq, depth);
int i;
if (sbq->wake_batch != wake_batch) {
@@ -476,6 +475,26 @@ static void sbitmap_queue_update_wake_batch(struct sbitmap_queue *sbq,
}
}
+static void sbitmap_queue_update_wake_batch(struct sbitmap_queue *sbq,
+ unsigned int depth)
+{
+ unsigned int wake_batch;
+
+ wake_batch = sbq_calc_wake_batch(sbq, depth);
+ __sbitmap_queue_update_wake_batch(sbq, wake_batch);
+}
+
+void sbitmap_queue_recalculate_wake_batch(struct sbitmap_queue *sbq,
+ unsigned int users)
+{
+ unsigned int wake_batch;
+
+ wake_batch = clamp_val((sbq->sb.depth + users - 1) /
+ users, 4, SBQ_WAKE_BATCH);
+ __sbitmap_queue_update_wake_batch(sbq, wake_batch);
+}
+EXPORT_SYMBOL_GPL(sbitmap_queue_recalculate_wake_batch);
+
void sbitmap_queue_resize(struct sbitmap_queue *sbq, unsigned int depth)
{
sbitmap_queue_update_wake_batch(sbq, depth);
The patch below does not apply to the 4.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 180dccb0dba4f5e84a4a70c1be1d34cbb6528b32 Mon Sep 17 00:00:00 2001
From: Laibin Qiu <qiulaibin(a)huawei.com>
Date: Thu, 13 Jan 2022 10:55:36 +0800
Subject: [PATCH] blk-mq: fix tag_get wait task can't be awakened
In case of shared tags, there might be more than one hctx which
allocates from the same tags, and each hctx is limited to allocate at
most:
hctx_max_depth = max((bt->sb.depth + users - 1) / users, 4U);
Tag idle detection is lazy and may be delayed for 30 seconds, so there
could be just one really active hctx (queue) while all the others are
actually idle yet still accounted as active because of the lazy idle
detection. Then, if wake_batch is greater than hctx_max_depth, driver
tag allocation may wait forever on this really active hctx.
Fix this by recalculating wake_batch whenever active_queues is
incremented or decremented.
Fixes: 0d2602ca30e41 ("blk-mq: improve support for shared tags maps")
Suggested-by: Ming Lei <ming.lei(a)redhat.com>
Suggested-by: John Garry <john.garry(a)huawei.com>
Signed-off-by: Laibin Qiu <qiulaibin(a)huawei.com>
Reviewed-by: Andy Shevchenko <andriy.shevchenko(a)linux.intel.com>
Link: https://lore.kernel.org/r/20220113025536.1479653-1-qiulaibin@huawei.com
Signed-off-by: Jens Axboe <axboe(a)kernel.dk>
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index e55a6834c9a6..845f74e8dd7b 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -16,6 +16,21 @@
#include "blk-mq-sched.h"
#include "blk-mq-tag.h"
+/*
+ * Recalculate wakeup batch when tag is shared by hctx.
+ */
+static void blk_mq_update_wake_batch(struct blk_mq_tags *tags,
+ unsigned int users)
+{
+ if (!users)
+ return;
+
+ sbitmap_queue_recalculate_wake_batch(&tags->bitmap_tags,
+ users);
+ sbitmap_queue_recalculate_wake_batch(&tags->breserved_tags,
+ users);
+}
+
/*
* If a previously inactive queue goes active, bump the active user count.
* We need to do this before try to allocate driver tag, then even if fail
@@ -24,18 +39,26 @@
*/
bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx)
{
+ unsigned int users;
+
if (blk_mq_is_shared_tags(hctx->flags)) {
struct request_queue *q = hctx->queue;
- if (!test_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags) &&
- !test_and_set_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags))
- atomic_inc(&hctx->tags->active_queues);
+ if (test_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags) ||
+ test_and_set_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags)) {
+ return true;
+ }
} else {
- if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state) &&
- !test_and_set_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
- atomic_inc(&hctx->tags->active_queues);
+ if (test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state) ||
+ test_and_set_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state)) {
+ return true;
+ }
}
+ users = atomic_inc_return(&hctx->tags->active_queues);
+
+ blk_mq_update_wake_batch(hctx->tags, users);
+
return true;
}
@@ -56,6 +79,7 @@ void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool include_reserve)
void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx)
{
struct blk_mq_tags *tags = hctx->tags;
+ unsigned int users;
if (blk_mq_is_shared_tags(hctx->flags)) {
struct request_queue *q = hctx->queue;
@@ -68,7 +92,9 @@ void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx)
return;
}
- atomic_dec(&tags->active_queues);
+ users = atomic_dec_return(&tags->active_queues);
+
+ blk_mq_update_wake_batch(tags, users);
blk_mq_tag_wakeup_all(tags, false);
}
diff --git a/include/linux/sbitmap.h b/include/linux/sbitmap.h
index fc0357a6e19b..95df357ec009 100644
--- a/include/linux/sbitmap.h
+++ b/include/linux/sbitmap.h
@@ -415,6 +415,17 @@ static inline void sbitmap_queue_free(struct sbitmap_queue *sbq)
sbitmap_free(&sbq->sb);
}
+/**
+ * sbitmap_queue_recalculate_wake_batch() - Recalculate wake batch
+ * @sbq: Bitmap queue to recalculate wake batch.
+ * @users: Number of shares.
+ *
+ * Like sbitmap_queue_update_wake_batch(), this will calculate wake batch
+ * by depth. This interface is for HCTX shared tags or queue shared tags.
+ */
+void sbitmap_queue_recalculate_wake_batch(struct sbitmap_queue *sbq,
+ unsigned int users);
+
/**
* sbitmap_queue_resize() - Resize a &struct sbitmap_queue.
* @sbq: Bitmap queue to resize.
diff --git a/lib/sbitmap.c b/lib/sbitmap.c
index 2709ab825499..6220fa67fb7e 100644
--- a/lib/sbitmap.c
+++ b/lib/sbitmap.c
@@ -457,10 +457,9 @@ int sbitmap_queue_init_node(struct sbitmap_queue *sbq, unsigned int depth,
}
EXPORT_SYMBOL_GPL(sbitmap_queue_init_node);
-static void sbitmap_queue_update_wake_batch(struct sbitmap_queue *sbq,
- unsigned int depth)
+static inline void __sbitmap_queue_update_wake_batch(struct sbitmap_queue *sbq,
+ unsigned int wake_batch)
{
- unsigned int wake_batch = sbq_calc_wake_batch(sbq, depth);
int i;
if (sbq->wake_batch != wake_batch) {
@@ -476,6 +475,26 @@ static void sbitmap_queue_update_wake_batch(struct sbitmap_queue *sbq,
}
}
+static void sbitmap_queue_update_wake_batch(struct sbitmap_queue *sbq,
+ unsigned int depth)
+{
+ unsigned int wake_batch;
+
+ wake_batch = sbq_calc_wake_batch(sbq, depth);
+ __sbitmap_queue_update_wake_batch(sbq, wake_batch);
+}
+
+void sbitmap_queue_recalculate_wake_batch(struct sbitmap_queue *sbq,
+ unsigned int users)
+{
+ unsigned int wake_batch;
+
+ wake_batch = clamp_val((sbq->sb.depth + users - 1) /
+ users, 4, SBQ_WAKE_BATCH);
+ __sbitmap_queue_update_wake_batch(sbq, wake_batch);
+}
+EXPORT_SYMBOL_GPL(sbitmap_queue_recalculate_wake_batch);
+
void sbitmap_queue_resize(struct sbitmap_queue *sbq, unsigned int depth)
{
sbitmap_queue_update_wake_batch(sbq, depth);
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 80f15f3bef9e9c2cc29888a6773df44de0a0c65f Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean(a)nxp.com>
Date: Fri, 14 Jan 2022 15:36:37 +0200
Subject: [PATCH] net: mscc: ocelot: don't dereference NULL pointers with
shared tc filters
The following command sequence:
tc qdisc del dev swp0 clsact
tc qdisc add dev swp0 ingress_block 1 clsact
tc qdisc add dev swp1 ingress_block 1 clsact
tc filter add block 1 flower action drop
tc qdisc del dev swp0 clsact
produces the following NPD:
Unable to handle kernel NULL pointer dereference at virtual address 0000000000000014
pc : vcap_entry_set+0x14/0x70
lr : ocelot_vcap_filter_del+0x198/0x234
Call trace:
vcap_entry_set+0x14/0x70
ocelot_vcap_filter_del+0x198/0x234
ocelot_cls_flower_destroy+0x94/0xe4
felix_cls_flower_del+0x70/0x84
dsa_slave_setup_tc_block_cb+0x13c/0x60c
dsa_slave_setup_tc_block_cb_ig+0x20/0x30
tc_setup_cb_reoffload+0x44/0x120
fl_reoffload+0x280/0x320
tcf_block_playback_offloads+0x6c/0x184
tcf_block_unbind+0x80/0xe0
tcf_block_setup+0x174/0x214
tcf_block_offload_cmd.isra.0+0x100/0x13c
tcf_block_offload_unbind+0x5c/0xa0
__tcf_block_put+0x54/0x174
tcf_block_put_ext+0x5c/0x74
clsact_destroy+0x40/0x60
qdisc_destroy+0x4c/0x150
qdisc_put+0x70/0x90
qdisc_graft+0x3f0/0x4c0
tc_get_qdisc+0x1cc/0x364
rtnetlink_rcv_msg+0x124/0x340
The reason is that the driver isn't prepared to receive two tc filters
with the same cookie. It unconditionally creates a new struct
ocelot_vcap_filter for each tc filter, and it adds all filters with the
same identifier (cookie) to the ocelot_vcap_block.
The problem is here, in ocelot_vcap_filter_del():
/* Gets index of the filter */
index = ocelot_vcap_block_get_filter_index(block, filter);
if (index < 0)
return index;
/* Delete filter */
ocelot_vcap_block_remove_filter(ocelot, block, filter);
/* Move up all the blocks over the deleted filter */
for (i = index; i < block->count; i++) {
struct ocelot_vcap_filter *tmp;
tmp = ocelot_vcap_block_find_filter_by_index(block, i);
vcap_entry_set(ocelot, i, tmp);
}
what will happen is ocelot_vcap_block_get_filter_index() will return the
index (@index) of the first filter found with that cookie. This is _not_
the index of _this_ filter, but the other one with the same cookie,
because ocelot_vcap_filter_equal() gets fooled.
Then later, ocelot_vcap_block_remove_filter() is coded to remove all
filters that are ocelot_vcap_filter_equal() with the passed @filter.
So unexpectedly, both filters get deleted from the list.
Then ocelot_vcap_filter_del() will attempt to move all the other filters
up, again finding them by index (@i). The block count is 2, @index was 0,
so it will attempt to move up filters @i=0 and @i=1. It assigns tmp =
ocelot_vcap_block_find_filter_by_index(block, i), which is now a NULL
pointer because ocelot_vcap_block_remove_filter() has removed more than
one filter.
As far as I can see, this problem has been there since the introduction
of tc offload support; however, I cannot test beyond the blamed commit
due to hardware availability. In any case, any fix cannot be backported
that far, due to the many changes to the code base.
Therefore, let's go for the correct solution, which is to not call
ocelot_vcap_filter_add() and ocelot_vcap_filter_del(), unless the filter
is actually unique and not shared. For the shared filters, we should
just modify the ingress port mask and call ocelot_vcap_filter_replace(),
a function introduced by commit 95706be13b9f ("net: mscc: ocelot: create
a function that replaces an existing VCAP filter"). This way,
block->rules will only contain filters with unique cookies, by design.
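Condensed, the replace path added by the hunk below looks roughly like this
(error handling and the VCAP ES0 special case omitted; all names are taken
from the patch itself):

        filter = ocelot_vcap_block_find_filter_by_id(&ocelot->block[block_id],
                                                      f->cookie, true);
        if (filter) {
                /* Cookie already offloaded on another port of the shared
                 * block: only widen the ingress port mask of the one
                 * stored filter and rewrite it in hardware.
                 */
                filter->ingress_port_mask |= BIT(port);
                return ocelot_vcap_filter_replace(ocelot, filter);
        }
        /* Otherwise fall through and create a brand-new VCAP filter. */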
Fixes: 07d985eef073 ("net: dsa: felix: Wire up the ocelot cls_flower methods")
Signed-off-by: Vladimir Oltean <vladimir.oltean(a)nxp.com>
Signed-off-by: David S. Miller <davem(a)davemloft.net>
diff --git a/drivers/net/ethernet/mscc/ocelot_flower.c b/drivers/net/ethernet/mscc/ocelot_flower.c
index beb9379424c0..4a0fda22d343 100644
--- a/drivers/net/ethernet/mscc/ocelot_flower.c
+++ b/drivers/net/ethernet/mscc/ocelot_flower.c
@@ -805,13 +805,34 @@ int ocelot_cls_flower_replace(struct ocelot *ocelot, int port,
struct netlink_ext_ack *extack = f->common.extack;
struct ocelot_vcap_filter *filter;
int chain = f->common.chain_index;
- int ret;
+ int block_id, ret;
if (chain && !ocelot_find_vcap_filter_that_points_at(ocelot, chain)) {
NL_SET_ERR_MSG_MOD(extack, "No default GOTO action points to this chain");
return -EOPNOTSUPP;
}
+ block_id = ocelot_chain_to_block(chain, ingress);
+ if (block_id < 0) {
+ NL_SET_ERR_MSG_MOD(extack, "Cannot offload to this chain");
+ return -EOPNOTSUPP;
+ }
+
+ filter = ocelot_vcap_block_find_filter_by_id(&ocelot->block[block_id],
+ f->cookie, true);
+ if (filter) {
+ /* Filter already exists on other ports */
+ if (!ingress) {
+ NL_SET_ERR_MSG_MOD(extack, "VCAP ES0 does not support shared filters");
+ return -EOPNOTSUPP;
+ }
+
+ filter->ingress_port_mask |= BIT(port);
+
+ return ocelot_vcap_filter_replace(ocelot, filter);
+ }
+
+ /* Filter didn't exist, create it now */
filter = ocelot_vcap_filter_create(ocelot, port, ingress, f);
if (!filter)
return -ENOMEM;
@@ -874,6 +895,12 @@ int ocelot_cls_flower_destroy(struct ocelot *ocelot, int port,
if (filter->type == OCELOT_VCAP_FILTER_DUMMY)
return ocelot_vcap_dummy_filter_del(ocelot, filter);
+ if (ingress) {
+ filter->ingress_port_mask &= ~BIT(port);
+ if (filter->ingress_port_mask)
+ return ocelot_vcap_filter_replace(ocelot, filter);
+ }
+
return ocelot_vcap_filter_del(ocelot, filter);
}
EXPORT_SYMBOL_GPL(ocelot_cls_flower_destroy);
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 80f15f3bef9e9c2cc29888a6773df44de0a0c65f Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean(a)nxp.com>
Date: Fri, 14 Jan 2022 15:36:37 +0200
Subject: [PATCH] net: mscc: ocelot: don't dereference NULL pointers with
shared tc filters
The following command sequence:
tc qdisc del dev swp0 clsact
tc qdisc add dev swp0 ingress_block 1 clsact
tc qdisc add dev swp1 ingress_block 1 clsact
tc filter add block 1 flower action drop
tc qdisc del dev swp0 clsact
produces the following NPD:
Unable to handle kernel NULL pointer dereference at virtual address 0000000000000014
pc : vcap_entry_set+0x14/0x70
lr : ocelot_vcap_filter_del+0x198/0x234
Call trace:
vcap_entry_set+0x14/0x70
ocelot_vcap_filter_del+0x198/0x234
ocelot_cls_flower_destroy+0x94/0xe4
felix_cls_flower_del+0x70/0x84
dsa_slave_setup_tc_block_cb+0x13c/0x60c
dsa_slave_setup_tc_block_cb_ig+0x20/0x30
tc_setup_cb_reoffload+0x44/0x120
fl_reoffload+0x280/0x320
tcf_block_playback_offloads+0x6c/0x184
tcf_block_unbind+0x80/0xe0
tcf_block_setup+0x174/0x214
tcf_block_offload_cmd.isra.0+0x100/0x13c
tcf_block_offload_unbind+0x5c/0xa0
__tcf_block_put+0x54/0x174
tcf_block_put_ext+0x5c/0x74
clsact_destroy+0x40/0x60
qdisc_destroy+0x4c/0x150
qdisc_put+0x70/0x90
qdisc_graft+0x3f0/0x4c0
tc_get_qdisc+0x1cc/0x364
rtnetlink_rcv_msg+0x124/0x340
The reason is that the driver isn't prepared to receive two tc filters
with the same cookie. It unconditionally creates a new struct
ocelot_vcap_filter for each tc filter, and it adds all filters with the
same identifier (cookie) to the ocelot_vcap_block.
The problem is here, in ocelot_vcap_filter_del():
/* Gets index of the filter */
index = ocelot_vcap_block_get_filter_index(block, filter);
if (index < 0)
return index;
/* Delete filter */
ocelot_vcap_block_remove_filter(ocelot, block, filter);
/* Move up all the blocks over the deleted filter */
for (i = index; i < block->count; i++) {
struct ocelot_vcap_filter *tmp;
tmp = ocelot_vcap_block_find_filter_by_index(block, i);
vcap_entry_set(ocelot, i, tmp);
}
what will happen is ocelot_vcap_block_get_filter_index() will return the
index (@index) of the first filter found with that cookie. This is _not_
the index of _this_ filter, but the other one with the same cookie,
because ocelot_vcap_filter_equal() gets fooled.
Then later, ocelot_vcap_block_remove_filter() is coded to remove all
filters that are ocelot_vcap_filter_equal() with the passed @filter.
So unexpectedly, both filters get deleted from the list.
Then ocelot_vcap_filter_del() will attempt to move all the other filters
up, again finding them by index (@i). The block count is 2, @index was 0,
so it will attempt to move up filters @i=0 and @i=1. It assigns tmp =
ocelot_vcap_block_find_filter_by_index(block, i), which is now a NULL
pointer because ocelot_vcap_block_remove_filter() has removed more than
one filter.
As far as I can see, this problem has been there since the introduction
of tc offload support; however, I cannot test beyond the blamed commit
due to hardware availability. In any case, any fix cannot be backported
that far, due to lots of changes to the code base.
Therefore, let's go for the correct solution, which is to not call
ocelot_vcap_filter_add() and ocelot_vcap_filter_del(), unless the filter
is actually unique and not shared. For the shared filters, we should
just modify the ingress port mask and call ocelot_vcap_filter_replace(),
a function introduced by commit 95706be13b9f ("net: mscc: ocelot: create
a function that replaces an existing VCAP filter"). This way,
block->rules will only contain filters with unique cookies, by design.
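As an aside (not part of the patch), the sharing scheme can be pictured with a
tiny standalone sketch in which one rule object is reference-counted through
its port mask; every name below is invented for the illustration:
#include <stdio.h>

#define BIT(n) (1u << (n))

/* Toy stand-in for a shared VCAP rule; only what the sketch needs. */
struct toy_rule {
	unsigned long cookie;
	unsigned int ingress_port_mask;
};

/* Another port starts using the same rule: widen the mask, no new entry. */
static void toy_rule_bind(struct toy_rule *r, int port)
{
	r->ingress_port_mask |= BIT(port);
}

/* A port stops using the rule; returns 1 once the last user is gone. */
static int toy_rule_unbind(struct toy_rule *r, int port)
{
	r->ingress_port_mask &= ~BIT(port);
	return r->ingress_port_mask == 0;
}

int main(void)
{
	struct toy_rule r = { .cookie = 1, .ingress_port_mask = 0 };

	toy_rule_bind(&r, 0);	/* swp0 offloads the block 1 filter */
	toy_rule_bind(&r, 1);	/* swp1 shares it instead of duplicating it */
	printf("delete after swp0 leaves: %d\n", toy_rule_unbind(&r, 0)); /* 0 */
	printf("delete after swp1 leaves: %d\n", toy_rule_unbind(&r, 1)); /* 1 */
	return 0;
}
Only when the mask drops to zero is the single block->rules entry (and the
hardware rule) actually deleted, which is what the new ingress branch in
ocelot_cls_flower_destroy() below does.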
Fixes: 07d985eef073 ("net: dsa: felix: Wire up the ocelot cls_flower methods")
Signed-off-by: Vladimir Oltean <vladimir.oltean(a)nxp.com>
Signed-off-by: David S. Miller <davem(a)davemloft.net>
diff --git a/drivers/net/ethernet/mscc/ocelot_flower.c b/drivers/net/ethernet/mscc/ocelot_flower.c
index beb9379424c0..4a0fda22d343 100644
--- a/drivers/net/ethernet/mscc/ocelot_flower.c
+++ b/drivers/net/ethernet/mscc/ocelot_flower.c
@@ -805,13 +805,34 @@ int ocelot_cls_flower_replace(struct ocelot *ocelot, int port,
struct netlink_ext_ack *extack = f->common.extack;
struct ocelot_vcap_filter *filter;
int chain = f->common.chain_index;
- int ret;
+ int block_id, ret;
if (chain && !ocelot_find_vcap_filter_that_points_at(ocelot, chain)) {
NL_SET_ERR_MSG_MOD(extack, "No default GOTO action points to this chain");
return -EOPNOTSUPP;
}
+ block_id = ocelot_chain_to_block(chain, ingress);
+ if (block_id < 0) {
+ NL_SET_ERR_MSG_MOD(extack, "Cannot offload to this chain");
+ return -EOPNOTSUPP;
+ }
+
+ filter = ocelot_vcap_block_find_filter_by_id(&ocelot->block[block_id],
+ f->cookie, true);
+ if (filter) {
+ /* Filter already exists on other ports */
+ if (!ingress) {
+ NL_SET_ERR_MSG_MOD(extack, "VCAP ES0 does not support shared filters");
+ return -EOPNOTSUPP;
+ }
+
+ filter->ingress_port_mask |= BIT(port);
+
+ return ocelot_vcap_filter_replace(ocelot, filter);
+ }
+
+ /* Filter didn't exist, create it now */
filter = ocelot_vcap_filter_create(ocelot, port, ingress, f);
if (!filter)
return -ENOMEM;
@@ -874,6 +895,12 @@ int ocelot_cls_flower_destroy(struct ocelot *ocelot, int port,
if (filter->type == OCELOT_VCAP_FILTER_DUMMY)
return ocelot_vcap_dummy_filter_del(ocelot, filter);
+ if (ingress) {
+ filter->ingress_port_mask &= ~BIT(port);
+ if (filter->ingress_port_mask)
+ return ocelot_vcap_filter_replace(ocelot, filter);
+ }
+
return ocelot_vcap_filter_del(ocelot, filter);
}
EXPORT_SYMBOL_GPL(ocelot_cls_flower_destroy);
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 1771afd47430f5e95c9c3a2e3a8a63e67402d3fe Mon Sep 17 00:00:00 2001
From: Ard Biesheuvel <ardb(a)kernel.org>
Date: Tue, 18 Jan 2022 11:22:04 +0100
Subject: [PATCH] net: cpsw: avoid alignment faults by taking NET_IP_ALIGN into
account
Both versions of the CPSW driver declare a CPSW_HEADROOM_NA macro that
takes NET_IP_ALIGN into account, but fail to use it appropriately when
storing incoming packets in memory. This results in the IPv4 source and
destination addresses appearing misaligned in memory, which causes
alignment faults that need to be fixed up in software.
So let's switch from CPSW_HEADROOM to CPSW_HEADROOM_NA where needed.
This gets rid of any alignment faults on the RX path on a Beaglebone
White.
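The two bytes matter because of the 14-byte Ethernet header: without the
extra headroom the IP header, and the 32-bit addresses inside it, start at
offset 14 and are only 2-byte aligned. A standalone sketch of the arithmetic
(the two macros are redefined locally for the example, with their usual
kernel values):
#include <stdio.h>

#define ETH_HLEN	14	/* Ethernet header length */
#define NET_IP_ALIGN	2	/* extra RX headroom used by the kernel */

int main(void)
{
	unsigned long frame = 0;	/* assume the buffer starts 4-byte aligned */
	unsigned long ip_plain  = frame + ETH_HLEN;
	unsigned long ip_padded = frame + NET_IP_ALIGN + ETH_HLEN;

	/* prints 2: 32-bit loads of the IPv4 addresses are misaligned */
	printf("IP header offset %% 4 without padding: %lu\n", ip_plain % 4);
	/* prints 0: the IPv4 addresses are naturally aligned */
	printf("IP header offset %% 4 with padding:    %lu\n", ip_padded % 4);
	return 0;
}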
Fixes: 9ed4050c0d75 ("net: ethernet: ti: cpsw: add XDP support")
Cc: Grygorii Strashko <grygorii.strashko(a)ti.com>
Cc: Ilias Apalodimas <ilias.apalodimas(a)linaro.org>
Signed-off-by: Ard Biesheuvel <ardb(a)kernel.org>
Signed-off-by: David S. Miller <davem(a)davemloft.net>
diff --git a/drivers/net/ethernet/ti/cpsw.c b/drivers/net/ethernet/ti/cpsw.c
index 33142d505fc8..03575c017500 100644
--- a/drivers/net/ethernet/ti/cpsw.c
+++ b/drivers/net/ethernet/ti/cpsw.c
@@ -349,7 +349,7 @@ static void cpsw_rx_handler(void *token, int len, int status)
struct cpsw_common *cpsw = ndev_to_cpsw(xmeta->ndev);
int pkt_size = cpsw->rx_packet_max;
int ret = 0, port, ch = xmeta->ch;
- int headroom = CPSW_HEADROOM;
+ int headroom = CPSW_HEADROOM_NA;
struct net_device *ndev = xmeta->ndev;
struct cpsw_priv *priv;
struct page_pool *pool;
@@ -392,7 +392,7 @@ static void cpsw_rx_handler(void *token, int len, int status)
}
if (priv->xdp_prog) {
- int headroom = CPSW_HEADROOM, size = len;
+ int size = len;
xdp_init_buff(&xdp, PAGE_SIZE, &priv->xdp_rxq[ch]);
if (status & CPDMA_RX_VLAN_ENCAP) {
@@ -442,7 +442,7 @@ static void cpsw_rx_handler(void *token, int len, int status)
xmeta->ndev = ndev;
xmeta->ch = ch;
- dma = page_pool_get_dma_addr(new_page) + CPSW_HEADROOM;
+ dma = page_pool_get_dma_addr(new_page) + CPSW_HEADROOM_NA;
ret = cpdma_chan_submit_mapped(cpsw->rxv[ch].ch, new_page, dma,
pkt_size, 0);
if (ret < 0) {
diff --git a/drivers/net/ethernet/ti/cpsw_new.c b/drivers/net/ethernet/ti/cpsw_new.c
index 279e261e4720..bd4b1528cf99 100644
--- a/drivers/net/ethernet/ti/cpsw_new.c
+++ b/drivers/net/ethernet/ti/cpsw_new.c
@@ -283,7 +283,7 @@ static void cpsw_rx_handler(void *token, int len, int status)
{
struct page *new_page, *page = token;
void *pa = page_address(page);
- int headroom = CPSW_HEADROOM;
+ int headroom = CPSW_HEADROOM_NA;
struct cpsw_meta_xdp *xmeta;
struct cpsw_common *cpsw;
struct net_device *ndev;
@@ -336,7 +336,7 @@ static void cpsw_rx_handler(void *token, int len, int status)
}
if (priv->xdp_prog) {
- int headroom = CPSW_HEADROOM, size = len;
+ int size = len;
xdp_init_buff(&xdp, PAGE_SIZE, &priv->xdp_rxq[ch]);
if (status & CPDMA_RX_VLAN_ENCAP) {
@@ -386,7 +386,7 @@ static void cpsw_rx_handler(void *token, int len, int status)
xmeta->ndev = ndev;
xmeta->ch = ch;
- dma = page_pool_get_dma_addr(new_page) + CPSW_HEADROOM;
+ dma = page_pool_get_dma_addr(new_page) + CPSW_HEADROOM_NA;
ret = cpdma_chan_submit_mapped(cpsw->rxv[ch].ch, new_page, dma,
pkt_size, 0);
if (ret < 0) {
diff --git a/drivers/net/ethernet/ti/cpsw_priv.c b/drivers/net/ethernet/ti/cpsw_priv.c
index 3537502e5e8b..ba220593e6db 100644
--- a/drivers/net/ethernet/ti/cpsw_priv.c
+++ b/drivers/net/ethernet/ti/cpsw_priv.c
@@ -1122,7 +1122,7 @@ int cpsw_fill_rx_channels(struct cpsw_priv *priv)
xmeta->ndev = priv->ndev;
xmeta->ch = ch;
- dma = page_pool_get_dma_addr(page) + CPSW_HEADROOM;
+ dma = page_pool_get_dma_addr(page) + CPSW_HEADROOM_NA;
ret = cpdma_chan_idle_submit_mapped(cpsw->rxv[ch].ch,
page, dma,
cpsw->rx_packet_max,
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 1771afd47430f5e95c9c3a2e3a8a63e67402d3fe Mon Sep 17 00:00:00 2001
From: Ard Biesheuvel <ardb(a)kernel.org>
Date: Tue, 18 Jan 2022 11:22:04 +0100
Subject: [PATCH] net: cpsw: avoid alignment faults by taking NET_IP_ALIGN into
account
Both versions of the CPSW driver declare a CPSW_HEADROOM_NA macro that
takes NET_IP_ALIGN into account, but fail to use it appropriately when
storing incoming packets in memory. This results in the IPv4 source and
destination addresses appearing misaligned in memory, which causes
alignment faults that need to be fixed up in software.
So let's switch from CPSW_HEADROOM to CPSW_HEADROOM_NA where needed.
This gets rid of any alignment faults on the RX path on a Beaglebone
White.
Fixes: 9ed4050c0d75 ("net: ethernet: ti: cpsw: add XDP support")
Cc: Grygorii Strashko <grygorii.strashko(a)ti.com>
Cc: Ilias Apalodimas <ilias.apalodimas(a)linaro.org>
Signed-off-by: Ard Biesheuvel <ardb(a)kernel.org>
Signed-off-by: David S. Miller <davem(a)davemloft.net>
diff --git a/drivers/net/ethernet/ti/cpsw.c b/drivers/net/ethernet/ti/cpsw.c
index 33142d505fc8..03575c017500 100644
--- a/drivers/net/ethernet/ti/cpsw.c
+++ b/drivers/net/ethernet/ti/cpsw.c
@@ -349,7 +349,7 @@ static void cpsw_rx_handler(void *token, int len, int status)
struct cpsw_common *cpsw = ndev_to_cpsw(xmeta->ndev);
int pkt_size = cpsw->rx_packet_max;
int ret = 0, port, ch = xmeta->ch;
- int headroom = CPSW_HEADROOM;
+ int headroom = CPSW_HEADROOM_NA;
struct net_device *ndev = xmeta->ndev;
struct cpsw_priv *priv;
struct page_pool *pool;
@@ -392,7 +392,7 @@ static void cpsw_rx_handler(void *token, int len, int status)
}
if (priv->xdp_prog) {
- int headroom = CPSW_HEADROOM, size = len;
+ int size = len;
xdp_init_buff(&xdp, PAGE_SIZE, &priv->xdp_rxq[ch]);
if (status & CPDMA_RX_VLAN_ENCAP) {
@@ -442,7 +442,7 @@ static void cpsw_rx_handler(void *token, int len, int status)
xmeta->ndev = ndev;
xmeta->ch = ch;
- dma = page_pool_get_dma_addr(new_page) + CPSW_HEADROOM;
+ dma = page_pool_get_dma_addr(new_page) + CPSW_HEADROOM_NA;
ret = cpdma_chan_submit_mapped(cpsw->rxv[ch].ch, new_page, dma,
pkt_size, 0);
if (ret < 0) {
diff --git a/drivers/net/ethernet/ti/cpsw_new.c b/drivers/net/ethernet/ti/cpsw_new.c
index 279e261e4720..bd4b1528cf99 100644
--- a/drivers/net/ethernet/ti/cpsw_new.c
+++ b/drivers/net/ethernet/ti/cpsw_new.c
@@ -283,7 +283,7 @@ static void cpsw_rx_handler(void *token, int len, int status)
{
struct page *new_page, *page = token;
void *pa = page_address(page);
- int headroom = CPSW_HEADROOM;
+ int headroom = CPSW_HEADROOM_NA;
struct cpsw_meta_xdp *xmeta;
struct cpsw_common *cpsw;
struct net_device *ndev;
@@ -336,7 +336,7 @@ static void cpsw_rx_handler(void *token, int len, int status)
}
if (priv->xdp_prog) {
- int headroom = CPSW_HEADROOM, size = len;
+ int size = len;
xdp_init_buff(&xdp, PAGE_SIZE, &priv->xdp_rxq[ch]);
if (status & CPDMA_RX_VLAN_ENCAP) {
@@ -386,7 +386,7 @@ static void cpsw_rx_handler(void *token, int len, int status)
xmeta->ndev = ndev;
xmeta->ch = ch;
- dma = page_pool_get_dma_addr(new_page) + CPSW_HEADROOM;
+ dma = page_pool_get_dma_addr(new_page) + CPSW_HEADROOM_NA;
ret = cpdma_chan_submit_mapped(cpsw->rxv[ch].ch, new_page, dma,
pkt_size, 0);
if (ret < 0) {
diff --git a/drivers/net/ethernet/ti/cpsw_priv.c b/drivers/net/ethernet/ti/cpsw_priv.c
index 3537502e5e8b..ba220593e6db 100644
--- a/drivers/net/ethernet/ti/cpsw_priv.c
+++ b/drivers/net/ethernet/ti/cpsw_priv.c
@@ -1122,7 +1122,7 @@ int cpsw_fill_rx_channels(struct cpsw_priv *priv)
xmeta->ndev = priv->ndev;
xmeta->ch = ch;
- dma = page_pool_get_dma_addr(page) + CPSW_HEADROOM;
+ dma = page_pool_get_dma_addr(page) + CPSW_HEADROOM_NA;
ret = cpdma_chan_idle_submit_mapped(cpsw->rxv[ch].ch,
page, dma,
cpsw->rx_packet_max,
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 998c0bd2b3715244da7639cc4e6a2062cb79c3f4 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder(a)linaro.org>
Date: Wed, 12 Jan 2022 07:30:12 -0600
Subject: [PATCH] net: ipa: prevent concurrent replenish
We have seen cases where an endpoint RX completion interrupt arrives
while replenishing for the endpoint is underway. This causes another
instance of replenishing to begin as part of completing the receive
transaction. If this occurs, it can lead to transaction corruption.
Use a new flag to ensure only one replenish instance for an endpoint
executes at a time.
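The guard is the usual single-owner pattern: whoever wins test_and_set_bit()
does the refill, every other caller just records that more work is pending.
A minimal userspace analogue of the same idea, using C11 atomics instead of
the kernel bitops and with invented names:
#include <stdatomic.h>
#include <stdbool.h>

static atomic_flag replenish_active = ATOMIC_FLAG_INIT;
static atomic_int  replenish_backlog;

/* Returns false when another replenish pass was already running. */
static bool replenish(bool add_one)
{
	if (atomic_flag_test_and_set(&replenish_active)) {
		/* Someone else is replenishing; only grow the backlog. */
		if (add_one)
			atomic_fetch_add(&replenish_backlog, 1);
		return false;
	}

	/* ... drain replenish_backlog and refill receive buffers here ... */

	atomic_flag_clear(&replenish_active);
	if (add_one)
		atomic_fetch_add(&replenish_backlog, 1);
	return true;
}

int main(void)
{
	return replenish(true) ? 0 : 1;	/* the first caller wins and refills */
}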
Fixes: 84f9bd12d46db ("soc: qcom: ipa: IPA endpoints")
Signed-off-by: Alex Elder <elder(a)linaro.org>
Signed-off-by: David S. Miller <davem(a)davemloft.net>
diff --git a/drivers/net/ipa/ipa_endpoint.c b/drivers/net/ipa/ipa_endpoint.c
index cddddcedaf72..68291a3efd04 100644
--- a/drivers/net/ipa/ipa_endpoint.c
+++ b/drivers/net/ipa/ipa_endpoint.c
@@ -1088,15 +1088,27 @@ static void ipa_endpoint_replenish(struct ipa_endpoint *endpoint, bool add_one)
return;
}
+ /* If already active, just update the backlog */
+ if (test_and_set_bit(IPA_REPLENISH_ACTIVE, endpoint->replenish_flags)) {
+ if (add_one)
+ atomic_inc(&endpoint->replenish_backlog);
+ return;
+ }
+
while (atomic_dec_not_zero(&endpoint->replenish_backlog))
if (ipa_endpoint_replenish_one(endpoint))
goto try_again_later;
+
+ clear_bit(IPA_REPLENISH_ACTIVE, endpoint->replenish_flags);
+
if (add_one)
atomic_inc(&endpoint->replenish_backlog);
return;
try_again_later:
+ clear_bit(IPA_REPLENISH_ACTIVE, endpoint->replenish_flags);
+
/* The last one didn't succeed, so fix the backlog */
delta = add_one ? 2 : 1;
backlog = atomic_add_return(delta, &endpoint->replenish_backlog);
@@ -1691,6 +1703,7 @@ static void ipa_endpoint_setup_one(struct ipa_endpoint *endpoint)
* backlog is the same as the maximum outstanding TREs.
*/
clear_bit(IPA_REPLENISH_ENABLED, endpoint->replenish_flags);
+ clear_bit(IPA_REPLENISH_ACTIVE, endpoint->replenish_flags);
atomic_set(&endpoint->replenish_saved,
gsi_channel_tre_max(gsi, endpoint->channel_id));
atomic_set(&endpoint->replenish_backlog, 0);
diff --git a/drivers/net/ipa/ipa_endpoint.h b/drivers/net/ipa/ipa_endpoint.h
index 07d5c20e5f00..0313cdc607de 100644
--- a/drivers/net/ipa/ipa_endpoint.h
+++ b/drivers/net/ipa/ipa_endpoint.h
@@ -44,10 +44,12 @@ enum ipa_endpoint_name {
* enum ipa_replenish_flag: RX buffer replenish flags
*
* @IPA_REPLENISH_ENABLED: Whether receive buffer replenishing is enabled
+ * @IPA_REPLENISH_ACTIVE: Whether replenishing is underway
* @IPA_REPLENISH_COUNT: Number of defined replenish flags
*/
enum ipa_replenish_flag {
IPA_REPLENISH_ENABLED,
+ IPA_REPLENISH_ACTIVE,
IPA_REPLENISH_COUNT, /* Number of flags (must be last) */
};
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 998c0bd2b3715244da7639cc4e6a2062cb79c3f4 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder(a)linaro.org>
Date: Wed, 12 Jan 2022 07:30:12 -0600
Subject: [PATCH] net: ipa: prevent concurrent replenish
We have seen cases where an endpoint RX completion interrupt arrives
while replenishing for the endpoint is underway. This causes another
instance of replenishing to begin as part of completing the receive
transaction. If this occurs, it can lead to transaction corruption.
Use a new flag to ensure only one replenish instance for an endpoint
executes at a time.
Fixes: 84f9bd12d46db ("soc: qcom: ipa: IPA endpoints")
Signed-off-by: Alex Elder <elder(a)linaro.org>
Signed-off-by: David S. Miller <davem(a)davemloft.net>
diff --git a/drivers/net/ipa/ipa_endpoint.c b/drivers/net/ipa/ipa_endpoint.c
index cddddcedaf72..68291a3efd04 100644
--- a/drivers/net/ipa/ipa_endpoint.c
+++ b/drivers/net/ipa/ipa_endpoint.c
@@ -1088,15 +1088,27 @@ static void ipa_endpoint_replenish(struct ipa_endpoint *endpoint, bool add_one)
return;
}
+ /* If already active, just update the backlog */
+ if (test_and_set_bit(IPA_REPLENISH_ACTIVE, endpoint->replenish_flags)) {
+ if (add_one)
+ atomic_inc(&endpoint->replenish_backlog);
+ return;
+ }
+
while (atomic_dec_not_zero(&endpoint->replenish_backlog))
if (ipa_endpoint_replenish_one(endpoint))
goto try_again_later;
+
+ clear_bit(IPA_REPLENISH_ACTIVE, endpoint->replenish_flags);
+
if (add_one)
atomic_inc(&endpoint->replenish_backlog);
return;
try_again_later:
+ clear_bit(IPA_REPLENISH_ACTIVE, endpoint->replenish_flags);
+
/* The last one didn't succeed, so fix the backlog */
delta = add_one ? 2 : 1;
backlog = atomic_add_return(delta, &endpoint->replenish_backlog);
@@ -1691,6 +1703,7 @@ static void ipa_endpoint_setup_one(struct ipa_endpoint *endpoint)
* backlog is the same as the maximum outstanding TREs.
*/
clear_bit(IPA_REPLENISH_ENABLED, endpoint->replenish_flags);
+ clear_bit(IPA_REPLENISH_ACTIVE, endpoint->replenish_flags);
atomic_set(&endpoint->replenish_saved,
gsi_channel_tre_max(gsi, endpoint->channel_id));
atomic_set(&endpoint->replenish_backlog, 0);
diff --git a/drivers/net/ipa/ipa_endpoint.h b/drivers/net/ipa/ipa_endpoint.h
index 07d5c20e5f00..0313cdc607de 100644
--- a/drivers/net/ipa/ipa_endpoint.h
+++ b/drivers/net/ipa/ipa_endpoint.h
@@ -44,10 +44,12 @@ enum ipa_endpoint_name {
* enum ipa_replenish_flag: RX buffer replenish flags
*
* @IPA_REPLENISH_ENABLED: Whether receive buffer replenishing is enabled
+ * @IPA_REPLENISH_ACTIVE: Whether replenishing is underway
* @IPA_REPLENISH_COUNT: Number of defined replenish flags
*/
enum ipa_replenish_flag {
IPA_REPLENISH_ENABLED,
+ IPA_REPLENISH_ACTIVE,
IPA_REPLENISH_COUNT, /* Number of flags (must be last) */
};
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 6c0e3b5ce94947b311348c367db9e11dcb2ccc93 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder(a)linaro.org>
Date: Wed, 12 Jan 2022 07:30:10 -0600
Subject: [PATCH] net: ipa: fix atomic update in ipa_endpoint_replenish()
In ipa_endpoint_replenish(), if an error occurs when attempting to
replenish a receive buffer, we just quit and try again later. In
that case we increment the backlog count to reflect that the attempt
was unsuccessful. Then, if the add_one flag was true we increment
the backlog again.
This second increment is not included in the backlog local variable
though, and its value determines whether delayed work should be
scheduled. This is a bug.
Fix this by determining whether 1 or 2 should be added to the
backlog before adding it in an atomic_add_return() call.
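Put differently, the old code read the counter from one atomic operation and
then bumped it again without folding that into the value it acted on; the fix
performs a single atomic update whose return value already accounts for
add_one. A userspace sketch of the difference, with C11 atomics standing in
for the kernel's atomic_t (atomic_fetch_add() returns the old value, so the
new value is old + delta):
#include <stdatomic.h>

static atomic_int backlog;

/* Old behaviour: the value used for the scheduling decision misses the
 * extra increment done for add_one. */
static int fixup_backlog_old(int add_one)
{
	int seen = atomic_fetch_add(&backlog, 1) + 1;

	if (add_one)
		atomic_fetch_add(&backlog, 1);	/* not reflected in 'seen' */
	return seen;
}

/* Fixed behaviour: one atomic add whose result reflects both increments. */
static int fixup_backlog_fixed(int add_one)
{
	int delta = add_one ? 2 : 1;

	return atomic_fetch_add(&backlog, delta) + delta;
}

int main(void)
{
	int old, fixed;

	atomic_store(&backlog, 0);
	old = fixup_backlog_old(1);	/* returns 1, backlog ends at 2 */

	atomic_store(&backlog, 0);
	fixed = fixup_backlog_fixed(1);	/* returns 2, backlog ends at 2 */

	return old == fixed ? 0 : 1;	/* exits 1: the decisions can differ */
}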
Reviewed-by: Matthias Kaehlcke <mka(a)chromium.org>
Fixes: 84f9bd12d46db ("soc: qcom: ipa: IPA endpoints")
Signed-off-by: Alex Elder <elder(a)linaro.org>
Signed-off-by: David S. Miller <davem(a)davemloft.net>
diff --git a/drivers/net/ipa/ipa_endpoint.c b/drivers/net/ipa/ipa_endpoint.c
index 49d9a077d037..8b055885cf3c 100644
--- a/drivers/net/ipa/ipa_endpoint.c
+++ b/drivers/net/ipa/ipa_endpoint.c
@@ -1080,6 +1080,7 @@ static void ipa_endpoint_replenish(struct ipa_endpoint *endpoint, bool add_one)
{
struct gsi *gsi;
u32 backlog;
+ int delta;
if (!endpoint->replenish_enabled) {
if (add_one)
@@ -1097,10 +1098,8 @@ static void ipa_endpoint_replenish(struct ipa_endpoint *endpoint, bool add_one)
try_again_later:
/* The last one didn't succeed, so fix the backlog */
- backlog = atomic_inc_return(&endpoint->replenish_backlog);
-
- if (add_one)
- atomic_inc(&endpoint->replenish_backlog);
+ delta = add_one ? 2 : 1;
+ backlog = atomic_add_return(delta, &endpoint->replenish_backlog);
/* Whenever a receive buffer transaction completes we'll try to
* replenish again. It's unlikely, but if we fail to supply even
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 4624f199327a704dd1069aca1c3cadb8f2a28c6f Mon Sep 17 00:00:00 2001
From: Zechuan Chen <chenzechuan1(a)huawei.com>
Date: Tue, 28 Dec 2021 19:13:38 +0800
Subject: [PATCH] perf probe: Fix ppc64 'perf probe add events failed' case
Because of commit bf794bf52a80c627 ("powerpc/kprobes: Fix kallsyms
lookup across powerpc ABIv1 and ABIv2"), on ppc64 ABIv1 the perf
command no longer needs the "." prefix on symbol names.
But when the command "perf probe -a schedule" is executed on ppc64
ABIv1, it finds two matching symbol entries in /proc/kallsyms,
for example:
cat /proc/kallsyms | grep -w schedule
c000000000657020 T .schedule
c000000000d4fdb8 D schedule
The symbol "D schedule" is not a function symbol, and perf will print:
"p:probe/schedule _text+13958584"Failed to write event: Invalid argument
Therefore, when searching for symbols in the map and adding probe
points for them, a symbol type check is added: if the symbol is not a
function, skip it.
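The guard itself is just a check on the ELF symbol type before a probe is
placed. A small standalone illustration of the same filter applied to the two
kallsyms entries above (the struct is a stand-in; STT_FUNC and STT_OBJECT
come from <elf.h>):
#include <elf.h>
#include <stdio.h>

struct toy_sym {
	unsigned char type;	/* STT_FUNC, STT_OBJECT, ... */
	const char *name;
};

int main(void)
{
	struct toy_sym syms[] = {
		{ STT_FUNC,   ".schedule" },	/* "T .schedule": probe it */
		{ STT_OBJECT, "schedule"   },	/* "D schedule": skip it   */
	};

	for (unsigned int i = 0; i < sizeof(syms) / sizeof(syms[0]); i++) {
		if (syms[i].type != STT_FUNC)
			continue;	/* the same guard the patch adds */
		printf("would place a probe on %s\n", syms[i].name);
	}
	return 0;
}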
Fixes: bf794bf52a80c627 ("powerpc/kprobes: Fix kallsyms lookup across powerpc ABIv1 and ABIv2")
Signed-off-by: Zechuan Chen <chenzechuan1(a)huawei.com>
Acked-by: Masami Hiramatsu <mhiramat(a)kernel.org>
Cc: Alexander Shishkin <alexander.shishkin(a)linux.intel.com>
Cc: Ingo Molnar <mingo(a)redhat.com>
Cc: Jianlin Lv <Jianlin.Lv(a)arm.com>
Cc: Jin Yao <yao.jin(a)linux.intel.com>
Cc: Jiri Olsa <jolsa(a)redhat.com>
Cc: Mark Rutland <mark.rutland(a)arm.com>
Cc: Michael Ellerman <mpe(a)ellerman.id.au>
Cc: Namhyung Kim <namhyung(a)kernel.org>
Cc: Naveen N. Rao <naveen.n.rao(a)linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz(a)infradead.org>
Cc: Ravi Bangoria <ravi.bangoria(a)linux.ibm.com>
Cc: Yang Jihong <yangjihong1(a)huawei.com>
Link: https://lore.kernel.org/r/20211228111338.218602-1-chenzechuan1@huawei.com
Signed-off-by: Arnaldo Carvalho de Melo <acme(a)redhat.com>
diff --git a/tools/perf/util/probe-event.c b/tools/perf/util/probe-event.c
index b2a02c9ab8ea..a834918a0a0d 100644
--- a/tools/perf/util/probe-event.c
+++ b/tools/perf/util/probe-event.c
@@ -3083,6 +3083,9 @@ static int find_probe_trace_events_from_map(struct perf_probe_event *pev,
for (j = 0; j < num_matched_functions; j++) {
sym = syms[j];
+ if (sym->type != STT_FUNC)
+ continue;
+
/* There can be duplicated symbols in the map */
for (i = 0; i < j; i++)
if (sym->start == syms[i]->start) {
The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 4624f199327a704dd1069aca1c3cadb8f2a28c6f Mon Sep 17 00:00:00 2001
From: Zechuan Chen <chenzechuan1(a)huawei.com>
Date: Tue, 28 Dec 2021 19:13:38 +0800
Subject: [PATCH] perf probe: Fix ppc64 'perf probe add events failed' case
Because of commit bf794bf52a80c627 ("powerpc/kprobes: Fix kallsyms
lookup across powerpc ABIv1 and ABIv2"), on ppc64 ABIv1 the perf
command no longer needs the "." prefix on symbol names.
But when the command "perf probe -a schedule" is executed on ppc64
ABIv1, it finds two matching symbol entries in /proc/kallsyms,
for example:
cat /proc/kallsyms | grep -w schedule
c000000000657020 T .schedule
c000000000d4fdb8 D schedule
The symbol "D schedule" is not a function symbol, and perf will print:
"p:probe/schedule _text+13958584"Failed to write event: Invalid argument
Therefore, when searching for symbols in the map and adding probe
points for them, a symbol type check is added: if the symbol is not a
function, skip it.
Fixes: bf794bf52a80c627 ("powerpc/kprobes: Fix kallsyms lookup across powerpc ABIv1 and ABIv2")
Signed-off-by: Zechuan Chen <chenzechuan1(a)huawei.com>
Acked-by: Masami Hiramatsu <mhiramat(a)kernel.org>
Cc: Alexander Shishkin <alexander.shishkin(a)linux.intel.com>
Cc: Ingo Molnar <mingo(a)redhat.com>
Cc: Jianlin Lv <Jianlin.Lv(a)arm.com>
Cc: Jin Yao <yao.jin(a)linux.intel.com>
Cc: Jiri Olsa <jolsa(a)redhat.com>
Cc: Mark Rutland <mark.rutland(a)arm.com>
Cc: Michael Ellerman <mpe(a)ellerman.id.au>
Cc: Namhyung Kim <namhyung(a)kernel.org>
Cc: Naveen N. Rao <naveen.n.rao(a)linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz(a)infradead.org>
Cc: Ravi Bangoria <ravi.bangoria(a)linux.ibm.com>
Cc: Yang Jihong <yangjihong1(a)huawei.com>
Link: https://lore.kernel.org/r/20211228111338.218602-1-chenzechuan1@huawei.com
Signed-off-by: Arnaldo Carvalho de Melo <acme(a)redhat.com>
diff --git a/tools/perf/util/probe-event.c b/tools/perf/util/probe-event.c
index b2a02c9ab8ea..a834918a0a0d 100644
--- a/tools/perf/util/probe-event.c
+++ b/tools/perf/util/probe-event.c
@@ -3083,6 +3083,9 @@ static int find_probe_trace_events_from_map(struct perf_probe_event *pev,
for (j = 0; j < num_matched_functions; j++) {
sym = syms[j];
+ if (sym->type != STT_FUNC)
+ continue;
+
/* There can be duplicated symbols in the map */
for (i = 0; i < j; i++)
if (sym->start == syms[i]->start) {
The patch below does not apply to the 4.9-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 4624f199327a704dd1069aca1c3cadb8f2a28c6f Mon Sep 17 00:00:00 2001
From: Zechuan Chen <chenzechuan1(a)huawei.com>
Date: Tue, 28 Dec 2021 19:13:38 +0800
Subject: [PATCH] perf probe: Fix ppc64 'perf probe add events failed' case
Because of commit bf794bf52a80c627 ("powerpc/kprobes: Fix kallsyms
lookup across powerpc ABIv1 and ABIv2"), on ppc64 ABIv1 the perf
command no longer needs the "." prefix on symbol names.
But when the command "perf probe -a schedule" is executed on ppc64
ABIv1, it finds two matching symbol entries in /proc/kallsyms,
for example:
cat /proc/kallsyms | grep -w schedule
c000000000657020 T .schedule
c000000000d4fdb8 D schedule
The symbol "D schedule" is not a function symbol, and perf will print:
"p:probe/schedule _text+13958584"Failed to write event: Invalid argument
Therefore, when searching for symbols in the map and adding probe
points for them, a symbol type check is added: if the symbol is not a
function, skip it.
Fixes: bf794bf52a80c627 ("powerpc/kprobes: Fix kallsyms lookup across powerpc ABIv1 and ABIv2")
Signed-off-by: Zechuan Chen <chenzechuan1(a)huawei.com>
Acked-by: Masami Hiramatsu <mhiramat(a)kernel.org>
Cc: Alexander Shishkin <alexander.shishkin(a)linux.intel.com>
Cc: Ingo Molnar <mingo(a)redhat.com>
Cc: Jianlin Lv <Jianlin.Lv(a)arm.com>
Cc: Jin Yao <yao.jin(a)linux.intel.com>
Cc: Jiri Olsa <jolsa(a)redhat.com>
Cc: Mark Rutland <mark.rutland(a)arm.com>
Cc: Michael Ellerman <mpe(a)ellerman.id.au>
Cc: Namhyung Kim <namhyung(a)kernel.org>
Cc: Naveen N. Rao <naveen.n.rao(a)linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz(a)infradead.org>
Cc: Ravi Bangoria <ravi.bangoria(a)linux.ibm.com>
Cc: Yang Jihong <yangjihong1(a)huawei.com>
Link: https://lore.kernel.org/r/20211228111338.218602-1-chenzechuan1@huawei.com
Signed-off-by: Arnaldo Carvalho de Melo <acme(a)redhat.com>
diff --git a/tools/perf/util/probe-event.c b/tools/perf/util/probe-event.c
index b2a02c9ab8ea..a834918a0a0d 100644
--- a/tools/perf/util/probe-event.c
+++ b/tools/perf/util/probe-event.c
@@ -3083,6 +3083,9 @@ static int find_probe_trace_events_from_map(struct perf_probe_event *pev,
for (j = 0; j < num_matched_functions; j++) {
sym = syms[j];
+ if (sym->type != STT_FUNC)
+ continue;
+
/* There can be duplicated symbols in the map */
for (i = 0; i < j; i++)
if (sym->start == syms[i]->start) {
The patch below does not apply to the 4.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 4624f199327a704dd1069aca1c3cadb8f2a28c6f Mon Sep 17 00:00:00 2001
From: Zechuan Chen <chenzechuan1(a)huawei.com>
Date: Tue, 28 Dec 2021 19:13:38 +0800
Subject: [PATCH] perf probe: Fix ppc64 'perf probe add events failed' case
Because of commit bf794bf52a80c627 ("powerpc/kprobes: Fix kallsyms
lookup across powerpc ABIv1 and ABIv2"), on ppc64 ABIv1 the perf
command no longer needs the "." prefix on symbol names.
But when the command "perf probe -a schedule" is executed on ppc64
ABIv1, it finds two matching symbol entries in /proc/kallsyms,
for example:
cat /proc/kallsyms | grep -w schedule
c000000000657020 T .schedule
c000000000d4fdb8 D schedule
The symbol "D schedule" is not a function symbol, and perf will print:
"p:probe/schedule _text+13958584"Failed to write event: Invalid argument
Therefore, when searching for symbols in the map and adding probe
points for them, a symbol type check is added: if the symbol is not a
function, skip it.
Fixes: bf794bf52a80c627 ("powerpc/kprobes: Fix kallsyms lookup across powerpc ABIv1 and ABIv2")
Signed-off-by: Zechuan Chen <chenzechuan1(a)huawei.com>
Acked-by: Masami Hiramatsu <mhiramat(a)kernel.org>
Cc: Alexander Shishkin <alexander.shishkin(a)linux.intel.com>
Cc: Ingo Molnar <mingo(a)redhat.com>
Cc: Jianlin Lv <Jianlin.Lv(a)arm.com>
Cc: Jin Yao <yao.jin(a)linux.intel.com>
Cc: Jiri Olsa <jolsa(a)redhat.com>
Cc: Mark Rutland <mark.rutland(a)arm.com>
Cc: Michael Ellerman <mpe(a)ellerman.id.au>
Cc: Namhyung Kim <namhyung(a)kernel.org>
Cc: Naveen N. Rao <naveen.n.rao(a)linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz(a)infradead.org>
Cc: Ravi Bangoria <ravi.bangoria(a)linux.ibm.com>
Cc: Yang Jihong <yangjihong1(a)huawei.com>
Link: https://lore.kernel.org/r/20211228111338.218602-1-chenzechuan1@huawei.com
Signed-off-by: Arnaldo Carvalho de Melo <acme(a)redhat.com>
diff --git a/tools/perf/util/probe-event.c b/tools/perf/util/probe-event.c
index b2a02c9ab8ea..a834918a0a0d 100644
--- a/tools/perf/util/probe-event.c
+++ b/tools/perf/util/probe-event.c
@@ -3083,6 +3083,9 @@ static int find_probe_trace_events_from_map(struct perf_probe_event *pev,
for (j = 0; j < num_matched_functions; j++) {
sym = syms[j];
+ if (sym->type != STT_FUNC)
+ continue;
+
/* There can be duplicated symbols in the map */
for (i = 0; i < j; i++)
if (sym->start == syms[i]->start) {
The patch below does not apply to the 5.16-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From b63e5cb94ad6947ab5fe38b5a9417dcfd0bc6122 Mon Sep 17 00:00:00 2001
From: Tudor Ambarus <tudor.ambarus(a)microchip.com>
Date: Wed, 15 Dec 2021 13:01:08 +0200
Subject: [PATCH] dmaengine: at_xdmac: Fix race for the tx desc callback
The transfer descriptors were wrongly moved to the free descriptors list
before calling the tx desc callback. As the DMA engine drivers drop any
locks before calling the callback function, txd could be taken again,
resulting in its callback being called prematurely. Fix the race for the tx desc
callback by moving the xfer desc into the free desc list after the
callback is invoked.
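Condensed, the ordering the fix enforces in the tasklet is: complete the
cookie and unlink the transfer under the channel lock, drop the lock to run
the client callback, and only then hand the descriptors back to the free
list. A sketch of that sequence, abridged from the hunks below (not a literal
copy of the driver):
	spin_lock_irq(&atchan->lock);
	dma_cookie_complete(txd);
	/* Unlink from xfers_list, but keep desc->descs_list intact. */
	list_del(&desc->xfer_node);
	spin_unlock_irq(&atchan->lock);

	/* The lock is dropped here, so the callback may reuse/resubmit txd. */
	if (txd->flags & DMA_PREP_INTERRUPT)
		dmaengine_desc_get_callback_invoke(txd, NULL);
	dma_run_dependencies(txd);

	spin_lock_irq(&atchan->lock);
	/* Only now is it safe to recycle the descriptors. */
	list_splice_init(&desc->descs_list, &atchan->free_descs_list);
	at_xdmac_advance_work(atchan);
	spin_unlock_irq(&atchan->lock);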
Fixes: e1f7c9eee707 ("dmaengine: at_xdmac: creation of the atmel eXtended DMA Controller driver")
Signed-off-by: Tudor Ambarus <tudor.ambarus(a)microchip.com>
Link: https://lore.kernel.org/r/20211215110115.191749-6-tudor.ambarus@microchip.c…
Signed-off-by: Vinod Koul <vkoul(a)kernel.org>
diff --git a/drivers/dma/at_xdmac.c b/drivers/dma/at_xdmac.c
index 83c031207530..d5b37459f906 100644
--- a/drivers/dma/at_xdmac.c
+++ b/drivers/dma/at_xdmac.c
@@ -1582,20 +1582,6 @@ at_xdmac_tx_status(struct dma_chan *chan, dma_cookie_t cookie,
return ret;
}
-/* Call must be protected by lock. */
-static void at_xdmac_remove_xfer(struct at_xdmac_chan *atchan,
- struct at_xdmac_desc *desc)
-{
- dev_dbg(chan2dev(&atchan->chan), "%s: desc 0x%p\n", __func__, desc);
-
- /*
- * Remove the transfer from the transfer list then move the transfer
- * descriptors into the free descriptors list.
- */
- list_del(&desc->xfer_node);
- list_splice_init(&desc->descs_list, &atchan->free_descs_list);
-}
-
static void at_xdmac_advance_work(struct at_xdmac_chan *atchan)
{
struct at_xdmac_desc *desc;
@@ -1704,7 +1690,8 @@ static void at_xdmac_tasklet(struct tasklet_struct *t)
txd = &desc->tx_dma_desc;
dma_cookie_complete(txd);
- at_xdmac_remove_xfer(atchan, desc);
+ /* Remove the transfer from the transfer list. */
+ list_del(&desc->xfer_node);
spin_unlock_irq(&atchan->lock);
if (txd->flags & DMA_PREP_INTERRUPT)
@@ -1713,6 +1700,8 @@ static void at_xdmac_tasklet(struct tasklet_struct *t)
dma_run_dependencies(txd);
spin_lock_irq(&atchan->lock);
+ /* Move the xfer descriptors into the free descriptors list. */
+ list_splice_init(&desc->descs_list, &atchan->free_descs_list);
at_xdmac_advance_work(atchan);
spin_unlock_irq(&atchan->lock);
}
@@ -1859,8 +1848,10 @@ static int at_xdmac_device_terminate_all(struct dma_chan *chan)
cpu_relax();
/* Cancel all pending transfers. */
- list_for_each_entry_safe(desc, _desc, &atchan->xfers_list, xfer_node)
- at_xdmac_remove_xfer(atchan, desc);
+ list_for_each_entry_safe(desc, _desc, &atchan->xfers_list, xfer_node) {
+ list_del(&desc->xfer_node);
+ list_splice_init(&desc->descs_list, &atchan->free_descs_list);
+ }
clear_bit(AT_XDMAC_CHAN_IS_PAUSED, &atchan->status);
clear_bit(AT_XDMAC_CHAN_IS_CYCLIC, &atchan->status);
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From b63e5cb94ad6947ab5fe38b5a9417dcfd0bc6122 Mon Sep 17 00:00:00 2001
From: Tudor Ambarus <tudor.ambarus(a)microchip.com>
Date: Wed, 15 Dec 2021 13:01:08 +0200
Subject: [PATCH] dmaengine: at_xdmac: Fix race for the tx desc callback
The transfer descriptors were wrongly moved to the free descriptors list
before calling the tx desc callback. As the DMA engine drivers drop any
locks before calling the callback function, txd could be taken again,
resulting in its callback being called prematurely. Fix the race for the tx desc
callback by moving the xfer desc into the free desc list after the
callback is invoked.
Fixes: e1f7c9eee707 ("dmaengine: at_xdmac: creation of the atmel eXtended DMA Controller driver")
Signed-off-by: Tudor Ambarus <tudor.ambarus(a)microchip.com>
Link: https://lore.kernel.org/r/20211215110115.191749-6-tudor.ambarus@microchip.c…
Signed-off-by: Vinod Koul <vkoul(a)kernel.org>
diff --git a/drivers/dma/at_xdmac.c b/drivers/dma/at_xdmac.c
index 83c031207530..d5b37459f906 100644
--- a/drivers/dma/at_xdmac.c
+++ b/drivers/dma/at_xdmac.c
@@ -1582,20 +1582,6 @@ at_xdmac_tx_status(struct dma_chan *chan, dma_cookie_t cookie,
return ret;
}
-/* Call must be protected by lock. */
-static void at_xdmac_remove_xfer(struct at_xdmac_chan *atchan,
- struct at_xdmac_desc *desc)
-{
- dev_dbg(chan2dev(&atchan->chan), "%s: desc 0x%p\n", __func__, desc);
-
- /*
- * Remove the transfer from the transfer list then move the transfer
- * descriptors into the free descriptors list.
- */
- list_del(&desc->xfer_node);
- list_splice_init(&desc->descs_list, &atchan->free_descs_list);
-}
-
static void at_xdmac_advance_work(struct at_xdmac_chan *atchan)
{
struct at_xdmac_desc *desc;
@@ -1704,7 +1690,8 @@ static void at_xdmac_tasklet(struct tasklet_struct *t)
txd = &desc->tx_dma_desc;
dma_cookie_complete(txd);
- at_xdmac_remove_xfer(atchan, desc);
+ /* Remove the transfer from the transfer list. */
+ list_del(&desc->xfer_node);
spin_unlock_irq(&atchan->lock);
if (txd->flags & DMA_PREP_INTERRUPT)
@@ -1713,6 +1700,8 @@ static void at_xdmac_tasklet(struct tasklet_struct *t)
dma_run_dependencies(txd);
spin_lock_irq(&atchan->lock);
+ /* Move the xfer descriptors into the free descriptors list. */
+ list_splice_init(&desc->descs_list, &atchan->free_descs_list);
at_xdmac_advance_work(atchan);
spin_unlock_irq(&atchan->lock);
}
@@ -1859,8 +1848,10 @@ static int at_xdmac_device_terminate_all(struct dma_chan *chan)
cpu_relax();
/* Cancel all pending transfers. */
- list_for_each_entry_safe(desc, _desc, &atchan->xfers_list, xfer_node)
- at_xdmac_remove_xfer(atchan, desc);
+ list_for_each_entry_safe(desc, _desc, &atchan->xfers_list, xfer_node) {
+ list_del(&desc->xfer_node);
+ list_splice_init(&desc->descs_list, &atchan->free_descs_list);
+ }
clear_bit(AT_XDMAC_CHAN_IS_PAUSED, &atchan->status);
clear_bit(AT_XDMAC_CHAN_IS_CYCLIC, &atchan->status);
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From b63e5cb94ad6947ab5fe38b5a9417dcfd0bc6122 Mon Sep 17 00:00:00 2001
From: Tudor Ambarus <tudor.ambarus(a)microchip.com>
Date: Wed, 15 Dec 2021 13:01:08 +0200
Subject: [PATCH] dmaengine: at_xdmac: Fix race for the tx desc callback
The transfer descriptors were wrongly moved to the free descriptors list
before calling the tx desc callback. As the DMA engine drivers drop any
locks before calling the callback function, txd could be taken again,
resulting in its callback being called prematurely. Fix the race for the tx desc
callback by moving the xfer desc into the free desc list after the
callback is invoked.
Fixes: e1f7c9eee707 ("dmaengine: at_xdmac: creation of the atmel eXtended DMA Controller driver")
Signed-off-by: Tudor Ambarus <tudor.ambarus(a)microchip.com>
Link: https://lore.kernel.org/r/20211215110115.191749-6-tudor.ambarus@microchip.c…
Signed-off-by: Vinod Koul <vkoul(a)kernel.org>
diff --git a/drivers/dma/at_xdmac.c b/drivers/dma/at_xdmac.c
index 83c031207530..d5b37459f906 100644
--- a/drivers/dma/at_xdmac.c
+++ b/drivers/dma/at_xdmac.c
@@ -1582,20 +1582,6 @@ at_xdmac_tx_status(struct dma_chan *chan, dma_cookie_t cookie,
return ret;
}
-/* Call must be protected by lock. */
-static void at_xdmac_remove_xfer(struct at_xdmac_chan *atchan,
- struct at_xdmac_desc *desc)
-{
- dev_dbg(chan2dev(&atchan->chan), "%s: desc 0x%p\n", __func__, desc);
-
- /*
- * Remove the transfer from the transfer list then move the transfer
- * descriptors into the free descriptors list.
- */
- list_del(&desc->xfer_node);
- list_splice_init(&desc->descs_list, &atchan->free_descs_list);
-}
-
static void at_xdmac_advance_work(struct at_xdmac_chan *atchan)
{
struct at_xdmac_desc *desc;
@@ -1704,7 +1690,8 @@ static void at_xdmac_tasklet(struct tasklet_struct *t)
txd = &desc->tx_dma_desc;
dma_cookie_complete(txd);
- at_xdmac_remove_xfer(atchan, desc);
+ /* Remove the transfer from the transfer list. */
+ list_del(&desc->xfer_node);
spin_unlock_irq(&atchan->lock);
if (txd->flags & DMA_PREP_INTERRUPT)
@@ -1713,6 +1700,8 @@ static void at_xdmac_tasklet(struct tasklet_struct *t)
dma_run_dependencies(txd);
spin_lock_irq(&atchan->lock);
+ /* Move the xfer descriptors into the free descriptors list. */
+ list_splice_init(&desc->descs_list, &atchan->free_descs_list);
at_xdmac_advance_work(atchan);
spin_unlock_irq(&atchan->lock);
}
@@ -1859,8 +1848,10 @@ static int at_xdmac_device_terminate_all(struct dma_chan *chan)
cpu_relax();
/* Cancel all pending transfers. */
- list_for_each_entry_safe(desc, _desc, &atchan->xfers_list, xfer_node)
- at_xdmac_remove_xfer(atchan, desc);
+ list_for_each_entry_safe(desc, _desc, &atchan->xfers_list, xfer_node) {
+ list_del(&desc->xfer_node);
+ list_splice_init(&desc->descs_list, &atchan->free_descs_list);
+ }
clear_bit(AT_XDMAC_CHAN_IS_PAUSED, &atchan->status);
clear_bit(AT_XDMAC_CHAN_IS_CYCLIC, &atchan->status);
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From b63e5cb94ad6947ab5fe38b5a9417dcfd0bc6122 Mon Sep 17 00:00:00 2001
From: Tudor Ambarus <tudor.ambarus(a)microchip.com>
Date: Wed, 15 Dec 2021 13:01:08 +0200
Subject: [PATCH] dmaengine: at_xdmac: Fix race for the tx desc callback
The transfer descriptors were wrongly moved to the free descriptors list
before calling the tx desc callback. As the DMA engine drivers drop any
locks before calling the callback function, txd could be taken again,
resulting in its callback being called prematurely. Fix the race for the tx desc
callback by moving the xfer desc into the free desc list after the
callback is invoked.
Fixes: e1f7c9eee707 ("dmaengine: at_xdmac: creation of the atmel eXtended DMA Controller driver")
Signed-off-by: Tudor Ambarus <tudor.ambarus(a)microchip.com>
Link: https://lore.kernel.org/r/20211215110115.191749-6-tudor.ambarus@microchip.c…
Signed-off-by: Vinod Koul <vkoul(a)kernel.org>
diff --git a/drivers/dma/at_xdmac.c b/drivers/dma/at_xdmac.c
index 83c031207530..d5b37459f906 100644
--- a/drivers/dma/at_xdmac.c
+++ b/drivers/dma/at_xdmac.c
@@ -1582,20 +1582,6 @@ at_xdmac_tx_status(struct dma_chan *chan, dma_cookie_t cookie,
return ret;
}
-/* Call must be protected by lock. */
-static void at_xdmac_remove_xfer(struct at_xdmac_chan *atchan,
- struct at_xdmac_desc *desc)
-{
- dev_dbg(chan2dev(&atchan->chan), "%s: desc 0x%p\n", __func__, desc);
-
- /*
- * Remove the transfer from the transfer list then move the transfer
- * descriptors into the free descriptors list.
- */
- list_del(&desc->xfer_node);
- list_splice_init(&desc->descs_list, &atchan->free_descs_list);
-}
-
static void at_xdmac_advance_work(struct at_xdmac_chan *atchan)
{
struct at_xdmac_desc *desc;
@@ -1704,7 +1690,8 @@ static void at_xdmac_tasklet(struct tasklet_struct *t)
txd = &desc->tx_dma_desc;
dma_cookie_complete(txd);
- at_xdmac_remove_xfer(atchan, desc);
+ /* Remove the transfer from the transfer list. */
+ list_del(&desc->xfer_node);
spin_unlock_irq(&atchan->lock);
if (txd->flags & DMA_PREP_INTERRUPT)
@@ -1713,6 +1700,8 @@ static void at_xdmac_tasklet(struct tasklet_struct *t)
dma_run_dependencies(txd);
spin_lock_irq(&atchan->lock);
+ /* Move the xfer descriptors into the free descriptors list. */
+ list_splice_init(&desc->descs_list, &atchan->free_descs_list);
at_xdmac_advance_work(atchan);
spin_unlock_irq(&atchan->lock);
}
@@ -1859,8 +1848,10 @@ static int at_xdmac_device_terminate_all(struct dma_chan *chan)
cpu_relax();
/* Cancel all pending transfers. */
- list_for_each_entry_safe(desc, _desc, &atchan->xfers_list, xfer_node)
- at_xdmac_remove_xfer(atchan, desc);
+ list_for_each_entry_safe(desc, _desc, &atchan->xfers_list, xfer_node) {
+ list_del(&desc->xfer_node);
+ list_splice_init(&desc->descs_list, &atchan->free_descs_list);
+ }
clear_bit(AT_XDMAC_CHAN_IS_PAUSED, &atchan->status);
clear_bit(AT_XDMAC_CHAN_IS_CYCLIC, &atchan->status);
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From b63e5cb94ad6947ab5fe38b5a9417dcfd0bc6122 Mon Sep 17 00:00:00 2001
From: Tudor Ambarus <tudor.ambarus(a)microchip.com>
Date: Wed, 15 Dec 2021 13:01:08 +0200
Subject: [PATCH] dmaengine: at_xdmac: Fix race for the tx desc callback
The transfer descriptors were wrongly moved to the free descriptors list
before calling the tx desc callback. As the DMA engine drivers drop any
locks before calling the callback function, txd could be taken again,
resulting in its callback being called prematurely. Fix the race for the tx desc
callback by moving the xfer desc into the free desc list after the
callback is invoked.
Fixes: e1f7c9eee707 ("dmaengine: at_xdmac: creation of the atmel eXtended DMA Controller driver")
Signed-off-by: Tudor Ambarus <tudor.ambarus(a)microchip.com>
Link: https://lore.kernel.org/r/20211215110115.191749-6-tudor.ambarus@microchip.c…
Signed-off-by: Vinod Koul <vkoul(a)kernel.org>
diff --git a/drivers/dma/at_xdmac.c b/drivers/dma/at_xdmac.c
index 83c031207530..d5b37459f906 100644
--- a/drivers/dma/at_xdmac.c
+++ b/drivers/dma/at_xdmac.c
@@ -1582,20 +1582,6 @@ at_xdmac_tx_status(struct dma_chan *chan, dma_cookie_t cookie,
return ret;
}
-/* Call must be protected by lock. */
-static void at_xdmac_remove_xfer(struct at_xdmac_chan *atchan,
- struct at_xdmac_desc *desc)
-{
- dev_dbg(chan2dev(&atchan->chan), "%s: desc 0x%p\n", __func__, desc);
-
- /*
- * Remove the transfer from the transfer list then move the transfer
- * descriptors into the free descriptors list.
- */
- list_del(&desc->xfer_node);
- list_splice_init(&desc->descs_list, &atchan->free_descs_list);
-}
-
static void at_xdmac_advance_work(struct at_xdmac_chan *atchan)
{
struct at_xdmac_desc *desc;
@@ -1704,7 +1690,8 @@ static void at_xdmac_tasklet(struct tasklet_struct *t)
txd = &desc->tx_dma_desc;
dma_cookie_complete(txd);
- at_xdmac_remove_xfer(atchan, desc);
+ /* Remove the transfer from the transfer list. */
+ list_del(&desc->xfer_node);
spin_unlock_irq(&atchan->lock);
if (txd->flags & DMA_PREP_INTERRUPT)
@@ -1713,6 +1700,8 @@ static void at_xdmac_tasklet(struct tasklet_struct *t)
dma_run_dependencies(txd);
spin_lock_irq(&atchan->lock);
+ /* Move the xfer descriptors into the free descriptors list. */
+ list_splice_init(&desc->descs_list, &atchan->free_descs_list);
at_xdmac_advance_work(atchan);
spin_unlock_irq(&atchan->lock);
}
@@ -1859,8 +1848,10 @@ static int at_xdmac_device_terminate_all(struct dma_chan *chan)
cpu_relax();
/* Cancel all pending transfers. */
- list_for_each_entry_safe(desc, _desc, &atchan->xfers_list, xfer_node)
- at_xdmac_remove_xfer(atchan, desc);
+ list_for_each_entry_safe(desc, _desc, &atchan->xfers_list, xfer_node) {
+ list_del(&desc->xfer_node);
+ list_splice_init(&desc->descs_list, &atchan->free_descs_list);
+ }
clear_bit(AT_XDMAC_CHAN_IS_PAUSED, &atchan->status);
clear_bit(AT_XDMAC_CHAN_IS_CYCLIC, &atchan->status);
The patch below does not apply to the 4.9-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From b63e5cb94ad6947ab5fe38b5a9417dcfd0bc6122 Mon Sep 17 00:00:00 2001
From: Tudor Ambarus <tudor.ambarus(a)microchip.com>
Date: Wed, 15 Dec 2021 13:01:08 +0200
Subject: [PATCH] dmaengine: at_xdmac: Fix race for the tx desc callback
The transfer descriptors were wrongly moved to the free descriptors list
before calling the tx desc callback. As the DMA engine drivers drop any
locks before calling the callback function, txd could be taken again,
resulting in its callback being called prematurely. Fix the race for the tx desc
callback by moving the xfer desc into the free desc list after the
callback is invoked.
Fixes: e1f7c9eee707 ("dmaengine: at_xdmac: creation of the atmel eXtended DMA Controller driver")
Signed-off-by: Tudor Ambarus <tudor.ambarus(a)microchip.com>
Link: https://lore.kernel.org/r/20211215110115.191749-6-tudor.ambarus@microchip.c…
Signed-off-by: Vinod Koul <vkoul(a)kernel.org>
diff --git a/drivers/dma/at_xdmac.c b/drivers/dma/at_xdmac.c
index 83c031207530..d5b37459f906 100644
--- a/drivers/dma/at_xdmac.c
+++ b/drivers/dma/at_xdmac.c
@@ -1582,20 +1582,6 @@ at_xdmac_tx_status(struct dma_chan *chan, dma_cookie_t cookie,
return ret;
}
-/* Call must be protected by lock. */
-static void at_xdmac_remove_xfer(struct at_xdmac_chan *atchan,
- struct at_xdmac_desc *desc)
-{
- dev_dbg(chan2dev(&atchan->chan), "%s: desc 0x%p\n", __func__, desc);
-
- /*
- * Remove the transfer from the transfer list then move the transfer
- * descriptors into the free descriptors list.
- */
- list_del(&desc->xfer_node);
- list_splice_init(&desc->descs_list, &atchan->free_descs_list);
-}
-
static void at_xdmac_advance_work(struct at_xdmac_chan *atchan)
{
struct at_xdmac_desc *desc;
@@ -1704,7 +1690,8 @@ static void at_xdmac_tasklet(struct tasklet_struct *t)
txd = &desc->tx_dma_desc;
dma_cookie_complete(txd);
- at_xdmac_remove_xfer(atchan, desc);
+ /* Remove the transfer from the transfer list. */
+ list_del(&desc->xfer_node);
spin_unlock_irq(&atchan->lock);
if (txd->flags & DMA_PREP_INTERRUPT)
@@ -1713,6 +1700,8 @@ static void at_xdmac_tasklet(struct tasklet_struct *t)
dma_run_dependencies(txd);
spin_lock_irq(&atchan->lock);
+ /* Move the xfer descriptors into the free descriptors list. */
+ list_splice_init(&desc->descs_list, &atchan->free_descs_list);
at_xdmac_advance_work(atchan);
spin_unlock_irq(&atchan->lock);
}
@@ -1859,8 +1848,10 @@ static int at_xdmac_device_terminate_all(struct dma_chan *chan)
cpu_relax();
/* Cancel all pending transfers. */
- list_for_each_entry_safe(desc, _desc, &atchan->xfers_list, xfer_node)
- at_xdmac_remove_xfer(atchan, desc);
+ list_for_each_entry_safe(desc, _desc, &atchan->xfers_list, xfer_node) {
+ list_del(&desc->xfer_node);
+ list_splice_init(&desc->descs_list, &atchan->free_descs_list);
+ }
clear_bit(AT_XDMAC_CHAN_IS_PAUSED, &atchan->status);
clear_bit(AT_XDMAC_CHAN_IS_CYCLIC, &atchan->status);
The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From b63e5cb94ad6947ab5fe38b5a9417dcfd0bc6122 Mon Sep 17 00:00:00 2001
From: Tudor Ambarus <tudor.ambarus(a)microchip.com>
Date: Wed, 15 Dec 2021 13:01:08 +0200
Subject: [PATCH] dmaengine: at_xdmac: Fix race for the tx desc callback
The transfer descriptors were wrongly moved to the free descriptors list
before calling the tx desc callback. As the DMA engine drivers drop any
locks before calling the callback function, txd could be taken again,
resulting in its callback being called prematurely. Fix the race for the tx desc
callback by moving the xfer desc into the free desc list after the
callback is invoked.
Fixes: e1f7c9eee707 ("dmaengine: at_xdmac: creation of the atmel eXtended DMA Controller driver")
Signed-off-by: Tudor Ambarus <tudor.ambarus(a)microchip.com>
Link: https://lore.kernel.org/r/20211215110115.191749-6-tudor.ambarus@microchip.c…
Signed-off-by: Vinod Koul <vkoul(a)kernel.org>
diff --git a/drivers/dma/at_xdmac.c b/drivers/dma/at_xdmac.c
index 83c031207530..d5b37459f906 100644
--- a/drivers/dma/at_xdmac.c
+++ b/drivers/dma/at_xdmac.c
@@ -1582,20 +1582,6 @@ at_xdmac_tx_status(struct dma_chan *chan, dma_cookie_t cookie,
return ret;
}
-/* Call must be protected by lock. */
-static void at_xdmac_remove_xfer(struct at_xdmac_chan *atchan,
- struct at_xdmac_desc *desc)
-{
- dev_dbg(chan2dev(&atchan->chan), "%s: desc 0x%p\n", __func__, desc);
-
- /*
- * Remove the transfer from the transfer list then move the transfer
- * descriptors into the free descriptors list.
- */
- list_del(&desc->xfer_node);
- list_splice_init(&desc->descs_list, &atchan->free_descs_list);
-}
-
static void at_xdmac_advance_work(struct at_xdmac_chan *atchan)
{
struct at_xdmac_desc *desc;
@@ -1704,7 +1690,8 @@ static void at_xdmac_tasklet(struct tasklet_struct *t)
txd = &desc->tx_dma_desc;
dma_cookie_complete(txd);
- at_xdmac_remove_xfer(atchan, desc);
+ /* Remove the transfer from the transfer list. */
+ list_del(&desc->xfer_node);
spin_unlock_irq(&atchan->lock);
if (txd->flags & DMA_PREP_INTERRUPT)
@@ -1713,6 +1700,8 @@ static void at_xdmac_tasklet(struct tasklet_struct *t)
dma_run_dependencies(txd);
spin_lock_irq(&atchan->lock);
+ /* Move the xfer descriptors into the free descriptors list. */
+ list_splice_init(&desc->descs_list, &atchan->free_descs_list);
at_xdmac_advance_work(atchan);
spin_unlock_irq(&atchan->lock);
}
@@ -1859,8 +1848,10 @@ static int at_xdmac_device_terminate_all(struct dma_chan *chan)
cpu_relax();
/* Cancel all pending transfers. */
- list_for_each_entry_safe(desc, _desc, &atchan->xfers_list, xfer_node)
- at_xdmac_remove_xfer(atchan, desc);
+ list_for_each_entry_safe(desc, _desc, &atchan->xfers_list, xfer_node) {
+ list_del(&desc->xfer_node);
+ list_splice_init(&desc->descs_list, &atchan->free_descs_list);
+ }
clear_bit(AT_XDMAC_CHAN_IS_PAUSED, &atchan->status);
clear_bit(AT_XDMAC_CHAN_IS_CYCLIC, &atchan->status);
The patch below does not apply to the 4.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From b63e5cb94ad6947ab5fe38b5a9417dcfd0bc6122 Mon Sep 17 00:00:00 2001
From: Tudor Ambarus <tudor.ambarus(a)microchip.com>
Date: Wed, 15 Dec 2021 13:01:08 +0200
Subject: [PATCH] dmaengine: at_xdmac: Fix race for the tx desc callback
The transfer descriptors were wrongly moved to the free descriptors list
before calling the tx desc callback. As the DMA engine drivers drop any
locks before calling the callback function, txd could be taken again,
resulting in its callback being called prematurely. Fix the race for the tx desc
callback by moving the xfer desc into the free desc list after the
callback is invoked.
Fixes: e1f7c9eee707 ("dmaengine: at_xdmac: creation of the atmel eXtended DMA Controller driver")
Signed-off-by: Tudor Ambarus <tudor.ambarus(a)microchip.com>
Link: https://lore.kernel.org/r/20211215110115.191749-6-tudor.ambarus@microchip.c…
Signed-off-by: Vinod Koul <vkoul(a)kernel.org>
diff --git a/drivers/dma/at_xdmac.c b/drivers/dma/at_xdmac.c
index 83c031207530..d5b37459f906 100644
--- a/drivers/dma/at_xdmac.c
+++ b/drivers/dma/at_xdmac.c
@@ -1582,20 +1582,6 @@ at_xdmac_tx_status(struct dma_chan *chan, dma_cookie_t cookie,
return ret;
}
-/* Call must be protected by lock. */
-static void at_xdmac_remove_xfer(struct at_xdmac_chan *atchan,
- struct at_xdmac_desc *desc)
-{
- dev_dbg(chan2dev(&atchan->chan), "%s: desc 0x%p\n", __func__, desc);
-
- /*
- * Remove the transfer from the transfer list then move the transfer
- * descriptors into the free descriptors list.
- */
- list_del(&desc->xfer_node);
- list_splice_init(&desc->descs_list, &atchan->free_descs_list);
-}
-
static void at_xdmac_advance_work(struct at_xdmac_chan *atchan)
{
struct at_xdmac_desc *desc;
@@ -1704,7 +1690,8 @@ static void at_xdmac_tasklet(struct tasklet_struct *t)
txd = &desc->tx_dma_desc;
dma_cookie_complete(txd);
- at_xdmac_remove_xfer(atchan, desc);
+ /* Remove the transfer from the transfer list. */
+ list_del(&desc->xfer_node);
spin_unlock_irq(&atchan->lock);
if (txd->flags & DMA_PREP_INTERRUPT)
@@ -1713,6 +1700,8 @@ static void at_xdmac_tasklet(struct tasklet_struct *t)
dma_run_dependencies(txd);
spin_lock_irq(&atchan->lock);
+ /* Move the xfer descriptors into the free descriptors list. */
+ list_splice_init(&desc->descs_list, &atchan->free_descs_list);
at_xdmac_advance_work(atchan);
spin_unlock_irq(&atchan->lock);
}
@@ -1859,8 +1848,10 @@ static int at_xdmac_device_terminate_all(struct dma_chan *chan)
cpu_relax();
/* Cancel all pending transfers. */
- list_for_each_entry_safe(desc, _desc, &atchan->xfers_list, xfer_node)
- at_xdmac_remove_xfer(atchan, desc);
+ list_for_each_entry_safe(desc, _desc, &atchan->xfers_list, xfer_node) {
+ list_del(&desc->xfer_node);
+ list_splice_init(&desc->descs_list, &atchan->free_descs_list);
+ }
clear_bit(AT_XDMAC_CHAN_IS_PAUSED, &atchan->status);
clear_bit(AT_XDMAC_CHAN_IS_CYCLIC, &atchan->status);
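The window this closes is easiest to see from the client side: completion callbacks run with the channel lock dropped, and a callback may immediately queue new work, which can hand back a descriptor that was spliced onto the free list too early. A hedged client-side sketch of such a callback (the device, buffer and helper names are placeholders, not part of the patch):
static void my_xfer_done(void *param)
{
	struct my_dev *md = param;
	struct dma_async_tx_descriptor *txd;
	/*
	 * Runs from the driver's tasklet with the channel lock released.
	 * The prep call below may pick a descriptor from the channel's
	 * free list - which is why splicing descs there before invoking
	 * this callback raced.
	 */
	txd = dmaengine_prep_slave_single(md->chan, md->next_buf, md->buf_len,
					  DMA_MEM_TO_DEV, DMA_PREP_INTERRUPT);
	if (!txd)
		return;
	txd->callback = my_xfer_done;
	txd->callback_param = md;
	dmaengine_submit(txd);
	dma_async_issue_pending(md->chan);
}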
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 18deddea9184b62941395889ff7659529c877326 Mon Sep 17 00:00:00 2001
From: Tudor Ambarus <tudor.ambarus(a)microchip.com>
Date: Wed, 15 Dec 2021 13:01:10 +0200
Subject: [PATCH] dmaengine: at_xdmac: Fix concurrency over xfers_list
Since tx_submit can be called from a hard IRQ, xfers_list must be
protected with a lock to avoid concurrency on the list's elements.
Since at_xdmac_handle_cyclic() is called from a tasklet, spin_lock_irq
is enough to protect from a hard IRQ.
Fixes: e1f7c9eee707 ("dmaengine: at_xdmac: creation of the atmel eXtended DMA Controller driver")
Signed-off-by: Tudor Ambarus <tudor.ambarus(a)microchip.com>
Link: https://lore.kernel.org/r/20211215110115.191749-8-tudor.ambarus@microchip.c…
Signed-off-by: Vinod Koul <vkoul(a)kernel.org>
diff --git a/drivers/dma/at_xdmac.c b/drivers/dma/at_xdmac.c
index b6547f1b5645..eeb03065d484 100644
--- a/drivers/dma/at_xdmac.c
+++ b/drivers/dma/at_xdmac.c
@@ -1608,14 +1608,17 @@ static void at_xdmac_handle_cyclic(struct at_xdmac_chan *atchan)
struct at_xdmac_desc *desc;
struct dma_async_tx_descriptor *txd;
- if (!list_empty(&atchan->xfers_list)) {
- desc = list_first_entry(&atchan->xfers_list,
- struct at_xdmac_desc, xfer_node);
- txd = &desc->tx_dma_desc;
-
- if (txd->flags & DMA_PREP_INTERRUPT)
- dmaengine_desc_get_callback_invoke(txd, NULL);
+ spin_lock_irq(&atchan->lock);
+ if (list_empty(&atchan->xfers_list)) {
+ spin_unlock_irq(&atchan->lock);
+ return;
}
+ desc = list_first_entry(&atchan->xfers_list, struct at_xdmac_desc,
+ xfer_node);
+ spin_unlock_irq(&atchan->lock);
+ txd = &desc->tx_dma_desc;
+ if (txd->flags & DMA_PREP_INTERRUPT)
+ dmaengine_desc_get_callback_invoke(txd, NULL);
}
static void at_xdmac_handle_error(struct at_xdmac_chan *atchan)
The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 18deddea9184b62941395889ff7659529c877326 Mon Sep 17 00:00:00 2001
From: Tudor Ambarus <tudor.ambarus(a)microchip.com>
Date: Wed, 15 Dec 2021 13:01:10 +0200
Subject: [PATCH] dmaengine: at_xdmac: Fix concurrency over xfers_list
Since tx_submit can be called from a hard IRQ, xfers_list must be
protected with a lock to avoid concurrency on the list's elements.
Since at_xdmac_handle_cyclic() is called from a tasklet, spin_lock_irq
is enough to protect from a hard IRQ.
Fixes: e1f7c9eee707 ("dmaengine: at_xdmac: creation of the atmel eXtended DMA Controller driver")
Signed-off-by: Tudor Ambarus <tudor.ambarus(a)microchip.com>
Link: https://lore.kernel.org/r/20211215110115.191749-8-tudor.ambarus@microchip.c…
Signed-off-by: Vinod Koul <vkoul(a)kernel.org>
diff --git a/drivers/dma/at_xdmac.c b/drivers/dma/at_xdmac.c
index b6547f1b5645..eeb03065d484 100644
--- a/drivers/dma/at_xdmac.c
+++ b/drivers/dma/at_xdmac.c
@@ -1608,14 +1608,17 @@ static void at_xdmac_handle_cyclic(struct at_xdmac_chan *atchan)
struct at_xdmac_desc *desc;
struct dma_async_tx_descriptor *txd;
- if (!list_empty(&atchan->xfers_list)) {
- desc = list_first_entry(&atchan->xfers_list,
- struct at_xdmac_desc, xfer_node);
- txd = &desc->tx_dma_desc;
-
- if (txd->flags & DMA_PREP_INTERRUPT)
- dmaengine_desc_get_callback_invoke(txd, NULL);
+ spin_lock_irq(&atchan->lock);
+ if (list_empty(&atchan->xfers_list)) {
+ spin_unlock_irq(&atchan->lock);
+ return;
}
+ desc = list_first_entry(&atchan->xfers_list, struct at_xdmac_desc,
+ xfer_node);
+ spin_unlock_irq(&atchan->lock);
+ txd = &desc->tx_dma_desc;
+ if (txd->flags & DMA_PREP_INTERRUPT)
+ dmaengine_desc_get_callback_invoke(txd, NULL);
}
static void at_xdmac_handle_error(struct at_xdmac_chan *atchan)
The patch below does not apply to the 4.9-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 18deddea9184b62941395889ff7659529c877326 Mon Sep 17 00:00:00 2001
From: Tudor Ambarus <tudor.ambarus(a)microchip.com>
Date: Wed, 15 Dec 2021 13:01:10 +0200
Subject: [PATCH] dmaengine: at_xdmac: Fix concurrency over xfers_list
Since tx_submit can be called from a hard IRQ, xfers_list must be
protected with a lock to avoid concurrency on the list's elements.
Since at_xdmac_handle_cyclic() is called from a tasklet, spin_lock_irq
is enough to protect from a hard IRQ.
Fixes: e1f7c9eee707 ("dmaengine: at_xdmac: creation of the atmel eXtended DMA Controller driver")
Signed-off-by: Tudor Ambarus <tudor.ambarus(a)microchip.com>
Link: https://lore.kernel.org/r/20211215110115.191749-8-tudor.ambarus@microchip.c…
Signed-off-by: Vinod Koul <vkoul(a)kernel.org>
diff --git a/drivers/dma/at_xdmac.c b/drivers/dma/at_xdmac.c
index b6547f1b5645..eeb03065d484 100644
--- a/drivers/dma/at_xdmac.c
+++ b/drivers/dma/at_xdmac.c
@@ -1608,14 +1608,17 @@ static void at_xdmac_handle_cyclic(struct at_xdmac_chan *atchan)
struct at_xdmac_desc *desc;
struct dma_async_tx_descriptor *txd;
- if (!list_empty(&atchan->xfers_list)) {
- desc = list_first_entry(&atchan->xfers_list,
- struct at_xdmac_desc, xfer_node);
- txd = &desc->tx_dma_desc;
-
- if (txd->flags & DMA_PREP_INTERRUPT)
- dmaengine_desc_get_callback_invoke(txd, NULL);
+ spin_lock_irq(&atchan->lock);
+ if (list_empty(&atchan->xfers_list)) {
+ spin_unlock_irq(&atchan->lock);
+ return;
}
+ desc = list_first_entry(&atchan->xfers_list, struct at_xdmac_desc,
+ xfer_node);
+ spin_unlock_irq(&atchan->lock);
+ txd = &desc->tx_dma_desc;
+ if (txd->flags & DMA_PREP_INTERRUPT)
+ dmaengine_desc_get_callback_invoke(txd, NULL);
}
static void at_xdmac_handle_error(struct at_xdmac_chan *atchan)
The patch below does not apply to the 4.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 18deddea9184b62941395889ff7659529c877326 Mon Sep 17 00:00:00 2001
From: Tudor Ambarus <tudor.ambarus(a)microchip.com>
Date: Wed, 15 Dec 2021 13:01:10 +0200
Subject: [PATCH] dmaengine: at_xdmac: Fix concurrency over xfers_list
Since tx_submit can be called from a hard IRQ, xfers_list must be
protected with a lock to avoid concurrency on the list's elements.
Since at_xdmac_handle_cyclic() is called from a tasklet, spin_lock_irq
is enough to protect from a hard IRQ.
Fixes: e1f7c9eee707 ("dmaengine: at_xdmac: creation of the atmel eXtended DMA Controller driver")
Signed-off-by: Tudor Ambarus <tudor.ambarus(a)microchip.com>
Link: https://lore.kernel.org/r/20211215110115.191749-8-tudor.ambarus@microchip.c…
Signed-off-by: Vinod Koul <vkoul(a)kernel.org>
diff --git a/drivers/dma/at_xdmac.c b/drivers/dma/at_xdmac.c
index b6547f1b5645..eeb03065d484 100644
--- a/drivers/dma/at_xdmac.c
+++ b/drivers/dma/at_xdmac.c
@@ -1608,14 +1608,17 @@ static void at_xdmac_handle_cyclic(struct at_xdmac_chan *atchan)
struct at_xdmac_desc *desc;
struct dma_async_tx_descriptor *txd;
- if (!list_empty(&atchan->xfers_list)) {
- desc = list_first_entry(&atchan->xfers_list,
- struct at_xdmac_desc, xfer_node);
- txd = &desc->tx_dma_desc;
-
- if (txd->flags & DMA_PREP_INTERRUPT)
- dmaengine_desc_get_callback_invoke(txd, NULL);
+ spin_lock_irq(&atchan->lock);
+ if (list_empty(&atchan->xfers_list)) {
+ spin_unlock_irq(&atchan->lock);
+ return;
}
+ desc = list_first_entry(&atchan->xfers_list, struct at_xdmac_desc,
+ xfer_node);
+ spin_unlock_irq(&atchan->lock);
+ txd = &desc->tx_dma_desc;
+ if (txd->flags & DMA_PREP_INTERRUPT)
+ dmaengine_desc_get_callback_invoke(txd, NULL);
}
static void at_xdmac_handle_error(struct at_xdmac_chan *atchan)
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From e6af9b05bec63cd4d1de2a33968cd0be2a91282a Mon Sep 17 00:00:00 2001
From: Tudor Ambarus <tudor.ambarus(a)microchip.com>
Date: Wed, 15 Dec 2021 13:01:05 +0200
Subject: [PATCH] dmaengine: at_xdmac: Start transfer for cyclic channels in
issue_pending
Cyclic channels, too, must call issue_pending in order to start a transfer.
Start the transfer in issue_pending regardless of the type of channel.
This wrongly worked before because, in the past, the transfer was started
at tx_submit level when there was only one desc in the transfer list.
Fixes: e1f7c9eee707 ("dmaengine: at_xdmac: creation of the atmel eXtended DMA Controller driver")
Signed-off-by: Tudor Ambarus <tudor.ambarus(a)microchip.com>
Link: https://lore.kernel.org/r/20211215110115.191749-3-tudor.ambarus@microchip.c…
Signed-off-by: Vinod Koul <vkoul(a)kernel.org>
diff --git a/drivers/dma/at_xdmac.c b/drivers/dma/at_xdmac.c
index 4ff12b083136..c3d3e1270236 100644
--- a/drivers/dma/at_xdmac.c
+++ b/drivers/dma/at_xdmac.c
@@ -1778,11 +1778,9 @@ static void at_xdmac_issue_pending(struct dma_chan *chan)
dev_dbg(chan2dev(&atchan->chan), "%s\n", __func__);
- if (!at_xdmac_chan_is_cyclic(atchan)) {
- spin_lock_irqsave(&atchan->lock, flags);
- at_xdmac_advance_work(atchan);
- spin_unlock_irqrestore(&atchan->lock, flags);
- }
+ spin_lock_irqsave(&atchan->lock, flags);
+ at_xdmac_advance_work(atchan);
+ spin_unlock_irqrestore(&atchan->lock, flags);
return;
}
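The client-side contract behind this change: no transfer, cyclic or otherwise, is expected to start until the client calls dma_async_issue_pending(). A minimal sketch of that sequence for a cyclic transfer (channel request, error handling and the buffer/period values are placeholders):
	struct dma_async_tx_descriptor *txd;
	dma_cookie_t cookie;
	txd = dmaengine_prep_dma_cyclic(chan, buf_dma, buf_len, period_len,
					DMA_DEV_TO_MEM, DMA_PREP_INTERRUPT);
	txd->callback = period_done;        /* called once per period */
	txd->callback_param = ctx;
	cookie = dmaengine_submit(txd);     /* queues the descriptor only */
	dma_async_issue_pending(chan);      /* this is what starts it */
With the transfer now kicked off from issue_pending for every channel type, this sequence behaves the same whether the descriptor is cyclic or not.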
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From e6af9b05bec63cd4d1de2a33968cd0be2a91282a Mon Sep 17 00:00:00 2001
From: Tudor Ambarus <tudor.ambarus(a)microchip.com>
Date: Wed, 15 Dec 2021 13:01:05 +0200
Subject: [PATCH] dmaengine: at_xdmac: Start transfer for cyclic channels in
issue_pending
Cyclic channels, too, must call issue_pending in order to start a transfer.
Start the transfer in issue_pending regardless of the type of channel.
This wrongly worked before because, in the past, the transfer was started
at tx_submit level when there was only one desc in the transfer list.
Fixes: e1f7c9eee707 ("dmaengine: at_xdmac: creation of the atmel eXtended DMA Controller driver")
Signed-off-by: Tudor Ambarus <tudor.ambarus(a)microchip.com>
Link: https://lore.kernel.org/r/20211215110115.191749-3-tudor.ambarus@microchip.c…
Signed-off-by: Vinod Koul <vkoul(a)kernel.org>
diff --git a/drivers/dma/at_xdmac.c b/drivers/dma/at_xdmac.c
index 4ff12b083136..c3d3e1270236 100644
--- a/drivers/dma/at_xdmac.c
+++ b/drivers/dma/at_xdmac.c
@@ -1778,11 +1778,9 @@ static void at_xdmac_issue_pending(struct dma_chan *chan)
dev_dbg(chan2dev(&atchan->chan), "%s\n", __func__);
- if (!at_xdmac_chan_is_cyclic(atchan)) {
- spin_lock_irqsave(&atchan->lock, flags);
- at_xdmac_advance_work(atchan);
- spin_unlock_irqrestore(&atchan->lock, flags);
- }
+ spin_lock_irqsave(&atchan->lock, flags);
+ at_xdmac_advance_work(atchan);
+ spin_unlock_irqrestore(&atchan->lock, flags);
return;
}
The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From e6af9b05bec63cd4d1de2a33968cd0be2a91282a Mon Sep 17 00:00:00 2001
From: Tudor Ambarus <tudor.ambarus(a)microchip.com>
Date: Wed, 15 Dec 2021 13:01:05 +0200
Subject: [PATCH] dmaengine: at_xdmac: Start transfer for cyclic channels in
issue_pending
Cyclic channels, too, must call issue_pending in order to start a transfer.
Start the transfer in issue_pending regardless of the type of channel.
This wrongly worked before because, in the past, the transfer was started
at tx_submit level when there was only one desc in the transfer list.
Fixes: e1f7c9eee707 ("dmaengine: at_xdmac: creation of the atmel eXtended DMA Controller driver")
Signed-off-by: Tudor Ambarus <tudor.ambarus(a)microchip.com>
Link: https://lore.kernel.org/r/20211215110115.191749-3-tudor.ambarus@microchip.c…
Signed-off-by: Vinod Koul <vkoul(a)kernel.org>
diff --git a/drivers/dma/at_xdmac.c b/drivers/dma/at_xdmac.c
index 4ff12b083136..c3d3e1270236 100644
--- a/drivers/dma/at_xdmac.c
+++ b/drivers/dma/at_xdmac.c
@@ -1778,11 +1778,9 @@ static void at_xdmac_issue_pending(struct dma_chan *chan)
dev_dbg(chan2dev(&atchan->chan), "%s\n", __func__);
- if (!at_xdmac_chan_is_cyclic(atchan)) {
- spin_lock_irqsave(&atchan->lock, flags);
- at_xdmac_advance_work(atchan);
- spin_unlock_irqrestore(&atchan->lock, flags);
- }
+ spin_lock_irqsave(&atchan->lock, flags);
+ at_xdmac_advance_work(atchan);
+ spin_unlock_irqrestore(&atchan->lock, flags);
return;
}
The patch below does not apply to the 4.9-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From e6af9b05bec63cd4d1de2a33968cd0be2a91282a Mon Sep 17 00:00:00 2001
From: Tudor Ambarus <tudor.ambarus(a)microchip.com>
Date: Wed, 15 Dec 2021 13:01:05 +0200
Subject: [PATCH] dmaengine: at_xdmac: Start transfer for cyclic channels in
issue_pending
Cyclic channels, too, must call issue_pending in order to start a transfer.
Start the transfer in issue_pending regardless of the type of channel.
This wrongly worked before because, in the past, the transfer was started
at tx_submit level when there was only one desc in the transfer list.
Fixes: e1f7c9eee707 ("dmaengine: at_xdmac: creation of the atmel eXtended DMA Controller driver")
Signed-off-by: Tudor Ambarus <tudor.ambarus(a)microchip.com>
Link: https://lore.kernel.org/r/20211215110115.191749-3-tudor.ambarus@microchip.c…
Signed-off-by: Vinod Koul <vkoul(a)kernel.org>
diff --git a/drivers/dma/at_xdmac.c b/drivers/dma/at_xdmac.c
index 4ff12b083136..c3d3e1270236 100644
--- a/drivers/dma/at_xdmac.c
+++ b/drivers/dma/at_xdmac.c
@@ -1778,11 +1778,9 @@ static void at_xdmac_issue_pending(struct dma_chan *chan)
dev_dbg(chan2dev(&atchan->chan), "%s\n", __func__);
- if (!at_xdmac_chan_is_cyclic(atchan)) {
- spin_lock_irqsave(&atchan->lock, flags);
- at_xdmac_advance_work(atchan);
- spin_unlock_irqrestore(&atchan->lock, flags);
- }
+ spin_lock_irqsave(&atchan->lock, flags);
+ at_xdmac_advance_work(atchan);
+ spin_unlock_irqrestore(&atchan->lock, flags);
return;
}
The patch below does not apply to the 4.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From e6af9b05bec63cd4d1de2a33968cd0be2a91282a Mon Sep 17 00:00:00 2001
From: Tudor Ambarus <tudor.ambarus(a)microchip.com>
Date: Wed, 15 Dec 2021 13:01:05 +0200
Subject: [PATCH] dmaengine: at_xdmac: Start transfer for cyclic channels in
issue_pending
Cyclic channels, too, must call issue_pending in order to start a transfer.
Start the transfer in issue_pending regardless of the type of channel.
This wrongly worked before because, in the past, the transfer was started
at tx_submit level when there was only one desc in the transfer list.
Fixes: e1f7c9eee707 ("dmaengine: at_xdmac: creation of the atmel eXtended DMA Controller driver")
Signed-off-by: Tudor Ambarus <tudor.ambarus(a)microchip.com>
Link: https://lore.kernel.org/r/20211215110115.191749-3-tudor.ambarus@microchip.c…
Signed-off-by: Vinod Koul <vkoul(a)kernel.org>
diff --git a/drivers/dma/at_xdmac.c b/drivers/dma/at_xdmac.c
index 4ff12b083136..c3d3e1270236 100644
--- a/drivers/dma/at_xdmac.c
+++ b/drivers/dma/at_xdmac.c
@@ -1778,11 +1778,9 @@ static void at_xdmac_issue_pending(struct dma_chan *chan)
dev_dbg(chan2dev(&atchan->chan), "%s\n", __func__);
- if (!at_xdmac_chan_is_cyclic(atchan)) {
- spin_lock_irqsave(&atchan->lock, flags);
- at_xdmac_advance_work(atchan);
- spin_unlock_irqrestore(&atchan->lock, flags);
- }
+ spin_lock_irqsave(&atchan->lock, flags);
+ at_xdmac_advance_work(atchan);
+ spin_unlock_irqrestore(&atchan->lock, flags);
return;
}
The patch below does not apply to the 5.16-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 791f3465c4afde02d7f16cf7424ca87070b69396 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence(a)gmail.com>
Date: Fri, 14 Jan 2022 11:59:10 +0000
Subject: [PATCH] io_uring: fix UAF due to missing POLLFREE handling
Fixes a problem described in 50252e4b5e989
("aio: fix use-after-free due to missing POLLFREE handling")
and copies the approach used there.
In short, we have to forcibly eject a poll entry when we meet POLLFREE.
We can't rely on io_poll_get_ownership() as we can't wait for potentially
running tw handlers, so we use the fact that wqs are RCU freed. See
Eric's patch and comments for more details.
Reported-by: Eric Biggers <ebiggers(a)google.com>
Link: https://lore.kernel.org/r/20211209010455.42744-6-ebiggers@kernel.org
Reported-and-tested-by: syzbot+5426c7ed6868c705ca14(a)syzkaller.appspotmail.com
Fixes: 221c5eb233823 ("io_uring: add support for IORING_OP_POLL")
Signed-off-by: Pavel Begunkov <asml.silence(a)gmail.com>
Link: https://lore.kernel.org/r/4ed56b6f548f7ea337603a82315750449412748a.16421612…
[axboe: drop non-functional change from patch]
Signed-off-by: Jens Axboe <axboe(a)kernel.dk>
diff --git a/fs/io_uring.c b/fs/io_uring.c
index fa3277844d2e..422d6de48688 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -5462,12 +5462,14 @@ static void io_init_poll_iocb(struct io_poll_iocb *poll, __poll_t events,
static inline void io_poll_remove_entry(struct io_poll_iocb *poll)
{
- struct wait_queue_head *head = poll->head;
+ struct wait_queue_head *head = smp_load_acquire(&poll->head);
- spin_lock_irq(&head->lock);
- list_del_init(&poll->wait.entry);
- poll->head = NULL;
- spin_unlock_irq(&head->lock);
+ if (head) {
+ spin_lock_irq(&head->lock);
+ list_del_init(&poll->wait.entry);
+ poll->head = NULL;
+ spin_unlock_irq(&head->lock);
+ }
}
static void io_poll_remove_entries(struct io_kiocb *req)
@@ -5475,10 +5477,26 @@ static void io_poll_remove_entries(struct io_kiocb *req)
struct io_poll_iocb *poll = io_poll_get_single(req);
struct io_poll_iocb *poll_double = io_poll_get_double(req);
- if (poll->head)
- io_poll_remove_entry(poll);
- if (poll_double && poll_double->head)
+ /*
+ * While we hold the waitqueue lock and the waitqueue is nonempty,
+ * wake_up_pollfree() will wait for us. However, taking the waitqueue
+ * lock in the first place can race with the waitqueue being freed.
+ *
+ * We solve this as eventpoll does: by taking advantage of the fact that
+ * all users of wake_up_pollfree() will RCU-delay the actual free. If
+ * we enter rcu_read_lock() and see that the pointer to the queue is
+ * non-NULL, we can then lock it without the memory being freed out from
+ * under us.
+ *
+ * Keep holding rcu_read_lock() as long as we hold the queue lock, in
+ * case the caller deletes the entry from the queue, leaving it empty.
+ * In that case, only RCU prevents the queue memory from being freed.
+ */
+ rcu_read_lock();
+ io_poll_remove_entry(poll);
+ if (poll_double)
io_poll_remove_entry(poll_double);
+ rcu_read_unlock();
}
/*
@@ -5618,6 +5636,30 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
wait);
__poll_t mask = key_to_poll(key);
+ if (unlikely(mask & POLLFREE)) {
+ io_poll_mark_cancelled(req);
+ /* we have to kick tw in case it's not already */
+ io_poll_execute(req, 0);
+
+ /*
+ * If the waitqueue is being freed early but someone is already
+ * holds ownership over it, we have to tear down the request as
+ * best we can. That means immediately removing the request from
+ * its waitqueue and preventing all further accesses to the
+ * waitqueue via the request.
+ */
+ list_del_init(&poll->wait.entry);
+
+ /*
+ * Careful: this *must* be the last step, since as soon
+ * as req->head is NULL'ed out, the request can be
+ * completed and freed, since aio_poll_complete_work()
+ * will no longer need to take the waitqueue lock.
+ */
+ smp_store_release(&poll->head, NULL);
+ return 1;
+ }
+
/* for instances that support it check for an event match first */
if (mask && !(mask & poll->events))
return 0;
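Condensed to the bare idiom the message refers to (shared with the aio and eventpoll fixes), the two sides pair an RCU-protected read with a release-store of the head pointer; a rough sketch, not the literal io_uring code:
	/* waker, on POLLFREE: detach, then publish "no queue" last */
	list_del_init(&poll->wait.entry);
	smp_store_release(&poll->head, NULL);
	/* removal path: the waitqueue is RCU-freed, so a non-NULL head
	 * observed under rcu_read_lock() can still be locked safely */
	rcu_read_lock();
	head = smp_load_acquire(&poll->head);
	if (head) {
		spin_lock_irq(&head->lock);
		list_del_init(&poll->wait.entry);
		poll->head = NULL;
		spin_unlock_irq(&head->lock);
	}
	rcu_read_unlock();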
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 791f3465c4afde02d7f16cf7424ca87070b69396 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence(a)gmail.com>
Date: Fri, 14 Jan 2022 11:59:10 +0000
Subject: [PATCH] io_uring: fix UAF due to missing POLLFREE handling
Fixes a problem described in 50252e4b5e989
("aio: fix use-after-free due to missing POLLFREE handling")
and copies the approach used there.
In short, we have to forcibly eject a poll entry when we meet POLLFREE.
We can't rely on io_poll_get_ownership() as we can't wait for potentially
running tw handlers, so we use the fact that wqs are RCU freed. See
Eric's patch and comments for more details.
Reported-by: Eric Biggers <ebiggers(a)google.com>
Link: https://lore.kernel.org/r/20211209010455.42744-6-ebiggers@kernel.org
Reported-and-tested-by: syzbot+5426c7ed6868c705ca14(a)syzkaller.appspotmail.com
Fixes: 221c5eb233823 ("io_uring: add support for IORING_OP_POLL")
Signed-off-by: Pavel Begunkov <asml.silence(a)gmail.com>
Link: https://lore.kernel.org/r/4ed56b6f548f7ea337603a82315750449412748a.16421612…
[axboe: drop non-functional change from patch]
Signed-off-by: Jens Axboe <axboe(a)kernel.dk>
diff --git a/fs/io_uring.c b/fs/io_uring.c
index fa3277844d2e..422d6de48688 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -5462,12 +5462,14 @@ static void io_init_poll_iocb(struct io_poll_iocb *poll, __poll_t events,
static inline void io_poll_remove_entry(struct io_poll_iocb *poll)
{
- struct wait_queue_head *head = poll->head;
+ struct wait_queue_head *head = smp_load_acquire(&poll->head);
- spin_lock_irq(&head->lock);
- list_del_init(&poll->wait.entry);
- poll->head = NULL;
- spin_unlock_irq(&head->lock);
+ if (head) {
+ spin_lock_irq(&head->lock);
+ list_del_init(&poll->wait.entry);
+ poll->head = NULL;
+ spin_unlock_irq(&head->lock);
+ }
}
static void io_poll_remove_entries(struct io_kiocb *req)
@@ -5475,10 +5477,26 @@ static void io_poll_remove_entries(struct io_kiocb *req)
struct io_poll_iocb *poll = io_poll_get_single(req);
struct io_poll_iocb *poll_double = io_poll_get_double(req);
- if (poll->head)
- io_poll_remove_entry(poll);
- if (poll_double && poll_double->head)
+ /*
+ * While we hold the waitqueue lock and the waitqueue is nonempty,
+ * wake_up_pollfree() will wait for us. However, taking the waitqueue
+ * lock in the first place can race with the waitqueue being freed.
+ *
+ * We solve this as eventpoll does: by taking advantage of the fact that
+ * all users of wake_up_pollfree() will RCU-delay the actual free. If
+ * we enter rcu_read_lock() and see that the pointer to the queue is
+ * non-NULL, we can then lock it without the memory being freed out from
+ * under us.
+ *
+ * Keep holding rcu_read_lock() as long as we hold the queue lock, in
+ * case the caller deletes the entry from the queue, leaving it empty.
+ * In that case, only RCU prevents the queue memory from being freed.
+ */
+ rcu_read_lock();
+ io_poll_remove_entry(poll);
+ if (poll_double)
io_poll_remove_entry(poll_double);
+ rcu_read_unlock();
}
/*
@@ -5618,6 +5636,30 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
wait);
__poll_t mask = key_to_poll(key);
+ if (unlikely(mask & POLLFREE)) {
+ io_poll_mark_cancelled(req);
+ /* we have to kick tw in case it's not already */
+ io_poll_execute(req, 0);
+
+ /*
+ * If the waitqueue is being freed early but someone is already
+ * holds ownership over it, we have to tear down the request as
+ * best we can. That means immediately removing the request from
+ * its waitqueue and preventing all further accesses to the
+ * waitqueue via the request.
+ */
+ list_del_init(&poll->wait.entry);
+
+ /*
+ * Careful: this *must* be the last step, since as soon
+ * as req->head is NULL'ed out, the request can be
+ * completed and freed, since aio_poll_complete_work()
+ * will no longer need to take the waitqueue lock.
+ */
+ smp_store_release(&poll->head, NULL);
+ return 1;
+ }
+
/* for instances that support it check for an event match first */
if (mask && !(mask & poll->events))
return 0;
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 791f3465c4afde02d7f16cf7424ca87070b69396 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence(a)gmail.com>
Date: Fri, 14 Jan 2022 11:59:10 +0000
Subject: [PATCH] io_uring: fix UAF due to missing POLLFREE handling
Fixes a problem described in 50252e4b5e989
("aio: fix use-after-free due to missing POLLFREE handling")
and copies the approach used there.
In short, we have to forcibly eject a poll entry when we meet POLLFREE.
We can't rely on io_poll_get_ownership() as we can't wait for potentially
running tw handlers, so we use the fact that wqs are RCU freed. See
Eric's patch and comments for more details.
Reported-by: Eric Biggers <ebiggers(a)google.com>
Link: https://lore.kernel.org/r/20211209010455.42744-6-ebiggers@kernel.org
Reported-and-tested-by: syzbot+5426c7ed6868c705ca14(a)syzkaller.appspotmail.com
Fixes: 221c5eb233823 ("io_uring: add support for IORING_OP_POLL")
Signed-off-by: Pavel Begunkov <asml.silence(a)gmail.com>
Link: https://lore.kernel.org/r/4ed56b6f548f7ea337603a82315750449412748a.16421612…
[axboe: drop non-functional change from patch]
Signed-off-by: Jens Axboe <axboe(a)kernel.dk>
diff --git a/fs/io_uring.c b/fs/io_uring.c
index fa3277844d2e..422d6de48688 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -5462,12 +5462,14 @@ static void io_init_poll_iocb(struct io_poll_iocb *poll, __poll_t events,
static inline void io_poll_remove_entry(struct io_poll_iocb *poll)
{
- struct wait_queue_head *head = poll->head;
+ struct wait_queue_head *head = smp_load_acquire(&poll->head);
- spin_lock_irq(&head->lock);
- list_del_init(&poll->wait.entry);
- poll->head = NULL;
- spin_unlock_irq(&head->lock);
+ if (head) {
+ spin_lock_irq(&head->lock);
+ list_del_init(&poll->wait.entry);
+ poll->head = NULL;
+ spin_unlock_irq(&head->lock);
+ }
}
static void io_poll_remove_entries(struct io_kiocb *req)
@@ -5475,10 +5477,26 @@ static void io_poll_remove_entries(struct io_kiocb *req)
struct io_poll_iocb *poll = io_poll_get_single(req);
struct io_poll_iocb *poll_double = io_poll_get_double(req);
- if (poll->head)
- io_poll_remove_entry(poll);
- if (poll_double && poll_double->head)
+ /*
+ * While we hold the waitqueue lock and the waitqueue is nonempty,
+ * wake_up_pollfree() will wait for us. However, taking the waitqueue
+ * lock in the first place can race with the waitqueue being freed.
+ *
+ * We solve this as eventpoll does: by taking advantage of the fact that
+ * all users of wake_up_pollfree() will RCU-delay the actual free. If
+ * we enter rcu_read_lock() and see that the pointer to the queue is
+ * non-NULL, we can then lock it without the memory being freed out from
+ * under us.
+ *
+ * Keep holding rcu_read_lock() as long as we hold the queue lock, in
+ * case the caller deletes the entry from the queue, leaving it empty.
+ * In that case, only RCU prevents the queue memory from being freed.
+ */
+ rcu_read_lock();
+ io_poll_remove_entry(poll);
+ if (poll_double)
io_poll_remove_entry(poll_double);
+ rcu_read_unlock();
}
/*
@@ -5618,6 +5636,30 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
wait);
__poll_t mask = key_to_poll(key);
+ if (unlikely(mask & POLLFREE)) {
+ io_poll_mark_cancelled(req);
+ /* we have to kick tw in case it's not already */
+ io_poll_execute(req, 0);
+
+ /*
+ * If the waitqueue is being freed early but someone is already
+ * holds ownership over it, we have to tear down the request as
+ * best we can. That means immediately removing the request from
+ * its waitqueue and preventing all further accesses to the
+ * waitqueue via the request.
+ */
+ list_del_init(&poll->wait.entry);
+
+ /*
+ * Careful: this *must* be the last step, since as soon
+ * as req->head is NULL'ed out, the request can be
+ * completed and freed, since aio_poll_complete_work()
+ * will no longer need to take the waitqueue lock.
+ */
+ smp_store_release(&poll->head, NULL);
+ return 1;
+ }
+
/* for instances that support it check for an event match first */
if (mask && !(mask & poll->events))
return 0;
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 791f3465c4afde02d7f16cf7424ca87070b69396 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence(a)gmail.com>
Date: Fri, 14 Jan 2022 11:59:10 +0000
Subject: [PATCH] io_uring: fix UAF due to missing POLLFREE handling
Fixes a problem described in 50252e4b5e989
("aio: fix use-after-free due to missing POLLFREE handling")
and copies the approach used there.
In short, we have to forcibly eject a poll entry when we meet POLLFREE.
We can't rely on io_poll_get_ownership() as we can't wait for potentially
running tw handlers, so we use the fact that wqs are RCU freed. See
Eric's patch and comments for more details.
Reported-by: Eric Biggers <ebiggers(a)google.com>
Link: https://lore.kernel.org/r/20211209010455.42744-6-ebiggers@kernel.org
Reported-and-tested-by: syzbot+5426c7ed6868c705ca14(a)syzkaller.appspotmail.com
Fixes: 221c5eb233823 ("io_uring: add support for IORING_OP_POLL")
Signed-off-by: Pavel Begunkov <asml.silence(a)gmail.com>
Link: https://lore.kernel.org/r/4ed56b6f548f7ea337603a82315750449412748a.16421612…
[axboe: drop non-functional change from patch]
Signed-off-by: Jens Axboe <axboe(a)kernel.dk>
diff --git a/fs/io_uring.c b/fs/io_uring.c
index fa3277844d2e..422d6de48688 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -5462,12 +5462,14 @@ static void io_init_poll_iocb(struct io_poll_iocb *poll, __poll_t events,
static inline void io_poll_remove_entry(struct io_poll_iocb *poll)
{
- struct wait_queue_head *head = poll->head;
+ struct wait_queue_head *head = smp_load_acquire(&poll->head);
- spin_lock_irq(&head->lock);
- list_del_init(&poll->wait.entry);
- poll->head = NULL;
- spin_unlock_irq(&head->lock);
+ if (head) {
+ spin_lock_irq(&head->lock);
+ list_del_init(&poll->wait.entry);
+ poll->head = NULL;
+ spin_unlock_irq(&head->lock);
+ }
}
static void io_poll_remove_entries(struct io_kiocb *req)
@@ -5475,10 +5477,26 @@ static void io_poll_remove_entries(struct io_kiocb *req)
struct io_poll_iocb *poll = io_poll_get_single(req);
struct io_poll_iocb *poll_double = io_poll_get_double(req);
- if (poll->head)
- io_poll_remove_entry(poll);
- if (poll_double && poll_double->head)
+ /*
+ * While we hold the waitqueue lock and the waitqueue is nonempty,
+ * wake_up_pollfree() will wait for us. However, taking the waitqueue
+ * lock in the first place can race with the waitqueue being freed.
+ *
+ * We solve this as eventpoll does: by taking advantage of the fact that
+ * all users of wake_up_pollfree() will RCU-delay the actual free. If
+ * we enter rcu_read_lock() and see that the pointer to the queue is
+ * non-NULL, we can then lock it without the memory being freed out from
+ * under us.
+ *
+ * Keep holding rcu_read_lock() as long as we hold the queue lock, in
+ * case the caller deletes the entry from the queue, leaving it empty.
+ * In that case, only RCU prevents the queue memory from being freed.
+ */
+ rcu_read_lock();
+ io_poll_remove_entry(poll);
+ if (poll_double)
io_poll_remove_entry(poll_double);
+ rcu_read_unlock();
}
/*
@@ -5618,6 +5636,30 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
wait);
__poll_t mask = key_to_poll(key);
+ if (unlikely(mask & POLLFREE)) {
+ io_poll_mark_cancelled(req);
+ /* we have to kick tw in case it's not already */
+ io_poll_execute(req, 0);
+
+ /*
+ * If the waitqueue is being freed early but someone is already
+ * holds ownership over it, we have to tear down the request as
+ * best we can. That means immediately removing the request from
+ * its waitqueue and preventing all further accesses to the
+ * waitqueue via the request.
+ */
+ list_del_init(&poll->wait.entry);
+
+ /*
+ * Careful: this *must* be the last step, since as soon
+ * as req->head is NULL'ed out, the request can be
+ * completed and freed, since aio_poll_complete_work()
+ * will no longer need to take the waitqueue lock.
+ */
+ smp_store_release(&poll->head, NULL);
+ return 1;
+ }
+
/* for instances that support it check for an event match first */
if (mask && !(mask & poll->events))
return 0;
The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 23e7b1bfed61e301853b5e35472820d919498278 Mon Sep 17 00:00:00 2001
From: Guillaume Nault <gnault(a)redhat.com>
Date: Mon, 10 Jan 2022 14:43:06 +0100
Subject: [PATCH] xfrm: Don't accidentally set RTO_ONLINK in decode_session4()
Similar to commit 94e2238969e8 ("xfrm4: strip ECN bits from tos field"),
clear the ECN bits from iph->tos when setting ->flowi4_tos.
This ensures that the last bit of ->flowi4_tos is cleared, so
ip_route_output_key_hash() isn't going to restrict the scope of the
route lookup.
Use ~INET_ECN_MASK instead of IPTOS_RT_MASK, because we have no reason
to clear the high order bits.
Found by code inspection, compile tested only.
Fixes: 4da3089f2b58 ("[IPSEC]: Use TOS when doing tunnel lookups")
Signed-off-by: Guillaume Nault <gnault(a)redhat.com>
Signed-off-by: Jakub Kicinski <kuba(a)kernel.org>
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index dccb8f3318ef..04d1ce9b510f 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -31,6 +31,7 @@
#include <linux/if_tunnel.h>
#include <net/dst.h>
#include <net/flow.h>
+#include <net/inet_ecn.h>
#include <net/xfrm.h>
#include <net/ip.h>
#include <net/gre.h>
@@ -3295,7 +3296,7 @@ decode_session4(struct sk_buff *skb, struct flowi *fl, bool reverse)
fl4->flowi4_proto = iph->protocol;
fl4->daddr = reverse ? iph->saddr : iph->daddr;
fl4->saddr = reverse ? iph->daddr : iph->saddr;
- fl4->flowi4_tos = iph->tos;
+ fl4->flowi4_tos = iph->tos & ~INET_ECN_MASK;
if (!ip_is_fragment(iph)) {
switch (iph->protocol) {
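To make the masking concrete, a small user-space sketch (it assumes INET_ECN_MASK is 0x3, the two low-order ECN bits, matching include/net/inet_ecn.h): clearing only those bits leaves the DSCP part of tos intact while guaranteeing the low-order bit, the one the commit message says must not leak into the route scope decision, is zero.
	#include <stdio.h>
	#include <stdint.h>
	#define INET_ECN_MASK 0x3	/* assumed value, see lead-in */
	int main(void)
	{
		uint8_t tos = 0xb9;	/* example: DSCP 46 (EF) with ECN bits 01 */
		printf("tos=0x%02x masked=0x%02x\n", tos,
		       (unsigned int)(tos & ~INET_ECN_MASK));
		return 0;
	}
This prints tos=0xb9 masked=0xb8, i.e. the DSCP bits survive and only the ECN bits are dropped.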
The patch below does not apply to the 4.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 23e7b1bfed61e301853b5e35472820d919498278 Mon Sep 17 00:00:00 2001
From: Guillaume Nault <gnault(a)redhat.com>
Date: Mon, 10 Jan 2022 14:43:06 +0100
Subject: [PATCH] xfrm: Don't accidentally set RTO_ONLINK in decode_session4()
Similar to commit 94e2238969e8 ("xfrm4: strip ECN bits from tos field"),
clear the ECN bits from iph->tos when setting ->flowi4_tos.
This ensures that the last bit of ->flowi4_tos is cleared, so
ip_route_output_key_hash() isn't going to restrict the scope of the
route lookup.
Use ~INET_ECN_MASK instead of IPTOS_RT_MASK, because we have no reason
to clear the high order bits.
Found by code inspection, compile tested only.
Fixes: 4da3089f2b58 ("[IPSEC]: Use TOS when doing tunnel lookups")
Signed-off-by: Guillaume Nault <gnault(a)redhat.com>
Signed-off-by: Jakub Kicinski <kuba(a)kernel.org>
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index dccb8f3318ef..04d1ce9b510f 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -31,6 +31,7 @@
#include <linux/if_tunnel.h>
#include <net/dst.h>
#include <net/flow.h>
+#include <net/inet_ecn.h>
#include <net/xfrm.h>
#include <net/ip.h>
#include <net/gre.h>
@@ -3295,7 +3296,7 @@ decode_session4(struct sk_buff *skb, struct flowi *fl, bool reverse)
fl4->flowi4_proto = iph->protocol;
fl4->daddr = reverse ? iph->saddr : iph->daddr;
fl4->saddr = reverse ? iph->daddr : iph->saddr;
- fl4->flowi4_tos = iph->tos;
+ fl4->flowi4_tos = iph->tos & ~INET_ECN_MASK;
if (!ip_is_fragment(iph)) {
switch (iph->protocol) {
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 58589a75bba96f43b62d8069b35be081bc00d7c3 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko(a)linux.intel.com>
Date: Thu, 25 Nov 2021 23:27:29 +0200
Subject: [PATCH] misc: at25: Check proper value of chip length in FRAM case
Obviously the byte_len value should be checked from the chip
and not from at25->chip.
Fixes: fd307a4ad332 ("nvmem: prepare basics for FRAM support")
Acked-by: Arnd Bergmann <arnd(a)arndb.de>
Signed-off-by: Andy Shevchenko <andriy.shevchenko(a)linux.intel.com>
Link: https://lore.kernel.org/r/20211125212729.86585-4-andriy.shevchenko@linux.in…
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
diff --git a/drivers/misc/eeprom/at25.c b/drivers/misc/eeprom/at25.c
index f0b0efc30ee6..e21216541b0f 100644
--- a/drivers/misc/eeprom/at25.c
+++ b/drivers/misc/eeprom/at25.c
@@ -433,9 +433,9 @@ static int at25_probe(struct spi_device *spi)
dev_err(&spi->dev, "Error: unsupported size (id %02x)\n", id[7]);
return -ENODEV;
}
- chip.byte_len = int_pow(2, id[7] - 0x21 + 4) * 1024;
- if (at25->chip.byte_len > 64 * 1024)
+ chip.byte_len = int_pow(2, id[7] - 0x21 + 4) * 1024;
+ if (chip.byte_len > 64 * 1024)
at25->chip.flags |= EE_ADDR3;
else
at25->chip.flags |= EE_ADDR2;
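The point of the one-character change is that the freshly computed chip.byte_len, not the stale at25->chip value, must decide between 2- and 3-byte addressing. A small user-space sketch of the size calculation from the quoted hunk (the id[7] codes below are only illustrative samples):
	#include <stdio.h>
	int main(void)
	{
		unsigned char codes[] = { 0x21, 0x23, 0x25, 0x28 };
		for (unsigned int i = 0; i < sizeof(codes); i++) {
			unsigned long byte_len = (1UL << (codes[i] - 0x21 + 4)) * 1024;
			printf("id[7]=0x%02x -> %lu bytes -> %s\n", codes[i], byte_len,
			       byte_len > 64 * 1024 ? "EE_ADDR3" : "EE_ADDR2");
		}
		return 0;
	}
With the old check consulting at25->chip instead of the value just computed, large FRAM parts could end up with the wrong address width.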
The patch below does not apply to the 5.16-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 58589a75bba96f43b62d8069b35be081bc00d7c3 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko(a)linux.intel.com>
Date: Thu, 25 Nov 2021 23:27:29 +0200
Subject: [PATCH] misc: at25: Check proper value of chip length in FRAM case
Obviously the byte_len value should be checked from the chip
and not from at25->chip.
Fixes: fd307a4ad332 ("nvmem: prepare basics for FRAM support")
Acked-by: Arnd Bergmann <arnd(a)arndb.de>
Signed-off-by: Andy Shevchenko <andriy.shevchenko(a)linux.intel.com>
Link: https://lore.kernel.org/r/20211125212729.86585-4-andriy.shevchenko@linux.in…
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
diff --git a/drivers/misc/eeprom/at25.c b/drivers/misc/eeprom/at25.c
index f0b0efc30ee6..e21216541b0f 100644
--- a/drivers/misc/eeprom/at25.c
+++ b/drivers/misc/eeprom/at25.c
@@ -433,9 +433,9 @@ static int at25_probe(struct spi_device *spi)
dev_err(&spi->dev, "Error: unsupported size (id %02x)\n", id[7]);
return -ENODEV;
}
- chip.byte_len = int_pow(2, id[7] - 0x21 + 4) * 1024;
- if (at25->chip.byte_len > 64 * 1024)
+ chip.byte_len = int_pow(2, id[7] - 0x21 + 4) * 1024;
+ if (chip.byte_len > 64 * 1024)
at25->chip.flags |= EE_ADDR3;
else
at25->chip.flags |= EE_ADDR2;
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 1b5a42d9c85f0e731f01c8d1129001fd8531a8a0 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm(a)xmission.com>
Date: Mon, 3 Jan 2022 11:32:36 -0600
Subject: [PATCH] taskstats: Cleanup the use of task->exit_code
In the function bacct_add_tsk() the code reading task->exit_code was
introduced in commit f3cef7a99469 ("[PATCH] csa: basic accounting over
taskstats"), and it is not entirely clear what the taskstats interface
is trying to return as only returning the exit_code of the first task
in a process doesn't make a lot of sense.
As best as I can figure the intent is to return task->exit_code after
a task exits. The field is returned with per task fields, so the
exit_code of the entire process is not wanted. Only the value of the
first task is returned so this is not a useful way to get the per task
ptrace stop code. The ordinary case of returning this value is
returning after a task exits, which also precludes use for getting
a ptrace value.
It is common for the first task of a process to also be the last
task of the process, so this field may have done something reasonable by
accident in testing.
Make ac_exitcode a reliable per task value by always returning it for
every exited task.
Setting ac_exitcode in a sensible manner makes it possible to continue
to provide this value going forward.
Cc: Balbir Singh <bsingharora(a)gmail.com>
Fixes: f3cef7a99469 ("[PATCH] csa: basic accounting over taskstats")
Link: https://lkml.kernel.org/r/20220103213312.9144-5-ebiederm@xmission.com
Signed-off-by: "Eric W. Biederman" <ebiederm(a)xmission.com>
diff --git a/kernel/tsacct.c b/kernel/tsacct.c
index f00de83d0246..1d261fbe367b 100644
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -38,11 +38,10 @@ void bacct_add_tsk(struct user_namespace *user_ns,
stats->ac_btime = clamp_t(time64_t, btime, 0, U32_MAX);
stats->ac_btime64 = btime;
- if (thread_group_leader(tsk)) {
+ if (tsk->flags & PF_EXITING)
stats->ac_exitcode = tsk->exit_code;
- if (tsk->flags & PF_FORKNOEXEC)
- stats->ac_flag |= AFORK;
- }
+ if (thread_group_leader(tsk) && (tsk->flags & PF_FORKNOEXEC))
+ stats->ac_flag |= AFORK;
if (tsk->flags & PF_SUPERPRIV)
stats->ac_flag |= ASU;
if (tsk->flags & PF_DUMPCORE)
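For a consumer of the taskstats interface the practical effect is that ac_exitcode is now filled in for every exiting task, not just the group leader. Assuming ac_exitcode keeps the usual wait(2) status encoding of task->exit_code, a reader could decode it roughly like this (the sample value is made up):
	#include <stdio.h>
	#include <sys/wait.h>
	int main(void)
	{
		int ac_exitcode = 0x0100;	/* hypothetical: normal exit, status 1 */
		if (WIFEXITED(ac_exitcode))
			printf("exited, status %d\n", WEXITSTATUS(ac_exitcode));
		else if (WIFSIGNALED(ac_exitcode))
			printf("killed by signal %d\n", WTERMSIG(ac_exitcode));
		return 0;
	}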
The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 1b5a42d9c85f0e731f01c8d1129001fd8531a8a0 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm(a)xmission.com>
Date: Mon, 3 Jan 2022 11:32:36 -0600
Subject: [PATCH] taskstats: Cleanup the use of task->exit_code
In the function bacct_add_tsk() the code reading task->exit_code was
introduced in commit f3cef7a99469 ("[PATCH] csa: basic accounting over
taskstats"), and it is not entirely clear what the taskstats interface
is trying to return as only returning the exit_code of the first task
in a process doesn't make a lot of sense.
As best as I can figure the intent is to return task->exit_code after
a task exits. The field is returned with per task fields, so the
exit_code of the entire process is not wanted. Only the value of the
first task is returned so this is not a useful way to get the per task
ptrace stop code. The ordinary case of returning this value is
returning after a task exits, which also precludes use for getting
a ptrace value.
It is common for the first task of a process to also be the last
task of the process, so this field may have done something reasonable by
accident in testing.
Make ac_exitcode a reliable per task value by always returning it for
every exited task.
Setting ac_exitcode in a sensible manner makes it possible to continue
to provide this value going forward.
Cc: Balbir Singh <bsingharora(a)gmail.com>
Fixes: f3cef7a99469 ("[PATCH] csa: basic accounting over taskstats")
Link: https://lkml.kernel.org/r/20220103213312.9144-5-ebiederm@xmission.com
Signed-off-by: "Eric W. Biederman" <ebiederm(a)xmission.com>
diff --git a/kernel/tsacct.c b/kernel/tsacct.c
index f00de83d0246..1d261fbe367b 100644
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -38,11 +38,10 @@ void bacct_add_tsk(struct user_namespace *user_ns,
stats->ac_btime = clamp_t(time64_t, btime, 0, U32_MAX);
stats->ac_btime64 = btime;
- if (thread_group_leader(tsk)) {
+ if (tsk->flags & PF_EXITING)
stats->ac_exitcode = tsk->exit_code;
- if (tsk->flags & PF_FORKNOEXEC)
- stats->ac_flag |= AFORK;
- }
+ if (thread_group_leader(tsk) && (tsk->flags & PF_FORKNOEXEC))
+ stats->ac_flag |= AFORK;
if (tsk->flags & PF_SUPERPRIV)
stats->ac_flag |= ASU;
if (tsk->flags & PF_DUMPCORE)
The patch below does not apply to the 4.9-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 1b5a42d9c85f0e731f01c8d1129001fd8531a8a0 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm(a)xmission.com>
Date: Mon, 3 Jan 2022 11:32:36 -0600
Subject: [PATCH] taskstats: Cleanup the use of task->exit_code
In the function bacct_add_tsk() the code reading task->exit_code was
introduced in commit f3cef7a99469 ("[PATCH] csa: basic accounting over
taskstats"), and it is not entirely clear what the taskstats interface
is trying to return as only returning the exit_code of the first task
in a process doesn't make a lot of sense.
As best as I can figure the intent is to return task->exit_code after
a task exits. The field is returned with per task fields, so the
exit_code of the entire process is not wanted. Only the value of the
first task is returned so this is not a useful way to get the per task
ptrace stop code. The ordinary case of returning this value is
returning after a task exits, which also precludes use for getting
a ptrace value.
It is common for the first task of a process to also be the last
task of the process, so this field may have done something reasonable by
accident in testing.
Make ac_exitcode a reliable per task value by always returning it for
every exited task.
Setting ac_exitcode in a sensible manner makes it possible to continue
to provide this value going forward.
Cc: Balbir Singh <bsingharora(a)gmail.com>
Fixes: f3cef7a99469 ("[PATCH] csa: basic accounting over taskstats")
Link: https://lkml.kernel.org/r/20220103213312.9144-5-ebiederm@xmission.com
Signed-off-by: "Eric W. Biederman" <ebiederm(a)xmission.com>
diff --git a/kernel/tsacct.c b/kernel/tsacct.c
index f00de83d0246..1d261fbe367b 100644
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -38,11 +38,10 @@ void bacct_add_tsk(struct user_namespace *user_ns,
stats->ac_btime = clamp_t(time64_t, btime, 0, U32_MAX);
stats->ac_btime64 = btime;
- if (thread_group_leader(tsk)) {
+ if (tsk->flags & PF_EXITING)
stats->ac_exitcode = tsk->exit_code;
- if (tsk->flags & PF_FORKNOEXEC)
- stats->ac_flag |= AFORK;
- }
+ if (thread_group_leader(tsk) && (tsk->flags & PF_FORKNOEXEC))
+ stats->ac_flag |= AFORK;
if (tsk->flags & PF_SUPERPRIV)
stats->ac_flag |= ASU;
if (tsk->flags & PF_DUMPCORE)
The patch below does not apply to the 4.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 1b5a42d9c85f0e731f01c8d1129001fd8531a8a0 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm(a)xmission.com>
Date: Mon, 3 Jan 2022 11:32:36 -0600
Subject: [PATCH] taskstats: Cleanup the use of task->exit_code
In the function bacct_add_tsk() the code reading task->exit_code was
introduced in commit f3cef7a99469 ("[PATCH] csa: basic accounting over
taskstats"), and it is not entirely clear what the taskstats interface
is trying to return as only returning the exit_code of the first task
in a process doesn't make a lot of sense.
As best as I can figure the intent is to return task->exit_code after
a task exits. The field is returned with per task fields, so the
exit_code of the entire process is not wanted. Only the value of the
first task is returned so this is not a useful way to get the per task
ptrace stop code. The ordinary case of returning this value is
returning after a task exits, which also precludes use for getting
a ptrace value.
It is common for the first task of a process to also be the last
task of the process, so this field may have done something reasonable by
accident in testing.
Make ac_exitcode a reliable per task value by always returning it for
every exited task.
Setting ac_exitcode in a sensible manner makes it possible to continue
to provide this value going forward.
Cc: Balbir Singh <bsingharora(a)gmail.com>
Fixes: f3cef7a99469 ("[PATCH] csa: basic accounting over taskstats")
Link: https://lkml.kernel.org/r/20220103213312.9144-5-ebiederm@xmission.com
Signed-off-by: "Eric W. Biederman" <ebiederm(a)xmission.com>
diff --git a/kernel/tsacct.c b/kernel/tsacct.c
index f00de83d0246..1d261fbe367b 100644
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -38,11 +38,10 @@ void bacct_add_tsk(struct user_namespace *user_ns,
stats->ac_btime = clamp_t(time64_t, btime, 0, U32_MAX);
stats->ac_btime64 = btime;
- if (thread_group_leader(tsk)) {
+ if (tsk->flags & PF_EXITING)
stats->ac_exitcode = tsk->exit_code;
- if (tsk->flags & PF_FORKNOEXEC)
- stats->ac_flag |= AFORK;
- }
+ if (thread_group_leader(tsk) && (tsk->flags & PF_FORKNOEXEC))
+ stats->ac_flag |= AFORK;
if (tsk->flags & PF_SUPERPRIV)
stats->ac_flag |= ASU;
if (tsk->flags & PF_DUMPCORE)
The patch below does not apply to the 5.16-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From ff164ae39b82ee483b24579c8e22a13a8ce5bd04 Mon Sep 17 00:00:00 2001
From: Riwen Lu <luriwen(a)kylinos.cn>
Date: Thu, 6 Jan 2022 16:46:09 +0800
Subject: [PATCH] rtc: cmos: Evaluate century appropriate
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
The year is currently limited to 2069: when setting the RTC year to 2070,
reading it back returns 1970. Evaluate the century starting from 19 to
compute the correct year.
$ sudo date -s 20700106
Mon 06 Jan 2070 12:00:00 AM CST
$ sudo hwclock -w
$ sudo hwclock -r
1970-01-06 12:00:49.604968+08:00
Fixes: 2a4daadd4d3e5071 ("rtc: cmos: ignore bogus century byte")
Signed-off-by: Riwen Lu <luriwen(a)kylinos.cn>
Acked-by: Eric Wong <e(a)80x24.org>
Reviewed-by: Mateusz Jończyk <mat.jonczyk(a)o2.pl>
Signed-off-by: Alexandre Belloni <alexandre.belloni(a)bootlin.com>
Link: https://lore.kernel.org/r/20220106084609.1223688-1-luriwen@kylinos.cn
diff --git a/drivers/rtc/rtc-mc146818-lib.c b/drivers/rtc/rtc-mc146818-lib.c
index 7f689f1bafc5..ae9f131b43c0 100644
--- a/drivers/rtc/rtc-mc146818-lib.c
+++ b/drivers/rtc/rtc-mc146818-lib.c
@@ -159,7 +159,7 @@ int mc146818_get_time(struct rtc_time *time)
#endif
#ifdef CONFIG_ACPI
- if (p.century > 20)
+ if (p.century > 19)
time->tm_year += (p.century - 19) * 100;
#endif
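A tiny worked example of the quoted check, assuming the RTC's two-digit year field reads 70 and the century byte reads 20 (tm_year counts years since 1900):
	#include <stdio.h>
	int main(void)
	{
		int year = 70, century = 20;
		int tm_year_old = year, tm_year_new = year;
		if (century > 20)		/* old check: false for century 20 */
			tm_year_old += (century - 19) * 100;
		if (century > 19)		/* fixed check: true for century 20 */
			tm_year_new += (century - 19) * 100;
		printf("old: %d  fixed: %d\n", 1900 + tm_year_old, 1900 + tm_year_new);
		return 0;
	}
This prints "old: 1970  fixed: 2070", matching the hwclock behaviour shown above.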
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From ff164ae39b82ee483b24579c8e22a13a8ce5bd04 Mon Sep 17 00:00:00 2001
From: Riwen Lu <luriwen(a)kylinos.cn>
Date: Thu, 6 Jan 2022 16:46:09 +0800
Subject: [PATCH] rtc: cmos: Evaluate century appropriate
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
The year is currently limited to 2069: when setting the RTC year to 2070,
reading it back returns 1970. Evaluate the century starting from 19 to
compute the correct year.
$ sudo date -s 20700106
Mon 06 Jan 2070 12:00:00 AM CST
$ sudo hwclock -w
$ sudo hwclock -r
1970-01-06 12:00:49.604968+08:00
Fixes: 2a4daadd4d3e5071 ("rtc: cmos: ignore bogus century byte")
Signed-off-by: Riwen Lu <luriwen(a)kylinos.cn>
Acked-by: Eric Wong <e(a)80x24.org>
Reviewed-by: Mateusz Jończyk <mat.jonczyk(a)o2.pl>
Signed-off-by: Alexandre Belloni <alexandre.belloni(a)bootlin.com>
Link: https://lore.kernel.org/r/20220106084609.1223688-1-luriwen@kylinos.cn
diff --git a/drivers/rtc/rtc-mc146818-lib.c b/drivers/rtc/rtc-mc146818-lib.c
index 7f689f1bafc5..ae9f131b43c0 100644
--- a/drivers/rtc/rtc-mc146818-lib.c
+++ b/drivers/rtc/rtc-mc146818-lib.c
@@ -159,7 +159,7 @@ int mc146818_get_time(struct rtc_time *time)
#endif
#ifdef CONFIG_ACPI
- if (p.century > 20)
+ if (p.century > 19)
time->tm_year += (p.century - 19) * 100;
#endif
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From ff164ae39b82ee483b24579c8e22a13a8ce5bd04 Mon Sep 17 00:00:00 2001
From: Riwen Lu <luriwen(a)kylinos.cn>
Date: Thu, 6 Jan 2022 16:46:09 +0800
Subject: [PATCH] rtc: cmos: Evaluate century appropriate
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
The year is currently limited to 2069: when setting the RTC year to 2070,
reading it back returns 1970. Evaluate the century starting from 19 to
compute the correct year.
$ sudo date -s 20700106
Mon 06 Jan 2070 12:00:00 AM CST
$ sudo hwclock -w
$ sudo hwclock -r
1970-01-06 12:00:49.604968+08:00
Fixes: 2a4daadd4d3e5071 ("rtc: cmos: ignore bogus century byte")
Signed-off-by: Riwen Lu <luriwen(a)kylinos.cn>
Acked-by: Eric Wong <e(a)80x24.org>
Reviewed-by: Mateusz Jończyk <mat.jonczyk(a)o2.pl>
Signed-off-by: Alexandre Belloni <alexandre.belloni(a)bootlin.com>
Link: https://lore.kernel.org/r/20220106084609.1223688-1-luriwen@kylinos.cn
diff --git a/drivers/rtc/rtc-mc146818-lib.c b/drivers/rtc/rtc-mc146818-lib.c
index 7f689f1bafc5..ae9f131b43c0 100644
--- a/drivers/rtc/rtc-mc146818-lib.c
+++ b/drivers/rtc/rtc-mc146818-lib.c
@@ -159,7 +159,7 @@ int mc146818_get_time(struct rtc_time *time)
#endif
#ifdef CONFIG_ACPI
- if (p.century > 20)
+ if (p.century > 19)
time->tm_year += (p.century - 19) * 100;
#endif
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From ff164ae39b82ee483b24579c8e22a13a8ce5bd04 Mon Sep 17 00:00:00 2001
From: Riwen Lu <luriwen(a)kylinos.cn>
Date: Thu, 6 Jan 2022 16:46:09 +0800
Subject: [PATCH] rtc: cmos: Evaluate century appropriate
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
The code currently limits the year to 2069. When setting the RTC year to
2070, reading it back returns 1970. Evaluate the century starting from 19
so the correct year is computed.
$ sudo date -s 20700106
Mon 06 Jan 2070 12:00:00 AM CST
$ sudo hwclock -w
$ sudo hwclock -r
1970-01-06 12:00:49.604968+08:00
Fixes: 2a4daadd4d3e5071 ("rtc: cmos: ignore bogus century byte")
Signed-off-by: Riwen Lu <luriwen(a)kylinos.cn>
Acked-by: Eric Wong <e(a)80x24.org>
Reviewed-by: Mateusz Jończyk <mat.jonczyk(a)o2.pl>
Signed-off-by: Alexandre Belloni <alexandre.belloni(a)bootlin.com>
Link: https://lore.kernel.org/r/20220106084609.1223688-1-luriwen@kylinos.cn
diff --git a/drivers/rtc/rtc-mc146818-lib.c b/drivers/rtc/rtc-mc146818-lib.c
index 7f689f1bafc5..ae9f131b43c0 100644
--- a/drivers/rtc/rtc-mc146818-lib.c
+++ b/drivers/rtc/rtc-mc146818-lib.c
@@ -159,7 +159,7 @@ int mc146818_get_time(struct rtc_time *time)
#endif
#ifdef CONFIG_ACPI
- if (p.century > 20)
+ if (p.century > 19)
time->tm_year += (p.century - 19) * 100;
#endif
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From ff164ae39b82ee483b24579c8e22a13a8ce5bd04 Mon Sep 17 00:00:00 2001
From: Riwen Lu <luriwen(a)kylinos.cn>
Date: Thu, 6 Jan 2022 16:46:09 +0800
Subject: [PATCH] rtc: cmos: Evaluate century appropriate
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
The code currently limits the year to 2069. When setting the RTC year to
2070, reading it back returns 1970. Evaluate the century starting from 19
so the correct year is computed.
$ sudo date -s 20700106
Mon 06 Jan 2070 12:00:00 AM CST
$ sudo hwclock -w
$ sudo hwclock -r
1970-01-06 12:00:49.604968+08:00
Fixes: 2a4daadd4d3e5071 ("rtc: cmos: ignore bogus century byte")
Signed-off-by: Riwen Lu <luriwen(a)kylinos.cn>
Acked-by: Eric Wong <e(a)80x24.org>
Reviewed-by: Mateusz Jończyk <mat.jonczyk(a)o2.pl>
Signed-off-by: Alexandre Belloni <alexandre.belloni(a)bootlin.com>
Link: https://lore.kernel.org/r/20220106084609.1223688-1-luriwen@kylinos.cn
diff --git a/drivers/rtc/rtc-mc146818-lib.c b/drivers/rtc/rtc-mc146818-lib.c
index 7f689f1bafc5..ae9f131b43c0 100644
--- a/drivers/rtc/rtc-mc146818-lib.c
+++ b/drivers/rtc/rtc-mc146818-lib.c
@@ -159,7 +159,7 @@ int mc146818_get_time(struct rtc_time *time)
#endif
#ifdef CONFIG_ACPI
- if (p.century > 20)
+ if (p.century > 19)
time->tm_year += (p.century - 19) * 100;
#endif
The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From ff164ae39b82ee483b24579c8e22a13a8ce5bd04 Mon Sep 17 00:00:00 2001
From: Riwen Lu <luriwen(a)kylinos.cn>
Date: Thu, 6 Jan 2022 16:46:09 +0800
Subject: [PATCH] rtc: cmos: Evaluate century appropriate
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
The code currently limits the year to 2069. When setting the RTC year to
2070, reading it back returns 1970. Evaluate the century starting from 19
so the correct year is computed.
$ sudo date -s 20700106
Mon 06 Jan 2070 12:00:00 AM CST
$ sudo hwclock -w
$ sudo hwclock -r
1970-01-06 12:00:49.604968+08:00
Fixes: 2a4daadd4d3e5071 ("rtc: cmos: ignore bogus century byte")
Signed-off-by: Riwen Lu <luriwen(a)kylinos.cn>
Acked-by: Eric Wong <e(a)80x24.org>
Reviewed-by: Mateusz Jończyk <mat.jonczyk(a)o2.pl>
Signed-off-by: Alexandre Belloni <alexandre.belloni(a)bootlin.com>
Link: https://lore.kernel.org/r/20220106084609.1223688-1-luriwen@kylinos.cn
diff --git a/drivers/rtc/rtc-mc146818-lib.c b/drivers/rtc/rtc-mc146818-lib.c
index 7f689f1bafc5..ae9f131b43c0 100644
--- a/drivers/rtc/rtc-mc146818-lib.c
+++ b/drivers/rtc/rtc-mc146818-lib.c
@@ -159,7 +159,7 @@ int mc146818_get_time(struct rtc_time *time)
#endif
#ifdef CONFIG_ACPI
- if (p.century > 20)
+ if (p.century > 19)
time->tm_year += (p.century - 19) * 100;
#endif
The patch below does not apply to the 4.9-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From ff164ae39b82ee483b24579c8e22a13a8ce5bd04 Mon Sep 17 00:00:00 2001
From: Riwen Lu <luriwen(a)kylinos.cn>
Date: Thu, 6 Jan 2022 16:46:09 +0800
Subject: [PATCH] rtc: cmos: Evaluate century appropriate
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
The code currently limits the year to 2069. When setting the RTC year to
2070, reading it back returns 1970. Evaluate the century starting from 19
so the correct year is computed.
$ sudo date -s 20700106
Mon 06 Jan 2070 12:00:00 AM CST
$ sudo hwclock -w
$ sudo hwclock -r
1970-01-06 12:00:49.604968+08:00
Fixes: 2a4daadd4d3e5071 ("rtc: cmos: ignore bogus century byte")
Signed-off-by: Riwen Lu <luriwen(a)kylinos.cn>
Acked-by: Eric Wong <e(a)80x24.org>
Reviewed-by: Mateusz Jończyk <mat.jonczyk(a)o2.pl>
Signed-off-by: Alexandre Belloni <alexandre.belloni(a)bootlin.com>
Link: https://lore.kernel.org/r/20220106084609.1223688-1-luriwen@kylinos.cn
diff --git a/drivers/rtc/rtc-mc146818-lib.c b/drivers/rtc/rtc-mc146818-lib.c
index 7f689f1bafc5..ae9f131b43c0 100644
--- a/drivers/rtc/rtc-mc146818-lib.c
+++ b/drivers/rtc/rtc-mc146818-lib.c
@@ -159,7 +159,7 @@ int mc146818_get_time(struct rtc_time *time)
#endif
#ifdef CONFIG_ACPI
- if (p.century > 20)
+ if (p.century > 19)
time->tm_year += (p.century - 19) * 100;
#endif
The patch below does not apply to the 4.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From ff164ae39b82ee483b24579c8e22a13a8ce5bd04 Mon Sep 17 00:00:00 2001
From: Riwen Lu <luriwen(a)kylinos.cn>
Date: Thu, 6 Jan 2022 16:46:09 +0800
Subject: [PATCH] rtc: cmos: Evaluate century appropriate
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
The code currently limits the year to 2069. When setting the RTC year to
2070, reading it back returns 1970. Evaluate the century starting from 19
so the correct year is computed.
$ sudo date -s 20700106
Mon 06 Jan 2070 12:00:00 AM CST
$ sudo hwclock -w
$ sudo hwclock -r
1970-01-06 12:00:49.604968+08:00
Fixes: 2a4daadd4d3e5071 ("rtc: cmos: ignore bogus century byte")
Signed-off-by: Riwen Lu <luriwen(a)kylinos.cn>
Acked-by: Eric Wong <e(a)80x24.org>
Reviewed-by: Mateusz Jończyk <mat.jonczyk(a)o2.pl>
Signed-off-by: Alexandre Belloni <alexandre.belloni(a)bootlin.com>
Link: https://lore.kernel.org/r/20220106084609.1223688-1-luriwen@kylinos.cn
diff --git a/drivers/rtc/rtc-mc146818-lib.c b/drivers/rtc/rtc-mc146818-lib.c
index 7f689f1bafc5..ae9f131b43c0 100644
--- a/drivers/rtc/rtc-mc146818-lib.c
+++ b/drivers/rtc/rtc-mc146818-lib.c
@@ -159,7 +159,7 @@ int mc146818_get_time(struct rtc_time *time)
#endif
#ifdef CONFIG_ACPI
- if (p.century > 20)
+ if (p.century > 19)
time->tm_year += (p.century - 19) * 100;
#endif
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From ea6fa4961aab8f90a8aa03575a98b4bda368d4b6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mateusz=20Jo=C5=84czyk?= <mat.jonczyk(a)o2.pl>
Date: Fri, 10 Dec 2021 21:01:26 +0100
Subject: [PATCH] rtc: mc146818-lib: fix RTC presence check
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
To prevent an infinite loop in mc146818_get_time(),
commit 211e5db19d15 ("rtc: mc146818: Detect and handle broken RTCs")
added a check for RTC availability. Together with a later fix, it
checked if bit 6 in register 0x0d is cleared.
This, however, caused a false negative on a motherboard with an AMD
SB710 southbridge; according to the specification [1], bit 6 of register
0x0d of this chipset is a scratchbit. This caused a regression in Linux
5.11 - the RTC was determined broken by the kernel and not used by
rtc-cmos.c [3]. This problem was also reported in Fedora [4].
As a better alternative, check whether the UIP ("Update-in-progress")
bit is set for longer than 10ms. If that is the case, then apparently
the RTC is either absent (and all register reads return 0xff) or broken.
Also limit the number of loop iterations in mc146818_get_time() to 10 to
prevent an infinite loop there.
The functions mc146818_get_time() and mc146818_does_rtc_work() will be
refactored later in this patch series, in order to fix a separate
problem with reading / setting the RTC alarm time. This is done to
avoid confusion about what is being fixed when.
In a previous approach to this problem, I implemented a check whether
the RTC_HOURS register contains a value <= 24. This, however, sometimes
did not work correctly on my Intel Kaby Lake laptop. According to
Intel's documentation [2], "the time and date RAM locations (0-9) are
disconnected from the external bus" during the update cycle so reading
this register without checking the UIP bit is incorrect.
[1] AMD SB700/710/750 Register Reference Guide, page 308,
https://developer.amd.com/wordpress/media/2012/10/43009_sb7xx_rrg_pub_1.00.…
[2] 7th Generation Intel ® Processor Family I/O for U/Y Platforms [...] Datasheet
Volume 1 of 2, page 209
Intel's Document Number: 334658-006,
https://www.intel.com/content/dam/www/public/us/en/documents/datasheets/7th…
[3] Functions in arch/x86/kernel/rtc.c apparently were using it.
[4] https://bugzilla.redhat.com/show_bug.cgi?id=1936688
Fixes: 211e5db19d15 ("rtc: mc146818: Detect and handle broken RTCs")
Fixes: ebb22a059436 ("rtc: mc146818: Dont test for bit 0-5 in Register D")
Signed-off-by: Mateusz Jończyk <mat.jonczyk(a)o2.pl>
Cc: Thomas Gleixner <tglx(a)linutronix.de>
Cc: Alessandro Zummo <a.zummo(a)towertech.it>
Cc: Alexandre Belloni <alexandre.belloni(a)bootlin.com>
Signed-off-by: Alexandre Belloni <alexandre.belloni(a)bootlin.com>
Link: https://lore.kernel.org/r/20211210200131.153887-5-mat.jonczyk@o2.pl
diff --git a/drivers/rtc/rtc-cmos.c b/drivers/rtc/rtc-cmos.c
index d0f58cca5c20..b90a603d6b12 100644
--- a/drivers/rtc/rtc-cmos.c
+++ b/drivers/rtc/rtc-cmos.c
@@ -800,16 +800,14 @@ cmos_do_probe(struct device *dev, struct resource *ports, int rtc_irq)
rename_region(ports, dev_name(&cmos_rtc.rtc->dev));
- spin_lock_irq(&rtc_lock);
-
- /* Ensure that the RTC is accessible. Bit 6 must be 0! */
- if ((CMOS_READ(RTC_VALID) & 0x40) != 0) {
- spin_unlock_irq(&rtc_lock);
- dev_warn(dev, "not accessible\n");
+ if (!mc146818_does_rtc_work()) {
+ dev_warn(dev, "broken or not accessible\n");
retval = -ENXIO;
goto cleanup1;
}
+ spin_lock_irq(&rtc_lock);
+
if (!(flags & CMOS_RTC_FLAGS_NOFREQ)) {
/* force periodic irq to CMOS reset default of 1024Hz;
*
diff --git a/drivers/rtc/rtc-mc146818-lib.c b/drivers/rtc/rtc-mc146818-lib.c
index ccd974b8a75a..d8e67a01220e 100644
--- a/drivers/rtc/rtc-mc146818-lib.c
+++ b/drivers/rtc/rtc-mc146818-lib.c
@@ -8,10 +8,36 @@
#include <linux/acpi.h>
#endif
+/*
+ * If the UIP (Update-in-progress) bit of the RTC is set for more then
+ * 10ms, the RTC is apparently broken or not present.
+ */
+bool mc146818_does_rtc_work(void)
+{
+ int i;
+ unsigned char val;
+ unsigned long flags;
+
+ for (i = 0; i < 10; i++) {
+ spin_lock_irqsave(&rtc_lock, flags);
+ val = CMOS_READ(RTC_FREQ_SELECT);
+ spin_unlock_irqrestore(&rtc_lock, flags);
+
+ if ((val & RTC_UIP) == 0)
+ return true;
+
+ mdelay(1);
+ }
+
+ return false;
+}
+EXPORT_SYMBOL_GPL(mc146818_does_rtc_work);
+
unsigned int mc146818_get_time(struct rtc_time *time)
{
unsigned char ctrl;
unsigned long flags;
+ unsigned int iter_count = 0;
unsigned char century = 0;
bool retry;
@@ -20,13 +46,13 @@ unsigned int mc146818_get_time(struct rtc_time *time)
#endif
again:
- spin_lock_irqsave(&rtc_lock, flags);
- /* Ensure that the RTC is accessible. Bit 6 must be 0! */
- if (WARN_ON_ONCE((CMOS_READ(RTC_VALID) & 0x40) != 0)) {
- spin_unlock_irqrestore(&rtc_lock, flags);
+ if (iter_count > 10) {
memset(time, 0, sizeof(*time));
return -EIO;
}
+ iter_count++;
+
+ spin_lock_irqsave(&rtc_lock, flags);
/*
* Check whether there is an update in progress during which the
diff --git a/include/linux/mc146818rtc.h b/include/linux/mc146818rtc.h
index 0661af17a758..69c80c4325bf 100644
--- a/include/linux/mc146818rtc.h
+++ b/include/linux/mc146818rtc.h
@@ -123,6 +123,7 @@ struct cmos_rtc_board_info {
#define RTC_IO_EXTENT_USED RTC_IO_EXTENT
#endif /* ARCH_RTC_LOCATION */
+bool mc146818_does_rtc_work(void);
unsigned int mc146818_get_time(struct rtc_time *time);
int mc146818_set_time(struct rtc_time *time);
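As a usage illustration (a hypothetical caller, not part of the patch),
a probe routine built on the new helper might look like the sketch below;
the real user is cmos_do_probe() in the rtc-cmos.c hunk above.
#include <linux/device.h>
#include <linux/mc146818rtc.h>
#include <linux/rtc.h>
static int example_rtc_probe(struct device *dev)
{
	struct rtc_time tm;
	/* Bail out early if UIP never clears within ~10ms. */
	if (!mc146818_does_rtc_work()) {
		dev_warn(dev, "CMOS RTC broken or not accessible\n");
		return -ENXIO;
	}
	/* mc146818_get_time() now also gives up after 10 attempts. */
	if (mc146818_get_time(&tm)) {
		dev_err(dev, "unable to read current time\n");
		return -EIO;
	}
	dev_info(dev, "RTC year: %d\n", tm.tm_year + 1900);
	return 0;
}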
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 2d19c3fd80178160dd505ccd7fed1643831227a5 Mon Sep 17 00:00:00 2001
From: Robert Hancock <robert.hancock(a)calian.com>
Date: Tue, 18 Jan 2022 15:41:32 -0600
Subject: [PATCH] net: axienet: increase default TX ring size to 128
With previous changes to make the driver handle the TX ring size more
correctly, the default TX ring size of 64 appears to significantly
bottleneck TX performance to around 600 Mbps on a 1 Gbps link on ZynqMP.
Increasing this to 128 seems to bring performance up to near line rate and
shouldn't cause excess bufferbloat (this driver doesn't yet support modern
byte-based queue management).
Fixes: 8a3b7a252dca9 ("drivers/net/ethernet/xilinx: added Xilinx AXI Ethernet driver")
Signed-off-by: Robert Hancock <robert.hancock(a)calian.com>
Signed-off-by: David S. Miller <davem(a)davemloft.net>
diff --git a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c
index b4f42ee9b75d..377c94ec2486 100644
--- a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c
+++ b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c
@@ -41,7 +41,7 @@
#include "xilinx_axienet.h"
/* Descriptors defines for Tx and Rx DMA */
-#define TX_BD_NUM_DEFAULT 64
+#define TX_BD_NUM_DEFAULT 128
#define RX_BD_NUM_DEFAULT 1024
#define TX_BD_NUM_MIN (MAX_SKB_FRAGS + 1)
#define TX_BD_NUM_MAX 4096
The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 2d19c3fd80178160dd505ccd7fed1643831227a5 Mon Sep 17 00:00:00 2001
From: Robert Hancock <robert.hancock(a)calian.com>
Date: Tue, 18 Jan 2022 15:41:32 -0600
Subject: [PATCH] net: axienet: increase default TX ring size to 128
With previous changes to make the driver handle the TX ring size more
correctly, the default TX ring size of 64 appears to significantly
bottleneck TX performance to around 600 Mbps on a 1 Gbps link on ZynqMP.
Increasing this to 128 seems to bring performance up to near line rate and
shouldn't cause excess bufferbloat (this driver doesn't yet support modern
byte-based queue management).
Fixes: 8a3b7a252dca9 ("drivers/net/ethernet/xilinx: added Xilinx AXI Ethernet driver")
Signed-off-by: Robert Hancock <robert.hancock(a)calian.com>
Signed-off-by: David S. Miller <davem(a)davemloft.net>
diff --git a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c
index b4f42ee9b75d..377c94ec2486 100644
--- a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c
+++ b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c
@@ -41,7 +41,7 @@
#include "xilinx_axienet.h"
/* Descriptors defines for Tx and Rx DMA */
-#define TX_BD_NUM_DEFAULT 64
+#define TX_BD_NUM_DEFAULT 128
#define RX_BD_NUM_DEFAULT 1024
#define TX_BD_NUM_MIN (MAX_SKB_FRAGS + 1)
#define TX_BD_NUM_MAX 4096
The patch below does not apply to the 4.9-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 2d19c3fd80178160dd505ccd7fed1643831227a5 Mon Sep 17 00:00:00 2001
From: Robert Hancock <robert.hancock(a)calian.com>
Date: Tue, 18 Jan 2022 15:41:32 -0600
Subject: [PATCH] net: axienet: increase default TX ring size to 128
With previous changes to make the driver handle the TX ring size more
correctly, the default TX ring size of 64 appears to significantly
bottleneck TX performance to around 600 Mbps on a 1 Gbps link on ZynqMP.
Increasing this to 128 seems to bring performance up to near line rate and
shouldn't cause excess bufferbloat (this driver doesn't yet support modern
byte-based queue management).
Fixes: 8a3b7a252dca9 ("drivers/net/ethernet/xilinx: added Xilinx AXI Ethernet driver")
Signed-off-by: Robert Hancock <robert.hancock(a)calian.com>
Signed-off-by: David S. Miller <davem(a)davemloft.net>
diff --git a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c
index b4f42ee9b75d..377c94ec2486 100644
--- a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c
+++ b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c
@@ -41,7 +41,7 @@
#include "xilinx_axienet.h"
/* Descriptors defines for Tx and Rx DMA */
-#define TX_BD_NUM_DEFAULT 64
+#define TX_BD_NUM_DEFAULT 128
#define RX_BD_NUM_DEFAULT 1024
#define TX_BD_NUM_MIN (MAX_SKB_FRAGS + 1)
#define TX_BD_NUM_MAX 4096
The patch below does not apply to the 4.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 2d19c3fd80178160dd505ccd7fed1643831227a5 Mon Sep 17 00:00:00 2001
From: Robert Hancock <robert.hancock(a)calian.com>
Date: Tue, 18 Jan 2022 15:41:32 -0600
Subject: [PATCH] net: axienet: increase default TX ring size to 128
With previous changes to make the driver handle the TX ring size more
correctly, the default TX ring size of 64 appears to significantly
bottleneck TX performance to around 600 Mbps on a 1 Gbps link on ZynqMP.
Increasing this to 128 seems to bring performance up to near line rate and
shouldn't cause excess bufferbloat (this driver doesn't yet support modern
byte-based queue management).
Fixes: 8a3b7a252dca9 ("drivers/net/ethernet/xilinx: added Xilinx AXI Ethernet driver")
Signed-off-by: Robert Hancock <robert.hancock(a)calian.com>
Signed-off-by: David S. Miller <davem(a)davemloft.net>
diff --git a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c
index b4f42ee9b75d..377c94ec2486 100644
--- a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c
+++ b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c
@@ -41,7 +41,7 @@
#include "xilinx_axienet.h"
/* Descriptors defines for Tx and Rx DMA */
-#define TX_BD_NUM_DEFAULT 64
+#define TX_BD_NUM_DEFAULT 128
#define RX_BD_NUM_DEFAULT 1024
#define TX_BD_NUM_MIN (MAX_SKB_FRAGS + 1)
#define TX_BD_NUM_MAX 4096
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From bb193e3db8b86a63f26889c99e14fd30c9ebd72a Mon Sep 17 00:00:00 2001
From: Robert Hancock <robert.hancock(a)calian.com>
Date: Tue, 18 Jan 2022 15:41:31 -0600
Subject: [PATCH] net: axienet: fix for TX busy handling
Network driver documentation indicates we should avoid returning
NETDEV_TX_BUSY from ndo_start_xmit in normal cases, since it requires
the packets to be requeued. Instead the queue should be stopped after
a packet is added to the TX ring when there may not be enough room for an
additional one. Also, when TX ring entries are completed, we should only
wake the queue if we know there is room for another full maximally
fragmented packet.
Print a warning if there is insufficient space at the start of start_xmit,
since this should no longer happen.
Combined with increasing the default TX ring size (in a subsequent
patch), this appears to recover the TX performance lost by previous changes
to actually manage the TX ring state properly.
Fixes: 8a3b7a252dca9 ("drivers/net/ethernet/xilinx: added Xilinx AXI Ethernet driver")
Signed-off-by: Robert Hancock <robert.hancock(a)calian.com>
Signed-off-by: David S. Miller <davem(a)davemloft.net>
diff --git a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c
index 8dc9e92e05d2..b4f42ee9b75d 100644
--- a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c
+++ b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c
@@ -660,6 +660,32 @@ static int axienet_free_tx_chain(struct net_device *ndev, u32 first_bd,
return i;
}
+/**
+ * axienet_check_tx_bd_space - Checks if a BD/group of BDs are currently busy
+ * @lp: Pointer to the axienet_local structure
+ * @num_frag: The number of BDs to check for
+ *
+ * Return: 0, on success
+ * NETDEV_TX_BUSY, if any of the descriptors are not free
+ *
+ * This function is invoked before BDs are allocated and transmission starts.
+ * This function returns 0 if a BD or group of BDs can be allocated for
+ * transmission. If the BD or any of the BDs are not free the function
+ * returns a busy status. This is invoked from axienet_start_xmit.
+ */
+static inline int axienet_check_tx_bd_space(struct axienet_local *lp,
+ int num_frag)
+{
+ struct axidma_bd *cur_p;
+
+ /* Ensure we see all descriptor updates from device or TX IRQ path */
+ rmb();
+ cur_p = &lp->tx_bd_v[(lp->tx_bd_tail + num_frag) % lp->tx_bd_num];
+ if (cur_p->cntrl)
+ return NETDEV_TX_BUSY;
+ return 0;
+}
+
/**
* axienet_start_xmit_done - Invoked once a transmit is completed by the
* Axi DMA Tx channel.
@@ -689,33 +715,8 @@ static void axienet_start_xmit_done(struct net_device *ndev)
/* Matches barrier in axienet_start_xmit */
smp_mb();
- netif_wake_queue(ndev);
-}
-
-/**
- * axienet_check_tx_bd_space - Checks if a BD/group of BDs are currently busy
- * @lp: Pointer to the axienet_local structure
- * @num_frag: The number of BDs to check for
- *
- * Return: 0, on success
- * NETDEV_TX_BUSY, if any of the descriptors are not free
- *
- * This function is invoked before BDs are allocated and transmission starts.
- * This function returns 0 if a BD or group of BDs can be allocated for
- * transmission. If the BD or any of the BDs are not free the function
- * returns a busy status. This is invoked from axienet_start_xmit.
- */
-static inline int axienet_check_tx_bd_space(struct axienet_local *lp,
- int num_frag)
-{
- struct axidma_bd *cur_p;
-
- /* Ensure we see all descriptor updates from device or TX IRQ path */
- rmb();
- cur_p = &lp->tx_bd_v[(lp->tx_bd_tail + num_frag) % lp->tx_bd_num];
- if (cur_p->cntrl)
- return NETDEV_TX_BUSY;
- return 0;
+ if (!axienet_check_tx_bd_space(lp, MAX_SKB_FRAGS + 1))
+ netif_wake_queue(ndev);
}
/**
@@ -748,19 +749,14 @@ axienet_start_xmit(struct sk_buff *skb, struct net_device *ndev)
cur_p = &lp->tx_bd_v[lp->tx_bd_tail];
if (axienet_check_tx_bd_space(lp, num_frag + 1)) {
- if (netif_queue_stopped(ndev))
- return NETDEV_TX_BUSY;
-
+ /* Should not happen as last start_xmit call should have
+ * checked for sufficient space and queue should only be
+ * woken when sufficient space is available.
+ */
netif_stop_queue(ndev);
-
- /* Matches barrier in axienet_start_xmit_done */
- smp_mb();
-
- /* Space might have just been freed - check again */
- if (axienet_check_tx_bd_space(lp, num_frag + 1))
- return NETDEV_TX_BUSY;
-
- netif_wake_queue(ndev);
+ if (net_ratelimit())
+ netdev_warn(ndev, "TX ring unexpectedly full\n");
+ return NETDEV_TX_BUSY;
}
if (skb->ip_summed == CHECKSUM_PARTIAL) {
@@ -821,6 +817,18 @@ axienet_start_xmit(struct sk_buff *skb, struct net_device *ndev)
if (++lp->tx_bd_tail >= lp->tx_bd_num)
lp->tx_bd_tail = 0;
+ /* Stop queue if next transmit may not have space */
+ if (axienet_check_tx_bd_space(lp, MAX_SKB_FRAGS + 1)) {
+ netif_stop_queue(ndev);
+
+ /* Matches barrier in axienet_start_xmit_done */
+ smp_mb();
+
+ /* Space might have just been freed - check again */
+ if (!axienet_check_tx_bd_space(lp, MAX_SKB_FRAGS + 1))
+ netif_wake_queue(ndev);
+ }
+
return NETDEV_TX_OK;
}
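The hunks above follow the generic stop-early/wake-late pattern recommended
by the kernel's network driver documentation. A condensed, hypothetical
sketch of that pattern is shown below; my_tx_ring_has_room() is a
placeholder for a driver-specific space check (axienet uses
axienet_check_tx_bd_space()), not an existing kernel function.
#include <linux/netdevice.h>
#include <linux/skbuff.h>
/* Placeholder: returns true if @needed descriptors are free (driver-specific). */
static bool my_tx_ring_has_room(struct net_device *ndev, int needed);
static netdev_tx_t my_start_xmit(struct sk_buff *skb, struct net_device *ndev)
{
	/* ... map @skb and queue it onto the TX ring ... */
	/* Stop while there may be no room for one more worst-case skb. */
	if (!my_tx_ring_has_room(ndev, MAX_SKB_FRAGS + 1)) {
		netif_stop_queue(ndev);
		/* Pairs with the barrier in my_tx_complete(). */
		smp_mb();
		/* The completion path may have freed space meanwhile. */
		if (my_tx_ring_has_room(ndev, MAX_SKB_FRAGS + 1))
			netif_wake_queue(ndev);
	}
	return NETDEV_TX_OK;
}
static void my_tx_complete(struct net_device *ndev)
{
	/* ... reclaim finished descriptors ... */
	smp_mb();	/* pairs with the barrier in my_start_xmit() */
	if (netif_queue_stopped(ndev) &&
	    my_tx_ring_has_room(ndev, MAX_SKB_FRAGS + 1))
		netif_wake_queue(ndev);
}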
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From bb193e3db8b86a63f26889c99e14fd30c9ebd72a Mon Sep 17 00:00:00 2001
From: Robert Hancock <robert.hancock(a)calian.com>
Date: Tue, 18 Jan 2022 15:41:31 -0600
Subject: [PATCH] net: axienet: fix for TX busy handling
Network driver documentation indicates we should avoid returning
NETDEV_TX_BUSY from ndo_start_xmit in normal cases, since it requires
the packets to be requeued. Instead the queue should be stopped after
a packet is added to the TX ring when there may not be enough room for an
additional one. Also, when TX ring entries are completed, we should only
wake the queue if we know there is room for another full maximally
fragmented packet.
Print a warning if there is insufficient space at the start of start_xmit,
since this should no longer happen.
Combined with increasing the default TX ring size (in a subsequent
patch), this appears to recover the TX performance lost by previous changes
to actually manage the TX ring state properly.
Fixes: 8a3b7a252dca9 ("drivers/net/ethernet/xilinx: added Xilinx AXI Ethernet driver")
Signed-off-by: Robert Hancock <robert.hancock(a)calian.com>
Signed-off-by: David S. Miller <davem(a)davemloft.net>
diff --git a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c
index 8dc9e92e05d2..b4f42ee9b75d 100644
--- a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c
+++ b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c
@@ -660,6 +660,32 @@ static int axienet_free_tx_chain(struct net_device *ndev, u32 first_bd,
return i;
}
+/**
+ * axienet_check_tx_bd_space - Checks if a BD/group of BDs are currently busy
+ * @lp: Pointer to the axienet_local structure
+ * @num_frag: The number of BDs to check for
+ *
+ * Return: 0, on success
+ * NETDEV_TX_BUSY, if any of the descriptors are not free
+ *
+ * This function is invoked before BDs are allocated and transmission starts.
+ * This function returns 0 if a BD or group of BDs can be allocated for
+ * transmission. If the BD or any of the BDs are not free the function
+ * returns a busy status. This is invoked from axienet_start_xmit.
+ */
+static inline int axienet_check_tx_bd_space(struct axienet_local *lp,
+ int num_frag)
+{
+ struct axidma_bd *cur_p;
+
+ /* Ensure we see all descriptor updates from device or TX IRQ path */
+ rmb();
+ cur_p = &lp->tx_bd_v[(lp->tx_bd_tail + num_frag) % lp->tx_bd_num];
+ if (cur_p->cntrl)
+ return NETDEV_TX_BUSY;
+ return 0;
+}
+
/**
* axienet_start_xmit_done - Invoked once a transmit is completed by the
* Axi DMA Tx channel.
@@ -689,33 +715,8 @@ static void axienet_start_xmit_done(struct net_device *ndev)
/* Matches barrier in axienet_start_xmit */
smp_mb();
- netif_wake_queue(ndev);
-}
-
-/**
- * axienet_check_tx_bd_space - Checks if a BD/group of BDs are currently busy
- * @lp: Pointer to the axienet_local structure
- * @num_frag: The number of BDs to check for
- *
- * Return: 0, on success
- * NETDEV_TX_BUSY, if any of the descriptors are not free
- *
- * This function is invoked before BDs are allocated and transmission starts.
- * This function returns 0 if a BD or group of BDs can be allocated for
- * transmission. If the BD or any of the BDs are not free the function
- * returns a busy status. This is invoked from axienet_start_xmit.
- */
-static inline int axienet_check_tx_bd_space(struct axienet_local *lp,
- int num_frag)
-{
- struct axidma_bd *cur_p;
-
- /* Ensure we see all descriptor updates from device or TX IRQ path */
- rmb();
- cur_p = &lp->tx_bd_v[(lp->tx_bd_tail + num_frag) % lp->tx_bd_num];
- if (cur_p->cntrl)
- return NETDEV_TX_BUSY;
- return 0;
+ if (!axienet_check_tx_bd_space(lp, MAX_SKB_FRAGS + 1))
+ netif_wake_queue(ndev);
}
/**
@@ -748,19 +749,14 @@ axienet_start_xmit(struct sk_buff *skb, struct net_device *ndev)
cur_p = &lp->tx_bd_v[lp->tx_bd_tail];
if (axienet_check_tx_bd_space(lp, num_frag + 1)) {
- if (netif_queue_stopped(ndev))
- return NETDEV_TX_BUSY;
-
+ /* Should not happen as last start_xmit call should have
+ * checked for sufficient space and queue should only be
+ * woken when sufficient space is available.
+ */
netif_stop_queue(ndev);
-
- /* Matches barrier in axienet_start_xmit_done */
- smp_mb();
-
- /* Space might have just been freed - check again */
- if (axienet_check_tx_bd_space(lp, num_frag + 1))
- return NETDEV_TX_BUSY;
-
- netif_wake_queue(ndev);
+ if (net_ratelimit())
+ netdev_warn(ndev, "TX ring unexpectedly full\n");
+ return NETDEV_TX_BUSY;
}
if (skb->ip_summed == CHECKSUM_PARTIAL) {
@@ -821,6 +817,18 @@ axienet_start_xmit(struct sk_buff *skb, struct net_device *ndev)
if (++lp->tx_bd_tail >= lp->tx_bd_num)
lp->tx_bd_tail = 0;
+ /* Stop queue if next transmit may not have space */
+ if (axienet_check_tx_bd_space(lp, MAX_SKB_FRAGS + 1)) {
+ netif_stop_queue(ndev);
+
+ /* Matches barrier in axienet_start_xmit_done */
+ smp_mb();
+
+ /* Space might have just been freed - check again */
+ if (!axienet_check_tx_bd_space(lp, MAX_SKB_FRAGS + 1))
+ netif_wake_queue(ndev);
+ }
+
return NETDEV_TX_OK;
}
The patch below does not apply to the 4.9-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From bb193e3db8b86a63f26889c99e14fd30c9ebd72a Mon Sep 17 00:00:00 2001
From: Robert Hancock <robert.hancock(a)calian.com>
Date: Tue, 18 Jan 2022 15:41:31 -0600
Subject: [PATCH] net: axienet: fix for TX busy handling
Network driver documentation indicates we should avoid returning
NETDEV_TX_BUSY from ndo_start_xmit in normal cases, since it requires
the packets to be requeued. Instead the queue should be stopped after
a packet is added to the TX ring when there may not be enough room for an
additional one. Also, when TX ring entries are completed, we should only
wake the queue if we know there is room for another full maximally
fragmented packet.
Print a warning if there is insufficient space at the start of start_xmit,
since this should no longer happen.
Combined with increasing the default TX ring size (in a subsequent
patch), this appears to recover the TX performance lost by previous changes
to actually manage the TX ring state properly.
Fixes: 8a3b7a252dca9 ("drivers/net/ethernet/xilinx: added Xilinx AXI Ethernet driver")
Signed-off-by: Robert Hancock <robert.hancock(a)calian.com>
Signed-off-by: David S. Miller <davem(a)davemloft.net>
diff --git a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c
index 8dc9e92e05d2..b4f42ee9b75d 100644
--- a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c
+++ b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c
@@ -660,6 +660,32 @@ static int axienet_free_tx_chain(struct net_device *ndev, u32 first_bd,
return i;
}
+/**
+ * axienet_check_tx_bd_space - Checks if a BD/group of BDs are currently busy
+ * @lp: Pointer to the axienet_local structure
+ * @num_frag: The number of BDs to check for
+ *
+ * Return: 0, on success
+ * NETDEV_TX_BUSY, if any of the descriptors are not free
+ *
+ * This function is invoked before BDs are allocated and transmission starts.
+ * This function returns 0 if a BD or group of BDs can be allocated for
+ * transmission. If the BD or any of the BDs are not free the function
+ * returns a busy status. This is invoked from axienet_start_xmit.
+ */
+static inline int axienet_check_tx_bd_space(struct axienet_local *lp,
+ int num_frag)
+{
+ struct axidma_bd *cur_p;
+
+ /* Ensure we see all descriptor updates from device or TX IRQ path */
+ rmb();
+ cur_p = &lp->tx_bd_v[(lp->tx_bd_tail + num_frag) % lp->tx_bd_num];
+ if (cur_p->cntrl)
+ return NETDEV_TX_BUSY;
+ return 0;
+}
+
/**
* axienet_start_xmit_done - Invoked once a transmit is completed by the
* Axi DMA Tx channel.
@@ -689,33 +715,8 @@ static void axienet_start_xmit_done(struct net_device *ndev)
/* Matches barrier in axienet_start_xmit */
smp_mb();
- netif_wake_queue(ndev);
-}
-
-/**
- * axienet_check_tx_bd_space - Checks if a BD/group of BDs are currently busy
- * @lp: Pointer to the axienet_local structure
- * @num_frag: The number of BDs to check for
- *
- * Return: 0, on success
- * NETDEV_TX_BUSY, if any of the descriptors are not free
- *
- * This function is invoked before BDs are allocated and transmission starts.
- * This function returns 0 if a BD or group of BDs can be allocated for
- * transmission. If the BD or any of the BDs are not free the function
- * returns a busy status. This is invoked from axienet_start_xmit.
- */
-static inline int axienet_check_tx_bd_space(struct axienet_local *lp,
- int num_frag)
-{
- struct axidma_bd *cur_p;
-
- /* Ensure we see all descriptor updates from device or TX IRQ path */
- rmb();
- cur_p = &lp->tx_bd_v[(lp->tx_bd_tail + num_frag) % lp->tx_bd_num];
- if (cur_p->cntrl)
- return NETDEV_TX_BUSY;
- return 0;
+ if (!axienet_check_tx_bd_space(lp, MAX_SKB_FRAGS + 1))
+ netif_wake_queue(ndev);
}
/**
@@ -748,19 +749,14 @@ axienet_start_xmit(struct sk_buff *skb, struct net_device *ndev)
cur_p = &lp->tx_bd_v[lp->tx_bd_tail];
if (axienet_check_tx_bd_space(lp, num_frag + 1)) {
- if (netif_queue_stopped(ndev))
- return NETDEV_TX_BUSY;
-
+ /* Should not happen as last start_xmit call should have
+ * checked for sufficient space and queue should only be
+ * woken when sufficient space is available.
+ */
netif_stop_queue(ndev);
-
- /* Matches barrier in axienet_start_xmit_done */
- smp_mb();
-
- /* Space might have just been freed - check again */
- if (axienet_check_tx_bd_space(lp, num_frag + 1))
- return NETDEV_TX_BUSY;
-
- netif_wake_queue(ndev);
+ if (net_ratelimit())
+ netdev_warn(ndev, "TX ring unexpectedly full\n");
+ return NETDEV_TX_BUSY;
}
if (skb->ip_summed == CHECKSUM_PARTIAL) {
@@ -821,6 +817,18 @@ axienet_start_xmit(struct sk_buff *skb, struct net_device *ndev)
if (++lp->tx_bd_tail >= lp->tx_bd_num)
lp->tx_bd_tail = 0;
+ /* Stop queue if next transmit may not have space */
+ if (axienet_check_tx_bd_space(lp, MAX_SKB_FRAGS + 1)) {
+ netif_stop_queue(ndev);
+
+ /* Matches barrier in axienet_start_xmit_done */
+ smp_mb();
+
+ /* Space might have just been freed - check again */
+ if (!axienet_check_tx_bd_space(lp, MAX_SKB_FRAGS + 1))
+ netif_wake_queue(ndev);
+ }
+
return NETDEV_TX_OK;
}