The patch below does not apply to the 4.18-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 7d867a1b765e2b70815fec4964d7822a976ed349 Mon Sep 17 00:00:00 2001
From: Hans Verkuil <hans.verkuil(a)cisco.com>
Date: Fri, 5 Oct 2018 08:00:21 -0400
Subject: [PATCH] media: cec: fix the Signal Free Time calculation
The calculation of the Signal Free Time in the framework was not
correct. If a message was received, then the next transmit should be
considered a New Initiator and use a shorter SFT value.
This was not done with the result that if both sides where continually
sending messages, they both could use the same SFT value and one side
could deny the other side access to the bus.
Note that this fix does not take the corner case into account where
a receive is in progress when you call adap_transmit.
Signed-off-by: Hans Verkuil <hans.verkuil(a)cisco.com>
Cc: <stable(a)vger.kernel.org> # for v4.18 and up
Signed-off-by: Mauro Carvalho Chehab <mchehab+samsung(a)kernel.org>
diff --git a/drivers/media/cec/cec-adap.c b/drivers/media/cec/cec-adap.c
index e6e82b504e56..0c0d9107383e 100644
--- a/drivers/media/cec/cec-adap.c
+++ b/drivers/media/cec/cec-adap.c
@@ -526,9 +526,11 @@ int cec_thread_func(void *_adap)
if (data->attempts) {
/* should be >= 3 data bit periods for a retry */
signal_free_time = CEC_SIGNAL_FREE_TIME_RETRY;
- } else if (data->new_initiator) {
+ } else if (adap->last_initiator !=
+ cec_msg_initiator(&data->msg)) {
/* should be >= 5 data bit periods for new initiator */
signal_free_time = CEC_SIGNAL_FREE_TIME_NEW_INITIATOR;
+ adap->last_initiator = cec_msg_initiator(&data->msg);
} else {
/*
* should be >= 7 data bit periods for sending another
@@ -713,7 +715,6 @@ int cec_transmit_msg_fh(struct cec_adapter *adap, struct cec_msg *msg,
struct cec_fh *fh, bool block)
{
struct cec_data *data;
- u8 last_initiator = 0xff;
msg->rx_ts = 0;
msg->tx_ts = 0;
@@ -823,23 +824,6 @@ int cec_transmit_msg_fh(struct cec_adapter *adap, struct cec_msg *msg,
data->adap = adap;
data->blocking = block;
- /*
- * Determine if this message follows a message from the same
- * initiator. Needed to determine the free signal time later on.
- */
- if (msg->len > 1) {
- if (!(list_empty(&adap->transmit_queue))) {
- const struct cec_data *last;
-
- last = list_last_entry(&adap->transmit_queue,
- const struct cec_data, list);
- last_initiator = cec_msg_initiator(&last->msg);
- } else if (adap->transmitting) {
- last_initiator =
- cec_msg_initiator(&adap->transmitting->msg);
- }
- }
- data->new_initiator = last_initiator != cec_msg_initiator(msg);
init_completion(&data->c);
INIT_DELAYED_WORK(&data->work, cec_wait_timeout);
@@ -1027,6 +1011,8 @@ void cec_received_msg_ts(struct cec_adapter *adap,
mutex_lock(&adap->lock);
dprintk(2, "%s: %*ph\n", __func__, msg->len, msg->msg);
+ adap->last_initiator = 0xff;
+
/* Check if this message was for us (directed or broadcast). */
if (!cec_msg_is_broadcast(msg))
valid_la = cec_has_log_addr(adap, msg_dest);
@@ -1489,6 +1475,8 @@ void __cec_s_phys_addr(struct cec_adapter *adap, u16 phys_addr, bool block)
}
mutex_lock(&adap->devnode.lock);
+ adap->last_initiator = 0xff;
+
if ((adap->needs_hpd || list_empty(&adap->devnode.fhs)) &&
adap->ops->adap_enable(adap, true)) {
mutex_unlock(&adap->devnode.lock);
diff --git a/include/media/cec.h b/include/media/cec.h
index 9f382f0c2970..254a610b9aa5 100644
--- a/include/media/cec.h
+++ b/include/media/cec.h
@@ -63,7 +63,6 @@ struct cec_data {
struct delayed_work work;
struct completion c;
u8 attempts;
- bool new_initiator;
bool blocking;
bool completed;
};
@@ -174,6 +173,7 @@ struct cec_adapter {
bool is_configuring;
bool is_configured;
bool cec_pin_is_high;
+ u8 last_initiator;
u32 monitor_all_cnt;
u32 monitor_pin_cnt;
u32 follower_cnt;
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 7d867a1b765e2b70815fec4964d7822a976ed349 Mon Sep 17 00:00:00 2001
From: Hans Verkuil <hans.verkuil(a)cisco.com>
Date: Fri, 5 Oct 2018 08:00:21 -0400
Subject: [PATCH] media: cec: fix the Signal Free Time calculation
The calculation of the Signal Free Time in the framework was not
correct. If a message was received, then the next transmit should be
considered a New Initiator and use a shorter SFT value.
This was not done with the result that if both sides where continually
sending messages, they both could use the same SFT value and one side
could deny the other side access to the bus.
Note that this fix does not take the corner case into account where
a receive is in progress when you call adap_transmit.
Signed-off-by: Hans Verkuil <hans.verkuil(a)cisco.com>
Cc: <stable(a)vger.kernel.org> # for v4.18 and up
Signed-off-by: Mauro Carvalho Chehab <mchehab+samsung(a)kernel.org>
diff --git a/drivers/media/cec/cec-adap.c b/drivers/media/cec/cec-adap.c
index e6e82b504e56..0c0d9107383e 100644
--- a/drivers/media/cec/cec-adap.c
+++ b/drivers/media/cec/cec-adap.c
@@ -526,9 +526,11 @@ int cec_thread_func(void *_adap)
if (data->attempts) {
/* should be >= 3 data bit periods for a retry */
signal_free_time = CEC_SIGNAL_FREE_TIME_RETRY;
- } else if (data->new_initiator) {
+ } else if (adap->last_initiator !=
+ cec_msg_initiator(&data->msg)) {
/* should be >= 5 data bit periods for new initiator */
signal_free_time = CEC_SIGNAL_FREE_TIME_NEW_INITIATOR;
+ adap->last_initiator = cec_msg_initiator(&data->msg);
} else {
/*
* should be >= 7 data bit periods for sending another
@@ -713,7 +715,6 @@ int cec_transmit_msg_fh(struct cec_adapter *adap, struct cec_msg *msg,
struct cec_fh *fh, bool block)
{
struct cec_data *data;
- u8 last_initiator = 0xff;
msg->rx_ts = 0;
msg->tx_ts = 0;
@@ -823,23 +824,6 @@ int cec_transmit_msg_fh(struct cec_adapter *adap, struct cec_msg *msg,
data->adap = adap;
data->blocking = block;
- /*
- * Determine if this message follows a message from the same
- * initiator. Needed to determine the free signal time later on.
- */
- if (msg->len > 1) {
- if (!(list_empty(&adap->transmit_queue))) {
- const struct cec_data *last;
-
- last = list_last_entry(&adap->transmit_queue,
- const struct cec_data, list);
- last_initiator = cec_msg_initiator(&last->msg);
- } else if (adap->transmitting) {
- last_initiator =
- cec_msg_initiator(&adap->transmitting->msg);
- }
- }
- data->new_initiator = last_initiator != cec_msg_initiator(msg);
init_completion(&data->c);
INIT_DELAYED_WORK(&data->work, cec_wait_timeout);
@@ -1027,6 +1011,8 @@ void cec_received_msg_ts(struct cec_adapter *adap,
mutex_lock(&adap->lock);
dprintk(2, "%s: %*ph\n", __func__, msg->len, msg->msg);
+ adap->last_initiator = 0xff;
+
/* Check if this message was for us (directed or broadcast). */
if (!cec_msg_is_broadcast(msg))
valid_la = cec_has_log_addr(adap, msg_dest);
@@ -1489,6 +1475,8 @@ void __cec_s_phys_addr(struct cec_adapter *adap, u16 phys_addr, bool block)
}
mutex_lock(&adap->devnode.lock);
+ adap->last_initiator = 0xff;
+
if ((adap->needs_hpd || list_empty(&adap->devnode.fhs)) &&
adap->ops->adap_enable(adap, true)) {
mutex_unlock(&adap->devnode.lock);
diff --git a/include/media/cec.h b/include/media/cec.h
index 9f382f0c2970..254a610b9aa5 100644
--- a/include/media/cec.h
+++ b/include/media/cec.h
@@ -63,7 +63,6 @@ struct cec_data {
struct delayed_work work;
struct completion c;
u8 attempts;
- bool new_initiator;
bool blocking;
bool completed;
};
@@ -174,6 +173,7 @@ struct cec_adapter {
bool is_configuring;
bool is_configured;
bool cec_pin_is_high;
+ u8 last_initiator;
u32 monitor_all_cnt;
u32 monitor_pin_cnt;
u32 follower_cnt;
Please apply the follow patch from kernel 4.18 to kernel 4.14
Commit: a7f58b9ecfd3c0f63703ec10f4a592cc38dbd1b8
This patch fixes a performance issue with some devices being accessed using
the Volume Management Device (vmd). I have included an updated copy of the
patch because the parent directory of the file has changed between the 4.14
and 4.18 releases
4.14: drivers/pci/host/vmd.c
4.18: drivers/pci/controller/vmd.c
Thank you,
Kenneth Heitke <Kenneth.heitke(a)intel.com>
---
From: Keith Busch <keith.busch(a)intel.com>
Date: Tue, 8 May 2018 10:00:22 -0600
Subject: [PATCH] PCI: vmd: White list for fast interrupt handlers
Devices with slow interrupt handlers are significantly harming
performance when their interrupt vector is shared with a fast device.
Create a class code white list for devices with known fast interrupt
handlers and let all other devices share a single vector so that they
don't interfere with performance.
At the moment, only the NVM Express class code is on the list, but more
may be added if VMD users desire to use other low-latency devices in
these domains.
Signed-off-by: Keith Busch <keith.busch(a)intel.com>
[lorenzo.pieralisi(a)arm.com: changelog]
Signed-off-by: Lorenzo Pieralisi <lorenzo.pieralisi(a)arm.com>
Acked-by: Jon Derrick: <jonathan.derrick(a)intel.com>
---
drivers/pci/host/vmd.c | 13 ++++++++++++-
1 file changed, 12 insertions(+), 1 deletion(-)
diff --git a/drivers/pci/host/vmd.c b/drivers/pci/host/vmd.c
index 509893b..2537b02 100644
--- a/drivers/pci/host/vmd.c
+++ b/drivers/pci/host/vmd.c
@@ -183,9 +183,20 @@ static struct vmd_irq_list *vmd_next_irq(struct vmd_dev *vmd, struct msi_desc *d
int i, best = 1;
unsigned long flags;
- if (pci_is_bridge(msi_desc_to_pci_dev(desc)) || vmd->msix_count == 1)
+ if (vmd->msix_count == 1)
return &vmd->irqs[0];
+ /*
+ * White list for fast-interrupt handlers. All others will share the
+ * "slow" interrupt vector.
+ */
+ switch (msi_desc_to_pci_dev(desc)->class) {
+ case PCI_CLASS_STORAGE_EXPRESS:
+ break;
+ default:
+ return &vmd->irqs[0];
+ }
+
raw_spin_lock_irqsave(&list_lock, flags);
for (i = 1; i < vmd->msix_count; i++)
if (vmd->irqs[i].count < vmd->irqs[best].count)
--
2.7.4
In September last year, Ben Hutchings submitted commit [9547837bdccb]
for 3.16.48-rc1 and I informed him that it would be useless without
[3f3752705dbd] (and that maybe [c3883fe06488] would be useful as well).
Ben dropped the patch but suggested I email this list with the
information of the other two patches but I never quite got around to it.
Now I see Sasha Levin is submitting [3f3752705dbd] and [c3883fe06488]
for 4.9, 4.4 and 3.18 it would now make sense to include [9547837bdccb].
This patch fixes a minor problem where a certain USB adapter for Sega
Genesis controllers appears as one input device when it has two ports
for two controllers. I imagine some users of emulator distributions
might use stable kernels and might benefit from this fix.
I'm actually not entirely sure that patch is something suitable for
stable but since it was already submitted once then I don't think it
hurts to bring it up again (despite it breaking stable-kernel-rules as
far as I understand it).
Commits mentioned:
[9547837bdccb]: HID: usbhid: add quirk for innomedia INNEX GENESIS/ATARI adapter
[3f3752705dbd]: HID: reject input outside logical range only if null state is set
[c3883fe06488]: HID: clamp input to logical range if no null state
If the patch [9547837bdccb] is not relevant then feel free to ignore
this email.
Thanks,
--
Tomasz Kramkowski
The patch below does not apply to the 3.18-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From aeae4f3e5c38d47bdaef50446dc0ec857307df68 Mon Sep 17 00:00:00 2001
From: Lukas Wunner <lukas(a)wunner.de>
Date: Tue, 4 Sep 2018 12:34:18 -0500
Subject: [PATCH] PCI/ASPM: Fix link_state teardown on device removal
Upon removal of the last device on a bus, the link_state of the bridge
leading to that bus is sought to be torn down by having pci_stop_dev()
call pcie_aspm_exit_link_state().
When ASPM was originally introduced by commit 7d715a6c1ae5 ("PCI: add
PCI Express ASPM support"), it determined whether the device being
removed is the last one by calling list_empty() on the bridge's
subordinate devices list. That didn't work because the device is only
removed from the list slightly later in pci_destroy_dev().
Commit 3419c75e15f8 ("PCI: properly clean up ASPM link state on device
remove") attempted to fix it by calling list_is_last(), but that's not
correct either because it checks whether the device is at the *end* of
the list, not whether it's the last one *left* in the list. If the user
removes the device which happens to be at the end of the list via sysfs
but other devices are preceding the device in the list, the link_state
is torn down prematurely.
The real fix is to move the invocation of pcie_aspm_exit_link_state() to
pci_destroy_dev() and reinstate the call to list_empty(). Remove a
duplicate check for dev->bus->self because pcie_aspm_exit_link_state()
already contains an identical check.
Fixes: 7d715a6c1ae5 ("PCI: add PCI Express ASPM support")
Signed-off-by: Lukas Wunner <lukas(a)wunner.de>
Signed-off-by: Bjorn Helgaas <bhelgaas(a)google.com>
Cc: Shaohua Li <shaohua.li(a)intel.com>
Cc: stable(a)vger.kernel.org # v2.6.26
diff --git a/drivers/pci/pcie/aspm.c b/drivers/pci/pcie/aspm.c
index 5326916715d2..f78860ce884b 100644
--- a/drivers/pci/pcie/aspm.c
+++ b/drivers/pci/pcie/aspm.c
@@ -991,7 +991,7 @@ void pcie_aspm_exit_link_state(struct pci_dev *pdev)
* All PCIe functions are in one slot, remove one function will remove
* the whole slot, so just wait until we are the last function left.
*/
- if (!list_is_last(&pdev->bus_list, &parent->subordinate->devices))
+ if (!list_empty(&parent->subordinate->devices))
goto out;
link = parent->link_state;
diff --git a/drivers/pci/remove.c b/drivers/pci/remove.c
index 461e7fd2756f..e9c6b120cf45 100644
--- a/drivers/pci/remove.c
+++ b/drivers/pci/remove.c
@@ -25,9 +25,6 @@ static void pci_stop_dev(struct pci_dev *dev)
pci_dev_assign_added(dev, false);
}
-
- if (dev->bus->self)
- pcie_aspm_exit_link_state(dev);
}
static void pci_destroy_dev(struct pci_dev *dev)
@@ -41,6 +38,7 @@ static void pci_destroy_dev(struct pci_dev *dev)
list_del(&dev->bus_list);
up_write(&pci_bus_sem);
+ pcie_aspm_exit_link_state(dev);
pci_bridge_d3_update(dev);
pci_free_resources(dev);
put_device(&dev->dev);
The patch below does not apply to the 4.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From aeae4f3e5c38d47bdaef50446dc0ec857307df68 Mon Sep 17 00:00:00 2001
From: Lukas Wunner <lukas(a)wunner.de>
Date: Tue, 4 Sep 2018 12:34:18 -0500
Subject: [PATCH] PCI/ASPM: Fix link_state teardown on device removal
Upon removal of the last device on a bus, the link_state of the bridge
leading to that bus is sought to be torn down by having pci_stop_dev()
call pcie_aspm_exit_link_state().
When ASPM was originally introduced by commit 7d715a6c1ae5 ("PCI: add
PCI Express ASPM support"), it determined whether the device being
removed is the last one by calling list_empty() on the bridge's
subordinate devices list. That didn't work because the device is only
removed from the list slightly later in pci_destroy_dev().
Commit 3419c75e15f8 ("PCI: properly clean up ASPM link state on device
remove") attempted to fix it by calling list_is_last(), but that's not
correct either because it checks whether the device is at the *end* of
the list, not whether it's the last one *left* in the list. If the user
removes the device which happens to be at the end of the list via sysfs
but other devices are preceding the device in the list, the link_state
is torn down prematurely.
The real fix is to move the invocation of pcie_aspm_exit_link_state() to
pci_destroy_dev() and reinstate the call to list_empty(). Remove a
duplicate check for dev->bus->self because pcie_aspm_exit_link_state()
already contains an identical check.
Fixes: 7d715a6c1ae5 ("PCI: add PCI Express ASPM support")
Signed-off-by: Lukas Wunner <lukas(a)wunner.de>
Signed-off-by: Bjorn Helgaas <bhelgaas(a)google.com>
Cc: Shaohua Li <shaohua.li(a)intel.com>
Cc: stable(a)vger.kernel.org # v2.6.26
diff --git a/drivers/pci/pcie/aspm.c b/drivers/pci/pcie/aspm.c
index 5326916715d2..f78860ce884b 100644
--- a/drivers/pci/pcie/aspm.c
+++ b/drivers/pci/pcie/aspm.c
@@ -991,7 +991,7 @@ void pcie_aspm_exit_link_state(struct pci_dev *pdev)
* All PCIe functions are in one slot, remove one function will remove
* the whole slot, so just wait until we are the last function left.
*/
- if (!list_is_last(&pdev->bus_list, &parent->subordinate->devices))
+ if (!list_empty(&parent->subordinate->devices))
goto out;
link = parent->link_state;
diff --git a/drivers/pci/remove.c b/drivers/pci/remove.c
index 461e7fd2756f..e9c6b120cf45 100644
--- a/drivers/pci/remove.c
+++ b/drivers/pci/remove.c
@@ -25,9 +25,6 @@ static void pci_stop_dev(struct pci_dev *dev)
pci_dev_assign_added(dev, false);
}
-
- if (dev->bus->self)
- pcie_aspm_exit_link_state(dev);
}
static void pci_destroy_dev(struct pci_dev *dev)
@@ -41,6 +38,7 @@ static void pci_destroy_dev(struct pci_dev *dev)
list_del(&dev->bus_list);
up_write(&pci_bus_sem);
+ pcie_aspm_exit_link_state(dev);
pci_bridge_d3_update(dev);
pci_free_resources(dev);
put_device(&dev->dev);
The patch below does not apply to the 4.9-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From aeae4f3e5c38d47bdaef50446dc0ec857307df68 Mon Sep 17 00:00:00 2001
From: Lukas Wunner <lukas(a)wunner.de>
Date: Tue, 4 Sep 2018 12:34:18 -0500
Subject: [PATCH] PCI/ASPM: Fix link_state teardown on device removal
Upon removal of the last device on a bus, the link_state of the bridge
leading to that bus is sought to be torn down by having pci_stop_dev()
call pcie_aspm_exit_link_state().
When ASPM was originally introduced by commit 7d715a6c1ae5 ("PCI: add
PCI Express ASPM support"), it determined whether the device being
removed is the last one by calling list_empty() on the bridge's
subordinate devices list. That didn't work because the device is only
removed from the list slightly later in pci_destroy_dev().
Commit 3419c75e15f8 ("PCI: properly clean up ASPM link state on device
remove") attempted to fix it by calling list_is_last(), but that's not
correct either because it checks whether the device is at the *end* of
the list, not whether it's the last one *left* in the list. If the user
removes the device which happens to be at the end of the list via sysfs
but other devices are preceding the device in the list, the link_state
is torn down prematurely.
The real fix is to move the invocation of pcie_aspm_exit_link_state() to
pci_destroy_dev() and reinstate the call to list_empty(). Remove a
duplicate check for dev->bus->self because pcie_aspm_exit_link_state()
already contains an identical check.
Fixes: 7d715a6c1ae5 ("PCI: add PCI Express ASPM support")
Signed-off-by: Lukas Wunner <lukas(a)wunner.de>
Signed-off-by: Bjorn Helgaas <bhelgaas(a)google.com>
Cc: Shaohua Li <shaohua.li(a)intel.com>
Cc: stable(a)vger.kernel.org # v2.6.26
diff --git a/drivers/pci/pcie/aspm.c b/drivers/pci/pcie/aspm.c
index 5326916715d2..f78860ce884b 100644
--- a/drivers/pci/pcie/aspm.c
+++ b/drivers/pci/pcie/aspm.c
@@ -991,7 +991,7 @@ void pcie_aspm_exit_link_state(struct pci_dev *pdev)
* All PCIe functions are in one slot, remove one function will remove
* the whole slot, so just wait until we are the last function left.
*/
- if (!list_is_last(&pdev->bus_list, &parent->subordinate->devices))
+ if (!list_empty(&parent->subordinate->devices))
goto out;
link = parent->link_state;
diff --git a/drivers/pci/remove.c b/drivers/pci/remove.c
index 461e7fd2756f..e9c6b120cf45 100644
--- a/drivers/pci/remove.c
+++ b/drivers/pci/remove.c
@@ -25,9 +25,6 @@ static void pci_stop_dev(struct pci_dev *dev)
pci_dev_assign_added(dev, false);
}
-
- if (dev->bus->self)
- pcie_aspm_exit_link_state(dev);
}
static void pci_destroy_dev(struct pci_dev *dev)
@@ -41,6 +38,7 @@ static void pci_destroy_dev(struct pci_dev *dev)
list_del(&dev->bus_list);
up_write(&pci_bus_sem);
+ pcie_aspm_exit_link_state(dev);
pci_bridge_d3_update(dev);
pci_free_resources(dev);
put_device(&dev->dev);
The patch below does not apply to the 4.9-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 9a59739bd01f77db6fbe2955a4fce165f0f43568 Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg(a)mellanox.com>
Date: Tue, 14 Aug 2018 15:33:02 -0700
Subject: [PATCH] IB/rxe: Revise the ib_wr_opcode enum
This enum has become part of the uABI, as both RXE and the
ib_uverbs_post_send() command expect userspace to supply values from this
enum. So it should be properly placed in include/uapi/rdma.
In userspace this enum is called 'enum ibv_wr_opcode' as part of
libibverbs.h. That enum defines different values for IB_WR_LOCAL_INV,
IB_WR_SEND_WITH_INV, and IB_WR_LSO. These were introduced (incorrectly, it
turns out) into libiberbs in 2015.
The kernel has changed its mind on the numbering for several of the IB_WC
values over the years, but has remained stable on IB_WR_LOCAL_INV and
below.
Based on this we can conclude that there is no real user space user of the
values beyond IB_WR_ATOMIC_FETCH_AND_ADD, as they have never worked via
rdma-core. This is confirmed by inspection, only rxe uses the kernel enum
and implements the latter operations. rxe has clearly never worked with
these attributes from userspace. Other drivers that support these opcodes
implement the functionality without calling out to the kernel.
To make IB_WR_SEND_WITH_INV and related work for RXE in userspace we
choose to renumber the IB_WR enum in the kernel to match the uABI that
userspace has bee using since before Soft RoCE was merged. This is an
overall simpler configuration for the whole software stack, and obviously
can't break anything existing.
Reported-by: Seth Howell <seth.howell(a)intel.com>
Tested-by: Seth Howell <seth.howell(a)intel.com>
Fixes: 8700e3e7c485 ("Soft RoCE driver")
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Jason Gunthorpe <jgg(a)mellanox.com>
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 6076c9b72ab9..e463d3007a35 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -1281,21 +1281,27 @@ struct ib_qp_attr {
};
enum ib_wr_opcode {
- IB_WR_RDMA_WRITE,
- IB_WR_RDMA_WRITE_WITH_IMM,
- IB_WR_SEND,
- IB_WR_SEND_WITH_IMM,
- IB_WR_RDMA_READ,
- IB_WR_ATOMIC_CMP_AND_SWP,
- IB_WR_ATOMIC_FETCH_AND_ADD,
- IB_WR_LSO,
- IB_WR_SEND_WITH_INV,
- IB_WR_RDMA_READ_WITH_INV,
- IB_WR_LOCAL_INV,
- IB_WR_REG_MR,
- IB_WR_MASKED_ATOMIC_CMP_AND_SWP,
- IB_WR_MASKED_ATOMIC_FETCH_AND_ADD,
+ /* These are shared with userspace */
+ IB_WR_RDMA_WRITE = IB_UVERBS_WR_RDMA_WRITE,
+ IB_WR_RDMA_WRITE_WITH_IMM = IB_UVERBS_WR_RDMA_WRITE_WITH_IMM,
+ IB_WR_SEND = IB_UVERBS_WR_SEND,
+ IB_WR_SEND_WITH_IMM = IB_UVERBS_WR_SEND_WITH_IMM,
+ IB_WR_RDMA_READ = IB_UVERBS_WR_RDMA_READ,
+ IB_WR_ATOMIC_CMP_AND_SWP = IB_UVERBS_WR_ATOMIC_CMP_AND_SWP,
+ IB_WR_ATOMIC_FETCH_AND_ADD = IB_UVERBS_WR_ATOMIC_FETCH_AND_ADD,
+ IB_WR_LSO = IB_UVERBS_WR_TSO,
+ IB_WR_SEND_WITH_INV = IB_UVERBS_WR_SEND_WITH_INV,
+ IB_WR_RDMA_READ_WITH_INV = IB_UVERBS_WR_RDMA_READ_WITH_INV,
+ IB_WR_LOCAL_INV = IB_UVERBS_WR_LOCAL_INV,
+ IB_WR_MASKED_ATOMIC_CMP_AND_SWP =
+ IB_UVERBS_WR_MASKED_ATOMIC_CMP_AND_SWP,
+ IB_WR_MASKED_ATOMIC_FETCH_AND_ADD =
+ IB_UVERBS_WR_MASKED_ATOMIC_FETCH_AND_ADD,
+
+ /* These are kernel only and can not be issued by userspace */
+ IB_WR_REG_MR = 0x20,
IB_WR_REG_SIG_MR,
+
/* reserve values for low level drivers' internal use.
* These values will not be used at all in the ib core layer.
*/
diff --git a/include/uapi/rdma/ib_user_verbs.h b/include/uapi/rdma/ib_user_verbs.h
index 25a16760de2a..1254b51a551a 100644
--- a/include/uapi/rdma/ib_user_verbs.h
+++ b/include/uapi/rdma/ib_user_verbs.h
@@ -763,10 +763,28 @@ struct ib_uverbs_sge {
__u32 lkey;
};
+enum ib_uverbs_wr_opcode {
+ IB_UVERBS_WR_RDMA_WRITE = 0,
+ IB_UVERBS_WR_RDMA_WRITE_WITH_IMM = 1,
+ IB_UVERBS_WR_SEND = 2,
+ IB_UVERBS_WR_SEND_WITH_IMM = 3,
+ IB_UVERBS_WR_RDMA_READ = 4,
+ IB_UVERBS_WR_ATOMIC_CMP_AND_SWP = 5,
+ IB_UVERBS_WR_ATOMIC_FETCH_AND_ADD = 6,
+ IB_UVERBS_WR_LOCAL_INV = 7,
+ IB_UVERBS_WR_BIND_MW = 8,
+ IB_UVERBS_WR_SEND_WITH_INV = 9,
+ IB_UVERBS_WR_TSO = 10,
+ IB_UVERBS_WR_RDMA_READ_WITH_INV = 11,
+ IB_UVERBS_WR_MASKED_ATOMIC_CMP_AND_SWP = 12,
+ IB_UVERBS_WR_MASKED_ATOMIC_FETCH_AND_ADD = 13,
+ /* Review enum ib_wr_opcode before modifying this */
+};
+
struct ib_uverbs_send_wr {
__aligned_u64 wr_id;
__u32 num_sge;
- __u32 opcode;
+ __u32 opcode; /* see enum ib_uverbs_wr_opcode */
__u32 send_flags;
union {
__be32 imm_data;
The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 9a59739bd01f77db6fbe2955a4fce165f0f43568 Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg(a)mellanox.com>
Date: Tue, 14 Aug 2018 15:33:02 -0700
Subject: [PATCH] IB/rxe: Revise the ib_wr_opcode enum
This enum has become part of the uABI, as both RXE and the
ib_uverbs_post_send() command expect userspace to supply values from this
enum. So it should be properly placed in include/uapi/rdma.
In userspace this enum is called 'enum ibv_wr_opcode' as part of
libibverbs.h. That enum defines different values for IB_WR_LOCAL_INV,
IB_WR_SEND_WITH_INV, and IB_WR_LSO. These were introduced (incorrectly, it
turns out) into libiberbs in 2015.
The kernel has changed its mind on the numbering for several of the IB_WC
values over the years, but has remained stable on IB_WR_LOCAL_INV and
below.
Based on this we can conclude that there is no real user space user of the
values beyond IB_WR_ATOMIC_FETCH_AND_ADD, as they have never worked via
rdma-core. This is confirmed by inspection, only rxe uses the kernel enum
and implements the latter operations. rxe has clearly never worked with
these attributes from userspace. Other drivers that support these opcodes
implement the functionality without calling out to the kernel.
To make IB_WR_SEND_WITH_INV and related work for RXE in userspace we
choose to renumber the IB_WR enum in the kernel to match the uABI that
userspace has bee using since before Soft RoCE was merged. This is an
overall simpler configuration for the whole software stack, and obviously
can't break anything existing.
Reported-by: Seth Howell <seth.howell(a)intel.com>
Tested-by: Seth Howell <seth.howell(a)intel.com>
Fixes: 8700e3e7c485 ("Soft RoCE driver")
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Jason Gunthorpe <jgg(a)mellanox.com>
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 6076c9b72ab9..e463d3007a35 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -1281,21 +1281,27 @@ struct ib_qp_attr {
};
enum ib_wr_opcode {
- IB_WR_RDMA_WRITE,
- IB_WR_RDMA_WRITE_WITH_IMM,
- IB_WR_SEND,
- IB_WR_SEND_WITH_IMM,
- IB_WR_RDMA_READ,
- IB_WR_ATOMIC_CMP_AND_SWP,
- IB_WR_ATOMIC_FETCH_AND_ADD,
- IB_WR_LSO,
- IB_WR_SEND_WITH_INV,
- IB_WR_RDMA_READ_WITH_INV,
- IB_WR_LOCAL_INV,
- IB_WR_REG_MR,
- IB_WR_MASKED_ATOMIC_CMP_AND_SWP,
- IB_WR_MASKED_ATOMIC_FETCH_AND_ADD,
+ /* These are shared with userspace */
+ IB_WR_RDMA_WRITE = IB_UVERBS_WR_RDMA_WRITE,
+ IB_WR_RDMA_WRITE_WITH_IMM = IB_UVERBS_WR_RDMA_WRITE_WITH_IMM,
+ IB_WR_SEND = IB_UVERBS_WR_SEND,
+ IB_WR_SEND_WITH_IMM = IB_UVERBS_WR_SEND_WITH_IMM,
+ IB_WR_RDMA_READ = IB_UVERBS_WR_RDMA_READ,
+ IB_WR_ATOMIC_CMP_AND_SWP = IB_UVERBS_WR_ATOMIC_CMP_AND_SWP,
+ IB_WR_ATOMIC_FETCH_AND_ADD = IB_UVERBS_WR_ATOMIC_FETCH_AND_ADD,
+ IB_WR_LSO = IB_UVERBS_WR_TSO,
+ IB_WR_SEND_WITH_INV = IB_UVERBS_WR_SEND_WITH_INV,
+ IB_WR_RDMA_READ_WITH_INV = IB_UVERBS_WR_RDMA_READ_WITH_INV,
+ IB_WR_LOCAL_INV = IB_UVERBS_WR_LOCAL_INV,
+ IB_WR_MASKED_ATOMIC_CMP_AND_SWP =
+ IB_UVERBS_WR_MASKED_ATOMIC_CMP_AND_SWP,
+ IB_WR_MASKED_ATOMIC_FETCH_AND_ADD =
+ IB_UVERBS_WR_MASKED_ATOMIC_FETCH_AND_ADD,
+
+ /* These are kernel only and can not be issued by userspace */
+ IB_WR_REG_MR = 0x20,
IB_WR_REG_SIG_MR,
+
/* reserve values for low level drivers' internal use.
* These values will not be used at all in the ib core layer.
*/
diff --git a/include/uapi/rdma/ib_user_verbs.h b/include/uapi/rdma/ib_user_verbs.h
index 25a16760de2a..1254b51a551a 100644
--- a/include/uapi/rdma/ib_user_verbs.h
+++ b/include/uapi/rdma/ib_user_verbs.h
@@ -763,10 +763,28 @@ struct ib_uverbs_sge {
__u32 lkey;
};
+enum ib_uverbs_wr_opcode {
+ IB_UVERBS_WR_RDMA_WRITE = 0,
+ IB_UVERBS_WR_RDMA_WRITE_WITH_IMM = 1,
+ IB_UVERBS_WR_SEND = 2,
+ IB_UVERBS_WR_SEND_WITH_IMM = 3,
+ IB_UVERBS_WR_RDMA_READ = 4,
+ IB_UVERBS_WR_ATOMIC_CMP_AND_SWP = 5,
+ IB_UVERBS_WR_ATOMIC_FETCH_AND_ADD = 6,
+ IB_UVERBS_WR_LOCAL_INV = 7,
+ IB_UVERBS_WR_BIND_MW = 8,
+ IB_UVERBS_WR_SEND_WITH_INV = 9,
+ IB_UVERBS_WR_TSO = 10,
+ IB_UVERBS_WR_RDMA_READ_WITH_INV = 11,
+ IB_UVERBS_WR_MASKED_ATOMIC_CMP_AND_SWP = 12,
+ IB_UVERBS_WR_MASKED_ATOMIC_FETCH_AND_ADD = 13,
+ /* Review enum ib_wr_opcode before modifying this */
+};
+
struct ib_uverbs_send_wr {
__aligned_u64 wr_id;
__u32 num_sge;
- __u32 opcode;
+ __u32 opcode; /* see enum ib_uverbs_wr_opcode */
__u32 send_flags;
union {
__be32 imm_data;
This is backport of the upstream patch
41c73a49df31151f4ff868f28fe4f129f113fa2c. It should be backported to
stable kernels because this performance problem was seen on the android
4.4 kernel.
commit 41c73a49df31151f4ff868f28fe4f129f113fa2c
Author: Mikulas Patocka <mpatocka(a)redhat.com>
Date: Wed Nov 23 17:04:00 2016 -0500
dm bufio: drop the lock when doing GFP_NOIO allocation
If the first allocation attempt using GFP_NOWAIT fails, drop the lock
and retry using GFP_NOIO allocation (lock is dropped because the
allocation can take some time).
Note that we won't do GFP_NOIO allocation when we loop for the second
time, because the lock shouldn't be dropped between __wait_for_free_buffer
and __get_unclaimed_buffer.
Signed-off-by: Mikulas Patocka <mpatocka(a)redhat.com>
Signed-off-by: Mike Snitzer <snitzer(a)redhat.com>
diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c
index 9ef88f0e2382..1c2e1dd7ca16 100644
--- a/drivers/md/dm-bufio.c
+++ b/drivers/md/dm-bufio.c
@@ -822,6 +822,7 @@ enum new_flag {
static struct dm_buffer *__alloc_buffer_wait_no_callback(struct dm_bufio_client *c, enum new_flag nf)
{
struct dm_buffer *b;
+ bool tried_noio_alloc = false;
/*
* dm-bufio is resistant to allocation failures (it just keeps
@@ -846,6 +847,15 @@ static struct dm_buffer *__alloc_buffer_wait_no_callback(struct dm_bufio_client
if (nf == NF_PREFETCH)
return NULL;
+ if (dm_bufio_cache_size_latch != 1 && !tried_noio_alloc) {
+ dm_bufio_unlock(c);
+ b = alloc_buffer(c, GFP_NOIO | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
+ dm_bufio_lock(c);
+ if (b)
+ return b;
+ tried_noio_alloc = true;
+ }
+
if (!list_empty(&c->reserved_buffers)) {
b = list_entry(c->reserved_buffers.next,
struct dm_buffer, lru_list);
This is backport of the upstream patch
9ea61cac0b1ad0c09022f39fd97e9b99a2cfc2dc. It should be backported to
stable kernels because this performance problem was seen on the android
4.4 kernel.
commit 9ea61cac0b1ad0c09022f39fd97e9b99a2cfc2dc
Author: Douglas Anderson <dianders(a)chromium.org>
Date: Thu Nov 17 11:24:20 2016 -0800
dm bufio: avoid sleeping while holding the dm_bufio lock
We've seen in-field reports showing _lots_ (18 in one case, 41 in
another) of tasks all sitting there blocked on:
mutex_lock+0x4c/0x68
dm_bufio_shrink_count+0x38/0x78
shrink_slab.part.54.constprop.65+0x100/0x464
shrink_zone+0xa8/0x198
In the two cases analyzed, we see one task that looks like this:
Workqueue: kverityd verity_prefetch_io
__switch_to+0x9c/0xa8
__schedule+0x440/0x6d8
schedule+0x94/0xb4
schedule_timeout+0x204/0x27c
schedule_timeout_uninterruptible+0x44/0x50
wait_iff_congested+0x9c/0x1f0
shrink_inactive_list+0x3a0/0x4cc
shrink_lruvec+0x418/0x5cc
shrink_zone+0x88/0x198
try_to_free_pages+0x51c/0x588
__alloc_pages_nodemask+0x648/0xa88
__get_free_pages+0x34/0x7c
alloc_buffer+0xa4/0x144
__bufio_new+0x84/0x278
dm_bufio_prefetch+0x9c/0x154
verity_prefetch_io+0xe8/0x10c
process_one_work+0x240/0x424
worker_thread+0x2fc/0x424
kthread+0x10c/0x114
...and that looks to be the one holding the mutex.
The problem has been reproduced on fairly easily:
0. Be running Chrome OS w/ verity enabled on the root filesystem
1. Pick test patch: http://crosreview.com/412360
2. Install launchBalloons.sh and balloon.arm from
http://crbug.com/468342
...that's just a memory stress test app.
3. On a 4GB rk3399 machine, run
nice ./launchBalloons.sh 4 900 100000
...that tries to eat 4 * 900 MB of memory and keep accessing.
4. Login to the Chrome web browser and restore many tabs
With that, I've seen printouts like:
DOUG: long bufio 90758 ms
...and stack trace always show's we're in dm_bufio_prefetch().
The problem is that we try to allocate memory with GFP_NOIO while
we're holding the dm_bufio lock. Instead we should be using
GFP_NOWAIT. Using GFP_NOIO can cause us to sleep while holding the
lock and that causes the above problems.
The current behavior explained by David Rientjes:
It will still try reclaim initially because __GFP_WAIT (or
__GFP_KSWAPD_RECLAIM) is set by GFP_NOIO. This is the cause of
contention on dm_bufio_lock() that the thread holds. You want to
pass GFP_NOWAIT instead of GFP_NOIO to alloc_buffer() when holding a
mutex that can be contended by a concurrent slab shrinker (if
count_objects didn't use a trylock, this pattern would trivially
deadlock).
This change significantly increases responsiveness of the system while
in this state. It makes a real difference because it unblocks kswapd.
In the bug report analyzed, kswapd was hung:
kswapd0 D ffffffc000204fd8 0 72 2 0x00000000
Call trace:
[<ffffffc000204fd8>] __switch_to+0x9c/0xa8
[<ffffffc00090b794>] __schedule+0x440/0x6d8
[<ffffffc00090bac0>] schedule+0x94/0xb4
[<ffffffc00090be44>] schedule_preempt_disabled+0x28/0x44
[<ffffffc00090d900>] __mutex_lock_slowpath+0x120/0x1ac
[<ffffffc00090d9d8>] mutex_lock+0x4c/0x68
[<ffffffc000708e7c>] dm_bufio_shrink_count+0x38/0x78
[<ffffffc00030b268>] shrink_slab.part.54.constprop.65+0x100/0x464
[<ffffffc00030dbd8>] shrink_zone+0xa8/0x198
[<ffffffc00030e578>] balance_pgdat+0x328/0x508
[<ffffffc00030eb7c>] kswapd+0x424/0x51c
[<ffffffc00023f06c>] kthread+0x10c/0x114
[<ffffffc000203dd0>] ret_from_fork+0x10/0x40
By unblocking kswapd memory pressure should be reduced.
Suggested-by: David Rientjes <rientjes(a)google.com>
Reviewed-by: Guenter Roeck <linux(a)roeck-us.net>
Signed-off-by: Douglas Anderson <dianders(a)chromium.org>
Signed-off-by: Mike Snitzer <snitzer(a)redhat.com>
diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c
index 125aedc3875f..b5d3428d109e 100644
--- a/drivers/md/dm-bufio.c
+++ b/drivers/md/dm-bufio.c
@@ -827,7 +827,8 @@ static struct dm_buffer *__alloc_buffer_wait_no_callback(struct dm_bufio_client
* dm-bufio is resistant to allocation failures (it just keeps
* one buffer reserved in cases all the allocations fail).
* So set flags to not try too hard:
- * GFP_NOIO: don't recurse into the I/O layer
+ * GFP_NOWAIT: don't wait; if we need to sleep we'll release our
+ * mutex and wait ourselves.
* __GFP_NORETRY: don't retry and rather return failure
* __GFP_NOMEMALLOC: don't use emergency reserves
* __GFP_NOWARN: don't print a warning in case of failure
@@ -837,7 +838,7 @@ static struct dm_buffer *__alloc_buffer_wait_no_callback(struct dm_bufio_client
*/
while (1) {
if (dm_bufio_cache_size_latch != 1) {
- b = alloc_buffer(c, GFP_NOIO | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
+ b = alloc_buffer(c, GFP_NOWAIT | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
if (b)
return b;
}
On a powerpc 8xx, 'btc' fails as follows:
Entering kdb (current=0x(ptrval), pid 282) due to Keyboard Entry
kdb> btc
btc: cpu status: Currently on cpu 0
Available cpus: 0
kdb_getarea: Bad address 0x0
when booting the kernel with 'debug_boot_weak_hash', it fails as well
Entering kdb (current=0xba99ad80, pid 284) due to Keyboard Entry
kdb> btc
btc: cpu status: Currently on cpu 0
Available cpus: 0
kdb_getarea: Bad address 0xba99ad80
On other platforms, Oopses have been observed too, see
https://github.com/linuxppc/linux/issues/139
This is due to btc calling 'btt' with %p pointer as an argument.
This patch replaces %p by %px to get the real pointer value as
expected by 'btt'
Signed-off-by: Christophe Leroy <christophe.leroy(a)c-s.fr>
Cc: <stable(a)vger.kernel.org> # 4.15+
---
kernel/debug/kdb/kdb_bt.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/kernel/debug/kdb/kdb_bt.c b/kernel/debug/kdb/kdb_bt.c
index 6ad4a9fcbd6f..7921ae4fca8d 100644
--- a/kernel/debug/kdb/kdb_bt.c
+++ b/kernel/debug/kdb/kdb_bt.c
@@ -179,14 +179,14 @@ kdb_bt(int argc, const char **argv)
kdb_printf("no process for cpu %ld\n", cpu);
return 0;
}
- sprintf(buf, "btt 0x%p\n", KDB_TSK(cpu));
+ sprintf(buf, "btt 0x%px\n", KDB_TSK(cpu));
kdb_parse(buf);
return 0;
}
kdb_printf("btc: cpu status: ");
kdb_parse("cpu\n");
for_each_online_cpu(cpu) {
- sprintf(buf, "btt 0x%p\n", KDB_TSK(cpu));
+ sprintf(buf, "btt 0x%px\n", KDB_TSK(cpu));
kdb_parse(buf);
touch_nmi_watchdog();
}
--
2.13.3