This is a note to let you know that I've just added the patch titled
w1: mxc_w1: Fix timeout resolution problem leading to bus error
to my char-misc git tree which can be found at
git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/char-misc.git
in the char-misc-testing branch.
The patch will show up in the next release of the linux-next tree
(usually sometime within the next 24 hours during the week.)
The patch will be merged to the char-misc-next branch sometime soon,
after it passes testing, and the merge window is open.
If you have any questions about this process, please let me know.
>From e1a26e13baf690444a254c4c4f088e1d059a942a Mon Sep 17 00:00:00 2001
From: Martin Fuzzey <martin.fuzzey(a)flowbird.group>
Date: Wed, 30 Sep 2020 10:36:46 +0200
Subject: w1: mxc_w1: Fix timeout resolution problem leading to bus error
On my platform (i.MX53) bus access sometimes fails with
w1_search: max_slave_count 64 reached, will continue next search.
The reason is the use of jiffies to implement a 200us timeout in
mxc_w1_ds2_touch_bit().
On some platforms the jiffies timer resolution is insufficient for this.
Fix by replacing jiffies by ktime_get().
For consistency apply the same change to the other use of jiffies in
mxc_w1_ds2_reset_bus().
Fixes: f80b2581a706 ("w1: mxc_w1: Optimize mxc_w1_ds2_touch_bit()")
Cc: stable <stable(a)vger.kernel.org>
Signed-off-by: Martin Fuzzey <martin.fuzzey(a)flowbird.group>
Link: https://lore.kernel.org/r/1601455030-6607-1-git-send-email-martin.fuzzey@fl…
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
---
drivers/w1/masters/mxc_w1.c | 14 +++++++-------
1 file changed, 7 insertions(+), 7 deletions(-)
diff --git a/drivers/w1/masters/mxc_w1.c b/drivers/w1/masters/mxc_w1.c
index 1ca880e01476..090cbbf9e1e2 100644
--- a/drivers/w1/masters/mxc_w1.c
+++ b/drivers/w1/masters/mxc_w1.c
@@ -7,7 +7,7 @@
#include <linux/clk.h>
#include <linux/delay.h>
#include <linux/io.h>
-#include <linux/jiffies.h>
+#include <linux/ktime.h>
#include <linux/module.h>
#include <linux/mod_devicetable.h>
#include <linux/platform_device.h>
@@ -40,12 +40,12 @@ struct mxc_w1_device {
static u8 mxc_w1_ds2_reset_bus(void *data)
{
struct mxc_w1_device *dev = data;
- unsigned long timeout;
+ ktime_t timeout;
writeb(MXC_W1_CONTROL_RPP, dev->regs + MXC_W1_CONTROL);
/* Wait for reset sequence 511+512us, use 1500us for sure */
- timeout = jiffies + usecs_to_jiffies(1500);
+ timeout = ktime_add_us(ktime_get(), 1500);
udelay(511 + 512);
@@ -55,7 +55,7 @@ static u8 mxc_w1_ds2_reset_bus(void *data)
/* PST bit is valid after the RPP bit is self-cleared */
if (!(ctrl & MXC_W1_CONTROL_RPP))
return !(ctrl & MXC_W1_CONTROL_PST);
- } while (time_is_after_jiffies(timeout));
+ } while (ktime_before(ktime_get(), timeout));
return 1;
}
@@ -68,12 +68,12 @@ static u8 mxc_w1_ds2_reset_bus(void *data)
static u8 mxc_w1_ds2_touch_bit(void *data, u8 bit)
{
struct mxc_w1_device *dev = data;
- unsigned long timeout;
+ ktime_t timeout;
writeb(MXC_W1_CONTROL_WR(bit), dev->regs + MXC_W1_CONTROL);
/* Wait for read/write bit (60us, Max 120us), use 200us for sure */
- timeout = jiffies + usecs_to_jiffies(200);
+ timeout = ktime_add_us(ktime_get(), 200);
udelay(60);
@@ -83,7 +83,7 @@ static u8 mxc_w1_ds2_touch_bit(void *data, u8 bit)
/* RDST bit is valid after the WR1/RD bit is self-cleared */
if (!(ctrl & MXC_W1_CONTROL_WR(bit)))
return !!(ctrl & MXC_W1_CONTROL_RDST);
- } while (time_is_after_jiffies(timeout));
+ } while (ktime_before(ktime_get(), timeout));
return 0;
}
--
2.28.0
If more than two jobs end up timeout-ing concurrently, only one of them
(the one attached to the scheduler acquiring the lock) is fully handled.
The other one remains in a dangling state where it's no longer part of
the scheduling queue, but still blocks something in scheduler, leading
to repetitive timeouts when new jobs are queued.
Let's make sure all bad jobs are properly handled by the thread
acquiring the lock.
v2:
- Fix the subject prefix
- Stop the scheduler before returning from panfrost_job_timedout()
- Call cancel_delayed_work_sync() after drm_sched_stop() to make sure
no timeout handlers are in flight when we reset the GPU (Steven Price)
- Make sure we release the reset lock before restarting the
schedulers (Steven Price)
Signed-off-by: Boris Brezillon <boris.brezillon(a)collabora.com>
Fixes: f3ba91228e8e ("drm/panfrost: Add initial panfrost driver")
Cc: <stable(a)vger.kernel.org>
---
drivers/gpu/drm/panfrost/panfrost_job.c | 64 +++++++++++++++++++++----
1 file changed, 55 insertions(+), 9 deletions(-)
diff --git a/drivers/gpu/drm/panfrost/panfrost_job.c b/drivers/gpu/drm/panfrost/panfrost_job.c
index 30e7b7196dab..6e4bfb938fab 100644
--- a/drivers/gpu/drm/panfrost/panfrost_job.c
+++ b/drivers/gpu/drm/panfrost/panfrost_job.c
@@ -25,7 +25,8 @@
struct panfrost_queue_state {
struct drm_gpu_scheduler sched;
-
+ bool stopped;
+ struct mutex lock;
u64 fence_context;
u64 emit_seqno;
};
@@ -369,6 +370,24 @@ void panfrost_job_enable_interrupts(struct panfrost_device *pfdev)
job_write(pfdev, JOB_INT_MASK, irq_mask);
}
+static bool panfrost_scheduler_stop(struct panfrost_queue_state *queue,
+ struct drm_sched_job *bad)
+{
+ bool stopped = false;
+
+ mutex_lock(&queue->lock);
+ if (!queue->stopped) {
+ drm_sched_stop(&queue->sched, bad);
+ if (bad)
+ drm_sched_increase_karma(bad);
+ queue->stopped = true;
+ stopped = true;
+ }
+ mutex_unlock(&queue->lock);
+
+ return stopped;
+}
+
static void panfrost_job_timedout(struct drm_sched_job *sched_job)
{
struct panfrost_job *job = to_panfrost_job(sched_job);
@@ -392,19 +411,41 @@ static void panfrost_job_timedout(struct drm_sched_job *sched_job)
job_read(pfdev, JS_TAIL_LO(js)),
sched_job);
+ /* Scheduler is already stopped, nothing to do. */
+ if (!panfrost_scheduler_stop(&pfdev->js->queue[js], sched_job))
+ return;
+
if (!mutex_trylock(&pfdev->reset_lock))
return;
+ mutex_lock(&pfdev->sched_lock);
for (i = 0; i < NUM_JOB_SLOTS; i++) {
struct drm_gpu_scheduler *sched = &pfdev->js->queue[i].sched;
- drm_sched_stop(sched, sched_job);
- if (js != i)
- /* Ensure any timeouts on other slots have finished */
+ /*
+ * If the queue is still active, make sure we wait for any
+ * pending timeouts.
+ */
+ if (!pfdev->js->queue[i].stopped)
cancel_delayed_work_sync(&sched->work_tdr);
- }
- drm_sched_increase_karma(sched_job);
+ /*
+ * If the scheduler was not already stopped, there's a tiny
+ * chance a timeout has expired just before we stopped it, and
+ * drm_sched_stop() does not flush pending works. Let's flush
+ * them now so the timeout handler doesn't get called in the
+ * middle of a reset.
+ */
+ if (panfrost_scheduler_stop(&pfdev->js->queue[i], NULL))
+ cancel_delayed_work_sync(&sched->work_tdr);
+
+ /*
+ * Now that we cancelled the pending timeouts, we can safely
+ * reset the stopped state.
+ */
+ pfdev->js->queue[i].stopped = false;
+ }
+ mutex_unlock(&pfdev->sched_lock);
spin_lock_irqsave(&pfdev->js->job_lock, flags);
for (i = 0; i < NUM_JOB_SLOTS; i++) {
@@ -421,11 +462,11 @@ static void panfrost_job_timedout(struct drm_sched_job *sched_job)
for (i = 0; i < NUM_JOB_SLOTS; i++)
drm_sched_resubmit_jobs(&pfdev->js->queue[i].sched);
+ mutex_unlock(&pfdev->reset_lock);
+
/* restart scheduler after GPU is usable again */
for (i = 0; i < NUM_JOB_SLOTS; i++)
drm_sched_start(&pfdev->js->queue[i].sched, true);
-
- mutex_unlock(&pfdev->reset_lock);
}
static const struct drm_sched_backend_ops panfrost_sched_ops = {
@@ -558,6 +599,7 @@ int panfrost_job_open(struct panfrost_file_priv *panfrost_priv)
int ret, i;
for (i = 0; i < NUM_JOB_SLOTS; i++) {
+ mutex_init(&js->queue[i].lock);
sched = &js->queue[i].sched;
ret = drm_sched_entity_init(&panfrost_priv->sched_entity[i],
DRM_SCHED_PRIORITY_NORMAL, &sched,
@@ -570,10 +612,14 @@ int panfrost_job_open(struct panfrost_file_priv *panfrost_priv)
void panfrost_job_close(struct panfrost_file_priv *panfrost_priv)
{
+ struct panfrost_device *pfdev = panfrost_priv->pfdev;
+ struct panfrost_job_slot *js = pfdev->js;
int i;
- for (i = 0; i < NUM_JOB_SLOTS; i++)
+ for (i = 0; i < NUM_JOB_SLOTS; i++) {
drm_sched_entity_destroy(&panfrost_priv->sched_entity[i]);
+ mutex_destroy(&js->queue[i].lock);
+ }
}
int panfrost_job_is_idle(struct panfrost_device *pfdev)
--
2.26.2
According to the "VFxxx Controller Reference Manual" (and the comment
block starting at line 97), Vybrid requires writing a one for clearing
an interrupt flag. Syncing with the method for clearing I2SR_IIF in
i2c_imx_isr().
Signed-off-by: Christian Eggers <ceggers(a)arri.de>
Cc: stable(a)vger.kernel.org
---
drivers/i2c/busses/i2c-imx.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/drivers/i2c/busses/i2c-imx.c b/drivers/i2c/busses/i2c-imx.c
index 0ab5381aa012..d8b2e632dd10 100644
--- a/drivers/i2c/busses/i2c-imx.c
+++ b/drivers/i2c/busses/i2c-imx.c
@@ -425,6 +425,7 @@ static int i2c_imx_bus_busy(struct imx_i2c_struct *i2c_imx, int for_busy, bool a
/* check for arbitration lost */
if (temp & I2SR_IAL) {
temp &= ~I2SR_IAL;
+ temp |= (i2c_imx->hwdata->i2sr_clr_opcode & I2SR_IAL);
imx_i2c_write_reg(temp, i2c_imx, IMX_I2C_I2SR);
return -EAGAIN;
}
--
Christian Eggers
Embedded software developer
Arnold & Richter Cine Technik GmbH & Co. Betriebs KG
Sitz: Muenchen - Registergericht: Amtsgericht Muenchen - Handelsregisternummer: HRA 57918
Persoenlich haftender Gesellschafter: Arnold & Richter Cine Technik GmbH
Sitz: Muenchen - Registergericht: Amtsgericht Muenchen - Handelsregisternummer: HRB 54477
Geschaeftsfuehrer: Dr. Michael Neuhaeuser; Stephan Schenk; Walter Trauninger; Markus Zeiler
This is a note to let you know that I've just added the patch titled
USB: cdc-wdm: Make wdm_flush() interruptible and add wdm_fsync().
to my usb git tree which can be found at
git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/usb.git
in the usb-testing branch.
The patch will show up in the next release of the linux-next tree
(usually sometime within the next 24 hours during the week.)
The patch will be merged to the usb-next branch sometime soon,
after it passes testing, and the merge window is open.
If you have any questions about this process, please let me know.
>From 37d2a36394d954413a495da61da1b2a51ecd28ab Mon Sep 17 00:00:00 2001
From: Oliver Neukum <oneukum(a)suse.com>
Date: Mon, 28 Sep 2020 23:17:55 +0900
Subject: USB: cdc-wdm: Make wdm_flush() interruptible and add wdm_fsync().
syzbot is reporting hung task at wdm_flush() [1], for there is a circular
dependency that wdm_flush() from flip_close() for /dev/cdc-wdm0 forever
waits for /dev/raw-gadget to be closed while close() for /dev/raw-gadget
cannot be called unless close() for /dev/cdc-wdm0 completes.
Tetsuo Handa considered that such circular dependency is an usage error [2]
which corresponds to an unresponding broken hardware [3]. But Alan Stern
responded that we should be prepared for such hardware [4]. Therefore,
this patch changes wdm_flush() to use wait_event_interruptible_timeout()
which gives up after 30 seconds, for hardware that remains silent must be
ignored. The 30 seconds are coming out of thin air.
Changing wait_event() to wait_event_interruptible_timeout() makes error
reporting from close() syscall less reliable. To compensate it, this patch
also implements wdm_fsync() which does not use timeout. Those who want to
be very sure that data has gone out to the device are now advised to call
fsync(), with a caveat that fsync() can return -EINVAL when running on
older kernels which do not implement wdm_fsync().
This patch also fixes three more problems (listed below) found during
exhaustive discussion and testing.
Since multiple threads can concurrently call wdm_write()/wdm_flush(),
we need to use wake_up_all() whenever clearing WDM_IN_USE in order to
make sure that all waiters are woken up. Also, error reporting needs
to use fetch-and-clear approach in order not to report same error for
multiple times.
Since wdm_flush() checks WDM_DISCONNECTING, wdm_write() should as well
check WDM_DISCONNECTING.
In wdm_flush(), since locks are not held, it is not safe to dereference
desc->intf after checking that WDM_DISCONNECTING is not set [5]. Thus,
remove dev_err() from wdm_flush().
[1] https://syzkaller.appspot.com/bug?id=e7b761593b23eb50855b9ea31e3be5472b7111…
[2] https://lkml.kernel.org/r/27b7545e-8f41-10b8-7c02-e35a08eb1611@i-love.sakur…
[3] https://lkml.kernel.org/r/79ba410f-e0ef-2465-b94f-6b9a4a82adf5@i-love.sakur…
[4] https://lkml.kernel.org/r/20200530011040.GB12419@rowland.harvard.edu
[5] https://lkml.kernel.org/r/c85331fc-874c-6e46-a77f-0ef1dc075308@i-love.sakur…
Reported-by: syzbot <syzbot+854768b99f19e89d7f81(a)syzkaller.appspotmail.com>
Cc: stable <stable(a)vger.kernel.org>
Co-developed-by: Tetsuo Handa <penguin-kernel(a)I-love.SAKURA.ne.jp>
Signed-off-by: Tetsuo Handa <penguin-kernel(a)I-love.SAKURA.ne.jp>
Signed-off-by: Oliver Neukum <oneukum(a)suse.com>
Cc: Alan Stern <stern(a)rowland.harvard.edu>
Link: https://lore.kernel.org/r/20200928141755.3476-1-penguin-kernel@I-love.SAKUR…
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
---
drivers/usb/class/cdc-wdm.c | 72 ++++++++++++++++++++++++++++---------
1 file changed, 55 insertions(+), 17 deletions(-)
diff --git a/drivers/usb/class/cdc-wdm.c b/drivers/usb/class/cdc-wdm.c
index 7f5de956a2fc..02d0cfd23bb2 100644
--- a/drivers/usb/class/cdc-wdm.c
+++ b/drivers/usb/class/cdc-wdm.c
@@ -58,6 +58,9 @@ MODULE_DEVICE_TABLE (usb, wdm_ids);
#define WDM_MAX 16
+/* we cannot wait forever at flush() */
+#define WDM_FLUSH_TIMEOUT (30 * HZ)
+
/* CDC-WMC r1.1 requires wMaxCommand to be "at least 256 decimal (0x100)" */
#define WDM_DEFAULT_BUFSIZE 256
@@ -151,7 +154,7 @@ static void wdm_out_callback(struct urb *urb)
kfree(desc->outbuf);
desc->outbuf = NULL;
clear_bit(WDM_IN_USE, &desc->flags);
- wake_up(&desc->wait);
+ wake_up_all(&desc->wait);
}
static void wdm_in_callback(struct urb *urb)
@@ -393,6 +396,9 @@ static ssize_t wdm_write
if (test_bit(WDM_RESETTING, &desc->flags))
r = -EIO;
+ if (test_bit(WDM_DISCONNECTING, &desc->flags))
+ r = -ENODEV;
+
if (r < 0) {
rv = r;
goto out_free_mem_pm;
@@ -424,6 +430,7 @@ static ssize_t wdm_write
if (rv < 0) {
desc->outbuf = NULL;
clear_bit(WDM_IN_USE, &desc->flags);
+ wake_up_all(&desc->wait); /* for wdm_wait_for_response() */
dev_err(&desc->intf->dev, "Tx URB error: %d\n", rv);
rv = usb_translate_errors(rv);
goto out_free_mem_pm;
@@ -583,28 +590,58 @@ static ssize_t wdm_read
return rv;
}
-static int wdm_flush(struct file *file, fl_owner_t id)
+static int wdm_wait_for_response(struct file *file, long timeout)
{
struct wdm_device *desc = file->private_data;
+ long rv; /* Use long here because (int) MAX_SCHEDULE_TIMEOUT < 0. */
+
+ /*
+ * Needs both flags. We cannot do with one because resetting it would
+ * cause a race with write() yet we need to signal a disconnect.
+ */
+ rv = wait_event_interruptible_timeout(desc->wait,
+ !test_bit(WDM_IN_USE, &desc->flags) ||
+ test_bit(WDM_DISCONNECTING, &desc->flags),
+ timeout);
- wait_event(desc->wait,
- /*
- * needs both flags. We cannot do with one
- * because resetting it would cause a race
- * with write() yet we need to signal
- * a disconnect
- */
- !test_bit(WDM_IN_USE, &desc->flags) ||
- test_bit(WDM_DISCONNECTING, &desc->flags));
-
- /* cannot dereference desc->intf if WDM_DISCONNECTING */
+ /*
+ * To report the correct error. This is best effort.
+ * We are inevitably racing with the hardware.
+ */
if (test_bit(WDM_DISCONNECTING, &desc->flags))
return -ENODEV;
- if (desc->werr < 0)
- dev_err(&desc->intf->dev, "Error in flush path: %d\n",
- desc->werr);
+ if (!rv)
+ return -EIO;
+ if (rv < 0)
+ return -EINTR;
+
+ spin_lock_irq(&desc->iuspin);
+ rv = desc->werr;
+ desc->werr = 0;
+ spin_unlock_irq(&desc->iuspin);
+
+ return usb_translate_errors(rv);
+
+}
+
+/*
+ * You need to send a signal when you react to malicious or defective hardware.
+ * Also, don't abort when fsync() returned -EINVAL, for older kernels which do
+ * not implement wdm_flush() will return -EINVAL.
+ */
+static int wdm_fsync(struct file *file, loff_t start, loff_t end, int datasync)
+{
+ return wdm_wait_for_response(file, MAX_SCHEDULE_TIMEOUT);
+}
- return usb_translate_errors(desc->werr);
+/*
+ * Same with wdm_fsync(), except it uses finite timeout in order to react to
+ * malicious or defective hardware which ceased communication after close() was
+ * implicitly called due to process termination.
+ */
+static int wdm_flush(struct file *file, fl_owner_t id)
+{
+ return wdm_wait_for_response(file, WDM_FLUSH_TIMEOUT);
}
static __poll_t wdm_poll(struct file *file, struct poll_table_struct *wait)
@@ -729,6 +766,7 @@ static const struct file_operations wdm_fops = {
.owner = THIS_MODULE,
.read = wdm_read,
.write = wdm_write,
+ .fsync = wdm_fsync,
.open = wdm_open,
.flush = wdm_flush,
.release = wdm_release,
--
2.28.0
commit a10674bf2406 ("tcp: detecting the misuse of .sendpage for Slab
objects") adds the checks for Slab pages, but the pages don't have
page_count are still missing from the check.
Network layer's sendpage method is not designed to send page_count 0
pages neither, therefore both PageSlab() and page_count() should be
both checked for the sending page. This is exactly what sendpage_ok()
does.
This patch uses sendpage_ok() in do_tcp_sendpages() to detect misused
.sendpage, to make the code more robust.
Fixes: a10674bf2406 ("tcp: detecting the misuse of .sendpage for Slab objects")
Suggested-by: Eric Dumazet <eric.dumazet(a)gmail.com>
Signed-off-by: Coly Li <colyli(a)suse.de>
Cc: Vasily Averin <vvs(a)virtuozzo.com>
Cc: David S. Miller <davem(a)davemloft.net>
Cc: stable(a)vger.kernel.org
---
net/ipv4/tcp.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 31f3b858db81..2135ee7c806d 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -970,7 +970,8 @@ ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset,
long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
if (IS_ENABLED(CONFIG_DEBUG_VM) &&
- WARN_ONCE(PageSlab(page), "page must not be a Slab one"))
+ WARN_ONCE(!sendpage_ok(page),
+ "page must not be a Slab one and have page_count > 0"))
return -EINVAL;
/* Wait for a connection to finish. One exception is TCP Fast Open
--
2.26.2
Currently nvme_tcp_try_send_data() doesn't use kernel_sendpage() to
send slab pages. But for pages allocated by __get_free_pages() without
__GFP_COMP, which also have refcount as 0, they are still sent by
kernel_sendpage() to remote end, this is problematic.
The new introduced helper sendpage_ok() checks both PageSlab tag and
page_count counter, and returns true if the checking page is OK to be
sent by kernel_sendpage().
This patch fixes the page checking issue of nvme_tcp_try_send_data()
with sendpage_ok(). If sendpage_ok() returns true, send this page by
kernel_sendpage(), otherwise use sock_no_sendpage to handle this page.
Signed-off-by: Coly Li <colyli(a)suse.de>
Cc: Chaitanya Kulkarni <chaitanya.kulkarni(a)wdc.com>
Cc: Christoph Hellwig <hch(a)lst.de>
Cc: Hannes Reinecke <hare(a)suse.de>
Cc: Jan Kara <jack(a)suse.com>
Cc: Jens Axboe <axboe(a)kernel.dk>
Cc: Mikhail Skorzhinskii <mskorzhinskiy(a)solarflare.com>
Cc: Philipp Reisner <philipp.reisner(a)linbit.com>
Cc: Sagi Grimberg <sagi(a)grimberg.me>
Cc: Vlastimil Babka <vbabka(a)suse.com>
Cc: stable(a)vger.kernel.org
---
drivers/nvme/host/tcp.c | 7 +++----
1 file changed, 3 insertions(+), 4 deletions(-)
diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index 8f4f29f18b8c..d6a3e1487354 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -913,12 +913,11 @@ static int nvme_tcp_try_send_data(struct nvme_tcp_request *req)
else
flags |= MSG_MORE | MSG_SENDPAGE_NOTLAST;
- /* can't zcopy slab pages */
- if (unlikely(PageSlab(page))) {
- ret = sock_no_sendpage(queue->sock, page, offset, len,
+ if (sendpage_ok(page)) {
+ ret = kernel_sendpage(queue->sock, page, offset, len,
flags);
} else {
- ret = kernel_sendpage(queue->sock, page, offset, len,
+ ret = sock_no_sendpage(queue->sock, page, offset, len,
flags);
}
if (ret <= 0)
--
2.26.2
The original problem was from nvme-over-tcp code, who mistakenly uses
kernel_sendpage() to send pages allocated by __get_free_pages() without
__GFP_COMP flag. Such pages don't have refcount (page_count is 0) on
tail pages, sending them by kernel_sendpage() may trigger a kernel panic
from a corrupted kernel heap, because these pages are incorrectly freed
in network stack as page_count 0 pages.
This patch introduces a helper sendpage_ok(), it returns true if the
checking page,
- is not slab page: PageSlab(page) is false.
- has page refcount: page_count(page) is not zero
All drivers who want to send page to remote end by kernel_sendpage()
may use this helper to check whether the page is OK. If the helper does
not return true, the driver should try other non sendpage method (e.g.
sock_no_sendpage()) to handle the page.
Signed-off-by: Coly Li <colyli(a)suse.de>
Cc: Chaitanya Kulkarni <chaitanya.kulkarni(a)wdc.com>
Cc: Christoph Hellwig <hch(a)lst.de>
Cc: Hannes Reinecke <hare(a)suse.de>
Cc: Jan Kara <jack(a)suse.com>
Cc: Jens Axboe <axboe(a)kernel.dk>
Cc: Mikhail Skorzhinskii <mskorzhinskiy(a)solarflare.com>
Cc: Philipp Reisner <philipp.reisner(a)linbit.com>
Cc: Sagi Grimberg <sagi(a)grimberg.me>
Cc: Vlastimil Babka <vbabka(a)suse.com>
Cc: stable(a)vger.kernel.org
---
include/linux/net.h | 16 ++++++++++++++++
1 file changed, 16 insertions(+)
diff --git a/include/linux/net.h b/include/linux/net.h
index d48ff1180879..ae713c851342 100644
--- a/include/linux/net.h
+++ b/include/linux/net.h
@@ -21,6 +21,7 @@
#include <linux/rcupdate.h>
#include <linux/once.h>
#include <linux/fs.h>
+#include <linux/mm.h>
#include <linux/sockptr.h>
#include <uapi/linux/net.h>
@@ -286,6 +287,21 @@ do { \
#define net_get_random_once_wait(buf, nbytes) \
get_random_once_wait((buf), (nbytes))
+/*
+ * E.g. XFS meta- & log-data is in slab pages, or bcache meta
+ * data pages, or other high order pages allocated by
+ * __get_free_pages() without __GFP_COMP, which have a page_count
+ * of 0 and/or have PageSlab() set. We cannot use send_page for
+ * those, as that does get_page(); put_page(); and would cause
+ * either a VM_BUG directly, or __page_cache_release a page that
+ * would actually still be referenced by someone, leading to some
+ * obscure delayed Oops somewhere else.
+ */
+static inline bool sendpage_ok(struct page *page)
+{
+ return !PageSlab(page) && page_count(page) >= 1;
+}
+
int kernel_sendmsg(struct socket *sock, struct msghdr *msg, struct kvec *vec,
size_t num, size_t len);
int kernel_sendmsg_locked(struct sock *sk, struct msghdr *msg,
--
2.26.2