From: Filipe Manana <fdmanana(a)suse.com>
commit 939b656bc8ab203fdbde26ccac22bcb7f0985be5 upstream.
During an append (O_APPEND write flag) direct IO write if the input buffer
was not previously faulted in, we can corrupt the file in a way that the
final size is unexpected and it includes an unexpected hole.
The problem happens like this:
1) We have an empty file, with size 0, for example;
2) We do an O_APPEND direct IO with a length of 4096 bytes and the input
buffer is not currently faulted in;
3) We enter btrfs_direct_write(), lock the inode and call
generic_write_checks(), which calls generic_write_checks_count(), and
that function sets the iocb position to 0 with the following code:
if (iocb->ki_flags & IOCB_APPEND)
iocb->ki_pos = i_size_read(inode);
4) We call btrfs_dio_write() and enter into iomap, which will end up
calling btrfs_dio_iomap_begin() and that calls
btrfs_get_blocks_direct_write(), where we update the i_size of the
inode to 4096 bytes;
5) After btrfs_dio_iomap_begin() returns, iomap will attempt to access
the page of the write input buffer (at iomap_dio_bio_iter(), with a
call to bio_iov_iter_get_pages()) and fail with -EFAULT, which gets
returned to btrfs at btrfs_direct_write() via btrfs_dio_write();
6) At btrfs_direct_write() we get the -EFAULT error, unlock the inode,
fault in the write buffer and then goto to the label 'relock';
7) We lock again the inode, do all the necessary checks again and call
again generic_write_checks(), which calls generic_write_checks_count()
again, and there we set the iocb's position to 4K, which is the current
i_size of the inode, with the following code pointed above:
if (iocb->ki_flags & IOCB_APPEND)
iocb->ki_pos = i_size_read(inode);
8) Then we go again to btrfs_dio_write() and enter iomap and the write
succeeds, but it wrote to the file range [4K, 8K[, leaving a hole in
the [0, 4K[ range and an i_size of 8K, which goes against the
expections of having the data written to the range [0, 4K[ and get an
i_size of 4K.
Fix this by not unlocking the inode before faulting in the input buffer,
in case we get -EFAULT or an incomplete write, and not jumping to the
'relock' label after faulting in the buffer - instead jump to a location
immediately before calling iomap, skipping all the write checks and
relocking. This solves this problem and it's fine even in case the input
buffer is memory mapped to the same file range, since only holding the
range locked in the inode's io tree can cause a deadlock, it's safe to
keep the inode lock (VFS lock), as was fixed and described in commit
51bd9563b678 ("btrfs: fix deadlock due to page faults during direct IO
reads and writes").
A sample reproducer provided by a reporter is the following:
$ cat test.c
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif
#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>
int main(int argc, char *argv[])
{
if (argc < 2) {
fprintf(stderr, "Usage: %s <test file>\n", argv[0]);
return 1;
}
int fd = open(argv[1], O_WRONLY | O_CREAT | O_TRUNC | O_DIRECT |
O_APPEND, 0644);
if (fd < 0) {
perror("creating test file");
return 1;
}
char *buf = mmap(NULL, 4096, PROT_READ,
MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
ssize_t ret = write(fd, buf, 4096);
if (ret < 0) {
perror("pwritev2");
return 1;
}
struct stat stbuf;
ret = fstat(fd, &stbuf);
if (ret < 0) {
perror("stat");
return 1;
}
printf("size: %llu\n", (unsigned long long)stbuf.st_size);
return stbuf.st_size == 4096 ? 0 : 1;
}
A test case for fstests will be sent soon.
Reported-by: Hanna Czenczek <hreitz(a)redhat.com>
Link: https://lore.kernel.org/linux-btrfs/0b841d46-12fe-4e64-9abb-871d8d0de271@re…
Fixes: 8184620ae212 ("btrfs: fix lost file sync on direct IO write with nowait and dsync iocb")
Signed-off-by: Filipe Manana <fdmanana(a)suse.com>
---
fs/btrfs/ctree.h | 1 +
fs/btrfs/file.c | 55 ++++++++++++++++++++++++++++++++++++------------
2 files changed, 43 insertions(+), 13 deletions(-)
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index c03c58246033..ae55b5c5ae5e 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -447,6 +447,7 @@ struct btrfs_file_private {
void *filldir_buf;
u64 last_index;
struct extent_state *llseek_cached_state;
+ bool fsync_skip_inode_lock;
};
static inline u32 BTRFS_LEAF_DATA_SIZE(const struct btrfs_fs_info *info)
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index d90138683a0a..35ce1c810bd3 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1550,21 +1550,37 @@ static ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from)
* So here we disable page faults in the iov_iter and then retry if we
* got -EFAULT, faulting in the pages before the retry.
*/
+again:
from->nofault = true;
dio = btrfs_dio_write(iocb, from, written);
from->nofault = false;
- /*
- * iomap_dio_complete() will call btrfs_sync_file() if we have a dsync
- * iocb, and that needs to lock the inode. So unlock it before calling
- * iomap_dio_complete() to avoid a deadlock.
- */
- btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
-
- if (IS_ERR_OR_NULL(dio))
+ if (IS_ERR_OR_NULL(dio)) {
ret = PTR_ERR_OR_ZERO(dio);
- else
+ } else {
+ struct btrfs_file_private stack_private = { 0 };
+ struct btrfs_file_private *private;
+ const bool have_private = (file->private_data != NULL);
+
+ if (!have_private)
+ file->private_data = &stack_private;
+
+ /*
+ * If we have a synchoronous write, we must make sure the fsync
+ * triggered by the iomap_dio_complete() call below doesn't
+ * deadlock on the inode lock - we are already holding it and we
+ * can't call it after unlocking because we may need to complete
+ * partial writes due to the input buffer (or parts of it) not
+ * being already faulted in.
+ */
+ private = file->private_data;
+ private->fsync_skip_inode_lock = true;
ret = iomap_dio_complete(dio);
+ private->fsync_skip_inode_lock = false;
+
+ if (!have_private)
+ file->private_data = NULL;
+ }
/* No increment (+=) because iomap returns a cumulative value. */
if (ret > 0)
@@ -1591,10 +1607,12 @@ static ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from)
} else {
fault_in_iov_iter_readable(from, left);
prev_left = left;
- goto relock;
+ goto again;
}
}
+ btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
+
/*
* If 'ret' is -ENOTBLK or we have not written all data, then it means
* we must fallback to buffered IO.
@@ -1793,6 +1811,7 @@ static inline bool skip_inode_logging(const struct btrfs_log_ctx *ctx)
*/
int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
{
+ struct btrfs_file_private *private = file->private_data;
struct dentry *dentry = file_dentry(file);
struct inode *inode = d_inode(dentry);
struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
@@ -1802,6 +1821,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
int ret = 0, err;
u64 len;
bool full_sync;
+ const bool skip_ilock = (private ? private->fsync_skip_inode_lock : false);
trace_btrfs_sync_file(file, datasync);
@@ -1829,7 +1849,10 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
if (ret)
goto out;
- btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
+ if (skip_ilock)
+ down_write(&BTRFS_I(inode)->i_mmap_lock);
+ else
+ btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
atomic_inc(&root->log_batch);
@@ -1853,7 +1876,10 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
*/
ret = start_ordered_ops(inode, start, end);
if (ret) {
- btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
+ if (skip_ilock)
+ up_write(&BTRFS_I(inode)->i_mmap_lock);
+ else
+ btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
goto out;
}
@@ -1982,7 +2008,10 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
* file again, but that will end up using the synchronization
* inside btrfs_sync_log to keep things safe.
*/
- btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
+ if (skip_ilock)
+ up_write(&BTRFS_I(inode)->i_mmap_lock);
+ else
+ btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
if (ret == BTRFS_NO_LOG_SYNC) {
ret = btrfs_end_transaction(trans);
--
2.43.0
The following commit has been merged into the irq/urgent branch of tip:
Commit-ID: edbbaae42a56f9a2b39c52ef2504dfb3fb0a7858
Gitweb: https://git.kernel.org/tip/edbbaae42a56f9a2b39c52ef2504dfb3fb0a7858
Author: Shay Drory <shayd(a)nvidia.com>
AuthorDate: Tue, 06 Aug 2024 10:20:44 +03:00
Committer: Thomas Gleixner <tglx(a)linutronix.de>
CommitterDate: Wed, 07 Aug 2024 17:27:00 +02:00
genirq/irqdesc: Honor caller provided affinity in alloc_desc()
Currently, whenever a caller is providing an affinity hint for an
interrupt, the allocation code uses it to calculate the node and copies the
cpumask into irq_desc::affinity.
If the affinity for the interrupt is not marked 'managed' then the startup
of the interrupt ignores irq_desc::affinity and uses the system default
affinity mask.
Prevent this by setting the IRQD_AFFINITY_SET flag for the interrupt in the
allocator, which causes irq_setup_affinity() to use irq_desc::affinity on
interrupt startup if the mask contains an online CPU.
[ tglx: Massaged changelog ]
Fixes: 45ddcecbfa94 ("genirq: Use affinity hint in irqdesc allocation")
Signed-off-by: Shay Drory <shayd(a)nvidia.com>
Signed-off-by: Thomas Gleixner <tglx(a)linutronix.de>
Cc: <stable(a)vger.kernel.org>
Link: https://lore.kernel.org/all/20240806072044.837827-1-shayd@nvidia.com
---
kernel/irq/irqdesc.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 07e99c9..1dee88b 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -530,6 +530,7 @@ static int alloc_descs(unsigned int start, unsigned int cnt, int node,
flags = IRQD_AFFINITY_MANAGED |
IRQD_MANAGED_SHUTDOWN;
}
+ flags |= IRQD_AFFINITY_SET;
mask = &affinity->mask;
node = cpu_to_node(cpumask_first(mask));
affinity++;
From: Long Li <longli(a)microsoft.com>
After napi_complete_done() is called, another NAPI may be running on
another CPU and ring the doorbell before the current CPU does. When
combined with unnecessary rings when there is no need to ARM the CQ, this
triggers error paths in the hardware.
Fix this by always ring the doorbell in sequence and avoid unnecessary
rings.
Cc: stable(a)vger.kernel.org
Fixes: e1b5683ff62e ("net: mana: Move NAPI from EQ to CQ")
Signed-off-by: Long Li <longli(a)microsoft.com>
---
drivers/net/ethernet/microsoft/mana/mana_en.c | 24 ++++++++++++-------
include/net/mana/mana.h | 1 +
2 files changed, 16 insertions(+), 9 deletions(-)
diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c
index d2f07e179e86..7d08e23c6749 100644
--- a/drivers/net/ethernet/microsoft/mana/mana_en.c
+++ b/drivers/net/ethernet/microsoft/mana/mana_en.c
@@ -1788,7 +1788,6 @@ static void mana_poll_rx_cq(struct mana_cq *cq)
static int mana_cq_handler(void *context, struct gdma_queue *gdma_queue)
{
struct mana_cq *cq = context;
- u8 arm_bit;
int w;
WARN_ON_ONCE(cq->gdma_cq != gdma_queue);
@@ -1799,16 +1798,23 @@ static int mana_cq_handler(void *context, struct gdma_queue *gdma_queue)
mana_poll_tx_cq(cq);
w = cq->work_done;
-
- if (w < cq->budget &&
- napi_complete_done(&cq->napi, w)) {
- arm_bit = SET_ARM_BIT;
- } else {
- arm_bit = 0;
+ cq->work_done_since_doorbell += w;
+
+ if (w < cq->budget) {
+ mana_gd_ring_cq(gdma_queue, SET_ARM_BIT);
+ cq->work_done_since_doorbell = 0;
+ napi_complete_done(&cq->napi, w);
+ } else if (cq->work_done_since_doorbell >
+ cq->gdma_cq->queue_size / COMP_ENTRY_SIZE * 4) {
+ /* MANA hardware requires at least one doorbell ring every 8
+ * wraparounds of CQ even there is no need to ARM. This driver
+ * rings the doorbell as soon as we have execceded 4
+ * wraparounds.
+ */
+ mana_gd_ring_cq(gdma_queue, 0);
+ cq->work_done_since_doorbell = 0;
}
- mana_gd_ring_cq(gdma_queue, arm_bit);
-
return w;
}
diff --git a/include/net/mana/mana.h b/include/net/mana/mana.h
index 6439fd8b437b..7caa334f4888 100644
--- a/include/net/mana/mana.h
+++ b/include/net/mana/mana.h
@@ -275,6 +275,7 @@ struct mana_cq {
/* NAPI data */
struct napi_struct napi;
int work_done;
+ int work_done_since_doorbell;
int budget;
};
--
2.17.1
The patch below does not apply to the 6.6-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.6.y
git checkout FETCH_HEAD
git cherry-pick -x 12c35c5582acb0fd8f7713ffa75f450766022ff1
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024080750-lance-overcrowd-731e@gregkh' --subject-prefix 'PATCH 6.6.y' HEAD^..
Possible dependencies:
12c35c5582ac ("drm/ast: Fix black screen after resume")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 12c35c5582acb0fd8f7713ffa75f450766022ff1 Mon Sep 17 00:00:00 2001
From: Jammy Huang <jammy_huang(a)aspeedtech.com>
Date: Thu, 18 Jul 2024 11:03:52 +0800
Subject: [PATCH] drm/ast: Fix black screen after resume
Suspend will disable pcie device. Thus, resume should do full hw
initialization again.
Add some APIs to ast_drm_thaw() before ast_post_gpu() to fix the issue.
v2:
- fix function-call arguments
Fixes: 5b71707dd13c ("drm/ast: Enable and unlock device access early during init")
Reported-by: Cary Garrett <cogarre(a)gmail.com>
Closes: https://lore.kernel.org/dri-devel/8ce1e1cc351153a890b65e62fed93b54ccd43f6a.…
Cc: Thomas Zimmermann <tzimmermann(a)suse.de>
Cc: Jocelyn Falempe <jfalempe(a)redhat.com>
Cc: Dave Airlie <airlied(a)redhat.com>
Cc: dri-devel(a)lists.freedesktop.org
Cc: <stable(a)vger.kernel.org> # v6.6+
Signed-off-by: Jammy Huang <jammy_huang(a)aspeedtech.com>
Reviewed-by: Thomas Zimmermann <tzimmermann(a)suse.de>
Signed-off-by: Thomas Zimmermann <tzimmermann(a)suse.de>
Link: https://patchwork.freedesktop.org/patch/msgid/20240718030352.654155-1-jammy…
diff --git a/drivers/gpu/drm/ast/ast_drv.c b/drivers/gpu/drm/ast/ast_drv.c
index f8c49ba68e78..af2368f6f00f 100644
--- a/drivers/gpu/drm/ast/ast_drv.c
+++ b/drivers/gpu/drm/ast/ast_drv.c
@@ -391,6 +391,11 @@ static int ast_drm_freeze(struct drm_device *dev)
static int ast_drm_thaw(struct drm_device *dev)
{
+ struct ast_device *ast = to_ast_device(dev);
+
+ ast_enable_vga(ast->ioregs);
+ ast_open_key(ast->ioregs);
+ ast_enable_mmio(dev->dev, ast->ioregs);
ast_post_gpu(dev);
return drm_mode_config_helper_resume(dev);
The patch below does not apply to the 6.6-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.6.y
git checkout FETCH_HEAD
git cherry-pick -x 0ce91928ec62d189b5c51816e325f02587b53118
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024080729-overlord-gala-fe40@gregkh' --subject-prefix 'PATCH 6.6.y' HEAD^..
Possible dependencies:
0ce91928ec62 ("drm/ast: astdp: Wake up during connector status detection")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 0ce91928ec62d189b5c51816e325f02587b53118 Mon Sep 17 00:00:00 2001
From: Thomas Zimmermann <tzimmermann(a)suse.de>
Date: Wed, 17 Jul 2024 16:24:16 +0200
Subject: [PATCH] drm/ast: astdp: Wake up during connector status detection
Power up the ASTDP connector for connection status detection if the
connector is not active. Keep it powered if a display is attached.
This fixes a bug where the connector does not come back after
disconnecting the display. The encoder's atomic_disable turns off
power on the physical connector. Further HPD reads will fail,
thus preventing the driver from detecting re-connected displays.
For connectors that are actively used, only test the HPD flag without
touching power.
Fixes: f81bb0ac7872 ("drm/ast: report connection status on Display Port.")
Cc: Jocelyn Falempe <jfalempe(a)redhat.com>
Cc: Thomas Zimmermann <tzimmermann(a)suse.de>
Cc: Dave Airlie <airlied(a)redhat.com>
Cc: dri-devel(a)lists.freedesktop.org
Cc: <stable(a)vger.kernel.org> # v6.6+
Signed-off-by: Thomas Zimmermann <tzimmermann(a)suse.de>
Reviewed-by: Jocelyn Falempe <jfalempe(a)redhat.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20240717143319.104012-2-tzimm…
diff --git a/drivers/gpu/drm/ast/ast_dp.c b/drivers/gpu/drm/ast/ast_dp.c
index 1e9259416980..e6c7f0d64e99 100644
--- a/drivers/gpu/drm/ast/ast_dp.c
+++ b/drivers/gpu/drm/ast/ast_dp.c
@@ -158,7 +158,14 @@ void ast_dp_launch(struct drm_device *dev)
ASTDP_HOST_EDID_READ_DONE);
}
+bool ast_dp_power_is_on(struct ast_device *ast)
+{
+ u8 vgacre3;
+ vgacre3 = ast_get_index_reg(ast, AST_IO_VGACRI, 0xe3);
+
+ return !(vgacre3 & AST_DP_PHY_SLEEP);
+}
void ast_dp_power_on_off(struct drm_device *dev, bool on)
{
diff --git a/drivers/gpu/drm/ast/ast_drv.h b/drivers/gpu/drm/ast/ast_drv.h
index ba3d86973995..47bab5596c16 100644
--- a/drivers/gpu/drm/ast/ast_drv.h
+++ b/drivers/gpu/drm/ast/ast_drv.h
@@ -472,6 +472,7 @@ void ast_init_3rdtx(struct drm_device *dev);
bool ast_astdp_is_connected(struct ast_device *ast);
int ast_astdp_read_edid(struct drm_device *dev, u8 *ediddata);
void ast_dp_launch(struct drm_device *dev);
+bool ast_dp_power_is_on(struct ast_device *ast);
void ast_dp_power_on_off(struct drm_device *dev, bool no);
void ast_dp_set_on_off(struct drm_device *dev, bool no);
void ast_dp_set_mode(struct drm_crtc *crtc, struct ast_vbios_mode_info *vbios_mode);
diff --git a/drivers/gpu/drm/ast/ast_mode.c b/drivers/gpu/drm/ast/ast_mode.c
index dc8f639e82fd..049ee1477c33 100644
--- a/drivers/gpu/drm/ast/ast_mode.c
+++ b/drivers/gpu/drm/ast/ast_mode.c
@@ -28,6 +28,7 @@
* Authors: Dave Airlie <airlied(a)redhat.com>
*/
+#include <linux/delay.h>
#include <linux/export.h>
#include <linux/pci.h>
@@ -1687,11 +1688,35 @@ static int ast_astdp_connector_helper_detect_ctx(struct drm_connector *connector
struct drm_modeset_acquire_ctx *ctx,
bool force)
{
+ struct drm_device *dev = connector->dev;
struct ast_device *ast = to_ast_device(connector->dev);
+ enum drm_connector_status status = connector_status_disconnected;
+ struct drm_connector_state *connector_state = connector->state;
+ bool is_active = false;
+
+ mutex_lock(&ast->modeset_lock);
+
+ if (connector_state && connector_state->crtc) {
+ struct drm_crtc_state *crtc_state = connector_state->crtc->state;
+
+ if (crtc_state && crtc_state->active)
+ is_active = true;
+ }
+
+ if (!is_active && !ast_dp_power_is_on(ast)) {
+ ast_dp_power_on_off(dev, true);
+ msleep(50);
+ }
if (ast_astdp_is_connected(ast))
- return connector_status_connected;
- return connector_status_disconnected;
+ status = connector_status_connected;
+
+ if (!is_active && status == connector_status_disconnected)
+ ast_dp_power_on_off(dev, false);
+
+ mutex_unlock(&ast->modeset_lock);
+
+ return status;
}
static const struct drm_connector_helper_funcs ast_astdp_connector_helper_funcs = {
The patch below does not apply to the 6.6-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.6.y
git checkout FETCH_HEAD
git cherry-pick -x 6834097fc38c5416701c793da94558cea49c0a1f
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024080714-daisy-celibacy-e263@gregkh' --subject-prefix 'PATCH 6.6.y' HEAD^..
Possible dependencies:
6834097fc38c ("mptcp: pm: fix backup support in signal endpoints")
9ae7846c4b6b ("mptcp: dump addrs in userspace pm list")
34e74a5cf3b7 ("mptcp: implement mptcp_userspace_pm_dump_addr")
aab4d8564947 ("net: mptcp: use policy generated by YAML spec")
1e07938e29c5 ("net: mptcp: rename netlink handlers to mptcp_pm_nl_<blah>_{doit,dumpit}")
1d0507f46843 ("net: mptcp: convert netlink from small_ops to ops")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 6834097fc38c5416701c793da94558cea49c0a1f Mon Sep 17 00:00:00 2001
From: "Matthieu Baerts (NGI0)" <matttbe(a)kernel.org>
Date: Sat, 27 Jul 2024 12:01:28 +0200
Subject: [PATCH] mptcp: pm: fix backup support in signal endpoints
There was a support for signal endpoints, but only when the endpoint's
flag was changed during a connection. If an endpoint with the signal and
backup was already present, the MP_JOIN reply was not containing the
backup flag as expected.
That's confusing to have this inconsistent behaviour. On the other hand,
the infrastructure to set the backup flag in the SYN + ACK + MP_JOIN was
already there, it was just never set before. Now when requesting the
local ID from the path-manager, the backup status is also requested.
Note that when the userspace PM is used, the backup flag can be set if
the local address was already used before with a backup flag, e.g. if
the address was announced with the 'backup' flag, or a subflow was
created with the 'backup' flag.
Fixes: 4596a2c1b7f5 ("mptcp: allow creating non-backup subflows")
Cc: stable(a)vger.kernel.org
Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/507
Reviewed-by: Mat Martineau <martineau(a)kernel.org>
Signed-off-by: Matthieu Baerts (NGI0) <matttbe(a)kernel.org>
Signed-off-by: Paolo Abeni <pabeni(a)redhat.com>
diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c
index 55406720c607..23bb89c94e90 100644
--- a/net/mptcp/pm.c
+++ b/net/mptcp/pm.c
@@ -426,6 +426,18 @@ int mptcp_pm_get_local_id(struct mptcp_sock *msk, struct sock_common *skc)
return mptcp_pm_nl_get_local_id(msk, &skc_local);
}
+bool mptcp_pm_is_backup(struct mptcp_sock *msk, struct sock_common *skc)
+{
+ struct mptcp_addr_info skc_local;
+
+ mptcp_local_address((struct sock_common *)skc, &skc_local);
+
+ if (mptcp_pm_is_userspace(msk))
+ return mptcp_userspace_pm_is_backup(msk, &skc_local);
+
+ return mptcp_pm_nl_is_backup(msk, &skc_local);
+}
+
int mptcp_pm_get_flags_and_ifindex_by_id(struct mptcp_sock *msk, unsigned int id,
u8 *flags, int *ifindex)
{
diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c
index 7635fac91539..37954a0b087d 100644
--- a/net/mptcp/pm_netlink.c
+++ b/net/mptcp/pm_netlink.c
@@ -1101,6 +1101,24 @@ int mptcp_pm_nl_get_local_id(struct mptcp_sock *msk, struct mptcp_addr_info *skc
return ret;
}
+bool mptcp_pm_nl_is_backup(struct mptcp_sock *msk, struct mptcp_addr_info *skc)
+{
+ struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk);
+ struct mptcp_pm_addr_entry *entry;
+ bool backup = false;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(entry, &pernet->local_addr_list, list) {
+ if (mptcp_addresses_equal(&entry->addr, skc, entry->addr.port)) {
+ backup = !!(entry->flags & MPTCP_PM_ADDR_FLAG_BACKUP);
+ break;
+ }
+ }
+ rcu_read_unlock();
+
+ return backup;
+}
+
#define MPTCP_PM_CMD_GRP_OFFSET 0
#define MPTCP_PM_EV_GRP_OFFSET 1
diff --git a/net/mptcp/pm_userspace.c b/net/mptcp/pm_userspace.c
index f0a4590506c6..8eaa9fbe3e34 100644
--- a/net/mptcp/pm_userspace.c
+++ b/net/mptcp/pm_userspace.c
@@ -165,6 +165,24 @@ int mptcp_userspace_pm_get_local_id(struct mptcp_sock *msk,
return mptcp_userspace_pm_append_new_local_addr(msk, &new_entry, true);
}
+bool mptcp_userspace_pm_is_backup(struct mptcp_sock *msk,
+ struct mptcp_addr_info *skc)
+{
+ struct mptcp_pm_addr_entry *entry;
+ bool backup = false;
+
+ spin_lock_bh(&msk->pm.lock);
+ list_for_each_entry(entry, &msk->pm.userspace_pm_local_addr_list, list) {
+ if (mptcp_addresses_equal(&entry->addr, skc, false)) {
+ backup = !!(entry->flags & MPTCP_PM_ADDR_FLAG_BACKUP);
+ break;
+ }
+ }
+ spin_unlock_bh(&msk->pm.lock);
+
+ return backup;
+}
+
int mptcp_pm_nl_announce_doit(struct sk_buff *skb, struct genl_info *info)
{
struct nlattr *token = info->attrs[MPTCP_PM_ATTR_TOKEN];
diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
index b8b25124e7de..60c6b073d65f 100644
--- a/net/mptcp/protocol.h
+++ b/net/mptcp/protocol.h
@@ -1109,6 +1109,9 @@ bool mptcp_pm_rm_addr_signal(struct mptcp_sock *msk, unsigned int remaining,
int mptcp_pm_get_local_id(struct mptcp_sock *msk, struct sock_common *skc);
int mptcp_pm_nl_get_local_id(struct mptcp_sock *msk, struct mptcp_addr_info *skc);
int mptcp_userspace_pm_get_local_id(struct mptcp_sock *msk, struct mptcp_addr_info *skc);
+bool mptcp_pm_is_backup(struct mptcp_sock *msk, struct sock_common *skc);
+bool mptcp_pm_nl_is_backup(struct mptcp_sock *msk, struct mptcp_addr_info *skc);
+bool mptcp_userspace_pm_is_backup(struct mptcp_sock *msk, struct mptcp_addr_info *skc);
int mptcp_pm_dump_addr(struct sk_buff *msg, struct netlink_callback *cb);
int mptcp_pm_nl_dump_addr(struct sk_buff *msg,
struct netlink_callback *cb);
diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c
index be406197b1c4..0e4b5bfbeaa1 100644
--- a/net/mptcp/subflow.c
+++ b/net/mptcp/subflow.c
@@ -100,6 +100,7 @@ static struct mptcp_sock *subflow_token_join_request(struct request_sock *req)
return NULL;
}
subflow_req->local_id = local_id;
+ subflow_req->request_bkup = mptcp_pm_is_backup(msk, (struct sock_common *)req);
return msk;
}
@@ -620,6 +621,8 @@ static int subflow_chk_local_id(struct sock *sk)
return err;
subflow_set_local_id(subflow, err);
+ subflow->request_bkup = mptcp_pm_is_backup(msk, (struct sock_common *)sk);
+
return 0;
}