The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
Possible dependencies:
365e1ececb29 ("hv_netvsc: Fix race between VF offering and VF association message from host")
d0922bf79817 ("hv_netvsc: Add error handling while switching data path")
34b06a2eee44 ("hv_netvsc: Process NETDEV_GOING_DOWN on VF hot remove")
8b31f8c982b7 ("hv_netvsc: Wait for completion on request SWITCH_DATA_PATH")
4d18fcc95f50 ("hv_netvsc: Use vmbus_requestor to generate transaction IDs for VMBus hardening")
e8b7db38449a ("Drivers: hv: vmbus: Add vmbus_requestor data structure for VMBus hardening")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 365e1ececb2905f94cc10a5817c5b644a32a3ae2 Mon Sep 17 00:00:00 2001
From: Gaurav Kohli <gauravkohli(a)linux.microsoft.com>
Date: Wed, 5 Oct 2022 22:52:59 -0700
Subject: [PATCH] hv_netvsc: Fix race between VF offering and VF association
message from host
During vm boot, there might be possibility that vf registration
call comes before the vf association from host to vm.
And this might break netvsc vf path, To prevent the same block
vf registration until vf bind message comes from host.
Cc: stable(a)vger.kernel.org
Fixes: 00d7ddba11436 ("hv_netvsc: pair VF based on serial number")
Reviewed-by: Haiyang Zhang <haiyangz(a)microsoft.com>
Signed-off-by: Gaurav Kohli <gauravkohli(a)linux.microsoft.com>
Signed-off-by: David S. Miller <davem(a)davemloft.net>
diff --git a/drivers/net/hyperv/hyperv_net.h b/drivers/net/hyperv/hyperv_net.h
index 25b38a374e3c..dd5919ec408b 100644
--- a/drivers/net/hyperv/hyperv_net.h
+++ b/drivers/net/hyperv/hyperv_net.h
@@ -1051,7 +1051,8 @@ struct net_device_context {
u32 vf_alloc;
/* Serial number of the VF to team with */
u32 vf_serial;
-
+ /* completion variable to confirm vf association */
+ struct completion vf_add;
/* Is the current data path through the VF NIC? */
bool data_path_is_vf;
diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c
index f066de0da492..9352dad58996 100644
--- a/drivers/net/hyperv/netvsc.c
+++ b/drivers/net/hyperv/netvsc.c
@@ -1580,6 +1580,10 @@ static void netvsc_send_vf(struct net_device *ndev,
net_device_ctx->vf_alloc = nvmsg->msg.v4_msg.vf_assoc.allocated;
net_device_ctx->vf_serial = nvmsg->msg.v4_msg.vf_assoc.serial;
+
+ if (net_device_ctx->vf_alloc)
+ complete(&net_device_ctx->vf_add);
+
netdev_info(ndev, "VF slot %u %s\n",
net_device_ctx->vf_serial,
net_device_ctx->vf_alloc ? "added" : "removed");
diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c
index 5f08482065ca..89eb4f179a3c 100644
--- a/drivers/net/hyperv/netvsc_drv.c
+++ b/drivers/net/hyperv/netvsc_drv.c
@@ -2313,6 +2313,18 @@ static struct net_device *get_netvsc_byslot(const struct net_device *vf_netdev)
}
+ /* Fallback path to check synthetic vf with
+ * help of mac addr
+ */
+ list_for_each_entry(ndev_ctx, &netvsc_dev_list, list) {
+ ndev = hv_get_drvdata(ndev_ctx->device_ctx);
+ if (ether_addr_equal(vf_netdev->perm_addr, ndev->perm_addr)) {
+ netdev_notice(vf_netdev,
+ "falling back to mac addr based matching\n");
+ return ndev;
+ }
+ }
+
netdev_notice(vf_netdev,
"no netdev found for vf serial:%u\n", serial);
return NULL;
@@ -2409,6 +2421,11 @@ static int netvsc_vf_changed(struct net_device *vf_netdev, unsigned long event)
if (net_device_ctx->data_path_is_vf == vf_is_up)
return NOTIFY_OK;
+ if (vf_is_up && !net_device_ctx->vf_alloc) {
+ netdev_info(ndev, "Waiting for the VF association from host\n");
+ wait_for_completion(&net_device_ctx->vf_add);
+ }
+
ret = netvsc_switch_datapath(ndev, vf_is_up);
if (ret) {
@@ -2440,6 +2457,7 @@ static int netvsc_unregister_vf(struct net_device *vf_netdev)
netvsc_vf_setxdp(vf_netdev, NULL);
+ reinit_completion(&net_device_ctx->vf_add);
netdev_rx_handler_unregister(vf_netdev);
netdev_upper_dev_unlink(vf_netdev, ndev);
RCU_INIT_POINTER(net_device_ctx->vf_netdev, NULL);
@@ -2479,6 +2497,7 @@ static int netvsc_probe(struct hv_device *dev,
INIT_DELAYED_WORK(&net_device_ctx->dwork, netvsc_link_change);
+ init_completion(&net_device_ctx->vf_add);
spin_lock_init(&net_device_ctx->lock);
INIT_LIST_HEAD(&net_device_ctx->reconfig_events);
INIT_DELAYED_WORK(&net_device_ctx->vf_takeover, netvsc_vf_setup);
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
Possible dependencies:
42b6419d0aba ("io_uring: correct pinned_vm accounting")
ed29b0b4fd83 ("io_uring: move to separate directory")
ab4094024784 ("io_uring: optimise rsrc referencing")
a46be971edb6 ("io_uring: optimise io_req_set_rsrc_node()")
d886e185a128 ("io_uring: control ->async_data with a REQ_F flag")
ef05d9ebcc92 ("io_uring: kill off ->inflight_entry field")
d4b7a5ef2b9c ("io_uring: inline completion batching helpers")
3aa83bfb6e5c ("io_uring: add a helper for batch free")
c2b6c6bc4e0d ("io_uring: replace list with stack for req caches")
3ab665b74e59 ("io_uring: remove allocation cache array")
6f33b0bc4ea4 ("io_uring: use slist for completion batching")
c450178d9be9 ("io_uring: dedup CQE flushing non-empty checks")
4b628aeb69cc ("io_uring: kill off ios_left")
4c928904ff77 ("block: move CONFIG_BLOCK guard to top Makefile")
ddf21bd8ab98 ("Merge tag 'iov_iter.3-5.15-2021-09-17' of git://git.kernel.dk/linux-block")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 42b6419d0aba47c5d8644cdc0b68502254671de5 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence(a)gmail.com>
Date: Tue, 4 Oct 2022 03:19:08 +0100
Subject: [PATCH] io_uring: correct pinned_vm accounting
->mm_account should be released only after we free all registered
buffers, otherwise __io_sqe_buffers_unregister() will see a NULL
->mm_account and skip locked_vm accounting.
Cc: <Stable(a)vger.kernel.org>
Signed-off-by: Pavel Begunkov <asml.silence(a)gmail.com>
Link: https://lore.kernel.org/r/6d798f65ed4ab8db3664c4d3397d4af16ca98846.16648499…
Signed-off-by: Jens Axboe <axboe(a)kernel.dk>
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 63f6ce5e5355..ea5cee593bbd 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -2585,12 +2585,6 @@ static void io_req_caches_free(struct io_ring_ctx *ctx)
static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
{
io_sq_thread_finish(ctx);
-
- if (ctx->mm_account) {
- mmdrop(ctx->mm_account);
- ctx->mm_account = NULL;
- }
-
io_rsrc_refs_drop(ctx);
/* __io_rsrc_put_work() may need uring_lock to progress, wait w/o it */
io_wait_rsrc_data(ctx->buf_data);
@@ -2633,6 +2627,10 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
WARN_ON_ONCE(!list_empty(&ctx->ltimeout_list));
WARN_ON_ONCE(ctx->notif_slots || ctx->nr_notif_slots);
+ if (ctx->mm_account) {
+ mmdrop(ctx->mm_account);
+ ctx->mm_account = NULL;
+ }
io_mem_free(ctx->rings);
io_mem_free(ctx->sq_sqes);
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
Possible dependencies:
42b6419d0aba ("io_uring: correct pinned_vm accounting")
ed29b0b4fd83 ("io_uring: move to separate directory")
ab4094024784 ("io_uring: optimise rsrc referencing")
a46be971edb6 ("io_uring: optimise io_req_set_rsrc_node()")
d886e185a128 ("io_uring: control ->async_data with a REQ_F flag")
ef05d9ebcc92 ("io_uring: kill off ->inflight_entry field")
d4b7a5ef2b9c ("io_uring: inline completion batching helpers")
3aa83bfb6e5c ("io_uring: add a helper for batch free")
c2b6c6bc4e0d ("io_uring: replace list with stack for req caches")
3ab665b74e59 ("io_uring: remove allocation cache array")
6f33b0bc4ea4 ("io_uring: use slist for completion batching")
c450178d9be9 ("io_uring: dedup CQE flushing non-empty checks")
4b628aeb69cc ("io_uring: kill off ios_left")
4c928904ff77 ("block: move CONFIG_BLOCK guard to top Makefile")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 42b6419d0aba47c5d8644cdc0b68502254671de5 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence(a)gmail.com>
Date: Tue, 4 Oct 2022 03:19:08 +0100
Subject: [PATCH] io_uring: correct pinned_vm accounting
->mm_account should be released only after we free all registered
buffers, otherwise __io_sqe_buffers_unregister() will see a NULL
->mm_account and skip locked_vm accounting.
Cc: <Stable(a)vger.kernel.org>
Signed-off-by: Pavel Begunkov <asml.silence(a)gmail.com>
Link: https://lore.kernel.org/r/6d798f65ed4ab8db3664c4d3397d4af16ca98846.16648499…
Signed-off-by: Jens Axboe <axboe(a)kernel.dk>
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 63f6ce5e5355..ea5cee593bbd 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -2585,12 +2585,6 @@ static void io_req_caches_free(struct io_ring_ctx *ctx)
static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
{
io_sq_thread_finish(ctx);
-
- if (ctx->mm_account) {
- mmdrop(ctx->mm_account);
- ctx->mm_account = NULL;
- }
-
io_rsrc_refs_drop(ctx);
/* __io_rsrc_put_work() may need uring_lock to progress, wait w/o it */
io_wait_rsrc_data(ctx->buf_data);
@@ -2633,6 +2627,10 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
WARN_ON_ONCE(!list_empty(&ctx->ltimeout_list));
WARN_ON_ONCE(ctx->notif_slots || ctx->nr_notif_slots);
+ if (ctx->mm_account) {
+ mmdrop(ctx->mm_account);
+ ctx->mm_account = NULL;
+ }
io_mem_free(ctx->rings);
io_mem_free(ctx->sq_sqes);
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
Possible dependencies:
0091bfc81741 ("io_uring/af_unix: defer registered files gc to io_uring release")
735729844819 ("io_uring: move rsrc related data, core, and commands")
3b77495a9723 ("io_uring: split provided buffers handling into its own file")
7aaff708a768 ("io_uring: move cancelation into its own file")
329061d3e2f9 ("io_uring: move poll handling into its own file")
cfd22e6b3319 ("io_uring: add opcode name to io_op_defs")
92ac8beaea1f ("io_uring: include and forward-declaration sanitation")
c9f06aa7de15 ("io_uring: move io_uring_task (tctx) helpers into its own file")
a4ad4f748ea9 ("io_uring: move fdinfo helpers to its own file")
e5550a1447bf ("io_uring: use io_is_uring_fops() consistently")
17437f311490 ("io_uring: move SQPOLL related handling into its own file")
59915143e89f ("io_uring: move timeout opcodes and handling into its own file")
e418bbc97bff ("io_uring: move our reference counting into a header")
36404b09aa60 ("io_uring: move msg_ring into its own file")
f9ead18c1058 ("io_uring: split network related opcodes into its own file")
e0da14def1ee ("io_uring: move statx handling to its own file")
a9c210cebe13 ("io_uring: move epoll handler to its own file")
4cf90495281b ("io_uring: add a dummy -EOPNOTSUPP prep handler")
99f15d8d6136 ("io_uring: move uring_cmd handling to its own file")
cd40cae29ef8 ("io_uring: split out open/close operations")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 0091bfc81741b8d3aeb3b7ab8636f911b2de6e80 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence(a)gmail.com>
Date: Mon, 3 Oct 2022 13:59:47 +0100
Subject: [PATCH] io_uring/af_unix: defer registered files gc to io_uring
release
Instead of putting io_uring's registered files in unix_gc() we want it
to be done by io_uring itself. The trick here is to consider io_uring
registered files for cycle detection but not actually putting them down.
Because io_uring can't register other ring instances, this will remove
all refs to the ring file triggering the ->release path and clean up
with io_ring_ctx_free().
Cc: stable(a)vger.kernel.org
Fixes: 6b06314c47e1 ("io_uring: add file set registration")
Reported-and-tested-by: David Bouman <dbouman03(a)gmail.com>
Signed-off-by: Pavel Begunkov <asml.silence(a)gmail.com>
Signed-off-by: Thadeu Lima de Souza Cascardo <cascardo(a)canonical.com>
[axboe: add kerneldoc comment to skb, fold in skb leak fix]
Signed-off-by: Jens Axboe <axboe(a)kernel.dk>
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 9fcf534f2d92..7be5bb4c94b6 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -803,6 +803,7 @@ typedef unsigned char *sk_buff_data_t;
* @csum_level: indicates the number of consecutive checksums found in
* the packet minus one that have been verified as
* CHECKSUM_UNNECESSARY (max 3)
+ * @scm_io_uring: SKB holds io_uring registered files
* @dst_pending_confirm: need to confirm neighbour
* @decrypted: Decrypted SKB
* @slow_gro: state present at GRO time, slower prepare step required
@@ -982,6 +983,7 @@ struct sk_buff {
#endif
__u8 slow_gro:1;
__u8 csum_not_inet:1;
+ __u8 scm_io_uring:1;
#ifdef CONFIG_NET_SCHED
__u16 tc_index; /* traffic control index */
diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index 6f88ded0e7e5..012fdb04ec23 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -855,6 +855,7 @@ int __io_scm_file_account(struct io_ring_ctx *ctx, struct file *file)
UNIXCB(skb).fp = fpl;
skb->sk = sk;
+ skb->scm_io_uring = 1;
skb->destructor = unix_destruct_scm;
refcount_add(skb->truesize, &sk->sk_wmem_alloc);
}
diff --git a/net/unix/garbage.c b/net/unix/garbage.c
index d45d5366115a..dc2763540393 100644
--- a/net/unix/garbage.c
+++ b/net/unix/garbage.c
@@ -204,6 +204,7 @@ void wait_for_unix_gc(void)
/* The external entry point: unix_gc() */
void unix_gc(void)
{
+ struct sk_buff *next_skb, *skb;
struct unix_sock *u;
struct unix_sock *next;
struct sk_buff_head hitlist;
@@ -297,11 +298,30 @@ void unix_gc(void)
spin_unlock(&unix_gc_lock);
+ /* We need io_uring to clean its registered files, ignore all io_uring
+ * originated skbs. It's fine as io_uring doesn't keep references to
+ * other io_uring instances and so killing all other files in the cycle
+ * will put all io_uring references forcing it to go through normal
+ * release.path eventually putting registered files.
+ */
+ skb_queue_walk_safe(&hitlist, skb, next_skb) {
+ if (skb->scm_io_uring) {
+ __skb_unlink(skb, &hitlist);
+ skb_queue_tail(&skb->sk->sk_receive_queue, skb);
+ }
+ }
+
/* Here we are. Hitlist is filled. Die. */
__skb_queue_purge(&hitlist);
spin_lock(&unix_gc_lock);
+ /* There could be io_uring registered files, just push them back to
+ * the inflight list
+ */
+ list_for_each_entry_safe(u, next, &gc_candidates, link)
+ list_move_tail(&u->link, &gc_inflight_list);
+
/* All candidates should have been detached by now. */
BUG_ON(!list_empty(&gc_candidates));
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
Possible dependencies:
0091bfc81741 ("io_uring/af_unix: defer registered files gc to io_uring release")
735729844819 ("io_uring: move rsrc related data, core, and commands")
3b77495a9723 ("io_uring: split provided buffers handling into its own file")
7aaff708a768 ("io_uring: move cancelation into its own file")
329061d3e2f9 ("io_uring: move poll handling into its own file")
cfd22e6b3319 ("io_uring: add opcode name to io_op_defs")
92ac8beaea1f ("io_uring: include and forward-declaration sanitation")
c9f06aa7de15 ("io_uring: move io_uring_task (tctx) helpers into its own file")
a4ad4f748ea9 ("io_uring: move fdinfo helpers to its own file")
e5550a1447bf ("io_uring: use io_is_uring_fops() consistently")
17437f311490 ("io_uring: move SQPOLL related handling into its own file")
59915143e89f ("io_uring: move timeout opcodes and handling into its own file")
e418bbc97bff ("io_uring: move our reference counting into a header")
36404b09aa60 ("io_uring: move msg_ring into its own file")
f9ead18c1058 ("io_uring: split network related opcodes into its own file")
e0da14def1ee ("io_uring: move statx handling to its own file")
a9c210cebe13 ("io_uring: move epoll handler to its own file")
4cf90495281b ("io_uring: add a dummy -EOPNOTSUPP prep handler")
99f15d8d6136 ("io_uring: move uring_cmd handling to its own file")
cd40cae29ef8 ("io_uring: split out open/close operations")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 0091bfc81741b8d3aeb3b7ab8636f911b2de6e80 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence(a)gmail.com>
Date: Mon, 3 Oct 2022 13:59:47 +0100
Subject: [PATCH] io_uring/af_unix: defer registered files gc to io_uring
release
Instead of putting io_uring's registered files in unix_gc() we want it
to be done by io_uring itself. The trick here is to consider io_uring
registered files for cycle detection but not actually putting them down.
Because io_uring can't register other ring instances, this will remove
all refs to the ring file triggering the ->release path and clean up
with io_ring_ctx_free().
Cc: stable(a)vger.kernel.org
Fixes: 6b06314c47e1 ("io_uring: add file set registration")
Reported-and-tested-by: David Bouman <dbouman03(a)gmail.com>
Signed-off-by: Pavel Begunkov <asml.silence(a)gmail.com>
Signed-off-by: Thadeu Lima de Souza Cascardo <cascardo(a)canonical.com>
[axboe: add kerneldoc comment to skb, fold in skb leak fix]
Signed-off-by: Jens Axboe <axboe(a)kernel.dk>
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 9fcf534f2d92..7be5bb4c94b6 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -803,6 +803,7 @@ typedef unsigned char *sk_buff_data_t;
* @csum_level: indicates the number of consecutive checksums found in
* the packet minus one that have been verified as
* CHECKSUM_UNNECESSARY (max 3)
+ * @scm_io_uring: SKB holds io_uring registered files
* @dst_pending_confirm: need to confirm neighbour
* @decrypted: Decrypted SKB
* @slow_gro: state present at GRO time, slower prepare step required
@@ -982,6 +983,7 @@ struct sk_buff {
#endif
__u8 slow_gro:1;
__u8 csum_not_inet:1;
+ __u8 scm_io_uring:1;
#ifdef CONFIG_NET_SCHED
__u16 tc_index; /* traffic control index */
diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index 6f88ded0e7e5..012fdb04ec23 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -855,6 +855,7 @@ int __io_scm_file_account(struct io_ring_ctx *ctx, struct file *file)
UNIXCB(skb).fp = fpl;
skb->sk = sk;
+ skb->scm_io_uring = 1;
skb->destructor = unix_destruct_scm;
refcount_add(skb->truesize, &sk->sk_wmem_alloc);
}
diff --git a/net/unix/garbage.c b/net/unix/garbage.c
index d45d5366115a..dc2763540393 100644
--- a/net/unix/garbage.c
+++ b/net/unix/garbage.c
@@ -204,6 +204,7 @@ void wait_for_unix_gc(void)
/* The external entry point: unix_gc() */
void unix_gc(void)
{
+ struct sk_buff *next_skb, *skb;
struct unix_sock *u;
struct unix_sock *next;
struct sk_buff_head hitlist;
@@ -297,11 +298,30 @@ void unix_gc(void)
spin_unlock(&unix_gc_lock);
+ /* We need io_uring to clean its registered files, ignore all io_uring
+ * originated skbs. It's fine as io_uring doesn't keep references to
+ * other io_uring instances and so killing all other files in the cycle
+ * will put all io_uring references forcing it to go through normal
+ * release.path eventually putting registered files.
+ */
+ skb_queue_walk_safe(&hitlist, skb, next_skb) {
+ if (skb->scm_io_uring) {
+ __skb_unlink(skb, &hitlist);
+ skb_queue_tail(&skb->sk->sk_receive_queue, skb);
+ }
+ }
+
/* Here we are. Hitlist is filled. Die. */
__skb_queue_purge(&hitlist);
spin_lock(&unix_gc_lock);
+ /* There could be io_uring registered files, just push them back to
+ * the inflight list
+ */
+ list_for_each_entry_safe(u, next, &gc_candidates, link)
+ list_move_tail(&u->link, &gc_inflight_list);
+
/* All candidates should have been detached by now. */
BUG_ON(!list_empty(&gc_candidates));
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
Possible dependencies:
0091bfc81741 ("io_uring/af_unix: defer registered files gc to io_uring release")
735729844819 ("io_uring: move rsrc related data, core, and commands")
3b77495a9723 ("io_uring: split provided buffers handling into its own file")
7aaff708a768 ("io_uring: move cancelation into its own file")
329061d3e2f9 ("io_uring: move poll handling into its own file")
cfd22e6b3319 ("io_uring: add opcode name to io_op_defs")
92ac8beaea1f ("io_uring: include and forward-declaration sanitation")
c9f06aa7de15 ("io_uring: move io_uring_task (tctx) helpers into its own file")
a4ad4f748ea9 ("io_uring: move fdinfo helpers to its own file")
e5550a1447bf ("io_uring: use io_is_uring_fops() consistently")
17437f311490 ("io_uring: move SQPOLL related handling into its own file")
59915143e89f ("io_uring: move timeout opcodes and handling into its own file")
e418bbc97bff ("io_uring: move our reference counting into a header")
36404b09aa60 ("io_uring: move msg_ring into its own file")
f9ead18c1058 ("io_uring: split network related opcodes into its own file")
e0da14def1ee ("io_uring: move statx handling to its own file")
a9c210cebe13 ("io_uring: move epoll handler to its own file")
4cf90495281b ("io_uring: add a dummy -EOPNOTSUPP prep handler")
99f15d8d6136 ("io_uring: move uring_cmd handling to its own file")
cd40cae29ef8 ("io_uring: split out open/close operations")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 0091bfc81741b8d3aeb3b7ab8636f911b2de6e80 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence(a)gmail.com>
Date: Mon, 3 Oct 2022 13:59:47 +0100
Subject: [PATCH] io_uring/af_unix: defer registered files gc to io_uring
release
Instead of putting io_uring's registered files in unix_gc() we want it
to be done by io_uring itself. The trick here is to consider io_uring
registered files for cycle detection but not actually putting them down.
Because io_uring can't register other ring instances, this will remove
all refs to the ring file triggering the ->release path and clean up
with io_ring_ctx_free().
Cc: stable(a)vger.kernel.org
Fixes: 6b06314c47e1 ("io_uring: add file set registration")
Reported-and-tested-by: David Bouman <dbouman03(a)gmail.com>
Signed-off-by: Pavel Begunkov <asml.silence(a)gmail.com>
Signed-off-by: Thadeu Lima de Souza Cascardo <cascardo(a)canonical.com>
[axboe: add kerneldoc comment to skb, fold in skb leak fix]
Signed-off-by: Jens Axboe <axboe(a)kernel.dk>
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 9fcf534f2d92..7be5bb4c94b6 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -803,6 +803,7 @@ typedef unsigned char *sk_buff_data_t;
* @csum_level: indicates the number of consecutive checksums found in
* the packet minus one that have been verified as
* CHECKSUM_UNNECESSARY (max 3)
+ * @scm_io_uring: SKB holds io_uring registered files
* @dst_pending_confirm: need to confirm neighbour
* @decrypted: Decrypted SKB
* @slow_gro: state present at GRO time, slower prepare step required
@@ -982,6 +983,7 @@ struct sk_buff {
#endif
__u8 slow_gro:1;
__u8 csum_not_inet:1;
+ __u8 scm_io_uring:1;
#ifdef CONFIG_NET_SCHED
__u16 tc_index; /* traffic control index */
diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index 6f88ded0e7e5..012fdb04ec23 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -855,6 +855,7 @@ int __io_scm_file_account(struct io_ring_ctx *ctx, struct file *file)
UNIXCB(skb).fp = fpl;
skb->sk = sk;
+ skb->scm_io_uring = 1;
skb->destructor = unix_destruct_scm;
refcount_add(skb->truesize, &sk->sk_wmem_alloc);
}
diff --git a/net/unix/garbage.c b/net/unix/garbage.c
index d45d5366115a..dc2763540393 100644
--- a/net/unix/garbage.c
+++ b/net/unix/garbage.c
@@ -204,6 +204,7 @@ void wait_for_unix_gc(void)
/* The external entry point: unix_gc() */
void unix_gc(void)
{
+ struct sk_buff *next_skb, *skb;
struct unix_sock *u;
struct unix_sock *next;
struct sk_buff_head hitlist;
@@ -297,11 +298,30 @@ void unix_gc(void)
spin_unlock(&unix_gc_lock);
+ /* We need io_uring to clean its registered files, ignore all io_uring
+ * originated skbs. It's fine as io_uring doesn't keep references to
+ * other io_uring instances and so killing all other files in the cycle
+ * will put all io_uring references forcing it to go through normal
+ * release.path eventually putting registered files.
+ */
+ skb_queue_walk_safe(&hitlist, skb, next_skb) {
+ if (skb->scm_io_uring) {
+ __skb_unlink(skb, &hitlist);
+ skb_queue_tail(&skb->sk->sk_receive_queue, skb);
+ }
+ }
+
/* Here we are. Hitlist is filled. Die. */
__skb_queue_purge(&hitlist);
spin_lock(&unix_gc_lock);
+ /* There could be io_uring registered files, just push them back to
+ * the inflight list
+ */
+ list_for_each_entry_safe(u, next, &gc_candidates, link)
+ list_move_tail(&u->link, &gc_inflight_list);
+
/* All candidates should have been detached by now. */
BUG_ON(!list_empty(&gc_candidates));
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
Possible dependencies:
3fb1bd688172 ("io_uring/net: handle -EINPROGRESS correct for IORING_OP_CONNECT")
f9ead18c1058 ("io_uring: split network related opcodes into its own file")
e0da14def1ee ("io_uring: move statx handling to its own file")
a9c210cebe13 ("io_uring: move epoll handler to its own file")
4cf90495281b ("io_uring: add a dummy -EOPNOTSUPP prep handler")
99f15d8d6136 ("io_uring: move uring_cmd handling to its own file")
cd40cae29ef8 ("io_uring: split out open/close operations")
453b329be5ea ("io_uring: separate out file table handling code")
f4c163dd7d4b ("io_uring: split out fadvise/madvise operations")
0d5847274037 ("io_uring: split out fs related sync/fallocate functions")
531113bbd5bf ("io_uring: split out splice related operations")
11aeb71406dd ("io_uring: split out filesystem related operations")
e28683bdfc2f ("io_uring: move nop into its own file")
5e2a18d93fec ("io_uring: move xattr related opcodes to its own file")
97b388d70b53 ("io_uring: handle completions in the core")
de23077eda61 ("io_uring: set completion results upfront")
e27f928ee1cb ("io_uring: add io_uring_types.h")
4d4c9cff4f70 ("io_uring: define a request type cleanup handler")
890968dc0336 ("io_uring: unify struct io_symlink and io_hardlink")
9a3a11f977f9 ("io_uring: convert iouring_cmd to io_cmd_type")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 3fb1bd68817288729179444caf1fd5c5c4d2d65d Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe(a)kernel.dk>
Date: Tue, 4 Oct 2022 20:29:48 -0600
Subject: [PATCH] io_uring/net: handle -EINPROGRESS correct for
IORING_OP_CONNECT
We treat EINPROGRESS like EAGAIN, but if we're retrying post getting
EINPROGRESS, then we just need to check the socket for errors and
terminate the request.
This was exposed on a bluetooth connection request which ends up
taking a while and hitting EINPROGRESS, and yields a CQE result of
-EBADFD because we're retrying a connect on a socket that is now
connected.
Cc: stable(a)vger.kernel.org
Fixes: 87f80d623c6c ("io_uring: handle connect -EINPROGRESS like -EAGAIN")
Link: https://github.com/axboe/liburing/issues/671
Reported-by: Aidan Sun <aidansun05(a)gmail.com>
Signed-off-by: Jens Axboe <axboe(a)kernel.dk>
diff --git a/io_uring/net.c b/io_uring/net.c
index caa6a803cb72..8c7226b5bf41 100644
--- a/io_uring/net.c
+++ b/io_uring/net.c
@@ -46,6 +46,7 @@ struct io_connect {
struct file *file;
struct sockaddr __user *addr;
int addr_len;
+ bool in_progress;
};
struct io_sr_msg {
@@ -1386,6 +1387,7 @@ int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
conn->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
conn->addr_len = READ_ONCE(sqe->addr2);
+ conn->in_progress = false;
return 0;
}
@@ -1397,6 +1399,16 @@ int io_connect(struct io_kiocb *req, unsigned int issue_flags)
int ret;
bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
+ if (connect->in_progress) {
+ struct socket *socket;
+
+ ret = -ENOTSOCK;
+ socket = sock_from_file(req->file);
+ if (socket)
+ ret = sock_error(socket->sk);
+ goto out;
+ }
+
if (req_has_async_data(req)) {
io = req->async_data;
} else {
@@ -1413,13 +1425,17 @@ int io_connect(struct io_kiocb *req, unsigned int issue_flags)
ret = __sys_connect_file(req->file, &io->address,
connect->addr_len, file_flags);
if ((ret == -EAGAIN || ret == -EINPROGRESS) && force_nonblock) {
- if (req_has_async_data(req))
- return -EAGAIN;
- if (io_alloc_async_data(req)) {
- ret = -ENOMEM;
- goto out;
+ if (ret == -EINPROGRESS) {
+ connect->in_progress = true;
+ } else {
+ if (req_has_async_data(req))
+ return -EAGAIN;
+ if (io_alloc_async_data(req)) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ memcpy(req->async_data, &__io, sizeof(__io));
}
- memcpy(req->async_data, &__io, sizeof(__io));
return -EAGAIN;
}
if (ret == -ERESTARTSYS)
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
Possible dependencies:
3fb1bd688172 ("io_uring/net: handle -EINPROGRESS correct for IORING_OP_CONNECT")
f9ead18c1058 ("io_uring: split network related opcodes into its own file")
e0da14def1ee ("io_uring: move statx handling to its own file")
a9c210cebe13 ("io_uring: move epoll handler to its own file")
4cf90495281b ("io_uring: add a dummy -EOPNOTSUPP prep handler")
99f15d8d6136 ("io_uring: move uring_cmd handling to its own file")
cd40cae29ef8 ("io_uring: split out open/close operations")
453b329be5ea ("io_uring: separate out file table handling code")
f4c163dd7d4b ("io_uring: split out fadvise/madvise operations")
0d5847274037 ("io_uring: split out fs related sync/fallocate functions")
531113bbd5bf ("io_uring: split out splice related operations")
11aeb71406dd ("io_uring: split out filesystem related operations")
e28683bdfc2f ("io_uring: move nop into its own file")
5e2a18d93fec ("io_uring: move xattr related opcodes to its own file")
97b388d70b53 ("io_uring: handle completions in the core")
de23077eda61 ("io_uring: set completion results upfront")
e27f928ee1cb ("io_uring: add io_uring_types.h")
4d4c9cff4f70 ("io_uring: define a request type cleanup handler")
890968dc0336 ("io_uring: unify struct io_symlink and io_hardlink")
9a3a11f977f9 ("io_uring: convert iouring_cmd to io_cmd_type")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 3fb1bd68817288729179444caf1fd5c5c4d2d65d Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe(a)kernel.dk>
Date: Tue, 4 Oct 2022 20:29:48 -0600
Subject: [PATCH] io_uring/net: handle -EINPROGRESS correct for
IORING_OP_CONNECT
We treat EINPROGRESS like EAGAIN, but if we're retrying post getting
EINPROGRESS, then we just need to check the socket for errors and
terminate the request.
This was exposed on a bluetooth connection request which ends up
taking a while and hitting EINPROGRESS, and yields a CQE result of
-EBADFD because we're retrying a connect on a socket that is now
connected.
Cc: stable(a)vger.kernel.org
Fixes: 87f80d623c6c ("io_uring: handle connect -EINPROGRESS like -EAGAIN")
Link: https://github.com/axboe/liburing/issues/671
Reported-by: Aidan Sun <aidansun05(a)gmail.com>
Signed-off-by: Jens Axboe <axboe(a)kernel.dk>
diff --git a/io_uring/net.c b/io_uring/net.c
index caa6a803cb72..8c7226b5bf41 100644
--- a/io_uring/net.c
+++ b/io_uring/net.c
@@ -46,6 +46,7 @@ struct io_connect {
struct file *file;
struct sockaddr __user *addr;
int addr_len;
+ bool in_progress;
};
struct io_sr_msg {
@@ -1386,6 +1387,7 @@ int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
conn->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
conn->addr_len = READ_ONCE(sqe->addr2);
+ conn->in_progress = false;
return 0;
}
@@ -1397,6 +1399,16 @@ int io_connect(struct io_kiocb *req, unsigned int issue_flags)
int ret;
bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
+ if (connect->in_progress) {
+ struct socket *socket;
+
+ ret = -ENOTSOCK;
+ socket = sock_from_file(req->file);
+ if (socket)
+ ret = sock_error(socket->sk);
+ goto out;
+ }
+
if (req_has_async_data(req)) {
io = req->async_data;
} else {
@@ -1413,13 +1425,17 @@ int io_connect(struct io_kiocb *req, unsigned int issue_flags)
ret = __sys_connect_file(req->file, &io->address,
connect->addr_len, file_flags);
if ((ret == -EAGAIN || ret == -EINPROGRESS) && force_nonblock) {
- if (req_has_async_data(req))
- return -EAGAIN;
- if (io_alloc_async_data(req)) {
- ret = -ENOMEM;
- goto out;
+ if (ret == -EINPROGRESS) {
+ connect->in_progress = true;
+ } else {
+ if (req_has_async_data(req))
+ return -EAGAIN;
+ if (io_alloc_async_data(req)) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ memcpy(req->async_data, &__io, sizeof(__io));
}
- memcpy(req->async_data, &__io, sizeof(__io));
return -EAGAIN;
}
if (ret == -ERESTARTSYS)
The patch below does not apply to the 6.0-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
Possible dependencies:
108893ddcc4d ("io_uring/net: fix notif cqe reordering")
6ae91ac9a6aa ("io_uring/net: don't skip notifs for failed requests")
a75155faef4e ("io_uring/net: fix UAF in io_sendrecv_fail()")
493108d95f14 ("io_uring/net: zerocopy sendmsg")
c4c0009e0b56 ("io_uring/net: combine fail handlers")
b0e9b5517eb1 ("io_uring/net: rename io_sendzc()")
516e82f0e043 ("io_uring/net: support non-zerocopy sendto")
6ae61b7aa2c7 ("io_uring/net: refactor io_setup_async_addr")
5693bcce892d ("io_uring/net: don't lose partial send_zc on fail")
7e6b638ed501 ("io_uring/net: don't lose partial send/recv on fail")
ac9e5784bbe7 ("io_uring/net: use io_sr_msg for sendzc")
0b048557db76 ("io_uring/net: refactor io_sr_msg types")
6bf8ad25fcd4 ("io_uring/net: io_async_msghdr caches for sendzc")
95eafc74be5e ("io_uring/net: reshuffle error handling")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 108893ddcc4d3aa0a4a02aeb02d478e997001227 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence(a)gmail.com>
Date: Thu, 29 Sep 2022 22:23:19 +0100
Subject: [PATCH] io_uring/net: fix notif cqe reordering
send zc is not restricted to !IO_URING_F_UNLOCKED anymore and so
we can't use task-tw ordering trick to order notification cqes
with requests completions. In this case leave it alone and let
io_send_zc_cleanup() flush it.
Cc: stable(a)vger.kernel.org
Fixes: 53bdc88aac9a2 ("io_uring/notif: order notif vs send CQEs")
Signed-off-by: Pavel Begunkov <asml.silence(a)gmail.com>
Link: https://lore.kernel.org/r/0031f3a00d492e814a4a0935a2029a46d9c9ba06.16644865…
Signed-off-by: Jens Axboe <axboe(a)kernel.dk>
diff --git a/io_uring/net.c b/io_uring/net.c
index 604eac5f7a34..caa6a803cb72 100644
--- a/io_uring/net.c
+++ b/io_uring/net.c
@@ -1127,8 +1127,14 @@ int io_send_zc(struct io_kiocb *req, unsigned int issue_flags)
else if (zc->done_io)
ret = zc->done_io;
- io_notif_flush(zc->notif);
- req->flags &= ~REQ_F_NEED_CLEANUP;
+ /*
+ * If we're in io-wq we can't rely on tw ordering guarantees, defer
+ * flushing notif to io_send_zc_cleanup()
+ */
+ if (!(issue_flags & IO_URING_F_UNLOCKED)) {
+ io_notif_flush(zc->notif);
+ req->flags &= ~REQ_F_NEED_CLEANUP;
+ }
io_req_set_res(req, ret, IORING_CQE_F_MORE);
return IOU_OK;
}
@@ -1182,8 +1188,10 @@ int io_sendmsg_zc(struct io_kiocb *req, unsigned int issue_flags)
req_set_fail(req);
}
/* fast path, check for non-NULL to avoid function call */
- if (kmsg->free_iov)
+ if (kmsg->free_iov) {
kfree(kmsg->free_iov);
+ kmsg->free_iov = NULL;
+ }
io_netmsg_recycle(req, issue_flags);
if (ret >= 0)
@@ -1191,8 +1199,14 @@ int io_sendmsg_zc(struct io_kiocb *req, unsigned int issue_flags)
else if (sr->done_io)
ret = sr->done_io;
- io_notif_flush(sr->notif);
- req->flags &= ~REQ_F_NEED_CLEANUP;
+ /*
+ * If we're in io-wq we can't rely on tw ordering guarantees, defer
+ * flushing notif to io_send_zc_cleanup()
+ */
+ if (!(issue_flags & IO_URING_F_UNLOCKED)) {
+ io_notif_flush(sr->notif);
+ req->flags &= ~REQ_F_NEED_CLEANUP;
+ }
io_req_set_res(req, ret, IORING_CQE_F_MORE);
return IOU_OK;
}