commit 003fb0a51162d940f25fc35e70b0996a12c9e08a upstream.
Requests to the mmc layer usually come through a block device IO.
The exceptions are the ioctl interface, RPMB chardev ioctl
and debugfs, which issue their own blk_mq requests through
blk_execute_rq and do not query the BLK_STS error but the
mmcblk-internal drv_op_result. This patch ensures that drv_op_result
defaults to an error and has to be overwritten by the operation
to be considered successful.
The behavior leads to a bug where the request never propagates
the error, e.g. by directly erroring out at mmc_blk_mq_issue_rq if
mmc_blk_part_switch fails. The ioctl caller of the rpmb chardev then
can never see an error (BLK_STS_IOERR, but drv_op_result is unchanged)
and thus may assume that their call executed successfully when it did not.
While always checking the blk_execute_rq return value would be
advised, let's eliminate the error by always setting
drv_op_result as -EIO to be overwritten on success (or other error)
Fixes: 614f0388f580 ("mmc: block: move single ioctl() commands to block requests")
Signed-off-by: Christian Loehle <cloehle(a)hyperstone.com>
---
This is for the 5.15. stable tree
drivers/mmc/core/block.c | 5 +++++
1 file changed, 5 insertions(+)
diff --git a/drivers/mmc/core/block.c b/drivers/mmc/core/block.c
index ed034b93cb25..0b72096f10e6 100644
--- a/drivers/mmc/core/block.c
+++ b/drivers/mmc/core/block.c
@@ -265,6 +265,7 @@ static ssize_t power_ro_lock_store(struct device *dev,
goto out_put;
}
req_to_mmc_queue_req(req)->drv_op = MMC_DRV_OP_BOOT_WP;
+ req_to_mmc_queue_req(req)->drv_op_result = -EIO;
blk_execute_rq(NULL, req, 0);
ret = req_to_mmc_queue_req(req)->drv_op_result;
blk_put_request(req);
@@ -656,6 +657,7 @@ static int mmc_blk_ioctl_cmd(struct mmc_blk_data *md,
idatas[0] = idata;
req_to_mmc_queue_req(req)->drv_op =
rpmb ? MMC_DRV_OP_IOCTL_RPMB : MMC_DRV_OP_IOCTL;
+ req_to_mmc_queue_req(req)->drv_op_result = -EIO;
req_to_mmc_queue_req(req)->drv_op_data = idatas;
req_to_mmc_queue_req(req)->ioc_count = 1;
blk_execute_rq(NULL, req, 0);
@@ -725,6 +727,7 @@ static int mmc_blk_ioctl_multi_cmd(struct mmc_blk_data *md,
}
req_to_mmc_queue_req(req)->drv_op =
rpmb ? MMC_DRV_OP_IOCTL_RPMB : MMC_DRV_OP_IOCTL;
+ req_to_mmc_queue_req(req)->drv_op_result = -EIO;
req_to_mmc_queue_req(req)->drv_op_data = idata;
req_to_mmc_queue_req(req)->ioc_count = num_of_cmds;
blk_execute_rq(NULL, req, 0);
@@ -2784,6 +2787,7 @@ static int mmc_dbg_card_status_get(void *data, u64 *val)
if (IS_ERR(req))
return PTR_ERR(req);
req_to_mmc_queue_req(req)->drv_op = MMC_DRV_OP_GET_CARD_STATUS;
+ req_to_mmc_queue_req(req)->drv_op_result = -EIO;
blk_execute_rq(NULL, req, 0);
ret = req_to_mmc_queue_req(req)->drv_op_result;
if (ret >= 0) {
@@ -2822,6 +2826,7 @@ static int mmc_ext_csd_open(struct inode *inode, struct file *filp)
goto out_free;
}
req_to_mmc_queue_req(req)->drv_op = MMC_DRV_OP_GET_EXT_CSD;
+ req_to_mmc_queue_req(req)->drv_op_result = -EIO;
req_to_mmc_queue_req(req)->drv_op_data = &ext_csd;
blk_execute_rq(NULL, req, 0);
err = req_to_mmc_queue_req(req)->drv_op_result;
--
2.37.3
The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-4.14.y
git checkout FETCH_HEAD
git cherry-pick -x 1240eb93f0616b21c675416516ff3d74798fdc97
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2023061939-sprout-jujitsu-b6a0@gregkh' --subject-prefix 'PATCH 4.14.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 1240eb93f0616b21c675416516ff3d74798fdc97 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo(a)netfilter.org>
Date: Thu, 8 Jun 2023 02:32:02 +0200
Subject: [PATCH] netfilter: nf_tables: incorrect error path handling with
NFT_MSG_NEWRULE
In case of error when adding a new rule that refers to an anonymous set,
deactivate expressions via NFT_TRANS_PREPARE state, not NFT_TRANS_RELEASE.
Thus, the lookup expression marks anonymous sets as inactive in the next
generation to ensure it is not reachable in this transaction anymore and
decrement the set refcount as introduced by c1592a89942e ("netfilter:
nf_tables: deactivate anonymous set from preparation phase"). The abort
step takes care of undoing the anonymous set.
This is also consistent with rule deletion, where NFT_TRANS_PREPARE is
used. Note that this error path is exercised in the preparation step of
the commit protocol. This patch replaces nf_tables_rule_release() by the
deactivate and destroy calls, this time with NFT_TRANS_PREPARE.
Due to this incorrect error handling, it is possible to access a
dangling pointer to the anonymous set that remains in the transaction
list.
[1009.379054] BUG: KASAN: use-after-free in nft_set_lookup_global+0x147/0x1a0 [nf_tables]
[1009.379106] Read of size 8 at addr ffff88816c4c8020 by task nft-rule-add/137110
[1009.379116] CPU: 7 PID: 137110 Comm: nft-rule-add Not tainted 6.4.0-rc4+ #256
[1009.379128] Call Trace:
[1009.379132] <TASK>
[1009.379135] dump_stack_lvl+0x33/0x50
[1009.379146] ? nft_set_lookup_global+0x147/0x1a0 [nf_tables]
[1009.379191] print_address_description.constprop.0+0x27/0x300
[1009.379201] kasan_report+0x107/0x120
[1009.379210] ? nft_set_lookup_global+0x147/0x1a0 [nf_tables]
[1009.379255] nft_set_lookup_global+0x147/0x1a0 [nf_tables]
[1009.379302] nft_lookup_init+0xa5/0x270 [nf_tables]
[1009.379350] nf_tables_newrule+0x698/0xe50 [nf_tables]
[1009.379397] ? nf_tables_rule_release+0xe0/0xe0 [nf_tables]
[1009.379441] ? kasan_unpoison+0x23/0x50
[1009.379450] nfnetlink_rcv_batch+0x97c/0xd90 [nfnetlink]
[1009.379470] ? nfnetlink_rcv_msg+0x480/0x480 [nfnetlink]
[1009.379485] ? __alloc_skb+0xb8/0x1e0
[1009.379493] ? __alloc_skb+0xb8/0x1e0
[1009.379502] ? entry_SYSCALL_64_after_hwframe+0x46/0xb0
[1009.379509] ? unwind_get_return_address+0x2a/0x40
[1009.379517] ? write_profile+0xc0/0xc0
[1009.379524] ? avc_lookup+0x8f/0xc0
[1009.379532] ? __rcu_read_unlock+0x43/0x60
Fixes: 958bee14d071 ("netfilter: nf_tables: use new transaction infrastructure to handle sets")
Signed-off-by: Pablo Neira Ayuso <pablo(a)netfilter.org>
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 3bb0800b3849..69bceefaa5c8 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -3844,7 +3844,8 @@ static int nf_tables_newrule(struct sk_buff *skb, const struct nfnl_info *info,
if (flow)
nft_flow_rule_destroy(flow);
err_release_rule:
- nf_tables_rule_release(&ctx, rule);
+ nft_rule_expr_deactivate(&ctx, rule, NFT_TRANS_PREPARE);
+ nf_tables_rule_destroy(&ctx, rule);
err_release_expr:
for (i = 0; i < n; i++) {
if (expr_info[i].ops) {
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-4.19.y
git checkout FETCH_HEAD
git cherry-pick -x 1240eb93f0616b21c675416516ff3d74798fdc97
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2023061937-spiritism-reliably-6082@gregkh' --subject-prefix 'PATCH 4.19.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 1240eb93f0616b21c675416516ff3d74798fdc97 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo(a)netfilter.org>
Date: Thu, 8 Jun 2023 02:32:02 +0200
Subject: [PATCH] netfilter: nf_tables: incorrect error path handling with
NFT_MSG_NEWRULE
In case of error when adding a new rule that refers to an anonymous set,
deactivate expressions via NFT_TRANS_PREPARE state, not NFT_TRANS_RELEASE.
Thus, the lookup expression marks anonymous sets as inactive in the next
generation to ensure it is not reachable in this transaction anymore and
decrement the set refcount as introduced by c1592a89942e ("netfilter:
nf_tables: deactivate anonymous set from preparation phase"). The abort
step takes care of undoing the anonymous set.
This is also consistent with rule deletion, where NFT_TRANS_PREPARE is
used. Note that this error path is exercised in the preparation step of
the commit protocol. This patch replaces nf_tables_rule_release() by the
deactivate and destroy calls, this time with NFT_TRANS_PREPARE.
Due to this incorrect error handling, it is possible to access a
dangling pointer to the anonymous set that remains in the transaction
list.
[1009.379054] BUG: KASAN: use-after-free in nft_set_lookup_global+0x147/0x1a0 [nf_tables]
[1009.379106] Read of size 8 at addr ffff88816c4c8020 by task nft-rule-add/137110
[1009.379116] CPU: 7 PID: 137110 Comm: nft-rule-add Not tainted 6.4.0-rc4+ #256
[1009.379128] Call Trace:
[1009.379132] <TASK>
[1009.379135] dump_stack_lvl+0x33/0x50
[1009.379146] ? nft_set_lookup_global+0x147/0x1a0 [nf_tables]
[1009.379191] print_address_description.constprop.0+0x27/0x300
[1009.379201] kasan_report+0x107/0x120
[1009.379210] ? nft_set_lookup_global+0x147/0x1a0 [nf_tables]
[1009.379255] nft_set_lookup_global+0x147/0x1a0 [nf_tables]
[1009.379302] nft_lookup_init+0xa5/0x270 [nf_tables]
[1009.379350] nf_tables_newrule+0x698/0xe50 [nf_tables]
[1009.379397] ? nf_tables_rule_release+0xe0/0xe0 [nf_tables]
[1009.379441] ? kasan_unpoison+0x23/0x50
[1009.379450] nfnetlink_rcv_batch+0x97c/0xd90 [nfnetlink]
[1009.379470] ? nfnetlink_rcv_msg+0x480/0x480 [nfnetlink]
[1009.379485] ? __alloc_skb+0xb8/0x1e0
[1009.379493] ? __alloc_skb+0xb8/0x1e0
[1009.379502] ? entry_SYSCALL_64_after_hwframe+0x46/0xb0
[1009.379509] ? unwind_get_return_address+0x2a/0x40
[1009.379517] ? write_profile+0xc0/0xc0
[1009.379524] ? avc_lookup+0x8f/0xc0
[1009.379532] ? __rcu_read_unlock+0x43/0x60
Fixes: 958bee14d071 ("netfilter: nf_tables: use new transaction infrastructure to handle sets")
Signed-off-by: Pablo Neira Ayuso <pablo(a)netfilter.org>
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 3bb0800b3849..69bceefaa5c8 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -3844,7 +3844,8 @@ static int nf_tables_newrule(struct sk_buff *skb, const struct nfnl_info *info,
if (flow)
nft_flow_rule_destroy(flow);
err_release_rule:
- nf_tables_rule_release(&ctx, rule);
+ nft_rule_expr_deactivate(&ctx, rule, NFT_TRANS_PREPARE);
+ nf_tables_rule_destroy(&ctx, rule);
err_release_expr:
for (i = 0; i < n; i++) {
if (expr_info[i].ops) {
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.4.y
git checkout FETCH_HEAD
git cherry-pick -x 1240eb93f0616b21c675416516ff3d74798fdc97
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2023061935-renewed-granite-7529@gregkh' --subject-prefix 'PATCH 5.4.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 1240eb93f0616b21c675416516ff3d74798fdc97 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo(a)netfilter.org>
Date: Thu, 8 Jun 2023 02:32:02 +0200
Subject: [PATCH] netfilter: nf_tables: incorrect error path handling with
NFT_MSG_NEWRULE
In case of error when adding a new rule that refers to an anonymous set,
deactivate expressions via NFT_TRANS_PREPARE state, not NFT_TRANS_RELEASE.
Thus, the lookup expression marks anonymous sets as inactive in the next
generation to ensure it is not reachable in this transaction anymore and
decrement the set refcount as introduced by c1592a89942e ("netfilter:
nf_tables: deactivate anonymous set from preparation phase"). The abort
step takes care of undoing the anonymous set.
This is also consistent with rule deletion, where NFT_TRANS_PREPARE is
used. Note that this error path is exercised in the preparation step of
the commit protocol. This patch replaces nf_tables_rule_release() by the
deactivate and destroy calls, this time with NFT_TRANS_PREPARE.
Due to this incorrect error handling, it is possible to access a
dangling pointer to the anonymous set that remains in the transaction
list.
[1009.379054] BUG: KASAN: use-after-free in nft_set_lookup_global+0x147/0x1a0 [nf_tables]
[1009.379106] Read of size 8 at addr ffff88816c4c8020 by task nft-rule-add/137110
[1009.379116] CPU: 7 PID: 137110 Comm: nft-rule-add Not tainted 6.4.0-rc4+ #256
[1009.379128] Call Trace:
[1009.379132] <TASK>
[1009.379135] dump_stack_lvl+0x33/0x50
[1009.379146] ? nft_set_lookup_global+0x147/0x1a0 [nf_tables]
[1009.379191] print_address_description.constprop.0+0x27/0x300
[1009.379201] kasan_report+0x107/0x120
[1009.379210] ? nft_set_lookup_global+0x147/0x1a0 [nf_tables]
[1009.379255] nft_set_lookup_global+0x147/0x1a0 [nf_tables]
[1009.379302] nft_lookup_init+0xa5/0x270 [nf_tables]
[1009.379350] nf_tables_newrule+0x698/0xe50 [nf_tables]
[1009.379397] ? nf_tables_rule_release+0xe0/0xe0 [nf_tables]
[1009.379441] ? kasan_unpoison+0x23/0x50
[1009.379450] nfnetlink_rcv_batch+0x97c/0xd90 [nfnetlink]
[1009.379470] ? nfnetlink_rcv_msg+0x480/0x480 [nfnetlink]
[1009.379485] ? __alloc_skb+0xb8/0x1e0
[1009.379493] ? __alloc_skb+0xb8/0x1e0
[1009.379502] ? entry_SYSCALL_64_after_hwframe+0x46/0xb0
[1009.379509] ? unwind_get_return_address+0x2a/0x40
[1009.379517] ? write_profile+0xc0/0xc0
[1009.379524] ? avc_lookup+0x8f/0xc0
[1009.379532] ? __rcu_read_unlock+0x43/0x60
Fixes: 958bee14d071 ("netfilter: nf_tables: use new transaction infrastructure to handle sets")
Signed-off-by: Pablo Neira Ayuso <pablo(a)netfilter.org>
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 3bb0800b3849..69bceefaa5c8 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -3844,7 +3844,8 @@ static int nf_tables_newrule(struct sk_buff *skb, const struct nfnl_info *info,
if (flow)
nft_flow_rule_destroy(flow);
err_release_rule:
- nf_tables_rule_release(&ctx, rule);
+ nft_rule_expr_deactivate(&ctx, rule, NFT_TRANS_PREPARE);
+ nf_tables_rule_destroy(&ctx, rule);
err_release_expr:
for (i = 0; i < n; i++) {
if (expr_info[i].ops) {
From: Michael Ellerman <mpe(a)ellerman.id.au>
Our logic for choosing defconfig doesn't work well in some situations.
For example if you're on a ppc64le machine but you specify a non-empty
CROSS_COMPILE, in order to use a non-default toolchain, then defconfig
will give you ppc64_defconfig (big endian):
$ make CROSS_COMPILE=~/toolchains/gcc-8/bin/powerpc-linux- defconfig
*** Default configuration is based on 'ppc64_defconfig'
This is because we assume that CROSS_COMPILE being set means we
can't be on a ppc machine and rather than checking we just default to
ppc64_defconfig.
We should just ignore CROSS_COMPILE, instead check the machine with
uname and if it's one of ppc, ppc64 or ppc64le then use that
defconfig. If it's none of those then we fall back to ppc64_defconfig.
Signed-off-by: Michael Ellerman <mpe(a)ellerman.id.au>
(cherry picked from commit af5cd05de5dd38cf25d14ea4d30ae9b791d2420b)
Signed-off-by: Alyssa Ross <hi(a)alyssa.is>
---
arch/powerpc/Makefile | 9 ++++-----
1 file changed, 4 insertions(+), 5 deletions(-)
diff --git a/arch/powerpc/Makefile b/arch/powerpc/Makefile
index 9c78ef298257..cbc7c05a6165 100644
--- a/arch/powerpc/Makefile
+++ b/arch/powerpc/Makefile
@@ -29,11 +29,10 @@ endif
export CROSS32CC CROSS32AR
-ifeq ($(CROSS_COMPILE),)
-KBUILD_DEFCONFIG := $(shell uname -m)_defconfig
-else
-KBUILD_DEFCONFIG := ppc64_defconfig
-endif
+# If we're on a ppc/ppc64/ppc64le machine use that defconfig, otherwise just use
+# ppc64_defconfig because we have nothing better to go on.
+uname := $(shell uname -m)
+KBUILD_DEFCONFIG := $(if $(filter ppc%,$(uname)),$(uname),ppc64)_defconfig
ifeq ($(CONFIG_PPC64),y)
new_nm := $(shell if $(NM) --help 2>&1 | grep -- '--synthetic' > /dev/null; then echo y; else echo n; fi)
base-commit: 1914956342c8cf52a377aecc4944e63f9229cb9b
--
2.37.1
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.4.y
git checkout FETCH_HEAD
git cherry-pick -x 84ad0af0bccd3691cb951c2974c5cb2c10594d4a
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2023061948-emerald-clamor-35c8@gregkh' --subject-prefix 'PATCH 5.4.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 84ad0af0bccd3691cb951c2974c5cb2c10594d4a Mon Sep 17 00:00:00 2001
From: Peilin Ye <peilin.ye(a)bytedance.com>
Date: Sat, 10 Jun 2023 20:30:25 -0700
Subject: [PATCH] net/sched: qdisc_destroy() old ingress and clsact Qdiscs
before grafting
mini_Qdisc_pair::p_miniq is a double pointer to mini_Qdisc, initialized
in ingress_init() to point to net_device::miniq_ingress. ingress Qdiscs
access this per-net_device pointer in mini_qdisc_pair_swap(). Similar
for clsact Qdiscs and miniq_egress.
Unfortunately, after introducing RTNL-unlocked RTM_{NEW,DEL,GET}TFILTER
requests (thanks Hillf Danton for the hint), when replacing ingress or
clsact Qdiscs, for example, the old Qdisc ("@old") could access the same
miniq_{in,e}gress pointer(s) concurrently with the new Qdisc ("@new"),
causing race conditions [1] including a use-after-free bug in
mini_qdisc_pair_swap() reported by syzbot:
BUG: KASAN: slab-use-after-free in mini_qdisc_pair_swap+0x1c2/0x1f0 net/sched/sch_generic.c:1573
Write of size 8 at addr ffff888045b31308 by task syz-executor690/14901
...
Call Trace:
<TASK>
__dump_stack lib/dump_stack.c:88 [inline]
dump_stack_lvl+0xd9/0x150 lib/dump_stack.c:106
print_address_description.constprop.0+0x2c/0x3c0 mm/kasan/report.c:319
print_report mm/kasan/report.c:430 [inline]
kasan_report+0x11c/0x130 mm/kasan/report.c:536
mini_qdisc_pair_swap+0x1c2/0x1f0 net/sched/sch_generic.c:1573
tcf_chain_head_change_item net/sched/cls_api.c:495 [inline]
tcf_chain0_head_change.isra.0+0xb9/0x120 net/sched/cls_api.c:509
tcf_chain_tp_insert net/sched/cls_api.c:1826 [inline]
tcf_chain_tp_insert_unique net/sched/cls_api.c:1875 [inline]
tc_new_tfilter+0x1de6/0x2290 net/sched/cls_api.c:2266
...
@old and @new should not affect each other. In other words, @old should
never modify miniq_{in,e}gress after @new, and @new should not update
@old's RCU state.
Fixing without changing sch_api.c turned out to be difficult (please
refer to Closes: for discussions). Instead, make sure @new's first call
always happen after @old's last call (in {ingress,clsact}_destroy()) has
finished:
In qdisc_graft(), return -EBUSY if @old has any ongoing filter requests,
and call qdisc_destroy() for @old before grafting @new.
Introduce qdisc_refcount_dec_if_one() as the counterpart of
qdisc_refcount_inc_nz() used for filter requests. Introduce a
non-static version of qdisc_destroy() that does a TCQ_F_BUILTIN check,
just like qdisc_put() etc.
Depends on patch "net/sched: Refactor qdisc_graft() for ingress and
clsact Qdiscs".
[1] To illustrate, the syzkaller reproducer adds ingress Qdiscs under
TC_H_ROOT (no longer possible after commit c7cfbd115001 ("net/sched:
sch_ingress: Only create under TC_H_INGRESS")) on eth0 that has 8
transmission queues:
Thread 1 creates ingress Qdisc A (containing mini Qdisc a1 and a2),
then adds a flower filter X to A.
Thread 2 creates another ingress Qdisc B (containing mini Qdisc b1 and
b2) to replace A, then adds a flower filter Y to B.
Thread 1 A's refcnt Thread 2
RTM_NEWQDISC (A, RTNL-locked)
qdisc_create(A) 1
qdisc_graft(A) 9
RTM_NEWTFILTER (X, RTNL-unlocked)
__tcf_qdisc_find(A) 10
tcf_chain0_head_change(A)
mini_qdisc_pair_swap(A) (1st)
|
| RTM_NEWQDISC (B, RTNL-locked)
RCU sync 2 qdisc_graft(B)
| 1 notify_and_destroy(A)
|
tcf_block_release(A) 0 RTM_NEWTFILTER (Y, RTNL-unlocked)
qdisc_destroy(A) tcf_chain0_head_change(B)
tcf_chain0_head_change_cb_del(A) mini_qdisc_pair_swap(B) (2nd)
mini_qdisc_pair_swap(A) (3rd) |
... ...
Here, B calls mini_qdisc_pair_swap(), pointing eth0->miniq_ingress to
its mini Qdisc, b1. Then, A calls mini_qdisc_pair_swap() again during
ingress_destroy(), setting eth0->miniq_ingress to NULL, so ingress
packets on eth0 will not find filter Y in sch_handle_ingress().
This is just one of the possible consequences of concurrently accessing
miniq_{in,e}gress pointers.
Fixes: 7a096d579e8e ("net: sched: ingress: set 'unlocked' flag for Qdisc ops")
Fixes: 87f373921c4e ("net: sched: ingress: set 'unlocked' flag for clsact Qdisc ops")
Reported-by: syzbot+b53a9c0d1ea4ad62da8b(a)syzkaller.appspotmail.com
Closes: https://lore.kernel.org/r/0000000000006cf87705f79acf1a@google.com/
Cc: Hillf Danton <hdanton(a)sina.com>
Cc: Vlad Buslov <vladbu(a)mellanox.com>
Signed-off-by: Peilin Ye <peilin.ye(a)bytedance.com>
Acked-by: Jamal Hadi Salim <jhs(a)mojatatu.com>
Signed-off-by: Paolo Abeni <pabeni(a)redhat.com>
diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 27271f2b37cb..12eadecf8cd0 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -137,6 +137,13 @@ static inline void qdisc_refcount_inc(struct Qdisc *qdisc)
refcount_inc(&qdisc->refcnt);
}
+static inline bool qdisc_refcount_dec_if_one(struct Qdisc *qdisc)
+{
+ if (qdisc->flags & TCQ_F_BUILTIN)
+ return true;
+ return refcount_dec_if_one(&qdisc->refcnt);
+}
+
/* Intended to be used by unlocked users, when concurrent qdisc release is
* possible.
*/
@@ -652,6 +659,7 @@ void dev_deactivate_many(struct list_head *head);
struct Qdisc *dev_graft_qdisc(struct netdev_queue *dev_queue,
struct Qdisc *qdisc);
void qdisc_reset(struct Qdisc *qdisc);
+void qdisc_destroy(struct Qdisc *qdisc);
void qdisc_put(struct Qdisc *qdisc);
void qdisc_put_unlocked(struct Qdisc *qdisc);
void qdisc_tree_reduce_backlog(struct Qdisc *qdisc, int n, int len);
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index 094ca3a5b633..aa6b1fe65151 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -1086,10 +1086,22 @@ static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
if ((q && q->flags & TCQ_F_INGRESS) ||
(new && new->flags & TCQ_F_INGRESS)) {
ingress = 1;
- if (!dev_ingress_queue(dev)) {
+ dev_queue = dev_ingress_queue(dev);
+ if (!dev_queue) {
NL_SET_ERR_MSG(extack, "Device does not have an ingress queue");
return -ENOENT;
}
+
+ q = rtnl_dereference(dev_queue->qdisc_sleeping);
+
+ /* This is the counterpart of that qdisc_refcount_inc_nz() call in
+ * __tcf_qdisc_find() for filter requests.
+ */
+ if (!qdisc_refcount_dec_if_one(q)) {
+ NL_SET_ERR_MSG(extack,
+ "Current ingress or clsact Qdisc has ongoing filter requests");
+ return -EBUSY;
+ }
}
if (dev->flags & IFF_UP)
@@ -1110,8 +1122,16 @@ static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
qdisc_put(old);
}
} else {
- dev_queue = dev_ingress_queue(dev);
- old = dev_graft_qdisc(dev_queue, new);
+ old = dev_graft_qdisc(dev_queue, NULL);
+
+ /* {ingress,clsact}_destroy() @old before grafting @new to avoid
+ * unprotected concurrent accesses to net_device::miniq_{in,e}gress
+ * pointer(s) in mini_qdisc_pair_swap().
+ */
+ qdisc_notify(net, skb, n, classid, old, new, extack);
+ qdisc_destroy(old);
+
+ dev_graft_qdisc(dev_queue, new);
}
skip:
@@ -1125,8 +1145,6 @@ static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
if (new && new->ops->attach)
new->ops->attach(new);
- } else {
- notify_and_destroy(net, skb, n, classid, old, new, extack);
}
if (dev->flags & IFF_UP)
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 3248259eba32..5d7e23f4cc0e 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -1046,7 +1046,7 @@ static void qdisc_free_cb(struct rcu_head *head)
qdisc_free(q);
}
-static void qdisc_destroy(struct Qdisc *qdisc)
+static void __qdisc_destroy(struct Qdisc *qdisc)
{
const struct Qdisc_ops *ops = qdisc->ops;
@@ -1070,6 +1070,14 @@ static void qdisc_destroy(struct Qdisc *qdisc)
call_rcu(&qdisc->rcu, qdisc_free_cb);
}
+void qdisc_destroy(struct Qdisc *qdisc)
+{
+ if (qdisc->flags & TCQ_F_BUILTIN)
+ return;
+
+ __qdisc_destroy(qdisc);
+}
+
void qdisc_put(struct Qdisc *qdisc)
{
if (!qdisc)
@@ -1079,7 +1087,7 @@ void qdisc_put(struct Qdisc *qdisc)
!refcount_dec_and_test(&qdisc->refcnt))
return;
- qdisc_destroy(qdisc);
+ __qdisc_destroy(qdisc);
}
EXPORT_SYMBOL(qdisc_put);
@@ -1094,7 +1102,7 @@ void qdisc_put_unlocked(struct Qdisc *qdisc)
!refcount_dec_and_rtnl_lock(&qdisc->refcnt))
return;
- qdisc_destroy(qdisc);
+ __qdisc_destroy(qdisc);
rtnl_unlock();
}
EXPORT_SYMBOL(qdisc_put_unlocked);
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.10.y
git checkout FETCH_HEAD
git cherry-pick -x 84ad0af0bccd3691cb951c2974c5cb2c10594d4a
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2023061945-polish-remorse-02cf@gregkh' --subject-prefix 'PATCH 5.10.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 84ad0af0bccd3691cb951c2974c5cb2c10594d4a Mon Sep 17 00:00:00 2001
From: Peilin Ye <peilin.ye(a)bytedance.com>
Date: Sat, 10 Jun 2023 20:30:25 -0700
Subject: [PATCH] net/sched: qdisc_destroy() old ingress and clsact Qdiscs
before grafting
mini_Qdisc_pair::p_miniq is a double pointer to mini_Qdisc, initialized
in ingress_init() to point to net_device::miniq_ingress. ingress Qdiscs
access this per-net_device pointer in mini_qdisc_pair_swap(). Similar
for clsact Qdiscs and miniq_egress.
Unfortunately, after introducing RTNL-unlocked RTM_{NEW,DEL,GET}TFILTER
requests (thanks Hillf Danton for the hint), when replacing ingress or
clsact Qdiscs, for example, the old Qdisc ("@old") could access the same
miniq_{in,e}gress pointer(s) concurrently with the new Qdisc ("@new"),
causing race conditions [1] including a use-after-free bug in
mini_qdisc_pair_swap() reported by syzbot:
BUG: KASAN: slab-use-after-free in mini_qdisc_pair_swap+0x1c2/0x1f0 net/sched/sch_generic.c:1573
Write of size 8 at addr ffff888045b31308 by task syz-executor690/14901
...
Call Trace:
<TASK>
__dump_stack lib/dump_stack.c:88 [inline]
dump_stack_lvl+0xd9/0x150 lib/dump_stack.c:106
print_address_description.constprop.0+0x2c/0x3c0 mm/kasan/report.c:319
print_report mm/kasan/report.c:430 [inline]
kasan_report+0x11c/0x130 mm/kasan/report.c:536
mini_qdisc_pair_swap+0x1c2/0x1f0 net/sched/sch_generic.c:1573
tcf_chain_head_change_item net/sched/cls_api.c:495 [inline]
tcf_chain0_head_change.isra.0+0xb9/0x120 net/sched/cls_api.c:509
tcf_chain_tp_insert net/sched/cls_api.c:1826 [inline]
tcf_chain_tp_insert_unique net/sched/cls_api.c:1875 [inline]
tc_new_tfilter+0x1de6/0x2290 net/sched/cls_api.c:2266
...
@old and @new should not affect each other. In other words, @old should
never modify miniq_{in,e}gress after @new, and @new should not update
@old's RCU state.
Fixing without changing sch_api.c turned out to be difficult (please
refer to Closes: for discussions). Instead, make sure @new's first call
always happen after @old's last call (in {ingress,clsact}_destroy()) has
finished:
In qdisc_graft(), return -EBUSY if @old has any ongoing filter requests,
and call qdisc_destroy() for @old before grafting @new.
Introduce qdisc_refcount_dec_if_one() as the counterpart of
qdisc_refcount_inc_nz() used for filter requests. Introduce a
non-static version of qdisc_destroy() that does a TCQ_F_BUILTIN check,
just like qdisc_put() etc.
Depends on patch "net/sched: Refactor qdisc_graft() for ingress and
clsact Qdiscs".
[1] To illustrate, the syzkaller reproducer adds ingress Qdiscs under
TC_H_ROOT (no longer possible after commit c7cfbd115001 ("net/sched:
sch_ingress: Only create under TC_H_INGRESS")) on eth0 that has 8
transmission queues:
Thread 1 creates ingress Qdisc A (containing mini Qdisc a1 and a2),
then adds a flower filter X to A.
Thread 2 creates another ingress Qdisc B (containing mini Qdisc b1 and
b2) to replace A, then adds a flower filter Y to B.
Thread 1 A's refcnt Thread 2
RTM_NEWQDISC (A, RTNL-locked)
qdisc_create(A) 1
qdisc_graft(A) 9
RTM_NEWTFILTER (X, RTNL-unlocked)
__tcf_qdisc_find(A) 10
tcf_chain0_head_change(A)
mini_qdisc_pair_swap(A) (1st)
|
| RTM_NEWQDISC (B, RTNL-locked)
RCU sync 2 qdisc_graft(B)
| 1 notify_and_destroy(A)
|
tcf_block_release(A) 0 RTM_NEWTFILTER (Y, RTNL-unlocked)
qdisc_destroy(A) tcf_chain0_head_change(B)
tcf_chain0_head_change_cb_del(A) mini_qdisc_pair_swap(B) (2nd)
mini_qdisc_pair_swap(A) (3rd) |
... ...
Here, B calls mini_qdisc_pair_swap(), pointing eth0->miniq_ingress to
its mini Qdisc, b1. Then, A calls mini_qdisc_pair_swap() again during
ingress_destroy(), setting eth0->miniq_ingress to NULL, so ingress
packets on eth0 will not find filter Y in sch_handle_ingress().
This is just one of the possible consequences of concurrently accessing
miniq_{in,e}gress pointers.
Fixes: 7a096d579e8e ("net: sched: ingress: set 'unlocked' flag for Qdisc ops")
Fixes: 87f373921c4e ("net: sched: ingress: set 'unlocked' flag for clsact Qdisc ops")
Reported-by: syzbot+b53a9c0d1ea4ad62da8b(a)syzkaller.appspotmail.com
Closes: https://lore.kernel.org/r/0000000000006cf87705f79acf1a@google.com/
Cc: Hillf Danton <hdanton(a)sina.com>
Cc: Vlad Buslov <vladbu(a)mellanox.com>
Signed-off-by: Peilin Ye <peilin.ye(a)bytedance.com>
Acked-by: Jamal Hadi Salim <jhs(a)mojatatu.com>
Signed-off-by: Paolo Abeni <pabeni(a)redhat.com>
diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 27271f2b37cb..12eadecf8cd0 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -137,6 +137,13 @@ static inline void qdisc_refcount_inc(struct Qdisc *qdisc)
refcount_inc(&qdisc->refcnt);
}
+static inline bool qdisc_refcount_dec_if_one(struct Qdisc *qdisc)
+{
+ if (qdisc->flags & TCQ_F_BUILTIN)
+ return true;
+ return refcount_dec_if_one(&qdisc->refcnt);
+}
+
/* Intended to be used by unlocked users, when concurrent qdisc release is
* possible.
*/
@@ -652,6 +659,7 @@ void dev_deactivate_many(struct list_head *head);
struct Qdisc *dev_graft_qdisc(struct netdev_queue *dev_queue,
struct Qdisc *qdisc);
void qdisc_reset(struct Qdisc *qdisc);
+void qdisc_destroy(struct Qdisc *qdisc);
void qdisc_put(struct Qdisc *qdisc);
void qdisc_put_unlocked(struct Qdisc *qdisc);
void qdisc_tree_reduce_backlog(struct Qdisc *qdisc, int n, int len);
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index 094ca3a5b633..aa6b1fe65151 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -1086,10 +1086,22 @@ static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
if ((q && q->flags & TCQ_F_INGRESS) ||
(new && new->flags & TCQ_F_INGRESS)) {
ingress = 1;
- if (!dev_ingress_queue(dev)) {
+ dev_queue = dev_ingress_queue(dev);
+ if (!dev_queue) {
NL_SET_ERR_MSG(extack, "Device does not have an ingress queue");
return -ENOENT;
}
+
+ q = rtnl_dereference(dev_queue->qdisc_sleeping);
+
+ /* This is the counterpart of that qdisc_refcount_inc_nz() call in
+ * __tcf_qdisc_find() for filter requests.
+ */
+ if (!qdisc_refcount_dec_if_one(q)) {
+ NL_SET_ERR_MSG(extack,
+ "Current ingress or clsact Qdisc has ongoing filter requests");
+ return -EBUSY;
+ }
}
if (dev->flags & IFF_UP)
@@ -1110,8 +1122,16 @@ static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
qdisc_put(old);
}
} else {
- dev_queue = dev_ingress_queue(dev);
- old = dev_graft_qdisc(dev_queue, new);
+ old = dev_graft_qdisc(dev_queue, NULL);
+
+ /* {ingress,clsact}_destroy() @old before grafting @new to avoid
+ * unprotected concurrent accesses to net_device::miniq_{in,e}gress
+ * pointer(s) in mini_qdisc_pair_swap().
+ */
+ qdisc_notify(net, skb, n, classid, old, new, extack);
+ qdisc_destroy(old);
+
+ dev_graft_qdisc(dev_queue, new);
}
skip:
@@ -1125,8 +1145,6 @@ static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
if (new && new->ops->attach)
new->ops->attach(new);
- } else {
- notify_and_destroy(net, skb, n, classid, old, new, extack);
}
if (dev->flags & IFF_UP)
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 3248259eba32..5d7e23f4cc0e 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -1046,7 +1046,7 @@ static void qdisc_free_cb(struct rcu_head *head)
qdisc_free(q);
}
-static void qdisc_destroy(struct Qdisc *qdisc)
+static void __qdisc_destroy(struct Qdisc *qdisc)
{
const struct Qdisc_ops *ops = qdisc->ops;
@@ -1070,6 +1070,14 @@ static void qdisc_destroy(struct Qdisc *qdisc)
call_rcu(&qdisc->rcu, qdisc_free_cb);
}
+void qdisc_destroy(struct Qdisc *qdisc)
+{
+ if (qdisc->flags & TCQ_F_BUILTIN)
+ return;
+
+ __qdisc_destroy(qdisc);
+}
+
void qdisc_put(struct Qdisc *qdisc)
{
if (!qdisc)
@@ -1079,7 +1087,7 @@ void qdisc_put(struct Qdisc *qdisc)
!refcount_dec_and_test(&qdisc->refcnt))
return;
- qdisc_destroy(qdisc);
+ __qdisc_destroy(qdisc);
}
EXPORT_SYMBOL(qdisc_put);
@@ -1094,7 +1102,7 @@ void qdisc_put_unlocked(struct Qdisc *qdisc)
!refcount_dec_and_rtnl_lock(&qdisc->refcnt))
return;
- qdisc_destroy(qdisc);
+ __qdisc_destroy(qdisc);
rtnl_unlock();
}
EXPORT_SYMBOL(qdisc_put_unlocked);
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.15.y
git checkout FETCH_HEAD
git cherry-pick -x 84ad0af0bccd3691cb951c2974c5cb2c10594d4a
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2023061941-anaerobic-washing-b481@gregkh' --subject-prefix 'PATCH 5.15.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 84ad0af0bccd3691cb951c2974c5cb2c10594d4a Mon Sep 17 00:00:00 2001
From: Peilin Ye <peilin.ye(a)bytedance.com>
Date: Sat, 10 Jun 2023 20:30:25 -0700
Subject: [PATCH] net/sched: qdisc_destroy() old ingress and clsact Qdiscs
before grafting
mini_Qdisc_pair::p_miniq is a double pointer to mini_Qdisc, initialized
in ingress_init() to point to net_device::miniq_ingress. ingress Qdiscs
access this per-net_device pointer in mini_qdisc_pair_swap(). Similar
for clsact Qdiscs and miniq_egress.
Unfortunately, after introducing RTNL-unlocked RTM_{NEW,DEL,GET}TFILTER
requests (thanks Hillf Danton for the hint), when replacing ingress or
clsact Qdiscs, for example, the old Qdisc ("@old") could access the same
miniq_{in,e}gress pointer(s) concurrently with the new Qdisc ("@new"),
causing race conditions [1] including a use-after-free bug in
mini_qdisc_pair_swap() reported by syzbot:
BUG: KASAN: slab-use-after-free in mini_qdisc_pair_swap+0x1c2/0x1f0 net/sched/sch_generic.c:1573
Write of size 8 at addr ffff888045b31308 by task syz-executor690/14901
...
Call Trace:
<TASK>
__dump_stack lib/dump_stack.c:88 [inline]
dump_stack_lvl+0xd9/0x150 lib/dump_stack.c:106
print_address_description.constprop.0+0x2c/0x3c0 mm/kasan/report.c:319
print_report mm/kasan/report.c:430 [inline]
kasan_report+0x11c/0x130 mm/kasan/report.c:536
mini_qdisc_pair_swap+0x1c2/0x1f0 net/sched/sch_generic.c:1573
tcf_chain_head_change_item net/sched/cls_api.c:495 [inline]
tcf_chain0_head_change.isra.0+0xb9/0x120 net/sched/cls_api.c:509
tcf_chain_tp_insert net/sched/cls_api.c:1826 [inline]
tcf_chain_tp_insert_unique net/sched/cls_api.c:1875 [inline]
tc_new_tfilter+0x1de6/0x2290 net/sched/cls_api.c:2266
...
@old and @new should not affect each other. In other words, @old should
never modify miniq_{in,e}gress after @new, and @new should not update
@old's RCU state.
Fixing without changing sch_api.c turned out to be difficult (please
refer to Closes: for discussions). Instead, make sure @new's first call
always happen after @old's last call (in {ingress,clsact}_destroy()) has
finished:
In qdisc_graft(), return -EBUSY if @old has any ongoing filter requests,
and call qdisc_destroy() for @old before grafting @new.
Introduce qdisc_refcount_dec_if_one() as the counterpart of
qdisc_refcount_inc_nz() used for filter requests. Introduce a
non-static version of qdisc_destroy() that does a TCQ_F_BUILTIN check,
just like qdisc_put() etc.
Depends on patch "net/sched: Refactor qdisc_graft() for ingress and
clsact Qdiscs".
[1] To illustrate, the syzkaller reproducer adds ingress Qdiscs under
TC_H_ROOT (no longer possible after commit c7cfbd115001 ("net/sched:
sch_ingress: Only create under TC_H_INGRESS")) on eth0 that has 8
transmission queues:
Thread 1 creates ingress Qdisc A (containing mini Qdisc a1 and a2),
then adds a flower filter X to A.
Thread 2 creates another ingress Qdisc B (containing mini Qdisc b1 and
b2) to replace A, then adds a flower filter Y to B.
Thread 1 A's refcnt Thread 2
RTM_NEWQDISC (A, RTNL-locked)
qdisc_create(A) 1
qdisc_graft(A) 9
RTM_NEWTFILTER (X, RTNL-unlocked)
__tcf_qdisc_find(A) 10
tcf_chain0_head_change(A)
mini_qdisc_pair_swap(A) (1st)
|
| RTM_NEWQDISC (B, RTNL-locked)
RCU sync 2 qdisc_graft(B)
| 1 notify_and_destroy(A)
|
tcf_block_release(A) 0 RTM_NEWTFILTER (Y, RTNL-unlocked)
qdisc_destroy(A) tcf_chain0_head_change(B)
tcf_chain0_head_change_cb_del(A) mini_qdisc_pair_swap(B) (2nd)
mini_qdisc_pair_swap(A) (3rd) |
... ...
Here, B calls mini_qdisc_pair_swap(), pointing eth0->miniq_ingress to
its mini Qdisc, b1. Then, A calls mini_qdisc_pair_swap() again during
ingress_destroy(), setting eth0->miniq_ingress to NULL, so ingress
packets on eth0 will not find filter Y in sch_handle_ingress().
This is just one of the possible consequences of concurrently accessing
miniq_{in,e}gress pointers.
Fixes: 7a096d579e8e ("net: sched: ingress: set 'unlocked' flag for Qdisc ops")
Fixes: 87f373921c4e ("net: sched: ingress: set 'unlocked' flag for clsact Qdisc ops")
Reported-by: syzbot+b53a9c0d1ea4ad62da8b(a)syzkaller.appspotmail.com
Closes: https://lore.kernel.org/r/0000000000006cf87705f79acf1a@google.com/
Cc: Hillf Danton <hdanton(a)sina.com>
Cc: Vlad Buslov <vladbu(a)mellanox.com>
Signed-off-by: Peilin Ye <peilin.ye(a)bytedance.com>
Acked-by: Jamal Hadi Salim <jhs(a)mojatatu.com>
Signed-off-by: Paolo Abeni <pabeni(a)redhat.com>
diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 27271f2b37cb..12eadecf8cd0 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -137,6 +137,13 @@ static inline void qdisc_refcount_inc(struct Qdisc *qdisc)
refcount_inc(&qdisc->refcnt);
}
+static inline bool qdisc_refcount_dec_if_one(struct Qdisc *qdisc)
+{
+ if (qdisc->flags & TCQ_F_BUILTIN)
+ return true;
+ return refcount_dec_if_one(&qdisc->refcnt);
+}
+
/* Intended to be used by unlocked users, when concurrent qdisc release is
* possible.
*/
@@ -652,6 +659,7 @@ void dev_deactivate_many(struct list_head *head);
struct Qdisc *dev_graft_qdisc(struct netdev_queue *dev_queue,
struct Qdisc *qdisc);
void qdisc_reset(struct Qdisc *qdisc);
+void qdisc_destroy(struct Qdisc *qdisc);
void qdisc_put(struct Qdisc *qdisc);
void qdisc_put_unlocked(struct Qdisc *qdisc);
void qdisc_tree_reduce_backlog(struct Qdisc *qdisc, int n, int len);
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index 094ca3a5b633..aa6b1fe65151 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -1086,10 +1086,22 @@ static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
if ((q && q->flags & TCQ_F_INGRESS) ||
(new && new->flags & TCQ_F_INGRESS)) {
ingress = 1;
- if (!dev_ingress_queue(dev)) {
+ dev_queue = dev_ingress_queue(dev);
+ if (!dev_queue) {
NL_SET_ERR_MSG(extack, "Device does not have an ingress queue");
return -ENOENT;
}
+
+ q = rtnl_dereference(dev_queue->qdisc_sleeping);
+
+ /* This is the counterpart of that qdisc_refcount_inc_nz() call in
+ * __tcf_qdisc_find() for filter requests.
+ */
+ if (!qdisc_refcount_dec_if_one(q)) {
+ NL_SET_ERR_MSG(extack,
+ "Current ingress or clsact Qdisc has ongoing filter requests");
+ return -EBUSY;
+ }
}
if (dev->flags & IFF_UP)
@@ -1110,8 +1122,16 @@ static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
qdisc_put(old);
}
} else {
- dev_queue = dev_ingress_queue(dev);
- old = dev_graft_qdisc(dev_queue, new);
+ old = dev_graft_qdisc(dev_queue, NULL);
+
+ /* {ingress,clsact}_destroy() @old before grafting @new to avoid
+ * unprotected concurrent accesses to net_device::miniq_{in,e}gress
+ * pointer(s) in mini_qdisc_pair_swap().
+ */
+ qdisc_notify(net, skb, n, classid, old, new, extack);
+ qdisc_destroy(old);
+
+ dev_graft_qdisc(dev_queue, new);
}
skip:
@@ -1125,8 +1145,6 @@ static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
if (new && new->ops->attach)
new->ops->attach(new);
- } else {
- notify_and_destroy(net, skb, n, classid, old, new, extack);
}
if (dev->flags & IFF_UP)
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 3248259eba32..5d7e23f4cc0e 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -1046,7 +1046,7 @@ static void qdisc_free_cb(struct rcu_head *head)
qdisc_free(q);
}
-static void qdisc_destroy(struct Qdisc *qdisc)
+static void __qdisc_destroy(struct Qdisc *qdisc)
{
const struct Qdisc_ops *ops = qdisc->ops;
@@ -1070,6 +1070,14 @@ static void qdisc_destroy(struct Qdisc *qdisc)
call_rcu(&qdisc->rcu, qdisc_free_cb);
}
+void qdisc_destroy(struct Qdisc *qdisc)
+{
+ if (qdisc->flags & TCQ_F_BUILTIN)
+ return;
+
+ __qdisc_destroy(qdisc);
+}
+
void qdisc_put(struct Qdisc *qdisc)
{
if (!qdisc)
@@ -1079,7 +1087,7 @@ void qdisc_put(struct Qdisc *qdisc)
!refcount_dec_and_test(&qdisc->refcnt))
return;
- qdisc_destroy(qdisc);
+ __qdisc_destroy(qdisc);
}
EXPORT_SYMBOL(qdisc_put);
@@ -1094,7 +1102,7 @@ void qdisc_put_unlocked(struct Qdisc *qdisc)
!refcount_dec_and_rtnl_lock(&qdisc->refcnt))
return;
- qdisc_destroy(qdisc);
+ __qdisc_destroy(qdisc);
rtnl_unlock();
}
EXPORT_SYMBOL(qdisc_put_unlocked);
The patch below does not apply to the 6.1-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.1.y
git checkout FETCH_HEAD
git cherry-pick -x f1a0898b5d6a77d332d036da03bad6fa9770de5b
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2023061907-salami-everyday-f129@gregkh' --subject-prefix 'PATCH 6.1.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From f1a0898b5d6a77d332d036da03bad6fa9770de5b Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hughd(a)google.com>
Date: Fri, 9 Jun 2023 14:29:39 -0700
Subject: [PATCH] wifi: iwlwifi: mvm: spin_lock_bh() to fix lockdep regression
Lockdep on 6.4-rc on ThinkPad X1 Carbon 5th says
=====================================================
WARNING: SOFTIRQ-safe -> SOFTIRQ-unsafe lock order detected
6.4.0-rc5 #1 Not tainted
-----------------------------------------------------
kworker/3:1/49 [HC0[0]:SC0[4]:HE1:SE0] is trying to acquire:
ffff8881066fa368 (&mvm_sta->deflink.lq_sta.rs_drv.pers.lock){+.+.}-{2:2}, at: rs_drv_get_rate+0x46/0xe7
and this task is already holding:
ffff8881066f80a8 (&sta->rate_ctrl_lock){+.-.}-{2:2}, at: rate_control_get_rate+0xbd/0x126
which would create a new lock dependency:
(&sta->rate_ctrl_lock){+.-.}-{2:2} -> (&mvm_sta->deflink.lq_sta.rs_drv.pers.lock){+.+.}-{2:2}
but this new dependency connects a SOFTIRQ-irq-safe lock:
(&sta->rate_ctrl_lock){+.-.}-{2:2}
etc. etc. etc.
Changing the spin_lock() in rs_drv_get_rate() to spin_lock_bh() was not
enough to pacify lockdep, but changing them all on pers.lock has worked.
Fixes: a8938bc881d2 ("wifi: iwlwifi: mvm: Add locking to the rate read flow")
Signed-off-by: Hugh Dickins <hughd(a)google.com>
Link: https://lore.kernel.org/r/79ffcc22-9775-cb6d-3ffd-1a517c40beef@google.com
Signed-off-by: Johannes Berg <johannes.berg(a)intel.com>
diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/rs.c b/drivers/net/wireless/intel/iwlwifi/mvm/rs.c
index 23266d0c9ce4..9a20468345e4 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/rs.c
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/rs.c
@@ -2692,7 +2692,7 @@ static void rs_drv_get_rate(void *mvm_r, struct ieee80211_sta *sta,
lq_sta = mvm_sta;
- spin_lock(&lq_sta->pers.lock);
+ spin_lock_bh(&lq_sta->pers.lock);
iwl_mvm_hwrate_to_tx_rate_v1(lq_sta->last_rate_n_flags,
info->band, &info->control.rates[0]);
info->control.rates[0].count = 1;
@@ -2707,7 +2707,7 @@ static void rs_drv_get_rate(void *mvm_r, struct ieee80211_sta *sta,
iwl_mvm_hwrate_to_tx_rate_v1(last_ucode_rate, info->band,
&txrc->reported_rate);
}
- spin_unlock(&lq_sta->pers.lock);
+ spin_unlock_bh(&lq_sta->pers.lock);
}
static void *rs_drv_alloc_sta(void *mvm_rate, struct ieee80211_sta *sta,
@@ -3264,11 +3264,11 @@ void iwl_mvm_rs_tx_status(struct iwl_mvm *mvm, struct ieee80211_sta *sta,
/* If it's locked we are in middle of init flow
* just wait for next tx status to update the lq_sta data
*/
- if (!spin_trylock(&mvmsta->deflink.lq_sta.rs_drv.pers.lock))
+ if (!spin_trylock_bh(&mvmsta->deflink.lq_sta.rs_drv.pers.lock))
return;
__iwl_mvm_rs_tx_status(mvm, sta, tid, info, ndp);
- spin_unlock(&mvmsta->deflink.lq_sta.rs_drv.pers.lock);
+ spin_unlock_bh(&mvmsta->deflink.lq_sta.rs_drv.pers.lock);
}
#ifdef CONFIG_MAC80211_DEBUGFS
@@ -4117,9 +4117,9 @@ void iwl_mvm_rs_rate_init(struct iwl_mvm *mvm,
} else {
struct iwl_mvm_sta *mvmsta = iwl_mvm_sta_from_mac80211(sta);
- spin_lock(&mvmsta->deflink.lq_sta.rs_drv.pers.lock);
+ spin_lock_bh(&mvmsta->deflink.lq_sta.rs_drv.pers.lock);
rs_drv_rate_init(mvm, sta, band);
- spin_unlock(&mvmsta->deflink.lq_sta.rs_drv.pers.lock);
+ spin_unlock_bh(&mvmsta->deflink.lq_sta.rs_drv.pers.lock);
}
}