The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable@vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.10.y
git checkout FETCH_HEAD
git cherry-pick -x b6f3f28f604ba3de4724ad82bea6adb1300c0b5f
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable@vger.kernel.org>' --in-reply-to '2023071116-umbrella-fog-a65f@gregkh' --subject-prefix 'PATCH 5.10.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From b6f3f28f604ba3de4724ad82bea6adb1300c0b5f Mon Sep 17 00:00:00 2001
From: Michael Schmitz <schmitzmic@gmail.com>
Date: Wed, 21 Jun 2023 08:17:25 +1200
Subject: [PATCH] block: add overflow checks for Amiga partition support
The Amiga partition parser module uses signed int for partition sector
address and count, which will overflow for disks larger than 1 TB.
Use u64 as the type for sector address and size to allow using disks up to
2 TB without LBD support, and disks larger than 2 TB with LBD. The RDB
format allows specifying disk sizes up to 2^128 bytes (though native
OS limitations reduce this somewhat, to a maximum of 2^68 bytes), so check
for u64 overflow carefully to protect against overflowing sector_t.
Bail out if sector addresses overflow 32 bits on kernels without LBD
support.
This bug was reported originally in 2012, and the fix was created by
the RDB author, Joanne Dow <jdow@earthlink.net>. A patch had been
discussed and reviewed on linux-m68k at that time but never officially
submitted (now resubmitted as patch 1 in this series).
This patch adds additional error checking and warning messages.
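As a side note for reviewers, the shape of these checks is easy to
demonstrate outside the kernel. The following is a minimal, self-contained
userspace sketch with illustrative geometry values and hypothetical names,
not the patched code itself; the kernel's check_mul_overflow() and
check_add_overflow() helpers wrap the same compiler builtins used here:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t nr_hd = 16, nr_sect = 63, blksize = 4, cylblk;
	uint64_t lo_cyl = 2, hi_cyl = 70000;
	uint64_t start_sect, nr_sects, end_sect;

	/* heads * sectors-per-track must still fit in a u32 */
	if (__builtin_mul_overflow(nr_hd, nr_sect, &cylblk))
		return 1;	/* would skip this partition */

	/* normalize to 512-byte blocks, again checked in u32 */
	if (__builtin_mul_overflow(cylblk, blksize, &cylblk))
		return 1;

	/* a u32 cylblk times a u32 cylinder count cannot overflow a u64 */
	start_sect = lo_cyl * cylblk;
	nr_sects = (hi_cyl - lo_cyl + 1) * cylblk;

	/* the partition end must still fit the sector type in use */
	if (__builtin_add_overflow(start_sect, nr_sects, &end_sect))
		return 1;

	printf("start %llu, %llu sectors, end %llu\n",
	       (unsigned long long)start_sect,
	       (unsigned long long)nr_sects,
	       (unsigned long long)end_sect);
	return 0;
}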
Reported-by: Martin Steigerwald <Martin@lichtvoll.de>
Closes: https://bugzilla.kernel.org/show_bug.cgi?id=43511
Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
Message-ID: <201206192146.09327.Martin@lichtvoll.de>
Cc: <stable@vger.kernel.org> # 5.2
Signed-off-by: Michael Schmitz <schmitzmic@gmail.com>
Reviewed-by: Geert Uytterhoeven <geert@linux-m68k.org>
Reviewed-by: Christoph Hellwig <hch@infradead.org>
Link: https://lore.kernel.org/r/20230620201725.7020-4-schmitzmic@gmail.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
diff --git a/block/partitions/amiga.c b/block/partitions/amiga.c
index 85c5c79aae48..ed222b9c901b 100644
--- a/block/partitions/amiga.c
+++ b/block/partitions/amiga.c
@@ -11,10 +11,18 @@
#define pr_fmt(fmt) fmt
#include <linux/types.h>
+#include <linux/mm_types.h>
+#include <linux/overflow.h>
#include <linux/affs_hardblocks.h>
#include "check.h"
+/* magic offsets in partition DosEnvVec */
+#define NR_HD 3
+#define NR_SECT 5
+#define LO_CYL 9
+#define HI_CYL 10
+
static __inline__ u32
checksum_block(__be32 *m, int size)
{
@@ -31,9 +39,12 @@ int amiga_partition(struct parsed_partitions *state)
unsigned char *data;
struct RigidDiskBlock *rdb;
struct PartitionBlock *pb;
- sector_t start_sect, nr_sects;
- int blk, part, res = 0;
- int blksize = 1; /* Multiplier for disk block size */
+ u64 start_sect, nr_sects;
+ sector_t blk, end_sect;
+ u32 cylblk; /* rdb_CylBlocks = nr_heads*sect_per_track */
+ u32 nr_hd, nr_sect, lo_cyl, hi_cyl;
+ int part, res = 0;
+ unsigned int blksize = 1; /* Multiplier for disk block size */
int slot = 1;
for (blk = 0; ; blk++, put_dev_sector(sect)) {
@@ -41,7 +52,7 @@ int amiga_partition(struct parsed_partitions *state)
goto rdb_done;
data = read_part_sector(state, blk, &sect);
if (!data) {
- pr_err("Dev %s: unable to read RDB block %d\n",
+ pr_err("Dev %s: unable to read RDB block %llu\n",
state->disk->disk_name, blk);
res = -1;
goto rdb_done;
@@ -58,12 +69,12 @@ int amiga_partition(struct parsed_partitions *state)
*(__be32 *)(data+0xdc) = 0;
if (checksum_block((__be32 *)data,
be32_to_cpu(rdb->rdb_SummedLongs) & 0x7F)==0) {
- pr_err("Trashed word at 0xd0 in block %d ignored in checksum calculation\n",
+ pr_err("Trashed word at 0xd0 in block %llu ignored in checksum calculation\n",
blk);
break;
}
- pr_err("Dev %s: RDB in block %d has bad checksum\n",
+ pr_err("Dev %s: RDB in block %llu has bad checksum\n",
state->disk->disk_name, blk);
}
@@ -80,10 +91,15 @@ int amiga_partition(struct parsed_partitions *state)
blk = be32_to_cpu(rdb->rdb_PartitionList);
put_dev_sector(sect);
for (part = 1; blk>0 && part<=16; part++, put_dev_sector(sect)) {
- blk *= blksize; /* Read in terms partition table understands */
+ /* Read in terms partition table understands */
+ if (check_mul_overflow(blk, (sector_t) blksize, &blk)) {
+ pr_err("Dev %s: overflow calculating partition block %llu! Skipping partitions %u and beyond\n",
+ state->disk->disk_name, blk, part);
+ break;
+ }
data = read_part_sector(state, blk, &sect);
if (!data) {
- pr_err("Dev %s: unable to read partition block %d\n",
+ pr_err("Dev %s: unable to read partition block %llu\n",
state->disk->disk_name, blk);
res = -1;
goto rdb_done;
@@ -95,19 +111,70 @@ int amiga_partition(struct parsed_partitions *state)
if (checksum_block((__be32 *)pb, be32_to_cpu(pb->pb_SummedLongs) & 0x7F) != 0 )
continue;
- /* Tell Kernel about it */
+ /* RDB gives us more than enough rope to hang ourselves with,
+ * many times over (2^128 bytes if all fields max out).
+ * Some careful checks are in order, so check for potential
+ * overflows.
+ * We are multiplying four 32 bit numbers to one sector_t!
+ */
+
+ nr_hd = be32_to_cpu(pb->pb_Environment[NR_HD]);
+ nr_sect = be32_to_cpu(pb->pb_Environment[NR_SECT]);
+
+ /* CylBlocks is total number of blocks per cylinder */
+ if (check_mul_overflow(nr_hd, nr_sect, &cylblk)) {
+ pr_err("Dev %s: heads*sects %u overflows u32, skipping partition!\n",
+ state->disk->disk_name, cylblk);
+ continue;
+ }
+
+ /* check for consistency with RDB defined CylBlocks */
+ if (cylblk > be32_to_cpu(rdb->rdb_CylBlocks)) {
+ pr_warn("Dev %s: cylblk %u > rdb_CylBlocks %u!\n",
+ state->disk->disk_name, cylblk,
+ be32_to_cpu(rdb->rdb_CylBlocks));
+ }
+
+ /* RDB allows for variable logical block size -
+ * normalize to 512 byte blocks and check result.
+ */
+
+ if (check_mul_overflow(cylblk, blksize, &cylblk)) {
+ pr_err("Dev %s: partition %u bytes per cyl. overflows u32, skipping partition!\n",
+ state->disk->disk_name, part);
+ continue;
+ }
+
+ /* Calculate partition start and end. Limit of 32 bit on cylblk
+ * guarantees no overflow occurs if LBD support is enabled.
+ */
+
+ lo_cyl = be32_to_cpu(pb->pb_Environment[LO_CYL]);
+ start_sect = ((u64) lo_cyl * cylblk);
+
+ hi_cyl = be32_to_cpu(pb->pb_Environment[HI_CYL]);
+ nr_sects = (((u64) hi_cyl - lo_cyl + 1) * cylblk);
- nr_sects = ((sector_t)be32_to_cpu(pb->pb_Environment[10]) + 1 -
- be32_to_cpu(pb->pb_Environment[9])) *
- be32_to_cpu(pb->pb_Environment[3]) *
- be32_to_cpu(pb->pb_Environment[5]) *
- blksize;
if (!nr_sects)
continue;
- start_sect = (sector_t)be32_to_cpu(pb->pb_Environment[9]) *
- be32_to_cpu(pb->pb_Environment[3]) *
- be32_to_cpu(pb->pb_Environment[5]) *
- blksize;
+
+ /* Warn user if partition end overflows u32 (AmigaDOS limit) */
+
+ if ((start_sect + nr_sects) > UINT_MAX) {
+ pr_warn("Dev %s: partition %u (%llu-%llu) needs 64 bit device support!\n",
+ state->disk->disk_name, part,
+ start_sect, start_sect + nr_sects);
+ }
+
+ if (check_add_overflow(start_sect, nr_sects, &end_sect)) {
+ pr_err("Dev %s: partition %u (%llu-%llu) needs LBD device support, skipping partition!\n",
+ state->disk->disk_name, part,
+ start_sect, end_sect);
+ continue;
+ }
+
+ /* Tell Kernel about it */
+
put_partition(state,slot++,start_sect,nr_sects);
{
/* Be even more informative to aid mounting */
Hi Greg, Sasha,
The following list shows the backported patches, I am using original
commit IDs for reference:
1) 1240eb93f061 ("netfilter: nf_tables: incorrect error path handling with NFT_MSG_NEWRULE")
2) 26b5a5712eb8 ("netfilter: nf_tables: add NFT_TRANS_PREPARE_ERROR to deal with bound set/chain")
3) 3e70489721b6 ("netfilter: nf_tables: unbind non-anonymous set if rule construction fails")
Please, apply,
Thanks.
Pablo Neira Ayuso (3):
netfilter: nf_tables: incorrect error path handling with NFT_MSG_NEWRULE
netfilter: nf_tables: add NFT_TRANS_PREPARE_ERROR to deal with bound set/chain
netfilter: nf_tables: unbind non-anonymous set if rule construction fails
include/net/netfilter/nf_tables.h | 1 +
net/netfilter/nf_tables_api.c | 29 +++++++++++++++++++++++++----
2 files changed, 26 insertions(+), 4 deletions(-)
--
2.30.2
Hi Greg, Sasha,
The following list shows the backported patches, I am using original
commit IDs for reference:
1) 1e9451cbda45 ("netfilter: nf_tables: fix nat hook table deletion")
2) 81ea01066741 ("netfilter: nf_tables: add rescheduling points during loop detection walks")
3) 802b805162a1 ("netfilter: nftables: add helper function to set the base sequence number")
4) 19c28b1374fb ("netfilter: add helper function to set up the nfnetlink header and use it")
5) 0854db2aaef3 ("netfilter: nf_tables: use net_generic infra for transaction data")
6) 1240eb93f061 ("netfilter: nf_tables: incorrect error path handling with NFT_MSG_NEWRULE")
7) 26b5a5712eb8 ("netfilter: nf_tables: add NFT_TRANS_PREPARE_ERROR to deal with bound set/chain")
8) 938154b93be8 ("netfilter: nf_tables: reject unbound anonymous set before commit phase")
9) 3e70489721b6 ("netfilter: nf_tables: unbind non-anonymous set if rule construction fails")
10) 2024439bd5ce ("netfilter: nf_tables: fix scheduling-while-atomic splat")
Please, apply,
Thanks.
Florian Westphal (3):
netfilter: nf_tables: fix nat hook table deletion
netfilter: nf_tables: add rescheduling points during loop detection walks
netfilter: nf_tables: fix scheduling-while-atomic splat
Pablo Neira Ayuso (7):
netfilter: nftables: add helper function to set the base sequence number
netfilter: add helper function to set up the nfnetlink header and use it
netfilter: nf_tables: use net_generic infra for transaction data
netfilter: nf_tables: incorrect error path handling with NFT_MSG_NEWRULE
netfilter: nf_tables: add NFT_TRANS_PREPARE_ERROR to deal with bound set/chain
netfilter: nf_tables: reject unbound anonymous set before commit phase
netfilter: nf_tables: unbind non-anonymous set if rule construction fails
include/linux/netfilter/nfnetlink.h | 27 ++
include/net/netfilter/nf_tables.h | 14 +
include/net/netns/nftables.h | 5 -
net/netfilter/ipset/ip_set_core.c | 17 +-
net/netfilter/nf_conntrack_netlink.c | 77 ++---
net/netfilter/nf_tables_api.c | 483 ++++++++++++++++-----------
net/netfilter/nf_tables_trace.c | 9 +-
net/netfilter/nfnetlink_acct.c | 11 +-
net/netfilter/nfnetlink_cthelper.c | 11 +-
net/netfilter/nfnetlink_cttimeout.c | 22 +-
net/netfilter/nfnetlink_log.c | 11 +-
net/netfilter/nfnetlink_queue.c | 12 +-
net/netfilter/nft_chain_filter.c | 11 +-
net/netfilter/nft_compat.c | 11 +-
net/netfilter/nft_dynset.c | 6 +-
15 files changed, 383 insertions(+), 344 deletions(-)
--
2.30.2
Hi Greg, Sasha,
The following list shows the backported patches, I am using original
commit IDs for reference:
1) 1e9451cbda45 ("netfilter: nf_tables: fix nat hook table deletion")
2) 802b805162a1 ("netfilter: nftables: add helper function to set the base sequence number")
3) 19c28b1374fb ("netfilter: add helper function to set up the nfnetlink header and use it")
4) 0854db2aaef3 ("netfilter: nf_tables: use net_generic infra for transaction data")
5) 81ea01066741 ("netfilter: nf_tables: add rescheduling points during loop detection walks")
6) 1240eb93f061 ("netfilter: nf_tables: incorrect error path handling with NFT_MSG_NEWRULE")
7) 26b5a5712eb8 ("netfilter: nf_tables: add NFT_TRANS_PREPARE_ERROR to deal with bound set/chain")
8) 938154b93be8 ("netfilter: nf_tables: reject unbound anonymous set before commit phase")
9) 3e70489721b6 ("netfilter: nf_tables: unbind non-anonymous set if rule construction fails")
10) 2024439bd5ce ("netfilter: nf_tables: fix scheduling-while-atomic splat")
Please, apply,
Thanks
Florian Westphal (4):
netfilter: nf_tables: fix nat hook table deletion
netfilter: nf_tables: use net_generic infra for transaction data
netfilter: nf_tables: add rescheduling points during loop detection walks
netfilter: nf_tables: fix scheduling-while-atomic splat
Pablo Neira Ayuso (6):
netfilter: nftables: add helper function to set the base sequence number
netfilter: add helper function to set up the nfnetlink header and use it
netfilter: nf_tables: incorrect error path handling with NFT_MSG_NEWRULE
netfilter: nf_tables: add NFT_TRANS_PREPARE_ERROR to deal with bound set/chain
netfilter: nf_tables: reject unbound anonymous set before commit phase
netfilter: nf_tables: unbind non-anonymous set if rule construction fails
include/linux/netfilter/nfnetlink.h | 27 ++
include/net/netfilter/nf_tables.h | 14 +
include/net/netns/nftables.h | 6 -
net/netfilter/ipset/ip_set_core.c | 17 +-
net/netfilter/nf_conntrack_netlink.c | 77 ++--
net/netfilter/nf_tables_api.c | 510 ++++++++++++++++-----------
net/netfilter/nf_tables_offload.c | 29 +-
net/netfilter/nf_tables_trace.c | 9 +-
net/netfilter/nfnetlink_acct.c | 11 +-
net/netfilter/nfnetlink_cthelper.c | 11 +-
net/netfilter/nfnetlink_cttimeout.c | 22 +-
net/netfilter/nfnetlink_log.c | 11 +-
net/netfilter/nfnetlink_queue.c | 12 +-
net/netfilter/nft_chain_filter.c | 11 +-
net/netfilter/nft_compat.c | 11 +-
net/netfilter/nft_dynset.c | 6 +-
16 files changed, 418 insertions(+), 366 deletions(-)
--
2.30.2
Hi Greg, Sasha,
[ This is v3:
- remove backported function that is useless in patch 09/11
- use the "Upstream commit XYZ" tag, as suggested by Salvatore Bonaccorso.
]
The following list shows the backported patches. I am using original commit IDs
for reference:
1) 0854db2aaef3 ("netfilter: nf_tables: use net_generic infra for transaction data")
2) 81ea01066741 ("netfilter: nf_tables: add rescheduling points during loop detection walks")
3) 1240eb93f061 ("netfilter: nf_tables: incorrect error path handling with NFT_MSG_NEWRULE")
4) 4bedf9eee016 ("netfilter: nf_tables: fix chain binding transaction logic")
5) 26b5a5712eb8 ("netfilter: nf_tables: add NFT_TRANS_PREPARE_ERROR to deal with bound set/chain")
6) 938154b93be8 ("netfilter: nf_tables: reject unbound anonymous set before commit phase")
7) 62e1e94b246e ("netfilter: nf_tables: reject unbound chain set before commit phase")
8) f8bb7889af58 ("netfilter: nftables: rename set element data activation/deactivation functions")
9) 628bd3e49cba ("netfilter: nf_tables: drop map element references from preparation phase")
10) 3e70489721b6 ("netfilter: nf_tables: unbind non-anonymous set if rule construction fails")
11) f838e0906dd3 ("netfilter: nf_tables: fix scheduling-while-atomic splat")
Notes:
- Patch #1 is a backported dependency patch required by these fixes.
- Patch #3 needs an incremental follow-up fix coming in Patch #11.
Florian Westphal (3):
netfilter: nf_tables: use net_generic infra for transaction data
netfilter: nf_tables: add rescheduling points during loop detection walks
netfilter: nf_tables: fix scheduling-while-atomic splat
Pablo Neira Ayuso (8):
netfilter: nf_tables: incorrect error path handling with NFT_MSG_NEWRULE
netfilter: nf_tables: fix chain binding transaction logic
netfilter: nf_tables: add NFT_TRANS_PREPARE_ERROR to deal with bound set/chain
netfilter: nf_tables: reject unbound anonymous set before commit phase
netfilter: nf_tables: reject unbound chain set before commit phase
netfilter: nftables: rename set element data activation/deactivation functions
netfilter: nf_tables: drop map element references from preparation phase
netfilter: nf_tables: unbind non-anonymous set if rule construction fails
include/net/netfilter/nf_tables.h | 41 +-
include/net/netns/nftables.h | 7 -
net/netfilter/nf_tables_api.c | 662 +++++++++++++++++++++---------
net/netfilter/nf_tables_offload.c | 30 +-
net/netfilter/nft_chain_filter.c | 11 +-
net/netfilter/nft_dynset.c | 6 +-
net/netfilter/nft_immediate.c | 90 +++-
net/netfilter/nft_set_bitmap.c | 5 +-
net/netfilter/nft_set_hash.c | 23 +-
net/netfilter/nft_set_pipapo.c | 14 +-
net/netfilter/nft_set_rbtree.c | 5 +-
11 files changed, 648 insertions(+), 246 deletions(-)
--
2.30.2
Hi,
Could you please backport the three following commits to 4.14 and 4.19:
24c363623361 ("spi: spi-fsl-spi: remove always-true conditional in fsl_spi_do_one_msg")
17ecffa28948 ("spi: spi-fsl-spi: relax message sanity checking a little")
a798a7086c38 ("spi: spi-fsl-spi: allow changing bits_per_word while CS is still active")
Those three commits (the last one in particular) are needed to work around a
problem introduced in 4.14.230 by commit 42c04316d927 ("spi: fsl-cpm:
Use 16 bit mode for large transfers with even size"), which leads to the
following error in certain cases:
[ 174.900526] at25 spi0.3: bits_per_word/speed_hz should be same for the same SPI transfer
[ 174.911844] spi_master spi0: failed to transfer one message from queue
[ 366.639467] INFO: task od:406 blocked for more than 120 seconds.
[ 366.645155] Not tainted 4.14.320-s3k-dev-dirty #342
[ 366.652996] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[ 489.519450] INFO: task od:406 blocked for more than 120 seconds.
[ 489.525156] Not tainted 4.14.320-s3k-dev-dirty #342
[ 489.532915] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
...
Thanks
Christophe
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable@vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.10.y
git checkout FETCH_HEAD
git cherry-pick -x 8a796565cec3601071cbbd27d6304e202019d014
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable@vger.kernel.org>' --in-reply-to '2023071623-deafness-gargle-5297@gregkh' --subject-prefix 'PATCH 5.10.y' HEAD^..
Possible dependencies:
8a796565cec3 ("io_uring: Use io_schedule* in cqring wait")
d33a39e57768 ("io_uring: keep timeout in io_wait_queue")
46ae7eef44f6 ("io_uring: optimise non-timeout waiting")
846072f16eed ("io_uring: mimimise io_cqring_wait_schedule")
3fcf19d592d5 ("io_uring: parse check_cq out of wq waiting")
12521a5d5cb7 ("io_uring: fix CQ waiting timeout handling")
52ea806ad983 ("io_uring: finish waiting before flushing overflow entries")
35d90f95cfa7 ("io_uring: include task_work run after scheduling in wait for events")
1b346e4aa8e7 ("io_uring: don't check overflow flush failures")
a85381d8326d ("io_uring: skip overflow CQE posting for dying ring")
c0e0d6ba25f1 ("io_uring: add IORING_SETUP_DEFER_TASKRUN")
b4c98d59a787 ("io_uring: introduce io_has_work")
78a861b94959 ("io_uring: add sync cancelation API through io_uring_register()")
c34398a8c018 ("io_uring: remove __io_req_task_work_add")
ed5ccb3beeba ("io_uring: remove priority tw list optimisation")
4a0fef62788b ("io_uring: optimize io_uring_task layout")
253993210bd8 ("io_uring: introduce locking helpers for CQE posting")
305bef988708 ("io_uring: hide eventfd assumptions in eventfd paths")
affa87db9010 ("io_uring: fix multi ctx cancellation")
d9dee4302a7c ("io_uring: remove ->flush_cqes optimisation")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 8a796565cec3601071cbbd27d6304e202019d014 Mon Sep 17 00:00:00 2001
From: Andres Freund <andres@anarazel.de>
Date: Fri, 7 Jul 2023 09:20:07 -0700
Subject: [PATCH] io_uring: Use io_schedule* in cqring wait
I observed poor performance of io_uring compared to synchronous IO. That
turns out to be caused by deeper CPU idle states entered with io_uring,
due to io_uring using plain schedule(), whereas synchronous IO uses
io_schedule().
The losses due to this are substantial. On my Cascade Lake workstation,
t/io_uring from the fio repository yields regressions between 20% and 40%
with, for example, the following command:
./t/io_uring -r 5 -X0 -d 1 -s 1 -c 1 -p 0 -S$use_sync -R 0 /mnt/t2/fio/write.0.0
This is repeatable with different filesystems, using raw block devices
and using different block devices.
Use io_schedule_prepare() / io_schedule_finish() in
io_cqring_wait_schedule() to address the difference.
After that using io_uring is on par or surpassing synchronous IO (using
registered files etc makes it reliably win, but arguably is a less fair
comparison).
There are other calls to schedule() in io_uring/, but none immediately
jump out to be similarly situated, so I did not touch them. Similarly,
it's possible that mutex_lock_io() should be used, but it's not clear if
there are cases where that matters.
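For context, io_schedule() itself is essentially schedule() bracketed by
this prepare/finish pair. The sketch below is a rough paraphrase with a
hypothetical function name, not a verbatim kernel excerpt; it shows why
using the pair directly keeps the iowait accounting while still allowing a
timed sleep in the middle:

/* Rough paraphrase of the pattern; io_schedule_like() is a made-up name. */
static void io_schedule_like(void)
{
	int token;

	token = io_schedule_prepare();	/* mark task in_iowait, flush block plug */
	schedule();			/* or schedule_hrtimeout(), as in the hunk below */
	io_schedule_finish(token);	/* restore the previous in_iowait state */
}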
Cc: stable@vger.kernel.org # 5.10+
Cc: Pavel Begunkov <asml.silence@gmail.com>
Cc: io-uring@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Andres Freund <andres@anarazel.de>
Link: https://lore.kernel.org/r/20230707162007.194068-1-andres@anarazel.de
[axboe: minor style fixup]
Signed-off-by: Jens Axboe <axboe@kernel.dk>
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index e8096d502a7c..7505de2428e0 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -2489,6 +2489,8 @@ int io_run_task_work_sig(struct io_ring_ctx *ctx)
static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,
struct io_wait_queue *iowq)
{
+ int token, ret;
+
if (unlikely(READ_ONCE(ctx->check_cq)))
return 1;
if (unlikely(!llist_empty(&ctx->work_llist)))
@@ -2499,11 +2501,20 @@ static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,
return -EINTR;
if (unlikely(io_should_wake(iowq)))
return 0;
+
+ /*
+ * Use io_schedule_prepare/finish, so cpufreq can take into account
+ * that the task is waiting for IO - turns out to be important for low
+ * QD IO.
+ */
+ token = io_schedule_prepare();
+ ret = 0;
if (iowq->timeout == KTIME_MAX)
schedule();
else if (!schedule_hrtimeout(&iowq->timeout, HRTIMER_MODE_ABS))
- return -ETIME;
- return 0;
+ ret = -ETIME;
+ io_schedule_finish(token);
+ return ret;
}
/*
The patch below does not apply to the 6.1-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable@vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.1.y
git checkout FETCH_HEAD
git cherry-pick -x 20cb1c2fb7568a6054c55defe044311397e01ddb
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable@vger.kernel.org>' --in-reply-to '2023071635-handsaw-enclosure-0363@gregkh' --subject-prefix 'PATCH 6.1.y' HEAD^..
Possible dependencies:
20cb1c2fb756 ("blk-cgroup: Flush stats before releasing blkcg_gq")
2c275afeb61d ("block: make blkcg_punt_bio_submit optional")
12be09fe18f2 ("block: async_bio_lock does not need to be bh-safe")
3480373ebdf7 ("btrfs, block: move REQ_CGROUP_PUNT to btrfs")
0a0596fbbe5b ("btrfs, mm: remove the punt_to_cgroup field in struct writeback_control")
05d06a5c9d9c ("btrfs: move kthread_associate_blkcg out of btrfs_submit_compressed_write")
ae42a154ca89 ("btrfs: pass a btrfs_bio to btrfs_submit_bio")
34f888ce3a35 ("btrfs: cleanup main loop in btrfs_encoded_read_regular_fill_pages")
72b505dc5757 ("btrfs: add a wbc pointer to struct btrfs_bio_ctrl")
794c26e214ab ("btrfs: remove the sync_io flag in struct btrfs_bio_ctrl")
c000bc04bad4 ("btrfs: store the bio opf in struct btrfs_bio_ctrl")
eb8d0c6d042f ("btrfs: remove the force_bio_submit to submit_extent_page")
67998cf438e2 ("btrfs: don't set force_bio_submit in read_extent_buffer_subpage")
10e924bc320a ("btrfs: factor out a btrfs_add_compressed_bio_pages helper")
e7aff33e3161 ("btrfs: use the bbio file offset in btrfs_submit_compressed_read")
798c9fc74d03 ("btrfs: remove redundant free_extent_map in btrfs_submit_compressed_read")
544fe4a903ce ("btrfs: embed a btrfs_bio into struct compressed_bio")
3822a7c40997 ("Merge tag 'mm-stable-2023-02-20-13-37' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 20cb1c2fb7568a6054c55defe044311397e01ddb Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@redhat.com>
Date: Sat, 10 Jun 2023 07:42:49 +0800
Subject: [PATCH] blk-cgroup: Flush stats before releasing blkcg_gq
As noted by Michal, the blkg_iostat_set's in the lockless list hold
references to blkg's to protect against their removal. Those blkg's in
turn hold a reference to the blkcg. When a cgroup is being destroyed,
cgroup_rstat_flush() is only called at css_release_work_fn(), which is
called when the blkcg reference count reaches 0. This circular
dependency will prevent the blkcg and some blkgs from being freed after
they are made offline.
It is less of a problem if the cgroup to be destroyed also has other
controllers, like memory, that will call cgroup_rstat_flush() and so
clean up the reference count. If block is the only controller that uses
rstat, these offline blkcg and blkgs may never be freed, leaking more
and more memory over time.
To prevent this potential memory leak:
- flush blkcg per-cpu stats list in __blkg_release(), when no new stat
can be added
- add global blkg_stat_lock for covering concurrent parent blkg stat
update
- don't grab bio->bi_blkg reference when adding the stats into blkcg's
per-cpu stat list since all stats are guaranteed to be consumed before
releasing blkg instance, and grabbing blkg reference for stats was the
most fragile part of original patch
Based on Waiman's patch:
https://lore.kernel.org/linux-block/20221215033132.230023-3-longman@redhat.…
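To make the fix easier to follow, here is a hedged sketch of the per-cpu
lockless-list flush pattern involved, with illustrative types and names
rather than the actual blk-cgroup code: producers llist_add() stat nodes
onto a per-cpu list, and a consumer (the rstat flush, or now the final blkg
release as well) detaches the whole list atomically and aggregates it under
a plain raw spinlock, which is all the new blkg_stat_lock has to serialize:

#include <linux/llist.h>
#include <linux/spinlock.h>
#include <linux/types.h>

/* Illustrative stat node; blk-cgroup uses struct blkg_iostat_set instead. */
struct stat_node {
	struct llist_node lnode;
	u64 value;
};

static DEFINE_RAW_SPINLOCK(agg_lock);

/* Consumer side: take everything queued on one CPU's list and fold it up. */
static void flush_cpu_list(struct llist_head *lhead, u64 *total)
{
	struct stat_node *n, *next;
	struct llist_node *first = llist_del_all(lhead);	/* atomic detach */

	if (!first)
		return;

	raw_spin_lock(&agg_lock);	/* serializes only the aggregation step */
	llist_for_each_entry_safe(n, next, first, lnode)
		*total += n->value;
	raw_spin_unlock(&agg_lock);
}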
Fixes: 3b8cc6298724 ("blk-cgroup: Optimize blkcg_rstat_flush()")
Cc: stable(a)vger.kernel.org
Reported-by: Jay Shin <jaeshin@redhat.com>
Acked-by: Tejun Heo <tj@kernel.org>
Cc: Waiman Long <longman@redhat.com>
Cc: mkoutny@suse.com
Cc: Yosry Ahmed <yosryahmed@google.com>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
Link: https://lore.kernel.org/r/20230609234249.1412858-1-ming.lei@redhat.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 0ce64dd73cfe..f0b5c9c41cde 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -34,6 +34,8 @@
#include "blk-ioprio.h"
#include "blk-throttle.h"
+static void __blkcg_rstat_flush(struct blkcg *blkcg, int cpu);
+
/*
* blkcg_pol_mutex protects blkcg_policy[] and policy [de]activation.
* blkcg_pol_register_mutex nests outside of it and synchronizes entire
@@ -56,6 +58,8 @@ static LIST_HEAD(all_blkcgs); /* protected by blkcg_pol_mutex */
bool blkcg_debug_stats = false;
+static DEFINE_RAW_SPINLOCK(blkg_stat_lock);
+
#define BLKG_DESTROY_BATCH_SIZE 64
/*
@@ -163,10 +167,20 @@ static void blkg_free(struct blkcg_gq *blkg)
static void __blkg_release(struct rcu_head *rcu)
{
struct blkcg_gq *blkg = container_of(rcu, struct blkcg_gq, rcu_head);
+ struct blkcg *blkcg = blkg->blkcg;
+ int cpu;
#ifdef CONFIG_BLK_CGROUP_PUNT_BIO
WARN_ON(!bio_list_empty(&blkg->async_bios));
#endif
+ /*
+ * Flush all the non-empty percpu lockless lists before releasing
+ * us, given these stat belongs to us.
+ *
+ * blkg_stat_lock is for serializing blkg stat update
+ */
+ for_each_possible_cpu(cpu)
+ __blkcg_rstat_flush(blkcg, cpu);
/* release the blkcg and parent blkg refs this blkg has been holding */
css_put(&blkg->blkcg->css);
@@ -951,23 +965,26 @@ static void blkcg_iostat_update(struct blkcg_gq *blkg, struct blkg_iostat *cur,
u64_stats_update_end_irqrestore(&blkg->iostat.sync, flags);
}
-static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu)
+static void __blkcg_rstat_flush(struct blkcg *blkcg, int cpu)
{
- struct blkcg *blkcg = css_to_blkcg(css);
struct llist_head *lhead = per_cpu_ptr(blkcg->lhead, cpu);
struct llist_node *lnode;
struct blkg_iostat_set *bisc, *next_bisc;
- /* Root-level stats are sourced from system-wide IO stats */
- if (!cgroup_parent(css->cgroup))
- return;
-
rcu_read_lock();
lnode = llist_del_all(lhead);
if (!lnode)
goto out;
+ /*
+ * For covering concurrent parent blkg update from blkg_release().
+ *
+ * When flushing from cgroup, cgroup_rstat_lock is always held, so
+ * this lock won't cause contention most of time.
+ */
+ raw_spin_lock(&blkg_stat_lock);
+
/*
* Iterate only the iostat_cpu's queued in the lockless list.
*/
@@ -991,13 +1008,19 @@ static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu)
if (parent && parent->parent)
blkcg_iostat_update(parent, &blkg->iostat.cur,
&blkg->iostat.last);
- percpu_ref_put(&blkg->refcnt);
}
-
+ raw_spin_unlock(&blkg_stat_lock);
out:
rcu_read_unlock();
}
+static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu)
+{
+ /* Root-level stats are sourced from system-wide IO stats */
+ if (cgroup_parent(css->cgroup))
+ __blkcg_rstat_flush(css_to_blkcg(css), cpu);
+}
+
/*
* We source root cgroup stats from the system-wide stats to avoid
* tracking the same information twice and incurring overhead when no
@@ -2075,7 +2098,6 @@ void blk_cgroup_bio_start(struct bio *bio)
llist_add(&bis->lnode, lhead);
WRITE_ONCE(bis->lqueued, true);
- percpu_ref_get(&bis->blkg->refcnt);
}
u64_stats_update_end_irqrestore(&bis->sync, flags);