March 2024 - Linux-stable-mirror

FAILED: patch "[PATCH] x86/bugs: Fix the SRSO mitigation on Zen3/4" failed to apply to 6.6-stable tree

by gregkh＠linuxfoundation.org

The patch below does not apply to the 6.6-stable tree. If someone wants it applied there, or to any other stable or longterm tree, then please email the backport, including the original git commit id to <stable(a)vger.kernel.org>. To reproduce the conflict and resubmit, you may use the following commands: git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.6.y git checkout FETCH_HEAD git cherry-pick -x 4535e1a4174c4111d92c5a9a21e542d232e0fcaa # <resolve conflicts, build, test, etc.> git commit -s git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024033029-roast-pajamas-3c55@gregkh' --subject-prefix 'PATCH 6.6.y' HEAD^.. Possible dependencies: thanks, greg k-h ------------------ original commit in Linus's tree ------------------ From 4535e1a4174c4111d92c5a9a21e542d232e0fcaa Mon Sep 17 00:00:00 2001 From: "Borislav Petkov (AMD)" <bp(a)alien8.de> Date: Thu, 28 Mar 2024 13:59:05 +0100 Subject: [PATCH] x86/bugs: Fix the SRSO mitigation on Zen3/4 The original version of the mitigation would patch in the calls to the untraining routines directly. That is, the alternative() in UNTRAIN_RET will patch in the CALL to srso_alias_untrain_ret() directly. However, even if commit e7c25c441e9e ("x86/cpu: Cleanup the untrain mess") meant well in trying to clean up the situation, due to micro- architectural reasons, the untraining routine srso_alias_untrain_ret() must be the target of a CALL instruction and not of a JMP instruction as it is done now. Reshuffle the alternative macros to accomplish that. Fixes: e7c25c441e9e ("x86/cpu: Cleanup the untrain mess") Signed-off-by: Borislav Petkov (AMD) <bp(a)alien8.de> Reviewed-by: Ingo Molnar <mingo(a)kernel.org> Cc: stable(a)kernel.org Signed-off-by: Linus Torvalds <torvalds(a)linux-foundation.org> diff --git a/arch/x86/include/asm/asm-prototypes.h b/arch/x86/include/asm/asm-prototypes.h index 076bf8dee702..25466c4d2134 100644 --- a/arch/x86/include/asm/asm-prototypes.h +++ b/arch/x86/include/asm/asm-prototypes.h @@ -14,6 +14,7 @@ #include <asm/asm.h> #include <asm/fred.h> #include <asm/gsseg.h> +#include <asm/nospec-branch.h> #ifndef CONFIG_X86_CMPXCHG64 extern void cmpxchg8b_emu(void); diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index fc3a8a3c7ffe..170c89ed22fc 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -262,11 +262,20 @@ .Lskip_rsb_\@: .endm +/* + * The CALL to srso_alias_untrain_ret() must be patched in directly at + * the spot where untraining must be done, ie., srso_alias_untrain_ret() + * must be the target of a CALL instruction instead of indirectly + * jumping to a wrapper which then calls it. Therefore, this macro is + * called outside of __UNTRAIN_RET below, for the time being, before the + * kernel can support nested alternatives with arbitrary nesting. + */ +.macro CALL_UNTRAIN_RET #if defined(CONFIG_MITIGATION_UNRET_ENTRY) || defined(CONFIG_MITIGATION_SRSO) -#define CALL_UNTRAIN_RET "call entry_untrain_ret" -#else -#define CALL_UNTRAIN_RET "" + ALTERNATIVE_2 "", "call entry_untrain_ret", X86_FEATURE_UNRET, \ + "call srso_alias_untrain_ret", X86_FEATURE_SRSO_ALIAS #endif +.endm /* * Mitigate RETBleed for AMD/Hygon Zen uarch. Requires KERNEL CR3 because the @@ -282,8 +291,8 @@ .macro __UNTRAIN_RET ibpb_feature, call_depth_insns #if defined(CONFIG_MITIGATION_RETHUNK) || defined(CONFIG_MITIGATION_IBPB_ENTRY) VALIDATE_UNRET_END - ALTERNATIVE_3 "", \ - CALL_UNTRAIN_RET, X86_FEATURE_UNRET, \ + CALL_UNTRAIN_RET + ALTERNATIVE_2 "", \ "call entry_ibpb", \ibpb_feature, \ __stringify(\call_depth_insns), X86_FEATURE_CALL_DEPTH #endif @@ -342,6 +351,8 @@ extern void retbleed_return_thunk(void); static inline void retbleed_return_thunk(void) {} #endif +extern void srso_alias_untrain_ret(void); + #ifdef CONFIG_MITIGATION_SRSO extern void srso_return_thunk(void); extern void srso_alias_return_thunk(void); diff --git a/arch/x86/lib/retpoline.S b/arch/x86/lib/retpoline.S index 721b528da9ac..02cde194a99e 100644 --- a/arch/x86/lib/retpoline.S +++ b/arch/x86/lib/retpoline.S @@ -163,6 +163,7 @@ SYM_CODE_START_NOALIGN(srso_alias_untrain_ret) lfence jmp srso_alias_return_thunk SYM_FUNC_END(srso_alias_untrain_ret) +__EXPORT_THUNK(srso_alias_untrain_ret) .popsection .pushsection .text..__x86.rethunk_safe @@ -224,10 +225,12 @@ SYM_CODE_START(srso_return_thunk) SYM_CODE_END(srso_return_thunk) #define JMP_SRSO_UNTRAIN_RET "jmp srso_untrain_ret" -#define JMP_SRSO_ALIAS_UNTRAIN_RET "jmp srso_alias_untrain_ret" #else /* !CONFIG_MITIGATION_SRSO */ +/* Dummy for the alternative in CALL_UNTRAIN_RET. */ +SYM_CODE_START(srso_alias_untrain_ret) + RET +SYM_FUNC_END(srso_alias_untrain_ret) #define JMP_SRSO_UNTRAIN_RET "ud2" -#define JMP_SRSO_ALIAS_UNTRAIN_RET "ud2" #endif /* CONFIG_MITIGATION_SRSO */ #ifdef CONFIG_MITIGATION_UNRET_ENTRY @@ -319,9 +322,7 @@ SYM_FUNC_END(retbleed_untrain_ret) #if defined(CONFIG_MITIGATION_UNRET_ENTRY) || defined(CONFIG_MITIGATION_SRSO) SYM_FUNC_START(entry_untrain_ret) - ALTERNATIVE_2 JMP_RETBLEED_UNTRAIN_RET, \ - JMP_SRSO_UNTRAIN_RET, X86_FEATURE_SRSO, \ - JMP_SRSO_ALIAS_UNTRAIN_RET, X86_FEATURE_SRSO_ALIAS + ALTERNATIVE JMP_RETBLEED_UNTRAIN_RET, JMP_SRSO_UNTRAIN_RET, X86_FEATURE_SRSO SYM_FUNC_END(entry_untrain_ret) __EXPORT_THUNK(entry_untrain_ret)

1 year, 8 months

3
2
0 0

FAILED: patch "[PATCH] mm, vmscan: prevent infinite loop for costly GFP_NOIO |" failed to apply to 4.19-stable tree

by gregkh＠linuxfoundation.org

The patch below does not apply to the 4.19-stable tree. If someone wants it applied there, or to any other stable or longterm tree, then please email the backport, including the original git commit id to <stable(a)vger.kernel.org>. To reproduce the conflict and resubmit, you may use the following commands: git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-4.19.y git checkout FETCH_HEAD git cherry-pick -x 803de9000f334b771afacb6ff3e78622916668b0 # <resolve conflicts, build, test, etc.> git commit -s git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024032732-prowess-craving-9106@gregkh' --subject-prefix 'PATCH 4.19.y' HEAD^.. Possible dependencies: 803de9000f33 ("mm, vmscan: prevent infinite loop for costly GFP_NOIO | __GFP_RETRY_MAYFAIL allocations") f98a497e1f16 ("mm: compaction: remove unnecessary is_via_compact_memory() checks") e8606320e9af ("mm: compaction: refactor __compaction_suitable()") fe573327ffb1 ("tracing: incorrect gfp_t conversion") cff387d6a294 ("mm: compaction: make compaction_zonelist_suitable return false when COMPACT_SUCCESS") 9353ffa6e9e9 ("kasan, page_alloc: allow skipping memory init for HW_TAGS") 53ae233c30a6 ("kasan, page_alloc: allow skipping unpoisoning for HW_TAGS") f49d9c5bb15c ("kasan, mm: only define ___GFP_SKIP_KASAN_POISON with HW_TAGS") e9d0ca922816 ("kasan, page_alloc: rework kasan_unpoison_pages call site") 7e3cbba65de2 ("kasan, page_alloc: move kernel_init_free_pages in post_alloc_hook") 89b271163328 ("kasan, page_alloc: move SetPageSkipKASanPoison in post_alloc_hook") 9294b1281d0a ("kasan, page_alloc: combine tag_clear_highpage calls in post_alloc_hook") b42090ae6f3a ("kasan, page_alloc: merge kasan_alloc_pages into post_alloc_hook") b8491b9052fe ("kasan, page_alloc: refactor init checks in post_alloc_hook") 1c0e5b24f117 ("kasan: only apply __GFP_ZEROTAGS when memory is zeroed") c82ce3195fd1 ("mm: clarify __GFP_ZEROTAGS comment") 7c13c163e036 ("kasan, page_alloc: merge kasan_free_pages into free_pages_prepare") 5b2c07138cbd ("kasan, page_alloc: move tag_clear_highpage out of kernel_init_free_pages") 94ae8b83fefc ("kasan, page_alloc: deduplicate should_skip_kasan_poison") 3bf03b9a0839 ("Merge branch 'akpm' (patches from Andrew)") thanks, greg k-h ------------------ original commit in Linus's tree ------------------ From 803de9000f334b771afacb6ff3e78622916668b0 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka <vbabka(a)suse.cz> Date: Wed, 21 Feb 2024 12:43:58 +0100 Subject: [PATCH] mm, vmscan: prevent infinite loop for costly GFP_NOIO | __GFP_RETRY_MAYFAIL allocations Sven reports an infinite loop in __alloc_pages_slowpath() for costly order __GFP_RETRY_MAYFAIL allocations that are also GFP_NOIO. Such combination can happen in a suspend/resume context where a GFP_KERNEL allocation can have __GFP_IO masked out via gfp_allowed_mask. Quoting Sven: 1. try to do a "costly" allocation (order > PAGE_ALLOC_COSTLY_ORDER) with __GFP_RETRY_MAYFAIL set. 2. page alloc's __alloc_pages_slowpath tries to get a page from the freelist. This fails because there is nothing free of that costly order. 3. page alloc tries to reclaim by calling __alloc_pages_direct_reclaim, which bails out because a zone is ready to be compacted; it pretends to have made a single page of progress. 4. page alloc tries to compact, but this always bails out early because __GFP_IO is not set (it's not passed by the snd allocator, and even if it were, we are suspending so the __GFP_IO flag would be cleared anyway). 5. page alloc believes reclaim progress was made (because of the pretense in item 3) and so it checks whether it should retry compaction. The compaction retry logic thinks it should try again, because: a) reclaim is needed because of the early bail-out in item 4 b) a zonelist is suitable for compaction 6. goto 2. indefinite stall. (end quote) The immediate root cause is confusing the COMPACT_SKIPPED returned from __alloc_pages_direct_compact() (step 4) due to lack of __GFP_IO to be indicating a lack of order-0 pages, and in step 5 evaluating that in should_compact_retry() as a reason to retry, before incrementing and limiting the number of retries. There are however other places that wrongly assume that compaction can happen while we lack __GFP_IO. To fix this, introduce gfp_compaction_allowed() to abstract the __GFP_IO evaluation and switch the open-coded test in try_to_compact_pages() to use it. Also use the new helper in: - compaction_ready(), which will make reclaim not bail out in step 3, so there's at least one attempt to actually reclaim, even if chances are small for a costly order - in_reclaim_compaction() which will make should_continue_reclaim() return false and we don't over-reclaim unnecessarily - in __alloc_pages_slowpath() to set a local variable can_compact, which is then used to avoid retrying reclaim/compaction for costly allocations (step 5) if we can't compact and also to skip the early compaction attempt that we do in some cases Link: https://lkml.kernel.org/r/20240221114357.13655-2-vbabka@suse.cz Fixes: 3250845d0526 ("Revert "mm, oom: prevent premature OOM killer invocation for high order request"") Signed-off-by: Vlastimil Babka <vbabka(a)suse.cz> Reported-by: Sven van Ashbrook <svenva(a)chromium.org> Closes: https://lore.kernel.org/all/CAG-rBihs_xMKb3wrMO1%2B-%2Bp4fowP9oy1pa_OTkfxBz… Tested-by: Karthikeyan Ramasubramanian <kramasub(a)chromium.org> Cc: Brian Geffon <bgeffon(a)google.com> Cc: Curtis Malainey <cujomalainey(a)chromium.org> Cc: Jaroslav Kysela <perex(a)perex.cz> Cc: Mel Gorman <mgorman(a)techsingularity.net> Cc: Michal Hocko <mhocko(a)kernel.org> Cc: Takashi Iwai <tiwai(a)suse.com> Cc: <stable(a)vger.kernel.org> Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org> diff --git a/include/linux/gfp.h b/include/linux/gfp.h index de292a007138..e2a916cf29c4 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -353,6 +353,15 @@ static inline bool gfp_has_io_fs(gfp_t gfp) return (gfp & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS); } +/* + * Check if the gfp flags allow compaction - GFP_NOIO is a really + * tricky context because the migration might require IO. + */ +static inline bool gfp_compaction_allowed(gfp_t gfp_mask) +{ + return IS_ENABLED(CONFIG_COMPACTION) && (gfp_mask & __GFP_IO); +} + extern gfp_t vma_thp_gfp_mask(struct vm_area_struct *vma); #ifdef CONFIG_CONTIG_ALLOC diff --git a/mm/compaction.c b/mm/compaction.c index 4add68d40e8d..b961db601df4 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -2723,16 +2723,11 @@ enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order, unsigned int alloc_flags, const struct alloc_context *ac, enum compact_priority prio, struct page **capture) { - int may_perform_io = (__force int)(gfp_mask & __GFP_IO); struct zoneref *z; struct zone *zone; enum compact_result rc = COMPACT_SKIPPED; - /* - * Check if the GFP flags allow compaction - GFP_NOIO is really - * tricky context because the migration might require IO - */ - if (!may_perform_io) + if (!gfp_compaction_allowed(gfp_mask)) return COMPACT_SKIPPED; trace_mm_compaction_try_to_compact_pages(order, gfp_mask, prio); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 150d4f23b010..a663202045dc 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4041,6 +4041,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, struct alloc_context *ac) { bool can_direct_reclaim = gfp_mask & __GFP_DIRECT_RECLAIM; + bool can_compact = gfp_compaction_allowed(gfp_mask); const bool costly_order = order > PAGE_ALLOC_COSTLY_ORDER; struct page *page = NULL; unsigned int alloc_flags; @@ -4111,7 +4112,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, * Don't try this for allocations that are allowed to ignore * watermarks, as the ALLOC_NO_WATERMARKS attempt didn't yet happen. */ - if (can_direct_reclaim && + if (can_direct_reclaim && can_compact && (costly_order || (order > 0 && ac->migratetype != MIGRATE_MOVABLE)) && !gfp_pfmemalloc_allowed(gfp_mask)) { @@ -4209,9 +4210,10 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, /* * Do not retry costly high order allocations unless they are - * __GFP_RETRY_MAYFAIL + * __GFP_RETRY_MAYFAIL and we can compact */ - if (costly_order && !(gfp_mask & __GFP_RETRY_MAYFAIL)) + if (costly_order && (!can_compact || + !(gfp_mask & __GFP_RETRY_MAYFAIL))) goto nopage; if (should_reclaim_retry(gfp_mask, order, ac, alloc_flags, @@ -4224,7 +4226,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, * implementation of the compaction depends on the sufficient amount * of free memory (see __compaction_suitable) */ - if (did_some_progress > 0 && + if (did_some_progress > 0 && can_compact && should_compact_retry(ac, order, alloc_flags, compact_result, &compact_priority, &compaction_retries)) diff --git a/mm/vmscan.c b/mm/vmscan.c index 4f9c854ce6cc..4255619a1a31 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -5753,7 +5753,7 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) /* Use reclaim/compaction for costly allocs or under memory pressure */ static bool in_reclaim_compaction(struct scan_control *sc) { - if (IS_ENABLED(CONFIG_COMPACTION) && sc->order && + if (gfp_compaction_allowed(sc->gfp_mask) && sc->order && (sc->order > PAGE_ALLOC_COSTLY_ORDER || sc->priority < DEF_PRIORITY - 2)) return true; @@ -5998,6 +5998,9 @@ static inline bool compaction_ready(struct zone *zone, struct scan_control *sc) { unsigned long watermark; + if (!gfp_compaction_allowed(sc->gfp_mask)) + return false; + /* Allocation can already succeed, nothing to do */ if (zone_watermark_ok(zone, sc->order, min_wmark_pages(zone), sc->reclaim_idx, 0))

1 year, 8 months

3
2
0 0

Fixing the devicetree of Rock 5 Model B (and possibly others)

by Pratham Patel

Since the introduction of the `of: property: fw_devlink: Fix stupid bug in remote-endpoint parsing` patch, an issue with the device-tree of the Rock 5 Model B has been detected. All the stable kernels (6.7.y and 6.8.y) work on the Orange Pi 5, which has the Rockchip RK3588S SoC (same as the RK3588, but less I/O basically). So, being an owner of only two SBCs which use the RK3588* SoC, it appears that the Rock 5 Model B's DT is incorrect. I looked at the patch and tried several things, neither resulted in anything that would point me to the core issue. Then I tried this: ``` $ grep -C 3 remote-endpoint arch/arm64/boot/dts/rockchip/rk3588-rock-5b.dts port { es8316_p0_0: endpoint { remote-endpoint = <&i2s0_8ch_p0_0>; }; }; }; -- i2s0_8ch_p0_0: endpoint { dai-format = "i2s"; mclk-fs = <256>; remote-endpoint = <&es8316_p0_0>; }; }; }; ``` So, from a cursory look, the issue seems to be related to either the DT node for the audio codec or related to the es8316's binding itself. Though I doubt that the later is the issue because if that were the issue, _someone_ with a Pine64 Pinebook Pro would've raised alarms. So far, this seems to be related to the `rk3588-rock-5b.dts` and possibly with the `rk3588s-rock-5a.dts` too. I would **love** to help but I'm afraid I device-trees are not something that I am at-all familiar with. That said, I am open to methods of debugging this issue to provide a fix myself. I would have replied to the patch's link but unfortunately, I haven't yet setup neomutt and my email provider's web UI doesn't have a [straightforward] way to reply using the 'In-Reply-To' header, hence a new thread. Apologies for the inconvenience caused. -- Pratham Patel

1 year, 8 months

5
11
0 0

[PATCH] serial: 8250_dw: Revert: Do not reclock if already at correct rate

by Hans de Goede

Commit e5d6bd25f93d ("serial: 8250_dw: Do not reclock if already at correct rate") breaks the dw UARTs on Intel Bay Trail (BYT) and Cherry Trail (CHT) SoCs. Before this change the RTL8732BS Bluetooth HCI which is found connected over the dw UART on both BYT and CHT boards works properly: Bluetooth: hci0: RTL: examining hci_ver=06 hci_rev=000b lmp_ver=06 lmp_subver=8723 Bluetooth: hci0: RTL: rom_version status=0 version=1 Bluetooth: hci0: RTL: loading rtl_bt/rtl8723bs_fw.bin Bluetooth: hci0: RTL: loading rtl_bt/rtl8723bs_config-OBDA8723.bin Bluetooth: hci0: RTL: cfg_sz 64, total sz 24508 Bluetooth: hci0: RTL: fw version 0x365d462e where as after this change probing it fails: Bluetooth: hci0: RTL: examining hci_ver=06 hci_rev=000b lmp_ver=06 lmp_subver=8723 Bluetooth: hci0: RTL: rom_version status=0 version=1 Bluetooth: hci0: RTL: loading rtl_bt/rtl8723bs_fw.bin Bluetooth: hci0: RTL: loading rtl_bt/rtl8723bs_config-OBDA8723.bin Bluetooth: hci0: RTL: cfg_sz 64, total sz 24508 Bluetooth: hci0: command 0xfc20 tx timeout Bluetooth: hci0: RTL: download fw command failed (-110) Revert the changes to fix this regression. Fixes: e5d6bd25f93d ("serial: 8250_dw: Do not reclock if already at correct rate") Cc: stable(a)vger.kernel.org Cc: Peter Collingbourne <pcc(a)google.com> Signed-off-by: Hans de Goede <hdegoede(a)redhat.com> --- Note it is not entirely clear to me why this commit is causing this issue. Maybe probe() needs to explicitly set the clk rate which it just got (that feels like a clk driver issue) or maybe the issue is that unless setup before hand by firmware / the bootloader serial8250_update_uartclk() needs to be called at least once to setup things ? Note that probe() does not call serial8250_update_uartclk(), this is only called from the dw8250_clk_notifier_cb() This requires more debugging which is why I'm proposing a straight revert to fix the regression ASAP and then this can be investigated further. --- drivers/tty/serial/8250/8250_dw.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/tty/serial/8250/8250_dw.c b/drivers/tty/serial/8250/8250_dw.c index c1d43f040c43..2d1f350a4bea 100644 --- a/drivers/tty/serial/8250/8250_dw.c +++ b/drivers/tty/serial/8250/8250_dw.c @@ -357,9 +357,9 @@ static void dw8250_set_termios(struct uart_port *p, struct ktermios *termios, long rate; int ret; + clk_disable_unprepare(d->clk); rate = clk_round_rate(d->clk, newrate); - if (rate > 0 && p->uartclk != rate) { - clk_disable_unprepare(d->clk); + if (rate > 0) { /* * Note that any clock-notifer worker will block in * serial8250_update_uartclk() until we are done. @@ -367,8 +367,8 @@ static void dw8250_set_termios(struct uart_port *p, struct ktermios *termios, ret = clk_set_rate(d->clk, newrate); if (!ret) p->uartclk = rate; - clk_prepare_enable(d->clk); } + clk_prepare_enable(d->clk); dw8250_do_set_termios(p, termios, old); } -- 2.44.0

1 year, 8 months

5
9
0 0

FAILED: Patch "bounds: support non-power-of-two CONFIG_NR_CPUS" failed to apply to 5.4-stable tree

by Sasha Levin

The patch below does not apply to the 5.4-stable tree. If someone wants it applied there, or to any other stable or longterm tree, then please email the backport, including the original git commit id to <stable(a)vger.kernel.org>. Thanks, Sasha ------------------ original commit in Linus's tree ------------------ From f2d5dcb48f7ba9e3ff249d58fc1fa963d374e66a Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" <willy(a)infradead.org> Date: Tue, 10 Oct 2023 15:55:49 +0100 Subject: [PATCH] bounds: support non-power-of-two CONFIG_NR_CPUS ilog2() rounds down, so for example when PowerPC 85xx sets CONFIG_NR_CPUS to 24, we will only allocate 4 bits to store the number of CPUs instead of 5. Use bits_per() instead, which rounds up. Found by code inspection. The effect of this would probably be a misaccounting when doing NUMA balancing, so to a user, it would only be a performance penalty. The effects may be more wide-spread; it's hard to tell. Link: https://lkml.kernel.org/r/20231010145549.1244748-1-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) <willy(a)infradead.org> Fixes: 90572890d202 ("mm: numa: Change page last {nid,pid} into {cpu,pid}") Reviewed-by: Rik van Riel <riel(a)surriel.com> Acked-by: Mel Gorman <mgorman(a)techsingularity.net> Cc: Peter Zijlstra <peterz(a)infradead.org> Cc: Ingo Molnar <mingo(a)kernel.org> Cc: <stable(a)vger.kernel.org> Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org> --- kernel/bounds.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/bounds.c b/kernel/bounds.c index b529182e8b04f..c5a9fcd2d6228 100644 --- a/kernel/bounds.c +++ b/kernel/bounds.c @@ -19,7 +19,7 @@ int main(void) DEFINE(NR_PAGEFLAGS, __NR_PAGEFLAGS); DEFINE(MAX_NR_ZONES, __MAX_NR_ZONES); #ifdef CONFIG_SMP - DEFINE(NR_CPUS_BITS, ilog2(CONFIG_NR_CPUS)); + DEFINE(NR_CPUS_BITS, bits_per(CONFIG_NR_CPUS)); #endif DEFINE(SPINLOCK_SIZE, sizeof(spinlock_t)); #ifdef CONFIG_LRU_GEN -- 2.43.0

1 year, 8 months

3
5
0 0

Re: [PATCH v2] locking/rwsem: Disable preemption while trying for rwsem lock

by Waiman Long

On 3/5/24 06:04, Aaro Koskinen wrote: > Hi, > > It seems this patch (commit 48dfb5d2560d) is missing from > at least 5.15 stable tree. > > Based on quick test, it seems to fix an issue where system locks up > easily when RT throttling is disabled, and also it applies cleanly, so > I think it should be good to have it 5.15? You need to cc stable as the locking maintainers are not responsible for merging patches to stable trees. Cheers, Longman > > A. > > On Thu, Sep 08, 2022 at 11:54:27PM +0530, Mukesh Ojha wrote: >> From: Gokul krishna Krishnakumar <quic_gokukris(a)quicinc.com> >> >> Make the region inside the rwsem_write_trylock non preemptible. >> >> We observe RT task is hogging CPU when trying to acquire rwsem lock >> which was acquired by a kworker task but before the rwsem owner was set. >> >> Here is the scenario: >> 1. CFS task (affined to a particular CPU) takes rwsem lock. >> >> 2. CFS task gets preempted by a RT task before setting owner. >> >> 3. RT task (FIFO) is trying to acquire the lock, but spinning until >> RT throttling happens for the lock as the lock was taken by CFS task. >> >> This patch attempts to fix the above issue by disabling preemption >> until owner is set for the lock. While at it also fix the issues >> at the places where rwsem_{set,clear}_owner() are called. >> >> This also adds lockdep annotation of preemption disable in >> rwsem_{set,clear}_owner() on Peter Z. suggestion. >> >> Signed-off-by: Gokul krishna Krishnakumar <quic_gokukris(a)quicinc.com> >> Signed-off-by: Mukesh Ojha <quic_mojha(a)quicinc.com> >> --- >> Changes in v2: >> - Remove preempt disable code in rwsem_try_write_lock_unqueued() >> - Addressed suggestion from Peter Z. >> - Modified commit text >> kernel/locking/rwsem.c | 14 ++++++++++++-- >> 1 file changed, 12 insertions(+), 2 deletions(-) >> >> diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c >> index 65f0262..4487359 100644 >> --- a/kernel/locking/rwsem.c >> +++ b/kernel/locking/rwsem.c >> @@ -133,14 +133,19 @@ >> * the owner value concurrently without lock. Read from owner, however, >> * may not need READ_ONCE() as long as the pointer value is only used >> * for comparison and isn't being dereferenced. >> + * >> + * Both rwsem_{set,clear}_owner() functions should be in the same >> + * preempt disable section as the atomic op that changes sem->count. >> */ >> static inline void rwsem_set_owner(struct rw_semaphore *sem) >> { >> + lockdep_assert_preemption_disabled(); >> atomic_long_set(&sem->owner, (long)current); >> } >> >> static inline void rwsem_clear_owner(struct rw_semaphore *sem) >> { >> + lockdep_assert_preemption_disabled(); >> atomic_long_set(&sem->owner, 0); >> } >> >> @@ -251,13 +256,16 @@ static inline bool rwsem_read_trylock(struct rw_semaphore *sem, long *cntp) >> static inline bool rwsem_write_trylock(struct rw_semaphore *sem) >> { >> long tmp = RWSEM_UNLOCKED_VALUE; >> + bool ret = false; >> >> + preempt_disable(); >> if (atomic_long_try_cmpxchg_acquire(&sem->count, &tmp, RWSEM_WRITER_LOCKED)) { >> rwsem_set_owner(sem); >> - return true; >> + ret = true; >> } >> >> - return false; >> + preempt_enable(); >> + return ret; >> } >> >> /* >> @@ -1352,8 +1360,10 @@ static inline void __up_write(struct rw_semaphore *sem) >> DEBUG_RWSEMS_WARN_ON((rwsem_owner(sem) != current) && >> !rwsem_test_oflags(sem, RWSEM_NONSPINNABLE), sem); >> >> + preempt_disable(); >> rwsem_clear_owner(sem); >> tmp = atomic_long_fetch_add_release(-RWSEM_WRITER_LOCKED, &sem->count); >> + preempt_enable(); >> if (unlikely(tmp & RWSEM_FLAG_WAITERS)) >> rwsem_wake(sem); >> } >> -- >> 2.7.4 >> >>

1 year, 8 months

3
4
0 0

[PATCH v2 2/7] btrfs: reduce the log level for btrfs_dev_stat_inc_and_print()

by Qu Wenruo

Currently when we increase the device statistics, it would always lead to an error message in the kernel log. However this output is mostly duplicated with the existing ones: - For scrub operations We always have the following messages: * "fixed up error at logical %llu" * "unable to fixup (regular) error at logical %llu" So no matter if the corruption is repaired or not, it scrub would output an error message to indicate the problem. - For non-scrub read operations We also have the following messages: * "csum failed root %lld inode %llu off %llu" for data csum mismatch * "bad (tree block start|fsid|tree block level)" for metadata * "read error corrected: ino %llu off %llu" for repaired data/metadata So the error message from btrfs_dev_stat_inc_and_print() is duplicated. The real usage for the btrfs device statistics is for some user space daemon to check if there is any new errors, acting like some checks on SMART, thus we don't really need/want those messages in dmesg. This patch would reduce the log level to debug (disabled by default) for btrfs_dev_stat_inc_and_print(). For users really want to utilize btrfs devices statistics, they should go check "btrfs device stats" periodically, and we should focus the kernel error messages to more important things. CC: stable(a)vger.kernel.org Reviewed-by: Filipe Manana <fdmanana(a)suse.com> Signed-off-by: Qu Wenruo <wqu(a)suse.com> --- fs/btrfs/volumes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index e49935a54da0..126145950ed3 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -7828,7 +7828,7 @@ void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index) if (!dev->dev_stats_valid) return; - btrfs_err_rl_in_rcu(dev->fs_info, + btrfs_debug_rl_in_rcu(dev->fs_info, "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u", btrfs_dev_name(dev), btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS), -- 2.44.0

1 year, 8 months

3
2
0 0

[PATCH v2] selftests/ftrace: Limit length in subsystem-enable tests

by Yuanhe Shu

While sched* events being traced and sched* events continuously happen, "[xx] event tracing - enable/disable with subsystem level files" would not stop as on some slower systems it seems to take forever. Select the first 100 lines of output would be enough to judge whether there are more than 3 types of sched events. Fixes: 815b18ea66d6 ("ftracetest: Add basic event tracing test cases") Cc: stable(a)vger.kernel.org Signed-off-by: Yuanhe Shu <xiangzao(a)linux.alibaba.com> --- .../selftests/ftrace/test.d/event/subsystem-enable.tc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/ftrace/test.d/event/subsystem-enable.tc b/tools/testing/selftests/ftrace/test.d/event/subsystem-enable.tc index b1ede6249866..b7c8f29c09a9 100644 --- a/tools/testing/selftests/ftrace/test.d/event/subsystem-enable.tc +++ b/tools/testing/selftests/ftrace/test.d/event/subsystem-enable.tc @@ -18,7 +18,7 @@ echo 'sched:*' > set_event yield -count=`cat trace | grep -v ^# | awk '{ print $5 }' | sort -u | wc -l` +count=`head -n 100 trace | grep -v ^# | awk '{ print $5 }' | sort -u | wc -l` if [ $count -lt 3 ]; then fail "at least fork, exec and exit events should be recorded" fi @@ -29,7 +29,7 @@ echo 1 > events/sched/enable yield -count=`cat trace | grep -v ^# | awk '{ print $5 }' | sort -u | wc -l` +count=`head -n 100 trace | grep -v ^# | awk '{ print $5 }' | sort -u | wc -l` if [ $count -lt 3 ]; then fail "at least fork, exec and exit events should be recorded" fi @@ -40,7 +40,7 @@ echo 0 > events/sched/enable yield -count=`cat trace | grep -v ^# | awk '{ print $5 }' | sort -u | wc -l` +count=`head -n 100 trace | grep -v ^# | awk '{ print $5 }' | sort -u | wc -l` if [ $count -ne 0 ]; then fail "any of scheduler events should not be recorded" fi -- 2.39.3

1 year, 8 months

4
3
0 0

FAILED: patch "[PATCH] mm, vmscan: prevent infinite loop for costly GFP_NOIO |" failed to apply to 5.4-stable tree

by gregkh＠linuxfoundation.org

The patch below does not apply to the 5.4-stable tree. If someone wants it applied there, or to any other stable or longterm tree, then please email the backport, including the original git commit id to <stable(a)vger.kernel.org>. To reproduce the conflict and resubmit, you may use the following commands: git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.4.y git checkout FETCH_HEAD git cherry-pick -x 803de9000f334b771afacb6ff3e78622916668b0 # <resolve conflicts, build, test, etc.> git commit -s git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024032730-triceps-mustang-3ced@gregkh' --subject-prefix 'PATCH 5.4.y' HEAD^.. Possible dependencies: 803de9000f33 ("mm, vmscan: prevent infinite loop for costly GFP_NOIO | __GFP_RETRY_MAYFAIL allocations") f98a497e1f16 ("mm: compaction: remove unnecessary is_via_compact_memory() checks") e8606320e9af ("mm: compaction: refactor __compaction_suitable()") fe573327ffb1 ("tracing: incorrect gfp_t conversion") cff387d6a294 ("mm: compaction: make compaction_zonelist_suitable return false when COMPACT_SUCCESS") 9353ffa6e9e9 ("kasan, page_alloc: allow skipping memory init for HW_TAGS") 53ae233c30a6 ("kasan, page_alloc: allow skipping unpoisoning for HW_TAGS") f49d9c5bb15c ("kasan, mm: only define ___GFP_SKIP_KASAN_POISON with HW_TAGS") e9d0ca922816 ("kasan, page_alloc: rework kasan_unpoison_pages call site") 7e3cbba65de2 ("kasan, page_alloc: move kernel_init_free_pages in post_alloc_hook") 89b271163328 ("kasan, page_alloc: move SetPageSkipKASanPoison in post_alloc_hook") 9294b1281d0a ("kasan, page_alloc: combine tag_clear_highpage calls in post_alloc_hook") b42090ae6f3a ("kasan, page_alloc: merge kasan_alloc_pages into post_alloc_hook") b8491b9052fe ("kasan, page_alloc: refactor init checks in post_alloc_hook") 1c0e5b24f117 ("kasan: only apply __GFP_ZEROTAGS when memory is zeroed") c82ce3195fd1 ("mm: clarify __GFP_ZEROTAGS comment") 7c13c163e036 ("kasan, page_alloc: merge kasan_free_pages into free_pages_prepare") 5b2c07138cbd ("kasan, page_alloc: move tag_clear_highpage out of kernel_init_free_pages") 94ae8b83fefc ("kasan, page_alloc: deduplicate should_skip_kasan_poison") 3bf03b9a0839 ("Merge branch 'akpm' (patches from Andrew)") thanks, greg k-h ------------------ original commit in Linus's tree ------------------ From 803de9000f334b771afacb6ff3e78622916668b0 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka <vbabka(a)suse.cz> Date: Wed, 21 Feb 2024 12:43:58 +0100 Subject: [PATCH] mm, vmscan: prevent infinite loop for costly GFP_NOIO | __GFP_RETRY_MAYFAIL allocations Sven reports an infinite loop in __alloc_pages_slowpath() for costly order __GFP_RETRY_MAYFAIL allocations that are also GFP_NOIO. Such combination can happen in a suspend/resume context where a GFP_KERNEL allocation can have __GFP_IO masked out via gfp_allowed_mask. Quoting Sven: 1. try to do a "costly" allocation (order > PAGE_ALLOC_COSTLY_ORDER) with __GFP_RETRY_MAYFAIL set. 2. page alloc's __alloc_pages_slowpath tries to get a page from the freelist. This fails because there is nothing free of that costly order. 3. page alloc tries to reclaim by calling __alloc_pages_direct_reclaim, which bails out because a zone is ready to be compacted; it pretends to have made a single page of progress. 4. page alloc tries to compact, but this always bails out early because __GFP_IO is not set (it's not passed by the snd allocator, and even if it were, we are suspending so the __GFP_IO flag would be cleared anyway). 5. page alloc believes reclaim progress was made (because of the pretense in item 3) and so it checks whether it should retry compaction. The compaction retry logic thinks it should try again, because: a) reclaim is needed because of the early bail-out in item 4 b) a zonelist is suitable for compaction 6. goto 2. indefinite stall. (end quote) The immediate root cause is confusing the COMPACT_SKIPPED returned from __alloc_pages_direct_compact() (step 4) due to lack of __GFP_IO to be indicating a lack of order-0 pages, and in step 5 evaluating that in should_compact_retry() as a reason to retry, before incrementing and limiting the number of retries. There are however other places that wrongly assume that compaction can happen while we lack __GFP_IO. To fix this, introduce gfp_compaction_allowed() to abstract the __GFP_IO evaluation and switch the open-coded test in try_to_compact_pages() to use it. Also use the new helper in: - compaction_ready(), which will make reclaim not bail out in step 3, so there's at least one attempt to actually reclaim, even if chances are small for a costly order - in_reclaim_compaction() which will make should_continue_reclaim() return false and we don't over-reclaim unnecessarily - in __alloc_pages_slowpath() to set a local variable can_compact, which is then used to avoid retrying reclaim/compaction for costly allocations (step 5) if we can't compact and also to skip the early compaction attempt that we do in some cases Link: https://lkml.kernel.org/r/20240221114357.13655-2-vbabka@suse.cz Fixes: 3250845d0526 ("Revert "mm, oom: prevent premature OOM killer invocation for high order request"") Signed-off-by: Vlastimil Babka <vbabka(a)suse.cz> Reported-by: Sven van Ashbrook <svenva(a)chromium.org> Closes: https://lore.kernel.org/all/CAG-rBihs_xMKb3wrMO1%2B-%2Bp4fowP9oy1pa_OTkfxBz… Tested-by: Karthikeyan Ramasubramanian <kramasub(a)chromium.org> Cc: Brian Geffon <bgeffon(a)google.com> Cc: Curtis Malainey <cujomalainey(a)chromium.org> Cc: Jaroslav Kysela <perex(a)perex.cz> Cc: Mel Gorman <mgorman(a)techsingularity.net> Cc: Michal Hocko <mhocko(a)kernel.org> Cc: Takashi Iwai <tiwai(a)suse.com> Cc: <stable(a)vger.kernel.org> Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org> diff --git a/include/linux/gfp.h b/include/linux/gfp.h index de292a007138..e2a916cf29c4 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -353,6 +353,15 @@ static inline bool gfp_has_io_fs(gfp_t gfp) return (gfp & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS); } +/* + * Check if the gfp flags allow compaction - GFP_NOIO is a really + * tricky context because the migration might require IO. + */ +static inline bool gfp_compaction_allowed(gfp_t gfp_mask) +{ + return IS_ENABLED(CONFIG_COMPACTION) && (gfp_mask & __GFP_IO); +} + extern gfp_t vma_thp_gfp_mask(struct vm_area_struct *vma); #ifdef CONFIG_CONTIG_ALLOC diff --git a/mm/compaction.c b/mm/compaction.c index 4add68d40e8d..b961db601df4 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -2723,16 +2723,11 @@ enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order, unsigned int alloc_flags, const struct alloc_context *ac, enum compact_priority prio, struct page **capture) { - int may_perform_io = (__force int)(gfp_mask & __GFP_IO); struct zoneref *z; struct zone *zone; enum compact_result rc = COMPACT_SKIPPED; - /* - * Check if the GFP flags allow compaction - GFP_NOIO is really - * tricky context because the migration might require IO - */ - if (!may_perform_io) + if (!gfp_compaction_allowed(gfp_mask)) return COMPACT_SKIPPED; trace_mm_compaction_try_to_compact_pages(order, gfp_mask, prio); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 150d4f23b010..a663202045dc 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4041,6 +4041,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, struct alloc_context *ac) { bool can_direct_reclaim = gfp_mask & __GFP_DIRECT_RECLAIM; + bool can_compact = gfp_compaction_allowed(gfp_mask); const bool costly_order = order > PAGE_ALLOC_COSTLY_ORDER; struct page *page = NULL; unsigned int alloc_flags; @@ -4111,7 +4112,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, * Don't try this for allocations that are allowed to ignore * watermarks, as the ALLOC_NO_WATERMARKS attempt didn't yet happen. */ - if (can_direct_reclaim && + if (can_direct_reclaim && can_compact && (costly_order || (order > 0 && ac->migratetype != MIGRATE_MOVABLE)) && !gfp_pfmemalloc_allowed(gfp_mask)) { @@ -4209,9 +4210,10 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, /* * Do not retry costly high order allocations unless they are - * __GFP_RETRY_MAYFAIL + * __GFP_RETRY_MAYFAIL and we can compact */ - if (costly_order && !(gfp_mask & __GFP_RETRY_MAYFAIL)) + if (costly_order && (!can_compact || + !(gfp_mask & __GFP_RETRY_MAYFAIL))) goto nopage; if (should_reclaim_retry(gfp_mask, order, ac, alloc_flags, @@ -4224,7 +4226,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, * implementation of the compaction depends on the sufficient amount * of free memory (see __compaction_suitable) */ - if (did_some_progress > 0 && + if (did_some_progress > 0 && can_compact && should_compact_retry(ac, order, alloc_flags, compact_result, &compact_priority, &compaction_retries)) diff --git a/mm/vmscan.c b/mm/vmscan.c index 4f9c854ce6cc..4255619a1a31 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -5753,7 +5753,7 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) /* Use reclaim/compaction for costly allocs or under memory pressure */ static bool in_reclaim_compaction(struct scan_control *sc) { - if (IS_ENABLED(CONFIG_COMPACTION) && sc->order && + if (gfp_compaction_allowed(sc->gfp_mask) && sc->order && (sc->order > PAGE_ALLOC_COSTLY_ORDER || sc->priority < DEF_PRIORITY - 2)) return true; @@ -5998,6 +5998,9 @@ static inline bool compaction_ready(struct zone *zone, struct scan_control *sc) { unsigned long watermark; + if (!gfp_compaction_allowed(sc->gfp_mask)) + return false; + /* Allocation can already succeed, nothing to do */ if (zone_watermark_ok(zone, sc->order, min_wmark_pages(zone), sc->reclaim_idx, 0))

1 year, 8 months

2
1
0 0

FAILED: patch "[PATCH] mm, vmscan: prevent infinite loop for costly GFP_NOIO |" failed to apply to 5.10-stable tree

by gregkh＠linuxfoundation.org

The patch below does not apply to the 5.10-stable tree. If someone wants it applied there, or to any other stable or longterm tree, then please email the backport, including the original git commit id to <stable(a)vger.kernel.org>. To reproduce the conflict and resubmit, you may use the following commands: git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.10.y git checkout FETCH_HEAD git cherry-pick -x 803de9000f334b771afacb6ff3e78622916668b0 # <resolve conflicts, build, test, etc.> git commit -s git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024032727-pastel-sincerity-a986@gregkh' --subject-prefix 'PATCH 5.10.y' HEAD^.. Possible dependencies: 803de9000f33 ("mm, vmscan: prevent infinite loop for costly GFP_NOIO | __GFP_RETRY_MAYFAIL allocations") f98a497e1f16 ("mm: compaction: remove unnecessary is_via_compact_memory() checks") e8606320e9af ("mm: compaction: refactor __compaction_suitable()") fe573327ffb1 ("tracing: incorrect gfp_t conversion") cff387d6a294 ("mm: compaction: make compaction_zonelist_suitable return false when COMPACT_SUCCESS") 9353ffa6e9e9 ("kasan, page_alloc: allow skipping memory init for HW_TAGS") 53ae233c30a6 ("kasan, page_alloc: allow skipping unpoisoning for HW_TAGS") f49d9c5bb15c ("kasan, mm: only define ___GFP_SKIP_KASAN_POISON with HW_TAGS") e9d0ca922816 ("kasan, page_alloc: rework kasan_unpoison_pages call site") 7e3cbba65de2 ("kasan, page_alloc: move kernel_init_free_pages in post_alloc_hook") 89b271163328 ("kasan, page_alloc: move SetPageSkipKASanPoison in post_alloc_hook") 9294b1281d0a ("kasan, page_alloc: combine tag_clear_highpage calls in post_alloc_hook") b42090ae6f3a ("kasan, page_alloc: merge kasan_alloc_pages into post_alloc_hook") b8491b9052fe ("kasan, page_alloc: refactor init checks in post_alloc_hook") 1c0e5b24f117 ("kasan: only apply __GFP_ZEROTAGS when memory is zeroed") c82ce3195fd1 ("mm: clarify __GFP_ZEROTAGS comment") 7c13c163e036 ("kasan, page_alloc: merge kasan_free_pages into free_pages_prepare") 5b2c07138cbd ("kasan, page_alloc: move tag_clear_highpage out of kernel_init_free_pages") 94ae8b83fefc ("kasan, page_alloc: deduplicate should_skip_kasan_poison") 3bf03b9a0839 ("Merge branch 'akpm' (patches from Andrew)") thanks, greg k-h ------------------ original commit in Linus's tree ------------------ From 803de9000f334b771afacb6ff3e78622916668b0 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka <vbabka(a)suse.cz> Date: Wed, 21 Feb 2024 12:43:58 +0100 Subject: [PATCH] mm, vmscan: prevent infinite loop for costly GFP_NOIO | __GFP_RETRY_MAYFAIL allocations Sven reports an infinite loop in __alloc_pages_slowpath() for costly order __GFP_RETRY_MAYFAIL allocations that are also GFP_NOIO. Such combination can happen in a suspend/resume context where a GFP_KERNEL allocation can have __GFP_IO masked out via gfp_allowed_mask. Quoting Sven: 1. try to do a "costly" allocation (order > PAGE_ALLOC_COSTLY_ORDER) with __GFP_RETRY_MAYFAIL set. 2. page alloc's __alloc_pages_slowpath tries to get a page from the freelist. This fails because there is nothing free of that costly order. 3. page alloc tries to reclaim by calling __alloc_pages_direct_reclaim, which bails out because a zone is ready to be compacted; it pretends to have made a single page of progress. 4. page alloc tries to compact, but this always bails out early because __GFP_IO is not set (it's not passed by the snd allocator, and even if it were, we are suspending so the __GFP_IO flag would be cleared anyway). 5. page alloc believes reclaim progress was made (because of the pretense in item 3) and so it checks whether it should retry compaction. The compaction retry logic thinks it should try again, because: a) reclaim is needed because of the early bail-out in item 4 b) a zonelist is suitable for compaction 6. goto 2. indefinite stall. (end quote) The immediate root cause is confusing the COMPACT_SKIPPED returned from __alloc_pages_direct_compact() (step 4) due to lack of __GFP_IO to be indicating a lack of order-0 pages, and in step 5 evaluating that in should_compact_retry() as a reason to retry, before incrementing and limiting the number of retries. There are however other places that wrongly assume that compaction can happen while we lack __GFP_IO. To fix this, introduce gfp_compaction_allowed() to abstract the __GFP_IO evaluation and switch the open-coded test in try_to_compact_pages() to use it. Also use the new helper in: - compaction_ready(), which will make reclaim not bail out in step 3, so there's at least one attempt to actually reclaim, even if chances are small for a costly order - in_reclaim_compaction() which will make should_continue_reclaim() return false and we don't over-reclaim unnecessarily - in __alloc_pages_slowpath() to set a local variable can_compact, which is then used to avoid retrying reclaim/compaction for costly allocations (step 5) if we can't compact and also to skip the early compaction attempt that we do in some cases Link: https://lkml.kernel.org/r/20240221114357.13655-2-vbabka@suse.cz Fixes: 3250845d0526 ("Revert "mm, oom: prevent premature OOM killer invocation for high order request"") Signed-off-by: Vlastimil Babka <vbabka(a)suse.cz> Reported-by: Sven van Ashbrook <svenva(a)chromium.org> Closes: https://lore.kernel.org/all/CAG-rBihs_xMKb3wrMO1%2B-%2Bp4fowP9oy1pa_OTkfxBz… Tested-by: Karthikeyan Ramasubramanian <kramasub(a)chromium.org> Cc: Brian Geffon <bgeffon(a)google.com> Cc: Curtis Malainey <cujomalainey(a)chromium.org> Cc: Jaroslav Kysela <perex(a)perex.cz> Cc: Mel Gorman <mgorman(a)techsingularity.net> Cc: Michal Hocko <mhocko(a)kernel.org> Cc: Takashi Iwai <tiwai(a)suse.com> Cc: <stable(a)vger.kernel.org> Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org> diff --git a/include/linux/gfp.h b/include/linux/gfp.h index de292a007138..e2a916cf29c4 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -353,6 +353,15 @@ static inline bool gfp_has_io_fs(gfp_t gfp) return (gfp & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS); } +/* + * Check if the gfp flags allow compaction - GFP_NOIO is a really + * tricky context because the migration might require IO. + */ +static inline bool gfp_compaction_allowed(gfp_t gfp_mask) +{ + return IS_ENABLED(CONFIG_COMPACTION) && (gfp_mask & __GFP_IO); +} + extern gfp_t vma_thp_gfp_mask(struct vm_area_struct *vma); #ifdef CONFIG_CONTIG_ALLOC diff --git a/mm/compaction.c b/mm/compaction.c index 4add68d40e8d..b961db601df4 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -2723,16 +2723,11 @@ enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order, unsigned int alloc_flags, const struct alloc_context *ac, enum compact_priority prio, struct page **capture) { - int may_perform_io = (__force int)(gfp_mask & __GFP_IO); struct zoneref *z; struct zone *zone; enum compact_result rc = COMPACT_SKIPPED; - /* - * Check if the GFP flags allow compaction - GFP_NOIO is really - * tricky context because the migration might require IO - */ - if (!may_perform_io) + if (!gfp_compaction_allowed(gfp_mask)) return COMPACT_SKIPPED; trace_mm_compaction_try_to_compact_pages(order, gfp_mask, prio); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 150d4f23b010..a663202045dc 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4041,6 +4041,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, struct alloc_context *ac) { bool can_direct_reclaim = gfp_mask & __GFP_DIRECT_RECLAIM; + bool can_compact = gfp_compaction_allowed(gfp_mask); const bool costly_order = order > PAGE_ALLOC_COSTLY_ORDER; struct page *page = NULL; unsigned int alloc_flags; @@ -4111,7 +4112,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, * Don't try this for allocations that are allowed to ignore * watermarks, as the ALLOC_NO_WATERMARKS attempt didn't yet happen. */ - if (can_direct_reclaim && + if (can_direct_reclaim && can_compact && (costly_order || (order > 0 && ac->migratetype != MIGRATE_MOVABLE)) && !gfp_pfmemalloc_allowed(gfp_mask)) { @@ -4209,9 +4210,10 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, /* * Do not retry costly high order allocations unless they are - * __GFP_RETRY_MAYFAIL + * __GFP_RETRY_MAYFAIL and we can compact */ - if (costly_order && !(gfp_mask & __GFP_RETRY_MAYFAIL)) + if (costly_order && (!can_compact || + !(gfp_mask & __GFP_RETRY_MAYFAIL))) goto nopage; if (should_reclaim_retry(gfp_mask, order, ac, alloc_flags, @@ -4224,7 +4226,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, * implementation of the compaction depends on the sufficient amount * of free memory (see __compaction_suitable) */ - if (did_some_progress > 0 && + if (did_some_progress > 0 && can_compact && should_compact_retry(ac, order, alloc_flags, compact_result, &compact_priority, &compaction_retries)) diff --git a/mm/vmscan.c b/mm/vmscan.c index 4f9c854ce6cc..4255619a1a31 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -5753,7 +5753,7 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) /* Use reclaim/compaction for costly allocs or under memory pressure */ static bool in_reclaim_compaction(struct scan_control *sc) { - if (IS_ENABLED(CONFIG_COMPACTION) && sc->order && + if (gfp_compaction_allowed(sc->gfp_mask) && sc->order && (sc->order > PAGE_ALLOC_COSTLY_ORDER || sc->priority < DEF_PRIORITY - 2)) return true; @@ -5998,6 +5998,9 @@ static inline bool compaction_ready(struct zone *zone, struct scan_control *sc) { unsigned long watermark; + if (!gfp_compaction_allowed(sc->gfp_mask)) + return false; + /* Allocation can already succeed, nothing to do */ if (zone_watermark_ok(zone, sc->order, min_wmark_pages(zone), sc->reclaim_idx, 0))

1 year, 8 months

2
3
0 0

2025

2024

2023

2022

2021

2020

2019

2018

2017

Linux-stable-mirror March 2024