From: Keith Busch <kbusch(a)kernel.org>
Some devices are reporting Controller Ready Modes Supported, but return
0 for CRTO. These devices require a much higher time to ready than that,
so they are failing to initialize after the driver started preferring
that value over CAP.TO.
The spec requires CAP.TO match the appropritate CRTO value, or be set to
0xff if CRTO is larger than that. This means that CAP.TO can be used to
validate if CRTO is reliable, and provides an appropriate fallback for
setting the timeout value if not. Use whichever is larger.
Link: https://bugzilla.kernel.org/show_bug.cgi?id=217863
Reported-by: Cláudio Sampaio <patola(a)gmail.com>
Reported-by: Felix Yan <felixonmars(a)archlinux.org>
Based-on-a-patch-by: Felix Yan <felixonmars(a)archlinux.org>
Reviewed-by: Sagi Grimberg <sagi(a)grimberg.me>
Cc: stable(a)vger.kernel.org
Signed-off-by: Keith Busch <kbusch(a)kernel.org>
---
v1->v2:
Warn once if driver isn't relying on CRTO values
drivers/nvme/host/core.c | 54 ++++++++++++++++++++++++++--------------
1 file changed, 35 insertions(+), 19 deletions(-)
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 37b6fa7466620..0685ed4f2dc49 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -2245,25 +2245,8 @@ int nvme_enable_ctrl(struct nvme_ctrl *ctrl)
else
ctrl->ctrl_config = NVME_CC_CSS_NVM;
- if (ctrl->cap & NVME_CAP_CRMS_CRWMS) {
- u32 crto;
-
- ret = ctrl->ops->reg_read32(ctrl, NVME_REG_CRTO, &crto);
- if (ret) {
- dev_err(ctrl->device, "Reading CRTO failed (%d)\n",
- ret);
- return ret;
- }
-
- if (ctrl->cap & NVME_CAP_CRMS_CRIMS) {
- ctrl->ctrl_config |= NVME_CC_CRIME;
- timeout = NVME_CRTO_CRIMT(crto);
- } else {
- timeout = NVME_CRTO_CRWMT(crto);
- }
- } else {
- timeout = NVME_CAP_TIMEOUT(ctrl->cap);
- }
+ if (ctrl->cap & NVME_CAP_CRMS_CRWMS && ctrl->cap & NVME_CAP_CRMS_CRIMS)
+ ctrl->ctrl_config |= NVME_CC_CRIME;
ctrl->ctrl_config |= (NVME_CTRL_PAGE_SHIFT - 12) << NVME_CC_MPS_SHIFT;
ctrl->ctrl_config |= NVME_CC_AMS_RR | NVME_CC_SHN_NONE;
@@ -2277,6 +2260,39 @@ int nvme_enable_ctrl(struct nvme_ctrl *ctrl)
if (ret)
return ret;
+ /* CAP value may change after initial CC write */
+ ret = ctrl->ops->reg_read64(ctrl, NVME_REG_CAP, &ctrl->cap);
+ if (ret)
+ return ret;
+
+ timeout = NVME_CAP_TIMEOUT(ctrl->cap);
+ if (ctrl->cap & NVME_CAP_CRMS_CRWMS) {
+ u32 crto, ready_timeout;
+
+ ret = ctrl->ops->reg_read32(ctrl, NVME_REG_CRTO, &crto);
+ if (ret) {
+ dev_err(ctrl->device, "Reading CRTO failed (%d)\n",
+ ret);
+ return ret;
+ }
+
+ /*
+ * CRTO should always be greater or equal to CAP.TO, but some
+ * devices are known to get this wrong. Use the larger of the
+ * two values.
+ */
+ if (ctrl->ctrl_config & NVME_CC_CRIME)
+ ready_timeout = NVME_CRTO_CRIMT(crto);
+ else
+ ready_timeout = NVME_CRTO_CRWMT(crto);
+
+ if (ready_timeout < timeout)
+ dev_warn_once(ctrl->device, "bad crto:%x cap:%llx\n",
+ crto, ctrl->cap);
+ else
+ timeout = ready_timeout;
+ }
+
ctrl->ctrl_config |= NVME_CC_ENABLE;
ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config);
if (ret)
--
2.34.1
From: Zi Yan <ziy(a)nvidia.com>
When dealing with hugetlb pages, manipulating struct page pointers
directly can get to wrong struct page, since struct page is not guaranteed
to be contiguous on SPARSEMEM without VMEMMAP. Use nth_page() to handle
it properly.
Fixes: 2813b9c02962 ("kasan, mm, arm64: tag non slab memory allocated via pagealloc")
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Zi Yan <ziy(a)nvidia.com>
Reviewed-by: Muchun Song <songmuchun(a)bytedance.com>
---
mm/cma.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/mm/cma.c b/mm/cma.c
index da2967c6a223..2b2494fd6b59 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -505,7 +505,7 @@ struct page *cma_alloc(struct cma *cma, unsigned long count,
*/
if (page) {
for (i = 0; i < count; i++)
- page_kasan_tag_reset(page + i);
+ page_kasan_tag_reset(nth_page(page, i));
}
if (ret && !no_warn) {
--
2.40.1
The patch titled
Subject: proc: nommu: /proc/<pid>/maps: release mmap read lock
has been added to the -mm mm-hotfixes-unstable branch. Its filename is
proc-nommu-proc-pid-maps-release-mmap-read-lock.patch
This patch will shortly appear at
https://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new.git/tree/patche…
This patch will later appear in the mm-hotfixes-unstable branch at
git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next via the mm-everything
branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
and is updated there every 2-3 working days
------------------------------------------------------
From: Ben Wolsieffer <Ben.Wolsieffer(a)hefring.com>
Subject: proc: nommu: /proc/<pid>/maps: release mmap read lock
Date: Thu, 14 Sep 2023 12:30:20 -0400
On NOMMU, when running "cat /proc/1/maps" twice the second run hangs.
The no-MMU implementation of /proc/<pid>/map doesn't normally release the
mmap read lock, because it uses !IS_ERR_OR_NULL(_vml) to determine whether
to release the lock. Since _vml is NULL when the end of the mappings is
reached, the lock is not released.
This code was incorrectly adapted from the MMU implementation, which at
the time released the lock in m_next() before returning the last entry.
The MMU implementation has diverged further from the no-MMU version since
then, so this patch brings their locking and error handling into sync,
fixing the bug and hopefully avoiding similar issues in the future.
Link: https://lkml.kernel.org/r/20230914163019.4050530-2-ben.wolsieffer@hefring.c…
Fixes: 47fecca15c09 ("fs/proc/task_nommu.c: don't use priv->task->mm")
Signed-off-by: Ben Wolsieffer <ben.wolsieffer(a)hefring.com>
Cc: Giulio Benetti <giulio.benetti(a)benettiengineering.com>
Cc: Greg Ungerer <gerg(a)uclinux.org>
Cc: Oleg Nesterov <oleg(a)redhat.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
fs/proc/task_nommu.c | 27 +++++++++++++++------------
1 file changed, 15 insertions(+), 12 deletions(-)
--- a/fs/proc/task_nommu.c~proc-nommu-proc-pid-maps-release-mmap-read-lock
+++ a/fs/proc/task_nommu.c
@@ -192,11 +192,16 @@ static void *m_start(struct seq_file *m,
return ERR_PTR(-ESRCH);
mm = priv->mm;
- if (!mm || !mmget_not_zero(mm))
+ if (!mm || !mmget_not_zero(mm)) {
+ put_task_struct(priv->task);
+ priv->task = NULL;
return NULL;
+ }
if (mmap_read_lock_killable(mm)) {
mmput(mm);
+ put_task_struct(priv->task);
+ priv->task = NULL;
return ERR_PTR(-EINTR);
}
@@ -205,23 +210,21 @@ static void *m_start(struct seq_file *m,
if (vma)
return vma;
- mmap_read_unlock(mm);
- mmput(mm);
return NULL;
}
-static void m_stop(struct seq_file *m, void *_vml)
+static void m_stop(struct seq_file *m, void *v)
{
struct proc_maps_private *priv = m->private;
+ struct mm_struct *mm = priv->mm;
- if (!IS_ERR_OR_NULL(_vml)) {
- mmap_read_unlock(priv->mm);
- mmput(priv->mm);
- }
- if (priv->task) {
- put_task_struct(priv->task);
- priv->task = NULL;
- }
+ if (!priv->task)
+ return;
+
+ mmap_read_unlock(mm);
+ mmput(mm);
+ put_task_struct(priv->task);
+ priv->task = NULL;
}
static void *m_next(struct seq_file *m, void *_p, loff_t *pos)
_
Patches currently in -mm which might be from Ben.Wolsieffer(a)hefring.com are
proc-nommu-proc-pid-maps-release-mmap-read-lock.patch
Fix three issues with resctrl selftests.
The signal handling fix became necessary after the mount/umount fixes.
The other two came up when I ran resctrl selftests across the server
fleet in our lab to validate the upcoming CAT test rewrite (the rewrite
is not part of this series).
These are developed and should apply cleanly at least on top the
benchmark cleanup series (might apply cleanly also w/o the benchmark
series, I didn't test).
Ilpo Järvinen (5):
selftests/resctrl: Extend signal handler coverage to unmount on
receiving signal
selftests/resctrl: Remove duplicate feature check from CMT test
selftests/resctrl: Refactor feature check to use resource and feature
name
selftests/resctrl: Fix feature checks
selftests/resctrl: Reduce failures due to outliers in MBA/MBM tests
tools/testing/selftests/resctrl/cat_test.c | 8 ---
tools/testing/selftests/resctrl/cmt_test.c | 3 -
tools/testing/selftests/resctrl/mba_test.c | 2 +-
tools/testing/selftests/resctrl/mbm_test.c | 2 +-
tools/testing/selftests/resctrl/resctrl.h | 6 +-
.../testing/selftests/resctrl/resctrl_tests.c | 37 ++++++++--
tools/testing/selftests/resctrl/resctrl_val.c | 22 +++---
tools/testing/selftests/resctrl/resctrlfs.c | 69 ++++++++-----------
8 files changed, 73 insertions(+), 76 deletions(-)
--
2.30.2
Initial booting is setting the task flag to idle (PF_IDLE) by the call
path sched_init() -> init_idle(). Having the task idle and calling
call_rcu() in kernel/rcu/tiny.c means that TIF_NEED_RESCHED will be
set. Subsequent calls to any cond_resched() will enable IRQs,
potentially earlier than the IRQ setup has completed. Recent changes
have caused just this scenario and IRQs have been enabled early.
This causes a warning later in start_kernel() as interrupts are enabled
before they are fully set up.
Fix this issue by clearing the PF_IDLE flag on return from sched_init()
and restore the flag in rest_init(). Although the boot task was marked
as idle since (at least) d80e4fda576d, I am not sure that it is wrong to
do so. The forced context-switch on idle task was introduced in the
tiny_rcu update, so I'm going to claim this fixes 5f6130fa52ee.
Link: https://lore.kernel.org/linux-mm/87v8cv22jh.fsf@mail.lhotse/
Link: https://lore.kernel.org/linux-mm/CAMuHMdWpvpWoDa=Ox-do92czYRvkok6_x6pYUH+Zo…
Fixes: 5f6130fa52ee ("tiny_rcu: Directly force QS when call_rcu_[bh|sched]() on idle_task")
Cc: stable(a)vger.kernel.org
Cc: Geert Uytterhoeven <geert(a)linux-m68k.org>
Cc: "Paul E. McKenney" <paulmck(a)kernel.org>
Cc: Christophe Leroy <christophe.leroy(a)csgroup.eu>
Cc: Andreas Schwab <schwab(a)linux-m68k.org>
Cc: Matthew Wilcox <willy(a)infradead.org>
Cc: Peng Zhang <zhangpeng.00(a)bytedance.com>
Cc: Peter Zijlstra <peterz(a)infradead.org>
Cc: Ingo Molnar <mingo(a)redhat.com>
Cc: Juri Lelli <juri.lelli(a)redhat.com>
Cc: Vincent Guittot <vincent.guittot(a)linaro.org>
Cc: Andrew Morton <akpm(a)linux-foundation.org>
Cc: "Mike Rapoport (IBM)" <rppt(a)kernel.org>
Cc: Vlastimil Babka <vbabka(a)suse.cz>
Signed-off-by: Liam R. Howlett <Liam.Howlett(a)oracle.com>
---
init/main.c | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/init/main.c b/init/main.c
index ad920fac325c..f74772acf612 100644
--- a/init/main.c
+++ b/init/main.c
@@ -696,7 +696,7 @@ noinline void __ref __noreturn rest_init(void)
*/
rcu_read_lock();
tsk = find_task_by_pid_ns(pid, &init_pid_ns);
- tsk->flags |= PF_NO_SETAFFINITY;
+ tsk->flags |= PF_NO_SETAFFINITY | PF_IDLE;
set_cpus_allowed_ptr(tsk, cpumask_of(smp_processor_id()));
rcu_read_unlock();
@@ -938,6 +938,8 @@ void start_kernel(void)
* time - but meanwhile we still have a functioning scheduler.
*/
sched_init();
+ /* Avoid early context switch, rest_init() restores PF_IDLE */
+ current->flags &= ~PF_IDLE;
if (WARN(!irqs_disabled(),
"Interrupts were enabled *very* early, fixing it\n"))
--
2.39.2
Similar to the rk817 codec alias that was missing, the rk817 charger
driver is missing a module alias as well. This absence prevents the
driver from autoprobing on OF systems when it is built as a module.
Add the right MODULE_ALIAS to fix this.
Fixes: 11cb8da0189b ("power: supply: Add charger driver for Rockchip RK817")
Cc: stable(a)vger.kernel.org
Signed-off-by: Nicolas Frattaroli <frattaroli.nicolas(a)gmail.com>
---
drivers/power/supply/rk817_charger.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/drivers/power/supply/rk817_charger.c b/drivers/power/supply/rk817_charger.c
index 1a2143641e66..76b991e112da 100644
--- a/drivers/power/supply/rk817_charger.c
+++ b/drivers/power/supply/rk817_charger.c
@@ -1211,3 +1211,4 @@ MODULE_DESCRIPTION("Battery power supply driver for RK817 PMIC");
MODULE_AUTHOR("Maya Matuszczyk <maccraft123mc(a)gmail.com>");
MODULE_AUTHOR("Chris Morgan <macromorgan(a)hotmail.com>");
MODULE_LICENSE("GPL");
+MODULE_ALIAS("platform:rk817-charger");
--
2.41.0
The patch titled
Subject: mm: memcontrol: fix GFP_NOFS recursion in memory.high enforcement
has been added to the -mm mm-hotfixes-unstable branch. Its filename is
mm-memcontrol-fix-gfp_nofs-recursion-in-memoryhigh-enforcement.patch
This patch will shortly appear at
https://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new.git/tree/patche…
This patch will later appear in the mm-hotfixes-unstable branch at
git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next via the mm-everything
branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
and is updated there every 2-3 working days
------------------------------------------------------
From: Johannes Weiner <hannes(a)cmpxchg.org>
Subject: mm: memcontrol: fix GFP_NOFS recursion in memory.high enforcement
Date: Thu, 14 Sep 2023 11:21:39 -0400
Breno and Josef report a deadlock scenario from cgroup reclaim
re-entering the filesystem:
[ 361.546690] ======================================================
[ 361.559210] WARNING: possible circular locking dependency detected
[ 361.571703] 6.5.0-0_fbk700_debug_rc0_kbuilder_13159_gbf787a128001 #1 Tainted: G S E
[ 361.589704] ------------------------------------------------------
[ 361.602277] find/9315 is trying to acquire lock:
[ 361.611625] ffff88837ba140c0 (&delayed_node->mutex){+.+.}-{4:4}, at: __btrfs_release_delayed_node+0x68/0x4f0
[ 361.631437]
[ 361.631437] but task is already holding lock:
[ 361.643243] ffff8881765b8678 (btrfs-tree-01){++++}-{4:4}, at: btrfs_tree_read_lock+0x1e/0x40
[ 362.904457] mutex_lock_nested+0x1c/0x30
[ 362.912414] __btrfs_release_delayed_node+0x68/0x4f0
[ 362.922460] btrfs_evict_inode+0x301/0x770
[ 362.982726] evict+0x17c/0x380
[ 362.988944] prune_icache_sb+0x100/0x1d0
[ 363.005559] super_cache_scan+0x1f8/0x260
[ 363.013695] do_shrink_slab+0x2a2/0x540
[ 363.021489] shrink_slab_memcg+0x237/0x3d0
[ 363.050606] shrink_slab+0xa7/0x240
[ 363.083382] shrink_node_memcgs+0x262/0x3b0
[ 363.091870] shrink_node+0x1a4/0x720
[ 363.099150] shrink_zones+0x1f6/0x5d0
[ 363.148798] do_try_to_free_pages+0x19b/0x5e0
[ 363.157633] try_to_free_mem_cgroup_pages+0x266/0x370
[ 363.190575] reclaim_high+0x16f/0x1f0
[ 363.208409] mem_cgroup_handle_over_high+0x10b/0x270
[ 363.246678] try_charge_memcg+0xaf2/0xc70
[ 363.304151] charge_memcg+0xf0/0x350
[ 363.320070] __mem_cgroup_charge+0x28/0x40
[ 363.328371] __filemap_add_folio+0x870/0xd50
[ 363.371303] filemap_add_folio+0xdd/0x310
[ 363.399696] __filemap_get_folio+0x2fc/0x7d0
[ 363.419086] pagecache_get_page+0xe/0x30
[ 363.427048] alloc_extent_buffer+0x1cd/0x6a0
[ 363.435704] read_tree_block+0x43/0xc0
[ 363.443316] read_block_for_search+0x361/0x510
[ 363.466690] btrfs_search_slot+0xc8c/0x1520
This is caused by the mem_cgroup_handle_over_high() not respecting the
gfp_mask of the allocation context. We used to only call this function on
resume to userspace, where no locks were held. But c9afe31ec443 ("memcg:
synchronously enforce memory.high for large overcharges") added a call
from the allocation context without considering the gfp.
Link: https://lkml.kernel.org/r/20230914152139.100822-1-hannes@cmpxchg.org
Fixes: c9afe31ec443 ("memcg: synchronously enforce memory.high for large overcharges")
Signed-off-by: Johannes Weiner <hannes(a)cmpxchg.org>
Reported-by: Breno Leitao <leitao(a)debian.org>
Reported-by: Josef Bacik <josef(a)toxicpanda.com>
Cc: Shakeel Butt <shakeelb(a)google.com>
Cc: Michal Hocko <mhocko(a)suse.com>
Cc: Roman Gushchin <roman.gushchin(a)linux.dev>
Cc: Muchun Song <songmuchun(a)bytedance.com>
Cc: <stable(a)vger.kernel.org> [5.17+]
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
include/linux/memcontrol.h | 4 ++--
include/linux/resume_user_mode.h | 2 +-
mm/memcontrol.c | 6 +++---
3 files changed, 6 insertions(+), 6 deletions(-)
--- a/include/linux/memcontrol.h~mm-memcontrol-fix-gfp_nofs-recursion-in-memoryhigh-enforcement
+++ a/include/linux/memcontrol.h
@@ -920,7 +920,7 @@ unsigned long mem_cgroup_get_zone_lru_si
return READ_ONCE(mz->lru_zone_size[zone_idx][lru]);
}
-void mem_cgroup_handle_over_high(void);
+void mem_cgroup_handle_over_high(gfp_t gfp_mask);
unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg);
@@ -1458,7 +1458,7 @@ static inline void mem_cgroup_unlock_pag
rcu_read_unlock();
}
-static inline void mem_cgroup_handle_over_high(void)
+static inline void mem_cgroup_handle_over_high(gfp_t gfp_mask)
{
}
--- a/include/linux/resume_user_mode.h~mm-memcontrol-fix-gfp_nofs-recursion-in-memoryhigh-enforcement
+++ a/include/linux/resume_user_mode.h
@@ -55,7 +55,7 @@ static inline void resume_user_mode_work
}
#endif
- mem_cgroup_handle_over_high();
+ mem_cgroup_handle_over_high(GFP_KERNEL);
blkcg_maybe_throttle_current();
rseq_handle_notify_resume(NULL, regs);
--- a/mm/memcontrol.c~mm-memcontrol-fix-gfp_nofs-recursion-in-memoryhigh-enforcement
+++ a/mm/memcontrol.c
@@ -2555,7 +2555,7 @@ static unsigned long calculate_high_dela
* Scheduled by try_charge() to be executed from the userland return path
* and reclaims memory over the high limit.
*/
-void mem_cgroup_handle_over_high(void)
+void mem_cgroup_handle_over_high(gfp_t gfp_mask)
{
unsigned long penalty_jiffies;
unsigned long pflags;
@@ -2583,7 +2583,7 @@ retry_reclaim:
*/
nr_reclaimed = reclaim_high(memcg,
in_retry ? SWAP_CLUSTER_MAX : nr_pages,
- GFP_KERNEL);
+ gfp_mask);
/*
* memory.high is breached and reclaim is unable to keep up. Throttle
@@ -2819,7 +2819,7 @@ done_restock:
if (current->memcg_nr_pages_over_high > MEMCG_CHARGE_BATCH &&
!(current->flags & PF_MEMALLOC) &&
gfpflags_allow_blocking(gfp_mask)) {
- mem_cgroup_handle_over_high();
+ mem_cgroup_handle_over_high(gfp_mask);
}
return 0;
}
_
Patches currently in -mm which might be from hannes(a)cmpxchg.org are
mm-page_alloc-fix-cma-and-highatomic-landing-on-the-wrong-buddy-list.patch
mm-memcontrol-fix-gfp_nofs-recursion-in-memoryhigh-enforcement.patch
mm-page_alloc-remove-pcppage-migratetype-caching.patch
mm-page_alloc-fix-up-block-types-when-merging-compatible-blocks.patch
mm-page_alloc-move-free-pages-when-converting-block-during-isolation.patch
mm-page_alloc-fix-move_freepages_block-range-error.patch
mm-page_alloc-fix-freelist-movement-during-block-conversion.patch
mm-page_alloc-consolidate-free-page-accounting.patch