The quilt patch titled
Subject: nilfs2: fix incorrect inode allocation from reserved inodes
has been removed from the -mm tree. Its filename was
nilfs2-fix-incorrect-inode-allocation-from-reserved-inodes.patch
This patch was dropped because it was merged into the mm-hotfixes-stable branch
of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
------------------------------------------------------
From: Ryusuke Konishi <konishi.ryusuke(a)gmail.com>
Subject: nilfs2: fix incorrect inode allocation from reserved inodes
Date: Sun, 23 Jun 2024 14:11:35 +0900
If the bitmap block that manages the inode allocation status is corrupted,
nilfs_ifile_create_inode() may allocate a new inode from the reserved
inode area where it should not be allocated.
Previous fix commit d325dc6eb763 ("nilfs2: fix use-after-free bug of
struct nilfs_root"), fixed the problem that reserved inodes with inode
numbers less than NILFS_USER_INO (=11) were incorrectly reallocated due to
bitmap corruption, but since the start number of non-reserved inodes is
read from the super block and may change, in which case inode allocation
may occur from the extended reserved inode area.
If that happens, access to that inode will cause an IO error, causing the
file system to degrade to an error state.
Fix this potential issue by adding a wraparound option to the common
metadata object allocation routine and by modifying
nilfs_ifile_create_inode() to disable the option so that it only allocates
inodes with inode numbers greater than or equal to the inode number read
in "nilfs->ns_first_ino", regardless of the bitmap status of reserved
inodes.
Link: https://lkml.kernel.org/r/20240623051135.4180-4-konishi.ryusuke@gmail.com
Signed-off-by: Ryusuke Konishi <konishi.ryusuke(a)gmail.com>
Cc: Hillf Danton <hdanton(a)sina.com>
Cc: Jan Kara <jack(a)suse.cz>
Cc: Matthew Wilcox (Oracle) <willy(a)infradead.org>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
fs/nilfs2/alloc.c | 19 +++++++++++++++----
fs/nilfs2/alloc.h | 4 ++--
fs/nilfs2/dat.c | 2 +-
fs/nilfs2/ifile.c | 7 ++-----
4 files changed, 20 insertions(+), 12 deletions(-)
--- a/fs/nilfs2/alloc.c~nilfs2-fix-incorrect-inode-allocation-from-reserved-inodes
+++ a/fs/nilfs2/alloc.c
@@ -377,11 +377,12 @@ void *nilfs_palloc_block_get_entry(const
* @target: offset number of an entry in the group (start point)
* @bsize: size in bits
* @lock: spin lock protecting @bitmap
+ * @wrap: whether to wrap around
*/
static int nilfs_palloc_find_available_slot(unsigned char *bitmap,
unsigned long target,
unsigned int bsize,
- spinlock_t *lock)
+ spinlock_t *lock, bool wrap)
{
int pos, end = bsize;
@@ -397,6 +398,8 @@ static int nilfs_palloc_find_available_s
end = target;
}
+ if (!wrap)
+ return -ENOSPC;
/* wrap around */
for (pos = 0; pos < end; pos++) {
@@ -495,9 +498,10 @@ int nilfs_palloc_count_max_entries(struc
* nilfs_palloc_prepare_alloc_entry - prepare to allocate a persistent object
* @inode: inode of metadata file using this allocator
* @req: nilfs_palloc_req structure exchanged for the allocation
+ * @wrap: whether to wrap around
*/
int nilfs_palloc_prepare_alloc_entry(struct inode *inode,
- struct nilfs_palloc_req *req)
+ struct nilfs_palloc_req *req, bool wrap)
{
struct buffer_head *desc_bh, *bitmap_bh;
struct nilfs_palloc_group_desc *desc;
@@ -516,7 +520,7 @@ int nilfs_palloc_prepare_alloc_entry(str
entries_per_group = nilfs_palloc_entries_per_group(inode);
for (i = 0; i < ngroups; i += n) {
- if (group >= ngroups) {
+ if (group >= ngroups && wrap) {
/* wrap around */
group = 0;
maxgroup = nilfs_palloc_group(inode, req->pr_entry_nr,
@@ -550,7 +554,14 @@ int nilfs_palloc_prepare_alloc_entry(str
bitmap_kaddr = kmap_local_page(bitmap_bh->b_page);
bitmap = bitmap_kaddr + bh_offset(bitmap_bh);
pos = nilfs_palloc_find_available_slot(
- bitmap, group_offset, entries_per_group, lock);
+ bitmap, group_offset, entries_per_group, lock,
+ wrap);
+ /*
+ * Since the search for a free slot in the second and
+ * subsequent bitmap blocks always starts from the
+ * beginning, the wrap flag only has an effect on the
+ * first search.
+ */
kunmap_local(bitmap_kaddr);
if (pos >= 0)
goto found;
--- a/fs/nilfs2/alloc.h~nilfs2-fix-incorrect-inode-allocation-from-reserved-inodes
+++ a/fs/nilfs2/alloc.h
@@ -50,8 +50,8 @@ struct nilfs_palloc_req {
struct buffer_head *pr_entry_bh;
};
-int nilfs_palloc_prepare_alloc_entry(struct inode *,
- struct nilfs_palloc_req *);
+int nilfs_palloc_prepare_alloc_entry(struct inode *inode,
+ struct nilfs_palloc_req *req, bool wrap);
void nilfs_palloc_commit_alloc_entry(struct inode *,
struct nilfs_palloc_req *);
void nilfs_palloc_abort_alloc_entry(struct inode *, struct nilfs_palloc_req *);
--- a/fs/nilfs2/dat.c~nilfs2-fix-incorrect-inode-allocation-from-reserved-inodes
+++ a/fs/nilfs2/dat.c
@@ -75,7 +75,7 @@ int nilfs_dat_prepare_alloc(struct inode
{
int ret;
- ret = nilfs_palloc_prepare_alloc_entry(dat, req);
+ ret = nilfs_palloc_prepare_alloc_entry(dat, req, true);
if (ret < 0)
return ret;
--- a/fs/nilfs2/ifile.c~nilfs2-fix-incorrect-inode-allocation-from-reserved-inodes
+++ a/fs/nilfs2/ifile.c
@@ -56,13 +56,10 @@ int nilfs_ifile_create_inode(struct inod
struct nilfs_palloc_req req;
int ret;
- req.pr_entry_nr = 0; /*
- * 0 says find free inode from beginning
- * of a group. dull code!!
- */
+ req.pr_entry_nr = NILFS_FIRST_INO(ifile->i_sb);
req.pr_entry_bh = NULL;
- ret = nilfs_palloc_prepare_alloc_entry(ifile, &req);
+ ret = nilfs_palloc_prepare_alloc_entry(ifile, &req, false);
if (!ret) {
ret = nilfs_palloc_get_entry_block(ifile, req.pr_entry_nr, 1,
&req.pr_entry_bh);
_
Patches currently in -mm which might be from konishi.ryusuke(a)gmail.com are
The quilt patch titled
Subject: nilfs2: add missing check for inode numbers on directory entries
has been removed from the -mm tree. Its filename was
nilfs2-add-missing-check-for-inode-numbers-on-directory-entries.patch
This patch was dropped because it was merged into the mm-hotfixes-stable branch
of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
------------------------------------------------------
From: Ryusuke Konishi <konishi.ryusuke(a)gmail.com>
Subject: nilfs2: add missing check for inode numbers on directory entries
Date: Sun, 23 Jun 2024 14:11:34 +0900
Syzbot reported that mounting and unmounting a specific pattern of
corrupted nilfs2 filesystem images causes a use-after-free of metadata
file inodes, which triggers a kernel bug in lru_add_fn().
As Jan Kara pointed out, this is because the link count of a metadata file
gets corrupted to 0, and nilfs_evict_inode(), which is called from iput(),
tries to delete that inode (ifile inode in this case).
The inconsistency occurs because directories containing the inode numbers
of these metadata files that should not be visible in the namespace are
read without checking.
Fix this issue by treating the inode numbers of these internal files as
errors in the sanity check helper when reading directory folios/pages.
Also thanks to Hillf Danton and Matthew Wilcox for their initial mm-layer
analysis.
Link: https://lkml.kernel.org/r/20240623051135.4180-3-konishi.ryusuke@gmail.com
Signed-off-by: Ryusuke Konishi <konishi.ryusuke(a)gmail.com>
Reported-by: syzbot+d79afb004be235636ee8(a)syzkaller.appspotmail.com
Closes: https://syzkaller.appspot.com/bug?extid=d79afb004be235636ee8
Reported-by: Jan Kara <jack(a)suse.cz>
Closes: https://lkml.kernel.org/r/20240617075758.wewhukbrjod5fp5o@quack3
Tested-by: Ryusuke Konishi <konishi.ryusuke(a)gmail.com>
Cc: Hillf Danton <hdanton(a)sina.com>
Cc: Matthew Wilcox (Oracle) <willy(a)infradead.org>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
fs/nilfs2/dir.c | 6 ++++++
fs/nilfs2/nilfs.h | 5 +++++
2 files changed, 11 insertions(+)
--- a/fs/nilfs2/dir.c~nilfs2-add-missing-check-for-inode-numbers-on-directory-entries
+++ a/fs/nilfs2/dir.c
@@ -135,6 +135,9 @@ static bool nilfs_check_folio(struct fol
goto Enamelen;
if (((offs + rec_len - 1) ^ offs) & ~(chunk_size-1))
goto Espan;
+ if (unlikely(p->inode &&
+ NILFS_PRIVATE_INODE(le64_to_cpu(p->inode))))
+ goto Einumber;
}
if (offs != limit)
goto Eend;
@@ -160,6 +163,9 @@ Enamelen:
goto bad_entry;
Espan:
error = "directory entry across blocks";
+ goto bad_entry;
+Einumber:
+ error = "disallowed inode number";
bad_entry:
nilfs_error(sb,
"bad entry in directory #%lu: %s - offset=%lu, inode=%lu, rec_len=%zd, name_len=%d",
--- a/fs/nilfs2/nilfs.h~nilfs2-add-missing-check-for-inode-numbers-on-directory-entries
+++ a/fs/nilfs2/nilfs.h
@@ -121,6 +121,11 @@ enum {
((ino) >= NILFS_FIRST_INO(sb) || \
((ino) < NILFS_USER_INO && (NILFS_SYS_INO_BITS & BIT(ino))))
+#define NILFS_PRIVATE_INODE(ino) ({ \
+ ino_t __ino = (ino); \
+ ((__ino) < NILFS_USER_INO && (__ino) != NILFS_ROOT_INO && \
+ (__ino) != NILFS_SKETCH_INO); })
+
/**
* struct nilfs_transaction_info: context information for synchronization
* @ti_magic: Magic number
_
Patches currently in -mm which might be from konishi.ryusuke(a)gmail.com are
The quilt patch titled
Subject: nilfs2: fix inode number range checks
has been removed from the -mm tree. Its filename was
nilfs2-fix-inode-number-range-checks.patch
This patch was dropped because it was merged into the mm-hotfixes-stable branch
of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
------------------------------------------------------
From: Ryusuke Konishi <konishi.ryusuke(a)gmail.com>
Subject: nilfs2: fix inode number range checks
Date: Sun, 23 Jun 2024 14:11:33 +0900
Patch series "nilfs2: fix potential issues related to reserved inodes".
This series fixes one use-after-free issue reported by syzbot, caused by
nilfs2's internal inode being exposed in the namespace on a corrupted
filesystem, and a couple of flaws that cause problems if the starting
number of non-reserved inodes written in the on-disk super block is
intentionally (or corruptly) changed from its default value.
This patch (of 3):
In the current implementation of nilfs2, "nilfs->ns_first_ino", which
gives the first non-reserved inode number, is read from the superblock,
but its lower limit is not checked.
As a result, if a number that overlaps with the inode number range of
reserved inodes such as the root directory or metadata files is set in the
super block parameter, the inode number test macros (NILFS_MDT_INODE and
NILFS_VALID_INODE) will not function properly.
In addition, these test macros use left bit-shift calculations using with
the inode number as the shift count via the BIT macro, but the result of a
shift calculation that exceeds the bit width of an integer is undefined in
the C specification, so if "ns_first_ino" is set to a large value other
than the default value NILFS_USER_INO (=11), the macros may potentially
malfunction depending on the environment.
Fix these issues by checking the lower bound of "nilfs->ns_first_ino" and
by preventing bit shifts equal to or greater than the NILFS_USER_INO
constant in the inode number test macros.
Also, change the type of "ns_first_ino" from signed integer to unsigned
integer to avoid the need for type casting in comparisons such as the
lower bound check introduced this time.
Link: https://lkml.kernel.org/r/20240623051135.4180-1-konishi.ryusuke@gmail.com
Link: https://lkml.kernel.org/r/20240623051135.4180-2-konishi.ryusuke@gmail.com
Signed-off-by: Ryusuke Konishi <konishi.ryusuke(a)gmail.com>
Cc: Hillf Danton <hdanton(a)sina.com>
Cc: Jan Kara <jack(a)suse.cz>
Cc: Matthew Wilcox (Oracle) <willy(a)infradead.org>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
fs/nilfs2/nilfs.h | 5 +++--
fs/nilfs2/the_nilfs.c | 6 ++++++
fs/nilfs2/the_nilfs.h | 2 +-
3 files changed, 10 insertions(+), 3 deletions(-)
--- a/fs/nilfs2/nilfs.h~nilfs2-fix-inode-number-range-checks
+++ a/fs/nilfs2/nilfs.h
@@ -116,9 +116,10 @@ enum {
#define NILFS_FIRST_INO(sb) (((struct the_nilfs *)sb->s_fs_info)->ns_first_ino)
#define NILFS_MDT_INODE(sb, ino) \
- ((ino) < NILFS_FIRST_INO(sb) && (NILFS_MDT_INO_BITS & BIT(ino)))
+ ((ino) < NILFS_USER_INO && (NILFS_MDT_INO_BITS & BIT(ino)))
#define NILFS_VALID_INODE(sb, ino) \
- ((ino) >= NILFS_FIRST_INO(sb) || (NILFS_SYS_INO_BITS & BIT(ino)))
+ ((ino) >= NILFS_FIRST_INO(sb) || \
+ ((ino) < NILFS_USER_INO && (NILFS_SYS_INO_BITS & BIT(ino))))
/**
* struct nilfs_transaction_info: context information for synchronization
--- a/fs/nilfs2/the_nilfs.c~nilfs2-fix-inode-number-range-checks
+++ a/fs/nilfs2/the_nilfs.c
@@ -452,6 +452,12 @@ static int nilfs_store_disk_layout(struc
}
nilfs->ns_first_ino = le32_to_cpu(sbp->s_first_ino);
+ if (nilfs->ns_first_ino < NILFS_USER_INO) {
+ nilfs_err(nilfs->ns_sb,
+ "too small lower limit for non-reserved inode numbers: %u",
+ nilfs->ns_first_ino);
+ return -EINVAL;
+ }
nilfs->ns_blocks_per_segment = le32_to_cpu(sbp->s_blocks_per_segment);
if (nilfs->ns_blocks_per_segment < NILFS_SEG_MIN_BLOCKS) {
--- a/fs/nilfs2/the_nilfs.h~nilfs2-fix-inode-number-range-checks
+++ a/fs/nilfs2/the_nilfs.h
@@ -182,7 +182,7 @@ struct the_nilfs {
unsigned long ns_nrsvsegs;
unsigned long ns_first_data_block;
int ns_inode_size;
- int ns_first_ino;
+ unsigned int ns_first_ino;
u32 ns_crc_seed;
/* /sys/fs/<nilfs>/<device> */
_
Patches currently in -mm which might be from konishi.ryusuke(a)gmail.com are
The quilt patch titled
Subject: mm: avoid overflows in dirty throttling logic
has been removed from the -mm tree. Its filename was
mm-avoid-overflows-in-dirty-throttling-logic.patch
This patch was dropped because it was merged into the mm-hotfixes-stable branch
of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
------------------------------------------------------
From: Jan Kara <jack(a)suse.cz>
Subject: mm: avoid overflows in dirty throttling logic
Date: Fri, 21 Jun 2024 16:42:38 +0200
The dirty throttling logic is interspersed with assumptions that dirty
limits in PAGE_SIZE units fit into 32-bit (so that various multiplications
fit into 64-bits). If limits end up being larger, we will hit overflows,
possible divisions by 0 etc. Fix these problems by never allowing so
large dirty limits as they have dubious practical value anyway. For
dirty_bytes / dirty_background_bytes interfaces we can just refuse to set
so large limits. For dirty_ratio / dirty_background_ratio it isn't so
simple as the dirty limit is computed from the amount of available memory
which can change due to memory hotplug etc. So when converting dirty
limits from ratios to numbers of pages, we just don't allow the result to
exceed UINT_MAX.
This is root-only triggerable problem which occurs when the operator
sets dirty limits to >16 TB.
Link: https://lkml.kernel.org/r/20240621144246.11148-2-jack@suse.cz
Signed-off-by: Jan Kara <jack(a)suse.cz>
Reported-by: Zach O'Keefe <zokeefe(a)google.com>
Reviewed-By: Zach O'Keefe <zokeefe(a)google.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/page-writeback.c | 30 ++++++++++++++++++++++++++----
1 file changed, 26 insertions(+), 4 deletions(-)
--- a/mm/page-writeback.c~mm-avoid-overflows-in-dirty-throttling-logic
+++ a/mm/page-writeback.c
@@ -415,13 +415,20 @@ static void domain_dirty_limits(struct d
else
bg_thresh = (bg_ratio * available_memory) / PAGE_SIZE;
- if (bg_thresh >= thresh)
- bg_thresh = thresh / 2;
tsk = current;
if (rt_task(tsk)) {
bg_thresh += bg_thresh / 4 + global_wb_domain.dirty_limit / 32;
thresh += thresh / 4 + global_wb_domain.dirty_limit / 32;
}
+ /*
+ * Dirty throttling logic assumes the limits in page units fit into
+ * 32-bits. This gives 16TB dirty limits max which is hopefully enough.
+ */
+ if (thresh > UINT_MAX)
+ thresh = UINT_MAX;
+ /* This makes sure bg_thresh is within 32-bits as well */
+ if (bg_thresh >= thresh)
+ bg_thresh = thresh / 2;
dtc->thresh = thresh;
dtc->bg_thresh = bg_thresh;
@@ -471,7 +478,11 @@ static unsigned long node_dirty_limit(st
if (rt_task(tsk))
dirty += dirty / 4;
- return dirty;
+ /*
+ * Dirty throttling logic assumes the limits in page units fit into
+ * 32-bits. This gives 16TB dirty limits max which is hopefully enough.
+ */
+ return min_t(unsigned long, dirty, UINT_MAX);
}
/**
@@ -508,10 +519,17 @@ static int dirty_background_bytes_handle
void *buffer, size_t *lenp, loff_t *ppos)
{
int ret;
+ unsigned long old_bytes = dirty_background_bytes;
ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
- if (ret == 0 && write)
+ if (ret == 0 && write) {
+ if (DIV_ROUND_UP(dirty_background_bytes, PAGE_SIZE) >
+ UINT_MAX) {
+ dirty_background_bytes = old_bytes;
+ return -ERANGE;
+ }
dirty_background_ratio = 0;
+ }
return ret;
}
@@ -537,6 +555,10 @@ static int dirty_bytes_handler(struct ct
ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
if (ret == 0 && write && vm_dirty_bytes != old_bytes) {
+ if (DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE) > UINT_MAX) {
+ vm_dirty_bytes = old_bytes;
+ return -ERANGE;
+ }
writeback_set_ratelimit();
vm_dirty_ratio = 0;
}
_
Patches currently in -mm which might be from jack(a)suse.cz are
readahead-make-sure-sync-readahead-reads-needed-page.patch
filemap-fix-page_cache_next_miss-when-no-hole-found.patch
readahead-properly-shorten-readahead-when-falling-back-to-do_page_cache_ra.patch
readahead-drop-pointless-index-from-force_page_cache_ra.patch
readahead-drop-index-argument-of-page_cache_async_readahead.patch
readahead-drop-dead-code-in-page_cache_ra_order.patch
readahead-drop-dead-code-in-ondemand_readahead.patch
readahead-disentangle-async-and-sync-readahead.patch
readahead-fold-try_context_readahead-into-its-single-caller.patch
readahead-simplify-gotos-in-page_cache_sync_ra.patch
The quilt patch titled
Subject: Revert "mm/writeback: fix possible divide-by-zero in wb_dirty_limits(), again"
has been removed from the -mm tree. Its filename was
revert-mm-writeback-fix-possible-divide-by-zero-in-wb_dirty_limits-again.patch
This patch was dropped because it was merged into the mm-hotfixes-stable branch
of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
------------------------------------------------------
From: Jan Kara <jack(a)suse.cz>
Subject: Revert "mm/writeback: fix possible divide-by-zero in wb_dirty_limits(), again"
Date: Fri, 21 Jun 2024 16:42:37 +0200
Patch series "mm: Avoid possible overflows in dirty throttling".
Dirty throttling logic assumes dirty limits in page units fit into
32-bits. This patch series makes sure this is true (see patch 2/2 for
more details).
This patch (of 2):
This reverts commit 9319b647902cbd5cc884ac08a8a6d54ce111fc78.
The commit is broken in several ways. Firstly, the removed (u64) cast
from the multiplication will introduce a multiplication overflow on 32-bit
archs if wb_thresh * bg_thresh >= 1<<32 (which is actually common - the
default settings with 4GB of RAM will trigger this). Secondly, the
div64_u64() is unnecessarily expensive on 32-bit archs. We have
div64_ul() in case we want to be safe & cheap. Thirdly, if dirty
thresholds are larger than 1<<32 pages, then dirty balancing is going to
blow up in many other spectacular ways anyway so trying to fix one
possible overflow is just moot.
Link: https://lkml.kernel.org/r/20240621144017.30993-1-jack@suse.cz
Link: https://lkml.kernel.org/r/20240621144246.11148-1-jack@suse.cz
Fixes: 9319b647902c ("mm/writeback: fix possible divide-by-zero in wb_dirty_limits(), again")
Signed-off-by: Jan Kara <jack(a)suse.cz>
Reviewed-By: Zach O'Keefe <zokeefe(a)google.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/page-writeback.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
--- a/mm/page-writeback.c~revert-mm-writeback-fix-possible-divide-by-zero-in-wb_dirty_limits-again
+++ a/mm/page-writeback.c
@@ -1660,7 +1660,7 @@ static inline void wb_dirty_limits(struc
*/
dtc->wb_thresh = __wb_calc_thresh(dtc, dtc->thresh);
dtc->wb_bg_thresh = dtc->thresh ?
- div64_u64(dtc->wb_thresh * dtc->bg_thresh, dtc->thresh) : 0;
+ div_u64((u64)dtc->wb_thresh * dtc->bg_thresh, dtc->thresh) : 0;
/*
* In order to avoid the stacked BDI deadlock we need
_
Patches currently in -mm which might be from jack(a)suse.cz are
readahead-make-sure-sync-readahead-reads-needed-page.patch
filemap-fix-page_cache_next_miss-when-no-hole-found.patch
readahead-properly-shorten-readahead-when-falling-back-to-do_page_cache_ra.patch
readahead-drop-pointless-index-from-force_page_cache_ra.patch
readahead-drop-index-argument-of-page_cache_async_readahead.patch
readahead-drop-dead-code-in-page_cache_ra_order.patch
readahead-drop-dead-code-in-ondemand_readahead.patch
readahead-disentangle-async-and-sync-readahead.patch
readahead-fold-try_context_readahead-into-its-single-caller.patch
readahead-simplify-gotos-in-page_cache_sync_ra.patch
The quilt patch titled
Subject: mm: optimize the redundant loop of mm_update_owner_next()
has been removed from the -mm tree. Its filename was
mm-optimize-the-redundant-loop-of-mm_update_owner_next.patch
This patch was dropped because it was merged into the mm-hotfixes-stable branch
of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
------------------------------------------------------
From: Jinliang Zheng <alexjlzheng(a)tencent.com>
Subject: mm: optimize the redundant loop of mm_update_owner_next()
Date: Thu, 20 Jun 2024 20:21:24 +0800
When mm_update_owner_next() is racing with swapoff (try_to_unuse()) or
/proc or ptrace or page migration (get_task_mm()), it is impossible to
find an appropriate task_struct in the loop whose mm_struct is the same as
the target mm_struct.
If the above race condition is combined with the stress-ng-zombie and
stress-ng-dup tests, such a long loop can easily cause a Hard Lockup in
write_lock_irq() for tasklist_lock.
Recognize this situation in advance and exit early.
Link: https://lkml.kernel.org/r/20240620122123.3877432-1-alexjlzheng@tencent.com
Signed-off-by: Jinliang Zheng <alexjlzheng(a)tencent.com>
Acked-by: Michal Hocko <mhocko(a)suse.com>
Cc: Christian Brauner <brauner(a)kernel.org>
Cc: Jens Axboe <axboe(a)kernel.dk>
Cc: Mateusz Guzik <mjguzik(a)gmail.com>
Cc: Matthew Wilcox (Oracle) <willy(a)infradead.org>
Cc: Oleg Nesterov <oleg(a)redhat.com>
Cc: Tycho Andersen <tandersen(a)netflix.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
kernel/exit.c | 2 ++
1 file changed, 2 insertions(+)
--- a/kernel/exit.c~mm-optimize-the-redundant-loop-of-mm_update_owner_next
+++ a/kernel/exit.c
@@ -484,6 +484,8 @@ retry:
* Search through everything else, we should not get here often.
*/
for_each_process(g) {
+ if (atomic_read(&mm->mm_users) <= 1)
+ break;
if (g->flags & PF_KTHREAD)
continue;
for_each_thread(g, c) {
_
Patches currently in -mm which might be from alexjlzheng(a)tencent.com are
The quilt patch titled
Subject: mm/gup: clear the LRU flag of a page before adding to LRU batch
has been removed from the -mm tree. Its filename was
mm-gup-clear-the-lru-flag-of-a-page-before-adding-to-lru-batch.patch
This patch was dropped because an alternative patch was or shall be merged
------------------------------------------------------
From: yangge <yangge1116(a)126.com>
Subject: mm/gup: clear the LRU flag of a page before adding to LRU batch
Date: Sat, 22 Jun 2024 14:48:04 +0800
If a large number of CMA memory are configured in system (for example, the
CMA memory accounts for 50% of the system memory), starting a virtual
machine, it will call pin_user_pages_remote(..., FOLL_LONGTERM, ...) to
pin memory. Normally if a page is present and in CMA area,
pin_user_pages_remote() will migrate the page from CMA area to non-CMA
area because of FOLL_LONGTERM flag. But the current code will cause the
migration failure due to unexpected page refcounts, and eventually cause
the virtual machine fail to start.
If a page is added in LRU batch, its refcount increases one, remove the
page from LRU batch decreases one. Page migration requires the page is
not referenced by others except page mapping. Before migrating a page, we
should try to drain the page from LRU batch in case the page is in it,
however, folio_test_lru() is not sufficient to tell whether the page is in
LRU batch or not, if the page is in LRU batch, the migration will fail.
To solve the problem above, we modify the logic of adding to LRU batch.
Before adding a page to LRU batch, we clear the LRU flag of the page so
that we can check whether the page is in LRU batch by
folio_test_lru(page). Seems making the LRU flag of the page invisible a
long time is no problem, because a new page is allocated from buddy and
added to the lru batch, its LRU flag is also not visible for a long time.
Link: https://lkml.kernel.org/r/1719038884-1903-1-git-send-email-yangge1116@126.c…
Signed-off-by: yangge <yangge1116(a)126.com>
Cc: Baolin Wang <baolin.wang(a)linux.alibaba.com>
Cc: David Hildenbrand <david(a)redhat.com>
Cc: Alex Shi <alex.shi(a)linux.alibaba.com>
Cc: Shaohua Li <shli(a)kernel.org>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/swap.c | 43 +++++++++++++++++++++++++++++++------------
1 file changed, 31 insertions(+), 12 deletions(-)
--- a/mm/swap.c~mm-gup-clear-the-lru-flag-of-a-page-before-adding-to-lru-batch
+++ a/mm/swap.c
@@ -212,10 +212,6 @@ static void folio_batch_move_lru(struct
for (i = 0; i < folio_batch_count(fbatch); i++) {
struct folio *folio = fbatch->folios[i];
- /* block memcg migration while the folio moves between lru */
- if (move_fn != lru_add_fn && !folio_test_clear_lru(folio))
- continue;
-
folio_lruvec_relock_irqsave(folio, &lruvec, &flags);
move_fn(lruvec, folio);
@@ -256,11 +252,16 @@ static void lru_move_tail_fn(struct lruv
void folio_rotate_reclaimable(struct folio *folio)
{
if (!folio_test_locked(folio) && !folio_test_dirty(folio) &&
- !folio_test_unevictable(folio) && folio_test_lru(folio)) {
+ !folio_test_unevictable(folio)) {
struct folio_batch *fbatch;
unsigned long flags;
folio_get(folio);
+ if (!folio_test_clear_lru(folio)) {
+ folio_put(folio);
+ return;
+ }
+
local_lock_irqsave(&lru_rotate.lock, flags);
fbatch = this_cpu_ptr(&lru_rotate.fbatch);
folio_batch_add_and_move(fbatch, folio, lru_move_tail_fn);
@@ -353,11 +354,15 @@ static void folio_activate_drain(int cpu
void folio_activate(struct folio *folio)
{
- if (folio_test_lru(folio) && !folio_test_active(folio) &&
- !folio_test_unevictable(folio)) {
+ if (!folio_test_active(folio) && !folio_test_unevictable(folio)) {
struct folio_batch *fbatch;
folio_get(folio);
+ if (!folio_test_clear_lru(folio)) {
+ folio_put(folio);
+ return;
+ }
+
local_lock(&cpu_fbatches.lock);
fbatch = this_cpu_ptr(&cpu_fbatches.activate);
folio_batch_add_and_move(fbatch, folio, folio_activate_fn);
@@ -701,6 +706,11 @@ void deactivate_file_folio(struct folio
return;
folio_get(folio);
+ if (!folio_test_clear_lru(folio)) {
+ folio_put(folio);
+ return;
+ }
+
local_lock(&cpu_fbatches.lock);
fbatch = this_cpu_ptr(&cpu_fbatches.lru_deactivate_file);
folio_batch_add_and_move(fbatch, folio, lru_deactivate_file_fn);
@@ -717,11 +727,16 @@ void deactivate_file_folio(struct folio
*/
void folio_deactivate(struct folio *folio)
{
- if (folio_test_lru(folio) && !folio_test_unevictable(folio) &&
- (folio_test_active(folio) || lru_gen_enabled())) {
+ if (!folio_test_unevictable(folio) && (folio_test_active(folio) ||
+ lru_gen_enabled())) {
struct folio_batch *fbatch;
folio_get(folio);
+ if (!folio_test_clear_lru(folio)) {
+ folio_put(folio);
+ return;
+ }
+
local_lock(&cpu_fbatches.lock);
fbatch = this_cpu_ptr(&cpu_fbatches.lru_deactivate);
folio_batch_add_and_move(fbatch, folio, lru_deactivate_fn);
@@ -738,12 +753,16 @@ void folio_deactivate(struct folio *foli
*/
void folio_mark_lazyfree(struct folio *folio)
{
- if (folio_test_lru(folio) && folio_test_anon(folio) &&
- folio_test_swapbacked(folio) && !folio_test_swapcache(folio) &&
- !folio_test_unevictable(folio)) {
+ if (folio_test_anon(folio) && folio_test_swapbacked(folio) &&
+ !folio_test_swapcache(folio) && !folio_test_unevictable(folio)) {
struct folio_batch *fbatch;
folio_get(folio);
+ if (!folio_test_clear_lru(folio)) {
+ folio_put(folio);
+ return;
+ }
+
local_lock(&cpu_fbatches.lock);
fbatch = this_cpu_ptr(&cpu_fbatches.lru_lazyfree);
folio_batch_add_and_move(fbatch, folio, lru_lazyfree_fn);
_
Patches currently in -mm which might be from yangge1116(a)126.com are
The try_grab_folio() is supposed to be used in fast path and it elevates
folio refcount by using add ref unless zero. We are guaranteed to have
at least one stable reference in slow path, so the simple atomic add
could be used. The performance difference should be trivial, but the
misuse may be confusing and misleading.
In another thread [1] a kernel warning was reported when pinning folio
in CMA memory when launching SEV virtual machine. The splat looks like:
[ 464.325306] WARNING: CPU: 13 PID: 6734 at mm/gup.c:1313 __get_user_pages+0x423/0x520
[ 464.325464] CPU: 13 PID: 6734 Comm: qemu-kvm Kdump: loaded Not tainted 6.6.33+ #6
[ 464.325477] RIP: 0010:__get_user_pages+0x423/0x520
[ 464.325515] Call Trace:
[ 464.325520] <TASK>
[ 464.325523] ? __get_user_pages+0x423/0x520
[ 464.325528] ? __warn+0x81/0x130
[ 464.325536] ? __get_user_pages+0x423/0x520
[ 464.325541] ? report_bug+0x171/0x1a0
[ 464.325549] ? handle_bug+0x3c/0x70
[ 464.325554] ? exc_invalid_op+0x17/0x70
[ 464.325558] ? asm_exc_invalid_op+0x1a/0x20
[ 464.325567] ? __get_user_pages+0x423/0x520
[ 464.325575] __gup_longterm_locked+0x212/0x7a0
[ 464.325583] internal_get_user_pages_fast+0xfb/0x190
[ 464.325590] pin_user_pages_fast+0x47/0x60
[ 464.325598] sev_pin_memory+0xca/0x170 [kvm_amd]
[ 464.325616] sev_mem_enc_register_region+0x81/0x130 [kvm_amd]
Per the analysis done by yangge, when starting the SEV virtual machine,
it will call pin_user_pages_fast(..., FOLL_LONGTERM, ...) to pin the
memory. But the page is in CMA area, so fast GUP will fail then
fallback to the slow path due to the longterm pinnalbe check in
try_grab_folio().
The slow path will try to pin the pages then migrate them out of CMA
area. But the slow path also uses try_grab_folio() to pin the page,
it will also fail due to the same check then the above warning
is triggered.
[1] https://lore.kernel.org/linux-mm/1719478388-31917-1-git-send-email-yangge11…
Fixes: 57edfcfd3419 ("mm/gup: accelerate thp gup even for "pages != NULL"")
Cc: <stable(a)vger.kernel.org> [6.6+]
Reported-by: yangge <yangge1116(a)126.com>
Signed-off-by: Yang Shi <yang(a)os.amperecomputing.com>
---
mm/gup.c | 278 +++++++++++++++++++++++++----------------------
mm/huge_memory.c | 2 +-
mm/internal.h | 3 +-
3 files changed, 148 insertions(+), 135 deletions(-)
v2:
1. Fixed the build warning
2. Reworked the commit log to include the bug report and analysis (reworded by me)
from yangge
3. Rebased onto the latest Linus's tree
diff --git a/mm/gup.c b/mm/gup.c
index ca0f5cedce9b..6be165224c1e 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -97,95 +97,6 @@ static inline struct folio *try_get_folio(struct page *page, int refs)
return folio;
}
-/**
- * try_grab_folio() - Attempt to get or pin a folio.
- * @page: pointer to page to be grabbed
- * @refs: the value to (effectively) add to the folio's refcount
- * @flags: gup flags: these are the FOLL_* flag values.
- *
- * "grab" names in this file mean, "look at flags to decide whether to use
- * FOLL_PIN or FOLL_GET behavior, when incrementing the folio's refcount.
- *
- * Either FOLL_PIN or FOLL_GET (or neither) must be set, but not both at the
- * same time. (That's true throughout the get_user_pages*() and
- * pin_user_pages*() APIs.) Cases:
- *
- * FOLL_GET: folio's refcount will be incremented by @refs.
- *
- * FOLL_PIN on large folios: folio's refcount will be incremented by
- * @refs, and its pincount will be incremented by @refs.
- *
- * FOLL_PIN on single-page folios: folio's refcount will be incremented by
- * @refs * GUP_PIN_COUNTING_BIAS.
- *
- * Return: The folio containing @page (with refcount appropriately
- * incremented) for success, or NULL upon failure. If neither FOLL_GET
- * nor FOLL_PIN was set, that's considered failure, and furthermore,
- * a likely bug in the caller, so a warning is also emitted.
- */
-struct folio *try_grab_folio(struct page *page, int refs, unsigned int flags)
-{
- struct folio *folio;
-
- if (WARN_ON_ONCE((flags & (FOLL_GET | FOLL_PIN)) == 0))
- return NULL;
-
- if (unlikely(!(flags & FOLL_PCI_P2PDMA) && is_pci_p2pdma_page(page)))
- return NULL;
-
- if (flags & FOLL_GET)
- return try_get_folio(page, refs);
-
- /* FOLL_PIN is set */
-
- /*
- * Don't take a pin on the zero page - it's not going anywhere
- * and it is used in a *lot* of places.
- */
- if (is_zero_page(page))
- return page_folio(page);
-
- folio = try_get_folio(page, refs);
- if (!folio)
- return NULL;
-
- /*
- * Can't do FOLL_LONGTERM + FOLL_PIN gup fast path if not in a
- * right zone, so fail and let the caller fall back to the slow
- * path.
- */
- if (unlikely((flags & FOLL_LONGTERM) &&
- !folio_is_longterm_pinnable(folio))) {
- if (!put_devmap_managed_folio_refs(folio, refs))
- folio_put_refs(folio, refs);
- return NULL;
- }
-
- /*
- * When pinning a large folio, use an exact count to track it.
- *
- * However, be sure to *also* increment the normal folio
- * refcount field at least once, so that the folio really
- * is pinned. That's why the refcount from the earlier
- * try_get_folio() is left intact.
- */
- if (folio_test_large(folio))
- atomic_add(refs, &folio->_pincount);
- else
- folio_ref_add(folio,
- refs * (GUP_PIN_COUNTING_BIAS - 1));
- /*
- * Adjust the pincount before re-checking the PTE for changes.
- * This is essentially a smp_mb() and is paired with a memory
- * barrier in folio_try_share_anon_rmap_*().
- */
- smp_mb__after_atomic();
-
- node_stat_mod_folio(folio, NR_FOLL_PIN_ACQUIRED, refs);
-
- return folio;
-}
-
static void gup_put_folio(struct folio *folio, int refs, unsigned int flags)
{
if (flags & FOLL_PIN) {
@@ -203,28 +114,31 @@ static void gup_put_folio(struct folio *folio, int refs, unsigned int flags)
}
/**
- * try_grab_page() - elevate a page's refcount by a flag-dependent amount
- * @page: pointer to page to be grabbed
- * @flags: gup flags: these are the FOLL_* flag values.
+ * try_grab_folio() - add a folio's refcount by a flag-dependent amount
+ * @folio: pointer to folio to be grabbed
+ * @refs: the value to (effectively) add to the folio's refcount
+ * @flags: gup flags: these are the FOLL_* flag values
*
* This might not do anything at all, depending on the flags argument.
*
* "grab" names in this file mean, "look at flags to decide whether to use
- * FOLL_PIN or FOLL_GET behavior, when incrementing the page's refcount.
+ * FOLL_PIN or FOLL_GET behavior, when incrementing the folio's refcount.
*
* Either FOLL_PIN or FOLL_GET (or neither) may be set, but not both at the same
- * time. Cases: please see the try_grab_folio() documentation, with
- * "refs=1".
+ * time.
*
* Return: 0 for success, or if no action was required (if neither FOLL_PIN
* nor FOLL_GET was set, nothing is done). A negative error code for failure:
*
- * -ENOMEM FOLL_GET or FOLL_PIN was set, but the page could not
+ * -ENOMEM FOLL_GET or FOLL_PIN was set, but the folio could not
* be grabbed.
+ *
+ * It is called when we have a stable reference for the folio, typically in
+ * GUP slow path.
*/
-int __must_check try_grab_page(struct page *page, unsigned int flags)
+int __must_check try_grab_folio(struct folio *folio, int refs, unsigned int flags)
{
- struct folio *folio = page_folio(page);
+ struct page *page = &folio->page;
if (WARN_ON_ONCE(folio_ref_count(folio) <= 0))
return -ENOMEM;
@@ -233,7 +147,7 @@ int __must_check try_grab_page(struct page *page, unsigned int flags)
return -EREMOTEIO;
if (flags & FOLL_GET)
- folio_ref_inc(folio);
+ folio_ref_add(folio, refs);
else if (flags & FOLL_PIN) {
/*
* Don't take a pin on the zero page - it's not going anywhere
@@ -243,18 +157,18 @@ int __must_check try_grab_page(struct page *page, unsigned int flags)
return 0;
/*
- * Similar to try_grab_folio(): be sure to *also*
- * increment the normal page refcount field at least once,
+ * Increment the normal page refcount field at least once,
* so that the page really is pinned.
*/
if (folio_test_large(folio)) {
- folio_ref_add(folio, 1);
- atomic_add(1, &folio->_pincount);
+ folio_ref_add(folio, refs);
+ atomic_add(refs, &folio->_pincount);
} else {
- folio_ref_add(folio, GUP_PIN_COUNTING_BIAS);
+ folio_ref_add(folio,
+ refs * GUP_PIN_COUNTING_BIAS);
}
- node_stat_mod_folio(folio, NR_FOLL_PIN_ACQUIRED, 1);
+ node_stat_mod_folio(folio, NR_FOLL_PIN_ACQUIRED, refs);
}
return 0;
@@ -535,7 +449,7 @@ static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end,
*/
static int gup_hugepte(struct vm_area_struct *vma, pte_t *ptep, unsigned long sz,
unsigned long addr, unsigned long end, unsigned int flags,
- struct page **pages, int *nr)
+ struct page **pages, int *nr, bool fast)
{
unsigned long pte_end;
struct page *page;
@@ -558,9 +472,15 @@ static int gup_hugepte(struct vm_area_struct *vma, pte_t *ptep, unsigned long sz
page = pte_page(pte);
refs = record_subpages(page, sz, addr, end, pages + *nr);
- folio = try_grab_folio(page, refs, flags);
- if (!folio)
- return 0;
+ if (fast) {
+ folio = try_grab_folio_fast(page, refs, flags);
+ if (!folio)
+ return 0;
+ } else {
+ folio = page_folio(page);
+ if (try_grab_folio(folio, refs, flags))
+ return 0;
+ }
if (unlikely(pte_val(pte) != pte_val(ptep_get(ptep)))) {
gup_put_folio(folio, refs, flags);
@@ -588,7 +508,7 @@ static int gup_hugepte(struct vm_area_struct *vma, pte_t *ptep, unsigned long sz
static int gup_hugepd(struct vm_area_struct *vma, hugepd_t hugepd,
unsigned long addr, unsigned int pdshift,
unsigned long end, unsigned int flags,
- struct page **pages, int *nr)
+ struct page **pages, int *nr, bool fast)
{
pte_t *ptep;
unsigned long sz = 1UL << hugepd_shift(hugepd);
@@ -598,7 +518,7 @@ static int gup_hugepd(struct vm_area_struct *vma, hugepd_t hugepd,
ptep = hugepte_offset(hugepd, addr, pdshift);
do {
next = hugepte_addr_end(addr, end, sz);
- ret = gup_hugepte(vma, ptep, sz, addr, end, flags, pages, nr);
+ ret = gup_hugepte(vma, ptep, sz, addr, end, flags, pages, nr, fast);
if (ret != 1)
return ret;
} while (ptep++, addr = next, addr != end);
@@ -625,7 +545,7 @@ static struct page *follow_hugepd(struct vm_area_struct *vma, hugepd_t hugepd,
ptep = hugepte_offset(hugepd, addr, pdshift);
ptl = huge_pte_lock(h, vma->vm_mm, ptep);
ret = gup_hugepd(vma, hugepd, addr, pdshift, addr + PAGE_SIZE,
- flags, &page, &nr);
+ flags, &page, &nr, false);
spin_unlock(ptl);
if (ret == 1) {
@@ -642,7 +562,7 @@ static struct page *follow_hugepd(struct vm_area_struct *vma, hugepd_t hugepd,
static inline int gup_hugepd(struct vm_area_struct *vma, hugepd_t hugepd,
unsigned long addr, unsigned int pdshift,
unsigned long end, unsigned int flags,
- struct page **pages, int *nr)
+ struct page **pages, int *nr, bool fast)
{
return 0;
}
@@ -729,7 +649,7 @@ static struct page *follow_huge_pud(struct vm_area_struct *vma,
gup_must_unshare(vma, flags, page))
return ERR_PTR(-EMLINK);
- ret = try_grab_page(page, flags);
+ ret = try_grab_folio(page_folio(page), 1, flags);
if (ret)
page = ERR_PTR(ret);
else
@@ -806,7 +726,7 @@ static struct page *follow_huge_pmd(struct vm_area_struct *vma,
VM_BUG_ON_PAGE((flags & FOLL_PIN) && PageAnon(page) &&
!PageAnonExclusive(page), page);
- ret = try_grab_page(page, flags);
+ ret = try_grab_folio(page_folio(page), 1, flags);
if (ret)
return ERR_PTR(ret);
@@ -968,8 +888,8 @@ static struct page *follow_page_pte(struct vm_area_struct *vma,
VM_BUG_ON_PAGE((flags & FOLL_PIN) && PageAnon(page) &&
!PageAnonExclusive(page), page);
- /* try_grab_page() does nothing unless FOLL_GET or FOLL_PIN is set. */
- ret = try_grab_page(page, flags);
+ /* try_grab_folio() does nothing unless FOLL_GET or FOLL_PIN is set. */
+ ret = try_grab_folio(page_folio(page), 1, flags);
if (unlikely(ret)) {
page = ERR_PTR(ret);
goto out;
@@ -1233,7 +1153,7 @@ static int get_gate_page(struct mm_struct *mm, unsigned long address,
goto unmap;
*page = pte_page(entry);
}
- ret = try_grab_page(*page, gup_flags);
+ ret = try_grab_folio(page_folio(*page), 1, gup_flags);
if (unlikely(ret))
goto unmap;
out:
@@ -1636,20 +1556,19 @@ static long __get_user_pages(struct mm_struct *mm,
* pages.
*/
if (page_increm > 1) {
- struct folio *folio;
+ struct folio *folio = page_folio(page);
/*
* Since we already hold refcount on the
* large folio, this should never fail.
*/
- folio = try_grab_folio(page, page_increm - 1,
- foll_flags);
- if (WARN_ON_ONCE(!folio)) {
+ if (try_grab_folio(folio, page_increm - 1,
+ foll_flags)) {
/*
* Release the 1st page ref if the
* folio is problematic, fail hard.
*/
- gup_put_folio(page_folio(page), 1,
+ gup_put_folio(folio, 1,
foll_flags);
ret = -EFAULT;
goto out;
@@ -2797,6 +2716,101 @@ EXPORT_SYMBOL(get_user_pages_unlocked);
* This code is based heavily on the PowerPC implementation by Nick Piggin.
*/
#ifdef CONFIG_HAVE_GUP_FAST
+/**
+ * try_grab_folio_fast() - Attempt to get or pin a folio in fast path.
+ * @page: pointer to page to be grabbed
+ * @refs: the value to (effectively) add to the folio's refcount
+ * @flags: gup flags: these are the FOLL_* flag values.
+ *
+ * "grab" names in this file mean, "look at flags to decide whether to use
+ * FOLL_PIN or FOLL_GET behavior, when incrementing the folio's refcount.
+ *
+ * Either FOLL_PIN or FOLL_GET (or neither) must be set, but not both at the
+ * same time. (That's true throughout the get_user_pages*() and
+ * pin_user_pages*() APIs.) Cases:
+ *
+ * FOLL_GET: folio's refcount will be incremented by @refs.
+ *
+ * FOLL_PIN on large folios: folio's refcount will be incremented by
+ * @refs, and its pincount will be incremented by @refs.
+ *
+ * FOLL_PIN on single-page folios: folio's refcount will be incremented by
+ * @refs * GUP_PIN_COUNTING_BIAS.
+ *
+ * Return: The folio containing @page (with refcount appropriately
+ * incremented) for success, or NULL upon failure. If neither FOLL_GET
+ * nor FOLL_PIN was set, that's considered failure, and furthermore,
+ * a likely bug in the caller, so a warning is also emitted.
+ *
+ * It uses add ref unless zero to elevate the folio refcount and must be called
+ * in fast path only.
+ */
+static struct folio *try_grab_folio_fast(struct page *page, int refs,
+ unsigned int flags)
+{
+ struct folio *folio;
+
+ /* Raise warn if it is not called in fast GUP */
+ VM_WARN_ON_ONCE(!irqs_disabled());
+
+ if (WARN_ON_ONCE((flags & (FOLL_GET | FOLL_PIN)) == 0))
+ return NULL;
+
+ if (unlikely(!(flags & FOLL_PCI_P2PDMA) && is_pci_p2pdma_page(page)))
+ return NULL;
+
+ if (flags & FOLL_GET)
+ return try_get_folio(page, refs);
+
+ /* FOLL_PIN is set */
+
+ /*
+ * Don't take a pin on the zero page - it's not going anywhere
+ * and it is used in a *lot* of places.
+ */
+ if (is_zero_page(page))
+ return page_folio(page);
+
+ folio = try_get_folio(page, refs);
+ if (!folio)
+ return NULL;
+
+ /*
+ * Can't do FOLL_LONGTERM + FOLL_PIN gup fast path if not in a
+ * right zone, so fail and let the caller fall back to the slow
+ * path.
+ */
+ if (unlikely((flags & FOLL_LONGTERM) &&
+ !folio_is_longterm_pinnable(folio))) {
+ if (!put_devmap_managed_folio_refs(folio, refs))
+ folio_put_refs(folio, refs);
+ return NULL;
+ }
+
+ /*
+ * When pinning a large folio, use an exact count to track it.
+ *
+ * However, be sure to *also* increment the normal folio
+ * refcount field at least once, so that the folio really
+ * is pinned. That's why the refcount from the earlier
+ * try_get_folio() is left intact.
+ */
+ if (folio_test_large(folio))
+ atomic_add(refs, &folio->_pincount);
+ else
+ folio_ref_add(folio,
+ refs * (GUP_PIN_COUNTING_BIAS - 1));
+ /*
+ * Adjust the pincount before re-checking the PTE for changes.
+ * This is essentially a smp_mb() and is paired with a memory
+ * barrier in folio_try_share_anon_rmap_*().
+ */
+ smp_mb__after_atomic();
+
+ node_stat_mod_folio(folio, NR_FOLL_PIN_ACQUIRED, refs);
+
+ return folio;
+}
/*
* Used in the GUP-fast path to determine whether GUP is permitted to work on
@@ -2962,7 +2976,7 @@ static int gup_fast_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr,
VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
page = pte_page(pte);
- folio = try_grab_folio(page, 1, flags);
+ folio = try_grab_folio_fast(page, 1, flags);
if (!folio)
goto pte_unmap;
@@ -3049,7 +3063,7 @@ static int gup_fast_devmap_leaf(unsigned long pfn, unsigned long addr,
break;
}
- folio = try_grab_folio(page, 1, flags);
+ folio = try_grab_folio_fast(page, 1, flags);
if (!folio) {
gup_fast_undo_dev_pagemap(nr, nr_start, flags, pages);
break;
@@ -3138,7 +3152,7 @@ static int gup_fast_pmd_leaf(pmd_t orig, pmd_t *pmdp, unsigned long addr,
page = pmd_page(orig);
refs = record_subpages(page, PMD_SIZE, addr, end, pages + *nr);
- folio = try_grab_folio(page, refs, flags);
+ folio = try_grab_folio_fast(page, refs, flags);
if (!folio)
return 0;
@@ -3182,7 +3196,7 @@ static int gup_fast_pud_leaf(pud_t orig, pud_t *pudp, unsigned long addr,
page = pud_page(orig);
refs = record_subpages(page, PUD_SIZE, addr, end, pages + *nr);
- folio = try_grab_folio(page, refs, flags);
+ folio = try_grab_folio_fast(page, refs, flags);
if (!folio)
return 0;
@@ -3222,7 +3236,7 @@ static int gup_fast_pgd_leaf(pgd_t orig, pgd_t *pgdp, unsigned long addr,
page = pgd_page(orig);
refs = record_subpages(page, PGDIR_SIZE, addr, end, pages + *nr);
- folio = try_grab_folio(page, refs, flags);
+ folio = try_grab_folio_fast(page, refs, flags);
if (!folio)
return 0;
@@ -3276,7 +3290,7 @@ static int gup_fast_pmd_range(pud_t *pudp, pud_t pud, unsigned long addr,
* pmd format and THP pmd format
*/
if (gup_hugepd(NULL, __hugepd(pmd_val(pmd)), addr,
- PMD_SHIFT, next, flags, pages, nr) != 1)
+ PMD_SHIFT, next, flags, pages, nr, true) != 1)
return 0;
} else if (!gup_fast_pte_range(pmd, pmdp, addr, next, flags,
pages, nr))
@@ -3306,7 +3320,7 @@ static int gup_fast_pud_range(p4d_t *p4dp, p4d_t p4d, unsigned long addr,
return 0;
} else if (unlikely(is_hugepd(__hugepd(pud_val(pud))))) {
if (gup_hugepd(NULL, __hugepd(pud_val(pud)), addr,
- PUD_SHIFT, next, flags, pages, nr) != 1)
+ PUD_SHIFT, next, flags, pages, nr, true) != 1)
return 0;
} else if (!gup_fast_pmd_range(pudp, pud, addr, next, flags,
pages, nr))
@@ -3333,7 +3347,7 @@ static int gup_fast_p4d_range(pgd_t *pgdp, pgd_t pgd, unsigned long addr,
BUILD_BUG_ON(p4d_leaf(p4d));
if (unlikely(is_hugepd(__hugepd(p4d_val(p4d))))) {
if (gup_hugepd(NULL, __hugepd(p4d_val(p4d)), addr,
- P4D_SHIFT, next, flags, pages, nr) != 1)
+ P4D_SHIFT, next, flags, pages, nr, true) != 1)
return 0;
} else if (!gup_fast_pud_range(p4dp, p4d, addr, next, flags,
pages, nr))
@@ -3362,7 +3376,7 @@ static void gup_fast_pgd_range(unsigned long addr, unsigned long end,
return;
} else if (unlikely(is_hugepd(__hugepd(pgd_val(pgd))))) {
if (gup_hugepd(NULL, __hugepd(pgd_val(pgd)), addr,
- PGDIR_SHIFT, next, flags, pages, nr) != 1)
+ PGDIR_SHIFT, next, flags, pages, nr, true) != 1)
return;
} else if (!gup_fast_p4d_range(pgdp, pgd, addr, next, flags,
pages, nr))
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index db7946a0a28c..2120f7478e55 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1331,7 +1331,7 @@ struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr,
if (!*pgmap)
return ERR_PTR(-EFAULT);
page = pfn_to_page(pfn);
- ret = try_grab_page(page, flags);
+ ret = try_grab_folio(page_folio(page), 1, flags);
if (ret)
page = ERR_PTR(ret);
diff --git a/mm/internal.h b/mm/internal.h
index 6902b7dd8509..52db9219b2db 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -1182,8 +1182,7 @@ int migrate_device_coherent_page(struct page *page);
/*
* mm/gup.c
*/
-struct folio *try_grab_folio(struct page *page, int refs, unsigned int flags);
-int __must_check try_grab_page(struct page *page, unsigned int flags);
+int __must_check try_grab_folio(struct folio *folio, int refs, unsigned int flags);
/*
* mm/huge_memory.c
--
2.41.0
The patch titled
Subject: mm: gup: do not call try_grab_folio() in slow path
has been added to the -mm mm-hotfixes-unstable branch. Its filename is
mm-gup-do-not-call-try_grab_folio-in-slow-path.patch
This patch will shortly appear at
https://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new.git/tree/patche…
This patch will later appear in the mm-hotfixes-unstable branch at
git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next via the mm-everything
branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
and is updated there every 2-3 working days
------------------------------------------------------
From: Yang Shi <yang(a)os.amperecomputing.com>
Subject: mm: gup: do not call try_grab_folio() in slow path
Date: Thu, 27 Jun 2024 16:16:01 -0700
The try_grab_folio() is supposed to be used in fast path and it elevates
folio refcount by using add ref unless zero. We are guaranteed to have at
least one stable reference in slow path, so the simple atomic add could be
used. The performance difference should be trivial, but the misuse may be
confusing and misleading.
In another thread [1] a kernel warning was reported when pinning folio
in CMA memory when launching SEV virtual machine. The splat looks like:
[ 464.325306] WARNING: CPU: 13 PID: 6734 at mm/gup.c:1313 __get_user_pages+0x423/0x520
[ 464.325464] CPU: 13 PID: 6734 Comm: qemu-kvm Kdump: loaded Not tainted 6.6.33+ #6
[ 464.325477] RIP: 0010:__get_user_pages+0x423/0x520
[ 464.325515] Call Trace:
[ 464.325520] <TASK>
[ 464.325523] ? __get_user_pages+0x423/0x520
[ 464.325528] ? __warn+0x81/0x130
[ 464.325536] ? __get_user_pages+0x423/0x520
[ 464.325541] ? report_bug+0x171/0x1a0
[ 464.325549] ? handle_bug+0x3c/0x70
[ 464.325554] ? exc_invalid_op+0x17/0x70
[ 464.325558] ? asm_exc_invalid_op+0x1a/0x20
[ 464.325567] ? __get_user_pages+0x423/0x520
[ 464.325575] __gup_longterm_locked+0x212/0x7a0
[ 464.325583] internal_get_user_pages_fast+0xfb/0x190
[ 464.325590] pin_user_pages_fast+0x47/0x60
[ 464.325598] sev_pin_memory+0xca/0x170 [kvm_amd]
[ 464.325616] sev_mem_enc_register_region+0x81/0x130 [kvm_amd]
Per the analysis done by yangge, when starting the SEV virtual machine, it
will call pin_user_pages_fast(..., FOLL_LONGTERM, ...) to pin the memory.
But the page is in CMA area, so fast GUP will fail then fallback to the
slow path due to the longterm pinnalbe check in try_grab_folio().
The slow path will try to pin the pages then migrate them out of CMA area.
But the slow path also uses try_grab_folio() to pin the page, it will
also fail due to the same check then the above warning is triggered.
[1] https://lore.kernel.org/linux-mm/1719478388-31917-1-git-send-email-yangge11…
Link: https://lkml.kernel.org/r/20240627231601.1713119-1-yang@os.amperecomputing.…
Fixes: 57edfcfd3419 ("mm/gup: accelerate thp gup even for "pages != NULL"")
Signed-off-by: Yang Shi <yang(a)os.amperecomputing.com>
Reported-by: yangge <yangge1116(a)126.com>
Cc: David Hildenbrand <david(a)redhat.com>
Cc: Peter Xu <peterx(a)redhat.com>
Cc: <stable(a)vger.kernel.org> [6.6+]
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/gup.c | 278 +++++++++++++++++++++++----------------------
mm/huge_memory.c | 2
mm/internal.h | 3
3 files changed, 148 insertions(+), 135 deletions(-)
--- a/mm/gup.c~mm-gup-do-not-call-try_grab_folio-in-slow-path
+++ a/mm/gup.c
@@ -97,95 +97,6 @@ retry:
return folio;
}
-/**
- * try_grab_folio() - Attempt to get or pin a folio.
- * @page: pointer to page to be grabbed
- * @refs: the value to (effectively) add to the folio's refcount
- * @flags: gup flags: these are the FOLL_* flag values.
- *
- * "grab" names in this file mean, "look at flags to decide whether to use
- * FOLL_PIN or FOLL_GET behavior, when incrementing the folio's refcount.
- *
- * Either FOLL_PIN or FOLL_GET (or neither) must be set, but not both at the
- * same time. (That's true throughout the get_user_pages*() and
- * pin_user_pages*() APIs.) Cases:
- *
- * FOLL_GET: folio's refcount will be incremented by @refs.
- *
- * FOLL_PIN on large folios: folio's refcount will be incremented by
- * @refs, and its pincount will be incremented by @refs.
- *
- * FOLL_PIN on single-page folios: folio's refcount will be incremented by
- * @refs * GUP_PIN_COUNTING_BIAS.
- *
- * Return: The folio containing @page (with refcount appropriately
- * incremented) for success, or NULL upon failure. If neither FOLL_GET
- * nor FOLL_PIN was set, that's considered failure, and furthermore,
- * a likely bug in the caller, so a warning is also emitted.
- */
-struct folio *try_grab_folio(struct page *page, int refs, unsigned int flags)
-{
- struct folio *folio;
-
- if (WARN_ON_ONCE((flags & (FOLL_GET | FOLL_PIN)) == 0))
- return NULL;
-
- if (unlikely(!(flags & FOLL_PCI_P2PDMA) && is_pci_p2pdma_page(page)))
- return NULL;
-
- if (flags & FOLL_GET)
- return try_get_folio(page, refs);
-
- /* FOLL_PIN is set */
-
- /*
- * Don't take a pin on the zero page - it's not going anywhere
- * and it is used in a *lot* of places.
- */
- if (is_zero_page(page))
- return page_folio(page);
-
- folio = try_get_folio(page, refs);
- if (!folio)
- return NULL;
-
- /*
- * Can't do FOLL_LONGTERM + FOLL_PIN gup fast path if not in a
- * right zone, so fail and let the caller fall back to the slow
- * path.
- */
- if (unlikely((flags & FOLL_LONGTERM) &&
- !folio_is_longterm_pinnable(folio))) {
- if (!put_devmap_managed_folio_refs(folio, refs))
- folio_put_refs(folio, refs);
- return NULL;
- }
-
- /*
- * When pinning a large folio, use an exact count to track it.
- *
- * However, be sure to *also* increment the normal folio
- * refcount field at least once, so that the folio really
- * is pinned. That's why the refcount from the earlier
- * try_get_folio() is left intact.
- */
- if (folio_test_large(folio))
- atomic_add(refs, &folio->_pincount);
- else
- folio_ref_add(folio,
- refs * (GUP_PIN_COUNTING_BIAS - 1));
- /*
- * Adjust the pincount before re-checking the PTE for changes.
- * This is essentially a smp_mb() and is paired with a memory
- * barrier in folio_try_share_anon_rmap_*().
- */
- smp_mb__after_atomic();
-
- node_stat_mod_folio(folio, NR_FOLL_PIN_ACQUIRED, refs);
-
- return folio;
-}
-
static void gup_put_folio(struct folio *folio, int refs, unsigned int flags)
{
if (flags & FOLL_PIN) {
@@ -203,28 +114,31 @@ static void gup_put_folio(struct folio *
}
/**
- * try_grab_page() - elevate a page's refcount by a flag-dependent amount
- * @page: pointer to page to be grabbed
- * @flags: gup flags: these are the FOLL_* flag values.
+ * try_grab_folio() - add a folio's refcount by a flag-dependent amount
+ * @folio: pointer to folio to be grabbed
+ * @refs: the value to (effectively) add to the folio's refcount
+ * @flags: gup flags: these are the FOLL_* flag values
*
* This might not do anything at all, depending on the flags argument.
*
* "grab" names in this file mean, "look at flags to decide whether to use
- * FOLL_PIN or FOLL_GET behavior, when incrementing the page's refcount.
+ * FOLL_PIN or FOLL_GET behavior, when incrementing the folio's refcount.
*
* Either FOLL_PIN or FOLL_GET (or neither) may be set, but not both at the same
- * time. Cases: please see the try_grab_folio() documentation, with
- * "refs=1".
+ * time.
*
* Return: 0 for success, or if no action was required (if neither FOLL_PIN
* nor FOLL_GET was set, nothing is done). A negative error code for failure:
*
- * -ENOMEM FOLL_GET or FOLL_PIN was set, but the page could not
+ * -ENOMEM FOLL_GET or FOLL_PIN was set, but the folio could not
* be grabbed.
+ *
+ * It is called when we have a stable reference for the folio, typically in
+ * GUP slow path.
*/
-int __must_check try_grab_page(struct page *page, unsigned int flags)
+int __must_check try_grab_folio(struct folio *folio, int refs, unsigned int flags)
{
- struct folio *folio = page_folio(page);
+ struct page *page = &folio->page;
if (WARN_ON_ONCE(folio_ref_count(folio) <= 0))
return -ENOMEM;
@@ -233,7 +147,7 @@ int __must_check try_grab_page(struct pa
return -EREMOTEIO;
if (flags & FOLL_GET)
- folio_ref_inc(folio);
+ folio_ref_add(folio, refs);
else if (flags & FOLL_PIN) {
/*
* Don't take a pin on the zero page - it's not going anywhere
@@ -243,18 +157,18 @@ int __must_check try_grab_page(struct pa
return 0;
/*
- * Similar to try_grab_folio(): be sure to *also*
- * increment the normal page refcount field at least once,
+ * Increment the normal page refcount field at least once,
* so that the page really is pinned.
*/
if (folio_test_large(folio)) {
- folio_ref_add(folio, 1);
- atomic_add(1, &folio->_pincount);
+ folio_ref_add(folio, refs);
+ atomic_add(refs, &folio->_pincount);
} else {
- folio_ref_add(folio, GUP_PIN_COUNTING_BIAS);
+ folio_ref_add(folio,
+ refs * GUP_PIN_COUNTING_BIAS);
}
- node_stat_mod_folio(folio, NR_FOLL_PIN_ACQUIRED, 1);
+ node_stat_mod_folio(folio, NR_FOLL_PIN_ACQUIRED, refs);
}
return 0;
@@ -535,7 +449,7 @@ static unsigned long hugepte_addr_end(un
*/
static int gup_hugepte(struct vm_area_struct *vma, pte_t *ptep, unsigned long sz,
unsigned long addr, unsigned long end, unsigned int flags,
- struct page **pages, int *nr)
+ struct page **pages, int *nr, bool fast)
{
unsigned long pte_end;
struct page *page;
@@ -558,9 +472,15 @@ static int gup_hugepte(struct vm_area_st
page = pte_page(pte);
refs = record_subpages(page, sz, addr, end, pages + *nr);
- folio = try_grab_folio(page, refs, flags);
- if (!folio)
- return 0;
+ if (fast) {
+ folio = try_grab_folio_fast(page, refs, flags);
+ if (!folio)
+ return 0;
+ } else {
+ folio = page_folio(page);
+ if (try_grab_folio(folio, refs, flags))
+ return 0;
+ }
if (unlikely(pte_val(pte) != pte_val(ptep_get(ptep)))) {
gup_put_folio(folio, refs, flags);
@@ -588,7 +508,7 @@ static int gup_hugepte(struct vm_area_st
static int gup_hugepd(struct vm_area_struct *vma, hugepd_t hugepd,
unsigned long addr, unsigned int pdshift,
unsigned long end, unsigned int flags,
- struct page **pages, int *nr)
+ struct page **pages, int *nr, bool fast)
{
pte_t *ptep;
unsigned long sz = 1UL << hugepd_shift(hugepd);
@@ -598,7 +518,7 @@ static int gup_hugepd(struct vm_area_str
ptep = hugepte_offset(hugepd, addr, pdshift);
do {
next = hugepte_addr_end(addr, end, sz);
- ret = gup_hugepte(vma, ptep, sz, addr, end, flags, pages, nr);
+ ret = gup_hugepte(vma, ptep, sz, addr, end, flags, pages, nr, fast);
if (ret != 1)
return ret;
} while (ptep++, addr = next, addr != end);
@@ -625,7 +545,7 @@ static struct page *follow_hugepd(struct
ptep = hugepte_offset(hugepd, addr, pdshift);
ptl = huge_pte_lock(h, vma->vm_mm, ptep);
ret = gup_hugepd(vma, hugepd, addr, pdshift, addr + PAGE_SIZE,
- flags, &page, &nr);
+ flags, &page, &nr, false);
spin_unlock(ptl);
if (ret == 1) {
@@ -642,7 +562,7 @@ static struct page *follow_hugepd(struct
static inline int gup_hugepd(struct vm_area_struct *vma, hugepd_t hugepd,
unsigned long addr, unsigned int pdshift,
unsigned long end, unsigned int flags,
- struct page **pages, int *nr)
+ struct page **pages, int *nr, bool fast)
{
return 0;
}
@@ -729,7 +649,7 @@ static struct page *follow_huge_pud(stru
gup_must_unshare(vma, flags, page))
return ERR_PTR(-EMLINK);
- ret = try_grab_page(page, flags);
+ ret = try_grab_folio(page_folio(page), 1, flags);
if (ret)
page = ERR_PTR(ret);
else
@@ -806,7 +726,7 @@ static struct page *follow_huge_pmd(stru
VM_BUG_ON_PAGE((flags & FOLL_PIN) && PageAnon(page) &&
!PageAnonExclusive(page), page);
- ret = try_grab_page(page, flags);
+ ret = try_grab_folio(page_folio(page), 1, flags);
if (ret)
return ERR_PTR(ret);
@@ -968,8 +888,8 @@ static struct page *follow_page_pte(stru
VM_BUG_ON_PAGE((flags & FOLL_PIN) && PageAnon(page) &&
!PageAnonExclusive(page), page);
- /* try_grab_page() does nothing unless FOLL_GET or FOLL_PIN is set. */
- ret = try_grab_page(page, flags);
+ /* try_grab_folio() does nothing unless FOLL_GET or FOLL_PIN is set. */
+ ret = try_grab_folio(page_folio(page), 1, flags);
if (unlikely(ret)) {
page = ERR_PTR(ret);
goto out;
@@ -1233,7 +1153,7 @@ static int get_gate_page(struct mm_struc
goto unmap;
*page = pte_page(entry);
}
- ret = try_grab_page(*page, gup_flags);
+ ret = try_grab_folio(page_folio(*page), 1, gup_flags);
if (unlikely(ret))
goto unmap;
out:
@@ -1636,20 +1556,19 @@ next_page:
* pages.
*/
if (page_increm > 1) {
- struct folio *folio;
+ struct folio *folio = page_folio(page);
/*
* Since we already hold refcount on the
* large folio, this should never fail.
*/
- folio = try_grab_folio(page, page_increm - 1,
- foll_flags);
- if (WARN_ON_ONCE(!folio)) {
+ if (try_grab_folio(folio, page_increm - 1,
+ foll_flags)) {
/*
* Release the 1st page ref if the
* folio is problematic, fail hard.
*/
- gup_put_folio(page_folio(page), 1,
+ gup_put_folio(folio, 1,
foll_flags);
ret = -EFAULT;
goto out;
@@ -2797,6 +2716,101 @@ EXPORT_SYMBOL(get_user_pages_unlocked);
* This code is based heavily on the PowerPC implementation by Nick Piggin.
*/
#ifdef CONFIG_HAVE_GUP_FAST
+/**
+ * try_grab_folio_fast() - Attempt to get or pin a folio in fast path.
+ * @page: pointer to page to be grabbed
+ * @refs: the value to (effectively) add to the folio's refcount
+ * @flags: gup flags: these are the FOLL_* flag values.
+ *
+ * "grab" names in this file mean, "look at flags to decide whether to use
+ * FOLL_PIN or FOLL_GET behavior, when incrementing the folio's refcount.
+ *
+ * Either FOLL_PIN or FOLL_GET (or neither) must be set, but not both at the
+ * same time. (That's true throughout the get_user_pages*() and
+ * pin_user_pages*() APIs.) Cases:
+ *
+ * FOLL_GET: folio's refcount will be incremented by @refs.
+ *
+ * FOLL_PIN on large folios: folio's refcount will be incremented by
+ * @refs, and its pincount will be incremented by @refs.
+ *
+ * FOLL_PIN on single-page folios: folio's refcount will be incremented by
+ * @refs * GUP_PIN_COUNTING_BIAS.
+ *
+ * Return: The folio containing @page (with refcount appropriately
+ * incremented) for success, or NULL upon failure. If neither FOLL_GET
+ * nor FOLL_PIN was set, that's considered failure, and furthermore,
+ * a likely bug in the caller, so a warning is also emitted.
+ *
+ * It uses add ref unless zero to elevate the folio refcount and must be called
+ * in fast path only.
+ */
+static struct folio *try_grab_folio_fast(struct page *page, int refs,
+ unsigned int flags)
+{
+ struct folio *folio;
+
+ /* Raise warn if it is not called in fast GUP */
+ VM_WARN_ON_ONCE(!irqs_disabled());
+
+ if (WARN_ON_ONCE((flags & (FOLL_GET | FOLL_PIN)) == 0))
+ return NULL;
+
+ if (unlikely(!(flags & FOLL_PCI_P2PDMA) && is_pci_p2pdma_page(page)))
+ return NULL;
+
+ if (flags & FOLL_GET)
+ return try_get_folio(page, refs);
+
+ /* FOLL_PIN is set */
+
+ /*
+ * Don't take a pin on the zero page - it's not going anywhere
+ * and it is used in a *lot* of places.
+ */
+ if (is_zero_page(page))
+ return page_folio(page);
+
+ folio = try_get_folio(page, refs);
+ if (!folio)
+ return NULL;
+
+ /*
+ * Can't do FOLL_LONGTERM + FOLL_PIN gup fast path if not in a
+ * right zone, so fail and let the caller fall back to the slow
+ * path.
+ */
+ if (unlikely((flags & FOLL_LONGTERM) &&
+ !folio_is_longterm_pinnable(folio))) {
+ if (!put_devmap_managed_folio_refs(folio, refs))
+ folio_put_refs(folio, refs);
+ return NULL;
+ }
+
+ /*
+ * When pinning a large folio, use an exact count to track it.
+ *
+ * However, be sure to *also* increment the normal folio
+ * refcount field at least once, so that the folio really
+ * is pinned. That's why the refcount from the earlier
+ * try_get_folio() is left intact.
+ */
+ if (folio_test_large(folio))
+ atomic_add(refs, &folio->_pincount);
+ else
+ folio_ref_add(folio,
+ refs * (GUP_PIN_COUNTING_BIAS - 1));
+ /*
+ * Adjust the pincount before re-checking the PTE for changes.
+ * This is essentially a smp_mb() and is paired with a memory
+ * barrier in folio_try_share_anon_rmap_*().
+ */
+ smp_mb__after_atomic();
+
+ node_stat_mod_folio(folio, NR_FOLL_PIN_ACQUIRED, refs);
+
+ return folio;
+}
/*
* Used in the GUP-fast path to determine whether GUP is permitted to work on
@@ -2962,7 +2976,7 @@ static int gup_fast_pte_range(pmd_t pmd,
VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
page = pte_page(pte);
- folio = try_grab_folio(page, 1, flags);
+ folio = try_grab_folio_fast(page, 1, flags);
if (!folio)
goto pte_unmap;
@@ -3049,7 +3063,7 @@ static int gup_fast_devmap_leaf(unsigned
break;
}
- folio = try_grab_folio(page, 1, flags);
+ folio = try_grab_folio_fast(page, 1, flags);
if (!folio) {
gup_fast_undo_dev_pagemap(nr, nr_start, flags, pages);
break;
@@ -3138,7 +3152,7 @@ static int gup_fast_pmd_leaf(pmd_t orig,
page = pmd_page(orig);
refs = record_subpages(page, PMD_SIZE, addr, end, pages + *nr);
- folio = try_grab_folio(page, refs, flags);
+ folio = try_grab_folio_fast(page, refs, flags);
if (!folio)
return 0;
@@ -3182,7 +3196,7 @@ static int gup_fast_pud_leaf(pud_t orig,
page = pud_page(orig);
refs = record_subpages(page, PUD_SIZE, addr, end, pages + *nr);
- folio = try_grab_folio(page, refs, flags);
+ folio = try_grab_folio_fast(page, refs, flags);
if (!folio)
return 0;
@@ -3222,7 +3236,7 @@ static int gup_fast_pgd_leaf(pgd_t orig,
page = pgd_page(orig);
refs = record_subpages(page, PGDIR_SIZE, addr, end, pages + *nr);
- folio = try_grab_folio(page, refs, flags);
+ folio = try_grab_folio_fast(page, refs, flags);
if (!folio)
return 0;
@@ -3276,7 +3290,7 @@ static int gup_fast_pmd_range(pud_t *pud
* pmd format and THP pmd format
*/
if (gup_hugepd(NULL, __hugepd(pmd_val(pmd)), addr,
- PMD_SHIFT, next, flags, pages, nr) != 1)
+ PMD_SHIFT, next, flags, pages, nr, true) != 1)
return 0;
} else if (!gup_fast_pte_range(pmd, pmdp, addr, next, flags,
pages, nr))
@@ -3306,7 +3320,7 @@ static int gup_fast_pud_range(p4d_t *p4d
return 0;
} else if (unlikely(is_hugepd(__hugepd(pud_val(pud))))) {
if (gup_hugepd(NULL, __hugepd(pud_val(pud)), addr,
- PUD_SHIFT, next, flags, pages, nr) != 1)
+ PUD_SHIFT, next, flags, pages, nr, true) != 1)
return 0;
} else if (!gup_fast_pmd_range(pudp, pud, addr, next, flags,
pages, nr))
@@ -3333,7 +3347,7 @@ static int gup_fast_p4d_range(pgd_t *pgd
BUILD_BUG_ON(p4d_leaf(p4d));
if (unlikely(is_hugepd(__hugepd(p4d_val(p4d))))) {
if (gup_hugepd(NULL, __hugepd(p4d_val(p4d)), addr,
- P4D_SHIFT, next, flags, pages, nr) != 1)
+ P4D_SHIFT, next, flags, pages, nr, true) != 1)
return 0;
} else if (!gup_fast_pud_range(p4dp, p4d, addr, next, flags,
pages, nr))
@@ -3362,7 +3376,7 @@ static void gup_fast_pgd_range(unsigned
return;
} else if (unlikely(is_hugepd(__hugepd(pgd_val(pgd))))) {
if (gup_hugepd(NULL, __hugepd(pgd_val(pgd)), addr,
- PGDIR_SHIFT, next, flags, pages, nr) != 1)
+ PGDIR_SHIFT, next, flags, pages, nr, true) != 1)
return;
} else if (!gup_fast_p4d_range(pgdp, pgd, addr, next, flags,
pages, nr))
--- a/mm/huge_memory.c~mm-gup-do-not-call-try_grab_folio-in-slow-path
+++ a/mm/huge_memory.c
@@ -1331,7 +1331,7 @@ struct page *follow_devmap_pmd(struct vm
if (!*pgmap)
return ERR_PTR(-EFAULT);
page = pfn_to_page(pfn);
- ret = try_grab_page(page, flags);
+ ret = try_grab_folio(page_folio(page), 1, flags);
if (ret)
page = ERR_PTR(ret);
--- a/mm/internal.h~mm-gup-do-not-call-try_grab_folio-in-slow-path
+++ a/mm/internal.h
@@ -1182,8 +1182,7 @@ int migrate_device_coherent_page(struct
/*
* mm/gup.c
*/
-struct folio *try_grab_folio(struct page *page, int refs, unsigned int flags);
-int __must_check try_grab_page(struct page *page, unsigned int flags);
+int __must_check try_grab_folio(struct folio *folio, int refs, unsigned int flags);
/*
* mm/huge_memory.c
_
Patches currently in -mm which might be from yang(a)os.amperecomputing.com are
mm-page_ref-remove-folio_try_get_rcu.patch
mm-gup-do-not-call-try_grab_folio-in-slow-path.patch
hugetlbfs-add-mte-support.patch
Read callbacks registered with nvmem core expect 0 to be returned on
success and a negative value to be returned on failure.
abx80x_nvmem_xfer() on read calls i2c_smbus_read_i2c_block_data() which
returns the number of bytes read on success as per its api description,
this return value is handled as an error and returned to nvmem even on
success.
Fix to handle all possible values that would be returned by
i2c_smbus_read_i2c_block_data().
Fixes: e90ff8ede777 ("rtc: abx80x: Add nvmem support")
Cc: stable(a)vger.kernel.org
Signed-off-by: Joy Chakraborty <joychakr(a)google.com>
---
drivers/rtc/rtc-abx80x.c | 12 ++++++++----
1 file changed, 8 insertions(+), 4 deletions(-)
diff --git a/drivers/rtc/rtc-abx80x.c b/drivers/rtc/rtc-abx80x.c
index fde2b8054c2e..1298962402ff 100644
--- a/drivers/rtc/rtc-abx80x.c
+++ b/drivers/rtc/rtc-abx80x.c
@@ -705,14 +705,18 @@ static int abx80x_nvmem_xfer(struct abx80x_priv *priv, unsigned int offset,
if (ret)
return ret;
- if (write)
+ if (write) {
ret = i2c_smbus_write_i2c_block_data(priv->client, reg,
len, val);
- else
+ if (ret)
+ return ret;
+ } else {
ret = i2c_smbus_read_i2c_block_data(priv->client, reg,
len, val);
- if (ret)
- return ret;
+ if (ret <= 0)
+ return ret ? ret : -EIO;
+ len = ret;
+ }
offset += len;
val += len;
--
2.45.2.505.gda0bf45e8d-goog
Read/write callbacks registered with nvmem core expect 0 to be returned
on success and a negative value to be returned on failure.
isl1208_nvmem_read()/isl1208_nvmem_write() currently return the number of
bytes read/written on success, fix to return 0 on success and negative on
failure.
Fixes: c3544f6f51ed ("rtc: isl1208: Add new style nvmem support to driver")
Cc: stable(a)vger.kernel.org
Signed-off-by: Joy Chakraborty <joychakr(a)google.com>
---
drivers/rtc/rtc-isl1208.c | 11 ++++-------
1 file changed, 4 insertions(+), 7 deletions(-)
diff --git a/drivers/rtc/rtc-isl1208.c b/drivers/rtc/rtc-isl1208.c
index e50c23ee1646..206f96b90f58 100644
--- a/drivers/rtc/rtc-isl1208.c
+++ b/drivers/rtc/rtc-isl1208.c
@@ -775,14 +775,13 @@ static int isl1208_nvmem_read(void *priv, unsigned int off, void *buf,
{
struct isl1208_state *isl1208 = priv;
struct i2c_client *client = to_i2c_client(isl1208->rtc->dev.parent);
- int ret;
/* nvmem sanitizes offset/count for us, but count==0 is possible */
if (!count)
return count;
- ret = isl1208_i2c_read_regs(client, ISL1208_REG_USR1 + off, buf,
+
+ return isl1208_i2c_read_regs(client, ISL1208_REG_USR1 + off, buf,
count);
- return ret == 0 ? count : ret;
}
static int isl1208_nvmem_write(void *priv, unsigned int off, void *buf,
@@ -790,15 +789,13 @@ static int isl1208_nvmem_write(void *priv, unsigned int off, void *buf,
{
struct isl1208_state *isl1208 = priv;
struct i2c_client *client = to_i2c_client(isl1208->rtc->dev.parent);
- int ret;
/* nvmem sanitizes off/count for us, but count==0 is possible */
if (!count)
return count;
- ret = isl1208_i2c_set_regs(client, ISL1208_REG_USR1 + off, buf,
- count);
- return ret == 0 ? count : ret;
+ return isl1208_i2c_set_regs(client, ISL1208_REG_USR1 + off, buf,
+ count);
}
static const struct nvmem_config isl1208_nvmem_config = {
--
2.45.2.505.gda0bf45e8d-goog
Read/write callbacks registered with nvmem core expect 0 to be returned
on success and a negative value to be returned on failure.
cmos_nvram_read()/cmos_nvram_write() currently return the number of
bytes read or written, fix to return 0 on success and -EIO incase number
of bytes requested was not read or written.
Fixes: 8b5b7958fd1c ("rtc: cmos: use generic nvmem")
Cc: stable(a)vger.kernel.org
Signed-off-by: Joy Chakraborty <joychakr(a)google.com>
---
drivers/rtc/rtc-cmos.c | 10 ++++------
1 file changed, 4 insertions(+), 6 deletions(-)
diff --git a/drivers/rtc/rtc-cmos.c b/drivers/rtc/rtc-cmos.c
index 7d99cd2c37a0..35dca2accbb8 100644
--- a/drivers/rtc/rtc-cmos.c
+++ b/drivers/rtc/rtc-cmos.c
@@ -643,11 +643,10 @@ static int cmos_nvram_read(void *priv, unsigned int off, void *val,
size_t count)
{
unsigned char *buf = val;
- int retval;
off += NVRAM_OFFSET;
spin_lock_irq(&rtc_lock);
- for (retval = 0; count; count--, off++, retval++) {
+ for (; count; count--, off++) {
if (off < 128)
*buf++ = CMOS_READ(off);
else if (can_bank2)
@@ -657,7 +656,7 @@ static int cmos_nvram_read(void *priv, unsigned int off, void *val,
}
spin_unlock_irq(&rtc_lock);
- return retval;
+ return count ? -EIO : 0;
}
static int cmos_nvram_write(void *priv, unsigned int off, void *val,
@@ -665,7 +664,6 @@ static int cmos_nvram_write(void *priv, unsigned int off, void *val,
{
struct cmos_rtc *cmos = priv;
unsigned char *buf = val;
- int retval;
/* NOTE: on at least PCs and Ataris, the boot firmware uses a
* checksum on part of the NVRAM data. That's currently ignored
@@ -674,7 +672,7 @@ static int cmos_nvram_write(void *priv, unsigned int off, void *val,
*/
off += NVRAM_OFFSET;
spin_lock_irq(&rtc_lock);
- for (retval = 0; count; count--, off++, retval++) {
+ for (; count; count--, off++) {
/* don't trash RTC registers */
if (off == cmos->day_alrm
|| off == cmos->mon_alrm
@@ -689,7 +687,7 @@ static int cmos_nvram_write(void *priv, unsigned int off, void *val,
}
spin_unlock_irq(&rtc_lock);
- return retval;
+ return count ? -EIO : 0;
}
/*----------------------------------------------------------------*/
--
2.45.2.505.gda0bf45e8d-goog
Do not disable broadcast if we are using address 0 (broadcast) to
communicate with this device. Otherwise we will use proper driver but no
communication will be possible and no link changes will be detected.
There are two scenarios where we can run in to this situation:
- PHY is bootstrapped for address 0
- no PHY address is known and linux is scanning the MDIO bus, so first
respond and attached device will be on address 0.
The fixes tag points to the latest refactoring, not to the initial point
where kszphy_broadcast_disable() was introduced.
Fixes: 79e498a9c7da0 ("net: phy: micrel: Restore led_mode and clk_sel on resume")
Cc: stable(a)vger.kernel.org
Signed-off-by: Oleksij Rempel <o.rempel(a)pengutronix.de>
---
drivers/net/phy/micrel.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/net/phy/micrel.c b/drivers/net/phy/micrel.c
index 81c20eb4b54b9..67c2e611150d2 100644
--- a/drivers/net/phy/micrel.c
+++ b/drivers/net/phy/micrel.c
@@ -590,7 +590,7 @@ static int kszphy_config_init(struct phy_device *phydev)
type = priv->type;
- if (type && type->has_broadcast_disable)
+ if (type && type->has_broadcast_disable && phydev->mdio.addr != 0)
kszphy_broadcast_disable(phydev);
if (type && type->has_nand_tree_disable)
--
2.39.2
The patch titled
Subject: cachestat: do not flush stats in recency check
has been added to the -mm mm-hotfixes-unstable branch. Its filename is
cachestat-do-not-flush-stats-in-recency-check.patch
This patch will shortly appear at
https://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new.git/tree/patche…
This patch will later appear in the mm-hotfixes-unstable branch at
git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next via the mm-everything
branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
and is updated there every 2-3 working days
------------------------------------------------------
From: Nhat Pham <nphamcs(a)gmail.com>
Subject: cachestat: do not flush stats in recency check
Date: Thu, 27 Jun 2024 13:17:37 -0700
syzbot detects that cachestat() is flushing stats, which can sleep, in its
RCU read section (see [1]). This is done in the workingset_test_recent()
step (which checks if the folio's eviction is recent).
Move the stat flushing step to before the RCU read section of cachestat,
and skip stat flushing during the recency check.
[1]: https://lore.kernel.org/cgroups/000000000000f71227061bdf97e0@google.com/
Link: https://lkml.kernel.org/r/20240627201737.3506959-1-nphamcs@gmail.com
Fixes: b00684722262 ("mm: workingset: move the stats flush into workingset_test_recent()")
Signed-off-by: Nhat Pham <nphamcs(a)gmail.com>
Reported-by: syzbot+b7f13b2d0cc156edf61a(a)syzkaller.appspotmail.com
Closes: https://lore.kernel.org/cgroups/000000000000f71227061bdf97e0@google.com/
Debugged-by: Johannes Weiner <hannes(a)cmpxchg.org>
Suggested-by: Johannes Weiner <hannes(a)cmpxchg.org>
Cc: Al Viro <viro(a)zeniv.linux.org.uk>
Cc: David Hildenbrand <david(a)redhat.com>
Cc: "Huang, Ying" <ying.huang(a)intel.com>
Cc: Kairui Song <kasong(a)tencent.com>
Cc: Matthew Wilcox (Oracle) <willy(a)infradead.org>
Cc: Ryan Roberts <ryan.roberts(a)arm.com>
Cc: Shakeel Butt <shakeel.butt(a)linux.dev>
Cc: Yosry Ahmed <yosryahmed(a)google.com>
Cc: <stable(a)vger.kernel.org> [6.8+]
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
include/linux/swap.h | 3 ++-
mm/filemap.c | 5 ++++-
mm/workingset.c | 14 +++++++++++---
3 files changed, 17 insertions(+), 5 deletions(-)
--- a/include/linux/swap.h~cachestat-do-not-flush-stats-in-recency-check
+++ a/include/linux/swap.h
@@ -354,7 +354,8 @@ static inline swp_entry_t page_swap_entr
}
/* linux/mm/workingset.c */
-bool workingset_test_recent(void *shadow, bool file, bool *workingset);
+bool workingset_test_recent(void *shadow, bool file, bool *workingset,
+ bool flush);
void workingset_age_nonresident(struct lruvec *lruvec, unsigned long nr_pages);
void *workingset_eviction(struct folio *folio, struct mem_cgroup *target_memcg);
void workingset_refault(struct folio *folio, void *shadow);
--- a/mm/filemap.c~cachestat-do-not-flush-stats-in-recency-check
+++ a/mm/filemap.c
@@ -4248,6 +4248,9 @@ static void filemap_cachestat(struct add
XA_STATE(xas, &mapping->i_pages, first_index);
struct folio *folio;
+ /* Flush stats (and potentially sleep) outside the RCU read section. */
+ mem_cgroup_flush_stats_ratelimited(NULL);
+
rcu_read_lock();
xas_for_each(&xas, folio, last_index) {
int order;
@@ -4311,7 +4314,7 @@ static void filemap_cachestat(struct add
goto resched;
}
#endif
- if (workingset_test_recent(shadow, true, &workingset))
+ if (workingset_test_recent(shadow, true, &workingset, false))
cs->nr_recently_evicted += nr_pages;
goto resched;
--- a/mm/workingset.c~cachestat-do-not-flush-stats-in-recency-check
+++ a/mm/workingset.c
@@ -412,10 +412,12 @@ void *workingset_eviction(struct folio *
* @file: whether the corresponding folio is from the file lru.
* @workingset: where the workingset value unpacked from shadow should
* be stored.
+ * @flush: whether to flush cgroup rstat.
*
* Return: true if the shadow is for a recently evicted folio; false otherwise.
*/
-bool workingset_test_recent(void *shadow, bool file, bool *workingset)
+bool workingset_test_recent(void *shadow, bool file, bool *workingset,
+ bool flush)
{
struct mem_cgroup *eviction_memcg;
struct lruvec *eviction_lruvec;
@@ -467,10 +469,16 @@ bool workingset_test_recent(void *shadow
/*
* Flush stats (and potentially sleep) outside the RCU read section.
+ *
+ * Note that workingset_test_recent() itself might be called in RCU read
+ * section (for e.g, in cachestat) - these callers need to skip flushing
+ * stats (via the flush argument).
+ *
* XXX: With per-memcg flushing and thresholding, is ratelimiting
* still needed here?
*/
- mem_cgroup_flush_stats_ratelimited(eviction_memcg);
+ if (flush)
+ mem_cgroup_flush_stats_ratelimited(eviction_memcg);
eviction_lruvec = mem_cgroup_lruvec(eviction_memcg, pgdat);
refault = atomic_long_read(&eviction_lruvec->nonresident_age);
@@ -558,7 +566,7 @@ void workingset_refault(struct folio *fo
mod_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + file, nr);
- if (!workingset_test_recent(shadow, file, &workingset))
+ if (!workingset_test_recent(shadow, file, &workingset, true))
return;
folio_set_active(folio);
_
Patches currently in -mm which might be from nphamcs(a)gmail.com are
cachestat-do-not-flush-stats-in-recency-check.patch
On Wed, 2024-06-26 at 15:07 -0400, Sasha Levin wrote:
> This is a note to let you know that I've just added the patch titled
>
> scsi: mpt3sas: Add ioc_<level> logging macros
>
> to the 4.19-stable tree which can be found at:
> http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=sum…
>
> The filename of the patch is:
> scsi-mpt3sas-add-ioc_-level-logging-macros.patch
> and it can be found in the queue-4.19 subdirectory.
>
> If you, or anyone else, feels it should not be added to the stable tree,
> please let <stable(a)vger.kernel.org> know about it.
[]
> Stable-dep-of: 4254dfeda82f ("scsi: mpt3sas: Avoid test/set_bit() operating in non-allocated memory")
Huh? This doesn't make sense as far as I can tell.
> diff --git a/drivers/scsi/mpt3sas/mpt3sas_base.h b/drivers/scsi/mpt3sas/mpt3sas_base.h
[]
> @@ -160,6 +160,15 @@ struct mpt3sas_nvme_cmd {
> */
> #define MPT3SAS_FMT "%s: "
>
> +#define ioc_err(ioc, fmt, ...) \
> + pr_err("%s: " fmt, (ioc)->name, ##__VA_ARGS__)
> +#define ioc_notice(ioc, fmt, ...) \
> + pr_notice("%s: " fmt, (ioc)->name, ##__VA_ARGS__)
> +#define ioc_warn(ioc, fmt, ...) \
> + pr_warn("%s: " fmt, (ioc)->name, ##__VA_ARGS__)
> +#define ioc_info(ioc, fmt, ...) \
> + pr_info("%s: " fmt, (ioc)->name, ##__VA_ARGS__)
> +
This only adds ioc_<level> macros and the
nominal stable dep patch below doesn't use them.
$ git log --stat -p -1 4254dfeda82f
commit 4254dfeda82f20844299dca6c38cbffcfd499f41
Author: Breno Leitao <leitao(a)debian.org>
Date: Wed Jun 5 01:55:29 2024 -0700
scsi: mpt3sas: Avoid test/set_bit() operating in non-allocated memory
There is a potential out-of-bounds access when using test_bit() on a single
word. The test_bit() and set_bit() functions operate on long values, and
when testing or setting a single word, they can exceed the word
boundary. KASAN detects this issue and produces a dump:
BUG: KASAN: slab-out-of-bounds in _scsih_add_device.constprop.0 (./arch/x86/include/asm/bitops.h:60 ./include/asm-generic/bitops/instrumented-atomic.h:29 drivers/scsi/mpt3sas/mpt3sas_scsih.c:7331) mpt3sas
Write of size 8 at addr ffff8881d26e3c60 by task kworker/u1536:2/2965
For full log, please look at [1].
Make the allocation at least the size of sizeof(unsigned long) so that
set_bit() and test_bit() have sufficient room for read/write operations
without overwriting unallocated memory.
[1] Link: https://lore.kernel.org/all/ZkNcALr3W3KGYYJG@gmail.com/
Fixes: c696f7b83ede ("scsi: mpt3sas: Implement device_remove_in_progress check in IOCTL path")
Cc: stable(a)vger.kernel.org
Suggested-by: Keith Busch <kbusch(a)kernel.org>
Signed-off-by: Breno Leitao <leitao(a)debian.org>
Link: https://lore.kernel.org/r/20240605085530.499432-1-leitao@debian.org
Reviewed-by: Keith Busch <kbusch(a)kernel.org>
Signed-off-by: Martin K. Petersen <martin.petersen(a)oracle.com>
---
drivers/scsi/mpt3sas/mpt3sas_base.c | 19 +++++++++++++++++++
1 file changed, 19 insertions(+)
diff --git a/drivers/scsi/mpt3sas/mpt3sas_base.c b/drivers/scsi/mpt3sas/mpt3sas_base.c
index 258647fc6bddb..1092497563b22 100644
--- a/drivers/scsi/mpt3sas/mpt3sas_base.c
+++ b/drivers/scsi/mpt3sas/mpt3sas_base.c
@@ -8512,6 +8512,12 @@ mpt3sas_base_attach(struct MPT3SAS_ADAPTER *ioc)
ioc->pd_handles_sz = (ioc->facts.MaxDevHandle / 8);
if (ioc->facts.MaxDevHandle % 8)
ioc->pd_handles_sz++;
+ /*
+ * pd_handles_sz should have, at least, the minimal room for
+ * set_bit()/test_bit(), otherwise out-of-memory touch may occur.
+ */
+ ioc->pd_handles_sz = ALIGN(ioc->pd_handles_sz, sizeof(unsigned long));
+
ioc->pd_handles = kzalloc(ioc->pd_handles_sz,
GFP_KERNEL);
if (!ioc->pd_handles) {
@@ -8529,6 +8535,13 @@ mpt3sas_base_attach(struct MPT3SAS_ADAPTER *ioc)
ioc->pend_os_device_add_sz = (ioc->facts.MaxDevHandle / 8);
if (ioc->facts.MaxDevHandle % 8)
ioc->pend_os_device_add_sz++;
+
+ /*
+ * pend_os_device_add_sz should have, at least, the minimal room for
+ * set_bit()/test_bit(), otherwise out-of-memory may occur.
+ */
+ ioc->pend_os_device_add_sz = ALIGN(ioc->pend_os_device_add_sz,
+ sizeof(unsigned long));
ioc->pend_os_device_add = kzalloc(ioc->pend_os_device_add_sz,
GFP_KERNEL);
if (!ioc->pend_os_device_add) {
@@ -8820,6 +8833,12 @@ _base_check_ioc_facts_changes(struct MPT3SAS_ADAPTER *ioc)
if (ioc->facts.MaxDevHandle % 8)
pd_handles_sz++;
+ /*
+ * pd_handles should have, at least, the minimal room for
+ * set_bit()/test_bit(), otherwise out-of-memory touch may
+ * occur.
+ */
+ pd_handles_sz = ALIGN(pd_handles_sz, sizeof(unsigned long));
pd_handles = krealloc(ioc->pd_handles, pd_handles_sz,
GFP_KERNEL);
if (!pd_handles) {
From: Murali Nalajala <quic_mnalajal(a)quicinc.com>
Currently get_wq_ctx() is wrongly configured as a
standard call. When two SMC calls are in sleep and one
SMC wakes up, it calls get_wq_ctx() to resume the
corresponding sleeping thread. But if get_wq_ctx() is
interrupted, goes to sleep and another SMC call is
waiting to be allocated a waitq context, it leads to a
deadlock.
To avoid this get_wq_ctx() must be an atomic call and
can't be a standard SMC call. Hence mark get_wq_ctx()
as a fast call.
Fixes: 6bf325992236 ("firmware: qcom: scm: Add wait-queue handling logic")
Cc: stable(a)vger.kernel.org
Signed-off-by: Murali Nalajala <quic_mnalajal(a)quicinc.com>
Signed-off-by: Unnathi Chalicheemala <quic_uchalich(a)quicinc.com>
---
drivers/firmware/qcom/qcom_scm-smc.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/firmware/qcom/qcom_scm-smc.c b/drivers/firmware/qcom/qcom_scm-smc.c
index 16cf88acfa8e..0a2a2c794d0e 100644
--- a/drivers/firmware/qcom/qcom_scm-smc.c
+++ b/drivers/firmware/qcom/qcom_scm-smc.c
@@ -71,7 +71,7 @@ int scm_get_wq_ctx(u32 *wq_ctx, u32 *flags, u32 *more_pending)
struct arm_smccc_res get_wq_res;
struct arm_smccc_args get_wq_ctx = {0};
- get_wq_ctx.args[0] = ARM_SMCCC_CALL_VAL(ARM_SMCCC_STD_CALL,
+ get_wq_ctx.args[0] = ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL,
ARM_SMCCC_SMC_64, ARM_SMCCC_OWNER_SIP,
SCM_SMC_FNID(QCOM_SCM_SVC_WAITQ, QCOM_SCM_WAITQ_GET_WQ_CTX));
--
2.34.1
Currently during driver initialization, mac registration is allowed
only for devices that advertise WMI_TLV_SERVICE_WOW, but QCN9274
doesn't support WoW and hence mac registration is aborted and driver
is de-initialized.
Allow mac registration to proceed without WoW Support for devices
that don't advertise WMI_TLV_SERVICE_WOW.
Tested-on: QCN9274 hw2.0 PCI WLAN.WBE.1.0.1-00029-QCAHKSWPL_SILICONZ-1
Cc: stable(a)vger.kernel.org
Fixes: 4a3c212eee0e ("wifi: ath12k: add basic WoW functionalities")
Signed-off-by: Rameshkumar Sundaram <quic_ramess(a)quicinc.com>
---
drivers/net/wireless/ath/ath12k/wow.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/drivers/net/wireless/ath/ath12k/wow.c b/drivers/net/wireless/ath/ath12k/wow.c
index 685e8e98d845..c5cba825a84a 100644
--- a/drivers/net/wireless/ath/ath12k/wow.c
+++ b/drivers/net/wireless/ath/ath12k/wow.c
@@ -1001,8 +1001,8 @@ int ath12k_wow_op_resume(struct ieee80211_hw *hw)
int ath12k_wow_init(struct ath12k *ar)
{
- if (WARN_ON(!test_bit(WMI_TLV_SERVICE_WOW, ar->wmi->wmi_ab->svc_map)))
- return -EINVAL;
+ if (!test_bit(WMI_TLV_SERVICE_WOW, ar->wmi->wmi_ab->svc_map))
+ return 0;
ar->wow.wowlan_support = ath12k_wowlan_support;
base-commit: dadc1101eabb54ace51aa6fc58c902bf43ac0ed7
--
2.25.1
Otherwise when the tracer changes syscall number to -1, the kernel fails
to initialize a0 with -ENOSYS and subsequently fails to return the error
code of the failed syscall to userspace. For example, it will break
strace syscall tampering.
Fixes: 52449c17bdd1 ("riscv: entry: set a0 = -ENOSYS only when syscall != -1")
Cc: stable(a)vger.kernel.org
Signed-off-by: Celeste Liu <CoelacanthusHex(a)gmail.com>
---
arch/riscv/kernel/traps.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/arch/riscv/kernel/traps.c b/arch/riscv/kernel/traps.c
index 05a16b1f0aee..51ebfd23e007 100644
--- a/arch/riscv/kernel/traps.c
+++ b/arch/riscv/kernel/traps.c
@@ -319,6 +319,7 @@ void do_trap_ecall_u(struct pt_regs *regs)
regs->epc += 4;
regs->orig_a0 = regs->a0;
+ regs->a0 = -ENOSYS;
riscv_v_vstate_discard(regs);
@@ -328,8 +329,7 @@ void do_trap_ecall_u(struct pt_regs *regs)
if (syscall >= 0 && syscall < NR_syscalls)
syscall_handler(regs, syscall);
- else if (syscall != -1)
- regs->a0 = -ENOSYS;
+
/*
* Ultimately, this value will get limited by KSTACK_OFFSET_MAX(),
* so the maximum stack offset is 1k bytes (10 bits).
--
2.45.2
Usb device connect may not be detected after runtime resume if
xHC is reset during resume.
In runtime resume cases xhci_resume() will only resume roothubs if there
are pending port events. If the xHC host is reset during runtime resume
due to a Save/Restore Error (SRE) then these pending port events won't be
detected as PORTSC change bits are not immediately set by host after reset.
Unconditionally resume roothubs if xHC is reset during resume to ensure
device connections are detected.
Also return early with error code if starting xHC fails after reset.
Issue was debugged and a similar solution suggested by Remi Pommarel.
Using this instead as it simplifies future refactoring.
Reported-by: Remi Pommarel <repk(a)triplefau.lt>
Closes: https://bugzilla.kernel.org/show_bug.cgi?id=218987
Suggested-by: Remi Pommarel <repk(a)triplefau.lt>
Tested-by: Remi Pommarel <repk(a)triplefau.lt>
Cc: stable(a)vger.kernel.org
Signed-off-by: Mathias Nyman <mathias.nyman(a)linux.intel.com>
---
drivers/usb/host/xhci.c | 16 +++++++++++++---
1 file changed, 13 insertions(+), 3 deletions(-)
diff --git a/drivers/usb/host/xhci.c b/drivers/usb/host/xhci.c
index 37eb37b0affa..0a8cf6c17f82 100644
--- a/drivers/usb/host/xhci.c
+++ b/drivers/usb/host/xhci.c
@@ -1125,10 +1125,20 @@ int xhci_resume(struct xhci_hcd *xhci, pm_message_t msg)
xhci_dbg(xhci, "Start the secondary HCD\n");
retval = xhci_run(xhci->shared_hcd);
}
-
+ if (retval)
+ return retval;
+ /*
+ * Resume roothubs unconditionally as PORTSC change bits are not
+ * immediately visible after xHC reset
+ */
hcd->state = HC_STATE_SUSPENDED;
- if (xhci->shared_hcd)
+
+ if (xhci->shared_hcd) {
xhci->shared_hcd->state = HC_STATE_SUSPENDED;
+ usb_hcd_resume_root_hub(xhci->shared_hcd);
+ }
+ usb_hcd_resume_root_hub(hcd);
+
goto done;
}
@@ -1152,7 +1162,6 @@ int xhci_resume(struct xhci_hcd *xhci, pm_message_t msg)
xhci_dbc_resume(xhci);
- done:
if (retval == 0) {
/*
* Resume roothubs only if there are pending events.
@@ -1178,6 +1187,7 @@ int xhci_resume(struct xhci_hcd *xhci, pm_message_t msg)
usb_hcd_resume_root_hub(hcd);
}
}
+done:
/*
* If system is subject to the Quirk, Compliance Mode Timer needs to
* be re-initialized Always after a system resume. Ports are subject
--
2.25.1
From: Jos Wang <joswang(a)lenovo.com>
This is a workaround for STAR 4846132, which only affects
DWC_usb31 version2.00a operating in host mode.
There is a problem in DWC_usb31 version 2.00a operating
in host mode that would cause a CSR read timeout When CSR
read coincides with RAM Clock Gating Entry. By disable
Clock Gating, sacrificing power consumption for normal
operation.
Cc: stable(a)vger.kernel.org
Signed-off-by: Jos Wang <joswang(a)lenovo.com>
---
v6 -> v7: code no change. Just add the v4->v5 and v5->v6
version difference description
v5 -> v6: code no change. however, there were two v5
patches, which confused the reviewers, so a v6 version
was submitted.
v4 -> v5: code no change. Patches "1/3" and "2/3" of the
patch series are under review, this patch is submitted
independently.
v3 -> v4: modify commit message, add Cc: stable(a)vger.kernel.org
v2 -> v3:
- code refactor
- modify comment, add STAR number, workaround applied in host mode
- modify commit message, add STAR number, workaround applied in host mode
- modify Author Jos Wang
v1 -> v2: code no change. Add other case patches to the patch series
drivers/usb/dwc3/core.c | 20 +++++++++++++++++++-
1 file changed, 19 insertions(+), 1 deletion(-)
diff --git a/drivers/usb/dwc3/core.c b/drivers/usb/dwc3/core.c
index 7ee61a89520b..2a3adc80fe0f 100644
--- a/drivers/usb/dwc3/core.c
+++ b/drivers/usb/dwc3/core.c
@@ -957,12 +957,16 @@ static bool dwc3_core_is_valid(struct dwc3 *dwc)
static void dwc3_core_setup_global_control(struct dwc3 *dwc)
{
+ unsigned int power_opt;
+ unsigned int hw_mode;
u32 reg;
reg = dwc3_readl(dwc->regs, DWC3_GCTL);
reg &= ~DWC3_GCTL_SCALEDOWN_MASK;
+ hw_mode = DWC3_GHWPARAMS0_MODE(dwc->hwparams.hwparams0);
+ power_opt = DWC3_GHWPARAMS1_EN_PWROPT(dwc->hwparams.hwparams1);
- switch (DWC3_GHWPARAMS1_EN_PWROPT(dwc->hwparams.hwparams1)) {
+ switch (power_opt) {
case DWC3_GHWPARAMS1_EN_PWROPT_CLK:
/**
* WORKAROUND: DWC3 revisions between 2.10a and 2.50a have an
@@ -995,6 +999,20 @@ static void dwc3_core_setup_global_control(struct dwc3 *dwc)
break;
}
+ /*
+ * This is a workaround for STAR#4846132, which only affects
+ * DWC_usb31 version2.00a operating in host mode.
+ *
+ * There is a problem in DWC_usb31 version 2.00a operating
+ * in host mode that would cause a CSR read timeout When CSR
+ * read coincides with RAM Clock Gating Entry. By disable
+ * Clock Gating, sacrificing power consumption for normal
+ * operation.
+ */
+ if (power_opt != DWC3_GHWPARAMS1_EN_PWROPT_NO &&
+ hw_mode != DWC3_GHWPARAMS0_MODE_GADGET && DWC3_VER_IS(DWC31, 200A))
+ reg |= DWC3_GCTL_DSBLCLKGTNG;
+
/* check if current dwc3 is on simulation board */
if (dwc->hwparams.hwparams6 & DWC3_GHWPARAMS6_EN_FPGA) {
dev_info(dwc->dev, "Running with FPGA optimizations\n");
--
2.17.1
Introduce a version of the fence ops that on release doesn't remove
the fence from the pending list, and thus doesn't require a lock to
fix poll->fence wait->fence unref deadlocks.
vmwgfx overwrites the wait callback to iterate over the list of all
fences and update their status, to do that it holds a lock to prevent
the list modifcations from other threads. The fence destroy callback
both deletes the fence and removes it from the list of pending
fences, for which it holds a lock.
dma buf polling cb unrefs a fence after it's been signaled: so the poll
calls the wait, which signals the fences, which are being destroyed.
The destruction tries to acquire the lock on the pending fences list
which it can never get because it's held by the wait from which it
was called.
Old bug, but not a lot of userspace apps were using dma-buf polling
interfaces. Fix those, in particular this fixes KDE stalls/deadlock.
Signed-off-by: Zack Rusin <zack.rusin(a)broadcom.com>
Fixes: 2298e804e96e ("drm/vmwgfx: rework to new fence interface, v2")
Cc: Broadcom internal kernel review list <bcm-kernel-feedback-list(a)broadcom.com>
Cc: dri-devel(a)lists.freedesktop.org
Cc: <stable(a)vger.kernel.org> # v6.2+
---
drivers/gpu/drm/vmwgfx/vmwgfx_fence.c | 26 ++++++++++++++++++++------
1 file changed, 20 insertions(+), 6 deletions(-)
diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_fence.c b/drivers/gpu/drm/vmwgfx/vmwgfx_fence.c
index 5efc6a766f64..76971ef7801a 100644
--- a/drivers/gpu/drm/vmwgfx/vmwgfx_fence.c
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_fence.c
@@ -32,7 +32,6 @@
#define VMW_FENCE_WRAP (1 << 31)
struct vmw_fence_manager {
- int num_fence_objects;
struct vmw_private *dev_priv;
spinlock_t lock;
struct list_head fence_list;
@@ -120,16 +119,23 @@ static void vmw_fence_goal_write(struct vmw_private *vmw, u32 value)
* objects with actions attached to them.
*/
-static void vmw_fence_obj_destroy(struct dma_fence *f)
+static void vmw_fence_obj_destroy_removed(struct dma_fence *f)
{
struct vmw_fence_obj *fence =
container_of(f, struct vmw_fence_obj, base);
+ WARN_ON(!list_empty(&fence->head));
+ fence->destroy(fence);
+}
+
+static void vmw_fence_obj_destroy(struct dma_fence *f)
+{
+ struct vmw_fence_obj *fence =
+ container_of(f, struct vmw_fence_obj, base);
struct vmw_fence_manager *fman = fman_from_fence(fence);
spin_lock(&fman->lock);
list_del_init(&fence->head);
- --fman->num_fence_objects;
spin_unlock(&fman->lock);
fence->destroy(fence);
}
@@ -257,6 +263,13 @@ static const struct dma_fence_ops vmw_fence_ops = {
.release = vmw_fence_obj_destroy,
};
+static const struct dma_fence_ops vmw_fence_ops_removed = {
+ .get_driver_name = vmw_fence_get_driver_name,
+ .get_timeline_name = vmw_fence_get_timeline_name,
+ .enable_signaling = vmw_fence_enable_signaling,
+ .wait = vmw_fence_wait,
+ .release = vmw_fence_obj_destroy_removed,
+};
/*
* Execute signal actions on fences recently signaled.
@@ -355,7 +368,6 @@ static int vmw_fence_obj_init(struct vmw_fence_manager *fman,
goto out_unlock;
}
list_add_tail(&fence->head, &fman->fence_list);
- ++fman->num_fence_objects;
out_unlock:
spin_unlock(&fman->lock);
@@ -403,7 +415,7 @@ static bool vmw_fence_goal_new_locked(struct vmw_fence_manager *fman,
u32 passed_seqno)
{
u32 goal_seqno;
- struct vmw_fence_obj *fence;
+ struct vmw_fence_obj *fence, *next_fence;
if (likely(!fman->seqno_valid))
return false;
@@ -413,7 +425,7 @@ static bool vmw_fence_goal_new_locked(struct vmw_fence_manager *fman,
return false;
fman->seqno_valid = false;
- list_for_each_entry(fence, &fman->fence_list, head) {
+ list_for_each_entry_safe(fence, next_fence, &fman->fence_list, head) {
if (!list_empty(&fence->seq_passed_actions)) {
fman->seqno_valid = true;
vmw_fence_goal_write(fman->dev_priv,
@@ -471,6 +483,7 @@ static void __vmw_fences_update(struct vmw_fence_manager *fman)
rerun:
list_for_each_entry_safe(fence, next_fence, &fman->fence_list, head) {
if (seqno - fence->base.seqno < VMW_FENCE_WRAP) {
+ fence->base.ops = &vmw_fence_ops_removed;
list_del_init(&fence->head);
dma_fence_signal_locked(&fence->base);
INIT_LIST_HEAD(&action_list);
@@ -662,6 +675,7 @@ void vmw_fence_fifo_down(struct vmw_fence_manager *fman)
VMW_FENCE_WAIT_TIMEOUT);
if (unlikely(ret != 0)) {
+ fence->base.ops = &vmw_fence_ops_removed;
list_del_init(&fence->head);
dma_fence_signal(&fence->base);
INIT_LIST_HEAD(&action_list);
--
2.40.1
In cdv_intel_lvds_get_modes(), the return value of drm_mode_duplicate()
is assigned to mode, which will lead to a NULL pointer dereference on
failure of drm_mode_duplicate(). Add a check to avoid npd.
Cc: stable(a)vger.kernel.org
Fixes: 6a227d5fd6c4 ("gma500: Add support for Cedarview")
Signed-off-by: Ma Ke <make24(a)iscas.ac.cn>
---
Changes in v2:
- modified the patch according to suggestions from other patchs;
- added Fixes line;
- added Cc stable;
- Link: https://lore.kernel.org/lkml/20240622072514.1867582-1-make24@iscas.ac.cn/T/
---
drivers/gpu/drm/gma500/cdv_intel_lvds.c | 3 +++
1 file changed, 3 insertions(+)
diff --git a/drivers/gpu/drm/gma500/cdv_intel_lvds.c b/drivers/gpu/drm/gma500/cdv_intel_lvds.c
index f08a6803dc18..3adc2c9ab72d 100644
--- a/drivers/gpu/drm/gma500/cdv_intel_lvds.c
+++ b/drivers/gpu/drm/gma500/cdv_intel_lvds.c
@@ -311,6 +311,9 @@ static int cdv_intel_lvds_get_modes(struct drm_connector *connector)
if (mode_dev->panel_fixed_mode != NULL) {
struct drm_display_mode *mode =
drm_mode_duplicate(dev, mode_dev->panel_fixed_mode);
+ if (!mode)
+ return 0;
+
drm_mode_probed_add(connector, mode);
return 1;
}
--
2.25.1
This is an automatic generated email to let you know that the following patch were queued:
Subject: media: stm32: dcmipp: correct error handling in dcmipp_create_subdevs
Author: Alain Volmat <alain.volmat(a)foss.st.com>
Date: Mon Jun 24 10:41:22 2024 +0200
Correct error handling within the dcmipp_create_subdevs by properly
decrementing the i counter when releasing the subdeves.
Fixes: 28e0f3772296 ("media: stm32-dcmipp: STM32 DCMIPP camera interface driver")
Cc: stable(a)vger.kernel.org
Signed-off-by: Alain Volmat <alain.volmat(a)foss.st.com>
Signed-off-by: Sakari Ailus <sakari.ailus(a)linux.intel.com>
Signed-off-by: Hans Verkuil <hverkuil-cisco(a)xs4all.nl>
drivers/media/platform/st/stm32/stm32-dcmipp/dcmipp-core.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
---
diff --git a/drivers/media/platform/st/stm32/stm32-dcmipp/dcmipp-core.c b/drivers/media/platform/st/stm32/stm32-dcmipp/dcmipp-core.c
index 4acc3b90d03a..4924ee36cfda 100644
--- a/drivers/media/platform/st/stm32/stm32-dcmipp/dcmipp-core.c
+++ b/drivers/media/platform/st/stm32/stm32-dcmipp/dcmipp-core.c
@@ -202,7 +202,7 @@ static int dcmipp_create_subdevs(struct dcmipp_device *dcmipp)
return 0;
err_init_entity:
- while (i > 0)
+ while (i-- > 0)
dcmipp->pipe_cfg->ents[i - 1].release(dcmipp->entity[i - 1]);
return ret;
}
In psb_intel_lvds_get_modes(), the return value of drm_mode_duplicate() is
assigned to mode, which will lead to a possible NULL pointer dereference
on failure of drm_mode_duplicate(). Add a check to avoid npd.
Cc: stable(a)vger.kernel.org
Fixes: 89c78134cc54 ("gma500: Add Poulsbo support")
Signed-off-by: Ma Ke <make24(a)iscas.ac.cn>
---
Changes in v2:
- modified the patch according to suggestions;
- added Fixes line;
- added Cc stable.
---
drivers/gpu/drm/gma500/psb_intel_lvds.c | 3 +++
1 file changed, 3 insertions(+)
diff --git a/drivers/gpu/drm/gma500/psb_intel_lvds.c b/drivers/gpu/drm/gma500/psb_intel_lvds.c
index 8486de230ec9..8d1be94a443b 100644
--- a/drivers/gpu/drm/gma500/psb_intel_lvds.c
+++ b/drivers/gpu/drm/gma500/psb_intel_lvds.c
@@ -504,6 +504,9 @@ static int psb_intel_lvds_get_modes(struct drm_connector *connector)
if (mode_dev->panel_fixed_mode != NULL) {
struct drm_display_mode *mode =
drm_mode_duplicate(dev, mode_dev->panel_fixed_mode);
+ if (!mode)
+ return 0;
+
drm_mode_probed_add(connector, mode);
return 1;
}
--
2.25.1
Instead of computing the number of descriptor blocks a transaction can
have each time we need it (which is currently when starting each
transaction but will become more frequent later) precompute the number
once during journal initialization together with maximum transaction
size. We perform the precomputation whenever journal feature set is
updated similarly as for computation of
journal->j_revoke_records_per_block.
CC: stable(a)vger.kernel.org
Signed-off-by: Jan Kara <jack(a)suse.cz>
---
fs/jbd2/journal.c | 61 ++++++++++++++++++++++++++++++++-----------
fs/jbd2/transaction.c | 24 +----------------
include/linux/jbd2.h | 7 +++++
3 files changed, 54 insertions(+), 38 deletions(-)
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 1bb73750d307..ae5b544ed0cc 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -1451,6 +1451,48 @@ static int journal_revoke_records_per_block(journal_t *journal)
return space / record_size;
}
+static int jbd2_journal_get_max_txn_bufs(journal_t *journal)
+{
+ return (journal->j_total_len - journal->j_fc_wbufsize) / 4;
+}
+
+/*
+ * Base amount of descriptor blocks we reserve for each transaction.
+ */
+static int jbd2_descriptor_blocks_per_trans(journal_t *journal)
+{
+ int tag_space = journal->j_blocksize - sizeof(journal_header_t);
+ int tags_per_block;
+
+ /* Subtract UUID */
+ tag_space -= 16;
+ if (jbd2_journal_has_csum_v2or3(journal))
+ tag_space -= sizeof(struct jbd2_journal_block_tail);
+ /* Commit code leaves a slack space of 16 bytes at the end of block */
+ tags_per_block = (tag_space - 16) / journal_tag_bytes(journal);
+ /*
+ * Revoke descriptors are accounted separately so we need to reserve
+ * space for commit block and normal transaction descriptor blocks.
+ */
+ return 1 + DIV_ROUND_UP(jbd2_journal_get_max_txn_bufs(journal),
+ tags_per_block);
+}
+
+/*
+ * Initialize number of blocks each transaction reserves for its bookkeeping
+ * and maximum number of blocks a transaction can use. This needs to be called
+ * after the journal size and the fastcommit area size are initialized.
+ */
+static void jbd2_journal_init_transaction_limits(journal_t *journal)
+{
+ journal->j_revoke_records_per_block =
+ journal_revoke_records_per_block(journal);
+ journal->j_transaction_overhead_buffers =
+ jbd2_descriptor_blocks_per_trans(journal);
+ journal->j_max_transaction_buffers =
+ jbd2_journal_get_max_txn_bufs(journal);
+}
+
/*
* Load the on-disk journal superblock and read the key fields into the
* journal_t.
@@ -1492,8 +1534,8 @@ static int journal_load_superblock(journal_t *journal)
if (jbd2_journal_has_csum_v2or3(journal))
journal->j_csum_seed = jbd2_chksum(journal, ~0, sb->s_uuid,
sizeof(sb->s_uuid));
- journal->j_revoke_records_per_block =
- journal_revoke_records_per_block(journal);
+ /* After journal features are set, we can compute transaction limits */
+ jbd2_journal_init_transaction_limits(journal);
if (jbd2_has_feature_fast_commit(journal)) {
journal->j_fc_last = be32_to_cpu(sb->s_maxlen);
@@ -1698,11 +1740,6 @@ journal_t *jbd2_journal_init_inode(struct inode *inode)
return journal;
}
-static int jbd2_journal_get_max_txn_bufs(journal_t *journal)
-{
- return (journal->j_total_len - journal->j_fc_wbufsize) / 4;
-}
-
/*
* Given a journal_t structure, initialise the various fields for
* startup of a new journaling session. We use this both when creating
@@ -1748,8 +1785,6 @@ static int journal_reset(journal_t *journal)
journal->j_commit_sequence = journal->j_transaction_sequence - 1;
journal->j_commit_request = journal->j_commit_sequence;
- journal->j_max_transaction_buffers = jbd2_journal_get_max_txn_bufs(journal);
-
/*
* Now that journal recovery is done, turn fast commits off here. This
* way, if fast commit was enabled before the crash but if now FS has
@@ -2290,8 +2325,6 @@ jbd2_journal_initialize_fast_commit(journal_t *journal)
journal->j_fc_first = journal->j_last + 1;
journal->j_fc_off = 0;
journal->j_free = journal->j_last - journal->j_first;
- journal->j_max_transaction_buffers =
- jbd2_journal_get_max_txn_bufs(journal);
return 0;
}
@@ -2379,8 +2412,7 @@ int jbd2_journal_set_features(journal_t *journal, unsigned long compat,
sb->s_feature_ro_compat |= cpu_to_be32(ro);
sb->s_feature_incompat |= cpu_to_be32(incompat);
unlock_buffer(journal->j_sb_buffer);
- journal->j_revoke_records_per_block =
- journal_revoke_records_per_block(journal);
+ jbd2_journal_init_transaction_limits(journal);
return 1;
#undef COMPAT_FEATURE_ON
@@ -2411,8 +2443,7 @@ void jbd2_journal_clear_features(journal_t *journal, unsigned long compat,
sb->s_feature_compat &= ~cpu_to_be32(compat);
sb->s_feature_ro_compat &= ~cpu_to_be32(ro);
sb->s_feature_incompat &= ~cpu_to_be32(incompat);
- journal->j_revoke_records_per_block =
- journal_revoke_records_per_block(journal);
+ jbd2_journal_init_transaction_limits(journal);
}
EXPORT_SYMBOL(jbd2_journal_clear_features);
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index cb0b8d6fc0c6..a095f1a3114b 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -62,28 +62,6 @@ void jbd2_journal_free_transaction(transaction_t *transaction)
kmem_cache_free(transaction_cache, transaction);
}
-/*
- * Base amount of descriptor blocks we reserve for each transaction.
- */
-static int jbd2_descriptor_blocks_per_trans(journal_t *journal)
-{
- int tag_space = journal->j_blocksize - sizeof(journal_header_t);
- int tags_per_block;
-
- /* Subtract UUID */
- tag_space -= 16;
- if (jbd2_journal_has_csum_v2or3(journal))
- tag_space -= sizeof(struct jbd2_journal_block_tail);
- /* Commit code leaves a slack space of 16 bytes at the end of block */
- tags_per_block = (tag_space - 16) / journal_tag_bytes(journal);
- /*
- * Revoke descriptors are accounted separately so we need to reserve
- * space for commit block and normal transaction descriptor blocks.
- */
- return 1 + DIV_ROUND_UP(journal->j_max_transaction_buffers,
- tags_per_block);
-}
-
/*
* jbd2_get_transaction: obtain a new transaction_t object.
*
@@ -109,7 +87,7 @@ static void jbd2_get_transaction(journal_t *journal,
transaction->t_expires = jiffies + journal->j_commit_interval;
atomic_set(&transaction->t_updates, 0);
atomic_set(&transaction->t_outstanding_credits,
- jbd2_descriptor_blocks_per_trans(journal) +
+ journal->j_transaction_overhead_buffers +
atomic_read(&journal->j_reserved_credits));
atomic_set(&transaction->t_outstanding_revokes, 0);
atomic_set(&transaction->t_handle_count, 0);
diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index f91b930abe20..b900c642210c 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -1085,6 +1085,13 @@ struct journal_s
*/
int j_revoke_records_per_block;
+ /**
+ * @j_transaction_overhead:
+ *
+ * Number of blocks each transaction needs for its own bookkeeping
+ */
+ int j_transaction_overhead_buffers;
+
/**
* @j_commit_interval:
*
--
2.35.3
Commit 9f356e5a4f12 ("jbd2: Account descriptor blocks into
t_outstanding_credits") started to account descriptor blocks into
transactions outstanding credits. However it didn't appropriately
decrease the maximum amount of credits available to userspace. Thus if
the filesystem requests a transaction smaller than
j_max_transaction_buffers but large enough that when descriptor blocks
are added the size exceeds j_max_transaction_buffers, we confuse
add_transaction_credits() into thinking previous handles have grown the
transaction too much and enter infinite journal commit loop in
start_this_handle() -> add_transaction_credits() trying to create
transaction with enough credits available.
Fix the problem by properly accounting for transaction space reserved
for descriptor blocks when verifying requested transaction handle size.
CC: stable(a)vger.kernel.org
Fixes: 9f356e5a4f12 ("jbd2: Account descriptor blocks into t_outstanding_credits")
Reported-by: Alexander Coffin <alex.coffin(a)maticrobots.com>
Link: https://lore.kernel.org/all/CA+hUFcuGs04JHZ_WzA1zGN57+ehL2qmHOt5a7RMpo+rv6V…
Signed-off-by: Jan Kara <jack(a)suse.cz>
---
fs/jbd2/transaction.c | 21 ++++++++++++++-------
1 file changed, 14 insertions(+), 7 deletions(-)
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index a095f1a3114b..66513c18ca29 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -191,6 +191,13 @@ static void sub_reserved_credits(journal_t *journal, int blocks)
wake_up(&journal->j_wait_reserved);
}
+/* Maximum number of blocks for user transaction payload */
+static int jbd2_max_user_trans_buffers(journal_t *journal)
+{
+ return journal->j_max_transaction_buffers -
+ journal->j_transaction_overhead_buffers;
+}
+
/*
* Wait until we can add credits for handle to the running transaction. Called
* with j_state_lock held for reading. Returns 0 if handle joined the running
@@ -240,12 +247,12 @@ __must_hold(&journal->j_state_lock)
* big to fit this handle? Wait until reserved credits are freed.
*/
if (atomic_read(&journal->j_reserved_credits) + total >
- journal->j_max_transaction_buffers) {
+ jbd2_max_user_trans_buffers(journal)) {
read_unlock(&journal->j_state_lock);
jbd2_might_wait_for_commit(journal);
wait_event(journal->j_wait_reserved,
atomic_read(&journal->j_reserved_credits) + total <=
- journal->j_max_transaction_buffers);
+ jbd2_max_user_trans_buffers(journal));
__acquire(&journal->j_state_lock); /* fake out sparse */
return 1;
}
@@ -285,14 +292,14 @@ __must_hold(&journal->j_state_lock)
needed = atomic_add_return(rsv_blocks, &journal->j_reserved_credits);
/* We allow at most half of a transaction to be reserved */
- if (needed > journal->j_max_transaction_buffers / 2) {
+ if (needed > jbd2_max_user_trans_buffers(journal) / 2) {
sub_reserved_credits(journal, rsv_blocks);
atomic_sub(total, &t->t_outstanding_credits);
read_unlock(&journal->j_state_lock);
jbd2_might_wait_for_commit(journal);
wait_event(journal->j_wait_reserved,
atomic_read(&journal->j_reserved_credits) + rsv_blocks
- <= journal->j_max_transaction_buffers / 2);
+ <= jbd2_max_user_trans_buffers(journal) / 2);
__acquire(&journal->j_state_lock); /* fake out sparse */
return 1;
}
@@ -322,12 +329,12 @@ static int start_this_handle(journal_t *journal, handle_t *handle,
* size and limit the number of total credits to not exceed maximum
* transaction size per operation.
*/
- if ((rsv_blocks > journal->j_max_transaction_buffers / 2) ||
- (rsv_blocks + blocks > journal->j_max_transaction_buffers)) {
+ if (rsv_blocks > jbd2_max_user_trans_buffers(journal) / 2 ||
+ rsv_blocks + blocks > jbd2_max_user_trans_buffers(journal)) {
printk(KERN_ERR "JBD2: %s wants too many credits "
"credits:%d rsv_credits:%d max:%d\n",
current->comm, blocks, rsv_blocks,
- journal->j_max_transaction_buffers);
+ jbd2_max_user_trans_buffers(journal));
WARN_ON(1);
return -ENOSPC;
}
--
2.35.3
On some pinephones the video output sometimes freezes (flips between two
frames) [1]. It seems to be that the reason for this behaviour is that
PLL-MIPI is outside its limits, and the GPU is not running at a fixed
rate.
In this patch series I propose the following changes:
1. sunxi-ng: Adhere to the following constraints given in the
Allwinner A64 Manual regarding PLL-MIPI:
* M/N <= 3
* (PLL_VIDEO0)/M >= 24MHz
* 500MHz <= clockrate <= 1400MHz
2. Remove two operating points from the A64 DTS OPPs, so that the GPU
runs at a fixed rate of 432 MHz.
Note, that when pinning the GPU to 432 MHz the issue [1] completely
disappears for me. I've searched the BSP and could not find any
indication that supports the idea of having the three OPPs. The only
frequency I found in the BPSs for A64 is 432 MHz, which has also proven
stable for me.
I very much appreciate your feedback!
[1] https://gitlab.com/postmarketOS/pmaports/-/issues/805
Signed-off-by: Frank Oltmanns <frank(a)oltmanns.dev>
---
Changes in v4:
- sunxi-ng: common: Address review comments.
- Link to v3: https://lore.kernel.org/r/20240304-pinephone-pll-fixes-v3-0-94ab828f269a@ol…
Changes in v3:
- dts: Pin GPU to 432 MHz.
- nkm and a64: Move minimum and maximum rate handling to the common part
of the sunxi-ng driver.
- Removed st7703 patch from series.
- Link to v2: https://lore.kernel.org/r/20240205-pinephone-pll-fixes-v2-0-96a46a2d8c9b@ol…
Changes in v2:
- dts: Increase minimum GPU frequency to 192 MHz.
- nkm and a64: Add minimum and maximum rate for PLL-MIPI.
- nkm: Use the same approach for skipping invalid rates in
ccu_nkm_find_best() as in ccu_nkm_find_best_with_parent_adj().
- nkm: Improve names for ratio struct members and hence get rid of
describing comments.
- nkm and a64: Correct description in the commit messages: M/N <= 3
- Remove patches for nm as they were not needed.
- st7703: Rework the commit message to cover more background for the
change.
- Link to v1: https://lore.kernel.org/r/20231218-pinephone-pll-fixes-v1-0-e238b6ed6dc1@ol…
---
Frank Oltmanns (5):
clk: sunxi-ng: common: Support minimum and maximum rate
clk: sunxi-ng: a64: Set minimum and maximum rate for PLL-MIPI
clk: sunxi-ng: nkm: Support constraints on m/n ratio and parent rate
clk: sunxi-ng: a64: Add constraints on PLL-MIPI's n/m ratio and parent rate
arm64: dts: allwinner: a64: Run GPU at 432 MHz
arch/arm64/boot/dts/allwinner/sun50i-a64.dtsi | 8 --------
drivers/clk/sunxi-ng/ccu-sun50i-a64.c | 14 +++++++++-----
drivers/clk/sunxi-ng/ccu_common.c | 19 +++++++++++++++++++
drivers/clk/sunxi-ng/ccu_common.h | 3 +++
drivers/clk/sunxi-ng/ccu_nkm.c | 21 +++++++++++++++++++++
drivers/clk/sunxi-ng/ccu_nkm.h | 2 ++
6 files changed, 54 insertions(+), 13 deletions(-)
---
base-commit: dcb6c8ee6acc6c347caec1e73fb900c0f4ff9806
change-id: 20231218-pinephone-pll-fixes-0ccdfde273e4
Best regards,
--
Frank Oltmanns <frank(a)oltmanns.dev>
The patch titled
Subject: mm/shmem: disable PMD-sized page cache if needed
has been added to the -mm mm-hotfixes-unstable branch. Its filename is
mm-shmem-disable-pmd-sized-page-cache-if-needed.patch
This patch will shortly appear at
https://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new.git/tree/patche…
This patch will later appear in the mm-hotfixes-unstable branch at
git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next via the mm-everything
branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
and is updated there every 2-3 working days
------------------------------------------------------
From: Gavin Shan <gshan(a)redhat.com>
Subject: mm/shmem: disable PMD-sized page cache if needed
Date: Thu, 27 Jun 2024 10:39:52 +1000
For shmem files, it's possible that PMD-sized page cache can't be
supported by xarray. For example, 512MB page cache on ARM64 when the base
page size is 64KB can't be supported by xarray. It leads to errors as the
following messages indicate when this sort of xarray entry is split.
WARNING: CPU: 34 PID: 7578 at lib/xarray.c:1025 xas_split_alloc+0xf8/0x128
Modules linked in: binfmt_misc nft_fib_inet nft_fib_ipv4 nft_fib_ipv6 \
nft_fib nft_reject_inet nf_reject_ipv4 nf_reject_ipv6 nft_reject \
nft_ct nft_chain_nat nf_nat nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 \
ip_set rfkill nf_tables nfnetlink vfat fat virtio_balloon drm fuse xfs \
libcrc32c crct10dif_ce ghash_ce sha2_ce sha256_arm64 sha1_ce virtio_net \
net_failover virtio_console virtio_blk failover dimlib virtio_mmio
CPU: 34 PID: 7578 Comm: test Kdump: loaded Tainted: G W 6.10.0-rc5-gavin+ #9
Hardware name: QEMU KVM Virtual Machine, BIOS edk2-20240524-1.el9 05/24/2024
pstate: 83400005 (Nzcv daif +PAN -UAO +TCO +DIT -SSBS BTYPE=--)
pc : xas_split_alloc+0xf8/0x128
lr : split_huge_page_to_list_to_order+0x1c4/0x720
sp : ffff8000882af5f0
x29: ffff8000882af5f0 x28: ffff8000882af650 x27: ffff8000882af768
x26: 0000000000000cc0 x25: 000000000000000d x24: ffff00010625b858
x23: ffff8000882af650 x22: ffffffdfc0900000 x21: 0000000000000000
x20: 0000000000000000 x19: ffffffdfc0900000 x18: 0000000000000000
x17: 0000000000000000 x16: 0000018000000000 x15: 52f8004000000000
x14: 0000e00000000000 x13: 0000000000002000 x12: 0000000000000020
x11: 52f8000000000000 x10: 52f8e1c0ffff6000 x9 : ffffbeb9619a681c
x8 : 0000000000000003 x7 : 0000000000000000 x6 : ffff00010b02ddb0
x5 : ffffbeb96395e378 x4 : 0000000000000000 x3 : 0000000000000cc0
x2 : 000000000000000d x1 : 000000000000000c x0 : 0000000000000000
Call trace:
xas_split_alloc+0xf8/0x128
split_huge_page_to_list_to_order+0x1c4/0x720
truncate_inode_partial_folio+0xdc/0x160
shmem_undo_range+0x2bc/0x6a8
shmem_fallocate+0x134/0x430
vfs_fallocate+0x124/0x2e8
ksys_fallocate+0x4c/0xa0
__arm64_sys_fallocate+0x24/0x38
invoke_syscall.constprop.0+0x7c/0xd8
do_el0_svc+0xb4/0xd0
el0_svc+0x44/0x1d8
el0t_64_sync_handler+0x134/0x150
el0t_64_sync+0x17c/0x180
Fix it by disabling PMD-sized page cache when HPAGE_PMD_ORDER is larger
than MAX_PAGECACHE_ORDER. As Matthew Wilcox pointed, the page cache in a
shmem file isn't represented by a multi-index entry and doesn't have this
limitation when the xarry entry is split until commit 6b24ca4a1a8d ("mm:
Use multi-index entries in the page cache").
Link: https://lkml.kernel.org/r/20240627003953.1262512-5-gshan@redhat.com
Fixes: 6b24ca4a1a8d ("mm: Use multi-index entries in the page cache")
Signed-off-by: Gavin Shan <gshan(a)redhat.com>
Acked-by: David Hildenbrand <david(a)redhat.com>
Cc: Darrick J. Wong <djwong(a)kernel.org>
Cc: Don Dutile <ddutile(a)redhat.com>
Cc: Hugh Dickins <hughd(a)google.com>
Cc: Linus Torvalds <torvalds(a)linux-foundation.org>
Cc: Matthew Wilcox (Oracle) <willy(a)infradead.org>
Cc: Ryan Roberts <ryan.roberts(a)arm.com>
Cc: William Kucharski <william.kucharski(a)oracle.com>
Cc: Zhenyu Zhang <zhenyzha(a)redhat.com>
Cc: <stable(a)vger.kernel.org> [5.17+]
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/shmem.c | 15 +++++++++++++--
1 file changed, 13 insertions(+), 2 deletions(-)
--- a/mm/shmem.c~mm-shmem-disable-pmd-sized-page-cache-if-needed
+++ a/mm/shmem.c
@@ -541,8 +541,9 @@ static bool shmem_confirm_swap(struct ad
static int shmem_huge __read_mostly = SHMEM_HUGE_NEVER;
-bool shmem_is_huge(struct inode *inode, pgoff_t index, bool shmem_huge_force,
- struct mm_struct *mm, unsigned long vm_flags)
+static bool __shmem_is_huge(struct inode *inode, pgoff_t index,
+ bool shmem_huge_force, struct mm_struct *mm,
+ unsigned long vm_flags)
{
loff_t i_size;
@@ -573,6 +574,16 @@ bool shmem_is_huge(struct inode *inode,
}
}
+bool shmem_is_huge(struct inode *inode, pgoff_t index,
+ bool shmem_huge_force, struct mm_struct *mm,
+ unsigned long vm_flags)
+{
+ if (HPAGE_PMD_ORDER > MAX_PAGECACHE_ORDER)
+ return false;
+
+ return __shmem_is_huge(inode, index, shmem_huge_force, mm, vm_flags);
+}
+
#if defined(CONFIG_SYSFS)
static int shmem_parse_huge(const char *str)
{
_
Patches currently in -mm which might be from gshan(a)redhat.com are
mm-filemap-make-max_pagecache_order-acceptable-to-xarray.patch
mm-readahead-limit-page-cache-size-in-page_cache_ra_order.patch
mm-filemap-skip-to-create-pmd-sized-page-cache-if-needed.patch
mm-shmem-disable-pmd-sized-page-cache-if-needed.patch
The patch titled
Subject: mm/filemap: skip to create PMD-sized page cache if needed
has been added to the -mm mm-hotfixes-unstable branch. Its filename is
mm-filemap-skip-to-create-pmd-sized-page-cache-if-needed.patch
This patch will shortly appear at
https://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new.git/tree/patche…
This patch will later appear in the mm-hotfixes-unstable branch at
git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next via the mm-everything
branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
and is updated there every 2-3 working days
------------------------------------------------------
From: Gavin Shan <gshan(a)redhat.com>
Subject: mm/filemap: skip to create PMD-sized page cache if needed
Date: Thu, 27 Jun 2024 10:39:51 +1000
On ARM64, HPAGE_PMD_ORDER is 13 when the base page size is 64KB. The
PMD-sized page cache can't be supported by xarray as the following error
messages indicate.
------------[ cut here ]------------
WARNING: CPU: 35 PID: 7484 at lib/xarray.c:1025 xas_split_alloc+0xf8/0x128
Modules linked in: nft_fib_inet nft_fib_ipv4 nft_fib_ipv6 nft_fib \
nft_reject_inet nf_reject_ipv4 nf_reject_ipv6 nft_reject nft_ct \
nft_chain_nat nf_nat nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 \
ip_set rfkill nf_tables nfnetlink vfat fat virtio_balloon drm \
fuse xfs libcrc32c crct10dif_ce ghash_ce sha2_ce sha256_arm64 \
sha1_ce virtio_net net_failover virtio_console virtio_blk failover \
dimlib virtio_mmio
CPU: 35 PID: 7484 Comm: test Kdump: loaded Tainted: G W 6.10.0-rc5-gavin+ #9
Hardware name: QEMU KVM Virtual Machine, BIOS edk2-20240524-1.el9 05/24/2024
pstate: 83400005 (Nzcv daif +PAN -UAO +TCO +DIT -SSBS BTYPE=--)
pc : xas_split_alloc+0xf8/0x128
lr : split_huge_page_to_list_to_order+0x1c4/0x720
sp : ffff800087a4f6c0
x29: ffff800087a4f6c0 x28: ffff800087a4f720 x27: 000000001fffffff
x26: 0000000000000c40 x25: 000000000000000d x24: ffff00010625b858
x23: ffff800087a4f720 x22: ffffffdfc0780000 x21: 0000000000000000
x20: 0000000000000000 x19: ffffffdfc0780000 x18: 000000001ff40000
x17: 00000000ffffffff x16: 0000018000000000 x15: 51ec004000000000
x14: 0000e00000000000 x13: 0000000000002000 x12: 0000000000000020
x11: 51ec000000000000 x10: 51ece1c0ffff8000 x9 : ffffbeb961a44d28
x8 : 0000000000000003 x7 : ffffffdfc0456420 x6 : ffff0000e1aa6eb8
x5 : 20bf08b4fe778fca x4 : ffffffdfc0456420 x3 : 0000000000000c40
x2 : 000000000000000d x1 : 000000000000000c x0 : 0000000000000000
Call trace:
xas_split_alloc+0xf8/0x128
split_huge_page_to_list_to_order+0x1c4/0x720
truncate_inode_partial_folio+0xdc/0x160
truncate_inode_pages_range+0x1b4/0x4a8
truncate_pagecache_range+0x84/0xa0
xfs_flush_unmap_range+0x70/0x90 [xfs]
xfs_file_fallocate+0xfc/0x4d8 [xfs]
vfs_fallocate+0x124/0x2e8
ksys_fallocate+0x4c/0xa0
__arm64_sys_fallocate+0x24/0x38
invoke_syscall.constprop.0+0x7c/0xd8
do_el0_svc+0xb4/0xd0
el0_svc+0x44/0x1d8
el0t_64_sync_handler+0x134/0x150
el0t_64_sync+0x17c/0x180
Fix it by skipping to allocate PMD-sized page cache when its size is
larger than MAX_PAGECACHE_ORDER. For this specific case, we will fall to
regular path where the readahead window is determined by BDI's sysfs file
(read_ahead_kb).
Link: https://lkml.kernel.org/r/20240627003953.1262512-4-gshan@redhat.com
Fixes: 4687fdbb805a ("mm/filemap: Support VM_HUGEPAGE for file mappings")
Signed-off-by: Gavin Shan <gshan(a)redhat.com>
Suggested-by: David Hildenbrand <david(a)redhat.com>
Acked-by: David Hildenbrand <david(a)redhat.com>
Cc: Darrick J. Wong <djwong(a)kernel.org>
Cc: Don Dutile <ddutile(a)redhat.com>
Cc: Hugh Dickins <hughd(a)google.com>
Cc: Linus Torvalds <torvalds(a)linux-foundation.org>
Cc: Matthew Wilcox (Oracle) <willy(a)infradead.org>
Cc: Ryan Roberts <ryan.roberts(a)arm.com>
Cc: William Kucharski <william.kucharski(a)oracle.com>
Cc: Zhenyu Zhang <zhenyzha(a)redhat.com>
Cc: <stable(a)vger.kernel.org> [5.18+]
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/filemap.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
--- a/mm/filemap.c~mm-filemap-skip-to-create-pmd-sized-page-cache-if-needed
+++ a/mm/filemap.c
@@ -3124,7 +3124,7 @@ static struct file *do_sync_mmap_readahe
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/* Use the readahead code, even if readahead is disabled */
- if (vm_flags & VM_HUGEPAGE) {
+ if ((vm_flags & VM_HUGEPAGE) && HPAGE_PMD_ORDER <= MAX_PAGECACHE_ORDER) {
fpin = maybe_unlock_mmap_for_io(vmf, fpin);
ractl._index &= ~((unsigned long)HPAGE_PMD_NR - 1);
ra->size = HPAGE_PMD_NR;
_
Patches currently in -mm which might be from gshan(a)redhat.com are
mm-filemap-make-max_pagecache_order-acceptable-to-xarray.patch
mm-readahead-limit-page-cache-size-in-page_cache_ra_order.patch
mm-filemap-skip-to-create-pmd-sized-page-cache-if-needed.patch
mm-shmem-disable-pmd-sized-page-cache-if-needed.patch
The patch titled
Subject: mm/filemap: make MAX_PAGECACHE_ORDER acceptable to xarray
has been added to the -mm mm-hotfixes-unstable branch. Its filename is
mm-filemap-make-max_pagecache_order-acceptable-to-xarray.patch
This patch will shortly appear at
https://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new.git/tree/patche…
This patch will later appear in the mm-hotfixes-unstable branch at
git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next via the mm-everything
branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
and is updated there every 2-3 working days
------------------------------------------------------
From: Gavin Shan <gshan(a)redhat.com>
Subject: mm/filemap: make MAX_PAGECACHE_ORDER acceptable to xarray
Date: Thu, 27 Jun 2024 10:39:49 +1000
Patch series "mm/filemap: Limit page cache size to that supported by
xarray", v2.
Currently, xarray can't support arbitrary page cache size. More details
can be found from the WARN_ON() statement in xas_split_alloc(). In our
test whose code is attached below, we hit the WARN_ON() on ARM64 system
where the base page size is 64KB and huge page size is 512MB. The issue
was reported long time ago and some discussions on it can be found here
[1].
[1] https://www.spinics.net/lists/linux-xfs/msg75404.html
In order to fix the issue, we need to adjust MAX_PAGECACHE_ORDER to one
supported by xarray and avoid PMD-sized page cache if needed. The code
changes are suggested by David Hildenbrand.
PATCH[1] adjusts MAX_PAGECACHE_ORDER to that supported by xarray
PATCH[2-3] avoids PMD-sized page cache in the synchronous readahead path
PATCH[4] avoids PMD-sized page cache for shmem files if needed
Test program
============
# cat test.c
#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <string.h>
#include <fcntl.h>
#include <errno.h>
#include <sys/syscall.h>
#include <sys/mman.h>
#define TEST_XFS_FILENAME "/tmp/data"
#define TEST_SHMEM_FILENAME "/dev/shm/data"
#define TEST_MEM_SIZE 0x20000000
int main(int argc, char **argv)
{
const char *filename;
int fd = 0;
void *buf = (void *)-1, *p;
int pgsize = getpagesize();
int ret;
if (pgsize != 0x10000) {
fprintf(stderr, "64KB base page size is required\n");
return -EPERM;
}
system("echo force > /sys/kernel/mm/transparent_hugepage/shmem_enabled");
system("rm -fr /tmp/data");
system("rm -fr /dev/shm/data");
system("echo 1 > /proc/sys/vm/drop_caches");
/* Open xfs or shmem file */
filename = TEST_XFS_FILENAME;
if (argc > 1 && !strcmp(argv[1], "shmem"))
filename = TEST_SHMEM_FILENAME;
fd = open(filename, O_CREAT | O_RDWR | O_TRUNC);
if (fd < 0) {
fprintf(stderr, "Unable to open <%s>\n", filename);
return -EIO;
}
/* Extend file size */
ret = ftruncate(fd, TEST_MEM_SIZE);
if (ret) {
fprintf(stderr, "Error %d to ftruncate()\n", ret);
goto cleanup;
}
/* Create VMA */
buf = mmap(NULL, TEST_MEM_SIZE,
PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
if (buf == (void *)-1) {
fprintf(stderr, "Unable to mmap <%s>\n", filename);
goto cleanup;
}
fprintf(stdout, "mapped buffer at 0x%p\n", buf);
ret = madvise(buf, TEST_MEM_SIZE, MADV_HUGEPAGE);
if (ret) {
fprintf(stderr, "Unable to madvise(MADV_HUGEPAGE)\n");
goto cleanup;
}
/* Populate VMA */
ret = madvise(buf, TEST_MEM_SIZE, MADV_POPULATE_WRITE);
if (ret) {
fprintf(stderr, "Error %d to madvise(MADV_POPULATE_WRITE)\n", ret);
goto cleanup;
}
/* Punch the file to enforce xarray split */
ret = fallocate(fd, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE,
TEST_MEM_SIZE - pgsize, pgsize);
if (ret)
fprintf(stderr, "Error %d to fallocate()\n", ret);
cleanup:
if (buf != (void *)-1)
munmap(buf, TEST_MEM_SIZE);
if (fd > 0)
close(fd);
return 0;
}
# gcc test.c -o test
# cat /proc/1/smaps | grep KernelPageSize | head -n 1
KernelPageSize: 64 kB
# ./test shmem
:
------------[ cut here ]------------
WARNING: CPU: 17 PID: 5253 at lib/xarray.c:1025 xas_split_alloc+0xf8/0x128
Modules linked in: nft_fib_inet nft_fib_ipv4 nft_fib_ipv6 nft_fib \
nft_reject_inet nf_reject_ipv4 nf_reject_ipv6 nft_reject nft_ct \
nft_chain_nat nf_nat nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 \
ip_set nf_tables rfkill nfnetlink vfat fat virtio_balloon \
drm fuse xfs libcrc32c crct10dif_ce ghash_ce sha2_ce sha256_arm64 \
virtio_net sha1_ce net_failover failover virtio_console virtio_blk \
dimlib virtio_mmio
CPU: 17 PID: 5253 Comm: test Kdump: loaded Tainted: G W 6.10.0-rc5-gavin+ #12
Hardware name: QEMU KVM Virtual Machine, BIOS edk2-20240524-1.el9 05/24/2024
pstate: 83400005 (Nzcv daif +PAN -UAO +TCO +DIT -SSBS BTYPE=--)
pc : xas_split_alloc+0xf8/0x128
lr : split_huge_page_to_list_to_order+0x1c4/0x720
sp : ffff80008a92f5b0
x29: ffff80008a92f5b0 x28: ffff80008a92f610 x27: ffff80008a92f728
x26: 0000000000000cc0 x25: 000000000000000d x24: ffff0000cf00c858
x23: ffff80008a92f610 x22: ffffffdfc0600000 x21: 0000000000000000
x20: 0000000000000000 x19: ffffffdfc0600000 x18: 0000000000000000
x17: 0000000000000000 x16: 0000018000000000 x15: 3374004000000000
x14: 0000e00000000000 x13: 0000000000002000 x12: 0000000000000020
x11: 3374000000000000 x10: 3374e1c0ffff6000 x9 : ffffb463a84c681c
x8 : 0000000000000003 x7 : 0000000000000000 x6 : ffff00011c976ce0
x5 : ffffb463aa47e378 x4 : 0000000000000000 x3 : 0000000000000cc0
x2 : 000000000000000d x1 : 000000000000000c x0 : 0000000000000000
Call trace:
xas_split_alloc+0xf8/0x128
split_huge_page_to_list_to_order+0x1c4/0x720
truncate_inode_partial_folio+0xdc/0x160
shmem_undo_range+0x2bc/0x6a8
shmem_fallocate+0x134/0x430
vfs_fallocate+0x124/0x2e8
ksys_fallocate+0x4c/0xa0
__arm64_sys_fallocate+0x24/0x38
invoke_syscall.constprop.0+0x7c/0xd8
do_el0_svc+0xb4/0xd0
el0_svc+0x44/0x1d8
el0t_64_sync_handler+0x134/0x150
el0t_64_sync+0x17c/0x180
This patch (of 4):
The largest page cache order can be HPAGE_PMD_ORDER (13) on ARM64 with
64KB base page size. The xarray entry with this order can't be split as
the following error messages indicate.
------------[ cut here ]------------
WARNING: CPU: 35 PID: 7484 at lib/xarray.c:1025 xas_split_alloc+0xf8/0x128
Modules linked in: nft_fib_inet nft_fib_ipv4 nft_fib_ipv6 nft_fib \
nft_reject_inet nf_reject_ipv4 nf_reject_ipv6 nft_reject nft_ct \
nft_chain_nat nf_nat nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 \
ip_set rfkill nf_tables nfnetlink vfat fat virtio_balloon drm \
fuse xfs libcrc32c crct10dif_ce ghash_ce sha2_ce sha256_arm64 \
sha1_ce virtio_net net_failover virtio_console virtio_blk failover \
dimlib virtio_mmio
CPU: 35 PID: 7484 Comm: test Kdump: loaded Tainted: G W 6.10.0-rc5-gavin+ #9
Hardware name: QEMU KVM Virtual Machine, BIOS edk2-20240524-1.el9 05/24/2024
pstate: 83400005 (Nzcv daif +PAN -UAO +TCO +DIT -SSBS BTYPE=--)
pc : xas_split_alloc+0xf8/0x128
lr : split_huge_page_to_list_to_order+0x1c4/0x720
sp : ffff800087a4f6c0
x29: ffff800087a4f6c0 x28: ffff800087a4f720 x27: 000000001fffffff
x26: 0000000000000c40 x25: 000000000000000d x24: ffff00010625b858
x23: ffff800087a4f720 x22: ffffffdfc0780000 x21: 0000000000000000
x20: 0000000000000000 x19: ffffffdfc0780000 x18: 000000001ff40000
x17: 00000000ffffffff x16: 0000018000000000 x15: 51ec004000000000
x14: 0000e00000000000 x13: 0000000000002000 x12: 0000000000000020
x11: 51ec000000000000 x10: 51ece1c0ffff8000 x9 : ffffbeb961a44d28
x8 : 0000000000000003 x7 : ffffffdfc0456420 x6 : ffff0000e1aa6eb8
x5 : 20bf08b4fe778fca x4 : ffffffdfc0456420 x3 : 0000000000000c40
x2 : 000000000000000d x1 : 000000000000000c x0 : 0000000000000000
Call trace:
xas_split_alloc+0xf8/0x128
split_huge_page_to_list_to_order+0x1c4/0x720
truncate_inode_partial_folio+0xdc/0x160
truncate_inode_pages_range+0x1b4/0x4a8
truncate_pagecache_range+0x84/0xa0
xfs_flush_unmap_range+0x70/0x90 [xfs]
xfs_file_fallocate+0xfc/0x4d8 [xfs]
vfs_fallocate+0x124/0x2e8
ksys_fallocate+0x4c/0xa0
__arm64_sys_fallocate+0x24/0x38
invoke_syscall.constprop.0+0x7c/0xd8
do_el0_svc+0xb4/0xd0
el0_svc+0x44/0x1d8
el0t_64_sync_handler+0x134/0x150
el0t_64_sync+0x17c/0x180
Fix it by decreasing MAX_PAGECACHE_ORDER to the largest supported order
by xarray. For this specific case, MAX_PAGECACHE_ORDER is dropped from
13 to 11 when CONFIG_BASE_SMALL is disabled.
Link: https://lkml.kernel.org/r/20240627003953.1262512-1-gshan@redhat.com
Link: https://lkml.kernel.org/r/20240627003953.1262512-2-gshan@redhat.com
Fixes: 793917d997df ("mm/readahead: Add large folio readahead")
Signed-off-by: Gavin Shan <gshan(a)redhat.com>
Suggested-by: David Hildenbrand <david(a)redhat.com>
Acked-by: David Hildenbrand <david(a)redhat.com>
Cc: Darrick J. Wong <djwong(a)kernel.org>
Cc: Don Dutile <ddutile(a)redhat.com>
Cc: Hugh Dickins <hughd(a)google.com>
Cc: Linus Torvalds <torvalds(a)linux-foundation.org>
Cc: Matthew Wilcox (Oracle) <willy(a)infradead.org>
Cc: Ryan Roberts <ryan.roberts(a)arm.com>
Cc: William Kucharski <william.kucharski(a)oracle.com>
Cc: Zhenyu Zhang <zhenyzha(a)redhat.com>
Cc: <stable(a)vger.kernel.org> [5.18+]
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
include/linux/pagemap.h | 11 +++++++++--
1 file changed, 9 insertions(+), 2 deletions(-)
--- a/include/linux/pagemap.h~mm-filemap-make-max_pagecache_order-acceptable-to-xarray
+++ a/include/linux/pagemap.h
@@ -354,11 +354,18 @@ static inline void mapping_set_gfp_mask(
* a good order (that's 1MB if you're using 4kB pages)
*/
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-#define MAX_PAGECACHE_ORDER HPAGE_PMD_ORDER
+#define PREFERRED_MAX_PAGECACHE_ORDER HPAGE_PMD_ORDER
#else
-#define MAX_PAGECACHE_ORDER 8
+#define PREFERRED_MAX_PAGECACHE_ORDER 8
#endif
+/*
+ * xas_split_alloc() does not support arbitrary orders. This implies no
+ * 512MB THP on ARM64 with 64KB base page size.
+ */
+#define MAX_XAS_ORDER (XA_CHUNK_SHIFT * 2 - 1)
+#define MAX_PAGECACHE_ORDER min(MAX_XAS_ORDER, PREFERRED_MAX_PAGECACHE_ORDER)
+
/**
* mapping_set_large_folios() - Indicate the file supports large folios.
* @mapping: The file.
_
Patches currently in -mm which might be from gshan(a)redhat.com are
mm-filemap-make-max_pagecache_order-acceptable-to-xarray.patch
mm-readahead-limit-page-cache-size-in-page_cache_ra_order.patch
mm-filemap-skip-to-create-pmd-sized-page-cache-if-needed.patch
mm-shmem-disable-pmd-sized-page-cache-if-needed.patch
The patch titled
Subject: Fix userfaultfd_api to return EINVAL as expected
has been added to the -mm mm-hotfixes-unstable branch. Its filename is
fix-userfaultfd_api-to-return-einval-as-expected.patch
This patch will shortly appear at
https://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new.git/tree/patche…
This patch will later appear in the mm-hotfixes-unstable branch at
git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next via the mm-everything
branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
and is updated there every 2-3 working days
------------------------------------------------------
From: Audra Mitchell <audra(a)redhat.com>
Subject: Fix userfaultfd_api to return EINVAL as expected
Date: Wed, 26 Jun 2024 09:05:11 -0400
Currently if we request a feature that is not set in the Kernel config we
fail silently and return all the available features. However, the man
page indicates we should return an EINVAL.
We need to fix this issue since we can end up with a Kernel warning should
a program request the feature UFFD_FEATURE_WP_UNPOPULATED on a kernel with
the config not set with this feature.
[ 200.812896] WARNING: CPU: 91 PID: 13634 at mm/memory.c:1660 zap_pte_range+0x43d/0x660
[ 200.820738] Modules linked in:
[ 200.869387] CPU: 91 PID: 13634 Comm: userfaultfd Kdump: loaded Not tainted 6.9.0-rc5+ #8
[ 200.877477] Hardware name: Dell Inc. PowerEdge R6525/0N7YGH, BIOS 2.7.3 03/30/2022
[ 200.885052] RIP: 0010:zap_pte_range+0x43d/0x660
Link: https://lkml.kernel.org/r/20240626130513.120193-1-audra@redhat.com
Fixes: e06f1e1dd499 ("userfaultfd: wp: enabled write protection in userfaultfd API")
Signed-off-by: Audra Mitchell <audra(a)redhat.com>
Cc: Al Viro <viro(a)zeniv.linux.org.uk>
Cc: Andrea Arcangeli <aarcange(a)redhat.com>
Cc: Christian Brauner <brauner(a)kernel.org>
Cc: Jan Kara <jack(a)suse.cz>
Cc: Mike Rapoport <rppt(a)linux.vnet.ibm.com>
Cc: Peter Xu <peterx(a)redhat.com>
Cc: Rafael Aquini <raquini(a)redhat.com>
Cc: Shaohua Li <shli(a)fb.com>
Cc: Shuah Khan <shuah(a)kernel.org>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
fs/userfaultfd.c | 7 ++++++-
1 file changed, 6 insertions(+), 1 deletion(-)
--- a/fs/userfaultfd.c~fix-userfaultfd_api-to-return-einval-as-expected
+++ a/fs/userfaultfd.c
@@ -2057,7 +2057,7 @@ static int userfaultfd_api(struct userfa
goto out;
features = uffdio_api.features;
ret = -EINVAL;
- if (uffdio_api.api != UFFD_API || (features & ~UFFD_API_FEATURES))
+ if (uffdio_api.api != UFFD_API)
goto err_out;
ret = -EPERM;
if ((features & UFFD_FEATURE_EVENT_FORK) && !capable(CAP_SYS_PTRACE))
@@ -2081,6 +2081,11 @@ static int userfaultfd_api(struct userfa
uffdio_api.features &= ~UFFD_FEATURE_WP_UNPOPULATED;
uffdio_api.features &= ~UFFD_FEATURE_WP_ASYNC;
#endif
+
+ ret = -EINVAL;
+ if (features & ~uffdio_api.features)
+ goto err_out;
+
uffdio_api.ioctls = UFFD_API_IOCTLS;
ret = -EFAULT;
if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api)))
_
Patches currently in -mm which might be from audra(a)redhat.com are
fix-userfaultfd_api-to-return-einval-as-expected.patch
The patch titled
Subject: mm: vmalloc: check if a hash-index is in cpu_possible_mask
has been added to the -mm mm-hotfixes-unstable branch. Its filename is
mm-vmalloc-check-if-a-hash-index-is-in-cpu_possible_mask.patch
This patch will shortly appear at
https://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new.git/tree/patche…
This patch will later appear in the mm-hotfixes-unstable branch at
git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next via the mm-everything
branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
and is updated there every 2-3 working days
------------------------------------------------------
From: "Uladzislau Rezki (Sony)" <urezki(a)gmail.com>
Subject: mm: vmalloc: check if a hash-index is in cpu_possible_mask
Date: Wed, 26 Jun 2024 16:03:30 +0200
The problem is that there are systems where cpu_possible_mask has gaps
between set CPUs, for example SPARC. In this scenario addr_to_vb_xa()
hash function can return an index which accesses to not-possible and not
setup CPU area using per_cpu() macro. This results in an oops on SPARC.
A per-cpu vmap_block_queue is also used as hash table, incorrectly
assuming the cpu_possible_mask has no gaps. Fix it by adjusting an index
to a next possible CPU.
Link: https://lkml.kernel.org/r/20240626140330.89836-1-urezki@gmail.com
Fixes: 062eacf57ad9 ("mm: vmalloc: remove a global vmap_blocks xarray")
Reported-by: Nick Bowler <nbowler(a)draconx.ca>
Closes: https://lore.kernel.org/linux-kernel/ZntjIE6msJbF8zTa@MiWiFi-R3L-srv/T/
Signed-off-by: Uladzislau Rezki (Sony) <urezki(a)gmail.com>
Cc: Baoquan He <bhe(a)redhat.com>
Cc: Christoph Hellwig <hch(a)infradead.org>
Cc: Hailong.Liu <hailong.liu(a)oppo.com>
Cc: Oleksiy Avramchenko <oleksiy.avramchenko(a)sony.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/vmalloc.c | 10 +++++++++-
1 file changed, 9 insertions(+), 1 deletion(-)
--- a/mm/vmalloc.c~mm-vmalloc-check-if-a-hash-index-is-in-cpu_possible_mask
+++ a/mm/vmalloc.c
@@ -2543,7 +2543,15 @@ static DEFINE_PER_CPU(struct vmap_block_
static struct xarray *
addr_to_vb_xa(unsigned long addr)
{
- int index = (addr / VMAP_BLOCK_SIZE) % num_possible_cpus();
+ int index = (addr / VMAP_BLOCK_SIZE) % nr_cpu_ids;
+
+ /*
+ * Please note, nr_cpu_ids points on a highest set
+ * possible bit, i.e. we never invoke cpumask_next()
+ * if an index points on it which is nr_cpu_ids - 1.
+ */
+ if (!cpu_possible(index))
+ index = cpumask_next(index, cpu_possible_mask);
return &per_cpu(vmap_block_queue, index).vmap_blocks;
}
_
Patches currently in -mm which might be from urezki(a)gmail.com are
mm-vmalloc-check-if-a-hash-index-is-in-cpu_possible_mask.patch
From: Chuck Lever <chuck.lever(a)oracle.com>
Hi-
It was pointed out that the NFSD fixes that went into 5.10.220 were
missing a few forward fixes from upstream. These five are the ones
I identified. I've run them through the usual NFSD CI tests and
found no new issues.
Chuck Lever (3):
SUNRPC: Fix a NULL pointer deref in trace_svc_stats_latency()
SUNRPC: Fix svcxdr_init_decode's end-of-buffer calculation
SUNRPC: Fix svcxdr_init_encode's buflen calculation
Jeff Layton (1):
nfsd: hold a lighter-weight client reference over CB_RECALL_ANY
Yunjian Wang (1):
SUNRPC: Fix null pointer dereference in svc_rqst_free()
fs/nfsd/nfs4state.c | 7 ++-----
include/linux/sunrpc/svc.h | 20 ++++++++++++++++----
include/trace/events/sunrpc.h | 8 ++++----
net/sunrpc/svc.c | 18 +++++++++++++++++-
4 files changed, 39 insertions(+), 14 deletions(-)
--
2.45.1
Work for __counted_by on generic pointers in structures (not just
flexible array members) has started landing in Clang 19 (current tip of
tree). During the development of this feature, a restriction was added
to __counted_by to prevent the flexible array member's element type from
including a flexible array member itself such as:
struct foo {
int count;
char buf[];
};
struct bar {
int count;
struct foo data[] __counted_by(count);
};
because the size of data cannot be calculated with the standard array
size formula:
sizeof(struct foo) * count
This restriction was downgraded to a warning but due to CONFIG_WERROR,
it can still break the build. The application of __counted_by on the fod
member of 'struct nvmet_fc_tgt_queue' triggers this restriction,
resulting in:
drivers/nvme/target/fc.c:151:2: error: 'counted_by' should not be applied to an array with element of unknown size because 'struct nvmet_fc_fcp_iod' is a struct type with a flexible array member. This will be an error in a future compiler version [-Werror,-Wbounds-safety-counted-by-elt-type-unknown-size]
151 | struct nvmet_fc_fcp_iod fod[] __counted_by(sqsize);
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1 error generated.
Remove this use of __counted_by to fix the warning/error. However,
rather than remove it altogether, leave it commented, as it may be
possible to support this in future compiler releases.
Cc: stable(a)vger.kernel.org
Closes: https://github.com/ClangBuiltLinux/linux/issues/2027
Fixes: ccd3129aca28 ("nvmet-fc: Annotate struct nvmet_fc_tgt_queue with __counted_by")
Signed-off-by: Nathan Chancellor <nathan(a)kernel.org>
---
drivers/nvme/target/fc.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/nvme/target/fc.c b/drivers/nvme/target/fc.c
index 337ee1cb09ae..381b4394731f 100644
--- a/drivers/nvme/target/fc.c
+++ b/drivers/nvme/target/fc.c
@@ -148,7 +148,7 @@ struct nvmet_fc_tgt_queue {
struct workqueue_struct *work_q;
struct kref ref;
/* array of fcp_iods */
- struct nvmet_fc_fcp_iod fod[] __counted_by(sqsize);
+ struct nvmet_fc_fcp_iod fod[] /* __counted_by(sqsize) */;
} __aligned(sizeof(unsigned long long));
struct nvmet_fc_hostport {
---
base-commit: c758b77d4a0a0ed3a1292b3fd7a2aeccd1a169a4
change-id: 20240529-drop-counted-by-fod-nvmet-fc-tgt-queue-50edd2f8d60e
Best regards,
--
Nathan Chancellor <nathan(a)kernel.org>
For Gen-1 targets like IPQ6018, it is seen that stressing out the
controller in host mode results in HC died error:
xhci-hcd.12.auto: xHCI host not responding to stop endpoint command
xhci-hcd.12.auto: xHCI host controller not responding, assume dead
xhci-hcd.12.auto: HC died; cleaning up
And at this instant only restarting the host mode fixes it. Disable
SuperSpeed instance in park mode for IPQ6018 to mitigate this issue.
Cc: <stable(a)vger.kernel.org>
Fixes: 20bb9e3dd2e4 ("arm64: dts: qcom: ipq6018: add usb3 DT description")
Signed-off-by: Krishna Kurapati <quic_kriskura(a)quicinc.com>
---
arch/arm64/boot/dts/qcom/ipq6018.dtsi | 1 +
1 file changed, 1 insertion(+)
diff --git a/arch/arm64/boot/dts/qcom/ipq6018.dtsi b/arch/arm64/boot/dts/qcom/ipq6018.dtsi
index 9694140881c6..8b63c1a6da10 100644
--- a/arch/arm64/boot/dts/qcom/ipq6018.dtsi
+++ b/arch/arm64/boot/dts/qcom/ipq6018.dtsi
@@ -685,6 +685,7 @@ dwc_0: usb@8a00000 {
clocks = <&xo>;
clock-names = "ref";
tx-fifo-resize;
+ snps,parkmode-disable-ss-quirk;
snps,is-utmi-l1-suspend;
snps,hird-threshold = /bits/ 8 <0x0>;
snps,dis_u2_susphy_quirk;
--
2.34.1
For Gen-1 targets like MSM8998, it is seen that stressing out the
controller in host mode results in HC died error:
xhci-hcd.12.auto: xHCI host not responding to stop endpoint command
xhci-hcd.12.auto: xHCI host controller not responding, assume dead
xhci-hcd.12.auto: HC died; cleaning up
And at this instant only restarting the host mode fixes it. Disable
SuperSpeed instance in park mode for MSM8998 to mitigate this issue.
Cc: <stable(a)vger.kernel.org>
Fixes: 026dad8f5873 ("arm64: dts: qcom: msm8998: Add USB-related nodes")
Signed-off-by: Krishna Kurapati <quic_kriskura(a)quicinc.com>
---
arch/arm64/boot/dts/qcom/msm8998.dtsi | 1 +
1 file changed, 1 insertion(+)
diff --git a/arch/arm64/boot/dts/qcom/msm8998.dtsi b/arch/arm64/boot/dts/qcom/msm8998.dtsi
index 3c94d823a514..6d5e1c7b2da5 100644
--- a/arch/arm64/boot/dts/qcom/msm8998.dtsi
+++ b/arch/arm64/boot/dts/qcom/msm8998.dtsi
@@ -2144,6 +2144,7 @@ usb3_dwc3: usb@a800000 {
interrupts = <GIC_SPI 131 IRQ_TYPE_LEVEL_HIGH>;
snps,dis_u2_susphy_quirk;
snps,dis_enblslpm_quirk;
+ snps,parkmode-disable-ss-quirk;
phys = <&qusb2phy>, <&usb3phy>;
phy-names = "usb2-phy", "usb3-phy";
snps,has-lpm-erratum;
--
2.34.1
[Why]
After supend/resume, with topology unchanged, observe that
link_address_sent of all mstb are marked as false even the topology probing
is done without any error.
It is caused by wrongly also include "ret == 0" case as a probing failure
case.
[How]
Remove inappropriate checking conditions.
Cc: Lyude Paul <lyude(a)redhat.com>
Cc: Harry Wentland <hwentlan(a)amd.com>
Cc: Jani Nikula <jani.nikula(a)intel.com>
Cc: Imre Deak <imre.deak(a)intel.com>
Cc: Daniel Vetter <daniel(a)ffwll.ch>
Cc: stable(a)vger.kernel.org
Fixes: 37dfdc55ffeb ("drm/dp_mst: Cleanup drm_dp_send_link_address() a bit")
Signed-off-by: Wayne Lin <Wayne.Lin(a)amd.com>
---
drivers/gpu/drm/display/drm_dp_mst_topology.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/drivers/gpu/drm/display/drm_dp_mst_topology.c b/drivers/gpu/drm/display/drm_dp_mst_topology.c
index 7f8e1cfbe19d..68831f4e502a 100644
--- a/drivers/gpu/drm/display/drm_dp_mst_topology.c
+++ b/drivers/gpu/drm/display/drm_dp_mst_topology.c
@@ -2929,7 +2929,7 @@ static int drm_dp_send_link_address(struct drm_dp_mst_topology_mgr *mgr,
/* FIXME: Actually do some real error handling here */
ret = drm_dp_mst_wait_tx_reply(mstb, txmsg);
- if (ret <= 0) {
+ if (ret < 0) {
drm_err(mgr->dev, "Sending link address failed with %d\n", ret);
goto out;
}
@@ -2981,7 +2981,7 @@ static int drm_dp_send_link_address(struct drm_dp_mst_topology_mgr *mgr,
mutex_unlock(&mgr->lock);
out:
- if (ret <= 0)
+ if (ret < 0)
mstb->link_address_sent = false;
kfree(txmsg);
return ret < 0 ? ret : changed;
--
2.37.3
From: Reka Norman <rekanorman(a)chromium.org>
TGL systems have the same issue as ADL, where a large boot firmware
delay is seen if USB ports are left in U3 at shutdown. So apply the
XHCI_RESET_TO_DEFAULT quirk to TGL as well.
The issue it fixes is a ~20s boot time delay when booting from S5. It
affects TGL devices, and TGL support was added starting from v5.3.
Cc: stable(a)vger.kernel.org
Signed-off-by: Reka Norman <rekanorman(a)chromium.org>
Signed-off-by: Mathias Nyman <mathias.nyman(a)linux.intel.com>
---
drivers/usb/host/xhci-pci.c | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/drivers/usb/host/xhci-pci.c b/drivers/usb/host/xhci-pci.c
index 05881153883e..dc1e345ab67e 100644
--- a/drivers/usb/host/xhci-pci.c
+++ b/drivers/usb/host/xhci-pci.c
@@ -50,6 +50,7 @@
#define PCI_DEVICE_ID_INTEL_DENVERTON_XHCI 0x19d0
#define PCI_DEVICE_ID_INTEL_ICE_LAKE_XHCI 0x8a13
#define PCI_DEVICE_ID_INTEL_TIGER_LAKE_XHCI 0x9a13
+#define PCI_DEVICE_ID_INTEL_TIGER_LAKE_PCH_XHCI 0xa0ed
#define PCI_DEVICE_ID_INTEL_COMET_LAKE_XHCI 0xa3af
#define PCI_DEVICE_ID_INTEL_ALDER_LAKE_PCH_XHCI 0x51ed
#define PCI_DEVICE_ID_INTEL_ALDER_LAKE_N_PCH_XHCI 0x54ed
@@ -373,7 +374,8 @@ static void xhci_pci_quirks(struct device *dev, struct xhci_hcd *xhci)
xhci->quirks |= XHCI_MISSING_CAS;
if (pdev->vendor == PCI_VENDOR_ID_INTEL &&
- (pdev->device == PCI_DEVICE_ID_INTEL_ALDER_LAKE_PCH_XHCI ||
+ (pdev->device == PCI_DEVICE_ID_INTEL_TIGER_LAKE_PCH_XHCI ||
+ pdev->device == PCI_DEVICE_ID_INTEL_ALDER_LAKE_PCH_XHCI ||
pdev->device == PCI_DEVICE_ID_INTEL_ALDER_LAKE_N_PCH_XHCI))
xhci->quirks |= XHCI_RESET_TO_DEFAULT;
--
2.25.1
BPF kfuncs are often not directly referenced and may be inadvertently
removed by optimization steps during kernel builds, thus the __bpf_kfunc
tag mitigates against this removal by including the __used macro. However,
this macro alone does not prevent removal during linking, and may still
yield build warnings (e.g. on mips64el):
LD vmlinux
BTFIDS vmlinux
WARN: resolve_btfids: unresolved symbol bpf_verify_pkcs7_signature
WARN: resolve_btfids: unresolved symbol bpf_lookup_user_key
WARN: resolve_btfids: unresolved symbol bpf_lookup_system_key
WARN: resolve_btfids: unresolved symbol bpf_key_put
WARN: resolve_btfids: unresolved symbol bpf_iter_task_next
WARN: resolve_btfids: unresolved symbol bpf_iter_css_task_new
WARN: resolve_btfids: unresolved symbol bpf_get_file_xattr
WARN: resolve_btfids: unresolved symbol bpf_ct_insert_entry
WARN: resolve_btfids: unresolved symbol bpf_cgroup_release
WARN: resolve_btfids: unresolved symbol bpf_cgroup_from_id
WARN: resolve_btfids: unresolved symbol bpf_cgroup_acquire
WARN: resolve_btfids: unresolved symbol bpf_arena_free_pages
NM System.map
SORTTAB vmlinux
OBJCOPY vmlinux.32
Update the __bpf_kfunc tag to better guard against linker optimization by
including the new __retain compiler macro, which fixes the warnings above.
Verify the __retain macro with readelf by checking object flags for 'R':
$ readelf -Wa kernel/trace/bpf_trace.o
Section Headers:
[Nr] Name Type Address Off Size ES Flg Lk Inf Al
...
[178] .text.bpf_key_put PROGBITS 00000000 6420 0050 00 AXR 0 0 8
...
Key to Flags:
...
R (retain), D (mbind), p (processor specific)
Link: https://lore.kernel.org/bpf/ZlmGoT9KiYLZd91S@krava/T/
Reported-by: kernel test robot <lkp(a)intel.com>
Closes: https://lore.kernel.org/r/202401211357.OCX9yllM-lkp@intel.com/
Fixes: 57e7c169cd6a ("bpf: Add __bpf_kfunc tag for marking kernel functions as kfuncs")
Cc: stable(a)vger.kernel.org # v6.6+
Signed-off-by: Tony Ambardar <Tony.Ambardar(a)gmail.com>
---
include/linux/btf.h | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/include/linux/btf.h b/include/linux/btf.h
index f9e56fd12a9f..7c3e40c3295e 100644
--- a/include/linux/btf.h
+++ b/include/linux/btf.h
@@ -82,7 +82,7 @@
* as to avoid issues such as the compiler inlining or eliding either a static
* kfunc, or a global kfunc in an LTO build.
*/
-#define __bpf_kfunc __used noinline
+#define __bpf_kfunc __used __retain noinline
#define __bpf_kfunc_start_defs() \
__diag_push(); \
--
2.34.1
The following commit has been merged into the x86/urgent branch of tip:
Commit-ID: e1b8ac3aae589bb57a2c2e49fa76235c687c4d23
Gitweb: https://git.kernel.org/tip/e1b8ac3aae589bb57a2c2e49fa76235c687c4d23
Author: Dexuan Cui <decui(a)microsoft.com>
AuthorDate: Mon, 20 May 2024 19:12:38 -07:00
Committer: Dave Hansen <dave.hansen(a)linux.intel.com>
CommitterDate: Tue, 25 Jun 2024 14:45:22 -07:00
x86/tdx: Support vmalloc() for tdx_enc_status_changed()
When a TDX guest runs on Hyper-V, the hv_netvsc driver's netvsc_init_buf()
allocates buffers using vzalloc(), and needs to share the buffers with the
host OS by calling set_memory_decrypted(), which is not working for
vmalloc() yet. Add the support by handling the pages one by one.
Co-developed-by: Kirill A. Shutemov <kirill.shutemov(a)linux.intel.com>
Signed-off-by: Kirill A. Shutemov <kirill.shutemov(a)linux.intel.com>
Signed-off-by: Dexuan Cui <decui(a)microsoft.com>
Signed-off-by: Dave Hansen <dave.hansen(a)linux.intel.com>
Reviewed-by: Michael Kelley <mikelley(a)microsoft.com>
Reviewed-by: Kuppuswamy Sathyanarayanan <sathyanarayanan.kuppuswamy(a)linux.intel.com>
Reviewed-by: Rick Edgecombe <rick.p.edgecombe(a)intel.com>
Reviewed-by: Dave Hansen <dave.hansen(a)linux.intel.com>
Acked-by: Kai Huang <kai.huang(a)intel.com>
Cc:stable@vger.kernel.org
Link: https://lore.kernel.org/all/20240521021238.1803-1-decui%40microsoft.com
---
arch/x86/coco/tdx/tdx.c | 35 ++++++++++++++++++++++++++++-------
1 file changed, 28 insertions(+), 7 deletions(-)
diff --git a/arch/x86/coco/tdx/tdx.c b/arch/x86/coco/tdx/tdx.c
index c1cb903..abf3cd5 100644
--- a/arch/x86/coco/tdx/tdx.c
+++ b/arch/x86/coco/tdx/tdx.c
@@ -7,6 +7,7 @@
#include <linux/cpufeature.h>
#include <linux/export.h>
#include <linux/io.h>
+#include <linux/mm.h>
#include <asm/coco.h>
#include <asm/tdx.h>
#include <asm/vmx.h>
@@ -778,6 +779,19 @@ static bool tdx_map_gpa(phys_addr_t start, phys_addr_t end, bool enc)
return false;
}
+static bool tdx_enc_status_changed_phys(phys_addr_t start, phys_addr_t end,
+ bool enc)
+{
+ if (!tdx_map_gpa(start, end, enc))
+ return false;
+
+ /* shared->private conversion requires memory to be accepted before use */
+ if (enc)
+ return tdx_accept_memory(start, end);
+
+ return true;
+}
+
/*
* Inform the VMM of the guest's intent for this physical page: shared with
* the VMM or private to the guest. The VMM is expected to change its mapping
@@ -785,15 +799,22 @@ static bool tdx_map_gpa(phys_addr_t start, phys_addr_t end, bool enc)
*/
static bool tdx_enc_status_changed(unsigned long vaddr, int numpages, bool enc)
{
- phys_addr_t start = __pa(vaddr);
- phys_addr_t end = __pa(vaddr + numpages * PAGE_SIZE);
+ unsigned long start = vaddr;
+ unsigned long end = start + numpages * PAGE_SIZE;
+ unsigned long step = end - start;
+ unsigned long addr;
- if (!tdx_map_gpa(start, end, enc))
- return false;
+ /* Step through page-by-page for vmalloc() mappings */
+ if (is_vmalloc_addr((void *)vaddr))
+ step = PAGE_SIZE;
- /* shared->private conversion requires memory to be accepted before use */
- if (enc)
- return tdx_accept_memory(start, end);
+ for (addr = start; addr < end; addr += step) {
+ phys_addr_t start_pa = slow_virt_to_phys((void *)addr);
+ phys_addr_t end_pa = start_pa + step;
+
+ if (!tdx_enc_status_changed_phys(start_pa, end_pa, enc))
+ return false;
+ }
return true;
}
The patch titled
Subject: mm: page_ref: remove folio_try_get_rcu()
has been added to the -mm mm-hotfixes-unstable branch. Its filename is
mm-page_ref-remove-folio_try_get_rcu.patch
This patch will shortly appear at
https://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new.git/tree/patche…
This patch will later appear in the mm-hotfixes-unstable branch at
git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next via the mm-everything
branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
and is updated there every 2-3 working days
------------------------------------------------------
From: Yang Shi <yang(a)os.amperecomputing.com>
Subject: mm: page_ref: remove folio_try_get_rcu()
Date: Tue, 25 Jun 2024 13:53:50 -0700
The below bug was reported on a non-SMP kernel:
[ 275.267158][ T4335] ------------[ cut here ]------------
[ 275.267949][ T4335] kernel BUG at include/linux/page_ref.h:275!
[ 275.268526][ T4335] invalid opcode: 0000 [#1] KASAN PTI
[ 275.269001][ T4335] CPU: 0 PID: 4335 Comm: trinity-c3 Not tainted 6.7.0-rc4-00061-gefa7df3e3bb5 #1
[ 275.269787][ T4335] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.2-debian-1.16.2-1 04/01/2014
[ 275.270679][ T4335] RIP: 0010:try_get_folio (include/linux/page_ref.h:275 (discriminator 3) mm/gup.c:79 (discriminator 3))
[ 275.272813][ T4335] RSP: 0018:ffffc90005dcf650 EFLAGS: 00010202
[ 275.273346][ T4335] RAX: 0000000000000246 RBX: ffffea00066e0000 RCX: 0000000000000000
[ 275.274032][ T4335] RDX: fffff94000cdc007 RSI: 0000000000000004 RDI: ffffea00066e0034
[ 275.274719][ T4335] RBP: ffffea00066e0000 R08: 0000000000000000 R09: fffff94000cdc006
[ 275.275404][ T4335] R10: ffffea00066e0037 R11: 0000000000000000 R12: 0000000000000136
[ 275.276106][ T4335] R13: ffffea00066e0034 R14: dffffc0000000000 R15: ffffea00066e0008
[ 275.276790][ T4335] FS: 00007fa2f9b61740(0000) GS:ffffffff89d0d000(0000) knlGS:0000000000000000
[ 275.277570][ T4335] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 275.278143][ T4335] CR2: 00007fa2f6c00000 CR3: 0000000134b04000 CR4: 00000000000406f0
[ 275.278833][ T4335] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[ 275.279521][ T4335] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[ 275.280201][ T4335] Call Trace:
[ 275.280499][ T4335] <TASK>
[ 275.280751][ T4335] ? die (arch/x86/kernel/dumpstack.c:421 arch/x86/kernel/dumpstack.c:434 arch/x86/kernel/dumpstack.c:447)
[ 275.281087][ T4335] ? do_trap (arch/x86/kernel/traps.c:112 arch/x86/kernel/traps.c:153)
[ 275.281463][ T4335] ? try_get_folio (include/linux/page_ref.h:275 (discriminator 3) mm/gup.c:79 (discriminator 3))
[ 275.281884][ T4335] ? try_get_folio (include/linux/page_ref.h:275 (discriminator 3) mm/gup.c:79 (discriminator 3))
[ 275.282300][ T4335] ? do_error_trap (arch/x86/kernel/traps.c:174)
[ 275.282711][ T4335] ? try_get_folio (include/linux/page_ref.h:275 (discriminator 3) mm/gup.c:79 (discriminator 3))
[ 275.283129][ T4335] ? handle_invalid_op (arch/x86/kernel/traps.c:212)
[ 275.283561][ T4335] ? try_get_folio (include/linux/page_ref.h:275 (discriminator 3) mm/gup.c:79 (discriminator 3))
[ 275.283990][ T4335] ? exc_invalid_op (arch/x86/kernel/traps.c:264)
[ 275.284415][ T4335] ? asm_exc_invalid_op (arch/x86/include/asm/idtentry.h:568)
[ 275.284859][ T4335] ? try_get_folio (include/linux/page_ref.h:275 (discriminator 3) mm/gup.c:79 (discriminator 3))
[ 275.285278][ T4335] try_grab_folio (mm/gup.c:148)
[ 275.285684][ T4335] __get_user_pages (mm/gup.c:1297 (discriminator 1))
[ 275.286111][ T4335] ? __pfx___get_user_pages (mm/gup.c:1188)
[ 275.286579][ T4335] ? __pfx_validate_chain (kernel/locking/lockdep.c:3825)
[ 275.287034][ T4335] ? mark_lock (kernel/locking/lockdep.c:4656 (discriminator 1))
[ 275.287416][ T4335] __gup_longterm_locked (mm/gup.c:1509 mm/gup.c:2209)
[ 275.288192][ T4335] ? __pfx___gup_longterm_locked (mm/gup.c:2204)
[ 275.288697][ T4335] ? __pfx_lock_acquire (kernel/locking/lockdep.c:5722)
[ 275.289135][ T4335] ? __pfx___might_resched (kernel/sched/core.c:10106)
[ 275.289595][ T4335] pin_user_pages_remote (mm/gup.c:3350)
[ 275.290041][ T4335] ? __pfx_pin_user_pages_remote (mm/gup.c:3350)
[ 275.290545][ T4335] ? find_held_lock (kernel/locking/lockdep.c:5244 (discriminator 1))
[ 275.290961][ T4335] ? mm_access (kernel/fork.c:1573)
[ 275.291353][ T4335] process_vm_rw_single_vec+0x142/0x360
[ 275.291900][ T4335] ? __pfx_process_vm_rw_single_vec+0x10/0x10
[ 275.292471][ T4335] ? mm_access (kernel/fork.c:1573)
[ 275.292859][ T4335] process_vm_rw_core+0x272/0x4e0
[ 275.293384][ T4335] ? hlock_class (arch/x86/include/asm/bitops.h:227 arch/x86/include/asm/bitops.h:239 include/asm-generic/bitops/instrumented-non-atomic.h:142 kernel/locking/lockdep.c:228)
[ 275.293780][ T4335] ? __pfx_process_vm_rw_core+0x10/0x10
[ 275.294350][ T4335] process_vm_rw (mm/process_vm_access.c:284)
[ 275.294748][ T4335] ? __pfx_process_vm_rw (mm/process_vm_access.c:259)
[ 275.295197][ T4335] ? __task_pid_nr_ns (include/linux/rcupdate.h:306 (discriminator 1) include/linux/rcupdate.h:780 (discriminator 1) kernel/pid.c:504 (discriminator 1))
[ 275.295634][ T4335] __x64_sys_process_vm_readv (mm/process_vm_access.c:291)
[ 275.296139][ T4335] ? syscall_enter_from_user_mode (kernel/entry/common.c:94 kernel/entry/common.c:112)
[ 275.296642][ T4335] do_syscall_64 (arch/x86/entry/common.c:51 (discriminator 1) arch/x86/entry/common.c:82 (discriminator 1))
[ 275.297032][ T4335] ? __task_pid_nr_ns (include/linux/rcupdate.h:306 (discriminator 1) include/linux/rcupdate.h:780 (discriminator 1) kernel/pid.c:504 (discriminator 1))
[ 275.297470][ T4335] ? lockdep_hardirqs_on_prepare (kernel/locking/lockdep.c:4300 kernel/locking/lockdep.c:4359)
[ 275.297988][ T4335] ? do_syscall_64 (arch/x86/include/asm/cpufeature.h:171 arch/x86/entry/common.c:97)
[ 275.298389][ T4335] ? lockdep_hardirqs_on_prepare (kernel/locking/lockdep.c:4300 kernel/locking/lockdep.c:4359)
[ 275.298906][ T4335] ? do_syscall_64 (arch/x86/include/asm/cpufeature.h:171 arch/x86/entry/common.c:97)
[ 275.299304][ T4335] ? do_syscall_64 (arch/x86/include/asm/cpufeature.h:171 arch/x86/entry/common.c:97)
[ 275.299703][ T4335] ? do_syscall_64 (arch/x86/include/asm/cpufeature.h:171 arch/x86/entry/common.c:97)
[ 275.300115][ T4335] entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:129)
This BUG is the VM_BUG_ON(!in_atomic() && !irqs_disabled()) assertion in
folio_ref_try_add_rcu() for non-SMP kernel.
The process_vm_readv() calls GUP to pin the THP. An optimization for
pinning THP instroduced by commit 57edfcfd3419 ("mm/gup: accelerate thp
gup even for "pages != NULL"") calls try_grab_folio() to pin the THP,
but try_grab_folio() is supposed to be called in atomic context for
non-SMP kernel, for example, irq disabled or preemption disabled, due to
the optimization introduced by commit e286781d5f2e ("mm: speculative
page references").
The commit efa7df3e3bb5 ("mm: align larger anonymous mappings on THP
boundaries") is not actually the root cause although it was bisected to.
It just makes the problem exposed more likely.
The follow up discussion suggested the optimization for non-SMP kernel
may be out-dated and not worth it anymore [1]. So removing the
optimization to silence the BUG.
However calling try_grab_folio() in GUP slow path actually is
unnecessary, so the following patch will clean this up.
[1] https://lore.kernel.org/linux-mm/821cf1d6-92b9-4ac4-bacc-d8f2364ac14f@paulm…
Link: https://lkml.kernel.org/r/20240625205350.1777481-1-yang@os.amperecomputing.…
Fixes: 57edfcfd3419 ("mm/gup: accelerate thp gup even for "pages != NULL"")
Signed-off-by: Yang Shi <yang(a)os.amperecomputing.com>
Reported-by: kernel test robot <oliver.sang(a)intel.com>
Tested-by: Oliver Sang <oliver.sang(a)intel.com>
Acked-by: Peter Xu <peterx(a)redhat.com>
Acked-by: David Hildenbrand <david(a)redhat.com>
Cc: Christoph Lameter <cl(a)linux.com>
Cc: Matthew Wilcox (Oracle) <willy(a)infradead.org>
Cc: Paul E. McKenney <paulmck(a)kernel.org>
Cc: Rik van Riel <riel(a)surriel.com>
Cc: Vivek Kasireddy <vivek.kasireddy(a)intel.com>
Cc: <stable(a)vger.kernel.org> [6.6+]
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
include/linux/page_ref.h | 49 +------------------------------------
mm/filemap.c | 10 +++----
mm/gup.c | 2 -
3 files changed, 8 insertions(+), 53 deletions(-)
--- a/include/linux/page_ref.h~mm-page_ref-remove-folio_try_get_rcu
+++ a/include/linux/page_ref.h
@@ -258,54 +258,9 @@ static inline bool folio_try_get(struct
return folio_ref_add_unless(folio, 1, 0);
}
-static inline bool folio_ref_try_add_rcu(struct folio *folio, int count)
+static inline bool folio_ref_try_add(struct folio *folio, int count)
{
-#ifdef CONFIG_TINY_RCU
- /*
- * The caller guarantees the folio will not be freed from interrupt
- * context, so (on !SMP) we only need preemption to be disabled
- * and TINY_RCU does that for us.
- */
-# ifdef CONFIG_PREEMPT_COUNT
- VM_BUG_ON(!in_atomic() && !irqs_disabled());
-# endif
- VM_BUG_ON_FOLIO(folio_ref_count(folio) == 0, folio);
- folio_ref_add(folio, count);
-#else
- if (unlikely(!folio_ref_add_unless(folio, count, 0))) {
- /* Either the folio has been freed, or will be freed. */
- return false;
- }
-#endif
- return true;
-}
-
-/**
- * folio_try_get_rcu - Attempt to increase the refcount on a folio.
- * @folio: The folio.
- *
- * This is a version of folio_try_get() optimised for non-SMP kernels.
- * If you are still holding the rcu_read_lock() after looking up the
- * page and know that the page cannot have its refcount decreased to
- * zero in interrupt context, you can use this instead of folio_try_get().
- *
- * Example users include get_user_pages_fast() (as pages are not unmapped
- * from interrupt context) and the page cache lookups (as pages are not
- * truncated from interrupt context). We also know that pages are not
- * frozen in interrupt context for the purposes of splitting or migration.
- *
- * You can also use this function if you're holding a lock that prevents
- * pages being frozen & removed; eg the i_pages lock for the page cache
- * or the mmap_lock or page table lock for page tables. In this case,
- * it will always succeed, and you could have used a plain folio_get(),
- * but it's sometimes more convenient to have a common function called
- * from both locked and RCU-protected contexts.
- *
- * Return: True if the reference count was successfully incremented.
- */
-static inline bool folio_try_get_rcu(struct folio *folio)
-{
- return folio_ref_try_add_rcu(folio, 1);
+ return folio_ref_add_unless(folio, count, 0);
}
static inline int page_ref_freeze(struct page *page, int count)
--- a/mm/filemap.c~mm-page_ref-remove-folio_try_get_rcu
+++ a/mm/filemap.c
@@ -1847,7 +1847,7 @@ repeat:
if (!folio || xa_is_value(folio))
goto out;
- if (!folio_try_get_rcu(folio))
+ if (!folio_try_get(folio))
goto repeat;
if (unlikely(folio != xas_reload(&xas))) {
@@ -2001,7 +2001,7 @@ retry:
if (!folio || xa_is_value(folio))
return folio;
- if (!folio_try_get_rcu(folio))
+ if (!folio_try_get(folio))
goto reset;
if (unlikely(folio != xas_reload(xas))) {
@@ -2181,7 +2181,7 @@ unsigned filemap_get_folios_contig(struc
if (xa_is_value(folio))
goto update_start;
- if (!folio_try_get_rcu(folio))
+ if (!folio_try_get(folio))
goto retry;
if (unlikely(folio != xas_reload(&xas)))
@@ -2313,7 +2313,7 @@ static void filemap_get_read_batch(struc
break;
if (xa_is_sibling(folio))
break;
- if (!folio_try_get_rcu(folio))
+ if (!folio_try_get(folio))
goto retry;
if (unlikely(folio != xas_reload(&xas)))
@@ -3472,7 +3472,7 @@ static struct folio *next_uptodate_folio
continue;
if (folio_test_locked(folio))
continue;
- if (!folio_try_get_rcu(folio))
+ if (!folio_try_get(folio))
continue;
/* Has the page moved or been split? */
if (unlikely(folio != xas_reload(xas)))
--- a/mm/gup.c~mm-page_ref-remove-folio_try_get_rcu
+++ a/mm/gup.c
@@ -76,7 +76,7 @@ retry:
folio = page_folio(page);
if (WARN_ON_ONCE(folio_ref_count(folio) < 0))
return NULL;
- if (unlikely(!folio_ref_try_add_rcu(folio, refs)))
+ if (unlikely(!folio_ref_try_add(folio, refs)))
return NULL;
/*
_
Patches currently in -mm which might be from yang(a)os.amperecomputing.com are
mm-page_ref-remove-folio_try_get_rcu.patch
In nv17_tv_get_hd_modes(), the return value of drm_mode_duplicate() is
assigned to mode, which will lead to a possible NULL pointer dereference
on failure of drm_mode_duplicate(). The same applies to drm_cvt_mode().
Add a check to avoid null pointer dereference.
Cc: stable(a)vger.kernel.org
Signed-off-by: Ma Ke <make24(a)iscas.ac.cn>
---
drivers/gpu/drm/nouveau/dispnv04/tvnv17.c | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/drivers/gpu/drm/nouveau/dispnv04/tvnv17.c b/drivers/gpu/drm/nouveau/dispnv04/tvnv17.c
index 670c9739e5e1..9c3dc9a5bb46 100644
--- a/drivers/gpu/drm/nouveau/dispnv04/tvnv17.c
+++ b/drivers/gpu/drm/nouveau/dispnv04/tvnv17.c
@@ -258,6 +258,8 @@ static int nv17_tv_get_hd_modes(struct drm_encoder *encoder,
if (modes[i].hdisplay == output_mode->hdisplay &&
modes[i].vdisplay == output_mode->vdisplay) {
mode = drm_mode_duplicate(encoder->dev, output_mode);
+ if (!mode)
+ continue;
mode->type |= DRM_MODE_TYPE_PREFERRED;
} else {
@@ -265,6 +267,8 @@ static int nv17_tv_get_hd_modes(struct drm_encoder *encoder,
modes[i].vdisplay, 60, false,
(output_mode->flags &
DRM_MODE_FLAG_INTERLACE), false);
+ if (!mode)
+ continue;
}
/* CVT modes are sometimes unsuitable... */
--
2.25.1
In nv17_tv_get_ld_modes(), the return value of drm_mode_duplicate() is
assigned to mode, which will lead to a possible NULL pointer dereference
on failure of drm_mode_duplicate(). Add a check to avoid npd.
Cc: stable(a)vger.kernel.org
Signed-off-by: Ma Ke <make24(a)iscas.ac.cn>
---
drivers/gpu/drm/nouveau/dispnv04/tvnv17.c | 2 ++
1 file changed, 2 insertions(+)
diff --git a/drivers/gpu/drm/nouveau/dispnv04/tvnv17.c b/drivers/gpu/drm/nouveau/dispnv04/tvnv17.c
index 670c9739e5e1..4a08e61f3336 100644
--- a/drivers/gpu/drm/nouveau/dispnv04/tvnv17.c
+++ b/drivers/gpu/drm/nouveau/dispnv04/tvnv17.c
@@ -209,6 +209,8 @@ static int nv17_tv_get_ld_modes(struct drm_encoder *encoder,
struct drm_display_mode *mode;
mode = drm_mode_duplicate(encoder->dev, tv_mode);
+ if (!mode)
+ continue;
mode->clock = tv_norm->tv_enc_mode.vrefresh *
mode->htotal / 1000 *
--
2.25.1
Errata i2310[0] says, Erroneous timeout can be triggered,
if this Erroneous interrupt is not cleared then it may leads
to storm of interrupts.
Commit 9d141c1e6157 ("serial: 8250_omap: Implementation of Errata i2310")
which added the workaround but missed ensuring RX FIFO is really empty
before applying the errata workaround as recommended in the errata text.
Fix this by adding back check for UART_OMAP_RX_LVL to be 0 for
workaround to take effect.
[0] https://www.ti.com/lit/pdf/sprz536 page 23
Fixes: 9d141c1e6157 ("serial: 8250_omap: Implementation of Errata i2310")
Cc: stable(a)vger.kernel.org
Reported-by: Vignesh Raghavendra <vigneshr(a)ti.com>
Closes: https://lore.kernel.org/all/e96d0c55-0b12-4cbf-9d23-48963543de49@ti.com/
Signed-off-by: Udit Kumar <u-kumar1(a)ti.com>
---
Testlogs
https://gist.github.com/uditkumarti/4f5120f203a238fbebe411eb82b63753
Changelog:
Changes in v2
- Update commit message, subject, Fixes tag
link to v1:
https://lore.kernel.org/all/20240625051338.2761599-1-u-kumar1@ti.com/
drivers/tty/serial/8250/8250_omap.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/drivers/tty/serial/8250/8250_omap.c b/drivers/tty/serial/8250/8250_omap.c
index ddac0a13cf84..1af9aed99c65 100644
--- a/drivers/tty/serial/8250/8250_omap.c
+++ b/drivers/tty/serial/8250/8250_omap.c
@@ -672,7 +672,8 @@ static irqreturn_t omap8250_irq(int irq, void *dev_id)
* https://www.ti.com/lit/pdf/sprz536
*/
if (priv->habit & UART_RX_TIMEOUT_QUIRK &&
- (iir & UART_IIR_RX_TIMEOUT) == UART_IIR_RX_TIMEOUT) {
+ (iir & UART_IIR_RX_TIMEOUT) == UART_IIR_RX_TIMEOUT &&
+ serial_port_in(port, UART_OMAP_RX_LVL) == 0) {
unsigned char efr2, timeout_h, timeout_l;
efr2 = serial_in(up, UART_OMAP_EFR2);
--
2.34.1
We got another report that CT1000BX500SSD1 does not work with LPM.
If you look in libata-core.c, we have six different Crucial devices that
are marked with ATA_HORKAGE_NOLPM. This model would have been the seventh.
(This quirk is used on Crucial models starting with both CT* and
Crucial_CT*)
It is obvious that this vendor does not have a great history of supporting
LPM properly, therefore, add the ATA_HORKAGE_NOLPM quirk for all Crucial
BX SSD1 models.
Fixes: 7627a0edef54 ("ata: ahci: Drop low power policy board type")
Cc: stable(a)vger.kernel.org
Reported-by: Tkd-Alex <alex.tkd.alex(a)gmail.com>
Closes: https://bugzilla.kernel.org/show_bug.cgi?id=218832
Signed-off-by: Niklas Cassel <cassel(a)kernel.org>
---
drivers/ata/libata-core.c | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c
index e1bf8a19b3c8..efb5195da60c 100644
--- a/drivers/ata/libata-core.c
+++ b/drivers/ata/libata-core.c
@@ -4137,8 +4137,7 @@ static const struct ata_blacklist_entry ata_device_blacklist [] = {
{ "PIONEER BD-RW BDR-205", NULL, ATA_HORKAGE_NOLPM },
/* Crucial devices with broken LPM support */
- { "CT500BX100SSD1", NULL, ATA_HORKAGE_NOLPM },
- { "CT240BX500SSD1", NULL, ATA_HORKAGE_NOLPM },
+ { "CT*0BX*00SSD1", NULL, ATA_HORKAGE_NOLPM },
/* 512GB MX100 with MU01 firmware has both queued TRIM and LPM issues */
{ "Crucial_CT512MX100*", "MU01", ATA_HORKAGE_NO_NCQ_TRIM |
--
2.45.2
From: Vitor Soares <vitor.soares(a)toradex.com>
During the probe, the genpd power_dev is added to the dpm_list after
blk_ctrl due to its parent/child relationship. Making the blk_ctrl
suspend after and resume before the genpd power_dev.
As a consequence, the system hangs when resuming the VPU due to the
power domain dependency.
To ensure the proper suspend/resume order, add a device link betweem
blk_ctrl and genpd power_dev. It guarantees genpd power_dev is suspended
after and resumed before blk-ctrl.
Cc: <stable(a)vger.kernel.org>
Closes: https://lore.kernel.org/all/fccbb040330a706a4f7b34875db1d896a0bf81c8.camel@…
Link: https://lore.kernel.org/all/20240409085802.290439-1-ivitro@gmail.com/
Fixes: 2684ac05a8c4 ("soc: imx: add i.MX8M blk-ctrl driver")
Suggested-by: Lucas Stach <l.stach(a)pengutronix.de>
Signed-off-by: Vitor Soares <vitor.soares(a)toradex.com>
---
This is a new patch, but is a follow-up of:
https://lore.kernel.org/all/20240409085802.290439-1-ivitro@gmail.com/
As suggested by Lucas, we are addressing this PM issue in the imx8m-blk-ctrl
driver instead of in the imx8mm.dtsi.
drivers/pmdomain/imx/imx8m-blk-ctrl.c | 16 ++++++++++++++++
1 file changed, 16 insertions(+)
diff --git a/drivers/pmdomain/imx/imx8m-blk-ctrl.c b/drivers/pmdomain/imx/imx8m-blk-ctrl.c
index ca942d7929c2..cd0d2296080d 100644
--- a/drivers/pmdomain/imx/imx8m-blk-ctrl.c
+++ b/drivers/pmdomain/imx/imx8m-blk-ctrl.c
@@ -283,6 +283,20 @@ static int imx8m_blk_ctrl_probe(struct platform_device *pdev)
goto cleanup_pds;
}
+ /*
+ * Enforce suspend/resume ordering by making genpd power_dev a
+ * provider of blk-ctrl. Genpd power_dev is suspended after and
+ * resumed before blk-ctrl.
+ */
+ if (!device_link_add(dev, domain->power_dev, DL_FLAG_STATELESS)) {
+ ret = -EINVAL;
+ dev_err_probe(dev, ret,
+ "failed to link to %s\n", data->name);
+ pm_genpd_remove(&domain->genpd);
+ dev_pm_domain_detach(domain->power_dev, true);
+ goto cleanup_pds;
+ }
+
/*
* We use runtime PM to trigger power on/off of the upstream GPC
* domain, as a strict hierarchical parent/child power domain
@@ -324,6 +338,7 @@ static int imx8m_blk_ctrl_probe(struct platform_device *pdev)
of_genpd_del_provider(dev->of_node);
cleanup_pds:
for (i--; i >= 0; i--) {
+ device_link_remove(dev, bc->domains[i].power_dev);
pm_genpd_remove(&bc->domains[i].genpd);
dev_pm_domain_detach(bc->domains[i].power_dev, true);
}
@@ -343,6 +358,7 @@ static void imx8m_blk_ctrl_remove(struct platform_device *pdev)
for (i = 0; bc->onecell_data.num_domains; i++) {
struct imx8m_blk_ctrl_domain *domain = &bc->domains[i];
+ device_link_remove(&pdev->dev, domain->power_dev);
pm_genpd_remove(&domain->genpd);
dev_pm_domain_detach(domain->power_dev, true);
}
--
2.34.1
On 2024-06-24 19:01, gregkh(a)linuxfoundation.org wrote:
> This is a note to let you know that I've just added the patch titled
>
> MIPS: pci: lantiq: restore reset gpio polarity
>
> to the 6.9-stable tree which can be found at:
>
> http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=sum…
>
> The filename of the patch is:
> mips-pci-lantiq-restore-reset-gpio-polarity.patch
> and it can be found in the queue-6.9 subdirectory.
>
> If you, or anyone else, feels it should not be added to the stable
> tree,
> please let <stable(a)vger.kernel.org> know about it.
This patch is buggy and should not go into the stable trees.
It has already been reverted upstream in the mips-fixes.
Thank you very much and sorry for the inconvenience,
Martin
>
>
> From 277a0363120276645ae598d8d5fea7265e076ae9 Mon Sep 17 00:00:00 2001
> From: Martin Schiller <ms(a)dev.tdt.de>
> Date: Fri, 7 Jun 2024 11:04:00 +0200
> Subject: MIPS: pci: lantiq: restore reset gpio polarity
>
> From: Martin Schiller <ms(a)dev.tdt.de>
>
> commit 277a0363120276645ae598d8d5fea7265e076ae9 upstream.
>
> Commit 90c2d2eb7ab5 ("MIPS: pci: lantiq: switch to using gpiod API")
> not
> only switched to the gpiod API, but also inverted / changed the
> polarity
> of the GPIO.
>
> According to the PCI specification, the RST# pin is an active-low
> signal. However, most of the device trees that have been widely used
> for
> a long time (mainly in the openWrt project) define this GPIO as
> active-high and the old driver code inverted the signal internally.
>
> Apparently there are actually boards where the reset gpio must be
> operated inverted. For this reason, we cannot use the
> GPIOD_OUT_LOW/HIGH
> flag for initialization. Instead, we must explicitly set the gpio to
> value 1 in order to take into account any "GPIO_ACTIVE_LOW" flag that
> may have been set.
>
> In order to remain compatible with all these existing device trees, we
> should therefore keep the logic as it was before the commit.
>
> Fixes: 90c2d2eb7ab5 ("MIPS: pci: lantiq: switch to using gpiod API")
> Cc: stable(a)vger.kernel.org
> Signed-off-by: Martin Schiller <ms(a)dev.tdt.de>
> Signed-off-by: Thomas Bogendoerfer <tsbogend(a)alpha.franken.de>
> Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
> ---
> arch/mips/pci/pci-lantiq.c | 8 ++++----
> 1 file changed, 4 insertions(+), 4 deletions(-)
>
> --- a/arch/mips/pci/pci-lantiq.c
> +++ b/arch/mips/pci/pci-lantiq.c
> @@ -124,14 +124,14 @@ static int ltq_pci_startup(struct platfo
> clk_disable(clk_external);
>
> /* setup reset gpio used by pci */
> - reset_gpio = devm_gpiod_get_optional(&pdev->dev, "reset",
> - GPIOD_OUT_LOW);
> + reset_gpio = devm_gpiod_get_optional(&pdev->dev, "reset",
> GPIOD_ASIS);
> error = PTR_ERR_OR_ZERO(reset_gpio);
> if (error) {
> dev_err(&pdev->dev, "failed to request gpio: %d\n", error);
> return error;
> }
> gpiod_set_consumer_name(reset_gpio, "pci_reset");
> + gpiod_direction_output(reset_gpio, 1);
>
> /* enable auto-switching between PCI and EBU */
> ltq_pci_w32(0xa, PCI_CR_CLK_CTRL);
> @@ -194,10 +194,10 @@ static int ltq_pci_startup(struct platfo
>
> /* toggle reset pin */
> if (reset_gpio) {
> - gpiod_set_value_cansleep(reset_gpio, 1);
> + gpiod_set_value_cansleep(reset_gpio, 0);
> wmb();
> mdelay(1);
> - gpiod_set_value_cansleep(reset_gpio, 0);
> + gpiod_set_value_cansleep(reset_gpio, 1);
> }
> return 0;
> }
>
>
> Patches currently in stable-queue which might be from ms(a)dev.tdt.de are
>
> queue-6.9/mips-pci-lantiq-restore-reset-gpio-polarity.patch
On Mon, Jun 24, 2024 at 11:30:42PM -0400, Sasha Levin wrote:
> This is a note to let you know that I've just added the patch titled
>
> arm64: dts: qcom: qcs404: fix bluetooth device address
>
> to the 5.4-stable tree which can be found at:
> http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=sum…
> commit a48b0e85558565dc3a10c8021b1514099cada102
> Author: Johan Hovold <johan+linaro(a)kernel.org>
> Date: Wed May 1 09:52:01 2024 +0200
>
> arm64: dts: qcom: qcs404: fix bluetooth device address
>
> [ Upstream commit f5f390a77f18eaeb2c93211a1b7c5e66b5acd423 ]
>
> The 'local-bd-address' property is used to pass a unique Bluetooth
> device address from the boot firmware to the kernel and should otherwise
> be left unset so that the OS can prevent the controller from being used
> until a valid address has been provided through some other means (e.g.
> using btmgmt).
>
> Fixes: 60f77ae7d1c1 ("arm64: dts: qcom: qcs404-evb: Enable uart3 and add Bluetooth")
> Cc: stable(a)vger.kernel.org # 5.10
This was supposed to say 5.2. Thanks for catching this.
Johan
From: Arnd Bergmann <arnd(a)arndb.de>
The unusual function calling conventions on SuperH ended up causing
sync_file_range to have the wrong argument order, with the 'flags'
argument getting sorted before 'nbytes' by the compiler.
In userspace, I found that musl, glibc, uclibc and strace all expect the
normal calling conventions with 'nbytes' last, so changing the kernel
to match them should make all of those work.
In order to be able to also fix libc implementations to work with existing
kernels, they need to be able to tell which ABI is used. An easy way
to do this is to add yet another system call using the sync_file_range2
ABI that works the same on all architectures.
Old user binaries can now work on new kernels, and new binaries can
try the new sync_file_range2() to work with new kernels or fall back
to the old sync_file_range() version if that doesn't exist.
Cc: stable(a)vger.kernel.org
Fixes: 75c92acdd5b1 ("sh: Wire up new syscalls.")
Signed-off-by: Arnd Bergmann <arnd(a)arndb.de>
---
arch/sh/kernel/sys_sh32.c | 11 +++++++++++
arch/sh/kernel/syscalls/syscall.tbl | 3 ++-
2 files changed, 13 insertions(+), 1 deletion(-)
diff --git a/arch/sh/kernel/sys_sh32.c b/arch/sh/kernel/sys_sh32.c
index 9dca568509a5..d6f4afcb0e87 100644
--- a/arch/sh/kernel/sys_sh32.c
+++ b/arch/sh/kernel/sys_sh32.c
@@ -59,3 +59,14 @@ asmlinkage int sys_fadvise64_64_wrapper(int fd, u32 offset0, u32 offset1,
(u64)len0 << 32 | len1, advice);
#endif
}
+
+/*
+ * swap the arguments the way that libc wants them instead of
+ * moving flags ahead of the 64-bit nbytes argument
+ */
+SYSCALL_DEFINE6(sh_sync_file_range6, int, fd, SC_ARG64(offset),
+ SC_ARG64(nbytes), unsigned int, flags)
+{
+ return ksys_sync_file_range(fd, SC_VAL64(loff_t, offset),
+ SC_VAL64(loff_t, nbytes), flags);
+}
diff --git a/arch/sh/kernel/syscalls/syscall.tbl b/arch/sh/kernel/syscalls/syscall.tbl
index bbf83a2db986..c55fd7696d40 100644
--- a/arch/sh/kernel/syscalls/syscall.tbl
+++ b/arch/sh/kernel/syscalls/syscall.tbl
@@ -321,7 +321,7 @@
311 common set_robust_list sys_set_robust_list
312 common get_robust_list sys_get_robust_list
313 common splice sys_splice
-314 common sync_file_range sys_sync_file_range
+314 common sync_file_range sys_sh_sync_file_range6
315 common tee sys_tee
316 common vmsplice sys_vmsplice
317 common move_pages sys_move_pages
@@ -395,6 +395,7 @@
385 common pkey_alloc sys_pkey_alloc
386 common pkey_free sys_pkey_free
387 common rseq sys_rseq
+388 common sync_file_range2 sys_sync_file_range2
# room for arch specific syscalls
393 common semget sys_semget
394 common semctl sys_semctl
--
2.39.2
On Mon, 24 Jun 2024, stable(a)vger.kernel.org wrote:
> This is a note to let you know that I've just added the patch titled
>
> nfsd: fix oops when reading pool_stats before server is started
>
> to the 6.9-stable tree which can be found at:
> http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=sum…
>
> The filename of the patch is:
> nfsd-fix-oops-when-reading-pool_stats-before-server-.patch
> and it can be found in the queue-6.9 subdirectory.
>
> If you, or anyone else, feels it should not be added to the stable tree,
> please let <stable(a)vger.kernel.org> know about it.
I feel this should not be added to the stable tree.
It moves at test on a field protected by a mutex outside of the
protection of that mutex, and so is obviously racey.
Depending on how the race goes, si->serv might be NULL when dereferenced
in svc_pool_stats_start(), or svc_pool_stats_stop() might unlock a mutex
that hadn't been locked.
I'll post a revert and a better fix for mainline.
Thanks,
NeilBrown
>
>
>
> commit 388a527c6cf55bde74bc0891d0b4c38f50d896be
> Author: Jeff Layton <jlayton(a)kernel.org>
> Date: Mon Jun 17 07:54:08 2024 -0400
>
> nfsd: fix oops when reading pool_stats before server is started
>
> [ Upstream commit 8e948c365d9c10b685d1deb946bd833d6a9b43e0 ]
>
> Sourbh reported an oops that is triggerable by trying to read the
> pool_stats procfile before nfsd had been started. Move the check for a
> NULL serv in svc_pool_stats_start above the mutex acquisition, and fix
> the stop routine not to unlock the mutex if there is no serv yet.
>
> Fixes: 7b207ccd9833 ("svc: don't hold reference for poolstats, only mutex.")
> Reported-by: Sourabh Jain <sourabhjain(a)linux.ibm.com>
> Signed-off-by: Jeff Layton <jlayton(a)kernel.org>
> Tested-by: Sourabh Jain <sourabhjain(a)linux.ibm.com>
> Signed-off-by: Chuck Lever <chuck.lever(a)oracle.com>
> Signed-off-by: Sasha Levin <sashal(a)kernel.org>
>
> diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
> index b4a85a227bd7d..1a2982051f986 100644
> --- a/net/sunrpc/svc_xprt.c
> +++ b/net/sunrpc/svc_xprt.c
> @@ -1371,12 +1371,13 @@ static void *svc_pool_stats_start(struct seq_file *m, loff_t *pos)
>
> dprintk("svc_pool_stats_start, *pidx=%u\n", pidx);
>
> + if (!si->serv)
> + return NULL;
> +
> mutex_lock(si->mutex);
>
> if (!pidx)
> return SEQ_START_TOKEN;
> - if (!si->serv)
> - return NULL;
> return pidx > si->serv->sv_nrpools ? NULL
> : &si->serv->sv_pools[pidx - 1];
> }
> @@ -1408,7 +1409,8 @@ static void svc_pool_stats_stop(struct seq_file *m, void *p)
> {
> struct svc_info *si = m->private;
>
> - mutex_unlock(si->mutex);
> + if (si->serv)
> + mutex_unlock(si->mutex);
> }
>
> static int svc_pool_stats_show(struct seq_file *m, void *p)
>
On Mon, Jun 24, 2024, Sasha Levin wrote:
> This is a note to let you know that I've just added the patch titled
>
> KVM: Use gfn instead of hva for mmu_notifier_retry
>
> to the 6.6-stable tree which can be found at:
> http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=sum…
>
> The filename of the patch is:
> kvm-use-gfn-instead-of-hva-for-mmu_notifier_retry.patch
> and it can be found in the queue-6.6 subdirectory.
>
> If you, or anyone else, feels it should not be added to the stable tree,
> please let <stable(a)vger.kernel.org> know about it.
>
>
>
> commit 68a14ccc3fb35047cc4900c8ddd4b6f959e25b77
> Author: Chao Peng <chao.p.peng(a)linux.intel.com>
> Date: Fri Oct 27 11:21:45 2023 -0700
>
> KVM: Use gfn instead of hva for mmu_notifier_retry
>
> [ Upstream commit 8569992d64b8f750e34b7858eac5d7daaf0f80fd ]
>
> Currently in mmu_notifier invalidate path, hva range is recorded and then
> checked against by mmu_invalidate_retry_hva() in the page fault handling
> path. However, for the soon-to-be-introduced private memory, a page fault
> may not have a hva associated, checking gfn(gpa) makes more sense.
>
> For existing hva based shared memory, gfn is expected to also work. The
> only downside is when aliasing multiple gfns to a single hva, the
> current algorithm of checking multiple ranges could result in a much
> larger range being rejected. Such aliasing should be uncommon, so the
> impact is expected small.
>
> Suggested-by: Sean Christopherson <seanjc(a)google.com>
> Cc: Xu Yilun <yilun.xu(a)intel.com>
> Signed-off-by: Chao Peng <chao.p.peng(a)linux.intel.com>
> Reviewed-by: Fuad Tabba <tabba(a)google.com>
> Tested-by: Fuad Tabba <tabba(a)google.com>
> [sean: convert vmx_set_apic_access_page_addr() to gfn-based API]
> Signed-off-by: Sean Christopherson <seanjc(a)google.com>
> Reviewed-by: Paolo Bonzini <pbonzini(a)redhat.com>
> Reviewed-by: Xu Yilun <yilun.xu(a)linux.intel.com>
> Message-Id: <20231027182217.3615211-4-seanjc(a)google.com>
> Reviewed-by: Kai Huang <kai.huang(a)intel.com>
> Signed-off-by: Paolo Bonzini <pbonzini(a)redhat.com>
> Stable-dep-of: c3f3edf73a8f ("KVM: Stop processing *all* memslots when "null" mmu_notifier handler is found")
Please drop this, and all other related patches. This is not at all appropriate
for stable trees.
I'm pretty sure your scripts are borked too, at least from KVM's perspective. I
specifically didn't tag c3f3edf73a8f for stable[*], and I thought we had agreed a
while back that only KVM (x86?) fixes with an explicit "Cc: stable@" would be
automatically included.
[*] https://lore.kernel.org/all/20240620230937.2214992-1-seanjc@google.com
As per Errata i2310[0], Erroneous timeout can be triggered,
if this Erroneous interrupt is not cleared then it may leads
to storm of interrupts, therefore apply Errata i2310 solution.
[0] https://www.ti.com/lit/pdf/sprz536 page 23
Fixes: b67e830d38fa ("serial: 8250: 8250_omap: Fix possible interrupt storm on K3 SoCs")
Cc: stable(a)vger.kernel.org
Signed-off-by: Udit Kumar <u-kumar1(a)ti.com>
---
Test logs:
https://gist.github.com/uditkumarti/48e239540db4e761861fbd1d7d31cfed
Change logs
Changes in v4:
- Reverted fifo empty check before applying errata
Link to v3:
https://lore.kernel.org/all/20240619105903.165434-1-u-kumar1@ti.com/
Changes in v3:
- CC stable in commit message
Link to v2:
https://lore.kernel.org/all/20240617052253.2188140-1-u-kumar1@ti.com/
Changes in v2:
- Added Fixes Tag and typo correction in commit message
- Corrected bit position to UART_OMAP_EFR2_TIMEOUT_BEHAVE
Link to v1
https://lore.kernel.org/all/20240614061314.290840-1-u-kumar1@ti.com/
drivers/tty/serial/8250/8250_omap.c | 22 +++++++++++++++++++---
1 file changed, 19 insertions(+), 3 deletions(-)
diff --git a/drivers/tty/serial/8250/8250_omap.c b/drivers/tty/serial/8250/8250_omap.c
index 170639d12b2a..1af9aed99c65 100644
--- a/drivers/tty/serial/8250/8250_omap.c
+++ b/drivers/tty/serial/8250/8250_omap.c
@@ -115,6 +115,10 @@
/* RX FIFO occupancy indicator */
#define UART_OMAP_RX_LVL 0x19
+/* Timeout low and High */
+#define UART_OMAP_TO_L 0x26
+#define UART_OMAP_TO_H 0x27
+
/*
* Copy of the genpd flags for the console.
* Only used if console suspend is disabled
@@ -663,13 +667,25 @@ static irqreturn_t omap8250_irq(int irq, void *dev_id)
/*
* On K3 SoCs, it is observed that RX TIMEOUT is signalled after
- * FIFO has been drained, in which case a dummy read of RX FIFO
- * is required to clear RX TIMEOUT condition.
+ * FIFO has been drained or erroneously.
+ * So apply solution of Errata i2310 as mentioned in
+ * https://www.ti.com/lit/pdf/sprz536
*/
if (priv->habit & UART_RX_TIMEOUT_QUIRK &&
(iir & UART_IIR_RX_TIMEOUT) == UART_IIR_RX_TIMEOUT &&
serial_port_in(port, UART_OMAP_RX_LVL) == 0) {
- serial_port_in(port, UART_RX);
+ unsigned char efr2, timeout_h, timeout_l;
+
+ efr2 = serial_in(up, UART_OMAP_EFR2);
+ timeout_h = serial_in(up, UART_OMAP_TO_H);
+ timeout_l = serial_in(up, UART_OMAP_TO_L);
+ serial_out(up, UART_OMAP_TO_H, 0xFF);
+ serial_out(up, UART_OMAP_TO_L, 0xFF);
+ serial_out(up, UART_OMAP_EFR2, UART_OMAP_EFR2_TIMEOUT_BEHAVE);
+ serial_in(up, UART_IIR);
+ serial_out(up, UART_OMAP_EFR2, efr2);
+ serial_out(up, UART_OMAP_TO_H, timeout_h);
+ serial_out(up, UART_OMAP_TO_L, timeout_l);
}
/* Stop processing interrupts on input overrun */
--
2.34.1
The quilt patch titled
Subject: mm/memory: don't require head page for do_set_pmd()
has been removed from the -mm tree. Its filename was
mm-memory-dont-require-head-page-for-do_set_pmd.patch
This patch was dropped because it was merged into the mm-hotfixes-stable branch
of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
------------------------------------------------------
From: Andrew Bresticker <abrestic(a)rivosinc.com>
Subject: mm/memory: don't require head page for do_set_pmd()
Date: Tue, 11 Jun 2024 08:32:16 -0700
The requirement that the head page be passed to do_set_pmd() was added in
commit ef37b2ea08ac ("mm/memory: page_add_file_rmap() ->
folio_add_file_rmap_[pte|pmd]()") and prevents pmd-mapping in the
finish_fault() and filemap_map_pages() paths if the page to be inserted is
anything but the head page for an otherwise suitable vma and pmd-sized
page.
Matthew said:
: We're going to stop using PMDs to map large folios unless the fault is
: within the first 4KiB of the PMD. No idea how many workloads that
: affects, but it only needs to be backported as far as v6.8, so we may
: as well backport it.
Link: https://lkml.kernel.org/r/20240611153216.2794513-1-abrestic@rivosinc.com
Fixes: ef37b2ea08ac ("mm/memory: page_add_file_rmap() -> folio_add_file_rmap_[pte|pmd]()")
Signed-off-by: Andrew Bresticker <abrestic(a)rivosinc.com>
Acked-by: David Hildenbrand <david(a)redhat.com>
Acked-by: Hugh Dickins <hughd(a)google.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/memory.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
--- a/mm/memory.c~mm-memory-dont-require-head-page-for-do_set_pmd
+++ a/mm/memory.c
@@ -4608,8 +4608,9 @@ vm_fault_t do_set_pmd(struct vm_fault *v
if (!thp_vma_suitable_order(vma, haddr, PMD_ORDER))
return ret;
- if (page != &folio->page || folio_order(folio) != HPAGE_PMD_ORDER)
+ if (folio_order(folio) != HPAGE_PMD_ORDER)
return ret;
+ page = &folio->page;
/*
* Just backoff if any subpage of a THP is corrupted otherwise
_
Patches currently in -mm which might be from abrestic(a)rivosinc.com are
The quilt patch titled
Subject: mm/page_alloc: Separate THP PCP into movable and non-movable categories
has been removed from the -mm tree. Its filename was
mm-page_alloc-separate-thp-pcp-into-movable-and-non-movable-categories.patch
This patch was dropped because it was merged into the mm-hotfixes-stable branch
of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
------------------------------------------------------
From: yangge <yangge1116(a)126.com>
Subject: mm/page_alloc: Separate THP PCP into movable and non-movable categories
Date: Thu, 20 Jun 2024 08:59:50 +0800
Since commit 5d0a661d808f ("mm/page_alloc: use only one PCP list for
THP-sized allocations") no longer differentiates the migration type of
pages in THP-sized PCP list, it's possible that non-movable allocation
requests may get a CMA page from the list, in some cases, it's not
acceptable.
If a large number of CMA memory are configured in system (for example, the
CMA memory accounts for 50% of the system memory), starting a virtual
machine with device passthrough will get stuck. During starting the
virtual machine, it will call pin_user_pages_remote(..., FOLL_LONGTERM,
...) to pin memory. Normally if a page is present and in CMA area,
pin_user_pages_remote() will migrate the page from CMA area to non-CMA
area because of FOLL_LONGTERM flag. But if non-movable allocation
requests return CMA memory, migrate_longterm_unpinnable_pages() will
migrate a CMA page to another CMA page, which will fail to pass the check
in check_and_migrate_movable_pages() and cause migration endless.
Call trace:
pin_user_pages_remote
--__gup_longterm_locked // endless loops in this function
----_get_user_pages_locked
----check_and_migrate_movable_pages
------migrate_longterm_unpinnable_pages
--------alloc_migration_target
This problem will also have a negative impact on CMA itself. For example,
when CMA is borrowed by THP, and we need to reclaim it through cma_alloc()
or dma_alloc_coherent(), we must move those pages out to ensure CMA's
users can retrieve that contigous memory. Currently, CMA's memory is
occupied by non-movable pages, meaning we can't relocate them. As a
result, cma_alloc() is more likely to fail.
To fix the problem above, we add one PCP list for THP, which will not
introduce a new cacheline for struct per_cpu_pages. THP will have 2 PCP
lists, one PCP list is used by MOVABLE allocation, and the other PCP list
is used by UNMOVABLE allocation. MOVABLE allocation contains GPF_MOVABLE,
and UNMOVABLE allocation contains GFP_UNMOVABLE and GFP_RECLAIMABLE.
Link: https://lkml.kernel.org/r/1718845190-4456-1-git-send-email-yangge1116@126.c…
Fixes: 5d0a661d808f ("mm/page_alloc: use only one PCP list for THP-sized allocations")
Signed-off-by: yangge <yangge1116(a)126.com>
Cc: Baolin Wang <baolin.wang(a)linux.alibaba.com>
Cc: Barry Song <21cnbao(a)gmail.com>
Cc: Mel Gorman <mgorman(a)techsingularity.net>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
include/linux/mmzone.h | 9 ++++-----
mm/page_alloc.c | 9 +++++++--
2 files changed, 11 insertions(+), 7 deletions(-)
--- a/include/linux/mmzone.h~mm-page_alloc-separate-thp-pcp-into-movable-and-non-movable-categories
+++ a/include/linux/mmzone.h
@@ -654,13 +654,12 @@ enum zone_watermarks {
};
/*
- * One per migratetype for each PAGE_ALLOC_COSTLY_ORDER. One additional list
- * for THP which will usually be GFP_MOVABLE. Even if it is another type,
- * it should not contribute to serious fragmentation causing THP allocation
- * failures.
+ * One per migratetype for each PAGE_ALLOC_COSTLY_ORDER. Two additional lists
+ * are added for THP. One PCP list is used by GPF_MOVABLE, and the other PCP list
+ * is used by GFP_UNMOVABLE and GFP_RECLAIMABLE.
*/
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-#define NR_PCP_THP 1
+#define NR_PCP_THP 2
#else
#define NR_PCP_THP 0
#endif
--- a/mm/page_alloc.c~mm-page_alloc-separate-thp-pcp-into-movable-and-non-movable-categories
+++ a/mm/page_alloc.c
@@ -504,10 +504,15 @@ out:
static inline unsigned int order_to_pindex(int migratetype, int order)
{
+ bool __maybe_unused movable;
+
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
if (order > PAGE_ALLOC_COSTLY_ORDER) {
VM_BUG_ON(order != HPAGE_PMD_ORDER);
- return NR_LOWORDER_PCP_LISTS;
+
+ movable = migratetype == MIGRATE_MOVABLE;
+
+ return NR_LOWORDER_PCP_LISTS + movable;
}
#else
VM_BUG_ON(order > PAGE_ALLOC_COSTLY_ORDER);
@@ -521,7 +526,7 @@ static inline int pindex_to_order(unsign
int order = pindex / MIGRATE_PCPTYPES;
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- if (pindex == NR_LOWORDER_PCP_LISTS)
+ if (pindex >= NR_LOWORDER_PCP_LISTS)
order = HPAGE_PMD_ORDER;
#else
VM_BUG_ON(order > PAGE_ALLOC_COSTLY_ORDER);
_
Patches currently in -mm which might be from yangge1116(a)126.com are
mm-gup-clear-the-lru-flag-of-a-page-before-adding-to-lru-batch.patch
The quilt patch titled
Subject: nfs: drop the incorrect assertion in nfs_swap_rw()
has been removed from the -mm tree. Its filename was
nfs-drop-the-incorrect-assertion-in-nfs_swap_rw.patch
This patch was dropped because it was merged into the mm-hotfixes-stable branch
of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
------------------------------------------------------
From: Christoph Hellwig <hch(a)lst.de>
Subject: nfs: drop the incorrect assertion in nfs_swap_rw()
Date: Tue, 18 Jun 2024 18:56:47 +1200
Since commit 2282679fb20b ("mm: submit multipage write for SWP_FS_OPS
swap-space"), we can plug multiple pages then unplug them all together.
That means iov_iter_count(iter) could be way bigger than PAGE_SIZE, it
actually equals the size of iov_iter_npages(iter, INT_MAX).
Note this issue has nothing to do with large folios as we don't support
THP_SWPOUT to non-block devices.
[v-songbaohua(a)oppo.com: figure out the cause and correct the commit message]
Link: https://lkml.kernel.org/r/20240618065647.21791-1-21cnbao@gmail.com
Fixes: 2282679fb20b ("mm: submit multipage write for SWP_FS_OPS swap-space")
Signed-off-by: Christoph Hellwig <hch(a)lst.de>
Signed-off-by: Barry Song <v-songbaohua(a)oppo.com>
Closes: https://lore.kernel.org/linux-mm/20240617053201.GA16852@lst.de/
Reviewed-by: Martin Wege <martin.l.wege(a)gmail.com>
Cc: NeilBrown <neilb(a)suse.de>
Cc: Anna Schumaker <anna(a)kernel.org>
Cc: Steve French <sfrench(a)samba.org>
Cc: Trond Myklebust <trondmy(a)kernel.org>
Cc: Chuanhua Han <hanchuanhua(a)oppo.com>
Cc: Ryan Roberts <ryan.roberts(a)arm.com>
Cc: Chris Li <chrisl(a)kernel.org>
Cc: "Huang, Ying" <ying.huang(a)intel.com>
Cc: Jeff Layton <jlayton(a)kernel.org>
Cc: Matthew Wilcox <willy(a)infradead.org>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
fs/nfs/direct.c | 2 --
1 file changed, 2 deletions(-)
--- a/fs/nfs/direct.c~nfs-drop-the-incorrect-assertion-in-nfs_swap_rw
+++ a/fs/nfs/direct.c
@@ -141,8 +141,6 @@ int nfs_swap_rw(struct kiocb *iocb, stru
{
ssize_t ret;
- VM_BUG_ON(iov_iter_count(iter) != PAGE_SIZE);
-
if (iov_iter_rw(iter) == READ)
ret = nfs_file_direct_read(iocb, iter, true);
else
_
Patches currently in -mm which might be from hch(a)lst.de are
The quilt patch titled
Subject: ocfs2: fix DIO failure due to insufficient transaction credits
has been removed from the -mm tree. Its filename was
ocfs2-fix-dio-failure-due-to-insufficient-transaction-credits.patch
This patch was dropped because it was merged into the mm-hotfixes-stable branch
of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
------------------------------------------------------
From: Jan Kara <jack(a)suse.cz>
Subject: ocfs2: fix DIO failure due to insufficient transaction credits
Date: Fri, 14 Jun 2024 16:52:43 +0200
The code in ocfs2_dio_end_io_write() estimates number of necessary
transaction credits using ocfs2_calc_extend_credits(). This however does
not take into account that the IO could be arbitrarily large and can
contain arbitrary number of extents.
Extent tree manipulations do often extend the current transaction but not
in all of the cases. For example if we have only single block extents in
the tree, ocfs2_mark_extent_written() will end up calling
ocfs2_replace_extent_rec() all the time and we will never extend the
current transaction and eventually exhaust all the transaction credits if
the IO contains many single block extents. Once that happens a
WARN_ON(jbd2_handle_buffer_credits(handle) <= 0) is triggered in
jbd2_journal_dirty_metadata() and subsequently OCFS2 aborts in response to
this error. This was actually triggered by one of our customers on a
heavily fragmented OCFS2 filesystem.
To fix the issue make sure the transaction always has enough credits for
one extent insert before each call of ocfs2_mark_extent_written().
Heming Zhao said:
------
PANIC: "Kernel panic - not syncing: OCFS2: (device dm-1): panic forced after error"
PID: xxx TASK: xxxx CPU: 5 COMMAND: "SubmitThread-CA"
#0 machine_kexec at ffffffff8c069932
#1 __crash_kexec at ffffffff8c1338fa
#2 panic at ffffffff8c1d69b9
#3 ocfs2_handle_error at ffffffffc0c86c0c [ocfs2]
#4 __ocfs2_abort at ffffffffc0c88387 [ocfs2]
#5 ocfs2_journal_dirty at ffffffffc0c51e98 [ocfs2]
#6 ocfs2_split_extent at ffffffffc0c27ea3 [ocfs2]
#7 ocfs2_change_extent_flag at ffffffffc0c28053 [ocfs2]
#8 ocfs2_mark_extent_written at ffffffffc0c28347 [ocfs2]
#9 ocfs2_dio_end_io_write at ffffffffc0c2bef9 [ocfs2]
#10 ocfs2_dio_end_io at ffffffffc0c2c0f5 [ocfs2]
#11 dio_complete at ffffffff8c2b9fa7
#12 do_blockdev_direct_IO at ffffffff8c2bc09f
#13 ocfs2_direct_IO at ffffffffc0c2b653 [ocfs2]
#14 generic_file_direct_write at ffffffff8c1dcf14
#15 __generic_file_write_iter at ffffffff8c1dd07b
#16 ocfs2_file_write_iter at ffffffffc0c49f1f [ocfs2]
#17 aio_write at ffffffff8c2cc72e
#18 kmem_cache_alloc at ffffffff8c248dde
#19 do_io_submit at ffffffff8c2ccada
#20 do_syscall_64 at ffffffff8c004984
#21 entry_SYSCALL_64_after_hwframe at ffffffff8c8000ba
Link: https://lkml.kernel.org/r/20240617095543.6971-1-jack@suse.cz
Link: https://lkml.kernel.org/r/20240614145243.8837-1-jack@suse.cz
Fixes: c15471f79506 ("ocfs2: fix sparse file & data ordering issue in direct io")
Signed-off-by: Jan Kara <jack(a)suse.cz>
Reviewed-by: Joseph Qi <joseph.qi(a)linux.alibaba.com>
Reviewed-by: Heming Zhao <heming.zhao(a)suse.com>
Cc: Mark Fasheh <mark(a)fasheh.com>
Cc: Joel Becker <jlbec(a)evilplan.org>
Cc: Junxiao Bi <junxiao.bi(a)oracle.com>
Cc: Changwei Ge <gechangwei(a)live.cn>
Cc: Gang He <ghe(a)suse.com>
Cc: Jun Piao <piaojun(a)huawei.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
fs/ocfs2/aops.c | 5 +++++
fs/ocfs2/journal.c | 17 +++++++++++++++++
fs/ocfs2/journal.h | 2 ++
fs/ocfs2/ocfs2_trace.h | 2 ++
4 files changed, 26 insertions(+)
--- a/fs/ocfs2/aops.c~ocfs2-fix-dio-failure-due-to-insufficient-transaction-credits
+++ a/fs/ocfs2/aops.c
@@ -2366,6 +2366,11 @@ static int ocfs2_dio_end_io_write(struct
}
list_for_each_entry(ue, &dwc->dw_zero_list, ue_node) {
+ ret = ocfs2_assure_trans_credits(handle, credits);
+ if (ret < 0) {
+ mlog_errno(ret);
+ break;
+ }
ret = ocfs2_mark_extent_written(inode, &et, handle,
ue->ue_cpos, 1,
ue->ue_phys,
--- a/fs/ocfs2/journal.c~ocfs2-fix-dio-failure-due-to-insufficient-transaction-credits
+++ a/fs/ocfs2/journal.c
@@ -446,6 +446,23 @@ bail:
}
/*
+ * Make sure handle has at least 'nblocks' credits available. If it does not
+ * have that many credits available, we will try to extend the handle to have
+ * enough credits. If that fails, we will restart transaction to have enough
+ * credits. Similar notes regarding data consistency and locking implications
+ * as for ocfs2_extend_trans() apply here.
+ */
+int ocfs2_assure_trans_credits(handle_t *handle, int nblocks)
+{
+ int old_nblks = jbd2_handle_buffer_credits(handle);
+
+ trace_ocfs2_assure_trans_credits(old_nblks);
+ if (old_nblks >= nblocks)
+ return 0;
+ return ocfs2_extend_trans(handle, nblocks - old_nblks);
+}
+
+/*
* If we have fewer than thresh credits, extend by OCFS2_MAX_TRANS_DATA.
* If that fails, restart the transaction & regain write access for the
* buffer head which is used for metadata modifications.
--- a/fs/ocfs2/journal.h~ocfs2-fix-dio-failure-due-to-insufficient-transaction-credits
+++ a/fs/ocfs2/journal.h
@@ -243,6 +243,8 @@ handle_t *ocfs2_start_trans(struct
int ocfs2_commit_trans(struct ocfs2_super *osb,
handle_t *handle);
int ocfs2_extend_trans(handle_t *handle, int nblocks);
+int ocfs2_assure_trans_credits(handle_t *handle,
+ int nblocks);
int ocfs2_allocate_extend_trans(handle_t *handle,
int thresh);
--- a/fs/ocfs2/ocfs2_trace.h~ocfs2-fix-dio-failure-due-to-insufficient-transaction-credits
+++ a/fs/ocfs2/ocfs2_trace.h
@@ -2577,6 +2577,8 @@ DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_commit
DEFINE_OCFS2_INT_INT_EVENT(ocfs2_extend_trans);
+DEFINE_OCFS2_INT_EVENT(ocfs2_assure_trans_credits);
+
DEFINE_OCFS2_INT_EVENT(ocfs2_extend_trans_restart);
DEFINE_OCFS2_INT_INT_EVENT(ocfs2_allocate_extend_trans);
_
Patches currently in -mm which might be from jack(a)suse.cz are
revert-mm-writeback-fix-possible-divide-by-zero-in-wb_dirty_limits-again.patch
mm-avoid-overflows-in-dirty-throttling-logic.patch
The quilt patch titled
Subject: kasan: fix bad call to unpoison_slab_object
has been removed from the -mm tree. Its filename was
kasan-fix-bad-call-to-unpoison_slab_object.patch
This patch was dropped because it was merged into the mm-hotfixes-stable branch
of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
------------------------------------------------------
From: Andrey Konovalov <andreyknvl(a)gmail.com>
Subject: kasan: fix bad call to unpoison_slab_object
Date: Fri, 14 Jun 2024 16:32:38 +0200
Commit 29d7355a9d05 ("kasan: save alloc stack traces for mempool") messed
up one of the calls to unpoison_slab_object: the last two arguments are
supposed to be GFP flags and whether to init the object memory.
Fix the call.
Without this fix, __kasan_mempool_unpoison_object provides the object's
size as GFP flags to unpoison_slab_object, which can cause LOCKDEP reports
(and probably other issues).
Link: https://lkml.kernel.org/r/20240614143238.60323-1-andrey.konovalov@linux.dev
Fixes: 29d7355a9d05 ("kasan: save alloc stack traces for mempool")
Signed-off-by: Andrey Konovalov <andreyknvl(a)gmail.com>
Reported-by: Brad Spengler <spender(a)grsecurity.net>
Acked-by: Marco Elver <elver(a)google.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/kasan/common.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
--- a/mm/kasan/common.c~kasan-fix-bad-call-to-unpoison_slab_object
+++ a/mm/kasan/common.c
@@ -532,7 +532,7 @@ void __kasan_mempool_unpoison_object(voi
return;
/* Unpoison the object and save alloc info for non-kmalloc() allocations. */
- unpoison_slab_object(slab->slab_cache, ptr, size, flags);
+ unpoison_slab_object(slab->slab_cache, ptr, flags, false);
/* Poison the redzone and save alloc info for kmalloc() allocations. */
if (is_kmalloc_cache(slab->slab_cache))
_
Patches currently in -mm which might be from andreyknvl(a)gmail.com are
After commit f3dc1bdb6b0b("cifs: Fix writeback data corruption"), the
writepages for cifs will find all folio needed writepage with two phase.
The first folio will be found in cifs_writepages_begin, and the latter
various folios will be found in cifs_extend_writeback.
All those will first get folio, and for normal case, once we set page
writeback and after do really write, we should put the reference, folio
found in cifs_extend_writeback do this with folio_batch_release. But the
folio found in cifs_writepages_begin never get the chance do it. And
every writepages call, we will leak a folio(found this problem while do
xfstests over cifs).
Besides, the exist path seem never handle this folio correctly, fix it too
with this patch.
The problem does not exist in mainline since writepages path for cifs
has changed to netfs. It's had to backport all related change, so try fix
this problem with this single patch.
Fixes: f3dc1bdb6b0b ("cifs: Fix writeback data corruption")
Signed-off-by: Yang Erkun <yangerkun(a)huawei.com>
---
fs/smb/client/file.c | 16 +++++++++++++---
1 file changed, 13 insertions(+), 3 deletions(-)
diff --git a/fs/smb/client/file.c b/fs/smb/client/file.c
index 9be37d0fe724..0a48d80b3871 100644
--- a/fs/smb/client/file.c
+++ b/fs/smb/client/file.c
@@ -2860,17 +2860,21 @@ static ssize_t cifs_write_back_from_locked_folio(struct address_space *mapping,
rc = cifs_get_writable_file(CIFS_I(inode), FIND_WR_ANY, &cfile);
if (rc) {
cifs_dbg(VFS, "No writable handle in writepages rc=%d\n", rc);
+ folio_unlock(folio);
goto err_xid;
}
rc = server->ops->wait_mtu_credits(server, cifs_sb->ctx->wsize,
&wsize, credits);
- if (rc != 0)
+ if (rc != 0) {
+ folio_unlock(folio);
goto err_close;
+ }
wdata = cifs_writedata_alloc(cifs_writev_complete);
if (!wdata) {
rc = -ENOMEM;
+ folio_unlock(folio);
goto err_uncredit;
}
@@ -3017,17 +3021,22 @@ static ssize_t cifs_writepages_begin(struct address_space *mapping,
lock_again:
if (wbc->sync_mode != WB_SYNC_NONE) {
ret = folio_lock_killable(folio);
- if (ret < 0)
+ if (ret < 0) {
+ folio_put(folio);
return ret;
+ }
} else {
- if (!folio_trylock(folio))
+ if (!folio_trylock(folio)) {
+ folio_put(folio);
goto search_again;
+ }
}
if (folio->mapping != mapping ||
!folio_test_dirty(folio)) {
start += folio_size(folio);
folio_unlock(folio);
+ folio_put(folio);
goto search_again;
}
@@ -3057,6 +3066,7 @@ static ssize_t cifs_writepages_begin(struct address_space *mapping,
out:
if (ret > 0)
*_start = start + ret;
+ folio_put(folio);
return ret;
}
--
2.39.2
The patch titled
Subject: mm/gup: clear the LRU flag of a page before adding to LRU batch
has been added to the -mm mm-hotfixes-unstable branch. Its filename is
mm-gup-clear-the-lru-flag-of-a-page-before-adding-to-lru-batch.patch
This patch will shortly appear at
https://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new.git/tree/patche…
This patch will later appear in the mm-hotfixes-unstable branch at
git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next via the mm-everything
branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
and is updated there every 2-3 working days
------------------------------------------------------
From: yangge <yangge1116(a)126.com>
Subject: mm/gup: clear the LRU flag of a page before adding to LRU batch
Date: Sat, 22 Jun 2024 14:48:04 +0800
If a large number of CMA memory are configured in system (for example, the
CMA memory accounts for 50% of the system memory), starting a virtual
machine, it will call pin_user_pages_remote(..., FOLL_LONGTERM, ...) to
pin memory. Normally if a page is present and in CMA area,
pin_user_pages_remote() will migrate the page from CMA area to non-CMA
area because of FOLL_LONGTERM flag. But the current code will cause the
migration failure due to unexpected page refcounts, and eventually cause
the virtual machine fail to start.
If a page is added in LRU batch, its refcount increases one, remove the
page from LRU batch decreases one. Page migration requires the page is
not referenced by others except page mapping. Before migrating a page, we
should try to drain the page from LRU batch in case the page is in it,
however, folio_test_lru() is not sufficient to tell whether the page is in
LRU batch or not, if the page is in LRU batch, the migration will fail.
To solve the problem above, we modify the logic of adding to LRU batch.
Before adding a page to LRU batch, we clear the LRU flag of the page so
that we can check whether the page is in LRU batch by
folio_test_lru(page). Seems making the LRU flag of the page invisible a
long time is no problem, because a new page is allocated from buddy and
added to the lru batch, its LRU flag is also not visible for a long time.
Link: https://lkml.kernel.org/r/1719038884-1903-1-git-send-email-yangge1116@126.c…
Signed-off-by: yangge <yangge1116(a)126.com>
Cc: Baolin Wang <baolin.wang(a)linux.alibaba.com>
Cc: David Hildenbrand <david(a)redhat.com>
Cc: Alex Shi <alex.shi(a)linux.alibaba.com>
Cc: Shaohua Li <shli(a)kernel.org>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/swap.c | 43 +++++++++++++++++++++++++++++++------------
1 file changed, 31 insertions(+), 12 deletions(-)
--- a/mm/swap.c~mm-gup-clear-the-lru-flag-of-a-page-before-adding-to-lru-batch
+++ a/mm/swap.c
@@ -212,10 +212,6 @@ static void folio_batch_move_lru(struct
for (i = 0; i < folio_batch_count(fbatch); i++) {
struct folio *folio = fbatch->folios[i];
- /* block memcg migration while the folio moves between lru */
- if (move_fn != lru_add_fn && !folio_test_clear_lru(folio))
- continue;
-
folio_lruvec_relock_irqsave(folio, &lruvec, &flags);
move_fn(lruvec, folio);
@@ -256,11 +252,16 @@ static void lru_move_tail_fn(struct lruv
void folio_rotate_reclaimable(struct folio *folio)
{
if (!folio_test_locked(folio) && !folio_test_dirty(folio) &&
- !folio_test_unevictable(folio) && folio_test_lru(folio)) {
+ !folio_test_unevictable(folio)) {
struct folio_batch *fbatch;
unsigned long flags;
folio_get(folio);
+ if (!folio_test_clear_lru(folio)) {
+ folio_put(folio);
+ return;
+ }
+
local_lock_irqsave(&lru_rotate.lock, flags);
fbatch = this_cpu_ptr(&lru_rotate.fbatch);
folio_batch_add_and_move(fbatch, folio, lru_move_tail_fn);
@@ -353,11 +354,15 @@ static void folio_activate_drain(int cpu
void folio_activate(struct folio *folio)
{
- if (folio_test_lru(folio) && !folio_test_active(folio) &&
- !folio_test_unevictable(folio)) {
+ if (!folio_test_active(folio) && !folio_test_unevictable(folio)) {
struct folio_batch *fbatch;
folio_get(folio);
+ if (!folio_test_clear_lru(folio)) {
+ folio_put(folio);
+ return;
+ }
+
local_lock(&cpu_fbatches.lock);
fbatch = this_cpu_ptr(&cpu_fbatches.activate);
folio_batch_add_and_move(fbatch, folio, folio_activate_fn);
@@ -701,6 +706,11 @@ void deactivate_file_folio(struct folio
return;
folio_get(folio);
+ if (!folio_test_clear_lru(folio)) {
+ folio_put(folio);
+ return;
+ }
+
local_lock(&cpu_fbatches.lock);
fbatch = this_cpu_ptr(&cpu_fbatches.lru_deactivate_file);
folio_batch_add_and_move(fbatch, folio, lru_deactivate_file_fn);
@@ -717,11 +727,16 @@ void deactivate_file_folio(struct folio
*/
void folio_deactivate(struct folio *folio)
{
- if (folio_test_lru(folio) && !folio_test_unevictable(folio) &&
- (folio_test_active(folio) || lru_gen_enabled())) {
+ if (!folio_test_unevictable(folio) && (folio_test_active(folio) ||
+ lru_gen_enabled())) {
struct folio_batch *fbatch;
folio_get(folio);
+ if (!folio_test_clear_lru(folio)) {
+ folio_put(folio);
+ return;
+ }
+
local_lock(&cpu_fbatches.lock);
fbatch = this_cpu_ptr(&cpu_fbatches.lru_deactivate);
folio_batch_add_and_move(fbatch, folio, lru_deactivate_fn);
@@ -738,12 +753,16 @@ void folio_deactivate(struct folio *foli
*/
void folio_mark_lazyfree(struct folio *folio)
{
- if (folio_test_lru(folio) && folio_test_anon(folio) &&
- folio_test_swapbacked(folio) && !folio_test_swapcache(folio) &&
- !folio_test_unevictable(folio)) {
+ if (folio_test_anon(folio) && folio_test_swapbacked(folio) &&
+ !folio_test_swapcache(folio) && !folio_test_unevictable(folio)) {
struct folio_batch *fbatch;
folio_get(folio);
+ if (!folio_test_clear_lru(folio)) {
+ folio_put(folio);
+ return;
+ }
+
local_lock(&cpu_fbatches.lock);
fbatch = this_cpu_ptr(&cpu_fbatches.lru_lazyfree);
folio_batch_add_and_move(fbatch, folio, lru_lazyfree_fn);
_
Patches currently in -mm which might be from yangge1116(a)126.com are
mm-page_alloc-separate-thp-pcp-into-movable-and-non-movable-categories.patch
mm-gup-clear-the-lru-flag-of-a-page-before-adding-to-lru-batch.patch
The patch titled
Subject: nilfs2: fix incorrect inode allocation from reserved inodes
has been added to the -mm mm-hotfixes-unstable branch. Its filename is
nilfs2-fix-incorrect-inode-allocation-from-reserved-inodes.patch
This patch will shortly appear at
https://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new.git/tree/patche…
This patch will later appear in the mm-hotfixes-unstable branch at
git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next via the mm-everything
branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
and is updated there every 2-3 working days
------------------------------------------------------
From: Ryusuke Konishi <konishi.ryusuke(a)gmail.com>
Subject: nilfs2: fix incorrect inode allocation from reserved inodes
Date: Sun, 23 Jun 2024 14:11:35 +0900
If the bitmap block that manages the inode allocation status is corrupted,
nilfs_ifile_create_inode() may allocate a new inode from the reserved
inode area where it should not be allocated.
Previous fix commit d325dc6eb763 ("nilfs2: fix use-after-free bug of
struct nilfs_root"), fixed the problem that reserved inodes with inode
numbers less than NILFS_USER_INO (=11) were incorrectly reallocated due to
bitmap corruption, but since the start number of non-reserved inodes is
read from the super block and may change, in which case inode allocation
may occur from the extended reserved inode area.
If that happens, access to that inode will cause an IO error, causing the
file system to degrade to an error state.
Fix this potential issue by adding a wraparound option to the common
metadata object allocation routine and by modifying
nilfs_ifile_create_inode() to disable the option so that it only allocates
inodes with inode numbers greater than or equal to the inode number read
in "nilfs->ns_first_ino", regardless of the bitmap status of reserved
inodes.
Link: https://lkml.kernel.org/r/20240623051135.4180-4-konishi.ryusuke@gmail.com
Signed-off-by: Ryusuke Konishi <konishi.ryusuke(a)gmail.com>
Cc: Hillf Danton <hdanton(a)sina.com>
Cc: Jan Kara <jack(a)suse.cz>
Cc: Matthew Wilcox (Oracle) <willy(a)infradead.org>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
fs/nilfs2/alloc.c | 19 +++++++++++++++----
fs/nilfs2/alloc.h | 4 ++--
fs/nilfs2/dat.c | 2 +-
fs/nilfs2/ifile.c | 7 ++-----
4 files changed, 20 insertions(+), 12 deletions(-)
--- a/fs/nilfs2/alloc.c~nilfs2-fix-incorrect-inode-allocation-from-reserved-inodes
+++ a/fs/nilfs2/alloc.c
@@ -377,11 +377,12 @@ void *nilfs_palloc_block_get_entry(const
* @target: offset number of an entry in the group (start point)
* @bsize: size in bits
* @lock: spin lock protecting @bitmap
+ * @wrap: whether to wrap around
*/
static int nilfs_palloc_find_available_slot(unsigned char *bitmap,
unsigned long target,
unsigned int bsize,
- spinlock_t *lock)
+ spinlock_t *lock, bool wrap)
{
int pos, end = bsize;
@@ -397,6 +398,8 @@ static int nilfs_palloc_find_available_s
end = target;
}
+ if (!wrap)
+ return -ENOSPC;
/* wrap around */
for (pos = 0; pos < end; pos++) {
@@ -495,9 +498,10 @@ int nilfs_palloc_count_max_entries(struc
* nilfs_palloc_prepare_alloc_entry - prepare to allocate a persistent object
* @inode: inode of metadata file using this allocator
* @req: nilfs_palloc_req structure exchanged for the allocation
+ * @wrap: whether to wrap around
*/
int nilfs_palloc_prepare_alloc_entry(struct inode *inode,
- struct nilfs_palloc_req *req)
+ struct nilfs_palloc_req *req, bool wrap)
{
struct buffer_head *desc_bh, *bitmap_bh;
struct nilfs_palloc_group_desc *desc;
@@ -516,7 +520,7 @@ int nilfs_palloc_prepare_alloc_entry(str
entries_per_group = nilfs_palloc_entries_per_group(inode);
for (i = 0; i < ngroups; i += n) {
- if (group >= ngroups) {
+ if (group >= ngroups && wrap) {
/* wrap around */
group = 0;
maxgroup = nilfs_palloc_group(inode, req->pr_entry_nr,
@@ -550,7 +554,14 @@ int nilfs_palloc_prepare_alloc_entry(str
bitmap_kaddr = kmap_local_page(bitmap_bh->b_page);
bitmap = bitmap_kaddr + bh_offset(bitmap_bh);
pos = nilfs_palloc_find_available_slot(
- bitmap, group_offset, entries_per_group, lock);
+ bitmap, group_offset, entries_per_group, lock,
+ wrap);
+ /*
+ * Since the search for a free slot in the second and
+ * subsequent bitmap blocks always starts from the
+ * beginning, the wrap flag only has an effect on the
+ * first search.
+ */
kunmap_local(bitmap_kaddr);
if (pos >= 0)
goto found;
--- a/fs/nilfs2/alloc.h~nilfs2-fix-incorrect-inode-allocation-from-reserved-inodes
+++ a/fs/nilfs2/alloc.h
@@ -50,8 +50,8 @@ struct nilfs_palloc_req {
struct buffer_head *pr_entry_bh;
};
-int nilfs_palloc_prepare_alloc_entry(struct inode *,
- struct nilfs_palloc_req *);
+int nilfs_palloc_prepare_alloc_entry(struct inode *inode,
+ struct nilfs_palloc_req *req, bool wrap);
void nilfs_palloc_commit_alloc_entry(struct inode *,
struct nilfs_palloc_req *);
void nilfs_palloc_abort_alloc_entry(struct inode *, struct nilfs_palloc_req *);
--- a/fs/nilfs2/dat.c~nilfs2-fix-incorrect-inode-allocation-from-reserved-inodes
+++ a/fs/nilfs2/dat.c
@@ -75,7 +75,7 @@ int nilfs_dat_prepare_alloc(struct inode
{
int ret;
- ret = nilfs_palloc_prepare_alloc_entry(dat, req);
+ ret = nilfs_palloc_prepare_alloc_entry(dat, req, true);
if (ret < 0)
return ret;
--- a/fs/nilfs2/ifile.c~nilfs2-fix-incorrect-inode-allocation-from-reserved-inodes
+++ a/fs/nilfs2/ifile.c
@@ -56,13 +56,10 @@ int nilfs_ifile_create_inode(struct inod
struct nilfs_palloc_req req;
int ret;
- req.pr_entry_nr = 0; /*
- * 0 says find free inode from beginning
- * of a group. dull code!!
- */
+ req.pr_entry_nr = NILFS_FIRST_INO(ifile->i_sb);
req.pr_entry_bh = NULL;
- ret = nilfs_palloc_prepare_alloc_entry(ifile, &req);
+ ret = nilfs_palloc_prepare_alloc_entry(ifile, &req, false);
if (!ret) {
ret = nilfs_palloc_get_entry_block(ifile, req.pr_entry_nr, 1,
&req.pr_entry_bh);
_
Patches currently in -mm which might be from konishi.ryusuke(a)gmail.com are
nilfs2-fix-inode-number-range-checks.patch
nilfs2-add-missing-check-for-inode-numbers-on-directory-entries.patch
nilfs2-fix-incorrect-inode-allocation-from-reserved-inodes.patch
nilfs2-prepare-backing-device-folios-for-writing-after-adding-checksums.patch
nilfs2-do-not-call-inode_attach_wb-directly.patch
The patch titled
Subject: nilfs2: add missing check for inode numbers on directory entries
has been added to the -mm mm-hotfixes-unstable branch. Its filename is
nilfs2-add-missing-check-for-inode-numbers-on-directory-entries.patch
This patch will shortly appear at
https://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new.git/tree/patche…
This patch will later appear in the mm-hotfixes-unstable branch at
git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next via the mm-everything
branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
and is updated there every 2-3 working days
------------------------------------------------------
From: Ryusuke Konishi <konishi.ryusuke(a)gmail.com>
Subject: nilfs2: add missing check for inode numbers on directory entries
Date: Sun, 23 Jun 2024 14:11:34 +0900
Syzbot reported that mounting and unmounting a specific pattern of
corrupted nilfs2 filesystem images causes a use-after-free of metadata
file inodes, which triggers a kernel bug in lru_add_fn().
As Jan Kara pointed out, this is because the link count of a metadata file
gets corrupted to 0, and nilfs_evict_inode(), which is called from iput(),
tries to delete that inode (ifile inode in this case).
The inconsistency occurs because directories containing the inode numbers
of these metadata files that should not be visible in the namespace are
read without checking.
Fix this issue by treating the inode numbers of these internal files as
errors in the sanity check helper when reading directory folios/pages.
Also thanks to Hillf Danton and Matthew Wilcox for their initial mm-layer
analysis.
Link: https://lkml.kernel.org/r/20240623051135.4180-3-konishi.ryusuke@gmail.com
Signed-off-by: Ryusuke Konishi <konishi.ryusuke(a)gmail.com>
Reported-by: syzbot+d79afb004be235636ee8(a)syzkaller.appspotmail.com
Closes: https://syzkaller.appspot.com/bug?extid=d79afb004be235636ee8
Reported-by: Jan Kara <jack(a)suse.cz>
Closes: https://lkml.kernel.org/r/20240617075758.wewhukbrjod5fp5o@quack3
Tested-by: Ryusuke Konishi <konishi.ryusuke(a)gmail.com>
Cc: Hillf Danton <hdanton(a)sina.com>
Cc: Matthew Wilcox (Oracle) <willy(a)infradead.org>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
fs/nilfs2/dir.c | 6 ++++++
fs/nilfs2/nilfs.h | 5 +++++
2 files changed, 11 insertions(+)
--- a/fs/nilfs2/dir.c~nilfs2-add-missing-check-for-inode-numbers-on-directory-entries
+++ a/fs/nilfs2/dir.c
@@ -135,6 +135,9 @@ static bool nilfs_check_folio(struct fol
goto Enamelen;
if (((offs + rec_len - 1) ^ offs) & ~(chunk_size-1))
goto Espan;
+ if (unlikely(p->inode &&
+ NILFS_PRIVATE_INODE(le64_to_cpu(p->inode))))
+ goto Einumber;
}
if (offs != limit)
goto Eend;
@@ -160,6 +163,9 @@ Enamelen:
goto bad_entry;
Espan:
error = "directory entry across blocks";
+ goto bad_entry;
+Einumber:
+ error = "disallowed inode number";
bad_entry:
nilfs_error(sb,
"bad entry in directory #%lu: %s - offset=%lu, inode=%lu, rec_len=%zd, name_len=%d",
--- a/fs/nilfs2/nilfs.h~nilfs2-add-missing-check-for-inode-numbers-on-directory-entries
+++ a/fs/nilfs2/nilfs.h
@@ -121,6 +121,11 @@ enum {
((ino) >= NILFS_FIRST_INO(sb) || \
((ino) < NILFS_USER_INO && (NILFS_SYS_INO_BITS & BIT(ino))))
+#define NILFS_PRIVATE_INODE(ino) ({ \
+ ino_t __ino = (ino); \
+ ((__ino) < NILFS_USER_INO && (__ino) != NILFS_ROOT_INO && \
+ (__ino) != NILFS_SKETCH_INO); })
+
/**
* struct nilfs_transaction_info: context information for synchronization
* @ti_magic: Magic number
_
Patches currently in -mm which might be from konishi.ryusuke(a)gmail.com are
nilfs2-fix-inode-number-range-checks.patch
nilfs2-add-missing-check-for-inode-numbers-on-directory-entries.patch
nilfs2-fix-incorrect-inode-allocation-from-reserved-inodes.patch
nilfs2-prepare-backing-device-folios-for-writing-after-adding-checksums.patch
nilfs2-do-not-call-inode_attach_wb-directly.patch
The patch titled
Subject: nilfs2: fix inode number range checks
has been added to the -mm mm-hotfixes-unstable branch. Its filename is
nilfs2-fix-inode-number-range-checks.patch
This patch will shortly appear at
https://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new.git/tree/patche…
This patch will later appear in the mm-hotfixes-unstable branch at
git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next via the mm-everything
branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
and is updated there every 2-3 working days
------------------------------------------------------
From: Ryusuke Konishi <konishi.ryusuke(a)gmail.com>
Subject: nilfs2: fix inode number range checks
Date: Sun, 23 Jun 2024 14:11:33 +0900
Patch series "nilfs2: fix potential issues related to reserved inodes".
This series fixes one use-after-free issue reported by syzbot, caused by
nilfs2's internal inode being exposed in the namespace on a corrupted
filesystem, and a couple of flaws that cause problems if the starting
number of non-reserved inodes written in the on-disk super block is
intentionally (or corruptly) changed from its default value.
This patch (of 3):
In the current implementation of nilfs2, "nilfs->ns_first_ino", which
gives the first non-reserved inode number, is read from the superblock,
but its lower limit is not checked.
As a result, if a number that overlaps with the inode number range of
reserved inodes such as the root directory or metadata files is set in the
super block parameter, the inode number test macros (NILFS_MDT_INODE and
NILFS_VALID_INODE) will not function properly.
In addition, these test macros use left bit-shift calculations using with
the inode number as the shift count via the BIT macro, but the result of a
shift calculation that exceeds the bit width of an integer is undefined in
the C specification, so if "ns_first_ino" is set to a large value other
than the default value NILFS_USER_INO (=11), the macros may potentially
malfunction depending on the environment.
Fix these issues by checking the lower bound of "nilfs->ns_first_ino" and
by preventing bit shifts equal to or greater than the NILFS_USER_INO
constant in the inode number test macros.
Also, change the type of "ns_first_ino" from signed integer to unsigned
integer to avoid the need for type casting in comparisons such as the
lower bound check introduced this time.
Link: https://lkml.kernel.org/r/20240623051135.4180-1-konishi.ryusuke@gmail.com
Link: https://lkml.kernel.org/r/20240623051135.4180-2-konishi.ryusuke@gmail.com
Signed-off-by: Ryusuke Konishi <konishi.ryusuke(a)gmail.com>
Cc: Hillf Danton <hdanton(a)sina.com>
Cc: Jan Kara <jack(a)suse.cz>
Cc: Matthew Wilcox (Oracle) <willy(a)infradead.org>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
fs/nilfs2/nilfs.h | 5 +++--
fs/nilfs2/the_nilfs.c | 6 ++++++
fs/nilfs2/the_nilfs.h | 2 +-
3 files changed, 10 insertions(+), 3 deletions(-)
--- a/fs/nilfs2/nilfs.h~nilfs2-fix-inode-number-range-checks
+++ a/fs/nilfs2/nilfs.h
@@ -116,9 +116,10 @@ enum {
#define NILFS_FIRST_INO(sb) (((struct the_nilfs *)sb->s_fs_info)->ns_first_ino)
#define NILFS_MDT_INODE(sb, ino) \
- ((ino) < NILFS_FIRST_INO(sb) && (NILFS_MDT_INO_BITS & BIT(ino)))
+ ((ino) < NILFS_USER_INO && (NILFS_MDT_INO_BITS & BIT(ino)))
#define NILFS_VALID_INODE(sb, ino) \
- ((ino) >= NILFS_FIRST_INO(sb) || (NILFS_SYS_INO_BITS & BIT(ino)))
+ ((ino) >= NILFS_FIRST_INO(sb) || \
+ ((ino) < NILFS_USER_INO && (NILFS_SYS_INO_BITS & BIT(ino))))
/**
* struct nilfs_transaction_info: context information for synchronization
--- a/fs/nilfs2/the_nilfs.c~nilfs2-fix-inode-number-range-checks
+++ a/fs/nilfs2/the_nilfs.c
@@ -452,6 +452,12 @@ static int nilfs_store_disk_layout(struc
}
nilfs->ns_first_ino = le32_to_cpu(sbp->s_first_ino);
+ if (nilfs->ns_first_ino < NILFS_USER_INO) {
+ nilfs_err(nilfs->ns_sb,
+ "too small lower limit for non-reserved inode numbers: %u",
+ nilfs->ns_first_ino);
+ return -EINVAL;
+ }
nilfs->ns_blocks_per_segment = le32_to_cpu(sbp->s_blocks_per_segment);
if (nilfs->ns_blocks_per_segment < NILFS_SEG_MIN_BLOCKS) {
--- a/fs/nilfs2/the_nilfs.h~nilfs2-fix-inode-number-range-checks
+++ a/fs/nilfs2/the_nilfs.h
@@ -182,7 +182,7 @@ struct the_nilfs {
unsigned long ns_nrsvsegs;
unsigned long ns_first_data_block;
int ns_inode_size;
- int ns_first_ino;
+ unsigned int ns_first_ino;
u32 ns_crc_seed;
/* /sys/fs/<nilfs>/<device> */
_
Patches currently in -mm which might be from konishi.ryusuke(a)gmail.com are
nilfs2-fix-inode-number-range-checks.patch
nilfs2-add-missing-check-for-inode-numbers-on-directory-entries.patch
nilfs2-fix-incorrect-inode-allocation-from-reserved-inodes.patch
nilfs2-prepare-backing-device-folios-for-writing-after-adding-checksums.patch
nilfs2-do-not-call-inode_attach_wb-directly.patch
The following commit has been merged into the irq/urgent branch of tip:
Commit-ID: 9eee5330656bf92f51cb1f09b2dc9f8cf975b3d1
Gitweb: https://git.kernel.org/tip/9eee5330656bf92f51cb1f09b2dc9f8cf975b3d1
Author: Mostafa Saleh <smostafa(a)google.com>
AuthorDate: Mon, 24 Jun 2024 20:37:28
Committer: Thomas Gleixner <tglx(a)linutronix.de>
CommitterDate: Mon, 24 Jun 2024 23:33:38 +02:00
PCI/MSI: Fix UAF in msi_capability_init
KFENCE reports the following UAF:
BUG: KFENCE: use-after-free read in __pci_enable_msi_range+0x2c0/0x488
Use-after-free read at 0x0000000024629571 (in kfence-#12):
__pci_enable_msi_range+0x2c0/0x488
pci_alloc_irq_vectors_affinity+0xec/0x14c
pci_alloc_irq_vectors+0x18/0x28
kfence-#12: 0x0000000008614900-0x00000000e06c228d, size=104, cache=kmalloc-128
allocated by task 81 on cpu 7 at 10.808142s:
__kmem_cache_alloc_node+0x1f0/0x2bc
kmalloc_trace+0x44/0x138
msi_alloc_desc+0x3c/0x9c
msi_domain_insert_msi_desc+0x30/0x78
msi_setup_msi_desc+0x13c/0x184
__pci_enable_msi_range+0x258/0x488
pci_alloc_irq_vectors_affinity+0xec/0x14c
pci_alloc_irq_vectors+0x18/0x28
freed by task 81 on cpu 7 at 10.811436s:
msi_domain_free_descs+0xd4/0x10c
msi_domain_free_locked.part.0+0xc0/0x1d8
msi_domain_alloc_irqs_all_locked+0xb4/0xbc
pci_msi_setup_msi_irqs+0x30/0x4c
__pci_enable_msi_range+0x2a8/0x488
pci_alloc_irq_vectors_affinity+0xec/0x14c
pci_alloc_irq_vectors+0x18/0x28
Descriptor allocation done in:
__pci_enable_msi_range
msi_capability_init
msi_setup_msi_desc
msi_insert_msi_desc
msi_domain_insert_msi_desc
msi_alloc_desc
...
Freed in case of failure in __msi_domain_alloc_locked()
__pci_enable_msi_range
msi_capability_init
pci_msi_setup_msi_irqs
msi_domain_alloc_irqs_all_locked
msi_domain_alloc_locked
__msi_domain_alloc_locked => fails
msi_domain_free_locked
...
That failure propagates back to pci_msi_setup_msi_irqs() in
msi_capability_init() which accesses the descriptor for unmasking in the
error exit path.
Cure it by copying the descriptor and using the copy for the error exit path
unmask operation.
[ tglx: Massaged change log ]
Fixes: bf6e054e0e3f ("genirq/msi: Provide msi_device_populate/destroy_sysfs()")
Suggested-by: Thomas Gleixner <tglx(a)linutronix.de>
Signed-off-by: Mostafa Saleh <smostafa(a)google.com>
Signed-off-by: Thomas Gleixner <tglx(a)linutronix.de>
Cc: Bjorn Heelgas <bhelgaas(a)google.com>
Cc: stable(a)vger.kernel.org
Link: https://lore.kernel.org/r/20240624203729.1094506-1-smostafa@google.com
---
drivers/pci/msi/msi.c | 10 ++++++++--
1 file changed, 8 insertions(+), 2 deletions(-)
diff --git a/drivers/pci/msi/msi.c b/drivers/pci/msi/msi.c
index c5625dd..3a45879 100644
--- a/drivers/pci/msi/msi.c
+++ b/drivers/pci/msi/msi.c
@@ -352,7 +352,7 @@ static int msi_capability_init(struct pci_dev *dev, int nvec,
struct irq_affinity *affd)
{
struct irq_affinity_desc *masks = NULL;
- struct msi_desc *entry;
+ struct msi_desc *entry, desc;
int ret;
/* Reject multi-MSI early on irq domain enabled architectures */
@@ -377,6 +377,12 @@ static int msi_capability_init(struct pci_dev *dev, int nvec,
/* All MSIs are unmasked by default; mask them all */
entry = msi_first_desc(&dev->dev, MSI_DESC_ALL);
pci_msi_mask(entry, msi_multi_mask(entry));
+ /*
+ * Copy the MSI descriptor for the error path because
+ * pci_msi_setup_msi_irqs() will free it for the hierarchical
+ * interrupt domain case.
+ */
+ memcpy(&desc, entry, sizeof(desc));
/* Configure MSI capability structure */
ret = pci_msi_setup_msi_irqs(dev, nvec, PCI_CAP_ID_MSI);
@@ -396,7 +402,7 @@ static int msi_capability_init(struct pci_dev *dev, int nvec,
goto unlock;
err:
- pci_msi_unmask(entry, msi_multi_mask(entry));
+ pci_msi_unmask(&desc, msi_multi_mask(&desc));
pci_free_msi_irqs(dev);
fail:
dev->msi_enabled = 0;
The patch titled
Subject: mm/damon/core: merge regions aggressively when max_nr_regions is unmet
has been added to the -mm mm-hotfixes-unstable branch. Its filename is
mm-damon-core-merge-regions-aggressively-when-max_nr_regions-is-unmet.patch
This patch will shortly appear at
https://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new.git/tree/patche…
This patch will later appear in the mm-hotfixes-unstable branch at
git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next via the mm-everything
branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
and is updated there every 2-3 working days
------------------------------------------------------
From: SeongJae Park <sj(a)kernel.org>
Subject: mm/damon/core: merge regions aggressively when max_nr_regions is unmet
Date: Mon, 24 Jun 2024 10:58:14 -0700
DAMON keeps the number of regions under max_nr_regions by skipping regions
split operations when doing so can make the number higher than the limit.
It works well for preventing violation of the limit. But, if somehow the
violation happens, it cannot recovery well depending on the situation. In
detail, if the real number of regions having different access pattern is
higher than the limit, the mechanism cannot reduce the number below the
limit. In such a case, the system could suffer from high monitoring
overhead of DAMON.
The violation can actually happen. For an example, the user could reduce
max_nr_regions while DAMON is running, to be lower than the current number
of regions. Fix the problem by repeating the merge operations with
increasing aggressiveness in kdamond_merge_regions() for the case, until
the limit is met.
Link: https://lkml.kernel.org/r/20240624175814.89611-1-sj@kernel.org
Fixes: b9a6ac4e4ede ("mm/damon: adaptively adjust regions")
Signed-off-by: SeongJae Park <sj(a)kernel.org>
Cc: <stable(a)vger.kernel.org> [5.15+]
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/damon/core.c | 20 ++++++++++++++++++--
1 file changed, 18 insertions(+), 2 deletions(-)
--- a/mm/damon/core.c~mm-damon-core-merge-regions-aggressively-when-max_nr_regions-is-unmet
+++ a/mm/damon/core.c
@@ -1358,14 +1358,30 @@ static void damon_merge_regions_of(struc
* access frequencies are similar. This is for minimizing the monitoring
* overhead under the dynamically changeable access pattern. If a merge was
* unnecessarily made, later 'kdamond_split_regions()' will revert it.
+ *
+ * The total number of regions could be temporarily higher than the
+ * user-defined limit, max_nr_regions for some cases. For an example, the user
+ * updates max_nr_regions to a number that lower than the current number of
+ * regions while DAMON is running. Depending on the access pattern, it could
+ * take indefinitve time to reduce the number below the limit. For such a
+ * case, repeat merging until the limit is met while increasing @threshold and
+ * @sz_limit.
*/
static void kdamond_merge_regions(struct damon_ctx *c, unsigned int threshold,
unsigned long sz_limit)
{
struct damon_target *t;
+ unsigned int nr_regions;
- damon_for_each_target(t, c)
- damon_merge_regions_of(t, threshold, sz_limit);
+ do {
+ nr_regions = 0;
+ damon_for_each_target(t, c) {
+ damon_merge_regions_of(t, threshold, sz_limit);
+ nr_regions += damon_nr_regions(t);
+ }
+ threshold = max(1, threshold * 2);
+ sz_limit = max(1, sz_limit * 2);
+ } while (nr_regions > c->attrs.max_nr_regions);
}
/*
_
Patches currently in -mm which might be from sj(a)kernel.org are
mm-damon-core-merge-regions-aggressively-when-max_nr_regions-is-unmet.patch
mm-damon-sysfs-schemes-add-target_nid-on-sysfs-schemes-fix.patch
docs-damon-document-damos_migrate_hotcold-fix.patch
mm-damon-core-implement-damos-quota-goals-online-commit-function.patch
mm-damon-core-implement-damon-context-commit-function.patch
mm-damon-sysfs-use-damon_commit_ctx.patch
mm-damon-sysfs-schemes-use-damos_commit_quota_goals.patch
mm-damon-sysfs-remove-unnecessary-online-tuning-handling-code.patch
mm-damon-sysfs-rename-damon_sysfs_set_targets-to-add_targets.patch
mm-damon-sysfs-schemes-remove-unnecessary-online-tuning-handling-code.patch
mm-damon-sysfs-schemes-rename-_set_schemesscheme_filtersquota_scoreschemes.patch
mm-damon-reclaim-use-damon_commit_ctx.patch
mm-damon-reclaim-remove-unnecessary-code-for-online-tuning.patch
mm-damon-lru_sort-use-damon_commit_ctx.patch
mm-damon-lru_sort-remove-unnecessary-online-tuning-handling-code.patch
docs-mm-damon-maintainer-profile-introduce-hackermail.patch
docs-mm-damon-maintainer-profile-document-damon-community-meetups.patch
The RK3066 VOP sets a dma_stop bit when it's done scanning out a frame
and needs the driver to acknowledge that by clearing the bit.
Unless we clear it "between" frames, the RGB output only shows noise
instead of the picture. atomic_flush is the place for it that least
affects other code (doing it on vblank would require converting all
other usages of the reg_lock to spin_(un)lock_irq, which would affect
performance for everyone).
This seems to be a redundant synchronization mechanism that was removed
in later iterations of the VOP hardware block.
Fixes: f4a6de8 ("drm: rockchip: vop: add rk3066 vop definitions")
Cc: stable(a)vger.kernel.org
Signed-off-by: Val Packett <val(a)packett.cool>
---
drivers/gpu/drm/rockchip/rockchip_drm_vop.c | 4 ++++
drivers/gpu/drm/rockchip/rockchip_drm_vop.h | 1 +
drivers/gpu/drm/rockchip/rockchip_vop_reg.c | 1 +
3 files changed, 6 insertions(+)
diff --git a/drivers/gpu/drm/rockchip/rockchip_drm_vop.c b/drivers/gpu/drm/rockchip/rockchip_drm_vop.c
index a13473b2d..e88fbd568 100644
--- a/drivers/gpu/drm/rockchip/rockchip_drm_vop.c
+++ b/drivers/gpu/drm/rockchip/rockchip_drm_vop.c
@@ -1583,6 +1583,10 @@ static void vop_crtc_atomic_flush(struct drm_crtc *crtc,
VOP_AFBC_SET(vop, enable, s->enable_afbc);
vop_cfg_done(vop);
+ /* Ack the DMA transfer of the previous frame (RK3066). */
+ if (VOP_HAS_REG(vop, common, dma_stop))
+ VOP_REG_SET(vop, common, dma_stop, 0);
+
spin_unlock(&vop->reg_lock);
/*
diff --git a/drivers/gpu/drm/rockchip/rockchip_drm_vop.h b/drivers/gpu/drm/rockchip/rockchip_drm_vop.h
index b33e5bdc2..0cf512cc1 100644
--- a/drivers/gpu/drm/rockchip/rockchip_drm_vop.h
+++ b/drivers/gpu/drm/rockchip/rockchip_drm_vop.h
@@ -122,6 +122,7 @@ struct vop_common {
struct vop_reg lut_buffer_index;
struct vop_reg gate_en;
struct vop_reg mmu_en;
+ struct vop_reg dma_stop;
struct vop_reg out_mode;
struct vop_reg standby;
};
diff --git a/drivers/gpu/drm/rockchip/rockchip_vop_reg.c b/drivers/gpu/drm/rockchip/rockchip_vop_reg.c
index b9ee02061..9bcb40a64 100644
--- a/drivers/gpu/drm/rockchip/rockchip_vop_reg.c
+++ b/drivers/gpu/drm/rockchip/rockchip_vop_reg.c
@@ -466,6 +466,7 @@ static const struct vop_output rk3066_output = {
};
static const struct vop_common rk3066_common = {
+ .dma_stop = VOP_REG(RK3066_SYS_CTRL0, 0x1, 0),
.standby = VOP_REG(RK3066_SYS_CTRL0, 0x1, 1),
.out_mode = VOP_REG(RK3066_DSP_CTRL0, 0xf, 0),
.cfg_done = VOP_REG(RK3066_REG_CFG_DONE, 0x1, 0),
--
2.45.2
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.15.y
git checkout FETCH_HEAD
git cherry-pick -x 70cb9188ffc75e643debf292fcddff36c9dbd4ae
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024061929-scalding-tweak-f522@gregkh' --subject-prefix 'PATCH 5.15.y' HEAD^..
Possible dependencies:
70cb9188ffc7 ("drm/i915/gt: Disarm breadcrumbs if engines are already idle")
c877bed82e10 ("drm/i915/gt: Only kick the signal worker if there's been an update")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 70cb9188ffc75e643debf292fcddff36c9dbd4ae Mon Sep 17 00:00:00 2001
From: Chris Wilson <chris(a)chris-wilson.co.uk>
Date: Tue, 23 Apr 2024 18:23:10 +0200
Subject: [PATCH] drm/i915/gt: Disarm breadcrumbs if engines are already idle
The breadcrumbs use a GT wakeref for guarding the interrupt, but are
disarmed during release of the engine wakeref. This leaves a hole where
we may attach a breadcrumb just as the engine is parking (after it has
parked its breadcrumbs), execute the irq worker with some signalers still
attached, but never be woken again.
That issue manifests itself in CI with IGT runner timeouts while tests
are waiting indefinitely for release of all GT wakerefs.
<6> [209.151778] i915: Running live_engine_pm_selftests/live_engine_busy_stats
<7> [209.231628] i915 0000:00:02.0: [drm:intel_power_well_disable [i915]] disabling PW_5
<7> [209.231816] i915 0000:00:02.0: [drm:intel_power_well_disable [i915]] disabling PW_4
<7> [209.231944] i915 0000:00:02.0: [drm:intel_power_well_disable [i915]] disabling PW_3
<7> [209.232056] i915 0000:00:02.0: [drm:intel_power_well_disable [i915]] disabling PW_2
<7> [209.232166] i915 0000:00:02.0: [drm:intel_power_well_disable [i915]] disabling DC_off
<7> [209.232270] i915 0000:00:02.0: [drm:skl_enable_dc6 [i915]] Enabling DC6
<7> [209.232368] i915 0000:00:02.0: [drm:gen9_set_dc_state.part.0 [i915]] Setting DC state from 00 to 02
<4> [299.356116] [IGT] Inactivity timeout exceeded. Killing the current test with SIGQUIT.
...
<6> [299.356526] sysrq: Show State
...
<6> [299.373964] task:i915_selftest state:D stack:11784 pid:5578 tgid:5578 ppid:873 flags:0x00004002
<6> [299.373967] Call Trace:
<6> [299.373968] <TASK>
<6> [299.373970] __schedule+0x3bb/0xda0
<6> [299.373974] schedule+0x41/0x110
<6> [299.373976] intel_wakeref_wait_for_idle+0x82/0x100 [i915]
<6> [299.374083] ? __pfx_var_wake_function+0x10/0x10
<6> [299.374087] live_engine_busy_stats+0x9b/0x500 [i915]
<6> [299.374173] __i915_subtests+0xbe/0x240 [i915]
<6> [299.374277] ? __pfx___intel_gt_live_setup+0x10/0x10 [i915]
<6> [299.374369] ? __pfx___intel_gt_live_teardown+0x10/0x10 [i915]
<6> [299.374456] intel_engine_live_selftests+0x1c/0x30 [i915]
<6> [299.374547] __run_selftests+0xbb/0x190 [i915]
<6> [299.374635] i915_live_selftests+0x4b/0x90 [i915]
<6> [299.374717] i915_pci_probe+0x10d/0x210 [i915]
At the end of the interrupt worker, if there are no more engines awake,
disarm the breadcrumb and go to sleep.
Fixes: 9d5612ca165a ("drm/i915/gt: Defer enabling the breadcrumb interrupt to after submission")
Closes: https://gitlab.freedesktop.org/drm/intel/issues/10026
Signed-off-by: Chris Wilson <chris(a)chris-wilson.co.uk>
Cc: Andrzej Hajda <andrzej.hajda(a)intel.com>
Cc: <stable(a)vger.kernel.org> # v5.12+
Signed-off-by: Janusz Krzysztofik <janusz.krzysztofik(a)linux.intel.com>
Acked-by: Nirmoy Das <nirmoy.das(a)intel.com>
Reviewed-by: Andrzej Hajda <andrzej.hajda(a)intel.com>
Reviewed-by: Andi Shyti <andi.shyti(a)linux.intel.com>
Signed-off-by: Andi Shyti <andi.shyti(a)linux.intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20240423165505.465734-2-janus…
(cherry picked from commit fbad43eccae5cb14594195c20113369aabaa22b5)
Signed-off-by: Jani Nikula <jani.nikula(a)intel.com>
diff --git a/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c b/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c
index d650beb8ed22..20b9b04ec1e0 100644
--- a/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c
+++ b/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c
@@ -263,8 +263,13 @@ static void signal_irq_work(struct irq_work *work)
i915_request_put(rq);
}
+ /* Lazy irq enabling after HW submission */
if (!READ_ONCE(b->irq_armed) && !list_empty(&b->signalers))
intel_breadcrumbs_arm_irq(b);
+
+ /* And confirm that we still want irqs enabled before we yield */
+ if (READ_ONCE(b->irq_armed) && !atomic_read(&b->active))
+ intel_breadcrumbs_disarm_irq(b);
}
struct intel_breadcrumbs *
@@ -315,13 +320,7 @@ void __intel_breadcrumbs_park(struct intel_breadcrumbs *b)
return;
/* Kick the work once more to drain the signalers, and disarm the irq */
- irq_work_sync(&b->irq_work);
- while (READ_ONCE(b->irq_armed) && !atomic_read(&b->active)) {
- local_irq_disable();
- signal_irq_work(&b->irq_work);
- local_irq_enable();
- cond_resched();
- }
+ irq_work_queue(&b->irq_work);
}
void intel_breadcrumbs_free(struct kref *kref)
@@ -404,7 +403,7 @@ static void insert_breadcrumb(struct i915_request *rq)
* the request as it may have completed and raised the interrupt as
* we were attaching it into the lists.
*/
- if (!b->irq_armed || __i915_request_is_complete(rq))
+ if (!READ_ONCE(b->irq_armed) || __i915_request_is_complete(rq))
irq_work_queue(&b->irq_work);
}
From: Peter Ujfalusi <peter.ujfalusi(a)linux.intel.com>
It is incorrect to request the input pin format of the destination widget
using the output pin index of the source module as the indexes are not
necessarily matching.
moduleA.out_pin1 can be connected to moduleB.in_pin0 for example.
Use the dst_queue_id to request the input format of the destination module.
This bug remained unnoticed likely because in nocodec topologies we don't
have process modules after a module copier, thus the pin/queue index is
ignored.
For the process module case, the code was likely have been tested in a
controlled way where all the pin/queue/format properties were present to
work.
Update the debug prints to have better information.
Reviewed-by: Kai Vehmanen <kai.vehmanen(a)linux.intel.com>
Reviewed-by: Ranjani Sridharan <ranjani.sridharan(a)linux.intel.com>
Reviewed-by: Bard Liao <yung-chuan.liao(a)linux.intel.com>
Signed-off-by: Peter Ujfalusi <peter.ujfalusi(a)linux.intel.com>
Signed-off-by: Pierre-Louis Bossart <pierre-louis.bossart(a)linux.intel.com>
Cc: <stable(a)vger.kernel.org> # v6.8+
---
sound/soc/sof/ipc4-topology.c | 28 +++++++++++++++-------------
1 file changed, 15 insertions(+), 13 deletions(-)
diff --git a/sound/soc/sof/ipc4-topology.c b/sound/soc/sof/ipc4-topology.c
index ce2910f70e65..90f6856ee80c 100644
--- a/sound/soc/sof/ipc4-topology.c
+++ b/sound/soc/sof/ipc4-topology.c
@@ -2869,7 +2869,7 @@ static void sof_ipc4_put_queue_id(struct snd_sof_widget *swidget, int queue_id,
static int sof_ipc4_set_copier_sink_format(struct snd_sof_dev *sdev,
struct snd_sof_widget *src_widget,
struct snd_sof_widget *sink_widget,
- int sink_id)
+ struct snd_sof_route *sroute)
{
struct sof_ipc4_copier_config_set_sink_format format;
const struct sof_ipc_ops *iops = sdev->ipc->ops;
@@ -2878,9 +2878,6 @@ static int sof_ipc4_set_copier_sink_format(struct snd_sof_dev *sdev,
struct sof_ipc4_fw_module *fw_module;
struct sof_ipc4_msg msg = {{ 0 }};
- dev_dbg(sdev->dev, "%s set copier sink %d format\n",
- src_widget->widget->name, sink_id);
-
if (WIDGET_IS_DAI(src_widget->id)) {
struct snd_sof_dai *dai = src_widget->private;
@@ -2891,13 +2888,15 @@ static int sof_ipc4_set_copier_sink_format(struct snd_sof_dev *sdev,
fw_module = src_widget->module_info;
- format.sink_id = sink_id;
+ format.sink_id = sroute->src_queue_id;
memcpy(&format.source_fmt, &src_config->audio_fmt, sizeof(format.source_fmt));
- pin_fmt = sof_ipc4_get_input_pin_audio_fmt(sink_widget, sink_id);
+ pin_fmt = sof_ipc4_get_input_pin_audio_fmt(sink_widget, sroute->dst_queue_id);
if (!pin_fmt) {
- dev_err(sdev->dev, "Unable to get pin %d format for %s",
- sink_id, sink_widget->widget->name);
+ dev_err(sdev->dev,
+ "Failed to get input audio format of %s:%d for output of %s:%d\n",
+ sink_widget->widget->name, sroute->dst_queue_id,
+ src_widget->widget->name, sroute->src_queue_id);
return -EINVAL;
}
@@ -2955,7 +2954,8 @@ static int sof_ipc4_route_setup(struct snd_sof_dev *sdev, struct snd_sof_route *
sroute->src_queue_id = sof_ipc4_get_queue_id(src_widget, sink_widget,
SOF_PIN_TYPE_OUTPUT);
if (sroute->src_queue_id < 0) {
- dev_err(sdev->dev, "failed to get queue ID for source widget: %s\n",
+ dev_err(sdev->dev,
+ "failed to get src_queue_id ID from source widget %s\n",
src_widget->widget->name);
return sroute->src_queue_id;
}
@@ -2963,7 +2963,8 @@ static int sof_ipc4_route_setup(struct snd_sof_dev *sdev, struct snd_sof_route *
sroute->dst_queue_id = sof_ipc4_get_queue_id(src_widget, sink_widget,
SOF_PIN_TYPE_INPUT);
if (sroute->dst_queue_id < 0) {
- dev_err(sdev->dev, "failed to get queue ID for sink widget: %s\n",
+ dev_err(sdev->dev,
+ "failed to get dst_queue_id ID from sink widget %s\n",
sink_widget->widget->name);
sof_ipc4_put_queue_id(src_widget, sroute->src_queue_id,
SOF_PIN_TYPE_OUTPUT);
@@ -2972,10 +2973,11 @@ static int sof_ipc4_route_setup(struct snd_sof_dev *sdev, struct snd_sof_route *
/* Pin 0 format is already set during copier module init */
if (sroute->src_queue_id > 0 && WIDGET_IS_COPIER(src_widget->id)) {
- ret = sof_ipc4_set_copier_sink_format(sdev, src_widget, sink_widget,
- sroute->src_queue_id);
+ ret = sof_ipc4_set_copier_sink_format(sdev, src_widget,
+ sink_widget, sroute);
if (ret < 0) {
- dev_err(sdev->dev, "failed to set sink format for %s source queue ID %d\n",
+ dev_err(sdev->dev,
+ "failed to set sink format for source %s:%d\n",
src_widget->widget->name, sroute->src_queue_id);
goto out;
}
--
2.43.0
DAMON keeps the number of regions under max_nr_regions by skipping
regions split operations when doing so can make the number higher than
the limit. It works well for preventing violation of the limit. But,
if somehow the violation happens, it cannot recovery well depending on
the situation. In detail, if the real number of regions having
different access pattern is higher than the limit, the mechanism cannot
reduce the number below the limit. In such a case, the system could
suffer from high monitoring overhead of DAMON.
The violation can actually happen. For an example, the user could
reduce max_nr_regions while DAMON is running, to be lower than the
current number of regions. Fix the problem by repeating the merge
operations with increasing aggressiveness in kdamond_merge_regions() for
the case, until the limit is met.
Fixes: b9a6ac4e4ede ("mm/damon: adaptively adjust regions")
Cc: <stable(a)vger.kernel.org> # 5.15.x
Signed-off-by: SeongJae Park <sj(a)kernel.org>
---
mm/damon/core.c | 20 ++++++++++++++++++--
1 file changed, 18 insertions(+), 2 deletions(-)
diff --git a/mm/damon/core.c b/mm/damon/core.c
index f69250b68bcc..e6598c44b53c 100644
--- a/mm/damon/core.c
+++ b/mm/damon/core.c
@@ -1694,14 +1694,30 @@ static void damon_merge_regions_of(struct damon_target *t, unsigned int thres,
* access frequencies are similar. This is for minimizing the monitoring
* overhead under the dynamically changeable access pattern. If a merge was
* unnecessarily made, later 'kdamond_split_regions()' will revert it.
+ *
+ * The total number of regions could be temporarily higher than the
+ * user-defined limit, max_nr_regions for some cases. For an example, the user
+ * updates max_nr_regions to a number that lower than the current number of
+ * regions while DAMON is running. Depending on the access pattern, it could
+ * take indefinitve time to reduce the number below the limit. For such a
+ * case, repeat merging until the limit is met while increasing @threshold and
+ * @sz_limit.
*/
static void kdamond_merge_regions(struct damon_ctx *c, unsigned int threshold,
unsigned long sz_limit)
{
struct damon_target *t;
+ unsigned int nr_regions;
- damon_for_each_target(t, c)
- damon_merge_regions_of(t, threshold, sz_limit);
+ do {
+ nr_regions = 0;
+ damon_for_each_target(t, c) {
+ damon_merge_regions_of(t, threshold, sz_limit);
+ nr_regions += damon_nr_regions(t);
+ }
+ threshold = max(1, threshold * 2);
+ sz_limit = max(1, sz_limit * 2);
+ } while (nr_regions > c->attrs.max_nr_regions);
}
/*
--
2.39.2
From: Kan Liang <kan.liang(a)linux.intel.com>
The hard-coded metrics is wrongly calculated on the hybrid machine.
$ perf stat -e cycles,instructions -a sleep 1
Performance counter stats for 'system wide':
18,205,487 cpu_atom/cycles/
9,733,603 cpu_core/cycles/
9,423,111 cpu_atom/instructions/ # 0.52 insn per cycle
4,268,965 cpu_core/instructions/ # 0.23 insn per cycle
The insn per cycle for cpu_core should be 4,268,965 / 9,733,603 = 0.44.
When finding the metric events, the find_stat() doesn't take the PMU
type into account. The cpu_atom/cycles/ is wrongly used to calculate
the IPC of the cpu_core.
In the hard-coded metrics, the events from a different PMU are only
SW_CPU_CLOCK and SW_TASK_CLOCK. They both have the stat type,
STAT_NSECS. Except the SW CLOCK events, check the PMU type as well.
Fixes: 0a57b910807a ("perf stat: Use counts rather than saved_value")
Reported-by: "Khalil, Amiri" <amiri.khalil(a)intel.com>
Reviewed-by: Ian Rogers <irogers(a)google.com>
Signed-off-by: Kan Liang <kan.liang(a)linux.intel.com>
Cc: stable(a)vger.kernel.org
---
Changes since V1:
- Don't check the PMU of the SW CLOCK events
tools/perf/util/stat-shadow.c | 7 +++++++
1 file changed, 7 insertions(+)
diff --git a/tools/perf/util/stat-shadow.c b/tools/perf/util/stat-shadow.c
index 3466aa952442..6bb975e46de3 100644
--- a/tools/perf/util/stat-shadow.c
+++ b/tools/perf/util/stat-shadow.c
@@ -176,6 +176,13 @@ static double find_stat(const struct evsel *evsel, int aggr_idx, enum stat_type
if (type != evsel__stat_type(cur))
continue;
+ /*
+ * Except the SW CLOCK events,
+ * ignore if not the PMU we're looking for.
+ */
+ if ((type != STAT_NSECS) && (evsel->pmu != cur->pmu))
+ continue;
+
aggr = &cur->stats->aggr[aggr_idx];
if (type == STAT_NSECS)
return aggr->counts.val;
--
2.35.1
mkdir /mnt/test/comp
f2fs_io setflags compression /mnt/test/comp
dd if=/dev/zero of=/mnt/test/comp/testfile bs=16k count=1
truncate --size 13 /mnt/test/comp/testfile
In the above scenario, we can get a BUG_ON.
kernel BUG at fs/f2fs/segment.c:3589!
Call Trace:
do_write_page+0x78/0x390 [f2fs]
f2fs_outplace_write_data+0x62/0xb0 [f2fs]
f2fs_do_write_data_page+0x275/0x740 [f2fs]
f2fs_write_single_data_page+0x1dc/0x8f0 [f2fs]
f2fs_write_multi_pages+0x1e5/0xae0 [f2fs]
f2fs_write_cache_pages+0xab1/0xc60 [f2fs]
f2fs_write_data_pages+0x2d8/0x330 [f2fs]
do_writepages+0xcf/0x270
__writeback_single_inode+0x44/0x350
writeback_sb_inodes+0x242/0x530
__writeback_inodes_wb+0x54/0xf0
wb_writeback+0x192/0x310
wb_workfn+0x30d/0x400
The reason is we gave CURSEG_ALL_DATA_ATGC to COMPR_ADDR where the
page was set the gcing flag by set_cluster_dirty().
Cc: stable(a)vger.kernel.org
Fixes: 4961acdd65c9 ("f2fs: fix to tag gcing flag on page during block migration")
Signed-off-by: Jaegeuk Kim <jaegeuk(a)kernel.org>
---
fs/f2fs/segment.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index 6e8a4b332ad5..ce2300391031 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -3484,6 +3484,7 @@ static int __get_segment_type_6(struct f2fs_io_info *fio)
if (fio->sbi->am.atgc_enabled &&
(fio->io_type == FS_DATA_IO) &&
(fio->sbi->gc_mode != GC_URGENT_HIGH) &&
+ __is_valid_data_blkaddr(fio->old_blkaddr) &&
!is_inode_flag_set(inode, FI_OPU_WRITE))
return CURSEG_ALL_DATA_ATGC;
else
--
2.45.2.627.g7a2c4fd464-goog
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-4.19.y
git checkout FETCH_HEAD
git cherry-pick -x 60980cf5b8c8cc9182e5e9dbb62cbfd345c54074
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024062418-serotonin-immobile-a6f4@gregkh' --subject-prefix 'PATCH 4.19.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 60980cf5b8c8cc9182e5e9dbb62cbfd345c54074 Mon Sep 17 00:00:00 2001
From: Charles Keepax <ckeepax(a)opensource.cirrus.com>
Date: Fri, 7 Jun 2024 11:34:23 +0100
Subject: [PATCH] spi: cs42l43: Drop cs35l56 SPI speed down to 11MHz
Some internals of the cs35l56 can only support SPI speeds of up to
11MHz. Whilst some use-cases could support higher rates, keep things
simple by dropping the SPI speed down to this avoid any potential
issues.
Signed-off-by: Charles Keepax <ckeepax(a)opensource.cirrus.com>
Cc: stable(a)vger.kernel.org
Link: https://lore.kernel.org/r/20240607103423.4159834-1-ckeepax@opensource.cirru…
Signed-off-by: Mark Brown <broonie(a)kernel.org>
diff --git a/drivers/spi/spi-cs42l43.c b/drivers/spi/spi-cs42l43.c
index 902a0734cc36..8b618ef0f711 100644
--- a/drivers/spi/spi-cs42l43.c
+++ b/drivers/spi/spi-cs42l43.c
@@ -54,7 +54,7 @@ static const struct software_node ampr = {
static struct spi_board_info ampl_info = {
.modalias = "cs35l56",
- .max_speed_hz = 20 * HZ_PER_MHZ,
+ .max_speed_hz = 11 * HZ_PER_MHZ,
.chip_select = 0,
.mode = SPI_MODE_0,
.swnode = &l,
@@ -62,7 +62,7 @@ static struct spi_board_info ampl_info = {
static struct spi_board_info ampr_info = {
.modalias = "cs35l56",
- .max_speed_hz = 20 * HZ_PER_MHZ,
+ .max_speed_hz = 11 * HZ_PER_MHZ,
.chip_select = 1,
.mode = SPI_MODE_0,
.swnode = &r,
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.4.y
git checkout FETCH_HEAD
git cherry-pick -x 60980cf5b8c8cc9182e5e9dbb62cbfd345c54074
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024062417-scribing-deluge-54e8@gregkh' --subject-prefix 'PATCH 5.4.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 60980cf5b8c8cc9182e5e9dbb62cbfd345c54074 Mon Sep 17 00:00:00 2001
From: Charles Keepax <ckeepax(a)opensource.cirrus.com>
Date: Fri, 7 Jun 2024 11:34:23 +0100
Subject: [PATCH] spi: cs42l43: Drop cs35l56 SPI speed down to 11MHz
Some internals of the cs35l56 can only support SPI speeds of up to
11MHz. Whilst some use-cases could support higher rates, keep things
simple by dropping the SPI speed down to this avoid any potential
issues.
Signed-off-by: Charles Keepax <ckeepax(a)opensource.cirrus.com>
Cc: stable(a)vger.kernel.org
Link: https://lore.kernel.org/r/20240607103423.4159834-1-ckeepax@opensource.cirru…
Signed-off-by: Mark Brown <broonie(a)kernel.org>
diff --git a/drivers/spi/spi-cs42l43.c b/drivers/spi/spi-cs42l43.c
index 902a0734cc36..8b618ef0f711 100644
--- a/drivers/spi/spi-cs42l43.c
+++ b/drivers/spi/spi-cs42l43.c
@@ -54,7 +54,7 @@ static const struct software_node ampr = {
static struct spi_board_info ampl_info = {
.modalias = "cs35l56",
- .max_speed_hz = 20 * HZ_PER_MHZ,
+ .max_speed_hz = 11 * HZ_PER_MHZ,
.chip_select = 0,
.mode = SPI_MODE_0,
.swnode = &l,
@@ -62,7 +62,7 @@ static struct spi_board_info ampl_info = {
static struct spi_board_info ampr_info = {
.modalias = "cs35l56",
- .max_speed_hz = 20 * HZ_PER_MHZ,
+ .max_speed_hz = 11 * HZ_PER_MHZ,
.chip_select = 1,
.mode = SPI_MODE_0,
.swnode = &r,
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.10.y
git checkout FETCH_HEAD
git cherry-pick -x 60980cf5b8c8cc9182e5e9dbb62cbfd345c54074
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024062417-data-circling-74b0@gregkh' --subject-prefix 'PATCH 5.10.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 60980cf5b8c8cc9182e5e9dbb62cbfd345c54074 Mon Sep 17 00:00:00 2001
From: Charles Keepax <ckeepax(a)opensource.cirrus.com>
Date: Fri, 7 Jun 2024 11:34:23 +0100
Subject: [PATCH] spi: cs42l43: Drop cs35l56 SPI speed down to 11MHz
Some internals of the cs35l56 can only support SPI speeds of up to
11MHz. Whilst some use-cases could support higher rates, keep things
simple by dropping the SPI speed down to this avoid any potential
issues.
Signed-off-by: Charles Keepax <ckeepax(a)opensource.cirrus.com>
Cc: stable(a)vger.kernel.org
Link: https://lore.kernel.org/r/20240607103423.4159834-1-ckeepax@opensource.cirru…
Signed-off-by: Mark Brown <broonie(a)kernel.org>
diff --git a/drivers/spi/spi-cs42l43.c b/drivers/spi/spi-cs42l43.c
index 902a0734cc36..8b618ef0f711 100644
--- a/drivers/spi/spi-cs42l43.c
+++ b/drivers/spi/spi-cs42l43.c
@@ -54,7 +54,7 @@ static const struct software_node ampr = {
static struct spi_board_info ampl_info = {
.modalias = "cs35l56",
- .max_speed_hz = 20 * HZ_PER_MHZ,
+ .max_speed_hz = 11 * HZ_PER_MHZ,
.chip_select = 0,
.mode = SPI_MODE_0,
.swnode = &l,
@@ -62,7 +62,7 @@ static struct spi_board_info ampl_info = {
static struct spi_board_info ampr_info = {
.modalias = "cs35l56",
- .max_speed_hz = 20 * HZ_PER_MHZ,
+ .max_speed_hz = 11 * HZ_PER_MHZ,
.chip_select = 1,
.mode = SPI_MODE_0,
.swnode = &r,
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.15.y
git checkout FETCH_HEAD
git cherry-pick -x 60980cf5b8c8cc9182e5e9dbb62cbfd345c54074
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024062416-gumdrop-tinfoil-9210@gregkh' --subject-prefix 'PATCH 5.15.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 60980cf5b8c8cc9182e5e9dbb62cbfd345c54074 Mon Sep 17 00:00:00 2001
From: Charles Keepax <ckeepax(a)opensource.cirrus.com>
Date: Fri, 7 Jun 2024 11:34:23 +0100
Subject: [PATCH] spi: cs42l43: Drop cs35l56 SPI speed down to 11MHz
Some internals of the cs35l56 can only support SPI speeds of up to
11MHz. Whilst some use-cases could support higher rates, keep things
simple by dropping the SPI speed down to this avoid any potential
issues.
Signed-off-by: Charles Keepax <ckeepax(a)opensource.cirrus.com>
Cc: stable(a)vger.kernel.org
Link: https://lore.kernel.org/r/20240607103423.4159834-1-ckeepax@opensource.cirru…
Signed-off-by: Mark Brown <broonie(a)kernel.org>
diff --git a/drivers/spi/spi-cs42l43.c b/drivers/spi/spi-cs42l43.c
index 902a0734cc36..8b618ef0f711 100644
--- a/drivers/spi/spi-cs42l43.c
+++ b/drivers/spi/spi-cs42l43.c
@@ -54,7 +54,7 @@ static const struct software_node ampr = {
static struct spi_board_info ampl_info = {
.modalias = "cs35l56",
- .max_speed_hz = 20 * HZ_PER_MHZ,
+ .max_speed_hz = 11 * HZ_PER_MHZ,
.chip_select = 0,
.mode = SPI_MODE_0,
.swnode = &l,
@@ -62,7 +62,7 @@ static struct spi_board_info ampl_info = {
static struct spi_board_info ampr_info = {
.modalias = "cs35l56",
- .max_speed_hz = 20 * HZ_PER_MHZ,
+ .max_speed_hz = 11 * HZ_PER_MHZ,
.chip_select = 1,
.mode = SPI_MODE_0,
.swnode = &r,
The patch below does not apply to the 6.6-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.6.y
git checkout FETCH_HEAD
git cherry-pick -x 60980cf5b8c8cc9182e5e9dbb62cbfd345c54074
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024062415-acquire-poise-3326@gregkh' --subject-prefix 'PATCH 6.6.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 60980cf5b8c8cc9182e5e9dbb62cbfd345c54074 Mon Sep 17 00:00:00 2001
From: Charles Keepax <ckeepax(a)opensource.cirrus.com>
Date: Fri, 7 Jun 2024 11:34:23 +0100
Subject: [PATCH] spi: cs42l43: Drop cs35l56 SPI speed down to 11MHz
Some internals of the cs35l56 can only support SPI speeds of up to
11MHz. Whilst some use-cases could support higher rates, keep things
simple by dropping the SPI speed down to this avoid any potential
issues.
Signed-off-by: Charles Keepax <ckeepax(a)opensource.cirrus.com>
Cc: stable(a)vger.kernel.org
Link: https://lore.kernel.org/r/20240607103423.4159834-1-ckeepax@opensource.cirru…
Signed-off-by: Mark Brown <broonie(a)kernel.org>
diff --git a/drivers/spi/spi-cs42l43.c b/drivers/spi/spi-cs42l43.c
index 902a0734cc36..8b618ef0f711 100644
--- a/drivers/spi/spi-cs42l43.c
+++ b/drivers/spi/spi-cs42l43.c
@@ -54,7 +54,7 @@ static const struct software_node ampr = {
static struct spi_board_info ampl_info = {
.modalias = "cs35l56",
- .max_speed_hz = 20 * HZ_PER_MHZ,
+ .max_speed_hz = 11 * HZ_PER_MHZ,
.chip_select = 0,
.mode = SPI_MODE_0,
.swnode = &l,
@@ -62,7 +62,7 @@ static struct spi_board_info ampl_info = {
static struct spi_board_info ampr_info = {
.modalias = "cs35l56",
- .max_speed_hz = 20 * HZ_PER_MHZ,
+ .max_speed_hz = 11 * HZ_PER_MHZ,
.chip_select = 1,
.mode = SPI_MODE_0,
.swnode = &r,
The patch below does not apply to the 6.1-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.1.y
git checkout FETCH_HEAD
git cherry-pick -x 60980cf5b8c8cc9182e5e9dbb62cbfd345c54074
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024062415-shakable-matching-e69e@gregkh' --subject-prefix 'PATCH 6.1.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 60980cf5b8c8cc9182e5e9dbb62cbfd345c54074 Mon Sep 17 00:00:00 2001
From: Charles Keepax <ckeepax(a)opensource.cirrus.com>
Date: Fri, 7 Jun 2024 11:34:23 +0100
Subject: [PATCH] spi: cs42l43: Drop cs35l56 SPI speed down to 11MHz
Some internals of the cs35l56 can only support SPI speeds of up to
11MHz. Whilst some use-cases could support higher rates, keep things
simple by dropping the SPI speed down to this avoid any potential
issues.
Signed-off-by: Charles Keepax <ckeepax(a)opensource.cirrus.com>
Cc: stable(a)vger.kernel.org
Link: https://lore.kernel.org/r/20240607103423.4159834-1-ckeepax@opensource.cirru…
Signed-off-by: Mark Brown <broonie(a)kernel.org>
diff --git a/drivers/spi/spi-cs42l43.c b/drivers/spi/spi-cs42l43.c
index 902a0734cc36..8b618ef0f711 100644
--- a/drivers/spi/spi-cs42l43.c
+++ b/drivers/spi/spi-cs42l43.c
@@ -54,7 +54,7 @@ static const struct software_node ampr = {
static struct spi_board_info ampl_info = {
.modalias = "cs35l56",
- .max_speed_hz = 20 * HZ_PER_MHZ,
+ .max_speed_hz = 11 * HZ_PER_MHZ,
.chip_select = 0,
.mode = SPI_MODE_0,
.swnode = &l,
@@ -62,7 +62,7 @@ static struct spi_board_info ampl_info = {
static struct spi_board_info ampr_info = {
.modalias = "cs35l56",
- .max_speed_hz = 20 * HZ_PER_MHZ,
+ .max_speed_hz = 11 * HZ_PER_MHZ,
.chip_select = 1,
.mode = SPI_MODE_0,
.swnode = &r,
The patch below does not apply to the 6.9-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.9.y
git checkout FETCH_HEAD
git cherry-pick -x 60980cf5b8c8cc9182e5e9dbb62cbfd345c54074
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024062414-convent-usage-bb3a@gregkh' --subject-prefix 'PATCH 6.9.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 60980cf5b8c8cc9182e5e9dbb62cbfd345c54074 Mon Sep 17 00:00:00 2001
From: Charles Keepax <ckeepax(a)opensource.cirrus.com>
Date: Fri, 7 Jun 2024 11:34:23 +0100
Subject: [PATCH] spi: cs42l43: Drop cs35l56 SPI speed down to 11MHz
Some internals of the cs35l56 can only support SPI speeds of up to
11MHz. Whilst some use-cases could support higher rates, keep things
simple by dropping the SPI speed down to this avoid any potential
issues.
Signed-off-by: Charles Keepax <ckeepax(a)opensource.cirrus.com>
Cc: stable(a)vger.kernel.org
Link: https://lore.kernel.org/r/20240607103423.4159834-1-ckeepax@opensource.cirru…
Signed-off-by: Mark Brown <broonie(a)kernel.org>
diff --git a/drivers/spi/spi-cs42l43.c b/drivers/spi/spi-cs42l43.c
index 902a0734cc36..8b618ef0f711 100644
--- a/drivers/spi/spi-cs42l43.c
+++ b/drivers/spi/spi-cs42l43.c
@@ -54,7 +54,7 @@ static const struct software_node ampr = {
static struct spi_board_info ampl_info = {
.modalias = "cs35l56",
- .max_speed_hz = 20 * HZ_PER_MHZ,
+ .max_speed_hz = 11 * HZ_PER_MHZ,
.chip_select = 0,
.mode = SPI_MODE_0,
.swnode = &l,
@@ -62,7 +62,7 @@ static struct spi_board_info ampl_info = {
static struct spi_board_info ampr_info = {
.modalias = "cs35l56",
- .max_speed_hz = 20 * HZ_PER_MHZ,
+ .max_speed_hz = 11 * HZ_PER_MHZ,
.chip_select = 1,
.mode = SPI_MODE_0,
.swnode = &r,
The patch below does not apply to the 6.1-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.1.y
git checkout FETCH_HEAD
git cherry-pick -x d4e001ffeccfc128c715057e866f301ac9b95728
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024062426-sleeve-manicure-e52d@gregkh' --subject-prefix 'PATCH 6.1.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From d4e001ffeccfc128c715057e866f301ac9b95728 Mon Sep 17 00:00:00 2001
From: Krzysztof Kozlowski <krzysztof.kozlowski(a)linaro.org>
Date: Thu, 20 Jun 2024 13:34:49 +0200
Subject: [PATCH] dt-bindings: i2c: atmel,at91sam: correct path to
i2c-controller schema
The referenced i2c-controller.yaml schema is provided by dtschema
package (outside of Linux kernel), so use full path to reference it.
Cc: stable(a)vger.kernel.org
Fixes: 7ea75dd386be ("dt-bindings: i2c: convert i2c-at91 to json-schema")
Signed-off-by: Krzysztof Kozlowski <krzysztof.kozlowski(a)linaro.org>
Reviewed-by: Conor Dooley <conor.dooley(a)microchip.com>
Signed-off-by: Andi Shyti <andi.shyti(a)kernel.org>
diff --git a/Documentation/devicetree/bindings/i2c/atmel,at91sam-i2c.yaml b/Documentation/devicetree/bindings/i2c/atmel,at91sam-i2c.yaml
index b1c13bab2472..b2d19cfb87ad 100644
--- a/Documentation/devicetree/bindings/i2c/atmel,at91sam-i2c.yaml
+++ b/Documentation/devicetree/bindings/i2c/atmel,at91sam-i2c.yaml
@@ -77,7 +77,7 @@ required:
- clocks
allOf:
- - $ref: i2c-controller.yaml
+ - $ref: /schemas/i2c/i2c-controller.yaml#
- if:
properties:
compatible:
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-4.19.y
git checkout FETCH_HEAD
git cherry-pick -x 5a72477273066b5b357801ab2d315ef14949d402
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024062412-mobster-doable-acc6@gregkh' --subject-prefix 'PATCH 4.19.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 5a72477273066b5b357801ab2d315ef14949d402 Mon Sep 17 00:00:00 2001
From: Grygorii Tertychnyi <grembeter(a)gmail.com>
Date: Mon, 20 May 2024 17:39:32 +0200
Subject: [PATCH] i2c: ocores: set IACK bit after core is enabled
Setting IACK bit when core is disabled does not clear the "Interrupt Flag"
bit in the status register, and the interrupt remains pending.
Sometimes it causes failure for the very first message transfer, that is
usually a device probe.
Hence, set IACK bit after core is enabled to clear pending interrupt.
Fixes: 18f98b1e3147 ("[PATCH] i2c: New bus driver for the OpenCores I2C controller")
Signed-off-by: Grygorii Tertychnyi <grygorii.tertychnyi(a)leica-geosystems.com>
Acked-by: Peter Korsgaard <peter(a)korsgaard.com>
Cc: stable(a)vger.kernel.org
Signed-off-by: Andi Shyti <andi.shyti(a)kernel.org>
diff --git a/drivers/i2c/busses/i2c-ocores.c b/drivers/i2c/busses/i2c-ocores.c
index 56a4dabf5a38..4ad670a80a63 100644
--- a/drivers/i2c/busses/i2c-ocores.c
+++ b/drivers/i2c/busses/i2c-ocores.c
@@ -431,8 +431,8 @@ static int ocores_init(struct device *dev, struct ocores_i2c *i2c)
oc_setreg(i2c, OCI2C_PREHIGH, prescale >> 8);
/* Init the device */
- oc_setreg(i2c, OCI2C_CMD, OCI2C_CMD_IACK);
oc_setreg(i2c, OCI2C_CONTROL, ctrl | OCI2C_CTRL_EN);
+ oc_setreg(i2c, OCI2C_CMD, OCI2C_CMD_IACK);
return 0;
}
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-4.19.y
git checkout FETCH_HEAD
git cherry-pick -x c45fcf46ca2368dafe7e5c513a711a6f0f974308
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024062459-ebay-electable-e94e@gregkh' --subject-prefix 'PATCH 4.19.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From c45fcf46ca2368dafe7e5c513a711a6f0f974308 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= <u.kleine-koenig(a)baylibre.com>
Date: Fri, 21 Jun 2024 16:37:12 +0200
Subject: [PATCH] pwm: stm32: Refuse too small period requests
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
If period_ns is small, prd might well become 0. Catch that case because
otherwise with
regmap_write(priv->regmap, TIM_ARR, prd - 1);
a few lines down quite a big period is configured.
Fixes: 7edf7369205b ("pwm: Add driver for STM32 plaftorm")
Cc: stable(a)vger.kernel.org
Reviewed-by: Trevor Gamblin <tgamblin(a)baylibre.com>
Signed-off-by: Uwe Kleine-König <u.kleine-koenig(a)baylibre.com>
Link: https://lore.kernel.org/r/b86f62f099983646f97eeb6bfc0117bb2d0c340d.17189791…
Signed-off-by: Uwe Kleine-König <ukleinek(a)kernel.org>
diff --git a/drivers/pwm/pwm-stm32.c b/drivers/pwm/pwm-stm32.c
index a2f231d13a9f..3e7b2a8e34e7 100644
--- a/drivers/pwm/pwm-stm32.c
+++ b/drivers/pwm/pwm-stm32.c
@@ -337,6 +337,8 @@ static int stm32_pwm_config(struct stm32_pwm *priv, unsigned int ch,
prd = mul_u64_u64_div_u64(period_ns, clk_get_rate(priv->clk),
(u64)NSEC_PER_SEC * (prescaler + 1));
+ if (!prd)
+ return -EINVAL;
/*
* All channels share the same prescaler and counter so when two
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.4.y
git checkout FETCH_HEAD
git cherry-pick -x c45fcf46ca2368dafe7e5c513a711a6f0f974308
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024062458-try-bouncing-bb82@gregkh' --subject-prefix 'PATCH 5.4.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From c45fcf46ca2368dafe7e5c513a711a6f0f974308 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= <u.kleine-koenig(a)baylibre.com>
Date: Fri, 21 Jun 2024 16:37:12 +0200
Subject: [PATCH] pwm: stm32: Refuse too small period requests
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
If period_ns is small, prd might well become 0. Catch that case because
otherwise with
regmap_write(priv->regmap, TIM_ARR, prd - 1);
a few lines down quite a big period is configured.
Fixes: 7edf7369205b ("pwm: Add driver for STM32 plaftorm")
Cc: stable(a)vger.kernel.org
Reviewed-by: Trevor Gamblin <tgamblin(a)baylibre.com>
Signed-off-by: Uwe Kleine-König <u.kleine-koenig(a)baylibre.com>
Link: https://lore.kernel.org/r/b86f62f099983646f97eeb6bfc0117bb2d0c340d.17189791…
Signed-off-by: Uwe Kleine-König <ukleinek(a)kernel.org>
diff --git a/drivers/pwm/pwm-stm32.c b/drivers/pwm/pwm-stm32.c
index a2f231d13a9f..3e7b2a8e34e7 100644
--- a/drivers/pwm/pwm-stm32.c
+++ b/drivers/pwm/pwm-stm32.c
@@ -337,6 +337,8 @@ static int stm32_pwm_config(struct stm32_pwm *priv, unsigned int ch,
prd = mul_u64_u64_div_u64(period_ns, clk_get_rate(priv->clk),
(u64)NSEC_PER_SEC * (prescaler + 1));
+ if (!prd)
+ return -EINVAL;
/*
* All channels share the same prescaler and counter so when two
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.10.y
git checkout FETCH_HEAD
git cherry-pick -x c45fcf46ca2368dafe7e5c513a711a6f0f974308
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024062457-reversal-unkind-166b@gregkh' --subject-prefix 'PATCH 5.10.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From c45fcf46ca2368dafe7e5c513a711a6f0f974308 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= <u.kleine-koenig(a)baylibre.com>
Date: Fri, 21 Jun 2024 16:37:12 +0200
Subject: [PATCH] pwm: stm32: Refuse too small period requests
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
If period_ns is small, prd might well become 0. Catch that case because
otherwise with
regmap_write(priv->regmap, TIM_ARR, prd - 1);
a few lines down quite a big period is configured.
Fixes: 7edf7369205b ("pwm: Add driver for STM32 plaftorm")
Cc: stable(a)vger.kernel.org
Reviewed-by: Trevor Gamblin <tgamblin(a)baylibre.com>
Signed-off-by: Uwe Kleine-König <u.kleine-koenig(a)baylibre.com>
Link: https://lore.kernel.org/r/b86f62f099983646f97eeb6bfc0117bb2d0c340d.17189791…
Signed-off-by: Uwe Kleine-König <ukleinek(a)kernel.org>
diff --git a/drivers/pwm/pwm-stm32.c b/drivers/pwm/pwm-stm32.c
index a2f231d13a9f..3e7b2a8e34e7 100644
--- a/drivers/pwm/pwm-stm32.c
+++ b/drivers/pwm/pwm-stm32.c
@@ -337,6 +337,8 @@ static int stm32_pwm_config(struct stm32_pwm *priv, unsigned int ch,
prd = mul_u64_u64_div_u64(period_ns, clk_get_rate(priv->clk),
(u64)NSEC_PER_SEC * (prescaler + 1));
+ if (!prd)
+ return -EINVAL;
/*
* All channels share the same prescaler and counter so when two
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.15.y
git checkout FETCH_HEAD
git cherry-pick -x c45fcf46ca2368dafe7e5c513a711a6f0f974308
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024062457-irritant-squatter-ff51@gregkh' --subject-prefix 'PATCH 5.15.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From c45fcf46ca2368dafe7e5c513a711a6f0f974308 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= <u.kleine-koenig(a)baylibre.com>
Date: Fri, 21 Jun 2024 16:37:12 +0200
Subject: [PATCH] pwm: stm32: Refuse too small period requests
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
If period_ns is small, prd might well become 0. Catch that case because
otherwise with
regmap_write(priv->regmap, TIM_ARR, prd - 1);
a few lines down quite a big period is configured.
Fixes: 7edf7369205b ("pwm: Add driver for STM32 plaftorm")
Cc: stable(a)vger.kernel.org
Reviewed-by: Trevor Gamblin <tgamblin(a)baylibre.com>
Signed-off-by: Uwe Kleine-König <u.kleine-koenig(a)baylibre.com>
Link: https://lore.kernel.org/r/b86f62f099983646f97eeb6bfc0117bb2d0c340d.17189791…
Signed-off-by: Uwe Kleine-König <ukleinek(a)kernel.org>
diff --git a/drivers/pwm/pwm-stm32.c b/drivers/pwm/pwm-stm32.c
index a2f231d13a9f..3e7b2a8e34e7 100644
--- a/drivers/pwm/pwm-stm32.c
+++ b/drivers/pwm/pwm-stm32.c
@@ -337,6 +337,8 @@ static int stm32_pwm_config(struct stm32_pwm *priv, unsigned int ch,
prd = mul_u64_u64_div_u64(period_ns, clk_get_rate(priv->clk),
(u64)NSEC_PER_SEC * (prescaler + 1));
+ if (!prd)
+ return -EINVAL;
/*
* All channels share the same prescaler and counter so when two
The patch below does not apply to the 6.1-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.1.y
git checkout FETCH_HEAD
git cherry-pick -x c45fcf46ca2368dafe7e5c513a711a6f0f974308
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024062456-unvocal-anyone-3135@gregkh' --subject-prefix 'PATCH 6.1.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From c45fcf46ca2368dafe7e5c513a711a6f0f974308 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= <u.kleine-koenig(a)baylibre.com>
Date: Fri, 21 Jun 2024 16:37:12 +0200
Subject: [PATCH] pwm: stm32: Refuse too small period requests
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
If period_ns is small, prd might well become 0. Catch that case because
otherwise with
regmap_write(priv->regmap, TIM_ARR, prd - 1);
a few lines down quite a big period is configured.
Fixes: 7edf7369205b ("pwm: Add driver for STM32 plaftorm")
Cc: stable(a)vger.kernel.org
Reviewed-by: Trevor Gamblin <tgamblin(a)baylibre.com>
Signed-off-by: Uwe Kleine-König <u.kleine-koenig(a)baylibre.com>
Link: https://lore.kernel.org/r/b86f62f099983646f97eeb6bfc0117bb2d0c340d.17189791…
Signed-off-by: Uwe Kleine-König <ukleinek(a)kernel.org>
diff --git a/drivers/pwm/pwm-stm32.c b/drivers/pwm/pwm-stm32.c
index a2f231d13a9f..3e7b2a8e34e7 100644
--- a/drivers/pwm/pwm-stm32.c
+++ b/drivers/pwm/pwm-stm32.c
@@ -337,6 +337,8 @@ static int stm32_pwm_config(struct stm32_pwm *priv, unsigned int ch,
prd = mul_u64_u64_div_u64(period_ns, clk_get_rate(priv->clk),
(u64)NSEC_PER_SEC * (prescaler + 1));
+ if (!prd)
+ return -EINVAL;
/*
* All channels share the same prescaler and counter so when two
The patch below does not apply to the 6.6-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.6.y
git checkout FETCH_HEAD
git cherry-pick -x c45fcf46ca2368dafe7e5c513a711a6f0f974308
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024062455-magnetism-backing-fbef@gregkh' --subject-prefix 'PATCH 6.6.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From c45fcf46ca2368dafe7e5c513a711a6f0f974308 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= <u.kleine-koenig(a)baylibre.com>
Date: Fri, 21 Jun 2024 16:37:12 +0200
Subject: [PATCH] pwm: stm32: Refuse too small period requests
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
If period_ns is small, prd might well become 0. Catch that case because
otherwise with
regmap_write(priv->regmap, TIM_ARR, prd - 1);
a few lines down quite a big period is configured.
Fixes: 7edf7369205b ("pwm: Add driver for STM32 plaftorm")
Cc: stable(a)vger.kernel.org
Reviewed-by: Trevor Gamblin <tgamblin(a)baylibre.com>
Signed-off-by: Uwe Kleine-König <u.kleine-koenig(a)baylibre.com>
Link: https://lore.kernel.org/r/b86f62f099983646f97eeb6bfc0117bb2d0c340d.17189791…
Signed-off-by: Uwe Kleine-König <ukleinek(a)kernel.org>
diff --git a/drivers/pwm/pwm-stm32.c b/drivers/pwm/pwm-stm32.c
index a2f231d13a9f..3e7b2a8e34e7 100644
--- a/drivers/pwm/pwm-stm32.c
+++ b/drivers/pwm/pwm-stm32.c
@@ -337,6 +337,8 @@ static int stm32_pwm_config(struct stm32_pwm *priv, unsigned int ch,
prd = mul_u64_u64_div_u64(period_ns, clk_get_rate(priv->clk),
(u64)NSEC_PER_SEC * (prescaler + 1));
+ if (!prd)
+ return -EINVAL;
/*
* All channels share the same prescaler and counter so when two
As per Errata i2310[0], Erroneous timeout can be triggered,
if this Erroneous interrupt is not cleared then it may leads
to storm of interrupts, therefore apply Errata i2310 solution.
[0] https://www.ti.com/lit/pdf/sprz536 page 23
Fixes: b67e830d38fa ("serial: 8250: 8250_omap: Fix possible interrupt storm on K3 SoCs")
Cc: stable(a)vger.kernel.org
Signed-off-by: Udit Kumar <u-kumar1(a)ti.com>
---
Change logs
Changes in v3:
- CC stable in commit message
Link to v2:
https://lore.kernel.org/all/20240617052253.2188140-1-u-kumar1@ti.com/
Changes in v2:
- Added Fixes Tag and typo correction in commit message
- Corrected bit position to UART_OMAP_EFR2_TIMEOUT_BEHAVE
Link to v1
https://lore.kernel.org/all/20240614061314.290840-1-u-kumar1@ti.com/
drivers/tty/serial/8250/8250_omap.c | 25 ++++++++++++++++++++-----
1 file changed, 20 insertions(+), 5 deletions(-)
diff --git a/drivers/tty/serial/8250/8250_omap.c b/drivers/tty/serial/8250/8250_omap.c
index 170639d12b2a..ddac0a13cf84 100644
--- a/drivers/tty/serial/8250/8250_omap.c
+++ b/drivers/tty/serial/8250/8250_omap.c
@@ -115,6 +115,10 @@
/* RX FIFO occupancy indicator */
#define UART_OMAP_RX_LVL 0x19
+/* Timeout low and High */
+#define UART_OMAP_TO_L 0x26
+#define UART_OMAP_TO_H 0x27
+
/*
* Copy of the genpd flags for the console.
* Only used if console suspend is disabled
@@ -663,13 +667,24 @@ static irqreturn_t omap8250_irq(int irq, void *dev_id)
/*
* On K3 SoCs, it is observed that RX TIMEOUT is signalled after
- * FIFO has been drained, in which case a dummy read of RX FIFO
- * is required to clear RX TIMEOUT condition.
+ * FIFO has been drained or erroneously.
+ * So apply solution of Errata i2310 as mentioned in
+ * https://www.ti.com/lit/pdf/sprz536
*/
if (priv->habit & UART_RX_TIMEOUT_QUIRK &&
- (iir & UART_IIR_RX_TIMEOUT) == UART_IIR_RX_TIMEOUT &&
- serial_port_in(port, UART_OMAP_RX_LVL) == 0) {
- serial_port_in(port, UART_RX);
+ (iir & UART_IIR_RX_TIMEOUT) == UART_IIR_RX_TIMEOUT) {
+ unsigned char efr2, timeout_h, timeout_l;
+
+ efr2 = serial_in(up, UART_OMAP_EFR2);
+ timeout_h = serial_in(up, UART_OMAP_TO_H);
+ timeout_l = serial_in(up, UART_OMAP_TO_L);
+ serial_out(up, UART_OMAP_TO_H, 0xFF);
+ serial_out(up, UART_OMAP_TO_L, 0xFF);
+ serial_out(up, UART_OMAP_EFR2, UART_OMAP_EFR2_TIMEOUT_BEHAVE);
+ serial_in(up, UART_IIR);
+ serial_out(up, UART_OMAP_EFR2, efr2);
+ serial_out(up, UART_OMAP_TO_H, timeout_h);
+ serial_out(up, UART_OMAP_TO_L, timeout_l);
}
/* Stop processing interrupts on input overrun */
--
2.34.1
From: Arnd Bergmann <arnd(a)arndb.de>
The old ftruncate() syscall, using the 32-bit off_t misses a sign
extension when called in compat mode on 64-bit architectures. As a
result, passing a negative length accidentally succeeds in truncating
to file size between 2GiB and 4GiB.
Changing the type of the compat syscall to the signed compat_off_t
changes the behavior so it instead returns -EINVAL.
The native entry point, the truncate() syscall and the corresponding
loff_t based variants are all correct already and do not suffer
from this mistake.
Fixes: 3f6d078d4acc ("fix compat truncate/ftruncate")
Reviewed-by: Christian Brauner <brauner(a)kernel.org>
Cc: stable(a)vger.kernel.org
Signed-off-by: Arnd Bergmann <arnd(a)arndb.de>
---
fs/open.c | 4 ++--
include/linux/compat.h | 2 +-
include/linux/syscalls.h | 2 +-
3 files changed, 4 insertions(+), 4 deletions(-)
diff --git a/fs/open.c b/fs/open.c
index 89cafb572061..50e45bc7c4d8 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -202,13 +202,13 @@ long do_sys_ftruncate(unsigned int fd, loff_t length, int small)
return error;
}
-SYSCALL_DEFINE2(ftruncate, unsigned int, fd, unsigned long, length)
+SYSCALL_DEFINE2(ftruncate, unsigned int, fd, off_t, length)
{
return do_sys_ftruncate(fd, length, 1);
}
#ifdef CONFIG_COMPAT
-COMPAT_SYSCALL_DEFINE2(ftruncate, unsigned int, fd, compat_ulong_t, length)
+COMPAT_SYSCALL_DEFINE2(ftruncate, unsigned int, fd, compat_off_t, length)
{
return do_sys_ftruncate(fd, length, 1);
}
diff --git a/include/linux/compat.h b/include/linux/compat.h
index 233f61ec8afc..56cebaff0c91 100644
--- a/include/linux/compat.h
+++ b/include/linux/compat.h
@@ -608,7 +608,7 @@ asmlinkage long compat_sys_fstatfs(unsigned int fd,
asmlinkage long compat_sys_fstatfs64(unsigned int fd, compat_size_t sz,
struct compat_statfs64 __user *buf);
asmlinkage long compat_sys_truncate(const char __user *, compat_off_t);
-asmlinkage long compat_sys_ftruncate(unsigned int, compat_ulong_t);
+asmlinkage long compat_sys_ftruncate(unsigned int, compat_off_t);
/* No generic prototype for truncate64, ftruncate64, fallocate */
asmlinkage long compat_sys_openat(int dfd, const char __user *filename,
int flags, umode_t mode);
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 9104952d323d..ba9337709878 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -418,7 +418,7 @@ asmlinkage long sys_listmount(const struct mnt_id_req __user *req,
u64 __user *mnt_ids, size_t nr_mnt_ids,
unsigned int flags);
asmlinkage long sys_truncate(const char __user *path, long length);
-asmlinkage long sys_ftruncate(unsigned int fd, unsigned long length);
+asmlinkage long sys_ftruncate(unsigned int fd, off_t length);
#if BITS_PER_LONG == 32
asmlinkage long sys_truncate64(const char __user *path, loff_t length);
asmlinkage long sys_ftruncate64(unsigned int fd, loff_t length);
--
2.39.2
The patch below does not apply to the 6.6-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.6.y
git checkout FETCH_HEAD
git cherry-pick -x 56342da3d8cc15efe9df7f29985ba8d256bdc258
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024062453-fretted-sulfur-3c0f@gregkh' --subject-prefix 'PATCH 6.6.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 56342da3d8cc15efe9df7f29985ba8d256bdc258 Mon Sep 17 00:00:00 2001
From: Hamza Mahfooz <hamza.mahfooz(a)amd.com>
Date: Mon, 3 Jun 2024 10:16:45 -0400
Subject: [PATCH] drm/amd/display: prevent register access while in IPS
We can't read/write to DCN registers while in IPS. Since, that can cause
the system to hang. So, before proceeding with the access in that
scenario, force the system out of IPS.
Cc: stable(a)vger.kernel.org # 6.6+
Reviewed-by: Roman Li <roman.li(a)amd.com>
Signed-off-by: Hamza Mahfooz <hamza.mahfooz(a)amd.com>
Signed-off-by: Alex Deucher <alexander.deucher(a)amd.com>
diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
index e426adf95d7d..e9ac20bed0f2 100644
--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
+++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
@@ -11437,6 +11437,12 @@ void amdgpu_dm_trigger_timing_sync(struct drm_device *dev)
mutex_unlock(&adev->dm.dc_lock);
}
+static inline void amdgpu_dm_exit_ips_for_hw_access(struct dc *dc)
+{
+ if (dc->ctx->dmub_srv && !dc->ctx->dmub_srv->idle_exit_counter)
+ dc_exit_ips_for_hw_access(dc);
+}
+
void dm_write_reg_func(const struct dc_context *ctx, uint32_t address,
u32 value, const char *func_name)
{
@@ -11447,6 +11453,8 @@ void dm_write_reg_func(const struct dc_context *ctx, uint32_t address,
return;
}
#endif
+
+ amdgpu_dm_exit_ips_for_hw_access(ctx->dc);
cgs_write_register(ctx->cgs_device, address, value);
trace_amdgpu_dc_wreg(&ctx->perf_trace->write_count, address, value);
}
@@ -11470,6 +11478,8 @@ uint32_t dm_read_reg_func(const struct dc_context *ctx, uint32_t address,
return 0;
}
+ amdgpu_dm_exit_ips_for_hw_access(ctx->dc);
+
value = cgs_read_register(ctx->cgs_device, address);
trace_amdgpu_dc_rreg(&ctx->perf_trace->read_count, address, value);
The patch below does not apply to the 6.9-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.9.y
git checkout FETCH_HEAD
git cherry-pick -x 56342da3d8cc15efe9df7f29985ba8d256bdc258
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024062454-portable-grinch-582f@gregkh' --subject-prefix 'PATCH 6.9.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 56342da3d8cc15efe9df7f29985ba8d256bdc258 Mon Sep 17 00:00:00 2001
From: Hamza Mahfooz <hamza.mahfooz(a)amd.com>
Date: Mon, 3 Jun 2024 10:16:45 -0400
Subject: [PATCH] drm/amd/display: prevent register access while in IPS
We can't read/write to DCN registers while in IPS. Since, that can cause
the system to hang. So, before proceeding with the access in that
scenario, force the system out of IPS.
Cc: stable(a)vger.kernel.org # 6.6+
Reviewed-by: Roman Li <roman.li(a)amd.com>
Signed-off-by: Hamza Mahfooz <hamza.mahfooz(a)amd.com>
Signed-off-by: Alex Deucher <alexander.deucher(a)amd.com>
diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
index e426adf95d7d..e9ac20bed0f2 100644
--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
+++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
@@ -11437,6 +11437,12 @@ void amdgpu_dm_trigger_timing_sync(struct drm_device *dev)
mutex_unlock(&adev->dm.dc_lock);
}
+static inline void amdgpu_dm_exit_ips_for_hw_access(struct dc *dc)
+{
+ if (dc->ctx->dmub_srv && !dc->ctx->dmub_srv->idle_exit_counter)
+ dc_exit_ips_for_hw_access(dc);
+}
+
void dm_write_reg_func(const struct dc_context *ctx, uint32_t address,
u32 value, const char *func_name)
{
@@ -11447,6 +11453,8 @@ void dm_write_reg_func(const struct dc_context *ctx, uint32_t address,
return;
}
#endif
+
+ amdgpu_dm_exit_ips_for_hw_access(ctx->dc);
cgs_write_register(ctx->cgs_device, address, value);
trace_amdgpu_dc_wreg(&ctx->perf_trace->write_count, address, value);
}
@@ -11470,6 +11478,8 @@ uint32_t dm_read_reg_func(const struct dc_context *ctx, uint32_t address,
return 0;
}
+ amdgpu_dm_exit_ips_for_hw_access(ctx->dc);
+
value = cgs_read_register(ctx->cgs_device, address);
trace_amdgpu_dc_rreg(&ctx->perf_trace->read_count, address, value);
The patch below does not apply to the 6.6-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.6.y
git checkout FETCH_HEAD
git cherry-pick -x c03d770c0b014a3007a5874bf6b3c3e64d32aaac
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024062458-impotency-swifter-6e17@gregkh' --subject-prefix 'PATCH 6.6.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From c03d770c0b014a3007a5874bf6b3c3e64d32aaac Mon Sep 17 00:00:00 2001
From: Michael Strauss <michael.strauss(a)amd.com>
Date: Tue, 7 May 2024 12:03:15 -0400
Subject: [PATCH] drm/amd/display: Attempt to avoid empty TUs when endpoint is
DPIA
[WHY]
Empty SST TUs are illegal to transmit over a USB4 DP tunnel.
Current policy is to configure stream encoder to pack 2 pixels per pclk
even when ODM combine is not in use, allowing seamless dynamic ODM
reconfiguration. However, in extreme edge cases where average pixel
count per TU is less than 2, this can lead to unexpected empty TU
generation during compliance testing. For example, VIC 1 with a 1xHBR3
link configuration will average 1.98 pix/TU.
[HOW]
Calculate average pixel count per TU, and block 2 pixels per clock if
endpoint is a DPIA tunnel and pixel clock is low enough that we will
never require 2:1 ODM combine.
Cc: stable(a)vger.kernel.org # 6.6+
Reviewed-by: Wenjing Liu <wenjing.liu(a)amd.com>
Acked-by: Hamza Mahfooz <hamza.mahfooz(a)amd.com>
Signed-off-by: Michael Strauss <michael.strauss(a)amd.com>
Signed-off-by: Alex Deucher <alexander.deucher(a)amd.com>
diff --git a/drivers/gpu/drm/amd/display/dc/hwss/dcn35/dcn35_hwseq.c b/drivers/gpu/drm/amd/display/dc/hwss/dcn35/dcn35_hwseq.c
index 5295f52e4fc8..dcced89c07b3 100644
--- a/drivers/gpu/drm/amd/display/dc/hwss/dcn35/dcn35_hwseq.c
+++ b/drivers/gpu/drm/amd/display/dc/hwss/dcn35/dcn35_hwseq.c
@@ -1439,3 +1439,75 @@ void dcn35_set_long_vblank(struct pipe_ctx **pipe_ctx,
}
}
}
+
+static bool should_avoid_empty_tu(struct pipe_ctx *pipe_ctx)
+{
+ /* Calculate average pixel count per TU, return false if under ~2.00 to
+ * avoid empty TUs. This is only required for DPIA tunneling as empty TUs
+ * are legal to generate for native DP links. Assume TU size 64 as there
+ * is currently no scenario where it's reprogrammed from HW default.
+ * MTPs have no such limitation, so this does not affect MST use cases.
+ */
+ unsigned int pix_clk_mhz;
+ unsigned int symclk_mhz;
+ unsigned int avg_pix_per_tu_x1000;
+ unsigned int tu_size_bytes = 64;
+ struct dc_crtc_timing *timing = &pipe_ctx->stream->timing;
+ struct dc_link_settings *link_settings = &pipe_ctx->link_config.dp_link_settings;
+ const struct dc *dc = pipe_ctx->stream->link->dc;
+
+ if (pipe_ctx->stream->link->ep_type != DISPLAY_ENDPOINT_USB4_DPIA)
+ return false;
+
+ // Not necessary for MST configurations
+ if (pipe_ctx->stream->signal == SIGNAL_TYPE_DISPLAY_PORT_MST)
+ return false;
+
+ pix_clk_mhz = timing->pix_clk_100hz / 10000;
+
+ // If this is true, can't block due to dynamic ODM
+ if (pix_clk_mhz > dc->clk_mgr->bw_params->clk_table.entries[0].dispclk_mhz)
+ return false;
+
+ switch (link_settings->link_rate) {
+ case LINK_RATE_LOW:
+ symclk_mhz = 162;
+ break;
+ case LINK_RATE_HIGH:
+ symclk_mhz = 270;
+ break;
+ case LINK_RATE_HIGH2:
+ symclk_mhz = 540;
+ break;
+ case LINK_RATE_HIGH3:
+ symclk_mhz = 810;
+ break;
+ default:
+ // We shouldn't be tunneling any other rates, something is wrong
+ ASSERT(0);
+ return false;
+ }
+
+ avg_pix_per_tu_x1000 = (1000 * pix_clk_mhz * tu_size_bytes)
+ / (symclk_mhz * link_settings->lane_count);
+
+ // Add small empirically-decided margin to account for potential jitter
+ return (avg_pix_per_tu_x1000 < 2020);
+}
+
+bool dcn35_is_dp_dig_pixel_rate_div_policy(struct pipe_ctx *pipe_ctx)
+{
+ struct dc *dc = pipe_ctx->stream->ctx->dc;
+
+ if (!is_h_timing_divisible_by_2(pipe_ctx->stream))
+ return false;
+
+ if (should_avoid_empty_tu(pipe_ctx))
+ return false;
+
+ if (dc_is_dp_signal(pipe_ctx->stream->signal) && !dc->link_srv->dp_is_128b_132b_signal(pipe_ctx) &&
+ dc->debug.enable_dp_dig_pixel_rate_div_policy)
+ return true;
+
+ return false;
+}
diff --git a/drivers/gpu/drm/amd/display/dc/hwss/dcn35/dcn35_hwseq.h b/drivers/gpu/drm/amd/display/dc/hwss/dcn35/dcn35_hwseq.h
index a731c8880d60..f0ea7d1511ae 100644
--- a/drivers/gpu/drm/amd/display/dc/hwss/dcn35/dcn35_hwseq.h
+++ b/drivers/gpu/drm/amd/display/dc/hwss/dcn35/dcn35_hwseq.h
@@ -95,4 +95,6 @@ void dcn35_set_static_screen_control(struct pipe_ctx **pipe_ctx,
void dcn35_set_long_vblank(struct pipe_ctx **pipe_ctx,
int num_pipes, uint32_t v_total_min, uint32_t v_total_max);
+bool dcn35_is_dp_dig_pixel_rate_div_policy(struct pipe_ctx *pipe_ctx);
+
#endif /* __DC_HWSS_DCN35_H__ */
diff --git a/drivers/gpu/drm/amd/display/dc/hwss/dcn35/dcn35_init.c b/drivers/gpu/drm/amd/display/dc/hwss/dcn35/dcn35_init.c
index df3bf77f3fb4..199781233fd5 100644
--- a/drivers/gpu/drm/amd/display/dc/hwss/dcn35/dcn35_init.c
+++ b/drivers/gpu/drm/amd/display/dc/hwss/dcn35/dcn35_init.c
@@ -158,7 +158,7 @@ static const struct hwseq_private_funcs dcn35_private_funcs = {
.setup_hpo_hw_control = dcn35_setup_hpo_hw_control,
.calculate_dccg_k1_k2_values = dcn32_calculate_dccg_k1_k2_values,
.set_pixels_per_cycle = dcn32_set_pixels_per_cycle,
- .is_dp_dig_pixel_rate_div_policy = dcn32_is_dp_dig_pixel_rate_div_policy,
+ .is_dp_dig_pixel_rate_div_policy = dcn35_is_dp_dig_pixel_rate_div_policy,
.dsc_pg_control = dcn35_dsc_pg_control,
.dsc_pg_status = dcn32_dsc_pg_status,
.enable_plane = dcn35_enable_plane,
The patch below does not apply to the 6.6-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.6.y
git checkout FETCH_HEAD
git cherry-pick -x 8bd82363e2ee2eb3a9a8ea1fa94ebe1900d05a71
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024062437-coveted-renounce-4f9a@gregkh' --subject-prefix 'PATCH 6.6.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 8bd82363e2ee2eb3a9a8ea1fa94ebe1900d05a71 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Christian=20K=C3=B6nig?= <christian.koenig(a)amd.com>
Date: Wed, 5 Jun 2024 13:27:20 +0200
Subject: [PATCH] drm/amdgpu: revert "take runtime pm reference when we attach
a buffer" v2
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
This reverts commit b8c415e3bf98 ("drm/amdgpu: take runtime pm reference
when we attach a buffer") and commit 425285d39afd ("drm/amdgpu: add amdgpu
runpm usage trace for separate funcs").
Taking a runtime pm reference for DMA-buf is actually completely
unnecessary and even dangerous.
The problem is that calling pm_runtime_get_sync() from the DMA-buf
callbacks is illegal because we have the reservation locked here
which is also taken during resume. So this would deadlock.
When the buffer is in GTT it is still accessible even when the GPU
is powered down and when it is in VRAM the buffer gets migrated to
GTT before powering down.
The only use case which would make it mandatory to keep the runtime
pm reference would be if we pin the buffer into VRAM, and that's not
something we currently do.
v2: improve the commit message
Signed-off-by: Christian König <christian.koenig(a)amd.com>
Reviewed-by: Alex Deucher <alexander.deucher(a)amd.com>
Signed-off-by: Alex Deucher <alexander.deucher(a)amd.com>
CC: stable(a)vger.kernel.org
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c
index 055ba2ea4c12..662d0f28f358 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c
@@ -41,8 +41,6 @@
#include <linux/dma-buf.h>
#include <linux/dma-fence-array.h>
#include <linux/pci-p2pdma.h>
-#include <linux/pm_runtime.h>
-#include "amdgpu_trace.h"
/**
* amdgpu_dma_buf_attach - &dma_buf_ops.attach implementation
@@ -58,42 +56,11 @@ static int amdgpu_dma_buf_attach(struct dma_buf *dmabuf,
struct drm_gem_object *obj = dmabuf->priv;
struct amdgpu_bo *bo = gem_to_amdgpu_bo(obj);
struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev);
- int r;
if (pci_p2pdma_distance(adev->pdev, attach->dev, false) < 0)
attach->peer2peer = false;
- r = pm_runtime_get_sync(adev_to_drm(adev)->dev);
- trace_amdgpu_runpm_reference_dumps(1, __func__);
- if (r < 0)
- goto out;
-
return 0;
-
-out:
- pm_runtime_put_autosuspend(adev_to_drm(adev)->dev);
- trace_amdgpu_runpm_reference_dumps(0, __func__);
- return r;
-}
-
-/**
- * amdgpu_dma_buf_detach - &dma_buf_ops.detach implementation
- *
- * @dmabuf: DMA-buf where we remove the attachment from
- * @attach: the attachment to remove
- *
- * Called when an attachment is removed from the DMA-buf.
- */
-static void amdgpu_dma_buf_detach(struct dma_buf *dmabuf,
- struct dma_buf_attachment *attach)
-{
- struct drm_gem_object *obj = dmabuf->priv;
- struct amdgpu_bo *bo = gem_to_amdgpu_bo(obj);
- struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev);
-
- pm_runtime_mark_last_busy(adev_to_drm(adev)->dev);
- pm_runtime_put_autosuspend(adev_to_drm(adev)->dev);
- trace_amdgpu_runpm_reference_dumps(0, __func__);
}
/**
@@ -267,7 +234,6 @@ static int amdgpu_dma_buf_begin_cpu_access(struct dma_buf *dma_buf,
const struct dma_buf_ops amdgpu_dmabuf_ops = {
.attach = amdgpu_dma_buf_attach,
- .detach = amdgpu_dma_buf_detach,
.pin = amdgpu_dma_buf_pin,
.unpin = amdgpu_dma_buf_unpin,
.map_dma_buf = amdgpu_dma_buf_map,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
index 10832b470448..bc3ac73b6b8d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
@@ -181,7 +181,6 @@ int amdgpu_fence_emit(struct amdgpu_ring *ring, struct dma_fence **f, struct amd
amdgpu_ring_emit_fence(ring, ring->fence_drv.gpu_addr,
seq, flags | AMDGPU_FENCE_FLAG_INT);
pm_runtime_get_noresume(adev_to_drm(adev)->dev);
- trace_amdgpu_runpm_reference_dumps(1, __func__);
ptr = &ring->fence_drv.fences[seq & ring->fence_drv.num_fences_mask];
if (unlikely(rcu_dereference_protected(*ptr, 1))) {
struct dma_fence *old;
@@ -309,7 +308,6 @@ bool amdgpu_fence_process(struct amdgpu_ring *ring)
dma_fence_put(fence);
pm_runtime_mark_last_busy(adev_to_drm(adev)->dev);
pm_runtime_put_autosuspend(adev_to_drm(adev)->dev);
- trace_amdgpu_runpm_reference_dumps(0, __func__);
} while (last_seq != seq);
return true;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h
index 7aafeb763e5d..383fce40d4dd 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h
@@ -554,21 +554,6 @@ TRACE_EVENT(amdgpu_reset_reg_dumps,
__entry->value)
);
-TRACE_EVENT(amdgpu_runpm_reference_dumps,
- TP_PROTO(uint32_t index, const char *func),
- TP_ARGS(index, func),
- TP_STRUCT__entry(
- __field(uint32_t, index)
- __string(func, func)
- ),
- TP_fast_assign(
- __entry->index = index;
- __assign_str(func);
- ),
- TP_printk("amdgpu runpm reference dump 0x%x: 0x%s\n",
- __entry->index,
- __get_str(func))
-);
#undef AMDGPU_JOB_GET_TIMELINE_NAME
#endif
The patch below does not apply to the 6.9-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.9.y
git checkout FETCH_HEAD
git cherry-pick -x 8bd82363e2ee2eb3a9a8ea1fa94ebe1900d05a71
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024062432-cinch-refining-262b@gregkh' --subject-prefix 'PATCH 6.9.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 8bd82363e2ee2eb3a9a8ea1fa94ebe1900d05a71 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Christian=20K=C3=B6nig?= <christian.koenig(a)amd.com>
Date: Wed, 5 Jun 2024 13:27:20 +0200
Subject: [PATCH] drm/amdgpu: revert "take runtime pm reference when we attach
a buffer" v2
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
This reverts commit b8c415e3bf98 ("drm/amdgpu: take runtime pm reference
when we attach a buffer") and commit 425285d39afd ("drm/amdgpu: add amdgpu
runpm usage trace for separate funcs").
Taking a runtime pm reference for DMA-buf is actually completely
unnecessary and even dangerous.
The problem is that calling pm_runtime_get_sync() from the DMA-buf
callbacks is illegal because we have the reservation locked here
which is also taken during resume. So this would deadlock.
When the buffer is in GTT it is still accessible even when the GPU
is powered down and when it is in VRAM the buffer gets migrated to
GTT before powering down.
The only use case which would make it mandatory to keep the runtime
pm reference would be if we pin the buffer into VRAM, and that's not
something we currently do.
v2: improve the commit message
Signed-off-by: Christian König <christian.koenig(a)amd.com>
Reviewed-by: Alex Deucher <alexander.deucher(a)amd.com>
Signed-off-by: Alex Deucher <alexander.deucher(a)amd.com>
CC: stable(a)vger.kernel.org
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c
index 055ba2ea4c12..662d0f28f358 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c
@@ -41,8 +41,6 @@
#include <linux/dma-buf.h>
#include <linux/dma-fence-array.h>
#include <linux/pci-p2pdma.h>
-#include <linux/pm_runtime.h>
-#include "amdgpu_trace.h"
/**
* amdgpu_dma_buf_attach - &dma_buf_ops.attach implementation
@@ -58,42 +56,11 @@ static int amdgpu_dma_buf_attach(struct dma_buf *dmabuf,
struct drm_gem_object *obj = dmabuf->priv;
struct amdgpu_bo *bo = gem_to_amdgpu_bo(obj);
struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev);
- int r;
if (pci_p2pdma_distance(adev->pdev, attach->dev, false) < 0)
attach->peer2peer = false;
- r = pm_runtime_get_sync(adev_to_drm(adev)->dev);
- trace_amdgpu_runpm_reference_dumps(1, __func__);
- if (r < 0)
- goto out;
-
return 0;
-
-out:
- pm_runtime_put_autosuspend(adev_to_drm(adev)->dev);
- trace_amdgpu_runpm_reference_dumps(0, __func__);
- return r;
-}
-
-/**
- * amdgpu_dma_buf_detach - &dma_buf_ops.detach implementation
- *
- * @dmabuf: DMA-buf where we remove the attachment from
- * @attach: the attachment to remove
- *
- * Called when an attachment is removed from the DMA-buf.
- */
-static void amdgpu_dma_buf_detach(struct dma_buf *dmabuf,
- struct dma_buf_attachment *attach)
-{
- struct drm_gem_object *obj = dmabuf->priv;
- struct amdgpu_bo *bo = gem_to_amdgpu_bo(obj);
- struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev);
-
- pm_runtime_mark_last_busy(adev_to_drm(adev)->dev);
- pm_runtime_put_autosuspend(adev_to_drm(adev)->dev);
- trace_amdgpu_runpm_reference_dumps(0, __func__);
}
/**
@@ -267,7 +234,6 @@ static int amdgpu_dma_buf_begin_cpu_access(struct dma_buf *dma_buf,
const struct dma_buf_ops amdgpu_dmabuf_ops = {
.attach = amdgpu_dma_buf_attach,
- .detach = amdgpu_dma_buf_detach,
.pin = amdgpu_dma_buf_pin,
.unpin = amdgpu_dma_buf_unpin,
.map_dma_buf = amdgpu_dma_buf_map,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
index 10832b470448..bc3ac73b6b8d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
@@ -181,7 +181,6 @@ int amdgpu_fence_emit(struct amdgpu_ring *ring, struct dma_fence **f, struct amd
amdgpu_ring_emit_fence(ring, ring->fence_drv.gpu_addr,
seq, flags | AMDGPU_FENCE_FLAG_INT);
pm_runtime_get_noresume(adev_to_drm(adev)->dev);
- trace_amdgpu_runpm_reference_dumps(1, __func__);
ptr = &ring->fence_drv.fences[seq & ring->fence_drv.num_fences_mask];
if (unlikely(rcu_dereference_protected(*ptr, 1))) {
struct dma_fence *old;
@@ -309,7 +308,6 @@ bool amdgpu_fence_process(struct amdgpu_ring *ring)
dma_fence_put(fence);
pm_runtime_mark_last_busy(adev_to_drm(adev)->dev);
pm_runtime_put_autosuspend(adev_to_drm(adev)->dev);
- trace_amdgpu_runpm_reference_dumps(0, __func__);
} while (last_seq != seq);
return true;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h
index 7aafeb763e5d..383fce40d4dd 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h
@@ -554,21 +554,6 @@ TRACE_EVENT(amdgpu_reset_reg_dumps,
__entry->value)
);
-TRACE_EVENT(amdgpu_runpm_reference_dumps,
- TP_PROTO(uint32_t index, const char *func),
- TP_ARGS(index, func),
- TP_STRUCT__entry(
- __field(uint32_t, index)
- __string(func, func)
- ),
- TP_fast_assign(
- __entry->index = index;
- __assign_str(func);
- ),
- TP_printk("amdgpu runpm reference dump 0x%x: 0x%s\n",
- __entry->index,
- __get_str(func))
-);
#undef AMDGPU_JOB_GET_TIMELINE_NAME
#endif
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-4.19.y
git checkout FETCH_HEAD
git cherry-pick -x 49f683b41f28918df3e51ddc0d928cb2e934ccdb
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024062422-composer-engaging-202c@gregkh' --subject-prefix 'PATCH 4.19.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 49f683b41f28918df3e51ddc0d928cb2e934ccdb Mon Sep 17 00:00:00 2001
From: Breno Leitao <leitao(a)debian.org>
Date: Fri, 10 May 2024 02:23:52 -0700
Subject: [PATCH] KVM: Fix a data race on last_boosted_vcpu in
kvm_vcpu_on_spin()
Use {READ,WRITE}_ONCE() to access kvm->last_boosted_vcpu to ensure the
loads and stores are atomic. In the extremely unlikely scenario the
compiler tears the stores, it's theoretically possible for KVM to attempt
to get a vCPU using an out-of-bounds index, e.g. if the write is split
into multiple 8-bit stores, and is paired with a 32-bit load on a VM with
257 vCPUs:
CPU0 CPU1
last_boosted_vcpu = 0xff;
(last_boosted_vcpu = 0x100)
last_boosted_vcpu[15:8] = 0x01;
i = (last_boosted_vcpu = 0x1ff)
last_boosted_vcpu[7:0] = 0x00;
vcpu = kvm->vcpu_array[0x1ff];
As detected by KCSAN:
BUG: KCSAN: data-race in kvm_vcpu_on_spin [kvm] / kvm_vcpu_on_spin [kvm]
write to 0xffffc90025a92344 of 4 bytes by task 4340 on cpu 16:
kvm_vcpu_on_spin (arch/x86/kvm/../../../virt/kvm/kvm_main.c:4112) kvm
handle_pause (arch/x86/kvm/vmx/vmx.c:5929) kvm_intel
vmx_handle_exit (arch/x86/kvm/vmx/vmx.c:?
arch/x86/kvm/vmx/vmx.c:6606) kvm_intel
vcpu_run (arch/x86/kvm/x86.c:11107 arch/x86/kvm/x86.c:11211) kvm
kvm_arch_vcpu_ioctl_run (arch/x86/kvm/x86.c:?) kvm
kvm_vcpu_ioctl (arch/x86/kvm/../../../virt/kvm/kvm_main.c:?) kvm
__se_sys_ioctl (fs/ioctl.c:52 fs/ioctl.c:904 fs/ioctl.c:890)
__x64_sys_ioctl (fs/ioctl.c:890)
x64_sys_call (arch/x86/entry/syscall_64.c:33)
do_syscall_64 (arch/x86/entry/common.c:?)
entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:130)
read to 0xffffc90025a92344 of 4 bytes by task 4342 on cpu 4:
kvm_vcpu_on_spin (arch/x86/kvm/../../../virt/kvm/kvm_main.c:4069) kvm
handle_pause (arch/x86/kvm/vmx/vmx.c:5929) kvm_intel
vmx_handle_exit (arch/x86/kvm/vmx/vmx.c:?
arch/x86/kvm/vmx/vmx.c:6606) kvm_intel
vcpu_run (arch/x86/kvm/x86.c:11107 arch/x86/kvm/x86.c:11211) kvm
kvm_arch_vcpu_ioctl_run (arch/x86/kvm/x86.c:?) kvm
kvm_vcpu_ioctl (arch/x86/kvm/../../../virt/kvm/kvm_main.c:?) kvm
__se_sys_ioctl (fs/ioctl.c:52 fs/ioctl.c:904 fs/ioctl.c:890)
__x64_sys_ioctl (fs/ioctl.c:890)
x64_sys_call (arch/x86/entry/syscall_64.c:33)
do_syscall_64 (arch/x86/entry/common.c:?)
entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:130)
value changed: 0x00000012 -> 0x00000000
Fixes: 217ece6129f2 ("KVM: use yield_to instead of sleep in kvm_vcpu_on_spin")
Cc: stable(a)vger.kernel.org
Signed-off-by: Breno Leitao <leitao(a)debian.org>
Link: https://lore.kernel.org/r/20240510092353.2261824-1-leitao@debian.org
Signed-off-by: Sean Christopherson <seanjc(a)google.com>
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 14841acb8b95..843aa68cbcd0 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -4025,12 +4025,13 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me, bool yield_to_kernel_mode)
{
struct kvm *kvm = me->kvm;
struct kvm_vcpu *vcpu;
- int last_boosted_vcpu = me->kvm->last_boosted_vcpu;
+ int last_boosted_vcpu;
unsigned long i;
int yielded = 0;
int try = 3;
int pass;
+ last_boosted_vcpu = READ_ONCE(kvm->last_boosted_vcpu);
kvm_vcpu_set_in_spin_loop(me, true);
/*
* We boost the priority of a VCPU that is runnable but not
@@ -4068,7 +4069,7 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me, bool yield_to_kernel_mode)
yielded = kvm_vcpu_yield_to(vcpu);
if (yielded > 0) {
- kvm->last_boosted_vcpu = i;
+ WRITE_ONCE(kvm->last_boosted_vcpu, i);
break;
} else if (yielded < 0) {
try--;
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.4.y
git checkout FETCH_HEAD
git cherry-pick -x 49f683b41f28918df3e51ddc0d928cb2e934ccdb
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024062420-pendant-avenging-0263@gregkh' --subject-prefix 'PATCH 5.4.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 49f683b41f28918df3e51ddc0d928cb2e934ccdb Mon Sep 17 00:00:00 2001
From: Breno Leitao <leitao(a)debian.org>
Date: Fri, 10 May 2024 02:23:52 -0700
Subject: [PATCH] KVM: Fix a data race on last_boosted_vcpu in
kvm_vcpu_on_spin()
Use {READ,WRITE}_ONCE() to access kvm->last_boosted_vcpu to ensure the
loads and stores are atomic. In the extremely unlikely scenario the
compiler tears the stores, it's theoretically possible for KVM to attempt
to get a vCPU using an out-of-bounds index, e.g. if the write is split
into multiple 8-bit stores, and is paired with a 32-bit load on a VM with
257 vCPUs:
CPU0 CPU1
last_boosted_vcpu = 0xff;
(last_boosted_vcpu = 0x100)
last_boosted_vcpu[15:8] = 0x01;
i = (last_boosted_vcpu = 0x1ff)
last_boosted_vcpu[7:0] = 0x00;
vcpu = kvm->vcpu_array[0x1ff];
As detected by KCSAN:
BUG: KCSAN: data-race in kvm_vcpu_on_spin [kvm] / kvm_vcpu_on_spin [kvm]
write to 0xffffc90025a92344 of 4 bytes by task 4340 on cpu 16:
kvm_vcpu_on_spin (arch/x86/kvm/../../../virt/kvm/kvm_main.c:4112) kvm
handle_pause (arch/x86/kvm/vmx/vmx.c:5929) kvm_intel
vmx_handle_exit (arch/x86/kvm/vmx/vmx.c:?
arch/x86/kvm/vmx/vmx.c:6606) kvm_intel
vcpu_run (arch/x86/kvm/x86.c:11107 arch/x86/kvm/x86.c:11211) kvm
kvm_arch_vcpu_ioctl_run (arch/x86/kvm/x86.c:?) kvm
kvm_vcpu_ioctl (arch/x86/kvm/../../../virt/kvm/kvm_main.c:?) kvm
__se_sys_ioctl (fs/ioctl.c:52 fs/ioctl.c:904 fs/ioctl.c:890)
__x64_sys_ioctl (fs/ioctl.c:890)
x64_sys_call (arch/x86/entry/syscall_64.c:33)
do_syscall_64 (arch/x86/entry/common.c:?)
entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:130)
read to 0xffffc90025a92344 of 4 bytes by task 4342 on cpu 4:
kvm_vcpu_on_spin (arch/x86/kvm/../../../virt/kvm/kvm_main.c:4069) kvm
handle_pause (arch/x86/kvm/vmx/vmx.c:5929) kvm_intel
vmx_handle_exit (arch/x86/kvm/vmx/vmx.c:?
arch/x86/kvm/vmx/vmx.c:6606) kvm_intel
vcpu_run (arch/x86/kvm/x86.c:11107 arch/x86/kvm/x86.c:11211) kvm
kvm_arch_vcpu_ioctl_run (arch/x86/kvm/x86.c:?) kvm
kvm_vcpu_ioctl (arch/x86/kvm/../../../virt/kvm/kvm_main.c:?) kvm
__se_sys_ioctl (fs/ioctl.c:52 fs/ioctl.c:904 fs/ioctl.c:890)
__x64_sys_ioctl (fs/ioctl.c:890)
x64_sys_call (arch/x86/entry/syscall_64.c:33)
do_syscall_64 (arch/x86/entry/common.c:?)
entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:130)
value changed: 0x00000012 -> 0x00000000
Fixes: 217ece6129f2 ("KVM: use yield_to instead of sleep in kvm_vcpu_on_spin")
Cc: stable(a)vger.kernel.org
Signed-off-by: Breno Leitao <leitao(a)debian.org>
Link: https://lore.kernel.org/r/20240510092353.2261824-1-leitao@debian.org
Signed-off-by: Sean Christopherson <seanjc(a)google.com>
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 14841acb8b95..843aa68cbcd0 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -4025,12 +4025,13 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me, bool yield_to_kernel_mode)
{
struct kvm *kvm = me->kvm;
struct kvm_vcpu *vcpu;
- int last_boosted_vcpu = me->kvm->last_boosted_vcpu;
+ int last_boosted_vcpu;
unsigned long i;
int yielded = 0;
int try = 3;
int pass;
+ last_boosted_vcpu = READ_ONCE(kvm->last_boosted_vcpu);
kvm_vcpu_set_in_spin_loop(me, true);
/*
* We boost the priority of a VCPU that is runnable but not
@@ -4068,7 +4069,7 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me, bool yield_to_kernel_mode)
yielded = kvm_vcpu_yield_to(vcpu);
if (yielded > 0) {
- kvm->last_boosted_vcpu = i;
+ WRITE_ONCE(kvm->last_boosted_vcpu, i);
break;
} else if (yielded < 0) {
try--;
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.10.y
git checkout FETCH_HEAD
git cherry-pick -x 49f683b41f28918df3e51ddc0d928cb2e934ccdb
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024062419-goes-tingling-5681@gregkh' --subject-prefix 'PATCH 5.10.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 49f683b41f28918df3e51ddc0d928cb2e934ccdb Mon Sep 17 00:00:00 2001
From: Breno Leitao <leitao(a)debian.org>
Date: Fri, 10 May 2024 02:23:52 -0700
Subject: [PATCH] KVM: Fix a data race on last_boosted_vcpu in
kvm_vcpu_on_spin()
Use {READ,WRITE}_ONCE() to access kvm->last_boosted_vcpu to ensure the
loads and stores are atomic. In the extremely unlikely scenario the
compiler tears the stores, it's theoretically possible for KVM to attempt
to get a vCPU using an out-of-bounds index, e.g. if the write is split
into multiple 8-bit stores, and is paired with a 32-bit load on a VM with
257 vCPUs:
CPU0 CPU1
last_boosted_vcpu = 0xff;
(last_boosted_vcpu = 0x100)
last_boosted_vcpu[15:8] = 0x01;
i = (last_boosted_vcpu = 0x1ff)
last_boosted_vcpu[7:0] = 0x00;
vcpu = kvm->vcpu_array[0x1ff];
As detected by KCSAN:
BUG: KCSAN: data-race in kvm_vcpu_on_spin [kvm] / kvm_vcpu_on_spin [kvm]
write to 0xffffc90025a92344 of 4 bytes by task 4340 on cpu 16:
kvm_vcpu_on_spin (arch/x86/kvm/../../../virt/kvm/kvm_main.c:4112) kvm
handle_pause (arch/x86/kvm/vmx/vmx.c:5929) kvm_intel
vmx_handle_exit (arch/x86/kvm/vmx/vmx.c:?
arch/x86/kvm/vmx/vmx.c:6606) kvm_intel
vcpu_run (arch/x86/kvm/x86.c:11107 arch/x86/kvm/x86.c:11211) kvm
kvm_arch_vcpu_ioctl_run (arch/x86/kvm/x86.c:?) kvm
kvm_vcpu_ioctl (arch/x86/kvm/../../../virt/kvm/kvm_main.c:?) kvm
__se_sys_ioctl (fs/ioctl.c:52 fs/ioctl.c:904 fs/ioctl.c:890)
__x64_sys_ioctl (fs/ioctl.c:890)
x64_sys_call (arch/x86/entry/syscall_64.c:33)
do_syscall_64 (arch/x86/entry/common.c:?)
entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:130)
read to 0xffffc90025a92344 of 4 bytes by task 4342 on cpu 4:
kvm_vcpu_on_spin (arch/x86/kvm/../../../virt/kvm/kvm_main.c:4069) kvm
handle_pause (arch/x86/kvm/vmx/vmx.c:5929) kvm_intel
vmx_handle_exit (arch/x86/kvm/vmx/vmx.c:?
arch/x86/kvm/vmx/vmx.c:6606) kvm_intel
vcpu_run (arch/x86/kvm/x86.c:11107 arch/x86/kvm/x86.c:11211) kvm
kvm_arch_vcpu_ioctl_run (arch/x86/kvm/x86.c:?) kvm
kvm_vcpu_ioctl (arch/x86/kvm/../../../virt/kvm/kvm_main.c:?) kvm
__se_sys_ioctl (fs/ioctl.c:52 fs/ioctl.c:904 fs/ioctl.c:890)
__x64_sys_ioctl (fs/ioctl.c:890)
x64_sys_call (arch/x86/entry/syscall_64.c:33)
do_syscall_64 (arch/x86/entry/common.c:?)
entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:130)
value changed: 0x00000012 -> 0x00000000
Fixes: 217ece6129f2 ("KVM: use yield_to instead of sleep in kvm_vcpu_on_spin")
Cc: stable(a)vger.kernel.org
Signed-off-by: Breno Leitao <leitao(a)debian.org>
Link: https://lore.kernel.org/r/20240510092353.2261824-1-leitao@debian.org
Signed-off-by: Sean Christopherson <seanjc(a)google.com>
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 14841acb8b95..843aa68cbcd0 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -4025,12 +4025,13 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me, bool yield_to_kernel_mode)
{
struct kvm *kvm = me->kvm;
struct kvm_vcpu *vcpu;
- int last_boosted_vcpu = me->kvm->last_boosted_vcpu;
+ int last_boosted_vcpu;
unsigned long i;
int yielded = 0;
int try = 3;
int pass;
+ last_boosted_vcpu = READ_ONCE(kvm->last_boosted_vcpu);
kvm_vcpu_set_in_spin_loop(me, true);
/*
* We boost the priority of a VCPU that is runnable but not
@@ -4068,7 +4069,7 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me, bool yield_to_kernel_mode)
yielded = kvm_vcpu_yield_to(vcpu);
if (yielded > 0) {
- kvm->last_boosted_vcpu = i;
+ WRITE_ONCE(kvm->last_boosted_vcpu, i);
break;
} else if (yielded < 0) {
try--;
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.15.y
git checkout FETCH_HEAD
git cherry-pick -x 49f683b41f28918df3e51ddc0d928cb2e934ccdb
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024062418-emboss-transpose-5310@gregkh' --subject-prefix 'PATCH 5.15.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 49f683b41f28918df3e51ddc0d928cb2e934ccdb Mon Sep 17 00:00:00 2001
From: Breno Leitao <leitao(a)debian.org>
Date: Fri, 10 May 2024 02:23:52 -0700
Subject: [PATCH] KVM: Fix a data race on last_boosted_vcpu in
kvm_vcpu_on_spin()
Use {READ,WRITE}_ONCE() to access kvm->last_boosted_vcpu to ensure the
loads and stores are atomic. In the extremely unlikely scenario the
compiler tears the stores, it's theoretically possible for KVM to attempt
to get a vCPU using an out-of-bounds index, e.g. if the write is split
into multiple 8-bit stores, and is paired with a 32-bit load on a VM with
257 vCPUs:
CPU0 CPU1
last_boosted_vcpu = 0xff;
(last_boosted_vcpu = 0x100)
last_boosted_vcpu[15:8] = 0x01;
i = (last_boosted_vcpu = 0x1ff)
last_boosted_vcpu[7:0] = 0x00;
vcpu = kvm->vcpu_array[0x1ff];
As detected by KCSAN:
BUG: KCSAN: data-race in kvm_vcpu_on_spin [kvm] / kvm_vcpu_on_spin [kvm]
write to 0xffffc90025a92344 of 4 bytes by task 4340 on cpu 16:
kvm_vcpu_on_spin (arch/x86/kvm/../../../virt/kvm/kvm_main.c:4112) kvm
handle_pause (arch/x86/kvm/vmx/vmx.c:5929) kvm_intel
vmx_handle_exit (arch/x86/kvm/vmx/vmx.c:?
arch/x86/kvm/vmx/vmx.c:6606) kvm_intel
vcpu_run (arch/x86/kvm/x86.c:11107 arch/x86/kvm/x86.c:11211) kvm
kvm_arch_vcpu_ioctl_run (arch/x86/kvm/x86.c:?) kvm
kvm_vcpu_ioctl (arch/x86/kvm/../../../virt/kvm/kvm_main.c:?) kvm
__se_sys_ioctl (fs/ioctl.c:52 fs/ioctl.c:904 fs/ioctl.c:890)
__x64_sys_ioctl (fs/ioctl.c:890)
x64_sys_call (arch/x86/entry/syscall_64.c:33)
do_syscall_64 (arch/x86/entry/common.c:?)
entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:130)
read to 0xffffc90025a92344 of 4 bytes by task 4342 on cpu 4:
kvm_vcpu_on_spin (arch/x86/kvm/../../../virt/kvm/kvm_main.c:4069) kvm
handle_pause (arch/x86/kvm/vmx/vmx.c:5929) kvm_intel
vmx_handle_exit (arch/x86/kvm/vmx/vmx.c:?
arch/x86/kvm/vmx/vmx.c:6606) kvm_intel
vcpu_run (arch/x86/kvm/x86.c:11107 arch/x86/kvm/x86.c:11211) kvm
kvm_arch_vcpu_ioctl_run (arch/x86/kvm/x86.c:?) kvm
kvm_vcpu_ioctl (arch/x86/kvm/../../../virt/kvm/kvm_main.c:?) kvm
__se_sys_ioctl (fs/ioctl.c:52 fs/ioctl.c:904 fs/ioctl.c:890)
__x64_sys_ioctl (fs/ioctl.c:890)
x64_sys_call (arch/x86/entry/syscall_64.c:33)
do_syscall_64 (arch/x86/entry/common.c:?)
entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:130)
value changed: 0x00000012 -> 0x00000000
Fixes: 217ece6129f2 ("KVM: use yield_to instead of sleep in kvm_vcpu_on_spin")
Cc: stable(a)vger.kernel.org
Signed-off-by: Breno Leitao <leitao(a)debian.org>
Link: https://lore.kernel.org/r/20240510092353.2261824-1-leitao@debian.org
Signed-off-by: Sean Christopherson <seanjc(a)google.com>
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 14841acb8b95..843aa68cbcd0 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -4025,12 +4025,13 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me, bool yield_to_kernel_mode)
{
struct kvm *kvm = me->kvm;
struct kvm_vcpu *vcpu;
- int last_boosted_vcpu = me->kvm->last_boosted_vcpu;
+ int last_boosted_vcpu;
unsigned long i;
int yielded = 0;
int try = 3;
int pass;
+ last_boosted_vcpu = READ_ONCE(kvm->last_boosted_vcpu);
kvm_vcpu_set_in_spin_loop(me, true);
/*
* We boost the priority of a VCPU that is runnable but not
@@ -4068,7 +4069,7 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me, bool yield_to_kernel_mode)
yielded = kvm_vcpu_yield_to(vcpu);
if (yielded > 0) {
- kvm->last_boosted_vcpu = i;
+ WRITE_ONCE(kvm->last_boosted_vcpu, i);
break;
} else if (yielded < 0) {
try--;
The patch below does not apply to the 6.6-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.6.y
git checkout FETCH_HEAD
git cherry-pick -x d0a1c07739e1b7f74683fe061545669156d102f2
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024062452-savage-commotion-3ab2@gregkh' --subject-prefix 'PATCH 6.6.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From d0a1c07739e1b7f74683fe061545669156d102f2 Mon Sep 17 00:00:00 2001
From: Yang Li <yang.lee(a)linux.alibaba.com>
Date: Fri, 21 Jun 2024 10:18:40 +0800
Subject: [PATCH] LoongArch: KVM: Remove an unneeded semicolon
Remove an unneeded semicolon to avoid build warnings:
./arch/loongarch/kvm/exit.c:764:2-3: Unneeded semicolon
Cc: stable(a)vger.kernel.org
Reported-by: Abaci Robot <abaci(a)linux.alibaba.com>
Closes: https://bugzilla.openanolis.cn/show_bug.cgi?id=9343
Signed-off-by: Yang Li <yang.lee(a)linux.alibaba.com>
Signed-off-by: Huacai Chen <chenhuacai(a)loongson.cn>
diff --git a/arch/loongarch/kvm/exit.c b/arch/loongarch/kvm/exit.c
index c86e099af5ca..a68573e091c0 100644
--- a/arch/loongarch/kvm/exit.c
+++ b/arch/loongarch/kvm/exit.c
@@ -761,7 +761,7 @@ static void kvm_handle_service(struct kvm_vcpu *vcpu)
default:
ret = KVM_HCALL_INVALID_CODE;
break;
- };
+ }
kvm_write_reg(vcpu, LOONGARCH_GPR_A0, ret);
}
The patch below does not apply to the 6.9-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.9.y
git checkout FETCH_HEAD
git cherry-pick -x d0a1c07739e1b7f74683fe061545669156d102f2
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024062451-unsalted-unvalued-9498@gregkh' --subject-prefix 'PATCH 6.9.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From d0a1c07739e1b7f74683fe061545669156d102f2 Mon Sep 17 00:00:00 2001
From: Yang Li <yang.lee(a)linux.alibaba.com>
Date: Fri, 21 Jun 2024 10:18:40 +0800
Subject: [PATCH] LoongArch: KVM: Remove an unneeded semicolon
Remove an unneeded semicolon to avoid build warnings:
./arch/loongarch/kvm/exit.c:764:2-3: Unneeded semicolon
Cc: stable(a)vger.kernel.org
Reported-by: Abaci Robot <abaci(a)linux.alibaba.com>
Closes: https://bugzilla.openanolis.cn/show_bug.cgi?id=9343
Signed-off-by: Yang Li <yang.lee(a)linux.alibaba.com>
Signed-off-by: Huacai Chen <chenhuacai(a)loongson.cn>
diff --git a/arch/loongarch/kvm/exit.c b/arch/loongarch/kvm/exit.c
index c86e099af5ca..a68573e091c0 100644
--- a/arch/loongarch/kvm/exit.c
+++ b/arch/loongarch/kvm/exit.c
@@ -761,7 +761,7 @@ static void kvm_handle_service(struct kvm_vcpu *vcpu)
default:
ret = KVM_HCALL_INVALID_CODE;
break;
- };
+ }
kvm_write_reg(vcpu, LOONGARCH_GPR_A0, ret);
}
The patch below does not apply to the 6.1-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.1.y
git checkout FETCH_HEAD
git cherry-pick -x 29433a17a79caa8680b9c0761f2b10502fda9ce3
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024062429-swinging-gully-4fc8@gregkh' --subject-prefix 'PATCH 6.1.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 29433a17a79caa8680b9c0761f2b10502fda9ce3 Mon Sep 17 00:00:00 2001
From: Barry Song <baohua(a)kernel.org>
Date: Tue, 18 Jun 2024 19:22:58 +1200
Subject: [PATCH] cifs: drop the incorrect assertion in cifs_swap_rw()
Since commit 2282679fb20b ("mm: submit multipage write for SWP_FS_OPS
swap-space"), we can plug multiple pages then unplug them all together.
That means iov_iter_count(iter) could be way bigger than PAGE_SIZE, it
actually equals the size of iov_iter_npages(iter, INT_MAX).
Note this issue has nothing to do with large folios as we don't support
THP_SWPOUT to non-block devices.
Fixes: 2282679fb20b ("mm: submit multipage write for SWP_FS_OPS swap-space")
Reported-by: Christoph Hellwig <hch(a)lst.de>
Closes: https://lore.kernel.org/linux-mm/20240614100329.1203579-1-hch@lst.de/
Cc: NeilBrown <neilb(a)suse.de>
Cc: Anna Schumaker <anna(a)kernel.org>
Cc: Steve French <sfrench(a)samba.org>
Cc: Trond Myklebust <trondmy(a)kernel.org>
Cc: Chuanhua Han <hanchuanhua(a)oppo.com>
Cc: Ryan Roberts <ryan.roberts(a)arm.com>
Cc: Chris Li <chrisl(a)kernel.org>
Cc: "Huang, Ying" <ying.huang(a)intel.com>
Cc: Jeff Layton <jlayton(a)kernel.org>
Cc: Paulo Alcantara <pc(a)manguebit.com>
Cc: Ronnie Sahlberg <ronniesahlberg(a)gmail.com>
Cc: Shyam Prasad N <sprasad(a)microsoft.com>
Cc: Tom Talpey <tom(a)talpey.com>
Cc: Bharath SM <bharathsm(a)microsoft.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Barry Song <v-songbaohua(a)oppo.com>
Signed-off-by: Steve French <stfrench(a)microsoft.com>
diff --git a/fs/smb/client/file.c b/fs/smb/client/file.c
index 9d5c2440abfc..1e269e0bc75b 100644
--- a/fs/smb/client/file.c
+++ b/fs/smb/client/file.c
@@ -3200,8 +3200,6 @@ static int cifs_swap_rw(struct kiocb *iocb, struct iov_iter *iter)
{
ssize_t ret;
- WARN_ON_ONCE(iov_iter_count(iter) != PAGE_SIZE);
-
if (iov_iter_rw(iter) == READ)
ret = netfs_unbuffered_read_iter_locked(iocb, iter);
else
The patch below does not apply to the 6.6-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.6.y
git checkout FETCH_HEAD
git cherry-pick -x 29433a17a79caa8680b9c0761f2b10502fda9ce3
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024062428-juggle-shrine-cd22@gregkh' --subject-prefix 'PATCH 6.6.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 29433a17a79caa8680b9c0761f2b10502fda9ce3 Mon Sep 17 00:00:00 2001
From: Barry Song <baohua(a)kernel.org>
Date: Tue, 18 Jun 2024 19:22:58 +1200
Subject: [PATCH] cifs: drop the incorrect assertion in cifs_swap_rw()
Since commit 2282679fb20b ("mm: submit multipage write for SWP_FS_OPS
swap-space"), we can plug multiple pages then unplug them all together.
That means iov_iter_count(iter) could be way bigger than PAGE_SIZE, it
actually equals the size of iov_iter_npages(iter, INT_MAX).
Note this issue has nothing to do with large folios as we don't support
THP_SWPOUT to non-block devices.
Fixes: 2282679fb20b ("mm: submit multipage write for SWP_FS_OPS swap-space")
Reported-by: Christoph Hellwig <hch(a)lst.de>
Closes: https://lore.kernel.org/linux-mm/20240614100329.1203579-1-hch@lst.de/
Cc: NeilBrown <neilb(a)suse.de>
Cc: Anna Schumaker <anna(a)kernel.org>
Cc: Steve French <sfrench(a)samba.org>
Cc: Trond Myklebust <trondmy(a)kernel.org>
Cc: Chuanhua Han <hanchuanhua(a)oppo.com>
Cc: Ryan Roberts <ryan.roberts(a)arm.com>
Cc: Chris Li <chrisl(a)kernel.org>
Cc: "Huang, Ying" <ying.huang(a)intel.com>
Cc: Jeff Layton <jlayton(a)kernel.org>
Cc: Paulo Alcantara <pc(a)manguebit.com>
Cc: Ronnie Sahlberg <ronniesahlberg(a)gmail.com>
Cc: Shyam Prasad N <sprasad(a)microsoft.com>
Cc: Tom Talpey <tom(a)talpey.com>
Cc: Bharath SM <bharathsm(a)microsoft.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Barry Song <v-songbaohua(a)oppo.com>
Signed-off-by: Steve French <stfrench(a)microsoft.com>
diff --git a/fs/smb/client/file.c b/fs/smb/client/file.c
index 9d5c2440abfc..1e269e0bc75b 100644
--- a/fs/smb/client/file.c
+++ b/fs/smb/client/file.c
@@ -3200,8 +3200,6 @@ static int cifs_swap_rw(struct kiocb *iocb, struct iov_iter *iter)
{
ssize_t ret;
- WARN_ON_ONCE(iov_iter_count(iter) != PAGE_SIZE);
-
if (iov_iter_rw(iter) == READ)
ret = netfs_unbuffered_read_iter_locked(iocb, iter);
else
The patch below does not apply to the 6.9-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.9.y
git checkout FETCH_HEAD
git cherry-pick -x 29433a17a79caa8680b9c0761f2b10502fda9ce3
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024062428-captivate-grasp-a4cf@gregkh' --subject-prefix 'PATCH 6.9.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 29433a17a79caa8680b9c0761f2b10502fda9ce3 Mon Sep 17 00:00:00 2001
From: Barry Song <baohua(a)kernel.org>
Date: Tue, 18 Jun 2024 19:22:58 +1200
Subject: [PATCH] cifs: drop the incorrect assertion in cifs_swap_rw()
Since commit 2282679fb20b ("mm: submit multipage write for SWP_FS_OPS
swap-space"), we can plug multiple pages then unplug them all together.
That means iov_iter_count(iter) could be way bigger than PAGE_SIZE, it
actually equals the size of iov_iter_npages(iter, INT_MAX).
Note this issue has nothing to do with large folios as we don't support
THP_SWPOUT to non-block devices.
Fixes: 2282679fb20b ("mm: submit multipage write for SWP_FS_OPS swap-space")
Reported-by: Christoph Hellwig <hch(a)lst.de>
Closes: https://lore.kernel.org/linux-mm/20240614100329.1203579-1-hch@lst.de/
Cc: NeilBrown <neilb(a)suse.de>
Cc: Anna Schumaker <anna(a)kernel.org>
Cc: Steve French <sfrench(a)samba.org>
Cc: Trond Myklebust <trondmy(a)kernel.org>
Cc: Chuanhua Han <hanchuanhua(a)oppo.com>
Cc: Ryan Roberts <ryan.roberts(a)arm.com>
Cc: Chris Li <chrisl(a)kernel.org>
Cc: "Huang, Ying" <ying.huang(a)intel.com>
Cc: Jeff Layton <jlayton(a)kernel.org>
Cc: Paulo Alcantara <pc(a)manguebit.com>
Cc: Ronnie Sahlberg <ronniesahlberg(a)gmail.com>
Cc: Shyam Prasad N <sprasad(a)microsoft.com>
Cc: Tom Talpey <tom(a)talpey.com>
Cc: Bharath SM <bharathsm(a)microsoft.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Barry Song <v-songbaohua(a)oppo.com>
Signed-off-by: Steve French <stfrench(a)microsoft.com>
diff --git a/fs/smb/client/file.c b/fs/smb/client/file.c
index 9d5c2440abfc..1e269e0bc75b 100644
--- a/fs/smb/client/file.c
+++ b/fs/smb/client/file.c
@@ -3200,8 +3200,6 @@ static int cifs_swap_rw(struct kiocb *iocb, struct iov_iter *iter)
{
ssize_t ret;
- WARN_ON_ONCE(iov_iter_count(iter) != PAGE_SIZE);
-
if (iov_iter_rw(iter) == READ)
ret = netfs_unbuffered_read_iter_locked(iocb, iter);
else
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.15.y
git checkout FETCH_HEAD
git cherry-pick -x 8bf0287528da1992c5e49d757b99ad6bbc34b522
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024062411-ipad-conical-35fb@gregkh' --subject-prefix 'PATCH 5.15.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 8bf0287528da1992c5e49d757b99ad6bbc34b522 Mon Sep 17 00:00:00 2001
From: Steve French <stfrench(a)microsoft.com>
Date: Wed, 19 Jun 2024 14:46:48 -0500
Subject: [PATCH] cifs: fix typo in module parameter enable_gcm_256
enable_gcm_256 (which allows the server to require the strongest
encryption) is enabled by default, but the modinfo description
incorrectly showed it disabled by default. Fix the typo.
Cc: stable(a)vger.kernel.org
Fixes: fee742b50289 ("smb3.1.1: enable negotiating stronger encryption by default")
Signed-off-by: Steve French <stfrench(a)microsoft.com>
diff --git a/fs/smb/client/cifsfs.c b/fs/smb/client/cifsfs.c
index bb86fc0641d8..6397fdefd876 100644
--- a/fs/smb/client/cifsfs.c
+++ b/fs/smb/client/cifsfs.c
@@ -134,7 +134,7 @@ module_param(enable_oplocks, bool, 0644);
MODULE_PARM_DESC(enable_oplocks, "Enable or disable oplocks. Default: y/Y/1");
module_param(enable_gcm_256, bool, 0644);
-MODULE_PARM_DESC(enable_gcm_256, "Enable requesting strongest (256 bit) GCM encryption. Default: n/N/0");
+MODULE_PARM_DESC(enable_gcm_256, "Enable requesting strongest (256 bit) GCM encryption. Default: y/Y/0");
module_param(require_gcm_256, bool, 0644);
MODULE_PARM_DESC(require_gcm_256, "Require strongest (256 bit) GCM encryption. Default: n/N/0");
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.15.y
git checkout FETCH_HEAD
git cherry-pick -x 8851346912a1fa33e7a5966fe51f07313b274627
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024062402-reabsorb-plausible-88b5@gregkh' --subject-prefix 'PATCH 5.15.y' HEAD^..
Possible dependencies:
8851346912a1 ("net: stmmac: Assign configured channel value to EXTTS event")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 8851346912a1fa33e7a5966fe51f07313b274627 Mon Sep 17 00:00:00 2001
From: Oleksij Rempel <o.rempel(a)pengutronix.de>
Date: Tue, 18 Jun 2024 09:38:21 +0200
Subject: [PATCH] net: stmmac: Assign configured channel value to EXTTS event
Assign the configured channel value to the EXTTS event in the timestamp
interrupt handler. Without assigning the correct channel, applications
like ts2phc will refuse to accept the event, resulting in errors such
as:
...
ts2phc[656.834]: config item end1.ts2phc.pin_index is 0
ts2phc[656.834]: config item end1.ts2phc.channel is 3
ts2phc[656.834]: config item end1.ts2phc.extts_polarity is 2
ts2phc[656.834]: config item end1.ts2phc.extts_correction is 0
...
ts2phc[656.862]: extts on unexpected channel
ts2phc[658.141]: extts on unexpected channel
ts2phc[659.140]: extts on unexpected channel
Fixes: f4da56529da60 ("net: stmmac: Add support for external trigger timestamping")
Cc: stable(a)vger.kernel.org
Signed-off-by: Oleksij Rempel <o.rempel(a)pengutronix.de>
Reviewed-by: Wojciech Drewek <wojciech.drewek(a)intel.com>
Link: https://lore.kernel.org/r/20240618073821.619751-1-o.rempel@pengutronix.de
Signed-off-by: Paolo Abeni <pabeni(a)redhat.com>
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_hwtstamp.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_hwtstamp.c
index f05bd757dfe5..5ef52ef2698f 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_hwtstamp.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_hwtstamp.c
@@ -218,6 +218,7 @@ static void timestamp_interrupt(struct stmmac_priv *priv)
{
u32 num_snapshot, ts_status, tsync_int;
struct ptp_clock_event event;
+ u32 acr_value, channel;
unsigned long flags;
u64 ptp_time;
int i;
@@ -243,12 +244,15 @@ static void timestamp_interrupt(struct stmmac_priv *priv)
num_snapshot = (ts_status & GMAC_TIMESTAMP_ATSNS_MASK) >>
GMAC_TIMESTAMP_ATSNS_SHIFT;
+ acr_value = readl(priv->ptpaddr + PTP_ACR);
+ channel = ilog2(FIELD_GET(PTP_ACR_MASK, acr_value));
+
for (i = 0; i < num_snapshot; i++) {
read_lock_irqsave(&priv->ptp_lock, flags);
get_ptptime(priv->ptpaddr, &ptp_time);
read_unlock_irqrestore(&priv->ptp_lock, flags);
event.type = PTP_CLOCK_EXTTS;
- event.index = 0;
+ event.index = channel;
event.timestamp = ptp_time;
ptp_clock_event(priv->ptp_clock, &event);
}
The patch below does not apply to the 6.6-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.6.y
git checkout FETCH_HEAD
git cherry-pick -x f92a59f6d12e31ead999fee9585471b95a8ae8a3
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024061810-overflow-president-399a@gregkh' --subject-prefix 'PATCH 6.6.y' HEAD^..
Possible dependencies:
f92a59f6d12e ("locking/atomic: scripts: fix ${atomic}_sub_and_test() kerneldoc")
6dfee110c6cc ("locking/atomic: scripts: Clarify ordering of conditional atomics")
e01cc1e8c2ad ("locking/atomic: Add generic support for sync_try_cmpxchg() and its fallback")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From f92a59f6d12e31ead999fee9585471b95a8ae8a3 Mon Sep 17 00:00:00 2001
From: Carlos Llamas <cmllamas(a)google.com>
Date: Wed, 15 May 2024 13:37:10 +0000
Subject: [PATCH] locking/atomic: scripts: fix ${atomic}_sub_and_test()
kerneldoc
For ${atomic}_sub_and_test() the @i parameter is the value to subtract,
not add. Fix the typo in the kerneldoc template and generate the headers
with this update.
Fixes: ad8110706f38 ("locking/atomic: scripts: generate kerneldoc comments")
Suggested-by: Mark Rutland <mark.rutland(a)arm.com>
Signed-off-by: Carlos Llamas <cmllamas(a)google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz(a)infradead.org>
Acked-by: Mark Rutland <mark.rutland(a)arm.com>
Reviewed-by: Kees Cook <keescook(a)chromium.org>
Cc: stable(a)vger.kernel.org
Link: https://lkml.kernel.org/r/20240515133844.3502360-1-cmllamas@google.com
diff --git a/include/linux/atomic/atomic-arch-fallback.h b/include/linux/atomic/atomic-arch-fallback.h
index 956bcba5dbf2..2f9d36b72bd8 100644
--- a/include/linux/atomic/atomic-arch-fallback.h
+++ b/include/linux/atomic/atomic-arch-fallback.h
@@ -2242,7 +2242,7 @@ raw_atomic_try_cmpxchg_relaxed(atomic_t *v, int *old, int new)
/**
* raw_atomic_sub_and_test() - atomic subtract and test if zero with full ordering
- * @i: int value to add
+ * @i: int value to subtract
* @v: pointer to atomic_t
*
* Atomically updates @v to (@v - @i) with full ordering.
@@ -4368,7 +4368,7 @@ raw_atomic64_try_cmpxchg_relaxed(atomic64_t *v, s64 *old, s64 new)
/**
* raw_atomic64_sub_and_test() - atomic subtract and test if zero with full ordering
- * @i: s64 value to add
+ * @i: s64 value to subtract
* @v: pointer to atomic64_t
*
* Atomically updates @v to (@v - @i) with full ordering.
@@ -4690,4 +4690,4 @@ raw_atomic64_dec_if_positive(atomic64_t *v)
}
#endif /* _LINUX_ATOMIC_FALLBACK_H */
-// 14850c0b0db20c62fdc78ccd1d42b98b88d76331
+// b565db590afeeff0d7c9485ccbca5bb6e155749f
diff --git a/include/linux/atomic/atomic-instrumented.h b/include/linux/atomic/atomic-instrumented.h
index debd487fe971..9409a6ddf3e0 100644
--- a/include/linux/atomic/atomic-instrumented.h
+++ b/include/linux/atomic/atomic-instrumented.h
@@ -1349,7 +1349,7 @@ atomic_try_cmpxchg_relaxed(atomic_t *v, int *old, int new)
/**
* atomic_sub_and_test() - atomic subtract and test if zero with full ordering
- * @i: int value to add
+ * @i: int value to subtract
* @v: pointer to atomic_t
*
* Atomically updates @v to (@v - @i) with full ordering.
@@ -2927,7 +2927,7 @@ atomic64_try_cmpxchg_relaxed(atomic64_t *v, s64 *old, s64 new)
/**
* atomic64_sub_and_test() - atomic subtract and test if zero with full ordering
- * @i: s64 value to add
+ * @i: s64 value to subtract
* @v: pointer to atomic64_t
*
* Atomically updates @v to (@v - @i) with full ordering.
@@ -4505,7 +4505,7 @@ atomic_long_try_cmpxchg_relaxed(atomic_long_t *v, long *old, long new)
/**
* atomic_long_sub_and_test() - atomic subtract and test if zero with full ordering
- * @i: long value to add
+ * @i: long value to subtract
* @v: pointer to atomic_long_t
*
* Atomically updates @v to (@v - @i) with full ordering.
@@ -5050,4 +5050,4 @@ atomic_long_dec_if_positive(atomic_long_t *v)
#endif /* _LINUX_ATOMIC_INSTRUMENTED_H */
-// ce5b65e0f1f8a276268b667194581d24bed219d4
+// 8829b337928e9508259079d32581775ececd415b
diff --git a/include/linux/atomic/atomic-long.h b/include/linux/atomic/atomic-long.h
index 3ef844b3ab8a..f86b29d90877 100644
--- a/include/linux/atomic/atomic-long.h
+++ b/include/linux/atomic/atomic-long.h
@@ -1535,7 +1535,7 @@ raw_atomic_long_try_cmpxchg_relaxed(atomic_long_t *v, long *old, long new)
/**
* raw_atomic_long_sub_and_test() - atomic subtract and test if zero with full ordering
- * @i: long value to add
+ * @i: long value to subtract
* @v: pointer to atomic_long_t
*
* Atomically updates @v to (@v - @i) with full ordering.
@@ -1809,4 +1809,4 @@ raw_atomic_long_dec_if_positive(atomic_long_t *v)
}
#endif /* _LINUX_ATOMIC_LONG_H */
-// 1c4a26fc77f345342953770ebe3c4d08e7ce2f9a
+// eadf183c3600b8b92b91839dd3be6bcc560c752d
diff --git a/scripts/atomic/kerneldoc/sub_and_test b/scripts/atomic/kerneldoc/sub_and_test
index d3760f7749d4..96615e50836b 100644
--- a/scripts/atomic/kerneldoc/sub_and_test
+++ b/scripts/atomic/kerneldoc/sub_and_test
@@ -1,7 +1,7 @@
cat <<EOF
/**
* ${class}${atomicname}() - atomic subtract and test if zero with ${desc_order} ordering
- * @i: ${int} value to add
+ * @i: ${int} value to subtract
* @v: pointer to ${atomic}_t
*
* Atomically updates @v to (@v - @i) with ${desc_order} ordering.
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-4.19.y
git checkout FETCH_HEAD
git cherry-pick -x 3a5a8d343e1cf96eb9971b17cbd4b832ab19b8e7
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024061320-handcart-crook-0519@gregkh' --subject-prefix 'PATCH 4.19.y' HEAD^..
Possible dependencies:
3a5a8d343e1c ("mm: fix race between __split_huge_pmd_locked() and GUP-fast")
4f83145721f3 ("mm: avoid unnecessary flush on change_huge_pmd()")
c9fe66560bf2 ("mm/mprotect: do not flush when not required architecturally")
4a18419f71cd ("mm/mprotect: use mmu_gather")
e346e6688c4a ("mm: thp: skip make PMD PROT_NONE if THP migration is not supported")
f0953a1bbaca ("mm: fix typos in comments")
e2db1a9aa381 ("kasan, mm: optimize kmalloc poisoning")
928501344fc6 ("kasan, mm: don't save alloc stacks twice")
2b8305260fb3 ("kfence, kasan: make KFENCE compatible with KASAN")
0ce20dd84089 ("mm: add Kernel Electric-Fence infrastructure")
41139aa4c3a3 ("mm/filemap: add mapping_seek_hole_data")
a1ba9da8f0f9 ("mm/hugetlb.c: fix unnecessary address expansion of pmd sharing")
611806b4bf8d ("kasan: fix bug detection via ksize for HW_TAGS mode")
027b37b552f3 ("kasan: move _RET_IP_ to inline wrappers")
573a48092313 ("kasan: add match-all tag tests")
f00748bfa024 ("kasan: prefix global functions with kasan_")
dbf53f7597be ("mm/mprotect.c: optimize error detection in do_mprotect_pkey()")
96667f8a4382 ("mm: Close race in generic_access_phys")
97593cad003c ("kasan: sanitize objects when metadata doesn't fit")
1ef3133bd3b8 ("kasan: simplify assign_tag and set_tag calls")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 3a5a8d343e1cf96eb9971b17cbd4b832ab19b8e7 Mon Sep 17 00:00:00 2001
From: Ryan Roberts <ryan.roberts(a)arm.com>
Date: Wed, 1 May 2024 15:33:10 +0100
Subject: [PATCH] mm: fix race between __split_huge_pmd_locked() and GUP-fast
__split_huge_pmd_locked() can be called for a present THP, devmap or
(non-present) migration entry. It calls pmdp_invalidate() unconditionally
on the pmdp and only determines if it is present or not based on the
returned old pmd. This is a problem for the migration entry case because
pmd_mkinvalid(), called by pmdp_invalidate() must only be called for a
present pmd.
On arm64 at least, pmd_mkinvalid() will mark the pmd such that any future
call to pmd_present() will return true. And therefore any lockless
pgtable walker could see the migration entry pmd in this state and start
interpretting the fields as if it were present, leading to BadThings (TM).
GUP-fast appears to be one such lockless pgtable walker.
x86 does not suffer the above problem, but instead pmd_mkinvalid() will
corrupt the offset field of the swap entry within the swap pte. See link
below for discussion of that problem.
Fix all of this by only calling pmdp_invalidate() for a present pmd. And
for good measure let's add a warning to all implementations of
pmdp_invalidate[_ad](). I've manually reviewed all other
pmdp_invalidate[_ad]() call sites and believe all others to be conformant.
This is a theoretical bug found during code review. I don't have any test
case to trigger it in practice.
Link: https://lkml.kernel.org/r/20240501143310.1381675-1-ryan.roberts@arm.com
Link: https://lore.kernel.org/all/0dd7827a-6334-439a-8fd0-43c98e6af22b@arm.com/
Fixes: 84c3fc4e9c56 ("mm: thp: check pmd migration entry in common path")
Signed-off-by: Ryan Roberts <ryan.roberts(a)arm.com>
Reviewed-by: Zi Yan <ziy(a)nvidia.com>
Reviewed-by: Anshuman Khandual <anshuman.khandual(a)arm.com>
Acked-by: David Hildenbrand <david(a)redhat.com>
Cc: Andreas Larsson <andreas(a)gaisler.com>
Cc: Andy Lutomirski <luto(a)kernel.org>
Cc: Aneesh Kumar K.V <aneesh.kumar(a)kernel.org>
Cc: Borislav Petkov (AMD) <bp(a)alien8.de>
Cc: Catalin Marinas <catalin.marinas(a)arm.com>
Cc: Christian Borntraeger <borntraeger(a)linux.ibm.com>
Cc: Christophe Leroy <christophe.leroy(a)csgroup.eu>
Cc: Dave Hansen <dave.hansen(a)linux.intel.com>
Cc: "David S. Miller" <davem(a)davemloft.net>
Cc: Ingo Molnar <mingo(a)redhat.com>
Cc: Jonathan Corbet <corbet(a)lwn.net>
Cc: Mark Rutland <mark.rutland(a)arm.com>
Cc: Naveen N. Rao <naveen.n.rao(a)linux.ibm.com>
Cc: Nicholas Piggin <npiggin(a)gmail.com>
Cc: Peter Zijlstra <peterz(a)infradead.org>
Cc: Sven Schnelle <svens(a)linux.ibm.com>
Cc: Thomas Gleixner <tglx(a)linutronix.de>
Cc: Will Deacon <will(a)kernel.org>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
diff --git a/Documentation/mm/arch_pgtable_helpers.rst b/Documentation/mm/arch_pgtable_helpers.rst
index 2466d3363af7..ad50ca6f495e 100644
--- a/Documentation/mm/arch_pgtable_helpers.rst
+++ b/Documentation/mm/arch_pgtable_helpers.rst
@@ -140,7 +140,8 @@ PMD Page Table Helpers
+---------------------------+--------------------------------------------------+
| pmd_swp_clear_soft_dirty | Clears a soft dirty swapped PMD |
+---------------------------+--------------------------------------------------+
-| pmd_mkinvalid | Invalidates a mapped PMD [1] |
+| pmd_mkinvalid | Invalidates a present PMD; do not call for |
+| | non-present PMD [1] |
+---------------------------+--------------------------------------------------+
| pmd_set_huge | Creates a PMD huge mapping |
+---------------------------+--------------------------------------------------+
@@ -196,7 +197,8 @@ PUD Page Table Helpers
+---------------------------+--------------------------------------------------+
| pud_mkdevmap | Creates a ZONE_DEVICE mapped PUD |
+---------------------------+--------------------------------------------------+
-| pud_mkinvalid | Invalidates a mapped PUD [1] |
+| pud_mkinvalid | Invalidates a present PUD; do not call for |
+| | non-present PUD [1] |
+---------------------------+--------------------------------------------------+
| pud_set_huge | Creates a PUD huge mapping |
+---------------------------+--------------------------------------------------+
diff --git a/arch/powerpc/mm/book3s64/pgtable.c b/arch/powerpc/mm/book3s64/pgtable.c
index 83823db3488b..2975ea0841ba 100644
--- a/arch/powerpc/mm/book3s64/pgtable.c
+++ b/arch/powerpc/mm/book3s64/pgtable.c
@@ -170,6 +170,7 @@ pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
{
unsigned long old_pmd;
+ VM_WARN_ON_ONCE(!pmd_present(*pmdp));
old_pmd = pmd_hugepage_update(vma->vm_mm, address, pmdp, _PAGE_PRESENT, _PAGE_INVALID);
flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
return __pmd(old_pmd);
diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h
index 2cb2a2e7b34b..558902edbfec 100644
--- a/arch/s390/include/asm/pgtable.h
+++ b/arch/s390/include/asm/pgtable.h
@@ -1769,8 +1769,10 @@ static inline pmd_t pmdp_huge_clear_flush(struct vm_area_struct *vma,
static inline pmd_t pmdp_invalidate(struct vm_area_struct *vma,
unsigned long addr, pmd_t *pmdp)
{
- pmd_t pmd = __pmd(pmd_val(*pmdp) | _SEGMENT_ENTRY_INVALID);
+ pmd_t pmd;
+ VM_WARN_ON_ONCE(!pmd_present(*pmdp));
+ pmd = __pmd(pmd_val(*pmdp) | _SEGMENT_ENTRY_INVALID);
return pmdp_xchg_direct(vma->vm_mm, addr, pmdp, pmd);
}
diff --git a/arch/sparc/mm/tlb.c b/arch/sparc/mm/tlb.c
index 19642f7ffb52..8648a50afe88 100644
--- a/arch/sparc/mm/tlb.c
+++ b/arch/sparc/mm/tlb.c
@@ -249,6 +249,7 @@ pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
{
pmd_t old, entry;
+ VM_WARN_ON_ONCE(!pmd_present(*pmdp));
entry = __pmd(pmd_val(*pmdp) & ~_PAGE_VALID);
old = pmdp_establish(vma, address, pmdp, entry);
flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 94767c82fc0d..93e54ba91fbf 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -631,6 +631,8 @@ int pmdp_clear_flush_young(struct vm_area_struct *vma,
pmd_t pmdp_invalidate_ad(struct vm_area_struct *vma, unsigned long address,
pmd_t *pmdp)
{
+ VM_WARN_ON_ONCE(!pmd_present(*pmdp));
+
/*
* No flush is necessary. Once an invalid PTE is established, the PTE's
* access and dirty bits cannot be updated.
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 08e4f3343bcd..ccdcff73284a 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2430,32 +2430,11 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
return __split_huge_zero_page_pmd(vma, haddr, pmd);
}
- /*
- * Up to this point the pmd is present and huge and userland has the
- * whole access to the hugepage during the split (which happens in
- * place). If we overwrite the pmd with the not-huge version pointing
- * to the pte here (which of course we could if all CPUs were bug
- * free), userland could trigger a small page size TLB miss on the
- * small sized TLB while the hugepage TLB entry is still established in
- * the huge TLB. Some CPU doesn't like that.
- * See http://support.amd.com/TechDocs/41322_10h_Rev_Gd.pdf, Erratum
- * 383 on page 105. Intel should be safe but is also warns that it's
- * only safe if the permission and cache attributes of the two entries
- * loaded in the two TLB is identical (which should be the case here).
- * But it is generally safer to never allow small and huge TLB entries
- * for the same virtual address to be loaded simultaneously. So instead
- * of doing "pmd_populate(); flush_pmd_tlb_range();" we first mark the
- * current pmd notpresent (atomically because here the pmd_trans_huge
- * must remain set at all times on the pmd until the split is complete
- * for this pmd), then we flush the SMP TLB and finally we write the
- * non-huge version of the pmd entry with pmd_populate.
- */
- old_pmd = pmdp_invalidate(vma, haddr, pmd);
-
- pmd_migration = is_pmd_migration_entry(old_pmd);
+ pmd_migration = is_pmd_migration_entry(*pmd);
if (unlikely(pmd_migration)) {
swp_entry_t entry;
+ old_pmd = *pmd;
entry = pmd_to_swp_entry(old_pmd);
page = pfn_swap_entry_to_page(entry);
write = is_writable_migration_entry(entry);
@@ -2466,6 +2445,30 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
soft_dirty = pmd_swp_soft_dirty(old_pmd);
uffd_wp = pmd_swp_uffd_wp(old_pmd);
} else {
+ /*
+ * Up to this point the pmd is present and huge and userland has
+ * the whole access to the hugepage during the split (which
+ * happens in place). If we overwrite the pmd with the not-huge
+ * version pointing to the pte here (which of course we could if
+ * all CPUs were bug free), userland could trigger a small page
+ * size TLB miss on the small sized TLB while the hugepage TLB
+ * entry is still established in the huge TLB. Some CPU doesn't
+ * like that. See
+ * http://support.amd.com/TechDocs/41322_10h_Rev_Gd.pdf, Erratum
+ * 383 on page 105. Intel should be safe but is also warns that
+ * it's only safe if the permission and cache attributes of the
+ * two entries loaded in the two TLB is identical (which should
+ * be the case here). But it is generally safer to never allow
+ * small and huge TLB entries for the same virtual address to be
+ * loaded simultaneously. So instead of doing "pmd_populate();
+ * flush_pmd_tlb_range();" we first mark the current pmd
+ * notpresent (atomically because here the pmd_trans_huge must
+ * remain set at all times on the pmd until the split is
+ * complete for this pmd), then we flush the SMP TLB and finally
+ * we write the non-huge version of the pmd entry with
+ * pmd_populate.
+ */
+ old_pmd = pmdp_invalidate(vma, haddr, pmd);
page = pmd_page(old_pmd);
folio = page_folio(page);
if (pmd_dirty(old_pmd)) {
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index 4fcd959dcc4d..a78a4adf711a 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -198,6 +198,7 @@ pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
pmd_t *pmdp)
{
+ VM_WARN_ON_ONCE(!pmd_present(*pmdp));
pmd_t old = pmdp_establish(vma, address, pmdp, pmd_mkinvalid(*pmdp));
flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
return old;
@@ -208,6 +209,7 @@ pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
pmd_t pmdp_invalidate_ad(struct vm_area_struct *vma, unsigned long address,
pmd_t *pmdp)
{
+ VM_WARN_ON_ONCE(!pmd_present(*pmdp));
return pmdp_invalidate(vma, address, pmdp);
}
#endif
From: Martin Kaiser <martin(a)kaiser.cx>
commit a73bda63a102a5f1feb730d4d809de098a3d1886 upstream.
The vf610 gpio driver is used in i.MX8QM, DXL, ULP and i.MX93 chips.
Enable it in arm64 defconfig.
(vf610 gpio used to be enabled by default for all i.MX chips. This was
changed recently as most i.MX chips don't need this driver.)
Cc: <stable(a)vger.kernel.org> # 6.6.x
Signed-off-by: Martin Kaiser <martin(a)kaiser.cx>
Signed-off-by: Shawn Guo <shawnguo(a)kernel.org>
Signed-off-by: Fabio Estevam <festevam(a)gmail.com>
---
Hi,
This fixes a boot regression on imx93-evk running 6.6.32.
Please apply it to the 6.6.y stable tree.
arch/arm64/configs/defconfig | 1 +
1 file changed, 1 insertion(+)
diff --git a/arch/arm64/configs/defconfig b/arch/arm64/configs/defconfig
index 24f395d9ce2a36..9f82eb906a8a34 100644
--- a/arch/arm64/configs/defconfig
+++ b/arch/arm64/configs/defconfig
@@ -632,6 +632,7 @@ CONFIG_GPIO_SYSCON=y
CONFIG_GPIO_UNIPHIER=y
CONFIG_GPIO_VISCONTI=y
CONFIG_GPIO_WCD934X=m
+CONFIG_GPIO_VF610=y
CONFIG_GPIO_XGENE=y
CONFIG_GPIO_XGENE_SB=y
CONFIG_GPIO_MAX732X=y
The patch below does not apply to the 6.1-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.1.y
git checkout FETCH_HEAD
git cherry-pick -x aba091547ef6159d52471f42a3ef531b7b660ed8
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024061340-troubling-automated-9989@gregkh' --subject-prefix 'PATCH 6.1.y' HEAD^..
Possible dependencies:
aba091547ef6 ("kbuild: Remove support for Clang's ThinLTO caching")
1db773da58df ("kbuild: remove old Rust docs output path")
7ea01d3169a2 ("rust: delete rust-project.json when running make clean")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From aba091547ef6159d52471f42a3ef531b7b660ed8 Mon Sep 17 00:00:00 2001
From: Nathan Chancellor <nathan(a)kernel.org>
Date: Wed, 1 May 2024 15:55:25 -0700
Subject: [PATCH] kbuild: Remove support for Clang's ThinLTO caching
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
There is an issue in clang's ThinLTO caching (enabled for the kernel via
'--thinlto-cache-dir') with .incbin, which the kernel occasionally uses
to include data within the kernel, such as the .config file for
/proc/config.gz. For example, when changing the .config and rebuilding
vmlinux, the copy of .config in vmlinux does not match the copy of
.config in the build folder:
$ echo 'CONFIG_LTO_NONE=n
CONFIG_LTO_CLANG_THIN=y
CONFIG_IKCONFIG=y
CONFIG_HEADERS_INSTALL=y' >kernel/configs/repro.config
$ make -skj"$(nproc)" ARCH=x86_64 LLVM=1 clean defconfig repro.config vmlinux
...
$ grep CONFIG_HEADERS_INSTALL .config
CONFIG_HEADERS_INSTALL=y
$ scripts/extract-ikconfig vmlinux | grep CONFIG_HEADERS_INSTALL
CONFIG_HEADERS_INSTALL=y
$ scripts/config -d HEADERS_INSTALL
$ make -kj"$(nproc)" ARCH=x86_64 LLVM=1 vmlinux
...
UPD kernel/config_data
GZIP kernel/config_data.gz
CC kernel/configs.o
...
LD vmlinux
...
$ grep CONFIG_HEADERS_INSTALL .config
# CONFIG_HEADERS_INSTALL is not set
$ scripts/extract-ikconfig vmlinux | grep CONFIG_HEADERS_INSTALL
CONFIG_HEADERS_INSTALL=y
Without '--thinlto-cache-dir' or when using full LTO, this issue does
not occur.
Benchmarking incremental builds on a few different machines with and
without the cache shows a 20% increase in incremental build time without
the cache when measured by touching init/main.c and running 'make all'.
ARCH=arm64 defconfig + CONFIG_LTO_CLANG_THIN=y on an arm64 host:
Benchmark 1: With ThinLTO cache
Time (mean ± σ): 56.347 s ± 0.163 s [User: 83.768 s, System: 24.661 s]
Range (min … max): 56.109 s … 56.594 s 10 runs
Benchmark 2: Without ThinLTO cache
Time (mean ± σ): 67.740 s ± 0.479 s [User: 718.458 s, System: 31.797 s]
Range (min … max): 67.059 s … 68.556 s 10 runs
Summary
With ThinLTO cache ran
1.20 ± 0.01 times faster than Without ThinLTO cache
ARCH=x86_64 defconfig + CONFIG_LTO_CLANG_THIN=y on an x86_64 host:
Benchmark 1: With ThinLTO cache
Time (mean ± σ): 85.772 s ± 0.252 s [User: 91.505 s, System: 8.408 s]
Range (min … max): 85.447 s … 86.244 s 10 runs
Benchmark 2: Without ThinLTO cache
Time (mean ± σ): 103.833 s ± 0.288 s [User: 232.058 s, System: 8.569 s]
Range (min … max): 103.286 s … 104.124 s 10 runs
Summary
With ThinLTO cache ran
1.21 ± 0.00 times faster than Without ThinLTO cache
While it is unfortunate to take this performance improvement off the
table, correctness is more important. If/when this is fixed in LLVM, it
can potentially be brought back in a conditional manner. Alternatively,
a developer can just disable LTO if doing incremental compiles quickly
is important, as a full compile cycle can still take over a minute even
with the cache and it is unlikely that LTO will result in functional
differences for a kernel change.
Cc: stable(a)vger.kernel.org
Fixes: dc5723b02e52 ("kbuild: add support for Clang LTO")
Reported-by: Yifan Hong <elsk(a)google.com>
Closes: https://github.com/ClangBuiltLinux/linux/issues/2021
Reported-by: Masami Hiramatsu <mhiramat(a)kernel.org>
Closes: https://lore.kernel.org/r/20220327115526.cc4b0ff55fc53c97683c3e4d@kernel.or…
Signed-off-by: Nathan Chancellor <nathan(a)kernel.org>
Signed-off-by: Masahiro Yamada <masahiroy(a)kernel.org>
diff --git a/Makefile b/Makefile
index 026971f9f6bc..12e1a792b8de 100644
--- a/Makefile
+++ b/Makefile
@@ -942,7 +942,6 @@ endif
ifdef CONFIG_LTO_CLANG
ifdef CONFIG_LTO_CLANG_THIN
CC_FLAGS_LTO := -flto=thin -fsplit-lto-unit
-KBUILD_LDFLAGS += --thinlto-cache-dir=$(extmod_prefix).thinlto-cache
else
CC_FLAGS_LTO := -flto
endif
@@ -1480,7 +1479,7 @@ endif # CONFIG_MODULES
# Directories & files removed with 'make clean'
CLEAN_FILES += vmlinux.symvers modules-only.symvers \
modules.builtin modules.builtin.modinfo modules.nsdeps \
- compile_commands.json .thinlto-cache rust/test \
+ compile_commands.json rust/test \
rust-project.json .vmlinux.objs .vmlinux.export.c
# Directories & files removed with 'make mrproper'
@@ -1787,7 +1786,7 @@ PHONY += compile_commands.json
clean-dirs := $(KBUILD_EXTMOD)
clean: rm-files := $(KBUILD_EXTMOD)/Module.symvers $(KBUILD_EXTMOD)/modules.nsdeps \
- $(KBUILD_EXTMOD)/compile_commands.json $(KBUILD_EXTMOD)/.thinlto-cache
+ $(KBUILD_EXTMOD)/compile_commands.json
PHONY += prepare
# now expand this into a simple variable to reduce the cost of shell evaluations
On 6/21/2024 5:47 PM, Sasha Levin wrote:
> This is a note to let you know that I've just added the patch titled
>
> ACPI: EC: Install address space handler at the namespace root
For this, you need
https://git.kernel.org/pub/scm/linux/kernel/git/rafael/linux-pm.git/commit/…
too (here and for 6.9).
Thanks!
> to the 6.6-stable tree which can be found at:
> http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=sum…
>
> The filename of the patch is:
> acpi-ec-install-address-space-handler-at-the-namespa.patch
> and it can be found in the queue-6.6 subdirectory.
>
> If you, or anyone else, feels it should not be added to the stable tree,
> please let <stable(a)vger.kernel.org> know about it.
>
>
>
> commit ee44236dfbf5541d5fbcb52db961616292c84c0d
> Author: Rafael J. Wysocki <rafael.j.wysocki(a)intel.com>
> Date: Wed May 15 21:40:54 2024 +0200
>
> ACPI: EC: Install address space handler at the namespace root
>
> [ Upstream commit 60fa6ae6e6d09e377fce6f8d9b6f6a4d88769f63 ]
>
> It is reported that _DSM evaluation fails in ucsi_acpi_dsm() on Lenovo
> IdeaPad Pro 5 due to a missing address space handler for the EC address
> space:
>
> ACPI Error: No handler for Region [ECSI] (000000007b8176ee) [EmbeddedControl] (20230628/evregion-130)
>
> This happens because if there is no ECDT, the EC driver only registers
> the EC address space handler for operation regions defined in the EC
> device scope of the ACPI namespace while the operation region being
> accessed by the _DSM in question is located beyond that scope.
>
> To address this, modify the ACPI EC driver to install the EC address
> space handler at the root of the ACPI namespace for the first EC that
> can be found regardless of whether or not an ECDT is present.
>
> Note that this change is consistent with some examples in the ACPI
> specification in which EC operation regions located outside the EC
> device scope are used (for example, see Section 9.17.15 in ACPI 6.5),
> so the current behavior of the EC driver is arguably questionable.
>
> Reported-by: webcaptcha <webcapcha(a)gmail.com>
> Link: https://bugzilla.kernel.org/show_bug.cgi?id=218789
> Link: https://uefi.org/specs/ACPI/6.5/09_ACPI_Defined_Devices_and_Device_Specific…
> Link: https://lore.kernel.org/linux-acpi/Zi+0whTvDbAdveHq@kuha.fi.intel.com
> Suggested-by: Heikki Krogerus <heikki.krogerus(a)linux.intel.com>
> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki(a)intel.com>
> Reviewed-by: Hans de Goede <hdegoede(a)redhat.com>
> Reviewed-by: Mario Limonciello <mario.limonciello(a)amd.com>
> Reviewed-by: Andy Shevchenko <andriy.shevchenko(a)linux.intel.com>
> Signed-off-by: Sasha Levin <sashal(a)kernel.org>
>
> diff --git a/drivers/acpi/ec.c b/drivers/acpi/ec.c
> index a59c11df73754..0795f92d8927d 100644
> --- a/drivers/acpi/ec.c
> +++ b/drivers/acpi/ec.c
> @@ -1482,13 +1482,14 @@ static bool install_gpio_irq_event_handler(struct acpi_ec *ec)
> static int ec_install_handlers(struct acpi_ec *ec, struct acpi_device *device,
> bool call_reg)
> {
> + acpi_handle scope_handle = ec == first_ec ? ACPI_ROOT_OBJECT : ec->handle;
> acpi_status status;
>
> acpi_ec_start(ec, false);
>
> if (!test_bit(EC_FLAGS_EC_HANDLER_INSTALLED, &ec->flags)) {
> acpi_ec_enter_noirq(ec);
> - status = acpi_install_address_space_handler_no_reg(ec->handle,
> + status = acpi_install_address_space_handler_no_reg(scope_handle,
> ACPI_ADR_SPACE_EC,
> &acpi_ec_space_handler,
> NULL, ec);
> @@ -1497,11 +1498,10 @@ static int ec_install_handlers(struct acpi_ec *ec, struct acpi_device *device,
> return -ENODEV;
> }
> set_bit(EC_FLAGS_EC_HANDLER_INSTALLED, &ec->flags);
> - ec->address_space_handler_holder = ec->handle;
> }
>
> if (call_reg && !test_bit(EC_FLAGS_EC_REG_CALLED, &ec->flags)) {
> - acpi_execute_reg_methods(ec->handle, ACPI_ADR_SPACE_EC);
> + acpi_execute_reg_methods(scope_handle, ACPI_ADR_SPACE_EC);
> set_bit(EC_FLAGS_EC_REG_CALLED, &ec->flags);
> }
>
> @@ -1553,10 +1553,13 @@ static int ec_install_handlers(struct acpi_ec *ec, struct acpi_device *device,
>
> static void ec_remove_handlers(struct acpi_ec *ec)
> {
> + acpi_handle scope_handle = ec == first_ec ? ACPI_ROOT_OBJECT : ec->handle;
> +
> if (test_bit(EC_FLAGS_EC_HANDLER_INSTALLED, &ec->flags)) {
> if (ACPI_FAILURE(acpi_remove_address_space_handler(
> - ec->address_space_handler_holder,
> - ACPI_ADR_SPACE_EC, &acpi_ec_space_handler)))
> + scope_handle,
> + ACPI_ADR_SPACE_EC,
> + &acpi_ec_space_handler)))
> pr_err("failed to remove space handler\n");
> clear_bit(EC_FLAGS_EC_HANDLER_INSTALLED, &ec->flags);
> }
> @@ -1595,14 +1598,18 @@ static int acpi_ec_setup(struct acpi_ec *ec, struct acpi_device *device, bool ca
> {
> int ret;
>
> - ret = ec_install_handlers(ec, device, call_reg);
> - if (ret)
> - return ret;
> -
> /* First EC capable of handling transactions */
> if (!first_ec)
> first_ec = ec;
>
> + ret = ec_install_handlers(ec, device, call_reg);
> + if (ret) {
> + if (ec == first_ec)
> + first_ec = NULL;
> +
> + return ret;
> + }
> +
> pr_info("EC_CMD/EC_SC=0x%lx, EC_DATA=0x%lx\n", ec->command_addr,
> ec->data_addr);
>
> diff --git a/drivers/acpi/internal.h b/drivers/acpi/internal.h
> index 866c7c4ed2331..6db1a03dd5399 100644
> --- a/drivers/acpi/internal.h
> +++ b/drivers/acpi/internal.h
> @@ -167,7 +167,6 @@ enum acpi_ec_event_state {
>
> struct acpi_ec {
> acpi_handle handle;
> - acpi_handle address_space_handler_holder;
> int gpe;
> int irq;
> unsigned long command_addr;
On Fri, Jun 21, 2024 at 11:42:01AM -0400, Sasha Levin wrote:
> This is a note to let you know that I've just added the patch titled
>
> ASoC: Intel: sof_cs42l42: rename BT offload quirk
This is not obvious stable material.
>
> Reviewed-by: Bard Liao <yung-chuan.liao(a)linux.intel.com>
> Signed-off-by: Brent Lu <brent.lu(a)intel.com>
> Signed-off-by: Pierre-Louis Bossart <pierre-louis.bossart(a)linux.intel.com>
> Link: https://msgid.link/r/20240325221059.206042-7-pierre-louis.bossart@linux.int…
> Signed-off-by: Mark Brown <broonie(a)kernel.org>
> Signed-off-by: Sasha Levin <sashal(a)kernel.org>
I'm also not seeing any stable-dep-of r anything in here.
When the source address is selected, the scope must be checked. For
example, if a loopback address is assigned to the vrf device, it must not
be chosen for packets sent outside.
CC: stable(a)vger.kernel.org
Fixes: afbac6010aec ("net: ipv6: Address selection needs to consider L3 domains")
Signed-off-by: Nicolas Dichtel <nicolas.dichtel(a)6wind.com>
---
net/ipv6/addrconf.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 5c424a0e7232..4f2c5cc31015 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -1873,7 +1873,8 @@ int ipv6_dev_get_saddr(struct net *net, const struct net_device *dst_dev,
master, &dst,
scores, hiscore_idx);
- if (scores[hiscore_idx].ifa)
+ if (scores[hiscore_idx].ifa &&
+ scores[hiscore_idx].scopedist >= 0)
goto out;
}
--
2.43.1
By default, an address assigned to the output interface is selected when
the source address is not specified. This is problematic when a route,
configured in a vrf, uses an interface from another vrf (aka route leak).
The original vrf does not own the selected source address.
Let's add a check against the output interface and call the appropriate
function to select the source address.
CC: stable(a)vger.kernel.org
Fixes: 8cbb512c923d ("net: Add source address lookup op for VRF")
Signed-off-by: Nicolas Dichtel <nicolas.dichtel(a)6wind.com>
---
net/ipv4/fib_semantics.c | 11 +++++++++--
1 file changed, 9 insertions(+), 2 deletions(-)
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index f669da98d11d..459082f4936d 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -2270,6 +2270,13 @@ void fib_select_path(struct net *net, struct fib_result *res,
fib_select_default(fl4, res);
check_saddr:
- if (!fl4->saddr)
- fl4->saddr = fib_result_prefsrc(net, res);
+ if (!fl4->saddr) {
+ struct net_device *l3mdev = dev_get_by_index_rcu(net, fl4->flowi4_l3mdev);
+
+ if (!l3mdev ||
+ l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) == l3mdev)
+ fl4->saddr = fib_result_prefsrc(net, res);
+ else
+ fl4->saddr = inet_select_addr(l3mdev, 0, RT_SCOPE_LINK);
+ }
}
--
2.43.1
From: Arnd Bergmann <arnd(a)arndb.de>
The unusual function calling conventions on superh ended up causing
sync_file_range to have the wrong argument order, with the 'flags'
argument getting sorted before 'nbytes' by the compiler.
In userspace, I found that musl, glibc, uclibc and strace all expect the
normal calling conventions with 'nbytes' last, so changing the kernel
to match them should make all of those work.
In order to be able to also fix libc implementations to work with existing
kernels, they need to be able to tell which ABI is used. An easy way
to do this is to add yet another system call using the sync_file_range2
ABI that works the same on all architectures.
Old user binaries can now work on new kernels, and new binaries can
try the new sync_file_range2() to work with new kernels or fall back
to the old sync_file_range() version if that doesn't exist.
Cc: stable(a)vger.kernel.org
Fixes: 75c92acdd5b1 ("sh: Wire up new syscalls.")
Signed-off-by: Arnd Bergmann <arnd(a)arndb.de>
---
arch/sh/kernel/sys_sh32.c | 11 +++++++++++
arch/sh/kernel/syscalls/syscall.tbl | 3 ++-
2 files changed, 13 insertions(+), 1 deletion(-)
diff --git a/arch/sh/kernel/sys_sh32.c b/arch/sh/kernel/sys_sh32.c
index 9dca568509a5..d5a4f7c697d8 100644
--- a/arch/sh/kernel/sys_sh32.c
+++ b/arch/sh/kernel/sys_sh32.c
@@ -59,3 +59,14 @@ asmlinkage int sys_fadvise64_64_wrapper(int fd, u32 offset0, u32 offset1,
(u64)len0 << 32 | len1, advice);
#endif
}
+
+/*
+ * swap the arguments the way that libc wants it instead of
+ * moving flags ahead of the 64-bit nbytes argument
+ */
+SYSCALL_DEFINE6(sh_sync_file_range6, int, fd, SC_ARG64(offset),
+ SC_ARG64(nbytes), unsigned int, flags)
+{
+ return ksys_sync_file_range(fd, SC_VAL64(loff_t, offset),
+ SC_VAL64(loff_t, nbytes), flags);
+}
diff --git a/arch/sh/kernel/syscalls/syscall.tbl b/arch/sh/kernel/syscalls/syscall.tbl
index bbf83a2db986..c55fd7696d40 100644
--- a/arch/sh/kernel/syscalls/syscall.tbl
+++ b/arch/sh/kernel/syscalls/syscall.tbl
@@ -321,7 +321,7 @@
311 common set_robust_list sys_set_robust_list
312 common get_robust_list sys_get_robust_list
313 common splice sys_splice
-314 common sync_file_range sys_sync_file_range
+314 common sync_file_range sys_sh_sync_file_range6
315 common tee sys_tee
316 common vmsplice sys_vmsplice
317 common move_pages sys_move_pages
@@ -395,6 +395,7 @@
385 common pkey_alloc sys_pkey_alloc
386 common pkey_free sys_pkey_free
387 common rseq sys_rseq
+388 common sync_file_range2 sys_sync_file_range2
# room for arch specific syscalls
393 common semget sys_semget
394 common semctl sys_semctl
--
2.39.2
According to the code logic, when the kernel is loaded to address 0,
no copying operation should be performed, but it is currently being
done.
This patch fixes the issue where the kernel code was incorrectly
duplicated to address 0 when booting from address 0.
Fixes: b270bebd34e3 ("powerpc/64s: Run at the kernel virtual address earlier in boot")
Signed-off-by: Jinglin Wen <jinglin.wen(a)shingroup.cn>
Suggested-by: Michael Ellerman <mpe(a)ellerman.id.au>
Cc: <stable(a)vger.kernel.org>
---
v2:
- According to 87le336c6k.fsf(a)mail.lhotse, improve this patch.
v1:
- 20240617023509.5674-1-jinglin.wen(a)shingroup.cn
arch/powerpc/kernel/head_64.S | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/arch/powerpc/kernel/head_64.S b/arch/powerpc/kernel/head_64.S
index 4690c219bfa4..63432a33ec49 100644
--- a/arch/powerpc/kernel/head_64.S
+++ b/arch/powerpc/kernel/head_64.S
@@ -647,8 +647,9 @@ __after_prom_start:
* Note: This process overwrites the OF exception vectors.
*/
LOAD_REG_IMMEDIATE(r3, PAGE_OFFSET)
- mr. r4,r26 /* In some cases the loader may */
- beq 9f /* have already put us at zero */
+ mr r4,r26 /* Load the virtual source address into r4 */
+ cmpld r3,r4 /* Check if source == dest */
+ beq 9f /* If so skip the copy */
li r6,0x100 /* Start offset, the first 0x100 */
/* bytes were copied earlier. */
--
2.25.1
Rename tdx_parse_tdinfo() to tdx_setup() and move setting NOTIFY_ENABLES
there.
The function will be extended to adjust TD configuration.
Signed-off-by: Kirill A. Shutemov <kirill.shutemov(a)linux.intel.com>
Reviewed-by: Kuppuswamy Sathyanarayanan <sathyanarayanan.kuppuswamy(a)linux.intel.com>
Reviewed-by: Kai Huang <kai.huang(a)intel.com>
Cc: stable(a)vger.kernel.org
---
arch/x86/coco/tdx/tdx.c | 13 ++++++++-----
1 file changed, 8 insertions(+), 5 deletions(-)
diff --git a/arch/x86/coco/tdx/tdx.c b/arch/x86/coco/tdx/tdx.c
index 64717a96a936..08ce488b54d0 100644
--- a/arch/x86/coco/tdx/tdx.c
+++ b/arch/x86/coco/tdx/tdx.c
@@ -193,7 +193,7 @@ static void __noreturn tdx_panic(const char *msg)
__tdx_hypercall(&args);
}
-static void tdx_parse_tdinfo(u64 *cc_mask)
+static void tdx_setup(u64 *cc_mask)
{
struct tdx_module_args args = {};
unsigned int gpa_width;
@@ -218,6 +218,9 @@ static void tdx_parse_tdinfo(u64 *cc_mask)
gpa_width = args.rcx & GENMASK(5, 0);
*cc_mask = BIT_ULL(gpa_width - 1);
+ /* Kernel does not use NOTIFY_ENABLES and does not need random #VEs */
+ tdg_vm_wr(TDCS_NOTIFY_ENABLES, 0, -1ULL);
+
/*
* The kernel can not handle #VE's when accessing normal kernel
* memory. Ensure that no #VE will be delivered for accesses to
@@ -964,11 +967,11 @@ void __init tdx_early_init(void)
setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE);
cc_vendor = CC_VENDOR_INTEL;
- tdx_parse_tdinfo(&cc_mask);
- cc_set_mask(cc_mask);
- /* Kernel does not use NOTIFY_ENABLES and does not need random #VEs */
- tdg_vm_wr(TDCS_NOTIFY_ENABLES, 0, -1ULL);
+ /* Configure the TD */
+ tdx_setup(&cc_mask);
+
+ cc_set_mask(cc_mask);
/*
* All bits above GPA width are reserved and kernel treats shared bit
--
2.43.0
The TDG_VM_WR TDCALL is used to ask the TDX module to change some
TD-specific VM configuration. There is currently only one user in the
kernel of this TDCALL leaf. More will be added shortly.
Refactor to make way for more users of TDG_VM_WR who will need to modify
other TD configuration values.
Add a wrapper for the TDG_VM_RD TDCALL that requests TD-specific
metadata from the TDX module. There are currently no users for
TDG_VM_RD. Mark it as __maybe_unused until the first user appears.
This is preparation for enumeration and enabling optional TD features.
Signed-off-by: Kirill A. Shutemov <kirill.shutemov(a)linux.intel.com>
Reviewed-by: Kai Huang <kai.huang(a)intel.com>
Reviewed-by: Kuppuswamy Sathyanarayanan <sathyanarayanan.kuppuswamy(a)linux.intel.com>
Cc: stable(a)vger.kernel.org
---
arch/x86/coco/tdx/tdx.c | 32 ++++++++++++++++++++++++++-----
arch/x86/include/asm/shared/tdx.h | 1 +
2 files changed, 28 insertions(+), 5 deletions(-)
diff --git a/arch/x86/coco/tdx/tdx.c b/arch/x86/coco/tdx/tdx.c
index 078e2bac2553..64717a96a936 100644
--- a/arch/x86/coco/tdx/tdx.c
+++ b/arch/x86/coco/tdx/tdx.c
@@ -77,6 +77,32 @@ static inline void tdcall(u64 fn, struct tdx_module_args *args)
panic("TDCALL %lld failed (Buggy TDX module!)\n", fn);
}
+/* Read TD-scoped metadata */
+static inline u64 __maybe_unused tdg_vm_rd(u64 field, u64 *value)
+{
+ struct tdx_module_args args = {
+ .rdx = field,
+ };
+ u64 ret;
+
+ ret = __tdcall_ret(TDG_VM_RD, &args);
+ *value = args.r8;
+
+ return ret;
+}
+
+/* Write TD-scoped metadata */
+static inline u64 tdg_vm_wr(u64 field, u64 value, u64 mask)
+{
+ struct tdx_module_args args = {
+ .rdx = field,
+ .r8 = value,
+ .r9 = mask,
+ };
+
+ return __tdcall(TDG_VM_WR, &args);
+}
+
/**
* tdx_mcall_get_report0() - Wrapper to get TDREPORT0 (a.k.a. TDREPORT
* subtype 0) using TDG.MR.REPORT TDCALL.
@@ -924,10 +950,6 @@ static void tdx_kexec_finish(void)
void __init tdx_early_init(void)
{
- struct tdx_module_args args = {
- .rdx = TDCS_NOTIFY_ENABLES,
- .r9 = -1ULL,
- };
u64 cc_mask;
u32 eax, sig[3];
@@ -946,7 +968,7 @@ void __init tdx_early_init(void)
cc_set_mask(cc_mask);
/* Kernel does not use NOTIFY_ENABLES and does not need random #VEs */
- tdcall(TDG_VM_WR, &args);
+ tdg_vm_wr(TDCS_NOTIFY_ENABLES, 0, -1ULL);
/*
* All bits above GPA width are reserved and kernel treats shared bit
diff --git a/arch/x86/include/asm/shared/tdx.h b/arch/x86/include/asm/shared/tdx.h
index fdfd41511b02..7e12cfa28bec 100644
--- a/arch/x86/include/asm/shared/tdx.h
+++ b/arch/x86/include/asm/shared/tdx.h
@@ -16,6 +16,7 @@
#define TDG_VP_VEINFO_GET 3
#define TDG_MR_REPORT 4
#define TDG_MEM_PAGE_ACCEPT 6
+#define TDG_VM_RD 7
#define TDG_VM_WR 8
/* TDCS fields. To be used by TDG.VM.WR and TDG.VM.RD module calls */
--
2.43.0
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.4.y
git checkout FETCH_HEAD
git cherry-pick -x 3a5a8d343e1cf96eb9971b17cbd4b832ab19b8e7
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024061319-skater-sculptor-905f@gregkh' --subject-prefix 'PATCH 5.4.y' HEAD^..
Possible dependencies:
3a5a8d343e1c ("mm: fix race between __split_huge_pmd_locked() and GUP-fast")
4f83145721f3 ("mm: avoid unnecessary flush on change_huge_pmd()")
c9fe66560bf2 ("mm/mprotect: do not flush when not required architecturally")
4a18419f71cd ("mm/mprotect: use mmu_gather")
e346e6688c4a ("mm: thp: skip make PMD PROT_NONE if THP migration is not supported")
f0953a1bbaca ("mm: fix typos in comments")
e2db1a9aa381 ("kasan, mm: optimize kmalloc poisoning")
928501344fc6 ("kasan, mm: don't save alloc stacks twice")
2b8305260fb3 ("kfence, kasan: make KFENCE compatible with KASAN")
0ce20dd84089 ("mm: add Kernel Electric-Fence infrastructure")
41139aa4c3a3 ("mm/filemap: add mapping_seek_hole_data")
a1ba9da8f0f9 ("mm/hugetlb.c: fix unnecessary address expansion of pmd sharing")
611806b4bf8d ("kasan: fix bug detection via ksize for HW_TAGS mode")
027b37b552f3 ("kasan: move _RET_IP_ to inline wrappers")
573a48092313 ("kasan: add match-all tag tests")
f00748bfa024 ("kasan: prefix global functions with kasan_")
dbf53f7597be ("mm/mprotect.c: optimize error detection in do_mprotect_pkey()")
96667f8a4382 ("mm: Close race in generic_access_phys")
97593cad003c ("kasan: sanitize objects when metadata doesn't fit")
1ef3133bd3b8 ("kasan: simplify assign_tag and set_tag calls")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 3a5a8d343e1cf96eb9971b17cbd4b832ab19b8e7 Mon Sep 17 00:00:00 2001
From: Ryan Roberts <ryan.roberts(a)arm.com>
Date: Wed, 1 May 2024 15:33:10 +0100
Subject: [PATCH] mm: fix race between __split_huge_pmd_locked() and GUP-fast
__split_huge_pmd_locked() can be called for a present THP, devmap or
(non-present) migration entry. It calls pmdp_invalidate() unconditionally
on the pmdp and only determines if it is present or not based on the
returned old pmd. This is a problem for the migration entry case because
pmd_mkinvalid(), called by pmdp_invalidate() must only be called for a
present pmd.
On arm64 at least, pmd_mkinvalid() will mark the pmd such that any future
call to pmd_present() will return true. And therefore any lockless
pgtable walker could see the migration entry pmd in this state and start
interpretting the fields as if it were present, leading to BadThings (TM).
GUP-fast appears to be one such lockless pgtable walker.
x86 does not suffer the above problem, but instead pmd_mkinvalid() will
corrupt the offset field of the swap entry within the swap pte. See link
below for discussion of that problem.
Fix all of this by only calling pmdp_invalidate() for a present pmd. And
for good measure let's add a warning to all implementations of
pmdp_invalidate[_ad](). I've manually reviewed all other
pmdp_invalidate[_ad]() call sites and believe all others to be conformant.
This is a theoretical bug found during code review. I don't have any test
case to trigger it in practice.
Link: https://lkml.kernel.org/r/20240501143310.1381675-1-ryan.roberts@arm.com
Link: https://lore.kernel.org/all/0dd7827a-6334-439a-8fd0-43c98e6af22b@arm.com/
Fixes: 84c3fc4e9c56 ("mm: thp: check pmd migration entry in common path")
Signed-off-by: Ryan Roberts <ryan.roberts(a)arm.com>
Reviewed-by: Zi Yan <ziy(a)nvidia.com>
Reviewed-by: Anshuman Khandual <anshuman.khandual(a)arm.com>
Acked-by: David Hildenbrand <david(a)redhat.com>
Cc: Andreas Larsson <andreas(a)gaisler.com>
Cc: Andy Lutomirski <luto(a)kernel.org>
Cc: Aneesh Kumar K.V <aneesh.kumar(a)kernel.org>
Cc: Borislav Petkov (AMD) <bp(a)alien8.de>
Cc: Catalin Marinas <catalin.marinas(a)arm.com>
Cc: Christian Borntraeger <borntraeger(a)linux.ibm.com>
Cc: Christophe Leroy <christophe.leroy(a)csgroup.eu>
Cc: Dave Hansen <dave.hansen(a)linux.intel.com>
Cc: "David S. Miller" <davem(a)davemloft.net>
Cc: Ingo Molnar <mingo(a)redhat.com>
Cc: Jonathan Corbet <corbet(a)lwn.net>
Cc: Mark Rutland <mark.rutland(a)arm.com>
Cc: Naveen N. Rao <naveen.n.rao(a)linux.ibm.com>
Cc: Nicholas Piggin <npiggin(a)gmail.com>
Cc: Peter Zijlstra <peterz(a)infradead.org>
Cc: Sven Schnelle <svens(a)linux.ibm.com>
Cc: Thomas Gleixner <tglx(a)linutronix.de>
Cc: Will Deacon <will(a)kernel.org>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
diff --git a/Documentation/mm/arch_pgtable_helpers.rst b/Documentation/mm/arch_pgtable_helpers.rst
index 2466d3363af7..ad50ca6f495e 100644
--- a/Documentation/mm/arch_pgtable_helpers.rst
+++ b/Documentation/mm/arch_pgtable_helpers.rst
@@ -140,7 +140,8 @@ PMD Page Table Helpers
+---------------------------+--------------------------------------------------+
| pmd_swp_clear_soft_dirty | Clears a soft dirty swapped PMD |
+---------------------------+--------------------------------------------------+
-| pmd_mkinvalid | Invalidates a mapped PMD [1] |
+| pmd_mkinvalid | Invalidates a present PMD; do not call for |
+| | non-present PMD [1] |
+---------------------------+--------------------------------------------------+
| pmd_set_huge | Creates a PMD huge mapping |
+---------------------------+--------------------------------------------------+
@@ -196,7 +197,8 @@ PUD Page Table Helpers
+---------------------------+--------------------------------------------------+
| pud_mkdevmap | Creates a ZONE_DEVICE mapped PUD |
+---------------------------+--------------------------------------------------+
-| pud_mkinvalid | Invalidates a mapped PUD [1] |
+| pud_mkinvalid | Invalidates a present PUD; do not call for |
+| | non-present PUD [1] |
+---------------------------+--------------------------------------------------+
| pud_set_huge | Creates a PUD huge mapping |
+---------------------------+--------------------------------------------------+
diff --git a/arch/powerpc/mm/book3s64/pgtable.c b/arch/powerpc/mm/book3s64/pgtable.c
index 83823db3488b..2975ea0841ba 100644
--- a/arch/powerpc/mm/book3s64/pgtable.c
+++ b/arch/powerpc/mm/book3s64/pgtable.c
@@ -170,6 +170,7 @@ pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
{
unsigned long old_pmd;
+ VM_WARN_ON_ONCE(!pmd_present(*pmdp));
old_pmd = pmd_hugepage_update(vma->vm_mm, address, pmdp, _PAGE_PRESENT, _PAGE_INVALID);
flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
return __pmd(old_pmd);
diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h
index 2cb2a2e7b34b..558902edbfec 100644
--- a/arch/s390/include/asm/pgtable.h
+++ b/arch/s390/include/asm/pgtable.h
@@ -1769,8 +1769,10 @@ static inline pmd_t pmdp_huge_clear_flush(struct vm_area_struct *vma,
static inline pmd_t pmdp_invalidate(struct vm_area_struct *vma,
unsigned long addr, pmd_t *pmdp)
{
- pmd_t pmd = __pmd(pmd_val(*pmdp) | _SEGMENT_ENTRY_INVALID);
+ pmd_t pmd;
+ VM_WARN_ON_ONCE(!pmd_present(*pmdp));
+ pmd = __pmd(pmd_val(*pmdp) | _SEGMENT_ENTRY_INVALID);
return pmdp_xchg_direct(vma->vm_mm, addr, pmdp, pmd);
}
diff --git a/arch/sparc/mm/tlb.c b/arch/sparc/mm/tlb.c
index 19642f7ffb52..8648a50afe88 100644
--- a/arch/sparc/mm/tlb.c
+++ b/arch/sparc/mm/tlb.c
@@ -249,6 +249,7 @@ pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
{
pmd_t old, entry;
+ VM_WARN_ON_ONCE(!pmd_present(*pmdp));
entry = __pmd(pmd_val(*pmdp) & ~_PAGE_VALID);
old = pmdp_establish(vma, address, pmdp, entry);
flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 94767c82fc0d..93e54ba91fbf 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -631,6 +631,8 @@ int pmdp_clear_flush_young(struct vm_area_struct *vma,
pmd_t pmdp_invalidate_ad(struct vm_area_struct *vma, unsigned long address,
pmd_t *pmdp)
{
+ VM_WARN_ON_ONCE(!pmd_present(*pmdp));
+
/*
* No flush is necessary. Once an invalid PTE is established, the PTE's
* access and dirty bits cannot be updated.
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 08e4f3343bcd..ccdcff73284a 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2430,32 +2430,11 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
return __split_huge_zero_page_pmd(vma, haddr, pmd);
}
- /*
- * Up to this point the pmd is present and huge and userland has the
- * whole access to the hugepage during the split (which happens in
- * place). If we overwrite the pmd with the not-huge version pointing
- * to the pte here (which of course we could if all CPUs were bug
- * free), userland could trigger a small page size TLB miss on the
- * small sized TLB while the hugepage TLB entry is still established in
- * the huge TLB. Some CPU doesn't like that.
- * See http://support.amd.com/TechDocs/41322_10h_Rev_Gd.pdf, Erratum
- * 383 on page 105. Intel should be safe but is also warns that it's
- * only safe if the permission and cache attributes of the two entries
- * loaded in the two TLB is identical (which should be the case here).
- * But it is generally safer to never allow small and huge TLB entries
- * for the same virtual address to be loaded simultaneously. So instead
- * of doing "pmd_populate(); flush_pmd_tlb_range();" we first mark the
- * current pmd notpresent (atomically because here the pmd_trans_huge
- * must remain set at all times on the pmd until the split is complete
- * for this pmd), then we flush the SMP TLB and finally we write the
- * non-huge version of the pmd entry with pmd_populate.
- */
- old_pmd = pmdp_invalidate(vma, haddr, pmd);
-
- pmd_migration = is_pmd_migration_entry(old_pmd);
+ pmd_migration = is_pmd_migration_entry(*pmd);
if (unlikely(pmd_migration)) {
swp_entry_t entry;
+ old_pmd = *pmd;
entry = pmd_to_swp_entry(old_pmd);
page = pfn_swap_entry_to_page(entry);
write = is_writable_migration_entry(entry);
@@ -2466,6 +2445,30 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
soft_dirty = pmd_swp_soft_dirty(old_pmd);
uffd_wp = pmd_swp_uffd_wp(old_pmd);
} else {
+ /*
+ * Up to this point the pmd is present and huge and userland has
+ * the whole access to the hugepage during the split (which
+ * happens in place). If we overwrite the pmd with the not-huge
+ * version pointing to the pte here (which of course we could if
+ * all CPUs were bug free), userland could trigger a small page
+ * size TLB miss on the small sized TLB while the hugepage TLB
+ * entry is still established in the huge TLB. Some CPU doesn't
+ * like that. See
+ * http://support.amd.com/TechDocs/41322_10h_Rev_Gd.pdf, Erratum
+ * 383 on page 105. Intel should be safe but is also warns that
+ * it's only safe if the permission and cache attributes of the
+ * two entries loaded in the two TLB is identical (which should
+ * be the case here). But it is generally safer to never allow
+ * small and huge TLB entries for the same virtual address to be
+ * loaded simultaneously. So instead of doing "pmd_populate();
+ * flush_pmd_tlb_range();" we first mark the current pmd
+ * notpresent (atomically because here the pmd_trans_huge must
+ * remain set at all times on the pmd until the split is
+ * complete for this pmd), then we flush the SMP TLB and finally
+ * we write the non-huge version of the pmd entry with
+ * pmd_populate.
+ */
+ old_pmd = pmdp_invalidate(vma, haddr, pmd);
page = pmd_page(old_pmd);
folio = page_folio(page);
if (pmd_dirty(old_pmd)) {
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index 4fcd959dcc4d..a78a4adf711a 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -198,6 +198,7 @@ pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
pmd_t *pmdp)
{
+ VM_WARN_ON_ONCE(!pmd_present(*pmdp));
pmd_t old = pmdp_establish(vma, address, pmdp, pmd_mkinvalid(*pmdp));
flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
return old;
@@ -208,6 +209,7 @@ pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
pmd_t pmdp_invalidate_ad(struct vm_area_struct *vma, unsigned long address,
pmd_t *pmdp)
{
+ VM_WARN_ON_ONCE(!pmd_present(*pmdp));
return pmdp_invalidate(vma, address, pmdp);
}
#endif
Avoid spurious link status logs that may ultimately be wrong; for example,
if the link is set to down with the cable plugged, then the cable is
unplugged and after this the link is set to up, the last new log that is
appearing is incorrectly telling that the link is up.
In order to avoid errors, show link status logs after link_reset
processing, and in order to avoid spurious as much as possible, only show
the link loss when some link status change is detected.
cc: stable(a)vger.kernel.org
Fixes: e2ca90c276e1 ("ax88179_178a: ASIX AX88179_178A USB 3.0/2.0 to gigabit ethernet adapter driver")
Signed-off-by: Jose Ignacio Tornos Martinez <jtornosm(a)redhat.com>
---
v3:
- Add net as target tree in the subject.
- Use constant string instead of variable because no other value is possible.
v2: https://lore.kernel.org/netdev/20240618115054.101577-1-jtornosm@redhat.com/
- Fix the nits
v1: https://lore.kernel.org/netdev/20240617103405.654567-1-jtornosm@redhat.com/
drivers/net/usb/ax88179_178a.c | 6 +++++-
1 file changed, 5 insertions(+), 1 deletion(-)
diff --git a/drivers/net/usb/ax88179_178a.c b/drivers/net/usb/ax88179_178a.c
index c2fb736f78b2..b034ef8a73ea 100644
--- a/drivers/net/usb/ax88179_178a.c
+++ b/drivers/net/usb/ax88179_178a.c
@@ -326,7 +326,8 @@ static void ax88179_status(struct usbnet *dev, struct urb *urb)
if (netif_carrier_ok(dev->net) != link) {
usbnet_link_change(dev, link, 1);
- netdev_info(dev->net, "ax88179 - Link status is: %d\n", link);
+ if (!link)
+ netdev_info(dev->net, "ax88179 - Link status is: 0\n");
}
}
@@ -1542,6 +1543,7 @@ static int ax88179_link_reset(struct usbnet *dev)
GMII_PHY_PHYSR, 2, &tmp16);
if (!(tmp16 & GMII_PHY_PHYSR_LINK)) {
+ netdev_info(dev->net, "ax88179 - Link status is: 0\n");
return 0;
} else if (GMII_PHY_PHYSR_GIGA == (tmp16 & GMII_PHY_PHYSR_SMASK)) {
mode |= AX_MEDIUM_GIGAMODE | AX_MEDIUM_EN_125MHZ;
@@ -1579,6 +1581,8 @@ static int ax88179_link_reset(struct usbnet *dev)
netif_carrier_on(dev->net);
+ netdev_info(dev->net, "ax88179 - Link status is: 1\n");
+
return 0;
}
--
2.45.1
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.15.y
git checkout FETCH_HEAD
git cherry-pick -x 3a5a8d343e1cf96eb9971b17cbd4b832ab19b8e7
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024061316-brook-imaging-a202@gregkh' --subject-prefix 'PATCH 5.15.y' HEAD^..
Possible dependencies:
3a5a8d343e1c ("mm: fix race between __split_huge_pmd_locked() and GUP-fast")
4f83145721f3 ("mm: avoid unnecessary flush on change_huge_pmd()")
c9fe66560bf2 ("mm/mprotect: do not flush when not required architecturally")
4a18419f71cd ("mm/mprotect: use mmu_gather")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 3a5a8d343e1cf96eb9971b17cbd4b832ab19b8e7 Mon Sep 17 00:00:00 2001
From: Ryan Roberts <ryan.roberts(a)arm.com>
Date: Wed, 1 May 2024 15:33:10 +0100
Subject: [PATCH] mm: fix race between __split_huge_pmd_locked() and GUP-fast
__split_huge_pmd_locked() can be called for a present THP, devmap or
(non-present) migration entry. It calls pmdp_invalidate() unconditionally
on the pmdp and only determines if it is present or not based on the
returned old pmd. This is a problem for the migration entry case because
pmd_mkinvalid(), called by pmdp_invalidate() must only be called for a
present pmd.
On arm64 at least, pmd_mkinvalid() will mark the pmd such that any future
call to pmd_present() will return true. And therefore any lockless
pgtable walker could see the migration entry pmd in this state and start
interpretting the fields as if it were present, leading to BadThings (TM).
GUP-fast appears to be one such lockless pgtable walker.
x86 does not suffer the above problem, but instead pmd_mkinvalid() will
corrupt the offset field of the swap entry within the swap pte. See link
below for discussion of that problem.
Fix all of this by only calling pmdp_invalidate() for a present pmd. And
for good measure let's add a warning to all implementations of
pmdp_invalidate[_ad](). I've manually reviewed all other
pmdp_invalidate[_ad]() call sites and believe all others to be conformant.
This is a theoretical bug found during code review. I don't have any test
case to trigger it in practice.
Link: https://lkml.kernel.org/r/20240501143310.1381675-1-ryan.roberts@arm.com
Link: https://lore.kernel.org/all/0dd7827a-6334-439a-8fd0-43c98e6af22b@arm.com/
Fixes: 84c3fc4e9c56 ("mm: thp: check pmd migration entry in common path")
Signed-off-by: Ryan Roberts <ryan.roberts(a)arm.com>
Reviewed-by: Zi Yan <ziy(a)nvidia.com>
Reviewed-by: Anshuman Khandual <anshuman.khandual(a)arm.com>
Acked-by: David Hildenbrand <david(a)redhat.com>
Cc: Andreas Larsson <andreas(a)gaisler.com>
Cc: Andy Lutomirski <luto(a)kernel.org>
Cc: Aneesh Kumar K.V <aneesh.kumar(a)kernel.org>
Cc: Borislav Petkov (AMD) <bp(a)alien8.de>
Cc: Catalin Marinas <catalin.marinas(a)arm.com>
Cc: Christian Borntraeger <borntraeger(a)linux.ibm.com>
Cc: Christophe Leroy <christophe.leroy(a)csgroup.eu>
Cc: Dave Hansen <dave.hansen(a)linux.intel.com>
Cc: "David S. Miller" <davem(a)davemloft.net>
Cc: Ingo Molnar <mingo(a)redhat.com>
Cc: Jonathan Corbet <corbet(a)lwn.net>
Cc: Mark Rutland <mark.rutland(a)arm.com>
Cc: Naveen N. Rao <naveen.n.rao(a)linux.ibm.com>
Cc: Nicholas Piggin <npiggin(a)gmail.com>
Cc: Peter Zijlstra <peterz(a)infradead.org>
Cc: Sven Schnelle <svens(a)linux.ibm.com>
Cc: Thomas Gleixner <tglx(a)linutronix.de>
Cc: Will Deacon <will(a)kernel.org>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
diff --git a/Documentation/mm/arch_pgtable_helpers.rst b/Documentation/mm/arch_pgtable_helpers.rst
index 2466d3363af7..ad50ca6f495e 100644
--- a/Documentation/mm/arch_pgtable_helpers.rst
+++ b/Documentation/mm/arch_pgtable_helpers.rst
@@ -140,7 +140,8 @@ PMD Page Table Helpers
+---------------------------+--------------------------------------------------+
| pmd_swp_clear_soft_dirty | Clears a soft dirty swapped PMD |
+---------------------------+--------------------------------------------------+
-| pmd_mkinvalid | Invalidates a mapped PMD [1] |
+| pmd_mkinvalid | Invalidates a present PMD; do not call for |
+| | non-present PMD [1] |
+---------------------------+--------------------------------------------------+
| pmd_set_huge | Creates a PMD huge mapping |
+---------------------------+--------------------------------------------------+
@@ -196,7 +197,8 @@ PUD Page Table Helpers
+---------------------------+--------------------------------------------------+
| pud_mkdevmap | Creates a ZONE_DEVICE mapped PUD |
+---------------------------+--------------------------------------------------+
-| pud_mkinvalid | Invalidates a mapped PUD [1] |
+| pud_mkinvalid | Invalidates a present PUD; do not call for |
+| | non-present PUD [1] |
+---------------------------+--------------------------------------------------+
| pud_set_huge | Creates a PUD huge mapping |
+---------------------------+--------------------------------------------------+
diff --git a/arch/powerpc/mm/book3s64/pgtable.c b/arch/powerpc/mm/book3s64/pgtable.c
index 83823db3488b..2975ea0841ba 100644
--- a/arch/powerpc/mm/book3s64/pgtable.c
+++ b/arch/powerpc/mm/book3s64/pgtable.c
@@ -170,6 +170,7 @@ pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
{
unsigned long old_pmd;
+ VM_WARN_ON_ONCE(!pmd_present(*pmdp));
old_pmd = pmd_hugepage_update(vma->vm_mm, address, pmdp, _PAGE_PRESENT, _PAGE_INVALID);
flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
return __pmd(old_pmd);
diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h
index 2cb2a2e7b34b..558902edbfec 100644
--- a/arch/s390/include/asm/pgtable.h
+++ b/arch/s390/include/asm/pgtable.h
@@ -1769,8 +1769,10 @@ static inline pmd_t pmdp_huge_clear_flush(struct vm_area_struct *vma,
static inline pmd_t pmdp_invalidate(struct vm_area_struct *vma,
unsigned long addr, pmd_t *pmdp)
{
- pmd_t pmd = __pmd(pmd_val(*pmdp) | _SEGMENT_ENTRY_INVALID);
+ pmd_t pmd;
+ VM_WARN_ON_ONCE(!pmd_present(*pmdp));
+ pmd = __pmd(pmd_val(*pmdp) | _SEGMENT_ENTRY_INVALID);
return pmdp_xchg_direct(vma->vm_mm, addr, pmdp, pmd);
}
diff --git a/arch/sparc/mm/tlb.c b/arch/sparc/mm/tlb.c
index 19642f7ffb52..8648a50afe88 100644
--- a/arch/sparc/mm/tlb.c
+++ b/arch/sparc/mm/tlb.c
@@ -249,6 +249,7 @@ pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
{
pmd_t old, entry;
+ VM_WARN_ON_ONCE(!pmd_present(*pmdp));
entry = __pmd(pmd_val(*pmdp) & ~_PAGE_VALID);
old = pmdp_establish(vma, address, pmdp, entry);
flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 94767c82fc0d..93e54ba91fbf 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -631,6 +631,8 @@ int pmdp_clear_flush_young(struct vm_area_struct *vma,
pmd_t pmdp_invalidate_ad(struct vm_area_struct *vma, unsigned long address,
pmd_t *pmdp)
{
+ VM_WARN_ON_ONCE(!pmd_present(*pmdp));
+
/*
* No flush is necessary. Once an invalid PTE is established, the PTE's
* access and dirty bits cannot be updated.
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 08e4f3343bcd..ccdcff73284a 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2430,32 +2430,11 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
return __split_huge_zero_page_pmd(vma, haddr, pmd);
}
- /*
- * Up to this point the pmd is present and huge and userland has the
- * whole access to the hugepage during the split (which happens in
- * place). If we overwrite the pmd with the not-huge version pointing
- * to the pte here (which of course we could if all CPUs were bug
- * free), userland could trigger a small page size TLB miss on the
- * small sized TLB while the hugepage TLB entry is still established in
- * the huge TLB. Some CPU doesn't like that.
- * See http://support.amd.com/TechDocs/41322_10h_Rev_Gd.pdf, Erratum
- * 383 on page 105. Intel should be safe but is also warns that it's
- * only safe if the permission and cache attributes of the two entries
- * loaded in the two TLB is identical (which should be the case here).
- * But it is generally safer to never allow small and huge TLB entries
- * for the same virtual address to be loaded simultaneously. So instead
- * of doing "pmd_populate(); flush_pmd_tlb_range();" we first mark the
- * current pmd notpresent (atomically because here the pmd_trans_huge
- * must remain set at all times on the pmd until the split is complete
- * for this pmd), then we flush the SMP TLB and finally we write the
- * non-huge version of the pmd entry with pmd_populate.
- */
- old_pmd = pmdp_invalidate(vma, haddr, pmd);
-
- pmd_migration = is_pmd_migration_entry(old_pmd);
+ pmd_migration = is_pmd_migration_entry(*pmd);
if (unlikely(pmd_migration)) {
swp_entry_t entry;
+ old_pmd = *pmd;
entry = pmd_to_swp_entry(old_pmd);
page = pfn_swap_entry_to_page(entry);
write = is_writable_migration_entry(entry);
@@ -2466,6 +2445,30 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
soft_dirty = pmd_swp_soft_dirty(old_pmd);
uffd_wp = pmd_swp_uffd_wp(old_pmd);
} else {
+ /*
+ * Up to this point the pmd is present and huge and userland has
+ * the whole access to the hugepage during the split (which
+ * happens in place). If we overwrite the pmd with the not-huge
+ * version pointing to the pte here (which of course we could if
+ * all CPUs were bug free), userland could trigger a small page
+ * size TLB miss on the small sized TLB while the hugepage TLB
+ * entry is still established in the huge TLB. Some CPU doesn't
+ * like that. See
+ * http://support.amd.com/TechDocs/41322_10h_Rev_Gd.pdf, Erratum
+ * 383 on page 105. Intel should be safe but is also warns that
+ * it's only safe if the permission and cache attributes of the
+ * two entries loaded in the two TLB is identical (which should
+ * be the case here). But it is generally safer to never allow
+ * small and huge TLB entries for the same virtual address to be
+ * loaded simultaneously. So instead of doing "pmd_populate();
+ * flush_pmd_tlb_range();" we first mark the current pmd
+ * notpresent (atomically because here the pmd_trans_huge must
+ * remain set at all times on the pmd until the split is
+ * complete for this pmd), then we flush the SMP TLB and finally
+ * we write the non-huge version of the pmd entry with
+ * pmd_populate.
+ */
+ old_pmd = pmdp_invalidate(vma, haddr, pmd);
page = pmd_page(old_pmd);
folio = page_folio(page);
if (pmd_dirty(old_pmd)) {
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index 4fcd959dcc4d..a78a4adf711a 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -198,6 +198,7 @@ pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
pmd_t *pmdp)
{
+ VM_WARN_ON_ONCE(!pmd_present(*pmdp));
pmd_t old = pmdp_establish(vma, address, pmdp, pmd_mkinvalid(*pmdp));
flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
return old;
@@ -208,6 +209,7 @@ pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
pmd_t pmdp_invalidate_ad(struct vm_area_struct *vma, unsigned long address,
pmd_t *pmdp)
{
+ VM_WARN_ON_ONCE(!pmd_present(*pmdp));
return pmdp_invalidate(vma, address, pmdp);
}
#endif
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.10.y
git checkout FETCH_HEAD
git cherry-pick -x 3a5a8d343e1cf96eb9971b17cbd4b832ab19b8e7
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024061317-promoter-record-bc91@gregkh' --subject-prefix 'PATCH 5.10.y' HEAD^..
Possible dependencies:
3a5a8d343e1c ("mm: fix race between __split_huge_pmd_locked() and GUP-fast")
4f83145721f3 ("mm: avoid unnecessary flush on change_huge_pmd()")
c9fe66560bf2 ("mm/mprotect: do not flush when not required architecturally")
4a18419f71cd ("mm/mprotect: use mmu_gather")
e346e6688c4a ("mm: thp: skip make PMD PROT_NONE if THP migration is not supported")
f0953a1bbaca ("mm: fix typos in comments")
e2db1a9aa381 ("kasan, mm: optimize kmalloc poisoning")
928501344fc6 ("kasan, mm: don't save alloc stacks twice")
2b8305260fb3 ("kfence, kasan: make KFENCE compatible with KASAN")
0ce20dd84089 ("mm: add Kernel Electric-Fence infrastructure")
41139aa4c3a3 ("mm/filemap: add mapping_seek_hole_data")
a1ba9da8f0f9 ("mm/hugetlb.c: fix unnecessary address expansion of pmd sharing")
611806b4bf8d ("kasan: fix bug detection via ksize for HW_TAGS mode")
027b37b552f3 ("kasan: move _RET_IP_ to inline wrappers")
573a48092313 ("kasan: add match-all tag tests")
f00748bfa024 ("kasan: prefix global functions with kasan_")
dbf53f7597be ("mm/mprotect.c: optimize error detection in do_mprotect_pkey()")
96667f8a4382 ("mm: Close race in generic_access_phys")
97593cad003c ("kasan: sanitize objects when metadata doesn't fit")
1ef3133bd3b8 ("kasan: simplify assign_tag and set_tag calls")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 3a5a8d343e1cf96eb9971b17cbd4b832ab19b8e7 Mon Sep 17 00:00:00 2001
From: Ryan Roberts <ryan.roberts(a)arm.com>
Date: Wed, 1 May 2024 15:33:10 +0100
Subject: [PATCH] mm: fix race between __split_huge_pmd_locked() and GUP-fast
__split_huge_pmd_locked() can be called for a present THP, devmap or
(non-present) migration entry. It calls pmdp_invalidate() unconditionally
on the pmdp and only determines if it is present or not based on the
returned old pmd. This is a problem for the migration entry case because
pmd_mkinvalid(), called by pmdp_invalidate() must only be called for a
present pmd.
On arm64 at least, pmd_mkinvalid() will mark the pmd such that any future
call to pmd_present() will return true. And therefore any lockless
pgtable walker could see the migration entry pmd in this state and start
interpretting the fields as if it were present, leading to BadThings (TM).
GUP-fast appears to be one such lockless pgtable walker.
x86 does not suffer the above problem, but instead pmd_mkinvalid() will
corrupt the offset field of the swap entry within the swap pte. See link
below for discussion of that problem.
Fix all of this by only calling pmdp_invalidate() for a present pmd. And
for good measure let's add a warning to all implementations of
pmdp_invalidate[_ad](). I've manually reviewed all other
pmdp_invalidate[_ad]() call sites and believe all others to be conformant.
This is a theoretical bug found during code review. I don't have any test
case to trigger it in practice.
Link: https://lkml.kernel.org/r/20240501143310.1381675-1-ryan.roberts@arm.com
Link: https://lore.kernel.org/all/0dd7827a-6334-439a-8fd0-43c98e6af22b@arm.com/
Fixes: 84c3fc4e9c56 ("mm: thp: check pmd migration entry in common path")
Signed-off-by: Ryan Roberts <ryan.roberts(a)arm.com>
Reviewed-by: Zi Yan <ziy(a)nvidia.com>
Reviewed-by: Anshuman Khandual <anshuman.khandual(a)arm.com>
Acked-by: David Hildenbrand <david(a)redhat.com>
Cc: Andreas Larsson <andreas(a)gaisler.com>
Cc: Andy Lutomirski <luto(a)kernel.org>
Cc: Aneesh Kumar K.V <aneesh.kumar(a)kernel.org>
Cc: Borislav Petkov (AMD) <bp(a)alien8.de>
Cc: Catalin Marinas <catalin.marinas(a)arm.com>
Cc: Christian Borntraeger <borntraeger(a)linux.ibm.com>
Cc: Christophe Leroy <christophe.leroy(a)csgroup.eu>
Cc: Dave Hansen <dave.hansen(a)linux.intel.com>
Cc: "David S. Miller" <davem(a)davemloft.net>
Cc: Ingo Molnar <mingo(a)redhat.com>
Cc: Jonathan Corbet <corbet(a)lwn.net>
Cc: Mark Rutland <mark.rutland(a)arm.com>
Cc: Naveen N. Rao <naveen.n.rao(a)linux.ibm.com>
Cc: Nicholas Piggin <npiggin(a)gmail.com>
Cc: Peter Zijlstra <peterz(a)infradead.org>
Cc: Sven Schnelle <svens(a)linux.ibm.com>
Cc: Thomas Gleixner <tglx(a)linutronix.de>
Cc: Will Deacon <will(a)kernel.org>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
diff --git a/Documentation/mm/arch_pgtable_helpers.rst b/Documentation/mm/arch_pgtable_helpers.rst
index 2466d3363af7..ad50ca6f495e 100644
--- a/Documentation/mm/arch_pgtable_helpers.rst
+++ b/Documentation/mm/arch_pgtable_helpers.rst
@@ -140,7 +140,8 @@ PMD Page Table Helpers
+---------------------------+--------------------------------------------------+
| pmd_swp_clear_soft_dirty | Clears a soft dirty swapped PMD |
+---------------------------+--------------------------------------------------+
-| pmd_mkinvalid | Invalidates a mapped PMD [1] |
+| pmd_mkinvalid | Invalidates a present PMD; do not call for |
+| | non-present PMD [1] |
+---------------------------+--------------------------------------------------+
| pmd_set_huge | Creates a PMD huge mapping |
+---------------------------+--------------------------------------------------+
@@ -196,7 +197,8 @@ PUD Page Table Helpers
+---------------------------+--------------------------------------------------+
| pud_mkdevmap | Creates a ZONE_DEVICE mapped PUD |
+---------------------------+--------------------------------------------------+
-| pud_mkinvalid | Invalidates a mapped PUD [1] |
+| pud_mkinvalid | Invalidates a present PUD; do not call for |
+| | non-present PUD [1] |
+---------------------------+--------------------------------------------------+
| pud_set_huge | Creates a PUD huge mapping |
+---------------------------+--------------------------------------------------+
diff --git a/arch/powerpc/mm/book3s64/pgtable.c b/arch/powerpc/mm/book3s64/pgtable.c
index 83823db3488b..2975ea0841ba 100644
--- a/arch/powerpc/mm/book3s64/pgtable.c
+++ b/arch/powerpc/mm/book3s64/pgtable.c
@@ -170,6 +170,7 @@ pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
{
unsigned long old_pmd;
+ VM_WARN_ON_ONCE(!pmd_present(*pmdp));
old_pmd = pmd_hugepage_update(vma->vm_mm, address, pmdp, _PAGE_PRESENT, _PAGE_INVALID);
flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
return __pmd(old_pmd);
diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h
index 2cb2a2e7b34b..558902edbfec 100644
--- a/arch/s390/include/asm/pgtable.h
+++ b/arch/s390/include/asm/pgtable.h
@@ -1769,8 +1769,10 @@ static inline pmd_t pmdp_huge_clear_flush(struct vm_area_struct *vma,
static inline pmd_t pmdp_invalidate(struct vm_area_struct *vma,
unsigned long addr, pmd_t *pmdp)
{
- pmd_t pmd = __pmd(pmd_val(*pmdp) | _SEGMENT_ENTRY_INVALID);
+ pmd_t pmd;
+ VM_WARN_ON_ONCE(!pmd_present(*pmdp));
+ pmd = __pmd(pmd_val(*pmdp) | _SEGMENT_ENTRY_INVALID);
return pmdp_xchg_direct(vma->vm_mm, addr, pmdp, pmd);
}
diff --git a/arch/sparc/mm/tlb.c b/arch/sparc/mm/tlb.c
index 19642f7ffb52..8648a50afe88 100644
--- a/arch/sparc/mm/tlb.c
+++ b/arch/sparc/mm/tlb.c
@@ -249,6 +249,7 @@ pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
{
pmd_t old, entry;
+ VM_WARN_ON_ONCE(!pmd_present(*pmdp));
entry = __pmd(pmd_val(*pmdp) & ~_PAGE_VALID);
old = pmdp_establish(vma, address, pmdp, entry);
flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 94767c82fc0d..93e54ba91fbf 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -631,6 +631,8 @@ int pmdp_clear_flush_young(struct vm_area_struct *vma,
pmd_t pmdp_invalidate_ad(struct vm_area_struct *vma, unsigned long address,
pmd_t *pmdp)
{
+ VM_WARN_ON_ONCE(!pmd_present(*pmdp));
+
/*
* No flush is necessary. Once an invalid PTE is established, the PTE's
* access and dirty bits cannot be updated.
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 08e4f3343bcd..ccdcff73284a 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2430,32 +2430,11 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
return __split_huge_zero_page_pmd(vma, haddr, pmd);
}
- /*
- * Up to this point the pmd is present and huge and userland has the
- * whole access to the hugepage during the split (which happens in
- * place). If we overwrite the pmd with the not-huge version pointing
- * to the pte here (which of course we could if all CPUs were bug
- * free), userland could trigger a small page size TLB miss on the
- * small sized TLB while the hugepage TLB entry is still established in
- * the huge TLB. Some CPU doesn't like that.
- * See http://support.amd.com/TechDocs/41322_10h_Rev_Gd.pdf, Erratum
- * 383 on page 105. Intel should be safe but is also warns that it's
- * only safe if the permission and cache attributes of the two entries
- * loaded in the two TLB is identical (which should be the case here).
- * But it is generally safer to never allow small and huge TLB entries
- * for the same virtual address to be loaded simultaneously. So instead
- * of doing "pmd_populate(); flush_pmd_tlb_range();" we first mark the
- * current pmd notpresent (atomically because here the pmd_trans_huge
- * must remain set at all times on the pmd until the split is complete
- * for this pmd), then we flush the SMP TLB and finally we write the
- * non-huge version of the pmd entry with pmd_populate.
- */
- old_pmd = pmdp_invalidate(vma, haddr, pmd);
-
- pmd_migration = is_pmd_migration_entry(old_pmd);
+ pmd_migration = is_pmd_migration_entry(*pmd);
if (unlikely(pmd_migration)) {
swp_entry_t entry;
+ old_pmd = *pmd;
entry = pmd_to_swp_entry(old_pmd);
page = pfn_swap_entry_to_page(entry);
write = is_writable_migration_entry(entry);
@@ -2466,6 +2445,30 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
soft_dirty = pmd_swp_soft_dirty(old_pmd);
uffd_wp = pmd_swp_uffd_wp(old_pmd);
} else {
+ /*
+ * Up to this point the pmd is present and huge and userland has
+ * the whole access to the hugepage during the split (which
+ * happens in place). If we overwrite the pmd with the not-huge
+ * version pointing to the pte here (which of course we could if
+ * all CPUs were bug free), userland could trigger a small page
+ * size TLB miss on the small sized TLB while the hugepage TLB
+ * entry is still established in the huge TLB. Some CPU doesn't
+ * like that. See
+ * http://support.amd.com/TechDocs/41322_10h_Rev_Gd.pdf, Erratum
+ * 383 on page 105. Intel should be safe but is also warns that
+ * it's only safe if the permission and cache attributes of the
+ * two entries loaded in the two TLB is identical (which should
+ * be the case here). But it is generally safer to never allow
+ * small and huge TLB entries for the same virtual address to be
+ * loaded simultaneously. So instead of doing "pmd_populate();
+ * flush_pmd_tlb_range();" we first mark the current pmd
+ * notpresent (atomically because here the pmd_trans_huge must
+ * remain set at all times on the pmd until the split is
+ * complete for this pmd), then we flush the SMP TLB and finally
+ * we write the non-huge version of the pmd entry with
+ * pmd_populate.
+ */
+ old_pmd = pmdp_invalidate(vma, haddr, pmd);
page = pmd_page(old_pmd);
folio = page_folio(page);
if (pmd_dirty(old_pmd)) {
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index 4fcd959dcc4d..a78a4adf711a 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -198,6 +198,7 @@ pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
pmd_t *pmdp)
{
+ VM_WARN_ON_ONCE(!pmd_present(*pmdp));
pmd_t old = pmdp_establish(vma, address, pmdp, pmd_mkinvalid(*pmdp));
flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
return old;
@@ -208,6 +209,7 @@ pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
pmd_t pmdp_invalidate_ad(struct vm_area_struct *vma, unsigned long address,
pmd_t *pmdp)
{
+ VM_WARN_ON_ONCE(!pmd_present(*pmdp));
return pmdp_invalidate(vma, address, pmdp);
}
#endif
This is the start of the stable review cycle for the 6.1.94 release.
There are 85 patches in this series, all will be posted as a response
to this one. If anyone has any issues with these being applied, please
let me know.
Responses should be made by Sat, 15 Jun 2024 11:31:50 +0000.
Anything received after that time might be too late.
The whole patch series can be found in one patch at:
https://www.kernel.org/pub/linux/kernel/v6.x/stable-review/patch-6.1.94-rc1…
or in the git tree and branch at:
git://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git linux-6.1.y
and the diffstat can be found below.
thanks,
greg k-h
-------------
Pseudo-Shortlog of commits:
Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
Linux 6.1.94-rc1
Enzo Matsumiya <ematsumiya(a)suse.de>
smb: client: fix deadlock in smb2_find_smb_tcon()
Puranjay Mohan <puranjay(a)kernel.org>
powerpc/bpf: enforce full ordering for ATOMIC operations with BPF_FETCH
Omar Sandoval <osandov(a)fb.com>
btrfs: fix crash on racing fsync and size-extending write into prealloc
Anna Schumaker <Anna.Schumaker(a)Netapp.com>
NFS: Fix READ_PLUS when server doesn't support OP_READ_PLUS
Sergey Shtylyov <s.shtylyov(a)omp.ru>
nfs: fix undefined behavior in nfs_block_bits()
Ilpo Järvinen <ilpo.jarvinen(a)linux.intel.com>
EDAC/igen6: Convert PCIBIOS_* return codes to errnos
Frank Li <Frank.Li(a)nxp.com>
i3c: master: svc: fix invalidate IBI type and miss call client IBI handler
Harald Freudenberger <freude(a)linux.ibm.com>
s390/cpacf: Make use of invalid opcode produce a link error
Harald Freudenberger <freude(a)linux.ibm.com>
s390/cpacf: Split and rework cpacf query functions
Harald Freudenberger <freude(a)linux.ibm.com>
s390/ap: Fix crash in AP internal function modify_bitmap()
Helge Deller <deller(a)kernel.org>
parisc: Define sigset_t in parisc uapi header
Helge Deller <deller(a)gmx.de>
parisc: Define HAVE_ARCH_HUGETLB_UNMAPPED_AREA
Baokun Li <libaokun1(a)huawei.com>
ext4: fix mb_cache_entry's e_refcnt leak in ext4_xattr_block_cache_find()
Baokun Li <libaokun1(a)huawei.com>
ext4: set type of ac_groups_linear_remaining to __u32 to avoid overflow
Mike Gilbert <floppym(a)gentoo.org>
sparc: move struct termio to asm/termios.h
Eric Dumazet <edumazet(a)google.com>
net: fix __dst_negative_advice() race
Daniel Thompson <daniel.thompson(a)linaro.org>
kdb: Use format-specifiers rather than memset() for padding in kdb_read()
Daniel Thompson <daniel.thompson(a)linaro.org>
kdb: Merge identical case statements in kdb_read()
Daniel Thompson <daniel.thompson(a)linaro.org>
kdb: Fix console handling when editing and tab-completing commands
Daniel Thompson <daniel.thompson(a)linaro.org>
kdb: Use format-strings rather than '\0' injection in kdb_read()
Daniel Thompson <daniel.thompson(a)linaro.org>
kdb: Fix buffer overflow during tab-complete
Judith Mendez <jm(a)ti.com>
watchdog: rti_wdt: Set min_hw_heartbeat_ms to accommodate a safety margin
Frank van der Linden <fvdl(a)google.com>
mm/hugetlb: pass correct order_per_bit to cma_declare_contiguous_nid
Frank van der Linden <fvdl(a)google.com>
mm/cma: drop incorrect alignment check in cma_init_reserved_mem
Sam Ravnborg <sam(a)ravnborg.org>
sparc64: Fix number of online CPUs
Alexander Shishkin <alexander.shishkin(a)linux.intel.com>
intel_th: pci: Add Meteor Lake-S CPU support
Dhananjay Ugwekar <Dhananjay.Ugwekar(a)amd.com>
cpufreq: amd-pstate: Fix the inconsistency in max frequency units
Alexander Potapenko <glider(a)google.com>
kmsan: do not wipe out origin when doing partial unpoisoning
Nikita Zhandarovich <n.zhandarovich(a)fintech.ru>
net/9p: fix uninit-value in p9_client_rpc()
xu xin <xu.xin16(a)zte.com.cn>
net/ipv6: Fix route deleting failure when metric equals 0
Martin K. Petersen <martin.petersen(a)oracle.com>
scsi: core: Handle devices which return an unusually large VPD page count
Ryan Roberts <ryan.roberts(a)arm.com>
mm: fix race between __split_huge_pmd_locked() and GUP-fast
Herbert Xu <herbert(a)gondor.apana.org.au>
crypto: qat - Fix ADF_DEV_RESET_SYNC memory leak
Vitaly Chikunov <vt(a)altlinux.org>
crypto: ecrdsa - Fix module auto-load on add_key
Stefan Berger <stefanb(a)linux.ibm.com>
crypto: ecdsa - Fix module auto-load on add-key
Marc Zyngier <maz(a)kernel.org>
KVM: arm64: AArch32: Fix spurious trapping of conditional instructions
Marc Zyngier <maz(a)kernel.org>
KVM: arm64: Allow AArch32 PSTATE.M to be restored as System mode
Marc Zyngier <maz(a)kernel.org>
KVM: arm64: Fix AArch32 register narrowing on userspace write
Mario Limonciello <mario.limonciello(a)amd.com>
drm/amd: Fix shutdown (again) on some SMU v13.0.4/11 platforms
Dominique Martinet <asmadeus(a)codewreck.org>
9p: add missing locking around taking dentry fid list
Li Ma <li.ma(a)amd.com>
drm/amdgpu/atomfirmware: add intergrated info v2.3 table
Cai Xinchen <caixinchen1(a)huawei.com>
fbdev: savage: Handle err return when savagefb_check_var failed
Hans de Goede <hdegoede(a)redhat.com>
mmc: sdhci-acpi: Add quirk to enable pull-up on the card-detect GPIO on Asus T100TA
Hans de Goede <hdegoede(a)redhat.com>
mmc: sdhci-acpi: Disable write protect detection on Toshiba WT10-A
Hans de Goede <hdegoede(a)redhat.com>
mmc: sdhci-acpi: Fix Lenovo Yoga Tablet 2 Pro 1380 sdcard slot not working
Hans de Goede <hdegoede(a)redhat.com>
mmc: sdhci-acpi: Sort DMI quirks alphabetically
Adrian Hunter <adrian.hunter(a)intel.com>
mmc: sdhci: Add support for "Tuning Error" interrupts
Hans de Goede <hdegoede(a)redhat.com>
mmc: core: Add mmc_gpiod_set_cd_config() function
Hans Verkuil <hverkuil-cisco(a)xs4all.nl>
media: v4l2-core: hold videodev_lock until dev reg, finishes
Nathan Chancellor <nathan(a)kernel.org>
media: mxl5xx: Move xpt structures off stack
Hans Verkuil <hverkuil-cisco(a)xs4all.nl>
media: mc: mark the media devnode as registered from the, start
Tomi Valkeinen <tomi.valkeinen(a)ideasonboard.com>
media: mc: Fix graph walk in media_pipeline_start
Yang Xiwen <forbidden405(a)outlook.com>
arm64: dts: hi3798cv200: fix the size of GICR
Bitterblue Smith <rtl8821cerfe2(a)gmail.com>
wifi: rtlwifi: rtl8192de: Fix endianness issue in RX path
Bitterblue Smith <rtl8821cerfe2(a)gmail.com>
wifi: rtlwifi: rtl8192de: Fix low speed with WPA3-SAE
Bitterblue Smith <rtl8821cerfe2(a)gmail.com>
wifi: rtlwifi: rtl8192de: Fix 5 GHz TX power
Bitterblue Smith <rtl8821cerfe2(a)gmail.com>
wifi: rtl8xxxu: Fix the TX power of RTL8192CU, RTL8723AU
Ping-Ke Shih <pkshih(a)realtek.com>
wifi: rtw89: pci: correct TX resource checking for PCI DMA channel of firmware command
Yu Kuai <yukuai3(a)huawei.com>
md/raid5: fix deadlock that raid5d() wait for itself to clear MD_SB_CHANGE_PENDING
Johan Hovold <johan+linaro(a)kernel.org>
arm64: dts: qcom: qcs404: fix bluetooth device address
Krzysztof Kozlowski <krzk(a)kernel.org>
arm64: tegra: Correct Tegra132 I2C alias
Christoffer Sandberg <cs(a)tuxedo.de>
ACPI: resource: Do IRQ override on TongFang GXxHRXx and GMxHGxx
Maulik Shah <quic_mkshah(a)quicinc.com>
soc: qcom: rpmh-rsc: Enhance check for VRM in-flight request
Konrad Dybcio <konrad.dybcio(a)linaro.org>
thermal/drivers/qcom/lmh: Check for SCM availability at probe
Sergey Shtylyov <s.shtylyov(a)omp.ru>
ata: pata_legacy: make legacy_exit() work again
Ping-Ke Shih <pkshih(a)realtek.com>
wifi: rtw89: correct aSIFSTime for 6GHz band
Matthew Mirvish <matthew(a)mm12.xyz>
bcache: fix variable length array abuse in btree_iter
Bob Zhou <bob.zhou(a)amd.com>
drm/amdgpu: add error handle to avoid out-of-bounds
Zheyu Ma <zheyuma97(a)gmail.com>
media: lgdt3306a: Add a check against null-pointer-def
Chao Yu <chao(a)kernel.org>
f2fs: fix to do sanity check on i_xattr_nid in sanity_check_inode()
Florian Fainelli <florian.fainelli(a)broadcom.com>
scripts/gdb: fix SB_* constants parsing
Daniel Borkmann <daniel(a)iogearbox.net>
vxlan: Fix regression when dropping packets due to invalid src addresses
Matthieu Baerts (NGI0) <matttbe(a)kernel.org>
mptcp: fix full TCP keep-alive support
Paolo Abeni <pabeni(a)redhat.com>
mptcp: cleanup SOL_TCP handling
Paolo Abeni <pabeni(a)redhat.com>
mptcp: avoid some duplicate code in socket option handling
Chaitanya Kumar Borah <chaitanya.kumar.borah(a)intel.com>
drm/i915/audio: Fix audio time stamp programming for DP
Ryusuke Konishi <konishi.ryusuke(a)gmail.com>
nilfs2: fix use-after-free of timer for log writer thread
Haorong Lu <ancientmodern4(a)gmail.com>
riscv: signal: handle syscall restart before get_signal
Marc Dionne <marc.dionne(a)auristor.com>
afs: Don't cross .backup mountpoint from backup volume
Jorge Ramirez-Ortiz <jorge(a)foundries.io>
mmc: core: Do not force a retune before RPMB switch
Liam R. Howlett <Liam.Howlett(a)oracle.com>
maple_tree: fix mas_empty_area_rev() null pointer dereference
Peng Zhang <zhangpeng.00(a)bytedance.com>
maple_tree: fix allocation in mas_sparse_area()
Dan Gora <dan.gora(a)gmail.com>
Bluetooth: btrtl: Add missing MODULE_FIRMWARE declarations
Shradha Gupta <shradhagupta(a)linux.microsoft.com>
drm: Check polling initialized before enabling in drm_helper_probe_single_connector_modes
Shradha Gupta <shradhagupta(a)linux.microsoft.com>
drm: Check output polling initialized before disabling
-------------
Diffstat:
Documentation/mm/arch_pgtable_helpers.rst | 6 +-
Makefile | 4 +-
arch/arm64/boot/dts/hisilicon/hi3798cv200.dtsi | 2 +-
arch/arm64/boot/dts/nvidia/tegra132-norrin.dts | 4 +-
arch/arm64/boot/dts/nvidia/tegra132.dtsi | 2 +-
arch/arm64/boot/dts/qcom/qcs404-evb.dtsi | 2 +-
arch/arm64/kvm/guest.c | 3 +-
arch/arm64/kvm/hyp/aarch32.c | 18 ++-
arch/parisc/include/asm/page.h | 1 +
arch/parisc/include/asm/signal.h | 12 --
arch/parisc/include/uapi/asm/signal.h | 10 ++
arch/powerpc/mm/book3s64/pgtable.c | 1 +
arch/powerpc/net/bpf_jit_comp32.c | 12 ++
arch/powerpc/net/bpf_jit_comp64.c | 12 ++
arch/riscv/kernel/signal.c | 95 +++++++-------
arch/s390/include/asm/cpacf.h | 109 +++++++++++++---
arch/s390/include/asm/pgtable.h | 4 +-
arch/sparc/include/asm/smp_64.h | 2 -
arch/sparc/include/uapi/asm/termbits.h | 10 --
arch/sparc/include/uapi/asm/termios.h | 9 ++
arch/sparc/kernel/prom_64.c | 4 +-
arch/sparc/kernel/setup_64.c | 1 -
arch/sparc/kernel/smp_64.c | 14 --
arch/sparc/mm/tlb.c | 1 +
arch/x86/mm/pgtable.c | 2 +
crypto/ecdsa.c | 3 +
crypto/ecrdsa.c | 1 +
drivers/acpi/resource.c | 12 ++
drivers/ata/pata_legacy.c | 8 +-
drivers/bluetooth/btrtl.c | 18 ++-
drivers/cpufreq/amd-pstate.c | 2 +-
drivers/crypto/qat/qat_common/adf_aer.c | 19 +--
drivers/edac/igen6_edac.c | 4 +-
drivers/gpu/drm/amd/amdgpu/amdgpu_atomfirmware.c | 15 +++
drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c | 3 +
drivers/gpu/drm/amd/include/atomfirmware.h | 43 ++++++
.../gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_4_ppt.c | 20 +--
drivers/gpu/drm/drm_modeset_helper.c | 19 ++-
drivers/gpu/drm/drm_probe_helper.c | 15 ++-
drivers/gpu/drm/i915/display/intel_audio.c | 116 ++---------------
drivers/hwtracing/intel_th/pci.c | 5 +
drivers/i3c/master/svc-i3c-master.c | 16 ++-
drivers/md/bcache/bset.c | 44 +++----
drivers/md/bcache/bset.h | 28 ++--
drivers/md/bcache/btree.c | 40 +++---
drivers/md/bcache/super.c | 5 +-
drivers/md/bcache/sysfs.c | 2 +-
drivers/md/bcache/writeback.c | 10 +-
drivers/md/raid5.c | 15 +--
drivers/media/dvb-frontends/lgdt3306a.c | 5 +
drivers/media/dvb-frontends/mxl5xx.c | 22 ++--
drivers/media/mc/mc-devnode.c | 5 +-
drivers/media/mc/mc-entity.c | 6 +
drivers/media/v4l2-core/v4l2-dev.c | 3 +
drivers/mmc/core/host.c | 3 +-
drivers/mmc/core/slot-gpio.c | 20 +++
drivers/mmc/host/sdhci-acpi.c | 61 ++++++++-
drivers/mmc/host/sdhci.c | 10 +-
drivers/mmc/host/sdhci.h | 3 +-
drivers/net/vxlan/vxlan_core.c | 4 -
.../net/wireless/realtek/rtl8xxxu/rtl8xxxu_core.c | 25 ++--
.../net/wireless/realtek/rtlwifi/rtl8192de/phy.c | 4 +-
.../net/wireless/realtek/rtlwifi/rtl8192de/trx.c | 21 ++-
.../net/wireless/realtek/rtlwifi/rtl8192de/trx.h | 79 +++--------
drivers/net/wireless/realtek/rtw89/mac80211.c | 2 +-
drivers/net/wireless/realtek/rtw89/pci.c | 3 +-
drivers/s390/crypto/ap_bus.c | 2 +-
drivers/scsi/scsi.c | 7 +
drivers/soc/qcom/cmd-db.c | 32 ++++-
drivers/soc/qcom/rpmh-rsc.c | 3 +-
drivers/thermal/qcom/lmh.c | 3 +
drivers/video/fbdev/savage/savagefb_driver.c | 5 +-
drivers/watchdog/rti_wdt.c | 34 +++--
fs/9p/vfs_dentry.c | 9 +-
fs/afs/mntpt.c | 5 +
fs/btrfs/tree-log.c | 17 ++-
fs/ext4/mballoc.h | 2 +-
fs/ext4/xattr.c | 4 +-
fs/f2fs/inode.c | 6 +
fs/nfs/internal.h | 4 +-
fs/nfs/nfs4proc.c | 2 +-
fs/nilfs2/segment.c | 25 +++-
fs/smb/client/smb2transport.c | 2 +-
include/linux/mmc/slot-gpio.h | 1 +
include/net/dst_ops.h | 2 +-
include/net/sock.h | 13 +-
include/soc/qcom/cmd-db.h | 10 +-
kernel/debug/kdb/kdb_io.c | 99 ++++++++------
lib/maple_tree.c | 55 ++++----
mm/cma.c | 4 -
mm/huge_memory.c | 49 +++----
mm/hugetlb.c | 6 +-
mm/kmsan/core.c | 15 ++-
mm/pgtable-generic.c | 2 +
net/9p/client.c | 2 +
net/ipv4/route.c | 22 ++--
net/ipv6/route.c | 34 ++---
net/mptcp/protocol.h | 3 +
net/mptcp/sockopt.c | 144 +++++++++++++++------
net/xfrm/xfrm_policy.c | 11 +-
scripts/gdb/linux/constants.py.in | 12 +-
101 files changed, 1030 insertions(+), 695 deletions(-)
Commit 90c2d2eb7ab5 ("MIPS: pci: lantiq: switch to using gpiod API") not
only switched to the gpiod API, but also inverted / changed the polarity
of the GPIO.
According to the PCI specification, the RST# pin is an active-low
signal. However, most of the device trees that have been widely used for
a long time (mainly in the openWrt project) define this GPIO as
active-high and the old driver code inverted the signal internally.
Apparently there are actually boards where the reset gpio must be
operated inverted. For this reason, we cannot use the GPIOD_OUT_LOW/HIGH
flag for initialization. Instead, we must explicitly set the gpio to
value 1 in order to take into account any "GPIO_ACTIVE_LOW" flag that
may have been set.
In order to remain compatible with all these existing device trees, we
should therefore keep the logic as it was before the commit.
Fixes: 90c2d2eb7ab5 ("MIPS: pci: lantiq: switch to using gpiod API")
Cc: stable(a)vger.kernel.org
Signed-off-by: Martin Schiller <ms(a)dev.tdt.de>
---
arch/mips/pci/pci-lantiq.c | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/arch/mips/pci/pci-lantiq.c b/arch/mips/pci/pci-lantiq.c
index 68a8cefed420..0844db34022e 100644
--- a/arch/mips/pci/pci-lantiq.c
+++ b/arch/mips/pci/pci-lantiq.c
@@ -124,14 +124,14 @@ static int ltq_pci_startup(struct platform_device *pdev)
clk_disable(clk_external);
/* setup reset gpio used by pci */
- reset_gpio = devm_gpiod_get_optional(&pdev->dev, "reset",
- GPIOD_OUT_LOW);
+ reset_gpio = devm_gpiod_get_optional(&pdev->dev, "reset", GPIOD_ASIS);
error = PTR_ERR_OR_ZERO(reset_gpio);
if (error) {
dev_err(&pdev->dev, "failed to request gpio: %d\n", error);
return error;
}
gpiod_set_consumer_name(reset_gpio, "pci_reset");
+ gpiod_direction_output(reset_gpio, 1);
/* enable auto-switching between PCI and EBU */
ltq_pci_w32(0xa, PCI_CR_CLK_CTRL);
@@ -194,10 +194,10 @@ static int ltq_pci_startup(struct platform_device *pdev)
/* toggle reset pin */
if (reset_gpio) {
- gpiod_set_value_cansleep(reset_gpio, 1);
+ gpiod_set_value_cansleep(reset_gpio, 0);
wmb();
mdelay(1);
- gpiod_set_value_cansleep(reset_gpio, 0);
+ gpiod_set_value_cansleep(reset_gpio, 1);
}
return 0;
}
--
2.39.2
In a TDX VM without paravisor, currently the default timer is the Hyper-V
timer, which depends on the slow VM Reference Counter MSR: the Hyper-V TSC
page is not enabled in such a VM because the VM uses Invariant TSC as a
better clocksource and it's challenging to mark the Hyper-V TSC page shared
in very early boot.
Lower the rating of the Hyper-V timer so the local APIC timer becomes the
the default timer in such a VM, and print a warning in case Invariant TSC
is unavailable in such a VM. This change should cause no perceivable
performance difference.
Cc: stable(a)vger.kernel.org # 6.6+
Reviewed-by: Roman Kisel <romank(a)linux.microsoft.com>
Signed-off-by: Dexuan Cui <decui(a)microsoft.com>
---
Changes in v2:
Improved the comments in ms_hyperv_init_platform() [Michael Kelley]
Added "print a warning in case Invariant TSC unavailable" in the changelog.
Added Roman's Reviewed-by.
arch/x86/kernel/cpu/mshyperv.c | 16 +++++++++++++++-
drivers/clocksource/hyperv_timer.c | 16 +++++++++++++++-
2 files changed, 30 insertions(+), 2 deletions(-)
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index e0fd57a8ba840..954b7cbfa2f02 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -449,9 +449,23 @@ static void __init ms_hyperv_init_platform(void)
ms_hyperv.hints &= ~HV_X64_APIC_ACCESS_RECOMMENDED;
if (!ms_hyperv.paravisor_present) {
- /* To be supported: more work is required. */
+ /*
+ * Mark the Hyper-V TSC page feature as disabled
+ * in a TDX VM without paravisor so that the
+ * Invariant TSC, which is a better clocksource
+ * anyway, is used instead.
+ */
ms_hyperv.features &= ~HV_MSR_REFERENCE_TSC_AVAILABLE;
+ /*
+ * The Invariant TSC is expected to be available
+ * in a TDX VM without paravisor, but if not,
+ * print a warning message. The slower Hyper-V MSR-based
+ * Ref Counter should end up being the clocksource.
+ */
+ if (!(ms_hyperv.features & HV_ACCESS_TSC_INVARIANT))
+ pr_warn("Hyper-V: Invariant TSC is unavailable\n");
+
/* HV_MSR_CRASH_CTL is unsupported. */
ms_hyperv.misc_features &= ~HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE;
diff --git a/drivers/clocksource/hyperv_timer.c b/drivers/clocksource/hyperv_timer.c
index b2a080647e413..99177835cadec 100644
--- a/drivers/clocksource/hyperv_timer.c
+++ b/drivers/clocksource/hyperv_timer.c
@@ -137,7 +137,21 @@ static int hv_stimer_init(unsigned int cpu)
ce->name = "Hyper-V clockevent";
ce->features = CLOCK_EVT_FEAT_ONESHOT;
ce->cpumask = cpumask_of(cpu);
- ce->rating = 1000;
+
+ /*
+ * Lower the rating of the Hyper-V timer in a TDX VM without paravisor,
+ * so the local APIC timer (lapic_clockevent) is the default timer in
+ * such a VM. The Hyper-V timer is not preferred in such a VM because
+ * it depends on the slow VM Reference Counter MSR (the Hyper-V TSC
+ * page is not enbled in such a VM because the VM uses Invariant TSC
+ * as a better clocksource and it's challenging to mark the Hyper-V
+ * TSC page shared in very early boot).
+ */
+ if (!ms_hyperv.paravisor_present && hv_isolation_type_tdx())
+ ce->rating = 90;
+ else
+ ce->rating = 1000;
+
ce->set_state_shutdown = hv_ce_shutdown;
ce->set_state_oneshot = hv_ce_set_oneshot;
ce->set_next_event = hv_ce_set_next_event;
--
2.25.1
Hi
Side note: This fix requires
4e7aaa6b82d6 ("netfilter: ipset: Fix race between namespace cleanup and gc in the list:set type"
in first place, as a dependency.
Thanks
On Sat, Jun 22, 2024 at 07:41:24PM -0400, Sasha Levin wrote:
> This is a note to let you know that I've just added the patch titled
>
> netfilter: ipset: Fix suspicious rcu_dereference_protected()
>
> to the 6.9-stable tree which can be found at:
> http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=sum…
>
> The filename of the patch is:
> netfilter-ipset-fix-suspicious-rcu_dereference_prote.patch
> and it can be found in the queue-6.9 subdirectory.
>
> If you, or anyone else, feels it should not be added to the stable tree,
> please let <stable(a)vger.kernel.org> know about it.
>
>
>
> commit 0226dfa53edc90463c1b0d50167da948c88025ef
> Author: Jozsef Kadlecsik <kadlec(a)netfilter.org>
> Date: Mon Jun 17 11:18:15 2024 +0200
>
> netfilter: ipset: Fix suspicious rcu_dereference_protected()
>
> [ Upstream commit 8ecd06277a7664f4ef018abae3abd3451d64e7a6 ]
>
> When destroying all sets, we are either in pernet exit phase or
> are executing a "destroy all sets command" from userspace. The latter
> was taken into account in ip_set_dereference() (nfnetlink mutex is held),
> but the former was not. The patch adds the required check to
> rcu_dereference_protected() in ip_set_dereference().
>
> Fixes: 4e7aaa6b82d6 ("netfilter: ipset: Fix race between namespace cleanup and gc in the list:set type")
> Reported-by: syzbot+b62c37cdd58103293a5a(a)syzkaller.appspotmail.com
> Reported-by: syzbot+cfbe1da5fdfc39efc293(a)syzkaller.appspotmail.com
> Reported-by: kernel test robot <oliver.sang(a)intel.com>
> Closes: https://lore.kernel.org/oe-lkp/202406141556.e0b6f17e-lkp@intel.com
> Signed-off-by: Jozsef Kadlecsik <kadlec(a)netfilter.org>
> Signed-off-by: Pablo Neira Ayuso <pablo(a)netfilter.org>
> Signed-off-by: Sasha Levin <sashal(a)kernel.org>
>
> diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c
> index c7ae4d9bf3d24..61431690cbd5f 100644
> --- a/net/netfilter/ipset/ip_set_core.c
> +++ b/net/netfilter/ipset/ip_set_core.c
> @@ -53,12 +53,13 @@ MODULE_DESCRIPTION("core IP set support");
> MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_IPSET);
>
> /* When the nfnl mutex or ip_set_ref_lock is held: */
> -#define ip_set_dereference(p) \
> - rcu_dereference_protected(p, \
> +#define ip_set_dereference(inst) \
> + rcu_dereference_protected((inst)->ip_set_list, \
> lockdep_nfnl_is_held(NFNL_SUBSYS_IPSET) || \
> - lockdep_is_held(&ip_set_ref_lock))
> + lockdep_is_held(&ip_set_ref_lock) || \
> + (inst)->is_deleted)
> #define ip_set(inst, id) \
> - ip_set_dereference((inst)->ip_set_list)[id]
> + ip_set_dereference(inst)[id]
> #define ip_set_ref_netlink(inst,id) \
> rcu_dereference_raw((inst)->ip_set_list)[id]
> #define ip_set_dereference_nfnl(p) \
> @@ -1133,7 +1134,7 @@ static int ip_set_create(struct sk_buff *skb, const struct nfnl_info *info,
> if (!list)
> goto cleanup;
> /* nfnl mutex is held, both lists are valid */
> - tmp = ip_set_dereference(inst->ip_set_list);
> + tmp = ip_set_dereference(inst);
> memcpy(list, tmp, sizeof(struct ip_set *) * inst->ip_set_max);
> rcu_assign_pointer(inst->ip_set_list, list);
> /* Make sure all current packets have passed through */
The following commit has been merged into the smp/urgent branch of tip:
Commit-ID: 932d8476399f622aa0767a4a0a9e78e5341dc0e1
Gitweb: https://git.kernel.org/tip/932d8476399f622aa0767a4a0a9e78e5341dc0e1
Author: Yuntao Wang <ytcoode(a)gmail.com>
AuthorDate: Wed, 15 May 2024 21:45:54 +08:00
Committer: Thomas Gleixner <tglx(a)linutronix.de>
CommitterDate: Mon, 17 Jun 2024 15:08:04 +02:00
cpu/hotplug: Fix dynstate assignment in __cpuhp_setup_state_cpuslocked()
Commit 4205e4786d0b ("cpu/hotplug: Provide dynamic range for prepare
stage") added a dynamic range for the prepare states, but did not handle
the assignment of the dynstate variable in __cpuhp_setup_state_cpuslocked().
This causes the corresponding startup callback not to be invoked when
calling __cpuhp_setup_state_cpuslocked() with the CPUHP_BP_PREPARE_DYN
parameter, even though it should be.
Currently, the users of __cpuhp_setup_state_cpuslocked(), for one reason or
another, have not triggered this bug.
Fixes: 4205e4786d0b ("cpu/hotplug: Provide dynamic range for prepare stage")
Signed-off-by: Yuntao Wang <ytcoode(a)gmail.com>
Signed-off-by: Thomas Gleixner <tglx(a)linutronix.de>
Cc: stable(a)vger.kernel.org
Link: https://lore.kernel.org/r/20240515134554.427071-1-ytcoode@gmail.com
---
kernel/cpu.c | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 563877d..74cfdb6 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -2446,7 +2446,7 @@ EXPORT_SYMBOL_GPL(__cpuhp_state_add_instance);
* The caller needs to hold cpus read locked while calling this function.
* Return:
* On success:
- * Positive state number if @state is CPUHP_AP_ONLINE_DYN;
+ * Positive state number if @state is CPUHP_AP_ONLINE_DYN or CPUHP_BP_PREPARE_DYN;
* 0 for all other states
* On failure: proper (negative) error code
*/
@@ -2469,7 +2469,7 @@ int __cpuhp_setup_state_cpuslocked(enum cpuhp_state state,
ret = cpuhp_store_callbacks(state, name, startup, teardown,
multi_instance);
- dynstate = state == CPUHP_AP_ONLINE_DYN;
+ dynstate = state == CPUHP_AP_ONLINE_DYN || state == CPUHP_BP_PREPARE_DYN;
if (ret > 0 && dynstate) {
state = ret;
ret = 0;
@@ -2500,8 +2500,8 @@ int __cpuhp_setup_state_cpuslocked(enum cpuhp_state state,
out:
mutex_unlock(&cpuhp_state_mutex);
/*
- * If the requested state is CPUHP_AP_ONLINE_DYN, return the
- * dynamically allocated state in case of success.
+ * If the requested state is CPUHP_AP_ONLINE_DYN or CPUHP_BP_PREPARE_DYN,
+ * return the dynamically allocated state in case of success.
*/
if (!ret && dynstate)
return state;
After the reworking of "Parallel CPU bringup", the cmdline "nosmp" and
"maxcpus=0" is broken. Because these parameters make setup_max_cpus be
zero, and setup_max_cpus is the "max_cpus" of bringup_nonboot_cpus().
In this case, "if (!--ncpus)" will not be true in cpuhp_bringup_mask(),
and the result is all the possible cpus are brought up.
We can fix it by changing "if (!--ncpus)" to "if (!ncpus--)". But to
make logic more clear and save some cpu cycles, it is better to check
"max_cpus" in bringup_nonboot_cpus(), return early if it is zero.
Cc: stable(a)vger.kernel.org
Fixes: 18415f33e2ac4ab382 ("cpu/hotplug: Allow "parallel" bringup up to CPUHP_BP_KICK_AP_STATE")
Fixes: 06c6796e0304234da6 ("cpu/hotplug: Fix off by one in cpuhp_bringup_mask()")
Signed-off-by: Huacai Chen <chenhuacai(a)loongson.cn>
---
kernel/cpu.c | 3 +++
1 file changed, 3 insertions(+)
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 563877d6c28b..200974a31de8 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -1859,6 +1859,9 @@ static inline bool cpuhp_bringup_cpus_parallel(unsigned int ncpus) { return fals
void __init bringup_nonboot_cpus(unsigned int max_cpus)
{
+ if (!max_cpus)
+ return;
+
/* Try parallel bringup optimization if enabled */
if (cpuhp_bringup_cpus_parallel(max_cpus))
return;
--
2.43.0
x86_of_pci_irq_enable() returns PCIBIOS_* code received from
pci_read_config_byte() directly and also -EINVAL which are not
compatible error types. x86_of_pci_irq_enable() is used as
(*pcibios_enable_irq) function which should not return PCIBIOS_* codes.
Convert the PCIBIOS_* return code from pci_read_config_byte() into
normal errno using pcibios_err_to_errno().
Fixes: 96e0a0797eba ("x86: dtb: Add support for PCI devices backed by dtb nodes")
Signed-off-by: Ilpo Järvinen <ilpo.jarvinen(a)linux.intel.com>
Cc: stable(a)vger.kernel.org
---
arch/x86/kernel/devicetree.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c
index 8e3c53b4d070..64280879c68c 100644
--- a/arch/x86/kernel/devicetree.c
+++ b/arch/x86/kernel/devicetree.c
@@ -83,7 +83,7 @@ static int x86_of_pci_irq_enable(struct pci_dev *dev)
ret = pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin);
if (ret)
- return ret;
+ return pcibios_err_to_errno(ret);
if (!pin)
return 0;
--
2.39.2
From: Arnd Bergmann <arnd(a)arndb.de>
Both of these architectures require u64 function arguments to be
passed in even/odd pairs of registers or stack slots, which in case of
sync_file_range would result in a seven-argument system call that is
not currently possible. The system call is therefore incompatible with
all existing binaries.
While it would be possible to implement support for seven arguments
like on mips, it seems better to use a six-argument version, either
with the normal argument order but misaligned as on most architectures
or with the reordered sync_file_range2() calling conventions as on
arm and powerpc.
Cc: stable(a)vger.kernel.org
Signed-off-by: Arnd Bergmann <arnd(a)arndb.de>
---
arch/csky/include/uapi/asm/unistd.h | 1 +
arch/hexagon/include/uapi/asm/unistd.h | 1 +
2 files changed, 2 insertions(+)
diff --git a/arch/csky/include/uapi/asm/unistd.h b/arch/csky/include/uapi/asm/unistd.h
index 7ff6a2466af1..e0594b6370a6 100644
--- a/arch/csky/include/uapi/asm/unistd.h
+++ b/arch/csky/include/uapi/asm/unistd.h
@@ -6,6 +6,7 @@
#define __ARCH_WANT_SYS_CLONE3
#define __ARCH_WANT_SET_GET_RLIMIT
#define __ARCH_WANT_TIME32_SYSCALLS
+#define __ARCH_WANT_SYNC_FILE_RANGE2
#include <asm-generic/unistd.h>
#define __NR_set_thread_area (__NR_arch_specific_syscall + 0)
diff --git a/arch/hexagon/include/uapi/asm/unistd.h b/arch/hexagon/include/uapi/asm/unistd.h
index 432c4db1b623..21ae22306b5d 100644
--- a/arch/hexagon/include/uapi/asm/unistd.h
+++ b/arch/hexagon/include/uapi/asm/unistd.h
@@ -36,5 +36,6 @@
#define __ARCH_WANT_SYS_VFORK
#define __ARCH_WANT_SYS_FORK
#define __ARCH_WANT_TIME32_SYSCALLS
+#define __ARCH_WANT_SYNC_FILE_RANGE2
#include <asm-generic/unistd.h>
--
2.39.2
The following commit has been merged into the irq/urgent branch of tip:
Commit-ID: 2d64eaeeeda5659d52da1af79d237269ba3c2d2c
Gitweb: https://git.kernel.org/tip/2d64eaeeeda5659d52da1af79d237269ba3c2d2c
Author: Huacai Chen <chenhuacai(a)loongson.cn>
AuthorDate: Sun, 23 Jun 2024 11:41:13 +08:00
Committer: Thomas Gleixner <tglx(a)linutronix.de>
CommitterDate: Sun, 23 Jun 2024 17:09:26 +02:00
irqchip/loongson-eiointc: Use early_cpu_to_node() instead of cpu_to_node()
Multi-bridge machines required that all eiointc controllers in the system
are initialized, otherwise the system does not boot.
The initialization happens on the boot CPU during early boot and relies on
cpu_to_node() for identifying the individual nodes.
That works when the number of possible CPUs is large enough, but with a
command line limit, e.g. "nr_cpus=$N" for kdump, but fails when the CPUs
of the secondary nodes are not covered.
During early ACPI enumeration all CPU to node mappings are recorded up to
CONFIG_NR_CPUS. These are accessible via early_cpu_to_node() even in the
case that "nr_cpus=N" truncates the number of possible CPUs and only
provides the possible CPUs via cpu_to_node() translation.
Change the node lookup in the driver to use early_cpu_to_node() so that
even with a limitation on the number of possible CPUs all eointc instances
are initialized.
This can't obviously cure the case where CONFIG_NR_CPUS is too small.
[ tglx: Massaged changelog ]
Fixes: 64cc451e45e1 ("irqchip/loongson-eiointc: Fix incorrect use of acpi_get_vec_parent")
Signed-off-by: Huacai Chen <chenhuacai(a)loongson.cn>
Signed-off-by: Thomas Gleixner <tglx(a)linutronix.de>
Cc: stable(a)vger.kernel.org
Link: https://lore.kernel.org/r/20240623034113.1808727-1-chenhuacai@loongson.cn
---
drivers/irqchip/irq-loongson-eiointc.c | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/drivers/irqchip/irq-loongson-eiointc.c b/drivers/irqchip/irq-loongson-eiointc.c
index c7ddebf..b1f2080 100644
--- a/drivers/irqchip/irq-loongson-eiointc.c
+++ b/drivers/irqchip/irq-loongson-eiointc.c
@@ -15,6 +15,7 @@
#include <linux/irqchip/chained_irq.h>
#include <linux/kernel.h>
#include <linux/syscore_ops.h>
+#include <asm/numa.h>
#define EIOINTC_REG_NODEMAP 0x14a0
#define EIOINTC_REG_IPMAP 0x14c0
@@ -339,7 +340,7 @@ static int __init pch_msi_parse_madt(union acpi_subtable_headers *header,
int node;
if (cpu_has_flatmode)
- node = cpu_to_node(eiointc_priv[nr_pics - 1]->node * CORES_PER_EIO_NODE);
+ node = early_cpu_to_node(eiointc_priv[nr_pics - 1]->node * CORES_PER_EIO_NODE);
else
node = eiointc_priv[nr_pics - 1]->node;
@@ -431,7 +432,7 @@ int __init eiointc_acpi_init(struct irq_domain *parent,
goto out_free_handle;
if (cpu_has_flatmode)
- node = cpu_to_node(acpi_eiointc->node * CORES_PER_EIO_NODE);
+ node = early_cpu_to_node(acpi_eiointc->node * CORES_PER_EIO_NODE);
else
node = acpi_eiointc->node;
acpi_set_vec_parent(node, priv->eiointc_domain, pch_group);
In the liointc hardware, there are different ISRs (i.e. Interrupt Status
Registers) for different cores. We always use core#0's ISR before but it
has no problem, this is because the interrupts are routed to core#0 by
default. If we change the routing (which can be done by changing firmware
configuration) then we will lose interrupts while CPU hotplugging, so we
should set correct ISRs for different cores.
Cc: <stable(a)vger.kernel.org>
Co-developed-by: Tianli Xiong <xiongtianli(a)loongson.cn>
Signed-off-by: Tianli Xiong <xiongtianli(a)loongson.cn>
Signed-off-by: Huacai Chen <chenhuacai(a)loongson.cn>
---
V2: Update commit messages.
drivers/irqchip/irq-loongson-liointc.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/drivers/irqchip/irq-loongson-liointc.c b/drivers/irqchip/irq-loongson-liointc.c
index e4b33aed1c97..7c4fe7ab4b83 100644
--- a/drivers/irqchip/irq-loongson-liointc.c
+++ b/drivers/irqchip/irq-loongson-liointc.c
@@ -28,7 +28,7 @@
#define LIOINTC_INTC_CHIP_START 0x20
-#define LIOINTC_REG_INTC_STATUS (LIOINTC_INTC_CHIP_START + 0x20)
+#define LIOINTC_REG_INTC_STATUS(core) (LIOINTC_INTC_CHIP_START + 0x20 + (core) * 8)
#define LIOINTC_REG_INTC_EN_STATUS (LIOINTC_INTC_CHIP_START + 0x04)
#define LIOINTC_REG_INTC_ENABLE (LIOINTC_INTC_CHIP_START + 0x08)
#define LIOINTC_REG_INTC_DISABLE (LIOINTC_INTC_CHIP_START + 0x0c)
@@ -217,7 +217,7 @@ static int liointc_init(phys_addr_t addr, unsigned long size, int revision,
goto out_free_priv;
for (i = 0; i < LIOINTC_NUM_CORES; i++)
- priv->core_isr[i] = base + LIOINTC_REG_INTC_STATUS;
+ priv->core_isr[i] = base + LIOINTC_REG_INTC_STATUS(i);
for (i = 0; i < LIOINTC_NUM_PARENT; i++)
priv->handler[i].parent_int_map = parent_int_map[i];
--
2.43.0
From: Yuntao Wang <yuntao.wang(a)linux.dev>
[ Upstream commit ed8c7fbdfe117abbef81f65428ba263118ef298a ]
The maximum possible return value of find_next_zero_bit(fdt->full_fds_bits,
maxbit, bitbit) is maxbit. This return value, multiplied by BITS_PER_LONG,
gives the value of bitbit, which can never be greater than maxfd, it can
only be equal to maxfd at most, so the following check 'if (bitbit > maxfd)'
will never be true.
Moreover, when bitbit equals maxfd, it indicates that there are no unused
fds, and the function can directly return.
Fix this check.
Signed-off-by: Yuntao Wang <yuntao.wang(a)linux.dev>
Link: https://lore.kernel.org/r/20240529160656.209352-1-yuntao.wang@linux.dev
Reviewed-by: Jan Kara <jack(a)suse.cz>
Signed-off-by: Christian Brauner <brauner(a)kernel.org>
Signed-off-by: Sasha Levin <sashal(a)kernel.org>
---
fs/file.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/fs/file.c b/fs/file.c
index 928ba7b8df1e9..f5ba0e6f1a4c6 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -462,12 +462,12 @@ struct files_struct init_files = {
static unsigned int find_next_fd(struct fdtable *fdt, unsigned int start)
{
- unsigned int maxfd = fdt->max_fds;
+ unsigned int maxfd = fdt->max_fds; /* always multiple of BITS_PER_LONG */
unsigned int maxbit = maxfd / BITS_PER_LONG;
unsigned int bitbit = start / BITS_PER_LONG;
bitbit = find_next_zero_bit(fdt->full_fds_bits, maxbit, bitbit) * BITS_PER_LONG;
- if (bitbit > maxfd)
+ if (bitbit >= maxfd)
return maxfd;
if (bitbit > start)
start = bitbit;
--
2.43.0
From: Yuntao Wang <yuntao.wang(a)linux.dev>
[ Upstream commit ed8c7fbdfe117abbef81f65428ba263118ef298a ]
The maximum possible return value of find_next_zero_bit(fdt->full_fds_bits,
maxbit, bitbit) is maxbit. This return value, multiplied by BITS_PER_LONG,
gives the value of bitbit, which can never be greater than maxfd, it can
only be equal to maxfd at most, so the following check 'if (bitbit > maxfd)'
will never be true.
Moreover, when bitbit equals maxfd, it indicates that there are no unused
fds, and the function can directly return.
Fix this check.
Signed-off-by: Yuntao Wang <yuntao.wang(a)linux.dev>
Link: https://lore.kernel.org/r/20240529160656.209352-1-yuntao.wang@linux.dev
Reviewed-by: Jan Kara <jack(a)suse.cz>
Signed-off-by: Christian Brauner <brauner(a)kernel.org>
Signed-off-by: Sasha Levin <sashal(a)kernel.org>
---
fs/file.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/fs/file.c b/fs/file.c
index e56059fa1b309..64892b7444191 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -462,12 +462,12 @@ struct files_struct init_files = {
static unsigned int find_next_fd(struct fdtable *fdt, unsigned int start)
{
- unsigned int maxfd = fdt->max_fds;
+ unsigned int maxfd = fdt->max_fds; /* always multiple of BITS_PER_LONG */
unsigned int maxbit = maxfd / BITS_PER_LONG;
unsigned int bitbit = start / BITS_PER_LONG;
bitbit = find_next_zero_bit(fdt->full_fds_bits, maxbit, bitbit) * BITS_PER_LONG;
- if (bitbit > maxfd)
+ if (bitbit >= maxfd)
return maxfd;
if (bitbit > start)
start = bitbit;
--
2.43.0
From: Yuntao Wang <yuntao.wang(a)linux.dev>
[ Upstream commit ed8c7fbdfe117abbef81f65428ba263118ef298a ]
The maximum possible return value of find_next_zero_bit(fdt->full_fds_bits,
maxbit, bitbit) is maxbit. This return value, multiplied by BITS_PER_LONG,
gives the value of bitbit, which can never be greater than maxfd, it can
only be equal to maxfd at most, so the following check 'if (bitbit > maxfd)'
will never be true.
Moreover, when bitbit equals maxfd, it indicates that there are no unused
fds, and the function can directly return.
Fix this check.
Signed-off-by: Yuntao Wang <yuntao.wang(a)linux.dev>
Link: https://lore.kernel.org/r/20240529160656.209352-1-yuntao.wang@linux.dev
Reviewed-by: Jan Kara <jack(a)suse.cz>
Signed-off-by: Christian Brauner <brauner(a)kernel.org>
Signed-off-by: Sasha Levin <sashal(a)kernel.org>
---
fs/file.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/fs/file.c b/fs/file.c
index fdb84a64724b7..913f7d897d2fc 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -494,12 +494,12 @@ struct files_struct init_files = {
static unsigned int find_next_fd(struct fdtable *fdt, unsigned int start)
{
- unsigned int maxfd = fdt->max_fds;
+ unsigned int maxfd = fdt->max_fds; /* always multiple of BITS_PER_LONG */
unsigned int maxbit = maxfd / BITS_PER_LONG;
unsigned int bitbit = start / BITS_PER_LONG;
bitbit = find_next_zero_bit(fdt->full_fds_bits, maxbit, bitbit) * BITS_PER_LONG;
- if (bitbit > maxfd)
+ if (bitbit >= maxfd)
return maxfd;
if (bitbit > start)
start = bitbit;
--
2.43.0
From: Yuntao Wang <yuntao.wang(a)linux.dev>
[ Upstream commit ed8c7fbdfe117abbef81f65428ba263118ef298a ]
The maximum possible return value of find_next_zero_bit(fdt->full_fds_bits,
maxbit, bitbit) is maxbit. This return value, multiplied by BITS_PER_LONG,
gives the value of bitbit, which can never be greater than maxfd, it can
only be equal to maxfd at most, so the following check 'if (bitbit > maxfd)'
will never be true.
Moreover, when bitbit equals maxfd, it indicates that there are no unused
fds, and the function can directly return.
Fix this check.
Signed-off-by: Yuntao Wang <yuntao.wang(a)linux.dev>
Link: https://lore.kernel.org/r/20240529160656.209352-1-yuntao.wang@linux.dev
Reviewed-by: Jan Kara <jack(a)suse.cz>
Signed-off-by: Christian Brauner <brauner(a)kernel.org>
Signed-off-by: Sasha Levin <sashal(a)kernel.org>
---
fs/file.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/fs/file.c b/fs/file.c
index 69a51d37b66d9..b46a4a725a0ef 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -481,12 +481,12 @@ struct files_struct init_files = {
static unsigned int find_next_fd(struct fdtable *fdt, unsigned int start)
{
- unsigned int maxfd = fdt->max_fds;
+ unsigned int maxfd = fdt->max_fds; /* always multiple of BITS_PER_LONG */
unsigned int maxbit = maxfd / BITS_PER_LONG;
unsigned int bitbit = start / BITS_PER_LONG;
bitbit = find_next_zero_bit(fdt->full_fds_bits, maxbit, bitbit) * BITS_PER_LONG;
- if (bitbit > maxfd)
+ if (bitbit >= maxfd)
return maxfd;
if (bitbit > start)
start = bitbit;
--
2.43.0