Event polling delay is set to 0 if there are any pending requests in
either rx or tx requests lists. Checking for pending requests does
not work well for "IN" transfers as the tty driver always queues
requests to the list and TRBs to the ring, preparing to receive data
from the host.
This causes unnecessary busylooping and cpu hogging.
Only set the event polling delay to 0 if there are pending tx "write"
transfers, or if it was less than 10ms since last active data transfer
in any direction.
Cc: Łukasz Bartosik <ukaszb(a)chromium.org>
Fixes: fb18e5bb9660 ("xhci: dbc: poll at different rate depending on data transfer activity")
Cc: stable(a)vger.kernel.org
Signed-off-by: Mathias Nyman <mathias.nyman(a)linux.intel.com>
---
drivers/usb/host/xhci-dbgcap.c | 19 ++++++++++++++++---
drivers/usb/host/xhci-dbgcap.h | 3 +++
2 files changed, 19 insertions(+), 3 deletions(-)
diff --git a/drivers/usb/host/xhci-dbgcap.c b/drivers/usb/host/xhci-dbgcap.c
index fd7895b24367..0d4ce5734165 100644
--- a/drivers/usb/host/xhci-dbgcap.c
+++ b/drivers/usb/host/xhci-dbgcap.c
@@ -823,6 +823,7 @@ static enum evtreturn xhci_dbc_do_handle_events(struct xhci_dbc *dbc)
{
dma_addr_t deq;
union xhci_trb *evt;
+ enum evtreturn ret = EVT_DONE;
u32 ctrl, portsc;
bool update_erdp = false;
@@ -909,6 +910,7 @@ static enum evtreturn xhci_dbc_do_handle_events(struct xhci_dbc *dbc)
break;
case TRB_TYPE(TRB_TRANSFER):
dbc_handle_xfer_event(dbc, evt);
+ ret = EVT_XFER_DONE;
break;
default:
break;
@@ -927,7 +929,7 @@ static enum evtreturn xhci_dbc_do_handle_events(struct xhci_dbc *dbc)
lo_hi_writeq(deq, &dbc->regs->erdp);
}
- return EVT_DONE;
+ return ret;
}
static void xhci_dbc_handle_events(struct work_struct *work)
@@ -936,6 +938,7 @@ static void xhci_dbc_handle_events(struct work_struct *work)
struct xhci_dbc *dbc;
unsigned long flags;
unsigned int poll_interval;
+ unsigned long busypoll_timelimit;
dbc = container_of(to_delayed_work(work), struct xhci_dbc, event_work);
poll_interval = dbc->poll_interval;
@@ -954,11 +957,21 @@ static void xhci_dbc_handle_events(struct work_struct *work)
dbc->driver->disconnect(dbc);
break;
case EVT_DONE:
- /* set fast poll rate if there are pending data transfers */
+ /*
+ * Set fast poll rate if there are pending out transfers, or
+ * a transfer was recently processed
+ */
+ busypoll_timelimit = dbc->xfer_timestamp +
+ msecs_to_jiffies(DBC_XFER_INACTIVITY_TIMEOUT);
+
if (!list_empty(&dbc->eps[BULK_OUT].list_pending) ||
- !list_empty(&dbc->eps[BULK_IN].list_pending))
+ time_is_after_jiffies(busypoll_timelimit))
poll_interval = 0;
break;
+ case EVT_XFER_DONE:
+ dbc->xfer_timestamp = jiffies;
+ poll_interval = 0;
+ break;
default:
dev_info(dbc->dev, "stop handling dbc events\n");
return;
diff --git a/drivers/usb/host/xhci-dbgcap.h b/drivers/usb/host/xhci-dbgcap.h
index 9dc8f4d8077c..47ac72c2286d 100644
--- a/drivers/usb/host/xhci-dbgcap.h
+++ b/drivers/usb/host/xhci-dbgcap.h
@@ -96,6 +96,7 @@ struct dbc_ep {
#define DBC_WRITE_BUF_SIZE 8192
#define DBC_POLL_INTERVAL_DEFAULT 64 /* milliseconds */
#define DBC_POLL_INTERVAL_MAX 5000 /* milliseconds */
+#define DBC_XFER_INACTIVITY_TIMEOUT 10 /* milliseconds */
/*
* Private structure for DbC hardware state:
*/
@@ -142,6 +143,7 @@ struct xhci_dbc {
enum dbc_state state;
struct delayed_work event_work;
unsigned int poll_interval; /* ms */
+ unsigned long xfer_timestamp;
unsigned resume_required:1;
struct dbc_ep eps[2];
@@ -187,6 +189,7 @@ struct dbc_request {
enum evtreturn {
EVT_ERR = -1,
EVT_DONE,
+ EVT_XFER_DONE,
EVT_GSER,
EVT_DISC,
};
--
2.43.0
Generally PASID support requires ACS settings that usually create
single device groups, but there are some niche cases where we can get
multi-device groups and still have working PASID support. The primary
issue is that PCI switches are not required to treat PASID tagged TLPs
specially so appropriate ACS settings are required to route all TLPs to
the host bridge if PASID is going to work properly.
pci_enable_pasid() does check that each device that will use PASID has
the proper ACS settings to achieve this routing.
However, no-PASID devices can be combined with PASID capable devices
within the same topology using non-uniform ACS settings. In this case
the no-PASID devices may not have strict route to host ACS flags and
end up being grouped with the PASID devices.
This configuration fails to allow use of the PASID within the iommu
core code which wrongly checks if the no-PASID device supports PASID.
Fix this by ignoring no-PASID devices during the PASID validation. They
will never issue a PASID TLP anyhow so they can be ignored.
Fixes: c404f55c26fc ("iommu: Validate the PASID in iommu_attach_device_pasid()")
Cc: stable(a)vger.kernel.org
Signed-off-by: Tushar Dave <tdave(a)nvidia.com>
---
changes in v2:
- added no-pasid check in __iommu_set_group_pasid and __iommu_remove_group_pasid
drivers/iommu/iommu.c | 22 ++++++++++++++++------
1 file changed, 16 insertions(+), 6 deletions(-)
diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index 60aed01e54f2..8251b07f4022 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -3329,8 +3329,9 @@ static int __iommu_set_group_pasid(struct iommu_domain *domain,
int ret;
for_each_group_device(group, device) {
- ret = domain->ops->set_dev_pasid(domain, device->dev,
- pasid, NULL);
+ if (device->dev->iommu->max_pasids > 0)
+ ret = domain->ops->set_dev_pasid(domain, device->dev,
+ pasid, NULL);
if (ret)
goto err_revert;
}
@@ -3342,7 +3343,8 @@ static int __iommu_set_group_pasid(struct iommu_domain *domain,
for_each_group_device(group, device) {
if (device == last_gdev)
break;
- iommu_remove_dev_pasid(device->dev, pasid, domain);
+ if (device->dev->iommu->max_pasids > 0)
+ iommu_remove_dev_pasid(device->dev, pasid, domain);
}
return ret;
}
@@ -3353,8 +3355,10 @@ static void __iommu_remove_group_pasid(struct iommu_group *group,
{
struct group_device *device;
- for_each_group_device(group, device)
- iommu_remove_dev_pasid(device->dev, pasid, domain);
+ for_each_group_device(group, device) {
+ if (device->dev->iommu->max_pasids > 0)
+ iommu_remove_dev_pasid(device->dev, pasid, domain);
+ }
}
/*
@@ -3391,7 +3395,13 @@ int iommu_attach_device_pasid(struct iommu_domain *domain,
mutex_lock(&group->mutex);
for_each_group_device(group, device) {
- if (pasid >= device->dev->iommu->max_pasids) {
+ /*
+ * Skip PASID validation for devices without PASID support
+ * (max_pasids = 0). These devices cannot issue transactions
+ * with PASID, so they don't affect group's PASID usage.
+ */
+ if ((device->dev->iommu->max_pasids > 0) &&
+ (pasid >= device->dev->iommu->max_pasids)) {
ret = -EINVAL;
goto out_unlock;
}
--
2.34.1
Currently the difference is computed on 32-bit unsigned values although
eventually it is stored in a variable of int64_t type. This gives awkward
results, e.g. when the diff _should_ be negative, it is represented as
some large positive int64_t value.
Perform the calculations directly in int64_t as all other diff_two_keys
routines actually do.
Found by Linux Verification Center (linuxtesting.org) with Svace static
analysis tool.
Fixes: 08438b1e386b ("xfs: plumb in needed functions for range querying of the freespace btrees")
Cc: stable(a)vger.kernel.org
Signed-off-by: Fedor Pchelkin <pchelkin(a)ispras.ru>
---
fs/xfs/libxfs/xfs_alloc_btree.c | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c
index a4ac37ba5d51..b3c54ae90e25 100644
--- a/fs/xfs/libxfs/xfs_alloc_btree.c
+++ b/fs/xfs/libxfs/xfs_alloc_btree.c
@@ -238,13 +238,13 @@ xfs_cntbt_diff_two_keys(
ASSERT(!mask || (mask->alloc.ar_blockcount &&
mask->alloc.ar_startblock));
- diff = be32_to_cpu(k1->alloc.ar_blockcount) -
- be32_to_cpu(k2->alloc.ar_blockcount);
+ diff = (int64_t)be32_to_cpu(k1->alloc.ar_blockcount) -
+ be32_to_cpu(k2->alloc.ar_blockcount);
if (diff)
return diff;
- return be32_to_cpu(k1->alloc.ar_startblock) -
- be32_to_cpu(k2->alloc.ar_startblock);
+ return (int64_t)be32_to_cpu(k1->alloc.ar_startblock) -
+ be32_to_cpu(k2->alloc.ar_startblock);
}
static xfs_failaddr_t
--
2.49.0
These patchset adds support for VPU_JSM_STATUS_MVNCI_CONTEXT_VIOLATION_HW
message added in recent VPU firmware. Without it the driver will not be able to
process any jobs after this message is received and would need to be reloaded.
Most patches are as-is from upstream besides these two:
- Fix locking order in ivpu_job_submit
- Abort all jobs after command queue unregister
Both these patches need to be rebased because of missing new CMDQ UAPI changes
that should not be backported to stable.
Andrew Kreimer (1):
accel/ivpu: Fix a typo
Andrzej Kacprowski (1):
accel/ivpu: Update VPU FW API headers
Karol Wachowski (4):
accel/ivpu: Use xa_alloc_cyclic() instead of custom function
accel/ivpu: Abort all jobs after command queue unregister
accel/ivpu: Fix locking order in ivpu_job_submit
accel/ivpu: Add handling of VPU_JSM_STATUS_MVNCI_CONTEXT_VIOLATION_HW
Tomasz Rusinowicz (1):
accel/ivpu: Make DB_ID and JOB_ID allocations incremental
drivers/accel/ivpu/ivpu_drv.c | 38 ++--
drivers/accel/ivpu/ivpu_drv.h | 9 +
drivers/accel/ivpu/ivpu_job.c | 125 +++++++++---
drivers/accel/ivpu/ivpu_job.h | 1 +
drivers/accel/ivpu/ivpu_jsm_msg.c | 3 +-
drivers/accel/ivpu/ivpu_mmu.c | 3 +-
drivers/accel/ivpu/ivpu_sysfs.c | 5 +-
drivers/accel/ivpu/vpu_boot_api.h | 45 +++--
drivers/accel/ivpu/vpu_jsm_api.h | 303 +++++++++++++++++++++++++-----
9 files changed, 412 insertions(+), 120 deletions(-)
--
2.45.1
These patchset adds support for VPU_JSM_STATUS_MVNCI_CONTEXT_VIOLATION_HW
message added in recent VPU firmware. Without it the driver will not be able to
process any jobs after this message is received and would need to be reloaded.
The last patch in this series is as-is from upstream, but other two patches
had to be rebased because of missing new CMDQ UAPI changes that should not be
backported to stable.
Changes since v1:
- Documented deviations from the original upstream patches in commit messages
Karol Wachowski (3):
accel/ivpu: Abort all jobs after command queue unregister
accel/ivpu: Fix locking order in ivpu_job_submit
accel/ivpu: Add handling of VPU_JSM_STATUS_MVNCI_CONTEXT_VIOLATION_HW
drivers/accel/ivpu/ivpu_drv.c | 32 ++-------
drivers/accel/ivpu/ivpu_drv.h | 2 +
drivers/accel/ivpu/ivpu_job.c | 111 ++++++++++++++++++++++++++------
drivers/accel/ivpu/ivpu_job.h | 1 +
drivers/accel/ivpu/ivpu_mmu.c | 3 +-
drivers/accel/ivpu/ivpu_sysfs.c | 5 +-
6 files changed, 103 insertions(+), 51 deletions(-)
--
2.45.1
If the directory is corrupted and the number of nlinks is less than 2
(valid nlinks have at least 2), then when the directory is deleted, the
minix_rmdir will try to reduce the nlinks(unsigned int) to a negative
value.
Make nlinks validity check for directory in minix_lookup.
Found by Linux Verification Center (linuxtesting.org) with Syzkaller.
Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
Cc: stable(a)vger.kernel.org
Signed-off-by: Andrey Kriulin <kitotavrik.media(a)gmail.com>
---
fs/minix/namei.c | 7 ++++++-
1 file changed, 6 insertions(+), 1 deletion(-)
diff --git a/fs/minix/namei.c b/fs/minix/namei.c
index 8938536d8..5717a56fa 100644
--- a/fs/minix/namei.c
+++ b/fs/minix/namei.c
@@ -28,8 +28,13 @@ static struct dentry *minix_lookup(struct inode * dir, struct dentry *dentry, un
return ERR_PTR(-ENAMETOOLONG);
ino = minix_inode_by_name(dentry);
- if (ino)
+ if (ino) {
inode = minix_iget(dir->i_sb, ino);
+ if (S_ISDIR(inode->i_mode) && inode->i_nlink < 2) {
+ iput(inode);
+ return ERR_PTR(-EIO);
+ }
+ }
return d_splice_alias(inode, dentry);
}
--
2.47.2
If the directory is corrupted and the number of nlinks is less than 2
(valid nlinks have at least 2), then when the directory is deleted, the
minix_rmdir will try to reduce the nlinks(unsigned int) to a negative
value.
Make nlinks validity check for directory in minix_lookup.
Found by Linux Verification Center (linuxtesting.org) with Syzkaller.
Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
Cc: stable(a)vger.kernel.org
Signed-off-by: Andrey Kriulin <kitotavrik.media(a)gmail.com>
---
fs/minix/namei.c | 7 ++++++-
1 file changed, 6 insertions(+), 1 deletion(-)
diff --git a/fs/minix/namei.c b/fs/minix/namei.c
index 8938536d8..5717a56fa 100644
--- a/fs/minix/namei.c
+++ b/fs/minix/namei.c
@@ -28,8 +28,13 @@ static struct dentry *minix_lookup(struct inode * dir, struct dentry *dentry, un
return ERR_PTR(-ENAMETOOLONG);
ino = minix_inode_by_name(dentry);
- if (ino)
+ if (ino) {
inode = minix_iget(dir->i_sb, ino);
+ if (S_ISDIR(inode->i_mode) && inode->i_nlink < 2) {
+ iput(inode);
+ return ERR_PTR(-EIO);
+ }
+ }
return d_splice_alias(inode, dentry);
}
--
2.47.2
I'm announcing the release of the 6.12.27 kernel.
This fixes a build problem in the 6.12.26 release. If you do not have a
build issue with 6.12.26, there is no need to update.
The updated 6.12.y git tree can be found at:
git://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable.git linux-6.12.y
and can be browsed at the normal kernel.org git web browser:
https://git.kernel.org/?p=linux/kernel/git/stable/linux-stable.git;a=summary
thanks,
greg k-h
------------
Makefile | 2 +-
kernel/bpf/preload/bpf_preload_kern.c | 2 +-
2 files changed, 2 insertions(+), 2 deletions(-)
Greg Kroah-Hartman (1):
Linux 6.12.27
Xi Ruoyao (1):
bpf: Fix BPF_INTERNAL namespace import
I'm announcing the release of the 6.1.137 kernel.
This release fixes a problem building the Loongarch target in the
6.1.136 release. If you do not have build problems with the 6.1.136
release, there is no need for you to update to this release.
The updated 6.1.y git tree can be found at:
git://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable.git linux-6.1.y
and can be browsed at the normal kernel.org git web browser:
https://git.kernel.org/?p=linux/kernel/git/stable/linux-stable.git;a=summary
thanks,
greg k-h
------------
Makefile | 2 +-
arch/loongarch/mm/hugetlbpage.c | 2 +-
2 files changed, 2 insertions(+), 2 deletions(-)
Greg Kroah-Hartman (1):
Linux 6.1.137
Huacai Chen (1):
LoongArch: Fix build error due to backport
If a driver is removed, the driver framework invokes the driver's
remove callback. A CAN driver's remove function calls
unregister_candev(), which calls net_device_ops::ndo_stop further down
in the call stack for interfaces which are in the "up" state.
With the mcp251xfd driver the removal of the module causes the
following warning:
| WARNING: CPU: 0 PID: 352 at net/core/dev.c:7342 __netif_napi_del_locked+0xc8/0xd8
as can_rx_offload_del() deletes the NAPI, while it is still active,
because the interface is still up.
To fix the warning, first unregister the network interface, which
calls net_device_ops::ndo_stop, which disables the NAPI, and then call
can_rx_offload_del().
All other driver using the rx-offload helper have been checked and the
same issue has been found in the rockchip and m_can driver. These have
been fixed, but only compile time tested. On the mcp251xfd the fix was
tested on hardware.
Signed-off-by: Marc Kleine-Budde <mkl(a)pengutronix.de>
---
Marc Kleine-Budde (3):
can: mcp251xfd: mcp251xfd_remove(): fix order of unregistration calls
can: rockchip_canfd: m_can_class_unregister: fix order of unregistration calls
can: mcan: m_can_class_unregister: fix order of unregistration calls
drivers/net/can/m_can/m_can.c | 2 +-
drivers/net/can/rockchip/rockchip_canfd-core.c | 2 +-
drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c | 2 +-
3 files changed, 3 insertions(+), 3 deletions(-)
---
base-commit: ebd297a2affadb6f6f4d2e5d975c1eda18ac762d
change-id: 20250502-can-rx-offload-del-eb79379733dd
Best regards,
--
Marc Kleine-Budde <mkl(a)pengutronix.de>
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.10.y
git checkout FETCH_HEAD
git cherry-pick -x 32dce6b1949a696dc7abddc04de8cbe35c260217
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025050556-copier-vengeful-2f43@gregkh' --subject-prefix 'PATCH 5.10.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 32dce6b1949a696dc7abddc04de8cbe35c260217 Mon Sep 17 00:00:00 2001
From: Janne Grunau <j(a)jannau.net>
Date: Tue, 4 Mar 2025 20:12:14 +0100
Subject: [PATCH] drm: Select DRM_KMS_HELPER from
DRM_DEBUG_DP_MST_TOPOLOGY_REFS
Using "depends on" and "select" for the same Kconfig symbol is known to
cause circular dependencies (cmp. "Kconfig recursive dependency
limitations" in Documentation/kbuild/kconfig-language.rst.
DRM drivers are selecting drm helpers so do the same for
DRM_DEBUG_DP_MST_TOPOLOGY_REFS.
Fixes following circular dependency reported on x86 for the downstream
Asahi Linux tree:
error: recursive dependency detected!
symbol DRM_KMS_HELPER is selected by DRM_GEM_SHMEM_HELPER
symbol DRM_GEM_SHMEM_HELPER is selected by RUST_DRM_GEM_SHMEM_HELPER
symbol RUST_DRM_GEM_SHMEM_HELPER is selected by DRM_ASAHI
symbol DRM_ASAHI depends on RUST
symbol RUST depends on CALL_PADDING
symbol CALL_PADDING depends on OBJTOOL
symbol OBJTOOL is selected by STACK_VALIDATION
symbol STACK_VALIDATION depends on UNWINDER_FRAME_POINTER
symbol UNWINDER_FRAME_POINTER is part of choice block at arch/x86/Kconfig.debug:224
symbol <choice> unknown is visible depending on UNWINDER_GUESS
symbol UNWINDER_GUESS prompt is visible depending on STACKDEPOT
symbol STACKDEPOT is selected by DRM_DEBUG_DP_MST_TOPOLOGY_REFS
symbol DRM_DEBUG_DP_MST_TOPOLOGY_REFS depends on DRM_KMS_HELPER
Fixes: 12a280c72868 ("drm/dp_mst: Add topology ref history tracking for debugging")
Cc: stable(a)vger.kernel.org
Signed-off-by: Janne Grunau <j(a)jannau.net>
Acked-by: Thomas Zimmermann <tzimmermann(a)suse.de>
Link: https://lore.kernel.org/r/20250304-drm_debug_dp_mst_topo_kconfig-v1-1-e16fd…
Signed-off-by: Alyssa Rosenzweig <alyssa(a)rosenzweig.io>
diff --git a/drivers/gpu/drm/Kconfig b/drivers/gpu/drm/Kconfig
index 2cba2b6ebe1c..f01925ed8176 100644
--- a/drivers/gpu/drm/Kconfig
+++ b/drivers/gpu/drm/Kconfig
@@ -188,7 +188,7 @@ config DRM_DEBUG_DP_MST_TOPOLOGY_REFS
bool "Enable refcount backtrace history in the DP MST helpers"
depends on STACKTRACE_SUPPORT
select STACKDEPOT
- depends on DRM_KMS_HELPER
+ select DRM_KMS_HELPER
depends on DEBUG_KERNEL
depends on EXPERT
help
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.15.y
git checkout FETCH_HEAD
git cherry-pick -x 32dce6b1949a696dc7abddc04de8cbe35c260217
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025050556-precision-sandbank-dcf8@gregkh' --subject-prefix 'PATCH 5.15.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 32dce6b1949a696dc7abddc04de8cbe35c260217 Mon Sep 17 00:00:00 2001
From: Janne Grunau <j(a)jannau.net>
Date: Tue, 4 Mar 2025 20:12:14 +0100
Subject: [PATCH] drm: Select DRM_KMS_HELPER from
DRM_DEBUG_DP_MST_TOPOLOGY_REFS
Using "depends on" and "select" for the same Kconfig symbol is known to
cause circular dependencies (cmp. "Kconfig recursive dependency
limitations" in Documentation/kbuild/kconfig-language.rst.
DRM drivers are selecting drm helpers so do the same for
DRM_DEBUG_DP_MST_TOPOLOGY_REFS.
Fixes following circular dependency reported on x86 for the downstream
Asahi Linux tree:
error: recursive dependency detected!
symbol DRM_KMS_HELPER is selected by DRM_GEM_SHMEM_HELPER
symbol DRM_GEM_SHMEM_HELPER is selected by RUST_DRM_GEM_SHMEM_HELPER
symbol RUST_DRM_GEM_SHMEM_HELPER is selected by DRM_ASAHI
symbol DRM_ASAHI depends on RUST
symbol RUST depends on CALL_PADDING
symbol CALL_PADDING depends on OBJTOOL
symbol OBJTOOL is selected by STACK_VALIDATION
symbol STACK_VALIDATION depends on UNWINDER_FRAME_POINTER
symbol UNWINDER_FRAME_POINTER is part of choice block at arch/x86/Kconfig.debug:224
symbol <choice> unknown is visible depending on UNWINDER_GUESS
symbol UNWINDER_GUESS prompt is visible depending on STACKDEPOT
symbol STACKDEPOT is selected by DRM_DEBUG_DP_MST_TOPOLOGY_REFS
symbol DRM_DEBUG_DP_MST_TOPOLOGY_REFS depends on DRM_KMS_HELPER
Fixes: 12a280c72868 ("drm/dp_mst: Add topology ref history tracking for debugging")
Cc: stable(a)vger.kernel.org
Signed-off-by: Janne Grunau <j(a)jannau.net>
Acked-by: Thomas Zimmermann <tzimmermann(a)suse.de>
Link: https://lore.kernel.org/r/20250304-drm_debug_dp_mst_topo_kconfig-v1-1-e16fd…
Signed-off-by: Alyssa Rosenzweig <alyssa(a)rosenzweig.io>
diff --git a/drivers/gpu/drm/Kconfig b/drivers/gpu/drm/Kconfig
index 2cba2b6ebe1c..f01925ed8176 100644
--- a/drivers/gpu/drm/Kconfig
+++ b/drivers/gpu/drm/Kconfig
@@ -188,7 +188,7 @@ config DRM_DEBUG_DP_MST_TOPOLOGY_REFS
bool "Enable refcount backtrace history in the DP MST helpers"
depends on STACKTRACE_SUPPORT
select STACKDEPOT
- depends on DRM_KMS_HELPER
+ select DRM_KMS_HELPER
depends on DEBUG_KERNEL
depends on EXPERT
help
The patch below does not apply to the 6.1-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.1.y
git checkout FETCH_HEAD
git cherry-pick -x 32dce6b1949a696dc7abddc04de8cbe35c260217
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025050555-pulse-ended-2cea@gregkh' --subject-prefix 'PATCH 6.1.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 32dce6b1949a696dc7abddc04de8cbe35c260217 Mon Sep 17 00:00:00 2001
From: Janne Grunau <j(a)jannau.net>
Date: Tue, 4 Mar 2025 20:12:14 +0100
Subject: [PATCH] drm: Select DRM_KMS_HELPER from
DRM_DEBUG_DP_MST_TOPOLOGY_REFS
Using "depends on" and "select" for the same Kconfig symbol is known to
cause circular dependencies (cmp. "Kconfig recursive dependency
limitations" in Documentation/kbuild/kconfig-language.rst.
DRM drivers are selecting drm helpers so do the same for
DRM_DEBUG_DP_MST_TOPOLOGY_REFS.
Fixes following circular dependency reported on x86 for the downstream
Asahi Linux tree:
error: recursive dependency detected!
symbol DRM_KMS_HELPER is selected by DRM_GEM_SHMEM_HELPER
symbol DRM_GEM_SHMEM_HELPER is selected by RUST_DRM_GEM_SHMEM_HELPER
symbol RUST_DRM_GEM_SHMEM_HELPER is selected by DRM_ASAHI
symbol DRM_ASAHI depends on RUST
symbol RUST depends on CALL_PADDING
symbol CALL_PADDING depends on OBJTOOL
symbol OBJTOOL is selected by STACK_VALIDATION
symbol STACK_VALIDATION depends on UNWINDER_FRAME_POINTER
symbol UNWINDER_FRAME_POINTER is part of choice block at arch/x86/Kconfig.debug:224
symbol <choice> unknown is visible depending on UNWINDER_GUESS
symbol UNWINDER_GUESS prompt is visible depending on STACKDEPOT
symbol STACKDEPOT is selected by DRM_DEBUG_DP_MST_TOPOLOGY_REFS
symbol DRM_DEBUG_DP_MST_TOPOLOGY_REFS depends on DRM_KMS_HELPER
Fixes: 12a280c72868 ("drm/dp_mst: Add topology ref history tracking for debugging")
Cc: stable(a)vger.kernel.org
Signed-off-by: Janne Grunau <j(a)jannau.net>
Acked-by: Thomas Zimmermann <tzimmermann(a)suse.de>
Link: https://lore.kernel.org/r/20250304-drm_debug_dp_mst_topo_kconfig-v1-1-e16fd…
Signed-off-by: Alyssa Rosenzweig <alyssa(a)rosenzweig.io>
diff --git a/drivers/gpu/drm/Kconfig b/drivers/gpu/drm/Kconfig
index 2cba2b6ebe1c..f01925ed8176 100644
--- a/drivers/gpu/drm/Kconfig
+++ b/drivers/gpu/drm/Kconfig
@@ -188,7 +188,7 @@ config DRM_DEBUG_DP_MST_TOPOLOGY_REFS
bool "Enable refcount backtrace history in the DP MST helpers"
depends on STACKTRACE_SUPPORT
select STACKDEPOT
- depends on DRM_KMS_HELPER
+ select DRM_KMS_HELPER
depends on DEBUG_KERNEL
depends on EXPERT
help
The patch below does not apply to the 6.1-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.1.y
git checkout FETCH_HEAD
git cherry-pick -x 12f78021973ae422564b234136c702a305932d73
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025050545-renovator-scuba-7cbb@gregkh' --subject-prefix 'PATCH 6.1.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 12f78021973ae422564b234136c702a305932d73 Mon Sep 17 00:00:00 2001
From: Balbir Singh <balbirs(a)nvidia.com>
Date: Sat, 12 Apr 2025 10:23:54 +1000
Subject: [PATCH] iommu/arm-smmu-v3: Fix pgsize_bit for sva domains
UBSan caught a bug with IOMMU SVA domains, where the reported exponent
value in __arm_smmu_tlb_inv_range() was >= 64.
__arm_smmu_tlb_inv_range() uses the domain's pgsize_bitmap to compute
the number of pages to invalidate and the invalidation range. Currently
arm_smmu_sva_domain_alloc() does not setup the iommu domain's
pgsize_bitmap. This leads to __ffs() on the value returning 64 and that
leads to undefined behaviour w.r.t. shift operations
Fix this by initializing the iommu_domain's pgsize_bitmap to PAGE_SIZE.
Effectively the code needs to use the smallest page size for
invalidation
Cc: stable(a)vger.kernel.org
Fixes: eb6c97647be2 ("iommu/arm-smmu-v3: Avoid constructing invalid range commands")
Suggested-by: Jason Gunthorpe <jgg(a)nvidia.com>
Signed-off-by: Balbir Singh <balbirs(a)nvidia.com>
Cc: Jean-Philippe Brucker <jean-philippe(a)linaro.org>
Cc: Will Deacon <will(a)kernel.org>
Cc: Robin Murphy <robin.murphy(a)arm.com>
Cc: Joerg Roedel <joro(a)8bytes.org>
Cc: Jason Gunthorpe <jgg(a)ziepe.ca>
Reviewed-by: Jason Gunthorpe <jgg(a)nvidia.com>
Link: https://lore.kernel.org/r/20250412002354.3071449-1-balbirs@nvidia.com
Signed-off-by: Will Deacon <will(a)kernel.org>
diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c
index 9ba596430e7c..980cc6b33c43 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c
@@ -411,6 +411,12 @@ struct iommu_domain *arm_smmu_sva_domain_alloc(struct device *dev,
return ERR_CAST(smmu_domain);
smmu_domain->domain.type = IOMMU_DOMAIN_SVA;
smmu_domain->domain.ops = &arm_smmu_sva_domain_ops;
+
+ /*
+ * Choose page_size as the leaf page size for invalidation when
+ * ARM_SMMU_FEAT_RANGE_INV is present
+ */
+ smmu_domain->domain.pgsize_bitmap = PAGE_SIZE;
smmu_domain->smmu = smmu;
ret = xa_alloc(&arm_smmu_asid_xa, &asid, smmu_domain,
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.15.y
git checkout FETCH_HEAD
git cherry-pick -x 12f78021973ae422564b234136c702a305932d73
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025050545-next-pesticide-ba2f@gregkh' --subject-prefix 'PATCH 5.15.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 12f78021973ae422564b234136c702a305932d73 Mon Sep 17 00:00:00 2001
From: Balbir Singh <balbirs(a)nvidia.com>
Date: Sat, 12 Apr 2025 10:23:54 +1000
Subject: [PATCH] iommu/arm-smmu-v3: Fix pgsize_bit for sva domains
UBSan caught a bug with IOMMU SVA domains, where the reported exponent
value in __arm_smmu_tlb_inv_range() was >= 64.
__arm_smmu_tlb_inv_range() uses the domain's pgsize_bitmap to compute
the number of pages to invalidate and the invalidation range. Currently
arm_smmu_sva_domain_alloc() does not setup the iommu domain's
pgsize_bitmap. This leads to __ffs() on the value returning 64 and that
leads to undefined behaviour w.r.t. shift operations
Fix this by initializing the iommu_domain's pgsize_bitmap to PAGE_SIZE.
Effectively the code needs to use the smallest page size for
invalidation
Cc: stable(a)vger.kernel.org
Fixes: eb6c97647be2 ("iommu/arm-smmu-v3: Avoid constructing invalid range commands")
Suggested-by: Jason Gunthorpe <jgg(a)nvidia.com>
Signed-off-by: Balbir Singh <balbirs(a)nvidia.com>
Cc: Jean-Philippe Brucker <jean-philippe(a)linaro.org>
Cc: Will Deacon <will(a)kernel.org>
Cc: Robin Murphy <robin.murphy(a)arm.com>
Cc: Joerg Roedel <joro(a)8bytes.org>
Cc: Jason Gunthorpe <jgg(a)ziepe.ca>
Reviewed-by: Jason Gunthorpe <jgg(a)nvidia.com>
Link: https://lore.kernel.org/r/20250412002354.3071449-1-balbirs@nvidia.com
Signed-off-by: Will Deacon <will(a)kernel.org>
diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c
index 9ba596430e7c..980cc6b33c43 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c
@@ -411,6 +411,12 @@ struct iommu_domain *arm_smmu_sva_domain_alloc(struct device *dev,
return ERR_CAST(smmu_domain);
smmu_domain->domain.type = IOMMU_DOMAIN_SVA;
smmu_domain->domain.ops = &arm_smmu_sva_domain_ops;
+
+ /*
+ * Choose page_size as the leaf page size for invalidation when
+ * ARM_SMMU_FEAT_RANGE_INV is present
+ */
+ smmu_domain->domain.pgsize_bitmap = PAGE_SIZE;
smmu_domain->smmu = smmu;
ret = xa_alloc(&arm_smmu_asid_xa, &asid, smmu_domain,
The patch below does not apply to the 6.6-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.6.y
git checkout FETCH_HEAD
git cherry-pick -x 12f78021973ae422564b234136c702a305932d73
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025050539-scorpion-gents-a5e3@gregkh' --subject-prefix 'PATCH 6.6.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 12f78021973ae422564b234136c702a305932d73 Mon Sep 17 00:00:00 2001
From: Balbir Singh <balbirs(a)nvidia.com>
Date: Sat, 12 Apr 2025 10:23:54 +1000
Subject: [PATCH] iommu/arm-smmu-v3: Fix pgsize_bit for sva domains
UBSan caught a bug with IOMMU SVA domains, where the reported exponent
value in __arm_smmu_tlb_inv_range() was >= 64.
__arm_smmu_tlb_inv_range() uses the domain's pgsize_bitmap to compute
the number of pages to invalidate and the invalidation range. Currently
arm_smmu_sva_domain_alloc() does not setup the iommu domain's
pgsize_bitmap. This leads to __ffs() on the value returning 64 and that
leads to undefined behaviour w.r.t. shift operations
Fix this by initializing the iommu_domain's pgsize_bitmap to PAGE_SIZE.
Effectively the code needs to use the smallest page size for
invalidation
Cc: stable(a)vger.kernel.org
Fixes: eb6c97647be2 ("iommu/arm-smmu-v3: Avoid constructing invalid range commands")
Suggested-by: Jason Gunthorpe <jgg(a)nvidia.com>
Signed-off-by: Balbir Singh <balbirs(a)nvidia.com>
Cc: Jean-Philippe Brucker <jean-philippe(a)linaro.org>
Cc: Will Deacon <will(a)kernel.org>
Cc: Robin Murphy <robin.murphy(a)arm.com>
Cc: Joerg Roedel <joro(a)8bytes.org>
Cc: Jason Gunthorpe <jgg(a)ziepe.ca>
Reviewed-by: Jason Gunthorpe <jgg(a)nvidia.com>
Link: https://lore.kernel.org/r/20250412002354.3071449-1-balbirs@nvidia.com
Signed-off-by: Will Deacon <will(a)kernel.org>
diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c
index 9ba596430e7c..980cc6b33c43 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c
@@ -411,6 +411,12 @@ struct iommu_domain *arm_smmu_sva_domain_alloc(struct device *dev,
return ERR_CAST(smmu_domain);
smmu_domain->domain.type = IOMMU_DOMAIN_SVA;
smmu_domain->domain.ops = &arm_smmu_sva_domain_ops;
+
+ /*
+ * Choose page_size as the leaf page size for invalidation when
+ * ARM_SMMU_FEAT_RANGE_INV is present
+ */
+ smmu_domain->domain.pgsize_bitmap = PAGE_SIZE;
smmu_domain->smmu = smmu;
ret = xa_alloc(&arm_smmu_asid_xa, &asid, smmu_domain,
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.15.y
git checkout FETCH_HEAD
git cherry-pick -x b00d24997a11c10d3e420614f0873b83ce358a34
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025050523-phonics-bankable-dd5b@gregkh' --subject-prefix 'PATCH 5.15.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From b00d24997a11c10d3e420614f0873b83ce358a34 Mon Sep 17 00:00:00 2001
From: Nicolin Chen <nicolinc(a)nvidia.com>
Date: Tue, 15 Apr 2025 11:56:20 -0700
Subject: [PATCH] iommu/arm-smmu-v3: Fix iommu_device_probe bug due to
duplicated stream ids
ASPEED VGA card has two built-in devices:
0008:06:00.0 PCI bridge: ASPEED Technology, Inc. AST1150 PCI-to-PCI Bridge (rev 06)
0008:07:00.0 VGA compatible controller: ASPEED Technology, Inc. ASPEED Graphics Family (rev 52)
Its toplogy looks like this:
+-[0008:00]---00.0-[01-09]--+-00.0-[02-09]--+-00.0-[03]----00.0 Sandisk Corp Device 5017
| +-01.0-[04]--
| +-02.0-[05]----00.0 NVIDIA Corporation Device
| +-03.0-[06-07]----00.0-[07]----00.0 ASPEED Technology, Inc. ASPEED Graphics Family
| +-04.0-[08]----00.0 Renesas Technology Corp. uPD720201 USB 3.0 Host Controller
| \-05.0-[09]----00.0 Realtek Semiconductor Co., Ltd. RTL8111/8168/8411 PCI Express Gigabit Ethernet Controller
\-00.1 PMC-Sierra Inc. Device 4028
The IORT logic populaties two identical IDs into the fwspec->ids array via
DMA aliasing in iort_pci_iommu_init() called by pci_for_each_dma_alias().
Though the SMMU driver had been able to handle this situation since commit
563b5cbe334e ("iommu/arm-smmu-v3: Cope with duplicated Stream IDs"), that
got broken by the later commit cdf315f907d4 ("iommu/arm-smmu-v3: Maintain
a SID->device structure"), which ended up with allocating separate streams
with the same stuffing.
On a kernel prior to v6.15-rc1, there has been an overlooked warning:
pci 0008:07:00.0: vgaarb: setting as boot VGA device
pci 0008:07:00.0: vgaarb: bridge control possible
pci 0008:07:00.0: vgaarb: VGA device added: decodes=io+mem,owns=none,locks=none
pcieport 0008:06:00.0: Adding to iommu group 14
ast 0008:07:00.0: stream 67328 already in tree <===== WARNING
ast 0008:07:00.0: enabling device (0002 -> 0003)
ast 0008:07:00.0: Using default configuration
ast 0008:07:00.0: AST 2600 detected
ast 0008:07:00.0: [drm] Using analog VGA
ast 0008:07:00.0: [drm] dram MCLK=396 Mhz type=1 bus_width=16
[drm] Initialized ast 0.1.0 for 0008:07:00.0 on minor 0
ast 0008:07:00.0: [drm] fb0: astdrmfb frame buffer device
With v6.15-rc, since the commit bcb81ac6ae3c ("iommu: Get DT/ACPI parsing
into the proper probe path"), the error returned with the warning is moved
to the SMMU device probe flow:
arm_smmu_probe_device+0x15c/0x4c0
__iommu_probe_device+0x150/0x4f8
probe_iommu_group+0x44/0x80
bus_for_each_dev+0x7c/0x100
bus_iommu_probe+0x48/0x1a8
iommu_device_register+0xb8/0x178
arm_smmu_device_probe+0x1350/0x1db0
which then fails the entire SMMU driver probe:
pci 0008:06:00.0: Adding to iommu group 21
pci 0008:07:00.0: stream 67328 already in tree
arm-smmu-v3 arm-smmu-v3.9.auto: Failed to register iommu
arm-smmu-v3 arm-smmu-v3.9.auto: probe with driver arm-smmu-v3 failed with error -22
Since SMMU driver had been already expecting a potential duplicated Stream
ID in arm_smmu_install_ste_for_dev(), change the arm_smmu_insert_master()
routine to ignore a duplicated ID from the fwspec->sids array as well.
Note: this has been failing the iommu_device_probe() since 2021, although a
recent iommu commit in v6.15-rc1 that moves iommu_device_probe() started to
fail the SMMU driver probe. Since nobody has cared about DMA Alias support,
leave that as it was but fix the fundamental iommu_device_probe() breakage.
Fixes: cdf315f907d4 ("iommu/arm-smmu-v3: Maintain a SID->device structure")
Cc: stable(a)vger.kernel.org
Suggested-by: Jason Gunthorpe <jgg(a)nvidia.com>
Reviewed-by: Jason Gunthorpe <jgg(a)nvidia.com>
Signed-off-by: Nicolin Chen <nicolinc(a)nvidia.com>
Link: https://lore.kernel.org/r/20250415185620.504299-1-nicolinc@nvidia.com
Signed-off-by: Will Deacon <will(a)kernel.org>
diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
index 5467f85dd463..0826b6bdf327 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
@@ -3388,6 +3388,7 @@ static int arm_smmu_insert_master(struct arm_smmu_device *smmu,
mutex_lock(&smmu->streams_mutex);
for (i = 0; i < fwspec->num_ids; i++) {
struct arm_smmu_stream *new_stream = &master->streams[i];
+ struct rb_node *existing;
u32 sid = fwspec->ids[i];
new_stream->id = sid;
@@ -3398,10 +3399,20 @@ static int arm_smmu_insert_master(struct arm_smmu_device *smmu,
break;
/* Insert into SID tree */
- if (rb_find_add(&new_stream->node, &smmu->streams,
- arm_smmu_streams_cmp_node)) {
- dev_warn(master->dev, "stream %u already in tree\n",
- sid);
+ existing = rb_find_add(&new_stream->node, &smmu->streams,
+ arm_smmu_streams_cmp_node);
+ if (existing) {
+ struct arm_smmu_master *existing_master =
+ rb_entry(existing, struct arm_smmu_stream, node)
+ ->master;
+
+ /* Bridged PCI devices may end up with duplicated IDs */
+ if (existing_master == master)
+ continue;
+
+ dev_warn(master->dev,
+ "stream %u already in tree from dev %s\n", sid,
+ dev_name(existing_master->dev));
ret = -EINVAL;
break;
}
The patch below does not apply to the 6.1-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.1.y
git checkout FETCH_HEAD
git cherry-pick -x b00d24997a11c10d3e420614f0873b83ce358a34
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025050522-frown-chimp-96cf@gregkh' --subject-prefix 'PATCH 6.1.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From b00d24997a11c10d3e420614f0873b83ce358a34 Mon Sep 17 00:00:00 2001
From: Nicolin Chen <nicolinc(a)nvidia.com>
Date: Tue, 15 Apr 2025 11:56:20 -0700
Subject: [PATCH] iommu/arm-smmu-v3: Fix iommu_device_probe bug due to
duplicated stream ids
ASPEED VGA card has two built-in devices:
0008:06:00.0 PCI bridge: ASPEED Technology, Inc. AST1150 PCI-to-PCI Bridge (rev 06)
0008:07:00.0 VGA compatible controller: ASPEED Technology, Inc. ASPEED Graphics Family (rev 52)
Its toplogy looks like this:
+-[0008:00]---00.0-[01-09]--+-00.0-[02-09]--+-00.0-[03]----00.0 Sandisk Corp Device 5017
| +-01.0-[04]--
| +-02.0-[05]----00.0 NVIDIA Corporation Device
| +-03.0-[06-07]----00.0-[07]----00.0 ASPEED Technology, Inc. ASPEED Graphics Family
| +-04.0-[08]----00.0 Renesas Technology Corp. uPD720201 USB 3.0 Host Controller
| \-05.0-[09]----00.0 Realtek Semiconductor Co., Ltd. RTL8111/8168/8411 PCI Express Gigabit Ethernet Controller
\-00.1 PMC-Sierra Inc. Device 4028
The IORT logic populaties two identical IDs into the fwspec->ids array via
DMA aliasing in iort_pci_iommu_init() called by pci_for_each_dma_alias().
Though the SMMU driver had been able to handle this situation since commit
563b5cbe334e ("iommu/arm-smmu-v3: Cope with duplicated Stream IDs"), that
got broken by the later commit cdf315f907d4 ("iommu/arm-smmu-v3: Maintain
a SID->device structure"), which ended up with allocating separate streams
with the same stuffing.
On a kernel prior to v6.15-rc1, there has been an overlooked warning:
pci 0008:07:00.0: vgaarb: setting as boot VGA device
pci 0008:07:00.0: vgaarb: bridge control possible
pci 0008:07:00.0: vgaarb: VGA device added: decodes=io+mem,owns=none,locks=none
pcieport 0008:06:00.0: Adding to iommu group 14
ast 0008:07:00.0: stream 67328 already in tree <===== WARNING
ast 0008:07:00.0: enabling device (0002 -> 0003)
ast 0008:07:00.0: Using default configuration
ast 0008:07:00.0: AST 2600 detected
ast 0008:07:00.0: [drm] Using analog VGA
ast 0008:07:00.0: [drm] dram MCLK=396 Mhz type=1 bus_width=16
[drm] Initialized ast 0.1.0 for 0008:07:00.0 on minor 0
ast 0008:07:00.0: [drm] fb0: astdrmfb frame buffer device
With v6.15-rc, since the commit bcb81ac6ae3c ("iommu: Get DT/ACPI parsing
into the proper probe path"), the error returned with the warning is moved
to the SMMU device probe flow:
arm_smmu_probe_device+0x15c/0x4c0
__iommu_probe_device+0x150/0x4f8
probe_iommu_group+0x44/0x80
bus_for_each_dev+0x7c/0x100
bus_iommu_probe+0x48/0x1a8
iommu_device_register+0xb8/0x178
arm_smmu_device_probe+0x1350/0x1db0
which then fails the entire SMMU driver probe:
pci 0008:06:00.0: Adding to iommu group 21
pci 0008:07:00.0: stream 67328 already in tree
arm-smmu-v3 arm-smmu-v3.9.auto: Failed to register iommu
arm-smmu-v3 arm-smmu-v3.9.auto: probe with driver arm-smmu-v3 failed with error -22
Since SMMU driver had been already expecting a potential duplicated Stream
ID in arm_smmu_install_ste_for_dev(), change the arm_smmu_insert_master()
routine to ignore a duplicated ID from the fwspec->sids array as well.
Note: this has been failing the iommu_device_probe() since 2021, although a
recent iommu commit in v6.15-rc1 that moves iommu_device_probe() started to
fail the SMMU driver probe. Since nobody has cared about DMA Alias support,
leave that as it was but fix the fundamental iommu_device_probe() breakage.
Fixes: cdf315f907d4 ("iommu/arm-smmu-v3: Maintain a SID->device structure")
Cc: stable(a)vger.kernel.org
Suggested-by: Jason Gunthorpe <jgg(a)nvidia.com>
Reviewed-by: Jason Gunthorpe <jgg(a)nvidia.com>
Signed-off-by: Nicolin Chen <nicolinc(a)nvidia.com>
Link: https://lore.kernel.org/r/20250415185620.504299-1-nicolinc@nvidia.com
Signed-off-by: Will Deacon <will(a)kernel.org>
diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
index 5467f85dd463..0826b6bdf327 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
@@ -3388,6 +3388,7 @@ static int arm_smmu_insert_master(struct arm_smmu_device *smmu,
mutex_lock(&smmu->streams_mutex);
for (i = 0; i < fwspec->num_ids; i++) {
struct arm_smmu_stream *new_stream = &master->streams[i];
+ struct rb_node *existing;
u32 sid = fwspec->ids[i];
new_stream->id = sid;
@@ -3398,10 +3399,20 @@ static int arm_smmu_insert_master(struct arm_smmu_device *smmu,
break;
/* Insert into SID tree */
- if (rb_find_add(&new_stream->node, &smmu->streams,
- arm_smmu_streams_cmp_node)) {
- dev_warn(master->dev, "stream %u already in tree\n",
- sid);
+ existing = rb_find_add(&new_stream->node, &smmu->streams,
+ arm_smmu_streams_cmp_node);
+ if (existing) {
+ struct arm_smmu_master *existing_master =
+ rb_entry(existing, struct arm_smmu_stream, node)
+ ->master;
+
+ /* Bridged PCI devices may end up with duplicated IDs */
+ if (existing_master == master)
+ continue;
+
+ dev_warn(master->dev,
+ "stream %u already in tree from dev %s\n", sid,
+ dev_name(existing_master->dev));
ret = -EINVAL;
break;
}
The patch below does not apply to the 6.6-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.6.y
git checkout FETCH_HEAD
git cherry-pick -x b00d24997a11c10d3e420614f0873b83ce358a34
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025050520-chastity-tasty-6f1e@gregkh' --subject-prefix 'PATCH 6.6.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From b00d24997a11c10d3e420614f0873b83ce358a34 Mon Sep 17 00:00:00 2001
From: Nicolin Chen <nicolinc(a)nvidia.com>
Date: Tue, 15 Apr 2025 11:56:20 -0700
Subject: [PATCH] iommu/arm-smmu-v3: Fix iommu_device_probe bug due to
duplicated stream ids
ASPEED VGA card has two built-in devices:
0008:06:00.0 PCI bridge: ASPEED Technology, Inc. AST1150 PCI-to-PCI Bridge (rev 06)
0008:07:00.0 VGA compatible controller: ASPEED Technology, Inc. ASPEED Graphics Family (rev 52)
Its toplogy looks like this:
+-[0008:00]---00.0-[01-09]--+-00.0-[02-09]--+-00.0-[03]----00.0 Sandisk Corp Device 5017
| +-01.0-[04]--
| +-02.0-[05]----00.0 NVIDIA Corporation Device
| +-03.0-[06-07]----00.0-[07]----00.0 ASPEED Technology, Inc. ASPEED Graphics Family
| +-04.0-[08]----00.0 Renesas Technology Corp. uPD720201 USB 3.0 Host Controller
| \-05.0-[09]----00.0 Realtek Semiconductor Co., Ltd. RTL8111/8168/8411 PCI Express Gigabit Ethernet Controller
\-00.1 PMC-Sierra Inc. Device 4028
The IORT logic populaties two identical IDs into the fwspec->ids array via
DMA aliasing in iort_pci_iommu_init() called by pci_for_each_dma_alias().
Though the SMMU driver had been able to handle this situation since commit
563b5cbe334e ("iommu/arm-smmu-v3: Cope with duplicated Stream IDs"), that
got broken by the later commit cdf315f907d4 ("iommu/arm-smmu-v3: Maintain
a SID->device structure"), which ended up with allocating separate streams
with the same stuffing.
On a kernel prior to v6.15-rc1, there has been an overlooked warning:
pci 0008:07:00.0: vgaarb: setting as boot VGA device
pci 0008:07:00.0: vgaarb: bridge control possible
pci 0008:07:00.0: vgaarb: VGA device added: decodes=io+mem,owns=none,locks=none
pcieport 0008:06:00.0: Adding to iommu group 14
ast 0008:07:00.0: stream 67328 already in tree <===== WARNING
ast 0008:07:00.0: enabling device (0002 -> 0003)
ast 0008:07:00.0: Using default configuration
ast 0008:07:00.0: AST 2600 detected
ast 0008:07:00.0: [drm] Using analog VGA
ast 0008:07:00.0: [drm] dram MCLK=396 Mhz type=1 bus_width=16
[drm] Initialized ast 0.1.0 for 0008:07:00.0 on minor 0
ast 0008:07:00.0: [drm] fb0: astdrmfb frame buffer device
With v6.15-rc, since the commit bcb81ac6ae3c ("iommu: Get DT/ACPI parsing
into the proper probe path"), the error returned with the warning is moved
to the SMMU device probe flow:
arm_smmu_probe_device+0x15c/0x4c0
__iommu_probe_device+0x150/0x4f8
probe_iommu_group+0x44/0x80
bus_for_each_dev+0x7c/0x100
bus_iommu_probe+0x48/0x1a8
iommu_device_register+0xb8/0x178
arm_smmu_device_probe+0x1350/0x1db0
which then fails the entire SMMU driver probe:
pci 0008:06:00.0: Adding to iommu group 21
pci 0008:07:00.0: stream 67328 already in tree
arm-smmu-v3 arm-smmu-v3.9.auto: Failed to register iommu
arm-smmu-v3 arm-smmu-v3.9.auto: probe with driver arm-smmu-v3 failed with error -22
Since SMMU driver had been already expecting a potential duplicated Stream
ID in arm_smmu_install_ste_for_dev(), change the arm_smmu_insert_master()
routine to ignore a duplicated ID from the fwspec->sids array as well.
Note: this has been failing the iommu_device_probe() since 2021, although a
recent iommu commit in v6.15-rc1 that moves iommu_device_probe() started to
fail the SMMU driver probe. Since nobody has cared about DMA Alias support,
leave that as it was but fix the fundamental iommu_device_probe() breakage.
Fixes: cdf315f907d4 ("iommu/arm-smmu-v3: Maintain a SID->device structure")
Cc: stable(a)vger.kernel.org
Suggested-by: Jason Gunthorpe <jgg(a)nvidia.com>
Reviewed-by: Jason Gunthorpe <jgg(a)nvidia.com>
Signed-off-by: Nicolin Chen <nicolinc(a)nvidia.com>
Link: https://lore.kernel.org/r/20250415185620.504299-1-nicolinc@nvidia.com
Signed-off-by: Will Deacon <will(a)kernel.org>
diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
index 5467f85dd463..0826b6bdf327 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
@@ -3388,6 +3388,7 @@ static int arm_smmu_insert_master(struct arm_smmu_device *smmu,
mutex_lock(&smmu->streams_mutex);
for (i = 0; i < fwspec->num_ids; i++) {
struct arm_smmu_stream *new_stream = &master->streams[i];
+ struct rb_node *existing;
u32 sid = fwspec->ids[i];
new_stream->id = sid;
@@ -3398,10 +3399,20 @@ static int arm_smmu_insert_master(struct arm_smmu_device *smmu,
break;
/* Insert into SID tree */
- if (rb_find_add(&new_stream->node, &smmu->streams,
- arm_smmu_streams_cmp_node)) {
- dev_warn(master->dev, "stream %u already in tree\n",
- sid);
+ existing = rb_find_add(&new_stream->node, &smmu->streams,
+ arm_smmu_streams_cmp_node);
+ if (existing) {
+ struct arm_smmu_master *existing_master =
+ rb_entry(existing, struct arm_smmu_stream, node)
+ ->master;
+
+ /* Bridged PCI devices may end up with duplicated IDs */
+ if (existing_master == master)
+ continue;
+
+ dev_warn(master->dev,
+ "stream %u already in tree from dev %s\n", sid,
+ dev_name(existing_master->dev));
ret = -EINVAL;
break;
}
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.4.y
git checkout FETCH_HEAD
git cherry-pick -x 8dee308e4c01dea48fc104d37f92d5b58c50b96c
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025050502-aqueduct-contented-b9f9@gregkh' --subject-prefix 'PATCH 5.4.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 8dee308e4c01dea48fc104d37f92d5b58c50b96c Mon Sep 17 00:00:00 2001
From: Pavel Paklov <Pavel.Paklov(a)cyberprotect.ru>
Date: Tue, 25 Mar 2025 09:22:44 +0000
Subject: [PATCH] iommu/amd: Fix potential buffer overflow in
parse_ivrs_acpihid
There is a string parsing logic error which can lead to an overflow of hid
or uid buffers. Comparing ACPIID_LEN against a total string length doesn't
take into account the lengths of individual hid and uid buffers so the
check is insufficient in some cases. For example if the length of hid
string is 4 and the length of the uid string is 260, the length of str
will be equal to ACPIID_LEN + 1 but uid string will overflow uid buffer
which size is 256.
The same applies to the hid string with length 13 and uid string with
length 250.
Check the length of hid and uid strings separately to prevent
buffer overflow.
Found by Linux Verification Center (linuxtesting.org) with SVACE.
Fixes: ca3bf5d47cec ("iommu/amd: Introduces ivrs_acpihid kernel parameter")
Cc: stable(a)vger.kernel.org
Signed-off-by: Pavel Paklov <Pavel.Paklov(a)cyberprotect.ru>
Link: https://lore.kernel.org/r/20250325092259.392844-1-Pavel.Paklov@cyberprotect…
Signed-off-by: Joerg Roedel <jroedel(a)suse.de>
diff --git a/drivers/iommu/amd/init.c b/drivers/iommu/amd/init.c
index dd9e26b7b718..14aa0d77df26 100644
--- a/drivers/iommu/amd/init.c
+++ b/drivers/iommu/amd/init.c
@@ -3664,6 +3664,14 @@ static int __init parse_ivrs_acpihid(char *str)
while (*uid == '0' && *(uid + 1))
uid++;
+ if (strlen(hid) >= ACPIHID_HID_LEN) {
+ pr_err("Invalid command line: hid is too long\n");
+ return 1;
+ } else if (strlen(uid) >= ACPIHID_UID_LEN) {
+ pr_err("Invalid command line: uid is too long\n");
+ return 1;
+ }
+
i = early_acpihid_map_size++;
memcpy(early_acpihid_map[i].hid, hid, strlen(hid));
memcpy(early_acpihid_map[i].uid, uid, strlen(uid));
The patch below does not apply to the 6.6-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.6.y
git checkout FETCH_HEAD
git cherry-pick -x f95bbfe18512c5c018720468959edac056a17196
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025050548-pursuable-absence-aa7d@gregkh' --subject-prefix 'PATCH 6.6.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From f95bbfe18512c5c018720468959edac056a17196 Mon Sep 17 00:00:00 2001
From: Shyam Saini <shyamsaini(a)linux.microsoft.com>
Date: Thu, 27 Feb 2025 10:49:30 -0800
Subject: [PATCH] drivers: base: handle module_kobject creation
module_add_driver() relies on module_kset list for
/sys/module/<built-in-module>/drivers directory creation.
Since,
commit 96a1a2412acba ("kernel/params.c: defer most of param_sysfs_init() to late_initcall time")
drivers which are initialized from subsys_initcall() or any other
higher precedence initcall couldn't find the related kobject entry
in the module_kset list because module_kset is not fully populated
by the time module_add_driver() refers it. As a consequence,
module_add_driver() returns early without calling make_driver_name().
Therefore, /sys/module/<built-in-module>/drivers is never created.
Fix this issue by letting module_add_driver() handle module_kobject
creation itself.
Fixes: 96a1a2412acb ("kernel/params.c: defer most of param_sysfs_init() to late_initcall time")
Cc: stable(a)vger.kernel.org # requires all other patches from the series
Suggested-by: Rasmus Villemoes <linux(a)rasmusvillemoes.dk>
Signed-off-by: Shyam Saini <shyamsaini(a)linux.microsoft.com>
Acked-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
Link: https://lore.kernel.org/r/20250227184930.34163-5-shyamsaini@linux.microsoft…
Signed-off-by: Petr Pavlu <petr.pavlu(a)suse.com>
diff --git a/drivers/base/module.c b/drivers/base/module.c
index 5bc71bea883a..218aaa096455 100644
--- a/drivers/base/module.c
+++ b/drivers/base/module.c
@@ -42,16 +42,13 @@ int module_add_driver(struct module *mod, const struct device_driver *drv)
if (mod)
mk = &mod->mkobj;
else if (drv->mod_name) {
- struct kobject *mkobj;
-
- /* Lookup built-in module entry in /sys/modules */
- mkobj = kset_find_obj(module_kset, drv->mod_name);
- if (mkobj) {
- mk = container_of(mkobj, struct module_kobject, kobj);
+ /* Lookup or create built-in module entry in /sys/modules */
+ mk = lookup_or_create_module_kobject(drv->mod_name);
+ if (mk) {
/* remember our module structure */
drv->p->mkobj = mk;
- /* kset_find_obj took a reference */
- kobject_put(mkobj);
+ /* lookup_or_create_module_kobject took a reference */
+ kobject_put(&mk->kobj);
}
}
The patch below does not apply to the 6.12-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.12.y
git checkout FETCH_HEAD
git cherry-pick -x f95bbfe18512c5c018720468959edac056a17196
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025050547-conduit-flyable-e816@gregkh' --subject-prefix 'PATCH 6.12.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From f95bbfe18512c5c018720468959edac056a17196 Mon Sep 17 00:00:00 2001
From: Shyam Saini <shyamsaini(a)linux.microsoft.com>
Date: Thu, 27 Feb 2025 10:49:30 -0800
Subject: [PATCH] drivers: base: handle module_kobject creation
module_add_driver() relies on module_kset list for
/sys/module/<built-in-module>/drivers directory creation.
Since,
commit 96a1a2412acba ("kernel/params.c: defer most of param_sysfs_init() to late_initcall time")
drivers which are initialized from subsys_initcall() or any other
higher precedence initcall couldn't find the related kobject entry
in the module_kset list because module_kset is not fully populated
by the time module_add_driver() refers it. As a consequence,
module_add_driver() returns early without calling make_driver_name().
Therefore, /sys/module/<built-in-module>/drivers is never created.
Fix this issue by letting module_add_driver() handle module_kobject
creation itself.
Fixes: 96a1a2412acb ("kernel/params.c: defer most of param_sysfs_init() to late_initcall time")
Cc: stable(a)vger.kernel.org # requires all other patches from the series
Suggested-by: Rasmus Villemoes <linux(a)rasmusvillemoes.dk>
Signed-off-by: Shyam Saini <shyamsaini(a)linux.microsoft.com>
Acked-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
Link: https://lore.kernel.org/r/20250227184930.34163-5-shyamsaini@linux.microsoft…
Signed-off-by: Petr Pavlu <petr.pavlu(a)suse.com>
diff --git a/drivers/base/module.c b/drivers/base/module.c
index 5bc71bea883a..218aaa096455 100644
--- a/drivers/base/module.c
+++ b/drivers/base/module.c
@@ -42,16 +42,13 @@ int module_add_driver(struct module *mod, const struct device_driver *drv)
if (mod)
mk = &mod->mkobj;
else if (drv->mod_name) {
- struct kobject *mkobj;
-
- /* Lookup built-in module entry in /sys/modules */
- mkobj = kset_find_obj(module_kset, drv->mod_name);
- if (mkobj) {
- mk = container_of(mkobj, struct module_kobject, kobj);
+ /* Lookup or create built-in module entry in /sys/modules */
+ mk = lookup_or_create_module_kobject(drv->mod_name);
+ if (mk) {
/* remember our module structure */
drv->p->mkobj = mk;
- /* kset_find_obj took a reference */
- kobject_put(mkobj);
+ /* lookup_or_create_module_kobject took a reference */
+ kobject_put(&mk->kobj);
}
}
The patch below does not apply to the 6.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.14.y
git checkout FETCH_HEAD
git cherry-pick -x f95bbfe18512c5c018720468959edac056a17196
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025050546-oozy-nylon-220d@gregkh' --subject-prefix 'PATCH 6.14.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From f95bbfe18512c5c018720468959edac056a17196 Mon Sep 17 00:00:00 2001
From: Shyam Saini <shyamsaini(a)linux.microsoft.com>
Date: Thu, 27 Feb 2025 10:49:30 -0800
Subject: [PATCH] drivers: base: handle module_kobject creation
module_add_driver() relies on module_kset list for
/sys/module/<built-in-module>/drivers directory creation.
Since,
commit 96a1a2412acba ("kernel/params.c: defer most of param_sysfs_init() to late_initcall time")
drivers which are initialized from subsys_initcall() or any other
higher precedence initcall couldn't find the related kobject entry
in the module_kset list because module_kset is not fully populated
by the time module_add_driver() refers it. As a consequence,
module_add_driver() returns early without calling make_driver_name().
Therefore, /sys/module/<built-in-module>/drivers is never created.
Fix this issue by letting module_add_driver() handle module_kobject
creation itself.
Fixes: 96a1a2412acb ("kernel/params.c: defer most of param_sysfs_init() to late_initcall time")
Cc: stable(a)vger.kernel.org # requires all other patches from the series
Suggested-by: Rasmus Villemoes <linux(a)rasmusvillemoes.dk>
Signed-off-by: Shyam Saini <shyamsaini(a)linux.microsoft.com>
Acked-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
Link: https://lore.kernel.org/r/20250227184930.34163-5-shyamsaini@linux.microsoft…
Signed-off-by: Petr Pavlu <petr.pavlu(a)suse.com>
diff --git a/drivers/base/module.c b/drivers/base/module.c
index 5bc71bea883a..218aaa096455 100644
--- a/drivers/base/module.c
+++ b/drivers/base/module.c
@@ -42,16 +42,13 @@ int module_add_driver(struct module *mod, const struct device_driver *drv)
if (mod)
mk = &mod->mkobj;
else if (drv->mod_name) {
- struct kobject *mkobj;
-
- /* Lookup built-in module entry in /sys/modules */
- mkobj = kset_find_obj(module_kset, drv->mod_name);
- if (mkobj) {
- mk = container_of(mkobj, struct module_kobject, kobj);
+ /* Lookup or create built-in module entry in /sys/modules */
+ mk = lookup_or_create_module_kobject(drv->mod_name);
+ if (mk) {
/* remember our module structure */
drv->p->mkobj = mk;
- /* kset_find_obj took a reference */
- kobject_put(mkobj);
+ /* lookup_or_create_module_kobject took a reference */
+ kobject_put(&mk->kobj);
}
}
Good Day,
How are you?
My name is Calib Cassim from Eskom Holdings Ltd. SA. I got your email from
my personal search.
I have in my position an (over-invoice / over-estimated) contract amount
executed by a Foreign Contractor through my Department, which the
contractor has been paid, and left the amount of $25.5M with the payment
Management.
So, I need your help to receive the amount as the Beneficiary on my behalf
for an investment in your area, I will obtain all the needed legal
documents on your name for the transfer.
If you can help handle it, please reply with your phone number for more
discussions and guidelines.
Thanks,
Mr. Calib
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.4.y
git checkout FETCH_HEAD
git cherry-pick -x ac4e04d9e378f5aa826c2406ad7871ae1b6a6fb9
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025050520-deprecate-skinny-0232@gregkh' --subject-prefix 'PATCH 5.4.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From ac4e04d9e378f5aa826c2406ad7871ae1b6a6fb9 Mon Sep 17 00:00:00 2001
From: Srinivas Pandruvada <srinivas.pandruvada(a)linux.intel.com>
Date: Tue, 29 Apr 2025 14:07:11 -0700
Subject: [PATCH] cpufreq: intel_pstate: Unchecked MSR aceess in legacy mode
When turbo mode is unavailable on a Skylake-X system, executing the
command:
# echo 1 > /sys/devices/system/cpu/intel_pstate/no_turbo
results in an unchecked MSR access error:
WRMSR to 0x199 (attempted to write 0x0000000100001300).
This issue was reproduced on an OEM (Original Equipment Manufacturer)
system and is not a common problem across all Skylake-X systems.
This error occurs because the MSR 0x199 Turbo Engage Bit (bit 32) is set
when turbo mode is disabled. The issue arises when intel_pstate fails to
detect that turbo mode is disabled. Here intel_pstate relies on
MSR_IA32_MISC_ENABLE bit 38 to determine the status of turbo mode.
However, on this system, bit 38 is not set even when turbo mode is
disabled.
According to the Intel Software Developer's Manual (SDM), the BIOS sets
this bit during platform initialization to enable or disable
opportunistic processor performance operations. Logically, this bit
should be set in such cases. However, the SDM also specifies that "OS
and applications must use CPUID leaf 06H to detect processors with
opportunistic processor performance operations enabled."
Therefore, in addition to checking MSR_IA32_MISC_ENABLE bit 38, verify
that CPUID.06H:EAX[1] is 0 to accurately determine if turbo mode is
disabled.
Fixes: 4521e1a0ce17 ("cpufreq: intel_pstate: Reflect current no_turbo state correctly")
Signed-off-by: Srinivas Pandruvada <srinivas.pandruvada(a)linux.intel.com>
Cc: All applicable <stable(a)vger.kernel.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki(a)intel.com>
diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index f41ed0b9e610..ba9bf06f1c77 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -598,6 +598,9 @@ static bool turbo_is_disabled(void)
{
u64 misc_en;
+ if (!cpu_feature_enabled(X86_FEATURE_IDA))
+ return true;
+
rdmsrl(MSR_IA32_MISC_ENABLE, misc_en);
return !!(misc_en & MSR_IA32_MISC_ENABLE_TURBO_DISABLE);
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.10.y
git checkout FETCH_HEAD
git cherry-pick -x ac4e04d9e378f5aa826c2406ad7871ae1b6a6fb9
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025050518-excusable-payback-b0f6@gregkh' --subject-prefix 'PATCH 5.10.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From ac4e04d9e378f5aa826c2406ad7871ae1b6a6fb9 Mon Sep 17 00:00:00 2001
From: Srinivas Pandruvada <srinivas.pandruvada(a)linux.intel.com>
Date: Tue, 29 Apr 2025 14:07:11 -0700
Subject: [PATCH] cpufreq: intel_pstate: Unchecked MSR aceess in legacy mode
When turbo mode is unavailable on a Skylake-X system, executing the
command:
# echo 1 > /sys/devices/system/cpu/intel_pstate/no_turbo
results in an unchecked MSR access error:
WRMSR to 0x199 (attempted to write 0x0000000100001300).
This issue was reproduced on an OEM (Original Equipment Manufacturer)
system and is not a common problem across all Skylake-X systems.
This error occurs because the MSR 0x199 Turbo Engage Bit (bit 32) is set
when turbo mode is disabled. The issue arises when intel_pstate fails to
detect that turbo mode is disabled. Here intel_pstate relies on
MSR_IA32_MISC_ENABLE bit 38 to determine the status of turbo mode.
However, on this system, bit 38 is not set even when turbo mode is
disabled.
According to the Intel Software Developer's Manual (SDM), the BIOS sets
this bit during platform initialization to enable or disable
opportunistic processor performance operations. Logically, this bit
should be set in such cases. However, the SDM also specifies that "OS
and applications must use CPUID leaf 06H to detect processors with
opportunistic processor performance operations enabled."
Therefore, in addition to checking MSR_IA32_MISC_ENABLE bit 38, verify
that CPUID.06H:EAX[1] is 0 to accurately determine if turbo mode is
disabled.
Fixes: 4521e1a0ce17 ("cpufreq: intel_pstate: Reflect current no_turbo state correctly")
Signed-off-by: Srinivas Pandruvada <srinivas.pandruvada(a)linux.intel.com>
Cc: All applicable <stable(a)vger.kernel.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki(a)intel.com>
diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index f41ed0b9e610..ba9bf06f1c77 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -598,6 +598,9 @@ static bool turbo_is_disabled(void)
{
u64 misc_en;
+ if (!cpu_feature_enabled(X86_FEATURE_IDA))
+ return true;
+
rdmsrl(MSR_IA32_MISC_ENABLE, misc_en);
return !!(misc_en & MSR_IA32_MISC_ENABLE_TURBO_DISABLE);
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.15.y
git checkout FETCH_HEAD
git cherry-pick -x ac4e04d9e378f5aa826c2406ad7871ae1b6a6fb9
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025050517-linked-numbing-6e20@gregkh' --subject-prefix 'PATCH 5.15.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From ac4e04d9e378f5aa826c2406ad7871ae1b6a6fb9 Mon Sep 17 00:00:00 2001
From: Srinivas Pandruvada <srinivas.pandruvada(a)linux.intel.com>
Date: Tue, 29 Apr 2025 14:07:11 -0700
Subject: [PATCH] cpufreq: intel_pstate: Unchecked MSR aceess in legacy mode
When turbo mode is unavailable on a Skylake-X system, executing the
command:
# echo 1 > /sys/devices/system/cpu/intel_pstate/no_turbo
results in an unchecked MSR access error:
WRMSR to 0x199 (attempted to write 0x0000000100001300).
This issue was reproduced on an OEM (Original Equipment Manufacturer)
system and is not a common problem across all Skylake-X systems.
This error occurs because the MSR 0x199 Turbo Engage Bit (bit 32) is set
when turbo mode is disabled. The issue arises when intel_pstate fails to
detect that turbo mode is disabled. Here intel_pstate relies on
MSR_IA32_MISC_ENABLE bit 38 to determine the status of turbo mode.
However, on this system, bit 38 is not set even when turbo mode is
disabled.
According to the Intel Software Developer's Manual (SDM), the BIOS sets
this bit during platform initialization to enable or disable
opportunistic processor performance operations. Logically, this bit
should be set in such cases. However, the SDM also specifies that "OS
and applications must use CPUID leaf 06H to detect processors with
opportunistic processor performance operations enabled."
Therefore, in addition to checking MSR_IA32_MISC_ENABLE bit 38, verify
that CPUID.06H:EAX[1] is 0 to accurately determine if turbo mode is
disabled.
Fixes: 4521e1a0ce17 ("cpufreq: intel_pstate: Reflect current no_turbo state correctly")
Signed-off-by: Srinivas Pandruvada <srinivas.pandruvada(a)linux.intel.com>
Cc: All applicable <stable(a)vger.kernel.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki(a)intel.com>
diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index f41ed0b9e610..ba9bf06f1c77 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -598,6 +598,9 @@ static bool turbo_is_disabled(void)
{
u64 misc_en;
+ if (!cpu_feature_enabled(X86_FEATURE_IDA))
+ return true;
+
rdmsrl(MSR_IA32_MISC_ENABLE, misc_en);
return !!(misc_en & MSR_IA32_MISC_ENABLE_TURBO_DISABLE);
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.15.y
git checkout FETCH_HEAD
git cherry-pick -x e08e49d986f82c30f42ad0ed43ebbede1e1e3739
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025050520-specimen-smell-8a92@gregkh' --subject-prefix 'PATCH 5.15.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From e08e49d986f82c30f42ad0ed43ebbede1e1e3739 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef(a)toxicpanda.com>
Date: Mon, 14 Apr 2025 14:51:58 -0400
Subject: [PATCH] btrfs: adjust subpage bit start based on sectorsize
When running machines with 64k page size and a 16k nodesize we started
seeing tree log corruption in production. This turned out to be because
we were not writing out dirty blocks sometimes, so this in fact affects
all metadata writes.
When writing out a subpage EB we scan the subpage bitmap for a dirty
range. If the range isn't dirty we do
bit_start++;
to move onto the next bit. The problem is the bitmap is based on the
number of sectors that an EB has. So in this case, we have a 64k
pagesize, 16k nodesize, but a 4k sectorsize. This means our bitmap is 4
bits for every node. With a 64k page size we end up with 4 nodes per
page.
To make this easier this is how everything looks
[0 16k 32k 48k ] logical address
[0 4 8 12 ] radix tree offset
[ 64k page ] folio
[ 16k eb ][ 16k eb ][ 16k eb ][ 16k eb ] extent buffers
[ | | | | | | | | | | | | | | | | ] bitmap
Now we use all of our addressing based on fs_info->sectorsize_bits, so
as you can see the above our 16k eb->start turns into radix entry 4.
When we find a dirty range for our eb, we correctly do bit_start +=
sectors_per_node, because if we start at bit 0, the next bit for the
next eb is 4, to correspond to eb->start 16k.
However if our range is clean, we will do bit_start++, which will now
put us offset from our radix tree entries.
In our case, assume that the first time we check the bitmap the block is
not dirty, we increment bit_start so now it == 1, and then we loop
around and check again. This time it is dirty, and we go to find that
start using the following equation
start = folio_start + bit_start * fs_info->sectorsize;
so in the case above, eb->start 0 is now dirty, and we calculate start
as
0 + 1 * fs_info->sectorsize = 4096
4096 >> 12 = 1
Now we're looking up the radix tree for 1, and we won't find an eb.
What's worse is now we're using bit_start == 1, so we do bit_start +=
sectors_per_node, which is now 5. If that eb is dirty we will run into
the same thing, we will look at an offset that is not populated in the
radix tree, and now we're skipping the writeout of dirty extent buffers.
The best fix for this is to not use sectorsize_bits to address nodes,
but that's a larger change. Since this is a fs corruption problem fix
it simply by always using sectors_per_node to increment the start bit.
Fixes: c4aec299fa8f ("btrfs: introduce submit_eb_subpage() to submit a subpage metadata page")
CC: stable(a)vger.kernel.org # 5.15+
Reviewed-by: Boris Burkov <boris(a)bur.io>
Reviewed-by: Qu Wenruo <wqu(a)suse.com>
Signed-off-by: Josef Bacik <josef(a)toxicpanda.com>
Signed-off-by: David Sterba <dsterba(a)suse.com>
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 197f5e51c474..8515c31f563b 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2047,7 +2047,7 @@ static int submit_eb_subpage(struct folio *folio, struct writeback_control *wbc)
subpage->bitmaps)) {
spin_unlock_irqrestore(&subpage->lock, flags);
spin_unlock(&folio->mapping->i_private_lock);
- bit_start++;
+ bit_start += sectors_per_node;
continue;
}
These patchset adds support for VPU_JSM_STATUS_MVNCI_CONTEXT_VIOLATION_HW
message added in recent VPU firmware. Without it the driver will not be able to
process any jobs after this message is received and would need to be reloaded.
The last patch in this series is as-is from upstream, but other two patches
had to be rebased because of missing new CMDQ UAPI changes that should not be
backported to stable.
Karol Wachowski (3):
accel/ivpu: Abort all jobs after command queue unregister
accel/ivpu: Fix locking order in ivpu_job_submit
accel/ivpu: Add handling of VPU_JSM_STATUS_MVNCI_CONTEXT_VIOLATION_HW
drivers/accel/ivpu/ivpu_drv.c | 32 ++-------
drivers/accel/ivpu/ivpu_drv.h | 2 +
drivers/accel/ivpu/ivpu_job.c | 111 ++++++++++++++++++++++++++------
drivers/accel/ivpu/ivpu_job.h | 1 +
drivers/accel/ivpu/ivpu_mmu.c | 3 +-
drivers/accel/ivpu/ivpu_sysfs.c | 5 +-
6 files changed, 103 insertions(+), 51 deletions(-)
--
2.45.1
From: Ashish Kalra <ashish.kalra(a)amd.com>
When the shared pages are being made private during kdump preparation
there are additional checks to handle shared GHCB pages.
These additional checks include handling the case of GHCB page being
contained within a huge page.
The check for handling the case of GHCB contained within a huge
page incorrectly skips a page just below the GHCB page from being
transitioned back to private during kdump preparation.
This skipped page causes a 0x404 #VC exception when it is accessed
later while dumping guest memory during vmcore generation via kdump.
Correct the range to be checked for GHCB contained in a huge page.
Also ensure that the skipped huge page containing the GHCB page is
transitioned back to private by applying the correct address mask
later when changing GHCBs to private at end of kdump preparation.
Cc: stable(a)vger.kernel.org
Fixes: 3074152e56c9 ("x86/sev: Convert shared memory back to private on kexec")
Signed-off-by: Ashish Kalra <ashish.kalra(a)amd.com>
---
arch/x86/coco/sev/core.c | 14 ++++++++------
1 file changed, 8 insertions(+), 6 deletions(-)
diff --git a/arch/x86/coco/sev/core.c b/arch/x86/coco/sev/core.c
index d35fec7b164a..e39db6714f09 100644
--- a/arch/x86/coco/sev/core.c
+++ b/arch/x86/coco/sev/core.c
@@ -1019,7 +1019,8 @@ static void unshare_all_memory(void)
data = per_cpu(runtime_data, cpu);
ghcb = (unsigned long)&data->ghcb_page;
- if (addr <= ghcb && ghcb <= addr + size) {
+ /* Handle the case of a huge page containing the GHCB page */
+ if (addr <= ghcb && ghcb < addr + size) {
skipped_addr = true;
break;
}
@@ -1131,9 +1132,8 @@ static void shutdown_all_aps(void)
void snp_kexec_finish(void)
{
struct sev_es_runtime_data *data;
+ unsigned long size, ghcb;
unsigned int level, cpu;
- unsigned long size;
- struct ghcb *ghcb;
pte_t *pte;
if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP))
@@ -1157,11 +1157,13 @@ void snp_kexec_finish(void)
for_each_possible_cpu(cpu) {
data = per_cpu(runtime_data, cpu);
- ghcb = &data->ghcb_page;
- pte = lookup_address((unsigned long)ghcb, &level);
+ ghcb = (unsigned long)&data->ghcb_page;
+ pte = lookup_address(ghcb, &level);
size = page_level_size(level);
+ /* Handle the case of a huge page containing the GHCB page */
+ ghcb &= page_level_mask(level);
set_pte_enc(pte, level, (void *)ghcb);
- snp_set_memory_private((unsigned long)ghcb, (size / PAGE_SIZE));
+ snp_set_memory_private(ghcb, (size / PAGE_SIZE));
}
}
--
2.34.1
Hi Greg,
In kernel v6.10 the zoned storage approach was changed from zoned write
locking to zone write plugging. Because of this change the block layer
must preserve the request order. Hence this backport of Christoph's
"don't reorder requests passed to ->queue_rqs" patch series. Please
consider this patch series for inclusion in the 6.12 stable kernel.
See also https://lore.kernel.org/linux-block/20241113152050.157179-1-hch@lst.de/.
Thanks,
Bart.
Christoph Hellwig (3):
block: remove rq_list_move
block: add a rq_list type
block: don't reorder requests in blk_add_rq_to_plug
block/blk-core.c | 6 +--
block/blk-merge.c | 2 +-
block/blk-mq.c | 42 +++++++--------
block/blk-mq.h | 2 +-
drivers/block/null_blk/main.c | 9 ++--
drivers/block/virtio_blk.c | 13 +++--
drivers/nvme/host/apple.c | 2 +-
drivers/nvme/host/pci.c | 15 +++---
include/linux/blk-mq.h | 99 +++++++++++++++++------------------
include/linux/blkdev.h | 11 ++--
io_uring/rw.c | 4 +-
11 files changed, 102 insertions(+), 103 deletions(-)
Add the correct scale to get temperature in mili degree Celcius.
Add sign component to temperature scan element.
Signed-off-by: Sean Nyekjaer <sean(a)geanix.com>
---
Changes in v2:
- Correct offset is applied before scaling component
- Added sign component to temperature scan element
- Link to v1: https://lore.kernel.org/r/20250501-fxls-v1-1-f54061a07099@geanix.com
---
Sean Nyekjaer (2):
iio: accel: fxls8962af: Fix temperature calculation
iio: accel: fxls8962af: Fix sign temperature scan element
drivers/iio/accel/fxls8962af-core.c | 16 ++++++++++++++--
1 file changed, 14 insertions(+), 2 deletions(-)
---
base-commit: 609bc31eca06c7408e6860d8b46311ebe45c1fef
change-id: 20250501-fxls-307ef3d6d065
Best regards,
--
Sean Nyekjaer <sean(a)geanix.com>
Hi all,
I recently discovered an mm regression introduced in kernel version 6.9
that affects systems running as a Xen PV domain [1]. Original fix
proposal wasn't ideal, but it sparked a discussion which helped us
fully understand the root cause.
The new v2 patch contains changes based on David Hildenbrand's proposal
to cap max_nr to the number of PFNs that actually remain in the folio
and to clean up the loop.
Thanks,
Petr
[1] https://lore.kernel.org/lkml/20250429142237.22138-1-arkamar@atlas.cz
Petr Vaněk (1):
mm: fix folio_pte_batch() on XEN PV
mm/internal.h | 27 +++++++++++----------------
1 file changed, 11 insertions(+), 16 deletions(-)
--
2.48.1
arch_bpf_trampoline_size() provides JIT size of the BPF trampoline
before the buffer for JIT'ing it is allocated. The total number of
instructions emitted for BPF trampoline JIT code depends on where
the final image is located. So, the size arrived at with the dummy
pass in arch_bpf_trampoline_size() can vary from the actual size
needed in arch_prepare_bpf_trampoline(). When the instructions
accounted in arch_bpf_trampoline_size() is less than the number of
instructions emitted during the actual JIT compile of the trampoline,
the below warning is produced:
WARNING: CPU: 8 PID: 204190 at arch/powerpc/net/bpf_jit_comp.c:981 __arch_prepare_bpf_trampoline.isra.0+0xd2c/0xdcc
which is:
/* Make sure the trampoline generation logic doesn't overflow */
if (image && WARN_ON_ONCE(&image[ctx->idx] >
(u32 *)rw_image_end - BPF_INSN_SAFETY)) {
So, during the dummy pass, instead of providing some arbitrary image
location, account for maximum possible instructions if and when there
is a dependency with image location for JIT'ing.
Fixes: d243b62b7bd3 ("powerpc64/bpf: Add support for bpf trampolines")
Reported-by: Venkat Rao Bagalkote <venkat88(a)linux.ibm.com>
Closes: https://lore.kernel.org/all/6168bfc8-659f-4b5a-a6fb-90a916dde3b3@linux.ibm.…
Cc: stable(a)vger.kernel.org # v6.13+
Acked-by: Naveen N Rao (AMD) <naveen(a)kernel.org>
Tested-by: Venkat Rao Bagalkote <venkat88(a)linux.ibm.com>
Signed-off-by: Hari Bathini <hbathini(a)linux.ibm.com>
---
Changes since v2:
- Address review comments from Naveen:
- Remove additional padding for 'case BPF_LD | BPF_IMM | BPF_DW:'
in arch/powerpc/net/bpf_jit_comp*.c
- Merge the if sequence in bpf_jit_emit_func_call_rel() with the
other conditionals
- Naveen, carried your Acked-by tag as the additional changes are
minimal and in line with your suggestion. Feel free to update
if you look at it differently.
- Venkat, carried your Tested-by tag from v2 as the changes in
v3 should not alter the test result. Feel free to update if
you look at it differently.
Changes since v1:
- Pass NULL for image during intial pass and account for max. possible
instruction during this pass as Naveen suggested.
arch/powerpc/net/bpf_jit.h | 20 ++++++++++++++++---
arch/powerpc/net/bpf_jit_comp.c | 33 ++++++++++---------------------
arch/powerpc/net/bpf_jit_comp32.c | 6 ------
arch/powerpc/net/bpf_jit_comp64.c | 15 +++++++-------
4 files changed, 35 insertions(+), 39 deletions(-)
diff --git a/arch/powerpc/net/bpf_jit.h b/arch/powerpc/net/bpf_jit.h
index 6beacaec63d3..4c26912c2e3c 100644
--- a/arch/powerpc/net/bpf_jit.h
+++ b/arch/powerpc/net/bpf_jit.h
@@ -51,8 +51,16 @@
EMIT(PPC_INST_BRANCH_COND | (((cond) & 0x3ff) << 16) | (offset & 0xfffc)); \
} while (0)
-/* Sign-extended 32-bit immediate load */
+/*
+ * Sign-extended 32-bit immediate load
+ *
+ * If this is a dummy pass (!image), account for
+ * maximum possible instructions.
+ */
#define PPC_LI32(d, i) do { \
+ if (!image) \
+ ctx->idx += 2; \
+ else { \
if ((int)(uintptr_t)(i) >= -32768 && \
(int)(uintptr_t)(i) < 32768) \
EMIT(PPC_RAW_LI(d, i)); \
@@ -60,10 +68,15 @@
EMIT(PPC_RAW_LIS(d, IMM_H(i))); \
if (IMM_L(i)) \
EMIT(PPC_RAW_ORI(d, d, IMM_L(i))); \
- } } while(0)
+ } \
+ } } while (0)
#ifdef CONFIG_PPC64
+/* If dummy pass (!image), account for maximum possible instructions */
#define PPC_LI64(d, i) do { \
+ if (!image) \
+ ctx->idx += 5; \
+ else { \
if ((long)(i) >= -2147483648 && \
(long)(i) < 2147483648) \
PPC_LI32(d, i); \
@@ -84,7 +97,8 @@
if ((uintptr_t)(i) & 0x000000000000ffffULL) \
EMIT(PPC_RAW_ORI(d, d, (uintptr_t)(i) & \
0xffff)); \
- } } while (0)
+ } \
+ } } while (0)
#define PPC_LI_ADDR PPC_LI64
#ifndef CONFIG_PPC_KERNEL_PCREL
diff --git a/arch/powerpc/net/bpf_jit_comp.c b/arch/powerpc/net/bpf_jit_comp.c
index 2991bb171a9b..c0684733e9d6 100644
--- a/arch/powerpc/net/bpf_jit_comp.c
+++ b/arch/powerpc/net/bpf_jit_comp.c
@@ -504,10 +504,11 @@ static int invoke_bpf_prog(u32 *image, u32 *ro_image, struct codegen_context *ct
EMIT(PPC_RAW_ADDI(_R3, _R1, regs_off));
if (!p->jited)
PPC_LI_ADDR(_R4, (unsigned long)p->insnsi);
- if (!create_branch(&branch_insn, (u32 *)&ro_image[ctx->idx], (unsigned long)p->bpf_func,
- BRANCH_SET_LINK)) {
- if (image)
- image[ctx->idx] = ppc_inst_val(branch_insn);
+ /* Account for max possible instructions during dummy pass for size calculation */
+ if (image && !create_branch(&branch_insn, (u32 *)&ro_image[ctx->idx],
+ (unsigned long)p->bpf_func,
+ BRANCH_SET_LINK)) {
+ image[ctx->idx] = ppc_inst_val(branch_insn);
ctx->idx++;
} else {
EMIT(PPC_RAW_LL(_R12, _R25, offsetof(struct bpf_prog, bpf_func)));
@@ -889,7 +890,8 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *rw_im
bpf_trampoline_restore_tail_call_cnt(image, ctx, func_frame_offset, r4_off);
/* Reserve space to patch branch instruction to skip fexit progs */
- im->ip_after_call = &((u32 *)ro_image)[ctx->idx];
+ if (ro_image) /* image is NULL for dummy pass */
+ im->ip_after_call = &((u32 *)ro_image)[ctx->idx];
EMIT(PPC_RAW_NOP());
}
@@ -912,7 +914,8 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *rw_im
}
if (flags & BPF_TRAMP_F_CALL_ORIG) {
- im->ip_epilogue = &((u32 *)ro_image)[ctx->idx];
+ if (ro_image) /* image is NULL for dummy pass */
+ im->ip_epilogue = &((u32 *)ro_image)[ctx->idx];
PPC_LI_ADDR(_R3, im);
ret = bpf_jit_emit_func_call_rel(image, ro_image, ctx,
(unsigned long)__bpf_tramp_exit);
@@ -973,25 +976,9 @@ int arch_bpf_trampoline_size(const struct btf_func_model *m, u32 flags,
struct bpf_tramp_links *tlinks, void *func_addr)
{
struct bpf_tramp_image im;
- void *image;
int ret;
- /*
- * Allocate a temporary buffer for __arch_prepare_bpf_trampoline().
- * This will NOT cause fragmentation in direct map, as we do not
- * call set_memory_*() on this buffer.
- *
- * We cannot use kvmalloc here, because we need image to be in
- * module memory range.
- */
- image = bpf_jit_alloc_exec(PAGE_SIZE);
- if (!image)
- return -ENOMEM;
-
- ret = __arch_prepare_bpf_trampoline(&im, image, image + PAGE_SIZE, image,
- m, flags, tlinks, func_addr);
- bpf_jit_free_exec(image);
-
+ ret = __arch_prepare_bpf_trampoline(&im, NULL, NULL, NULL, m, flags, tlinks, func_addr);
return ret;
}
diff --git a/arch/powerpc/net/bpf_jit_comp32.c b/arch/powerpc/net/bpf_jit_comp32.c
index c4db278dae36..0aace304dfe1 100644
--- a/arch/powerpc/net/bpf_jit_comp32.c
+++ b/arch/powerpc/net/bpf_jit_comp32.c
@@ -313,7 +313,6 @@ int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, u32 *fimage, struct code
u64 func_addr;
u32 true_cond;
u32 tmp_idx;
- int j;
if (i && (BPF_CLASS(code) == BPF_ALU64 || BPF_CLASS(code) == BPF_ALU) &&
(BPF_CLASS(prevcode) == BPF_ALU64 || BPF_CLASS(prevcode) == BPF_ALU) &&
@@ -1099,13 +1098,8 @@ int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, u32 *fimage, struct code
* 16 byte instruction that uses two 'struct bpf_insn'
*/
case BPF_LD | BPF_IMM | BPF_DW: /* dst = (u64) imm */
- tmp_idx = ctx->idx;
PPC_LI32(dst_reg_h, (u32)insn[i + 1].imm);
PPC_LI32(dst_reg, (u32)insn[i].imm);
- /* padding to allow full 4 instructions for later patching */
- if (!image)
- for (j = ctx->idx - tmp_idx; j < 4; j++)
- EMIT(PPC_RAW_NOP());
/* Adjust for two bpf instructions */
addrs[++i] = ctx->idx * 4;
break;
diff --git a/arch/powerpc/net/bpf_jit_comp64.c b/arch/powerpc/net/bpf_jit_comp64.c
index 233703b06d7c..5daa77aee7f7 100644
--- a/arch/powerpc/net/bpf_jit_comp64.c
+++ b/arch/powerpc/net/bpf_jit_comp64.c
@@ -227,7 +227,14 @@ int bpf_jit_emit_func_call_rel(u32 *image, u32 *fimage, struct codegen_context *
#ifdef CONFIG_PPC_KERNEL_PCREL
reladdr = func_addr - local_paca->kernelbase;
- if (reladdr < (long)SZ_8G && reladdr >= -(long)SZ_8G) {
+ /*
+ * If fimage is NULL (the initial pass to find image size),
+ * account for the maximum no. of instructions possible.
+ */
+ if (!fimage) {
+ ctx->idx += 7;
+ return 0;
+ } else if (reladdr < (long)SZ_8G && reladdr >= -(long)SZ_8G) {
EMIT(PPC_RAW_LD(_R12, _R13, offsetof(struct paca_struct, kernelbase)));
/* Align for subsequent prefix instruction */
if (!IS_ALIGNED((unsigned long)fimage + CTX_NIA(ctx), 8))
@@ -412,7 +419,6 @@ int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, u32 *fimage, struct code
u64 imm64;
u32 true_cond;
u32 tmp_idx;
- int j;
/*
* addrs[] maps a BPF bytecode address into a real offset from
@@ -1046,12 +1052,7 @@ int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, u32 *fimage, struct code
case BPF_LD | BPF_IMM | BPF_DW: /* dst = (u64) imm */
imm64 = ((u64)(u32) insn[i].imm) |
(((u64)(u32) insn[i+1].imm) << 32);
- tmp_idx = ctx->idx;
PPC_LI64(dst_reg, imm64);
- /* padding to allow full 5 instructions for later patching */
- if (!image)
- for (j = ctx->idx - tmp_idx; j < 5; j++)
- EMIT(PPC_RAW_NOP());
/* Adjust for two bpf instructions */
addrs[++i] = ctx->idx * 4;
break;
--
2.49.0
The patch titled
Subject: mm: fix folio_pte_batch() on XEN PV
has been added to the -mm mm-hotfixes-unstable branch. Its filename is
mm-fix-folio_pte_batch-on-xen-pv.patch
This patch will shortly appear at
https://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new.git/tree/patche…
This patch will later appear in the mm-hotfixes-unstable branch at
git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next via the mm-everything
branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
and is updated there every 2-3 working days
------------------------------------------------------
From: Petr Van��k <arkamar(a)atlas.cz>
Subject: mm: fix folio_pte_batch() on XEN PV
Date: Fri, 2 May 2025 23:50:19 +0200
On XEN PV, folio_pte_batch() can incorrectly batch beyond the end of a
folio due to a corner case in pte_advance_pfn(). Specifically, when the
PFN following the folio maps to an invalidated MFN,
expected_pte = pte_advance_pfn(expected_pte, nr);
produces a pte_none(). If the actual next PTE in memory is also
pte_none(), the pte_same() succeeds,
if (!pte_same(pte, expected_pte))
break;
the loop is not broken, and batching continues into unrelated memory.
For example, with a 4-page folio, the PTE layout might look like this:
[ 53.465673] [ T2552] folio_pte_batch: printing PTE values at addr=0x7f1ac9dc5000
[ 53.465674] [ T2552] PTE[453] = 000000010085c125
[ 53.465679] [ T2552] PTE[454] = 000000010085d125
[ 53.465682] [ T2552] PTE[455] = 000000010085e125
[ 53.465684] [ T2552] PTE[456] = 000000010085f125
[ 53.465686] [ T2552] PTE[457] = 0000000000000000 <-- not present
[ 53.465689] [ T2552] PTE[458] = 0000000101da7125
pte_advance_pfn(PTE[456]) returns a pte_none() due to invalid PFN->MFN
mapping. The next actual PTE (PTE[457]) is also pte_none(), so the loop
continues and includes PTE[457] in the batch, resulting in 5 batched
entries for a 4-page folio. This triggers the following warning:
[ 53.465751] [ T2552] page: refcount:85 mapcount:20 mapping:ffff88813ff4f6a8 index:0x110 pfn:0x10085c
[ 53.465754] [ T2552] head: order:2 mapcount:80 entire_mapcount:0 nr_pages_mapped:4 pincount:0
[ 53.465756] [ T2552] memcg:ffff888003573000
[ 53.465758] [ T2552] aops:0xffffffff8226fd20 ino:82467c dentry name(?):"libc.so.6"
[ 53.465761] [ T2552] flags: 0x2000000000416c(referenced|uptodate|lru|active|private|head|node=0|zone=2)
[ 53.465764] [ T2552] raw: 002000000000416c ffffea0004021f08 ffffea0004021908 ffff88813ff4f6a8
[ 53.465767] [ T2552] raw: 0000000000000110 ffff888133d8bd40 0000005500000013 ffff888003573000
[ 53.465768] [ T2552] head: 002000000000416c ffffea0004021f08 ffffea0004021908 ffff88813ff4f6a8
[ 53.465770] [ T2552] head: 0000000000000110 ffff888133d8bd40 0000005500000013 ffff888003573000
[ 53.465772] [ T2552] head: 0020000000000202 ffffea0004021701 000000040000004f 00000000ffffffff
[ 53.465774] [ T2552] head: 0000000300000003 8000000300000002 0000000000000013 0000000000000004
[ 53.465775] [ T2552] page dumped because: VM_WARN_ON_FOLIO((_Generic((page + nr_pages - 1), const struct page *: (const struct folio *)_compound_head(page + nr_pages - 1), struct page *: (struct folio *)_compound_head(page + nr_pages - 1))) != folio)
Original code works as expected everywhere, except on XEN PV, where
pte_advance_pfn() can yield a pte_none() after balloon inflation due to
MFNs invalidation. In XEN, pte_advance_pfn() ends up calling
__pte()->xen_make_pte()->pte_pfn_to_mfn(), which returns pte_none() when
mfn == INVALID_P2M_ENTRY.
The pte_pfn_to_mfn() documents that nastiness:
If there's no mfn for the pfn, then just create an
empty non-present pte. Unfortunately this loses
information about the original pfn, so
pte_mfn_to_pfn is asymmetric.
While such hacks should certainly be removed, we can do better in
folio_pte_batch() and simply check ahead of time how many PTEs we can
possibly batch in our folio.
This way, we can not only fix the issue but cleanup the code: removing the
pte_pfn() check inside the loop body and avoiding end_ptr comparison +
arithmetic.
Link: https://lkml.kernel.org/r/20250502215019.822-2-arkamar@atlas.cz
Fixes: f8d937761d65 ("mm/memory: optimize fork() with PTE-mapped THP")
Co-developed-by: David Hildenbrand <david(a)redhat.com>
Signed-off-by: David Hildenbrand <david(a)redhat.com>
Signed-off-by: Petr Van��k <arkamar(a)atlas.cz>
Cc: Ryan Roberts <ryan.roberts(a)arm.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/internal.h | 27 +++++++++++----------------
1 file changed, 11 insertions(+), 16 deletions(-)
--- a/mm/internal.h~mm-fix-folio_pte_batch-on-xen-pv
+++ a/mm/internal.h
@@ -248,11 +248,9 @@ static inline int folio_pte_batch(struct
pte_t *start_ptep, pte_t pte, int max_nr, fpb_t flags,
bool *any_writable, bool *any_young, bool *any_dirty)
{
- unsigned long folio_end_pfn = folio_pfn(folio) + folio_nr_pages(folio);
- const pte_t *end_ptep = start_ptep + max_nr;
pte_t expected_pte, *ptep;
bool writable, young, dirty;
- int nr;
+ int nr, cur_nr;
if (any_writable)
*any_writable = false;
@@ -265,11 +263,15 @@ static inline int folio_pte_batch(struct
VM_WARN_ON_FOLIO(!folio_test_large(folio) || max_nr < 1, folio);
VM_WARN_ON_FOLIO(page_folio(pfn_to_page(pte_pfn(pte))) != folio, folio);
+ /* Limit max_nr to the actual remaining PFNs in the folio we could batch. */
+ max_nr = min_t(unsigned long, max_nr,
+ folio_pfn(folio) + folio_nr_pages(folio) - pte_pfn(pte));
+
nr = pte_batch_hint(start_ptep, pte);
expected_pte = __pte_batch_clear_ignored(pte_advance_pfn(pte, nr), flags);
ptep = start_ptep + nr;
- while (ptep < end_ptep) {
+ while (nr < max_nr) {
pte = ptep_get(ptep);
if (any_writable)
writable = !!pte_write(pte);
@@ -282,14 +284,6 @@ static inline int folio_pte_batch(struct
if (!pte_same(pte, expected_pte))
break;
- /*
- * Stop immediately once we reached the end of the folio. In
- * corner cases the next PFN might fall into a different
- * folio.
- */
- if (pte_pfn(pte) >= folio_end_pfn)
- break;
-
if (any_writable)
*any_writable |= writable;
if (any_young)
@@ -297,12 +291,13 @@ static inline int folio_pte_batch(struct
if (any_dirty)
*any_dirty |= dirty;
- nr = pte_batch_hint(ptep, pte);
- expected_pte = pte_advance_pfn(expected_pte, nr);
- ptep += nr;
+ cur_nr = pte_batch_hint(ptep, pte);
+ expected_pte = pte_advance_pfn(expected_pte, cur_nr);
+ ptep += cur_nr;
+ nr += cur_nr;
}
- return min(ptep - start_ptep, max_nr);
+ return min(nr, max_nr);
}
/**
_
Patches currently in -mm which might be from arkamar(a)atlas.cz are
mm-fix-folio_pte_batch-on-xen-pv.patch
create_init_idmap() could be called before .bss section initialization
which is done in early_map_kernel() since data/test_prot could be set
wrong for PTE_MAYBE_NG macro.
PTE_MAYBE_NG macro is set according to value of "arm64_use_ng_mappings".
and this variable is located in .bss section.
# llvm-objdump-21 --syms vmlinux-gcc | grep arm64_use_ng_mappings
ffff800082f242a8 g O .bss 0000000000000001 arm64_use_ng_mappings
If .bss section doesn't initialized, "arm64_use_ng_mappings" would be set
with garbage value ant then the text_prot or data_prot could be set
with garbgae value.
Here is what i saw with kernel compiled via llvm-21
// create_init_idmap()
ffff80008255c058: d10103ff sub sp, sp, #0x40
ffff80008255c05c: a9017bfd stp x29, x30, [sp, #0x10]
ffff80008255c060: a90257f6 stp x22, x21, [sp, #0x20]
ffff80008255c064: a9034ff4 stp x20, x19, [sp, #0x30]
ffff80008255c068: 910043fd add x29, sp, #0x10
ffff80008255c06c: 90003fc8 adrp x8, 0xffff800082d54000
ffff80008255c070: d280e06a mov x10, #0x703 // =1795
ffff80008255c074: 91400409 add x9, x0, #0x1, lsl #12 // =0x1000
ffff80008255c078: 394a4108 ldrb w8, [x8, #0x290] ------------- (1)
ffff80008255c07c: f2e00d0a movk x10, #0x68, lsl #48
ffff80008255c080: f90007e9 str x9, [sp, #0x8]
ffff80008255c084: aa0103f3 mov x19, x1
ffff80008255c088: aa0003f4 mov x20, x0
ffff80008255c08c: 14000000 b 0xffff80008255c08c <__pi_create_init_idmap+0x34>
ffff80008255c090: aa082d56 orr x22, x10, x8, lsl #11 -------- (2)
Note (1) is load the arm64_use_ng_mappings value in w8.
and (2) is set the text or data prot with the w8 value to set PTE_NG bit.
If .bss section doesn't initialized, x8 can include garbage value
-- In case of some platform, x8 loaded with 0xcf -- it could generate
wrong mapping. (i.e) text_prot is expected with
PAGE_KERNEL_ROX(0x0040000000000F83) but
with garbage x8 -- 0xcf, it sets with (0x0040000000067F83)
and This makes boot failure with translation fault.
This error cannot happen according to code generated by compiler.
here is the case of gcc:
ffff80008260a940 <__pi_create_init_idmap>:
ffff80008260a940: d100c3ff sub sp, sp, #0x30
ffff80008260a944: aa0003ed mov x13, x0
ffff80008260a948: 91400400 add x0, x0, #0x1, lsl #12 // =0x1000
ffff80008260a94c: a9017bfd stp x29, x30, [sp, #0x10]
ffff80008260a950: 910043fd add x29, sp, #0x10
ffff80008260a954: f90017e0 str x0, [sp, #0x28]
ffff80008260a958: d00048c0 adrp x0, 0xffff800082f24000 <reset_devices>
ffff80008260a95c: 394aa000 ldrb w0, [x0, #0x2a8]
ffff80008260a960: 37000640 tbnz w0, #0x0, 0xffff80008260aa28 <__pi_create_init_idmap+0xe8> ---(3)
ffff80008260a964: d280f060 mov x0, #0x783 // =1923
ffff80008260a968: d280e062 mov x2, #0x703 // =1795
ffff80008260a96c: f2e00800 movk x0, #0x40, lsl #48
ffff80008260a970: f2e00d02 movk x2, #0x68, lsl #48
ffff80008260a974: aa2103e4 mvn x4, x1
ffff80008260a978: 8a210049 bic x9, x2, x1
...
ffff80008260aa28: d281f060 mov x0, #0xf83 // =3971
ffff80008260aa2c: d281e062 mov x2, #0xf03 // =3843
ffff80008260aa30: f2e00800 movk x0, #0x40, lsl #48
In case of gcc, according to value of arm64_use_ng_mappings (annoated as(3)),
it branches to each prot settup code.
However this is also problem since it branches according to garbage
value too -- idmapping with wrong pgprot.
To resolve this, annotate arm64_use_ng_mappings as ro_after_init.
Fixes: 84b04d3e6bdb ("arm64: kernel: Create initial ID map from C code")
Cc: <stable(a)vger.kernel.org> # 6.9.x
Signed-off-by: Yeoreum Yun <yeoreum.yun(a)arm.com>
---
There is another way to solve this problem by setting
test/data_prot with _PAGE_DEFAULT which doesn't include PTE_MAYBE_NG
with constanst check in create_init_idmap() to be free from
arm64_use_ng_mappings. but i think it would be better to change
arm64_use_ng_mappings as ro_after_init because it doesn't change after
init phase and solve this problem too.
---
arch/arm64/kernel/cpufeature.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
index d2104a1e7843..967ffb1cbd52 100644
--- a/arch/arm64/kernel/cpufeature.c
+++ b/arch/arm64/kernel/cpufeature.c
@@ -114,7 +114,7 @@ static struct arm64_cpu_capabilities const __ro_after_init *cpucap_ptrs[ARM64_NC
DECLARE_BITMAP(boot_cpucaps, ARM64_NCAPS);
-bool arm64_use_ng_mappings = false;
+bool arm64_use_ng_mappings __ro_after_init = false;
EXPORT_SYMBOL(arm64_use_ng_mappings);
DEFINE_PER_CPU_READ_MOSTLY(const char *, this_cpu_vector) = vectors;
--
LEVI:{C3F47F37-75D8-414A-A8BA-3980EC8A46D7}
The ftrace __mcount_loc buildtime sort does not work properly when the host is
32-bit and the target is 64-bit. sorttable parses the start and stop addresses
by calling strtoul on the buffer holding the hexadecimal string. Since the
target is 64-bit but unsigned long on 32-bit machines is 32 bits, strtoul,
and by extension the start and stop addresses, can max out to 2^32 - 1.
This patch adds a new macro, parse_addr, that corresponds to a strtoul
or strtoull call based on whether you are operating on a 32-bit ELF or
a 64-bit ELF. This way, the correct width is guaranteed whether or not
the host is 32-bit. This should cleanly apply on all of the 6.x stable
kernels.
Manually verified that the __mcount_loc section is sorted by parsing the
ELF and verified tests corresponding to CONFIG_FTRACE_SORT_STARTUP_TEST
for kernels built on a 32-bit and a 64-bit host.
Signed-off-by: Sahil Gupta <s.gupta(a)arista.com>
---
scripts/sorttable.h | 7 +++++--
1 file changed, 5 insertions(+), 2 deletions(-)
diff --git a/scripts/sorttable.h b/scripts/sorttable.h
index 7bd0184380d3..9ed7acca9f30 100644
--- a/scripts/sorttable.h
+++ b/scripts/sorttable.h
@@ -40,6 +40,7 @@
#undef uint_t
#undef _r
#undef _w
+#undef parse_addr
#ifdef SORTTABLE_64
# define extable_ent_size 16
@@ -65,6 +66,7 @@
# define uint_t uint64_t
# define _r r8
# define _w w8
+# define parse_addr(buf) strtoull(buf, NULL, 16)
#else
# define extable_ent_size 8
# define compare_extable compare_extable_32
@@ -89,6 +91,7 @@
# define uint_t uint32_t
# define _r r
# define _w w
+# define parse_addr(buf) strtoul(buf, NULL, 16)
#endif
#if defined(SORTTABLE_64) && defined(UNWINDER_ORC_ENABLED)
@@ -246,13 +249,13 @@ static void get_mcount_loc(uint_t *_start, uint_t *_stop)
len = strlen(start_buff);
start_buff[len - 1] = '\0';
}
- *_start = strtoul(start_buff, NULL, 16);
+ *_start = parse_addr(start_buff);
while (fgets(stop_buff, sizeof(stop_buff), file_stop) != NULL) {
len = strlen(stop_buff);
stop_buff[len - 1] = '\0';
}
- *_stop = strtoul(stop_buff, NULL, 16);
+ *_stop = parse_addr(stop_buff);
pclose(file_start);
pclose(file_stop);
--
2.47.0
When a CL/CSD job times out, we check if the GPU has made any progress
since the last timeout. If so, instead of resetting the hardware, we skip
the reset and let the timer get rearmed. This gives long-running jobs a
chance to complete.
However, when `timedout_job()` is called, the job in question is removed
from the pending list, which means it won't be automatically freed through
`free_job()`. Consequently, when we skip the reset and keep the job
running, the job won't be freed when it finally completes.
This situation leads to a memory leak, as exposed in [1] and [2].
Similarly to commit 704d3d60fec4 ("drm/etnaviv: don't block scheduler when
GPU is still active"), this patch ensures the job is put back on the
pending list when extending the timeout.
Cc: stable(a)vger.kernel.org # 6.0
Reported-by: Daivik Bhatia <dtgs1208(a)gmail.com>
Closes: https://gitlab.freedesktop.org/mesa/mesa/-/issues/12227 [1]
Closes: https://github.com/raspberrypi/linux/issues/6817 [2]
Signed-off-by: Maíra Canal <mcanal(a)igalia.com>
Reviewed-by: Iago Toral Quiroga <itoral(a)igalia.com>
---
Hi,
While we typically strive to avoid exposing the scheduler's internals
within the drivers, I'm proposing this fix as an interim solution. I'm aware
that a comprehensive fix will need some adjustments on the DRM sched side,
and I plan to address that soon.
However, it would be hard to justify the backport of such patches to the
stable branches and this issue is affecting users in the moment.
Therefore, I'd like to push this patch to drm-misc-fixes in order to
address this leak as soon as possible, while working in a more generic
solution.
Best Regards,
- Maíra
v1 -> v2: https://lore.kernel.org/dri-devel/20250427202907.94415-2-mcanal@igalia.com/
- Protect the pending list by using its spinlock.
- Add the URL of another downstream issue related to this patch.
drivers/gpu/drm/v3d/v3d_sched.c | 28 +++++++++++++++++++++-------
1 file changed, 21 insertions(+), 7 deletions(-)
diff --git a/drivers/gpu/drm/v3d/v3d_sched.c b/drivers/gpu/drm/v3d/v3d_sched.c
index b3be08b0ca91..c612363181df 100644
--- a/drivers/gpu/drm/v3d/v3d_sched.c
+++ b/drivers/gpu/drm/v3d/v3d_sched.c
@@ -734,11 +734,16 @@ v3d_gpu_reset_for_timeout(struct v3d_dev *v3d, struct drm_sched_job *sched_job)
return DRM_GPU_SCHED_STAT_NOMINAL;
}
-/* If the current address or return address have changed, then the GPU
- * has probably made progress and we should delay the reset. This
- * could fail if the GPU got in an infinite loop in the CL, but that
- * is pretty unlikely outside of an i-g-t testcase.
- */
+static void
+v3d_sched_skip_reset(struct drm_sched_job *sched_job)
+{
+ struct drm_gpu_scheduler *sched = sched_job->sched;
+
+ spin_lock(&sched->job_list_lock);
+ list_add(&sched_job->list, &sched->pending_list);
+ spin_unlock(&sched->job_list_lock);
+}
+
static enum drm_gpu_sched_stat
v3d_cl_job_timedout(struct drm_sched_job *sched_job, enum v3d_queue q,
u32 *timedout_ctca, u32 *timedout_ctra)
@@ -748,9 +753,16 @@ v3d_cl_job_timedout(struct drm_sched_job *sched_job, enum v3d_queue q,
u32 ctca = V3D_CORE_READ(0, V3D_CLE_CTNCA(q));
u32 ctra = V3D_CORE_READ(0, V3D_CLE_CTNRA(q));
+ /* If the current address or return address have changed, then the GPU
+ * has probably made progress and we should delay the reset. This
+ * could fail if the GPU got in an infinite loop in the CL, but that
+ * is pretty unlikely outside of an i-g-t testcase.
+ */
if (*timedout_ctca != ctca || *timedout_ctra != ctra) {
*timedout_ctca = ctca;
*timedout_ctra = ctra;
+
+ v3d_sched_skip_reset(sched_job);
return DRM_GPU_SCHED_STAT_NOMINAL;
}
@@ -790,11 +802,13 @@ v3d_csd_job_timedout(struct drm_sched_job *sched_job)
struct v3d_dev *v3d = job->base.v3d;
u32 batches = V3D_CORE_READ(0, V3D_CSD_CURRENT_CFG4(v3d->ver));
- /* If we've made progress, skip reset and let the timer get
- * rearmed.
+ /* If we've made progress, skip reset, add the job to the pending
+ * list, and let the timer get rearmed.
*/
if (job->timedout_batches != batches) {
job->timedout_batches = batches;
+
+ v3d_sched_skip_reset(sched_job);
return DRM_GPU_SCHED_STAT_NOMINAL;
}
--
2.49.0
From: Ashish Kalra <ashish.kalra(a)amd.com>
When the shared pages are being made private during kdump preparation
there are additional checks to handle shared GHCB pages.
These additional checks include handling the case of GHCB page being
contained within a huge page.
While handling the case of GHCB page contained within a huge page
any shared page just below the GHCB page gets skipped from being
transitioned back to private during kdump preparation.
This subsequently causes a 0x404 #VC exception when this skipped
shared page is accessed later while dumping guest memory during
vmcore generation via kdump.
Split the initial check for skipping the GHCB page into the page
being skipped fully containing the GHCB and GHCB being contained
within a huge page. Also ensure that the skipped huge page
containing the GHCB page is transitioned back to private later
when changing GHCBs to private at end of kdump preparation.
Reviewed-by: Tom Lendacky <thomas.lendacky(a)amd.com>
Cc: stable(a)vger.kernel.org
Fixes: 3074152e56c9 ("x86/sev: Convert shared memory back to private on kexec")
Signed-off-by: Ashish Kalra <ashish.kalra(a)amd.com>
---
arch/x86/coco/sev/core.c | 14 ++++++++++++--
1 file changed, 12 insertions(+), 2 deletions(-)
diff --git a/arch/x86/coco/sev/core.c b/arch/x86/coco/sev/core.c
index d35fec7b164a..1f53383bd1fa 100644
--- a/arch/x86/coco/sev/core.c
+++ b/arch/x86/coco/sev/core.c
@@ -1019,7 +1019,13 @@ static void unshare_all_memory(void)
data = per_cpu(runtime_data, cpu);
ghcb = (unsigned long)&data->ghcb_page;
- if (addr <= ghcb && ghcb <= addr + size) {
+ /* Handle the case of a huge page containing the GHCB page */
+ if (level == PG_LEVEL_4K && addr == ghcb) {
+ skipped_addr = true;
+ break;
+ }
+ if (level > PG_LEVEL_4K && addr <= ghcb &&
+ ghcb < addr + size) {
skipped_addr = true;
break;
}
@@ -1131,8 +1137,8 @@ static void shutdown_all_aps(void)
void snp_kexec_finish(void)
{
struct sev_es_runtime_data *data;
+ unsigned long size, mask;
unsigned int level, cpu;
- unsigned long size;
struct ghcb *ghcb;
pte_t *pte;
@@ -1160,6 +1166,10 @@ void snp_kexec_finish(void)
ghcb = &data->ghcb_page;
pte = lookup_address((unsigned long)ghcb, &level);
size = page_level_size(level);
+ mask = page_level_mask(level);
+ /* Handle the case of a huge page containing the GHCB page */
+ if (level > PG_LEVEL_4K)
+ ghcb = (struct ghcb *)((unsigned long)ghcb & mask);
set_pte_enc(pte, level, (void *)ghcb);
snp_set_memory_private((unsigned long)ghcb, (size / PAGE_SIZE));
}
--
2.34.1
The following commit has been merged into the irq/urgent branch of tip:
Commit-ID: 38a05c0b87833f5b188ae43b428b1f792df2b384
Gitweb: https://git.kernel.org/tip/38a05c0b87833f5b188ae43b428b1f792df2b384
Author: Stephan Gerhold <stephan.gerhold(a)linaro.org>
AuthorDate: Fri, 02 May 2025 13:22:28 +02:00
Committer: Thomas Gleixner <tglx(a)linutronix.de>
CommitterDate: Fri, 02 May 2025 21:07:02 +02:00
irqchip/qcom-mpm: Prevent crash when trying to handle non-wake GPIOs
On Qualcomm chipsets not all GPIOs are wakeup capable. Those GPIOs do not
have a corresponding MPM pin and should not be handled inside the MPM
driver. The IRQ domain hierarchy is always applied, so it's required to
explicitly disconnect the hierarchy for those. The pinctrl-msm driver marks
these with GPIO_NO_WAKE_IRQ. qcom-pdc has a check for this, but
irq-qcom-mpm is currently missing the check. This is causing crashes when
setting up interrupts for non-wake GPIOs:
root@rb1:~# gpiomon -c gpiochip1 10
irq: IRQ159: trimming hierarchy from :soc@0:interrupt-controller@f200000-1
Unable to handle kernel paging request at virtual address ffff8000a1dc3820
Hardware name: Qualcomm Technologies, Inc. Robotics RB1 (DT)
pc : mpm_set_type+0x80/0xcc
lr : mpm_set_type+0x5c/0xcc
Call trace:
mpm_set_type+0x80/0xcc (P)
qcom_mpm_set_type+0x64/0x158
irq_chip_set_type_parent+0x20/0x38
msm_gpio_irq_set_type+0x50/0x530
__irq_set_trigger+0x60/0x184
__setup_irq+0x304/0x6bc
request_threaded_irq+0xc8/0x19c
edge_detector_setup+0x260/0x364
linereq_create+0x420/0x5a8
gpio_ioctl+0x2d4/0x6c0
Fix this by copying the check for GPIO_NO_WAKE_IRQ from qcom-pdc.c, so that
MPM is removed entirely from the hierarchy for non-wake GPIOs.
Fixes: a6199bb514d8 ("irqchip: Add Qualcomm MPM controller driver")
Reported-by: Alexey Klimov <alexey.klimov(a)linaro.org>
Signed-off-by: Stephan Gerhold <stephan.gerhold(a)linaro.org>
Signed-off-by: Thomas Gleixner <tglx(a)linutronix.de>
Tested-by: Alexey Klimov <alexey.klimov(a)linaro.org>
Reviewed-by: Bartosz Golaszewski <bartosz.golaszewski(a)linaro.org>
Cc: stable(a)vger.kernel.org
Link: https://lore.kernel.org/all/20250502-irq-qcom-mpm-fix-no-wake-v1-1-8a1eafcd…
---
drivers/irqchip/irq-qcom-mpm.c | 3 +++
1 file changed, 3 insertions(+)
diff --git a/drivers/irqchip/irq-qcom-mpm.c b/drivers/irqchip/irq-qcom-mpm.c
index 7942d8e..f772deb 100644
--- a/drivers/irqchip/irq-qcom-mpm.c
+++ b/drivers/irqchip/irq-qcom-mpm.c
@@ -227,6 +227,9 @@ static int qcom_mpm_alloc(struct irq_domain *domain, unsigned int virq,
if (ret)
return ret;
+ if (pin == GPIO_NO_WAKE_IRQ)
+ return irq_domain_disconnect_hierarchy(domain, virq);
+
ret = irq_domain_set_hwirq_and_chip(domain, virq, pin,
&qcom_mpm_chip, priv);
if (ret)
Hi there,
Are you looking for a high-quality, global opt-in user list for Branch and
other top mobile analytics platforms?
We have fresh, permission-based data available for the following platforms:
* Adjust
* Mixpanel
* AppsFlyer
* CleverTap
* Amplitude Analytics
* Braze
* Singular
* Tenjin & many more...!
Would you be interested in learning more or receiving a sample?
Best regards,
Brielle Hayes | Marketing Executive
Please response with "skip" if not required.
From: Steven Rostedt <rostedt(a)goodmis.org>
On some paths in print_event_fields() it takes the trace_event_sem for
read, even though it should always be held when the function is called.
Remove the taking of that mutex and add a lockdep_assert_held_read() to
make sure the trace_event_sem is held when print_event_fields() is called.
Cc: stable(a)vger.kernel.org
Cc: Masami Hiramatsu <mhiramat(a)kernel.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers(a)efficios.com>
Link: https://lore.kernel.org/20250501224128.0b1f0571@batman.local.home
Fixes: 80a76994b2d88 ("tracing: Add "fields" option to show raw trace event fields")
Reported-by: syzbot+441582c1592938fccf09(a)syzkaller.appspotmail.com
Closes: https://lore.kernel.org/all/6813ff5e.050a0220.14dd7d.001b.GAE@google.com/
Signed-off-by: Steven Rostedt (Google) <rostedt(a)goodmis.org>
---
kernel/trace/trace_output.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index fee40ffbd490..b9ab06c99543 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -1042,11 +1042,12 @@ enum print_line_t print_event_fields(struct trace_iterator *iter,
struct trace_event_call *call;
struct list_head *head;
+ lockdep_assert_held_read(&trace_event_sem);
+
/* ftrace defined events have separate call structures */
if (event->type <= __TRACE_LAST_TYPE) {
bool found = false;
- down_read(&trace_event_sem);
list_for_each_entry(call, &ftrace_events, list) {
if (call->event.type == event->type) {
found = true;
@@ -1056,7 +1057,6 @@ enum print_line_t print_event_fields(struct trace_iterator *iter,
if (call->event.type > __TRACE_LAST_TYPE)
break;
}
- up_read(&trace_event_sem);
if (!found) {
trace_seq_printf(&iter->seq, "UNKNOWN TYPE %d\n", event->type);
goto out;
--
2.47.2
Starting with Rust 1.88.0 (expected 2025-06-26) [1], Clippy may start
warning about paths that do not resolve in the `disallowed_macros`
configuration:
warning: `kernel::dbg` does not refer to an existing macro
--> .clippy.toml:10:5
|
10 | { path = "kernel::dbg", reason = "the `dbg!` macro is intended as a debugging tool" },
| ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
This is a lint we requested at [2], due to the trouble debugging
the lint due to false negatives (e.g. [3]), which we use to emulate
`clippy::dbg_macro` [4]. See commit 8577c9dca799 ("rust: replace
`clippy::dbg_macro` with `disallowed_macros`") for more details.
Given the false negatives are not resolved yet, it is expected that
Clippy complains about not finding this macro.
Thus, until the false negatives are fixed (and, even then, probably we
will need to wait for the MSRV to raise enough), use the escape hatch
to allow an invalid path.
Cc: stable(a)vger.kernel.org # Needed in 6.12.y and later (Rust is pinned in older LTSs).
Link: https://github.com/rust-lang/rust-clippy/pull/14397 [1]
Link: https://github.com/rust-lang/rust-clippy/issues/11432 [2]
Link: https://github.com/rust-lang/rust-clippy/issues/11431 [3]
Link: https://github.com/rust-lang/rust-clippy/issues/11303 [4]
Signed-off-by: Miguel Ojeda <ojeda(a)kernel.org>
---
.clippy.toml | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/.clippy.toml b/.clippy.toml
index 815c94732ed7..137f41d203de 100644
--- a/.clippy.toml
+++ b/.clippy.toml
@@ -7,5 +7,5 @@ check-private-items = true
disallowed-macros = [
# The `clippy::dbg_macro` lint only works with `std::dbg!`, thus we simulate
# it here, see: https://github.com/rust-lang/rust-clippy/issues/11303.
- { path = "kernel::dbg", reason = "the `dbg!` macro is intended as a debugging tool" },
+ { path = "kernel::dbg", reason = "the `dbg!` macro is intended as a debugging tool", allow-invalid = true },
]
--
2.49.0
Starting with Rust 1.88.0 (expected 2025-06-26) [1][2], `rustc` may
introduce a new lint that catches unnecessary transmutes, e.g.:
error: unnecessary transmute
--> rust/uapi/uapi_generated.rs:23242:18
|
23242 | unsafe { ::core::mem::transmute(self._bitfield_1.get(0usize, 1u8) as u8) }
| ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ help: replace this with: `(self._bitfield_1.get(0usize, 1u8) as u8 == 1)`
|
= note: `-D unnecessary-transmutes` implied by `-D warnings`
= help: to override `-D warnings` add `#[allow(unnecessary_transmutes)]`
There are a lot of them (at least 300), but luckily they are all in
`bindgen`-generated code.
Thus clean all up by allowing it there.
Since unknown lints trigger a lint itself in older compilers, do it
conditionally so that we can keep the `unknown_lints` lint enabled.
Cc: stable(a)vger.kernel.org # Needed in 6.12.y and later (Rust is pinned in older LTSs).
Link: https://github.com/rust-lang/rust/pull/136083 [1]
Link: https://github.com/rust-lang/rust/issues/136067 [2]
Signed-off-by: Miguel Ojeda <ojeda(a)kernel.org>
---
init/Kconfig | 3 +++
rust/bindings/lib.rs | 1 +
rust/uapi/lib.rs | 1 +
3 files changed, 5 insertions(+)
diff --git a/init/Kconfig b/init/Kconfig
index 63f5974b9fa6..4cdd1049283c 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -140,6 +140,9 @@ config LD_CAN_USE_KEEP_IN_OVERLAY
config RUSTC_HAS_COERCE_POINTEE
def_bool RUSTC_VERSION >= 108400
+config RUSTC_HAS_UNNECESSARY_TRANSMUTES
+ def_bool RUSTC_VERSION >= 108800
+
config PAHOLE_VERSION
int
default $(shell,$(srctree)/scripts/pahole-version.sh $(PAHOLE))
diff --git a/rust/bindings/lib.rs b/rust/bindings/lib.rs
index 014af0d1fc70..a08eb5518cac 100644
--- a/rust/bindings/lib.rs
+++ b/rust/bindings/lib.rs
@@ -26,6 +26,7 @@
#[allow(dead_code)]
#[allow(clippy::undocumented_unsafe_blocks)]
+#[cfg_attr(CONFIG_RUSTC_HAS_UNNECESSARY_TRANSMUTES, allow(unnecessary_transmutes))]
mod bindings_raw {
// Manual definition for blocklisted types.
type __kernel_size_t = usize;
diff --git a/rust/uapi/lib.rs b/rust/uapi/lib.rs
index 13495910271f..c98d7a8cde77 100644
--- a/rust/uapi/lib.rs
+++ b/rust/uapi/lib.rs
@@ -24,6 +24,7 @@
unreachable_pub,
unsafe_op_in_unsafe_fn
)]
+#![cfg_attr(CONFIG_RUSTC_HAS_UNNECESSARY_TRANSMUTES, allow(unnecessary_transmutes))]
// Manual definition of blocklisted types.
type __kernel_size_t = usize;
--
2.49.0
After commit 0a65bc27bd64 ("eventpoll: Set epoll timeout if it's in
the future"), the following program would immediately enter a busy
loop in the kernel:
```
int main() {
int e = epoll_create1(0);
struct epoll_event event = {.events = EPOLLIN};
epoll_ctl(e, EPOLL_CTL_ADD, 0, &event);
const struct timespec timeout = {.tv_nsec = 1};
epoll_pwait2(e, &event, 1, &timeout, 0);
}
```
This happens because the given (non-zero) timeout of 1 nanosecond
usually expires before ep_poll() is entered and then
ep_schedule_timeout() returns false, but `timed_out` is never set
because the code line that sets it is skipped. This quickly turns
into a soft lockup, RCU stalls and deadlocks, inflicting severe
headaches to the whole system.
When the timeout has expired, we don't need to schedule a hrtimer, but
we should set the `timed_out` variable. Therefore, I suggest moving
the ep_schedule_timeout() check into the `timed_out` expression
instead of skipping it.
Fixes: 0a65bc27bd64 ("eventpoll: Set epoll timeout if it's in the future")
Cc: Joe Damato <jdamato(a)fastly.com>
Cc: stable(a)vger.kernel.org
Signed-off-by: Max Kellermann <max.kellermann(a)ionos.com>
---
fs/eventpoll.c | 7 ++++---
1 file changed, 4 insertions(+), 3 deletions(-)
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 4bc264b854c4..d4dbffdedd08 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -2111,9 +2111,10 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
write_unlock_irq(&ep->lock);
- if (!eavail && ep_schedule_timeout(to))
- timed_out = !schedule_hrtimeout_range(to, slack,
- HRTIMER_MODE_ABS);
+ if (!eavail)
+ timed_out = !ep_schedule_timeout(to) ||
+ !schedule_hrtimeout_range(to, slack,
+ HRTIMER_MODE_ABS);
__set_current_state(TASK_RUNNING);
/*
--
2.47.2
Recent tests with timeouts > INT_MAX produced random error returns
with usbtmc_get_stb. This was caused by assigning the return value
of wait_event_interruptible_timeout to an int which overflowed to
negative values. Also return value on success was the remaining
number of jiffies instead of 0.
These patches fix all the cases where the return of
wait_event_interruptible_timeout was assigned to an int and
the case of the remaining jiffies return in usbtmc_get_stb.
Patch 1: Fixes usbtmc_get_stb
Patch 2: Fixes usbtmc488_ioctl_wait_srq
Patch 3: Fixes usbtmc_generic_read
Dave Penkler (3):
usb: usbtmc: Fix erroneous get_stb ioctl error returns
usb: usbtmc: Fix erroneous wait_srq ioctl return
usb: usbtmc: Fix erroneous generic_read ioctl return
drivers/usb/class/usbtmc.c | 53 ++++++++++++++++++++++----------------
1 file changed, 31 insertions(+), 22 deletions(-)
--
Changes V1 => V2 Add cc to stable line
V2 => V3 Add susbsystem to cover letter
2.49.0
980a573621ea ("tpm: Make chip->{status,cancel,req_canceled} opt")
This is a dependent commit for the series of patches to add the AMD
SEV-SNP SVSM vTPM device driver. Kernel 6.11 added SVSM support, but
not support for the critical component for boot integrity that follows
the SEV-SNP threat model. That series
https://lore.kernel.org/all/20250410135118.133240-1-sgarzare@redhat.com/
is applied at tip but is not yet in the mainline.
I have confirmed that this patch applies cleanly. Stefano's patch
series needs a minor tweak to the first patch due to the changed
surrounding function declarations in arch/x86/include/asm/sev.h
https://github.com/deeglaze/amdese-linux/commits/vtpm612/
I've independently tested the patches.
--
-Dionna Glaze, PhD, CISSP, CCSP (she/her)
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.15.y
git checkout FETCH_HEAD
git cherry-pick -x c2fee09fc167c74a64adb08656cb993ea475197e
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025021815-protract-greasily-cdea@gregkh' --subject-prefix 'PATCH 5.15.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From c2fee09fc167c74a64adb08656cb993ea475197e Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc(a)google.com>
Date: Fri, 24 Jan 2025 17:18:33 -0800
Subject: [PATCH] KVM: x86: Load DR6 with guest value only before entering
.vcpu_run() loop
Move the conditional loading of hardware DR6 with the guest's DR6 value
out of the core .vcpu_run() loop to fix a bug where KVM can load hardware
with a stale vcpu->arch.dr6.
When the guest accesses a DR and host userspace isn't debugging the guest,
KVM disables DR interception and loads the guest's values into hardware on
VM-Enter and saves them on VM-Exit. This allows the guest to access DRs
at will, e.g. so that a sequence of DR accesses to configure a breakpoint
only generates one VM-Exit.
For DR0-DR3, the logic/behavior is identical between VMX and SVM, and also
identical between KVM_DEBUGREG_BP_ENABLED (userspace debugging the guest)
and KVM_DEBUGREG_WONT_EXIT (guest using DRs), and so KVM handles loading
DR0-DR3 in common code, _outside_ of the core kvm_x86_ops.vcpu_run() loop.
But for DR6, the guest's value doesn't need to be loaded into hardware for
KVM_DEBUGREG_BP_ENABLED, and SVM provides a dedicated VMCB field whereas
VMX requires software to manually load the guest value, and so loading the
guest's value into DR6 is handled by {svm,vmx}_vcpu_run(), i.e. is done
_inside_ the core run loop.
Unfortunately, saving the guest values on VM-Exit is initiated by common
x86, again outside of the core run loop. If the guest modifies DR6 (in
hardware, when DR interception is disabled), and then the next VM-Exit is
a fastpath VM-Exit, KVM will reload hardware DR6 with vcpu->arch.dr6 and
clobber the guest's actual value.
The bug shows up primarily with nested VMX because KVM handles the VMX
preemption timer in the fastpath, and the window between hardware DR6
being modified (in guest context) and DR6 being read by guest software is
orders of magnitude larger in a nested setup. E.g. in non-nested, the
VMX preemption timer would need to fire precisely between #DB injection
and the #DB handler's read of DR6, whereas with a KVM-on-KVM setup, the
window where hardware DR6 is "dirty" extends all the way from L1 writing
DR6 to VMRESUME (in L1).
L1's view:
==========
<L1 disables DR interception>
CPU 0/KVM-7289 [023] d.... 2925.640961: kvm_entry: vcpu 0
A: L1 Writes DR6
CPU 0/KVM-7289 [023] d.... 2925.640963: <hack>: Set DRs, DR6 = 0xffff0ff1
B: CPU 0/KVM-7289 [023] d.... 2925.640967: kvm_exit: vcpu 0 reason EXTERNAL_INTERRUPT intr_info 0x800000ec
D: L1 reads DR6, arch.dr6 = 0
CPU 0/KVM-7289 [023] d.... 2925.640969: <hack>: Sync DRs, DR6 = 0xffff0ff0
CPU 0/KVM-7289 [023] d.... 2925.640976: kvm_entry: vcpu 0
L2 reads DR6, L1 disables DR interception
CPU 0/KVM-7289 [023] d.... 2925.640980: kvm_exit: vcpu 0 reason DR_ACCESS info1 0x0000000000000216
CPU 0/KVM-7289 [023] d.... 2925.640983: kvm_entry: vcpu 0
CPU 0/KVM-7289 [023] d.... 2925.640983: <hack>: Set DRs, DR6 = 0xffff0ff0
L2 detects failure
CPU 0/KVM-7289 [023] d.... 2925.640987: kvm_exit: vcpu 0 reason HLT
L1 reads DR6 (confirms failure)
CPU 0/KVM-7289 [023] d.... 2925.640990: <hack>: Sync DRs, DR6 = 0xffff0ff0
L0's view:
==========
L2 reads DR6, arch.dr6 = 0
CPU 23/KVM-5046 [001] d.... 3410.005610: kvm_exit: vcpu 23 reason DR_ACCESS info1 0x0000000000000216
CPU 23/KVM-5046 [001] ..... 3410.005610: kvm_nested_vmexit: vcpu 23 reason DR_ACCESS info1 0x0000000000000216
L2 => L1 nested VM-Exit
CPU 23/KVM-5046 [001] ..... 3410.005610: kvm_nested_vmexit_inject: reason: DR_ACCESS ext_inf1: 0x0000000000000216
CPU 23/KVM-5046 [001] d.... 3410.005610: kvm_entry: vcpu 23
CPU 23/KVM-5046 [001] d.... 3410.005611: kvm_exit: vcpu 23 reason VMREAD
CPU 23/KVM-5046 [001] d.... 3410.005611: kvm_entry: vcpu 23
CPU 23/KVM-5046 [001] d.... 3410.005612: kvm_exit: vcpu 23 reason VMREAD
CPU 23/KVM-5046 [001] d.... 3410.005612: kvm_entry: vcpu 23
L1 writes DR7, L0 disables DR interception
CPU 23/KVM-5046 [001] d.... 3410.005612: kvm_exit: vcpu 23 reason DR_ACCESS info1 0x0000000000000007
CPU 23/KVM-5046 [001] d.... 3410.005613: kvm_entry: vcpu 23
L0 writes DR6 = 0 (arch.dr6)
CPU 23/KVM-5046 [001] d.... 3410.005613: <hack>: Set DRs, DR6 = 0xffff0ff0
A: <L1 writes DR6 = 1, no interception, arch.dr6 is still '0'>
B: CPU 23/KVM-5046 [001] d.... 3410.005614: kvm_exit: vcpu 23 reason PREEMPTION_TIMER
CPU 23/KVM-5046 [001] d.... 3410.005614: kvm_entry: vcpu 23
C: L0 writes DR6 = 0 (arch.dr6)
CPU 23/KVM-5046 [001] d.... 3410.005614: <hack>: Set DRs, DR6 = 0xffff0ff0
L1 => L2 nested VM-Enter
CPU 23/KVM-5046 [001] d.... 3410.005616: kvm_exit: vcpu 23 reason VMRESUME
L0 reads DR6, arch.dr6 = 0
Reported-by: John Stultz <jstultz(a)google.com>
Closes: https://lkml.kernel.org/r/CANDhNCq5_F3HfFYABqFGCA1bPd_%2BxgNj-iDQhH4tDk%2Bw…
Fixes: 375e28ffc0cf ("KVM: X86: Set host DR6 only on VMX and for KVM_DEBUGREG_WONT_EXIT")
Fixes: d67668e9dd76 ("KVM: x86, SVM: isolate vcpu->arch.dr6 from vmcb->save.dr6")
Cc: stable(a)vger.kernel.org
Cc: Jim Mattson <jmattson(a)google.com>
Tested-by: John Stultz <jstultz(a)google.com>
Link: https://lore.kernel.org/r/20250125011833.3644371-1-seanjc@google.com
Signed-off-by: Sean Christopherson <seanjc(a)google.com>
diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h
index c35550581da0..823c0434bbad 100644
--- a/arch/x86/include/asm/kvm-x86-ops.h
+++ b/arch/x86/include/asm/kvm-x86-ops.h
@@ -48,6 +48,7 @@ KVM_X86_OP(set_idt)
KVM_X86_OP(get_gdt)
KVM_X86_OP(set_gdt)
KVM_X86_OP(sync_dirty_debug_regs)
+KVM_X86_OP(set_dr6)
KVM_X86_OP(set_dr7)
KVM_X86_OP(cache_reg)
KVM_X86_OP(get_rflags)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index b15cde0a9b5c..0b7af5902ff7 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1696,6 +1696,7 @@ struct kvm_x86_ops {
void (*get_gdt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
void (*set_gdt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
void (*sync_dirty_debug_regs)(struct kvm_vcpu *vcpu);
+ void (*set_dr6)(struct kvm_vcpu *vcpu, unsigned long value);
void (*set_dr7)(struct kvm_vcpu *vcpu, unsigned long value);
void (*cache_reg)(struct kvm_vcpu *vcpu, enum kvm_reg reg);
unsigned long (*get_rflags)(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 7640a84e554a..a713c803a3a3 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -1991,11 +1991,11 @@ static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd)
svm->asid = sd->next_asid++;
}
-static void svm_set_dr6(struct vcpu_svm *svm, unsigned long value)
+static void svm_set_dr6(struct kvm_vcpu *vcpu, unsigned long value)
{
- struct vmcb *vmcb = svm->vmcb;
+ struct vmcb *vmcb = to_svm(vcpu)->vmcb;
- if (svm->vcpu.arch.guest_state_protected)
+ if (vcpu->arch.guest_state_protected)
return;
if (unlikely(value != vmcb->save.dr6)) {
@@ -4247,10 +4247,8 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu,
* Run with all-zero DR6 unless needed, so that we can get the exact cause
* of a #DB.
*/
- if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT))
- svm_set_dr6(svm, vcpu->arch.dr6);
- else
- svm_set_dr6(svm, DR6_ACTIVE_LOW);
+ if (likely(!(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)))
+ svm_set_dr6(vcpu, DR6_ACTIVE_LOW);
clgi();
kvm_load_guest_xsave_state(vcpu);
@@ -5043,6 +5041,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
.set_idt = svm_set_idt,
.get_gdt = svm_get_gdt,
.set_gdt = svm_set_gdt,
+ .set_dr6 = svm_set_dr6,
.set_dr7 = svm_set_dr7,
.sync_dirty_debug_regs = svm_sync_dirty_debug_regs,
.cache_reg = svm_cache_reg,
diff --git a/arch/x86/kvm/vmx/main.c b/arch/x86/kvm/vmx/main.c
index 2427f918e763..43ee9ed11291 100644
--- a/arch/x86/kvm/vmx/main.c
+++ b/arch/x86/kvm/vmx/main.c
@@ -61,6 +61,7 @@ struct kvm_x86_ops vt_x86_ops __initdata = {
.set_idt = vmx_set_idt,
.get_gdt = vmx_get_gdt,
.set_gdt = vmx_set_gdt,
+ .set_dr6 = vmx_set_dr6,
.set_dr7 = vmx_set_dr7,
.sync_dirty_debug_regs = vmx_sync_dirty_debug_regs,
.cache_reg = vmx_cache_reg,
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index f72835e85b6d..6c56d5235f0f 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -5648,6 +5648,12 @@ void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
set_debugreg(DR6_RESERVED, 6);
}
+void vmx_set_dr6(struct kvm_vcpu *vcpu, unsigned long val)
+{
+ lockdep_assert_irqs_disabled();
+ set_debugreg(vcpu->arch.dr6, 6);
+}
+
void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val)
{
vmcs_writel(GUEST_DR7, val);
@@ -7417,10 +7423,6 @@ fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit)
vmx->loaded_vmcs->host_state.cr4 = cr4;
}
- /* When KVM_DEBUGREG_WONT_EXIT, dr6 is accessible in guest. */
- if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT))
- set_debugreg(vcpu->arch.dr6, 6);
-
/* When single-stepping over STI and MOV SS, we must clear the
* corresponding interruptibility bits in the guest state. Otherwise
* vmentry fails as it then expects bit 14 (BS) in pending debug
diff --git a/arch/x86/kvm/vmx/x86_ops.h b/arch/x86/kvm/vmx/x86_ops.h
index ce3295a67c04..430773a5ef8e 100644
--- a/arch/x86/kvm/vmx/x86_ops.h
+++ b/arch/x86/kvm/vmx/x86_ops.h
@@ -73,6 +73,7 @@ void vmx_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
void vmx_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
void vmx_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
void vmx_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
+void vmx_set_dr6(struct kvm_vcpu *vcpu, unsigned long val);
void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val);
void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu);
void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 8e77e61d4fbd..02159c967d29 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -10961,6 +10961,9 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
set_debugreg(vcpu->arch.eff_db[1], 1);
set_debugreg(vcpu->arch.eff_db[2], 2);
set_debugreg(vcpu->arch.eff_db[3], 3);
+ /* When KVM_DEBUGREG_WONT_EXIT, dr6 is accessible in guest. */
+ if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT))
+ kvm_x86_call(set_dr6)(vcpu, vcpu->arch.dr6);
} else if (unlikely(hw_breakpoint_active())) {
set_debugreg(0, 7);
}
The patch below does not apply to the 6.6-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.6.y
git checkout FETCH_HEAD
git cherry-pick -x c2fee09fc167c74a64adb08656cb993ea475197e
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025021809-census-mammal-224b@gregkh' --subject-prefix 'PATCH 6.6.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From c2fee09fc167c74a64adb08656cb993ea475197e Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc(a)google.com>
Date: Fri, 24 Jan 2025 17:18:33 -0800
Subject: [PATCH] KVM: x86: Load DR6 with guest value only before entering
.vcpu_run() loop
Move the conditional loading of hardware DR6 with the guest's DR6 value
out of the core .vcpu_run() loop to fix a bug where KVM can load hardware
with a stale vcpu->arch.dr6.
When the guest accesses a DR and host userspace isn't debugging the guest,
KVM disables DR interception and loads the guest's values into hardware on
VM-Enter and saves them on VM-Exit. This allows the guest to access DRs
at will, e.g. so that a sequence of DR accesses to configure a breakpoint
only generates one VM-Exit.
For DR0-DR3, the logic/behavior is identical between VMX and SVM, and also
identical between KVM_DEBUGREG_BP_ENABLED (userspace debugging the guest)
and KVM_DEBUGREG_WONT_EXIT (guest using DRs), and so KVM handles loading
DR0-DR3 in common code, _outside_ of the core kvm_x86_ops.vcpu_run() loop.
But for DR6, the guest's value doesn't need to be loaded into hardware for
KVM_DEBUGREG_BP_ENABLED, and SVM provides a dedicated VMCB field whereas
VMX requires software to manually load the guest value, and so loading the
guest's value into DR6 is handled by {svm,vmx}_vcpu_run(), i.e. is done
_inside_ the core run loop.
Unfortunately, saving the guest values on VM-Exit is initiated by common
x86, again outside of the core run loop. If the guest modifies DR6 (in
hardware, when DR interception is disabled), and then the next VM-Exit is
a fastpath VM-Exit, KVM will reload hardware DR6 with vcpu->arch.dr6 and
clobber the guest's actual value.
The bug shows up primarily with nested VMX because KVM handles the VMX
preemption timer in the fastpath, and the window between hardware DR6
being modified (in guest context) and DR6 being read by guest software is
orders of magnitude larger in a nested setup. E.g. in non-nested, the
VMX preemption timer would need to fire precisely between #DB injection
and the #DB handler's read of DR6, whereas with a KVM-on-KVM setup, the
window where hardware DR6 is "dirty" extends all the way from L1 writing
DR6 to VMRESUME (in L1).
L1's view:
==========
<L1 disables DR interception>
CPU 0/KVM-7289 [023] d.... 2925.640961: kvm_entry: vcpu 0
A: L1 Writes DR6
CPU 0/KVM-7289 [023] d.... 2925.640963: <hack>: Set DRs, DR6 = 0xffff0ff1
B: CPU 0/KVM-7289 [023] d.... 2925.640967: kvm_exit: vcpu 0 reason EXTERNAL_INTERRUPT intr_info 0x800000ec
D: L1 reads DR6, arch.dr6 = 0
CPU 0/KVM-7289 [023] d.... 2925.640969: <hack>: Sync DRs, DR6 = 0xffff0ff0
CPU 0/KVM-7289 [023] d.... 2925.640976: kvm_entry: vcpu 0
L2 reads DR6, L1 disables DR interception
CPU 0/KVM-7289 [023] d.... 2925.640980: kvm_exit: vcpu 0 reason DR_ACCESS info1 0x0000000000000216
CPU 0/KVM-7289 [023] d.... 2925.640983: kvm_entry: vcpu 0
CPU 0/KVM-7289 [023] d.... 2925.640983: <hack>: Set DRs, DR6 = 0xffff0ff0
L2 detects failure
CPU 0/KVM-7289 [023] d.... 2925.640987: kvm_exit: vcpu 0 reason HLT
L1 reads DR6 (confirms failure)
CPU 0/KVM-7289 [023] d.... 2925.640990: <hack>: Sync DRs, DR6 = 0xffff0ff0
L0's view:
==========
L2 reads DR6, arch.dr6 = 0
CPU 23/KVM-5046 [001] d.... 3410.005610: kvm_exit: vcpu 23 reason DR_ACCESS info1 0x0000000000000216
CPU 23/KVM-5046 [001] ..... 3410.005610: kvm_nested_vmexit: vcpu 23 reason DR_ACCESS info1 0x0000000000000216
L2 => L1 nested VM-Exit
CPU 23/KVM-5046 [001] ..... 3410.005610: kvm_nested_vmexit_inject: reason: DR_ACCESS ext_inf1: 0x0000000000000216
CPU 23/KVM-5046 [001] d.... 3410.005610: kvm_entry: vcpu 23
CPU 23/KVM-5046 [001] d.... 3410.005611: kvm_exit: vcpu 23 reason VMREAD
CPU 23/KVM-5046 [001] d.... 3410.005611: kvm_entry: vcpu 23
CPU 23/KVM-5046 [001] d.... 3410.005612: kvm_exit: vcpu 23 reason VMREAD
CPU 23/KVM-5046 [001] d.... 3410.005612: kvm_entry: vcpu 23
L1 writes DR7, L0 disables DR interception
CPU 23/KVM-5046 [001] d.... 3410.005612: kvm_exit: vcpu 23 reason DR_ACCESS info1 0x0000000000000007
CPU 23/KVM-5046 [001] d.... 3410.005613: kvm_entry: vcpu 23
L0 writes DR6 = 0 (arch.dr6)
CPU 23/KVM-5046 [001] d.... 3410.005613: <hack>: Set DRs, DR6 = 0xffff0ff0
A: <L1 writes DR6 = 1, no interception, arch.dr6 is still '0'>
B: CPU 23/KVM-5046 [001] d.... 3410.005614: kvm_exit: vcpu 23 reason PREEMPTION_TIMER
CPU 23/KVM-5046 [001] d.... 3410.005614: kvm_entry: vcpu 23
C: L0 writes DR6 = 0 (arch.dr6)
CPU 23/KVM-5046 [001] d.... 3410.005614: <hack>: Set DRs, DR6 = 0xffff0ff0
L1 => L2 nested VM-Enter
CPU 23/KVM-5046 [001] d.... 3410.005616: kvm_exit: vcpu 23 reason VMRESUME
L0 reads DR6, arch.dr6 = 0
Reported-by: John Stultz <jstultz(a)google.com>
Closes: https://lkml.kernel.org/r/CANDhNCq5_F3HfFYABqFGCA1bPd_%2BxgNj-iDQhH4tDk%2Bw…
Fixes: 375e28ffc0cf ("KVM: X86: Set host DR6 only on VMX and for KVM_DEBUGREG_WONT_EXIT")
Fixes: d67668e9dd76 ("KVM: x86, SVM: isolate vcpu->arch.dr6 from vmcb->save.dr6")
Cc: stable(a)vger.kernel.org
Cc: Jim Mattson <jmattson(a)google.com>
Tested-by: John Stultz <jstultz(a)google.com>
Link: https://lore.kernel.org/r/20250125011833.3644371-1-seanjc@google.com
Signed-off-by: Sean Christopherson <seanjc(a)google.com>
diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h
index c35550581da0..823c0434bbad 100644
--- a/arch/x86/include/asm/kvm-x86-ops.h
+++ b/arch/x86/include/asm/kvm-x86-ops.h
@@ -48,6 +48,7 @@ KVM_X86_OP(set_idt)
KVM_X86_OP(get_gdt)
KVM_X86_OP(set_gdt)
KVM_X86_OP(sync_dirty_debug_regs)
+KVM_X86_OP(set_dr6)
KVM_X86_OP(set_dr7)
KVM_X86_OP(cache_reg)
KVM_X86_OP(get_rflags)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index b15cde0a9b5c..0b7af5902ff7 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1696,6 +1696,7 @@ struct kvm_x86_ops {
void (*get_gdt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
void (*set_gdt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
void (*sync_dirty_debug_regs)(struct kvm_vcpu *vcpu);
+ void (*set_dr6)(struct kvm_vcpu *vcpu, unsigned long value);
void (*set_dr7)(struct kvm_vcpu *vcpu, unsigned long value);
void (*cache_reg)(struct kvm_vcpu *vcpu, enum kvm_reg reg);
unsigned long (*get_rflags)(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 7640a84e554a..a713c803a3a3 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -1991,11 +1991,11 @@ static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd)
svm->asid = sd->next_asid++;
}
-static void svm_set_dr6(struct vcpu_svm *svm, unsigned long value)
+static void svm_set_dr6(struct kvm_vcpu *vcpu, unsigned long value)
{
- struct vmcb *vmcb = svm->vmcb;
+ struct vmcb *vmcb = to_svm(vcpu)->vmcb;
- if (svm->vcpu.arch.guest_state_protected)
+ if (vcpu->arch.guest_state_protected)
return;
if (unlikely(value != vmcb->save.dr6)) {
@@ -4247,10 +4247,8 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu,
* Run with all-zero DR6 unless needed, so that we can get the exact cause
* of a #DB.
*/
- if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT))
- svm_set_dr6(svm, vcpu->arch.dr6);
- else
- svm_set_dr6(svm, DR6_ACTIVE_LOW);
+ if (likely(!(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)))
+ svm_set_dr6(vcpu, DR6_ACTIVE_LOW);
clgi();
kvm_load_guest_xsave_state(vcpu);
@@ -5043,6 +5041,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
.set_idt = svm_set_idt,
.get_gdt = svm_get_gdt,
.set_gdt = svm_set_gdt,
+ .set_dr6 = svm_set_dr6,
.set_dr7 = svm_set_dr7,
.sync_dirty_debug_regs = svm_sync_dirty_debug_regs,
.cache_reg = svm_cache_reg,
diff --git a/arch/x86/kvm/vmx/main.c b/arch/x86/kvm/vmx/main.c
index 2427f918e763..43ee9ed11291 100644
--- a/arch/x86/kvm/vmx/main.c
+++ b/arch/x86/kvm/vmx/main.c
@@ -61,6 +61,7 @@ struct kvm_x86_ops vt_x86_ops __initdata = {
.set_idt = vmx_set_idt,
.get_gdt = vmx_get_gdt,
.set_gdt = vmx_set_gdt,
+ .set_dr6 = vmx_set_dr6,
.set_dr7 = vmx_set_dr7,
.sync_dirty_debug_regs = vmx_sync_dirty_debug_regs,
.cache_reg = vmx_cache_reg,
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index f72835e85b6d..6c56d5235f0f 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -5648,6 +5648,12 @@ void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
set_debugreg(DR6_RESERVED, 6);
}
+void vmx_set_dr6(struct kvm_vcpu *vcpu, unsigned long val)
+{
+ lockdep_assert_irqs_disabled();
+ set_debugreg(vcpu->arch.dr6, 6);
+}
+
void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val)
{
vmcs_writel(GUEST_DR7, val);
@@ -7417,10 +7423,6 @@ fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit)
vmx->loaded_vmcs->host_state.cr4 = cr4;
}
- /* When KVM_DEBUGREG_WONT_EXIT, dr6 is accessible in guest. */
- if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT))
- set_debugreg(vcpu->arch.dr6, 6);
-
/* When single-stepping over STI and MOV SS, we must clear the
* corresponding interruptibility bits in the guest state. Otherwise
* vmentry fails as it then expects bit 14 (BS) in pending debug
diff --git a/arch/x86/kvm/vmx/x86_ops.h b/arch/x86/kvm/vmx/x86_ops.h
index ce3295a67c04..430773a5ef8e 100644
--- a/arch/x86/kvm/vmx/x86_ops.h
+++ b/arch/x86/kvm/vmx/x86_ops.h
@@ -73,6 +73,7 @@ void vmx_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
void vmx_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
void vmx_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
void vmx_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
+void vmx_set_dr6(struct kvm_vcpu *vcpu, unsigned long val);
void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val);
void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu);
void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 8e77e61d4fbd..02159c967d29 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -10961,6 +10961,9 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
set_debugreg(vcpu->arch.eff_db[1], 1);
set_debugreg(vcpu->arch.eff_db[2], 2);
set_debugreg(vcpu->arch.eff_db[3], 3);
+ /* When KVM_DEBUGREG_WONT_EXIT, dr6 is accessible in guest. */
+ if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT))
+ kvm_x86_call(set_dr6)(vcpu, vcpu->arch.dr6);
} else if (unlikely(hw_breakpoint_active())) {
set_debugreg(0, 7);
}
The patch below does not apply to the 6.1-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.1.y
git checkout FETCH_HEAD
git cherry-pick -x c2fee09fc167c74a64adb08656cb993ea475197e
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025021812-candied-hamper-7f08@gregkh' --subject-prefix 'PATCH 6.1.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From c2fee09fc167c74a64adb08656cb993ea475197e Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc(a)google.com>
Date: Fri, 24 Jan 2025 17:18:33 -0800
Subject: [PATCH] KVM: x86: Load DR6 with guest value only before entering
.vcpu_run() loop
Move the conditional loading of hardware DR6 with the guest's DR6 value
out of the core .vcpu_run() loop to fix a bug where KVM can load hardware
with a stale vcpu->arch.dr6.
When the guest accesses a DR and host userspace isn't debugging the guest,
KVM disables DR interception and loads the guest's values into hardware on
VM-Enter and saves them on VM-Exit. This allows the guest to access DRs
at will, e.g. so that a sequence of DR accesses to configure a breakpoint
only generates one VM-Exit.
For DR0-DR3, the logic/behavior is identical between VMX and SVM, and also
identical between KVM_DEBUGREG_BP_ENABLED (userspace debugging the guest)
and KVM_DEBUGREG_WONT_EXIT (guest using DRs), and so KVM handles loading
DR0-DR3 in common code, _outside_ of the core kvm_x86_ops.vcpu_run() loop.
But for DR6, the guest's value doesn't need to be loaded into hardware for
KVM_DEBUGREG_BP_ENABLED, and SVM provides a dedicated VMCB field whereas
VMX requires software to manually load the guest value, and so loading the
guest's value into DR6 is handled by {svm,vmx}_vcpu_run(), i.e. is done
_inside_ the core run loop.
Unfortunately, saving the guest values on VM-Exit is initiated by common
x86, again outside of the core run loop. If the guest modifies DR6 (in
hardware, when DR interception is disabled), and then the next VM-Exit is
a fastpath VM-Exit, KVM will reload hardware DR6 with vcpu->arch.dr6 and
clobber the guest's actual value.
The bug shows up primarily with nested VMX because KVM handles the VMX
preemption timer in the fastpath, and the window between hardware DR6
being modified (in guest context) and DR6 being read by guest software is
orders of magnitude larger in a nested setup. E.g. in non-nested, the
VMX preemption timer would need to fire precisely between #DB injection
and the #DB handler's read of DR6, whereas with a KVM-on-KVM setup, the
window where hardware DR6 is "dirty" extends all the way from L1 writing
DR6 to VMRESUME (in L1).
L1's view:
==========
<L1 disables DR interception>
CPU 0/KVM-7289 [023] d.... 2925.640961: kvm_entry: vcpu 0
A: L1 Writes DR6
CPU 0/KVM-7289 [023] d.... 2925.640963: <hack>: Set DRs, DR6 = 0xffff0ff1
B: CPU 0/KVM-7289 [023] d.... 2925.640967: kvm_exit: vcpu 0 reason EXTERNAL_INTERRUPT intr_info 0x800000ec
D: L1 reads DR6, arch.dr6 = 0
CPU 0/KVM-7289 [023] d.... 2925.640969: <hack>: Sync DRs, DR6 = 0xffff0ff0
CPU 0/KVM-7289 [023] d.... 2925.640976: kvm_entry: vcpu 0
L2 reads DR6, L1 disables DR interception
CPU 0/KVM-7289 [023] d.... 2925.640980: kvm_exit: vcpu 0 reason DR_ACCESS info1 0x0000000000000216
CPU 0/KVM-7289 [023] d.... 2925.640983: kvm_entry: vcpu 0
CPU 0/KVM-7289 [023] d.... 2925.640983: <hack>: Set DRs, DR6 = 0xffff0ff0
L2 detects failure
CPU 0/KVM-7289 [023] d.... 2925.640987: kvm_exit: vcpu 0 reason HLT
L1 reads DR6 (confirms failure)
CPU 0/KVM-7289 [023] d.... 2925.640990: <hack>: Sync DRs, DR6 = 0xffff0ff0
L0's view:
==========
L2 reads DR6, arch.dr6 = 0
CPU 23/KVM-5046 [001] d.... 3410.005610: kvm_exit: vcpu 23 reason DR_ACCESS info1 0x0000000000000216
CPU 23/KVM-5046 [001] ..... 3410.005610: kvm_nested_vmexit: vcpu 23 reason DR_ACCESS info1 0x0000000000000216
L2 => L1 nested VM-Exit
CPU 23/KVM-5046 [001] ..... 3410.005610: kvm_nested_vmexit_inject: reason: DR_ACCESS ext_inf1: 0x0000000000000216
CPU 23/KVM-5046 [001] d.... 3410.005610: kvm_entry: vcpu 23
CPU 23/KVM-5046 [001] d.... 3410.005611: kvm_exit: vcpu 23 reason VMREAD
CPU 23/KVM-5046 [001] d.... 3410.005611: kvm_entry: vcpu 23
CPU 23/KVM-5046 [001] d.... 3410.005612: kvm_exit: vcpu 23 reason VMREAD
CPU 23/KVM-5046 [001] d.... 3410.005612: kvm_entry: vcpu 23
L1 writes DR7, L0 disables DR interception
CPU 23/KVM-5046 [001] d.... 3410.005612: kvm_exit: vcpu 23 reason DR_ACCESS info1 0x0000000000000007
CPU 23/KVM-5046 [001] d.... 3410.005613: kvm_entry: vcpu 23
L0 writes DR6 = 0 (arch.dr6)
CPU 23/KVM-5046 [001] d.... 3410.005613: <hack>: Set DRs, DR6 = 0xffff0ff0
A: <L1 writes DR6 = 1, no interception, arch.dr6 is still '0'>
B: CPU 23/KVM-5046 [001] d.... 3410.005614: kvm_exit: vcpu 23 reason PREEMPTION_TIMER
CPU 23/KVM-5046 [001] d.... 3410.005614: kvm_entry: vcpu 23
C: L0 writes DR6 = 0 (arch.dr6)
CPU 23/KVM-5046 [001] d.... 3410.005614: <hack>: Set DRs, DR6 = 0xffff0ff0
L1 => L2 nested VM-Enter
CPU 23/KVM-5046 [001] d.... 3410.005616: kvm_exit: vcpu 23 reason VMRESUME
L0 reads DR6, arch.dr6 = 0
Reported-by: John Stultz <jstultz(a)google.com>
Closes: https://lkml.kernel.org/r/CANDhNCq5_F3HfFYABqFGCA1bPd_%2BxgNj-iDQhH4tDk%2Bw…
Fixes: 375e28ffc0cf ("KVM: X86: Set host DR6 only on VMX and for KVM_DEBUGREG_WONT_EXIT")
Fixes: d67668e9dd76 ("KVM: x86, SVM: isolate vcpu->arch.dr6 from vmcb->save.dr6")
Cc: stable(a)vger.kernel.org
Cc: Jim Mattson <jmattson(a)google.com>
Tested-by: John Stultz <jstultz(a)google.com>
Link: https://lore.kernel.org/r/20250125011833.3644371-1-seanjc@google.com
Signed-off-by: Sean Christopherson <seanjc(a)google.com>
diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h
index c35550581da0..823c0434bbad 100644
--- a/arch/x86/include/asm/kvm-x86-ops.h
+++ b/arch/x86/include/asm/kvm-x86-ops.h
@@ -48,6 +48,7 @@ KVM_X86_OP(set_idt)
KVM_X86_OP(get_gdt)
KVM_X86_OP(set_gdt)
KVM_X86_OP(sync_dirty_debug_regs)
+KVM_X86_OP(set_dr6)
KVM_X86_OP(set_dr7)
KVM_X86_OP(cache_reg)
KVM_X86_OP(get_rflags)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index b15cde0a9b5c..0b7af5902ff7 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1696,6 +1696,7 @@ struct kvm_x86_ops {
void (*get_gdt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
void (*set_gdt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
void (*sync_dirty_debug_regs)(struct kvm_vcpu *vcpu);
+ void (*set_dr6)(struct kvm_vcpu *vcpu, unsigned long value);
void (*set_dr7)(struct kvm_vcpu *vcpu, unsigned long value);
void (*cache_reg)(struct kvm_vcpu *vcpu, enum kvm_reg reg);
unsigned long (*get_rflags)(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 7640a84e554a..a713c803a3a3 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -1991,11 +1991,11 @@ static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd)
svm->asid = sd->next_asid++;
}
-static void svm_set_dr6(struct vcpu_svm *svm, unsigned long value)
+static void svm_set_dr6(struct kvm_vcpu *vcpu, unsigned long value)
{
- struct vmcb *vmcb = svm->vmcb;
+ struct vmcb *vmcb = to_svm(vcpu)->vmcb;
- if (svm->vcpu.arch.guest_state_protected)
+ if (vcpu->arch.guest_state_protected)
return;
if (unlikely(value != vmcb->save.dr6)) {
@@ -4247,10 +4247,8 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu,
* Run with all-zero DR6 unless needed, so that we can get the exact cause
* of a #DB.
*/
- if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT))
- svm_set_dr6(svm, vcpu->arch.dr6);
- else
- svm_set_dr6(svm, DR6_ACTIVE_LOW);
+ if (likely(!(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)))
+ svm_set_dr6(vcpu, DR6_ACTIVE_LOW);
clgi();
kvm_load_guest_xsave_state(vcpu);
@@ -5043,6 +5041,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
.set_idt = svm_set_idt,
.get_gdt = svm_get_gdt,
.set_gdt = svm_set_gdt,
+ .set_dr6 = svm_set_dr6,
.set_dr7 = svm_set_dr7,
.sync_dirty_debug_regs = svm_sync_dirty_debug_regs,
.cache_reg = svm_cache_reg,
diff --git a/arch/x86/kvm/vmx/main.c b/arch/x86/kvm/vmx/main.c
index 2427f918e763..43ee9ed11291 100644
--- a/arch/x86/kvm/vmx/main.c
+++ b/arch/x86/kvm/vmx/main.c
@@ -61,6 +61,7 @@ struct kvm_x86_ops vt_x86_ops __initdata = {
.set_idt = vmx_set_idt,
.get_gdt = vmx_get_gdt,
.set_gdt = vmx_set_gdt,
+ .set_dr6 = vmx_set_dr6,
.set_dr7 = vmx_set_dr7,
.sync_dirty_debug_regs = vmx_sync_dirty_debug_regs,
.cache_reg = vmx_cache_reg,
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index f72835e85b6d..6c56d5235f0f 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -5648,6 +5648,12 @@ void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
set_debugreg(DR6_RESERVED, 6);
}
+void vmx_set_dr6(struct kvm_vcpu *vcpu, unsigned long val)
+{
+ lockdep_assert_irqs_disabled();
+ set_debugreg(vcpu->arch.dr6, 6);
+}
+
void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val)
{
vmcs_writel(GUEST_DR7, val);
@@ -7417,10 +7423,6 @@ fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit)
vmx->loaded_vmcs->host_state.cr4 = cr4;
}
- /* When KVM_DEBUGREG_WONT_EXIT, dr6 is accessible in guest. */
- if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT))
- set_debugreg(vcpu->arch.dr6, 6);
-
/* When single-stepping over STI and MOV SS, we must clear the
* corresponding interruptibility bits in the guest state. Otherwise
* vmentry fails as it then expects bit 14 (BS) in pending debug
diff --git a/arch/x86/kvm/vmx/x86_ops.h b/arch/x86/kvm/vmx/x86_ops.h
index ce3295a67c04..430773a5ef8e 100644
--- a/arch/x86/kvm/vmx/x86_ops.h
+++ b/arch/x86/kvm/vmx/x86_ops.h
@@ -73,6 +73,7 @@ void vmx_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
void vmx_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
void vmx_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
void vmx_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
+void vmx_set_dr6(struct kvm_vcpu *vcpu, unsigned long val);
void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val);
void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu);
void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 8e77e61d4fbd..02159c967d29 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -10961,6 +10961,9 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
set_debugreg(vcpu->arch.eff_db[1], 1);
set_debugreg(vcpu->arch.eff_db[2], 2);
set_debugreg(vcpu->arch.eff_db[3], 3);
+ /* When KVM_DEBUGREG_WONT_EXIT, dr6 is accessible in guest. */
+ if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT))
+ kvm_x86_call(set_dr6)(vcpu, vcpu->arch.dr6);
} else if (unlikely(hw_breakpoint_active())) {
set_debugreg(0, 7);
}
Changes since v3 [1] (note v4 was a partial re-roll, but more feedback
came in requiring a v5):
- Fix a kbuild robot report for a missing header include of cc_platform.h
- Switch to selecting STRICT_DEVMEM and IOSTRICT_DEVMEM rather than
"depends on". (Naveen)
- Clarify the "SEPT violation" vs "crash" and other changelog fixups for
devmem maintainers and other arch maintainers. (Dave)
- Drop patch numbering since patch2 is a fix and has no dependencies on
patch1
[1]: http://lore.kernel.org/174491711228.1395340.3647010925173796093.stgit@dwill…
---
The original response to Nikolay's report of a "crash" (unhandled SEPT
violation) triggered by /dev/mem access to private memory was "let's
just turn off /dev/mem".
After some machinations of x86_platform_ops to block a subset of
problematic access, spelunking the history of devmem_is_allowed()
returning "2" to enable some compatibility benefits while blocking
access, and discovering that userspace depends buggy kernel behavior for
mmap(2) of the first 1MB of memory on x86, the proposal has circled back
to "disable /dev/mem".
Require both STRICT_DEVMEM and IO_STRICT_DEVMEM for x86 confidential
guests to close /dev/mem hole while still allowing for userspace
mapping of PCI MMIO as long as the kernel and userspace are not mapping
the range at the same time.
The range_is_allowed() cleanup is not strictly necessary, but might as
well close a 17 year-old "TODO".
Dan Williams (2):
x86/devmem: Remove duplicate range_is_allowed() definition
x86/devmem: Drop /dev/mem access for confidential guests
arch/x86/Kconfig | 4 ++++
arch/x86/mm/pat/memtype.c | 31 ++++---------------------------
drivers/char/mem.c | 28 ++++++++++------------------
include/linux/io.h | 21 +++++++++++++++++++++
4 files changed, 39 insertions(+), 45 deletions(-)
base-commit: 0af2f6be1b4281385b618cb86ad946eded089ac8
--
2.49.0
Hello again,
This is the 6.1.y backports set from 6.10 which corresponds to the 6.6.y
backports set from here:
https://lore.kernel.org/all/20241002174108.64615-1-catherine.hoang@oracle.c…
The following patches were included:
f43bd357fde0 xfs: fix error returns from xfs_bmapi_write
4bcef72d96b5 xfs: fix xfs_bmap_add_extent_delay_real for partial conversions
c299188b443a xfs: remove a racy if_bytes check in xfs_reflink_end_cow_extent
c13c21f77824 xfs: require XFS_SB_FEAT_INCOMPAT_LOG_XATTRS for attr log intent item recovey
0934046e3392 xfs: check opcode and iovec count match in xlog_recover_attri_commit_pass2
9716cdcc2f9e xfs: validate recovered name buffers when recovering xattr items
20adb1e2f069 xfs: revert commit 44af6c7e59b12
f24ba2183148 xfs: match lock mode in xfs_buffered_write_iomap_begin()
0f726c17dfd8 xfs: make the seq argument to xfs_bmapi_convert_delalloc() optional
36081fd0ee37 xfs: make xfs_bmapi_convert_delalloc() to allocate the target offset
4c99f3026cf2 xfs: convert delayed extents to unwritten when zeroing post eof blocks
0aca73915dc1 xfs: allow symlinks with short remote targets
0e52b98bf041 xfs: make sure sb_fdblocks is non-negative
2bc2d49c36c2 xfs: fix freeing speculative preallocations for preallocated files
3eeac3311683 xfs: allow unlinked symlinks and dirs with zero size
9a0ab4fc28ed xfs: restrict when we try to align cow fork delalloc to cowextsz hints
The following patches were omitted:
cad051826d83 xfs: fix missing check for invalid attr flags
scrub
db460c26f0b0 xfs: check shortform attr entry flags specifically
scrub
5689d2345a01 xfs: enforce one namespace per attribute
doesn't cherry pick cleanly, it is some refactoring and isn't a
dependency for other patches, lets skip it for now
7c03b124353a xfs: use dontcache for grabbing inodes during scrub
scrub
740a427e8f45 xfs: fix unlink vs cluster buffer instantiation race
already in 6.1.y
No fixes were found on mainline for any of the patches being ported.
The auto group was run 1x on each of these configs:
xfs/4k
xfs/1k
xfs/logdev
xfs/realtime
xfs/quota
xfs/v4
xfs/dax
xfs/adv
xfs/dirblock_8k
and no regressions were seen. This set has already been ack'd on the
xfs-stable list.
Let me know if you see any issues. Thanks,
Leah
Christoph Hellwig (4):
xfs: fix error returns from xfs_bmapi_write
xfs: fix xfs_bmap_add_extent_delay_real for partial conversions
xfs: remove a racy if_bytes check in xfs_reflink_end_cow_extent
xfs: fix freeing speculative preallocations for preallocated files
Darrick J. Wong (7):
xfs: require XFS_SB_FEAT_INCOMPAT_LOG_XATTRS for attr log intent item
recovery
xfs: check opcode and iovec count match in
xlog_recover_attri_commit_pass2
xfs: validate recovered name buffers when recovering xattr items
xfs: revert commit 44af6c7e59b12
xfs: allow symlinks with short remote targets
xfs: allow unlinked symlinks and dirs with zero size
xfs: restrict when we try to align cow fork delalloc to cowextsz hints
Wengang Wang (1):
xfs: make sure sb_fdblocks is non-negative
Zhang Yi (4):
xfs: match lock mode in xfs_buffered_write_iomap_begin()
xfs: make the seq argument to xfs_bmapi_convert_delalloc() optional
xfs: make xfs_bmapi_convert_delalloc() to allocate the target offset
xfs: convert delayed extents to unwritten when zeroing post eof blocks
fs/xfs/libxfs/xfs_attr_remote.c | 1 -
fs/xfs/libxfs/xfs_bmap.c | 130 ++++++++++++++++++++++++++------
fs/xfs/libxfs/xfs_da_btree.c | 20 ++---
fs/xfs/libxfs/xfs_inode_buf.c | 47 ++++++++++--
fs/xfs/libxfs/xfs_sb.c | 7 +-
fs/xfs/scrub/attr.c | 5 ++
fs/xfs/xfs_aops.c | 54 ++++---------
fs/xfs/xfs_attr_item.c | 88 ++++++++++++++++++---
fs/xfs/xfs_bmap_util.c | 61 +++++++++------
fs/xfs/xfs_bmap_util.h | 2 +-
fs/xfs/xfs_dquot.c | 1 -
fs/xfs/xfs_icache.c | 2 +-
fs/xfs/xfs_inode.c | 14 +---
fs/xfs/xfs_iomap.c | 81 +++++++++++---------
fs/xfs/xfs_reflink.c | 20 -----
fs/xfs/xfs_rtalloc.c | 2 -
16 files changed, 342 insertions(+), 193 deletions(-)
--
2.49.0.906.g1f30a19c02-goog
Fix the bypassing of invalid packet pointer check in 6.6 by backporting
the entire "bpf: track changes_pkt_data property for global functions"
series[1], along with the follow up, "bpf: fix NPE when computing
changes_pkt_data of program w/o subprograms" series[2]; both from
Eduard.
I had ran the BPF selftests after backporting, confirmed that newly
added BPF selftests passes, and that no new failure observed in BPF
selftests (dummy_st_ops, ns_current_pid_tgid, test_bpf_ma, and
test_bpffs are failing even without this patchset applied). See [3] for
the full log.
#50/1 changes_pkt_data_freplace/changes_pkt_data_with_changes_pkt_data:OK
#50/2 changes_pkt_data_freplace/changes_pkt_data_with_does_not_change_pkt_data:OK
#50/3 changes_pkt_data_freplace/does_not_change_pkt_data_with_changes_pkt_data:OK
#50/4 changes_pkt_data_freplace/does_not_change_pkt_data_with_does_not_change_pkt_data:OK
#50/5 changes_pkt_data_freplace/main_changes_with_changes_pkt_data:OK
#50/6 changes_pkt_data_freplace/main_changes_with_does_not_change_pkt_data:OK
#50/7 changes_pkt_data_freplace/main_does_not_change_with_changes_pkt_data:OK
#50/8 changes_pkt_data_freplace/main_does_not_change_with_does_not_change_pkt_data:OK
#395/57 verifier_sock/invalidate_pkt_pointers_from_global_func:OK
#395/58 verifier_sock/invalidate_pkt_pointers_by_tail_call:OK
1: https://lore.kernel.org/all/20241210041100.1898468-1-eddyz87@gmail.com/
2: https://lore.kernel.org/all/20241212070711.427443-1-eddyz87@gmail.com/
3: https://github.com/shunghsiyu/libbpf/actions/runs/14747347842/job/413971041…
Eduard Zingerman (10):
bpf: add find_containing_subprog() utility function
bpf: refactor bpf_helper_changes_pkt_data to use helper number
bpf: track changes_pkt_data property for global functions
selftests/bpf: test for changing packet data from global functions
bpf: check changes_pkt_data property for extension programs
selftests/bpf: freplace tests for tracking of changes_packet_data
bpf: consider that tail calls invalidate packet pointers
selftests/bpf: validate that tail call invalidates packet pointers
bpf: fix null dereference when computing changes_pkt_data of prog w/o
subprogs
selftests/bpf: extend changes_pkt_data with cases w/o subprograms
include/linux/bpf.h | 1 +
include/linux/bpf_verifier.h | 1 +
include/linux/filter.h | 2 +-
kernel/bpf/core.c | 2 +-
kernel/bpf/verifier.c | 81 +++++++++++--
net/core/filter.c | 63 +++++------
.../bpf/prog_tests/changes_pkt_data.c | 107 ++++++++++++++++++
.../selftests/bpf/progs/changes_pkt_data.c | 39 +++++++
.../bpf/progs/changes_pkt_data_freplace.c | 18 +++
.../selftests/bpf/progs/verifier_sock.c | 56 +++++++++
10 files changed, 324 insertions(+), 46 deletions(-)
create mode 100644 tools/testing/selftests/bpf/prog_tests/changes_pkt_data.c
create mode 100644 tools/testing/selftests/bpf/progs/changes_pkt_data.c
create mode 100644 tools/testing/selftests/bpf/progs/changes_pkt_data_freplace.c
--
2.49.0
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.4.y
git checkout FETCH_HEAD
git cherry-pick -x 44d9b3f584c59a606b521e7274e658d5b866c699
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025042808-vastly-armrest-7811@gregkh' --subject-prefix 'PATCH 5.4.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 44d9b3f584c59a606b521e7274e658d5b866c699 Mon Sep 17 00:00:00 2001
From: Ian Abbott <abbotti(a)mev.co.uk>
Date: Tue, 15 Apr 2025 13:39:01 +0100
Subject: [PATCH] comedi: jr3_pci: Fix synchronous deletion of timer
When `jr3_pci_detach()` is called during device removal, it calls
`timer_delete_sync()` to stop the timer, but the timer expiry function
always reschedules the timer, so the synchronization is ineffective.
Call `timer_shutdown_sync()` instead. It does not matter that the timer
expiry function pointer is cleared, because the device is being removed.
Fixes: 07b509e6584a5 ("Staging: comedi: add jr3_pci driver")
Cc: stable <stable(a)kernel.org>
Signed-off-by: Ian Abbott <abbotti(a)mev.co.uk>
Link: https://lore.kernel.org/r/20250415123901.13483-1-abbotti@mev.co.uk
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
diff --git a/drivers/comedi/drivers/jr3_pci.c b/drivers/comedi/drivers/jr3_pci.c
index cdc842b32bab..75dce1ff2419 100644
--- a/drivers/comedi/drivers/jr3_pci.c
+++ b/drivers/comedi/drivers/jr3_pci.c
@@ -758,7 +758,7 @@ static void jr3_pci_detach(struct comedi_device *dev)
struct jr3_pci_dev_private *devpriv = dev->private;
if (devpriv)
- timer_delete_sync(&devpriv->timer);
+ timer_shutdown_sync(&devpriv->timer);
comedi_pci_detach(dev);
}
The patch below does not apply to the 6.12-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.12.y
git checkout FETCH_HEAD
git cherry-pick -x 170d1a3738908eef6a0dbf378ea77fb4ae8e294d
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025042803-deflected-overdue-01a5@gregkh' --subject-prefix 'PATCH 6.12.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 170d1a3738908eef6a0dbf378ea77fb4ae8e294d Mon Sep 17 00:00:00 2001
From: Carlos Llamas <cmllamas(a)google.com>
Date: Tue, 25 Mar 2025 18:49:00 +0000
Subject: [PATCH] binder: fix offset calculation in debug log
The vma start address should be substracted from the buffer's user data
address and not the other way around.
Cc: Tiffany Y. Yang <ynaffit(a)google.com>
Cc: stable <stable(a)kernel.org>
Fixes: 162c79731448 ("binder: avoid user addresses in debug logs")
Signed-off-by: Carlos Llamas <cmllamas(a)google.com>
Reviewed-by: Tiffany Y. Yang <ynaffit(a)google.com>
Link: https://lore.kernel.org/r/20250325184902.587138-1-cmllamas@google.com
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
diff --git a/drivers/android/binder.c b/drivers/android/binder.c
index 76052006bd87..5fc2c8ee61b1 100644
--- a/drivers/android/binder.c
+++ b/drivers/android/binder.c
@@ -6373,7 +6373,7 @@ static void print_binder_transaction_ilocked(struct seq_file *m,
seq_printf(m, " node %d", buffer->target_node->debug_id);
seq_printf(m, " size %zd:%zd offset %lx\n",
buffer->data_size, buffer->offsets_size,
- proc->alloc.vm_start - buffer->user_data);
+ buffer->user_data - proc->alloc.vm_start);
}
static void print_binder_work_ilocked(struct seq_file *m,
Unlike patch_text(), patch_text_nosync() takes the length in bytes, not
number of instructions. It is therefore wrong for arch_prepare_ss_slot() to
pass length=1 while patching one instruction.
This bug was introduced by commit b1756750a397 ("riscv: kprobes: Use
patch_text_nosync() for insn slots"). It has been fixed upstream by commit
51781ce8f448 ("riscv: Pass patch_text() the length in bytes"). However,
beside fixing this bug, this commit does many other things, making it
unsuitable for backporting.
Fix it by properly passing the lengths in bytes.
Fixes: b1756750a397 ("riscv: kprobes: Use patch_text_nosync() for insn slots")
Signed-off-by: Nam Cao <namcao(a)linutronix.de>
---
arch/riscv/kernel/probes/kprobes.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/arch/riscv/kernel/probes/kprobes.c b/arch/riscv/kernel/probes/kprobes.c
index 4fbc70e823f0..dc431b965bc3 100644
--- a/arch/riscv/kernel/probes/kprobes.c
+++ b/arch/riscv/kernel/probes/kprobes.c
@@ -28,8 +28,8 @@ static void __kprobes arch_prepare_ss_slot(struct kprobe *p)
p->ainsn.api.restore = (unsigned long)p->addr + offset;
- patch_text_nosync(p->ainsn.api.insn, &p->opcode, 1);
- patch_text_nosync((void *)p->ainsn.api.insn + offset, &insn, 1);
+ patch_text_nosync(p->ainsn.api.insn, &p->opcode, offset);
+ patch_text_nosync((void *)p->ainsn.api.insn + offset, &insn, sizeof(insn));
}
static void __kprobes arch_prepare_simulate(struct kprobe *p)
--
2.39.5
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.15.y
git checkout FETCH_HEAD
git cherry-pick -x 44d9b3f584c59a606b521e7274e658d5b866c699
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025042807-vanquish-unbeaten-57ab@gregkh' --subject-prefix 'PATCH 5.15.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 44d9b3f584c59a606b521e7274e658d5b866c699 Mon Sep 17 00:00:00 2001
From: Ian Abbott <abbotti(a)mev.co.uk>
Date: Tue, 15 Apr 2025 13:39:01 +0100
Subject: [PATCH] comedi: jr3_pci: Fix synchronous deletion of timer
When `jr3_pci_detach()` is called during device removal, it calls
`timer_delete_sync()` to stop the timer, but the timer expiry function
always reschedules the timer, so the synchronization is ineffective.
Call `timer_shutdown_sync()` instead. It does not matter that the timer
expiry function pointer is cleared, because the device is being removed.
Fixes: 07b509e6584a5 ("Staging: comedi: add jr3_pci driver")
Cc: stable <stable(a)kernel.org>
Signed-off-by: Ian Abbott <abbotti(a)mev.co.uk>
Link: https://lore.kernel.org/r/20250415123901.13483-1-abbotti@mev.co.uk
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
diff --git a/drivers/comedi/drivers/jr3_pci.c b/drivers/comedi/drivers/jr3_pci.c
index cdc842b32bab..75dce1ff2419 100644
--- a/drivers/comedi/drivers/jr3_pci.c
+++ b/drivers/comedi/drivers/jr3_pci.c
@@ -758,7 +758,7 @@ static void jr3_pci_detach(struct comedi_device *dev)
struct jr3_pci_dev_private *devpriv = dev->private;
if (devpriv)
- timer_delete_sync(&devpriv->timer);
+ timer_shutdown_sync(&devpriv->timer);
comedi_pci_detach(dev);
}
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.10.y
git checkout FETCH_HEAD
git cherry-pick -x 44d9b3f584c59a606b521e7274e658d5b866c699
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025042808-difficult-germicide-075a@gregkh' --subject-prefix 'PATCH 5.10.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 44d9b3f584c59a606b521e7274e658d5b866c699 Mon Sep 17 00:00:00 2001
From: Ian Abbott <abbotti(a)mev.co.uk>
Date: Tue, 15 Apr 2025 13:39:01 +0100
Subject: [PATCH] comedi: jr3_pci: Fix synchronous deletion of timer
When `jr3_pci_detach()` is called during device removal, it calls
`timer_delete_sync()` to stop the timer, but the timer expiry function
always reschedules the timer, so the synchronization is ineffective.
Call `timer_shutdown_sync()` instead. It does not matter that the timer
expiry function pointer is cleared, because the device is being removed.
Fixes: 07b509e6584a5 ("Staging: comedi: add jr3_pci driver")
Cc: stable <stable(a)kernel.org>
Signed-off-by: Ian Abbott <abbotti(a)mev.co.uk>
Link: https://lore.kernel.org/r/20250415123901.13483-1-abbotti@mev.co.uk
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
diff --git a/drivers/comedi/drivers/jr3_pci.c b/drivers/comedi/drivers/jr3_pci.c
index cdc842b32bab..75dce1ff2419 100644
--- a/drivers/comedi/drivers/jr3_pci.c
+++ b/drivers/comedi/drivers/jr3_pci.c
@@ -758,7 +758,7 @@ static void jr3_pci_detach(struct comedi_device *dev)
struct jr3_pci_dev_private *devpriv = dev->private;
if (devpriv)
- timer_delete_sync(&devpriv->timer);
+ timer_shutdown_sync(&devpriv->timer);
comedi_pci_detach(dev);
}
The quilt patch titled
Subject: mm/userfaultfd: prevent busy looping for tasks with signals pending
has been removed from the -mm tree. Its filename was
mm-userfaultfd-prevent-busy-looping-for-tasks-with-signals-pending.patch
This patch was dropped because an updated version will be issued
------------------------------------------------------
From: Jens Axboe <axboe(a)kernel.dk>
Subject: mm/userfaultfd: prevent busy looping for tasks with signals pending
Date: Wed, 23 Apr 2025 17:37:06 -0600
userfaultfd may use interruptible sleeps to wait on userspace filling a
page fault, which works fine if the task can be reliably put to sleeping
waiting for that. However, if the task has a normal (ie non-fatal) signal
pending, then TASK_INTERRUPTIBLE sleep will simply cause schedule() to be
a no-op.
For a task that registers a page with userfaultfd and then proceeds to do
a write from it, if that task also has a signal pending then it'll
essentially busy loop from do_page_fault() -> handle_userfault() until
that fault has been filled. Normally it'd be expected that the task would
sleep until that happens. Here's a trace from an application doing just
that:
handle_userfault+0x4b8/0xa00 (P)
hugetlb_fault+0xe24/0x1060
handle_mm_fault+0x2bc/0x318
do_page_fault+0x1e8/0x6f0
do_translation_fault+0x9c/0xd0
do_mem_abort+0x44/0xa0
el1_abort+0x3c/0x68
el1h_64_sync_handler+0xd4/0x100
el1h_64_sync+0x6c/0x70
fault_in_readable+0x74/0x108 (P)
iomap_file_buffered_write+0x14c/0x438
blkdev_write_iter+0x1a8/0x340
vfs_write+0x20c/0x348
ksys_write+0x64/0x108
__arm64_sys_write+0x1c/0x38
where the task is looping with 100% CPU time in the above mentioned fault
path.
Since it's impossible to handle signals, or other conditions like
TIF_NOTIFY_SIGNAL that also prevents interruptible sleeping, from the
fault path, use TASK_UNINTERRUPTIBLE with a short timeout even for vmf
modes that would normally ask for INTERRUPTIBLE or KILLABLE sleep. Fatal
signals will still be handled by the caller, and the timeout is short
enough to hopefully not cause any issues. If this is the first invocation
of this fault, eg FAULT_FLAG_TRIED isn't set, then the normal sleep mode
is used.
Link: https://lkml.kernel.org/r/27c3a7f5-aad8-4f2a-a66e-ff5ae98f31eb@kernel.dk
Fixes: 4064b9827063 ("mm: allow VM_FAULT_RETRY for multiple times")
Signed-off-by: Jens Axboe <axboe(a)kernel.dk>
Reported-by: Zhiwei Jiang <qq282012236(a)gmail.com>
Closes: https://lore.kernel.org/io-uring/20250422162913.1242057-1-qq282012236@gmail…
Acked-by: Johannes Weiner <hannes(a)cmpxchg.org>
Cc: Peter Xu <peterx(a)redhat.com>
Cc: Matthew Wilcox <willy(a)infradead.org>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
fs/userfaultfd.c | 34 ++++++++++++++++++++++++++--------
1 file changed, 26 insertions(+), 8 deletions(-)
--- a/fs/userfaultfd.c~mm-userfaultfd-prevent-busy-looping-for-tasks-with-signals-pending
+++ a/fs/userfaultfd.c
@@ -334,15 +334,29 @@ out:
return ret;
}
-static inline unsigned int userfaultfd_get_blocking_state(unsigned int flags)
+struct userfault_wait {
+ unsigned int task_state;
+ bool timeout;
+};
+
+static struct userfault_wait userfaultfd_get_blocking_state(unsigned int flags)
{
+ /*
+ * If the fault has already been tried AND there's a signal pending
+ * for this task, use TASK_UNINTERRUPTIBLE with a small timeout.
+ * This prevents busy looping where schedule() otherwise does nothing
+ * for TASK_INTERRUPTIBLE when the task has a signal pending.
+ */
+ if ((flags & FAULT_FLAG_TRIED) && signal_pending(current))
+ return (struct userfault_wait) { TASK_UNINTERRUPTIBLE, true };
+
if (flags & FAULT_FLAG_INTERRUPTIBLE)
- return TASK_INTERRUPTIBLE;
+ return (struct userfault_wait) { TASK_INTERRUPTIBLE, false };
if (flags & FAULT_FLAG_KILLABLE)
- return TASK_KILLABLE;
+ return (struct userfault_wait) { TASK_KILLABLE, false };
- return TASK_UNINTERRUPTIBLE;
+ return (struct userfault_wait) { TASK_UNINTERRUPTIBLE, false };
}
/*
@@ -368,7 +382,7 @@ vm_fault_t handle_userfault(struct vm_fa
struct userfaultfd_wait_queue uwq;
vm_fault_t ret = VM_FAULT_SIGBUS;
bool must_wait;
- unsigned int blocking_state;
+ struct userfault_wait wait_mode;
/*
* We don't do userfault handling for the final child pid update
@@ -466,7 +480,7 @@ vm_fault_t handle_userfault(struct vm_fa
uwq.ctx = ctx;
uwq.waken = false;
- blocking_state = userfaultfd_get_blocking_state(vmf->flags);
+ wait_mode = userfaultfd_get_blocking_state(vmf->flags);
/*
* Take the vma lock now, in order to safely call
@@ -488,7 +502,7 @@ vm_fault_t handle_userfault(struct vm_fa
* following the spin_unlock to happen before the list_add in
* __add_wait_queue.
*/
- set_current_state(blocking_state);
+ set_current_state(wait_mode.task_state);
spin_unlock_irq(&ctx->fault_pending_wqh.lock);
if (!is_vm_hugetlb_page(vma))
@@ -501,7 +515,11 @@ vm_fault_t handle_userfault(struct vm_fa
if (likely(must_wait && !READ_ONCE(ctx->released))) {
wake_up_poll(&ctx->fd_wqh, EPOLLIN);
- schedule();
+ /* See comment in userfaultfd_get_blocking_state() */
+ if (!wait_mode.timeout)
+ schedule();
+ else
+ schedule_timeout(HZ / 10);
}
__set_current_state(TASK_RUNNING);
_
Patches currently in -mm which might be from axboe(a)kernel.dk are