The version is fetched once in check_version(), which then does some
validation and then overwrites the version in userspace with the API
version supported by the kernel. copy_params() then fetches the version
from userspace *again*, and this time no validation is done. The result
is that the kernel's version number is completely controllable by
userspace, provided that userspace can win a race condition.
Fix this flaw by not copying the version back to the kernel the second
time. This is not exploitable as the version is not further used in the
kernel. However, it could become a problem if future patches start
relying on the version field.
Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
Cc: stable(a)vger.kernel.org
Signed-off-by: Demi Marie Obenour <demi(a)invisiblethingslab.com>
---
drivers/md/dm-ioctl.c | 30 ++++++++++++++++++------------
1 file changed, 18 insertions(+), 12 deletions(-)
diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index da6ca26b51d0953df380582bb3a51c2ec22c27cb..7510afe237d979a5ee71afe87a20d49f631de1aa 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -1873,30 +1873,33 @@ static ioctl_fn lookup_ioctl(unsigned int cmd, int *ioctl_flags)
* As well as checking the version compatibility this always
* copies the kernel interface version out.
*/
-static int check_version(unsigned int cmd, struct dm_ioctl __user *user)
+static int check_version(unsigned int cmd, struct dm_ioctl __user *user,
+ struct dm_ioctl *kernel_params)
{
- uint32_t version[3];
int r = 0;
- if (copy_from_user(version, user->version, sizeof(version)))
+ if (copy_from_user(kernel_params->version, user->version, sizeof(kernel_params->version)))
return -EFAULT;
- if ((version[0] != DM_VERSION_MAJOR) ||
- (version[1] > DM_VERSION_MINOR)) {
+ if ((kernel_params->version[0] != DM_VERSION_MAJOR) ||
+ (kernel_params->version[1] > DM_VERSION_MINOR)) {
DMERR("ioctl interface mismatch: kernel(%u.%u.%u), user(%u.%u.%u), cmd(%d)",
DM_VERSION_MAJOR, DM_VERSION_MINOR,
DM_VERSION_PATCHLEVEL,
- version[0], version[1], version[2], cmd);
+ kernel_params->version[0],
+ kernel_params->version[1],
+ kernel_params->version[2],
+ cmd);
r = -EINVAL;
}
/*
* Fill in the kernel version.
*/
- version[0] = DM_VERSION_MAJOR;
- version[1] = DM_VERSION_MINOR;
- version[2] = DM_VERSION_PATCHLEVEL;
- if (copy_to_user(user->version, version, sizeof(version)))
+ kernel_params->version[0] = DM_VERSION_MAJOR;
+ kernel_params->version[1] = DM_VERSION_MINOR;
+ kernel_params->version[2] = DM_VERSION_PATCHLEVEL;
+ if (copy_to_user(user->version, kernel_params->version, sizeof(kernel_params->version)))
return -EFAULT;
return r;
@@ -1922,7 +1925,10 @@ static int copy_params(struct dm_ioctl __user *user, struct dm_ioctl *param_kern
const size_t minimum_data_size = offsetof(struct dm_ioctl, data);
unsigned int noio_flag;
- if (copy_from_user(param_kernel, user, minimum_data_size))
+ /* Version has been copied from userspace already, avoid TOCTOU */
+ if (copy_from_user((char *)param_kernel + sizeof(param_kernel->version),
+ (char __user *)user + sizeof(param_kernel->version),
+ minimum_data_size - sizeof(param_kernel->version)))
return -EFAULT;
if (param_kernel->data_size < minimum_data_size) {
@@ -2034,7 +2040,7 @@ static int ctl_ioctl(struct file *file, uint command, struct dm_ioctl __user *us
* Check the interface version passed in. This also
* writes out the kernel's interface version.
*/
- r = check_version(cmd, user);
+ r = check_version(cmd, user, ¶m_kernel);
if (r)
return r;
--
Sincerely,
Demi Marie Obenour (she/her/hers)
Invisible Things Lab
This is the start of the stable review cycle for the 5.15.115 release.
There are 37 patches in this series, all will be posted as a response
to this one. If anyone has any issues with these being applied, please
let me know.
Responses should be made by Sat, 03 Jun 2023 14:33:22 +0000.
Anything received after that time might be too late.
The whole patch series can be found in one patch at:
https://www.kernel.org/pub/linux/kernel/v5.x/stable-review/patch-5.15.115-r…
or in the git tree and branch at:
git://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git linux-5.15.y
and the diffstat can be found below.
thanks,
greg k-h
-------------
Pseudo-Shortlog of commits:
Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
Linux 5.15.115-rc2
Paul Blakey <paulb(a)nvidia.com>
netfilter: ctnetlink: Support offloaded conntrack entry deletion
Nicolas Dichtel <nicolas.dichtel(a)6wind.com>
ipv{4,6}/raw: fix output xfrm lookup wrt protocol
Carlos Llamas <cmllamas(a)google.com>
binder: fix UAF of alloc->vma in race with munmap()
Carlos Llamas <cmllamas(a)google.com>
binder: add lockless binder_alloc_(set|get)_vma()
Carlos Llamas <cmllamas(a)google.com>
Revert "android: binder: stop saving a pointer to the VMA"
Carlos Llamas <cmllamas(a)google.com>
Revert "binder_alloc: add missing mmap_lock calls when using the VMA"
Ruihan Li <lrh2000(a)pku.edu.cn>
bluetooth: Add cmd validity checks at the start of hci_sock_ioctl()
Sebastian Andrzej Siewior <bigeasy(a)linutronix.de>
xdp: xdp_mem_allocator can be NULL in trace_mem_connect().
Jiaxun Yang <jiaxun.yang(a)flygoat.com>
irqchip/mips-gic: Don't touch vl_map if a local interrupt is not routable
Yunsheng Lin <linyunsheng(a)huawei.com>
page_pool: fix inconsistency for page_pool_ring_[un]lock()
Qingfang DENG <qingfang.deng(a)siflower.com.cn>
net: page_pool: use in_softirq() instead
Toke Høiland-Jørgensen <toke(a)redhat.com>
xdp: Allow registering memory model without rxq reference
Rahul Rameshbabu <rrameshbabu(a)nvidia.com>
net/mlx5e: Fix SQ wake logic in ptp napi_poll context
Jiaxun Yang <jiaxun.yang(a)flygoat.com>
irqchip/mips-gic: Use raw spinlock for gic_lock
Marc Zyngier <maz(a)kernel.org>
irqchip/mips-gic: Get rid of the reliance on irq_cpu_online()
Carlos Llamas <cmllamas(a)google.com>
binder: fix UAF caused by faulty buffer cleanup
Hangbin Liu <liuhangbin(a)gmail.com>
bonding: fix send_peer_notif overflow
Hangbin Liu <liuhangbin(a)gmail.com>
Bonding: add arp_missed_max option
Arınç ÜNAL <arinc.unal(a)arinc9.com>
net: dsa: mt7530: fix network connectivity with multiple CPU ports
Daniel Golle <daniel(a)makrotopia.org>
net: dsa: mt7530: split-off common parts from mt7531_setup
Frank Wunderlich <frank-w(a)public-files.de>
net: dsa: mt7530: rework mt753[01]_setup
Vladimir Oltean <vladimir.oltean(a)nxp.com>
net: dsa: introduce helpers for iterating through ports using dp
Claudio Imbrenda <imbrenda(a)linux.ibm.com>
KVM: s390: fix race in gmap_make_secure()
Claudio Imbrenda <imbrenda(a)linux.ibm.com>
KVM: s390: pv: add export before import
David Epping <david.epping(a)missinglinkelectronics.com>
net: phy: mscc: enable VSC8501/2 RGMII RX clock
Steve Wahl <steve.wahl(a)hpe.com>
platform/x86: ISST: Remove 8 socket limit
Srinivas Pandruvada <srinivas.pandruvada(a)linux.intel.com>
platform/x86: ISST: PUNIT device mapping with Sub-NUMA clustering
Shay Drory <shayd(a)nvidia.com>
net/mlx5: Devcom, serialize devcom registration
Vlad Buslov <vladbu(a)nvidia.com>
net/mlx5e: Fix deadlock in tc route query code
Mark Bloch <mbloch(a)nvidia.com>
net/mlx5: devcom only supports 2 ports
Anton Protopopov <aspsk(a)isovalent.com>
bpf: fix a memory leak in the LRU and LRU_PERCPU hash maps
Hans de Goede <hdegoede(a)redhat.com>
power: supply: bq24190: Call power_supply_changed() after updating input current
Hans de Goede <hdegoede(a)redhat.com>
power: supply: core: Refactor power_supply_set_input_current_limit_from_supplier()
Hans de Goede <hdegoede(a)redhat.com>
power: supply: bq27xxx: After charger plug in/out wait 0.5s for things to stabilize
Hans de Goede <hdegoede(a)redhat.com>
power: supply: bq27xxx: Ensure power_supply_changed() is called on current sign changes
Hans de Goede <hdegoede(a)redhat.com>
power: supply: bq27xxx: Move bq27xxx_battery_update() down
Sicelo A. Mhlongo <absicsz(a)gmail.com>
power: supply: bq27xxx: expose battery data when CI=1
-------------
Diffstat:
Documentation/networking/bonding.rst | 11 ++
Makefile | 4 +-
arch/s390/kernel/uv.c | 56 ++++---
drivers/android/binder.c | 26 +++-
drivers/android/binder_alloc.c | 64 +++-----
drivers/android/binder_alloc.h | 2 +-
drivers/android/binder_alloc_selftest.c | 2 +-
drivers/irqchip/irq-mips-gic.c | 65 +++++---
drivers/net/bonding/bond_main.c | 17 +-
drivers/net/bonding/bond_netlink.c | 22 ++-
drivers/net/bonding/bond_options.c | 36 ++++-
drivers/net/bonding/bond_procfs.c | 2 +
drivers/net/bonding/bond_sysfs.c | 13 ++
drivers/net/dsa/mt7530.c | 124 +++++++++------
drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c | 2 +
drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h | 2 +
drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 19 +--
drivers/net/ethernet/mellanox/mlx5/core/en_tx.c | 19 ++-
.../net/ethernet/mellanox/mlx5/core/lib/devcom.c | 81 +++++++---
.../net/ethernet/mellanox/mlx5/core/lib/devcom.h | 3 +
drivers/net/phy/mscc/mscc.h | 1 +
drivers/net/phy/mscc/mscc_main.c | 54 +++----
.../x86/intel/speed_select_if/isst_if_common.c | 49 ++++--
drivers/power/supply/bq24190_charger.c | 13 +-
drivers/power/supply/bq27xxx_battery.c | 171 +++++++++++----------
drivers/power/supply/power_supply_core.c | 57 +++----
include/linux/power/bq27xxx_battery.h | 3 +
include/linux/power_supply.h | 5 +-
include/net/bond_options.h | 1 +
include/net/bonding.h | 3 +-
include/net/dsa.h | 28 ++++
include/net/ip.h | 2 +
include/net/page_pool.h | 18 ---
include/net/xdp.h | 3 +
include/uapi/linux/if_link.h | 1 +
include/uapi/linux/in.h | 2 +
kernel/bpf/hashtab.c | 6 +-
net/bluetooth/hci_sock.c | 28 ++++
net/core/page_pool.c | 34 +++-
net/core/xdp.c | 93 +++++++----
net/ipv4/ip_sockglue.c | 12 +-
net/ipv4/raw.c | 5 +-
net/ipv6/raw.c | 3 +-
net/netfilter/nf_conntrack_netlink.c | 8 -
tools/include/uapi/linux/if_link.h | 1 +
45 files changed, 763 insertions(+), 408 deletions(-)
The version is fetched once in check_version(), which then does some
validation and then overwrites the version in userspace with the API
version supported by the kernel. copy_params() then fetches the version
from userspace *again*, and this time no validation is done. The result
is that the kernel's version number is completely controllable by
userspace, provided that userspace can win a race condition.
Fix this flaw by not copying the version back to the kernel the second
time. This is not exploitable as the version is not further used in the
kernel. However, it could become a problem if future patches start
relying on the version field.
Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
Cc: stable(a)vger.kernel.org
Signed-off-by: Demi Marie Obenour <demi(a)invisiblethingslab.com>
---
drivers/md/dm-ioctl.c | 14 +++++++++-----
1 file changed, 9 insertions(+), 5 deletions(-)
diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index da6ca26b51d0953df380582bb3a51c2ec22c27cb..fd46b249f6f856c49752063fc49d720e95df0525 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -1873,12 +1873,13 @@ static ioctl_fn lookup_ioctl(unsigned int cmd, int *ioctl_flags)
* As well as checking the version compatibility this always
* copies the kernel interface version out.
*/
-static int check_version(unsigned int cmd, struct dm_ioctl __user *user)
+static int check_version(unsigned int cmd, struct dm_ioctl __user *user,
+ struct dm_ioctl *kernel_params)
{
- uint32_t version[3];
int r = 0;
+ uint32_t *version = kernel_params->version;
- if (copy_from_user(version, user->version, sizeof(version)))
+ if (copy_from_user(version, user->version, sizeof(user->version)))
return -EFAULT;
if ((version[0] != DM_VERSION_MAJOR) ||
@@ -1922,7 +1923,10 @@ static int copy_params(struct dm_ioctl __user *user, struct dm_ioctl *param_kern
const size_t minimum_data_size = offsetof(struct dm_ioctl, data);
unsigned int noio_flag;
- if (copy_from_user(param_kernel, user, minimum_data_size))
+ /* Version has been copied from userspace already, avoid TOCTOU */
+ if (copy_from_user((char *)param_kernel + sizeof(param_kernel->version),
+ (char __user *)user + sizeof(param_kernel->version),
+ minimum_data_size - sizeof(param_kernel->version)))
return -EFAULT;
if (param_kernel->data_size < minimum_data_size) {
@@ -2034,7 +2038,7 @@ static int ctl_ioctl(struct file *file, uint command, struct dm_ioctl __user *us
* Check the interface version passed in. This also
* writes out the kernel's interface version.
*/
- r = check_version(cmd, user);
+ r = check_version(cmd, user, ¶m_kernel);
if (r)
return r;
--
Sincerely,
Demi Marie Obenour (she/her/hers)
Invisible Things Lab
This patch fixes a possible plock op collisions when using F_SETLKW lock
requests and fsid, number and owner are not enough to identify a result
for a pending request. The ltp testcases [0] and [1] are examples when
this is not enough in case of using classic posix locks with threads and
open filedescriptor posix locks.
The idea to fix the issue here is to place all lock request in order. In
case of non F_SETLKW lock request (indicated if wait is set or not) the
lock requests are ordered inside the recv_list. If a result comes back
the right plock op can be found by the first plock_op in recv_list which
has not info.wait set. This can be done only by non F_SETLKW plock ops as
dlm_controld always reads a specific plock op (list_move_tail() from
send_list to recv_mlist) and write the result immediately back.
This behaviour is for F_SETLKW not possible as multiple waiters can be
get a result back in an random order. To avoid a collisions in cases
like [0] or [1] this patch adds more fields to compare the plock
operations as the lock request is the same. This is also being made in
NFS to find an result for an asynchronous F_SETLKW lock request [2][3]. We
still can't find the exact lock request for a specific result if the
lock request is the same, but if this is the case we don't care the
order how the identical lock requests get their result back to grant the
lock.
[0] https://gitlab.com/netcoder/ltp/-/blob/dlm_fcntl_owner_testcase/testcases/k…
[1] https://gitlab.com/netcoder/ltp/-/blob/dlm_fcntl_owner_testcase/testcases/k…
[2] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/inc…
[3] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/fs/…
Cc: stable(a)vger.kernel.org
Signed-off-by: Alexander Aring <aahringo(a)redhat.com>
---
change since v2:
- don't split recv_list into recv_setlkw_list
fs/dlm/plock.c | 43 ++++++++++++++++++++++++++++++-------------
1 file changed, 30 insertions(+), 13 deletions(-)
diff --git a/fs/dlm/plock.c b/fs/dlm/plock.c
index 31bc601ee3d8..53d17dbbb716 100644
--- a/fs/dlm/plock.c
+++ b/fs/dlm/plock.c
@@ -391,7 +391,7 @@ static ssize_t dev_read(struct file *file, char __user *u, size_t count,
if (op->info.flags & DLM_PLOCK_FL_CLOSE)
list_del(&op->list);
else
- list_move(&op->list, &recv_list);
+ list_move_tail(&op->list, &recv_list);
memcpy(&info, &op->info, sizeof(info));
}
spin_unlock(&ops_lock);
@@ -430,19 +430,36 @@ static ssize_t dev_write(struct file *file, const char __user *u, size_t count,
return -EINVAL;
spin_lock(&ops_lock);
- list_for_each_entry(iter, &recv_list, list) {
- if (iter->info.fsid == info.fsid &&
- iter->info.number == info.number &&
- iter->info.owner == info.owner) {
- list_del_init(&iter->list);
- memcpy(&iter->info, &info, sizeof(info));
- if (iter->data)
- do_callback = 1;
- else
- iter->done = 1;
- op = iter;
- break;
+ if (info.wait) {
+ list_for_each_entry(iter, &recv_list, list) {
+ if (iter->info.fsid == info.fsid &&
+ iter->info.number == info.number &&
+ iter->info.owner == info.owner &&
+ iter->info.pid == info.pid &&
+ iter->info.start == info.start &&
+ iter->info.end == info.end &&
+ iter->info.ex == info.ex &&
+ iter->info.wait) {
+ op = iter;
+ break;
+ }
}
+ } else {
+ list_for_each_entry(iter, &recv_list, list) {
+ if (!iter->info.wait) {
+ op = iter;
+ break;
+ }
+ }
+ }
+
+ if (op) {
+ list_del_init(&op->list);
+ memcpy(&op->info, &info, sizeof(info));
+ if (op->data)
+ do_callback = 1;
+ else
+ op->done = 1;
}
spin_unlock(&ops_lock);
--
2.31.1
On Fri, 2 Jun 2023 15:57:46 -0700 Mike Kravetz <mike.kravetz(a)oracle.com> wrote:
> In commits d0ce0e47b323 and 91a2fb956ad99, hugetlb code was changed to
> use page_cache_next_miss to determine if a page was present in the page
> cache. However, the current implementation of page_cache_next_miss will
> always return the passed index if max_scan is 1 as in the hugetlb code.
> As a result, hugetlb code will always thing a page is present in the
> cache, even if that is not the case.
>
> The patch which follows addresses the issue by changing the implementation
> of page_cache_next_miss and for consistency page_cache_prev_miss. Since
> such a patch also impacts the readahead code, I would suggest using the
> patch by Sidhartha Kumar [1] to fix the issue in 6.3 and this patch moving
> forward.
Well this is tricky.
This patch applies cleanly to 6.3, so if we add cc:stable to this
patch, it will get backported, against your suggestion.
Sidhartha's patch [1] (which you recommend for -stable) is quite
different from this patch. And Sidhartha's patch has no route to being
tested in linux-next nor to being merged by Linus.
So problems. The preferable approach is to just backport this patch
into -stable in the usual fashion. What are the risks in doing this?
> If we would rather not modify page_cache_next/prev_miss, then a new
> interface as suggested by Ackerley Tng [2] could also be used.
>
> Comments on the best way to fix moving forward would be appreciated.
>
> [1] https://lore.kernel.org/linux-mm/20230505185301.534259-1-sidhartha.kumar@or…
> [2] https://lore.kernel.org/linux-mm/98624c2f481966492b4eb8272aef747790229b73.1…
>
> Mike Kravetz (1):
> page cache: fix page_cache_next/prev_miss off by one
>
> mm/filemap.c | 26 ++++++++++++++++----------
> 1 file changed, 16 insertions(+), 10 deletions(-)
>
Currently the offset into the device when looking for OTP
bits can go outside of the address of the MTD NOR devices,
and if that memory isn't readable, bad things happen
on the IXP4xx (added prints that illustrate the problem before
the crash):
cfi_intelext_otp_walk walk OTP on chip 0 start at reg_prot_offset 0x00000100
ixp4xx_copy_from copy from 0x00000100 to 0xc880dd78
cfi_intelext_otp_walk walk OTP on chip 0 start at reg_prot_offset 0x12000000
ixp4xx_copy_from copy from 0x12000000 to 0xc880dd78
8<--- cut here ---
Unable to handle kernel paging request at virtual address db000000
[db000000] *pgd=00000000
(...)
This happens in this case because the IXP4xx is big endian and
the 32- and 16-bit fields in the struct cfi_intelext_otpinfo are not
properly byteswapped. Compare to how the code in read_pri_intelext()
byteswaps the fields in struct cfi_pri_intelext.
Adding a small byte swapping loop for the OTP in read_pri_intelext()
and the crash goes away.
The problem went unnoticed for many years until I enabled
CONFIG_MTD_OTP on the IXP4xx as well, triggering the bug.
Cc: Nicolas Pitre <npitre(a)baylibre.com>
Cc: stable(a)vger.kernel.org
Signed-off-by: Linus Walleij <linus.walleij(a)linaro.org>
---
ChangeLog v2->v3:
- Move the byte swapping to a small loop in read_pri_intelext()
so all bytes are swapped as we reach cfi_intelext_otp_walk().
ChangeLog v1->v2:
- Drill deeper and discover a big endian compatibility issue.
---
drivers/mtd/chips/cfi_cmdset_0001.c | 20 ++++++++++++++++++--
1 file changed, 18 insertions(+), 2 deletions(-)
diff --git a/drivers/mtd/chips/cfi_cmdset_0001.c b/drivers/mtd/chips/cfi_cmdset_0001.c
index 54f92d09d9cf..02aaf09d6f5c 100644
--- a/drivers/mtd/chips/cfi_cmdset_0001.c
+++ b/drivers/mtd/chips/cfi_cmdset_0001.c
@@ -421,9 +421,25 @@ read_pri_intelext(struct map_info *map, __u16 adr)
extra_size = 0;
/* Protection Register info */
- if (extp->NumProtectionFields)
+ if (extp->NumProtectionFields) {
+ struct cfi_intelext_otpinfo *otp =
+ (struct cfi_intelext_otpinfo *)&extp->extra[0];
+
extra_size += (extp->NumProtectionFields - 1) *
- sizeof(struct cfi_intelext_otpinfo);
+ sizeof(struct cfi_intelext_otpinfo);
+
+ if (extp_size >= sizeof(*extp) + extra_size) {
+ int i;
+
+ /* Do some byteswapping if necessary */
+ for (i = 0; i < extp->NumProtectionFields - 1; i++) {
+ otp->ProtRegAddr = le32_to_cpu(otp->ProtRegAddr);
+ otp->FactGroups = le16_to_cpu(otp->FactGroups);
+ otp->UserGroups = le16_to_cpu(otp->UserGroups);
+ otp++;
+ }
+ }
+ }
}
if (extp->MinorVersion >= '1') {
--
2.40.1
The patch titled
Subject: mm/damon/ops-common: atomically test and clear young on ptes and pmds
has been added to the -mm mm-unstable branch. Its filename is
mm-damon-ops-common-atomically-test-and-clear-young-on-ptes-and-pmds.patch
This patch will shortly appear at
https://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new.git/tree/patche…
This patch will later appear in the mm-unstable branch at
git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next via the mm-everything
branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
and is updated there every 2-3 working days
------------------------------------------------------
From: Ryan Roberts <ryan.roberts(a)arm.com>
Subject: mm/damon/ops-common: atomically test and clear young on ptes and pmds
Date: Fri, 2 Jun 2023 10:29:47 +0100
It is racy to non-atomically read a pte, then clear the young bit, then
write it back as this could discard dirty information. Further, it is bad
practice to directly set a pte entry within a table. Instead clearing
young must go through the arch-provided helper,
ptep_test_and_clear_young() to ensure it is modified atomically and to
give the arch code visibility and allow it to check (and potentially
modify) the operation.
Link: https://lkml.kernel.org/r/20230602092949.545577-3-ryan.roberts@arm.com
Fixes: 3f49584b262c ("mm/damon: implement primitives for the virtual memory address spaces").
Signed-off-by: Ryan Roberts <ryan.roberts(a)arm.com>
Reviewed-by: Zi Yan <ziy(a)nvidia.com>
Reviewed-by: SeongJae Park <sj(a)kernel.org>
Reviewed-by: Mike Rapoport (IBM) <rppt(a)kernel.org>
Cc: Christoph Hellwig <hch(a)infradead.org>
Cc: Christoph Hellwig <hch(a)lst.de>
Cc: Kirill A. Shutemov <kirill.shutemov(a)linux.intel.com>
Cc: Lorenzo Stoakes <lstoakes(a)gmail.com>
Cc: Matthew Wilcox (Oracle) <willy(a)infradead.org>
Cc: Uladzislau Rezki (Sony) <urezki(a)gmail.com>
Cc: Yu Zhao <yuzhao(a)google.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/damon/ops-common.c | 16 ++++++----------
mm/damon/ops-common.h | 4 ++--
mm/damon/paddr.c | 4 ++--
mm/damon/vaddr.c | 4 ++--
4 files changed, 12 insertions(+), 16 deletions(-)
--- a/mm/damon/ops-common.c~mm-damon-ops-common-atomically-test-and-clear-young-on-ptes-and-pmds
+++ a/mm/damon/ops-common.c
@@ -37,7 +37,7 @@ struct folio *damon_get_folio(unsigned l
return folio;
}
-void damon_ptep_mkold(pte_t *pte, struct mm_struct *mm, unsigned long addr)
+void damon_ptep_mkold(pte_t *pte, struct vm_area_struct *vma, unsigned long addr)
{
bool referenced = false;
struct folio *folio = damon_get_folio(pte_pfn(*pte));
@@ -45,13 +45,11 @@ void damon_ptep_mkold(pte_t *pte, struct
if (!folio)
return;
- if (pte_young(*pte)) {
+ if (ptep_test_and_clear_young(vma, addr, pte))
referenced = true;
- *pte = pte_mkold(*pte);
- }
#ifdef CONFIG_MMU_NOTIFIER
- if (mmu_notifier_clear_young(mm, addr, addr + PAGE_SIZE))
+ if (mmu_notifier_clear_young(vma->vm_mm, addr, addr + PAGE_SIZE))
referenced = true;
#endif /* CONFIG_MMU_NOTIFIER */
@@ -62,7 +60,7 @@ void damon_ptep_mkold(pte_t *pte, struct
folio_put(folio);
}
-void damon_pmdp_mkold(pmd_t *pmd, struct mm_struct *mm, unsigned long addr)
+void damon_pmdp_mkold(pmd_t *pmd, struct vm_area_struct *vma, unsigned long addr)
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
bool referenced = false;
@@ -71,13 +69,11 @@ void damon_pmdp_mkold(pmd_t *pmd, struct
if (!folio)
return;
- if (pmd_young(*pmd)) {
+ if (pmdp_test_and_clear_young(vma, addr, pmd))
referenced = true;
- *pmd = pmd_mkold(*pmd);
- }
#ifdef CONFIG_MMU_NOTIFIER
- if (mmu_notifier_clear_young(mm, addr, addr + HPAGE_PMD_SIZE))
+ if (mmu_notifier_clear_young(vma->vm_mm, addr, addr + HPAGE_PMD_SIZE))
referenced = true;
#endif /* CONFIG_MMU_NOTIFIER */
--- a/mm/damon/ops-common.h~mm-damon-ops-common-atomically-test-and-clear-young-on-ptes-and-pmds
+++ a/mm/damon/ops-common.h
@@ -9,8 +9,8 @@
struct folio *damon_get_folio(unsigned long pfn);
-void damon_ptep_mkold(pte_t *pte, struct mm_struct *mm, unsigned long addr);
-void damon_pmdp_mkold(pmd_t *pmd, struct mm_struct *mm, unsigned long addr);
+void damon_ptep_mkold(pte_t *pte, struct vm_area_struct *vma, unsigned long addr);
+void damon_pmdp_mkold(pmd_t *pmd, struct vm_area_struct *vma, unsigned long addr);
int damon_cold_score(struct damon_ctx *c, struct damon_region *r,
struct damos *s);
--- a/mm/damon/paddr.c~mm-damon-ops-common-atomically-test-and-clear-young-on-ptes-and-pmds
+++ a/mm/damon/paddr.c
@@ -24,9 +24,9 @@ static bool __damon_pa_mkold(struct foli
while (page_vma_mapped_walk(&pvmw)) {
addr = pvmw.address;
if (pvmw.pte)
- damon_ptep_mkold(pvmw.pte, vma->vm_mm, addr);
+ damon_ptep_mkold(pvmw.pte, vma, addr);
else
- damon_pmdp_mkold(pvmw.pmd, vma->vm_mm, addr);
+ damon_pmdp_mkold(pvmw.pmd, vma, addr);
}
return true;
}
--- a/mm/damon/vaddr.c~mm-damon-ops-common-atomically-test-and-clear-young-on-ptes-and-pmds
+++ a/mm/damon/vaddr.c
@@ -311,7 +311,7 @@ static int damon_mkold_pmd_entry(pmd_t *
}
if (pmd_trans_huge(*pmd)) {
- damon_pmdp_mkold(pmd, walk->mm, addr);
+ damon_pmdp_mkold(pmd, walk->vma, addr);
spin_unlock(ptl);
return 0;
}
@@ -323,7 +323,7 @@ static int damon_mkold_pmd_entry(pmd_t *
pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
if (!pte_present(*pte))
goto out;
- damon_ptep_mkold(pte, walk->mm, addr);
+ damon_ptep_mkold(pte, walk->vma, addr);
out:
pte_unmap_unlock(pte, ptl);
return 0;
_
Patches currently in -mm which might be from ryan.roberts(a)arm.com are
mm-vmalloc-must-set-pte-via-arch-code.patch
mm-damon-ops-common-atomically-test-and-clear-young-on-ptes-and-pmds.patch
mm-damon-ops-common-refactor-to-use-ptepmdp_clear_young_notify.patch
mm-fix-failure-to-unmap-pte-on-highmem-systems.patch
From: Lino Sanfilippo <l.sanfilippo(a)kunbus.com>
With commit 858e8b792d06 ("tpm, tpm_tis: Avoid cache incoherency in test
for interrupts") bit accessor functions are used to access flags in
tpm_tis_data->flags.
However these functions expect bit numbers, while the flags are defined as
bit masks in enum tpm_tis_flag.
Fix this inconsistency by using numbers instead of masks also for the flags
in the enum.
Reported-by: Pavel Machek <pavel(a)denx.de>
Fixes: 858e8b792d06 ("tpm, tpm_tis: Avoid cache incoherency in test for interrupts")
Signed-off-by: Lino Sanfilippo <l.sanfilippo(a)kunbus.com>
Cc: stable(a)vger.kernel.org
---
drivers/char/tpm/tpm_tis_core.h | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/drivers/char/tpm/tpm_tis_core.h b/drivers/char/tpm/tpm_tis_core.h
index e978f457fd4d..610bfadb6acf 100644
--- a/drivers/char/tpm/tpm_tis_core.h
+++ b/drivers/char/tpm/tpm_tis_core.h
@@ -84,10 +84,10 @@ enum tis_defaults {
#define ILB_REMAP_SIZE 0x100
enum tpm_tis_flags {
- TPM_TIS_ITPM_WORKAROUND = BIT(0),
- TPM_TIS_INVALID_STATUS = BIT(1),
- TPM_TIS_DEFAULT_CANCELLATION = BIT(2),
- TPM_TIS_IRQ_TESTED = BIT(3),
+ TPM_TIS_ITPM_WORKAROUND = 0,
+ TPM_TIS_INVALID_STATUS = 1,
+ TPM_TIS_DEFAULT_CANCELLATION = 2,
+ TPM_TIS_IRQ_TESTED = 3,
};
struct tpm_tis_data {
base-commit: 7877cb91f1081754a1487c144d85dc0d2e2e7fc4
--
2.40.1
usb_udc_vbus_handler() can be invoked from interrupt context by irq
handlers of the gadget drivers, however, usb_udc_connect_control() has
to run in non-atomic context due to the following:
a. Some of the gadget driver implementations expect the ->pullup
callback to be invoked in non-atomic context.
b. usb_gadget_disconnect() acquires udc_lock which is a mutex.
Hence offload invocation of usb_udc_connect_control()
to workqueue.
Cc: stable(a)vger.kernel.org
Fixes: 1016fc0c096c ("USB: gadget: Fix obscure lockdep violation for udc_mutex")
Signed-off-by: Badhri Jagan Sridharan <badhri(a)google.com>
---
Changes since v1:
- Address Alan Stern's comment on usb_udc_vbus_handler invocation from
atomic context:
* vbus_events_lock is now a spinlock and allocations in
* usb_udc_vbus_handler are atomic now.
Changes since v2:
- Addressing Alan Stern's comments:
** connect_lock is now held by callers of
* usb_gadget_pullup_update_locked() and gadget_(un)bind_driver() does
* notdirectly hold the lock.
** Both usb_gadget_(dis)connect() and usb_udc_vbus_handler() would
* set/clear udc->vbus and invoke usb_gadget_pullup_update_locked.
** Add "unbinding" to prevent new connections after the gadget is being
* unbound.
Changes since v3:
** Made a minor cleanup which I missed to do in v3 in
* usb_udc_vbus_handler().
Changes since v4:
- Addressing Alan Stern's comments:
** usb_udc_vbus_handler() now offloads invocation of usb_udc_connect_control()
* from workqueue.
** Dropped vbus_events list as this was redundant. Updating to the
* latest value is suffice
---
drivers/usb/gadget/udc/core.c | 19 ++++++++++++++++++-
1 file changed, 18 insertions(+), 1 deletion(-)
diff --git a/drivers/usb/gadget/udc/core.c b/drivers/usb/gadget/udc/core.c
index 52e6d2e84e35..44a9f32679b5 100644
--- a/drivers/usb/gadget/udc/core.c
+++ b/drivers/usb/gadget/udc/core.c
@@ -48,6 +48,7 @@ struct usb_udc {
struct list_head list;
bool vbus;
bool started;
+ struct work_struct vbus_work;
};
static struct class *udc_class;
@@ -1086,6 +1087,13 @@ static void usb_udc_connect_control(struct usb_udc *udc)
usb_gadget_disconnect(udc->gadget);
}
+static void vbus_event_work(struct work_struct *work)
+{
+ struct usb_udc *udc = container_of(work, struct usb_udc, vbus_work);
+
+ usb_udc_connect_control(udc);
+}
+
/**
* usb_udc_vbus_handler - updates the udc core vbus status, and try to
* connect or disconnect gadget
@@ -1094,6 +1102,13 @@ static void usb_udc_connect_control(struct usb_udc *udc)
*
* The udc driver calls it when it wants to connect or disconnect gadget
* according to vbus status.
+ *
+ * This function can be invoked from interrupt context by irq handlers of the gadget drivers,
+ * however, usb_udc_connect_control() has to run in non-atomic context due to the following:
+ * a. Some of the gadget driver implementations expect the ->pullup callback to be invoked in
+ * non-atomic context.
+ * b. usb_gadget_disconnect() acquires udc_lock which is a mutex.
+ * Hence offload invocation of usb_udc_connect_control() to workqueue.
*/
void usb_udc_vbus_handler(struct usb_gadget *gadget, bool status)
{
@@ -1101,7 +1116,7 @@ void usb_udc_vbus_handler(struct usb_gadget *gadget, bool status)
if (udc) {
udc->vbus = status;
- usb_udc_connect_control(udc);
+ schedule_work(&udc->vbus_work);
}
}
EXPORT_SYMBOL_GPL(usb_udc_vbus_handler);
@@ -1328,6 +1343,7 @@ int usb_add_gadget(struct usb_gadget *gadget)
mutex_lock(&udc_lock);
list_add_tail(&udc->list, &udc_list);
mutex_unlock(&udc_lock);
+ INIT_WORK(&udc->vbus_work, vbus_event_work);
ret = device_add(&udc->dev);
if (ret)
@@ -1558,6 +1574,7 @@ static void gadget_unbind_driver(struct device *dev)
kobject_uevent(&udc->dev.kobj, KOBJ_CHANGE);
+ cancel_work_sync(&udc->vbus_work);
usb_gadget_disconnect(gadget);
usb_gadget_disable_async_callbacks(udc);
if (gadget->irq)
base-commit: 046895105d9666ab56e86ce8dd9786f8003125c6
--
2.41.0.rc0.172.g3f132b7071-goog