Passing the host topology to the guest is almost certainly wrong
and will confuse the scheduler. In addition, several fields of
these CPUID leaves vary on each processor; it is simply impossible to
return the right values from KVM_GET_SUPPORTED_CPUID in such a way that
they can be passed to KVM_SET_CPUID2.
The values that will most likely prevent confusion are all zeroes.
Userspace will have to override them anyway if it wishes to present a
specific topology to the guest.
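For illustration, a VMM that wants to present a specific topology could
post-process the KVM_GET_SUPPORTED_CPUID output along these lines (a
minimal userspace sketch, not part of this patch; a real VMM would then
fill in per-vCPU values such as the APIC ID, as described in the
documentation hunk below):

    #include <linux/kvm.h>

    /* Zero the topology leaves before passing the array to KVM_SET_CPUID2. */
    static void clear_topology_leaves(struct kvm_cpuid2 *cpuid)
    {
        struct kvm_cpuid_entry2 *e;
        unsigned int i;

        for (i = 0; i < cpuid->nent; i++) {
            e = &cpuid->entries[i];
            if (e->function == 0x0b || e->function == 0x1f ||
                e->function == 0x8000001e)
                e->eax = e->ebx = e->ecx = e->edx = 0;
        }
    }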
Cc: stable(a)vger.kernel.org
Signed-off-by: Paolo Bonzini <pbonzini(a)redhat.com>
---
Documentation/virt/kvm/api.rst | 14 ++++++++++++++
arch/x86/kvm/cpuid.c | 32 ++++++++++++++++----------------
2 files changed, 30 insertions(+), 16 deletions(-)
diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
index eee9f857a986..20f4f6b302ff 100644
--- a/Documentation/virt/kvm/api.rst
+++ b/Documentation/virt/kvm/api.rst
@@ -8249,6 +8249,20 @@ CPU[EAX=1]:ECX[24] (TSC_DEADLINE) is not reported by ``KVM_GET_SUPPORTED_CPUID``
It can be enabled if ``KVM_CAP_TSC_DEADLINE_TIMER`` is present and the kernel
has enabled in-kernel emulation of the local APIC.
+CPU topology
+~~~~~~~~~~~~
+
+Several CPUID values include topology information for the host CPU:
+0x0b and 0x1f for Intel systems, 0x8000001e for AMD systems. Different
+versions of KVM return different values for this information and userspace
+should not rely on it. Currently they return all zeroes.
+
+If userspace wishes to set up a guest topology, it should be careful that
+the values of these three leaves differ for each CPU. In particular,
+the APIC ID is found in EDX for all subleaves of 0x0b and 0x1f, and in EAX
+for 0x8000001e; the latter also encodes the core id and node id in bits
+7:0 of EBX and ECX respectively.
+
Obsolete ioctls and capabilities
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 0810e93cbedc..164bfb7e7a16 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -759,16 +759,22 @@ struct kvm_cpuid_array {
int nent;
};
+static struct kvm_cpuid_entry2 *get_next_cpuid(struct kvm_cpuid_array *array)
+{
+ if (array->nent >= array->maxnent)
+ return NULL;
+
+ return &array->entries[array->nent++];
+}
+
static struct kvm_cpuid_entry2 *do_host_cpuid(struct kvm_cpuid_array *array,
u32 function, u32 index)
{
- struct kvm_cpuid_entry2 *entry;
+ struct kvm_cpuid_entry2 *entry = get_next_cpuid(array);
- if (array->nent >= array->maxnent)
+ if (!entry)
return NULL;
- entry = &array->entries[array->nent++];
-
memset(entry, 0, sizeof(*entry));
entry->function = function;
entry->index = index;
@@ -945,22 +951,13 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
entry->edx = edx.full;
break;
}
- /*
- * Per Intel's SDM, the 0x1f is a superset of 0xb,
- * thus they can be handled by common code.
- */
case 0x1f:
case 0xb:
/*
- * Populate entries until the level type (ECX[15:8]) of the
- * previous entry is zero. Note, CPUID EAX.{0x1f,0xb}.0 is
- * the starting entry, filled by the primary do_host_cpuid().
+ * No topology; a valid topology is indicated by the presence
+ * of subleaf 1.
*/
- for (i = 1; entry->ecx & 0xff00; ++i) {
- entry = do_host_cpuid(array, function, i);
- if (!entry)
- goto out;
- }
+ entry->eax = entry->ebx = entry->ecx = 0;
break;
case 0xd: {
u64 permitted_xcr0 = kvm_caps.supported_xcr0 & xstate_get_guest_group_perm();
@@ -1193,6 +1190,9 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
entry->ebx = entry->ecx = entry->edx = 0;
break;
case 0x8000001e:
+ /* Do not return host topology information. */
+ entry->eax = entry->ebx = entry->ecx = 0;
+ entry->edx = 0; /* reserved */
break;
case 0x8000001F:
if (!kvm_cpu_cap_has(X86_FEATURE_SEV)) {
--
2.31.1
The previous algorithm was pretty broken.
- The inner loop had a '(m > m_max)' condition, yet the value of 'm'
would only increase in each iteration;
- Each iteration would actually multiply 'm' by two, so there was no need
to re-compute the whole equation in each iteration;
- It would loop until (m & 1) == 0, which means it would loop at most
once;
- The outer loop would divide the 'n' value by two at the end of each
iteration. This meant that for a 12 MHz parent clock and a 1.2 GHz
requested clock, it would first try n=12, then n=6, then n=3, then
n=1, none of which would work; the only valid value is n=2 in this
case.
Simplify this algorithm with a single for loop, which decrements 'n'
after each iteration, addressing all of the above problems.
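For the example above (12 MHz parent, 1.2 GHz requested rate), the new
loop behaves as follows; a standalone sketch assuming m_max = 255 (the
real value comes from pll_info->m_bits) and with the clamping of 'n'
omitted:

    unsigned int rate = 1200, parent_rate = 12; /* already divided by MHZ */
    unsigned int m_max = 255, m, n, od;

    for (m = m_max, n = 12; m >= m_max && n >= 2; n--) {
        m = rate * n / parent_rate; /* n=12 -> m=1200, ..., n=2 -> m=200 */
        od = m & 1;                 /* if m is odd, double it and let the */
        m <<= od;                   /* od post-divider (/2) compensate    */
    }
    /* The final n-- overshoots by one, hence *pn = n + 1 = 2:
     * 12 MHz / 2 * 200 / 1 = 1.2 GHz. */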
Fixes: bdbfc029374f ("clk: ingenic: Add support for the JZ4760")
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Paul Cercueil <paul(a)crapouillou.net>
---
drivers/clk/ingenic/jz4760-cgu.c | 18 ++++++++----------
1 file changed, 8 insertions(+), 10 deletions(-)
diff --git a/drivers/clk/ingenic/jz4760-cgu.c b/drivers/clk/ingenic/jz4760-cgu.c
index ecd395ac8a28..e407f00bd594 100644
--- a/drivers/clk/ingenic/jz4760-cgu.c
+++ b/drivers/clk/ingenic/jz4760-cgu.c
@@ -58,7 +58,7 @@ jz4760_cgu_calc_m_n_od(const struct ingenic_cgu_pll_info *pll_info,
unsigned long rate, unsigned long parent_rate,
unsigned int *pm, unsigned int *pn, unsigned int *pod)
{
- unsigned int m, n, od, m_max = (1 << pll_info->m_bits) - 2;
+ unsigned int m, n, od, m_max = (1 << pll_info->m_bits) - 1;
/* The frequency after the N divider must be between 1 and 50 MHz. */
n = parent_rate / (1 * MHZ);
@@ -66,19 +66,17 @@ jz4760_cgu_calc_m_n_od(const struct ingenic_cgu_pll_info *pll_info,
/* The N divider must be >= 2. */
n = clamp_val(n, 2, 1 << pll_info->n_bits);
- for (;; n >>= 1) {
- od = (unsigned int)-1;
+ rate /= MHZ;
+ parent_rate /= MHZ;
- do {
- m = (rate / MHZ) * (1 << ++od) * n / (parent_rate / MHZ);
- } while ((m > m_max || m & 1) && (od < 4));
-
- if (od < 4 && m >= 4 && m <= m_max)
- break;
+ for (m = m_max; m >= m_max && n >= 2; n--) {
+ m = rate * n / parent_rate;
+ od = m & 1;
+ m <<= od;
}
*pm = m;
- *pn = n;
+ *pn = n + 1;
*pod = 1 << od;
}
--
2.35.1
Current usage of kvm_io_device requires users to destruct it with an extra
call to kvm_iodevice_destructor() after the device gets unregistered from
the kvm_io_bus. This is not necessary and can cause errors if a user
forgets to make the extra call.
Simplify the usage by folding kvm_iodevice_destructor() into
kvm_io_bus_unregister_dev(). This reduces LOCs a bit for users and avoids
leaks caused by forgetting to destruct the device explicitly.
The fix was originally provided by Sean Christopherson.
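In practice this turns the old two-step pattern into a single call
(illustrative snippet; bus index and error handling elided):

    /* before: unregister, then destruct by hand (easy to forget) */
    kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS, &dev->dev);
    kvm_iodevice_destructor(&dev->dev);

    /* after: unregistering the device also destructs it */
    kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS, &dev->dev);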
Link: https://lore.kernel.org/lkml/DS0PR11MB6373F27D0EE6CD28C784478BDCEC9@DS0PR11…
Fixes: 5d3c4c79384a ("KVM: Stop looking for coalesced MMIO zones if the bus is destroyed")
Cc: stable(a)vger.kernel.org
Reported-by: Liu Jingfeng <liujingfeng(a)qianxin.com>
Signed-off-by: Wei Wang <wei.w.wang(a)intel.com>
---
include/kvm/iodev.h | 6 ------
virt/kvm/coalesced_mmio.c | 1 -
virt/kvm/eventfd.c | 1 -
virt/kvm/kvm_main.c | 24 +++++++++++++++---------
4 files changed, 15 insertions(+), 17 deletions(-)
diff --git a/include/kvm/iodev.h b/include/kvm/iodev.h
index d75fc4365746..56619e33251e 100644
--- a/include/kvm/iodev.h
+++ b/include/kvm/iodev.h
@@ -55,10 +55,4 @@ static inline int kvm_iodevice_write(struct kvm_vcpu *vcpu,
: -EOPNOTSUPP;
}
-static inline void kvm_iodevice_destructor(struct kvm_io_device *dev)
-{
- if (dev->ops->destructor)
- dev->ops->destructor(dev);
-}
-
#endif /* __KVM_IODEV_H__ */
diff --git a/virt/kvm/coalesced_mmio.c b/virt/kvm/coalesced_mmio.c
index 0be80c213f7f..d7135a5e76f8 100644
--- a/virt/kvm/coalesced_mmio.c
+++ b/virt/kvm/coalesced_mmio.c
@@ -195,7 +195,6 @@ int kvm_vm_ioctl_unregister_coalesced_mmio(struct kvm *kvm,
*/
if (r)
break;
- kvm_iodevice_destructor(&dev->dev);
}
}
diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
index 2a3ed401ce46..1b277afb545b 100644
--- a/virt/kvm/eventfd.c
+++ b/virt/kvm/eventfd.c
@@ -898,7 +898,6 @@ kvm_deassign_ioeventfd_idx(struct kvm *kvm, enum kvm_bus bus_idx,
bus = kvm_get_bus(kvm, bus_idx);
if (bus)
bus->ioeventfd_count--;
- ioeventfd_release(p);
ret = 0;
break;
}
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 13e88297f999..582757ebdce6 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -5200,6 +5200,12 @@ static struct notifier_block kvm_reboot_notifier = {
.priority = 0,
};
+static void kvm_iodevice_destructor(struct kvm_io_device *dev)
+{
+ if (dev->ops->destructor)
+ dev->ops->destructor(dev);
+}
+
static void kvm_io_bus_destroy(struct kvm_io_bus *bus)
{
int i;
@@ -5423,7 +5429,7 @@ int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
struct kvm_io_device *dev)
{
- int i, j;
+ int i;
struct kvm_io_bus *new_bus, *bus;
lockdep_assert_held(&kvm->slots_lock);
@@ -5453,18 +5459,18 @@ int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
synchronize_srcu_expedited(&kvm->srcu);
- /* Destroy the old bus _after_ installing the (null) bus. */
+ /*
+ * If (null) bus is installed, destroy the old bus, including all the
+ * attached devices. Otherwise, destroy the caller's device only.
+ */
if (!new_bus) {
pr_err("kvm: failed to shrink bus, removing it completely\n");
- for (j = 0; j < bus->dev_count; j++) {
- if (j == i)
- continue;
- kvm_iodevice_destructor(bus->range[j].dev);
- }
+ kvm_io_bus_destroy(bus);
+ return -ENOMEM;
}
- kfree(bus);
- return new_bus ? 0 : -ENOMEM;
+ kvm_iodevice_destructor(dev);
+ return 0;
}
struct kvm_io_device *kvm_io_bus_get_dev(struct kvm *kvm, enum kvm_bus bus_idx,
--
2.27.0
From: Roberto Sassu <roberto.sassu(a)huawei.com>
Commit 98de59bfe4b2f ("take calculation of final prot in
security_mmap_file() into a helper") moved the code to update prot with the
actual protection flags to be granted to the requestor by the kernel to a
helper called mmap_prot(). However, the patch didn't update the argument
passed to ima_file_mmap(), making it receive the requested prot instead of
the final computed prot.
A possible consequence is that files mmapped as executable might not be
measured/appraised if PROT_EXEC is not requested but subsequently added to
the final prot.
Replace prot with mmap_prot(file, prot) as the second argument of
ima_file_mmap() to restore the original behavior.
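For context, the main case where the final prot differs from the
requested one is the READ_IMPLIES_EXEC personality; below is a simplified
rendering of the mmap_prot() helper (the !MMU and mmap_capabilities
details are omitted):

    static unsigned long mmap_prot(struct file *file, unsigned long prot)
    {
        /* nothing to do unless PROT_READ is requested without PROT_EXEC */
        if ((prot & (PROT_READ | PROT_EXEC)) != PROT_READ)
            return prot;
        /* ...by a task whose personality implies exec on readable maps */
        if (!(current->personality & READ_IMPLIES_EXEC))
            return prot;
        /* anonymous mappings and files on exec mounts gain PROT_EXEC */
        if (!file || !path_noexec(&file->f_path))
            return prot | PROT_EXEC;
        return prot;
    }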
Cc: stable(a)vger.kernel.org
Fixes: 98de59bfe4b2 ("take calculation of final prot in security_mmap_file() into a helper")
Signed-off-by: Roberto Sassu <roberto.sassu(a)huawei.com>
---
security/security.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/security/security.c b/security/security.c
index d1571900a8c7..0d2359d588a1 100644
--- a/security/security.c
+++ b/security/security.c
@@ -1666,7 +1666,7 @@ int security_mmap_file(struct file *file, unsigned long prot,
mmap_prot(file, prot), flags);
if (ret)
return ret;
- return ima_file_mmap(file, prot);
+ return ima_file_mmap(file, mmap_prot(file, prot));
}
int security_mmap_addr(unsigned long addr)
--
2.25.1
Backport of:
commit 0d362be5b142 ("Makefile: link with -z noexecstack --no-warn-rwx-segments")
breaks the arm64 Build ID when CONFIG_MODVERSIONS=y, for all kernels
from: commit e4484a495586 ("Merge tag 'kbuild-fixes-v5.0' of git://git.kernel.org/pub/scm/linux/kernel/git/masahiroy/linux-kbuild")
until: commit df202b452fe6 ("Merge tag 'kbuild-v5.19' of git://git.kernel.org/pub/scm/linux/kernel/git/masahiroy/linux-kbuild")
Linus's tree doesn't have this issue since 0d362be5b142 was merged
after df202b452fe6 which included:
commit 7b4537199a4a ("kbuild: link symbol CRCs at final link, removing CONFIG_MODULE_REL_CRCS")
This kernel's kbuild CONFIG_MODVERSIONS tooling compiles and links .S
targets with -r (relocatable) and now -z noexecstack, which results in ld
adding a .note.GNU-stack section to the .o files.
Final linking of vmlinux should add a .NOTES segment containing the
Build ID, but does NOT (on some architectures, like arm64) if a
.note.GNU-stack section is found in the .o files supplied during the link
of vmlinux.
DISCARD the .note.GNU-stack sections of .S targets. The final link of
vmlinux then properly adds a .NOTES segment containing the Build ID, which
can be read using tools like 'readelf -n'.
Fixes: 0d362be5b142 ("Makefile: link with -z noexecstack --no-warn-rwx-segments")
Cc: <stable(a)vger.kernel.org> # 5.15, 5.10, 5.4
Cc: <linux-kbuild(a)vger.kernel.org>
Cc: Nick Desaulniers <ndesaulniers(a)google.com>
Cc: Masahiro Yamada <masahiroy(a)kernel.org>
Cc: Nicholas Piggin <npiggin(a)gmail.com>
Cc: Michal Marek <michal.lkml(a)markovi.net>
Cc: Nathan Chancellor <nathan(a)kernel.org>
Signed-off-by: Tom Saeger <tom.saeger(a)oracle.com>
---
v2:
- Changed approach to append DISCARD section to generated linker script.
- ld no longer emits a warning (which was the intent of 0d362be5b142);
this addresses Nick's v1 feedback.
- this is applied to all arches, not just arm64
- added commit refs and notes why this doesn't occur in Linus's tree
to address Greg's v1 feedback.
- added Fixes: 0d362be5b142 requested by Nick
- added note to changelog for 7b4537199a4a requested by Nick
- build tested on arm64 and x86
version works (vmlinux contains Build ID)
v4.14.302 x86, arm64
v4.14.302.patched x86, arm64
v4.19.269 x86, arm64
v4.19.269.patched x86, arm64
v5.4.227 x86
v5.4.227.patched x86, arm64
v5.10.159 x86
v5.10.159.patched x86, arm64
v5.15.83 x86
v5.15.83.patched x86, arm64
v1: https://lore.kernel.org/all/cover.1670358255.git.tom.saeger@oracle.com/
scripts/Makefile.build | 2 ++
1 file changed, 2 insertions(+)
diff --git a/scripts/Makefile.build b/scripts/Makefile.build
index 17aa8ef2d52a..e3939676eeb5 100644
--- a/scripts/Makefile.build
+++ b/scripts/Makefile.build
@@ -379,6 +379,8 @@ cmd_modversions_S = \
if $(OBJDUMP) -h $@ | grep -q __ksymtab; then \
$(call cmd_gensymtypes_S,$(KBUILD_SYMTYPES),$(@:.o=.symtypes)) \
> $(@D)/.tmp_$(@F:.o=.ver); \
+ echo "SECTIONS { /DISCARD/ : { *(.note.GNU-stack) } }" \
+ >> $(@D)/.tmp_$(@F:.o=.ver); \
\
$(LD) $(KBUILD_LDFLAGS) -r -o $(@D)/.tmp_$(@F) $@ \
-T $(@D)/.tmp_$(@F:.o=.ver); \
base-commit: fd6d66840b4269da4e90e1ea807ae3197433bc66
--
2.38.1
A non-first waiter can potentially spin in the for loop of
rwsem_down_write_slowpath() without sleeping, but still fail to acquire
the lock even if the rwsem is free, if the following sequence happens:
Non-first RT waiter               First waiter                Lock holder
-------------------               ------------                -----------
Acquire wait_lock
rwsem_try_write_lock():
  Set handoff bit if RT or
    wait too long
  Set waiter->handoff_set
Release wait_lock
                                  Acquire wait_lock
                                  Inherit waiter->handoff_set
                                  Release wait_lock
                                                              Clear owner
                                                              Release lock
if (waiter.handoff_set) {
  rwsem_spin_on_owner();
  if (OWNER_NULL)
    goto trylock_again;
}
trylock_again:
Acquire wait_lock
rwsem_try_write_lock():
  if (first->handoff_set && (waiter != first))
    return false;
Release wait_lock
A non-first waiter cannot really acquire the rwsem even if it mistakenly
believes that it can spin on the OWNER_NULL value. If that waiter happens
to be an RT task running on the same CPU as the first waiter, it can
block the first waiter from acquiring the rwsem, leading to a live lock.
Fix this problem by making sure that a non-first waiter cannot spin in
the slowpath loop without sleeping.
Fixes: d257cc8cb8d5 ("locking/rwsem: Make handoff bit handling more consistent")
Reviewed-and-tested-by: Mukesh Ojha <quic_mojha(a)quicinc.com>
Signed-off-by: Waiman Long <longman(a)redhat.com>
Cc: stable(a)vger.kernel.org
---
kernel/locking/rwsem.c | 19 +++++++++----------
1 file changed, 9 insertions(+), 10 deletions(-)
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index 44873594de03..be2df9ea7c30 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -624,18 +624,16 @@ static inline bool rwsem_try_write_lock(struct rw_semaphore *sem,
*/
if (first->handoff_set && (waiter != first))
return false;
-
- /*
- * First waiter can inherit a previously set handoff
- * bit and spin on rwsem if lock acquisition fails.
- */
- if (waiter == first)
- waiter->handoff_set = true;
}
new = count;
if (count & RWSEM_LOCK_MASK) {
+ /*
+ * A waiter (first or not) can set the handoff bit
+ * if it is an RT task or wait in the wait queue
+ * for too long.
+ */
if (has_handoff || (!rt_task(waiter->task) &&
!time_after(jiffies, waiter->timeout)))
return false;
@@ -651,11 +649,12 @@ static inline bool rwsem_try_write_lock(struct rw_semaphore *sem,
} while (!atomic_long_try_cmpxchg_acquire(&sem->count, &count, new));
/*
- * We have either acquired the lock with handoff bit cleared or
- * set the handoff bit.
+ * We have either acquired the lock with handoff bit cleared or set
+ * the handoff bit. Only the first waiter can have its handoff_set
+ * set here to enable optimistic spinning in slowpath loop.
*/
if (new & RWSEM_FLAG_HANDOFF) {
- waiter->handoff_set = true;
+ first->handoff_set = true;
lockevent_inc(rwsem_wlock_handoff);
return false;
}
--
2.31.1
This device has a trackstick whose events are sent through the same HID
device as the touchpad.
Unfortunately there are 2 mice attached to that device descriptor, with
the first one for the touchpad while the second is for the trackstick.
Force all devices to be exported.
Cc: stable(a)vger.kernel.org # 5.8+
Link: https://bugzilla.redhat.com/show_bug.cgi?id=2154204
Signed-off-by: Benjamin Tissoires <benjamin.tissoires(a)redhat.com>
---
drivers/hid/hid-multitouch.c | 6 ++++++
1 file changed, 6 insertions(+)
diff --git a/drivers/hid/hid-multitouch.c b/drivers/hid/hid-multitouch.c
index 91a4d3fc30e0..91ac72b32d45 100644
--- a/drivers/hid/hid-multitouch.c
+++ b/drivers/hid/hid-multitouch.c
@@ -1860,6 +1860,12 @@ static const struct hid_device_id mt_devices[] = {
USB_VENDOR_ID_ASUSTEK,
USB_DEVICE_ID_ASUSTEK_T304_KEYBOARD) },
+ /* Asus ExpertBook with trackstick */
+ { .driver_data = MT_CLS_WIN_8_FORCE_MULTI_INPUT,
+ HID_DEVICE(BUS_I2C, HID_GROUP_MULTITOUCH_WIN_8,
+ USB_VENDOR_ID_ELAN,
+ 0x3148) },
+
/* Atmel panels */
{ .driver_data = MT_CLS_SERIAL,
MT_USB_DEVICE(USB_VENDOR_ID_ATMEL,
--
2.38.1
The "duty > period" trick to force the pin level of a disabled TCU2
channel would only work when the channel had been enabled previously.
Address this issue by enabling the PWM mode in jz4740_pwm_disable()
(I know, right) so that the "duty > period" trick works before disabling
the PWM channel right after.
This issue went unnoticed, as the PWM pins on the majority of the boards
tested would default to the inactive level once the corresponding TCU
clock was enabled, so the first call to jz4740_pwm_disable() would not
actually change the pin levels.
On the GCW Zero however, the PWM pin for the backlight (PWM1, which is
a TCU2 channel) goes active as soon as the timer1 clock is enabled.
Since the jz4740_pwm_disable() function did not work on channels not
previously enabled, the backlight would shine at full brightness from
the moment the backlight driver would probe, until the backlight driver
tried to *enable* the PWM output.
With this fix, the PWM pins will be forced inactive as soon as
jz4740_pwm_apply() is called (and might be reconfigured to active if
dictated by the pwm_state). This means that there is still a tiny time
frame between the .request() and .apply() callbacks where the PWM pin
might be active. Sadly, there is no way to fix this issue: it is
impossible to write a PWM channel's registers if the corresponding clock
is not enabled, and enabling the clock is what causes the PWM pin to go
active.
There is a workaround, though, which complements this fix: simply
starting the backlight driver (or any PWM client driver) with an "init"
pinctrl state that sets the pin as an inactive GPIO. Once the driver is
probed and the pinctrl state switches to "default", the regular PWM pin
configuration can be used as it will be properly driven.
Fixes: c2693514a0a1 ("pwm: jz4740: Obtain regmap from parent node")
Signed-off-by: Paul Cercueil <paul(a)crapouillou.net>
Cc: stable(a)vger.kernel.org
---
drivers/pwm/pwm-jz4740.c | 7 +++++--
1 file changed, 5 insertions(+), 2 deletions(-)
diff --git a/drivers/pwm/pwm-jz4740.c b/drivers/pwm/pwm-jz4740.c
index a5fdf97c0d2e..228eb104bf1e 100644
--- a/drivers/pwm/pwm-jz4740.c
+++ b/drivers/pwm/pwm-jz4740.c
@@ -102,11 +102,14 @@ static void jz4740_pwm_disable(struct pwm_chip *chip, struct pwm_device *pwm)
struct jz4740_pwm_chip *jz = to_jz4740(chip);
/*
- * Set duty > period. This trick allows the TCU channels in TCU2 mode to
- * properly return to their init level.
+ * Set duty > period, then enable PWM mode and start the counter.
+ * This trick allows to force the inactive pin level for the TCU2
+ * channels.
*/
regmap_write(jz->map, TCU_REG_TDHRc(pwm->hwpwm), 0xffff);
regmap_write(jz->map, TCU_REG_TDFRc(pwm->hwpwm), 0x0);
+ regmap_set_bits(jz->map, TCU_REG_TCSRc(pwm->hwpwm), TCU_TCSR_PWM_EN);
+ regmap_write(jz->map, TCU_REG_TESR, BIT(pwm->hwpwm));
/*
* Disable PWM output.
--
2.35.1
There is a lock inversion and rwsem read-lock recursion in the devfreq
target callback which can lead to deadlocks.
Specifically, ufshcd_devfreq_scale() already holds a clk_scaling_lock
read lock when toggling the write booster, which involves taking the
dev_cmd mutex before taking another clk_scaling_lock read lock.
This can lead to a deadlock if another thread:
1) tries to acquire the dev_cmd and clk_scaling locks in the correct
order, or
2) takes a clk_scaling write lock before the attempt to take the
clk_scaling read lock a second time.
Fix this by dropping the clk_scaling_lock before toggling the write
booster as was done before commit 0e9d4ca43ba8 ("scsi: ufs: Protect some
contexts from unexpected clock scaling").
While the devfreq callbacks are already serialised, add a second
serialising mutex to handle the unlikely case where a callback triggered
through the devfreq sysfs interface is racing with a request to disable
clock scaling through the UFS controller 'clkscale_enable' sysfs
attribute. This could otherwise lead to the write booster being left
disabled after having disabled clock scaling.
Also take the new mutex in ufshcd_clk_scaling_allow() to make sure that
any pending write booster update has completed on return.
Note that this currently only affects Qualcomm platforms since commit
87bd05016a64 ("scsi: ufs: core: Allow host driver to disable wb toggling
during clock scaling").
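For clarity, the lock ordering in the scaling path after this patch
becomes (a sketch distilled from the diff below):

    mutex_lock(&hba->wb_mutex);
    down_write(&hba->clk_scaling_lock);
    /* ... scale gear and clocks ... */
    up_write(&hba->clk_scaling_lock);
    /* toggle the write booster without holding clk_scaling_lock */
    if (ufshcd_enable_wb_if_scaling_up(hba))
        ufshcd_wb_toggle(hba, scale_up);
    mutex_unlock(&hba->wb_mutex);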
The lock inversion (i.e. 1 above) was reported by lockdep as:
======================================================
WARNING: possible circular locking dependency detected
6.1.0-next-20221216 #211 Not tainted
------------------------------------------------------
kworker/u16:2/71 is trying to acquire lock:
ffff076280ba98a0 (&hba->dev_cmd.lock){+.+.}-{3:3}, at: ufshcd_query_flag+0x50/0x1c0
but task is already holding lock:
ffff076280ba9cf0 (&hba->clk_scaling_lock){++++}-{3:3}, at: ufshcd_devfreq_scale+0x2b8/0x380
which lock already depends on the new lock.
the existing dependency chain (in reverse order) is:
-> #1 (&hba->clk_scaling_lock){++++}-{3:3}:
lock_acquire+0x68/0x90
down_read+0x58/0x80
ufshcd_exec_dev_cmd+0x70/0x2c0
ufshcd_verify_dev_init+0x68/0x170
ufshcd_probe_hba+0x398/0x1180
ufshcd_async_scan+0x30/0x320
async_run_entry_fn+0x34/0x150
process_one_work+0x288/0x6c0
worker_thread+0x74/0x450
kthread+0x118/0x120
ret_from_fork+0x10/0x20
-> #0 (&hba->dev_cmd.lock){+.+.}-{3:3}:
__lock_acquire+0x12a0/0x2240
lock_acquire.part.0+0xcc/0x220
lock_acquire+0x68/0x90
__mutex_lock+0x98/0x430
mutex_lock_nested+0x2c/0x40
ufshcd_query_flag+0x50/0x1c0
ufshcd_query_flag_retry+0x64/0x100
ufshcd_wb_toggle+0x5c/0x120
ufshcd_devfreq_scale+0x2c4/0x380
ufshcd_devfreq_target+0xf4/0x230
devfreq_set_target+0x84/0x2f0
devfreq_update_target+0xc4/0xf0
devfreq_monitor+0x38/0x1f0
process_one_work+0x288/0x6c0
worker_thread+0x74/0x450
kthread+0x118/0x120
ret_from_fork+0x10/0x20
other info that might help us debug this:
Possible unsafe locking scenario:
CPU0 CPU1
---- ----
lock(&hba->clk_scaling_lock);
lock(&hba->dev_cmd.lock);
lock(&hba->clk_scaling_lock);
lock(&hba->dev_cmd.lock);
*** DEADLOCK ***
Fixes: 0e9d4ca43ba8 ("scsi: ufs: Protect some contexts from unexpected clock scaling")
Cc: stable(a)vger.kernel.org # 5.12
Cc: Can Guo <quic_cang(a)quicinc.com>
Signed-off-by: Johan Hovold <johan+linaro(a)kernel.org>
---
This issue has apparently been known for over a year [1] without anyone
bothering to fix it. Aside from the potential deadlocks, this also means
that developers using Qualcomm platforms cannot use lockdep to prevent
further issues like this from being introduced in other places.
Johan
[1] https://lore.kernel.org/lkml/1631843521-2863-1-git-send-email-cang@codeauro…
drivers/ufs/core/ufshcd.c | 29 +++++++++++++++--------------
include/ufs/ufshcd.h | 2 ++
2 files changed, 17 insertions(+), 14 deletions(-)
diff --git a/drivers/ufs/core/ufshcd.c b/drivers/ufs/core/ufshcd.c
index bda61be5f035..5c3821b2fcf8 100644
--- a/drivers/ufs/core/ufshcd.c
+++ b/drivers/ufs/core/ufshcd.c
@@ -1234,12 +1234,14 @@ static int ufshcd_clock_scaling_prepare(struct ufs_hba *hba)
* clock scaling is in progress
*/
ufshcd_scsi_block_requests(hba);
+ mutex_lock(&hba->wb_mutex);
down_write(&hba->clk_scaling_lock);
if (!hba->clk_scaling.is_allowed ||
ufshcd_wait_for_doorbell_clr(hba, DOORBELL_CLR_TOUT_US)) {
ret = -EBUSY;
up_write(&hba->clk_scaling_lock);
+ mutex_unlock(&hba->wb_mutex);
ufshcd_scsi_unblock_requests(hba);
goto out;
}
@@ -1251,12 +1253,16 @@ static int ufshcd_clock_scaling_prepare(struct ufs_hba *hba)
return ret;
}
-static void ufshcd_clock_scaling_unprepare(struct ufs_hba *hba, bool writelock)
+static void ufshcd_clock_scaling_unprepare(struct ufs_hba *hba, bool scale_up)
{
- if (writelock)
- up_write(&hba->clk_scaling_lock);
- else
- up_read(&hba->clk_scaling_lock);
+ up_write(&hba->clk_scaling_lock);
+
+ /* Enable Write Booster if we have scaled up else disable it */
+ if (ufshcd_enable_wb_if_scaling_up(hba))
+ ufshcd_wb_toggle(hba, scale_up);
+
+ mutex_unlock(&hba->wb_mutex);
+
ufshcd_scsi_unblock_requests(hba);
ufshcd_release(hba);
}
@@ -1273,7 +1279,6 @@ static void ufshcd_clock_scaling_unprepare(struct ufs_hba *hba, bool writelock)
static int ufshcd_devfreq_scale(struct ufs_hba *hba, bool scale_up)
{
int ret = 0;
- bool is_writelock = true;
ret = ufshcd_clock_scaling_prepare(hba);
if (ret)
@@ -1302,15 +1307,8 @@ static int ufshcd_devfreq_scale(struct ufs_hba *hba, bool scale_up)
}
}
- /* Enable Write Booster if we have scaled up else disable it */
- if (ufshcd_enable_wb_if_scaling_up(hba)) {
- downgrade_write(&hba->clk_scaling_lock);
- is_writelock = false;
- ufshcd_wb_toggle(hba, scale_up);
- }
-
out_unprepare:
- ufshcd_clock_scaling_unprepare(hba, is_writelock);
+ ufshcd_clock_scaling_unprepare(hba, scale_up);
return ret;
}
@@ -6066,9 +6064,11 @@ static void ufshcd_force_error_recovery(struct ufs_hba *hba)
static void ufshcd_clk_scaling_allow(struct ufs_hba *hba, bool allow)
{
+ mutex_lock(&hba->wb_mutex);
down_write(&hba->clk_scaling_lock);
hba->clk_scaling.is_allowed = allow;
up_write(&hba->clk_scaling_lock);
+ mutex_unlock(&hba->wb_mutex);
}
static void ufshcd_clk_scaling_suspend(struct ufs_hba *hba, bool suspend)
@@ -9793,6 +9793,7 @@ int ufshcd_init(struct ufs_hba *hba, void __iomem *mmio_base, unsigned int irq)
/* Initialize mutex for exception event control */
mutex_init(&hba->ee_ctrl_mutex);
+ mutex_init(&hba->wb_mutex);
init_rwsem(&hba->clk_scaling_lock);
ufshcd_init_clk_gating(hba);
diff --git a/include/ufs/ufshcd.h b/include/ufs/ufshcd.h
index 5cf81dff60aa..727084cd79be 100644
--- a/include/ufs/ufshcd.h
+++ b/include/ufs/ufshcd.h
@@ -808,6 +808,7 @@ struct ufs_hba_monitor {
* @urgent_bkops_lvl: keeps track of urgent bkops level for device
* @is_urgent_bkops_lvl_checked: keeps track if the urgent bkops level for
* device is known or not.
+ * @wb_mutex: used to serialize devfreq and sysfs write booster toggling
* @clk_scaling_lock: used to serialize device commands and clock scaling
* @desc_size: descriptor sizes reported by device
* @scsi_block_reqs_cnt: reference counting for scsi block requests
@@ -951,6 +952,7 @@ struct ufs_hba {
enum bkops_status urgent_bkops_lvl;
bool is_urgent_bkops_lvl_checked;
+ struct mutex wb_mutex;
struct rw_semaphore clk_scaling_lock;
unsigned char desc_size[QUERY_DESC_IDN_MAX];
atomic_t scsi_block_reqs_cnt;
--
2.37.4
After the doorbell is set, the DMA fetches the TRB. If, while dequeuing
a request, the driver changes a NORMAL TRB into a LINK TRB but doesn't
remove it from the controller's cache, then the controller will handle
the cached TRB and a packet can be lost.
The example scenario for this issue looks like:
1. queue request - set doorbell
2. dequeue request
3. host sends OUT data packet
4. device accepts this packet, which is unexpected
5. queue new request - set doorbell
6. device has lost the expected packet.
Setting DFLUSH makes the controller clear the DRDY bit and stop the DMA
transfer.
Fixes: 7733f6c32e36 ("usb: cdns3: Add Cadence USB3 DRD Driver")
cc: <stable(a)vger.kernel.org>
Signed-off-by: Pawel Laszczak <pawell(a)cadence.com>
---
drivers/usb/cdns3/cdns3-gadget.c | 12 ++++++++++++
1 file changed, 12 insertions(+)
diff --git a/drivers/usb/cdns3/cdns3-gadget.c b/drivers/usb/cdns3/cdns3-gadget.c
index 5adcb349718c..ccfaebca6faa 100644
--- a/drivers/usb/cdns3/cdns3-gadget.c
+++ b/drivers/usb/cdns3/cdns3-gadget.c
@@ -2614,6 +2614,7 @@ int cdns3_gadget_ep_dequeue(struct usb_ep *ep,
u8 req_on_hw_ring = 0;
unsigned long flags;
int ret = 0;
+ int val;
if (!ep || !request || !ep->desc)
return -EINVAL;
@@ -2649,6 +2650,13 @@ int cdns3_gadget_ep_dequeue(struct usb_ep *ep,
/* Update ring only if removed request is on pending_req_list list */
if (req_on_hw_ring && link_trb) {
+ /* Stop DMA */
+ writel(EP_CMD_DFLUSH, &priv_dev->regs->ep_cmd);
+
+ /* wait for DFLUSH cleared */
+ readl_poll_timeout_atomic(&priv_dev->regs->ep_cmd, val,
+ !(val & EP_CMD_DFLUSH), 1, 1000);
+
link_trb->buffer = cpu_to_le32(TRB_BUFFER(priv_ep->trb_pool_dma +
((priv_req->end_trb + 1) * TRB_SIZE)));
link_trb->control = cpu_to_le32((le32_to_cpu(link_trb->control) & TRB_CYCLE) |
@@ -2660,6 +2668,10 @@ int cdns3_gadget_ep_dequeue(struct usb_ep *ep,
cdns3_gadget_giveback(priv_ep, priv_req, -ECONNRESET);
+ req = cdns3_next_request(&priv_ep->pending_req_list);
+ if (req)
+ cdns3_rearm_transfer(priv_ep, 1);
+
not_found:
spin_unlock_irqrestore(&priv_dev->lock, flags);
return ret;
--
2.25.1
From: Indan Zupancic <Indan.Zupancic(a)mep-info.com>
[ Upstream commit 401fb66a355eb0f22096cf26864324f8e63c7d78 ]
If an irq is pending when devm_request_irq() is called, the irq
handler will cause a NULL pointer access because initialisation
is not done yet.
Fixes: 9d7ee0e28da59 ("tty: serial: lpuart: avoid report NULL interrupt")
Cc: stable <stable(a)vger.kernel.org>
Signed-off-by: Indan Zupancic <Indan.Zupancic(a)mep-info.com>
Link: https://lore.kernel.org/r/20220505114750.45423-1-Indan.Zupancic@mep-info.com
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
[5.10 did not have lpuart_global_reset or anything after
uart_add_one_port(), so add the remove call in cleanup manually]
Signed-off-by: Dominique Martinet <dominique.martinet(a)atmark-techno.com>
---
This was originally intended as a prerequisite for backporting the patch
submitted in [1] to 5.10, but even with that part of the patch gone it
makes sense as a fix on its own.
[1] https://lkml.kernel.org/r/20221222114414.1886632-1-linux@rasmusvillemoes.dk
drivers/tty/serial/fsl_lpuart.c | 18 ++++++++++--------
1 file changed, 10 insertions(+), 8 deletions(-)
diff --git a/drivers/tty/serial/fsl_lpuart.c b/drivers/tty/serial/fsl_lpuart.c
index 43aca5a2ef0f..223695947b65 100644
--- a/drivers/tty/serial/fsl_lpuart.c
+++ b/drivers/tty/serial/fsl_lpuart.c
@@ -2586,6 +2586,7 @@ static int lpuart_probe(struct platform_device *pdev)
struct device_node *np = pdev->dev.of_node;
struct lpuart_port *sport;
struct resource *res;
+ irq_handler_t handler;
int ret;
sport = devm_kzalloc(&pdev->dev, sizeof(*sport), GFP_KERNEL);
@@ -2658,17 +2659,12 @@ static int lpuart_probe(struct platform_device *pdev)
if (lpuart_is_32(sport)) {
lpuart_reg.cons = LPUART32_CONSOLE;
- ret = devm_request_irq(&pdev->dev, sport->port.irq, lpuart32_int, 0,
- DRIVER_NAME, sport);
+ handler = lpuart32_int;
} else {
lpuart_reg.cons = LPUART_CONSOLE;
- ret = devm_request_irq(&pdev->dev, sport->port.irq, lpuart_int, 0,
- DRIVER_NAME, sport);
+ handler = lpuart_int;
}
- if (ret)
- goto failed_irq_request;
-
ret = uart_get_rs485_mode(&sport->port);
if (ret)
goto failed_get_rs485;
@@ -2684,11 +2680,17 @@ static int lpuart_probe(struct platform_device *pdev)
if (ret)
goto failed_attach_port;
+ ret = devm_request_irq(&pdev->dev, sport->port.irq, handler, 0,
+ DRIVER_NAME, sport);
+ if (ret)
+ goto failed_irq_request;
+
return 0;
+failed_irq_request:
+ uart_remove_one_port(&lpuart_reg, &sport->port);
failed_get_rs485:
failed_attach_port:
-failed_irq_request:
lpuart_disable_clks(sport);
return ret;
}
--
2.35.1
When 7c7f9bc986e6 ("serial: Deassert Transmit Enable on probe in
driver-specific way") got backported to 5.15.y, where it is known as
b079d3775237, this hunk was accidentally left out. So if the "goto
failed_get_rs485;" is hit, the cleanup will do uart_remove_one_port()
despite uart_add_one_port() not having been called.
Add the missing hunk.
Fixes: b079d3775237 ("serial: Deassert Transmit Enable on probe in driver-specific way")
Signed-off-by: Rasmus Villemoes <linux(a)rasmusvillemoes.dk>
---
Not quite sure how to submit patches for a specific -stable series
only, or if the Fixes tag is appropriate and correct. Please let me
know if you'd have preferred anything different.
drivers/tty/serial/fsl_lpuart.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/tty/serial/fsl_lpuart.c b/drivers/tty/serial/fsl_lpuart.c
index 595430aedc0d..fc311df9f1c9 100644
--- a/drivers/tty/serial/fsl_lpuart.c
+++ b/drivers/tty/serial/fsl_lpuart.c
@@ -2784,9 +2784,9 @@ static int lpuart_probe(struct platform_device *pdev)
return 0;
failed_irq_request:
-failed_get_rs485:
uart_remove_one_port(&lpuart_reg, &sport->port);
failed_attach_port:
+failed_get_rs485:
failed_reset:
lpuart_disable_clks(sport);
return ret;
base-commit: fd6d66840b4269da4e90e1ea807ae3197433bc66
--
2.37.2
When creating a new monitoring group, the RMID allocated for it may have
been used by a group which was previously removed. In this case, the
hardware counters will have non-zero values which should be deducted
from what is reported in the new group's counts.
resctrl_arch_reset_rmid() initializes the prev_msr value for counters to
0, causing the initial count to be charged to the new group. Resurrect
__rmid_read() and use it to initialize prev_msr correctly.
Unlike before, __rmid_read() checks for error bits in the MSR read so
that callers don't need to.
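As a worked illustration of the problem being fixed (the numbers are
made up):

    /*
     * Suppose the recycled RMID's hardware counter reads 5000 chunks when
     * the new monitoring group is created.
     *
     * Before this patch:
     *   am->prev_msr = 0;    -> first report: 5000 - 0    = 5000,
     *                           a stale count charged to the new group.
     * After this patch:
     *   __rmid_read(rmid, eventid, &am->prev_msr);   [prev_msr = 5000]
     *                        -> first report: 5000 - 5000 = 0.
     */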
Fixes: 1d81d15db39c ("x86/resctrl: Move mbm_overflow_count() into resctrl_arch_rmid_read()")
Signed-off-by: Peter Newman <peternewman(a)google.com>
Reviewed-by: Reinette Chatre <reinette.chatre(a)intel.com>
Cc: stable(a)vger.kernel.org
---
v3:
- add changelog
- CC stable
v2:
- move error bit processing into __rmid_read()
v1: https://lore.kernel.org/lkml/20221207112924.3602960-1-peternewman@google.co…
v2: https://lore.kernel.org/lkml/20221214160856.2164207-1-peternewman@google.co…
---
arch/x86/kernel/cpu/resctrl/monitor.c | 49 ++++++++++++++++++---------
1 file changed, 33 insertions(+), 16 deletions(-)
diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c
index efe0c30d3a12..77538abeb72a 100644
--- a/arch/x86/kernel/cpu/resctrl/monitor.c
+++ b/arch/x86/kernel/cpu/resctrl/monitor.c
@@ -146,6 +146,30 @@ static inline struct rmid_entry *__rmid_entry(u32 rmid)
return entry;
}
+static int __rmid_read(u32 rmid, enum resctrl_event_id eventid, u64 *val)
+{
+ u64 msr_val;
+
+ /*
+ * As per the SDM, when IA32_QM_EVTSEL.EvtID (bits 7:0) is configured
+ * with a valid event code for supported resource type and the bits
+ * IA32_QM_EVTSEL.RMID (bits 41:32) are configured with valid RMID,
+ * IA32_QM_CTR.data (bits 61:0) reports the monitored data.
+ * IA32_QM_CTR.Error (bit 63) and IA32_QM_CTR.Unavailable (bit 62)
+ * are error bits.
+ */
+ wrmsr(MSR_IA32_QM_EVTSEL, eventid, rmid);
+ rdmsrl(MSR_IA32_QM_CTR, msr_val);
+
+ if (msr_val & RMID_VAL_ERROR)
+ return -EIO;
+ if (msr_val & RMID_VAL_UNAVAIL)
+ return -EINVAL;
+
+ *val = msr_val;
+ return 0;
+}
+
static struct arch_mbm_state *get_arch_mbm_state(struct rdt_hw_domain *hw_dom,
u32 rmid,
enum resctrl_event_id eventid)
@@ -172,8 +196,12 @@ void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_domain *d,
struct arch_mbm_state *am;
am = get_arch_mbm_state(hw_dom, rmid, eventid);
- if (am)
+ if (am) {
memset(am, 0, sizeof(*am));
+
+ /* Record any initial, non-zero count value. */
+ __rmid_read(rmid, eventid, &am->prev_msr);
+ }
}
static u64 mbm_overflow_count(u64 prev_msr, u64 cur_msr, unsigned int width)
@@ -191,25 +219,14 @@ int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_domain *d,
struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d);
struct arch_mbm_state *am;
u64 msr_val, chunks;
+ int ret;
if (!cpumask_test_cpu(smp_processor_id(), &d->cpu_mask))
return -EINVAL;
- /*
- * As per the SDM, when IA32_QM_EVTSEL.EvtID (bits 7:0) is configured
- * with a valid event code for supported resource type and the bits
- * IA32_QM_EVTSEL.RMID (bits 41:32) are configured with valid RMID,
- * IA32_QM_CTR.data (bits 61:0) reports the monitored data.
- * IA32_QM_CTR.Error (bit 63) and IA32_QM_CTR.Unavailable (bit 62)
- * are error bits.
- */
- wrmsr(MSR_IA32_QM_EVTSEL, eventid, rmid);
- rdmsrl(MSR_IA32_QM_CTR, msr_val);
-
- if (msr_val & RMID_VAL_ERROR)
- return -EIO;
- if (msr_val & RMID_VAL_UNAVAIL)
- return -EINVAL;
+ ret = __rmid_read(rmid, eventid, &msr_val);
+ if (ret)
+ return ret;
am = get_arch_mbm_state(hw_dom, rmid, eventid);
if (am) {
base-commit: 830b3c68c1fb1e9176028d02ef86f3cf76aa2476
--
2.39.0.314.g84b9a713c41-goog
From: Quentin Schulz <quentin.schulz(a)theobroma-systems.com>
clk_cifout is derived from clk_cifout_src through an integer divider
limited to 32. clk_cifout_src is a child of either cpll, gpll or npll
without any possibility of a divider of any sort. The default clock
parent is cpll.
Let's allow clk_cifout to ask its parent clk_cifout_src to reparent, in
order to find the closest possible rate for clk_cifout rather than one
derived from cpll only.
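For example (illustrative consumer code; the "cif_out" lookup name and
the 24 MHz rate are hypothetical), a capture driver can now trigger the
reparenting:

    struct clk *cifout = devm_clk_get(dev, "cif_out");

    /*
     * With CLK_SET_RATE_PARENT, the rate request propagates from the
     * clk_cifout divider up to clk_cifout_src, which may now select
     * cpll, gpll or npll instead of staying on the default cpll.
     */
    clk_set_rate(cifout, 24000000);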
Cc: stable(a)vger.kernel.org # 4.10+
Fixes: fd8bc829336a ("clk: rockchip: fix the rk3399 cifout clock")
Signed-off-by: Quentin Schulz <quentin.schulz(a)theobroma-systems.com>
---
clk: rockchip: rk3399: allow clk_cifout to force clk_cifout_src to reparent
This used to be correct before v4.10 but commit fd8bc829336a ("clk: rockchip:
fix the rk3399 cifout clock") incorrectly removed this ability while reworking
it.
Note: this has been tested on top of v6.0.2 only, but no changes have been
made to this driver since. As for older stable releases, the git context
seems identical and there does not seem to have been any logical change
introduced since v4.10, so it should be pretty safe to apply.
To: Michael Turquette <mturquette(a)baylibre.com>
To: Stephen Boyd <sboyd(a)kernel.org>
To: Heiko Stuebner <heiko(a)sntech.de>
To: Xing Zheng <zhengxing(a)rock-chips.com>
Cc: linux-clk(a)vger.kernel.org
Cc: linux-arm-kernel(a)lists.infradead.org
Cc: linux-rockchip(a)lists.infradead.org
Cc: linux-kernel(a)vger.kernel.org
---
drivers/clk/rockchip/clk-rk3399.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/clk/rockchip/clk-rk3399.c b/drivers/clk/rockchip/clk-rk3399.c
index 306910a3a0d38..9ebd6c451b3db 100644
--- a/drivers/clk/rockchip/clk-rk3399.c
+++ b/drivers/clk/rockchip/clk-rk3399.c
@@ -1263,7 +1263,7 @@ static struct rockchip_clk_branch rk3399_clk_branches[] __initdata = {
RK3399_CLKSEL_CON(56), 6, 2, MFLAGS,
RK3399_CLKGATE_CON(10), 7, GFLAGS),
- COMPOSITE_NOGATE(SCLK_CIF_OUT, "clk_cifout", mux_clk_cif_p, 0,
+ COMPOSITE_NOGATE(SCLK_CIF_OUT, "clk_cifout", mux_clk_cif_p, CLK_SET_RATE_PARENT,
RK3399_CLKSEL_CON(56), 5, 1, MFLAGS, 0, 5, DFLAGS),
/* gic */
---
base-commit: cc675d22e422442f6d230654a55a5fc5682ea018
change-id: 20221117-rk3399-cifout-set-rate-parent-1fbf0173ef2d
Best regards,
--
Quentin Schulz <quentin.schulz(a)theobroma-systems.com>
Since commit 07ec77a1d4e8 ("sched: Allow task CPU affinity to be
restricted on asymmetric systems"), the setting and clearing of
user_cpus_ptr are done under pi_lock for arm64 architecture. However,
dup_user_cpus_ptr() accesses user_cpus_ptr without any lock
protection. Since sched_setaffinity() can be invoked from another
process, the process being modified may be undergoing fork() at
the same time. When racing with the clearing of user_cpus_ptr in
__set_cpus_allowed_ptr_locked(), it can lead to use-after-free and
possibly double-free in the arm64 kernel.
Commit 8f9ea86fdf99 ("sched: Always preserve the user requested
cpumask") fixes this problem as user_cpus_ptr, once set, will never
be cleared in a task's lifetime. However, this bug was re-introduced
in commit 851a723e45d1 ("sched: Always clear user_cpus_ptr in
do_set_cpus_allowed()") which allows the clearing of user_cpus_ptr in
do_set_cpus_allowed(). This time, it will affect all arches.
Fix this bug by always clearing the user_cpus_ptr of the newly
cloned/forked task before the copying process starts, and by checking the
user_cpus_ptr state of the source task under pi_lock.
Note to stable: this patch won't be applicable to stable releases.
Just copy the new dup_user_cpus_ptr() function over.
Fixes: 07ec77a1d4e8 ("sched: Allow task CPU affinity to be restricted on asymmetric systems")
Fixes: 851a723e45d1 ("sched: Always clear user_cpus_ptr in do_set_cpus_allowed()")
CC: stable(a)vger.kernel.org
Reported-by: David Wang 王标 <wangbiao3(a)xiaomi.com>
Signed-off-by: Waiman Long <longman(a)redhat.com>
---
kernel/sched/core.c | 34 +++++++++++++++++++++++++++++-----
1 file changed, 29 insertions(+), 5 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 25b582b6ee5f..b93d030b9fd5 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2612,19 +2612,43 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
int dup_user_cpus_ptr(struct task_struct *dst, struct task_struct *src,
int node)
{
+ cpumask_t *user_mask;
unsigned long flags;
- if (!src->user_cpus_ptr)
+ /*
+ * Always clear dst->user_cpus_ptr first as their user_cpus_ptr's
+ * may differ by now due to racing.
+ */
+ dst->user_cpus_ptr = NULL;
+
+ /*
+ * This check is racy and losing the race is a valid situation.
+ * It is not worth the extra overhead of taking the pi_lock on
+ * every fork/clone.
+ */
+ if (data_race(!src->user_cpus_ptr))
return 0;
- dst->user_cpus_ptr = kmalloc_node(cpumask_size(), GFP_KERNEL, node);
- if (!dst->user_cpus_ptr)
+ user_mask = kmalloc_node(cpumask_size(), GFP_KERNEL, node);
+ if (!user_mask)
return -ENOMEM;
- /* Use pi_lock to protect content of user_cpus_ptr */
+ /*
+ * Use pi_lock to protect content of user_cpus_ptr
+ *
+ * Though unlikely, user_cpus_ptr can be reset to NULL by a concurrent
+ * do_set_cpus_allowed().
+ */
raw_spin_lock_irqsave(&src->pi_lock, flags);
- cpumask_copy(dst->user_cpus_ptr, src->user_cpus_ptr);
+ if (src->user_cpus_ptr) {
+ swap(dst->user_cpus_ptr, user_mask);
+ cpumask_copy(dst->user_cpus_ptr, src->user_cpus_ptr);
+ }
raw_spin_unlock_irqrestore(&src->pi_lock, flags);
+
+ if (unlikely(user_mask))
+ kfree(user_mask);
+
return 0;
}
--
2.31.1
From: Takashi Iwai <tiwai(a)suse.de>
commit 8423f0b6d513b259fdab9c9bf4aaa6188d054c2d upstream.
There is a small race window at snd_pcm_oss_sync() that is called from
OSS PCM SNDCTL_DSP_SYNC ioctl; namely the function calls
snd_pcm_oss_make_ready() at first, then takes the params_lock mutex
for the rest. When the stream is set up again by another thread
between them, it leads to inconsistency, and may result in unexpected
results such as NULL dereference of OSS buffer as a fuzzer spotted
recently.
The fix is simply to cover the snd_pcm_oss_make_ready() call with the same
params_lock mutex, using the snd_pcm_oss_make_ready_locked() variant.
Reported-and-tested-by: butt3rflyh4ck <butterflyhuangxx(a)gmail.com>
Reviewed-by: Jaroslav Kysela <perex(a)perex.cz>
Cc: <stable(a)vger.kernel.org>
Link: https://lore.kernel.org/r/CAFcO6XN7JDM4xSXGhtusQfS2mSBcx50VJKwQpCq=WeLt57aa…
Link: https://lore.kernel.org/r/20220905060714.22549-1-tiwai@suse.de
Signed-off-by: Takashi Iwai <tiwai(a)suse.de>
Signed-off-by: Zubin Mithra <zsm(a)google.com>
---
Note:
* 8423f0b6d513 is present in linux-5.15.y and linux-5.4.y; missing in
linux-5.10.y.
* Backport addresses conflict due to surrounding context.
* Tests run: build and boot.
sound/core/oss/pcm_oss.c | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/sound/core/oss/pcm_oss.c b/sound/core/oss/pcm_oss.c
index f88de74da1eb..de6f94bee50b 100644
--- a/sound/core/oss/pcm_oss.c
+++ b/sound/core/oss/pcm_oss.c
@@ -1662,13 +1662,14 @@ static int snd_pcm_oss_sync(struct snd_pcm_oss_file *pcm_oss_file)
runtime = substream->runtime;
if (atomic_read(&substream->mmap_count))
goto __direct;
- if ((err = snd_pcm_oss_make_ready(substream)) < 0)
- return err;
atomic_inc(&runtime->oss.rw_ref);
if (mutex_lock_interruptible(&runtime->oss.params_lock)) {
atomic_dec(&runtime->oss.rw_ref);
return -ERESTARTSYS;
}
+ err = snd_pcm_oss_make_ready_locked(substream);
+ if (err < 0)
+ goto unlock;
format = snd_pcm_oss_format_from(runtime->oss.format);
width = snd_pcm_format_physical_width(format);
if (runtime->oss.buffer_used > 0) {
--
2.38.0.rc1.362.ged0d419d3c-goog