When introducing support for irqchip in userspace we needed a way to
mask the timer signal to prevent the guest continuously exiting due to a
screaming timer.
We did this by disabling the corresponding percpu interrupt on the
host interrupt controller, because we cannot rely on the host system
having a GIC, and therefore cannot make any assumptions about having an
active state to hide the timer signal.
Unfortunately, when introducing this feature, it became entirely
possible that a VCPU which belongs to a VM that has a userspace irqchip
can disable the vtimer irq on the host on some physical CPU, and then go
away without ever enabling the vtimer irq on that physical CPU again.
This means that using irqchips in userspace on a system that also
supports running VMs with an in-kernel GIC can prevent forward progress
from in-kernel GIC VMs.
Later on, when we started taking virtual timer interrupts in the arch
timer code, we would also leave this timer state active for userspace
irqchip VMs, because we leave it up to a VGIC-enabled guest to
deactivate the hardware IRQ using the HW bit in the LR.
Both issues are solved by only using the enable/disable trick on systems
that do not have a host GIC which supports the active state, because all
VMs on such systems must use irqchips in userspace. Systems that have a
working GIC with support for an active state use the active state to
mask the timer signal for both userspace and in-kernel irqchips.
Cc: Alexander Graf <agraf(a)suse.de>
Cc: <stable(a)vger.kernel.org> # v4.12+
Fixes: d9e139778376 ("KVM: arm/arm64: Support arch timers with a userspace gic")
Signed-off-by: Christoffer Dall <christoffer.dall(a)linaro.org>
---
This conflicts horribly with everything when applied to either
kvmarm/queue or kvmarm/master. Therefore, this patch is written for
(and applies to) v4.15 with kvmarm/queue merged and should therefore
apply cleanly after v4.16-rc1. An example with this patch applied can
be found on kvmarm/temp-for-v4.16-rc2. I plan on sending this along
with any other potential fixes post v4.16-rc1.
Changes since v1:
- Added userspace_irqchip() wrapper to simplify logic
- Changed has_gic_active_state to a static key
- Fixed typos in commentary and commit message
- Clear the active state on sync for userspace irqchips on systems
with a working GIC.
- Get rid of __timer_snapshot_state() in unmask_vtimer_irq_user()
because kvm_timer_should_fire() has already been reworked in other
patches to look at the timer state when it's loaded onto the
hardware. As a result, we don't need the
__timer_snapshot_state() indirection anymore and this logic is
now inlined in vtimer_save_state().
virt/kvm/arm/arch_timer.c | 116 +++++++++++++++++++++++++---------------------
1 file changed, 64 insertions(+), 52 deletions(-)
diff --git a/virt/kvm/arm/arch_timer.c b/virt/kvm/arm/arch_timer.c
index 70268c0bec79..70f4c30918eb 100644
--- a/virt/kvm/arm/arch_timer.c
+++ b/virt/kvm/arm/arch_timer.c
@@ -36,6 +36,8 @@ static struct timecounter *timecounter;
static unsigned int host_vtimer_irq;
static u32 host_vtimer_irq_flags;
+static DEFINE_STATIC_KEY_FALSE(has_gic_active_state);
+
static const struct kvm_irq_level default_ptimer_irq = {
.irq = 30,
.level = 1,
@@ -56,6 +58,12 @@ u64 kvm_phys_timer_read(void)
return timecounter->cc->read(timecounter->cc);
}
+static inline bool userspace_irqchip(struct kvm *kvm)
+{
+ return static_branch_unlikely(&userspace_irqchip_in_use) &&
+ unlikely(!irqchip_in_kernel(kvm));
+}
+
static void soft_timer_start(struct hrtimer *hrt, u64 ns)
{
hrtimer_start(hrt, ktime_add_ns(ktime_get(), ns),
@@ -69,25 +77,6 @@ static void soft_timer_cancel(struct hrtimer *hrt, struct work_struct *work)
cancel_work_sync(work);
}
-static void kvm_vtimer_update_mask_user(struct kvm_vcpu *vcpu)
-{
- struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
-
- /*
- * When using a userspace irqchip with the architected timers, we must
- * prevent continuously exiting from the guest, and therefore mask the
- * physical interrupt by disabling it on the host interrupt controller
- * when the virtual level is high, such that the guest can make
- * forward progress. Once we detect the output level being
- * de-asserted, we unmask the interrupt again so that we exit from the
- * guest when the timer fires.
- */
- if (vtimer->irq.level)
- disable_percpu_irq(host_vtimer_irq);
- else
- enable_percpu_irq(host_vtimer_irq, 0);
-}
-
static irqreturn_t kvm_arch_timer_handler(int irq, void *dev_id)
{
struct kvm_vcpu *vcpu = *(struct kvm_vcpu **)dev_id;
@@ -106,9 +95,9 @@ static irqreturn_t kvm_arch_timer_handler(int irq, void *dev_id)
if (kvm_timer_should_fire(vtimer))
kvm_timer_update_irq(vcpu, true, vtimer);
- if (static_branch_unlikely(&userspace_irqchip_in_use) &&
- unlikely(!irqchip_in_kernel(vcpu->kvm)))
- kvm_vtimer_update_mask_user(vcpu);
+ if (userspace_irqchip(vcpu->kvm) &&
+ !static_branch_unlikely(&has_gic_active_state))
+ disable_percpu_irq(host_vtimer_irq);
return IRQ_HANDLED;
}
@@ -290,8 +279,7 @@ static void kvm_timer_update_irq(struct kvm_vcpu *vcpu, bool new_level,
trace_kvm_timer_update_irq(vcpu->vcpu_id, timer_ctx->irq.irq,
timer_ctx->irq.level);
- if (!static_branch_unlikely(&userspace_irqchip_in_use) ||
- likely(irqchip_in_kernel(vcpu->kvm))) {
+ if (!userspace_irqchip(vcpu->kvm)) {
ret = kvm_vgic_inject_irq(vcpu->kvm, vcpu->vcpu_id,
timer_ctx->irq.irq,
timer_ctx->irq.level,
@@ -350,12 +338,6 @@ static void kvm_timer_update_state(struct kvm_vcpu *vcpu)
phys_timer_emulate(vcpu);
}
-static void __timer_snapshot_state(struct arch_timer_context *timer)
-{
- timer->cnt_ctl = read_sysreg_el0(cntv_ctl);
- timer->cnt_cval = read_sysreg_el0(cntv_cval);
-}
-
static void vtimer_save_state(struct kvm_vcpu *vcpu)
{
struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
@@ -367,8 +349,10 @@ static void vtimer_save_state(struct kvm_vcpu *vcpu)
if (!vtimer->loaded)
goto out;
- if (timer->enabled)
- __timer_snapshot_state(vtimer);
+ if (timer->enabled) {
+ vtimer->cnt_ctl = read_sysreg_el0(cntv_ctl);
+ vtimer->cnt_cval = read_sysreg_el0(cntv_cval);
+ }
/* Disable the virtual timer */
write_sysreg_el0(0, cntv_ctl);
@@ -460,23 +444,43 @@ static void set_cntvoff(u64 cntvoff)
kvm_call_hyp(__kvm_timer_set_cntvoff, low, high);
}
-static void kvm_timer_vcpu_load_vgic(struct kvm_vcpu *vcpu)
+static inline void set_vtimer_irq_phys_active(struct kvm_vcpu *vcpu, bool active)
+{
+ int r;
+ r = irq_set_irqchip_state(host_vtimer_irq, IRQCHIP_STATE_ACTIVE, active);
+ WARN_ON(r);
+}
+
+static void kvm_timer_vcpu_load_gic(struct kvm_vcpu *vcpu)
{
struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
bool phys_active;
- int ret;
- phys_active = kvm_vgic_map_is_active(vcpu, vtimer->irq.irq);
-
- ret = irq_set_irqchip_state(host_vtimer_irq,
- IRQCHIP_STATE_ACTIVE,
- phys_active);
- WARN_ON(ret);
+ if (irqchip_in_kernel(vcpu->kvm))
+ phys_active = kvm_vgic_map_is_active(vcpu, vtimer->irq.irq);
+ else
+ phys_active = vtimer->irq.level;
+ set_vtimer_irq_phys_active(vcpu, phys_active);
}
-static void kvm_timer_vcpu_load_user(struct kvm_vcpu *vcpu)
+static void kvm_timer_vcpu_load_nogic(struct kvm_vcpu *vcpu)
{
- kvm_vtimer_update_mask_user(vcpu);
+ struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
+
+ /*
+ * When using a userspace irqchip with the architected timers and a
+ * host interrupt controller that doesn't support an active state, we
+ * must still prevent continuously exiting from the guest, and
+ * therefore mask the physical interrupt by disabling it on the host
+ * interrupt controller when the virtual level is high, such that the
+ * guest can make forward progress. Once we detect the output level
+ * being de-asserted, we unmask the interrupt again so that we exit
+ * from the guest when the timer fires.
+ */
+ if (vtimer->irq.level)
+ disable_percpu_irq(host_vtimer_irq);
+ else
+ enable_percpu_irq(host_vtimer_irq, host_vtimer_irq_flags);
}
void kvm_timer_vcpu_load(struct kvm_vcpu *vcpu)
@@ -487,10 +491,10 @@ void kvm_timer_vcpu_load(struct kvm_vcpu *vcpu)
if (unlikely(!timer->enabled))
return;
- if (unlikely(!irqchip_in_kernel(vcpu->kvm)))
- kvm_timer_vcpu_load_user(vcpu);
+ if (static_branch_likely(&has_gic_active_state))
+ kvm_timer_vcpu_load_gic(vcpu);
else
- kvm_timer_vcpu_load_vgic(vcpu);
+ kvm_timer_vcpu_load_nogic(vcpu);
set_cntvoff(vtimer->cntvoff);
@@ -555,18 +559,24 @@ static void unmask_vtimer_irq_user(struct kvm_vcpu *vcpu)
{
struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
- if (unlikely(!irqchip_in_kernel(vcpu->kvm))) {
- __timer_snapshot_state(vtimer);
- if (!kvm_timer_should_fire(vtimer)) {
- kvm_timer_update_irq(vcpu, false, vtimer);
- kvm_vtimer_update_mask_user(vcpu);
- }
+ if (!kvm_timer_should_fire(vtimer)) {
+ kvm_timer_update_irq(vcpu, false, vtimer);
+ if (static_branch_likely(&has_gic_active_state))
+ set_vtimer_irq_phys_active(vcpu, false);
+ else
+ enable_percpu_irq(host_vtimer_irq, host_vtimer_irq_flags);
}
}
void kvm_timer_sync_hwstate(struct kvm_vcpu *vcpu)
{
- unmask_vtimer_irq_user(vcpu);
+ struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
+
+ if (unlikely(!timer->enabled))
+ return;
+
+ if (unlikely(!irqchip_in_kernel(vcpu->kvm)))
+ unmask_vtimer_irq_user(vcpu);
}
int kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu)
@@ -753,6 +763,8 @@ int kvm_timer_hyp_init(bool has_gic)
kvm_err("kvm_arch_timer: error setting vcpu affinity\n");
goto out_free_irq;
}
+
+ static_branch_enable(&has_gic_active_state);
}
kvm_info("virtual timer IRQ%d\n", host_vtimer_irq);
--
2.14.2
This is a note to let you know that I've just added the patch titled
um: Stop abusing __KERNEL__
to the 3.18-stable tree which can be found at:
http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=sum…
The filename of the patch is:
um-stop-abusing-__kernel__.patch
and it can be found in the queue-3.18 subdirectory.
If you, or anyone else, feels it should not be added to the stable tree,
please let <stable(a)vger.kernel.org> know about it.
>From 298e20ba8c197e8d429a6c8671550c41c7919033 Mon Sep 17 00:00:00 2001
From: Richard Weinberger <richard(a)nod.at>
Date: Sun, 31 May 2015 19:50:57 +0200
Subject: um: Stop abusing __KERNEL__
From: Richard Weinberger <richard(a)nod.at>
commit 298e20ba8c197e8d429a6c8671550c41c7919033 upstream.
Currently UML is abusing __KERNEL__ to distinguish between
kernel and host code (os-Linux). It is better to use a custom
define such that existing users of __KERNEL__ don't get confused.
Signed-off-by: Richard Weinberger <richard(a)nod.at>
Cc: Greg Hackmann <ghackmann(a)google.com>
Cc: Bernie Innocenti <codewiz(a)google.com>
Cc: Lorenzo Colitti <lorenzo(a)google.com>
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
---
arch/um/Makefile | 7 ++++---
arch/um/drivers/mconsole.h | 2 +-
arch/um/include/shared/init.h | 4 ++--
arch/um/include/shared/user.h | 2 +-
arch/x86/um/shared/sysdep/tls.h | 6 +++---
5 files changed, 11 insertions(+), 10 deletions(-)
--- a/arch/um/Makefile
+++ b/arch/um/Makefile
@@ -68,9 +68,10 @@ KBUILD_CFLAGS += $(CFLAGS) $(CFLAGS-y) -
KBUILD_AFLAGS += $(ARCH_INCLUDE)
-USER_CFLAGS = $(patsubst $(KERNEL_DEFINES),,$(patsubst -D__KERNEL__,,\
- $(patsubst -I%,,$(KBUILD_CFLAGS)))) $(ARCH_INCLUDE) $(MODE_INCLUDE) \
- $(filter -I%,$(CFLAGS)) -D_FILE_OFFSET_BITS=64 -idirafter include
+USER_CFLAGS = $(patsubst $(KERNEL_DEFINES),,$(patsubst -I%,,$(KBUILD_CFLAGS))) \
+ $(ARCH_INCLUDE) $(MODE_INCLUDE) $(filter -I%,$(CFLAGS)) \
+ -D_FILE_OFFSET_BITS=64 -idirafter include \
+ -D__KERNEL__ -D__UM_HOST__
#This will adjust *FLAGS accordingly to the platform.
include $(srctree)/$(ARCH_DIR)/Makefile-os-$(OS)
--- a/arch/um/drivers/mconsole.h
+++ b/arch/um/drivers/mconsole.h
@@ -7,7 +7,7 @@
#ifndef __MCONSOLE_H__
#define __MCONSOLE_H__
-#ifndef __KERNEL__
+#ifdef __UM_HOST__
#include <stdint.h>
#define u32 uint32_t
#endif
--- a/arch/um/include/shared/init.h
+++ b/arch/um/include/shared/init.h
@@ -40,7 +40,7 @@
typedef int (*initcall_t)(void);
typedef void (*exitcall_t)(void);
-#ifndef __KERNEL__
+#ifdef __UM_HOST__
#ifndef __section
# define __section(S) __attribute__ ((__section__(#S)))
#endif
@@ -131,7 +131,7 @@ extern struct uml_param __uml_setup_star
#define __uml_postsetup_call __used __section(.uml.postsetup.init)
#define __uml_exit_call __used __section(.uml.exitcall.exit)
-#ifndef __KERNEL__
+#ifdef __UM_HOST__
#define __define_initcall(level,fn) \
static initcall_t __initcall_##fn __used \
--- a/arch/um/include/shared/user.h
+++ b/arch/um/include/shared/user.h
@@ -17,7 +17,7 @@
#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
/* This is to get size_t */
-#ifdef __KERNEL__
+#ifndef __UM_HOST__
#include <linux/types.h>
#else
#include <stddef.h>
--- a/arch/x86/um/shared/sysdep/tls.h
+++ b/arch/x86/um/shared/sysdep/tls.h
@@ -1,7 +1,7 @@
#ifndef _SYSDEP_TLS_H
#define _SYSDEP_TLS_H
-# ifndef __KERNEL__
+#ifdef __UM_HOST__
/* Change name to avoid conflicts with the original one from <asm/ldt.h>, which
* may be named user_desc (but in 2.4 and in header matching its API was named
@@ -22,11 +22,11 @@ typedef struct um_dup_user_desc {
#endif
} user_desc_t;
-# else /* __KERNEL__ */
+#else /* __UM_HOST__ */
typedef struct user_desc user_desc_t;
-# endif /* __KERNEL__ */
+#endif /* __UM_HOST__ */
extern int os_set_thread_area(user_desc_t *info, int pid);
extern int os_get_thread_area(user_desc_t *info, int pid);
Patches currently in stable-queue which might be from richard(a)nod.at are
queue-3.18/um-stop-abusing-__kernel__.patch
queue-3.18/um-link-vmlinux-with-no-pie.patch
queue-3.18/um-remove-copy-paste-code-from-init.h.patch
This is a note to let you know that I've just added the patch titled
um: Remove copy&paste code from init.h
to the 3.18-stable tree which can be found at:
http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=sum…
The filename of the patch is:
um-remove-copy-paste-code-from-init.h.patch
and it can be found in the queue-3.18 subdirectory.
If you, or anyone else, feels it should not be added to the stable tree,
please let <stable(a)vger.kernel.org> know about it.
>From 30b11ee9ae23d78de66b9ae315880af17a64ba83 Mon Sep 17 00:00:00 2001
From: Richard Weinberger <richard(a)nod.at>
Date: Sun, 31 May 2015 22:15:58 +0200
Subject: um: Remove copy&paste code from init.h
From: Richard Weinberger <richard(a)nod.at>
commit 30b11ee9ae23d78de66b9ae315880af17a64ba83 upstream.
As we got rid of the __KERNEL__ abuse, we can directly
include linux/compiler.h now.
This also allows gcc 5 to build UML.
Reported-by: Hans-Werner Hilse <hwhilse(a)gmail.com>
Signed-off-by: Richard Weinberger <richard(a)nod.at>
Cc: Greg Hackmann <ghackmann(a)google.com>
Cc: Bernie Innocenti <codewiz(a)google.com>
Cc: Lorenzo Colitti <lorenzo(a)google.com>
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
---
arch/um/include/shared/init.h | 22 +---------------------
1 file changed, 1 insertion(+), 21 deletions(-)
--- a/arch/um/include/shared/init.h
+++ b/arch/um/include/shared/init.h
@@ -40,28 +40,8 @@
typedef int (*initcall_t)(void);
typedef void (*exitcall_t)(void);
-#ifdef __UM_HOST__
-#ifndef __section
-# define __section(S) __attribute__ ((__section__(#S)))
-#endif
-
-#if __GNUC__ == 3
-
-#if __GNUC_MINOR__ >= 3
-# define __used __attribute__((__used__))
-#else
-# define __used __attribute__((__unused__))
-#endif
-
-#else
-#if __GNUC__ == 4
-# define __used __attribute__((__used__))
-#endif
-#endif
-
-#else
#include <linux/compiler.h>
-#endif
+
/* These are for everybody (although not all archs will actually
discard it in modules) */
#define __init __section(.init.text)
Patches currently in stable-queue which might be from richard(a)nod.at are
queue-3.18/um-stop-abusing-__kernel__.patch
queue-3.18/um-link-vmlinux-with-no-pie.patch
queue-3.18/um-remove-copy-paste-code-from-init.h.patch
This is a note to let you know that I've just added the patch titled
um: link vmlinux with -no-pie
to the 3.18-stable tree which can be found at:
http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=sum…
The filename of the patch is:
um-link-vmlinux-with-no-pie.patch
and it can be found in the queue-3.18 subdirectory.
If you, or anyone else, feels it should not be added to the stable tree,
please let <stable(a)vger.kernel.org> know about it.
>From 883354afbc109c57f925ccc19840055193da0cc0 Mon Sep 17 00:00:00 2001
From: Thomas Meyer <thomas(a)m3y3r.de>
Date: Sun, 20 Aug 2017 13:26:04 +0200
Subject: um: link vmlinux with -no-pie
From: Thomas Meyer <thomas(a)m3y3r.de>
commit 883354afbc109c57f925ccc19840055193da0cc0 upstream.
Debian's gcc defaults to pie. The global Makefile already defines the -fno-pie option.
Link UML dynamic kernel image also with -no-pie to fix the build.
Signed-off-by: Thomas Meyer <thomas(a)m3y3r.de>
Signed-off-by: Richard Weinberger <richard(a)nod.at>
Cc: Bernie Innocenti <codewiz(a)google.com>
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
---
arch/um/Makefile | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
--- a/arch/um/Makefile
+++ b/arch/um/Makefile
@@ -116,7 +116,7 @@ archheaders:
archprepare: include/generated/user_constants.h
LINK-$(CONFIG_LD_SCRIPT_STATIC) += -static
-LINK-$(CONFIG_LD_SCRIPT_DYN) += -Wl,-rpath,/lib
+LINK-$(CONFIG_LD_SCRIPT_DYN) += -Wl,-rpath,/lib $(call cc-option, -no-pie)
CFLAGS_NO_HARDENING := $(call cc-option, -fno-PIC,) $(call cc-option, -fno-pic,) \
$(call cc-option, -fno-stack-protector,) \
Patches currently in stable-queue which might be from thomas(a)m3y3r.de are
queue-3.18/um-link-vmlinux-with-no-pie.patch
This is a note to let you know that I've just added the patch titled
Input: do not emit unneeded EV_SYN when suspending
to the 3.18-stable tree which can be found at:
http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=sum…
The filename of the patch is:
input-do-not-emit-unneeded-ev_syn-when-suspending.patch
and it can be found in the queue-3.18 subdirectory.
If you, or anyone else, feels it should not be added to the stable tree,
please let <stable(a)vger.kernel.org> know about it.
>From 00159f19a5057cb779146afce1cceede692af346 Mon Sep 17 00:00:00 2001
From: Dmitry Torokhov <dmitry.torokhov(a)gmail.com>
Date: Thu, 6 Aug 2015 19:15:30 -0700
Subject: Input: do not emit unneeded EV_SYN when suspending
From: Dmitry Torokhov <dmitry.torokhov(a)gmail.com>
commit 00159f19a5057cb779146afce1cceede692af346 upstream.
Do not emit EV_SYN/SYN_REPORT on suspend if there were no keys that are
still pressed as we are suspending the device (and in all other cases when
input core is forcibly releasing keys via input_dev_release_keys() call).
Reviewed-by: Benson Leung <bleung(a)chromium.org>
Signed-off-by: Dmitry Torokhov <dmitry.torokhov(a)gmail.com>
Signed-off-by: Bo Hu <bohu(a)google.com>
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
diff --git a/drivers/input/input.c b/drivers/input/input.c
index 78d24990a816..5391abd28b27 100644
--- a/drivers/input/input.c
+++ b/drivers/input/input.c
@@ -674,13 +674,19 @@ EXPORT_SYMBOL(input_close_device);
*/
static void input_dev_release_keys(struct input_dev *dev)
{
+ bool need_sync = false;
int code;
if (is_event_supported(EV_KEY, dev->evbit, EV_MAX)) {
- for_each_set_bit(code, dev->key, KEY_CNT)
+ for_each_set_bit(code, dev->key, KEY_CNT) {
input_pass_event(dev, EV_KEY, code, 0);
+ need_sync = true;
+ }
+
+ if (need_sync)
+ input_pass_event(dev, EV_SYN, SYN_REPORT, 1);
+
memset(dev->key, 0, sizeof(dev->key));
- input_pass_event(dev, EV_SYN, SYN_REPORT, 1);
}
}
Patches currently in stable-queue which might be from dmitry.torokhov(a)gmail.com are
queue-3.18/input-do-not-emit-unneeded-ev_syn-when-suspending.patch
Hi Marcin,
Since it's been a week, could you confirm the patch is ok as-is or do
you think some comment(s) from James should be incorporated ?
On Tue, Jan 23, 2018 at 3:17 PM, James Hogan <jhogan(a)kernel.org> wrote:
> On Thu, Dec 21, 2017 at 10:00:59PM +0100, Mathieu Malaterre wrote:
>> From: Marcin Nowakowski <marcin.nowakowski(a)mips.com>
>>
>> Change 73fbc1eba7ff added a fix to ensure that the memory range between
>
> Please refer to commits with e.g. commit 73fbc1eba7ff ("MIPS: fix
> mem=X@Y commandline processing").
>
>> PHYS_OFFSET and low memory address specified by mem= cmdline argument is
>> not later processed by free_all_bootmem.
>> This change was incorrect for systems where the commandline specifies
>> more than 1 mem argument, as it will cause all memory between
>> PHYS_OFFSET and each of the memory offsets to be marked as reserved,
>> which results in parts of the RAM marked as reserved (Creator CI20's
>> u-boot has a default commandline argument 'mem=256M@0x0
>> mem=768M@0x30000000').
>>
>> Change the behaviour to ensure that only the range between PHYS_OFFSET
>> and the lowest start address of the memories is marked as protected.
>>
>> This change also ensures that the range is marked protected even if it's
>> only defined through the devicetree and not only via commandline
>> arguments.
>>
>> Reported-by: Mathieu Malaterre <mathieu.malaterre(a)gmail.com>
>> Signed-off-by: Marcin Nowakowski <marcin.nowakowski(a)mips.com>
>> Fixes: 73fbc1eba7ff ("MIPS: fix mem=X@Y commandline processing")
>> Cc: <stable(a)vger.kernel.org> # v4.11
>
> I'm guessing that should technically be v4.11+
My fault, if this is the only change, I can re-submit.
>> ---
>> v2: Use updated email address, add tag for stable.
>> arch/mips/kernel/setup.c | 19 ++++++++++++++++---
>> 1 file changed, 16 insertions(+), 3 deletions(-)
>>
>> diff --git a/arch/mips/kernel/setup.c b/arch/mips/kernel/setup.c
>> index 702c678de116..f19d61224c71 100644
>> --- a/arch/mips/kernel/setup.c
>> +++ b/arch/mips/kernel/setup.c
>> @@ -375,6 +375,7 @@ static void __init bootmem_init(void)
>> unsigned long reserved_end;
>> unsigned long mapstart = ~0UL;
>> unsigned long bootmap_size;
>> + phys_addr_t ramstart = ~0UL;
>
> Although practically it might not matter, technically phys_addr_t may be
> 64-bits (CONFIG_PHYS_ADDR_T_64BIT) even on 32-bit kernels, in which
> case ~0UL may not be sufficiently large.
>
> Maybe that should be ~(phys_addr_t)0, or perhaps (phys_addr_t)ULLONG_MAX
> to match add_memory_region().
>
>> bool bootmap_valid = false;
>> int i;
>>
>> @@ -395,6 +396,21 @@ static void __init bootmem_init(void)
>> max_low_pfn = 0;
>>
>> /*
>> + * Reserve any memory between the start of RAM and PHYS_OFFSET
>> + */
>> + for (i = 0; i < boot_mem_map.nr_map; i++) {
>> + if (boot_mem_map.map[i].type != BOOT_MEM_RAM)
>> + continue;
>> +
>> + ramstart = min(ramstart, boot_mem_map.map[i].addr);
>
> Is it worth incorporating this into the existing loop below ...
>
>> + }
>> +
>> + if (ramstart > PHYS_OFFSET)
>> + add_memory_region(PHYS_OFFSET, ramstart - PHYS_OFFSET,
>> + BOOT_MEM_RESERVED);
>
> ... and this then placed below that loop?
>
> Otherwise I can't find fault with this patch, though I'm not intimately
> familiar with bootmem.
>
> Cheers
> James
>
>> +
>> +
>> + /*
>> * Find the highest page frame number we have available.
>> */
>> for (i = 0; i < boot_mem_map.nr_map; i++) {
>> @@ -664,9 +680,6 @@ static int __init early_parse_mem(char *p)
>>
>> add_memory_region(start, size, BOOT_MEM_RAM);
>>
>> - if (start && start > PHYS_OFFSET)
>> - add_memory_region(PHYS_OFFSET, start - PHYS_OFFSET,
>> - BOOT_MEM_RESERVED);
>> return 0;
>> }
>> early_param("mem", early_parse_mem);
>> --
>> 2.11.0
>>
When introducing support for irqchip in userspace we needed a way to
mask the timer signal to prevent the guest continuously exiting due to a
screaming timer.
We did this by disabling the corresponding percpu interrupt on the
host interrupt controller, because we cannot rely on the host system
having a GIC, and therefore cannot make any assumptions about having an
active state to hide the timer signal.
Unfortunately, when introducing this feature, it became entirely
possible that a VCPU which belongs to a VM that has a userspace irqchip
can disable the vtimer irq on the host on some physical CPU, and then go
away without ever enabling the vtimer irq on that physical CPU again.
This means that using irqchips in userspace on a system that also
supports running VMs with an in-kernel GIC can prevent forward progress
from in-kernel GIC VMs.
Later on, when we started taking virtual timer interrupts in the arch
timer code, we would also leave this timer state active for userspace
irqchip VMs, because we leave it up to a VGIC-enabled guest to
deactivate the hardware IRQ using the HW bit in the LR.
Both issues are solved by only using the enable/disable trick on systems
that do not have a host GIC which supports the active state, because all
VMs on such systems must use irqchips in userspace. Systems that have a
working GIC with support for an active state use the active state to
mask the timer signal for both userspace and in-kernel irqchips.
Cc: Alexander Graf <agraf(a)suse.de>
Cc: <stable(a)vger.kernel.org> # v4.12+
Fixes: d9e139778376 ("KVM: arm/arm64: Support arch timers with a userspace gic")
Signed-off-by: Christoffer Dall <christoffer.dall(a)linaro.org>
---
This conflicts horribly with everything when applied to either
kvmarm/queue or kvmarm/master. Therefore, this patch is written for
(and applies to) v4.15 with kvmarm/queue merged and should therefore
apply cleanly after v4.16-rc1. An example with this patch applied can
be found on kvmarm/temp-for-v4.16-rc2. I plan on sending this along
with any other potential fixes post v4.16-rc1.
virt/kvm/arm/arch_timer.c | 77 ++++++++++++++++++++++++++---------------------
1 file changed, 42 insertions(+), 35 deletions(-)
diff --git a/virt/kvm/arm/arch_timer.c b/virt/kvm/arm/arch_timer.c
index 70268c0bec79..228906ceb722 100644
--- a/virt/kvm/arm/arch_timer.c
+++ b/virt/kvm/arm/arch_timer.c
@@ -35,6 +35,7 @@
static struct timecounter *timecounter;
static unsigned int host_vtimer_irq;
static u32 host_vtimer_irq_flags;
+static bool has_gic_active_state;
static const struct kvm_irq_level default_ptimer_irq = {
.irq = 30,
@@ -69,25 +70,6 @@ static void soft_timer_cancel(struct hrtimer *hrt, struct work_struct *work)
cancel_work_sync(work);
}
-static void kvm_vtimer_update_mask_user(struct kvm_vcpu *vcpu)
-{
- struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
-
- /*
- * When using a userspace irqchip with the architected timers, we must
- * prevent continuously exiting from the guest, and therefore mask the
- * physical interrupt by disabling it on the host interrupt controller
- * when the virtual level is high, such that the guest can make
- * forward progress. Once we detect the output level being
- * de-asserted, we unmask the interrupt again so that we exit from the
- * guest when the timer fires.
- */
- if (vtimer->irq.level)
- disable_percpu_irq(host_vtimer_irq);
- else
- enable_percpu_irq(host_vtimer_irq, 0);
-}
-
static irqreturn_t kvm_arch_timer_handler(int irq, void *dev_id)
{
struct kvm_vcpu *vcpu = *(struct kvm_vcpu **)dev_id;
@@ -107,8 +89,8 @@ static irqreturn_t kvm_arch_timer_handler(int irq, void *dev_id)
kvm_timer_update_irq(vcpu, true, vtimer);
if (static_branch_unlikely(&userspace_irqchip_in_use) &&
- unlikely(!irqchip_in_kernel(vcpu->kvm)))
- kvm_vtimer_update_mask_user(vcpu);
+ unlikely(!irqchip_in_kernel(vcpu->kvm)) && !has_gic_active_state)
+ disable_percpu_irq(host_vtimer_irq);
return IRQ_HANDLED;
}
@@ -460,13 +442,16 @@ static void set_cntvoff(u64 cntvoff)
kvm_call_hyp(__kvm_timer_set_cntvoff, low, high);
}
-static void kvm_timer_vcpu_load_vgic(struct kvm_vcpu *vcpu)
+static void kvm_timer_vcpu_load_gic(struct kvm_vcpu *vcpu)
{
struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
bool phys_active;
int ret;
- phys_active = kvm_vgic_map_is_active(vcpu, vtimer->irq.irq);
+ if (irqchip_in_kernel(vcpu->kvm))
+ phys_active = kvm_vgic_map_is_active(vcpu, vtimer->irq.irq);
+ else
+ phys_active = vtimer->irq.level;
ret = irq_set_irqchip_state(host_vtimer_irq,
IRQCHIP_STATE_ACTIVE,
@@ -474,9 +459,24 @@ static void kvm_timer_vcpu_load_vgic(struct kvm_vcpu *vcpu)
WARN_ON(ret);
}
-static void kvm_timer_vcpu_load_user(struct kvm_vcpu *vcpu)
+static void kvm_timer_vcpu_load_nogic(struct kvm_vcpu *vcpu)
{
- kvm_vtimer_update_mask_user(vcpu);
+ struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
+
+ /*
+ * When using a userspace irqchip with the architected timers and a
+ * host interrupt controller that doesn't support an active state, we
+ * must still prevent continuously exiting from the guest, and
+ * therefore mask the physical interrupt by disabling it on the host
+ * interrupt controller when the virtual level is high, such that the
+ * guest can make forward progress. Once we detect the output level
+ * being de-asserted, we unmask the interrupt again so that we exit
+ * from the guest when the timer fires.
+ */
+ if (vtimer->irq.level)
+ disable_percpu_irq(host_vtimer_irq);
+ else
+ enable_percpu_irq(host_vtimer_irq, host_vtimer_irq_flags);
}
void kvm_timer_vcpu_load(struct kvm_vcpu *vcpu)
@@ -487,10 +487,10 @@ void kvm_timer_vcpu_load(struct kvm_vcpu *vcpu)
if (unlikely(!timer->enabled))
return;
- if (unlikely(!irqchip_in_kernel(vcpu->kvm)))
- kvm_timer_vcpu_load_user(vcpu);
+ if (has_gic_active_state)
+ kvm_timer_vcpu_load_gic(vcpu);
else
- kvm_timer_vcpu_load_vgic(vcpu);
+ kvm_timer_vcpu_load_nogic(vcpu);
set_cntvoff(vtimer->cntvoff);
@@ -555,18 +555,23 @@ static void unmask_vtimer_irq_user(struct kvm_vcpu *vcpu)
{
struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
- if (unlikely(!irqchip_in_kernel(vcpu->kvm))) {
- __timer_snapshot_state(vtimer);
- if (!kvm_timer_should_fire(vtimer)) {
- kvm_timer_update_irq(vcpu, false, vtimer);
- kvm_vtimer_update_mask_user(vcpu);
- }
+ __timer_snapshot_state(vtimer);
+ if (!kvm_timer_should_fire(vtimer)) {
+ kvm_timer_update_irq(vcpu, false, vtimer);
+ if (!has_gic_active_state)
+ enable_percpu_irq(host_vtimer_irq, host_vtimer_irq_flags);
}
}
void kvm_timer_sync_hwstate(struct kvm_vcpu *vcpu)
{
- unmask_vtimer_irq_user(vcpu);
+ struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
+
+ if (unlikely(!timer->enabled))
+ return;
+
+ if (unlikely(!irqchip_in_kernel(vcpu->kvm)))
+ unmask_vtimer_irq_user(vcpu);
}
int kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu)
@@ -753,6 +758,8 @@ int kvm_timer_hyp_init(bool has_gic)
kvm_err("kvm_arch_timer: error setting vcpu affinity\n");
goto out_free_irq;
}
+
+ has_gic_active_state = true;
}
kvm_info("virtual timer IRQ%d\n", host_vtimer_irq);
--
2.14.2
In the past the cirrus driver relied upon the fbdev emulation helpers to
call ->load_lut at boot-up. But since
commit b8e2b0199cc377617dc238f5106352c06dcd3fa2
Author: Peter Rosin <peda(a)axentia.se>
Date: Tue Jul 4 12:36:57 2017 +0200
drm/fb-helper: factor out pseudo-palette
that's cleaned up and drivers are expected to boot into a consistent
lut state. This patch fixes that.
Fixes: b8e2b0199cc3 ("drm/fb-helper: factor out pseudo-palette")
Cc: Peter Rosin <peda(a)axentia.se>
Cc: Daniel Vetter <daniel.vetter(a)ffwll.ch>
Cc: <stable(a)vger.kernel.org> # v4.14+
References: https://bugzilla.kernel.org/show_bug.cgi?id=198123
Signed-off-by: Daniel Vetter <daniel.vetter(a)intel.com>
---
drivers/gpu/drm/cirrus/cirrus_mode.c | 40 +++++++++++++++++++++---------------
1 file changed, 23 insertions(+), 17 deletions(-)
diff --git a/drivers/gpu/drm/cirrus/cirrus_mode.c b/drivers/gpu/drm/cirrus/cirrus_mode.c
index cd23b1b28259..c91b9b054e3f 100644
--- a/drivers/gpu/drm/cirrus/cirrus_mode.c
+++ b/drivers/gpu/drm/cirrus/cirrus_mode.c
@@ -294,22 +294,7 @@ static void cirrus_crtc_prepare(struct drm_crtc *crtc)
{
}
-/*
- * This is called after a mode is programmed. It should reverse anything done
- * by the prepare function
- */
-static void cirrus_crtc_commit(struct drm_crtc *crtc)
-{
-}
-
-/*
- * The core can pass us a set of gamma values to program. We actually only
- * use this for 8-bit mode so can't perform smooth fades on deeper modes,
- * but it's a requirement that we provide the function
- */
-static int cirrus_crtc_gamma_set(struct drm_crtc *crtc, u16 *red, u16 *green,
- u16 *blue, uint32_t size,
- struct drm_modeset_acquire_ctx *ctx)
+static void cirrus_crtc_load_lut(struct drm_crtc *crtc)
{
struct drm_device *dev = crtc->dev;
struct cirrus_device *cdev = dev->dev_private;
@@ -317,7 +302,7 @@ static int cirrus_crtc_gamma_set(struct drm_crtc *crtc, u16 *red, u16 *green,
int i;
if (!crtc->enabled)
- return 0;
+ return;
r = crtc->gamma_store;
g = r + crtc->gamma_size;
@@ -330,6 +315,27 @@ static int cirrus_crtc_gamma_set(struct drm_crtc *crtc, u16 *red, u16 *green,
WREG8(PALETTE_DATA, *g++ >> 8);
WREG8(PALETTE_DATA, *b++ >> 8);
}
+}
+
+/*
+ * This is called after a mode is programmed. It should reverse anything done
+ * by the prepare function
+ */
+static void cirrus_crtc_commit(struct drm_crtc *crtc)
+{
+ cirrus_crtc_load_lut(crtc);
+}
+
+/*
+ * The core can pass us a set of gamma values to program. We actually only
+ * use this for 8-bit mode so can't perform smooth fades on deeper modes,
+ * but it's a requirement that we provide the function
+ */
+static int cirrus_crtc_gamma_set(struct drm_crtc *crtc, u16 *red, u16 *green,
+ u16 *blue, uint32_t size,
+ struct drm_modeset_acquire_ctx *ctx)
+{
+ cirrus_crtc_load_lut(crtc);
return 0;
}
--
2.15.1
In the past the ast driver relied upon the fbdev emulation helpers to
call ->load_lut at boot-up. But since
commit b8e2b0199cc377617dc238f5106352c06dcd3fa2
Author: Peter Rosin <peda(a)axentia.se>
Date: Tue Jul 4 12:36:57 2017 +0200
drm/fb-helper: factor out pseudo-palette
that's cleaned up and drivers are expected to boot into a consistent
lut state. This patch fixes that.
Fixes: b8e2b0199cc3 ("drm/fb-helper: factor out pseudo-palette")
Cc: Peter Rosin <peda(a)axentia.se>
Cc: Daniel Vetter <daniel.vetter(a)ffwll.ch>
Cc: <stable(a)vger.kernel.org> # v4.14+
Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=198123
Cc: Bill Fraser <bill.fraser(a)gmail.com>
Reported-and-Tested-by: Bill Fraser <bill.fraser(a)gmail.com>
Tested-by: Konstantin Khlebnikov <koct9i(a)gmail.com>
Tested-by: Paul Tobias <tobias.pal(a)gmail.com>
Signed-off-by: Daniel Vetter <daniel.vetter(a)intel.com>
---
drivers/gpu/drm/ast/ast_mode.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/drivers/gpu/drm/ast/ast_mode.c b/drivers/gpu/drm/ast/ast_mode.c
index 9555a3542022..831b73392d82 100644
--- a/drivers/gpu/drm/ast/ast_mode.c
+++ b/drivers/gpu/drm/ast/ast_mode.c
@@ -644,6 +644,7 @@ static void ast_crtc_commit(struct drm_crtc *crtc)
{
struct ast_private *ast = crtc->dev->dev_private;
ast_set_index_reg_mask(ast, AST_IO_SEQ_PORT, 0x1, 0xdf, 0);
+ ast_crtc_load_lut(crtc);
}
--
2.15.1
The bounce buffer is gone from the MMC core, and now we found out
that there are some (crippled) i.MX boards out there that have broken
ADMA (cannot do scatter-gather), and also broken PIO so they must
use SDMA. Closer examination shows a less significant slowdown
also on SDMA-only capable Laptop hosts.
SDMA sets down the number of segments to one, so that each segment
gets turned into a singular request that ping-pongs to the block
layer before the next request/segment is issued.
Apparently it happens a lot that the block layer sends requests
that include a lot of physically discontiguous segments. My guess
is that this phenomenon is coming from the file system.
These devices that cannot handle scatterlists in hardware can see
major benefits from a DMA-contiguous bounce buffer.
This patch accumulates those fragmented scatterlists in a physically
contiguous bounce buffer so that we can issue bigger DMA data chunks
to/from the card.
When tested with a PCI-integrated host (1217:8221) that
only supports SDMA:
0b:00.0 SD Host controller: O2 Micro, Inc. OZ600FJ0/OZ900FJ0/OZ600FJS
SD/MMC Card Reader Controller (rev 05)
This patch gave ~1Mbyte/s improved throughput on large reads and
writes when testing using iozone than without the patch.
dmesg:
sdhci-pci 0000:0b:00.0: SDHCI controller found [1217:8221] (rev 5)
mmc0 bounce up to 128 segments into one, max segment size 65536 bytes
mmc0: SDHCI controller on PCI [0000:0b:00.0] using DMA
On the i.MX SDHCI controllers on the crippled i.MX 25 and i.MX 35
the patch restores the performance to what it was before we removed
the bounce buffers.
Cc: Pierre Ossman <pierre(a)ossman.eu>
Cc: Benoît Thébaudeau <benoit(a)wsystem.com>
Cc: Fabio Estevam <fabio.estevam(a)nxp.com>
Cc: Benjamin Beckmeyer <beckmeyer.b(a)rittal.de>
Cc: stable(a)vger.kernel.org # v4.14+
Fixes: de3ee99b097d ("mmc: Delete bounce buffer handling")
Tested-by: Benjamin Beckmeyer <beckmeyer.b(a)rittal.de>
Acked-by: Adrian Hunter <adrian.hunter(a)intel.com>
Signed-off-by: Linus Walleij <linus.walleij(a)linaro.org>
---
ChangeLog v7->v8:
- Fixed bad information and spelling mistakes in the commit
message.
- Use sdhci_sdma_address() in one more spot identified by Adrian.
- Collected Adrian's ACK.
ChangeLog v6->v7:
- Fix the directions on dma_sync_for[device|cpu]() so the
ownership of the buffer gets swapped properly and in the right
direction for every transfer. Didn't see this because x86 PCI is
DMA coherent...
- Tested and greenlighted on i.MX 25.
- Also tested on the PCI version.
ChangeLog v5->v6:
- Again switch back to explicit sync of buffers. I want to get this
solution to work because it gives more control and it's more
elegant.
- Update host->max_req_size as noted by Adrian, hopefully this
fixes the i.MX. I was just lucky on my Intel laptop I guess:
the block stack never requested anything bigger than 64KB and
that was why it worked even if max_req_size was bigger than
what would fit in the bounce buffer.
- Copy the number of bytes in the mmc_data instead of the number
of bytes in the bounce buffer. For RX this is blksize * blocks
and for TX this is bytes_xfered.
- Break out a sdhci_sdma_address() for getting the DMA address
for either the raw sglist or the bounce buffer depending on
configuration.
- Add some explicit bounds check for the data so that we do not
attempt to copy more than the bounce buffer size even if the
block layer is erroneously configured.
- Move allocation of bounce buffer out to its own function.
- Use pr_[info|err] throughout so all debug prints from the
driver come out in the same manner and style.
- Use unsigned int for the bounce buffer size.
- Re-tested with iozone: we still get the same nice performance
improvements.
- Request a test on i.MX (hi Benjamin)
ChangeLog v4->v5:
- Go back to dma_alloc_coherent() as this apparently works better.
- Keep the other changes, cap for 64KB, fall back to single segments.
- Requesting a test of this on i.MX. (Sorry Benjamin.)
ChangeLog v3->v4:
- Cap the bounce buffer to 64KB instead of the biggest segment
as we experience diminishing returns with buffers > 64KB.
- Instead of using dma_alloc_coherent(), use good old devm_kmalloc()
and issue dma_sync_single_for*() to explicitly switch
ownership between CPU and the device. This way we exercise the
cache better and may consume less CPU.
- Bail out with single segments if we cannot allocate a bounce
buffer.
- Tested on the PCI SDHCI on my laptop: requesting a new test
on i.MX from Benjamin. (Please!)
ChangeLog v2->v3:
- Rewrite the commit message a bit
- Add Benjamin's Tested-by
- Add Fixes and stable tags
ChangeLog v1->v2:
- Skip the remapping and fiddling with the buffer, instead use
dma_alloc_coherent() and use a simple, coherent bounce buffer.
- Couple kernel messages to ->parent of the mmc_host as it relates
to the hardware characteristics.
---
drivers/mmc/host/sdhci.c | 164 ++++++++++++++++++++++++++++++++++++++++++++---
drivers/mmc/host/sdhci.h | 3 +
2 files changed, 159 insertions(+), 8 deletions(-)
diff --git a/drivers/mmc/host/sdhci.c b/drivers/mmc/host/sdhci.c
index e9290a3439d5..d24306b2b839 100644
--- a/drivers/mmc/host/sdhci.c
+++ b/drivers/mmc/host/sdhci.c
@@ -21,6 +21,7 @@
#include <linux/dma-mapping.h>
#include <linux/slab.h>
#include <linux/scatterlist.h>
+#include <linux/sizes.h>
#include <linux/swiotlb.h>
#include <linux/regulator/consumer.h>
#include <linux/pm_runtime.h>
@@ -502,8 +503,35 @@ static int sdhci_pre_dma_transfer(struct sdhci_host *host,
if (data->host_cookie == COOKIE_PRE_MAPPED)
return data->sg_count;
- sg_count = dma_map_sg(mmc_dev(host->mmc), data->sg, data->sg_len,
- mmc_get_dma_dir(data));
+ /* Bounce write requests to the bounce buffer */
+ if (host->bounce_buffer) {
+ unsigned int length = data->blksz * data->blocks;
+
+ if (length > host->bounce_buffer_size) {
+ pr_err("%s: asked for transfer of %u bytes exceeds bounce buffer %u bytes\n",
+ mmc_hostname(host->mmc), length,
+ host->bounce_buffer_size);
+ return -EIO;
+ }
+ if (mmc_get_dma_dir(data) == DMA_TO_DEVICE) {
+ /* Copy the data to the bounce buffer */
+ sg_copy_to_buffer(data->sg, data->sg_len,
+ host->bounce_buffer,
+ length);
+ }
+ /* Switch ownership to the DMA */
+ dma_sync_single_for_device(host->mmc->parent,
+ host->bounce_addr,
+ host->bounce_buffer_size,
+ mmc_get_dma_dir(data));
+ /* Just a dummy value */
+ sg_count = 1;
+ } else {
+ /* Just access the data directly from memory */
+ sg_count = dma_map_sg(mmc_dev(host->mmc),
+ data->sg, data->sg_len,
+ mmc_get_dma_dir(data));
+ }
if (sg_count == 0)
return -ENOSPC;
@@ -673,6 +701,14 @@ static void sdhci_adma_table_post(struct sdhci_host *host,
}
}
+static u32 sdhci_sdma_address(struct sdhci_host *host)
+{
+ if (host->bounce_buffer)
+ return host->bounce_addr;
+ else
+ return sg_dma_address(host->data->sg);
+}
+
static u8 sdhci_calc_timeout(struct sdhci_host *host, struct mmc_command *cmd)
{
u8 count;
@@ -858,8 +894,8 @@ static void sdhci_prepare_data(struct sdhci_host *host, struct mmc_command *cmd)
SDHCI_ADMA_ADDRESS_HI);
} else {
WARN_ON(sg_cnt != 1);
- sdhci_writel(host, sg_dma_address(data->sg),
- SDHCI_DMA_ADDRESS);
+ sdhci_writel(host, sdhci_sdma_address(host),
+ SDHCI_DMA_ADDRESS);
}
}
@@ -2248,7 +2284,12 @@ static void sdhci_pre_req(struct mmc_host *mmc, struct mmc_request *mrq)
mrq->data->host_cookie = COOKIE_UNMAPPED;
- if (host->flags & SDHCI_REQ_USE_DMA)
+ /*
+ * No pre-mapping in the pre hook if we're using the bounce buffer,
+ * for that we would need two bounce buffers since one buffer is
+ * in flight when this is getting called.
+ */
+ if (host->flags & SDHCI_REQ_USE_DMA && !host->bounce_buffer)
sdhci_pre_dma_transfer(host, mrq->data, COOKIE_PRE_MAPPED);
}
@@ -2352,8 +2393,45 @@ static bool sdhci_request_done(struct sdhci_host *host)
struct mmc_data *data = mrq->data;
if (data && data->host_cookie == COOKIE_MAPPED) {
- dma_unmap_sg(mmc_dev(host->mmc), data->sg, data->sg_len,
- mmc_get_dma_dir(data));
+ if (host->bounce_buffer) {
+ /*
+ * On reads, copy the bounced data into the
+ * sglist
+ */
+ if (mmc_get_dma_dir(data) == DMA_FROM_DEVICE) {
+ unsigned int length = data->bytes_xfered;
+
+ if (length > host->bounce_buffer_size) {
+ pr_err("%s: bounce buffer is %u bytes but DMA claims to have transferred %u bytes\n",
+ mmc_hostname(host->mmc),
+ host->bounce_buffer_size,
+ data->bytes_xfered);
+ /* Cap it down and continue */
+ length = host->bounce_buffer_size;
+ }
+ dma_sync_single_for_cpu(
+ host->mmc->parent,
+ host->bounce_addr,
+ host->bounce_buffer_size,
+ DMA_FROM_DEVICE);
+ sg_copy_from_buffer(data->sg,
+ data->sg_len,
+ host->bounce_buffer,
+ length);
+ } else {
+ /* No copying, just switch ownership */
+ dma_sync_single_for_cpu(
+ host->mmc->parent,
+ host->bounce_addr,
+ host->bounce_buffer_size,
+ mmc_get_dma_dir(data));
+ }
+ } else {
+ /* Unmap the raw data */
+ dma_unmap_sg(mmc_dev(host->mmc), data->sg,
+ data->sg_len,
+ mmc_get_dma_dir(data));
+ }
data->host_cookie = COOKIE_UNMAPPED;
}
}
@@ -2636,7 +2714,8 @@ static void sdhci_data_irq(struct sdhci_host *host, u32 intmask)
*/
if (intmask & SDHCI_INT_DMA_END) {
u32 dmastart, dmanow;
- dmastart = sg_dma_address(host->data->sg);
+
+ dmastart = sdhci_sdma_address(host);
dmanow = dmastart + host->data->bytes_xfered;
/*
* Force update to the next DMA block boundary.
@@ -3217,6 +3296,68 @@ void __sdhci_read_caps(struct sdhci_host *host, u16 *ver, u32 *caps, u32 *caps1)
}
EXPORT_SYMBOL_GPL(__sdhci_read_caps);
+static int sdhci_allocate_bounce_buffer(struct sdhci_host *host)
+{
+ struct mmc_host *mmc = host->mmc;
+ unsigned int max_blocks;
+ unsigned int bounce_size;
+ int ret;
+
+ /*
+ * Cap the bounce buffer at 64KB. Using a bigger bounce buffer
+ * has diminishing returns, this is probably because SD/MMC
+ * cards are usually optimized to handle this size of requests.
+ */
+ bounce_size = SZ_64K;
+ /*
+ * Adjust downwards to maximum request size if this is less
+ * than our segment size, else hammer down the maximum
+ * request size to the maximum buffer size.
+ */
+ if (mmc->max_req_size < bounce_size)
+ bounce_size = mmc->max_req_size;
+ max_blocks = bounce_size / 512;
+
+ /*
+ * When we just support one segment, we can get significant
+ * speedups by the help of a bounce buffer to group scattered
+ * reads/writes together.
+ */
+ host->bounce_buffer = devm_kmalloc(mmc->parent,
+ bounce_size,
+ GFP_KERNEL);
+ if (!host->bounce_buffer) {
+ pr_err("%s: failed to allocate %u bytes for bounce buffer, falling back to single segments\n",
+ mmc_hostname(mmc),
+ bounce_size);
+ /*
+ * Exiting with zero here makes sure we proceed with
+ * mmc->max_segs == 1.
+ */
+ return 0;
+ }
+
+ host->bounce_addr = dma_map_single(mmc->parent,
+ host->bounce_buffer,
+ bounce_size,
+ DMA_BIDIRECTIONAL);
+ ret = dma_mapping_error(mmc->parent, host->bounce_addr);
+ if (ret)
+ /* Again fall back to max_segs == 1 */
+ return 0;
+ host->bounce_buffer_size = bounce_size;
+
+ /* Lie about this since we're bouncing */
+ mmc->max_segs = max_blocks;
+ mmc->max_seg_size = bounce_size;
+ mmc->max_req_size = bounce_size;
+
+ pr_info("%s bounce up to %u segments into one, max segment size %u bytes\n",
+ mmc_hostname(mmc), max_blocks, bounce_size);
+
+ return 0;
+}
+
int sdhci_setup_host(struct sdhci_host *host)
{
struct mmc_host *mmc;
@@ -3713,6 +3854,13 @@ int sdhci_setup_host(struct sdhci_host *host)
*/
mmc->max_blk_count = (host->quirks & SDHCI_QUIRK_NO_MULTIBLOCK) ? 1 : 65535;
+ if (mmc->max_segs == 1) {
+ /* This may alter mmc->*_blk_* parameters */
+ ret = sdhci_allocate_bounce_buffer(host);
+ if (ret)
+ return ret;
+ }
+
return 0;
unreg:
diff --git a/drivers/mmc/host/sdhci.h b/drivers/mmc/host/sdhci.h
index 54bc444c317f..1d7d61e25dbf 100644
--- a/drivers/mmc/host/sdhci.h
+++ b/drivers/mmc/host/sdhci.h
@@ -440,6 +440,9 @@ struct sdhci_host {
int irq; /* Device IRQ */
void __iomem *ioaddr; /* Mapped address */
+ char *bounce_buffer; /* For packing SDMA reads/writes */
+ dma_addr_t bounce_addr;
+ unsigned int bounce_buffer_size;
const struct sdhci_ops *ops; /* Low level hw interface */
--
2.14.3
From: James Morse <james.morse(a)arm.com>
cpu_pm_enter() calls the pm notifier chain with CPU_PM_ENTER, then if
there is a failure: CPU_PM_ENTER_FAILED.
When KVM receives CPU_PM_ENTER it calls cpu_hyp_reset() which will
return us to the hyp-stub. If we subsequently get a CPU_PM_ENTER_FAILED,
KVM does nothing, leaving the CPU running with the hyp-stub, at odds
with kvm_arm_hardware_enabled.
Add CPU_PM_ENTER_FAILED as a fallthrough for CPU_PM_EXIT, this reloads
KVM based on kvm_arm_hardware_enabled. This is safe even if CPU_PM_ENTER
never gets as far as KVM, as cpu_hyp_reinit() calls cpu_hyp_reset()
to make sure the hyp-stub is loaded before reloading KVM.
Fixes: 67f691976662 ("arm64: kvm: allows kvm cpu hotplug")
Cc: <stable(a)vger.kernel.org> # v4.7+
CC: Lorenzo Pieralisi <lorenzo.pieralisi(a)arm.com>
Reviewed-by: Christoffer Dall <christoffer.dall(a)linaro.org>
Signed-off-by: James Morse <james.morse(a)arm.com>
Signed-off-by: Christoffer Dall <christoffer.dall(a)linaro.org>
---
virt/kvm/arm/arm.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c
index 59d8e04c19fa..639dca0c0560 100644
--- a/virt/kvm/arm/arm.c
+++ b/virt/kvm/arm/arm.c
@@ -1262,6 +1262,7 @@ static int hyp_init_cpu_pm_notifier(struct notifier_block *self,
cpu_hyp_reset();
return NOTIFY_OK;
+ case CPU_PM_ENTER_FAILED:
case CPU_PM_EXIT:
if (__this_cpu_read(kvm_arm_hardware_enabled))
/* The hardware was enabled before suspend. */
--
2.14.2
From: "Rafael J. Wysocki" <rafael.j.wysocki(a)intel.com>
[ Upstream commit 56026645e2b6f11ede34a5e6ab69d3eb56f9c8fc ]
After commit aa7519af450d (cpufreq: Use transition_delay_us for legacy
governors as well) the sampling_rate field of struct dbs_data may be
less than the tick period which causes dbs_update() to produce
incorrect results, so make the code ensure that the value of that
field will always be sufficiently large.
Cc: 4.14 <stable(a)vger.kernel.org> # 4.14
Fixes: aa7519af450d (cpufreq: Use transition_delay_us for legacy governors as well)
Reported-by: Andy Tang <andy.tang(a)nxp.com>
Reported-by: Doug Smythies <dsmythies(a)telus.net>
Tested-by: Andy Tang <andy.tang(a)nxp.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki(a)intel.com>
Acked-by: Viresh Kumar <viresh.kumar(a)linaro.org>
---
drivers/cpufreq/cpufreq_governor.c | 19 ++++++++++++++++---
1 file changed, 16 insertions(+), 3 deletions(-)
diff --git a/drivers/cpufreq/cpufreq_governor.c b/drivers/cpufreq/cpufreq_governor.c
index 58d4f4e1ad6a..ca38229b045a 100644
--- a/drivers/cpufreq/cpufreq_governor.c
+++ b/drivers/cpufreq/cpufreq_governor.c
@@ -22,6 +22,8 @@
#include "cpufreq_governor.h"
+#define CPUFREQ_DBS_MIN_SAMPLING_INTERVAL (2 * TICK_NSEC / NSEC_PER_USEC)
+
static DEFINE_PER_CPU(struct cpu_dbs_info, cpu_dbs);
static DEFINE_MUTEX(gov_dbs_data_mutex);
@@ -47,11 +49,15 @@ ssize_t store_sampling_rate(struct gov_attr_set *attr_set, const char *buf,
{
struct dbs_data *dbs_data = to_dbs_data(attr_set);
struct policy_dbs_info *policy_dbs;
+ unsigned int sampling_interval;
int ret;
- ret = sscanf(buf, "%u", &dbs_data->sampling_rate);
- if (ret != 1)
+
+ ret = sscanf(buf, "%u", &sampling_interval);
+ if (ret != 1 || sampling_interval < CPUFREQ_DBS_MIN_SAMPLING_INTERVAL)
return -EINVAL;
+ dbs_data->sampling_rate = sampling_interval;
+
/*
* We are operating under dbs_data->mutex and so the list and its
* entries can't be freed concurrently.
@@ -430,7 +436,14 @@ int cpufreq_dbs_governor_init(struct cpufreq_policy *policy)
if (ret)
goto free_policy_dbs_info;
- dbs_data->sampling_rate = cpufreq_policy_transition_delay_us(policy);
+ /*
+ * The sampling interval should not be less than the transition latency
+ * of the CPU and it also cannot be too small for dbs_update() to work
+ * correctly.
+ */
+ dbs_data->sampling_rate = max_t(unsigned int,
+ CPUFREQ_DBS_MIN_SAMPLING_INTERVAL,
+ cpufreq_policy_transition_delay_us(policy));
if (!have_governor_per_policy())
gov->gdbs_data = dbs_data;
--
2.15.0.194.g9af6a3dea062
This is the start of the stable review cycle for the 3.18.93 release.
There are 52 patches in this series, all will be posted as a response
to this one. If anyone has any issues with these being applied, please
let me know.
Responses should be made by Wed Jan 31 12:36:07 UTC 2018.
Anything received after that time might be too late.
The whole patch series can be found in one patch at:
kernel.org/pub/linux/kernel/v3.x/stable-review/patch-3.18.93-rc1.gz
or in the git tree and branch at:
git://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git linux-3.18.y
and the diffstat can be found below.
thanks,
greg k-h
-------------
Pseudo-Shortlog of commits:
Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
Linux 3.18.93-rc1
Jim Westfall <jwestfall(a)surrealistic.net>
ipv4: Make neigh lookup keys for loopback/point-to-point devices be INADDR_ANY
Mike Maloney <maloney(a)google.com>
ipv6: fix udpv6 sendmsg crash caused by too small MTU
Jim Westfall <jwestfall(a)surrealistic.net>
net: Allow neigh contructor functions ability to modify the primary_key
Neil Horman <nhorman(a)tuxdriver.com>
vmxnet3: repair memory leak
Xin Long <lucien.xin(a)gmail.com>
sctp: return error if the asoc has been peeled off in sctp_wait_for_sndbuf
Xin Long <lucien.xin(a)gmail.com>
sctp: do not allow the v4 socket to bind a v4mapped v6 address
Guillaume Nault <g.nault(a)alphalink.fr>
pppoe: take ->needed_headroom of lower device into account on xmit
Eric Dumazet <edumazet(a)google.com>
net: qdisc_pkt_len_init() should be more robust
Craig Gallek <kraig(a)google.com>
tcp: __tcp_hdrlen() helper
Felix Fietkau <nbd(a)nbd.name>
net: igmp: fix source address check for IGMPv3 reports
Alexey Kodanev <alexey.kodanev(a)oracle.com>
dccp: don't restart ccid2_hc_tx_rto_expire() if sk in closed state
Dan Streetman <ddstreet(a)ieee.org>
net: tcp: close sock if net namespace is exiting
Jia Zhang <zhang.jia(a)linux.alibaba.com>
x86/microcode/intel: Extend BDW late-loading further with LLC size check
Richard Weinberger <richard(a)nod.at>
um: Remove copy&paste code from init.h
Richard Weinberger <richard(a)nod.at>
um: Stop abusing __KERNEL__
Greg KH <gregkh(a)linuxfoundation.org>
eventpoll.h: add missing epoll event masks
Thomas Meyer <thomas(a)m3y3r.de>
um: link vmlinux with -no-pie
Johannes Thumshirn <jthumshirn(a)suse.de>
scsi: libiscsi: fix shifting of DID_REQUEUE host byte
Jiri Slaby <jslaby(a)suse.cz>
fs/fcntl: f_setown, avoid undefined behaviour
Jeff Mahoney <jeffm(a)suse.com>
reiserfs: don't preallocate blocks for extended attributes
Jeff Mahoney <jeffm(a)suse.com>
reiserfs: fix race in prealloc discard
Kevin Cernekee <cernekee(a)chromium.org>
netfilter: xt_osf: Add missing permission checks
Kevin Cernekee <cernekee(a)chromium.org>
netfilter: nfnetlink_cthelper: Add missing permission checks
Ulrich Weber <ulrich.weber(a)riverbed.com>
netfilter: nf_conntrack_sip: extend request line validation
Florian Westphal <fw(a)strlen.de>
netfilter: restart search if moved to other chain
Liping Zhang <liping.zhang(a)spreadtrum.com>
netfilter: nf_ct_expect: remove the redundant slash when policy name is empty
Jiri Slaby <jslaby(a)suse.cz>
ipc: msg, make msgrcv work with LONG_MIN
Michal Hocko <mhocko(a)suse.com>
hwpoison, memcg: forcibly uncharge LRU pages
Michal Hocko <mhocko(a)suse.com>
mm/mmap.c: do not blow on PROT_NONE MAP_FIXED holes in the stack
Marc Kleine-Budde <mkl(a)pengutronix.de>
can: af_can: canfd_rcv(): replace WARN_ONCE by pr_warn_once
Marc Kleine-Budde <mkl(a)pengutronix.de>
can: af_can: can_rcv(): replace WARN_ONCE by pr_warn_once
Jonathan Dieter <jdieter(a)lesbg.com>
usbip: Fix implicit fallthrough warning
Andy Lutomirski <luto(a)kernel.org>
x86/asm/32: Make sync_core() handle missing CPUID on all 32-bit kernels
Jonas Gorski <jonas.gorski(a)gmail.com>
MIPS: AR7: ensure the port type's FCR value is used
Marc Zyngier <marc.zyngier(a)arm.com>
arm64: KVM: Fix SMCCC handling of unimplemented SMC/HVC calls
Dennis Yang <dennisyang(a)qnap.com>
dm thin metadata: THIN_MAX_CONCURRENT_LOCKS should be 6
Joe Thornber <thornber(a)redhat.com>
dm btree: fix serious bug in btree_split_beneath()
Thomas Petazzoni <thomas.petazzoni(a)free-electrons.com>
ARM: dts: kirkwood: fix pin-muxing of MPP7 on OpenBlocks A7
Arnd Bergmann <arnd(a)arndb.de>
phy: work around 'phys' references to usb-nop-xceiv devices
Johan Hovold <johan(a)kernel.org>
Input: twl4030-vibra - fix sibling-node lookup
Marek Belisko <marek(a)goldelico.com>
Input: twl4030-vibra - fix ERROR: Bad of_node_put() warning
Johan Hovold <johan(a)kernel.org>
Input: twl6040-vibra - fix child-node lookup
H. Nikolaus Schaller <hns(a)goldelico.com>
Input: twl6040-vibra - fix DT node memory management
Johan Hovold <johan(a)kernel.org>
Input: 88pm860x-ts - fix child-node lookup
Joe Lawrence <joe.lawrence(a)redhat.com>
pipe: avoid round_pipe_size() nr_pages overflow on 32-bit
Eric Biggers <ebiggers(a)google.com>
af_key: fix buffer overread in parse_exthdrs()
Eric Biggers <ebiggers(a)google.com>
af_key: fix buffer overread in verify_address_len()
Takashi Iwai <tiwai(a)suse.de>
ALSA: hda - Apply the existing quirk to iMac 14,1
Takashi Iwai <tiwai(a)suse.de>
ALSA: pcm: Remove yet superfluous WARN_ON()
Li Jinyue <lijinyue(a)huawei.com>
futex: Prevent overflow by strengthen input validation
Hannes Reinecke <hare(a)suse.de>
scsi: sg: disable SET_FORCE_LOW_DMA
Arnd Bergmann <arnd(a)arndb.de>
gcov: disable for COMPILE_TEST
-------------
Diffstat:
Makefile | 4 ++--
arch/arm/boot/dts/kirkwood-openblocks_a7.dts | 10 ++++++++--
arch/arm64/kvm/handle_exit.c | 4 ++--
arch/mips/ar7/platform.c | 2 +-
arch/um/Makefile | 9 +++++----
arch/um/drivers/mconsole.h | 2 +-
arch/um/include/shared/init.h | 24 ++--------------------
arch/um/include/shared/user.h | 2 +-
arch/x86/include/asm/processor.h | 2 +-
arch/x86/kernel/cpu/microcode/intel.c | 20 +++++++++++++++++--
arch/x86/um/shared/sysdep/tls.h | 6 +++---
drivers/input/misc/twl4030-vibra.c | 7 +++++--
drivers/input/misc/twl6040-vibra.c | 2 +-
drivers/input/touchscreen/88pm860x-ts.c | 16 +++++++++++----
drivers/md/dm-thin-metadata.c | 6 +++++-
drivers/md/persistent-data/dm-btree.c | 19 ++----------------
drivers/net/ppp/pppoe.c | 11 +++++-----
drivers/net/vmxnet3/vmxnet3_drv.c | 2 +-
drivers/phy/phy-core.c | 4 ++++
drivers/scsi/libiscsi.c | 2 +-
drivers/scsi/sg.c | 30 +++++++++-------------------
fs/fcntl.c | 4 ++++
fs/pipe.c | 18 +++++++++++++++--
fs/reiserfs/bitmap.c | 14 ++++++++++---
include/linux/tcp.h | 7 ++++++-
include/net/arp.h | 3 +++
include/net/net_namespace.h | 10 ++++++++++
include/scsi/sg.h | 1 -
include/uapi/linux/eventpoll.h | 13 ++++++++++++
ipc/msg.c | 5 ++++-
kernel/futex.c | 3 +++
kernel/gcov/Kconfig | 1 +
mm/memcontrol.c | 2 +-
mm/memory-failure.c | 7 +++++++
mm/mmap.c | 6 ++++--
net/can/af_can.c | 22 ++++++++++----------
net/core/dev.c | 19 ++++++++++++++----
net/core/neighbour.c | 4 ++--
net/dccp/ccids/ccid2.c | 3 +++
net/ipv4/arp.c | 7 ++++++-
net/ipv4/igmp.c | 2 +-
net/ipv4/tcp.c | 3 +++
net/ipv4/tcp_timer.c | 15 ++++++++++++++
net/ipv6/ip6_output.c | 6 ++++--
net/key/af_key.c | 8 ++++++++
net/netfilter/nf_conntrack_core.c | 7 +++++++
net/netfilter/nf_conntrack_expect.c | 2 +-
net/netfilter/nf_conntrack_sip.c | 5 ++++-
net/netfilter/nfnetlink_cthelper.c | 10 ++++++++++
net/netfilter/xt_osf.c | 7 +++++++
net/sctp/socket.c | 30 +++++++++++-----------------
sound/core/pcm_lib.c | 1 -
sound/pci/hda/patch_cirrus.c | 1 +
tools/usb/usbip/src/usbip.c | 2 ++
54 files changed, 284 insertions(+), 148 deletions(-)
This is an automatically generated email to let you know that the following patch was queued:
Subject: media: v4l2-compat-ioctl32.c: make ctrl_is_pointer work for subdevs
Author: Hans Verkuil <hansverk(a)cisco.com>
Date: Tue Jan 30 10:18:32 2018 -0500
If the device is of type VFL_TYPE_SUBDEV then vdev->ioctl_ops
is NULL so the 'if (!ops->vidioc_query_ext_ctrl)' check would crash.
Add a test for !ops to the condition.
All sub-devices that have controls will use the control framework,
so they do not have an equivalent to ops->vidioc_query_ext_ctrl.
Returning false if ops is NULL is the correct thing to do here.
Fixes: b8c601e8af ("v4l2-compat-ioctl32.c: fix ctrl_is_pointer")
Signed-off-by: Hans Verkuil <hans.verkuil(a)cisco.com>
Acked-by: Sakari Ailus <sakari.ailus(a)linux.intel.com>
Reported-by: Laurent Pinchart <laurent.pinchart(a)ideasonboard.com>
Reviewed-by: Laurent Pinchart <laurent.pinchart(a)ideasonboard.com>
Cc: <stable(a)vger.kernel.org> # for v4.15 and up
Signed-off-by: Mauro Carvalho Chehab <mchehab(a)s-opensource.com>
drivers/media/v4l2-core/v4l2-compat-ioctl32.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
---
diff --git a/drivers/media/v4l2-core/v4l2-compat-ioctl32.c b/drivers/media/v4l2-core/v4l2-compat-ioctl32.c
index bdb5c226d01c..5198c9eeb348 100644
--- a/drivers/media/v4l2-core/v4l2-compat-ioctl32.c
+++ b/drivers/media/v4l2-core/v4l2-compat-ioctl32.c
@@ -770,7 +770,7 @@ static inline bool ctrl_is_pointer(struct file *file, u32 id)
return ctrl && ctrl->is_ptr;
}
- if (!ops->vidioc_query_ext_ctrl)
+ if (!ops || !ops->vidioc_query_ext_ctrl)
return false;
return !ops->vidioc_query_ext_ctrl(file, fh, &qec) &&
The fcp_rsp_info structure as defined in the FC spec has an initial 3 bytes
reserved field. The ibmvfc driver mistakenly defined this field as 4 bytes
resulting in the rsp_code field being defined in what should be the start of
the second reserved field and thus always being reported as zero by the
driver.
Ideally, we should wire ibmvfc up with libfc for the sake of code
deduplication, and ease of maintaining standardized structures in a single
place. However, for now simply fixup the definition in ibmvfc for
backporting to distros on older kernels. Wiring up with libfc will be done
in a followup patch.
Cc: stable(a)vger.kernel.org
Reported-by: Hannes Reinecke <hare(a)suse.de>
Signed-off-by: Tyrel Datwyler <tyreld(a)linux.vnet.ibm.com>
---
drivers/scsi/ibmvscsi/ibmvfc.h | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/scsi/ibmvscsi/ibmvfc.h b/drivers/scsi/ibmvscsi/ibmvfc.h
index 9a0696f..b81a53c 100644
--- a/drivers/scsi/ibmvscsi/ibmvfc.h
+++ b/drivers/scsi/ibmvscsi/ibmvfc.h
@@ -367,7 +367,7 @@ enum ibmvfc_fcp_rsp_info_codes {
};
struct ibmvfc_fcp_rsp_info {
- __be16 reserved;
+ u8 reserved[3];
u8 rsp_code;
u8 reserved2[4];
}__attribute__((packed, aligned (2)));
--
2.7.4
commit 1005bccd7a4a ("crypto: caam - enable instantiation of all RNG4 state
handles") introduces a control when incrementing ent_delay which contains
the following comment above it:
/*
* If either SH were instantiated by somebody else
* (e.g. u-boot) then it is assumed that the entropy
* parameters are properly set and thus the function
* setting these (kick_trng(...)) is skipped.
* Also, if a handle was instantiated, do not change
* the TRNG parameters.
*/
This is a problem observed when sec_init() has been run in u-boot and
TrustZone is enabled. We can fix this by instantiating all rng state
handles in u-boot but, on the Kernel side we should ensure that this
non-terminating path is dealt with.
Fixes: 1005bccd7a4a ("crypto: caam - enable instantiation of all RNG4 state
handles")
Reported-by: Ryan Harkin <ryan.harkin(a)linaro.org>
Cc: "Horia Geantă" <horia.geanta(a)nxp.com>
Cc: Aymen Sghaier <aymen.sghaier(a)nxp.com>
Cc: Fabio Estevam <fabio.estevam(a)nxp.com>
Cc: Peng Fan <peng.fan(a)nxp.com>
Cc: "David S. Miller" <davem(a)davemloft.net>
Cc: Lukas Auer <lukas.auer(a)aisec.fraunhofer.de>
Cc: <stable(a)vger.kernel.org> # 4.12+
Signed-off-by: Bryan O'Donoghue <pure.logic(a)nexus-software.ie>
---
drivers/crypto/caam/ctrl.c | 3 +++
1 file changed, 3 insertions(+)
diff --git a/drivers/crypto/caam/ctrl.c b/drivers/crypto/caam/ctrl.c
index 98986d3..0a1e96b 100644
--- a/drivers/crypto/caam/ctrl.c
+++ b/drivers/crypto/caam/ctrl.c
@@ -704,7 +704,10 @@ static int caam_probe(struct platform_device *pdev)
ent_delay);
kick_trng(pdev, ent_delay);
ent_delay += 400;
+ } else if (ctrlpriv->rng4_sh_init && inst_handles) {
+ ent_delay += 400;
}
+
/*
* if instantiate_rng(...) fails, the loop will rerun
* and the kick_trng(...) function will modfiy the
--
2.7.4
The patch titled
Subject: mm, memory_hotplug: fix memmap initialization
has been added to the -mm tree. Its filename is
mm-memory_hotplug-fix-memmap-initialization.patch
This patch should soon appear at
http://ozlabs.org/~akpm/mmots/broken-out/mm-memory_hotplug-fix-memmap-initi…
and later at
http://ozlabs.org/~akpm/mmotm/broken-out/mm-memory_hotplug-fix-memmap-initi…
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/SubmitChecklist when testing your code ***
The -mm tree is included into linux-next and is updated
there every 3-4 working days
------------------------------------------------------
From: Michal Hocko <mhocko(a)suse.com>
Subject: mm, memory_hotplug: fix memmap initialization
Bharata has noticed that onlining a newly added memory doesn't increase
the total memory, pointing to f7f99100d8d9 ("mm: stop zeroing memory
during allocation in vmemmap") as a culprit. This commit has changed the
way how the memory for memmaps is initialized and moves it from the
allocation time to the initialization time. This works properly for the
early memmap init path.
It doesn't work for the memory hotplug though because we need to mark page
as reserved when the sparsemem section is created and later initialize it
completely during onlining. memmap_init_zone is called in the early stage
of onlining. With the current code it calls __init_single_page and as
such it clears up the whole stage and therefore online_pages_range skips
those pages.
Fix this by skipping mm_zero_struct_page in __init_single_page for memory
hotplug path. This is quite ugly but unifying both early init and memory
hotplug init paths is a large project. Make sure we plug the regression
at least.
Link: http://lkml.kernel.org/r/20180130101141.GW21609@dhcp22.suse.cz
Fixes: f7f99100d8d9 ("mm: stop zeroing memory during allocation in vmemmap")
Signed-off-by: Michal Hocko <mhocko(a)suse.com>
Reported-by: Bharata B Rao <bharata(a)linux.vnet.ibm.com>
Tested-by: Bharata B Rao <bharata(a)linux.vnet.ibm.com>
Reviewed-by: Pavel Tatashin <pasha.tatashin(a)oracle.com>
Cc: Steven Sistare <steven.sistare(a)oracle.com>
Cc: Daniel Jordan <daniel.m.jordan(a)oracle.com>
Cc: Bob Picco <bob.picco(a)oracle.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/page_alloc.c | 22 ++++++++++++++--------
1 file changed, 14 insertions(+), 8 deletions(-)
diff -puN mm/page_alloc.c~mm-memory_hotplug-fix-memmap-initialization mm/page_alloc.c
--- a/mm/page_alloc.c~mm-memory_hotplug-fix-memmap-initialization
+++ a/mm/page_alloc.c
@@ -1178,9 +1178,10 @@ static void free_one_page(struct zone *z
}
static void __meminit __init_single_page(struct page *page, unsigned long pfn,
- unsigned long zone, int nid)
+ unsigned long zone, int nid, bool zero)
{
- mm_zero_struct_page(page);
+ if (zero)
+ mm_zero_struct_page(page);
set_page_links(page, zone, nid, pfn);
init_page_count(page);
page_mapcount_reset(page);
@@ -1195,9 +1196,9 @@ static void __meminit __init_single_page
}
static void __meminit __init_single_pfn(unsigned long pfn, unsigned long zone,
- int nid)
+ int nid, bool zero)
{
- return __init_single_page(pfn_to_page(pfn), pfn, zone, nid);
+ return __init_single_page(pfn_to_page(pfn), pfn, zone, nid, zero);
}
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
@@ -1218,7 +1219,7 @@ static void __meminit init_reserved_page
if (pfn >= zone->zone_start_pfn && pfn < zone_end_pfn(zone))
break;
}
- __init_single_pfn(pfn, zid, nid);
+ __init_single_pfn(pfn, zid, nid, true);
}
#else
static inline void init_reserved_page(unsigned long pfn)
@@ -1535,7 +1536,7 @@ static unsigned long __init deferred_in
} else {
page++;
}
- __init_single_page(page, pfn, zid, nid);
+ __init_single_page(page, pfn, zid, nid, true);
nr_pages++;
}
return (nr_pages);
@@ -5400,15 +5401,20 @@ not_early:
* can be created for invalid pages (for alignment)
* check here not to call set_pageblock_migratetype() against
* pfn out of zone.
+ *
+ * Please note that MEMMAP_HOTPLUG path doesn't clear memmap
+ * because this is done early in sparse_add_one_section
*/
if (!(pfn & (pageblock_nr_pages - 1))) {
struct page *page = pfn_to_page(pfn);
- __init_single_page(page, pfn, zone, nid);
+ __init_single_page(page, pfn, zone, nid,
+ context != MEMMAP_HOTPLUG);
set_pageblock_migratetype(page, MIGRATE_MOVABLE);
cond_resched();
} else {
- __init_single_pfn(pfn, zone, nid);
+ __init_single_pfn(pfn, zone, nid,
+ context != MEMMAP_HOTPLUG);
}
}
}
_
Patches currently in -mm which might be from mhocko(a)suse.com are
mm-drop-hotplug-lock-from-lru_add_drain_all.patch
mm-oom-docs-describe-the-cgroup-aware-oom-killer-fix-2.patch
mm-hugetlb-drop-hugepages_treat_as_movable-sysctl.patch
mm-introduce-map_fixed_safe.patch
fs-elf-drop-map_fixed-usage-from-elf_map.patch
fs-elf-drop-map_fixed-usage-from-elf_map-fix-fix.patch
mm-numa-rework-do_pages_move.patch
mm-migrate-remove-reason-argument-from-new_page_t.patch
mm-migrate-remove-reason-argument-from-new_page_t-fix-3.patch
mm-unclutter-thp-migration.patch
mm-hugetlb-unify-core-page-allocation-accounting-and-initialization.patch
mm-hugetlb-integrate-giga-hugetlb-more-naturally-to-the-allocation-path.patch
mm-hugetlb-do-not-rely-on-overcommit-limit-during-migration.patch
mm-hugetlb-get-rid-of-surplus-page-accounting-tricks.patch
mm-hugetlb-further-simplify-hugetlb-allocation-api.patch
hugetlb-mempolicy-fix-the-mbind-hugetlb-migration.patch
hugetlb-mbind-fall-back-to-default-policy-if-vma-is-null.patch
mm-memory_hotplug-fix-memmap-initialization.patch
This is the start of the stable review cycle for the 4.14.16 release.
There are 71 patches in this series, all will be posted as a response
to this one. If anyone has any issues with these being applied, please
let me know.
Responses should be made by Wed Jan 31 12:37:59 UTC 2018.
Anything received after that time might be too late.
The whole patch series can be found in one patch at:
kernel.org/pub/linux/kernel/v4.x/stable-review/patch-4.14.16-rc1.gz
or in the git tree and branch at:
git://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git linux-4.14.y
and the diffstat can be found below.
thanks,
greg k-h
-------------
Pseudo-Shortlog of commits:
Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
Linux 4.14.16-rc1
Rafael J. Wysocki <rafael.j.wysocki(a)intel.com>
cpufreq: governor: Ensure sufficiently large sampling intervals
Daniel Borkmann <daniel(a)iogearbox.net>
bpf, arm64: fix stack_depth tracking in combination with tail calls
Daniel Borkmann <daniel(a)iogearbox.net>
bpf: reject stores into ctx via st and xadd
Alexei Starovoitov <ast(a)kernel.org>
bpf: fix 32-bit divide by zero
Eric Dumazet <edumazet(a)google.com>
bpf: fix divides by zero
Daniel Borkmann <daniel(a)iogearbox.net>
bpf: avoid false sharing of map refcount with max_entries
Alexei Starovoitov <ast(a)kernel.org>
bpf: introduce BPF_JIT_ALWAYS_ON config
Thomas Gleixner <tglx(a)linutronix.de>
hrtimer: Reset hrtimer cpu base proper on CPU hotplug
Andy Lutomirski <luto(a)kernel.org>
x86/mm/64: Fix vmapped stack syncing on very-large-memory 4-level systems
Borislav Petkov <bp(a)suse.de>
x86/microcode: Fix again accessing initrd after having been freed
Jia Zhang <zhang.jia(a)linux.alibaba.com>
x86/microcode/intel: Extend BDW late-loading further with LLC size check
Xiao Liang <xiliang(a)redhat.com>
perf/x86/amd/power: Do not load AMD power module on !AMD platforms
Neil Horman <nhorman(a)tuxdriver.com>
vmxnet3: repair memory leak
Lorenzo Colitti <lorenzo(a)google.com>
net: ipv4: Make "ip route get" match iif lo rules again.
Sabrina Dubroca <sd(a)queasysnail.net>
tls: reset crypto_info when do_tls_setsockopt_tx fails
Sabrina Dubroca <sd(a)queasysnail.net>
tls: return -EBUSY if crypto_info is already set
Sabrina Dubroca <sd(a)queasysnail.net>
tls: fix sw_ctx leak
Ilya Lesokhin <ilyal(a)mellanox.com>
net/tls: Only attach to sockets in ESTABLISHED state
Xin Long <lucien.xin(a)gmail.com>
netlink: reset extack earlier in netlink_rcv_skb
Jakub Kicinski <jakub.kicinski(a)netronome.com>
nfp: use the correct index for link speed table
Talat Batheesh <talatb(a)mellanox.com>
net/mlx5e: Fix fixpoint divide exception in mlx5e_am_stats_compare
David Ahern <dsahern(a)gmail.com>
netlink: extack needs to be reset each time through loop
Xin Long <lucien.xin(a)gmail.com>
sctp: reinit stream if stream outcnt has been change by sinit in sendmsg
Eric Dumazet <edumazet(a)google.com>
flow_dissector: properly cap thoff field
Cong Wang <xiyou.wangcong(a)gmail.com>
tun: fix a memory leak for tfile->tx_array
Yuval Mintz <yuvalm(a)mellanox.com>
mlxsw: spectrum_router: Don't log an error on missing neighbor
Willem de Bruijn <willemb(a)google.com>
gso: validate gso_type in GSO handlers
Alexey Kodanev <alexey.kodanev(a)oracle.com>
ip6_gre: init dev->mtu and dev->hard_header_len correctly
Ivan Vecera <cera(a)cera.cz>
be2net: restore properly promisc mode after queues reconfiguration
Guillaume Nault <g.nault(a)alphalink.fr>
ppp: unlock all_ppp_mutex before registering device
Saeed Mahameed <saeedm(a)mellanox.com>
net/mlx5: Fix get vector affinity helper function
Eran Ben Elisha <eranbe(a)mellanox.com>
{net,ib}/mlx5: Don't disable local loopback multicast traffic when needed
Cong Wang <xiyou.wangcong(a)gmail.com>
tipc: fix a memory leak in tipc_nl_node_get_link()
Xin Long <lucien.xin(a)gmail.com>
sctp: return error if the asoc has been peeled off in sctp_wait_for_sndbuf
Xin Long <lucien.xin(a)gmail.com>
sctp: do not allow the v4 socket to bind a v4mapped v6 address
Francois Romieu <romieu(a)fr.zoreil.com>
r8169: fix memory corruption on retrieval of hardware statistics.
Guillaume Nault <g.nault(a)alphalink.fr>
pppoe: take ->needed_headroom of lower device into account on xmit
David Ahern <dsahern(a)gmail.com>
net: vrf: Add support for sends to local broadcast address
r.hering(a)avm.de <r.hering(a)avm.de>
net/tls: Fix inverted error codes to avoid endless loop
Dan Streetman <ddstreet(a)ieee.org>
net: tcp: close sock if net namespace is exiting
Eric Dumazet <edumazet(a)google.com>
net: qdisc_pkt_len_init() should be more robust
Felix Fietkau <nbd(a)nbd.name>
net: igmp: fix source address check for IGMPv3 reports
Yuiko Oshino <yuiko.oshino(a)microchip.com>
lan78xx: Fix failure in USB Full Speed
Eric Dumazet <edumazet(a)google.com>
ipv6: ip6_make_skb() needs to clear cork.base.dst
Mike Maloney <maloney(a)google.com>
ipv6: fix udpv6 sendmsg crash caused by too small MTU
Ben Hutchings <ben.hutchings(a)codethink.co.uk>
ipv6: Fix getsockopt() for sockets with default IPV6_AUTOFLOWLABEL
Alexey Kodanev <alexey.kodanev(a)oracle.com>
dccp: don't restart ccid2_hc_tx_rto_expire() if sk in closed state
Jim Westfall <jwestfall(a)surrealistic.net>
ipv4: Make neigh lookup keys for loopback/point-to-point devices be INADDR_ANY
Jim Westfall <jwestfall(a)surrealistic.net>
net: Allow neigh contructor functions ability to modify the primary_key
Boris Brezillon <boris.brezillon(a)free-electrons.com>
drm/vc4: Fix NULL pointer dereference in vc4_save_hang_state()
Russell King <rmk+kernel(a)armlinux.org.uk>
ARM: net: bpf: clarify tail_call index
Russell King <rmk+kernel(a)armlinux.org.uk>
ARM: net: bpf: fix LDX instructions
Russell King <rmk+kernel(a)armlinux.org.uk>
ARM: net: bpf: fix register saving
Russell King <rmk+kernel(a)armlinux.org.uk>
ARM: net: bpf: correct stack layout documentation
Russell King <rmk+kernel(a)armlinux.org.uk>
ARM: net: bpf: move stack documentation
Russell King <rmk+kernel(a)armlinux.org.uk>
ARM: net: bpf: fix stack alignment
Russell King <rmk+kernel(a)armlinux.org.uk>
ARM: net: bpf: fix tail call jumps
Russell King <rmk+kernel(a)armlinux.org.uk>
ARM: net: bpf: avoid 'bx' instruction on non-Thumb capable CPUs
Martin Brandenburg <martin(a)omnibond.com>
orangefs: fix deadlock; do not write i_size in read_iter
Christian Borntraeger <borntraeger(a)de.ibm.com>
KVM: s390: add proper locking for CMMA migration bitmap
Josef Bacik <jbacik(a)fb.com>
Btrfs: fix stale entries in readdir
Dmitry Torokhov <dmitry.torokhov(a)gmail.com>
Input: trackpoint - only expose supported controls for Elan, ALPS and NXP
Aaron Ma <aaron.ma(a)canonical.com>
Input: trackpoint - force 3 buttons if 0 button is reported
Mark Furneaux <mark(a)furneaux.ca>
Input: xpad - add support for PDP Xbox One controllers
Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
Revert "module: Add retpoline tag to VERMAGIC"
Steffen Klassert <steffen.klassert(a)secunet.com>
xfrm: Fix a race in the xdst pcpu cache.
Kevin Cernekee <cernekee(a)chromium.org>
netfilter: xt_osf: Add missing permission checks
Kevin Cernekee <cernekee(a)chromium.org>
netfilter: nfnetlink_cthelper: Add missing permission checks
Vlastimil Babka <vbabka(a)suse.cz>
mm, page_alloc: fix potential false positive in __zone_watermark_ok
Martin Brandenburg <martin(a)omnibond.com>
orangefs: initialize op on loop restart in orangefs_devreq_read
Martin Brandenburg <martin(a)omnibond.com>
orangefs: use list_for_each_entry_safe in purge_waiting_ops
-------------
Diffstat:
Makefile | 4 +-
arch/arm/net/bpf_jit_32.c | 225 ++++++++++---------
arch/arm64/net/bpf_jit_comp.c | 20 +-
arch/s390/kvm/kvm-s390.c | 18 +-
arch/x86/events/amd/power.c | 2 +-
arch/x86/kernel/cpu/microcode/core.c | 2 +-
arch/x86/kernel/cpu/microcode/intel.c | 20 +-
arch/x86/mm/tlb.c | 34 ++-
drivers/cpufreq/cpufreq_governor.c | 19 +-
drivers/gpu/drm/vc4/vc4_gem.c | 12 +-
drivers/infiniband/hw/mlx5/main.c | 9 +-
drivers/input/joystick/xpad.c | 19 ++
drivers/input/mouse/trackpoint.c | 245 +++++++++++++--------
drivers/input/mouse/trackpoint.h | 34 +--
drivers/net/ethernet/emulex/benet/be_main.c | 9 +
drivers/net/ethernet/mellanox/mlx5/core/en_rx_am.c | 6 +
.../net/ethernet/mellanox/mlx5/core/en_selftest.c | 27 ++-
drivers/net/ethernet/mellanox/mlx5/core/main.c | 3 +-
drivers/net/ethernet/mellanox/mlx5/core/vport.c | 22 +-
.../net/ethernet/mellanox/mlxsw/spectrum_router.c | 10 +-
.../net/ethernet/netronome/nfp/nfp_net_ethtool.c | 2 +-
drivers/net/ethernet/realtek/r8169.c | 9 +-
drivers/net/ppp/ppp_generic.c | 5 +-
drivers/net/ppp/pppoe.c | 11 +-
drivers/net/tun.c | 15 +-
drivers/net/usb/lan78xx.c | 1 +
drivers/net/vmxnet3/vmxnet3_drv.c | 2 +-
drivers/net/vrf.c | 5 +-
fs/btrfs/delayed-inode.c | 26 +--
fs/orangefs/devorangefs-req.c | 3 +-
fs/orangefs/file.c | 7 +-
fs/orangefs/orangefs-kernel.h | 11 -
fs/orangefs/waitqueue.c | 4 +-
include/linux/bpf.h | 21 +-
include/linux/mlx5/driver.h | 19 +-
include/linux/mlx5/mlx5_ifc.h | 5 +-
include/linux/vermagic.h | 8 +-
include/net/arp.h | 3 +
include/net/ipv6.h | 1 +
include/net/net_namespace.h | 10 +
include/net/tls.h | 2 +-
init/Kconfig | 7 +
kernel/bpf/core.c | 23 +-
kernel/bpf/verifier.c | 37 ++++
kernel/time/hrtimer.c | 3 +
lib/test_bpf.c | 11 +-
mm/page_alloc.c | 6 +-
net/core/dev.c | 19 +-
net/core/filter.c | 10 +-
net/core/flow_dissector.c | 3 +-
net/core/neighbour.c | 4 +-
net/core/sysctl_net_core.c | 6 +
net/dccp/ccids/ccid2.c | 3 +
net/ipv4/arp.c | 7 +-
net/ipv4/esp4_offload.c | 3 +
net/ipv4/igmp.c | 2 +-
net/ipv4/route.c | 1 +
net/ipv4/tcp.c | 3 +
net/ipv4/tcp_offload.c | 3 +
net/ipv4/tcp_timer.c | 15 ++
net/ipv4/udp_offload.c | 3 +
net/ipv6/esp6_offload.c | 3 +
net/ipv6/ip6_gre.c | 14 +-
net/ipv6/ip6_output.c | 9 +-
net/ipv6/ipv6_sockglue.c | 2 +-
net/ipv6/tcpv6_offload.c | 3 +
net/ipv6/udp_offload.c | 3 +
net/netfilter/nfnetlink_cthelper.c | 10 +
net/netfilter/xt_osf.c | 7 +
net/netlink/af_netlink.c | 3 +-
net/sctp/offload.c | 3 +
net/sctp/socket.c | 40 ++--
net/socket.c | 9 +
net/tipc/node.c | 26 ++-
net/tls/tls_main.c | 17 +-
net/tls/tls_sw.c | 16 +-
net/xfrm/xfrm_policy.c | 8 +-
tools/testing/selftests/bpf/test_verifier.c | 29 ++-
78 files changed, 843 insertions(+), 438 deletions(-)
From: Hans Verkuil <hans.verkuil(a)cisco.com>
Some ioctls need to copy back the result even if the ioctl returned
an error. However, don't do this for the error code -ENOTTY.
It makes no sense in that case.
Signed-off-by: Hans Verkuil <hans.verkuil(a)cisco.com>
Acked-by: Sakari Ailus <sakari.ailus(a)linux.intel.com>
Cc: <stable(a)vger.kernel.org> # for v4.15 and up
---
drivers/media/v4l2-core/v4l2-compat-ioctl32.c | 3 +++
1 file changed, 3 insertions(+)
diff --git a/drivers/media/v4l2-core/v4l2-compat-ioctl32.c b/drivers/media/v4l2-core/v4l2-compat-ioctl32.c
index 7ee3777cbe9c..3a1fca1440ac 100644
--- a/drivers/media/v4l2-core/v4l2-compat-ioctl32.c
+++ b/drivers/media/v4l2-core/v4l2-compat-ioctl32.c
@@ -968,6 +968,9 @@ static long do_video_ioctl(struct file *file, unsigned int cmd, unsigned long ar
set_fs(old_fs);
}
+ if (err == -ENOTTY)
+ return err;
+
/* Special case: even after an error we need to put the
results back for these ioctls since the error_idx will
contain information on which control failed. */
--
2.15.1
From: Hans Verkuil <hans.verkuil(a)cisco.com>
ctrl_is_pointer just hardcoded two known string controls, but that
caused problems when using e.g. custom controls that use a pointer
for the payload.
Reimplement this function: it now finds the v4l2_ctrl (if the driver
uses the control framework) or it calls vidioc_query_ext_ctrl (if the
driver implements that directly).
In both cases it can now check if the control is a pointer control
or not.
Signed-off-by: Hans Verkuil <hans.verkuil(a)cisco.com>
Cc: <stable(a)vger.kernel.org> # for v4.15 and up
---
drivers/media/v4l2-core/v4l2-compat-ioctl32.c | 57 ++++++++++++++++++---------
1 file changed, 38 insertions(+), 19 deletions(-)
diff --git a/drivers/media/v4l2-core/v4l2-compat-ioctl32.c b/drivers/media/v4l2-core/v4l2-compat-ioctl32.c
index 7dff9b4aeb19..30c5be1f0549 100644
--- a/drivers/media/v4l2-core/v4l2-compat-ioctl32.c
+++ b/drivers/media/v4l2-core/v4l2-compat-ioctl32.c
@@ -18,6 +18,8 @@
#include <linux/videodev2.h>
#include <linux/v4l2-subdev.h>
#include <media/v4l2-dev.h>
+#include <media/v4l2-fh.h>
+#include <media/v4l2-ctrls.h>
#include <media/v4l2-ioctl.h>
static long native_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
@@ -601,24 +603,39 @@ struct v4l2_ext_control32 {
};
} __attribute__ ((packed));
-/* The following function really belong in v4l2-common, but that causes
- a circular dependency between modules. We need to think about this, but
- for now this will do. */
-
-/* Return non-zero if this control is a pointer type. Currently only
- type STRING is a pointer type. */
-static inline int ctrl_is_pointer(u32 id)
+/* Return true if this control is a pointer type. */
+static inline bool ctrl_is_pointer(struct file *file, u32 id)
{
- switch (id) {
- case V4L2_CID_RDS_TX_PS_NAME:
- case V4L2_CID_RDS_TX_RADIO_TEXT:
- return 1;
- default:
- return 0;
+ struct video_device *vdev = video_devdata(file);
+ struct v4l2_fh *fh = NULL;
+ struct v4l2_ctrl_handler *hdl = NULL;
+ struct v4l2_query_ext_ctrl qec = { id };
+ const struct v4l2_ioctl_ops *ops = vdev->ioctl_ops;
+
+ if (test_bit(V4L2_FL_USES_V4L2_FH, &vdev->flags))
+ fh = file->private_data;
+
+ if (fh && fh->ctrl_handler)
+ hdl = fh->ctrl_handler;
+ else if (vdev->ctrl_handler)
+ hdl = vdev->ctrl_handler;
+
+ if (hdl) {
+ struct v4l2_ctrl *ctrl = v4l2_ctrl_find(hdl, id);
+
+ return ctrl && ctrl->is_ptr;
}
+
+ if (!ops->vidioc_query_ext_ctrl)
+ return false;
+
+ return !ops->vidioc_query_ext_ctrl(file, fh, &qec) &&
+ (qec.flags & V4L2_CTRL_FLAG_HAS_PAYLOAD);
}
-static int get_v4l2_ext_controls32(struct v4l2_ext_controls *kp, struct v4l2_ext_controls32 __user *up)
+static int get_v4l2_ext_controls32(struct file *file,
+ struct v4l2_ext_controls *kp,
+ struct v4l2_ext_controls32 __user *up)
{
struct v4l2_ext_control32 __user *ucontrols;
struct v4l2_ext_control __user *kcontrols;
@@ -651,7 +668,7 @@ static int get_v4l2_ext_controls32(struct v4l2_ext_controls *kp, struct v4l2_ext
return -EFAULT;
if (get_user(id, &kcontrols->id))
return -EFAULT;
- if (ctrl_is_pointer(id)) {
+ if (ctrl_is_pointer(file, id)) {
void __user *s;
if (get_user(p, &ucontrols->string))
@@ -666,7 +683,9 @@ static int get_v4l2_ext_controls32(struct v4l2_ext_controls *kp, struct v4l2_ext
return 0;
}
-static int put_v4l2_ext_controls32(struct v4l2_ext_controls *kp, struct v4l2_ext_controls32 __user *up)
+static int put_v4l2_ext_controls32(struct file *file,
+ struct v4l2_ext_controls *kp,
+ struct v4l2_ext_controls32 __user *up)
{
struct v4l2_ext_control32 __user *ucontrols;
struct v4l2_ext_control __user *kcontrols =
@@ -698,7 +717,7 @@ static int put_v4l2_ext_controls32(struct v4l2_ext_controls *kp, struct v4l2_ext
/* Do not modify the pointer when copying a pointer control.
The contents of the pointer was changed, not the pointer
itself. */
- if (ctrl_is_pointer(id))
+ if (ctrl_is_pointer(file, id))
size -= sizeof(ucontrols->value64);
if (copy_in_user(ucontrols, kcontrols, size))
return -EFAULT;
@@ -912,7 +931,7 @@ static long do_video_ioctl(struct file *file, unsigned int cmd, unsigned long ar
case VIDIOC_G_EXT_CTRLS:
case VIDIOC_S_EXT_CTRLS:
case VIDIOC_TRY_EXT_CTRLS:
- err = get_v4l2_ext_controls32(&karg.v2ecs, up);
+ err = get_v4l2_ext_controls32(file, &karg.v2ecs, up);
compatible_arg = 0;
break;
case VIDIOC_DQEVENT:
@@ -939,7 +958,7 @@ static long do_video_ioctl(struct file *file, unsigned int cmd, unsigned long ar
case VIDIOC_G_EXT_CTRLS:
case VIDIOC_S_EXT_CTRLS:
case VIDIOC_TRY_EXT_CTRLS:
- if (put_v4l2_ext_controls32(&karg.v2ecs, up))
+ if (put_v4l2_ext_controls32(file, &karg.v2ecs, up))
err = -EFAULT;
break;
case VIDIOC_S_EDID:
--
2.15.1
Currently we see sporadic timeouts during CDCLK changes on both BXT and
GLK, as reported in the Bugzilla ticket referenced below. It's easy to reproduce this by
changing the frequency in a tight loop after blanking the display. The
upper bound for the completion time is 800us based on my tests, so
increase it from the current 500us to 2ms; with that I couldn't trigger
the problem either on BXT or GLK.
Note that timeouts happened during both the change notification and the
voltage level setting PCODE request. (For the latter one BSpec doesn't
require us to wait for completion before further HW programming.)
This issue is similar to
2c7d0602c815 ("drm/i915/gen9: Fix PCODE polling during CDCLK change
notification")
but there the PCODE request does complete (as shown by the mbox
busy flag), only the reply we get from PCODE indicates a failure.
So there we keep resending the request until a success reply, here we
just have to increase the timeout for the one PCODE request we send.
Cc: Chris Wilson <chris(a)chris-wilson.co.uk>
Cc: Ville Syrjälä <ville.syrjala(a)linux.intel.com>
Cc: stable(a)vger.kernel.org # v4.4+
Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=103326
Signed-off-by: Imre Deak <imre.deak(a)intel.com>
---
drivers/gpu/drm/i915/i915_drv.h | 6 +++++-
drivers/gpu/drm/i915/intel_cdclk.c | 20 +++++++++++++++-----
drivers/gpu/drm/i915/intel_pm.c | 6 +++---
3 files changed, 23 insertions(+), 9 deletions(-)
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 454d8f937fae..5e293be4e51d 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -3723,7 +3723,11 @@ extern void intel_display_print_error_state(struct drm_i915_error_state_buf *e,
struct intel_display_error_state *error);
int sandybridge_pcode_read(struct drm_i915_private *dev_priv, u32 mbox, u32 *val);
-int sandybridge_pcode_write(struct drm_i915_private *dev_priv, u32 mbox, u32 val);
+int snb_pcode_request(struct drm_i915_private *dev_priv, u32 mbox, u32 val,
+ int timeout_us);
+#define sandybridge_pcode_write(dev_priv, mbox, val) \
+ snb_pcode_request(dev_priv, mbox, val, 500)
+
int skl_pcode_request(struct drm_i915_private *dev_priv, u32 mbox, u32 request,
u32 reply_mask, u32 reply, int timeout_base_ms);
diff --git a/drivers/gpu/drm/i915/intel_cdclk.c b/drivers/gpu/drm/i915/intel_cdclk.c
index c4392ea34a3d..5057336c40ba 100644
--- a/drivers/gpu/drm/i915/intel_cdclk.c
+++ b/drivers/gpu/drm/i915/intel_cdclk.c
@@ -1370,10 +1370,14 @@ static void bxt_set_cdclk(struct drm_i915_private *dev_priv,
break;
}
- /* Inform power controller of upcoming frequency change */
+ /*
+ * Inform power controller of upcoming frequency change. BSpec
+ * requires us to wait up to 150usec, but that leads to timeouts;
+ * the 2ms used here is based on experiment.
+ */
mutex_lock(&dev_priv->pcu_lock);
- ret = sandybridge_pcode_write(dev_priv, HSW_PCODE_DE_WRITE_FREQ_REQ,
- 0x80000000);
+ ret = snb_pcode_request(dev_priv, HSW_PCODE_DE_WRITE_FREQ_REQ,
+ 0x80000000, 2000);
mutex_unlock(&dev_priv->pcu_lock);
if (ret) {
@@ -1404,8 +1408,14 @@ static void bxt_set_cdclk(struct drm_i915_private *dev_priv,
I915_WRITE(CDCLK_CTL, val);
mutex_lock(&dev_priv->pcu_lock);
- ret = sandybridge_pcode_write(dev_priv, HSW_PCODE_DE_WRITE_FREQ_REQ,
- cdclk_state->voltage_level);
+ /*
+ * The timeout isn't specified, the 2ms used here is based on
+ * experiment.
+ * FIXME: Waiting for the request completion could be delayed until
+ * the next PCODE request based on BSpec.
+ */
+ ret = snb_pcode_request(dev_priv, HSW_PCODE_DE_WRITE_FREQ_REQ,
+ cdclk_state->voltage_level, 2000);
mutex_unlock(&dev_priv->pcu_lock);
if (ret) {
diff --git a/drivers/gpu/drm/i915/intel_pm.c b/drivers/gpu/drm/i915/intel_pm.c
index 0b92ea1dbd40..f6f4dbacb9af 100644
--- a/drivers/gpu/drm/i915/intel_pm.c
+++ b/drivers/gpu/drm/i915/intel_pm.c
@@ -9169,8 +9169,8 @@ int sandybridge_pcode_read(struct drm_i915_private *dev_priv, u32 mbox, u32 *val
return 0;
}
-int sandybridge_pcode_write(struct drm_i915_private *dev_priv,
- u32 mbox, u32 val)
+int snb_pcode_request(struct drm_i915_private *dev_priv,
+ u32 mbox, u32 val, int timeout_us)
{
int status;
@@ -9193,7 +9193,7 @@ int sandybridge_pcode_write(struct drm_i915_private *dev_priv,
if (__intel_wait_for_register_fw(dev_priv,
GEN6_PCODE_MAILBOX, GEN6_PCODE_READY, 0,
- 500, 0, NULL)) {
+ timeout_us, 0, NULL)) {
DRM_ERROR("timeout waiting for pcode write of 0x%08x to mbox %x to finish for %ps\n",
val, mbox, __builtin_return_address(0));
return -ETIMEDOUT;
--
2.13.2
This is an automatically generated email to let you know that the following patch was queued:
Subject: media: v4l2-compat-ioctl32.c: don't copy back the result for certain errors
Author: Hans Verkuil <hans.verkuil(a)cisco.com>
Date: Fri Jan 26 03:24:53 2018 -0500
Some ioctls need to copy back the result even if the ioctl returned
an error. However, don't do this for the error code -ENOTTY.
It makes no sense in that case.
Signed-off-by: Hans Verkuil <hans.verkuil(a)cisco.com>
Acked-by: Sakari Ailus <sakari.ailus(a)linux.intel.com>
Cc: <stable(a)vger.kernel.org> # for v4.15 and up
Signed-off-by: Mauro Carvalho Chehab <mchehab(a)s-opensource.com>
drivers/media/v4l2-core/v4l2-compat-ioctl32.c | 3 +++
1 file changed, 3 insertions(+)
---
diff --git a/drivers/media/v4l2-core/v4l2-compat-ioctl32.c b/drivers/media/v4l2-core/v4l2-compat-ioctl32.c
index 7ee3777cbe9c..3a1fca1440ac 100644
--- a/drivers/media/v4l2-core/v4l2-compat-ioctl32.c
+++ b/drivers/media/v4l2-core/v4l2-compat-ioctl32.c
@@ -968,6 +968,9 @@ static long do_video_ioctl(struct file *file, unsigned int cmd, unsigned long ar
set_fs(old_fs);
}
+ if (err == -ENOTTY)
+ return err;
+
/* Special case: even after an error we need to put the
results back for these ioctls since the error_idx will
contain information on which control failed. */
This is an automatically generated email to let you know that the following patch was queued:
Subject: media: v4l2-compat-ioctl32.c: drop pr_info for unknown buffer type
Author: Hans Verkuil <hans.verkuil(a)cisco.com>
Date: Wed Jan 24 09:33:57 2018 -0500
There is nothing wrong with using an unknown buffer type. So
stop spamming the kernel log whenever this happens. The kernel
will just return -EINVAL to signal this.
Signed-off-by: Hans Verkuil <hans.verkuil(a)cisco.com>
Acked-by: Sakari Ailus <sakari.ailus(a)linux.intel.com>
Cc: <stable(a)vger.kernel.org> # for v4.15 and up
Signed-off-by: Mauro Carvalho Chehab <mchehab(a)s-opensource.com>
drivers/media/v4l2-core/v4l2-compat-ioctl32.c | 4 ----
1 file changed, 4 deletions(-)
---
diff --git a/drivers/media/v4l2-core/v4l2-compat-ioctl32.c b/drivers/media/v4l2-core/v4l2-compat-ioctl32.c
index 0df941ca4d90..7ee3777cbe9c 100644
--- a/drivers/media/v4l2-core/v4l2-compat-ioctl32.c
+++ b/drivers/media/v4l2-core/v4l2-compat-ioctl32.c
@@ -179,8 +179,6 @@ static int __get_v4l2_format32(struct v4l2_format *kp, struct v4l2_format32 __us
return copy_from_user(&kp->fmt.meta, &up->fmt.meta,
sizeof(kp->fmt.meta)) ? -EFAULT : 0;
default:
- pr_info("compat_ioctl32: unexpected VIDIOC_FMT type %d\n",
- kp->type);
return -EINVAL;
}
}
@@ -233,8 +231,6 @@ static int __put_v4l2_format32(struct v4l2_format *kp, struct v4l2_format32 __us
return copy_to_user(&up->fmt.meta, &kp->fmt.meta,
sizeof(kp->fmt.meta)) ? -EFAULT : 0;
default:
- pr_info("compat_ioctl32: unexpected VIDIOC_FMT type %d\n",
- kp->type);
return -EINVAL;
}
}
This is an automatically generated email to let you know that the following patch was queued:
Subject: media: v4l2-compat-ioctl32.c: fix ctrl_is_pointer
Author: Hans Verkuil <hans.verkuil(a)cisco.com>
Date: Wed Jan 24 05:30:59 2018 -0500
ctrl_is_pointer just hardcoded two known string controls, but that
caused problems when using e.g. custom controls that use a pointer
for the payload.
Reimplement this function: it now finds the v4l2_ctrl (if the driver
uses the control framework) or it calls vidioc_query_ext_ctrl (if the
driver implements that directly).
In both cases it can now check if the control is a pointer control
or not.
Signed-off-by: Hans Verkuil <hans.verkuil(a)cisco.com>
Acked-by: Sakari Ailus <sakari.ailus(a)linux.intel.com>
Cc: <stable(a)vger.kernel.org> # for v4.15 and up
Signed-off-by: Mauro Carvalho Chehab <mchehab(a)s-opensource.com>
drivers/media/v4l2-core/v4l2-compat-ioctl32.c | 57 ++++++++++++++++++---------
1 file changed, 38 insertions(+), 19 deletions(-)
---
diff --git a/drivers/media/v4l2-core/v4l2-compat-ioctl32.c b/drivers/media/v4l2-core/v4l2-compat-ioctl32.c
index 7dff9b4aeb19..30c5be1f0549 100644
--- a/drivers/media/v4l2-core/v4l2-compat-ioctl32.c
+++ b/drivers/media/v4l2-core/v4l2-compat-ioctl32.c
@@ -18,6 +18,8 @@
#include <linux/videodev2.h>
#include <linux/v4l2-subdev.h>
#include <media/v4l2-dev.h>
+#include <media/v4l2-fh.h>
+#include <media/v4l2-ctrls.h>
#include <media/v4l2-ioctl.h>
static long native_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
@@ -601,24 +603,39 @@ struct v4l2_ext_control32 {
};
} __attribute__ ((packed));
-/* The following function really belong in v4l2-common, but that causes
- a circular dependency between modules. We need to think about this, but
- for now this will do. */
-
-/* Return non-zero if this control is a pointer type. Currently only
- type STRING is a pointer type. */
-static inline int ctrl_is_pointer(u32 id)
+/* Return true if this control is a pointer type. */
+static inline bool ctrl_is_pointer(struct file *file, u32 id)
{
- switch (id) {
- case V4L2_CID_RDS_TX_PS_NAME:
- case V4L2_CID_RDS_TX_RADIO_TEXT:
- return 1;
- default:
- return 0;
+ struct video_device *vdev = video_devdata(file);
+ struct v4l2_fh *fh = NULL;
+ struct v4l2_ctrl_handler *hdl = NULL;
+ struct v4l2_query_ext_ctrl qec = { id };
+ const struct v4l2_ioctl_ops *ops = vdev->ioctl_ops;
+
+ if (test_bit(V4L2_FL_USES_V4L2_FH, &vdev->flags))
+ fh = file->private_data;
+
+ if (fh && fh->ctrl_handler)
+ hdl = fh->ctrl_handler;
+ else if (vdev->ctrl_handler)
+ hdl = vdev->ctrl_handler;
+
+ if (hdl) {
+ struct v4l2_ctrl *ctrl = v4l2_ctrl_find(hdl, id);
+
+ return ctrl && ctrl->is_ptr;
}
+
+ if (!ops->vidioc_query_ext_ctrl)
+ return false;
+
+ return !ops->vidioc_query_ext_ctrl(file, fh, &qec) &&
+ (qec.flags & V4L2_CTRL_FLAG_HAS_PAYLOAD);
}
-static int get_v4l2_ext_controls32(struct v4l2_ext_controls *kp, struct v4l2_ext_controls32 __user *up)
+static int get_v4l2_ext_controls32(struct file *file,
+ struct v4l2_ext_controls *kp,
+ struct v4l2_ext_controls32 __user *up)
{
struct v4l2_ext_control32 __user *ucontrols;
struct v4l2_ext_control __user *kcontrols;
@@ -651,7 +668,7 @@ static int get_v4l2_ext_controls32(struct v4l2_ext_controls *kp, struct v4l2_ext
return -EFAULT;
if (get_user(id, &kcontrols->id))
return -EFAULT;
- if (ctrl_is_pointer(id)) {
+ if (ctrl_is_pointer(file, id)) {
void __user *s;
if (get_user(p, &ucontrols->string))
@@ -666,7 +683,9 @@ static int get_v4l2_ext_controls32(struct v4l2_ext_controls *kp, struct v4l2_ext
return 0;
}
-static int put_v4l2_ext_controls32(struct v4l2_ext_controls *kp, struct v4l2_ext_controls32 __user *up)
+static int put_v4l2_ext_controls32(struct file *file,
+ struct v4l2_ext_controls *kp,
+ struct v4l2_ext_controls32 __user *up)
{
struct v4l2_ext_control32 __user *ucontrols;
struct v4l2_ext_control __user *kcontrols =
@@ -698,7 +717,7 @@ static int put_v4l2_ext_controls32(struct v4l2_ext_controls *kp, struct v4l2_ext
/* Do not modify the pointer when copying a pointer control.
The contents of the pointer was changed, not the pointer
itself. */
- if (ctrl_is_pointer(id))
+ if (ctrl_is_pointer(file, id))
size -= sizeof(ucontrols->value64);
if (copy_in_user(ucontrols, kcontrols, size))
return -EFAULT;
@@ -912,7 +931,7 @@ static long do_video_ioctl(struct file *file, unsigned int cmd, unsigned long ar
case VIDIOC_G_EXT_CTRLS:
case VIDIOC_S_EXT_CTRLS:
case VIDIOC_TRY_EXT_CTRLS:
- err = get_v4l2_ext_controls32(&karg.v2ecs, up);
+ err = get_v4l2_ext_controls32(file, &karg.v2ecs, up);
compatible_arg = 0;
break;
case VIDIOC_DQEVENT:
@@ -939,7 +958,7 @@ static long do_video_ioctl(struct file *file, unsigned int cmd, unsigned long ar
case VIDIOC_G_EXT_CTRLS:
case VIDIOC_S_EXT_CTRLS:
case VIDIOC_TRY_EXT_CTRLS:
- if (put_v4l2_ext_controls32(&karg.v2ecs, up))
+ if (put_v4l2_ext_controls32(file, &karg.v2ecs, up))
err = -EFAULT;
break;
case VIDIOC_S_EDID:
This is an automatically generated email to let you know that the following patch was queued:
Subject: media: v4l2-compat-ioctl32.c: add missing VIDIOC_PREPARE_BUF
Author: Hans Verkuil <hans.verkuil(a)cisco.com>
Date: Wed Jan 24 08:37:04 2018 -0500
The result of the VIDIOC_PREPARE_BUF ioctl was never copied back
to userspace since it was missing in the switch.
Signed-off-by: Hans Verkuil <hans.verkuil(a)cisco.com>
Acked-by: Sakari Ailus <sakari.ailus(a)linux.intel.com>
Cc: <stable(a)vger.kernel.org> # for v4.15 and up
Signed-off-by: Mauro Carvalho Chehab <mchehab(a)s-opensource.com>
drivers/media/v4l2-core/v4l2-compat-ioctl32.c | 1 +
1 file changed, 1 insertion(+)
---
diff --git a/drivers/media/v4l2-core/v4l2-compat-ioctl32.c b/drivers/media/v4l2-core/v4l2-compat-ioctl32.c
index e48d59046086..76ed43e774dd 100644
--- a/drivers/media/v4l2-core/v4l2-compat-ioctl32.c
+++ b/drivers/media/v4l2-core/v4l2-compat-ioctl32.c
@@ -1052,6 +1052,7 @@ static long do_video_ioctl(struct file *file, unsigned int cmd, unsigned long ar
err = put_v4l2_create32(&karg.v2crt, up);
break;
+ case VIDIOC_PREPARE_BUF:
case VIDIOC_QUERYBUF:
case VIDIOC_QBUF:
case VIDIOC_DQBUF:
This is an automatically generated email to let you know that the following patch was queued:
Subject: media: v4l2-ioctl.c: don't copy back the result for -ENOTTY
Author: Hans Verkuil <hans.verkuil(a)cisco.com>
Date: Tue Jan 30 03:50:01 2018 -0500
If the ioctl returned -ENOTTY, then don't bother copying
back the result as there is no point.
Signed-off-by: Hans Verkuil <hans.verkuil(a)cisco.com>
Acked-by: Sakari Ailus <sakari.ailus(a)linux.intel.com>
Cc: <stable(a)vger.kernel.org> # for v4.15 and up
Signed-off-by: Mauro Carvalho Chehab <mchehab(a)s-opensource.com>
drivers/media/v4l2-core/v4l2-ioctl.c | 5 ++++-
1 file changed, 4 insertions(+), 1 deletion(-)
---
diff --git a/drivers/media/v4l2-core/v4l2-ioctl.c b/drivers/media/v4l2-core/v4l2-ioctl.c
index c7f6b65d3ad7..260288ca4f55 100644
--- a/drivers/media/v4l2-core/v4l2-ioctl.c
+++ b/drivers/media/v4l2-core/v4l2-ioctl.c
@@ -2900,8 +2900,11 @@ video_usercopy(struct file *file, unsigned int cmd, unsigned long arg,
/* Handles IOCTL */
err = func(file, cmd, parg);
- if (err == -ENOIOCTLCMD)
+ if (err == -ENOTTY || err == -ENOIOCTLCMD) {
err = -ENOTTY;
+ goto out;
+ }
+
if (err == 0) {
if (cmd == VIDIOC_DQBUF)
trace_v4l2_dqbuf(video_devdata(file)->minor, parg);
From: Hans Verkuil <hans.verkuil(a)cisco.com>
If the ioctl returned -ENOTTY, then don't bother copying
back the result as there is no point.
Signed-off-by: Hans Verkuil <hans.verkuil(a)cisco.com>
Cc: <stable(a)vger.kernel.org> # for v4.15 and up
---
drivers/media/v4l2-core/v4l2-ioctl.c | 5 ++++-
1 file changed, 4 insertions(+), 1 deletion(-)
diff --git a/drivers/media/v4l2-core/v4l2-ioctl.c b/drivers/media/v4l2-core/v4l2-ioctl.c
index c7f6b65d3ad7..260288ca4f55 100644
--- a/drivers/media/v4l2-core/v4l2-ioctl.c
+++ b/drivers/media/v4l2-core/v4l2-ioctl.c
@@ -2900,8 +2900,11 @@ video_usercopy(struct file *file, unsigned int cmd, unsigned long arg,
/* Handles IOCTL */
err = func(file, cmd, parg);
- if (err == -ENOIOCTLCMD)
+ if (err == -ENOTTY || err == -ENOIOCTLCMD) {
err = -ENOTTY;
+ goto out;
+ }
+
if (err == 0) {
if (cmd == VIDIOC_DQBUF)
trace_v4l2_dqbuf(video_devdata(file)->minor, parg);
--
2.15.1
From: Hans Verkuil <hans.verkuil(a)cisco.com>
The result of the VIDIOC_PREPARE_BUF ioctl was never copied back
to userspace since it was missing in the switch.
Signed-off-by: Hans Verkuil <hans.verkuil(a)cisco.com>
Acked-by: Sakari Ailus <sakari.ailus(a)linux.intel.com>
Cc: <stable(a)vger.kernel.org> # for v4.15 and up
---
drivers/media/v4l2-core/v4l2-compat-ioctl32.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/drivers/media/v4l2-core/v4l2-compat-ioctl32.c b/drivers/media/v4l2-core/v4l2-compat-ioctl32.c
index e48d59046086..76ed43e774dd 100644
--- a/drivers/media/v4l2-core/v4l2-compat-ioctl32.c
+++ b/drivers/media/v4l2-core/v4l2-compat-ioctl32.c
@@ -1052,6 +1052,7 @@ static long do_video_ioctl(struct file *file, unsigned int cmd, unsigned long ar
err = put_v4l2_create32(&karg.v2crt, up);
break;
+ case VIDIOC_PREPARE_BUF:
case VIDIOC_QUERYBUF:
case VIDIOC_QBUF:
case VIDIOC_DQBUF:
--
2.15.1
strscpy() performs word-at-a-time optimistic reads, so it may
access memory past the end of the object, which is perfectly fine
since strscpy() doesn't use that (past-the-end) data and makes sure the
optimistic read won't cross a page boundary.
But KASAN doesn't know anything about that so it will complain.
There are several possible ways to address this issue, but none
are perfect. See https://lkml.kernel.org/r/9f0a9cf6-51f7-cd1f-5dc6-6d510a7b8ec4@virtuozzo.com
It seems the best solution is to simply disable word-at-a-time
optimization. My trivial testing shows that byte-at-a-time
could be up to 4.3 times slower than word-at-a-time.
It may seem like a lot, but it's actually ~1.2e-10 sec per symbol vs
~4.8e-10 sec per symbol on modern hardware. And we don't use strscpy()
in performance-critical paths to copy large amounts of data,
so it shouldn't matter anyway.
Fixes: 30035e45753b7 ("string: provide strscpy()")
Signed-off-by: Andrey Ryabinin <aryabinin(a)virtuozzo.com>
Cc: <stable(a)vger.kernel.org>
---
lib/string.c | 38 --------------------------------------
1 file changed, 38 deletions(-)
diff --git a/lib/string.c b/lib/string.c
index 64a9e33f1daa..6205dd71aa0f 100644
--- a/lib/string.c
+++ b/lib/string.c
@@ -29,7 +29,6 @@
#include <linux/errno.h>
#include <asm/byteorder.h>
-#include <asm/word-at-a-time.h>
#include <asm/page.h>
#ifndef __HAVE_ARCH_STRNCASECMP
@@ -177,45 +176,8 @@ EXPORT_SYMBOL(strlcpy);
*/
ssize_t strscpy(char *dest, const char *src, size_t count)
{
- const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS;
- size_t max = count;
long res = 0;
- if (count == 0)
- return -E2BIG;
-
-#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
- /*
- * If src is unaligned, don't cross a page boundary,
- * since we don't know if the next page is mapped.
- */
- if ((long)src & (sizeof(long) - 1)) {
- size_t limit = PAGE_SIZE - ((long)src & (PAGE_SIZE - 1));
- if (limit < max)
- max = limit;
- }
-#else
- /* If src or dest is unaligned, don't do word-at-a-time. */
- if (((long) dest | (long) src) & (sizeof(long) - 1))
- max = 0;
-#endif
-
- while (max >= sizeof(unsigned long)) {
- unsigned long c, data;
-
- c = *(unsigned long *)(src+res);
- if (has_zero(c, &data, &constants)) {
- data = prep_zero_mask(c, data, &constants);
- data = create_zero_mask(data);
- *(unsigned long *)(dest+res) = c & zero_bytemask(data);
- return res + find_zero(data);
- }
- *(unsigned long *)(dest+res) = c;
- res += sizeof(unsigned long);
- count -= sizeof(unsigned long);
- max -= sizeof(unsigned long);
- }
-
while (count) {
char c;
--
2.13.6
On Tue, Jan 30, 2018 at 05:09:07AM +0000, Harsh Shandilya wrote:
> On Tue 30 Jan, 2018, 2:20 AM Greg Kroah-Hartman, <gregkh(a)linuxfoundation.org>
> wrote:
>
> > This is the start of the stable review cycle for the 3.18.93 release.
> > There are 52 patches in this series, all will be posted as a response
> > to this one. If anyone has any issues with these being applied, please
> > let me know.
> >
> > Responses should be made by Wed Jan 31 12:36:07 UTC 2018.
> > Anything received after that time might be too late.
> >
> > The whole patch series can be found in one patch at:
> >
> > kernel.org/pub/linux/kernel/v3.x/stable-review/patch-3.18.93-rc1.gz
> > or in the git tree and branch at:
> > git://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git
> > linux-3.18.y
> > and the diffstat can be found below.
> >
> > thanks,
> >
> > greg k-h
> >
>
> Builds and boots on the OnePlus 3, no dmesg or userspace regressions.
Yeah, it's still working! :)
thanks for testing and letting me know.
greg k-h
The patch titled
Subject: kernel/relay.c: revert "kernel/relay.c: fix potential memory leak"
has been added to the -mm tree. Its filename is
revert-kernel-relayc-fix-potential-memory-leak.patch
This patch should soon appear at
http://ozlabs.org/~akpm/mmots/broken-out/revert-kernel-relayc-fix-potential…
and later at
http://ozlabs.org/~akpm/mmotm/broken-out/revert-kernel-relayc-fix-potential…
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/SubmitChecklist when testing your code ***
The -mm tree is included into linux-next and is updated
there every 3-4 working days
------------------------------------------------------
From: Eric Biggers <ebiggers(a)google.com>
Subject: kernel/relay.c: revert "kernel/relay.c: fix potential memory leak"
This reverts ba62bafe942b159a6 ("kernel/relay.c: fix potential memory leak").
This commit introduced a double free bug, because 'chan' is already
freed by the line:
kref_put(&chan->kref, relay_destroy_channel);
This bug was found by syzkaller, using the BLKTRACESETUP ioctl.
Link: http://lkml.kernel.org/r/20180127004759.101823-1-ebiggers3@gmail.com
Fixes: ba62bafe942b ("kernel/relay.c: fix potential memory leak")
Signed-off-by: Eric Biggers <ebiggers(a)google.com>
Reported-by: syzbot <syzkaller(a)googlegroups.com>
Reviewed-by: Andrew Morton <akpm(a)linux-foundation.org>
Cc: Zhouyi Zhou <yizhouzhou(a)ict.ac.cn>
Cc: Jens Axboe <axboe(a)kernel.dk>
Cc: <stable(a)vger.kernel.org> [4.7+]
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
kernel/relay.c | 1 -
1 file changed, 1 deletion(-)
diff -puN kernel/relay.c~revert-kernel-relayc-fix-potential-memory-leak kernel/relay.c
--- a/kernel/relay.c~revert-kernel-relayc-fix-potential-memory-leak
+++ a/kernel/relay.c
@@ -611,7 +611,6 @@ free_bufs:
kref_put(&chan->kref, relay_destroy_channel);
mutex_unlock(&relay_channels_mutex);
- kfree(chan);
return NULL;
}
EXPORT_SYMBOL_GPL(relay_open);
_
Patches currently in -mm which might be from ebiggers(a)google.com are
userfaultfd-convert-to-use-anon_inode_getfd.patch
revert-kernel-relayc-fix-potential-memory-leak.patch
pipe-sysctl-drop-min-parameter-from-pipe-max-size-converter.patch
pipe-sysctl-remove-pipe_proc_fn.patch
pipe-actually-allow-root-to-exceed-the-pipe-buffer-limits.patch
pipe-fix-off-by-one-error-when-checking-buffer-limits.patch
pipe-reject-f_setpipe_sz-with-size-over-uint_max.patch
pipe-simplify-round_pipe_size.patch
pipe-read-buffer-limits-atomically.patch
The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 0fd189a95fdbc631737df5f27a0fc0a3dd31b75e Mon Sep 17 00:00:00 2001
From: Lyude Paul <lyude(a)redhat.com>
Date: Thu, 25 Jan 2018 18:29:53 -0500
Subject: [PATCH] drm/nouveau: Move irq setup/teardown to pci ctor/dtor
For a while we've been having issues with seemingly random interrupts
coming from nvidia cards when resuming them. Originally the fix for this
was thought to be just re-arming the MSI interrupt registers right after
re-allocating our IRQs, however it seems a lot of what we do is both
wrong and not even necessary.
This was made apparent by what appeared to be a regression in the
mainline kernel that started introducing suspend/resume issues for
nouveau:
a0c9259dc4e1 (irq/matrix: Spread interrupts on allocation)
After this commit was introduced, we started getting interrupts from the
GPU before we actually re-allocated our own IRQ (see references below)
and assigned the IRQ handler. Investigating this, it turned out that the
problem was not with the commit, but with the fact that nouveau even
frees/allocates its IRQs before and after suspend/resume.
For starters: drivers in the linux kernel haven't had to handle
freeing/re-allocating their IRQs during suspend/resume cycles for quite
a while now. Nouveau seems to be one of the few drivers left that still
does this, despite the fact there's no reason we actually need to since
disabling interrupts from the device side should be enough, as the
kernel is already smart enough to know to disable host-side interrupts
for us before going into suspend. Since we were tearing down our IRQs by
hand however, that means there was a short period during resume where
interrupts could be received before we re-allocated our IRQ which would
lead to us getting an unhandled IRQ. Since we never handle said IRQ and
re-arm the interrupt registers, this would cause us to miss all of the
interrupts from the GPU and cause our init process to start timing out
on anything requiring interrupts.
So, since this whole setup/teardown every suspend/resume cycle is
useless anyway, move irq setup/teardown into the pci subdev's ctor/dtor
functions instead so they're only called at driver load and driver
unload. This should fix most of the issues with pending interrupts on
resume, along with getting suspend/resume for nouveau to work again.
As well, this probably means we can also just remove the msi rearm call
inside nvkm_pci_init(). But since our main focus here is to fix
suspend/resume before 4.15, we'll save that for a later patch.
Signed-off-by: Lyude Paul <lyude(a)redhat.com>
Cc: Karol Herbst <kherbst(a)redhat.com>
Cc: Thomas Gleixner <tglx(a)linutronix.de>
Cc: Mike Galbraith <efault(a)gmx.de>
Cc: stable(a)vger.kernel.org
Signed-off-by: Ben Skeggs <bskeggs(a)redhat.com>
diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/pci/base.c b/drivers/gpu/drm/nouveau/nvkm/subdev/pci/base.c
index deb96de54b00..ee2431a7804e 100644
--- a/drivers/gpu/drm/nouveau/nvkm/subdev/pci/base.c
+++ b/drivers/gpu/drm/nouveau/nvkm/subdev/pci/base.c
@@ -71,6 +71,10 @@ nvkm_pci_intr(int irq, void *arg)
struct nvkm_pci *pci = arg;
struct nvkm_device *device = pci->subdev.device;
bool handled = false;
+
+ if (pci->irq < 0)
+ return IRQ_HANDLED;
+
nvkm_mc_intr_unarm(device);
if (pci->msi)
pci->func->msi_rearm(pci);
@@ -84,11 +88,6 @@ nvkm_pci_fini(struct nvkm_subdev *subdev, bool suspend)
{
struct nvkm_pci *pci = nvkm_pci(subdev);
- if (pci->irq >= 0) {
- free_irq(pci->irq, pci);
- pci->irq = -1;
- }
-
if (pci->agp.bridge)
nvkm_agp_fini(pci);
@@ -108,8 +107,20 @@ static int
nvkm_pci_oneinit(struct nvkm_subdev *subdev)
{
struct nvkm_pci *pci = nvkm_pci(subdev);
- if (pci_is_pcie(pci->pdev))
- return nvkm_pcie_oneinit(pci);
+ struct pci_dev *pdev = pci->pdev;
+ int ret;
+
+ if (pci_is_pcie(pci->pdev)) {
+ ret = nvkm_pcie_oneinit(pci);
+ if (ret)
+ return ret;
+ }
+
+ ret = request_irq(pdev->irq, nvkm_pci_intr, IRQF_SHARED, "nvkm", pci);
+ if (ret)
+ return ret;
+
+ pci->irq = pdev->irq;
return 0;
}
@@ -117,7 +128,6 @@ static int
nvkm_pci_init(struct nvkm_subdev *subdev)
{
struct nvkm_pci *pci = nvkm_pci(subdev);
- struct pci_dev *pdev = pci->pdev;
int ret;
if (pci->agp.bridge) {
@@ -131,28 +141,34 @@ nvkm_pci_init(struct nvkm_subdev *subdev)
if (pci->func->init)
pci->func->init(pci);
- ret = request_irq(pdev->irq, nvkm_pci_intr, IRQF_SHARED, "nvkm", pci);
- if (ret)
- return ret;
-
- pci->irq = pdev->irq;
-
/* Ensure MSI interrupts are armed, for the case where there are
* already interrupts pending (for whatever reason) at load time.
*/
if (pci->msi)
pci->func->msi_rearm(pci);
- return ret;
+ return 0;
}
static void *
nvkm_pci_dtor(struct nvkm_subdev *subdev)
{
struct nvkm_pci *pci = nvkm_pci(subdev);
+
nvkm_agp_dtor(pci);
+
+ if (pci->irq >= 0) {
+ /* freq_irq() will call the handler, we use pci->irq == -1
+ * to signal that it's been torn down and should be a noop.
+ */
+ int irq = pci->irq;
+ pci->irq = -1;
+ free_irq(irq, pci);
+ }
+
if (pci->msi)
pci_disable_msi(pci->pdev);
+
return nvkm_pci(subdev);
}
This is a note to let you know that I've just added the patch titled
net: igmp: fix source address check for IGMPv3 reports
to the 4.14-stable tree which can be found at:
http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=sum…
The filename of the patch is:
net-igmp-fix-source-address-check-for-igmpv3-reports.patch
and it can be found in the queue-4.14 subdirectory.
If you, or anyone else, feels it should not be added to the stable tree,
please let <stable(a)vger.kernel.org> know about it.
>From foo@baz Sun Jan 28 17:35:08 CET 2018
From: Felix Fietkau <nbd(a)nbd.name>
Date: Fri, 19 Jan 2018 11:50:46 +0100
Subject: net: igmp: fix source address check for IGMPv3 reports
From: Felix Fietkau <nbd(a)nbd.name>
[ Upstream commit ad23b750933ea7bf962678972a286c78a8fa36aa ]
Commit "net: igmp: Use correct source address on IGMPv3 reports"
introduced a check to validate the source address of locally generated
IGMPv3 packets.
Instead of checking the local interface address directly, it uses
inet_ifa_match(fl4->saddr, ifa), which checks if the address is on the
local subnet (or equal to the point-to-point address if used).
This breaks for point-to-point interfaces, so check against
ifa->ifa_local directly.
Cc: Kevin Cernekee <cernekee(a)chromium.org>
Fixes: a46182b00290 ("net: igmp: Use correct source address on IGMPv3 reports")
Reported-by: Sebastian Gottschall <s.gottschall(a)dd-wrt.com>
Signed-off-by: Felix Fietkau <nbd(a)nbd.name>
Signed-off-by: David S. Miller <davem(a)davemloft.net>
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
---
net/ipv4/igmp.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -332,7 +332,7 @@ static __be32 igmpv3_get_srcaddr(struct
return htonl(INADDR_ANY);
for_ifa(in_dev) {
- if (inet_ifa_match(fl4->saddr, ifa))
+ if (fl4->saddr == ifa->ifa_local)
return fl4->saddr;
} endfor_ifa(in_dev);
Patches currently in stable-queue which might be from nbd(a)nbd.name are
queue-4.14/net-igmp-fix-source-address-check-for-igmpv3-reports.patch
The buf pointer was not being incremented inside the loop
meaning the same block of data would be read or written
repeatedly.
Fixes: 09ac4fcb3f25 ("drm/ttm: Implement vm_operations_struct.access v2")
Signed-off-by: Tom St Denis <tom.stdenis(a)amd.com>
Reviewed-by: Christian König <christian.koenig(a)amd.com>
(v2) Change 'buf' pointer to uint8_t* type
---
drivers/gpu/drm/ttm/ttm_bo_vm.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/drivers/gpu/drm/ttm/ttm_bo_vm.c b/drivers/gpu/drm/ttm/ttm_bo_vm.c
index 08a3c324242e..60fcef1593dd 100644
--- a/drivers/gpu/drm/ttm/ttm_bo_vm.c
+++ b/drivers/gpu/drm/ttm/ttm_bo_vm.c
@@ -316,7 +316,7 @@ static void ttm_bo_vm_close(struct vm_area_struct *vma)
static int ttm_bo_vm_access_kmap(struct ttm_buffer_object *bo,
unsigned long offset,
- void *buf, int len, int write)
+ uint8_t *buf, int len, int write)
{
unsigned long page = offset >> PAGE_SHIFT;
unsigned long bytes_left = len;
@@ -345,6 +345,7 @@ static int ttm_bo_vm_access_kmap(struct ttm_buffer_object *bo,
ttm_bo_kunmap(&map);
page++;
+ buf += bytes;
bytes_left -= bytes;
offset = 0;
} while (bytes_left);
--
2.14.3
The patch below does not apply to the 3.18-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From d5421ea43d30701e03cadc56a38854c36a8b4433 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx(a)linutronix.de>
Date: Fri, 26 Jan 2018 14:54:32 +0100
Subject: [PATCH] hrtimer: Reset hrtimer cpu base proper on CPU hotplug
The hrtimer interrupt code contains a hang detection and mitigation
mechanism, which prevents a long delayed hrtimer interrupt from causing a
continuous retriggering of interrupts which prevents the system from making
progress. If a hang is detected then the timer hardware is programmed with
a certain delay into the future and a flag is set in the hrtimer cpu base
which prevents newly enqueued timers from reprogramming the timer hardware
prior to the chosen delay. The subsequent hrtimer interrupt after the delay
clears the flag and resumes normal operation.
If such a hang happens in the last hrtimer interrupt before a CPU is
unplugged then the hang_detected flag is set and stays that way when the
CPU is plugged in again. At that point the timer hardware is not armed and
it cannot be armed because the hang_detected flag is still active, so
nothing clears that flag. As a consequence the CPU does not receive hrtimer
interrupts and no timers expire on that CPU which results in RCU stalls and
other malfunctions.
Clear the flag along with some other less critical members of the hrtimer
cpu base to ensure starting from a clean state when a CPU is plugged in.
Thanks to Paul, Sebastian and Anna-Maria for their help to get down to the
root cause of that hard to reproduce heisenbug. Once understood it's
trivial and certainly justifies a brown paperbag.
Fixes: 41d2e4949377 ("hrtimer: Tune hrtimer_interrupt hang logic")
Reported-by: Paul E. McKenney <paulmck(a)linux.vnet.ibm.com>
Signed-off-by: Thomas Gleixner <tglx(a)linutronix.de>
Cc: Peter Zijlstra <peterz(a)infradead.org>
Cc: Sebastian Sewior <bigeasy(a)linutronix.de>
Cc: Anna-Maria Gleixner <anna-maria(a)linutronix.de>
Cc: stable(a)vger.kernel.org
Link: https://lkml.kernel.org/r/alpine.DEB.2.20.1801261447590.2067@nanos
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index d32520840fde..aa9d2a2b1210 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -655,7 +655,9 @@ static void hrtimer_reprogram(struct hrtimer *timer,
static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base)
{
base->expires_next = KTIME_MAX;
+ base->hang_detected = 0;
base->hres_active = 0;
+ base->next_timer = NULL;
}
/*
@@ -1589,6 +1591,7 @@ int hrtimers_prepare_cpu(unsigned int cpu)
timerqueue_init_head(&cpu_base->clock_base[i].active);
}
+ cpu_base->active_bases = 0;
cpu_base->cpu = cpu;
hrtimer_init_hres(cpu_base);
return 0;
This is a note to let you know that I've just added the patch titled
hrtimer: Reset hrtimer cpu base proper on CPU hotplug
to the 3.18-stable tree which can be found at:
http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=sum…
The filename of the patch is:
hrtimer-reset-hrtimer-cpu-base-proper-on-cpu-hotplug.patch
and it can be found in the queue-3.18 subdirectory.
If you, or anyone else, feels it should not be added to the stable tree,
please let <stable(a)vger.kernel.org> know about it.
>From d5421ea43d30701e03cadc56a38854c36a8b4433 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx(a)linutronix.de>
Date: Fri, 26 Jan 2018 14:54:32 +0100
Subject: hrtimer: Reset hrtimer cpu base proper on CPU hotplug
From: Thomas Gleixner <tglx(a)linutronix.de>
commit d5421ea43d30701e03cadc56a38854c36a8b4433 upstream.
The hrtimer interrupt code contains a hang detection and mitigation
mechanism, which prevents a long delayed hrtimer interrupt from causing a
continuous retriggering of interrupts which prevents the system from making
progress. If a hang is detected then the timer hardware is programmed with
a certain delay into the future and a flag is set in the hrtimer cpu base
which prevents newly enqueued timers from reprogramming the timer hardware
prior to the chosen delay. The subsequent hrtimer interrupt after the delay
clears the flag and resumes normal operation.
If such a hang happens in the last hrtimer interrupt before a CPU is
unplugged then the hang_detected flag is set and stays that way when the
CPU is plugged in again. At that point the timer hardware is not armed and
it cannot be armed because the hang_detected flag is still active, so
nothing clears that flag. As a consequence the CPU does not receive hrtimer
interrupts and no timers expire on that CPU which results in RCU stalls and
other malfunctions.
Clear the flag along with some other less critical members of the hrtimer
cpu base to ensure starting from a clean state when a CPU is plugged in.
Thanks to Paul, Sebastian and Anna-Maria for their help to get down to the
root cause of that hard to reproduce heisenbug. Once understood it's
trivial and certainly justifies a brown paperbag.
Fixes: 41d2e4949377 ("hrtimer: Tune hrtimer_interrupt hang logic")
Reported-by: Paul E. McKenney <paulmck(a)linux.vnet.ibm.com>
Signed-off-by: Thomas Gleixner <tglx(a)linutronix.de>
Cc: Peter Zijlstra <peterz(a)infradead.org>
Cc: Sebastian Sewior <bigeasy(a)linutronix.de>
Cc: Anna-Maria Gleixner <anna-maria(a)linutronix.de>
Cc: stable(a)vger.kernel.org
Link: https://lkml.kernel.org/r/alpine.DEB.2.20.1801261447590.2067@nanos
[bigeasy: backport to v3.18, drop ->next_timer it was introduced later]
Signed-off-by: Sebastian Andrzej Siewior <bigeasy(a)linutronix.de>
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
---
kernel/time/hrtimer.c | 2 ++
1 file changed, 2 insertions(+)
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -612,6 +612,7 @@ static int hrtimer_reprogram(struct hrti
static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base)
{
base->expires_next.tv64 = KTIME_MAX;
+ base->hang_detected = 0;
base->hres_active = 0;
}
@@ -1632,6 +1633,7 @@ static void init_hrtimers_cpu(int cpu)
timerqueue_init_head(&cpu_base->clock_base[i].active);
}
+ cpu_base->active_bases = 0;
cpu_base->cpu = cpu;
hrtimer_init_hres(cpu_base);
}
Patches currently in stable-queue which might be from tglx(a)linutronix.de are
queue-3.18/futex-prevent-overflow-by-strengthen-input-validation.patch
queue-3.18/x86-asm-32-make-sync_core-handle-missing-cpuid-on-all-32-bit-kernels.patch
queue-3.18/x86-microcode-intel-extend-bdw-late-loading-further-with-llc-size-check.patch
queue-3.18/hrtimer-reset-hrtimer-cpu-base-proper-on-cpu-hotplug.patch
This is a note to let you know that I've just added the patch titled
nfsd: auth: Fix gid sorting when rootsquash enabled
to the 4.9-stable tree which can be found at:
http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=sum…
The filename of the patch is:
nfsd-auth-fix-gid-sorting-when-rootsquash-enabled.patch
and it can be found in the queue-4.9 subdirectory.
If you, or anyone else, feels it should not be added to the stable tree,
please let <stable(a)vger.kernel.org> know about it.
>From 1995266727fa8143897e89b55f5d3c79aa828420 Mon Sep 17 00:00:00 2001
From: Ben Hutchings <ben.hutchings(a)codethink.co.uk>
Date: Mon, 22 Jan 2018 20:11:06 +0000
Subject: nfsd: auth: Fix gid sorting when rootsquash enabled
From: Ben Hutchings <ben.hutchings(a)codethink.co.uk>
commit 1995266727fa8143897e89b55f5d3c79aa828420 upstream.
Commit bdcf0a423ea1 ("kernel: make groups_sort calling a responsibility
group_info allocators") appears to break nfsd rootsquash in a pretty
major way.
It adds a call to groups_sort() inside the loop that copies/squashes
gids, which means the valid gids are sorted along with the following
garbage. The net result is that the highest numbered valid gids are
replaced with any lower-valued garbage gids, possibly including 0.
We should sort only once, after filling in all the gids.
Fixes: bdcf0a423ea1 ("kernel: make groups_sort calling a responsibility ...")
Signed-off-by: Ben Hutchings <ben.hutchings(a)codethink.co.uk>
Acked-by: J. Bruce Fields <bfields(a)redhat.com>
Signed-off-by: Linus Torvalds <torvalds(a)linux-foundation.org>
Cc: Wolfgang Walter <linux(a)stwm.de>
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
---
fs/nfsd/auth.c | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
--- a/fs/nfsd/auth.c
+++ b/fs/nfsd/auth.c
@@ -59,10 +59,10 @@ int nfsd_setuser(struct svc_rqst *rqstp,
gi->gid[i] = exp->ex_anon_gid;
else
gi->gid[i] = rqgi->gid[i];
-
- /* Each thread allocates its own gi, no race */
- groups_sort(gi);
}
+
+ /* Each thread allocates its own gi, no race */
+ groups_sort(gi);
} else {
gi = get_group_info(rqgi);
}
Patches currently in stable-queue which might be from ben.hutchings(a)codethink.co.uk are
queue-4.9/vsyscall-fix-permissions-for-emulate-mode-with-kaiser-pti.patch
queue-4.9/ipv6-fix-getsockopt-for-sockets-with-default-ipv6_autoflowlabel.patch
queue-4.9/nfsd-auth-fix-gid-sorting-when-rootsquash-enabled.patch
Hello,
after upgrading our nfs-server from 4.9.75 to 4.9.78 group permissions stop
working (for clients). If you need group permissions to access a file or
directory, sometimes access is granted, but rather often denied. Often access
to the same object is denied within seconds after access was granted in an
earlier access. user permissions work fine.
Downgrading to 4.9.75 fixes the issue.
We use kerberos.
Regards,
--
Wolfgang Walter
Studentenwerk München
Anstalt des öffentlichen Rechts
This is a note to let you know that I've just added the patch titled
nfsd: auth: Fix gid sorting when rootsquash enabled
to the 4.4-stable tree which can be found at:
http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=sum…
The filename of the patch is:
nfsd-auth-fix-gid-sorting-when-rootsquash-enabled.patch
and it can be found in the queue-4.4 subdirectory.
If you, or anyone else, feels it should not be added to the stable tree,
please let <stable(a)vger.kernel.org> know about it.
>From 1995266727fa8143897e89b55f5d3c79aa828420 Mon Sep 17 00:00:00 2001
From: Ben Hutchings <ben.hutchings(a)codethink.co.uk>
Date: Mon, 22 Jan 2018 20:11:06 +0000
Subject: nfsd: auth: Fix gid sorting when rootsquash enabled
From: Ben Hutchings <ben.hutchings(a)codethink.co.uk>
commit 1995266727fa8143897e89b55f5d3c79aa828420 upstream.
Commit bdcf0a423ea1 ("kernel: make groups_sort calling a responsibility
group_info allocators") appears to break nfsd rootsquash in a pretty
major way.
It adds a call to groups_sort() inside the loop that copies/squashes
gids, which means the valid gids are sorted along with the following
garbage. The net result is that the highest numbered valid gids are
replaced with any lower-valued garbage gids, possibly including 0.
We should sort only once, after filling in all the gids.
Fixes: bdcf0a423ea1 ("kernel: make groups_sort calling a responsibility ...")
Signed-off-by: Ben Hutchings <ben.hutchings(a)codethink.co.uk>
Acked-by: J. Bruce Fields <bfields(a)redhat.com>
Signed-off-by: Linus Torvalds <torvalds(a)linux-foundation.org>
Cc: Wolfgang Walter <linux(a)stwm.de>
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
---
fs/nfsd/auth.c | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
--- a/fs/nfsd/auth.c
+++ b/fs/nfsd/auth.c
@@ -60,9 +60,10 @@ int nfsd_setuser(struct svc_rqst *rqstp,
else
GROUP_AT(gi, i) = GROUP_AT(rqgi, i);
- /* Each thread allocates its own gi, no race */
- groups_sort(gi);
}
+
+ /* Each thread allocates its own gi, no race */
+ groups_sort(gi);
} else {
gi = get_group_info(rqgi);
}
Patches currently in stable-queue which might be from ben.hutchings(a)codethink.co.uk are
queue-4.4/vsyscall-fix-permissions-for-emulate-mode-with-kaiser-pti.patch
queue-4.4/ipv6-fix-getsockopt-for-sockets-with-default-ipv6_autoflowlabel.patch
queue-4.4/x86-microcode-intel-fix-bdw-late-loading-revision-check.patch
queue-4.4/nfsd-auth-fix-gid-sorting-when-rootsquash-enabled.patch
This is a note to let you know that I've just added the patch titled
nfsd: auth: Fix gid sorting when rootsquash enabled
to the 4.14-stable tree which can be found at:
http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=sum…
The filename of the patch is:
nfsd-auth-fix-gid-sorting-when-rootsquash-enabled.patch
and it can be found in the queue-4.14 subdirectory.
If you, or anyone else, feels it should not be added to the stable tree,
please let <stable(a)vger.kernel.org> know about it.
>From 1995266727fa8143897e89b55f5d3c79aa828420 Mon Sep 17 00:00:00 2001
From: Ben Hutchings <ben.hutchings(a)codethink.co.uk>
Date: Mon, 22 Jan 2018 20:11:06 +0000
Subject: nfsd: auth: Fix gid sorting when rootsquash enabled
From: Ben Hutchings <ben.hutchings(a)codethink.co.uk>
commit 1995266727fa8143897e89b55f5d3c79aa828420 upstream.
Commit bdcf0a423ea1 ("kernel: make groups_sort calling a responsibility
group_info allocators") appears to break nfsd rootsquash in a pretty
major way.
It adds a call to groups_sort() inside the loop that copies/squashes
gids, which means the valid gids are sorted along with the following
garbage. The net result is that the highest numbered valid gids are
replaced with any lower-valued garbage gids, possibly including 0.
We should sort only once, after filling in all the gids.
Fixes: bdcf0a423ea1 ("kernel: make groups_sort calling a responsibility ...")
Signed-off-by: Ben Hutchings <ben.hutchings(a)codethink.co.uk>
Acked-by: J. Bruce Fields <bfields(a)redhat.com>
Signed-off-by: Linus Torvalds <torvalds(a)linux-foundation.org>
Cc: Wolfgang Walter <linux(a)stwm.de>
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
---
fs/nfsd/auth.c | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
--- a/fs/nfsd/auth.c
+++ b/fs/nfsd/auth.c
@@ -60,10 +60,10 @@ int nfsd_setuser(struct svc_rqst *rqstp,
gi->gid[i] = exp->ex_anon_gid;
else
gi->gid[i] = rqgi->gid[i];
-
- /* Each thread allocates its own gi, no race */
- groups_sort(gi);
}
+
+ /* Each thread allocates its own gi, no race */
+ groups_sort(gi);
} else {
gi = get_group_info(rqgi);
}
Patches currently in stable-queue which might be from ben.hutchings(a)codethink.co.uk are
queue-4.14/ipv6-fix-getsockopt-for-sockets-with-default-ipv6_autoflowlabel.patch
queue-4.14/nfsd-auth-fix-gid-sorting-when-rootsquash-enabled.patch
This is a note to let you know that I've just added the patch titled
cpufreq: governor: Ensure sufficiently large sampling intervals
to the 4.14-stable tree which can be found at:
http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=sum…
The filename of the patch is:
cpufreq-governor-ensure-sufficiently-large-sampling-intervals.patch
and it can be found in the queue-4.14 subdirectory.
If you, or anyone else, feels it should not be added to the stable tree,
please let <stable(a)vger.kernel.org> know about it.
>From 56026645e2b6f11ede34a5e6ab69d3eb56f9c8fc Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki(a)intel.com>
Date: Mon, 18 Dec 2017 02:15:32 +0100
Subject: cpufreq: governor: Ensure sufficiently large sampling intervals
From: Rafael J. Wysocki <rafael.j.wysocki(a)intel.com>
commit 56026645e2b6f11ede34a5e6ab69d3eb56f9c8fc upstream.
After commit aa7519af450d (cpufreq: Use transition_delay_us for legacy
governors as well) the sampling_rate field of struct dbs_data may be
less than the tick period which causes dbs_update() to produce
incorrect results, so make the code ensure that the value of that
field will always be sufficiently large.
Fixes: aa7519af450d (cpufreq: Use transition_delay_us for legacy governors as well)
Reported-by: Andy Tang <andy.tang(a)nxp.com>
Reported-by: Doug Smythies <dsmythies(a)telus.net>
Tested-by: Andy Tang <andy.tang(a)nxp.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki(a)intel.com>
Acked-by: Viresh Kumar <viresh.kumar(a)linaro.org>
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
---
drivers/cpufreq/cpufreq_governor.c | 19 ++++++++++++++++---
1 file changed, 16 insertions(+), 3 deletions(-)
--- a/drivers/cpufreq/cpufreq_governor.c
+++ b/drivers/cpufreq/cpufreq_governor.c
@@ -22,6 +22,8 @@
#include "cpufreq_governor.h"
+#define CPUFREQ_DBS_MIN_SAMPLING_INTERVAL (2 * TICK_NSEC / NSEC_PER_USEC)
+
static DEFINE_PER_CPU(struct cpu_dbs_info, cpu_dbs);
static DEFINE_MUTEX(gov_dbs_data_mutex);
@@ -47,11 +49,15 @@ ssize_t store_sampling_rate(struct gov_a
{
struct dbs_data *dbs_data = to_dbs_data(attr_set);
struct policy_dbs_info *policy_dbs;
+ unsigned int sampling_interval;
int ret;
- ret = sscanf(buf, "%u", &dbs_data->sampling_rate);
- if (ret != 1)
+
+ ret = sscanf(buf, "%u", &sampling_interval);
+ if (ret != 1 || sampling_interval < CPUFREQ_DBS_MIN_SAMPLING_INTERVAL)
return -EINVAL;
+ dbs_data->sampling_rate = sampling_interval;
+
/*
* We are operating under dbs_data->mutex and so the list and its
* entries can't be freed concurrently.
@@ -430,7 +436,14 @@ int cpufreq_dbs_governor_init(struct cpu
if (ret)
goto free_policy_dbs_info;
- dbs_data->sampling_rate = cpufreq_policy_transition_delay_us(policy);
+ /*
+ * The sampling interval should not be less than the transition latency
+ * of the CPU and it also cannot be too small for dbs_update() to work
+ * correctly.
+ */
+ dbs_data->sampling_rate = max_t(unsigned int,
+ CPUFREQ_DBS_MIN_SAMPLING_INTERVAL,
+ cpufreq_policy_transition_delay_us(policy));
if (!have_governor_per_policy())
gov->gdbs_data = dbs_data;
Patches currently in stable-queue which might be from rafael.j.wysocki(a)intel.com are
queue-4.14/cpufreq-governor-ensure-sufficiently-large-sampling-intervals.patch
On 01/24/2018 11:07 AM, David Woodhouse wrote:
> On Tue, 2018-01-09 at 22:39 +0100, Daniel Borkmann wrote:
>> On 01/09/2018 07:04 PM, Alexei Starovoitov wrote:
>>>
>>> The BPF interpreter has been used as part of the spectre 2 attack CVE-2017-5715.
>>>
>>> A quote from the Google Project Zero blog:
>>> "At this point, it would normally be necessary to locate gadgets in
>>> the host kernel code that can be used to actually leak data by reading
>>> from an attacker-controlled location, shifting and masking the result
>>> appropriately and then using the result of that as offset to an
>>> attacker-controlled address for a load. But piecing gadgets together
>>> and figuring out which ones work in a speculation context seems annoying.
>>> So instead, we decided to use the eBPF interpreter, which is built into
>>> the host kernel - while there is no legitimate way to invoke it from inside
>>> a VM, the presence of the code in the host kernel's text section is sufficient
>>> to make it usable for the attack, just like with ordinary ROP gadgets."
>>>
>>> To make the attacker's job harder, introduce the BPF_JIT_ALWAYS_ON config
>>> option that removes the interpreter from the kernel in favor of JIT-only mode.
>>> So far eBPF JIT is supported by:
>>> x64, arm64, arm32, sparc64, s390, powerpc64, mips64
>>>
>>> The start of JITed program is randomized and code page is marked as read-only.
>>> In addition "constant blinding" can be turned on with net.core.bpf_jit_harden
>>>
>>> v2->v3:
>>> - move __bpf_prog_ret0 under ifdef (Daniel)
>>>
>>> v1->v2:
>>> - fix init order, test_bpf and cBPF (Daniel's feedback)
>>> - fix offloaded bpf (Jakub's feedback)
>>> - add 'return 0' dummy in case something can invoke prog->bpf_func
>>> - retarget bpf tree. For bpf-next the patch would need one extra hunk.
>>> It will be sent when the trees are merged back to net-next
>>>
>>> Considered doing:
>>> int bpf_jit_enable __read_mostly = BPF_EBPF_JIT_DEFAULT;
>>> but it seems better to land the patch as-is and in bpf-next remove
>>> bpf_jit_enable global variable from all JITs, consolidate in one place
>>> and remove this jit_init() function.
>>>
>>> Signed-off-by: Alexei Starovoitov <ast(a)kernel.org>
>>
>> Applied to bpf tree, thanks Alexei!
>
> For stable too?
Yes, this will go into stable as well; batch of backports will come Thurs/Fri.
This is a note to let you know that I've just added the patch titled
vmxnet3: repair memory leak
to the 3.18-stable tree which can be found at:
http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=sum…
The filename of the patch is:
vmxnet3-repair-memory-leak.patch
and it can be found in the queue-3.18 subdirectory.
If you, or anyone else, feels it should not be added to the stable tree,
please let <stable(a)vger.kernel.org> know about it.
>From foo@baz Mon Jan 29 11:15:06 CET 2018
From: Neil Horman <nhorman(a)tuxdriver.com>
Date: Mon, 22 Jan 2018 16:06:37 -0500
Subject: vmxnet3: repair memory leak
From: Neil Horman <nhorman(a)tuxdriver.com>
[ Upstream commit 848b159835ddef99cc4193083f7e786c3992f580 ]
With the introduction of commit
b0eb57cb97e7837ebb746404c2c58c6f536f23fa, it appears that rq->buf_info
is improperly handled. While it is heap allocated when an rx queue is
setup, and freed when torn down, an old line of code in
vmxnet3_rq_destroy was not properly removed, leading to rq->buf_info[0]
being set to NULL prior to its being freed, causing a memory leak, which
eventually exhausts the system on repeated create/destroy operations
(for example, when the mtu of a vmxnet3 interface is changed
frequently).
The fix is pretty straightforward: just move the NULL set to after the
free.
Tested by myself with successful results
Applies to net, and should likely be queued for stable, please
Signed-off-by: Neil Horman <nhorman(a)tuxdriver.com>
Reported-By: boyang(a)redhat.com
CC: boyang(a)redhat.com
CC: Shrikrishna Khare <skhare(a)vmware.com>
CC: "VMware, Inc." <pv-drivers(a)vmware.com>
CC: David S. Miller <davem(a)davemloft.net>
Acked-by: Shrikrishna Khare <skhare(a)vmware.com>
Signed-off-by: David S. Miller <davem(a)davemloft.net>
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
---
drivers/net/vmxnet3/vmxnet3_drv.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
--- a/drivers/net/vmxnet3/vmxnet3_drv.c
+++ b/drivers/net/vmxnet3/vmxnet3_drv.c
@@ -1420,7 +1420,6 @@ static void vmxnet3_rq_destroy(struct vm
rq->rx_ring[i].basePA);
rq->rx_ring[i].base = NULL;
}
- rq->buf_info[i] = NULL;
}
if (rq->comp_ring.base) {
@@ -1435,6 +1434,7 @@ static void vmxnet3_rq_destroy(struct vm
(rq->rx_ring[0].size + rq->rx_ring[1].size);
dma_free_coherent(&adapter->pdev->dev, sz, rq->buf_info[0],
rq->buf_info_pa);
+ rq->buf_info[0] = rq->buf_info[1] = NULL;
}
}
Patches currently in stable-queue which might be from nhorman(a)tuxdriver.com are
queue-3.18/sctp-do-not-allow-the-v4-socket-to-bind-a-v4mapped-v6-address.patch
queue-3.18/vmxnet3-repair-memory-leak.patch
queue-3.18/sctp-return-error-if-the-asoc-has-been-peeled-off-in-sctp_wait_for_sndbuf.patch
This is a note to let you know that I've just added the patch titled
tcp: __tcp_hdrlen() helper
to the 3.18-stable tree which can be found at:
http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=sum…
The filename of the patch is:
tcp-__tcp_hdrlen-helper.patch
and it can be found in the queue-3.18 subdirectory.
If you, or anyone else, feels it should not be added to the stable tree,
please let <stable(a)vger.kernel.org> know about it.
>From d9b3fca27385eafe61c3ca6feab6cb1e7dc77482 Mon Sep 17 00:00:00 2001
From: Craig Gallek <kraig(a)google.com>
Date: Wed, 10 Feb 2016 11:50:37 -0500
Subject: tcp: __tcp_hdrlen() helper
From: Craig Gallek <kraig(a)google.com>
commit d9b3fca27385eafe61c3ca6feab6cb1e7dc77482 upstream.
tcp_hdrlen is wasteful if you already have a pointer to struct tcphdr.
This splits the size calculation into a helper function that can be
used if a struct tcphdr is already available.
Signed-off-by: Craig Gallek <kraig(a)google.com>
Signed-off-by: David S. Miller <davem(a)davemloft.net>
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
---
include/linux/tcp.h | 7 ++++++-
1 file changed, 6 insertions(+), 1 deletion(-)
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -29,9 +29,14 @@ static inline struct tcphdr *tcp_hdr(con
return (struct tcphdr *)skb_transport_header(skb);
}
+static inline unsigned int __tcp_hdrlen(const struct tcphdr *th)
+{
+ return th->doff * 4;
+}
+
static inline unsigned int tcp_hdrlen(const struct sk_buff *skb)
{
- return tcp_hdr(skb)->doff * 4;
+ return __tcp_hdrlen(tcp_hdr(skb));
}
static inline struct tcphdr *inner_tcp_hdr(const struct sk_buff *skb)
Patches currently in stable-queue which might be from kraig(a)google.com are
queue-3.18/tcp-__tcp_hdrlen-helper.patch
This is a note to let you know that I've just added the patch titled
sctp: return error if the asoc has been peeled off in sctp_wait_for_sndbuf
to the 3.18-stable tree which can be found at:
http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=sum…
The filename of the patch is:
sctp-return-error-if-the-asoc-has-been-peeled-off-in-sctp_wait_for_sndbuf.patch
and it can be found in the queue-3.18 subdirectory.
If you, or anyone else, feels it should not be added to the stable tree,
please let <stable(a)vger.kernel.org> know about it.
>From foo@baz Mon Jan 29 11:15:06 CET 2018
From: Xin Long <lucien.xin(a)gmail.com>
Date: Mon, 15 Jan 2018 17:01:36 +0800
Subject: sctp: return error if the asoc has been peeled off in sctp_wait_for_sndbuf
From: Xin Long <lucien.xin(a)gmail.com>
[ Upstream commit a0ff660058b88d12625a783ce9e5c1371c87951f ]
After commit cea0cc80a677 ("sctp: use the right sk after waking up from
wait_buf sleep"), it may change to lock another sk if the asoc has been
peeled off in sctp_wait_for_sndbuf.
However, the asoc's new sk could be already closed elsewhere, as it's in
the sendmsg context of the old sk that can't avoid the new sk's closing.
If the sk's last refcnt is held by this asoc, later on after putting
this asoc, the new sk will be freed, while under its own lock.
This patch is to revert that commit, but fix the old issue by returning
error under the old sk's lock.
Fixes: cea0cc80a677 ("sctp: use the right sk after waking up from wait_buf sleep")
Reported-by: syzbot+ac6ea7baa4432811eb50(a)syzkaller.appspotmail.com
Signed-off-by: Xin Long <lucien.xin(a)gmail.com>
Acked-by: Neil Horman <nhorman(a)tuxdriver.com>
Signed-off-by: David S. Miller <davem(a)davemloft.net>
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
---
net/sctp/socket.c | 16 ++++++----------
1 file changed, 6 insertions(+), 10 deletions(-)
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -83,7 +83,7 @@
static int sctp_writeable(struct sock *sk);
static void sctp_wfree(struct sk_buff *skb);
static int sctp_wait_for_sndbuf(struct sctp_association *asoc, long *timeo_p,
- size_t msg_len, struct sock **orig_sk);
+ size_t msg_len);
static int sctp_wait_for_packet(struct sock *sk, int *err, long *timeo_p);
static int sctp_wait_for_connect(struct sctp_association *, long *timeo_p);
static int sctp_wait_for_accept(struct sock *sk, long timeo);
@@ -1948,7 +1948,7 @@ static int sctp_sendmsg(struct kiocb *io
timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
if (!sctp_wspace(asoc)) {
/* sk can be changed by peel off when waiting for buf. */
- err = sctp_wait_for_sndbuf(asoc, &timeo, msg_len, &sk);
+ err = sctp_wait_for_sndbuf(asoc, &timeo, msg_len);
if (err) {
if (err == -ESRCH) {
/* asoc is already dead. */
@@ -6981,12 +6981,12 @@ void sctp_sock_rfree(struct sk_buff *skb
/* Helper function to wait for space in the sndbuf. */
static int sctp_wait_for_sndbuf(struct sctp_association *asoc, long *timeo_p,
- size_t msg_len, struct sock **orig_sk)
+ size_t msg_len)
{
struct sock *sk = asoc->base.sk;
- int err = 0;
long current_timeo = *timeo_p;
DEFINE_WAIT(wait);
+ int err = 0;
pr_debug("%s: asoc:%p, timeo:%ld, msg_len:%zu\n", __func__, asoc,
*timeo_p, msg_len);
@@ -7015,17 +7015,13 @@ static int sctp_wait_for_sndbuf(struct s
release_sock(sk);
current_timeo = schedule_timeout(current_timeo);
lock_sock(sk);
- if (sk != asoc->base.sk) {
- release_sock(sk);
- sk = asoc->base.sk;
- lock_sock(sk);
- }
+ if (sk != asoc->base.sk)
+ goto do_error;
*timeo_p = current_timeo;
}
out:
- *orig_sk = sk;
finish_wait(&asoc->wait, &wait);
/* Release the association's refcnt. */
Patches currently in stable-queue which might be from lucien.xin(a)gmail.com are
queue-3.18/sctp-do-not-allow-the-v4-socket-to-bind-a-v4mapped-v6-address.patch
queue-3.18/pppoe-take-needed_headroom-of-lower-device-into-account-on-xmit.patch
queue-3.18/sctp-return-error-if-the-asoc-has-been-peeled-off-in-sctp_wait_for_sndbuf.patch
This is a note to let you know that I've just added the patch titled
sctp: do not allow the v4 socket to bind a v4mapped v6 address
to the 3.18-stable tree which can be found at:
http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=sum…
The filename of the patch is:
sctp-do-not-allow-the-v4-socket-to-bind-a-v4mapped-v6-address.patch
and it can be found in the queue-3.18 subdirectory.
If you, or anyone else, feels it should not be added to the stable tree,
please let <stable(a)vger.kernel.org> know about it.
>From foo@baz Mon Jan 29 11:15:06 CET 2018
From: Xin Long <lucien.xin(a)gmail.com>
Date: Mon, 15 Jan 2018 17:02:00 +0800
Subject: sctp: do not allow the v4 socket to bind a v4mapped v6 address
From: Xin Long <lucien.xin(a)gmail.com>
[ Upstream commit c5006b8aa74599ce19104b31d322d2ea9ff887cc ]
The check in sctp_sockaddr_af is not robust enough to forbid binding a
v4mapped v6 addr on a v4 socket.
The worse thing is that v4 socket's bind_verify would not convert this
v4mapped v6 addr to a v4 addr. syzbot even reported a crash as the v4
socket bound a v6 addr.
This patch is to fix it by doing the common sa.sa_family check first,
then AF_INET check for v4mapped v6 addrs.
Fixes: 7dab83de50c7 ("sctp: Support ipv6only AF_INET6 sockets.")
Reported-by: syzbot+7b7b518b1228d2743963(a)syzkaller.appspotmail.com
Acked-by: Neil Horman <nhorman(a)tuxdriver.com>
Signed-off-by: Xin Long <lucien.xin(a)gmail.com>
Acked-by: Marcelo Ricardo Leitner <marcelo.leitner(a)gmail.com>
Signed-off-by: David S. Miller <davem(a)davemloft.net>
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
---
net/sctp/socket.c | 14 ++++++--------
1 file changed, 6 insertions(+), 8 deletions(-)
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -333,16 +333,14 @@ static struct sctp_af *sctp_sockaddr_af(
if (len < sizeof (struct sockaddr))
return NULL;
+ if (!opt->pf->af_supported(addr->sa.sa_family, opt))
+ return NULL;
+
/* V4 mapped address are really of AF_INET family */
if (addr->sa.sa_family == AF_INET6 &&
- ipv6_addr_v4mapped(&addr->v6.sin6_addr)) {
- if (!opt->pf->af_supported(AF_INET, opt))
- return NULL;
- } else {
- /* Does this PF support this AF? */
- if (!opt->pf->af_supported(addr->sa.sa_family, opt))
- return NULL;
- }
+ ipv6_addr_v4mapped(&addr->v6.sin6_addr) &&
+ !opt->pf->af_supported(AF_INET, opt))
+ return NULL;
/* If we get this far, af is valid. */
af = sctp_get_af_specific(addr->sa.sa_family);
Patches currently in stable-queue which might be from lucien.xin(a)gmail.com are
queue-3.18/sctp-do-not-allow-the-v4-socket-to-bind-a-v4mapped-v6-address.patch
queue-3.18/pppoe-take-needed_headroom-of-lower-device-into-account-on-xmit.patch
queue-3.18/sctp-return-error-if-the-asoc-has-been-peeled-off-in-sctp_wait_for_sndbuf.patch
This is a note to let you know that I've just added the patch titled
pppoe: take ->needed_headroom of lower device into account on xmit
to the 3.18-stable tree which can be found at:
http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=sum…
The filename of the patch is:
pppoe-take-needed_headroom-of-lower-device-into-account-on-xmit.patch
and it can be found in the queue-3.18 subdirectory.
If you, or anyone else, feels it should not be added to the stable tree,
please let <stable(a)vger.kernel.org> know about it.
>From foo@baz Mon Jan 29 11:15:06 CET 2018
From: Guillaume Nault <g.nault(a)alphalink.fr>
Date: Mon, 22 Jan 2018 18:06:37 +0100
Subject: pppoe: take ->needed_headroom of lower device into account on xmit
From: Guillaume Nault <g.nault(a)alphalink.fr>
[ Upstream commit 02612bb05e51df8489db5e94d0cf8d1c81f87b0c ]
In pppoe_sendmsg(), reserving dev->hard_header_len bytes of headroom
was probably fine before the introduction of ->needed_headroom in
commit f5184d267c1a ("net: Allow netdevices to specify needed head/tailroom").
But now, virtual devices typically advertise the size of their overhead
in dev->needed_headroom, so we must also take it into account in
skb_reserve().
Allocation size of skb is also updated to take dev->needed_tailroom
into account and replace the arbitrary 32 bytes with the real size of
a PPPoE header.
This issue was discovered by syzbot, who connected a pppoe socket to a
gre device which had dev->header_ops->create == ipgre_header and
dev->hard_header_len == 0. Therefore, PPPoE didn't reserve any
headroom, and dev_hard_header() crashed when ipgre_header() tried to
prepend its header to skb->data.
skbuff: skb_under_panic: text:000000001d390b3a len:31 put:24
head:00000000d8ed776f data:000000008150e823 tail:0x7 end:0xc0 dev:gre0
------------[ cut here ]------------
kernel BUG at net/core/skbuff.c:104!
invalid opcode: 0000 [#1] SMP KASAN
Dumping ftrace buffer:
(ftrace buffer empty)
Modules linked in:
CPU: 1 PID: 3670 Comm: syzkaller801466 Not tainted
4.15.0-rc7-next-20180115+ #97
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS
Google 01/01/2011
RIP: 0010:skb_panic+0x162/0x1f0 net/core/skbuff.c:100
RSP: 0018:ffff8801d9bd7840 EFLAGS: 00010282
RAX: 0000000000000083 RBX: ffff8801d4f083c0 RCX: 0000000000000000
RDX: 0000000000000083 RSI: 1ffff1003b37ae92 RDI: ffffed003b37aefc
RBP: ffff8801d9bd78a8 R08: 1ffff1003b37ae8a R09: 0000000000000000
R10: 0000000000000001 R11: 0000000000000000 R12: ffffffff86200de0
R13: ffffffff84a981ad R14: 0000000000000018 R15: ffff8801d2d34180
FS: 00000000019c4880(0000) GS:ffff8801db300000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00000000208bc000 CR3: 00000001d9111001 CR4: 00000000001606e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
skb_under_panic net/core/skbuff.c:114 [inline]
skb_push+0xce/0xf0 net/core/skbuff.c:1714
ipgre_header+0x6d/0x4e0 net/ipv4/ip_gre.c:879
dev_hard_header include/linux/netdevice.h:2723 [inline]
pppoe_sendmsg+0x58e/0x8b0 drivers/net/ppp/pppoe.c:890
sock_sendmsg_nosec net/socket.c:630 [inline]
sock_sendmsg+0xca/0x110 net/socket.c:640
sock_write_iter+0x31a/0x5d0 net/socket.c:909
call_write_iter include/linux/fs.h:1775 [inline]
do_iter_readv_writev+0x525/0x7f0 fs/read_write.c:653
do_iter_write+0x154/0x540 fs/read_write.c:932
vfs_writev+0x18a/0x340 fs/read_write.c:977
do_writev+0xfc/0x2a0 fs/read_write.c:1012
SYSC_writev fs/read_write.c:1085 [inline]
SyS_writev+0x27/0x30 fs/read_write.c:1082
entry_SYSCALL_64_fastpath+0x29/0xa0
Admittedly PPPoE shouldn't be allowed to run on non Ethernet-like
interfaces, but reserving space for ->needed_headroom is a more
fundamental issue that needs to be addressed first.
Same problem exists for __pppoe_xmit(), which also needs to take
dev->needed_headroom into account in skb_cow_head().
Fixes: f5184d267c1a ("net: Allow netdevices to specify needed head/tailroom")
Reported-by: syzbot+ed0838d0fa4c4f2b528e20286e6dc63effc7c14d(a)syzkaller.appspotmail.com
Signed-off-by: Guillaume Nault <g.nault(a)alphalink.fr>
Reviewed-by: Xin Long <lucien.xin(a)gmail.com>
Signed-off-by: David S. Miller <davem(a)davemloft.net>
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
---
drivers/net/ppp/pppoe.c | 11 ++++++-----
1 file changed, 6 insertions(+), 5 deletions(-)
--- a/drivers/net/ppp/pppoe.c
+++ b/drivers/net/ppp/pppoe.c
@@ -830,6 +830,7 @@ static int pppoe_sendmsg(struct kiocb *i
struct pppoe_hdr *ph;
struct net_device *dev;
char *start;
+ int hlen;
lock_sock(sk);
if (sock_flag(sk, SOCK_DEAD) || !(sk->sk_state & PPPOX_CONNECTED)) {
@@ -848,16 +849,16 @@ static int pppoe_sendmsg(struct kiocb *i
if (total_len > (dev->mtu + dev->hard_header_len))
goto end;
-
- skb = sock_wmalloc(sk, total_len + dev->hard_header_len + 32,
- 0, GFP_KERNEL);
+ hlen = LL_RESERVED_SPACE(dev);
+ skb = sock_wmalloc(sk, hlen + sizeof(*ph) + total_len +
+ dev->needed_tailroom, 0, GFP_KERNEL);
if (!skb) {
error = -ENOMEM;
goto end;
}
/* Reserve space for headers. */
- skb_reserve(skb, dev->hard_header_len);
+ skb_reserve(skb, hlen);
skb_reset_network_header(skb);
skb->dev = dev;
@@ -918,7 +919,7 @@ static int __pppoe_xmit(struct sock *sk,
/* Copy the data if there is no space for the header or if it's
* read-only.
*/
- if (skb_cow_head(skb, sizeof(*ph) + dev->hard_header_len))
+ if (skb_cow_head(skb, LL_RESERVED_SPACE(dev) + sizeof(*ph)))
goto abort;
__skb_push(skb, sizeof(*ph));
Patches currently in stable-queue which might be from g.nault(a)alphalink.fr are
queue-3.18/pppoe-take-needed_headroom-of-lower-device-into-account-on-xmit.patch
This is a note to let you know that I've just added the patch titled
net: tcp: close sock if net namespace is exiting
to the 3.18-stable tree which can be found at:
http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=sum…
The filename of the patch is:
net-tcp-close-sock-if-net-namespace-is-exiting.patch
and it can be found in the queue-3.18 subdirectory.
If you, or anyone else, feels it should not be added to the stable tree,
please let <stable(a)vger.kernel.org> know about it.
>From foo@baz Mon Jan 29 10:14:57 CET 2018
From: Dan Streetman <ddstreet(a)ieee.org>
Date: Thu, 18 Jan 2018 16:14:26 -0500
Subject: net: tcp: close sock if net namespace is exiting
From: Dan Streetman <ddstreet(a)ieee.org>
[ Upstream commit 4ee806d51176ba7b8ff1efd81f271d7252e03a1d ]
When a tcp socket is closed, if it detects that its net namespace is
exiting, close immediately and do not wait for FIN sequence.
For normal sockets, a reference is taken to their net namespace, so it will
never exit while the socket is open. However, kernel sockets do not take a
reference to their net namespace, so it may begin exiting while the kernel
socket is still open. In this case if the kernel socket is a tcp socket,
it will stay open trying to complete its close sequence. The sock's dst(s)
hold a reference to their interface, which are all transferred to the
namespace's loopback interface when the real interfaces are taken down.
When the namespace tries to take down its loopback interface, it hangs
waiting for all references to the loopback interface to release, which
results in messages like:
unregister_netdevice: waiting for lo to become free. Usage count = 1
These messages continue until the socket finally times out and closes.
Since the net namespace cleanup holds the net_mutex while calling its
registered pernet callbacks, any new net namespace initialization is
blocked until the current net namespace finishes exiting.
After this change, the tcp socket notices the exiting net namespace, and
closes immediately, releasing its dst(s) and their reference to the
loopback interface, which lets the net namespace continue exiting.
Link: https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1711407
Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=97811
Signed-off-by: Dan Streetman <ddstreet(a)canonical.com>
Signed-off-by: David S. Miller <davem(a)davemloft.net>
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
---
include/net/net_namespace.h | 10 ++++++++++
net/ipv4/tcp.c | 3 +++
net/ipv4/tcp_timer.c | 15 +++++++++++++++
3 files changed, 28 insertions(+)
--- a/include/net/net_namespace.h
+++ b/include/net/net_namespace.h
@@ -200,6 +200,11 @@ int net_eq(const struct net *net1, const
return net1 == net2;
}
+static inline int check_net(const struct net *net)
+{
+ return atomic_read(&net->count) != 0;
+}
+
void net_drop_ns(void *);
#else
@@ -223,6 +228,11 @@ int net_eq(const struct net *net1, const
{
return 1;
}
+
+static inline int check_net(const struct net *net)
+{
+ return 1;
+}
#define net_drop_ns NULL
#endif
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2182,6 +2182,9 @@ adjudge_to_death:
tcp_send_active_reset(sk, GFP_ATOMIC);
NET_INC_STATS_BH(sock_net(sk),
LINUX_MIB_TCPABORTONMEMORY);
+ } else if (!check_net(sock_net(sk))) {
+ /* Not possible to send reset; just close */
+ tcp_set_state(sk, TCP_CLOSE);
}
}
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -46,11 +46,19 @@ static void tcp_write_err(struct sock *s
* to prevent DoS attacks. It is called when a retransmission timeout
* or zero probe timeout occurs on orphaned socket.
*
+ * Also close if our net namespace is exiting; in that case there is no
+ * hope of ever communicating again since all netns interfaces are already
+ * down (or about to be down), and we need to release our dst references,
+ * which have been moved to the netns loopback interface, so the namespace
+ * can finish exiting. This condition is only possible if we are a kernel
+ * socket, as those do not hold references to the namespace.
+ *
* Criteria is still not confirmed experimentally and may change.
* We kill the socket, if:
* 1. If number of orphaned sockets exceeds an administratively configured
* limit.
* 2. If we have strong memory pressure.
+ * 3. If our net namespace is exiting.
*/
static int tcp_out_of_resources(struct sock *sk, bool do_reset)
{
@@ -79,6 +87,13 @@ static int tcp_out_of_resources(struct s
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONMEMORY);
return 1;
}
+
+ if (!check_net(sock_net(sk))) {
+ /* Not possible to send reset; just close */
+ tcp_done(sk);
+ return 1;
+ }
+
return 0;
}
Patches currently in stable-queue which might be from ddstreet(a)ieee.org are
queue-3.18/net-tcp-close-sock-if-net-namespace-is-exiting.patch
This is a note to let you know that I've just added the patch titled
net: qdisc_pkt_len_init() should be more robust
to the 3.18-stable tree which can be found at:
http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=sum…
The filename of the patch is:
net-qdisc_pkt_len_init-should-be-more-robust.patch
and it can be found in the queue-3.18 subdirectory.
If you, or anyone else, feels it should not be added to the stable tree,
please let <stable(a)vger.kernel.org> know about it.
>From foo@baz Mon Jan 29 11:15:06 CET 2018
From: Eric Dumazet <edumazet(a)google.com>
Date: Thu, 18 Jan 2018 19:59:19 -0800
Subject: net: qdisc_pkt_len_init() should be more robust
From: Eric Dumazet <edumazet(a)google.com>
[ Upstream commit 7c68d1a6b4db9012790af7ac0f0fdc0d2083422a ]
Without proper validation of DODGY packets, we might very well
feed qdisc_pkt_len_init() with invalid GSO packets.
tcp_hdrlen() might access out-of-bound data, so let's use
skb_header_pointer() and proper checks.
Whole story is described in commit d0c081b49137 ("flow_dissector:
properly cap thoff field")
We have the goal of validating DODGY packets earlier in the stack,
so we might very well revert this fix in the future.
Signed-off-by: Eric Dumazet <edumazet(a)google.com>
Cc: Willem de Bruijn <willemb(a)google.com>
Cc: Jason Wang <jasowang(a)redhat.com>
Reported-by: syzbot+9da69ebac7dddd804552(a)syzkaller.appspotmail.com
Acked-by: Jason Wang <jasowang(a)redhat.com>
Signed-off-by: David S. Miller <davem(a)davemloft.net>
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
---
net/core/dev.c | 19 +++++++++++++++----
1 file changed, 15 insertions(+), 4 deletions(-)
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2772,10 +2772,21 @@ static void qdisc_pkt_len_init(struct sk
hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
/* + transport layer */
- if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
- hdr_len += tcp_hdrlen(skb);
- else
- hdr_len += sizeof(struct udphdr);
+ if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) {
+ const struct tcphdr *th;
+ struct tcphdr _tcphdr;
+
+ th = skb_header_pointer(skb, skb_transport_offset(skb),
+ sizeof(_tcphdr), &_tcphdr);
+ if (likely(th))
+ hdr_len += __tcp_hdrlen(th);
+ } else {
+ struct udphdr _udphdr;
+
+ if (skb_header_pointer(skb, skb_transport_offset(skb),
+ sizeof(_udphdr), &_udphdr))
+ hdr_len += sizeof(struct udphdr);
+ }
if (shinfo->gso_type & SKB_GSO_DODGY)
gso_segs = DIV_ROUND_UP(skb->len - hdr_len,
Patches currently in stable-queue which might be from edumazet(a)google.com are
queue-3.18/ipv6-fix-udpv6-sendmsg-crash-caused-by-too-small-mtu.patch
queue-3.18/dccp-don-t-restart-ccid2_hc_tx_rto_expire-if-sk-in-closed-state.patch
queue-3.18/netfilter-restart-search-if-moved-to-other-chain.patch
queue-3.18/net-qdisc_pkt_len_init-should-be-more-robust.patch
This is a note to let you know that I've just added the patch titled
net: igmp: fix source address check for IGMPv3 reports
to the 3.18-stable tree which can be found at:
http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=sum…
The filename of the patch is:
net-igmp-fix-source-address-check-for-igmpv3-reports.patch
and it can be found in the queue-3.18 subdirectory.
If you, or anyone else, feels it should not be added to the stable tree,
please let <stable(a)vger.kernel.org> know about it.
>From foo@baz Mon Jan 29 11:15:06 CET 2018
From: Felix Fietkau <nbd(a)nbd.name>
Date: Fri, 19 Jan 2018 11:50:46 +0100
Subject: net: igmp: fix source address check for IGMPv3 reports
From: Felix Fietkau <nbd(a)nbd.name>
[ Upstream commit ad23b750933ea7bf962678972a286c78a8fa36aa ]
Commit "net: igmp: Use correct source address on IGMPv3 reports"
introduced a check to validate the source address of locally generated
IGMPv3 packets.
Instead of checking the local interface address directly, it uses
inet_ifa_match(fl4->saddr, ifa), which checks if the address is on the
local subnet (or equal to the point-to-point address if used).
This breaks for point-to-point interfaces, so check against
ifa->ifa_local directly.
Cc: Kevin Cernekee <cernekee(a)chromium.org>
Fixes: a46182b00290 ("net: igmp: Use correct source address on IGMPv3 reports")
Reported-by: Sebastian Gottschall <s.gottschall(a)dd-wrt.com>
Signed-off-by: Felix Fietkau <nbd(a)nbd.name>
Signed-off-by: David S. Miller <davem(a)davemloft.net>
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
---
net/ipv4/igmp.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -329,7 +329,7 @@ static __be32 igmpv3_get_srcaddr(struct
return htonl(INADDR_ANY);
for_ifa(in_dev) {
- if (inet_ifa_match(fl4->saddr, ifa))
+ if (fl4->saddr == ifa->ifa_local)
return fl4->saddr;
} endfor_ifa(in_dev);
Patches currently in stable-queue which might be from nbd(a)nbd.name are
queue-3.18/net-igmp-fix-source-address-check-for-igmpv3-reports.patch
This is a note to let you know that I've just added the patch titled
net: Allow neigh contructor functions ability to modify the primary_key
to the 3.18-stable tree which can be found at:
http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=summary
The filename of the patch is:
net-allow-neigh-contructor-functions-ability-to-modify-the-primary_key.patch
and it can be found in the queue-3.18 subdirectory.
If you, or anyone else, feels it should not be added to the stable tree,
please let <stable(a)vger.kernel.org> know about it.
>From foo@baz Mon Jan 29 11:15:06 CET 2018
From: Jim Westfall <jwestfall(a)surrealistic.net>
Date: Sun, 14 Jan 2018 04:18:50 -0800
Subject: net: Allow neigh contructor functions ability to modify the primary_key
From: Jim Westfall <jwestfall(a)surrealistic.net>
[ Upstream commit 096b9854c04df86f03b38a97d40b6506e5730919 ]
Use n->primary_key instead of pkey to account for the possibility that a neigh
constructor function may have modified the primary_key value.
Signed-off-by: Jim Westfall <jwestfall(a)surrealistic.net>
Signed-off-by: David S. Miller <davem(a)davemloft.net>
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
---
net/core/neighbour.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -508,7 +508,7 @@ struct neighbour *__neigh_create(struct
if (atomic_read(&tbl->entries) > (1 << nht->hash_shift))
nht = neigh_hash_grow(tbl, nht->hash_shift + 1);
- hash_val = tbl->hash(pkey, dev, nht->hash_rnd) >> (32 - nht->hash_shift);
+ hash_val = tbl->hash(n->primary_key, dev, nht->hash_rnd) >> (32 - nht->hash_shift);
if (n->parms->dead) {
rc = ERR_PTR(-EINVAL);
@@ -520,7 +520,7 @@ struct neighbour *__neigh_create(struct
n1 != NULL;
n1 = rcu_dereference_protected(n1->next,
lockdep_is_held(&tbl->lock))) {
- if (dev == n1->dev && !memcmp(n1->primary_key, pkey, key_len)) {
+ if (dev == n1->dev && !memcmp(n1->primary_key, n->primary_key, key_len)) {
if (want_ref)
neigh_hold(n1);
rc = n1;
Patches currently in stable-queue which might be from jwestfall(a)surrealistic.net are
queue-3.18/ipv4-make-neigh-lookup-keys-for-loopback-point-to-point-devices-be-inaddr_any.patch
queue-3.18/net-allow-neigh-contructor-functions-ability-to-modify-the-primary_key.patch
This is a note to let you know that I've just added the patch titled
ipv6: fix udpv6 sendmsg crash caused by too small MTU
to the 3.18-stable tree which can be found at:
http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=summary
The filename of the patch is:
ipv6-fix-udpv6-sendmsg-crash-caused-by-too-small-mtu.patch
and it can be found in the queue-3.18 subdirectory.
If you, or anyone else, feels it should not be added to the stable tree,
please let <stable(a)vger.kernel.org> know about it.
>From foo@baz Mon Jan 29 11:15:06 CET 2018
From: Mike Maloney <maloney(a)google.com>
Date: Wed, 10 Jan 2018 12:45:10 -0500
Subject: ipv6: fix udpv6 sendmsg crash caused by too small MTU
From: Mike Maloney <maloney(a)google.com>
[ Upstream commit 749439bfac6e1a2932c582e2699f91d329658196 ]
The logic in __ip6_append_data() assumes that the MTU is at least large
enough for the headers. A device's MTU may be adjusted after being
added while sendmsg() is processing data, resulting in
__ip6_append_data() seeing any MTU. For an mtu smaller than the size of
the fragmentation header, the math results in a negative 'maxfraglen',
which causes problems when refragmenting any previous skb in the
skb_write_queue, leaving it possibly malformed.
Instead sendmsg returns EINVAL when the mtu is calculated to be less
than IPV6_MIN_MTU.
Found by syzkaller:
kernel BUG at ./include/linux/skbuff.h:2064!
invalid opcode: 0000 [#1] SMP KASAN
Dumping ftrace buffer:
(ftrace buffer empty)
Modules linked in:
CPU: 1 PID: 14216 Comm: syz-executor5 Not tainted 4.13.0-rc4+ #2
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
task: ffff8801d0b68580 task.stack: ffff8801ac6b8000
RIP: 0010:__skb_pull include/linux/skbuff.h:2064 [inline]
RIP: 0010:__ip6_make_skb+0x18cf/0x1f70 net/ipv6/ip6_output.c:1617
RSP: 0018:ffff8801ac6bf570 EFLAGS: 00010216
RAX: 0000000000010000 RBX: 0000000000000028 RCX: ffffc90003cce000
RDX: 00000000000001b8 RSI: ffffffff839df06f RDI: ffff8801d9478ca0
RBP: ffff8801ac6bf780 R08: ffff8801cc3f1dbc R09: 0000000000000000
R10: ffff8801ac6bf7a0 R11: 43cb4b7b1948a9e7 R12: ffff8801cc3f1dc8
R13: ffff8801cc3f1d40 R14: 0000000000001036 R15: dffffc0000000000
FS: 00007f43d740c700(0000) GS:ffff8801dc100000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00007f7834984000 CR3: 00000001d79b9000 CR4: 00000000001406e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
ip6_finish_skb include/net/ipv6.h:911 [inline]
udp_v6_push_pending_frames+0x255/0x390 net/ipv6/udp.c:1093
udpv6_sendmsg+0x280d/0x31a0 net/ipv6/udp.c:1363
inet_sendmsg+0x11f/0x5e0 net/ipv4/af_inet.c:762
sock_sendmsg_nosec net/socket.c:633 [inline]
sock_sendmsg+0xca/0x110 net/socket.c:643
SYSC_sendto+0x352/0x5a0 net/socket.c:1750
SyS_sendto+0x40/0x50 net/socket.c:1718
entry_SYSCALL_64_fastpath+0x1f/0xbe
RIP: 0033:0x4512e9
RSP: 002b:00007f43d740bc08 EFLAGS: 00000216 ORIG_RAX: 000000000000002c
RAX: ffffffffffffffda RBX: 00000000007180a8 RCX: 00000000004512e9
RDX: 000000000000002e RSI: 0000000020d08000 RDI: 0000000000000005
RBP: 0000000000000086 R08: 00000000209c1000 R09: 000000000000001c
R10: 0000000000040800 R11: 0000000000000216 R12: 00000000004b9c69
R13: 00000000ffffffff R14: 0000000000000005 R15: 00000000202c2000
Code: 9e 01 fe e9 c5 e8 ff ff e8 7f 9e 01 fe e9 4a ea ff ff 48 89 f7 e8 52 9e 01 fe e9 aa eb ff ff e8 a8 b6 cf fd 0f 0b e8 a1 b6 cf fd <0f> 0b 49 8d 45 78 4d 8d 45 7c 48 89 85 78 fe ff ff 49 8d 85 ba
RIP: __skb_pull include/linux/skbuff.h:2064 [inline] RSP: ffff8801ac6bf570
RIP: __ip6_make_skb+0x18cf/0x1f70 net/ipv6/ip6_output.c:1617 RSP: ffff8801ac6bf570
Reported-by: syzbot <syzkaller(a)googlegroups.com>
Signed-off-by: Mike Maloney <maloney(a)google.com>
Reviewed-by: Eric Dumazet <edumazet(a)google.com>
Signed-off-by: David S. Miller <davem(a)davemloft.net>
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
---
net/ipv6/ip6_output.c | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -1214,14 +1214,16 @@ int ip6_append_data(struct sock *sk, int
np->cork.tclass = tclass;
if (rt->dst.flags & DST_XFRM_TUNNEL)
mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
- rt->dst.dev->mtu : dst_mtu(&rt->dst);
+ READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst);
else
mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
- rt->dst.dev->mtu : dst_mtu(rt->dst.path);
+ READ_ONCE(rt->dst.dev->mtu) : dst_mtu(rt->dst.path);
if (np->frag_size < mtu) {
if (np->frag_size)
mtu = np->frag_size;
}
+ if (mtu < IPV6_MIN_MTU)
+ return -EINVAL;
cork->fragsize = mtu;
if (dst_allfrag(rt->dst.path))
cork->flags |= IPCORK_ALLFRAG;
Patches currently in stable-queue which might be from maloney(a)google.com are
queue-3.18/ipv6-fix-udpv6-sendmsg-crash-caused-by-too-small-mtu.patch
This is a note to let you know that I've just added the patch titled
ipv4: Make neigh lookup keys for loopback/point-to-point devices be INADDR_ANY
to the 3.18-stable tree which can be found at:
http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=summary
The filename of the patch is:
ipv4-make-neigh-lookup-keys-for-loopback-point-to-point-devices-be-inaddr_any.patch
and it can be found in the queue-3.18 subdirectory.
If you, or anyone else, feels it should not be added to the stable tree,
please let <stable(a)vger.kernel.org> know about it.
>From foo@baz Mon Jan 29 11:15:06 CET 2018
From: Jim Westfall <jwestfall(a)surrealistic.net>
Date: Sun, 14 Jan 2018 04:18:51 -0800
Subject: ipv4: Make neigh lookup keys for loopback/point-to-point devices be INADDR_ANY
From: Jim Westfall <jwestfall(a)surrealistic.net>
[ Upstream commit cd9ff4de0107c65d69d02253bb25d6db93c3dbc1 ]
Map all lookup neigh keys to INADDR_ANY for loopback/point-to-point devices
to avoid making an entry for every remote ip the device needs to talk to.
This used to be the old behavior but became broken in a263b3093641f
(ipv4: Make neigh lookups directly in output packet path) and later removed
in 0bb4087cbec0 (ipv4: Fix neigh lookup keying over loopback/point-to-point
devices) because it was broken.
Signed-off-by: Jim Westfall <jwestfall(a)surrealistic.net>
Signed-off-by: David S. Miller <davem(a)davemloft.net>
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
---
include/net/arp.h | 3 +++
net/ipv4/arp.c | 7 ++++++-
2 files changed, 9 insertions(+), 1 deletion(-)
--- a/include/net/arp.h
+++ b/include/net/arp.h
@@ -37,6 +37,9 @@ static inline struct neighbour *__ipv4_n
{
struct neighbour *n;
+ if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT))
+ key = INADDR_ANY;
+
rcu_read_lock_bh();
n = __ipv4_neigh_lookup_noref(dev, key);
if (n && !atomic_inc_not_zero(&n->refcnt))
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -221,11 +221,16 @@ static u32 arp_hash(const void *pkey,
static int arp_constructor(struct neighbour *neigh)
{
- __be32 addr = *(__be32 *)neigh->primary_key;
+ __be32 addr;
struct net_device *dev = neigh->dev;
struct in_device *in_dev;
struct neigh_parms *parms;
+ u32 inaddr_any = INADDR_ANY;
+ if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT))
+ memcpy(neigh->primary_key, &inaddr_any, arp_tbl.key_len);
+
+ addr = *(__be32 *)neigh->primary_key;
rcu_read_lock();
in_dev = __in_dev_get_rcu(dev);
if (in_dev == NULL) {
Patches currently in stable-queue which might be from jwestfall(a)surrealistic.net are
queue-3.18/ipv4-make-neigh-lookup-keys-for-loopback-point-to-point-devices-be-inaddr_any.patch
queue-3.18/net-allow-neigh-contructor-functions-ability-to-modify-the-primary_key.patch
This is a note to let you know that I've just added the patch titled
dccp: don't restart ccid2_hc_tx_rto_expire() if sk in closed state
to the 3.18-stable tree which can be found at:
http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=summary
The filename of the patch is:
dccp-don-t-restart-ccid2_hc_tx_rto_expire-if-sk-in-closed-state.patch
and it can be found in the queue-3.18 subdirectory.
If you, or anyone else, feels it should not be added to the stable tree,
please let <stable(a)vger.kernel.org> know about it.
>From foo@baz Mon Jan 29 11:15:06 CET 2018
From: Alexey Kodanev <alexey.kodanev(a)oracle.com>
Date: Fri, 26 Jan 2018 15:14:16 +0300
Subject: dccp: don't restart ccid2_hc_tx_rto_expire() if sk in closed state
From: Alexey Kodanev <alexey.kodanev(a)oracle.com>
[ Upstream commit dd5684ecae3bd8e44b644f50e2c12c7e57fdfef5 ]
ccid2_hc_tx_rto_expire() timer callback always restarts the timer
again and can run indefinitely (unless it is stopped outside), and after
commit 120e9dabaf55 ("dccp: defer ccid_hc_tx_delete() at dismantle time"),
which moved ccid_hc_tx_delete() (also includes sk_stop_timer()) from
dccp_destroy_sock() to sk_destruct(), this started to happen quite often.
The timer prevents releasing the socket, as a result, sk_destruct() won't
be called.
Found with LTP/dccp_ipsec tests running on the bonding device,
which later couldn't be unloaded after the tests were completed:
unregister_netdevice: waiting for bond0 to become free. Usage count = 148
Fixes: 2a91aa396739 ("[DCCP] CCID2: Initial CCID2 (TCP-Like) implementation")
Signed-off-by: Alexey Kodanev <alexey.kodanev(a)oracle.com>
Reviewed-by: Eric Dumazet <edumazet(a)google.com>
Signed-off-by: David S. Miller <davem(a)davemloft.net>
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
---
net/dccp/ccids/ccid2.c | 3 +++
1 file changed, 3 insertions(+)
--- a/net/dccp/ccids/ccid2.c
+++ b/net/dccp/ccids/ccid2.c
@@ -140,6 +140,9 @@ static void ccid2_hc_tx_rto_expire(unsig
ccid2_pr_debug("RTO_EXPIRE\n");
+ if (sk->sk_state == DCCP_CLOSED)
+ goto out;
+
/* back-off timer */
hc->tx_rto <<= 1;
if (hc->tx_rto > DCCP_RTO_MAX)
Patches currently in stable-queue which might be from alexey.kodanev(a)oracle.com are
queue-3.18/dccp-don-t-restart-ccid2_hc_tx_rto_expire-if-sk-in-closed-state.patch
This is a note to let you know that I've just added the patch titled
net: tcp: close sock if net namespace is exiting
to the 4.4-stable tree which can be found at:
http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=summary
The filename of the patch is:
net-tcp-close-sock-if-net-namespace-is-exiting.patch
and it can be found in the queue-4.4 subdirectory.
If you, or anyone else, feels it should not be added to the stable tree,
please let <stable(a)vger.kernel.org> know about it.
>From foo@baz Mon Jan 29 10:14:57 CET 2018
From: Dan Streetman <ddstreet(a)ieee.org>
Date: Thu, 18 Jan 2018 16:14:26 -0500
Subject: net: tcp: close sock if net namespace is exiting
From: Dan Streetman <ddstreet(a)ieee.org>
[ Upstream commit 4ee806d51176ba7b8ff1efd81f271d7252e03a1d ]
When a tcp socket is closed, if it detects that its net namespace is
exiting, close immediately and do not wait for FIN sequence.
For normal sockets, a reference is taken to their net namespace, so it will
never exit while the socket is open. However, kernel sockets do not take a
reference to their net namespace, so it may begin exiting while the kernel
socket is still open. In this case if the kernel socket is a tcp socket,
it will stay open trying to complete its close sequence. The sock's dst(s)
hold a reference to their interface, which are all transferred to the
namespace's loopback interface when the real interfaces are taken down.
When the namespace tries to take down its loopback interface, it hangs
waiting for all references to the loopback interface to release, which
results in messages like:
unregister_netdevice: waiting for lo to become free. Usage count = 1
These messages continue until the socket finally times out and closes.
Since the net namespace cleanup holds the net_mutex while calling its
registered pernet callbacks, any new net namespace initialization is
blocked until the current net namespace finishes exiting.
After this change, the tcp socket notices the exiting net namespace, and
closes immediately, releasing its dst(s) and their reference to the
loopback interface, which lets the net namespace continue exiting.
Link: https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1711407
Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=97811
Signed-off-by: Dan Streetman <ddstreet(a)canonical.com>
Signed-off-by: David S. Miller <davem(a)davemloft.net>
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
---
include/net/net_namespace.h | 10 ++++++++++
net/ipv4/tcp.c | 3 +++
net/ipv4/tcp_timer.c | 15 +++++++++++++++
3 files changed, 28 insertions(+)
--- a/include/net/net_namespace.h
+++ b/include/net/net_namespace.h
@@ -209,6 +209,11 @@ int net_eq(const struct net *net1, const
return net1 == net2;
}
+static inline int check_net(const struct net *net)
+{
+ return atomic_read(&net->count) != 0;
+}
+
void net_drop_ns(void *);
#else
@@ -232,6 +237,11 @@ int net_eq(const struct net *net1, const
{
return 1;
}
+
+static inline int check_net(const struct net *net)
+{
+ return 1;
+}
#define net_drop_ns NULL
#endif
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2176,6 +2176,9 @@ adjudge_to_death:
tcp_send_active_reset(sk, GFP_ATOMIC);
NET_INC_STATS_BH(sock_net(sk),
LINUX_MIB_TCPABORTONMEMORY);
+ } else if (!check_net(sock_net(sk))) {
+ /* Not possible to send reset; just close */
+ tcp_set_state(sk, TCP_CLOSE);
}
}
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -46,11 +46,19 @@ static void tcp_write_err(struct sock *s
* to prevent DoS attacks. It is called when a retransmission timeout
* or zero probe timeout occurs on orphaned socket.
*
+ * Also close if our net namespace is exiting; in that case there is no
+ * hope of ever communicating again since all netns interfaces are already
+ * down (or about to be down), and we need to release our dst references,
+ * which have been moved to the netns loopback interface, so the namespace
+ * can finish exiting. This condition is only possible if we are a kernel
+ * socket, as those do not hold references to the namespace.
+ *
* Criteria is still not confirmed experimentally and may change.
* We kill the socket, if:
* 1. If number of orphaned sockets exceeds an administratively configured
* limit.
* 2. If we have strong memory pressure.
+ * 3. If our net namespace is exiting.
*/
static int tcp_out_of_resources(struct sock *sk, bool do_reset)
{
@@ -79,6 +87,13 @@ static int tcp_out_of_resources(struct s
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONMEMORY);
return 1;
}
+
+ if (!check_net(sock_net(sk))) {
+ /* Not possible to send reset; just close */
+ tcp_done(sk);
+ return 1;
+ }
+
return 0;
}
Patches currently in stable-queue which might be from ddstreet(a)ieee.org are
queue-4.4/net-tcp-close-sock-if-net-namespace-is-exiting.patch
This is a note to let you know that I've just added the patch titled
vmxnet3: repair memory leak
to the 4.4-stable tree which can be found at:
http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=summary
The filename of the patch is:
vmxnet3-repair-memory-leak.patch
and it can be found in the queue-4.4 subdirectory.
If you, or anyone else, feels it should not be added to the stable tree,
please let <stable(a)vger.kernel.org> know about it.
>From foo@baz Mon Jan 29 10:14:57 CET 2018
From: Neil Horman <nhorman(a)tuxdriver.com>
Date: Mon, 22 Jan 2018 16:06:37 -0500
Subject: vmxnet3: repair memory leak
From: Neil Horman <nhorman(a)tuxdriver.com>
[ Upstream commit 848b159835ddef99cc4193083f7e786c3992f580 ]
with the introduction of commit
b0eb57cb97e7837ebb746404c2c58c6f536f23fa, it appears that rq->buf_info
is improperly handled. While it is heap allocated when an rx queue is
setup, and freed when torn down, an old line of code in
vmxnet3_rq_destroy was not properly removed, leading to rq->buf_info[0]
being set to NULL prior to its being freed, causing a memory leak, which
eventually exhausts the system on repeated create/destroy operations
(for example, when the mtu of a vmxnet3 interface is changed
frequently).
The fix is pretty straightforward: just move the NULL assignment to after
the free.
Tested by myself with successful results
Applies to net, and should likely be queued for stable, please
Signed-off-by: Neil Horman <nhorman(a)tuxdriver.com>
Reported-By: boyang(a)redhat.com
CC: boyang(a)redhat.com
CC: Shrikrishna Khare <skhare(a)vmware.com>
CC: "VMware, Inc." <pv-drivers(a)vmware.com>
CC: David S. Miller <davem(a)davemloft.net>
Acked-by: Shrikrishna Khare <skhare(a)vmware.com>
Signed-off-by: David S. Miller <davem(a)davemloft.net>
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
---
drivers/net/vmxnet3/vmxnet3_drv.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
--- a/drivers/net/vmxnet3/vmxnet3_drv.c
+++ b/drivers/net/vmxnet3/vmxnet3_drv.c
@@ -1563,7 +1563,6 @@ static void vmxnet3_rq_destroy(struct vm
rq->rx_ring[i].basePA);
rq->rx_ring[i].base = NULL;
}
- rq->buf_info[i] = NULL;
}
if (rq->comp_ring.base) {
@@ -1578,6 +1577,7 @@ static void vmxnet3_rq_destroy(struct vm
(rq->rx_ring[0].size + rq->rx_ring[1].size);
dma_free_coherent(&adapter->pdev->dev, sz, rq->buf_info[0],
rq->buf_info_pa);
+ rq->buf_info[0] = rq->buf_info[1] = NULL;
}
}
Patches currently in stable-queue which might be from nhorman(a)tuxdriver.com are
queue-4.4/sctp-do-not-allow-the-v4-socket-to-bind-a-v4mapped-v6-address.patch
queue-4.4/vmxnet3-repair-memory-leak.patch
queue-4.4/sctp-return-error-if-the-asoc-has-been-peeled-off-in-sctp_wait_for_sndbuf.patch
This is a note to let you know that I've just added the patch titled
sctp: return error if the asoc has been peeled off in sctp_wait_for_sndbuf
to the 4.4-stable tree which can be found at:
http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=summary
The filename of the patch is:
sctp-return-error-if-the-asoc-has-been-peeled-off-in-sctp_wait_for_sndbuf.patch
and it can be found in the queue-4.4 subdirectory.
If you, or anyone else, feels it should not be added to the stable tree,
please let <stable(a)vger.kernel.org> know about it.
>From foo@baz Mon Jan 29 10:14:57 CET 2018
From: Xin Long <lucien.xin(a)gmail.com>
Date: Mon, 15 Jan 2018 17:01:36 +0800
Subject: sctp: return error if the asoc has been peeled off in sctp_wait_for_sndbuf
From: Xin Long <lucien.xin(a)gmail.com>
[ Upstream commit a0ff660058b88d12625a783ce9e5c1371c87951f ]
After commit cea0cc80a677 ("sctp: use the right sk after waking up from
wait_buf sleep"), it may change to lock another sk if the asoc has been
peeled off in sctp_wait_for_sndbuf.
However, the asoc's new sk could be already closed elsewhere, as it's in
the sendmsg context of the old sk that can't avoid the new sk's closing.
If the sk's last refcnt is held by this asoc, then later on, after putting
this asoc, the new sk will be freed while under its own lock.
This patch is to revert that commit, but fix the old issue by returning
error under the old sk's lock.
Fixes: cea0cc80a677 ("sctp: use the right sk after waking up from wait_buf sleep")
Reported-by: syzbot+ac6ea7baa4432811eb50(a)syzkaller.appspotmail.com
Signed-off-by: Xin Long <lucien.xin(a)gmail.com>
Acked-by: Neil Horman <nhorman(a)tuxdriver.com>
Signed-off-by: David S. Miller <davem(a)davemloft.net>
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
---
net/sctp/socket.c | 16 ++++++----------
1 file changed, 6 insertions(+), 10 deletions(-)
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -83,7 +83,7 @@
static int sctp_writeable(struct sock *sk);
static void sctp_wfree(struct sk_buff *skb);
static int sctp_wait_for_sndbuf(struct sctp_association *asoc, long *timeo_p,
- size_t msg_len, struct sock **orig_sk);
+ size_t msg_len);
static int sctp_wait_for_packet(struct sock *sk, int *err, long *timeo_p);
static int sctp_wait_for_connect(struct sctp_association *, long *timeo_p);
static int sctp_wait_for_accept(struct sock *sk, long timeo);
@@ -1952,7 +1952,7 @@ static int sctp_sendmsg(struct sock *sk,
timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
if (!sctp_wspace(asoc)) {
/* sk can be changed by peel off when waiting for buf. */
- err = sctp_wait_for_sndbuf(asoc, &timeo, msg_len, &sk);
+ err = sctp_wait_for_sndbuf(asoc, &timeo, msg_len);
if (err) {
if (err == -ESRCH) {
/* asoc is already dead. */
@@ -6974,12 +6974,12 @@ void sctp_sock_rfree(struct sk_buff *skb
/* Helper function to wait for space in the sndbuf. */
static int sctp_wait_for_sndbuf(struct sctp_association *asoc, long *timeo_p,
- size_t msg_len, struct sock **orig_sk)
+ size_t msg_len)
{
struct sock *sk = asoc->base.sk;
- int err = 0;
long current_timeo = *timeo_p;
DEFINE_WAIT(wait);
+ int err = 0;
pr_debug("%s: asoc:%p, timeo:%ld, msg_len:%zu\n", __func__, asoc,
*timeo_p, msg_len);
@@ -7008,17 +7008,13 @@ static int sctp_wait_for_sndbuf(struct s
release_sock(sk);
current_timeo = schedule_timeout(current_timeo);
lock_sock(sk);
- if (sk != asoc->base.sk) {
- release_sock(sk);
- sk = asoc->base.sk;
- lock_sock(sk);
- }
+ if (sk != asoc->base.sk)
+ goto do_error;
*timeo_p = current_timeo;
}
out:
- *orig_sk = sk;
finish_wait(&asoc->wait, &wait);
/* Release the association's refcnt. */
Patches currently in stable-queue which might be from lucien.xin(a)gmail.com are
queue-4.4/sctp-do-not-allow-the-v4-socket-to-bind-a-v4mapped-v6-address.patch
queue-4.4/pppoe-take-needed_headroom-of-lower-device-into-account-on-xmit.patch
queue-4.4/sctp-return-error-if-the-asoc-has-been-peeled-off-in-sctp_wait_for_sndbuf.patch
This is a note to let you know that I've just added the patch titled
tcp: __tcp_hdrlen() helper
to the 4.4-stable tree which can be found at:
http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=summary
The filename of the patch is:
tcp-__tcp_hdrlen-helper.patch
and it can be found in the queue-4.4 subdirectory.
If you, or anyone else, feels it should not be added to the stable tree,
please let <stable(a)vger.kernel.org> know about it.
>From d9b3fca27385eafe61c3ca6feab6cb1e7dc77482 Mon Sep 17 00:00:00 2001
From: Craig Gallek <kraig(a)google.com>
Date: Wed, 10 Feb 2016 11:50:37 -0500
Subject: tcp: __tcp_hdrlen() helper
From: Craig Gallek <kraig(a)google.com>
commit d9b3fca27385eafe61c3ca6feab6cb1e7dc77482 upstream.
tcp_hdrlen is wasteful if you already have a pointer to struct tcphdr.
This splits the size calculation into a helper function that can be
used if a struct tcphdr is already available.
Signed-off-by: Craig Gallek <kraig(a)google.com>
Signed-off-by: David S. Miller <davem(a)davemloft.net>
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
---
include/linux/tcp.h | 7 ++++++-
1 file changed, 6 insertions(+), 1 deletion(-)
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -29,9 +29,14 @@ static inline struct tcphdr *tcp_hdr(con
return (struct tcphdr *)skb_transport_header(skb);
}
+static inline unsigned int __tcp_hdrlen(const struct tcphdr *th)
+{
+ return th->doff * 4;
+}
+
static inline unsigned int tcp_hdrlen(const struct sk_buff *skb)
{
- return tcp_hdr(skb)->doff * 4;
+ return __tcp_hdrlen(tcp_hdr(skb));
}
static inline struct tcphdr *inner_tcp_hdr(const struct sk_buff *skb)
Patches currently in stable-queue which might be from kraig(a)google.com are
queue-4.4/tcp-__tcp_hdrlen-helper.patch
This is a note to let you know that I've just added the patch titled
r8169: fix memory corruption on retrieval of hardware statistics.
to the 4.4-stable tree which can be found at:
http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=summary
The filename of the patch is:
r8169-fix-memory-corruption-on-retrieval-of-hardware-statistics.patch
and it can be found in the queue-4.4 subdirectory.
If you, or anyone else, feels it should not be added to the stable tree,
please let <stable(a)vger.kernel.org> know about it.
>From foo@baz Mon Jan 29 10:14:57 CET 2018
From: Francois Romieu <romieu(a)fr.zoreil.com>
Date: Fri, 26 Jan 2018 01:53:26 +0100
Subject: r8169: fix memory corruption on retrieval of hardware statistics.
From: Francois Romieu <romieu(a)fr.zoreil.com>
[ Upstream commit a78e93661c5fd30b9e1dee464b2f62f966883ef7 ]
Hardware statistics retrieval hurts in tight invocation loops.
Avoid extraneous write and enforce strict ordering of writes targeted to
the tally counters dump area address registers.
Signed-off-by: Francois Romieu <romieu(a)fr.zoreil.com>
Tested-by: Oliver Freyermuth <o.freyermuth(a)googlemail.com>
Signed-off-by: David S. Miller <davem(a)davemloft.net>
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
---
drivers/net/ethernet/realtek/r8169.c | 9 ++-------
1 file changed, 2 insertions(+), 7 deletions(-)
--- a/drivers/net/ethernet/realtek/r8169.c
+++ b/drivers/net/ethernet/realtek/r8169.c
@@ -2205,19 +2205,14 @@ static bool rtl8169_do_counters(struct n
void __iomem *ioaddr = tp->mmio_addr;
dma_addr_t paddr = tp->counters_phys_addr;
u32 cmd;
- bool ret;
RTL_W32(CounterAddrHigh, (u64)paddr >> 32);
+ RTL_R32(CounterAddrHigh);
cmd = (u64)paddr & DMA_BIT_MASK(32);
RTL_W32(CounterAddrLow, cmd);
RTL_W32(CounterAddrLow, cmd | counter_cmd);
- ret = rtl_udelay_loop_wait_low(tp, &rtl_counters_cond, 10, 1000);
-
- RTL_W32(CounterAddrLow, 0);
- RTL_W32(CounterAddrHigh, 0);
-
- return ret;
+ return rtl_udelay_loop_wait_low(tp, &rtl_counters_cond, 10, 1000);
}
static bool rtl8169_reset_counters(struct net_device *dev)
Patches currently in stable-queue which might be from romieu(a)fr.zoreil.com are
queue-4.4/r8169-fix-memory-corruption-on-retrieval-of-hardware-statistics.patch
This is a note to let you know that I've just added the patch titled
sctp: do not allow the v4 socket to bind a v4mapped v6 address
to the 4.4-stable tree which can be found at:
http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=summary
The filename of the patch is:
sctp-do-not-allow-the-v4-socket-to-bind-a-v4mapped-v6-address.patch
and it can be found in the queue-4.4 subdirectory.
If you, or anyone else, feels it should not be added to the stable tree,
please let <stable(a)vger.kernel.org> know about it.
>From foo@baz Mon Jan 29 10:14:57 CET 2018
From: Xin Long <lucien.xin(a)gmail.com>
Date: Mon, 15 Jan 2018 17:02:00 +0800
Subject: sctp: do not allow the v4 socket to bind a v4mapped v6 address
From: Xin Long <lucien.xin(a)gmail.com>
[ Upstream commit c5006b8aa74599ce19104b31d322d2ea9ff887cc ]
The check in sctp_sockaddr_af is not robust enough to forbid binding a
v4mapped v6 addr on a v4 socket.
The worse thing is that v4 socket's bind_verify would not convert this
v4mapped v6 addr to a v4 addr. syzbot even reported a crash as the v4
socket bound a v6 addr.
This patch is to fix it by doing the common sa.sa_family check first,
then AF_INET check for v4mapped v6 addrs.
Fixes: 7dab83de50c7 ("sctp: Support ipv6only AF_INET6 sockets.")
Reported-by: syzbot+7b7b518b1228d2743963(a)syzkaller.appspotmail.com
Acked-by: Neil Horman <nhorman(a)tuxdriver.com>
Signed-off-by: Xin Long <lucien.xin(a)gmail.com>
Acked-by: Marcelo Ricardo Leitner <marcelo.leitner(a)gmail.com>
Signed-off-by: David S. Miller <davem(a)davemloft.net>
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
---
net/sctp/socket.c | 14 ++++++--------
1 file changed, 6 insertions(+), 8 deletions(-)
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -332,16 +332,14 @@ static struct sctp_af *sctp_sockaddr_af(
if (len < sizeof (struct sockaddr))
return NULL;
+ if (!opt->pf->af_supported(addr->sa.sa_family, opt))
+ return NULL;
+
/* V4 mapped address are really of AF_INET family */
if (addr->sa.sa_family == AF_INET6 &&
- ipv6_addr_v4mapped(&addr->v6.sin6_addr)) {
- if (!opt->pf->af_supported(AF_INET, opt))
- return NULL;
- } else {
- /* Does this PF support this AF? */
- if (!opt->pf->af_supported(addr->sa.sa_family, opt))
- return NULL;
- }
+ ipv6_addr_v4mapped(&addr->v6.sin6_addr) &&
+ !opt->pf->af_supported(AF_INET, opt))
+ return NULL;
/* If we get this far, af is valid. */
af = sctp_get_af_specific(addr->sa.sa_family);
Patches currently in stable-queue which might be from lucien.xin(a)gmail.com are
queue-4.4/sctp-do-not-allow-the-v4-socket-to-bind-a-v4mapped-v6-address.patch
queue-4.4/pppoe-take-needed_headroom-of-lower-device-into-account-on-xmit.patch
queue-4.4/sctp-return-error-if-the-asoc-has-been-peeled-off-in-sctp_wait_for_sndbuf.patch
This is a note to let you know that I've just added the patch titled
pppoe: take ->needed_headroom of lower device into account on xmit
to the 4.4-stable tree which can be found at:
http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=sum…
The filename of the patch is:
pppoe-take-needed_headroom-of-lower-device-into-account-on-xmit.patch
and it can be found in the queue-4.4 subdirectory.
If you, or anyone else, feels it should not be added to the stable tree,
please let <stable(a)vger.kernel.org> know about it.
>From foo@baz Mon Jan 29 10:14:57 CET 2018
From: Guillaume Nault <g.nault(a)alphalink.fr>
Date: Mon, 22 Jan 2018 18:06:37 +0100
Subject: pppoe: take ->needed_headroom of lower device into account on xmit
From: Guillaume Nault <g.nault(a)alphalink.fr>
[ Upstream commit 02612bb05e51df8489db5e94d0cf8d1c81f87b0c ]
In pppoe_sendmsg(), reserving dev->hard_header_len bytes of headroom
was probably fine before the introduction of ->needed_headroom in
commit f5184d267c1a ("net: Allow netdevices to specify needed head/tailroom").
But now, virtual devices typically advertise the size of their overhead
in dev->needed_headroom, so we must also take it into account in
skb_reserve().
Allocation size of skb is also updated to take dev->needed_tailroom
into account and replace the arbitrary 32 bytes with the real size of
a PPPoE header.
This issue was discovered by syzbot, who connected a pppoe socket to a
gre device which had dev->header_ops->create == ipgre_header and
dev->hard_header_len == 0. Therefore, PPPoE didn't reserve any
headroom, and dev_hard_header() crashed when ipgre_header() tried to
prepend its header to skb->data.
skbuff: skb_under_panic: text:000000001d390b3a len:31 put:24
head:00000000d8ed776f data:000000008150e823 tail:0x7 end:0xc0 dev:gre0
------------[ cut here ]------------
kernel BUG at net/core/skbuff.c:104!
invalid opcode: 0000 [#1] SMP KASAN
Dumping ftrace buffer:
(ftrace buffer empty)
Modules linked in:
CPU: 1 PID: 3670 Comm: syzkaller801466 Not tainted
4.15.0-rc7-next-20180115+ #97
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS
Google 01/01/2011
RIP: 0010:skb_panic+0x162/0x1f0 net/core/skbuff.c:100
RSP: 0018:ffff8801d9bd7840 EFLAGS: 00010282
RAX: 0000000000000083 RBX: ffff8801d4f083c0 RCX: 0000000000000000
RDX: 0000000000000083 RSI: 1ffff1003b37ae92 RDI: ffffed003b37aefc
RBP: ffff8801d9bd78a8 R08: 1ffff1003b37ae8a R09: 0000000000000000
R10: 0000000000000001 R11: 0000000000000000 R12: ffffffff86200de0
R13: ffffffff84a981ad R14: 0000000000000018 R15: ffff8801d2d34180
FS: 00000000019c4880(0000) GS:ffff8801db300000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00000000208bc000 CR3: 00000001d9111001 CR4: 00000000001606e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
skb_under_panic net/core/skbuff.c:114 [inline]
skb_push+0xce/0xf0 net/core/skbuff.c:1714
ipgre_header+0x6d/0x4e0 net/ipv4/ip_gre.c:879
dev_hard_header include/linux/netdevice.h:2723 [inline]
pppoe_sendmsg+0x58e/0x8b0 drivers/net/ppp/pppoe.c:890
sock_sendmsg_nosec net/socket.c:630 [inline]
sock_sendmsg+0xca/0x110 net/socket.c:640
sock_write_iter+0x31a/0x5d0 net/socket.c:909
call_write_iter include/linux/fs.h:1775 [inline]
do_iter_readv_writev+0x525/0x7f0 fs/read_write.c:653
do_iter_write+0x154/0x540 fs/read_write.c:932
vfs_writev+0x18a/0x340 fs/read_write.c:977
do_writev+0xfc/0x2a0 fs/read_write.c:1012
SYSC_writev fs/read_write.c:1085 [inline]
SyS_writev+0x27/0x30 fs/read_write.c:1082
entry_SYSCALL_64_fastpath+0x29/0xa0
Admittedly PPPoE shouldn't be allowed to run on non Ethernet-like
interfaces, but reserving space for ->needed_headroom is a more
fundamental issue that needs to be addressed first.
Same problem exists for __pppoe_xmit(), which also needs to take
dev->needed_headroom into account in skb_cow_head().
Fixes: f5184d267c1a ("net: Allow netdevices to specify needed head/tailroom")
Reported-by: syzbot+ed0838d0fa4c4f2b528e20286e6dc63effc7c14d(a)syzkaller.appspotmail.com
Signed-off-by: Guillaume Nault <g.nault(a)alphalink.fr>
Reviewed-by: Xin Long <lucien.xin(a)gmail.com>
Signed-off-by: David S. Miller <davem(a)davemloft.net>
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
---
drivers/net/ppp/pppoe.c | 11 ++++++-----
1 file changed, 6 insertions(+), 5 deletions(-)
--- a/drivers/net/ppp/pppoe.c
+++ b/drivers/net/ppp/pppoe.c
@@ -860,6 +860,7 @@ static int pppoe_sendmsg(struct socket *
struct pppoe_hdr *ph;
struct net_device *dev;
char *start;
+ int hlen;
lock_sock(sk);
if (sock_flag(sk, SOCK_DEAD) || !(sk->sk_state & PPPOX_CONNECTED)) {
@@ -878,16 +879,16 @@ static int pppoe_sendmsg(struct socket *
if (total_len > (dev->mtu + dev->hard_header_len))
goto end;
-
- skb = sock_wmalloc(sk, total_len + dev->hard_header_len + 32,
- 0, GFP_KERNEL);
+ hlen = LL_RESERVED_SPACE(dev);
+ skb = sock_wmalloc(sk, hlen + sizeof(*ph) + total_len +
+ dev->needed_tailroom, 0, GFP_KERNEL);
if (!skb) {
error = -ENOMEM;
goto end;
}
/* Reserve space for headers. */
- skb_reserve(skb, dev->hard_header_len);
+ skb_reserve(skb, hlen);
skb_reset_network_header(skb);
skb->dev = dev;
@@ -948,7 +949,7 @@ static int __pppoe_xmit(struct sock *sk,
/* Copy the data if there is no space for the header or if it's
* read-only.
*/
- if (skb_cow_head(skb, sizeof(*ph) + dev->hard_header_len))
+ if (skb_cow_head(skb, LL_RESERVED_SPACE(dev) + sizeof(*ph)))
goto abort;
__skb_push(skb, sizeof(*ph));
Patches currently in stable-queue which might be from g.nault(a)alphalink.fr are
queue-4.4/pppoe-take-needed_headroom-of-lower-device-into-account-on-xmit.patch
This is a note to let you know that I've just added the patch titled
net: qdisc_pkt_len_init() should be more robust
to the 4.4-stable tree which can be found at:
http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=sum…
The filename of the patch is:
net-qdisc_pkt_len_init-should-be-more-robust.patch
and it can be found in the queue-4.4 subdirectory.
If you, or anyone else, feels it should not be added to the stable tree,
please let <stable(a)vger.kernel.org> know about it.
>From foo@baz Mon Jan 29 10:14:57 CET 2018
From: Eric Dumazet <edumazet(a)google.com>
Date: Thu, 18 Jan 2018 19:59:19 -0800
Subject: net: qdisc_pkt_len_init() should be more robust
From: Eric Dumazet <edumazet(a)google.com>
[ Upstream commit 7c68d1a6b4db9012790af7ac0f0fdc0d2083422a ]
Without proper validation of DODGY packets, we might very well
feed qdisc_pkt_len_init() with invalid GSO packets.
tcp_hdrlen() might access out-of-bound data, so let's use
skb_header_pointer() and proper checks.
Whole story is described in commit d0c081b49137 ("flow_dissector:
properly cap thoff field")
We have the goal of validating DODGY packets earlier in the stack,
so we might very well revert this fix in the future.
Signed-off-by: Eric Dumazet <edumazet(a)google.com>
Cc: Willem de Bruijn <willemb(a)google.com>
Cc: Jason Wang <jasowang(a)redhat.com>
Reported-by: syzbot+9da69ebac7dddd804552(a)syzkaller.appspotmail.com
Acked-by: Jason Wang <jasowang(a)redhat.com>
Signed-off-by: David S. Miller <davem(a)davemloft.net>
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
---
net/core/dev.c | 19 +++++++++++++++----
1 file changed, 15 insertions(+), 4 deletions(-)
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2889,10 +2889,21 @@ static void qdisc_pkt_len_init(struct sk
hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
/* + transport layer */
- if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
- hdr_len += tcp_hdrlen(skb);
- else
- hdr_len += sizeof(struct udphdr);
+ if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) {
+ const struct tcphdr *th;
+ struct tcphdr _tcphdr;
+
+ th = skb_header_pointer(skb, skb_transport_offset(skb),
+ sizeof(_tcphdr), &_tcphdr);
+ if (likely(th))
+ hdr_len += __tcp_hdrlen(th);
+ } else {
+ struct udphdr _udphdr;
+
+ if (skb_header_pointer(skb, skb_transport_offset(skb),
+ sizeof(_udphdr), &_udphdr))
+ hdr_len += sizeof(struct udphdr);
+ }
if (shinfo->gso_type & SKB_GSO_DODGY)
gso_segs = DIV_ROUND_UP(skb->len - hdr_len,
Patches currently in stable-queue which might be from edumazet(a)google.com are
queue-4.4/ipv6-fix-udpv6-sendmsg-crash-caused-by-too-small-mtu.patch
queue-4.4/flow_dissector-properly-cap-thoff-field.patch
queue-4.4/ipv6-ip6_make_skb-needs-to-clear-cork.base.dst.patch
queue-4.4/dccp-don-t-restart-ccid2_hc_tx_rto_expire-if-sk-in-closed-state.patch
queue-4.4/netfilter-restart-search-if-moved-to-other-chain.patch
queue-4.4/net-qdisc_pkt_len_init-should-be-more-robust.patch
This is a note to let you know that I've just added the patch titled
net: igmp: fix source address check for IGMPv3 reports
to the 4.4-stable tree which can be found at:
http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=sum…
The filename of the patch is:
net-igmp-fix-source-address-check-for-igmpv3-reports.patch
and it can be found in the queue-4.4 subdirectory.
If you, or anyone else, feels it should not be added to the stable tree,
please let <stable(a)vger.kernel.org> know about it.
>From foo@baz Mon Jan 29 10:14:57 CET 2018
From: Felix Fietkau <nbd(a)nbd.name>
Date: Fri, 19 Jan 2018 11:50:46 +0100
Subject: net: igmp: fix source address check for IGMPv3 reports
From: Felix Fietkau <nbd(a)nbd.name>
[ Upstream commit ad23b750933ea7bf962678972a286c78a8fa36aa ]
Commit "net: igmp: Use correct source address on IGMPv3 reports"
introduced a check to validate the source address of locally generated
IGMPv3 packets.
Instead of checking the local interface address directly, it uses
inet_ifa_match(fl4->saddr, ifa), which checks if the address is on the
local subnet (or equal to the point-to-point address if used).
This breaks for point-to-point interfaces, so check against
ifa->ifa_local directly.
Cc: Kevin Cernekee <cernekee(a)chromium.org>
Fixes: a46182b00290 ("net: igmp: Use correct source address on IGMPv3 reports")
Reported-by: Sebastian Gottschall <s.gottschall(a)dd-wrt.com>
Signed-off-by: Felix Fietkau <nbd(a)nbd.name>
Signed-off-by: David S. Miller <davem(a)davemloft.net>
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
---
net/ipv4/igmp.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -338,7 +338,7 @@ static __be32 igmpv3_get_srcaddr(struct
return htonl(INADDR_ANY);
for_ifa(in_dev) {
- if (inet_ifa_match(fl4->saddr, ifa))
+ if (fl4->saddr == ifa->ifa_local)
return fl4->saddr;
} endfor_ifa(in_dev);
Patches currently in stable-queue which might be from nbd(a)nbd.name are
queue-4.4/net-igmp-fix-source-address-check-for-igmpv3-reports.patch
This is a note to let you know that I've just added the patch titled
net: Allow neigh contructor functions ability to modify the primary_key
to the 4.4-stable tree which can be found at:
http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=sum…
The filename of the patch is:
net-allow-neigh-contructor-functions-ability-to-modify-the-primary_key.patch
and it can be found in the queue-4.4 subdirectory.
If you, or anyone else, feels it should not be added to the stable tree,
please let <stable(a)vger.kernel.org> know about it.
>From foo@baz Mon Jan 29 10:14:57 CET 2018
From: Jim Westfall <jwestfall(a)surrealistic.net>
Date: Sun, 14 Jan 2018 04:18:50 -0800
Subject: net: Allow neigh contructor functions ability to modify the primary_key
From: Jim Westfall <jwestfall(a)surrealistic.net>
[ Upstream commit 096b9854c04df86f03b38a97d40b6506e5730919 ]
Use n->primary_key instead of pkey to account for the possibility that a neigh
constructor function may have modified the primary_key value.
Signed-off-by: Jim Westfall <jwestfall(a)surrealistic.net>
Signed-off-by: David S. Miller <davem(a)davemloft.net>
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
---
net/core/neighbour.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -496,7 +496,7 @@ struct neighbour *__neigh_create(struct
if (atomic_read(&tbl->entries) > (1 << nht->hash_shift))
nht = neigh_hash_grow(tbl, nht->hash_shift + 1);
- hash_val = tbl->hash(pkey, dev, nht->hash_rnd) >> (32 - nht->hash_shift);
+ hash_val = tbl->hash(n->primary_key, dev, nht->hash_rnd) >> (32 - nht->hash_shift);
if (n->parms->dead) {
rc = ERR_PTR(-EINVAL);
@@ -508,7 +508,7 @@ struct neighbour *__neigh_create(struct
n1 != NULL;
n1 = rcu_dereference_protected(n1->next,
lockdep_is_held(&tbl->lock))) {
- if (dev == n1->dev && !memcmp(n1->primary_key, pkey, key_len)) {
+ if (dev == n1->dev && !memcmp(n1->primary_key, n->primary_key, key_len)) {
if (want_ref)
neigh_hold(n1);
rc = n1;
Patches currently in stable-queue which might be from jwestfall(a)surrealistic.net are
queue-4.4/ipv4-make-neigh-lookup-keys-for-loopback-point-to-point-devices-be-inaddr_any.patch
queue-4.4/net-allow-neigh-contructor-functions-ability-to-modify-the-primary_key.patch
This is a note to let you know that I've just added the patch titled
ipv6: ip6_make_skb() needs to clear cork.base.dst
to the 4.4-stable tree which can be found at:
http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=sum…
The filename of the patch is:
ipv6-ip6_make_skb-needs-to-clear-cork.base.dst.patch
and it can be found in the queue-4.4 subdirectory.
If you, or anyone else, feels it should not be added to the stable tree,
please let <stable(a)vger.kernel.org> know about it.
>From foo@baz Mon Jan 29 10:14:57 CET 2018
From: Eric Dumazet <edumazet(a)google.com>
Date: Thu, 11 Jan 2018 22:31:18 -0800
Subject: ipv6: ip6_make_skb() needs to clear cork.base.dst
From: Eric Dumazet <edumazet(a)google.com>
[ Upstream commit 95ef498d977bf44ac094778fd448b98af158a3e6 ]
In my last patch, I missed the fact that cork.base.dst was not initialized
in ip6_make_skb() :
If ip6_setup_cork() returns an error, we might attempt a dst_release()
on some random pointer.
Fixes: 862c03ee1deb ("ipv6: fix possible mem leaks in ipv6_make_skb()")
Signed-off-by: Eric Dumazet <edumazet(a)google.com>
Reported-by: syzbot <syzkaller(a)googlegroups.com>
Signed-off-by: David S. Miller <davem(a)davemloft.net>
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
---
net/ipv6/ip6_output.c | 1 +
1 file changed, 1 insertion(+)
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -1785,6 +1785,7 @@ struct sk_buff *ip6_make_skb(struct sock
cork.base.flags = 0;
cork.base.addr = 0;
cork.base.opt = NULL;
+ cork.base.dst = NULL;
v6_cork.opt = NULL;
err = ip6_setup_cork(sk, &cork, &v6_cork, hlimit, tclass, opt, rt, fl6);
if (err) {
Patches currently in stable-queue which might be from edumazet(a)google.com are
queue-4.4/ipv6-fix-udpv6-sendmsg-crash-caused-by-too-small-mtu.patch
queue-4.4/flow_dissector-properly-cap-thoff-field.patch
queue-4.4/ipv6-ip6_make_skb-needs-to-clear-cork.base.dst.patch
queue-4.4/dccp-don-t-restart-ccid2_hc_tx_rto_expire-if-sk-in-closed-state.patch
queue-4.4/netfilter-restart-search-if-moved-to-other-chain.patch
queue-4.4/net-qdisc_pkt_len_init-should-be-more-robust.patch
This is a note to let you know that I've just added the patch titled
lan78xx: Fix failure in USB Full Speed
to the 4.4-stable tree which can be found at:
http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=sum…
The filename of the patch is:
lan78xx-fix-failure-in-usb-full-speed.patch
and it can be found in the queue-4.4 subdirectory.
If you, or anyone else, feels it should not be added to the stable tree,
please let <stable(a)vger.kernel.org> know about it.
>From foo@baz Mon Jan 29 10:14:57 CET 2018
From: Yuiko Oshino <yuiko.oshino(a)microchip.com>
Date: Mon, 15 Jan 2018 13:24:28 -0500
Subject: lan78xx: Fix failure in USB Full Speed
From: Yuiko Oshino <yuiko.oshino(a)microchip.com>
[ Upstream commit a5b1379afbfabf91e3a689e82ac619a7157336b3 ]
Initialize the previously uninitialized tx_qlen to an appropriate value when USB
Full Speed is used.
Fixes: 55d7de9de6c3 ("Microchip's LAN7800 family USB 2/3 to 10/100/1000 Ethernet device driver")
Signed-off-by: Yuiko Oshino <yuiko.oshino(a)microchip.com>
Signed-off-by: David S. Miller <davem(a)davemloft.net>
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
---
drivers/net/usb/lan78xx.c | 1 +
1 file changed, 1 insertion(+)
--- a/drivers/net/usb/lan78xx.c
+++ b/drivers/net/usb/lan78xx.c
@@ -1859,6 +1859,7 @@ static int lan78xx_reset(struct lan78xx_
buf = DEFAULT_BURST_CAP_SIZE / FS_USB_PKT_SIZE;
dev->rx_urb_size = DEFAULT_BURST_CAP_SIZE;
dev->rx_qlen = 4;
+ dev->tx_qlen = 4;
}
ret = lan78xx_write_reg(dev, BURST_CAP, buf);
Patches currently in stable-queue which might be from yuiko.oshino(a)microchip.com are
queue-4.4/lan78xx-fix-failure-in-usb-full-speed.patch
This is a note to let you know that I've just added the patch titled
ipv6: fix udpv6 sendmsg crash caused by too small MTU
to the 4.4-stable tree which can be found at:
http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=sum…
The filename of the patch is:
ipv6-fix-udpv6-sendmsg-crash-caused-by-too-small-mtu.patch
and it can be found in the queue-4.4 subdirectory.
If you, or anyone else, feels it should not be added to the stable tree,
please let <stable(a)vger.kernel.org> know about it.
>From foo@baz Mon Jan 29 10:14:57 CET 2018
From: Mike Maloney <maloney(a)google.com>
Date: Wed, 10 Jan 2018 12:45:10 -0500
Subject: ipv6: fix udpv6 sendmsg crash caused by too small MTU
From: Mike Maloney <maloney(a)google.com>
[ Upstream commit 749439bfac6e1a2932c582e2699f91d329658196 ]
The logic in __ip6_append_data() assumes that the MTU is at least large
enough for the headers. A device's MTU may be adjusted after being
added while sendmsg() is processing data, resulting in
__ip6_append_data() seeing any MTU. For an mtu smaller than the size of
the fragmentation header, the math results in a negative 'maxfraglen',
which causes problems when refragmenting any previous skb in the
skb_write_queue, leaving it possibly malformed.
Instead sendmsg returns EINVAL when the mtu is calculated to be less
than IPV6_MIN_MTU.
Found by syzkaller:
kernel BUG at ./include/linux/skbuff.h:2064!
invalid opcode: 0000 [#1] SMP KASAN
Dumping ftrace buffer:
(ftrace buffer empty)
Modules linked in:
CPU: 1 PID: 14216 Comm: syz-executor5 Not tainted 4.13.0-rc4+ #2
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
task: ffff8801d0b68580 task.stack: ffff8801ac6b8000
RIP: 0010:__skb_pull include/linux/skbuff.h:2064 [inline]
RIP: 0010:__ip6_make_skb+0x18cf/0x1f70 net/ipv6/ip6_output.c:1617
RSP: 0018:ffff8801ac6bf570 EFLAGS: 00010216
RAX: 0000000000010000 RBX: 0000000000000028 RCX: ffffc90003cce000
RDX: 00000000000001b8 RSI: ffffffff839df06f RDI: ffff8801d9478ca0
RBP: ffff8801ac6bf780 R08: ffff8801cc3f1dbc R09: 0000000000000000
R10: ffff8801ac6bf7a0 R11: 43cb4b7b1948a9e7 R12: ffff8801cc3f1dc8
R13: ffff8801cc3f1d40 R14: 0000000000001036 R15: dffffc0000000000
FS: 00007f43d740c700(0000) GS:ffff8801dc100000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00007f7834984000 CR3: 00000001d79b9000 CR4: 00000000001406e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
ip6_finish_skb include/net/ipv6.h:911 [inline]
udp_v6_push_pending_frames+0x255/0x390 net/ipv6/udp.c:1093
udpv6_sendmsg+0x280d/0x31a0 net/ipv6/udp.c:1363
inet_sendmsg+0x11f/0x5e0 net/ipv4/af_inet.c:762
sock_sendmsg_nosec net/socket.c:633 [inline]
sock_sendmsg+0xca/0x110 net/socket.c:643
SYSC_sendto+0x352/0x5a0 net/socket.c:1750
SyS_sendto+0x40/0x50 net/socket.c:1718
entry_SYSCALL_64_fastpath+0x1f/0xbe
RIP: 0033:0x4512e9
RSP: 002b:00007f43d740bc08 EFLAGS: 00000216 ORIG_RAX: 000000000000002c
RAX: ffffffffffffffda RBX: 00000000007180a8 RCX: 00000000004512e9
RDX: 000000000000002e RSI: 0000000020d08000 RDI: 0000000000000005
RBP: 0000000000000086 R08: 00000000209c1000 R09: 000000000000001c
R10: 0000000000040800 R11: 0000000000000216 R12: 00000000004b9c69
R13: 00000000ffffffff R14: 0000000000000005 R15: 00000000202c2000
Code: 9e 01 fe e9 c5 e8 ff ff e8 7f 9e 01 fe e9 4a ea ff ff 48 89 f7 e8 52 9e 01 fe e9 aa eb ff ff e8 a8 b6 cf fd 0f 0b e8 a1 b6 cf fd <0f> 0b 49 8d 45 78 4d 8d 45 7c 48 89 85 78 fe ff ff 49 8d 85 ba
RIP: __skb_pull include/linux/skbuff.h:2064 [inline] RSP: ffff8801ac6bf570
RIP: __ip6_make_skb+0x18cf/0x1f70 net/ipv6/ip6_output.c:1617 RSP: ffff8801ac6bf570
Reported-by: syzbot <syzkaller(a)googlegroups.com>
Signed-off-by: Mike Maloney <maloney(a)google.com>
Reviewed-by: Eric Dumazet <edumazet(a)google.com>
Signed-off-by: David S. Miller <davem(a)davemloft.net>
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
---
net/ipv6/ip6_output.c | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -1246,14 +1246,16 @@ static int ip6_setup_cork(struct sock *s
v6_cork->tclass = tclass;
if (rt->dst.flags & DST_XFRM_TUNNEL)
mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
- rt->dst.dev->mtu : dst_mtu(&rt->dst);
+ READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst);
else
mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
- rt->dst.dev->mtu : dst_mtu(rt->dst.path);
+ READ_ONCE(rt->dst.dev->mtu) : dst_mtu(rt->dst.path);
if (np->frag_size < mtu) {
if (np->frag_size)
mtu = np->frag_size;
}
+ if (mtu < IPV6_MIN_MTU)
+ return -EINVAL;
cork->base.fragsize = mtu;
if (dst_allfrag(rt->dst.path))
cork->base.flags |= IPCORK_ALLFRAG;
Patches currently in stable-queue which might be from maloney(a)google.com are
queue-4.4/ipv6-fix-udpv6-sendmsg-crash-caused-by-too-small-mtu.patch
This is a note to let you know that I've just added the patch titled
ipv6: Fix getsockopt() for sockets with default IPV6_AUTOFLOWLABEL
to the 4.4-stable tree which can be found at:
http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=sum…
The filename of the patch is:
ipv6-fix-getsockopt-for-sockets-with-default-ipv6_autoflowlabel.patch
and it can be found in the queue-4.4 subdirectory.
If you, or anyone else, feels it should not be added to the stable tree,
please let <stable(a)vger.kernel.org> know about it.
>From foo@baz Mon Jan 29 10:14:57 CET 2018
From: Ben Hutchings <ben.hutchings(a)codethink.co.uk>
Date: Mon, 22 Jan 2018 20:06:42 +0000
Subject: ipv6: Fix getsockopt() for sockets with default IPV6_AUTOFLOWLABEL
From: Ben Hutchings <ben.hutchings(a)codethink.co.uk>
[ Upstream commit e9191ffb65d8e159680ce0ad2224e1acbde6985c ]
Commit 513674b5a2c9 ("net: reevalulate autoflowlabel setting after
sysctl setting") removed the initialisation of
ipv6_pinfo::autoflowlabel and added a second flag to indicate
whether this field or the net namespace default should be used.
The getsockopt() handling for this case was not updated, so it
currently returns 0 for all sockets for which IPV6_AUTOFLOWLABEL is
not explicitly enabled. Fix it to return the effective value, whether
that has been set at the socket or net namespace level.
Fixes: 513674b5a2c9 ("net: reevalulate autoflowlabel setting after sysctl ...")
Signed-off-by: Ben Hutchings <ben.hutchings(a)codethink.co.uk>
Signed-off-by: David S. Miller <davem(a)davemloft.net>
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
---
include/net/ipv6.h | 1 +
net/ipv6/ip6_output.c | 2 +-
net/ipv6/ipv6_sockglue.c | 2 +-
3 files changed, 3 insertions(+), 2 deletions(-)
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -281,6 +281,7 @@ int ipv6_flowlabel_opt_get(struct sock *
int flags);
int ip6_flowlabel_init(void);
void ip6_flowlabel_cleanup(void);
+bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np);
static inline void fl6_sock_release(struct ip6_flowlabel *fl)
{
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -148,7 +148,7 @@ int ip6_output(struct net *net, struct s
!(IP6CB(skb)->flags & IP6SKB_REROUTED));
}
-static bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np)
+bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np)
{
if (!np->autoflowlabel_set)
return ip6_default_np_autolabel(net);
--- a/net/ipv6/ipv6_sockglue.c
+++ b/net/ipv6/ipv6_sockglue.c
@@ -1313,7 +1313,7 @@ static int do_ipv6_getsockopt(struct soc
break;
case IPV6_AUTOFLOWLABEL:
- val = np->autoflowlabel;
+ val = ip6_autoflowlabel(sock_net(sk), np);
break;
default:
Patches currently in stable-queue which might be from ben.hutchings(a)codethink.co.uk are
queue-4.4/vsyscall-fix-permissions-for-emulate-mode-with-kaiser-pti.patch
queue-4.4/ipv6-fix-getsockopt-for-sockets-with-default-ipv6_autoflowlabel.patch
queue-4.4/x86-microcode-intel-fix-bdw-late-loading-revision-check.patch
This is a note to let you know that I've just added the patch titled
ipv4: Make neigh lookup keys for loopback/point-to-point devices be INADDR_ANY
to the 4.4-stable tree which can be found at:
http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=sum…
The filename of the patch is:
ipv4-make-neigh-lookup-keys-for-loopback-point-to-point-devices-be-inaddr_any.patch
and it can be found in the queue-4.4 subdirectory.
If you, or anyone else, feels it should not be added to the stable tree,
please let <stable(a)vger.kernel.org> know about it.
>From foo@baz Mon Jan 29 10:14:57 CET 2018
From: Jim Westfall <jwestfall(a)surrealistic.net>
Date: Sun, 14 Jan 2018 04:18:51 -0800
Subject: ipv4: Make neigh lookup keys for loopback/point-to-point devices be INADDR_ANY
From: Jim Westfall <jwestfall(a)surrealistic.net>
[ Upstream commit cd9ff4de0107c65d69d02253bb25d6db93c3dbc1 ]
Map all lookup neigh keys to INADDR_ANY for loopback/point-to-point devices
to avoid making an entry for every remote ip the device needs to talk to.
This used to be the old behavior but became broken in a263b3093641f
(ipv4: Make neigh lookups directly in output packet path) and later removed
in 0bb4087cbec0 (ipv4: Fix neigh lookup keying over loopback/point-to-point
devices) because it was broken.
Signed-off-by: Jim Westfall <jwestfall(a)surrealistic.net>
Signed-off-by: David S. Miller <davem(a)davemloft.net>
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
---
include/net/arp.h | 3 +++
net/ipv4/arp.c | 7 ++++++-
2 files changed, 9 insertions(+), 1 deletion(-)
--- a/include/net/arp.h
+++ b/include/net/arp.h
@@ -19,6 +19,9 @@ static inline u32 arp_hashfn(const void
static inline struct neighbour *__ipv4_neigh_lookup_noref(struct net_device *dev, u32 key)
{
+ if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT))
+ key = INADDR_ANY;
+
return ___neigh_lookup_noref(&arp_tbl, neigh_key_eq32, arp_hashfn, &key, dev);
}
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -223,11 +223,16 @@ static bool arp_key_eq(const struct neig
static int arp_constructor(struct neighbour *neigh)
{
- __be32 addr = *(__be32 *)neigh->primary_key;
+ __be32 addr;
struct net_device *dev = neigh->dev;
struct in_device *in_dev;
struct neigh_parms *parms;
+ u32 inaddr_any = INADDR_ANY;
+ if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT))
+ memcpy(neigh->primary_key, &inaddr_any, arp_tbl.key_len);
+
+ addr = *(__be32 *)neigh->primary_key;
rcu_read_lock();
in_dev = __in_dev_get_rcu(dev);
if (!in_dev) {
Patches currently in stable-queue which might be from jwestfall(a)surrealistic.net are
queue-4.4/ipv4-make-neigh-lookup-keys-for-loopback-point-to-point-devices-be-inaddr_any.patch
queue-4.4/net-allow-neigh-contructor-functions-ability-to-modify-the-primary_key.patch
This is a note to let you know that I've just added the patch titled
flow_dissector: properly cap thoff field
to the 4.4-stable tree which can be found at:
http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=sum…
The filename of the patch is:
flow_dissector-properly-cap-thoff-field.patch
and it can be found in the queue-4.4 subdirectory.
If you, or anyone else, feels it should not be added to the stable tree,
please let <stable(a)vger.kernel.org> know about it.
>From foo@baz Mon Jan 29 10:14:57 CET 2018
From: Eric Dumazet <edumazet(a)google.com>
Date: Wed, 17 Jan 2018 14:21:13 -0800
Subject: flow_dissector: properly cap thoff field
From: Eric Dumazet <edumazet(a)google.com>
[ Upstream commit d0c081b49137cd3200f2023c0875723be66e7ce5 ]
syzbot reported yet another crash [1] that is caused by
insufficient validation of DODGY packets.
Three issues combine here to trigger the crash.
1) Flow dissection leaves with incorrect thoff field.
2) skb_probe_transport_header() sets transport header to this invalid
thoff, even if pointing after skb valid data.
3) qdisc_pkt_len_init() reads out-of-bound data because it
trusts tcp_hdrlen(skb)
Possible fixes :
- Full flow dissector validation before injecting bad DODGY packets in
the stack.
This approach was attempted here :
https://patchwork.ozlabs.org/patch/861874/
- Have more robust functions in the core.
This might be needed anyway for stable versions.
This patch fixes the flow dissection issue.
[1]
CPU: 1 PID: 3144 Comm: syzkaller271204 Not tainted 4.15.0-rc4-mm1+ #49
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
Call Trace:
__dump_stack lib/dump_stack.c:17 [inline]
dump_stack+0x194/0x257 lib/dump_stack.c:53
print_address_description+0x73/0x250 mm/kasan/report.c:256
kasan_report_error mm/kasan/report.c:355 [inline]
kasan_report+0x23b/0x360 mm/kasan/report.c:413
__asan_report_load2_noabort+0x14/0x20 mm/kasan/report.c:432
__tcp_hdrlen include/linux/tcp.h:35 [inline]
tcp_hdrlen include/linux/tcp.h:40 [inline]
qdisc_pkt_len_init net/core/dev.c:3160 [inline]
__dev_queue_xmit+0x20d3/0x2200 net/core/dev.c:3465
dev_queue_xmit+0x17/0x20 net/core/dev.c:3554
packet_snd net/packet/af_packet.c:2943 [inline]
packet_sendmsg+0x3ad5/0x60a0 net/packet/af_packet.c:2968
sock_sendmsg_nosec net/socket.c:628 [inline]
sock_sendmsg+0xca/0x110 net/socket.c:638
sock_write_iter+0x31a/0x5d0 net/socket.c:907
call_write_iter include/linux/fs.h:1776 [inline]
new_sync_write fs/read_write.c:469 [inline]
__vfs_write+0x684/0x970 fs/read_write.c:482
vfs_write+0x189/0x510 fs/read_write.c:544
SYSC_write fs/read_write.c:589 [inline]
SyS_write+0xef/0x220 fs/read_write.c:581
entry_SYSCALL_64_fastpath+0x1f/0x96
Fixes: 34fad54c2537 ("net: __skb_flow_dissect() must cap its return value")
Fixes: a6e544b0a88b ("flow_dissector: Jump to exit code in __skb_flow_dissect")
Signed-off-by: Eric Dumazet <edumazet(a)google.com>
Cc: Willem de Bruijn <willemb(a)google.com>
Reported-by: syzbot <syzkaller(a)googlegroups.com>
Acked-by: Jason Wang <jasowang(a)redhat.com>
Signed-off-by: David S. Miller <davem(a)davemloft.net>
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
---
net/core/flow_dissector.c | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -492,8 +492,8 @@ ip_proto_again:
out_good:
ret = true;
- key_control->thoff = (u16)nhoff;
out:
+ key_control->thoff = min_t(u16, nhoff, skb ? skb->len : hlen);
key_basic->n_proto = proto;
key_basic->ip_proto = ip_proto;
@@ -501,7 +501,6 @@ out:
out_bad:
ret = false;
- key_control->thoff = min_t(u16, nhoff, skb ? skb->len : hlen);
goto out;
}
EXPORT_SYMBOL(__skb_flow_dissect);
Patches currently in stable-queue which might be from edumazet(a)google.com are
queue-4.4/ipv6-fix-udpv6-sendmsg-crash-caused-by-too-small-mtu.patch
queue-4.4/flow_dissector-properly-cap-thoff-field.patch
queue-4.4/ipv6-ip6_make_skb-needs-to-clear-cork.base.dst.patch
queue-4.4/dccp-don-t-restart-ccid2_hc_tx_rto_expire-if-sk-in-closed-state.patch
queue-4.4/netfilter-restart-search-if-moved-to-other-chain.patch
queue-4.4/net-qdisc_pkt_len_init-should-be-more-robust.patch
This is a note to let you know that I've just added the patch titled
dccp: don't restart ccid2_hc_tx_rto_expire() if sk in closed state
to the 4.4-stable tree which can be found at:
http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=summary
The filename of the patch is:
dccp-don-t-restart-ccid2_hc_tx_rto_expire-if-sk-in-closed-state.patch
and it can be found in the queue-4.4 subdirectory.
If you, or anyone else, feels it should not be added to the stable tree,
please let <stable(a)vger.kernel.org> know about it.
>From foo@baz Mon Jan 29 10:14:57 CET 2018
From: Alexey Kodanev <alexey.kodanev(a)oracle.com>
Date: Fri, 26 Jan 2018 15:14:16 +0300
Subject: dccp: don't restart ccid2_hc_tx_rto_expire() if sk in closed state
From: Alexey Kodanev <alexey.kodanev(a)oracle.com>
[ Upstream commit dd5684ecae3bd8e44b644f50e2c12c7e57fdfef5 ]
ccid2_hc_tx_rto_expire() timer callback always restarts the timer
again and can run indefinitely (unless it is stopped outside), and after
commit 120e9dabaf55 ("dccp: defer ccid_hc_tx_delete() at dismantle time"),
which moved ccid_hc_tx_delete() (also includes sk_stop_timer()) from
dccp_destroy_sock() to sk_destruct(), this started to happen quite often.
The timer prevents releasing the socket, as a result, sk_destruct() won't
be called.
Found with LTP/dccp_ipsec tests running on the bonding device,
which later couldn't be unloaded after the tests were completed:
unregister_netdevice: waiting for bond0 to become free. Usage count = 148
Fixes: 2a91aa396739 ("[DCCP] CCID2: Initial CCID2 (TCP-Like) implementation")
Signed-off-by: Alexey Kodanev <alexey.kodanev(a)oracle.com>
Reviewed-by: Eric Dumazet <edumazet(a)google.com>
Signed-off-by: David S. Miller <davem(a)davemloft.net>
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
---
net/dccp/ccids/ccid2.c | 3 +++
1 file changed, 3 insertions(+)
--- a/net/dccp/ccids/ccid2.c
+++ b/net/dccp/ccids/ccid2.c
@@ -140,6 +140,9 @@ static void ccid2_hc_tx_rto_expire(unsig
ccid2_pr_debug("RTO_EXPIRE\n");
+ if (sk->sk_state == DCCP_CLOSED)
+ goto out;
+
/* back-off timer */
hc->tx_rto <<= 1;
if (hc->tx_rto > DCCP_RTO_MAX)
Patches currently in stable-queue which might be from alexey.kodanev(a)oracle.com are
queue-4.4/dccp-don-t-restart-ccid2_hc_tx_rto_expire-if-sk-in-closed-state.patch
This is a note to let you know that I've just added the patch titled
x86/microcode/intel: Extend BDW late-loading further with LLC size check
to the 4.9-stable tree which can be found at:
http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=summary
The filename of the patch is:
x86-microcode-intel-extend-bdw-late-loading-further-with-llc-size-check.patch
and it can be found in the queue-4.9 subdirectory.
If you, or anyone else, feels it should not be added to the stable tree,
please let <stable(a)vger.kernel.org> know about it.
>From 7e702d17ed138cf4ae7c00e8c00681ed464587c7 Mon Sep 17 00:00:00 2001
From: Jia Zhang <zhang.jia(a)linux.alibaba.com>
Date: Tue, 23 Jan 2018 11:41:32 +0100
Subject: x86/microcode/intel: Extend BDW late-loading further with LLC size check
From: Jia Zhang <zhang.jia(a)linux.alibaba.com>
commit 7e702d17ed138cf4ae7c00e8c00681ed464587c7 upstream.
Commit b94b73733171 ("x86/microcode/intel: Extend BDW late-loading with a
revision check") reduced the impact of erratum BDF90 for Broadwell model
79.
The impact can be reduced further by checking the size of the last level
cache portion per core.
Tony: "The erratum says the problem only occurs on the large-cache SKUs.
So we only need to avoid the update if we are on a big cache SKU that is
also running old microcode."
For more details, see erratum BDF90 in document #334165 (Intel Xeon
Processor E7-8800/4800 v4 Product Family Specification Update) from
September 2017.
Fixes: b94b73733171 ("x86/microcode/intel: Extend BDW late-loading with a revision check")
Signed-off-by: Jia Zhang <zhang.jia(a)linux.alibaba.com>
Signed-off-by: Borislav Petkov <bp(a)suse.de>
Signed-off-by: Thomas Gleixner <tglx(a)linutronix.de>
Acked-by: Tony Luck <tony.luck(a)intel.com>
Link: https://lkml.kernel.org/r/1516321542-31161-1-git-send-email-zhang.jia@linux…
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
---
arch/x86/kernel/cpu/microcode/intel.c | 20 ++++++++++++++++++--
1 file changed, 18 insertions(+), 2 deletions(-)
--- a/arch/x86/kernel/cpu/microcode/intel.c
+++ b/arch/x86/kernel/cpu/microcode/intel.c
@@ -40,6 +40,9 @@
#include <asm/setup.h>
#include <asm/msr.h>
+/* last level cache size per core */
+static int llc_size_per_core;
+
/*
* Temporary microcode blobs pointers storage. We note here during early load
* the pointers to microcode blobs we've got from whatever storage (detached
@@ -1053,12 +1056,14 @@ static bool is_blacklisted(unsigned int
/*
* Late loading on model 79 with microcode revision less than 0x0b000021
- * may result in a system hang. This behavior is documented in item
- * BDF90, #334165 (Intel Xeon Processor E7-8800/4800 v4 Product Family).
+ * and LLC size per core bigger than 2.5MB may result in a system hang.
+ * This behavior is documented in item BDF90, #334165 (Intel Xeon
+ * Processor E7-8800/4800 v4 Product Family).
*/
if (c->x86 == 6 &&
c->x86_model == INTEL_FAM6_BROADWELL_X &&
c->x86_mask == 0x01 &&
+ llc_size_per_core > 2621440 &&
c->microcode < 0x0b000021) {
pr_err_once("Erratum BDF90: late loading with revision < 0x0b000021 (0x%x) disabled.\n", c->microcode);
pr_err_once("Please consider either early loading through initrd/built-in or a potential BIOS update.\n");
@@ -1125,6 +1130,15 @@ static struct microcode_ops microcode_in
.microcode_fini_cpu = microcode_fini_cpu,
};
+static int __init calc_llc_size_per_core(struct cpuinfo_x86 *c)
+{
+ u64 llc_size = c->x86_cache_size * 1024;
+
+ do_div(llc_size, c->x86_max_cores);
+
+ return (int)llc_size;
+}
+
struct microcode_ops * __init init_intel_microcode(void)
{
struct cpuinfo_x86 *c = &boot_cpu_data;
@@ -1135,6 +1149,8 @@ struct microcode_ops * __init init_intel
return NULL;
}
+ llc_size_per_core = calc_llc_size_per_core(c);
+
return &microcode_intel_ops;
}
Patches currently in stable-queue which might be from zhang.jia(a)linux.alibaba.com are
queue-4.9/x86-microcode-intel-extend-bdw-late-loading-further-with-llc-size-check.patch
This is a note to let you know that I've just added the patch titled
perf/x86/amd/power: Do not load AMD power module on !AMD platforms
to the 4.9-stable tree which can be found at:
http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=summary
The filename of the patch is:
perf-x86-amd-power-do-not-load-amd-power-module-on-amd-platforms.patch
and it can be found in the queue-4.9 subdirectory.
If you, or anyone else, feels it should not be added to the stable tree,
please let <stable(a)vger.kernel.org> know about it.
>From 40d4071ce2d20840d224b4a77b5dc6f752c9ab15 Mon Sep 17 00:00:00 2001
From: Xiao Liang <xiliang(a)redhat.com>
Date: Mon, 22 Jan 2018 14:12:52 +0800
Subject: perf/x86/amd/power: Do not load AMD power module on !AMD platforms
From: Xiao Liang <xiliang(a)redhat.com>
commit 40d4071ce2d20840d224b4a77b5dc6f752c9ab15 upstream.
The AMD power module can be loaded on non AMD platforms, but unload fails
with the following Oops:
BUG: unable to handle kernel NULL pointer dereference at (null)
IP: __list_del_entry_valid+0x29/0x90
Call Trace:
perf_pmu_unregister+0x25/0xf0
amd_power_pmu_exit+0x1c/0xd23 [power]
SyS_delete_module+0x1a8/0x2b0
? exit_to_usermode_loop+0x8f/0xb0
entry_SYSCALL_64_fastpath+0x20/0x83
Return -ENODEV instead of 0 from the module init function if the CPU does
not match.
Fixes: c7ab62bfbe0e ("perf/x86/amd/power: Add AMD accumulated power reporting mechanism")
Signed-off-by: Xiao Liang <xiliang(a)redhat.com>
Signed-off-by: Thomas Gleixner <tglx(a)linutronix.de>
Link: https://lkml.kernel.org/r/20180122061252.6394-1-xiliang@redhat.com
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
---
arch/x86/events/amd/power.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
--- a/arch/x86/events/amd/power.c
+++ b/arch/x86/events/amd/power.c
@@ -277,7 +277,7 @@ static int __init amd_power_pmu_init(voi
int ret;
if (!x86_match_cpu(cpu_match))
- return 0;
+ return -ENODEV;
if (!boot_cpu_has(X86_FEATURE_ACC_POWER))
return -ENODEV;
Patches currently in stable-queue which might be from xiliang(a)redhat.com are
queue-4.9/perf-x86-amd-power-do-not-load-amd-power-module-on-amd-platforms.patch
This is a note to let you know that I've just added the patch titled
hrtimer: Reset hrtimer cpu base proper on CPU hotplug
to the 4.9-stable tree which can be found at:
http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=summary
The filename of the patch is:
hrtimer-reset-hrtimer-cpu-base-proper-on-cpu-hotplug.patch
and it can be found in the queue-4.9 subdirectory.
If you, or anyone else, feels it should not be added to the stable tree,
please let <stable(a)vger.kernel.org> know about it.
>From d5421ea43d30701e03cadc56a38854c36a8b4433 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx(a)linutronix.de>
Date: Fri, 26 Jan 2018 14:54:32 +0100
Subject: hrtimer: Reset hrtimer cpu base proper on CPU hotplug
From: Thomas Gleixner <tglx(a)linutronix.de>
commit d5421ea43d30701e03cadc56a38854c36a8b4433 upstream.
The hrtimer interrupt code contains a hang detection and mitigation
mechanism, which prevents that a long delayed hrtimer interrupt causes a
continuous retriggering of interrupts which prevent the system from making
progress. If a hang is detected then the timer hardware is programmed with
a certain delay into the future and a flag is set in the hrtimer cpu base
which prevents newly enqueued timers from reprogramming the timer hardware
prior to the chosen delay. The subsequent hrtimer interrupt after the delay
clears the flag and resumes normal operation.
If such a hang happens in the last hrtimer interrupt before a CPU is
unplugged then the hang_detected flag is set and stays that way when the
CPU is plugged in again. At that point the timer hardware is not armed and
it cannot be armed because the hang_detected flag is still active, so
nothing clears that flag. As a consequence the CPU does not receive hrtimer
interrupts and no timers expire on that CPU which results in RCU stalls and
other malfunctions.
Clear the flag along with some other less critical members of the hrtimer
cpu base to ensure starting from a clean state when a CPU is plugged in.
Thanks to Paul, Sebastian and Anna-Maria for their help to get down to the
root cause of that hard to reproduce heisenbug. Once understood it's
trivial and certainly justifies a brown paperbag.
Fixes: 41d2e4949377 ("hrtimer: Tune hrtimer_interrupt hang logic")
Reported-by: Paul E. McKenney <paulmck(a)linux.vnet.ibm.com>
Signed-off-by: Thomas Gleixner <tglx(a)linutronix.de>
Cc: Peter Zijlstra <peterz(a)infradead.org>
Cc: Sebastian Sewior <bigeasy(a)linutronix.de>
Cc: Anna-Maria Gleixner <anna-maria(a)linutronix.de>
Link: https://lkml.kernel.org/r/alpine.DEB.2.20.1801261447590.2067@nanos
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
---
kernel/time/hrtimer.c | 3 +++
1 file changed, 3 insertions(+)
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -652,7 +652,9 @@ static void hrtimer_reprogram(struct hrt
static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base)
{
base->expires_next.tv64 = KTIME_MAX;
+ base->hang_detected = 0;
base->hres_active = 0;
+ base->next_timer = NULL;
}
/*
@@ -1610,6 +1612,7 @@ int hrtimers_prepare_cpu(unsigned int cp
timerqueue_init_head(&cpu_base->clock_base[i].active);
}
+ cpu_base->active_bases = 0;
cpu_base->cpu = cpu;
hrtimer_init_hres(cpu_base);
return 0;
Patches currently in stable-queue which might be from tglx(a)linutronix.de are
queue-4.9/prevent-timer-value-0-for-mwaitx.patch
queue-4.9/perf-x86-amd-power-do-not-load-amd-power-module-on-amd-platforms.patch
queue-4.9/x86-asm-32-make-sync_core-handle-missing-cpuid-on-all-32-bit-kernels.patch
queue-4.9/revert-module-add-retpoline-tag-to-vermagic.patch
queue-4.9/x86-microcode-intel-extend-bdw-late-loading-further-with-llc-size-check.patch
queue-4.9/hrtimer-reset-hrtimer-cpu-base-proper-on-cpu-hotplug.patch
This is a note to let you know that I've just added the patch titled
x86/microcode/intel: Extend BDW late-loading further with LLC size check
to the 4.4-stable tree which can be found at:
http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=summary
The filename of the patch is:
x86-microcode-intel-extend-bdw-late-loading-further-with-llc-size-check.patch
and it can be found in the queue-4.4 subdirectory.
If you, or anyone else, feels it should not be added to the stable tree,
please let <stable(a)vger.kernel.org> know about it.
>From 7e702d17ed138cf4ae7c00e8c00681ed464587c7 Mon Sep 17 00:00:00 2001
From: Jia Zhang <zhang.jia(a)linux.alibaba.com>
Date: Tue, 23 Jan 2018 11:41:32 +0100
Subject: x86/microcode/intel: Extend BDW late-loading further with LLC size check
From: Jia Zhang <zhang.jia(a)linux.alibaba.com>
commit 7e702d17ed138cf4ae7c00e8c00681ed464587c7 upstream.
Commit b94b73733171 ("x86/microcode/intel: Extend BDW late-loading with a
revision check") reduced the impact of erratum BDF90 for Broadwell model
79.
The impact can be reduced further by checking the size of the last level
cache portion per core.
Tony: "The erratum says the problem only occurs on the large-cache SKUs.
So we only need to avoid the update if we are on a big cache SKU that is
also running old microcode."
For more details, see erratum BDF90 in document #334165 (Intel Xeon
Processor E7-8800/4800 v4 Product Family Specification Update) from
September 2017.
Fixes: b94b73733171 ("x86/microcode/intel: Extend BDW late-loading with a revision check")
Signed-off-by: Jia Zhang <zhang.jia(a)linux.alibaba.com>
Signed-off-by: Borislav Petkov <bp(a)suse.de>
Signed-off-by: Thomas Gleixner <tglx(a)linutronix.de>
Acked-by: Tony Luck <tony.luck(a)intel.com>
Link: https://lkml.kernel.org/r/1516321542-31161-1-git-send-email-zhang.jia@linux…
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
---
arch/x86/kernel/cpu/microcode/intel.c | 20 ++++++++++++++++++--
1 file changed, 18 insertions(+), 2 deletions(-)
--- a/arch/x86/kernel/cpu/microcode/intel.c
+++ b/arch/x86/kernel/cpu/microcode/intel.c
@@ -39,6 +39,9 @@
#include <asm/setup.h>
#include <asm/msr.h>
+/* last level cache size per core */
+static int llc_size_per_core;
+
static unsigned long mc_saved_in_initrd[MAX_UCODE_COUNT];
static struct mc_saved_data {
unsigned int mc_saved_count;
@@ -996,12 +999,14 @@ static bool is_blacklisted(unsigned int
/*
* Late loading on model 79 with microcode revision less than 0x0b000021
- * may result in a system hang. This behavior is documented in item
- * BDF90, #334165 (Intel Xeon Processor E7-8800/4800 v4 Product Family).
+ * and LLC size per core bigger than 2.5MB may result in a system hang.
+ * This behavior is documented in item BDF90, #334165 (Intel Xeon
+ * Processor E7-8800/4800 v4 Product Family).
*/
if (c->x86 == 6 &&
c->x86_model == 79 &&
c->x86_mask == 0x01 &&
+ llc_size_per_core > 2621440 &&
c->microcode < 0x0b000021) {
pr_err_once("Erratum BDF90: late loading with revision < 0x0b000021 (0x%x) disabled.\n", c->microcode);
pr_err_once("Please consider either early loading through initrd/built-in or a potential BIOS update.\n");
@@ -1068,6 +1073,15 @@ static struct microcode_ops microcode_in
.microcode_fini_cpu = microcode_fini_cpu,
};
+static int __init calc_llc_size_per_core(struct cpuinfo_x86 *c)
+{
+ u64 llc_size = c->x86_cache_size * 1024;
+
+ do_div(llc_size, c->x86_max_cores);
+
+ return (int)llc_size;
+}
+
struct microcode_ops * __init init_intel_microcode(void)
{
struct cpuinfo_x86 *c = &boot_cpu_data;
@@ -1078,6 +1092,8 @@ struct microcode_ops * __init init_intel
return NULL;
}
+ llc_size_per_core = calc_llc_size_per_core(c);
+
return &microcode_intel_ops;
}
Patches currently in stable-queue which might be from zhang.jia(a)linux.alibaba.com are
queue-4.4/x86-microcode-intel-extend-bdw-late-loading-further-with-llc-size-check.patch
This is a note to let you know that I've just added the patch titled
hrtimer: Reset hrtimer cpu base proper on CPU hotplug
to the 4.4-stable tree which can be found at:
http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=summary
The filename of the patch is:
hrtimer-reset-hrtimer-cpu-base-proper-on-cpu-hotplug.patch
and it can be found in the queue-4.4 subdirectory.
If you, or anyone else, feels it should not be added to the stable tree,
please let <stable(a)vger.kernel.org> know about it.
>From d5421ea43d30701e03cadc56a38854c36a8b4433 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx(a)linutronix.de>
Date: Fri, 26 Jan 2018 14:54:32 +0100
Subject: hrtimer: Reset hrtimer cpu base proper on CPU hotplug
From: Thomas Gleixner <tglx(a)linutronix.de>
commit d5421ea43d30701e03cadc56a38854c36a8b4433 upstream.
The hrtimer interrupt code contains a hang detection and mitigation
mechanism, which prevents that a long delayed hrtimer interrupt causes a
continuous retriggering of interrupts which prevent the system from making
progress. If a hang is detected then the timer hardware is programmed with
a certain delay into the future and a flag is set in the hrtimer cpu base
which prevents newly enqueued timers from reprogramming the timer hardware
prior to the chosen delay. The subsequent hrtimer interrupt after the delay
clears the flag and resumes normal operation.
If such a hang happens in the last hrtimer interrupt before a CPU is
unplugged then the hang_detected flag is set and stays that way when the
CPU is plugged in again. At that point the timer hardware is not armed and
it cannot be armed because the hang_detected flag is still active, so
nothing clears that flag. As a consequence the CPU does not receive hrtimer
interrupts and no timers expire on that CPU which results in RCU stalls and
other malfunctions.
Clear the flag along with some other less critical members of the hrtimer
cpu base to ensure starting from a clean state when a CPU is plugged in.
Thanks to Paul, Sebastian and Anna-Maria for their help to get down to the
root cause of that hard to reproduce heisenbug. Once understood it's
trivial and certainly justifies a brown paperbag.
Fixes: 41d2e4949377 ("hrtimer: Tune hrtimer_interrupt hang logic")
Reported-by: Paul E. McKenney <paulmck(a)linux.vnet.ibm.com>
Signed-off-by: Thomas Gleixner <tglx(a)linutronix.de>
Cc: Peter Zijlstra <peterz(a)infradead.org>
Cc: Sebastian Sewior <bigeasy(a)linutronix.de>
Cc: Anna-Maria Gleixner <anna-maria(a)linutronix.de>
Link: https://lkml.kernel.org/r/alpine.DEB.2.20.1801261447590.2067@nanos
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
---
kernel/time/hrtimer.c | 3 +++
1 file changed, 3 insertions(+)
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -669,7 +669,9 @@ static void hrtimer_reprogram(struct hrt
static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base)
{
base->expires_next.tv64 = KTIME_MAX;
+ base->hang_detected = 0;
base->hres_active = 0;
+ base->next_timer = NULL;
}
/*
@@ -1615,6 +1617,7 @@ static void init_hrtimers_cpu(int cpu)
timerqueue_init_head(&cpu_base->clock_base[i].active);
}
+ cpu_base->active_bases = 0;
cpu_base->cpu = cpu;
hrtimer_init_hres(cpu_base);
}
Patches currently in stable-queue which might be from tglx(a)linutronix.de are
queue-4.4/prevent-timer-value-0-for-mwaitx.patch
queue-4.4/x86-ioapic-fix-incorrect-pointers-in-ioapic_setup_resources.patch
queue-4.4/x86-asm-32-make-sync_core-handle-missing-cpuid-on-all-32-bit-kernels.patch
queue-4.4/timers-plug-locking-race-vs.-timer-migration.patch
queue-4.4/x86-cpu-intel-introduce-macros-for-intel-family-numbers.patch
queue-4.4/x86-retpoline-fill-rsb-on-context-switch-for-affected-cpus.patch
queue-4.4/revert-module-add-retpoline-tag-to-vermagic.patch
queue-4.4/x86-microcode-intel-extend-bdw-late-loading-further-with-llc-size-check.patch
queue-4.4/time-avoid-undefined-behaviour-in-ktime_add_safe.patch
queue-4.4/sched-deadline-use-the-revised-wakeup-rule-for-suspending-constrained-dl-tasks.patch
queue-4.4/hrtimer-reset-hrtimer-cpu-base-proper-on-cpu-hotplug.patch