This is a note to let you know that I've just added the patch titled
x86/kvm/vmx: Simplify segment_base()
to the 4.9-stable tree which can be found at:
http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=sum…
The filename of the patch is:
x86-kvm-vmx-simplify-segment_base.patch
and it can be found in the queue-4.9 subdirectory.
If you, or anyone else, feels it should not be added to the stable tree,
please let <stable(a)vger.kernel.org> know about it.
>From 8c2e41f7ae1234c192ef497472ad306227c77c03 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto(a)kernel.org>
Date: Mon, 20 Feb 2017 08:56:12 -0800
Subject: x86/kvm/vmx: Simplify segment_base()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
From: Andy Lutomirski <luto(a)kernel.org>
commit 8c2e41f7ae1234c192ef497472ad306227c77c03 upstream.
Use actual pointer types for pointers (instead of unsigned long) and
replace hardcoded constants with the appropriate self-documenting
macros.
The function is still a bit messy, but this seems a lot better than
before to me.
This is mostly borrowed from a patch by Thomas Garnier.
Cc: Thomas Garnier <thgarnie(a)google.com>
Cc: Jim Mattson <jmattson(a)google.com>
Cc: Radim Krčmář <rkrcmar(a)redhat.com>
Cc: Paolo Bonzini <pbonzini(a)redhat.com>
Signed-off-by: Andy Lutomirski <luto(a)kernel.org>
Signed-off-by: Paolo Bonzini <pbonzini(a)redhat.com>
Signed-off-by: Eduardo Valentin <eduval(a)amazon.com>
Signed-off-by: Eduardo Valentin <edubezval(a)gmail.com>
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
---
arch/x86/kvm/vmx.c | 19 +++++++------------
1 file changed, 7 insertions(+), 12 deletions(-)
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2030,28 +2030,23 @@ static unsigned long segment_base(u16 se
{
struct desc_ptr *gdt = this_cpu_ptr(&host_gdt);
struct desc_struct *d;
- unsigned long table_base;
+ struct desc_struct *table;
unsigned long v;
- if (!(selector & ~3))
+ if (!(selector & ~SEGMENT_RPL_MASK))
return 0;
- table_base = gdt->address;
+ table = (struct desc_struct *)gdt->address;
- if (selector & 4) { /* from ldt */
+ if ((selector & SEGMENT_TI_MASK) == SEGMENT_LDT) {
u16 ldt_selector = kvm_read_ldt();
- if (!(ldt_selector & ~3))
+ if (!(ldt_selector & ~SEGMENT_RPL_MASK))
return 0;
- table_base = segment_base(ldt_selector);
+ table = (struct desc_struct *)segment_base(ldt_selector);
}
- d = (struct desc_struct *)(table_base + (selector & ~7));
- v = get_desc_base(d);
-#ifdef CONFIG_X86_64
- if (d->s == 0 && (d->type == 2 || d->type == 9 || d->type == 11))
- v |= ((unsigned long)((struct ldttss_desc64 *)d)->base3) << 32;
-#endif
+ v = get_desc_base(&table[selector >> 3]);
return v;
}
Patches currently in stable-queue which might be from luto(a)kernel.org are
queue-4.9/x86-mm-refactor-flush_tlb_mm_range-to-merge-local-and-remote-cases.patch
queue-4.9/x86-mm-pass-flush_tlb_info-to-flush_tlb_others-etc.patch
queue-4.9/x86-mm-rework-lazy-tlb-to-track-the-actual-loaded-mm.patch
queue-4.9/x86-mm-kvm-teach-kvm-s-vmx-code-that-cr3-isn-t-a-constant.patch
queue-4.9/x86-mm-use-new-merged-flush-logic-in-arch_tlbbatch_flush.patch
queue-4.9/x86-kvm-vmx-simplify-segment_base.patch
queue-4.9/x86-entry-unwind-create-stack-frames-for-saved-interrupt-registers.patch
queue-4.9/x86-mm-reduce-indentation-in-flush_tlb_func.patch
queue-4.9/x86-mm-remove-the-up-asm-tlbflush.h-code-always-use-the-formerly-smp-code.patch
queue-4.9/x86-mm-reimplement-flush_tlb_page-using-flush_tlb_mm_range.patch
queue-4.9/mm-x86-mm-make-the-batched-unmap-tlb-flush-api-more-generic.patch
queue-4.9/x86-kvm-vmx-defer-tr-reload-after-vm-exit.patch
queue-4.9/x86-mm-change-the-leave_mm-condition-for-local-tlb-flushes.patch
queue-4.9/x86-mm-be-more-consistent-wrt-page_shift-vs-page_size-in-tlb-flush-code.patch
This is a note to let you know that I've just added the patch titled
x86/kvm/vmx: Defer TR reload after VM exit
to the 4.9-stable tree which can be found at:
http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=sum…
The filename of the patch is:
x86-kvm-vmx-defer-tr-reload-after-vm-exit.patch
and it can be found in the queue-4.9 subdirectory.
If you, or anyone else, feels it should not be added to the stable tree,
please let <stable(a)vger.kernel.org> know about it.
>From b7ffc44d5b2ea163899d09289ca7743d5c32e926 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto(a)kernel.org>
Date: Mon, 20 Feb 2017 08:56:14 -0800
Subject: x86/kvm/vmx: Defer TR reload after VM exit
From: Andy Lutomirski <luto(a)kernel.org>
commit b7ffc44d5b2ea163899d09289ca7743d5c32e926 upstream.
Intel's VMX is daft and resets the hidden TSS limit register to 0x67
on VMX reload, and the 0x67 is not configurable. KVM currently
reloads TR using the LTR instruction on every exit, but this is quite
slow because LTR is serializing.
The 0x67 limit is entirely harmless unless ioperm() is in use, so
defer the reload until a task using ioperm() is actually running.
Here's some poorly done benchmarking using kvm-unit-tests:
Before:
cpuid 1313
vmcall 1195
mov_from_cr8 11
mov_to_cr8 17
inl_from_pmtimer 6770
inl_from_qemu 6856
inl_from_kernel 2435
outl_to_kernel 1402
After:
cpuid 1291
vmcall 1181
mov_from_cr8 11
mov_to_cr8 16
inl_from_pmtimer 6457
inl_from_qemu 6209
inl_from_kernel 2339
outl_to_kernel 1391
Signed-off-by: Andy Lutomirski <luto(a)kernel.org>
[Force-reload TR in invalidate_tss_limit. - Paolo]
Signed-off-by: Paolo Bonzini <pbonzini(a)redhat.com>
Signed-off-by: Eduardo Valentin <eduval(a)amazon.com>
Signed-off-by: Eduardo Valentin <edubezval(a)gmail.com>
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
---
arch/x86/include/asm/desc.h | 48 ++++++++++++++++++++++++++++++++++++++++++++
arch/x86/kernel/ioport.c | 5 ++++
arch/x86/kernel/process.c | 10 +++++++++
arch/x86/kvm/vmx.c | 23 ++++++++-------------
4 files changed, 72 insertions(+), 14 deletions(-)
--- a/arch/x86/include/asm/desc.h
+++ b/arch/x86/include/asm/desc.h
@@ -213,6 +213,54 @@ static inline void native_load_tr_desc(v
asm volatile("ltr %w0"::"q" (GDT_ENTRY_TSS*8));
}
+static inline void force_reload_TR(void)
+{
+ struct desc_struct *d = get_cpu_gdt_table(smp_processor_id());
+ tss_desc tss;
+
+ memcpy(&tss, &d[GDT_ENTRY_TSS], sizeof(tss_desc));
+
+ /*
+ * LTR requires an available TSS, and the TSS is currently
+ * busy. Make it be available so that LTR will work.
+ */
+ tss.type = DESC_TSS;
+ write_gdt_entry(d, GDT_ENTRY_TSS, &tss, DESC_TSS);
+
+ load_TR_desc();
+}
+
+DECLARE_PER_CPU(bool, need_tr_refresh);
+
+static inline void refresh_TR(void)
+{
+ WARN_ON(preemptible());
+
+ if (unlikely(this_cpu_read(need_tr_refresh))) {
+ force_reload_TR();
+ this_cpu_write(need_tr_refresh, false);
+ }
+}
+
+/*
+ * If you do something evil that corrupts the cached TSS limit (I'm looking
+ * at you, VMX exits), call this function.
+ *
+ * The optimization here is that the TSS limit only matters for Linux if the
+ * IO bitmap is in use. If the TSS limit gets forced to its minimum value,
+ * everything works except that IO bitmap will be ignored and all CPL 3 IO
+ * instructions will #GP, which is exactly what we want for normal tasks.
+ */
+static inline void invalidate_tss_limit(void)
+{
+ WARN_ON(preemptible());
+
+ if (unlikely(test_thread_flag(TIF_IO_BITMAP)))
+ force_reload_TR();
+ else
+ this_cpu_write(need_tr_refresh, true);
+}
+
static inline void native_load_gdt(const struct desc_ptr *dtr)
{
asm volatile("lgdt %0"::"m" (*dtr));
--- a/arch/x86/kernel/ioport.c
+++ b/arch/x86/kernel/ioport.c
@@ -16,6 +16,7 @@
#include <linux/syscalls.h>
#include <linux/bitmap.h>
#include <asm/syscalls.h>
+#include <asm/desc.h>
/*
* this changes the io permissions bitmap in the current task.
@@ -45,6 +46,10 @@ asmlinkage long sys_ioperm(unsigned long
memset(bitmap, 0xff, IO_BITMAP_BYTES);
t->io_bitmap_ptr = bitmap;
set_thread_flag(TIF_IO_BITMAP);
+
+ preempt_disable();
+ refresh_TR();
+ preempt_enable();
}
/*
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -33,6 +33,7 @@
#include <asm/mce.h>
#include <asm/vm86.h>
#include <asm/switch_to.h>
+#include <asm/desc.h>
/*
* per-CPU TSS segments. Threads are completely 'soft' on Linux,
@@ -82,6 +83,9 @@ void idle_notifier_unregister(struct not
EXPORT_SYMBOL_GPL(idle_notifier_unregister);
#endif
+DEFINE_PER_CPU(bool, need_tr_refresh);
+EXPORT_PER_CPU_SYMBOL_GPL(need_tr_refresh);
+
/*
* this gets called so that we can store lazy state into memory and copy the
* current task into the new thread.
@@ -227,6 +231,12 @@ void __switch_to_xtra(struct task_struct
*/
memcpy(tss->io_bitmap, next->io_bitmap_ptr,
max(prev->io_bitmap_max, next->io_bitmap_max));
+
+ /*
+ * Make sure that the TSS limit is correct for the CPU
+ * to notice the IO bitmap.
+ */
+ refresh_TR();
} else if (test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) {
/*
* Clear any possible leftover bits:
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -1959,19 +1959,6 @@ static void add_atomic_switch_msr(struct
m->host[i].value = host_val;
}
-static void reload_tss(void)
-{
- /*
- * VT restores TR but not its size. Useless.
- */
- struct desc_ptr *gdt = this_cpu_ptr(&host_gdt);
- struct desc_struct *descs;
-
- descs = (void *)gdt->address;
- descs[GDT_ENTRY_TSS].type = 9; /* available TSS */
- load_TR_desc();
-}
-
static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
{
u64 guest_efer = vmx->vcpu.arch.efer;
@@ -2141,7 +2128,7 @@ static void __vmx_load_host_state(struct
loadsegment(es, vmx->host_state.es_sel);
}
#endif
- reload_tss();
+ invalidate_tss_limit();
#ifdef CONFIG_X86_64
wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
#endif
@@ -2265,6 +2252,14 @@ static void vmx_vcpu_load(struct kvm_vcp
vmcs_writel(HOST_TR_BASE, kvm_read_tr_base()); /* 22.2.4 */
vmcs_writel(HOST_GDTR_BASE, gdt->address); /* 22.2.4 */
+ /*
+ * VM exits change the host TR limit to 0x67 after a VM
+ * exit. This is okay, since 0x67 covers everything except
+ * the IO bitmap and have have code to handle the IO bitmap
+ * being lost after a VM exit.
+ */
+ BUILD_BUG_ON(IO_BITMAP_OFFSET - 1 != 0x67);
+
rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp);
vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */
Patches currently in stable-queue which might be from luto(a)kernel.org are
queue-4.9/x86-mm-refactor-flush_tlb_mm_range-to-merge-local-and-remote-cases.patch
queue-4.9/x86-mm-pass-flush_tlb_info-to-flush_tlb_others-etc.patch
queue-4.9/x86-mm-rework-lazy-tlb-to-track-the-actual-loaded-mm.patch
queue-4.9/x86-mm-kvm-teach-kvm-s-vmx-code-that-cr3-isn-t-a-constant.patch
queue-4.9/x86-mm-use-new-merged-flush-logic-in-arch_tlbbatch_flush.patch
queue-4.9/x86-kvm-vmx-simplify-segment_base.patch
queue-4.9/x86-entry-unwind-create-stack-frames-for-saved-interrupt-registers.patch
queue-4.9/x86-mm-reduce-indentation-in-flush_tlb_func.patch
queue-4.9/x86-mm-remove-the-up-asm-tlbflush.h-code-always-use-the-formerly-smp-code.patch
queue-4.9/x86-mm-reimplement-flush_tlb_page-using-flush_tlb_mm_range.patch
queue-4.9/mm-x86-mm-make-the-batched-unmap-tlb-flush-api-more-generic.patch
queue-4.9/x86-kvm-vmx-defer-tr-reload-after-vm-exit.patch
queue-4.9/x86-mm-change-the-leave_mm-condition-for-local-tlb-flushes.patch
queue-4.9/x86-mm-be-more-consistent-wrt-page_shift-vs-page_size-in-tlb-flush-code.patch
This is a note to let you know that I've just added the patch titled
x86/kvm/vmx: remove unused variable in segment_base()
to the 4.9-stable tree which can be found at:
http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=sum…
The filename of the patch is:
x86-kvm-vmx-remove-unused-variable-in-segment_base.patch
and it can be found in the queue-4.9 subdirectory.
If you, or anyone else, feels it should not be added to the stable tree,
please let <stable(a)vger.kernel.org> know about it.
>From 0fce546f9f07b94ccc9de09cf48d35e18946d2fa Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?J=C3=A9r=C3=A9my=20Lefaure?= <jeremy.lefaure(a)lse.epita.fr>
Date: Sat, 25 Feb 2017 17:46:53 -0500
Subject: x86/kvm/vmx: remove unused variable in segment_base()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
From: Jérémy Lefaure <jeremy.lefaure(a)lse.epita.fr>
commit 0fce546f9f07b94ccc9de09cf48d35e18946d2fa upstream.
The pointer 'struct desc_struct *d' is unused since commit 8c2e41f7ae12
("x86/kvm/vmx: Simplify segment_base()") so let's remove it.
Signed-off-by: Jérémy Lefaure <jeremy.lefaure(a)lse.epita.fr>
Reviewed-by: David Hildenbrand <david(a)redhat.com>
Signed-off-by: Radim Krčmář <rkrcmar(a)redhat.com>
Signed-off-by: Eduardo Valentin <eduval(a)amazon.com>
Signed-off-by: Eduardo Valentin <edubezval(a)gmail.com>
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
---
arch/x86/kvm/vmx.c | 1 -
1 file changed, 1 deletion(-)
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2016,7 +2016,6 @@ static bool update_transition_efer(struc
static unsigned long segment_base(u16 selector)
{
struct desc_ptr *gdt = this_cpu_ptr(&host_gdt);
- struct desc_struct *d;
struct desc_struct *table;
unsigned long v;
Patches currently in stable-queue which might be from jeremy.lefaure(a)lse.epita.fr are
queue-4.9/x86-kvm-vmx-remove-unused-variable-in-segment_base.patch
This is a note to let you know that I've just added the patch titled
x86/entry/unwind: Create stack frames for saved interrupt registers
to the 4.9-stable tree which can be found at:
http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=sum…
The filename of the patch is:
x86-entry-unwind-create-stack-frames-for-saved-interrupt-registers.patch
and it can be found in the queue-4.9 subdirectory.
If you, or anyone else, feels it should not be added to the stable tree,
please let <stable(a)vger.kernel.org> know about it.
>From 946c191161cef10c667b5ee3179db1714fa5b7c0 Mon Sep 17 00:00:00 2001
From: Josh Poimboeuf <jpoimboe(a)redhat.com>
Date: Thu, 20 Oct 2016 11:34:40 -0500
Subject: x86/entry/unwind: Create stack frames for saved interrupt registers
From: Josh Poimboeuf <jpoimboe(a)redhat.com>
commit 946c191161cef10c667b5ee3179db1714fa5b7c0 upstream.
With frame pointers, when a task is interrupted, its stack is no longer
completely reliable because the function could have been interrupted
before it had a chance to save the previous frame pointer on the stack.
So the caller of the interrupted function could get skipped by a stack
trace.
This is problematic for live patching, which needs to know whether a
stack trace of a sleeping task can be relied upon. There's currently no
way to detect if a sleeping task was interrupted by a page fault
exception or preemption before it went to sleep.
Another issue is that when dumping the stack of an interrupted task, the
unwinder has no way of knowing where the saved pt_regs registers are, so
it can't print them.
This solves those issues by encoding the pt_regs pointer in the frame
pointer on entry from an interrupt or an exception.
This patch also updates the unwinder to be able to decode it, because
otherwise the unwinder would be broken by this change.
Note that this causes a change in the behavior of the unwinder: each
instance of a pt_regs on the stack is now considered a "frame". So
callers of unwind_get_return_address() will now get an occasional
'regs->ip' address that would have previously been skipped over.
Suggested-by: Andy Lutomirski <luto(a)amacapital.net>
Signed-off-by: Josh Poimboeuf <jpoimboe(a)redhat.com>
Cc: Andy Lutomirski <luto(a)kernel.org>
Cc: Borislav Petkov <bp(a)alien8.de>
Cc: Brian Gerst <brgerst(a)gmail.com>
Cc: Denys Vlasenko <dvlasenk(a)redhat.com>
Cc: H. Peter Anvin <hpa(a)zytor.com>
Cc: Linus Torvalds <torvalds(a)linux-foundation.org>
Cc: Peter Zijlstra <peterz(a)infradead.org>
Cc: Thomas Gleixner <tglx(a)linutronix.de>
Link: http://lkml.kernel.org/r/8b9f84a21e39d249049e0547b559ff8da0df0988.147697374…
Signed-off-by: Ingo Molnar <mingo(a)kernel.org>
Signed-off-by: Eduardo Valentin <eduval(a)amazon.com>
Signed-off-by: Eduardo Valentin <edubezval(a)gmail.com>
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
---
arch/x86/entry/calling.h | 20 ++++++++++
arch/x86/entry/entry_32.S | 33 +++++++++++++++--
arch/x86/entry/entry_64.S | 10 +++--
arch/x86/include/asm/unwind.h | 16 ++++++++
arch/x86/kernel/unwind_frame.c | 76 ++++++++++++++++++++++++++++++++++++-----
5 files changed, 139 insertions(+), 16 deletions(-)
--- a/arch/x86/entry/calling.h
+++ b/arch/x86/entry/calling.h
@@ -201,6 +201,26 @@ For 32-bit we have the following convent
.byte 0xf1
.endm
+/*
+ * This is a sneaky trick to help the unwinder find pt_regs on the stack. The
+ * frame pointer is replaced with an encoded pointer to pt_regs. The encoding
+ * is just setting the LSB, which makes it an invalid stack address and is also
+ * a signal to the unwinder that it's a pt_regs pointer in disguise.
+ *
+ * NOTE: This macro must be used *after* SAVE_EXTRA_REGS because it corrupts
+ * the original rbp.
+ */
+.macro ENCODE_FRAME_POINTER ptregs_offset=0
+#ifdef CONFIG_FRAME_POINTER
+ .if \ptregs_offset
+ leaq \ptregs_offset(%rsp), %rbp
+ .else
+ mov %rsp, %rbp
+ .endif
+ orq $0x1, %rbp
+#endif
+.endm
+
#endif /* CONFIG_X86_64 */
/*
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -175,6 +175,22 @@
SET_KERNEL_GS %edx
.endm
+/*
+ * This is a sneaky trick to help the unwinder find pt_regs on the stack. The
+ * frame pointer is replaced with an encoded pointer to pt_regs. The encoding
+ * is just setting the LSB, which makes it an invalid stack address and is also
+ * a signal to the unwinder that it's a pt_regs pointer in disguise.
+ *
+ * NOTE: This macro must be used *after* SAVE_ALL because it corrupts the
+ * original rbp.
+ */
+.macro ENCODE_FRAME_POINTER
+#ifdef CONFIG_FRAME_POINTER
+ mov %esp, %ebp
+ orl $0x1, %ebp
+#endif
+.endm
+
.macro RESTORE_INT_REGS
popl %ebx
popl %ecx
@@ -624,6 +640,7 @@ common_interrupt:
ASM_CLAC
addl $-0x80, (%esp) /* Adjust vector into the [-256, -1] range */
SAVE_ALL
+ ENCODE_FRAME_POINTER
TRACE_IRQS_OFF
movl %esp, %eax
call do_IRQ
@@ -635,6 +652,7 @@ ENTRY(name) \
ASM_CLAC; \
pushl $~(nr); \
SAVE_ALL; \
+ ENCODE_FRAME_POINTER; \
TRACE_IRQS_OFF \
movl %esp, %eax; \
call fn; \
@@ -769,6 +787,7 @@ END(spurious_interrupt_bug)
ENTRY(xen_hypervisor_callback)
pushl $-1 /* orig_ax = -1 => not a system call */
SAVE_ALL
+ ENCODE_FRAME_POINTER
TRACE_IRQS_OFF
/*
@@ -823,6 +842,7 @@ ENTRY(xen_failsafe_callback)
jmp iret_exc
5: pushl $-1 /* orig_ax = -1 => not a system call */
SAVE_ALL
+ ENCODE_FRAME_POINTER
jmp ret_from_exception
.section .fixup, "ax"
@@ -1047,6 +1067,7 @@ error_code:
pushl %edx
pushl %ecx
pushl %ebx
+ ENCODE_FRAME_POINTER
cld
movl $(__KERNEL_PERCPU), %ecx
movl %ecx, %fs
@@ -1079,6 +1100,7 @@ ENTRY(debug)
ASM_CLAC
pushl $-1 # mark this as an int
SAVE_ALL
+ ENCODE_FRAME_POINTER
xorl %edx, %edx # error code 0
movl %esp, %eax # pt_regs pointer
@@ -1094,11 +1116,11 @@ ENTRY(debug)
.Ldebug_from_sysenter_stack:
/* We're on the SYSENTER stack. Switch off. */
- movl %esp, %ebp
+ movl %esp, %ebx
movl PER_CPU_VAR(cpu_current_top_of_stack), %esp
TRACE_IRQS_OFF
call do_debug
- movl %ebp, %esp
+ movl %ebx, %esp
jmp ret_from_exception
END(debug)
@@ -1121,6 +1143,7 @@ ENTRY(nmi)
pushl %eax # pt_regs->orig_ax
SAVE_ALL
+ ENCODE_FRAME_POINTER
xorl %edx, %edx # zero error code
movl %esp, %eax # pt_regs pointer
@@ -1139,10 +1162,10 @@ ENTRY(nmi)
* We're on the SYSENTER stack. Switch off. No one (not even debug)
* is using the thread stack right now, so it's safe for us to use it.
*/
- movl %esp, %ebp
+ movl %esp, %ebx
movl PER_CPU_VAR(cpu_current_top_of_stack), %esp
call do_nmi
- movl %ebp, %esp
+ movl %ebx, %esp
jmp restore_all_notrace
#ifdef CONFIG_X86_ESPFIX32
@@ -1159,6 +1182,7 @@ nmi_espfix_stack:
.endr
pushl %eax
SAVE_ALL
+ ENCODE_FRAME_POINTER
FIXUP_ESPFIX_STACK # %eax == %esp
xorl %edx, %edx # zero error code
call do_nmi
@@ -1172,6 +1196,7 @@ ENTRY(int3)
ASM_CLAC
pushl $-1 # mark this as an int
SAVE_ALL
+ ENCODE_FRAME_POINTER
TRACE_IRQS_OFF
xorl %edx, %edx # zero error code
movl %esp, %eax # pt_regs pointer
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -469,6 +469,7 @@ END(irq_entries_start)
ALLOC_PT_GPREGS_ON_STACK
SAVE_C_REGS
SAVE_EXTRA_REGS
+ ENCODE_FRAME_POINTER
testb $3, CS(%rsp)
jz 1f
@@ -985,6 +986,7 @@ ENTRY(xen_failsafe_callback)
ALLOC_PT_GPREGS_ON_STACK
SAVE_C_REGS
SAVE_EXTRA_REGS
+ ENCODE_FRAME_POINTER
jmp error_exit
END(xen_failsafe_callback)
@@ -1028,6 +1030,7 @@ ENTRY(paranoid_entry)
cld
SAVE_C_REGS 8
SAVE_EXTRA_REGS 8
+ ENCODE_FRAME_POINTER 8
movl $1, %ebx
movl $MSR_GS_BASE, %ecx
rdmsr
@@ -1075,6 +1078,7 @@ ENTRY(error_entry)
cld
SAVE_C_REGS 8
SAVE_EXTRA_REGS 8
+ ENCODE_FRAME_POINTER 8
xorl %ebx, %ebx
testb $3, CS+8(%rsp)
jz .Lerror_kernelspace
@@ -1259,6 +1263,7 @@ ENTRY(nmi)
pushq %r13 /* pt_regs->r13 */
pushq %r14 /* pt_regs->r14 */
pushq %r15 /* pt_regs->r15 */
+ ENCODE_FRAME_POINTER
/*
* At this point we no longer need to worry about stack damage
@@ -1272,11 +1277,10 @@ ENTRY(nmi)
/*
* Return back to user mode. We must *not* do the normal exit
- * work, because we don't want to enable interrupts. Fortunately,
- * do_nmi doesn't modify pt_regs.
+ * work, because we don't want to enable interrupts.
*/
SWAPGS
- jmp restore_c_regs_and_iret
+ jmp restore_regs_and_iret
.Lnmi_from_kernel:
/*
--- a/arch/x86/include/asm/unwind.h
+++ b/arch/x86/include/asm/unwind.h
@@ -13,6 +13,7 @@ struct unwind_state {
int graph_idx;
#ifdef CONFIG_FRAME_POINTER
unsigned long *bp;
+ struct pt_regs *regs;
#else
unsigned long *sp;
#endif
@@ -47,7 +48,15 @@ unsigned long *unwind_get_return_address
if (unwind_done(state))
return NULL;
- return state->bp + 1;
+ return state->regs ? &state->regs->ip : state->bp + 1;
+}
+
+static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state)
+{
+ if (unwind_done(state))
+ return NULL;
+
+ return state->regs;
}
#else /* !CONFIG_FRAME_POINTER */
@@ -57,6 +66,11 @@ unsigned long *unwind_get_return_address
{
return NULL;
}
+
+static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state)
+{
+ return NULL;
+}
#endif /* CONFIG_FRAME_POINTER */
--- a/arch/x86/kernel/unwind_frame.c
+++ b/arch/x86/kernel/unwind_frame.c
@@ -14,6 +14,9 @@ unsigned long unwind_get_return_address(
if (unwind_done(state))
return 0;
+ if (state->regs && user_mode(state->regs))
+ return 0;
+
addr = ftrace_graph_ret_addr(state->task, &state->graph_idx, *addr_p,
addr_p);
@@ -21,6 +24,20 @@ unsigned long unwind_get_return_address(
}
EXPORT_SYMBOL_GPL(unwind_get_return_address);
+/*
+ * This determines if the frame pointer actually contains an encoded pointer to
+ * pt_regs on the stack. See ENCODE_FRAME_POINTER.
+ */
+static struct pt_regs *decode_frame_pointer(unsigned long *bp)
+{
+ unsigned long regs = (unsigned long)bp;
+
+ if (!(regs & 0x1))
+ return NULL;
+
+ return (struct pt_regs *)(regs & ~0x1);
+}
+
static bool update_stack_state(struct unwind_state *state, void *addr,
size_t len)
{
@@ -43,26 +60,59 @@ static bool update_stack_state(struct un
bool unwind_next_frame(struct unwind_state *state)
{
- unsigned long *next_bp;
+ struct pt_regs *regs;
+ unsigned long *next_bp, *next_frame;
+ size_t next_len;
if (unwind_done(state))
return false;
- next_bp = (unsigned long *)*state->bp;
+ /* have we reached the end? */
+ if (state->regs && user_mode(state->regs))
+ goto the_end;
+
+ /* get the next frame pointer */
+ if (state->regs)
+ next_bp = (unsigned long *)state->regs->bp;
+ else
+ next_bp = (unsigned long *)*state->bp;
+
+ /* is the next frame pointer an encoded pointer to pt_regs? */
+ regs = decode_frame_pointer(next_bp);
+ if (regs) {
+ next_frame = (unsigned long *)regs;
+ next_len = sizeof(*regs);
+ } else {
+ next_frame = next_bp;
+ next_len = FRAME_HEADER_SIZE;
+ }
/* make sure the next frame's data is accessible */
- if (!update_stack_state(state, next_bp, FRAME_HEADER_SIZE))
+ if (!update_stack_state(state, next_frame, next_len))
return false;
-
/* move to the next frame */
- state->bp = next_bp;
+ if (regs) {
+ state->regs = regs;
+ state->bp = NULL;
+ } else {
+ state->bp = next_bp;
+ state->regs = NULL;
+ }
+
return true;
+
+the_end:
+ state->stack_info.type = STACK_TYPE_UNKNOWN;
+ return false;
}
EXPORT_SYMBOL_GPL(unwind_next_frame);
void __unwind_start(struct unwind_state *state, struct task_struct *task,
struct pt_regs *regs, unsigned long *first_frame)
{
+ unsigned long *bp, *frame;
+ size_t len;
+
memset(state, 0, sizeof(*state));
state->task = task;
@@ -73,12 +123,22 @@ void __unwind_start(struct unwind_state
}
/* set up the starting stack frame */
- state->bp = get_frame_pointer(task, regs);
+ bp = get_frame_pointer(task, regs);
+ regs = decode_frame_pointer(bp);
+ if (regs) {
+ state->regs = regs;
+ frame = (unsigned long *)regs;
+ len = sizeof(*regs);
+ } else {
+ state->bp = bp;
+ frame = bp;
+ len = FRAME_HEADER_SIZE;
+ }
/* initialize stack info and make sure the frame data is accessible */
- get_stack_info(state->bp, state->task, &state->stack_info,
+ get_stack_info(frame, state->task, &state->stack_info,
&state->stack_mask);
- update_stack_state(state, state->bp, FRAME_HEADER_SIZE);
+ update_stack_state(state, frame, len);
/*
* The caller can provide the address of the first frame directly
Patches currently in stable-queue which might be from jpoimboe(a)redhat.com are
queue-4.9/x86-entry-unwind-create-stack-frames-for-saved-interrupt-registers.patch
This is a note to let you know that I've just added the patch titled
vsock: cancel packets when failing to connect
to the 4.9-stable tree which can be found at:
http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=sum…
The filename of the patch is:
vsock-cancel-packets-when-failing-to-connect.patch
and it can be found in the queue-4.9 subdirectory.
If you, or anyone else, feels it should not be added to the stable tree,
please let <stable(a)vger.kernel.org> know about it.
>From foo@baz Thu Dec 21 09:02:40 CET 2017
From: Peng Tao <bergwolf(a)gmail.com>
Date: Wed, 15 Mar 2017 09:32:17 +0800
Subject: vsock: cancel packets when failing to connect
From: Peng Tao <bergwolf(a)gmail.com>
[ Upstream commit 380feae0def7e6a115124a3219c3ec9b654dca32 ]
Otherwise we'll leave the packets queued until releasing vsock device.
E.g., if guest is slow to start up, resulting ETIMEDOUT on connect, guest
will get the connect requests from failed host sockets.
Reviewed-by: Stefan Hajnoczi <stefanha(a)redhat.com>
Reviewed-by: Jorgen Hansen <jhansen(a)vmware.com>
Signed-off-by: Peng Tao <bergwolf(a)gmail.com>
Signed-off-by: David S. Miller <davem(a)davemloft.net>
Signed-off-by: Sasha Levin <alexander.levin(a)verizon.com>
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
---
net/vmw_vsock/af_vsock.c | 14 ++++++++++++++
1 file changed, 14 insertions(+)
--- a/net/vmw_vsock/af_vsock.c
+++ b/net/vmw_vsock/af_vsock.c
@@ -1101,10 +1101,19 @@ static const struct proto_ops vsock_dgra
.sendpage = sock_no_sendpage,
};
+static int vsock_transport_cancel_pkt(struct vsock_sock *vsk)
+{
+ if (!transport->cancel_pkt)
+ return -EOPNOTSUPP;
+
+ return transport->cancel_pkt(vsk);
+}
+
static void vsock_connect_timeout(struct work_struct *work)
{
struct sock *sk;
struct vsock_sock *vsk;
+ int cancel = 0;
vsk = container_of(work, struct vsock_sock, dwork.work);
sk = sk_vsock(vsk);
@@ -1115,8 +1124,11 @@ static void vsock_connect_timeout(struct
sk->sk_state = SS_UNCONNECTED;
sk->sk_err = ETIMEDOUT;
sk->sk_error_report(sk);
+ cancel = 1;
}
release_sock(sk);
+ if (cancel)
+ vsock_transport_cancel_pkt(vsk);
sock_put(sk);
}
@@ -1223,11 +1235,13 @@ static int vsock_stream_connect(struct s
err = sock_intr_errno(timeout);
sk->sk_state = SS_UNCONNECTED;
sock->state = SS_UNCONNECTED;
+ vsock_transport_cancel_pkt(vsk);
goto out_wait;
} else if (timeout == 0) {
err = -ETIMEDOUT;
sk->sk_state = SS_UNCONNECTED;
sock->state = SS_UNCONNECTED;
+ vsock_transport_cancel_pkt(vsk);
goto out_wait;
}
Patches currently in stable-queue which might be from bergwolf(a)gmail.com are
queue-4.9/vsock-cancel-packets-when-failing-to-connect.patch
queue-4.9/vsock-track-pkt-owner-vsock.patch
queue-4.9/vhost-vsock-add-pkt-cancel-capability.patch
This is a note to let you know that I've just added the patch titled
virtio-balloon: use actual number of stats for stats queue buffers
to the 4.9-stable tree which can be found at:
http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=sum…
The filename of the patch is:
virtio-balloon-use-actual-number-of-stats-for-stats-queue-buffers.patch
and it can be found in the queue-4.9 subdirectory.
If you, or anyone else, feels it should not be added to the stable tree,
please let <stable(a)vger.kernel.org> know about it.
>From foo@baz Thu Dec 21 09:02:40 CET 2017
From: Ladi Prosek <lprosek(a)redhat.com>
Date: Tue, 28 Mar 2017 18:46:58 +0200
Subject: virtio-balloon: use actual number of stats for stats queue buffers
From: Ladi Prosek <lprosek(a)redhat.com>
[ Upstream commit 9646b26e85896ef0256e66649f7937f774dc18a6 ]
The virtio balloon driver contained a not-so-obvious invariant that
update_balloon_stats has to update exactly VIRTIO_BALLOON_S_NR counters
in order to send valid stats to the host. This commit fixes it by having
update_balloon_stats return the actual number of counters, and its
callers use it when pushing buffers to the stats virtqueue.
Note that it is still out of spec to change the number of counters
at run-time. "Driver MUST supply the same subset of statistics in all
buffers submitted to the statsq."
Suggested-by: Arnd Bergmann <arnd(a)arndb.de>
Signed-off-by: Ladi Prosek <lprosek(a)redhat.com>
Signed-off-by: Michael S. Tsirkin <mst(a)redhat.com>
Signed-off-by: Sasha Levin <alexander.levin(a)verizon.com>
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
---
drivers/virtio/virtio_balloon.c | 17 ++++++++++-------
1 file changed, 10 insertions(+), 7 deletions(-)
--- a/drivers/virtio/virtio_balloon.c
+++ b/drivers/virtio/virtio_balloon.c
@@ -241,11 +241,11 @@ static inline void update_stat(struct vi
#define pages_to_bytes(x) ((u64)(x) << PAGE_SHIFT)
-static void update_balloon_stats(struct virtio_balloon *vb)
+static unsigned int update_balloon_stats(struct virtio_balloon *vb)
{
unsigned long events[NR_VM_EVENT_ITEMS];
struct sysinfo i;
- int idx = 0;
+ unsigned int idx = 0;
long available;
all_vm_events(events);
@@ -265,6 +265,8 @@ static void update_balloon_stats(struct
pages_to_bytes(i.totalram));
update_stat(vb, idx++, VIRTIO_BALLOON_S_AVAIL,
pages_to_bytes(available));
+
+ return idx;
}
/*
@@ -290,14 +292,14 @@ static void stats_handle_request(struct
{
struct virtqueue *vq;
struct scatterlist sg;
- unsigned int len;
+ unsigned int len, num_stats;
- update_balloon_stats(vb);
+ num_stats = update_balloon_stats(vb);
vq = vb->stats_vq;
if (!virtqueue_get_buf(vq, &len))
return;
- sg_init_one(&sg, vb->stats, sizeof(vb->stats));
+ sg_init_one(&sg, vb->stats, sizeof(vb->stats[0]) * num_stats);
virtqueue_add_outbuf(vq, &sg, 1, vb, GFP_KERNEL);
virtqueue_kick(vq);
}
@@ -421,15 +423,16 @@ static int init_vqs(struct virtio_balloo
vb->deflate_vq = vqs[1];
if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_STATS_VQ)) {
struct scatterlist sg;
+ unsigned int num_stats;
vb->stats_vq = vqs[2];
/*
* Prime this virtqueue with one buffer so the hypervisor can
* use it to signal us later (it can't be broken yet!).
*/
- update_balloon_stats(vb);
+ num_stats = update_balloon_stats(vb);
- sg_init_one(&sg, vb->stats, sizeof vb->stats);
+ sg_init_one(&sg, vb->stats, sizeof(vb->stats[0]) * num_stats);
if (virtqueue_add_outbuf(vb->stats_vq, &sg, 1, vb, GFP_KERNEL)
< 0)
BUG();
Patches currently in stable-queue which might be from lprosek(a)redhat.com are
queue-4.9/kvm-nvmx-fix-host_cr3-host_cr4-cache.patch
queue-4.9/virtio-balloon-use-actual-number-of-stats-for-stats-queue-buffers.patch
queue-4.9/virtio_balloon-prevent-uninitialized-variable-use.patch
This is a note to let you know that I've just added the patch titled
virtio_balloon: prevent uninitialized variable use
to the 4.9-stable tree which can be found at:
http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=sum…
The filename of the patch is:
virtio_balloon-prevent-uninitialized-variable-use.patch
and it can be found in the queue-4.9 subdirectory.
If you, or anyone else, feels it should not be added to the stable tree,
please let <stable(a)vger.kernel.org> know about it.
>From foo@baz Thu Dec 21 09:02:40 CET 2017
From: Arnd Bergmann <arnd(a)arndb.de>
Date: Tue, 28 Mar 2017 18:46:59 +0200
Subject: virtio_balloon: prevent uninitialized variable use
From: Arnd Bergmann <arnd(a)arndb.de>
[ Upstream commit f0bb2d50dfcc519f06f901aac88502be6ff1df2c ]
The latest gcc-7.0.1 snapshot reports a new warning:
virtio/virtio_balloon.c: In function 'update_balloon_stats':
virtio/virtio_balloon.c:258:26: error: 'events[2]' is used uninitialized in this function [-Werror=uninitialized]
virtio/virtio_balloon.c:260:26: error: 'events[3]' is used uninitialized in this function [-Werror=uninitialized]
virtio/virtio_balloon.c:261:56: error: 'events[18]' is used uninitialized in this function [-Werror=uninitialized]
virtio/virtio_balloon.c:262:56: error: 'events[17]' is used uninitialized in this function [-Werror=uninitialized]
This seems absolutely right, so we should add an extra check to
prevent copying uninitialized stack data into the statistics.
>From all I can tell, this has been broken since the statistics code
was originally added in 2.6.34.
Fixes: 9564e138b1f6 ("virtio: Add memory statistics reporting to the balloon driver (V4)")
Signed-off-by: Arnd Bergmann <arnd(a)arndb.de>
Signed-off-by: Ladi Prosek <lprosek(a)redhat.com>
Signed-off-by: Michael S. Tsirkin <mst(a)redhat.com>
Signed-off-by: Sasha Levin <alexander.levin(a)verizon.com>
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
---
drivers/virtio/virtio_balloon.c | 2 ++
1 file changed, 2 insertions(+)
--- a/drivers/virtio/virtio_balloon.c
+++ b/drivers/virtio/virtio_balloon.c
@@ -253,12 +253,14 @@ static unsigned int update_balloon_stats
available = si_mem_available();
+#ifdef CONFIG_VM_EVENT_COUNTERS
update_stat(vb, idx++, VIRTIO_BALLOON_S_SWAP_IN,
pages_to_bytes(events[PSWPIN]));
update_stat(vb, idx++, VIRTIO_BALLOON_S_SWAP_OUT,
pages_to_bytes(events[PSWPOUT]));
update_stat(vb, idx++, VIRTIO_BALLOON_S_MAJFLT, events[PGMAJFAULT]);
update_stat(vb, idx++, VIRTIO_BALLOON_S_MINFLT, events[PGFAULT]);
+#endif
update_stat(vb, idx++, VIRTIO_BALLOON_S_MEMFREE,
pages_to_bytes(i.freeram));
update_stat(vb, idx++, VIRTIO_BALLOON_S_MEMTOT,
Patches currently in stable-queue which might be from arnd(a)arndb.de are
queue-4.9/hwmon-asus_atk0110-fix-uninitialized-data-access.patch
queue-4.9/bna-avoid-writing-uninitialized-data-into-hw-registers.patch
queue-4.9/virtio-balloon-use-actual-number-of-stats-for-stats-queue-buffers.patch
queue-4.9/virtio_balloon-prevent-uninitialized-variable-use.patch
queue-4.9/isdn-kcapi-avoid-uninitialized-data.patch
This is a note to let you know that I've just added the patch titled
vfio/pci: Virtualize Maximum Payload Size
to the 4.9-stable tree which can be found at:
http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=sum…
The filename of the patch is:
vfio-pci-virtualize-maximum-payload-size.patch
and it can be found in the queue-4.9 subdirectory.
If you, or anyone else, feels it should not be added to the stable tree,
please let <stable(a)vger.kernel.org> know about it.
>From foo@baz Thu Dec 21 09:02:40 CET 2017
From: Alex Williamson <alex.williamson(a)redhat.com>
Date: Mon, 2 Oct 2017 12:39:09 -0600
Subject: vfio/pci: Virtualize Maximum Payload Size
From: Alex Williamson <alex.williamson(a)redhat.com>
[ Upstream commit 523184972b282cd9ca17a76f6ca4742394856818 ]
With virtual PCI-Express chipsets, we now see userspace/guest drivers
trying to match the physical MPS setting to a virtual downstream port.
Of course a lone physical device surrounded by virtual interconnects
cannot make a correct decision for a proper MPS setting. Instead,
let's virtualize the MPS control register so that writes through to
hardware are disallowed. Userspace drivers like QEMU assume they can
write anything to the device and we'll filter out anything dangerous.
Since mismatched MPS can lead to AER and other faults, let's add it
to the kernel side rather than relying on userspace virtualization to
handle it.
Signed-off-by: Alex Williamson <alex.williamson(a)redhat.com>
Reviewed-by: Eric Auger <eric.auger(a)redhat.com>
Signed-off-by: Sasha Levin <alexander.levin(a)verizon.com>
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
---
drivers/vfio/pci/vfio_pci_config.c | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)
--- a/drivers/vfio/pci/vfio_pci_config.c
+++ b/drivers/vfio/pci/vfio_pci_config.c
@@ -851,11 +851,13 @@ static int __init init_pci_cap_exp_perm(
/*
* Allow writes to device control fields, except devctl_phantom,
- * which could confuse IOMMU, and the ARI bit in devctl2, which
+ * which could confuse IOMMU, MPS, which can break communication
+ * with other physical devices, and the ARI bit in devctl2, which
* is set at probe time. FLR gets virtualized via our writefn.
*/
p_setw(perm, PCI_EXP_DEVCTL,
- PCI_EXP_DEVCTL_BCR_FLR, ~PCI_EXP_DEVCTL_PHANTOM);
+ PCI_EXP_DEVCTL_BCR_FLR | PCI_EXP_DEVCTL_PAYLOAD,
+ ~PCI_EXP_DEVCTL_PHANTOM);
p_setw(perm, PCI_EXP_DEVCTL2, NO_VIRT, ~PCI_EXP_DEVCTL2_ARI);
return 0;
}
Patches currently in stable-queue which might be from alex.williamson(a)redhat.com are
queue-4.9/pci-avoid-bus-reset-if-bridge-itself-is-broken.patch
queue-4.9/vfio-pci-virtualize-maximum-payload-size.patch
This is a note to let you know that I've just added the patch titled
vhost-vsock: add pkt cancel capability
to the 4.9-stable tree which can be found at:
http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=sum…
The filename of the patch is:
vhost-vsock-add-pkt-cancel-capability.patch
and it can be found in the queue-4.9 subdirectory.
If you, or anyone else, feels it should not be added to the stable tree,
please let <stable(a)vger.kernel.org> know about it.
>From foo@baz Thu Dec 21 09:02:40 CET 2017
From: Peng Tao <bergwolf(a)gmail.com>
Date: Wed, 15 Mar 2017 09:32:15 +0800
Subject: vhost-vsock: add pkt cancel capability
From: Peng Tao <bergwolf(a)gmail.com>
[ Upstream commit 16320f363ae128d9b9c70e60f00f2a572f57c23d ]
To allow canceling all packets of a connection.
Reviewed-by: Stefan Hajnoczi <stefanha(a)redhat.com>
Reviewed-by: Jorgen Hansen <jhansen(a)vmware.com>
Signed-off-by: Peng Tao <bergwolf(a)gmail.com>
Signed-off-by: David S. Miller <davem(a)davemloft.net>
Signed-off-by: Sasha Levin <alexander.levin(a)verizon.com>
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
---
drivers/vhost/vsock.c | 41 +++++++++++++++++++++++++++++++++++++++++
include/net/af_vsock.h | 3 +++
2 files changed, 44 insertions(+)
--- a/drivers/vhost/vsock.c
+++ b/drivers/vhost/vsock.c
@@ -218,6 +218,46 @@ vhost_transport_send_pkt(struct virtio_v
return len;
}
+static int
+vhost_transport_cancel_pkt(struct vsock_sock *vsk)
+{
+ struct vhost_vsock *vsock;
+ struct virtio_vsock_pkt *pkt, *n;
+ int cnt = 0;
+ LIST_HEAD(freeme);
+
+ /* Find the vhost_vsock according to guest context id */
+ vsock = vhost_vsock_get(vsk->remote_addr.svm_cid);
+ if (!vsock)
+ return -ENODEV;
+
+ spin_lock_bh(&vsock->send_pkt_list_lock);
+ list_for_each_entry_safe(pkt, n, &vsock->send_pkt_list, list) {
+ if (pkt->vsk != vsk)
+ continue;
+ list_move(&pkt->list, &freeme);
+ }
+ spin_unlock_bh(&vsock->send_pkt_list_lock);
+
+ list_for_each_entry_safe(pkt, n, &freeme, list) {
+ if (pkt->reply)
+ cnt++;
+ list_del(&pkt->list);
+ virtio_transport_free_pkt(pkt);
+ }
+
+ if (cnt) {
+ struct vhost_virtqueue *tx_vq = &vsock->vqs[VSOCK_VQ_TX];
+ int new_cnt;
+
+ new_cnt = atomic_sub_return(cnt, &vsock->queued_replies);
+ if (new_cnt + cnt >= tx_vq->num && new_cnt < tx_vq->num)
+ vhost_poll_queue(&tx_vq->poll);
+ }
+
+ return 0;
+}
+
static struct virtio_vsock_pkt *
vhost_vsock_alloc_pkt(struct vhost_virtqueue *vq,
unsigned int out, unsigned int in)
@@ -669,6 +709,7 @@ static struct virtio_transport vhost_tra
.release = virtio_transport_release,
.connect = virtio_transport_connect,
.shutdown = virtio_transport_shutdown,
+ .cancel_pkt = vhost_transport_cancel_pkt,
.dgram_enqueue = virtio_transport_dgram_enqueue,
.dgram_dequeue = virtio_transport_dgram_dequeue,
--- a/include/net/af_vsock.h
+++ b/include/net/af_vsock.h
@@ -100,6 +100,9 @@ struct vsock_transport {
void (*destruct)(struct vsock_sock *);
void (*release)(struct vsock_sock *);
+ /* Cancel all pending packets sent on vsock. */
+ int (*cancel_pkt)(struct vsock_sock *vsk);
+
/* Connections. */
int (*connect)(struct vsock_sock *);
Patches currently in stable-queue which might be from bergwolf(a)gmail.com are
queue-4.9/vsock-cancel-packets-when-failing-to-connect.patch
queue-4.9/vsock-track-pkt-owner-vsock.patch
queue-4.9/vhost-vsock-add-pkt-cancel-capability.patch