[ Upstream commit c8401dda2f0a00cd25c0af6a95ed50e478d25de4 ]
TF is handled a bit differently for syscall and sysret, compared to the other instructions: TF is checked after the instruction completes, so that the OS can disable #DB at a syscall by adding TF to FMASK. When the sysret is executed the #DB is taken "as if" the syscall insn just completed.
KVM emulates syscall so that it can trap 32-bit syscall on Intel processors. Fix the behavior, otherwise you could get #DB on a user stack which is not nice. This does not affect Linux guests, as they use an IST or task gate for #DB.
This fixes CVE-2017-7518.
Cc: stable@vger.kernel.org Reported-by: Andy Lutomirski luto@kernel.org Signed-off-by: Paolo Bonzini pbonzini@redhat.com Signed-off-by: Radim Krčmář rkrcmar@redhat.com
Conflicts: arch/x86/kvm/x86.c --- arch/x86/include/asm/kvm_emulate.h | 1 + arch/x86/kvm/emulate.c | 1 + arch/x86/kvm/x86.c | 52 ++++++++++++++++---------------------- 3 files changed, 24 insertions(+), 30 deletions(-)
diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index 19d14ac23ef9..fc3c7e49c8e4 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -296,6 +296,7 @@ struct x86_emulate_ctxt {
bool perm_ok; /* do not check permissions if true */ bool ud; /* inject an #UD if host doesn't support insn */ + bool tf; /* TF value before instruction (after for syscall/sysret) */
bool have_exception; struct x86_exception exception; diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 04b2f3cad7ba..684edebb4a0c 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2726,6 +2726,7 @@ static int em_syscall(struct x86_emulate_ctxt *ctxt) ctxt->eflags &= ~(X86_EFLAGS_VM | X86_EFLAGS_IF); }
+ ctxt->tf = (ctxt->eflags & X86_EFLAGS_TF) != 0; return X86EMUL_CONTINUE; }
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 8e526c6fd784..3ffd5900da5b 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5095,6 +5095,8 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu) kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
ctxt->eflags = kvm_get_rflags(vcpu); + ctxt->tf = (ctxt->eflags & X86_EFLAGS_TF) != 0; + ctxt->eip = kvm_rip_read(vcpu); ctxt->mode = (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL : (ctxt->eflags & X86_EFLAGS_VM) ? X86EMUL_MODE_VM86 : @@ -5315,37 +5317,26 @@ static int kvm_vcpu_check_hw_bp(unsigned long addr, u32 type, u32 dr7, return dr6; }
-static void kvm_vcpu_check_singlestep(struct kvm_vcpu *vcpu, unsigned long rflags, int *r) +static void kvm_vcpu_do_singlestep(struct kvm_vcpu *vcpu, int *r) { struct kvm_run *kvm_run = vcpu->run;
- /* - * rflags is the old, "raw" value of the flags. The new value has - * not been saved yet. - * - * This is correct even for TF set by the guest, because "the - * processor will not generate this exception after the instruction - * that sets the TF flag". - */ - if (unlikely(rflags & X86_EFLAGS_TF)) { - if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) { - kvm_run->debug.arch.dr6 = DR6_BS | DR6_FIXED_1 | - DR6_RTM; - kvm_run->debug.arch.pc = vcpu->arch.singlestep_rip; - kvm_run->debug.arch.exception = DB_VECTOR; - kvm_run->exit_reason = KVM_EXIT_DEBUG; - *r = EMULATE_USER_EXIT; - } else { - vcpu->arch.emulate_ctxt.eflags &= ~X86_EFLAGS_TF; - /* - * "Certain debug exceptions may clear bit 0-3. The - * remaining contents of the DR6 register are never - * cleared by the processor". - */ - vcpu->arch.dr6 &= ~15; - vcpu->arch.dr6 |= DR6_BS | DR6_RTM; - kvm_queue_exception(vcpu, DB_VECTOR); - } + if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) { + kvm_run->debug.arch.dr6 = DR6_BS | DR6_FIXED_1 | DR6_RTM; + kvm_run->debug.arch.pc = vcpu->arch.singlestep_rip; + kvm_run->debug.arch.exception = DB_VECTOR; + kvm_run->exit_reason = KVM_EXIT_DEBUG; + *r = EMULATE_USER_EXIT; + } else { + vcpu->arch.emulate_ctxt.eflags &= ~X86_EFLAGS_TF; + /* + * "Certain debug exceptions may clear bit 0-3. The + * remaining contents of the DR6 register are never + * cleared by the processor". + */ + vcpu->arch.dr6 &= ~15; + vcpu->arch.dr6 |= DR6_BS | DR6_RTM; + kvm_queue_exception(vcpu, DB_VECTOR); } }
@@ -5500,8 +5491,9 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, toggle_interruptibility(vcpu, ctxt->interruptibility); vcpu->arch.emulate_regs_need_sync_to_vcpu = false; kvm_rip_write(vcpu, ctxt->eip); - if (r == EMULATE_DONE) - kvm_vcpu_check_singlestep(vcpu, rflags, &r); + if (r == EMULATE_DONE && + (ctxt->tf || (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP))) + kvm_vcpu_do_singlestep(vcpu, &r); if (!ctxt->have_exception || exception_type(ctxt->exception.vector) == EXCPT_TRAP) __kvm_set_rflags(vcpu, ctxt->eflags);
On 16.11.2017 19:07, Paolo Bonzini wrote:
[ Upstream commit c8401dda2f0a00cd25c0af6a95ed50e478d25de4 ]
TF is handled a bit differently for syscall and sysret, compared to the other instructions: TF is checked after the instruction completes, so that the OS can disable #DB at a syscall by adding TF to FMASK. When the sysret is executed the #DB is taken "as if" the syscall insn just completed.
KVM emulates syscall so that it can trap 32-bit syscall on Intel processors. Fix the behavior, otherwise you could get #DB on a user stack which is not nice. This does not affect Linux guests, as they use an IST or task gate for #DB.
This fixes CVE-2017-7518.
Cc: stable@vger.kernel.org Reported-by: Andy Lutomirski luto@kernel.org Signed-off-by: Paolo Bonzini pbonzini@redhat.com Signed-off-by: Radim Krčmář rkrcmar@redhat.com
Conflicts: arch/x86/kvm/x86.c
arch/x86/include/asm/kvm_emulate.h | 1 + arch/x86/kvm/emulate.c | 1 + arch/x86/kvm/x86.c | 52 ++++++++++++++++---------------------- 3 files changed, 24 insertions(+), 30 deletions(-)
diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index 19d14ac23ef9..fc3c7e49c8e4 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -296,6 +296,7 @@ struct x86_emulate_ctxt { bool perm_ok; /* do not check permissions if true */ bool ud; /* inject an #UD if host doesn't support insn */
- bool tf; /* TF value before instruction (after for syscall/sysret) */
bool have_exception; struct x86_exception exception; diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 04b2f3cad7ba..684edebb4a0c 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2726,6 +2726,7 @@ static int em_syscall(struct x86_emulate_ctxt *ctxt) ctxt->eflags &= ~(X86_EFLAGS_VM | X86_EFLAGS_IF); }
- ctxt->tf = (ctxt->eflags & X86_EFLAGS_TF) != 0; return X86EMUL_CONTINUE;
} diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 8e526c6fd784..3ffd5900da5b 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5095,6 +5095,8 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu) kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); ctxt->eflags = kvm_get_rflags(vcpu);
- ctxt->tf = (ctxt->eflags & X86_EFLAGS_TF) != 0;
- ctxt->eip = kvm_rip_read(vcpu); ctxt->mode = (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL : (ctxt->eflags & X86_EFLAGS_VM) ? X86EMUL_MODE_VM86 :
@@ -5315,37 +5317,26 @@ static int kvm_vcpu_check_hw_bp(unsigned long addr, u32 type, u32 dr7, return dr6; } -static void kvm_vcpu_check_singlestep(struct kvm_vcpu *vcpu, unsigned long rflags, int *r) +static void kvm_vcpu_do_singlestep(struct kvm_vcpu *vcpu, int *r) { struct kvm_run *kvm_run = vcpu->run;
- /*
* rflags is the old, "raw" value of the flags. The new value has
* not been saved yet.
*
* This is correct even for TF set by the guest, because "the
* processor will not generate this exception after the instruction
* that sets the TF flag".
*/
- if (unlikely(rflags & X86_EFLAGS_TF)) {
if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) {
kvm_run->debug.arch.dr6 = DR6_BS | DR6_FIXED_1 |
DR6_RTM;
kvm_run->debug.arch.pc = vcpu->arch.singlestep_rip;
kvm_run->debug.arch.exception = DB_VECTOR;
kvm_run->exit_reason = KVM_EXIT_DEBUG;
*r = EMULATE_USER_EXIT;
} else {
vcpu->arch.emulate_ctxt.eflags &= ~X86_EFLAGS_TF;
/*
* "Certain debug exceptions may clear bit 0-3. The
* remaining contents of the DR6 register are never
* cleared by the processor".
*/
vcpu->arch.dr6 &= ~15;
vcpu->arch.dr6 |= DR6_BS | DR6_RTM;
kvm_queue_exception(vcpu, DB_VECTOR);
}
- if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) {
kvm_run->debug.arch.dr6 = DR6_BS | DR6_FIXED_1 | DR6_RTM;
kvm_run->debug.arch.pc = vcpu->arch.singlestep_rip;
kvm_run->debug.arch.exception = DB_VECTOR;
kvm_run->exit_reason = KVM_EXIT_DEBUG;
*r = EMULATE_USER_EXIT;
- } else {
vcpu->arch.emulate_ctxt.eflags &= ~X86_EFLAGS_TF;
/*
* "Certain debug exceptions may clear bit 0-3. The
* remaining contents of the DR6 register are never
* cleared by the processor".
*/
vcpu->arch.dr6 &= ~15;
vcpu->arch.dr6 |= DR6_BS | DR6_RTM;
}kvm_queue_exception(vcpu, DB_VECTOR);
}
This looks good to me.
General question: how do we treat KVM single stepping and concurrent TF in the guest? IOW, shouldn't the "else" rather be a check for ctxt->tf ? (or is that handled later on e.g. in user space?)
But question unrelated to this patch, so
Reviewed-by: David Hildenbrand david@redhat.com
@@ -5500,8 +5491,9 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, toggle_interruptibility(vcpu, ctxt->interruptibility); vcpu->arch.emulate_regs_need_sync_to_vcpu = false; kvm_rip_write(vcpu, ctxt->eip);
if (r == EMULATE_DONE)
kvm_vcpu_check_singlestep(vcpu, rflags, &r);
if (r == EMULATE_DONE &&
(ctxt->tf || (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)))
if (!ctxt->have_exception || exception_type(ctxt->exception.vector) == EXCPT_TRAP) __kvm_set_rflags(vcpu, ctxt->eflags);kvm_vcpu_do_singlestep(vcpu, &r);
On 20/11/2017 10:55, David Hildenbrand wrote:
This looks good to me.
General question: how do we treat KVM single stepping and concurrent TF in the guest? IOW, shouldn't the "else" rather be a check for ctxt->tf ? (or is that handled later on e.g. in user space?)
At this time, not very well. What you suggest seems sensible, however note that this function is only used on the (slow and rare) emulation path, not when the guest is running in VMX (or SVM) non-root mode.
Paolo
linux-stable-mirror@lists.linaro.org