On Fri, 2018-12-07 at 12:18 +0000, David Woodhouse wrote:
> #else
> 	struct multicall_space mc = __xen_mc_entry(0);
> 	MULTI_set_segment_base(mc.mc, SEGBASE_GS_USER_SEL, 0);
> 
> 	loadsegment(fs, 0);
> #endif
>
> That seems to boot and run, at least.
>
> I'm going to experiment with sticking a SCHEDOP_yield in the batch *after* the update_descriptor requests, to see if I can trigger the original problem a bit quicker than my current method, which involves running a hundred machines for a day or two.
That SCHEDOP_yield and some debug output in xen_failsafe_callback show that it's now nice and easy to reproduce without the above MULTI_set_segment_base() call, and that it stops happening when I add the MULTI_set_segment_base() call back again. I'll call that 'tested'.
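In case it's useful to anyone else trying to reproduce this: the yield is just one more slot in the multicall batch, queued while the batch is still open in xen_load_tls(). A rough sketch (not the exact debug hack I ran, and the helper name is made up):

/*
 * Debug aid (sketch): queue a SCHEDOP_yield in the current multicall
 * batch so that Xen deschedules this vCPU immediately after the TLS
 * update_descriptor entries, i.e. exactly in the window where a stale
 * user %gs selector would fault on reload.
 *
 * Must be called between xen_mc_batch() and xen_mc_issue(), e.g. right
 * after the load_TLS_descriptor() calls in xen_load_tls().
 */
#include <xen/interface/xen.h>		/* __HYPERVISOR_sched_op */
#include <xen/interface/sched.h>	/* SCHEDOP_yield */
#include "multicalls.h"			/* __xen_mc_entry() */

static void xen_mc_debug_yield(void)
{
	struct multicall_space mc = __xen_mc_entry(0);

	mc.mc->op = __HYPERVISOR_sched_op;
	mc.mc->args[0] = SCHEDOP_yield;
	mc.mc->args[1] = 0;
}

That makes the preemption window deterministic instead of waiting for a lucky vCPU switch.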
But now we're making a hypercall to clear user %gs even in the case where none of the descriptors have changed; we should probably stop doing that...
Testing this (incremental) variant now.
--- a/arch/x86/xen/enlighten_pv.c
+++ b/arch/x86/xen/enlighten_pv.c
@@ -688,7 +688,7 @@ static inline bool desc_equal(const struct desc_struct *d1,
 }
 
 static void load_TLS_descriptor(struct thread_struct *t,
-				unsigned int cpu, unsigned int i)
+				unsigned int cpu, unsigned int i, int flush_gs)
 {
 	struct desc_struct *shadow = &per_cpu(shadow_tls_desc, cpu).desc[i];
 	struct desc_struct *gdt;
@@ -698,6 +698,15 @@ static void load_TLS_descriptor(struct thread_struct *t,
 	if (desc_equal(shadow, &t->tls_array[i]))
 		return;
 
+	/*
+	 * If the current user %gs points to a descriptor we're changing,
+	 * zero it first to avoid taking a fault if Xen preempts this
+	 * vCPU between now and the time that %gs is later loaded with
+	 * the new value.
+	 */
+	if ((flush_gs >> 3) == GDT_ENTRY_TLS_MIN + i)
+		MULTI_set_segment_base(mc.mc, SEGBASE_GS_USER_SEL, 0);
+
 	*shadow = t->tls_array[i];
 
 	gdt = get_cpu_gdt_table(cpu);
@@ -709,7 +718,7 @@ static void load_TLS_descriptor(struct thread_struct *t,
 
 static void xen_load_tls(struct thread_struct *t, unsigned int cpu)
 {
-	xen_mc_batch();
+	u16 flush_gs = 0;
 
 	/*
 	 * XXX sleazy hack: If we're being called in a lazy-cpu zone
@@ -723,17 +732,19 @@ static void xen_load_tls(struct thread_struct *t, unsigned int cpu)
 	 *
 	 * For x86_64, we need to zero %fs, otherwise we may get an
 	 * exception between the new %fs descriptor being loaded and
-	 * %fs being effectively cleared at __switch_to(). We can't
-	 * just zero %gs, but we do need to clear the selector in
-	 * case of a Xen vCPU context switch before it gets reloaded
-	 * which would also cause a fault.
+	 * %fs being effectively cleared at __switch_to().
+	 *
+	 * We may also need to zero %gs, if it refers to a descriptor
+	 * which we are clearing. Otherwise a Xen vCPU context switch
+	 * before it gets reloaded could also cause a fault. Since
+	 * clearing the user %gs is another hypercall, do that only if
+	 * it's necessary.
 	 */
 	if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_CPU) {
 #ifdef CONFIG_X86_32
 		lazy_load_gs(0);
 #else
-		struct multicall_space mc = __xen_mc_entry(0);
-		MULTI_set_segment_base(mc.mc, SEGBASE_GS_USER_SEL, 0);
+		savesegment(gs, flush_gs);
 
 		loadsegment(fs, 0);
 #endif
@@ -741,9 +752,9 @@
 
 	xen_mc_batch();
 
-	load_TLS_descriptor(t, cpu, 0);
-	load_TLS_descriptor(t, cpu, 1);
-	load_TLS_descriptor(t, cpu, 2);
+	load_TLS_descriptor(t, cpu, 0, flush_gs);
+	load_TLS_descriptor(t, cpu, 1, flush_gs);
+	load_TLS_descriptor(t, cpu, 2, flush_gs);
 
 	{
 		struct multicall_space mc = __xen_mc_entry(0);
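For reference (illustration only, the helper below isn't part of the patch): the (flush_gs >> 3) test works because a segment selector is just the descriptor index shifted left by 3, with the low bits holding the table indicator and RPL.

/*
 * Illustration: sel = (index << 3) | TI | RPL, so sel >> 3 recovers the
 * GDT index, which can be compared against the TLS slot being rewritten.
 * On 64-bit, where GDT_ENTRY_TLS_MIN is 12, the three user TLS selectors
 * are 0x63, 0x6b and 0x73.
 */
static inline bool sel_is_tls_slot(u16 sel, unsigned int i)
{
	return (sel >> 3) == GDT_ENTRY_TLS_MIN + i;
}

So savesegment(gs, flush_gs) costs nothing, and the extra MULTI_set_segment_base() hypercall only gets queued when user %gs actually points at the descriptor being rewritten.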