Commit 633260fa143 ("x86/irq: Convey vector as argument and not in ptregs") changed the handover logic of the vector identifier from ~vector in orig_ax to purely register based. Unfortunately, this field has another consumer in the APIC code which the commit did not touch. The net result was that IRQ balancing did not work and instead resulted in interrupt storms, slowing down the system.
This patch restores the original semantics that orig_ax contains the vector. When we receive an interrupt now, the actual vector number stays stored in the orig_ax field which then gets consumed by the APIC code.
To ensure that nobody else trips over this in the future, the patch also adds comments at strategic places to warn anyone who would refactor the code that there is another consumer of the field.
With this patch in place, IRQ balancing works as expected and performance levels are restored to previous levels.
Reported-by: Alex bykov alex.bykov@scylladb.com
Reported-by: Avi Kivity avi@scylladb.com
Fixes: 633260fa143 ("x86/irq: Convey vector as argument and not in ptregs")
Cc: stable@vger.kernel.org
Signed-off-by: Alexander Graf graf@amazon.com
---
 arch/x86/entry/entry_32.S     |  2 +-
 arch/x86/entry/entry_64.S     | 17 +++++++++++------
 arch/x86/kernel/apic/vector.c |  2 +-
 3 files changed, 13 insertions(+), 8 deletions(-)
diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index df8c017..22e829c 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -727,7 +727,7 @@ SYM_CODE_START_LOCAL(asm_\cfunc)
 	ENCODE_FRAME_POINTER
 	movl	%esp, %eax
 	movl	PT_ORIG_EAX(%esp), %edx		/* get the vector from stack */
-	movl	$-1, PT_ORIG_EAX(%esp)		/* no syscall to restart */
+	/* keep vector on stack for APIC's irq_complete_move() */
 	call	\cfunc
 	jmp	handle_exception_return
 SYM_CODE_END(asm_\cfunc)
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 70dea93..d78fb1c 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -319,7 +319,7 @@ SYM_CODE_END(ret_from_fork)
  * @cfunc:		C function to be called
  * @has_error_code:	Hardware pushed error code on stack
  */
-.macro idtentry_body cfunc has_error_code:req
+.macro idtentry_body cfunc has_error_code:req preserve_error_code:req

 	call	error_entry
 	UNWIND_HINT_REGS
@@ -328,7 +328,9 @@ SYM_CODE_END(ret_from_fork)

 	.if \has_error_code == 1
 		movq	ORIG_RAX(%rsp), %rsi	/* get error code into 2nd argument*/
-		movq	$-1, ORIG_RAX(%rsp)	/* no syscall to restart */
+		.if \preserve_error_code == 0
+			movq	$-1, ORIG_RAX(%rsp)	/* no syscall to restart */
+		.endif
 	.endif

 	call	\cfunc
@@ -346,7 +348,7 @@ SYM_CODE_END(ret_from_fork)
 * The macro emits code to set up the kernel context for straight forward
 * and simple IDT entries. No IST stack, no paranoid entry checks.
 */
-.macro idtentry vector asmsym cfunc has_error_code:req
+.macro idtentry vector asmsym cfunc has_error_code:req preserve_error_code=0
 SYM_CODE_START(\asmsym)
 	UNWIND_HINT_IRET_REGS offset=\has_error_code*8
 	ASM_CLAC
@@ -369,7 +371,7 @@ SYM_CODE_START(\asmsym)
 .Lfrom_usermode_no_gap_\@:
 	.endif

-	idtentry_body \cfunc \has_error_code
+	idtentry_body \cfunc \has_error_code \preserve_error_code

 _ASM_NOKPROBE(\asmsym)
 SYM_CODE_END(\asmsym)
@@ -382,11 +384,14 @@ SYM_CODE_END(\asmsym)
 * position of idtentry exceptions, and jump to one of the two idtentry points
 * (common/spurious).
 *
+ * The original vector number on the stack has to stay untouched, so that the
+ * APIC irq_complete_move() code can access it later on IRQ ack.
+ *
 * common_interrupt is a hotpath, align it to a cache line
 */
 .macro idtentry_irq vector cfunc
 	.p2align CONFIG_X86_L1_CACHE_SHIFT
-	idtentry \vector asm_\cfunc \cfunc has_error_code=1
+	idtentry \vector asm_\cfunc \cfunc has_error_code=1 preserve_error_code=1
 .endm

 /*
@@ -440,7 +445,7 @@ SYM_CODE_START(\asmsym)

 	/* Switch to the regular task stack and use the noist entry point */
 .Lfrom_usermode_switch_stack_\@:
-	idtentry_body noist_\cfunc, has_error_code=0
+	idtentry_body noist_\cfunc, has_error_code=0, preserve_error_code=0

 _ASM_NOKPROBE(\asmsym)
 SYM_CODE_END(\asmsym)
diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c
index dae32d9..e81b835 100644
--- a/arch/x86/kernel/apic/vector.c
+++ b/arch/x86/kernel/apic/vector.c
@@ -924,7 +924,7 @@ static void __irq_complete_move(struct irq_cfg *cfg, unsigned vector)

 void irq_complete_move(struct irq_cfg *cfg)
 {
-	__irq_complete_move(cfg, ~get_irq_regs()->orig_ax);
+	__irq_complete_move(cfg, (u8)get_irq_regs()->orig_ax);
 }

 /*
On Wed, Aug 26, 2020 at 01:53:57PM +0200, Alexander Graf wrote:
-.macro idtentry_body cfunc has_error_code:req
+.macro idtentry_body cfunc has_error_code:req preserve_error_code:req

 	call	error_entry
 	UNWIND_HINT_REGS
@@ -328,7 +328,9 @@ SYM_CODE_END(ret_from_fork)

 	.if \has_error_code == 1
 		movq	ORIG_RAX(%rsp), %rsi	/* get error code into 2nd argument*/
-		movq	$-1, ORIG_RAX(%rsp)	/* no syscall to restart */
+		.if \preserve_error_code == 0
+			movq	$-1, ORIG_RAX(%rsp)	/* no syscall to restart */
+		.endif
 	.endif
When does this happen (has_error_code=1 && preserve_error_code=0)? I don't see any users of this macro (or idtentry) with this combination.
On 26.08.20 15:22, Josh Poimboeuf wrote:
On Wed, Aug 26, 2020 at 01:53:57PM +0200, Alexander Graf wrote:
-.macro idtentry_body cfunc has_error_code:req
+.macro idtentry_body cfunc has_error_code:req preserve_error_code:req

 	call	error_entry
 	UNWIND_HINT_REGS
@@ -328,7 +328,9 @@ SYM_CODE_END(ret_from_fork)

 	.if \has_error_code == 1
 		movq	ORIG_RAX(%rsp), %rsi	/* get error code into 2nd argument*/
-		movq	$-1, ORIG_RAX(%rsp)	/* no syscall to restart */
+		.if \preserve_error_code == 0
+			movq	$-1, ORIG_RAX(%rsp)	/* no syscall to restart */
+		.endif
 	.endif
When does this happen (has_error_code=1 && preserve_error_code=0)? I don't see any users of this macro (or idtentry) with this combination.
It's well hidden in arch/x86/include/asm/idtentry.h:
#define DECLARE_IDTENTRY_ERRORCODE(vector, func)			\
	idtentry vector asm_##func func has_error_code=1

/* Simple exception entries with error code pushed by hardware */
DECLARE_IDTENTRY_ERRORCODE(X86_TRAP_TS,		exc_invalid_tss);
DECLARE_IDTENTRY_ERRORCODE(X86_TRAP_NP,		exc_segment_not_present);
DECLARE_IDTENTRY_ERRORCODE(X86_TRAP_SS,		exc_stack_segment);
DECLARE_IDTENTRY_ERRORCODE(X86_TRAP_GP,		exc_general_protection);
DECLARE_IDTENTRY_ERRORCODE(X86_TRAP_AC,		exc_alignment_check);

[...]

DECLARE_IDTENTRY_RAW_ERRORCODE(X86_TRAP_PF,	exc_page_fault);
Alex
On Wed, Aug 26 2020 at 13:53, Alexander Graf wrote:
Commit 633260fa143 ("x86/irq: Convey vector as argument and not in ptregs") changed the handover logic of the vector identifier from ~vector in orig_ax to purely register based. Unfortunately, this field has another consumer in the APIC code which the commit did not touch. The net result was that IRQ balancing did not work and instead resulted in interrupt storms, slowing down the system.
The net result is an observation of the symptom but that does not explain what the underlying technical issue is.
This patch restores the original semantics that orig_ax contains the vector. When we receive an interrupt now, the actual vector number stays stored in the orig_ax field which then gets consumed by the APIC code.
To ensure that nobody else trips over this in the future, the patch also adds comments at strategic places to warn anyone who would refactor the code that there is another consumer of the field.
With this patch in place, IRQ balancing works as expected and performance levels are restored to previous levels.
There's a lot of 'This patch and we' in that changelog. Care to grep for 'This patch' in Documentation/process/ ?
diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index df8c017..22e829c 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -727,7 +727,7 @@ SYM_CODE_START_LOCAL(asm_\cfunc)
 	ENCODE_FRAME_POINTER
 	movl	%esp, %eax
 	movl	PT_ORIG_EAX(%esp), %edx		/* get the vector from stack */
-	movl	$-1, PT_ORIG_EAX(%esp)		/* no syscall to restart */
+	/* keep vector on stack for APIC's irq_complete_move() */
Yes that's fixing your observed wreckage, but it introduces a worse one.
user space -> interrupt
    push vector into orig_ax (values are in the ranges 0-127 and
    -128 to -1 for vectors 128-255, except for the system vectors
    which do not go through this code)
    handle()
    ...
    exit_to_user_mode_loop()
      arch_do_signal()
        /* Did we come from a system call? */
        if (syscall_get_nr(current, regs) >= 0) {
---> BOOM for any vector 0-127 because syscall_get_nr() resolves to regs->orig_ax
Going to be fun to debug.
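For reference, a condensed sketch of the code paths tglx points at, simplified from arch/x86/include/asm/syscall.h and the x86 signal delivery code (the real restart logic has more cases):

/* Condensed from arch/x86/include/asm/syscall.h: orig_ax doubles as
 * the "did we enter via a syscall" indicator. */
static inline long syscall_get_nr(struct task_struct *task,
				  struct pt_regs *regs)
{
	return regs->orig_ax;
}

/* Simplified restart check, inside the signal delivery path: a device
 * vector 0-127 left in orig_ax passes the >= 0 test and gets treated
 * as an interrupted syscall, rewinding ip and clobbering ax. */
if (syscall_get_nr(current, regs) >= 0) {
	switch (syscall_get_error(current, regs)) {
	case -ERESTARTNOHAND:
	case -ERESTARTSYS:
	case -ERESTARTNOINTR:
		regs->ax = regs->orig_ax;
		regs->ip -= 2;	/* re-execute the (nonexistent) syscall insn */
		break;
	}
}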
The below nasty hack cures it, but I hate it with a passion. I'll look deeper for a sane variant.
Thanks,
tglx
---
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -246,7 +246,9 @@ DEFINE_IDTENTRY_IRQ(common_interrupt)

 	desc = __this_cpu_read(vector_irq[vector]);
 	if (likely(!IS_ERR_OR_NULL(desc))) {
+		regs->orig_ax = (unsigned long)vector;
 		handle_irq(desc, regs);
+		regs->orig_ax = -1;
 	} else {
 		ack_APIC_irq();
On Wed, Aug 26, 2020 at 7:27 AM Thomas Gleixner tglx@linutronix.de wrote:
On Wed, Aug 26 2020 at 13:53, Alexander Graf wrote:
Commit 633260fa143 ("x86/irq: Convey vector as argument and not in ptregs") changed the handover logic of the vector identifier from ~vector in orig_ax to purely register based. Unfortunately, this field has another consumer in the APIC code which the commit did not touch. The net result was that IRQ balancing did not work and instead resulted in interrupt storms, slowing down the system.
The net result is an observation of the symptom but that does not explain what the underlying technical issue is.
This patch restores the original semantics that orig_ax contains the vector. When we receive an interrupt now, the actual vector number stays stored in the orig_ax field which then gets consumed by the APIC code.
To ensure that nobody else trips over this in the future, the patch also adds comments at strategic places to warn anyone who would refactor the code that there is another consumer of the field.
With this patch in place, IRQ balancing works as expected and performance levels are restored to previous levels.
There's a lot of 'This patch and we' in that changelog. Care to grep for 'This patch' in Documentation/process/ ?
diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index df8c017..22e829c 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -727,7 +727,7 @@ SYM_CODE_START_LOCAL(asm_\cfunc)
 	ENCODE_FRAME_POINTER
 	movl	%esp, %eax
 	movl	PT_ORIG_EAX(%esp), %edx		/* get the vector from stack */
-	movl	$-1, PT_ORIG_EAX(%esp)		/* no syscall to restart */
+	/* keep vector on stack for APIC's irq_complete_move() */
Yes that's fixing your observed wreckage, but it introduces a worse one.
user space -> interrupt
    push vector into orig_ax (values are in the ranges 0-127 and
    -128 to -1 for vectors 128-255, except for the system vectors
    which do not go through this code)
    handle()
    ...
    exit_to_user_mode_loop()
      arch_do_signal()
        /* Did we come from a system call? */
        if (syscall_get_nr(current, regs) >= 0) {
---> BOOM for any vector 0-127 because syscall_get_nr() resolves to regs->orig_ax
Going to be fun to debug.
The below nasty hack cures it, but I hate it with a passion. I'll look deeper for a sane variant.
Fundamentally, the way we overload orig_ax is problematic. I have a half-written series to improve it, but my series is broken. I think it's fixable, though.
First is this patch to use some __csh bits to indicate the entry type. As far as I know, this patch is correct:
https://git.kernel.org/pub/scm/linux/kernel/git/luto/linux.git/commit/?h=x86...
Then I wrote this incorrect patch:
https://git.kernel.org/pub/scm/linux/kernel/git/luto/linux.git/commit/?h=x86...
That one is wrong because the orig_ax wreckage seems to have leaked into user ABI -- user programs think that orig_ax has certain semantics on user-visible entries.
But I think that the problem in this thread could be fixed quite nicely by the first patch, plus a new CS_ENTRY_IRQ and allocating eight bits of __csh to store the vector. Then we could read out the vector.
--Andy
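A hypothetical sketch of the scheme Andy describes (not his actual patches; the flag name, bit positions, and helpers below are invented for illustration): hardware only defines the low 16 bits of the pushed CS slot, so the high bits of pt_regs::cs could carry an entry-type tag plus the vector, provided they are stripped again before iret or before the frame is exposed to ptrace/signal handling:

#include <linux/types.h>

/* Invented layout, for illustration only */
#define CS_ENTRY_IRQ		(1UL << 48)	/* assumed tag: "IRQ entry" */
#define CS_VECTOR_SHIFT		56		/* assumed: 8 spare bits for vector */

static inline void cs_stash_vector(struct pt_regs *regs, u8 vector)
{
	regs->cs |= CS_ENTRY_IRQ | ((unsigned long)vector << CS_VECTOR_SHIFT);
}

static inline bool cs_entry_is_irq(struct pt_regs *regs)
{
	return regs->cs & CS_ENTRY_IRQ;
}

static inline u8 cs_stashed_vector(struct pt_regs *regs)
{
	return (regs->cs >> CS_VECTOR_SHIFT) & 0xff;
}

/* Must be undone before iret / before userspace can observe regs */
static inline void cs_clear_stash(struct pt_regs *regs)
{
	regs->cs &= 0xffffUL;
}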
Andy,
On Wed, Aug 26 2020 at 09:13, Andy Lutomirski wrote:
On Wed, Aug 26, 2020 at 7:27 AM Thomas Gleixner tglx@linutronix.de wrote:
The below nasty hack cures it, but I hate it with a passion. I'll look deeper for a sane variant.
Fundamentally, the way we overload orig_ax is problematic. I have a half-written series to improve it, but my series is broken. I think it's fixable, though.
First is this patch to use some __csh bits to indicate the entry type. As far as I know, this patch is correct:
https://git.kernel.org/pub/scm/linux/kernel/git/luto/linux.git/commit/?h=x86...
Yes, that looks about right.
Then I wrote this incorrect patch:
https://git.kernel.org/pub/scm/linux/kernel/git/luto/linux.git/commit/?h=x86...
That one is wrong because the orig_ax wreckage seems to have leaked into user ABI -- user programs think that orig_ax has certain semantics on user-visible entries.
Yes, orig_ax is pretty much user ABI for a very long time.
But I think that the problem in this thread could be fixed quite nicely by the first patch, plus a new CS_ENTRY_IRQ and allocating eight bits of __csh to store the vector. Then we could read out the vector.
That works. Alternatively I can just store the vector in the irq descriptor itself. That's trivial enough and can be done completely in C independent of the stuff above.
Thanks,
tglx
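A rough sketch of the alternative tglx mentions, recording the arrival vector in the per-interrupt APIC data; this was never merged (the next mail makes it unnecessary), and the field and helper names here are invented:

/* Hypothetical only: record the vector an interrupt arrived on in the
 * APIC chip data at handling time, instead of fishing it out of
 * pt_regs later. "arrived_vector" does not exist in the kernel. */
struct apic_chip_data {
	/* ... existing fields ... */
	unsigned int		arrived_vector;
};

static void note_arrived_vector(struct irq_desc *desc, unsigned int vector)
{
	struct apic_chip_data *apicd = irq_desc_get_chip_data(desc);

	apicd->arrived_vector = vector;
}

/* irq_complete_move() could then compare apicd->arrived_vector with
 * apicd->vector instead of relying on ~regs->orig_ax. */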
On Wed, Aug 26, 2020 at 10:47 AM Thomas Gleixner tglx@linutronix.de wrote:
Andy,
On Wed, Aug 26 2020 at 09:13, Andy Lutomirski wrote:
On Wed, Aug 26, 2020 at 7:27 AM Thomas Gleixner tglx@linutronix.de wrote:
The below nasty hack cures it, but I hate it with a passion. I'll look deeper for a sane variant.
Fundamentally, the way we overload orig_ax is problematic. I have a half-written series to improve it, but my series is broken. I think it's fixable, though.
First is this patch to use some __csh bits to indicate the entry type. As far as I know, this patch is correct:
https://git.kernel.org/pub/scm/linux/kernel/git/luto/linux.git/commit/?h=x86...
Yes, that looks about right.
Then I wrote this incorrect patch:
https://git.kernel.org/pub/scm/linux/kernel/git/luto/linux.git/commit/?h=x86...
That one is wrong because the orig_ax wreckage seems to have leaked into user ABI -- user programs think that orig_ax has certain semantics on user-visible entries.
Yes, orig_ax is pretty much user ABI for a very long time.
But I think that the problem in this thread could be fixed quite nicely by the first patch, plus a new CS_ENTRY_IRQ and allocating eight bits of __csh to store the vector. Then we could read out the vector.
That works. Alternatively I can just store the vector in the irq descriptor itself. That's trivial enough and can be done completely in C independent of the stuff above.
The latter sounds quite sensible to me. It does seem vaguely ridiculous to be trying to fish the vector out of pt_regs in the APIC code.
--Andy
On 26.08.2020 at 20:03, Andy Lutomirski luto@kernel.org wrote:
On Wed, Aug 26, 2020 at 10:47 AM Thomas Gleixner tglx@linutronix.de wrote:
Andy,
On Wed, Aug 26 2020 at 09:13, Andy Lutomirski wrote:
On Wed, Aug 26, 2020 at 7:27 AM Thomas Gleixner tglx@linutronix.de wrote:
The below nasty hack cures it, but I hate it with a passion. I'll look deeper for a sane variant.
Fundamentally, the way we overload orig_ax is problematic. I have a half-written series to improve it, but my series is broken. I think it's fixable, though.
First is this patch to use some __csh bits to indicate the entry type. As far as I know, this patch is correct:
https://git.kernel.org/pub/scm/linux/kernel/git/luto/linux.git/commit/?h=x86...
Yes, that looks about right.
Then I wrote this incorrect patch:
https://git.kernel.org/pub/scm/linux/kernel/git/luto/linux.git/commit/?h=x86...
That one is wrong because the orig_ax wreckage seems to have leaked into user ABI -- user programs think that orig_ax has certain semantics on user-visible entries.
Yes, orig_ax is pretty much user ABI for a very long time.
But I think that the problem in this thread could be fixed quite nicely by the first patch, plus a new CS_ENTRY_IRQ and allocating eight bits of __csh to store the vector. Then we could read out the vector.
That works. Alternatively I can just store the vector in the irq descriptor itself. That's trivial enough and can be done completely in C independent of the stuff above.
The latter sounds quite sensible to me. It does seem vaguely ridiculous to be trying to fish the vector out of pt_regs in the APIC code.
I like that option much better than the orig_ax hacks. Is this going to be something usable enough for stable?
Also, Thomas, will you have a look at moving the vector info? If so, I'd hold off on this patch for a bit.
Alex
On 26.08.20 16:27, Thomas Gleixner wrote:
On Wed, Aug 26 2020 at 13:53, Alexander Graf wrote:
Commit 633260fa143 ("x86/irq: Convey vector as argument and not in ptregs") changed the handover logic of the vector identifier from ~vector in orig_ax to purely register based. Unfortunately, this field has another consumer in the APIC code which the commit did not touch. The net result was that IRQ balancing did not work and instead resulted in interrupt storms, slowing down the system.
The net result is an observation of the symptom but that does not explain what the underlying technical issue is.
This patch restores the original semantics that orig_ax contains the vector. When we receive an interrupt now, the actual vector number stays stored in the orig_ax field which then gets consumed by the APIC code.
To ensure that nobody else trips over this in the future, the patch also adds comments at strategic places to warn anyone who would refactor the code that there is another consumer of the field.
With this patch in place, IRQ balancing works as expected and performance levels are restored to previous levels.
There's a lot of 'This patch and we' in that changelog. Care to grep for 'This patch' in Documentation/process/ ?
diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index df8c017..22e829c 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -727,7 +727,7 @@ SYM_CODE_START_LOCAL(asm_\cfunc)
 	ENCODE_FRAME_POINTER
 	movl	%esp, %eax
 	movl	PT_ORIG_EAX(%esp), %edx		/* get the vector from stack */
-	movl	$-1, PT_ORIG_EAX(%esp)		/* no syscall to restart */
+	/* keep vector on stack for APIC's irq_complete_move() */
Yes that's fixing your observed wreckage, but it introduces a worse one.
user space -> interrupt
    push vector into orig_ax (values are in the ranges 0-127 and
    -128 to -1 for vectors 128-255, except for the system vectors
    which do not go through this code)
    handle()
    ...
    exit_to_user_mode_loop()
      arch_do_signal()
        /* Did we come from a system call? */
        if (syscall_get_nr(current, regs) >= 0) {
---> BOOM for any vector 0-127 because syscall_get_nr() resolves to regs->orig_ax
Going to be fun to debug.
Hah, that's the code flow I was looking for to understand why the value was negative in the first place. Thanks a lot for pointing it out!
The below nasty hack cures it, but I hate it with a passion. I'll look deeper for a sane variant.
An alternative (that doesn't make the code easier to read, but would fix the issue at hand) would be to use a pushq imm16 with vector | 0x8000 instead, to always make the value negative, no?
Alex
On Wed, Aug 26 2020 at 18:33, Alexander Graf wrote:
On 26.08.20 16:27, Thomas Gleixner wrote:
The below nasty hack cures it, but I hate it with a passion. I'll look deeper for a sane variant.
An alternative (that doesn't make the code easier to read, but would fix the issue at hand) would be to use a pushq imm16 with vector | 0x8000 instead, to always make the value negative, no?
Which makes each entry larger than 8 bytes, which was frowned upon before.
And it does not solve the issue that we abuse orig_ax which Andy mentioned.
Thanks,
tglx
On Wed, Aug 26 2020 at 20:30, Thomas Gleixner wrote:
And it does not solve the issue that we abuse orig_ax which Andy mentioned.
Ha! After staring some more, it's not required at all, which is the most elegant solution.
The vector check there is pointless because there is never a situation where an interrupt is moved from vector A to vector B on the same CPU.
That's a leftover from the old allocation model which supported multi-CPU affinities, but this was removed as it just created trouble for no real benefit.
Today the effective affinity is a single CPU out of the requested affinity mask. If an affinity mask change still contains the current target CPU then there is no move happening at all. It just stays on that vector on that CPU.
Thanks,
tglx
---
--- a/arch/x86/kernel/apic/vector.c
+++ b/arch/x86/kernel/apic/vector.c
@@ -909,7 +909,7 @@ void send_cleanup_vector(struct irq_cfg
 		__send_cleanup_vector(apicd);
 }

-static void __irq_complete_move(struct irq_cfg *cfg, unsigned vector)
+void irq_complete_move(struct irq_cfg *cfg)
 {
 	struct apic_chip_data *apicd;

@@ -917,15 +917,10 @@ static void __irq_complete_move(struct i
 	if (likely(!apicd->move_in_progress))
 		return;

-	if (vector == apicd->vector && apicd->cpu == smp_processor_id())
+	if (apicd->cpu == smp_processor_id())
 		__send_cleanup_vector(apicd);
 }

-void irq_complete_move(struct irq_cfg *cfg)
-{
-	__irq_complete_move(cfg, ~get_irq_regs()->orig_ax);
-}
-
 /*
  * Called from fixup_irqs() with @desc->lock held and interrupts disabled.
  */
On 26.08.20 20:53, Thomas Gleixner wrote:
On Wed, Aug 26 2020 at 20:30, Thomas Gleixner wrote:
And it does not solve the issue that we abuse orig_ax which Andy mentioned.
Ha! After staring some more, it's not required at all, which is the most elegant solution.
The vector check there is pointless because there is never a situation where an interrupt is moved from vector A to vector B on the same CPU.
That's a leftover from the old allocation model which supported multi-CPU affinities, but this was removed as it just created trouble for no real benefit.
Today the effective affinity is a single CPU out of the requested affinity mask. If an affinity mask change still contains the current target CPU then there is no move happening at all. It just stays on that vector on that CPU.
Thanks,
tglx
--- a/arch/x86/kernel/apic/vector.c
+++ b/arch/x86/kernel/apic/vector.c
@@ -909,7 +909,7 @@ void send_cleanup_vector(struct irq_cfg
 		__send_cleanup_vector(apicd);
 }

-static void __irq_complete_move(struct irq_cfg *cfg, unsigned vector)
+void irq_complete_move(struct irq_cfg *cfg)
 {
 	struct apic_chip_data *apicd;

@@ -917,15 +917,10 @@ static void __irq_complete_move(struct i
 	if (likely(!apicd->move_in_progress))
 		return;

-	if (vector == apicd->vector && apicd->cpu == smp_processor_id())
+	if (apicd->cpu == smp_processor_id())
 		__send_cleanup_vector(apicd);
 }

-void irq_complete_move(struct irq_cfg *cfg)
-{
-	__irq_complete_move(cfg, ~get_irq_regs()->orig_ax);
-}
-
 /*
  * Called from fixup_irqs() with @desc->lock held and interrupts disabled.
  */
As expected, this also fixes the issue at hand. Do you want to send a real patch? :)
Tested-by: Alexander Graf graf@amazon.com
Alex
Several people reported that 5.8 broke the interrupt affinity setting mechanism.
The consolidation of the entry code reused the regular exception entry code for device interrupts and changed the way how the vector number is conveyed from ptregs->orig_ax to a function argument.
The low level entry uses the hardware error code slot to push the vector number onto the stack which is retrieved from there into a function argument and the slot on stack is set to -1.
The reason for setting it to -1 is that the error code slot is at the position where pt_regs::orig_ax is. A positive value in pt_regs::orig_ax indicates that the entry came via a syscall. If it's not set to a negative value then a signal delivery on return to userspace would try to restart a syscall. But there are other places which rely on pt_regs::orig_ax being a valid indicator for syscall entry.
But setting pt_regs::orig_ax to -1 has a nasty side effect vs. the interrupt affinity setting mechanism, which was overlooked when this change was made.
Moving interrupts on x86 happens in several steps. A new vector on a different CPU is allocated and the relevant interrupt source is reprogrammed to that. But that's racy and there might be an interrupt already in flight to the old vector. So the old vector is preserved until the first interrupt arrives on the new vector and the new target CPU. Once that happens the old vector is cleaned up, but this cleanup still depends on the vector number being stored in pt_regs::orig_ax, which is now -1.
That -1 makes the check for cleanup: pt_regs::orig_ax == new_vector always false. As a consequence the interrupt is moved once, but then it cannot be moved anymore because the cleanup of the old vector never happens.
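In code terms (illustrative only, showing the pre-fix retrieval that the diff below removes):

/* With orig_ax pinned to -1 by the entry code, ~orig_ax evaluates to
 * 0, and no device interrupt ever uses vector 0 (external vectors
 * start at FIRST_EXTERNAL_VECTOR, 0x20), so the match always fails: */
__irq_complete_move(cfg, ~get_irq_regs()->orig_ax);	/* vector == 0 */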
There would be several ways to convey the vector information to that place in the guts of the interrupt handling, but on deeper inspection it turned out that this check is pointless and a leftover from the old affinity model of X86 which supported multi-CPU affinities. Under this model it was possible that an interrupt had an old and a new vector on the same CPU, so the vector match was required.
Under the new model the effective affinity of an interrupt is always a single CPU from the requested affinity mask. If the affinity mask changes then either the interrupt stays on the CPU and on the same vector when that CPU is still in the new affinity mask or it is moved to a different CPU, but it is never moved to a different vector on the same CPU.
Ergo the cleanup check for the matching vector number is not required and can be removed which makes the dependency on pt_regs::orig_ax go away.
The remaining check for new_cpu == smp_processor_id() is completely sufficient. If it matches then the interrupt was successfully migrated and the cleanup can proceed.
For paranoia's sake add a warning into the vector assignment code to validate that the assumption of never moving to a different vector on the same CPU holds.
Reported-by: Alex bykov alex.bykov@scylladb.com
Reported-by: Avi Kivity avi@scylladb.com
Reported-by: Alexander Graf graf@amazon.com
Fixes: 633260fa143 ("x86/irq: Convey vector as argument and not in ptregs")
Signed-off-by: Thomas Gleixner tglx@linutronix.de
Cc: stable@vger.kernel.org
---
 arch/x86/kernel/apic/vector.c | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)
--- a/arch/x86/kernel/apic/vector.c
+++ b/arch/x86/kernel/apic/vector.c
@@ -161,6 +161,7 @@ static void apic_update_vector(struct ir
 		apicd->move_in_progress = true;
 		apicd->prev_vector = apicd->vector;
 		apicd->prev_cpu = apicd->cpu;
+		WARN_ON_ONCE(apicd->cpu == newcpu);
 	} else {
 		irq_matrix_free(vector_matrix, apicd->cpu, apicd->vector,
 				managed);
@@ -909,7 +910,7 @@ void send_cleanup_vector(struct irq_cfg
 		__send_cleanup_vector(apicd);
 }

-static void __irq_complete_move(struct irq_cfg *cfg, unsigned vector)
+void irq_complete_move(struct irq_cfg *cfg)
 {
 	struct apic_chip_data *apicd;

@@ -917,15 +918,16 @@ static void __irq_complete_move(struct i
 	if (likely(!apicd->move_in_progress))
 		return;

-	if (vector == apicd->vector && apicd->cpu == smp_processor_id())
+	/*
+	 * If the interrupt arrived on the new target CPU, cleanup the
+	 * vector on the old target CPU. A vector check is not required
+	 * because an interrupt can never move from one vector to another
+	 * on the same CPU.
+	 */
+	if (apicd->cpu == smp_processor_id())
 		__send_cleanup_vector(apicd);
 }

-void irq_complete_move(struct irq_cfg *cfg)
-{
-	__irq_complete_move(cfg, ~get_irq_regs()->orig_ax);
-}
-
 /*
  * Called from fixup_irqs() with @desc->lock held and interrupts disabled.
  */
From: Thomas Gleixner
Sent: 26 August 2020 21:22
...
Moving interrupts on x86 happens in several steps. A new vector on a different CPU is allocated and the relevant interrupt source is reprogrammed to that. But that's racy and there might be an interrupt already in flight to the old vector. So the old vector is preserved until the first interrupt arrives on the new vector and the new target CPU. Once that happens the old vector is cleaned up, but this cleanup still depends on the vector number being stored in pt_regs::orig_ax, which is now -1.
I suspect that it is much more 'racy' than that for MSI-X interrupts. On the hardware side there is an interrupt disable bit, an address and a value. To raise an interrupt the hardware must write the value to the address.
If the cpu needs to move an interrupt, both the address and value need changing, but the cpu won't write the address and value using the same TLP, so the hardware could potentially write a value to the wrong address. Worse than that, the hardware could easily only look at the address and value in the clocks after checking the interrupt is enabled. So masking the interrupt immediately prior to changing the vector info may not be enough.
It is likely that a read-back of the mask before updating the vector is enough.
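A minimal sketch of that ordering, assuming an MSI-X table entry mapped at 'entry' (the helper function is invented here, but the register offsets and mask bit follow the PCI spec MSI-X table layout, mirroring in spirit what drivers/pci/msi.c does when touching a live entry):

#include <linux/io.h>
#include <linux/kernel.h>

#define PCI_MSIX_ENTRY_LOWER_ADDR	0x0
#define PCI_MSIX_ENTRY_UPPER_ADDR	0x4
#define PCI_MSIX_ENTRY_DATA		0x8
#define PCI_MSIX_ENTRY_VECTOR_CTRL	0xc
#define PCI_MSIX_ENTRY_CTRL_MASKBIT	0x1

static void msix_retarget_entry(void __iomem *entry, u64 addr, u32 data)
{
	u32 ctrl = readl(entry + PCI_MSIX_ENTRY_VECTOR_CTRL);

	/* Mask the entry, then read the mask back so the device has
	 * provably observed it before address/data change under it. */
	writel(ctrl | PCI_MSIX_ENTRY_CTRL_MASKBIT,
	       entry + PCI_MSIX_ENTRY_VECTOR_CTRL);
	readl(entry + PCI_MSIX_ENTRY_VECTOR_CTRL);

	writel(lower_32_bits(addr), entry + PCI_MSIX_ENTRY_LOWER_ADDR);
	writel(upper_32_bits(addr), entry + PCI_MSIX_ENTRY_UPPER_ADDR);
	writel(data, entry + PCI_MSIX_ENTRY_DATA);

	writel(ctrl & ~PCI_MSIX_ENTRY_CTRL_MASKBIT,
	       entry + PCI_MSIX_ENTRY_VECTOR_CTRL);
}

As the follow-up below points out, even this only guarantees the device saw the mask; an interrupt may already be in flight to the old vector.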
David
From: David Laight
Sent: 26 August 2020 22:37
From: Thomas Gleixner
Sent: 26 August 2020 21:22
...
Moving interrupts on x86 happens in several steps. A new vector on a different CPU is allocated and the relevant interrupt source is reprogrammed to that. But that's racy and there might be an interrupt already in flight to the old vector. So the old vector is preserved until the first interrupt arrives on the new vector and the new target CPU. Once that happens the old vector is cleaned up, but this cleanup still depends on the vector number being stored in pt_regs::orig_ax, which is now -1.
I suspect that it is much more 'racy' than that for MSI-X interrupts. On the hardware side there is an interrupt disable bit, an address and a value. To raise an interrupt the hardware must write the value to the address.
If the cpu needs to move an interrupt, both the address and value need changing, but the cpu won't write the address and value using the same TLP, so the hardware could potentially write a value to the wrong address. Worse than that, the hardware could easily only look at the address and value in the clocks after checking the interrupt is enabled. So masking the interrupt immediately prior to changing the vector info may not be enough.
It is likely that a read-back of the mask before updating the vector is enough.
But not enough to assume you won't receive an interrupt after reading back that interrupts are masked.
(I've implemented the hardware side for an fpga ...)
David
On 26.08.20 23:47, David Laight wrote:
From: David Laight
Sent: 26 August 2020 22:37
From: Thomas Gleixner
Sent: 26 August 2020 21:22
...
Moving interrupts on x86 happens in several steps. A new vector on a different CPU is allocated and the relevant interrupt source is reprogrammed to that. But that's racy and there might be an interrupt already in flight to the old vector. So the old vector is preserved until the first interrupt arrives on the new vector and the new target CPU. Once that happens the old vector is cleaned up, but this cleanup still depends on the vector number being stored in pt_regs::orig_ax, which is now -1.
I suspect that it is much more 'racy' than that for MSI-X interrupts. On the hardware side there is an interrupt disable bit, an address and a value. To raise an interrupt the hardware must write the value to the address.
If the cpu needs to move an interrupt, both the address and value need changing, but the cpu won't write the address and value using the same TLP, so the hardware could potentially write a value to the wrong address. Worse than that, the hardware could easily only look at the address and value in the clocks after checking the interrupt is enabled. So masking the interrupt immediately prior to changing the vector info may not be enough.
It is likely that a read-back of the mask before updating the vector is enough.
But not enough to assume you won't receive an interrupt after reading back that interrupts are masked.
(I've implemented the hardware side for an fpga ...)
Do we actually care in this context? All we want to know here is whether a device (or irqchip in between) has actually noticed that it should post to a new vector. If we get interrupts on random other vectors in between, they would simply show up as spurious, no?
So I don't quite see where this patch makes the situation any worse than before.
Alex
From: Alexander Graf
Sent: 26 August 2020 23:53
On 26.08.20 23:47, David Laight wrote:
From: David Laight
Sent: 26 August 2020 22:37
From: Thomas Gleixner
Sent: 26 August 2020 21:22
...
Moving interrupts on x86 happens in several steps. A new vector on a different CPU is allocated and the relevant interrupt source is reprogrammed to that. But that's racy and there might be an interrupt already in flight to the old vector. So the old vector is preserved until the first interrupt arrives on the new vector and the new target CPU. Once that happens the old vector is cleaned up, but this cleanup still depends on the vector number being stored in pt_regs::orig_ax, which is now -1.
I suspect that it is much more 'racy' than that for MSI-X interrupts. On the hardware side there is an interrupt disable bit, an address and a value. To raise an interrupt the hardware must write the value to the address.
If the cpu needs to move an interrupt, both the address and value need changing, but the cpu won't write the address and value using the same TLP, so the hardware could potentially write a value to the wrong address. Worse than that, the hardware could easily only look at the address and value in the clocks after checking the interrupt is enabled. So masking the interrupt immediately prior to changing the vector info may not be enough.
It is likely that a read-back of the mask before updating the vector is enough.
But not enough to assume you won't receive an interrupt after reading back that interrupts are masked.
(I've implemented the hardware side for an fpga ...)
Do we actually care in this context? All we want to know here is whether a device (or irqchip in between) has actually noticed that it should post to a new vector. If we get interrupts on random other vectors in between, they would simply show up as spurious, no?
So I don't quite see where this patch makes the situation any worse than before.
Oh, it won't make it any worse. It just might be rather worse than anyone imagined.
David
On Wed, Aug 26 2020 at 21:37, David Laight wrote:
From: Thomas Gleixner
Sent: 26 August 2020 21:22
...
Moving interrupts on x86 happens in several steps. A new vector on a different CPU is allocated and the relevant interrupt source is reprogrammed to that. But that's racy and there might be an interrupt already in flight to the old vector. So the old vector is preserved until the first interrupt arrives on the new vector and the new target CPU. Once that happens the old vector is cleaned up, but this cleanup still depends on the vector number being stored in pt_regs::orig_ax, which is now -1.
I suspect that it is much more 'racy' than that for MSI-X interrupts. On the hardware side there is an interrupt disable bit, an address and a value. To raise an interrupt the hardware must write the value to the address.
Really?
If the cpu needs to move an interrupt, both the address and value need changing, but the cpu won't write the address and value using the same TLP, so the hardware could potentially write a value to the wrong address.
Now I understand finally why msi_set_affinity() in x86 has to be so convoluted.
Thanks a lot for the enlightenment!
tglx
From: Thomas Gleixner
Sent: 26 August 2020 23:08
...
I suspect that it is much more 'racy' than that for MSI-X interrupts. On the hardware side there is an interrupt disable bit, an address and a value. To raise an interrupt the hardware must write the value to the address.
Really?
Yep, anyone with write access to the msi-x table can get the device to write to any physical location (allowed by any IOMMU) instead of raising an interrupt.
If the cpu needs to move an interrupt, both the address and value need changing, but the cpu won't write the address and value using the same TLP, so the hardware could potentially write a value to the wrong address.
Now I understand finally why msi_set_affinity() in x86 has to be so convoluted.
Updating the registers should be much the same on all architectures. I probably should have looked at what msi_set_affinity() does before deciding which order the fpga logic should read the four 32bit registers in; but they are read in increasing order - so enable bit last.
David
The following commit has been merged into the x86/urgent branch of tip:
Commit-ID:     e027fffff799cdd70400c5485b1a54f482255985
Gitweb:        https://git.kernel.org/tip/e027fffff799cdd70400c5485b1a54f482255985
Author:        Thomas Gleixner tglx@linutronix.de
AuthorDate:    Wed, 26 Aug 2020 22:21:44 +02:00
Committer:     Thomas Gleixner tglx@linutronix.de
CommitterDate: Thu, 27 Aug 2020 09:29:23 +02:00
x86/irq: Unbreak interrupt affinity setting
Several people reported that 5.8 broke the interrupt affinity setting mechanism.
The consolidation of the entry code reused the regular exception entry code for device interrupts and changed the way how the vector number is conveyed from ptregs->orig_ax to a function argument.
The low level entry uses the hardware error code slot to push the vector number onto the stack which is retrieved from there into a function argument and the slot on stack is set to -1.
The reason for setting it to -1 is that the error code slot is at the position where pt_regs::orig_ax is. A positive value in pt_regs::orig_ax indicates that the entry came via a syscall. If it's not set to a negative value then a signal delivery on return to userspace would try to restart a syscall. But there are other places which rely on pt_regs::orig_ax being a valid indicator for syscall entry.
But setting pt_regs::orig_ax to -1 has a nasty side effect vs. the interrupt affinity setting mechanism, which was overlooked when this change was made.
Moving interrupts on x86 happens in several steps. A new vector on a different CPU is allocated and the relevant interrupt source is reprogrammed to that. But that's racy and there might be an interrupt already in flight to the old vector. So the old vector is preserved until the first interrupt arrives on the new vector and the new target CPU. Once that happens the old vector is cleaned up, but this cleanup still depends on the vector number being stored in pt_regs::orig_ax, which is now -1.
That -1 makes the check for cleanup: pt_regs::orig_ax == new_vector always false. As a consequence the interrupt is moved once, but then it cannot be moved anymore because the cleanup of the old vector never happens.
There would be several ways to convey the vector information to that place in the guts of the interrupt handling, but on deeper inspection it turned out that this check is pointless and a leftover from the old affinity model of X86 which supported multi-CPU affinities. Under this model it was possible that an interrupt had an old and a new vector on the same CPU, so the vector match was required.
Under the new model the effective affinity of an interrupt is always a single CPU from the requested affinity mask. If the affinity mask changes then either the interrupt stays on the CPU and on the same vector when that CPU is still in the new affinity mask or it is moved to a different CPU, but it is never moved to a different vector on the same CPU.
Ergo the cleanup check for the matching vector number is not required and can be removed which makes the dependency on pt_regs::orig_ax go away.
The remaining check for new_cpu == smp_processor_id() is completely sufficient. If it matches then the interrupt was successfully migrated and the cleanup can proceed.
For paranoia's sake add a warning into the vector assignment code to validate that the assumption of never moving to a different vector on the same CPU holds.
Fixes: 633260fa143 ("x86/irq: Convey vector as argument and not in ptregs")
Reported-by: Alex bykov alex.bykov@scylladb.com
Reported-by: Avi Kivity avi@scylladb.com
Reported-by: Alexander Graf graf@amazon.com
Signed-off-by: Thomas Gleixner tglx@linutronix.de
Tested-by: Alexander Graf graf@amazon.com
Cc: stable@vger.kernel.org
Link: https://lore.kernel.org/r/87wo1ltaxz.fsf@nanos.tec.linutronix.de
---
 arch/x86/kernel/apic/vector.c | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)
diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c
index dae32d9..f8a56b5 100644
--- a/arch/x86/kernel/apic/vector.c
+++ b/arch/x86/kernel/apic/vector.c
@@ -161,6 +161,7 @@ static void apic_update_vector(struct irq_data *irqd, unsigned int newvec,
 		apicd->move_in_progress = true;
 		apicd->prev_vector = apicd->vector;
 		apicd->prev_cpu = apicd->cpu;
+		WARN_ON_ONCE(apicd->cpu == newcpu);
 	} else {
 		irq_matrix_free(vector_matrix, apicd->cpu, apicd->vector,
 				managed);
@@ -910,7 +911,7 @@ void send_cleanup_vector(struct irq_cfg *cfg)
 		__send_cleanup_vector(apicd);
 }

-static void __irq_complete_move(struct irq_cfg *cfg, unsigned vector)
+void irq_complete_move(struct irq_cfg *cfg)
 {
 	struct apic_chip_data *apicd;

@@ -918,15 +919,16 @@ static void __irq_complete_move(struct irq_cfg *cfg, unsigned vector)
 	if (likely(!apicd->move_in_progress))
 		return;

-	if (vector == apicd->vector && apicd->cpu == smp_processor_id())
+	/*
+	 * If the interrupt arrived on the new target CPU, cleanup the
+	 * vector on the old target CPU. A vector check is not required
+	 * because an interrupt can never move from one vector to another
+	 * on the same CPU.
+	 */
+	if (apicd->cpu == smp_processor_id())
 		__send_cleanup_vector(apicd);
 }

-void irq_complete_move(struct irq_cfg *cfg)
-{
-	__irq_complete_move(cfg, ~get_irq_regs()->orig_ax);
-}
-
 /*
  * Called from fixup_irqs() with @desc->lock held and interrupts disabled.
  */