External interrupts (EVENT_TYPE_EXTINT) and system calls (EVENT_TYPE_OTHER) occur more frequently than other events in a typical system. Prioritizing these events saves CPU cycles and optimizes the efficiency of performance-critical paths.
When examining the compiler-generated assembly code for event dispatching in the functions fred_entry_from_user() and fred_entry_from_kernel(), it was observed that the compiler intelligently uses a binary search to match all event type values (0-7) and perform dispatching. As a result, even if the following cases:
	case EVENT_TYPE_EXTINT:
		return fred_extint(regs);
	case EVENT_TYPE_OTHER:
		return fred_other(regs);
are placed at the beginning of the switch() statement, the generated assembly code would remain the same, and the expected prioritization would not be achieved.
Command line to check the assembly code generated by the compiler for fred_entry_from_user():
$objdump -d vmlinux.o | awk '/<fred_entry_from_user>:/{c=65} c&&c--'
00000000000015a0 <fred_entry_from_user>:
    15a0:	0f b6 87 a6 00 00 00 	movzbl 0xa6(%rdi),%eax
    15a7:	48 8b 77 78          	mov    0x78(%rdi),%rsi
    15ab:	55                   	push   %rbp
    15ac:	48 c7 47 78 ff ff ff 	movq   $0xffffffffffffffff,0x78(%rdi)
    15b3:	ff
    15b4:	83 e0 0f             	and    $0xf,%eax
    15b7:	48 89 e5             	mov    %rsp,%rbp
    15ba:	3c 04                	cmp    $0x4,%al    -->> /* match 4(EVENT_TYPE_SWINT) first */
    15bc:	74 78                	je     1636 <fred_entry_from_user+0x96>
    15be:	77 15                	ja     15d5 <fred_entry_from_user+0x35>
    15c0:	3c 02                	cmp    $0x2,%al
    15c2:	74 53                	je     1617 <fred_entry_from_user+0x77>
    15c4:	77 65                	ja     162b <fred_entry_from_user+0x8b>
    15c6:	84 c0                	test   %al,%al
    15c8:	75 42                	jne    160c <fred_entry_from_user+0x6c>
    15ca:	e8 71 fc ff ff       	callq  1240 <fred_extint>
    15cf:	5d                   	pop    %rbp
    15d0:	e9 00 00 00 00       	jmpq   15d5 <fred_entry_from_user+0x35>
    15d5:	3c 06                	cmp    $0x6,%al
    15d7:	74 7c                	je     1655 <fred_entry_from_user+0xb5>
    15d9:	72 66                	jb     1641 <fred_entry_from_user+0xa1>
    15db:	3c 07                	cmp    $0x7,%al
    15dd:	75 2d                	jne    160c <fred_entry_from_user+0x6c>
    15df:	8b 87 a4 00 00 00    	mov    0xa4(%rdi),%eax
    15e5:	25 ff 00 00 02       	and    $0x20000ff,%eax
    15ea:	3d 01 00 00 02       	cmp    $0x2000001,%eax
    15ef:	75 6f                	jne    1660 <fred_entry_from_user+0xc0>
    15f1:	48 8b 77 50          	mov    0x50(%rdi),%rsi
    15f5:	48 c7 47 50 da ff ff 	movq   $0xffffffffffffffda,0x50(%rdi)
    ... ...
Command line to check the assembly code generated by the compiler for fred_entry_from_kernel():
$objdump -d vmlinux.o | awk '/<fred_entry_from_kernel>:/{c=65} c&&c--'
00000000000016b0 <fred_entry_from_kernel>:
    16b0:	0f b6 87 a6 00 00 00 	movzbl 0xa6(%rdi),%eax
    16b7:	48 8b 77 78          	mov    0x78(%rdi),%rsi
    16bb:	55                   	push   %rbp
    16bc:	48 c7 47 78 ff ff ff 	movq   $0xffffffffffffffff,0x78(%rdi)
    16c3:	ff
    16c4:	83 e0 0f             	and    $0xf,%eax
    16c7:	48 89 e5             	mov    %rsp,%rbp
    16ca:	3c 03                	cmp    $0x3,%al    -->> /* match 3(EVENT_TYPE_HWEXC) first */
    16cc:	74 3c                	je     170a <fred_entry_from_kernel+0x5a>
    16ce:	76 13                	jbe    16e3 <fred_entry_from_kernel+0x33>
    16d0:	3c 05                	cmp    $0x5,%al
    16d2:	74 41                	je     1715 <fred_entry_from_kernel+0x65>
    16d4:	3c 06                	cmp    $0x6,%al
    16d6:	75 27                	jne    16ff <fred_entry_from_kernel+0x4f>
    16d8:	e8 73 fe ff ff       	callq  1550 <fred_swexc.isra.3>
    16dd:	5d                   	pop    %rbp
    ... ...
Therefore, it is necessary to handle EVENT_TYPE_EXTINT and EVENT_TYPE_OTHER before the switch statement using if-else syntax to ensure the compiler generates the desired code. After applying the patch, the verification results are as follows:
$objdump -d vmlinux.o | awk '/<fred_entry_from_user>:/{c=65} c&&c--'
00000000000015a0 <fred_entry_from_user>:
    15a0:	0f b6 87 a6 00 00 00 	movzbl 0xa6(%rdi),%eax
    15a7:	48 8b 77 78          	mov    0x78(%rdi),%rsi
    15ab:	55                   	push   %rbp
    15ac:	48 c7 47 78 ff ff ff 	movq   $0xffffffffffffffff,0x78(%rdi)
    15b3:	ff
    15b4:	48 89 e5             	mov    %rsp,%rbp
    15b7:	83 e0 0f             	and    $0xf,%eax
    15ba:	74 34                	je     15f0 <fred_entry_from_user+0x50>    -->> /* match 0(EVENT_TYPE_EXTINT) first */
    15bc:	3c 07                	cmp    $0x7,%al                            -->> /* match 7(EVENT_TYPE_OTHER) second */
    15be:	74 6e                	je     162e <fred_entry_from_user+0x8e>
    15c0:	3c 04                	cmp    $0x4,%al
    15c2:	0f 84 93 00 00 00    	je     165b <fred_entry_from_user+0xbb>
    15c8:	76 13                	jbe    15dd <fred_entry_from_user+0x3d>
    15ca:	3c 05                	cmp    $0x5,%al
    15cc:	74 41                	je     160f <fred_entry_from_user+0x6f>
    15ce:	3c 06                	cmp    $0x6,%al
    15d0:	75 51                	jne    1623 <fred_entry_from_user+0x83>
    15d2:	e8 79 ff ff ff       	callq  1550 <fred_swexc.isra.3>
    15d7:	5d                   	pop    %rbp
    15d8:	e9 00 00 00 00       	jmpq   15dd <fred_entry_from_user+0x3d>
    15dd:	3c 02                	cmp    $0x2,%al
    15df:	74 1a                	je     15fb <fred_entry_from_user+0x5b>
    15e1:	3c 03                	cmp    $0x3,%al
    15e3:	75 3e                	jne    1623 <fred_entry_from_user+0x83>
    ... ...
The same desired code is generated for fred_entry_from_kernel(); its listing is not repeated here.
While the C code with if-else placed before switch() may appear ugly, it works. Additionally, using a jump table is not advisable; even if the jump table resides in the L1 cache, the cost of loading it is over 10 times the latency of a cmp instruction.
Signed-off-by: Ethan Zhao <haifeng.zhao@linux.intel.com>
---
base commit: 619f0b6fad524f08d493a98d55bac9ab8895e3a6
---
 arch/x86/entry/entry_fred.c | 25 +++++++++++++++++++------
 1 file changed, 19 insertions(+), 6 deletions(-)
diff --git a/arch/x86/entry/entry_fred.c b/arch/x86/entry/entry_fred.c
index f004a4dc74c2..591f47771ecf 100644
--- a/arch/x86/entry/entry_fred.c
+++ b/arch/x86/entry/entry_fred.c
@@ -228,9 +228,18 @@ __visible noinstr void fred_entry_from_user(struct pt_regs *regs)
 	/* Invalidate orig_ax so that syscall_get_nr() works correctly */
 	regs->orig_ax = -1;
 
-	switch (regs->fred_ss.type) {
-	case EVENT_TYPE_EXTINT:
+	if (regs->fred_ss.type == EVENT_TYPE_EXTINT)
 		return fred_extint(regs);
+	else if (regs->fred_ss.type == EVENT_TYPE_OTHER)
+		return fred_other(regs);
+
+	/*
+	 * Dispatch EVENT_TYPE_EXTINT and EVENT_TYPE_OTHER(syscall) type events
+	 * first due to their high probability and let the compiler create binary
+	 * search dispatching for the remaining events
+	 */
+
+	switch (regs->fred_ss.type) {
 	case EVENT_TYPE_NMI:
 		if (likely(regs->fred_ss.vector == X86_TRAP_NMI))
 			return fred_exc_nmi(regs);
@@ -245,8 +254,6 @@ __visible noinstr void fred_entry_from_user(struct pt_regs *regs)
 		break;
 	case EVENT_TYPE_SWEXC:
 		return fred_swexc(regs, error_code);
-	case EVENT_TYPE_OTHER:
-		return fred_other(regs);
 	default:
 		break;
 	}
@@ -260,9 +267,15 @@ __visible noinstr void fred_entry_from_kernel(struct pt_regs *regs)
 	/* Invalidate orig_ax so that syscall_get_nr() works correctly */
 	regs->orig_ax = -1;
 
-	switch (regs->fred_ss.type) {
-	case EVENT_TYPE_EXTINT:
+	if (regs->fred_ss.type == EVENT_TYPE_EXTINT)
 		return fred_extint(regs);
+
+	/*
+	 * Dispatch EVENT_TYPE_EXTINT type event first due to its high probability
+	 * and let the compiler do binary search dispatching for the other events
+	 */
+
+	switch (regs->fred_ss.type) {
 	case EVENT_TYPE_NMI:
 		if (likely(regs->fred_ss.vector == X86_TRAP_NMI))
 			return fred_exc_nmi(regs);
Hi,
Thanks for your patch.
FYI: kernel test robot notices the stable kernel rule is not satisfied.
The check is based on https://www.kernel.org/doc/html/latest/process/stable-kernel-rules.html#opti...
Rule: add the tag "Cc: stable@vger.kernel.org" in the sign-off area to have the patch automatically included in the stable tree.
Subject: [PATCH] x86/fred: Optimize the FRED entry by prioritizing high-probability event dispatching
Link: https://lore.kernel.org/stable/20250116065145.2747960-1-haifeng.zhao%40linux...
On 1/15/2025 10:51 PM, Ethan Zhao wrote:
External interrupts (EVENT_TYPE_EXTINT) and system calls (EVENT_TYPE_OTHER) occur more frequently than other events in a typical system. Prioritizing these events saves CPU cycles and optimizes the efficiency of performance- critical paths.
We deliberately hold off sending performance improvement patches at this point, but first of all please read: https://lore.kernel.org/lkml/87fs766o3t.ffs@tglx/
Thanks! Xin
On 1/16/2025 3:27 PM, Xin Li wrote:
We deliberately hold off sending performance improvement patches at this point, but first of all please read: https://lore.kernel.org/lkml/87fs766o3t.ffs@tglx/
Got it. There seems to be a jump table implementation in that linked page; I will read and discuss it -- the post is two years old, too old to reply to directly.
Thanks,
Ethan
On 1/16/2025 3:27 PM, Xin Li wrote:
We deliberately hold off sending performance improvement patches at this point, but first of all please read: https://lore.kernel.org/lkml/87fs766o3t.ffs@tglx/
Do you mean Thomas' point, "Premature optimization is the enemy of correctness. Don't do it"? Yep, I agree with it.

Compared to the asm code the compiler generates with fewer than 20 cmp instructions, placing an event handler jump table indexed by event type there is a negative optimization. That is not deliberately 'holding off'; correctness goes first, definitely right.
Thanks,
Ethan
On 1/16/2025 5:03 AM, Ethan Zhao wrote:
On 1/16/2025 3:27 PM, Xin Li wrote:
We deliberately hold off sending performance improvement patches at this point, but first of all please read: https://lore.kernel.org/lkml/87fs766o3t.ffs@tglx/
Do you mean Thomas' point, "Premature optimization is the enemy of correctness. Don't do it"? Yep, I agree with it.
Yes.
Compared to the asm code the compiler generates with fewer than 20 cmp instructions, placing an event handler jump table indexed by event type there is a negative optimization. That is not deliberately 'holding off'; correctness goes first, definitely right.
hpa suggested introducing "switch_likely" for this kind of optimization on a switch statement, which is also easier to read. I measured it with a user-space focus test, and it does improve performance a lot. But obviously there is still a lot of work to do.
Thanks! Xin
On 1/16/2025 9:48 PM, Xin Li wrote:
hpa suggested introducing "switch_likely" for this kind of optimization on a switch statement, which is also easier to read. I measured it with a user-space focus test, and it does improve performance a lot. But obviously there is still a lot of work to do.
Finding a way to instruct the compiler to pick the right hot branch while keeping the code readable for folks... yup, a lot of work.
Thanks, Ethan
On 1/16/25 16:37, Ethan Zhao wrote:
Finding a way to instruct the compiler to pick the right hot branch while keeping the code readable for folks... yup, a lot of work.
It's not that complicated, believe it or not.
/*
 * switch(v) biased for speed in the case v == l
 *
 * Note: gcc is quite sensitive to the exact form of this
 * expression.
 */
#define switch_likely(v,l) \
	switch((__typeof__(v))__builtin_expect((v),(l)))
-hpa
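For illustration, here is a minimal standalone sketch (hypothetical event values, not the kernel code) of how the macro above is meant to be used: the second argument is the single value the controlling expression is expected to have. Whether the hint actually reorders the generated dispatch depends on the compiler, as the rest of this thread shows.

/* Build with: gcc -O2 demo.c (sketch only) */
#include <stdio.h>

#define switch_likely(v,l) \
	switch((__typeof__(v))__builtin_expect((v),(l)))

static const char *dispatch(unsigned int type)
{
	/* Bias the switch toward type == 0, the "external interrupt"
	 * value in this hypothetical numbering. */
	switch_likely (type, 0) {
	case 0:
		return "extint";
	case 7:
		return "other";
	default:
		return "bad type";
	}
}

int main(void)
{
	printf("%s %s\n", dispatch(0), dispatch(7));
	return 0;
}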
On 1/17/2025 9:21 AM, H. Peter Anvin wrote:
It's not that complicated, believe it or not.
I know we could play such a trick for one branch; I never thought of the case where 2-3 branches among 7 are the expected ones. :) --- external interrupts, syscall, page fault.
Thanks, Ethan
On 1/17/2025 9:21 AM, H. Peter Anvin wrote:
I tried this macro as follows, but got something really *weird* from gcc.
+#define switch_likely(v,l) \
+	switch((__typeof__(v))__builtin_expect((v),(l)))
+
 __visible noinstr void fred_entry_from_user(struct pt_regs *regs)
 {
 	unsigned long error_code = regs->orig_ax;
+	unsigned short etype = regs->fred_ss.type & 0xf;
 
 	/* Invalidate orig_ax so that syscall_get_nr() works correctly */
 	regs->orig_ax = -1;
 
-	switch (regs->fred_ss.type) {
+	switch_likely ((etype == EVENT_TYPE_EXTINT || etype == EVENT_TYPE_OTHER), etype) {
 	case EVENT_TYPE_EXTINT:
 		return fred_extint(regs);
 	case EVENT_TYPE_NMI:
@@ -256,11 +260,12 @@ __visible noinstr void fred_entry_from_user(struct pt_regs *regs)
 __visible noinstr void fred_entry_from_kernel(struct pt_regs *regs)
 {
 	unsigned long error_code = regs->orig_ax;
+	unsigned short etype = regs->fred_ss.type & 0xf;
 
 	/* Invalidate orig_ax so that syscall_get_nr() works correctly */
 	regs->orig_ax = -1;
 
-	switch (regs->fred_ss.type) {
+	switch_likely (etype == EVENT_TYPE_EXTINT, etype) {
 	case EVENT_TYPE_EXTINT:
 		return fred_extint(regs);
 	case EVENT_TYPE_NMI:
Got the following asm code:
objdump -d vmlinux.o | awk '/<fred_entry_from_user>:/{c=65} c&&c--'

00000000000015a0 <fred_entry_from_user>:
    15a0:	0f b6 87 a6 00 00 00 	movzbl 0xa6(%rdi),%eax
    15a7:	48 8b 77 78          	mov    0x78(%rdi),%rsi
    15ab:	55                   	push   %rbp
    15ac:	48 c7 47 78 ff ff ff 	movq   $0xffffffffffffffff,0x78(%rdi)
    15b3:	ff
    15b4:	48 89 e5             	mov    %rsp,%rbp
    15b7:	66 83 e0 0f          	and    $0xf,%ax
    15bb:	74 11                	je     15ce <fred_entry_from_user+0x2e>
    15bd:	66 83 f8 07          	cmp    $0x7,%ax
    15c1:	74 0b                	je     15ce <fred_entry_from_user+0x2e>
    15c3:	e8 78 fc ff ff       	callq  1240 <fred_extint>
    15c8:	5d                   	pop    %rbp
    15c9:	e9 00 00 00 00       	jmpq   15ce <fred_entry_from_user+0x2e>
    15ce:	e8 4d fd ff ff       	callq  1320 <fred_bad_type>
    15d3:	5d                   	pop    %rbp
    15d4:	e9 00 00 00 00       	jmpq   15d9 <fred_entry_from_user+0x39>
    15d9:	0f 1f 80 00 00 00 00 	nopl   0x0(%rax)

00000000000015e0 <__pfx_fred_entry_from_kernel>:
    15e0:	90                   	nop
    15e1:	90                   	nop

00000000000015f0 <fred_entry_from_kernel>:
    15f0:	55                   	push   %rbp
    15f1:	48 8b 77 78          	mov    0x78(%rdi),%rsi
    15f5:	48 c7 47 78 ff ff ff 	movq   $0xffffffffffffffff,0x78(%rdi)
    15fc:	ff
    15fd:	48 89 e5             	mov    %rsp,%rbp
    1600:	f6 87 a6 00 00 00 0f 	testb  $0xf,0xa6(%rdi)
    1607:	75 0b                	jne    1614 <fred_entry_from_kernel+0x24>
    1609:	e8 12 fd ff ff       	callq  1320 <fred_bad_type>
    160e:	5d                   	pop    %rbp
    160f:	e9 00 00 00 00       	jmpq   1614 <fred_entry_from_kernel+0x24>
    1614:	e8 27 fc ff ff       	callq  1240 <fred_extint>
    1619:	5d                   	pop    %rbp
    161a:	e9 00 00 00 00       	jmpq   161f <fred_entry_from_kernel+0x2f>
    161f:	90                   	nop

0000000000001620 <__pfx___fred_entry_from_kvm>:
    1620:	90                   	nop
    1621:	90                   	nop
Even the fred_entry_from_kernel() asm code doesn't look right.

gcc version 8.5.0 20210514 (Red Hat 8.5.0-10) (GCC)

Did I screw up something?

Thanks,
Ethan
On 1/16/2025 8:19 PM, Ethan Zhao wrote:
-	switch (regs->fred_ss.type) {
+	switch_likely ((etype == EVENT_TYPE_EXTINT || etype == EVENT_TYPE_OTHER), etype) {
Just swap the 2 arguments, and it should be:

+	switch_likely (etype, EVENT_TYPE_OTHER) {
Probably also check __builtin_expect on https://gcc.gnu.org/onlinedocs/gcc/Other-Builtins.html.
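For reference, the documented form is __builtin_expect(exp, c): it returns the value of exp and tells the compiler that exp is expected to equal the constant c. A minimal standalone sketch (hypothetical values):

#include <stdio.h>

int main(void)
{
	int x = 7;

	/* Hint: the expression (x == 7) is expected to evaluate to 1.
	 * The return value of __builtin_expect() is the value of the
	 * expression itself, so the if() behaves exactly as without
	 * the hint; only the generated code layout may change. */
	if (__builtin_expect(x == 7, 1))
		printf("likely path\n");
	else
		printf("unlikely path\n");
	return 0;
}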
On 1/17/2025 12:43 PM, Xin Li wrote:
Just swap the 2 arguments, and it should be:

+	switch_likely (etype, EVENT_TYPE_OTHER) {
After swapping the parameters as follows:

+#define switch_likely(v,l) \
+	switch((__typeof__(v))__builtin_expect((v),(l)))
+
 __visible noinstr void fred_entry_from_user(struct pt_regs *regs)
 {
 	unsigned long error_code = regs->orig_ax;
+	unsigned short etype = regs->fred_ss.type & 0xf;
 
 	/* Invalidate orig_ax so that syscall_get_nr() works correctly */
 	regs->orig_ax = -1;
 
-	switch (regs->fred_ss.type) {
+	switch_likely (etype, (EVENT_TYPE_EXTINT == etype || EVENT_TYPE_OTHER == etype)) {
 	case EVENT_TYPE_EXTINT:
 		return fred_extint(regs);
 	case EVENT_TYPE_NMI:
@@ -256,11 +260,12 @@ __visible noinstr void fred_entry_from_user(struct pt_regs *regs)
 __visible noinstr void fred_entry_from_kernel(struct pt_regs *regs)
 {
 	unsigned long error_code = regs->orig_ax;
+	unsigned short etype = regs->fred_ss.type & 0xf;
 
 	/* Invalidate orig_ax so that syscall_get_nr() works correctly */
 	regs->orig_ax = -1;
 
-	switch (regs->fred_ss.type) {
+	switch_likely (etype, EVENT_TYPE_EXTINT == etype) {
 	case EVENT_TYPE_EXTINT:
 		return fred_extint(regs);
 	case EVENT_TYPE_NMI:
Good news -- gcc works normally and generates correct asm code. Bad news -- it falls back to the binary search method for matching and ignores our hints.
$objdump -d vmlinux.o | awk '/<fred_entry_from_kernel>:/{c=65} c&&c--'

00000000000016b0 <fred_entry_from_kernel>:
    16b0:	0f b6 87 a6 00 00 00 	movzbl 0xa6(%rdi),%eax
    16b7:	48 8b 77 78          	mov    0x78(%rdi),%rsi
    16bb:	55                   	push   %rbp
    16bc:	48 c7 47 78 ff ff ff 	movq   $0xffffffffffffffff,0x78(%rdi)
    16c3:	ff
    16c4:	83 e0 0f             	and    $0xf,%eax
    16c7:	48 89 e5             	mov    %rsp,%rbp
    16ca:	3c 03                	cmp    $0x3,%al
    16cc:	74 3c                	je     170a <fred_entry_from_kernel+0x5a>
    16ce:	76 13                	jbe    16e3 <fred_entry_from_kernel+0x33>
    16d0:	3c 05                	cmp    $0x5,%al
    16d2:	74 41                	je     1715 <fred_entry_from_kernel+0x65>
    16d4:	3c 06                	cmp    $0x6,%al
    16d6:	75 27                	jne    16ff <fred_entry_from_kernel+0x4f>
    16d8:	e8 73 fe ff ff       	callq  1550 <fred_swexc>
    16dd:	5d                   	pop    %rbp
    16de:	e9 00 00 00 00       	jmpq   16e3 <fred_entry_from_kernel+0x33>
    16e3:	84 c0                	test   %al,%al
    16e5:	74 42                	je     1729 <fred_entry_from_kernel+0x79>
    16e7:	3c 02                	cmp    $0x2,%al
    16e9:	75 14                	jne    16ff <fred_entry_from_kernel+0x4f>
    16eb:	80 bf a4 00 00 00 02 	cmpb   $0x2,0xa4(%rdi)
    16f2:	75 0b                	jne    16ff <fred_entry_from_kernel+0x4f>
    16f4:	e8 00 00 00 00       	callq  16f9 <fred_entry_from_kernel+0x49>
    16f9:	5d                   	pop    %rbp
    16fa:	e9 00 00 00 00       	jmpq   16ff <fred_entry_from_kernel+0x4f>
    16ff:	e8 1c fc ff ff       	callq  1320 <fred_bad_type>
    1704:	5d                   	pop    %rbp
    1705:	e9 00 00 00 00       	jmpq   170a <fred_entry_from_kernel+0x5a>
    170a:	e8 21 fd ff ff       	callq  1430 <fred_hwexc>
    170f:	5d                   	pop    %rbp
    1710:	e9 00 00 00 00       	jmpq   1715 <fred_entry_from_kernel+0x65>
    1715:	80 bf a4 00 00 00 01 	cmpb   $0x1,0xa4(%rdi)
    171c:	75 e1                	jne    16ff <fred_entry_from_kernel+0x4f>
    171e:	e8 00 00 00 00       	callq  1723 <fred_entry_from_kernel+0x73>
    1723:	5d                   	pop    %rbp
    1724:	e9 00 00 00 00       	jmpq   1729 <fred_entry_from_kernel+0x79>
    1729:	e8 12 fb ff ff       	callq  1240 <fred_extint>
    172e:	5d                   	pop    %rbp
    172f:	e9 00 00 00 00       	jmpq   1734 <fred_entry_from_kernel+0x84>
    1734:	66 66 2e 0f 1f 84 00 	data16 nopw %cs:0x0(%rax,%rax,1)
    173b:	00 00 00 00
    173f:	90                   	nop
$ objdump -d vmlinux.o | awk '/<fred_entry_from_user>:/{c=65} c&&c--'

00000000000015a0 <fred_entry_from_user>:
    15a0:	0f b6 87 a6 00 00 00 	movzbl 0xa6(%rdi),%eax
    15a7:	48 8b 77 78          	mov    0x78(%rdi),%rsi
    15ab:	55                   	push   %rbp
    15ac:	48 c7 47 78 ff ff ff 	movq   $0xffffffffffffffff,0x78(%rdi)
    15b3:	ff
    15b4:	83 e0 0f             	and    $0xf,%eax
    15b7:	48 89 e5             	mov    %rsp,%rbp
    15ba:	3c 04                	cmp    $0x4,%al
    15bc:	74 78                	je     1636 <fred_entry_from_user+0x96>
    15be:	77 15                	ja     15d5 <fred_entry_from_user+0x35>
    15c0:	3c 02                	cmp    $0x2,%al
    15c2:	74 53                	je     1617 <fred_entry_from_user+0x77>
    15c4:	77 65                	ja     162b <fred_entry_from_user+0x8b>
    15c6:	84 c0                	test   %al,%al
    15c8:	75 42                	jne    160c <fred_entry_from_user+0x6c>
    15ca:	e8 71 fc ff ff       	callq  1240 <fred_extint>
    15cf:	5d                   	pop    %rbp
    15d0:	e9 00 00 00 00       	jmpq   15d5 <fred_entry_from_user+0x35>
    15d5:	3c 06                	cmp    $0x6,%al
    15d7:	74 7c                	je     1655 <fred_entry_from_user+0xb5>
    15d9:	72 66                	jb     1641 <fred_entry_from_user+0xa1>
    15db:	3c 07                	cmp    $0x7,%al
    15dd:	75 2d                	jne    160c <fred_entry_from_user+0x6c>
    15df:	8b 87 a4 00 00 00    	mov    0xa4(%rdi),%eax
    15e5:	25 ff 00 00 02       	and    $0x20000ff,%eax
    15ea:	3d 01 00 00 02       	cmp    $0x2000001,%eax
    15ef:	75 6f                	jne    1660 <fred_entry_from_user+0xc0>
    15f1:	48 8b 77 50          	mov    0x50(%rdi),%rsi
    15f5:	48 c7 47 50 da ff ff 	movq   $0xffffffffffffffda,0x50(%rdi)
    15fc:	ff
    15fd:	48 89 77 78          	mov    %rsi,0x78(%rdi)
    1601:	e8 00 00 00 00       	callq  1606 <fred_entry_from_user+0x66>
    1606:	5d                   	pop    %rbp
    1607:	e9 00 00 00 00       	jmpq   160c <fred_entry_from_user+0x6c>
    160c:	e8 0f fd ff ff       	callq  1320 <fred_bad_type>
    1611:	5d                   	pop    %rbp
    1612:	e9 00 00 00 00       	jmpq   1617 <fred_entry_from_user+0x77>
    1617:	80 bf a4 00 00 00 02 	cmpb   $0x2,0xa4(%rdi)
    161e:	75 ec                	jne    160c <fred_entry_from_user+0x6c>
    1620:	e8 00 00 00 00       	callq  1625 <fred_entry_from_user+0x85>
    1625:	5d                   	pop    %rbp
    1626:	e9 00 00 00 00       	jmpq   162b <fred_entry_from_user+0x8b>
    162b:	e8 00 fe ff ff       	callq  1430 <fred_hwexc>
    1630:	5d                   	pop    %rbp
    1631:	e9 00 00 00 00       	jmpq   1636 <fred_entry_from_user+0x96>
    1636:	e8 85 fc ff ff       	callq  12c0 <fred_intx>
    163b:	5d                   	pop    %rbp
    163c:	e9 00 00 00 00       	jmpq   1641 <fred_entry_from_user+0xa1>
    1641:	80 bf a4 00 00 00 01 	cmpb   $0x1,0xa4(%rdi)
    1648:	75 c2                	jne    160c <fred_entry_from_user+0x6c>
    164a:	e8 00 00 00 00       	callq  164f <fred_entry_from_user+0xaf>
    164f:	5d                   	pop    %rbp
    1650:	e9 00 00 00 00       	jmpq   1655 <fred_entry_from_user+0xb5>
    1655:	e8 f6 fe ff ff       	callq  1550 <fred_swexc>
    165a:	5d                   	pop    %rbp
    165b:	e9 00 00 00 00       	jmpq   1660 <fred_entry_from_user+0xc0>
    1660:	83 f8 02             	cmp    $0x2,%eax
    1663:	75 24                	jne    1689 <fred_entry_from_user+0xe9>
    1665:	80 3d 00 00 00 00 00 	cmpb   $0x0,0x0(%rip)        # 166c <fred_entry_from_user+0xcc>
    166c:	74 1b                	je     1689 <fred_entry_from_user+0xe9>
    166e:	48 8b 47 50          	mov    0x50(%rdi),%rax
    1672:	48 c7 47 50 da ff ff 	movq   $0xffffffffffffffda,0x50(%rdi)
    1679:	ff
    167a:	48 89 47 78          	mov    %rax,0x78(%rdi)
In short, it seems that __builtin_expect does not work with switch(), at least for gcc version 8.5.0 20210514 (RHEL).
Thanks, Ethan
On 1/16/2025 9:18 PM, Ethan Zhao wrote:
-	switch (regs->fred_ss.type) {
+	switch_likely (etype, (EVENT_TYPE_EXTINT == etype || EVENT_TYPE_OTHER == etype)) {
This is not what I suggested; the (l) argument should be only one constant. __builtin_expect() doesn't allow 2 different constants.
On 1/17/2025 1:54 PM, Xin Li wrote:
This is not what I suggested; the (l) argument should be only one constant. __builtin_expect() doesn't allow 2 different constants.
That is why it couldn't be used in switch(var), while it works with if (cond) { } else if (cond) { }. switch(var) needs a variable returned by __builtin_expect(), but it can only return a constant.
Thanks, Ethan
On 1/16/25 21:54, Xin Li wrote:
This is not what I suggested; the (l) argument should be only one constant. __builtin_expect() doesn't allow 2 different constants.
The (l) argument is not a boolean expression! It is the *expected value* of (v).
-hpa
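So a corrected call site would pass one expected type value, e.g. biasing toward external interrupts (a sketch against the patch context above, not a tested change):

	/* (l) is the expected *value* of etype, a single constant,
	 * not a condition such as (etype == EVENT_TYPE_EXTINT). */
	switch_likely (etype, EVENT_TYPE_EXTINT) {
	case EVENT_TYPE_EXTINT:
		return fred_extint(regs);
	default:
		break;
	}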
On 1/17/25 08:17, H. Peter Anvin wrote:
-	switch (regs->fred_ss.type) {
+	switch_likely (etype, (EVENT_TYPE_EXTINT == etype || EVENT_TYPE_OTHER == etype)) {
Also, EVENT_TYPE_EXTINT == etype is not Linux style.
More fundamentally, though, I have to question this unless based on profiling, because it isn't at all clear that EXTINT is more important than FAULT (page faults, to be specific.)
To optimize syscalls, you want to do a one-shot comparison of the entire syscall64 signature (event type, 64-bit flag, and vector) as a mask and compare. For that you want to make sure the compiler loads the high 32 bits into a register so that your mask and compare values can be immediates. In other words, you don't actually want it to be part of the switch at all, and you want *other* EVENT_TYPE_OTHER to fall back to the switch with regular (low) priority.
-hpa
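A rough user-space sketch of that one-shot mask-and-compare idea. The bit positions are taken from the objdump output earlier in the thread (the 32-bit load at offset 0xa4, then "and $0x20000ff / cmp $0x2000001"); the helper name and constant names are hypothetical, and a full version per the description above would also fold in the event type bits.

#include <stdint.h>
#include <stdbool.h>

#define FRED_VECTOR_MASK 0x000000ffu /* event vector in bits 0-7 */
#define FRED_LM_BIT      (1u << 25)  /* 64-bit mode flag */
#define SYSCALL_VECTOR   0x1u        /* vector 1 == syscall */

/* One-shot syscall64 signature test on the high 32 bits of the FRED
 * ss field: mask the vector and the 64-bit flag, then compare both
 * against one immediate, mirroring the generated
 * "and $0x20000ff ; cmp $0x2000001" sequence. */
static bool is_syscall64(uint32_t ss_high)
{
	return (ss_high & (FRED_VECTOR_MASK | FRED_LM_BIT)) ==
	       (SYSCALL_VECTOR | FRED_LM_BIT);
}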
On 1/18/2025 12:23 AM, H. Peter Anvin wrote:
More fundamentally, though, I have to question this unless based on profiling, because it isn't at all clear that EXTINT is more important than FAULT (page faults, to be specific.)
Perhaps the conclusion about which is more important/higher probability among EXTINT, SYSCALL, and PF only applies to a specific kind of workload; there is no one-size-fits-all conclusion to dig for.

But for a normal system, it is a certainty that events like EXTINT, SYSCALL, and PF happen with higher probability than others; saving some cycles on their paths isn't hard to understand. It is just taking a shortcut at the event type dispatching level, with no other changes.
To optimize syscalls, you want to do a one-shot comparison of the entire syscall64 signature (event type, 64-bit flag, and vector) as a mask and compare. For
For the whole event dispatching path for syscalls, yep.
that you want to make sure the compiler loads the high 32 bits into a register so that your mask and compare values can be immediates. In other words, you don't actually want it to be part of the switch at all, and you want *other* EVENT_TYPE_OTHER to fall back to the switch with regular (low) priority.
switch() seems not too bad, at least compared to jump table.
Thanks, Ethan
On 1/18/2025 12:24 AM, H. Peter Anvin wrote:
In short, it seems that __builtin_expect does not work with switch(), at least for gcc version 8.5.0 20210514 (RHEL).
For forward-facing optimizations, please don't use an ancient version of gcc as the benchmark.
Even if the latest gcc had a built-in feature that worked for this case, it is highly unlikely that Linus would adopt such a trick into the upstream kernel (it would only work with a specific compiler version); the same goes for downstream vendors/LTS kernels. Thus, an optimization that works only with the latest gcc would construct an impractical, benchmark-only performance barrier.

As to __builtin_expect(), my understanding is that it was designed to work only for if (cond) { } else if (cond) { }. The value of the condition expression returned by __builtin_expect() is a bool constant, while switch(variable) expects a variable, so it is normal that gcc doesn't make it work with switch().
If I got something wrong, please let me know.
Thanks, Ethan
On January 17, 2025 7:29:36 PM PST, Ethan Zhao etzhao@outlook.com wrote:
Even if the latest gcc had a built-in feature that worked for this case, it is highly unlikely that Linus would adopt such a trick into the upstream kernel (it would only work with a specific compiler version); the same goes for downstream vendors/LTS kernels. Thus, an optimization that works only with the latest gcc would construct an impractical, benchmark-only performance barrier.
That is not true at all; we do that pretty much *all the time*. The reason is that the new compiler versions will become mainstream on a much shorter time scale than the lifespan of kernel code.
We do care about not making the code for the current mainstream compilers *worse* in the process, and we care about not *breaking* the backrev compilers.
On 1/18/2025 11:41 AM, H. Peter Anvin wrote:
<snip>
That is not true at all; we do that pretty much *all the time*. The reason is that the new compiler versions will become mainstream on a much shorter time scale than the lifespan of kernel code.
Yup, time walks forward... But backporting jobs like this one, to get such things in place for eager, no-waiting customers, are very painful.
Thanks, Ethan
We do care about not making the code for the current mainstream compilers *worse* in the process, and we care about not *breaking* the backrev compilers.
On January 17, 2025 8:06:27 PM PST, Ethan Zhao etzhao@outlook.com wrote:
<snip>
As I said, it is OK for an ancient compiler (gcc 8 is the oldest compiler still supported) to *not have an improvement* as long as it doesn't make it worse.
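One hypothetical sketch of that principle, where newer compilers get a stronger hint and older ones silently keep the plain one; expect_hot() is an illustrative name, not an existing kernel macro:

	#ifdef __has_builtin
	# if __has_builtin(__builtin_expect_with_probability)	/* gcc >= 9 */
	#  define expect_hot(x)	__builtin_expect_with_probability(!!(x), 1, 0.90)
	# endif
	#endif
	#ifndef expect_hot
	# define expect_hot(x)	__builtin_expect(!!(x), 1)	/* backrev fallback */
	#endif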
On 1/18/2025 12:09 PM, H. Peter Anvin wrote:
<snip>
As I said, it is OK for an ancient compiler (gcc 8 is the oldest compiler still supported) to *not have an improvement* as long as it doesn't make it worse.
There are always surprises: some customers want new features but won't throw away much older, pre-ancient toolchains (gcc etc.) and kernels, and they really do have justification (business systems). Refuse to provide service? Hmmm :( They want both "no change" (the old system) and "change" (new features and performance improvements); that is how some folks live forward by looking and working backwards. So I am a little sensitive about "the last" supported compiler :)
Thanks, Ethan
On January 17, 2025 8:06:27 PM PST, Ethan Zhao etzhao@outlook.com wrote:
<snip>
Keep in mind we're not even talking about feature enablement here, but a microoptimization. The latter we really need to be syntactically lightweight and abstracted enough to deal with, for example, compiler changes.
On 1/18/2025 12:14 PM, H. Peter Anvin wrote:
<snip>
Keep in mind we're not even talking about feature enablement here, but a microoptimization. The latter we really need to be syntactically lightweight and abstracted enough to deal with, for example, compiler changes.
Fully agree. We are looking at a micro-optimization here, not a feature-level or system-level optimization, so it needs to be abstracted into a lightweight method whose pros and cons are easy to verify and reason about.
Thanks, Ethan
On 16.01.25 г. 8:51 ч., Ethan Zhao wrote:
External interrupts (EVENT_TYPE_EXTINT) and system calls (EVENT_TYPE_OTHER) occur more frequently than other events in a typical system. Prioritizing these events saves CPU cycles and optimizes the efficiency of performance- critical paths.
<snip>
Can you also include some performance numbers?
Signed-off-by: Ethan Zhao haifeng.zhao@linux.intel.com
base commit: 619f0b6fad524f08d493a98d55bac9ab8895e3a6
arch/x86/entry/entry_fred.c | 25 +++++++++++++++++++------
1 file changed, 19 insertions(+), 6 deletions(-)

diff --git a/arch/x86/entry/entry_fred.c b/arch/x86/entry/entry_fred.c
index f004a4dc74c2..591f47771ecf 100644
--- a/arch/x86/entry/entry_fred.c
+++ b/arch/x86/entry/entry_fred.c
@@ -228,9 +228,18 @@ __visible noinstr void fred_entry_from_user(struct pt_regs *regs)
 	/* Invalidate orig_ax so that syscall_get_nr() works correctly */
 	regs->orig_ax = -1;
 
-	switch (regs->fred_ss.type) {
-	case EVENT_TYPE_EXTINT:
+	if (regs->fred_ss.type == EVENT_TYPE_EXTINT)
 		return fred_extint(regs);
+	else if (regs->fred_ss.type == EVENT_TYPE_OTHER)
+		return fred_other(regs);
+
+	/*
+	 * Dispatch EVENT_TYPE_EXTINT and EVENT_TYPE_OTHER(syscall) type events
+	 * first due to their high probability and let the compiler create binary search
+	 * dispatching for the remaining events
+	 */
nit: At least to me it makes sense to have the comment above the 'if', so that the flow is linear (a sketch of that arrangement follows the quoted diff).
+	switch (regs->fred_ss.type) {
 	case EVENT_TYPE_NMI:
 		if (likely(regs->fred_ss.vector == X86_TRAP_NMI))
 			return fred_exc_nmi(regs);
@@ -245,8 +254,6 @@ __visible noinstr void fred_entry_from_user(struct pt_regs *regs)
 		break;
 	case EVENT_TYPE_SWEXC:
 		return fred_swexc(regs, error_code);
-	case EVENT_TYPE_OTHER:
-		return fred_other(regs);
 	default:
 		break;
 	}
@@ -260,9 +267,15 @@ __visible noinstr void fred_entry_from_kernel(struct pt_regs *regs)
 	/* Invalidate orig_ax so that syscall_get_nr() works correctly */
 	regs->orig_ax = -1;
 
-	switch (regs->fred_ss.type) {
-	case EVENT_TYPE_EXTINT:
+	if (regs->fred_ss.type == EVENT_TYPE_EXTINT)
 		return fred_extint(regs);
+
+	/*
+	 * Dispatch EVENT_TYPE_EXTINT type event first due to its high probability
+	 * and let the compiler do binary search dispatching for the other events
+	 */
+	switch (regs->fred_ss.type) {
 	case EVENT_TYPE_NMI:
 		if (likely(regs->fred_ss.vector == X86_TRAP_NMI))
 			return fred_exc_nmi(regs);
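A rough sketch of the arrangement the nit suggests, with the comment ahead of the fast-path checks so the flow reads top-down (illustrative only, not the committed code):

	/*
	 * Dispatch EVENT_TYPE_EXTINT and EVENT_TYPE_OTHER (syscall) events
	 * first due to their high probability, and let the compiler generate
	 * binary-search dispatching for the remaining, rarer events.
	 */
	if (regs->fred_ss.type == EVENT_TYPE_EXTINT)
		return fred_extint(regs);
	else if (regs->fred_ss.type == EVENT_TYPE_OTHER)
		return fred_other(regs);

	switch (regs->fred_ss.type) {
	/* ... remaining cases unchanged ... */
	default:
		break;
	}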
On 1/16/2025 7:08 AM, Nikolay Borisov wrote:
<snip>
Can you also include some performance numbers?
Not a good time yet :)
On 1/16/2025 11:08 PM, Nikolay Borisov wrote:
<snip>
Can you also include some performance numbers?
Of course, will do when the timing is good :)
Thanks, Ethan