From: Russell King russell.king@oracle.com
[ Upstream commit b89ddf4cca43f1269093942cf5c4e457fd45c335 ]
Commit 91fc957c9b1d ("arm64/bpf: don't allocate BPF JIT programs in module memory") restricts BPF JIT program allocation to a 128MB region to ensure BPF programs are still in branching range of each other. However this restriction should not apply to the aarch64 JIT, since BPF_JMP | BPF_CALL are implemented as a 64-bit move into a register and then a BLR instruction - which has the effect of being able to call anything without proximity limitation.
The practical reason to relax this restriction on JIT memory is that 128MB of JIT memory can be quickly exhausted, especially where PAGE_SIZE is 64KB - one page is needed per program. In cases where seccomp filters are applied to multiple VMs on VM launch - such filters are classic BPF but converted to BPF - this can severely limit the number of VMs that can be launched. In a world where we support BPF JIT always on, turning off the JIT isn't always an option either.
Fixes: 91fc957c9b1d ("arm64/bpf: don't allocate BPF JIT programs in module memory") Suggested-by: Ard Biesheuvel ard.biesheuvel@linaro.org Signed-off-by: Russell King russell.king@oracle.com Signed-off-by: Daniel Borkmann daniel@iogearbox.net Tested-by: Alan Maguire alan.maguire@oracle.com Link: https://lore.kernel.org/bpf/1636131046-5982-2-git-send-email-alan.maguire@or... [Replace usage of in_bpf_jit() with is_bpf_text_address()] Signed-off-by: Puranjay Mohan pjy@amazon.com --- arch/arm64/include/asm/extable.h | 9 --------- arch/arm64/include/asm/memory.h | 5 +---- arch/arm64/kernel/traps.c | 2 +- arch/arm64/mm/extable.c | 3 ++- arch/arm64/mm/ptdump.c | 2 -- arch/arm64/net/bpf_jit_comp.c | 7 ++----- 6 files changed, 6 insertions(+), 22 deletions(-)
diff --git a/arch/arm64/include/asm/extable.h b/arch/arm64/include/asm/extable.h index b15eb4a3e6b20..840a35ed92ec8 100644 --- a/arch/arm64/include/asm/extable.h +++ b/arch/arm64/include/asm/extable.h @@ -22,15 +22,6 @@ struct exception_table_entry
#define ARCH_HAS_RELATIVE_EXTABLE
-static inline bool in_bpf_jit(struct pt_regs *regs) -{ - if (!IS_ENABLED(CONFIG_BPF_JIT)) - return false; - - return regs->pc >= BPF_JIT_REGION_START && - regs->pc < BPF_JIT_REGION_END; -} - #ifdef CONFIG_BPF_JIT int arm64_bpf_fixup_exception(const struct exception_table_entry *ex, struct pt_regs *regs); diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h index 505bdd75b5411..eef03120c0daf 100644 --- a/arch/arm64/include/asm/memory.h +++ b/arch/arm64/include/asm/memory.h @@ -44,11 +44,8 @@ #define _PAGE_OFFSET(va) (-(UL(1) << (va))) #define PAGE_OFFSET (_PAGE_OFFSET(VA_BITS)) #define KIMAGE_VADDR (MODULES_END) -#define BPF_JIT_REGION_START (KASAN_SHADOW_END) -#define BPF_JIT_REGION_SIZE (SZ_128M) -#define BPF_JIT_REGION_END (BPF_JIT_REGION_START + BPF_JIT_REGION_SIZE) #define MODULES_END (MODULES_VADDR + MODULES_VSIZE) -#define MODULES_VADDR (BPF_JIT_REGION_END) +#define MODULES_VADDR (_PAGE_END(VA_BITS_MIN)) #define MODULES_VSIZE (SZ_128M) #define VMEMMAP_START (-VMEMMAP_SIZE - SZ_2M) #define VMEMMAP_END (VMEMMAP_START + VMEMMAP_SIZE) diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c index 563d07d3904e4..e9cc15414133f 100644 --- a/arch/arm64/kernel/traps.c +++ b/arch/arm64/kernel/traps.c @@ -913,7 +913,7 @@ static struct break_hook bug_break_hook = { static int reserved_fault_handler(struct pt_regs *regs, unsigned int esr) { pr_err("%s generated an invalid instruction at %pS!\n", - in_bpf_jit(regs) ? "BPF JIT" : "Kernel text patching", + "Kernel text patching", (void *)instruction_pointer(regs));
/* We cannot handle this */ diff --git a/arch/arm64/mm/extable.c b/arch/arm64/mm/extable.c index aa0060178343a..9a8147b6878b9 100644 --- a/arch/arm64/mm/extable.c +++ b/arch/arm64/mm/extable.c @@ -5,6 +5,7 @@
#include <linux/extable.h> #include <linux/uaccess.h> +#include <linux/filter.h>
int fixup_exception(struct pt_regs *regs) { @@ -14,7 +15,7 @@ int fixup_exception(struct pt_regs *regs) if (!fixup) return 0;
- if (in_bpf_jit(regs)) + if (is_bpf_text_address(regs->pc)) return arm64_bpf_fixup_exception(fixup, regs);
regs->pc = (unsigned long)&fixup->fixup + fixup->fixup; diff --git a/arch/arm64/mm/ptdump.c b/arch/arm64/mm/ptdump.c index 807dc634bbd24..ba6d1d89f9b2a 100644 --- a/arch/arm64/mm/ptdump.c +++ b/arch/arm64/mm/ptdump.c @@ -41,8 +41,6 @@ static struct addr_marker address_markers[] = { { 0 /* KASAN_SHADOW_START */, "Kasan shadow start" }, { KASAN_SHADOW_END, "Kasan shadow end" }, #endif - { BPF_JIT_REGION_START, "BPF start" }, - { BPF_JIT_REGION_END, "BPF end" }, { MODULES_VADDR, "Modules start" }, { MODULES_END, "Modules end" }, { VMALLOC_START, "vmalloc() area" }, diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c index 18627cbd6da4e..2a47165abbe5e 100644 --- a/arch/arm64/net/bpf_jit_comp.c +++ b/arch/arm64/net/bpf_jit_comp.c @@ -1145,15 +1145,12 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
u64 bpf_jit_alloc_exec_limit(void) { - return BPF_JIT_REGION_SIZE; + return VMALLOC_END - VMALLOC_START; }
void *bpf_jit_alloc_exec(unsigned long size) { - return __vmalloc_node_range(size, PAGE_SIZE, BPF_JIT_REGION_START, - BPF_JIT_REGION_END, GFP_KERNEL, - PAGE_KERNEL, 0, NUMA_NO_NODE, - __builtin_return_address(0)); + return vmalloc(size); }
void bpf_jit_free_exec(void *addr)
+CC Greg
From: Russell King russell.king@oracle.com
[ Upstream commit b89ddf4cca43f1269093942cf5c4e457fd45c335 ]
Commit 91fc957c9b1d ("arm64/bpf: don't allocate BPF JIT programs in module memory") restricts BPF JIT program allocation to a 128MB region to ensure BPF programs are still in branching range of each other. However this restriction should not apply to the aarch64 JIT, since BPF_JMP | BPF_CALL are implemented as a 64-bit move into a register and then a BLR instruction - which has the effect of being able to call anything without proximity limitation.
The practical reason to relax this restriction on JIT memory is that 128MB of JIT memory can be quickly exhausted, especially where PAGE_SIZE is 64KB - one page is needed per program. In cases where seccomp filters are applied to multiple VMs on VM launch - such filters are classic BPF but converted to BPF - this can severely limit the number of VMs that can be launched. In a world where we support BPF JIT always on, turning off the JIT isn't always an option either.
Fixes: 91fc957c9b1d ("arm64/bpf: don't allocate BPF JIT programs in module memory") Suggested-by: Ard Biesheuvel ard.biesheuvel@linaro.org Signed-off-by: Russell King russell.king@oracle.com Signed-off-by: Daniel Borkmann daniel@iogearbox.net Tested-by: Alan Maguire alan.maguire@oracle.com Link: https://lore.kernel.org/bpf/1636131046-5982-2-git-send-email-alan.maguire@or... [Replace usage of in_bpf_jit() with is_bpf_text_address()] Signed-off-by: Puranjay Mohan pjy@amazon.com
arch/arm64/include/asm/extable.h | 9 --------- arch/arm64/include/asm/memory.h | 5 +---- arch/arm64/kernel/traps.c | 2 +- arch/arm64/mm/extable.c | 3 ++- arch/arm64/mm/ptdump.c | 2 -- arch/arm64/net/bpf_jit_comp.c | 7 ++----- 6 files changed, 6 insertions(+), 22 deletions(-)
diff --git a/arch/arm64/include/asm/extable.h b/arch/arm64/include/asm/extable.h index b15eb4a3e6b20..840a35ed92ec8 100644 --- a/arch/arm64/include/asm/extable.h +++ b/arch/arm64/include/asm/extable.h @@ -22,15 +22,6 @@ struct exception_table_entry #define ARCH_HAS_RELATIVE_EXTABLE -static inline bool in_bpf_jit(struct pt_regs *regs) -{
- if (!IS_ENABLED(CONFIG_BPF_JIT))
return false;
- return regs->pc >= BPF_JIT_REGION_START &&
regs->pc < BPF_JIT_REGION_END;
-}
#ifdef CONFIG_BPF_JIT int arm64_bpf_fixup_exception(const struct exception_table_entry *ex, struct pt_regs *regs); diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h index 505bdd75b5411..eef03120c0daf 100644 --- a/arch/arm64/include/asm/memory.h +++ b/arch/arm64/include/asm/memory.h @@ -44,11 +44,8 @@ #define _PAGE_OFFSET(va) (-(UL(1) << (va))) #define PAGE_OFFSET (_PAGE_OFFSET(VA_BITS)) #define KIMAGE_VADDR (MODULES_END) -#define BPF_JIT_REGION_START (KASAN_SHADOW_END) -#define BPF_JIT_REGION_SIZE (SZ_128M) -#define BPF_JIT_REGION_END (BPF_JIT_REGION_START + BPF_JIT_REGION_SIZE) #define MODULES_END (MODULES_VADDR + MODULES_VSIZE) -#define MODULES_VADDR (BPF_JIT_REGION_END) +#define MODULES_VADDR (_PAGE_END(VA_BITS_MIN)) #define MODULES_VSIZE (SZ_128M) #define VMEMMAP_START (-VMEMMAP_SIZE - SZ_2M) #define VMEMMAP_END (VMEMMAP_START + VMEMMAP_SIZE) diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c index 563d07d3904e4..e9cc15414133f 100644 --- a/arch/arm64/kernel/traps.c +++ b/arch/arm64/kernel/traps.c @@ -913,7 +913,7 @@ static struct break_hook bug_break_hook = { static int reserved_fault_handler(struct pt_regs *regs, unsigned int esr) { pr_err("%s generated an invalid instruction at %pS!\n",
in_bpf_jit(regs) ? "BPF JIT" : "Kernel text patching",
(void *)instruction_pointer(regs));"Kernel text patching",
/* We cannot handle this */ diff --git a/arch/arm64/mm/extable.c b/arch/arm64/mm/extable.c index aa0060178343a..9a8147b6878b9 100644 --- a/arch/arm64/mm/extable.c +++ b/arch/arm64/mm/extable.c @@ -5,6 +5,7 @@ #include <linux/extable.h> #include <linux/uaccess.h> +#include <linux/filter.h> int fixup_exception(struct pt_regs *regs) { @@ -14,7 +15,7 @@ int fixup_exception(struct pt_regs *regs) if (!fixup) return 0;
- if (in_bpf_jit(regs))
- if (is_bpf_text_address(regs->pc)) return arm64_bpf_fixup_exception(fixup, regs);
regs->pc = (unsigned long)&fixup->fixup + fixup->fixup; diff --git a/arch/arm64/mm/ptdump.c b/arch/arm64/mm/ptdump.c index 807dc634bbd24..ba6d1d89f9b2a 100644 --- a/arch/arm64/mm/ptdump.c +++ b/arch/arm64/mm/ptdump.c @@ -41,8 +41,6 @@ static struct addr_marker address_markers[] = { { 0 /* KASAN_SHADOW_START */, "Kasan shadow start" }, { KASAN_SHADOW_END, "Kasan shadow end" }, #endif
- { BPF_JIT_REGION_START, "BPF start" },
- { BPF_JIT_REGION_END, "BPF end" }, { MODULES_VADDR, "Modules start" }, { MODULES_END, "Modules end" }, { VMALLOC_START, "vmalloc() area" },
diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c index 18627cbd6da4e..2a47165abbe5e 100644 --- a/arch/arm64/net/bpf_jit_comp.c +++ b/arch/arm64/net/bpf_jit_comp.c @@ -1145,15 +1145,12 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) u64 bpf_jit_alloc_exec_limit(void) {
- return BPF_JIT_REGION_SIZE;
- return VMALLOC_END - VMALLOC_START;
} void *bpf_jit_alloc_exec(unsigned long size) {
- return __vmalloc_node_range(size, PAGE_SIZE, BPF_JIT_REGION_START,
BPF_JIT_REGION_END, GFP_KERNEL,
PAGE_KERNEL, 0, NUMA_NO_NODE,
__builtin_return_address(0));
- return vmalloc(size);
} void bpf_jit_free_exec(void *addr) -- 2.40.1
Greg Kroah-Hartman gregkh@linuxfoundation.org writes:
On Tue, Jul 16, 2024 at 12:36:29PM +0000, Puranjay Mohan wrote:
+CC Greg
Why?
I forgot to add you to the cc-list when sending this patch for stable.
confused,
greg k-h
Thanks, Puranjay
On Mon, Jul 01, 2024 at 11:46:59AM +0000, Puranjay Mohan wrote:
From: Russell King russell.king@oracle.com
[ Upstream commit b89ddf4cca43f1269093942cf5c4e457fd45c335 ]
Commit 91fc957c9b1d ("arm64/bpf: don't allocate BPF JIT programs in module memory") restricts BPF JIT program allocation to a 128MB region to ensure BPF programs are still in branching range of each other. However this restriction should not apply to the aarch64 JIT, since BPF_JMP | BPF_CALL are implemented as a 64-bit move into a register and then a BLR instruction - which has the effect of being able to call anything without proximity limitation.
The practical reason to relax this restriction on JIT memory is that 128MB of JIT memory can be quickly exhausted, especially where PAGE_SIZE is 64KB - one page is needed per program. In cases where seccomp filters are applied to multiple VMs on VM launch - such filters are classic BPF but converted to BPF - this can severely limit the number of VMs that can be launched. In a world where we support BPF JIT always on, turning off the JIT isn't always an option either.
Fixes: 91fc957c9b1d ("arm64/bpf: don't allocate BPF JIT programs in module memory") Suggested-by: Ard Biesheuvel ard.biesheuvel@linaro.org Signed-off-by: Russell King russell.king@oracle.com Signed-off-by: Daniel Borkmann daniel@iogearbox.net Tested-by: Alan Maguire alan.maguire@oracle.com Link: https://lore.kernel.org/bpf/1636131046-5982-2-git-send-email-alan.maguire@or... [Replace usage of in_bpf_jit() with is_bpf_text_address()] Signed-off-by: Puranjay Mohan pjy@amazon.com
arch/arm64/include/asm/extable.h | 9 --------- arch/arm64/include/asm/memory.h | 5 +---- arch/arm64/kernel/traps.c | 2 +- arch/arm64/mm/extable.c | 3 ++- arch/arm64/mm/ptdump.c | 2 -- arch/arm64/net/bpf_jit_comp.c | 7 ++----- 6 files changed, 6 insertions(+), 22 deletions(-)
This is reported to cause problems: https://lore.kernel.org/r/CA+G9fYtfAbfcQ9J9Hzq-e6yoBVG3t_iHZ=bS2eJbO_aiOcquX... so I will drop it now.
How did you test this?
And if you really need this feature, why not move to a more modern kernel version?
thanks,
greg k-h
On Thu, Jul 18, 2024 at 8:55 AM Greg KH greg@kroah.com wrote:
On Mon, Jul 01, 2024 at 11:46:59AM +0000, Puranjay Mohan wrote:
From: Russell King russell.king@oracle.com
[ Upstream commit b89ddf4cca43f1269093942cf5c4e457fd45c335 ]
Commit 91fc957c9b1d ("arm64/bpf: don't allocate BPF JIT programs in module memory") restricts BPF JIT program allocation to a 128MB region to ensure BPF programs are still in branching range of each other. However this restriction should not apply to the aarch64 JIT, since BPF_JMP | BPF_CALL are implemented as a 64-bit move into a register and then a BLR instruction - which has the effect of being able to call anything without proximity limitation.
The practical reason to relax this restriction on JIT memory is that 128MB of JIT memory can be quickly exhausted, especially where PAGE_SIZE is 64KB - one page is needed per program. In cases where seccomp filters are applied to multiple VMs on VM launch - such filters are classic BPF but converted to BPF - this can severely limit the number of VMs that can be launched. In a world where we support BPF JIT always on, turning off the JIT isn't always an option either.
Fixes: 91fc957c9b1d ("arm64/bpf: don't allocate BPF JIT programs in module memory") Suggested-by: Ard Biesheuvel ard.biesheuvel@linaro.org Signed-off-by: Russell King russell.king@oracle.com Signed-off-by: Daniel Borkmann daniel@iogearbox.net Tested-by: Alan Maguire alan.maguire@oracle.com Link: https://lore.kernel.org/bpf/1636131046-5982-2-git-send-email-alan.maguire@or... [Replace usage of in_bpf_jit() with is_bpf_text_address()] Signed-off-by: Puranjay Mohan pjy@amazon.com
arch/arm64/include/asm/extable.h | 9 --------- arch/arm64/include/asm/memory.h | 5 +---- arch/arm64/kernel/traps.c | 2 +- arch/arm64/mm/extable.c | 3 ++- arch/arm64/mm/ptdump.c | 2 -- arch/arm64/net/bpf_jit_comp.c | 7 ++----- 6 files changed, 6 insertions(+), 22 deletions(-)
This is reported to cause problems: https://lore.kernel.org/r/CA+G9fYtfAbfcQ9J9Hzq-e6yoBVG3t_iHZ=bS2eJbO_aiOcquX... so I will drop it now.
I will try to debug this!
How did you test this?
I tested this on an AWS Graviton based EC2 instance by loading 16000 BPF programs.
And if you really need this feature, why not move to a more modern kernel version? thanks,
greg k-h
Thanks, Puranjay
linux-stable-mirror@lists.linaro.org