I'd like to cut down the memory usage of parsing vmlinux BTF in ebpf-go. With some upcoming changes the library is sitting at 5MiB for a parse. Most of that memory is simply copying the BTF blob into user space. By allowing vmlinux BTF to be mmapped read-only into user space I can cut memory usage by about 75%.
Signed-off-by: Lorenz Bauer lmb@isovalent.com --- Changes in v5: - Fix error return of btf_parse_raw_mmap (Andrii) - Link to v4: https://lore.kernel.org/r/20250510-vmlinux-mmap-v4-0-69e424b2a672@isovalent....
Changes in v4: - Go back to remap_pfn_range for aarch64 compat - Dropped btf_new_no_copy (Andrii) - Fixed nits in selftests (Andrii) - Clearer error handling in the mmap handler (Andrii) - Fixed build on s390 - Link to v3: https://lore.kernel.org/r/20250505-vmlinux-mmap-v3-0-5d53afa060e8@isovalent....
Changes in v3: - Remove slightly confusing calculation of trailing (Alexei) - Use vm_insert_page (Alexei) - Simplified libbpf code - Link to v2: https://lore.kernel.org/r/20250502-vmlinux-mmap-v2-0-95c271434519@isovalent....
Changes in v2: - Use btf__new in selftest - Avoid vm_iomap_memory in btf_vmlinux_mmap - Add VM_DONTDUMP - Add support to libbpf - Link to v1: https://lore.kernel.org/r/20250501-vmlinux-mmap-v1-0-aa2724572598@isovalent....
--- Lorenz Bauer (3): btf: allow mmap of vmlinux btf selftests: bpf: add a test for mmapable vmlinux BTF libbpf: Use mmap to parse vmlinux BTF from sysfs
include/asm-generic/vmlinux.lds.h | 3 +- kernel/bpf/sysfs_btf.c | 32 ++++++++ tools/lib/bpf/btf.c | 89 +++++++++++++++++----- tools/testing/selftests/bpf/prog_tests/btf_sysfs.c | 81 ++++++++++++++++++++ 4 files changed, 186 insertions(+), 19 deletions(-) --- base-commit: 7220eabff8cb4af3b93cd021aa853b9f5df2923f change-id: 20250501-vmlinux-mmap-2ec5563c3ef1
Best regards,
User space needs access to kernel BTF for many modern features of BPF. Right now each process needs to read the BTF blob either in pieces or as a whole. Allow mmapping the sysfs file so that processes can directly access the memory allocated for it in the kernel.
remap_pfn_range is used instead of vm_insert_page due to aarch64 compatibility issues.
Tested-by: Alan Maguire alan.maguire@oracle.com Signed-off-by: Lorenz Bauer lmb@isovalent.com --- include/asm-generic/vmlinux.lds.h | 3 ++- kernel/bpf/sysfs_btf.c | 32 ++++++++++++++++++++++++++++++++ 2 files changed, 34 insertions(+), 1 deletion(-)
diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 58a635a6d5bdf0c53c267c2a3d21a5ed8678ce73..1750390735fac7637cc4d2fa05f96cb2a36aa448 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -667,10 +667,11 @@ defined(CONFIG_AUTOFDO_CLANG) || defined(CONFIG_PROPELLER_CLANG) */ #ifdef CONFIG_DEBUG_INFO_BTF #define BTF \ + . = ALIGN(PAGE_SIZE); \ .BTF : AT(ADDR(.BTF) - LOAD_OFFSET) { \ BOUNDED_SECTION_BY(.BTF, _BTF) \ } \ - . = ALIGN(4); \ + . = ALIGN(PAGE_SIZE); \ .BTF_ids : AT(ADDR(.BTF_ids) - LOAD_OFFSET) { \ *(.BTF_ids) \ } diff --git a/kernel/bpf/sysfs_btf.c b/kernel/bpf/sysfs_btf.c index 81d6cf90584a7157929c50f62a5c6862e7a3d081..941d0d2427e3a2d27e8f1cff7b6424d0d41817c1 100644 --- a/kernel/bpf/sysfs_btf.c +++ b/kernel/bpf/sysfs_btf.c @@ -7,14 +7,46 @@ #include <linux/kobject.h> #include <linux/init.h> #include <linux/sysfs.h> +#include <linux/mm.h> +#include <linux/io.h> +#include <linux/btf.h>
/* See scripts/link-vmlinux.sh, gen_btf() func for details */ extern char __start_BTF[]; extern char __stop_BTF[];
+static int btf_sysfs_vmlinux_mmap(struct file *filp, struct kobject *kobj, + const struct bin_attribute *attr, + struct vm_area_struct *vma) +{ + unsigned long pages = PAGE_ALIGN(attr->size) >> PAGE_SHIFT; + size_t vm_size = vma->vm_end - vma->vm_start; + phys_addr_t addr = virt_to_phys(__start_BTF); + unsigned long pfn = addr >> PAGE_SHIFT; + + if (attr->private != __start_BTF || !PAGE_ALIGNED(addr)) + return -EINVAL; + + if (vma->vm_pgoff) + return -EINVAL; + + if (vma->vm_flags & (VM_WRITE | VM_EXEC | VM_MAYSHARE)) + return -EACCES; + + if (pfn + pages < pfn) + return -EINVAL; + + if ((vm_size >> PAGE_SHIFT) > pages) + return -EINVAL; + + vm_flags_mod(vma, VM_DONTDUMP, VM_MAYEXEC | VM_MAYWRITE); + return remap_pfn_range(vma, vma->vm_start, pfn, vm_size, vma->vm_page_prot); +} + static struct bin_attribute bin_attr_btf_vmlinux __ro_after_init = { .attr = { .name = "vmlinux", .mode = 0444, }, .read_new = sysfs_bin_attr_simple_read, + .mmap = btf_sysfs_vmlinux_mmap, };
struct kobject *btf_kobj;
On Tue, May 20, 2025 at 02:01:17PM +0100, Lorenz Bauer wrote:
User space needs access to kernel BTF for many modern features of BPF. Right now each process needs to read the BTF blob either in pieces or as a whole. Allow mmapping the sysfs file so that processes can directly access the memory allocated for it in the kernel.
remap_pfn_range is used instead of vm_insert_page due to aarch64 compatibility issues.
Tested-by: Alan Maguire alan.maguire@oracle.com Signed-off-by: Lorenz Bauer lmb@isovalent.com
include/asm-generic/vmlinux.lds.h | 3 ++- kernel/bpf/sysfs_btf.c | 32 ++++++++++++++++++++++++++++++++ 2 files changed, 34 insertions(+), 1 deletion(-)
diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 58a635a6d5bdf0c53c267c2a3d21a5ed8678ce73..1750390735fac7637cc4d2fa05f96cb2a36aa448 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -667,10 +667,11 @@ defined(CONFIG_AUTOFDO_CLANG) || defined(CONFIG_PROPELLER_CLANG) */ #ifdef CONFIG_DEBUG_INFO_BTF #define BTF \
+ . = ALIGN(PAGE_SIZE); \ .BTF : AT(ADDR(.BTF) - LOAD_OFFSET) { \ BOUNDED_SECTION_BY(.BTF, _BTF) \ } \
- . = ALIGN(4); \
+ . = ALIGN(PAGE_SIZE); \ .BTF_ids : AT(ADDR(.BTF_ids) - LOAD_OFFSET) { \ *(.BTF_ids) \ }
diff --git a/kernel/bpf/sysfs_btf.c b/kernel/bpf/sysfs_btf.c index 81d6cf90584a7157929c50f62a5c6862e7a3d081..941d0d2427e3a2d27e8f1cff7b6424d0d41817c1 100644 --- a/kernel/bpf/sysfs_btf.c +++ b/kernel/bpf/sysfs_btf.c @@ -7,14 +7,46 @@ #include <linux/kobject.h> #include <linux/init.h> #include <linux/sysfs.h> +#include <linux/mm.h> +#include <linux/io.h> +#include <linux/btf.h> /* See scripts/link-vmlinux.sh, gen_btf() func for details */ extern char __start_BTF[]; extern char __stop_BTF[]; +static int btf_sysfs_vmlinux_mmap(struct file *filp, struct kobject *kobj,
const struct bin_attribute *attr,
struct vm_area_struct *vma)
+{
+ unsigned long pages = PAGE_ALIGN(attr->size) >> PAGE_SHIFT;
+ size_t vm_size = vma->vm_end - vma->vm_start;
+ phys_addr_t addr = virt_to_phys(__start_BTF);
+ unsigned long pfn = addr >> PAGE_SHIFT;
+ if (attr->private != __start_BTF || !PAGE_ALIGNED(addr))
With vmlinux.lds.h change above, is the page aligned check still needed?
Oh also can the size of btf region be non-page aligned?
return -EINVAL;
+ if (vma->vm_pgoff)
return -EINVAL;
+ if (vma->vm_flags & (VM_WRITE | VM_EXEC | VM_MAYSHARE))
return -EACCES;
+ if (pfn + pages < pfn)
return -EINVAL;
+ if ((vm_size >> PAGE_SHIFT) > pages)
return -EINVAL;
+ vm_flags_mod(vma, VM_DONTDUMP, VM_MAYEXEC | VM_MAYWRITE);
Is it ok for fork() to keep the mapping in the child? (i.e. do you need VM_DONTCOPY). BTW VM_DONTDUMP is added by remap_pfn_range(), so if you want you can remove it here.
+ return remap_pfn_range(vma, vma->vm_start, pfn, vm_size, vma->vm_page_prot);
+}
static struct bin_attribute bin_attr_btf_vmlinux __ro_after_init = { .attr = { .name = "vmlinux", .mode = 0444, }, .read_new = sysfs_bin_attr_simple_read,
+ .mmap = btf_sysfs_vmlinux_mmap,
}; struct kobject *btf_kobj;
Overall this looks good to me, so you can add:
Reviewed-by: Shakeel Butt shakeel.butt@linux.dev
On Thu, May 22, 2025 at 4:01 PM Shakeel Butt shakeel.butt@linux.dev wrote:
On Tue, May 20, 2025 at 02:01:17PM +0100, Lorenz Bauer wrote:
User space needs access to kernel BTF for many modern features of BPF. Right now each process needs to read the BTF blob either in pieces or as a whole. Allow mmapping the sysfs file so that processes can directly access the memory allocated for it in the kernel.
remap_pfn_range is used instead of vm_insert_page due to aarch64 compatibility issues.
Tested-by: Alan Maguire alan.maguire@oracle.com Signed-off-by: Lorenz Bauer lmb@isovalent.com
include/asm-generic/vmlinux.lds.h | 3 ++- kernel/bpf/sysfs_btf.c | 32 ++++++++++++++++++++++++++++++++ 2 files changed, 34 insertions(+), 1 deletion(-)
diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 58a635a6d5bdf0c53c267c2a3d21a5ed8678ce73..1750390735fac7637cc4d2fa05f96cb2a36aa448 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -667,10 +667,11 @@ defined(CONFIG_AUTOFDO_CLANG) || defined(CONFIG_PROPELLER_CLANG) */ #ifdef CONFIG_DEBUG_INFO_BTF #define BTF \
. = ALIGN(PAGE_SIZE); \ .BTF : AT(ADDR(.BTF) - LOAD_OFFSET) { \ BOUNDED_SECTION_BY(.BTF, _BTF) \ } \
. = ALIGN(4); \
. = ALIGN(PAGE_SIZE); \ .BTF_ids : AT(ADDR(.BTF_ids) - LOAD_OFFSET) { \ *(.BTF_ids) \ }
diff --git a/kernel/bpf/sysfs_btf.c b/kernel/bpf/sysfs_btf.c index 81d6cf90584a7157929c50f62a5c6862e7a3d081..941d0d2427e3a2d27e8f1cff7b6424d0d41817c1 100644 --- a/kernel/bpf/sysfs_btf.c +++ b/kernel/bpf/sysfs_btf.c @@ -7,14 +7,46 @@ #include <linux/kobject.h> #include <linux/init.h> #include <linux/sysfs.h> +#include <linux/mm.h> +#include <linux/io.h> +#include <linux/btf.h>
/* See scripts/link-vmlinux.sh, gen_btf() func for details */ extern char __start_BTF[]; extern char __stop_BTF[];
+static int btf_sysfs_vmlinux_mmap(struct file *filp, struct kobject *kobj,
const struct bin_attribute *attr,
struct vm_area_struct *vma)
+{
unsigned long pages = PAGE_ALIGN(attr->size) >> PAGE_SHIFT;
size_t vm_size = vma->vm_end - vma->vm_start;
phys_addr_t addr = virt_to_phys(__start_BTF);
unsigned long pfn = addr >> PAGE_SHIFT;
if (attr->private != __start_BTF || !PAGE_ALIGNED(addr))
With vmlinux.lds.h change above, is the page aligned check still needed?
Oh also can the size of btf region be non-page aligned?
I'd probably leave this as a sanity/safety check, just in case someone modifies linker script and we miss this.
BTF region size isn't page-aligned but in the linker script we page-align .BTF_ids that follows it, so the padding should be zeroed out. And Lorenz added a check in the selftest to validate this, so we should be covered.
return -EINVAL;
if (vma->vm_pgoff)
return -EINVAL;
if (vma->vm_flags & (VM_WRITE | VM_EXEC | VM_MAYSHARE))
return -EACCES;
if (pfn + pages < pfn)
return -EINVAL;
if ((vm_size >> PAGE_SHIFT) > pages)
return -EINVAL;
vm_flags_mod(vma, VM_DONTDUMP, VM_MAYEXEC | VM_MAYWRITE);
Is it ok for fork() to keep the mapping in the child? (i.e. do you need VM_DONTCOPY). BTW VM_DONTDUMP is added by remap_pfn_range(), so if you want you can remove it here.
I think it's good to keep the mapping across fork(); otherwise libbpf might crash in the child after fork() because the BTF data would suddenly disappear.
return remap_pfn_range(vma, vma->vm_start, pfn, vm_size, vma->vm_page_prot);
+}
static struct bin_attribute bin_attr_btf_vmlinux __ro_after_init = { .attr = { .name = "vmlinux", .mode = 0444, }, .read_new = sysfs_bin_attr_simple_read,
.mmap = btf_sysfs_vmlinux_mmap,
};
struct kobject *btf_kobj;
Overall this looks good to me, so you can add:
Reviewed-by: Shakeel Butt shakeel.butt@linux.dev
Thanks Shakeel, I've applied the patches to bpf-next!
Hello Lorenz,
On Tue, May 20, 2025 at 02:01:17PM +0100, Lorenz Bauer wrote:
diff --git a/kernel/bpf/sysfs_btf.c b/kernel/bpf/sysfs_btf.c index 81d6cf90584a7157929c50f62a5c6862e7a3d081..941d0d2427e3a2d27e8f1cff7b6424d0d41817c1 100644 --- a/kernel/bpf/sysfs_btf.c +++ b/kernel/bpf/sysfs_btf.c
extern char __start_BTF[]; extern char __stop_BTF[];
+static int btf_sysfs_vmlinux_mmap(struct file *filp, struct kobject *kobj,
const struct bin_attribute *attr,
struct vm_area_struct *vma)
+{
+ unsigned long pages = PAGE_ALIGN(attr->size) >> PAGE_SHIFT;
+ size_t vm_size = vma->vm_end - vma->vm_start;
+ phys_addr_t addr = virt_to_phys(__start_BTF);
I am getting the following warning on arm64, which seems to be related to this code. Line numbers are based on commit cd031354087d8ae ("Merge branch 'net-mlx5e-add-support-for-pcie-congestion-events'") on the net-next branch.
[ 58.896157] virt_to_phys used for non-linear address: 000000009fea9737 (__start_BTF+0x0/0x685530) [ 23.988669] WARNING: CPU: 25 PID: 1442 at arch/arm64/mm/physaddr.c:15 __virt_to_phys (arch/arm64/mm/physaddr.c:?) [ 24.018136] Modules linked in: nvidia_cspmu(E) mlx5_ib(E) ipmi_ssif(E) arm_smmuv3_pmu(E) arm_cspmu_module(E) coresight_trbe(E) ib_uverbs(E) ipmi_devintf(E) ipmi_msghandler(E) coresight_stm(E) coresight_etm4x(E) coresight_tmc(E) coresight_funnel(E) stm_core(E) coresight(E) cppc_cpufreq(E) sch_fq_codel(E) drm(E) backlight(E) drm_panel_orientation_quirks(E) xhci_pci(E) xhci_hcd(E) sm3_ce(E) sha3_ce(E) sha512_ce(E) spi_tegra210_quad(E) acpi_power_meter(E) loop(E) efivarfs(E) autofs4(E) [ 24.075371] Tainted: [E]=UNSIGNED_MODULE, [N]=TEST [ 24.080276] Hardware name: Quanta S7GM 20S7GCU0010/S7G MB (CG1), BIOS 3D22 07/03/2024 [ 24.088295] pstate: 63400009 (nZCv daif +PAN -UAO +TCO +DIT -SSBS BTYPE=--) [ 24.098440] pc : __virt_to_phys (arch/arm64/mm/physaddr.c:?) [ 24.105398] lr : __virt_to_phys (arch/arm64/mm/physaddr.c:?) [ 24.112227] sp : ffff8000ba00f8e0 [ 24.115620] x29: ffff8000ba00f8e0 x28: ffff8000ba00faf0 x27: ffff8000ba00fa88 [ 24.122919] x26: ffff8000ba00fa40 x25: ffff800082772000 x24: 0000fffd6db70000 [ 24.130226] x23: 0000000000685530 x22: 0000fffd6e200000 x21: ffff800081cc0000 [ 24.140540] x20: ffff800081be02d8 x19: ffff800081cc0000 x18: 5f5f282037333739 [ 24.150708] x17: 6165663930303030 x16: 0000000000000fc4 x15: 0000000000000003 [ 24.160737] x14: ffff800082923398 x13: 0000000000000003 x12: 0000000000000003 [ 24.168042] x11: 00000000fffeffff x10: ffff800082663784 x9 : cc38fcac5cdabe00 [ 24.175348] x8 : 0001000000000000 x7 : ffff8000813dd878 x6 : 0000000000000000 [ 24.182653] x5 : 0000000000000001 x4 : 0000000000000001 x3 : 0000000000000000 [ 24.189959] x2 : 0000000000000000 x1 : ffff800081a3a6d0 x0 : 0000000000000055 [ 24.197257] Call trace: [ 24.199761] __virt_to_phys (arch/arm64/mm/physaddr.c:?) 
(P) [ 24.206883] btf_sysfs_vmlinux_mmap (kernel/bpf/sysfs_btf.c:27) [ 24.214264] sysfs_kf_bin_mmap (fs/sysfs/file.c:179) [ 24.218536] kernfs_fop_mmap (fs/kernfs/file.c:462) [ 24.222461] mmap_region (./include/linux/fs.h:? mm/internal.h:167 mm/vma.c:2405 mm/vma.c:2467 mm/vma.c:2622 mm/vma.c:2692)
Should __pa_symbol() be used instead of virt_to_phys()?
Thanks --breno
Hi Breno,
Thanks for reaching out.
On Thu, Jul 17, 2025 at 1:39 PM Breno Leitao leitao@debian.org wrote:
Should __pa_symbol() be used instead of virt_to_phys()?
I'm not really well versed in mm in general. Looking around a bit, I found an explanation of __pa_symbol() vs. virt_to_phys() in [1]. Based on that, your suggested fix makes sense to me.
Let me run the patch against bpf-ci and see what happens.
[1]: https://lore.kernel.org/all/90667b2b7f773308318261f96ebefd1a67133c4c.1732464...
Lorenz
On Thu, Jul 17, 2025 at 6:18 AM Lorenz Bauer lmb@isovalent.com wrote:
Hi Breno,
Thanks for reaching out.
On Thu, Jul 17, 2025 at 1:39 PM Breno Leitao leitao@debian.org wrote:
Should __pa_symbol() be used instead of virt_to_phys()?
I'm not really well versed with mm in general. Looking around a bit I found some explanation in [1]. Your suggested fix does make sense to me based on that.
Let me run the patch against bpf-ci and see what happens.
[1]: https://lore.kernel.org/all/90667b2b7f773308318261f96ebefd1a67133c4c.1732464...
Thanks for the link. Key quote: "arm64 maps the kernel in the vmalloc space." I think that mapping shouldn't destroy the linearity of kernel rodata. __pa_symbol() should work for __start_BTF, but it would be good to double-check with Ard that the rest stays linear.
On Thu, Jul 17, 2025 at 3:49 PM Alexei Starovoitov alexei.starovoitov@gmail.com wrote:
__pa_symbol() should work for __start_BTF, but would be good to double check with Ard that the rest stays linear.
Alexei,
This code in the arm64 setup does make me think we'll be OK.
kernel_code.start = __pa_symbol(_stext); kernel_code.end = __pa_symbol(__init_begin - 1); kernel_data.start = __pa_symbol(_sdata); kernel_data.end = __pa_symbol(_end - 1);
Using these as start and end only makes sense to me if the addresses are linear? See https://elixir.bootlin.com/linux/v6.15.6/source/arch/arm64/kernel/setup.c#L2...
Let me know if you want me to double check with Ard regardless.
Best Lorenz
On Thu, Jul 17, 2025 at 8:15 AM Lorenz Bauer lmb@isovalent.com wrote:
On Thu, Jul 17, 2025 at 3:49 PM Alexei Starovoitov alexei.starovoitov@gmail.com wrote:
__pa_symbol() should work for __start_BTF, but would be good to double check with Ard that the rest stays linear.
Alexei,
This code in the arm64 setup does make me think we'll be OK.
kernel_code.start = __pa_symbol(_stext); kernel_code.end = __pa_symbol(__init_begin - 1); kernel_data.start = __pa_symbol(_sdata); kernel_data.end = __pa_symbol(_end - 1);
Using these as start and end only makes sense to me if the addresses are linear? See https://elixir.bootlin.com/linux/v6.15.6/source/arch/arm64/kernel/setup.c#L2...
Thanks for checking. lgtm.
Add a basic test for the ability to mmap /sys/kernel/btf/vmlinux. Ensure that the data is valid BTF and that it is padded with zero.
Tested-by: Alan Maguire alan.maguire@oracle.com Signed-off-by: Lorenz Bauer lmb@isovalent.com --- tools/testing/selftests/bpf/prog_tests/btf_sysfs.c | 81 ++++++++++++++++++++++ 1 file changed, 81 insertions(+)
diff --git a/tools/testing/selftests/bpf/prog_tests/btf_sysfs.c b/tools/testing/selftests/bpf/prog_tests/btf_sysfs.c new file mode 100644 index 0000000000000000000000000000000000000000..3923e64c4c1d0f1dfeef2a39c7bbab7c9a19f0ca --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/btf_sysfs.c @@ -0,0 +1,81 @@ +// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause +/* Copyright (c) 2025 Isovalent */ + +#include <test_progs.h> +#include <bpf/btf.h> +#include <sys/stat.h> +#include <sys/mman.h> +#include <fcntl.h> +#include <unistd.h> + +static void test_btf_mmap_sysfs(const char *path, struct btf *base) +{ + struct stat st; + __u64 btf_size, end; + void *raw_data = NULL; + int fd = -1; + long page_size; + struct btf *btf = NULL; + + page_size = sysconf(_SC_PAGESIZE); + if (!ASSERT_GE(page_size, 0, "get_page_size")) + goto cleanup; + + if (!ASSERT_OK(stat(path, &st), "stat_btf")) + goto cleanup; + + btf_size = st.st_size; + end = (btf_size + page_size - 1) / page_size * page_size; + + fd = open(path, O_RDONLY); + if (!ASSERT_GE(fd, 0, "open_btf")) + goto cleanup; + + raw_data = mmap(NULL, btf_size, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0); + if (!ASSERT_EQ(raw_data, MAP_FAILED, "mmap_btf_writable")) + goto cleanup; + + raw_data = mmap(NULL, btf_size, PROT_READ, MAP_SHARED, fd, 0); + if (!ASSERT_EQ(raw_data, MAP_FAILED, "mmap_btf_shared")) + goto cleanup; + + raw_data = mmap(NULL, end + 1, PROT_READ, MAP_PRIVATE, fd, 0); + if (!ASSERT_EQ(raw_data, MAP_FAILED, "mmap_btf_invalid_size")) + goto cleanup; + + raw_data = mmap(NULL, end, PROT_READ, MAP_PRIVATE, fd, 0); + if (!ASSERT_OK_PTR(raw_data, "mmap_btf")) + goto cleanup; + + if (!ASSERT_EQ(mprotect(raw_data, btf_size, PROT_READ | PROT_WRITE), -1, + "mprotect_writable")) + goto cleanup; + + if (!ASSERT_EQ(mprotect(raw_data, btf_size, PROT_READ | PROT_EXEC), -1, + "mprotect_executable")) + goto cleanup; + + /* Check padding is zeroed */ + for (int i = btf_size; i < end; i++) { + if (((__u8 *)raw_data)[i] != 0) { + 
PRINT_FAIL("tail of BTF is not zero at page offset %d\n", i); + goto cleanup; + } + } + + btf = btf__new_split(raw_data, btf_size, base); + if (!ASSERT_OK_PTR(btf, "parse_btf")) + goto cleanup; + +cleanup: + btf__free(btf); + if (raw_data && raw_data != MAP_FAILED) + munmap(raw_data, btf_size); + if (fd >= 0) + close(fd); +} + +void test_btf_sysfs(void) +{ + test_btf_mmap_sysfs("/sys/kernel/btf/vmlinux", NULL); +}
Teach libbpf to use mmap when parsing vmlinux BTF from /sys. We don't apply this to fall-back paths on the regular file system because there is no way to ensure that modifications underlying the MAP_PRIVATE mapping are not visible to the process.
Acked-by: Andrii Nakryiko andrii@kernel.org Tested-by: Alan Maguire alan.maguire@oracle.com Signed-off-by: Lorenz Bauer lmb@isovalent.com --- tools/lib/bpf/btf.c | 89 ++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 71 insertions(+), 18 deletions(-)
diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c index f18d7e6a453cd9e5c384487659df04f7efafdf5a..3b98ac40bbd66c4fa688f967f9370b5c92610ba0 100644 --- a/tools/lib/bpf/btf.c +++ b/tools/lib/bpf/btf.c @@ -12,6 +12,7 @@ #include <sys/utsname.h> #include <sys/param.h> #include <sys/stat.h> +#include <sys/mman.h> #include <linux/kernel.h> #include <linux/err.h> #include <linux/btf.h> @@ -120,6 +121,9 @@ struct btf { /* whether base_btf should be freed in btf_free for this instance */ bool owns_base;
+ /* whether raw_data is a (read-only) mmap */ + bool raw_data_is_mmap; + /* BTF object FD, if loaded into kernel */ int fd;
@@ -951,6 +955,17 @@ static bool btf_is_modifiable(const struct btf *btf) return (void *)btf->hdr != btf->raw_data; }
+static void btf_free_raw_data(struct btf *btf) +{ + if (btf->raw_data_is_mmap) { + munmap(btf->raw_data, btf->raw_size); + btf->raw_data_is_mmap = false; + } else { + free(btf->raw_data); + } + btf->raw_data = NULL; +} + void btf__free(struct btf *btf) { if (IS_ERR_OR_NULL(btf)) @@ -970,7 +985,7 @@ void btf__free(struct btf *btf) free(btf->types_data); strset__free(btf->strs_set); } - free(btf->raw_data); + btf_free_raw_data(btf); free(btf->raw_data_swapped); free(btf->type_offs); if (btf->owns_base) @@ -1030,7 +1045,7 @@ struct btf *btf__new_empty_split(struct btf *base_btf) return libbpf_ptr(btf_new_empty(base_btf)); }
-static struct btf *btf_new(const void *data, __u32 size, struct btf *base_btf) +static struct btf *btf_new(const void *data, __u32 size, struct btf *base_btf, bool is_mmap) { struct btf *btf; int err; @@ -1050,12 +1065,18 @@ static struct btf *btf_new(const void *data, __u32 size, struct btf *base_btf) btf->start_str_off = base_btf->hdr->str_len; }
- btf->raw_data = malloc(size); - if (!btf->raw_data) { - err = -ENOMEM; - goto done; + if (is_mmap) { + btf->raw_data = (void *)data; + btf->raw_data_is_mmap = true; + } else { + btf->raw_data = malloc(size); + if (!btf->raw_data) { + err = -ENOMEM; + goto done; + } + memcpy(btf->raw_data, data, size); } - memcpy(btf->raw_data, data, size); + btf->raw_size = size;
btf->hdr = btf->raw_data; @@ -1083,12 +1104,12 @@ static struct btf *btf_new(const void *data, __u32 size, struct btf *base_btf)
struct btf *btf__new(const void *data, __u32 size) { - return libbpf_ptr(btf_new(data, size, NULL)); + return libbpf_ptr(btf_new(data, size, NULL, false)); }
struct btf *btf__new_split(const void *data, __u32 size, struct btf *base_btf) { - return libbpf_ptr(btf_new(data, size, base_btf)); + return libbpf_ptr(btf_new(data, size, base_btf, false)); }
struct btf_elf_secs { @@ -1209,7 +1230,7 @@ static struct btf *btf_parse_elf(const char *path, struct btf *base_btf,
if (secs.btf_base_data) { dist_base_btf = btf_new(secs.btf_base_data->d_buf, secs.btf_base_data->d_size, - NULL); + NULL, false); if (IS_ERR(dist_base_btf)) { err = PTR_ERR(dist_base_btf); dist_base_btf = NULL; @@ -1218,7 +1239,7 @@ static struct btf *btf_parse_elf(const char *path, struct btf *base_btf, }
btf = btf_new(secs.btf_data->d_buf, secs.btf_data->d_size, - dist_base_btf ?: base_btf); + dist_base_btf ?: base_btf, false); if (IS_ERR(btf)) { err = PTR_ERR(btf); goto done; @@ -1335,7 +1356,7 @@ static struct btf *btf_parse_raw(const char *path, struct btf *base_btf) }
/* finally parse BTF data */ - btf = btf_new(data, sz, base_btf); + btf = btf_new(data, sz, base_btf, false);
err_out: free(data); @@ -1354,6 +1375,37 @@ struct btf *btf__parse_raw_split(const char *path, struct btf *base_btf) return libbpf_ptr(btf_parse_raw(path, base_btf)); }
+static struct btf *btf_parse_raw_mmap(const char *path, struct btf *base_btf) +{ + struct stat st; + void *data; + struct btf *btf; + int fd, err; + + fd = open(path, O_RDONLY); + if (fd < 0) + return libbpf_err_ptr(-errno); + + if (fstat(fd, &st) < 0) { + err = -errno; + close(fd); + return libbpf_err_ptr(err); + } + + data = mmap(NULL, st.st_size, PROT_READ, MAP_PRIVATE, fd, 0); + err = -errno; + close(fd); + + if (data == MAP_FAILED) + return libbpf_err_ptr(err); + + btf = btf_new(data, st.st_size, base_btf, true); + if (IS_ERR(btf)) + munmap(data, st.st_size); + + return btf; +} + static struct btf *btf_parse(const char *path, struct btf *base_btf, struct btf_ext **btf_ext) { struct btf *btf; @@ -1618,7 +1670,7 @@ struct btf *btf_get_from_fd(int btf_fd, struct btf *base_btf) goto exit_free; }
- btf = btf_new(ptr, btf_info.btf_size, base_btf); + btf = btf_new(ptr, btf_info.btf_size, base_btf, false);
exit_free: free(ptr); @@ -1658,10 +1710,8 @@ struct btf *btf__load_from_kernel_by_id(__u32 id)
static void btf_invalidate_raw_data(struct btf *btf) { - if (btf->raw_data) { - free(btf->raw_data); - btf->raw_data = NULL; - } + if (btf->raw_data) + btf_free_raw_data(btf); if (btf->raw_data_swapped) { free(btf->raw_data_swapped); btf->raw_data_swapped = NULL; @@ -5331,7 +5381,10 @@ struct btf *btf__load_vmlinux_btf(void) pr_warn("kernel BTF is missing at '%s', was CONFIG_DEBUG_INFO_BTF enabled?\n", sysfs_btf_path); } else { - btf = btf__parse(sysfs_btf_path, NULL); + btf = btf_parse_raw_mmap(sysfs_btf_path, NULL); + if (IS_ERR(btf)) + btf = btf__parse(sysfs_btf_path, NULL); + if (!btf) { err = -errno; pr_warn("failed to read kernel BTF from '%s': %s\n",
Hello:
This series was applied to bpf/bpf-next.git (master) by Andrii Nakryiko andrii@kernel.org:
On Tue, 20 May 2025 14:01:16 +0100 you wrote:
I'd like to cut down the memory usage of parsing vmlinux BTF in ebpf-go. With some upcoming changes the library is sitting at 5MiB for a parse. Most of that memory is simply copying the BTF blob into user space. By allowing vmlinux BTF to be mmapped read-only into user space I can cut memory usage by about 75%.
Signed-off-by: Lorenz Bauer lmb@isovalent.com
[...]
Here is the summary with links: - [bpf-next,v5,1/3] btf: allow mmap of vmlinux btf https://git.kernel.org/bpf/bpf-next/c/a539e2a6d51d - [bpf-next,v5,2/3] selftests: bpf: add a test for mmapable vmlinux BTF https://git.kernel.org/bpf/bpf-next/c/828226b69ff5 - [bpf-next,v5,3/3] libbpf: Use mmap to parse vmlinux BTF from sysfs https://git.kernel.org/bpf/bpf-next/c/3c0421c93ce4
You are awesome, thank you!
linux-kselftest-mirror@lists.linaro.org