I'd like to cut down the memory usage of parsing vmlinux BTF in ebpf-go. With some upcoming changes the library is sitting at 5MiB for a parse. Most of that memory is simply copying the BTF blob into user space. By allowing vmlinux BTF to be mmapped read-only into user space I can cut memory usage by about 75%.
Signed-off-by: Lorenz Bauer lmb@isovalent.com --- Changes in v3: - Remove slightly confusing calculation of trailing (Alexei) - Use vm_insert_page (Alexei) - Simplified libbpf code - Link to v2: https://lore.kernel.org/r/20250502-vmlinux-mmap-v2-0-95c271434519@isovalent....
Changes in v2: - Use btf__new in selftest - Avoid vm_iomap_memory in btf_vmlinux_mmap - Add VM_DONTDUMP - Add support to libbpf - Link to v1: https://lore.kernel.org/r/20250501-vmlinux-mmap-v1-0-aa2724572598@isovalent....
--- Lorenz Bauer (3): btf: allow mmap of vmlinux btf selftests: bpf: add a test for mmapable vmlinux BTF libbpf: Use mmap to parse vmlinux BTF from sysfs
include/asm-generic/vmlinux.lds.h | 3 +- kernel/bpf/sysfs_btf.c | 37 ++++++++++ tools/lib/bpf/btf.c | 83 +++++++++++++++++++--- tools/testing/selftests/bpf/prog_tests/btf_sysfs.c | 83 ++++++++++++++++++++++ 4 files changed, 194 insertions(+), 12 deletions(-) --- base-commit: 38d976c32d85ef12dcd2b8a231196f7049548477 change-id: 20250501-vmlinux-mmap-2ec5563c3ef1
Best regards,
User space needs access to kernel BTF for many modern features of BPF. Right now each process needs to read the BTF blob either in pieces or as a whole. Allow mmapping the sysfs file so that processes can directly access the memory allocated for it in the kernel.
Signed-off-by: Lorenz Bauer lmb@isovalent.com --- include/asm-generic/vmlinux.lds.h | 3 ++- kernel/bpf/sysfs_btf.c | 37 +++++++++++++++++++++++++++++++++++++ 2 files changed, 39 insertions(+), 1 deletion(-)
diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 58a635a6d5bdf0c53c267c2a3d21a5ed8678ce73..1750390735fac7637cc4d2fa05f96cb2a36aa448 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -667,10 +667,11 @@ defined(CONFIG_AUTOFDO_CLANG) || defined(CONFIG_PROPELLER_CLANG) */ #ifdef CONFIG_DEBUG_INFO_BTF #define BTF \ + . = ALIGN(PAGE_SIZE); \ .BTF : AT(ADDR(.BTF) - LOAD_OFFSET) { \ BOUNDED_SECTION_BY(.BTF, _BTF) \ } \ - . = ALIGN(4); \ + . = ALIGN(PAGE_SIZE); \ .BTF_ids : AT(ADDR(.BTF_ids) - LOAD_OFFSET) { \ *(.BTF_ids) \ } diff --git a/kernel/bpf/sysfs_btf.c b/kernel/bpf/sysfs_btf.c index 81d6cf90584a7157929c50f62a5c6862e7a3d081..37278d7f38ae72f2d7efcfa859e86aaf12e39a25 100644 --- a/kernel/bpf/sysfs_btf.c +++ b/kernel/bpf/sysfs_btf.c @@ -7,14 +7,51 @@ #include <linux/kobject.h> #include <linux/init.h> #include <linux/sysfs.h> +#include <linux/mm.h> +#include <linux/io.h> +#include <linux/btf.h>
/* See scripts/link-vmlinux.sh, gen_btf() func for details */ extern char __start_BTF[]; extern char __stop_BTF[];
+static int btf_sysfs_vmlinux_mmap(struct file *filp, struct kobject *kobj, + const struct bin_attribute *attr, + struct vm_area_struct *vma) +{ + unsigned long pages = PAGE_ALIGN(attr->size) >> PAGE_SHIFT; + size_t vm_size = vma->vm_end - vma->vm_start; + unsigned long addr = (unsigned long)attr->private; + int i, err = 0; + + if (addr != (unsigned long)__start_BTF || !PAGE_ALIGNED(addr)) + return -EINVAL; + + if (vma->vm_pgoff) + return -EINVAL; + + if (vma->vm_flags & (VM_WRITE | VM_EXEC | VM_MAYSHARE)) + return -EACCES; + + if (vm_size >> PAGE_SHIFT > pages) + return -EINVAL; + + vm_flags_mod(vma, VM_DONTDUMP, VM_MAYEXEC | VM_MAYWRITE); + + for (i = 0; i < pages && !err; i++, addr += PAGE_SIZE) + err = vm_insert_page(vma, vma->vm_start + i * PAGE_SIZE, + virt_to_page(addr)); + + if (err) + zap_vma_pages(vma); + + return err; +} + static struct bin_attribute bin_attr_btf_vmlinux __ro_after_init = { .attr = { .name = "vmlinux", .mode = 0444, }, .read_new = sysfs_bin_attr_simple_read, + .mmap = btf_sysfs_vmlinux_mmap, };
struct kobject *btf_kobj;
On Mon, May 5, 2025 at 11:39 AM Lorenz Bauer lmb@isovalent.com wrote:
User space needs access to kernel BTF for many modern features of BPF. Right now each process needs to read the BTF blob either in pieces or as a whole. Allow mmapping the sysfs file so that processes can directly access the memory allocated for it in the kernel.
Signed-off-by: Lorenz Bauer lmb@isovalent.com
include/asm-generic/vmlinux.lds.h | 3 ++- kernel/bpf/sysfs_btf.c | 37 +++++++++++++++++++++++++++++++++++++ 2 files changed, 39 insertions(+), 1 deletion(-)
diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 58a635a6d5bdf0c53c267c2a3d21a5ed8678ce73..1750390735fac7637cc4d2fa05f96cb2a36aa448 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -667,10 +667,11 @@ defined(CONFIG_AUTOFDO_CLANG) || defined(CONFIG_PROPELLER_CLANG) */ #ifdef CONFIG_DEBUG_INFO_BTF #define BTF \
. = ALIGN(PAGE_SIZE); \ .BTF : AT(ADDR(.BTF) - LOAD_OFFSET) { \ BOUNDED_SECTION_BY(.BTF, _BTF) \ } \
. = ALIGN(4); \
. = ALIGN(PAGE_SIZE); \ .BTF_ids : AT(ADDR(.BTF_ids) - LOAD_OFFSET) { \ *(.BTF_ids) \ }
diff --git a/kernel/bpf/sysfs_btf.c b/kernel/bpf/sysfs_btf.c index 81d6cf90584a7157929c50f62a5c6862e7a3d081..37278d7f38ae72f2d7efcfa859e86aaf12e39a25 100644 --- a/kernel/bpf/sysfs_btf.c +++ b/kernel/bpf/sysfs_btf.c @@ -7,14 +7,51 @@ #include <linux/kobject.h> #include <linux/init.h> #include <linux/sysfs.h> +#include <linux/mm.h> +#include <linux/io.h> +#include <linux/btf.h>
/* See scripts/link-vmlinux.sh, gen_btf() func for details */ extern char __start_BTF[]; extern char __stop_BTF[];
+static int btf_sysfs_vmlinux_mmap(struct file *filp, struct kobject *kobj,
const struct bin_attribute *attr,
struct vm_area_struct *vma)
+{
unsigned long pages = PAGE_ALIGN(attr->size) >> PAGE_SHIFT;
size_t vm_size = vma->vm_end - vma->vm_start;
unsigned long addr = (unsigned long)attr->private;
int i, err = 0;
if (addr != (unsigned long)__start_BTF || !PAGE_ALIGNED(addr))
return -EINVAL;
if (vma->vm_pgoff)
return -EINVAL;
any particular reason to not allow vm_pgoff?
if (vma->vm_flags & (VM_WRITE | VM_EXEC | VM_MAYSHARE))
return -EACCES;
if (vm_size >> PAGE_SHIFT > pages)
() around shift operation, please, for those of us who haven't memorized the entire C operator precedence table ;)
return -EINVAL;
vm_flags_mod(vma, VM_DONTDUMP, VM_MAYEXEC | VM_MAYWRITE);
for (i = 0; i < pages && !err; i++, addr += PAGE_SIZE)
err = vm_insert_page(vma, vma->vm_start + i * PAGE_SIZE,
virt_to_page(addr));
if (err)
zap_vma_pages(vma);
it's certainly subjective, but I find this error handling with !err in for loop condition hard to follow. What's wrong with the arguably more straightforward (and as you can see I'm not a big fan of mutated addr but calculated vma->vm_start + i * PAGE_SIZE: pick one style and follow it for both entities?):
for (i = 0; i < pages; i++) { err = vm_insert_page(vma, vma->vm_start + i * PAGE_SIZE, virt_to_page(addr + i * PAGE_SIZE)); if (err) { zap_vma_pages(vma); return err; } }
return 0;
?
return err;
+}
static struct bin_attribute bin_attr_btf_vmlinux __ro_after_init = { .attr = { .name = "vmlinux", .mode = 0444, }, .read_new = sysfs_bin_attr_simple_read,
.mmap = btf_sysfs_vmlinux_mmap,
};
struct kobject *btf_kobj;
-- 2.49.0
On Tue, May 6, 2025 at 10:39 PM Andrii Nakryiko andrii.nakryiko@gmail.com wrote:
if (vma->vm_pgoff)
return -EINVAL;
any particular reason to not allow vm_pgoff?
Doesn't seem particularly useful because the header is at offset 0, and I don't trust myself to get the overflow checks done right.
it's certainly subjective, but I find this error handling with !err in for loop condition hard to follow. What's wrong with the arguably more straightforward (and as you can see I'm not a big fan of mutated addr but calculated vma->vm_start + i * PAGE_SIZE: pick one style and follow it for both entities?):
Yeah that's nicer, I was just going off of what Alexei proposed.
Hi Lorenz,
kernel test robot noticed the following build warnings:
[auto build test WARNING on 38d976c32d85ef12dcd2b8a231196f7049548477]
url: https://github.com/intel-lab-lkp/linux/commits/Lorenz-Bauer/btf-allow-mmap-o... base: 38d976c32d85ef12dcd2b8a231196f7049548477 patch link: https://lore.kernel.org/r/20250505-vmlinux-mmap-v3-1-5d53afa060e8%40isovalen... patch subject: [PATCH bpf-next v3 1/3] btf: allow mmap of vmlinux btf config: arc-randconfig-r073-20250508 (https://download.01.org/0day-ci/archive/20250509/202505091116.jHtyWJW4-lkp@i...) compiler: arc-linux-gcc (GCC) 12.4.0 reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20250509/202505091116.jHtyWJW4-lkp@i...)
If you fix the issue in a separate patch/commit (i.e. not just a new version of the same patch/commit), kindly add following tags | Reported-by: kernel test robot lkp@intel.com | Closes: https://lore.kernel.org/oe-kbuild-all/202505091116.jHtyWJW4-lkp@intel.com/
All warnings (new ones prefixed by >>):
In file included from arch/arc/include/asm/page.h:136, from arch/arc/include/asm/thread_info.h:16, from include/linux/thread_info.h:60, from include/asm-generic/preempt.h:5, from ./arch/arc/include/generated/asm/preempt.h:1, from include/linux/preempt.h:79, from include/linux/spinlock.h:56, from include/linux/mmzone.h:8, from include/linux/gfp.h:7, from include/linux/umh.h:4, from include/linux/kmod.h:9, from include/linux/module.h:17, from kernel/bpf/sysfs_btf.c:6: kernel/bpf/sysfs_btf.c: In function 'btf_sysfs_vmlinux_mmap':
kernel/bpf/sysfs_btf.c:43:51: warning: passing argument 1 of 'virt_to_pfn' makes pointer from integer without a cast [-Wint-conversion]
43 | virt_to_page(addr)); | ^~~~ | | | long unsigned int include/asm-generic/memory_model.h:18:46: note: in definition of macro '__pfn_to_page' 18 | #define __pfn_to_page(pfn) (mem_map + ((pfn) - ARCH_PFN_OFFSET)) | ^~~ kernel/bpf/sysfs_btf.c:43:38: note: in expansion of macro 'virt_to_page' 43 | virt_to_page(addr)); | ^~~~~~~~~~~~ arch/arc/include/asm/page.h:123:53: note: expected 'const void *' but argument is of type 'long unsigned int' 123 | static inline unsigned long virt_to_pfn(const void *kaddr) | ~~~~~~~~~~~~^~~~~
vim +/virt_to_pfn +43 kernel/bpf/sysfs_btf.c
17 18 static int btf_sysfs_vmlinux_mmap(struct file *filp, struct kobject *kobj, 19 const struct bin_attribute *attr, 20 struct vm_area_struct *vma) 21 { 22 unsigned long pages = PAGE_ALIGN(attr->size) >> PAGE_SHIFT; 23 size_t vm_size = vma->vm_end - vma->vm_start; 24 unsigned long addr = (unsigned long)attr->private; 25 int i, err = 0; 26 27 if (addr != (unsigned long)__start_BTF || !PAGE_ALIGNED(addr)) 28 return -EINVAL; 29 30 if (vma->vm_pgoff) 31 return -EINVAL; 32 33 if (vma->vm_flags & (VM_WRITE | VM_EXEC | VM_MAYSHARE)) 34 return -EACCES; 35 36 if (vm_size >> PAGE_SHIFT > pages) 37 return -EINVAL; 38 39 vm_flags_mod(vma, VM_DONTDUMP, VM_MAYEXEC | VM_MAYWRITE); 40 41 for (i = 0; i < pages && !err; i++, addr += PAGE_SIZE) 42 err = vm_insert_page(vma, vma->vm_start + i * PAGE_SIZE,
43 virt_to_page(addr));
44 45 if (err) 46 zap_vma_pages(vma); 47 48 return err; 49 } 50
Add a basic test for the ability to mmap /sys/kernel/btf/vmlinux. Since libbpf doesn't have an API to parse BTF from memory we do some basic sanity checks ourselves.
Signed-off-by: Lorenz Bauer lmb@isovalent.com --- tools/testing/selftests/bpf/prog_tests/btf_sysfs.c | 83 ++++++++++++++++++++++ 1 file changed, 83 insertions(+)
diff --git a/tools/testing/selftests/bpf/prog_tests/btf_sysfs.c b/tools/testing/selftests/bpf/prog_tests/btf_sysfs.c new file mode 100644 index 0000000000000000000000000000000000000000..3319cf758897d46cefa8ca25e16acb162f4e9889 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/btf_sysfs.c @@ -0,0 +1,83 @@ +// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause +/* Copyright (c) 2025 Isovalent */ + +#include <test_progs.h> +#include <bpf/btf.h> +#include <sys/stat.h> +#include <sys/mman.h> +#include <fcntl.h> +#include <unistd.h> + +static void test_btf_mmap_sysfs(const char *path, struct btf *base) +{ + struct stat st; + __u64 btf_size, end; + void *raw_data = NULL; + int fd = -1; + long page_size; + struct btf *btf = NULL; + + page_size = sysconf(_SC_PAGESIZE); + if (!ASSERT_GE(page_size, 0, "get_page_size")) + goto cleanup; + + if (!ASSERT_OK(stat(path, &st), "stat_btf")) + goto cleanup; + + btf_size = st.st_size; + end = (btf_size + page_size - 1) / page_size * page_size; + + fd = open(path, O_RDONLY); + if (!ASSERT_GE(fd, 0, "open_btf")) + goto cleanup; + + raw_data = mmap(NULL, btf_size, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0); + if (!ASSERT_EQ(raw_data, MAP_FAILED, "mmap_btf_writable")) + goto cleanup; + + raw_data = mmap(NULL, btf_size, PROT_READ, MAP_SHARED, fd, 0); + if (!ASSERT_EQ(raw_data, MAP_FAILED, "mmap_btf_shared")) + goto cleanup; + + raw_data = mmap(NULL, end + 1, PROT_READ, MAP_PRIVATE, fd, 0); + if (!ASSERT_EQ(raw_data, MAP_FAILED, "mmap_btf_invalid_size")) + goto cleanup; + + raw_data = mmap(NULL, end, PROT_READ, MAP_PRIVATE, fd, 0); + if (!ASSERT_NEQ(raw_data, MAP_FAILED, "mmap_btf")) + goto cleanup; + + if (!ASSERT_EQ(mprotect(raw_data, btf_size, PROT_READ | PROT_WRITE), -1, + "mprotect_writable")) + goto cleanup; + + if (!ASSERT_EQ(mprotect(raw_data, btf_size, PROT_READ | PROT_EXEC), -1, + "mprotect_executable")) + goto cleanup; + + /* Check padding is zeroed */ + for (int i = btf_size; i < end; i++) { + if (((__u8 *)raw_data)[i] != 0) { + PRINT_FAIL("tail of BTF is not zero at page offset %d\n", i); + goto cleanup; + } + } + + btf = btf__new_split(raw_data, btf_size, base); + if (!ASSERT_NEQ(btf, NULL, "parse_btf")) + goto cleanup; + +cleanup: + if (raw_data && raw_data != MAP_FAILED) + munmap(raw_data, btf_size); + if (btf) + btf__free(btf); + if (fd >= 0) + close(fd); +} + +void test_btf_sysfs(void) +{ + if (test__start_subtest("vmlinux")) + test_btf_mmap_sysfs("/sys/kernel/btf/vmlinux", NULL); +}
On Mon, May 5, 2025 at 11:39 AM Lorenz Bauer lmb@isovalent.com wrote:
Add a basic test for the ability to mmap /sys/kernel/btf/vmlinux. Since libbpf doesn't have an API to parse BTF from memory we do some basic sanity checks ourselves.
Signed-off-by: Lorenz Bauer lmb@isovalent.com
tools/testing/selftests/bpf/prog_tests/btf_sysfs.c | 83 ++++++++++++++++++++++ 1 file changed, 83 insertions(+)
diff --git a/tools/testing/selftests/bpf/prog_tests/btf_sysfs.c b/tools/testing/selftests/bpf/prog_tests/btf_sysfs.c new file mode 100644 index 0000000000000000000000000000000000000000..3319cf758897d46cefa8ca25e16acb162f4e9889 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/btf_sysfs.c @@ -0,0 +1,83 @@ +// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause +/* Copyright (c) 2025 Isovalent */
+#include <test_progs.h> +#include <bpf/btf.h> +#include <sys/stat.h> +#include <sys/mman.h> +#include <fcntl.h> +#include <unistd.h>
+static void test_btf_mmap_sysfs(const char *path, struct btf *base) +{
struct stat st;
__u64 btf_size, end;
void *raw_data = NULL;
int fd = -1;
long page_size;
struct btf *btf = NULL;
page_size = sysconf(_SC_PAGESIZE);
if (!ASSERT_GE(page_size, 0, "get_page_size"))
goto cleanup;
if (!ASSERT_OK(stat(path, &st), "stat_btf"))
goto cleanup;
btf_size = st.st_size;
end = (btf_size + page_size - 1) / page_size * page_size;
fd = open(path, O_RDONLY);
if (!ASSERT_GE(fd, 0, "open_btf"))
goto cleanup;
raw_data = mmap(NULL, btf_size, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
if (!ASSERT_EQ(raw_data, MAP_FAILED, "mmap_btf_writable"))
goto cleanup;
raw_data = mmap(NULL, btf_size, PROT_READ, MAP_SHARED, fd, 0);
if (!ASSERT_EQ(raw_data, MAP_FAILED, "mmap_btf_shared"))
goto cleanup;
raw_data = mmap(NULL, end + 1, PROT_READ, MAP_PRIVATE, fd, 0);
if (!ASSERT_EQ(raw_data, MAP_FAILED, "mmap_btf_invalid_size"))
goto cleanup;
raw_data = mmap(NULL, end, PROT_READ, MAP_PRIVATE, fd, 0);
if (!ASSERT_NEQ(raw_data, MAP_FAILED, "mmap_btf"))
ASSERT_OK_PTR()?
goto cleanup;
if (!ASSERT_EQ(mprotect(raw_data, btf_size, PROT_READ | PROT_WRITE), -1,
"mprotect_writable"))
goto cleanup;
if (!ASSERT_EQ(mprotect(raw_data, btf_size, PROT_READ | PROT_EXEC), -1,
"mprotect_executable"))
goto cleanup;
/* Check padding is zeroed */
for (int i = btf_size; i < end; i++) {
if (((__u8 *)raw_data)[i] != 0) {
PRINT_FAIL("tail of BTF is not zero at page offset %d\n", i);
goto cleanup;
}
}
btf = btf__new_split(raw_data, btf_size, base);
if (!ASSERT_NEQ(btf, NULL, "parse_btf"))
ASSERT_OK_PTR()
goto cleanup;
+cleanup:
if (raw_data && raw_data != MAP_FAILED)
munmap(raw_data, btf_size);
if (btf)
no need to check this, all libbpf destructor APIs deal with NULL correctly (ignoring them)
btf__free(btf);
if (fd >= 0)
close(fd);
+}
+void test_btf_sysfs(void) +{
if (test__start_subtest("vmlinux"))
test_btf_mmap_sysfs("/sys/kernel/btf/vmlinux", NULL);
Do you intend to add more subtests? if not, why even using a subtest structure
+}
-- 2.49.0
On Tue, May 6, 2025 at 10:39 PM Andrii Nakryiko andrii.nakryiko@gmail.com wrote:
raw_data = mmap(NULL, end, PROT_READ, MAP_PRIVATE, fd, 0);
if (!ASSERT_NEQ(raw_data, MAP_FAILED, "mmap_btf"))
ASSERT_OK_PTR()?
Don't think that mmap follows libbpf_get_error conventions? I'd keep it as it is.
btf = btf__new_split(raw_data, btf_size, base);
if (!ASSERT_NEQ(btf, NULL, "parse_btf"))
ASSERT_OK_PTR()
Ack.
Do you intend to add more subtests? if not, why even using a subtest structure
The original intention was to add kmod support, but that didn't pan out, see my discussion with Alexei. I can drop the subtest if you want, but I'd probably keep the helper as it is.
On Wed, May 7, 2025 at 2:14 AM Lorenz Bauer lmb@isovalent.com wrote:
On Tue, May 6, 2025 at 10:39 PM Andrii Nakryiko andrii.nakryiko@gmail.com wrote:
raw_data = mmap(NULL, end, PROT_READ, MAP_PRIVATE, fd, 0);
if (!ASSERT_NEQ(raw_data, MAP_FAILED, "mmap_btf"))
ASSERT_OK_PTR()?
Don't think that mmap follows libbpf_get_error conventions? I'd keep it as it is.
ASSERT_OK_PTR() isn't libbpf specific (and libbpf is actually returning a NULL or valid pointer for all public APIs, since libbpf 1.0). But if you look at the implementation, "an OK" pointer is a non-NULL pointer that is also not a small negative value. NULL is a bad pointer, -1 (MAP_FAILED) is a bad pointer, and so on. So it's a pretty universal check for anything pointer-related. Please do use OK_PTR, it's semantically better in tests
btf = btf__new_split(raw_data, btf_size, base);
if (!ASSERT_NEQ(btf, NULL, "parse_btf"))
ASSERT_OK_PTR()
Ack.
Do you intend to add more subtests? if not, why even using a subtest structure
The original intention was to add kmod support, but that didn't pan out, see my discussion with Alexei. I can drop the subtest if you want, but I'd probably keep the helper as it is.
yeah, let's drop the subtest, it's a bit easier to work with non-subtest tests, IMO
Teach libbpf to use mmap when parsing vmlinux BTF from /sys. We don't apply this to fall-back paths on the regular file system because there is no way to ensure that modifications underlying the MAP_PRIVATE mapping are not visible to the process.
Signed-off-by: Lorenz Bauer lmb@isovalent.com --- tools/lib/bpf/btf.c | 83 ++++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 72 insertions(+), 11 deletions(-)
diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c index b7513d4cce55b263310c341bc254df6364e829d9..3006c1ebb97ed899eb519b10927491d87ccdaca5 100644 --- a/tools/lib/bpf/btf.c +++ b/tools/lib/bpf/btf.c @@ -12,6 +12,7 @@ #include <sys/utsname.h> #include <sys/param.h> #include <sys/stat.h> +#include <sys/mman.h> #include <linux/kernel.h> #include <linux/err.h> #include <linux/btf.h> @@ -120,6 +121,9 @@ struct btf { /* whether base_btf should be freed in btf_free for this instance */ bool owns_base;
+ /* whether raw_data is a (read-only) mmap */ + bool raw_data_is_mmap; + /* BTF object FD, if loaded into kernel */ int fd;
@@ -951,6 +955,17 @@ static bool btf_is_modifiable(const struct btf *btf) return (void *)btf->hdr != btf->raw_data; }
+static void btf_free_raw_data(struct btf *btf) +{ + if (btf->raw_data_is_mmap) { + munmap(btf->raw_data, btf->raw_size); + btf->raw_data_is_mmap = false; + } else { + free(btf->raw_data); + } + btf->raw_data = NULL; +} + void btf__free(struct btf *btf) { if (IS_ERR_OR_NULL(btf)) @@ -970,7 +985,7 @@ void btf__free(struct btf *btf) free(btf->types_data); strset__free(btf->strs_set); } - free(btf->raw_data); + btf_free_raw_data(btf); free(btf->raw_data_swapped); free(btf->type_offs); if (btf->owns_base) @@ -1030,7 +1045,7 @@ struct btf *btf__new_empty_split(struct btf *base_btf) return libbpf_ptr(btf_new_empty(base_btf)); }
-static struct btf *btf_new(const void *data, __u32 size, struct btf *base_btf) +static struct btf *btf_new_no_copy(void *data, __u32 size, struct btf *base_btf) { struct btf *btf; int err; @@ -1050,12 +1065,7 @@ static struct btf *btf_new(const void *data, __u32 size, struct btf *base_btf) btf->start_str_off = base_btf->hdr->str_len; }
- btf->raw_data = malloc(size); - if (!btf->raw_data) { - err = -ENOMEM; - goto done; - } - memcpy(btf->raw_data, data, size); + btf->raw_data = data; btf->raw_size = size;
btf->hdr = btf->raw_data; @@ -1081,6 +1091,24 @@ static struct btf *btf_new(const void *data, __u32 size, struct btf *base_btf) return btf; }
+static struct btf *btf_new(const void *data, __u32 size, struct btf *base_btf) +{ + struct btf *btf; + void *raw_data; + + raw_data = malloc(size); + if (!raw_data) + return ERR_PTR(-ENOMEM); + + memcpy(raw_data, data, size); + + btf = btf_new_no_copy(raw_data, size, base_btf); + if (IS_ERR(btf)) + free(raw_data); + + return btf; +} + struct btf *btf__new(const void *data, __u32 size) { return libbpf_ptr(btf_new(data, size, NULL)); @@ -1354,6 +1382,37 @@ struct btf *btf__parse_raw_split(const char *path, struct btf *base_btf) return libbpf_ptr(btf_parse_raw(path, base_btf)); }
+static struct btf *btf_parse_raw_mmap(const char *path, struct btf *base_btf) +{ + struct stat st; + void *data; + struct btf *btf; + int fd; + + fd = open(path, O_RDONLY); + if (fd < 0) + return libbpf_err_ptr(-errno); + + if (fstat(fd, &st) < 0) { + close(fd); + return libbpf_err_ptr(-errno); + } + + data = mmap(NULL, st.st_size, PROT_READ, MAP_PRIVATE, fd, 0); + close(fd); + + if (data == MAP_FAILED) + return NULL; + + btf = btf_new_no_copy(data, st.st_size, base_btf); + if (!btf) + munmap(data, st.st_size); + else + btf->raw_data_is_mmap = true; + + return btf; +} + static struct btf *btf_parse(const char *path, struct btf *base_btf, struct btf_ext **btf_ext) { struct btf *btf; @@ -1659,8 +1718,7 @@ struct btf *btf__load_from_kernel_by_id(__u32 id) static void btf_invalidate_raw_data(struct btf *btf) { if (btf->raw_data) { - free(btf->raw_data); - btf->raw_data = NULL; + btf_free_raw_data(btf); } if (btf->raw_data_swapped) { free(btf->raw_data_swapped); @@ -5290,7 +5348,10 @@ struct btf *btf__load_vmlinux_btf(void) pr_warn("kernel BTF is missing at '%s', was CONFIG_DEBUG_INFO_BTF enabled?\n", sysfs_btf_path); } else { - btf = btf__parse(sysfs_btf_path, NULL); + btf = btf_parse_raw_mmap(sysfs_btf_path, NULL); + if (IS_ERR_OR_NULL(btf)) + btf = btf__parse(sysfs_btf_path, NULL); + if (!btf) { err = -errno; pr_warn("failed to read kernel BTF from '%s': %s\n",
On Mon, May 5, 2025 at 11:39 AM Lorenz Bauer lmb@isovalent.com wrote:
Teach libbpf to use mmap when parsing vmlinux BTF from /sys. We don't apply this to fall-back paths on the regular file system because there is no way to ensure that modifications underlying the MAP_PRIVATE mapping are not visible to the process.
Signed-off-by: Lorenz Bauer lmb@isovalent.com
tools/lib/bpf/btf.c | 83 ++++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 72 insertions(+), 11 deletions(-)
[...]
@@ -1030,7 +1045,7 @@ struct btf *btf__new_empty_split(struct btf *base_btf) return libbpf_ptr(btf_new_empty(base_btf)); }
-static struct btf *btf_new(const void *data, __u32 size, struct btf *base_btf) +static struct btf *btf_new_no_copy(void *data, __u32 size, struct btf *base_btf) { struct btf *btf; int err; @@ -1050,12 +1065,7 @@ static struct btf *btf_new(const void *data, __u32 size, struct btf *base_btf) btf->start_str_off = base_btf->hdr->str_len; }
btf->raw_data = malloc(size);
if (!btf->raw_data) {
err = -ENOMEM;
goto done;
}
memcpy(btf->raw_data, data, size);
btf->raw_data = data; btf->raw_size = size; btf->hdr = btf->raw_data;
@@ -1081,6 +1091,24 @@ static struct btf *btf_new(const void *data, __u32 size, struct btf *base_btf) return btf; }
+static struct btf *btf_new(const void *data, __u32 size, struct btf *base_btf)
btf_new() is internal, so I'd extend existing btf_new() with `bool is_mmap` and not add btf_new_no_copy(), I think it's simpler. Eventually we can turn is_mmap into some sort of flags, if we need more tuning of data ownership behavior
+{
struct btf *btf;
void *raw_data;
raw_data = malloc(size);
if (!raw_data)
return ERR_PTR(-ENOMEM);
memcpy(raw_data, data, size);
btf = btf_new_no_copy(raw_data, size, base_btf);
if (IS_ERR(btf))
free(raw_data);
return btf;
+}
struct btf *btf__new(const void *data, __u32 size) { return libbpf_ptr(btf_new(data, size, NULL)); @@ -1354,6 +1382,37 @@ struct btf *btf__parse_raw_split(const char *path, struct btf *base_btf) return libbpf_ptr(btf_parse_raw(path, base_btf)); }
+static struct btf *btf_parse_raw_mmap(const char *path, struct btf *base_btf) +{
struct stat st;
void *data;
struct btf *btf;
int fd;
fd = open(path, O_RDONLY);
if (fd < 0)
return libbpf_err_ptr(-errno);
if (fstat(fd, &st) < 0) {
close(fd);
close() can clobber errno, so save `err = -errno` before it
return libbpf_err_ptr(-errno);
}
data = mmap(NULL, st.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
close(fd);
same, errno clobbering danger
if (data == MAP_FAILED)
return NULL;
btf = btf_new_no_copy(data, st.st_size, base_btf);
if (!btf)
btf_new_no_copy() is returning ERR_PTR() on error, no?
pw-bot: cr
munmap(data, st.st_size);
else
btf->raw_data_is_mmap = true;
return btf;
+}
static struct btf *btf_parse(const char *path, struct btf *base_btf, struct btf_ext **btf_ext) { struct btf *btf; @@ -1659,8 +1718,7 @@ struct btf *btf__load_from_kernel_by_id(__u32 id) static void btf_invalidate_raw_data(struct btf *btf) { if (btf->raw_data) {
free(btf->raw_data);
btf->raw_data = NULL;
btf_free_raw_data(btf); } if (btf->raw_data_swapped) { free(btf->raw_data_swapped);
@@ -5290,7 +5348,10 @@ struct btf *btf__load_vmlinux_btf(void) pr_warn("kernel BTF is missing at '%s', was CONFIG_DEBUG_INFO_BTF enabled?\n", sysfs_btf_path); } else {
btf = btf__parse(sysfs_btf_path, NULL);
btf = btf_parse_raw_mmap(sysfs_btf_path, NULL);
if (IS_ERR_OR_NULL(btf))
btf = btf__parse(sysfs_btf_path, NULL);
if (!btf) { err = -errno; pr_warn("failed to read kernel BTF from '%s': %s\n",
-- 2.49.0
linux-kselftest-mirror@lists.linaro.org