Right now there is support for prog produce / userspace consume and userspace produce / prog consume. But it is also useful to have prog produce / prog consume.
For example, we want to track the latency overhead of cpumap in production. Since we need to store enqueue timestamps somewhere and cpumap is MPSC, we need an MPSC data structure to shadow cpumap. BPF ringbuf is such a data structure. Rather than reimplement (possibly poorly) a custom ring buffer in BPF, extend the existing interface to allow the final quadrant of ringbuf use cases to be filled. Note that we ignore the userspace produce / userspace consume case - there is no need to involve the kernel for that.
Signed-off-by: Daniel Xu dxu@dxuuu.xyz
---
 kernel/bpf/verifier.c                         |  6 +-
 tools/testing/selftests/bpf/Makefile          |  3 +-
 .../selftests/bpf/prog_tests/ringbuf.c        | 50 +++++++++++++++
 .../bpf/progs/test_ringbuf_bpf_to_bpf.c       | 64 +++++++++++++++++++
 4 files changed, 120 insertions(+), 3 deletions(-)
 create mode 100644 tools/testing/selftests/bpf/progs/test_ringbuf_bpf_to_bpf.c
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 53d0556fbbf3..56bfe559f228 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -9142,7 +9142,8 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
 		    func_id != BPF_FUNC_ringbuf_query &&
 		    func_id != BPF_FUNC_ringbuf_reserve_dynptr &&
 		    func_id != BPF_FUNC_ringbuf_submit_dynptr &&
-		    func_id != BPF_FUNC_ringbuf_discard_dynptr)
+		    func_id != BPF_FUNC_ringbuf_discard_dynptr &&
+		    func_id != BPF_FUNC_user_ringbuf_drain)
 			goto error;
 		break;
 	case BPF_MAP_TYPE_USER_RINGBUF:
@@ -9276,7 +9277,8 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
 			goto error;
 		break;
 	case BPF_FUNC_user_ringbuf_drain:
-		if (map->map_type != BPF_MAP_TYPE_USER_RINGBUF)
+		if (map->map_type != BPF_MAP_TYPE_USER_RINGBUF &&
+		    map->map_type != BPF_MAP_TYPE_RINGBUF)
 			goto error;
 		break;
 	case BPF_FUNC_get_stackid:
diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index 9905e3739dd0..233109843d4d 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -503,7 +503,8 @@ LINKED_SKELS := test_static_linked.skel.h linked_funcs.skel.h \
 LSKELS := fentry_test.c fexit_test.c fexit_sleep.c atomics.c \
 	trace_printk.c trace_vprintk.c map_ptr_kern.c \
 	core_kern.c core_kern_overflow.c test_ringbuf.c \
-	test_ringbuf_n.c test_ringbuf_map_key.c test_ringbuf_write.c
+	test_ringbuf_n.c test_ringbuf_map_key.c test_ringbuf_write.c \
+	test_ringbuf_bpf_to_bpf.c
 
 # Generate both light skeleton and libbpf skeleton for these
 LSKELS_EXTRA := test_ksyms_module.c test_ksyms_weak.c kfunc_call_test.c \
diff --git a/tools/testing/selftests/bpf/prog_tests/ringbuf.c b/tools/testing/selftests/bpf/prog_tests/ringbuf.c
index da430df45aa4..3e7955573ac5 100644
--- a/tools/testing/selftests/bpf/prog_tests/ringbuf.c
+++ b/tools/testing/selftests/bpf/prog_tests/ringbuf.c
@@ -17,6 +17,7 @@
 #include "test_ringbuf_n.lskel.h"
 #include "test_ringbuf_map_key.lskel.h"
 #include "test_ringbuf_write.lskel.h"
+#include "test_ringbuf_bpf_to_bpf.lskel.h"
 
 #define EDONE 7777
 
@@ -497,6 +498,53 @@ static void ringbuf_map_key_subtest(void)
 	test_ringbuf_map_key_lskel__destroy(skel_map_key);
 }
 
+static void ringbuf_bpf_to_bpf_subtest(void)
+{
+	struct test_ringbuf_bpf_to_bpf_lskel *skel;
+	int err, i;
+
+	skel = test_ringbuf_bpf_to_bpf_lskel__open();
+	if (!ASSERT_OK_PTR(skel, "test_ringbuf_bpf_to_bpf_lskel__open"))
+		return;
+
+	skel->maps.ringbuf.max_entries = getpagesize();
+	skel->bss->pid = getpid();
+
+	err = test_ringbuf_bpf_to_bpf_lskel__load(skel);
+	if (!ASSERT_OK(err, "test_ringbuf_bpf_to_bpf_lskel__load"))
+		goto cleanup;
+
+	ringbuf = ring_buffer__new(skel->maps.ringbuf.map_fd, NULL, NULL, NULL);
+	if (!ASSERT_OK_PTR(ringbuf, "ring_buffer__new"))
+		goto cleanup;
+
+	err = test_ringbuf_bpf_to_bpf_lskel__attach(skel);
+	if (!ASSERT_OK(err, "test_ringbuf_bpf_to_bpf_lskel__attach"))
+		goto cleanup_ringbuf;
+
+	/* Produce N_SAMPLES samples in the ring buffer by calling getpid() */
+	skel->bss->value = SAMPLE_VALUE;
+	for (i = 0; i < N_SAMPLES; i++)
+		syscall(__NR_getpgid);
+
+	/* Trigger bpf-side consumption */
+	syscall(__NR_prctl);
+
+	/* Check correct number samples were consumed */
+	if (!ASSERT_EQ(skel->bss->round_tripped, N_SAMPLES, "round_tripped"))
+		goto cleanup_ringbuf;
+
+	/* Check all samples were consumed */
+	err = ring_buffer__consume(ringbuf);
+	if (!ASSERT_EQ(err, 0, "rb_consume"))
+		goto cleanup_ringbuf;
+
+cleanup_ringbuf:
+	ring_buffer__free(ringbuf);
+cleanup:
+	test_ringbuf_bpf_to_bpf_lskel__destroy(skel);
+}
+
 void test_ringbuf(void)
 {
 	if (test__start_subtest("ringbuf"))
@@ -507,4 +555,6 @@ void test_ringbuf(void)
 		ringbuf_map_key_subtest();
 	if (test__start_subtest("ringbuf_write"))
 		ringbuf_write_subtest();
+	if (test__start_subtest("ringbuf_bpf_to_bpf"))
+		ringbuf_bpf_to_bpf_subtest();
 }
diff --git a/tools/testing/selftests/bpf/progs/test_ringbuf_bpf_to_bpf.c b/tools/testing/selftests/bpf/progs/test_ringbuf_bpf_to_bpf.c
new file mode 100644
index 000000000000..378154922024
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_ringbuf_bpf_to_bpf.c
@@ -0,0 +1,64 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/bpf.h>
+#include <unistd.h>
+#include <bpf/bpf_helpers.h>
+#include "bpf_misc.h"
+
+struct sample {
+	long value;
+};
+
+struct {
+	__uint(type, BPF_MAP_TYPE_RINGBUF);
+} ringbuf SEC(".maps");
+
+int pid = 0;
+long value = 0;
+int round_tripped = 0;
+
+SEC("fentry/" SYS_PREFIX "sys_getpgid")
+int test_ringbuf_bpf_to_bpf_produce(void *ctx)
+{
+	int cur_pid = bpf_get_current_pid_tgid() >> 32;
+	struct sample *sample;
+
+	if (cur_pid != pid)
+		return 0;
+
+	sample = bpf_ringbuf_reserve(&ringbuf, sizeof(*sample), 0);
+	if (!sample)
+		return 0;
+	sample->value = value;
+
+	bpf_ringbuf_submit(sample, 0);
+	return 0;
+}
+
+static long consume_cb(struct bpf_dynptr *dynptr, void *context)
+{
+	struct sample *sample = NULL;
+
+	sample = bpf_dynptr_data(dynptr, 0, sizeof(*sample));
+	if (!sample)
+		return 0;
+
+	if (sample->value == value)
+		round_tripped++;
+
+	return 0;
+}
+
+SEC("fentry/" SYS_PREFIX "sys_prctl")
+int test_ringbuf_bpf_to_bpf_consume(void *ctx)
+{
+	int cur_pid = bpf_get_current_pid_tgid() >> 32;
+
+	if (cur_pid != pid)
+		return 0;
+
+	bpf_user_ringbuf_drain(&ringbuf, consume_cb, NULL, 0);
+	return 0;
+}
+
+char _license[] SEC("license") = "GPL";
On Mon, Sep 9, 2024 at 5:55 PM Daniel Xu dxu@dxuuu.xyz wrote:
Right now there exists prog produce / userspace consume and userspace produce / prog consume support. But it is also useful to have prog produce / prog consume.
For example, we want to track the latency overhead of cpumap in production. Since we need to store enqueue timestamps somewhere and cpumap is MPSC, we need an MPSC data structure to shadow cpumap. BPF ringbuf is such a data structure. Rather than reimplement (possibly poorly) a custom ringbuffer in BPF, extend the existing interface to allow the final quadrant of ringbuf usecases to be filled. Note we ignore userspace to userspace use case - there is no need to involve kernel for that.
Signed-off-by: Daniel Xu dxu@dxuuu.xyz
kernel/bpf/verifier.c | 6 +- tools/testing/selftests/bpf/Makefile | 3 +- .../selftests/bpf/prog_tests/ringbuf.c | 50 +++++++++++++++ .../bpf/progs/test_ringbuf_bpf_to_bpf.c | 64 +++++++++++++++++++ 4 files changed, 120 insertions(+), 3 deletions(-) create mode 100644 tools/testing/selftests/bpf/progs/test_ringbuf_bpf_to_bpf.c
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 53d0556fbbf3..56bfe559f228 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -9142,7 +9142,8 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, func_id != BPF_FUNC_ringbuf_query && func_id != BPF_FUNC_ringbuf_reserve_dynptr && func_id != BPF_FUNC_ringbuf_submit_dynptr &&
func_id != BPF_FUNC_ringbuf_discard_dynptr)
func_id != BPF_FUNC_ringbuf_discard_dynptr &&
func_id != BPF_FUNC_user_ringbuf_drain) goto error; break; case BPF_MAP_TYPE_USER_RINGBUF:
@@ -9276,7 +9277,8 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, goto error; break; case BPF_FUNC_user_ringbuf_drain:
if (map->map_type != BPF_MAP_TYPE_USER_RINGBUF)
if (map->map_type != BPF_MAP_TYPE_USER_RINGBUF &&
map->map_type != BPF_MAP_TYPE_RINGBUF) goto error;
I think it should work.
Andrii,
do you see any issues with such use?
break; case BPF_FUNC_get_stackid:
diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 9905e3739dd0..233109843d4d 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -503,7 +503,8 @@ LINKED_SKELS := test_static_linked.skel.h linked_funcs.skel.h \ LSKELS := fentry_test.c fexit_test.c fexit_sleep.c atomics.c \ trace_printk.c trace_vprintk.c map_ptr_kern.c \ core_kern.c core_kern_overflow.c test_ringbuf.c \
test_ringbuf_n.c test_ringbuf_map_key.c test_ringbuf_write.c
test_ringbuf_n.c test_ringbuf_map_key.c test_ringbuf_write.c \
test_ringbuf_bpf_to_bpf.c
Do you need it to be lskel?
Regular skels are easier to debug.
Also pls split selftest into a separate patch.
# Generate both light skeleton and libbpf skeleton for these LSKELS_EXTRA := test_ksyms_module.c test_ksyms_weak.c kfunc_call_test.c \ diff --git a/tools/testing/selftests/bpf/prog_tests/ringbuf.c b/tools/testing/selftests/bpf/prog_tests/ringbuf.c index da430df45aa4..3e7955573ac5 100644 --- a/tools/testing/selftests/bpf/prog_tests/ringbuf.c +++ b/tools/testing/selftests/bpf/prog_tests/ringbuf.c @@ -17,6 +17,7 @@ #include "test_ringbuf_n.lskel.h" #include "test_ringbuf_map_key.lskel.h" #include "test_ringbuf_write.lskel.h" +#include "test_ringbuf_bpf_to_bpf.lskel.h"
#define EDONE 7777
@@ -497,6 +498,53 @@ static void ringbuf_map_key_subtest(void) test_ringbuf_map_key_lskel__destroy(skel_map_key); }
+static void ringbuf_bpf_to_bpf_subtest(void) +{
struct test_ringbuf_bpf_to_bpf_lskel *skel;
int err, i;
skel = test_ringbuf_bpf_to_bpf_lskel__open();
if (!ASSERT_OK_PTR(skel, "test_ringbuf_bpf_to_bpf_lskel__open"))
return;
skel->maps.ringbuf.max_entries = getpagesize();
skel->bss->pid = getpid();
err = test_ringbuf_bpf_to_bpf_lskel__load(skel);
if (!ASSERT_OK(err, "test_ringbuf_bpf_to_bpf_lskel__load"))
goto cleanup;
ringbuf = ring_buffer__new(skel->maps.ringbuf.map_fd, NULL, NULL, NULL);
if (!ASSERT_OK_PTR(ringbuf, "ring_buffer__new"))
goto cleanup;
err = test_ringbuf_bpf_to_bpf_lskel__attach(skel);
if (!ASSERT_OK(err, "test_ringbuf_bpf_to_bpf_lskel__attach"))
goto cleanup_ringbuf;
/* Produce N_SAMPLES samples in the ring buffer by calling getpid() */
skel->bss->value = SAMPLE_VALUE;
for (i = 0; i < N_SAMPLES; i++)
syscall(__NR_getpgid);
/* Trigger bpf-side consumption */
syscall(__NR_prctl);
This might conflict with other selftests that run in parallel.
Just load the skel and bpf_prog_run(prog_fd). No need to attach anywhere in the kernel.
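[Editorial note] A rough sketch of the userspace side of that suggestion, assuming the two programs are switched to hooks that BPF_PROG_TEST_RUN can drive directly and that the light skeleton exposes a prog_fd per program (as other lskel-based selftests rely on); names mirror this patch and are otherwise illustrative:

	/* Illustrative only: trigger the producer/consumer progs via test_run
	 * instead of attaching them to syscalls. Assumes the progs were moved
	 * to a program type that bpf_prog_test_run_opts() can execute directly.
	 */
	LIBBPF_OPTS(bpf_test_run_opts, topts);
	int err, i;

	for (i = 0; i < N_SAMPLES; i++) {
		err = bpf_prog_test_run_opts(skel->progs.test_ringbuf_bpf_to_bpf_produce.prog_fd,
					     &topts);
		if (!ASSERT_OK(err, "produce_test_run"))
			goto cleanup_ringbuf;
	}

	err = bpf_prog_test_run_opts(skel->progs.test_ringbuf_bpf_to_bpf_consume.prog_fd,
				     &topts);
	if (!ASSERT_OK(err, "consume_test_run"))
		goto cleanup_ringbuf;

	ASSERT_EQ(skel->bss->round_tripped, N_SAMPLES, "round_tripped");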
pw-bot: cr
On Tue, Sep 10, 2024 at 11:36 AM Alexei Starovoitov alexei.starovoitov@gmail.com wrote:
On Mon, Sep 9, 2024 at 5:55 PM Daniel Xu dxu@dxuuu.xyz wrote:
Right now there exists prog produce / userspace consume and userspace produce / prog consume support. But it is also useful to have prog produce / prog consume.
For example, we want to track the latency overhead of cpumap in production. Since we need to store enqueue timestamps somewhere and cpumap is MPSC, we need an MPSC data structure to shadow cpumap. BPF ringbuf is such a data structure. Rather than reimplement (possibly poorly) a custom ringbuffer in BPF, extend the existing interface to allow the final quadrant of ringbuf usecases to be filled. Note we ignore userspace to userspace use case - there is no need to involve kernel for that.
Signed-off-by: Daniel Xu dxu@dxuuu.xyz
kernel/bpf/verifier.c | 6 +- tools/testing/selftests/bpf/Makefile | 3 +- .../selftests/bpf/prog_tests/ringbuf.c | 50 +++++++++++++++ .../bpf/progs/test_ringbuf_bpf_to_bpf.c | 64 +++++++++++++++++++ 4 files changed, 120 insertions(+), 3 deletions(-) create mode 100644 tools/testing/selftests/bpf/progs/test_ringbuf_bpf_to_bpf.c
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 53d0556fbbf3..56bfe559f228 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -9142,7 +9142,8 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, func_id != BPF_FUNC_ringbuf_query && func_id != BPF_FUNC_ringbuf_reserve_dynptr && func_id != BPF_FUNC_ringbuf_submit_dynptr &&
func_id != BPF_FUNC_ringbuf_discard_dynptr)
func_id != BPF_FUNC_ringbuf_discard_dynptr &&
func_id != BPF_FUNC_user_ringbuf_drain) goto error; break; case BPF_MAP_TYPE_USER_RINGBUF:
@@ -9276,7 +9277,8 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, goto error; break; case BPF_FUNC_user_ringbuf_drain:
if (map->map_type != BPF_MAP_TYPE_USER_RINGBUF)
if (map->map_type != BPF_MAP_TYPE_USER_RINGBUF &&
map->map_type != BPF_MAP_TYPE_RINGBUF) goto error;
I think it should work.
Andrii,
do you see any issues with such use?
Not from a quick glance. Both ringbufs have the same memory layout, so user_ringbuf_drain() should probably work fine for normal BPF ringbuf (and either way bpf_user_ringbuf_drain() has to protect from malicious user space, so its code is pretty unassuming).
We should make it very explicit, though, that the user is responsible for making sure that bpf_user_ringbuf_drain() will not be called simultaneously in two threads, kernel or user space.
Also, Daniel, can you please make sure that the dynptr we return for each sample is read-only? We shouldn't give the consumer BPF program the ability to corrupt ringbuf record headers (accidentally or otherwise).
And as a thought exercise. I wonder what would it take to have an open-coded iterator returning these read-only dynptrs for each consumed record? Maybe we already have all the pieces together. So consider looking into that as well.
P.S. And yeah "user_" part in helper name is kind of unfortunate given it will work for both ringbufs. Can/should we add some sort of alias for this helper so it can be used with both bpf_user_ringbuf_drain() and bpf_ringbuf_drain() names?
break; case BPF_FUNC_get_stackid:
diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 9905e3739dd0..233109843d4d 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -503,7 +503,8 @@ LINKED_SKELS := test_static_linked.skel.h linked_funcs.skel.h \ LSKELS := fentry_test.c fexit_test.c fexit_sleep.c atomics.c \ trace_printk.c trace_vprintk.c map_ptr_kern.c \ core_kern.c core_kern_overflow.c test_ringbuf.c \
test_ringbuf_n.c test_ringbuf_map_key.c test_ringbuf_write.c
test_ringbuf_n.c test_ringbuf_map_key.c test_ringbuf_write.c \
test_ringbuf_bpf_to_bpf.c
[...]
On Tue, Sep 10, 2024 at 01:41:41PM GMT, Andrii Nakryiko wrote:
On Tue, Sep 10, 2024 at 11:36 AM Alexei Starovoitov alexei.starovoitov@gmail.com wrote:
On Mon, Sep 9, 2024 at 5:55 PM Daniel Xu dxu@dxuuu.xyz wrote:
Right now there exists prog produce / userspace consume and userspace produce / prog consume support. But it is also useful to have prog produce / prog consume.
For example, we want to track the latency overhead of cpumap in production. Since we need to store enqueue timestamps somewhere and cpumap is MPSC, we need an MPSC data structure to shadow cpumap. BPF ringbuf is such a data structure. Rather than reimplement (possibly poorly) a custom ringbuffer in BPF, extend the existing interface to allow the final quadrant of ringbuf usecases to be filled. Note we ignore userspace to userspace use case - there is no need to involve kernel for that.
Signed-off-by: Daniel Xu dxu@dxuuu.xyz
kernel/bpf/verifier.c | 6 +- tools/testing/selftests/bpf/Makefile | 3 +- .../selftests/bpf/prog_tests/ringbuf.c | 50 +++++++++++++++ .../bpf/progs/test_ringbuf_bpf_to_bpf.c | 64 +++++++++++++++++++ 4 files changed, 120 insertions(+), 3 deletions(-) create mode 100644 tools/testing/selftests/bpf/progs/test_ringbuf_bpf_to_bpf.c
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 53d0556fbbf3..56bfe559f228 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -9142,7 +9142,8 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, func_id != BPF_FUNC_ringbuf_query && func_id != BPF_FUNC_ringbuf_reserve_dynptr && func_id != BPF_FUNC_ringbuf_submit_dynptr &&
func_id != BPF_FUNC_ringbuf_discard_dynptr)
func_id != BPF_FUNC_ringbuf_discard_dynptr &&
func_id != BPF_FUNC_user_ringbuf_drain) goto error; break; case BPF_MAP_TYPE_USER_RINGBUF:
@@ -9276,7 +9277,8 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, goto error; break; case BPF_FUNC_user_ringbuf_drain:
if (map->map_type != BPF_MAP_TYPE_USER_RINGBUF)
if (map->map_type != BPF_MAP_TYPE_USER_RINGBUF &&
map->map_type != BPF_MAP_TYPE_RINGBUF) goto error;
I think it should work.
Andrii,
do you see any issues with such use?
Not from a quick glance. Both ringbufs have the same memory layout, so user_ringbuf_drain() should probably work fine for normal BPF ringbuf (and either way bpf_user_ringbuf_drain() has to protect from malicious user space, so its code is pretty unassuming).
We should make it very explicit, though, that the user is responsible for making sure that bpf_user_ringbuf_drain() will not be called simultaneously in two threads, kernel or user space.
I see an atomic_try_cmpxchg() protecting the drain. So it should be safe, right? What are they supposed to expect?
Also, Daniel, can you please make sure that dynptr we return for each sample is read-only? We shouldn't let consumer BPF program ability to corrupt ringbuf record headers (accidentally or otherwise).
Sure.
And as a thought exercise. I wonder what would it take to have an open-coded iterator returning these read-only dynptrs for each consumed record? Maybe we already have all the pieces together. So consider looking into that as well.
P.S. And yeah "user_" part in helper name is kind of unfortunate given it will work for both ringbufs. Can/should we add some sort of alias for this helper so it can be used with both bpf_user_ringbuf_drain() and bpf_ringbuf_drain() names?
You mean register a new helper that shares the impl? Easy enough, but I thought we didn't want to add more uapi helpers.
Thanks, Daniel
On Tue, Sep 10, 2024, at 2:07 PM, Daniel Xu wrote:
On Tue, Sep 10, 2024 at 01:41:41PM GMT, Andrii Nakryiko wrote:
On Tue, Sep 10, 2024 at 11:36 AM Alexei Starovoitov
[...]
Also, Daniel, can you please make sure that dynptr we return for each sample is read-only? We shouldn't let consumer BPF program ability to corrupt ringbuf record headers (accidentally or otherwise).
Sure.
So the sample is not read-only. But I think prog is prevented from messing with header regardless.
__bpf_user_ringbuf_peek() returns sample past the header:
*sample = (void *)((uintptr_t)rb->data + (uintptr_t)((cons_pos + BPF_RINGBUF_HDR_SZ) & rb->mask));
dynptr is initialized with the above ptr:
bpf_dynptr_init(&dynptr, sample, BPF_DYNPTR_TYPE_LOCAL, 0, size);
So I don't think there's a way for the prog to access the header thru the dynptr.
Thanks, Daniel
On Tue, Sep 10, 2024 at 3:16 PM Daniel Xu dxu@dxuuu.xyz wrote:
On Tue, Sep 10, 2024, at 2:07 PM, Daniel Xu wrote:
On Tue, Sep 10, 2024 at 01:41:41PM GMT, Andrii Nakryiko wrote:
On Tue, Sep 10, 2024 at 11:36 AM Alexei Starovoitov
[...]
Also, Daniel, can you please make sure that dynptr we return for each sample is read-only? We shouldn't let consumer BPF program ability to corrupt ringbuf record headers (accidentally or otherwise).
Sure.
So the sample is not read-only. But I think prog is prevented from messing with header regardless.
__bpf_user_ringbuf_peek() returns sample past the header:
*sample = (void *)((uintptr_t)rb->data + (uintptr_t)((cons_pos + BPF_RINGBUF_HDR_SZ) & rb->mask));
dynptr is initialized with the above ptr:
bpf_dynptr_init(&dynptr, sample, BPF_DYNPTR_TYPE_LOCAL, 0, size);
So I don't think there's a way for the prog to access the header thru the dynptr.
By "header" I mean 8 bytes that precede each submitted ringbuf record. That header is part of ringbuf data area. Given user space can set consumer_pos to arbitrary value, kernel can return arbitrary part of ringbuf data area, including that 8 byte header. If that data is writable, it's easy to screw up that header and crash another BPF program that reserves/submits a new record. User space can only read data area for BPF ringbuf, and so we rely heavily on a tight control of who can write what into those 8 bytes.
Thanks, Daniel
On Tue, Sep 10, 2024 at 03:21:04PM GMT, Andrii Nakryiko wrote:
On Tue, Sep 10, 2024 at 3:16 PM Daniel Xu dxu@dxuuu.xyz wrote:
On Tue, Sep 10, 2024, at 2:07 PM, Daniel Xu wrote:
On Tue, Sep 10, 2024 at 01:41:41PM GMT, Andrii Nakryiko wrote:
On Tue, Sep 10, 2024 at 11:36 AM Alexei Starovoitov
[...]
Also, Daniel, can you please make sure that dynptr we return for each sample is read-only? We shouldn't let consumer BPF program ability to corrupt ringbuf record headers (accidentally or otherwise).
Sure.
So the sample is not read-only. But I think prog is prevented from messing with header regardless.
__bpf_user_ringbuf_peek() returns sample past the header:
*sample = (void *)((uintptr_t)rb->data + (uintptr_t)((cons_pos + BPF_RINGBUF_HDR_SZ) & rb->mask));
dynptr is initialized with the above ptr:
bpf_dynptr_init(&dynptr, sample, BPF_DYNPTR_TYPE_LOCAL, 0, size);
So I don't think there's a way for the prog to access the header thru the dynptr.
By "header" I mean 8 bytes that precede each submitted ringbuf record. That header is part of ringbuf data area. Given user space can set consumer_pos to arbitrary value, kernel can return arbitrary part of ringbuf data area, including that 8 byte header. If that data is writable, it's easy to screw up that header and crash another BPF program that reserves/submits a new record. User space can only read data area for BPF ringbuf, and so we rely heavily on a tight control of who can write what into those 8 bytes.
Ah, ok. I think I understand.
Given this and your other comments about rb->busy, what about enforcing bpf_user_ringbuf_drain() NAND mmap? I think the use cases here are different enough where this makes sense.
On Tue, Sep 10, 2024 at 4:44 PM Daniel Xu dxu@dxuuu.xyz wrote:
On Tue, Sep 10, 2024 at 03:21:04PM GMT, Andrii Nakryiko wrote:
On Tue, Sep 10, 2024 at 3:16 PM Daniel Xu dxu@dxuuu.xyz wrote:
On Tue, Sep 10, 2024, at 2:07 PM, Daniel Xu wrote:
On Tue, Sep 10, 2024 at 01:41:41PM GMT, Andrii Nakryiko wrote:
On Tue, Sep 10, 2024 at 11:36 AM Alexei Starovoitov
[...]
Also, Daniel, can you please make sure that dynptr we return for each sample is read-only? We shouldn't let consumer BPF program ability to corrupt ringbuf record headers (accidentally or otherwise).
Sure.
So the sample is not read-only. But I think prog is prevented from messing with header regardless.
__bpf_user_ringbuf_peek() returns sample past the header:
*sample = (void *)((uintptr_t)rb->data + (uintptr_t)((cons_pos + BPF_RINGBUF_HDR_SZ) & rb->mask));
dynptr is initialized with the above ptr:
bpf_dynptr_init(&dynptr, sample, BPF_DYNPTR_TYPE_LOCAL, 0, size);
So I don't think there's a way for the prog to access the header thru the dynptr.
By "header" I mean 8 bytes that precede each submitted ringbuf record. That header is part of ringbuf data area. Given user space can set consumer_pos to arbitrary value, kernel can return arbitrary part of ringbuf data area, including that 8 byte header. If that data is writable, it's easy to screw up that header and crash another BPF program that reserves/submits a new record. User space can only read data area for BPF ringbuf, and so we rely heavily on a tight control of who can write what into those 8 bytes.
Ah, ok. I think I understand.
Given this and your other comments about rb->busy, what about enforcing bpf_user_ringbuf_drain() NAND mmap? I think the use cases here are different enough where this makes sense.
You mean disabling user-space mmap()? TBH, I'd like to understand the use case first before we make such decisions. Maybe what you need is not really a BPF ringbuf? Can you give us a bit more details on what you are trying to achieve?
On Tue, Sep 10, 2024 at 05:39:55PM GMT, Andrii Nakryiko wrote:
On Tue, Sep 10, 2024 at 4:44 PM Daniel Xu dxu@dxuuu.xyz wrote:
On Tue, Sep 10, 2024 at 03:21:04PM GMT, Andrii Nakryiko wrote:
On Tue, Sep 10, 2024 at 3:16 PM Daniel Xu dxu@dxuuu.xyz wrote:
On Tue, Sep 10, 2024, at 2:07 PM, Daniel Xu wrote:
On Tue, Sep 10, 2024 at 01:41:41PM GMT, Andrii Nakryiko wrote:
On Tue, Sep 10, 2024 at 11:36 AM Alexei Starovoitov
[...]
Also, Daniel, can you please make sure that dynptr we return for each sample is read-only? We shouldn't let consumer BPF program ability to corrupt ringbuf record headers (accidentally or otherwise).
Sure.
So the sample is not read-only. But I think prog is prevented from messing with header regardless.
__bpf_user_ringbuf_peek() returns sample past the header:
*sample = (void *)((uintptr_t)rb->data + (uintptr_t)((cons_pos + BPF_RINGBUF_HDR_SZ) & rb->mask));
dynptr is initialized with the above ptr:
bpf_dynptr_init(&dynptr, sample, BPF_DYNPTR_TYPE_LOCAL, 0, size);
So I don't think there's a way for the prog to access the header thru the dynptr.
By "header" I mean 8 bytes that precede each submitted ringbuf record. That header is part of ringbuf data area. Given user space can set consumer_pos to arbitrary value, kernel can return arbitrary part of ringbuf data area, including that 8 byte header. If that data is writable, it's easy to screw up that header and crash another BPF program that reserves/submits a new record. User space can only read data area for BPF ringbuf, and so we rely heavily on a tight control of who can write what into those 8 bytes.
Ah, ok. I think I understand.
Given this and your other comments about rb->busy, what about enforcing bpf_user_ringbuf_drain() NAND mmap? I think the use cases here are different enough where this makes sense.
You mean disabling user-space mmap()? TBH, I'd like to understand the use case first before we make such decisions. Maybe what you need is not really a BPF ringbuf? Can you give us a bit more details on what you are trying to achieve?
BPF cpumap, under the hood, has one MPSC ring buffer (ptr_ring) for each entry in the cpumap. When a prog redirects to an entry in the cpumap, the machinery queues up the xdp frame onto the destination CPU ptr_ring. This can occur on any cpu, thus multi-producer. On processing side, there is only the kthread created by the cpumap entry and bound to the specific cpu that is consuming entries. So single consumer.
The goal is to track the latency overhead added by the ptr_ring and the kthread (versus softirq, where there is less overhead). Ideally we want p50, p90, p95, p99 percentiles.
To do this, we need to track every single entry enqueue time as well as dequeue time - events that occur in the tail are quite important.
Since ptr_ring is also a ring buffer, I thought it would be easy, reliable, and fast to just create a "shadow" ring buffer. Every time the producer enqueues entries, I'd enqueue the same number of current timestamps onto the shadow RB. Same thing on the consumer side, except dequeue and calculate the timestamp delta.
I was originally planning on writing my own lockless ring buffer in pure BPF (b/c spinlocks cannot be used w/ tracepoints yet) but was hoping I could avoid that with this patch.
About disabling user-space mmap: yeah, that's what I was suggesting. I think it'd be a bit odd if you wanted BPF RB to support consumption from both userspace && prog at the same time. And since draining from prog is new functionality (and thus the NAND isn't a regression), you could relax the restriction later without issues.
[cc Jesper]
On Tue, Sep 10, 2024, at 8:31 PM, Daniel Xu wrote:
On Tue, Sep 10, 2024 at 05:39:55PM GMT, Andrii Nakryiko wrote:
On Tue, Sep 10, 2024 at 4:44 PM Daniel Xu dxu@dxuuu.xyz wrote:
On Tue, Sep 10, 2024 at 03:21:04PM GMT, Andrii Nakryiko wrote:
On Tue, Sep 10, 2024 at 3:16 PM Daniel Xu dxu@dxuuu.xyz wrote:
On Tue, Sep 10, 2024, at 2:07 PM, Daniel Xu wrote:
On Tue, Sep 10, 2024 at 01:41:41PM GMT, Andrii Nakryiko wrote: > On Tue, Sep 10, 2024 at 11:36 AM Alexei Starovoitov
[...]
> > Also, Daniel, can you please make sure that dynptr we return for each > sample is read-only? We shouldn't let consumer BPF program ability to > corrupt ringbuf record headers (accidentally or otherwise).
Sure.
So the sample is not read-only. But I think prog is prevented from messing with header regardless.
__bpf_user_ringbuf_peek() returns sample past the header:
*sample = (void *)((uintptr_t)rb->data + (uintptr_t)((cons_pos + BPF_RINGBUF_HDR_SZ) & rb->mask));
dynptr is initialized with the above ptr:
bpf_dynptr_init(&dynptr, sample, BPF_DYNPTR_TYPE_LOCAL, 0, size);
So I don't think there's a way for the prog to access the header thru the dynptr.
By "header" I mean 8 bytes that precede each submitted ringbuf record. That header is part of ringbuf data area. Given user space can set consumer_pos to arbitrary value, kernel can return arbitrary part of ringbuf data area, including that 8 byte header. If that data is writable, it's easy to screw up that header and crash another BPF program that reserves/submits a new record. User space can only read data area for BPF ringbuf, and so we rely heavily on a tight control of who can write what into those 8 bytes.
Ah, ok. I think I understand.
Given this and your other comments about rb->busy, what about enforcing bpf_user_ringbuf_drain() NAND mmap? I think the use cases here are different enough where this makes sense.
You mean disabling user-space mmap()? TBH, I'd like to understand the use case first before we make such decisions. Maybe what you need is not really a BPF ringbuf? Can you give us a bit more details on what you are trying to achieve?
BPF cpumap, under the hood, has one MPSC ring buffer (ptr_ring) for each entry in the cpumap. When a prog redirects to an entry in the cpumap, the machinery queues up the xdp frame onto the destination CPU ptr_ring. This can occur on any cpu, thus multi-producer. On processing side, there is only the kthread created by the cpumap entry and bound to the specific cpu that is consuming entries. So single consumer.
Goal is to track the latency overhead added from ptr_ring and the kthread (versus softirq where is less overhead). Ideally we want p50, p90, p95, p99 percentiles.
To do this, we need to track every single entry enqueue time as well as dequeue time - events that occur in the tail are quite important.
Since ptr_ring is also a ring buffer, I thought it would be easy, reliable, and fast to just create a "shadow" ring buffer. Every time producer enqueues entries, I'd enqueue the same number of current timestamp onto shadow RB. Same thing on consumer side, except dequeue and calculate timestamp delta.
I was originally planning on writing my own lockless ring buffer in pure BPF (b/c spinlocks cannot be used w/ tracepoints yet) but was hoping I could avoid that with this patch.
[...]
Alternatively, could add a u64 timestamp to xdp_frame, which makes all this tracking inline (and thus more reliable). But I'm not sure how precious the space in that struct is - I see some references online saying most drivers save 128B headroom. I also see:
#define XDP_PACKET_HEADROOM 256
Could probably amortize the timestamp read by setting it in bq_flush_to_queue().
On 11/09/2024 06.43, Daniel Xu wrote:
[cc Jesper]
On Tue, Sep 10, 2024, at 8:31 PM, Daniel Xu wrote:
On Tue, Sep 10, 2024 at 05:39:55PM GMT, Andrii Nakryiko wrote:
On Tue, Sep 10, 2024 at 4:44 PM Daniel Xu dxu@dxuuu.xyz wrote:
On Tue, Sep 10, 2024 at 03:21:04PM GMT, Andrii Nakryiko wrote:
On Tue, Sep 10, 2024 at 3:16 PM Daniel Xu dxu@dxuuu.xyz wrote:
[...cut...]
Can you give us a bit more details on what you are trying to achieve?
BPF cpumap, under the hood, has one MPSC ring buffer (ptr_ring) for each entry in the cpumap. When a prog redirects to an entry in the cpumap, the machinery queues up the xdp frame onto the destination CPU ptr_ring. This can occur on any cpu, thus multi-producer. On processing side, there is only the kthread created by the cpumap entry and bound to the specific cpu that is consuming entries. So single consumer.
An important detail: to get Multi-Producer (MP) to scale, the CPUMAP does bulk enqueue into the ptr_ring. It stores the xdp_frame's in a per-CPU array and does the flush/enqueue as part of the xdp_do_flush(). Because I was afraid of this adding latency, I chose to also flush every 8 frames (CPU_MAP_BULK_SIZE).
Looking at code I see this is also explained in a comment:
/* General idea: XDP packets getting XDP redirected to another CPU,
 * will maximum be stored/queued for one driver ->poll() call. It is
 * guaranteed that queueing the frame and the flush operation happen on
 * same CPU. Thus, cpu_map_flush operation can deduct via this_cpu_ptr()
 * which queue in bpf_cpu_map_entry contains packets.
 */
Goal is to track the latency overhead added from ptr_ring and the kthread (versus softirq where is less overhead). Ideally we want p50, p90, p95, p99 percentiles.
I'm very interested in this use case of understanding the latency of CPUMAP. I'm a fan of latency histograms that I turn into heatmaps in grafana.
To do this, we need to track every single entry enqueue time as well as dequeue time - events that occur in the tail are quite important.
Since ptr_ring is also a ring buffer, I thought it would be easy, reliable, and fast to just create a "shadow" ring buffer. Every time producer enqueues entries, I'd enqueue the same number of current timestamp onto shadow RB. Same thing on consumer side, except dequeue and calculate timestamp delta.
This idea seems overkill and will likely produce unreliable results. E.g. the overhead of this additional ring buffer will also affect the measurements.
I was originally planning on writing my own lockless ring buffer in pure BPF (b/c spinlocks cannot be used w/ tracepoints yet) but was hoping I could avoid that with this patch.
[...]
Alternatively, could add a u64 timestamp to xdp_frame, which makes all this tracking inline (and thus more reliable). But I'm not sure how precious the space in that struct is - I see some references online saying most drivers save 128B headroom. I also see:
#define XDP_PACKET_HEADROOM 256
I like the inline idea. I would suggest adding a u64 timestamp into the XDP-metadata area (ctx->data_meta, code example [1]) when XDP runs in RX-NAPI. Then at the remote CPU you can run another CPUMAP-XDP program that picks up this timestamp and calculates a delta from the "now" timestamp.
[1] https://github.com/xdp-project/bpf-examples/blob/master/AF_XDP-interaction/a...
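[Editorial note] A rough sketch of that approach; all names (ts_meta, cpu_map, prog names) are made up, and it assumes the driver leaves metadata headroom, that metadata set at RX is carried in the xdp_frame to the CPUMAP program, and that libbpf's SEC("xdp/cpumap") is used for the second program:

	// SPDX-License-Identifier: GPL-2.0
	#include <linux/bpf.h>
	#include <bpf/bpf_helpers.h>

	struct ts_meta {
		__u64 rx_ns;
	};

	struct {
		__uint(type, BPF_MAP_TYPE_CPUMAP);
		__uint(max_entries, 64);
		__type(key, __u32);
		__type(value, struct bpf_cpumap_val);
	} cpu_map SEC(".maps");

	SEC("xdp")
	int xdp_rx_stamp(struct xdp_md *ctx)
	{
		struct ts_meta *meta;
		void *data;

		/* Reserve metadata space in front of the packet */
		if (bpf_xdp_adjust_meta(ctx, -(int)sizeof(*meta)))
			return XDP_PASS;	/* no headroom; skip stamping */

		data = (void *)(long)ctx->data;
		meta = (void *)(long)ctx->data_meta;
		if ((void *)(meta + 1) > data)	/* bounds check for the verifier */
			return XDP_PASS;

		meta->rx_ns = bpf_ktime_get_ns();
		return bpf_redirect_map(&cpu_map, 0, 0);	/* CPU 0 for the sketch */
	}

	SEC("xdp/cpumap")
	int xdp_cpumap_latency(struct xdp_md *ctx)
	{
		struct ts_meta *meta = (void *)(long)ctx->data_meta;
		void *data = (void *)(long)ctx->data;
		__u64 delta;

		if ((void *)(meta + 1) > data)	/* frame wasn't stamped */
			return XDP_PASS;

		delta = bpf_ktime_get_ns() - meta->rx_ns;
		/* in a real tool, feed delta into a log2 histogram map to
		 * derive p50/p90/p95/p99 */
		bpf_printk("cpumap latency ns: %llu", delta);
		return XDP_PASS;
	}

	char _license[] SEC("license") = "GPL";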
Could probably amortize the timestamp read by setting it in bq_flush_to_queue().
To amortize, consider that you might not need to timestamp EVERY packet to get sufficient statistics on the latency.
Regarding bq_flush_to_queue() and the enqueue tracepoint: trace_xdp_cpumap_enqueue(rcpu->map_id, processed, drops, to_cpu)
I have an idea for you, on how to measure the latency overhead from XDP RX-processing to when enqueue "flush" happens. It is a little tricky to explain, so I will outline the steps.
1. XDP bpf_prog store timestamp in per-CPU array, unless timestamp is already set.
2. trace_xdp_cpumap_enqueue bpf_prog reads per-CPU timestamp and calc latency diff, and clears timestamp.
This measures the latency overhead of bulk enqueue. (Notice: Only the first XDP redirect frame after a bq_flush_to_queue() will set the timestamp). This per-CPU store should work as this all runs under same RX-NAPI "poll" execution.
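[Editorial note] A hedged sketch of those two steps; the map and program names are invented, and the tp_btf argument list simply mirrors the trace_xdp_cpumap_enqueue() arguments quoted above:

	#include "vmlinux.h"
	#include <bpf/bpf_helpers.h>
	#include <bpf/bpf_tracing.h>

	struct {
		__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
		__uint(max_entries, 1);
		__type(key, __u32);
		__type(value, __u64);
	} first_redirect_ts SEC(".maps");

	/* Step 1: call this from the RX XDP program just before returning
	 * XDP_REDIRECT; only the first redirect after a flush sets the stamp.
	 */
	static __always_inline void stamp_if_unset(void)
	{
		__u32 key = 0;
		__u64 *ts = bpf_map_lookup_elem(&first_redirect_ts, &key);

		if (ts && !*ts)
			*ts = bpf_ktime_get_ns();
	}

	SEC("xdp")
	int xdp_stamp(struct xdp_md *ctx)
	{
		stamp_if_unset();
		/* the real program would bpf_redirect_map() to the cpumap here */
		return XDP_PASS;
	}

	/* Step 2: measure first-redirect -> bulk-enqueue-flush latency, then clear */
	SEC("tp_btf/xdp_cpumap_enqueue")
	int BPF_PROG(on_cpumap_enqueue, int map_id, unsigned int processed,
		     unsigned int drops, int to_cpu)
	{
		__u32 key = 0;
		__u64 *ts = bpf_map_lookup_elem(&first_redirect_ts, &key);
		__u64 delta;

		if (!ts || !*ts)
			return 0;

		delta = bpf_ktime_get_ns() - *ts;
		*ts = 0;	/* re-arm for the next NAPI poll cycle */
		bpf_printk("enqueue latency ns: %llu", delta);
		return 0;
	}

	char _license[] SEC("license") = "GPL";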
This latency overhead of bulk enqueue will (unfortunately) also count/measure the XDP_PASS packets that get processed by the normal netstack. So, watch out for this. E.g. you could have XDP action (e.g. XDP_PASS) counters as part of step 1, and have statistics for cases where XDP_PASS interfered.
--Jesper
On Wed, Sep 11, 2024 at 10:32:56AM GMT, Jesper Dangaard Brouer wrote:
On 11/09/2024 06.43, Daniel Xu wrote:
[cc Jesper]
On Tue, Sep 10, 2024, at 8:31 PM, Daniel Xu wrote:
On Tue, Sep 10, 2024 at 05:39:55PM GMT, Andrii Nakryiko wrote:
On Tue, Sep 10, 2024 at 4:44 PM Daniel Xu dxu@dxuuu.xyz wrote:
On Tue, Sep 10, 2024 at 03:21:04PM GMT, Andrii Nakryiko wrote:
On Tue, Sep 10, 2024 at 3:16 PM Daniel Xu dxu@dxuuu.xyz wrote: >
[...cut...]
Can you give us a bit more details on what you are trying to achieve?
BPF cpumap, under the hood, has one MPSC ring buffer (ptr_ring) for each entry in the cpumap. When a prog redirects to an entry in the cpumap, the machinery queues up the xdp frame onto the destination CPU ptr_ring. This can occur on any cpu, thus multi-producer. On processing side, there is only the kthread created by the cpumap entry and bound to the specific cpu that is consuming entries. So single consumer.
An important detail: to get Multi-Producer (MP) to scale the CPUMAP does bulk enqueue into the ptr_ring. It stores the xdp_frame's in a per-CPU array and does the flush/enqueue as part of the xdp_do_flush(). Because I was afraid of this adding latency, I choose to also flush every 8 frames (CPU_MAP_BULK_SIZE).
Looking at code I see this is also explained in a comment:
/* General idea: XDP packets getting XDP redirected to another CPU,
 * will maximum be stored/queued for one driver ->poll() call. It is
 * guaranteed that queueing the frame and the flush operation happen on
 * same CPU. Thus, cpu_map_flush operation can deduct via this_cpu_ptr()
 * which queue in bpf_cpu_map_entry contains packets.
 */
Goal is to track the latency overhead added from ptr_ring and the kthread (versus softirq where is less overhead). Ideally we want p50, p90, p95, p99 percentiles.
I'm very interesting in this use-case of understanding the latency of CPUMAP. I'm a fan of latency histograms that I turn into heatmaps in grafana.
To do this, we need to track every single entry enqueue time as well as dequeue time - events that occur in the tail are quite important.
Since ptr_ring is also a ring buffer, I thought it would be easy, reliable, and fast to just create a "shadow" ring buffer. Every time producer enqueues entries, I'd enqueue the same number of current timestamp onto shadow RB. Same thing on consumer side, except dequeue and calculate timestamp delta.
This idea seems overkill and will likely produce unreliable results. E.g. the overhead of this additional ring buffer will also affect the measurements.
Yeah, good point.
I was originally planning on writing my own lockless ring buffer in pure BPF (b/c spinlocks cannot be used w/ tracepoints yet) but was hoping I could avoid that with this patch.
[...]
Alternatively, could add a u64 timestamp to xdp_frame, which makes all this tracking inline (and thus more reliable). But I'm not sure how precious the space in that struct is - I see some references online saying most drivers save 128B headroom. I also see:
#define XDP_PACKET_HEADROOM 256
I like the inline idea. I would suggest to add u64 timestamp into XDP-metadata area (ctx->data_meta code example[1]) , when XDP runs in RX-NAPI. Then at the remote CPU you can run another CPUMAP-XDP program that pickup this timestamp, and then calc a delta from "now" timestamp.
[1] https://github.com/xdp-project/bpf-examples/blob/master/AF_XDP-interaction/a...
Cool! This is a much better idea than mine :)
I'll give this a try.
Could probably amortize the timestamp read by setting it in bq_flush_to_queue().
To amortize, consider that you might not need to timestamp EVERY packet to get sufficient statistics on the latency.
Regarding bq_flush_to_queue() and the enqueue tracepoint: trace_xdp_cpumap_enqueue(rcpu->map_id, processed, drops, to_cpu)
I have an idea for you, on how to measure the latency overhead from XDP RX-processing to when enqueue "flush" happens. It is a little tricky to explain, so I will outline the steps.
XDP bpf_prog store timestamp in per-CPU array, unless timestamp is already set.
trace_xdp_cpumap_enqueue bpf_prog reads per-CPU timestamp and calc latency diff, and clears timestamp.
This measures the latency overhead of bulk enqueue. (Notice: Only the first XDP redirect frame after a bq_flush_to_queue() will set the timestamp). This per-CPU store should work as this all runs under same RX-NAPI "poll" execution.
Makes sense to me. This breaks down the latency even further. I'll keep it in mind if we need further troubleshooting.
This latency overhead of bulk enqueue, will (unfortunately) also count/measure the XDP_PASS packets that gets processed by the normal netstack. So, watch out for this. e.g could have XDP actions (e.g XDP_PASS) counters as part of step 1, and have statistic for cases where XDP_PASS interfered.
Not sure I got this. If we only set the percpu timestamp for XDP_REDIRECT frames, then I don't see how XDP_PASS interferes. Maybe I misunderstand something.
Thanks, Daniel
On 11/09/2024 20.53, Daniel Xu wrote:
On Wed, Sep 11, 2024 at 10:32:56AM GMT, Jesper Dangaard Brouer wrote:
On 11/09/2024 06.43, Daniel Xu wrote:
[cc Jesper]
On Tue, Sep 10, 2024, at 8:31 PM, Daniel Xu wrote:
On Tue, Sep 10, 2024 at 05:39:55PM GMT, Andrii Nakryiko wrote:
On Tue, Sep 10, 2024 at 4:44 PM Daniel Xu dxu@dxuuu.xyz wrote:
On Tue, Sep 10, 2024 at 03:21:04PM GMT, Andrii Nakryiko wrote: > On Tue, Sep 10, 2024 at 3:16 PM Daniel Xu dxu@dxuuu.xyz wrote: >>
[...cut...]
Can you give us a bit more details on what you are trying to achieve?
BPF cpumap, under the hood, has one MPSC ring buffer (ptr_ring) for each entry in the cpumap. When a prog redirects to an entry in the cpumap, the machinery queues up the xdp frame onto the destination CPU ptr_ring. This can occur on any cpu, thus multi-producer. On processing side, there is only the kthread created by the cpumap entry and bound to the specific cpu that is consuming entries. So single consumer.
An important detail: to get Multi-Producer (MP) to scale the CPUMAP does bulk enqueue into the ptr_ring. It stores the xdp_frame's in a per-CPU array and does the flush/enqueue as part of the xdp_do_flush(). Because I was afraid of this adding latency, I choose to also flush every 8 frames (CPU_MAP_BULK_SIZE).
Looking at code I see this is also explained in a comment:
/* General idea: XDP packets getting XDP redirected to another CPU,
 * will maximum be stored/queued for one driver ->poll() call. It is
 * guaranteed that queueing the frame and the flush operation happen on
 * same CPU. Thus, cpu_map_flush operation can deduct via this_cpu_ptr()
 * which queue in bpf_cpu_map_entry contains packets.
 */
Goal is to track the latency overhead added from ptr_ring and the kthread (versus softirq where is less overhead). Ideally we want p50, p90, p95, p99 percentiles.
I'm very interesting in this use-case of understanding the latency of CPUMAP. I'm a fan of latency histograms that I turn into heatmaps in grafana.
To do this, we need to track every single entry enqueue time as well as dequeue time - events that occur in the tail are quite important.
Since ptr_ring is also a ring buffer, I thought it would be easy, reliable, and fast to just create a "shadow" ring buffer. Every time producer enqueues entries, I'd enqueue the same number of current timestamp onto shadow RB. Same thing on consumer side, except dequeue and calculate timestamp delta.
This idea seems overkill and will likely produce unreliable results. E.g. the overhead of this additional ring buffer will also affect the measurements.
Yeah, good point.
I was originally planning on writing my own lockless ring buffer in pure BPF (b/c spinlocks cannot be used w/ tracepoints yet) but was hoping I could avoid that with this patch.
[...]
Alternatively, could add a u64 timestamp to xdp_frame, which makes all this tracking inline (and thus more reliable). But I'm not sure how precious the space in that struct is - I see some references online saying most drivers save 128B headroom. I also see:
#define XDP_PACKET_HEADROOM 256
I like the inline idea. I would suggest to add u64 timestamp into XDP-metadata area (ctx->data_meta code example[1]) , when XDP runs in RX-NAPI. Then at the remote CPU you can run another CPUMAP-XDP program that pickup this timestamp, and then calc a delta from "now" timestamp.
[1] https://github.com/xdp-project/bpf-examples/blob/master/AF_XDP-interaction/a...
Cool! This is a much better idea than mine :)
I'll give this a try.
Could probably amortize the timestamp read by setting it in bq_flush_to_queue().
To amortize, consider that you might not need to timestamp EVERY packet to get sufficient statistics on the latency.
Regarding bq_flush_to_queue() and the enqueue tracepoint: trace_xdp_cpumap_enqueue(rcpu->map_id, processed, drops, to_cpu)
I have an idea for you, on how to measure the latency overhead from XDP RX-processing to when enqueue "flush" happens. It is a little tricky to explain, so I will outline the steps.
XDP bpf_prog store timestamp in per-CPU array, unless timestamp is already set.
trace_xdp_cpumap_enqueue bpf_prog reads per-CPU timestamp and calc latency diff, and clears timestamp.
This measures the latency overhead of bulk enqueue. (Notice: Only the first XDP redirect frame after a bq_flush_to_queue() will set the timestamp). This per-CPU store should work as this all runs under same RX-NAPI "poll" execution.
Makes sense to me. This breaks down the latency even further. I'll keep it in mind if we need further troubleshooting.
Yes, this breaks down the latency even further :-)
This latency overhead of bulk enqueue, will (unfortunately) also count/measure the XDP_PASS packets that gets processed by the normal netstack. So, watch out for this. e.g could have XDP actions (e.g XDP_PASS) counters as part of step 1, and have statistic for cases where XDP_PASS interfered.
Not sure I got this. If we only set the percpu timestamp for XDP_REDIRECT frames, then I don't see how XDP_PASS interferes. Maybe I misunderstand something.
Not quite. You only set the timestamp for the first XDP_REDIRECT frame. I'm simply saying that all the packets processed *after* this timestamp will be attributed to the time it takes until trace_xdp_cpumap_enqueue() runs. That part should be obvious. Then, I'm saying remember that an XDP_PASS packet takes more time than an XDP_REDIRECT packet. I hope this makes it more clear. Point: this is a pitfall you need to be aware of when looking at your metrics.
For the inline timestamping the same pitfall actually applies. There you timestamp the packets themselves, but the CPUMAP enqueue happens as the *LAST* thing in the NAPI loop, during the xdp_do_flush() call. This means that interleaved normal netstack (XDP_PASS) packets will be processed BEFORE this call to xdp_do_flush(). As noted earlier, to compensate for this effect, the code will enqueue-flush every 8 frames (CPU_MAP_BULK_SIZE).
I hope, I've not confused you more :-|
--Jesper
On Thu, Sep 12, 2024, at 2:40 AM, Jesper Dangaard Brouer wrote:
On 11/09/2024 20.53, Daniel Xu wrote:
On Wed, Sep 11, 2024 at 10:32:56AM GMT, Jesper Dangaard Brouer wrote:
On 11/09/2024 06.43, Daniel Xu wrote:
[cc Jesper]
On Tue, Sep 10, 2024, at 8:31 PM, Daniel Xu wrote:
On Tue, Sep 10, 2024 at 05:39:55PM GMT, Andrii Nakryiko wrote:
On Tue, Sep 10, 2024 at 4:44 PM Daniel Xu dxu@dxuuu.xyz wrote: > > On Tue, Sep 10, 2024 at 03:21:04PM GMT, Andrii Nakryiko wrote: >> On Tue, Sep 10, 2024 at 3:16 PM Daniel Xu dxu@dxuuu.xyz wrote: >>>
[...cut...]
Can you give us a bit more details on what you are trying to achieve?
BPF cpumap, under the hood, has one MPSC ring buffer (ptr_ring) for each entry in the cpumap. When a prog redirects to an entry in the cpumap, the machinery queues up the xdp frame onto the destination CPU ptr_ring. This can occur on any cpu, thus multi-producer. On processing side, there is only the kthread created by the cpumap entry and bound to the specific cpu that is consuming entries. So single consumer.
An important detail: to get Multi-Producer (MP) to scale the CPUMAP does bulk enqueue into the ptr_ring. It stores the xdp_frame's in a per-CPU array and does the flush/enqueue as part of the xdp_do_flush(). Because I was afraid of this adding latency, I choose to also flush every 8 frames (CPU_MAP_BULK_SIZE).
Looking at code I see this is also explained in a comment:
/* General idea: XDP packets getting XDP redirected to another CPU,
 * will maximum be stored/queued for one driver ->poll() call. It is
 * guaranteed that queueing the frame and the flush operation happen on
 * same CPU. Thus, cpu_map_flush operation can deduct via this_cpu_ptr()
 * which queue in bpf_cpu_map_entry contains packets.
 */
Goal is to track the latency overhead added from ptr_ring and the kthread (versus softirq where is less overhead). Ideally we want p50, p90, p95, p99 percentiles.
I'm very interesting in this use-case of understanding the latency of CPUMAP. I'm a fan of latency histograms that I turn into heatmaps in grafana.
To do this, we need to track every single entry enqueue time as well as dequeue time - events that occur in the tail are quite important.
Since ptr_ring is also a ring buffer, I thought it would be easy, reliable, and fast to just create a "shadow" ring buffer. Every time producer enqueues entries, I'd enqueue the same number of current timestamp onto shadow RB. Same thing on consumer side, except dequeue and calculate timestamp delta.
This idea seems overkill and will likely produce unreliable results. E.g. the overhead of this additional ring buffer will also affect the measurements.
Yeah, good point.
I was originally planning on writing my own lockless ring buffer in pure BPF (b/c spinlocks cannot be used w/ tracepoints yet) but was hoping I could avoid that with this patch.
[...]
Alternatively, could add a u64 timestamp to xdp_frame, which makes all this tracking inline (and thus more reliable). But I'm not sure how precious the space in that struct is - I see some references online saying most drivers save 128B headroom. I also see:
#define XDP_PACKET_HEADROOM 256
I like the inline idea. I would suggest to add u64 timestamp into XDP-metadata area (ctx->data_meta code example[1]) , when XDP runs in RX-NAPI. Then at the remote CPU you can run another CPUMAP-XDP program that pickup this timestamp, and then calc a delta from "now" timestamp.
[1] https://github.com/xdp-project/bpf-examples/blob/master/AF_XDP-interaction/a...
Cool! This is a much better idea than mine :)
I'll give this a try.
Could probably amortize the timestamp read by setting it in bq_flush_to_queue().
To amortize, consider that you might not need to timestamp EVERY packet to get sufficient statistics on the latency.
Regarding bq_flush_to_queue() and the enqueue tracepoint: trace_xdp_cpumap_enqueue(rcpu->map_id, processed, drops, to_cpu)
I have an idea for you, on how to measure the latency overhead from XDP RX-processing to when enqueue "flush" happens. It is a little tricky to explain, so I will outline the steps.
XDP bpf_prog store timestamp in per-CPU array, unless timestamp is already set.
trace_xdp_cpumap_enqueue bpf_prog reads per-CPU timestamp and calc latency diff, and clears timestamp.
This measures the latency overhead of bulk enqueue. (Notice: Only the first XDP redirect frame after a bq_flush_to_queue() will set the timestamp). This per-CPU store should work as this all runs under same RX-NAPI "poll" execution.
Makes sense to me. This breaks down the latency even further. I'll keep it in mind if we need further troubleshooting.
Yes, this breaks down the latency even further :-)
This latency overhead of bulk enqueue, will (unfortunately) also count/measure the XDP_PASS packets that gets processed by the normal netstack. So, watch out for this. e.g could have XDP actions (e.g XDP_PASS) counters as part of step 1, and have statistic for cases where XDP_PASS interfered.
Not sure I got this. If we only set the percpu timestamp for XDP_REDIRECT frames, then I don't see how XDP_PASS interferes. Maybe I misunderstand something.
Not quite. You only set timestamp for the first XDP_REDIRECT frames. I'm simply saying, all the packets processed *after* this timestamp will attribute to the time it takes until trace_xdp_cpumap_enqueue() runs. That part should be obvious. Then, I'm saying remember that an XDP_PASS packet takes more time than a XDP_REDIRECT packet. I hope this makes it more clear. Point: This is a pitfall you need to be aware of when looking at your metrics.
For the inline timestamping same pitfall actually applies. There you timestamp the packets themselves. Because the CPUMAP enqueue happens as the *LAST* thing in NAPI loop, during xdp_do_flush() call. This means that interleaved normal netstack (XDP_PASS) packets will be processed BEFORE this call to xdp_do_flush(). As noted earlier, to compensate for this effect, code will enqueue-flush every 8 frames (CPU_MAP_BULK_SIZE).
Ah, that makes sense. Thanks for explaining.
I hope, I've not confused you more :-|
No, that was helpful!
Thanks, Daniel
On Tue, Sep 10, 2024 at 8:31 PM Daniel Xu dxu@dxuuu.xyz wrote:
On Tue, Sep 10, 2024 at 05:39:55PM GMT, Andrii Nakryiko wrote:
On Tue, Sep 10, 2024 at 4:44 PM Daniel Xu dxu@dxuuu.xyz wrote:
On Tue, Sep 10, 2024 at 03:21:04PM GMT, Andrii Nakryiko wrote:
On Tue, Sep 10, 2024 at 3:16 PM Daniel Xu dxu@dxuuu.xyz wrote:
On Tue, Sep 10, 2024, at 2:07 PM, Daniel Xu wrote:
On Tue, Sep 10, 2024 at 01:41:41PM GMT, Andrii Nakryiko wrote: > On Tue, Sep 10, 2024 at 11:36 AM Alexei Starovoitov
[...]
> > Also, Daniel, can you please make sure that dynptr we return for each > sample is read-only? We shouldn't let consumer BPF program ability to > corrupt ringbuf record headers (accidentally or otherwise).
Sure.
So the sample is not read-only. But I think prog is prevented from messing with header regardless.
__bpf_user_ringbuf_peek() returns sample past the header:
*sample = (void *)((uintptr_t)rb->data + (uintptr_t)((cons_pos + BPF_RINGBUF_HDR_SZ) & rb->mask));
dynptr is initialized with the above ptr:
bpf_dynptr_init(&dynptr, sample, BPF_DYNPTR_TYPE_LOCAL, 0, size);
So I don't think there's a way for the prog to access the header thru the dynptr.
By "header" I mean 8 bytes that precede each submitted ringbuf record. That header is part of ringbuf data area. Given user space can set consumer_pos to arbitrary value, kernel can return arbitrary part of ringbuf data area, including that 8 byte header. If that data is writable, it's easy to screw up that header and crash another BPF program that reserves/submits a new record. User space can only read data area for BPF ringbuf, and so we rely heavily on a tight control of who can write what into those 8 bytes.
Ah, ok. I think I understand.
Given this and your other comments about rb->busy, what about enforcing bpf_user_ringbuf_drain() NAND mmap? I think the use cases here are different enough where this makes sense.
You mean disabling user-space mmap()? TBH, I'd like to understand the use case first before we make such decisions. Maybe what you need is not really a BPF ringbuf? Can you give us a bit more details on what you are trying to achieve?
BPF cpumap, under the hood, has one MPSC ring buffer (ptr_ring) for each entry in the cpumap. When a prog redirects to an entry in the cpumap, the machinery queues up the xdp frame onto the destination CPU ptr_ring. This can occur on any cpu, thus multi-producer. On processing side, there is only the kthread created by the cpumap entry and bound to the specific cpu that is consuming entries. So single consumer.
Goal is to track the latency overhead added from ptr_ring and the kthread (versus softirq where is less overhead). Ideally we want p50, p90, p95, p99 percentiles.
To do this, we need to track every single entry enqueue time as well as dequeue time - events that occur in the tail are quite important.
Since ptr_ring is also a ring buffer, I thought it would be easy, reliable, and fast to just create a "shadow" ring buffer. Every time producer enqueues entries, I'd enqueue the same number of current timestamp onto shadow RB. Same thing on consumer side, except dequeue and calculate timestamp delta.
I was originally planning on writing my own lockless ring buffer in pure BPF (b/c spinlocks cannot be used w/ tracepoints yet) but was hoping I could avoid that with this patch.
TBH, I'd just do a fixed-size array and use atomic counters to enqueue and a consumer position to dequeue. But it seems like you might get away without any circular buffer, based on Jesper's suggestion.
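[Editorial note] If the custom structure ever does become necessary, a minimal sketch of what is described here might look like this; names, sizing, and the missing overrun handling are all assumptions, not anything from this thread:

	/* Purely illustrative: a fixed-size "shadow" ring of timestamps.
	 * Producers bump an atomic counter; the single consumer (the cpumap
	 * kthread side) advances its own cursor. Overrun handling omitted.
	 */
	#include <linux/bpf.h>
	#include <bpf/bpf_helpers.h>

	#define SLOTS 4096			/* assumption: power of two */

	struct shadow_ring {
		__u64 prod;			/* bumped atomically by any producer CPU */
		__u64 cons;			/* advanced only by the consumer */
		__u64 ts[SLOTS];
	};

	struct {
		__uint(type, BPF_MAP_TYPE_ARRAY);
		__uint(max_entries, 1);
		__type(key, __u32);
		__type(value, struct shadow_ring);
	} shadow SEC(".maps");

	static __always_inline void shadow_push(__u64 ts)
	{
		__u32 key = 0;
		struct shadow_ring *r = bpf_map_lookup_elem(&shadow, &key);
		__u64 slot;

		if (!r)
			return;
		/* __sync_fetch_and_add returns the old value, i.e. our slot */
		slot = __sync_fetch_and_add(&r->prod, 1);
		r->ts[slot & (SLOTS - 1)] = ts;
	}

	static __always_inline __u64 shadow_pop(void)
	{
		__u32 key = 0;
		struct shadow_ring *r = bpf_map_lookup_elem(&shadow, &key);
		__u64 ts;

		if (!r || r->cons >= r->prod)	/* empty */
			return 0;
		ts = r->ts[r->cons & (SLOTS - 1)];
		r->cons++;
		return ts;
	}

	char _license[] SEC("license") = "GPL";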
About disabling user-space mmap: yeah, that's what I was suggesting. I think it'd be a bit odd if you wanted BPF RB to support consumption from both userspace && prog at the same time. And since draining from prog is new functionality (and thus the NAND isn't a regression), you could relax the restriction later without issues.
I probably wouldn't disable mmap-ing from user space and let the user handle coordination of consumers, if necessary (we could probably expose the "busy" field through the consumer metadata page, if it's not yet, to help with that a bit). This is more flexible as it allows to alternate who's consuming, if necessary. Consumer side can't compromise kernel-side producers (if we prevent writes from the consumer side).
Still, all this feels a bit kludgy anyway. There is also the unnecessary epoll notification, which will happen by default unless notification is explicitly suppressed.
Anyway, if there is a better way to accomplish your task, see if that works first.
On Tue, Sep 10, 2024 at 2:07 PM Daniel Xu dxu@dxuuu.xyz wrote:
On Tue, Sep 10, 2024 at 01:41:41PM GMT, Andrii Nakryiko wrote:
On Tue, Sep 10, 2024 at 11:36 AM Alexei Starovoitov alexei.starovoitov@gmail.com wrote:
On Mon, Sep 9, 2024 at 5:55 PM Daniel Xu dxu@dxuuu.xyz wrote:
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 53d0556fbbf3..56bfe559f228 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -9142,7 +9142,8 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, func_id != BPF_FUNC_ringbuf_query && func_id != BPF_FUNC_ringbuf_reserve_dynptr && func_id != BPF_FUNC_ringbuf_submit_dynptr &&
func_id != BPF_FUNC_ringbuf_discard_dynptr)
func_id != BPF_FUNC_ringbuf_discard_dynptr &&
func_id != BPF_FUNC_user_ringbuf_drain) goto error; break; case BPF_MAP_TYPE_USER_RINGBUF:
@@ -9276,7 +9277,8 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, goto error; break; case BPF_FUNC_user_ringbuf_drain:
if (map->map_type != BPF_MAP_TYPE_USER_RINGBUF)
if (map->map_type != BPF_MAP_TYPE_USER_RINGBUF &&
map->map_type != BPF_MAP_TYPE_RINGBUF) goto error;
I think it should work.
Andrii,
do you see any issues with such use?
Not from a quick glance. Both ringbufs have the same memory layout, so user_ringbuf_drain() should probably work fine for normal BPF ringbuf (and either way bpf_user_ringbuf_drain() has to protect from malicious user space, so its code is pretty unassuming).
We should make it very explicit, though, that the user is responsible for making sure that bpf_user_ringbuf_drain() will not be called simultaneously in two threads, kernel or user space.
I see an atomic_try_cmpxchg() protecting the drain. So it should be safe, right? What are they supposed to expect?
User space can ignore rb->busy and start consuming in parallel. Ok, given we had this atomic_try_cmpxchg() it was rather OK for user ringbuf, but it's not so great for BPF ringbuf, way too easy to screw up...
Also, Daniel, can you please make sure that the dynptr we return for each sample is read-only? We shouldn't give the consumer BPF program the ability to corrupt ringbuf record headers (accidentally or otherwise).
Sure.
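Presumably that's just a matter of flagging the dynptr where the drain path initializes it - something like this sketch (assuming bpf_dynptr_set_rdonly() is OK to call there):

	/* in the bpf_user_ringbuf_drain() path, after the sample is peeked */
	bpf_dynptr_init(&dynptr, sample, BPF_DYNPTR_TYPE_LOCAL, 0, size);
	bpf_dynptr_set_rdonly(&dynptr);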
And as a thought exercise: I wonder what it would take to have an open-coded iterator returning these read-only dynptrs for each consumed record? Maybe we already have all the pieces in place. So consider looking into that as well.
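I'd picture usage along these lines - all of the bpf_iter_ringbuf_* kfunc names below are hypothetical, just following the existing open-coded iterator pattern (bpf_iter_num_*); only bpf_dynptr_read() is a real helper:

	struct bpf_iter_ringbuf it;	/* hypothetical iterator state */
	struct bpf_dynptr *rec;
	__u64 val;

	bpf_iter_ringbuf_new(&it, &ringbuf, 0);		/* hypothetical */
	while ((rec = bpf_iter_ringbuf_next(&it))) {	/* hypothetical */
		/* rec would be a read-only dynptr over one consumed record */
		if (bpf_dynptr_read(&val, sizeof(val), rec, 0, 0) == 0) {
			/* use the copied sample */
		}
	}
	bpf_iter_ringbuf_destroy(&it);			/* hypothetical */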
P.S. And yeah "user_" part in helper name is kind of unfortunate given it will work for both ringbufs. Can/should we add some sort of alias for this helper so it can be used with both bpf_user_ringbuf_drain() and bpf_ringbuf_drain() names?
You mean register a new helper that shares the implementation? Easy enough, but I thought we didn't want to add more UAPI helpers.
No, not a new helper. Just an alternative name for it, "bpf_ringbuf_drain()". Might not be worth doing, I haven't checked what would be the best way to do this, can't tell right now.
Thanks, Daniel
On Tue, Sep 10, 2024 at 11:36:36AM GMT, Alexei Starovoitov wrote:
On Mon, Sep 9, 2024 at 5:55 PM Daniel Xu dxu@dxuuu.xyz wrote:
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 53d0556fbbf3..56bfe559f228 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -9142,7 +9142,8 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, func_id != BPF_FUNC_ringbuf_query && func_id != BPF_FUNC_ringbuf_reserve_dynptr && func_id != BPF_FUNC_ringbuf_submit_dynptr &&
func_id != BPF_FUNC_ringbuf_discard_dynptr)
func_id != BPF_FUNC_ringbuf_discard_dynptr &&
func_id != BPF_FUNC_user_ringbuf_drain) goto error; break; case BPF_MAP_TYPE_USER_RINGBUF:
@@ -9276,7 +9277,8 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, goto error; break; case BPF_FUNC_user_ringbuf_drain:
if (map->map_type != BPF_MAP_TYPE_USER_RINGBUF)
if (map->map_type != BPF_MAP_TYPE_USER_RINGBUF &&
map->map_type != BPF_MAP_TYPE_RINGBUF) goto error;
I think it should work.
Andrii,
do you see any issues with such use?
break; case BPF_FUNC_get_stackid:
diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 9905e3739dd0..233109843d4d 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -503,7 +503,8 @@ LINKED_SKELS := test_static_linked.skel.h linked_funcs.skel.h \ LSKELS := fentry_test.c fexit_test.c fexit_sleep.c atomics.c \ trace_printk.c trace_vprintk.c map_ptr_kern.c \ core_kern.c core_kern_overflow.c test_ringbuf.c \
test_ringbuf_n.c test_ringbuf_map_key.c test_ringbuf_write.c
test_ringbuf_n.c test_ringbuf_map_key.c test_ringbuf_write.c \
test_ringbuf_bpf_to_bpf.c
Do you need it to be an lskel?
Regular skels are easier to debug.
I'm actually still unsure of the difference. But all the other tests in the file were using lskels, so I just copy/pasted.
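(For reference, a regular skeleton would mostly just drop the _lskel infix and use the struct bpf_map accessors - roughly the following, assuming the test were moved out of the LSKELS list:)

	#include "test_ringbuf_bpf_to_bpf.skel.h"	/* instead of .lskel.h */

	struct test_ringbuf_bpf_to_bpf *skel;

	skel = test_ringbuf_bpf_to_bpf__open();
	bpf_map__set_max_entries(skel->maps.ringbuf, getpagesize());
	err = test_ringbuf_bpf_to_bpf__load(skel);
	ringbuf = ring_buffer__new(bpf_map__fd(skel->maps.ringbuf), NULL, NULL, NULL);
	err = test_ringbuf_bpf_to_bpf__attach(skel);
	test_ringbuf_bpf_to_bpf__destroy(skel);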
Also, please split the selftest into a separate patch.
Ack.
# Generate both light skeleton and libbpf skeleton for these LSKELS_EXTRA := test_ksyms_module.c test_ksyms_weak.c kfunc_call_test.c \ diff --git a/tools/testing/selftests/bpf/prog_tests/ringbuf.c b/tools/testing/selftests/bpf/prog_tests/ringbuf.c index da430df45aa4..3e7955573ac5 100644 --- a/tools/testing/selftests/bpf/prog_tests/ringbuf.c +++ b/tools/testing/selftests/bpf/prog_tests/ringbuf.c @@ -17,6 +17,7 @@ #include "test_ringbuf_n.lskel.h" #include "test_ringbuf_map_key.lskel.h" #include "test_ringbuf_write.lskel.h" +#include "test_ringbuf_bpf_to_bpf.lskel.h"
#define EDONE 7777
@@ -497,6 +498,53 @@ static void ringbuf_map_key_subtest(void) test_ringbuf_map_key_lskel__destroy(skel_map_key); }
+static void ringbuf_bpf_to_bpf_subtest(void) +{
struct test_ringbuf_bpf_to_bpf_lskel *skel;
int err, i;
skel = test_ringbuf_bpf_to_bpf_lskel__open();
if (!ASSERT_OK_PTR(skel, "test_ringbuf_bpf_to_bpf_lskel__open"))
return;
skel->maps.ringbuf.max_entries = getpagesize();
skel->bss->pid = getpid();
err = test_ringbuf_bpf_to_bpf_lskel__load(skel);
if (!ASSERT_OK(err, "test_ringbuf_bpf_to_bpf_lskel__load"))
goto cleanup;
ringbuf = ring_buffer__new(skel->maps.ringbuf.map_fd, NULL, NULL, NULL);
if (!ASSERT_OK_PTR(ringbuf, "ring_buffer__new"))
goto cleanup;
err = test_ringbuf_bpf_to_bpf_lskel__attach(skel);
if (!ASSERT_OK(err, "test_ringbuf_bpf_to_bpf_lskel__attach"))
goto cleanup_ringbuf;
/* Produce N_SAMPLES samples in the ring buffer by calling getpid() */
skel->bss->value = SAMPLE_VALUE;
for (i = 0; i < N_SAMPLES; i++)
syscall(__NR_getpgid);
/* Trigger bpf-side consumption */
syscall(__NR_prctl);
This might conflict with other selftests that run in parallel.
Just load the skel and bpf_prog_run(prog_fd). No need to attach anywhere in the kernel.
Ack.
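Something like this, I assume (the lskel prog_fd field access is from memory, so double-check):

	LIBBPF_OPTS(bpf_test_run_opts, opts);
	int prog_fd;

	/* run the consumer prog directly rather than attaching to sys_prctl */
	prog_fd = skel->progs.test_ringbuf_bpf_to_bpf_consume.prog_fd;
	err = bpf_prog_test_run_opts(prog_fd, &opts);
	ASSERT_OK(err, "test_run_consume");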
pw-bot: cr