Convert the raw_spinlock to rqspinlock in the bpf lru map to fix the possible deadlock described in [1]. Meanwhile, add a testcase for the deadlock.
Link: https://lore.kernel.org/bpf/CAEf4BzbTJCUx0D=zjx6+5m5iiGhwLzaP94hnw36ZMDHAf4-... [1]

Menglong Dong (2):
  bpf: use rqspinlock for lru map
  selftests/bpf: test map deadlock caused by NMI
 kernel/bpf/bpf_lru_list.c                          |  47 +++---
 kernel/bpf/bpf_lru_list.h                          |   5 +-
 .../selftests/bpf/prog_tests/map_deadlock.c        | 134 ++++++++++++++++++
 .../selftests/bpf/progs/map_deadlock.c             |  52 +++++++
 4 files changed, 217 insertions(+), 21 deletions(-)
 create mode 100644 tools/testing/selftests/bpf/prog_tests/map_deadlock.c
 create mode 100644 tools/testing/selftests/bpf/progs/map_deadlock.c
For now, a raw_spinlock is used when adding, deleting and updating elements in the bpf lru map, which can lead to a deadlock if this happens in NMI context, as described in [1].

Fix this by converting the raw_spinlock_t in bpf_lru_list and bpf_lru_locallist to rqspinlock_t.
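The conversion follows one pattern throughout the diff below; condensed from the bpf_lru_list_push_free() hunk, it looks roughly like this (a sketch only, error handling trimmed to the essentials):

```c
/* Before: an NMI that interrupts the holder of l->lock and retakes the
 * same lock spins forever.
 */
raw_spin_lock_irqsave(&l->lock, flags);
__bpf_lru_node_move(l, node, BPF_LRU_LIST_T_FREE);
raw_spin_unlock_irqrestore(&l->lock, flags);

/* After: raw_res_spin_lock_irqsave() returns non-zero instead of
 * deadlocking, and the LRU operation is simply skipped in that case.
 */
if (raw_res_spin_lock_irqsave(&l->lock, flags))
	return;
__bpf_lru_node_move(l, node, BPF_LRU_LIST_T_FREE);
raw_res_spin_unlock_irqrestore(&l->lock, flags);
```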
Link: https://lore.kernel.org/bpf/CAEf4BzbTJCUx0D=zjx6+5m5iiGhwLzaP94hnw36ZMDHAf4-...
Signed-off-by: Menglong Dong <dongml2@chinatelecom.cn>
---
 kernel/bpf/bpf_lru_list.c | 47 +++++++++++++++++++++++----------------
 kernel/bpf/bpf_lru_list.h |  5 +++--
 2 files changed, 31 insertions(+), 21 deletions(-)
diff --git a/kernel/bpf/bpf_lru_list.c b/kernel/bpf/bpf_lru_list.c
index e7a2fc60523f..38fddcb1e28c 100644
--- a/kernel/bpf/bpf_lru_list.c
+++ b/kernel/bpf/bpf_lru_list.c
@@ -307,9 +307,10 @@ static void bpf_lru_list_push_free(struct bpf_lru_list *l,
 	if (WARN_ON_ONCE(IS_LOCAL_LIST_TYPE(node->type)))
 		return;
 
-	raw_spin_lock_irqsave(&l->lock, flags);
+	if (raw_res_spin_lock_irqsave(&l->lock, flags))
+		return;
 	__bpf_lru_node_move(l, node, BPF_LRU_LIST_T_FREE);
-	raw_spin_unlock_irqrestore(&l->lock, flags);
+	raw_res_spin_unlock_irqrestore(&l->lock, flags);
 }
 
 static void bpf_lru_list_pop_free_to_local(struct bpf_lru *lru,
@@ -319,7 +320,8 @@ static void bpf_lru_list_pop_free_to_local(struct bpf_lru *lru,
 	struct bpf_lru_node *node, *tmp_node;
 	unsigned int nfree = 0;
 
-	raw_spin_lock(&l->lock);
+	if (raw_res_spin_lock(&l->lock))
+		return;
 
 	__local_list_flush(l, loc_l);
 
@@ -338,7 +340,7 @@ static void bpf_lru_list_pop_free_to_local(struct bpf_lru *lru,
 				      local_free_list(loc_l),
 				      BPF_LRU_LOCAL_LIST_T_FREE);
 
-	raw_spin_unlock(&l->lock);
+	raw_res_spin_unlock(&l->lock);
 }
 
 static void __local_list_add_pending(struct bpf_lru *lru,
@@ -404,7 +406,8 @@ static struct bpf_lru_node *bpf_percpu_lru_pop_free(struct bpf_lru *lru,
 
 	l = per_cpu_ptr(lru->percpu_lru, cpu);
 
-	raw_spin_lock_irqsave(&l->lock, flags);
+	if (raw_res_spin_lock_irqsave(&l->lock, flags))
+		return NULL;
 
 	__bpf_lru_list_rotate(lru, l);
 
@@ -420,7 +423,7 @@ static struct bpf_lru_node *bpf_percpu_lru_pop_free(struct bpf_lru *lru,
 		__bpf_lru_node_move(l, node, BPF_LRU_LIST_T_INACTIVE);
 	}
 
-	raw_spin_unlock_irqrestore(&l->lock, flags);
+	raw_res_spin_unlock_irqrestore(&l->lock, flags);
 
 	return node;
 }
@@ -437,7 +440,8 @@ static struct bpf_lru_node *bpf_common_lru_pop_free(struct bpf_lru *lru,
 
 	loc_l = per_cpu_ptr(clru->local_list, cpu);
 
-	raw_spin_lock_irqsave(&loc_l->lock, flags);
+	if (raw_res_spin_lock_irqsave(&loc_l->lock, flags))
+		return NULL;
 
 	node = __local_list_pop_free(loc_l);
 	if (!node) {
@@ -448,7 +452,7 @@ static struct bpf_lru_node *bpf_common_lru_pop_free(struct bpf_lru *lru,
 	if (node)
 		__local_list_add_pending(lru, loc_l, cpu, node, hash);
 
-	raw_spin_unlock_irqrestore(&loc_l->lock, flags);
+	raw_res_spin_unlock_irqrestore(&loc_l->lock, flags);
 
 	if (node)
 		return node;
@@ -466,23 +470,26 @@ static struct bpf_lru_node *bpf_common_lru_pop_free(struct bpf_lru *lru,
 	do {
 		steal_loc_l = per_cpu_ptr(clru->local_list, steal);
 
-		raw_spin_lock_irqsave(&steal_loc_l->lock, flags);
+		if (raw_res_spin_lock_irqsave(&steal_loc_l->lock, flags))
+			goto out_next;
 
 		node = __local_list_pop_free(steal_loc_l);
 		if (!node)
 			node = __local_list_pop_pending(lru, steal_loc_l);
 
-		raw_spin_unlock_irqrestore(&steal_loc_l->lock, flags);
+		raw_res_spin_unlock_irqrestore(&steal_loc_l->lock, flags);
 
+out_next:
 		steal = cpumask_next_wrap(steal, cpu_possible_mask);
 	} while (!node && steal != first_steal);
 
 	loc_l->next_steal = steal;
 
 	if (node) {
-		raw_spin_lock_irqsave(&loc_l->lock, flags);
+		if (raw_res_spin_lock_irqsave(&loc_l->lock, flags))
+			return NULL;
 		__local_list_add_pending(lru, loc_l, cpu, node, hash);
-		raw_spin_unlock_irqrestore(&loc_l->lock, flags);
+		raw_res_spin_unlock_irqrestore(&loc_l->lock, flags);
 	}
 
 	return node;
@@ -511,10 +518,11 @@ static void bpf_common_lru_push_free(struct bpf_lru *lru,
 
 		loc_l = per_cpu_ptr(lru->common_lru.local_list, node->cpu);
 
-		raw_spin_lock_irqsave(&loc_l->lock, flags);
+		if (raw_res_spin_lock_irqsave(&loc_l->lock, flags))
+			return;
 
 		if (unlikely(node->type != BPF_LRU_LOCAL_LIST_T_PENDING)) {
-			raw_spin_unlock_irqrestore(&loc_l->lock, flags);
+			raw_res_spin_unlock_irqrestore(&loc_l->lock, flags);
 			goto check_lru_list;
 		}
 
@@ -522,7 +530,7 @@ static void bpf_common_lru_push_free(struct bpf_lru *lru,
 		bpf_lru_node_clear_ref(node);
 		list_move(&node->list, local_free_list(loc_l));
 
-		raw_spin_unlock_irqrestore(&loc_l->lock, flags);
+		raw_res_spin_unlock_irqrestore(&loc_l->lock, flags);
 		return;
 	}
 
@@ -538,11 +546,12 @@ static void bpf_percpu_lru_push_free(struct bpf_lru *lru,
 
 	l = per_cpu_ptr(lru->percpu_lru, node->cpu);
 
-	raw_spin_lock_irqsave(&l->lock, flags);
+	if (raw_res_spin_lock_irqsave(&l->lock, flags))
+		return;
 
 	__bpf_lru_node_move(l, node, BPF_LRU_LIST_T_FREE);
 
-	raw_spin_unlock_irqrestore(&l->lock, flags);
+	raw_res_spin_unlock_irqrestore(&l->lock, flags);
 }
 
 void bpf_lru_push_free(struct bpf_lru *lru, struct bpf_lru_node *node)
@@ -625,7 +634,7 @@ static void bpf_lru_locallist_init(struct bpf_lru_locallist *loc_l, int cpu)
 
 	loc_l->next_steal = cpu;
 
-	raw_spin_lock_init(&loc_l->lock);
+	raw_res_spin_lock_init(&loc_l->lock);
 }
 
 static void bpf_lru_list_init(struct bpf_lru_list *l)
@@ -640,7 +649,7 @@ static void bpf_lru_list_init(struct bpf_lru_list *l)
 
 	l->next_inactive_rotation = &l->lists[BPF_LRU_LIST_T_INACTIVE];
 
-	raw_spin_lock_init(&l->lock);
+	raw_res_spin_lock_init(&l->lock);
 }
 
 int bpf_lru_init(struct bpf_lru *lru, bool percpu, u32 hash_offset,
diff --git a/kernel/bpf/bpf_lru_list.h b/kernel/bpf/bpf_lru_list.h
index fe2661a58ea9..61fc7d7f9de1 100644
--- a/kernel/bpf/bpf_lru_list.h
+++ b/kernel/bpf/bpf_lru_list.h
@@ -7,6 +7,7 @@
 #include <linux/cache.h>
 #include <linux/list.h>
 #include <linux/spinlock_types.h>
+#include <asm-generic/rqspinlock.h>
 
 #define NR_BPF_LRU_LIST_T	(3)
 #define NR_BPF_LRU_LIST_COUNT	(2)
@@ -34,13 +35,13 @@ struct bpf_lru_list {
 	/* The next inactive list rotation starts from here */
 	struct list_head *next_inactive_rotation;
 
-	raw_spinlock_t lock ____cacheline_aligned_in_smp;
+	rqspinlock_t lock ____cacheline_aligned_in_smp;
 };
 
 struct bpf_lru_locallist {
 	struct list_head lists[NR_BPF_LRU_LOCAL_LIST_T];
 	u16 next_steal;
-	raw_spinlock_t lock;
+	rqspinlock_t lock;
 };
 
 struct bpf_common_lru {
```
commit 8ae9fd98fe53c45a1a89adf6d254c2db90d26544
Author: Menglong Dong <menglong8.dong@gmail.com>
bpf: use rqspinlock for lru map
This patch converts raw_spinlock_t to rqspinlock_t in BPF LRU maps to prevent NMI context deadlocks. The new lock API can fail gracefully instead of deadlocking, returning an error code.
Link: https://lore.kernel.org/bpf/CAEf4BzbTJCUx0D=zjx6+5m5iiGhwLzaP94hnw36ZMDHAf4-...
diff --git a/kernel/bpf/bpf_lru_list.c b/kernel/bpf/bpf_lru_list.c
index e7a2fc605..38fddcb1e 100644
--- a/kernel/bpf/bpf_lru_list.c
+++ b/kernel/bpf/bpf_lru_list.c
[ ... ]
@@ -466,23 +470,26 @@ static struct bpf_lru_node *bpf_common_lru_pop_free(struct bpf_lru *lru,
 	do {
 		steal_loc_l = per_cpu_ptr(clru->local_list, steal);
 
-		raw_spin_lock_irqsave(&steal_loc_l->lock, flags);
+		if (raw_res_spin_lock_irqsave(&steal_loc_l->lock, flags))
+			goto out_next;
 
 		node = __local_list_pop_free(steal_loc_l);
 		if (!node)
 			node = __local_list_pop_pending(lru, steal_loc_l);
 
-		raw_spin_unlock_irqrestore(&steal_loc_l->lock, flags);
+		raw_res_spin_unlock_irqrestore(&steal_loc_l->lock, flags);
 
+out_next:
 		steal = cpumask_next_wrap(steal, cpu_possible_mask);
 	} while (!node && steal != first_steal);
 
 	loc_l->next_steal = steal;
 
 	if (node) {
-		raw_spin_lock_irqsave(&loc_l->lock, flags);
+		if (raw_res_spin_lock_irqsave(&loc_l->lock, flags))
+			return NULL;
			^^^^^^^^^^^
Can this leak the node? At this point in bpf_common_lru_pop_free(), a node was successfully stolen from another CPU's list. The __local_list_pop_free() or __local_list_pop_pending() call removed the node from steal_loc_l, and steal_loc_l was unlocked. The node now exists but is not in any list.
When raw_res_spin_lock_irqsave() fails here, the function returns NULL, but the stolen node is never added to the local pending list and never returned to any list. The node becomes orphaned.
In NMI context (which this patch is designed to handle), lock failures are expected to be common. Each failure would leak one pre-allocated node from the BPF LRU pool, eventually exhausting the available nodes.
 		__local_list_add_pending(lru, loc_l, cpu, node, hash);
-		raw_spin_unlock_irqrestore(&loc_l->lock, flags);
+		raw_res_spin_unlock_irqrestore(&loc_l->lock, flags);
 	}
 
 	return node;
[ ... ]
```
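To make the concern concrete, here is a minimal userspace illustration of the same hazard (illustration only, not a proposed fix for this patch; all names such as pool_steal() and pool_give_back() are hypothetical): with a lock API that can fail, every failure path must hand the already-claimed resource back rather than simply return.

```c
/* Illustration only: a try-lock style API means a failed acquisition must
 * return the already-claimed resource, or it is leaked (the "orphaned node"
 * case described above). All names here are hypothetical.
 */
#include <pthread.h>
#include <stddef.h>

struct node { int id; };

static pthread_mutex_t pool_lock = PTHREAD_MUTEX_INITIALIZER;
static struct node *pool[16];
static int pool_top;

/* stand-in for "steal a node from another CPU's list" */
static struct node *pool_steal(void)
{
	struct node *n;

	pthread_mutex_lock(&pool_lock);
	n = pool_top ? pool[--pool_top] : NULL;
	pthread_mutex_unlock(&pool_lock);
	return n;
}

static void pool_give_back(struct node *n)
{
	pthread_mutex_lock(&pool_lock);
	pool[pool_top++] = n;
	pthread_mutex_unlock(&pool_lock);
}

static pthread_mutex_t local_lock = PTHREAD_MUTEX_INITIALIZER;

struct node *pop_free(void)
{
	struct node *n = pool_steal();

	if (!n)
		return NULL;

	if (pthread_mutex_trylock(&local_lock)) {
		/* Returning NULL here without pool_give_back() would leak n,
		 * which is the pattern flagged in the quoted hunk.
		 */
		pool_give_back(n);
		return NULL;
	}
	/* ... add n to the local pending list ... */
	pthread_mutex_unlock(&local_lock);
	return n;
}
```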
---
AI reviewed your patch. Please fix the bug or email reply why it's not a bug.
See: https://github.com/kernel-patches/vmtest/blob/master/ci/claude/README.md
In-Reply-To-Subject: `bpf: use rqspinlock for lru map`
CI run summary: https://github.com/kernel-patches/bpf/actions/runs/18928591681
In this test, map updates and deletions happen in both NMI context and user context, in order to detect possible deadlocks.
For now, only the LRU map is covered, and more map types can be added in the future.
Signed-off-by: Menglong Dong <dongml2@chinatelecom.cn>
---
 .../selftests/bpf/prog_tests/map_deadlock.c        | 136 ++++++++++++++++++
 .../selftests/bpf/progs/map_deadlock.c             |  52 +++++++
 2 files changed, 188 insertions(+)
 create mode 100644 tools/testing/selftests/bpf/prog_tests/map_deadlock.c
 create mode 100644 tools/testing/selftests/bpf/progs/map_deadlock.c
diff --git a/tools/testing/selftests/bpf/prog_tests/map_deadlock.c b/tools/testing/selftests/bpf/prog_tests/map_deadlock.c
new file mode 100644
index 000000000000..17fcf1f5efa6
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/map_deadlock.c
@@ -0,0 +1,136 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <test_progs.h>
+#include <bpf/libbpf.h>
+#include <linux/perf_event.h>
+#include <sys/syscall.h>
+#include <sys/ioctl.h>
+#include <pthread.h>
+#include "map_deadlock.skel.h"
+
+
+static int perf_open_all_cpus(struct perf_event_attr *attr, int fds[], int max_cpus)
+{
+	int n = 0;
+
+	for (int cpu = 0; cpu < max_cpus; cpu++) {
+		int fd = syscall(__NR_perf_event_open, attr, -1 /* pid: all */, cpu,
+				 -1 /* group_fd */, PERF_FLAG_FD_CLOEXEC);
+		if (fd < 0)
+			continue;
+		fds[cpu] = fd;
+		n++;
+	}
+	return n;
+}
+
+struct thread_arg {
+	int map_fd;
+	bool *stop;
+};
+
+static void *user_update_thread(void *argp)
+{
+	struct thread_arg *arg = argp;
+	u32 key = 0;
+	u64 val = 1;
+
+	while (!*arg->stop) {
+		key++;
+		val++;
+		bpf_map_update_elem(arg->map_fd, &key, &val, BPF_ANY);
+		if ((key & 0x7) == 0)
+			bpf_map_delete_elem(arg->map_fd, &key);
+	}
+	return NULL;
+}
+
+static void test_map(const char *map_name, int map_index)
+{
+	struct perf_event_attr attr = {
+		.type = PERF_TYPE_HARDWARE,
+		.size = sizeof(struct perf_event_attr),
+		.config = PERF_COUNT_HW_CPU_CYCLES,
+		.sample_period = 1000000,
+		.freq = 0,
+		.disabled = 0,
+		.wakeup_events = 1,
+	};
+	int map_fd, nfd = 0, max_cpus, err;
+	struct bpf_link **links = NULL;
+	struct map_deadlock *skel;
+	struct bpf_program *prog;
+	struct thread_arg targ;
+	bool stop = false;
+	int *fds = NULL;
+	pthread_t thr;
+
+	skel = map_deadlock__open();
+	if (!ASSERT_OK_PTR(skel, "map_deadlock__open"))
+		return;
+	skel->rodata->map_index = map_index;
+	err = map_deadlock__load(skel);
+	if (!ASSERT_OK(err, "map_deadlock__load"))
+		goto out;
+
+	prog = skel->progs.on_perf;
+	map_fd = bpf_object__find_map_fd_by_name(skel->obj, map_name);
+	if (!ASSERT_GE(map_fd, 0, map_name))
+		goto out;
+
+	max_cpus = libbpf_num_possible_cpus();
+	if (!ASSERT_GT(max_cpus, 0, "num cpus"))
+		goto out;
+
+	links = calloc(max_cpus, sizeof(*links));
+	ASSERT_OK_PTR(links, "alloc links");
+	fds = calloc(max_cpus, sizeof(*fds));
+	ASSERT_OK_PTR(fds, "alloc fds");
+	for (int i = 0; i < max_cpus; i++)
+		fds[i] = -1;
+
+	nfd = perf_open_all_cpus(&attr, fds, max_cpus);
+	if (!ASSERT_GT(nfd, 0, "perf fds"))
+		goto out;
+
+	for (int cpu = 0; cpu < max_cpus; cpu++) {
+		if (fds[cpu] < 0)
+			continue;
+		links[cpu] = bpf_program__attach_perf_event(prog, fds[cpu]);
+		if (!ASSERT_OK_PTR(links[cpu], "attach perf"))
+			goto out;
+	}
+
+	targ.map_fd = map_fd;
+	targ.stop = &stop;
+	err = pthread_create(&thr, NULL, user_update_thread, &targ);
+	if (!ASSERT_OK(err, "create thr"))
+		goto out;
+
+	/* 1 second should be enough to trigger the deadlock */
+	sleep(1);
+	stop = true;
+	(void)pthread_join(thr, NULL);
+	/* TODO: read dmesg to check the deadlock? */
+out:
+	if (links) {
+		for (int cpu = 0; cpu < max_cpus; cpu++) {
+			if (links[cpu])
+				bpf_link__destroy(links[cpu]);
+		}
+	}
+	if (fds) {
+		for (int cpu = 0; cpu < max_cpus; cpu++) {
+			if (fds[cpu] >= 0)
+				close(fds[cpu]);
+		}
+	}
+	free(links);
+	free(fds);
+	map_deadlock__destroy(skel);
+}
+
+void test_map_deadlock(void)
+{
+	if (test__start_subtest("lru"))
+		test_map("lru_map", 0);
+}
diff --git a/tools/testing/selftests/bpf/progs/map_deadlock.c b/tools/testing/selftests/bpf/progs/map_deadlock.c
new file mode 100644
index 000000000000..6966224955fc
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/map_deadlock.c
@@ -0,0 +1,52 @@
+// SPDX-License-Identifier: GPL-2.0
+#include "vmlinux.h"
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+char LICENSE[] SEC("license") = "GPL";
+
+struct lru_map {
+	__uint(type, BPF_MAP_TYPE_LRU_HASH);
+	__uint(max_entries, 1024);
+	__type(key, u32);
+	__type(value, u64);
+} lru_map SEC(".maps");
+
+struct map_list {
+	__uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS);
+	__uint(max_entries, 1);
+	__uint(key_size, sizeof(int));
+	__uint(value_size, sizeof(int));
+	__array(values, struct lru_map);
+} map_list SEC(".maps") = {
+	.values = { [0] = &lru_map },
+};
+
+const volatile int map_index;
+
+static __always_inline void do_update_delete(void *map)
+{
+	u64 ts = bpf_ktime_get_ns();
+	u32 key = (u32)(ts >> 12);
+	u64 val = ts;
+
+	if ((ts & 1) == 0)
+		bpf_map_update_elem(map, &key, &val, BPF_ANY);
+	else
+		bpf_map_delete_elem(map, &key);
+}
+
+SEC("perf_event")
+int on_perf(struct bpf_perf_event_data *ctx)
+{
+	int key = map_index;
+	void *target_map;
+
+	target_map = bpf_map_lookup_elem(&map_list, &key);
+	if (!target_map)
+		return 0;
+
+	for (int i = 0; i < 4; i++)
+		do_update_delete(target_map);
+	return 0;
+}