On Tue, Nov 25, 2025 at 7:00 AM Leon Hwang leon.hwang@linux.dev wrote:
Introduce BPF_F_ALL_CPUS flag support for percpu_hash and lru_percpu_hash maps to allow updating values for all CPUs with a single value for both update_elem and update_batch APIs.
Introduce BPF_F_CPU flag support for percpu_hash and lru_percpu_hash maps to allow:
- update value for specified CPU for both update_elem and update_batch
APIs.
- lookup value for specified CPU for both lookup_elem and lookup_batch
APIs.
The BPF_F_CPU flag is passed via:
- map_flags along with embedded cpu info.
- elem_flags along with embedded cpu info.
Signed-off-by: Leon Hwang leon.hwang@linux.dev
v10 -> v11:
- Drop buggy '(u32)map_flags > BPF_F_ALL_CPUS' check in htab_map_check_update_flags().
why?
- Update 'map_flags != BPF_EXIST' to '!(map_flags & BPF_EXIST)' in __htab_lru_percpu_map_update_elem().
include/linux/bpf.h | 4 +- kernel/bpf/hashtab.c | 96 ++++++++++++++++++++++++++++++-------------- kernel/bpf/syscall.c | 2 +- 3 files changed, 69 insertions(+), 33 deletions(-)
diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 01a99e3a3e51..f79d2ae27335 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2761,7 +2761,7 @@ int map_set_for_each_callback_args(struct bpf_verifier_env *env, struct bpf_func_state *caller, struct bpf_func_state *callee);
-int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value); +int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value, u64 flags); int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value, u64 flags); int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value, u64 flags); @@ -3833,6 +3833,8 @@ static inline bool bpf_map_supports_cpu_flags(enum bpf_map_type map_type) { switch (map_type) { case BPF_MAP_TYPE_PERCPU_ARRAY:
case BPF_MAP_TYPE_PERCPU_HASH:case BPF_MAP_TYPE_LRU_PERCPU_HASH: return true; default: return false;diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index c8a9b27f8663..c768bf71d60f 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -932,7 +932,7 @@ static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l) }
static void pcpu_copy_value(struct bpf_htab *htab, void __percpu *pptr,
void *value, bool onallcpus)
void *value, bool onallcpus, u64 map_flags){ void *ptr;
@@ -943,19 +943,28 @@ static void pcpu_copy_value(struct bpf_htab *htab, void __percpu *pptr, bpf_obj_free_fields(htab->map.record, ptr); } else { u32 size = round_up(htab->map.value_size, 8);
int off = 0, cpu;
void *val;int cpu;if (map_flags & BPF_F_CPU) {cpu = map_flags >> 32;ptr = per_cpu_ptr(pptr, cpu);copy_map_value(&htab->map, ptr, value);bpf_obj_free_fields(htab->map.record, ptr);return;} for_each_possible_cpu(cpu) { ptr = per_cpu_ptr(pptr, cpu);
copy_map_value_long(&htab->map, ptr, value + off);
val = (map_flags & BPF_F_ALL_CPUS) ? value : value + size * cpu;copy_map_value(&htab->map, ptr, val); bpf_obj_free_fields(htab->map.record, ptr);
off += size; } }}
static void pcpu_init_value(struct bpf_htab *htab, void __percpu *pptr,
void *value, bool onallcpus)
void *value, bool onallcpus, u64 map_flags){ /* When not setting the initial value on all cpus, zero-fill element * values for other cpus. Otherwise, bpf program has no way to ensure @@ -973,7 +982,7 @@ static void pcpu_init_value(struct bpf_htab *htab, void __percpu *pptr, zero_map_value(&htab->map, per_cpu_ptr(pptr, cpu)); } } else {
pcpu_copy_value(htab, pptr, value, onallcpus);
pcpu_copy_value(htab, pptr, value, onallcpus, map_flags); }}
@@ -985,7 +994,7 @@ static bool fd_htab_map_needs_adjust(const struct bpf_htab *htab) static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, void *value, u32 key_size, u32 hash, bool percpu, bool onallcpus,
struct htab_elem *old_elem)
struct htab_elem *old_elem, u64 map_flags){ u32 size = htab->map.value_size; bool prealloc = htab_is_prealloc(htab); @@ -1043,7 +1052,7 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, pptr = *(void __percpu **)ptr; }
pcpu_init_value(htab, pptr, value, onallcpus);
pcpu_init_value(htab, pptr, value, onallcpus, map_flags); if (!prealloc) htab_elem_set_ptr(l_new, key_size, pptr);@@ -1147,7 +1156,7 @@ static long htab_map_update_elem(struct bpf_map *map, void *key, void *value, }
l_new = alloc_htab_elem(htab, key, value, key_size, hash, false, false,
l_old);
l_old, map_flags); if (IS_ERR(l_new)) { /* all pre-allocated elements are in use or memory exhausted */ ret = PTR_ERR(l_new);@@ -1249,6 +1258,15 @@ static long htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value return ret; }
+static int htab_map_check_update_flags(bool onallcpus, u64 map_flags)
+{
+	if (unlikely(!onallcpus && map_flags > BPF_EXIST))
+		return -EINVAL;
+	if (unlikely(onallcpus && (map_flags & BPF_F_LOCK)))
+		return -EINVAL;
+	return 0;
+}
+
static long htab_map_update_elem_in_place(struct bpf_map *map, void *key, void *value, u64 map_flags, bool percpu, bool onallcpus) @@ -1262,9 +1280,9 @@ static long htab_map_update_elem_in_place(struct bpf_map *map, void *key, u32 key_size, hash; int ret;
if (unlikely(map_flags > BPF_EXIST))/* unknown flags */return -EINVAL;
ret = htab_map_check_update_flags(onallcpus, map_flags);if (unlikely(ret))return ret; WARN_ON_ONCE(!bpf_rcu_lock_held());@@ -1289,7 +1307,7 @@ static long htab_map_update_elem_in_place(struct bpf_map *map, void *key, /* Update value in-place */ if (percpu) { pcpu_copy_value(htab, htab_elem_get_ptr(l_old, key_size),
value, onallcpus);
value, onallcpus, map_flags); } else { void **inner_map_pptr = htab_elem_value(l_old, key_size);@@ -1298,7 +1316,7 @@ static long htab_map_update_elem_in_place(struct bpf_map *map, void *key, } } else { l_new = alloc_htab_elem(htab, key, value, key_size,
hash, percpu, onallcpus, NULL);
hash, percpu, onallcpus, NULL, map_flags); if (IS_ERR(l_new)) { ret = PTR_ERR(l_new); goto err;@@ -1324,9 +1342,9 @@ static long __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key, u32 key_size, hash; int ret;
if (unlikely(map_flags > BPF_EXIST))/* unknown flags */return -EINVAL;
ret = htab_map_check_update_flags(onallcpus, map_flags);if (unlikely(ret))return ret; WARN_ON_ONCE(!bpf_rcu_lock_held());@@ -1342,7 +1360,7 @@ static long __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key, * to remove older elem from htab and this removal * operation will need a bucket lock. */
if (map_flags != BPF_EXIST) {
if (!(map_flags & BPF_EXIST)) { l_new = prealloc_lru_pop(htab, key, hash); if (!l_new) return -ENOMEM;
It's not in the diff, but this is broken. You tried to allow BPF_EXIST to be combined with other flags here, but didn't update check_flags(),
so for a BPF_[NO]EXIST | BPF_F_CPU combination check_flags() will always return 0, and the BPF_[NO]EXIST flag will make no difference.
When you add features, always, always add unit tests. Patch 8 does not cover this: it tests F_CPU, but it doesn't check that BPF_EXIST | BPF_F_CPU correctly errors when an element doesn't exist.
v10 was close, but then you decided to add this BPF_EXIST feature and did it in a sloppy way. Why? Focus on one thing only. Land it and then do the next one. 11 revisions and still no go... it is not a good sign.
pw-bot: cr