From: Uladzislau Rezki <urezki@gmail.com>
commit 3c5d61ae919cc377c71118ccc76fa6e8518023f8 upstream.
Add a kvfree_rcu_barrier() function. It waits until all in-flight pointers are freed over the RCU machinery. It does not wait for any grace-period completion and is within its rights to return immediately if there are no outstanding pointers.
This function is useful when there is a need to guarantee that memory is fully freed before destroying memory caches, for example when unloading a kernel module.
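As a rough, hedged sketch of that use case (the cache and function names below are hypothetical and not part of this patch), the intended ordering in a module exit path would be:

static struct kmem_cache *foo_cache;

static void __exit foo_exit(void)
{
	/*
	 * Make sure every object previously handed to kvfree_rcu()
	 * has actually been freed before its backing cache goes away.
	 */
	kvfree_rcu_barrier();
	kmem_cache_destroy(foo_cache);
}
module_exit(foo_exit);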
Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
---
Changes since v1 [1]:
- Added SOB, per Greg KH
[1] https://lore.kernel.org/all/20241021171003.2907935-1-surenb@google.com/
 include/linux/rcutiny.h |   5 ++
 include/linux/rcutree.h |   1 +
 kernel/rcu/tree.c       | 109 +++++++++++++++++++++++++++++++++++++---
 3 files changed, 107 insertions(+), 8 deletions(-)

diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h
index d9ac7b136aea..522123050ff8 100644
--- a/include/linux/rcutiny.h
+++ b/include/linux/rcutiny.h
@@ -111,6 +111,11 @@ static inline void __kvfree_call_rcu(struct rcu_head *head, void *ptr)
 	kvfree(ptr);
 }
 
+static inline void kvfree_rcu_barrier(void)
+{
+	rcu_barrier();
+}
+
 #ifdef CONFIG_KASAN_GENERIC
 void kvfree_call_rcu(struct rcu_head *head, void *ptr);
 #else
diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h
index 254244202ea9..58e7db80f3a8 100644
--- a/include/linux/rcutree.h
+++ b/include/linux/rcutree.h
@@ -35,6 +35,7 @@ static inline void rcu_virt_note_context_switch(void)
 
 void synchronize_rcu_expedited(void);
 void kvfree_call_rcu(struct rcu_head *head, void *ptr);
+void kvfree_rcu_barrier(void);
 
 void rcu_barrier(void);
 void rcu_momentary_dyntick_idle(void);
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index e641cc681901..be00aac5f4e7 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -3584,18 +3584,15 @@ kvfree_rcu_drain_ready(struct kfree_rcu_cpu *krcp)
 }
 
 /*
- * This function is invoked after the KFREE_DRAIN_JIFFIES timeout.
+ * Return: %true if a work is queued, %false otherwise.
  */
-static void kfree_rcu_monitor(struct work_struct *work)
+static bool
+kvfree_rcu_queue_batch(struct kfree_rcu_cpu *krcp)
 {
-	struct kfree_rcu_cpu *krcp = container_of(work,
-		struct kfree_rcu_cpu, monitor_work.work);
 	unsigned long flags;
+	bool queued = false;
 	int i, j;
 
-	// Drain ready for reclaim.
-	kvfree_rcu_drain_ready(krcp);
-
 	raw_spin_lock_irqsave(&krcp->lock, flags);
 
 	// Attempt to start a new batch.
@@ -3634,11 +3631,27 @@ static void kfree_rcu_monitor(struct work_struct *work)
 			// be that the work is in the pending state when
 			// channels have been detached following by each
 			// other.
-			queue_rcu_work(system_wq, &krwp->rcu_work);
+			queued = queue_rcu_work(system_wq, &krwp->rcu_work);
 		}
 	}
 
 	raw_spin_unlock_irqrestore(&krcp->lock, flags);
+	return queued;
+}
+
+/*
+ * This function is invoked after the KFREE_DRAIN_JIFFIES timeout.
+ */
+static void kfree_rcu_monitor(struct work_struct *work)
+{
+	struct kfree_rcu_cpu *krcp = container_of(work,
+		struct kfree_rcu_cpu, monitor_work.work);
+
+	// Drain ready for reclaim.
+	kvfree_rcu_drain_ready(krcp);
+
+	// Queue a batch for a rest.
+	kvfree_rcu_queue_batch(krcp);
 
 	// If there is nothing to detach, it means that our job is
 	// successfully done here. In case of having at least one
@@ -3859,6 +3872,86 @@ void kvfree_call_rcu(struct rcu_head *head, void *ptr)
 }
 EXPORT_SYMBOL_GPL(kvfree_call_rcu);
 
+/**
+ * kvfree_rcu_barrier - Wait until all in-flight kvfree_rcu() complete.
+ *
+ * Note that a single argument of kvfree_rcu() call has a slow path that
+ * triggers synchronize_rcu() following by freeing a pointer. It is done
+ * before the return from the function. Therefore for any single-argument
+ * call that will result in a kfree() to a cache that is to be destroyed
+ * during module exit, it is developer's responsibility to ensure that all
+ * such calls have returned before the call to kmem_cache_destroy().
+ */
+void kvfree_rcu_barrier(void)
+{
+	struct kfree_rcu_cpu_work *krwp;
+	struct kfree_rcu_cpu *krcp;
+	bool queued;
+	int i, cpu;
+
+	/*
+	 * Firstly we detach objects and queue them over an RCU-batch
+	 * for all CPUs. Finally queued works are flushed for each CPU.
+	 *
+	 * Please note. If there are outstanding batches for a particular
+	 * CPU, those have to be finished first following by queuing a new.
+	 */
+	for_each_possible_cpu(cpu) {
+		krcp = per_cpu_ptr(&krc, cpu);
+
+		/*
+		 * Check if this CPU has any objects which have been queued for a
+		 * new GP completion. If not(means nothing to detach), we are done
+		 * with it. If any batch is pending/running for this "krcp", below
+		 * per-cpu flush_rcu_work() waits its completion(see last step).
+		 */
+		if (!need_offload_krc(krcp))
+			continue;
+
+		while (1) {
+			/*
+			 * If we are not able to queue a new RCU work it means:
+			 * - batches for this CPU are still in flight which should
+			 *   be flushed first and then repeat;
+			 * - no objects to detach, because of concurrency.
+			 */
+			queued = kvfree_rcu_queue_batch(krcp);
+
+			/*
+			 * Bail out, if there is no need to offload this "krcp"
+			 * anymore. As noted earlier it can run concurrently.
+			 */
+			if (queued || !need_offload_krc(krcp))
+				break;
+
+			/* There are ongoing batches. */
+			for (i = 0; i < KFREE_N_BATCHES; i++) {
+				krwp = &(krcp->krw_arr[i]);
+				flush_rcu_work(&krwp->rcu_work);
+			}
+		}
+	}
+
+	/*
+	 * Now we guarantee that all objects are flushed.
+	 */
+	for_each_possible_cpu(cpu) {
+		krcp = per_cpu_ptr(&krc, cpu);
+
+		/*
+		 * A monitor work can drain ready to reclaim objects
+		 * directly. Wait its completion if running or pending.
+		 */
+		cancel_delayed_work_sync(&krcp->monitor_work);
+
+		for (i = 0; i < KFREE_N_BATCHES; i++) {
+			krwp = &(krcp->krw_arr[i]);
+			flush_rcu_work(&krwp->rcu_work);
+		}
+	}
+}
+EXPORT_SYMBOL_GPL(kvfree_rcu_barrier);
+
 static unsigned long
 kfree_rcu_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
 {
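A side note on the kernel-doc above: the single-argument form is the one the barrier cannot help with, because its slow path may free synchronously before returning. A hedged sketch of the two call forms (struct and function names are made up for illustration; kvfree_rcu_mightsleep() is the single-argument variant in recent kernels):

struct foo {
	struct rcu_head rcu;
	/* payload */
};

static void foo_release(struct foo *f)
{
	/* Two-argument form: the free is deferred past a grace period,
	 * so kvfree_rcu_barrier() is what waits for it to complete. */
	kvfree_rcu(f, rcu);
}

static void bar_release(void *p)
{
	/* Single-argument form: its slow path calls synchronize_rcu() and
	 * frees before returning, so the caller must simply ensure this
	 * call has returned before kmem_cache_destroy(). */
	kvfree_rcu_mightsleep(p);
}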
base-commit: 163b38476c50d64b89c1dfdf4fd57a368b6ebbec
From: Florian Westphal <fw@strlen.de>
commit dc783ba4b9df3fb3e76e968b2cbeb9960069263c upstream.
Ben Greear reports following splat:
 ------------[ cut here ]------------
 net/netfilter/nf_nat_core.c:1114 module nf_nat func:nf_nat_register_fn has 256 allocated at module unload
 WARNING: CPU: 1 PID: 10421 at lib/alloc_tag.c:168 alloc_tag_module_unload+0x22b/0x3f0
 Modules linked in: nf_nat(-) btrfs ufs qnx4 hfsplus hfs minix vfat msdos fat ...
 Hardware name: Default string Default string/SKYBAY, BIOS 5.12 08/04/2020
 RIP: 0010:alloc_tag_module_unload+0x22b/0x3f0
  codetag_unload_module+0x19b/0x2a0
  ? codetag_load_module+0x80/0x80
nf_nat module exit calls kfree_rcu on those addresses, but the free operation is likely still pending by the time alloc_tag checks for leaks.
Waiting for outstanding kfree_rcu operations to complete before the check resolves this warning.
Reproducer:
 unshare -n iptables-nft -t nat -A PREROUTING -p tcp
 grep nf_nat /proc/allocinfo # will list 4 allocations
 rmmod nft_chain_nat
 rmmod nf_nat # will WARN.
Link: https://lkml.kernel.org/r/20241007205236.11847-1-fw@strlen.de
Fixes: a473573964e5 ("lib: code tagging module support")
Signed-off-by: Florian Westphal <fw@strlen.de>
Reported-by: Ben Greear <greearb@candelatech.com>
Closes: https://lore.kernel.org/netdev/bdaaef9d-4364-4171-b82b-bcfc12e207eb@candelat...
Cc: Uladzislau Rezki <urezki@gmail.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Kent Overstreet <kent.overstreet@linux.dev>
Cc: stable@vger.kernel.org
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
---
Changes since v1 [1]:
- Added SOB, per Greg KH
[1] https://lore.kernel.org/all/20241021171003.2907935-2-surenb@google.com/
 lib/codetag.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/lib/codetag.c b/lib/codetag.c
index afa8a2d4f317..d1fbbb7c2ec3 100644
--- a/lib/codetag.c
+++ b/lib/codetag.c
@@ -228,6 +228,9 @@ bool codetag_unload_module(struct module *mod)
 	if (!mod)
 		return true;
 
+	/* await any module's kfree_rcu() operations to complete */
+	kvfree_rcu_barrier();
+
 	mutex_lock(&codetag_lock);
 	list_for_each_entry(cttype, &codetag_types, link) {
 		struct codetag_module *found = NULL;
On Wed, Nov 06, 2024 at 09:09:26AM -0800, Suren Baghdasaryan wrote:
> From: Uladzislau Rezki <urezki@gmail.com>
>
> commit 3c5d61ae919cc377c71118ccc76fa6e8518023f8 upstream.
No, that's not the right git id :(
This should be 2b55d6a42d14c8675e38d6d9adca3014fdf01951, right? Let me go change this...
thanks,
greg k-h
On Thu, Nov 07, 2024 at 07:37:43AM +0100, Greg KH wrote:
> On Wed, Nov 06, 2024 at 09:09:26AM -0800, Suren Baghdasaryan wrote:
> > From: Uladzislau Rezki <urezki@gmail.com>
> >
> > commit 3c5d61ae919cc377c71118ccc76fa6e8518023f8 upstream.
>
> No, that's not the right git id :(
But it is the git id of the fixup patch that I also need to take, so now grabbed, thanks for making me look :)
greg k-h
On Wed, Nov 6, 2024 at 10:46 PM Greg KH <gregkh@linuxfoundation.org> wrote:
> On Thu, Nov 07, 2024 at 07:37:43AM +0100, Greg KH wrote:
> > On Wed, Nov 06, 2024 at 09:09:26AM -0800, Suren Baghdasaryan wrote:
> > > From: Uladzislau Rezki <urezki@gmail.com>
> > >
> > > commit 3c5d61ae919cc377c71118ccc76fa6e8518023f8 upstream.
> >
> > No, that's not the right git id :(
>
> But it is the git id of the fixup patch that I also need to take, so now
> grabbed, thanks for making me look :)
Sorry for the confusion. Sounds like you sorted it out :)
> greg k-h