Hi Greg, Sasha,
This batch contains a backport fix for 5.4 -stable.
The following list shows the backported patches, I am using original commit IDs for reference:
1) 8965d42bcf54 ("netfilter: nf_tables: pass nft_chain to destroy function, not nft_ctx")
This is a stable dependency for the next patch.
2) c03d278fdf35 ("netfilter: nf_tables: wait for rcu grace period on net_device removal")
3) b04df3da1b5c ("netfilter: nf_tables: do not defer rule destruction via call_rcu")
This is a fix-for-fix for patch 2.
These three patches are required to fix the netdevice release path for netdev family basechains.
NOTE: -stable kernels >= 5.4 already provide these backport fixes.
Please, apply, Thanks
** BLURB HERE ***
Florian Westphal (2): netfilter: nf_tables: pass nft_chain to destroy function, not nft_ctx netfilter: nf_tables: do not defer rule destruction via call_rcu
Pablo Neira Ayuso (1): netfilter: nf_tables: wait for rcu grace period on net_device removal
net/netfilter/nf_tables_api.c | 51 ++++++++++++++++++++++++----------- 1 file changed, 36 insertions(+), 15 deletions(-)
From: Florian Westphal fw@strlen.de
commit 8965d42bcf54d42cbc72fe34a9d0ec3f8527debd upstream.
It would be better to not store nft_ctx inside nft_trans object, the netlink ctx strucutre is huge and most of its information is never needed in places that use trans->ctx.
Avoid/reduce its usage if possible, no runtime behaviour change intended.
Signed-off-by: Florian Westphal fw@strlen.de Signed-off-by: Pablo Neira Ayuso pablo@netfilter.org --- net/netfilter/nf_tables_api.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-)
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 7812cc3cc751..c6f142b5e64f 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -1675,10 +1675,8 @@ static void nf_tables_chain_free_chain_rules(struct nft_chain *chain) kvfree(chain->rules_next); }
-static void nf_tables_chain_destroy(struct nft_ctx *ctx) +void nf_tables_chain_destroy(struct nft_chain *chain) { - struct nft_chain *chain = ctx->chain; - if (WARN_ON(chain->use > 0)) return;
@@ -1929,7 +1927,7 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask, err_use: nf_tables_unregister_hook(net, table, chain); err1: - nf_tables_chain_destroy(ctx); + nf_tables_chain_destroy(chain);
return err; } @@ -6905,7 +6903,7 @@ static void nft_commit_release(struct nft_trans *trans) kfree(nft_trans_chain_name(trans)); break; case NFT_MSG_DELCHAIN: - nf_tables_chain_destroy(&trans->ctx); + nf_tables_chain_destroy(trans->ctx.chain); break; case NFT_MSG_DELRULE: nf_tables_rule_destroy(&trans->ctx, nft_trans_rule(trans)); @@ -7582,7 +7580,7 @@ static void nf_tables_abort_release(struct nft_trans *trans) nf_tables_table_destroy(&trans->ctx); break; case NFT_MSG_NEWCHAIN: - nf_tables_chain_destroy(&trans->ctx); + nf_tables_chain_destroy(trans->ctx.chain); break; case NFT_MSG_NEWRULE: nf_tables_rule_destroy(&trans->ctx, nft_trans_rule(trans)); @@ -8233,7 +8231,7 @@ int __nft_release_basechain(struct nft_ctx *ctx) } nft_chain_del(ctx->chain); nft_use_dec(&ctx->table->use); - nf_tables_chain_destroy(ctx); + nf_tables_chain_destroy(ctx->chain);
return 0; } @@ -8300,10 +8298,9 @@ static void __nft_release_table(struct net *net, struct nft_table *table) nft_obj_destroy(&ctx, obj); } list_for_each_entry_safe(chain, nc, &table->chains, list) { - ctx.chain = chain; nft_chain_del(chain); nft_use_dec(&table->use); - nf_tables_chain_destroy(&ctx); + nf_tables_chain_destroy(chain); } list_del(&table->list); nf_tables_table_destroy(&ctx);
commit c03d278fdf35e73dd0ec543b9b556876b9d9a8dc upstream.
8c873e219970 ("netfilter: core: free hooks with call_rcu") removed synchronize_net() call when unregistering basechain hook, however, net_device removal event handler for the NFPROTO_NETDEV was not updated to wait for RCU grace period.
Note that 835b803377f5 ("netfilter: nf_tables_netdev: unregister hooks on net_device removal") does not remove basechain rules on device removal, I was hinted to remove rules on net_device removal later, see 5ebe0b0eec9d ("netfilter: nf_tables: destroy basechain and rules on netdevice removal").
Although NETDEV_UNREGISTER event is guaranteed to be handled after synchronize_net() call, this path needs to wait for rcu grace period via rcu callback to release basechain hooks if netns is alive because an ongoing netlink dump could be in progress (sockets hold a reference on the netns).
Note that nf_tables_pre_exit_net() unregisters and releases basechain hooks but it is possible to see NETDEV_UNREGISTER at a later stage in the netns exit path, eg. veth peer device in another netns:
cleanup_net() default_device_exit_batch() unregister_netdevice_many_notify() notifier_call_chain() nf_tables_netdev_event() __nft_release_basechain()
In this particular case, same rule of thumb applies: if netns is alive, then wait for rcu grace period because netlink dump in the other netns could be in progress. Otherwise, if the other netns is going away then no netlink dump can be in progress and basechain hooks can be released inmediately.
While at it, turn WARN_ON() into WARN_ON_ONCE() for the basechain validation, which should not ever happen.
Fixes: 835b803377f5 ("netfilter: nf_tables_netdev: unregister hooks on net_device removal") Signed-off-by: Pablo Neira Ayuso pablo@netfilter.org --- include/net/netfilter/nf_tables.h | 3 +++ net/netfilter/nf_tables_api.c | 41 +++++++++++++++++++++++++------ 2 files changed, 37 insertions(+), 7 deletions(-)
diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 92551a765a44..566370938939 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -899,6 +899,7 @@ struct nft_chain { u8 flags:6, genmask:2; char *name; + struct rcu_head rcu_head;
/* Only used during control plane commit phase: */ struct nft_rule **rules_next; @@ -1015,6 +1016,7 @@ static inline void nft_use_inc_restore(u32 *use) * @sets: sets in the table * @objects: stateful objects in the table * @flowtables: flow tables in the table + * @net: netnamespace this table belongs to * @hgenerator: handle generator state * @handle: table handle * @use: number of chain references to this table @@ -1030,6 +1032,7 @@ struct nft_table { struct list_head sets; struct list_head objects; struct list_head flowtables; + possible_net_t net; u64 hgenerator; u64 handle; u32 use; diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index c6f142b5e64f..2d015fcf8d8d 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -1109,6 +1109,7 @@ static int nf_tables_newtable(struct net *net, struct sock *nlsk, INIT_LIST_HEAD(&table->sets); INIT_LIST_HEAD(&table->objects); INIT_LIST_HEAD(&table->flowtables); + write_pnet(&table->net, net); table->family = family; table->flags = flags; table->handle = ++table_handle; @@ -8216,22 +8217,48 @@ int nft_data_dump(struct sk_buff *skb, int attr, const struct nft_data *data, } EXPORT_SYMBOL_GPL(nft_data_dump);
-int __nft_release_basechain(struct nft_ctx *ctx) +static void __nft_release_basechain_now(struct nft_ctx *ctx) { struct nft_rule *rule, *nr;
- if (WARN_ON(!nft_is_base_chain(ctx->chain))) - return 0; - - nf_tables_unregister_hook(ctx->net, ctx->chain->table, ctx->chain); list_for_each_entry_safe(rule, nr, &ctx->chain->rules, list) { list_del(&rule->list); - nft_use_dec(&ctx->chain->use); nf_tables_rule_release(ctx, rule); } + nf_tables_chain_destroy(ctx->chain); +} + +static void nft_release_basechain_rcu(struct rcu_head *head) +{ + struct nft_chain *chain = container_of(head, struct nft_chain, rcu_head); + struct nft_ctx ctx = { + .family = chain->table->family, + .chain = chain, + .net = read_pnet(&chain->table->net), + }; + + __nft_release_basechain_now(&ctx); + put_net(ctx.net); +} + +int __nft_release_basechain(struct nft_ctx *ctx) +{ + struct nft_rule *rule; + + if (WARN_ON_ONCE(!nft_is_base_chain(ctx->chain))) + return 0; + + nf_tables_unregister_hook(ctx->net, ctx->chain->table, ctx->chain); + list_for_each_entry(rule, &ctx->chain->rules, list) + nft_use_dec(&ctx->chain->use); + nft_chain_del(ctx->chain); nft_use_dec(&ctx->table->use); - nf_tables_chain_destroy(ctx->chain); + + if (maybe_get_net(ctx->net)) + call_rcu(&ctx->chain->rcu_head, nft_release_basechain_rcu); + else + __nft_release_basechain_now(ctx);
return 0; }
From: Florian Westphal fw@strlen.de
commit b04df3da1b5c6f6dc7cdccc37941740c078c4043 upstream.
nf_tables_chain_destroy can sleep, it can't be used from call_rcu callbacks.
Moreover, nf_tables_rule_release() is only safe for error unwinding, while transaction mutex is held and the to-be-desroyed rule was not exposed to either dataplane or dumps, as it deactives+frees without the required synchronize_rcu() in-between.
nft_rule_expr_deactivate() callbacks will change ->use counters of other chains/sets, see e.g. nft_lookup .deactivate callback, these must be serialized via transaction mutex.
Also add a few lockdep asserts to make this more explicit.
Calling synchronize_rcu() isn't ideal, but fixing this without is hard and way more intrusive. As-is, we can get:
WARNING: .. net/netfilter/nf_tables_api.c:5515 nft_set_destroy+0x.. Workqueue: events nf_tables_trans_destroy_work RIP: 0010:nft_set_destroy+0x3fe/0x5c0 Call Trace: <TASK> nf_tables_trans_destroy_work+0x6b7/0xad0 process_one_work+0x64a/0xce0 worker_thread+0x613/0x10d0
In case the synchronize_rcu becomes an issue, we can explore alternatives.
One way would be to allocate nft_trans_rule objects + one nft_trans_chain object, deactivate the rules + the chain and then defer the freeing to the nft destroy workqueue. We'd still need to keep the synchronize_rcu path as a fallback to handle -ENOMEM corner cases though.
Reported-by: syzbot+b26935466701e56cfdc2@syzkaller.appspotmail.com Closes: https://lore.kernel.org/all/67478d92.050a0220.253251.0062.GAE@google.com/T/ Fixes: c03d278fdf35 ("netfilter: nf_tables: wait for rcu grace period on net_device removal") Signed-off-by: Florian Westphal fw@strlen.de Signed-off-by: Pablo Neira Ayuso pablo@netfilter.org --- include/net/netfilter/nf_tables.h | 3 --- net/netfilter/nf_tables_api.c | 31 ++++++++++++++----------------- 2 files changed, 14 insertions(+), 20 deletions(-)
diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 566370938939..92551a765a44 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -899,7 +899,6 @@ struct nft_chain { u8 flags:6, genmask:2; char *name; - struct rcu_head rcu_head;
/* Only used during control plane commit phase: */ struct nft_rule **rules_next; @@ -1016,7 +1015,6 @@ static inline void nft_use_inc_restore(u32 *use) * @sets: sets in the table * @objects: stateful objects in the table * @flowtables: flow tables in the table - * @net: netnamespace this table belongs to * @hgenerator: handle generator state * @handle: table handle * @use: number of chain references to this table @@ -1032,7 +1030,6 @@ struct nft_table { struct list_head sets; struct list_head objects; struct list_head flowtables; - possible_net_t net; u64 hgenerator; u64 handle; u32 use; diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 2d015fcf8d8d..9e20fb759cb8 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -1109,7 +1109,6 @@ static int nf_tables_newtable(struct net *net, struct sock *nlsk, INIT_LIST_HEAD(&table->sets); INIT_LIST_HEAD(&table->objects); INIT_LIST_HEAD(&table->flowtables); - write_pnet(&table->net, net); table->family = family; table->flags = flags; table->handle = ++table_handle; @@ -2824,6 +2823,8 @@ static void nf_tables_rule_destroy(const struct nft_ctx *ctx, static void nf_tables_rule_release(const struct nft_ctx *ctx, struct nft_rule *rule) { + lockdep_commit_lock_is_held(ctx->net); + nft_rule_expr_deactivate(ctx, rule, NFT_TRANS_RELEASE); nf_tables_rule_destroy(ctx, rule); } @@ -4172,6 +4173,8 @@ void nf_tables_deactivate_set(const struct nft_ctx *ctx, struct nft_set *set, struct nft_set_binding *binding, enum nft_trans_phase phase) { + lockdep_commit_lock_is_held(ctx->net); + switch (phase) { case NFT_TRANS_PREPARE_ERROR: nft_set_trans_unbind(ctx, set); @@ -8228,19 +8231,6 @@ static void __nft_release_basechain_now(struct nft_ctx *ctx) nf_tables_chain_destroy(ctx->chain); }
-static void nft_release_basechain_rcu(struct rcu_head *head) -{ - struct nft_chain *chain = container_of(head, struct nft_chain, rcu_head); - struct nft_ctx ctx = { - .family = chain->table->family, - .chain = chain, - .net = read_pnet(&chain->table->net), - }; - - __nft_release_basechain_now(&ctx); - put_net(ctx.net); -} - int __nft_release_basechain(struct nft_ctx *ctx) { struct nft_rule *rule; @@ -8255,11 +8245,18 @@ int __nft_release_basechain(struct nft_ctx *ctx) nft_chain_del(ctx->chain); nft_use_dec(&ctx->table->use);
- if (maybe_get_net(ctx->net)) - call_rcu(&ctx->chain->rcu_head, nft_release_basechain_rcu); - else + if (!maybe_get_net(ctx->net)) { __nft_release_basechain_now(ctx); + return 0; + } + + /* wait for ruleset dumps to complete. Owning chain is no longer in + * lists, so new dumps can't find any of these rules anymore. + */ + synchronize_rcu();
+ __nft_release_basechain_now(ctx); + put_net(ctx->net); return 0; } EXPORT_SYMBOL_GPL(__nft_release_basechain);
linux-stable-mirror@lists.linaro.org