From: Linkui Xiao xiaolinkui@kylinos.cn
There is a race condition which can be demonstrated by the following script:
ipset create hash_ip1 hash:net family inet hashsize 1024 maxelem 1048576 ipset add hash_ip1 172.20.0.0/16 ipset add hash_ip1 192.168.0.0/16 iptables -A INPUT -m set --match-set hash_ip1 src -j ACCEPT while [ 1 ] do ipset create hash_ip2 hash:net family inet hashsize 1024 maxelem 1048576 ipset add hash_ip2 172.20.0.0/16 ipset swap hash_ip1 hash_ip2 ipset destroy hash_ip2 sleep 0.05 done
Swap will exchange the values of ref so destroy will see ref = 0 instead of ref = 1. So after running this script for a period of time, the following race situations may occur: CPU0: CPU1: ipt_do_table ->set_match_v4 ->ip_set_test ipset swap hash_ip1 hash_ip2 ipset destroy hash_ip2 ->hash_net4_kadt
CPU0 found ipset through the index, and at this time, hash_ip2 has been destroyed by CPU1 through name search. So CPU0 will crash when accessing set->data in the function hash_net4_kadt.
With this fix in place swap will wait for the ongoing operations to be finished.
V1->V2 changes: - replace ref_netlink with rcu synchonize_rcu()
Closes: https://lore.kernel.org/all/69e7963b-e7f8-3ad0-210-7b86eebf7f78@netfilter.or... Cc: stable@vger.kernel.org Suggested-by: Jozsef Kadlecsik kadlec@netfilter.org Signed-off-by: Linkui Xiao xiaolinkui@kylinos.cn
--- net/netfilter/ipset/ip_set_core.c | 31 ++++++++++++++++++++++++++----- 1 file changed, 26 insertions(+), 5 deletions(-)
diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c index 35d2f9c9ada0..62ee4de6ffee 100644 --- a/net/netfilter/ipset/ip_set_core.c +++ b/net/netfilter/ipset/ip_set_core.c @@ -712,13 +712,18 @@ ip_set_rcu_get(struct net *net, ip_set_id_t index) struct ip_set_net *inst = ip_set_pernet(net);
rcu_read_lock(); - /* ip_set_list itself needs to be protected */ + /* ip_set_list and the set pointer need to be protected */ set = rcu_dereference(inst->ip_set_list)[index]; - rcu_read_unlock();
return set; }
+static inline void +ip_set_rcu_put(struct ip_set *set __always_unused) +{ + rcu_read_unlock(); +} + static inline void ip_set_lock(struct ip_set *set) { @@ -744,8 +749,10 @@ ip_set_test(ip_set_id_t index, const struct sk_buff *skb, pr_debug("set %s, index %u\n", set->name, index);
if (opt->dim < set->type->dimension || - !(opt->family == set->family || set->family == NFPROTO_UNSPEC)) + !(opt->family == set->family || set->family == NFPROTO_UNSPEC)) { + ip_set_rcu_put(set); return 0; + }
ret = set->variant->kadt(set, skb, par, IPSET_TEST, opt);
@@ -764,6 +771,7 @@ ip_set_test(ip_set_id_t index, const struct sk_buff *skb, ret = -ret; }
+ ip_set_rcu_put(set); /* Convert error codes to nomatch */ return (ret < 0 ? 0 : ret); } @@ -780,12 +788,15 @@ ip_set_add(ip_set_id_t index, const struct sk_buff *skb, pr_debug("set %s, index %u\n", set->name, index);
if (opt->dim < set->type->dimension || - !(opt->family == set->family || set->family == NFPROTO_UNSPEC)) + !(opt->family == set->family || set->family == NFPROTO_UNSPEC)) { + ip_set_rcu_put(set); return -IPSET_ERR_TYPE_MISMATCH; + }
ip_set_lock(set); ret = set->variant->kadt(set, skb, par, IPSET_ADD, opt); ip_set_unlock(set); + ip_set_rcu_put(set);
return ret; } @@ -802,12 +813,15 @@ ip_set_del(ip_set_id_t index, const struct sk_buff *skb, pr_debug("set %s, index %u\n", set->name, index);
if (opt->dim < set->type->dimension || - !(opt->family == set->family || set->family == NFPROTO_UNSPEC)) + !(opt->family == set->family || set->family == NFPROTO_UNSPEC)) { + ip_set_rcu_put(set); return -IPSET_ERR_TYPE_MISMATCH; + }
ip_set_lock(set); ret = set->variant->kadt(set, skb, par, IPSET_DEL, opt); ip_set_unlock(set); + ip_set_rcu_put(set);
return ret; } @@ -882,6 +896,7 @@ ip_set_name_byindex(struct net *net, ip_set_id_t index, char *name) read_lock_bh(&ip_set_ref_lock); strscpy_pad(name, set->name, IPSET_MAXNAMELEN); read_unlock_bh(&ip_set_ref_lock); + ip_set_rcu_put(set); } EXPORT_SYMBOL_GPL(ip_set_name_byindex);
@@ -1348,6 +1363,9 @@ static int ip_set_rename(struct sk_buff *skb, const struct nfnl_info *info, * protected by the ip_set_ref_lock. The kernel interfaces * do not hold the mutex but the pointer settings are atomic * so the ip_set_list always contains valid pointers to the sets. + * However after swapping, a userspace set destroy command could + * remove a set still processed by kernel side add/del/test. + * Therefore we need to wait for the ongoing operations to be finished. */
static int ip_set_swap(struct sk_buff *skb, const struct nfnl_info *info, @@ -1397,6 +1415,9 @@ static int ip_set_swap(struct sk_buff *skb, const struct nfnl_info *info, ip_set(inst, to_id) = from; write_unlock_bh(&ip_set_ref_lock);
+ /* Make sure all readers of the old set pointers are completed. */ + synchronize_rcu(); + return 0; }