From: Eric Dumazet <edumazet@google.com>
[ Upstream commit dbca1596bbb08318f5e3b3b99f8ca0a0d3830a65 ]
Using rwlock in networking code is extremely risky. Writers can starve if enough readers are constantly grabbing the rwlock.
I thought rwlocks were at fault and sent this patch:
https://lkml.org/lkml/2022/6/17/272
But Peter and Linus essentially told me rwlocks had to be unfair.
We need to get rid of rwlocks in networking code.
Fixes: c319b4d76b9e ("net: ipv4: add IPPROTO_ICMP socket kind")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 net/ipv4/ping.c | 36 ++++++++++++++++--------------------
 1 file changed, 16 insertions(+), 20 deletions(-)
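The change below is the usual rwlock-to-"spinlock plus RCU" conversion for a nulls hash: writers keep serializing on ping_table.lock (now a spinlock) and switch to the _rcu list helpers, while lookups run under rcu_read_lock() alone. A minimal sketch of that pattern under those assumptions, using hypothetical names (my_table, my_obj, my_table_init, my_insert, my_remove, my_lookup, MY_HTABLE_SIZE) rather than the actual ping.c code:

#include <linux/types.h>
#include <linux/spinlock.h>
#include <linux/list_nulls.h>
#include <linux/rculist_nulls.h>

#define MY_HTABLE_SIZE 64

struct my_table {
	struct hlist_nulls_head	hash[MY_HTABLE_SIZE];
	spinlock_t		lock;	/* serializes writers only */
};

struct my_obj {
	struct hlist_nulls_node	node;
	u16			id;
};

/* Mirrors ping_init(): nulls heads carry their bucket value, lock is a spinlock. */
static void my_table_init(struct my_table *t)
{
	int i;

	for (i = 0; i < MY_HTABLE_SIZE; i++)
		INIT_HLIST_NULLS_HEAD(&t->hash[i], i);
	spin_lock_init(&t->lock);
}

/* Writers still take the lock, but use the _rcu helpers so that
 * concurrent lockless readers always observe a consistent chain.
 */
static void my_insert(struct my_table *t, struct my_obj *obj)
{
	spin_lock(&t->lock);
	hlist_nulls_add_head_rcu(&obj->node,
				 &t->hash[obj->id & (MY_HTABLE_SIZE - 1)]);
	spin_unlock(&t->lock);
}

static void my_remove(struct my_table *t, struct my_obj *obj)
{
	spin_lock(&t->lock);
	hlist_nulls_del_init_rcu(&obj->node);
	spin_unlock(&t->lock);
}

/* Reader: called under rcu_read_lock(), like ping_lookup() after this
 * patch.  No reference is taken; the object must only be freed after a
 * grace period (ping sockets get this from SOCK_RCU_FREE).
 */
static struct my_obj *my_lookup(struct my_table *t, u16 id)
{
	struct hlist_nulls_node *pos;
	struct my_obj *obj;

	hlist_nulls_for_each_entry_rcu(obj, pos,
				       &t->hash[id & (MY_HTABLE_SIZE - 1)], node)
		if (obj->id == id)
			return obj;
	return NULL;
}

The newly set SOCK_RCU_FREE flag is what makes the lockless lookup safe: the socket is only freed after an RCU grace period, which is why ping_lookup() and its callers no longer need the sock_hold()/sock_put() pair.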
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index 3c6101def7d6..b83c2bd9d722 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -50,7 +50,7 @@
 
 struct ping_table {
 	struct hlist_nulls_head	hash[PING_HTABLE_SIZE];
-	rwlock_t		lock;
+	spinlock_t		lock;
 };
 
 static struct ping_table ping_table;
@@ -82,7 +82,7 @@ int ping_get_port(struct sock *sk, unsigned short ident)
 	struct sock *sk2 = NULL;
 
 	isk = inet_sk(sk);
-	write_lock_bh(&ping_table.lock);
+	spin_lock(&ping_table.lock);
 	if (ident == 0) {
 		u32 i;
 		u16 result = ping_port_rover + 1;
@@ -128,14 +128,15 @@ int ping_get_port(struct sock *sk, unsigned short ident)
 	if (sk_unhashed(sk)) {
 		pr_debug("was not hashed\n");
 		sock_hold(sk);
-		hlist_nulls_add_head(&sk->sk_nulls_node, hlist);
+		sock_set_flag(sk, SOCK_RCU_FREE);
+		hlist_nulls_add_head_rcu(&sk->sk_nulls_node, hlist);
 		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
 	}
-	write_unlock_bh(&ping_table.lock);
+	spin_unlock(&ping_table.lock);
 	return 0;
 
 fail:
-	write_unlock_bh(&ping_table.lock);
+	spin_unlock(&ping_table.lock);
 	return 1;
 }
 EXPORT_SYMBOL_GPL(ping_get_port);
@@ -153,19 +154,19 @@ void ping_unhash(struct sock *sk)
 	struct inet_sock *isk = inet_sk(sk);
 
 	pr_debug("ping_unhash(isk=%p,isk->num=%u)\n", isk, isk->inet_num);
-	write_lock_bh(&ping_table.lock);
+	spin_lock(&ping_table.lock);
 	if (sk_hashed(sk)) {
-		hlist_nulls_del(&sk->sk_nulls_node);
-		sk_nulls_node_init(&sk->sk_nulls_node);
+		hlist_nulls_del_init_rcu(&sk->sk_nulls_node);
 		sock_put(sk);
 		isk->inet_num = 0;
 		isk->inet_sport = 0;
 		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
 	}
-	write_unlock_bh(&ping_table.lock);
+	spin_unlock(&ping_table.lock);
 }
 EXPORT_SYMBOL_GPL(ping_unhash);
 
+/* Called under rcu_read_lock() */
 static struct sock *ping_lookup(struct net *net, struct sk_buff *skb, u16 ident)
 {
 	struct hlist_nulls_head *hslot = ping_hashslot(&ping_table, net, ident);
@@ -190,8 +191,6 @@ static struct sock *ping_lookup(struct net *net, struct sk_buff *skb, u16 ident)
 		return NULL;
 	}
 
-	read_lock_bh(&ping_table.lock);
-
 	ping_portaddr_for_each_entry(sk, hnode, hslot) {
 		isk = inet_sk(sk);
 
@@ -230,13 +229,11 @@ static struct sock *ping_lookup(struct net *net, struct sk_buff *skb, u16 ident)
 		    sk->sk_bound_dev_if != sdif)
 			continue;
 
-		sock_hold(sk);
 		goto exit;
 	}
 
 	sk = NULL;
 exit:
-	read_unlock_bh(&ping_table.lock);
 
 	return sk;
 }
@@ -592,7 +589,7 @@ void ping_err(struct sk_buff *skb, int offset, u32 info)
 	sk->sk_err = err;
 	sk_error_report(sk);
 out:
-	sock_put(sk);
+	return;
 }
 EXPORT_SYMBOL_GPL(ping_err);
 
@@ -998,7 +995,6 @@ enum skb_drop_reason ping_rcv(struct sk_buff *skb)
 			reason = __ping_queue_rcv_skb(sk, skb2);
 		else
 			reason = SKB_DROP_REASON_NOMEM;
-		sock_put(sk);
 	}
 
 	if (reason)
@@ -1084,13 +1080,13 @@ static struct sock *ping_get_idx(struct seq_file *seq, loff_t pos)
 }
 
 void *ping_seq_start(struct seq_file *seq, loff_t *pos, sa_family_t family)
-	__acquires(ping_table.lock)
+	__acquires(RCU)
 {
 	struct ping_iter_state *state = seq->private;
 	state->bucket = 0;
 	state->family = family;
 
-	read_lock_bh(&ping_table.lock);
+	rcu_read_lock();
 
 	return *pos ? ping_get_idx(seq, *pos-1) : SEQ_START_TOKEN;
 }
@@ -1116,9 +1112,9 @@ void *ping_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 EXPORT_SYMBOL_GPL(ping_seq_next);
 
 void ping_seq_stop(struct seq_file *seq, void *v)
-	__releases(ping_table.lock)
+	__releases(RCU)
 {
-	read_unlock_bh(&ping_table.lock);
+	rcu_read_unlock();
 }
 EXPORT_SYMBOL_GPL(ping_seq_stop);
 
@@ -1202,5 +1198,5 @@ void __init ping_init(void)
 
 	for (i = 0; i < PING_HTABLE_SIZE; i++)
 		INIT_HLIST_NULLS_HEAD(&ping_table.hash[i], i);
-	rwlock_init(&ping_table.lock);
+	spin_lock_init(&ping_table.lock);
 }