On 11/30/22 1:21 PM, Greg Kroah-Hartman wrote:
From: Thomas Zeitlhofer thomas.zeitlhofer+lkml@ze-it.at
[ Upstream commit 8207f253a097fe15c93d85ac15ebb73c5e39e1e1 ]
Commit 0ff4eb3d5ebb ("neighbour: make proxy_queue.qlen limit per-device") introduced the length counter qlen in struct neigh_parms. There are separate neigh_parms instances for IPv4/ARP and IPv6/ND, and while the family specific qlen is incremented in pneigh_enqueue(), the mentioned commit decrements always the IPv4/ARP specific qlen, regardless of the currently processed family, in pneigh_queue_purge() and neigh_proxy_process().
As a result, with IPv6/ND, the family specific qlen is only incremented (and never decremented) until it exceeds PROXY_QLEN, and then, according to the check in pneigh_enqueue(), neighbor solicitations are not answered anymore. As an example, this is noted when using the subnet-router anycast address to access a Linux router. After a certain amount of time (in the observed case, qlen exceeded PROXY_QLEN after two days), the Linux router stops answering neighbor solicitations for its subnet-router anycast address and effectively becomes unreachable.
In my environment, without this patch to 6.0.y, IPv6 proxy neighbours lose connectivity after two or three hours at most because at that point qlen > PROXY_QLEN in my router.
Another result with IPv6/ND is that the IPv4/ARP specific qlen is decremented more often than incremented. This leads to negative qlen values, as a signed integer has been used for the length counter qlen, and potentially to an integer overflow.
Fix this by introducing the helper function neigh_parms_qlen_dec(), which decrements the family specific qlen. Thereby, make use of the existing helper function neigh_get_dev_parms_rcu(), whose definition therefore needs to be placed earlier in neighbour.c. Take the family member from struct neigh_table to determine the currently processed family and appropriately call neigh_parms_qlen_dec() from pneigh_queue_purge() and neigh_proxy_process().
Additionally, use an unsigned integer for the length counter qlen.
Fixes: 0ff4eb3d5ebb ("neighbour: make proxy_queue.qlen limit per-device") Signed-off-by: Thomas Zeitlhofer thomas.zeitlhofer+lkml@ze-it.at Signed-off-by: David S. Miller davem@davemloft.net Signed-off-by: Sasha Levin sashal@kernel.org
include/net/neighbour.h | 2 +- net/core/neighbour.c | 58 +++++++++++++++++++++-------------------- 2 files changed, 31 insertions(+), 29 deletions(-)
diff --git a/include/net/neighbour.h b/include/net/neighbour.h index 3827a6b395fd..bce6b228cf56 100644 --- a/include/net/neighbour.h +++ b/include/net/neighbour.h @@ -83,7 +83,7 @@ struct neigh_parms { struct rcu_head rcu_head; int reachable_time;
- int qlen;
- u32 qlen; int data[NEIGH_VAR_DATA_MAX]; DECLARE_BITMAP(data_state, NEIGH_VAR_DATA_MAX);
}; diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 84755db81e9d..35f5a3125808 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -307,7 +307,31 @@ static int neigh_del_timer(struct neighbour *n) return 0; } -static void pneigh_queue_purge(struct sk_buff_head *list, struct net *net) +static struct neigh_parms *neigh_get_dev_parms_rcu(struct net_device *dev,
int family)
+{
- switch (family) {
- case AF_INET:
return __in_dev_arp_parms_get_rcu(dev);
- case AF_INET6:
return __in6_dev_nd_parms_get_rcu(dev);
- }
- return NULL;
+}
+static void neigh_parms_qlen_dec(struct net_device *dev, int family) +{
- struct neigh_parms *p;
- rcu_read_lock();
- p = neigh_get_dev_parms_rcu(dev, family);
- if (p)
p->qlen--;
- rcu_read_unlock();
+}
+static void pneigh_queue_purge(struct sk_buff_head *list, struct net *net,
int family)
{ struct sk_buff_head tmp; unsigned long flags; @@ -321,13 +345,7 @@ static void pneigh_queue_purge(struct sk_buff_head *list, struct net *net) struct net_device *dev = skb->dev; if (net == NULL || net_eq(dev_net(dev), net)) {
struct in_device *in_dev;
rcu_read_lock();
in_dev = __in_dev_get_rcu(dev);
if (in_dev)
in_dev->arp_parms->qlen--;
rcu_read_unlock();
}neigh_parms_qlen_dec(dev, family); __skb_unlink(skb, list); __skb_queue_tail(&tmp, skb);
@@ -409,7 +427,8 @@ static int __neigh_ifdown(struct neigh_table *tbl, struct net_device *dev, write_lock_bh(&tbl->lock); neigh_flush_dev(tbl, dev, skip_perm); pneigh_ifdown_and_unlock(tbl, dev);
- pneigh_queue_purge(&tbl->proxy_queue, dev ? dev_net(dev) : NULL);
- pneigh_queue_purge(&tbl->proxy_queue, dev ? dev_net(dev) : NULL,
if (skb_queue_empty_lockless(&tbl->proxy_queue)) del_timer_sync(&tbl->proxy_timer); return 0;tbl->family);
@@ -1621,13 +1640,8 @@ static void neigh_proxy_process(struct timer_list *t) if (tdif <= 0) { struct net_device *dev = skb->dev;
struct in_device *in_dev;
rcu_read_lock();
in_dev = __in_dev_get_rcu(dev);
if (in_dev)
in_dev->arp_parms->qlen--;
rcu_read_unlock();
neigh_parms_qlen_dec(dev, tbl->family); __skb_unlink(skb, &tbl->proxy_queue);
if (tbl->proxy_redo && netif_running(dev)) { @@ -1821,7 +1835,7 @@ int neigh_table_clear(int index, struct neigh_table *tbl) cancel_delayed_work_sync(&tbl->managed_work); cancel_delayed_work_sync(&tbl->gc_work); del_timer_sync(&tbl->proxy_timer);
- pneigh_queue_purge(&tbl->proxy_queue, NULL);
- pneigh_queue_purge(&tbl->proxy_queue, NULL, tbl->family); neigh_ifdown(tbl, NULL); if (atomic_read(&tbl->entries)) pr_crit("neighbour leakage\n");
@@ -3542,18 +3556,6 @@ static int proc_unres_qlen(struct ctl_table *ctl, int write, return ret; } -static struct neigh_parms *neigh_get_dev_parms_rcu(struct net_device *dev,
int family)
-{
- switch (family) {
- case AF_INET:
return __in_dev_arp_parms_get_rcu(dev);
- case AF_INET6:
return __in6_dev_nd_parms_get_rcu(dev);
- }
- return NULL;
-}
static void neigh_copy_dflt_parms(struct net *net, struct neigh_parms *p, int index) {
Hi Greg,
I tested this patch on my IPv6 router. Given that without this patch, connectivity of IPv6 proxy neighbours is lost after 2-3 hours in my environment, a successful test requires:
- connectivity of IPv6 proxy neighbours must last at least six hours - connectivity of IPv6 proxy neighbours must persist until router reboot - no other observed bugs or regressions
I tested three kernels:
- Official Fedora 37 6.0.9 kernel plus this patch - Official Fedora 37 6.0.10 kernel which includes this patch - 6.0.11-rc1 kernel
All three kernels passed the test, with the longest test before rebooting the router of 12 hours for the 6.0.11-rc1 kernel.
On the Debian bug tracker, there is also a report of a successful test of this patch which resulted in 24 hours of continued IPv6 connectivity for IPv6 proxy neighbours (see the Link tag below).
AFAICS, 6.0.11-rc1 works great on my IPv6 router.
Thanks to all who worked on this for the quick fix.
Best regards,
Chuck Zmudzinski
Link: https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=1024070#44 Tested-by: Chuck Zmudzinski brchuckz@aol.com