Previously, deleting peers would require traversing the entire trie in
order to rebalance nodes and safely free them. This meant that removing
1000 peers from a trie with a half million nodes would take an extremely
long time, during which we're holding the rtnl lock. Large-scale users
were reporting 200ms latencies added to the networking stack as a whole
every time their userspace software would queue up significant removals.
That's a serious situation.
This commit fixes that by maintaining a double pointer to the parent's
bit pointer for each node, and then using the already existing node list
belonging to each peer to go directly to the node, fix up its pointers,
and free it with RCU. This means removal is O(1) instead of O(n), and we
don't use gobs of stack.
The removal algorithm has the same downside as the code that it fixes:
it won't collapse needlessly long runs of fillers. We can enhance that
in the future if it ever becomes a problem. This commit documents that
limitation with a TODO comment in code, a small but meaningful
improvement over the prior situation.
Currently the biggest flaw, which the next commit addresses, is that
because this increases the node size on 64-bit machines from 60 bytes to
68 bytes. 60 rounds up to 64, but 68 rounds up to 128. So we wind up
using twice as much memory per node, because of power-of-two
allocations, which is a big bummer. We'll need to figure something out
there.
Fixes: e7096c131e51 ("net: WireGuard secure network tunnel")
Cc: stable(a)vger.kernel.org
Signed-off-by: Jason A. Donenfeld <Jason(a)zx2c4.com>
---
drivers/net/wireguard/allowedips.c | 132 ++++++++++++-----------------
drivers/net/wireguard/allowedips.h | 9 +-
2 files changed, 57 insertions(+), 84 deletions(-)
diff --git a/drivers/net/wireguard/allowedips.c b/drivers/net/wireguard/allowedips.c
index 3725e9cd85f4..2785cfd3a221 100644
--- a/drivers/net/wireguard/allowedips.c
+++ b/drivers/net/wireguard/allowedips.c
@@ -66,60 +66,6 @@ static void root_remove_peer_lists(struct allowedips_node *root)
}
}
-static void walk_remove_by_peer(struct allowedips_node __rcu **top,
- struct wg_peer *peer, struct mutex *lock)
-{
-#define REF(p) rcu_access_pointer(p)
-#define DEREF(p) rcu_dereference_protected(*(p), lockdep_is_held(lock))
-#define PUSH(p) ({ \
- WARN_ON(IS_ENABLED(DEBUG) && len >= 128); \
- stack[len++] = p; \
- })
-
- struct allowedips_node __rcu **stack[128], **nptr;
- struct allowedips_node *node, *prev;
- unsigned int len;
-
- if (unlikely(!peer || !REF(*top)))
- return;
-
- for (prev = NULL, len = 0, PUSH(top); len > 0; prev = node) {
- nptr = stack[len - 1];
- node = DEREF(nptr);
- if (!node) {
- --len;
- continue;
- }
- if (!prev || REF(prev->bit[0]) == node ||
- REF(prev->bit[1]) == node) {
- if (REF(node->bit[0]))
- PUSH(&node->bit[0]);
- else if (REF(node->bit[1]))
- PUSH(&node->bit[1]);
- } else if (REF(node->bit[0]) == prev) {
- if (REF(node->bit[1]))
- PUSH(&node->bit[1]);
- } else {
- if (rcu_dereference_protected(node->peer,
- lockdep_is_held(lock)) == peer) {
- RCU_INIT_POINTER(node->peer, NULL);
- list_del_init(&node->peer_list);
- if (!node->bit[0] || !node->bit[1]) {
- rcu_assign_pointer(*nptr, DEREF(
- &node->bit[!REF(node->bit[0])]));
- kfree_rcu(node, rcu);
- node = DEREF(nptr);
- }
- }
- --len;
- }
- }
-
-#undef REF
-#undef DEREF
-#undef PUSH
-}
-
static unsigned int fls128(u64 a, u64 b)
{
return a ? fls64(a) + 64U : fls64(b);
@@ -224,6 +170,7 @@ static int add(struct allowedips_node __rcu **trie, u8 bits, const u8 *key,
RCU_INIT_POINTER(node->peer, peer);
list_add_tail(&node->peer_list, &peer->allowedips_list);
copy_and_assign_cidr(node, key, cidr, bits);
+ rcu_assign_pointer(node->parent_bit, trie);
rcu_assign_pointer(*trie, node);
return 0;
}
@@ -243,9 +190,9 @@ static int add(struct allowedips_node __rcu **trie, u8 bits, const u8 *key,
if (!node) {
down = rcu_dereference_protected(*trie, lockdep_is_held(lock));
} else {
- down = rcu_dereference_protected(CHOOSE_NODE(node, key),
- lockdep_is_held(lock));
+ down = rcu_dereference_protected(CHOOSE_NODE(node, key), lockdep_is_held(lock));
if (!down) {
+ rcu_assign_pointer(newnode->parent_bit, &CHOOSE_NODE(node, key));
rcu_assign_pointer(CHOOSE_NODE(node, key), newnode);
return 0;
}
@@ -254,29 +201,37 @@ static int add(struct allowedips_node __rcu **trie, u8 bits, const u8 *key,
parent = node;
if (newnode->cidr == cidr) {
+ rcu_assign_pointer(down->parent_bit, &CHOOSE_NODE(newnode, down->bits));
rcu_assign_pointer(CHOOSE_NODE(newnode, down->bits), down);
- if (!parent)
+ if (!parent) {
+ rcu_assign_pointer(newnode->parent_bit, trie);
rcu_assign_pointer(*trie, newnode);
- else
- rcu_assign_pointer(CHOOSE_NODE(parent, newnode->bits),
- newnode);
- } else {
- node = kzalloc(sizeof(*node), GFP_KERNEL);
- if (unlikely(!node)) {
- list_del(&newnode->peer_list);
- kfree(newnode);
- return -ENOMEM;
+ } else {
+ rcu_assign_pointer(newnode->parent_bit, &CHOOSE_NODE(parent, newnode->bits));
+ rcu_assign_pointer(CHOOSE_NODE(parent, newnode->bits), newnode);
}
- INIT_LIST_HEAD(&node->peer_list);
- copy_and_assign_cidr(node, newnode->bits, cidr, bits);
-
- rcu_assign_pointer(CHOOSE_NODE(node, down->bits), down);
- rcu_assign_pointer(CHOOSE_NODE(node, newnode->bits), newnode);
- if (!parent)
- rcu_assign_pointer(*trie, node);
- else
- rcu_assign_pointer(CHOOSE_NODE(parent, node->bits),
- node);
+ return 0;
+ }
+
+ node = kzalloc(sizeof(*node), GFP_KERNEL);
+ if (unlikely(!node)) {
+ list_del(&newnode->peer_list);
+ kfree(newnode);
+ return -ENOMEM;
+ }
+ INIT_LIST_HEAD(&node->peer_list);
+ copy_and_assign_cidr(node, newnode->bits, cidr, bits);
+
+ rcu_assign_pointer(down->parent_bit, &CHOOSE_NODE(node, down->bits));
+ rcu_assign_pointer(CHOOSE_NODE(node, down->bits), down);
+ rcu_assign_pointer(newnode->parent_bit, &CHOOSE_NODE(node, newnode->bits));
+ rcu_assign_pointer(CHOOSE_NODE(node, newnode->bits), newnode);
+ if (!parent) {
+ rcu_assign_pointer(node->parent_bit, trie);
+ rcu_assign_pointer(*trie, node);
+ } else {
+ rcu_assign_pointer(node->parent_bit, &CHOOSE_NODE(parent, node->bits));
+ rcu_assign_pointer(CHOOSE_NODE(parent, node->bits), node);
}
return 0;
}
@@ -335,9 +290,30 @@ int wg_allowedips_insert_v6(struct allowedips *table, const struct in6_addr *ip,
void wg_allowedips_remove_by_peer(struct allowedips *table,
struct wg_peer *peer, struct mutex *lock)
{
+ struct allowedips_node *node, *child, *tmp;
+
+ if (list_empty(&peer->allowedips_list))
+ return;
++table->seq;
- walk_remove_by_peer(&table->root4, peer, lock);
- walk_remove_by_peer(&table->root6, peer, lock);
+ list_for_each_entry_safe(node, tmp, &peer->allowedips_list, peer_list) {
+ list_del_init(&node->peer_list);
+ RCU_INIT_POINTER(node->peer, NULL);
+ if (node->bit[0] && node->bit[1])
+ continue;
+ child = rcu_dereference_protected(
+ node->bit[!rcu_access_pointer(node->bit[0])],
+ lockdep_is_held(lock));
+ if (child)
+ child->parent_bit = node->parent_bit;
+ *rcu_dereference_protected(node->parent_bit, lockdep_is_held(lock)) = child;
+ kfree_rcu(node, rcu);
+
+ /* TODO: Note that we currently don't walk up and down in order to
+ * free any potential filler nodes. This means that this function
+ * doesn't free up as much as it could, which could be revisited
+ * at some point.
+ */
+ }
}
int wg_allowedips_read_node(struct allowedips_node *node, u8 ip[16], u8 *cidr)
diff --git a/drivers/net/wireguard/allowedips.h b/drivers/net/wireguard/allowedips.h
index e5c83cafcef4..f08f552e6852 100644
--- a/drivers/net/wireguard/allowedips.h
+++ b/drivers/net/wireguard/allowedips.h
@@ -15,14 +15,11 @@ struct wg_peer;
struct allowedips_node {
struct wg_peer __rcu *peer;
struct allowedips_node __rcu *bit[2];
- /* While it may seem scandalous that we waste space for v4,
- * we're alloc'ing to the nearest power of 2 anyway, so this
- * doesn't actually make a difference.
- */
- u8 bits[16] __aligned(__alignof(u64));
u8 cidr, bit_at_a, bit_at_b, bitlen;
+ u8 bits[16] __aligned(__alignof(u64));
- /* Keep rarely used list at bottom to be beyond cache line. */
+ /* Keep rarely used members at bottom to be beyond cache line. */
+ struct allowedips_node *__rcu *parent_bit; /* XXX: this puts us at 68->128 bytes instead of 60->64 bytes!! */
union {
struct list_head peer_list;
struct rcu_head rcu;
--
2.31.1
The randomized trie tests weren't initializing the dummy peer list head,
resulting in a NULL pointer dereference when used. Fix this by
initializing it in the randomized trie test, just like we do for the
static unit test.
While we're at it, all of the other strings like this have the word
"self-test", so add it to the missing place here.
Fixes: e7096c131e51 ("net: WireGuard secure network tunnel")
Cc: stable(a)vger.kernel.org
Signed-off-by: Jason A. Donenfeld <Jason(a)zx2c4.com>
---
drivers/net/wireguard/selftest/allowedips.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/drivers/net/wireguard/selftest/allowedips.c b/drivers/net/wireguard/selftest/allowedips.c
index 846db14cb046..0d2a43a2d400 100644
--- a/drivers/net/wireguard/selftest/allowedips.c
+++ b/drivers/net/wireguard/selftest/allowedips.c
@@ -296,6 +296,7 @@ static __init bool randomized_test(void)
goto free;
}
kref_init(&peers[i]->refcount);
+ INIT_LIST_HEAD(&peers[i]->allowedips_list);
}
mutex_lock(&mutex);
@@ -333,7 +334,7 @@ static __init bool randomized_test(void)
if (wg_allowedips_insert_v4(&t,
(struct in_addr *)mutated,
cidr, peer, &mutex) < 0) {
- pr_err("allowedips random malloc: FAIL\n");
+ pr_err("allowedips random self-test malloc: FAIL\n");
goto free_locked;
}
if (horrible_allowedips_insert_v4(&h,
--
2.31.1
Many of the synchronization points are sometimes called under the rtnl
lock, which means we should use synchronize_net rather than
synchronize_rcu. Under the hood, this expands to using the expedited
flavor of function in the event that rtnl is held, in order to not stall
other concurrent changes.
This fixes some very, very long delays when removing multiple peers at
once, which would cause some operations to take several minutes.
Fixes: e7096c131e51 ("net: WireGuard secure network tunnel")
Cc: stable(a)vger.kernel.org
Signed-off-by: Jason A. Donenfeld <Jason(a)zx2c4.com>
---
drivers/net/wireguard/peer.c | 6 +++---
drivers/net/wireguard/socket.c | 2 +-
2 files changed, 4 insertions(+), 4 deletions(-)
diff --git a/drivers/net/wireguard/peer.c b/drivers/net/wireguard/peer.c
index cd5cb0292cb6..3a042d28eb2e 100644
--- a/drivers/net/wireguard/peer.c
+++ b/drivers/net/wireguard/peer.c
@@ -88,7 +88,7 @@ static void peer_make_dead(struct wg_peer *peer)
/* Mark as dead, so that we don't allow jumping contexts after. */
WRITE_ONCE(peer->is_dead, true);
- /* The caller must now synchronize_rcu() for this to take effect. */
+ /* The caller must now synchronize_net() for this to take effect. */
}
static void peer_remove_after_dead(struct wg_peer *peer)
@@ -160,7 +160,7 @@ void wg_peer_remove(struct wg_peer *peer)
lockdep_assert_held(&peer->device->device_update_lock);
peer_make_dead(peer);
- synchronize_rcu();
+ synchronize_net();
peer_remove_after_dead(peer);
}
@@ -178,7 +178,7 @@ void wg_peer_remove_all(struct wg_device *wg)
peer_make_dead(peer);
list_add_tail(&peer->peer_list, &dead_peers);
}
- synchronize_rcu();
+ synchronize_net();
list_for_each_entry_safe(peer, temp, &dead_peers, peer_list)
peer_remove_after_dead(peer);
}
diff --git a/drivers/net/wireguard/socket.c b/drivers/net/wireguard/socket.c
index d9ad850daa79..8c496b747108 100644
--- a/drivers/net/wireguard/socket.c
+++ b/drivers/net/wireguard/socket.c
@@ -430,7 +430,7 @@ void wg_socket_reinit(struct wg_device *wg, struct sock *new4,
if (new4)
wg->incoming_port = ntohs(inet_sk(new4)->inet_sport);
mutex_unlock(&wg->socket_update_lock);
- synchronize_rcu();
+ synchronize_net();
sock_free(old4);
sock_free(old6);
}
--
2.31.1
This is a note to let you know that I've just added the patch titled
usb: gadget: f_fs: Ensure io_completion_wq is idle during unbind
to my usb git tree which can be found at
git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/usb.git
in the usb-linus branch.
The patch will show up in the next release of the linux-next tree
(usually sometime within the next 24 hours during the week.)
The patch will hopefully also be merged in Linus's tree for the
next -rc kernel release.
If you have any questions about this process, please let me know.
>From 6fc1db5e6211e30fbb1cee8d7925d79d4ed2ae14 Mon Sep 17 00:00:00 2001
From: Wesley Cheng <wcheng(a)codeaurora.org>
Date: Fri, 21 May 2021 17:44:21 -0700
Subject: usb: gadget: f_fs: Ensure io_completion_wq is idle during unbind
During unbind, ffs_func_eps_disable() will be executed, resulting in
completion callbacks for any pending USB requests. When using AIO,
irrespective of the completion status, io_data work is queued to
io_completion_wq to evaluate and handle the completed requests. Since
work runs asynchronously to the unbind() routine, there can be a
scenario where the work runs after the USB gadget has been fully
removed, resulting in accessing of a resource which has been already
freed. (i.e. usb_ep_free_request() accessing the USB ep structure)
Explicitly drain the io_completion_wq, instead of relying on the
destroy_workqueue() (in ffs_data_put()) to make sure no pending
completion work items are running.
Signed-off-by: Wesley Cheng <wcheng(a)codeaurora.org>
Cc: stable <stable(a)vger.kernel.org>
Link: https://lore.kernel.org/r/1621644261-1236-1-git-send-email-wcheng@codeauror…
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
---
drivers/usb/gadget/function/f_fs.c | 3 +++
1 file changed, 3 insertions(+)
diff --git a/drivers/usb/gadget/function/f_fs.c b/drivers/usb/gadget/function/f_fs.c
index bf109191659a..d4844afeaffc 100644
--- a/drivers/usb/gadget/function/f_fs.c
+++ b/drivers/usb/gadget/function/f_fs.c
@@ -3567,6 +3567,9 @@ static void ffs_func_unbind(struct usb_configuration *c,
ffs->func = NULL;
}
+ /* Drain any pending AIO completions */
+ drain_workqueue(ffs->io_completion_wq);
+
if (!--opts->refcnt)
functionfs_unbind(ffs);
--
2.31.1