An iptables rule like the following on a multicore system will result in
accepting more connections than the limit set in the rule.
iptables -A INPUT -p tcp -m tcp --syn --dport 7777 -m connlimit \
--connlimit-above 2000 --connlimit-mask 0 -j DROP
In the check_hlist() function, connections that are found in the saved
connections but not in the netfilter conntrack table are deleted, on the
assumption that those connections no longer exist. But on multicore systems
there is a small time window in which a connection has been added to the
rb-tree maintained by xt_connlimit but has not yet made it into the netfilter
conntrack table. This makes the concurrent-connection count come out too low,
letting connections exceed the limit set in the iptables rule.
The fix has been partially backported from the above-mentioned upstream
commit: introduce a timestamp and record the owning CPU.
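For readability, after the hunk below the not-found branch of check_hlist()
effectively reads as follows (a condensed restatement of the diff, modulo the
local variables; not a separate change):

    found = nf_conntrack_find_get(net, zone, &conn->tuple);
    if (found == NULL) {
        u32 age = (u32)jiffies - conn->jiffies32;

        if (conn->cpu != raw_smp_processor_id() && age <= 2) {
            /* Recently added on another CPU; assume it simply
             * has not reached the conntrack table yet and keep
             * counting it. */
            length++;
        } else {
            hlist_del(&conn->node);
            kmem_cache_free(connlimit_conn_cachep, conn);
        }
        continue;
    }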
Signed-off-by: Alakesh Haloi <alakeshh(a)amazon.com>
Cc: Pablo Neira Ayuso <pablo(a)netfilter.org>
Cc: Jozsef Kadlecsik <kadlec(a)blackhole.kfki.hu>
Cc: Florian Westphal <fw(a)strlen.de>
Cc: "David S. Miller" <davem(a)davemloft.net>
Cc: stable(a)vger.kernel.org # v4.15 and before
Cc: netdev(a)vger.kernel.org
Cc: Dmitry Andrianov <dmitry.andrianov(a)alertme.com>
Cc: Justin Pettit <jpettit(a)vmware.com>
Cc: Yi-Hung Wei <yihung.wei(a)gmail.com>
---
net/netfilter/xt_connlimit.c | 28 ++++++++++++++++++++++++++--
1 file changed, 26 insertions(+), 2 deletions(-)
diff --git a/net/netfilter/xt_connlimit.c b/net/netfilter/xt_connlimit.c
index ffa8eec..e7b092b 100644
--- a/net/netfilter/xt_connlimit.c
+++ b/net/netfilter/xt_connlimit.c
@@ -47,6 +47,8 @@ struct xt_connlimit_conn {
struct hlist_node node;
struct nf_conntrack_tuple tuple;
union nf_inet_addr addr;
+ int cpu;
+ u32 jiffies32;
};
struct xt_connlimit_rb {
@@ -126,6 +128,8 @@ static bool add_hlist(struct hlist_head *head,
return false;
conn->tuple = *tuple;
conn->addr = *addr;
+ conn->cpu = raw_smp_processor_id();
+ conn->jiffies32 = (u32)jiffies;
hlist_add_head(&conn->node, head);
return true;
}
@@ -148,8 +152,26 @@ static unsigned int check_hlist(struct net *net,
hlist_for_each_entry_safe(conn, n, head, node) {
found = nf_conntrack_find_get(net, zone, &conn->tuple);
if (found == NULL) {
- hlist_del(&conn->node);
- kmem_cache_free(connlimit_conn_cachep, conn);
+ /* If the connection is not found, it may be because
+ * it has not made it into the conntrack table yet. We
+ * check whether it is a recently created connection
+ * on a different core and do not delete it in that
+ * case.
+ */
+
+ unsigned long a, b;
+ int cpu = raw_smp_processor_id();
+ __u32 age;
+
+ b = conn->jiffies32;
+ a = (u32)jiffies;
+ age = a - b;
+ if (conn->cpu != cpu && age <= 2) {
+ length++;
+ } else {
+ hlist_del(&conn->node);
+ kmem_cache_free(connlimit_conn_cachep, conn);
+ }
continue;
}
@@ -271,6 +293,8 @@ static void tree_nodes_free(struct rb_root *root,
conn->tuple = *tuple;
conn->addr = *addr;
+ conn->cpu = raw_smp_processor_id();
+ conn->jiffies32 = (u32)jiffies;
rbconn->addr = *addr;
INIT_HLIST_HEAD(&rbconn->hhead);
--
1.8.3.1
An iptables rule like the following on a multicore system will result in
accepting more connections than the limit set in the rule.
iptables -A INPUT -p tcp -m tcp --syn --dport 7777 -m connlimit \
--connlimit-above 2000 --connlimit-mask 0 -j DROP
In the check_hlist() function, connections that are found in the saved
connections but not in the netfilter conntrack table are deleted, on the
assumption that those connections no longer exist. But on multicore systems
there is a small time window in which a connection has been added to the
rb-tree maintained by xt_connlimit but has not yet made it into the netfilter
conntrack table. This makes the concurrent-connection count come out too low,
letting connections exceed the limit set in the iptables rule.
Connection 1 on Core 1, Connection 2 on Core 2:

  Initial state: list_length = N, conntrack_table_len = N

  Core 1: spin_lock_bh()
  Core 1: in check_hlist()
            a. loop over saved connections
               1. call nf_conntrack_find_get()
               2. if not found in 1,
                  i. call hlist_del()
            b. return total count to caller
            c. connection 1 gets added to the list of saved connections
  Core 1: spin_unlock_bh()

  Now: list_length = N + 1, conntrack_table_len = N

  Core 2: spin_lock_bh()
  Core 2: in check_hlist()
            a. loop over saved connections
               1. call nf_conntrack_find_get()
               2. if not found in 1,
                  i. call hlist_del()
                     [connection 1 was in the list but not
                      in nf_conntrack yet]
                  ii. connection 1 gets deleted

  Now: list_length = N, conntrack_table_len = N

            b. return total count to caller
            c. connection 2 gets added to the list of saved connections
  Core 2: spin_unlock_bh()

  Core 1: d. connection 1 gets added to nf_conntrack

  Now: list_length = N + 1, conntrack_table_len = N + 1

  Core 2: e. connection 2 gets added to nf_conntrack

  Now: list_length = N + 1, conntrack_table_len = N + 2
So we end up with N + 1 connections in the list but N + 2 in nf_conntrack,
eventually allowing more connections than the limit set in the rule.
This fix adds an additional field to track such pending connections, prevents
them from being deleted by another execution thread running on a different
core, and thus returns the correct count.
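In code terms, the not-found branch of check_hlist() then behaves roughly as
below (a condensed restatement of the hunks that follow, not a separate
change):

    found = nf_conntrack_find_get(net, zone, &conn->tuple);
    if (found == NULL) {
        if (conn->pending_add) {
            /* Added by another CPU but not yet confirmed in
             * conntrack; keep counting it instead of dropping it. */
            length++;
        } else {
            hlist_del(&conn->node);
            kmem_cache_free(connlimit_conn_cachep, conn);
        }
        continue;
    }
    /* The entry is now visible in conntrack. */
    conn->pending_add = false;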
Signed-off-by: Alakesh Haloi <alakeshh(a)amazon.com>
Cc: Pablo Neira Ayuso <pablo(a)netfilter.org>
Cc: Jozsef Kadlecsik <kadlec(a)blackhole.kfki.hu>
Cc: Florian Westphal <fw(a)strlen.de>
Cc: "David S. Miller" <davem(a)davemloft.net>
Cc: stable(a)vger.kernel.org # v4.15 and before
---
net/netfilter/xt_connlimit.c | 24 +++++++++++++++++++++---
1 file changed, 21 insertions(+), 3 deletions(-)
diff --git a/net/netfilter/xt_connlimit.c b/net/netfilter/xt_connlimit.c
index ffa8eec980e9..bd7563c209a4 100644
--- a/net/netfilter/xt_connlimit.c
+++ b/net/netfilter/xt_connlimit.c
@@ -47,6 +47,7 @@ struct xt_connlimit_conn {
struct hlist_node node;
struct nf_conntrack_tuple tuple;
union nf_inet_addr addr;
+ bool pending_add;
};
struct xt_connlimit_rb {
@@ -126,6 +127,7 @@ static bool add_hlist(struct hlist_head *head,
return false;
conn->tuple = *tuple;
conn->addr = *addr;
+ conn->pending_add = true;
hlist_add_head(&conn->node, head);
return true;
}
@@ -144,15 +146,31 @@ static unsigned int check_hlist(struct net *net,
*addit = true;
- /* check the saved connections */
+ /* check the saved connections
+ */
hlist_for_each_entry_safe(conn, n, head, node) {
found = nf_conntrack_find_get(net, zone, &conn->tuple);
if (found == NULL) {
- hlist_del(&conn->node);
- kmem_cache_free(connlimit_conn_cachep, conn);
+ /* It could be an already deleted connection or
+ * a new connection that is not yet in conntrack.
+ * If the former, delete it from the list; else
+ * increase the count and move on.
+ */
+ if (conn->pending_add) {
+ length++;
+ } else {
+ hlist_del(&conn->node);
+ kmem_cache_free(connlimit_conn_cachep, conn);
+ }
continue;
}
+ /* If it is a connection that was pending insertion into
+ * the connection tracking table before, it is time to
+ * clear the flag.
+ */
+ conn->pending_add = false;
+
found_ct = nf_ct_tuplehash_to_ctrack(found);
if (nf_ct_tuple_equal(&conn->tuple, tuple)) {
--
2.14.4
Hi x86 maintainers,
This is an important fix that I believe needs to be merged for 4.21.
Without it, applications calling fork() can potentially double-allocate
a protection key, causing lots of strange problems.
Thomas's Reviewed-by is on the actual fix, but not the selftest.
I would also be happy to send this as a pull request if you would
prefer.
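For context, the failure mode is roughly the following (a hypothetical
sketch, not the selftest in this series; it assumes the glibc pkey_alloc()/
pkey_mprotect() wrappers and pkey-capable hardware):

    #define _GNU_SOURCE
    #include <stdio.h>
    #include <sys/mman.h>
    #include <sys/wait.h>
    #include <unistd.h>

    int main(void)
    {
        size_t len = 4096;
        void *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
                         MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
        int key = pkey_alloc(0, 0);    /* key now allocated in this mm */

        if (buf == MAP_FAILED || key < 0) {
            perror("setup");
            return 1;
        }
        /* Protect the mapping with the key; the child inherits both. */
        pkey_mprotect(buf, len, PROT_READ | PROT_WRITE, key);

        if (fork() == 0) {
            /* Without the fix, the child's pkey allocation state can
             * be stale, so this may hand back the same key that still
             * guards the inherited mapping ("double allocation"). */
            int child_key = pkey_alloc(0, 0);

            printf("parent key %d, child got key %d\n", key, child_key);
            _exit(0);
        }
        wait(NULL);
        return 0;
    }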
Cc: x86(a)kernel.org
Cc: Thomas Gleixner <tglx(a)linutronix.de>
Cc: Ingo Molnar <mingo(a)redhat.com>
Cc: Borislav Petkov <bp(a)alien8.de>
Cc: "H. Peter Anvin" <hpa(a)zytor.com>
Cc: Peter Zijlstra <peterz(a)infradead.org>
Cc: Michael Ellerman <mpe(a)ellerman.id.au>
Cc: Will Deacon <will.deacon(a)arm.com>
Cc: Andy Lutomirski <luto(a)kernel.org>
Cc: Joerg Roedel <jroedel(a)suse.de>
Cc: stable(a)vger.kernel.org
The patch titled
Subject: slab: alien caches must not be initialized if the allocation of the alien cache failed
has been added to the -mm tree. Its filename is
slab-alien-caches-must-not-be-initialized-if-the-allocation-of-the-alien-cache-failed.patch
This patch should soon appear at
http://ozlabs.org/~akpm/mmots/broken-out/slab-alien-caches-must-not-be-init…
and later at
http://ozlabs.org/~akpm/mmotm/broken-out/slab-alien-caches-must-not-be-init…
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next and is updated
there every 3-4 working days
------------------------------------------------------
From: Christoph Lameter <cl(a)linux.com>
Subject: slab: alien caches must not be initialized if the allocation of the alien cache failed
Callers of __alloc_alien() check for NULL. We must do the same check in
__alloc_alien_cache to avoid NULL pointer dereferences on allocation
failures.
Link: http://lkml.kernel.org/r/010001680f42f192-82b4e12e-1565-4ee0-ae1f-1e9897490…
Signed-off-by: Christoph Lameter <cl(a)linux.com>
Reported-by: syzbot+d6ed4ec679652b4fd4e4(a)syzkaller.appspotmail.com
Reviewed-by: Andrew Morton <akpm(a)linux-foundation.org>
Cc: Pekka Enberg <penberg(a)kernel.org>
Cc: David Rientjes <rientjes(a)google.com>
Cc: Joonsoo Kim <iamjoonsoo.kim(a)lge.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/slab.c | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)
--- a/mm/slab.c~slab-alien-caches-must-not-be-initialized-if-the-allocation-of-the-alien-cache-failed
+++ a/mm/slab.c
@@ -666,8 +666,10 @@ static struct alien_cache *__alloc_alien
struct alien_cache *alc = NULL;
alc = kmalloc_node(memsize, gfp, node);
- init_arraycache(&alc->ac, entries, batch);
- spin_lock_init(&alc->lock);
+ if (alc) {
+ init_arraycache(&alc->ac, entries, batch);
+ spin_lock_init(&alc->lock);
+ }
return alc;
}
_
Patches currently in -mm which might be from cl(a)linux.com are
slab-alien-caches-must-not-be-initialized-if-the-allocation-of-the-alien-cache-failed.patch
The patch titled
Subject: fork, memcg: fix cached_stacks case
has been added to the -mm tree. Its filename is
fork-memcg-fix-cached_stacks-case.patch
This patch should soon appear at
http://ozlabs.org/~akpm/mmots/broken-out/fork-memcg-fix-cached_stacks-case.…
and later at
http://ozlabs.org/~akpm/mmotm/broken-out/fork-memcg-fix-cached_stacks-case.…
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next and is updated
there every 3-4 working days
------------------------------------------------------
From: Shakeel Butt <shakeelb(a)google.com>
Subject: fork, memcg: fix cached_stacks case
5eed6f1dff87 ("fork,memcg: fix crash in free_thread_stack on memcg charge
fail") fixes a crash caused by a failed memcg charge of the kernel stack.
However, that fix misses the cached_stacks case, which this patch addresses:
the same crash can still happen if the memcg charge of a cached stack fails.
Link: http://lkml.kernel.org/r/20190102180145.57406-1-shakeelb@google.com
Fixes: 5eed6f1dff87 ("fork,memcg: fix crash in free_thread_stack on memcg charge fail")
Signed-off-by: Shakeel Butt <shakeelb(a)google.com>
Cc: Rik van Riel <riel(a)surriel.com>
Cc: Roman Gushchin <guro(a)fb.com>
Cc: Michal Hocko <mhocko(a)suse.com>
Cc: Johannes Weiner <hannes(a)cmpxchg.org>
Cc: Tejun Heo <tj(a)kernel.org>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
kernel/fork.c | 1 +
1 file changed, 1 insertion(+)
--- a/kernel/fork.c~fork-memcg-fix-cached_stacks-case
+++ a/kernel/fork.c
@@ -221,6 +221,7 @@ static unsigned long *alloc_thread_stack
memset(s->addr, 0, THREAD_SIZE);
tsk->stack_vm_area = s;
+ tsk->stack = s->addr;
return s->addr;
}
_
Patches currently in -mm which might be from shakeelb(a)google.com are
fork-memcg-fix-cached_stacks-case.patch
Recently, Alakesh Haloi reported the following issue [1] with stable/4.14:
"""
An iptable rule like the following on a multicore systems will result in
accepting more connections than set in the rule.
iptables -A INPUT -p tcp -m tcp --syn --dport 7777 -m connlimit \
--connlimit-above 2000 --connlimit-mask 0 -j DROP
"""
He proposed a fix that is not in Linus's tree. The discussion went on to
confirm whether the issue was still reproducible with the mainline/nf.git tip,
and to either identify the upstream fix or re-submit the non-upstream fix.
Alakesh eventually was able to test with upstream, and reported that the issue
was still reproducible [2].
On that point, our findings diverge, at least in my test environment:
First, I verified that the suggested mainline fix for the issue [3] indeed
fixes it, by testing on v4.18 with the commit applied and then reverted (a
clean revert); the issue is reproducible with the commit reverted.
Then, with a consistent reproducer, I moved to nf.git, with HEAD on commit
a007232 ("netfilter: nf_conncount: fix argument order to find_next_bit"),
and the issue was not reproducible (even with 20+ threads on the client side,
the number Alakesh reported to reach 2150+ connections [4], and even with the
network interface IRQ affinity spread over more and more CPUs).
Either way, the suggested mainline fix does actually fix the issue in 4.14
for at least one environment. So it might well be that Alakesh's test
environment has differences/subtleties that lead to more connections being
accepted, and that more commits are needed for that particular environment.
But for now, with one bare-metal environment (24-core server, 4-core client)
verified, I decided to submit the patches for review/comments/testing, and to
look for additional fixes for that environment separately.
The fix is PATCH 4/4; PATCHes 1-3/4 are helpers for a cleaner backport.
All backports are simple, essentially consisting of refreshed context lines
and the use of older struct/file names.
Reviews from the netfilter maintainers are much appreciated, as I have no
previous experience in this area, and although the backports look simple and
build/run correctly, there is usually something that only more experienced
people will notice.
Thanks,
Mauricio
Links:
=====
[1] https://www.spinics.net/lists/stable/msg270040.html
[2] https://www.spinics.net/lists/stable/msg273669.html
[3] https://www.spinics.net/lists/stable/msg271300.html
[4] https://www.spinics.net/lists/stable/msg273669.html
Test-case:
=========
- v4.14.91 (original): client achieves 2000+ connections (6000 target)
with 3 threads.
server # iptables -F
server # iptables -A INPUT -p tcp -m tcp --syn --dport 7777 -m connlimit --connlimit-above 2000 --connlimit-mask 0 -j DROP
server # iptables -L
Chain INPUT (policy ACCEPT)
target prot opt source destination
DROP tcp -- anywhere anywhere tcp dpt:7777 flags:FIN,SYN,RST,ACK/SYN #conn src/0 > 2000
Chain FORWARD (policy ACCEPT)
target prot opt source destination
Chain OUTPUT (policy ACCEPT)
target prot opt source destination
server # ulimit -SHn 65000
server # ruby server.rb
<... listening ...>
client # ulimit -SHn 65000
client # ruby client.rb 10.230.56.100 7777 6000 3
Connecting to ["10.230.56.100"]:7777 6000 times with 3
1
2
3
<...>
2000
<...>
6000
Target reached. Thread finishing
6001
Target reached. Thread finishing
6002
Target reached. Thread finishing
Threads done. 6002 connections
press enter to exit
- v4.14.91 + patches: client only achieved 2000 connections.
server # (same procedure)
client # (same procedure)
Connecting to ["10.230.56.100"]:7777 6000 times with 3
1
2
3
<...>
2000
<... blocked for a while...>
failed to create connection: Connection timed out - connect(2) for "10.230.56.100" port 7777
failed to create connection: Connection timed out - connect(2) for "10.230.56.100" port 7777
failed to create connection: Connection timed out - connect(2) for "10.230.56.100" port 7777
Threads done. 2000 connections
press enter to exit
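For anyone reproducing without the Ruby scripts (server.rb/client.rb are not
included in this thread), a rough C equivalent of the client side could look
like the sketch below; the name connflood.c and the exact behaviour are
assumptions inferred from the output above, not the original script.

    /*
     * connflood.c -- hypothetical C sketch of the client used above:
     * N threads open TCP connections to host:port, keep them open, and
     * stop once a shared target count is reached.
     *
     * Build: cc -pthread -o connflood connflood.c
     * Usage: ./connflood 10.230.56.100 7777 6000 3
     */
    #include <arpa/inet.h>
    #include <netinet/in.h>
    #include <pthread.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include <sys/socket.h>
    #include <unistd.h>

    static const char *host;
    static int port, target, count;
    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

    static void *worker(void *arg)
    {
        (void)arg;
        for (;;) {
            struct sockaddr_in sa;
            int fd, n;

            pthread_mutex_lock(&lock);
            if (count >= target) {
                pthread_mutex_unlock(&lock);
                printf("Target reached. Thread finishing\n");
                return NULL;
            }
            n = ++count;
            pthread_mutex_unlock(&lock);

            memset(&sa, 0, sizeof(sa));
            sa.sin_family = AF_INET;
            sa.sin_port = htons(port);
            inet_pton(AF_INET, host, &sa.sin_addr);

            fd = socket(AF_INET, SOCK_STREAM, 0);
            if (fd < 0 || connect(fd, (struct sockaddr *)&sa, sizeof(sa)) < 0) {
                perror("failed to create connection");
                if (fd >= 0)
                    close(fd);
                continue;
            }
            printf("%d\n", n);    /* fd left open on purpose */
        }
    }

    int main(int argc, char **argv)
    {
        pthread_t tids[64];
        int i, nthreads;

        if (argc != 5) {
            fprintf(stderr, "usage: %s host port target threads\n", argv[0]);
            return 1;
        }
        host = argv[1];
        port = atoi(argv[2]);
        target = atoi(argv[3]);
        nthreads = atoi(argv[4]);
        if (nthreads > 64)
            nthreads = 64;

        printf("Connecting to %s:%d %d times with %d threads\n",
               host, port, target, nthreads);
        for (i = 0; i < nthreads; i++)
            pthread_create(&tids[i], NULL, worker, NULL);
        for (i = 0; i < nthreads; i++)
            pthread_join(tids[i], NULL);

        printf("Threads done. %d connections\n", count);
        printf("press enter to exit\n");
        getchar();    /* keep the open sockets alive until the user exits */
        return 0;
    }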
Florian Westphal (2):
netfilter: xt_connlimit: don't store address in the conn nodes
netfilter: nf_conncount: fix garbage collection confirm race
Pablo Neira Ayuso (1):
netfilter: nf_conncount: expose connection list interface
Yi-Hung Wei (1):
netfilter: nf_conncount: Fix garbage collection with zones
include/net/netfilter/nf_conntrack_count.h | 15 +++++
net/netfilter/xt_connlimit.c | 99 +++++++++++++++++++++++-------
2 files changed, 91 insertions(+), 23 deletions(-)
create mode 100644 include/net/netfilter/nf_conntrack_count.h
--
2.7.4
The patch titled
Subject: lib/gen_crc64table.c: don't depend on linux headers being installed
has been removed from the -mm tree. Its filename was
lib-dont-depend-on-linux-headers-being-installed.patch
This patch was dropped because it was merged into mainline or a subsystem tree
------------------------------------------------------
From: NeilBrown <neil(a)brown.name>
Subject: lib/gen_crc64table.c: don't depend on linux headers being installed
gen_crc64table requires the Linux include files to be installed in
/usr/include/linux. This is a new requirement, so hosts that could
previously build the kernel now cannot.
gen_crc64table imposes this requirement by including <linux/swab.h>, but
nothing from that header is actually used.
So remove the #include, so that the linux headers no longer need to be
installed.
Link: http://lkml.kernel.org/r/87y3899gzi.fsf@notabene.neil.brown.name
Fixes: feba04fd2cf8 ("lib: add crc64 calculation routines")
Signed-off-by: NeilBrown <neil(a)brown.name>
Acked-by: Coly Li <colyli(a)suse.de>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
--- a/lib/gen_crc64table.c~lib-dont-depend-on-linux-headers-being-installed
+++ a/lib/gen_crc64table.c
@@ -16,8 +16,6 @@
#include <inttypes.h>
#include <stdio.h>
-#include <linux/swab.h>
-
#define CRC64_ECMA182_POLY 0x42F0E1EBA9EA3693ULL
static uint64_t crc64_table[256] = {0};
_
Patches currently in -mm which might be from neil(a)brown.name are
The patch titled
Subject: memcg, oom: notify on oom killer invocation from the charge path
has been removed from the -mm tree. Its filename was
memcg-oom-notify-on-oom-killer-invocation-from-the-charge-path.patch
This patch was dropped because it was merged into mainline or a subsystem tree
------------------------------------------------------
From: Michal Hocko <mhocko(a)suse.com>
Subject: memcg, oom: notify on oom killer invocation from the charge path
Burt Holzman noticed that memcg v1 no longer notifies about OOM events via
eventfd. The reason is that 29ef680ae7c2 ("memcg, oom: move
out_of_memory back to the charge path") moved the oom handling back to
the charge path. While doing so, the notification was left behind in
mem_cgroup_oom_synchronize().
Fix the issue by replicating the oom hierarchy locking and the
notification.
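For reference, with this patch the tail of mem_cgroup_oom() reads roughly as
below (a consolidation of the hunks that follow, not additional changes):

    mem_cgroup_mark_under_oom(memcg);

    locked = mem_cgroup_oom_trylock(memcg);
    if (locked)
        mem_cgroup_oom_notify(memcg);

    mem_cgroup_unmark_under_oom(memcg);
    if (mem_cgroup_out_of_memory(memcg, mask, order))
        ret = OOM_SUCCESS;
    else
        ret = OOM_FAILED;

    if (locked)
        mem_cgroup_oom_unlock(memcg);

    return ret;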
Link: http://lkml.kernel.org/r/20181224091107.18354-1-mhocko@kernel.org
Fixes: 29ef680ae7c2 ("memcg, oom: move out_of_memory back to the charge path")
Signed-off-by: Michal Hocko <mhocko(a)suse.com>
Reported-by: Burt Holzman <burt(a)fnal.gov>
Acked-by: Johannes Weiner <hannes(a)cmpxchg.org>
Cc: Vladimir Davydov <vdavydov.dev(a)gmail.com>
Cc: <stable(a)vger.kernel.org> [4.19+]
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
--- a/mm/memcontrol.c~memcg-oom-notify-on-oom-killer-invocation-from-the-charge-path
+++ a/mm/memcontrol.c
@@ -1673,6 +1673,9 @@ enum oom_status {
static enum oom_status mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
{
+ enum oom_status ret;
+ bool locked;
+
if (order > PAGE_ALLOC_COSTLY_ORDER)
return OOM_SKIPPED;
@@ -1707,10 +1710,23 @@ static enum oom_status mem_cgroup_oom(st
return OOM_ASYNC;
}
+ mem_cgroup_mark_under_oom(memcg);
+
+ locked = mem_cgroup_oom_trylock(memcg);
+
+ if (locked)
+ mem_cgroup_oom_notify(memcg);
+
+ mem_cgroup_unmark_under_oom(memcg);
if (mem_cgroup_out_of_memory(memcg, mask, order))
- return OOM_SUCCESS;
+ ret = OOM_SUCCESS;
+ else
+ ret = OOM_FAILED;
+
+ if (locked)
+ mem_cgroup_oom_unlock(memcg);
- return OOM_FAILED;
+ return ret;
}
/**
_
Patches currently in -mm which might be from mhocko(a)suse.com are
mm-memcg-fix-reclaim-deadlock-with-writeback.patch