commit e1bf1687740ce1a3598a1c5e452b852ff2190682 upstream.
This reverts commit 870190a9ec9075205c0fa795a09fa931694a3ff1.
It was not a good idea. The custom hash table was a much better fit for this purpose.
A fast lookup is not essential, in fact for most cases there is no lookup at all because original tuple is not taken and can be used as-is. What needs to be fast is insertion and deletion.
rhlist removal however requires a rhlist walk. We can have thousands of entries in such a list if source port/addresses are reused for multiple flows, if this happens removal requests are so expensive that deletions of a few thousand flows can take several seconds(!).
The advantages that we got from rhashtable are: 1) table auto-sizing 2) multiple locks
1) would be nice to have, but it is not essential as we have at most one lookup per new flow, so even a million flows in the bysource table are not a problem compared to current deletion cost. 2) is easy to add to custom hash table.
I tried to add hlist_node to rhlist to speed up rhltable_remove but this isn't doable without changing semantics. rhltable_remove_fast will check that the to-be-deleted object is part of the table and that requires a list walk that we want to avoid.
Furthermore, using hlist_node increases size of struct rhlist_head, which in turn increases nf_conn size.
Link: https://bugzilla.kernel.org/show_bug.cgi?id=196821 Reported-by: Ivan Babrou ibobrik@gmail.com Signed-off-by: Florian Westphal fw@strlen.de Signed-off-by: Pablo Neira Ayuso pablo@netfilter.org --- Hi Greg,
this is a 4.9.y backport of the same revert that already went into 4.13.y; please consider applying this. I briefly tested this in kvm, at very least it doesn't break nat redirect in obvious ways...
include/net/netfilter/nf_conntrack.h | 3 +- include/net/netfilter/nf_nat.h | 1 - net/netfilter/nf_nat_core.c | 133 +++++++++++++++-------------------- 3 files changed, 57 insertions(+), 80 deletions(-)
diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h index d9d52c020a70..9ae819e27940 100644 --- a/include/net/netfilter/nf_conntrack.h +++ b/include/net/netfilter/nf_conntrack.h @@ -17,7 +17,6 @@ #include <linux/bitops.h> #include <linux/compiler.h> #include <linux/atomic.h> -#include <linux/rhashtable.h>
#include <linux/netfilter/nf_conntrack_tcp.h> #include <linux/netfilter/nf_conntrack_dccp.h> @@ -101,7 +100,7 @@ struct nf_conn { possible_net_t ct_net;
#if IS_ENABLED(CONFIG_NF_NAT) - struct rhlist_head nat_bysource; + struct hlist_node nat_bysource; #endif /* all members below initialized via memset */ u8 __nfct_init_offset[0]; diff --git a/include/net/netfilter/nf_nat.h b/include/net/netfilter/nf_nat.h index c327a431a6f3..02515f7ed4cc 100644 --- a/include/net/netfilter/nf_nat.h +++ b/include/net/netfilter/nf_nat.h @@ -1,6 +1,5 @@ #ifndef _NF_NAT_H #define _NF_NAT_H -#include <linux/rhashtable.h> #include <linux/netfilter_ipv4.h> #include <linux/netfilter/nf_nat.h> #include <net/netfilter/nf_conntrack_tuple.h> diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c index 2916f4815c9c..87d67e714dc4 100644 --- a/net/netfilter/nf_nat_core.c +++ b/net/netfilter/nf_nat_core.c @@ -30,19 +30,17 @@ #include <net/netfilter/nf_conntrack_zones.h> #include <linux/netfilter/nf_nat.h>
+static DEFINE_SPINLOCK(nf_nat_lock); + static DEFINE_MUTEX(nf_nat_proto_mutex); static const struct nf_nat_l3proto __rcu *nf_nat_l3protos[NFPROTO_NUMPROTO] __read_mostly; static const struct nf_nat_l4proto __rcu **nf_nat_l4protos[NFPROTO_NUMPROTO] __read_mostly;
-struct nf_nat_conn_key { - const struct net *net; - const struct nf_conntrack_tuple *tuple; - const struct nf_conntrack_zone *zone; -}; - -static struct rhltable nf_nat_bysource_table; +static struct hlist_head *nf_nat_bysource __read_mostly; +static unsigned int nf_nat_htable_size __read_mostly; +static unsigned int nf_nat_hash_rnd __read_mostly;
inline const struct nf_nat_l3proto * __nf_nat_l3proto_find(u8 family) @@ -121,17 +119,19 @@ int nf_xfrm_me_harder(struct net *net, struct sk_buff *skb, unsigned int family) EXPORT_SYMBOL(nf_xfrm_me_harder); #endif /* CONFIG_XFRM */
-static u32 nf_nat_bysource_hash(const void *data, u32 len, u32 seed) +/* We keep an extra hash for each conntrack, for fast searching. */ +static inline unsigned int +hash_by_src(const struct net *n, const struct nf_conntrack_tuple *tuple) { - const struct nf_conntrack_tuple *t; - const struct nf_conn *ct = data; + unsigned int hash; + + get_random_once(&nf_nat_hash_rnd, sizeof(nf_nat_hash_rnd));
- t = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; /* Original src, to ensure we map it consistently if poss. */ + hash = jhash2((u32 *)&tuple->src, sizeof(tuple->src) / sizeof(u32), + tuple->dst.protonum ^ nf_nat_hash_rnd ^ net_hash_mix(n));
- seed ^= net_hash_mix(nf_ct_net(ct)); - return jhash2((const u32 *)&t->src, sizeof(t->src) / sizeof(u32), - t->dst.protonum ^ seed); + return reciprocal_scale(hash, nf_nat_htable_size); }
/* Is this tuple already taken? (not by us) */ @@ -187,28 +187,6 @@ same_src(const struct nf_conn *ct, t->src.u.all == tuple->src.u.all); }
-static int nf_nat_bysource_cmp(struct rhashtable_compare_arg *arg, - const void *obj) -{ - const struct nf_nat_conn_key *key = arg->key; - const struct nf_conn *ct = obj; - - if (!same_src(ct, key->tuple) || - !net_eq(nf_ct_net(ct), key->net) || - !nf_ct_zone_equal(ct, key->zone, IP_CT_DIR_ORIGINAL)) - return 1; - - return 0; -} - -static struct rhashtable_params nf_nat_bysource_params = { - .head_offset = offsetof(struct nf_conn, nat_bysource), - .obj_hashfn = nf_nat_bysource_hash, - .obj_cmpfn = nf_nat_bysource_cmp, - .nelem_hint = 256, - .min_size = 1024, -}; - /* Only called for SRC manip */ static int find_appropriate_src(struct net *net, @@ -219,26 +197,22 @@ find_appropriate_src(struct net *net, struct nf_conntrack_tuple *result, const struct nf_nat_range *range) { + unsigned int h = hash_by_src(net, tuple); const struct nf_conn *ct; - struct nf_nat_conn_key key = { - .net = net, - .tuple = tuple, - .zone = zone - }; - struct rhlist_head *hl, *h; - - hl = rhltable_lookup(&nf_nat_bysource_table, &key, - nf_nat_bysource_params);
- rhl_for_each_entry_rcu(ct, h, hl, nat_bysource) { - nf_ct_invert_tuplepr(result, - &ct->tuplehash[IP_CT_DIR_REPLY].tuple); - result->dst = tuple->dst; - - if (in_range(l3proto, l4proto, result, range)) - return 1; + hlist_for_each_entry_rcu(ct, &nf_nat_bysource[h], nat_bysource) { + if (same_src(ct, tuple) && + net_eq(net, nf_ct_net(ct)) && + nf_ct_zone_equal(ct, zone, IP_CT_DIR_ORIGINAL)) { + /* Copy source part from reply tuple. */ + nf_ct_invert_tuplepr(result, + &ct->tuplehash[IP_CT_DIR_REPLY].tuple); + result->dst = tuple->dst; + + if (in_range(l3proto, l4proto, result, range)) + return 1; + } } - return 0; }
@@ -411,6 +385,7 @@ nf_nat_setup_info(struct nf_conn *ct, const struct nf_nat_range *range, enum nf_nat_manip_type maniptype) { + struct net *net = nf_ct_net(ct); struct nf_conntrack_tuple curr_tuple, new_tuple; struct nf_conn_nat *nat;
@@ -452,19 +427,16 @@ nf_nat_setup_info(struct nf_conn *ct, }
if (maniptype == NF_NAT_MANIP_SRC) { - struct nf_nat_conn_key key = { - .net = nf_ct_net(ct), - .tuple = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, - .zone = nf_ct_zone(ct), - }; - int err; - - err = rhltable_insert_key(&nf_nat_bysource_table, - &key, - &ct->nat_bysource, - nf_nat_bysource_params); - if (err) - return NF_DROP; + unsigned int srchash; + + srchash = hash_by_src(net, + &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); + spin_lock_bh(&nf_nat_lock); + /* nf_conntrack_alter_reply might re-allocate extension aera */ + nat = nfct_nat(ct); + hlist_add_head_rcu(&ct->nat_bysource, + &nf_nat_bysource[srchash]); + spin_unlock_bh(&nf_nat_lock); }
/* It's done. */ @@ -578,9 +550,10 @@ static int nf_nat_proto_clean(struct nf_conn *ct, void *data) * Else, when the conntrack is destoyed, nf_nat_cleanup_conntrack() * will delete entry from already-freed table. */ + spin_lock_bh(&nf_nat_lock); + hlist_del_rcu(&ct->nat_bysource); ct->status &= ~IPS_NAT_DONE_MASK; - rhltable_remove(&nf_nat_bysource_table, &ct->nat_bysource, - nf_nat_bysource_params); + spin_unlock_bh(&nf_nat_lock);
/* don't delete conntrack. Although that would make things a lot * simpler, we'd end up flushing all conntracks on nat rmmod. @@ -710,8 +683,11 @@ static void nf_nat_cleanup_conntrack(struct nf_conn *ct) if (!nat) return;
- rhltable_remove(&nf_nat_bysource_table, &ct->nat_bysource, - nf_nat_bysource_params); + NF_CT_ASSERT(ct->status & IPS_SRC_NAT_DONE); + + spin_lock_bh(&nf_nat_lock); + hlist_del_rcu(&ct->nat_bysource); + spin_unlock_bh(&nf_nat_lock); }
static struct nf_ct_ext_type nat_extend __read_mostly = { @@ -846,13 +822,16 @@ static int __init nf_nat_init(void) { int ret;
- ret = rhltable_init(&nf_nat_bysource_table, &nf_nat_bysource_params); - if (ret) - return ret; + /* Leave them the same for the moment. */ + nf_nat_htable_size = nf_conntrack_htable_size; + + nf_nat_bysource = nf_ct_alloc_hashtable(&nf_nat_htable_size, 0); + if (!nf_nat_bysource) + return -ENOMEM;
ret = nf_ct_extend_register(&nat_extend); if (ret < 0) { - rhltable_destroy(&nf_nat_bysource_table); + nf_ct_free_hashtable(nf_nat_bysource, nf_nat_htable_size); printk(KERN_ERR "nf_nat_core: Unable to register extension\n"); return ret; } @@ -876,7 +855,7 @@ static int __init nf_nat_init(void) return 0;
cleanup_extend: - rhltable_destroy(&nf_nat_bysource_table); + nf_ct_free_hashtable(nf_nat_bysource, nf_nat_htable_size); nf_ct_extend_unregister(&nat_extend); return ret; } @@ -896,8 +875,8 @@ static void __exit nf_nat_cleanup(void)
for (i = 0; i < NFPROTO_NUMPROTO; i++) kfree(nf_nat_l4protos[i]); - - rhltable_destroy(&nf_nat_bysource_table); + synchronize_net(); + nf_ct_free_hashtable(nf_nat_bysource, nf_nat_htable_size); }
MODULE_LICENSE("GPL");
On Sun, Nov 12, 2017 at 09:23:47PM +0100, Florian Westphal wrote:
commit e1bf1687740ce1a3598a1c5e452b852ff2190682 upstream.
This reverts commit 870190a9ec9075205c0fa795a09fa931694a3ff1.
It was not a good idea. The custom hash table was a much better fit for this purpose.
A fast lookup is not essential, in fact for most cases there is no lookup at all because original tuple is not taken and can be used as-is. What needs to be fast is insertion and deletion.
rhlist removal however requires a rhlist walk. We can have thousands of entries in such a list if source port/addresses are reused for multiple flows, if this happens removal requests are so expensive that deletions of a few thousand flows can take several seconds(!).
The advantages that we got from rhashtable are:
table auto-sizing
multiple locks
would be nice to have, but it is not essential as we have at
most one lookup per new flow, so even a million flows in the bysource table are not a problem compared to current deletion cost. 2) is easy to add to custom hash table.
I tried to add hlist_node to rhlist to speed up rhltable_remove but this isn't doable without changing semantics. rhltable_remove_fast will check that the to-be-deleted object is part of the table and that requires a list walk that we want to avoid.
Furthermore, using hlist_node increases size of struct rhlist_head, which in turn increases nf_conn size.
Link: https://bugzilla.kernel.org/show_bug.cgi?id=196821 Reported-by: Ivan Babrou ibobrik@gmail.com Signed-off-by: Florian Westphal fw@strlen.de Signed-off-by: Pablo Neira Ayuso pablo@netfilter.org
Hi Greg,
this is a 4.9.y backport of the same revert that already went into 4.13.y; please consider applying this. I briefly tested this in kvm, at very least it doesn't break nat redirect in obvious ways...
Very nice, thanks for this, now queued up.
greg k-h
this should belong to you. with the revert the kernel crashes for me
[ 24.838120] Unable to handle kernel NULL pointer dereference at virtual address 00000055 [ 24.846193] pgd = c0004000 [ 24.848893] [00000055] *pgd=00000000 [ 24.852472] Internal error: Oops - BUG: 817 [#1] PREEMPT SMP ARM [ 24.858463] Modules linked in: xhci_plat_hcd xhci_pci xhci_hcd ohci_hcd ehci_pci ehci_platform ehci_hcd usbcore usb_common nls_base qca_ssdk gpio_pca953x mii_gpio wil6210 ath10k_pci ath10k_core ath9k ath9k_common ath9k_hw ath mac80211 cfg80211 compat [ 24.880852] CPU: 2 PID: 0 Comm: swapper/2 Not tainted 4.9.62-rc1 #90 [ 24.887189] Hardware name: AnnapurnaLabs Alpine (Device Tree) [ 24.892921] task: ef029ac0 task.stack: ef05a000 [ 24.897444] PC is at nf_nat_cleanup_conntrack+0x4c/0x74 [ 24.902657] LR is at nf_nat_cleanup_conntrack+0x38/0x74 [ 24.907869] pc : [<c04858c8>] lr : [<c04858b4>] psr: 60000153 [ 24.907869] sp : ef05bb58 ip : ef05bb58 fp : ef05bb6c [ 24.919317] r10: ed230cc0 r9 : ed230c00 r8 : edf45800 [ 24.924529] r7 : ebcd2f00 r6 : ec33739e r5 : c0892294 r4 : ebcd2f00 [ 24.931040] r3 : 00000000 r2 : 00000055 r1 : 00000000 r0 : c0892718 [ 24.937551] Flags: nZCv IRQs on FIQs off Mode SVC_32 ISA ARM Segment user [ 24.944755] Control: 10c5387d Table: 2bd1006a DAC: 00000055 [ 24.950486] Process swapper/2 (pid: 0, stack limit = 0xef05a210) [ 24.956477] Stack: (0xef05bb58 to 0xef05c000)
Am 12.11.2017 um 21:23 schrieb Florian Westphal:
commit e1bf1687740ce1a3598a1c5e452b852ff2190682 upstream.
This reverts commit 870190a9ec9075205c0fa795a09fa931694a3ff1.
It was not a good idea. The custom hash table was a much better fit for this purpose.
A fast lookup is not essential, in fact for most cases there is no lookup at all because original tuple is not taken and can be used as-is. What needs to be fast is insertion and deletion.
rhlist removal however requires a rhlist walk. We can have thousands of entries in such a list if source port/addresses are reused for multiple flows, if this happens removal requests are so expensive that deletions of a few thousand flows can take several seconds(!).
The advantages that we got from rhashtable are:
table auto-sizing
multiple locks
would be nice to have, but it is not essential as we have at
most one lookup per new flow, so even a million flows in the bysource table are not a problem compared to current deletion cost. 2) is easy to add to custom hash table.
I tried to add hlist_node to rhlist to speed up rhltable_remove but this isn't doable without changing semantics. rhltable_remove_fast will check that the to-be-deleted object is part of the table and that requires a list walk that we want to avoid.
Furthermore, using hlist_node increases size of struct rhlist_head, which in turn increases nf_conn size.
Link: https://bugzilla.kernel.org/show_bug.cgi?id=196821 Reported-by: Ivan Babrou ibobrik@gmail.com Signed-off-by: Florian Westphal fw@strlen.de Signed-off-by: Pablo Neira Ayuso pablo@netfilter.org
Hi Greg,
this is a 4.9.y backport of the same revert that already went into 4.13.y; please consider applying this. I briefly tested this in kvm, at very least it doesn't break nat redirect in obvious ways...
include/net/netfilter/nf_conntrack.h | 3 +- include/net/netfilter/nf_nat.h | 1 - net/netfilter/nf_nat_core.c | 133 +++++++++++++++-------------------- 3 files changed, 57 insertions(+), 80 deletions(-)
diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h index d9d52c020a70..9ae819e27940 100644 --- a/include/net/netfilter/nf_conntrack.h +++ b/include/net/netfilter/nf_conntrack.h @@ -17,7 +17,6 @@ #include <linux/bitops.h> #include <linux/compiler.h> #include <linux/atomic.h> -#include <linux/rhashtable.h> #include <linux/netfilter/nf_conntrack_tcp.h> #include <linux/netfilter/nf_conntrack_dccp.h> @@ -101,7 +100,7 @@ struct nf_conn { possible_net_t ct_net; #if IS_ENABLED(CONFIG_NF_NAT)
- struct rhlist_head nat_bysource;
- struct hlist_node nat_bysource; #endif /* all members below initialized via memset */ u8 __nfct_init_offset[0];
diff --git a/include/net/netfilter/nf_nat.h b/include/net/netfilter/nf_nat.h index c327a431a6f3..02515f7ed4cc 100644 --- a/include/net/netfilter/nf_nat.h +++ b/include/net/netfilter/nf_nat.h @@ -1,6 +1,5 @@ #ifndef _NF_NAT_H #define _NF_NAT_H -#include <linux/rhashtable.h> #include <linux/netfilter_ipv4.h> #include <linux/netfilter/nf_nat.h> #include <net/netfilter/nf_conntrack_tuple.h> diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c index 2916f4815c9c..87d67e714dc4 100644 --- a/net/netfilter/nf_nat_core.c +++ b/net/netfilter/nf_nat_core.c @@ -30,19 +30,17 @@ #include <net/netfilter/nf_conntrack_zones.h> #include <linux/netfilter/nf_nat.h> +static DEFINE_SPINLOCK(nf_nat_lock);
- static DEFINE_MUTEX(nf_nat_proto_mutex); static const struct nf_nat_l3proto __rcu *nf_nat_l3protos[NFPROTO_NUMPROTO] __read_mostly; static const struct nf_nat_l4proto __rcu **nf_nat_l4protos[NFPROTO_NUMPROTO] __read_mostly;
-struct nf_nat_conn_key {
- const struct net *net;
- const struct nf_conntrack_tuple *tuple;
- const struct nf_conntrack_zone *zone;
-};
-static struct rhltable nf_nat_bysource_table; +static struct hlist_head *nf_nat_bysource __read_mostly; +static unsigned int nf_nat_htable_size __read_mostly; +static unsigned int nf_nat_hash_rnd __read_mostly; inline const struct nf_nat_l3proto * __nf_nat_l3proto_find(u8 family) @@ -121,17 +119,19 @@ int nf_xfrm_me_harder(struct net *net, struct sk_buff *skb, unsigned int family) EXPORT_SYMBOL(nf_xfrm_me_harder); #endif /* CONFIG_XFRM */ -static u32 nf_nat_bysource_hash(const void *data, u32 len, u32 seed) +/* We keep an extra hash for each conntrack, for fast searching. */ +static inline unsigned int +hash_by_src(const struct net *n, const struct nf_conntrack_tuple *tuple) {
- const struct nf_conntrack_tuple *t;
- const struct nf_conn *ct = data;
- unsigned int hash;
- get_random_once(&nf_nat_hash_rnd, sizeof(nf_nat_hash_rnd));
- t = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; /* Original src, to ensure we map it consistently if poss. */
- hash = jhash2((u32 *)&tuple->src, sizeof(tuple->src) / sizeof(u32),
tuple->dst.protonum ^ nf_nat_hash_rnd ^ net_hash_mix(n));
- seed ^= net_hash_mix(nf_ct_net(ct));
- return jhash2((const u32 *)&t->src, sizeof(t->src) / sizeof(u32),
t->dst.protonum ^ seed);
- return reciprocal_scale(hash, nf_nat_htable_size); }
/* Is this tuple already taken? (not by us) */ @@ -187,28 +187,6 @@ same_src(const struct nf_conn *ct, t->src.u.all == tuple->src.u.all); } -static int nf_nat_bysource_cmp(struct rhashtable_compare_arg *arg,
const void *obj)
-{
- const struct nf_nat_conn_key *key = arg->key;
- const struct nf_conn *ct = obj;
- if (!same_src(ct, key->tuple) ||
!net_eq(nf_ct_net(ct), key->net) ||
!nf_ct_zone_equal(ct, key->zone, IP_CT_DIR_ORIGINAL))
return 1;
- return 0;
-}
-static struct rhashtable_params nf_nat_bysource_params = {
- .head_offset = offsetof(struct nf_conn, nat_bysource),
- .obj_hashfn = nf_nat_bysource_hash,
- .obj_cmpfn = nf_nat_bysource_cmp,
- .nelem_hint = 256,
- .min_size = 1024,
-};
- /* Only called for SRC manip */ static int find_appropriate_src(struct net *net,
@@ -219,26 +197,22 @@ find_appropriate_src(struct net *net, struct nf_conntrack_tuple *result, const struct nf_nat_range *range) {
- unsigned int h = hash_by_src(net, tuple); const struct nf_conn *ct;
- struct nf_nat_conn_key key = {
.net = net,
.tuple = tuple,
.zone = zone
- };
- struct rhlist_head *hl, *h;
- hl = rhltable_lookup(&nf_nat_bysource_table, &key,
nf_nat_bysource_params);
- rhl_for_each_entry_rcu(ct, h, hl, nat_bysource) {
nf_ct_invert_tuplepr(result,
&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
result->dst = tuple->dst;
if (in_range(l3proto, l4proto, result, range))
return 1;
- hlist_for_each_entry_rcu(ct, &nf_nat_bysource[h], nat_bysource) {
if (same_src(ct, tuple) &&
net_eq(net, nf_ct_net(ct)) &&
nf_ct_zone_equal(ct, zone, IP_CT_DIR_ORIGINAL)) {
/* Copy source part from reply tuple. */
nf_ct_invert_tuplepr(result,
&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
result->dst = tuple->dst;
if (in_range(l3proto, l4proto, result, range))
return 1;
}}
- return 0; }
@@ -411,6 +385,7 @@ nf_nat_setup_info(struct nf_conn *ct, const struct nf_nat_range *range, enum nf_nat_manip_type maniptype) {
- struct net *net = nf_ct_net(ct); struct nf_conntrack_tuple curr_tuple, new_tuple; struct nf_conn_nat *nat;
@@ -452,19 +427,16 @@ nf_nat_setup_info(struct nf_conn *ct, } if (maniptype == NF_NAT_MANIP_SRC) {
struct nf_nat_conn_key key = {
.net = nf_ct_net(ct),
.tuple = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
.zone = nf_ct_zone(ct),
};
int err;
err = rhltable_insert_key(&nf_nat_bysource_table,
&key,
&ct->nat_bysource,
nf_nat_bysource_params);
if (err)
return NF_DROP;
unsigned int srchash;
srchash = hash_by_src(net,
&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
spin_lock_bh(&nf_nat_lock);
/* nf_conntrack_alter_reply might re-allocate extension aera */
nat = nfct_nat(ct);
hlist_add_head_rcu(&ct->nat_bysource,
&nf_nat_bysource[srchash]);
}spin_unlock_bh(&nf_nat_lock);
/* It's done. */ @@ -578,9 +550,10 @@ static int nf_nat_proto_clean(struct nf_conn *ct, void *data) * Else, when the conntrack is destoyed, nf_nat_cleanup_conntrack() * will delete entry from already-freed table. */
- spin_lock_bh(&nf_nat_lock);
- hlist_del_rcu(&ct->nat_bysource); ct->status &= ~IPS_NAT_DONE_MASK;
- rhltable_remove(&nf_nat_bysource_table, &ct->nat_bysource,
nf_nat_bysource_params);
- spin_unlock_bh(&nf_nat_lock);
/* don't delete conntrack. Although that would make things a lot * simpler, we'd end up flushing all conntracks on nat rmmod. @@ -710,8 +683,11 @@ static void nf_nat_cleanup_conntrack(struct nf_conn *ct) if (!nat) return;
- rhltable_remove(&nf_nat_bysource_table, &ct->nat_bysource,
nf_nat_bysource_params);
- NF_CT_ASSERT(ct->status & IPS_SRC_NAT_DONE);
- spin_lock_bh(&nf_nat_lock);
- hlist_del_rcu(&ct->nat_bysource);
- spin_unlock_bh(&nf_nat_lock); }
static struct nf_ct_ext_type nat_extend __read_mostly = { @@ -846,13 +822,16 @@ static int __init nf_nat_init(void) { int ret;
- ret = rhltable_init(&nf_nat_bysource_table, &nf_nat_bysource_params);
- if (ret)
return ret;
- /* Leave them the same for the moment. */
- nf_nat_htable_size = nf_conntrack_htable_size;
- nf_nat_bysource = nf_ct_alloc_hashtable(&nf_nat_htable_size, 0);
- if (!nf_nat_bysource)
return -ENOMEM;
ret = nf_ct_extend_register(&nat_extend); if (ret < 0) {
rhltable_destroy(&nf_nat_bysource_table);
printk(KERN_ERR "nf_nat_core: Unable to register extension\n"); return ret; }nf_ct_free_hashtable(nf_nat_bysource, nf_nat_htable_size);
@@ -876,7 +855,7 @@ static int __init nf_nat_init(void) return 0; cleanup_extend:
- rhltable_destroy(&nf_nat_bysource_table);
- nf_ct_free_hashtable(nf_nat_bysource, nf_nat_htable_size); nf_ct_extend_unregister(&nat_extend); return ret; }
@@ -896,8 +875,8 @@ static void __exit nf_nat_cleanup(void) for (i = 0; i < NFPROTO_NUMPROTO; i++) kfree(nf_nat_l4protos[i]);
- rhltable_destroy(&nf_nat_bysource_table);
- synchronize_net();
- nf_ct_free_hashtable(nf_nat_bysource, nf_nat_htable_size); }
MODULE_LICENSE("GPL");
Sebastian Gottschall s.gottschall@dd-wrt.com wrote:
this should belong to you. with the revert the kernel crashes for me
[ 24.838120] Unable to handle kernel NULL pointer dereference at virtual address 00000055 [ 24.846193] pgd = c0004000 [ 24.848893] [00000055] *pgd=00000000 [ 24.852472] Internal error: Oops - BUG: 817 [#1] PREEMPT SMP ARM [ 24.858463] Modules linked in: xhci_plat_hcd xhci_pci xhci_hcd ohci_hcd ehci_pci ehci_platform ehci_hcd usbcore usb_common nls_base qca_ssdk gpio_pca953x mii_gpio wil6210 ath10k_pci ath10k_core ath9k ath9k_common ath9k_hw ath mac80211 cfg80211 compat [ 24.880852] CPU: 2 PID: 0 Comm: swapper/2 Not tainted 4.9.62-rc1 #90 [ 24.887189] Hardware name: AnnapurnaLabs Alpine (Device Tree) [ 24.892921] task: ef029ac0 task.stack: ef05a000 [ 24.897444] PC is at nf_nat_cleanup_conntrack+0x4c/0x74 [ 24.902657] LR is at nf_nat_cleanup_conntrack+0x38/0x74 [ 24.907869] pc : [<c04858c8>] lr : [<c04858b4>] psr: 60000153 [ 24.907869] sp : ef05bb58 ip : ef05bb58 fp : ef05bb6c [ 24.919317] r10: ed230cc0 r9 : ed230c00 r8 : edf45800 [ 24.924529] r7 : ebcd2f00 r6 : ec33739e r5 : c0892294 r4 : ebcd2f00 [ 24.931040] r3 : 00000000 r2 : 00000055 r1 : 00000000 r0 : c0892718 [ 24.937551] Flags: nZCv IRQs on FIQs off Mode SVC_32 ISA ARM Segment user [ 24.944755] Control: 10c5387d Table: 2bd1006a DAC: 00000055 [ 24.950486] Process swapper/2 (pid: 0, stack limit = 0xef05a210) [ 24.956477] Stack: (0xef05bb58 to 0xef05c000)
Guillaume, can you check what I missed? IIRC you said you had a revert for your 4.9 tree, correct?
On Tue, Nov 14, 2017 at 12:05:46PM +0100, Florian Westphal wrote:
Sebastian Gottschall s.gottschall@dd-wrt.com wrote:
this should belong to you. with the revert the kernel crashes for me
[ 24.838120] Unable to handle kernel NULL pointer dereference at virtual address 00000055 [ 24.846193] pgd = c0004000 [ 24.848893] [00000055] *pgd=00000000 [ 24.852472] Internal error: Oops - BUG: 817 [#1] PREEMPT SMP ARM [ 24.858463] Modules linked in: xhci_plat_hcd xhci_pci xhci_hcd ohci_hcd ehci_pci ehci_platform ehci_hcd usbcore usb_common nls_base qca_ssdk gpio_pca953x mii_gpio wil6210 ath10k_pci ath10k_core ath9k ath9k_common ath9k_hw ath mac80211 cfg80211 compat [ 24.880852] CPU: 2 PID: 0 Comm: swapper/2 Not tainted 4.9.62-rc1 #90 [ 24.887189] Hardware name: AnnapurnaLabs Alpine (Device Tree) [ 24.892921] task: ef029ac0 task.stack: ef05a000 [ 24.897444] PC is at nf_nat_cleanup_conntrack+0x4c/0x74 [ 24.902657] LR is at nf_nat_cleanup_conntrack+0x38/0x74 [ 24.907869] pc : [<c04858c8>] lr : [<c04858b4>] psr: 60000153 [ 24.907869] sp : ef05bb58 ip : ef05bb58 fp : ef05bb6c [ 24.919317] r10: ed230cc0 r9 : ed230c00 r8 : edf45800 [ 24.924529] r7 : ebcd2f00 r6 : ec33739e r5 : c0892294 r4 : ebcd2f00 [ 24.931040] r3 : 00000000 r2 : 00000055 r1 : 00000000 r0 : c0892718 [ 24.937551] Flags: nZCv IRQs on FIQs off Mode SVC_32 ISA ARM Segment user [ 24.944755] Control: 10c5387d Table: 2bd1006a DAC: 00000055 [ 24.950486] Process swapper/2 (pid: 0, stack limit = 0xef05a210) [ 24.956477] Stack: (0xef05bb58 to 0xef05c000)
Guillaume, can you check what I missed? IIRC you said you had a revert for your 4.9 tree, correct?
Yes, but my backport was on 4.9.18. IIRC some other nat fixes have been applied in the mean time. I just arrived back home yesterday night. I'm going look at this in the afternoon.
On Tue, Nov 14, 2017 at 12:05:46PM +0100, Florian Westphal wrote:
Sebastian Gottschall s.gottschall@dd-wrt.com wrote:
this should belong to you. with the revert the kernel crashes for me
[ 24.838120] Unable to handle kernel NULL pointer dereference at virtual address 00000055 [ 24.846193] pgd = c0004000 [ 24.848893] [00000055] *pgd=00000000 [ 24.852472] Internal error: Oops - BUG: 817 [#1] PREEMPT SMP ARM [ 24.858463] Modules linked in: xhci_plat_hcd xhci_pci xhci_hcd ohci_hcd ehci_pci ehci_platform ehci_hcd usbcore usb_common nls_base qca_ssdk gpio_pca953x mii_gpio wil6210 ath10k_pci ath10k_core ath9k ath9k_common ath9k_hw ath mac80211 cfg80211 compat [ 24.880852] CPU: 2 PID: 0 Comm: swapper/2 Not tainted 4.9.62-rc1 #90 [ 24.887189] Hardware name: AnnapurnaLabs Alpine (Device Tree) [ 24.892921] task: ef029ac0 task.stack: ef05a000 [ 24.897444] PC is at nf_nat_cleanup_conntrack+0x4c/0x74 [ 24.902657] LR is at nf_nat_cleanup_conntrack+0x38/0x74 [ 24.907869] pc : [<c04858c8>] lr : [<c04858b4>] psr: 60000153 [ 24.907869] sp : ef05bb58 ip : ef05bb58 fp : ef05bb6c [ 24.919317] r10: ed230cc0 r9 : ed230c00 r8 : edf45800 [ 24.924529] r7 : ebcd2f00 r6 : ec33739e r5 : c0892294 r4 : ebcd2f00 [ 24.931040] r3 : 00000000 r2 : 00000055 r1 : 00000000 r0 : c0892718 [ 24.937551] Flags: nZCv IRQs on FIQs off Mode SVC_32 ISA ARM Segment user [ 24.944755] Control: 10c5387d Table: 2bd1006a DAC: 00000055 [ 24.950486] Process swapper/2 (pid: 0, stack limit = 0xef05a210) [ 24.956477] Stack: (0xef05bb58 to 0xef05c000)
Guillaume, can you check what I missed? IIRC you said you had a revert for your 4.9 tree, correct?
Actually I had forgotten some details when I told you about it. What I did is simpler than a revert. I just cherry-picked the following commits: 124dffea9e8e ("netfilter: nat: use atomic bit op to clear the _SRC_NAT_DONE_BIT") 6e699867f84c ("netfilter: nat: avoid use of nf_conn_nat extension") e1bf1687740c ("netfilter: nat: Revert "netfilter: nat: convert nat bysrc hash to rhashtable"")
Then I just had to fix one remaining call: @@ -842,7 +842,7 @@ static int __init nf_nat_init(void) return 0;
cleanup_extend: - rhltable_destroy(&nf_nat_bysource_table); + nf_ct_free_hashtable(nf_nat_bysource, nf_nat_htable_size); nf_ct_extend_unregister(&nat_extend); return ret; }
My original approach was to revert commit 870190a9ec90 ("netfilter: nat: convert nat bysrc hash to rhashtable") and fix the conflicts by hand. But I had crashes when testing the result. After a bit of debugging, I decided to simply try cherry-picking commit e1bf1687740c and its dependencies. Since I got better results with this approach, I dropped my original revert code (so, unfortunately, I can't compare with yours and I don't remember if the crash was similar to Sebastian's). We have the cherry-picked patches running on several machines since September. No problem found so far.
Now, looking at Sebastian's report, I suspect that using nf_ct_ext_find(NF_CT_EXT_NAT) in nf_nat_cleanup_conntrack() might not be appropriate anymore. My approach uses the IPS_SRC_NAT_DONE bit instead, which is now set atomically. Theses changes weren't intentional, but introduced by the dependency on commits 124dffea9e8e and 6e699867f84c.
Does that make sense to you? I can look more deeply into this tomorrow.
Am 14.11.2017 um 20:30 schrieb Guillaume Nault:
On Tue, Nov 14, 2017 at 12:05:46PM +0100, Florian Westphal wrote:
Sebastian Gottschall s.gottschall@dd-wrt.com wrote:
this should belong to you. with the revert the kernel crashes for me
[ 24.838120] Unable to handle kernel NULL pointer dereference at virtual address 00000055 [ 24.846193] pgd = c0004000 [ 24.848893] [00000055] *pgd=00000000 [ 24.852472] Internal error: Oops - BUG: 817 [#1] PREEMPT SMP ARM [ 24.858463] Modules linked in: xhci_plat_hcd xhci_pci xhci_hcd ohci_hcd ehci_pci ehci_platform ehci_hcd usbcore usb_common nls_base qca_ssdk gpio_pca953x mii_gpio wil6210 ath10k_pci ath10k_core ath9k ath9k_common ath9k_hw ath mac80211 cfg80211 compat [ 24.880852] CPU: 2 PID: 0 Comm: swapper/2 Not tainted 4.9.62-rc1 #90 [ 24.887189] Hardware name: AnnapurnaLabs Alpine (Device Tree) [ 24.892921] task: ef029ac0 task.stack: ef05a000 [ 24.897444] PC is at nf_nat_cleanup_conntrack+0x4c/0x74 [ 24.902657] LR is at nf_nat_cleanup_conntrack+0x38/0x74 [ 24.907869] pc : [<c04858c8>] lr : [<c04858b4>] psr: 60000153 [ 24.907869] sp : ef05bb58 ip : ef05bb58 fp : ef05bb6c [ 24.919317] r10: ed230cc0 r9 : ed230c00 r8 : edf45800 [ 24.924529] r7 : ebcd2f00 r6 : ec33739e r5 : c0892294 r4 : ebcd2f00 [ 24.931040] r3 : 00000000 r2 : 00000055 r1 : 00000000 r0 : c0892718 [ 24.937551] Flags: nZCv IRQs on FIQs off Mode SVC_32 ISA ARM Segment user [ 24.944755] Control: 10c5387d Table: 2bd1006a DAC: 00000055 [ 24.950486] Process swapper/2 (pid: 0, stack limit = 0xef05a210) [ 24.956477] Stack: (0xef05bb58 to 0xef05c000)
Guillaume, can you check what I missed? IIRC you said you had a revert for your 4.9 tree, correct?
Actually I had forgotten some details when I told you about it. What I did is simpler than a revert. I just cherry-picked the following commits: 124dffea9e8e ("netfilter: nat: use atomic bit op to clear the _SRC_NAT_DONE_BIT") 6e699867f84c ("netfilter: nat: avoid use of nf_conn_nat extension") e1bf1687740c ("netfilter: nat: Revert "netfilter: nat: convert nat bysrc hash to rhashtable"")
Then I just had to fix one remaining call: @@ -842,7 +842,7 @@ static int __init nf_nat_init(void) return 0; cleanup_extend:
rhltable_destroy(&nf_nat_bysource_table);
}nf_ct_free_hashtable(nf_nat_bysource, nf_nat_htable_size); nf_ct_extend_unregister(&nat_extend); return ret;
My original approach was to revert commit 870190a9ec90 ("netfilter: nat: convert nat bysrc hash to rhashtable") and fix the conflicts by hand. But I had crashes when testing the result. After a bit of debugging, I decided to simply try cherry-picking commit e1bf1687740c and its dependencies. Since I got better results with this approach, I dropped my original revert code (so, unfortunately, I can't compare with yours and I don't remember if the crash was similar to Sebastian's). We have the cherry-picked patches running on several machines since September. No problem found so far.
Now, looking at Sebastian's report, I suspect that using nf_ct_ext_find(NF_CT_EXT_NAT) in nf_nat_cleanup_conntrack() might not be appropriate anymore. My approach uses the IPS_SRC_NAT_DONE bit instead, which is now set atomically. Theses changes weren't intentional, but introduced by the dependency on commits 124dffea9e8e and 6e699867f84c.
Does that make sense to you? I can look more deeply into this tomorrow.
i love crashing my devices. let me test this approach
Sebastian
the following patch based on the suggestion by Guillaume works good in my tests and does not crash anymore
--- nf_nat_core.c (Revision 33766) +++ nf_nat_core.c (Arbeitskopie) @@ -678,16 +678,11 @@ /* No one using conntrack by the time this called. */ static void nf_nat_cleanup_conntrack(struct nf_conn *ct) { - struct nf_conn_nat *nat = nf_ct_ext_find(ct, NF_CT_EXT_NAT); - - if (!nat) - return; - - NF_CT_ASSERT(ct->status & IPS_SRC_NAT_DONE); - - spin_lock_bh(&nf_nat_lock); - hlist_del_rcu(&ct->nat_bysource); - spin_unlock_bh(&nf_nat_lock); + if (ct->status & IPS_SRC_NAT_DONE) { + spin_lock_bh(&nf_nat_lock); + hlist_del_rcu(&ct->nat_bysource); + spin_unlock_bh(&nf_nat_lock); + } }
static struct nf_ct_ext_type nat_extend __read_mostly = {
Am 14.11.2017 um 23:16 schrieb Sebastian Gottschall:
Am 14.11.2017 um 20:30 schrieb Guillaume Nault:
On Tue, Nov 14, 2017 at 12:05:46PM +0100, Florian Westphal wrote:
Sebastian Gottschall s.gottschall@dd-wrt.com wrote:
this should belong to you. with the revert the kernel crashes for me
[ 24.838120] Unable to handle kernel NULL pointer dereference at virtual address 00000055 [ 24.846193] pgd = c0004000 [ 24.848893] [00000055] *pgd=00000000 [ 24.852472] Internal error: Oops - BUG: 817 [#1] PREEMPT SMP ARM [ 24.858463] Modules linked in: xhci_plat_hcd xhci_pci xhci_hcd ohci_hcd ehci_pci ehci_platform ehci_hcd usbcore usb_common nls_base qca_ssdk gpio_pca953x mii_gpio wil6210 ath10k_pci ath10k_core ath9k ath9k_common ath9k_hw ath mac80211 cfg80211 compat [ 24.880852] CPU: 2 PID: 0 Comm: swapper/2 Not tainted 4.9.62-rc1 #90 [ 24.887189] Hardware name: AnnapurnaLabs Alpine (Device Tree) [ 24.892921] task: ef029ac0 task.stack: ef05a000 [ 24.897444] PC is at nf_nat_cleanup_conntrack+0x4c/0x74 [ 24.902657] LR is at nf_nat_cleanup_conntrack+0x38/0x74 [ 24.907869] pc : [<c04858c8>] lr : [<c04858b4>] psr: 60000153 [ 24.907869] sp : ef05bb58 ip : ef05bb58 fp : ef05bb6c [ 24.919317] r10: ed230cc0 r9 : ed230c00 r8 : edf45800 [ 24.924529] r7 : ebcd2f00 r6 : ec33739e r5 : c0892294 r4 : ebcd2f00 [ 24.931040] r3 : 00000000 r2 : 00000055 r1 : 00000000 r0 : c0892718 [ 24.937551] Flags: nZCv IRQs on FIQs off Mode SVC_32 ISA ARM Segment user [ 24.944755] Control: 10c5387d Table: 2bd1006a DAC: 00000055 [ 24.950486] Process swapper/2 (pid: 0, stack limit = 0xef05a210) [ 24.956477] Stack: (0xef05bb58 to 0xef05c000)
Guillaume, can you check what I missed? IIRC you said you had a revert for your 4.9 tree, correct?
Actually I had forgotten some details when I told you about it. What I did is simpler than a revert. I just cherry-picked the following commits: 124dffea9e8e ("netfilter: nat: use atomic bit op to clear the _SRC_NAT_DONE_BIT") 6e699867f84c ("netfilter: nat: avoid use of nf_conn_nat extension") e1bf1687740c ("netfilter: nat: Revert "netfilter: nat: convert nat bysrc hash to rhashtable"")
Then I just had to fix one remaining call: @@ -842,7 +842,7 @@ static int __init nf_nat_init(void) return 0; cleanup_extend: - rhltable_destroy(&nf_nat_bysource_table); + nf_ct_free_hashtable(nf_nat_bysource, nf_nat_htable_size); nf_ct_extend_unregister(&nat_extend); return ret; }
My original approach was to revert commit 870190a9ec90 ("netfilter: nat: convert nat bysrc hash to rhashtable") and fix the conflicts by hand. But I had crashes when testing the result. After a bit of debugging, I decided to simply try cherry-picking commit e1bf1687740c and its dependencies. Since I got better results with this approach, I dropped my original revert code (so, unfortunately, I can't compare with yours and I don't remember if the crash was similar to Sebastian's). We have the cherry-picked patches running on several machines since September. No problem found so far.
Now, looking at Sebastian's report, I suspect that using nf_ct_ext_find(NF_CT_EXT_NAT) in nf_nat_cleanup_conntrack() might not be appropriate anymore. My approach uses the IPS_SRC_NAT_DONE bit instead, which is now set atomically. Theses changes weren't intentional, but introduced by the dependency on commits 124dffea9e8e and 6e699867f84c.
Does that make sense to you? I can look more deeply into this tomorrow.
i love crashing my devices. let me test this approach
Sebastian
Sebastian Gottschall s.gottschall@dd-wrt.com wrote:
the following patch based on the suggestion by Guillaume works good in my tests and does not crash anymore
--- nf_nat_core.c (Revision 33766) +++ nf_nat_core.c (Arbeitskopie) @@ -678,16 +678,11 @@ /* No one using conntrack by the time this called. */ static void nf_nat_cleanup_conntrack(struct nf_conn *ct) { - struct nf_conn_nat *nat = nf_ct_ext_find(ct, NF_CT_EXT_NAT);
- if (!nat) - return;
- NF_CT_ASSERT(ct->status & IPS_SRC_NAT_DONE);
- spin_lock_bh(&nf_nat_lock); - hlist_del_rcu(&ct->nat_bysource); - spin_unlock_bh(&nf_nat_lock); + if (ct->status & IPS_SRC_NAT_DONE) { + spin_lock_bh(&nf_nat_lock); + hlist_del_rcu(&ct->nat_bysource); + spin_unlock_bh(&nf_nat_lock); + } }
FWIW I can now reproduce the crash on 4.9-stable + backport. This crash can be triggered by a simple
iptables -I INPUT 1 -j DROP
on first (new) incoming packet. Problem is that as not SRC nat transformation is there for inbound connections such conntrack isn't yet on nat_bysource list so hlist_del_rcu() gets garbage input.
bug was added in 7c9664351980aaa6a4b8837a314360b3a4ad382a ("netfilter: move nat hlist_head to nf_conn").
... and then 'masked' by the rhashtable conversion (removing non-existing rhashtable element has no ill effects).
The revert upstream and the 4.13.y one had no such issue because of earlier change 6e699867f84c0f358fed233fe6162173aca28e04 ("netfilter: nat: avoid use of nf_conn_nat extension"), which replaces if (!nat) condition with the if (ct->status & IPS_SRC_NAT_DONE) test quoted above.
I will have an updated patch shortly, I think best solution is to just cherry-pick 6e699867f84c0f358fed233fe6162173aca28e04 in 4.9.y too and update this backport accordingly.
On Wed, Nov 15, 2017 at 11:13:00AM +0100, Florian Westphal wrote:
Sebastian Gottschall s.gottschall@dd-wrt.com wrote:
the following patch based on the suggestion by Guillaume works good in my tests and does not crash anymore
--- nf_nat_core.c (Revision 33766) +++ nf_nat_core.c (Arbeitskopie) @@ -678,16 +678,11 @@ /* No one using conntrack by the time this called. */ static void nf_nat_cleanup_conntrack(struct nf_conn *ct) { - struct nf_conn_nat *nat = nf_ct_ext_find(ct, NF_CT_EXT_NAT);
- if (!nat) - return;
- NF_CT_ASSERT(ct->status & IPS_SRC_NAT_DONE);
- spin_lock_bh(&nf_nat_lock); - hlist_del_rcu(&ct->nat_bysource); - spin_unlock_bh(&nf_nat_lock); + if (ct->status & IPS_SRC_NAT_DONE) { + spin_lock_bh(&nf_nat_lock); + hlist_del_rcu(&ct->nat_bysource); + spin_unlock_bh(&nf_nat_lock); + } }
FWIW I can now reproduce the crash on 4.9-stable + backport. This crash can be triggered by a simple
iptables -I INPUT 1 -j DROP
on first (new) incoming packet. Problem is that as not SRC nat transformation is there for inbound connections such conntrack isn't yet on nat_bysource list so hlist_del_rcu() gets garbage input.
bug was added in 7c9664351980aaa6a4b8837a314360b3a4ad382a ("netfilter: move nat hlist_head to nf_conn").
... and then 'masked' by the rhashtable conversion (removing non-existing rhashtable element has no ill effects).
The revert upstream and the 4.13.y one had no such issue because of earlier change 6e699867f84c0f358fed233fe6162173aca28e04 ("netfilter: nat: avoid use of nf_conn_nat extension"), which replaces if (!nat) condition with the if (ct->status & IPS_SRC_NAT_DONE) test quoted above.
I will have an updated patch shortly, I think best solution is to just cherry-pick 6e699867f84c0f358fed233fe6162173aca28e04 in 4.9.y too and update this backport accordingly.
Ok, let me just drop this patch, get this 4.9 release out, and then can you all send me a patch, or series of patches, that you feel should be applied to 4.9 to resolve this issue?
thanks,
greg k-h
On Wed, Nov 15, 2017 at 03:04:06PM +0100, Greg Kroah-Hartman wrote:
Ok, let me just drop this patch, get this 4.9 release out, and then can you all send me a patch, or series of patches, that you feel should be applied to 4.9 to resolve this issue?
Please, indeed drop it for 4.9. We'll take the time here to sort out things.
Thanks Greg.
On Wed, Nov 15, 2017 at 03:16:48PM +0100, Pablo Neira Ayuso wrote:
On Wed, Nov 15, 2017 at 03:04:06PM +0100, Greg Kroah-Hartman wrote:
Ok, let me just drop this patch, get this 4.9 release out, and then can you all send me a patch, or series of patches, that you feel should be applied to 4.9 to resolve this issue?
Please, indeed drop it for 4.9. We'll take the time here to sort out things.
Now dropped, thanks all.
greg k-h
Am 15.11.2017 um 15:16 schrieb Pablo Neira Ayuso:
On Wed, Nov 15, 2017 at 03:04:06PM +0100, Greg Kroah-Hartman wrote:
Ok, let me just drop this patch, get this 4.9 release out, and then can you all send me a patch, or series of patches, that you feel should be applied to 4.9 to resolve this issue?
Please, indeed drop it for 4.9. We'll take the time here to sort out things.
Thanks Greg.
@florian:
this is the cherry picked variant which applies clean to the current 4.9 series. since i'm not the author, please resubmit it from your side using the correct format
On Wed, Nov 15, 2017 at 11:13:00AM +0100, Florian Westphal wrote:
Sebastian Gottschall s.gottschall@dd-wrt.com wrote:
the following patch based on the suggestion by Guillaume works good in my tests and does not crash anymore
--- nf_nat_core.c (Revision 33766) +++ nf_nat_core.c (Arbeitskopie) @@ -678,16 +678,11 @@ /* No one using conntrack by the time this called. */ static void nf_nat_cleanup_conntrack(struct nf_conn *ct) { - struct nf_conn_nat *nat = nf_ct_ext_find(ct, NF_CT_EXT_NAT);
- if (!nat) - return;
- NF_CT_ASSERT(ct->status & IPS_SRC_NAT_DONE);
- spin_lock_bh(&nf_nat_lock); - hlist_del_rcu(&ct->nat_bysource); - spin_unlock_bh(&nf_nat_lock); + if (ct->status & IPS_SRC_NAT_DONE) { + spin_lock_bh(&nf_nat_lock); + hlist_del_rcu(&ct->nat_bysource); + spin_unlock_bh(&nf_nat_lock); + } }
FWIW I can now reproduce the crash on 4.9-stable + backport. This crash can be triggered by a simple
iptables -I INPUT 1 -j DROP
on first (new) incoming packet. Problem is that as not SRC nat transformation is there for inbound connections such conntrack isn't yet on nat_bysource list so hlist_del_rcu() gets garbage input.
bug was added in 7c9664351980aaa6a4b8837a314360b3a4ad382a ("netfilter: move nat hlist_head to nf_conn").
... and then 'masked' by the rhashtable conversion (removing non-existing rhashtable element has no ill effects).
The revert upstream and the 4.13.y one had no such issue because of earlier change 6e699867f84c0f358fed233fe6162173aca28e04 ("netfilter: nat: avoid use of nf_conn_nat extension"), which replaces if (!nat) condition with the if (ct->status & IPS_SRC_NAT_DONE) test quoted above.
I will have an updated patch shortly, I think best solution is to just cherry-pick 6e699867f84c0f358fed233fe6162173aca28e04 in 4.9.y too and update this backport accordingly.
That's fine. Let's just make sure we get them both patches in together in the same go.
So I think it's better if we just keep back this cherry-pick until we have the necessary backport in place.
OK?
linux-stable-mirror@lists.linaro.org