This is a note to let you know that I've just added the patch titled
ipv4: igmp: guard against silly MTU values
to the 4.9-stable tree which can be found at:
http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=sum…
The filename of the patch is:
ipv4-igmp-guard-against-silly-mtu-values.patch
and it can be found in the queue-4.9 subdirectory.
If you, or anyone else, feels it should not be added to the stable tree,
please let <stable(a)vger.kernel.org> know about it.
>From foo@baz Sun Dec 31 11:13:15 CET 2017
From: Eric Dumazet <edumazet(a)google.com>
Date: Mon, 11 Dec 2017 07:17:39 -0800
Subject: ipv4: igmp: guard against silly MTU values
From: Eric Dumazet <edumazet(a)google.com>
[ Upstream commit b5476022bbada3764609368f03329ca287528dc8 ]
IPv4 stack reacts to changes to small MTU, by disabling itself under
RTNL.
But there is a window where threads not using RTNL can see a wrong
device mtu. This can lead to surprises, in igmp code where it is
assumed the mtu is suitable.
Fix this by reading device mtu once and checking IPv4 minimal MTU.
This patch adds missing IPV4_MIN_MTU define, to not abuse
ETH_MIN_MTU anymore.
Signed-off-by: Eric Dumazet <edumazet(a)google.com>
Signed-off-by: David S. Miller <davem(a)davemloft.net>
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
---
include/net/ip.h | 2 ++
net/ipv4/devinet.c | 2 +-
net/ipv4/igmp.c | 24 +++++++++++++++---------
net/ipv4/ip_tunnel.c | 4 ++--
4 files changed, 20 insertions(+), 12 deletions(-)
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -33,6 +33,8 @@
#include <net/flow.h>
#include <net/flow_dissector.h>
+#define IPV4_MIN_MTU 68 /* RFC 791 */
+
struct sock;
struct inet_skb_parm {
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -1380,7 +1380,7 @@ skip:
static bool inetdev_valid_mtu(unsigned int mtu)
{
- return mtu >= 68;
+ return mtu >= IPV4_MIN_MTU;
}
static void inetdev_send_gratuitous_arp(struct net_device *dev,
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -404,16 +404,17 @@ static int grec_size(struct ip_mc_list *
}
static struct sk_buff *add_grhead(struct sk_buff *skb, struct ip_mc_list *pmc,
- int type, struct igmpv3_grec **ppgr)
+ int type, struct igmpv3_grec **ppgr, unsigned int mtu)
{
struct net_device *dev = pmc->interface->dev;
struct igmpv3_report *pih;
struct igmpv3_grec *pgr;
- if (!skb)
- skb = igmpv3_newpack(dev, dev->mtu);
- if (!skb)
- return NULL;
+ if (!skb) {
+ skb = igmpv3_newpack(dev, mtu);
+ if (!skb)
+ return NULL;
+ }
pgr = (struct igmpv3_grec *)skb_put(skb, sizeof(struct igmpv3_grec));
pgr->grec_type = type;
pgr->grec_auxwords = 0;
@@ -436,12 +437,17 @@ static struct sk_buff *add_grec(struct s
struct igmpv3_grec *pgr = NULL;
struct ip_sf_list *psf, *psf_next, *psf_prev, **psf_list;
int scount, stotal, first, isquery, truncate;
+ unsigned int mtu;
if (pmc->multiaddr == IGMP_ALL_HOSTS)
return skb;
if (ipv4_is_local_multicast(pmc->multiaddr) && !net->ipv4.sysctl_igmp_llm_reports)
return skb;
+ mtu = READ_ONCE(dev->mtu);
+ if (mtu < IPV4_MIN_MTU)
+ return skb;
+
isquery = type == IGMPV3_MODE_IS_INCLUDE ||
type == IGMPV3_MODE_IS_EXCLUDE;
truncate = type == IGMPV3_MODE_IS_EXCLUDE ||
@@ -462,7 +468,7 @@ static struct sk_buff *add_grec(struct s
AVAILABLE(skb) < grec_size(pmc, type, gdeleted, sdeleted)) {
if (skb)
igmpv3_sendpack(skb);
- skb = igmpv3_newpack(dev, dev->mtu);
+ skb = igmpv3_newpack(dev, mtu);
}
}
first = 1;
@@ -498,12 +504,12 @@ static struct sk_buff *add_grec(struct s
pgr->grec_nsrcs = htons(scount);
if (skb)
igmpv3_sendpack(skb);
- skb = igmpv3_newpack(dev, dev->mtu);
+ skb = igmpv3_newpack(dev, mtu);
first = 1;
scount = 0;
}
if (first) {
- skb = add_grhead(skb, pmc, type, &pgr);
+ skb = add_grhead(skb, pmc, type, &pgr, mtu);
first = 0;
}
if (!skb)
@@ -538,7 +544,7 @@ empty_source:
igmpv3_sendpack(skb);
skb = NULL; /* add_grhead will get a new one */
}
- skb = add_grhead(skb, pmc, type, &pgr);
+ skb = add_grhead(skb, pmc, type, &pgr, mtu);
}
}
if (pgr)
--- a/net/ipv4/ip_tunnel.c
+++ b/net/ipv4/ip_tunnel.c
@@ -346,8 +346,8 @@ static int ip_tunnel_bind_dev(struct net
dev->needed_headroom = t_hlen + hlen;
mtu -= (dev->hard_header_len + t_hlen);
- if (mtu < 68)
- mtu = 68;
+ if (mtu < IPV4_MIN_MTU)
+ mtu = IPV4_MIN_MTU;
return mtu;
}
Patches currently in stable-queue which might be from edumazet(a)google.com are
queue-4.9/net-fix-double-free-and-memory-corruption-in-get_net_ns_by_id.patch
queue-4.9/sock-free-skb-in-skb_complete_tx_timestamp-on-error.patch
queue-4.9/tcp-invalidate-rate-samples-during-sack-reneging.patch
queue-4.9/tcp-md5sig-use-skb-s-saddr-when-replying-to-an-incoming-segment.patch
queue-4.9/net-ipv4-fix-for-a-race-condition-in-raw_sendmsg.patch
queue-4.9/ipv4-igmp-guard-against-silly-mtu-values.patch
queue-4.9/ipv6-mcast-better-catch-silly-mtu-values.patch
This is a note to let you know that I've just added the patch titled
ipv4: Fix use-after-free when flushing FIB tables
to the 4.9-stable tree which can be found at:
http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=sum…
The filename of the patch is:
ipv4-fix-use-after-free-when-flushing-fib-tables.patch
and it can be found in the queue-4.9 subdirectory.
If you, or anyone else, feels it should not be added to the stable tree,
please let <stable(a)vger.kernel.org> know about it.
>From foo@baz Sun Dec 31 11:13:15 CET 2017
From: Ido Schimmel <idosch(a)mellanox.com>
Date: Wed, 20 Dec 2017 19:34:19 +0200
Subject: ipv4: Fix use-after-free when flushing FIB tables
From: Ido Schimmel <idosch(a)mellanox.com>
[ Upstream commit b4681c2829e24943aadd1a7bb3a30d41d0a20050 ]
Since commit 0ddcf43d5d4a ("ipv4: FIB Local/MAIN table collapse") the
local table uses the same trie allocated for the main table when custom
rules are not in use.
When a net namespace is dismantled, the main table is flushed and freed
(via an RCU callback) before the local table. In case the callback is
invoked before the local table is iterated, a use-after-free can occur.
Fix this by iterating over the FIB tables in reverse order, so that the
main table is always freed after the local table.
v3: Reworded comment according to Alex's suggestion.
v2: Add a comment to make the fix more explicit per Dave's and Alex's
feedback.
Fixes: 0ddcf43d5d4a ("ipv4: FIB Local/MAIN table collapse")
Signed-off-by: Ido Schimmel <idosch(a)mellanox.com>
Reported-by: Fengguang Wu <fengguang.wu(a)intel.com>
Acked-by: Alexander Duyck <alexander.h.duyck(a)intel.com>
Signed-off-by: David S. Miller <davem(a)davemloft.net>
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
---
net/ipv4/fib_frontend.c | 9 +++++++--
1 file changed, 7 insertions(+), 2 deletions(-)
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -1253,14 +1253,19 @@ fail:
static void ip_fib_net_exit(struct net *net)
{
- unsigned int i;
+ int i;
rtnl_lock();
#ifdef CONFIG_IP_MULTIPLE_TABLES
RCU_INIT_POINTER(net->ipv4.fib_main, NULL);
RCU_INIT_POINTER(net->ipv4.fib_default, NULL);
#endif
- for (i = 0; i < FIB_TABLE_HASHSZ; i++) {
+ /* Destroy the tables in reverse order to guarantee that the
+ * local table, ID 255, is destroyed before the main table, ID
+ * 254. This is necessary as the local table may contain
+ * references to data contained in the main table.
+ */
+ for (i = FIB_TABLE_HASHSZ - 1; i >= 0; i--) {
struct hlist_head *head = &net->ipv4.fib_table_hash[i];
struct hlist_node *tmp;
struct fib_table *tb;
Patches currently in stable-queue which might be from idosch(a)mellanox.com are
queue-4.9/ipv4-fix-use-after-free-when-flushing-fib-tables.patch
This is a note to let you know that I've just added the patch titled
adding missing rcu_read_unlock in ipxip6_rcv
to the 4.9-stable tree which can be found at:
http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=sum…
The filename of the patch is:
adding-missing-rcu_read_unlock-in-ipxip6_rcv.patch
and it can be found in the queue-4.9 subdirectory.
If you, or anyone else, feels it should not be added to the stable tree,
please let <stable(a)vger.kernel.org> know about it.
>From foo@baz Sun Dec 31 11:13:15 CET 2017
From: "Nikita V. Shirokov" <tehnerd(a)fb.com>
Date: Wed, 6 Dec 2017 17:15:43 -0800
Subject: adding missing rcu_read_unlock in ipxip6_rcv
From: "Nikita V. Shirokov" <tehnerd(a)fb.com>
[ Upstream commit 74c4b656c3d92ec4c824ea1a4afd726b7b6568c8 ]
commit 8d79266bc48c ("ip6_tunnel: add collect_md mode to IPv6 tunnels")
introduced new exit point in ipxip6_rcv. however rcu_read_unlock is
missing there. this diff is fixing this
v1->v2:
instead of doing rcu_read_unlock in place, we are going to "drop"
section (to prevent skb leakage)
Fixes: 8d79266bc48c ("ip6_tunnel: add collect_md mode to IPv6 tunnels")
Signed-off-by: Nikita V. Shirokov <tehnerd(a)fb.com>
Acked-by: Alexei Starovoitov <ast(a)kernel.org>
Signed-off-by: David S. Miller <davem(a)davemloft.net>
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
---
net/ipv6/ip6_tunnel.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -911,7 +911,7 @@ static int ipxip6_rcv(struct sk_buff *sk
if (t->parms.collect_md) {
tun_dst = ipv6_tun_rx_dst(skb, 0, 0, 0);
if (!tun_dst)
- return 0;
+ goto drop;
}
ret = __ip6_tnl_rcv(t, skb, tpi, tun_dst, dscp_ecn_decapsulate,
log_ecn_error);
Patches currently in stable-queue which might be from tehnerd(a)fb.com are
queue-4.9/adding-missing-rcu_read_unlock-in-ipxip6_rcv.patch
This is a note to let you know that I've just added the patch titled
vxlan: restore dev->mtu setting based on lower device
to the 4.14-stable tree which can be found at:
http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=sum…
The filename of the patch is:
vxlan-restore-dev-mtu-setting-based-on-lower-device.patch
and it can be found in the queue-4.14 subdirectory.
If you, or anyone else, feels it should not be added to the stable tree,
please let <stable(a)vger.kernel.org> know about it.
>From foo@baz Sun Dec 31 11:12:48 CET 2017
From: Alexey Kodanev <alexey.kodanev(a)oracle.com>
Date: Thu, 14 Dec 2017 20:20:00 +0300
Subject: vxlan: restore dev->mtu setting based on lower device
From: Alexey Kodanev <alexey.kodanev(a)oracle.com>
[ Upstream commit f870c1ff65a6d1f3a083f277280802ee09a5b44d ]
Stefano Brivio says:
Commit a985343ba906 ("vxlan: refactor verification and
application of configuration") introduced a change in the
behaviour of initial MTU setting: earlier, the MTU for a link
created on top of a given lower device, without an initial MTU
specification, was set to the MTU of the lower device minus
headroom as a result of this path in vxlan_dev_configure():
if (!conf->mtu)
dev->mtu = lowerdev->mtu -
(use_ipv6 ? VXLAN6_HEADROOM : VXLAN_HEADROOM);
which is now gone. Now, the initial MTU, in absence of a
configured value, is simply set by ether_setup() to ETH_DATA_LEN
(1500 bytes).
This breaks userspace expectations in case the MTU of
the lower device is higher than 1500 bytes minus headroom.
This patch restores the previous behaviour on newlink operation. Since
max_mtu can be negative and we update dev->mtu directly, also check it
for valid minimum.
Reported-by: Junhan Yan <juyan(a)redhat.com>
Fixes: a985343ba906 ("vxlan: refactor verification and application of configuration")
Signed-off-by: Alexey Kodanev <alexey.kodanev(a)oracle.com>
Acked-by: Stefano Brivio <sbrivio(a)redhat.com>
Signed-off-by: Stefano Brivio <sbrivio(a)redhat.com>
Signed-off-by: David S. Miller <davem(a)davemloft.net>
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
---
drivers/net/vxlan.c | 5 +++++
1 file changed, 5 insertions(+)
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -3105,6 +3105,11 @@ static void vxlan_config_apply(struct ne
max_mtu = lowerdev->mtu - (use_ipv6 ? VXLAN6_HEADROOM :
VXLAN_HEADROOM);
+ if (max_mtu < ETH_MIN_MTU)
+ max_mtu = ETH_MIN_MTU;
+
+ if (!changelink && !conf->mtu)
+ dev->mtu = max_mtu;
}
if (dev->mtu > max_mtu)
Patches currently in stable-queue which might be from alexey.kodanev(a)oracle.com are
queue-4.14/vxlan-restore-dev-mtu-setting-based-on-lower-device.patch
queue-4.14/ip6_gre-fix-device-features-for-ioctl-setup.patch
This is a note to let you know that I've just added the patch titled
tipc: fix hanging poll() for stream sockets
to the 4.14-stable tree which can be found at:
http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=sum…
The filename of the patch is:
tipc-fix-hanging-poll-for-stream-sockets.patch
and it can be found in the queue-4.14 subdirectory.
If you, or anyone else, feels it should not be added to the stable tree,
please let <stable(a)vger.kernel.org> know about it.
>From foo@baz Sun Dec 31 11:12:48 CET 2017
From: Parthasarathy Bhuvaragan <parthasarathy.bhuvaragan(a)gmail.com>
Date: Thu, 28 Dec 2017 12:03:06 +0100
Subject: tipc: fix hanging poll() for stream sockets
From: Parthasarathy Bhuvaragan <parthasarathy.bhuvaragan(a)gmail.com>
[ Upstream commit 517d7c79bdb39864e617960504bdc1aa560c75c6 ]
In commit 42b531de17d2f6 ("tipc: Fix missing connection request
handling"), we replaced unconditional wakeup() with condtional
wakeup for clients with flags POLLIN | POLLRDNORM | POLLRDBAND.
This breaks the applications which do a connect followed by poll
with POLLOUT flag. These applications are not woken when the
connection is ESTABLISHED and hence sleep forever.
In this commit, we fix it by including the POLLOUT event for
sockets in TIPC_CONNECTING state.
Fixes: 42b531de17d2f6 ("tipc: Fix missing connection request handling")
Acked-by: Jon Maloy <jon.maloy(a)ericsson.com>
Signed-off-by: Parthasarathy Bhuvaragan <parthasarathy.bhuvaragan(a)gmail.com>
Signed-off-by: David S. Miller <davem(a)davemloft.net>
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
---
net/tipc/socket.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
--- a/net/tipc/socket.c
+++ b/net/tipc/socket.c
@@ -709,11 +709,11 @@ static unsigned int tipc_poll(struct fil
switch (sk->sk_state) {
case TIPC_ESTABLISHED:
+ case TIPC_CONNECTING:
if (!tsk->cong_link_cnt && !tsk_conn_cong(tsk))
mask |= POLLOUT;
/* fall thru' */
case TIPC_LISTEN:
- case TIPC_CONNECTING:
if (!skb_queue_empty(&sk->sk_receive_queue))
mask |= (POLLIN | POLLRDNORM);
break;
Patches currently in stable-queue which might be from parthasarathy.bhuvaragan(a)gmail.com are
queue-4.14/tipc-fix-hanging-poll-for-stream-sockets.patch
This is a note to let you know that I've just added the patch titled
tg3: Fix rx hang on MTU change with 5717/5719
to the 4.14-stable tree which can be found at:
http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=sum…
The filename of the patch is:
tg3-fix-rx-hang-on-mtu-change-with-5717-5719.patch
and it can be found in the queue-4.14 subdirectory.
If you, or anyone else, feels it should not be added to the stable tree,
please let <stable(a)vger.kernel.org> know about it.
>From foo@baz Sun Dec 31 11:12:48 CET 2017
From: Brian King <brking(a)linux.vnet.ibm.com>
Date: Fri, 15 Dec 2017 15:21:50 -0600
Subject: tg3: Fix rx hang on MTU change with 5717/5719
From: Brian King <brking(a)linux.vnet.ibm.com>
[ Upstream commit 748a240c589824e9121befb1cba5341c319885bc ]
This fixes a hang issue seen when changing the MTU size from 1500 MTU
to 9000 MTU on both 5717 and 5719 chips. In discussion with Broadcom,
they've indicated that these chipsets have the same phy as the 57766
chipset, so the same workarounds apply. This has been tested by IBM
on both Power 8 and Power 9 systems as well as by Broadcom on x86
hardware and has been confirmed to resolve the hang issue.
Signed-off-by: Brian King <brking(a)linux.vnet.ibm.com>
Signed-off-by: David S. Miller <davem(a)davemloft.net>
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
---
drivers/net/ethernet/broadcom/tg3.c | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
--- a/drivers/net/ethernet/broadcom/tg3.c
+++ b/drivers/net/ethernet/broadcom/tg3.c
@@ -14227,7 +14227,9 @@ static int tg3_change_mtu(struct net_dev
/* Reset PHY, otherwise the read DMA engine will be in a mode that
* breaks all requests to 256 bytes.
*/
- if (tg3_asic_rev(tp) == ASIC_REV_57766)
+ if (tg3_asic_rev(tp) == ASIC_REV_57766 ||
+ tg3_asic_rev(tp) == ASIC_REV_5717 ||
+ tg3_asic_rev(tp) == ASIC_REV_5719)
reset_phy = true;
err = tg3_restart_hw(tp, reset_phy);
Patches currently in stable-queue which might be from brking(a)linux.vnet.ibm.com are
queue-4.14/tg3-fix-rx-hang-on-mtu-change-with-5717-5719.patch
This is a note to let you know that I've just added the patch titled
tcp_bbr: reset long-term bandwidth sampling on loss recovery undo
to the 4.14-stable tree which can be found at:
http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=sum…
The filename of the patch is:
tcp_bbr-reset-long-term-bandwidth-sampling-on-loss-recovery-undo.patch
and it can be found in the queue-4.14 subdirectory.
If you, or anyone else, feels it should not be added to the stable tree,
please let <stable(a)vger.kernel.org> know about it.
>From foo@baz Sun Dec 31 11:12:48 CET 2017
From: Neal Cardwell <ncardwell(a)google.com>
Date: Thu, 7 Dec 2017 12:43:32 -0500
Subject: tcp_bbr: reset long-term bandwidth sampling on loss recovery undo
From: Neal Cardwell <ncardwell(a)google.com>
[ Upstream commit 600647d467c6d04b3954b41a6ee1795b5ae00550 ]
Fix BBR so that upon notification of a loss recovery undo BBR resets
long-term bandwidth sampling.
Under high reordering, reordering events can be interpreted as loss.
If the reordering and spurious loss estimates are high enough, this
can cause BBR to spuriously estimate that we are seeing loss rates
high enough to trigger long-term bandwidth estimation. To avoid that
problem, this commit resets long-term bandwidth sampling on loss
recovery undo events.
Signed-off-by: Neal Cardwell <ncardwell(a)google.com>
Reviewed-by: Yuchung Cheng <ycheng(a)google.com>
Acked-by: Soheil Hassas Yeganeh <soheil(a)google.com>
Signed-off-by: David S. Miller <davem(a)davemloft.net>
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
---
net/ipv4/tcp_bbr.c | 1 +
1 file changed, 1 insertion(+)
--- a/net/ipv4/tcp_bbr.c
+++ b/net/ipv4/tcp_bbr.c
@@ -878,6 +878,7 @@ static u32 bbr_undo_cwnd(struct sock *sk
bbr->full_bw = 0; /* spurious slow-down; reset full pipe detection */
bbr->full_bw_cnt = 0;
+ bbr_reset_lt_bw_sampling(sk);
return tcp_sk(sk)->snd_cwnd;
}
Patches currently in stable-queue which might be from ncardwell(a)google.com are
queue-4.14/tcp-refresh-tcp_mstamp-from-timers-callbacks.patch
queue-4.14/tcp_bbr-reset-full-pipe-detection-on-loss-recovery-undo.patch
queue-4.14/tcp_bbr-reset-long-term-bandwidth-sampling-on-loss-recovery-undo.patch
queue-4.14/tcp-invalidate-rate-samples-during-sack-reneging.patch
queue-4.14/tcp-fix-potential-underestimation-on-rcv_rtt.patch
queue-4.14/tcp_bbr-record-full-bw-reached-decision-in-new-full_bw_reached-bit.patch
This is a note to let you know that I've just added the patch titled
tcp_bbr: reset full pipe detection on loss recovery undo
to the 4.14-stable tree which can be found at:
http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=sum…
The filename of the patch is:
tcp_bbr-reset-full-pipe-detection-on-loss-recovery-undo.patch
and it can be found in the queue-4.14 subdirectory.
If you, or anyone else, feels it should not be added to the stable tree,
please let <stable(a)vger.kernel.org> know about it.
>From foo@baz Sun Dec 31 11:12:48 CET 2017
From: Neal Cardwell <ncardwell(a)google.com>
Date: Thu, 7 Dec 2017 12:43:31 -0500
Subject: tcp_bbr: reset full pipe detection on loss recovery undo
From: Neal Cardwell <ncardwell(a)google.com>
[ Upstream commit 2f6c498e4f15d27852c04ed46d804a39137ba364 ]
Fix BBR so that upon notification of a loss recovery undo BBR resets
the full pipe detection (STARTUP exit) state machine.
Under high reordering, reordering events can be interpreted as loss.
If the reordering and spurious loss estimates are high enough, this
could previously cause BBR to spuriously estimate that the pipe is
full.
Since spurious loss recovery means that our overall sending will have
slowed down spuriously, this commit gives a flow more time to probe
robustly for bandwidth and decide the pipe is really full.
Signed-off-by: Neal Cardwell <ncardwell(a)google.com>
Reviewed-by: Yuchung Cheng <ycheng(a)google.com>
Acked-by: Soheil Hassas Yeganeh <soheil(a)google.com>
Signed-off-by: David S. Miller <davem(a)davemloft.net>
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
---
net/ipv4/tcp_bbr.c | 4 ++++
1 file changed, 4 insertions(+)
--- a/net/ipv4/tcp_bbr.c
+++ b/net/ipv4/tcp_bbr.c
@@ -874,6 +874,10 @@ static u32 bbr_sndbuf_expand(struct sock
*/
static u32 bbr_undo_cwnd(struct sock *sk)
{
+ struct bbr *bbr = inet_csk_ca(sk);
+
+ bbr->full_bw = 0; /* spurious slow-down; reset full pipe detection */
+ bbr->full_bw_cnt = 0;
return tcp_sk(sk)->snd_cwnd;
}
Patches currently in stable-queue which might be from ncardwell(a)google.com are
queue-4.14/tcp-refresh-tcp_mstamp-from-timers-callbacks.patch
queue-4.14/tcp_bbr-reset-full-pipe-detection-on-loss-recovery-undo.patch
queue-4.14/tcp_bbr-reset-long-term-bandwidth-sampling-on-loss-recovery-undo.patch
queue-4.14/tcp-invalidate-rate-samples-during-sack-reneging.patch
queue-4.14/tcp-fix-potential-underestimation-on-rcv_rtt.patch
queue-4.14/tcp_bbr-record-full-bw-reached-decision-in-new-full_bw_reached-bit.patch
This is a note to let you know that I've just added the patch titled
tcp_bbr: record "full bw reached" decision in new full_bw_reached bit
to the 4.14-stable tree which can be found at:
http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=sum…
The filename of the patch is:
tcp_bbr-record-full-bw-reached-decision-in-new-full_bw_reached-bit.patch
and it can be found in the queue-4.14 subdirectory.
If you, or anyone else, feels it should not be added to the stable tree,
please let <stable(a)vger.kernel.org> know about it.
>From foo@baz Sun Dec 31 11:12:48 CET 2017
From: Neal Cardwell <ncardwell(a)google.com>
Date: Thu, 7 Dec 2017 12:43:30 -0500
Subject: tcp_bbr: record "full bw reached" decision in new full_bw_reached bit
From: Neal Cardwell <ncardwell(a)google.com>
[ Upstream commit c589e69b508d29ed8e644dfecda453f71c02ec27 ]
This commit records the "full bw reached" decision in a new
full_bw_reached bit. This is a pure refactor that does not change the
current behavior, but enables subsequent fixes and improvements.
In particular, this enables simple and clean fixes because the full_bw
and full_bw_cnt can be unconditionally zeroed without worrying about
forgetting that we estimated we filled the pipe in Startup. And it
enables future improvements because multiple code paths can be used
for estimating that we filled the pipe in Startup; any new code paths
only need to set this bit when they think the pipe is full.
Note that this fix intentionally reduces the width of the full_bw_cnt
counter, since we have never used the most significant bit.
Signed-off-by: Neal Cardwell <ncardwell(a)google.com>
Reviewed-by: Yuchung Cheng <ycheng(a)google.com>
Acked-by: Soheil Hassas Yeganeh <soheil(a)google.com>
Signed-off-by: David S. Miller <davem(a)davemloft.net>
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
---
net/ipv4/tcp_bbr.c | 7 +++++--
1 file changed, 5 insertions(+), 2 deletions(-)
--- a/net/ipv4/tcp_bbr.c
+++ b/net/ipv4/tcp_bbr.c
@@ -110,7 +110,8 @@ struct bbr {
u32 lt_last_lost; /* LT intvl start: tp->lost */
u32 pacing_gain:10, /* current gain for setting pacing rate */
cwnd_gain:10, /* current gain for setting cwnd */
- full_bw_cnt:3, /* number of rounds without large bw gains */
+ full_bw_reached:1, /* reached full bw in Startup? */
+ full_bw_cnt:2, /* number of rounds without large bw gains */
cycle_idx:3, /* current index in pacing_gain cycle array */
has_seen_rtt:1, /* have we seen an RTT sample yet? */
unused_b:5;
@@ -180,7 +181,7 @@ static bool bbr_full_bw_reached(const st
{
const struct bbr *bbr = inet_csk_ca(sk);
- return bbr->full_bw_cnt >= bbr_full_bw_cnt;
+ return bbr->full_bw_reached;
}
/* Return the windowed max recent bandwidth sample, in pkts/uS << BW_SCALE. */
@@ -717,6 +718,7 @@ static void bbr_check_full_bw_reached(st
return;
}
++bbr->full_bw_cnt;
+ bbr->full_bw_reached = bbr->full_bw_cnt >= bbr_full_bw_cnt;
}
/* If pipe is probably full, drain the queue and then enter steady-state. */
@@ -850,6 +852,7 @@ static void bbr_init(struct sock *sk)
bbr->restore_cwnd = 0;
bbr->round_start = 0;
bbr->idle_restart = 0;
+ bbr->full_bw_reached = 0;
bbr->full_bw = 0;
bbr->full_bw_cnt = 0;
bbr->cycle_mstamp = 0;
Patches currently in stable-queue which might be from ncardwell(a)google.com are
queue-4.14/tcp-refresh-tcp_mstamp-from-timers-callbacks.patch
queue-4.14/tcp_bbr-reset-full-pipe-detection-on-loss-recovery-undo.patch
queue-4.14/tcp_bbr-reset-long-term-bandwidth-sampling-on-loss-recovery-undo.patch
queue-4.14/tcp-invalidate-rate-samples-during-sack-reneging.patch
queue-4.14/tcp-fix-potential-underestimation-on-rcv_rtt.patch
queue-4.14/tcp_bbr-record-full-bw-reached-decision-in-new-full_bw_reached-bit.patch