The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.15.y
git checkout FETCH_HEAD
git cherry-pick -x 311cca40661f428b7aa114fb5af578cfdbe3e8b6
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2023102024-blame-romp-1612@gregkh' --subject-prefix 'PATCH 5.15.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 311cca40661f428b7aa114fb5af578cfdbe3e8b6 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba(a)kernel.org>
Date: Tue, 17 Oct 2023 18:38:13 -0700
Subject: [PATCH] net: fix ifname in netlink ntf during netns move
dev_get_valid_name() overwrites the netdev's name on success.
This makes it hard to use in prepare-commit-like fashion,
where we do validation first, and "commit" to the change
later.
Factor out a helper which lets us save the new name to a buffer.
Use it to fix the problem of notification on netns move having
incorrect name:
5: eth0: <BROADCAST,NOARP> mtu 1500 qdisc noop state DOWN group default
link/ether be:4d:58:f9:d5:40 brd ff:ff:ff:ff:ff:ff
6: eth1: <BROADCAST,NOARP> mtu 1500 qdisc noop state DOWN group default
link/ether 1e:4a:34:36:e3:cd brd ff:ff:ff:ff:ff:ff
[ ~]# ip link set dev eth0 netns 1 name eth1
ip monitor inside netns:
Deleted inet eth0
Deleted inet6 eth0
Deleted 5: eth1: <BROADCAST,NOARP> mtu 1500 qdisc noop state DOWN group default
link/ether be:4d:58:f9:d5:40 brd ff:ff:ff:ff:ff:ff new-netnsid 0 new-ifindex 7
Name is reported as eth1 in old netns for ifindex 5, already renamed.
Fixes: d90310243fd7 ("net: device name allocation cleanups")
Signed-off-by: Jakub Kicinski <kuba(a)kernel.org>
Reviewed-by: Jiri Pirko <jiri(a)nvidia.com>
Signed-off-by: Paolo Abeni <pabeni(a)redhat.com>
diff --git a/net/core/dev.c b/net/core/dev.c
index 5aaf5753d4e4..f109ad34d660 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1123,6 +1123,26 @@ static int __dev_alloc_name(struct net *net, const char *name, char *buf)
return -ENFILE;
}
+static int dev_prep_valid_name(struct net *net, struct net_device *dev,
+ const char *want_name, char *out_name)
+{
+ int ret;
+
+ if (!dev_valid_name(want_name))
+ return -EINVAL;
+
+ if (strchr(want_name, '%')) {
+ ret = __dev_alloc_name(net, want_name, out_name);
+ return ret < 0 ? ret : 0;
+ } else if (netdev_name_in_use(net, want_name)) {
+ return -EEXIST;
+ } else if (out_name != want_name) {
+ strscpy(out_name, want_name, IFNAMSIZ);
+ }
+
+ return 0;
+}
+
static int dev_alloc_name_ns(struct net *net,
struct net_device *dev,
const char *name)
@@ -1160,19 +1180,13 @@ EXPORT_SYMBOL(dev_alloc_name);
static int dev_get_valid_name(struct net *net, struct net_device *dev,
const char *name)
{
- BUG_ON(!net);
-
- if (!dev_valid_name(name))
- return -EINVAL;
-
- if (strchr(name, '%'))
- return dev_alloc_name_ns(net, dev, name);
- else if (netdev_name_in_use(net, name))
- return -EEXIST;
- else if (dev->name != name)
- strscpy(dev->name, name, IFNAMSIZ);
+ char buf[IFNAMSIZ];
+ int ret;
- return 0;
+ ret = dev_prep_valid_name(net, dev, name, buf);
+ if (ret >= 0)
+ strscpy(dev->name, buf, IFNAMSIZ);
+ return ret;
}
/**
@@ -11038,6 +11052,7 @@ int __dev_change_net_namespace(struct net_device *dev, struct net *net,
const char *pat, int new_ifindex)
{
struct net *net_old = dev_net(dev);
+ char new_name[IFNAMSIZ] = {};
int err, new_nsid;
ASSERT_RTNL();
@@ -11064,7 +11079,7 @@ int __dev_change_net_namespace(struct net_device *dev, struct net *net,
/* We get here if we can't use the current device name */
if (!pat)
goto out;
- err = dev_get_valid_name(net, dev, pat);
+ err = dev_prep_valid_name(net, dev, pat, new_name);
if (err < 0)
goto out;
}
@@ -11135,6 +11150,9 @@ int __dev_change_net_namespace(struct net_device *dev, struct net *net,
kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
netdev_adjacent_add_links(dev);
+ if (new_name[0]) /* Rename the netdev to prepared name */
+ strscpy(dev->name, new_name, IFNAMSIZ);
+
/* Fixup kobjects */
err = device_rename(&dev->dev, dev->name);
WARN_ON(err);
The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-4.14.y
git checkout FETCH_HEAD
git cherry-pick -x a13b67c9a015c4e21601ef9aa4ec9c5d972df1b4
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2023102023-poster-opulently-4c96@gregkh' --subject-prefix 'PATCH 4.14.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From a13b67c9a015c4e21601ef9aa4ec9c5d972df1b4 Mon Sep 17 00:00:00 2001
From: Pedro Tammela <pctammela(a)mojatatu.com>
Date: Tue, 17 Oct 2023 11:36:02 -0300
Subject: [PATCH] net/sched: sch_hfsc: upgrade 'rt' to 'sc' when it becomes a
inner curve
Christian Theune says:
I upgraded from 6.1.38 to 6.1.55 this morning and it broke my traffic shaping script,
leaving me with a non-functional uplink on a remote router.
A 'rt' curve cannot be used as a inner curve (parent class), but we were
allowing such configurations since the qdisc was introduced. Such
configurations would trigger a UAF as Budimir explains:
The parent will have vttree_insert() called on it in init_vf(),
but will not have vttree_remove() called on it in update_vf()
because it does not have the HFSC_FSC flag set.
The qdisc always assumes that inner classes have the HFSC_FSC flag set.
This is by design as it doesn't make sense 'qdisc wise' for an 'rt'
curve to be an inner curve.
Budimir's original patch disallows users to add classes with a 'rt'
parent, but this is too strict as it breaks users that have been using
'rt' as a inner class. Another approach, taken by this patch, is to
upgrade the inner 'rt' into a 'sc', warning the user in the process.
It avoids the UAF reported by Budimir while also being more permissive
to bad scripts/users/code using 'rt' as a inner class.
Users checking the `tc class ls [...]` or `tc class get [...]` dumps would
observe the curve change and are potentially breaking with this change.
v1->v2: https://lore.kernel.org/all/20231013151057.2611860-1-pctammela@mojatatu.com/
- Correct 'Fixes' tag and merge with revert (Jakub)
Cc: Christian Theune <ct(a)flyingcircus.io>
Cc: Budimir Markovic <markovicbudimir(a)gmail.com>
Fixes: b3d26c5702c7 ("net/sched: sch_hfsc: Ensure inner classes have fsc curve")
Signed-off-by: Pedro Tammela <pctammela(a)mojatatu.com>
Acked-by: Jamal Hadi Salim <jhs(a)mojatatu.com>
Link: https://lore.kernel.org/r/20231017143602.3191556-1-pctammela@mojatatu.com
Signed-off-by: Jakub Kicinski <kuba(a)kernel.org>
diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c
index 3554085bc2be..880c5f16b29c 100644
--- a/net/sched/sch_hfsc.c
+++ b/net/sched/sch_hfsc.c
@@ -902,6 +902,14 @@ hfsc_change_usc(struct hfsc_class *cl, struct tc_service_curve *usc,
cl->cl_flags |= HFSC_USC;
}
+static void
+hfsc_upgrade_rt(struct hfsc_class *cl)
+{
+ cl->cl_fsc = cl->cl_rsc;
+ rtsc_init(&cl->cl_virtual, &cl->cl_fsc, cl->cl_vt, cl->cl_total);
+ cl->cl_flags |= HFSC_FSC;
+}
+
static const struct nla_policy hfsc_policy[TCA_HFSC_MAX + 1] = {
[TCA_HFSC_RSC] = { .len = sizeof(struct tc_service_curve) },
[TCA_HFSC_FSC] = { .len = sizeof(struct tc_service_curve) },
@@ -1011,10 +1019,6 @@ hfsc_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
if (parent == NULL)
return -ENOENT;
}
- if (!(parent->cl_flags & HFSC_FSC) && parent != &q->root) {
- NL_SET_ERR_MSG(extack, "Invalid parent - parent class must have FSC");
- return -EINVAL;
- }
if (classid == 0 || TC_H_MAJ(classid ^ sch->handle) != 0)
return -EINVAL;
@@ -1065,6 +1069,12 @@ hfsc_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
cl->cf_tree = RB_ROOT;
sch_tree_lock(sch);
+ /* Check if the inner class is a misconfigured 'rt' */
+ if (!(parent->cl_flags & HFSC_FSC) && parent != &q->root) {
+ NL_SET_ERR_MSG(extack,
+ "Forced curve change on parent 'rt' to 'sc'");
+ hfsc_upgrade_rt(parent);
+ }
qdisc_class_hash_insert(&q->clhash, &cl->cl_common);
list_add_tail(&cl->siblings, &parent->children);
if (parent->level == 0)
The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-4.14.y
git checkout FETCH_HEAD
git cherry-pick -x 61b40cefe51af005c72dbdcf975a3d166c6e6406
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2023102042-clinic-pruning-a3ca@gregkh' --subject-prefix 'PATCH 4.14.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 61b40cefe51af005c72dbdcf975a3d166c6e6406 Mon Sep 17 00:00:00 2001
From: Jinjie Ruan <ruanjinjie(a)huawei.com>
Date: Wed, 11 Oct 2023 11:24:19 +0800
Subject: [PATCH] net: dsa: bcm_sf2: Fix possible memory leak in
bcm_sf2_mdio_register()
In bcm_sf2_mdio_register(), the class_find_device() will call get_device()
to increment reference count for priv->master_mii_bus->dev if
of_mdio_find_bus() succeeds. If mdiobus_alloc() or mdiobus_register()
fails, it will call get_device() twice without decrement reference count
for the device. And it is the same if bcm_sf2_mdio_register() succeeds but
fails in bcm_sf2_sw_probe(), or if bcm_sf2_sw_probe() succeeds. If the
reference count has not decremented to zero, the dev related resource will
not be freed.
So remove the get_device() in bcm_sf2_mdio_register(), and call
put_device() if mdiobus_alloc() or mdiobus_register() fails and in
bcm_sf2_mdio_unregister() to solve the issue.
And as Simon suggested, unwind from errors for bcm_sf2_mdio_register() and
just return 0 if it succeeds to make it cleaner.
Fixes: 461cd1b03e32 ("net: dsa: bcm_sf2: Register our slave MDIO bus")
Signed-off-by: Jinjie Ruan <ruanjinjie(a)huawei.com>
Suggested-by: Simon Horman <horms(a)kernel.org>
Reviewed-by: Simon Horman <horms(a)kernel.org>
Reviewed-by: Florian Fainelli <florian.fainelli(a)broadcom.com>
Link: https://lore.kernel.org/r/20231011032419.2423290-1-ruanjinjie@huawei.com
Signed-off-by: Jakub Kicinski <kuba(a)kernel.org>
diff --git a/drivers/net/dsa/bcm_sf2.c b/drivers/net/dsa/bcm_sf2.c
index 72374b066f64..cd1f240c90f3 100644
--- a/drivers/net/dsa/bcm_sf2.c
+++ b/drivers/net/dsa/bcm_sf2.c
@@ -617,17 +617,16 @@ static int bcm_sf2_mdio_register(struct dsa_switch *ds)
dn = of_find_compatible_node(NULL, NULL, "brcm,unimac-mdio");
priv->master_mii_bus = of_mdio_find_bus(dn);
if (!priv->master_mii_bus) {
- of_node_put(dn);
- return -EPROBE_DEFER;
+ err = -EPROBE_DEFER;
+ goto err_of_node_put;
}
- get_device(&priv->master_mii_bus->dev);
priv->master_mii_dn = dn;
priv->slave_mii_bus = mdiobus_alloc();
if (!priv->slave_mii_bus) {
- of_node_put(dn);
- return -ENOMEM;
+ err = -ENOMEM;
+ goto err_put_master_mii_bus_dev;
}
priv->slave_mii_bus->priv = priv;
@@ -684,11 +683,17 @@ static int bcm_sf2_mdio_register(struct dsa_switch *ds)
}
err = mdiobus_register(priv->slave_mii_bus);
- if (err && dn) {
- mdiobus_free(priv->slave_mii_bus);
- of_node_put(dn);
- }
+ if (err && dn)
+ goto err_free_slave_mii_bus;
+ return 0;
+
+err_free_slave_mii_bus:
+ mdiobus_free(priv->slave_mii_bus);
+err_put_master_mii_bus_dev:
+ put_device(&priv->master_mii_bus->dev);
+err_of_node_put:
+ of_node_put(dn);
return err;
}
@@ -696,6 +701,7 @@ static void bcm_sf2_mdio_unregister(struct bcm_sf2_priv *priv)
{
mdiobus_unregister(priv->slave_mii_bus);
mdiobus_free(priv->slave_mii_bus);
+ put_device(&priv->master_mii_bus->dev);
of_node_put(priv->master_mii_dn);
}
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-4.19.y
git checkout FETCH_HEAD
git cherry-pick -x 61b40cefe51af005c72dbdcf975a3d166c6e6406
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2023102041-speculate-spew-1d42@gregkh' --subject-prefix 'PATCH 4.19.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 61b40cefe51af005c72dbdcf975a3d166c6e6406 Mon Sep 17 00:00:00 2001
From: Jinjie Ruan <ruanjinjie(a)huawei.com>
Date: Wed, 11 Oct 2023 11:24:19 +0800
Subject: [PATCH] net: dsa: bcm_sf2: Fix possible memory leak in
bcm_sf2_mdio_register()
In bcm_sf2_mdio_register(), the class_find_device() will call get_device()
to increment reference count for priv->master_mii_bus->dev if
of_mdio_find_bus() succeeds. If mdiobus_alloc() or mdiobus_register()
fails, it will call get_device() twice without decrement reference count
for the device. And it is the same if bcm_sf2_mdio_register() succeeds but
fails in bcm_sf2_sw_probe(), or if bcm_sf2_sw_probe() succeeds. If the
reference count has not decremented to zero, the dev related resource will
not be freed.
So remove the get_device() in bcm_sf2_mdio_register(), and call
put_device() if mdiobus_alloc() or mdiobus_register() fails and in
bcm_sf2_mdio_unregister() to solve the issue.
And as Simon suggested, unwind from errors for bcm_sf2_mdio_register() and
just return 0 if it succeeds to make it cleaner.
Fixes: 461cd1b03e32 ("net: dsa: bcm_sf2: Register our slave MDIO bus")
Signed-off-by: Jinjie Ruan <ruanjinjie(a)huawei.com>
Suggested-by: Simon Horman <horms(a)kernel.org>
Reviewed-by: Simon Horman <horms(a)kernel.org>
Reviewed-by: Florian Fainelli <florian.fainelli(a)broadcom.com>
Link: https://lore.kernel.org/r/20231011032419.2423290-1-ruanjinjie@huawei.com
Signed-off-by: Jakub Kicinski <kuba(a)kernel.org>
diff --git a/drivers/net/dsa/bcm_sf2.c b/drivers/net/dsa/bcm_sf2.c
index 72374b066f64..cd1f240c90f3 100644
--- a/drivers/net/dsa/bcm_sf2.c
+++ b/drivers/net/dsa/bcm_sf2.c
@@ -617,17 +617,16 @@ static int bcm_sf2_mdio_register(struct dsa_switch *ds)
dn = of_find_compatible_node(NULL, NULL, "brcm,unimac-mdio");
priv->master_mii_bus = of_mdio_find_bus(dn);
if (!priv->master_mii_bus) {
- of_node_put(dn);
- return -EPROBE_DEFER;
+ err = -EPROBE_DEFER;
+ goto err_of_node_put;
}
- get_device(&priv->master_mii_bus->dev);
priv->master_mii_dn = dn;
priv->slave_mii_bus = mdiobus_alloc();
if (!priv->slave_mii_bus) {
- of_node_put(dn);
- return -ENOMEM;
+ err = -ENOMEM;
+ goto err_put_master_mii_bus_dev;
}
priv->slave_mii_bus->priv = priv;
@@ -684,11 +683,17 @@ static int bcm_sf2_mdio_register(struct dsa_switch *ds)
}
err = mdiobus_register(priv->slave_mii_bus);
- if (err && dn) {
- mdiobus_free(priv->slave_mii_bus);
- of_node_put(dn);
- }
+ if (err && dn)
+ goto err_free_slave_mii_bus;
+ return 0;
+
+err_free_slave_mii_bus:
+ mdiobus_free(priv->slave_mii_bus);
+err_put_master_mii_bus_dev:
+ put_device(&priv->master_mii_bus->dev);
+err_of_node_put:
+ of_node_put(dn);
return err;
}
@@ -696,6 +701,7 @@ static void bcm_sf2_mdio_unregister(struct bcm_sf2_priv *priv)
{
mdiobus_unregister(priv->slave_mii_bus);
mdiobus_free(priv->slave_mii_bus);
+ put_device(&priv->master_mii_bus->dev);
of_node_put(priv->master_mii_dn);
}
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.4.y
git checkout FETCH_HEAD
git cherry-pick -x 61b40cefe51af005c72dbdcf975a3d166c6e6406
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2023102039-chance-unseen-916d@gregkh' --subject-prefix 'PATCH 5.4.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 61b40cefe51af005c72dbdcf975a3d166c6e6406 Mon Sep 17 00:00:00 2001
From: Jinjie Ruan <ruanjinjie(a)huawei.com>
Date: Wed, 11 Oct 2023 11:24:19 +0800
Subject: [PATCH] net: dsa: bcm_sf2: Fix possible memory leak in
bcm_sf2_mdio_register()
In bcm_sf2_mdio_register(), the class_find_device() will call get_device()
to increment reference count for priv->master_mii_bus->dev if
of_mdio_find_bus() succeeds. If mdiobus_alloc() or mdiobus_register()
fails, it will call get_device() twice without decrement reference count
for the device. And it is the same if bcm_sf2_mdio_register() succeeds but
fails in bcm_sf2_sw_probe(), or if bcm_sf2_sw_probe() succeeds. If the
reference count has not decremented to zero, the dev related resource will
not be freed.
So remove the get_device() in bcm_sf2_mdio_register(), and call
put_device() if mdiobus_alloc() or mdiobus_register() fails and in
bcm_sf2_mdio_unregister() to solve the issue.
And as Simon suggested, unwind from errors for bcm_sf2_mdio_register() and
just return 0 if it succeeds to make it cleaner.
Fixes: 461cd1b03e32 ("net: dsa: bcm_sf2: Register our slave MDIO bus")
Signed-off-by: Jinjie Ruan <ruanjinjie(a)huawei.com>
Suggested-by: Simon Horman <horms(a)kernel.org>
Reviewed-by: Simon Horman <horms(a)kernel.org>
Reviewed-by: Florian Fainelli <florian.fainelli(a)broadcom.com>
Link: https://lore.kernel.org/r/20231011032419.2423290-1-ruanjinjie@huawei.com
Signed-off-by: Jakub Kicinski <kuba(a)kernel.org>
diff --git a/drivers/net/dsa/bcm_sf2.c b/drivers/net/dsa/bcm_sf2.c
index 72374b066f64..cd1f240c90f3 100644
--- a/drivers/net/dsa/bcm_sf2.c
+++ b/drivers/net/dsa/bcm_sf2.c
@@ -617,17 +617,16 @@ static int bcm_sf2_mdio_register(struct dsa_switch *ds)
dn = of_find_compatible_node(NULL, NULL, "brcm,unimac-mdio");
priv->master_mii_bus = of_mdio_find_bus(dn);
if (!priv->master_mii_bus) {
- of_node_put(dn);
- return -EPROBE_DEFER;
+ err = -EPROBE_DEFER;
+ goto err_of_node_put;
}
- get_device(&priv->master_mii_bus->dev);
priv->master_mii_dn = dn;
priv->slave_mii_bus = mdiobus_alloc();
if (!priv->slave_mii_bus) {
- of_node_put(dn);
- return -ENOMEM;
+ err = -ENOMEM;
+ goto err_put_master_mii_bus_dev;
}
priv->slave_mii_bus->priv = priv;
@@ -684,11 +683,17 @@ static int bcm_sf2_mdio_register(struct dsa_switch *ds)
}
err = mdiobus_register(priv->slave_mii_bus);
- if (err && dn) {
- mdiobus_free(priv->slave_mii_bus);
- of_node_put(dn);
- }
+ if (err && dn)
+ goto err_free_slave_mii_bus;
+ return 0;
+
+err_free_slave_mii_bus:
+ mdiobus_free(priv->slave_mii_bus);
+err_put_master_mii_bus_dev:
+ put_device(&priv->master_mii_bus->dev);
+err_of_node_put:
+ of_node_put(dn);
return err;
}
@@ -696,6 +701,7 @@ static void bcm_sf2_mdio_unregister(struct bcm_sf2_priv *priv)
{
mdiobus_unregister(priv->slave_mii_bus);
mdiobus_free(priv->slave_mii_bus);
+ put_device(&priv->master_mii_bus->dev);
of_node_put(priv->master_mii_dn);
}
The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-4.14.y
git checkout FETCH_HEAD
git cherry-pick -x 1c2709cfff1dedbb9591e989e2f001484208d914
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2023102017-pedometer-skier-c67d@gregkh' --subject-prefix 'PATCH 4.14.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 1c2709cfff1dedbb9591e989e2f001484208d914 Mon Sep 17 00:00:00 2001
From: Neal Cardwell <ncardwell(a)google.com>
Date: Sun, 15 Oct 2023 13:47:00 -0400
Subject: [PATCH] tcp: fix excessive TLP and RACK timeouts from HZ rounding
We discovered from packet traces of slow loss recovery on kernels with
the default HZ=250 setting (and min_rtt < 1ms) that after reordering,
when receiving a SACKed sequence range, the RACK reordering timer was
firing after about 16ms rather than the desired value of roughly
min_rtt/4 + 2ms. The problem is largely due to the RACK reorder timer
calculation adding in TCP_TIMEOUT_MIN, which is 2 jiffies. On kernels
with HZ=250, this is 2*4ms = 8ms. The TLP timer calculation has the
exact same issue.
This commit fixes the TLP transmit timer and RACK reordering timer
floor calculation to more closely match the intended 2ms floor even on
kernels with HZ=250. It does this by adding in a new
TCP_TIMEOUT_MIN_US floor of 2000 us and then converting to jiffies,
instead of the current approach of converting to jiffies and then
adding th TCP_TIMEOUT_MIN value of 2 jiffies.
Our testing has verified that on kernels with HZ=1000, as expected,
this does not produce significant changes in behavior, but on kernels
with the default HZ=250 the latency improvement can be large. For
example, our tests show that for HZ=250 kernels at low RTTs this fix
roughly halves the latency for the RACK reorder timer: instead of
mostly firing at 16ms it mostly fires at 8ms.
Suggested-by: Eric Dumazet <edumazet(a)google.com>
Signed-off-by: Neal Cardwell <ncardwell(a)google.com>
Signed-off-by: Yuchung Cheng <ycheng(a)google.com>
Fixes: bb4d991a28cc ("tcp: adjust tail loss probe timeout")
Reviewed-by: Eric Dumazet <edumazet(a)google.com>
Link: https://lore.kernel.org/r/20231015174700.2206872-1-ncardwell.sw@gmail.com
Signed-off-by: Jakub Kicinski <kuba(a)kernel.org>
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 7b1a720691ae..4b03ca7cb8a5 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -141,6 +141,9 @@ void tcp_time_wait(struct sock *sk, int state, int timeo);
#define TCP_RTO_MAX ((unsigned)(120*HZ))
#define TCP_RTO_MIN ((unsigned)(HZ/5))
#define TCP_TIMEOUT_MIN (2U) /* Min timeout for TCP timers in jiffies */
+
+#define TCP_TIMEOUT_MIN_US (2*USEC_PER_MSEC) /* Min TCP timeout in microsecs */
+
#define TCP_TIMEOUT_INIT ((unsigned)(1*HZ)) /* RFC6298 2.1 initial RTO value */
#define TCP_TIMEOUT_FALLBACK ((unsigned)(3*HZ)) /* RFC 1122 initial RTO value, now
* used as a fallback RTO for the
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 9c8c42c280b7..bbd85672fda7 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -2788,7 +2788,7 @@ bool tcp_schedule_loss_probe(struct sock *sk, bool advancing_rto)
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
- u32 timeout, rto_delta_us;
+ u32 timeout, timeout_us, rto_delta_us;
int early_retrans;
/* Don't do any loss probe on a Fast Open connection before 3WHS
@@ -2812,11 +2812,12 @@ bool tcp_schedule_loss_probe(struct sock *sk, bool advancing_rto)
* sample is available then probe after TCP_TIMEOUT_INIT.
*/
if (tp->srtt_us) {
- timeout = usecs_to_jiffies(tp->srtt_us >> 2);
+ timeout_us = tp->srtt_us >> 2;
if (tp->packets_out == 1)
- timeout += TCP_RTO_MIN;
+ timeout_us += tcp_rto_min_us(sk);
else
- timeout += TCP_TIMEOUT_MIN;
+ timeout_us += TCP_TIMEOUT_MIN_US;
+ timeout = usecs_to_jiffies(timeout_us);
} else {
timeout = TCP_TIMEOUT_INIT;
}
diff --git a/net/ipv4/tcp_recovery.c b/net/ipv4/tcp_recovery.c
index acf4869c5d3b..bba10110fbbc 100644
--- a/net/ipv4/tcp_recovery.c
+++ b/net/ipv4/tcp_recovery.c
@@ -104,7 +104,7 @@ bool tcp_rack_mark_lost(struct sock *sk)
tp->rack.advanced = 0;
tcp_rack_detect_loss(sk, &timeout);
if (timeout) {
- timeout = usecs_to_jiffies(timeout) + TCP_TIMEOUT_MIN;
+ timeout = usecs_to_jiffies(timeout + TCP_TIMEOUT_MIN_US);
inet_csk_reset_xmit_timer(sk, ICSK_TIME_REO_TIMEOUT,
timeout, inet_csk(sk)->icsk_rto);
}