The patch below does not apply to the 4.9-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 9adc89af724f12a03b47099cd943ed54e877cd59 Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni(a)redhat.com>
Date: Tue, 30 Mar 2021 18:43:54 +0200
Subject: [PATCH] net: let skb_orphan_partial wake-up waiters.
Currently the mentioned helper can end-up freeing the socket wmem
without waking-up any processes waiting for more write memory.
If the partially orphaned skb is attached to an UDP (or raw) socket,
the lack of wake-up can hang the user-space.
Even for TCP sockets not calling the sk destructor could have bad
effects on TSQ.
Address the issue using skb_orphan to release the sk wmem before
setting the new sock_efree destructor. Additionally bundle the
whole ownership update in a new helper, so that later other
potential users could avoid duplicate code.
v1 -> v2:
- use skb_orphan() instead of sort of open coding it (Eric)
- provide an helper for the ownership change (Eric)
Fixes: f6ba8d33cfbb ("netem: fix skb_orphan_partial()")
Suggested-by: Eric Dumazet <edumazet(a)google.com>
Signed-off-by: Paolo Abeni <pabeni(a)redhat.com>
Reviewed-by: Eric Dumazet <edumazet(a)google.com>
Signed-off-by: David S. Miller <davem(a)davemloft.net>
diff --git a/include/net/sock.h b/include/net/sock.h
index 0b6266fd6bf6..3e3a5da2ce5a 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -2221,6 +2221,15 @@ static inline void skb_set_owner_r(struct sk_buff *skb, struct sock *sk)
sk_mem_charge(sk, skb->truesize);
}
+static inline void skb_set_owner_sk_safe(struct sk_buff *skb, struct sock *sk)
+{
+ if (sk && refcount_inc_not_zero(&sk->sk_refcnt)) {
+ skb_orphan(skb);
+ skb->destructor = sock_efree;
+ skb->sk = sk;
+ }
+}
+
void sk_reset_timer(struct sock *sk, struct timer_list *timer,
unsigned long expires);
diff --git a/net/core/sock.c b/net/core/sock.c
index cc31b601ae10..5ec90f99e102 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -2132,16 +2132,10 @@ void skb_orphan_partial(struct sk_buff *skb)
if (skb_is_tcp_pure_ack(skb))
return;
- if (can_skb_orphan_partial(skb)) {
- struct sock *sk = skb->sk;
-
- if (refcount_inc_not_zero(&sk->sk_refcnt)) {
- WARN_ON(refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc));
- skb->destructor = sock_efree;
- }
- } else {
+ if (can_skb_orphan_partial(skb))
+ skb_set_owner_sk_safe(skb, skb->sk);
+ else
skb_orphan(skb);
- }
}
EXPORT_SYMBOL(skb_orphan_partial);
The patch below does not apply to the 4.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 9adc89af724f12a03b47099cd943ed54e877cd59 Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni(a)redhat.com>
Date: Tue, 30 Mar 2021 18:43:54 +0200
Subject: [PATCH] net: let skb_orphan_partial wake-up waiters.
Currently the mentioned helper can end-up freeing the socket wmem
without waking-up any processes waiting for more write memory.
If the partially orphaned skb is attached to an UDP (or raw) socket,
the lack of wake-up can hang the user-space.
Even for TCP sockets not calling the sk destructor could have bad
effects on TSQ.
Address the issue using skb_orphan to release the sk wmem before
setting the new sock_efree destructor. Additionally bundle the
whole ownership update in a new helper, so that later other
potential users could avoid duplicate code.
v1 -> v2:
- use skb_orphan() instead of sort of open coding it (Eric)
- provide an helper for the ownership change (Eric)
Fixes: f6ba8d33cfbb ("netem: fix skb_orphan_partial()")
Suggested-by: Eric Dumazet <edumazet(a)google.com>
Signed-off-by: Paolo Abeni <pabeni(a)redhat.com>
Reviewed-by: Eric Dumazet <edumazet(a)google.com>
Signed-off-by: David S. Miller <davem(a)davemloft.net>
diff --git a/include/net/sock.h b/include/net/sock.h
index 0b6266fd6bf6..3e3a5da2ce5a 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -2221,6 +2221,15 @@ static inline void skb_set_owner_r(struct sk_buff *skb, struct sock *sk)
sk_mem_charge(sk, skb->truesize);
}
+static inline void skb_set_owner_sk_safe(struct sk_buff *skb, struct sock *sk)
+{
+ if (sk && refcount_inc_not_zero(&sk->sk_refcnt)) {
+ skb_orphan(skb);
+ skb->destructor = sock_efree;
+ skb->sk = sk;
+ }
+}
+
void sk_reset_timer(struct sock *sk, struct timer_list *timer,
unsigned long expires);
diff --git a/net/core/sock.c b/net/core/sock.c
index cc31b601ae10..5ec90f99e102 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -2132,16 +2132,10 @@ void skb_orphan_partial(struct sk_buff *skb)
if (skb_is_tcp_pure_ack(skb))
return;
- if (can_skb_orphan_partial(skb)) {
- struct sock *sk = skb->sk;
-
- if (refcount_inc_not_zero(&sk->sk_refcnt)) {
- WARN_ON(refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc));
- skb->destructor = sock_efree;
- }
- } else {
+ if (can_skb_orphan_partial(skb))
+ skb_set_owner_sk_safe(skb, skb->sk);
+ else
skb_orphan(skb);
- }
}
EXPORT_SYMBOL(skb_orphan_partial);
The patch below does not apply to the 5.11-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From b3650bf76a32380d4d80a3e21b5583e7303f216c Mon Sep 17 00:00:00 2001
From: Vlad Buslov <vladbu(a)nvidia.com>
Date: Wed, 7 Apr 2021 18:36:04 +0300
Subject: [PATCH] net: sched: fix err handler in tcf_action_init()
With recent changes that separated action module load from action
initialization tcf_action_init() function error handling code was modified
to manually release the loaded modules if loading/initialization of any
further action in same batch failed. For the case when all modules
successfully loaded and some of the actions were initialized before one of
them failed in init handler. In this case for all previous actions the
module will be released twice by the error handler: First time by the loop
that manually calls module_put() for all ops, and second time by the action
destroy code that puts the module after destroying the action.
Reproduction:
$ sudo tc actions add action simple sdata \"2\" index 2
$ sudo tc actions add action simple sdata \"1\" index 1 \
action simple sdata \"2\" index 2
RTNETLINK answers: File exists
We have an error talking to the kernel
$ sudo tc actions ls action simple
total acts 1
action order 0: Simple <"2">
index 2 ref 1 bind 0
$ sudo tc actions flush action simple
$ sudo tc actions ls action simple
$ sudo tc actions add action simple sdata \"2\" index 2
Error: Failed to load TC action module.
We have an error talking to the kernel
$ lsmod | grep simple
act_simple 20480 -1
Fix the issue by modifying module reference counting handling in action
initialization code:
- Get module reference in tcf_idr_create() and put it in tcf_idr_release()
instead of taking over the reference held by the caller.
- Modify users of tcf_action_init_1() to always release the module
reference which they obtain before calling init function instead of
assuming that created action takes over the reference.
- Finally, modify tcf_action_init_1() to not release the module reference
when overwriting existing action as this is no longer necessary since both
upper and lower layers obtain and manage their own module references
independently.
Fixes: d349f9976868 ("net_sched: fix RTNL deadlock again caused by request_module()")
Suggested-by: Cong Wang <xiyou.wangcong(a)gmail.com>
Signed-off-by: Vlad Buslov <vladbu(a)nvidia.com>
Signed-off-by: David S. Miller <davem(a)davemloft.net>
diff --git a/include/net/act_api.h b/include/net/act_api.h
index 312f0f6554a0..086b291e9530 100644
--- a/include/net/act_api.h
+++ b/include/net/act_api.h
@@ -170,12 +170,7 @@ void tcf_idr_insert_many(struct tc_action *actions[]);
void tcf_idr_cleanup(struct tc_action_net *tn, u32 index);
int tcf_idr_check_alloc(struct tc_action_net *tn, u32 *index,
struct tc_action **a, int bind);
-int __tcf_idr_release(struct tc_action *a, bool bind, bool strict);
-
-static inline int tcf_idr_release(struct tc_action *a, bool bind)
-{
- return __tcf_idr_release(a, bind, false);
-}
+int tcf_idr_release(struct tc_action *a, bool bind);
int tcf_register_action(struct tc_action_ops *a, struct pernet_operations *ops);
int tcf_unregister_action(struct tc_action_ops *a,
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index 50854cfbfcdb..f6d5755d669e 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -158,7 +158,7 @@ static int __tcf_action_put(struct tc_action *p, bool bind)
return 0;
}
-int __tcf_idr_release(struct tc_action *p, bool bind, bool strict)
+static int __tcf_idr_release(struct tc_action *p, bool bind, bool strict)
{
int ret = 0;
@@ -184,7 +184,18 @@ int __tcf_idr_release(struct tc_action *p, bool bind, bool strict)
return ret;
}
-EXPORT_SYMBOL(__tcf_idr_release);
+
+int tcf_idr_release(struct tc_action *a, bool bind)
+{
+ const struct tc_action_ops *ops = a->ops;
+ int ret;
+
+ ret = __tcf_idr_release(a, bind, false);
+ if (ret == ACT_P_DELETED)
+ module_put(ops->owner);
+ return ret;
+}
+EXPORT_SYMBOL(tcf_idr_release);
static size_t tcf_action_shared_attrs_size(const struct tc_action *act)
{
@@ -493,6 +504,7 @@ int tcf_idr_create(struct tc_action_net *tn, u32 index, struct nlattr *est,
}
p->idrinfo = idrinfo;
+ __module_get(ops->owner);
p->ops = ops;
*a = p;
return 0;
@@ -1037,13 +1049,6 @@ struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp,
if (!name)
a->hw_stats = hw_stats;
- /* module count goes up only when brand new policy is created
- * if it exists and is only bound to in a_o->init() then
- * ACT_P_CREATED is not returned (a zero is).
- */
- if (err != ACT_P_CREATED)
- module_put(a_o->owner);
-
return a;
err_out:
@@ -1103,7 +1108,8 @@ int tcf_action_init(struct net *net, struct tcf_proto *tp, struct nlattr *nla,
tcf_idr_insert_many(actions);
*attr_size = tcf_action_full_attrs_size(sz);
- return i - 1;
+ err = i - 1;
+ goto err_mod;
err:
tcf_action_destroy(actions, bind);
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index 9ecb91ebf094..340d5af86e87 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -3054,10 +3054,9 @@ int tcf_exts_validate(struct net *net, struct tcf_proto *tp, struct nlattr **tb,
rate_tlv, "police", ovr,
TCA_ACT_BIND, a_o, init_res,
rtnl_held, extack);
- if (IS_ERR(act)) {
- module_put(a_o->owner);
+ module_put(a_o->owner);
+ if (IS_ERR(act))
return PTR_ERR(act);
- }
act->type = exts->type = TCA_OLD_COMPAT;
exts->actions[0] = act;
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From b3650bf76a32380d4d80a3e21b5583e7303f216c Mon Sep 17 00:00:00 2001
From: Vlad Buslov <vladbu(a)nvidia.com>
Date: Wed, 7 Apr 2021 18:36:04 +0300
Subject: [PATCH] net: sched: fix err handler in tcf_action_init()
With recent changes that separated action module load from action
initialization tcf_action_init() function error handling code was modified
to manually release the loaded modules if loading/initialization of any
further action in same batch failed. For the case when all modules
successfully loaded and some of the actions were initialized before one of
them failed in init handler. In this case for all previous actions the
module will be released twice by the error handler: First time by the loop
that manually calls module_put() for all ops, and second time by the action
destroy code that puts the module after destroying the action.
Reproduction:
$ sudo tc actions add action simple sdata \"2\" index 2
$ sudo tc actions add action simple sdata \"1\" index 1 \
action simple sdata \"2\" index 2
RTNETLINK answers: File exists
We have an error talking to the kernel
$ sudo tc actions ls action simple
total acts 1
action order 0: Simple <"2">
index 2 ref 1 bind 0
$ sudo tc actions flush action simple
$ sudo tc actions ls action simple
$ sudo tc actions add action simple sdata \"2\" index 2
Error: Failed to load TC action module.
We have an error talking to the kernel
$ lsmod | grep simple
act_simple 20480 -1
Fix the issue by modifying module reference counting handling in action
initialization code:
- Get module reference in tcf_idr_create() and put it in tcf_idr_release()
instead of taking over the reference held by the caller.
- Modify users of tcf_action_init_1() to always release the module
reference which they obtain before calling init function instead of
assuming that created action takes over the reference.
- Finally, modify tcf_action_init_1() to not release the module reference
when overwriting existing action as this is no longer necessary since both
upper and lower layers obtain and manage their own module references
independently.
Fixes: d349f9976868 ("net_sched: fix RTNL deadlock again caused by request_module()")
Suggested-by: Cong Wang <xiyou.wangcong(a)gmail.com>
Signed-off-by: Vlad Buslov <vladbu(a)nvidia.com>
Signed-off-by: David S. Miller <davem(a)davemloft.net>
diff --git a/include/net/act_api.h b/include/net/act_api.h
index 312f0f6554a0..086b291e9530 100644
--- a/include/net/act_api.h
+++ b/include/net/act_api.h
@@ -170,12 +170,7 @@ void tcf_idr_insert_many(struct tc_action *actions[]);
void tcf_idr_cleanup(struct tc_action_net *tn, u32 index);
int tcf_idr_check_alloc(struct tc_action_net *tn, u32 *index,
struct tc_action **a, int bind);
-int __tcf_idr_release(struct tc_action *a, bool bind, bool strict);
-
-static inline int tcf_idr_release(struct tc_action *a, bool bind)
-{
- return __tcf_idr_release(a, bind, false);
-}
+int tcf_idr_release(struct tc_action *a, bool bind);
int tcf_register_action(struct tc_action_ops *a, struct pernet_operations *ops);
int tcf_unregister_action(struct tc_action_ops *a,
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index 50854cfbfcdb..f6d5755d669e 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -158,7 +158,7 @@ static int __tcf_action_put(struct tc_action *p, bool bind)
return 0;
}
-int __tcf_idr_release(struct tc_action *p, bool bind, bool strict)
+static int __tcf_idr_release(struct tc_action *p, bool bind, bool strict)
{
int ret = 0;
@@ -184,7 +184,18 @@ int __tcf_idr_release(struct tc_action *p, bool bind, bool strict)
return ret;
}
-EXPORT_SYMBOL(__tcf_idr_release);
+
+int tcf_idr_release(struct tc_action *a, bool bind)
+{
+ const struct tc_action_ops *ops = a->ops;
+ int ret;
+
+ ret = __tcf_idr_release(a, bind, false);
+ if (ret == ACT_P_DELETED)
+ module_put(ops->owner);
+ return ret;
+}
+EXPORT_SYMBOL(tcf_idr_release);
static size_t tcf_action_shared_attrs_size(const struct tc_action *act)
{
@@ -493,6 +504,7 @@ int tcf_idr_create(struct tc_action_net *tn, u32 index, struct nlattr *est,
}
p->idrinfo = idrinfo;
+ __module_get(ops->owner);
p->ops = ops;
*a = p;
return 0;
@@ -1037,13 +1049,6 @@ struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp,
if (!name)
a->hw_stats = hw_stats;
- /* module count goes up only when brand new policy is created
- * if it exists and is only bound to in a_o->init() then
- * ACT_P_CREATED is not returned (a zero is).
- */
- if (err != ACT_P_CREATED)
- module_put(a_o->owner);
-
return a;
err_out:
@@ -1103,7 +1108,8 @@ int tcf_action_init(struct net *net, struct tcf_proto *tp, struct nlattr *nla,
tcf_idr_insert_many(actions);
*attr_size = tcf_action_full_attrs_size(sz);
- return i - 1;
+ err = i - 1;
+ goto err_mod;
err:
tcf_action_destroy(actions, bind);
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index 9ecb91ebf094..340d5af86e87 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -3054,10 +3054,9 @@ int tcf_exts_validate(struct net *net, struct tcf_proto *tp, struct nlattr **tb,
rate_tlv, "police", ovr,
TCA_ACT_BIND, a_o, init_res,
rtnl_held, extack);
- if (IS_ERR(act)) {
- module_put(a_o->owner);
+ module_put(a_o->owner);
+ if (IS_ERR(act))
return PTR_ERR(act);
- }
act->type = exts->type = TCA_OLD_COMPAT;
exts->actions[0] = act;
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 0f6925b3e8da0dbbb52447ca8a8b42b371aac7db Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet(a)google.com>
Date: Fri, 2 Apr 2021 06:26:02 -0700
Subject: [PATCH] virtio_net: Do not pull payload in skb->head
Xuan Zhuo reported that commit 3226b158e67c ("net: avoid 32 x truesize
under-estimation for tiny skbs") brought a ~10% performance drop.
The reason for the performance drop was that GRO was forced
to chain sk_buff (using skb_shinfo(skb)->frag_list), which
uses more memory but also cause packet consumers to go over
a lot of overhead handling all the tiny skbs.
It turns out that virtio_net page_to_skb() has a wrong strategy :
It allocates skbs with GOOD_COPY_LEN (128) bytes in skb->head, then
copies 128 bytes from the page, before feeding the packet to GRO stack.
This was suboptimal before commit 3226b158e67c ("net: avoid 32 x truesize
under-estimation for tiny skbs") because GRO was using 2 frags per MSS,
meaning we were not packing MSS with 100% efficiency.
Fix is to pull only the ethernet header in page_to_skb()
Then, we change virtio_net_hdr_to_skb() to pull the missing
headers, instead of assuming they were already pulled by callers.
This fixes the performance regression, but could also allow virtio_net
to accept packets with more than 128bytes of headers.
Many thanks to Xuan Zhuo for his report, and his tests/help.
Fixes: 3226b158e67c ("net: avoid 32 x truesize under-estimation for tiny skbs")
Reported-by: Xuan Zhuo <xuanzhuo(a)linux.alibaba.com>
Link: https://www.spinics.net/lists/netdev/msg731397.html
Co-Developed-by: Xuan Zhuo <xuanzhuo(a)linux.alibaba.com>
Signed-off-by: Xuan Zhuo <xuanzhuo(a)linux.alibaba.com>
Signed-off-by: Eric Dumazet <edumazet(a)google.com>
Cc: "Michael S. Tsirkin" <mst(a)redhat.com>
Cc: Jason Wang <jasowang(a)redhat.com>
Cc: virtualization(a)lists.linux-foundation.org
Acked-by: Jason Wang <jasowang(a)redhat.com>
Signed-off-by: David S. Miller <davem(a)davemloft.net>
diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 82e520d2cb12..0824e6999e49 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -406,9 +406,13 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi,
offset += hdr_padded_len;
p += hdr_padded_len;
- copy = len;
- if (copy > skb_tailroom(skb))
- copy = skb_tailroom(skb);
+ /* Copy all frame if it fits skb->head, otherwise
+ * we let virtio_net_hdr_to_skb() and GRO pull headers as needed.
+ */
+ if (len <= skb_tailroom(skb))
+ copy = len;
+ else
+ copy = ETH_HLEN + metasize;
skb_put_data(skb, p, copy);
if (metasize) {
diff --git a/include/linux/virtio_net.h b/include/linux/virtio_net.h
index 98775d7fa696..b465f8f3e554 100644
--- a/include/linux/virtio_net.h
+++ b/include/linux/virtio_net.h
@@ -65,14 +65,18 @@ static inline int virtio_net_hdr_to_skb(struct sk_buff *skb,
skb_reset_mac_header(skb);
if (hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
- u16 start = __virtio16_to_cpu(little_endian, hdr->csum_start);
- u16 off = __virtio16_to_cpu(little_endian, hdr->csum_offset);
+ u32 start = __virtio16_to_cpu(little_endian, hdr->csum_start);
+ u32 off = __virtio16_to_cpu(little_endian, hdr->csum_offset);
+ u32 needed = start + max_t(u32, thlen, off + sizeof(__sum16));
+
+ if (!pskb_may_pull(skb, needed))
+ return -EINVAL;
if (!skb_partial_csum_set(skb, start, off))
return -EINVAL;
p_off = skb_transport_offset(skb) + thlen;
- if (p_off > skb_headlen(skb))
+ if (!pskb_may_pull(skb, p_off))
return -EINVAL;
} else {
/* gso packets without NEEDS_CSUM do not set transport_offset.
@@ -102,14 +106,14 @@ static inline int virtio_net_hdr_to_skb(struct sk_buff *skb,
}
p_off = keys.control.thoff + thlen;
- if (p_off > skb_headlen(skb) ||
+ if (!pskb_may_pull(skb, p_off) ||
keys.basic.ip_proto != ip_proto)
return -EINVAL;
skb_set_transport_header(skb, keys.control.thoff);
} else if (gso_type) {
p_off = thlen;
- if (p_off > skb_headlen(skb))
+ if (!pskb_may_pull(skb, p_off))
return -EINVAL;
}
}
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 4b5923249b8fa427943b50b8f35265176472be38 Mon Sep 17 00:00:00 2001
From: Martin Blumenstingl <martin.blumenstingl(a)googlemail.com>
Date: Thu, 8 Apr 2021 20:38:28 +0200
Subject: [PATCH] net: dsa: lantiq_gswip: Configure all remaining GSWIP_MII_CFG
bits
There are a few more bits in the GSWIP_MII_CFG register for which we
did rely on the boot-loader (or the hardware defaults) to set them up
properly.
For some external RMII PHYs we need to select the GSWIP_MII_CFG_RMII_CLK
bit and also we should un-set it for non-RMII PHYs. The
GSWIP_MII_CFG_RMII_CLK bit is ignored for other PHY connection modes.
The GSWIP IP also supports in-band auto-negotiation for RGMII PHYs when
the GSWIP_MII_CFG_RGMII_IBS bit is set. Clear this bit always as there's
no known hardware which uses this (so it is not tested yet).
Clear the xMII isolation bit when set at initialization time if it was
previously set by the bootloader. Not doing so could lead to no traffic
(neither RX nor TX) on a port with this bit set.
While here, also add the GSWIP_MII_CFG_RESET bit. We don't need to
manage it because this bit is self-clearning when set. We still add it
here to get a better overview of the GSWIP_MII_CFG register.
Fixes: 14fceff4771e51 ("net: dsa: Add Lantiq / Intel DSA driver for vrx200")
Cc: stable(a)vger.kernel.org
Suggested-by: Hauke Mehrtens <hauke(a)hauke-m.de>
Acked-by: Hauke Mehrtens <hauke(a)hauke-m.de>
Signed-off-by: Martin Blumenstingl <martin.blumenstingl(a)googlemail.com>
Reviewed-by: Florian Fainelli <f.fainelli(a)gmail.com>
Signed-off-by: David S. Miller <davem(a)davemloft.net>
diff --git a/drivers/net/dsa/lantiq_gswip.c b/drivers/net/dsa/lantiq_gswip.c
index 126d4ea868ba..bf5c62e5c0b0 100644
--- a/drivers/net/dsa/lantiq_gswip.c
+++ b/drivers/net/dsa/lantiq_gswip.c
@@ -93,8 +93,12 @@
/* GSWIP MII Registers */
#define GSWIP_MII_CFGp(p) (0x2 * (p))
+#define GSWIP_MII_CFG_RESET BIT(15)
#define GSWIP_MII_CFG_EN BIT(14)
+#define GSWIP_MII_CFG_ISOLATE BIT(13)
#define GSWIP_MII_CFG_LDCLKDIS BIT(12)
+#define GSWIP_MII_CFG_RGMII_IBS BIT(8)
+#define GSWIP_MII_CFG_RMII_CLK BIT(7)
#define GSWIP_MII_CFG_MODE_MIIP 0x0
#define GSWIP_MII_CFG_MODE_MIIM 0x1
#define GSWIP_MII_CFG_MODE_RMIIP 0x2
@@ -821,9 +825,11 @@ static int gswip_setup(struct dsa_switch *ds)
/* Configure the MDIO Clock 2.5 MHz */
gswip_mdio_mask(priv, 0xff, 0x09, GSWIP_MDIO_MDC_CFG1);
- /* Disable the xMII link */
+ /* Disable the xMII interface and clear it's isolation bit */
for (i = 0; i < priv->hw_info->max_ports; i++)
- gswip_mii_mask_cfg(priv, GSWIP_MII_CFG_EN, 0, i);
+ gswip_mii_mask_cfg(priv,
+ GSWIP_MII_CFG_EN | GSWIP_MII_CFG_ISOLATE,
+ 0, i);
/* enable special tag insertion on cpu port */
gswip_switch_mask(priv, 0, GSWIP_FDMA_PCTRL_STEN,
@@ -1597,6 +1603,9 @@ static void gswip_phylink_mac_config(struct dsa_switch *ds, int port,
break;
case PHY_INTERFACE_MODE_RMII:
miicfg |= GSWIP_MII_CFG_MODE_RMIIM;
+
+ /* Configure the RMII clock as output: */
+ miicfg |= GSWIP_MII_CFG_RMII_CLK;
break;
case PHY_INTERFACE_MODE_RGMII:
case PHY_INTERFACE_MODE_RGMII_ID:
@@ -1609,7 +1618,11 @@ static void gswip_phylink_mac_config(struct dsa_switch *ds, int port,
"Unsupported interface: %d\n", state->interface);
return;
}
- gswip_mii_mask_cfg(priv, GSWIP_MII_CFG_MODE_MASK, miicfg, port);
+
+ gswip_mii_mask_cfg(priv,
+ GSWIP_MII_CFG_MODE_MASK | GSWIP_MII_CFG_RMII_CLK |
+ GSWIP_MII_CFG_RGMII_IBS | GSWIP_MII_CFG_LDCLKDIS,
+ miicfg, port);
switch (state->interface) {
case PHY_INTERFACE_MODE_RGMII_ID:
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 3e9005be87777afc902b9f5497495898202d335d Mon Sep 17 00:00:00 2001
From: Martin Blumenstingl <martin.blumenstingl(a)googlemail.com>
Date: Thu, 8 Apr 2021 20:38:27 +0200
Subject: [PATCH] net: dsa: lantiq_gswip: Don't use PHY auto polling
PHY auto polling on the GSWIP hardware can be used so link changes
(speed, link up/down, etc.) can be detected automatically. Internally
GSWIP reads the PHY's registers for this functionality. Based on this
automatic detection GSWIP can also automatically re-configure it's port
settings. Unfortunately this auto polling (and configuration) mechanism
seems to cause various issues observed by different people on different
devices:
- FritzBox 7360v2: the two Gbit/s ports (connected to the two internal
PHY11G instances) are working fine but the two Fast Ethernet ports
(using an AR8030 RMII PHY) are completely dead (neither RX nor TX are
received). It turns out that the AR8030 PHY sets the BMSR_ESTATEN bit
as well as the ESTATUS_1000_TFULL and ESTATUS_1000_XFULL bits. This
makes the PHY auto polling state machine (rightfully?) think that the
established link speed (when the other side is Gbit/s capable) is
1Gbit/s.
- None of the Ethernet ports on the Zyxel P-2812HNU-F1 (two are
connected to the internal PHY11G GPHYs while the other three are
external RGMII PHYs) are working. Neither RX nor TX traffic was
observed. It is not clear which part of the PHY auto polling state-
machine caused this.
- FritzBox 7412 (only one LAN port which is connected to one of the
internal GPHYs running in PHY22F / Fast Ethernet mode) was seeing
random disconnects (link down events could be seen). Sometimes all
traffic would stop after such disconnect. It is not clear which part
of the PHY auto polling state-machine cauased this.
- TP-Link TD-W9980 (two ports are connected to the internal GPHYs
running in PHY11G / Gbit/s mode, the other two are external RGMII
PHYs) was affected by similar issues as the FritzBox 7412 just without
the "link down" events
Switch to software based configuration instead of PHY auto polling (and
letting the GSWIP hardware configure the ports automatically) for the
following link parameters:
- link up/down
- link speed
- full/half duplex
- flow control (RX / TX pause)
After a big round of manual testing by various people (who helped test
this on OpenWrt) it turns out that this fixes all reported issues.
Additionally it can be considered more future proof because any
"quirk" which is implemented for a PHY on the driver side can now be
used with the GSWIP hardware as well because Linux is in control of the
link parameters.
As a nice side-effect this also solves a problem where fixed-links were
not supported previously because we were relying on the PHY auto polling
mechanism, which cannot work for fixed-links as there's no PHY from
where it can read the registers. Configuring the link settings on the
GSWIP ports means that we now use the settings from device-tree also for
ports with fixed-links.
Fixes: 14fceff4771e51 ("net: dsa: Add Lantiq / Intel DSA driver for vrx200")
Fixes: 3e6fdeb28f4c33 ("net: dsa: lantiq_gswip: Let GSWIP automatically set the xMII clock")
Cc: stable(a)vger.kernel.org
Acked-by: Hauke Mehrtens <hauke(a)hauke-m.de>
Reviewed-by: Andrew Lunn <andrew(a)lunn.ch>
Signed-off-by: Martin Blumenstingl <martin.blumenstingl(a)googlemail.com>
Reviewed-by: Florian Fainelli <f.fainelli(a)gmail.com>
Signed-off-by: David S. Miller <davem(a)davemloft.net>
diff --git a/drivers/net/dsa/lantiq_gswip.c b/drivers/net/dsa/lantiq_gswip.c
index 809dfa3be6bb..126d4ea868ba 100644
--- a/drivers/net/dsa/lantiq_gswip.c
+++ b/drivers/net/dsa/lantiq_gswip.c
@@ -190,6 +190,23 @@
#define GSWIP_PCE_DEFPVID(p) (0x486 + ((p) * 0xA))
#define GSWIP_MAC_FLEN 0x8C5
+#define GSWIP_MAC_CTRL_0p(p) (0x903 + ((p) * 0xC))
+#define GSWIP_MAC_CTRL_0_PADEN BIT(8)
+#define GSWIP_MAC_CTRL_0_FCS_EN BIT(7)
+#define GSWIP_MAC_CTRL_0_FCON_MASK 0x0070
+#define GSWIP_MAC_CTRL_0_FCON_AUTO 0x0000
+#define GSWIP_MAC_CTRL_0_FCON_RX 0x0010
+#define GSWIP_MAC_CTRL_0_FCON_TX 0x0020
+#define GSWIP_MAC_CTRL_0_FCON_RXTX 0x0030
+#define GSWIP_MAC_CTRL_0_FCON_NONE 0x0040
+#define GSWIP_MAC_CTRL_0_FDUP_MASK 0x000C
+#define GSWIP_MAC_CTRL_0_FDUP_AUTO 0x0000
+#define GSWIP_MAC_CTRL_0_FDUP_EN 0x0004
+#define GSWIP_MAC_CTRL_0_FDUP_DIS 0x000C
+#define GSWIP_MAC_CTRL_0_GMII_MASK 0x0003
+#define GSWIP_MAC_CTRL_0_GMII_AUTO 0x0000
+#define GSWIP_MAC_CTRL_0_GMII_MII 0x0001
+#define GSWIP_MAC_CTRL_0_GMII_RGMII 0x0002
#define GSWIP_MAC_CTRL_2p(p) (0x905 + ((p) * 0xC))
#define GSWIP_MAC_CTRL_2_MLEN BIT(3) /* Maximum Untagged Frame Lnegth */
@@ -653,16 +670,13 @@ static int gswip_port_enable(struct dsa_switch *ds, int port,
GSWIP_SDMA_PCTRLp(port));
if (!dsa_is_cpu_port(ds, port)) {
- u32 macconf = GSWIP_MDIO_PHY_LINK_AUTO |
- GSWIP_MDIO_PHY_SPEED_AUTO |
- GSWIP_MDIO_PHY_FDUP_AUTO |
- GSWIP_MDIO_PHY_FCONTX_AUTO |
- GSWIP_MDIO_PHY_FCONRX_AUTO |
- (phydev->mdio.addr & GSWIP_MDIO_PHY_ADDR_MASK);
-
- gswip_mdio_w(priv, macconf, GSWIP_MDIO_PHYp(port));
- /* Activate MDIO auto polling */
- gswip_mdio_mask(priv, 0, BIT(port), GSWIP_MDIO_MDC_CFG0);
+ u32 mdio_phy = 0;
+
+ if (phydev)
+ mdio_phy = phydev->mdio.addr & GSWIP_MDIO_PHY_ADDR_MASK;
+
+ gswip_mdio_mask(priv, GSWIP_MDIO_PHY_ADDR_MASK, mdio_phy,
+ GSWIP_MDIO_PHYp(port));
}
return 0;
@@ -675,14 +689,6 @@ static void gswip_port_disable(struct dsa_switch *ds, int port)
if (!dsa_is_user_port(ds, port))
return;
- if (!dsa_is_cpu_port(ds, port)) {
- gswip_mdio_mask(priv, GSWIP_MDIO_PHY_LINK_DOWN,
- GSWIP_MDIO_PHY_LINK_MASK,
- GSWIP_MDIO_PHYp(port));
- /* Deactivate MDIO auto polling */
- gswip_mdio_mask(priv, BIT(port), 0, GSWIP_MDIO_MDC_CFG0);
- }
-
gswip_switch_mask(priv, GSWIP_FDMA_PCTRL_EN, 0,
GSWIP_FDMA_PCTRLp(port));
gswip_switch_mask(priv, GSWIP_SDMA_PCTRL_EN, 0,
@@ -794,20 +800,31 @@ static int gswip_setup(struct dsa_switch *ds)
gswip_switch_w(priv, BIT(cpu_port), GSWIP_PCE_PMAP2);
gswip_switch_w(priv, BIT(cpu_port), GSWIP_PCE_PMAP3);
- /* disable PHY auto polling */
+ /* Deactivate MDIO PHY auto polling. Some PHYs as the AR8030 have an
+ * interoperability problem with this auto polling mechanism because
+ * their status registers think that the link is in a different state
+ * than it actually is. For the AR8030 it has the BMSR_ESTATEN bit set
+ * as well as ESTATUS_1000_TFULL and ESTATUS_1000_XFULL. This makes the
+ * auto polling state machine consider the link being negotiated with
+ * 1Gbit/s. Since the PHY itself is a Fast Ethernet RMII PHY this leads
+ * to the switch port being completely dead (RX and TX are both not
+ * working).
+ * Also with various other PHY / port combinations (PHY11G GPHY, PHY22F
+ * GPHY, external RGMII PEF7071/7072) any traffic would stop. Sometimes
+ * it would work fine for a few minutes to hours and then stop, on
+ * other device it would no traffic could be sent or received at all.
+ * Testing shows that when PHY auto polling is disabled these problems
+ * go away.
+ */
gswip_mdio_w(priv, 0x0, GSWIP_MDIO_MDC_CFG0);
+
/* Configure the MDIO Clock 2.5 MHz */
gswip_mdio_mask(priv, 0xff, 0x09, GSWIP_MDIO_MDC_CFG1);
- for (i = 0; i < priv->hw_info->max_ports; i++) {
- /* Disable the xMII link */
+ /* Disable the xMII link */
+ for (i = 0; i < priv->hw_info->max_ports; i++)
gswip_mii_mask_cfg(priv, GSWIP_MII_CFG_EN, 0, i);
- /* Automatically select the xMII interface clock */
- gswip_mii_mask_cfg(priv, GSWIP_MII_CFG_RATE_MASK,
- GSWIP_MII_CFG_RATE_AUTO, i);
- }
-
/* enable special tag insertion on cpu port */
gswip_switch_mask(priv, 0, GSWIP_FDMA_PCTRL_STEN,
GSWIP_FDMA_PCTRLp(cpu_port));
@@ -1455,6 +1472,112 @@ static void gswip_phylink_validate(struct dsa_switch *ds, int port,
return;
}
+static void gswip_port_set_link(struct gswip_priv *priv, int port, bool link)
+{
+ u32 mdio_phy;
+
+ if (link)
+ mdio_phy = GSWIP_MDIO_PHY_LINK_UP;
+ else
+ mdio_phy = GSWIP_MDIO_PHY_LINK_DOWN;
+
+ gswip_mdio_mask(priv, GSWIP_MDIO_PHY_LINK_MASK, mdio_phy,
+ GSWIP_MDIO_PHYp(port));
+}
+
+static void gswip_port_set_speed(struct gswip_priv *priv, int port, int speed,
+ phy_interface_t interface)
+{
+ u32 mdio_phy = 0, mii_cfg = 0, mac_ctrl_0 = 0;
+
+ switch (speed) {
+ case SPEED_10:
+ mdio_phy = GSWIP_MDIO_PHY_SPEED_M10;
+
+ if (interface == PHY_INTERFACE_MODE_RMII)
+ mii_cfg = GSWIP_MII_CFG_RATE_M50;
+ else
+ mii_cfg = GSWIP_MII_CFG_RATE_M2P5;
+
+ mac_ctrl_0 = GSWIP_MAC_CTRL_0_GMII_MII;
+ break;
+
+ case SPEED_100:
+ mdio_phy = GSWIP_MDIO_PHY_SPEED_M100;
+
+ if (interface == PHY_INTERFACE_MODE_RMII)
+ mii_cfg = GSWIP_MII_CFG_RATE_M50;
+ else
+ mii_cfg = GSWIP_MII_CFG_RATE_M25;
+
+ mac_ctrl_0 = GSWIP_MAC_CTRL_0_GMII_MII;
+ break;
+
+ case SPEED_1000:
+ mdio_phy = GSWIP_MDIO_PHY_SPEED_G1;
+
+ mii_cfg = GSWIP_MII_CFG_RATE_M125;
+
+ mac_ctrl_0 = GSWIP_MAC_CTRL_0_GMII_RGMII;
+ break;
+ }
+
+ gswip_mdio_mask(priv, GSWIP_MDIO_PHY_SPEED_MASK, mdio_phy,
+ GSWIP_MDIO_PHYp(port));
+ gswip_mii_mask_cfg(priv, GSWIP_MII_CFG_RATE_MASK, mii_cfg, port);
+ gswip_switch_mask(priv, GSWIP_MAC_CTRL_0_GMII_MASK, mac_ctrl_0,
+ GSWIP_MAC_CTRL_0p(port));
+}
+
+static void gswip_port_set_duplex(struct gswip_priv *priv, int port, int duplex)
+{
+ u32 mac_ctrl_0, mdio_phy;
+
+ if (duplex == DUPLEX_FULL) {
+ mac_ctrl_0 = GSWIP_MAC_CTRL_0_FDUP_EN;
+ mdio_phy = GSWIP_MDIO_PHY_FDUP_EN;
+ } else {
+ mac_ctrl_0 = GSWIP_MAC_CTRL_0_FDUP_DIS;
+ mdio_phy = GSWIP_MDIO_PHY_FDUP_DIS;
+ }
+
+ gswip_switch_mask(priv, GSWIP_MAC_CTRL_0_FDUP_MASK, mac_ctrl_0,
+ GSWIP_MAC_CTRL_0p(port));
+ gswip_mdio_mask(priv, GSWIP_MDIO_PHY_FDUP_MASK, mdio_phy,
+ GSWIP_MDIO_PHYp(port));
+}
+
+static void gswip_port_set_pause(struct gswip_priv *priv, int port,
+ bool tx_pause, bool rx_pause)
+{
+ u32 mac_ctrl_0, mdio_phy;
+
+ if (tx_pause && rx_pause) {
+ mac_ctrl_0 = GSWIP_MAC_CTRL_0_FCON_RXTX;
+ mdio_phy = GSWIP_MDIO_PHY_FCONTX_EN |
+ GSWIP_MDIO_PHY_FCONRX_EN;
+ } else if (tx_pause) {
+ mac_ctrl_0 = GSWIP_MAC_CTRL_0_FCON_TX;
+ mdio_phy = GSWIP_MDIO_PHY_FCONTX_EN |
+ GSWIP_MDIO_PHY_FCONRX_DIS;
+ } else if (rx_pause) {
+ mac_ctrl_0 = GSWIP_MAC_CTRL_0_FCON_RX;
+ mdio_phy = GSWIP_MDIO_PHY_FCONTX_DIS |
+ GSWIP_MDIO_PHY_FCONRX_EN;
+ } else {
+ mac_ctrl_0 = GSWIP_MAC_CTRL_0_FCON_NONE;
+ mdio_phy = GSWIP_MDIO_PHY_FCONTX_DIS |
+ GSWIP_MDIO_PHY_FCONRX_DIS;
+ }
+
+ gswip_switch_mask(priv, GSWIP_MAC_CTRL_0_FCON_MASK,
+ mac_ctrl_0, GSWIP_MAC_CTRL_0p(port));
+ gswip_mdio_mask(priv,
+ GSWIP_MDIO_PHY_FCONTX_MASK |
+ GSWIP_MDIO_PHY_FCONRX_MASK,
+ mdio_phy, GSWIP_MDIO_PHYp(port));
+}
+
static void gswip_phylink_mac_config(struct dsa_switch *ds, int port,
unsigned int mode,
const struct phylink_link_state *state)
@@ -1511,6 +1634,9 @@ static void gswip_phylink_mac_link_down(struct dsa_switch *ds, int port,
struct gswip_priv *priv = ds->priv;
gswip_mii_mask_cfg(priv, GSWIP_MII_CFG_EN, 0, port);
+
+ if (!dsa_is_cpu_port(ds, port))
+ gswip_port_set_link(priv, port, false);
}
static void gswip_phylink_mac_link_up(struct dsa_switch *ds, int port,
@@ -1522,6 +1648,13 @@ static void gswip_phylink_mac_link_up(struct dsa_switch *ds, int port,
{
struct gswip_priv *priv = ds->priv;
+ if (!dsa_is_cpu_port(ds, port)) {
+ gswip_port_set_link(priv, port, true);
+ gswip_port_set_speed(priv, port, speed, interface);
+ gswip_port_set_duplex(priv, port, duplex);
+ gswip_port_set_pause(priv, port, tx_pause, rx_pause);
+ }
+
gswip_mii_mask_cfg(priv, 0, GSWIP_MII_CFG_EN, port);
}
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 0f6925b3e8da0dbbb52447ca8a8b42b371aac7db Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet(a)google.com>
Date: Fri, 2 Apr 2021 06:26:02 -0700
Subject: [PATCH] virtio_net: Do not pull payload in skb->head
Xuan Zhuo reported that commit 3226b158e67c ("net: avoid 32 x truesize
under-estimation for tiny skbs") brought a ~10% performance drop.
The reason for the performance drop was that GRO was forced
to chain sk_buff (using skb_shinfo(skb)->frag_list), which
uses more memory but also cause packet consumers to go over
a lot of overhead handling all the tiny skbs.
It turns out that virtio_net page_to_skb() has a wrong strategy :
It allocates skbs with GOOD_COPY_LEN (128) bytes in skb->head, then
copies 128 bytes from the page, before feeding the packet to GRO stack.
This was suboptimal before commit 3226b158e67c ("net: avoid 32 x truesize
under-estimation for tiny skbs") because GRO was using 2 frags per MSS,
meaning we were not packing MSS with 100% efficiency.
Fix is to pull only the ethernet header in page_to_skb()
Then, we change virtio_net_hdr_to_skb() to pull the missing
headers, instead of assuming they were already pulled by callers.
This fixes the performance regression, but could also allow virtio_net
to accept packets with more than 128bytes of headers.
Many thanks to Xuan Zhuo for his report, and his tests/help.
Fixes: 3226b158e67c ("net: avoid 32 x truesize under-estimation for tiny skbs")
Reported-by: Xuan Zhuo <xuanzhuo(a)linux.alibaba.com>
Link: https://www.spinics.net/lists/netdev/msg731397.html
Co-Developed-by: Xuan Zhuo <xuanzhuo(a)linux.alibaba.com>
Signed-off-by: Xuan Zhuo <xuanzhuo(a)linux.alibaba.com>
Signed-off-by: Eric Dumazet <edumazet(a)google.com>
Cc: "Michael S. Tsirkin" <mst(a)redhat.com>
Cc: Jason Wang <jasowang(a)redhat.com>
Cc: virtualization(a)lists.linux-foundation.org
Acked-by: Jason Wang <jasowang(a)redhat.com>
Signed-off-by: David S. Miller <davem(a)davemloft.net>
diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 82e520d2cb12..0824e6999e49 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -406,9 +406,13 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi,
offset += hdr_padded_len;
p += hdr_padded_len;
- copy = len;
- if (copy > skb_tailroom(skb))
- copy = skb_tailroom(skb);
+ /* Copy all frame if it fits skb->head, otherwise
+ * we let virtio_net_hdr_to_skb() and GRO pull headers as needed.
+ */
+ if (len <= skb_tailroom(skb))
+ copy = len;
+ else
+ copy = ETH_HLEN + metasize;
skb_put_data(skb, p, copy);
if (metasize) {
diff --git a/include/linux/virtio_net.h b/include/linux/virtio_net.h
index 98775d7fa696..b465f8f3e554 100644
--- a/include/linux/virtio_net.h
+++ b/include/linux/virtio_net.h
@@ -65,14 +65,18 @@ static inline int virtio_net_hdr_to_skb(struct sk_buff *skb,
skb_reset_mac_header(skb);
if (hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
- u16 start = __virtio16_to_cpu(little_endian, hdr->csum_start);
- u16 off = __virtio16_to_cpu(little_endian, hdr->csum_offset);
+ u32 start = __virtio16_to_cpu(little_endian, hdr->csum_start);
+ u32 off = __virtio16_to_cpu(little_endian, hdr->csum_offset);
+ u32 needed = start + max_t(u32, thlen, off + sizeof(__sum16));
+
+ if (!pskb_may_pull(skb, needed))
+ return -EINVAL;
if (!skb_partial_csum_set(skb, start, off))
return -EINVAL;
p_off = skb_transport_offset(skb) + thlen;
- if (p_off > skb_headlen(skb))
+ if (!pskb_may_pull(skb, p_off))
return -EINVAL;
} else {
/* gso packets without NEEDS_CSUM do not set transport_offset.
@@ -102,14 +106,14 @@ static inline int virtio_net_hdr_to_skb(struct sk_buff *skb,
}
p_off = keys.control.thoff + thlen;
- if (p_off > skb_headlen(skb) ||
+ if (!pskb_may_pull(skb, p_off) ||
keys.basic.ip_proto != ip_proto)
return -EINVAL;
skb_set_transport_header(skb, keys.control.thoff);
} else if (gso_type) {
p_off = thlen;
- if (p_off > skb_headlen(skb))
+ if (!pskb_may_pull(skb, p_off))
return -EINVAL;
}
}