The patch below does not apply to the 4.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From d60d8b64280c8b36c085eda7821585c1ce911795 Mon Sep 17 00:00:00 2001
From: Christoffer Dall <christoffer.dall(a)linaro.org>
Date: Fri, 26 Jan 2018 16:06:51 +0100
Subject: [PATCH] KVM: arm/arm64: Fix arch timers with userspace irqchips
When introducing support for irqchip in userspace we needed a way to
mask the timer signal to prevent the guest continuously exiting due to a
screaming timer.
We did this by disabling the corresponding percpu interrupt on the
host interrupt controller, because we cannot rely on the host system
having a GIC, and therefore cannot make any assumptions about having an
active state to hide the timer signal.
Unfortunately, when introducing this feature, it became entirely
possible that a VCPU which belongs to a VM that has a userspace irqchip
can disable the vtimer irq on the host on some physical CPU, and then go
away without ever enabling the vtimer irq on that physical CPU again.
This means that using irqchips in userspace on a system that also
supports running VMs with an in-kernel GIC can prevent forward progress
from in-kernel GIC VMs.
Later on, when we started taking virtual timer interrupts in the arch
timer code, we would also leave this timer state active for userspace
irqchip VMs, because we leave it up to a VGIC-enabled guest to
deactivate the hardware IRQ using the HW bit in the LR.
Both issues are solved by only using the enable/disable trick on systems
that do not have a host GIC which supports the active state, because all
VMs on such systems must use irqchips in userspace. Systems that have a
working GIC with support for an active state use the active state to
mask the timer signal for both userspace and in-kernel irqchips.
Cc: Alexander Graf <agraf(a)suse.de>
Reviewed-by: Marc Zyngier <marc.zyngier(a)arm.com>
Cc: <stable(a)vger.kernel.org> # v4.12+
Fixes: d9e139778376 ("KVM: arm/arm64: Support arch timers with a userspace gic")
Signed-off-by: Christoffer Dall <christoffer.dall(a)linaro.org>
diff --git a/virt/kvm/arm/arch_timer.c b/virt/kvm/arm/arch_timer.c
index 70268c0bec79..70f4c30918eb 100644
--- a/virt/kvm/arm/arch_timer.c
+++ b/virt/kvm/arm/arch_timer.c
@@ -36,6 +36,8 @@ static struct timecounter *timecounter;
static unsigned int host_vtimer_irq;
static u32 host_vtimer_irq_flags;
+static DEFINE_STATIC_KEY_FALSE(has_gic_active_state);
+
static const struct kvm_irq_level default_ptimer_irq = {
.irq = 30,
.level = 1,
@@ -56,6 +58,12 @@ u64 kvm_phys_timer_read(void)
return timecounter->cc->read(timecounter->cc);
}
+static inline bool userspace_irqchip(struct kvm *kvm)
+{
+ return static_branch_unlikely(&userspace_irqchip_in_use) &&
+ unlikely(!irqchip_in_kernel(kvm));
+}
+
static void soft_timer_start(struct hrtimer *hrt, u64 ns)
{
hrtimer_start(hrt, ktime_add_ns(ktime_get(), ns),
@@ -69,25 +77,6 @@ static void soft_timer_cancel(struct hrtimer *hrt, struct work_struct *work)
cancel_work_sync(work);
}
-static void kvm_vtimer_update_mask_user(struct kvm_vcpu *vcpu)
-{
- struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
-
- /*
- * When using a userspace irqchip with the architected timers, we must
- * prevent continuously exiting from the guest, and therefore mask the
- * physical interrupt by disabling it on the host interrupt controller
- * when the virtual level is high, such that the guest can make
- * forward progress. Once we detect the output level being
- * de-asserted, we unmask the interrupt again so that we exit from the
- * guest when the timer fires.
- */
- if (vtimer->irq.level)
- disable_percpu_irq(host_vtimer_irq);
- else
- enable_percpu_irq(host_vtimer_irq, 0);
-}
-
static irqreturn_t kvm_arch_timer_handler(int irq, void *dev_id)
{
struct kvm_vcpu *vcpu = *(struct kvm_vcpu **)dev_id;
@@ -106,9 +95,9 @@ static irqreturn_t kvm_arch_timer_handler(int irq, void *dev_id)
if (kvm_timer_should_fire(vtimer))
kvm_timer_update_irq(vcpu, true, vtimer);
- if (static_branch_unlikely(&userspace_irqchip_in_use) &&
- unlikely(!irqchip_in_kernel(vcpu->kvm)))
- kvm_vtimer_update_mask_user(vcpu);
+ if (userspace_irqchip(vcpu->kvm) &&
+ !static_branch_unlikely(&has_gic_active_state))
+ disable_percpu_irq(host_vtimer_irq);
return IRQ_HANDLED;
}
@@ -290,8 +279,7 @@ static void kvm_timer_update_irq(struct kvm_vcpu *vcpu, bool new_level,
trace_kvm_timer_update_irq(vcpu->vcpu_id, timer_ctx->irq.irq,
timer_ctx->irq.level);
- if (!static_branch_unlikely(&userspace_irqchip_in_use) ||
- likely(irqchip_in_kernel(vcpu->kvm))) {
+ if (!userspace_irqchip(vcpu->kvm)) {
ret = kvm_vgic_inject_irq(vcpu->kvm, vcpu->vcpu_id,
timer_ctx->irq.irq,
timer_ctx->irq.level,
@@ -350,12 +338,6 @@ static void kvm_timer_update_state(struct kvm_vcpu *vcpu)
phys_timer_emulate(vcpu);
}
-static void __timer_snapshot_state(struct arch_timer_context *timer)
-{
- timer->cnt_ctl = read_sysreg_el0(cntv_ctl);
- timer->cnt_cval = read_sysreg_el0(cntv_cval);
-}
-
static void vtimer_save_state(struct kvm_vcpu *vcpu)
{
struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
@@ -367,8 +349,10 @@ static void vtimer_save_state(struct kvm_vcpu *vcpu)
if (!vtimer->loaded)
goto out;
- if (timer->enabled)
- __timer_snapshot_state(vtimer);
+ if (timer->enabled) {
+ vtimer->cnt_ctl = read_sysreg_el0(cntv_ctl);
+ vtimer->cnt_cval = read_sysreg_el0(cntv_cval);
+ }
/* Disable the virtual timer */
write_sysreg_el0(0, cntv_ctl);
@@ -460,23 +444,43 @@ static void set_cntvoff(u64 cntvoff)
kvm_call_hyp(__kvm_timer_set_cntvoff, low, high);
}
-static void kvm_timer_vcpu_load_vgic(struct kvm_vcpu *vcpu)
+static inline void set_vtimer_irq_phys_active(struct kvm_vcpu *vcpu, bool active)
+{
+ int r;
+ r = irq_set_irqchip_state(host_vtimer_irq, IRQCHIP_STATE_ACTIVE, active);
+ WARN_ON(r);
+}
+
+static void kvm_timer_vcpu_load_gic(struct kvm_vcpu *vcpu)
{
struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
bool phys_active;
- int ret;
- phys_active = kvm_vgic_map_is_active(vcpu, vtimer->irq.irq);
-
- ret = irq_set_irqchip_state(host_vtimer_irq,
- IRQCHIP_STATE_ACTIVE,
- phys_active);
- WARN_ON(ret);
+ if (irqchip_in_kernel(vcpu->kvm))
+ phys_active = kvm_vgic_map_is_active(vcpu, vtimer->irq.irq);
+ else
+ phys_active = vtimer->irq.level;
+ set_vtimer_irq_phys_active(vcpu, phys_active);
}
-static void kvm_timer_vcpu_load_user(struct kvm_vcpu *vcpu)
+static void kvm_timer_vcpu_load_nogic(struct kvm_vcpu *vcpu)
{
- kvm_vtimer_update_mask_user(vcpu);
+ struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
+
+ /*
+ * When using a userspace irqchip with the architected timers and a
+ * host interrupt controller that doesn't support an active state, we
+ * must still prevent continuously exiting from the guest, and
+ * therefore mask the physical interrupt by disabling it on the host
+ * interrupt controller when the virtual level is high, such that the
+ * guest can make forward progress. Once we detect the output level
+ * being de-asserted, we unmask the interrupt again so that we exit
+ * from the guest when the timer fires.
+ */
+ if (vtimer->irq.level)
+ disable_percpu_irq(host_vtimer_irq);
+ else
+ enable_percpu_irq(host_vtimer_irq, host_vtimer_irq_flags);
}
void kvm_timer_vcpu_load(struct kvm_vcpu *vcpu)
@@ -487,10 +491,10 @@ void kvm_timer_vcpu_load(struct kvm_vcpu *vcpu)
if (unlikely(!timer->enabled))
return;
- if (unlikely(!irqchip_in_kernel(vcpu->kvm)))
- kvm_timer_vcpu_load_user(vcpu);
+ if (static_branch_likely(&has_gic_active_state))
+ kvm_timer_vcpu_load_gic(vcpu);
else
- kvm_timer_vcpu_load_vgic(vcpu);
+ kvm_timer_vcpu_load_nogic(vcpu);
set_cntvoff(vtimer->cntvoff);
@@ -555,18 +559,24 @@ static void unmask_vtimer_irq_user(struct kvm_vcpu *vcpu)
{
struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
- if (unlikely(!irqchip_in_kernel(vcpu->kvm))) {
- __timer_snapshot_state(vtimer);
- if (!kvm_timer_should_fire(vtimer)) {
- kvm_timer_update_irq(vcpu, false, vtimer);
- kvm_vtimer_update_mask_user(vcpu);
- }
+ if (!kvm_timer_should_fire(vtimer)) {
+ kvm_timer_update_irq(vcpu, false, vtimer);
+ if (static_branch_likely(&has_gic_active_state))
+ set_vtimer_irq_phys_active(vcpu, false);
+ else
+ enable_percpu_irq(host_vtimer_irq, host_vtimer_irq_flags);
}
}
void kvm_timer_sync_hwstate(struct kvm_vcpu *vcpu)
{
- unmask_vtimer_irq_user(vcpu);
+ struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
+
+ if (unlikely(!timer->enabled))
+ return;
+
+ if (unlikely(!irqchip_in_kernel(vcpu->kvm)))
+ unmask_vtimer_irq_user(vcpu);
}
int kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu)
@@ -753,6 +763,8 @@ int kvm_timer_hyp_init(bool has_gic)
kvm_err("kvm_arch_timer: error setting vcpu affinity\n");
goto out_free_irq;
}
+
+ static_branch_enable(&has_gic_active_state);
}
kvm_info("virtual timer IRQ%d\n", host_vtimer_irq);
The patch below does not apply to the 4.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From c37406e05d1e541df40b8b81c4bd40753fcaf414 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Christian=20K=C3=B6nig?= <ckoenig.leichtzumerken(a)gmail.com>
Date: Mon, 26 Feb 2018 14:51:13 -0600
Subject: [PATCH] PCI: Allow release of resources that were never assigned
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
It is entirely possible that the BIOS wasn't able to assign resources to a
device. In this case don't crash in pci_release_resource() when we try to
resize the resource.
Fixes: 8bb705e3e79d ("PCI: Add pci_resize_resource() for resizing BARs")
Signed-off-by: Christian König <christian.koenig(a)amd.com>
Signed-off-by: Bjorn Helgaas <bhelgaas(a)google.com>
Reviewed-by: Andy Shevchenko <andy.shevchenko(a)gmail.com>
CC: stable(a)vger.kernel.org # v4.15+
diff --git a/drivers/pci/setup-res.c b/drivers/pci/setup-res.c
index 369d48d6c6f1..365447240d95 100644
--- a/drivers/pci/setup-res.c
+++ b/drivers/pci/setup-res.c
@@ -401,6 +401,10 @@ void pci_release_resource(struct pci_dev *dev, int resno)
struct resource *res = dev->resource + resno;
pci_info(dev, "BAR %d: releasing %pR\n", resno, res);
+
+ if (!res->parent)
+ return;
+
release_resource(res);
res->end = resource_size(res) - 1;
res->start = 0;
This is a note to let you know that I've just added the patch titled
virtio-net: disable NAPI only when enabled during XDP set
to the 4.15-stable tree which can be found at:
http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=sum…
The filename of the patch is:
virtio-net-disable-napi-only-when-enabled-during-xdp-set.patch
and it can be found in the queue-4.15 subdirectory.
If you, or anyone else, feels it should not be added to the stable tree,
please let <stable(a)vger.kernel.org> know about it.
>From foo@baz Tue Mar 6 19:02:56 PST 2018
From: Jason Wang <jasowang(a)redhat.com>
Date: Wed, 28 Feb 2018 18:20:04 +0800
Subject: virtio-net: disable NAPI only when enabled during XDP set
From: Jason Wang <jasowang(a)redhat.com>
[ Upstream commit 4e09ff5362843dff3accfa84c805c7f3a99de9cd ]
We try to disable NAPI to prevent a single XDP TX queue being used by
multiple cpus. But we don't check if device is up (NAPI is enabled),
this could result stall because of infinite wait in
napi_disable(). Fixing this by checking device state through
netif_running() before.
Fixes: 4941d472bf95b ("virtio-net: do not reset during XDP set")
Signed-off-by: Jason Wang <jasowang(a)redhat.com>
Acked-by: Michael S. Tsirkin <mst(a)redhat.com>
Signed-off-by: David S. Miller <davem(a)davemloft.net>
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
---
drivers/net/virtio_net.c | 8 +++++---
1 file changed, 5 insertions(+), 3 deletions(-)
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -2040,8 +2040,9 @@ static int virtnet_xdp_set(struct net_de
}
/* Make sure NAPI is not using any XDP TX queues for RX. */
- for (i = 0; i < vi->max_queue_pairs; i++)
- napi_disable(&vi->rq[i].napi);
+ if (netif_running(dev))
+ for (i = 0; i < vi->max_queue_pairs; i++)
+ napi_disable(&vi->rq[i].napi);
netif_set_real_num_rx_queues(dev, curr_qp + xdp_qp);
err = _virtnet_set_queues(vi, curr_qp + xdp_qp);
@@ -2060,7 +2061,8 @@ static int virtnet_xdp_set(struct net_de
}
if (old_prog)
bpf_prog_put(old_prog);
- virtnet_napi_enable(vi->rq[i].vq, &vi->rq[i].napi);
+ if (netif_running(dev))
+ virtnet_napi_enable(vi->rq[i].vq, &vi->rq[i].napi);
}
return 0;
Patches currently in stable-queue which might be from jasowang(a)redhat.com are
queue-4.15/tuntap-correctly-add-the-missing-xdp-flush.patch
queue-4.15/virtio-net-disable-napi-only-when-enabled-during-xdp-set.patch
queue-4.15/tuntap-disable-preemption-during-xdp-processing.patch
This is a note to let you know that I've just added the patch titled
udplite: fix partial checksum initialization
to the 4.15-stable tree which can be found at:
http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=sum…
The filename of the patch is:
udplite-fix-partial-checksum-initialization.patch
and it can be found in the queue-4.15 subdirectory.
If you, or anyone else, feels it should not be added to the stable tree,
please let <stable(a)vger.kernel.org> know about it.
>From foo@baz Tue Mar 6 19:02:56 PST 2018
From: Alexey Kodanev <alexey.kodanev(a)oracle.com>
Date: Thu, 15 Feb 2018 20:18:43 +0300
Subject: udplite: fix partial checksum initialization
From: Alexey Kodanev <alexey.kodanev(a)oracle.com>
[ Upstream commit 15f35d49c93f4fa9875235e7bf3e3783d2dd7a1b ]
Since UDP-Lite is always using checksum, the following path is
triggered when calculating pseudo header for it:
udp4_csum_init() or udp6_csum_init()
skb_checksum_init_zero_check()
__skb_checksum_validate_complete()
The problem can appear if skb->len is less than CHECKSUM_BREAK. In
this particular case __skb_checksum_validate_complete() also invokes
__skb_checksum_complete(skb). If UDP-Lite is using partial checksum
that covers only part of a packet, the function will return bad
checksum and the packet will be dropped.
It can be fixed if we skip skb_checksum_init_zero_check() and only
set the required pseudo header checksum for UDP-Lite with partial
checksum before udp4_csum_init()/udp6_csum_init() functions return.
Fixes: ed70fcfcee95 ("net: Call skb_checksum_init in IPv4")
Fixes: e4f45b7f40bd ("net: Call skb_checksum_init in IPv6")
Signed-off-by: Alexey Kodanev <alexey.kodanev(a)oracle.com>
Signed-off-by: David S. Miller <davem(a)davemloft.net>
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
---
include/net/udplite.h | 1 +
net/ipv4/udp.c | 5 +++++
net/ipv6/ip6_checksum.c | 5 +++++
3 files changed, 11 insertions(+)
--- a/include/net/udplite.h
+++ b/include/net/udplite.h
@@ -64,6 +64,7 @@ static inline int udplite_checksum_init(
UDP_SKB_CB(skb)->cscov = cscov;
if (skb->ip_summed == CHECKSUM_COMPLETE)
skb->ip_summed = CHECKSUM_NONE;
+ skb->csum_valid = 0;
}
return 0;
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -2031,6 +2031,11 @@ static inline int udp4_csum_init(struct
err = udplite_checksum_init(skb, uh);
if (err)
return err;
+
+ if (UDP_SKB_CB(skb)->partial_cov) {
+ skb->csum = inet_compute_pseudo(skb, proto);
+ return 0;
+ }
}
/* Note, we are only interested in != 0 or == 0, thus the
--- a/net/ipv6/ip6_checksum.c
+++ b/net/ipv6/ip6_checksum.c
@@ -73,6 +73,11 @@ int udp6_csum_init(struct sk_buff *skb,
err = udplite_checksum_init(skb, uh);
if (err)
return err;
+
+ if (UDP_SKB_CB(skb)->partial_cov) {
+ skb->csum = ip6_compute_pseudo(skb, proto);
+ return 0;
+ }
}
/* To support RFC 6936 (allow zero checksum in UDP/IPV6 for tunnels)
Patches currently in stable-queue which might be from alexey.kodanev(a)oracle.com are
queue-4.15/sctp-fix-dst-refcnt-leak-in-sctp_v6_get_dst.patch
queue-4.15/udplite-fix-partial-checksum-initialization.patch
queue-4.15/sctp-verify-size-of-a-new-chunk-in-_sctp_make_chunk.patch
This is a note to let you know that I've just added the patch titled
tuntap: disable preemption during XDP processing
to the 4.15-stable tree which can be found at:
http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=sum…
The filename of the patch is:
tuntap-disable-preemption-during-xdp-processing.patch
and it can be found in the queue-4.15 subdirectory.
If you, or anyone else, feels it should not be added to the stable tree,
please let <stable(a)vger.kernel.org> know about it.
>From foo@baz Tue Mar 6 19:02:56 PST 2018
From: Jason Wang <jasowang(a)redhat.com>
Date: Sat, 24 Feb 2018 11:32:25 +0800
Subject: tuntap: disable preemption during XDP processing
From: Jason Wang <jasowang(a)redhat.com>
[ Upstream commit 23e43f07f896f8578318cfcc9466f1e8b8ab21b6 ]
Except for tuntap, all other drivers' XDP was implemented at NAPI
poll() routine in a bh. This guarantees all XDP operation were done at
the same CPU which is required by e.g BFP_MAP_TYPE_PERCPU_ARRAY. But
for tuntap, we do it in process context and we try to protect XDP
processing by RCU reader lock. This is insufficient since
CONFIG_PREEMPT_RCU can preempt the RCU reader critical section which
breaks the assumption that all XDP were processed in the same CPU.
Fixing this by simply disabling preemption during XDP processing.
Fixes: 761876c857cb ("tap: XDP support")
Signed-off-by: Jason Wang <jasowang(a)redhat.com>
Signed-off-by: David S. Miller <davem(a)davemloft.net>
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
---
drivers/net/tun.c | 6 ++++++
1 file changed, 6 insertions(+)
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -1471,6 +1471,7 @@ static struct sk_buff *tun_build_skb(str
else
*skb_xdp = 0;
+ preempt_disable();
rcu_read_lock();
xdp_prog = rcu_dereference(tun->xdp_prog);
if (xdp_prog && !*skb_xdp) {
@@ -1494,6 +1495,7 @@ static struct sk_buff *tun_build_skb(str
if (err)
goto err_redirect;
rcu_read_unlock();
+ preempt_enable();
return NULL;
case XDP_TX:
xdp_xmit = true;
@@ -1515,6 +1517,7 @@ static struct sk_buff *tun_build_skb(str
skb = build_skb(buf, buflen);
if (!skb) {
rcu_read_unlock();
+ preempt_enable();
return ERR_PTR(-ENOMEM);
}
@@ -1527,10 +1530,12 @@ static struct sk_buff *tun_build_skb(str
skb->dev = tun->dev;
generic_xdp_tx(skb, xdp_prog);
rcu_read_unlock();
+ preempt_enable();
return NULL;
}
rcu_read_unlock();
+ preempt_enable();
return skb;
@@ -1538,6 +1543,7 @@ err_redirect:
put_page(alloc_frag->page);
err_xdp:
rcu_read_unlock();
+ preempt_enable();
this_cpu_inc(tun->pcpu_stats->rx_dropped);
return NULL;
}
Patches currently in stable-queue which might be from jasowang(a)redhat.com are
queue-4.15/tuntap-correctly-add-the-missing-xdp-flush.patch
queue-4.15/virtio-net-disable-napi-only-when-enabled-during-xdp-set.patch
queue-4.15/tuntap-disable-preemption-during-xdp-processing.patch
This is a note to let you know that I've just added the patch titled
tuntap: correctly add the missing XDP flush
to the 4.15-stable tree which can be found at:
http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=sum…
The filename of the patch is:
tuntap-correctly-add-the-missing-xdp-flush.patch
and it can be found in the queue-4.15 subdirectory.
If you, or anyone else, feels it should not be added to the stable tree,
please let <stable(a)vger.kernel.org> know about it.
>From foo@baz Tue Mar 6 19:02:56 PST 2018
From: Jason Wang <jasowang(a)redhat.com>
Date: Sat, 24 Feb 2018 11:32:26 +0800
Subject: tuntap: correctly add the missing XDP flush
From: Jason Wang <jasowang(a)redhat.com>
[ Upstream commit 1bb4f2e868a2891ab8bc668b8173d6ccb8c4ce6f ]
We don't flush batched XDP packets through xdp_do_flush_map(), this
will cause packets stall at TX queue. Consider we don't do XDP on NAPI
poll(), the only possible fix is to call xdp_do_flush_map()
immediately after xdp_do_redirect().
Note, this in fact won't try to batch packets through devmap, we could
address in the future.
Reported-by: Christoffer Dall <christoffer.dall(a)linaro.org>
Fixes: 761876c857cb ("tap: XDP support")
Signed-off-by: Jason Wang <jasowang(a)redhat.com>
Signed-off-by: David S. Miller <davem(a)davemloft.net>
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
---
drivers/net/tun.c | 1 +
1 file changed, 1 insertion(+)
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -1490,6 +1490,7 @@ static struct sk_buff *tun_build_skb(str
get_page(alloc_frag->page);
alloc_frag->offset += buflen;
err = xdp_do_redirect(tun->dev, &xdp, xdp_prog);
+ xdp_do_flush_map();
if (err)
goto err_redirect;
rcu_read_unlock();
Patches currently in stable-queue which might be from jasowang(a)redhat.com are
queue-4.15/tuntap-correctly-add-the-missing-xdp-flush.patch
queue-4.15/virtio-net-disable-napi-only-when-enabled-during-xdp-set.patch
queue-4.15/tuntap-disable-preemption-during-xdp-processing.patch
This is a note to let you know that I've just added the patch titled
tls: Use correct sk->sk_prot for IPV6
to the 4.15-stable tree which can be found at:
http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=sum…
The filename of the patch is:
tls-use-correct-sk-sk_prot-for-ipv6.patch
and it can be found in the queue-4.15 subdirectory.
If you, or anyone else, feels it should not be added to the stable tree,
please let <stable(a)vger.kernel.org> know about it.
>From foo@baz Tue Mar 6 19:02:56 PST 2018
From: Boris Pismenny <borisp(a)mellanox.com>
Date: Tue, 27 Feb 2018 14:18:39 +0200
Subject: tls: Use correct sk->sk_prot for IPV6
From: Boris Pismenny <borisp(a)mellanox.com>
[ Upstream commit c113187d38ff85dc302a1bb55864b203ebb2ba10 ]
The tls ulp overrides sk->prot with a new tls specific proto structs.
The tls specific structs were previously based on the ipv4 specific
tcp_prot sturct.
As a result, attaching the tls ulp to an ipv6 tcp socket replaced
some ipv6 callback with the ipv4 equivalents.
This patch adds ipv6 tls proto structs and uses them when
attached to ipv6 sockets.
Fixes: 3c4d7559159b ('tls: kernel TLS support')
Signed-off-by: Boris Pismenny <borisp(a)mellanox.com>
Signed-off-by: Ilya Lesokhin <ilyal(a)mellanox.com>
Signed-off-by: David S. Miller <davem(a)davemloft.net>
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
---
net/tls/tls_main.c | 52 +++++++++++++++++++++++++++++++++++++---------------
1 file changed, 37 insertions(+), 15 deletions(-)
--- a/net/tls/tls_main.c
+++ b/net/tls/tls_main.c
@@ -46,16 +46,26 @@ MODULE_DESCRIPTION("Transport Layer Secu
MODULE_LICENSE("Dual BSD/GPL");
enum {
+ TLSV4,
+ TLSV6,
+ TLS_NUM_PROTS,
+};
+
+enum {
TLS_BASE_TX,
TLS_SW_TX,
TLS_NUM_CONFIG,
};
-static struct proto tls_prots[TLS_NUM_CONFIG];
+static struct proto *saved_tcpv6_prot;
+static DEFINE_MUTEX(tcpv6_prot_mutex);
+static struct proto tls_prots[TLS_NUM_PROTS][TLS_NUM_CONFIG];
static inline void update_sk_prot(struct sock *sk, struct tls_context *ctx)
{
- sk->sk_prot = &tls_prots[ctx->tx_conf];
+ int ip_ver = sk->sk_family == AF_INET6 ? TLSV6 : TLSV4;
+
+ sk->sk_prot = &tls_prots[ip_ver][ctx->tx_conf];
}
int wait_on_pending_writer(struct sock *sk, long *timeo)
@@ -450,8 +460,21 @@ static int tls_setsockopt(struct sock *s
return do_tls_setsockopt(sk, optname, optval, optlen);
}
+static void build_protos(struct proto *prot, struct proto *base)
+{
+ prot[TLS_BASE_TX] = *base;
+ prot[TLS_BASE_TX].setsockopt = tls_setsockopt;
+ prot[TLS_BASE_TX].getsockopt = tls_getsockopt;
+ prot[TLS_BASE_TX].close = tls_sk_proto_close;
+
+ prot[TLS_SW_TX] = prot[TLS_BASE_TX];
+ prot[TLS_SW_TX].sendmsg = tls_sw_sendmsg;
+ prot[TLS_SW_TX].sendpage = tls_sw_sendpage;
+}
+
static int tls_init(struct sock *sk)
{
+ int ip_ver = sk->sk_family == AF_INET6 ? TLSV6 : TLSV4;
struct inet_connection_sock *icsk = inet_csk(sk);
struct tls_context *ctx;
int rc = 0;
@@ -476,6 +499,17 @@ static int tls_init(struct sock *sk)
ctx->getsockopt = sk->sk_prot->getsockopt;
ctx->sk_proto_close = sk->sk_prot->close;
+ /* Build IPv6 TLS whenever the address of tcpv6_prot changes */
+ if (ip_ver == TLSV6 &&
+ unlikely(sk->sk_prot != smp_load_acquire(&saved_tcpv6_prot))) {
+ mutex_lock(&tcpv6_prot_mutex);
+ if (likely(sk->sk_prot != saved_tcpv6_prot)) {
+ build_protos(tls_prots[TLSV6], sk->sk_prot);
+ smp_store_release(&saved_tcpv6_prot, sk->sk_prot);
+ }
+ mutex_unlock(&tcpv6_prot_mutex);
+ }
+
ctx->tx_conf = TLS_BASE_TX;
update_sk_prot(sk, ctx);
out:
@@ -488,21 +522,9 @@ static struct tcp_ulp_ops tcp_tls_ulp_op
.init = tls_init,
};
-static void build_protos(struct proto *prot, struct proto *base)
-{
- prot[TLS_BASE_TX] = *base;
- prot[TLS_BASE_TX].setsockopt = tls_setsockopt;
- prot[TLS_BASE_TX].getsockopt = tls_getsockopt;
- prot[TLS_BASE_TX].close = tls_sk_proto_close;
-
- prot[TLS_SW_TX] = prot[TLS_BASE_TX];
- prot[TLS_SW_TX].sendmsg = tls_sw_sendmsg;
- prot[TLS_SW_TX].sendpage = tls_sw_sendpage;
-}
-
static int __init tls_register(void)
{
- build_protos(tls_prots, &tcp_prot);
+ build_protos(tls_prots[TLSV4], &tcp_prot);
tcp_register_ulp(&tcp_tls_ulp_ops);
Patches currently in stable-queue which might be from borisp(a)mellanox.com are
queue-4.15/tls-use-correct-sk-sk_prot-for-ipv6.patch
This is a note to let you know that I've just added the patch titled
tcp_bbr: better deal with suboptimal GSO
to the 4.15-stable tree which can be found at:
http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=sum…
The filename of the patch is:
tcp_bbr-better-deal-with-suboptimal-gso.patch
and it can be found in the queue-4.15 subdirectory.
If you, or anyone else, feels it should not be added to the stable tree,
please let <stable(a)vger.kernel.org> know about it.
>From foo@baz Tue Mar 6 19:02:56 PST 2018
From: Eric Dumazet <edumazet(a)google.com>
Date: Wed, 21 Feb 2018 06:43:03 -0800
Subject: tcp_bbr: better deal with suboptimal GSO
From: Eric Dumazet <edumazet(a)google.com>
[ Upstream commit 350c9f484bde93ef229682eedd98cd5f74350f7f ]
BBR uses tcp_tso_autosize() in an attempt to probe what would be the
burst sizes and to adjust cwnd in bbr_target_cwnd() with following
gold formula :
/* Allow enough full-sized skbs in flight to utilize end systems. */
cwnd += 3 * bbr->tso_segs_goal;
But GSO can be lacking or be constrained to very small
units (ip link set dev ... gso_max_segs 2)
What we really want is to have enough packets in flight so that both
GSO and GRO are efficient.
So in the case GSO is off or downgraded, we still want to have the same
number of packets in flight as if GSO/TSO was fully operational, so
that GRO can hopefully be working efficiently.
To fix this issue, we make tcp_tso_autosize() unaware of
sk->sk_gso_max_segs
Only tcp_tso_segs() has to enforce the gso_max_segs limit.
Tested:
ethtool -K eth0 tso off gso off
tc qd replace dev eth0 root pfifo_fast
Before patch:
for f in {1..5}; do ./super_netperf 1 -H lpaa24 -- -K bbr; done
691 (ss -temoi shows cwnd is stuck around 6 )
667
651
631
517
After patch :
# for f in {1..5}; do ./super_netperf 1 -H lpaa24 -- -K bbr; done
1733 (ss -temoi shows cwnd is around 386 )
1778
1746
1781
1718
Fixes: 0f8782ea1497 ("tcp_bbr: add BBR congestion control")
Signed-off-by: Eric Dumazet <edumazet(a)google.com>
Reported-by: Oleksandr Natalenko <oleksandr(a)natalenko.name>
Acked-by: Neal Cardwell <ncardwell(a)google.com>
Acked-by: Soheil Hassas Yeganeh <soheil(a)google.com>
Signed-off-by: David S. Miller <davem(a)davemloft.net>
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
---
net/ipv4/tcp_output.c | 9 +++++----
1 file changed, 5 insertions(+), 4 deletions(-)
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1730,7 +1730,7 @@ u32 tcp_tso_autosize(const struct sock *
*/
segs = max_t(u32, bytes / mss_now, min_tso_segs);
- return min_t(u32, segs, sk->sk_gso_max_segs);
+ return segs;
}
EXPORT_SYMBOL(tcp_tso_autosize);
@@ -1742,9 +1742,10 @@ static u32 tcp_tso_segs(struct sock *sk,
const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops;
u32 tso_segs = ca_ops->tso_segs_goal ? ca_ops->tso_segs_goal(sk) : 0;
- return tso_segs ? :
- tcp_tso_autosize(sk, mss_now,
- sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs);
+ if (!tso_segs)
+ tso_segs = tcp_tso_autosize(sk, mss_now,
+ sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs);
+ return min_t(u32, tso_segs, sk->sk_gso_max_segs);
}
/* Returns the portion of skb which can be sent right away */
Patches currently in stable-queue which might be from edumazet(a)google.com are
queue-4.15/doc-change-the-min-default-value-of-tcp_wmem-tcp_rmem.patch
queue-4.15/tcp-purge-write-queue-upon-rst.patch
queue-4.15/net_sched-gen_estimator-fix-broken-estimators-based-on-percpu-stats.patch
queue-4.15/tcp_bbr-better-deal-with-suboptimal-gso.patch
This is a note to let you know that I've just added the patch titled
tcp: tracepoint: only call trace_tcp_send_reset with full socket
to the 4.15-stable tree which can be found at:
http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=sum…
The filename of the patch is:
tcp-tracepoint-only-call-trace_tcp_send_reset-with-full-socket.patch
and it can be found in the queue-4.15 subdirectory.
If you, or anyone else, feels it should not be added to the stable tree,
please let <stable(a)vger.kernel.org> know about it.
>From foo@baz Tue Mar 6 19:02:56 PST 2018
From: Song Liu <songliubraving(a)fb.com>
Date: Tue, 6 Feb 2018 20:50:23 -0800
Subject: tcp: tracepoint: only call trace_tcp_send_reset with full socket
From: Song Liu <songliubraving(a)fb.com>
[ Upstream commit 5c487bb9adddbc1d23433e09d2548759375c2b52 ]
tracepoint tcp_send_reset requires a full socket to work. However, it
may be called when in TCP_TIME_WAIT:
case TCP_TW_RST:
tcp_v6_send_reset(sk, skb);
inet_twsk_deschedule_put(inet_twsk(sk));
goto discard_it;
To avoid this problem, this patch checks the socket with sk_fullsock()
before calling trace_tcp_send_reset().
Fixes: c24b14c46bb8 ("tcp: add tracepoint trace_tcp_send_reset")
Signed-off-by: Song Liu <songliubraving(a)fb.com>
Reviewed-by: Lawrence Brakmo <brakmo(a)fb.com>
Signed-off-by: David S. Miller <davem(a)davemloft.net>
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
---
net/ipv4/tcp_ipv4.c | 3 ++-
net/ipv6/tcp_ipv6.c | 3 ++-
2 files changed, 4 insertions(+), 2 deletions(-)
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -705,7 +705,8 @@ static void tcp_v4_send_reset(const stru
*/
if (sk) {
arg.bound_dev_if = sk->sk_bound_dev_if;
- trace_tcp_send_reset(sk, skb);
+ if (sk_fullsock(sk))
+ trace_tcp_send_reset(sk, skb);
}
BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -943,7 +943,8 @@ static void tcp_v6_send_reset(const stru
if (sk) {
oif = sk->sk_bound_dev_if;
- trace_tcp_send_reset(sk, skb);
+ if (sk_fullsock(sk))
+ trace_tcp_send_reset(sk, skb);
}
tcp_v6_send_response(sk, skb, seq, ack_seq, 0, 0, 0, oif, key, 1, 0, 0);
Patches currently in stable-queue which might be from songliubraving(a)fb.com are
queue-4.15/tcp-tracepoint-only-call-trace_tcp_send_reset-with-full-socket.patch
This is a note to let you know that I've just added the patch titled
tcp: revert F-RTO middle-box workaround
to the 4.15-stable tree which can be found at:
http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=sum…
The filename of the patch is:
tcp-revert-f-rto-middle-box-workaround.patch
and it can be found in the queue-4.15 subdirectory.
If you, or anyone else, feels it should not be added to the stable tree,
please let <stable(a)vger.kernel.org> know about it.
>From foo@baz Tue Mar 6 19:02:56 PST 2018
From: Yuchung Cheng <ycheng(a)google.com>
Date: Tue, 27 Feb 2018 14:15:01 -0800
Subject: tcp: revert F-RTO middle-box workaround
From: Yuchung Cheng <ycheng(a)google.com>
[ Upstream commit d4131f09770d9b7471c9da65e6ecd2477746ac5c ]
This reverts commit cc663f4d4c97b7297fb45135ab23cfd508b35a77. While fixing
some broken middle-boxes that modifies receive window fields, it does not
address middle-boxes that strip off SACK options. The best solution is
to fully revert this patch and the root F-RTO enhancement.
Fixes: cc663f4d4c97 ("tcp: restrict F-RTO to work-around broken middle-boxes")
Reported-by: Teodor Milkov <tm(a)del.bg>
Signed-off-by: Yuchung Cheng <ycheng(a)google.com>
Signed-off-by: Neal Cardwell <ncardwell(a)google.com>
Signed-off-by: David S. Miller <davem(a)davemloft.net>
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
---
net/ipv4/tcp_input.c | 17 +++++++----------
1 file changed, 7 insertions(+), 10 deletions(-)
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 45f750e85714..50963f92a67d 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -1915,7 +1915,6 @@ void tcp_enter_loss(struct sock *sk)
struct tcp_sock *tp = tcp_sk(sk);
struct net *net = sock_net(sk);
struct sk_buff *skb;
- bool new_recovery = icsk->icsk_ca_state < TCP_CA_Recovery;
bool is_reneg; /* is receiver reneging on SACKs? */
bool mark_lost;
@@ -1974,17 +1973,15 @@ void tcp_enter_loss(struct sock *sk)
tp->high_seq = tp->snd_nxt;
tcp_ecn_queue_cwr(tp);
- /* F-RTO RFC5682 sec 3.1 step 1: retransmit SND.UNA if no previous
- * loss recovery is underway except recurring timeout(s) on
- * the same SND.UNA (sec 3.2). Disable F-RTO on path MTU probing
- *
- * In theory F-RTO can be used repeatedly during loss recovery.
- * In practice this interacts badly with broken middle-boxes that
- * falsely raise the receive window, which results in repeated
- * timeouts and stop-and-go behavior.
+ /* F-RTO RFC5682 sec 3.1 step 1 mandates to disable F-RTO
+ * if a previous recovery is underway, otherwise it may incorrectly
+ * call a timeout spurious if some previously retransmitted packets
+ * are s/acked (sec 3.2). We do not apply that retriction since
+ * retransmitted skbs are permanently tagged with TCPCB_EVER_RETRANS
+ * so FLAG_ORIG_SACK_ACKED is always correct. But we do disable F-RTO
+ * on PTMU discovery to avoid sending new data.
*/
tp->frto = net->ipv4.sysctl_tcp_frto &&
- (new_recovery || icsk->icsk_retransmits) &&
!inet_csk(sk)->icsk_mtup.probe_size;
}
--
2.14.3
Patches currently in stable-queue which might be from ycheng(a)google.com are
queue-4.15/tcp-purge-write-queue-upon-rst.patch
queue-4.15/tcp-revert-f-rto-extension-to-detect-more-spurious-timeouts.patch
queue-4.15/tcp-revert-f-rto-middle-box-workaround.patch