Juergen Gross (13): xen/events: don't use chip_data for legacy IRQs xen/events: avoid removing an event channel while handling it xen/events: add a proper barrier to 2-level uevent unmasking xen/events: fix race in evtchn_fifo_unmask() xen/events: add a new "late EOI" evtchn framework xen/blkback: use lateeoi irq binding xen/netback: use lateeoi irq binding xen/scsiback: use lateeoi irq binding xen/pciback: use lateeoi irq binding xen/events: switch user event channels to lateeoi model xen/events: use a common cpu hotplug hook for event channels xen/events: defer eoi in case of excessive number of events xen/events: block rogue events for some time
Documentation/kernel-parameters.txt | 8 + drivers/block/xen-blkback/blkback.c | 22 +- drivers/block/xen-blkback/xenbus.c | 5 +- drivers/net/xen-netback/common.h | 39 +++ drivers/net/xen-netback/interface.c | 59 +++- drivers/net/xen-netback/netback.c | 17 +- drivers/xen/events/events_2l.c | 9 +- drivers/xen/events/events_base.c | 473 ++++++++++++++++++++++++-- drivers/xen/events/events_fifo.c | 102 +++--- drivers/xen/events/events_internal.h | 20 +- drivers/xen/evtchn.c | 7 +- drivers/xen/xen-pciback/pci_stub.c | 14 +- drivers/xen/xen-pciback/pciback.h | 12 +- drivers/xen/xen-pciback/pciback_ops.c | 48 ++- drivers/xen/xen-pciback/xenbus.c | 2 +- drivers/xen/xen-scsiback.c | 23 +- include/xen/events.h | 29 +- 17 files changed, 729 insertions(+), 160 deletions(-)
Since commit c330fb1ddc0a ("XEN uses irqdesc::irq_data_common::handler_data to store a per interrupt XEN data pointer which contains XEN specific information.") Xen is using the chip_data pointer for storing IRQ specific data. When running as a HVM domain this can result in problems for legacy IRQs, as those might use chip_data for their own purposes.
Use a local array for this purpose in case of legacy IRQs, avoiding the double use.
This is upstream commit 0891fb39ba67bd7ae023ea0d367297ffff010781
Cc: stable@vger.kernel.org Fixes: c330fb1ddc0a ("XEN uses irqdesc::irq_data_common::handler_data to store a per interrupt XEN data pointer which contains XEN specific information.") Signed-off-by: Juergen Gross jgross@suse.com Tested-by: Stefan Bader stefan.bader@canonical.com Reviewed-by: Boris Ostrovsky boris.ostrovsky@oracle.com Link: https://lore.kernel.org/r/20200930091614.13660-1-jgross@suse.com --- drivers/xen/events/events_base.c | 29 +++++++++++++++++++++-------- 1 file changed, 21 insertions(+), 8 deletions(-)
diff --git a/drivers/xen/events/events_base.c b/drivers/xen/events/events_base.c index e4dd991e2888..9a126732d5d9 100644 --- a/drivers/xen/events/events_base.c +++ b/drivers/xen/events/events_base.c @@ -91,6 +91,8 @@ static bool (*pirq_needs_eoi)(unsigned irq); /* Xen will never allocate port zero for any purpose. */ #define VALID_EVTCHN(chn) ((chn) != 0)
+static struct irq_info *legacy_info_ptrs[NR_IRQS_LEGACY]; + static struct irq_chip xen_dynamic_chip; static struct irq_chip xen_percpu_chip; static struct irq_chip xen_pirq_chip; @@ -155,7 +157,18 @@ int get_evtchn_to_irq(unsigned evtchn) /* Get info for IRQ */ struct irq_info *info_for_irq(unsigned irq) { - return irq_get_chip_data(irq); + if (irq < nr_legacy_irqs()) + return legacy_info_ptrs[irq]; + else + return irq_get_chip_data(irq); +} + +static void set_info_for_irq(unsigned int irq, struct irq_info *info) +{ + if (irq < nr_legacy_irqs()) + legacy_info_ptrs[irq] = info; + else + irq_set_chip_data(irq, info); }
/* Constructors for packed IRQ information. */ @@ -384,7 +397,7 @@ static void xen_irq_init(unsigned irq) info->type = IRQT_UNBOUND; info->refcnt = -1;
- irq_set_chip_data(irq, info); + set_info_for_irq(irq, info);
list_add_tail(&info->list, &xen_irq_list_head); } @@ -433,14 +446,14 @@ static int __must_check xen_allocate_irq_gsi(unsigned gsi)
static void xen_free_irq(unsigned irq) { - struct irq_info *info = irq_get_chip_data(irq); + struct irq_info *info = info_for_irq(irq);
if (WARN_ON(!info)) return;
list_del(&info->list);
- irq_set_chip_data(irq, NULL); + set_info_for_irq(irq, NULL);
WARN_ON(info->refcnt > 0);
@@ -610,7 +623,7 @@ EXPORT_SYMBOL_GPL(xen_irq_from_gsi); static void __unbind_from_irq(unsigned int irq) { int evtchn = evtchn_from_irq(irq); - struct irq_info *info = irq_get_chip_data(irq); + struct irq_info *info = info_for_irq(irq);
if (info->refcnt > 0) { info->refcnt--; @@ -1114,7 +1127,7 @@ int bind_ipi_to_irqhandler(enum ipi_vector ipi,
void unbind_from_irqhandler(unsigned int irq, void *dev_id) { - struct irq_info *info = irq_get_chip_data(irq); + struct irq_info *info = info_for_irq(irq);
if (WARN_ON(!info)) return; @@ -1148,7 +1161,7 @@ int evtchn_make_refcounted(unsigned int evtchn) if (irq == -1) return -ENOENT;
- info = irq_get_chip_data(irq); + info = info_for_irq(irq);
if (!info) return -ENOENT; @@ -1176,7 +1189,7 @@ int evtchn_get(unsigned int evtchn) if (irq == -1) goto done;
- info = irq_get_chip_data(irq); + info = info_for_irq(irq);
if (!info) goto done;
Today it can happen that an event channel is being removed from the system while the event handling loop is active. This can lead to a race resulting in crashes or WARN() splats when trying to access the irq_info structure related to the event channel.
Fix this problem by using a rwlock taken as reader in the event handling loop and as writer when deallocating the irq_info structure.
As the observed problem was a NULL dereference in evtchn_from_irq() make this function more robust against races by testing the irq_info pointer to be not NULL before dereferencing it.
And finally make all accesses to evtchn_to_irq[row][col] atomic ones in order to avoid seeing partial updates of an array element in irq handling. Note that irq handling can be entered only for event channels which have been valid before, so any not populated row isn't a problem in this regard, as rows are only ever added and never removed.
This is XSA-331.
This is upstream commit 073d0552ead5bfc7a3a9c01de590e924f11b5dd2
Cc: stable@vger.kernel.org Reported-by: Marek Marczykowski-Górecki marmarek@invisiblethingslab.com Reported-by: Jinoh Kang luke1337@theori.io Signed-off-by: Juergen Gross jgross@suse.com Reviewed-by: Stefano Stabellini sstabellini@kernel.org Reviewed-by: Wei Liu wl@xen.org --- drivers/xen/events/events_base.c | 40 ++++++++++++++++++++++++++++---- 1 file changed, 35 insertions(+), 5 deletions(-)
diff --git a/drivers/xen/events/events_base.c b/drivers/xen/events/events_base.c index 9a126732d5d9..6308d4fdf41d 100644 --- a/drivers/xen/events/events_base.c +++ b/drivers/xen/events/events_base.c @@ -32,6 +32,7 @@ #include <linux/slab.h> #include <linux/irqnr.h> #include <linux/pci.h> +#include <linux/spinlock.h>
#ifdef CONFIG_X86 #include <asm/desc.h> @@ -70,6 +71,23 @@ const struct evtchn_ops *evtchn_ops; */ static DEFINE_MUTEX(irq_mapping_update_lock);
+/* + * Lock protecting event handling loop against removing event channels. + * Adding of event channels is no issue as the associated IRQ becomes active + * only after everything is setup (before request_[threaded_]irq() the handler + * can't be entered for an event, as the event channel will be unmasked only + * then). + */ +static DEFINE_RWLOCK(evtchn_rwlock); + +/* + * Lock hierarchy: + * + * irq_mapping_update_lock + * evtchn_rwlock + * IRQ-desc lock + */ + static LIST_HEAD(xen_irq_list_head);
/* IRQ <-> VIRQ mapping. */ @@ -104,7 +122,7 @@ static void clear_evtchn_to_irq_row(unsigned row) unsigned col;
for (col = 0; col < EVTCHN_PER_ROW; col++) - evtchn_to_irq[row][col] = -1; + WRITE_ONCE(evtchn_to_irq[row][col], -1); }
static void clear_evtchn_to_irq_all(void) @@ -141,7 +159,7 @@ static int set_evtchn_to_irq(unsigned evtchn, unsigned irq) clear_evtchn_to_irq_row(row); }
- evtchn_to_irq[row][col] = irq; + WRITE_ONCE(evtchn_to_irq[row][col], irq); return 0; }
@@ -151,7 +169,7 @@ int get_evtchn_to_irq(unsigned evtchn) return -1; if (evtchn_to_irq[EVTCHN_ROW(evtchn)] == NULL) return -1; - return evtchn_to_irq[EVTCHN_ROW(evtchn)][EVTCHN_COL(evtchn)]; + return READ_ONCE(evtchn_to_irq[EVTCHN_ROW(evtchn)][EVTCHN_COL(evtchn)]); }
/* Get info for IRQ */ @@ -260,10 +278,14 @@ static void xen_irq_info_cleanup(struct irq_info *info) */ unsigned int evtchn_from_irq(unsigned irq) { - if (unlikely(WARN(irq >= nr_irqs, "Invalid irq %d!\n", irq))) + const struct irq_info *info = NULL; + + if (likely(irq < nr_irqs)) + info = info_for_irq(irq); + if (!info) return 0;
- return info_for_irq(irq)->evtchn; + return info->evtchn; }
unsigned irq_from_evtchn(unsigned int evtchn) @@ -447,16 +469,21 @@ static int __must_check xen_allocate_irq_gsi(unsigned gsi) static void xen_free_irq(unsigned irq) { struct irq_info *info = info_for_irq(irq); + unsigned long flags;
if (WARN_ON(!info)) return;
+ write_lock_irqsave(&evtchn_rwlock, flags); + list_del(&info->list);
set_info_for_irq(irq, NULL);
WARN_ON(info->refcnt > 0);
+ write_unlock_irqrestore(&evtchn_rwlock, flags); + kfree(info);
/* Legacy IRQ descriptors are managed by the arch. */ @@ -1241,6 +1268,8 @@ static void __xen_evtchn_do_upcall(void) int cpu = get_cpu(); unsigned count;
+ read_lock(&evtchn_rwlock); + do { vcpu_info->evtchn_upcall_pending = 0;
@@ -1256,6 +1285,7 @@ static void __xen_evtchn_do_upcall(void) } while (count != 1 || vcpu_info->evtchn_upcall_pending);
out: + read_unlock(&evtchn_rwlock);
put_cpu(); }
A follow-up patch will require certain write to happen before an event channel is unmasked.
While the memory barrier is not strictly necessary for all the callers, the main one will need it. In order to avoid an extra memory barrier when using fifo event channels, mandate evtchn_unmask() to provide write ordering.
The 2-level event handling unmask operation is missing an appropriate barrier, so add it. Fifo event channels are fine in this regard due to using sync_cmpxchg().
This is part of XSA-332.
This is upstream commit 4d3fe31bd993ef504350989786858aefdb877daa
Cc: stable@vger.kernel.org Suggested-by: Julien Grall julien@xen.org Signed-off-by: Juergen Gross jgross@suse.com Reviewed-by: Julien Grall jgrall@amazon.com Reviewed-by: Wei Liu wl@xen.org --- drivers/xen/events/events_2l.c | 2 ++ 1 file changed, 2 insertions(+)
diff --git a/drivers/xen/events/events_2l.c b/drivers/xen/events/events_2l.c index 7dd46312c180..04d1b025b29d 100644 --- a/drivers/xen/events/events_2l.c +++ b/drivers/xen/events/events_2l.c @@ -90,6 +90,8 @@ static void evtchn_2l_unmask(unsigned port)
BUG_ON(!irqs_disabled());
+ smp_wmb(); /* All writes before unmask must be visible. */ + if (unlikely((cpu != cpu_from_evtchn(port)))) do_hypercall = 1; else {
Unmasking a fifo event channel can result in unmasking it twice, once directly in the kernel and once via a hypercall in case the event was pending.
Fix that by doing the local unmask only if the event is not pending.
This is part of XSA-332.
This is upstream commit f01337197419b7e8a492e83089552b77d3b5fb90
Cc: stable@vger.kernel.org Signed-off-by: Juergen Gross jgross@suse.com Reviewed-by: Jan Beulich jbeulich@suse.com --- drivers/xen/events/events_fifo.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-)
diff --git a/drivers/xen/events/events_fifo.c b/drivers/xen/events/events_fifo.c index 96a1b8da5371..ce909d830973 100644 --- a/drivers/xen/events/events_fifo.c +++ b/drivers/xen/events/events_fifo.c @@ -227,19 +227,25 @@ static bool evtchn_fifo_is_masked(unsigned port) return sync_test_bit(EVTCHN_FIFO_BIT(MASKED, word), BM(word)); } /* - * Clear MASKED, spinning if BUSY is set. + * Clear MASKED if not PENDING, spinning if BUSY is set. + * Return true if mask was cleared. */ -static void clear_masked(volatile event_word_t *word) +static bool clear_masked_cond(volatile event_word_t *word) { event_word_t new, old, w;
w = *word;
do { + if (w & (1 << EVTCHN_FIFO_PENDING)) + return false; + old = w & ~(1 << EVTCHN_FIFO_BUSY); new = old & ~(1 << EVTCHN_FIFO_MASKED); w = sync_cmpxchg(word, old, new); } while (w != old); + + return true; }
static void evtchn_fifo_unmask(unsigned port) @@ -248,8 +254,7 @@ static void evtchn_fifo_unmask(unsigned port)
BUG_ON(!irqs_disabled());
- clear_masked(word); - if (evtchn_fifo_is_pending(port)) { + if (!clear_masked_cond(word)) { struct evtchn_unmask unmask = { .port = port }; (void)HYPERVISOR_event_channel_op(EVTCHNOP_unmask, &unmask); }
In order to avoid tight event channel related IRQ loops add a new framework of "late EOI" handling: the IRQ the event channel is bound to will be masked until the event has been handled and the related driver is capable to handle another event. The driver is responsible for unmasking the event channel via the new function xen_irq_lateeoi().
This is similar to binding an event channel to a threaded IRQ, but without having to structure the driver accordingly.
In order to support a future special handling in case a rogue guest is sending lots of unsolicited events, add a flag to xen_irq_lateeoi() which can be set by the caller to indicate the event was a spurious one.
This is part of XSA-332.
This is upstream commit 54c9de89895e0a36047fcc4ae754ea5b8655fb9d
Cc: stable@vger.kernel.org Reported-by: Julien Grall julien@xen.org Signed-off-by: Juergen Gross jgross@suse.com Reviewed-by: Jan Beulich jbeulich@suse.com Reviewed-by: Stefano Stabellini sstabellini@kernel.org Reviewed-by: Wei Liu wl@xen.org --- drivers/xen/events/events_base.c | 151 +++++++++++++++++++++++++++---- include/xen/events.h | 29 +++++- 2 files changed, 159 insertions(+), 21 deletions(-)
diff --git a/drivers/xen/events/events_base.c b/drivers/xen/events/events_base.c index 6308d4fdf41d..fe01950ef78c 100644 --- a/drivers/xen/events/events_base.c +++ b/drivers/xen/events/events_base.c @@ -112,6 +112,7 @@ static bool (*pirq_needs_eoi)(unsigned irq); static struct irq_info *legacy_info_ptrs[NR_IRQS_LEGACY];
static struct irq_chip xen_dynamic_chip; +static struct irq_chip xen_lateeoi_chip; static struct irq_chip xen_percpu_chip; static struct irq_chip xen_pirq_chip; static void enable_dynirq(struct irq_data *data); @@ -404,6 +405,33 @@ void notify_remote_via_irq(int irq) } EXPORT_SYMBOL_GPL(notify_remote_via_irq);
+static void xen_irq_lateeoi_locked(struct irq_info *info) +{ + evtchn_port_t evtchn; + + evtchn = info->evtchn; + if (!VALID_EVTCHN(evtchn)) + return; + + unmask_evtchn(evtchn); +} + +void xen_irq_lateeoi(unsigned int irq, unsigned int eoi_flags) +{ + struct irq_info *info; + unsigned long flags; + + read_lock_irqsave(&evtchn_rwlock, flags); + + info = info_for_irq(irq); + + if (info) + xen_irq_lateeoi_locked(info); + + read_unlock_irqrestore(&evtchn_rwlock, flags); +} +EXPORT_SYMBOL_GPL(xen_irq_lateeoi); + static void xen_irq_init(unsigned irq) { struct irq_info *info; @@ -875,7 +903,7 @@ int xen_pirq_from_irq(unsigned irq) } EXPORT_SYMBOL_GPL(xen_pirq_from_irq);
-int bind_evtchn_to_irq(unsigned int evtchn) +static int bind_evtchn_to_irq_chip(evtchn_port_t evtchn, struct irq_chip *chip) { int irq; int ret; @@ -892,7 +920,7 @@ int bind_evtchn_to_irq(unsigned int evtchn) if (irq < 0) goto out;
- irq_set_chip_and_handler_name(irq, &xen_dynamic_chip, + irq_set_chip_and_handler_name(irq, chip, handle_edge_irq, "event");
ret = xen_irq_info_evtchn_setup(irq, evtchn); @@ -913,8 +941,19 @@ out:
return irq; } + +int bind_evtchn_to_irq(evtchn_port_t evtchn) +{ + return bind_evtchn_to_irq_chip(evtchn, &xen_dynamic_chip); +} EXPORT_SYMBOL_GPL(bind_evtchn_to_irq);
+int bind_evtchn_to_irq_lateeoi(evtchn_port_t evtchn) +{ + return bind_evtchn_to_irq_chip(evtchn, &xen_lateeoi_chip); +} +EXPORT_SYMBOL_GPL(bind_evtchn_to_irq_lateeoi); + static int bind_ipi_to_irq(unsigned int ipi, unsigned int cpu) { struct evtchn_bind_ipi bind_ipi; @@ -956,8 +995,9 @@ static int bind_ipi_to_irq(unsigned int ipi, unsigned int cpu) return irq; }
-int bind_interdomain_evtchn_to_irq(unsigned int remote_domain, - unsigned int remote_port) +static int bind_interdomain_evtchn_to_irq_chip(unsigned int remote_domain, + evtchn_port_t remote_port, + struct irq_chip *chip) { struct evtchn_bind_interdomain bind_interdomain; int err; @@ -968,10 +1008,26 @@ int bind_interdomain_evtchn_to_irq(unsigned int remote_domain, err = HYPERVISOR_event_channel_op(EVTCHNOP_bind_interdomain, &bind_interdomain);
- return err ? : bind_evtchn_to_irq(bind_interdomain.local_port); + return err ? : bind_evtchn_to_irq_chip(bind_interdomain.local_port, + chip); +} + +int bind_interdomain_evtchn_to_irq(unsigned int remote_domain, + evtchn_port_t remote_port) +{ + return bind_interdomain_evtchn_to_irq_chip(remote_domain, remote_port, + &xen_dynamic_chip); } EXPORT_SYMBOL_GPL(bind_interdomain_evtchn_to_irq);
+int bind_interdomain_evtchn_to_irq_lateeoi(unsigned int remote_domain, + evtchn_port_t remote_port) +{ + return bind_interdomain_evtchn_to_irq_chip(remote_domain, remote_port, + &xen_lateeoi_chip); +} +EXPORT_SYMBOL_GPL(bind_interdomain_evtchn_to_irq_lateeoi); + static int find_virq(unsigned int virq, unsigned int cpu) { struct evtchn_status status; @@ -1067,14 +1123,15 @@ static void unbind_from_irq(unsigned int irq) mutex_unlock(&irq_mapping_update_lock); }
-int bind_evtchn_to_irqhandler(unsigned int evtchn, - irq_handler_t handler, - unsigned long irqflags, - const char *devname, void *dev_id) +static int bind_evtchn_to_irqhandler_chip(evtchn_port_t evtchn, + irq_handler_t handler, + unsigned long irqflags, + const char *devname, void *dev_id, + struct irq_chip *chip) { int irq, retval;
- irq = bind_evtchn_to_irq(evtchn); + irq = bind_evtchn_to_irq_chip(evtchn, chip); if (irq < 0) return irq; retval = request_irq(irq, handler, irqflags, devname, dev_id); @@ -1085,18 +1142,38 @@ int bind_evtchn_to_irqhandler(unsigned int evtchn,
return irq; } + +int bind_evtchn_to_irqhandler(evtchn_port_t evtchn, + irq_handler_t handler, + unsigned long irqflags, + const char *devname, void *dev_id) +{ + return bind_evtchn_to_irqhandler_chip(evtchn, handler, irqflags, + devname, dev_id, + &xen_dynamic_chip); +} EXPORT_SYMBOL_GPL(bind_evtchn_to_irqhandler);
-int bind_interdomain_evtchn_to_irqhandler(unsigned int remote_domain, - unsigned int remote_port, - irq_handler_t handler, - unsigned long irqflags, - const char *devname, - void *dev_id) +int bind_evtchn_to_irqhandler_lateeoi(evtchn_port_t evtchn, + irq_handler_t handler, + unsigned long irqflags, + const char *devname, void *dev_id) +{ + return bind_evtchn_to_irqhandler_chip(evtchn, handler, irqflags, + devname, dev_id, + &xen_lateeoi_chip); +} +EXPORT_SYMBOL_GPL(bind_evtchn_to_irqhandler_lateeoi); + +static int bind_interdomain_evtchn_to_irqhandler_chip( + unsigned int remote_domain, evtchn_port_t remote_port, + irq_handler_t handler, unsigned long irqflags, + const char *devname, void *dev_id, struct irq_chip *chip) { int irq, retval;
- irq = bind_interdomain_evtchn_to_irq(remote_domain, remote_port); + irq = bind_interdomain_evtchn_to_irq_chip(remote_domain, remote_port, + chip); if (irq < 0) return irq;
@@ -1108,8 +1185,33 @@ int bind_interdomain_evtchn_to_irqhandler(unsigned int remote_domain,
return irq; } + +int bind_interdomain_evtchn_to_irqhandler(unsigned int remote_domain, + evtchn_port_t remote_port, + irq_handler_t handler, + unsigned long irqflags, + const char *devname, + void *dev_id) +{ + return bind_interdomain_evtchn_to_irqhandler_chip(remote_domain, + remote_port, handler, irqflags, devname, + dev_id, &xen_dynamic_chip); +} EXPORT_SYMBOL_GPL(bind_interdomain_evtchn_to_irqhandler);
+int bind_interdomain_evtchn_to_irqhandler_lateeoi(unsigned int remote_domain, + evtchn_port_t remote_port, + irq_handler_t handler, + unsigned long irqflags, + const char *devname, + void *dev_id) +{ + return bind_interdomain_evtchn_to_irqhandler_chip(remote_domain, + remote_port, handler, irqflags, devname, + dev_id, &xen_lateeoi_chip); +} +EXPORT_SYMBOL_GPL(bind_interdomain_evtchn_to_irqhandler_lateeoi); + int bind_virq_to_irqhandler(unsigned int virq, unsigned int cpu, irq_handler_t handler, unsigned long irqflags, const char *devname, void *dev_id) @@ -1642,6 +1744,21 @@ static struct irq_chip xen_dynamic_chip __read_mostly = { .irq_retrigger = retrigger_dynirq, };
+static struct irq_chip xen_lateeoi_chip __read_mostly = { + /* The chip name needs to contain "xen-dyn" for irqbalance to work. */ + .name = "xen-dyn-lateeoi", + + .irq_disable = disable_dynirq, + .irq_mask = disable_dynirq, + .irq_unmask = enable_dynirq, + + .irq_ack = mask_ack_dynirq, + .irq_mask_ack = mask_ack_dynirq, + + .irq_set_affinity = set_affinity_irq, + .irq_retrigger = retrigger_dynirq, +}; + static struct irq_chip xen_pirq_chip __read_mostly = { .name = "xen-pirq",
diff --git a/include/xen/events.h b/include/xen/events.h index 88da2abaf535..ad0c61cf399b 100644 --- a/include/xen/events.h +++ b/include/xen/events.h @@ -12,11 +12,16 @@
unsigned xen_evtchn_nr_channels(void);
-int bind_evtchn_to_irq(unsigned int evtchn); -int bind_evtchn_to_irqhandler(unsigned int evtchn, +int bind_evtchn_to_irq(evtchn_port_t evtchn); +int bind_evtchn_to_irq_lateeoi(evtchn_port_t evtchn); +int bind_evtchn_to_irqhandler(evtchn_port_t evtchn, irq_handler_t handler, unsigned long irqflags, const char *devname, void *dev_id); +int bind_evtchn_to_irqhandler_lateeoi(evtchn_port_t evtchn, + irq_handler_t handler, + unsigned long irqflags, const char *devname, + void *dev_id); int bind_virq_to_irq(unsigned int virq, unsigned int cpu, bool percpu); int bind_virq_to_irqhandler(unsigned int virq, unsigned int cpu, irq_handler_t handler, @@ -29,13 +34,21 @@ int bind_ipi_to_irqhandler(enum ipi_vector ipi, const char *devname, void *dev_id); int bind_interdomain_evtchn_to_irq(unsigned int remote_domain, - unsigned int remote_port); + evtchn_port_t remote_port); +int bind_interdomain_evtchn_to_irq_lateeoi(unsigned int remote_domain, + evtchn_port_t remote_port); int bind_interdomain_evtchn_to_irqhandler(unsigned int remote_domain, - unsigned int remote_port, + evtchn_port_t remote_port, irq_handler_t handler, unsigned long irqflags, const char *devname, void *dev_id); +int bind_interdomain_evtchn_to_irqhandler_lateeoi(unsigned int remote_domain, + evtchn_port_t remote_port, + irq_handler_t handler, + unsigned long irqflags, + const char *devname, + void *dev_id);
/* * Common unbind function for all event sources. Takes IRQ to unbind from. @@ -44,6 +57,14 @@ int bind_interdomain_evtchn_to_irqhandler(unsigned int remote_domain, */ void unbind_from_irqhandler(unsigned int irq, void *dev_id);
+/* + * Send late EOI for an IRQ bound to an event channel via one of the *_lateeoi + * functions above. + */ +void xen_irq_lateeoi(unsigned int irq, unsigned int eoi_flags); +/* Signal an event was spurious, i.e. there was no action resulting from it. */ +#define XEN_EOI_FLAG_SPURIOUS 0x00000001 + #define XEN_IRQ_PRIORITY_MAX EVTCHN_FIFO_PRIORITY_MAX #define XEN_IRQ_PRIORITY_DEFAULT EVTCHN_FIFO_PRIORITY_DEFAULT #define XEN_IRQ_PRIORITY_MIN EVTCHN_FIFO_PRIORITY_MIN
In order to reduce the chance for the system becoming unresponsive due to event storms triggered by a misbehaving blkfront use the lateeoi irq binding for blkback and unmask the event channel only after processing all pending requests.
As the thread processing requests is used to do purging work in regular intervals an EOI may be sent only after having received an event. If there was no pending I/O request flag the EOI as spurious.
This is part of XSA-332.
This is upstream commit 01263a1fabe30b4d542f34c7e2364a22587ddaf2
Cc: stable@vger.kernel.org Reported-by: Julien Grall julien@xen.org Signed-off-by: Juergen Gross jgross@suse.com Reviewed-by: Jan Beulich jbeulich@suse.com Reviewed-by: Wei Liu wl@xen.org --- drivers/block/xen-blkback/blkback.c | 22 +++++++++++++++++----- drivers/block/xen-blkback/xenbus.c | 5 ++--- 2 files changed, 19 insertions(+), 8 deletions(-)
diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c index a295ad6a1674..8dbdd156e0d3 100644 --- a/drivers/block/xen-blkback/blkback.c +++ b/drivers/block/xen-blkback/blkback.c @@ -173,7 +173,7 @@ static inline void shrink_free_pagepool(struct xen_blkif *blkif, int num)
#define vaddr(page) ((unsigned long)pfn_to_kaddr(page_to_pfn(page)))
-static int do_block_io_op(struct xen_blkif *blkif); +static int do_block_io_op(struct xen_blkif *blkif, unsigned int *eoi_flags); static int dispatch_rw_block_io(struct xen_blkif *blkif, struct blkif_request *req, struct pending_req *pending_req); @@ -594,6 +594,8 @@ int xen_blkif_schedule(void *arg) struct xen_vbd *vbd = &blkif->vbd; unsigned long timeout; int ret; + bool do_eoi; + unsigned int eoi_flags = XEN_EOI_FLAG_SPURIOUS;
while (!kthread_should_stop()) { if (try_to_freeze()) @@ -617,16 +619,23 @@ int xen_blkif_schedule(void *arg) if (timeout == 0) goto purge_gnt_list;
+ do_eoi = blkif->waiting_reqs; + blkif->waiting_reqs = 0; smp_mb(); /* clear flag *before* checking for work */
- ret = do_block_io_op(blkif); + ret = do_block_io_op(blkif, &eoi_flags); if (ret > 0) blkif->waiting_reqs = 1; if (ret == -EACCES) wait_event_interruptible(blkif->shutdown_wq, kthread_should_stop());
+ if (do_eoi && !blkif->waiting_reqs) { + xen_irq_lateeoi(blkif->irq, eoi_flags); + eoi_flags |= XEN_EOI_FLAG_SPURIOUS; + } + purge_gnt_list: if (blkif->vbd.feature_gnt_persistent && time_after(jiffies, blkif->next_lru)) { @@ -1094,7 +1103,7 @@ static void end_block_io_op(struct bio *bio) * and transmute it to the block API to hand it over to the proper block disk. */ static int -__do_block_io_op(struct xen_blkif *blkif) +__do_block_io_op(struct xen_blkif *blkif, unsigned int *eoi_flags) { union blkif_back_rings *blk_rings = &blkif->blk_rings; struct blkif_request req; @@ -1117,6 +1126,9 @@ __do_block_io_op(struct xen_blkif *blkif) if (RING_REQUEST_CONS_OVERFLOW(&blk_rings->common, rc)) break;
+ /* We've seen a request, so clear spurious eoi flag. */ + *eoi_flags &= ~XEN_EOI_FLAG_SPURIOUS; + if (kthread_should_stop()) { more_to_do = 1; break; @@ -1175,13 +1187,13 @@ done: }
static int -do_block_io_op(struct xen_blkif *blkif) +do_block_io_op(struct xen_blkif *blkif, unsigned int *eoi_flags) { union blkif_back_rings *blk_rings = &blkif->blk_rings; int more_to_do;
do { - more_to_do = __do_block_io_op(blkif); + more_to_do = __do_block_io_op(blkif, eoi_flags); if (more_to_do) break;
diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c index 923308201375..0ec257e69e95 100644 --- a/drivers/block/xen-blkback/xenbus.c +++ b/drivers/block/xen-blkback/xenbus.c @@ -200,9 +200,8 @@ static int xen_blkif_map(struct xen_blkif *blkif, grant_ref_t *gref, BUG(); }
- err = bind_interdomain_evtchn_to_irqhandler(blkif->domid, evtchn, - xen_blkif_be_int, 0, - "blkif-backend", blkif); + err = bind_interdomain_evtchn_to_irqhandler_lateeoi(blkif->domid, + evtchn, xen_blkif_be_int, 0, "blkif-backend", blkif); if (err < 0) { xenbus_unmap_ring_vfree(blkif->be->dev, blkif->blk_ring); blkif->blk_rings.common.sring = NULL;
In order to reduce the chance for the system becoming unresponsive due to event storms triggered by a misbehaving netfront use the lateeoi irq binding for netback and unmask the event channel only just before going to sleep waiting for new events.
Make sure not to issue an EOI when none is pending by introducing an eoi_pending element to struct xenvif_queue.
When no request has been consumed set the spurious flag when sending the EOI for an interrupt.
This is part of XSA-332.
This is upstream commit 23025393dbeb3b8b3b60ebfa724cdae384992e27
Cc: stable@vger.kernel.org Reported-by: Julien Grall julien@xen.org Signed-off-by: Juergen Gross jgross@suse.com Reviewed-by: Jan Beulich jbeulich@suse.com Reviewed-by: Wei Liu wl@xen.org --- drivers/net/xen-netback/common.h | 39 +++++++++++++++++++ drivers/net/xen-netback/interface.c | 59 +++++++++++++++++++++++++---- drivers/net/xen-netback/netback.c | 17 +++++++-- 3 files changed, 103 insertions(+), 12 deletions(-)
diff --git a/drivers/net/xen-netback/common.h b/drivers/net/xen-netback/common.h index 34173b5e886f..53c2fa244c64 100644 --- a/drivers/net/xen-netback/common.h +++ b/drivers/net/xen-netback/common.h @@ -137,6 +137,20 @@ struct xenvif_queue { /* Per-queue data for xenvif */ char name[QUEUE_NAME_SIZE]; /* DEVNAME-qN */ struct xenvif *vif; /* Parent VIF */
+ /* + * TX/RX common EOI handling. + * When feature-split-event-channels = 0, interrupt handler sets + * NETBK_COMMON_EOI, otherwise NETBK_RX_EOI and NETBK_TX_EOI are set + * by the RX and TX interrupt handlers. + * RX and TX handler threads will issue an EOI when either + * NETBK_COMMON_EOI or their specific bits (NETBK_RX_EOI or + * NETBK_TX_EOI) are set and they will reset those bits. + */ + atomic_t eoi_pending; +#define NETBK_RX_EOI 0x01 +#define NETBK_TX_EOI 0x02 +#define NETBK_COMMON_EOI 0x04 + /* Use NAPI for guest TX */ struct napi_struct napi; /* When feature-split-event-channels = 0, tx_irq = rx_irq. */ @@ -317,6 +331,7 @@ void xenvif_kick_thread(struct xenvif_queue *queue);
int xenvif_dealloc_kthread(void *data);
+bool xenvif_have_rx_work(struct xenvif_queue *queue, bool test_kthread); void xenvif_rx_queue_tail(struct xenvif_queue *queue, struct sk_buff *skb);
void xenvif_carrier_on(struct xenvif *vif); @@ -353,4 +368,28 @@ void xenvif_skb_zerocopy_complete(struct xenvif_queue *queue); bool xenvif_mcast_match(struct xenvif *vif, const u8 *addr); void xenvif_mcast_addr_list_free(struct xenvif *vif);
+#include <linux/atomic.h> + +static inline int xenvif_atomic_fetch_or(int i, atomic_t *v) +{ + int c, old; + + c = v->counter; + while ((old = cmpxchg(&v->counter, c, c | i)) != c) + c = old; + + return c; +} + +static inline int xenvif_atomic_fetch_andnot(int i, atomic_t *v) +{ + int c, old; + + c = v->counter; + while ((old = cmpxchg(&v->counter, c, c & ~i)) != c) + c = old; + + return c; +} + #endif /* __XEN_NETBACK__COMMON_H__ */ diff --git a/drivers/net/xen-netback/interface.c b/drivers/net/xen-netback/interface.c index 2008c6a02b8a..66260ea74d7d 100644 --- a/drivers/net/xen-netback/interface.c +++ b/drivers/net/xen-netback/interface.c @@ -76,12 +76,28 @@ int xenvif_schedulable(struct xenvif *vif) !vif->disabled; }
+static bool xenvif_handle_tx_interrupt(struct xenvif_queue *queue) +{ + bool rc; + + rc = RING_HAS_UNCONSUMED_REQUESTS(&queue->tx); + if (rc) + napi_schedule(&queue->napi); + return rc; +} + static irqreturn_t xenvif_tx_interrupt(int irq, void *dev_id) { struct xenvif_queue *queue = dev_id; + int old;
- if (RING_HAS_UNCONSUMED_REQUESTS(&queue->tx)) - napi_schedule(&queue->napi); + old = xenvif_atomic_fetch_or(NETBK_TX_EOI, &queue->eoi_pending); + WARN(old & NETBK_TX_EOI, "Interrupt while EOI pending\n"); + + if (!xenvif_handle_tx_interrupt(queue)) { + atomic_andnot(NETBK_TX_EOI, &queue->eoi_pending); + xen_irq_lateeoi(irq, XEN_EOI_FLAG_SPURIOUS); + }
return IRQ_HANDLED; } @@ -115,19 +131,46 @@ static int xenvif_poll(struct napi_struct *napi, int budget) return work_done; }
+static bool xenvif_handle_rx_interrupt(struct xenvif_queue *queue) +{ + bool rc; + + rc = xenvif_have_rx_work(queue, false); + if (rc) + xenvif_kick_thread(queue); + return rc; +} + static irqreturn_t xenvif_rx_interrupt(int irq, void *dev_id) { struct xenvif_queue *queue = dev_id; + int old;
- xenvif_kick_thread(queue); + old = xenvif_atomic_fetch_or(NETBK_RX_EOI, &queue->eoi_pending); + WARN(old & NETBK_RX_EOI, "Interrupt while EOI pending\n"); + + if (!xenvif_handle_rx_interrupt(queue)) { + atomic_andnot(NETBK_RX_EOI, &queue->eoi_pending); + xen_irq_lateeoi(irq, XEN_EOI_FLAG_SPURIOUS); + }
return IRQ_HANDLED; }
irqreturn_t xenvif_interrupt(int irq, void *dev_id) { - xenvif_tx_interrupt(irq, dev_id); - xenvif_rx_interrupt(irq, dev_id); + struct xenvif_queue *queue = dev_id; + int old; + + old = xenvif_atomic_fetch_or(NETBK_COMMON_EOI, &queue->eoi_pending); + WARN(old, "Interrupt while EOI pending\n"); + + /* Use bitwise or as we need to call both functions. */ + if ((!xenvif_handle_tx_interrupt(queue) | + !xenvif_handle_rx_interrupt(queue))) { + atomic_andnot(NETBK_COMMON_EOI, &queue->eoi_pending); + xen_irq_lateeoi(irq, XEN_EOI_FLAG_SPURIOUS); + }
return IRQ_HANDLED; } @@ -555,7 +598,7 @@ int xenvif_connect(struct xenvif_queue *queue, unsigned long tx_ring_ref,
if (tx_evtchn == rx_evtchn) { /* feature-split-event-channels == 0 */ - err = bind_interdomain_evtchn_to_irqhandler( + err = bind_interdomain_evtchn_to_irqhandler_lateeoi( queue->vif->domid, tx_evtchn, xenvif_interrupt, 0, queue->name, queue); if (err < 0) @@ -566,7 +609,7 @@ int xenvif_connect(struct xenvif_queue *queue, unsigned long tx_ring_ref, /* feature-split-event-channels == 1 */ snprintf(queue->tx_irq_name, sizeof(queue->tx_irq_name), "%s-tx", queue->name); - err = bind_interdomain_evtchn_to_irqhandler( + err = bind_interdomain_evtchn_to_irqhandler_lateeoi( queue->vif->domid, tx_evtchn, xenvif_tx_interrupt, 0, queue->tx_irq_name, queue); if (err < 0) @@ -576,7 +619,7 @@ int xenvif_connect(struct xenvif_queue *queue, unsigned long tx_ring_ref,
snprintf(queue->rx_irq_name, sizeof(queue->rx_irq_name), "%s-rx", queue->name); - err = bind_interdomain_evtchn_to_irqhandler( + err = bind_interdomain_evtchn_to_irqhandler_lateeoi( queue->vif->domid, rx_evtchn, xenvif_rx_interrupt, 0, queue->rx_irq_name, queue); if (err < 0) diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c index 65d37257e033..ee7a800c16d5 100644 --- a/drivers/net/xen-netback/netback.c +++ b/drivers/net/xen-netback/netback.c @@ -670,6 +670,10 @@ void xenvif_napi_schedule_or_enable_events(struct xenvif_queue *queue)
if (more_to_do) napi_schedule(&queue->napi); + else if (xenvif_atomic_fetch_andnot(NETBK_TX_EOI | NETBK_COMMON_EOI, + &queue->eoi_pending) & + (NETBK_TX_EOI | NETBK_COMMON_EOI)) + xen_irq_lateeoi(queue->tx_irq, 0); }
static void tx_add_credit(struct xenvif_queue *queue) @@ -2010,14 +2014,14 @@ static bool xenvif_rx_queue_ready(struct xenvif_queue *queue) return queue->stalled && prod - cons >= 1; }
-static bool xenvif_have_rx_work(struct xenvif_queue *queue) +bool xenvif_have_rx_work(struct xenvif_queue *queue, bool test_kthread) { return (!skb_queue_empty(&queue->rx_queue) && xenvif_rx_ring_slots_available(queue)) || (queue->vif->stall_timeout && (xenvif_rx_queue_stalled(queue) || xenvif_rx_queue_ready(queue))) - || kthread_should_stop() + || (test_kthread && kthread_should_stop()) || queue->vif->disabled; }
@@ -2048,15 +2052,20 @@ static void xenvif_wait_for_rx_work(struct xenvif_queue *queue) { DEFINE_WAIT(wait);
- if (xenvif_have_rx_work(queue)) + if (xenvif_have_rx_work(queue, true)) return;
for (;;) { long ret;
prepare_to_wait(&queue->wq, &wait, TASK_INTERRUPTIBLE); - if (xenvif_have_rx_work(queue)) + if (xenvif_have_rx_work(queue, true)) break; + if (xenvif_atomic_fetch_andnot(NETBK_RX_EOI | NETBK_COMMON_EOI, + &queue->eoi_pending) & + (NETBK_RX_EOI | NETBK_COMMON_EOI)) + xen_irq_lateeoi(queue->rx_irq, 0); + ret = schedule_timeout(xenvif_rx_queue_timeout(queue)); if (!ret) break;
In order to reduce the chance for the system becoming unresponsive due to event storms triggered by a misbehaving scsifront use the lateeoi irq binding for scsiback and unmask the event channel only just before leaving the event handling function.
In case of a ring protocol error don't issue an EOI in order to avoid the possibility to use that for producing an event storm. This at once will result in no further call of scsiback_irq_fn(), so the ring_error struct member can be dropped and scsiback_do_cmd_fn() can signal the protocol error via a negative return value.
This is part of XSA-332.
This is upstream commit 86991b6e7ea6c613b7692f65106076943449b6b7
Cc: stable@vger.kernel.org Reported-by: Julien Grall julien@xen.org Signed-off-by: Juergen Gross jgross@suse.com Reviewed-by: Jan Beulich jbeulich@suse.com Reviewed-by: Wei Liu wl@xen.org --- drivers/xen/xen-scsiback.c | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-)
diff --git a/drivers/xen/xen-scsiback.c b/drivers/xen/xen-scsiback.c index 51387d75c7bf..29a1b8054a4d 100644 --- a/drivers/xen/xen-scsiback.c +++ b/drivers/xen/xen-scsiback.c @@ -91,7 +91,6 @@ struct vscsibk_info { unsigned int irq;
struct vscsiif_back_ring ring; - int ring_error;
spinlock_t ring_lock; atomic_t nr_unreplied_reqs; @@ -698,7 +697,8 @@ static int prepare_pending_reqs(struct vscsibk_info *info, return 0; }
-static int scsiback_do_cmd_fn(struct vscsibk_info *info) +static int scsiback_do_cmd_fn(struct vscsibk_info *info, + unsigned int *eoi_flags) { struct vscsiif_back_ring *ring = &info->ring; struct vscsiif_request ring_req; @@ -715,11 +715,12 @@ static int scsiback_do_cmd_fn(struct vscsibk_info *info) rc = ring->rsp_prod_pvt; pr_warn("Dom%d provided bogus ring requests (%#x - %#x = %u). Halting ring processing\n", info->domid, rp, rc, rp - rc); - info->ring_error = 1; - return 0; + return -EINVAL; }
while ((rc != rp)) { + *eoi_flags &= ~XEN_EOI_FLAG_SPURIOUS; + if (RING_REQUEST_CONS_OVERFLOW(ring, rc)) break; pending_req = kmem_cache_alloc(scsiback_cachep, GFP_KERNEL); @@ -782,13 +783,16 @@ static int scsiback_do_cmd_fn(struct vscsibk_info *info) static irqreturn_t scsiback_irq_fn(int irq, void *dev_id) { struct vscsibk_info *info = dev_id; + int rc; + unsigned int eoi_flags = XEN_EOI_FLAG_SPURIOUS;
- if (info->ring_error) - return IRQ_HANDLED; - - while (scsiback_do_cmd_fn(info)) + while ((rc = scsiback_do_cmd_fn(info, &eoi_flags)) > 0) cond_resched();
+ /* In case of a ring error we keep the event channel masked. */ + if (!rc) + xen_irq_lateeoi(irq, eoi_flags); + return IRQ_HANDLED; }
@@ -809,7 +813,7 @@ static int scsiback_init_sring(struct vscsibk_info *info, grant_ref_t ring_ref, sring = (struct vscsiif_sring *)area; BACK_RING_INIT(&info->ring, sring, PAGE_SIZE);
- err = bind_interdomain_evtchn_to_irq(info->domid, evtchn); + err = bind_interdomain_evtchn_to_irq_lateeoi(info->domid, evtchn); if (err < 0) goto unmap_page;
@@ -1210,7 +1214,6 @@ static int scsiback_probe(struct xenbus_device *dev,
info->domid = dev->otherend_id; spin_lock_init(&info->ring_lock); - info->ring_error = 0; atomic_set(&info->nr_unreplied_reqs, 0); init_waitqueue_head(&info->waiting_to_free); info->dev = dev;
In order to reduce the chance for the system becoming unresponsive due to event storms triggered by a misbehaving pcifront use the lateeoi irq binding for pciback and unmask the event channel only just before leaving the event handling function.
Restructure the handling to support that scheme. Basically an event can come in for two reasons: either a normal request for a pciback action, which is handled in a worker, or in case the guest has finished an AER request which was requested by pciback.
When an AER request is issued to the guest and a normal pciback action is currently active issue an EOI early in order to be able to receive another event when the AER request has been finished by the guest.
Let the worker processing the normal requests run until no further request is pending, instead of starting a new worker ion that case. Issue the EOI only just before leaving the worker.
This scheme allows to drop calling the generic function xen_pcibk_test_and_schedule_op() after processing of any request as the handling of both request types is now separated more cleanly.
This is part of XSA-332.
This is upstream commit c2711441bc961b37bba0615dd7135857d189035f
Cc: stable@vger.kernel.org Reported-by: Julien Grall julien@xen.org Signed-off-by: Juergen Gross jgross@suse.com Reviewed-by: Jan Beulich jbeulich@suse.com Reviewed-by: Wei Liu wl@xen.org --- drivers/xen/xen-pciback/pci_stub.c | 14 ++++---- drivers/xen/xen-pciback/pciback.h | 12 +++++-- drivers/xen/xen-pciback/pciback_ops.c | 48 +++++++++++++++++++++------ drivers/xen/xen-pciback/xenbus.c | 2 +- 4 files changed, 56 insertions(+), 20 deletions(-)
diff --git a/drivers/xen/xen-pciback/pci_stub.c b/drivers/xen/xen-pciback/pci_stub.c index 47c6df53cabf..e21b82921c33 100644 --- a/drivers/xen/xen-pciback/pci_stub.c +++ b/drivers/xen/xen-pciback/pci_stub.c @@ -681,10 +681,17 @@ static pci_ers_result_t common_process(struct pcistub_device *psdev, wmb(); notify_remote_via_irq(pdev->evtchn_irq);
+ /* Enable IRQ to signal "request done". */ + xen_pcibk_lateeoi(pdev, 0); + ret = wait_event_timeout(xen_pcibk_aer_wait_queue, !(test_bit(_XEN_PCIB_active, (unsigned long *) &sh_info->flags)), 300*HZ);
+ /* Enable IRQ for pcifront request if not already active. */ + if (!test_bit(_PDEVF_op_active, &pdev->flags)) + xen_pcibk_lateeoi(pdev, 0); + if (!ret) { if (test_bit(_XEN_PCIB_active, (unsigned long *)&sh_info->flags)) { @@ -698,13 +705,6 @@ static pci_ers_result_t common_process(struct pcistub_device *psdev, } clear_bit(_PCIB_op_pending, (unsigned long *)&pdev->flags);
- if (test_bit(_XEN_PCIF_active, - (unsigned long *)&sh_info->flags)) { - dev_dbg(&psdev->dev->dev, - "schedule pci_conf service in " DRV_NAME "\n"); - xen_pcibk_test_and_schedule_op(psdev->pdev); - } - res = (pci_ers_result_t)aer_op->err; return res; } diff --git a/drivers/xen/xen-pciback/pciback.h b/drivers/xen/xen-pciback/pciback.h index 4d529f3e40df..f44a425d1a5a 100644 --- a/drivers/xen/xen-pciback/pciback.h +++ b/drivers/xen/xen-pciback/pciback.h @@ -13,6 +13,7 @@ #include <linux/spinlock.h> #include <linux/workqueue.h> #include <linux/atomic.h> +#include <xen/events.h> #include <xen/interface/io/pciif.h>
#define DRV_NAME "xen-pciback" @@ -26,6 +27,8 @@ struct pci_dev_entry { #define PDEVF_op_active (1<<(_PDEVF_op_active)) #define _PCIB_op_pending (1) #define PCIB_op_pending (1<<(_PCIB_op_pending)) +#define _EOI_pending (2) +#define EOI_pending (1<<(_EOI_pending))
struct xen_pcibk_device { void *pci_dev_data; @@ -182,12 +185,17 @@ static inline void xen_pcibk_release_devices(struct xen_pcibk_device *pdev) irqreturn_t xen_pcibk_handle_event(int irq, void *dev_id); void xen_pcibk_do_op(struct work_struct *data);
+static inline void xen_pcibk_lateeoi(struct xen_pcibk_device *pdev, + unsigned int eoi_flag) +{ + if (test_and_clear_bit(_EOI_pending, &pdev->flags)) + xen_irq_lateeoi(pdev->evtchn_irq, eoi_flag); +} + int xen_pcibk_xenbus_register(void); void xen_pcibk_xenbus_unregister(void);
extern int verbose_request; - -void xen_pcibk_test_and_schedule_op(struct xen_pcibk_device *pdev); #endif
/* Handles shared IRQs that can to device domain and control domain. */ diff --git a/drivers/xen/xen-pciback/pciback_ops.c b/drivers/xen/xen-pciback/pciback_ops.c index 49c5f0e9600a..232db7fcc523 100644 --- a/drivers/xen/xen-pciback/pciback_ops.c +++ b/drivers/xen/xen-pciback/pciback_ops.c @@ -296,26 +296,41 @@ int xen_pcibk_disable_msix(struct xen_pcibk_device *pdev, return 0; } #endif + +static inline bool xen_pcibk_test_op_pending(struct xen_pcibk_device *pdev) +{ + return test_bit(_XEN_PCIF_active, + (unsigned long *)&pdev->sh_info->flags) && + !test_and_set_bit(_PDEVF_op_active, &pdev->flags); +} + /* * Now the same evtchn is used for both pcifront conf_read_write request * as well as pcie aer front end ack. We use a new work_queue to schedule * xen_pcibk conf_read_write service for avoiding confict with aer_core * do_recovery job which also use the system default work_queue */ -void xen_pcibk_test_and_schedule_op(struct xen_pcibk_device *pdev) +static void xen_pcibk_test_and_schedule_op(struct xen_pcibk_device *pdev) { + bool eoi = true; + /* Check that frontend is requesting an operation and that we are not * already processing a request */ - if (test_bit(_XEN_PCIF_active, (unsigned long *)&pdev->sh_info->flags) - && !test_and_set_bit(_PDEVF_op_active, &pdev->flags)) { + if (xen_pcibk_test_op_pending(pdev)) { queue_work(xen_pcibk_wq, &pdev->op_work); + eoi = false; } /*_XEN_PCIB_active should have been cleared by pcifront. And also make sure xen_pcibk is waiting for ack by checking _PCIB_op_pending*/ if (!test_bit(_XEN_PCIB_active, (unsigned long *)&pdev->sh_info->flags) && test_bit(_PCIB_op_pending, &pdev->flags)) { wake_up(&xen_pcibk_aer_wait_queue); + eoi = false; } + + /* EOI if there was nothing to do. */ + if (eoi) + xen_pcibk_lateeoi(pdev, XEN_EOI_FLAG_SPURIOUS); }
/* Performing the configuration space reads/writes must not be done in atomic @@ -323,10 +338,8 @@ void xen_pcibk_test_and_schedule_op(struct xen_pcibk_device *pdev) * use of semaphores). This function is intended to be called from a work * queue in process context taking a struct xen_pcibk_device as a parameter */
-void xen_pcibk_do_op(struct work_struct *data) +static void xen_pcibk_do_one_op(struct xen_pcibk_device *pdev) { - struct xen_pcibk_device *pdev = - container_of(data, struct xen_pcibk_device, op_work); struct pci_dev *dev; struct xen_pcibk_dev_data *dev_data = NULL; struct xen_pci_op *op = &pdev->op; @@ -399,16 +412,31 @@ void xen_pcibk_do_op(struct work_struct *data) smp_mb__before_atomic(); /* /after/ clearing PCIF_active */ clear_bit(_PDEVF_op_active, &pdev->flags); smp_mb__after_atomic(); /* /before/ final check for work */ +}
- /* Check to see if the driver domain tried to start another request in - * between clearing _XEN_PCIF_active and clearing _PDEVF_op_active. - */ - xen_pcibk_test_and_schedule_op(pdev); +void xen_pcibk_do_op(struct work_struct *data) +{ + struct xen_pcibk_device *pdev = + container_of(data, struct xen_pcibk_device, op_work); + + do { + xen_pcibk_do_one_op(pdev); + } while (xen_pcibk_test_op_pending(pdev)); + + xen_pcibk_lateeoi(pdev, 0); }
irqreturn_t xen_pcibk_handle_event(int irq, void *dev_id) { struct xen_pcibk_device *pdev = dev_id; + bool eoi; + + /* IRQs might come in before pdev->evtchn_irq is written. */ + if (unlikely(pdev->evtchn_irq != irq)) + pdev->evtchn_irq = irq; + + eoi = test_and_set_bit(_EOI_pending, &pdev->flags); + WARN(eoi, "IRQ while EOI pending\n");
xen_pcibk_test_and_schedule_op(pdev);
diff --git a/drivers/xen/xen-pciback/xenbus.c b/drivers/xen/xen-pciback/xenbus.c index 4843741e703a..48196347f2f9 100644 --- a/drivers/xen/xen-pciback/xenbus.c +++ b/drivers/xen/xen-pciback/xenbus.c @@ -124,7 +124,7 @@ static int xen_pcibk_do_attach(struct xen_pcibk_device *pdev, int gnt_ref,
pdev->sh_info = vaddr;
- err = bind_interdomain_evtchn_to_irqhandler( + err = bind_interdomain_evtchn_to_irqhandler_lateeoi( pdev->xdev->otherend_id, remote_evtchn, xen_pcibk_handle_event, 0, DRV_NAME, pdev); if (err < 0) {
Instead of disabling the irq when an event is received and enabling it again when handled by the user process use the lateeoi model.
This is part of XSA-332.
This is upstream commit c44b849cee8c3ac587da3b0980e01f77500d158c
Cc: stable@vger.kernel.org Reported-by: Julien Grall julien@xen.org Signed-off-by: Juergen Gross jgross@suse.com Tested-by: Stefano Stabellini sstabellini@kernel.org Reviewed-by: Stefano Stabellini sstabellini@kernel.org Reviewed-by: Jan Beulich jbeulich@suse.com Reviewed-by: Wei Liu wl@xen.org --- drivers/xen/evtchn.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-)
diff --git a/drivers/xen/evtchn.c b/drivers/xen/evtchn.c index f4edd6df3df2..96c3007576b6 100644 --- a/drivers/xen/evtchn.c +++ b/drivers/xen/evtchn.c @@ -173,7 +173,6 @@ static irqreturn_t evtchn_interrupt(int irq, void *data) "Interrupt for port %d, but apparently not enabled; per-user %p\n", evtchn->port, u);
- disable_irq_nosync(irq); evtchn->enabled = false;
spin_lock(&u->ring_prod_lock); @@ -299,7 +298,7 @@ static ssize_t evtchn_write(struct file *file, const char __user *buf, evtchn = find_evtchn(u, port); if (evtchn && !evtchn->enabled) { evtchn->enabled = true; - enable_irq(irq_from_evtchn(port)); + xen_irq_lateeoi(irq_from_evtchn(port), 0); } }
@@ -399,8 +398,8 @@ static int evtchn_bind_to_user(struct per_user_data *u, int port) if (rc < 0) goto err;
- rc = bind_evtchn_to_irqhandler(port, evtchn_interrupt, 0, - u->name, evtchn); + rc = bind_evtchn_to_irqhandler_lateeoi(port, evtchn_interrupt, 0, + u->name, evtchn); if (rc < 0) goto err;
Today only fifo event channels have a cpu hotplug callback. In order to prepare for more percpu (de)init work move that callback into events_base.c and add percpu_init() and percpu_deinit() hooks to struct evtchn_ops.
This is part of XSA-332.
This is upstream commit 7beb290caa2adb0a399e735a1e175db9aae0523a
Cc: stable@vger.kernel.org Signed-off-by: Juergen Gross jgross@suse.com Reviewed-by: Jan Beulich jbeulich@suse.com Reviewed-by: Wei Liu wl@xen.org --- drivers/xen/events/events_base.c | 47 +++++++++++++++++++++ drivers/xen/events/events_fifo.c | 61 ++++++++++++---------------- drivers/xen/events/events_internal.h | 3 ++ 3 files changed, 75 insertions(+), 36 deletions(-)
diff --git a/drivers/xen/events/events_base.c b/drivers/xen/events/events_base.c index fe01950ef78c..9e7cbc3369f7 100644 --- a/drivers/xen/events/events_base.c +++ b/drivers/xen/events/events_base.c @@ -33,6 +33,7 @@ #include <linux/irqnr.h> #include <linux/pci.h> #include <linux/spinlock.h> +#include <linux/cpu.h>
#ifdef CONFIG_X86 #include <asm/desc.h> @@ -1832,6 +1833,50 @@ void xen_callback_vector(void) {} static bool fifo_events = true; module_param(fifo_events, bool, 0);
+static int xen_evtchn_cpu_prepare(unsigned int cpu) +{ + int ret = 0; + + if (evtchn_ops->percpu_init) + ret = evtchn_ops->percpu_init(cpu); + + return ret; +} + +static int xen_evtchn_cpu_dead(unsigned int cpu) +{ + int ret = 0; + + if (evtchn_ops->percpu_deinit) + ret = evtchn_ops->percpu_deinit(cpu); + + return ret; +} + +static int evtchn_cpu_notification(struct notifier_block *self, + unsigned long action, void *hcpu) +{ + int cpu = (long)hcpu; + int ret = 0; + + switch (action) { + case CPU_UP_PREPARE: + ret = xen_evtchn_cpu_prepare(cpu); + break; + case CPU_DEAD: + ret = xen_evtchn_cpu_dead(cpu); + break; + default: + break; + } + + return ret < 0 ? NOTIFY_BAD : NOTIFY_OK; +} + +static struct notifier_block evtchn_cpu_notifier = { + .notifier_call = evtchn_cpu_notification, +}; + void __init xen_init_IRQ(void) { int ret = -EINVAL; @@ -1841,6 +1886,8 @@ void __init xen_init_IRQ(void) if (ret < 0) xen_evtchn_2l_init();
+ register_cpu_notifier(&evtchn_cpu_notifier); + evtchn_to_irq = kcalloc(EVTCHN_ROW(xen_evtchn_max_channels()), sizeof(*evtchn_to_irq), GFP_KERNEL); BUG_ON(!evtchn_to_irq); diff --git a/drivers/xen/events/events_fifo.c b/drivers/xen/events/events_fifo.c index ce909d830973..db809ee3197b 100644 --- a/drivers/xen/events/events_fifo.c +++ b/drivers/xen/events/events_fifo.c @@ -386,21 +386,6 @@ static void evtchn_fifo_resume(void) event_array_pages = 0; }
-static const struct evtchn_ops evtchn_ops_fifo = { - .max_channels = evtchn_fifo_max_channels, - .nr_channels = evtchn_fifo_nr_channels, - .setup = evtchn_fifo_setup, - .bind_to_cpu = evtchn_fifo_bind_to_cpu, - .clear_pending = evtchn_fifo_clear_pending, - .set_pending = evtchn_fifo_set_pending, - .is_pending = evtchn_fifo_is_pending, - .test_and_set_mask = evtchn_fifo_test_and_set_mask, - .mask = evtchn_fifo_mask, - .unmask = evtchn_fifo_unmask, - .handle_events = evtchn_fifo_handle_events, - .resume = evtchn_fifo_resume, -}; - static int evtchn_fifo_alloc_control_block(unsigned cpu) { void *control_block = NULL; @@ -423,29 +408,34 @@ static int evtchn_fifo_alloc_control_block(unsigned cpu) return ret; }
-static int evtchn_fifo_cpu_notification(struct notifier_block *self, - unsigned long action, - void *hcpu) +static int evtchn_fifo_percpu_init(unsigned int cpu) { - int cpu = (long)hcpu; - int ret = 0; - - switch (action) { - case CPU_UP_PREPARE: - if (!per_cpu(cpu_control_block, cpu)) - ret = evtchn_fifo_alloc_control_block(cpu); - break; - case CPU_DEAD: - __evtchn_fifo_handle_events(cpu, true); - break; - default: - break; - } - return ret < 0 ? NOTIFY_BAD : NOTIFY_OK; + if (!per_cpu(cpu_control_block, cpu)) + return evtchn_fifo_alloc_control_block(cpu); + return 0; }
-static struct notifier_block evtchn_fifo_cpu_notifier = { - .notifier_call = evtchn_fifo_cpu_notification, +static int evtchn_fifo_percpu_deinit(unsigned int cpu) +{ + __evtchn_fifo_handle_events(cpu, true); + return 0; +} + +static const struct evtchn_ops evtchn_ops_fifo = { + .max_channels = evtchn_fifo_max_channels, + .nr_channels = evtchn_fifo_nr_channels, + .setup = evtchn_fifo_setup, + .bind_to_cpu = evtchn_fifo_bind_to_cpu, + .clear_pending = evtchn_fifo_clear_pending, + .set_pending = evtchn_fifo_set_pending, + .is_pending = evtchn_fifo_is_pending, + .test_and_set_mask = evtchn_fifo_test_and_set_mask, + .mask = evtchn_fifo_mask, + .unmask = evtchn_fifo_unmask, + .handle_events = evtchn_fifo_handle_events, + .resume = evtchn_fifo_resume, + .percpu_init = evtchn_fifo_percpu_init, + .percpu_deinit = evtchn_fifo_percpu_deinit, };
int __init xen_evtchn_fifo_init(void) @@ -461,7 +451,6 @@ int __init xen_evtchn_fifo_init(void)
evtchn_ops = &evtchn_ops_fifo;
- register_cpu_notifier(&evtchn_fifo_cpu_notifier); out: put_cpu(); return ret; diff --git a/drivers/xen/events/events_internal.h b/drivers/xen/events/events_internal.h index 50c2050a1e32..980a56d51e4c 100644 --- a/drivers/xen/events/events_internal.h +++ b/drivers/xen/events/events_internal.h @@ -71,6 +71,9 @@ struct evtchn_ops {
void (*handle_events)(unsigned cpu); void (*resume)(void); + + int (*percpu_init)(unsigned int cpu); + int (*percpu_deinit)(unsigned int cpu); };
extern const struct evtchn_ops *evtchn_ops;
In case rogue guests are sending events at high frequency it might happen that xen_evtchn_do_upcall() won't stop processing events in dom0. As this is done in irq handling a crash might be the result.
In order to avoid that, delay further inter-domain events after some time in xen_evtchn_do_upcall() by forcing eoi processing into a worker on the same cpu, thus inhibiting new events coming in.
The time after which eoi processing is to be delayed is configurable via a new module parameter "event_loop_timeout" which specifies the maximum event loop time in jiffies (default: 2, the value was chosen after some tests showing that a value of 2 was the lowest with an only slight drop of dom0 network throughput while multiple guests performed an event storm).
How long eoi processing will be delayed can be specified via another parameter "event_eoi_delay" (again in jiffies, default 10, again the value was chosen after testing with different delay values).
This is part of XSA-332.
This is upstream commit e99502f76271d6bc4e374fe368c50c67a1fd3070
Cc: stable@vger.kernel.org Reported-by: Julien Grall julien@xen.org Signed-off-by: Juergen Gross jgross@suse.com Reviewed-by: Stefano Stabellini sstabellini@kernel.org Reviewed-by: Wei Liu wl@xen.org --- Documentation/kernel-parameters.txt | 8 ++ drivers/xen/events/events_2l.c | 7 +- drivers/xen/events/events_base.c | 189 ++++++++++++++++++++++++++- drivers/xen/events/events_fifo.c | 30 ++--- drivers/xen/events/events_internal.h | 14 +- 5 files changed, 216 insertions(+), 32 deletions(-)
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index b19d872feb56..17747dcd0e77 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -4488,6 +4488,14 @@ bytes respectively. Such letter suffixes can also be entirely omitted. Disables the PV optimizations forcing the HVM guest to run as generic HVM guest with no PV drivers.
+ xen.event_eoi_delay= [XEN] + How long to delay EOI handling in case of event + storms (jiffies). Default is 10. + + xen.event_loop_timeout= [XEN] + After which time (jiffies) the event handling loop + should start to delay EOI handling. Default is 2. + xirc2ps_cs= [NET,PCMCIA] Format: <irq>,<irq_mask>,<io>,<full_duplex>,<do_sound>,<lockup_hack>[,<irq2>[,<irq3>[,<irq4>]]] diff --git a/drivers/xen/events/events_2l.c b/drivers/xen/events/events_2l.c index 04d1b025b29d..e902512fcfb5 100644 --- a/drivers/xen/events/events_2l.c +++ b/drivers/xen/events/events_2l.c @@ -160,7 +160,7 @@ static inline xen_ulong_t active_evtchns(unsigned int cpu, * a bitset of words which contain pending event bits. The second * level is a bitset of pending events themselves. */ -static void evtchn_2l_handle_events(unsigned cpu) +static void evtchn_2l_handle_events(unsigned cpu, struct evtchn_loop_ctrl *ctrl) { int irq; xen_ulong_t pending_words; @@ -241,10 +241,7 @@ static void evtchn_2l_handle_events(unsigned cpu)
/* Process port. */ port = (word_idx * BITS_PER_EVTCHN_WORD) + bit_idx; - irq = get_evtchn_to_irq(port); - - if (irq != -1) - generic_handle_irq(irq); + handle_irq_for_port(port, ctrl);
bit_idx = (bit_idx + 1) % BITS_PER_EVTCHN_WORD;
diff --git a/drivers/xen/events/events_base.c b/drivers/xen/events/events_base.c index 9e7cbc3369f7..07cef5fb4515 100644 --- a/drivers/xen/events/events_base.c +++ b/drivers/xen/events/events_base.c @@ -34,6 +34,8 @@ #include <linux/pci.h> #include <linux/spinlock.h> #include <linux/cpu.h> +#include <linux/atomic.h> +#include <linux/ktime.h>
#ifdef CONFIG_X86 #include <asm/desc.h> @@ -64,6 +66,15 @@
#include "events_internal.h"
+#undef MODULE_PARAM_PREFIX +#define MODULE_PARAM_PREFIX "xen." + +static uint __read_mostly event_loop_timeout = 2; +module_param(event_loop_timeout, uint, 0644); + +static uint __read_mostly event_eoi_delay = 10; +module_param(event_eoi_delay, uint, 0644); + const struct evtchn_ops *evtchn_ops;
/* @@ -87,6 +98,7 @@ static DEFINE_RWLOCK(evtchn_rwlock); * irq_mapping_update_lock * evtchn_rwlock * IRQ-desc lock + * percpu eoi_list_lock */
static LIST_HEAD(xen_irq_list_head); @@ -119,6 +131,8 @@ static struct irq_chip xen_pirq_chip; static void enable_dynirq(struct irq_data *data); static void disable_dynirq(struct irq_data *data);
+static DEFINE_PER_CPU(unsigned int, irq_epoch); + static void clear_evtchn_to_irq_row(unsigned row) { unsigned col; @@ -406,17 +420,120 @@ void notify_remote_via_irq(int irq) } EXPORT_SYMBOL_GPL(notify_remote_via_irq);
+struct lateeoi_work { + struct delayed_work delayed; + spinlock_t eoi_list_lock; + struct list_head eoi_list; +}; + +static DEFINE_PER_CPU(struct lateeoi_work, lateeoi); + +static void lateeoi_list_del(struct irq_info *info) +{ + struct lateeoi_work *eoi = &per_cpu(lateeoi, info->eoi_cpu); + unsigned long flags; + + spin_lock_irqsave(&eoi->eoi_list_lock, flags); + list_del_init(&info->eoi_list); + spin_unlock_irqrestore(&eoi->eoi_list_lock, flags); +} + +static void lateeoi_list_add(struct irq_info *info) +{ + struct lateeoi_work *eoi = &per_cpu(lateeoi, info->eoi_cpu); + struct irq_info *elem; + u64 now = get_jiffies_64(); + unsigned long delay; + unsigned long flags; + + if (now < info->eoi_time) + delay = info->eoi_time - now; + else + delay = 1; + + spin_lock_irqsave(&eoi->eoi_list_lock, flags); + + if (list_empty(&eoi->eoi_list)) { + list_add(&info->eoi_list, &eoi->eoi_list); + mod_delayed_work_on(info->eoi_cpu, system_wq, + &eoi->delayed, delay); + } else { + list_for_each_entry_reverse(elem, &eoi->eoi_list, eoi_list) { + if (elem->eoi_time <= info->eoi_time) + break; + } + list_add(&info->eoi_list, &elem->eoi_list); + } + + spin_unlock_irqrestore(&eoi->eoi_list_lock, flags); +} + static void xen_irq_lateeoi_locked(struct irq_info *info) { evtchn_port_t evtchn; + unsigned int cpu;
evtchn = info->evtchn; - if (!VALID_EVTCHN(evtchn)) + if (!VALID_EVTCHN(evtchn) || !list_empty(&info->eoi_list)) return;
+ cpu = info->eoi_cpu; + if (info->eoi_time && info->irq_epoch == per_cpu(irq_epoch, cpu)) { + lateeoi_list_add(info); + return; + } + + info->eoi_time = 0; unmask_evtchn(evtchn); }
+static void xen_irq_lateeoi_worker(struct work_struct *work) +{ + struct lateeoi_work *eoi; + struct irq_info *info; + u64 now = get_jiffies_64(); + unsigned long flags; + + eoi = container_of(to_delayed_work(work), struct lateeoi_work, delayed); + + read_lock_irqsave(&evtchn_rwlock, flags); + + while (true) { + spin_lock(&eoi->eoi_list_lock); + + info = list_first_entry_or_null(&eoi->eoi_list, struct irq_info, + eoi_list); + + if (info == NULL || now < info->eoi_time) { + spin_unlock(&eoi->eoi_list_lock); + break; + } + + list_del_init(&info->eoi_list); + + spin_unlock(&eoi->eoi_list_lock); + + info->eoi_time = 0; + + xen_irq_lateeoi_locked(info); + } + + if (info) + mod_delayed_work_on(info->eoi_cpu, system_wq, + &eoi->delayed, info->eoi_time - now); + + read_unlock_irqrestore(&evtchn_rwlock, flags); +} + +static void xen_cpu_init_eoi(unsigned int cpu) +{ + struct lateeoi_work *eoi = &per_cpu(lateeoi, cpu); + + INIT_DELAYED_WORK(&eoi->delayed, xen_irq_lateeoi_worker); + spin_lock_init(&eoi->eoi_list_lock); + INIT_LIST_HEAD(&eoi->eoi_list); +} + void xen_irq_lateeoi(unsigned int irq, unsigned int eoi_flags) { struct irq_info *info; @@ -436,6 +553,7 @@ EXPORT_SYMBOL_GPL(xen_irq_lateeoi); static void xen_irq_init(unsigned irq) { struct irq_info *info; + #ifdef CONFIG_SMP /* By default all event channels notify CPU#0. */ cpumask_copy(irq_get_affinity_mask(irq), cpumask_of(0)); @@ -450,6 +568,7 @@ static void xen_irq_init(unsigned irq)
set_info_for_irq(irq, info);
+ INIT_LIST_HEAD(&info->eoi_list); list_add_tail(&info->list, &xen_irq_list_head); }
@@ -505,6 +624,9 @@ static void xen_free_irq(unsigned irq)
write_lock_irqsave(&evtchn_rwlock, flags);
+ if (!list_empty(&info->eoi_list)) + lateeoi_list_del(info); + list_del(&info->list);
set_info_for_irq(irq, NULL); @@ -1363,6 +1485,54 @@ void xen_send_IPI_one(unsigned int cpu, enum ipi_vector vector) notify_remote_via_irq(irq); }
+struct evtchn_loop_ctrl { + ktime_t timeout; + unsigned count; + bool defer_eoi; +}; + +void handle_irq_for_port(evtchn_port_t port, struct evtchn_loop_ctrl *ctrl) +{ + int irq; + struct irq_info *info; + + irq = get_evtchn_to_irq(port); + if (irq == -1) + return; + + /* + * Check for timeout every 256 events. + * We are setting the timeout value only after the first 256 + * events in order to not hurt the common case of few loop + * iterations. The 256 is basically an arbitrary value. + * + * In case we are hitting the timeout we need to defer all further + * EOIs in order to ensure to leave the event handling loop rather + * sooner than later. + */ + if (!ctrl->defer_eoi && !(++ctrl->count & 0xff)) { + ktime_t kt = ktime_get(); + + if (!ctrl->timeout.tv64) { + kt = ktime_add_ms(kt, + jiffies_to_msecs(event_loop_timeout)); + ctrl->timeout = kt; + } else if (kt.tv64 > ctrl->timeout.tv64) { + ctrl->defer_eoi = true; + } + } + + info = info_for_irq(irq); + + if (ctrl->defer_eoi) { + info->eoi_cpu = smp_processor_id(); + info->irq_epoch = __this_cpu_read(irq_epoch); + info->eoi_time = get_jiffies_64() + event_eoi_delay; + } + + generic_handle_irq(irq); +} + static DEFINE_PER_CPU(unsigned, xed_nesting_count);
static void __xen_evtchn_do_upcall(void) @@ -1370,6 +1540,7 @@ static void __xen_evtchn_do_upcall(void) struct vcpu_info *vcpu_info = __this_cpu_read(xen_vcpu); int cpu = get_cpu(); unsigned count; + struct evtchn_loop_ctrl ctrl = { 0 };
read_lock(&evtchn_rwlock);
@@ -1379,7 +1550,7 @@ static void __xen_evtchn_do_upcall(void) if (__this_cpu_inc_return(xed_nesting_count) - 1) goto out;
- xen_evtchn_handle_events(cpu); + xen_evtchn_handle_events(cpu, &ctrl);
BUG_ON(!irqs_disabled());
@@ -1390,6 +1561,13 @@ static void __xen_evtchn_do_upcall(void) out: read_unlock(&evtchn_rwlock);
+ /* + * Increment irq_epoch only now to defer EOIs only for + * xen_irq_lateeoi() invocations occurring from inside the loop + * above. + */ + __this_cpu_inc(irq_epoch); + put_cpu(); }
@@ -1827,9 +2005,6 @@ void xen_callback_vector(void) void xen_callback_vector(void) {} #endif
-#undef MODULE_PARAM_PREFIX -#define MODULE_PARAM_PREFIX "xen." - static bool fifo_events = true; module_param(fifo_events, bool, 0);
@@ -1837,6 +2012,8 @@ static int xen_evtchn_cpu_prepare(unsigned int cpu) { int ret = 0;
+ xen_cpu_init_eoi(cpu); + if (evtchn_ops->percpu_init) ret = evtchn_ops->percpu_init(cpu);
@@ -1886,6 +2063,8 @@ void __init xen_init_IRQ(void) if (ret < 0) xen_evtchn_2l_init();
+ xen_cpu_init_eoi(smp_processor_id()); + register_cpu_notifier(&evtchn_cpu_notifier);
evtchn_to_irq = kcalloc(EVTCHN_ROW(xen_evtchn_max_channels()), diff --git a/drivers/xen/events/events_fifo.c b/drivers/xen/events/events_fifo.c index db809ee3197b..7addca0d8d26 100644 --- a/drivers/xen/events/events_fifo.c +++ b/drivers/xen/events/events_fifo.c @@ -275,19 +275,9 @@ static uint32_t clear_linked(volatile event_word_t *word) return w & EVTCHN_FIFO_LINK_MASK; }
-static void handle_irq_for_port(unsigned port) -{ - int irq; - - irq = get_evtchn_to_irq(port); - if (irq != -1) - generic_handle_irq(irq); -} - -static void consume_one_event(unsigned cpu, +static void consume_one_event(unsigned cpu, struct evtchn_loop_ctrl *ctrl, struct evtchn_fifo_control_block *control_block, - unsigned priority, unsigned long *ready, - bool drop) + unsigned priority, unsigned long *ready) { struct evtchn_fifo_queue *q = &per_cpu(cpu_queue, cpu); uint32_t head; @@ -320,16 +310,17 @@ static void consume_one_event(unsigned cpu, clear_bit(priority, ready);
if (evtchn_fifo_is_pending(port) && !evtchn_fifo_is_masked(port)) { - if (unlikely(drop)) + if (unlikely(!ctrl)) pr_warn("Dropping pending event for port %u\n", port); else - handle_irq_for_port(port); + handle_irq_for_port(port, ctrl); }
q->head[priority] = head; }
-static void __evtchn_fifo_handle_events(unsigned cpu, bool drop) +static void __evtchn_fifo_handle_events(unsigned cpu, + struct evtchn_loop_ctrl *ctrl) { struct evtchn_fifo_control_block *control_block; unsigned long ready; @@ -341,14 +332,15 @@ static void __evtchn_fifo_handle_events(unsigned cpu, bool drop)
while (ready) { q = find_first_bit(&ready, EVTCHN_FIFO_MAX_QUEUES); - consume_one_event(cpu, control_block, q, &ready, drop); + consume_one_event(cpu, ctrl, control_block, q, &ready); ready |= xchg(&control_block->ready, 0); } }
-static void evtchn_fifo_handle_events(unsigned cpu) +static void evtchn_fifo_handle_events(unsigned cpu, + struct evtchn_loop_ctrl *ctrl) { - __evtchn_fifo_handle_events(cpu, false); + __evtchn_fifo_handle_events(cpu, ctrl); }
static void evtchn_fifo_resume(void) @@ -417,7 +409,7 @@ static int evtchn_fifo_percpu_init(unsigned int cpu)
static int evtchn_fifo_percpu_deinit(unsigned int cpu) { - __evtchn_fifo_handle_events(cpu, true); + __evtchn_fifo_handle_events(cpu, NULL); return 0; }
diff --git a/drivers/xen/events/events_internal.h b/drivers/xen/events/events_internal.h index 980a56d51e4c..2cb9c2d2c5c0 100644 --- a/drivers/xen/events/events_internal.h +++ b/drivers/xen/events/events_internal.h @@ -32,11 +32,15 @@ enum xen_irq_type { */ struct irq_info { struct list_head list; + struct list_head eoi_list; int refcnt; enum xen_irq_type type; /* type */ unsigned irq; unsigned int evtchn; /* event channel */ unsigned short cpu; /* cpu bound */ + unsigned short eoi_cpu; /* EOI must happen on this cpu */ + unsigned int irq_epoch; /* If eoi_cpu valid: irq_epoch of event */ + u64 eoi_time; /* Time in jiffies when to EOI. */
union { unsigned short virq; @@ -55,6 +59,8 @@ struct irq_info { #define PIRQ_SHAREABLE (1 << 1) #define PIRQ_MSI_GROUP (1 << 2)
+struct evtchn_loop_ctrl; + struct evtchn_ops { unsigned (*max_channels)(void); unsigned (*nr_channels)(void); @@ -69,7 +75,7 @@ struct evtchn_ops { void (*mask)(unsigned port); void (*unmask)(unsigned port);
- void (*handle_events)(unsigned cpu); + void (*handle_events)(unsigned cpu, struct evtchn_loop_ctrl *ctrl); void (*resume)(void);
int (*percpu_init)(unsigned int cpu); @@ -80,6 +86,7 @@ extern const struct evtchn_ops *evtchn_ops;
extern int **evtchn_to_irq; int get_evtchn_to_irq(unsigned int evtchn); +void handle_irq_for_port(evtchn_port_t port, struct evtchn_loop_ctrl *ctrl);
struct irq_info *info_for_irq(unsigned irq); unsigned cpu_from_irq(unsigned irq); @@ -137,9 +144,10 @@ static inline void unmask_evtchn(unsigned port) return evtchn_ops->unmask(port); }
-static inline void xen_evtchn_handle_events(unsigned cpu) +static inline void xen_evtchn_handle_events(unsigned cpu, + struct evtchn_loop_ctrl *ctrl) { - return evtchn_ops->handle_events(cpu); + return evtchn_ops->handle_events(cpu, ctrl); }
static inline void xen_evtchn_resume(void)
In order to avoid high dom0 load due to rogue guests sending events at high frequency, block those events in case there was no action needed in dom0 to handle the events.
This is done by adding a per-event counter, which set to zero in case an EOI without the XEN_EOI_FLAG_SPURIOUS is received from a backend driver, and incremented when this flag has been set. In case the counter is 2 or higher delay the EOI by 1 << (cnt - 2) jiffies, but not more than 1 second.
In order not to waste memory shorten the per-event refcnt to two bytes (it should normally never exceed a value of 2). Add an overflow check to evtchn_get() to make sure the 2 bytes really won't overflow.
This is part of XSA-332.
This is upstream commit 5f7f77400ab5b357b5fdb7122c3442239672186c
Cc: stable@vger.kernel.org Signed-off-by: Juergen Gross jgross@suse.com Reviewed-by: Jan Beulich jbeulich@suse.com Reviewed-by: Stefano Stabellini sstabellini@kernel.org Reviewed-by: Wei Liu wl@xen.org --- drivers/xen/events/events_base.c | 27 ++++++++++++++++++++++----- drivers/xen/events/events_internal.h | 3 ++- 2 files changed, 24 insertions(+), 6 deletions(-)
diff --git a/drivers/xen/events/events_base.c b/drivers/xen/events/events_base.c index 07cef5fb4515..ec4074c66d9d 100644 --- a/drivers/xen/events/events_base.c +++ b/drivers/xen/events/events_base.c @@ -468,17 +468,34 @@ static void lateeoi_list_add(struct irq_info *info) spin_unlock_irqrestore(&eoi->eoi_list_lock, flags); }
-static void xen_irq_lateeoi_locked(struct irq_info *info) +static void xen_irq_lateeoi_locked(struct irq_info *info, bool spurious) { evtchn_port_t evtchn; unsigned int cpu; + unsigned int delay = 0;
evtchn = info->evtchn; if (!VALID_EVTCHN(evtchn) || !list_empty(&info->eoi_list)) return;
+ if (spurious) { + if ((1 << info->spurious_cnt) < (HZ << 2)) + info->spurious_cnt++; + if (info->spurious_cnt > 1) { + delay = 1 << (info->spurious_cnt - 2); + if (delay > HZ) + delay = HZ; + if (!info->eoi_time) + info->eoi_cpu = smp_processor_id(); + info->eoi_time = get_jiffies_64() + delay; + } + } else { + info->spurious_cnt = 0; + } + cpu = info->eoi_cpu; - if (info->eoi_time && info->irq_epoch == per_cpu(irq_epoch, cpu)) { + if (info->eoi_time && + (info->irq_epoch == per_cpu(irq_epoch, cpu) || delay)) { lateeoi_list_add(info); return; } @@ -515,7 +532,7 @@ static void xen_irq_lateeoi_worker(struct work_struct *work)
info->eoi_time = 0;
- xen_irq_lateeoi_locked(info); + xen_irq_lateeoi_locked(info, false); }
if (info) @@ -544,7 +561,7 @@ void xen_irq_lateeoi(unsigned int irq, unsigned int eoi_flags) info = info_for_irq(irq);
if (info) - xen_irq_lateeoi_locked(info); + xen_irq_lateeoi_locked(info, eoi_flags & XEN_EOI_FLAG_SPURIOUS);
read_unlock_irqrestore(&evtchn_rwlock, flags); } @@ -1447,7 +1464,7 @@ int evtchn_get(unsigned int evtchn) goto done;
err = -EINVAL; - if (info->refcnt <= 0) + if (info->refcnt <= 0 || info->refcnt == SHRT_MAX) goto done;
info->refcnt++; diff --git a/drivers/xen/events/events_internal.h b/drivers/xen/events/events_internal.h index 2cb9c2d2c5c0..b9b4f5919893 100644 --- a/drivers/xen/events/events_internal.h +++ b/drivers/xen/events/events_internal.h @@ -33,7 +33,8 @@ enum xen_irq_type { struct irq_info { struct list_head list; struct list_head eoi_list; - int refcnt; + short refcnt; + short spurious_cnt; enum xen_irq_type type; /* type */ unsigned irq; unsigned int evtchn; /* event channel */
On Tue, Nov 03, 2020 at 05:22:25PM +0100, Juergen Gross wrote:
Juergen Gross (13): xen/events: don't use chip_data for legacy IRQs xen/events: avoid removing an event channel while handling it xen/events: add a proper barrier to 2-level uevent unmasking xen/events: fix race in evtchn_fifo_unmask() xen/events: add a new "late EOI" evtchn framework xen/blkback: use lateeoi irq binding xen/netback: use lateeoi irq binding xen/scsiback: use lateeoi irq binding xen/pciback: use lateeoi irq binding xen/events: switch user event channels to lateeoi model xen/events: use a common cpu hotplug hook for event channels xen/events: defer eoi in case of excessive number of events xen/events: block rogue events for some time
All now queued up, thanks.
greg k-h
linux-stable-mirror@lists.linaro.org