From: Ranjan Kumar <ranjan.kumar@broadcom.com>
[ Upstream commit ee0017c3ed8a8abfa4d40e42f908fb38c31e7515 ]
If the driver detects that the controller is not ready before sending the
first IOC facts command, it will wait for a maximum of 10 seconds for it to
become ready. However, even if the controller becomes ready within 10
seconds, the driver will still issue a diagnostic reset.
Modify the driver to avoid sending a diag reset if the controller becomes
ready within the 10-second wait time.
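As a minimal userspace sketch of the control-flow change (the names and
the ready test are illustrative stand-ins for _base_wait_for_iocstate()
and friends, not the driver's actual API):

  #include <stdbool.h>
  #include <stdio.h>

  /* Hypothetical stand-ins for the ready poll and the diagnostic reset. */
  static bool controller_ready(void) { return true; }
  static int diag_reset(void) { return 0; }

  /* Before: a successful wait still fell through to the reset label. */
  static int wait_for_ready_buggy(int timeout_s)
  {
          int i;

          for (i = 0; i < timeout_s; i++)
                  if (controller_ready())
                          break;          /* falls through to the reset below */
  /* issue_diag_reset: */
          return diag_reset();            /* reset issued unconditionally */
  }

  /* After: return 0 once ready, so the reset only runs on timeout. */
  static int wait_for_ready_fixed(int timeout_s)
  {
          int i;

          for (i = 0; i < timeout_s; i++)
                  if (controller_ready())
                          return 0;       /* ready in time: skip the reset */
          return diag_reset();            /* only reached on timeout */
  }

  int main(void)
  {
          printf("buggy: %d fixed: %d\n",
                 wait_for_ready_buggy(10), wait_for_ready_fixed(10));
          return 0;
  }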
Signed-off-by: Ranjan Kumar <ranjan.kumar@broadcom.com>
Link: https://lore.kernel.org/r/20240221071724.14986-1-ranjan.kumar@broadcom.com
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
drivers/scsi/mpt3sas/mpt3sas_base.c | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/drivers/scsi/mpt3sas/mpt3sas_base.c b/drivers/scsi/mpt3sas/mpt3sas_base.c
index 447ac667f4b2b..7588c2c11a879 100644
--- a/drivers/scsi/mpt3sas/mpt3sas_base.c
+++ b/drivers/scsi/mpt3sas/mpt3sas_base.c
@@ -5584,7 +5584,9 @@ _base_wait_for_iocstate(struct MPT3SAS_ADAPTER *ioc, int timeout)
return -EFAULT;
}
- issue_diag_reset:
+ return 0;
+
+issue_diag_reset:
rc = _base_diag_reset(ioc);
return rc;
}
--
2.43.0
From: Filipe Manana <fdmanana@suse.com>
[ Upstream commit c7bb26b847e5b97814f522686068c5628e2b3646 ]
At btrfs_use_block_rsv() we read the size of a block reserve without
locking its spinlock, which makes KCSAN complain because the size of a
block reserve is always updated while holding its spinlock. The report
from KCSAN is the following:
[653.313148] BUG: KCSAN: data-race in btrfs_update_delayed_refs_rsv [btrfs] / btrfs_use_block_rsv [btrfs]
[653.314755] read to 0x000000017f5871b8 of 8 bytes by task 7519 on cpu 0:
[653.314779] btrfs_use_block_rsv+0xe4/0x2f8 [btrfs]
[653.315606] btrfs_alloc_tree_block+0xdc/0x998 [btrfs]
[653.316421] btrfs_force_cow_block+0x220/0xe38 [btrfs]
[653.317242] btrfs_cow_block+0x1ac/0x568 [btrfs]
[653.318060] btrfs_search_slot+0xda2/0x19b8 [btrfs]
[653.318879] btrfs_del_csums+0x1dc/0x798 [btrfs]
[653.319702] __btrfs_free_extent.isra.0+0xc24/0x2028 [btrfs]
[653.320538] __btrfs_run_delayed_refs+0xd3c/0x2390 [btrfs]
[653.321340] btrfs_run_delayed_refs+0xae/0x290 [btrfs]
[653.322140] flush_space+0x5e4/0x718 [btrfs]
[653.322958] btrfs_preempt_reclaim_metadata_space+0x102/0x2f8 [btrfs]
[653.323781] process_one_work+0x3b6/0x838
[653.323800] worker_thread+0x75e/0xb10
[653.323817] kthread+0x21a/0x230
[653.323836] __ret_from_fork+0x6c/0xb8
[653.323855] ret_from_fork+0xa/0x30
[653.323887] write to 0x000000017f5871b8 of 8 bytes by task 576 on cpu 3:
[653.323906] btrfs_update_delayed_refs_rsv+0x1a4/0x250 [btrfs]
[653.324699] btrfs_add_delayed_data_ref+0x468/0x6d8 [btrfs]
[653.325494] btrfs_free_extent+0x76/0x120 [btrfs]
[653.326280] __btrfs_mod_ref+0x6a8/0x6b8 [btrfs]
[653.327064] btrfs_dec_ref+0x50/0x70 [btrfs]
[653.327849] walk_up_proc+0x236/0xa50 [btrfs]
[653.328633] walk_up_tree+0x21c/0x448 [btrfs]
[653.329418] btrfs_drop_snapshot+0x802/0x1328 [btrfs]
[653.330205] btrfs_clean_one_deleted_snapshot+0x184/0x238 [btrfs]
[653.330995] cleaner_kthread+0x2b0/0x2f0 [btrfs]
[653.331781] kthread+0x21a/0x230
[653.331800] __ret_from_fork+0x6c/0xb8
[653.331818] ret_from_fork+0xa/0x30
So add a helper to get the size of a block reserve while holding the lock.
The field is read while holding the lock, rather than using the data_race()
annotation, in order to prevent load tearing.
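As a rough userspace analogue of the pattern (a pthread mutex standing
in for the kernel spinlock; struct and function names are illustrative):

  #include <pthread.h>
  #include <stdint.h>

  /* A 64-bit field that is always written under the lock must also be
   * read under it (or via a single atomic load); a plain read may be
   * torn into two 32-bit loads on some targets, which is what KCSAN
   * flags as a data race. */
  struct block_rsv {
          pthread_mutex_t lock;
          uint64_t size;
  };

  /* Mirrors the added btrfs_block_rsv_size() helper: the caller accepts
   * a possibly stale value, but never a torn one. */
  static uint64_t block_rsv_size(struct block_rsv *rsv)
  {
          uint64_t ret;

          pthread_mutex_lock(&rsv->lock);
          ret = rsv->size;
          pthread_mutex_unlock(&rsv->lock);

          return ret;
  }

A data_race() annotation would only silence the warning while still
allowing a torn load; the locked read rules tearing out entirely.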
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
fs/btrfs/block-rsv.c | 2 +-
fs/btrfs/block-rsv.h | 16 ++++++++++++++++
2 files changed, 17 insertions(+), 1 deletion(-)
diff --git a/fs/btrfs/block-rsv.c b/fs/btrfs/block-rsv.c
index 36ef3228bac86..63205d2f4d84c 100644
--- a/fs/btrfs/block-rsv.c
+++ b/fs/btrfs/block-rsv.c
@@ -392,7 +392,7 @@ struct btrfs_block_rsv *btrfs_use_block_rsv(struct btrfs_trans_handle *trans,
block_rsv = get_block_rsv(trans, root);
- if (unlikely(block_rsv->size == 0))
+ if (unlikely(btrfs_block_rsv_size(block_rsv) == 0))
goto try_reserve;
again:
ret = btrfs_block_rsv_use_bytes(block_rsv, blocksize);
diff --git a/fs/btrfs/block-rsv.h b/fs/btrfs/block-rsv.h
index d1428bb73fc5a..69770360917cb 100644
--- a/fs/btrfs/block-rsv.h
+++ b/fs/btrfs/block-rsv.h
@@ -98,4 +98,20 @@ static inline void btrfs_unuse_block_rsv(struct btrfs_fs_info *fs_info,
btrfs_block_rsv_release(fs_info, block_rsv, 0);
}
+/*
+ * Get the size of a block reserve in a context where getting a stale value is
+ * acceptable, instead of accessing it directly and triggering a data race
+ * warning from KCSAN.
+ */
+static inline u64 btrfs_block_rsv_size(struct btrfs_block_rsv *rsv)
+{
+ u64 ret;
+
+ spin_lock(&rsv->lock);
+ ret = rsv->size;
+ spin_unlock(&rsv->lock);
+
+ return ret;
+}
+
#endif /* BTRFS_BLOCK_RSV_H */
--
2.43.0
From: Ranjan Kumar <ranjan.kumar@broadcom.com>
[ Upstream commit ee0017c3ed8a8abfa4d40e42f908fb38c31e7515 ]
If the driver detects that the controller is not ready before sending the
first IOC facts command, it will wait for a maximum of 10 seconds for it to
become ready. However, even if the controller becomes ready within 10
seconds, the driver will still issue a diagnostic reset.
Modify the driver to avoid sending a diag reset if the controller becomes
ready within the 10-second wait time.
Signed-off-by: Ranjan Kumar <ranjan.kumar@broadcom.com>
Link: https://lore.kernel.org/r/20240221071724.14986-1-ranjan.kumar@broadcom.com
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
drivers/scsi/mpt3sas/mpt3sas_base.c | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/drivers/scsi/mpt3sas/mpt3sas_base.c b/drivers/scsi/mpt3sas/mpt3sas_base.c
index 814ac25238058..105d781d0cacf 100644
--- a/drivers/scsi/mpt3sas/mpt3sas_base.c
+++ b/drivers/scsi/mpt3sas/mpt3sas_base.c
@@ -6357,7 +6357,9 @@ _base_wait_for_iocstate(struct MPT3SAS_ADAPTER *ioc, int timeout)
return -EFAULT;
}
- issue_diag_reset:
+ return 0;
+
+issue_diag_reset:
rc = _base_diag_reset(ioc);
return rc;
}
--
2.43.0
Currently for devices requiring masking at the irqchip for INTx, i.e.
devices without DisINTx support, the IRQ is enabled in request_irq()
and subsequently disabled as necessary to align with the masked status
flag. This presents a window where the interrupt could fire between
these events, resulting in the IRQ incrementing the disable depth twice.
This would be unrecoverable for a user since the masked flag prevents
nested enables through vfio.
Instead, invert the logic using IRQF_NO_AUTOEN such that exclusive INTx
is never auto-enabled, then unmask as required.
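A kernel-style sketch of the inverted setup (not a standalone program;
the handler, lock, and masked flag stand in for the vfio-pci context):

  #include <linux/interrupt.h>
  #include <linux/spinlock.h>

  /* Sketch: request the IRQ without auto-enabling it, then perform a
   * single enable under the lock that protects the masked state. */
  static int setup_exclusive_intx(unsigned int irq, irq_handler_t handler,
                                  void *data, spinlock_t *lock, bool masked)
  {
          unsigned long flags;
          int ret;

          /* IRQF_NO_AUTOEN: the line stays disabled after request_irq(),
           * so it cannot fire while logically masked. */
          ret = request_irq(irq, handler, IRQF_NO_AUTOEN, "sketch-intx", data);
          if (ret)
                  return ret;

          spin_lock_irqsave(lock, flags);
          if (!masked)
                  enable_irq(irq);        /* one enable; depth stays balanced */
          spin_unlock_irqrestore(lock, flags);

          return 0;
  }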
Cc: stable@vger.kernel.org
Fixes: 89e1f7d4c66d ("vfio: Add PCI device driver")
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---
drivers/vfio/pci/vfio_pci_intrs.c | 17 ++++++++++-------
1 file changed, 10 insertions(+), 7 deletions(-)
diff --git a/drivers/vfio/pci/vfio_pci_intrs.c b/drivers/vfio/pci/vfio_pci_intrs.c
index 237beac83809..136101179fcb 100644
--- a/drivers/vfio/pci/vfio_pci_intrs.c
+++ b/drivers/vfio/pci/vfio_pci_intrs.c
@@ -296,8 +296,15 @@ static int vfio_intx_set_signal(struct vfio_pci_core_device *vdev, int fd)
ctx->trigger = trigger;
+ /*
+ * Devices without DisINTx support require an exclusive interrupt,
+ * IRQ masking is performed at the IRQ chip. The masked status is
+ * protected by vdev->irqlock. Setup the IRQ without auto-enable and
+ * unmask as necessary below under lock. DisINTx is unmodified by
+ * the IRQ configuration and may therefore use auto-enable.
+ */
if (!vdev->pci_2_3)
- irqflags = 0;
+ irqflags = IRQF_NO_AUTOEN;
ret = request_irq(pdev->irq, vfio_intx_handler,
irqflags, ctx->name, vdev);
@@ -308,13 +315,9 @@ static int vfio_intx_set_signal(struct vfio_pci_core_device *vdev, int fd)
return ret;
}
- /*
- * INTx disable will stick across the new irq setup,
- * disable_irq won't.
- */
spin_lock_irqsave(&vdev->irqlock, flags);
- if (!vdev->pci_2_3 && ctx->masked)
- disable_irq_nosync(pdev->irq);
+ if (!vdev->pci_2_3 && !ctx->masked)
+ enable_irq(pdev->irq);
spin_unlock_irqrestore(&vdev->irqlock, flags);
return 0;
--
2.44.0
From: Vitor Soares <vitor.soares@toradex.com>
When the mcp251xfd_start_xmit() function fails, the driver stops
processing messages, and the interrupt routine does not return,
running indefinitely even after the application is killed.
Error messages:
[ 441.298819] mcp251xfd spi2.0 can0: ERROR in mcp251xfd_start_xmit: -16
[ 441.306498] mcp251xfd spi2.0 can0: Transmit Event FIFO buffer not empty. (seq=0x000017c7, tef_tail=0x000017cf, tef_head=0x000017d0, tx_head=0x000017d3).
... and repeat forever.
The issue can be triggered when multiple devices share the same SPI
interface and there is concurrent access to the bus.
The problem occurs because tx_ring->head is incremented even if
mcp251xfd_start_xmit() fails. Consequently, the driver skips one
TX packet while still expecting a response in
mcp251xfd_handle_tefif_one().
This patch resolves the issue by decrementing tx_ring->head if
mcp251xfd_start_xmit() fails. With the fix, if the issue is triggered
and err is -EBUSY, the driver returns NETDEV_TX_BUSY and the network
stack retries the transmission.
Otherwise, the driver prints an error and discards the message.
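A toy model of the ring accounting (userspace, illustrative names; the
free-space formula mirrors mcp251xfd_get_tx_free()):

  #include <stdio.h>

  struct ring { unsigned int head, tail, obj_num; };

  /* Free TX slots derive from head/tail, so a head left incremented
   * past a failed write permanently desynchronizes the driver from the
   * Transmit Event FIFO. */
  static unsigned int tx_free(const struct ring *r)
  {
          return r->obj_num - (r->head - r->tail);
  }

  int main(void)
  {
          struct ring r = { .head = 0, .tail = 0, .obj_num = 8 };

          r.head++;               /* slot claimed for the new TX object */
          /* ... the SPI write fails with -EBUSY ... */
          r.head--;               /* fix: give the slot back before retrying */

          printf("free after rollback: %u\n", tx_free(&r));
          return 0;
  }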
Fixes: 55e5b97f003e ("can: mcp25xxfd: add driver for Microchip MCP25xxFD SPI CAN")
Cc: stable@vger.kernel.org
Signed-off-by: Vitor Soares <vitor.soares@toradex.com>
---
V2->V3:
- Add tx_dropped stats.
- Call netdev_sent_queue() only if can_put_echo_skb() succeeds.
V1->V2:
- Return NETDEV_TX_BUSY if mcp251xfd_tx_obj_write() returns -EBUSY.
- Rework the commit message to address the change above.
- Call can_put_echo_skb() after mcp251xfd_tx_obj_write() succeeds;
otherwise, we get a kernel NULL pointer dereference.
drivers/net/can/spi/mcp251xfd/mcp251xfd-tx.c | 34 ++++++++++++--------
1 file changed, 21 insertions(+), 13 deletions(-)
diff --git a/drivers/net/can/spi/mcp251xfd/mcp251xfd-tx.c b/drivers/net/can/spi/mcp251xfd/mcp251xfd-tx.c
index 160528d3cc26..146c44e47c60 100644
--- a/drivers/net/can/spi/mcp251xfd/mcp251xfd-tx.c
+++ b/drivers/net/can/spi/mcp251xfd/mcp251xfd-tx.c
@@ -166,6 +166,7 @@ netdev_tx_t mcp251xfd_start_xmit(struct sk_buff *skb,
struct net_device *ndev)
{
struct mcp251xfd_priv *priv = netdev_priv(ndev);
+ struct net_device_stats *stats = &ndev->stats;
struct mcp251xfd_tx_ring *tx_ring = priv->tx;
struct mcp251xfd_tx_obj *tx_obj;
unsigned int frame_len;
@@ -181,25 +182,32 @@ netdev_tx_t mcp251xfd_start_xmit(struct sk_buff *skb,
tx_obj = mcp251xfd_get_tx_obj_next(tx_ring);
mcp251xfd_tx_obj_from_skb(priv, tx_obj, skb, tx_ring->head);
- /* Stop queue if we occupy the complete TX FIFO */
tx_head = mcp251xfd_get_tx_head(tx_ring);
- tx_ring->head++;
- if (mcp251xfd_get_tx_free(tx_ring) == 0)
- netif_stop_queue(ndev);
-
frame_len = can_skb_get_frame_len(skb);
- err = can_put_echo_skb(skb, ndev, tx_head, frame_len);
- if (!err)
- netdev_sent_queue(priv->ndev, frame_len);
+
+ tx_ring->head++;
err = mcp251xfd_tx_obj_write(priv, tx_obj);
- if (err)
- goto out_err;
+ if (err) {
+ tx_ring->head--;
- return NETDEV_TX_OK;
+ if (err == -EBUSY)
+ return NETDEV_TX_BUSY;
- out_err:
- netdev_err(priv->ndev, "ERROR in %s: %d\n", __func__, err);
+ stats->tx_dropped++;
+
+ if (net_ratelimit())
+ netdev_err(priv->ndev,
+ "ERROR in %s: %d\n", __func__, err);
+ } else {
+ err = can_put_echo_skb(skb, ndev, tx_head, frame_len);
+ if (!err)
+ netdev_sent_queue(priv->ndev, frame_len);
+
+ /* Stop queue if we occupy the complete TX FIFO */
+ if (mcp251xfd_get_tx_free(tx_ring) == 0)
+ netif_stop_queue(ndev);
+ }
return NETDEV_TX_OK;
}
--
2.34.1
When running the simult_flows selftest in slow environments, e.g. QEMU
without KVM support, the results can be unstable. This selftest checks
whether the aggregated bandwidth is (almost) fully used as expected.
To help improve stability while still keeping the same validation in
place, the bandwidth and the delay are reduced to lower the pressure
on the CPU.
Fixes: 1a418cb8e888 ("mptcp: simult flow self-tests")
Fixes: 219d04992b68 ("mptcp: push pending frames when subflow has free space")
Cc: stable@vger.kernel.org
Suggested-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
Link: https://lore.kernel.org/r/20240131-upstream-net-20240131-mptcp-ci-issues-v1…
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
(cherry picked from commit 5e2f3c65af47e527ccac54060cf909e3306652ff)
Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
---
Notes:
- Conflicts in simult_flows.sh, because v5.15 doesn't have commit
675d99338e7a ("selftests: mptcp: simult flows: format subtests
results in TAP") which modifies the context for a new but unrelated
feature.
- This is a new version of the one recently proposed by Sasha, this
time without dependencies:
https://lore.kernel.org/stable/9f185a3f-9373-401c-9a5c-ec0f106c0cbc@kernel.…
- This is the same patch as the one recently sent for v6.1:
https://lore.kernel.org/stable/20240311111224.1421344-2-matttbe@kernel.org/
---
tools/testing/selftests/net/mptcp/simult_flows.sh | 9 +++++----
1 file changed, 5 insertions(+), 4 deletions(-)
diff --git a/tools/testing/selftests/net/mptcp/simult_flows.sh b/tools/testing/selftests/net/mptcp/simult_flows.sh
index 752cef168804..cab3e3a5481d 100755
--- a/tools/testing/selftests/net/mptcp/simult_flows.sh
+++ b/tools/testing/selftests/net/mptcp/simult_flows.sh
@@ -289,10 +289,11 @@ done
setup
run_test 10 10 0 0 "balanced bwidth"
-run_test 10 10 1 50 "balanced bwidth with unbalanced delay"
+run_test 10 10 1 25 "balanced bwidth with unbalanced delay"
# we still need some additional infrastructure to pass the following test-cases
-run_test 30 10 0 0 "unbalanced bwidth"
-run_test 30 10 1 50 "unbalanced bwidth with unbalanced delay"
-run_test 30 10 50 1 "unbalanced bwidth with opposed, unbalanced delay"
+run_test 10 3 0 0 "unbalanced bwidth"
+run_test 10 3 1 25 "unbalanced bwidth with unbalanced delay"
+run_test 10 3 25 1 "unbalanced bwidth with opposed, unbalanced delay"
+
exit $ret
--
2.43.0
When running the simult_flows selftest in slow environments, e.g. QEMU
without KVM support, the results can be unstable. This selftest checks
whether the aggregated bandwidth is (almost) fully used as expected.
To help improve stability while still keeping the same validation in
place, the bandwidth and the delay are reduced to lower the pressure
on the CPU.
Fixes: 1a418cb8e888 ("mptcp: simult flow self-tests")
Fixes: 219d04992b68 ("mptcp: push pending frames when subflow has free space")
Cc: stable@vger.kernel.org
Suggested-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
Link: https://lore.kernel.org/r/20240131-upstream-net-20240131-mptcp-ci-issues-v1…
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
(cherry picked from commit 5e2f3c65af47e527ccac54060cf909e3306652ff)
Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
---
Notes:
- Conflicts in simult_flows.sh, because v6.1 doesn't have commit
675d99338e7a ("selftests: mptcp: simult flows: format subtests
results in TAP") which modifies the context for a new but unrelated
feature.
- This is a new version of the one recently proposed by Sasha, this
time without dependencies:
https://lore.kernel.org/stable/9f185a3f-9373-401c-9a5c-ec0f106c0cbc@kernel.…
---
tools/testing/selftests/net/mptcp/simult_flows.sh | 9 +++++----
1 file changed, 5 insertions(+), 4 deletions(-)
diff --git a/tools/testing/selftests/net/mptcp/simult_flows.sh b/tools/testing/selftests/net/mptcp/simult_flows.sh
index 4a417f9d51d6..ee24e06521e6 100755
--- a/tools/testing/selftests/net/mptcp/simult_flows.sh
+++ b/tools/testing/selftests/net/mptcp/simult_flows.sh
@@ -301,10 +301,11 @@ done
setup
run_test 10 10 0 0 "balanced bwidth"
-run_test 10 10 1 50 "balanced bwidth with unbalanced delay"
+run_test 10 10 1 25 "balanced bwidth with unbalanced delay"
# we still need some additional infrastructure to pass the following test-cases
-run_test 30 10 0 0 "unbalanced bwidth"
-run_test 30 10 1 50 "unbalanced bwidth with unbalanced delay"
-run_test 30 10 50 1 "unbalanced bwidth with opposed, unbalanced delay"
+run_test 10 3 0 0 "unbalanced bwidth"
+run_test 10 3 1 25 "unbalanced bwidth with unbalanced delay"
+run_test 10 3 25 1 "unbalanced bwidth with opposed, unbalanced delay"
+
exit $ret
--
2.43.0
The eventfd_ctx trigger pointer of the vfio_fsl_mc_irq object is
initially NULL and may become NULL if the user sets the trigger
eventfd to -1. The interrupt handler itself is guaranteed that
trigger is always valid between request_irq() and free_irq(), but
the loopback testing mechanisms to invoke the handler function
need to test the trigger. The triggering and setting ioctl paths
both make use of igate and are therefore mutually exclusive.
The vfio-fsl-mc driver does not make use of irqfds, nor does it
support any sort of masking operations; therefore, unlike vfio-pci
and vfio-platform, the flow can remain essentially unchanged.
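A kernel-style sketch of the guarded loopback signal (illustrative
struct; eventfd_signal() per its current one-argument form):

  #include <linux/eventfd.h>

  struct sketch_irq {
          struct eventfd_ctx *trigger;    /* NULL until the user sets an fd */
  };

  /* The hard IRQ handler may assume a valid trigger between
   * request_irq() and free_irq(); ioctl-driven loopback paths cannot,
   * so test it before signaling. */
  static void sketch_loopback_signal(struct sketch_irq *irq)
  {
          if (irq->trigger)
                  eventfd_signal(irq->trigger);
  }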
Cc: Diana Craciun <diana.craciun@oss.nxp.com>
Cc: stable@vger.kernel.org
Fixes: cc0ee20bd969 ("vfio/fsl-mc: trigger an interrupt via eventfd")
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---
drivers/vfio/fsl-mc/vfio_fsl_mc_intr.c | 7 ++++---
1 file changed, 4 insertions(+), 3 deletions(-)
diff --git a/drivers/vfio/fsl-mc/vfio_fsl_mc_intr.c b/drivers/vfio/fsl-mc/vfio_fsl_mc_intr.c
index d62fbfff20b8..82b2afa9b7e3 100644
--- a/drivers/vfio/fsl-mc/vfio_fsl_mc_intr.c
+++ b/drivers/vfio/fsl-mc/vfio_fsl_mc_intr.c
@@ -141,13 +141,14 @@ static int vfio_fsl_mc_set_irq_trigger(struct vfio_fsl_mc_device *vdev,
irq = &vdev->mc_irqs[index];
if (flags & VFIO_IRQ_SET_DATA_NONE) {
- vfio_fsl_mc_irq_handler(hwirq, irq);
+ if (irq->trigger)
+ eventfd_signal(irq->trigger);
} else if (flags & VFIO_IRQ_SET_DATA_BOOL) {
u8 trigger = *(u8 *)data;
- if (trigger)
- vfio_fsl_mc_irq_handler(hwirq, irq);
+ if (trigger && irq->trigger)
+ eventfd_signal(irq->trigger);
}
return 0;
--
2.44.0
irqfds for mask and unmask that are not specifically disabled by the
user are leaked. Remove any irqfds during cleanup.
Cc: Eric Auger <eric.auger@redhat.com>
Cc: stable@vger.kernel.org
Fixes: a7fa7c77cf15 ("vfio/platform: implement IRQ masking/unmasking via an eventfd")
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---
drivers/vfio/platform/vfio_platform_irq.c | 5 ++++-
1 file changed, 4 insertions(+), 1 deletion(-)
diff --git a/drivers/vfio/platform/vfio_platform_irq.c b/drivers/vfio/platform/vfio_platform_irq.c
index 61a1bfb68ac7..e5dcada9e86c 100644
--- a/drivers/vfio/platform/vfio_platform_irq.c
+++ b/drivers/vfio/platform/vfio_platform_irq.c
@@ -321,8 +321,11 @@ void vfio_platform_irq_cleanup(struct vfio_platform_device *vdev)
{
int i;
- for (i = 0; i < vdev->num_irqs; i++)
+ for (i = 0; i < vdev->num_irqs; i++) {
+ vfio_virqfd_disable(&vdev->irqs[i].mask);
+ vfio_virqfd_disable(&vdev->irqs[i].unmask);
vfio_set_trigger(vdev, i, -1, NULL);
+ }
vdev->num_irqs = 0;
kfree(vdev->irqs);
--
2.44.0
A vulnerability exists where the eventfd for INTx signaling can be
deconfigured, which unregisters the IRQ handler but still allows
eventfds to be signaled with a NULL context through the SET_IRQS ioctl
or through unmask irqfd if the device interrupt is pending.
Ideally this could be solved with some additional locking; the igate
mutex serializes the ioctl and config space accesses, and the interrupt
handler is unregistered relative to the trigger, but the irqfd path
runs asynchronously to those. The igate mutex cannot be acquired from the
atomic context of the eventfd wake function. Disabling the irqfd
relative to the eventfd registration is potentially incompatible with
existing userspace.
As a result, the solution implemented here moves configuration of the
INTx interrupt handler to track the lifetime of the INTx context object
and irq_type configuration, rather than registration of a particular
trigger eventfd. Synchronization is added between the ioctl path and
eventfd_signal() wrapper such that the eventfd trigger can be
dynamically updated relative to in-flight interrupts or irqfd callbacks.
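A kernel-style sketch of the publish/retire pattern (names are
illustrative; the consumer side reads the pointer with READ_ONCE() as
in vfio_send_intx_eventfd()):

  #include <linux/eventfd.h>
  #include <linux/interrupt.h>

  /* Swap the trigger out from under a live IRQ handler: publish the
   * new pointer, then wait out any in-flight users of the old one
   * before dropping its reference. */
  static void sketch_swap_trigger(struct eventfd_ctx **slot,
                                  struct eventfd_ctx *new_trigger,
                                  unsigned int irq)
  {
          struct eventfd_ctx *old = *slot;

          WRITE_ONCE(*slot, new_trigger); /* NULL means "no trigger" */

          if (old) {
                  synchronize_irq(irq);   /* in-flight handlers are done */
                  eventfd_ctx_put(old);   /* now safe to drop the reference */
          }
  }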
Cc: stable@vger.kernel.org
Fixes: 89e1f7d4c66d ("vfio: Add PCI device driver")
Reported-by: Reinette Chatre <reinette.chatre@intel.com>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Reviewed-by: Reinette Chatre <reinette.chatre@intel.com>
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---
drivers/vfio/pci/vfio_pci_intrs.c | 145 ++++++++++++++++--------------
1 file changed, 78 insertions(+), 67 deletions(-)
diff --git a/drivers/vfio/pci/vfio_pci_intrs.c b/drivers/vfio/pci/vfio_pci_intrs.c
index 75c85eec21b3..fb5392b749ff 100644
--- a/drivers/vfio/pci/vfio_pci_intrs.c
+++ b/drivers/vfio/pci/vfio_pci_intrs.c
@@ -90,11 +90,15 @@ static void vfio_send_intx_eventfd(void *opaque, void *unused)
if (likely(is_intx(vdev) && !vdev->virq_disabled)) {
struct vfio_pci_irq_ctx *ctx;
+ struct eventfd_ctx *trigger;
ctx = vfio_irq_ctx_get(vdev, 0);
if (WARN_ON_ONCE(!ctx))
return;
- eventfd_signal(ctx->trigger);
+
+ trigger = READ_ONCE(ctx->trigger);
+ if (likely(trigger))
+ eventfd_signal(trigger);
}
}
@@ -253,100 +257,100 @@ static irqreturn_t vfio_intx_handler(int irq, void *dev_id)
return ret;
}
-static int vfio_intx_enable(struct vfio_pci_core_device *vdev)
+static int vfio_intx_enable(struct vfio_pci_core_device *vdev,
+ struct eventfd_ctx *trigger)
{
+ struct pci_dev *pdev = vdev->pdev;
struct vfio_pci_irq_ctx *ctx;
+ unsigned long irqflags;
+ char *name;
+ int ret;
if (!is_irq_none(vdev))
return -EINVAL;
- if (!vdev->pdev->irq)
+ if (!pdev->irq)
return -ENODEV;
+ name = kasprintf(GFP_KERNEL_ACCOUNT, "vfio-intx(%s)", pci_name(pdev));
+ if (!name)
+ return -ENOMEM;
+
ctx = vfio_irq_ctx_alloc(vdev, 0);
if (!ctx)
return -ENOMEM;
+ ctx->name = name;
+ ctx->trigger = trigger;
+
/*
- * If the virtual interrupt is masked, restore it. Devices
- * supporting DisINTx can be masked at the hardware level
- * here, non-PCI-2.3 devices will have to wait until the
- * interrupt is enabled.
+ * Fill the initial masked state based on virq_disabled. After
+ * enable, changing the DisINTx bit in vconfig directly changes INTx
+ * masking. igate prevents races during setup, once running masked
+ * is protected via irqlock.
+ *
+ * Devices supporting DisINTx also reflect the current mask state in
+ * the physical DisINTx bit, which is not affected during IRQ setup.
+ *
+ * Devices without DisINTx support require an exclusive interrupt.
+ * IRQ masking is performed at the IRQ chip. Again, igate protects
+ * against races during setup and IRQ handlers and irqfds are not
+ * yet active, therefore masked is stable and can be used to
+ * conditionally auto-enable the IRQ.
+ *
+ * irq_type must be stable while the IRQ handler is registered,
+ * therefore it must be set before request_irq().
*/
ctx->masked = vdev->virq_disabled;
- if (vdev->pci_2_3)
- pci_intx(vdev->pdev, !ctx->masked);
+ if (vdev->pci_2_3) {
+ pci_intx(pdev, !ctx->masked);
+ irqflags = IRQF_SHARED;
+ } else {
+ irqflags = ctx->masked ? IRQF_NO_AUTOEN : 0;
+ }
vdev->irq_type = VFIO_PCI_INTX_IRQ_INDEX;
+ ret = request_irq(pdev->irq, vfio_intx_handler,
+ irqflags, ctx->name, vdev);
+ if (ret) {
+ vdev->irq_type = VFIO_PCI_NUM_IRQS;
+ kfree(name);
+ vfio_irq_ctx_free(vdev, ctx, 0);
+ return ret;
+ }
+
return 0;
}
-static int vfio_intx_set_signal(struct vfio_pci_core_device *vdev, int fd)
+static int vfio_intx_set_signal(struct vfio_pci_core_device *vdev,
+ struct eventfd_ctx *trigger)
{
struct pci_dev *pdev = vdev->pdev;
- unsigned long irqflags = IRQF_SHARED;
struct vfio_pci_irq_ctx *ctx;
- struct eventfd_ctx *trigger;
- unsigned long flags;
- int ret;
+ struct eventfd_ctx *old;
ctx = vfio_irq_ctx_get(vdev, 0);
if (WARN_ON_ONCE(!ctx))
return -EINVAL;
- if (ctx->trigger) {
- free_irq(pdev->irq, vdev);
- kfree(ctx->name);
- eventfd_ctx_put(ctx->trigger);
- ctx->trigger = NULL;
- }
-
- if (fd < 0) /* Disable only */
- return 0;
-
- ctx->name = kasprintf(GFP_KERNEL_ACCOUNT, "vfio-intx(%s)",
- pci_name(pdev));
- if (!ctx->name)
- return -ENOMEM;
-
- trigger = eventfd_ctx_fdget(fd);
- if (IS_ERR(trigger)) {
- kfree(ctx->name);
- return PTR_ERR(trigger);
- }
+ old = ctx->trigger;
- ctx->trigger = trigger;
+ WRITE_ONCE(ctx->trigger, trigger);
- /*
- * Devices without DisINTx support require an exclusive interrupt,
- * IRQ masking is performed at the IRQ chip. The masked status is
- * protected by vdev->irqlock. Setup the IRQ without auto-enable and
- * unmask as necessary below under lock. DisINTx is unmodified by
- * the IRQ configuration and may therefore use auto-enable.
- */
- if (!vdev->pci_2_3)
- irqflags = IRQF_NO_AUTOEN;
-
- ret = request_irq(pdev->irq, vfio_intx_handler,
- irqflags, ctx->name, vdev);
- if (ret) {
- ctx->trigger = NULL;
- kfree(ctx->name);
- eventfd_ctx_put(trigger);
- return ret;
+ /* Releasing an old ctx requires synchronizing in-flight users */
+ if (old) {
+ synchronize_irq(pdev->irq);
+ vfio_virqfd_flush_thread(&ctx->unmask);
+ eventfd_ctx_put(old);
}
- spin_lock_irqsave(&vdev->irqlock, flags);
- if (!vdev->pci_2_3 && !ctx->masked)
- enable_irq(pdev->irq);
- spin_unlock_irqrestore(&vdev->irqlock, flags);
-
return 0;
}
static void vfio_intx_disable(struct vfio_pci_core_device *vdev)
{
+ struct pci_dev *pdev = vdev->pdev;
struct vfio_pci_irq_ctx *ctx;
ctx = vfio_irq_ctx_get(vdev, 0);
@@ -354,10 +358,13 @@ static void vfio_intx_disable(struct vfio_pci_core_device *vdev)
if (ctx) {
vfio_virqfd_disable(&ctx->unmask);
vfio_virqfd_disable(&ctx->mask);
+ free_irq(pdev->irq, vdev);
+ if (ctx->trigger)
+ eventfd_ctx_put(ctx->trigger);
+ kfree(ctx->name);
+ vfio_irq_ctx_free(vdev, ctx, 0);
}
- vfio_intx_set_signal(vdev, -1);
vdev->irq_type = VFIO_PCI_NUM_IRQS;
- vfio_irq_ctx_free(vdev, ctx, 0);
}
/*
@@ -641,19 +648,23 @@ static int vfio_pci_set_intx_trigger(struct vfio_pci_core_device *vdev,
return -EINVAL;
if (flags & VFIO_IRQ_SET_DATA_EVENTFD) {
+ struct eventfd_ctx *trigger = NULL;
int32_t fd = *(int32_t *)data;
int ret;
- if (is_intx(vdev))
- return vfio_intx_set_signal(vdev, fd);
+ if (fd >= 0) {
+ trigger = eventfd_ctx_fdget(fd);
+ if (IS_ERR(trigger))
+ return PTR_ERR(trigger);
+ }
- ret = vfio_intx_enable(vdev);
- if (ret)
- return ret;
+ if (is_intx(vdev))
+ ret = vfio_intx_set_signal(vdev, trigger);
+ else
+ ret = vfio_intx_enable(vdev, trigger);
- ret = vfio_intx_set_signal(vdev, fd);
- if (ret)
- vfio_intx_disable(vdev);
+ if (ret && trigger)
+ eventfd_ctx_put(trigger);
return ret;
}
--
2.44.0