From: Ranjan Kumar <ranjan.kumar@broadcom.com>
[ Upstream commit ee0017c3ed8a8abfa4d40e42f908fb38c31e7515 ]
If the driver detects that the controller is not ready before sending the
first IOC facts command, it will wait for a maximum of 10 seconds for it to
become ready. However, even if the controller becomes ready within 10
seconds, the driver will still issue a diagnostic reset.
Modify the driver to avoid sending a diag reset if the controller becomes
ready within the 10-second wait time.
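As a minimal userspace sketch of the control-flow change (the names and
the ready test are illustrative stand-ins for _base_wait_for_iocstate()
and friends, not the driver's actual API):

  #include <stdbool.h>
  #include <stdio.h>

  /* Hypothetical stand-ins for the ready poll and the diagnostic reset. */
  static bool controller_ready(void) { return true; }
  static int diag_reset(void) { return 0; }

  /* Before: a successful wait still fell through to the reset label. */
  static int wait_for_ready_buggy(int timeout_s)
  {
          int i;

          for (i = 0; i < timeout_s; i++)
                  if (controller_ready())
                          break;          /* falls through to the reset below */
  /* issue_diag_reset: */
          return diag_reset();            /* reset issued unconditionally */
  }

  /* After: return 0 once ready, so the reset only runs on timeout. */
  static int wait_for_ready_fixed(int timeout_s)
  {
          int i;

          for (i = 0; i < timeout_s; i++)
                  if (controller_ready())
                          return 0;       /* ready in time: skip the reset */
          return diag_reset();            /* only reached on timeout */
  }

  int main(void)
  {
          printf("buggy: %d fixed: %d\n",
                 wait_for_ready_buggy(10), wait_for_ready_fixed(10));
          return 0;
  }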
Signed-off-by: Ranjan Kumar <ranjan.kumar@broadcom.com>
Link: https://lore.kernel.org/r/20240221071724.14986-1-ranjan.kumar@broadcom.com
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
drivers/scsi/mpt3sas/mpt3sas_base.c | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/drivers/scsi/mpt3sas/mpt3sas_base.c b/drivers/scsi/mpt3sas/mpt3sas_base.c
index 447ac667f4b2b..7588c2c11a879 100644
--- a/drivers/scsi/mpt3sas/mpt3sas_base.c
+++ b/drivers/scsi/mpt3sas/mpt3sas_base.c
@@ -5584,7 +5584,9 @@ _base_wait_for_iocstate(struct MPT3SAS_ADAPTER *ioc, int timeout)
return -EFAULT;
}
- issue_diag_reset:
+ return 0;
+
+issue_diag_reset:
rc = _base_diag_reset(ioc);
return rc;
}
--
2.43.0
From: Filipe Manana <fdmanana@suse.com>
[ Upstream commit c7bb26b847e5b97814f522686068c5628e2b3646 ]
At btrfs_use_block_rsv() we read the size of a block reserve without
locking its spinlock, which makes KCSAN complain because the size of a
block reserve is always updated while holding its spinlock. The report
from KCSAN is the following:
[653.313148] BUG: KCSAN: data-race in btrfs_update_delayed_refs_rsv [btrfs] / btrfs_use_block_rsv [btrfs]
[653.314755] read to 0x000000017f5871b8 of 8 bytes by task 7519 on cpu 0:
[653.314779] btrfs_use_block_rsv+0xe4/0x2f8 [btrfs]
[653.315606] btrfs_alloc_tree_block+0xdc/0x998 [btrfs]
[653.316421] btrfs_force_cow_block+0x220/0xe38 [btrfs]
[653.317242] btrfs_cow_block+0x1ac/0x568 [btrfs]
[653.318060] btrfs_search_slot+0xda2/0x19b8 [btrfs]
[653.318879] btrfs_del_csums+0x1dc/0x798 [btrfs]
[653.319702] __btrfs_free_extent.isra.0+0xc24/0x2028 [btrfs]
[653.320538] __btrfs_run_delayed_refs+0xd3c/0x2390 [btrfs]
[653.321340] btrfs_run_delayed_refs+0xae/0x290 [btrfs]
[653.322140] flush_space+0x5e4/0x718 [btrfs]
[653.322958] btrfs_preempt_reclaim_metadata_space+0x102/0x2f8 [btrfs]
[653.323781] process_one_work+0x3b6/0x838
[653.323800] worker_thread+0x75e/0xb10
[653.323817] kthread+0x21a/0x230
[653.323836] __ret_from_fork+0x6c/0xb8
[653.323855] ret_from_fork+0xa/0x30
[653.323887] write to 0x000000017f5871b8 of 8 bytes by task 576 on cpu 3:
[653.323906] btrfs_update_delayed_refs_rsv+0x1a4/0x250 [btrfs]
[653.324699] btrfs_add_delayed_data_ref+0x468/0x6d8 [btrfs]
[653.325494] btrfs_free_extent+0x76/0x120 [btrfs]
[653.326280] __btrfs_mod_ref+0x6a8/0x6b8 [btrfs]
[653.327064] btrfs_dec_ref+0x50/0x70 [btrfs]
[653.327849] walk_up_proc+0x236/0xa50 [btrfs]
[653.328633] walk_up_tree+0x21c/0x448 [btrfs]
[653.329418] btrfs_drop_snapshot+0x802/0x1328 [btrfs]
[653.330205] btrfs_clean_one_deleted_snapshot+0x184/0x238 [btrfs]
[653.330995] cleaner_kthread+0x2b0/0x2f0 [btrfs]
[653.331781] kthread+0x21a/0x230
[653.331800] __ret_from_fork+0x6c/0xb8
[653.331818] ret_from_fork+0xa/0x30
So add a helper to get the size of a block reserve while holding the lock.
The field is read while holding the lock, rather than using the data_race()
annotation, in order to prevent load tearing.
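As a rough userspace analogue of the pattern (a pthread mutex standing
in for the kernel spinlock; struct and function names are illustrative):

  #include <pthread.h>
  #include <stdint.h>

  /* A 64-bit field that is always written under the lock must also be
   * read under it (or via a single atomic load); a plain read may be
   * torn into two 32-bit loads on some targets, which is what KCSAN
   * flags as a data race. */
  struct block_rsv {
          pthread_mutex_t lock;
          uint64_t size;
  };

  /* Mirrors the added btrfs_block_rsv_size() helper: the caller accepts
   * a possibly stale value, but never a torn one. */
  static uint64_t block_rsv_size(struct block_rsv *rsv)
  {
          uint64_t ret;

          pthread_mutex_lock(&rsv->lock);
          ret = rsv->size;
          pthread_mutex_unlock(&rsv->lock);

          return ret;
  }

A data_race() annotation would only silence the warning while still
allowing a torn load; the locked read rules tearing out entirely.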
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
fs/btrfs/block-rsv.c | 2 +-
fs/btrfs/block-rsv.h | 16 ++++++++++++++++
2 files changed, 17 insertions(+), 1 deletion(-)
diff --git a/fs/btrfs/block-rsv.c b/fs/btrfs/block-rsv.c
index 36ef3228bac86..63205d2f4d84c 100644
--- a/fs/btrfs/block-rsv.c
+++ b/fs/btrfs/block-rsv.c
@@ -392,7 +392,7 @@ struct btrfs_block_rsv *btrfs_use_block_rsv(struct btrfs_trans_handle *trans,
block_rsv = get_block_rsv(trans, root);
- if (unlikely(block_rsv->size == 0))
+ if (unlikely(btrfs_block_rsv_size(block_rsv) == 0))
goto try_reserve;
again:
ret = btrfs_block_rsv_use_bytes(block_rsv, blocksize);
diff --git a/fs/btrfs/block-rsv.h b/fs/btrfs/block-rsv.h
index d1428bb73fc5a..69770360917cb 100644
--- a/fs/btrfs/block-rsv.h
+++ b/fs/btrfs/block-rsv.h
@@ -98,4 +98,20 @@ static inline void btrfs_unuse_block_rsv(struct btrfs_fs_info *fs_info,
btrfs_block_rsv_release(fs_info, block_rsv, 0);
}
+/*
+ * Get the size of a block reserve in a context where getting a stale value is
+ * acceptable, instead of accessing it directly and triggering a data race
+ * warning from KCSAN.
+ */
+static inline u64 btrfs_block_rsv_size(struct btrfs_block_rsv *rsv)
+{
+ u64 ret;
+
+ spin_lock(&rsv->lock);
+ ret = rsv->size;
+ spin_unlock(&rsv->lock);
+
+ return ret;
+}
+
#endif /* BTRFS_BLOCK_RSV_H */
--
2.43.0
From: Ranjan Kumar <ranjan.kumar@broadcom.com>
[ Upstream commit ee0017c3ed8a8abfa4d40e42f908fb38c31e7515 ]
If the driver detects that the controller is not ready before sending the
first IOC facts command, it will wait for a maximum of 10 seconds for it to
become ready. However, even if the controller becomes ready within 10
seconds, the driver will still issue a diagnostic reset.
Modify the driver to avoid sending a diag reset if the controller becomes
ready within the 10-second wait time.
Signed-off-by: Ranjan Kumar <ranjan.kumar@broadcom.com>
Link: https://lore.kernel.org/r/20240221071724.14986-1-ranjan.kumar@broadcom.com
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
drivers/scsi/mpt3sas/mpt3sas_base.c | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/drivers/scsi/mpt3sas/mpt3sas_base.c b/drivers/scsi/mpt3sas/mpt3sas_base.c
index 814ac25238058..105d781d0cacf 100644
--- a/drivers/scsi/mpt3sas/mpt3sas_base.c
+++ b/drivers/scsi/mpt3sas/mpt3sas_base.c
@@ -6357,7 +6357,9 @@ _base_wait_for_iocstate(struct MPT3SAS_ADAPTER *ioc, int timeout)
return -EFAULT;
}
- issue_diag_reset:
+ return 0;
+
+issue_diag_reset:
rc = _base_diag_reset(ioc);
return rc;
}
--
2.43.0
Currently for devices requiring masking at the irqchip for INTx, i.e.
devices without DisINTx support, the IRQ is enabled in request_irq()
and subsequently disabled as necessary to align with the masked status
flag. This presents a window where the interrupt could fire between
these events, resulting in the IRQ incrementing the disable depth twice.
This would be unrecoverable for a user since the masked flag prevents
nested enables through vfio.
Instead, invert the logic using IRQF_NO_AUTOEN such that exclusive INTx
is never auto-enabled, then unmask as required.
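A kernel-style sketch of the inverted setup (not a standalone program;
the handler, lock, and masked flag stand in for the vfio-pci context):

  #include <linux/interrupt.h>
  #include <linux/spinlock.h>

  /* Sketch: request the IRQ without auto-enabling it, then perform a
   * single enable under the lock that protects the masked state. */
  static int setup_exclusive_intx(unsigned int irq, irq_handler_t handler,
                                  void *data, spinlock_t *lock, bool masked)
  {
          unsigned long flags;
          int ret;

          /* IRQF_NO_AUTOEN: the line stays disabled after request_irq(),
           * so it cannot fire while logically masked. */
          ret = request_irq(irq, handler, IRQF_NO_AUTOEN, "sketch-intx", data);
          if (ret)
                  return ret;

          spin_lock_irqsave(lock, flags);
          if (!masked)
                  enable_irq(irq);        /* one enable; depth stays balanced */
          spin_unlock_irqrestore(lock, flags);

          return 0;
  }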
Cc: stable@vger.kernel.org
Fixes: 89e1f7d4c66d ("vfio: Add PCI device driver")
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---
drivers/vfio/pci/vfio_pci_intrs.c | 17 ++++++++++-------
1 file changed, 10 insertions(+), 7 deletions(-)
diff --git a/drivers/vfio/pci/vfio_pci_intrs.c b/drivers/vfio/pci/vfio_pci_intrs.c
index 237beac83809..136101179fcb 100644
--- a/drivers/vfio/pci/vfio_pci_intrs.c
+++ b/drivers/vfio/pci/vfio_pci_intrs.c
@@ -296,8 +296,15 @@ static int vfio_intx_set_signal(struct vfio_pci_core_device *vdev, int fd)
ctx->trigger = trigger;
+ /*
+ * Devices without DisINTx support require an exclusive interrupt,
+ * IRQ masking is performed at the IRQ chip. The masked status is
+ * protected by vdev->irqlock. Setup the IRQ without auto-enable and
+ * unmask as necessary below under lock. DisINTx is unmodified by
+ * the IRQ configuration and may therefore use auto-enable.
+ */
if (!vdev->pci_2_3)
- irqflags = 0;
+ irqflags = IRQF_NO_AUTOEN;
ret = request_irq(pdev->irq, vfio_intx_handler,
irqflags, ctx->name, vdev);
@@ -308,13 +315,9 @@ static int vfio_intx_set_signal(struct vfio_pci_core_device *vdev, int fd)
return ret;
}
- /*
- * INTx disable will stick across the new irq setup,
- * disable_irq won't.
- */
spin_lock_irqsave(&vdev->irqlock, flags);
- if (!vdev->pci_2_3 && ctx->masked)
- disable_irq_nosync(pdev->irq);
+ if (!vdev->pci_2_3 && !ctx->masked)
+ enable_irq(pdev->irq);
spin_unlock_irqrestore(&vdev->irqlock, flags);
return 0;
--
2.44.0
From: Vitor Soares <vitor.soares@toradex.com>
When the mcp251xfd_start_xmit() function fails, the driver stops
processing messages, and the interrupt routine does not return,
running indefinitely even after the application is killed.
Error messages:
[ 441.298819] mcp251xfd spi2.0 can0: ERROR in mcp251xfd_start_xmit: -16
[ 441.306498] mcp251xfd spi2.0 can0: Transmit Event FIFO buffer not empty. (seq=0x000017c7, tef_tail=0x000017cf, tef_head=0x000017d0, tx_head=0x000017d3).
... and repeat forever.
The issue can be triggered when multiple devices share the same SPI
interface and there is concurrent access to the bus.
The problem occurs because tx_ring->head is incremented even if
mcp251xfd_start_xmit() fails. Consequently, the driver skips one
TX packet while still expecting a response in
mcp251xfd_handle_tefif_one().
This patch resolves the issue by decrementing tx_ring->head if
mcp251xfd_start_xmit() fails. With the fix, if the issue is triggered
and err is -EBUSY, the driver returns NETDEV_TX_BUSY and the network
stack retries the transmission.
Otherwise, the driver prints an error and discards the message.
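A toy model of the ring accounting (userspace, illustrative names; the
free-space formula mirrors mcp251xfd_get_tx_free()):

  #include <stdio.h>

  struct ring { unsigned int head, tail, obj_num; };

  /* Free TX slots derive from head/tail, so a head left incremented
   * past a failed write permanently desynchronizes the driver from the
   * Transmit Event FIFO. */
  static unsigned int tx_free(const struct ring *r)
  {
          return r->obj_num - (r->head - r->tail);
  }

  int main(void)
  {
          struct ring r = { .head = 0, .tail = 0, .obj_num = 8 };

          r.head++;               /* slot claimed for the new TX object */
          /* ... the SPI write fails with -EBUSY ... */
          r.head--;               /* fix: give the slot back before retrying */

          printf("free after rollback: %u\n", tx_free(&r));
          return 0;
  }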
Fixes: 55e5b97f003e ("can: mcp25xxfd: add driver for Microchip MCP25xxFD SPI CAN")
Cc: stable@vger.kernel.org
Signed-off-by: Vitor Soares <vitor.soares@toradex.com>
---
V2->V3:
- Add tx_dropped stats.
- Call netdev_sent_queue() only if can_put_echo_skb() succeeds.
V1->V2:
- Return NETDEV_TX_BUSY if mcp251xfd_tx_obj_write() returns -EBUSY.
- Rework the commit message to address the change above.
- Call can_put_echo_skb() after mcp251xfd_tx_obj_write() succeeds;
otherwise, we get a kernel NULL pointer dereference.
drivers/net/can/spi/mcp251xfd/mcp251xfd-tx.c | 34 ++++++++++++--------
1 file changed, 21 insertions(+), 13 deletions(-)
diff --git a/drivers/net/can/spi/mcp251xfd/mcp251xfd-tx.c b/drivers/net/can/spi/mcp251xfd/mcp251xfd-tx.c
index 160528d3cc26..146c44e47c60 100644
--- a/drivers/net/can/spi/mcp251xfd/mcp251xfd-tx.c
+++ b/drivers/net/can/spi/mcp251xfd/mcp251xfd-tx.c
@@ -166,6 +166,7 @@ netdev_tx_t mcp251xfd_start_xmit(struct sk_buff *skb,
struct net_device *ndev)
{
struct mcp251xfd_priv *priv = netdev_priv(ndev);
+ struct net_device_stats *stats = &ndev->stats;
struct mcp251xfd_tx_ring *tx_ring = priv->tx;
struct mcp251xfd_tx_obj *tx_obj;
unsigned int frame_len;
@@ -181,25 +182,32 @@ netdev_tx_t mcp251xfd_start_xmit(struct sk_buff *skb,
tx_obj = mcp251xfd_get_tx_obj_next(tx_ring);
mcp251xfd_tx_obj_from_skb(priv, tx_obj, skb, tx_ring->head);
- /* Stop queue if we occupy the complete TX FIFO */
tx_head = mcp251xfd_get_tx_head(tx_ring);
- tx_ring->head++;
- if (mcp251xfd_get_tx_free(tx_ring) == 0)
- netif_stop_queue(ndev);
-
frame_len = can_skb_get_frame_len(skb);
- err = can_put_echo_skb(skb, ndev, tx_head, frame_len);
- if (!err)
- netdev_sent_queue(priv->ndev, frame_len);
+
+ tx_ring->head++;
err = mcp251xfd_tx_obj_write(priv, tx_obj);
- if (err)
- goto out_err;
+ if (err) {
+ tx_ring->head--;
- return NETDEV_TX_OK;
+ if (err == -EBUSY)
+ return NETDEV_TX_BUSY;
- out_err:
- netdev_err(priv->ndev, "ERROR in %s: %d\n", __func__, err);
+ stats->tx_dropped++;
+
+ if (net_ratelimit())
+ netdev_err(priv->ndev,
+ "ERROR in %s: %d\n", __func__, err);
+ } else {
+ err = can_put_echo_skb(skb, ndev, tx_head, frame_len);
+ if (!err)
+ netdev_sent_queue(priv->ndev, frame_len);
+
+ /* Stop queue if we occupy the complete TX FIFO */
+ if (mcp251xfd_get_tx_free(tx_ring) == 0)
+ netif_stop_queue(ndev);
+ }
return NETDEV_TX_OK;
}
--
2.34.1
When running the simult_flows selftest in slow environments, e.g. QEMU
without KVM support, the results can be unstable. This selftest checks
whether the aggregated bandwidth is (almost) fully used as expected.
To help improve stability while still keeping the same validation in
place, the bandwidth and the delay are reduced to lower the pressure
on the CPU.
Fixes: 1a418cb8e888 ("mptcp: simult flow self-tests")
Fixes: 219d04992b68 ("mptcp: push pending frames when subflow has free space")
Cc: stable@vger.kernel.org
Suggested-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
Link: https://lore.kernel.org/r/20240131-upstream-net-20240131-mptcp-ci-issues-v1…
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
(cherry picked from commit 5e2f3c65af47e527ccac54060cf909e3306652ff)
Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
---
Notes:
- Conflicts in simult_flows.sh, because v5.15 doesn't have commit
675d99338e7a ("selftests: mptcp: simult flows: format subtests
results in TAP") which modifies the context for a new but unrelated
feature.
- This is a new version of the one recently proposed by Sasha, this
time without dependencies:
https://lore.kernel.org/stable/9f185a3f-9373-401c-9a5c-ec0f106c0cbc@kernel.…
- This is the same patch as the one recently sent for v6.1:
https://lore.kernel.org/stable/20240311111224.1421344-2-matttbe@kernel.org/
---
tools/testing/selftests/net/mptcp/simult_flows.sh | 9 +++++----
1 file changed, 5 insertions(+), 4 deletions(-)
diff --git a/tools/testing/selftests/net/mptcp/simult_flows.sh b/tools/testing/selftests/net/mptcp/simult_flows.sh
index 752cef168804..cab3e3a5481d 100755
--- a/tools/testing/selftests/net/mptcp/simult_flows.sh
+++ b/tools/testing/selftests/net/mptcp/simult_flows.sh
@@ -289,10 +289,11 @@ done
setup
run_test 10 10 0 0 "balanced bwidth"
-run_test 10 10 1 50 "balanced bwidth with unbalanced delay"
+run_test 10 10 1 25 "balanced bwidth with unbalanced delay"
# we still need some additional infrastructure to pass the following test-cases
-run_test 30 10 0 0 "unbalanced bwidth"
-run_test 30 10 1 50 "unbalanced bwidth with unbalanced delay"
-run_test 30 10 50 1 "unbalanced bwidth with opposed, unbalanced delay"
+run_test 10 3 0 0 "unbalanced bwidth"
+run_test 10 3 1 25 "unbalanced bwidth with unbalanced delay"
+run_test 10 3 25 1 "unbalanced bwidth with opposed, unbalanced delay"
+
exit $ret
--
2.43.0
When running the simult_flows selftest in slow environments, e.g. QEMU
without KVM support, the results can be unstable. This selftest checks
whether the aggregated bandwidth is (almost) fully used as expected.
To help improve stability while still keeping the same validation in
place, the bandwidth and the delay are reduced to lower the pressure
on the CPU.
Fixes: 1a418cb8e888 ("mptcp: simult flow self-tests")
Fixes: 219d04992b68 ("mptcp: push pending frames when subflow has free space")
Cc: stable@vger.kernel.org
Suggested-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
Link: https://lore.kernel.org/r/20240131-upstream-net-20240131-mptcp-ci-issues-v1…
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
(cherry picked from commit 5e2f3c65af47e527ccac54060cf909e3306652ff)
Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
---
Notes:
- Conflicts in simult_flows.sh, because v6.1 doesn't have commit
675d99338e7a ("selftests: mptcp: simult flows: format subtests
results in TAP") which modifies the context for a new but unrelated
feature.
- This is a new version of the one recently proposed by Sasha, this
time without dependencies:
https://lore.kernel.org/stable/9f185a3f-9373-401c-9a5c-ec0f106c0cbc@kernel.…
---
tools/testing/selftests/net/mptcp/simult_flows.sh | 9 +++++----
1 file changed, 5 insertions(+), 4 deletions(-)
diff --git a/tools/testing/selftests/net/mptcp/simult_flows.sh b/tools/testing/selftests/net/mptcp/simult_flows.sh
index 4a417f9d51d6..ee24e06521e6 100755
--- a/tools/testing/selftests/net/mptcp/simult_flows.sh
+++ b/tools/testing/selftests/net/mptcp/simult_flows.sh
@@ -301,10 +301,11 @@ done
setup
run_test 10 10 0 0 "balanced bwidth"
-run_test 10 10 1 50 "balanced bwidth with unbalanced delay"
+run_test 10 10 1 25 "balanced bwidth with unbalanced delay"
# we still need some additional infrastructure to pass the following test-cases
-run_test 30 10 0 0 "unbalanced bwidth"
-run_test 30 10 1 50 "unbalanced bwidth with unbalanced delay"
-run_test 30 10 50 1 "unbalanced bwidth with opposed, unbalanced delay"
+run_test 10 3 0 0 "unbalanced bwidth"
+run_test 10 3 1 25 "unbalanced bwidth with unbalanced delay"
+run_test 10 3 25 1 "unbalanced bwidth with opposed, unbalanced delay"
+
exit $ret
--
2.43.0
The eventfd_ctx trigger pointer of the vfio_fsl_mc_irq object is
initially NULL and may become NULL if the user sets the trigger
eventfd to -1. The interrupt handler itself is guaranteed that
trigger is always valid between request_irq() and free_irq(), but
the loopback testing mechanisms to invoke the handler function
need to test the trigger. The triggering and setting ioctl paths
both make use of igate and are therefore mutually exclusive.
The vfio-fsl-mc driver does not make use of irqfds, nor does it
support any sort of masking operations; therefore, unlike vfio-pci
and vfio-platform, the flow can remain essentially unchanged.
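A kernel-style sketch of the guarded loopback signal (illustrative
struct; eventfd_signal() per its current one-argument form):

  #include <linux/eventfd.h>

  struct sketch_irq {
          struct eventfd_ctx *trigger;    /* NULL until the user sets an fd */
  };

  /* The hard IRQ handler may assume a valid trigger between
   * request_irq() and free_irq(); ioctl-driven loopback paths cannot,
   * so test it before signaling. */
  static void sketch_loopback_signal(struct sketch_irq *irq)
  {
          if (irq->trigger)
                  eventfd_signal(irq->trigger);
  }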
Cc: Diana Craciun <diana.craciun@oss.nxp.com>
Cc: stable@vger.kernel.org
Fixes: cc0ee20bd969 ("vfio/fsl-mc: trigger an interrupt via eventfd")
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---
drivers/vfio/fsl-mc/vfio_fsl_mc_intr.c | 7 ++++---
1 file changed, 4 insertions(+), 3 deletions(-)
diff --git a/drivers/vfio/fsl-mc/vfio_fsl_mc_intr.c b/drivers/vfio/fsl-mc/vfio_fsl_mc_intr.c
index d62fbfff20b8..82b2afa9b7e3 100644
--- a/drivers/vfio/fsl-mc/vfio_fsl_mc_intr.c
+++ b/drivers/vfio/fsl-mc/vfio_fsl_mc_intr.c
@@ -141,13 +141,14 @@ static int vfio_fsl_mc_set_irq_trigger(struct vfio_fsl_mc_device *vdev,
irq = &vdev->mc_irqs[index];
if (flags & VFIO_IRQ_SET_DATA_NONE) {
- vfio_fsl_mc_irq_handler(hwirq, irq);
+ if (irq->trigger)
+ eventfd_signal(irq->trigger);
} else if (flags & VFIO_IRQ_SET_DATA_BOOL) {
u8 trigger = *(u8 *)data;
- if (trigger)
- vfio_fsl_mc_irq_handler(hwirq, irq);
+ if (trigger && irq->trigger)
+ eventfd_signal(irq->trigger);
}
return 0;
--
2.44.0
irqfds for mask and unmask that are not specifically disabled by the
user are leaked. Remove any irqfds during cleanup.
Cc: Eric Auger <eric.auger@redhat.com>
Cc: stable@vger.kernel.org
Fixes: a7fa7c77cf15 ("vfio/platform: implement IRQ masking/unmasking via an eventfd")
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---
drivers/vfio/platform/vfio_platform_irq.c | 5 ++++-
1 file changed, 4 insertions(+), 1 deletion(-)
diff --git a/drivers/vfio/platform/vfio_platform_irq.c b/drivers/vfio/platform/vfio_platform_irq.c
index 61a1bfb68ac7..e5dcada9e86c 100644
--- a/drivers/vfio/platform/vfio_platform_irq.c
+++ b/drivers/vfio/platform/vfio_platform_irq.c
@@ -321,8 +321,11 @@ void vfio_platform_irq_cleanup(struct vfio_platform_device *vdev)
{
int i;
- for (i = 0; i < vdev->num_irqs; i++)
+ for (i = 0; i < vdev->num_irqs; i++) {
+ vfio_virqfd_disable(&vdev->irqs[i].mask);
+ vfio_virqfd_disable(&vdev->irqs[i].unmask);
vfio_set_trigger(vdev, i, -1, NULL);
+ }
vdev->num_irqs = 0;
kfree(vdev->irqs);
--
2.44.0
A vulnerability exists where the eventfd for INTx signaling can be
deconfigured, which unregisters the IRQ handler but still allows
eventfds to be signaled with a NULL context through the SET_IRQS ioctl
or through unmask irqfd if the device interrupt is pending.
Ideally this could be solved with some additional locking; the igate
mutex serializes the ioctl and config space accesses, and the interrupt
handler is unregistered relative to the trigger, but the irqfd path
runs asynchronously to those. The igate mutex cannot be acquired from the
atomic context of the eventfd wake function. Disabling the irqfd
relative to the eventfd registration is potentially incompatible with
existing userspace.
As a result, the solution implemented here moves configuration of the
INTx interrupt handler to track the lifetime of the INTx context object
and irq_type configuration, rather than registration of a particular
trigger eventfd. Synchronization is added between the ioctl path and
eventfd_signal() wrapper such that the eventfd trigger can be
dynamically updated relative to in-flight interrupts or irqfd callbacks.
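A kernel-style sketch of the publish/retire pattern (names are
illustrative; the consumer side reads the pointer with READ_ONCE() as
in vfio_send_intx_eventfd()):

  #include <linux/eventfd.h>
  #include <linux/interrupt.h>

  /* Swap the trigger out from under a live IRQ handler: publish the
   * new pointer, then wait out any in-flight users of the old one
   * before dropping its reference. */
  static void sketch_swap_trigger(struct eventfd_ctx **slot,
                                  struct eventfd_ctx *new_trigger,
                                  unsigned int irq)
  {
          struct eventfd_ctx *old = *slot;

          WRITE_ONCE(*slot, new_trigger); /* NULL means "no trigger" */

          if (old) {
                  synchronize_irq(irq);   /* in-flight handlers are done */
                  eventfd_ctx_put(old);   /* now safe to drop the reference */
          }
  }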
Cc: stable@vger.kernel.org
Fixes: 89e1f7d4c66d ("vfio: Add PCI device driver")
Reported-by: Reinette Chatre <reinette.chatre@intel.com>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Reviewed-by: Reinette Chatre <reinette.chatre@intel.com>
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---
drivers/vfio/pci/vfio_pci_intrs.c | 145 ++++++++++++++++--------------
1 file changed, 78 insertions(+), 67 deletions(-)
diff --git a/drivers/vfio/pci/vfio_pci_intrs.c b/drivers/vfio/pci/vfio_pci_intrs.c
index 75c85eec21b3..fb5392b749ff 100644
--- a/drivers/vfio/pci/vfio_pci_intrs.c
+++ b/drivers/vfio/pci/vfio_pci_intrs.c
@@ -90,11 +90,15 @@ static void vfio_send_intx_eventfd(void *opaque, void *unused)
if (likely(is_intx(vdev) && !vdev->virq_disabled)) {
struct vfio_pci_irq_ctx *ctx;
+ struct eventfd_ctx *trigger;
ctx = vfio_irq_ctx_get(vdev, 0);
if (WARN_ON_ONCE(!ctx))
return;
- eventfd_signal(ctx->trigger);
+
+ trigger = READ_ONCE(ctx->trigger);
+ if (likely(trigger))
+ eventfd_signal(trigger);
}
}
@@ -253,100 +257,100 @@ static irqreturn_t vfio_intx_handler(int irq, void *dev_id)
return ret;
}
-static int vfio_intx_enable(struct vfio_pci_core_device *vdev)
+static int vfio_intx_enable(struct vfio_pci_core_device *vdev,
+ struct eventfd_ctx *trigger)
{
+ struct pci_dev *pdev = vdev->pdev;
struct vfio_pci_irq_ctx *ctx;
+ unsigned long irqflags;
+ char *name;
+ int ret;
if (!is_irq_none(vdev))
return -EINVAL;
- if (!vdev->pdev->irq)
+ if (!pdev->irq)
return -ENODEV;
+ name = kasprintf(GFP_KERNEL_ACCOUNT, "vfio-intx(%s)", pci_name(pdev));
+ if (!name)
+ return -ENOMEM;
+
ctx = vfio_irq_ctx_alloc(vdev, 0);
if (!ctx)
return -ENOMEM;
+ ctx->name = name;
+ ctx->trigger = trigger;
+
/*
- * If the virtual interrupt is masked, restore it. Devices
- * supporting DisINTx can be masked at the hardware level
- * here, non-PCI-2.3 devices will have to wait until the
- * interrupt is enabled.
+ * Fill the initial masked state based on virq_disabled. After
+ * enable, changing the DisINTx bit in vconfig directly changes INTx
+ * masking. igate prevents races during setup, once running masked
+ * is protected via irqlock.
+ *
+ * Devices supporting DisINTx also reflect the current mask state in
+ * the physical DisINTx bit, which is not affected during IRQ setup.
+ *
+ * Devices without DisINTx support require an exclusive interrupt.
+ * IRQ masking is performed at the IRQ chip. Again, igate protects
+ * against races during setup and IRQ handlers and irqfds are not
+ * yet active, therefore masked is stable and can be used to
+ * conditionally auto-enable the IRQ.
+ *
+ * irq_type must be stable while the IRQ handler is registered,
+ * therefore it must be set before request_irq().
*/
ctx->masked = vdev->virq_disabled;
- if (vdev->pci_2_3)
- pci_intx(vdev->pdev, !ctx->masked);
+ if (vdev->pci_2_3) {
+ pci_intx(pdev, !ctx->masked);
+ irqflags = IRQF_SHARED;
+ } else {
+ irqflags = ctx->masked ? IRQF_NO_AUTOEN : 0;
+ }
vdev->irq_type = VFIO_PCI_INTX_IRQ_INDEX;
+ ret = request_irq(pdev->irq, vfio_intx_handler,
+ irqflags, ctx->name, vdev);
+ if (ret) {
+ vdev->irq_type = VFIO_PCI_NUM_IRQS;
+ kfree(name);
+ vfio_irq_ctx_free(vdev, ctx, 0);
+ return ret;
+ }
+
return 0;
}
-static int vfio_intx_set_signal(struct vfio_pci_core_device *vdev, int fd)
+static int vfio_intx_set_signal(struct vfio_pci_core_device *vdev,
+ struct eventfd_ctx *trigger)
{
struct pci_dev *pdev = vdev->pdev;
- unsigned long irqflags = IRQF_SHARED;
struct vfio_pci_irq_ctx *ctx;
- struct eventfd_ctx *trigger;
- unsigned long flags;
- int ret;
+ struct eventfd_ctx *old;
ctx = vfio_irq_ctx_get(vdev, 0);
if (WARN_ON_ONCE(!ctx))
return -EINVAL;
- if (ctx->trigger) {
- free_irq(pdev->irq, vdev);
- kfree(ctx->name);
- eventfd_ctx_put(ctx->trigger);
- ctx->trigger = NULL;
- }
-
- if (fd < 0) /* Disable only */
- return 0;
-
- ctx->name = kasprintf(GFP_KERNEL_ACCOUNT, "vfio-intx(%s)",
- pci_name(pdev));
- if (!ctx->name)
- return -ENOMEM;
-
- trigger = eventfd_ctx_fdget(fd);
- if (IS_ERR(trigger)) {
- kfree(ctx->name);
- return PTR_ERR(trigger);
- }
+ old = ctx->trigger;
- ctx->trigger = trigger;
+ WRITE_ONCE(ctx->trigger, trigger);
- /*
- * Devices without DisINTx support require an exclusive interrupt,
- * IRQ masking is performed at the IRQ chip. The masked status is
- * protected by vdev->irqlock. Setup the IRQ without auto-enable and
- * unmask as necessary below under lock. DisINTx is unmodified by
- * the IRQ configuration and may therefore use auto-enable.
- */
- if (!vdev->pci_2_3)
- irqflags = IRQF_NO_AUTOEN;
-
- ret = request_irq(pdev->irq, vfio_intx_handler,
- irqflags, ctx->name, vdev);
- if (ret) {
- ctx->trigger = NULL;
- kfree(ctx->name);
- eventfd_ctx_put(trigger);
- return ret;
+ /* Releasing an old ctx requires synchronizing in-flight users */
+ if (old) {
+ synchronize_irq(pdev->irq);
+ vfio_virqfd_flush_thread(&ctx->unmask);
+ eventfd_ctx_put(old);
}
- spin_lock_irqsave(&vdev->irqlock, flags);
- if (!vdev->pci_2_3 && !ctx->masked)
- enable_irq(pdev->irq);
- spin_unlock_irqrestore(&vdev->irqlock, flags);
-
return 0;
}
static void vfio_intx_disable(struct vfio_pci_core_device *vdev)
{
+ struct pci_dev *pdev = vdev->pdev;
struct vfio_pci_irq_ctx *ctx;
ctx = vfio_irq_ctx_get(vdev, 0);
@@ -354,10 +358,13 @@ static void vfio_intx_disable(struct vfio_pci_core_device *vdev)
if (ctx) {
vfio_virqfd_disable(&ctx->unmask);
vfio_virqfd_disable(&ctx->mask);
+ free_irq(pdev->irq, vdev);
+ if (ctx->trigger)
+ eventfd_ctx_put(ctx->trigger);
+ kfree(ctx->name);
+ vfio_irq_ctx_free(vdev, ctx, 0);
}
- vfio_intx_set_signal(vdev, -1);
vdev->irq_type = VFIO_PCI_NUM_IRQS;
- vfio_irq_ctx_free(vdev, ctx, 0);
}
/*
@@ -641,19 +648,23 @@ static int vfio_pci_set_intx_trigger(struct vfio_pci_core_device *vdev,
return -EINVAL;
if (flags & VFIO_IRQ_SET_DATA_EVENTFD) {
+ struct eventfd_ctx *trigger = NULL;
int32_t fd = *(int32_t *)data;
int ret;
- if (is_intx(vdev))
- return vfio_intx_set_signal(vdev, fd);
+ if (fd >= 0) {
+ trigger = eventfd_ctx_fdget(fd);
+ if (IS_ERR(trigger))
+ return PTR_ERR(trigger);
+ }
- ret = vfio_intx_enable(vdev);
- if (ret)
- return ret;
+ if (is_intx(vdev))
+ ret = vfio_intx_set_signal(vdev, trigger);
+ else
+ ret = vfio_intx_enable(vdev, trigger);
- ret = vfio_intx_set_signal(vdev, fd);
- if (ret)
- vfio_intx_disable(vdev);
+ if (ret && trigger)
+ eventfd_ctx_put(trigger);
return ret;
}
--
2.44.0