From: Souradeep Chakrabarti <schakrabarti@linux.microsoft.com>
This patch addresses the VF unload issue where mana_dealloc_queues() gets stuck in an infinite while loop because the host is unresponsive. It adds a timeout to the while loop to fix it.
Also, this patch adds a new attribute in mana_context, which gets set when mana_hwc_send_request() hits a timeout because of host unresponsiveness. This flag then helps avoid the timeouts in successive calls.
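In short, the unbounded wait is replaced by a deadline, and anything still pending after the deadline is unmapped and freed explicitly, since the host will never complete it. A minimal sketch of the pattern (simplified from the hunks below):

	unsigned long timeout = jiffies + 120 * HZ;	/* overall drain budget */

	while (atomic_read(&txq->pending_sends) > 0 &&
	       time_before(jiffies, timeout))
		usleep_range(1000, 2000);	/* poll every 1-2 ms */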
Signed-off-by: Souradeep Chakrabarti <schakrabarti@linux.microsoft.com>
---
V1 -> V2:
* Added net branch
* Removed the typecasting to (struct mana_context*) of void pointer
* Repositioned timeout variable in mana_dealloc_queues()
* Repositioned vf_unload_timeout in mana_context struct, to utilise the
  6 bytes hole
---
 .../net/ethernet/microsoft/mana/gdma_main.c   |  4 +++-
 .../net/ethernet/microsoft/mana/hw_channel.c  | 12 ++++++++++-
 drivers/net/ethernet/microsoft/mana/mana_en.c | 21 +++++++++++++++++--
 include/net/mana/mana.h                       |  2 ++
 4 files changed, 35 insertions(+), 4 deletions(-)
diff --git a/drivers/net/ethernet/microsoft/mana/gdma_main.c b/drivers/net/ethernet/microsoft/mana/gdma_main.c
index 8f3f78b68592..6411f01be0d9 100644
--- a/drivers/net/ethernet/microsoft/mana/gdma_main.c
+++ b/drivers/net/ethernet/microsoft/mana/gdma_main.c
@@ -946,10 +946,12 @@ int mana_gd_deregister_device(struct gdma_dev *gd)
 	struct gdma_context *gc = gd->gdma_context;
 	struct gdma_general_resp resp = {};
 	struct gdma_general_req req = {};
+	struct mana_context *ac;
 	int err;
 
 	if (gd->pdid == INVALID_PDID)
 		return -EINVAL;
+	ac = gd->driver_data;
 
 	mana_gd_init_req_hdr(&req.hdr, GDMA_DEREGISTER_DEVICE, sizeof(req),
 			     sizeof(resp));
@@ -957,7 +959,7 @@ int mana_gd_deregister_device(struct gdma_dev *gd)
 	req.hdr.dev_id = gd->dev_id;
 
 	err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp);
-	if (err || resp.hdr.status) {
+	if ((err || resp.hdr.status) && !ac->vf_unload_timeout) {
 		dev_err(gc->dev, "Failed to deregister device: %d, 0x%x\n",
 			err, resp.hdr.status);
 		if (!err)
diff --git a/drivers/net/ethernet/microsoft/mana/hw_channel.c b/drivers/net/ethernet/microsoft/mana/hw_channel.c
index 9d1507eba5b9..492cb2c6e2cb 100644
--- a/drivers/net/ethernet/microsoft/mana/hw_channel.c
+++ b/drivers/net/ethernet/microsoft/mana/hw_channel.c
@@ -1,8 +1,10 @@
 // SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
 /* Copyright (c) 2021, Microsoft Corporation. */
 
+#include "asm-generic/errno.h"
 #include <net/mana/gdma.h>
 #include <net/mana/hw_channel.h>
+#include <net/mana/mana.h>
 
 static int mana_hwc_get_msg_index(struct hw_channel_context *hwc, u16 *msg_id)
 {
@@ -786,12 +788,19 @@ int mana_hwc_send_request(struct hw_channel_context *hwc, u32 req_len,
 	struct hwc_wq *txq = hwc->txq;
 	struct gdma_req_hdr *req_msg;
 	struct hwc_caller_ctx *ctx;
+	struct mana_context *ac;
 	u32 dest_vrcq = 0;
 	u32 dest_vrq = 0;
 	u16 msg_id;
 	int err;
 
 	mana_hwc_get_msg_index(hwc, &msg_id);
+	ac = hwc->gdma_dev->driver_data;
+	if (ac->vf_unload_timeout) {
+		dev_err(hwc->dev, "HWC: vport is already unloaded.\n");
+		err = -ETIMEDOUT;
+		goto out;
+	}
 
 	tx_wr = &txq->msg_buf->reqs[msg_id];
 
@@ -825,9 +834,10 @@ int mana_hwc_send_request(struct hw_channel_context *hwc, u32 req_len,
 		goto out;
 	}
 
-	if (!wait_for_completion_timeout(&ctx->comp_event, 30 * HZ)) {
+	if (!wait_for_completion_timeout(&ctx->comp_event, 5 * HZ)) {
 		dev_err(hwc->dev, "HWC: Request timed out!\n");
 		err = -ETIMEDOUT;
+		ac->vf_unload_timeout = true;
 		goto out;
 	}
diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c
index d907727c7b7a..cb2080b3a00c 100644
--- a/drivers/net/ethernet/microsoft/mana/mana_en.c
+++ b/drivers/net/ethernet/microsoft/mana/mana_en.c
@@ -2329,7 +2329,10 @@ static int mana_dealloc_queues(struct net_device *ndev)
 {
 	struct mana_port_context *apc = netdev_priv(ndev);
 	struct gdma_dev *gd = apc->ac->gdma_dev;
+	unsigned long timeout;
 	struct mana_txq *txq;
+	struct sk_buff *skb;
+	struct mana_cq *cq;
 	int i, err;
 
 	if (apc->port_is_up)
@@ -2348,13 +2351,26 @@ static int mana_dealloc_queues(struct net_device *ndev)
 	 *
 	 * Drain all the in-flight TX packets
 	 */
+
+	timeout = jiffies + 120 * HZ;
 	for (i = 0; i < apc->num_queues; i++) {
 		txq = &apc->tx_qp[i].txq;
-
-		while (atomic_read(&txq->pending_sends) > 0)
+		while (atomic_read(&txq->pending_sends) > 0 &&
+		       time_before(jiffies, timeout)) {
 			usleep_range(1000, 2000);
+		}
 	}
 
+	for (i = 0; i < apc->num_queues; i++) {
+		txq = &apc->tx_qp[i].txq;
+		cq = &apc->tx_qp[i].tx_cq;
+		while (atomic_read(&txq->pending_sends)) {
+			skb = skb_dequeue(&txq->pending_skbs);
+			mana_unmap_skb(skb, apc);
+			napi_consume_skb(skb, cq->budget);
+			atomic_sub(1, &txq->pending_sends);
+		}
+	}
 	/* We're 100% sure the queues can no longer be woken up, because
 	 * we're sure now mana_poll_tx_cq() can't be running.
 	 */
@@ -2605,6 +2621,7 @@ int mana_probe(struct gdma_dev *gd, bool resuming)
 		}
 	}
 
+	ac->vf_unload_timeout = false;
 	err = add_adev(gd);
 out:
 	if (err)
diff --git a/include/net/mana/mana.h b/include/net/mana/mana.h
index 9eef19972845..5f5affdca1eb 100644
--- a/include/net/mana/mana.h
+++ b/include/net/mana/mana.h
@@ -358,6 +358,8 @@ struct mana_context {
 
 	u16 num_ports;
 
+	bool vf_unload_timeout;
+
 	struct mana_eq *eqs;
 
 	struct net_device *ports[MAX_PORTS_IN_MANA_DEV];
Hi,
Thanks for your patch.
FYI: kernel test robot notices the stable kernel rule is not satisfied.
Rule: 'Cc: stable@vger.kernel.org' or 'commit <sha1> upstream.'
Subject: [PATCH V2 net] net: mana: Fix MANA VF unload when host is unresponsive
Link: https://lore.kernel.org/stable/1687505355-29212-1-git-send-email-schakrabart...
The check is based on https://www.kernel.org/doc/html/latest/process/stable-kernel-rules.html
Fri, Jun 23, 2023 at 09:29:15AM CEST, schakrabarti@linux.microsoft.com wrote:
From: Souradeep Chakrabarti <schakrabarti@linux.microsoft.com>
This patch addresses the VF unload issue, where mana_dealloc_queues()
double space here ^^
gets stuck in infinite while loop, because of host unresponsiveness. It adds a timeout in the while loop, to fix it.
Also this patch adds a new attribute in mana_context, which gets set when mana_hwc_send_request() hits a timeout because of host unresponsiveness. This flag then helps to avoid the timeouts in successive calls.
You apparently combine 2 patches together. Please split.
Signed-off-by: Souradeep Chakrabarti <schakrabarti@linux.microsoft.com>
Please provide a "Fixes" tag with pointer to the commit that introduced the problem.
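For example, the tag block would gain something like the following (the sha1 and subject here are placeholders for the commit that introduced the unbounded loop, not the real one):

	Fixes: 123456789abc ("net: mana: <subject of the offending commit>")
	Cc: stable@vger.kernel.org
	Signed-off-by: Souradeep Chakrabarti <schakrabarti@linux.microsoft.com>

The Cc: stable line would also satisfy the stable-kernel-rules check flagged by the test robot above.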
V1 -> V2:
- Added net branch
- Removed the typecasting to (struct mana_context*) of void pointer
- Repositioned timeout variable in mana_dealloc_queues()
- Repositioned vf_unload_timeout in mana_context struct, to utilise the
6 bytes hole
 .../net/ethernet/microsoft/mana/gdma_main.c   |  4 +++-
 .../net/ethernet/microsoft/mana/hw_channel.c  | 12 ++++++++++-
 drivers/net/ethernet/microsoft/mana/mana_en.c | 21 +++++++++++++++++--
 include/net/mana/mana.h                       |  2 ++
 4 files changed, 35 insertions(+), 4 deletions(-)
diff --git a/drivers/net/ethernet/microsoft/mana/gdma_main.c b/drivers/net/ethernet/microsoft/mana/gdma_main.c
index 8f3f78b68592..6411f01be0d9 100644
--- a/drivers/net/ethernet/microsoft/mana/gdma_main.c
+++ b/drivers/net/ethernet/microsoft/mana/gdma_main.c
@@ -946,10 +946,12 @@ int mana_gd_deregister_device(struct gdma_dev *gd)
 	struct gdma_context *gc = gd->gdma_context;
 	struct gdma_general_resp resp = {};
 	struct gdma_general_req req = {};
+	struct mana_context *ac;
 	int err;
 
 	if (gd->pdid == INVALID_PDID)
 		return -EINVAL;
+	ac = gd->driver_data;
 
 	mana_gd_init_req_hdr(&req.hdr, GDMA_DEREGISTER_DEVICE, sizeof(req),
 			     sizeof(resp));
@@ -957,7 +959,7 @@ int mana_gd_deregister_device(struct gdma_dev *gd)
 	req.hdr.dev_id = gd->dev_id;
 
 	err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp);
-	if (err || resp.hdr.status) {
+	if ((err || resp.hdr.status) && !ac->vf_unload_timeout) {
 		dev_err(gc->dev, "Failed to deregister device: %d, 0x%x\n",
 			err, resp.hdr.status);
 		if (!err)
diff --git a/drivers/net/ethernet/microsoft/mana/hw_channel.c b/drivers/net/ethernet/microsoft/mana/hw_channel.c
index 9d1507eba5b9..492cb2c6e2cb 100644
--- a/drivers/net/ethernet/microsoft/mana/hw_channel.c
+++ b/drivers/net/ethernet/microsoft/mana/hw_channel.c
@@ -1,8 +1,10 @@
 // SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
 /* Copyright (c) 2021, Microsoft Corporation. */
 
+#include "asm-generic/errno.h"
 #include <net/mana/gdma.h>
 #include <net/mana/hw_channel.h>
+#include <net/mana/mana.h>
 
 static int mana_hwc_get_msg_index(struct hw_channel_context *hwc, u16 *msg_id)
 {
@@ -786,12 +788,19 @@ int mana_hwc_send_request(struct hw_channel_context *hwc, u32 req_len,
 	struct hwc_wq *txq = hwc->txq;
 	struct gdma_req_hdr *req_msg;
 	struct hwc_caller_ctx *ctx;
+	struct mana_context *ac;
 	u32 dest_vrcq = 0;
 	u32 dest_vrq = 0;
 	u16 msg_id;
 	int err;
 
 	mana_hwc_get_msg_index(hwc, &msg_id);
+	ac = hwc->gdma_dev->driver_data;
+	if (ac->vf_unload_timeout) {
+		dev_err(hwc->dev, "HWC: vport is already unloaded.\n");
+		err = -ETIMEDOUT;
+		goto out;
+	}
 
 	tx_wr = &txq->msg_buf->reqs[msg_id];
 
@@ -825,9 +834,10 @@ int mana_hwc_send_request(struct hw_channel_context *hwc, u32 req_len,
 		goto out;
 	}
 
-	if (!wait_for_completion_timeout(&ctx->comp_event, 30 * HZ)) {
+	if (!wait_for_completion_timeout(&ctx->comp_event, 5 * HZ)) {
 		dev_err(hwc->dev, "HWC: Request timed out!\n");
 		err = -ETIMEDOUT;
+		ac->vf_unload_timeout = true;
 		goto out;
 	}
diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c
index d907727c7b7a..cb2080b3a00c 100644
--- a/drivers/net/ethernet/microsoft/mana/mana_en.c
+++ b/drivers/net/ethernet/microsoft/mana/mana_en.c
@@ -2329,7 +2329,10 @@ static int mana_dealloc_queues(struct net_device *ndev)
 {
 	struct mana_port_context *apc = netdev_priv(ndev);
 	struct gdma_dev *gd = apc->ac->gdma_dev;
+	unsigned long timeout;
 	struct mana_txq *txq;
+	struct sk_buff *skb;
+	struct mana_cq *cq;
 	int i, err;
 
 	if (apc->port_is_up)
@@ -2348,13 +2351,26 @@ static int mana_dealloc_queues(struct net_device *ndev)
 	 *
 	 * Drain all the in-flight TX packets
 	 */
+
+	timeout = jiffies + 120 * HZ;
 	for (i = 0; i < apc->num_queues; i++) {
 		txq = &apc->tx_qp[i].txq;
-
-		while (atomic_read(&txq->pending_sends) > 0)
+		while (atomic_read(&txq->pending_sends) > 0 &&
+		       time_before(jiffies, timeout)) {
 			usleep_range(1000, 2000);
+		}
 	}
 
+	for (i = 0; i < apc->num_queues; i++) {
+		txq = &apc->tx_qp[i].txq;
+		cq = &apc->tx_qp[i].tx_cq;
+		while (atomic_read(&txq->pending_sends)) {
+			skb = skb_dequeue(&txq->pending_skbs);
+			mana_unmap_skb(skb, apc);
+			napi_consume_skb(skb, cq->budget);
+			atomic_sub(1, &txq->pending_sends);
+		}
+	}
 	/* We're 100% sure the queues can no longer be woken up, because
 	 * we're sure now mana_poll_tx_cq() can't be running.
 	 */
@@ -2605,6 +2621,7 @@ int mana_probe(struct gdma_dev *gd, bool resuming)
 		}
 	}
 
+	ac->vf_unload_timeout = false;
Pointless init. You have the struct zeroed during allocation.
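That is, the mana_context is allocated with a zeroing allocator in mana_probe(), so vf_unload_timeout already starts out as false without the explicit assignment; roughly:

	ac = kzalloc(sizeof(*ac), GFP_KERNEL);	/* zero-fills every field, including vf_unload_timeout */
	if (!ac)
		return -ENOMEM;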
 	err = add_adev(gd);
 out:
 	if (err)
diff --git a/include/net/mana/mana.h b/include/net/mana/mana.h
index 9eef19972845..5f5affdca1eb 100644
--- a/include/net/mana/mana.h
+++ b/include/net/mana/mana.h
@@ -358,6 +358,8 @@ struct mana_context {
 
 	u16 num_ports;
 
+	bool vf_unload_timeout;
+
 	struct mana_eq *eqs;
 
 	struct net_device *ports[MAX_PORTS_IN_MANA_DEV];
-- 
2.34.1
On Fri, Jun 23, 2023 at 12:29:15AM -0700, souradeep chakrabarti wrote:
From: Souradeep Chakrabarti <schakrabarti@linux.microsoft.com>
This patch addresses the VF unload issue, where mana_dealloc_queues() gets stuck in infinite while loop, because of host unresponsiveness. It adds a timeout in the while loop, to fix it.
Also this patch adds a new attribute in mana_context, which gets set when mana_hwc_send_request() hits a timeout because of host unresponsiveness. This flag then helps to avoid the timeouts in successive calls.
Signed-off-by: Souradeep Chakrabarti <schakrabarti@linux.microsoft.com>
...
diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c
index d907727c7b7a..cb2080b3a00c 100644
--- a/drivers/net/ethernet/microsoft/mana/mana_en.c
+++ b/drivers/net/ethernet/microsoft/mana/mana_en.c
...
@@ -2348,13 +2351,26 @@ static int mana_dealloc_queues(struct net_device *ndev)
 	 *
 	 * Drain all the in-flight TX packets
 	 */
+
+	timeout = jiffies + 120 * HZ;
 	for (i = 0; i < apc->num_queues; i++) {
 		txq = &apc->tx_qp[i].txq;
-
-		while (atomic_read(&txq->pending_sends) > 0)
+		while (atomic_read(&txq->pending_sends) > 0 &&
+		       time_before(jiffies, timeout)) {
 			usleep_range(1000, 2000);
+		}
Hi Souradeep,
minor feedback from my side, as it seems there will be a new version anyway: I think the braces - '{' '}' - are unnecessary above.
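That is, with a single-statement body the loop could simply read:

	while (atomic_read(&txq->pending_sends) > 0 &&
	       time_before(jiffies, timeout))
		usleep_range(1000, 2000);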
...