From: Praveen Kaligineedi <pkaligineedi(a)google.com>
gve_tx_timeout was calculating missed completions in a way that is only
relevant in the GQ queue format. Additionally, it was attempting to
disable device interrupts, which is not needed in either GQ or DQ queue
formats.
As a result, TX timeouts with the DQ queue format likely would have
triggered early resets without kicking the queue at all.
This patch drops the check for pending work altogether and always kicks
the queue after validating the queue has not seen a TX timeout too
recently.
Fixes: 87a7f321bb6a ("gve: Recover from queue stall due to missed IRQ")
Co-developed-by: Tim Hostetler <thostet(a)google.com>
Signed-off-by: Tim Hostetler <thostet(a)google.com>
Signed-off-by: Praveen Kaligineedi <pkaligineedi(a)google.com>
Signed-off-by: Harshitha Ramamurthy <hramamurthy(a)google.com>
---
drivers/net/ethernet/google/gve/gve_main.c | 16 ++++------------
1 file changed, 4 insertions(+), 12 deletions(-)
diff --git a/drivers/net/ethernet/google/gve/gve_main.c b/drivers/net/ethernet/google/gve/gve_main.c
index c3791cf..0c6328b 100644
--- a/drivers/net/ethernet/google/gve/gve_main.c
+++ b/drivers/net/ethernet/google/gve/gve_main.c
@@ -1921,7 +1921,6 @@ static void gve_tx_timeout(struct net_device *dev, unsigned int txqueue)
struct gve_notify_block *block;
struct gve_tx_ring *tx = NULL;
struct gve_priv *priv;
- u32 last_nic_done;
u32 current_time;
u32 ntfy_idx;
@@ -1941,17 +1940,10 @@ static void gve_tx_timeout(struct net_device *dev, unsigned int txqueue)
if (tx->last_kick_msec + MIN_TX_TIMEOUT_GAP > current_time)
goto reset;
- /* Check to see if there are missed completions, which will allow us to
- * kick the queue.
- */
- last_nic_done = gve_tx_load_event_counter(priv, tx);
- if (last_nic_done - tx->done) {
- netdev_info(dev, "Kicking queue %d", txqueue);
- iowrite32be(GVE_IRQ_MASK, gve_irq_doorbell(priv, block));
- napi_schedule(&block->napi);
- tx->last_kick_msec = current_time;
- goto out;
- } // Else reset.
+ netdev_info(dev, "Kicking queue %d", txqueue);
+ napi_schedule(&block->napi);
+ tx->last_kick_msec = current_time;
+ goto out;
reset:
gve_schedule_reset(priv);
--
2.49.0.805.g082f7c87e0-goog
This reverts commit d95fdee2253e612216e72f29c65b92ec42d254eb which is
upstream commit be4ae8c19492cd6d5de61ccb34ffb3f5ede5eec8.
This commit is causing a suspend regression on Tegra186 Jetson TX2 with
Linux v6.12.y kernels. This is not seen with Linux v6.15 that includes
this change but indicates that there are there changes missing.
Therefore, revert this change.
Fixes: d95fdee2253e ("cpufreq: tegra186: Share policy per cluster")
Link: https://lore.kernel.org/linux-tegra/bf1dabf7-0337-40e9-8b8e-4e93a0ffd4cc@nv…
Signed-off-by: Jon Hunter <jonathanh(a)nvidia.com>
---
drivers/cpufreq/tegra186-cpufreq.c | 7 -------
1 file changed, 7 deletions(-)
diff --git a/drivers/cpufreq/tegra186-cpufreq.c b/drivers/cpufreq/tegra186-cpufreq.c
index 4e5b6f9a56d1..7b8fcfa55038 100644
--- a/drivers/cpufreq/tegra186-cpufreq.c
+++ b/drivers/cpufreq/tegra186-cpufreq.c
@@ -73,18 +73,11 @@ static int tegra186_cpufreq_init(struct cpufreq_policy *policy)
{
struct tegra186_cpufreq_data *data = cpufreq_get_driver_data();
unsigned int cluster = data->cpus[policy->cpu].bpmp_cluster_id;
- u32 cpu;
policy->freq_table = data->clusters[cluster].table;
policy->cpuinfo.transition_latency = 300 * 1000;
policy->driver_data = NULL;
- /* set same policy for all cpus in a cluster */
- for (cpu = 0; cpu < ARRAY_SIZE(tegra186_cpus); cpu++) {
- if (data->cpus[cpu].bpmp_cluster_id == cluster)
- cpumask_set_cpu(cpu, policy->cpus);
- }
-
return 0;
}
--
2.43.0
This reverts commit 89172666228de1cefcacf5bc6f61c6281751d2ed which is
upstream commit be4ae8c19492cd6d5de61ccb34ffb3f5ede5eec8.
This commit is causing a suspend regression on Tegra186 Jetson TX2 with
Linux v6.12.y kernels. This is not seen with Linux v6.15 that includes
this change but indicates that there are there changes missing.
Therefore, revert this change.
Link: https://lore.kernel.org/linux-tegra/bf1dabf7-0337-40e9-8b8e-4e93a0ffd4cc@nv…
Fixes: 89172666228d ("cpufreq: tegra186: Share policy per cluster")
Signed-off-by: Jon Hunter <jonathanh(a)nvidia.com>
---
drivers/cpufreq/tegra186-cpufreq.c | 7 -------
1 file changed, 7 deletions(-)
diff --git a/drivers/cpufreq/tegra186-cpufreq.c b/drivers/cpufreq/tegra186-cpufreq.c
index 1d6b54303723..6c88827f4e62 100644
--- a/drivers/cpufreq/tegra186-cpufreq.c
+++ b/drivers/cpufreq/tegra186-cpufreq.c
@@ -73,18 +73,11 @@ static int tegra186_cpufreq_init(struct cpufreq_policy *policy)
{
struct tegra186_cpufreq_data *data = cpufreq_get_driver_data();
unsigned int cluster = data->cpus[policy->cpu].bpmp_cluster_id;
- u32 cpu;
policy->freq_table = data->clusters[cluster].table;
policy->cpuinfo.transition_latency = 300 * 1000;
policy->driver_data = NULL;
- /* set same policy for all cpus in a cluster */
- for (cpu = 0; cpu < ARRAY_SIZE(tegra186_cpus); cpu++) {
- if (data->cpus[cpu].bpmp_cluster_id == cluster)
- cpumask_set_cpu(cpu, policy->cpus);
- }
-
return 0;
}
--
2.43.0
This reverts commit 2592aeda794c9ea73193effdab69f1cf90d0851a which is
upstream commit be4ae8c19492cd6d5de61ccb34ffb3f5ede5eec8.
This commit is causing a suspend regression on Tegra186 Jetson TX2 with
Linux v6.12.y kernels. This is not seen with Linux v6.15 that includes
this change but indicates that there are there changes missing.
Therefore, revert this change.
Fixes: 2592aeda794c ("cpufreq: tegra186: Share policy per cluster")
Link: https://lore.kernel.org/linux-tegra/bf1dabf7-0337-40e9-8b8e-4e93a0ffd4cc@nv…
Signed-off-by: Jon Hunter <jonathanh(a)nvidia.com>
---
drivers/cpufreq/tegra186-cpufreq.c | 7 -------
1 file changed, 7 deletions(-)
diff --git a/drivers/cpufreq/tegra186-cpufreq.c b/drivers/cpufreq/tegra186-cpufreq.c
index 19597246f9cc..5d1943e787b0 100644
--- a/drivers/cpufreq/tegra186-cpufreq.c
+++ b/drivers/cpufreq/tegra186-cpufreq.c
@@ -73,18 +73,11 @@ static int tegra186_cpufreq_init(struct cpufreq_policy *policy)
{
struct tegra186_cpufreq_data *data = cpufreq_get_driver_data();
unsigned int cluster = data->cpus[policy->cpu].bpmp_cluster_id;
- u32 cpu;
policy->freq_table = data->clusters[cluster].table;
policy->cpuinfo.transition_latency = 300 * 1000;
policy->driver_data = NULL;
- /* set same policy for all cpus in a cluster */
- for (cpu = 0; cpu < ARRAY_SIZE(tegra186_cpus); cpu++) {
- if (data->cpus[cpu].bpmp_cluster_id == cluster)
- cpumask_set_cpu(cpu, policy->cpus);
- }
-
return 0;
}
--
2.43.0
Don't WARN if imported buffers are in use in ivpu_gem_bo_free() as they
can be indeed used in the original context/driver.
Fixes: 647371a6609d ("accel/ivpu: Add GEM buffer object management")
Cc: stable(a)vger.kernel.org # v6.3
Signed-off-by: Jacek Lawrynowicz <jacek.lawrynowicz(a)linux.intel.com>
---
v2: Use drm_gem_is_imported() to check if the buffer is imported.
---
drivers/accel/ivpu/ivpu_gem.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/drivers/accel/ivpu/ivpu_gem.c b/drivers/accel/ivpu/ivpu_gem.c
index c193a80241f5f..5ff0bac739fc9 100644
--- a/drivers/accel/ivpu/ivpu_gem.c
+++ b/drivers/accel/ivpu/ivpu_gem.c
@@ -278,7 +278,8 @@ static void ivpu_gem_bo_free(struct drm_gem_object *obj)
list_del(&bo->bo_list_node);
mutex_unlock(&vdev->bo_list_lock);
- drm_WARN_ON(&vdev->drm, !dma_resv_test_signaled(obj->resv, DMA_RESV_USAGE_READ));
+ drm_WARN_ON(&vdev->drm, !drm_gem_is_imported(&bo->base.base) &&
+ !dma_resv_test_signaled(obj->resv, DMA_RESV_USAGE_READ));
drm_WARN_ON(&vdev->drm, ivpu_bo_size(bo) == 0);
drm_WARN_ON(&vdev->drm, bo->base.vaddr);
--
2.45.1
From: Karol Wachowski <karol.wachowski(a)intel.com>
Trigger full device recovery when the driver fails to restore device state
via engine reset and resume operations. This is necessary because, even if
submissions from a faulty context are blocked, the NPU may still process
previously submitted faulty jobs if the engine reset fails to abort them.
Such jobs can continue to generate faults and occupy device resources.
When engine reset is ineffective, the only way to recover is to perform
a full device recovery.
Fixes: dad945c27a42 ("accel/ivpu: Add handling of VPU_JSM_STATUS_MVNCI_CONTEXT_VIOLATION_HW")
Cc: <stable(a)vger.kernel.org> # v6.15+
Signed-off-by: Karol Wachowski <karol.wachowski(a)intel.com>
Signed-off-by: Jacek Lawrynowicz <jacek.lawrynowicz(a)linux.intel.com>
---
drivers/accel/ivpu/ivpu_job.c | 6 ++++--
drivers/accel/ivpu/ivpu_jsm_msg.c | 9 +++++++--
2 files changed, 11 insertions(+), 4 deletions(-)
diff --git a/drivers/accel/ivpu/ivpu_job.c b/drivers/accel/ivpu/ivpu_job.c
index 1c8e283ad9854..fae8351aa3309 100644
--- a/drivers/accel/ivpu/ivpu_job.c
+++ b/drivers/accel/ivpu/ivpu_job.c
@@ -986,7 +986,8 @@ void ivpu_context_abort_work_fn(struct work_struct *work)
return;
if (vdev->fw->sched_mode == VPU_SCHEDULING_MODE_HW)
- ivpu_jsm_reset_engine(vdev, 0);
+ if (ivpu_jsm_reset_engine(vdev, 0))
+ return;
mutex_lock(&vdev->context_list_lock);
xa_for_each(&vdev->context_xa, ctx_id, file_priv) {
@@ -1009,7 +1010,8 @@ void ivpu_context_abort_work_fn(struct work_struct *work)
if (vdev->fw->sched_mode != VPU_SCHEDULING_MODE_HW)
goto runtime_put;
- ivpu_jsm_hws_resume_engine(vdev, 0);
+ if (ivpu_jsm_hws_resume_engine(vdev, 0))
+ return;
/*
* In hardware scheduling mode NPU already has stopped processing jobs
* and won't send us any further notifications, thus we have to free job related resources
diff --git a/drivers/accel/ivpu/ivpu_jsm_msg.c b/drivers/accel/ivpu/ivpu_jsm_msg.c
index 219ab8afefabd..0256b2dfefc10 100644
--- a/drivers/accel/ivpu/ivpu_jsm_msg.c
+++ b/drivers/accel/ivpu/ivpu_jsm_msg.c
@@ -7,6 +7,7 @@
#include "ivpu_hw.h"
#include "ivpu_ipc.h"
#include "ivpu_jsm_msg.h"
+#include "ivpu_pm.h"
#include "vpu_jsm_api.h"
const char *ivpu_jsm_msg_type_to_str(enum vpu_ipc_msg_type type)
@@ -163,8 +164,10 @@ int ivpu_jsm_reset_engine(struct ivpu_device *vdev, u32 engine)
ret = ivpu_ipc_send_receive(vdev, &req, VPU_JSM_MSG_ENGINE_RESET_DONE, &resp,
VPU_IPC_CHAN_ASYNC_CMD, vdev->timeout.jsm);
- if (ret)
+ if (ret) {
ivpu_err_ratelimited(vdev, "Failed to reset engine %d: %d\n", engine, ret);
+ ivpu_pm_trigger_recovery(vdev, "Engine reset failed");
+ }
return ret;
}
@@ -354,8 +357,10 @@ int ivpu_jsm_hws_resume_engine(struct ivpu_device *vdev, u32 engine)
ret = ivpu_ipc_send_receive(vdev, &req, VPU_JSM_MSG_HWS_RESUME_ENGINE_DONE, &resp,
VPU_IPC_CHAN_ASYNC_CMD, vdev->timeout.jsm);
- if (ret)
+ if (ret) {
ivpu_err_ratelimited(vdev, "Failed to resume engine %d: %d\n", engine, ret);
+ ivpu_pm_trigger_recovery(vdev, "Engine resume failed");
+ }
return ret;
}
--
2.45.1