From: "Stanley.Yang" Stanley.Yang@amd.com
[ Upstream commit a32c6f7f5737cc7e31cd7ad5133f0d96fca12ea6 ]
The ecc_irq is disabled while GPU mode2 reset suspending process, but not be enabled during GPU mode2 reset resume process.
Changed from V1: only do sdma/gfx ras_late_init in aldebaran_mode2_restore_ip delete amdgpu_ras_late_resume function
Changed from V2: check umc ras supported before put ecc_irq
Signed-off-by: Stanley.Yang Stanley.Yang@amd.com Reviewed-by: Tao Zhou tao.zhou1@amd.com Reviewed-by: Hawking Zhang Hawking.Zhang@amd.com Signed-off-by: Alex Deucher alexander.deucher@amd.com Signed-off-by: Sasha Levin sashal@kernel.org --- drivers/gpu/drm/amd/amdgpu/aldebaran.c | 26 +++++++++++++++++++++++++- drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 4 ++++ drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c | 5 +++++ drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 4 ++++ 4 files changed, 38 insertions(+), 1 deletion(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/aldebaran.c b/drivers/gpu/drm/amd/amdgpu/aldebaran.c index 2b97b8a96fb4..fa6193535d48 100644 --- a/drivers/gpu/drm/amd/amdgpu/aldebaran.c +++ b/drivers/gpu/drm/amd/amdgpu/aldebaran.c @@ -333,6 +333,7 @@ aldebaran_mode2_restore_hwcontext(struct amdgpu_reset_control *reset_ctl, { struct list_head *reset_device_list = reset_context->reset_device_list; struct amdgpu_device *tmp_adev = NULL; + struct amdgpu_ras *con; int r;
if (reset_device_list == NULL) @@ -358,7 +359,30 @@ aldebaran_mode2_restore_hwcontext(struct amdgpu_reset_control *reset_ctl, */ amdgpu_register_gpu_instance(tmp_adev);
- /* Resume RAS */ + /* Resume RAS, ecc_irq */ + con = amdgpu_ras_get_context(tmp_adev); + if (!amdgpu_sriov_vf(tmp_adev) && con) { + if (tmp_adev->sdma.ras && + tmp_adev->sdma.ras->ras_block.ras_late_init) { + r = tmp_adev->sdma.ras->ras_block.ras_late_init(tmp_adev, + &tmp_adev->sdma.ras->ras_block.ras_comm); + if (r) { + dev_err(tmp_adev->dev, "SDMA failed to execute ras_late_init! ret:%d\n", r); + goto end; + } + } + + if (tmp_adev->gfx.ras && + tmp_adev->gfx.ras->ras_block.ras_late_init) { + r = tmp_adev->gfx.ras->ras_block.ras_late_init(tmp_adev, + &tmp_adev->gfx.ras->ras_block.ras_comm); + if (r) { + dev_err(tmp_adev->dev, "GFX failed to execute ras_late_init! ret:%d\n", r); + goto end; + } + } + } + amdgpu_ras_resume(tmp_adev);
/* Update PSP FW topology after reset */ diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c index fa87a85e1017..62ecf4d89cb9 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c @@ -1141,6 +1141,10 @@ static int gmc_v10_0_hw_fini(void *handle)
amdgpu_irq_put(adev, &adev->gmc.vm_fault, 0);
+ if (adev->gmc.ecc_irq.funcs && + amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__UMC)) + amdgpu_irq_put(adev, &adev->gmc.ecc_irq, 0); + return 0; }
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c index e3b76fd28d15..3d797a1adef3 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c @@ -974,6 +974,11 @@ static int gmc_v11_0_hw_fini(void *handle) }
amdgpu_irq_put(adev, &adev->gmc.vm_fault, 0); + + if (adev->gmc.ecc_irq.funcs && + amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__UMC)) + amdgpu_irq_put(adev, &adev->gmc.ecc_irq, 0); + gmc_v11_0_gart_disable(adev);
return 0; diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c index 89550d3df68d..f9f43742e9ce 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c @@ -2413,6 +2413,10 @@ static int gmc_v9_0_hw_fini(void *handle)
amdgpu_irq_put(adev, &adev->gmc.vm_fault, 0);
+ if (adev->gmc.ecc_irq.funcs && + amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__UMC)) + amdgpu_irq_put(adev, &adev->gmc.ecc_irq, 0); + return 0; }