commit <88467db6e2f46a2e79b1b67ce6873c284e4cf417> upstream
Backport from upstream to match function amdgpu_vm_bo_update_mapping
change.
When migrating a range from system memory to VRAM, if a system page
cannot be locked or unmapped, we do a partial migration and leave some
pages in system memory. Several bugs were found in copying pages and
updating the GPU mapping in this situation:
1. Copy to VRAM should use migrate->npages, which is the total number of
pages in the range, as npages, not migrate->cpages, which is the number
of pages that can be migrated.
2. After a partial copy, advance the VRAM res cursor by j + 1 pages,
where j is the number of system pages copied and the extra 1 page is
the page whose copy is skipped.
3. Copy to RAM should collect all contiguous VRAM pages and copy them
together.
4. The call to amdgpu_vm_update_range (amdgpu_vm_bo_update_mapping in
this backport) should pass the offset in bytes, not as a number of
pages.
Signed-off-by: Philip Yang <Philip.Yang(a)amd.com>
Reviewed-by: Felix Kuehling <Felix.Kuehling(a)amd.com>
---
drivers/gpu/drm/amd/amdkfd/kfd_migrate.c | 6 +++---
drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 2 +-
2 files changed, 4 insertions(+), 4 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
index ed5385137f48..9d5324b6298c 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
@@ -299,7 +299,7 @@ svm_migrate_copy_to_vram(struct amdgpu_device *adev, struct svm_range *prange,
struct migrate_vma *migrate, struct dma_fence **mfence,
dma_addr_t *scratch)
{
- uint64_t npages = migrate->cpages;
+ uint64_t npages = migrate->npages;
struct device *dev = adev->dev;
struct amdgpu_res_cursor cursor;
dma_addr_t *src;
@@ -346,7 +346,7 @@ svm_migrate_copy_to_vram(struct amdgpu_device *adev, struct svm_range *prange,
mfence);
if (r)
goto out_free_vram_pages;
- amdgpu_res_next(&cursor, j << PAGE_SHIFT);
+ amdgpu_res_next(&cursor, (j + 1) << PAGE_SHIFT);
j = 0;
} else {
amdgpu_res_next(&cursor, PAGE_SIZE);
@@ -593,7 +593,7 @@ svm_migrate_copy_to_ram(struct amdgpu_device *adev, struct svm_range *prange,
continue;
}
src[i] = svm_migrate_addr(adev, spage);
- if (i > 0 && src[i] != src[i - 1] + PAGE_SIZE) {
+ if (j > 0 && src[i] != src[i - 1] + PAGE_SIZE) {
r = svm_migrate_copy_memory_gart(adev, dst + i - j,
src + i - j, j,
FROM_VRAM_TO_RAM,
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
index f2805ba74c80..6d108dbbabdc 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
@@ -1275,7 +1275,7 @@ svm_range_map_to_gpu(struct amdgpu_device *adev, struct amdgpu_vm *vm,
r = amdgpu_vm_bo_update_mapping(adev, bo_adev, vm, false, false,
NULL, last_start,
prange->start + i, pte_flags,
- last_start - prange->start,
+ (last_start - prange->start) << PAGE_SHIFT,
NULL, dma_addr,
&vm->last_update,
&table_freed);
--
2.35.1
From: He Ying <heying24(a)huawei.com>
[ Upstream commit a1b29ba2f2c171b9bea73be993bfdf0a62d37d15 ]
The following KASAN warning was reported in our kernel.
BUG: KASAN: stack-out-of-bounds in get_wchan+0x188/0x250
Read of size 4 at addr d216f958 by task ps/14437
CPU: 3 PID: 14437 Comm: ps Tainted: G O 5.10.0 #1
Call Trace:
[daa63858] [c0654348] dump_stack+0x9c/0xe4 (unreliable)
[daa63888] [c035cf0c] print_address_description.constprop.3+0x8c/0x570
[daa63908] [c035d6bc] kasan_report+0x1ac/0x218
[daa63948] [c00496e8] get_wchan+0x188/0x250
[daa63978] [c0461ec8] do_task_stat+0xce8/0xe60
[daa63b98] [c0455ac8] proc_single_show+0x98/0x170
[daa63bc8] [c03cab8c] seq_read_iter+0x1ec/0x900
[daa63c38] [c03cb47c] seq_read+0x1dc/0x290
[daa63d68] [c037fc94] vfs_read+0x164/0x510
[daa63ea8] [c03808e4] ksys_read+0x144/0x1d0
[daa63f38] [c005b1dc] ret_from_syscall+0x0/0x38
--- interrupt: c00 at 0x8fa8f4
LR = 0x8fa8cc
The buggy address belongs to the page:
page:98ebcdd2 refcount:0 mapcount:0 mapping:00000000 index:0x2 pfn:0x1216f
flags: 0x0()
raw: 00000000 00000000 01010122 00000000 00000002 00000000 ffffffff 00000000
raw: 00000000
page dumped because: kasan: bad access detected
Memory state around the buggy address:
d216f800: 00 00 00 00 00 f1 f1 f1 f1 00 00 00 00 00 00 00
d216f880: f2 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
>d216f900: 00 00 00 00 00 00 00 00 00 00 00 f1 f1 f1 f1 00
^
d216f980: f2 f2 f2 f2 f2 f2 f2 00 00 00 00 00 00 00 00 00
d216fa00: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
After looking into this issue, I found that the buggy address belongs
to the task stack region. It seems that KASAN is getting something
wrong. I looked into the code of __get_wchan in the x86 architecture
and found that the same issue has been resolved by commit
f7d27c35ddff ("x86/mm, kasan: Silence KASAN warnings in get_wchan()").
The same solution can be applied to the powerpc architecture too.
As Andrey Ryabinin said, get_wchan() is racy by design: it may
access the volatile stack of a running task, and thus it may access
a redzone in a stack frame and cause KASAN to warn about it.
Use READ_ONCE_NOCHECK() to silence these warnings.
Reported-by: Wanming Hu <huwanming(a)huaweil.com>
Signed-off-by: He Ying <heying24(a)huawei.com>
Signed-off-by: Chen Jingwen <chenjingwen6(a)huawei.com>
Reviewed-by: Kees Cook <keescook(a)chromium.org>
Signed-off-by: Michael Ellerman <mpe(a)ellerman.id.au>
Link: https://lore.kernel.org/r/20220121014418.155675-1-heying24@huawei.com
Signed-off-by: Sasha Levin <sashal(a)kernel.org>
---
arch/powerpc/kernel/process.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index 984813a4d5dc..a75d20f23dac 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -2160,12 +2160,12 @@ static unsigned long ___get_wchan(struct task_struct *p)
return 0;
do {
- sp = *(unsigned long *)sp;
+ sp = READ_ONCE_NOCHECK(*(unsigned long *)sp);
if (!validate_sp(sp, p, STACK_FRAME_OVERHEAD) ||
task_is_running(p))
return 0;
if (count > 0) {
- ip = ((unsigned long *)sp)[STACK_FRAME_LR_SAVE];
+ ip = READ_ONCE_NOCHECK(((unsigned long *)sp)[STACK_FRAME_LR_SAVE]);
if (!in_sched_functions(ip))
return ip;
}
--
2.35.1