Hi Jason, Julian,
Subject: Re: [PATCH] lib/scatterlist: fix sg_page_count and sg_dma_page_count
On Sun, Mar 8, 2026 at 7:08 PM Jason Gunthorpe <jgg@ziepe.ca> wrote:
On Sun, Mar 08, 2026 at 02:55:27PM +0100, Julian Orth wrote:
A user reported memory corruption in the Jay wayland compositor [1].
The
corruption started when archlinux enabled CONFIG_TRANSPARENT_HUGEPAGE_SHMEM_HUGE_WITHIN_SIZE in
kernel 6.19.5.
The compositor uses udmabuf to upload memory from memfds to the
GPU.
When running an affected kernel, the following warnings are logged:
a - addrs >= max_entries WARNING: drivers/gpu/drm/drm_prime.c:1089 at drm_prime_sg_to_dma_addr_array+0x86/0xc0, CPU#31: jay/1864
[...] Call Trace: <TASK> amdgpu_bo_move+0x188/0x800 [amdgpu 3b451640234948027c09e9b39e6520bc7e5471cf]
Disabling the use of huge pages at runtime via /sys/kernel/mm/transparent_hugepage/shmem_enabled fixes the
issue.
udmabuf allocates a scatterlist with buffer_size/PAGE_SIZE entries.
Each
entry has a length of PAGE_SIZE. With huge pages disabled, it appears that sg->offset is always 0. With huge pages enabled, sg->offset is incremented by PAGE_SIZE until the end of the huge page.
This was broken by 0c8b91ef5100 ("udmabuf: add back support for mapping hugetlb pages") which switched from a working sg_alloc_table_from_pages() to a messed up sg_set_pages loop:
for_each_sg(sg->sgl, sgl, ubuf->pagecount, i)
        sg_set_page(sgl, ubuf->pages[i], PAGE_SIZE, ubuf->offsets[i]);
[..]
ubuf->offsets[*pgbuf] = subpgoff << PAGE_SHIFT;

Which is just the wrong way to use the scatterlist API.
This was later changed to sg_set_folio() which I'm also suspecting has a bug, it should be setting page_link to the proper tail page because as you observe page_offset must fall within 0 to PAGE_SIZE-1 to make the iterator work.
I think the whole design here in udmabuf makes very little sense. It starts out with an actual list of folios then expands them to a per-4K double array of folio/offset. This is nonsensical, if it wants to build a way to direct index the mapping for mmap it should just build itself a page * array like the code used to do and continue to use sg_alloc_table_from_pages() which builds properly formed scatterlists.
There are a couple of reasons why we got rid of the pages array: - Back then, there was some confusion about whether a struct page would exist or not for tail pages when HVO is enabled. Regardless, there was also a concern about exposing tail pages outside hugetlb code.
- And, we also wanted to prepare for a future where struct page would not exist anymore, so, it made sense to just use folios only.
I am not sure if these concerns are valid anymore. If they are not, I am OK with Jason's patch below that brings back the pages array.
Thanks, Vivek
This would save memory, use the APIs properly and build a correct and optimized scatterlist to boot. It uses vmf_insert_pfn() and vm_map_ram() anyhow so it doesn't even use a folio :\
Here, a few mins of AI shows what I think udmabuf should look like. If you wish to pursue this please add my signed-off-by and handle testing it and getting it merged. I reviewed it enough to see it was showing what I wanted.
I don't know enough about folios or udmabuf to efficiently work on this.
If offset is supposed to be in [0, PAGE_SIZE-1], then my patch is incorrect and it's probably better if some of the udmabuf maintainers take a look at this. I've added them to CC.
Jason
diff --git a/drivers/dma-buf/udmabuf.c b/drivers/dma-buf/udmabuf.c index 94b8ecb892bb17..5d687860445137 100644 --- a/drivers/dma-buf/udmabuf.c +++ b/drivers/dma-buf/udmabuf.c @@ -26,10 +26,10 @@ MODULE_PARM_DESC(size_limit_mb, "Max size
of a dmabuf, in megabytes. Default is
struct udmabuf { pgoff_t pagecount;
struct folio **folios;
struct page **pages; /**
* Unlike folios, pinned_folios is only used for unpin.
* Unlike pages, pinned_folios is only used for unpin. * So, nr_pinned is not the same to pagecount, the pinned_folios * only set each folio which already pinned when udmabuf_create. * Note that, since a folio may be pinned multiple times, each folio@@ -41,7 +41,6 @@ struct udmabuf {
struct sg_table *sg; struct miscdevice *device;
pgoff_t *offsets;};
static vm_fault_t udmabuf_vm_fault(struct vm_fault *vmf) @@ -55,8 +54,7 @@ static vm_fault_t udmabuf_vm_fault(struct
vm_fault *vmf)
if (pgoff >= ubuf->pagecount) return VM_FAULT_SIGBUS;
pfn = folio_pfn(ubuf->folios[pgoff]);pfn += ubuf->offsets[pgoff] >> PAGE_SHIFT;
pfn = page_to_pfn(ubuf->pages[pgoff]); ret = vmf_insert_pfn(vma, vmf->address, pfn); if (ret & VM_FAULT_ERROR)@@ -73,8 +71,7 @@ static vm_fault_t udmabuf_vm_fault(struct
vm_fault *vmf)
if (WARN_ON(pgoff >= ubuf->pagecount)) break;
pfn = folio_pfn(ubuf->folios[pgoff]);pfn += ubuf->offsets[pgoff] >> PAGE_SHIFT;
pfn = page_to_pfn(ubuf->pages[pgoff]); /** * If the below vmf_insert_pfn() fails, we do not return an@@ -109,22 +106,11 @@ static int mmap_udmabuf(struct dma_buf
*buf, struct vm_area_struct *vma)
static int vmap_udmabuf(struct dma_buf *buf, struct iosys_map *map) { struct udmabuf *ubuf = buf->priv;
struct page **pages; void *vaddr;pgoff_t pg; dma_resv_assert_held(buf->resv);pages = kvmalloc_objs(*pages, ubuf->pagecount);if (!pages)return -ENOMEM;for (pg = 0; pg < ubuf->pagecount; pg++)pages[pg] = folio_page(ubuf->folios[pg],ubuf->offsets[pg] >> PAGE_SHIFT);vaddr = vm_map_ram(pages, ubuf->pagecount, -1);kvfree(pages);
vaddr = vm_map_ram(ubuf->pages, ubuf->pagecount, -1); if (!vaddr) return -EINVAL;@@ -146,22 +132,18 @@ static struct sg_table *get_sg_table(struct
device *dev, struct dma_buf *buf,
{ struct udmabuf *ubuf = buf->priv; struct sg_table *sg;
struct scatterlist *sgl;unsigned int i = 0; int ret; sg = kzalloc_obj(*sg); if (!sg) return ERR_PTR(-ENOMEM);ret = sg_alloc_table(sg, ubuf->pagecount, GFP_KERNEL);
ret = sg_alloc_table_from_pages(sg, ubuf->pages, ubuf->pagecount, 0,
ubuf->pagecount << PAGE_SHIFT,GFP_KERNEL); if (ret < 0) goto err_alloc;
for_each_sg(sg->sgl, sgl, ubuf->pagecount, i)sg_set_folio(sgl, ubuf->folios[i], PAGE_SIZE,ubuf->offsets[i]);ret = dma_map_sgtable(dev, sg, direction, 0); if (ret < 0) goto err_map;@@ -207,12 +189,8 @@ static void unpin_all_folios(struct udmabuf
*ubuf)
static __always_inline int init_udmabuf(struct udmabuf *ubuf, pgoff_t
pgcnt)
{
ubuf->folios = kvmalloc_objs(*ubuf->folios, pgcnt);if (!ubuf->folios)return -ENOMEM;ubuf->offsets = kvzalloc_objs(*ubuf->offsets, pgcnt);if (!ubuf->offsets)
ubuf->pages = kvmalloc_objs(*ubuf->pages, pgcnt);if (!ubuf->pages) return -ENOMEM; ubuf->pinned_folios = kvmalloc_objs(*ubuf->pinned_folios, pgcnt);@@ -225,8 +203,7 @@ static __always_inline int init_udmabuf(struct
udmabuf *ubuf, pgoff_t pgcnt)
static __always_inline void deinit_udmabuf(struct udmabuf *ubuf) { unpin_all_folios(ubuf);
kvfree(ubuf->offsets);kvfree(ubuf->folios);
kvfree(ubuf->pages);}
static void release_udmabuf(struct dma_buf *buf) @@ -344,8 +321,8 @@ static long udmabuf_pin_folios(struct udmabuf
*ubuf, struct file *memfd,
ubuf->pinned_folios[nr_pinned++] = folios[cur_folio]; for (; subpgoff < fsize; subpgoff += PAGE_SIZE) {
ubuf->folios[upgcnt] = folios[cur_folio];ubuf->offsets[upgcnt] = subpgoff;
ubuf->pages[upgcnt] = folio_page(folios[cur_folio],subpgoff >> PAGE_SHIFT); ++upgcnt; if (++cur_pgcnt >= pgcnt)