On Wed, Dec 09, 2020 at 03:25:24PM +0100, Thomas Zimmermann wrote:
> Implementations of the vmap/vunmap GEM callbacks may perform pinning
> of the BO and may acquire the associated reservation object's lock.
> Callers that only require a mapping of the contained memory can thus
> interfere with other tasks that require exact pinning, such as scanout.
> This is less of an issue with private CMA buffers, but may happen
> with imported ones.
>
> Therefore provide the new interface drm_gem_cma_vmap_local(), which only
> performs the vmap operations. Callers have to hold the reservation lock
> while the mapping persists.
>
> This patch also connects GEM CMA helpers to the GEM object function with
> equivalent functionality.
>
> Signed-off-by: Thomas Zimmermann <tzimmermann(a)suse.de>
> ---
> drivers/gpu/drm/drm_gem_cma_helper.c | 35 ++++++++++++++++++++++++++++
> drivers/gpu/drm/vc4/vc4_bo.c | 13 +++++++++++
> drivers/gpu/drm/vc4/vc4_drv.h | 1 +
> include/drm/drm_gem_cma_helper.h | 1 +
> 4 files changed, 50 insertions(+)
>
> diff --git a/drivers/gpu/drm/drm_gem_cma_helper.c b/drivers/gpu/drm/drm_gem_cma_helper.c
> index 7942cf05cd93..40b3e8e3fc42 100644
> --- a/drivers/gpu/drm/drm_gem_cma_helper.c
> +++ b/drivers/gpu/drm/drm_gem_cma_helper.c
> @@ -38,6 +38,7 @@ static const struct drm_gem_object_funcs drm_gem_cma_default_funcs = {
> .print_info = drm_gem_cma_print_info,
> .get_sg_table = drm_gem_cma_get_sg_table,
> .vmap = drm_gem_cma_vmap,
> + .vmap_local = drm_gem_cma_vmap_local,
> .mmap = drm_gem_cma_mmap,
> .vm_ops = &drm_gem_cma_vm_ops,
> };
> @@ -471,6 +472,40 @@ int drm_gem_cma_vmap(struct drm_gem_object *obj, struct dma_buf_map *map)
> }
> EXPORT_SYMBOL_GPL(drm_gem_cma_vmap);
>
> +/**
> + * drm_gem_cma_vmap_local - map a CMA GEM object into the kernel's virtual
> + * address space
> + * @obj: GEM object
> + * @map: Returns the kernel virtual address of the CMA GEM object's backing
> + * store.
> + *
> + * This function maps a buffer into the kernel's
> + * virtual address space. Since the CMA buffers are already mapped into the
> + * kernel virtual address space this simply returns the cached virtual
> + * address. Drivers using the CMA helpers should set this as their DRM
> + * driver's &drm_gem_object_funcs.vmap_local callback.
> + *
> + * Returns:
> + * 0 on success, or a negative error code otherwise.
> + */
> +int drm_gem_cma_vmap_local(struct drm_gem_object *obj, struct dma_buf_map *map)
> +{
> + struct drm_gem_cma_object *cma_obj = to_drm_gem_cma_obj(obj);
> +
> + /*
> + * TODO: The code in drm_gem_cma_prime_import_sg_table_vmap()
> + * establishes this mapping. The correct solution would
> + * be to call dma_buf_vmap_local() here.
> + *
> + * If we find a case where we absolutely have to call
> + * dma_buf_vmap_local(), the code needs to be restructured.
dma_buf_vmap_local is only relevant for dynamic importers; pinning at
import time is what you get anyway. That's what Christian meant with his
comments for the ->pin hook. So the TODO here doesn't make sense imo;
just delete it. We're very far away from making cma dynamic :-)
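For anyone following along, a minimal caller-side sketch of the convention
the commit message describes (illustrative only: the example function and its
arguments are made up, the helper calls are the ones added by this patch).
No unmap step is needed for CMA since the buffer stays kernel-mapped; the
point is simply that obj->resv stays locked while the mapping is used.

static int example_fill_bo(struct drm_gem_object *obj, const void *src, size_t len)
{
        struct dma_buf_map map;
        int ret;

        ret = dma_resv_lock(obj->resv, NULL);
        if (ret)
                return ret;

        ret = drm_gem_cma_vmap_local(obj, &map);
        if (!ret)
                memcpy(map.vaddr, src, len); /* CMA backing store is plain system memory */

        dma_resv_unlock(obj->resv);
        return ret;
}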
> + */
> + dma_buf_map_set_vaddr(map, cma_obj->vaddr);
> +
> + return 0;
> +}
> +EXPORT_SYMBOL_GPL(drm_gem_cma_vmap_local);
> +
> /**
> * drm_gem_cma_mmap - memory-map an exported CMA GEM object
> * @obj: GEM object
> diff --git a/drivers/gpu/drm/vc4/vc4_bo.c b/drivers/gpu/drm/vc4/vc4_bo.c
> index dc316cb79e00..ec57326c69c4 100644
> --- a/drivers/gpu/drm/vc4/vc4_bo.c
> +++ b/drivers/gpu/drm/vc4/vc4_bo.c
> @@ -387,6 +387,7 @@ static const struct drm_gem_object_funcs vc4_gem_object_funcs = {
> .export = vc4_prime_export,
> .get_sg_table = drm_gem_cma_get_sg_table,
> .vmap = vc4_prime_vmap,
> + .vmap_local = vc4_prime_vmap_local,
> .vm_ops = &vc4_vm_ops,
> };
>
> @@ -797,6 +798,18 @@ int vc4_prime_vmap(struct drm_gem_object *obj, struct dma_buf_map *map)
> return drm_gem_cma_vmap(obj, map);
> }
>
> +int vc4_prime_vmap_local(struct drm_gem_object *obj, struct dma_buf_map *map)
> +{
> + struct vc4_bo *bo = to_vc4_bo(obj);
> +
> + if (bo->validated_shader) {
This freaks me out. It should be impossible to export a validated shader
as a dma-buf, and indeed the check exists already.
All the wrapper functions here are imo pointless. Either we should remove
them, or replace the if with a BUG_ON here since if that ever happens we
have a security bug already. I'd go with removing, less code. Maybe throw
a patch on top?
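For completeness, a sketch of the BUG_ON variant (hypothetical; removing the
wrapper entirely is still the nicer option):

int vc4_prime_vmap_local(struct drm_gem_object *obj, struct dma_buf_map *map)
{
        /* Exporting a validated shader is already rejected at export time,
         * so getting here would mean we already have a security bug. */
        BUG_ON(to_vc4_bo(obj)->validated_shader);

        return drm_gem_cma_vmap_local(obj, map);
}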
Anyway this patch looks good, with the todo deleted:
Reviewed-by: Daniel Vetter <daniel.vetter(a)ffwll.ch>
> + DRM_DEBUG("mmaping of shader BOs not allowed.\n");
> + return -EINVAL;
> + }
> +
> + return drm_gem_cma_vmap_local(obj, map);
> +}
> +
> struct drm_gem_object *
> vc4_prime_import_sg_table(struct drm_device *dev,
> struct dma_buf_attachment *attach,
> diff --git a/drivers/gpu/drm/vc4/vc4_drv.h b/drivers/gpu/drm/vc4/vc4_drv.h
> index 43a1af110b3e..efb6c47d318f 100644
> --- a/drivers/gpu/drm/vc4/vc4_drv.h
> +++ b/drivers/gpu/drm/vc4/vc4_drv.h
> @@ -812,6 +812,7 @@ struct drm_gem_object *vc4_prime_import_sg_table(struct drm_device *dev,
> struct dma_buf_attachment *attach,
> struct sg_table *sgt);
> int vc4_prime_vmap(struct drm_gem_object *obj, struct dma_buf_map *map);
> +int vc4_prime_vmap_local(struct drm_gem_object *obj, struct dma_buf_map *map);
> int vc4_bo_cache_init(struct drm_device *dev);
> int vc4_bo_inc_usecnt(struct vc4_bo *bo);
> void vc4_bo_dec_usecnt(struct vc4_bo *bo);
> diff --git a/include/drm/drm_gem_cma_helper.h b/include/drm/drm_gem_cma_helper.h
> index 0a9711caa3e8..05122e71bc6d 100644
> --- a/include/drm/drm_gem_cma_helper.h
> +++ b/include/drm/drm_gem_cma_helper.h
> @@ -99,6 +99,7 @@ drm_gem_cma_prime_import_sg_table(struct drm_device *dev,
> struct dma_buf_attachment *attach,
> struct sg_table *sgt);
> int drm_gem_cma_vmap(struct drm_gem_object *obj, struct dma_buf_map *map);
> +int drm_gem_cma_vmap_local(struct drm_gem_object *obj, struct dma_buf_map *map);
> int drm_gem_cma_mmap(struct drm_gem_object *obj, struct vm_area_struct *vma);
>
> /**
> --
> 2.29.2
>
--
Daniel Vetter
Software Engineer, Intel Corporation
http://blog.ffwll.ch
On Wed, Dec 09, 2020 at 03:25:22PM +0100, Thomas Zimmermann wrote:
> The existing dma-buf calls dma_buf_vmap() and dma_buf_vunmap() are
> allowed to pin the buffer or acquire the buffer's reservation object
> lock.
>
> This is a problem for callers that only require a short-term mapping
> of the buffer without the pinning, or callers that have special locking
> requirements. These may suffer from unnecessary overhead or interfere
> with regular pin operations.
>
> The new interfaces dma_buf_vmap_local(), dma_buf_vunmap_local(), and
> their respective callbacks in struct dma_buf_ops provide an alternative without
> pinning or reservation locking. Callers are responsible for these
> operations.
>
> Signed-off-by: Thomas Zimmermann <tzimmermann(a)suse.de>
> ---
> drivers/dma-buf/dma-buf.c | 80 +++++++++++++++++++++++++++++++++++++++
> include/linux/dma-buf.h | 34 +++++++++++++++++
> 2 files changed, 114 insertions(+)
>
> diff --git a/drivers/dma-buf/dma-buf.c b/drivers/dma-buf/dma-buf.c
> index e63684d4cd90..be9f80190a66 100644
> --- a/drivers/dma-buf/dma-buf.c
> +++ b/drivers/dma-buf/dma-buf.c
> @@ -1265,6 +1265,86 @@ void dma_buf_vunmap(struct dma_buf *dmabuf, struct dma_buf_map *map)
> }
> EXPORT_SYMBOL_GPL(dma_buf_vunmap);
>
> +/**
> + * dma_buf_vmap_local - Create virtual mapping for the buffer object into kernel
> + * address space.
> + * @dmabuf: [in] buffer to vmap
> + * @map: [out] returns the vmap pointer
> + *
> + * This call may fail due to lack of virtual mapping address space.
> + * These calls are optional in drivers. The intended use for them
> + * is for mapping objects linear in kernel space for high use objects.
> + * Please attempt to use kmap/kunmap before thinking about these interfaces.
Kmap is gone, so those two sentences are no longer needed. Maybe mention
something like "Unlike dma_buf_vmap() this is a short-term mapping and
will not pin the buffer. The struct dma_resv for the @dmabuf must be
locked until dma_buf_vunmap_local() is called."
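To make that wording concrete, an importer-side sketch (illustrative; the
function name and error handling are made up, the dma-buf calls are the ones
added by this patch):

static int example_access_buffer(struct dma_buf *dmabuf)
{
        struct dma_buf_map map;
        int ret;

        ret = dma_resv_lock(dmabuf->resv, NULL);
        if (ret)
                return ret;

        ret = dma_buf_vmap_local(dmabuf, &map);
        if (!ret) {
                /* ... short-term CPU access via map.vaddr / map.vaddr_iomem ... */
                dma_buf_vunmap_local(dmabuf, &map);
        }

        dma_resv_unlock(dmabuf->resv);
        return ret;
}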
> + *
> + * Returns:
> + * 0 on success, or a negative errno code otherwise.
> + */
> +int dma_buf_vmap_local(struct dma_buf *dmabuf, struct dma_buf_map *map)
> +{
> + struct dma_buf_map ptr;
> + int ret = 0;
> +
> + dma_buf_map_clear(map);
> +
> + if (WARN_ON(!dmabuf))
> + return -EINVAL;
> +
> + dma_resv_assert_held(dmabuf->resv);
> +
> + if (!dmabuf->ops->vmap_local)
> + return -EINVAL;
> +
> + mutex_lock(&dmabuf->lock);
> + if (dmabuf->vmapping_counter) {
> + dmabuf->vmapping_counter++;
> + BUG_ON(dma_buf_map_is_null(&dmabuf->vmap_ptr));
> + *map = dmabuf->vmap_ptr;
> + goto out_unlock;
> + }
> +
> + BUG_ON(dma_buf_map_is_set(&dmabuf->vmap_ptr));
> +
> + ret = dmabuf->ops->vmap_local(dmabuf, &ptr);
> + if (WARN_ON_ONCE(ret))
> + goto out_unlock;
> +
> + dmabuf->vmap_ptr = ptr;
> + dmabuf->vmapping_counter = 1;
> +
> + *map = dmabuf->vmap_ptr;
> +
> +out_unlock:
> + mutex_unlock(&dmabuf->lock);
> + return ret;
> +}
> +EXPORT_SYMBOL_GPL(dma_buf_vmap_local);
> +
> +/**
> + * dma_buf_vunmap_local - Unmap a vmap obtained by dma_buf_vmap_local.
> + * @dmabuf: [in] buffer to vunmap
> + * @map: [in] vmap pointer to vunmap
Maybe for hyperlinking add "Release a mapping established with
dma_buf_vmap_local()."
> + */
> +void dma_buf_vunmap_local(struct dma_buf *dmabuf, struct dma_buf_map *map)
> +{
> + if (WARN_ON(!dmabuf))
> + return;
> +
> + dma_resv_assert_held(dmabuf->resv);
> +
> + BUG_ON(dma_buf_map_is_null(&dmabuf->vmap_ptr));
> + BUG_ON(dmabuf->vmapping_counter == 0);
> + BUG_ON(!dma_buf_map_is_equal(&dmabuf->vmap_ptr, map));
> +
> + mutex_lock(&dmabuf->lock);
> + if (--dmabuf->vmapping_counter == 0) {
> + if (dmabuf->ops->vunmap_local)
> + dmabuf->ops->vunmap_local(dmabuf, map);
> + dma_buf_map_clear(&dmabuf->vmap_ptr);
> + }
> + mutex_unlock(&dmabuf->lock);
> +}
> +EXPORT_SYMBOL_GPL(dma_buf_vunmap_local);
> +
> #ifdef CONFIG_DEBUG_FS
> static int dma_buf_debug_show(struct seq_file *s, void *unused)
> {
> diff --git a/include/linux/dma-buf.h b/include/linux/dma-buf.h
> index cf72699cb2bc..f66580d23a9b 100644
> --- a/include/linux/dma-buf.h
> +++ b/include/linux/dma-buf.h
> @@ -269,6 +269,38 @@ struct dma_buf_ops {
>
> int (*vmap)(struct dma_buf *dmabuf, struct dma_buf_map *map);
> void (*vunmap)(struct dma_buf *dmabuf, struct dma_buf_map *map);
> +
> + /**
> + * @vmap_local:
> + *
> + * Creates a virtual mapping for the buffer into kernel address space.
> + *
> + * This callback establishes short-term mappings for situations where
> + * callers only use the buffer for a bounded amount of time; such as
> + * updates to the framebuffer or reading back contained information.
> + * In contrast to the regular @vmap callback, vmap_local does never pin
> + * the buffer to a specific domain or acquire the buffer's reservation
> + * lock.
> + *
> + * This is called with the dmabuf->resv object locked. Callers must hold
^^ Not the right kerneldoc; I think it should be &dma_buf.resv to get the hyperlink.
> + * the lock until after removing the mapping with @vunmap_local.
> + *
> + * This callback is optional.
> + *
> + * Returns:
> + *
> + * 0 on success or a negative error code on failure.
> + */
> + int (*vmap_local)(struct dma_buf *dmabuf, struct dma_buf_map *map);
> +
> + /**
> + * @vunmap_local:
> + *
> + * Removes a virtual mapping that wa sestablished by @vmap_local.
^^established
> + *
> + * This callback is optional.
> + */
> + void (*vunmap_local)(struct dma_buf *dmabuf, struct dma_buf_map *map);
> };
>
> /**
> @@ -506,4 +538,6 @@ int dma_buf_mmap(struct dma_buf *, struct vm_area_struct *,
> unsigned long);
> int dma_buf_vmap(struct dma_buf *dmabuf, struct dma_buf_map *map);
> void dma_buf_vunmap(struct dma_buf *dmabuf, struct dma_buf_map *map);
> +int dma_buf_vmap_local(struct dma_buf *dmabuf, struct dma_buf_map *map);
> +void dma_buf_vunmap_local(struct dma_buf *dmabuf, struct dma_buf_map *map);
> #endif /* __DMA_BUF_H__ */
With the doc nits addressed:
Reviewed-by: Daniel Vetter <daniel.vetter(a)ffwll.ch>
> --
> 2.29.2
>
--
Daniel Vetter
Software Engineer, Intel Corporation
http://blog.ffwll.ch
This patchset introduces a new dma heap, the chunk heap, which makes it
easy to perform bulk allocation of high-order pages.
It was created to help optimize 4K/8K HDR video playback with secure
DRM HW that protects the contents in memory. The HW needs physically
contiguous memory chunks of up to several hundred MB.
The chunk heap is registered via device tree with an alignment and the
memory node of the Contiguous Memory Allocator (CMA). The alignment
defines the chunk page size; for example, an alignment of 0x1_0000 means
a chunk page size of 64KB. The phandle to the memory node indicates the
CMA region to allocate from. If the device node has no CMA region, the
registration of the chunk heap fails.
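Once registered, a chunk heap is expected to show up like any other dma heap,
so a client would allocate from it through the existing dma-heap uapi. A
rough user-space sketch (the heap name "chunk" and the helper are
illustrative; the real name depends on the device tree node):

#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/dma-heap.h>

static int alloc_from_chunk_heap(size_t len)
{
        struct dma_heap_allocation_data data = {
                .len = len,
                .fd_flags = O_RDWR | O_CLOEXEC,
        };
        int heap_fd, ret;

        heap_fd = open("/dev/dma_heap/chunk", O_RDONLY | O_CLOEXEC);
        if (heap_fd < 0)
                return -1;

        ret = ioctl(heap_fd, DMA_HEAP_IOCTL_ALLOC, &data);
        close(heap_fd);

        return ret < 0 ? -1 : (int)data.fd; /* dma-buf fd on success */
}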
This patchset is against next-20201110.
The patchset includes the following:
- cma_alloc_bulk API
- export dma-heap API to register kernel module dma heap.
- add chunk heap implementation.
- devicetree
Hyesoo Yu (3):
dma-buf: add export symbol for dma-heap
dma-buf: heaps: add chunk heap to dmabuf heaps
dma-heap: Devicetree binding for chunk heap
Minchan Kim (1):
mm: introduce cma_alloc_bulk API
.../bindings/dma-buf/chunk_heap.yaml | 52 ++
drivers/dma-buf/dma-heap.c | 2 +
drivers/dma-buf/heaps/Kconfig | 9 +
drivers/dma-buf/heaps/Makefile | 1 +
drivers/dma-buf/heaps/chunk_heap.c | 458 ++++++++++++++++++
include/linux/cma.h | 5 +
include/linux/page-isolation.h | 1 +
mm/cma.c | 129 ++++-
mm/page_alloc.c | 19 +-
mm/page_isolation.c | 3 +-
10 files changed, 666 insertions(+), 13 deletions(-)
create mode 100644 Documentation/devicetree/bindings/dma-buf/chunk_heap.yaml
create mode 100644 drivers/dma-buf/heaps/chunk_heap.c
--
2.29.2.299.gdc1121823c-goog
On Wed, Dec 9, 2020 at 10:32 AM Thomas Zimmermann <tzimmermann(a)suse.de> wrote:
>
> Hi
>
> Am 09.12.20 um 01:13 schrieb Daniel Vetter:
> > On Fri, Dec 04, 2020 at 09:47:08AM +0100, Christian König wrote:
> >> Am 04.12.20 um 09:32 schrieb Thomas Zimmermann:
> >>> Hi
> >>>
> >>> Am 03.12.20 um 21:41 schrieb Daniel Vetter:
> >>>> On Thu, Dec 03, 2020 at 07:59:04PM +0100, Thomas Zimmermann wrote:
> >>>>> Hi
> >>>>>
> >>>>> Am 03.12.20 um 16:26 schrieb Daniel Vetter:
> >>>>>> On Thu, Dec 03, 2020 at 03:02:59PM +0100, Thomas Zimmermann wrote:
> >>>>>>> Dma-buf's vmap and vunmap callbacks are undocumented and various
> >>>>>>> exporters currently have slightly different semantics for them. Add
> >>>>>>> documentation on how to implement and use these interfaces correctly.
> >>>>>>>
> >>>>>>> v2:
> >>>>>>> * document vmap semantics in struct dma_buf_ops
> >>>>>>> * add TODO item for reviewing and maybe fixing dma-buf exporters
> >>>>>>>
> >>>>>>> Signed-off-by: Thomas Zimmermann <tzimmermann(a)suse.de>
> >>>>>>
> >>>>>> I still don't think this works, we're breaking dma_buf_vmap
> >>>>>> for everyone
> >>>>>> else here.
> >>>>>
> >>>>> I removed the text on the importer. These notes for callers in
> >>>>> the docs are
> >>>>> more or less a consequence of the exporter semantics.
> >>>>
> >>>> Callers are importers, so I'm not seeing how that fixes anything.
> >>>>
> >>>>> I thought we at least agreed on the exporter semantics in the
> >>>>> other thread,
> >>>>> didn't we?
> >>>>>
> >>>>> What I'm trying to do is to write dome some rules for exporters,
> >>>>> even if not
> >>>>> all exporters follow them.
> >>>>
> >>>> This is a standard interface, everyone needs to follow the same
> >>>> rules. And
> >>>> if they change, we need to make sure nothing breaks and we're not
> >>>> creating
> >>>> issues.
> >>>>
> >>>> In the past the rule was the dma_buf_vmap was allowed to take the
> >>>> dma_resv_lock, and that the buffer should be pinned. Now some ttm
> >>>> drivers
> >>>> didn't ever bother with the pinning, and mostly got away with that
> >>>> because
> >>>> drm_prime helpers do the pinning by default at attach time, and most
> >>>> users
> >>>> do call dma_buf_attach.
> >>>>
> >>>> But if you look through dma-buf docs nothing ever said you have to do a
> >>>> dummy attachment before you call dma_buf_vmap, that's just slightly
> >>>> crappy
> >>>> implementations that didn't blow up yet.
> >>>
> >>> I had a patch for adding pin to radeon's implementation of vmap. [1]
> >>> Christian told me to not do this; instead just get the lock in the fbdev
> >>> code. His advise almost seems the opposite of what you're telling me
> >>> here.
> >>
> >> I think what Daniel suggests here is that we need to smoothly transition the
> >> code from making assumptions to having a straight interface where importers
> >> explicitly say when stuff is locked and when stuff is pinned.
> >>
> >> I've started this with the attach interface by adding a new dynamic approach
> >> to that, but you probably need to carry on here with that for vmap as well.
> >>
> >> When that is done we can migrate every exporter over to the new dynamic
> >> approach.
> >>
> >>>
> >>> For the GEM VRAM helpers, that implicit pin in vmap gave me headaches.
> >>> Because scanouts can only be done from VRAM, which is badly suited for
> >>> exporting. So I ended up with an implicit pin that pins the buffer to
> >>> whatever domain it currently is. I got away with it because GEM VRAM BOs
> >>> are not sharable among devices; fbdev is the only user of that
> >>> functionality and only pins for short periods of time.
> >>>
> >>> I suspect that fixing TTM-based drivers by adding an implicit pin would
> >>> result in a similar situation. Whatever domain it ends up pinning, some
> >>> functionality might not be compatible with that.
> >>
> >> Correct, exactly that's the problem.
> >>
> >>>
> >>>>
> >>>>> Given the circumstances, we should leave out this patch from the
> >>>>> patchset.
> >>>>
> >>>> So the defacto rules are already a big mess, but that's not a good
> >>>> excuse
> >>>> to make it worse.
> >>>>
> >>>> What I had in mind is that we split dma_buf_vmap up into two variants:
> >>>>
> >>>> - The current one, which should guarantee that the buffer is pinned.
> >>>> Because that's what all current callers wanted, before the fbdev code
> >>>> started allowing non-pinned buffers.
> >>>
> >>> Can we add an explicit pin operation to dma_buf_vmap() to enforce the
> >>> semantics?
> >>
> >> At least I would be fine with that. For now amdgpu is the only exporter who
> >> implements the explicit pin/unpin semantics anyway.
> >
> > Yup, I think that makes sense (if it works). Maybe we could use something
> > like:
> >
> > a) dma_buf pin exists, driver is dynamic. This means dma_buf_vmap needs to
> > first pin, then call ->vmap. dma_buf_vmap_local otoh can directly call
> > ->vmap since the exporter relies on either a pin or dma_resv_lock.
> >
> > b) dma_buf pin not implement, driver is a legacy pile. dma_buf_vmap will
> > pin (somewhere at least, or rely on some implicit pin), dma_buf_vmap_local
> > doesn't work and should fail.
>
> I think I read in the dma-buf documentation that pin is supposed to put
> the BO in a domain that is suitable for scanout. Now I don't really
> trust this to work. Amdgpu, radeon and nouveau put it into the GTT
> region. Qxl appears to put it wherever it is.
Uh that sounds wrong, or at least not the full complexity. ->pin's
main use right now is to freeze the dma-buf into system memory when
there's a non-dynamic attachment for a dynamic exporter. There's also
a dma_buf_pin call in amdgpu, and I think that only works for scanout
on integrated gpu. Pinning to vram would break sharing for all
existing dma-buf users.
Christian, can you perhaps explain when amdgpu uses dma_buf_pin()?
My take is the ->pin callback should clarify that it should pin into
system memory, for legacy (non-dynamic) dma-buf users. And
dma_buf_pin() should gain some kerneldoc which then states that "This
should only be used in limited use cases like scanout and not for
temporary pin operations." I think the problem with this kerneldoc is
simply that it tries to document the exporter and importer side of the
contract in one place and makes a mess of it, plus leaves the actual
importer side function undocumented. I think the kerneldoc also
predates the final patch version, and we didn't update it fully.
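For reference, the importer-side call under discussion looks roughly like
this (sketch only; the wrapper function is made up, and the matching
dma_buf_unpin() is reduced to a comment):

static int example_pin_for_scanout(struct dma_buf_attachment *attach)
{
        int ret;

        ret = dma_resv_lock(attach->dmabuf->resv, NULL);
        if (ret)
                return ret;

        /* Exporter freezes the buffer; for legacy importers this should
         * mean system memory. Released later via dma_buf_unpin(), again
         * under the reservation lock. */
        ret = dma_buf_pin(attach);

        dma_resv_unlock(attach->dmabuf->resv);
        return ret;
}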
> > I think for less transition work fbdev helpers could first try
> > dma_resv_lock + dma_buf_vmap_local, if that fails, drop the dma_resv_lock
> > and do the pinning dma_buf_vmap. That way we don't have to convert shmem
> > helpers over to dma_resv locking, which should help.
>
> I have meanwhile made a patchset that updates helpers for cma, shmem and
> vram with vmap_local; and converts fbdev emulation as well. It needs a
> bit more testing before being posted.
Nice, even better!
-Daniel
> Best regards
> Thomas
>
> >
> > And ttm drivers would do the new clean interface, so at least everyone
> > using dma_resv today is all fine. Intel's conversion to dma_resv lock is
> > in-flight, but that needs a conversion to the dynamic interface anyway,
> > the current code splats. And dynamic brings means explicit pin/unpin
> > callbacks, so should be good too.
> > -Daniel
> >
> >>
> >> Regards,
> >> Christian.
> >>
> >>>
> >>> Best regards
> >>> Thomas
> >>>
> >>> [1] https://patchwork.freedesktop.org/patch/400054/?series=83765&rev=1
> >>>
> >>>>
> >>>> - The new one, which allows vmapping with just dma_resv locked, and
> >>>> should
> >>>> have some caching in exporters.
> >>>>
> >>>> Breaking code and then adding todos about that is kinda not so cool
> >>>> approach here imo.
> >>>>
> >>>> Also I guess ttm_bo_vmap should have a check that either the buffer is
> >>>> pinned, or dma_resv_lock is held.
> >>>>
> >>>> Cheers, Daniel
> >>>>
> >>>>
> >>>>
> >>>>>
> >>>>> Best regards
> >>>>> Thomas
> >>>>>
> >>>>>>
> >>>>>>> ---
> >>>>>>> Documentation/gpu/todo.rst | 15 +++++++++++++
> >>>>>>> include/drm/drm_gem.h | 4 ++++
> >>>>>>> include/linux/dma-buf.h | 45
> >>>>>>> ++++++++++++++++++++++++++++++++++++++
> >>>>>>> 3 files changed, 64 insertions(+)
> >>>>>>>
> >>>>>>> diff --git a/Documentation/gpu/todo.rst b/Documentation/gpu/todo.rst
> >>>>>>> index 009d8e6c7e3c..32bb797a84fc 100644
> >>>>>>> --- a/Documentation/gpu/todo.rst
> >>>>>>> +++ b/Documentation/gpu/todo.rst
> >>>>>>> @@ -505,6 +505,21 @@ Contact: Thomas Zimmermann
> >>>>>>> <tzimmermann(a)suse.de>, Christian König, Daniel Vette
> >>>>>>> Level: Intermediate
> >>>>>>> +Enforce rules for dma-buf vmap and pin ops
> >>>>>>> +------------------------------------------
> >>>>>>> +
> >>>>>>> +Exporter implementations of vmap and pin in struct
> >>>>>>> dma_buf_ops (and consequently
> >>>>>>> +struct drm_gem_object_funcs) use a variety of locking
> >>>>>>> semantics. Some rely on
> >>>>>>> +the caller holding the dma-buf's reservation lock, some
> >>>>>>> do their own locking,
> >>>>>>> +some don't require any locking. VRAM helpers even used
> >>>>>>> to pin as part of vmap.
> >>>>>>> +
> >>>>>>> +We need to review each exporter and enforce the documented rules.
> >>>>>>> +
> >>>>>>> +Contact: Christian König, Daniel Vetter, Thomas
> >>>>>>> Zimmermann <tzimmermann(a)suse.de>
> >>>>>>> +
> >>>>>>> +Level: Advanced
> >>>>>>> +
> >>>>>>> +
> >>>>>>> Core refactorings
> >>>>>>> =================
> >>>>>>> diff --git a/include/drm/drm_gem.h b/include/drm/drm_gem.h
> >>>>>>> index 5e6daa1c982f..1864c6a721b1 100644
> >>>>>>> --- a/include/drm/drm_gem.h
> >>>>>>> +++ b/include/drm/drm_gem.h
> >>>>>>> @@ -138,6 +138,8 @@ struct drm_gem_object_funcs {
> >>>>>>> * drm_gem_dmabuf_vmap() helper.
> >>>>>>> *
> >>>>>>> * This callback is optional.
> >>>>>>> + *
> >>>>>>> + * See also struct dma_buf_ops.vmap
> >>>>>>> */
> >>>>>>> int (*vmap)(struct drm_gem_object *obj, struct
> >>>>>>> dma_buf_map *map);
> >>>>>>> @@ -148,6 +150,8 @@ struct drm_gem_object_funcs {
> >>>>>>> * drm_gem_dmabuf_vunmap() helper.
> >>>>>>> *
> >>>>>>> * This callback is optional.
> >>>>>>> + *
> >>>>>>> + * See also struct dma_buf_ops.vunmap
> >>>>>>> */
> >>>>>>> void (*vunmap)(struct drm_gem_object *obj, struct
> >>>>>>> dma_buf_map *map);
> >>>>>>> diff --git a/include/linux/dma-buf.h b/include/linux/dma-buf.h
> >>>>>>> index cf72699cb2bc..dc81fdc01dda 100644
> >>>>>>> --- a/include/linux/dma-buf.h
> >>>>>>> +++ b/include/linux/dma-buf.h
> >>>>>>> @@ -267,7 +267,52 @@ struct dma_buf_ops {
> >>>>>>> */
> >>>>>>> int (*mmap)(struct dma_buf *, struct vm_area_struct *vma);
> >>>>>>> + /**
> >>>>>>> + * @vmap:
> >>>>>>
> >>>>>> There's already a @vmap and @vunamp kerneldoc at the top comment, that
> >>>>>> needs to be removed.
> >>>>>> -Daniel
> >>>>>>
> >>>>>>> + *
> >>>>>>> + * Returns a virtual address for the buffer.
> >>>>>>> + *
> >>>>>>> + * Notes to callers:
> >>>>>>> + *
> >>>>>>> + * - Callers must hold the struct dma_buf.resv lock
> >>>>>>> before calling
> >>>>>>> + * this interface.
> >>>>>>> + *
> >>>>>>> + * - Callers must provide means to prevent the
> >>>>>>> mappings from going
> >>>>>>> + * stale, such as holding the reservation lock or providing a
> >>>>>>> + * move-notify callback to the exporter.
> >>>>>>> + *
> >>>>>>> + * Notes to implementors:
> >>>>>>> + *
> >>>>>>> + * - Implementations must expect pairs of @vmap and
> >>>>>>> @vunmap to be
> >>>>>>> + * called frequently and should optimize for this case.
> >>>>>>> + *
> >>>>>>> + * - Implementations should avoid additional operations, such as
> >>>>>>> + * pinning.
> >>>>>>> + *
> >>>>>>> + * - Implementations may expect the caller to hold the dma-buf's
> >>>>>>> + * reservation lock to protect against concurrent calls and
> >>>>>>> + * relocation.
> >>>>>>> + *
> >>>>>>> + * - Implementations may provide additional
> >>>>>>> guarantees, such as working
> >>>>>>> + * without holding the reservation lock.
> >>>>>>> + *
> >>>>>>> + * This callback is optional.
> >>>>>>> + *
> >>>>>>> + * Returns:
> >>>>>>> + *
> >>>>>>> + * 0 on success or a negative error code on failure.
> >>>>>>> + */
> >>>>>>> int (*vmap)(struct dma_buf *dmabuf, struct dma_buf_map *map);
> >>>>>>> +
> >>>>>>> + /**
> >>>>>>> + * @vunmap:
> >>>>>>> + *
> >>>>>>> + * Releases the address previously returned by @vmap.
> >>>>>>> + *
> >>>>>>> + * This callback is optional.
> >>>>>>> + *
> >>>>>>> + * See also @vmap()
> >>>>>>> + */
> >>>>>>> void (*vunmap)(struct dma_buf *dmabuf, struct
> >>>>>>> dma_buf_map *map);
> >>>>>>> };
> >>>>>>> --
> >>>>>>> 2.29.2
> >>>>>>>
> >>>>>>
> >>>>>
> >>>>> --
> >>>>> Thomas Zimmermann
> >>>>> Graphics Driver Developer
> >>>>> SUSE Software Solutions Germany GmbH
> >>>>> Maxfeldstr. 5, 90409 Nürnberg, Germany
> >>>>> (HRB 36809, AG Nürnberg)
> >>>>> Geschäftsführer: Felix Imendörffer
> >>>>>
> >>>>
> >>>>
> >>>>
> >>>>
> >>>
> >>
> >
>
> --
> Thomas Zimmermann
> Graphics Driver Developer
> SUSE Software Solutions Germany GmbH
> Maxfeldstr. 5, 90409 Nürnberg, Germany
> (HRB 36809, AG Nürnberg)
> Geschäftsführer: Felix Imendörffer
>
--
Daniel Vetter
Software Engineer, Intel Corporation
http://blog.ffwll.ch
Am 04.12.20 um 09:32 schrieb Thomas Zimmermann:
> Hi
>
> Am 03.12.20 um 21:41 schrieb Daniel Vetter:
>> On Thu, Dec 03, 2020 at 07:59:04PM +0100, Thomas Zimmermann wrote:
>>> Hi
>>>
>>> Am 03.12.20 um 16:26 schrieb Daniel Vetter:
>>>> On Thu, Dec 03, 2020 at 03:02:59PM +0100, Thomas Zimmermann wrote:
>>>>> Dma-buf's vmap and vunmap callbacks are undocumented and various
>>>>> exporters currently have slightly different semantics for them. Add
>>>>> documentation on how to implement and use these interfaces correctly.
>>>>>
>>>>> v2:
>>>>> * document vmap semantics in struct dma_buf_ops
>>>>> * add TODO item for reviewing and maybe fixing dma-buf exporters
>>>>>
>>>>> Signed-off-by: Thomas Zimmermann <tzimmermann(a)suse.de>
>>>>
>>>> I still don't think this works, we're breaking dma_buf_vmap for
>>>> everyone
>>>> else here.
>>>
>>> I removed the text on the importer. These notes for callers in the
>>> docs are
>>> more or less a consequence of the exporter semantics.
>>
>> Callers are importers, so I'm not seeing how that fixes anything.
>>
>>> I thought we at least agreed on the exporter semantics in the other
>>> thread,
>>> didn't we?
>>>
>>> What I'm trying to do is to write dome some rules for exporters,
>>> even if not
>>> all exporters follow them.
>>
>> This is a standard interface, everyone needs to follow the same
>> rules. And
>> if they change, we need to make sure nothing breaks and we're not
>> creating
>> issues.
>>
>> In the past the rule was the dma_buf_vmap was allowed to take the
>> dma_resv_lock, and that the buffer should be pinned. Now some ttm
>> drivers
>> didn't ever bother with the pinning, and mostly got away with that
>> because
>> drm_prime helpers do the pinning by default at attach time, and most
>> users
>> do call dma_buf_attach.
>>
>> But if you look through dma-buf docs nothing ever said you have to do a
>> dummy attachment before you call dma_buf_vmap, that's just slightly
>> crappy
>> implementations that didn't blow up yet.
>
> I had a patch for adding pin to radeon's implementation of vmap. [1]
> Christian told me to not do this; instead just get the lock in the
> fbdev code. His advise almost seems the opposite of what you're
> telling me here.
I think what Daniel suggests here is that we need to smoothly transition
the code from making assumptions to having a straight interface where
importers explicitly say when stuff is locked and when stuff is pinned.
I've started this with the attach interface by adding a new dynamic
approach to that, but you probably need to carry on here with that for
vmap as well.
When that is done we can migrate every exporter over to the new dynamic
approach.
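A compact sketch of that dynamic-attach side, for readers who haven't seen it
(the example names are made up; dma_buf_dynamic_attach() and the attach ops
are the existing interfaces being referred to):

/* A dynamic importer supplies a move_notify callback and from then on
 * explicitly takes the reservation lock around its mapping operations
 * instead of relying on an implicit pin at attach time. */
static void example_move_notify(struct dma_buf_attachment *attach)
{
        /* invalidate cached mappings; remap on next use under the resv lock */
}

static const struct dma_buf_attach_ops example_attach_ops = {
        .move_notify = example_move_notify,
};

static struct dma_buf_attachment *
example_dynamic_attach(struct dma_buf *dmabuf, struct device *dev, void *priv)
{
        return dma_buf_dynamic_attach(dmabuf, dev, &example_attach_ops, priv);
}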
>
> For the GEM VRAM helpers, that implicit pin in vmap gave me headaches.
> Because scanouts can only be done from VRAM, which is badly suited for
> exporting. So I ended up with an implicit pin that pins the buffer to
> whatever domain it currently is. I got away with it because GEM VRAM
> BOs are not sharable among devices; fbdev is the only user of that
> functionality and only pins for short periods of time.
>
> I suspect that fixing TTM-based drivers by adding an implicit pin
> would result in a similar situation. Whatever domain it ends up
> pinning, some functionality might not be compatible with that.
Correct, exactly that's the problem.
>
>>
>>> Given the circumstances, we should leave out this patch from the
>>> patchset.
>>
>> So the defacto rules are already a big mess, but that's not a good
>> excuse
>> to make it worse.
>>
>> What I had in mind is that we split dma_buf_vmap up into two variants:
>>
>> - The current one, which should guarantee that the buffer is pinned.
>> Because that's what all current callers wanted, before the fbdev code
>> started allowing non-pinned buffers.
>
> Can we add an explicit pin operation to dma_buf_vmap() to enforce the
> semantics?
At least I would be fine with that. For now amdgpu is the only exporter
who implements the explicit pin/unpin semantics anyway.
Regards,
Christian.
>
> Best regards
> Thomas
>
> [1] https://patchwork.freedesktop.org/patch/400054/?series=83765&rev=1
>
>>
>> - The new one, which allows vmapping with just dma_resv locked, and
>> should
>> have some caching in exporters.
>>
>> Breaking code and then adding todos about that is kinda not so cool
>> approach here imo.
>>
>> Also I guess ttm_bo_vmap should have a check that either the buffer is
>> pinned, or dma_resv_lock is held.
>>
>> Cheers, Daniel
>>
>>
>>
>>>
>>> Best regards
>>> Thomas
>>>
>>>>
>>>>> ---
>>>>> Documentation/gpu/todo.rst | 15 +++++++++++++
>>>>> include/drm/drm_gem.h | 4 ++++
>>>>> include/linux/dma-buf.h | 45
>>>>> ++++++++++++++++++++++++++++++++++++++
>>>>> 3 files changed, 64 insertions(+)
>>>>>
>>>>> diff --git a/Documentation/gpu/todo.rst b/Documentation/gpu/todo.rst
>>>>> index 009d8e6c7e3c..32bb797a84fc 100644
>>>>> --- a/Documentation/gpu/todo.rst
>>>>> +++ b/Documentation/gpu/todo.rst
>>>>> @@ -505,6 +505,21 @@ Contact: Thomas Zimmermann
>>>>> <tzimmermann(a)suse.de>, Christian König, Daniel Vette
>>>>> Level: Intermediate
>>>>> +Enforce rules for dma-buf vmap and pin ops
>>>>> +------------------------------------------
>>>>> +
>>>>> +Exporter implementations of vmap and pin in struct dma_buf_ops
>>>>> (and consequently
>>>>> +struct drm_gem_object_funcs) use a variety of locking semantics.
>>>>> Some rely on
>>>>> +the caller holding the dma-buf's reservation lock, some do their
>>>>> own locking,
>>>>> +some don't require any locking. VRAM helpers even used to pin as
>>>>> part of vmap.
>>>>> +
>>>>> +We need to review each exporter and enforce the documented rules.
>>>>> +
>>>>> +Contact: Christian König, Daniel Vetter, Thomas Zimmermann
>>>>> <tzimmermann(a)suse.de>
>>>>> +
>>>>> +Level: Advanced
>>>>> +
>>>>> +
>>>>> Core refactorings
>>>>> =================
>>>>> diff --git a/include/drm/drm_gem.h b/include/drm/drm_gem.h
>>>>> index 5e6daa1c982f..1864c6a721b1 100644
>>>>> --- a/include/drm/drm_gem.h
>>>>> +++ b/include/drm/drm_gem.h
>>>>> @@ -138,6 +138,8 @@ struct drm_gem_object_funcs {
>>>>> * drm_gem_dmabuf_vmap() helper.
>>>>> *
>>>>> * This callback is optional.
>>>>> + *
>>>>> + * See also struct dma_buf_ops.vmap
>>>>> */
>>>>> int (*vmap)(struct drm_gem_object *obj, struct dma_buf_map
>>>>> *map);
>>>>> @@ -148,6 +150,8 @@ struct drm_gem_object_funcs {
>>>>> * drm_gem_dmabuf_vunmap() helper.
>>>>> *
>>>>> * This callback is optional.
>>>>> + *
>>>>> + * See also struct dma_buf_ops.vunmap
>>>>> */
>>>>> void (*vunmap)(struct drm_gem_object *obj, struct
>>>>> dma_buf_map *map);
>>>>> diff --git a/include/linux/dma-buf.h b/include/linux/dma-buf.h
>>>>> index cf72699cb2bc..dc81fdc01dda 100644
>>>>> --- a/include/linux/dma-buf.h
>>>>> +++ b/include/linux/dma-buf.h
>>>>> @@ -267,7 +267,52 @@ struct dma_buf_ops {
>>>>> */
>>>>> int (*mmap)(struct dma_buf *, struct vm_area_struct *vma);
>>>>> + /**
>>>>> + * @vmap:
>>>>
>>>> There's already a @vmap and @vunamp kerneldoc at the top comment, that
>>>> needs to be removed.
>>>> -Daniel
>>>>
>>>>> + *
>>>>> + * Returns a virtual address for the buffer.
>>>>> + *
>>>>> + * Notes to callers:
>>>>> + *
>>>>> + * - Callers must hold the struct dma_buf.resv lock before
>>>>> calling
>>>>> + * this interface.
>>>>> + *
>>>>> + * - Callers must provide means to prevent the mappings from
>>>>> going
>>>>> + * stale, such as holding the reservation lock or providing a
>>>>> + * move-notify callback to the exporter.
>>>>> + *
>>>>> + * Notes to implementors:
>>>>> + *
>>>>> + * - Implementations must expect pairs of @vmap and @vunmap
>>>>> to be
>>>>> + * called frequently and should optimize for this case.
>>>>> + *
>>>>> + * - Implementations should avoid additional operations, such as
>>>>> + * pinning.
>>>>> + *
>>>>> + * - Implementations may expect the caller to hold the dma-buf's
>>>>> + * reservation lock to protect against concurrent calls and
>>>>> + * relocation.
>>>>> + *
>>>>> + * - Implementations may provide additional guarantees, such
>>>>> as working
>>>>> + * without holding the reservation lock.
>>>>> + *
>>>>> + * This callback is optional.
>>>>> + *
>>>>> + * Returns:
>>>>> + *
>>>>> + * 0 on success or a negative error code on failure.
>>>>> + */
>>>>> int (*vmap)(struct dma_buf *dmabuf, struct dma_buf_map *map);
>>>>> +
>>>>> + /**
>>>>> + * @vunmap:
>>>>> + *
>>>>> + * Releases the address previously returned by @vmap.
>>>>> + *
>>>>> + * This callback is optional.
>>>>> + *
>>>>> + * See also @vmap()
>>>>> + */
>>>>> void (*vunmap)(struct dma_buf *dmabuf, struct dma_buf_map
>>>>> *map);
>>>>> };
>>>>> --
>>>>> 2.29.2
>>>>>
>>>>
>>>
>>> --
>>> Thomas Zimmermann
>>> Graphics Driver Developer
>>> SUSE Software Solutions Germany GmbH
>>> Maxfeldstr. 5, 90409 Nürnberg, Germany
>>> (HRB 36809, AG Nürnberg)
>>> Geschäftsführer: Felix Imendörffer
>>>
>>
>>
>>
>>
>
On Thu, Dec 03, 2020 at 07:59:04PM +0100, Thomas Zimmermann wrote:
> Hi
>
> Am 03.12.20 um 16:26 schrieb Daniel Vetter:
> > On Thu, Dec 03, 2020 at 03:02:59PM +0100, Thomas Zimmermann wrote:
> > > Dma-buf's vmap and vunmap callbacks are undocumented and various
> > > exporters currently have slightly different semantics for them. Add
> > > documentation on how to implement and use these interfaces correctly.
> > >
> > > v2:
> > > * document vmap semantics in struct dma_buf_ops
> > > * add TODO item for reviewing and maybe fixing dma-buf exporters
> > >
> > > Signed-off-by: Thomas Zimmermann <tzimmermann(a)suse.de>
> >
> > I still don't think this works, we're breaking dma_buf_vmap for everyone
> > else here.
>
> I removed the text on the importer. These notes for callers in the docs are
> more or less a consequence of the exporter semantics.
Callers are importers, so I'm not seeing how that fixes anything.
> I thought we at least agreed on the exporter semantics in the other thread,
> didn't we?
>
> What I'm trying to do is to write dome some rules for exporters, even if not
> all exporters follow them.
This is a standard interface, everyone needs to follow the same rules. And
if they change, we need to make sure nothing breaks and we're not creating
issues.
In the past the rule was that dma_buf_vmap was allowed to take the
dma_resv_lock, and that the buffer should be pinned. Now some ttm drivers
didn't ever bother with the pinning, and mostly got away with that because
drm_prime helpers do the pinning by default at attach time, and most users
do call dma_buf_attach.
But if you look through dma-buf docs nothing ever said you have to do a
dummy attachment before you call dma_buf_vmap, that's just slightly crappy
implementations that didn't blow up yet.
> Given the circumstances, we should leave out this patch from the patchset.
So the de facto rules are already a big mess, but that's not a good excuse
to make it worse.
What I had in mind is that we split dma_buf_vmap up into two variants:
- The current one, which should guarantee that the buffer is pinned.
Because that's what all current callers wanted, before the fbdev code
started allowing non-pinned buffers.
- The new one, which allows vmapping with just dma_resv locked, and should
have some caching in exporters.
Breaking code and then adding todos about that is kinda not a cool
approach here imo.
Also I guess ttm_bo_vmap should have a check that either the buffer is
pinned, or dma_resv_lock is held.
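Something along these lines, perhaps (sketch only; the pin_count and resv
field names assume the current TTM code and may need adjusting):

/* At the top of ttm_bo_vmap(): the caller must either have pinned the
 * BO or hold its reservation lock. */
if (WARN_ON_ONCE(!bo->pin_count && !dma_resv_is_locked(bo->base.resv)))
        return -EINVAL;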
Cheers, Daniel
>
> Best regards
> Thomas
>
> >
> > > ---
> > > Documentation/gpu/todo.rst | 15 +++++++++++++
> > > include/drm/drm_gem.h | 4 ++++
> > > include/linux/dma-buf.h | 45 ++++++++++++++++++++++++++++++++++++++
> > > 3 files changed, 64 insertions(+)
> > >
> > > diff --git a/Documentation/gpu/todo.rst b/Documentation/gpu/todo.rst
> > > index 009d8e6c7e3c..32bb797a84fc 100644
> > > --- a/Documentation/gpu/todo.rst
> > > +++ b/Documentation/gpu/todo.rst
> > > @@ -505,6 +505,21 @@ Contact: Thomas Zimmermann <tzimmermann(a)suse.de>, Christian König, Daniel Vette
> > > Level: Intermediate
> > > +Enforce rules for dma-buf vmap and pin ops
> > > +------------------------------------------
> > > +
> > > +Exporter implementations of vmap and pin in struct dma_buf_ops (and consequently
> > > +struct drm_gem_object_funcs) use a variety of locking semantics. Some rely on
> > > +the caller holding the dma-buf's reservation lock, some do their own locking,
> > > +some don't require any locking. VRAM helpers even used to pin as part of vmap.
> > > +
> > > +We need to review each exporter and enforce the documented rules.
> > > +
> > > +Contact: Christian König, Daniel Vetter, Thomas Zimmermann <tzimmermann(a)suse.de>
> > > +
> > > +Level: Advanced
> > > +
> > > +
> > > Core refactorings
> > > =================
> > > diff --git a/include/drm/drm_gem.h b/include/drm/drm_gem.h
> > > index 5e6daa1c982f..1864c6a721b1 100644
> > > --- a/include/drm/drm_gem.h
> > > +++ b/include/drm/drm_gem.h
> > > @@ -138,6 +138,8 @@ struct drm_gem_object_funcs {
> > > * drm_gem_dmabuf_vmap() helper.
> > > *
> > > * This callback is optional.
> > > + *
> > > + * See also struct dma_buf_ops.vmap
> > > */
> > > int (*vmap)(struct drm_gem_object *obj, struct dma_buf_map *map);
> > > @@ -148,6 +150,8 @@ struct drm_gem_object_funcs {
> > > * drm_gem_dmabuf_vunmap() helper.
> > > *
> > > * This callback is optional.
> > > + *
> > > + * See also struct dma_buf_ops.vunmap
> > > */
> > > void (*vunmap)(struct drm_gem_object *obj, struct dma_buf_map *map);
> > > diff --git a/include/linux/dma-buf.h b/include/linux/dma-buf.h
> > > index cf72699cb2bc..dc81fdc01dda 100644
> > > --- a/include/linux/dma-buf.h
> > > +++ b/include/linux/dma-buf.h
> > > @@ -267,7 +267,52 @@ struct dma_buf_ops {
> > > */
> > > int (*mmap)(struct dma_buf *, struct vm_area_struct *vma);
> > > + /**
> > > + * @vmap:
> >
> > There's already a @vmap and @vunamp kerneldoc at the top comment, that
> > needs to be removed.
> > -Daniel
> >
> > > + *
> > > + * Returns a virtual address for the buffer.
> > > + *
> > > + * Notes to callers:
> > > + *
> > > + * - Callers must hold the struct dma_buf.resv lock before calling
> > > + * this interface.
> > > + *
> > > + * - Callers must provide means to prevent the mappings from going
> > > + * stale, such as holding the reservation lock or providing a
> > > + * move-notify callback to the exporter.
> > > + *
> > > + * Notes to implementors:
> > > + *
> > > + * - Implementations must expect pairs of @vmap and @vunmap to be
> > > + * called frequently and should optimize for this case.
> > > + *
> > > + * - Implementations should avoid additional operations, such as
> > > + * pinning.
> > > + *
> > > + * - Implementations may expect the caller to hold the dma-buf's
> > > + * reservation lock to protect against concurrent calls and
> > > + * relocation.
> > > + *
> > > + * - Implementations may provide additional guarantees, such as working
> > > + * without holding the reservation lock.
> > > + *
> > > + * This callback is optional.
> > > + *
> > > + * Returns:
> > > + *
> > > + * 0 on success or a negative error code on failure.
> > > + */
> > > int (*vmap)(struct dma_buf *dmabuf, struct dma_buf_map *map);
> > > +
> > > + /**
> > > + * @vunmap:
> > > + *
> > > + * Releases the address previously returned by @vmap.
> > > + *
> > > + * This callback is optional.
> > > + *
> > > + * See also @vmap()
> > > + */
> > > void (*vunmap)(struct dma_buf *dmabuf, struct dma_buf_map *map);
> > > };
> > > --
> > > 2.29.2
> > >
> >
>
> --
> Thomas Zimmermann
> Graphics Driver Developer
> SUSE Software Solutions Germany GmbH
> Maxfeldstr. 5, 90409 Nürnberg, Germany
> (HRB 36809, AG Nürnberg)
> Geschäftsführer: Felix Imendörffer
>
--
Daniel Vetter
Software Engineer, Intel Corporation
http://blog.ffwll.ch
On Thu, Dec 03, 2020 at 03:02:59PM +0100, Thomas Zimmermann wrote:
> Dma-buf's vmap and vunmap callbacks are undocumented and various
> exporters currently have slightly different semantics for them. Add
> documentation on how to implement and use these interfaces correctly.
>
> v2:
> * document vmap semantics in struct dma_buf_ops
> * add TODO item for reviewing and maybe fixing dma-buf exporters
>
> Signed-off-by: Thomas Zimmermann <tzimmermann(a)suse.de>
I still don't think this works, we're breaking dma_buf_vmap for everyone
else here.
> ---
> Documentation/gpu/todo.rst | 15 +++++++++++++
> include/drm/drm_gem.h | 4 ++++
> include/linux/dma-buf.h | 45 ++++++++++++++++++++++++++++++++++++++
> 3 files changed, 64 insertions(+)
>
> diff --git a/Documentation/gpu/todo.rst b/Documentation/gpu/todo.rst
> index 009d8e6c7e3c..32bb797a84fc 100644
> --- a/Documentation/gpu/todo.rst
> +++ b/Documentation/gpu/todo.rst
> @@ -505,6 +505,21 @@ Contact: Thomas Zimmermann <tzimmermann(a)suse.de>, Christian König, Daniel Vette
> Level: Intermediate
>
>
> +Enforce rules for dma-buf vmap and pin ops
> +------------------------------------------
> +
> +Exporter implementations of vmap and pin in struct dma_buf_ops (and consequently
> +struct drm_gem_object_funcs) use a variety of locking semantics. Some rely on
> +the caller holding the dma-buf's reservation lock, some do their own locking,
> +some don't require any locking. VRAM helpers even used to pin as part of vmap.
> +
> +We need to review each exporter and enforce the documented rules.
> +
> +Contact: Christian König, Daniel Vetter, Thomas Zimmermann <tzimmermann(a)suse.de>
> +
> +Level: Advanced
> +
> +
> Core refactorings
> =================
>
> diff --git a/include/drm/drm_gem.h b/include/drm/drm_gem.h
> index 5e6daa1c982f..1864c6a721b1 100644
> --- a/include/drm/drm_gem.h
> +++ b/include/drm/drm_gem.h
> @@ -138,6 +138,8 @@ struct drm_gem_object_funcs {
> * drm_gem_dmabuf_vmap() helper.
> *
> * This callback is optional.
> + *
> + * See also struct dma_buf_ops.vmap
> */
> int (*vmap)(struct drm_gem_object *obj, struct dma_buf_map *map);
>
> @@ -148,6 +150,8 @@ struct drm_gem_object_funcs {
> * drm_gem_dmabuf_vunmap() helper.
> *
> * This callback is optional.
> + *
> + * See also struct dma_buf_ops.vunmap
> */
> void (*vunmap)(struct drm_gem_object *obj, struct dma_buf_map *map);
>
> diff --git a/include/linux/dma-buf.h b/include/linux/dma-buf.h
> index cf72699cb2bc..dc81fdc01dda 100644
> --- a/include/linux/dma-buf.h
> +++ b/include/linux/dma-buf.h
> @@ -267,7 +267,52 @@ struct dma_buf_ops {
> */
> int (*mmap)(struct dma_buf *, struct vm_area_struct *vma);
>
> + /**
> + * @vmap:
There's already a @vmap and @vunmap kerneldoc at the top comment; that
needs to be removed.
-Daniel
> + *
> + * Returns a virtual address for the buffer.
> + *
> + * Notes to callers:
> + *
> + * - Callers must hold the struct dma_buf.resv lock before calling
> + * this interface.
> + *
> + * - Callers must provide means to prevent the mappings from going
> + * stale, such as holding the reservation lock or providing a
> + * move-notify callback to the exporter.
> + *
> + * Notes to implementors:
> + *
> + * - Implementations must expect pairs of @vmap and @vunmap to be
> + * called frequently and should optimize for this case.
> + *
> + * - Implementations should avoid additional operations, such as
> + * pinning.
> + *
> + * - Implementations may expect the caller to hold the dma-buf's
> + * reservation lock to protect against concurrent calls and
> + * relocation.
> + *
> + * - Implementations may provide additional guarantees, such as working
> + * without holding the reservation lock.
> + *
> + * This callback is optional.
> + *
> + * Returns:
> + *
> + * 0 on success or a negative error code on failure.
> + */
> int (*vmap)(struct dma_buf *dmabuf, struct dma_buf_map *map);
> +
> + /**
> + * @vunmap:
> + *
> + * Releases the address previously returned by @vmap.
> + *
> + * This callback is optional.
> + *
> + * See also @vmap()
> + */
> void (*vunmap)(struct dma_buf *dmabuf, struct dma_buf_map *map);
> };
>
> --
> 2.29.2
>
--
Daniel Vetter
Software Engineer, Intel Corporation
http://blog.ffwll.ch
On 03.12.20 12:47, Michal Hocko wrote:
> On Thu 03-12-20 10:47:02, David Hildenbrand wrote:
>> On 03.12.20 09:28, Michal Hocko wrote:
> [...]
>>> I think we should aim at easy and very highlevel behavior:
>>> - GFP_NOWAIT - unsupported currently IIRC but something that something
>>> that should be possible to implement. Isolation is non blocking,
>>> migration could be skipped
>>> - GFP_KERNEL - default behavior whatever that means
>>> - GFP_NORETRY - opportunistic allocation as lightweight as we can get.
>>> Failures to be expected also for transient reasons.
>>> - GFP_RETRY_MAYFAIL - try hard but not as hard as to trigger disruption
>>> (e.g. via oom killer).
>>
>> I think we currently see demand for 3 modes for alloc_contig_range()
>>
>> a) normal
>>
>> As is. Try, but don't try too hard. E.g., drain LRU, drain PCP, retry a
>> couple of times. Failures in some cases (short-term pinning, PCP races)
>> are still possible and acceptable.
>>
>> GFP_RETRY_MAYFAIL ?
>
> normal shouldn't really require anybody to think about gfp flags hard.
> That to most people really means GFP_KERNEL.
>
>> E.g., "Allocations with this flag may fail, but only when there is
>> genuinely little unused memory." - current description does not match at
>> all. When allocating ranges things behave completely different.
>>
>>
>> b) fast
>>
>> Try, but fail fast. Leave optimizations that can improve the result to
>> the caller. E.g., don't drain LRU, don't drain PCP, don't retry.
>> Frequent failures are expected and acceptable.
>>
>> __GFP_NORETRY ?
>>
>> E.g., "The VM implementation will try only very lightweight memory
>> direct reclaim to get some memory under memory pressure" - again, I
>> think current description does not really match.
>
> Agreed. As mentioned above this would be an opportunistic allocation
> mode.
>
>
>> c) hard
>>
>> Try hard, E.g., temporarily disabling the PCP. Certainly not
>> __GFP_NOFAIL, that would be highly dangerous. So no flags / GFP_KERNEL?
>
> NOFAIL semantic is out of question. Should we have a mode to try harder
> than the default? I dunno. Do we have users? I think RETRY_MAYFAIL is a
> middle ground between the default and NORETRY which is just too easy to
> fail. This is the case for the allocator as well. And from what I have
> seen people are already using MAYFAIL in order to prevent oom killer so
> this is a generally recognized pattern.
virtio-mem might be one user. It might first try in normal mode to get
as much memory out as possible, but switch to hard mode when it might
make sense.
>
>>> - __GFP_THIS_NODE - stick to a node without fallback
>>> - we can support zone modifiers although there is no existing user.
>>> - __GFP_NOWARN - obvious
>>>
>>> And that is it. Or maybe I am seeing that oversimplified.
>>>
>>
>> Again, I think most flags make sense for the migration target allocation
>> path and mainly deal with OOM situations and reclaim. For the migration
>> path - which is specific to the alloc_contig_range() allocater - they
>> don't really apply and create more confusion than they actually help - IMHO.
>
> Migration is really an implementation detail of this interface. You
> shouldn't be even thinking that there is a migration underneath not even
> mention to actually trying to control it.
CMA? I tend to agree.
alloc_contig_range? I disagree.
--
Thanks,
David / dhildenb
On 03.12.20 09:28, Michal Hocko wrote:
> On Wed 02-12-20 21:22:36, David Hildenbrand wrote:
>> On 02.12.20 20:26, Minchan Kim wrote:
>>> On Wed, Dec 02, 2020 at 07:51:07PM +0100, Michal Hocko wrote:
> [...]
>>>> I am still not sure a specific flag is a good interface. Really can this
>>>> be gfp_mask instead?
>>>
>>> I am not strong(even, I did it with GFP_NORETRY) but David wanted to
>>> have special mode and I agreed when he mentioned ALLOC_CONTIG_HARD as
>>> one of options in future(it would be hard to indicate that mode with
>>> gfp flags).
>>
>> I can't tell regarding the CMA interface, but for the alloc_contig()
>> interface I think modes make sense. Yes, it's different to other
>> allocaters, but the contig range allocater is different already. E.g.,
>> the CMA allocater mostly hides "which exact PFNs you try to allocate".
>
> Yes, alloc_contig_range is a low level API but it already has a gfp_mask
> parameter. Adding yet another allocation mode sounds like API
> convolution to me.
Well, if another parameter is a concern, we can introduce
alloc_contig_range_fast()
instead.
>
>> In the contig range allocater, gfp flags are currently used to express
>> how to allocate pages used as migration targets. I don't think mangling
>> in other gfp flags (or even overloading them) makes things a lot
>> clearer. E.g., GFP_NORETRY: don't retry to allocate migration targets?
>> don't retry to migrate pages? both?
>>
>> As I said, other aspects might be harder to model (e.g., don't drain
>> LRU) and hiding them behind generic gfp flags (e.g., GFP_NORETRY) feels
>> wrong.
>>
>> With the mode, we're expressing details for the necessary page
>> migration. Suggestions on how to model that are welcome.
>
> The question is whether the caller should really have such an intimate
> knowledge and control of the alloc_contig_range implementation. This all
> are implementation details. Should really anybody think about how many
> times migration retries or control LRU draining? Those can change in the
The question is not "how many times", rather "if at all". I can
understand the possible performance improvements by letting the caller
handle things (lru draining, pcp draining) like that when issuing
gazillions of alloc_contig_range() calls.
> future and I do not think we really want to go over all users grown over
> that time and try to deduce what was the intention behind.
That's why I think we need a clear mechanism to express the expected
behavior - something we can properly document and users can actually
understand to optimize for - and we can fix them up when the documented
behavior changes. Mangling this into somewhat-fitting gfp flags makes
the interface harder to use and more error-prone IMHO.
>
> I think we should aim at easy and very highlevel behavior:
> - GFP_NOWAIT - unsupported currently IIRC but something that something
> that should be possible to implement. Isolation is non blocking,
> migration could be skipped
> - GFP_KERNEL - default behavior whatever that means
> - GFP_NORETRY - opportunistic allocation as lightweight as we can get.
> Failures to be expected also for transient reasons.
> - GFP_RETRY_MAYFAIL - try hard but not as hard as to trigger disruption
> (e.g. via oom killer).
I think we currently see demand for 3 modes for alloc_contig_range()
a) normal
As is. Try, but don't try too hard. E.g., drain LRU, drain PCP, retry a
couple of times. Failures in some cases (short-term pinning, PCP races)
are still possible and acceptable.
GFP_RETRY_MAYFAIL ?
E.g., "Allocations with this flag may fail, but only when there is
genuinely little unused memory." - current description does not match at
all. When allocating ranges things behave completely different.
b) fast
Try, but fail fast. Leave optimizations that can improve the result to
the caller. E.g., don't drain LRU, don't drain PCP, don't retry.
Frequent failures are expected and acceptable.
__GFP_NORETRY ?
E.g., "The VM implementation will try only very lightweight memory
direct reclaim to get some memory under memory pressure" - again, I
think current description does not really match.
c) hard
Try hard, E.g., temporarily disabling the PCP. Certainly not
__GFP_NOFAIL, that would be highly dangerous. So no flags / GFP_KERNEL?
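Just to make the comparison concrete, the three modes at an
alloc_contig_range() call site under the gfp-flag proposal might look like
the fragments below (illustrative only; ret/start/end come from the
surrounding caller, and the exact flag mapping is what is being debated):

/* a) normal */
ret = alloc_contig_range(start, end, MIGRATE_CMA,
                         GFP_KERNEL | __GFP_RETRY_MAYFAIL);

/* b) fast: opportunistic, fail fast, skip draining/retries */
ret = alloc_contig_range(start, end, MIGRATE_CMA,
                         GFP_KERNEL | __GFP_NORETRY);

/* c) hard: try harder, but never trigger the OOM killer */
ret = alloc_contig_range(start, end, MIGRATE_CMA, GFP_KERNEL);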
>
> - __GFP_THIS_NODE - stick to a node without fallback
> - we can support zone modifiers although there is no existing user.
> - __GFP_NOWARN - obvious
>
> And that is it. Or maybe I am seeing that oversimplified.
>
Again, I think most flags make sense for the migration target allocation
path and mainly deal with OOM situations and reclaim. For the migration
path - which is specific to the alloc_contig_range() allocater - they
don't really apply and create more confusion than they actually help - IMHO.
--
Thanks,
David / dhildenb
On Wed, Dec 02, 2020 at 07:51:07PM +0100, Michal Hocko wrote:
> On Wed 02-12-20 09:54:29, Minchan Kim wrote:
> > On Wed, Dec 02, 2020 at 05:48:34PM +0100, Michal Hocko wrote:
> > > On Wed 02-12-20 08:15:49, Minchan Kim wrote:
> > > > On Wed, Dec 02, 2020 at 04:49:15PM +0100, Michal Hocko wrote:
> > > [...]
> > > > > Well, what I can see is that this new interface is an antipatern to our
> > > > > allocation routines. We tend to control allocations by gfp mask yet you
> > > > > are introducing a bool parameter to make something faster... What that
> > > > > really means is rather arbitrary. Would it make more sense to teach
> > > > > cma_alloc resp. alloc_contig_range to recognize GFP_NOWAIT, GFP_NORETRY resp.
> > > > > GFP_RETRY_MAYFAIL instead?
> > > >
> > > > If we use cma_alloc, that interface requires "allocate one big memory
> > > > chunk". IOW, return value is just struct page and expected that the page
> > > > is a big contiguos memory. That means it couldn't have a hole in the
> > > > range.
> > > > However the idea here, what we asked is much smaller chunk rather
> > > > than a big contiguous memory so we could skip some of pages if they are
> > > > randomly pinned(long-term/short-term whatever) and search other pages
> > > > in the CMA area to avoid long stall. Thus, it couldn't work with exising
> > > > cma_alloc API with simple gfp_mak.
> > >
> > > I really do not see that as something really alient to the cma_alloc
> > > interface. All you should care about, really, is what size of the object
> > > you want and how hard the system should try. If you have a problem with
> > > an internal implementation of CMA and how it chooses a range and deal
> > > with pinned pages then it should be addressed inside the CMA allocator.
> > > I suspect that you are effectivelly trying to workaround those problems
> > > by a side implementation with a slightly different API. Or maybe I still
> > > do not follow the actual problem.
> > >
> > > > > I am not deeply familiar with the cma allocator so sorry for a
> > > > > potentially stupid question. Why does a bulk interface perform better
> > > > > than repeated calls to cma_alloc? Is this because a failure would help
> > > > > to move on to the next pfn range while a repeated call would have to
> > > > > deal with the same range?
> > > >
> > > > Yup, true, along with other overheads (e.g., migration retries, waiting
> > > > for writeback, PCP/LRU draining IPIs).
> > >
> > > Why cannot this be implemented in the cma_alloc layer? I mean you can
> > > cache failed cases and optimize the proper pfn range search.
> >
> > So do you suggest this?
> >
> > enum cma_alloc_mode {
> > CMA_ALLOC_NORMAL,
> > CMA_ALLOC_FAIL_FAST,
> > };
> >
> > struct page *cma_alloc(struct cma *cma, size_t count, unsigned int
> > align, enum cma_alloc_mode mode);
> >
> > From now on, cma_alloc will keep the last failed pfn and then start the
> > search from the next pfn, for both CMA_ALLOC_NORMAL and
> > CMA_ALLOC_FAIL_FAST, as long as the requested size still fits within the
> > CMA area from the cached pfn, wrapping around if it couldn't find the
> > right pages from the cached pfn. Otherwise, the cached pfn will be reset
> > to zero so that the search starts from 0. I like the idea since it's a
> > general improvement, I think.
>
> Yes something like that. There are more options to be clever here - e.g.
> track ranges etc. but I am not sure this is worth the complexity.
Agreed. Just caching the last pfn would be good enough as a simple start.
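
A minimal sketch of that caching, assuming a per-area hint; the struct and
helper names below are made up for illustration and are not the mm/cma.c
code:

struct cma_area_hint {
	unsigned long base_pfn;	/* first pfn of the CMA area */
	unsigned long nr_pages;	/* size of the area in pages */
	unsigned long next_pfn;	/* cached start for the next search */
};

/* Pick the pfn to start searching from for a request of 'count' pages. */
static unsigned long cma_search_start(struct cma_area_hint *hint,
				      unsigned long count)
{
	if (hint->next_pfn + count > hint->base_pfn + hint->nr_pages)
		hint->next_pfn = hint->base_pfn;	/* wrap around */
	return hint->next_pfn;
}

/* On failure, remember where to continue so the next call skips this range. */
static void cma_search_failed(struct cma_area_hint *hint,
			      unsigned long failed_pfn)
{
	hint->next_pfn = failed_pfn + 1;
}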
>
> > Furthermore, with CMA_ALLOC_FAIL_FAST, it could avoid several overheads
> > at the cost of sacrificing the allocation success ratio, like GFP_NORETRY.
>
> I am still not sure a specific flag is a good interface. Really can this
> be gfp_mask instead?
I don't feel strongly about it (in fact, I did it with GFP_NORETRY at
first), but David wanted to have a special mode, and I agreed when he
mentioned ALLOC_CONTIG_HARD as one of the options for the future (it would
be hard to indicate that mode with gfp flags).
>
> > I think that would solve the issue with making the API more flexible.
> > Before diving into it, I'd like to confirm we are on the same page.
> > Please correct me if I misunderstood.
>
> I am not sure you are still thinking about a bulk interface.
No, I am thinking of just using the cma_alloc API with the cached pfn as
an internal improvement, and adding a new fail-fast mode to the API so the
driver could call the API repeatedly until it can get enough pages.
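
As a hedged sketch of that usage, built on the cma_alloc() signature and
CMA_ALLOC_* modes proposed above (not the current kernel interface); error
unwinding and locking are omitted:

static int gather_chunks(struct cma *cma, struct page **pages,
			 size_t nr_needed, unsigned int order)
{
	size_t nr = 0;

	/* First pass: fail fast, cheaply skipping problematic pageblocks. */
	while (nr < nr_needed) {
		struct page *page = cma_alloc(cma, 1 << order, order,
					      CMA_ALLOC_FAIL_FAST);
		if (!page)
			break;
		pages[nr++] = page;
	}

	/* Second pass: pay the draining/retry cost for whatever is left. */
	while (nr < nr_needed) {
		struct page *page = cma_alloc(cma, 1 << order, order,
					      CMA_ALLOC_NORMAL);
		if (!page)
			return -ENOMEM;	/* caller releases pages[0..nr) */
		pages[nr++] = page;
	}

	return 0;
}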
On Wed, Dec 02, 2020 at 05:48:34PM +0100, Michal Hocko wrote:
> On Wed 02-12-20 08:15:49, Minchan Kim wrote:
> > On Wed, Dec 02, 2020 at 04:49:15PM +0100, Michal Hocko wrote:
> [...]
> > > Well, what I can see is that this new interface is an antipattern to our
> > > allocation routines. We tend to control allocations by gfp mask yet you
> > > are introducing a bool parameter to make something faster... What that
> > > really means is rather arbitrary. Would it make more sense to teach
> > > cma_alloc resp. alloc_contig_range to recognize GFP_NOWAIT, GFP_NORETRY resp.
> > > GFP_RETRY_MAYFAIL instead?
> >
> > If we use cma_alloc, that interface requires "allocate one big memory
> > chunk". IOW, the return value is just a struct page, and it is expected
> > that the page is one big contiguous chunk of memory. That means it
> > couldn't have a hole in the range.
> > However, the idea here is that what we ask for is a much smaller chunk
> > rather than one big contiguous block, so we could skip some pages if
> > they are randomly pinned (long-term/short-term, whatever) and search for
> > other pages in the CMA area to avoid a long stall. Thus, it couldn't
> > work with the existing cma_alloc API with a simple gfp_mask.
>
> I really do not see that as something really alien to the cma_alloc
> interface. All you should care about, really, is what size of the object
> you want and how hard the system should try. If you have a problem with
> an internal implementation of CMA and how it chooses a range and deals
> with pinned pages then it should be addressed inside the CMA allocator.
> I suspect that you are effectively trying to work around those problems
> by a side implementation with a slightly different API. Or maybe I still
> do not follow the actual problem.
>
> > > I am not deeply familiar with the cma allocator so sorry for a
> > > potentially stupid question. Why does a bulk interface perform better
> > > than repeated calls to cma_alloc? Is this because a failure would help
> > > to move on to the next pfn range while a repeated call would have to
> > > deal with the same range?
> >
> > Yup, true, along with other overheads (e.g., migration retries, waiting
> > for writeback, PCP/LRU draining IPIs).
>
> Why cannot this be implemented in the cma_alloc layer? I mean you can
> cache failed cases and optimize the proper pfn range search.
So do you suggest this?
enum cma_alloc_mode {
CMA_ALLOC_NORMAL,
CMA_ALLOC_FAIL_FAST,
};
struct page *cma_alloc(struct cma *cma, size_t count, unsigned int
align, enum cma_alloc_mode mode);
From now on, cma_alloc will keep the last failed pfn and then start the
search from the next pfn, for both CMA_ALLOC_NORMAL and
CMA_ALLOC_FAIL_FAST, as long as the requested size still fits within the
CMA area from the cached pfn, wrapping around if it couldn't find the
right pages from the cached pfn. Otherwise, the cached pfn will be reset
to zero so that the search starts from 0. I like the idea since it's a
general improvement, I think.
Furthermore, with CMA_ALLOC_FAIL_FAST, it could avoid several overheads
at the cost of sacrificing the allocation success ratio, like GFP_NORETRY.
I think that would solve the issue with making the API more flexible.
Before diving into it, I'd like to confirm we are on the same page.
Please correct me if I misunderstood.
David, any objection?
On Wed, Dec 02, 2020 at 04:49:15PM +0100, Michal Hocko wrote:
> On Wed 02-12-20 10:14:41, David Hildenbrand wrote:
> > On 01.12.20 18:51, Minchan Kim wrote:
> > > There is a need from special HW for bulk allocation of
> > > high-order pages. For example, 4800 * order-4 pages would be
> > > the minimum; sometimes it requires more.
> > >
> > > To meet the requirement, one option reserves a 300M CMA area and
> > > requests the whole 300M as contiguous memory. However, it doesn't
> > > work if even one of those pages in the range is long-term pinned,
> > > directly or indirectly. The other option is to ask for a higher-order
> >
> > My latest knowledge is that pages in the CMA area are never long term
> > pinned.
> >
> > https://lore.kernel.org/lkml/20201123090129.GD27488@dhcp22.suse.cz/
> >
> > "gup already tries to deal with long term pins on CMA regions and migrate
> > to a non CMA region. Have a look at __gup_longterm_locked."
> >
> > We should rather identify ways how that is still possible and get rid of
> > them.
> >
> >
> > Now, short-term pinnings and PCP are other issues where
> > alloc_contig_range() could be improved (e.g., in contrast to a FAST
> > mode, a HARD mode which temporarily disables the PCP, ...).
>
> Agreed!
>
> > > size (e.g., 2M) than the requested order (64K) repeatedly until the
> > > driver could gather the necessary amount of memory. Basically, this
> > > approach makes the allocation very slow due to cma_alloc's slowness,
> > > and it could get stuck on one of the pageblocks if it encounters an
> > > unmigratable page.
> > >
> > > To solve the issue, this patch introduces cma_alloc_bulk.
> > >
> > > int cma_alloc_bulk(struct cma *cma, unsigned int align,
> > > bool fast, unsigned int order, size_t nr_requests,
> > > struct page **page_array, size_t *nr_allocated);
> > >
> > > Most parameters are the same as for cma_alloc, but it additionally
> > > passes a vector array to store the allocated memory. What's different
> > > from cma_alloc is that it will skip pageblocks without waiting/stopping
> > > if they have an unmovable page, so the API continues to scan other
> > > pageblocks to find pages of the requested order.
> > >
> > > cma_alloc_bulk is a best-effort approach in that, unlike cma_alloc, it
> > > skips pageblocks if they have unmovable pages. It doesn't need to be
> > > perfect from the beginning, in exchange for performance. Thus, the API
> > > takes a "bool fast" parameter, which is propagated into
> > > alloc_contig_range to avoid the significant-overhead operations used to
> > > increase the CMA allocation success ratio (e.g., migration retries, PCP
> > > and LRU draining per pageblock), at the cost of a lower allocation
> > > success ratio. If the caller couldn't allocate enough, they can call it
> > > again with "false" to increase the success ratio, if they are okay with
> > > paying that overhead.
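
(As a hedged illustration of the description above, not code from the
series: a caller could do a fast pass first and a normal pass for the
remainder, using the proposed cma_alloc_bulk() signature and the example
numbers from the commit message. The return-value and partial-result
handling here is schematic, and order 4 / align 0 are just the example
values.)

static int chunk_fill(struct cma *cma, struct page **pages, size_t nr_total)
{
	size_t done = 0, got = 0;
	int ret;

	/* Fast pass: skip pageblocks with unmovable pages, no draining. */
	ret = cma_alloc_bulk(cma, 0, true, 4, nr_total, pages, &got);
	if (ret && !got)
		return ret;
	done = got;

	if (done < nr_total) {
		/* Slow pass: accept the draining/retry cost for the rest. */
		ret = cma_alloc_bulk(cma, 0, false, 4, nr_total - done,
				     pages + done, &got);
		done += got;
	}

	return done == nr_total ? 0 : -ENOMEM;
}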
> >
> > Just so I understand what the idea is:
> >
> > alloc_contig_range() sometimes fails on CMA regions when trying to
> > allocate big chunks (e.g., 300M). Instead of tackling that issue, you
> > rather allocate plenty of small chunks, and make these small allocations
> > fail faster/ make the allocations less reliable. Correct?
> >
> > I don't really have a strong opinion on that. Giving up fast rather than
> > trying for longer sounds like a useful thing to have - but I wonder if
> > it's strictly necessary for the use case you describe.
> >
> > I'd like to hear Michals opinion on that.
>
> Well, what I can see is that this new interface is an antipattern to our
> allocation routines. We tend to control allocations by gfp mask yet you
> are introducing a bool parameter to make something faster... What that
> really means is rather arbitrary. Would it make more sense to teach
> cma_alloc resp. alloc_contig_range to recognize GFP_NOWAIT, GFP_NORETRY resp.
> GFP_RETRY_MAYFAIL instead?
If we use cma_alloc, that interface requires "allocate one big memory
chunk". IOW, the return value is just a struct page, and it is expected
that the page is one big contiguous chunk of memory. That means it
couldn't have a hole in the range. However, the idea here is that what we
ask for is a much smaller chunk rather than one big contiguous block, so
we could skip some pages if they are randomly pinned (long-term/short-term,
whatever) and search for other pages in the CMA area to avoid a long stall.
Thus, it couldn't work with the existing cma_alloc API with a simple gfp_mask.
>
> I am not deeply familiar with the cma allocator so sorry for a
> potentially stupid question. Why does a bulk interface perform better
> than repeated calls to cma_alloc? Is this because a failure would help
> to move on to the next pfn range while a repeated call would have to
> deal with the same range?
Yup, true, along with other overheads (e.g., migration retries, waiting
for writeback, PCP/LRU draining IPIs).
>
> > > Signed-off-by: Minchan Kim <minchan(a)kernel.org>
> > > ---
> > > include/linux/cma.h | 5 ++
> > > include/linux/gfp.h | 2 +
> > > mm/cma.c | 126 ++++++++++++++++++++++++++++++++++++++++++--
> > > mm/page_alloc.c | 19 ++++---
> > > 4 files changed, 140 insertions(+), 12 deletions(-)
> > >
> --
> Michal Hocko
> SUSE Labs
On 02.12.20 16:49, Michal Hocko wrote:
> On Wed 02-12-20 10:14:41, David Hildenbrand wrote:
>> On 01.12.20 18:51, Minchan Kim wrote:
>>> There is a need from special HW for bulk allocation of
>>> high-order pages. For example, 4800 * order-4 pages would be
>>> the minimum; sometimes it requires more.
>>>
>>> To meet the requirement, one option reserves a 300M CMA area and
>>> requests the whole 300M as contiguous memory. However, it doesn't
>>> work if even one of those pages in the range is long-term pinned,
>>> directly or indirectly. The other option is to ask for a higher-order
>>
>> My latest knowledge is that pages in the CMA area are never long term
>> pinned.
>>
>> https://lore.kernel.org/lkml/20201123090129.GD27488@dhcp22.suse.cz/
>>
>> "gup already tries to deal with long term pins on CMA regions and migrate
>> to a non CMA region. Have a look at __gup_longterm_locked."
>>
>> We should rather identify ways how that is still possible and get rid of
>> them.
>>
>>
>> Now, short-term pinnings and PCP are other issues where
>> alloc_contig_range() could be improved (e.g., in contrast to a FAST
>> mode, a HARD mode which temporarily disables the PCP, ...).
>
> Agreed!
>
>>> size (e.g., 2M) than the requested order (64K) repeatedly until the
>>> driver could gather the necessary amount of memory. Basically, this
>>> approach makes the allocation very slow due to cma_alloc's slowness,
>>> and it could get stuck on one of the pageblocks if it encounters an
>>> unmigratable page.
>>>
>>> To solve the issue, this patch introduces cma_alloc_bulk.
>>>
>>> int cma_alloc_bulk(struct cma *cma, unsigned int align,
>>> bool fast, unsigned int order, size_t nr_requests,
>>> struct page **page_array, size_t *nr_allocated);
>>>
>>> Most parameters are the same as for cma_alloc, but it additionally
>>> passes a vector array to store the allocated memory. What's different
>>> from cma_alloc is that it will skip pageblocks without waiting/stopping
>>> if they have an unmovable page, so the API continues to scan other
>>> pageblocks to find pages of the requested order.
>>>
>>> cma_alloc_bulk is a best-effort approach in that, unlike cma_alloc, it
>>> skips pageblocks if they have unmovable pages. It doesn't need to be
>>> perfect from the beginning, in exchange for performance. Thus, the API
>>> takes a "bool fast" parameter, which is propagated into
>>> alloc_contig_range to avoid the significant-overhead operations used to
>>> increase the CMA allocation success ratio (e.g., migration retries, PCP
>>> and LRU draining per pageblock), at the cost of a lower allocation
>>> success ratio. If the caller couldn't allocate enough, they can call it
>>> again with "false" to increase the success ratio, if they are okay with
>>> paying that overhead.
>>
>> Just so I understand what the idea is:
>>
>> alloc_contig_range() sometimes fails on CMA regions when trying to
>> allocate big chunks (e.g., 300M). Instead of tackling that issue, you
>> rather allocate plenty of small chunks, and make these small allocations
>> fail faster/ make the allocations less reliable. Correct?
>>
>> I don't really have a strong opinion on that. Giving up fast rather than
>> trying for longer sounds like a useful thing to have - but I wonder if
>> it's strictly necessary for the use case you describe.
>>
>> I'd like to hear Michals opinion on that.
>
> Well, what I can see is that this new interface is an antipattern to our
> allocation routines. We tend to control allocations by gfp mask yet you
> are introducing a bool parameter to make something faster... What that
> really means is rather arbitrary. Would it make more sense to teach
> cma_alloc resp. alloc_contig_range to recognize GFP_NOWAIT, GFP_NORETRY resp.
> GFP_RETRY_MAYFAIL instead?
Minchan did that before, but I disliked gluing things like "don't drain
lru, don't drain pcp" to GFP_NORETRY and shifting responsibility to the
user.
--
Thanks,
David / dhildenb
This patchset introduces a new dma heap, the chunk heap, that makes it
easy to perform bulk allocation of high-order pages.
It has been created to help optimize 4K/8K HDR video playback
with secure DRM HW that protects contents in memory. The HW needs
physically contiguous memory chunks of up to several hundred MB.
This patchset is against next-20201130.
The patchset includes the following:
- cma_alloc_bulk API
- export dma-heap API to register kernel module dma heap.
- add chunk heap implementation.
* Since v1 -
https://lore.kernel.org/linux-mm/20201117181935.3613581-1-minchan@kernel.or…
* introduce alloc_contig_mode - David
* use default CMA instead of device tree - John
Hyesoo Yu (2):
dma-buf: add export symbol for dma-heap
dma-buf: heaps: add chunk heap to dmabuf heaps
Minchan Kim (2):
mm: introduce alloc_contig_mode
mm: introduce cma_alloc_bulk API
drivers/dma-buf/dma-heap.c | 2 +
drivers/dma-buf/heaps/Kconfig | 15 +
drivers/dma-buf/heaps/Makefile | 1 +
drivers/dma-buf/heaps/chunk_heap.c | 429 +++++++++++++++++++++++++++++
drivers/virtio/virtio_mem.c | 2 +-
include/linux/cma.h | 5 +
include/linux/gfp.h | 10 +-
kernel/dma/contiguous.c | 1 +
mm/cma.c | 134 ++++++++-
mm/page_alloc.c | 25 +-
10 files changed, 607 insertions(+), 17 deletions(-)
create mode 100644 drivers/dma-buf/heaps/chunk_heap.c
--
2.29.2.454.gaff20da3a2-goog
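
For readers unfamiliar with the second item in the cover letter: once
dma_heap_add() is exported, a modular heap can register itself roughly like
this. This is only a sketch against the existing dma-heap interface; the
chunk heap's own ops and CMA setup are omitted and the names below are
placeholders:

#include <linux/dma-heap.h>
#include <linux/err.h>
#include <linux/module.h>

/* Allocation callbacks for the heap; .allocate etc. omitted in this sketch. */
static const struct dma_heap_ops chunk_example_ops = {
};

static int __init chunk_example_init(void)
{
	struct dma_heap_export_info exp_info = {
		.name = "chunk-example",
		.ops = &chunk_example_ops,
		.priv = NULL,
	};
	struct dma_heap *heap;

	/* Only possible from a module once dma_heap_add() is exported. */
	heap = dma_heap_add(&exp_info);
	return PTR_ERR_OR_ZERO(heap);
}
module_init(chunk_example_init);
MODULE_LICENSE("GPL");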
On Tue, Nov 24, 2020 at 2:45 PM Lee Jones <lee.jones(a)linaro.org> wrote:
>
> Fixes the following W=1 kernel build warning(s):
>
> drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c:403: warning: Function parameter or member 'job' not described in 'sdma_v5_0_ring_emit_ib'
> drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c:403: warning: Function parameter or member 'flags' not described in 'sdma_v5_0_ring_emit_ib'
> drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c:480: warning: Function parameter or member 'addr' not described in 'sdma_v5_0_ring_emit_fence'
> drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c:480: warning: Function parameter or member 'seq' not described in 'sdma_v5_0_ring_emit_fence'
> drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c:480: warning: Function parameter or member 'flags' not described in 'sdma_v5_0_ring_emit_fence'
> drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c:480: warning: Excess function parameter 'fence' description in 'sdma_v5_0_ring_emit_fence'
> drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c:967: warning: Function parameter or member 'timeout' not described in 'sdma_v5_0_ring_test_ib'
> drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c:1074: warning: Function parameter or member 'value' not described in 'sdma_v5_0_vm_write_pte'
> drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c:1074: warning: Excess function parameter 'addr' description in 'sdma_v5_0_vm_write_pte'
> drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c:1074: warning: Excess function parameter 'flags' description in 'sdma_v5_0_vm_write_pte'
> drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c:1126: warning: Function parameter or member 'ring' not described in 'sdma_v5_0_ring_pad_ib'
> drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c:1180: warning: Function parameter or member 'vmid' not described in 'sdma_v5_0_ring_emit_vm_flush'
> drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c:1180: warning: Function parameter or member 'pd_addr' not described in 'sdma_v5_0_ring_emit_vm_flush'
> drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c:1180: warning: Excess function parameter 'vm' description in 'sdma_v5_0_ring_emit_vm_flush'
> drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c:1703: warning: Function parameter or member 'ib' not described in 'sdma_v5_0_emit_copy_buffer'
> drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c:1703: warning: Function parameter or member 'tmz' not described in 'sdma_v5_0_emit_copy_buffer'
> drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c:1703: warning: Excess function parameter 'ring' description in 'sdma_v5_0_emit_copy_buffer'
> drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c:1729: warning: Function parameter or member 'ib' not described in 'sdma_v5_0_emit_fill_buffer'
> drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c:1729: warning: Excess function parameter 'ring' description in 'sdma_v5_0_emit_fill_buffer'
>
> Cc: Alex Deucher <alexander.deucher(a)amd.com>
> Cc: "Christian König" <christian.koenig(a)amd.com>
> Cc: David Airlie <airlied(a)linux.ie>
> Cc: Daniel Vetter <daniel(a)ffwll.ch>
> Cc: Sumit Semwal <sumit.semwal(a)linaro.org>
> Cc: amd-gfx(a)lists.freedesktop.org
> Cc: dri-devel(a)lists.freedesktop.org
> Cc: linux-media(a)vger.kernel.org
> Cc: linaro-mm-sig(a)lists.linaro.org
> Signed-off-by: Lee Jones <lee.jones(a)linaro.org>
Applied with minor fixes. Thanks!
Alex
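
For reference, the warnings above come from kernel-doc comments whose
@parameter lines no longer match the function signature; a complete block
has exactly one @name line per parameter, roughly like this (made-up
function for illustration, not taken from the patch):

/**
 * example_ring_emit_fence - emit a fence on the DMA ring
 * @ring: amdgpu ring pointer
 * @addr: GPU address to write the sequence number to
 * @seq: sequence number to write
 * @flags: fence related flags
 *
 * A missing @name line triggers "Function parameter or member ... not
 * described"; a leftover line for a removed parameter triggers "Excess
 * function parameter ... description".
 */
void example_ring_emit_fence(struct amdgpu_ring *ring, u64 addr,
			     u64 seq, unsigned int flags);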
> ---
> drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c | 19 +++++++++++++------
> 1 file changed, 13 insertions(+), 6 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
> index 9c72b95b74639..5180a52a79a54 100644
> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
> @@ -392,7 +392,9 @@ static void sdma_v5_0_ring_insert_nop(struct amdgpu_ring *ring, uint32_t count)
> * sdma_v5_0_ring_emit_ib - Schedule an IB on the DMA engine
> *
> * @ring: amdgpu ring pointer
> + * @job: job to retrieve vmid from
> * @ib: IB object to schedule
> + * @flags: unused
> *
> * Schedule an IB in the DMA ring (NAVI10).
> */
> @@ -469,7 +471,9 @@ static void sdma_v5_0_ring_emit_hdp_flush(struct amdgpu_ring *ring)
> * sdma_v5_0_ring_emit_fence - emit a fence on the DMA ring
> *
> * @ring: amdgpu ring pointer
> - * @fence: amdgpu fence object
> + * @addr: address
> + * @seq: sequence number
> + * @flags: fence related flags
> *
> * Add a DMA fence packet to the ring to write
> * the fence seq number and DMA trap packet to generate
> @@ -959,6 +963,7 @@ static int sdma_v5_0_ring_test_ring(struct amdgpu_ring *ring)
> * sdma_v5_0_ring_test_ib - test an IB on the DMA engine
> *
> * @ring: amdgpu_ring structure holding ring information
> + * @timeout: timeout value in jiffies, or MAX_SCHEDULE_TIMEOUT
> *
> * Test a simple IB in the DMA ring (NAVI10).
> * Returns 0 on success, error on failure.
> @@ -1061,10 +1066,9 @@ static void sdma_v5_0_vm_copy_pte(struct amdgpu_ib *ib,
> *
> * @ib: indirect buffer to fill with commands
> * @pe: addr of the page entry
> - * @addr: dst addr to write into pe
> + * @value: dst addr to write into pe
> * @count: number of page entries to update
> * @incr: increase next addr by incr bytes
> - * @flags: access flags
> *
> * Update PTEs by writing them manually using sDMA (NAVI10).
> */
> @@ -1118,6 +1122,7 @@ static void sdma_v5_0_vm_set_pte_pde(struct amdgpu_ib *ib,
>
> /**
> * sdma_v5_0_ring_pad_ib - pad the IB
> + * @ring: amdgpu_ring structure holding ring information
> * @ib: indirect buffer to fill with padding
> *
> * Pad the IB with NOPs to a boundary multiple of 8.
> @@ -1170,7 +1175,8 @@ static void sdma_v5_0_ring_emit_pipeline_sync(struct amdgpu_ring *ring)
> * sdma_v5_0_ring_emit_vm_flush - vm flush using sDMA
> *
> * @ring: amdgpu_ring pointer
> - * @vm: amdgpu_vm pointer
> + * @vmid: vmid number to use
> + * @pd_addr: address
> *
> * Update the page table base and flush the VM TLB
> * using sDMA (NAVI10).
> @@ -1686,10 +1692,11 @@ static void sdma_v5_0_set_irq_funcs(struct amdgpu_device *adev)
> /**
> * sdma_v5_0_emit_copy_buffer - copy buffer using the sDMA engine
> *
> - * @ring: amdgpu_ring structure holding ring information
> + * @ib: indirect buffer to copy to
> * @src_offset: src GPU address
> * @dst_offset: dst GPU address
> * @byte_count: number of bytes to xfer
> + * @tmz: if a secure copy should be used
> *
> * Copy GPU buffers using the DMA engine (NAVI10).
> * Used by the amdgpu ttm implementation to move pages if
> @@ -1715,7 +1722,7 @@ static void sdma_v5_0_emit_copy_buffer(struct amdgpu_ib *ib,
> /**
> * sdma_v5_0_emit_fill_buffer - fill buffer using the sDMA engine
> *
> - * @ring: amdgpu_ring structure holding ring information
> + * @ib: indirect buffer to fill
> * @src_data: value to write to buffer
> * @dst_offset: dst GPU address
> * @byte_count: number of bytes to xfer
> --
> 2.25.1
>
> _______________________________________________
> dri-devel mailing list
> dri-devel(a)lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/dri-devel